Magellan Linux

Contents of /trunk/kernel26-magellan/patches-2.6.31-r4/0107-2.6.31.8-all-fixes.patch



Revision 968
Fri Jan 1 14:52:51 2010 UTC by niro
File size: 165293 bytes
-2.6.31-magellan-r4:
-updated to linux-2.6.31.9

1 diff --git a/Documentation/filesystems/ext4.txt b/Documentation/filesystems/ext4.txt
2 index 7be02ac..32c3da4 100644
3 --- a/Documentation/filesystems/ext4.txt
4 +++ b/Documentation/filesystems/ext4.txt
5 @@ -153,8 +153,8 @@ journal_dev=devnum When the external journal device's major/minor numbers
6 identified through its new major/minor numbers encoded
7 in devnum.
8
9 -noload Don't load the journal on mounting. Note that
10 - if the filesystem was not unmounted cleanly,
11 +norecovery Don't load the journal on mounting. Note that
12 +noload if the filesystem was not unmounted cleanly,
13 skipping the journal replay will lead to the
14 filesystem containing inconsistencies that can
15 lead to any number of problems.
16 @@ -338,6 +338,12 @@ noauto_da_alloc replacing existing files via patterns such as
17 system crashes before the delayed allocation
18 blocks are forced to disk.
19
20 +discard Controls whether ext4 should issue discard/TRIM
21 +nodiscard(*) commands to the underlying block device when
22 + blocks are freed. This is useful for SSD devices
23 + and sparse/thinly-provisioned LUNs, but it is off
24 + by default until sufficient testing has been done.
25 +
26 Data Mode
27 =========
28 There are 3 different data modes:
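
The new discard option can be exercised straight from user space; a minimal sketch using the mount(2) syscall (the device and mount point below are hypothetical):

    /* Hypothetical example: mount ext4 with "discard" so freed blocks
     * are trimmed on the underlying SSD or thin-provisioned LUN. */
    #include <stdio.h>
    #include <sys/mount.h>

    int main(void)
    {
            if (mount("/dev/sdb1", "/mnt/ssd", "ext4", 0, "discard") != 0) {
                    perror("mount");
                    return 1;
            }
            return 0;
    }
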
29 diff --git a/drivers/scsi/hosts.c b/drivers/scsi/hosts.c
30 index 5fd2da4..28a753d 100644
31 --- a/drivers/scsi/hosts.c
32 +++ b/drivers/scsi/hosts.c
33 @@ -180,14 +180,20 @@ void scsi_remove_host(struct Scsi_Host *shost)
34 EXPORT_SYMBOL(scsi_remove_host);
35
36 /**
37 - * scsi_add_host - add a scsi host
38 + * scsi_add_host_with_dma - add a scsi host with dma device
39 * @shost: scsi host pointer to add
40 * @dev: a struct device of type scsi class
41 + * @dma_dev: dma device for the host
42 + *
43 + * Note: You rarely need to worry about this unless you're in a
44 + * virtualised host environment; otherwise use the simpler
45 + * scsi_add_host() function instead.
46 *
47 * Return value:
48 * 0 on success / != 0 for error
49 **/
50 -int scsi_add_host(struct Scsi_Host *shost, struct device *dev)
51 +int scsi_add_host_with_dma(struct Scsi_Host *shost, struct device *dev,
52 + struct device *dma_dev)
53 {
54 struct scsi_host_template *sht = shost->hostt;
55 int error = -EINVAL;
56 @@ -207,6 +213,7 @@ int scsi_add_host(struct Scsi_Host *shost, struct device *dev)
57
58 if (!shost->shost_gendev.parent)
59 shost->shost_gendev.parent = dev ? dev : &platform_bus;
60 + shost->dma_dev = dma_dev;
61
62 error = device_add(&shost->shost_gendev);
63 if (error)
64 @@ -262,7 +269,7 @@ int scsi_add_host(struct Scsi_Host *shost, struct device *dev)
65 fail:
66 return error;
67 }
68 -EXPORT_SYMBOL(scsi_add_host);
69 +EXPORT_SYMBOL(scsi_add_host_with_dma);
70
71 static void scsi_host_dev_release(struct device *dev)
72 {
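
The rename above only works if the old entry point survives for ordinary drivers. The matching header change is not part of this excerpt, but under that assumption scsi_add_host() presumably remains available as a thin wrapper; a sketch:

    /* Sketch (not the verbatim header change): ordinary drivers keep
     * calling scsi_add_host(), which passes the same struct device as
     * both the sysfs parent and the DMA device. */
    static inline int scsi_add_host(struct Scsi_Host *host, struct device *dev)
    {
            return scsi_add_host_with_dma(host, dev, dev);
    }

The lpfc and qla2xxx hunks below then opt in to the long form, handing the PCI function's device to the DMA layer while keeping the virtual port as the sysfs parent.
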
73 diff --git a/drivers/scsi/lpfc/lpfc_init.c b/drivers/scsi/lpfc/lpfc_init.c
74 index fc67cc6..cf13ff2 100644
75 --- a/drivers/scsi/lpfc/lpfc_init.c
76 +++ b/drivers/scsi/lpfc/lpfc_init.c
77 @@ -2384,7 +2384,7 @@ lpfc_create_port(struct lpfc_hba *phba, int instance, struct device *dev)
78 vport->els_tmofunc.function = lpfc_els_timeout;
79 vport->els_tmofunc.data = (unsigned long)vport;
80
81 - error = scsi_add_host(shost, dev);
82 + error = scsi_add_host_with_dma(shost, dev, &phba->pcidev->dev);
83 if (error)
84 goto out_put_shost;
85
86 diff --git a/drivers/scsi/megaraid/megaraid_sas.c b/drivers/scsi/megaraid/megaraid_sas.c
87 index 7dc3d18..7a838c8 100644
88 --- a/drivers/scsi/megaraid/megaraid_sas.c
89 +++ b/drivers/scsi/megaraid/megaraid_sas.c
90 @@ -3032,7 +3032,7 @@ megasas_mgmt_fw_ioctl(struct megasas_instance *instance,
91 int error = 0, i;
92 void *sense = NULL;
93 dma_addr_t sense_handle;
94 - u32 *sense_ptr;
95 + unsigned long *sense_ptr;
96
97 memset(kbuff_arr, 0, sizeof(kbuff_arr));
98
99 @@ -3109,7 +3109,7 @@ megasas_mgmt_fw_ioctl(struct megasas_instance *instance,
100 }
101
102 sense_ptr =
103 - (u32 *) ((unsigned long)cmd->frame + ioc->sense_off);
104 + (unsigned long *) ((unsigned long)cmd->frame + ioc->sense_off);
105 *sense_ptr = sense_handle;
106 }
107
108 @@ -3140,8 +3140,8 @@ megasas_mgmt_fw_ioctl(struct megasas_instance *instance,
109 * sense_ptr points to the location that has the user
110 * sense buffer address
111 */
112 - sense_ptr = (u32 *) ((unsigned long)ioc->frame.raw +
113 - ioc->sense_off);
114 + sense_ptr = (unsigned long *) ((unsigned long)ioc->frame.raw +
115 + ioc->sense_off);
116
117 if (copy_to_user((void __user *)((unsigned long)(*sense_ptr)),
118 sense, ioc->sense_len)) {
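
The type change matters on 64-bit kernels: storing a dma_addr_t or a user pointer through a u32 * silently drops the upper half. A stand-alone illustration of the truncation (values are made up):

    /* Stand-alone illustration (made-up values): why a u32 * store of a
     * 64-bit DMA handle corrupts the frame on LP64 systems. */
    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
            uint64_t sense_handle = 0x0000001234567890ULL;
            uint64_t slot = 0;

            *(uint32_t *)&slot = (uint32_t)sense_handle; /* old: u32 * */
            printf("via u32 *:           0x%016llx (upper half lost)\n",
                   (unsigned long long)slot);

            *(unsigned long *)&slot = sense_handle;      /* new: unsigned long * */
            printf("via unsigned long *: 0x%016llx\n",
                   (unsigned long long)slot);
            return 0;
    }
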
119 diff --git a/drivers/scsi/qla2xxx/qla_attr.c b/drivers/scsi/qla2xxx/qla_attr.c
120 index 0f87962..67e016d 100644
121 --- a/drivers/scsi/qla2xxx/qla_attr.c
122 +++ b/drivers/scsi/qla2xxx/qla_attr.c
123 @@ -1654,7 +1654,8 @@ qla24xx_vport_create(struct fc_vport *fc_vport, bool disable)
124 fc_vport_set_state(fc_vport, FC_VPORT_LINKDOWN);
125 }
126
127 - if (scsi_add_host(vha->host, &fc_vport->dev)) {
128 + if (scsi_add_host_with_dma(vha->host, &fc_vport->dev,
129 + &ha->pdev->dev)) {
130 DEBUG15(printk("scsi(%ld): scsi_add_host failure for VP[%d].\n",
131 vha->host_no, vha->vp_idx));
132 goto vport_create_failed_2;
133 diff --git a/drivers/scsi/scsi_lib_dma.c b/drivers/scsi/scsi_lib_dma.c
134 index ac6855c..dcd1285 100644
135 --- a/drivers/scsi/scsi_lib_dma.c
136 +++ b/drivers/scsi/scsi_lib_dma.c
137 @@ -23,7 +23,7 @@ int scsi_dma_map(struct scsi_cmnd *cmd)
138 int nseg = 0;
139
140 if (scsi_sg_count(cmd)) {
141 - struct device *dev = cmd->device->host->shost_gendev.parent;
142 + struct device *dev = cmd->device->host->dma_dev;
143
144 nseg = dma_map_sg(dev, scsi_sglist(cmd), scsi_sg_count(cmd),
145 cmd->sc_data_direction);
146 @@ -41,7 +41,7 @@ EXPORT_SYMBOL(scsi_dma_map);
147 void scsi_dma_unmap(struct scsi_cmnd *cmd)
148 {
149 if (scsi_sg_count(cmd)) {
150 - struct device *dev = cmd->device->host->shost_gendev.parent;
151 + struct device *dev = cmd->device->host->dma_dev;
152
153 dma_unmap_sg(dev, scsi_sglist(cmd), scsi_sg_count(cmd),
154 cmd->sc_data_direction);
155 diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
156 index e2126d7..34bb797 100644
157 --- a/fs/ext4/balloc.c
158 +++ b/fs/ext4/balloc.c
159 @@ -761,7 +761,13 @@ static unsigned long ext4_bg_num_gdb_meta(struct super_block *sb,
160 static unsigned long ext4_bg_num_gdb_nometa(struct super_block *sb,
161 ext4_group_t group)
162 {
163 - return ext4_bg_has_super(sb, group) ? EXT4_SB(sb)->s_gdb_count : 0;
164 + if (!ext4_bg_has_super(sb, group))
165 + return 0;
166 +
167 + if (EXT4_HAS_INCOMPAT_FEATURE(sb,EXT4_FEATURE_INCOMPAT_META_BG))
168 + return le32_to_cpu(EXT4_SB(sb)->s_es->s_first_meta_bg);
169 + else
170 + return EXT4_SB(sb)->s_gdb_count;
171 }
172
173 /**
174 diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c
175 index 50784ef..dc79b75 100644
176 --- a/fs/ext4/block_validity.c
177 +++ b/fs/ext4/block_validity.c
178 @@ -160,7 +160,7 @@ int ext4_setup_system_zone(struct super_block *sb)
179 if (ext4_bg_has_super(sb, i) &&
180 ((i < 5) || ((i % flex_size) == 0)))
181 add_system_zone(sbi, ext4_group_first_block_no(sb, i),
182 - sbi->s_gdb_count + 1);
183 + ext4_bg_num_gdb(sb, i) + 1);
184 gdp = ext4_get_group_desc(sb, i, NULL);
185 ret = add_system_zone(sbi, ext4_block_bitmap(sb, gdp), 1);
186 if (ret)
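
Both ext4 hunks above account for META_BG layouts, where a group that carries a superblock backup holds only s_first_meta_bg old-style descriptor blocks instead of the full table, so the reserved system zone must be sized per group. A toy calculation with hypothetical geometry:

    /* Toy numbers, hypothetical geometry: per-group GDT sizing under
     * META_BG versus the old global s_gdb_count. */
    #include <stdio.h>

    int main(void)
    {
            unsigned long s_gdb_count = 128;   /* blocks in the full GDT */
            unsigned long s_first_meta_bg = 4; /* old-style GDT blocks kept */

            printf("old zone: %lu blocks, per-group zone: %lu blocks\n",
                   s_gdb_count + 1, s_first_meta_bg + 1);
            return 0;
    }
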
187 diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
188 index 9714db3..3b8321b 100644
189 --- a/fs/ext4/ext4.h
190 +++ b/fs/ext4/ext4.h
191 @@ -88,6 +88,8 @@ typedef unsigned int ext4_group_t;
192 #define EXT4_MB_HINT_TRY_GOAL 512
193 /* blocks already pre-reserved by delayed allocation */
194 #define EXT4_MB_DELALLOC_RESERVED 1024
195 +/* We are doing stream allocation */
196 +#define EXT4_MB_STREAM_ALLOC 2048
197
198
199 struct ext4_allocation_request {
200 @@ -111,6 +113,33 @@ struct ext4_allocation_request {
201 unsigned int flags;
202 };
203
204 +#define DIO_AIO_UNWRITTEN 0x1
205 +typedef struct ext4_io_end {
206 + struct list_head list; /* per-file finished AIO list */
207 + struct inode *inode; /* file being written to */
208 + unsigned int flag; /* sync IO or AIO */
209 + int error; /* I/O error code */
210 + ext4_lblk_t offset; /* offset in the file */
211 + size_t size; /* size of the extent */
212 + struct work_struct work; /* data work queue */
213 +} ext4_io_end_t;
214 +
215 +/*
216 + * Delayed allocation stuff
217 + */
218 +
219 +struct mpage_da_data {
220 + struct inode *inode;
221 + sector_t b_blocknr; /* start block number of extent */
222 + size_t b_size; /* size of extent */
223 + unsigned long b_state; /* state of the extent */
224 + unsigned long first_page, next_page; /* extent of pages */
225 + struct writeback_control *wbc;
226 + int io_done;
227 + int pages_written;
228 + int retval;
229 +};
230 +
231 /*
232 * Special inodes numbers
233 */
234 @@ -251,7 +280,6 @@ struct flex_groups {
235 #define EXT4_TOPDIR_FL 0x00020000 /* Top of directory hierarchies*/
236 #define EXT4_HUGE_FILE_FL 0x00040000 /* Set to each huge file */
237 #define EXT4_EXTENTS_FL 0x00080000 /* Inode uses extents */
238 -#define EXT4_EXT_MIGRATE 0x00100000 /* Inode is migrating */
239 #define EXT4_RESERVED_FL 0x80000000 /* reserved for ext4 lib */
240
241 #define EXT4_FL_USER_VISIBLE 0x000BDFFF /* User visible flags */
242 @@ -289,6 +317,8 @@ static inline __u32 ext4_mask_flags(umode_t mode, __u32 flags)
243 #define EXT4_STATE_XATTR 0x00000004 /* has in-inode xattrs */
244 #define EXT4_STATE_NO_EXPAND 0x00000008 /* No space for expansion */
245 #define EXT4_STATE_DA_ALLOC_CLOSE 0x00000010 /* Alloc DA blks on close */
246 +#define EXT4_STATE_EXT_MIGRATE 0x00000020 /* Inode is migrating */
247 +#define EXT4_STATE_DIO_UNWRITTEN 0x00000040 /* need convert on dio done*/
248
249 /* Used to pass group descriptor data when online resize is done */
250 struct ext4_new_group_input {
251 @@ -330,7 +360,16 @@ struct ext4_new_group_data {
252 /* Call ext4_da_update_reserve_space() after successfully
253 allocating the blocks */
254 #define EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE 0x0008
255 -
256 + /* caller is from the direct IO path; request creation of an
257 + uninitialized extent if not allocated, and split the uninitialized
258 + extent if blocks have been preallocated already */
259 +#define EXT4_GET_BLOCKS_DIO 0x0010
260 +#define EXT4_GET_BLOCKS_CONVERT 0x0020
261 +#define EXT4_GET_BLOCKS_DIO_CREATE_EXT (EXT4_GET_BLOCKS_DIO|\
262 + EXT4_GET_BLOCKS_CREATE_UNINIT_EXT)
263 + /* Convert extent to initialized after direct IO complete */
264 +#define EXT4_GET_BLOCKS_DIO_CONVERT_EXT (EXT4_GET_BLOCKS_CONVERT|\
265 + EXT4_GET_BLOCKS_DIO_CREATE_EXT)
266
267 /*
268 * ioctl commands
269 @@ -386,6 +425,9 @@ struct ext4_mount_options {
270 #endif
271 };
272
273 +/* Max physical block we can address w/o extents */
274 +#define EXT4_MAX_BLOCK_FILE_PHYS 0xFFFFFFFF
275 +
276 /*
277 * Structure of an inode on the disk
278 */
279 @@ -481,8 +523,8 @@ struct move_extent {
280 static inline __le32 ext4_encode_extra_time(struct timespec *time)
281 {
282 return cpu_to_le32((sizeof(time->tv_sec) > 4 ?
283 - time->tv_sec >> 32 : 0) |
284 - ((time->tv_nsec << 2) & EXT4_NSEC_MASK));
285 + (time->tv_sec >> 32) & EXT4_EPOCH_MASK : 0) |
286 + ((time->tv_nsec << EXT4_EPOCH_BITS) & EXT4_NSEC_MASK));
287 }
288
289 static inline void ext4_decode_extra_time(struct timespec *time, __le32 extra)
290 @@ -490,7 +532,7 @@ static inline void ext4_decode_extra_time(struct timespec *time, __le32 extra)
291 if (sizeof(time->tv_sec) > 4)
292 time->tv_sec |= (__u64)(le32_to_cpu(extra) & EXT4_EPOCH_MASK)
293 << 32;
294 - time->tv_nsec = (le32_to_cpu(extra) & EXT4_NSEC_MASK) >> 2;
295 + time->tv_nsec = (le32_to_cpu(extra) & EXT4_NSEC_MASK) >> EXT4_EPOCH_BITS;
296 }
297
298 #define EXT4_INODE_SET_XTIME(xtime, inode, raw_inode) \
299 @@ -653,6 +695,18 @@ struct ext4_inode_info {
300 __u16 i_extra_isize;
301
302 spinlock_t i_block_reservation_lock;
303 +
304 + /* completed async DIOs that might need unwritten extents handling */
305 + struct list_head i_aio_dio_complete_list;
306 + /* current io_end structure for async DIO write*/
307 + ext4_io_end_t *cur_aio_dio;
308 +
309 + /*
310 + * Transactions that contain inode's metadata needed to complete
311 + * fsync and fdatasync, respectively.
312 + */
313 + tid_t i_sync_tid;
314 + tid_t i_datasync_tid;
315 };
316
317 /*
318 @@ -700,6 +754,7 @@ struct ext4_inode_info {
319 #define EXT4_MOUNT_DELALLOC 0x8000000 /* Delalloc support */
320 #define EXT4_MOUNT_DATA_ERR_ABORT 0x10000000 /* Abort on file data write */
321 #define EXT4_MOUNT_BLOCK_VALIDITY 0x20000000 /* Block validity checking */
322 +#define EXT4_MOUNT_DISCARD 0x40000000 /* Issue DISCARD requests */
323
324 #define clear_opt(o, opt) o &= ~EXT4_MOUNT_##opt
325 #define set_opt(o, opt) o |= EXT4_MOUNT_##opt
326 @@ -841,6 +896,7 @@ struct ext4_sb_info {
327 unsigned long s_gdb_count; /* Number of group descriptor blocks */
328 unsigned long s_desc_per_block; /* Number of group descriptors per block */
329 ext4_group_t s_groups_count; /* Number of groups in the fs */
330 + ext4_group_t s_blockfile_groups;/* Groups acceptable for non-extent files */
331 unsigned long s_overhead_last; /* Last calculated overhead */
332 unsigned long s_blocks_last; /* Last seen block count */
333 loff_t s_bitmap_maxbytes; /* max bytes for bitmap files */
334 @@ -923,6 +979,7 @@ struct ext4_sb_info {
335 unsigned int s_mb_stats;
336 unsigned int s_mb_order2_reqs;
337 unsigned int s_mb_group_prealloc;
338 + unsigned int s_max_writeback_mb_bump;
339 /* where last allocation was done - for stream allocation */
340 unsigned long s_mb_last_group;
341 unsigned long s_mb_last_start;
342 @@ -950,6 +1007,7 @@ struct ext4_sb_info {
343 atomic_t s_mb_lost_chunks;
344 atomic_t s_mb_preallocated;
345 atomic_t s_mb_discarded;
346 + atomic_t s_lock_busy;
347
348 /* locality groups */
349 struct ext4_locality_group *s_locality_groups;
350 @@ -960,6 +1018,9 @@ struct ext4_sb_info {
351
352 unsigned int s_log_groups_per_flex;
353 struct flex_groups *s_flex_groups;
354 +
355 + /* workqueue for dio unwritten */
356 + struct workqueue_struct *dio_unwritten_wq;
357 };
358
359 static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb)
360 @@ -1367,6 +1428,7 @@ extern int ext4_change_inode_journal_flag(struct inode *, int);
361 extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *);
362 extern int ext4_can_truncate(struct inode *inode);
363 extern void ext4_truncate(struct inode *);
364 +extern int ext4_truncate_restart_trans(handle_t *, struct inode *, int nblocks);
365 extern void ext4_set_inode_flags(struct inode *);
366 extern void ext4_get_inode_flags(struct ext4_inode_info *);
367 extern int ext4_alloc_da_blocks(struct inode *inode);
368 @@ -1378,7 +1440,7 @@ extern int ext4_block_truncate_page(handle_t *handle,
369 struct address_space *mapping, loff_t from);
370 extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
371 extern qsize_t ext4_get_reserved_space(struct inode *inode);
372 -
373 +extern int flush_aio_dio_completed_IO(struct inode *inode);
374 /* ioctl.c */
375 extern long ext4_ioctl(struct file *, unsigned int, unsigned long);
376 extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long);
377 @@ -1591,15 +1653,42 @@ struct ext4_group_info {
378 #define EXT4_MB_GRP_NEED_INIT(grp) \
379 (test_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &((grp)->bb_state)))
380
381 +#define EXT4_MAX_CONTENTION 8
382 +#define EXT4_CONTENTION_THRESHOLD 2
383 +
384 static inline spinlock_t *ext4_group_lock_ptr(struct super_block *sb,
385 ext4_group_t group)
386 {
387 return bgl_lock_ptr(EXT4_SB(sb)->s_blockgroup_lock, group);
388 }
389
390 +/*
391 + * Returns true if the filesystem is busy enough that attempts to
392 + * access the block group locks have run into contention.
393 + */
394 +static inline int ext4_fs_is_busy(struct ext4_sb_info *sbi)
395 +{
396 + return (atomic_read(&sbi->s_lock_busy) > EXT4_CONTENTION_THRESHOLD);
397 +}
398 +
399 static inline void ext4_lock_group(struct super_block *sb, ext4_group_t group)
400 {
401 - spin_lock(ext4_group_lock_ptr(sb, group));
402 + spinlock_t *lock = ext4_group_lock_ptr(sb, group);
403 + if (spin_trylock(lock))
404 + /*
405 + * We're able to grab the lock right away, so drop the
406 + * lock contention counter.
407 + */
408 + atomic_add_unless(&EXT4_SB(sb)->s_lock_busy, -1, 0);
409 + else {
410 + /*
411 + * The lock is busy, so bump the contention counter,
412 + * and then wait on the spin lock.
413 + */
414 + atomic_add_unless(&EXT4_SB(sb)->s_lock_busy, 1,
415 + EXT4_MAX_CONTENTION);
416 + spin_lock(lock);
417 + }
418 }
419
420 static inline void ext4_unlock_group(struct super_block *sb,
421 @@ -1650,6 +1739,8 @@ extern void ext4_ext_init(struct super_block *);
422 extern void ext4_ext_release(struct super_block *);
423 extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset,
424 loff_t len);
425 +extern int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
426 + loff_t len);
427 extern int ext4_get_blocks(handle_t *handle, struct inode *inode,
428 sector_t block, unsigned int max_blocks,
429 struct buffer_head *bh, int flags);
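
The s_lock_busy counter added above is a small adaptive heuristic: trylock successes decay it, failures bump it (bounded by EXT4_MAX_CONTENTION), and ext4_fs_is_busy() reports sustained contention so callers can back off. A minimal user-space analogy of the same pattern, with hypothetical names:

    /* User-space analogy of the s_lock_busy heuristic; the lock must
     * be set up with pthread_spin_init() before use. */
    #include <pthread.h>
    #include <stdatomic.h>

    #define MAX_CONTENTION       8
    #define CONTENTION_THRESHOLD 2

    static atomic_int lock_busy;
    static pthread_spinlock_t lock;

    /* bounded add, mirroring the kernel's atomic_add_unless() */
    static void add_unless(atomic_int *v, int delta, int limit)
    {
            int old = atomic_load(v);

            while (old != limit &&
                   !atomic_compare_exchange_weak(v, &old, old + delta))
                    ;
    }

    static void busy_aware_lock(void)
    {
            if (pthread_spin_trylock(&lock) == 0) {
                    add_unless(&lock_busy, -1, 0);  /* uncontended: decay */
            } else {
                    add_unless(&lock_busy, 1, MAX_CONTENTION); /* bump */
                    pthread_spin_lock(&lock);       /* then wait */
            }
    }

    static int fs_is_busy(void)
    {
            return atomic_load(&lock_busy) > CONTENTION_THRESHOLD;
    }
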
430 diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h
431 index 20a8410..1c2db3f 100644
432 --- a/fs/ext4/ext4_extents.h
433 +++ b/fs/ext4/ext4_extents.h
434 @@ -220,6 +220,11 @@ static inline int ext4_ext_get_actual_len(struct ext4_extent *ext)
435 (le16_to_cpu(ext->ee_len) - EXT_INIT_MAX_LEN));
436 }
437
438 +static inline void ext4_ext_mark_initialized(struct ext4_extent *ext)
439 +{
440 + ext->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ext));
441 +}
442 +
443 extern int ext4_ext_calc_metadata_amount(struct inode *inode, int blocks);
444 extern ext4_fsblk_t ext_pblock(struct ext4_extent *ex);
445 extern ext4_fsblk_t idx_pblock(struct ext4_extent_idx *);
446 @@ -235,7 +240,7 @@ extern int ext4_ext_try_to_merge(struct inode *inode,
447 struct ext4_ext_path *path,
448 struct ext4_extent *);
449 extern unsigned int ext4_ext_check_overlap(struct inode *, struct ext4_extent *, struct ext4_ext_path *);
450 -extern int ext4_ext_insert_extent(handle_t *, struct inode *, struct ext4_ext_path *, struct ext4_extent *);
451 +extern int ext4_ext_insert_extent(handle_t *, struct inode *, struct ext4_ext_path *, struct ext4_extent *, int);
452 extern int ext4_ext_walk_space(struct inode *, ext4_lblk_t, ext4_lblk_t,
453 ext_prepare_callback, void *);
454 extern struct ext4_ext_path *ext4_ext_find_extent(struct inode *, ext4_lblk_t,
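
ext4 encodes the uninitialized state in bit 15 of ee_len: values above EXT_INIT_MAX_LEN (1 << 15) mark an extent uninitialized, which is why the new ext4_ext_mark_initialized() can simply rewrite ee_len with the actual length. For context, its pre-existing counterpart looks roughly like this (a sketch; the header has the authoritative version):

    /* Setting bit 15 pushes ee_len past EXT_INIT_MAX_LEN, which
     * ext4_ext_get_actual_len() above undoes. */
    static inline void ext4_ext_mark_uninitialized(struct ext4_extent *ext)
    {
            ext->ee_len |= cpu_to_le16(EXT_INIT_MAX_LEN);
    }
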
455 diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
456 index eb27fd0..6a94099 100644
457 --- a/fs/ext4/ext4_jbd2.c
458 +++ b/fs/ext4/ext4_jbd2.c
459 @@ -44,7 +44,7 @@ int __ext4_journal_forget(const char *where, handle_t *handle,
460 handle, err);
461 }
462 else
463 - brelse(bh);
464 + bforget(bh);
465 return err;
466 }
467
468 @@ -60,7 +60,7 @@ int __ext4_journal_revoke(const char *where, handle_t *handle,
469 handle, err);
470 }
471 else
472 - brelse(bh);
473 + bforget(bh);
474 return err;
475 }
476
477 @@ -89,7 +89,10 @@ int __ext4_handle_dirty_metadata(const char *where, handle_t *handle,
478 ext4_journal_abort_handle(where, __func__, bh,
479 handle, err);
480 } else {
481 - mark_buffer_dirty(bh);
482 + if (inode && bh)
483 + mark_buffer_dirty_inode(bh, inode);
484 + else
485 + mark_buffer_dirty(bh);
486 if (inode && inode_needs_sync(inode)) {
487 sync_dirty_buffer(bh);
488 if (buffer_req(bh) && !buffer_uptodate(bh)) {
489 diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
490 index 139fb8c..1892a77 100644
491 --- a/fs/ext4/ext4_jbd2.h
492 +++ b/fs/ext4/ext4_jbd2.h
493 @@ -49,7 +49,7 @@
494
495 #define EXT4_DATA_TRANS_BLOCKS(sb) (EXT4_SINGLEDATA_TRANS_BLOCKS(sb) + \
496 EXT4_XATTR_TRANS_BLOCKS - 2 + \
497 - 2*EXT4_QUOTA_TRANS_BLOCKS(sb))
498 + EXT4_MAXQUOTAS_TRANS_BLOCKS(sb))
499
500 /*
501 * Define the number of metadata blocks we need to account to modify data.
502 @@ -57,7 +57,7 @@
503 * This include super block, inode block, quota blocks and xattr blocks
504 */
505 #define EXT4_META_TRANS_BLOCKS(sb) (EXT4_XATTR_TRANS_BLOCKS + \
506 - 2*EXT4_QUOTA_TRANS_BLOCKS(sb))
507 + EXT4_MAXQUOTAS_TRANS_BLOCKS(sb))
508
509 /* Delete operations potentially hit one directory's namespace plus an
510 * entire inode, plus arbitrary amounts of bitmap/indirection data. Be
511 @@ -92,6 +92,7 @@
512 * but inode, sb and group updates are done only once */
513 #define EXT4_QUOTA_INIT_BLOCKS(sb) (test_opt(sb, QUOTA) ? (DQUOT_INIT_ALLOC*\
514 (EXT4_SINGLEDATA_TRANS_BLOCKS(sb)-3)+3+DQUOT_INIT_REWRITE) : 0)
515 +
516 #define EXT4_QUOTA_DEL_BLOCKS(sb) (test_opt(sb, QUOTA) ? (DQUOT_DEL_ALLOC*\
517 (EXT4_SINGLEDATA_TRANS_BLOCKS(sb)-3)+3+DQUOT_DEL_REWRITE) : 0)
518 #else
519 @@ -99,6 +100,9 @@
520 #define EXT4_QUOTA_INIT_BLOCKS(sb) 0
521 #define EXT4_QUOTA_DEL_BLOCKS(sb) 0
522 #endif
523 +#define EXT4_MAXQUOTAS_TRANS_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_TRANS_BLOCKS(sb))
524 +#define EXT4_MAXQUOTAS_INIT_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_INIT_BLOCKS(sb))
525 +#define EXT4_MAXQUOTAS_DEL_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_DEL_BLOCKS(sb))
526
527 int
528 ext4_mark_iloc_dirty(handle_t *handle,
529 @@ -161,11 +165,13 @@ int __ext4_handle_dirty_metadata(const char *where, handle_t *handle,
530 handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks);
531 int __ext4_journal_stop(const char *where, handle_t *handle);
532
533 -#define EXT4_NOJOURNAL_HANDLE ((handle_t *) 0x1)
534 +#define EXT4_NOJOURNAL_MAX_REF_COUNT ((unsigned long) 4096)
535
536 +/* Note: Do not use this for NULL handles. This is only to determine if
537 + * a properly allocated handle is using a journal or not. */
538 static inline int ext4_handle_valid(handle_t *handle)
539 {
540 - if (handle == EXT4_NOJOURNAL_HANDLE)
541 + if ((unsigned long)handle < EXT4_NOJOURNAL_MAX_REF_COUNT)
542 return 0;
543 return 1;
544 }
545 @@ -252,6 +258,19 @@ static inline int ext4_jbd2_file_inode(handle_t *handle, struct inode *inode)
546 return 0;
547 }
548
549 +static inline void ext4_update_inode_fsync_trans(handle_t *handle,
550 + struct inode *inode,
551 + int datasync)
552 +{
553 + struct ext4_inode_info *ei = EXT4_I(inode);
554 +
555 + if (ext4_handle_valid(handle)) {
556 + ei->i_sync_tid = handle->h_transaction->t_tid;
557 + if (datasync)
558 + ei->i_datasync_tid = handle->h_transaction->t_tid;
559 + }
560 +}
561 +
562 /* super.c */
563 int ext4_force_commit(struct super_block *sb);
564
565 diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
566 index 73ebfb4..24fb20b 100644
567 --- a/fs/ext4/extents.c
568 +++ b/fs/ext4/extents.c
569 @@ -93,7 +93,9 @@ static void ext4_idx_store_pblock(struct ext4_extent_idx *ix, ext4_fsblk_t pb)
570 ix->ei_leaf_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & 0xffff);
571 }
572
573 -static int ext4_ext_journal_restart(handle_t *handle, int needed)
574 +static int ext4_ext_truncate_extend_restart(handle_t *handle,
575 + struct inode *inode,
576 + int needed)
577 {
578 int err;
579
580 @@ -104,7 +106,14 @@ static int ext4_ext_journal_restart(handle_t *handle, int needed)
581 err = ext4_journal_extend(handle, needed);
582 if (err <= 0)
583 return err;
584 - return ext4_journal_restart(handle, needed);
585 + err = ext4_truncate_restart_trans(handle, inode, needed);
586 + /*
587 + * We have dropped i_data_sem so someone might have cached again
588 + * an extent we are going to truncate.
589 + */
590 + ext4_ext_invalidate_cache(inode);
591 +
592 + return err;
593 }
594
595 /*
596 @@ -701,7 +710,7 @@ err:
597 * insert new index [@logical;@ptr] into the block at @curp;
598 * check where to insert: before @curp or after @curp
599 */
600 -static int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
601 +int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
602 struct ext4_ext_path *curp,
603 int logical, ext4_fsblk_t ptr)
604 {
605 @@ -1563,7 +1572,7 @@ out:
606 */
607 int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
608 struct ext4_ext_path *path,
609 - struct ext4_extent *newext)
610 + struct ext4_extent *newext, int flag)
611 {
612 struct ext4_extent_header *eh;
613 struct ext4_extent *ex, *fex;
614 @@ -1579,7 +1588,8 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
615 BUG_ON(path[depth].p_hdr == NULL);
616
617 /* try to insert block into found extent and return */
618 - if (ex && ext4_can_extents_be_merged(inode, ex, newext)) {
619 + if (ex && (flag != EXT4_GET_BLOCKS_DIO_CREATE_EXT)
620 + && ext4_can_extents_be_merged(inode, ex, newext)) {
621 ext_debug("append %d block to %d:%d (from %llu)\n",
622 ext4_ext_get_actual_len(newext),
623 le32_to_cpu(ex->ee_block),
624 @@ -1694,7 +1704,8 @@ has_space:
625
626 merge:
627 /* try to merge extents to the right */
628 - ext4_ext_try_to_merge(inode, path, nearex);
629 + if (flag != EXT4_GET_BLOCKS_DIO_CREATE_EXT)
630 + ext4_ext_try_to_merge(inode, path, nearex);
631
632 /* try to merge extents to the left */
633
634 @@ -1731,7 +1742,9 @@ int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block,
635 while (block < last && block != EXT_MAX_BLOCK) {
636 num = last - block;
637 /* find extent for this block */
638 + down_read(&EXT4_I(inode)->i_data_sem);
639 path = ext4_ext_find_extent(inode, block, path);
640 + up_read(&EXT4_I(inode)->i_data_sem);
641 if (IS_ERR(path)) {
642 err = PTR_ERR(path);
643 path = NULL;
644 @@ -2044,7 +2057,7 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
645 ext_debug("free last %u blocks starting %llu\n", num, start);
646 for (i = 0; i < num; i++) {
647 bh = sb_find_get_block(inode->i_sb, start + i);
648 - ext4_forget(handle, 0, inode, bh, start + i);
649 + ext4_forget(handle, metadata, inode, bh, start + i);
650 }
651 ext4_free_blocks(handle, inode, start, num, metadata);
652 } else if (from == le32_to_cpu(ex->ee_block)
653 @@ -2136,9 +2149,9 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
654 correct_index = 1;
655 credits += (ext_depth(inode)) + 1;
656 }
657 - credits += 2 * EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb);
658 + credits += EXT4_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb);
659
660 - err = ext4_ext_journal_restart(handle, credits);
661 + err = ext4_ext_truncate_extend_restart(handle, inode, credits);
662 if (err)
663 goto out;
664
665 @@ -2461,7 +2474,6 @@ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
666 }
667
668 #define EXT4_EXT_ZERO_LEN 7
669 -
670 /*
671 * This function is called by ext4_ext_get_blocks() if someone tries to write
672 * to an uninitialized extent. It may result in splitting the uninitialized
673 @@ -2554,7 +2566,8 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
674 ex3->ee_block = cpu_to_le32(iblock);
675 ext4_ext_store_pblock(ex3, newblock);
676 ex3->ee_len = cpu_to_le16(allocated);
677 - err = ext4_ext_insert_extent(handle, inode, path, ex3);
678 + err = ext4_ext_insert_extent(handle, inode, path,
679 + ex3, 0);
680 if (err == -ENOSPC) {
681 err = ext4_ext_zeroout(inode, &orig_ex);
682 if (err)
683 @@ -2610,7 +2623,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
684 ext4_ext_store_pblock(ex3, newblock + max_blocks);
685 ex3->ee_len = cpu_to_le16(allocated - max_blocks);
686 ext4_ext_mark_uninitialized(ex3);
687 - err = ext4_ext_insert_extent(handle, inode, path, ex3);
688 + err = ext4_ext_insert_extent(handle, inode, path, ex3, 0);
689 if (err == -ENOSPC) {
690 err = ext4_ext_zeroout(inode, &orig_ex);
691 if (err)
692 @@ -2728,7 +2741,191 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
693 err = ext4_ext_dirty(handle, inode, path + depth);
694 goto out;
695 insert:
696 - err = ext4_ext_insert_extent(handle, inode, path, &newex);
697 + err = ext4_ext_insert_extent(handle, inode, path, &newex, 0);
698 + if (err == -ENOSPC) {
699 + err = ext4_ext_zeroout(inode, &orig_ex);
700 + if (err)
701 + goto fix_extent_len;
702 + /* update the extent length and mark as initialized */
703 + ex->ee_block = orig_ex.ee_block;
704 + ex->ee_len = orig_ex.ee_len;
705 + ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
706 + ext4_ext_dirty(handle, inode, path + depth);
707 + /* zero out the first half */
708 + return allocated;
709 + } else if (err)
710 + goto fix_extent_len;
711 +out:
712 + return err ? err : allocated;
713 +
714 +fix_extent_len:
715 + ex->ee_block = orig_ex.ee_block;
716 + ex->ee_len = orig_ex.ee_len;
717 + ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
718 + ext4_ext_mark_uninitialized(ex);
719 + ext4_ext_dirty(handle, inode, path + depth);
720 + return err;
721 +}
722 +
723 +/*
724 + * This function is called by ext4_ext_get_blocks() from
725 + * ext4_get_blocks_dio_write() when DIO writes
726 + * to an uninitialized extent.
727 + *
728 + * Writing to an uninitialized extent may result in splitting it
729 + * into multiple initialized/uninitialized extents (up to three).
730 + * There are three possibilities:
731 + * a> No split required: the entire extent stays uninitialized
732 + * b> Split into two extents: the write is at either end of the extent
733 + * c> Split into three extents: someone writes in the middle of the extent
734 + *
735 + * One or more index blocks may be needed if the extent tree grows after
736 + * the uninitialized extent is split. To prevent ENOSPC at IO
737 + * completion, we split the uninitialized extent before the IO is
738 + * submitted. The uninitialized extent will be split into (at most)
739 + * three uninitialized extents. After IO completes, the part
740 + * being filled is converted to initialized by the end_io callback
741 + * via ext4_convert_unwritten_extents().
742 + *
743 + * Returns the size of uninitialized extent to be written on success.
744 + */
745 +static int ext4_split_unwritten_extents(handle_t *handle,
746 + struct inode *inode,
747 + struct ext4_ext_path *path,
748 + ext4_lblk_t iblock,
749 + unsigned int max_blocks,
750 + int flags)
751 +{
752 + struct ext4_extent *ex, newex, orig_ex;
753 + struct ext4_extent *ex1 = NULL;
754 + struct ext4_extent *ex2 = NULL;
755 + struct ext4_extent *ex3 = NULL;
756 + struct ext4_extent_header *eh;
757 + ext4_lblk_t ee_block;
758 + unsigned int allocated, ee_len, depth;
759 + ext4_fsblk_t newblock;
760 + int err = 0;
761 +
762 + ext_debug("ext4_split_unwritten_extents: inode %lu,"
763 + "iblock %llu, max_blocks %u\n", inode->i_ino,
764 + (unsigned long long)iblock, max_blocks);
765 + depth = ext_depth(inode);
766 + eh = path[depth].p_hdr;
767 + ex = path[depth].p_ext;
768 + ee_block = le32_to_cpu(ex->ee_block);
769 + ee_len = ext4_ext_get_actual_len(ex);
770 + allocated = ee_len - (iblock - ee_block);
771 + newblock = iblock - ee_block + ext_pblock(ex);
772 + ex2 = ex;
773 + orig_ex.ee_block = ex->ee_block;
774 + orig_ex.ee_len = cpu_to_le16(ee_len);
775 + ext4_ext_store_pblock(&orig_ex, ext_pblock(ex));
776 +
777 + /*
778 + * If the uninitialized extent begins at the same logical
779 + * block where the write begins, and the write completely
780 + * covers the extent, then we don't need to split it.
781 + */
782 + if ((iblock == ee_block) && (allocated <= max_blocks))
783 + return allocated;
784 +
785 + err = ext4_ext_get_access(handle, inode, path + depth);
786 + if (err)
787 + goto out;
788 + /* ex1: ee_block to iblock - 1 : uninitialized */
789 + if (iblock > ee_block) {
790 + ex1 = ex;
791 + ex1->ee_len = cpu_to_le16(iblock - ee_block);
792 + ext4_ext_mark_uninitialized(ex1);
793 + ex2 = &newex;
794 + }
795 + /*
796 + * for sanity, update the length of the ex2 extent before
797 + * we insert ex3, if ex1 is NULL. This is to avoid temporary
798 + * overlap of blocks.
799 + */
800 + if (!ex1 && allocated > max_blocks)
801 + ex2->ee_len = cpu_to_le16(max_blocks);
802 + /* ex3: to ee_block + ee_len : uninitialised */
803 + if (allocated > max_blocks) {
804 + unsigned int newdepth;
805 + ex3 = &newex;
806 + ex3->ee_block = cpu_to_le32(iblock + max_blocks);
807 + ext4_ext_store_pblock(ex3, newblock + max_blocks);
808 + ex3->ee_len = cpu_to_le16(allocated - max_blocks);
809 + ext4_ext_mark_uninitialized(ex3);
810 + err = ext4_ext_insert_extent(handle, inode, path, ex3, flags);
811 + if (err == -ENOSPC) {
812 + err = ext4_ext_zeroout(inode, &orig_ex);
813 + if (err)
814 + goto fix_extent_len;
815 + /* update the extent length and mark as initialized */
816 + ex->ee_block = orig_ex.ee_block;
817 + ex->ee_len = orig_ex.ee_len;
818 + ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
819 + ext4_ext_dirty(handle, inode, path + depth);
820 + /* zeroed the full extent */
821 + /* blocks available from iblock */
822 + return allocated;
823 +
824 + } else if (err)
825 + goto fix_extent_len;
826 + /*
827 + * The depth, and hence eh & ex might change
828 + * as part of the insert above.
829 + */
830 + newdepth = ext_depth(inode);
831 + /*
832 + * update the extent length after successful insert of the
833 + * split extent
834 + */
835 + orig_ex.ee_len = cpu_to_le16(ee_len -
836 + ext4_ext_get_actual_len(ex3));
837 + depth = newdepth;
838 + ext4_ext_drop_refs(path);
839 + path = ext4_ext_find_extent(inode, iblock, path);
840 + if (IS_ERR(path)) {
841 + err = PTR_ERR(path);
842 + goto out;
843 + }
844 + eh = path[depth].p_hdr;
845 + ex = path[depth].p_ext;
846 + if (ex2 != &newex)
847 + ex2 = ex;
848 +
849 + err = ext4_ext_get_access(handle, inode, path + depth);
850 + if (err)
851 + goto out;
852 +
853 + allocated = max_blocks;
854 + }
855 + /*
856 + * If there was a change of depth as part of the
857 + * insertion of ex3 above, we need to update the length
858 + * of the ex1 extent again here
859 + */
860 + if (ex1 && ex1 != ex) {
861 + ex1 = ex;
862 + ex1->ee_len = cpu_to_le16(iblock - ee_block);
863 + ext4_ext_mark_uninitialized(ex1);
864 + ex2 = &newex;
865 + }
866 + /*
867 + * ex2: iblock to iblock + max_blocks-1 : to be written by direct IO,
868 + * uninitialised still.
869 + */
870 + ex2->ee_block = cpu_to_le32(iblock);
871 + ext4_ext_store_pblock(ex2, newblock);
872 + ex2->ee_len = cpu_to_le16(allocated);
873 + ext4_ext_mark_uninitialized(ex2);
874 + if (ex2 != ex)
875 + goto insert;
876 + /* Mark modified extent as dirty */
877 + err = ext4_ext_dirty(handle, inode, path + depth);
878 + ext_debug("out here\n");
879 + goto out;
880 +insert:
881 + err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
882 if (err == -ENOSPC) {
883 err = ext4_ext_zeroout(inode, &orig_ex);
884 if (err)
885 @@ -2743,6 +2940,7 @@ insert:
886 } else if (err)
887 goto fix_extent_len;
888 out:
889 + ext4_ext_show_leaf(inode, path);
890 return err ? err : allocated;
891
892 fix_extent_len:
893 @@ -2753,7 +2951,151 @@ fix_extent_len:
894 ext4_ext_dirty(handle, inode, path + depth);
895 return err;
896 }
897 +static int ext4_convert_unwritten_extents_dio(handle_t *handle,
898 + struct inode *inode,
899 + struct ext4_ext_path *path)
900 +{
901 + struct ext4_extent *ex;
902 + struct ext4_extent_header *eh;
903 + int depth;
904 + int err = 0;
905 + int ret = 0;
906 +
907 + depth = ext_depth(inode);
908 + eh = path[depth].p_hdr;
909 + ex = path[depth].p_ext;
910 +
911 + err = ext4_ext_get_access(handle, inode, path + depth);
912 + if (err)
913 + goto out;
914 + /* first mark the extent as initialized */
915 + ext4_ext_mark_initialized(ex);
916 +
917 + /*
918 + * We have to see if it can be merged with the extent
919 + * on the left.
920 + */
921 + if (ex > EXT_FIRST_EXTENT(eh)) {
922 + /*
923 + * To merge left, pass "ex - 1" to try_to_merge(),
924 + * since it merges towards right _only_.
925 + */
926 + ret = ext4_ext_try_to_merge(inode, path, ex - 1);
927 + if (ret) {
928 + err = ext4_ext_correct_indexes(handle, inode, path);
929 + if (err)
930 + goto out;
931 + depth = ext_depth(inode);
932 + ex--;
933 + }
934 + }
935 + /*
936 + * Try to Merge towards right.
937 + */
938 + ret = ext4_ext_try_to_merge(inode, path, ex);
939 + if (ret) {
940 + err = ext4_ext_correct_indexes(handle, inode, path);
941 + if (err)
942 + goto out;
943 + depth = ext_depth(inode);
944 + }
945 + /* Mark modified extent as dirty */
946 + err = ext4_ext_dirty(handle, inode, path + depth);
947 +out:
948 + ext4_ext_show_leaf(inode, path);
949 + return err;
950 +}
951 +
952 +static int
953 +ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
954 + ext4_lblk_t iblock, unsigned int max_blocks,
955 + struct ext4_ext_path *path, int flags,
956 + unsigned int allocated, struct buffer_head *bh_result,
957 + ext4_fsblk_t newblock)
958 +{
959 + int ret = 0;
960 + int err = 0;
961 + ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio;
962 +
963 + ext_debug("ext4_ext_handle_uninitialized_extents: inode %lu, logical"
964 + "block %llu, max_blocks %u, flags %d, allocated %u",
965 + inode->i_ino, (unsigned long long)iblock, max_blocks,
966 + flags, allocated);
967 + ext4_ext_show_leaf(inode, path);
968
969 + /* DIO get_block() before submit the IO, split the extent */
970 + if (flags == EXT4_GET_BLOCKS_DIO_CREATE_EXT) {
971 + ret = ext4_split_unwritten_extents(handle,
972 + inode, path, iblock,
973 + max_blocks, flags);
974 + /*
975 + * Flag the inode (non-aio case) or end_io struct (aio case)
976 + * so that this IO gets converted to written when IO is
977 + * completed
978 + */
979 + if (io)
980 + io->flag = DIO_AIO_UNWRITTEN;
981 + else
982 + EXT4_I(inode)->i_state |= EXT4_STATE_DIO_UNWRITTEN;
983 + goto out;
984 + }
985 + /* async DIO end_io complete, convert the filled extent to written */
986 + if (flags == EXT4_GET_BLOCKS_DIO_CONVERT_EXT) {
987 + ret = ext4_convert_unwritten_extents_dio(handle, inode,
988 + path);
989 + if (ret >= 0)
990 + ext4_update_inode_fsync_trans(handle, inode, 1);
991 + goto out2;
992 + }
993 + /* buffered IO case */
994 + /*
995 + * repeat fallocate creation request
996 + * we already have an unwritten extent
997 + */
998 + if (flags & EXT4_GET_BLOCKS_UNINIT_EXT)
999 + goto map_out;
1000 +
1001 + /* buffered READ or buffered write_begin() lookup */
1002 + if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
1003 + /*
1004 + * We have blocks reserved already. We
1005 + * return allocated blocks so that delalloc
1006 + * won't do block reservation for us. But
1007 + * the buffer head will be unmapped so that
1008 + * a read from the block returns 0s.
1009 + */
1010 + set_buffer_unwritten(bh_result);
1011 + goto out1;
1012 + }
1013 +
1014 + /* buffered write, writepage time, convert*/
1015 + ret = ext4_ext_convert_to_initialized(handle, inode,
1016 + path, iblock,
1017 + max_blocks);
1018 + if (ret >= 0)
1019 + ext4_update_inode_fsync_trans(handle, inode, 1);
1020 +out:
1021 + if (ret <= 0) {
1022 + err = ret;
1023 + goto out2;
1024 + } else
1025 + allocated = ret;
1026 + set_buffer_new(bh_result);
1027 +map_out:
1028 + set_buffer_mapped(bh_result);
1029 +out1:
1030 + if (allocated > max_blocks)
1031 + allocated = max_blocks;
1032 + ext4_ext_show_leaf(inode, path);
1033 + bh_result->b_bdev = inode->i_sb->s_bdev;
1034 + bh_result->b_blocknr = newblock;
1035 +out2:
1036 + if (path) {
1037 + ext4_ext_drop_refs(path);
1038 + kfree(path);
1039 + }
1040 + return err ? err : allocated;
1041 +}
1042 /*
1043 * Block allocation/map/preallocation routine for extents based files
1044 *
1045 @@ -2784,6 +3126,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
1046 int err = 0, depth, ret, cache_type;
1047 unsigned int allocated = 0;
1048 struct ext4_allocation_request ar;
1049 + ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio;
1050
1051 __clear_bit(BH_New, &bh_result->b_state);
1052 ext_debug("blocks %u/%u requested for inode %u\n",
1053 @@ -2859,33 +3202,10 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
1054 EXT4_EXT_CACHE_EXTENT);
1055 goto out;
1056 }
1057 - if (flags & EXT4_GET_BLOCKS_UNINIT_EXT)
1058 - goto out;
1059 - if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
1060 - if (allocated > max_blocks)
1061 - allocated = max_blocks;
1062 - /*
1063 - * We have blocks reserved already. We
1064 - * return allocated blocks so that delalloc
1065 - * won't do block reservation for us. But
1066 - * the buffer head will be unmapped so that
1067 - * a read from the block returns 0s.
1068 - */
1069 - set_buffer_unwritten(bh_result);
1070 - bh_result->b_bdev = inode->i_sb->s_bdev;
1071 - bh_result->b_blocknr = newblock;
1072 - goto out2;
1073 - }
1074 -
1075 - ret = ext4_ext_convert_to_initialized(handle, inode,
1076 - path, iblock,
1077 - max_blocks);
1078 - if (ret <= 0) {
1079 - err = ret;
1080 - goto out2;
1081 - } else
1082 - allocated = ret;
1083 - goto outnew;
1084 + ret = ext4_ext_handle_uninitialized_extents(handle,
1085 + inode, iblock, max_blocks, path,
1086 + flags, allocated, bh_result, newblock);
1087 + return ret;
1088 }
1089 }
1090
1091 @@ -2956,9 +3276,27 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
1092 /* try to insert new extent into found leaf and return */
1093 ext4_ext_store_pblock(&newex, newblock);
1094 newex.ee_len = cpu_to_le16(ar.len);
1095 - if (flags & EXT4_GET_BLOCKS_UNINIT_EXT) /* Mark uninitialized */
1096 + /* Mark uninitialized */
1097 + if (flags & EXT4_GET_BLOCKS_UNINIT_EXT){
1098 ext4_ext_mark_uninitialized(&newex);
1099 - err = ext4_ext_insert_extent(handle, inode, path, &newex);
1100 + /*
1101 + * An io_end structure is created for every async
1102 + * direct IO write to the middle of the file.
1103 + * To avoid an unnecessary conversion for every aio dio rewrite
1104 + * to the middle of the file, flag only the IO that really
1105 + * needs the conversion.
1106 + * For the non-async direct IO case, flag the inode state
1107 + * so that we perform the conversion when IO is done.
1108 + */
1109 + if (flags == EXT4_GET_BLOCKS_DIO_CREATE_EXT) {
1110 + if (io)
1111 + io->flag = DIO_AIO_UNWRITTEN;
1112 + else
1113 + EXT4_I(inode)->i_state |=
1114 + EXT4_STATE_DIO_UNWRITTEN;
1115 + }
1116 + }
1117 + err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
1118 if (err) {
1119 /* free data blocks we just allocated */
1120 /* not a good idea to call discard here directly,
1121 @@ -2972,13 +3310,18 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
1122 /* previous routine could use block we allocated */
1123 newblock = ext_pblock(&newex);
1124 allocated = ext4_ext_get_actual_len(&newex);
1125 -outnew:
1126 set_buffer_new(bh_result);
1127
1128 - /* Cache only when it is _not_ an uninitialized extent */
1129 - if ((flags & EXT4_GET_BLOCKS_UNINIT_EXT) == 0)
1130 + /*
1131 + * Cache the extent and update transaction to commit on fdatasync only
1132 + * when it is _not_ an uninitialized extent.
1133 + */
1134 + if ((flags & EXT4_GET_BLOCKS_UNINIT_EXT) == 0) {
1135 ext4_ext_put_in_cache(inode, iblock, allocated, newblock,
1136 EXT4_EXT_CACHE_EXTENT);
1137 + ext4_update_inode_fsync_trans(handle, inode, 1);
1138 + } else
1139 + ext4_update_inode_fsync_trans(handle, inode, 0);
1140 out:
1141 if (allocated > max_blocks)
1142 allocated = max_blocks;
1143 @@ -3171,6 +3514,64 @@ retry:
1144 }
1145
1146 /*
1147 + * This function converts a range of blocks to written extents.
1148 + * The caller passes the start offset and the size;
1149 + * all unwritten extents within this range will be converted to
1150 + * written extents.
1151 + *
1152 + * This function is called from the direct IO end_io callback
1153 + * function to convert the fallocated extents after IO is completed.
1154 + * Returns 0 on success.
1155 + */
1156 +int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
1157 + loff_t len)
1158 +{
1159 + handle_t *handle;
1160 + ext4_lblk_t block;
1161 + unsigned int max_blocks;
1162 + int ret = 0;
1163 + int ret2 = 0;
1164 + struct buffer_head map_bh;
1165 + unsigned int credits, blkbits = inode->i_blkbits;
1166 +
1167 + block = offset >> blkbits;
1168 + /*
1169 + * We can't just convert len to max_blocks because the range
1170 + * may not be block-aligned: e.g. blocksize = 4096, offset = 3072, len = 2048
1171 + */
1172 + max_blocks = (EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits)
1173 + - block;
1174 + /*
1175 + * credits to insert 1 extent into extent tree
1176 + */
1177 + credits = ext4_chunk_trans_blocks(inode, max_blocks);
1178 + while (ret >= 0 && ret < max_blocks) {
1179 + block = block + ret;
1180 + max_blocks = max_blocks - ret;
1181 + handle = ext4_journal_start(inode, credits);
1182 + if (IS_ERR(handle)) {
1183 + ret = PTR_ERR(handle);
1184 + break;
1185 + }
1186 + map_bh.b_state = 0;
1187 + ret = ext4_get_blocks(handle, inode, block,
1188 + max_blocks, &map_bh,
1189 + EXT4_GET_BLOCKS_DIO_CONVERT_EXT);
1190 + if (ret <= 0) {
1191 + WARN_ON(ret <= 0);
1192 + printk(KERN_ERR "%s: ext4_ext_get_blocks "
1193 + "returned error inode#%lu, block=%u, "
1194 + "max_blocks=%u", __func__,
1195 + inode->i_ino, block, max_blocks);
1196 + }
1197 + ext4_mark_inode_dirty(handle, inode);
1198 + ret2 = ext4_journal_stop(handle);
1199 + if (ret <= 0 || ret2 )
1200 + break;
1201 + }
1202 + return ret > 0 ? ret2 : ret;
1203 +}
1204 +/*
1205 * Callback function called for each extent to gather FIEMAP information.
1206 */
1207 static int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path,
1208 @@ -3308,10 +3709,8 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
1209 * Walk the extent tree gathering extent information.
1210 * ext4_ext_fiemap_cb will push extents back to user.
1211 */
1212 - down_read(&EXT4_I(inode)->i_data_sem);
1213 error = ext4_ext_walk_space(inode, start_blk, len_blks,
1214 ext4_ext_fiemap_cb, fieinfo);
1215 - up_read(&EXT4_I(inode)->i_data_sem);
1216 }
1217
1218 return error;
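
The DIO flags above come together at IO completion: the dio_unwritten_wq workqueue added to ext4_sb_info drains ext4_io_end_t items and converts only the ranges flagged DIO_AIO_UNWRITTEN. A rough sketch of such a work handler, assuming the full patch wires it up this way (names and error handling approximate):

    /* Sketch of the completion side; field names follow the
     * ext4_io_end_t introduced in ext4.h above. */
    static void ext4_end_aio_dio_work(struct work_struct *work)
    {
            ext4_io_end_t *io = container_of(work, ext4_io_end_t, work);

            if (io->flag == DIO_AIO_UNWRITTEN)
                    ext4_convert_unwritten_extents(io->inode,
                                                   io->offset, io->size);
            /* removal from i_aio_dio_complete_list and freeing omitted */
    }
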
1219 diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
1220 index 83cf641..d6049e4 100644
1221 --- a/fs/ext4/fsync.c
1222 +++ b/fs/ext4/fsync.c
1223 @@ -44,27 +44,37 @@
1224 *
1225 * What we do is just kick off a commit and wait on it. This will snapshot the
1226 * inode to disk.
1227 + *
1228 + * i_mutex lock is held when entering and exiting this function
1229 */
1230
1231 int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
1232 {
1233 struct inode *inode = dentry->d_inode;
1234 + struct ext4_inode_info *ei = EXT4_I(inode);
1235 journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
1236 - int ret = 0;
1237 + int ret;
1238 + tid_t commit_tid;
1239
1240 J_ASSERT(ext4_journal_current_handle() == NULL);
1241
1242 trace_ext4_sync_file(file, dentry, datasync);
1243
1244 + if (inode->i_sb->s_flags & MS_RDONLY)
1245 + return 0;
1246 +
1247 + ret = flush_aio_dio_completed_IO(inode);
1248 + if (ret < 0)
1249 + return ret;
1250 +
1251 + if (!journal)
1252 + return simple_fsync(file, dentry, datasync);
1253 +
1254 /*
1255 - * data=writeback:
1256 + * data=writeback,ordered:
1257 * The caller's filemap_fdatawrite()/wait will sync the data.
1258 - * sync_inode() will sync the metadata
1259 - *
1260 - * data=ordered:
1261 - * The caller's filemap_fdatawrite() will write the data and
1262 - * sync_inode() will write the inode if it is dirty. Then the caller's
1263 - * filemap_fdatawait() will wait on the pages.
1264 + * Metadata is in the journal, we wait for proper transaction to
1265 + * commit here.
1266 *
1267 * data=journal:
1268 * filemap_fdatawrite won't do anything (the buffers are clean).
1269 @@ -74,27 +84,13 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
1270 * (they were dirtied by commit). But that's OK - the blocks are
1271 * safe in-journal, which is all fsync() needs to ensure.
1272 */
1273 - if (ext4_should_journal_data(inode)) {
1274 - ret = ext4_force_commit(inode->i_sb);
1275 - goto out;
1276 - }
1277 -
1278 - if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
1279 - goto out;
1280 + if (ext4_should_journal_data(inode))
1281 + return ext4_force_commit(inode->i_sb);
1282
1283 - /*
1284 - * The VFS has written the file data. If the inode is unaltered
1285 - * then we need not start a commit.
1286 - */
1287 - if (inode->i_state & (I_DIRTY_SYNC|I_DIRTY_DATASYNC)) {
1288 - struct writeback_control wbc = {
1289 - .sync_mode = WB_SYNC_ALL,
1290 - .nr_to_write = 0, /* sys_fsync did this */
1291 - };
1292 - ret = sync_inode(inode, &wbc);
1293 - if (journal && (journal->j_flags & JBD2_BARRIER))
1294 - blkdev_issue_flush(inode->i_sb->s_bdev, NULL);
1295 - }
1296 -out:
1297 + commit_tid = datasync ? ei->i_datasync_tid : ei->i_sync_tid;
1298 + if (jbd2_log_start_commit(journal, commit_tid))
1299 + jbd2_log_wait_commit(journal, commit_tid);
1300 + else if (journal->j_flags & JBD2_BARRIER)
1301 + blkdev_issue_flush(inode->i_sb->s_bdev, NULL);
1302 return ret;
1303 }
1304 diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
1305 index f9c642b..38b2154 100644
1306 --- a/fs/ext4/inode.c
1307 +++ b/fs/ext4/inode.c
1308 @@ -37,6 +37,7 @@
1309 #include <linux/namei.h>
1310 #include <linux/uio.h>
1311 #include <linux/bio.h>
1312 +#include <linux/workqueue.h>
1313
1314 #include "ext4_jbd2.h"
1315 #include "xattr.h"
1316 @@ -192,11 +193,25 @@ static int try_to_extend_transaction(handle_t *handle, struct inode *inode)
1317 * so before we call here everything must be consistently dirtied against
1318 * this transaction.
1319 */
1320 -static int ext4_journal_test_restart(handle_t *handle, struct inode *inode)
1321 +int ext4_truncate_restart_trans(handle_t *handle, struct inode *inode,
1322 + int nblocks)
1323 {
1324 + int ret;
1325 +
1326 + /*
1327 + * Drop i_data_sem to avoid deadlock with ext4_get_blocks. At this
1328 + * moment, get_block can be called only for blocks inside i_size since
1329 + * page cache has been already dropped and writes are blocked by
1330 + * i_mutex. So we can safely drop the i_data_sem here.
1331 + */
1332 BUG_ON(EXT4_JOURNAL(inode) == NULL);
1333 jbd_debug(2, "restarting handle %p\n", handle);
1334 - return ext4_journal_restart(handle, blocks_for_truncate(inode));
1335 + up_write(&EXT4_I(inode)->i_data_sem);
1336 + ret = ext4_journal_restart(handle, blocks_for_truncate(inode));
1337 + down_write(&EXT4_I(inode)->i_data_sem);
1338 + ext4_discard_preallocations(inode);
1339 +
1340 + return ret;
1341 }
1342
1343 /*
1344 @@ -551,15 +566,21 @@ static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind)
1345 *
1346 * Normally this function find the preferred place for block allocation,
1347 * returns it.
1348 + * Because this is only used for non-extent files, we limit the block nr
1349 + * to 32 bits.
1350 */
1351 static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block,
1352 Indirect *partial)
1353 {
1354 + ext4_fsblk_t goal;
1355 +
1356 /*
1357 * XXX need to get goal block from mballoc's data structures
1358 */
1359
1360 - return ext4_find_near(inode, partial);
1361 + goal = ext4_find_near(inode, partial);
1362 + goal = goal & EXT4_MAX_BLOCK_FILE_PHYS;
1363 + return goal;
1364 }
1365
1366 /**
1367 @@ -640,6 +661,8 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
1368 if (*err)
1369 goto failed_out;
1370
1371 + BUG_ON(current_block + count > EXT4_MAX_BLOCK_FILE_PHYS);
1372 +
1373 target -= count;
1374 /* allocate blocks for indirect blocks */
1375 while (index < indirect_blks && count) {
1376 @@ -674,6 +697,7 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
1377 ar.flags = EXT4_MB_HINT_DATA;
1378
1379 current_block = ext4_mb_new_blocks(handle, &ar, err);
1380 + BUG_ON(current_block + ar.len > EXT4_MAX_BLOCK_FILE_PHYS);
1381
1382 if (*err && (target == blks)) {
1383 /*
1384 @@ -998,10 +1022,12 @@ static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode,
1385 if (!err)
1386 err = ext4_splice_branch(handle, inode, iblock,
1387 partial, indirect_blks, count);
1388 - else
1389 + if (err)
1390 goto cleanup;
1391
1392 set_buffer_new(bh_result);
1393 +
1394 + ext4_update_inode_fsync_trans(handle, inode, 1);
1395 got_it:
1396 map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-1].key));
1397 if (count > blocks_to_boundary)
1398 @@ -1029,7 +1055,7 @@ qsize_t ext4_get_reserved_space(struct inode *inode)
1399 EXT4_I(inode)->i_reserved_meta_blocks;
1400 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
1401
1402 - return total;
1403 + return (total << inode->i_blkbits);
1404 }
1405 /*
1406 * Calculate the number of metadata blocks need to reserve
1407 @@ -1109,22 +1135,79 @@ static void ext4_da_update_reserve_space(struct inode *inode, int used)
1408 ext4_discard_preallocations(inode);
1409 }
1410
1411 -static int check_block_validity(struct inode *inode, sector_t logical,
1412 - sector_t phys, int len)
1413 +static int check_block_validity(struct inode *inode, const char *msg,
1414 + sector_t logical, sector_t phys, int len)
1415 {
1416 if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), phys, len)) {
1417 - ext4_error(inode->i_sb, "check_block_validity",
1418 + ext4_error(inode->i_sb, msg,
1419 "inode #%lu logical block %llu mapped to %llu "
1420 "(size %d)", inode->i_ino,
1421 (unsigned long long) logical,
1422 (unsigned long long) phys, len);
1423 - WARN_ON(1);
1424 return -EIO;
1425 }
1426 return 0;
1427 }
1428
1429 /*
1430 + * Return the number of contiguous dirty pages in a given inode
1431 + * starting at page frame idx.
1432 + */
1433 +static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx,
1434 + unsigned int max_pages)
1435 +{
1436 + struct address_space *mapping = inode->i_mapping;
1437 + pgoff_t index;
1438 + struct pagevec pvec;
1439 + pgoff_t num = 0;
1440 + int i, nr_pages, done = 0;
1441 +
1442 + if (max_pages == 0)
1443 + return 0;
1444 + pagevec_init(&pvec, 0);
1445 + while (!done) {
1446 + index = idx;
1447 + nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
1448 + PAGECACHE_TAG_DIRTY,
1449 + (pgoff_t)PAGEVEC_SIZE);
1450 + if (nr_pages == 0)
1451 + break;
1452 + for (i = 0; i < nr_pages; i++) {
1453 + struct page *page = pvec.pages[i];
1454 + struct buffer_head *bh, *head;
1455 +
1456 + lock_page(page);
1457 + if (unlikely(page->mapping != mapping) ||
1458 + !PageDirty(page) ||
1459 + PageWriteback(page) ||
1460 + page->index != idx) {
1461 + done = 1;
1462 + unlock_page(page);
1463 + break;
1464 + }
1465 + if (page_has_buffers(page)) {
1466 + bh = head = page_buffers(page);
1467 + do {
1468 + if (!buffer_delay(bh) &&
1469 + !buffer_unwritten(bh))
1470 + done = 1;
1471 + bh = bh->b_this_page;
1472 + } while (!done && (bh != head));
1473 + }
1474 + unlock_page(page);
1475 + if (done)
1476 + break;
1477 + idx++;
1478 + num++;
1479 + if (num >= max_pages)
1480 + break;
1481 + }
1482 + pagevec_release(&pvec);
1483 + }
1484 + return num;
1485 +}
1486 +
1487 +/*
1488 * The ext4_get_blocks() function tries to look up the requested blocks,
1489 * and returns if the blocks are already mapped.
1490 *
1491 @@ -1155,6 +1238,9 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
1492 clear_buffer_mapped(bh);
1493 clear_buffer_unwritten(bh);
1494
1495 + ext_debug("ext4_get_blocks(): inode %lu, flag %d, max_blocks %u,"
1496 + "logical block %lu\n", inode->i_ino, flags, max_blocks,
1497 + (unsigned long)block);
1498 /*
1499 * Try to see if we can get the block without requesting a new
1500 * file system block.
1501 @@ -1170,8 +1256,8 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
1502 up_read((&EXT4_I(inode)->i_data_sem));
1503
1504 if (retval > 0 && buffer_mapped(bh)) {
1505 - int ret = check_block_validity(inode, block,
1506 - bh->b_blocknr, retval);
1507 + int ret = check_block_validity(inode, "file system corruption",
1508 + block, bh->b_blocknr, retval);
1509 if (ret != 0)
1510 return ret;
1511 }
1512 @@ -1235,8 +1321,7 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
1513 * i_data's format changing. Force the migrate
1514 * to fail by clearing migrate flags
1515 */
1516 - EXT4_I(inode)->i_flags = EXT4_I(inode)->i_flags &
1517 - ~EXT4_EXT_MIGRATE;
1518 + EXT4_I(inode)->i_state &= ~EXT4_STATE_EXT_MIGRATE;
1519 }
1520 }
1521
1522 @@ -1252,8 +1337,9 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
1523
1524 up_write((&EXT4_I(inode)->i_data_sem));
1525 if (retval > 0 && buffer_mapped(bh)) {
1526 - int ret = check_block_validity(inode, block,
1527 - bh->b_blocknr, retval);
1528 + int ret = check_block_validity(inode, "file system "
1529 + "corruption after allocation",
1530 + block, bh->b_blocknr, retval);
1531 if (ret != 0)
1532 return ret;
1533 }
1534 @@ -1451,6 +1537,16 @@ static int do_journal_get_write_access(handle_t *handle,
1535 return ext4_journal_get_write_access(handle, bh);
1536 }
1537
1538 +/*
1539 + * Truncate blocks that were not used by write. We have to truncate the
1540 + * pagecache as well so that corresponding buffers get properly unmapped.
1541 + */
1542 +static void ext4_truncate_failed_write(struct inode *inode)
1543 +{
1544 + truncate_inode_pages(inode->i_mapping, inode->i_size);
1545 + ext4_truncate(inode);
1546 +}
1547 +
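The helper above drops the pagecache tail before truncating blocks, so no buffer keeps pointing at blocks that are about to be freed. A hedged userspace analogue of the same rollback idea after a failed extending write (plain POSIX, not the kernel routine; error handling trimmed):

#include <sys/types.h>
#include <unistd.h>

/* If an extending write comes up short, shrink the file back so no
 * half-written tail survives -- a userspace echo of the failed-write
 * cleanup above. */
static int write_or_rollback(int fd, const void *buf, size_t len, off_t pos)
{
        off_t old_size = lseek(fd, 0, SEEK_END);

        if (pwrite(fd, buf, len, pos) < (ssize_t)len) {
                ftruncate(fd, old_size);  /* roll back the extension */
                return -1;
        }
        return 0;
}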
1548 static int ext4_write_begin(struct file *file, struct address_space *mapping,
1549 loff_t pos, unsigned len, unsigned flags,
1550 struct page **pagep, void **fsdata)
1551 @@ -1516,7 +1612,7 @@ retry:
1552
1553 ext4_journal_stop(handle);
1554 if (pos + len > inode->i_size) {
1555 - ext4_truncate(inode);
1556 + ext4_truncate_failed_write(inode);
1557 /*
1558 * If truncate failed early the inode might
1559 * still be on the orphan list; we need to
1560 @@ -1626,7 +1722,7 @@ static int ext4_ordered_write_end(struct file *file,
1561 ret = ret2;
1562
1563 if (pos + len > inode->i_size) {
1564 - ext4_truncate(inode);
1565 + ext4_truncate_failed_write(inode);
1566 /*
1567 * If truncate failed early the inode might still be
1568 * on the orphan list; we need to make sure the inode
1569 @@ -1668,7 +1764,7 @@ static int ext4_writeback_write_end(struct file *file,
1570 ret = ret2;
1571
1572 if (pos + len > inode->i_size) {
1573 - ext4_truncate(inode);
1574 + ext4_truncate_failed_write(inode);
1575 /*
1576 * If truncate failed early the inode might still be
1577 * on the orphan list; we need to make sure the inode
1578 @@ -1731,7 +1827,7 @@ static int ext4_journalled_write_end(struct file *file,
1579 if (!ret)
1580 ret = ret2;
1581 if (pos + len > inode->i_size) {
1582 - ext4_truncate(inode);
1583 + ext4_truncate_failed_write(inode);
1584 /*
1585 * If truncate failed early the inode might still be
1586 * on the orphan list; we need to make sure the inode
1587 @@ -1776,11 +1872,11 @@ repeat:
1588
1589 if (ext4_claim_free_blocks(sbi, total)) {
1590 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
1591 + vfs_dq_release_reservation_block(inode, total);
1592 if (ext4_should_retry_alloc(inode->i_sb, &retries)) {
1593 yield();
1594 goto repeat;
1595 }
1596 - vfs_dq_release_reservation_block(inode, total);
1597 return -ENOSPC;
1598 }
1599 EXT4_I(inode)->i_reserved_data_blocks += nrblocks;
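The reordering above releases the quota reservation before every retry, not only on the final failure, so a long -ENOSPC retry loop can no longer leak reserved quota. A compact sketch of the corrected ordering; claim_blocks, release_blocks and should_retry are hypothetical stand-ins, not ext4 functions:

int claim_blocks(long total);           /* illustrative stubs */
void release_blocks(long total);
int should_retry(int *retries);

static int reserve_with_retry(long total)
{
        int retries = 0;

        while (claim_blocks(total) != 0) {
                release_blocks(total);  /* always undo before retrying */
                if (!should_retry(&retries))
                        return -1;      /* give up: -ENOSPC */
        }
        return 0;
}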
1600 @@ -1860,22 +1956,6 @@ static void ext4_da_page_release_reservation(struct page *page,
1601 }
1602
1603 /*
1604 - * Delayed allocation stuff
1605 - */
1606 -
1607 -struct mpage_da_data {
1608 - struct inode *inode;
1609 - sector_t b_blocknr; /* start block number of extent */
1610 - size_t b_size; /* size of extent */
1611 - unsigned long b_state; /* state of the extent */
1612 - unsigned long first_page, next_page; /* extent of pages */
1613 - struct writeback_control *wbc;
1614 - int io_done;
1615 - int pages_written;
1616 - int retval;
1617 -};
1618 -
1619 -/*
1620 * mpage_da_submit_io - walks through extent of pages and try to write
1621 * them with writepage() call back
1622 *
1623 @@ -2717,7 +2797,7 @@ static int ext4_da_writepages_trans_blocks(struct inode *inode)
1624 * number of contiguous block. So we will limit
1625 * number of contiguous block to a sane value
1626 */
1627 - if (!(inode->i_flags & EXT4_EXTENTS_FL) &&
1628 + if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) &&
1629 (max_blocks > EXT4_MAX_TRANS_DATA))
1630 max_blocks = EXT4_MAX_TRANS_DATA;
1631
1632 @@ -2735,8 +2815,11 @@ static int ext4_da_writepages(struct address_space *mapping,
1633 int no_nrwrite_index_update;
1634 int pages_written = 0;
1635 long pages_skipped;
1636 + unsigned int max_pages;
1637 int range_cyclic, cycled = 1, io_done = 0;
1638 - int needed_blocks, ret = 0, nr_to_writebump = 0;
1639 + int needed_blocks, ret = 0;
1640 + long desired_nr_to_write, nr_to_writebump = 0;
1641 + loff_t range_start = wbc->range_start;
1642 struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
1643
1644 trace_ext4_da_writepages(inode, wbc);
1645 @@ -2762,16 +2845,6 @@ static int ext4_da_writepages(struct address_space *mapping,
1646 if (unlikely(sbi->s_mount_flags & EXT4_MF_FS_ABORTED))
1647 return -EROFS;
1648
1649 - /*
1650 - * Make sure nr_to_write is >= sbi->s_mb_stream_request
1651 - * This make sure small files blocks are allocated in
1652 - * single attempt. This ensure that small files
1653 - * get less fragmented.
1654 - */
1655 - if (wbc->nr_to_write < sbi->s_mb_stream_request) {
1656 - nr_to_writebump = sbi->s_mb_stream_request - wbc->nr_to_write;
1657 - wbc->nr_to_write = sbi->s_mb_stream_request;
1658 - }
1659 if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
1660 range_whole = 1;
1661
1662 @@ -2786,6 +2859,36 @@ static int ext4_da_writepages(struct address_space *mapping,
1663 } else
1664 index = wbc->range_start >> PAGE_CACHE_SHIFT;
1665
1666 + /*
1667 + * This works around two forms of stupidity. The first is in
1668 + * the writeback code, which caps the maximum number of pages
1669 + * written to be 1024 pages. This is wrong on multiple
1670 + * levels; different architectues have a different page size,
1671 + * which changes the maximum amount of data which gets
1672 + * written. Secondly, 4 megabytes is way too small. XFS
1673 + * forces this value to be 16 megabytes by multiplying
1674 + * nr_to_write parameter by four, and then relies on its
1675 + * allocator to allocate larger extents to make them
1676 + * contiguous. Unfortunately this brings us to the second
1677 + * stupidity, which is that ext4's mballoc code only allocates
1678 + * at most 2048 blocks. So we force contiguous writes up to
1679 + * the number of dirty blocks in the inode, or
1693 	1680 + * sbi->s_max_writeback_mb_bump, whichever is smaller.
1681 + */
1682 + max_pages = sbi->s_max_writeback_mb_bump << (20 - PAGE_CACHE_SHIFT);
1683 + if (!range_cyclic && range_whole)
1684 + desired_nr_to_write = wbc->nr_to_write * 8;
1685 + else
1686 + desired_nr_to_write = ext4_num_dirty_pages(inode, index,
1687 + max_pages);
1688 + if (desired_nr_to_write > max_pages)
1689 + desired_nr_to_write = max_pages;
1690 +
1691 + if (wbc->nr_to_write < desired_nr_to_write) {
1692 + nr_to_writebump = desired_nr_to_write - wbc->nr_to_write;
1693 + wbc->nr_to_write = desired_nr_to_write;
1694 + }
1695 +
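The shift above converts megabytes to pages: one megabyte is 2^20 bytes, so dividing by a 2^PAGE_CACHE_SHIFT-byte page leaves a shift of (20 - PAGE_CACHE_SHIFT). Worked with assumed values (4 KiB pages and a 128 MB bump; neither number is taken from this patch):

#include <stdio.h>

int main(void)
{
        unsigned int page_shift = 12;    /* 4 KiB pages (assumed) */
        unsigned int mb_bump = 128;      /* illustrative bump value */
        unsigned int max_pages = mb_bump << (20 - page_shift);

        /* 128 << 8 == 32768 pages, i.e. 128 MB of 4 KiB pages */
        printf("max_pages = %u\n", max_pages);
        return 0;
}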
1696 mpd.wbc = wbc;
1697 mpd.inode = mapping->host;
1698
1699 @@ -2904,7 +3007,9 @@ retry:
1700 out_writepages:
1701 if (!no_nrwrite_index_update)
1702 wbc->no_nrwrite_index_update = 0;
1703 - wbc->nr_to_write -= nr_to_writebump;
1704 + if (wbc->nr_to_write > nr_to_writebump)
1705 + wbc->nr_to_write -= nr_to_writebump;
1706 + wbc->range_start = range_start;
1707 trace_ext4_da_writepages_result(inode, wbc, ret, pages_written);
1708 return ret;
1709 }
1710 @@ -2994,7 +3099,7 @@ retry:
1711 * i_size_read because we hold i_mutex.
1712 */
1713 if (pos + len > inode->i_size)
1714 - ext4_truncate(inode);
1715 + ext4_truncate_failed_write(inode);
1716 }
1717
1718 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
1719 @@ -3259,6 +3364,8 @@ static int ext4_releasepage(struct page *page, gfp_t wait)
1720 }
1721
1722 /*
1723 + * O_DIRECT for ext3 (or indirect map) based files
1724 + *
1725 * If the O_DIRECT write will extend the file then add this inode to the
1726 * orphan list. So recovery will truncate it back to the original size
1727 * if the machine crashes during the write.
1728 @@ -3267,7 +3374,7 @@ static int ext4_releasepage(struct page *page, gfp_t wait)
1729 * crashes then stale disk data _may_ be exposed inside the file. But current
1730 * VFS code falls back into buffered path in that case so we are safe.
1731 */
1732 -static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
1733 +static ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb,
1734 const struct iovec *iov, loff_t offset,
1735 unsigned long nr_segs)
1736 {
1737 @@ -3278,6 +3385,7 @@ static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
1738 ssize_t ret;
1739 int orphan = 0;
1740 size_t count = iov_length(iov, nr_segs);
1741 + int retries = 0;
1742
1743 if (rw == WRITE) {
1744 loff_t final_size = offset + count;
1745 @@ -3300,9 +3408,12 @@ static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
1746 }
1747 }
1748
1749 +retry:
1750 ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
1751 offset, nr_segs,
1752 ext4_get_block, NULL);
1753 + if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
1754 + goto retry;
1755
1756 if (orphan) {
1757 int err;
1758 @@ -3341,6 +3452,364 @@ out:
1759 return ret;
1760 }
1761
1762 +static int ext4_get_block_dio_write(struct inode *inode, sector_t iblock,
1763 + struct buffer_head *bh_result, int create)
1764 +{
1765 + handle_t *handle = NULL;
1766 + int ret = 0;
1767 + unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
1768 + int dio_credits;
1769 +
1770 + ext4_debug("ext4_get_block_dio_write: inode %lu, create flag %d\n",
1771 + inode->i_ino, create);
1772 + /*
1786 	1773 + * The DIO VFS code passes create = 0 for writes to the
1787 	1774 + * middle of the file. It does this to avoid block
1788 	1775 + * allocation for holes, so that stale data is not exposed
1789 	1776 + * when a parallel buffered read (which does not hold the
1790 	1777 + * i_mutex lock) runs while the direct IO write has not
1791 	1778 + * completed. DIO requests on holes therefore fall back
1792 	1779 + * to buffered IO.
1793 	1780 + *
1794 	1781 + * For ext4 extent-based files we support fallocate, and
1795 	1782 + * newly allocated extents are marked uninitialized; so we
1796 	1783 + * can fallocate blocks for holes, and a parallel buffered
1797 	1784 + * read will zero out the page when it reads a hole whose
1798 	1785 + * parallel DIO write has not yet completed.
1799 	1786 + *
1800 	1787 + * When we get here, we know it's a direct IO write to
1801 	1788 + * the middle of the file (< i_size),
1802 	1789 + * so it's safe to override the create flag from the VFS.
1790 + */
1791 + create = EXT4_GET_BLOCKS_DIO_CREATE_EXT;
1792 +
1793 + if (max_blocks > DIO_MAX_BLOCKS)
1794 + max_blocks = DIO_MAX_BLOCKS;
1795 + dio_credits = ext4_chunk_trans_blocks(inode, max_blocks);
1796 + handle = ext4_journal_start(inode, dio_credits);
1797 + if (IS_ERR(handle)) {
1798 + ret = PTR_ERR(handle);
1799 + goto out;
1800 + }
1801 + ret = ext4_get_blocks(handle, inode, iblock, max_blocks, bh_result,
1802 + create);
1803 + if (ret > 0) {
1804 + bh_result->b_size = (ret << inode->i_blkbits);
1805 + ret = 0;
1806 + }
1807 + ext4_journal_stop(handle);
1808 +out:
1809 + return ret;
1810 +}
1811 +
1812 +static void ext4_free_io_end(ext4_io_end_t *io)
1813 +{
1814 + BUG_ON(!io);
1815 + iput(io->inode);
1816 + kfree(io);
1817 +}
1818 +static void dump_aio_dio_list(struct inode * inode)
1819 +{
1820 +#ifdef EXT4_DEBUG
1821 + struct list_head *cur, *before, *after;
1822 + ext4_io_end_t *io, *io0, *io1;
1823 +
1824 + if (list_empty(&EXT4_I(inode)->i_aio_dio_complete_list)){
1825 + ext4_debug("inode %lu aio dio list is empty\n", inode->i_ino);
1826 + return;
1827 + }
1828 +
1829 + ext4_debug("Dump inode %lu aio_dio_completed_IO list \n", inode->i_ino);
1830 + list_for_each_entry(io, &EXT4_I(inode)->i_aio_dio_complete_list, list){
1831 + cur = &io->list;
1832 + before = cur->prev;
1833 + io0 = container_of(before, ext4_io_end_t, list);
1834 + after = cur->next;
1835 + io1 = container_of(after, ext4_io_end_t, list);
1836 +
1837 + ext4_debug("io 0x%p from inode %lu,prev 0x%p,next 0x%p\n",
1838 + io, inode->i_ino, io0, io1);
1839 + }
1840 +#endif
1841 +}
1842 +
1843 +/*
1844 + * check a range of space and convert unwritten extents to written.
1845 + */
1846 +static int ext4_end_aio_dio_nolock(ext4_io_end_t *io)
1847 +{
1848 + struct inode *inode = io->inode;
1849 + loff_t offset = io->offset;
1850 + size_t size = io->size;
1851 + int ret = 0;
1852 +
1866 	1853 + ext4_debug("end_aio_dio_nolock: io 0x%p from inode %lu, list->next 0x%p,"
1854 + "list->prev 0x%p\n",
1855 + io, inode->i_ino, io->list.next, io->list.prev);
1856 +
1857 + if (list_empty(&io->list))
1858 + return ret;
1859 +
1860 + if (io->flag != DIO_AIO_UNWRITTEN)
1861 + return ret;
1862 +
1863 + if (offset + size <= i_size_read(inode))
1864 + ret = ext4_convert_unwritten_extents(inode, offset, size);
1865 +
1866 + if (ret < 0) {
1880 	1867 + printk(KERN_EMERG "%s: failed to convert unwritten "
1868 + "extents to written extents, error is %d"
1869 + " io is still on inode %lu aio dio list\n",
1870 + __func__, ret, inode->i_ino);
1871 + return ret;
1872 + }
1873 +
1874 + /* clear the DIO AIO unwritten flag */
1875 + io->flag = 0;
1876 + return ret;
1877 +}
1878 +/*
1879 + * work on completed aio dio IO, to convert unwritten extents to extents
1880 + */
1881 +static void ext4_end_aio_dio_work(struct work_struct *work)
1882 +{
1883 + ext4_io_end_t *io = container_of(work, ext4_io_end_t, work);
1884 + struct inode *inode = io->inode;
1885 + int ret = 0;
1886 +
1887 + mutex_lock(&inode->i_mutex);
1888 + ret = ext4_end_aio_dio_nolock(io);
1889 + if (ret >= 0) {
1890 + if (!list_empty(&io->list))
1891 + list_del_init(&io->list);
1892 + ext4_free_io_end(io);
1893 + }
1894 + mutex_unlock(&inode->i_mutex);
1895 +}
1896 +/*
1897 + * This function is called from ext4_sync_file().
1898 + *
1899 + * When AIO DIO IO is completed, the work to convert unwritten
1900 + * extents to written is queued on workqueue but may not get immediately
1901 + * scheduled. When fsync is called, we need to ensure the
1902 + * conversion is complete before fsync returns.
1903 + * The inode keeps track of a list of completed AIO from DIO path
1917 	1904 + * that might need the conversion. This function walks through
1918 	1905 + * the list and converts the related unwritten extents to written.
1906 + */
1907 +int flush_aio_dio_completed_IO(struct inode *inode)
1908 +{
1909 + ext4_io_end_t *io;
1910 + int ret = 0;
1911 + int ret2 = 0;
1912 +
1913 + if (list_empty(&EXT4_I(inode)->i_aio_dio_complete_list))
1914 + return ret;
1915 +
1916 + dump_aio_dio_list(inode);
1917 + while (!list_empty(&EXT4_I(inode)->i_aio_dio_complete_list)){
1918 + io = list_entry(EXT4_I(inode)->i_aio_dio_complete_list.next,
1919 + ext4_io_end_t, list);
1920 + /*
1921 + * Calling ext4_end_aio_dio_nolock() to convert completed
1922 + * IO to written.
1923 + *
1937 	1924 + * When ext4_sync_file() is called, run_queue() may already
1938 	1925 + * be about to flush the work corresponding to this io
1939 	1926 + * structure, and it would be upset to find that the io
1940 	1927 + * structure for the work it is about to schedule has been
1941 	1928 + * freed.
1942 	1929 + *
1943 	1930 + * Thus we need to keep the io structure valid here after the
1944 	1931 + * conversion has finished. The io structure has a flag to
1945 	1932 + * avoid converting twice, from both fsync and the workqueue.
1933 + */
1934 + ret = ext4_end_aio_dio_nolock(io);
1935 + if (ret < 0)
1936 + ret2 = ret;
1937 + else
1938 + list_del_init(&io->list);
1939 + }
1940 + return (ret2 < 0) ? ret2 : 0;
1941 +}
1942 +
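From userspace, the guarantee flush_aio_dio_completed_IO() provides is simply that fsync() does not return before pending unwritten-extent conversions finish. A minimal POSIX illustration of the ordering an application relies on (O_DIRECT alignment rules glossed over; this is a sketch, not a test case):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
        void *buf;
        int fd = open("data", O_WRONLY | O_CREAT | O_DIRECT, 0644);

        if (fd < 0 || posix_memalign(&buf, 4096, 4096))
                return 1;
        memset(buf, 0, 4096);
        pwrite(fd, buf, 4096, 0);  /* may leave extents unwritten */
        fsync(fd);                 /* must not return before conversion */
        close(fd);
        return 0;
}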
1943 +static ext4_io_end_t *ext4_init_io_end (struct inode *inode)
1944 +{
1945 + ext4_io_end_t *io = NULL;
1946 +
1947 + io = kmalloc(sizeof(*io), GFP_NOFS);
1948 +
1949 + if (io) {
1950 + igrab(inode);
1951 + io->inode = inode;
1952 + io->flag = 0;
1953 + io->offset = 0;
1954 + io->size = 0;
1955 + io->error = 0;
1956 + INIT_WORK(&io->work, ext4_end_aio_dio_work);
1957 + INIT_LIST_HEAD(&io->list);
1958 + }
1959 +
1960 + return io;
1961 +}
1962 +
1963 +static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
1964 + ssize_t size, void *private)
1965 +{
1966 + ext4_io_end_t *io_end = iocb->private;
1967 + struct workqueue_struct *wq;
1968 +
1969 + /* if not async direct IO or dio with 0 bytes write, just return */
1970 + if (!io_end || !size)
1971 + return;
1972 +
1986 	1973 + ext_debug("ext4_end_io_dio(): io_end 0x%p "
1974 + "for inode %lu, iocb 0x%p, offset %llu, size %llu\n",
1975 + iocb->private, io_end->inode->i_ino, iocb, offset,
1976 + size);
1977 +
1978 + /* if not aio dio with unwritten extents, just free io and return */
1979 + if (io_end->flag != DIO_AIO_UNWRITTEN){
1980 + ext4_free_io_end(io_end);
1981 + iocb->private = NULL;
1982 + return;
1983 + }
1984 +
1985 + io_end->offset = offset;
1986 + io_end->size = size;
1987 + wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq;
1988 +
1989 + /* queue the work to convert unwritten extents to written */
1990 + queue_work(wq, &io_end->work);
1991 +
1992 + /* Add the io_end to per-inode completed aio dio list*/
1993 + list_add_tail(&io_end->list,
1994 + &EXT4_I(io_end->inode)->i_aio_dio_complete_list);
1995 + iocb->private = NULL;
1996 +}
1997 +/*
2011 	1998 + * For ext4 extent files, ext4 will do direct-io writes to holes and
2012 	1999 + * preallocated extents; there is no need to fall back to buffered
2013 	2000 + * IO for those.
2014 	2001 + *
2015 	2002 + * For holes, we fallocate those blocks and mark them as uninitialized.
2016 	2003 + * If those blocks were preallocated, we make sure they are split, but
2017 	2004 + * still keep the range to write as uninitialized.
2018 	2005 + *
2019 	2006 + * The unwritten extents will be converted to written when DIO completes.
2020 	2007 + * For async direct IO, since the IO may still be pending when we return,
2021 	2008 + * we set up an end_io callback function, which will do the conversion
2022 	2009 + * when the async direct IO has completed.
2010 + *
2011 + * If the O_DIRECT write will extend the file then add this inode to the
2012 + * orphan list. So recovery will truncate it back to the original size
2013 + * if the machine crashes during the write.
2014 + *
2015 + */
2016 +static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
2017 + const struct iovec *iov, loff_t offset,
2018 + unsigned long nr_segs)
2019 +{
2020 + struct file *file = iocb->ki_filp;
2021 + struct inode *inode = file->f_mapping->host;
2022 + ssize_t ret;
2023 + size_t count = iov_length(iov, nr_segs);
2024 +
2025 + loff_t final_size = offset + count;
2026 + if (rw == WRITE && final_size <= inode->i_size) {
2027 + /*
2028 + * We could direct write to holes and fallocate.
2029 + *
2030 + * Allocated blocks to fill the hole are marked as uninitialized
2044 	2031 + * to prevent a parallel buffered read from exposing stale data
2045 	2032 + * before DIO has completed the data IO.
2033 + *
2034 + * As to previously fallocated extents, ext4 get_block
2035 + * will just simply mark the buffer mapped but still
2036 + * keep the extents uninitialized.
2037 + *
2051 	2038 + * For the non-AIO case, we convert those unwritten extents
2052 	2039 + * to written after returning from blockdev_direct_IO.
2053 	2040 + *
2054 	2041 + * For async DIO, the conversion needs to be deferred until
2055 	2042 + * the IO has completed. The ext4 end_io callback function
2056 	2043 + * will be called to take care of the conversion work.
2057 	2044 + * Here, for the async case, we allocate an io_end structure
2058 	2045 + * to hook to the iocb.
2046 + */
2047 + iocb->private = NULL;
2048 + EXT4_I(inode)->cur_aio_dio = NULL;
2049 + if (!is_sync_kiocb(iocb)) {
2050 + iocb->private = ext4_init_io_end(inode);
2051 + if (!iocb->private)
2052 + return -ENOMEM;
2053 + /*
2067 	2054 + * We save the io structure for the current async
2068 	2055 + * direct IO, so that a later ext4_get_blocks()
2069 	2056 + * can flag in the io structure whether there
2070 	2057 + * are unwritten extents that need to be converted
2071 	2058 + * when the IO is completed.
2059 + */
2060 + EXT4_I(inode)->cur_aio_dio = iocb->private;
2061 + }
2062 +
2063 + ret = blockdev_direct_IO(rw, iocb, inode,
2064 + inode->i_sb->s_bdev, iov,
2065 + offset, nr_segs,
2066 + ext4_get_block_dio_write,
2067 + ext4_end_io_dio);
2068 + if (iocb->private)
2069 + EXT4_I(inode)->cur_aio_dio = NULL;
2070 + /*
2084 	2071 + * The io_end structure takes a reference to the inode;
2085 	2072 + * that structure needs to be destroyed and the
2086 	2073 + * reference to the inode dropped when the IO is
2087 	2074 + * complete, even for a 0 byte write or a failure.
2088 	2075 + *
2089 	2076 + * In the successful AIO DIO case, the io_end structure will be
2090 	2077 + * destroyed and the reference to the inode will be dropped
2091 	2078 + * after the end_io callback function is called.
2092 	2079 + *
2093 	2080 + * In the 0 byte write case, or the error case, since the
2094 	2081 + * VFS direct IO won't invoke the end_io callback function,
2095 	2082 + * we need to free the end_io structure here.
2083 + */
2084 + if (ret != -EIOCBQUEUED && ret <= 0 && iocb->private) {
2085 + ext4_free_io_end(iocb->private);
2086 + iocb->private = NULL;
2087 + } else if (ret > 0 && (EXT4_I(inode)->i_state &
2088 + EXT4_STATE_DIO_UNWRITTEN)) {
2089 + int err;
2090 + /*
2104 	2091 + * For the non-AIO case, since the IO is already
2105 	2092 + * completed, we can do the conversion right here.
2093 + */
2094 + err = ext4_convert_unwritten_extents(inode,
2095 + offset, ret);
2096 + if (err < 0)
2097 + ret = err;
2098 + EXT4_I(inode)->i_state &= ~EXT4_STATE_DIO_UNWRITTEN;
2099 + }
2100 + return ret;
2101 + }
2102 +
2116 	2103 + /* for writes past the end of the file, we fall back to the old way */
2104 + return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs);
2105 +}
2106 +
2107 +static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
2108 + const struct iovec *iov, loff_t offset,
2109 + unsigned long nr_segs)
2110 +{
2111 + struct file *file = iocb->ki_filp;
2112 + struct inode *inode = file->f_mapping->host;
2113 +
2114 + if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)
2115 + return ext4_ext_direct_IO(rw, iocb, iov, offset, nr_segs);
2116 +
2117 + return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs);
2118 +}
2119 +
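The new dispatcher sends extent-mapped files through the async-capable path and everything else through the indirect path. An async O_DIRECT write as an application would issue it, using libaio (assumed available; link with -laio); once io_getevents() reports completion, the kernel's end_io callback has fired and queued any unwritten-extent conversion, and an fsync() would force it to finish:

#define _GNU_SOURCE
#include <fcntl.h>
#include <libaio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
        io_context_t ctx = 0;
        struct iocb cb, *cbs[1] = { &cb };
        struct io_event ev;
        void *buf;
        int fd = open("data", O_WRONLY | O_CREAT | O_DIRECT, 0644);

        if (fd < 0 || posix_memalign(&buf, 4096, 4096) || io_setup(1, &ctx))
                return 1;
        memset(buf, 0, 4096);
        io_prep_pwrite(&cb, fd, buf, 4096, 0);
        if (io_submit(ctx, 1, cbs) != 1)
                return 1;
        io_getevents(ctx, 1, 1, &ev, NULL);  /* wait for the completion */
        close(fd);
        return 0;
}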
2120 /*
2121 * Pages can be marked dirty completely asynchronously from ext4's journalling
2122 * activity. By filemap_sync_pte(), try_to_unmap_one(), etc. We cannot do
2123 @@ -3653,13 +4122,16 @@ static void ext4_clear_blocks(handle_t *handle, struct inode *inode,
2124 __le32 *last)
2125 {
2126 __le32 *p;
2127 + int is_metadata = S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode);
2128 +
2129 if (try_to_extend_transaction(handle, inode)) {
2130 if (bh) {
2131 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
2132 ext4_handle_dirty_metadata(handle, inode, bh);
2133 }
2134 ext4_mark_inode_dirty(handle, inode);
2135 - ext4_journal_test_restart(handle, inode);
2136 + ext4_truncate_restart_trans(handle, inode,
2137 + blocks_for_truncate(inode));
2138 if (bh) {
2139 BUFFER_TRACE(bh, "retaking write access");
2140 ext4_journal_get_write_access(handle, bh);
2141 @@ -3682,11 +4154,11 @@ static void ext4_clear_blocks(handle_t *handle, struct inode *inode,
2142
2143 *p = 0;
2144 tbh = sb_find_get_block(inode->i_sb, nr);
2145 - ext4_forget(handle, 0, inode, tbh, nr);
2146 + ext4_forget(handle, is_metadata, inode, tbh, nr);
2147 }
2148 }
2149
2150 - ext4_free_blocks(handle, inode, block_to_free, count, 0);
2151 + ext4_free_blocks(handle, inode, block_to_free, count, is_metadata);
2152 }
2153
2154 /**
2155 @@ -3870,7 +4342,8 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
2156 return;
2157 if (try_to_extend_transaction(handle, inode)) {
2158 ext4_mark_inode_dirty(handle, inode);
2159 - ext4_journal_test_restart(handle, inode);
2160 + ext4_truncate_restart_trans(handle, inode,
2161 + blocks_for_truncate(inode));
2162 }
2163
2164 ext4_free_blocks(handle, inode, nr, 1, 1);
2165 @@ -3958,8 +4431,7 @@ void ext4_truncate(struct inode *inode)
2166 if (!ext4_can_truncate(inode))
2167 return;
2168
2169 - if (ei->i_disksize && inode->i_size == 0 &&
2170 - !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC))
2171 + if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC))
2172 ei->i_state |= EXT4_STATE_DA_ALLOC_CLOSE;
2173
2174 if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) {
2175 @@ -4313,8 +4785,8 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
2176 struct ext4_iloc iloc;
2177 struct ext4_inode *raw_inode;
2178 struct ext4_inode_info *ei;
2179 - struct buffer_head *bh;
2180 struct inode *inode;
2181 + journal_t *journal = EXT4_SB(sb)->s_journal;
2182 long ret;
2183 int block;
2184
2185 @@ -4325,11 +4797,11 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
2186 return inode;
2187
2188 ei = EXT4_I(inode);
2189 + iloc.bh = 0;
2190
2191 ret = __ext4_get_inode_loc(inode, &iloc, 0);
2192 if (ret < 0)
2193 goto bad_inode;
2194 - bh = iloc.bh;
2195 raw_inode = ext4_raw_inode(&iloc);
2196 inode->i_mode = le16_to_cpu(raw_inode->i_mode);
2197 inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low);
2198 @@ -4352,7 +4824,6 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
2199 if (inode->i_mode == 0 ||
2200 !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) {
2201 /* this inode is deleted */
2202 - brelse(bh);
2203 ret = -ESTALE;
2204 goto bad_inode;
2205 }
2206 @@ -4380,11 +4851,35 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
2207 ei->i_data[block] = raw_inode->i_block[block];
2208 INIT_LIST_HEAD(&ei->i_orphan);
2209
2210 + /*
2224 	2211 + * Set the transaction ids of transactions that have to be committed
2225 	2212 + * to finish f[data]sync. We set them to the currently running transaction,
2213 + * as we cannot be sure that the inode or some of its metadata isn't
2214 + * part of the transaction - the inode could have been reclaimed and
2215 + * now it is reread from disk.
2216 + */
2217 + if (journal) {
2218 + transaction_t *transaction;
2219 + tid_t tid;
2220 +
2221 + spin_lock(&journal->j_state_lock);
2222 + if (journal->j_running_transaction)
2223 + transaction = journal->j_running_transaction;
2224 + else
2225 + transaction = journal->j_committing_transaction;
2226 + if (transaction)
2227 + tid = transaction->t_tid;
2228 + else
2229 + tid = journal->j_commit_sequence;
2230 + spin_unlock(&journal->j_state_lock);
2231 + ei->i_sync_tid = tid;
2232 + ei->i_datasync_tid = tid;
2233 + }
2234 +
2235 if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
2236 ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize);
2237 if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize >
2238 EXT4_INODE_SIZE(inode->i_sb)) {
2239 - brelse(bh);
2240 ret = -EIO;
2241 goto bad_inode;
2242 }
2243 @@ -4416,10 +4911,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
2244
2245 ret = 0;
2246 if (ei->i_file_acl &&
2247 - ((ei->i_file_acl <
2248 - (le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block) +
2249 - EXT4_SB(sb)->s_gdb_count)) ||
2250 - (ei->i_file_acl >= ext4_blocks_count(EXT4_SB(sb)->s_es)))) {
2251 + !ext4_data_block_valid(EXT4_SB(sb), ei->i_file_acl, 1)) {
2252 ext4_error(sb, __func__,
2253 "bad extended attribute block %llu in inode #%lu",
2254 ei->i_file_acl, inode->i_ino);
2255 @@ -4437,10 +4929,8 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
2256 /* Validate block references which are part of inode */
2257 ret = ext4_check_inode_blockref(inode);
2258 }
2259 - if (ret) {
2260 - brelse(bh);
2261 + if (ret)
2262 goto bad_inode;
2263 - }
2264
2265 if (S_ISREG(inode->i_mode)) {
2266 inode->i_op = &ext4_file_inode_operations;
2267 @@ -4468,7 +4958,6 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
2268 init_special_inode(inode, inode->i_mode,
2269 new_decode_dev(le32_to_cpu(raw_inode->i_block[1])));
2270 } else {
2271 - brelse(bh);
2272 ret = -EIO;
2273 ext4_error(inode->i_sb, __func__,
2274 "bogus i_mode (%o) for inode=%lu",
2275 @@ -4481,6 +4970,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
2276 return inode;
2277
2278 bad_inode:
2279 + brelse(iloc.bh);
2280 iget_failed(inode);
2281 return ERR_PTR(ret);
2282 }
2283 @@ -4581,8 +5071,7 @@ static int ext4_do_update_inode(handle_t *handle,
2284 if (ext4_inode_blocks_set(handle, raw_inode, ei))
2285 goto out_brelse;
2286 raw_inode->i_dtime = cpu_to_le32(ei->i_dtime);
2287 - /* clear the migrate flag in the raw_inode */
2288 - raw_inode->i_flags = cpu_to_le32(ei->i_flags & ~EXT4_EXT_MIGRATE);
2289 + raw_inode->i_flags = cpu_to_le32(ei->i_flags);
2290 if (EXT4_SB(inode->i_sb)->s_es->s_creator_os !=
2291 cpu_to_le32(EXT4_OS_HURD))
2292 raw_inode->i_file_acl_high =
2293 @@ -4641,6 +5130,7 @@ static int ext4_do_update_inode(handle_t *handle,
2294 err = rc;
2295 ei->i_state &= ~EXT4_STATE_NEW;
2296
2297 + ext4_update_inode_fsync_trans(handle, inode, 0);
2298 out_brelse:
2299 brelse(bh);
2300 ext4_std_error(inode->i_sb, err);
2301 @@ -4684,19 +5174,40 @@ out_brelse:
2302 */
2303 int ext4_write_inode(struct inode *inode, int wait)
2304 {
2305 + int err;
2306 +
2307 if (current->flags & PF_MEMALLOC)
2308 return 0;
2309
2310 - if (ext4_journal_current_handle()) {
2311 - jbd_debug(1, "called recursively, non-PF_MEMALLOC!\n");
2312 - dump_stack();
2313 - return -EIO;
2314 - }
2315 + if (EXT4_SB(inode->i_sb)->s_journal) {
2316 + if (ext4_journal_current_handle()) {
2317 + jbd_debug(1, "called recursively, non-PF_MEMALLOC!\n");
2318 + dump_stack();
2319 + return -EIO;
2320 + }
2321
2322 - if (!wait)
2323 - return 0;
2324 + if (!wait)
2325 + return 0;
2326 +
2327 + err = ext4_force_commit(inode->i_sb);
2328 + } else {
2329 + struct ext4_iloc iloc;
2330
2331 - return ext4_force_commit(inode->i_sb);
2332 + err = ext4_get_inode_loc(inode, &iloc);
2333 + if (err)
2334 + return err;
2335 + if (wait)
2336 + sync_dirty_buffer(iloc.bh);
2337 + if (buffer_req(iloc.bh) && !buffer_uptodate(iloc.bh)) {
2338 + ext4_error(inode->i_sb, __func__,
2339 + "IO error syncing inode, "
2340 + "inode=%lu, block=%llu",
2341 + inode->i_ino,
2342 + (unsigned long long)iloc.bh->b_blocknr);
2343 + err = -EIO;
2344 + }
2345 + }
2346 + return err;
2347 }
2348
2349 /*
2350 @@ -4739,8 +5250,8 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
2351
2352 /* (user+group)*(old+new) structure, inode write (sb,
2353 * inode block, ? - but truncate inode update has it) */
2354 - handle = ext4_journal_start(inode, 2*(EXT4_QUOTA_INIT_BLOCKS(inode->i_sb)+
2355 - EXT4_QUOTA_DEL_BLOCKS(inode->i_sb))+3);
2356 + handle = ext4_journal_start(inode, (EXT4_MAXQUOTAS_INIT_BLOCKS(inode->i_sb)+
2357 + EXT4_MAXQUOTAS_DEL_BLOCKS(inode->i_sb))+3);
2358 if (IS_ERR(handle)) {
2359 error = PTR_ERR(handle);
2360 goto err_out;
2361 @@ -5137,24 +5648,13 @@ void ext4_dirty_inode(struct inode *inode)
2362 handle_t *current_handle = ext4_journal_current_handle();
2363 handle_t *handle;
2364
2365 - if (!ext4_handle_valid(current_handle)) {
2366 - ext4_mark_inode_dirty(current_handle, inode);
2367 - return;
2368 - }
2369 -
2370 handle = ext4_journal_start(inode, 2);
2371 if (IS_ERR(handle))
2372 goto out;
2373 - if (current_handle &&
2374 - current_handle->h_transaction != handle->h_transaction) {
2375 - /* This task has a transaction open against a different fs */
2376 - printk(KERN_EMERG "%s: transactions do not match!\n",
2377 - __func__);
2378 - } else {
2379 - jbd_debug(5, "marking dirty. outer handle=%p\n",
2380 - current_handle);
2381 - ext4_mark_inode_dirty(handle, inode);
2382 - }
2383 +
2384 + jbd_debug(5, "marking dirty. outer handle=%p\n", current_handle);
2385 + ext4_mark_inode_dirty(handle, inode);
2386 +
2387 ext4_journal_stop(handle);
2388 out:
2389 return;
2390 @@ -5281,12 +5781,21 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
2391 else
2392 len = PAGE_CACHE_SIZE;
2393
2394 + lock_page(page);
2395 + /*
2409 	2396 + * Return if we have all the buffers mapped. This avoids
2410 	2397 + * the need to call write_begin/write_end, which does a
2411 	2398 + * journal_start/journal_stop that can block and take a
2412 	2399 + * long time.
2400 + */
2401 if (page_has_buffers(page)) {
2402 - /* return if we have all the buffers mapped */
2403 if (!walk_page_buffers(NULL, page_buffers(page), 0, len, NULL,
2404 - ext4_bh_unmapped))
2405 + ext4_bh_unmapped)) {
2406 + unlock_page(page);
2407 goto out_unlock;
2408 + }
2409 }
2410 + unlock_page(page);
2411 /*
2412 * OK, we need to fill the hole... Do write_begin write_end
2426 	2413 * to do block allocation/reservation. We are not holding
2414 diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
2415 index 7050a9c..b63d193 100644
2416 --- a/fs/ext4/ioctl.c
2417 +++ b/fs/ext4/ioctl.c
2418 @@ -221,32 +221,38 @@ setversion_out:
2419 struct file *donor_filp;
2420 int err;
2421
2422 + if (!(filp->f_mode & FMODE_READ) ||
2423 + !(filp->f_mode & FMODE_WRITE))
2424 + return -EBADF;
2425 +
2426 if (copy_from_user(&me,
2427 (struct move_extent __user *)arg, sizeof(me)))
2428 return -EFAULT;
2429 + me.moved_len = 0;
2430
2431 donor_filp = fget(me.donor_fd);
2432 if (!donor_filp)
2433 return -EBADF;
2434
2435 - if (!capable(CAP_DAC_OVERRIDE)) {
2436 - if ((current->real_cred->fsuid != inode->i_uid) ||
2437 - !(inode->i_mode & S_IRUSR) ||
2438 - !(donor_filp->f_dentry->d_inode->i_mode &
2439 - S_IRUSR)) {
2440 - fput(donor_filp);
2441 - return -EACCES;
2442 - }
2443 + if (!(donor_filp->f_mode & FMODE_WRITE)) {
2444 + err = -EBADF;
2445 + goto mext_out;
2446 }
2447
2448 + err = mnt_want_write(filp->f_path.mnt);
2449 + if (err)
2450 + goto mext_out;
2451 +
2452 err = ext4_move_extents(filp, donor_filp, me.orig_start,
2453 me.donor_start, me.len, &me.moved_len);
2454 - fput(donor_filp);
2455 + mnt_drop_write(filp->f_path.mnt);
2456 + if (me.moved_len > 0)
2457 + file_remove_suid(donor_filp);
2458
2459 - if (!err)
2460 - if (copy_to_user((struct move_extent *)arg,
2461 - &me, sizeof(me)))
2462 - return -EFAULT;
2463 + if (copy_to_user((struct move_extent *)arg, &me, sizeof(me)))
2464 + err = -EFAULT;
2465 +mext_out:
2466 + fput(donor_filp);
2467 return err;
2468 }
2469
2470 diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
2471 index cd25846..099fd47 100644
2472 --- a/fs/ext4/mballoc.c
2473 +++ b/fs/ext4/mballoc.c
2474 @@ -908,6 +908,97 @@ out:
2475 return err;
2476 }
2477
2478 +static noinline_for_stack
2479 +int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
2480 +{
2481 +
2482 + int ret = 0;
2483 + void *bitmap;
2484 + int blocks_per_page;
2485 + int block, pnum, poff;
2486 + int num_grp_locked = 0;
2487 + struct ext4_group_info *this_grp;
2488 + struct ext4_sb_info *sbi = EXT4_SB(sb);
2489 + struct inode *inode = sbi->s_buddy_cache;
2490 + struct page *page = NULL, *bitmap_page = NULL;
2491 +
2492 + mb_debug("init group %lu\n", group);
2493 + blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
2494 + this_grp = ext4_get_group_info(sb, group);
2495 + /*
2496 + * This ensures we don't add group
2497 + * to this buddy cache via resize
2498 + */
2499 + num_grp_locked = ext4_mb_get_buddy_cache_lock(sb, group);
2500 + if (!EXT4_MB_GRP_NEED_INIT(this_grp)) {
2501 + /*
2502 + * somebody initialized the group
2503 + * return without doing anything
2504 + */
2505 + ret = 0;
2506 + goto err;
2507 + }
2508 + /*
2509 + * the buddy cache inode stores the block bitmap
2510 + * and buddy information in consecutive blocks.
2511 + * So for each group we need two blocks.
2512 + */
2513 + block = group * 2;
2514 + pnum = block / blocks_per_page;
2515 + poff = block % blocks_per_page;
2516 + page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
2517 + if (page) {
2518 + BUG_ON(page->mapping != inode->i_mapping);
2519 + ret = ext4_mb_init_cache(page, NULL);
2520 + if (ret) {
2521 + unlock_page(page);
2522 + goto err;
2523 + }
2524 + unlock_page(page);
2525 + }
2526 + if (page == NULL || !PageUptodate(page)) {
2527 + ret = -EIO;
2528 + goto err;
2529 + }
2530 + mark_page_accessed(page);
2531 + bitmap_page = page;
2532 + bitmap = page_address(page) + (poff * sb->s_blocksize);
2533 +
2534 + /* init buddy cache */
2535 + block++;
2536 + pnum = block / blocks_per_page;
2537 + poff = block % blocks_per_page;
2538 + page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
2539 + if (page == bitmap_page) {
2540 + /*
2541 + * If both the bitmap and buddy are in
2542 + * the same page we don't need to force
2543 + * init the buddy
2544 + */
2545 + unlock_page(page);
2546 + } else if (page) {
2547 + BUG_ON(page->mapping != inode->i_mapping);
2548 + ret = ext4_mb_init_cache(page, bitmap);
2549 + if (ret) {
2550 + unlock_page(page);
2551 + goto err;
2552 + }
2553 + unlock_page(page);
2554 + }
2555 + if (page == NULL || !PageUptodate(page)) {
2556 + ret = -EIO;
2557 + goto err;
2558 + }
2559 + mark_page_accessed(page);
2560 +err:
2561 + ext4_mb_put_buddy_cache_lock(sb, group, num_grp_locked);
2562 + if (bitmap_page)
2563 + page_cache_release(bitmap_page);
2564 + if (page)
2565 + page_cache_release(page);
2566 + return ret;
2567 +}
2568 +
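The page arithmetic in ext4_mb_init_group() follows from storing each group's bitmap and buddy in two consecutive blocks of the buddy-cache inode: block = group * 2, then pnum/poff split that block number across pages. Worked with assumed sizes (1 KiB blocks, 4 KiB pages; illustrative values only):

#include <stdio.h>

int main(void)
{
        unsigned int blocksize = 1024, page_size = 4096;
        unsigned int blocks_per_page = page_size / blocksize;   /* 4 */
        unsigned int group = 5;
        unsigned int block = group * 2;                         /* 10 */

        /* bitmap lands at page 2, slot 2; the buddy at page 2, slot 3 */
        printf("bitmap: pnum=%u poff=%u\n",
               block / blocks_per_page, block % blocks_per_page);
        printf("buddy : pnum=%u poff=%u\n",
               (block + 1) / blocks_per_page, (block + 1) % blocks_per_page);
        return 0;
}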
2569 static noinline_for_stack int
2570 ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
2571 struct ext4_buddy *e4b)
2572 @@ -941,8 +1032,26 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
2573 * groups mapped by the page is blocked
2574 * till we are done with allocation
2575 */
2576 +repeat_load_buddy:
2577 down_read(e4b->alloc_semp);
2578
2579 + if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
2593 	2580 + /* we need to check the group's need-init flag
2581 + * with alloc_semp held so that we can be sure
2582 + * that new blocks didn't get added to the group
2583 + * when we are loading the buddy cache
2584 + */
2585 + up_read(e4b->alloc_semp);
2586 + /*
2587 + * we need full data about the group
2588 + * to make a good selection
2589 + */
2590 + ret = ext4_mb_init_group(sb, group);
2591 + if (ret)
2592 + return ret;
2593 + goto repeat_load_buddy;
2594 + }
2595 +
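repeat_load_buddy is a drop-lock/initialize/retry shape: the NEED_INIT flag is only meaningful while alloc_semp is held, so the reader releases the lock, initializes, and re-checks from the top. The same pattern sketched with a pthread rwlock (illustrative userspace code, not kernel locking):

#include <pthread.h>

static pthread_rwlock_t lock = PTHREAD_RWLOCK_INITIALIZER;
static int need_init = 1;

static void do_init(void)
{
        pthread_rwlock_wrlock(&lock);
        if (need_init) {
                /* ... build the bitmap/buddy data here ... */
                need_init = 0;
        }
        pthread_rwlock_unlock(&lock);
}

static void use_resource(void)
{
repeat:
        pthread_rwlock_rdlock(&lock);
        if (need_init) {
                /* the flag is only trustworthy under the lock: drop it,
                 * initialize, then re-check from the top */
                pthread_rwlock_unlock(&lock);
                do_init();
                goto repeat;
        }
        /* ... use the initialized data under the read lock ... */
        pthread_rwlock_unlock(&lock);
}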
2596 /*
2597 * the buddy cache inode stores the block bitmap
2598 * and buddy information in consecutive blocks.
2599 @@ -1360,7 +1469,7 @@ static void ext4_mb_use_best_found(struct ext4_allocation_context *ac,
2600 ac->alloc_semp = e4b->alloc_semp;
2601 e4b->alloc_semp = NULL;
2602 /* store last allocated for subsequent stream allocation */
2603 - if ((ac->ac_flags & EXT4_MB_HINT_DATA)) {
2604 + if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) {
2605 spin_lock(&sbi->s_md_lock);
2606 sbi->s_mb_last_group = ac->ac_f_ex.fe_group;
2607 sbi->s_mb_last_start = ac->ac_f_ex.fe_start;
2608 @@ -1837,97 +1946,6 @@ void ext4_mb_put_buddy_cache_lock(struct super_block *sb,
2609
2610 }
2611
2612 -static noinline_for_stack
2613 -int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
2614 -{
2615 -
2616 - int ret;
2617 - void *bitmap;
2618 - int blocks_per_page;
2619 - int block, pnum, poff;
2620 - int num_grp_locked = 0;
2621 - struct ext4_group_info *this_grp;
2622 - struct ext4_sb_info *sbi = EXT4_SB(sb);
2623 - struct inode *inode = sbi->s_buddy_cache;
2624 - struct page *page = NULL, *bitmap_page = NULL;
2625 -
2626 - mb_debug("init group %lu\n", group);
2627 - blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
2628 - this_grp = ext4_get_group_info(sb, group);
2629 - /*
2630 - * This ensures we don't add group
2631 - * to this buddy cache via resize
2632 - */
2633 - num_grp_locked = ext4_mb_get_buddy_cache_lock(sb, group);
2634 - if (!EXT4_MB_GRP_NEED_INIT(this_grp)) {
2635 - /*
2636 - * somebody initialized the group
2637 - * return without doing anything
2638 - */
2639 - ret = 0;
2640 - goto err;
2641 - }
2642 - /*
2643 - * the buddy cache inode stores the block bitmap
2644 - * and buddy information in consecutive blocks.
2645 - * So for each group we need two blocks.
2646 - */
2647 - block = group * 2;
2648 - pnum = block / blocks_per_page;
2649 - poff = block % blocks_per_page;
2650 - page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
2651 - if (page) {
2652 - BUG_ON(page->mapping != inode->i_mapping);
2653 - ret = ext4_mb_init_cache(page, NULL);
2654 - if (ret) {
2655 - unlock_page(page);
2656 - goto err;
2657 - }
2658 - unlock_page(page);
2659 - }
2660 - if (page == NULL || !PageUptodate(page)) {
2661 - ret = -EIO;
2662 - goto err;
2663 - }
2664 - mark_page_accessed(page);
2665 - bitmap_page = page;
2666 - bitmap = page_address(page) + (poff * sb->s_blocksize);
2667 -
2668 - /* init buddy cache */
2669 - block++;
2670 - pnum = block / blocks_per_page;
2671 - poff = block % blocks_per_page;
2672 - page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
2673 - if (page == bitmap_page) {
2674 - /*
2675 - * If both the bitmap and buddy are in
2676 - * the same page we don't need to force
2677 - * init the buddy
2678 - */
2679 - unlock_page(page);
2680 - } else if (page) {
2681 - BUG_ON(page->mapping != inode->i_mapping);
2682 - ret = ext4_mb_init_cache(page, bitmap);
2683 - if (ret) {
2684 - unlock_page(page);
2685 - goto err;
2686 - }
2687 - unlock_page(page);
2688 - }
2689 - if (page == NULL || !PageUptodate(page)) {
2690 - ret = -EIO;
2691 - goto err;
2692 - }
2693 - mark_page_accessed(page);
2694 -err:
2695 - ext4_mb_put_buddy_cache_lock(sb, group, num_grp_locked);
2696 - if (bitmap_page)
2697 - page_cache_release(bitmap_page);
2698 - if (page)
2699 - page_cache_release(page);
2700 - return ret;
2701 -}
2702 -
2703 static noinline_for_stack int
2704 ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
2705 {
2706 @@ -1938,11 +1956,14 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
2707 struct ext4_sb_info *sbi;
2708 struct super_block *sb;
2709 struct ext4_buddy e4b;
2710 - loff_t size, isize;
2711
2712 sb = ac->ac_sb;
2713 sbi = EXT4_SB(sb);
2714 ngroups = ext4_get_groups_count(sb);
2715 + /* non-extent files are limited to low blocks/groups */
2716 + if (!(EXT4_I(ac->ac_inode)->i_flags & EXT4_EXTENTS_FL))
2717 + ngroups = sbi->s_blockfile_groups;
2718 +
2719 BUG_ON(ac->ac_status == AC_STATUS_FOUND);
2720
2721 /* first, try the goal */
2722 @@ -1974,20 +1995,16 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
2723 }
2724
2725 bsbits = ac->ac_sb->s_blocksize_bits;
2726 - /* if stream allocation is enabled, use global goal */
2727 - size = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len;
2728 - isize = i_size_read(ac->ac_inode) >> bsbits;
2729 - if (size < isize)
2730 - size = isize;
2731
2732 - if (size < sbi->s_mb_stream_request &&
2733 - (ac->ac_flags & EXT4_MB_HINT_DATA)) {
2734 + /* if stream allocation is enabled, use global goal */
2735 + if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) {
2736 /* TBD: may be hot point */
2737 spin_lock(&sbi->s_md_lock);
2738 ac->ac_g_ex.fe_group = sbi->s_mb_last_group;
2739 ac->ac_g_ex.fe_start = sbi->s_mb_last_start;
2740 spin_unlock(&sbi->s_md_lock);
2741 }
2742 +
2743 /* Let's just scan groups to find more-less suitable blocks */
2744 cr = ac->ac_2order ? 0 : 1;
2745 /*
2746 @@ -2015,27 +2032,6 @@ repeat:
2747 if (grp->bb_free == 0)
2748 continue;
2749
2750 - /*
2751 - * if the group is already init we check whether it is
2752 - * a good group and if not we don't load the buddy
2753 - */
2754 - if (EXT4_MB_GRP_NEED_INIT(grp)) {
2755 - /*
2756 - * we need full data about the group
2757 - * to make a good selection
2758 - */
2759 - err = ext4_mb_init_group(sb, group);
2760 - if (err)
2761 - goto out;
2762 - }
2763 -
2764 - /*
2765 - * If the particular group doesn't satisfy our
2766 - * criteria we continue with the next group
2767 - */
2768 - if (!ext4_mb_good_group(ac, group, cr))
2769 - continue;
2770 -
2771 err = ext4_mb_load_buddy(sb, group, &e4b);
2772 if (err)
2773 goto out;
2774 @@ -2571,13 +2567,11 @@ static int ext4_mb_init_backend(struct super_block *sb)
2775 {
2776 ext4_group_t ngroups = ext4_get_groups_count(sb);
2777 ext4_group_t i;
2778 - int metalen;
2779 struct ext4_sb_info *sbi = EXT4_SB(sb);
2780 struct ext4_super_block *es = sbi->s_es;
2781 int num_meta_group_infos;
2782 int num_meta_group_infos_max;
2783 int array_size;
2784 - struct ext4_group_info **meta_group_info;
2785 struct ext4_group_desc *desc;
2786
2787 /* This is the number of blocks used by GDT */
2788 @@ -2622,22 +2616,6 @@ static int ext4_mb_init_backend(struct super_block *sb)
2789 goto err_freesgi;
2790 }
2791 EXT4_I(sbi->s_buddy_cache)->i_disksize = 0;
2792 -
2793 - metalen = sizeof(*meta_group_info) << EXT4_DESC_PER_BLOCK_BITS(sb);
2794 - for (i = 0; i < num_meta_group_infos; i++) {
2795 - if ((i + 1) == num_meta_group_infos)
2796 - metalen = sizeof(*meta_group_info) *
2797 - (ngroups -
2798 - (i << EXT4_DESC_PER_BLOCK_BITS(sb)));
2799 - meta_group_info = kmalloc(metalen, GFP_KERNEL);
2800 - if (meta_group_info == NULL) {
2801 - printk(KERN_ERR "EXT4-fs: can't allocate mem for a "
2802 - "buddy group\n");
2803 - goto err_freemeta;
2804 - }
2805 - sbi->s_group_info[i] = meta_group_info;
2806 - }
2807 -
2808 for (i = 0; i < ngroups; i++) {
2809 desc = ext4_get_group_desc(sb, i, NULL);
2810 if (desc == NULL) {
2811 @@ -2655,7 +2633,6 @@ err_freebuddy:
2812 while (i-- > 0)
2813 kfree(ext4_get_group_info(sb, i));
2814 i = num_meta_group_infos;
2815 -err_freemeta:
2816 while (i-- > 0)
2817 kfree(sbi->s_group_info[i]);
2818 iput(sbi->s_buddy_cache);
2819 @@ -2833,7 +2810,6 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
2820 struct ext4_group_info *db;
2821 int err, count = 0, count2 = 0;
2822 struct ext4_free_data *entry;
2823 - ext4_fsblk_t discard_block;
2824 struct list_head *l, *ltmp;
2825
2826 list_for_each_safe(l, ltmp, &txn->t_private_list) {
2827 @@ -2863,13 +2839,19 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
2828 page_cache_release(e4b.bd_bitmap_page);
2829 }
2830 ext4_unlock_group(sb, entry->group);
2831 - discard_block = (ext4_fsblk_t) entry->group * EXT4_BLOCKS_PER_GROUP(sb)
2832 - + entry->start_blk
2833 - + le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
2834 - trace_ext4_discard_blocks(sb, (unsigned long long)discard_block,
2835 - entry->count);
2836 - sb_issue_discard(sb, discard_block, entry->count);
2837 -
2838 + if (test_opt(sb, DISCARD)) {
2839 + ext4_fsblk_t discard_block;
2840 + struct ext4_super_block *es = EXT4_SB(sb)->s_es;
2841 +
2842 + discard_block = (ext4_fsblk_t)entry->group *
2843 + EXT4_BLOCKS_PER_GROUP(sb)
2844 + + entry->start_blk
2845 + + le32_to_cpu(es->s_first_data_block);
2846 + trace_ext4_discard_blocks(sb,
2847 + (unsigned long long)discard_block,
2848 + entry->count);
2849 + sb_issue_discard(sb, discard_block, entry->count);
2850 + }
2851 kmem_cache_free(ext4_free_ext_cachep, entry);
2852 ext4_mb_release_desc(&e4b);
2853 }
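The discard start computed above is plain group geometry: group number times blocks per group, plus the offset within the group, plus the filesystem's first data block. Worked numerically with assumed geometry (32768 blocks per group and first_data_block = 1, as on 1 KiB-block filesystems; illustrative only):

#include <stdio.h>

int main(void)
{
        unsigned long long blocks_per_group = 32768, first_data_block = 1;
        unsigned long long group = 3, start_blk = 100;
        unsigned long long discard_block =
                group * blocks_per_group + start_blk + first_data_block;

        printf("discard starts at block %llu\n", discard_block); /* 98405 */
        return 0;
}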
2854 @@ -3276,6 +3258,24 @@ static void ext4_mb_collect_stats(struct ext4_allocation_context *ac)
2855 }
2856
2857 /*
2858 + * Called on failure; free up any blocks from the inode PA for this
2859 + * context. We don't need this for MB_GROUP_PA because we only change
2860 + * pa_free in ext4_mb_release_context(), but on failure, we've already
2861 + * zeroed out ac->ac_b_ex.fe_len, so group_pa->pa_free is not changed.
2862 + */
2863 +static void ext4_discard_allocated_blocks(struct ext4_allocation_context *ac)
2864 +{
2865 + struct ext4_prealloc_space *pa = ac->ac_pa;
2866 + int len;
2867 +
2868 + if (pa && pa->pa_type == MB_INODE_PA) {
2869 + len = ac->ac_b_ex.fe_len;
2870 + pa->pa_free += len;
2871 + }
2872 +
2873 +}
2874 +
2875 +/*
2876 * use blocks preallocated to inode
2877 */
2878 static void ext4_mb_use_inode_pa(struct ext4_allocation_context *ac,
2879 @@ -3382,6 +3382,11 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
2880 ac->ac_o_ex.fe_logical >= pa->pa_lstart + pa->pa_len)
2881 continue;
2882
2883 + /* non-extent files can't have physical blocks past 2^32 */
2884 + if (!(EXT4_I(ac->ac_inode)->i_flags & EXT4_EXTENTS_FL) &&
2885 + pa->pa_pstart + pa->pa_len > EXT4_MAX_BLOCK_FILE_PHYS)
2886 + continue;
2887 +
2888 /* found preallocated blocks, use them */
2889 spin_lock(&pa->pa_lock);
2890 if (pa->pa_deleted == 0 && pa->pa_free) {
2891 @@ -4174,16 +4179,26 @@ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac)
2892 if (!(ac->ac_flags & EXT4_MB_HINT_DATA))
2893 return;
2894
2895 + if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY))
2896 + return;
2897 +
2898 size = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len;
2899 - isize = i_size_read(ac->ac_inode) >> bsbits;
2900 - size = max(size, isize);
2901 + isize = (i_size_read(ac->ac_inode) + ac->ac_sb->s_blocksize - 1)
2902 + >> bsbits;
2903
2904 - /* don't use group allocation for large files */
2905 - if (size >= sbi->s_mb_stream_request)
2906 + if ((size == isize) &&
2907 + !ext4_fs_is_busy(sbi) &&
2908 + (atomic_read(&ac->ac_inode->i_writecount) == 0)) {
2909 + ac->ac_flags |= EXT4_MB_HINT_NOPREALLOC;
2910 return;
2911 + }
2912
2913 - if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY))
2914 + /* don't use group allocation for large files */
2915 + size = max(size, isize);
2916 + if (size >= sbi->s_mb_stream_request) {
2917 + ac->ac_flags |= EXT4_MB_STREAM_ALLOC;
2918 return;
2919 + }
2920
2921 BUG_ON(ac->ac_lg != NULL);
2922 /*
2923 @@ -4549,6 +4564,7 @@ repeat:
2924 ac->ac_status = AC_STATUS_CONTINUE;
2925 goto repeat;
2926 } else if (*errp) {
2927 + ext4_discard_allocated_blocks(ac);
2928 ac->ac_b_ex.fe_len = 0;
2929 ar->len = 0;
2930 ext4_mb_show_ac(ac);
2931 diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
2932 index 313a50b..8646149 100644
2933 --- a/fs/ext4/migrate.c
2934 +++ b/fs/ext4/migrate.c
2935 @@ -75,7 +75,7 @@ static int finish_range(handle_t *handle, struct inode *inode,
2936 goto err_out;
2937 }
2938 }
2939 - retval = ext4_ext_insert_extent(handle, inode, path, &newext);
2940 + retval = ext4_ext_insert_extent(handle, inode, path, &newext, 0);
2941 err_out:
2942 if (path) {
2943 ext4_ext_drop_refs(path);
2944 @@ -238,7 +238,7 @@ static int extend_credit_for_blkdel(handle_t *handle, struct inode *inode)
2945 * So allocate a credit of 3. We may update
2946 * quota (user and group).
2947 */
2948 - needed = 3 + 2*EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb);
2949 + needed = 3 + EXT4_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb);
2950
2951 if (ext4_journal_extend(handle, needed) != 0)
2952 retval = ext4_journal_restart(handle, needed);
2953 @@ -353,17 +353,16 @@ static int ext4_ext_swap_inode_data(handle_t *handle, struct inode *inode,
2954
2955 down_write(&EXT4_I(inode)->i_data_sem);
2956 /*
2957 - * if EXT4_EXT_MIGRATE is cleared a block allocation
2958 + * if EXT4_STATE_EXT_MIGRATE is cleared a block allocation
2959 * happened after we started the migrate. We need to
2960 * fail the migrate
2961 */
2962 - if (!(EXT4_I(inode)->i_flags & EXT4_EXT_MIGRATE)) {
2963 + if (!(EXT4_I(inode)->i_state & EXT4_STATE_EXT_MIGRATE)) {
2964 retval = -EAGAIN;
2965 up_write(&EXT4_I(inode)->i_data_sem);
2966 goto err_out;
2967 } else
2968 - EXT4_I(inode)->i_flags = EXT4_I(inode)->i_flags &
2969 - ~EXT4_EXT_MIGRATE;
2970 + EXT4_I(inode)->i_state &= ~EXT4_STATE_EXT_MIGRATE;
2971 /*
2972 * We have the extent map build with the tmp inode.
2973 * Now copy the i_data across
2974 @@ -478,7 +477,7 @@ int ext4_ext_migrate(struct inode *inode)
2975 handle = ext4_journal_start(inode,
2976 EXT4_DATA_TRANS_BLOCKS(inode->i_sb) +
2977 EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
2978 - 2 * EXT4_QUOTA_INIT_BLOCKS(inode->i_sb)
2979 + EXT4_MAXQUOTAS_INIT_BLOCKS(inode->i_sb)
2980 + 1);
2981 if (IS_ERR(handle)) {
2982 retval = PTR_ERR(handle);
2983 @@ -517,14 +516,15 @@ int ext4_ext_migrate(struct inode *inode)
2984 * when we add extents we extent the journal
2985 */
2986 /*
2987 - * Even though we take i_mutex we can still cause block allocation
2988 - * via mmap write to holes. If we have allocated new blocks we fail
2989 - * migrate. New block allocation will clear EXT4_EXT_MIGRATE flag.
2990 - * The flag is updated with i_data_sem held to prevent racing with
2991 - * block allocation.
2992 + * Even though we take i_mutex we can still cause block
2993 + * allocation via mmap write to holes. If we have allocated
2994 + * new blocks we fail migrate. New block allocation will
2995 + * clear EXT4_STATE_EXT_MIGRATE flag. The flag is updated
2996 + * with i_data_sem held to prevent racing with block
2997 + * allocation.
2998 */
2999 down_read((&EXT4_I(inode)->i_data_sem));
3000 - EXT4_I(inode)->i_flags = EXT4_I(inode)->i_flags | EXT4_EXT_MIGRATE;
3001 + EXT4_I(inode)->i_state |= EXT4_STATE_EXT_MIGRATE;
3002 up_read((&EXT4_I(inode)->i_data_sem));
3003
3004 handle = ext4_journal_start(inode, 1);
3005 @@ -618,7 +618,7 @@ err_out:
3006 tmp_inode->i_nlink = 0;
3007
3008 ext4_journal_stop(handle);
3009 -
3010 + unlock_new_inode(tmp_inode);
3011 iput(tmp_inode);
3012
3013 return retval;
3014 diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
3015 index bbf2dd9..9a573a6 100644
3016 --- a/fs/ext4/move_extent.c
3017 +++ b/fs/ext4/move_extent.c
3018 @@ -19,14 +19,31 @@
3019 #include "ext4_extents.h"
3020 #include "ext4.h"
3021
3022 -#define get_ext_path(path, inode, block, ret) \
3023 - do { \
3024 - path = ext4_ext_find_extent(inode, block, path); \
3025 - if (IS_ERR(path)) { \
3026 - ret = PTR_ERR(path); \
3027 - path = NULL; \
3028 - } \
3029 - } while (0)
3030 +/**
3031 + * get_ext_path - Find an extent path for designated logical block number.
3032 + *
3033 + * @inode: an inode which is searched
3034 + * @lblock: logical block number to find an extent path
3035 + * @path: pointer to an extent path pointer (for output)
3036 + *
3037 + * ext4_ext_find_extent wrapper. Return 0 on success, or a negative error value
3038 + * on failure.
3039 + */
3040 +static inline int
3041 +get_ext_path(struct inode *inode, ext4_lblk_t lblock,
3042 + struct ext4_ext_path **path)
3043 +{
3044 + int ret = 0;
3045 +
3046 + *path = ext4_ext_find_extent(inode, lblock, *path);
3047 + if (IS_ERR(*path)) {
3048 + ret = PTR_ERR(*path);
3049 + *path = NULL;
3050 + } else if ((*path)[ext_depth(inode)].p_ext == NULL)
3051 + ret = -ENODATA;
3052 +
3053 + return ret;
3054 +}
3055
3056 /**
3057 * copy_extent_status - Copy the extent's initialization status
3058 @@ -60,12 +77,14 @@ static int
3059 mext_next_extent(struct inode *inode, struct ext4_ext_path *path,
3060 struct ext4_extent **extent)
3061 {
3062 + struct ext4_extent_header *eh;
3063 int ppos, leaf_ppos = path->p_depth;
3064
3065 ppos = leaf_ppos;
3066 if (EXT_LAST_EXTENT(path[ppos].p_hdr) > path[ppos].p_ext) {
3067 /* leaf block */
3068 *extent = ++path[ppos].p_ext;
3069 + path[ppos].p_block = ext_pblock(path[ppos].p_ext);
3070 return 0;
3071 }
3072
3073 @@ -102,9 +121,18 @@ mext_next_extent(struct inode *inode, struct ext4_ext_path *path,
3074 ext_block_hdr(path[cur_ppos+1].p_bh);
3075 }
3076
3077 + path[leaf_ppos].p_ext = *extent = NULL;
3078 +
3079 + eh = path[leaf_ppos].p_hdr;
3080 + if (le16_to_cpu(eh->eh_entries) == 0)
3081 + /* empty leaf is found */
3082 + return -ENODATA;
3083 +
3084 /* leaf block */
3085 path[leaf_ppos].p_ext = *extent =
3086 EXT_FIRST_EXTENT(path[leaf_ppos].p_hdr);
3087 + path[leaf_ppos].p_block =
3088 + ext_pblock(path[leaf_ppos].p_ext);
3089 return 0;
3090 }
3091 }
3092 @@ -113,47 +141,43 @@ mext_next_extent(struct inode *inode, struct ext4_ext_path *path,
3093 }
3094
3095 /**
3096 - * mext_double_down_read - Acquire two inodes' read semaphore
3097 + * mext_check_null_inode - NULL check for two inodes
3098 *
3099 - * @orig_inode: original inode structure
3100 - * @donor_inode: donor inode structure
3101 - * Acquire read semaphore of the two inodes (orig and donor) by i_ino order.
3102 + * If inode1 or inode2 is NULL, return -EIO. Otherwise, return 0.
3103 */
3104 -static void
3105 -mext_double_down_read(struct inode *orig_inode, struct inode *donor_inode)
3106 +static int
3107 +mext_check_null_inode(struct inode *inode1, struct inode *inode2,
3108 + const char *function)
3109 {
3110 - struct inode *first = orig_inode, *second = donor_inode;
3111 -
3112 - BUG_ON(orig_inode == NULL || donor_inode == NULL);
3113 -
3114 - /*
3115 - * Use the inode number to provide the stable locking order instead
3116 - * of its address, because the C language doesn't guarantee you can
3117 - * compare pointers that don't come from the same array.
3118 - */
3119 - if (donor_inode->i_ino < orig_inode->i_ino) {
3120 - first = donor_inode;
3121 - second = orig_inode;
3122 + int ret = 0;
3123 +
3124 + if (inode1 == NULL) {
3125 + ext4_error(inode2->i_sb, function,
3126 + "Both inodes should not be NULL: "
3127 + "inode1 NULL inode2 %lu", inode2->i_ino);
3128 + ret = -EIO;
3129 + } else if (inode2 == NULL) {
3130 + ext4_error(inode1->i_sb, function,
3131 + "Both inodes should not be NULL: "
3132 + "inode1 %lu inode2 NULL", inode1->i_ino);
3133 + ret = -EIO;
3134 }
3135 -
3136 - down_read(&EXT4_I(first)->i_data_sem);
3137 - down_read(&EXT4_I(second)->i_data_sem);
3138 + return ret;
3139 }
3140
3141 /**
3142 - * mext_double_down_write - Acquire two inodes' write semaphore
3143 + * double_down_write_data_sem - Acquire two inodes' write lock of i_data_sem
3144 *
3145 * @orig_inode: original inode structure
3146 * @donor_inode: donor inode structure
3147 - * Acquire write semaphore of the two inodes (orig and donor) by i_ino order.
3148 + * Acquire write lock of i_data_sem of the two inodes (orig and donor) by
3149 + * i_ino order.
3150 */
3151 static void
3152 -mext_double_down_write(struct inode *orig_inode, struct inode *donor_inode)
3153 +double_down_write_data_sem(struct inode *orig_inode, struct inode *donor_inode)
3154 {
3155 struct inode *first = orig_inode, *second = donor_inode;
3156
3157 - BUG_ON(orig_inode == NULL || donor_inode == NULL);
3158 -
3159 /*
3160 * Use the inode number to provide the stable locking order instead
3161 * of its address, because the C language doesn't guarantee you can
3162 @@ -165,37 +189,19 @@ mext_double_down_write(struct inode *orig_inode, struct inode *donor_inode)
3163 }
3164
3165 down_write(&EXT4_I(first)->i_data_sem);
3166 - down_write(&EXT4_I(second)->i_data_sem);
3167 -}
3168 -
3169 -/**
3170 - * mext_double_up_read - Release two inodes' read semaphore
3171 - *
3172 - * @orig_inode: original inode structure to be released its lock first
3173 - * @donor_inode: donor inode structure to be released its lock second
3174 - * Release read semaphore of two inodes (orig and donor).
3175 - */
3176 -static void
3177 -mext_double_up_read(struct inode *orig_inode, struct inode *donor_inode)
3178 -{
3179 - BUG_ON(orig_inode == NULL || donor_inode == NULL);
3180 -
3181 - up_read(&EXT4_I(orig_inode)->i_data_sem);
3182 - up_read(&EXT4_I(donor_inode)->i_data_sem);
3183 + down_write_nested(&EXT4_I(second)->i_data_sem, SINGLE_DEPTH_NESTING);
3184 }
3185
3186 /**
3187 - * mext_double_up_write - Release two inodes' write semaphore
3188 + * double_up_write_data_sem - Release two inodes' write lock of i_data_sem
3189 *
3190 * @orig_inode: original inode structure to be released its lock first
3191 * @donor_inode: donor inode structure to be released its lock second
3192 - * Release write semaphore of two inodes (orig and donor).
3193 + * Release write lock of i_data_sem of two inodes (orig and donor).
3194 */
3195 static void
3196 -mext_double_up_write(struct inode *orig_inode, struct inode *donor_inode)
3197 +double_up_write_data_sem(struct inode *orig_inode, struct inode *donor_inode)
3198 {
3199 - BUG_ON(orig_inode == NULL || donor_inode == NULL);
3200 -
3201 up_write(&EXT4_I(orig_inode)->i_data_sem);
3202 up_write(&EXT4_I(donor_inode)->i_data_sem);
3203 }
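[Note on the hunk above: besides the rename, the second lock acquisition switches to down_write_nested(). Both i_data_sem rwsems belong to the same lock class, so without the nesting annotation lockdep would flag the ordered double lock as a recursive deadlock. A minimal sketch of the pattern, with hypothetical inode variables a and b:

	struct inode *first = a, *second = b;

	/* Order by inode number for a stable lock hierarchy;
	 * SINGLE_DEPTH_NESTING tells lockdep the second acquisition
	 * of this lock class is deliberate, not a self-deadlock. */
	if (b->i_ino < a->i_ino) {
		first = b;
		second = a;
	}
	down_write(&EXT4_I(first)->i_data_sem);
	down_write_nested(&EXT4_I(second)->i_data_sem, SINGLE_DEPTH_NESTING);

	/* ... critical section ... */

	up_write(&EXT4_I(a)->i_data_sem);
	up_write(&EXT4_I(b)->i_data_sem);]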
3204 @@ -283,23 +289,23 @@ mext_insert_across_blocks(handle_t *handle, struct inode *orig_inode,
3205 }
3206
3207 if (new_flag) {
3208 - get_ext_path(orig_path, orig_inode, eblock, err);
3209 - if (orig_path == NULL)
3210 + err = get_ext_path(orig_inode, eblock, &orig_path);
3211 + if (err)
3212 goto out;
3213
3214 if (ext4_ext_insert_extent(handle, orig_inode,
3215 - orig_path, new_ext))
3216 + orig_path, new_ext, 0))
3217 goto out;
3218 }
3219
3220 if (end_flag) {
3221 - get_ext_path(orig_path, orig_inode,
3222 - le32_to_cpu(end_ext->ee_block) - 1, err);
3223 - if (orig_path == NULL)
3224 + err = get_ext_path(orig_inode,
3225 + le32_to_cpu(end_ext->ee_block) - 1, &orig_path);
3226 + if (err)
3227 goto out;
3228
3229 if (ext4_ext_insert_extent(handle, orig_inode,
3230 - orig_path, end_ext))
3231 + orig_path, end_ext, 0))
3232 goto out;
3233 }
3234 out:
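[Note: throughout these hunks the old get_ext_path() macro, which signalled failure only by leaving the path pointer NULL, is replaced by a function that returns an errno and fills the path through an output parameter. The calling convention the patch converts every site to looks roughly like this sketch (the helper leaves the path NULL on failure, as the cleanup code assumes):

	struct ext4_ext_path *path = NULL;
	int err;

	err = get_ext_path(inode, lblock, &path);	/* 0 or negative errno */
	if (err)
		goto out;
	/* ... use path[depth].p_ext ... */
out:
	if (path) {
		ext4_ext_drop_refs(path);
		kfree(path);
	}]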
3235 @@ -519,7 +525,15 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode,
3236 * oext |-----------|
3237 * new_ext |-------|
3238 */
3239 - BUG_ON(le32_to_cpu(oext->ee_block) + oext_alen - 1 < new_ext_end);
3240 + if (le32_to_cpu(oext->ee_block) + oext_alen - 1 < new_ext_end) {
3241 + ext4_error(orig_inode->i_sb, __func__,
3242 + "new_ext_end(%u) should be less than or equal to "
3243 + "oext->ee_block(%u) + oext_alen(%d) - 1",
3244 + new_ext_end, le32_to_cpu(oext->ee_block),
3245 + oext_alen);
3246 + ret = -EIO;
3247 + goto out;
3248 + }
3249
3250 /*
3251 * Case: new_ext is smaller than original extent
3252 @@ -543,6 +557,7 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode,
3253
3254 ret = mext_insert_extents(handle, orig_inode, orig_path, o_start,
3255 o_end, &start_ext, &new_ext, &end_ext);
3256 +out:
3257 return ret;
3258 }
3259
3260 @@ -554,8 +569,10 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode,
3261 * @orig_off: block offset of original inode
3262 * @donor_off: block offset of donor inode
3263 * @max_count: the maximum length of extents
3264 + *
3265 + * Return 0 on success, or a negative error value on failure.
3266 */
3267 -static void
3268 +static int
3269 mext_calc_swap_extents(struct ext4_extent *tmp_dext,
3270 struct ext4_extent *tmp_oext,
3271 ext4_lblk_t orig_off, ext4_lblk_t donor_off,
3272 @@ -564,6 +581,19 @@ mext_calc_swap_extents(struct ext4_extent *tmp_dext,
3273 ext4_lblk_t diff, orig_diff;
3274 struct ext4_extent dext_old, oext_old;
3275
3276 + BUG_ON(orig_off != donor_off);
3277 +
3278 + /* original and donor extents have to cover the same block offset */
3279 + if (orig_off < le32_to_cpu(tmp_oext->ee_block) ||
3280 + le32_to_cpu(tmp_oext->ee_block) +
3281 + ext4_ext_get_actual_len(tmp_oext) - 1 < orig_off)
3282 + return -ENODATA;
3283 +
3284 + if (orig_off < le32_to_cpu(tmp_dext->ee_block) ||
3285 + le32_to_cpu(tmp_dext->ee_block) +
3286 + ext4_ext_get_actual_len(tmp_dext) - 1 < orig_off)
3287 + return -ENODATA;
3288 +
3289 dext_old = *tmp_dext;
3290 oext_old = *tmp_oext;
3291
3292 @@ -591,6 +621,8 @@ mext_calc_swap_extents(struct ext4_extent *tmp_dext,
3293
3294 copy_extent_status(&oext_old, tmp_dext);
3295 copy_extent_status(&dext_old, tmp_oext);
3296 +
3297 + return 0;
3298 }
3299
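[Note: the added checks stop trusting the caller blindly; the swap is refused with -ENODATA unless the requested offset actually falls inside both the original and the donor extent. The condition is plain interval containment; a sketch of the predicate, using a hypothetical helper name:

	/* Sketch: does logical block "off" fall inside extent "ex"?
	 * An extent covers [ee_block, ee_block + len - 1]. */
	static int extent_contains(struct ext4_extent *ex, ext4_lblk_t off)
	{
		ext4_lblk_t start = le32_to_cpu(ex->ee_block);
		ext4_lblk_t len = ext4_ext_get_actual_len(ex);

		return off >= start && off <= start + len - 1;
	}]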
3300 /**
3301 @@ -601,6 +633,7 @@ mext_calc_swap_extents(struct ext4_extent *tmp_dext,
3302 * @donor_inode: donor inode
3303 * @from: block offset of orig_inode
3304 * @count: block count to be replaced
3305 + * @err: pointer to save return value
3306 *
3307 * Replace original inode extents and donor inode extents page by page.
3308 * We implement this replacement in the following three steps:
3309 @@ -611,33 +644,33 @@ mext_calc_swap_extents(struct ext4_extent *tmp_dext,
3310 * 3. Change the block information of donor inode to point at the saved
3311 * original inode blocks in the dummy extents.
3312 *
3313 - * Return 0 on success, or a negative error value on failure.
3314 + * Return replaced block count.
3315 */
3316 static int
3317 mext_replace_branches(handle_t *handle, struct inode *orig_inode,
3318 struct inode *donor_inode, ext4_lblk_t from,
3319 - ext4_lblk_t count)
3320 + ext4_lblk_t count, int *err)
3321 {
3322 struct ext4_ext_path *orig_path = NULL;
3323 struct ext4_ext_path *donor_path = NULL;
3324 struct ext4_extent *oext, *dext;
3325 struct ext4_extent tmp_dext, tmp_oext;
3326 ext4_lblk_t orig_off = from, donor_off = from;
3327 - int err = 0;
3328 int depth;
3329 int replaced_count = 0;
3330 int dext_alen;
3331
3332 - mext_double_down_write(orig_inode, donor_inode);
3333 + /* Protect extent trees against block allocations via delalloc */
3334 + double_down_write_data_sem(orig_inode, donor_inode);
3335
3336 /* Get the original extent for the block "orig_off" */
3337 - get_ext_path(orig_path, orig_inode, orig_off, err);
3338 - if (orig_path == NULL)
3339 + *err = get_ext_path(orig_inode, orig_off, &orig_path);
3340 + if (*err)
3341 goto out;
3342
3343 /* Get the donor extent for the head */
3344 - get_ext_path(donor_path, donor_inode, donor_off, err);
3345 - if (donor_path == NULL)
3346 + *err = get_ext_path(donor_inode, donor_off, &donor_path);
3347 + if (*err)
3348 goto out;
3349 depth = ext_depth(orig_inode);
3350 oext = orig_path[depth].p_ext;
3351 @@ -647,24 +680,39 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode,
3352 dext = donor_path[depth].p_ext;
3353 tmp_dext = *dext;
3354
3355 - mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off,
3356 + *err = mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off,
3357 donor_off, count);
3358 + if (*err)
3359 + goto out;
3360
3361 /* Loop for the donor extents */
3362 while (1) {
3363 /* The extent for donor must be found. */
3364 - BUG_ON(!dext || donor_off != le32_to_cpu(tmp_dext.ee_block));
3365 + if (!dext) {
3366 + ext4_error(donor_inode->i_sb, __func__,
3367 + "The extent for donor must be found");
3368 + *err = -EIO;
3369 + goto out;
3370 + } else if (donor_off != le32_to_cpu(tmp_dext.ee_block)) {
3371 + ext4_error(donor_inode->i_sb, __func__,
3372 + "Donor offset(%u) and the first block of donor "
3373 + "extent(%u) should be equal",
3374 + donor_off,
3375 + le32_to_cpu(tmp_dext.ee_block));
3376 + *err = -EIO;
3377 + goto out;
3378 + }
3379
3380 /* Set donor extent to orig extent */
3381 - err = mext_leaf_block(handle, orig_inode,
3382 + *err = mext_leaf_block(handle, orig_inode,
3383 orig_path, &tmp_dext, &orig_off);
3384 - if (err < 0)
3385 + if (*err)
3386 goto out;
3387
3388 /* Set orig extent to donor extent */
3389 - err = mext_leaf_block(handle, donor_inode,
3390 + *err = mext_leaf_block(handle, donor_inode,
3391 donor_path, &tmp_oext, &donor_off);
3392 - if (err < 0)
3393 + if (*err)
3394 goto out;
3395
3396 dext_alen = ext4_ext_get_actual_len(&tmp_dext);
3397 @@ -678,36 +726,26 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode,
3398
3399 if (orig_path)
3400 ext4_ext_drop_refs(orig_path);
3401 - get_ext_path(orig_path, orig_inode, orig_off, err);
3402 - if (orig_path == NULL)
3403 + *err = get_ext_path(orig_inode, orig_off, &orig_path);
3404 + if (*err)
3405 goto out;
3406 depth = ext_depth(orig_inode);
3407 oext = orig_path[depth].p_ext;
3408 - if (le32_to_cpu(oext->ee_block) +
3409 - ext4_ext_get_actual_len(oext) <= orig_off) {
3410 - err = 0;
3411 - goto out;
3412 - }
3413 tmp_oext = *oext;
3414
3415 if (donor_path)
3416 ext4_ext_drop_refs(donor_path);
3417 - get_ext_path(donor_path, donor_inode,
3418 - donor_off, err);
3419 - if (donor_path == NULL)
3420 + *err = get_ext_path(donor_inode, donor_off, &donor_path);
3421 + if (*err)
3422 goto out;
3423 depth = ext_depth(donor_inode);
3424 dext = donor_path[depth].p_ext;
3425 - if (le32_to_cpu(dext->ee_block) +
3426 - ext4_ext_get_actual_len(dext) <= donor_off) {
3427 - err = 0;
3428 - goto out;
3429 - }
3430 tmp_dext = *dext;
3431
3432 - mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off,
3433 - donor_off,
3434 - count - replaced_count);
3435 + *err = mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off,
3436 + donor_off, count - replaced_count);
3437 + if (*err)
3438 + goto out;
3439 }
3440
3441 out:
3442 @@ -720,8 +758,12 @@ out:
3443 kfree(donor_path);
3444 }
3445
3446 - mext_double_up_write(orig_inode, donor_inode);
3447 - return err;
3448 + ext4_ext_invalidate_cache(orig_inode);
3449 + ext4_ext_invalidate_cache(donor_inode);
3450 +
3451 + double_up_write_data_sem(orig_inode, donor_inode);
3452 +
3453 + return replaced_count;
3454 }
3455
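[Note: with this hunk mext_replace_branches() stops returning an error code and instead returns the number of blocks actually exchanged, reporting failures through the new *err parameter. That lets callers account for a partial swap before deciding to bail out; the intended calling pattern is roughly this sketch:

	int err = 0;
	int replaced;

	replaced = mext_replace_branches(handle, orig_inode, donor_inode,
					 from, count, &err);
	moved += replaced;	/* partial progress still counts */
	if (err)
		goto abort;	/* but the failure is not swallowed */]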
3456 /**
3457 @@ -733,16 +775,17 @@ out:
3458 * @data_offset_in_page: block index where data swapping starts
3459 * @block_len_in_page: the number of blocks to be swapped
3460 * @uninit: orig extent is uninitialized or not
3461 + * @err: pointer to save return value
3462 *
3463 * Save the data in original inode blocks and replace original inode extents
3464 * with donor inode extents by calling mext_replace_branches().
3465 - * Finally, write out the saved data in new original inode blocks. Return 0
3466 - * on success, or a negative error value on failure.
3467 + * Finally, write out the saved data in new original inode blocks. Return
3468 + * replaced block count.
3469 */
3470 static int
3471 -move_extent_par_page(struct file *o_filp, struct inode *donor_inode,
3472 +move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
3473 pgoff_t orig_page_offset, int data_offset_in_page,
3474 - int block_len_in_page, int uninit)
3475 + int block_len_in_page, int uninit, int *err)
3476 {
3477 struct inode *orig_inode = o_filp->f_dentry->d_inode;
3478 struct address_space *mapping = orig_inode->i_mapping;
3479 @@ -754,9 +797,11 @@ move_extent_par_page(struct file *o_filp, struct inode *donor_inode,
3480 long long offs = orig_page_offset << PAGE_CACHE_SHIFT;
3481 unsigned long blocksize = orig_inode->i_sb->s_blocksize;
3482 unsigned int w_flags = 0;
3483 - unsigned int tmp_data_len, data_len;
3484 + unsigned int tmp_data_size, data_size, replaced_size;
3485 void *fsdata;
3486 - int ret, i, jblocks;
3487 + int i, jblocks;
3488 + int err2 = 0;
3489 + int replaced_count = 0;
3490 int blocks_per_page = PAGE_CACHE_SIZE >> orig_inode->i_blkbits;
3491
3492 /*
3493 @@ -766,8 +811,8 @@ move_extent_par_page(struct file *o_filp, struct inode *donor_inode,
3494 jblocks = ext4_writepage_trans_blocks(orig_inode) * 2;
3495 handle = ext4_journal_start(orig_inode, jblocks);
3496 if (IS_ERR(handle)) {
3497 - ret = PTR_ERR(handle);
3498 - return ret;
3499 + *err = PTR_ERR(handle);
3500 + return 0;
3501 }
3502
3503 if (segment_eq(get_fs(), KERNEL_DS))
3504 @@ -783,39 +828,36 @@ move_extent_par_page(struct file *o_filp, struct inode *donor_inode,
3505 * Just swap data blocks between orig and donor.
3506 */
3507 if (uninit) {
3508 - ret = mext_replace_branches(handle, orig_inode,
3509 - donor_inode, orig_blk_offset,
3510 - block_len_in_page);
3511 -
3512 - /* Clear the inode cache not to refer to the old data */
3513 - ext4_ext_invalidate_cache(orig_inode);
3514 - ext4_ext_invalidate_cache(donor_inode);
3515 + replaced_count = mext_replace_branches(handle, orig_inode,
3516 + donor_inode, orig_blk_offset,
3517 + block_len_in_page, err);
3518 goto out2;
3519 }
3520
3521 offs = (long long)orig_blk_offset << orig_inode->i_blkbits;
3522
3523 - /* Calculate data_len */
3524 + /* Calculate data_size */
3525 if ((orig_blk_offset + block_len_in_page - 1) ==
3526 ((orig_inode->i_size - 1) >> orig_inode->i_blkbits)) {
3527 /* Replace the last block */
3528 - tmp_data_len = orig_inode->i_size & (blocksize - 1);
3529 + tmp_data_size = orig_inode->i_size & (blocksize - 1);
3530 /*
3531 - * If data_len equal zero, it shows data_len is multiples of
3532 + * If data_size equal zero, it shows data_size is multiples of
3533 * blocksize. So we set appropriate value.
3534 */
3535 - if (tmp_data_len == 0)
3536 - tmp_data_len = blocksize;
3537 + if (tmp_data_size == 0)
3538 + tmp_data_size = blocksize;
3539
3540 - data_len = tmp_data_len +
3541 + data_size = tmp_data_size +
3542 ((block_len_in_page - 1) << orig_inode->i_blkbits);
3543 - } else {
3544 - data_len = block_len_in_page << orig_inode->i_blkbits;
3545 - }
3546 + } else
3547 + data_size = block_len_in_page << orig_inode->i_blkbits;
3548 +
3549 + replaced_size = data_size;
3550
3551 - ret = a_ops->write_begin(o_filp, mapping, offs, data_len, w_flags,
3552 + *err = a_ops->write_begin(o_filp, mapping, offs, data_size, w_flags,
3553 &page, &fsdata);
3554 - if (unlikely(ret < 0))
3555 + if (unlikely(*err < 0))
3556 goto out;
3557
3558 if (!PageUptodate(page)) {
3559 @@ -836,14 +878,17 @@ move_extent_par_page(struct file *o_filp, struct inode *donor_inode,
3560 /* Release old bh and drop refs */
3561 try_to_release_page(page, 0);
3562
3563 - ret = mext_replace_branches(handle, orig_inode, donor_inode,
3564 - orig_blk_offset, block_len_in_page);
3565 - if (ret < 0)
3566 - goto out;
3567 -
3568 - /* Clear the inode cache not to refer to the old data */
3569 - ext4_ext_invalidate_cache(orig_inode);
3570 - ext4_ext_invalidate_cache(donor_inode);
3571 + replaced_count = mext_replace_branches(handle, orig_inode, donor_inode,
3572 + orig_blk_offset, block_len_in_page,
3573 + &err2);
3574 + if (err2) {
3575 + if (replaced_count) {
3576 + block_len_in_page = replaced_count;
3577 + replaced_size =
3578 + block_len_in_page << orig_inode->i_blkbits;
3579 + } else
3580 + goto out;
3581 + }
3582
3583 if (!page_has_buffers(page))
3584 create_empty_buffers(page, 1 << orig_inode->i_blkbits, 0);
3585 @@ -853,16 +898,16 @@ move_extent_par_page(struct file *o_filp, struct inode *donor_inode,
3586 bh = bh->b_this_page;
3587
3588 for (i = 0; i < block_len_in_page; i++) {
3589 - ret = ext4_get_block(orig_inode,
3590 + *err = ext4_get_block(orig_inode,
3591 (sector_t)(orig_blk_offset + i), bh, 0);
3592 - if (ret < 0)
3593 + if (*err < 0)
3594 goto out;
3595
3596 if (bh->b_this_page != NULL)
3597 bh = bh->b_this_page;
3598 }
3599
3600 - ret = a_ops->write_end(o_filp, mapping, offs, data_len, data_len,
3601 + *err = a_ops->write_end(o_filp, mapping, offs, data_size, replaced_size,
3602 page, fsdata);
3603 page = NULL;
3604
3605 @@ -871,11 +916,15 @@ out:
3606 if (PageLocked(page))
3607 unlock_page(page);
3608 page_cache_release(page);
3609 + ext4_journal_stop(handle);
3610 }
3611 out2:
3612 ext4_journal_stop(handle);
3613
3614 - return ret < 0 ? ret : 0;
3615 + if (err2)
3616 + *err = err2;
3617 +
3618 + return replaced_count;
3619 }
3620
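[Note on the data_size computation above: it is the byte count handed to ->write_begin(). For a page in the middle of the file it is simply block_len_in_page << i_blkbits; for the page containing EOF, the last block contributes only i_size modulo the block size (or a full block when i_size is block-aligned). For example, with 4096-byte blocks, i_size = 10240 and block_len_in_page = 3 starting at block 0: tmp_data_size = 10240 & 4095 = 2048, so data_size = 2048 + (2 << 12) = 10240 bytes, exactly up to EOF.]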
3621 /**
3622 @@ -886,7 +935,6 @@ out2:
3623 * @orig_start: logical start offset in block for orig
3624 * @donor_start: logical start offset in block for donor
3625 * @len: the number of blocks to be moved
3626 - * @moved_len: moved block length
3627 *
3628 * Check the arguments of ext4_move_extents() whether the files can be
3629 * exchanged with each other.
3630 @@ -894,9 +942,13 @@ out2:
3631 */
3632 static int
3633 mext_check_arguments(struct inode *orig_inode,
3634 - struct inode *donor_inode, __u64 orig_start,
3635 - __u64 donor_start, __u64 *len, __u64 moved_len)
3636 + struct inode *donor_inode, __u64 orig_start,
3637 + __u64 donor_start, __u64 *len)
3638 {
3639 + ext4_lblk_t orig_blocks, donor_blocks;
3640 + unsigned int blkbits = orig_inode->i_blkbits;
3641 + unsigned int blocksize = 1 << blkbits;
3642 +
3643 /* Regular file check */
3644 if (!S_ISREG(orig_inode->i_mode) || !S_ISREG(donor_inode->i_mode)) {
3645 ext4_debug("ext4 move extent: The argument files should be "
3646 @@ -905,6 +957,13 @@ mext_check_arguments(struct inode *orig_inode,
3647 return -EINVAL;
3648 }
3649
3650 + if (donor_inode->i_mode & (S_ISUID|S_ISGID)) {
3651 + ext4_debug("ext4 move extent: suid or sgid is set"
3652 + " to donor file [ino:orig %lu, donor %lu]\n",
3653 + orig_inode->i_ino, donor_inode->i_ino);
3654 + return -EINVAL;
3655 + }
3656 +
3657 /* Ext4 move extent does not support swapfile */
3658 if (IS_SWAPFILE(orig_inode) || IS_SWAPFILE(donor_inode)) {
3659 ext4_debug("ext4 move extent: The argument files should "
3660 @@ -921,14 +980,6 @@ mext_check_arguments(struct inode *orig_inode,
3661 return -EINVAL;
3662 }
3663
3664 - /* orig and donor should be different file */
3665 - if (orig_inode->i_ino == donor_inode->i_ino) {
3666 - ext4_debug("ext4 move extent: The argument files should not "
3667 - "be same file [ino:orig %lu, donor %lu]\n",
3668 - orig_inode->i_ino, donor_inode->i_ino);
3669 - return -EINVAL;
3670 - }
3671 -
3672 /* Ext4 move extent supports only extent based file */
3673 if (!(EXT4_I(orig_inode)->i_flags & EXT4_EXTENTS_FL)) {
3674 ext4_debug("ext4 move extent: orig file is not extents "
3675 @@ -953,13 +1004,6 @@ mext_check_arguments(struct inode *orig_inode,
3676 return -EINVAL;
3677 }
3678
3679 - if (moved_len) {
3680 - ext4_debug("ext4 move extent: moved_len should be 0 "
3681 - "[ino:orig %lu, donor %lu]\n", orig_inode->i_ino,
3682 - donor_inode->i_ino);
3683 - return -EINVAL;
3684 - }
3685 -
3686 if ((orig_start > MAX_DEFRAG_SIZE) ||
3687 (donor_start > MAX_DEFRAG_SIZE) ||
3688 (*len > MAX_DEFRAG_SIZE) ||
3689 @@ -971,43 +1015,47 @@ mext_check_arguments(struct inode *orig_inode,
3690 }
3691
3692 if (orig_inode->i_size > donor_inode->i_size) {
3693 - if (orig_start >= donor_inode->i_size) {
3694 + donor_blocks = (donor_inode->i_size + blocksize - 1) >> blkbits;
3695 + /* TODO: eliminate this artificial restriction */
3696 + if (orig_start >= donor_blocks) {
3697 ext4_debug("ext4 move extent: orig start offset "
3698 - "[%llu] should be less than donor file size "
3699 - "[%lld] [ino:orig %lu, donor_inode %lu]\n",
3700 - orig_start, donor_inode->i_size,
3701 + "[%llu] should be less than donor file blocks "
3702 + "[%u] [ino:orig %lu, donor %lu]\n",
3703 + orig_start, donor_blocks,
3704 orig_inode->i_ino, donor_inode->i_ino);
3705 return -EINVAL;
3706 }
3707
3708 - if (orig_start + *len > donor_inode->i_size) {
3709 + /* TODO: eliminate this artificial restriction */
3710 + if (orig_start + *len > donor_blocks) {
3711 ext4_debug("ext4 move extent: End offset [%llu] should "
3712 - "be less than donor file size [%lld]."
3713 - "So adjust length from %llu to %lld "
3714 + "be less than donor file blocks [%u]."
3715 + "So adjust length from %llu to %llu "
3716 "[ino:orig %lu, donor %lu]\n",
3717 - orig_start + *len, donor_inode->i_size,
3718 - *len, donor_inode->i_size - orig_start,
3719 + orig_start + *len, donor_blocks,
3720 + *len, donor_blocks - orig_start,
3721 orig_inode->i_ino, donor_inode->i_ino);
3722 - *len = donor_inode->i_size - orig_start;
3723 + *len = donor_blocks - orig_start;
3724 }
3725 } else {
3726 - if (orig_start >= orig_inode->i_size) {
3727 + orig_blocks = (orig_inode->i_size + blocksize - 1) >> blkbits;
3728 + if (orig_start >= orig_blocks) {
3729 ext4_debug("ext4 move extent: start offset [%llu] "
3730 - "should be less than original file size "
3731 - "[%lld] [inode:orig %lu, donor %lu]\n",
3732 - orig_start, orig_inode->i_size,
3733 + "should be less than original file blocks "
3734 + "[%u] [ino:orig %lu, donor %lu]\n",
3735 + orig_start, orig_blocks,
3736 orig_inode->i_ino, donor_inode->i_ino);
3737 return -EINVAL;
3738 }
3739
3740 - if (orig_start + *len > orig_inode->i_size) {
3741 + if (orig_start + *len > orig_blocks) {
3742 ext4_debug("ext4 move extent: Adjust length "
3743 - "from %llu to %lld. Because it should be "
3744 - "less than original file size "
3745 + "from %llu to %llu. Because it should be "
3746 + "less than original file blocks "
3747 "[ino:orig %lu, donor %lu]\n",
3748 - *len, orig_inode->i_size - orig_start,
3749 + *len, orig_blocks - orig_start,
3750 orig_inode->i_ino, donor_inode->i_ino);
3751 - *len = orig_inode->i_size - orig_start;
3752 + *len = orig_blocks - orig_start;
3753 }
3754 }
3755
3756 @@ -1027,18 +1075,23 @@ mext_check_arguments(struct inode *orig_inode,
3757 * @inode1: the inode structure
3758 * @inode2: the inode structure
3759 *
3760 - * Lock two inodes' i_mutex by i_ino order. This function is moved from
3761 - * fs/inode.c.
3762 + * Lock two inodes' i_mutex by i_ino order.
3763 + * If inode1 or inode2 is NULL, return -EIO. Otherwise, return 0.
3764 */
3765 -static void
3766 +static int
3767 mext_inode_double_lock(struct inode *inode1, struct inode *inode2)
3768 {
3769 - if (inode1 == NULL || inode2 == NULL || inode1 == inode2) {
3770 - if (inode1)
3771 - mutex_lock(&inode1->i_mutex);
3772 - else if (inode2)
3773 - mutex_lock(&inode2->i_mutex);
3774 - return;
3775 + int ret = 0;
3776 +
3777 + BUG_ON(inode1 == NULL && inode2 == NULL);
3778 +
3779 + ret = mext_check_null_inode(inode1, inode2, __func__);
3780 + if (ret < 0)
3781 + goto out;
3782 +
3783 + if (inode1 == inode2) {
3784 + mutex_lock(&inode1->i_mutex);
3785 + goto out;
3786 }
3787
3788 if (inode1->i_ino < inode2->i_ino) {
3789 @@ -1048,6 +1101,9 @@ mext_inode_double_lock(struct inode *inode1, struct inode *inode2)
3790 mutex_lock_nested(&inode2->i_mutex, I_MUTEX_PARENT);
3791 mutex_lock_nested(&inode1->i_mutex, I_MUTEX_CHILD);
3792 }
3793 +
3794 +out:
3795 + return ret;
3796 }
3797
3798 /**
3799 @@ -1056,17 +1112,28 @@ mext_inode_double_lock(struct inode *inode1, struct inode *inode2)
3800 * @inode1: the inode that is released first
3801 * @inode2: the inode that is released second
3802 *
3803 - * This function is moved from fs/inode.c.
3804 + * If inode1 or inode2 is NULL, return -EIO. Otherwise, return 0.
3805 */
3806
3807 -static void
3808 +static int
3809 mext_inode_double_unlock(struct inode *inode1, struct inode *inode2)
3810 {
3811 + int ret = 0;
3812 +
3813 + BUG_ON(inode1 == NULL && inode2 == NULL);
3814 +
3815 + ret = mext_check_null_inode(inode1, inode2, __func__);
3816 + if (ret < 0)
3817 + goto out;
3818 +
3819 if (inode1)
3820 mutex_unlock(&inode1->i_mutex);
3821
3822 if (inode2 && inode2 != inode1)
3823 mutex_unlock(&inode2->i_mutex);
3824 +
3825 +out:
3826 + return ret;
3827 }
3828
3829 /**
3830 @@ -1123,70 +1190,84 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
3831 ext4_lblk_t block_end, seq_start, add_blocks, file_end, seq_blocks = 0;
3832 ext4_lblk_t rest_blocks;
3833 pgoff_t orig_page_offset = 0, seq_end_page;
3834 - int ret, depth, last_extent = 0;
3835 + int ret1, ret2, depth, last_extent = 0;
3836 int blocks_per_page = PAGE_CACHE_SIZE >> orig_inode->i_blkbits;
3837 int data_offset_in_page;
3838 int block_len_in_page;
3839 int uninit;
3840
3841 - /* protect orig and donor against a truncate */
3842 - mext_inode_double_lock(orig_inode, donor_inode);
3843 + /* orig and donor should be different file */
3844 + if (orig_inode->i_ino == donor_inode->i_ino) {
3845 + ext4_debug("ext4 move extent: The argument files should not "
3846 + "be same file [ino:orig %lu, donor %lu]\n",
3847 + orig_inode->i_ino, donor_inode->i_ino);
3848 + return -EINVAL;
3849 + }
3850 +
3851 + /* Protect orig and donor inodes against a truncate */
3852 + ret1 = mext_inode_double_lock(orig_inode, donor_inode);
3853 + if (ret1 < 0)
3854 + return ret1;
3855
3856 - mext_double_down_read(orig_inode, donor_inode);
3857 + /* Protect extent tree against block allocations via delalloc */
3858 + double_down_write_data_sem(orig_inode, donor_inode);
3859 /* Check the filesystem environment whether move_extent can be done */
3860 - ret = mext_check_arguments(orig_inode, donor_inode, orig_start,
3861 - donor_start, &len, *moved_len);
3862 - mext_double_up_read(orig_inode, donor_inode);
3863 - if (ret)
3864 - goto out2;
3865 + ret1 = mext_check_arguments(orig_inode, donor_inode, orig_start,
3866 + donor_start, &len);
3867 + if (ret1)
3868 + goto out;
3869
3870 file_end = (i_size_read(orig_inode) - 1) >> orig_inode->i_blkbits;
3871 block_end = block_start + len - 1;
3872 if (file_end < block_end)
3873 len -= block_end - file_end;
3874
3875 - get_ext_path(orig_path, orig_inode, block_start, ret);
3876 - if (orig_path == NULL)
3877 - goto out2;
3878 + ret1 = get_ext_path(orig_inode, block_start, &orig_path);
3879 + if (ret1)
3880 + goto out;
3881
3882 /* Get path structure to check the hole */
3883 - get_ext_path(holecheck_path, orig_inode, block_start, ret);
3884 - if (holecheck_path == NULL)
3885 + ret1 = get_ext_path(orig_inode, block_start, &holecheck_path);
3886 + if (ret1)
3887 goto out;
3888
3889 depth = ext_depth(orig_inode);
3890 ext_cur = holecheck_path[depth].p_ext;
3891 - if (ext_cur == NULL) {
3892 - ret = -EINVAL;
3893 - goto out;
3894 - }
3895
3896 /*
3897 - * Get proper extent whose ee_block is beyond block_start
3898 - * if block_start was within the hole.
3899 + * Get proper starting location of block replacement if block_start was
3900 + * within the hole.
3901 */
3902 if (le32_to_cpu(ext_cur->ee_block) +
3903 ext4_ext_get_actual_len(ext_cur) - 1 < block_start) {
3904 + /*
3905 + * The hole exists between extents or the tail of
3906 + * original file.
3907 + */
3908 last_extent = mext_next_extent(orig_inode,
3909 holecheck_path, &ext_cur);
3910 if (last_extent < 0) {
3911 - ret = last_extent;
3912 + ret1 = last_extent;
3913 goto out;
3914 }
3915 last_extent = mext_next_extent(orig_inode, orig_path,
3916 &ext_dummy);
3917 if (last_extent < 0) {
3918 - ret = last_extent;
3919 + ret1 = last_extent;
3920 goto out;
3921 }
3922 - }
3923 - seq_start = block_start;
3924 + seq_start = le32_to_cpu(ext_cur->ee_block);
3925 + } else if (le32_to_cpu(ext_cur->ee_block) > block_start)
3926 + /* The hole exists at the beginning of original file. */
3927 + seq_start = le32_to_cpu(ext_cur->ee_block);
3928 + else
3929 + seq_start = block_start;
3930
3931 /* No blocks within the specified range. */
3932 if (le32_to_cpu(ext_cur->ee_block) > block_end) {
3933 ext4_debug("ext4 move extent: The specified range of file "
3934 "may be the hole\n");
3935 - ret = -EINVAL;
3936 + ret1 = -EINVAL;
3937 goto out;
3938 }
3939
3940 @@ -1206,7 +1287,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
3941 last_extent = mext_next_extent(orig_inode, holecheck_path,
3942 &ext_cur);
3943 if (last_extent < 0) {
3944 - ret = last_extent;
3945 + ret1 = last_extent;
3946 break;
3947 }
3948 add_blocks = ext4_ext_get_actual_len(ext_cur);
3949 @@ -1246,29 +1327,39 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
3950 seq_start = le32_to_cpu(ext_cur->ee_block);
3951 rest_blocks = seq_blocks;
3952
3953 - /* Discard preallocations of two inodes */
3954 - down_write(&EXT4_I(orig_inode)->i_data_sem);
3955 - ext4_discard_preallocations(orig_inode);
3956 - up_write(&EXT4_I(orig_inode)->i_data_sem);
3957 -
3958 - down_write(&EXT4_I(donor_inode)->i_data_sem);
3959 - ext4_discard_preallocations(donor_inode);
3960 - up_write(&EXT4_I(donor_inode)->i_data_sem);
3961 + /*
3962 + * Up semaphore to avoid following problems:
3963 + * a. transaction deadlock among ext4_journal_start,
3964 + * ->write_begin via pagefault, and jbd2_journal_commit
3965 + * b. racing with ->readpage, ->write_begin, and ext4_get_block
3966 + * in move_extent_per_page
3967 + */
3968 + double_up_write_data_sem(orig_inode, donor_inode);
3969
3970 while (orig_page_offset <= seq_end_page) {
3971
3972 /* Swap original branches with new branches */
3973 - ret = move_extent_par_page(o_filp, donor_inode,
3974 + block_len_in_page = move_extent_per_page(
3975 + o_filp, donor_inode,
3976 orig_page_offset,
3977 data_offset_in_page,
3978 - block_len_in_page, uninit);
3979 - if (ret < 0)
3980 - goto out;
3981 - orig_page_offset++;
3982 + block_len_in_page, uninit,
3983 + &ret1);
3984 +
3985 /* Count how many blocks we have exchanged */
3986 *moved_len += block_len_in_page;
3987 - BUG_ON(*moved_len > len);
3988 + if (ret1 < 0)
3989 + break;
3990 + if (*moved_len > len) {
3991 + ext4_error(orig_inode->i_sb, __func__,
3992 + "We replaced blocks too much! "
3993 + "sum of replaced: %llu requested: %llu",
3994 + *moved_len, len);
3995 + ret1 = -EIO;
3996 + break;
3997 + }
3998
3999 + orig_page_offset++;
4000 data_offset_in_page = 0;
4001 rest_blocks -= block_len_in_page;
4002 if (rest_blocks > blocks_per_page)
4003 @@ -1277,20 +1368,23 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
4004 block_len_in_page = rest_blocks;
4005 }
4006
4007 + double_down_write_data_sem(orig_inode, donor_inode);
4008 + if (ret1 < 0)
4009 + break;
4010 +
4011 /* Decrease buffer counter */
4012 if (holecheck_path)
4013 ext4_ext_drop_refs(holecheck_path);
4014 - get_ext_path(holecheck_path, orig_inode,
4015 - seq_start, ret);
4016 - if (holecheck_path == NULL)
4017 + ret1 = get_ext_path(orig_inode, seq_start, &holecheck_path);
4018 + if (ret1)
4019 break;
4020 depth = holecheck_path->p_depth;
4021
4022 /* Decrease buffer counter */
4023 if (orig_path)
4024 ext4_ext_drop_refs(orig_path);
4025 - get_ext_path(orig_path, orig_inode, seq_start, ret);
4026 - if (orig_path == NULL)
4027 + ret1 = get_ext_path(orig_inode, seq_start, &orig_path);
4028 + if (ret1)
4029 break;
4030
4031 ext_cur = holecheck_path[depth].p_ext;
4032 @@ -1299,6 +1393,11 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
4033
4034 }
4035 out:
4036 + if (*moved_len) {
4037 + ext4_discard_preallocations(orig_inode);
4038 + ext4_discard_preallocations(donor_inode);
4039 + }
4040 +
4041 if (orig_path) {
4042 ext4_ext_drop_refs(orig_path);
4043 kfree(orig_path);
4044 @@ -1307,14 +1406,13 @@ out:
4045 ext4_ext_drop_refs(holecheck_path);
4046 kfree(holecheck_path);
4047 }
4048 -out2:
4049 - mext_inode_double_unlock(orig_inode, donor_inode);
4050 -
4051 - if (ret)
4052 - return ret;
4053 + double_up_write_data_sem(orig_inode, donor_inode);
4054 + ret2 = mext_inode_double_unlock(orig_inode, donor_inode);
4055
4056 - /* All of the specified blocks must be exchanged in succeed */
4057 - BUG_ON(*moved_len != len);
4058 + if (ret1)
4059 + return ret1;
4060 + else if (ret2)
4061 + return ret2;
4062
4063 return 0;
4064 }
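[Note: all of the move_extent.c changes above tighten the kernel side of the EXT4_IOC_MOVE_EXT ioctl that online defragmenters drive, in particular so that moved_len now reports partial progress instead of the BUG_ON-enforced all-or-nothing of the old code. For orientation, a hedged sketch of the user-space invocation; the struct move_extent layout here is assumed from this kernel generation's ext4 headers and should be verified against fs/ext4/ext4.h:

	/* Sketch only: field layout assumed, check the running kernel. */
	struct move_extent {
		__u32 reserved;		/* should be zero */
		__u32 donor_fd;		/* donor file descriptor */
		__u64 orig_start;	/* logical start, in blocks, orig file */
		__u64 donor_start;	/* logical start, in blocks, donor file */
		__u64 len;		/* block count to move */
		__u64 moved_len;	/* out: blocks actually moved */
	};
	#define EXT4_IOC_MOVE_EXT	_IOWR('f', 15, struct move_extent)

	struct move_extent me = {
		.donor_fd = donor_fd,
		.len = nblocks,
	};
	if (ioctl(orig_fd, EXT4_IOC_MOVE_EXT, &me) < 0)
		perror("EXT4_IOC_MOVE_EXT");	/* me.moved_len holds partial progress */]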
4065 diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
4066 index de04013..9dcd686 100644
4067 --- a/fs/ext4/namei.c
4068 +++ b/fs/ext4/namei.c
4069 @@ -1292,9 +1292,6 @@ errout:
4070 * add_dirent_to_buf will attempt search the directory block for
4071 * space. It will return -ENOSPC if no space is available, and -EIO
4072 * and -EEXIST if directory entry already exists.
4073 - *
4074 - * NOTE! bh is NOT released in the case where ENOSPC is returned. In
4075 - * all other cases bh is released.
4076 */
4077 static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
4078 struct inode *inode, struct ext4_dir_entry_2 *de,
4079 @@ -1315,14 +1312,10 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
4080 top = bh->b_data + blocksize - reclen;
4081 while ((char *) de <= top) {
4082 if (!ext4_check_dir_entry("ext4_add_entry", dir, de,
4083 - bh, offset)) {
4084 - brelse(bh);
4085 + bh, offset))
4086 return -EIO;
4087 - }
4088 - if (ext4_match(namelen, name, de)) {
4089 - brelse(bh);
4090 + if (ext4_match(namelen, name, de))
4091 return -EEXIST;
4092 - }
4093 nlen = EXT4_DIR_REC_LEN(de->name_len);
4094 rlen = ext4_rec_len_from_disk(de->rec_len, blocksize);
4095 if ((de->inode? rlen - nlen: rlen) >= reclen)
4096 @@ -1337,7 +1330,6 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
4097 err = ext4_journal_get_write_access(handle, bh);
4098 if (err) {
4099 ext4_std_error(dir->i_sb, err);
4100 - brelse(bh);
4101 return err;
4102 }
4103
4104 @@ -1377,7 +1369,6 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
4105 err = ext4_handle_dirty_metadata(handle, dir, bh);
4106 if (err)
4107 ext4_std_error(dir->i_sb, err);
4108 - brelse(bh);
4109 return 0;
4110 }
4111
4112 @@ -1471,7 +1462,9 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
4113 if (!(de))
4114 return retval;
4115
4116 - return add_dirent_to_buf(handle, dentry, inode, de, bh);
4117 + retval = add_dirent_to_buf(handle, dentry, inode, de, bh);
4118 + brelse(bh);
4119 + return retval;
4120 }
4121
4122 /*
4123 @@ -1514,8 +1507,10 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
4124 if(!bh)
4125 return retval;
4126 retval = add_dirent_to_buf(handle, dentry, inode, NULL, bh);
4127 - if (retval != -ENOSPC)
4128 + if (retval != -ENOSPC) {
4129 + brelse(bh);
4130 return retval;
4131 + }
4132
4133 if (blocks == 1 && !dx_fallback &&
4134 EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_DIR_INDEX))
4135 @@ -1528,7 +1523,9 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
4136 de = (struct ext4_dir_entry_2 *) bh->b_data;
4137 de->inode = 0;
4138 de->rec_len = ext4_rec_len_to_disk(blocksize, blocksize);
4139 - return add_dirent_to_buf(handle, dentry, inode, de, bh);
4140 + retval = add_dirent_to_buf(handle, dentry, inode, de, bh);
4141 + brelse(bh);
4142 + return retval;
4143 }
4144
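[Note: the add_dirent_to_buf() hunks in this file invert the buffer_head ownership rule: the callee no longer releases bh on any path, so every caller keeps its reference and drops it itself — removing exactly the asymmetry the deleted NOTE used to warn about. Straightforward call sites now read (sketch):

	retval = add_dirent_to_buf(handle, dentry, inode, de, bh);
	brelse(bh);	/* the caller owns bh on every return path */
	return retval;

ext4_add_entry() differs only in shape: on -ENOSPC it holds on to bh a little longer so the same block can be handed to make_indexed_dir() or reused for the non-dx fallback, and ext4_dx_add_entry() no longer needs to NULL out bh to dodge a double release in its cleanup path.]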
4145 /*
4146 @@ -1561,10 +1558,8 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
4147 goto journal_error;
4148
4149 err = add_dirent_to_buf(handle, dentry, inode, NULL, bh);
4150 - if (err != -ENOSPC) {
4151 - bh = NULL;
4152 + if (err != -ENOSPC)
4153 goto cleanup;
4154 - }
4155
4156 /* Block full, should compress but for now just split */
4157 dxtrace(printk(KERN_DEBUG "using %u of %u node entries\n",
4158 @@ -1590,9 +1585,9 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
4159 goto cleanup;
4160 node2 = (struct dx_node *)(bh2->b_data);
4161 entries2 = node2->entries;
4162 + memset(&node2->fake, 0, sizeof(struct fake_dirent));
4163 node2->fake.rec_len = ext4_rec_len_to_disk(sb->s_blocksize,
4164 sb->s_blocksize);
4165 - node2->fake.inode = 0;
4166 BUFFER_TRACE(frame->bh, "get_write_access");
4167 err = ext4_journal_get_write_access(handle, frame->bh);
4168 if (err)
4169 @@ -1657,7 +1652,6 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
4170 if (!de)
4171 goto cleanup;
4172 err = add_dirent_to_buf(handle, dentry, inode, de, bh);
4173 - bh = NULL;
4174 goto cleanup;
4175
4176 journal_error:
4177 @@ -1775,7 +1769,7 @@ static int ext4_create(struct inode *dir, struct dentry *dentry, int mode,
4178 retry:
4179 handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
4180 EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
4181 - 2*EXT4_QUOTA_INIT_BLOCKS(dir->i_sb));
4182 + EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb));
4183 if (IS_ERR(handle))
4184 return PTR_ERR(handle);
4185
4186 @@ -1809,7 +1803,7 @@ static int ext4_mknod(struct inode *dir, struct dentry *dentry,
4187 retry:
4188 handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
4189 EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
4190 - 2*EXT4_QUOTA_INIT_BLOCKS(dir->i_sb));
4191 + EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb));
4192 if (IS_ERR(handle))
4193 return PTR_ERR(handle);
4194
4195 @@ -1846,7 +1840,7 @@ static int ext4_mkdir(struct inode *dir, struct dentry *dentry, int mode)
4196 retry:
4197 handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
4198 EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
4199 - 2*EXT4_QUOTA_INIT_BLOCKS(dir->i_sb));
4200 + EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb));
4201 if (IS_ERR(handle))
4202 return PTR_ERR(handle);
4203
4204 @@ -2068,7 +2062,8 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode)
4205 struct ext4_iloc iloc;
4206 int err = 0;
4207
4208 - if (!ext4_handle_valid(handle))
4209 + /* ext4_handle_valid() assumes a valid handle_t pointer */
4210 + if (handle && !ext4_handle_valid(handle))
4211 return 0;
4212
4213 mutex_lock(&EXT4_SB(inode->i_sb)->s_orphan_lock);
4214 @@ -2258,7 +2253,7 @@ static int ext4_symlink(struct inode *dir,
4215 retry:
4216 handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
4217 EXT4_INDEX_EXTRA_TRANS_BLOCKS + 5 +
4218 - 2*EXT4_QUOTA_INIT_BLOCKS(dir->i_sb));
4219 + EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb));
4220 if (IS_ERR(handle))
4221 return PTR_ERR(handle);
4222
4223 @@ -2310,7 +2305,7 @@ static int ext4_link(struct dentry *old_dentry,
4224 struct inode *inode = old_dentry->d_inode;
4225 int err, retries = 0;
4226
4227 - if (EXT4_DIR_LINK_MAX(inode))
4228 + if (inode->i_nlink >= EXT4_LINK_MAX)
4229 return -EMLINK;
4230
4231 /*
4232 @@ -2413,7 +2408,7 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
4233 goto end_rename;
4234 retval = -EMLINK;
4235 if (!new_inode && new_dir != old_dir &&
4236 - new_dir->i_nlink >= EXT4_LINK_MAX)
4237 + EXT4_DIR_LINK_MAX(new_dir))
4238 goto end_rename;
4239 }
4240 if (!new_bh) {
4241 diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
4242 index 68b0351..96302cd 100644
4243 --- a/fs/ext4/resize.c
4244 +++ b/fs/ext4/resize.c
4245 @@ -247,7 +247,7 @@ static int setup_new_group_blocks(struct super_block *sb,
4246 goto exit_bh;
4247
4248 if (IS_ERR(gdb = bclean(handle, sb, block))) {
4249 - err = PTR_ERR(bh);
4250 + err = PTR_ERR(gdb);
4251 goto exit_bh;
4252 }
4253 ext4_handle_dirty_metadata(handle, NULL, gdb);
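[Note: the one-line resize.c change fixes a classic cut-and-paste slip; the code tested bclean()'s result in gdb but extracted the errno from the unrelated bh. The idiom, for reference (sketch):

	struct buffer_head *gdb = bclean(handle, sb, block);

	if (IS_ERR(gdb)) {
		err = PTR_ERR(gdb);	/* decode the same pointer we tested */
		goto exit_bh;
	}]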
4254 diff --git a/fs/ext4/super.c b/fs/ext4/super.c
4255 index 8f4f079..ed38f25 100644
4256 --- a/fs/ext4/super.c
4257 +++ b/fs/ext4/super.c
4258 @@ -45,6 +45,7 @@
4259 #include "ext4_jbd2.h"
4260 #include "xattr.h"
4261 #include "acl.h"
4262 +#include "mballoc.h"
4263
4264 #define CREATE_TRACE_POINTS
4265 #include <trace/events/ext4.h>
4266 @@ -188,6 +189,36 @@ void ext4_itable_unused_set(struct super_block *sb,
4267 bg->bg_itable_unused_hi = cpu_to_le16(count >> 16);
4268 }
4269
4270 +
4271 +/* Just increment the non-pointer handle value */
4272 +static handle_t *ext4_get_nojournal(void)
4273 +{
4274 + handle_t *handle = current->journal_info;
4275 + unsigned long ref_cnt = (unsigned long)handle;
4276 +
4277 + BUG_ON(ref_cnt >= EXT4_NOJOURNAL_MAX_REF_COUNT);
4278 +
4279 + ref_cnt++;
4280 + handle = (handle_t *)ref_cnt;
4281 +
4282 + current->journal_info = handle;
4283 + return handle;
4284 +}
4285 +
4286 +
4287 +/* Decrement the non-pointer handle value */
4288 +static void ext4_put_nojournal(handle_t *handle)
4289 +{
4290 + unsigned long ref_cnt = (unsigned long)handle;
4291 +
4292 + BUG_ON(ref_cnt == 0);
4293 +
4294 + ref_cnt--;
4295 + handle = (handle_t *)ref_cnt;
4296 +
4297 + current->journal_info = handle;
4298 +}
4299 +
4300 /*
4301 * Wrappers for jbd2_journal_start/end.
4302 *
4303 @@ -214,11 +245,7 @@ handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks)
4304 }
4305 return jbd2_journal_start(journal, nblocks);
4306 }
4307 - /*
4308 - * We're not journaling, return the appropriate indication.
4309 - */
4310 - current->journal_info = EXT4_NOJOURNAL_HANDLE;
4311 - return current->journal_info;
4312 + return ext4_get_nojournal();
4313 }
4314
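[Note: ext4_get_nojournal()/ext4_put_nojournal() replace the old fixed EXT4_NOJOURNAL_HANDLE sentinel with a small integer smuggled inside the handle_t pointer, so no-journal "handles" can nest and current->journal_info only returns to NULL when the outermost one is put. The trick relies on these counter values staying below EXT4_NOJOURNAL_MAX_REF_COUNT so ext4_handle_valid() can tell them apart from real journal handles. In outline (a sketch of the effect on a no-journal filesystem; the constant's exact value lives in ext4_jbd2.h):

	handle_t *h1 = ext4_journal_start(inode, 1); /* journal_info = (handle_t *)1 */
	handle_t *h2 = ext4_journal_start(inode, 1); /* journal_info = (handle_t *)2 */
	ext4_journal_stop(h2);                       /* back to (handle_t *)1 */
	ext4_journal_stop(h1);                       /* back to NULL */]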
4315 /*
4316 @@ -234,11 +261,7 @@ int __ext4_journal_stop(const char *where, handle_t *handle)
4317 int rc;
4318
4319 if (!ext4_handle_valid(handle)) {
4320 - /*
4321 - * Do this here since we don't call jbd2_journal_stop() in
4322 - * no-journal mode.
4323 - */
4324 - current->journal_info = NULL;
4325 + ext4_put_nojournal(handle);
4326 return 0;
4327 }
4328 sb = handle->h_transaction->t_journal->j_private;
4329 @@ -344,7 +367,8 @@ static const char *ext4_decode_error(struct super_block *sb, int errno,
4330 errstr = "Out of memory";
4331 break;
4332 case -EROFS:
4333 - if (!sb || EXT4_SB(sb)->s_journal->j_flags & JBD2_ABORT)
4334 + if (!sb || (EXT4_SB(sb)->s_journal &&
4335 + EXT4_SB(sb)->s_journal->j_flags & JBD2_ABORT))
4336 errstr = "Journal has aborted";
4337 else
4338 errstr = "Readonly filesystem";
4339 @@ -578,15 +602,14 @@ static void ext4_put_super(struct super_block *sb)
4340 struct ext4_super_block *es = sbi->s_es;
4341 int i, err;
4342
4343 + flush_workqueue(sbi->dio_unwritten_wq);
4344 + destroy_workqueue(sbi->dio_unwritten_wq);
4345 +
4346 lock_super(sb);
4347 lock_kernel();
4348 if (sb->s_dirt)
4349 ext4_commit_super(sb, 1);
4350
4351 - ext4_release_system_zone(sb);
4352 - ext4_mb_release(sb);
4353 - ext4_ext_release(sb);
4354 - ext4_xattr_put_super(sb);
4355 if (sbi->s_journal) {
4356 err = jbd2_journal_destroy(sbi->s_journal);
4357 sbi->s_journal = NULL;
4358 @@ -594,6 +617,12 @@ static void ext4_put_super(struct super_block *sb)
4359 ext4_abort(sb, __func__,
4360 "Couldn't clean up the journal");
4361 }
4362 +
4363 + ext4_release_system_zone(sb);
4364 + ext4_mb_release(sb);
4365 + ext4_ext_release(sb);
4366 + ext4_xattr_put_super(sb);
4367 +
4368 if (!(sb->s_flags & MS_RDONLY)) {
4369 EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
4370 es->s_state = cpu_to_le16(sbi->s_mount_state);
4371 @@ -682,6 +711,10 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
4372 ei->i_allocated_meta_blocks = 0;
4373 ei->i_delalloc_reserved_flag = 0;
4374 spin_lock_init(&(ei->i_block_reservation_lock));
4375 + INIT_LIST_HEAD(&ei->i_aio_dio_complete_list);
4376 + ei->cur_aio_dio = NULL;
4377 + ei->i_sync_tid = 0;
4378 + ei->i_datasync_tid = 0;
4379
4380 return &ei->vfs_inode;
4381 }
4382 @@ -877,6 +910,12 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
4383 if (test_opt(sb, NO_AUTO_DA_ALLOC))
4384 seq_puts(seq, ",noauto_da_alloc");
4385
4386 + if (test_opt(sb, DISCARD))
4387 + seq_puts(seq, ",discard");
4388 +
4389 + if (test_opt(sb, NOLOAD))
4390 + seq_puts(seq, ",norecovery");
4391 +
4392 ext4_show_quota_options(seq, sb);
4393
4394 return 0;
4395 @@ -1057,7 +1096,8 @@ enum {
4396 Opt_usrquota, Opt_grpquota, Opt_i_version,
4397 Opt_stripe, Opt_delalloc, Opt_nodelalloc,
4398 Opt_block_validity, Opt_noblock_validity,
4399 - Opt_inode_readahead_blks, Opt_journal_ioprio
4400 + Opt_inode_readahead_blks, Opt_journal_ioprio,
4401 + Opt_discard, Opt_nodiscard,
4402 };
4403
4404 static const match_table_t tokens = {
4405 @@ -1082,6 +1122,7 @@ static const match_table_t tokens = {
4406 {Opt_acl, "acl"},
4407 {Opt_noacl, "noacl"},
4408 {Opt_noload, "noload"},
4409 + {Opt_noload, "norecovery"},
4410 {Opt_nobh, "nobh"},
4411 {Opt_bh, "bh"},
4412 {Opt_commit, "commit=%u"},
4413 @@ -1123,6 +1164,8 @@ static const match_table_t tokens = {
4414 {Opt_auto_da_alloc, "auto_da_alloc=%u"},
4415 {Opt_auto_da_alloc, "auto_da_alloc"},
4416 {Opt_noauto_da_alloc, "noauto_da_alloc"},
4417 + {Opt_discard, "discard"},
4418 + {Opt_nodiscard, "nodiscard"},
4419 {Opt_err, NULL},
4420 };
4421
4422 @@ -1551,6 +1594,12 @@ set_qf_format:
4423 else
4424 set_opt(sbi->s_mount_opt,NO_AUTO_DA_ALLOC);
4425 break;
4426 + case Opt_discard:
4427 + set_opt(sbi->s_mount_opt, DISCARD);
4428 + break;
4429 + case Opt_nodiscard:
4430 + clear_opt(sbi->s_mount_opt, DISCARD);
4431 + break;
4432 default:
4433 ext4_msg(sb, KERN_ERR,
4434 "Unrecognized mount option \"%s\" "
4435 @@ -1666,14 +1715,14 @@ static int ext4_fill_flex_info(struct super_block *sb)
4436 size_t size;
4437 int i;
4438
4439 - if (!sbi->s_es->s_log_groups_per_flex) {
4440 + sbi->s_log_groups_per_flex = sbi->s_es->s_log_groups_per_flex;
4441 + groups_per_flex = 1 << sbi->s_log_groups_per_flex;
4442 +
4443 + if (groups_per_flex < 2) {
4444 sbi->s_log_groups_per_flex = 0;
4445 return 1;
4446 }
4447
4448 - sbi->s_log_groups_per_flex = sbi->s_es->s_log_groups_per_flex;
4449 - groups_per_flex = 1 << sbi->s_log_groups_per_flex;
4450 -
4451 /* We allocate both existing and potentially added groups */
4452 flex_group_count = ((sbi->s_groups_count + groups_per_flex - 1) +
4453 ((le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) + 1) <<
4454 @@ -1695,12 +1744,12 @@ static int ext4_fill_flex_info(struct super_block *sb)
4455 gdp = ext4_get_group_desc(sb, i, NULL);
4456
4457 flex_group = ext4_flex_group(sbi, i);
4458 - atomic_set(&sbi->s_flex_groups[flex_group].free_inodes,
4459 - ext4_free_inodes_count(sb, gdp));
4460 - atomic_set(&sbi->s_flex_groups[flex_group].free_blocks,
4461 - ext4_free_blks_count(sb, gdp));
4462 - atomic_set(&sbi->s_flex_groups[flex_group].used_dirs,
4463 - ext4_used_dirs_count(sb, gdp));
4464 + atomic_add(ext4_free_inodes_count(sb, gdp),
4465 + &sbi->s_flex_groups[flex_group].free_inodes);
4466 + atomic_add(ext4_free_blks_count(sb, gdp),
4467 + &sbi->s_flex_groups[flex_group].free_blocks);
4468 + atomic_add(ext4_used_dirs_count(sb, gdp),
4469 + &sbi->s_flex_groups[flex_group].used_dirs);
4470 }
4471
4472 return 1;
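[Note: the switch from atomic_set() to atomic_add() in ext4_fill_flex_info() matters because several block groups fold into one flex group. With atomic_set(), each iteration overwrote the counters, so a flex group ended up holding only its last member group's counts; atomic_add() accumulates across all members (sketch):

	/* groups_per_flex groups share one flex_groups[] slot, so the
	 * per-group free counts must be summed, not assigned. */
	for (i = 0; i < sbi->s_groups_count; i++) {
		gdp = ext4_get_group_desc(sb, i, NULL);
		flex_group = ext4_flex_group(sbi, i); /* i >> log_groups_per_flex */
		atomic_add(ext4_free_inodes_count(sb, gdp),
			   &sbi->s_flex_groups[flex_group].free_inodes);
	}]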
4473 @@ -2197,6 +2246,7 @@ EXT4_RW_ATTR_SBI_UI(mb_min_to_scan, s_mb_min_to_scan);
4474 EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs);
4475 EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request);
4476 EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc);
4477 +EXT4_RW_ATTR_SBI_UI(max_writeback_mb_bump, s_max_writeback_mb_bump);
4478
4479 static struct attribute *ext4_attrs[] = {
4480 ATTR_LIST(delayed_allocation_blocks),
4481 @@ -2210,6 +2260,7 @@ static struct attribute *ext4_attrs[] = {
4482 ATTR_LIST(mb_order2_req),
4483 ATTR_LIST(mb_stream_req),
4484 ATTR_LIST(mb_group_prealloc),
4485 + ATTR_LIST(max_writeback_mb_bump),
4486 NULL,
4487 };
4488
4489 @@ -2253,6 +2304,49 @@ static struct kobj_type ext4_ktype = {
4490 .release = ext4_sb_release,
4491 };
4492
4493 +/*
4494 + * Check whether this filesystem can be mounted based on
4495 + * the features present and the RDONLY/RDWR mount requested.
4496 + * Returns 1 if this filesystem can be mounted as requested,
4497 + * 0 if it cannot be.
4498 + */
4499 +static int ext4_feature_set_ok(struct super_block *sb, int readonly)
4500 +{
4501 + if (EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT4_FEATURE_INCOMPAT_SUPP)) {
4502 + ext4_msg(sb, KERN_ERR,
4503 + "Couldn't mount because of "
4504 + "unsupported optional features (%x)",
4505 + (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_incompat) &
4506 + ~EXT4_FEATURE_INCOMPAT_SUPP));
4507 + return 0;
4508 + }
4509 +
4510 + if (readonly)
4511 + return 1;
4512 +
4513 + /* Check that feature set is OK for a read-write mount */
4514 + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT4_FEATURE_RO_COMPAT_SUPP)) {
4515 + ext4_msg(sb, KERN_ERR, "couldn't mount RDWR because of "
4516 + "unsupported optional features (%x)",
4517 + (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_ro_compat) &
4518 + ~EXT4_FEATURE_RO_COMPAT_SUPP));
4519 + return 0;
4520 + }
4521 + /*
4522 + * Large file size enabled file system can only be mounted
4523 + * read-write on 32-bit systems if kernel is built with CONFIG_LBDAF
4524 + */
4525 + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_HUGE_FILE)) {
4526 + if (sizeof(blkcnt_t) < sizeof(u64)) {
4527 + ext4_msg(sb, KERN_ERR, "Filesystem with huge files "
4528 + "cannot be mounted RDWR without "
4529 + "CONFIG_LBDAF");
4530 + return 0;
4531 + }
4532 + }
4533 + return 1;
4534 +}
4535 +
4536 static int ext4_fill_super(struct super_block *sb, void *data, int silent)
4537 __releases(kernel_lock)
4538 __acquires(kernel_lock)
4539 @@ -2274,7 +2368,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
4540 unsigned int db_count;
4541 unsigned int i;
4542 int needs_recovery, has_huge_files;
4543 - int features;
4544 __u64 blocks_count;
4545 int err;
4546 unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
4547 @@ -2401,39 +2494,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
4548 * previously didn't change the revision level when setting the flags,
4549 * so there is a chance incompat flags are set on a rev 0 filesystem.
4550 */
4551 - features = EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT4_FEATURE_INCOMPAT_SUPP);
4552 - if (features) {
4553 - ext4_msg(sb, KERN_ERR,
4554 - "Couldn't mount because of "
4555 - "unsupported optional features (%x)",
4556 - (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_incompat) &
4557 - ~EXT4_FEATURE_INCOMPAT_SUPP));
4558 - goto failed_mount;
4559 - }
4560 - features = EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT4_FEATURE_RO_COMPAT_SUPP);
4561 - if (!(sb->s_flags & MS_RDONLY) && features) {
4562 - ext4_msg(sb, KERN_ERR,
4563 - "Couldn't mount RDWR because of "
4564 - "unsupported optional features (%x)",
4565 - (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_ro_compat) &
4566 - ~EXT4_FEATURE_RO_COMPAT_SUPP));
4567 + if (!ext4_feature_set_ok(sb, (sb->s_flags & MS_RDONLY)))
4568 goto failed_mount;
4569 - }
4570 - has_huge_files = EXT4_HAS_RO_COMPAT_FEATURE(sb,
4571 - EXT4_FEATURE_RO_COMPAT_HUGE_FILE);
4572 - if (has_huge_files) {
4573 - /*
4574 - * Large file size enabled file system can only be
4575 - * mount if kernel is build with CONFIG_LBDAF
4576 - */
4577 - if (sizeof(root->i_blocks) < sizeof(u64) &&
4578 - !(sb->s_flags & MS_RDONLY)) {
4579 - ext4_msg(sb, KERN_ERR, "Filesystem with huge "
4580 - "files cannot be mounted read-write "
4581 - "without CONFIG_LBDAF");
4582 - goto failed_mount;
4583 - }
4584 - }
4585 +
4586 blocksize = BLOCK_SIZE << le32_to_cpu(es->s_log_block_size);
4587
4588 if (blocksize < EXT4_MIN_BLOCK_SIZE ||
4589 @@ -2469,6 +2532,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
4590 }
4591 }
4592
4593 + has_huge_files = EXT4_HAS_RO_COMPAT_FEATURE(sb,
4594 + EXT4_FEATURE_RO_COMPAT_HUGE_FILE);
4595 sbi->s_bitmap_maxbytes = ext4_max_bitmap_size(sb->s_blocksize_bits,
4596 has_huge_files);
4597 sb->s_maxbytes = ext4_max_size(sb->s_blocksize_bits, has_huge_files);
4598 @@ -2549,12 +2614,19 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
4599 goto failed_mount;
4600 }
4601
4602 - if (ext4_blocks_count(es) >
4603 - (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) {
4604 + /*
4605 + * Test whether we have more sectors than will fit in sector_t,
4606 + * and whether the max offset is addressable by the page cache.
4607 + */
4608 + if ((ext4_blocks_count(es) >
4609 + (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) ||
4610 + (ext4_blocks_count(es) >
4611 + (pgoff_t)(~0ULL) >> (PAGE_CACHE_SHIFT - sb->s_blocksize_bits))) {
4612 ext4_msg(sb, KERN_ERR, "filesystem"
4613 - " too large to mount safely");
4614 + " too large to mount safely on this system");
4615 if (sizeof(sector_t) < 8)
4616 ext4_msg(sb, KERN_WARNING, "CONFIG_LBDAF not enabled");
4617 + ret = -EFBIG;
4618 goto failed_mount;
4619 }
4620
4621 @@ -2595,6 +2667,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
4622 goto failed_mount;
4623 }
4624 sbi->s_groups_count = blocks_count;
4625 + sbi->s_blockfile_groups = min_t(ext4_group_t, sbi->s_groups_count,
4626 + (EXT4_MAX_BLOCK_FILE_PHYS / EXT4_BLOCKS_PER_GROUP(sb)));
4627 db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) /
4628 EXT4_DESC_PER_BLOCK(sb);
4629 sbi->s_group_desc = kmalloc(db_count * sizeof(struct buffer_head *),
4630 @@ -2656,6 +2730,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
4631 }
4632
4633 sbi->s_stripe = ext4_get_stripe_size(sbi);
4634 + sbi->s_max_writeback_mb_bump = 128;
4635
4636 /*
4637 * set up enough so that it can read an inode
4638 @@ -2781,6 +2856,12 @@ no_journal:
4639 clear_opt(sbi->s_mount_opt, NOBH);
4640 }
4641 }
4642 + EXT4_SB(sb)->dio_unwritten_wq = create_workqueue("ext4-dio-unwritten");
4643 + if (!EXT4_SB(sb)->dio_unwritten_wq) {
4644 + printk(KERN_ERR "EXT4-fs: failed to create DIO workqueue\n");
4645 + goto failed_mount_wq;
4646 + }
4647 +
4648 /*
4649 * The jbd2_journal_load will have done any necessary log recovery,
4650 * so we can safely mount the rest of the filesystem now.
4651 @@ -2893,6 +2974,8 @@ cantfind_ext4:
4652
4653 failed_mount4:
4654 ext4_msg(sb, KERN_ERR, "mount failed");
4655 + destroy_workqueue(EXT4_SB(sb)->dio_unwritten_wq);
4656 +failed_mount_wq:
4657 ext4_release_system_zone(sb);
4658 if (sbi->s_journal) {
4659 jbd2_journal_destroy(sbi->s_journal);
4660 @@ -3208,7 +3291,18 @@ static int ext4_commit_super(struct super_block *sb, int sync)
4661 clear_buffer_write_io_error(sbh);
4662 set_buffer_uptodate(sbh);
4663 }
4664 - es->s_wtime = cpu_to_le32(get_seconds());
4665 + /*
4666 + * If the file system is mounted read-only, don't update the
4667 + * superblock write time. This avoids updating the superblock
4668 + * write time when we are mounting the root file system
4669 + * read/only but we need to replay the journal; at that point,
4670 + * for people who are east of GMT and who make their clock
4671 + * tick in localtime for Windows bug-for-bug compatibility,
4672 + * the clock is set in the future, and this will cause e2fsck
4673 + * to complain and force a full file system check.
4674 + */
4675 + if (!(sb->s_flags & MS_RDONLY))
4676 + es->s_wtime = cpu_to_le32(get_seconds());
4677 es->s_kbytes_written =
4678 cpu_to_le64(EXT4_SB(sb)->s_kbytes_written +
4679 ((part_stat_read(sb->s_bdev->bd_part, sectors[1]) -
4680 @@ -3333,11 +3427,13 @@ static int ext4_sync_fs(struct super_block *sb, int wait)
4681 {
4682 int ret = 0;
4683 tid_t target;
4684 + struct ext4_sb_info *sbi = EXT4_SB(sb);
4685
4686 trace_ext4_sync_fs(sb, wait);
4687 - if (jbd2_journal_start_commit(EXT4_SB(sb)->s_journal, &target)) {
4688 + flush_workqueue(sbi->dio_unwritten_wq);
4689 + if (jbd2_journal_start_commit(sbi->s_journal, &target)) {
4690 if (wait)
4691 - jbd2_log_wait_commit(EXT4_SB(sb)->s_journal, target);
4692 + jbd2_log_wait_commit(sbi->s_journal, target);
4693 }
4694 return ret;
4695 }
4696 @@ -3477,18 +3573,11 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
4697 if (sbi->s_journal)
4698 ext4_mark_recovery_complete(sb, es);
4699 } else {
4700 - int ret;
4701 - if ((ret = EXT4_HAS_RO_COMPAT_FEATURE(sb,
4702 - ~EXT4_FEATURE_RO_COMPAT_SUPP))) {
4703 - ext4_msg(sb, KERN_WARNING, "couldn't "
4704 - "remount RDWR because of unsupported "
4705 - "optional features (%x)",
4706 - (le32_to_cpu(sbi->s_es->s_feature_ro_compat) &
4707 - ~EXT4_FEATURE_RO_COMPAT_SUPP));
4708 + /* Make sure we can mount this feature set readwrite */
4709 + if (!ext4_feature_set_ok(sb, 0)) {
4710 err = -EROFS;
4711 goto restore_opts;
4712 }
4713 -
4714 /*
4715 * Make sure the group descriptor checksums
4716 * are sane. If they aren't, refuse to remount r/w.
4717 @@ -3624,13 +3713,11 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf)
4718 buf->f_blocks = ext4_blocks_count(es) - sbi->s_overhead_last;
4719 buf->f_bfree = percpu_counter_sum_positive(&sbi->s_freeblocks_counter) -
4720 percpu_counter_sum_positive(&sbi->s_dirtyblocks_counter);
4721 - ext4_free_blocks_count_set(es, buf->f_bfree);
4722 buf->f_bavail = buf->f_bfree - ext4_r_blocks_count(es);
4723 if (buf->f_bfree < ext4_r_blocks_count(es))
4724 buf->f_bavail = 0;
4725 buf->f_files = le32_to_cpu(es->s_inodes_count);
4726 buf->f_ffree = percpu_counter_sum_positive(&sbi->s_freeinodes_counter);
4727 - es->s_free_inodes_count = cpu_to_le32(buf->f_ffree);
4728 buf->f_namelen = EXT4_NAME_LEN;
4729 fsid = le64_to_cpup((void *)es->s_uuid) ^
4730 le64_to_cpup((void *)es->s_uuid + sizeof(u64));
4731 diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
4732 index 62b31c2..0257019 100644
4733 --- a/fs/ext4/xattr.c
4734 +++ b/fs/ext4/xattr.c
4735 @@ -810,12 +810,23 @@ inserted:
4736 get_bh(new_bh);
4737 } else {
4738 /* We need to allocate a new block */
4739 - ext4_fsblk_t goal = ext4_group_first_block_no(sb,
4740 + ext4_fsblk_t goal, block;
4741 +
4742 + goal = ext4_group_first_block_no(sb,
4743 EXT4_I(inode)->i_block_group);
4744 - ext4_fsblk_t block = ext4_new_meta_blocks(handle, inode,
4745 +
4746 + /* non-extent files can't have physical blocks past 2^32 */
4747 + if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
4748 + goal = goal & EXT4_MAX_BLOCK_FILE_PHYS;
4749 +
4750 + block = ext4_new_meta_blocks(handle, inode,
4751 goal, NULL, &error);
4752 if (error)
4753 goto cleanup;
4754 +
4755 + if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
4756 + BUG_ON(block > EXT4_MAX_BLOCK_FILE_PHYS);
4757 +
4758 ea_idebug(inode, "creating block %d", block);
4759
4760 new_bh = sb_getblk(sb, block);
4761 @@ -977,6 +988,10 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
4762 if (error)
4763 goto cleanup;
4764
4765 + error = ext4_journal_get_write_access(handle, is.iloc.bh);
4766 + if (error)
4767 + goto cleanup;
4768 +
4769 if (EXT4_I(inode)->i_state & EXT4_STATE_NEW) {
4770 struct ext4_inode *raw_inode = ext4_raw_inode(&is.iloc);
4771 memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size);
4772 @@ -1002,9 +1017,6 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
4773 if (flags & XATTR_CREATE)
4774 goto cleanup;
4775 }
4776 - error = ext4_journal_get_write_access(handle, is.iloc.bh);
4777 - if (error)
4778 - goto cleanup;
4779 if (!value) {
4780 if (!is.s.not_found)
4781 error = ext4_xattr_ibody_set(handle, inode, &i, &is);
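[Note on the xattr.c hunks above: two independent fixes. First, indirect-block (non-extent) files store block numbers in 32-bit on-disk slots, so the allocation goal is masked with EXT4_MAX_BLOCK_FILE_PHYS and the returned block is checked. Second, ext4_journal_get_write_access() moves ahead of the EXT4_STATE_NEW branch, because that branch memsets the raw inode, i.e. it modifies is.iloc.bh before the journal has been told, violating jbd2's snapshot-before-modify rule. A hypothetical user-space model of that rule follows; nothing here is kernel API.]

#include <assert.h>
#include <stdbool.h>
#include <stdio.h>
#include <string.h>

struct jbuf {
	char data[32];
	char frozen[32];   /* the journal's pre-modification copy */
	bool write_access; /* set by get_write_access() */
};

static void get_write_access(struct jbuf *b)
{
	memcpy(b->frozen, b->data, sizeof(b->data)); /* snapshot first */
	b->write_access = true;
}

static void modify(struct jbuf *b, const char *s)
{
	assert(b->write_access); /* modifying before access is the bug */
	strncpy(b->data, s, sizeof(b->data) - 1);
}

int main(void)
{
	struct jbuf b = { .data = "old inode" };

	/* correct order, as in the patched ext4_xattr_set_handle() */
	get_write_access(&b);
	modify(&b, "new inode");
	assert(strcmp(b.frozen, "old inode") == 0);
	printf("frozen copy preserved: %s\n", b.frozen);
	return 0;
}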
4782 diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
4783 index 7b4088b..8cf902a 100644
4784 --- a/fs/jbd2/commit.c
4785 +++ b/fs/jbd2/commit.c
4786 @@ -636,6 +636,10 @@ void jbd2_journal_commit_transaction(journal_t *journal)
4787 JBUFFER_TRACE(jh, "ph3: write metadata");
4788 flags = jbd2_journal_write_metadata_buffer(commit_transaction,
4789 jh, &new_jh, blocknr);
4790 + if (flags < 0) {
4791 + jbd2_journal_abort(journal, flags);
4792 + continue;
4793 + }
4794 set_bit(BH_JWrite, &jh2bh(new_jh)->b_state);
4795 wbuf[bufs++] = jh2bh(new_jh);
4796
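[Note on the commit.c hunk above: jbd2_journal_write_metadata_buffer() normally returns a small non-negative flag (whether the buffer had to be copied or escaped), but with the -ENOMEM path added in journal.c below it can now also return a negative errno, so the commit loop must test the sign, abort the journal, and skip the unusable buffer. A small user-space sketch of that sign-overloaded return convention, with hypothetical names:]

#include <errno.h>
#include <stdio.h>

static int journal_aborted;

static int write_metadata_buffer(int simulate_oom)
{
	if (simulate_oom)
		return -ENOMEM;	/* allocating the frozen copy failed */
	return 1;		/* success: buffer was copied */
}

static void journal_abort(int err)
{
	journal_aborted = err;	/* journal accepts no further updates */
}

int main(void)
{
	for (int oom = 0; oom <= 1; oom++) {
		int flags = write_metadata_buffer(oom);
		if (flags < 0) {	/* the added check */
			journal_abort(flags);
			continue;	/* skip this buffer */
		}
		printf("wrote buffer, copied=%d\n", flags);
	}
	printf("journal aborted with %d\n", journal_aborted);
	return 0;
}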
4797 diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
4798 index e378cb3..4b74149 100644
4799 --- a/fs/jbd2/journal.c
4800 +++ b/fs/jbd2/journal.c
4801 @@ -78,6 +78,7 @@ EXPORT_SYMBOL(jbd2_journal_errno);
4802 EXPORT_SYMBOL(jbd2_journal_ack_err);
4803 EXPORT_SYMBOL(jbd2_journal_clear_err);
4804 EXPORT_SYMBOL(jbd2_log_wait_commit);
4805 +EXPORT_SYMBOL(jbd2_log_start_commit);
4806 EXPORT_SYMBOL(jbd2_journal_start_commit);
4807 EXPORT_SYMBOL(jbd2_journal_force_commit_nested);
4808 EXPORT_SYMBOL(jbd2_journal_wipe);
4809 @@ -361,6 +362,10 @@ repeat:
4810
4811 jbd_unlock_bh_state(bh_in);
4812 tmp = jbd2_alloc(bh_in->b_size, GFP_NOFS);
4813 + if (!tmp) {
4814 + jbd2_journal_put_journal_head(new_jh);
4815 + return -ENOMEM;
4816 + }
4817 jbd_lock_bh_state(bh_in);
4818 if (jh_in->b_frozen_data) {
4819 jbd2_free(tmp, bh_in->b_size);
4820 @@ -1187,6 +1192,12 @@ static int journal_reset(journal_t *journal)
4821
4822 first = be32_to_cpu(sb->s_first);
4823 last = be32_to_cpu(sb->s_maxlen);
4824 + if (first + JBD2_MIN_JOURNAL_BLOCKS > last + 1) {
4825 + printk(KERN_ERR "JBD: Journal too short (blocks %llu-%llu).\n",
4826 + first, last);
4827 + journal_fail_superblock(journal);
4828 + return -EINVAL;
4829 + }
4830
4831 journal->j_first = first;
4832 journal->j_last = last;
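[Note on the journal.c hunks above: the jbd2_alloc() NULL check supplies the -ENOMEM that commit.c now handles, and journal_reset() gains a geometry sanity check so a corrupted superblock cannot install an unusably small, or inverted (last < first), journal. The journal spans blocks first..last inclusive, hence last - first + 1 blocks; writing the comparison as first + JBD2_MIN_JOURNAL_BLOCKS > last + 1 avoids unsigned underflow when last < first. A user-space sketch, assuming JBD2_MIN_JOURNAL_BLOCKS is 1024 as in this kernel:]

#include <assert.h>
#include <stdio.h>

#define JBD2_MIN_JOURNAL_BLOCKS 1024

static int journal_too_short(unsigned long long first,
			     unsigned long long last)
{
	/* equivalent to (last - first + 1) < MIN, without underflow */
	return first + JBD2_MIN_JOURNAL_BLOCKS > last + 1;
}

int main(void)
{
	assert(!journal_too_short(1, 1024)); /* exactly 1024 blocks: ok */
	assert(journal_too_short(1, 1023));  /* 1023 blocks: too short */
	assert(journal_too_short(5000, 4));  /* corrupted: last < first */
	printf("geometry checks pass\n");
	return 0;
}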
4833 diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
4834 index 6213ac7..a051270 100644
4835 --- a/fs/jbd2/transaction.c
4836 +++ b/fs/jbd2/transaction.c
4837 @@ -57,7 +57,7 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction)
4838 INIT_LIST_HEAD(&transaction->t_private_list);
4839
4840 /* Set up the commit timer for the new transaction. */
4841 - journal->j_commit_timer.expires = round_jiffies(transaction->t_expires);
4842 + journal->j_commit_timer.expires = round_jiffies_up(transaction->t_expires);
4843 add_timer(&journal->j_commit_timer);
4844
4845 J_ASSERT(journal->j_running_transaction == NULL);
4846 @@ -238,6 +238,8 @@ repeat_locked:
4847 __jbd2_log_space_left(journal));
4848 spin_unlock(&transaction->t_handle_lock);
4849 spin_unlock(&journal->j_state_lock);
4850 +
4851 + lock_map_acquire(&handle->h_lockdep_map);
4852 out:
4853 if (unlikely(new_transaction)) /* It's usually NULL */
4854 kfree(new_transaction);
4855 @@ -303,8 +305,6 @@ handle_t *jbd2_journal_start(journal_t *journal, int nblocks)
4856 handle = ERR_PTR(err);
4857 goto out;
4858 }
4859 -
4860 - lock_map_acquire(&handle->h_lockdep_map);
4861 out:
4862 return handle;
4863 }
4864 @@ -426,6 +426,7 @@ int jbd2_journal_restart(handle_t *handle, int nblocks)
4865 __jbd2_log_start_commit(journal, transaction->t_tid);
4866 spin_unlock(&journal->j_state_lock);
4867
4868 + lock_map_release(&handle->h_lockdep_map);
4869 handle->h_buffer_credits = nblocks;
4870 ret = start_this_handle(journal, handle);
4871 return ret;
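[Note on the transaction.c hunks above: the commit timer now uses round_jiffies_up(), which never rounds below the requested expiry, so the commit thread cannot be woken before t_expires; and the handle's lockdep annotation is rebalanced by acquiring it inside start_this_handle() and releasing it in jbd2_journal_restart() before the handle is restarted, keeping acquire/release strictly paired. A simplified user-space model of the rounding point only; the real kernel helpers also consult the current jiffies, this just shows the down-versus-up difference, assuming HZ=250:]

#include <assert.h>
#include <stdio.h>

#define HZ 250

/* simplified stand-ins: round to a whole-second tick boundary */
static unsigned long round_down_jiffies(unsigned long j)
{
	return j - (j % HZ);
}

static unsigned long round_up_jiffies(unsigned long j)
{
	return j % HZ ? j + HZ - (j % HZ) : j;
}

int main(void)
{
	unsigned long t_expires = 1010;	/* 10 ticks past a second edge */

	assert(round_down_jiffies(t_expires) < t_expires); /* fires early */
	assert(round_up_jiffies(t_expires) >= t_expires);  /* never early */
	printf("down=%lu up=%lu\n",
	       round_down_jiffies(t_expires), round_up_jiffies(t_expires));
	return 0;
}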
4872 diff --git a/include/linux/sched.h b/include/linux/sched.h
4873 index 0f1ea4a..d3e910b 100644
4874 --- a/include/linux/sched.h
4875 +++ b/include/linux/sched.h
4876 @@ -1999,11 +1999,18 @@ static inline int is_si_special(const struct siginfo *info)
4877 return info <= SEND_SIG_FORCED;
4878 }
4879
4880 -/* True if we are on the alternate signal stack. */
4881 -
4882 +/*
4883 + * True if we are on the alternate signal stack.
4884 + */
4885 static inline int on_sig_stack(unsigned long sp)
4886 {
4887 - return (sp - current->sas_ss_sp < current->sas_ss_size);
4888 +#ifdef CONFIG_STACK_GROWSUP
4889 + return sp >= current->sas_ss_sp &&
4890 + sp - current->sas_ss_sp < current->sas_ss_size;
4891 +#else
4892 + return sp > current->sas_ss_sp &&
4893 + sp - current->sas_ss_sp <= current->sas_ss_size;
4894 +#endif
4895 }
4896
4897 static inline int sas_ss_flags(unsigned long sp)
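[Note on the on_sig_stack() hunk above: on a grows-down stack the very first frame is pushed from sas_ss_sp + sas_ss_size, so that top address must count as on the alternate stack while the base address must not; the old "sp - sas_ss_sp < sas_ss_size" test got exactly the top case wrong. Grows-up architectures (CONFIG_STACK_GROWSUP, e.g. parisc) need the mirror-image half-open interval. A user-space sketch of both bounds with a worked example:]

#include <assert.h>
#include <stdio.h>

static unsigned long ss_sp = 0x1000, ss_size = 0x1000;

static int on_stack_grows_down(unsigned long sp)
{
	return sp > ss_sp && sp - ss_sp <= ss_size;	/* (base, top] */
}

static int on_stack_grows_up(unsigned long sp)
{
	return sp >= ss_sp && sp - ss_sp < ss_size;	/* [base, top) */
}

int main(void)
{
	/* grows-down: 0x2000 is the initial sp and must be on-stack */
	assert(on_stack_grows_down(0x2000));
	assert(!on_stack_grows_down(0x1000));

	/* grows-up: the mirror image */
	assert(on_stack_grows_up(0x1000));
	assert(!on_stack_grows_up(0x2000));

	printf("boundary checks pass\n");
	return 0;
}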
4898 diff --git a/include/scsi/osd_protocol.h b/include/scsi/osd_protocol.h
4899 index 2cc8e8b..6856612 100644
4900 --- a/include/scsi/osd_protocol.h
4901 +++ b/include/scsi/osd_protocol.h
4902 @@ -17,6 +17,7 @@
4903 #define __OSD_PROTOCOL_H__
4904
4905 #include <linux/types.h>
4906 +#include <linux/kernel.h>
4907 #include <asm/unaligned.h>
4908 #include <scsi/scsi.h>
4909
4910 diff --git a/include/scsi/scsi_host.h b/include/scsi/scsi_host.h
4911 index b62a097..6cc72e2 100644
4912 --- a/include/scsi/scsi_host.h
4913 +++ b/include/scsi/scsi_host.h
4914 @@ -677,6 +677,12 @@ struct Scsi_Host {
4915 void *shost_data;
4916
4917 /*
4918 + * Points to the physical bus device we'd use to do DMA.

4919 + * Needed just in case we have virtual hosts.
4920 + */
4921 + struct device *dma_dev;
4922 +
4923 + /*
4924 * We should ensure that this is aligned, both for better performance
4925 * and also because some compilers (m68k) don't automatically force
4926 * alignment to a long boundary.
4927 @@ -720,7 +726,9 @@ extern int scsi_queue_work(struct Scsi_Host *, struct work_struct *);
4928 extern void scsi_flush_work(struct Scsi_Host *);
4929
4930 extern struct Scsi_Host *scsi_host_alloc(struct scsi_host_template *, int);
4931 -extern int __must_check scsi_add_host(struct Scsi_Host *, struct device *);
4932 +extern int __must_check scsi_add_host_with_dma(struct Scsi_Host *,
4933 + struct device *,
4934 + struct device *);
4935 extern void scsi_scan_host(struct Scsi_Host *);
4936 extern void scsi_rescan_device(struct device *);
4937 extern void scsi_remove_host(struct Scsi_Host *);
4938 @@ -731,6 +739,12 @@ extern const char *scsi_host_state_name(enum scsi_host_state);
4939
4940 extern u64 scsi_calculate_bounce_limit(struct Scsi_Host *);
4941
4942 +static inline int __must_check scsi_add_host(struct Scsi_Host *host,
4943 + struct device *dev)
4944 +{
4945 + return scsi_add_host_with_dma(host, dev, dev);
4946 +}
4947 +
4948 static inline struct device *scsi_get_device(struct Scsi_Host *shost)
4949 {
4950 return shost->shost_gendev.parent;
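[Note on the scsi_host.h hunk above: the three-argument scsi_add_host_with_dma() becomes the real entry point, and the old scsi_add_host() turns into a static inline that passes the same struct device for both the sysfs parent and the DMA device, so every existing caller compiles unchanged, while virtual hosts (such as lpfc NPIV vports) can name the physical PCI function for DMA. A user-space model of that shim pattern with stand-in types; the names are hypothetical, not the kernel structures:]

#include <stdio.h>

struct device { const char *name; };
struct shost { struct device *parent, *dma_dev; };

static int add_host_with_dma(struct shost *sh, struct device *dev,
			     struct device *dma_dev)
{
	sh->parent = dev;	/* where the host appears in sysfs */
	sh->dma_dev = dma_dev;	/* what DMA mappings are made against */
	return 0;
}

/* the old two-argument entry point, now a trivial forwarder */
static inline int add_host(struct shost *sh, struct device *dev)
{
	return add_host_with_dma(sh, dev, dev);
}

int main(void)
{
	struct device vport = { "vport0" }, pci_fn = { "0000:01:00.0" };
	struct shost plain = { 0 }, virt = { 0 };

	add_host(&plain, &pci_fn);		   /* classic driver */
	add_host_with_dma(&virt, &vport, &pci_fn); /* NPIV-style vport */

	printf("plain: parent=%s dma=%s\n",
	       plain.parent->name, plain.dma_dev->name);
	printf("virt:  parent=%s dma=%s\n",
	       virt.parent->name, virt.dma_dev->name);
	return 0;
}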
4951 diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h
4952 index 7d8b5bc..824979e 100644
4953 --- a/include/trace/events/ext4.h
4954 +++ b/include/trace/events/ext4.h
4955 @@ -5,10 +5,12 @@
4956 #define _TRACE_EXT4_H
4957
4958 #include <linux/writeback.h>
4959 -#include "../../../fs/ext4/ext4.h"
4960 -#include "../../../fs/ext4/mballoc.h"
4961 #include <linux/tracepoint.h>
4962
4963 +struct ext4_allocation_context;
4964 +struct ext4_allocation_request;
4965 +struct ext4_prealloc_space;
4966 +
4967 TRACE_EVENT(ext4_free_inode,
4968 TP_PROTO(struct inode *inode),
4969
4970 @@ -229,6 +231,7 @@ TRACE_EVENT(ext4_da_writepages,
4971 __field( char, for_reclaim )
4972 __field( char, for_writepages )
4973 __field( char, range_cyclic )
4974 + __field( pgoff_t, writeback_index )
4975 ),
4976
4977 TP_fast_assign(
4978 @@ -243,14 +246,51 @@ TRACE_EVENT(ext4_da_writepages,
4979 __entry->for_reclaim = wbc->for_reclaim;
4980 __entry->for_writepages = wbc->for_writepages;
4981 __entry->range_cyclic = wbc->range_cyclic;
4982 + __entry->writeback_index = inode->i_mapping->writeback_index;
4983 ),
4984
4985 - TP_printk("dev %s ino %lu nr_t_write %ld pages_skipped %ld range_start %llu range_end %llu nonblocking %d for_kupdate %d for_reclaim %d for_writepages %d range_cyclic %d",
4986 - jbd2_dev_to_name(__entry->dev), __entry->ino, __entry->nr_to_write,
4987 + TP_printk("dev %s ino %lu nr_to_write %ld pages_skipped %ld range_start %llu range_end %llu nonblocking %d for_kupdate %d for_reclaim %d for_writepages %d range_cyclic %d writeback_index %lu",
4988 + jbd2_dev_to_name(__entry->dev),
4989 + (unsigned long) __entry->ino, __entry->nr_to_write,
4990 __entry->pages_skipped, __entry->range_start,
4991 __entry->range_end, __entry->nonblocking,
4992 __entry->for_kupdate, __entry->for_reclaim,
4993 - __entry->for_writepages, __entry->range_cyclic)
4994 + __entry->for_writepages, __entry->range_cyclic,
4995 + (unsigned long) __entry->writeback_index)
4996 +);
4997 +
4998 +TRACE_EVENT(ext4_da_write_pages,
4999 + TP_PROTO(struct inode *inode, struct mpage_da_data *mpd),
5000 +
5001 + TP_ARGS(inode, mpd),
5002 +
5003 + TP_STRUCT__entry(
5004 + __field( dev_t, dev )
5005 + __field( ino_t, ino )
5006 + __field( __u64, b_blocknr )
5007 + __field( __u32, b_size )
5008 + __field( __u32, b_state )
5009 + __field( unsigned long, first_page )
5010 + __field( int, io_done )
5011 + __field( int, pages_written )
5012 + ),
5013 +
5014 + TP_fast_assign(
5015 + __entry->dev = inode->i_sb->s_dev;
5016 + __entry->ino = inode->i_ino;
5017 + __entry->b_blocknr = mpd->b_blocknr;
5018 + __entry->b_size = mpd->b_size;
5019 + __entry->b_state = mpd->b_state;
5020 + __entry->first_page = mpd->first_page;
5021 + __entry->io_done = mpd->io_done;
5022 + __entry->pages_written = mpd->pages_written;
5023 + ),
5024 +
5025 + TP_printk("dev %s ino %lu b_blocknr %llu b_size %u b_state 0x%04x first_page %lu io_done %d pages_written %d",
5026 + jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino,
5027 + __entry->b_blocknr, __entry->b_size,
5028 + __entry->b_state, __entry->first_page,
5029 + __entry->io_done, __entry->pages_written)
5030 );
5031
5032 TRACE_EVENT(ext4_da_writepages_result,
5033 @@ -268,6 +308,7 @@ TRACE_EVENT(ext4_da_writepages_result,
5034 __field( char, encountered_congestion )
5035 __field( char, more_io )
5036 __field( char, no_nrwrite_index_update )
5037 + __field( pgoff_t, writeback_index )
5038 ),
5039
5040 TP_fast_assign(
5041 @@ -279,13 +320,16 @@ TRACE_EVENT(ext4_da_writepages_result,
5042 __entry->encountered_congestion = wbc->encountered_congestion;
5043 __entry->more_io = wbc->more_io;
5044 __entry->no_nrwrite_index_update = wbc->no_nrwrite_index_update;
5045 + __entry->writeback_index = inode->i_mapping->writeback_index;
5046 ),
5047
5048 - TP_printk("dev %s ino %lu ret %d pages_written %d pages_skipped %ld congestion %d more_io %d no_nrwrite_index_update %d",
5049 - jbd2_dev_to_name(__entry->dev), __entry->ino, __entry->ret,
5050 + TP_printk("dev %s ino %lu ret %d pages_written %d pages_skipped %ld congestion %d more_io %d no_nrwrite_index_update %d writeback_index %lu",
5051 + jbd2_dev_to_name(__entry->dev),
5052 + (unsigned long) __entry->ino, __entry->ret,
5053 __entry->pages_written, __entry->pages_skipped,
5054 __entry->encountered_congestion, __entry->more_io,
5055 - __entry->no_nrwrite_index_update)
5056 + __entry->no_nrwrite_index_update,
5057 + (unsigned long) __entry->writeback_index)
5058 );
5059
5060 TRACE_EVENT(ext4_da_write_begin,