Magellan Linux

Annotation of /trunk/kernel26-mcore/patches-2.6.31-r1/0107-2.6.31.8-all-fixes.patch



Revision 973
Tue Jan 5 09:57:31 2010 UTC by niro
File size: 165293 byte(s)
-2.6.31-mcore-r1

1 niro 973 diff --git a/Documentation/filesystems/ext4.txt b/Documentation/filesystems/ext4.txt
2     index 7be02ac..32c3da4 100644
3     --- a/Documentation/filesystems/ext4.txt
4     +++ b/Documentation/filesystems/ext4.txt
5     @@ -153,8 +153,8 @@ journal_dev=devnum When the external journal device's major/minor numbers
6     identified through its new major/minor numbers encoded
7     in devnum.
8    
9     -noload Don't load the journal on mounting. Note that
10     - if the filesystem was not unmounted cleanly,
11     +norecovery Don't load the journal on mounting. Note that
12     +noload if the filesystem was not unmounted cleanly,
13     skipping the journal replay will lead to the
14     filesystem containing inconsistencies that can
15     lead to any number of problems.
16     @@ -338,6 +338,12 @@ noauto_da_alloc replacing existing files via patterns such as
17     system crashes before the delayed allocation
18     blocks are forced to disk.
19    
20     +discard Controls whether ext4 should issue discard/TRIM
21     +nodiscard(*) commands to the underlying block device when
22     + blocks are freed. This is useful for SSD devices
23     + and sparse/thinly-provisioned LUNs, but it is off
24     + by default until sufficient testing has been done.
25     +
26     Data Mode
27     =========
28     There are 3 different data modes:
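
[Editor's note] The discard machinery itself is added by the mballoc changes later in this patch, outside this excerpt. As a hedged illustration of what "issue discard/TRIM commands ... when blocks are freed" amounts to, assuming a block-layer helper along the lines of sb_issue_discard() (the name and two-argument form are an assumption, not taken from this patch):

/* Hypothetical sketch: forward freed block ranges to the device as a
 * discard when the "discard" mount option is set; compare the
 * EXT4_MOUNT_DISCARD flag introduced further down in this patch. */
static void example_free_and_discard(struct super_block *sb,
                                     ext4_fsblk_t block, unsigned long count)
{
        if (test_opt(sb, DISCARD))
                /* assumed helper: translates fs blocks to sectors and
                 * submits a discard request to the underlying device */
                sb_issue_discard(sb, block, count);
}
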
29     diff --git a/drivers/scsi/hosts.c b/drivers/scsi/hosts.c
30     index 5fd2da4..28a753d 100644
31     --- a/drivers/scsi/hosts.c
32     +++ b/drivers/scsi/hosts.c
33     @@ -180,14 +180,20 @@ void scsi_remove_host(struct Scsi_Host *shost)
34     EXPORT_SYMBOL(scsi_remove_host);
35    
36     /**
37     - * scsi_add_host - add a scsi host
38     + * scsi_add_host_with_dma - add a scsi host with dma device
39     * @shost: scsi host pointer to add
40     * @dev: a struct device of type scsi class
41     + * @dma_dev: dma device for the host
42     + *
43     + * Note: You rarely need to worry about this unless you're in a
44     + * virtualised host environment, so use the simpler scsi_add_host()
45     + * function instead.
46     *
47     * Return value:
48     * 0 on success / != 0 for error
49     **/
50     -int scsi_add_host(struct Scsi_Host *shost, struct device *dev)
51     +int scsi_add_host_with_dma(struct Scsi_Host *shost, struct device *dev,
52     + struct device *dma_dev)
53     {
54     struct scsi_host_template *sht = shost->hostt;
55     int error = -EINVAL;
56     @@ -207,6 +213,7 @@ int scsi_add_host(struct Scsi_Host *shost, struct device *dev)
57    
58     if (!shost->shost_gendev.parent)
59     shost->shost_gendev.parent = dev ? dev : &platform_bus;
60     + shost->dma_dev = dma_dev;
61    
62     error = device_add(&shost->shost_gendev);
63     if (error)
64     @@ -262,7 +269,7 @@ int scsi_add_host(struct Scsi_Host *shost, struct device *dev)
65     fail:
66     return error;
67     }
68     -EXPORT_SYMBOL(scsi_add_host);
69     +EXPORT_SYMBOL(scsi_add_host_with_dma);
70    
71     static void scsi_host_dev_release(struct device *dev)
72     {
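
[Editor's note] For ordinary drivers, the companion scsi_host.h hunk (outside this excerpt) is expected to keep scsi_add_host() as a thin wrapper. A sketch, assuming the DMA device simply defaults to the same device used as the sysfs parent:

/* Compatibility wrapper (sketch): ordinary drivers keep calling
 * scsi_add_host(); only virtualised drivers such as lpfc and qla2xxx
 * below pass a distinct DMA device. */
static inline int scsi_add_host(struct Scsi_Host *host, struct device *dev)
{
        return scsi_add_host_with_dma(host, dev, dev);
}
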
73     diff --git a/drivers/scsi/lpfc/lpfc_init.c b/drivers/scsi/lpfc/lpfc_init.c
74     index fc67cc6..cf13ff2 100644
75     --- a/drivers/scsi/lpfc/lpfc_init.c
76     +++ b/drivers/scsi/lpfc/lpfc_init.c
77     @@ -2384,7 +2384,7 @@ lpfc_create_port(struct lpfc_hba *phba, int instance, struct device *dev)
78     vport->els_tmofunc.function = lpfc_els_timeout;
79     vport->els_tmofunc.data = (unsigned long)vport;
80    
81     - error = scsi_add_host(shost, dev);
82     + error = scsi_add_host_with_dma(shost, dev, &phba->pcidev->dev);
83     if (error)
84     goto out_put_shost;
85    
86     diff --git a/drivers/scsi/megaraid/megaraid_sas.c b/drivers/scsi/megaraid/megaraid_sas.c
87     index 7dc3d18..7a838c8 100644
88     --- a/drivers/scsi/megaraid/megaraid_sas.c
89     +++ b/drivers/scsi/megaraid/megaraid_sas.c
90     @@ -3032,7 +3032,7 @@ megasas_mgmt_fw_ioctl(struct megasas_instance *instance,
91     int error = 0, i;
92     void *sense = NULL;
93     dma_addr_t sense_handle;
94     - u32 *sense_ptr;
95     + unsigned long *sense_ptr;
96    
97     memset(kbuff_arr, 0, sizeof(kbuff_arr));
98    
99     @@ -3109,7 +3109,7 @@ megasas_mgmt_fw_ioctl(struct megasas_instance *instance,
100     }
101    
102     sense_ptr =
103     - (u32 *) ((unsigned long)cmd->frame + ioc->sense_off);
104     + (unsigned long *) ((unsigned long)cmd->frame + ioc->sense_off);
105     *sense_ptr = sense_handle;
106     }
107    
108     @@ -3140,8 +3140,8 @@ megasas_mgmt_fw_ioctl(struct megasas_instance *instance,
109     * sense_ptr points to the location that has the user
110     * sense buffer address
111     */
112     - sense_ptr = (u32 *) ((unsigned long)ioc->frame.raw +
113     - ioc->sense_off);
114     + sense_ptr = (unsigned long *) ((unsigned long)ioc->frame.raw +
115     + ioc->sense_off);
116    
117     if (copy_to_user((void __user *)((unsigned long)(*sense_ptr)),
118     sense, ioc->sense_len)) {
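
[Editor's note] The point of the u32 -> unsigned long change: the frame stores a user-space buffer address, and on a 64-bit kernel reading it back through a u32 * silently drops the upper 32 bits before the copy_to_user() above. A standalone illustration (userspace C, not driver code):

/* Demonstrates why a u32 * view of a stored pointer is lossy on
 * 64-bit: only the low half survives the round trip. */
#include <assert.h>
#include <stdint.h>

int main(void)
{
        unsigned long slot = 0;
        unsigned long user_addr = 0x00007f1234567890UL;

        *(uint32_t *)&slot = (uint32_t)user_addr;  /* old: u32 store */
        assert(*(uint32_t *)&slot != user_addr);   /* high bits gone */

        slot = user_addr;                          /* patched: full width */
        assert(slot == user_addr);
        return 0;
}
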
119     diff --git a/drivers/scsi/qla2xxx/qla_attr.c b/drivers/scsi/qla2xxx/qla_attr.c
120     index 0f87962..67e016d 100644
121     --- a/drivers/scsi/qla2xxx/qla_attr.c
122     +++ b/drivers/scsi/qla2xxx/qla_attr.c
123     @@ -1654,7 +1654,8 @@ qla24xx_vport_create(struct fc_vport *fc_vport, bool disable)
124     fc_vport_set_state(fc_vport, FC_VPORT_LINKDOWN);
125     }
126    
127     - if (scsi_add_host(vha->host, &fc_vport->dev)) {
128     + if (scsi_add_host_with_dma(vha->host, &fc_vport->dev,
129     + &ha->pdev->dev)) {
130     DEBUG15(printk("scsi(%ld): scsi_add_host failure for VP[%d].\n",
131     vha->host_no, vha->vp_idx));
132     goto vport_create_failed_2;
133     diff --git a/drivers/scsi/scsi_lib_dma.c b/drivers/scsi/scsi_lib_dma.c
134     index ac6855c..dcd1285 100644
135     --- a/drivers/scsi/scsi_lib_dma.c
136     +++ b/drivers/scsi/scsi_lib_dma.c
137     @@ -23,7 +23,7 @@ int scsi_dma_map(struct scsi_cmnd *cmd)
138     int nseg = 0;
139    
140     if (scsi_sg_count(cmd)) {
141     - struct device *dev = cmd->device->host->shost_gendev.parent;
142     + struct device *dev = cmd->device->host->dma_dev;
143    
144     nseg = dma_map_sg(dev, scsi_sglist(cmd), scsi_sg_count(cmd),
145     cmd->sc_data_direction);
146     @@ -41,7 +41,7 @@ EXPORT_SYMBOL(scsi_dma_map);
147     void scsi_dma_unmap(struct scsi_cmnd *cmd)
148     {
149     if (scsi_sg_count(cmd)) {
150     - struct device *dev = cmd->device->host->shost_gendev.parent;
151     + struct device *dev = cmd->device->host->dma_dev;
152    
153     dma_unmap_sg(dev, scsi_sglist(cmd), scsi_sg_count(cmd),
154     cmd->sc_data_direction);
155     diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
156     index e2126d7..34bb797 100644
157     --- a/fs/ext4/balloc.c
158     +++ b/fs/ext4/balloc.c
159     @@ -761,7 +761,13 @@ static unsigned long ext4_bg_num_gdb_meta(struct super_block *sb,
160     static unsigned long ext4_bg_num_gdb_nometa(struct super_block *sb,
161     ext4_group_t group)
162     {
163     - return ext4_bg_has_super(sb, group) ? EXT4_SB(sb)->s_gdb_count : 0;
164     + if (!ext4_bg_has_super(sb, group))
165     + return 0;
166     +
167     + if (EXT4_HAS_INCOMPAT_FEATURE(sb,EXT4_FEATURE_INCOMPAT_META_BG))
168     + return le32_to_cpu(EXT4_SB(sb)->s_es->s_first_meta_bg);
169     + else
170     + return EXT4_SB(sb)->s_gdb_count;
171     }
172    
173     /**
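
[Editor's note] For context, the dispatcher that picks between the meta and nometa variants already exists in balloc.c; it is paraphrased here from memory of the same kernel era, so treat it as a sketch rather than part of this patch. Groups below s_first_meta_bg keep the classic GDT layout even on a META_BG filesystem, which is exactly the case the hunk above now handles:

/* Paraphrased sketch of the existing dispatcher in balloc.c */
unsigned long ext4_bg_num_gdb(struct super_block *sb, ext4_group_t group)
{
        unsigned long first_meta_bg =
                le32_to_cpu(EXT4_SB(sb)->s_es->s_first_meta_bg);
        unsigned long metagroup = group / EXT4_DESC_PER_BLOCK(sb);

        if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_META_BG) ||
            metagroup < first_meta_bg)
                return ext4_bg_num_gdb_nometa(sb, group);

        return ext4_bg_num_gdb_meta(sb, group);
}
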
174     diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c
175     index 50784ef..dc79b75 100644
176     --- a/fs/ext4/block_validity.c
177     +++ b/fs/ext4/block_validity.c
178     @@ -160,7 +160,7 @@ int ext4_setup_system_zone(struct super_block *sb)
179     if (ext4_bg_has_super(sb, i) &&
180     ((i < 5) || ((i % flex_size) == 0)))
181     add_system_zone(sbi, ext4_group_first_block_no(sb, i),
182     - sbi->s_gdb_count + 1);
183     + ext4_bg_num_gdb(sb, i) + 1);
184     gdp = ext4_get_group_desc(sb, i, NULL);
185     ret = add_system_zone(sbi, ext4_block_bitmap(sb, gdp), 1);
186     if (ret)
187     diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
188     index 9714db3..3b8321b 100644
189     --- a/fs/ext4/ext4.h
190     +++ b/fs/ext4/ext4.h
191     @@ -88,6 +88,8 @@ typedef unsigned int ext4_group_t;
192     #define EXT4_MB_HINT_TRY_GOAL 512
193     /* blocks already pre-reserved by delayed allocation */
194     #define EXT4_MB_DELALLOC_RESERVED 1024
195     +/* We are doing stream allocation */
196     +#define EXT4_MB_STREAM_ALLOC 2048
197    
198    
199     struct ext4_allocation_request {
200     @@ -111,6 +113,33 @@ struct ext4_allocation_request {
201     unsigned int flags;
202     };
203    
204     +#define DIO_AIO_UNWRITTEN 0x1
205     +typedef struct ext4_io_end {
206     + struct list_head list; /* per-file finished AIO list */
207     + struct inode *inode; /* file being written to */
208     + unsigned int flag; /* sync IO or AIO */
209     + int error; /* I/O error code */
210     + ext4_lblk_t offset; /* offset in the file */
211     + size_t size; /* size of the extent */
212     + struct work_struct work; /* data work queue */
213     +} ext4_io_end_t;
214     +
215     +/*
216     + * Delayed allocation stuff
217     + */
218     +
219     +struct mpage_da_data {
220     + struct inode *inode;
221     + sector_t b_blocknr; /* start block number of extent */
222     + size_t b_size; /* size of extent */
223     + unsigned long b_state; /* state of the extent */
224     + unsigned long first_page, next_page; /* extent of pages */
225     + struct writeback_control *wbc;
226     + int io_done;
227     + int pages_written;
228     + int retval;
229     +};
230     +
231     /*
232     * Special inodes numbers
233     */
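
[Editor's note] ext4_io_end_t is the unit of work for the dio_unwritten_wq workqueue added to ext4_sb_info further down. A hedged sketch of the intended flow (the function name is illustrative and list locking is omitted; the real worker lives in the inode.c portion of this patch):

/* Sketch: an async DIO write into an unwritten extent queues its
 * io_end at end_io time; a worker later converts the extent to
 * "written" in process context, where starting a journal handle
 * is allowed. */
static void example_end_aio_dio_work(struct work_struct *work)
{
        ext4_io_end_t *io = container_of(work, ext4_io_end_t, work);

        if (io->flag == DIO_AIO_UNWRITTEN)
                ext4_convert_unwritten_extents(io->inode,
                                               io->offset, io->size);
        list_del(&io->list);    /* i_aio_dio_complete_list, lock omitted */
        kfree(io);
}
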
234     @@ -251,7 +280,6 @@ struct flex_groups {
235     #define EXT4_TOPDIR_FL 0x00020000 /* Top of directory hierarchies*/
236     #define EXT4_HUGE_FILE_FL 0x00040000 /* Set to each huge file */
237     #define EXT4_EXTENTS_FL 0x00080000 /* Inode uses extents */
238     -#define EXT4_EXT_MIGRATE 0x00100000 /* Inode is migrating */
239     #define EXT4_RESERVED_FL 0x80000000 /* reserved for ext4 lib */
240    
241     #define EXT4_FL_USER_VISIBLE 0x000BDFFF /* User visible flags */
242     @@ -289,6 +317,8 @@ static inline __u32 ext4_mask_flags(umode_t mode, __u32 flags)
243     #define EXT4_STATE_XATTR 0x00000004 /* has in-inode xattrs */
244     #define EXT4_STATE_NO_EXPAND 0x00000008 /* No space for expansion */
245     #define EXT4_STATE_DA_ALLOC_CLOSE 0x00000010 /* Alloc DA blks on close */
246     +#define EXT4_STATE_EXT_MIGRATE 0x00000020 /* Inode is migrating */
247     +#define EXT4_STATE_DIO_UNWRITTEN 0x00000040 /* need convert on dio done*/
248    
249     /* Used to pass group descriptor data when online resize is done */
250     struct ext4_new_group_input {
251     @@ -330,7 +360,16 @@ struct ext4_new_group_data {
252     /* Call ext4_da_update_reserve_space() after successfully
253     allocating the blocks */
254     #define EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE 0x0008
255     -
256     + /* caller is from the direct IO path; request creation of an
257     + uninitialized extent if not allocated, split the uninitialized
258     + extent if blocks have been preallocated already */
259     +#define EXT4_GET_BLOCKS_DIO 0x0010
260     +#define EXT4_GET_BLOCKS_CONVERT 0x0020
261     +#define EXT4_GET_BLOCKS_DIO_CREATE_EXT (EXT4_GET_BLOCKS_DIO|\
262     + EXT4_GET_BLOCKS_CREATE_UNINIT_EXT)
263     + /* Convert extent to initialized after direct IO complete */
264     +#define EXT4_GET_BLOCKS_DIO_CONVERT_EXT (EXT4_GET_BLOCKS_CONVERT|\
265     + EXT4_GET_BLOCKS_DIO_CREATE_EXT)
266    
267     /*
268     * ioctl commands
269     @@ -386,6 +425,9 @@ struct ext4_mount_options {
270     #endif
271     };
272    
273     +/* Max physical block we can address w/o extents */
274     +#define EXT4_MAX_BLOCK_FILE_PHYS 0xFFFFFFFF
275     +
276     /*
277     * Structure of an inode on the disk
278     */
279     @@ -481,8 +523,8 @@ struct move_extent {
280     static inline __le32 ext4_encode_extra_time(struct timespec *time)
281     {
282     return cpu_to_le32((sizeof(time->tv_sec) > 4 ?
283     - time->tv_sec >> 32 : 0) |
284     - ((time->tv_nsec << 2) & EXT4_NSEC_MASK));
285     + (time->tv_sec >> 32) & EXT4_EPOCH_MASK : 0) |
286     + ((time->tv_nsec << EXT4_EPOCH_BITS) & EXT4_NSEC_MASK));
287     }
288    
289     static inline void ext4_decode_extra_time(struct timespec *time, __le32 extra)
290     @@ -490,7 +532,7 @@ static inline void ext4_decode_extra_time(struct timespec *time, __le32 extra)
291     if (sizeof(time->tv_sec) > 4)
292     time->tv_sec |= (__u64)(le32_to_cpu(extra) & EXT4_EPOCH_MASK)
293     << 32;
294     - time->tv_nsec = (le32_to_cpu(extra) & EXT4_NSEC_MASK) >> 2;
295     + time->tv_nsec = (le32_to_cpu(extra) & EXT4_NSEC_MASK) >> EXT4_EPOCH_BITS;
296     }
297    
298     #define EXT4_INODE_SET_XTIME(xtime, inode, raw_inode) \
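
[Editor's note] The corrected encoding keeps the two epoch bits (bits 0-1) and the nanosecond field (bits 2-31) from clobbering each other; previously the seconds were ORed in unmasked, so bits above the epoch could leak into the nanosecond field. A worked userspace example, assuming the header's usual definitions EXT4_EPOCH_BITS = 2, EXT4_EPOCH_MASK = (1 << 2) - 1 and EXT4_NSEC_MASK = ~0UL << 2:

/* Round-trips an extra-epoch second counter and a max tv_nsec
 * through the packed 32-bit "extra" field. */
#include <stdio.h>

#define EPOCH_BITS 2
#define EPOCH_MASK ((1 << EPOCH_BITS) - 1)   /* 0x3 */
#define NSEC_MASK  (~0UL << EPOCH_BITS)

int main(void)
{
        long long sec = 0x1ffffffffLL;        /* needs epoch bits = 1 */
        unsigned long nsec = 999999999;       /* largest valid tv_nsec */

        unsigned int extra = ((sec >> 32) & EPOCH_MASK) |
                             ((nsec << EPOCH_BITS) & NSEC_MASK);

        printf("epoch=%u nsec=%lu\n",
               extra & EPOCH_MASK,                  /* prints 1 */
               (extra & NSEC_MASK) >> EPOCH_BITS);  /* prints 999999999 */
        return 0;
}
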
299     @@ -653,6 +695,18 @@ struct ext4_inode_info {
300     __u16 i_extra_isize;
301    
302     spinlock_t i_block_reservation_lock;
303     +
304     + /* completed async DIOs that might need unwritten extents handling */
305     + struct list_head i_aio_dio_complete_list;
306     + /* current io_end structure for async DIO write*/
307     + ext4_io_end_t *cur_aio_dio;
308     +
309     + /*
310     + * Transactions that contain inode's metadata needed to complete
311     + * fsync and fdatasync, respectively.
312     + */
313     + tid_t i_sync_tid;
314     + tid_t i_datasync_tid;
315     };
316    
317     /*
318     @@ -700,6 +754,7 @@ struct ext4_inode_info {
319     #define EXT4_MOUNT_DELALLOC 0x8000000 /* Delalloc support */
320     #define EXT4_MOUNT_DATA_ERR_ABORT 0x10000000 /* Abort on file data write */
321     #define EXT4_MOUNT_BLOCK_VALIDITY 0x20000000 /* Block validity checking */
322     +#define EXT4_MOUNT_DISCARD 0x40000000 /* Issue DISCARD requests */
323    
324     #define clear_opt(o, opt) o &= ~EXT4_MOUNT_##opt
325     #define set_opt(o, opt) o |= EXT4_MOUNT_##opt
326     @@ -841,6 +896,7 @@ struct ext4_sb_info {
327     unsigned long s_gdb_count; /* Number of group descriptor blocks */
328     unsigned long s_desc_per_block; /* Number of group descriptors per block */
329     ext4_group_t s_groups_count; /* Number of groups in the fs */
330     + ext4_group_t s_blockfile_groups;/* Groups acceptable for non-extent files */
331     unsigned long s_overhead_last; /* Last calculated overhead */
332     unsigned long s_blocks_last; /* Last seen block count */
333     loff_t s_bitmap_maxbytes; /* max bytes for bitmap files */
334     @@ -923,6 +979,7 @@ struct ext4_sb_info {
335     unsigned int s_mb_stats;
336     unsigned int s_mb_order2_reqs;
337     unsigned int s_mb_group_prealloc;
338     + unsigned int s_max_writeback_mb_bump;
339     /* where last allocation was done - for stream allocation */
340     unsigned long s_mb_last_group;
341     unsigned long s_mb_last_start;
342     @@ -950,6 +1007,7 @@ struct ext4_sb_info {
343     atomic_t s_mb_lost_chunks;
344     atomic_t s_mb_preallocated;
345     atomic_t s_mb_discarded;
346     + atomic_t s_lock_busy;
347    
348     /* locality groups */
349     struct ext4_locality_group *s_locality_groups;
350     @@ -960,6 +1018,9 @@ struct ext4_sb_info {
351    
352     unsigned int s_log_groups_per_flex;
353     struct flex_groups *s_flex_groups;
354     +
355     + /* workqueue for dio unwritten */
356     + struct workqueue_struct *dio_unwritten_wq;
357     };
358    
359     static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb)
360     @@ -1367,6 +1428,7 @@ extern int ext4_change_inode_journal_flag(struct inode *, int);
361     extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *);
362     extern int ext4_can_truncate(struct inode *inode);
363     extern void ext4_truncate(struct inode *);
364     +extern int ext4_truncate_restart_trans(handle_t *, struct inode *, int nblocks);
365     extern void ext4_set_inode_flags(struct inode *);
366     extern void ext4_get_inode_flags(struct ext4_inode_info *);
367     extern int ext4_alloc_da_blocks(struct inode *inode);
368     @@ -1378,7 +1440,7 @@ extern int ext4_block_truncate_page(handle_t *handle,
369     struct address_space *mapping, loff_t from);
370     extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
371     extern qsize_t ext4_get_reserved_space(struct inode *inode);
372     -
373     +extern int flush_aio_dio_completed_IO(struct inode *inode);
374     /* ioctl.c */
375     extern long ext4_ioctl(struct file *, unsigned int, unsigned long);
376     extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long);
377     @@ -1591,15 +1653,42 @@ struct ext4_group_info {
378     #define EXT4_MB_GRP_NEED_INIT(grp) \
379     (test_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &((grp)->bb_state)))
380    
381     +#define EXT4_MAX_CONTENTION 8
382     +#define EXT4_CONTENTION_THRESHOLD 2
383     +
384     static inline spinlock_t *ext4_group_lock_ptr(struct super_block *sb,
385     ext4_group_t group)
386     {
387     return bgl_lock_ptr(EXT4_SB(sb)->s_blockgroup_lock, group);
388     }
389    
390     +/*
391     + * Returns true if the filesystem is busy enough that attempts to
392     + * access the block group locks has run into contention.
393     + */
394     +static inline int ext4_fs_is_busy(struct ext4_sb_info *sbi)
395     +{
396     + return (atomic_read(&sbi->s_lock_busy) > EXT4_CONTENTION_THRESHOLD);
397     +}
398     +
399     static inline void ext4_lock_group(struct super_block *sb, ext4_group_t group)
400     {
401     - spin_lock(ext4_group_lock_ptr(sb, group));
402     + spinlock_t *lock = ext4_group_lock_ptr(sb, group);
403     + if (spin_trylock(lock))
404     + /*
405     + * We're able to grab the lock right away, so drop the
406     + * lock contention counter.
407     + */
408     + atomic_add_unless(&EXT4_SB(sb)->s_lock_busy, -1, 0);
409     + else {
410     + /*
411     + * The lock is busy, so bump the contention counter,
412     + * and then wait on the spin lock.
413     + */
414     + atomic_add_unless(&EXT4_SB(sb)->s_lock_busy, 1,
415     + EXT4_MAX_CONTENTION);
416     + spin_lock(lock);
417     + }
418     }
419    
420     static inline void ext4_unlock_group(struct super_block *sb,
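
[Editor's note] The pattern generalizes beyond block-group locks: optimistically trylock, decay the shared contention counter on success, bump it (saturating at EXT4_MAX_CONTENTION) on failure, then take the lock for real. A self-contained userspace rendering of the same idea, using C11 atomics in place of atomic_add_unless (names are illustrative):

/* Adaptive lock: ext4_fs_is_busy()-style consumers read the counter
 * and back off from lock-heavy strategies when it is high. */
#include <pthread.h>
#include <stdatomic.h>

#define MAX_CONTENTION 8

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static atomic_int lock_busy;

static void adaptive_lock(void)
{
        int v = atomic_load(&lock_busy);

        if (pthread_mutex_trylock(&lock) == 0) {
                /* uncontended: decay, but never below zero */
                while (v > 0 &&
                       !atomic_compare_exchange_weak(&lock_busy, &v, v - 1))
                        ;
        } else {
                /* contended: bump, saturating at MAX_CONTENTION */
                while (v < MAX_CONTENTION &&
                       !atomic_compare_exchange_weak(&lock_busy, &v, v + 1))
                        ;
                pthread_mutex_lock(&lock);
        }
}
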
421     @@ -1650,6 +1739,8 @@ extern void ext4_ext_init(struct super_block *);
422     extern void ext4_ext_release(struct super_block *);
423     extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset,
424     loff_t len);
425     +extern int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
426     + loff_t len);
427     extern int ext4_get_blocks(handle_t *handle, struct inode *inode,
428     sector_t block, unsigned int max_blocks,
429     struct buffer_head *bh, int flags);
430     diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h
431     index 20a8410..1c2db3f 100644
432     --- a/fs/ext4/ext4_extents.h
433     +++ b/fs/ext4/ext4_extents.h
434     @@ -220,6 +220,11 @@ static inline int ext4_ext_get_actual_len(struct ext4_extent *ext)
435     (le16_to_cpu(ext->ee_len) - EXT_INIT_MAX_LEN));
436     }
437    
438     +static inline void ext4_ext_mark_initialized(struct ext4_extent *ext)
439     +{
440     + ext->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ext));
441     +}
442     +
443     extern int ext4_ext_calc_metadata_amount(struct inode *inode, int blocks);
444     extern ext4_fsblk_t ext_pblock(struct ext4_extent *ex);
445     extern ext4_fsblk_t idx_pblock(struct ext4_extent_idx *);
446     @@ -235,7 +240,7 @@ extern int ext4_ext_try_to_merge(struct inode *inode,
447     struct ext4_ext_path *path,
448     struct ext4_extent *);
449     extern unsigned int ext4_ext_check_overlap(struct inode *, struct ext4_extent *, struct ext4_ext_path *);
450     -extern int ext4_ext_insert_extent(handle_t *, struct inode *, struct ext4_ext_path *, struct ext4_extent *);
451     +extern int ext4_ext_insert_extent(handle_t *, struct inode *, struct ext4_ext_path *, struct ext4_extent *, int);
452     extern int ext4_ext_walk_space(struct inode *, ext4_lblk_t, ext4_lblk_t,
453     ext_prepare_callback, void *);
454     extern struct ext4_ext_path *ext4_ext_find_extent(struct inode *, ext4_lblk_t,
455     diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
456     index eb27fd0..6a94099 100644
457     --- a/fs/ext4/ext4_jbd2.c
458     +++ b/fs/ext4/ext4_jbd2.c
459     @@ -44,7 +44,7 @@ int __ext4_journal_forget(const char *where, handle_t *handle,
460     handle, err);
461     }
462     else
463     - brelse(bh);
464     + bforget(bh);
465     return err;
466     }
467    
468     @@ -60,7 +60,7 @@ int __ext4_journal_revoke(const char *where, handle_t *handle,
469     handle, err);
470     }
471     else
472     - brelse(bh);
473     + bforget(bh);
474     return err;
475     }
476    
477     @@ -89,7 +89,10 @@ int __ext4_handle_dirty_metadata(const char *where, handle_t *handle,
478     ext4_journal_abort_handle(where, __func__, bh,
479     handle, err);
480     } else {
481     - mark_buffer_dirty(bh);
482     + if (inode && bh)
483     + mark_buffer_dirty_inode(bh, inode);
484     + else
485     + mark_buffer_dirty(bh);
486     if (inode && inode_needs_sync(inode)) {
487     sync_dirty_buffer(bh);
488     if (buffer_req(bh) && !buffer_uptodate(bh)) {
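
[Editor's note] Both hunks exist because brelse() only drops a reference: a buffer dirtied earlier in the transaction stays eligible for writeback, so a just-freed or revoked metadata block could hit the disk again. bforget() additionally discards the dirty state. A minimal kernel-style sketch of the distinction (illustrative, not from this patch):

/* After forgetting a freed metadata block, its stale contents must
 * never be written back; bforget() clears the dirty bit on the way
 * out, brelse() does not. */
static void example_drop_freed_block(struct super_block *sb, sector_t blocknr)
{
        struct buffer_head *bh = sb_getblk(sb, blocknr);

        mark_buffer_dirty(bh);  /* pretend earlier code dirtied it */
        bforget(bh);            /* unlike brelse(bh), discards the dirty bit */
}
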
489     diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
490     index 139fb8c..1892a77 100644
491     --- a/fs/ext4/ext4_jbd2.h
492     +++ b/fs/ext4/ext4_jbd2.h
493     @@ -49,7 +49,7 @@
494    
495     #define EXT4_DATA_TRANS_BLOCKS(sb) (EXT4_SINGLEDATA_TRANS_BLOCKS(sb) + \
496     EXT4_XATTR_TRANS_BLOCKS - 2 + \
497     - 2*EXT4_QUOTA_TRANS_BLOCKS(sb))
498     + EXT4_MAXQUOTAS_TRANS_BLOCKS(sb))
499    
500     /*
501     * Define the number of metadata blocks we need to account to modify data.
502     @@ -57,7 +57,7 @@
503     * This include super block, inode block, quota blocks and xattr blocks
504     */
505     #define EXT4_META_TRANS_BLOCKS(sb) (EXT4_XATTR_TRANS_BLOCKS + \
506     - 2*EXT4_QUOTA_TRANS_BLOCKS(sb))
507     + EXT4_MAXQUOTAS_TRANS_BLOCKS(sb))
508    
509     /* Delete operations potentially hit one directory's namespace plus an
510     * entire inode, plus arbitrary amounts of bitmap/indirection data. Be
511     @@ -92,6 +92,7 @@
512     * but inode, sb and group updates are done only once */
513     #define EXT4_QUOTA_INIT_BLOCKS(sb) (test_opt(sb, QUOTA) ? (DQUOT_INIT_ALLOC*\
514     (EXT4_SINGLEDATA_TRANS_BLOCKS(sb)-3)+3+DQUOT_INIT_REWRITE) : 0)
515     +
516     #define EXT4_QUOTA_DEL_BLOCKS(sb) (test_opt(sb, QUOTA) ? (DQUOT_DEL_ALLOC*\
517     (EXT4_SINGLEDATA_TRANS_BLOCKS(sb)-3)+3+DQUOT_DEL_REWRITE) : 0)
518     #else
519     @@ -99,6 +100,9 @@
520     #define EXT4_QUOTA_INIT_BLOCKS(sb) 0
521     #define EXT4_QUOTA_DEL_BLOCKS(sb) 0
522     #endif
523     +#define EXT4_MAXQUOTAS_TRANS_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_TRANS_BLOCKS(sb))
524     +#define EXT4_MAXQUOTAS_INIT_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_INIT_BLOCKS(sb))
525     +#define EXT4_MAXQUOTAS_DEL_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_DEL_BLOCKS(sb))
526    
527     int
528     ext4_mark_iloc_dirty(handle_t *handle,
529     @@ -161,11 +165,13 @@ int __ext4_handle_dirty_metadata(const char *where, handle_t *handle,
530     handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks);
531     int __ext4_journal_stop(const char *where, handle_t *handle);
532    
533     -#define EXT4_NOJOURNAL_HANDLE ((handle_t *) 0x1)
534     +#define EXT4_NOJOURNAL_MAX_REF_COUNT ((unsigned long) 4096)
535    
536     +/* Note: Do not use this for NULL handles. This is only to determine if
537     + * a properly allocated handle is using a journal or not. */
538     static inline int ext4_handle_valid(handle_t *handle)
539     {
540     - if (handle == EXT4_NOJOURNAL_HANDLE)
541     + if ((unsigned long)handle < EXT4_NOJOURNAL_MAX_REF_COUNT)
542     return 0;
543     return 1;
544     }
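
[Editor's note] Since no valid kernel pointer is numerically smaller than EXT4_NOJOURNAL_MAX_REF_COUNT, the pseudo-handle can now double as a nesting counter for no-journal mode, which the old single magic value 0x1 could not express. A sketch of how such pseudo-handles might be produced and released (function names are illustrative; the real helpers live in super.c, outside this excerpt):

/* Sketch: encode the ext4_journal_start() nesting depth directly in
 * the handle pointer when there is no journal. */
static handle_t *example_get_nojournal(handle_t *cur)
{
        unsigned long ref_cnt = (unsigned long)cur;

        BUG_ON(ref_cnt >= EXT4_NOJOURNAL_MAX_REF_COUNT);
        ref_cnt++;
        return (handle_t *)ref_cnt;     /* still fails ext4_handle_valid() */
}

static handle_t *example_put_nojournal(handle_t *handle)
{
        unsigned long ref_cnt = (unsigned long)handle;

        BUG_ON(ref_cnt == 0);
        ref_cnt--;
        return (handle_t *)ref_cnt;
}
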
545     @@ -252,6 +258,19 @@ static inline int ext4_jbd2_file_inode(handle_t *handle, struct inode *inode)
546     return 0;
547     }
548    
549     +static inline void ext4_update_inode_fsync_trans(handle_t *handle,
550     + struct inode *inode,
551     + int datasync)
552     +{
553     + struct ext4_inode_info *ei = EXT4_I(inode);
554     +
555     + if (ext4_handle_valid(handle)) {
556     + ei->i_sync_tid = handle->h_transaction->t_tid;
557     + if (datasync)
558     + ei->i_datasync_tid = handle->h_transaction->t_tid;
559     + }
560     +}
561     +
562     /* super.c */
563     int ext4_force_commit(struct super_block *sb);
564    
565     diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
566     index 73ebfb4..24fb20b 100644
567     --- a/fs/ext4/extents.c
568     +++ b/fs/ext4/extents.c
569     @@ -93,7 +93,9 @@ static void ext4_idx_store_pblock(struct ext4_extent_idx *ix, ext4_fsblk_t pb)
570     ix->ei_leaf_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & 0xffff);
571     }
572    
573     -static int ext4_ext_journal_restart(handle_t *handle, int needed)
574     +static int ext4_ext_truncate_extend_restart(handle_t *handle,
575     + struct inode *inode,
576     + int needed)
577     {
578     int err;
579    
580     @@ -104,7 +106,14 @@ static int ext4_ext_journal_restart(handle_t *handle, int needed)
581     err = ext4_journal_extend(handle, needed);
582     if (err <= 0)
583     return err;
584     - return ext4_journal_restart(handle, needed);
585     + err = ext4_truncate_restart_trans(handle, inode, needed);
586     + /*
587     + * We have dropped i_data_sem so someone might have cached again
588     + * an extent we are going to truncate.
589     + */
590     + ext4_ext_invalidate_cache(inode);
591     +
592     + return err;
593     }
594    
595     /*
596     @@ -701,7 +710,7 @@ err:
597     * insert new index [@logical;@ptr] into the block at @curp;
598     * check where to insert: before @curp or after @curp
599     */
600     -static int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
601     +int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
602     struct ext4_ext_path *curp,
603     int logical, ext4_fsblk_t ptr)
604     {
605     @@ -1563,7 +1572,7 @@ out:
606     */
607     int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
608     struct ext4_ext_path *path,
609     - struct ext4_extent *newext)
610     + struct ext4_extent *newext, int flag)
611     {
612     struct ext4_extent_header *eh;
613     struct ext4_extent *ex, *fex;
614     @@ -1579,7 +1588,8 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
615     BUG_ON(path[depth].p_hdr == NULL);
616    
617     /* try to insert block into found extent and return */
618     - if (ex && ext4_can_extents_be_merged(inode, ex, newext)) {
619     + if (ex && (flag != EXT4_GET_BLOCKS_DIO_CREATE_EXT)
620     + && ext4_can_extents_be_merged(inode, ex, newext)) {
621     ext_debug("append %d block to %d:%d (from %llu)\n",
622     ext4_ext_get_actual_len(newext),
623     le32_to_cpu(ex->ee_block),
624     @@ -1694,7 +1704,8 @@ has_space:
625    
626     merge:
627     /* try to merge extents to the right */
628     - ext4_ext_try_to_merge(inode, path, nearex);
629     + if (flag != EXT4_GET_BLOCKS_DIO_CREATE_EXT)
630     + ext4_ext_try_to_merge(inode, path, nearex);
631    
632     /* try to merge extents to the left */
633    
634     @@ -1731,7 +1742,9 @@ int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block,
635     while (block < last && block != EXT_MAX_BLOCK) {
636     num = last - block;
637     /* find extent for this block */
638     + down_read(&EXT4_I(inode)->i_data_sem);
639     path = ext4_ext_find_extent(inode, block, path);
640     + up_read(&EXT4_I(inode)->i_data_sem);
641     if (IS_ERR(path)) {
642     err = PTR_ERR(path);
643     path = NULL;
644     @@ -2044,7 +2057,7 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
645     ext_debug("free last %u blocks starting %llu\n", num, start);
646     for (i = 0; i < num; i++) {
647     bh = sb_find_get_block(inode->i_sb, start + i);
648     - ext4_forget(handle, 0, inode, bh, start + i);
649     + ext4_forget(handle, metadata, inode, bh, start + i);
650     }
651     ext4_free_blocks(handle, inode, start, num, metadata);
652     } else if (from == le32_to_cpu(ex->ee_block)
653     @@ -2136,9 +2149,9 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
654     correct_index = 1;
655     credits += (ext_depth(inode)) + 1;
656     }
657     - credits += 2 * EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb);
658     + credits += EXT4_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb);
659    
660     - err = ext4_ext_journal_restart(handle, credits);
661     + err = ext4_ext_truncate_extend_restart(handle, inode, credits);
662     if (err)
663     goto out;
664    
665     @@ -2461,7 +2474,6 @@ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
666     }
667    
668     #define EXT4_EXT_ZERO_LEN 7
669     -
670     /*
671     * This function is called by ext4_ext_get_blocks() if someone tries to write
672     * to an uninitialized extent. It may result in splitting the uninitialized
673     @@ -2554,7 +2566,8 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
674     ex3->ee_block = cpu_to_le32(iblock);
675     ext4_ext_store_pblock(ex3, newblock);
676     ex3->ee_len = cpu_to_le16(allocated);
677     - err = ext4_ext_insert_extent(handle, inode, path, ex3);
678     + err = ext4_ext_insert_extent(handle, inode, path,
679     + ex3, 0);
680     if (err == -ENOSPC) {
681     err = ext4_ext_zeroout(inode, &orig_ex);
682     if (err)
683     @@ -2610,7 +2623,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
684     ext4_ext_store_pblock(ex3, newblock + max_blocks);
685     ex3->ee_len = cpu_to_le16(allocated - max_blocks);
686     ext4_ext_mark_uninitialized(ex3);
687     - err = ext4_ext_insert_extent(handle, inode, path, ex3);
688     + err = ext4_ext_insert_extent(handle, inode, path, ex3, 0);
689     if (err == -ENOSPC) {
690     err = ext4_ext_zeroout(inode, &orig_ex);
691     if (err)
692     @@ -2728,7 +2741,191 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
693     err = ext4_ext_dirty(handle, inode, path + depth);
694     goto out;
695     insert:
696     - err = ext4_ext_insert_extent(handle, inode, path, &newex);
697     + err = ext4_ext_insert_extent(handle, inode, path, &newex, 0);
698     + if (err == -ENOSPC) {
699     + err = ext4_ext_zeroout(inode, &orig_ex);
700     + if (err)
701     + goto fix_extent_len;
702     + /* update the extent length and mark as initialized */
703     + ex->ee_block = orig_ex.ee_block;
704     + ex->ee_len = orig_ex.ee_len;
705     + ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
706     + ext4_ext_dirty(handle, inode, path + depth);
707     + /* zero out the first half */
708     + return allocated;
709     + } else if (err)
710     + goto fix_extent_len;
711     +out:
712     + return err ? err : allocated;
713     +
714     +fix_extent_len:
715     + ex->ee_block = orig_ex.ee_block;
716     + ex->ee_len = orig_ex.ee_len;
717     + ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
718     + ext4_ext_mark_uninitialized(ex);
719     + ext4_ext_dirty(handle, inode, path + depth);
720     + return err;
721     +}
722     +
723     +/*
724     + * This function is called by ext4_ext_get_blocks() from
725     + * ext4_get_blocks_dio_write() when DIO to write
726     + * to an uninitialized extent.
727     + *
728     + * Writing to an uninitized extent may result in splitting the uninitialized
729     + * extent into multiple /intialized unintialized extents (up to three)
730     + * There are three possibilities:
731     + * a> There is no split required: Entire extent should be uninitialized
732     + * b> Splits in two extents: Write is happening at either end of the extent
733     + * c> Splits in three extents: Somone is writing in middle of the extent
734     + *
735     + * One of more index blocks maybe needed if the extent tree grow after
736     + * the unintialized extent split. To prevent ENOSPC occur at the IO
737     + * complete, we need to split the uninitialized extent before DIO submit
738     + * the IO. The uninitilized extent called at this time will be split
739     + * into three uninitialized extent(at most). After IO complete, the part
740     + * being filled will be convert to initialized by the end_io callback function
741     + * via ext4_convert_unwritten_extents().
742     + *
743     + * Returns the size of uninitialized extent to be written on success.
744     + */
745     +static int ext4_split_unwritten_extents(handle_t *handle,
746     + struct inode *inode,
747     + struct ext4_ext_path *path,
748     + ext4_lblk_t iblock,
749     + unsigned int max_blocks,
750     + int flags)
751     +{
752     + struct ext4_extent *ex, newex, orig_ex;
753     + struct ext4_extent *ex1 = NULL;
754     + struct ext4_extent *ex2 = NULL;
755     + struct ext4_extent *ex3 = NULL;
756     + struct ext4_extent_header *eh;
757     + ext4_lblk_t ee_block;
758     + unsigned int allocated, ee_len, depth;
759     + ext4_fsblk_t newblock;
760     + int err = 0;
761     +
762     + ext_debug("ext4_split_unwritten_extents: inode %lu,"
763     + "iblock %llu, max_blocks %u\n", inode->i_ino,
764     + (unsigned long long)iblock, max_blocks);
765     + depth = ext_depth(inode);
766     + eh = path[depth].p_hdr;
767     + ex = path[depth].p_ext;
768     + ee_block = le32_to_cpu(ex->ee_block);
769     + ee_len = ext4_ext_get_actual_len(ex);
770     + allocated = ee_len - (iblock - ee_block);
771     + newblock = iblock - ee_block + ext_pblock(ex);
772     + ex2 = ex;
773     + orig_ex.ee_block = ex->ee_block;
774     + orig_ex.ee_len = cpu_to_le16(ee_len);
775     + ext4_ext_store_pblock(&orig_ex, ext_pblock(ex));
776     +
777     + /*
778     + * If the uninitialized extent begins at the same logical
779     + * block where the write begins, and the write completely
780     + * covers the extent, then we don't need to split it.
781     + */
782     + if ((iblock == ee_block) && (allocated <= max_blocks))
783     + return allocated;
784     +
785     + err = ext4_ext_get_access(handle, inode, path + depth);
786     + if (err)
787     + goto out;
788     + /* ex1: ee_block to iblock - 1 : uninitialized */
789     + if (iblock > ee_block) {
790     + ex1 = ex;
791     + ex1->ee_len = cpu_to_le16(iblock - ee_block);
792     + ext4_ext_mark_uninitialized(ex1);
793     + ex2 = &newex;
794     + }
795     + /*
796     + * for sanity, update the length of the ex2 extent before
797     + * we insert ex3, if ex1 is NULL. This is to avoid temporary
798     + * overlap of blocks.
799     + */
800     + if (!ex1 && allocated > max_blocks)
801     + ex2->ee_len = cpu_to_le16(max_blocks);
802     + /* ex3: to ee_block + ee_len : uninitialised */
803     + if (allocated > max_blocks) {
804     + unsigned int newdepth;
805     + ex3 = &newex;
806     + ex3->ee_block = cpu_to_le32(iblock + max_blocks);
807     + ext4_ext_store_pblock(ex3, newblock + max_blocks);
808     + ex3->ee_len = cpu_to_le16(allocated - max_blocks);
809     + ext4_ext_mark_uninitialized(ex3);
810     + err = ext4_ext_insert_extent(handle, inode, path, ex3, flags);
811     + if (err == -ENOSPC) {
812     + err = ext4_ext_zeroout(inode, &orig_ex);
813     + if (err)
814     + goto fix_extent_len;
815     + /* update the extent length and mark as initialized */
816     + ex->ee_block = orig_ex.ee_block;
817     + ex->ee_len = orig_ex.ee_len;
818     + ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
819     + ext4_ext_dirty(handle, inode, path + depth);
820     + /* zeroed the full extent */
821     + /* blocks available from iblock */
822     + return allocated;
823     +
824     + } else if (err)
825     + goto fix_extent_len;
826     + /*
827     + * The depth, and hence eh & ex might change
828     + * as part of the insert above.
829     + */
830     + newdepth = ext_depth(inode);
831     + /*
832     + * update the extent length after successful insert of the
833     + * split extent
834     + */
835     + orig_ex.ee_len = cpu_to_le16(ee_len -
836     + ext4_ext_get_actual_len(ex3));
837     + depth = newdepth;
838     + ext4_ext_drop_refs(path);
839     + path = ext4_ext_find_extent(inode, iblock, path);
840     + if (IS_ERR(path)) {
841     + err = PTR_ERR(path);
842     + goto out;
843     + }
844     + eh = path[depth].p_hdr;
845     + ex = path[depth].p_ext;
846     + if (ex2 != &newex)
847     + ex2 = ex;
848     +
849     + err = ext4_ext_get_access(handle, inode, path + depth);
850     + if (err)
851     + goto out;
852     +
853     + allocated = max_blocks;
854     + }
855     + /*
856     + * If there was a change of depth as part of the
857     + * insertion of ex3 above, we need to update the length
858     + * of the ex1 extent again here
859     + */
860     + if (ex1 && ex1 != ex) {
861     + ex1 = ex;
862     + ex1->ee_len = cpu_to_le16(iblock - ee_block);
863     + ext4_ext_mark_uninitialized(ex1);
864     + ex2 = &newex;
865     + }
866     + /*
867     + * ex2: iblock to iblock + maxblocks-1 : to be written by direct IO,
868     + * still uninitialised.
869     + */
870     + ex2->ee_block = cpu_to_le32(iblock);
871     + ext4_ext_store_pblock(ex2, newblock);
872     + ex2->ee_len = cpu_to_le16(allocated);
873     + ext4_ext_mark_uninitialized(ex2);
874     + if (ex2 != ex)
875     + goto insert;
876     + /* Mark modified extent as dirty */
877     + err = ext4_ext_dirty(handle, inode, path + depth);
878     + ext_debug("out here\n");
879     + goto out;
880     +insert:
881     + err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
882     if (err == -ENOSPC) {
883     err = ext4_ext_zeroout(inode, &orig_ex);
884     if (err)
885     @@ -2743,6 +2940,7 @@ insert:
886     } else if (err)
887     goto fix_extent_len;
888     out:
889     + ext4_ext_show_leaf(inode, path);
890     return err ? err : allocated;
891    
892     fix_extent_len:
893     @@ -2753,7 +2951,151 @@ fix_extent_len:
894     ext4_ext_dirty(handle, inode, path + depth);
895     return err;
896     }
897     +static int ext4_convert_unwritten_extents_dio(handle_t *handle,
898     + struct inode *inode,
899     + struct ext4_ext_path *path)
900     +{
901     + struct ext4_extent *ex;
902     + struct ext4_extent_header *eh;
903     + int depth;
904     + int err = 0;
905     + int ret = 0;
906     +
907     + depth = ext_depth(inode);
908     + eh = path[depth].p_hdr;
909     + ex = path[depth].p_ext;
910     +
911     + err = ext4_ext_get_access(handle, inode, path + depth);
912     + if (err)
913     + goto out;
914     + /* first mark the extent as initialized */
915     + ext4_ext_mark_initialized(ex);
916     +
917     + /*
918     + * We have to see if it can be merged with the extent
919     + * on the left.
920     + */
921     + if (ex > EXT_FIRST_EXTENT(eh)) {
922     + /*
923     + * To merge left, pass "ex - 1" to try_to_merge(),
924     + * since it merges towards right _only_.
925     + */
926     + ret = ext4_ext_try_to_merge(inode, path, ex - 1);
927     + if (ret) {
928     + err = ext4_ext_correct_indexes(handle, inode, path);
929     + if (err)
930     + goto out;
931     + depth = ext_depth(inode);
932     + ex--;
933     + }
934     + }
935     + /*
936     + * Try to Merge towards right.
937     + */
938     + ret = ext4_ext_try_to_merge(inode, path, ex);
939     + if (ret) {
940     + err = ext4_ext_correct_indexes(handle, inode, path);
941     + if (err)
942     + goto out;
943     + depth = ext_depth(inode);
944     + }
945     + /* Mark modified extent as dirty */
946     + err = ext4_ext_dirty(handle, inode, path + depth);
947     +out:
948     + ext4_ext_show_leaf(inode, path);
949     + return err;
950     +}
951     +
952     +static int
953     +ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
954     + ext4_lblk_t iblock, unsigned int max_blocks,
955     + struct ext4_ext_path *path, int flags,
956     + unsigned int allocated, struct buffer_head *bh_result,
957     + ext4_fsblk_t newblock)
958     +{
959     + int ret = 0;
960     + int err = 0;
961     + ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio;
962     +
963     + ext_debug("ext4_ext_handle_uninitialized_extents: inode %lu, logical"
964     + "block %llu, max_blocks %u, flags %d, allocated %u",
965     + inode->i_ino, (unsigned long long)iblock, max_blocks,
966     + flags, allocated);
967     + ext4_ext_show_leaf(inode, path);
968    
969     + /* DIO get_block() before submit the IO, split the extent */
970     + if (flags == EXT4_GET_BLOCKS_DIO_CREATE_EXT) {
971     + ret = ext4_split_unwritten_extents(handle,
972     + inode, path, iblock,
973     + max_blocks, flags);
974     + /*
975     + * Flag the inode (non-AIO case) or the end_io struct (AIO case)
976     + * to record that this IO needs conversion to written when the
977     + * IO is completed
978     + */
979     + if (io)
980     + io->flag = DIO_AIO_UNWRITTEN;
981     + else
982     + EXT4_I(inode)->i_state |= EXT4_STATE_DIO_UNWRITTEN;
983     + goto out;
984     + }
985     + /* async DIO end_io complete, convert the filled extent to written */
986     + if (flags == EXT4_GET_BLOCKS_DIO_CONVERT_EXT) {
987     + ret = ext4_convert_unwritten_extents_dio(handle, inode,
988     + path);
989     + if (ret >= 0)
990     + ext4_update_inode_fsync_trans(handle, inode, 1);
991     + goto out2;
992     + }
993     + /* buffered IO case */
994     + /*
995     + * repeat fallocate creation request
996     + * we already have an unwritten extent
997     + */
998     + if (flags & EXT4_GET_BLOCKS_UNINIT_EXT)
999     + goto map_out;
1000     +
1001     + /* buffered READ or buffered write_begin() lookup */
1002     + if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
1003     + /*
1004     + * We have blocks reserved already. We
1005     + * return allocated blocks so that delalloc
1006     + * won't do block reservation for us. But
1007     + * the buffer head will be unmapped so that
1008     + * a read from the block returns 0s.
1009     + */
1010     + set_buffer_unwritten(bh_result);
1011     + goto out1;
1012     + }
1013     +
1014     + /* buffered write, writepage time, convert*/
1015     + ret = ext4_ext_convert_to_initialized(handle, inode,
1016     + path, iblock,
1017     + max_blocks);
1018     + if (ret >= 0)
1019     + ext4_update_inode_fsync_trans(handle, inode, 1);
1020     +out:
1021     + if (ret <= 0) {
1022     + err = ret;
1023     + goto out2;
1024     + } else
1025     + allocated = ret;
1026     + set_buffer_new(bh_result);
1027     +map_out:
1028     + set_buffer_mapped(bh_result);
1029     +out1:
1030     + if (allocated > max_blocks)
1031     + allocated = max_blocks;
1032     + ext4_ext_show_leaf(inode, path);
1033     + bh_result->b_bdev = inode->i_sb->s_bdev;
1034     + bh_result->b_blocknr = newblock;
1035     +out2:
1036     + if (path) {
1037     + ext4_ext_drop_refs(path);
1038     + kfree(path);
1039     + }
1040     + return err ? err : allocated;
1041     +}
1042     /*
1043     * Block allocation/map/preallocation routine for extents based files
1044     *
1045     @@ -2784,6 +3126,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
1046     int err = 0, depth, ret, cache_type;
1047     unsigned int allocated = 0;
1048     struct ext4_allocation_request ar;
1049     + ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio;
1050    
1051     __clear_bit(BH_New, &bh_result->b_state);
1052     ext_debug("blocks %u/%u requested for inode %u\n",
1053     @@ -2859,33 +3202,10 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
1054     EXT4_EXT_CACHE_EXTENT);
1055     goto out;
1056     }
1057     - if (flags & EXT4_GET_BLOCKS_UNINIT_EXT)
1058     - goto out;
1059     - if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
1060     - if (allocated > max_blocks)
1061     - allocated = max_blocks;
1062     - /*
1063     - * We have blocks reserved already. We
1064     - * return allocated blocks so that delalloc
1065     - * won't do block reservation for us. But
1066     - * the buffer head will be unmapped so that
1067     - * a read from the block returns 0s.
1068     - */
1069     - set_buffer_unwritten(bh_result);
1070     - bh_result->b_bdev = inode->i_sb->s_bdev;
1071     - bh_result->b_blocknr = newblock;
1072     - goto out2;
1073     - }
1074     -
1075     - ret = ext4_ext_convert_to_initialized(handle, inode,
1076     - path, iblock,
1077     - max_blocks);
1078     - if (ret <= 0) {
1079     - err = ret;
1080     - goto out2;
1081     - } else
1082     - allocated = ret;
1083     - goto outnew;
1084     + ret = ext4_ext_handle_uninitialized_extents(handle,
1085     + inode, iblock, max_blocks, path,
1086     + flags, allocated, bh_result, newblock);
1087     + return ret;
1088     }
1089     }
1090    
1091     @@ -2956,9 +3276,27 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
1092     /* try to insert new extent into found leaf and return */
1093     ext4_ext_store_pblock(&newex, newblock);
1094     newex.ee_len = cpu_to_le16(ar.len);
1095     - if (flags & EXT4_GET_BLOCKS_UNINIT_EXT) /* Mark uninitialized */
1096     + /* Mark uninitialized */
1097     + if (flags & EXT4_GET_BLOCKS_UNINIT_EXT) {
1098     ext4_ext_mark_uninitialized(&newex);
1099     - err = ext4_ext_insert_extent(handle, inode, path, &newex);
1100     + /*
1101     + * An io_end structure is created for every async
1102     + * direct IO write to the middle of the file.
1103     + * To avoid an unnecessary conversion for every AIO DIO rewrite
1104     + * to the middle of the file, we flag only the IO that really
1105     + * needs the conversion.
1106     + * For the non-async direct IO case, flag the inode state
1107     + * so that we perform the conversion when the IO is done.
1108     + */
1109     + if (flags == EXT4_GET_BLOCKS_DIO_CREATE_EXT) {
1110     + if (io)
1111     + io->flag = DIO_AIO_UNWRITTEN;
1112     + else
1113     + EXT4_I(inode)->i_state |=
1114     + EXT4_STATE_DIO_UNWRITTEN;
1115     + }
1116     + }
1117     + err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
1118     if (err) {
1119     /* free data blocks we just allocated */
1120     /* not a good idea to call discard here directly,
1121     @@ -2972,13 +3310,18 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
1122     /* previous routine could use block we allocated */
1123     newblock = ext_pblock(&newex);
1124     allocated = ext4_ext_get_actual_len(&newex);
1125     -outnew:
1126     set_buffer_new(bh_result);
1127    
1128     - /* Cache only when it is _not_ an uninitialized extent */
1129     - if ((flags & EXT4_GET_BLOCKS_UNINIT_EXT) == 0)
1130     + /*
1131     + * Cache the extent and update transaction to commit on fdatasync only
1132     + * when it is _not_ an uninitialized extent.
1133     + */
1134     + if ((flags & EXT4_GET_BLOCKS_UNINIT_EXT) == 0) {
1135     ext4_ext_put_in_cache(inode, iblock, allocated, newblock,
1136     EXT4_EXT_CACHE_EXTENT);
1137     + ext4_update_inode_fsync_trans(handle, inode, 1);
1138     + } else
1139     + ext4_update_inode_fsync_trans(handle, inode, 0);
1140     out:
1141     if (allocated > max_blocks)
1142     allocated = max_blocks;
1143     @@ -3171,6 +3514,64 @@ retry:
1144     }
1145    
1146     /*
1147     + * This function converts a range of blocks to written extents.
1148     + * The caller passes the start offset and the size;
1149     + * all unwritten extents within this range will be converted to
1150     + * written extents.
1151     + *
1152     + * This function is called from the direct IO end_io callback
1153     + * to convert the fallocated extents after the IO is completed.
1154     + * Returns 0 on success.
1155     + */
1156     +int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
1157     + loff_t len)
1158     +{
1159     + handle_t *handle;
1160     + ext4_lblk_t block;
1161     + unsigned int max_blocks;
1162     + int ret = 0;
1163     + int ret2 = 0;
1164     + struct buffer_head map_bh;
1165     + unsigned int credits, blkbits = inode->i_blkbits;
1166     +
1167     + block = offset >> blkbits;
1168     + /*
1169     + * We can't just convert len to max_blocks: e.g. with blocksize = 4096,
1170     + * offset = 3072 and len = 2048, the range spans two blocks but len >> blkbits == 0
1171     + */
1172     + max_blocks = (EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits)
1173     + - block;
1174     + /*
1175     + * credits to insert 1 extent into extent tree
1176     + */
1177     + credits = ext4_chunk_trans_blocks(inode, max_blocks);
1178     + while (ret >= 0 && ret < max_blocks) {
1179     + block = block + ret;
1180     + max_blocks = max_blocks - ret;
1181     + handle = ext4_journal_start(inode, credits);
1182     + if (IS_ERR(handle)) {
1183     + ret = PTR_ERR(handle);
1184     + break;
1185     + }
1186     + map_bh.b_state = 0;
1187     + ret = ext4_get_blocks(handle, inode, block,
1188     + max_blocks, &map_bh,
1189     + EXT4_GET_BLOCKS_DIO_CONVERT_EXT);
1190     + if (ret <= 0) {
1191     + WARN_ON(ret <= 0);
1192     + printk(KERN_ERR "%s: ext4_ext_get_blocks "
1193     + "returned error inode#%lu, block=%u, "
1194     + "max_blocks=%u", __func__,
1195     + inode->i_ino, block, max_blocks);
1196     + }
1197     + ext4_mark_inode_dirty(handle, inode);
1198     + ret2 = ext4_journal_stop(handle);
1199     + if (ret <= 0 || ret2)
1200     + break;
1201     + }
1202     + return ret > 0 ? ret2 : ret;
1203     +}
1204     +/*
1205     * Callback function called for each extent to gather FIEMAP information.
1206     */
1207     static int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path,
1208     @@ -3308,10 +3709,8 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
1209     * Walk the extent tree gathering extent information.
1210     * ext4_ext_fiemap_cb will push extents back to user.
1211     */
1212     - down_read(&EXT4_I(inode)->i_data_sem);
1213     error = ext4_ext_walk_space(inode, start_blk, len_blks,
1214     ext4_ext_fiemap_cb, fieinfo);
1215     - up_read(&EXT4_I(inode)->i_data_sem);
1216     }
1217    
1218     return error;
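
[Editor's note] The EXT4_GET_BLOCKS_DIO_* flags threaded through ext4_ext_get_blocks() above originate in the direct IO get_block callback. A paraphrased sketch of that caller (the real function sits in the inode.c portion of this patch, beyond this excerpt; the credit estimate here is a simplification):

/* Sketch: the DIO write path requests uninitialized extents up front,
 * splitting them as needed, so the post-IO conversion to "written"
 * cannot fail with ENOSPC. */
static int example_get_block_dio_write(struct inode *inode, sector_t iblock,
                                       struct buffer_head *bh_result,
                                       int create)
{
        unsigned int max_blocks = bh_result->b_size >> inode->i_blkbits;
        handle_t *handle;
        int ret;

        handle = ext4_journal_start(inode,
                                    EXT4_DATA_TRANS_BLOCKS(inode->i_sb));
        if (IS_ERR(handle))
                return PTR_ERR(handle);
        ret = ext4_get_blocks(handle, inode, iblock, max_blocks, bh_result,
                              EXT4_GET_BLOCKS_DIO_CREATE_EXT);
        ext4_journal_stop(handle);
        return ret < 0 ? ret : 0;       /* > 0 means blocks were mapped */
}
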
1219     diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
1220     index 83cf641..d6049e4 100644
1221     --- a/fs/ext4/fsync.c
1222     +++ b/fs/ext4/fsync.c
1223     @@ -44,27 +44,37 @@
1224     *
1225     * What we do is just kick off a commit and wait on it. This will snapshot the
1226     * inode to disk.
1227     + *
1228     + * i_mutex lock is held when entering and exiting this function
1229     */
1230    
1231     int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
1232     {
1233     struct inode *inode = dentry->d_inode;
1234     + struct ext4_inode_info *ei = EXT4_I(inode);
1235     journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
1236     - int ret = 0;
1237     + int ret;
1238     + tid_t commit_tid;
1239    
1240     J_ASSERT(ext4_journal_current_handle() == NULL);
1241    
1242     trace_ext4_sync_file(file, dentry, datasync);
1243    
1244     + if (inode->i_sb->s_flags & MS_RDONLY)
1245     + return 0;
1246     +
1247     + ret = flush_aio_dio_completed_IO(inode);
1248     + if (ret < 0)
1249     + return ret;
1250     +
1251     + if (!journal)
1252     + return simple_fsync(file, dentry, datasync);
1253     +
1254     /*
1255     - * data=writeback:
1256     + * data=writeback,ordered:
1257     * The caller's filemap_fdatawrite()/wait will sync the data.
1258     - * sync_inode() will sync the metadata
1259     - *
1260     - * data=ordered:
1261     - * The caller's filemap_fdatawrite() will write the data and
1262     - * sync_inode() will write the inode if it is dirty. Then the caller's
1263     - * filemap_fdatawait() will wait on the pages.
1264     + * Metadata is in the journal; we wait for the proper transaction to
1265     + * commit here.
1266     *
1267     * data=journal:
1268     * filemap_fdatawrite won't do anything (the buffers are clean).
1269     @@ -74,27 +84,13 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
1270     * (they were dirtied by commit). But that's OK - the blocks are
1271     * safe in-journal, which is all fsync() needs to ensure.
1272     */
1273     - if (ext4_should_journal_data(inode)) {
1274     - ret = ext4_force_commit(inode->i_sb);
1275     - goto out;
1276     - }
1277     -
1278     - if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
1279     - goto out;
1280     + if (ext4_should_journal_data(inode))
1281     + return ext4_force_commit(inode->i_sb);
1282    
1283     - /*
1284     - * The VFS has written the file data. If the inode is unaltered
1285     - * then we need not start a commit.
1286     - */
1287     - if (inode->i_state & (I_DIRTY_SYNC|I_DIRTY_DATASYNC)) {
1288     - struct writeback_control wbc = {
1289     - .sync_mode = WB_SYNC_ALL,
1290     - .nr_to_write = 0, /* sys_fsync did this */
1291     - };
1292     - ret = sync_inode(inode, &wbc);
1293     - if (journal && (journal->j_flags & JBD2_BARRIER))
1294     - blkdev_issue_flush(inode->i_sb->s_bdev, NULL);
1295     - }
1296     -out:
1297     + commit_tid = datasync ? ei->i_datasync_tid : ei->i_sync_tid;
1298     + if (jbd2_log_start_commit(journal, commit_tid))
1299     + jbd2_log_wait_commit(journal, commit_tid);
1300     + else if (journal->j_flags & JBD2_BARRIER)
1301     + blkdev_issue_flush(inode->i_sb->s_bdev, NULL);
1302     return ret;
1303     }
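
[Editor's note] From user space the new i_sync_tid/i_datasync_tid split is visible only as cheaper fdatasync() calls: a data-plus-size write forces a commit, while an update that merely dirtied timestamps can ride an already-committed transaction. A small usage example (ordinary POSIX C, not part of the patch):

/* fdatasync() waits on i_datasync_tid; fsync() waits on i_sync_tid,
 * which also covers metadata-only changes such as timestamp updates. */
#include <unistd.h>

int append_durably(int fd, const void *buf, size_t len)
{
        if (write(fd, buf, len) != (ssize_t)len)
                return -1;
        return fdatasync(fd);   /* data/size changes bump i_datasync_tid */
}
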
1304     diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
1305     index f9c642b..38b2154 100644
1306     --- a/fs/ext4/inode.c
1307     +++ b/fs/ext4/inode.c
1308     @@ -37,6 +37,7 @@
1309     #include <linux/namei.h>
1310     #include <linux/uio.h>
1311     #include <linux/bio.h>
1312     +#include <linux/workqueue.h>
1313    
1314     #include "ext4_jbd2.h"
1315     #include "xattr.h"
1316     @@ -192,11 +193,25 @@ static int try_to_extend_transaction(handle_t *handle, struct inode *inode)
1317     * so before we call here everything must be consistently dirtied against
1318     * this transaction.
1319     */
1320     -static int ext4_journal_test_restart(handle_t *handle, struct inode *inode)
1321     +int ext4_truncate_restart_trans(handle_t *handle, struct inode *inode,
1322     + int nblocks)
1323     {
1324     + int ret;
1325     +
1326     + /*
1327     + * Drop i_data_sem to avoid deadlock with ext4_get_blocks. At this
1328     + * moment, get_block can be called only for blocks inside i_size since
1329     + * page cache has been already dropped and writes are blocked by
1330     + * i_mutex. So we can safely drop the i_data_sem here.
1331     + */
1332     BUG_ON(EXT4_JOURNAL(inode) == NULL);
1333     jbd_debug(2, "restarting handle %p\n", handle);
1334     - return ext4_journal_restart(handle, blocks_for_truncate(inode));
1335     + up_write(&EXT4_I(inode)->i_data_sem);
1336     + ret = ext4_journal_restart(handle, blocks_for_truncate(inode));
1337     + down_write(&EXT4_I(inode)->i_data_sem);
1338     + ext4_discard_preallocations(inode);
1339     +
1340     + return ret;
1341     }
1342    
1343     /*
1344     @@ -551,15 +566,21 @@ static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind)
1345     *
1346     * Normally this function find the preferred place for block allocation,
1347     * returns it.
1348     + * Because this is only used for non-extent files, we limit the block nr
1349     + * to 32 bits.
1350     */
1351     static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block,
1352     Indirect *partial)
1353     {
1354     + ext4_fsblk_t goal;
1355     +
1356     /*
1357     * XXX need to get goal block from mballoc's data structures
1358     */
1359    
1360     - return ext4_find_near(inode, partial);
1361     + goal = ext4_find_near(inode, partial);
1362     + goal = goal & EXT4_MAX_BLOCK_FILE_PHYS;
1363     + return goal;
1364     }
1365    
1366     /**
1367     @@ -640,6 +661,8 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
1368     if (*err)
1369     goto failed_out;
1370    
1371     + BUG_ON(current_block + count > EXT4_MAX_BLOCK_FILE_PHYS);
1372     +
1373     target -= count;
1374     /* allocate blocks for indirect blocks */
1375     while (index < indirect_blks && count) {
1376     @@ -674,6 +697,7 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
1377     ar.flags = EXT4_MB_HINT_DATA;
1378    
1379     current_block = ext4_mb_new_blocks(handle, &ar, err);
1380     + BUG_ON(current_block + ar.len > EXT4_MAX_BLOCK_FILE_PHYS);
1381    
1382     if (*err && (target == blks)) {
1383     /*
1384     @@ -998,10 +1022,12 @@ static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode,
1385     if (!err)
1386     err = ext4_splice_branch(handle, inode, iblock,
1387     partial, indirect_blks, count);
1388     - else
1389     + if (err)
1390     goto cleanup;
1391    
1392     set_buffer_new(bh_result);
1393     +
1394     + ext4_update_inode_fsync_trans(handle, inode, 1);
1395     got_it:
1396     map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-1].key));
1397     if (count > blocks_to_boundary)
1398     @@ -1029,7 +1055,7 @@ qsize_t ext4_get_reserved_space(struct inode *inode)
1399     EXT4_I(inode)->i_reserved_meta_blocks;
1400     spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
1401    
1402     - return total;
1403     + return (total << inode->i_blkbits);
1404     }
1405     /*
1406     * Calculate the number of metadata blocks need to reserve
1407     @@ -1109,22 +1135,79 @@ static void ext4_da_update_reserve_space(struct inode *inode, int used)
1408     ext4_discard_preallocations(inode);
1409     }
1410    
1411     -static int check_block_validity(struct inode *inode, sector_t logical,
1412     - sector_t phys, int len)
1413     +static int check_block_validity(struct inode *inode, const char *msg,
1414     + sector_t logical, sector_t phys, int len)
1415     {
1416     if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), phys, len)) {
1417     - ext4_error(inode->i_sb, "check_block_validity",
1418     + ext4_error(inode->i_sb, msg,
1419     "inode #%lu logical block %llu mapped to %llu "
1420     "(size %d)", inode->i_ino,
1421     (unsigned long long) logical,
1422     (unsigned long long) phys, len);
1423     - WARN_ON(1);
1424     return -EIO;
1425     }
1426     return 0;
1427     }
1428    
1429     /*
1430     + * Return the number of contiguous dirty pages in a given inode
1431     + * starting at page frame idx.
1432     + */
1433     +static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx,
1434     + unsigned int max_pages)
1435     +{
1436     + struct address_space *mapping = inode->i_mapping;
1437     + pgoff_t index;
1438     + struct pagevec pvec;
1439     + pgoff_t num = 0;
1440     + int i, nr_pages, done = 0;
1441     +
1442     + if (max_pages == 0)
1443     + return 0;
1444     + pagevec_init(&pvec, 0);
1445     + while (!done) {
1446     + index = idx;
1447     + nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
1448     + PAGECACHE_TAG_DIRTY,
1449     + (pgoff_t)PAGEVEC_SIZE);
1450     + if (nr_pages == 0)
1451     + break;
1452     + for (i = 0; i < nr_pages; i++) {
1453     + struct page *page = pvec.pages[i];
1454     + struct buffer_head *bh, *head;
1455     +
1456     + lock_page(page);
1457     + if (unlikely(page->mapping != mapping) ||
1458     + !PageDirty(page) ||
1459     + PageWriteback(page) ||
1460     + page->index != idx) {
1461     + done = 1;
1462     + unlock_page(page);
1463     + break;
1464     + }
1465     + if (page_has_buffers(page)) {
1466     + bh = head = page_buffers(page);
1467     + do {
1468     + if (!buffer_delay(bh) &&
1469     + !buffer_unwritten(bh))
1470     + done = 1;
1471     + bh = bh->b_this_page;
1472     + } while (!done && (bh != head));
1473     + }
1474     + unlock_page(page);
1475     + if (done)
1476     + break;
1477     + idx++;
1478     + num++;
1479     + if (num >= max_pages)
1480     + break;
1481     + }
1482     + pagevec_release(&pvec);
1483     + }
1484     + return num;
1485     +}
1486     +
1487     +/*
1488     * The ext4_get_blocks() function tries to look up the requested blocks,
1489     * and returns if the blocks are already mapped.
1490     *
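
ext4_num_dirty_pages() above is, at heart, a bounded scan for a contiguous run of dirty delayed-allocation pages: it stops at the first non-contiguous, clean, or writeback page, or at any buffer that is neither delay nor unwritten. A userspace sketch of the same run-counting over a plain array (the pagevec batching and page locking are omitted):

    #include <stddef.h>
    #include <stdio.h>

    /* 1 = dirty delayed-allocation page, 0 = anything that stops the scan */
    static size_t count_dirty_run(const int *pages, size_t n,
                                  size_t idx, size_t max_pages)
    {
        size_t num = 0;

        while (idx < n && num < max_pages && pages[idx]) {
            idx++;
            num++;
        }
        return num;
    }

    int main(void)
    {
        int pages[] = { 1, 1, 1, 0, 1, 1 };

        /* stops at the clean page: run length is 3, not 5 */
        printf("%zu\n", count_dirty_run(pages, 6, 0, 1024));
        return 0;
    }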
1491     @@ -1155,6 +1238,9 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
1492     clear_buffer_mapped(bh);
1493     clear_buffer_unwritten(bh);
1494    
1495     + ext_debug("ext4_get_blocks(): inode %lu, flag %d, max_blocks %u, "
1496     + "logical block %lu\n", inode->i_ino, flags, max_blocks,
1497     + (unsigned long)block);
1498     /*
1499     * Try to see if we can get the block without requesting a new
1500     * file system block.
1501     @@ -1170,8 +1256,8 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
1502     up_read((&EXT4_I(inode)->i_data_sem));
1503    
1504     if (retval > 0 && buffer_mapped(bh)) {
1505     - int ret = check_block_validity(inode, block,
1506     - bh->b_blocknr, retval);
1507     + int ret = check_block_validity(inode, "file system corruption",
1508     + block, bh->b_blocknr, retval);
1509     if (ret != 0)
1510     return ret;
1511     }
1512     @@ -1235,8 +1321,7 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
1513     * i_data's format changing. Force the migrate
1514     * to fail by clearing migrate flags
1515     */
1516     - EXT4_I(inode)->i_flags = EXT4_I(inode)->i_flags &
1517     - ~EXT4_EXT_MIGRATE;
1518     + EXT4_I(inode)->i_state &= ~EXT4_STATE_EXT_MIGRATE;
1519     }
1520     }
1521    
1522     @@ -1252,8 +1337,9 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
1523    
1524     up_write((&EXT4_I(inode)->i_data_sem));
1525     if (retval > 0 && buffer_mapped(bh)) {
1526     - int ret = check_block_validity(inode, block,
1527     - bh->b_blocknr, retval);
1528     + int ret = check_block_validity(inode, "file system "
1529     + "corruption after allocation",
1530     + block, bh->b_blocknr, retval);
1531     if (ret != 0)
1532     return ret;
1533     }
1534     @@ -1451,6 +1537,16 @@ static int do_journal_get_write_access(handle_t *handle,
1535     return ext4_journal_get_write_access(handle, bh);
1536     }
1537    
1538     +/*
1539     + * Truncate blocks that were not used by write. We have to truncate the
1540     + * pagecache as well so that corresponding buffers get properly unmapped.
1541     + */
1542     +static void ext4_truncate_failed_write(struct inode *inode)
1543     +{
1544     + truncate_inode_pages(inode->i_mapping, inode->i_size);
1545     + ext4_truncate(inode);
1546     +}
1547     +
1548     static int ext4_write_begin(struct file *file, struct address_space *mapping,
1549     loff_t pos, unsigned len, unsigned flags,
1550     struct page **pagep, void **fsdata)
1551     @@ -1516,7 +1612,7 @@ retry:
1552    
1553     ext4_journal_stop(handle);
1554     if (pos + len > inode->i_size) {
1555     - ext4_truncate(inode);
1556     + ext4_truncate_failed_write(inode);
1557     /*
1558     * If truncate failed early the inode might
1559     * still be on the orphan list; we need to
1560     @@ -1626,7 +1722,7 @@ static int ext4_ordered_write_end(struct file *file,
1561     ret = ret2;
1562    
1563     if (pos + len > inode->i_size) {
1564     - ext4_truncate(inode);
1565     + ext4_truncate_failed_write(inode);
1566     /*
1567     * If truncate failed early the inode might still be
1568     * on the orphan list; we need to make sure the inode
1569     @@ -1668,7 +1764,7 @@ static int ext4_writeback_write_end(struct file *file,
1570     ret = ret2;
1571    
1572     if (pos + len > inode->i_size) {
1573     - ext4_truncate(inode);
1574     + ext4_truncate_failed_write(inode);
1575     /*
1576     * If truncate failed early the inode might still be
1577     * on the orphan list; we need to make sure the inode
1578     @@ -1731,7 +1827,7 @@ static int ext4_journalled_write_end(struct file *file,
1579     if (!ret)
1580     ret = ret2;
1581     if (pos + len > inode->i_size) {
1582     - ext4_truncate(inode);
1583     + ext4_truncate_failed_write(inode);
1584     /*
1585     * If truncate failed early the inode might still be
1586     * on the orphan list; we need to make sure the inode
1587     @@ -1776,11 +1872,11 @@ repeat:
1588    
1589     if (ext4_claim_free_blocks(sbi, total)) {
1590     spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
1591     + vfs_dq_release_reservation_block(inode, total);
1592     if (ext4_should_retry_alloc(inode->i_sb, &retries)) {
1593     yield();
1594     goto repeat;
1595     }
1596     - vfs_dq_release_reservation_block(inode, total);
1597     return -ENOSPC;
1598     }
1599     EXT4_I(inode)->i_reserved_data_blocks += nrblocks;
1600     @@ -1860,22 +1956,6 @@ static void ext4_da_page_release_reservation(struct page *page,
1601     }
1602    
1603     /*
1604     - * Delayed allocation stuff
1605     - */
1606     -
1607     -struct mpage_da_data {
1608     - struct inode *inode;
1609     - sector_t b_blocknr; /* start block number of extent */
1610     - size_t b_size; /* size of extent */
1611     - unsigned long b_state; /* state of the extent */
1612     - unsigned long first_page, next_page; /* extent of pages */
1613     - struct writeback_control *wbc;
1614     - int io_done;
1615     - int pages_written;
1616     - int retval;
1617     -};
1618     -
1619     -/*
1620     * mpage_da_submit_io - walks through extent of pages and try to write
1621     * them with writepage() call back
1622     *
1623     @@ -2717,7 +2797,7 @@ static int ext4_da_writepages_trans_blocks(struct inode *inode)
1624     * number of contiguous block. So we will limit
1625     * number of contiguous block to a sane value
1626     */
1627     - if (!(inode->i_flags & EXT4_EXTENTS_FL) &&
1628     + if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) &&
1629     (max_blocks > EXT4_MAX_TRANS_DATA))
1630     max_blocks = EXT4_MAX_TRANS_DATA;
1631    
1632     @@ -2735,8 +2815,11 @@ static int ext4_da_writepages(struct address_space *mapping,
1633     int no_nrwrite_index_update;
1634     int pages_written = 0;
1635     long pages_skipped;
1636     + unsigned int max_pages;
1637     int range_cyclic, cycled = 1, io_done = 0;
1638     - int needed_blocks, ret = 0, nr_to_writebump = 0;
1639     + int needed_blocks, ret = 0;
1640     + long desired_nr_to_write, nr_to_writebump = 0;
1641     + loff_t range_start = wbc->range_start;
1642     struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
1643    
1644     trace_ext4_da_writepages(inode, wbc);
1645     @@ -2762,16 +2845,6 @@ static int ext4_da_writepages(struct address_space *mapping,
1646     if (unlikely(sbi->s_mount_flags & EXT4_MF_FS_ABORTED))
1647     return -EROFS;
1648    
1649     - /*
1650     - * Make sure nr_to_write is >= sbi->s_mb_stream_request
1651     - * This make sure small files blocks are allocated in
1652     - * single attempt. This ensure that small files
1653     - * get less fragmented.
1654     - */
1655     - if (wbc->nr_to_write < sbi->s_mb_stream_request) {
1656     - nr_to_writebump = sbi->s_mb_stream_request - wbc->nr_to_write;
1657     - wbc->nr_to_write = sbi->s_mb_stream_request;
1658     - }
1659     if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
1660     range_whole = 1;
1661    
1662     @@ -2786,6 +2859,36 @@ static int ext4_da_writepages(struct address_space *mapping,
1663     } else
1664     index = wbc->range_start >> PAGE_CACHE_SHIFT;
1665    
1666     + /*
1667     + * This works around two forms of stupidity. The first is in
1668     + * the writeback code, which caps the maximum number of pages
1669     + * written to be 1024 pages. This is wrong on multiple
1670     + * levels; different architectues have a different page size,
1671     + * which changes the maximum amount of data which gets
1672     + * written. Secondly, 4 megabytes is way too small. XFS
1673     + * forces this value to be 16 megabytes by multiplying
1674     + * nr_to_write parameter by four, and then relies on its
1675     + * allocator to allocate larger extents to make them
1676     + * contiguous. Unfortunately this brings us to the second
1677     + * stupidity, which is that ext4's mballoc code only allocates
1678     + * at most 2048 blocks. So we force contiguous writes up to
1679     + * the number of dirty blocks in the inode, or
1680     + * sbi->max_writeback_mb_bump whichever is smaller.
1681     + */
1682     + max_pages = sbi->s_max_writeback_mb_bump << (20 - PAGE_CACHE_SHIFT);
1683     + if (!range_cyclic && range_whole)
1684     + desired_nr_to_write = wbc->nr_to_write * 8;
1685     + else
1686     + desired_nr_to_write = ext4_num_dirty_pages(inode, index,
1687     + max_pages);
1688     + if (desired_nr_to_write > max_pages)
1689     + desired_nr_to_write = max_pages;
1690     +
1691     + if (wbc->nr_to_write < desired_nr_to_write) {
1692     + nr_to_writebump = desired_nr_to_write - wbc->nr_to_write;
1693     + wbc->nr_to_write = desired_nr_to_write;
1694     + }
1695     +
1696     mpd.wbc = wbc;
1697     mpd.inode = mapping->host;
1698    
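
The replacement bump logic above only ever raises wbc->nr_to_write, caps the target at s_max_writeback_mb_bump megabytes worth of pages, and records the delta so the exit path can subtract it again. The arithmetic with illustrative numbers (4 KiB pages assumed):

    #include <stdio.h>

    #define PAGE_SHIFT 12   /* 4 KiB pages, for illustration */

    int main(void)
    {
        long max_writeback_mb_bump = 128;            /* tunable, in MiB */
        long max_pages = max_writeback_mb_bump << (20 - PAGE_SHIFT);
        long nr_to_write = 1024;                     /* writeback default */
        long desired = 6000;                         /* dirty pages found */
        long bump = 0;

        if (desired > max_pages)
            desired = max_pages;
        if (nr_to_write < desired) {
            bump = desired - nr_to_write;
            nr_to_write = desired;
        }
        printf("nr_to_write=%ld bump=%ld (restored to %ld afterwards)\n",
               nr_to_write, bump, nr_to_write - bump);
        return 0;
    }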
1699     @@ -2904,7 +3007,9 @@ retry:
1700     out_writepages:
1701     if (!no_nrwrite_index_update)
1702     wbc->no_nrwrite_index_update = 0;
1703     - wbc->nr_to_write -= nr_to_writebump;
1704     + if (wbc->nr_to_write > nr_to_writebump)
1705     + wbc->nr_to_write -= nr_to_writebump;
1706     + wbc->range_start = range_start;
1707     trace_ext4_da_writepages_result(inode, wbc, ret, pages_written);
1708     return ret;
1709     }
1710     @@ -2994,7 +3099,7 @@ retry:
1711     * i_size_read because we hold i_mutex.
1712     */
1713     if (pos + len > inode->i_size)
1714     - ext4_truncate(inode);
1715     + ext4_truncate_failed_write(inode);
1716     }
1717    
1718     if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
1719     @@ -3259,6 +3364,8 @@ static int ext4_releasepage(struct page *page, gfp_t wait)
1720     }
1721    
1722     /*
1723     + * O_DIRECT for ext3 (or indirect map) based files
1724     + *
1725     * If the O_DIRECT write will extend the file then add this inode to the
1726     * orphan list. So recovery will truncate it back to the original size
1727     * if the machine crashes during the write.
1728     @@ -3267,7 +3374,7 @@ static int ext4_releasepage(struct page *page, gfp_t wait)
1729     * crashes then stale disk data _may_ be exposed inside the file. But current
1730     * VFS code falls back into buffered path in that case so we are safe.
1731     */
1732     -static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
1733     +static ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb,
1734     const struct iovec *iov, loff_t offset,
1735     unsigned long nr_segs)
1736     {
1737     @@ -3278,6 +3385,7 @@ static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
1738     ssize_t ret;
1739     int orphan = 0;
1740     size_t count = iov_length(iov, nr_segs);
1741     + int retries = 0;
1742    
1743     if (rw == WRITE) {
1744     loff_t final_size = offset + count;
1745     @@ -3300,9 +3408,12 @@ static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
1746     }
1747     }
1748    
1749     +retry:
1750     ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
1751     offset, nr_segs,
1752     ext4_get_block, NULL);
1753     + if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
1754     + goto retry;
1755    
1756     if (orphan) {
1757     int err;
1758     @@ -3341,6 +3452,364 @@ out:
1759     return ret;
1760     }
1761    
1762     +static int ext4_get_block_dio_write(struct inode *inode, sector_t iblock,
1763     + struct buffer_head *bh_result, int create)
1764     +{
1765     + handle_t *handle = NULL;
1766     + int ret = 0;
1767     + unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
1768     + int dio_credits;
1769     +
1770     + ext4_debug("ext4_get_block_dio_write: inode %lu, create flag %d\n",
1771     + inode->i_ino, create);
1772     + /*
1773     + * DIO VFS code passes create = 0 for writes to
1774     + * the middle of the file. It does this to avoid block
1775     + * allocation for holes, to prevent exposing stale data
1776     + * when there is a parallel buffered read (which does
1777     + * not hold the i_mutex lock) while the direct IO write has
1778     + * not completed. A DIO request on holes therefore falls back
1779     + * to buffered IO.
1780     + *
1781     + * For ext4 extent-based files, since we support fallocate,
1782     + * newly allocated extents are marked uninitialized; for holes
1783     + * we could fallocate blocks, so a parallel
1784     + * buffered read will zero out the page when reading
1785     + * a hole whose parallel DIO write has not completed.
1786     + *
1787     + * When we get here, we know it is a direct IO write to
1788     + * the middle of the file (< i_size),
1789     + * so it is safe to override the create flag from the VFS.
1790     + */
1791     + create = EXT4_GET_BLOCKS_DIO_CREATE_EXT;
1792     +
1793     + if (max_blocks > DIO_MAX_BLOCKS)
1794     + max_blocks = DIO_MAX_BLOCKS;
1795     + dio_credits = ext4_chunk_trans_blocks(inode, max_blocks);
1796     + handle = ext4_journal_start(inode, dio_credits);
1797     + if (IS_ERR(handle)) {
1798     + ret = PTR_ERR(handle);
1799     + goto out;
1800     + }
1801     + ret = ext4_get_blocks(handle, inode, iblock, max_blocks, bh_result,
1802     + create);
1803     + if (ret > 0) {
1804     + bh_result->b_size = (ret << inode->i_blkbits);
1805     + ret = 0;
1806     + }
1807     + ext4_journal_stop(handle);
1808     +out:
1809     + return ret;
1810     +}
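
ext4_get_block_dio_write() converts between bytes and blocks with shifts in both directions: the requested size arrives in bh_result->b_size, and the length actually mapped is written back the same way so the DIO layer can see a short mapping. The round trip, assuming 4 KiB blocks (i_blkbits of 12):

    #include <stdio.h>

    int main(void)
    {
        unsigned blkbits = 12;                    /* 4 KiB blocks, assumed */
        unsigned long long b_size = 1 << 20;      /* 1 MiB DIO request */

        unsigned max_blocks = b_size >> blkbits;  /* 256 blocks wanted */
        unsigned mapped = 64;                     /* blocks actually mapped */

        /* b_size is rewritten so the caller knows the short mapping */
        b_size = (unsigned long long)mapped << blkbits;
        printf("asked %u blocks, mapped %u, b_size now %llu bytes\n",
               max_blocks, mapped, b_size);
        return 0;
    }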
1811     +
1812     +static void ext4_free_io_end(ext4_io_end_t *io)
1813     +{
1814     + BUG_ON(!io);
1815     + iput(io->inode);
1816     + kfree(io);
1817     +}
1818     +static void dump_aio_dio_list(struct inode * inode)
1819     +{
1820     +#ifdef EXT4_DEBUG
1821     + struct list_head *cur, *before, *after;
1822     + ext4_io_end_t *io, *io0, *io1;
1823     +
1824     + if (list_empty(&EXT4_I(inode)->i_aio_dio_complete_list)){
1825     + ext4_debug("inode %lu aio dio list is empty\n", inode->i_ino);
1826     + return;
1827     + }
1828     +
1829     + ext4_debug("Dump inode %lu aio_dio_completed_IO list \n", inode->i_ino);
1830     + list_for_each_entry(io, &EXT4_I(inode)->i_aio_dio_complete_list, list){
1831     + cur = &io->list;
1832     + before = cur->prev;
1833     + io0 = container_of(before, ext4_io_end_t, list);
1834     + after = cur->next;
1835     + io1 = container_of(after, ext4_io_end_t, list);
1836     +
1837     + ext4_debug("io 0x%p from inode %lu,prev 0x%p,next 0x%p\n",
1838     + io, inode->i_ino, io0, io1);
1839     + }
1840     +#endif
1841     +}
1842     +
1843     +/*
1844     + * check a range of space and convert unwritten extents to written.
1845     + */
1846     +static int ext4_end_aio_dio_nolock(ext4_io_end_t *io)
1847     +{
1848     + struct inode *inode = io->inode;
1849     + loff_t offset = io->offset;
1850     + size_t size = io->size;
1851     + int ret = 0;
1852     +
1853     + ext4_debug("end_aio_dio_onlock: io 0x%p from inode %lu,list->next 0x%p,"
1854     + "list->prev 0x%p\n",
1855     + io, inode->i_ino, io->list.next, io->list.prev);
1856     +
1857     + if (list_empty(&io->list))
1858     + return ret;
1859     +
1860     + if (io->flag != DIO_AIO_UNWRITTEN)
1861     + return ret;
1862     +
1863     + if (offset + size <= i_size_read(inode))
1864     + ret = ext4_convert_unwritten_extents(inode, offset, size);
1865     +
1866     + if (ret < 0) {
1867     + printk(KERN_EMERG "%s: failed to convert unwritten"
1868     + " extents to written extents, error is %d,"
1869     + " io is still on inode %lu aio dio list\n",
1870     + __func__, ret, inode->i_ino);
1871     + return ret;
1872     + }
1873     +
1874     + /* clear the DIO AIO unwritten flag */
1875     + io->flag = 0;
1876     + return ret;
1877     +}
1878     +/*
1879     + * work on completed aio dio IO, to convert unwritten extents to written extents
1880     + */
1881     +static void ext4_end_aio_dio_work(struct work_struct *work)
1882     +{
1883     + ext4_io_end_t *io = container_of(work, ext4_io_end_t, work);
1884     + struct inode *inode = io->inode;
1885     + int ret = 0;
1886     +
1887     + mutex_lock(&inode->i_mutex);
1888     + ret = ext4_end_aio_dio_nolock(io);
1889     + if (ret >= 0) {
1890     + if (!list_empty(&io->list))
1891     + list_del_init(&io->list);
1892     + ext4_free_io_end(io);
1893     + }
1894     + mutex_unlock(&inode->i_mutex);
1895     +}
1896     +/*
1897     + * This function is called from ext4_sync_file().
1898     + *
1899     + * When AIO DIO IO is completed, the work to convert unwritten
1900     + * extents to written is queued on a workqueue but may not get
1901     + * scheduled immediately. When fsync is called, we need to ensure the
1902     + * conversion is complete before fsync returns.
1903     + * The inode keeps track of a list of completed AIO from the DIO path
1904     + * that might need the conversion. This function walks through
1905     + * the list and converts the related unwritten extents to written.
1906     + */
1907     +int flush_aio_dio_completed_IO(struct inode *inode)
1908     +{
1909     + ext4_io_end_t *io;
1910     + int ret = 0;
1911     + int ret2 = 0;
1912     +
1913     + if (list_empty(&EXT4_I(inode)->i_aio_dio_complete_list))
1914     + return ret;
1915     +
1916     + dump_aio_dio_list(inode);
1917     + while (!list_empty(&EXT4_I(inode)->i_aio_dio_complete_list)){
1918     + io = list_entry(EXT4_I(inode)->i_aio_dio_complete_list.next,
1919     + ext4_io_end_t, list);
1920     + /*
1921     + * Calling ext4_end_aio_dio_nolock() to convert completed
1922     + * IO to written.
1923     + *
1924     + * When ext4_sync_file() is called, run_queue() may already be
1925     + * about to flush the work corresponding to this io structure.
1926     + * It will be upset if it finds that the io structure related
1927     + * to the work to be scheduled has been freed.
1928     + *
1929     + * Thus we need to keep the io structure valid here after the
1930     + * conversion has finished. The io structure has a flag to
1931     + * avoid double conversion by fsync and the background
1932     + * workqueue.
1933     + */
1934     + ret = ext4_end_aio_dio_nolock(io);
1935     + if (ret < 0)
1936     + ret2 = ret;
1937     + else
1938     + list_del_init(&io->list);
1939     + }
1940     + return (ret2 < 0) ? ret2 : 0;
1941     +}
1942     +
1943     +static ext4_io_end_t *ext4_init_io_end (struct inode *inode)
1944     +{
1945     + ext4_io_end_t *io = NULL;
1946     +
1947     + io = kmalloc(sizeof(*io), GFP_NOFS);
1948     +
1949     + if (io) {
1950     + igrab(inode);
1951     + io->inode = inode;
1952     + io->flag = 0;
1953     + io->offset = 0;
1954     + io->size = 0;
1955     + io->error = 0;
1956     + INIT_WORK(&io->work, ext4_end_aio_dio_work);
1957     + INIT_LIST_HEAD(&io->list);
1958     + }
1959     +
1960     + return io;
1961     +}
1962     +
1963     +static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
1964     + ssize_t size, void *private)
1965     +{
1966     + ext4_io_end_t *io_end = iocb->private;
1967     + struct workqueue_struct *wq;
1968     +
1969     + /* if not async direct IO or dio with 0 bytes write, just return */
1970     + if (!io_end || !size)
1971     + return;
1972     +
1973     + ext_debug("ext4_end_io_dio(): io_end 0x%p"
1974     + " for inode %lu, iocb 0x%p, offset %llu, size %llu\n",
1975     + iocb->private, io_end->inode->i_ino, iocb, offset,
1976     + size);
1977     +
1978     + /* if not aio dio with unwritten extents, just free io and return */
1979     + if (io_end->flag != DIO_AIO_UNWRITTEN){
1980     + ext4_free_io_end(io_end);
1981     + iocb->private = NULL;
1982     + return;
1983     + }
1984     +
1985     + io_end->offset = offset;
1986     + io_end->size = size;
1987     + wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq;
1988     +
1989     + /* queue the work to convert unwritten extents to written */
1990     + queue_work(wq, &io_end->work);
1991     +
1992     + /* Add the io_end to per-inode completed aio dio list*/
1993     + list_add_tail(&io_end->list,
1994     + &EXT4_I(io_end->inode)->i_aio_dio_complete_list);
1995     + iocb->private = NULL;
1996     +}
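
Taken together, ext4_init_io_end(), ext4_end_io_dio() and flush_aio_dio_completed_IO() implement a simple discipline: every unwritten-extent completion is parked on a per-inode list as well as queued to the workqueue, and fsync drains whatever the worker has not reached. A single-threaded model of that list handling (no real workqueue; the flag field plays the role the patch gives DIO_AIO_UNWRITTEN):

    #include <stdio.h>
    #include <stdlib.h>

    #define FLAG_UNWRITTEN 1

    struct io_end {
        int flag;                /* FLAG_UNWRITTEN until converted */
        long offset, size;
        struct io_end *next;
    };

    static struct io_end *pending;   /* per-inode completed-AIO list */

    /* end_io callback model: only unwritten-extent IO stays queued */
    static void dio_complete(long offset, long size, int unwritten)
    {
        struct io_end *io = malloc(sizeof(*io));

        if (!io)
            return;
        io->flag = unwritten ? FLAG_UNWRITTEN : 0;
        io->offset = offset;
        io->size = size;
        if (!unwritten) {        /* nothing to convert: free immediately */
            free(io);
            return;
        }
        io->next = pending;
        pending = io;
    }

    /* fsync-side drain: convert every queued range, then unlink it */
    static void flush_completed(void)
    {
        while (pending) {
            struct io_end *io = pending;

            printf("converting [%ld, %ld) to written\n",
                   io->offset, io->offset + io->size);
            io->flag = 0;        /* guards against double conversion */
            pending = io->next;
            free(io);
        }
    }

    int main(void)
    {
        dio_complete(0, 4096, 1);
        dio_complete(4096, 4096, 0);
        dio_complete(8192, 4096, 1);
        flush_completed();
        return 0;
    }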
1997     +/*
1998     + * For ext4 extent files, ext4 will do direct-io writes to holes,
1999     + * preallocated extents, and writes that extend the file, with no need
2000     + * to fall back to buffered IO.
2001     + *
2002     + * For holes, we fallocate those blocks and mark them as uninitialized.
2003     + * If those blocks were preallocated, we make sure they are split, but
2004     + * still keep the range to write as uninitialized.
2005     + *
2006     + * The unwritten extents will be converted to written when the DIO completes.
2007     + * For async direct IO, since the IO may still be pending on return, we
2008     + * set up an end_io callback function, which will do the conversion
2009     + * when the async direct IO has completed.
2010     + *
2011     + * If the O_DIRECT write will extend the file then add this inode to the
2012     + * orphan list. So recovery will truncate it back to the original size
2013     + * if the machine crashes during the write.
2014     + *
2015     + */
2016     +static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
2017     + const struct iovec *iov, loff_t offset,
2018     + unsigned long nr_segs)
2019     +{
2020     + struct file *file = iocb->ki_filp;
2021     + struct inode *inode = file->f_mapping->host;
2022     + ssize_t ret;
2023     + size_t count = iov_length(iov, nr_segs);
2024     +
2025     + loff_t final_size = offset + count;
2026     + if (rw == WRITE && final_size <= inode->i_size) {
2027     + /*
2028     + * We could direct write to holes and fallocate.
2029     + *
2030     + * Allocated blocks to fill the hole are marked as uninitialized
2031     + * to prevent a parallel buffered read from exposing stale data
2032     + * before the DIO completes the data IO.
2033     + *
2034     + * As to previously fallocated extents, ext4 get_block
2035     + * will simply mark the buffer mapped but still
2036     + * keep the extents uninitialized.
2037     + *
2038     + * For the non-AIO case, we convert those unwritten extents
2039     + * to written after returning from blockdev_direct_IO.
2040     + *
2041     + * For async DIO, the conversion needs to be deferred until
2042     + * the IO is completed. The ext4 end_io callback function
2043     + * will be called to take care of the conversion work.
2044     + * Here, for the async case, we allocate an io_end structure to
2045     + * hook to the iocb.
2046     + */
2047     + iocb->private = NULL;
2048     + EXT4_I(inode)->cur_aio_dio = NULL;
2049     + if (!is_sync_kiocb(iocb)) {
2050     + iocb->private = ext4_init_io_end(inode);
2051     + if (!iocb->private)
2052     + return -ENOMEM;
2053     + /*
2054     + * we save the io structure for the current async
2055     + * direct IO, so that ext4_get_blocks() can later
2056     + * flag the io structure if there are
2057     + * unwritten extents that need to be converted
2058     + * when the IO is completed.
2059     + */
2060     + EXT4_I(inode)->cur_aio_dio = iocb->private;
2061     + }
2062     +
2063     + ret = blockdev_direct_IO(rw, iocb, inode,
2064     + inode->i_sb->s_bdev, iov,
2065     + offset, nr_segs,
2066     + ext4_get_block_dio_write,
2067     + ext4_end_io_dio);
2068     + if (iocb->private)
2069     + EXT4_I(inode)->cur_aio_dio = NULL;
2070     + /*
2071     + * The io_end structure takes a reference to the inode;
2072     + * that structure needs to be destroyed and the
2073     + * reference to the inode dropped when the IO is
2074     + * complete, even for a 0-byte write or a failure.
2075     + *
2076     + * In the successful AIO DIO case, the io_end structure will be
2077     + * destroyed and the reference to the inode will be dropped
2078     + * after the end_io callback function is called.
2079     + *
2080     + * In the 0-byte write or error case, since
2081     + * VFS direct IO won't invoke the end_io callback function,
2082     + * we need to free the io_end structure here.
2083     + */
2084     + if (ret != -EIOCBQUEUED && ret <= 0 && iocb->private) {
2085     + ext4_free_io_end(iocb->private);
2086     + iocb->private = NULL;
2087     + } else if (ret > 0 && (EXT4_I(inode)->i_state &
2088     + EXT4_STATE_DIO_UNWRITTEN)) {
2089     + int err;
2090     + /*
2091     + * for the non-AIO case, since the IO has already
2092     + * completed, we can do the conversion right here
2093     + */
2094     + err = ext4_convert_unwritten_extents(inode,
2095     + offset, ret);
2096     + if (err < 0)
2097     + ret = err;
2098     + EXT4_I(inode)->i_state &= ~EXT4_STATE_DIO_UNWRITTEN;
2099     + }
2100     + return ret;
2101     + }
2102     +
2103     + /* for writes to the end of the file, we fall back to the old way */
2104     + return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs);
2105     +}
2106     +
2107     +static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
2108     + const struct iovec *iov, loff_t offset,
2109     + unsigned long nr_segs)
2110     +{
2111     + struct file *file = iocb->ki_filp;
2112     + struct inode *inode = file->f_mapping->host;
2113     +
2114     + if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)
2115     + return ext4_ext_direct_IO(rw, iocb, iov, offset, nr_segs);
2116     +
2117     + return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs);
2118     +}
2119     +
2120     /*
2121     * Pages can be marked dirty completely asynchronously from ext4's journalling
2122     * activity. By filemap_sync_pte(), try_to_unmap_one(), etc. We cannot do
2123     @@ -3653,13 +4122,16 @@ static void ext4_clear_blocks(handle_t *handle, struct inode *inode,
2124     __le32 *last)
2125     {
2126     __le32 *p;
2127     + int is_metadata = S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode);
2128     +
2129     if (try_to_extend_transaction(handle, inode)) {
2130     if (bh) {
2131     BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
2132     ext4_handle_dirty_metadata(handle, inode, bh);
2133     }
2134     ext4_mark_inode_dirty(handle, inode);
2135     - ext4_journal_test_restart(handle, inode);
2136     + ext4_truncate_restart_trans(handle, inode,
2137     + blocks_for_truncate(inode));
2138     if (bh) {
2139     BUFFER_TRACE(bh, "retaking write access");
2140     ext4_journal_get_write_access(handle, bh);
2141     @@ -3682,11 +4154,11 @@ static void ext4_clear_blocks(handle_t *handle, struct inode *inode,
2142    
2143     *p = 0;
2144     tbh = sb_find_get_block(inode->i_sb, nr);
2145     - ext4_forget(handle, 0, inode, tbh, nr);
2146     + ext4_forget(handle, is_metadata, inode, tbh, nr);
2147     }
2148     }
2149    
2150     - ext4_free_blocks(handle, inode, block_to_free, count, 0);
2151     + ext4_free_blocks(handle, inode, block_to_free, count, is_metadata);
2152     }
2153    
2154     /**
2155     @@ -3870,7 +4342,8 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
2156     return;
2157     if (try_to_extend_transaction(handle, inode)) {
2158     ext4_mark_inode_dirty(handle, inode);
2159     - ext4_journal_test_restart(handle, inode);
2160     + ext4_truncate_restart_trans(handle, inode,
2161     + blocks_for_truncate(inode));
2162     }
2163    
2164     ext4_free_blocks(handle, inode, nr, 1, 1);
2165     @@ -3958,8 +4431,7 @@ void ext4_truncate(struct inode *inode)
2166     if (!ext4_can_truncate(inode))
2167     return;
2168    
2169     - if (ei->i_disksize && inode->i_size == 0 &&
2170     - !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC))
2171     + if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC))
2172     ei->i_state |= EXT4_STATE_DA_ALLOC_CLOSE;
2173    
2174     if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) {
2175     @@ -4313,8 +4785,8 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
2176     struct ext4_iloc iloc;
2177     struct ext4_inode *raw_inode;
2178     struct ext4_inode_info *ei;
2179     - struct buffer_head *bh;
2180     struct inode *inode;
2181     + journal_t *journal = EXT4_SB(sb)->s_journal;
2182     long ret;
2183     int block;
2184    
2185     @@ -4325,11 +4797,11 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
2186     return inode;
2187    
2188     ei = EXT4_I(inode);
2189     + iloc.bh = 0;
2190    
2191     ret = __ext4_get_inode_loc(inode, &iloc, 0);
2192     if (ret < 0)
2193     goto bad_inode;
2194     - bh = iloc.bh;
2195     raw_inode = ext4_raw_inode(&iloc);
2196     inode->i_mode = le16_to_cpu(raw_inode->i_mode);
2197     inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low);
2198     @@ -4352,7 +4824,6 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
2199     if (inode->i_mode == 0 ||
2200     !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) {
2201     /* this inode is deleted */
2202     - brelse(bh);
2203     ret = -ESTALE;
2204     goto bad_inode;
2205     }
2206     @@ -4380,11 +4851,35 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
2207     ei->i_data[block] = raw_inode->i_block[block];
2208     INIT_LIST_HEAD(&ei->i_orphan);
2209    
2210     + /*
2211     + * Set transaction id's of transactions that have to be committed
2212     + * to finish f[data]sync. We set them to the currently running transaction
2213     + * as we cannot be sure that the inode or some of its metadata isn't
2214     + * part of the transaction - the inode could have been reclaimed and
2215     + * now it is reread from disk.
2216     + */
2217     + if (journal) {
2218     + transaction_t *transaction;
2219     + tid_t tid;
2220     +
2221     + spin_lock(&journal->j_state_lock);
2222     + if (journal->j_running_transaction)
2223     + transaction = journal->j_running_transaction;
2224     + else
2225     + transaction = journal->j_committing_transaction;
2226     + if (transaction)
2227     + tid = transaction->t_tid;
2228     + else
2229     + tid = journal->j_commit_sequence;
2230     + spin_unlock(&journal->j_state_lock);
2231     + ei->i_sync_tid = tid;
2232     + ei->i_datasync_tid = tid;
2233     + }
2234     +
2235     if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
2236     ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize);
2237     if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize >
2238     EXT4_INODE_SIZE(inode->i_sb)) {
2239     - brelse(bh);
2240     ret = -EIO;
2241     goto bad_inode;
2242     }
2243     @@ -4416,10 +4911,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
2244    
2245     ret = 0;
2246     if (ei->i_file_acl &&
2247     - ((ei->i_file_acl <
2248     - (le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block) +
2249     - EXT4_SB(sb)->s_gdb_count)) ||
2250     - (ei->i_file_acl >= ext4_blocks_count(EXT4_SB(sb)->s_es)))) {
2251     + !ext4_data_block_valid(EXT4_SB(sb), ei->i_file_acl, 1)) {
2252     ext4_error(sb, __func__,
2253     "bad extended attribute block %llu in inode #%lu",
2254     ei->i_file_acl, inode->i_ino);
2255     @@ -4437,10 +4929,8 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
2256     /* Validate block references which are part of inode */
2257     ret = ext4_check_inode_blockref(inode);
2258     }
2259     - if (ret) {
2260     - brelse(bh);
2261     + if (ret)
2262     goto bad_inode;
2263     - }
2264    
2265     if (S_ISREG(inode->i_mode)) {
2266     inode->i_op = &ext4_file_inode_operations;
2267     @@ -4468,7 +4958,6 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
2268     init_special_inode(inode, inode->i_mode,
2269     new_decode_dev(le32_to_cpu(raw_inode->i_block[1])));
2270     } else {
2271     - brelse(bh);
2272     ret = -EIO;
2273     ext4_error(inode->i_sb, __func__,
2274     "bogus i_mode (%o) for inode=%lu",
2275     @@ -4481,6 +4970,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
2276     return inode;
2277    
2278     bad_inode:
2279     + brelse(iloc.bh);
2280     iget_failed(inode);
2281     return ERR_PTR(ret);
2282     }
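
The ext4_iget() changes above trade four scattered brelse(bh) calls for one brelse(iloc.bh) at the bad_inode label, made safe from every goto by initializing iloc.bh first. The same single-exit cleanup shape in miniature (names here are illustrative):

    #include <stdio.h>
    #include <stdlib.h>

    static int load_inode(int fail_early, int fail_late)
    {
        char *buf = NULL;            /* like iloc.bh = 0: label-safe */
        int ret;

        if (fail_early) {
            ret = -1;
            goto bad_inode;          /* buf is NULL, free(NULL) is a no-op */
        }
        buf = malloc(64);
        if (fail_late) {
            ret = -2;
            goto bad_inode;          /* one release site covers this too */
        }
        free(buf);
        return 0;

    bad_inode:
        free(buf);                   /* the single cleanup point */
        return ret;
    }

    int main(void)
    {
        printf("%d\n", load_inode(1, 0));
        printf("%d\n", load_inode(0, 1));
        printf("%d\n", load_inode(0, 0));
        return 0;
    }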
2283     @@ -4581,8 +5071,7 @@ static int ext4_do_update_inode(handle_t *handle,
2284     if (ext4_inode_blocks_set(handle, raw_inode, ei))
2285     goto out_brelse;
2286     raw_inode->i_dtime = cpu_to_le32(ei->i_dtime);
2287     - /* clear the migrate flag in the raw_inode */
2288     - raw_inode->i_flags = cpu_to_le32(ei->i_flags & ~EXT4_EXT_MIGRATE);
2289     + raw_inode->i_flags = cpu_to_le32(ei->i_flags);
2290     if (EXT4_SB(inode->i_sb)->s_es->s_creator_os !=
2291     cpu_to_le32(EXT4_OS_HURD))
2292     raw_inode->i_file_acl_high =
2293     @@ -4641,6 +5130,7 @@ static int ext4_do_update_inode(handle_t *handle,
2294     err = rc;
2295     ei->i_state &= ~EXT4_STATE_NEW;
2296    
2297     + ext4_update_inode_fsync_trans(handle, inode, 0);
2298     out_brelse:
2299     brelse(bh);
2300     ext4_std_error(inode->i_sb, err);
2301     @@ -4684,19 +5174,40 @@ out_brelse:
2302     */
2303     int ext4_write_inode(struct inode *inode, int wait)
2304     {
2305     + int err;
2306     +
2307     if (current->flags & PF_MEMALLOC)
2308     return 0;
2309    
2310     - if (ext4_journal_current_handle()) {
2311     - jbd_debug(1, "called recursively, non-PF_MEMALLOC!\n");
2312     - dump_stack();
2313     - return -EIO;
2314     - }
2315     + if (EXT4_SB(inode->i_sb)->s_journal) {
2316     + if (ext4_journal_current_handle()) {
2317     + jbd_debug(1, "called recursively, non-PF_MEMALLOC!\n");
2318     + dump_stack();
2319     + return -EIO;
2320     + }
2321    
2322     - if (!wait)
2323     - return 0;
2324     + if (!wait)
2325     + return 0;
2326     +
2327     + err = ext4_force_commit(inode->i_sb);
2328     + } else {
2329     + struct ext4_iloc iloc;
2330    
2331     - return ext4_force_commit(inode->i_sb);
2332     + err = ext4_get_inode_loc(inode, &iloc);
2333     + if (err)
2334     + return err;
2335     + if (wait)
2336     + sync_dirty_buffer(iloc.bh);
2337     + if (buffer_req(iloc.bh) && !buffer_uptodate(iloc.bh)) {
2338     + ext4_error(inode->i_sb, __func__,
2339     + "IO error syncing inode, "
2340     + "inode=%lu, block=%llu",
2341     + inode->i_ino,
2342     + (unsigned long long)iloc.bh->b_blocknr);
2343     + err = -EIO;
2344     + }
2345     + }
2346     + return err;
2347     }
2348    
2349     /*
2350     @@ -4739,8 +5250,8 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
2351    
2352     /* (user+group)*(old+new) structure, inode write (sb,
2353     * inode block, ? - but truncate inode update has it) */
2354     - handle = ext4_journal_start(inode, 2*(EXT4_QUOTA_INIT_BLOCKS(inode->i_sb)+
2355     - EXT4_QUOTA_DEL_BLOCKS(inode->i_sb))+3);
2356     + handle = ext4_journal_start(inode, (EXT4_MAXQUOTAS_INIT_BLOCKS(inode->i_sb)+
2357     + EXT4_MAXQUOTAS_DEL_BLOCKS(inode->i_sb))+3);
2358     if (IS_ERR(handle)) {
2359     error = PTR_ERR(handle);
2360     goto err_out;
2361     @@ -5137,24 +5648,13 @@ void ext4_dirty_inode(struct inode *inode)
2362     handle_t *current_handle = ext4_journal_current_handle();
2363     handle_t *handle;
2364    
2365     - if (!ext4_handle_valid(current_handle)) {
2366     - ext4_mark_inode_dirty(current_handle, inode);
2367     - return;
2368     - }
2369     -
2370     handle = ext4_journal_start(inode, 2);
2371     if (IS_ERR(handle))
2372     goto out;
2373     - if (current_handle &&
2374     - current_handle->h_transaction != handle->h_transaction) {
2375     - /* This task has a transaction open against a different fs */
2376     - printk(KERN_EMERG "%s: transactions do not match!\n",
2377     - __func__);
2378     - } else {
2379     - jbd_debug(5, "marking dirty. outer handle=%p\n",
2380     - current_handle);
2381     - ext4_mark_inode_dirty(handle, inode);
2382     - }
2383     +
2384     + jbd_debug(5, "marking dirty. outer handle=%p\n", current_handle);
2385     + ext4_mark_inode_dirty(handle, inode);
2386     +
2387     ext4_journal_stop(handle);
2388     out:
2389     return;
2390     @@ -5281,12 +5781,21 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
2391     else
2392     len = PAGE_CACHE_SIZE;
2393    
2394     + lock_page(page);
2395     + /*
2396     + * return if we have all the buffers mapped. This avoids
2397     + * the need to call write_begin/write_end, which does a
2398     + * journal_start/journal_stop that can block and take a
2399     + * long time
2400     + */
2401     if (page_has_buffers(page)) {
2402     - /* return if we have all the buffers mapped */
2403     if (!walk_page_buffers(NULL, page_buffers(page), 0, len, NULL,
2404     - ext4_bh_unmapped))
2405     + ext4_bh_unmapped)) {
2406     + unlock_page(page);
2407     goto out_unlock;
2408     + }
2409     }
2410     + unlock_page(page);
2411     /*
2412     * OK, we need to fill the hole... Do write_begin write_end
2413     * to do block allocation/reservation.We are not holding
2414     diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
2415     index 7050a9c..b63d193 100644
2416     --- a/fs/ext4/ioctl.c
2417     +++ b/fs/ext4/ioctl.c
2418     @@ -221,32 +221,38 @@ setversion_out:
2419     struct file *donor_filp;
2420     int err;
2421    
2422     + if (!(filp->f_mode & FMODE_READ) ||
2423     + !(filp->f_mode & FMODE_WRITE))
2424     + return -EBADF;
2425     +
2426     if (copy_from_user(&me,
2427     (struct move_extent __user *)arg, sizeof(me)))
2428     return -EFAULT;
2429     + me.moved_len = 0;
2430    
2431     donor_filp = fget(me.donor_fd);
2432     if (!donor_filp)
2433     return -EBADF;
2434    
2435     - if (!capable(CAP_DAC_OVERRIDE)) {
2436     - if ((current->real_cred->fsuid != inode->i_uid) ||
2437     - !(inode->i_mode & S_IRUSR) ||
2438     - !(donor_filp->f_dentry->d_inode->i_mode &
2439     - S_IRUSR)) {
2440     - fput(donor_filp);
2441     - return -EACCES;
2442     - }
2443     + if (!(donor_filp->f_mode & FMODE_WRITE)) {
2444     + err = -EBADF;
2445     + goto mext_out;
2446     }
2447    
2448     + err = mnt_want_write(filp->f_path.mnt);
2449     + if (err)
2450     + goto mext_out;
2451     +
2452     err = ext4_move_extents(filp, donor_filp, me.orig_start,
2453     me.donor_start, me.len, &me.moved_len);
2454     - fput(donor_filp);
2455     + mnt_drop_write(filp->f_path.mnt);
2456     + if (me.moved_len > 0)
2457     + file_remove_suid(donor_filp);
2458    
2459     - if (!err)
2460     - if (copy_to_user((struct move_extent *)arg,
2461     - &me, sizeof(me)))
2462     - return -EFAULT;
2463     + if (copy_to_user((struct move_extent *)arg, &me, sizeof(me)))
2464     + err = -EFAULT;
2465     +mext_out:
2466     + fput(donor_filp);
2467     return err;
2468     }
2469    
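
The reworked MOVE_EXT ioctl now checks both file modes before doing anything, clears the user-supplied moved_len, brackets the move with mnt_want_write()/mnt_drop_write(), and copies the result back even on partial success. A stubbed sketch of that ordering (none of the helpers below are the kernel functions):

    #include <stdio.h>

    static int want_write(void)  { return 0; }   /* stub mnt_want_write() */
    static void drop_write(void) { }
    static int move_extents(long *moved_len) { *moved_len = 7; return 0; }

    static int ioctl_move_ext(int orig_rw_ok, int donor_w_ok)
    {
        long moved_len = 0;    /* never trust the user-supplied value */
        int err;

        if (!orig_rw_ok || !donor_w_ok)
            return -9;         /* -EBADF: both fds must be usable */

        err = want_write();
        if (err)
            return err;
        err = move_extents(&moved_len);
        drop_write();

        /* the result is copied back even on partial moves */
        printf("moved_len=%ld err=%d\n", moved_len, err);
        return err;
    }

    int main(void)
    {
        ioctl_move_ext(1, 1);
        ioctl_move_ext(1, 0);
        return 0;
    }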
2470     diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
2471     index cd25846..099fd47 100644
2472     --- a/fs/ext4/mballoc.c
2473     +++ b/fs/ext4/mballoc.c
2474     @@ -908,6 +908,97 @@ out:
2475     return err;
2476     }
2477    
2478     +static noinline_for_stack
2479     +int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
2480     +{
2481     +
2482     + int ret = 0;
2483     + void *bitmap;
2484     + int blocks_per_page;
2485     + int block, pnum, poff;
2486     + int num_grp_locked = 0;
2487     + struct ext4_group_info *this_grp;
2488     + struct ext4_sb_info *sbi = EXT4_SB(sb);
2489     + struct inode *inode = sbi->s_buddy_cache;
2490     + struct page *page = NULL, *bitmap_page = NULL;
2491     +
2492     + mb_debug("init group %lu\n", group);
2493     + blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
2494     + this_grp = ext4_get_group_info(sb, group);
2495     + /*
2496     + * This ensures we don't add group
2497     + * to this buddy cache via resize
2498     + */
2499     + num_grp_locked = ext4_mb_get_buddy_cache_lock(sb, group);
2500     + if (!EXT4_MB_GRP_NEED_INIT(this_grp)) {
2501     + /*
2502     + * somebody initialized the group
2503     + * return without doing anything
2504     + */
2505     + ret = 0;
2506     + goto err;
2507     + }
2508     + /*
2509     + * the buddy cache inode stores the block bitmap
2510     + * and buddy information in consecutive blocks.
2511     + * So for each group we need two blocks.
2512     + */
2513     + block = group * 2;
2514     + pnum = block / blocks_per_page;
2515     + poff = block % blocks_per_page;
2516     + page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
2517     + if (page) {
2518     + BUG_ON(page->mapping != inode->i_mapping);
2519     + ret = ext4_mb_init_cache(page, NULL);
2520     + if (ret) {
2521     + unlock_page(page);
2522     + goto err;
2523     + }
2524     + unlock_page(page);
2525     + }
2526     + if (page == NULL || !PageUptodate(page)) {
2527     + ret = -EIO;
2528     + goto err;
2529     + }
2530     + mark_page_accessed(page);
2531     + bitmap_page = page;
2532     + bitmap = page_address(page) + (poff * sb->s_blocksize);
2533     +
2534     + /* init buddy cache */
2535     + block++;
2536     + pnum = block / blocks_per_page;
2537     + poff = block % blocks_per_page;
2538     + page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
2539     + if (page == bitmap_page) {
2540     + /*
2541     + * If both the bitmap and buddy are in
2542     + * the same page we don't need to force
2543     + * init the buddy
2544     + */
2545     + unlock_page(page);
2546     + } else if (page) {
2547     + BUG_ON(page->mapping != inode->i_mapping);
2548     + ret = ext4_mb_init_cache(page, bitmap);
2549     + if (ret) {
2550     + unlock_page(page);
2551     + goto err;
2552     + }
2553     + unlock_page(page);
2554     + }
2555     + if (page == NULL || !PageUptodate(page)) {
2556     + ret = -EIO;
2557     + goto err;
2558     + }
2559     + mark_page_accessed(page);
2560     +err:
2561     + ext4_mb_put_buddy_cache_lock(sb, group, num_grp_locked);
2562     + if (bitmap_page)
2563     + page_cache_release(bitmap_page);
2564     + if (page)
2565     + page_cache_release(page);
2566     + return ret;
2567     +}
2568     +
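
The buddy cache keeps two filesystem blocks per group, bitmap then buddy, packed into one inode's page cache, so finding either is just division and remainder on 2 * group. The mapping with illustrative geometry (4 KiB pages and 1 KiB blocks, so four blocks per page):

    #include <stdio.h>

    int main(void)
    {
        int blocks_per_page = 4096 / 1024;   /* 4 KiB page, 1 KiB block */
        int group = 5;

        int block = group * 2;               /* bitmap block for the group */
        printf("bitmap: page %d offset %d\n",
               block / blocks_per_page, block % blocks_per_page);

        block++;                             /* buddy block follows it */
        printf("buddy:  page %d offset %d\n",
               block / blocks_per_page, block % blocks_per_page);
        return 0;
    }

For group 5 both blocks land in the same page, which is exactly the page == bitmap_page case the function short-circuits.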
2569     static noinline_for_stack int
2570     ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
2571     struct ext4_buddy *e4b)
2572     @@ -941,8 +1032,26 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
2573     * groups mapped by the page is blocked
2574     * till we are done with allocation
2575     */
2576     +repeat_load_buddy:
2577     down_read(e4b->alloc_semp);
2578    
2579     + if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
2580     + /* we need to check for group need init flag
2581     + * with alloc_semp held so that we can be sure
2582     + * that new blocks didn't get added to the group
2583     + * when we are loading the buddy cache
2584     + */
2585     + up_read(e4b->alloc_semp);
2586     + /*
2587     + * we need full data about the group
2588     + * to make a good selection
2589     + */
2590     + ret = ext4_mb_init_group(sb, group);
2591     + if (ret)
2592     + return ret;
2593     + goto repeat_load_buddy;
2594     + }
2595     +
2596     /*
2597     * the buddy cache inode stores the block bitmap
2598     * and buddy information in consecutive blocks.
2599     @@ -1360,7 +1469,7 @@ static void ext4_mb_use_best_found(struct ext4_allocation_context *ac,
2600     ac->alloc_semp = e4b->alloc_semp;
2601     e4b->alloc_semp = NULL;
2602     /* store last allocated for subsequent stream allocation */
2603     - if ((ac->ac_flags & EXT4_MB_HINT_DATA)) {
2604     + if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) {
2605     spin_lock(&sbi->s_md_lock);
2606     sbi->s_mb_last_group = ac->ac_f_ex.fe_group;
2607     sbi->s_mb_last_start = ac->ac_f_ex.fe_start;
2608     @@ -1837,97 +1946,6 @@ void ext4_mb_put_buddy_cache_lock(struct super_block *sb,
2609    
2610     }
2611    
2612     -static noinline_for_stack
2613     -int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
2614     -{
2615     -
2616     - int ret;
2617     - void *bitmap;
2618     - int blocks_per_page;
2619     - int block, pnum, poff;
2620     - int num_grp_locked = 0;
2621     - struct ext4_group_info *this_grp;
2622     - struct ext4_sb_info *sbi = EXT4_SB(sb);
2623     - struct inode *inode = sbi->s_buddy_cache;
2624     - struct page *page = NULL, *bitmap_page = NULL;
2625     -
2626     - mb_debug("init group %lu\n", group);
2627     - blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
2628     - this_grp = ext4_get_group_info(sb, group);
2629     - /*
2630     - * This ensures we don't add group
2631     - * to this buddy cache via resize
2632     - */
2633     - num_grp_locked = ext4_mb_get_buddy_cache_lock(sb, group);
2634     - if (!EXT4_MB_GRP_NEED_INIT(this_grp)) {
2635     - /*
2636     - * somebody initialized the group
2637     - * return without doing anything
2638     - */
2639     - ret = 0;
2640     - goto err;
2641     - }
2642     - /*
2643     - * the buddy cache inode stores the block bitmap
2644     - * and buddy information in consecutive blocks.
2645     - * So for each group we need two blocks.
2646     - */
2647     - block = group * 2;
2648     - pnum = block / blocks_per_page;
2649     - poff = block % blocks_per_page;
2650     - page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
2651     - if (page) {
2652     - BUG_ON(page->mapping != inode->i_mapping);
2653     - ret = ext4_mb_init_cache(page, NULL);
2654     - if (ret) {
2655     - unlock_page(page);
2656     - goto err;
2657     - }
2658     - unlock_page(page);
2659     - }
2660     - if (page == NULL || !PageUptodate(page)) {
2661     - ret = -EIO;
2662     - goto err;
2663     - }
2664     - mark_page_accessed(page);
2665     - bitmap_page = page;
2666     - bitmap = page_address(page) + (poff * sb->s_blocksize);
2667     -
2668     - /* init buddy cache */
2669     - block++;
2670     - pnum = block / blocks_per_page;
2671     - poff = block % blocks_per_page;
2672     - page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
2673     - if (page == bitmap_page) {
2674     - /*
2675     - * If both the bitmap and buddy are in
2676     - * the same page we don't need to force
2677     - * init the buddy
2678     - */
2679     - unlock_page(page);
2680     - } else if (page) {
2681     - BUG_ON(page->mapping != inode->i_mapping);
2682     - ret = ext4_mb_init_cache(page, bitmap);
2683     - if (ret) {
2684     - unlock_page(page);
2685     - goto err;
2686     - }
2687     - unlock_page(page);
2688     - }
2689     - if (page == NULL || !PageUptodate(page)) {
2690     - ret = -EIO;
2691     - goto err;
2692     - }
2693     - mark_page_accessed(page);
2694     -err:
2695     - ext4_mb_put_buddy_cache_lock(sb, group, num_grp_locked);
2696     - if (bitmap_page)
2697     - page_cache_release(bitmap_page);
2698     - if (page)
2699     - page_cache_release(page);
2700     - return ret;
2701     -}
2702     -
2703     static noinline_for_stack int
2704     ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
2705     {
2706     @@ -1938,11 +1956,14 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
2707     struct ext4_sb_info *sbi;
2708     struct super_block *sb;
2709     struct ext4_buddy e4b;
2710     - loff_t size, isize;
2711    
2712     sb = ac->ac_sb;
2713     sbi = EXT4_SB(sb);
2714     ngroups = ext4_get_groups_count(sb);
2715     + /* non-extent files are limited to low blocks/groups */
2716     + if (!(EXT4_I(ac->ac_inode)->i_flags & EXT4_EXTENTS_FL))
2717     + ngroups = sbi->s_blockfile_groups;
2718     +
2719     BUG_ON(ac->ac_status == AC_STATUS_FOUND);
2720    
2721     /* first, try the goal */
2722     @@ -1974,20 +1995,16 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
2723     }
2724    
2725     bsbits = ac->ac_sb->s_blocksize_bits;
2726     - /* if stream allocation is enabled, use global goal */
2727     - size = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len;
2728     - isize = i_size_read(ac->ac_inode) >> bsbits;
2729     - if (size < isize)
2730     - size = isize;
2731    
2732     - if (size < sbi->s_mb_stream_request &&
2733     - (ac->ac_flags & EXT4_MB_HINT_DATA)) {
2734     + /* if stream allocation is enabled, use global goal */
2735     + if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) {
2736     /* TBD: may be hot point */
2737     spin_lock(&sbi->s_md_lock);
2738     ac->ac_g_ex.fe_group = sbi->s_mb_last_group;
2739     ac->ac_g_ex.fe_start = sbi->s_mb_last_start;
2740     spin_unlock(&sbi->s_md_lock);
2741     }
2742     +
2743     /* Let's just scan groups to find more-less suitable blocks */
2744     cr = ac->ac_2order ? 0 : 1;
2745     /*
2746     @@ -2015,27 +2032,6 @@ repeat:
2747     if (grp->bb_free == 0)
2748     continue;
2749    
2750     - /*
2751     - * if the group is already init we check whether it is
2752     - * a good group and if not we don't load the buddy
2753     - */
2754     - if (EXT4_MB_GRP_NEED_INIT(grp)) {
2755     - /*
2756     - * we need full data about the group
2757     - * to make a good selection
2758     - */
2759     - err = ext4_mb_init_group(sb, group);
2760     - if (err)
2761     - goto out;
2762     - }
2763     -
2764     - /*
2765     - * If the particular group doesn't satisfy our
2766     - * criteria we continue with the next group
2767     - */
2768     - if (!ext4_mb_good_group(ac, group, cr))
2769     - continue;
2770     -
2771     err = ext4_mb_load_buddy(sb, group, &e4b);
2772     if (err)
2773     goto out;
2774     @@ -2571,13 +2567,11 @@ static int ext4_mb_init_backend(struct super_block *sb)
2775     {
2776     ext4_group_t ngroups = ext4_get_groups_count(sb);
2777     ext4_group_t i;
2778     - int metalen;
2779     struct ext4_sb_info *sbi = EXT4_SB(sb);
2780     struct ext4_super_block *es = sbi->s_es;
2781     int num_meta_group_infos;
2782     int num_meta_group_infos_max;
2783     int array_size;
2784     - struct ext4_group_info **meta_group_info;
2785     struct ext4_group_desc *desc;
2786    
2787     /* This is the number of blocks used by GDT */
2788     @@ -2622,22 +2616,6 @@ static int ext4_mb_init_backend(struct super_block *sb)
2789     goto err_freesgi;
2790     }
2791     EXT4_I(sbi->s_buddy_cache)->i_disksize = 0;
2792     -
2793     - metalen = sizeof(*meta_group_info) << EXT4_DESC_PER_BLOCK_BITS(sb);
2794     - for (i = 0; i < num_meta_group_infos; i++) {
2795     - if ((i + 1) == num_meta_group_infos)
2796     - metalen = sizeof(*meta_group_info) *
2797     - (ngroups -
2798     - (i << EXT4_DESC_PER_BLOCK_BITS(sb)));
2799     - meta_group_info = kmalloc(metalen, GFP_KERNEL);
2800     - if (meta_group_info == NULL) {
2801     - printk(KERN_ERR "EXT4-fs: can't allocate mem for a "
2802     - "buddy group\n");
2803     - goto err_freemeta;
2804     - }
2805     - sbi->s_group_info[i] = meta_group_info;
2806     - }
2807     -
2808     for (i = 0; i < ngroups; i++) {
2809     desc = ext4_get_group_desc(sb, i, NULL);
2810     if (desc == NULL) {
2811     @@ -2655,7 +2633,6 @@ err_freebuddy:
2812     while (i-- > 0)
2813     kfree(ext4_get_group_info(sb, i));
2814     i = num_meta_group_infos;
2815     -err_freemeta:
2816     while (i-- > 0)
2817     kfree(sbi->s_group_info[i]);
2818     iput(sbi->s_buddy_cache);
2819     @@ -2833,7 +2810,6 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
2820     struct ext4_group_info *db;
2821     int err, count = 0, count2 = 0;
2822     struct ext4_free_data *entry;
2823     - ext4_fsblk_t discard_block;
2824     struct list_head *l, *ltmp;
2825    
2826     list_for_each_safe(l, ltmp, &txn->t_private_list) {
2827     @@ -2863,13 +2839,19 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
2828     page_cache_release(e4b.bd_bitmap_page);
2829     }
2830     ext4_unlock_group(sb, entry->group);
2831     - discard_block = (ext4_fsblk_t) entry->group * EXT4_BLOCKS_PER_GROUP(sb)
2832     - + entry->start_blk
2833     - + le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
2834     - trace_ext4_discard_blocks(sb, (unsigned long long)discard_block,
2835     - entry->count);
2836     - sb_issue_discard(sb, discard_block, entry->count);
2837     -
2838     + if (test_opt(sb, DISCARD)) {
2839     + ext4_fsblk_t discard_block;
2840     + struct ext4_super_block *es = EXT4_SB(sb)->s_es;
2841     +
2842     + discard_block = (ext4_fsblk_t)entry->group *
2843     + EXT4_BLOCKS_PER_GROUP(sb)
2844     + + entry->start_blk
2845     + + le32_to_cpu(es->s_first_data_block);
2846     + trace_ext4_discard_blocks(sb,
2847     + (unsigned long long)discard_block,
2848     + entry->count);
2849     + sb_issue_discard(sb, discard_block, entry->count);
2850     + }
2851     kmem_cache_free(ext4_free_ext_cachep, entry);
2852     ext4_mb_release_desc(&e4b);
2853     }
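
With discard now gated on the mount option, the physical start of each freed extent is rebuilt from the group geometry before sb_issue_discard(). The address arithmetic with illustrative parameters (32768 blocks per group and first data block 1, as on a 1 KiB-block filesystem):

    #include <stdio.h>

    int main(void)
    {
        unsigned long long blocks_per_group = 32768;
        unsigned long long first_data_block = 1;   /* 1 KiB-block fs */

        unsigned long long group = 3, start_blk = 100, count = 50;
        unsigned long long discard_block =
            group * blocks_per_group + start_blk + first_data_block;

        printf("discard %llu blocks starting at %llu\n",
               count, discard_block);
        return 0;
    }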
2854     @@ -3276,6 +3258,24 @@ static void ext4_mb_collect_stats(struct ext4_allocation_context *ac)
2855     }
2856    
2857     /*
2858     + * Called on failure; free up any blocks from the inode PA for this
2859     + * context. We don't need this for MB_GROUP_PA because we only change
2860     + * pa_free in ext4_mb_release_context(), but on failure, we've already
2861     + * zeroed out ac->ac_b_ex.fe_len, so group_pa->pa_free is not changed.
2862     + */
2863     +static void ext4_discard_allocated_blocks(struct ext4_allocation_context *ac)
2864     +{
2865     + struct ext4_prealloc_space *pa = ac->ac_pa;
2866     + int len;
2867     +
2868     + if (pa && pa->pa_type == MB_INODE_PA) {
2869     + len = ac->ac_b_ex.fe_len;
2870     + pa->pa_free += len;
2871     + }
2872     +
2873     +}
2874     +
2875     +/*
2876     * use blocks preallocated to inode
2877     */
2878     static void ext4_mb_use_inode_pa(struct ext4_allocation_context *ac,
2879     @@ -3382,6 +3382,11 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
2880     ac->ac_o_ex.fe_logical >= pa->pa_lstart + pa->pa_len)
2881     continue;
2882    
2883     + /* non-extent files can't have physical blocks past 2^32 */
2884     + if (!(EXT4_I(ac->ac_inode)->i_flags & EXT4_EXTENTS_FL) &&
2885     + pa->pa_pstart + pa->pa_len > EXT4_MAX_BLOCK_FILE_PHYS)
2886     + continue;
2887     +
2888     /* found preallocated blocks, use them */
2889     spin_lock(&pa->pa_lock);
2890     if (pa->pa_deleted == 0 && pa->pa_free) {
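
The check added above exists because indirect-mapped (non-extent) inodes store physical block numbers in 32 bits, so a preallocation whose end crosses 2^32 - 1 cannot be used for them. A small sketch of the predicate, assuming EXT4_MAX_BLOCK_FILE_PHYS is the 32-bit maximum:

    #include <stdbool.h>
    #include <stdint.h>

    /* Usable iff the range ends at or below the 32-bit physical limit;
     * the hunk above skips the preallocation otherwise. */
    static bool pa_usable_for_non_extent(uint64_t pa_pstart, uint64_t pa_len)
    {
        const uint64_t max_phys = 0xFFFFFFFFULL; /* assumed limit */

        return pa_pstart + pa_len <= max_phys;
    }

    int main(void)
    {
        /* ends at 0xFFFFEFFF: still addressable by a block-mapped inode */
        return pa_usable_for_non_extent(0xFFFFE000ULL, 0x1000) ? 0 : 1;
    }
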
2891     @@ -4174,16 +4179,26 @@ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac)
2892     if (!(ac->ac_flags & EXT4_MB_HINT_DATA))
2893     return;
2894    
2895     + if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY))
2896     + return;
2897     +
2898     size = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len;
2899     - isize = i_size_read(ac->ac_inode) >> bsbits;
2900     - size = max(size, isize);
2901     + isize = (i_size_read(ac->ac_inode) + ac->ac_sb->s_blocksize - 1)
2902     + >> bsbits;
2903    
2904     - /* don't use group allocation for large files */
2905     - if (size >= sbi->s_mb_stream_request)
2906     + if ((size == isize) &&
2907     + !ext4_fs_is_busy(sbi) &&
2908     + (atomic_read(&ac->ac_inode->i_writecount) == 0)) {
2909     + ac->ac_flags |= EXT4_MB_HINT_NOPREALLOC;
2910     return;
2911     + }
2912    
2913     - if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY))
2914     + /* don't use group allocation for large files */
2915     + size = max(size, isize);
2916     + if (size >= sbi->s_mb_stream_request) {
2917     + ac->ac_flags |= EXT4_MB_STREAM_ALLOC;
2918     return;
2919     + }
2920    
2921     BUG_ON(ac->ac_lg != NULL);
2922     /*
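
The reworked heuristic above rounds i_size up to whole blocks before comparing, turns off preallocation for a write that ends exactly at EOF of an otherwise idle file, and only then applies the large-file stream test. A userspace sketch of the decision; fs_busy and writers are simplified stand-ins for ext4_fs_is_busy() and i_writecount:

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    enum alloc_hint { HINT_NOPREALLOC, HINT_STREAM, HINT_GROUP };

    /* request_end_blk is fe_logical + fe_len, isize_blk is i_size rounded
     * up to blocks, stream_request mirrors sbi->s_mb_stream_request. */
    static enum alloc_hint pick_alloc_hint(uint64_t request_end_blk,
                                           uint64_t isize_blk,
                                           bool fs_busy, int writers,
                                           uint64_t stream_request)
    {
        uint64_t size = request_end_blk > isize_blk
                        ? request_end_blk : isize_blk;

        /* closing write to the tail of a quiet file: no preallocation */
        if (request_end_blk == isize_blk && !fs_busy && writers == 0)
            return HINT_NOPREALLOC;

        /* large files bypass the per-CPU locality group */
        if (size >= stream_request)
            return HINT_STREAM;

        return HINT_GROUP;
    }

    int main(void)
    {
        /* appending exactly to EOF of an idle file -> 0 (NOPREALLOC) */
        printf("%d\n", pick_alloc_hint(16, 16, false, 0, 4096));
        return 0;
    }
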
2923     @@ -4549,6 +4564,7 @@ repeat:
2924     ac->ac_status = AC_STATUS_CONTINUE;
2925     goto repeat;
2926     } else if (*errp) {
2927     + ext4_discard_allocated_blocks(ac);
2928     ac->ac_b_ex.fe_len = 0;
2929     ar->len = 0;
2930     ext4_mb_show_ac(ac);
2931     diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
2932     index 313a50b..8646149 100644
2933     --- a/fs/ext4/migrate.c
2934     +++ b/fs/ext4/migrate.c
2935     @@ -75,7 +75,7 @@ static int finish_range(handle_t *handle, struct inode *inode,
2936     goto err_out;
2937     }
2938     }
2939     - retval = ext4_ext_insert_extent(handle, inode, path, &newext);
2940     + retval = ext4_ext_insert_extent(handle, inode, path, &newext, 0);
2941     err_out:
2942     if (path) {
2943     ext4_ext_drop_refs(path);
2944     @@ -238,7 +238,7 @@ static int extend_credit_for_blkdel(handle_t *handle, struct inode *inode)
2945     * So allocate a credit of 3. We may update
2946     * quota (user and group).
2947     */
2948     - needed = 3 + 2*EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb);
2949     + needed = 3 + EXT4_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb);
2950    
2951     if (ext4_journal_extend(handle, needed) != 0)
2952     retval = ext4_journal_restart(handle, needed);
2953     @@ -353,17 +353,16 @@ static int ext4_ext_swap_inode_data(handle_t *handle, struct inode *inode,
2954    
2955     down_write(&EXT4_I(inode)->i_data_sem);
2956     /*
2957     - * if EXT4_EXT_MIGRATE is cleared a block allocation
2958     + * if EXT4_STATE_EXT_MIGRATE is cleared a block allocation
2959     * happened after we started the migrate. We need to
2960     * fail the migrate
2961     */
2962     - if (!(EXT4_I(inode)->i_flags & EXT4_EXT_MIGRATE)) {
2963     + if (!(EXT4_I(inode)->i_state & EXT4_STATE_EXT_MIGRATE)) {
2964     retval = -EAGAIN;
2965     up_write(&EXT4_I(inode)->i_data_sem);
2966     goto err_out;
2967     } else
2968     - EXT4_I(inode)->i_flags = EXT4_I(inode)->i_flags &
2969     - ~EXT4_EXT_MIGRATE;
2970     + EXT4_I(inode)->i_state &= ~EXT4_STATE_EXT_MIGRATE;
2971     /*
2972     * We have the extent map build with the tmp inode.
2973     * Now copy the i_data across
2974     @@ -478,7 +477,7 @@ int ext4_ext_migrate(struct inode *inode)
2975     handle = ext4_journal_start(inode,
2976     EXT4_DATA_TRANS_BLOCKS(inode->i_sb) +
2977     EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
2978     - 2 * EXT4_QUOTA_INIT_BLOCKS(inode->i_sb)
2979     + EXT4_MAXQUOTAS_INIT_BLOCKS(inode->i_sb)
2980     + 1);
2981     if (IS_ERR(handle)) {
2982     retval = PTR_ERR(handle);
2983     @@ -517,14 +516,15 @@ int ext4_ext_migrate(struct inode *inode)
2984     * when we add extents we extent the journal
2985     */
2986     /*
2987     - * Even though we take i_mutex we can still cause block allocation
2988     - * via mmap write to holes. If we have allocated new blocks we fail
2989     - * migrate. New block allocation will clear EXT4_EXT_MIGRATE flag.
2990     - * The flag is updated with i_data_sem held to prevent racing with
2991     - * block allocation.
2992     + * Even though we take i_mutex we can still cause block
2993     + * allocation via mmap write to holes. If we have allocated
2994     + * new blocks we fail migrate. New block allocation will
2995     + * clear EXT4_STATE_EXT_MIGRATE flag. The flag is updated
2996     + * with i_data_sem held to prevent racing with block
2997     + * allocation.
2998     */
2999     down_read((&EXT4_I(inode)->i_data_sem));
3000     - EXT4_I(inode)->i_flags = EXT4_I(inode)->i_flags | EXT4_EXT_MIGRATE;
3001     + EXT4_I(inode)->i_state |= EXT4_STATE_EXT_MIGRATE;
3002     up_read((&EXT4_I(inode)->i_data_sem));
3003    
3004     handle = ext4_journal_start(inode, 1);
3005     @@ -618,7 +618,7 @@ err_out:
3006     tmp_inode->i_nlink = 0;
3007    
3008     ext4_journal_stop(handle);
3009     -
3010     + unlock_new_inode(tmp_inode);
3011     iput(tmp_inode);
3012    
3013     return retval;
3014     diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
3015     index bbf2dd9..9a573a6 100644
3016     --- a/fs/ext4/move_extent.c
3017     +++ b/fs/ext4/move_extent.c
3018     @@ -19,14 +19,31 @@
3019     #include "ext4_extents.h"
3020     #include "ext4.h"
3021    
3022     -#define get_ext_path(path, inode, block, ret) \
3023     - do { \
3024     - path = ext4_ext_find_extent(inode, block, path); \
3025     - if (IS_ERR(path)) { \
3026     - ret = PTR_ERR(path); \
3027     - path = NULL; \
3028     - } \
3029     - } while (0)
3030     +/**
3031     + * get_ext_path - Find an extent path for the designated logical block number.
3032     + *
3033     + * @inode: an inode which is searched
3034     + * @lblock: logical block number to find an extent path
3035     + * @path: pointer to an extent path pointer (for output)
3036     + *
3037     + * ext4_ext_find_extent wrapper. Return 0 on success, or a negative error value
3038     + * on failure.
3039     + */
3040     +static inline int
3041     +get_ext_path(struct inode *inode, ext4_lblk_t lblock,
3042     + struct ext4_ext_path **path)
3043     +{
3044     + int ret = 0;
3045     +
3046     + *path = ext4_ext_find_extent(inode, lblock, *path);
3047     + if (IS_ERR(*path)) {
3048     + ret = PTR_ERR(*path);
3049     + *path = NULL;
3050     + } else if ((*path)[ext_depth(inode)].p_ext == NULL)
3051     + ret = -ENODATA;
3052     +
3053     + return ret;
3054     +}
3055    
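The conversion above replaces a macro that assigned into a caller variable by side effect with an inline function that returns the error explicitly and reports an empty leaf as -ENODATA. A self-contained sketch of the same refactor; struct thing and thing_lookup() are hypothetical stand-ins for the extent path and ext4_ext_find_extent():

    #include <errno.h>
    #include <stddef.h>

    struct thing { int key; };

    /* Hypothetical lookup standing in for ext4_ext_find_extent();
     * returns NULL when nothing is found. */
    static struct thing *thing_lookup(int key)
    {
        static struct thing t = { 42 };

        return key == 42 ? &t : NULL;
    }

    /* The refactored shape: explicit out parameter, explicit error
     * return, no assignment to a caller variable hidden in a macro. */
    static int get_thing(int key, struct thing **out)
    {
        *out = thing_lookup(key);
        if (*out == NULL)
            return -ENOENT;
        return 0;
    }

    int main(void)
    {
        struct thing *t = NULL;

        return get_thing(42, &t); /* 0 on success */
    }
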
3056     /**
3057     * copy_extent_status - Copy the extent's initialization status
3058     @@ -60,12 +77,14 @@ static int
3059     mext_next_extent(struct inode *inode, struct ext4_ext_path *path,
3060     struct ext4_extent **extent)
3061     {
3062     + struct ext4_extent_header *eh;
3063     int ppos, leaf_ppos = path->p_depth;
3064    
3065     ppos = leaf_ppos;
3066     if (EXT_LAST_EXTENT(path[ppos].p_hdr) > path[ppos].p_ext) {
3067     /* leaf block */
3068     *extent = ++path[ppos].p_ext;
3069     + path[ppos].p_block = ext_pblock(path[ppos].p_ext);
3070     return 0;
3071     }
3072    
3073     @@ -102,9 +121,18 @@ mext_next_extent(struct inode *inode, struct ext4_ext_path *path,
3074     ext_block_hdr(path[cur_ppos+1].p_bh);
3075     }
3076    
3077     + path[leaf_ppos].p_ext = *extent = NULL;
3078     +
3079     + eh = path[leaf_ppos].p_hdr;
3080     + if (le16_to_cpu(eh->eh_entries) == 0)
3081     + /* empty leaf is found */
3082     + return -ENODATA;
3083     +
3084     /* leaf block */
3085     path[leaf_ppos].p_ext = *extent =
3086     EXT_FIRST_EXTENT(path[leaf_ppos].p_hdr);
3087     + path[leaf_ppos].p_block =
3088     + ext_pblock(path[leaf_ppos].p_ext);
3089     return 0;
3090     }
3091     }
3092     @@ -113,47 +141,43 @@ mext_next_extent(struct inode *inode, struct ext4_ext_path *path,
3093     }
3094    
3095     /**
3096     - * mext_double_down_read - Acquire two inodes' read semaphore
3097     + * mext_check_null_inode - NULL check for two inodes
3098     *
3099     - * @orig_inode: original inode structure
3100     - * @donor_inode: donor inode structure
3101     - * Acquire read semaphore of the two inodes (orig and donor) by i_ino order.
3102     + * If inode1 or inode2 is NULL, return -EIO. Otherwise, return 0.
3103     */
3104     -static void
3105     -mext_double_down_read(struct inode *orig_inode, struct inode *donor_inode)
3106     +static int
3107     +mext_check_null_inode(struct inode *inode1, struct inode *inode2,
3108     + const char *function)
3109     {
3110     - struct inode *first = orig_inode, *second = donor_inode;
3111     -
3112     - BUG_ON(orig_inode == NULL || donor_inode == NULL);
3113     -
3114     - /*
3115     - * Use the inode number to provide the stable locking order instead
3116     - * of its address, because the C language doesn't guarantee you can
3117     - * compare pointers that don't come from the same array.
3118     - */
3119     - if (donor_inode->i_ino < orig_inode->i_ino) {
3120     - first = donor_inode;
3121     - second = orig_inode;
3122     + int ret = 0;
3123     +
3124     + if (inode1 == NULL) {
3125     + ext4_error(inode2->i_sb, function,
3126     + "Both inodes should not be NULL: "
3127     + "inode1 NULL inode2 %lu", inode2->i_ino);
3128     + ret = -EIO;
3129     + } else if (inode2 == NULL) {
3130     + ext4_error(inode1->i_sb, function,
3131     + "Both inodes should not be NULL: "
3132     + "inode1 %lu inode2 NULL", inode1->i_ino);
3133     + ret = -EIO;
3134     }
3135     -
3136     - down_read(&EXT4_I(first)->i_data_sem);
3137     - down_read(&EXT4_I(second)->i_data_sem);
3138     + return ret;
3139     }
3140    
3141     /**
3142     - * mext_double_down_write - Acquire two inodes' write semaphore
3143     + * double_down_write_data_sem - Acquire two inodes' write lock of i_data_sem
3144     *
3145     * @orig_inode: original inode structure
3146     * @donor_inode: donor inode structure
3147     - * Acquire write semaphore of the two inodes (orig and donor) by i_ino order.
3148     + * Acquire write lock of i_data_sem of the two inodes (orig and donor) by
3149     + * i_ino order.
3150     */
3151     static void
3152     -mext_double_down_write(struct inode *orig_inode, struct inode *donor_inode)
3153     +double_down_write_data_sem(struct inode *orig_inode, struct inode *donor_inode)
3154     {
3155     struct inode *first = orig_inode, *second = donor_inode;
3156    
3157     - BUG_ON(orig_inode == NULL || donor_inode == NULL);
3158     -
3159     /*
3160     * Use the inode number to provide the stable locking order instead
3161     * of its address, because the C language doesn't guarantee you can
3162     @@ -165,37 +189,19 @@ mext_double_down_write(struct inode *orig_inode, struct inode *donor_inode)
3163     }
3164    
3165     down_write(&EXT4_I(first)->i_data_sem);
3166     - down_write(&EXT4_I(second)->i_data_sem);
3167     -}
3168     -
3169     -/**
3170     - * mext_double_up_read - Release two inodes' read semaphore
3171     - *
3172     - * @orig_inode: original inode structure to be released its lock first
3173     - * @donor_inode: donor inode structure to be released its lock second
3174     - * Release read semaphore of two inodes (orig and donor).
3175     - */
3176     -static void
3177     -mext_double_up_read(struct inode *orig_inode, struct inode *donor_inode)
3178     -{
3179     - BUG_ON(orig_inode == NULL || donor_inode == NULL);
3180     -
3181     - up_read(&EXT4_I(orig_inode)->i_data_sem);
3182     - up_read(&EXT4_I(donor_inode)->i_data_sem);
3183     + down_write_nested(&EXT4_I(second)->i_data_sem, SINGLE_DEPTH_NESTING);
3184     }
3185    
3186     /**
3187     - * mext_double_up_write - Release two inodes' write semaphore
3188     + * double_up_write_data_sem - Release two inodes' write lock of i_data_sem
3189     *
3190     * @orig_inode: original inode structure to be released its lock first
3191     * @donor_inode: donor inode structure to be released its lock second
3192     - * Release write semaphore of two inodes (orig and donor).
3193     + * Release write lock of i_data_sem of two inodes (orig and donor).
3194     */
3195     static void
3196     -mext_double_up_write(struct inode *orig_inode, struct inode *donor_inode)
3197     +double_up_write_data_sem(struct inode *orig_inode, struct inode *donor_inode)
3198     {
3199     - BUG_ON(orig_inode == NULL || donor_inode == NULL);
3200     -
3201     up_write(&EXT4_I(orig_inode)->i_data_sem);
3202     up_write(&EXT4_I(donor_inode)->i_data_sem);
3203     }
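
double_down_write_data_sem() above keeps the classic deadlock-avoidance rule: whenever two locks must be held, every task acquires them in one globally stable order, here the inode number. A runnable userspace sketch of the same discipline with pthread mutexes; the id field plays the role of i_ino:

    #include <pthread.h>

    struct res {
        unsigned long id;       /* plays the role of i_ino */
        pthread_mutex_t lock;   /* plays the role of i_data_sem */
    };

    /* Always lock the smaller id first: two tasks locking the same
     * pair can then never hold one lock each and wait on the other. */
    static void double_lock(struct res *a, struct res *b)
    {
        struct res *first = a, *second = b;

        if (b->id < a->id) {
            first = b;
            second = a;
        }
        pthread_mutex_lock(&first->lock);
        pthread_mutex_lock(&second->lock); /* the kernel marks this one nested */
    }

    static void double_unlock(struct res *a, struct res *b)
    {
        pthread_mutex_unlock(&a->lock);
        pthread_mutex_unlock(&b->lock);
    }

    int main(void)
    {
        struct res a = { 10, PTHREAD_MUTEX_INITIALIZER };
        struct res b = {  3, PTHREAD_MUTEX_INITIALIZER };

        double_lock(&a, &b);    /* b (id 3) is taken first either way */
        double_unlock(&a, &b);
        return 0;
    }
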
3204     @@ -283,23 +289,23 @@ mext_insert_across_blocks(handle_t *handle, struct inode *orig_inode,
3205     }
3206    
3207     if (new_flag) {
3208     - get_ext_path(orig_path, orig_inode, eblock, err);
3209     - if (orig_path == NULL)
3210     + err = get_ext_path(orig_inode, eblock, &orig_path);
3211     + if (err)
3212     goto out;
3213    
3214     if (ext4_ext_insert_extent(handle, orig_inode,
3215     - orig_path, new_ext))
3216     + orig_path, new_ext, 0))
3217     goto out;
3218     }
3219    
3220     if (end_flag) {
3221     - get_ext_path(orig_path, orig_inode,
3222     - le32_to_cpu(end_ext->ee_block) - 1, err);
3223     - if (orig_path == NULL)
3224     + err = get_ext_path(orig_inode,
3225     + le32_to_cpu(end_ext->ee_block) - 1, &orig_path);
3226     + if (err)
3227     goto out;
3228    
3229     if (ext4_ext_insert_extent(handle, orig_inode,
3230     - orig_path, end_ext))
3231     + orig_path, end_ext, 0))
3232     goto out;
3233     }
3234     out:
3235     @@ -519,7 +525,15 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode,
3236     * oext |-----------|
3237     * new_ext |-------|
3238     */
3239     - BUG_ON(le32_to_cpu(oext->ee_block) + oext_alen - 1 < new_ext_end);
3240     + if (le32_to_cpu(oext->ee_block) + oext_alen - 1 < new_ext_end) {
3241     + ext4_error(orig_inode->i_sb, __func__,
3242     + "new_ext_end(%u) should be less than or equal to "
3243     + "oext->ee_block(%u) + oext_alen(%d) - 1",
3244     + new_ext_end, le32_to_cpu(oext->ee_block),
3245     + oext_alen);
3246     + ret = -EIO;
3247     + goto out;
3248     + }
3249    
3250     /*
3251     * Case: new_ext is smaller than original extent
3252     @@ -543,6 +557,7 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode,
3253    
3254     ret = mext_insert_extents(handle, orig_inode, orig_path, o_start,
3255     o_end, &start_ext, &new_ext, &end_ext);
3256     +out:
3257     return ret;
3258     }
3259    
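This hunk shows a pattern the patch applies in several places: an on-disk inconsistency that used to trip BUG_ON() is downgraded to an ext4_error() report plus -EIO, so one bad extent fails the ioctl rather than the whole machine. A sketch of the shape of that change, with fprintf standing in for ext4_error():

    #include <errno.h>
    #include <stdio.h>

    /* The hardened shape: log the details, return -EIO, keep running. */
    static int check_extent_end(unsigned int ext_end, unsigned int new_end)
    {
        if (ext_end < new_end) {
            fprintf(stderr, "new end %u is past extent end %u\n",
                    new_end, ext_end);
            return -EIO;
        }
        return 0;
    }

    int main(void)
    {
        return check_extent_end(150, 200) == -EIO ? 0 : 1;
    }
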
3260     @@ -554,8 +569,10 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode,
3261     * @orig_off: block offset of original inode
3262     * @donor_off: block offset of donor inode
3263     * @max_count: the maximum length of extents
3264     + *
3265     + * Return 0 on success, or a negative error value on failure.
3266     */
3267     -static void
3268     +static int
3269     mext_calc_swap_extents(struct ext4_extent *tmp_dext,
3270     struct ext4_extent *tmp_oext,
3271     ext4_lblk_t orig_off, ext4_lblk_t donor_off,
3272     @@ -564,6 +581,19 @@ mext_calc_swap_extents(struct ext4_extent *tmp_dext,
3273     ext4_lblk_t diff, orig_diff;
3274     struct ext4_extent dext_old, oext_old;
3275    
3276     + BUG_ON(orig_off != donor_off);
3277     +
3278     + /* original and donor extents have to cover the same block offset */
3279     + if (orig_off < le32_to_cpu(tmp_oext->ee_block) ||
3280     + le32_to_cpu(tmp_oext->ee_block) +
3281     + ext4_ext_get_actual_len(tmp_oext) - 1 < orig_off)
3282     + return -ENODATA;
3283     +
3284     + if (orig_off < le32_to_cpu(tmp_dext->ee_block) ||
3285     + le32_to_cpu(tmp_dext->ee_block) +
3286     + ext4_ext_get_actual_len(tmp_dext) - 1 < orig_off)
3287     + return -ENODATA;
3288     +
3289     dext_old = *tmp_dext;
3290     oext_old = *tmp_oext;
3291    
3292     @@ -591,6 +621,8 @@ mext_calc_swap_extents(struct ext4_extent *tmp_dext,
3293    
3294     copy_extent_status(&oext_old, tmp_dext);
3295     copy_extent_status(&dext_old, tmp_oext);
3296     +
3297     + return 0;
3298     }
3299    
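The added checks verify that the swap offset really lies inside both the original and the donor extent before anything is exchanged. A sketch of that coverage test, written in subtraction form so it cannot overflow at the top of the logical range:

    #include <stdbool.h>
    #include <stdint.h>

    /* A swap is only legal when off lies in [ee_block, ee_block+len-1]. */
    static bool extent_covers(uint32_t ee_block, uint32_t len, uint32_t off)
    {
        return off >= ee_block && off - ee_block < len;
    }

    int main(void)
    {
        /* extent [100, 149]: 120 is covered, 150 is not */
        return extent_covers(100, 50, 120) &&
               !extent_covers(100, 50, 150) ? 0 : 1;
    }
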
3300     /**
3301     @@ -601,6 +633,7 @@ mext_calc_swap_extents(struct ext4_extent *tmp_dext,
3302     * @donor_inode: donor inode
3303     * @from: block offset of orig_inode
3304     * @count: block count to be replaced
3305     + * @err: pointer to save return value
3306     *
3307     * Replace original inode extents and donor inode extents page by page.
3308     * We implement this replacement in the following three steps:
3309     @@ -611,33 +644,33 @@ mext_calc_swap_extents(struct ext4_extent *tmp_dext,
3310     * 3. Change the block information of donor inode to point at the saved
3311     * original inode blocks in the dummy extents.
3312     *
3313     - * Return 0 on success, or a negative error value on failure.
3314     + * Return replaced block count.
3315     */
3316     static int
3317     mext_replace_branches(handle_t *handle, struct inode *orig_inode,
3318     struct inode *donor_inode, ext4_lblk_t from,
3319     - ext4_lblk_t count)
3320     + ext4_lblk_t count, int *err)
3321     {
3322     struct ext4_ext_path *orig_path = NULL;
3323     struct ext4_ext_path *donor_path = NULL;
3324     struct ext4_extent *oext, *dext;
3325     struct ext4_extent tmp_dext, tmp_oext;
3326     ext4_lblk_t orig_off = from, donor_off = from;
3327     - int err = 0;
3328     int depth;
3329     int replaced_count = 0;
3330     int dext_alen;
3331    
3332     - mext_double_down_write(orig_inode, donor_inode);
3333     + /* Protect extent trees against block allocations via delalloc */
3334     + double_down_write_data_sem(orig_inode, donor_inode);
3335    
3336     /* Get the original extent for the block "orig_off" */
3337     - get_ext_path(orig_path, orig_inode, orig_off, err);
3338     - if (orig_path == NULL)
3339     + *err = get_ext_path(orig_inode, orig_off, &orig_path);
3340     + if (*err)
3341     goto out;
3342    
3343     /* Get the donor extent for the head */
3344     - get_ext_path(donor_path, donor_inode, donor_off, err);
3345     - if (donor_path == NULL)
3346     + *err = get_ext_path(donor_inode, donor_off, &donor_path);
3347     + if (*err)
3348     goto out;
3349     depth = ext_depth(orig_inode);
3350     oext = orig_path[depth].p_ext;
3351     @@ -647,24 +680,39 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode,
3352     dext = donor_path[depth].p_ext;
3353     tmp_dext = *dext;
3354    
3355     - mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off,
3356     + *err = mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off,
3357     donor_off, count);
3358     + if (*err)
3359     + goto out;
3360    
3361     /* Loop for the donor extents */
3362     while (1) {
3363     /* The extent for donor must be found. */
3364     - BUG_ON(!dext || donor_off != le32_to_cpu(tmp_dext.ee_block));
3365     + if (!dext) {
3366     + ext4_error(donor_inode->i_sb, __func__,
3367     + "The extent for donor must be found");
3368     + *err = -EIO;
3369     + goto out;
3370     + } else if (donor_off != le32_to_cpu(tmp_dext.ee_block)) {
3371     + ext4_error(donor_inode->i_sb, __func__,
3372     + "Donor offset(%u) and the first block of donor "
3373     + "extent(%u) should be equal",
3374     + donor_off,
3375     + le32_to_cpu(tmp_dext.ee_block));
3376     + *err = -EIO;
3377     + goto out;
3378     + }
3379    
3380     /* Set donor extent to orig extent */
3381     - err = mext_leaf_block(handle, orig_inode,
3382     + *err = mext_leaf_block(handle, orig_inode,
3383     orig_path, &tmp_dext, &orig_off);
3384     - if (err < 0)
3385     + if (*err)
3386     goto out;
3387    
3388     /* Set orig extent to donor extent */
3389     - err = mext_leaf_block(handle, donor_inode,
3390     + *err = mext_leaf_block(handle, donor_inode,
3391     donor_path, &tmp_oext, &donor_off);
3392     - if (err < 0)
3393     + if (*err)
3394     goto out;
3395    
3396     dext_alen = ext4_ext_get_actual_len(&tmp_dext);
3397     @@ -678,36 +726,26 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode,
3398    
3399     if (orig_path)
3400     ext4_ext_drop_refs(orig_path);
3401     - get_ext_path(orig_path, orig_inode, orig_off, err);
3402     - if (orig_path == NULL)
3403     + *err = get_ext_path(orig_inode, orig_off, &orig_path);
3404     + if (*err)
3405     goto out;
3406     depth = ext_depth(orig_inode);
3407     oext = orig_path[depth].p_ext;
3408     - if (le32_to_cpu(oext->ee_block) +
3409     - ext4_ext_get_actual_len(oext) <= orig_off) {
3410     - err = 0;
3411     - goto out;
3412     - }
3413     tmp_oext = *oext;
3414    
3415     if (donor_path)
3416     ext4_ext_drop_refs(donor_path);
3417     - get_ext_path(donor_path, donor_inode,
3418     - donor_off, err);
3419     - if (donor_path == NULL)
3420     + *err = get_ext_path(donor_inode, donor_off, &donor_path);
3421     + if (*err)
3422     goto out;
3423     depth = ext_depth(donor_inode);
3424     dext = donor_path[depth].p_ext;
3425     - if (le32_to_cpu(dext->ee_block) +
3426     - ext4_ext_get_actual_len(dext) <= donor_off) {
3427     - err = 0;
3428     - goto out;
3429     - }
3430     tmp_dext = *dext;
3431    
3432     - mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off,
3433     - donor_off,
3434     - count - replaced_count);
3435     + *err = mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off,
3436     + donor_off, count - replaced_count);
3437     + if (*err)
3438     + goto out;
3439     }
3440    
3441     out:
3442     @@ -720,8 +758,12 @@ out:
3443     kfree(donor_path);
3444     }
3445    
3446     - mext_double_up_write(orig_inode, donor_inode);
3447     - return err;
3448     + ext4_ext_invalidate_cache(orig_inode);
3449     + ext4_ext_invalidate_cache(donor_inode);
3450     +
3451     + double_up_write_data_sem(orig_inode, donor_inode);
3452     +
3453     + return replaced_count;
3454     }
3455    
3456     /**
3457     @@ -733,16 +775,17 @@ out:
3458     * @data_offset_in_page: block index where data swapping starts
3459     * @block_len_in_page: the number of blocks to be swapped
3460     * @uninit: orig extent is uninitialized or not
3461     + * @err: pointer to save return value
3462     *
3463     * Save the data in original inode blocks and replace original inode extents
3464     * with donor inode extents by calling mext_replace_branches().
3465     - * Finally, write out the saved data in new original inode blocks. Return 0
3466     - * on success, or a negative error value on failure.
3467     + * Finally, write out the saved data in new original inode blocks. Return
3468     + * replaced block count.
3469     */
3470     static int
3471     -move_extent_par_page(struct file *o_filp, struct inode *donor_inode,
3472     +move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
3473     pgoff_t orig_page_offset, int data_offset_in_page,
3474     - int block_len_in_page, int uninit)
3475     + int block_len_in_page, int uninit, int *err)
3476     {
3477     struct inode *orig_inode = o_filp->f_dentry->d_inode;
3478     struct address_space *mapping = orig_inode->i_mapping;
3479     @@ -754,9 +797,11 @@ move_extent_par_page(struct file *o_filp, struct inode *donor_inode,
3480     long long offs = orig_page_offset << PAGE_CACHE_SHIFT;
3481     unsigned long blocksize = orig_inode->i_sb->s_blocksize;
3482     unsigned int w_flags = 0;
3483     - unsigned int tmp_data_len, data_len;
3484     + unsigned int tmp_data_size, data_size, replaced_size;
3485     void *fsdata;
3486     - int ret, i, jblocks;
3487     + int i, jblocks;
3488     + int err2 = 0;
3489     + int replaced_count = 0;
3490     int blocks_per_page = PAGE_CACHE_SIZE >> orig_inode->i_blkbits;
3491    
3492     /*
3493     @@ -766,8 +811,8 @@ move_extent_par_page(struct file *o_filp, struct inode *donor_inode,
3494     jblocks = ext4_writepage_trans_blocks(orig_inode) * 2;
3495     handle = ext4_journal_start(orig_inode, jblocks);
3496     if (IS_ERR(handle)) {
3497     - ret = PTR_ERR(handle);
3498     - return ret;
3499     + *err = PTR_ERR(handle);
3500     + return 0;
3501     }
3502    
3503     if (segment_eq(get_fs(), KERNEL_DS))
3504     @@ -783,39 +828,36 @@ move_extent_par_page(struct file *o_filp, struct inode *donor_inode,
3505     * Just swap data blocks between orig and donor.
3506     */
3507     if (uninit) {
3508     - ret = mext_replace_branches(handle, orig_inode,
3509     - donor_inode, orig_blk_offset,
3510     - block_len_in_page);
3511     -
3512     - /* Clear the inode cache not to refer to the old data */
3513     - ext4_ext_invalidate_cache(orig_inode);
3514     - ext4_ext_invalidate_cache(donor_inode);
3515     + replaced_count = mext_replace_branches(handle, orig_inode,
3516     + donor_inode, orig_blk_offset,
3517     + block_len_in_page, err);
3518     goto out2;
3519     }
3520    
3521     offs = (long long)orig_blk_offset << orig_inode->i_blkbits;
3522    
3523     - /* Calculate data_len */
3524     + /* Calculate data_size */
3525     if ((orig_blk_offset + block_len_in_page - 1) ==
3526     ((orig_inode->i_size - 1) >> orig_inode->i_blkbits)) {
3527     /* Replace the last block */
3528     - tmp_data_len = orig_inode->i_size & (blocksize - 1);
3529     + tmp_data_size = orig_inode->i_size & (blocksize - 1);
3530     /*
3531     - * If data_len equal zero, it shows data_len is multiples of
3532     +		 * If data_size is zero, it means data_size is a multiple of
3533     * blocksize. So we set appropriate value.
3534     */
3535     - if (tmp_data_len == 0)
3536     - tmp_data_len = blocksize;
3537     + if (tmp_data_size == 0)
3538     + tmp_data_size = blocksize;
3539    
3540     - data_len = tmp_data_len +
3541     + data_size = tmp_data_size +
3542     ((block_len_in_page - 1) << orig_inode->i_blkbits);
3543     - } else {
3544     - data_len = block_len_in_page << orig_inode->i_blkbits;
3545     - }
3546     + } else
3547     + data_size = block_len_in_page << orig_inode->i_blkbits;
3548     +
3549     + replaced_size = data_size;
3550    
3551     - ret = a_ops->write_begin(o_filp, mapping, offs, data_len, w_flags,
3552     + *err = a_ops->write_begin(o_filp, mapping, offs, data_size, w_flags,
3553     &page, &fsdata);
3554     - if (unlikely(ret < 0))
3555     + if (unlikely(*err < 0))
3556     goto out;
3557    
3558     if (!PageUptodate(page)) {
3559     @@ -836,14 +878,17 @@ move_extent_par_page(struct file *o_filp, struct inode *donor_inode,
3560     /* Release old bh and drop refs */
3561     try_to_release_page(page, 0);
3562    
3563     - ret = mext_replace_branches(handle, orig_inode, donor_inode,
3564     - orig_blk_offset, block_len_in_page);
3565     - if (ret < 0)
3566     - goto out;
3567     -
3568     - /* Clear the inode cache not to refer to the old data */
3569     - ext4_ext_invalidate_cache(orig_inode);
3570     - ext4_ext_invalidate_cache(donor_inode);
3571     + replaced_count = mext_replace_branches(handle, orig_inode, donor_inode,
3572     + orig_blk_offset, block_len_in_page,
3573     + &err2);
3574     + if (err2) {
3575     + if (replaced_count) {
3576     + block_len_in_page = replaced_count;
3577     + replaced_size =
3578     + block_len_in_page << orig_inode->i_blkbits;
3579     + } else
3580     + goto out;
3581     + }
3582    
3583     if (!page_has_buffers(page))
3584     create_empty_buffers(page, 1 << orig_inode->i_blkbits, 0);
3585     @@ -853,16 +898,16 @@ move_extent_par_page(struct file *o_filp, struct inode *donor_inode,
3586     bh = bh->b_this_page;
3587    
3588     for (i = 0; i < block_len_in_page; i++) {
3589     - ret = ext4_get_block(orig_inode,
3590     + *err = ext4_get_block(orig_inode,
3591     (sector_t)(orig_blk_offset + i), bh, 0);
3592     - if (ret < 0)
3593     + if (*err < 0)
3594     goto out;
3595    
3596     if (bh->b_this_page != NULL)
3597     bh = bh->b_this_page;
3598     }
3599    
3600     - ret = a_ops->write_end(o_filp, mapping, offs, data_len, data_len,
3601     + *err = a_ops->write_end(o_filp, mapping, offs, data_size, replaced_size,
3602     page, fsdata);
3603     page = NULL;
3604    
3605     @@ -871,11 +916,15 @@ out:
3606     if (PageLocked(page))
3607     unlock_page(page);
3608     page_cache_release(page);
3609     + ext4_journal_stop(handle);
3610     }
3611     out2:
3612     ext4_journal_stop(handle);
3613    
3614     - return ret < 0 ? ret : 0;
3615     + if (err2)
3616     + *err = err2;
3617     +
3618     + return replaced_count;
3619     }
3620    
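After this hunk, move_extent_per_page() and mext_replace_branches() share a new calling convention: the return value is the number of blocks actually replaced and any error travels through an out parameter, so the caller can account for partial progress before deciding to stop. A sketch of that convention; do_one_block() is a hypothetical per-block worker:

    #include <errno.h>
    #include <stdio.h>

    /* Hypothetical worker; fails at block 7 to show partial progress. */
    static int do_one_block(int blk)
    {
        return blk == 7 ? -EIO : 0;
    }

    /* Returns the work done; the error, if any, goes through *err. */
    static int replace_blocks(int from, int count, int *err)
    {
        int done = 0;

        *err = 0;
        while (done < count) {
            *err = do_one_block(from + done);
            if (*err)
                break;          /* report the error, keep the partial count */
            done++;
        }
        return done;
    }

    int main(void)
    {
        int err;
        int done = replace_blocks(5, 10, &err);

        printf("replaced %d blocks, err %d\n", done, err); /* 2 blocks, -EIO */
        return 0;
    }
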
3621     /**
3622     @@ -886,7 +935,6 @@ out2:
3623     * @orig_start: logical start offset in block for orig
3624     * @donor_start: logical start offset in block for donor
3625     * @len: the number of blocks to be moved
3626     - * @moved_len: moved block length
3627     *
3628     * Check the arguments of ext4_move_extents() whether the files can be
3629     * exchanged with each other.
3630     @@ -894,9 +942,13 @@ out2:
3631     */
3632     static int
3633     mext_check_arguments(struct inode *orig_inode,
3634     - struct inode *donor_inode, __u64 orig_start,
3635     - __u64 donor_start, __u64 *len, __u64 moved_len)
3636     + struct inode *donor_inode, __u64 orig_start,
3637     + __u64 donor_start, __u64 *len)
3638     {
3639     + ext4_lblk_t orig_blocks, donor_blocks;
3640     + unsigned int blkbits = orig_inode->i_blkbits;
3641     + unsigned int blocksize = 1 << blkbits;
3642     +
3643     /* Regular file check */
3644     if (!S_ISREG(orig_inode->i_mode) || !S_ISREG(donor_inode->i_mode)) {
3645     ext4_debug("ext4 move extent: The argument files should be "
3646     @@ -905,6 +957,13 @@ mext_check_arguments(struct inode *orig_inode,
3647     return -EINVAL;
3648     }
3649    
3650     + if (donor_inode->i_mode & (S_ISUID|S_ISGID)) {
3651     + ext4_debug("ext4 move extent: suid or sgid is set"
3652     + " to donor file [ino:orig %lu, donor %lu]\n",
3653     + orig_inode->i_ino, donor_inode->i_ino);
3654     + return -EINVAL;
3655     + }
3656     +
3657     /* Ext4 move extent does not support swapfile */
3658     if (IS_SWAPFILE(orig_inode) || IS_SWAPFILE(donor_inode)) {
3659     ext4_debug("ext4 move extent: The argument files should "
3660     @@ -921,14 +980,6 @@ mext_check_arguments(struct inode *orig_inode,
3661     return -EINVAL;
3662     }
3663    
3664     - /* orig and donor should be different file */
3665     - if (orig_inode->i_ino == donor_inode->i_ino) {
3666     - ext4_debug("ext4 move extent: The argument files should not "
3667     - "be same file [ino:orig %lu, donor %lu]\n",
3668     - orig_inode->i_ino, donor_inode->i_ino);
3669     - return -EINVAL;
3670     - }
3671     -
3672     /* Ext4 move extent supports only extent based file */
3673     if (!(EXT4_I(orig_inode)->i_flags & EXT4_EXTENTS_FL)) {
3674     ext4_debug("ext4 move extent: orig file is not extents "
3675     @@ -953,13 +1004,6 @@ mext_check_arguments(struct inode *orig_inode,
3676     return -EINVAL;
3677     }
3678    
3679     - if (moved_len) {
3680     - ext4_debug("ext4 move extent: moved_len should be 0 "
3681     - "[ino:orig %lu, donor %lu]\n", orig_inode->i_ino,
3682     - donor_inode->i_ino);
3683     - return -EINVAL;
3684     - }
3685     -
3686     if ((orig_start > MAX_DEFRAG_SIZE) ||
3687     (donor_start > MAX_DEFRAG_SIZE) ||
3688     (*len > MAX_DEFRAG_SIZE) ||
3689     @@ -971,43 +1015,47 @@ mext_check_arguments(struct inode *orig_inode,
3690     }
3691    
3692     if (orig_inode->i_size > donor_inode->i_size) {
3693     - if (orig_start >= donor_inode->i_size) {
3694     + donor_blocks = (donor_inode->i_size + blocksize - 1) >> blkbits;
3695     + /* TODO: eliminate this artificial restriction */
3696     + if (orig_start >= donor_blocks) {
3697     ext4_debug("ext4 move extent: orig start offset "
3698     - "[%llu] should be less than donor file size "
3699     - "[%lld] [ino:orig %lu, donor_inode %lu]\n",
3700     - orig_start, donor_inode->i_size,
3701     + "[%llu] should be less than donor file blocks "
3702     + "[%u] [ino:orig %lu, donor %lu]\n",
3703     + orig_start, donor_blocks,
3704     orig_inode->i_ino, donor_inode->i_ino);
3705     return -EINVAL;
3706     }
3707    
3708     - if (orig_start + *len > donor_inode->i_size) {
3709     + /* TODO: eliminate this artificial restriction */
3710     + if (orig_start + *len > donor_blocks) {
3711     ext4_debug("ext4 move extent: End offset [%llu] should "
3712     - "be less than donor file size [%lld]."
3713     - "So adjust length from %llu to %lld "
3714     + "be less than donor file blocks [%u]."
3715     + "So adjust length from %llu to %llu "
3716     "[ino:orig %lu, donor %lu]\n",
3717     - orig_start + *len, donor_inode->i_size,
3718     - *len, donor_inode->i_size - orig_start,
3719     + orig_start + *len, donor_blocks,
3720     + *len, donor_blocks - orig_start,
3721     orig_inode->i_ino, donor_inode->i_ino);
3722     - *len = donor_inode->i_size - orig_start;
3723     + *len = donor_blocks - orig_start;
3724     }
3725     } else {
3726     - if (orig_start >= orig_inode->i_size) {
3727     + orig_blocks = (orig_inode->i_size + blocksize - 1) >> blkbits;
3728     + if (orig_start >= orig_blocks) {
3729     ext4_debug("ext4 move extent: start offset [%llu] "
3730     - "should be less than original file size "
3731     - "[%lld] [inode:orig %lu, donor %lu]\n",
3732     - orig_start, orig_inode->i_size,
3733     + "should be less than original file blocks "
3734     + "[%u] [ino:orig %lu, donor %lu]\n",
3735     + orig_start, orig_blocks,
3736     orig_inode->i_ino, donor_inode->i_ino);
3737     return -EINVAL;
3738     }
3739    
3740     - if (orig_start + *len > orig_inode->i_size) {
3741     + if (orig_start + *len > orig_blocks) {
3742     ext4_debug("ext4 move extent: Adjust length "
3743     - "from %llu to %lld. Because it should be "
3744     - "less than original file size "
3745     + "from %llu to %llu. Because it should be "
3746     + "less than original file blocks "
3747     "[ino:orig %lu, donor %lu]\n",
3748     - *len, orig_inode->i_size - orig_start,
3749     + *len, orig_blocks - orig_start,
3750     orig_inode->i_ino, donor_inode->i_ino);
3751     - *len = orig_inode->i_size - orig_start;
3752     + *len = orig_blocks - orig_start;
3753     }
3754     }
3755    
3756     @@ -1027,18 +1075,23 @@ mext_check_arguments(struct inode *orig_inode,
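mext_check_arguments() now compares block offsets against block counts instead of byte sizes, rounding i_size up to a whole block first. A worked sketch of the conversion:

    #include <stdint.h>
    #include <stdio.h>

    /* Round the byte size up to whole filesystem blocks, as the hunk
     * above does with (i_size + blocksize - 1) >> blkbits. */
    static uint64_t size_to_blocks(uint64_t i_size, unsigned int blkbits)
    {
        uint64_t blocksize = 1ULL << blkbits;

        return (i_size + blocksize - 1) >> blkbits;
    }

    int main(void)
    {
        /* a 5000-byte file with 4KiB blocks occupies 2 blocks */
        printf("%llu\n", (unsigned long long)size_to_blocks(5000, 12));
        return 0;
    }
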
3757     * @inode1: the inode structure
3758     * @inode2: the inode structure
3759     *
3760     - * Lock two inodes' i_mutex by i_ino order. This function is moved from
3761     - * fs/inode.c.
3762     + * Lock two inodes' i_mutex by i_ino order.
3763     + * If inode1 or inode2 is NULL, return -EIO. Otherwise, return 0.
3764     */
3765     -static void
3766     +static int
3767     mext_inode_double_lock(struct inode *inode1, struct inode *inode2)
3768     {
3769     - if (inode1 == NULL || inode2 == NULL || inode1 == inode2) {
3770     - if (inode1)
3771     - mutex_lock(&inode1->i_mutex);
3772     - else if (inode2)
3773     - mutex_lock(&inode2->i_mutex);
3774     - return;
3775     + int ret = 0;
3776     +
3777     + BUG_ON(inode1 == NULL && inode2 == NULL);
3778     +
3779     + ret = mext_check_null_inode(inode1, inode2, __func__);
3780     + if (ret < 0)
3781     + goto out;
3782     +
3783     + if (inode1 == inode2) {
3784     + mutex_lock(&inode1->i_mutex);
3785     + goto out;
3786     }
3787    
3788     if (inode1->i_ino < inode2->i_ino) {
3789     @@ -1048,6 +1101,9 @@ mext_inode_double_lock(struct inode *inode1, struct inode *inode2)
3790     mutex_lock_nested(&inode2->i_mutex, I_MUTEX_PARENT);
3791     mutex_lock_nested(&inode1->i_mutex, I_MUTEX_CHILD);
3792     }
3793     +
3794     +out:
3795     + return ret;
3796     }
3797    
3798     /**
3799     @@ -1056,17 +1112,28 @@ mext_inode_double_lock(struct inode *inode1, struct inode *inode2)
3800     * @inode1: the inode that is released first
3801     * @inode2: the inode that is released second
3802     *
3803     - * This function is moved from fs/inode.c.
3804     + * If inode1 or inode2 is NULL, return -EIO. Otherwise, return 0.
3805     */
3806    
3807     -static void
3808     +static int
3809     mext_inode_double_unlock(struct inode *inode1, struct inode *inode2)
3810     {
3811     + int ret = 0;
3812     +
3813     + BUG_ON(inode1 == NULL && inode2 == NULL);
3814     +
3815     + ret = mext_check_null_inode(inode1, inode2, __func__);
3816     + if (ret < 0)
3817     + goto out;
3818     +
3819     if (inode1)
3820     mutex_unlock(&inode1->i_mutex);
3821    
3822     if (inode2 && inode2 != inode1)
3823     mutex_unlock(&inode2->i_mutex);
3824     +
3825     +out:
3826     + return ret;
3827     }
3828    
3829     /**
3830     @@ -1123,70 +1190,84 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
3831     ext4_lblk_t block_end, seq_start, add_blocks, file_end, seq_blocks = 0;
3832     ext4_lblk_t rest_blocks;
3833     pgoff_t orig_page_offset = 0, seq_end_page;
3834     - int ret, depth, last_extent = 0;
3835     + int ret1, ret2, depth, last_extent = 0;
3836     int blocks_per_page = PAGE_CACHE_SIZE >> orig_inode->i_blkbits;
3837     int data_offset_in_page;
3838     int block_len_in_page;
3839     int uninit;
3840    
3841     - /* protect orig and donor against a truncate */
3842     - mext_inode_double_lock(orig_inode, donor_inode);
3843     + /* orig and donor should be different file */
3844     + if (orig_inode->i_ino == donor_inode->i_ino) {
3845     + ext4_debug("ext4 move extent: The argument files should not "
3846     + "be same file [ino:orig %lu, donor %lu]\n",
3847     + orig_inode->i_ino, donor_inode->i_ino);
3848     + return -EINVAL;
3849     + }
3850     +
3851     + /* Protect orig and donor inodes against a truncate */
3852     + ret1 = mext_inode_double_lock(orig_inode, donor_inode);
3853     + if (ret1 < 0)
3854     + return ret1;
3855    
3856     - mext_double_down_read(orig_inode, donor_inode);
3857     + /* Protect extent tree against block allocations via delalloc */
3858     + double_down_write_data_sem(orig_inode, donor_inode);
3859     /* Check the filesystem environment whether move_extent can be done */
3860     - ret = mext_check_arguments(orig_inode, donor_inode, orig_start,
3861     - donor_start, &len, *moved_len);
3862     - mext_double_up_read(orig_inode, donor_inode);
3863     - if (ret)
3864     - goto out2;
3865     + ret1 = mext_check_arguments(orig_inode, donor_inode, orig_start,
3866     + donor_start, &len);
3867     + if (ret1)
3868     + goto out;
3869    
3870     file_end = (i_size_read(orig_inode) - 1) >> orig_inode->i_blkbits;
3871     block_end = block_start + len - 1;
3872     if (file_end < block_end)
3873     len -= block_end - file_end;
3874    
3875     - get_ext_path(orig_path, orig_inode, block_start, ret);
3876     - if (orig_path == NULL)
3877     - goto out2;
3878     + ret1 = get_ext_path(orig_inode, block_start, &orig_path);
3879     + if (ret1)
3880     + goto out;
3881    
3882     /* Get path structure to check the hole */
3883     - get_ext_path(holecheck_path, orig_inode, block_start, ret);
3884     - if (holecheck_path == NULL)
3885     + ret1 = get_ext_path(orig_inode, block_start, &holecheck_path);
3886     + if (ret1)
3887     goto out;
3888    
3889     depth = ext_depth(orig_inode);
3890     ext_cur = holecheck_path[depth].p_ext;
3891     - if (ext_cur == NULL) {
3892     - ret = -EINVAL;
3893     - goto out;
3894     - }
3895    
3896     /*
3897     - * Get proper extent whose ee_block is beyond block_start
3898     - * if block_start was within the hole.
3899     + * Get proper starting location of block replacement if block_start was
3900     + * within the hole.
3901     */
3902     if (le32_to_cpu(ext_cur->ee_block) +
3903     ext4_ext_get_actual_len(ext_cur) - 1 < block_start) {
3904     + /*
3905     + * The hole exists between extents or at the tail of
3906     + * the original file.
3907     + */
3908     last_extent = mext_next_extent(orig_inode,
3909     holecheck_path, &ext_cur);
3910     if (last_extent < 0) {
3911     - ret = last_extent;
3912     + ret1 = last_extent;
3913     goto out;
3914     }
3915     last_extent = mext_next_extent(orig_inode, orig_path,
3916     &ext_dummy);
3917     if (last_extent < 0) {
3918     - ret = last_extent;
3919     + ret1 = last_extent;
3920     goto out;
3921     }
3922     - }
3923     - seq_start = block_start;
3924     + seq_start = le32_to_cpu(ext_cur->ee_block);
3925     + } else if (le32_to_cpu(ext_cur->ee_block) > block_start)
3926     + /* The hole exists at the beginning of the original file. */
3927     + seq_start = le32_to_cpu(ext_cur->ee_block);
3928     + else
3929     + seq_start = block_start;
3930    
3931     /* No blocks within the specified range. */
3932     if (le32_to_cpu(ext_cur->ee_block) > block_end) {
3933     ext4_debug("ext4 move extent: The specified range of file "
3934     "may be the hole\n");
3935     - ret = -EINVAL;
3936     + ret1 = -EINVAL;
3937     goto out;
3938     }
3939    
3940     @@ -1206,7 +1287,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
3941     last_extent = mext_next_extent(orig_inode, holecheck_path,
3942     &ext_cur);
3943     if (last_extent < 0) {
3944     - ret = last_extent;
3945     + ret1 = last_extent;
3946     break;
3947     }
3948     add_blocks = ext4_ext_get_actual_len(ext_cur);
3949     @@ -1246,29 +1327,39 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
3950     seq_start = le32_to_cpu(ext_cur->ee_block);
3951     rest_blocks = seq_blocks;
3952    
3953     - /* Discard preallocations of two inodes */
3954     - down_write(&EXT4_I(orig_inode)->i_data_sem);
3955     - ext4_discard_preallocations(orig_inode);
3956     - up_write(&EXT4_I(orig_inode)->i_data_sem);
3957     -
3958     - down_write(&EXT4_I(donor_inode)->i_data_sem);
3959     - ext4_discard_preallocations(donor_inode);
3960     - up_write(&EXT4_I(donor_inode)->i_data_sem);
3961     + /*
3962     + * Up semaphore to avoid the following problems:
3963     + * a. transaction deadlock among ext4_journal_start,
3964     + * ->write_begin via pagefault, and jbd2_journal_commit
3965     + * b. racing with ->readpage, ->write_begin, and ext4_get_block
3966     + * in move_extent_per_page
3967     + */
3968     + double_up_write_data_sem(orig_inode, donor_inode);
3969    
3970     while (orig_page_offset <= seq_end_page) {
3971    
3972     /* Swap original branches with new branches */
3973     - ret = move_extent_par_page(o_filp, donor_inode,
3974     + block_len_in_page = move_extent_per_page(
3975     + o_filp, donor_inode,
3976     orig_page_offset,
3977     data_offset_in_page,
3978     - block_len_in_page, uninit);
3979     - if (ret < 0)
3980     - goto out;
3981     - orig_page_offset++;
3982     + block_len_in_page, uninit,
3983     + &ret1);
3984     +
3985     /* Count how many blocks we have exchanged */
3986     *moved_len += block_len_in_page;
3987     - BUG_ON(*moved_len > len);
3988     + if (ret1 < 0)
3989     + break;
3990     + if (*moved_len > len) {
3991     + ext4_error(orig_inode->i_sb, __func__,
3992     + "Replaced more blocks than requested! "
3993     + "sum of replaced: %llu requested: %llu",
3994     + *moved_len, len);
3995     + ret1 = -EIO;
3996     + break;
3997     + }
3998    
3999     + orig_page_offset++;
4000     data_offset_in_page = 0;
4001     rest_blocks -= block_len_in_page;
4002     if (rest_blocks > blocks_per_page)
4003     @@ -1277,20 +1368,23 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
4004     block_len_in_page = rest_blocks;
4005     }
4006    
4007     + double_down_write_data_sem(orig_inode, donor_inode);
4008     + if (ret1 < 0)
4009     + break;
4010     +
4011     /* Decrease buffer counter */
4012     if (holecheck_path)
4013     ext4_ext_drop_refs(holecheck_path);
4014     - get_ext_path(holecheck_path, orig_inode,
4015     - seq_start, ret);
4016     - if (holecheck_path == NULL)
4017     + ret1 = get_ext_path(orig_inode, seq_start, &holecheck_path);
4018     + if (ret1)
4019     break;
4020     depth = holecheck_path->p_depth;
4021    
4022     /* Decrease buffer counter */
4023     if (orig_path)
4024     ext4_ext_drop_refs(orig_path);
4025     - get_ext_path(orig_path, orig_inode, seq_start, ret);
4026     - if (orig_path == NULL)
4027     + ret1 = get_ext_path(orig_inode, seq_start, &orig_path);
4028     + if (ret1)
4029     break;
4030    
4031     ext_cur = holecheck_path[depth].p_ext;
4032     @@ -1299,6 +1393,11 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
4033    
4034     }
4035     out:
4036     + if (*moved_len) {
4037     + ext4_discard_preallocations(orig_inode);
4038     + ext4_discard_preallocations(donor_inode);
4039     + }
4040     +
4041     if (orig_path) {
4042     ext4_ext_drop_refs(orig_path);
4043     kfree(orig_path);
4044     @@ -1307,14 +1406,13 @@ out:
4045     ext4_ext_drop_refs(holecheck_path);
4046     kfree(holecheck_path);
4047     }
4048     -out2:
4049     - mext_inode_double_unlock(orig_inode, donor_inode);
4050     -
4051     - if (ret)
4052     - return ret;
4053     + double_up_write_data_sem(orig_inode, donor_inode);
4054     + ret2 = mext_inode_double_unlock(orig_inode, donor_inode);
4055    
4056     - /* All of the specified blocks must be exchanged in succeed */
4057     - BUG_ON(*moved_len != len);
4058     + if (ret1)
4059     + return ret1;
4060     + else if (ret2)
4061     + return ret2;
4062    
4063     return 0;
4064     }
4065     diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
4066     index de04013..9dcd686 100644
4067     --- a/fs/ext4/namei.c
4068     +++ b/fs/ext4/namei.c
4069     @@ -1292,9 +1292,6 @@ errout:
4070     * add_dirent_to_buf will attempt search the directory block for
4071     * space. It will return -ENOSPC if no space is available, and -EIO
4072     * and -EEXIST if directory entry already exists.
4073     - *
4074     - * NOTE! bh is NOT released in the case where ENOSPC is returned. In
4075     - * all other cases bh is released.
4076     */
4077     static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
4078     struct inode *inode, struct ext4_dir_entry_2 *de,
4079     @@ -1315,14 +1312,10 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
4080     top = bh->b_data + blocksize - reclen;
4081     while ((char *) de <= top) {
4082     if (!ext4_check_dir_entry("ext4_add_entry", dir, de,
4083     - bh, offset)) {
4084     - brelse(bh);
4085     + bh, offset))
4086     return -EIO;
4087     - }
4088     - if (ext4_match(namelen, name, de)) {
4089     - brelse(bh);
4090     + if (ext4_match(namelen, name, de))
4091     return -EEXIST;
4092     - }
4093     nlen = EXT4_DIR_REC_LEN(de->name_len);
4094     rlen = ext4_rec_len_from_disk(de->rec_len, blocksize);
4095     if ((de->inode? rlen - nlen: rlen) >= reclen)
4096     @@ -1337,7 +1330,6 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
4097     err = ext4_journal_get_write_access(handle, bh);
4098     if (err) {
4099     ext4_std_error(dir->i_sb, err);
4100     - brelse(bh);
4101     return err;
4102     }
4103    
4104     @@ -1377,7 +1369,6 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
4105     err = ext4_handle_dirty_metadata(handle, dir, bh);
4106     if (err)
4107     ext4_std_error(dir->i_sb, err);
4108     - brelse(bh);
4109     return 0;
4110     }
4111    
4112     @@ -1471,7 +1462,9 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
4113     if (!(de))
4114     return retval;
4115    
4116     - return add_dirent_to_buf(handle, dentry, inode, de, bh);
4117     + retval = add_dirent_to_buf(handle, dentry, inode, de, bh);
4118     + brelse(bh);
4119     + return retval;
4120     }
4121    
4122     /*
4123     @@ -1514,8 +1507,10 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
4124     if(!bh)
4125     return retval;
4126     retval = add_dirent_to_buf(handle, dentry, inode, NULL, bh);
4127     - if (retval != -ENOSPC)
4128     + if (retval != -ENOSPC) {
4129     + brelse(bh);
4130     return retval;
4131     + }
4132    
4133     if (blocks == 1 && !dx_fallback &&
4134     EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_DIR_INDEX))
4135     @@ -1528,7 +1523,9 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
4136     de = (struct ext4_dir_entry_2 *) bh->b_data;
4137     de->inode = 0;
4138     de->rec_len = ext4_rec_len_to_disk(blocksize, blocksize);
4139     - return add_dirent_to_buf(handle, dentry, inode, de, bh);
4140     + retval = add_dirent_to_buf(handle, dentry, inode, de, bh);
4141     + brelse(bh);
4142     + return retval;
4143     }
4144    
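The add_dirent_to_buf() rework above moves buffer-head release out of the callee: the helper never drops bh, and each caller performs exactly one brelse() on every path, which is much harder to leak. A sketch of that ownership rule; buf_get(), buf_put() and add_entry_to_buf() are hypothetical stand-ins for sb_bread(), brelse() and add_dirent_to_buf():

    #include <errno.h>
    #include <stdlib.h>

    struct buf { char data[64]; };

    /* Hypothetical stand-ins for sb_bread()/brelse(). */
    static struct buf *buf_get(int blk)
    {
        (void)blk;
        return calloc(1, sizeof(struct buf));
    }

    static void buf_put(struct buf *bh)
    {
        free(bh);
    }

    /* Like the reworked add_dirent_to_buf(): may fail, never drops bh. */
    static int add_entry_to_buf(struct buf *bh)
    {
        (void)bh;
        return 0;
    }

    static int add_entry(int blk)
    {
        struct buf *bh = buf_get(blk);
        int err;

        if (!bh)
            return -EIO;
        err = add_entry_to_buf(bh);
        buf_put(bh);    /* one unconditional release on every path */
        return err;
    }

    int main(void)
    {
        return add_entry(1);
    }
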
4145     /*
4146     @@ -1561,10 +1558,8 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
4147     goto journal_error;
4148    
4149     err = add_dirent_to_buf(handle, dentry, inode, NULL, bh);
4150     - if (err != -ENOSPC) {
4151     - bh = NULL;
4152     + if (err != -ENOSPC)
4153     goto cleanup;
4154     - }
4155    
4156     /* Block full, should compress but for now just split */
4157     dxtrace(printk(KERN_DEBUG "using %u of %u node entries\n",
4158     @@ -1590,9 +1585,9 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
4159     goto cleanup;
4160     node2 = (struct dx_node *)(bh2->b_data);
4161     entries2 = node2->entries;
4162     + memset(&node2->fake, 0, sizeof(struct fake_dirent));
4163     node2->fake.rec_len = ext4_rec_len_to_disk(sb->s_blocksize,
4164     sb->s_blocksize);
4165     - node2->fake.inode = 0;
4166     BUFFER_TRACE(frame->bh, "get_write_access");
4167     err = ext4_journal_get_write_access(handle, frame->bh);
4168     if (err)
4169     @@ -1657,7 +1652,6 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
4170     if (!de)
4171     goto cleanup;
4172     err = add_dirent_to_buf(handle, dentry, inode, de, bh);
4173     - bh = NULL;
4174     goto cleanup;
4175    
4176     journal_error:
4177     @@ -1775,7 +1769,7 @@ static int ext4_create(struct inode *dir, struct dentry *dentry, int mode,
4178     retry:
4179     handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
4180     EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
4181     - 2*EXT4_QUOTA_INIT_BLOCKS(dir->i_sb));
4182     + EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb));
4183     if (IS_ERR(handle))
4184     return PTR_ERR(handle);
4185    
4186     @@ -1809,7 +1803,7 @@ static int ext4_mknod(struct inode *dir, struct dentry *dentry,
4187     retry:
4188     handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
4189     EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
4190     - 2*EXT4_QUOTA_INIT_BLOCKS(dir->i_sb));
4191     + EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb));
4192     if (IS_ERR(handle))
4193     return PTR_ERR(handle);
4194    
4195     @@ -1846,7 +1840,7 @@ static int ext4_mkdir(struct inode *dir, struct dentry *dentry, int mode)
4196     retry:
4197     handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
4198     EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
4199     - 2*EXT4_QUOTA_INIT_BLOCKS(dir->i_sb));
4200     + EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb));
4201     if (IS_ERR(handle))
4202     return PTR_ERR(handle);
4203    
4204     @@ -2068,7 +2062,8 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode)
4205     struct ext4_iloc iloc;
4206     int err = 0;
4207    
4208     - if (!ext4_handle_valid(handle))
4209     + /* ext4_handle_valid() assumes a valid handle_t pointer */
4210     + if (handle && !ext4_handle_valid(handle))
4211     return 0;
4212    
4213     mutex_lock(&EXT4_SB(inode->i_sb)->s_orphan_lock);
4214     @@ -2258,7 +2253,7 @@ static int ext4_symlink(struct inode *dir,
4215     retry:
4216     handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
4217     EXT4_INDEX_EXTRA_TRANS_BLOCKS + 5 +
4218     - 2*EXT4_QUOTA_INIT_BLOCKS(dir->i_sb));
4219     + EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb));
4220     if (IS_ERR(handle))
4221     return PTR_ERR(handle);
4222    
4223     @@ -2310,7 +2305,7 @@ static int ext4_link(struct dentry *old_dentry,
4224     struct inode *inode = old_dentry->d_inode;
4225     int err, retries = 0;
4226    
4227     - if (EXT4_DIR_LINK_MAX(inode))
4228     + if (inode->i_nlink >= EXT4_LINK_MAX)
4229     return -EMLINK;
4230    
4231     /*
4232     @@ -2413,7 +2408,7 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
4233     goto end_rename;
4234     retval = -EMLINK;
4235     if (!new_inode && new_dir != old_dir &&
4236     - new_dir->i_nlink >= EXT4_LINK_MAX)
4237     + EXT4_DIR_LINK_MAX(new_dir))
4238     goto end_rename;
4239     }
4240     if (!new_bh) {
4241     diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
4242     index 68b0351..96302cd 100644
4243     --- a/fs/ext4/resize.c
4244     +++ b/fs/ext4/resize.c
4245     @@ -247,7 +247,7 @@ static int setup_new_group_blocks(struct super_block *sb,
4246     goto exit_bh;
4247    
4248     if (IS_ERR(gdb = bclean(handle, sb, block))) {
4249     - err = PTR_ERR(bh);
4250     + err = PTR_ERR(gdb);
4251     goto exit_bh;
4252     }
4253     ext4_handle_dirty_metadata(handle, NULL, gdb);
4254     diff --git a/fs/ext4/super.c b/fs/ext4/super.c
4255     index 8f4f079..ed38f25 100644
4256     --- a/fs/ext4/super.c
4257     +++ b/fs/ext4/super.c
4258     @@ -45,6 +45,7 @@
4259     #include "ext4_jbd2.h"
4260     #include "xattr.h"
4261     #include "acl.h"
4262     +#include "mballoc.h"
4263    
4264     #define CREATE_TRACE_POINTS
4265     #include <trace/events/ext4.h>
4266     @@ -188,6 +189,36 @@ void ext4_itable_unused_set(struct super_block *sb,
4267     bg->bg_itable_unused_hi = cpu_to_le16(count >> 16);
4268     }
4269    
4270     +
4271     +/* Just increment the non-pointer handle value */
4272     +static handle_t *ext4_get_nojournal(void)
4273     +{
4274     + handle_t *handle = current->journal_info;
4275     + unsigned long ref_cnt = (unsigned long)handle;
4276     +
4277     + BUG_ON(ref_cnt >= EXT4_NOJOURNAL_MAX_REF_COUNT);
4278     +
4279     + ref_cnt++;
4280     + handle = (handle_t *)ref_cnt;
4281     +
4282     + current->journal_info = handle;
4283     + return handle;
4284     +}
4285     +
4286     +
4287     +/* Decrement the non-pointer handle value */
4288     +static void ext4_put_nojournal(handle_t *handle)
4289     +{
4290     + unsigned long ref_cnt = (unsigned long)handle;
4291     +
4292     + BUG_ON(ref_cnt == 0);
4293     +
4294     + ref_cnt--;
4295     + handle = (handle_t *)ref_cnt;
4296     +
4297     + current->journal_info = handle;
4298     +}
4299     +
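
Since a filesystem running without a journal has no real handle_t to hand out, ext4_get_nojournal() above stores a small recursion counter in the pointer value itself; the value is only ever compared and counted, never dereferenced. A runnable userspace sketch of the trick; MAX_REF merely mirrors the spirit of EXT4_NOJOURNAL_MAX_REF_COUNT, and the integer/pointer casts assume a platform where they round-trip, as the kernel does:

    #include <assert.h>
    #include <stdio.h>

    #define MAX_REF 16 /* stands in for EXT4_NOJOURNAL_MAX_REF_COUNT */

    typedef struct handle handle_t;  /* opaque: never dereferenced */
    static handle_t *journal_info;   /* stands in for current->journal_info */

    static handle_t *get_nojournal(void)
    {
        unsigned long ref = (unsigned long)journal_info;

        assert(ref < MAX_REF);
        journal_info = (handle_t *)(ref + 1);
        return journal_info;
    }

    static void put_nojournal(handle_t *h)
    {
        unsigned long ref = (unsigned long)h;

        assert(ref > 0);
        journal_info = (handle_t *)(ref - 1);
    }

    int main(void)
    {
        handle_t *h1 = get_nojournal();
        handle_t *h2 = get_nojournal(); /* nested "transaction" */

        put_nojournal(h2);
        put_nojournal(h1);
        printf("depth back to %lu\n", (unsigned long)journal_info);
        return 0;
    }
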
4300     /*
4301     * Wrappers for jbd2_journal_start/end.
4302     *
4303     @@ -214,11 +245,7 @@ handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks)
4304     }
4305     return jbd2_journal_start(journal, nblocks);
4306     }
4307     - /*
4308     - * We're not journaling, return the appropriate indication.
4309     - */
4310     - current->journal_info = EXT4_NOJOURNAL_HANDLE;
4311     - return current->journal_info;
4312     + return ext4_get_nojournal();
4313     }
4314    
4315     /*
4316     @@ -234,11 +261,7 @@ int __ext4_journal_stop(const char *where, handle_t *handle)
4317     int rc;
4318    
4319     if (!ext4_handle_valid(handle)) {
4320     - /*
4321     - * Do this here since we don't call jbd2_journal_stop() in
4322     - * no-journal mode.
4323     - */
4324     - current->journal_info = NULL;
4325     + ext4_put_nojournal(handle);
4326     return 0;
4327     }
4328     sb = handle->h_transaction->t_journal->j_private;
4329     @@ -344,7 +367,8 @@ static const char *ext4_decode_error(struct super_block *sb, int errno,
4330     errstr = "Out of memory";
4331     break;
4332     case -EROFS:
4333     - if (!sb || EXT4_SB(sb)->s_journal->j_flags & JBD2_ABORT)
4334     + if (!sb || (EXT4_SB(sb)->s_journal &&
4335     + EXT4_SB(sb)->s_journal->j_flags & JBD2_ABORT))
4336     errstr = "Journal has aborted";
4337     else
4338     errstr = "Readonly filesystem";
4339     @@ -578,15 +602,14 @@ static void ext4_put_super(struct super_block *sb)
4340     struct ext4_super_block *es = sbi->s_es;
4341     int i, err;
4342    
4343     + flush_workqueue(sbi->dio_unwritten_wq);
4344     + destroy_workqueue(sbi->dio_unwritten_wq);
4345     +
4346     lock_super(sb);
4347     lock_kernel();
4348     if (sb->s_dirt)
4349     ext4_commit_super(sb, 1);
4350    
4351     - ext4_release_system_zone(sb);
4352     - ext4_mb_release(sb);
4353     - ext4_ext_release(sb);
4354     - ext4_xattr_put_super(sb);
4355     if (sbi->s_journal) {
4356     err = jbd2_journal_destroy(sbi->s_journal);
4357     sbi->s_journal = NULL;
4358     @@ -594,6 +617,12 @@ static void ext4_put_super(struct super_block *sb)
4359     ext4_abort(sb, __func__,
4360     "Couldn't clean up the journal");
4361     }
4362     +
4363     + ext4_release_system_zone(sb);
4364     + ext4_mb_release(sb);
4365     + ext4_ext_release(sb);
4366     + ext4_xattr_put_super(sb);
4367     +
4368     if (!(sb->s_flags & MS_RDONLY)) {
4369     EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
4370     es->s_state = cpu_to_le16(sbi->s_mount_state);
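
The reordering in ext4_put_super() follows one rule: drain or destroy anything that can still generate work before freeing the state that work would touch. The DIO-unwritten workqueue is flushed and destroyed first, and the mballoc/extent/xattr state is now released only after jbd2_journal_destroy(), whose final commit may still use it. A toy sketch of the ordering constraint (structure and names invented):

#include <stdio.h>

/* Invented stand-in: step N may still use state that step N+1 frees,
 * mirroring the reordered ext4_put_super() teardown. */
struct toyfs { int mb_released; };

static void flush_and_destroy_wq(struct toyfs *fs)
{
	(void)fs;  /* 1: no more async completions can be queued */
}

static void destroy_journal(struct toyfs *fs)
{
	/* 2: the final journal commit may still allocate/free blocks */
	if (fs->mb_released)
		fprintf(stderr, "use-after-free: mb state already gone\n");
}

static void release_mb_state(struct toyfs *fs)
{
	fs->mb_released = 1;  /* 3: allocator state is now truly idle */
}

int main(void)
{
	struct toyfs fs = { 0 };

	flush_and_destroy_wq(&fs);
	destroy_journal(&fs);
	release_mb_state(&fs);
	puts("teardown ordered correctly");
	return 0;
}
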
4371     @@ -682,6 +711,10 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
4372     ei->i_allocated_meta_blocks = 0;
4373     ei->i_delalloc_reserved_flag = 0;
4374     spin_lock_init(&(ei->i_block_reservation_lock));
4375     + INIT_LIST_HEAD(&ei->i_aio_dio_complete_list);
4376     + ei->cur_aio_dio = NULL;
4377     + ei->i_sync_tid = 0;
4378     + ei->i_datasync_tid = 0;
4379    
4380     return &ei->vfs_inode;
4381     }
4382     @@ -877,6 +910,12 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
4383     if (test_opt(sb, NO_AUTO_DA_ALLOC))
4384     seq_puts(seq, ",noauto_da_alloc");
4385    
4386     + if (test_opt(sb, DISCARD))
4387     + seq_puts(seq, ",discard");
4388     +
4389     + if (test_opt(sb, NOLOAD))
4390     + seq_puts(seq, ",norecovery");
4391     +
4392     ext4_show_quota_options(seq, sb);
4393    
4394     return 0;
4395     @@ -1057,7 +1096,8 @@ enum {
4396     Opt_usrquota, Opt_grpquota, Opt_i_version,
4397     Opt_stripe, Opt_delalloc, Opt_nodelalloc,
4398     Opt_block_validity, Opt_noblock_validity,
4399     - Opt_inode_readahead_blks, Opt_journal_ioprio
4400     + Opt_inode_readahead_blks, Opt_journal_ioprio,
4401     + Opt_discard, Opt_nodiscard,
4402     };
4403    
4404     static const match_table_t tokens = {
4405     @@ -1082,6 +1122,7 @@ static const match_table_t tokens = {
4406     {Opt_acl, "acl"},
4407     {Opt_noacl, "noacl"},
4408     {Opt_noload, "noload"},
4409     + {Opt_noload, "norecovery"},
4410     {Opt_nobh, "nobh"},
4411     {Opt_bh, "bh"},
4412     {Opt_commit, "commit=%u"},
4413     @@ -1123,6 +1164,8 @@ static const match_table_t tokens = {
4414     {Opt_auto_da_alloc, "auto_da_alloc=%u"},
4415     {Opt_auto_da_alloc, "auto_da_alloc"},
4416     {Opt_noauto_da_alloc, "noauto_da_alloc"},
4417     + {Opt_discard, "discard"},
4418     + {Opt_nodiscard, "nodiscard"},
4419     {Opt_err, NULL},
4420     };
4421    
4422     @@ -1551,6 +1594,12 @@ set_qf_format:
4423     else
4424     set_opt(sbi->s_mount_opt,NO_AUTO_DA_ALLOC);
4425     break;
4426     + case Opt_discard:
4427     + set_opt(sbi->s_mount_opt, DISCARD);
4428     + break;
4429     + case Opt_nodiscard:
4430     + clear_opt(sbi->s_mount_opt, DISCARD);
4431     + break;
4432     default:
4433     ext4_msg(sb, KERN_ERR,
4434     "Unrecognized mount option \"%s\" "
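
Two things are wired into the option parser above: "norecovery" becomes a second spelling for the existing Opt_noload token, and the new discard/nodiscard pair sets or clears a single bit in s_mount_opt. A compact userspace sketch of this alias-plus-bitmask pattern (the option names are real; everything else is hypothetical):

#include <stdio.h>
#include <string.h>

#define OPT_NOLOAD  0x0001
#define OPT_DISCARD 0x0002

#define set_opt(o, bit)   ((o) |= (bit))
#define clear_opt(o, bit) ((o) &= ~(bit))

/* Map several spellings onto one option bit, as the token table
 * does for "noload"/"norecovery". */
static int parse_opt(unsigned long *opts, const char *tok)
{
	if (!strcmp(tok, "noload") || !strcmp(tok, "norecovery"))
		set_opt(*opts, OPT_NOLOAD);
	else if (!strcmp(tok, "discard"))
		set_opt(*opts, OPT_DISCARD);
	else if (!strcmp(tok, "nodiscard"))
		clear_opt(*opts, OPT_DISCARD);
	else
		return -1;  /* unrecognized option */
	return 0;
}

int main(void)
{
	unsigned long opts = 0;

	parse_opt(&opts, "norecovery");
	parse_opt(&opts, "discard");
	parse_opt(&opts, "nodiscard");
	printf("noload=%d discard=%d\n",
	       !!(opts & OPT_NOLOAD), !!(opts & OPT_DISCARD));
	return 0;
}

The matching ext4_show_options() hunk prints the canonical spelling back out, so the active options round-trip through /proc/mounts.
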
4435     @@ -1666,14 +1715,14 @@ static int ext4_fill_flex_info(struct super_block *sb)
4436     size_t size;
4437     int i;
4438    
4439     - if (!sbi->s_es->s_log_groups_per_flex) {
4440     + sbi->s_log_groups_per_flex = sbi->s_es->s_log_groups_per_flex;
4441     + groups_per_flex = 1 << sbi->s_log_groups_per_flex;
4442     +
4443     + if (groups_per_flex < 2) {
4444     sbi->s_log_groups_per_flex = 0;
4445     return 1;
4446     }
4447    
4448     - sbi->s_log_groups_per_flex = sbi->s_es->s_log_groups_per_flex;
4449     - groups_per_flex = 1 << sbi->s_log_groups_per_flex;
4450     -
4451     /* We allocate both existing and potentially added groups */
4452     flex_group_count = ((sbi->s_groups_count + groups_per_flex - 1) +
4453     ((le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) + 1) <<
4454     @@ -1695,12 +1744,12 @@ static int ext4_fill_flex_info(struct super_block *sb)
4455     gdp = ext4_get_group_desc(sb, i, NULL);
4456    
4457     flex_group = ext4_flex_group(sbi, i);
4458     - atomic_set(&sbi->s_flex_groups[flex_group].free_inodes,
4459     - ext4_free_inodes_count(sb, gdp));
4460     - atomic_set(&sbi->s_flex_groups[flex_group].free_blocks,
4461     - ext4_free_blks_count(sb, gdp));
4462     - atomic_set(&sbi->s_flex_groups[flex_group].used_dirs,
4463     - ext4_used_dirs_count(sb, gdp));
4464     + atomic_add(ext4_free_inodes_count(sb, gdp),
4465     + &sbi->s_flex_groups[flex_group].free_inodes);
4466     + atomic_add(ext4_free_blks_count(sb, gdp),
4467     + &sbi->s_flex_groups[flex_group].free_blocks);
4468     + atomic_add(ext4_used_dirs_count(sb, gdp),
4469     + &sbi->s_flex_groups[flex_group].used_dirs);
4470     }
4471    
4472     return 1;
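
The switch from atomic_set() to atomic_add() is the substance of this hunk: several block groups fold into one flex group, so the loop visits each s_flex_groups[] slot groups_per_flex times, and atomic_set() kept overwriting the running totals with the last group's counts instead of summing them. A toy demonstration with C11 atomics (group counts invented):

#include <stdatomic.h>
#include <stdio.h>

#define GROUPS              8
#define LOG_GROUPS_PER_FLEX 2  /* 4 block groups per flex group */

int main(void)
{
	unsigned free_inodes[GROUPS] = {10, 20, 30, 40, 50, 60, 70, 80};
	atomic_uint flex_free[GROUPS >> LOG_GROUPS_PER_FLEX] = {0};

	for (int g = 0; g < GROUPS; g++) {
		int flex = g >> LOG_GROUPS_PER_FLEX;  /* ext4_flex_group() */

		/* atomic_add accumulates; atomic_set here would leave
		 * only the last group's value (the bug being fixed). */
		atomic_fetch_add(&flex_free[flex], free_inodes[g]);
	}

	for (int f = 0; f < GROUPS >> LOG_GROUPS_PER_FLEX; f++)
		printf("flex %d: %u free inodes\n", f,
		       atomic_load(&flex_free[f]));
	return 0;  /* prints 100 and 260, the per-flex sums */
}
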
4473     @@ -2197,6 +2246,7 @@ EXT4_RW_ATTR_SBI_UI(mb_min_to_scan, s_mb_min_to_scan);
4474     EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs);
4475     EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request);
4476     EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc);
4477     +EXT4_RW_ATTR_SBI_UI(max_writeback_mb_bump, s_max_writeback_mb_bump);
4478    
4479     static struct attribute *ext4_attrs[] = {
4480     ATTR_LIST(delayed_allocation_blocks),
4481     @@ -2210,6 +2260,7 @@ static struct attribute *ext4_attrs[] = {
4482     ATTR_LIST(mb_order2_req),
4483     ATTR_LIST(mb_stream_req),
4484     ATTR_LIST(mb_group_prealloc),
4485     + ATTR_LIST(max_writeback_mb_bump),
4486     NULL,
4487     };
4488    
4489     @@ -2253,6 +2304,49 @@ static struct kobj_type ext4_ktype = {
4490     .release = ext4_sb_release,
4491     };
4492    
4493     +/*
4494     + * Check whether this filesystem can be mounted based on
4495     + * the features present and the RDONLY/RDWR mount requested.
4496     + * Returns 1 if this filesystem can be mounted as requested,
4497     + * 0 if it cannot be.
4498     + */
4499     +static int ext4_feature_set_ok(struct super_block *sb, int readonly)
4500     +{
4501     + if (EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT4_FEATURE_INCOMPAT_SUPP)) {
4502     + ext4_msg(sb, KERN_ERR,
4503     + "Couldn't mount because of "
4504     + "unsupported optional features (%x)",
4505     + (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_incompat) &
4506     + ~EXT4_FEATURE_INCOMPAT_SUPP));
4507     + return 0;
4508     + }
4509     +
4510     + if (readonly)
4511     + return 1;
4512     +
4513     + /* Check that feature set is OK for a read-write mount */
4514     + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT4_FEATURE_RO_COMPAT_SUPP)) {
4515     + ext4_msg(sb, KERN_ERR, "couldn't mount RDWR because of "
4516     + "unsupported optional features (%x)",
4517     + (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_ro_compat) &
4518     + ~EXT4_FEATURE_RO_COMPAT_SUPP));
4519     + return 0;
4520     + }
4521     + /*
4522     + * A file system with the huge_file feature enabled can only be mounted
4523     + * read-write on 32-bit systems if the kernel is built with CONFIG_LBDAF.
4524     + */
4525     + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_HUGE_FILE)) {
4526     + if (sizeof(blkcnt_t) < sizeof(u64)) {
4527     + ext4_msg(sb, KERN_ERR, "Filesystem with huge files "
4528     + "cannot be mounted RDWR without "
4529     + "CONFIG_LBDAF");
4530     + return 0;
4531     + }
4532     + }
4533     + return 1;
4534     +}
4535     +
4536     static int ext4_fill_super(struct super_block *sb, void *data, int silent)
4537     __releases(kernel_lock)
4538     __acquires(kernel_lock)
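
ext4_feature_set_ok() folds the mount-time and remount-time policy into one helper. The core idiom is a supported-feature bitmask: ANDing the on-disk flags with the complement of what the kernel supports leaves exactly the unknown bits, and ro-compat bits only gate read-write mounts. A hedged sketch with made-up masks:

#include <stdint.h>
#include <stdio.h>

#define INCOMPAT_SUPP  0x000003ffu  /* hypothetical supported masks */
#define RO_COMPAT_SUPP 0x0000007fu

/* Returns 1 if mountable as requested, 0 otherwise. */
static int feature_set_ok(uint32_t incompat, uint32_t ro_compat, int readonly)
{
	if (incompat & ~INCOMPAT_SUPP) {
		fprintf(stderr, "unsupported incompat features (%x)\n",
			incompat & ~INCOMPAT_SUPP);
		return 0;  /* cannot be mounted at all */
	}
	if (readonly)
		return 1;  /* ro-compat bits only gate read-write */
	if (ro_compat & ~RO_COMPAT_SUPP) {
		fprintf(stderr, "unsupported ro-compat features (%x)\n",
			ro_compat & ~RO_COMPAT_SUPP);
		return 0;  /* a read-only mount would still succeed */
	}
	return 1;
}

int main(void)
{
	/* One unknown ro-compat bit: read-only OK, read-write refused. */
	printf("ro: %d rw: %d\n",
	       feature_set_ok(0x1, 0x100, 1),
	       feature_set_ok(0x1, 0x100, 0));
	return 0;
}
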
4539     @@ -2274,7 +2368,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
4540     unsigned int db_count;
4541     unsigned int i;
4542     int needs_recovery, has_huge_files;
4543     - int features;
4544     __u64 blocks_count;
4545     int err;
4546     unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
4547     @@ -2401,39 +2494,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
4548     * previously didn't change the revision level when setting the flags,
4549     * so there is a chance incompat flags are set on a rev 0 filesystem.
4550     */
4551     - features = EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT4_FEATURE_INCOMPAT_SUPP);
4552     - if (features) {
4553     - ext4_msg(sb, KERN_ERR,
4554     - "Couldn't mount because of "
4555     - "unsupported optional features (%x)",
4556     - (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_incompat) &
4557     - ~EXT4_FEATURE_INCOMPAT_SUPP));
4558     - goto failed_mount;
4559     - }
4560     - features = EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT4_FEATURE_RO_COMPAT_SUPP);
4561     - if (!(sb->s_flags & MS_RDONLY) && features) {
4562     - ext4_msg(sb, KERN_ERR,
4563     - "Couldn't mount RDWR because of "
4564     - "unsupported optional features (%x)",
4565     - (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_ro_compat) &
4566     - ~EXT4_FEATURE_RO_COMPAT_SUPP));
4567     + if (!ext4_feature_set_ok(sb, (sb->s_flags & MS_RDONLY)))
4568     goto failed_mount;
4569     - }
4570     - has_huge_files = EXT4_HAS_RO_COMPAT_FEATURE(sb,
4571     - EXT4_FEATURE_RO_COMPAT_HUGE_FILE);
4572     - if (has_huge_files) {
4573     - /*
4574     - * Large file size enabled file system can only be
4575     - * mount if kernel is build with CONFIG_LBDAF
4576     - */
4577     - if (sizeof(root->i_blocks) < sizeof(u64) &&
4578     - !(sb->s_flags & MS_RDONLY)) {
4579     - ext4_msg(sb, KERN_ERR, "Filesystem with huge "
4580     - "files cannot be mounted read-write "
4581     - "without CONFIG_LBDAF");
4582     - goto failed_mount;
4583     - }
4584     - }
4585     +
4586     blocksize = BLOCK_SIZE << le32_to_cpu(es->s_log_block_size);
4587    
4588     if (blocksize < EXT4_MIN_BLOCK_SIZE ||
4589     @@ -2469,6 +2532,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
4590     }
4591     }
4592    
4593     + has_huge_files = EXT4_HAS_RO_COMPAT_FEATURE(sb,
4594     + EXT4_FEATURE_RO_COMPAT_HUGE_FILE);
4595     sbi->s_bitmap_maxbytes = ext4_max_bitmap_size(sb->s_blocksize_bits,
4596     has_huge_files);
4597     sb->s_maxbytes = ext4_max_size(sb->s_blocksize_bits, has_huge_files);
4598     @@ -2549,12 +2614,19 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
4599     goto failed_mount;
4600     }
4601    
4602     - if (ext4_blocks_count(es) >
4603     - (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) {
4604     + /*
4605     + * Test whether we have more sectors than will fit in sector_t,
4606     + * and whether the max offset is addressable by the page cache.
4607     + */
4608     + if ((ext4_blocks_count(es) >
4609     + (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) ||
4610     + (ext4_blocks_count(es) >
4611     + (pgoff_t)(~0ULL) >> (PAGE_CACHE_SHIFT - sb->s_blocksize_bits))) {
4612     ext4_msg(sb, KERN_ERR, "filesystem"
4613     - " too large to mount safely");
4614     + " too large to mount safely on this system");
4615     if (sizeof(sector_t) < 8)
4616     ext4_msg(sb, KERN_WARNING, "CONFIG_LBDAF not enabled");
4617     + ret = -EFBIG;
4618     goto failed_mount;
4619     }
4620    
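
The widened test above refuses to mount once the block count overflows either of two types: sector_t (the device is addressed in 512-byte sectors) or pgoff_t (the page cache indexes files by page). A sketch of both limits, using hypothetical stand-ins for the kernel types and assuming 4 KiB pages and 9 <= blocksize_bits <= PAGE_SHIFT:

#include <stdint.h>
#include <stdio.h>

typedef uint64_t sector_t;      /* as with CONFIG_LBDAF=y */
typedef unsigned long pgoff_t;  /* only 32 bits on a 32-bit kernel */
#define PAGE_SHIFT 12           /* 4 KiB pages */

/* Mountable only if the block count fits both limits. */
static int size_fits(uint64_t blocks, unsigned int blocksize_bits)
{
	/* most 512-byte sectors a sector_t can address */
	sector_t max_sectors = (sector_t)~0ULL >> (blocksize_bits - 9);
	/* most blocks whose page-cache index fits in pgoff_t */
	pgoff_t max_pages = (pgoff_t)~0UL >> (PAGE_SHIFT - blocksize_bits);

	return blocks <= max_sectors && blocks <= (uint64_t)max_pages;
}

int main(void)
{
	/* 2^32 blocks of 4 KiB = 16 TiB: fits on 64-bit, but would
	 * overflow a 32-bit pgoff_t, hence the new second clause. */
	printf("16 TiB fits: %d\n", size_fits(1ULL << 32, 12));
	return 0;
}
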
4621     @@ -2595,6 +2667,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
4622     goto failed_mount;
4623     }
4624     sbi->s_groups_count = blocks_count;
4625     + sbi->s_blockfile_groups = min_t(ext4_group_t, sbi->s_groups_count,
4626     + (EXT4_MAX_BLOCK_FILE_PHYS / EXT4_BLOCKS_PER_GROUP(sb)));
4627     db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) /
4628     EXT4_DESC_PER_BLOCK(sb);
4629     sbi->s_group_desc = kmalloc(db_count * sizeof(struct buffer_head *),
4630     @@ -2656,6 +2730,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
4631     }
4632    
4633     sbi->s_stripe = ext4_get_stripe_size(sbi);
4634     + sbi->s_max_writeback_mb_bump = 128;
4635    
4636     /*
4637     * set up enough so that it can read an inode
4638     @@ -2781,6 +2856,12 @@ no_journal:
4639     clear_opt(sbi->s_mount_opt, NOBH);
4640     }
4641     }
4642     + EXT4_SB(sb)->dio_unwritten_wq = create_workqueue("ext4-dio-unwritten");
4643     + if (!EXT4_SB(sb)->dio_unwritten_wq) {
4644     + printk(KERN_ERR "EXT4-fs: failed to create DIO workqueue\n");
4645     + goto failed_mount_wq;
4646     + }
4647     +
4648     /*
4649     * The jbd2_journal_load will have done any necessary log recovery,
4650     * so we can safely mount the rest of the filesystem now.
4651     @@ -2893,6 +2974,8 @@ cantfind_ext4:
4652    
4653     failed_mount4:
4654     ext4_msg(sb, KERN_ERR, "mount failed");
4655     + destroy_workqueue(EXT4_SB(sb)->dio_unwritten_wq);
4656     +failed_mount_wq:
4657     ext4_release_system_zone(sb);
4658     if (sbi->s_journal) {
4659     jbd2_journal_destroy(sbi->s_journal);
4660     @@ -3208,7 +3291,18 @@ static int ext4_commit_super(struct super_block *sb, int sync)
4661     clear_buffer_write_io_error(sbh);
4662     set_buffer_uptodate(sbh);
4663     }
4664     - es->s_wtime = cpu_to_le32(get_seconds());
4665     + /*
4666     + * If the file system is mounted read-only, don't update the
4667     + * superblock write time. This avoids updating the superblock
4668     + * write time when we are mounting the root file system
4669     + * read/only but we need to replay the journal; at that point,
4670     + * for people who are east of GMT and who make their clock
4671     + * tick in localtime for Windows bug-for-bug compatibility,
4672     + * the clock is set in the future, and this will cause e2fsck
4673     + * to complain and force a full file system check.
4674     + */
4675     + if (!(sb->s_flags & MS_RDONLY))
4676     + es->s_wtime = cpu_to_le32(get_seconds());
4677     es->s_kbytes_written =
4678     cpu_to_le64(EXT4_SB(sb)->s_kbytes_written +
4679     ((part_stat_read(sb->s_bdev->bd_part, sectors[1]) -
4680     @@ -3333,11 +3427,13 @@ static int ext4_sync_fs(struct super_block *sb, int wait)
4681     {
4682     int ret = 0;
4683     tid_t target;
4684     + struct ext4_sb_info *sbi = EXT4_SB(sb);
4685    
4686     trace_ext4_sync_fs(sb, wait);
4687     - if (jbd2_journal_start_commit(EXT4_SB(sb)->s_journal, &target)) {
4688     + flush_workqueue(sbi->dio_unwritten_wq);
4689     + if (jbd2_journal_start_commit(sbi->s_journal, &target)) {
4690     if (wait)
4691     - jbd2_log_wait_commit(EXT4_SB(sb)->s_journal, target);
4692     + jbd2_log_wait_commit(sbi->s_journal, target);
4693     }
4694     return ret;
4695     }
4696     @@ -3477,18 +3573,11 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
4697     if (sbi->s_journal)
4698     ext4_mark_recovery_complete(sb, es);
4699     } else {
4700     - int ret;
4701     - if ((ret = EXT4_HAS_RO_COMPAT_FEATURE(sb,
4702     - ~EXT4_FEATURE_RO_COMPAT_SUPP))) {
4703     - ext4_msg(sb, KERN_WARNING, "couldn't "
4704     - "remount RDWR because of unsupported "
4705     - "optional features (%x)",
4706     - (le32_to_cpu(sbi->s_es->s_feature_ro_compat) &
4707     - ~EXT4_FEATURE_RO_COMPAT_SUPP));
4708     + /* Make sure we can mount this feature set read-write */
4709     + if (!ext4_feature_set_ok(sb, 0)) {
4710     err = -EROFS;
4711     goto restore_opts;
4712     }
4713     -
4714     /*
4715     * Make sure the group descriptor checksums
4716     * are sane. If they aren't, refuse to remount r/w.
4717     @@ -3624,13 +3713,11 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf)
4718     buf->f_blocks = ext4_blocks_count(es) - sbi->s_overhead_last;
4719     buf->f_bfree = percpu_counter_sum_positive(&sbi->s_freeblocks_counter) -
4720     percpu_counter_sum_positive(&sbi->s_dirtyblocks_counter);
4721     - ext4_free_blocks_count_set(es, buf->f_bfree);
4722     buf->f_bavail = buf->f_bfree - ext4_r_blocks_count(es);
4723     if (buf->f_bfree < ext4_r_blocks_count(es))
4724     buf->f_bavail = 0;
4725     buf->f_files = le32_to_cpu(es->s_inodes_count);
4726     buf->f_ffree = percpu_counter_sum_positive(&sbi->s_freeinodes_counter);
4727     - es->s_free_inodes_count = cpu_to_le32(buf->f_ffree);
4728     buf->f_namelen = EXT4_NAME_LEN;
4729     fsid = le64_to_cpup((void *)es->s_uuid) ^
4730     le64_to_cpup((void *)es->s_uuid + sizeof(u64));
4731     diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
4732     index 62b31c2..0257019 100644
4733     --- a/fs/ext4/xattr.c
4734     +++ b/fs/ext4/xattr.c
4735     @@ -810,12 +810,23 @@ inserted:
4736     get_bh(new_bh);
4737     } else {
4738     /* We need to allocate a new block */
4739     - ext4_fsblk_t goal = ext4_group_first_block_no(sb,
4740     + ext4_fsblk_t goal, block;
4741     +
4742     + goal = ext4_group_first_block_no(sb,
4743     EXT4_I(inode)->i_block_group);
4744     - ext4_fsblk_t block = ext4_new_meta_blocks(handle, inode,
4745     +
4746     + /* non-extent files can't have physical blocks past 2^32 */
4747     + if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
4748     + goal = goal & EXT4_MAX_BLOCK_FILE_PHYS;
4749     +
4750     + block = ext4_new_meta_blocks(handle, inode,
4751     goal, NULL, &error);
4752     if (error)
4753     goto cleanup;
4754     +
4755     + if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
4756     + BUG_ON(block > EXT4_MAX_BLOCK_FILE_PHYS);
4757     +
4758     ea_idebug(inode, "creating block %d", block);
4759    
4760     new_bh = sb_getblk(sb, block);
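
Indirect-mapped (non-extent) ext4 files store physical block numbers in 32-bit on-disk slots, so the xattr block's allocation goal is masked down below 2^32 before calling the allocator, and the BUG_ON afterwards asserts the allocator respected the limit. A small sketch of the clamp, mirroring EXT4_MAX_BLOCK_FILE_PHYS (goal value invented):

#include <stdint.h>
#include <stdio.h>

#define MAX_BLOCK_FILE_PHYS 0xFFFFFFFFULL  /* highest block a 32-bit slot holds */

/* Clamp an allocation goal for a file format limited to 32-bit
 * physical block numbers (ext4's indirect-mapped files). */
static uint64_t clamp_goal(uint64_t goal, int extent_based)
{
	if (!extent_based)
		goal &= MAX_BLOCK_FILE_PHYS;
	return goal;
}

int main(void)
{
	uint64_t goal = 0x123456789ULL;  /* group start past 16 TiB at 4 KiB blocks */

	printf("extent file goal:   %#llx\n",
	       (unsigned long long)clamp_goal(goal, 1));
	printf("indirect file goal: %#llx\n",
	       (unsigned long long)clamp_goal(goal, 0));
	return 0;
}
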
4761     @@ -977,6 +988,10 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
4762     if (error)
4763     goto cleanup;
4764    
4765     + error = ext4_journal_get_write_access(handle, is.iloc.bh);
4766     + if (error)
4767     + goto cleanup;
4768     +
4769     if (EXT4_I(inode)->i_state & EXT4_STATE_NEW) {
4770     struct ext4_inode *raw_inode = ext4_raw_inode(&is.iloc);
4771     memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size);
4772     @@ -1002,9 +1017,6 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
4773     if (flags & XATTR_CREATE)
4774     goto cleanup;
4775     }
4776     - error = ext4_journal_get_write_access(handle, is.iloc.bh);
4777     - if (error)
4778     - goto cleanup;
4779     if (!value) {
4780     if (!is.s.not_found)
4781     error = ext4_xattr_ibody_set(handle, inode, &i, &is);
4782     diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
4783     index 7b4088b..8cf902a 100644
4784     --- a/fs/jbd2/commit.c
4785     +++ b/fs/jbd2/commit.c
4786     @@ -636,6 +636,10 @@ void jbd2_journal_commit_transaction(journal_t *journal)
4787     JBUFFER_TRACE(jh, "ph3: write metadata");
4788     flags = jbd2_journal_write_metadata_buffer(commit_transaction,
4789     jh, &new_jh, blocknr);
4790     + if (flags < 0) {
4791     + jbd2_journal_abort(journal, flags);
4792     + continue;
4793     + }
4794     set_bit(BH_JWrite, &jh2bh(new_jh)->b_state);
4795     wbuf[bufs++] = jh2bh(new_jh);
4796    
4797     diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
4798     index e378cb3..4b74149 100644
4799     --- a/fs/jbd2/journal.c
4800     +++ b/fs/jbd2/journal.c
4801     @@ -78,6 +78,7 @@ EXPORT_SYMBOL(jbd2_journal_errno);
4802     EXPORT_SYMBOL(jbd2_journal_ack_err);
4803     EXPORT_SYMBOL(jbd2_journal_clear_err);
4804     EXPORT_SYMBOL(jbd2_log_wait_commit);
4805     +EXPORT_SYMBOL(jbd2_log_start_commit);
4806     EXPORT_SYMBOL(jbd2_journal_start_commit);
4807     EXPORT_SYMBOL(jbd2_journal_force_commit_nested);
4808     EXPORT_SYMBOL(jbd2_journal_wipe);
4809     @@ -361,6 +362,10 @@ repeat:
4810    
4811     jbd_unlock_bh_state(bh_in);
4812     tmp = jbd2_alloc(bh_in->b_size, GFP_NOFS);
4813     + if (!tmp) {
4814     + jbd2_journal_put_journal_head(new_jh);
4815     + return -ENOMEM;
4816     + }
4817     jbd_lock_bh_state(bh_in);
4818     if (jh_in->b_frozen_data) {
4819     jbd2_free(tmp, bh_in->b_size);
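
This hunk and the fs/jbd2/commit.c hunk above are two halves of one fix: jbd2_alloc() can fail under memory pressure, so jbd2_journal_write_metadata_buffer() now drops the journal head it had taken and returns -ENOMEM, and the commit path treats a negative return as fatal and aborts the journal instead of using a bogus buffer. A userspace sketch of the acquire/undo/propagate shape (names hypothetical):

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

struct journal_head { int refs; };

static void put_journal_head(struct journal_head *jh) { jh->refs--; }

/* Acquire a resource, then allocate; on allocation failure undo the
 * acquisition before returning, so the caller sees a clean -ENOMEM. */
static int write_metadata_buffer(struct journal_head *new_jh, size_t size)
{
	new_jh->refs++;  /* taken earlier in the real code path */

	void *tmp = malloc(size);  /* jbd2_alloc(..., GFP_NOFS) */
	if (!tmp) {
		put_journal_head(new_jh);  /* unwind before bailing out */
		return -ENOMEM;
	}
	/* ... copy/escape the buffer contents ... */
	free(tmp);
	return 0;
}

int main(void)
{
	struct journal_head jh = { 0 };
	int flags = write_metadata_buffer(&jh, 4096);

	if (flags < 0)  /* commit path: abort the journal */
		fprintf(stderr, "aborting journal: %d\n", flags);
	else
		printf("ok, refs=%d\n", jh.refs);
	return 0;
}
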
4820     @@ -1187,6 +1192,12 @@ static int journal_reset(journal_t *journal)
4821    
4822     first = be32_to_cpu(sb->s_first);
4823     last = be32_to_cpu(sb->s_maxlen);
4824     + if (first + JBD2_MIN_JOURNAL_BLOCKS > last + 1) {
4825     + printk(KERN_ERR "JBD: Journal too short (blocks %llu-%llu).\n",
4826     + first, last);
4827     + journal_fail_superblock(journal);
4828     + return -EINVAL;
4829     + }
4830    
4831     journal->j_first = first;
4832     journal->j_last = last;
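
The added geometry check runs before any derived journal fields are computed: a journal whose usable region [first, last] holds fewer than JBD2_MIN_JOURNAL_BLOCKS blocks is rejected with -EINVAL at load time rather than misbehaving later. A standalone sketch (the 1024-block minimum matches the kernel constant, but treat it as an assumption here):

#include <errno.h>
#include <stdio.h>

#define MIN_JOURNAL_BLOCKS 1024  /* JBD2_MIN_JOURNAL_BLOCKS */

/* Reject on-disk geometry that cannot hold a minimal journal:
 * blocks first..last inclusive is (last - first + 1) blocks. */
static int check_journal_extent(unsigned long long first,
				unsigned long long last)
{
	if (first + MIN_JOURNAL_BLOCKS > last + 1) {
		fprintf(stderr, "journal too short (blocks %llu-%llu)\n",
			first, last);
		return -EINVAL;
	}
	return 0;
}

int main(void)
{
	printf("%d\n", check_journal_extent(1, 8192));  /* 0: plenty */
	printf("%d\n", check_journal_extent(1, 512));   /* -EINVAL: too short */
	return 0;
}
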
4833     diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
4834     index 6213ac7..a051270 100644
4835     --- a/fs/jbd2/transaction.c
4836     +++ b/fs/jbd2/transaction.c
4837     @@ -57,7 +57,7 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction)
4838     INIT_LIST_HEAD(&transaction->t_private_list);
4839    
4840     /* Set up the commit timer for the new transaction. */
4841     - journal->j_commit_timer.expires = round_jiffies(transaction->t_expires);
4842     + journal->j_commit_timer.expires = round_jiffies_up(transaction->t_expires);
4843     add_timer(&journal->j_commit_timer);
4844    
4845     J_ASSERT(journal->j_running_transaction == NULL);
4846     @@ -238,6 +238,8 @@ repeat_locked:
4847     __jbd2_log_space_left(journal));
4848     spin_unlock(&transaction->t_handle_lock);
4849     spin_unlock(&journal->j_state_lock);
4850     +
4851     + lock_map_acquire(&handle->h_lockdep_map);
4852     out:
4853     if (unlikely(new_transaction)) /* It's usually NULL */
4854     kfree(new_transaction);
4855     @@ -303,8 +305,6 @@ handle_t *jbd2_journal_start(journal_t *journal, int nblocks)
4856     handle = ERR_PTR(err);
4857     goto out;
4858     }
4859     -
4860     - lock_map_acquire(&handle->h_lockdep_map);
4861     out:
4862     return handle;
4863     }
4864     @@ -426,6 +426,7 @@ int jbd2_journal_restart(handle_t *handle, int nblocks)
4865     __jbd2_log_start_commit(journal, transaction->t_tid);
4866     spin_unlock(&journal->j_state_lock);
4867    
4868     + lock_map_release(&handle->h_lockdep_map);
4869     handle->h_buffer_credits = nblocks;
4870     ret = start_this_handle(journal, handle);
4871     return ret;
4872     diff --git a/include/linux/sched.h b/include/linux/sched.h
4873     index 0f1ea4a..d3e910b 100644
4874     --- a/include/linux/sched.h
4875     +++ b/include/linux/sched.h
4876     @@ -1999,11 +1999,18 @@ static inline int is_si_special(const struct siginfo *info)
4877     return info <= SEND_SIG_FORCED;
4878     }
4879    
4880     -/* True if we are on the alternate signal stack. */
4881     -
4882     +/*
4883     + * True if we are on the alternate signal stack.
4884     + */
4885     static inline int on_sig_stack(unsigned long sp)
4886     {
4887     - return (sp - current->sas_ss_sp < current->sas_ss_size);
4888     +#ifdef CONFIG_STACK_GROWSUP
4889     + return sp >= current->sas_ss_sp &&
4890     + sp - current->sas_ss_sp < current->sas_ss_size;
4891     +#else
4892     + return sp > current->sas_ss_sp &&
4893     + sp - current->sas_ss_sp <= current->sas_ss_size;
4894     +#endif
4895     }
4896    
4897     static inline int sas_ss_flags(unsigned long sp)
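
The rewritten on_sig_stack() makes the range test direction-aware. On the usual grow-down stacks the valid on-stack range is (base, base + size], since a freshly switched stack starts with sp == base + size; on grow-up architectures (CONFIG_STACK_GROWSUP, e.g. PA-RISC) it is [base, base + size). The old single subtraction treated every architecture as [base, base + size), which happens to suit grow-up stacks but is wrong at both boundaries for the common grow-down case. A sketch of the two predicates with an invented altstack:

#include <stdio.h>

/* Hypothetical alternate signal stack: base address and size. */
static unsigned long ss_sp = 0x10000, ss_size = 0x4000;

/* Grow-down stacks: valid range is (base, base + size]; a freshly
 * initialized stack has sp == base + size, and sp == base is empty. */
static int on_stack_growdown(unsigned long sp)
{
	return sp > ss_sp && sp - ss_sp <= ss_size;
}

/* Grow-up stacks: valid range is [base, base + size); sp == base is
 * the first usable slot, sp == base + size is already off the end. */
static int on_stack_growup(unsigned long sp)
{
	return sp >= ss_sp && sp - ss_sp < ss_size;
}

int main(void)
{
	unsigned long probes[] = { 0x10000, 0x12000, 0x14000 };

	for (int i = 0; i < 3; i++)
		printf("sp=%#lx down=%d up=%d\n", probes[i],
		       on_stack_growdown(probes[i]),
		       on_stack_growup(probes[i]));
	return 0;
}
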
4898     diff --git a/include/scsi/osd_protocol.h b/include/scsi/osd_protocol.h
4899     index 2cc8e8b..6856612 100644
4900     --- a/include/scsi/osd_protocol.h
4901     +++ b/include/scsi/osd_protocol.h
4902     @@ -17,6 +17,7 @@
4903     #define __OSD_PROTOCOL_H__
4904    
4905     #include <linux/types.h>
4906     +#include <linux/kernel.h>
4907     #include <asm/unaligned.h>
4908     #include <scsi/scsi.h>
4909    
4910     diff --git a/include/scsi/scsi_host.h b/include/scsi/scsi_host.h
4911     index b62a097..6cc72e2 100644
4912     --- a/include/scsi/scsi_host.h
4913     +++ b/include/scsi/scsi_host.h
4914     @@ -677,6 +677,12 @@ struct Scsi_Host {
4915     void *shost_data;
4916    
4917     /*
4918     + * Points to the physical bus device we'd use to do DMA.
4919     + * Needed just in case we have virtual hosts.
4920     + */
4921     + struct device *dma_dev;
4922     +
4923     + /*
4924     * We should ensure that this is aligned, both for better performance
4925     * and also because some compilers (m68k) don't automatically force
4926     * alignment to a long boundary.
4927     @@ -720,7 +726,9 @@ extern int scsi_queue_work(struct Scsi_Host *, struct work_struct *);
4928     extern void scsi_flush_work(struct Scsi_Host *);
4929    
4930     extern struct Scsi_Host *scsi_host_alloc(struct scsi_host_template *, int);
4931     -extern int __must_check scsi_add_host(struct Scsi_Host *, struct device *);
4932     +extern int __must_check scsi_add_host_with_dma(struct Scsi_Host *,
4933     + struct device *,
4934     + struct device *);
4935     extern void scsi_scan_host(struct Scsi_Host *);
4936     extern void scsi_rescan_device(struct device *);
4937     extern void scsi_remove_host(struct Scsi_Host *);
4938     @@ -731,6 +739,12 @@ extern const char *scsi_host_state_name(enum scsi_host_state);
4939    
4940     extern u64 scsi_calculate_bounce_limit(struct Scsi_Host *);
4941    
4942     +static inline int __must_check scsi_add_host(struct Scsi_Host *host,
4943     + struct device *dev)
4944     +{
4945     + return scsi_add_host_with_dma(host, dev, dev);
4946     +}
4947     +
4948     static inline struct device *scsi_get_device(struct Scsi_Host *shost)
4949     {
4950     return shost->shost_gendev.parent;
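
Keeping scsi_add_host() as a static inline wrapper means none of the existing callers change: they implicitly do DMA through their own parent device, while transports that create virtual hosts can pass a distinct physical DMA device to scsi_add_host_with_dma(). A stripped-down sketch of this compatibility-wrapper pattern (all types and names hypothetical):

#include <stdio.h>

struct device { const char *name; };
struct host { struct device *parent, *dma_dev; };

/* The extended entry point: parent device and DMA device may differ. */
static int add_host_with_dma(struct host *h, struct device *dev,
			     struct device *dma_dev)
{
	h->parent = dev;
	h->dma_dev = dma_dev;
	printf("parent=%s dma=%s\n", dev->name, dma_dev->name);
	return 0;
}

/* The old signature, kept as a wrapper: DMA through the parent. */
static inline int add_host(struct host *h, struct device *dev)
{
	return add_host_with_dma(h, dev, dev);
}

int main(void)
{
	struct device phys = { "pci0000:00" }, virt = { "vport0" };
	struct host h0, h1;

	add_host(&h0, &phys);                  /* classic caller, unchanged */
	add_host_with_dma(&h1, &virt, &phys);  /* virtual host case */
	return 0;
}
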
4951     diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h
4952     index 7d8b5bc..824979e 100644
4953     --- a/include/trace/events/ext4.h
4954     +++ b/include/trace/events/ext4.h
4955     @@ -5,10 +5,12 @@
4956     #define _TRACE_EXT4_H
4957    
4958     #include <linux/writeback.h>
4959     -#include "../../../fs/ext4/ext4.h"
4960     -#include "../../../fs/ext4/mballoc.h"
4961     #include <linux/tracepoint.h>
4962    
4963     +struct ext4_allocation_context;
4964     +struct ext4_allocation_request;
4965     +struct ext4_prealloc_space;
4966     +
4967     TRACE_EVENT(ext4_free_inode,
4968     TP_PROTO(struct inode *inode),
4969    
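
Swapping the relative includes of fs/ext4's private headers for bare struct forward declarations works because the tracepoint prototypes only ever pass these types by pointer; the compiler never needs their layout in this header, and a globally visible header stops reaching into fs/ext4/. A minimal illustration (struct contents invented):

#include <stdio.h>

/* Forward declaration: enough for pointer parameters, no layout needed. */
struct ext4_allocation_context;

/* The public interface compiles without the private definition. */
void trace_alloc(struct ext4_allocation_context *ac);

/* Only the implementation side needs the real definition. */
struct ext4_allocation_context { int ac_status; };

void trace_alloc(struct ext4_allocation_context *ac)
{
	printf("ac_status=%d\n", ac->ac_status);
}

int main(void)
{
	struct ext4_allocation_context ac = { 42 };
	trace_alloc(&ac);
	return 0;
}
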
4970     @@ -229,6 +231,7 @@ TRACE_EVENT(ext4_da_writepages,
4971     __field( char, for_reclaim )
4972     __field( char, for_writepages )
4973     __field( char, range_cyclic )
4974     + __field( pgoff_t, writeback_index )
4975     ),
4976    
4977     TP_fast_assign(
4978     @@ -243,14 +246,51 @@ TRACE_EVENT(ext4_da_writepages,
4979     __entry->for_reclaim = wbc->for_reclaim;
4980     __entry->for_writepages = wbc->for_writepages;
4981     __entry->range_cyclic = wbc->range_cyclic;
4982     + __entry->writeback_index = inode->i_mapping->writeback_index;
4983     ),
4984    
4985     - TP_printk("dev %s ino %lu nr_t_write %ld pages_skipped %ld range_start %llu range_end %llu nonblocking %d for_kupdate %d for_reclaim %d for_writepages %d range_cyclic %d",
4986     - jbd2_dev_to_name(__entry->dev), __entry->ino, __entry->nr_to_write,
4987     + TP_printk("dev %s ino %lu nr_to_write %ld pages_skipped %ld range_start %llu range_end %llu nonblocking %d for_kupdate %d for_reclaim %d for_writepages %d range_cyclic %d writeback_index %lu",
4988     + jbd2_dev_to_name(__entry->dev),
4989     + (unsigned long) __entry->ino, __entry->nr_to_write,
4990     __entry->pages_skipped, __entry->range_start,
4991     __entry->range_end, __entry->nonblocking,
4992     __entry->for_kupdate, __entry->for_reclaim,
4993     - __entry->for_writepages, __entry->range_cyclic)
4994     + __entry->for_writepages, __entry->range_cyclic,
4995     + (unsigned long) __entry->writeback_index)
4996     +);
4997     +
4998     +TRACE_EVENT(ext4_da_write_pages,
4999     + TP_PROTO(struct inode *inode, struct mpage_da_data *mpd),
5000     +
5001     + TP_ARGS(inode, mpd),
5002     +
5003     + TP_STRUCT__entry(
5004     + __field( dev_t, dev )
5005     + __field( ino_t, ino )
5006     + __field( __u64, b_blocknr )
5007     + __field( __u32, b_size )
5008     + __field( __u32, b_state )
5009     + __field( unsigned long, first_page )
5010     + __field( int, io_done )
5011     + __field( int, pages_written )
5012     + ),
5013     +
5014     + TP_fast_assign(
5015     + __entry->dev = inode->i_sb->s_dev;
5016     + __entry->ino = inode->i_ino;
5017     + __entry->b_blocknr = mpd->b_blocknr;
5018     + __entry->b_size = mpd->b_size;
5019     + __entry->b_state = mpd->b_state;
5020     + __entry->first_page = mpd->first_page;
5021     + __entry->io_done = mpd->io_done;
5022     + __entry->pages_written = mpd->pages_written;
5023     + ),
5024     +
5025     + TP_printk("dev %s ino %lu b_blocknr %llu b_size %u b_state 0x%04x first_page %lu io_done %d pages_written %d",
5026     + jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino,
5027     + __entry->b_blocknr, __entry->b_size,
5028     + __entry->b_state, __entry->first_page,
5029     + __entry->io_done, __entry->pages_written)
5030     );
5031    
5032     TRACE_EVENT(ext4_da_writepages_result,
5033     @@ -268,6 +308,7 @@ TRACE_EVENT(ext4_da_writepages_result,
5034     __field( char, encountered_congestion )
5035     __field( char, more_io )
5036     __field( char, no_nrwrite_index_update )
5037     + __field( pgoff_t, writeback_index )
5038     ),
5039    
5040     TP_fast_assign(
5041     @@ -279,13 +320,16 @@ TRACE_EVENT(ext4_da_writepages_result,
5042     __entry->encountered_congestion = wbc->encountered_congestion;
5043     __entry->more_io = wbc->more_io;
5044     __entry->no_nrwrite_index_update = wbc->no_nrwrite_index_update;
5045     + __entry->writeback_index = inode->i_mapping->writeback_index;
5046     ),
5047    
5048     - TP_printk("dev %s ino %lu ret %d pages_written %d pages_skipped %ld congestion %d more_io %d no_nrwrite_index_update %d",
5049     - jbd2_dev_to_name(__entry->dev), __entry->ino, __entry->ret,
5050     + TP_printk("dev %s ino %lu ret %d pages_written %d pages_skipped %ld congestion %d more_io %d no_nrwrite_index_update %d writeback_index %lu",
5051     + jbd2_dev_to_name(__entry->dev),
5052     + (unsigned long) __entry->ino, __entry->ret,
5053     __entry->pages_written, __entry->pages_skipped,
5054     __entry->encountered_congestion, __entry->more_io,
5055     - __entry->no_nrwrite_index_update)
5056     + __entry->no_nrwrite_index_update,
5057     + (unsigned long) __entry->writeback_index)
5058     );
5059    
5060     TRACE_EVENT(ext4_da_write_begin,