Annotation of /trunk/kernel26-mcore/patches-2.6.31-r1/0107-2.6.31.8-all-fixes.patch
Parent Directory | Revision Log
Revision 973 - (hide annotations) (download)
Tue Jan 5 09:57:31 2010 UTC (14 years, 8 months ago) by niro
File size: 165293 byte(s)
-2.6.31-mcore-r1
1 | niro | 973 | diff --git a/Documentation/filesystems/ext4.txt b/Documentation/filesystems/ext4.txt |
2 | index 7be02ac..32c3da4 100644 | ||
3 | --- a/Documentation/filesystems/ext4.txt | ||
4 | +++ b/Documentation/filesystems/ext4.txt | ||
5 | @@ -153,8 +153,8 @@ journal_dev=devnum When the external journal device's major/minor numbers | ||
6 | identified through its new major/minor numbers encoded | ||
7 | in devnum. | ||
8 | |||
9 | -noload Don't load the journal on mounting. Note that | ||
10 | - if the filesystem was not unmounted cleanly, | ||
11 | +norecovery Don't load the journal on mounting. Note that | ||
12 | +noload if the filesystem was not unmounted cleanly, | ||
13 | skipping the journal replay will lead to the | ||
14 | filesystem containing inconsistencies that can | ||
15 | lead to any number of problems. | ||
16 | @@ -338,6 +338,12 @@ noauto_da_alloc replacing existing files via patterns such as | ||
17 | system crashes before the delayed allocation | ||
18 | blocks are forced to disk. | ||
19 | |||
20 | +discard Controls whether ext4 should issue discard/TRIM | ||
21 | +nodiscard(*) commands to the underlying block device when | ||
22 | + blocks are freed. This is useful for SSD devices | ||
23 | + and sparse/thinly-provisioned LUNs, but it is off | ||
24 | + by default until sufficient testing has been done. | ||
25 | + | ||
26 | Data Mode | ||
27 | ========= | ||
28 | There are 3 different data modes: | ||
29 | diff --git a/drivers/scsi/hosts.c b/drivers/scsi/hosts.c | ||
30 | index 5fd2da4..28a753d 100644 | ||
31 | --- a/drivers/scsi/hosts.c | ||
32 | +++ b/drivers/scsi/hosts.c | ||
33 | @@ -180,14 +180,20 @@ void scsi_remove_host(struct Scsi_Host *shost) | ||
34 | EXPORT_SYMBOL(scsi_remove_host); | ||
35 | |||
36 | /** | ||
37 | - * scsi_add_host - add a scsi host | ||
38 | + * scsi_add_host_with_dma - add a scsi host with dma device | ||
39 | * @shost: scsi host pointer to add | ||
40 | * @dev: a struct device of type scsi class | ||
41 | + * @dma_dev: dma device for the host | ||
42 | + * | ||
43 | + * Note: You rarely need to worry about this unless you're in a | ||
44 | + * virtualised host environments, so use the simpler scsi_add_host() | ||
45 | + * function instead. | ||
46 | * | ||
47 | * Return value: | ||
48 | * 0 on success / != 0 for error | ||
49 | **/ | ||
50 | -int scsi_add_host(struct Scsi_Host *shost, struct device *dev) | ||
51 | +int scsi_add_host_with_dma(struct Scsi_Host *shost, struct device *dev, | ||
52 | + struct device *dma_dev) | ||
53 | { | ||
54 | struct scsi_host_template *sht = shost->hostt; | ||
55 | int error = -EINVAL; | ||
56 | @@ -207,6 +213,7 @@ int scsi_add_host(struct Scsi_Host *shost, struct device *dev) | ||
57 | |||
58 | if (!shost->shost_gendev.parent) | ||
59 | shost->shost_gendev.parent = dev ? dev : &platform_bus; | ||
60 | + shost->dma_dev = dma_dev; | ||
61 | |||
62 | error = device_add(&shost->shost_gendev); | ||
63 | if (error) | ||
64 | @@ -262,7 +269,7 @@ int scsi_add_host(struct Scsi_Host *shost, struct device *dev) | ||
65 | fail: | ||
66 | return error; | ||
67 | } | ||
68 | -EXPORT_SYMBOL(scsi_add_host); | ||
69 | +EXPORT_SYMBOL(scsi_add_host_with_dma); | ||
70 | |||
71 | static void scsi_host_dev_release(struct device *dev) | ||
72 | { | ||
73 | diff --git a/drivers/scsi/lpfc/lpfc_init.c b/drivers/scsi/lpfc/lpfc_init.c | ||
74 | index fc67cc6..cf13ff2 100644 | ||
75 | --- a/drivers/scsi/lpfc/lpfc_init.c | ||
76 | +++ b/drivers/scsi/lpfc/lpfc_init.c | ||
77 | @@ -2384,7 +2384,7 @@ lpfc_create_port(struct lpfc_hba *phba, int instance, struct device *dev) | ||
78 | vport->els_tmofunc.function = lpfc_els_timeout; | ||
79 | vport->els_tmofunc.data = (unsigned long)vport; | ||
80 | |||
81 | - error = scsi_add_host(shost, dev); | ||
82 | + error = scsi_add_host_with_dma(shost, dev, &phba->pcidev->dev); | ||
83 | if (error) | ||
84 | goto out_put_shost; | ||
85 | |||
86 | diff --git a/drivers/scsi/megaraid/megaraid_sas.c b/drivers/scsi/megaraid/megaraid_sas.c | ||
87 | index 7dc3d18..7a838c8 100644 | ||
88 | --- a/drivers/scsi/megaraid/megaraid_sas.c | ||
89 | +++ b/drivers/scsi/megaraid/megaraid_sas.c | ||
90 | @@ -3032,7 +3032,7 @@ megasas_mgmt_fw_ioctl(struct megasas_instance *instance, | ||
91 | int error = 0, i; | ||
92 | void *sense = NULL; | ||
93 | dma_addr_t sense_handle; | ||
94 | - u32 *sense_ptr; | ||
95 | + unsigned long *sense_ptr; | ||
96 | |||
97 | memset(kbuff_arr, 0, sizeof(kbuff_arr)); | ||
98 | |||
99 | @@ -3109,7 +3109,7 @@ megasas_mgmt_fw_ioctl(struct megasas_instance *instance, | ||
100 | } | ||
101 | |||
102 | sense_ptr = | ||
103 | - (u32 *) ((unsigned long)cmd->frame + ioc->sense_off); | ||
104 | + (unsigned long *) ((unsigned long)cmd->frame + ioc->sense_off); | ||
105 | *sense_ptr = sense_handle; | ||
106 | } | ||
107 | |||
108 | @@ -3140,8 +3140,8 @@ megasas_mgmt_fw_ioctl(struct megasas_instance *instance, | ||
109 | * sense_ptr points to the location that has the user | ||
110 | * sense buffer address | ||
111 | */ | ||
112 | - sense_ptr = (u32 *) ((unsigned long)ioc->frame.raw + | ||
113 | - ioc->sense_off); | ||
114 | + sense_ptr = (unsigned long *) ((unsigned long)ioc->frame.raw + | ||
115 | + ioc->sense_off); | ||
116 | |||
117 | if (copy_to_user((void __user *)((unsigned long)(*sense_ptr)), | ||
118 | sense, ioc->sense_len)) { | ||
119 | diff --git a/drivers/scsi/qla2xxx/qla_attr.c b/drivers/scsi/qla2xxx/qla_attr.c | ||
120 | index 0f87962..67e016d 100644 | ||
121 | --- a/drivers/scsi/qla2xxx/qla_attr.c | ||
122 | +++ b/drivers/scsi/qla2xxx/qla_attr.c | ||
123 | @@ -1654,7 +1654,8 @@ qla24xx_vport_create(struct fc_vport *fc_vport, bool disable) | ||
124 | fc_vport_set_state(fc_vport, FC_VPORT_LINKDOWN); | ||
125 | } | ||
126 | |||
127 | - if (scsi_add_host(vha->host, &fc_vport->dev)) { | ||
128 | + if (scsi_add_host_with_dma(vha->host, &fc_vport->dev, | ||
129 | + &ha->pdev->dev)) { | ||
130 | DEBUG15(printk("scsi(%ld): scsi_add_host failure for VP[%d].\n", | ||
131 | vha->host_no, vha->vp_idx)); | ||
132 | goto vport_create_failed_2; | ||
133 | diff --git a/drivers/scsi/scsi_lib_dma.c b/drivers/scsi/scsi_lib_dma.c | ||
134 | index ac6855c..dcd1285 100644 | ||
135 | --- a/drivers/scsi/scsi_lib_dma.c | ||
136 | +++ b/drivers/scsi/scsi_lib_dma.c | ||
137 | @@ -23,7 +23,7 @@ int scsi_dma_map(struct scsi_cmnd *cmd) | ||
138 | int nseg = 0; | ||
139 | |||
140 | if (scsi_sg_count(cmd)) { | ||
141 | - struct device *dev = cmd->device->host->shost_gendev.parent; | ||
142 | + struct device *dev = cmd->device->host->dma_dev; | ||
143 | |||
144 | nseg = dma_map_sg(dev, scsi_sglist(cmd), scsi_sg_count(cmd), | ||
145 | cmd->sc_data_direction); | ||
146 | @@ -41,7 +41,7 @@ EXPORT_SYMBOL(scsi_dma_map); | ||
147 | void scsi_dma_unmap(struct scsi_cmnd *cmd) | ||
148 | { | ||
149 | if (scsi_sg_count(cmd)) { | ||
150 | - struct device *dev = cmd->device->host->shost_gendev.parent; | ||
151 | + struct device *dev = cmd->device->host->dma_dev; | ||
152 | |||
153 | dma_unmap_sg(dev, scsi_sglist(cmd), scsi_sg_count(cmd), | ||
154 | cmd->sc_data_direction); | ||
155 | diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c | ||
156 | index e2126d7..34bb797 100644 | ||
157 | --- a/fs/ext4/balloc.c | ||
158 | +++ b/fs/ext4/balloc.c | ||
159 | @@ -761,7 +761,13 @@ static unsigned long ext4_bg_num_gdb_meta(struct super_block *sb, | ||
160 | static unsigned long ext4_bg_num_gdb_nometa(struct super_block *sb, | ||
161 | ext4_group_t group) | ||
162 | { | ||
163 | - return ext4_bg_has_super(sb, group) ? EXT4_SB(sb)->s_gdb_count : 0; | ||
164 | + if (!ext4_bg_has_super(sb, group)) | ||
165 | + return 0; | ||
166 | + | ||
167 | + if (EXT4_HAS_INCOMPAT_FEATURE(sb,EXT4_FEATURE_INCOMPAT_META_BG)) | ||
168 | + return le32_to_cpu(EXT4_SB(sb)->s_es->s_first_meta_bg); | ||
169 | + else | ||
170 | + return EXT4_SB(sb)->s_gdb_count; | ||
171 | } | ||
172 | |||
173 | /** | ||
174 | diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c | ||
175 | index 50784ef..dc79b75 100644 | ||
176 | --- a/fs/ext4/block_validity.c | ||
177 | +++ b/fs/ext4/block_validity.c | ||
178 | @@ -160,7 +160,7 @@ int ext4_setup_system_zone(struct super_block *sb) | ||
179 | if (ext4_bg_has_super(sb, i) && | ||
180 | ((i < 5) || ((i % flex_size) == 0))) | ||
181 | add_system_zone(sbi, ext4_group_first_block_no(sb, i), | ||
182 | - sbi->s_gdb_count + 1); | ||
183 | + ext4_bg_num_gdb(sb, i) + 1); | ||
184 | gdp = ext4_get_group_desc(sb, i, NULL); | ||
185 | ret = add_system_zone(sbi, ext4_block_bitmap(sb, gdp), 1); | ||
186 | if (ret) | ||
187 | diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h | ||
188 | index 9714db3..3b8321b 100644 | ||
189 | --- a/fs/ext4/ext4.h | ||
190 | +++ b/fs/ext4/ext4.h | ||
191 | @@ -88,6 +88,8 @@ typedef unsigned int ext4_group_t; | ||
192 | #define EXT4_MB_HINT_TRY_GOAL 512 | ||
193 | /* blocks already pre-reserved by delayed allocation */ | ||
194 | #define EXT4_MB_DELALLOC_RESERVED 1024 | ||
195 | +/* We are doing stream allocation */ | ||
196 | +#define EXT4_MB_STREAM_ALLOC 2048 | ||
197 | |||
198 | |||
199 | struct ext4_allocation_request { | ||
200 | @@ -111,6 +113,33 @@ struct ext4_allocation_request { | ||
201 | unsigned int flags; | ||
202 | }; | ||
203 | |||
204 | +#define DIO_AIO_UNWRITTEN 0x1 | ||
205 | +typedef struct ext4_io_end { | ||
206 | + struct list_head list; /* per-file finished AIO list */ | ||
207 | + struct inode *inode; /* file being written to */ | ||
208 | + unsigned int flag; /* sync IO or AIO */ | ||
209 | + int error; /* I/O error code */ | ||
210 | + ext4_lblk_t offset; /* offset in the file */ | ||
211 | + size_t size; /* size of the extent */ | ||
212 | + struct work_struct work; /* data work queue */ | ||
213 | +} ext4_io_end_t; | ||
214 | + | ||
215 | +/* | ||
216 | + * Delayed allocation stuff | ||
217 | + */ | ||
218 | + | ||
219 | +struct mpage_da_data { | ||
220 | + struct inode *inode; | ||
221 | + sector_t b_blocknr; /* start block number of extent */ | ||
222 | + size_t b_size; /* size of extent */ | ||
223 | + unsigned long b_state; /* state of the extent */ | ||
224 | + unsigned long first_page, next_page; /* extent of pages */ | ||
225 | + struct writeback_control *wbc; | ||
226 | + int io_done; | ||
227 | + int pages_written; | ||
228 | + int retval; | ||
229 | +}; | ||
230 | + | ||
231 | /* | ||
232 | * Special inodes numbers | ||
233 | */ | ||
234 | @@ -251,7 +280,6 @@ struct flex_groups { | ||
235 | #define EXT4_TOPDIR_FL 0x00020000 /* Top of directory hierarchies*/ | ||
236 | #define EXT4_HUGE_FILE_FL 0x00040000 /* Set to each huge file */ | ||
237 | #define EXT4_EXTENTS_FL 0x00080000 /* Inode uses extents */ | ||
238 | -#define EXT4_EXT_MIGRATE 0x00100000 /* Inode is migrating */ | ||
239 | #define EXT4_RESERVED_FL 0x80000000 /* reserved for ext4 lib */ | ||
240 | |||
241 | #define EXT4_FL_USER_VISIBLE 0x000BDFFF /* User visible flags */ | ||
242 | @@ -289,6 +317,8 @@ static inline __u32 ext4_mask_flags(umode_t mode, __u32 flags) | ||
243 | #define EXT4_STATE_XATTR 0x00000004 /* has in-inode xattrs */ | ||
244 | #define EXT4_STATE_NO_EXPAND 0x00000008 /* No space for expansion */ | ||
245 | #define EXT4_STATE_DA_ALLOC_CLOSE 0x00000010 /* Alloc DA blks on close */ | ||
246 | +#define EXT4_STATE_EXT_MIGRATE 0x00000020 /* Inode is migrating */ | ||
247 | +#define EXT4_STATE_DIO_UNWRITTEN 0x00000040 /* need convert on dio done*/ | ||
248 | |||
249 | /* Used to pass group descriptor data when online resize is done */ | ||
250 | struct ext4_new_group_input { | ||
251 | @@ -330,7 +360,16 @@ struct ext4_new_group_data { | ||
252 | /* Call ext4_da_update_reserve_space() after successfully | ||
253 | allocating the blocks */ | ||
254 | #define EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE 0x0008 | ||
255 | - | ||
256 | + /* caller is from the direct IO path, request to creation of an | ||
257 | + unitialized extents if not allocated, split the uninitialized | ||
258 | + extent if blocks has been preallocated already*/ | ||
259 | +#define EXT4_GET_BLOCKS_DIO 0x0010 | ||
260 | +#define EXT4_GET_BLOCKS_CONVERT 0x0020 | ||
261 | +#define EXT4_GET_BLOCKS_DIO_CREATE_EXT (EXT4_GET_BLOCKS_DIO|\ | ||
262 | + EXT4_GET_BLOCKS_CREATE_UNINIT_EXT) | ||
263 | + /* Convert extent to initialized after direct IO complete */ | ||
264 | +#define EXT4_GET_BLOCKS_DIO_CONVERT_EXT (EXT4_GET_BLOCKS_CONVERT|\ | ||
265 | + EXT4_GET_BLOCKS_DIO_CREATE_EXT) | ||
266 | |||
267 | /* | ||
268 | * ioctl commands | ||
269 | @@ -386,6 +425,9 @@ struct ext4_mount_options { | ||
270 | #endif | ||
271 | }; | ||
272 | |||
273 | +/* Max physical block we can addres w/o extents */ | ||
274 | +#define EXT4_MAX_BLOCK_FILE_PHYS 0xFFFFFFFF | ||
275 | + | ||
276 | /* | ||
277 | * Structure of an inode on the disk | ||
278 | */ | ||
279 | @@ -481,8 +523,8 @@ struct move_extent { | ||
280 | static inline __le32 ext4_encode_extra_time(struct timespec *time) | ||
281 | { | ||
282 | return cpu_to_le32((sizeof(time->tv_sec) > 4 ? | ||
283 | - time->tv_sec >> 32 : 0) | | ||
284 | - ((time->tv_nsec << 2) & EXT4_NSEC_MASK)); | ||
285 | + (time->tv_sec >> 32) & EXT4_EPOCH_MASK : 0) | | ||
286 | + ((time->tv_nsec << EXT4_EPOCH_BITS) & EXT4_NSEC_MASK)); | ||
287 | } | ||
288 | |||
289 | static inline void ext4_decode_extra_time(struct timespec *time, __le32 extra) | ||
290 | @@ -490,7 +532,7 @@ static inline void ext4_decode_extra_time(struct timespec *time, __le32 extra) | ||
291 | if (sizeof(time->tv_sec) > 4) | ||
292 | time->tv_sec |= (__u64)(le32_to_cpu(extra) & EXT4_EPOCH_MASK) | ||
293 | << 32; | ||
294 | - time->tv_nsec = (le32_to_cpu(extra) & EXT4_NSEC_MASK) >> 2; | ||
295 | + time->tv_nsec = (le32_to_cpu(extra) & EXT4_NSEC_MASK) >> EXT4_EPOCH_BITS; | ||
296 | } | ||
297 | |||
298 | #define EXT4_INODE_SET_XTIME(xtime, inode, raw_inode) \ | ||
299 | @@ -653,6 +695,18 @@ struct ext4_inode_info { | ||
300 | __u16 i_extra_isize; | ||
301 | |||
302 | spinlock_t i_block_reservation_lock; | ||
303 | + | ||
304 | + /* completed async DIOs that might need unwritten extents handling */ | ||
305 | + struct list_head i_aio_dio_complete_list; | ||
306 | + /* current io_end structure for async DIO write*/ | ||
307 | + ext4_io_end_t *cur_aio_dio; | ||
308 | + | ||
309 | + /* | ||
310 | + * Transactions that contain inode's metadata needed to complete | ||
311 | + * fsync and fdatasync, respectively. | ||
312 | + */ | ||
313 | + tid_t i_sync_tid; | ||
314 | + tid_t i_datasync_tid; | ||
315 | }; | ||
316 | |||
317 | /* | ||
318 | @@ -700,6 +754,7 @@ struct ext4_inode_info { | ||
319 | #define EXT4_MOUNT_DELALLOC 0x8000000 /* Delalloc support */ | ||
320 | #define EXT4_MOUNT_DATA_ERR_ABORT 0x10000000 /* Abort on file data write */ | ||
321 | #define EXT4_MOUNT_BLOCK_VALIDITY 0x20000000 /* Block validity checking */ | ||
322 | +#define EXT4_MOUNT_DISCARD 0x40000000 /* Issue DISCARD requests */ | ||
323 | |||
324 | #define clear_opt(o, opt) o &= ~EXT4_MOUNT_##opt | ||
325 | #define set_opt(o, opt) o |= EXT4_MOUNT_##opt | ||
326 | @@ -841,6 +896,7 @@ struct ext4_sb_info { | ||
327 | unsigned long s_gdb_count; /* Number of group descriptor blocks */ | ||
328 | unsigned long s_desc_per_block; /* Number of group descriptors per block */ | ||
329 | ext4_group_t s_groups_count; /* Number of groups in the fs */ | ||
330 | + ext4_group_t s_blockfile_groups;/* Groups acceptable for non-extent files */ | ||
331 | unsigned long s_overhead_last; /* Last calculated overhead */ | ||
332 | unsigned long s_blocks_last; /* Last seen block count */ | ||
333 | loff_t s_bitmap_maxbytes; /* max bytes for bitmap files */ | ||
334 | @@ -923,6 +979,7 @@ struct ext4_sb_info { | ||
335 | unsigned int s_mb_stats; | ||
336 | unsigned int s_mb_order2_reqs; | ||
337 | unsigned int s_mb_group_prealloc; | ||
338 | + unsigned int s_max_writeback_mb_bump; | ||
339 | /* where last allocation was done - for stream allocation */ | ||
340 | unsigned long s_mb_last_group; | ||
341 | unsigned long s_mb_last_start; | ||
342 | @@ -950,6 +1007,7 @@ struct ext4_sb_info { | ||
343 | atomic_t s_mb_lost_chunks; | ||
344 | atomic_t s_mb_preallocated; | ||
345 | atomic_t s_mb_discarded; | ||
346 | + atomic_t s_lock_busy; | ||
347 | |||
348 | /* locality groups */ | ||
349 | struct ext4_locality_group *s_locality_groups; | ||
350 | @@ -960,6 +1018,9 @@ struct ext4_sb_info { | ||
351 | |||
352 | unsigned int s_log_groups_per_flex; | ||
353 | struct flex_groups *s_flex_groups; | ||
354 | + | ||
355 | + /* workqueue for dio unwritten */ | ||
356 | + struct workqueue_struct *dio_unwritten_wq; | ||
357 | }; | ||
358 | |||
359 | static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb) | ||
360 | @@ -1367,6 +1428,7 @@ extern int ext4_change_inode_journal_flag(struct inode *, int); | ||
361 | extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *); | ||
362 | extern int ext4_can_truncate(struct inode *inode); | ||
363 | extern void ext4_truncate(struct inode *); | ||
364 | +extern int ext4_truncate_restart_trans(handle_t *, struct inode *, int nblocks); | ||
365 | extern void ext4_set_inode_flags(struct inode *); | ||
366 | extern void ext4_get_inode_flags(struct ext4_inode_info *); | ||
367 | extern int ext4_alloc_da_blocks(struct inode *inode); | ||
368 | @@ -1378,7 +1440,7 @@ extern int ext4_block_truncate_page(handle_t *handle, | ||
369 | struct address_space *mapping, loff_t from); | ||
370 | extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); | ||
371 | extern qsize_t ext4_get_reserved_space(struct inode *inode); | ||
372 | - | ||
373 | +extern int flush_aio_dio_completed_IO(struct inode *inode); | ||
374 | /* ioctl.c */ | ||
375 | extern long ext4_ioctl(struct file *, unsigned int, unsigned long); | ||
376 | extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long); | ||
377 | @@ -1591,15 +1653,42 @@ struct ext4_group_info { | ||
378 | #define EXT4_MB_GRP_NEED_INIT(grp) \ | ||
379 | (test_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &((grp)->bb_state))) | ||
380 | |||
381 | +#define EXT4_MAX_CONTENTION 8 | ||
382 | +#define EXT4_CONTENTION_THRESHOLD 2 | ||
383 | + | ||
384 | static inline spinlock_t *ext4_group_lock_ptr(struct super_block *sb, | ||
385 | ext4_group_t group) | ||
386 | { | ||
387 | return bgl_lock_ptr(EXT4_SB(sb)->s_blockgroup_lock, group); | ||
388 | } | ||
389 | |||
390 | +/* | ||
391 | + * Returns true if the filesystem is busy enough that attempts to | ||
392 | + * access the block group locks has run into contention. | ||
393 | + */ | ||
394 | +static inline int ext4_fs_is_busy(struct ext4_sb_info *sbi) | ||
395 | +{ | ||
396 | + return (atomic_read(&sbi->s_lock_busy) > EXT4_CONTENTION_THRESHOLD); | ||
397 | +} | ||
398 | + | ||
399 | static inline void ext4_lock_group(struct super_block *sb, ext4_group_t group) | ||
400 | { | ||
401 | - spin_lock(ext4_group_lock_ptr(sb, group)); | ||
402 | + spinlock_t *lock = ext4_group_lock_ptr(sb, group); | ||
403 | + if (spin_trylock(lock)) | ||
404 | + /* | ||
405 | + * We're able to grab the lock right away, so drop the | ||
406 | + * lock contention counter. | ||
407 | + */ | ||
408 | + atomic_add_unless(&EXT4_SB(sb)->s_lock_busy, -1, 0); | ||
409 | + else { | ||
410 | + /* | ||
411 | + * The lock is busy, so bump the contention counter, | ||
412 | + * and then wait on the spin lock. | ||
413 | + */ | ||
414 | + atomic_add_unless(&EXT4_SB(sb)->s_lock_busy, 1, | ||
415 | + EXT4_MAX_CONTENTION); | ||
416 | + spin_lock(lock); | ||
417 | + } | ||
418 | } | ||
419 | |||
420 | static inline void ext4_unlock_group(struct super_block *sb, | ||
421 | @@ -1650,6 +1739,8 @@ extern void ext4_ext_init(struct super_block *); | ||
422 | extern void ext4_ext_release(struct super_block *); | ||
423 | extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset, | ||
424 | loff_t len); | ||
425 | +extern int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset, | ||
426 | + loff_t len); | ||
427 | extern int ext4_get_blocks(handle_t *handle, struct inode *inode, | ||
428 | sector_t block, unsigned int max_blocks, | ||
429 | struct buffer_head *bh, int flags); | ||
430 | diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h | ||
431 | index 20a8410..1c2db3f 100644 | ||
432 | --- a/fs/ext4/ext4_extents.h | ||
433 | +++ b/fs/ext4/ext4_extents.h | ||
434 | @@ -220,6 +220,11 @@ static inline int ext4_ext_get_actual_len(struct ext4_extent *ext) | ||
435 | (le16_to_cpu(ext->ee_len) - EXT_INIT_MAX_LEN)); | ||
436 | } | ||
437 | |||
438 | +static inline void ext4_ext_mark_initialized(struct ext4_extent *ext) | ||
439 | +{ | ||
440 | + ext->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ext)); | ||
441 | +} | ||
442 | + | ||
443 | extern int ext4_ext_calc_metadata_amount(struct inode *inode, int blocks); | ||
444 | extern ext4_fsblk_t ext_pblock(struct ext4_extent *ex); | ||
445 | extern ext4_fsblk_t idx_pblock(struct ext4_extent_idx *); | ||
446 | @@ -235,7 +240,7 @@ extern int ext4_ext_try_to_merge(struct inode *inode, | ||
447 | struct ext4_ext_path *path, | ||
448 | struct ext4_extent *); | ||
449 | extern unsigned int ext4_ext_check_overlap(struct inode *, struct ext4_extent *, struct ext4_ext_path *); | ||
450 | -extern int ext4_ext_insert_extent(handle_t *, struct inode *, struct ext4_ext_path *, struct ext4_extent *); | ||
451 | +extern int ext4_ext_insert_extent(handle_t *, struct inode *, struct ext4_ext_path *, struct ext4_extent *, int); | ||
452 | extern int ext4_ext_walk_space(struct inode *, ext4_lblk_t, ext4_lblk_t, | ||
453 | ext_prepare_callback, void *); | ||
454 | extern struct ext4_ext_path *ext4_ext_find_extent(struct inode *, ext4_lblk_t, | ||
455 | diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c | ||
456 | index eb27fd0..6a94099 100644 | ||
457 | --- a/fs/ext4/ext4_jbd2.c | ||
458 | +++ b/fs/ext4/ext4_jbd2.c | ||
459 | @@ -44,7 +44,7 @@ int __ext4_journal_forget(const char *where, handle_t *handle, | ||
460 | handle, err); | ||
461 | } | ||
462 | else | ||
463 | - brelse(bh); | ||
464 | + bforget(bh); | ||
465 | return err; | ||
466 | } | ||
467 | |||
468 | @@ -60,7 +60,7 @@ int __ext4_journal_revoke(const char *where, handle_t *handle, | ||
469 | handle, err); | ||
470 | } | ||
471 | else | ||
472 | - brelse(bh); | ||
473 | + bforget(bh); | ||
474 | return err; | ||
475 | } | ||
476 | |||
477 | @@ -89,7 +89,10 @@ int __ext4_handle_dirty_metadata(const char *where, handle_t *handle, | ||
478 | ext4_journal_abort_handle(where, __func__, bh, | ||
479 | handle, err); | ||
480 | } else { | ||
481 | - mark_buffer_dirty(bh); | ||
482 | + if (inode && bh) | ||
483 | + mark_buffer_dirty_inode(bh, inode); | ||
484 | + else | ||
485 | + mark_buffer_dirty(bh); | ||
486 | if (inode && inode_needs_sync(inode)) { | ||
487 | sync_dirty_buffer(bh); | ||
488 | if (buffer_req(bh) && !buffer_uptodate(bh)) { | ||
489 | diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h | ||
490 | index 139fb8c..1892a77 100644 | ||
491 | --- a/fs/ext4/ext4_jbd2.h | ||
492 | +++ b/fs/ext4/ext4_jbd2.h | ||
493 | @@ -49,7 +49,7 @@ | ||
494 | |||
495 | #define EXT4_DATA_TRANS_BLOCKS(sb) (EXT4_SINGLEDATA_TRANS_BLOCKS(sb) + \ | ||
496 | EXT4_XATTR_TRANS_BLOCKS - 2 + \ | ||
497 | - 2*EXT4_QUOTA_TRANS_BLOCKS(sb)) | ||
498 | + EXT4_MAXQUOTAS_TRANS_BLOCKS(sb)) | ||
499 | |||
500 | /* | ||
501 | * Define the number of metadata blocks we need to account to modify data. | ||
502 | @@ -57,7 +57,7 @@ | ||
503 | * This include super block, inode block, quota blocks and xattr blocks | ||
504 | */ | ||
505 | #define EXT4_META_TRANS_BLOCKS(sb) (EXT4_XATTR_TRANS_BLOCKS + \ | ||
506 | - 2*EXT4_QUOTA_TRANS_BLOCKS(sb)) | ||
507 | + EXT4_MAXQUOTAS_TRANS_BLOCKS(sb)) | ||
508 | |||
509 | /* Delete operations potentially hit one directory's namespace plus an | ||
510 | * entire inode, plus arbitrary amounts of bitmap/indirection data. Be | ||
511 | @@ -92,6 +92,7 @@ | ||
512 | * but inode, sb and group updates are done only once */ | ||
513 | #define EXT4_QUOTA_INIT_BLOCKS(sb) (test_opt(sb, QUOTA) ? (DQUOT_INIT_ALLOC*\ | ||
514 | (EXT4_SINGLEDATA_TRANS_BLOCKS(sb)-3)+3+DQUOT_INIT_REWRITE) : 0) | ||
515 | + | ||
516 | #define EXT4_QUOTA_DEL_BLOCKS(sb) (test_opt(sb, QUOTA) ? (DQUOT_DEL_ALLOC*\ | ||
517 | (EXT4_SINGLEDATA_TRANS_BLOCKS(sb)-3)+3+DQUOT_DEL_REWRITE) : 0) | ||
518 | #else | ||
519 | @@ -99,6 +100,9 @@ | ||
520 | #define EXT4_QUOTA_INIT_BLOCKS(sb) 0 | ||
521 | #define EXT4_QUOTA_DEL_BLOCKS(sb) 0 | ||
522 | #endif | ||
523 | +#define EXT4_MAXQUOTAS_TRANS_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_TRANS_BLOCKS(sb)) | ||
524 | +#define EXT4_MAXQUOTAS_INIT_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_INIT_BLOCKS(sb)) | ||
525 | +#define EXT4_MAXQUOTAS_DEL_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_DEL_BLOCKS(sb)) | ||
526 | |||
527 | int | ||
528 | ext4_mark_iloc_dirty(handle_t *handle, | ||
529 | @@ -161,11 +165,13 @@ int __ext4_handle_dirty_metadata(const char *where, handle_t *handle, | ||
530 | handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks); | ||
531 | int __ext4_journal_stop(const char *where, handle_t *handle); | ||
532 | |||
533 | -#define EXT4_NOJOURNAL_HANDLE ((handle_t *) 0x1) | ||
534 | +#define EXT4_NOJOURNAL_MAX_REF_COUNT ((unsigned long) 4096) | ||
535 | |||
536 | +/* Note: Do not use this for NULL handles. This is only to determine if | ||
537 | + * a properly allocated handle is using a journal or not. */ | ||
538 | static inline int ext4_handle_valid(handle_t *handle) | ||
539 | { | ||
540 | - if (handle == EXT4_NOJOURNAL_HANDLE) | ||
541 | + if ((unsigned long)handle < EXT4_NOJOURNAL_MAX_REF_COUNT) | ||
542 | return 0; | ||
543 | return 1; | ||
544 | } | ||
545 | @@ -252,6 +258,19 @@ static inline int ext4_jbd2_file_inode(handle_t *handle, struct inode *inode) | ||
546 | return 0; | ||
547 | } | ||
548 | |||
549 | +static inline void ext4_update_inode_fsync_trans(handle_t *handle, | ||
550 | + struct inode *inode, | ||
551 | + int datasync) | ||
552 | +{ | ||
553 | + struct ext4_inode_info *ei = EXT4_I(inode); | ||
554 | + | ||
555 | + if (ext4_handle_valid(handle)) { | ||
556 | + ei->i_sync_tid = handle->h_transaction->t_tid; | ||
557 | + if (datasync) | ||
558 | + ei->i_datasync_tid = handle->h_transaction->t_tid; | ||
559 | + } | ||
560 | +} | ||
561 | + | ||
562 | /* super.c */ | ||
563 | int ext4_force_commit(struct super_block *sb); | ||
564 | |||
565 | diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c | ||
566 | index 73ebfb4..24fb20b 100644 | ||
567 | --- a/fs/ext4/extents.c | ||
568 | +++ b/fs/ext4/extents.c | ||
569 | @@ -93,7 +93,9 @@ static void ext4_idx_store_pblock(struct ext4_extent_idx *ix, ext4_fsblk_t pb) | ||
570 | ix->ei_leaf_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & 0xffff); | ||
571 | } | ||
572 | |||
573 | -static int ext4_ext_journal_restart(handle_t *handle, int needed) | ||
574 | +static int ext4_ext_truncate_extend_restart(handle_t *handle, | ||
575 | + struct inode *inode, | ||
576 | + int needed) | ||
577 | { | ||
578 | int err; | ||
579 | |||
580 | @@ -104,7 +106,14 @@ static int ext4_ext_journal_restart(handle_t *handle, int needed) | ||
581 | err = ext4_journal_extend(handle, needed); | ||
582 | if (err <= 0) | ||
583 | return err; | ||
584 | - return ext4_journal_restart(handle, needed); | ||
585 | + err = ext4_truncate_restart_trans(handle, inode, needed); | ||
586 | + /* | ||
587 | + * We have dropped i_data_sem so someone might have cached again | ||
588 | + * an extent we are going to truncate. | ||
589 | + */ | ||
590 | + ext4_ext_invalidate_cache(inode); | ||
591 | + | ||
592 | + return err; | ||
593 | } | ||
594 | |||
595 | /* | ||
596 | @@ -701,7 +710,7 @@ err: | ||
597 | * insert new index [@logical;@ptr] into the block at @curp; | ||
598 | * check where to insert: before @curp or after @curp | ||
599 | */ | ||
600 | -static int ext4_ext_insert_index(handle_t *handle, struct inode *inode, | ||
601 | +int ext4_ext_insert_index(handle_t *handle, struct inode *inode, | ||
602 | struct ext4_ext_path *curp, | ||
603 | int logical, ext4_fsblk_t ptr) | ||
604 | { | ||
605 | @@ -1563,7 +1572,7 @@ out: | ||
606 | */ | ||
607 | int ext4_ext_insert_extent(handle_t *handle, struct inode *inode, | ||
608 | struct ext4_ext_path *path, | ||
609 | - struct ext4_extent *newext) | ||
610 | + struct ext4_extent *newext, int flag) | ||
611 | { | ||
612 | struct ext4_extent_header *eh; | ||
613 | struct ext4_extent *ex, *fex; | ||
614 | @@ -1579,7 +1588,8 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode, | ||
615 | BUG_ON(path[depth].p_hdr == NULL); | ||
616 | |||
617 | /* try to insert block into found extent and return */ | ||
618 | - if (ex && ext4_can_extents_be_merged(inode, ex, newext)) { | ||
619 | + if (ex && (flag != EXT4_GET_BLOCKS_DIO_CREATE_EXT) | ||
620 | + && ext4_can_extents_be_merged(inode, ex, newext)) { | ||
621 | ext_debug("append %d block to %d:%d (from %llu)\n", | ||
622 | ext4_ext_get_actual_len(newext), | ||
623 | le32_to_cpu(ex->ee_block), | ||
624 | @@ -1694,7 +1704,8 @@ has_space: | ||
625 | |||
626 | merge: | ||
627 | /* try to merge extents to the right */ | ||
628 | - ext4_ext_try_to_merge(inode, path, nearex); | ||
629 | + if (flag != EXT4_GET_BLOCKS_DIO_CREATE_EXT) | ||
630 | + ext4_ext_try_to_merge(inode, path, nearex); | ||
631 | |||
632 | /* try to merge extents to the left */ | ||
633 | |||
634 | @@ -1731,7 +1742,9 @@ int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block, | ||
635 | while (block < last && block != EXT_MAX_BLOCK) { | ||
636 | num = last - block; | ||
637 | /* find extent for this block */ | ||
638 | + down_read(&EXT4_I(inode)->i_data_sem); | ||
639 | path = ext4_ext_find_extent(inode, block, path); | ||
640 | + up_read(&EXT4_I(inode)->i_data_sem); | ||
641 | if (IS_ERR(path)) { | ||
642 | err = PTR_ERR(path); | ||
643 | path = NULL; | ||
644 | @@ -2044,7 +2057,7 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode, | ||
645 | ext_debug("free last %u blocks starting %llu\n", num, start); | ||
646 | for (i = 0; i < num; i++) { | ||
647 | bh = sb_find_get_block(inode->i_sb, start + i); | ||
648 | - ext4_forget(handle, 0, inode, bh, start + i); | ||
649 | + ext4_forget(handle, metadata, inode, bh, start + i); | ||
650 | } | ||
651 | ext4_free_blocks(handle, inode, start, num, metadata); | ||
652 | } else if (from == le32_to_cpu(ex->ee_block) | ||
653 | @@ -2136,9 +2149,9 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, | ||
654 | correct_index = 1; | ||
655 | credits += (ext_depth(inode)) + 1; | ||
656 | } | ||
657 | - credits += 2 * EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb); | ||
658 | + credits += EXT4_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb); | ||
659 | |||
660 | - err = ext4_ext_journal_restart(handle, credits); | ||
661 | + err = ext4_ext_truncate_extend_restart(handle, inode, credits); | ||
662 | if (err) | ||
663 | goto out; | ||
664 | |||
665 | @@ -2461,7 +2474,6 @@ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex) | ||
666 | } | ||
667 | |||
668 | #define EXT4_EXT_ZERO_LEN 7 | ||
669 | - | ||
670 | /* | ||
671 | * This function is called by ext4_ext_get_blocks() if someone tries to write | ||
672 | * to an uninitialized extent. It may result in splitting the uninitialized | ||
673 | @@ -2554,7 +2566,8 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, | ||
674 | ex3->ee_block = cpu_to_le32(iblock); | ||
675 | ext4_ext_store_pblock(ex3, newblock); | ||
676 | ex3->ee_len = cpu_to_le16(allocated); | ||
677 | - err = ext4_ext_insert_extent(handle, inode, path, ex3); | ||
678 | + err = ext4_ext_insert_extent(handle, inode, path, | ||
679 | + ex3, 0); | ||
680 | if (err == -ENOSPC) { | ||
681 | err = ext4_ext_zeroout(inode, &orig_ex); | ||
682 | if (err) | ||
683 | @@ -2610,7 +2623,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, | ||
684 | ext4_ext_store_pblock(ex3, newblock + max_blocks); | ||
685 | ex3->ee_len = cpu_to_le16(allocated - max_blocks); | ||
686 | ext4_ext_mark_uninitialized(ex3); | ||
687 | - err = ext4_ext_insert_extent(handle, inode, path, ex3); | ||
688 | + err = ext4_ext_insert_extent(handle, inode, path, ex3, 0); | ||
689 | if (err == -ENOSPC) { | ||
690 | err = ext4_ext_zeroout(inode, &orig_ex); | ||
691 | if (err) | ||
692 | @@ -2728,7 +2741,191 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, | ||
693 | err = ext4_ext_dirty(handle, inode, path + depth); | ||
694 | goto out; | ||
695 | insert: | ||
696 | - err = ext4_ext_insert_extent(handle, inode, path, &newex); | ||
697 | + err = ext4_ext_insert_extent(handle, inode, path, &newex, 0); | ||
698 | + if (err == -ENOSPC) { | ||
699 | + err = ext4_ext_zeroout(inode, &orig_ex); | ||
700 | + if (err) | ||
701 | + goto fix_extent_len; | ||
702 | + /* update the extent length and mark as initialized */ | ||
703 | + ex->ee_block = orig_ex.ee_block; | ||
704 | + ex->ee_len = orig_ex.ee_len; | ||
705 | + ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); | ||
706 | + ext4_ext_dirty(handle, inode, path + depth); | ||
707 | + /* zero out the first half */ | ||
708 | + return allocated; | ||
709 | + } else if (err) | ||
710 | + goto fix_extent_len; | ||
711 | +out: | ||
712 | + return err ? err : allocated; | ||
713 | + | ||
714 | +fix_extent_len: | ||
715 | + ex->ee_block = orig_ex.ee_block; | ||
716 | + ex->ee_len = orig_ex.ee_len; | ||
717 | + ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); | ||
718 | + ext4_ext_mark_uninitialized(ex); | ||
719 | + ext4_ext_dirty(handle, inode, path + depth); | ||
720 | + return err; | ||
721 | +} | ||
722 | + | ||
723 | +/* | ||
724 | + * This function is called by ext4_ext_get_blocks() from | ||
725 | + * ext4_get_blocks_dio_write() when DIO to write | ||
726 | + * to an uninitialized extent. | ||
727 | + * | ||
728 | + * Writing to an uninitized extent may result in splitting the uninitialized | ||
729 | + * extent into multiple /intialized unintialized extents (up to three) | ||
730 | + * There are three possibilities: | ||
731 | + * a> There is no split required: Entire extent should be uninitialized | ||
732 | + * b> Splits in two extents: Write is happening at either end of the extent | ||
733 | + * c> Splits in three extents: Somone is writing in middle of the extent | ||
734 | + * | ||
735 | + * One of more index blocks maybe needed if the extent tree grow after | ||
736 | + * the unintialized extent split. To prevent ENOSPC occur at the IO | ||
737 | + * complete, we need to split the uninitialized extent before DIO submit | ||
738 | + * the IO. The uninitilized extent called at this time will be split | ||
739 | + * into three uninitialized extent(at most). After IO complete, the part | ||
740 | + * being filled will be convert to initialized by the end_io callback function | ||
741 | + * via ext4_convert_unwritten_extents(). | ||
742 | + * | ||
743 | + * Returns the size of uninitialized extent to be written on success. | ||
744 | + */ | ||
745 | +static int ext4_split_unwritten_extents(handle_t *handle, | ||
746 | + struct inode *inode, | ||
747 | + struct ext4_ext_path *path, | ||
748 | + ext4_lblk_t iblock, | ||
749 | + unsigned int max_blocks, | ||
750 | + int flags) | ||
751 | +{ | ||
752 | + struct ext4_extent *ex, newex, orig_ex; | ||
753 | + struct ext4_extent *ex1 = NULL; | ||
754 | + struct ext4_extent *ex2 = NULL; | ||
755 | + struct ext4_extent *ex3 = NULL; | ||
756 | + struct ext4_extent_header *eh; | ||
757 | + ext4_lblk_t ee_block; | ||
758 | + unsigned int allocated, ee_len, depth; | ||
759 | + ext4_fsblk_t newblock; | ||
760 | + int err = 0; | ||
761 | + | ||
762 | + ext_debug("ext4_split_unwritten_extents: inode %lu," | ||
763 | + "iblock %llu, max_blocks %u\n", inode->i_ino, | ||
764 | + (unsigned long long)iblock, max_blocks); | ||
765 | + depth = ext_depth(inode); | ||
766 | + eh = path[depth].p_hdr; | ||
767 | + ex = path[depth].p_ext; | ||
768 | + ee_block = le32_to_cpu(ex->ee_block); | ||
769 | + ee_len = ext4_ext_get_actual_len(ex); | ||
770 | + allocated = ee_len - (iblock - ee_block); | ||
771 | + newblock = iblock - ee_block + ext_pblock(ex); | ||
772 | + ex2 = ex; | ||
773 | + orig_ex.ee_block = ex->ee_block; | ||
774 | + orig_ex.ee_len = cpu_to_le16(ee_len); | ||
775 | + ext4_ext_store_pblock(&orig_ex, ext_pblock(ex)); | ||
776 | + | ||
777 | + /* | ||
778 | + * If the uninitialized extent begins at the same logical | ||
779 | + * block where the write begins, and the write completely | ||
780 | + * covers the extent, then we don't need to split it. | ||
781 | + */ | ||
782 | + if ((iblock == ee_block) && (allocated <= max_blocks)) | ||
783 | + return allocated; | ||
784 | + | ||
785 | + err = ext4_ext_get_access(handle, inode, path + depth); | ||
786 | + if (err) | ||
787 | + goto out; | ||
788 | + /* ex1: ee_block to iblock - 1 : uninitialized */ | ||
789 | + if (iblock > ee_block) { | ||
790 | + ex1 = ex; | ||
791 | + ex1->ee_len = cpu_to_le16(iblock - ee_block); | ||
792 | + ext4_ext_mark_uninitialized(ex1); | ||
793 | + ex2 = &newex; | ||
794 | + } | ||
795 | + /* | ||
796 | + * for sanity, update the length of the ex2 extent before | ||
797 | + * we insert ex3, if ex1 is NULL. This is to avoid temporary | ||
798 | + * overlap of blocks. | ||
799 | + */ | ||
800 | + if (!ex1 && allocated > max_blocks) | ||
801 | + ex2->ee_len = cpu_to_le16(max_blocks); | ||
802 | + /* ex3: to ee_block + ee_len : uninitialised */ | ||
803 | + if (allocated > max_blocks) { | ||
804 | + unsigned int newdepth; | ||
805 | + ex3 = &newex; | ||
806 | + ex3->ee_block = cpu_to_le32(iblock + max_blocks); | ||
807 | + ext4_ext_store_pblock(ex3, newblock + max_blocks); | ||
808 | + ex3->ee_len = cpu_to_le16(allocated - max_blocks); | ||
809 | + ext4_ext_mark_uninitialized(ex3); | ||
810 | + err = ext4_ext_insert_extent(handle, inode, path, ex3, flags); | ||
811 | + if (err == -ENOSPC) { | ||
812 | + err = ext4_ext_zeroout(inode, &orig_ex); | ||
813 | + if (err) | ||
814 | + goto fix_extent_len; | ||
815 | + /* update the extent length and mark as initialized */ | ||
816 | + ex->ee_block = orig_ex.ee_block; | ||
817 | + ex->ee_len = orig_ex.ee_len; | ||
818 | + ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); | ||
819 | + ext4_ext_dirty(handle, inode, path + depth); | ||
820 | + /* zeroed the full extent */ | ||
821 | + /* blocks available from iblock */ | ||
822 | + return allocated; | ||
823 | + | ||
824 | + } else if (err) | ||
825 | + goto fix_extent_len; | ||
826 | + /* | ||
827 | + * The depth, and hence eh & ex might change | ||
828 | + * as part of the insert above. | ||
829 | + */ | ||
830 | + newdepth = ext_depth(inode); | ||
831 | + /* | ||
832 | + * update the extent length after successful insert of the | ||
833 | + * split extent | ||
834 | + */ | ||
835 | + orig_ex.ee_len = cpu_to_le16(ee_len - | ||
836 | + ext4_ext_get_actual_len(ex3)); | ||
837 | + depth = newdepth; | ||
838 | + ext4_ext_drop_refs(path); | ||
839 | + path = ext4_ext_find_extent(inode, iblock, path); | ||
840 | + if (IS_ERR(path)) { | ||
841 | + err = PTR_ERR(path); | ||
842 | + goto out; | ||
843 | + } | ||
844 | + eh = path[depth].p_hdr; | ||
845 | + ex = path[depth].p_ext; | ||
846 | + if (ex2 != &newex) | ||
847 | + ex2 = ex; | ||
848 | + | ||
849 | + err = ext4_ext_get_access(handle, inode, path + depth); | ||
850 | + if (err) | ||
851 | + goto out; | ||
852 | + | ||
853 | + allocated = max_blocks; | ||
854 | + } | ||
855 | + /* | ||
856 | + * If there was a change of depth as part of the | ||
857 | + * insertion of ex3 above, we need to update the length | ||
858 | + * of the ex1 extent again here | ||
859 | + */ | ||
860 | + if (ex1 && ex1 != ex) { | ||
861 | + ex1 = ex; | ||
862 | + ex1->ee_len = cpu_to_le16(iblock - ee_block); | ||
863 | + ext4_ext_mark_uninitialized(ex1); | ||
864 | + ex2 = &newex; | ||
865 | + } | ||
866 | + /* | ||
867 | + * ex2: iblock to iblock + maxblocks-1 : to be direct IO written, | ||
868 | + * uninitialised still. | ||
869 | + */ | ||
870 | + ex2->ee_block = cpu_to_le32(iblock); | ||
871 | + ext4_ext_store_pblock(ex2, newblock); | ||
872 | + ex2->ee_len = cpu_to_le16(allocated); | ||
873 | + ext4_ext_mark_uninitialized(ex2); | ||
874 | + if (ex2 != ex) | ||
875 | + goto insert; | ||
876 | + /* Mark modified extent as dirty */ | ||
877 | + err = ext4_ext_dirty(handle, inode, path + depth); | ||
878 | + ext_debug("out here\n"); | ||
879 | + goto out; | ||
880 | +insert: | ||
881 | + err = ext4_ext_insert_extent(handle, inode, path, &newex, flags); | ||
882 | if (err == -ENOSPC) { | ||
883 | err = ext4_ext_zeroout(inode, &orig_ex); | ||
884 | if (err) | ||
885 | @@ -2743,6 +2940,7 @@ insert: | ||
886 | } else if (err) | ||
887 | goto fix_extent_len; | ||
888 | out: | ||
889 | + ext4_ext_show_leaf(inode, path); | ||
890 | return err ? err : allocated; | ||
891 | |||
892 | fix_extent_len: | ||
893 | @@ -2753,7 +2951,151 @@ fix_extent_len: | ||
894 | ext4_ext_dirty(handle, inode, path + depth); | ||
895 | return err; | ||
896 | } | ||
897 | +static int ext4_convert_unwritten_extents_dio(handle_t *handle, | ||
898 | + struct inode *inode, | ||
899 | + struct ext4_ext_path *path) | ||
900 | +{ | ||
901 | + struct ext4_extent *ex; | ||
902 | + struct ext4_extent_header *eh; | ||
903 | + int depth; | ||
904 | + int err = 0; | ||
905 | + int ret = 0; | ||
906 | + | ||
907 | + depth = ext_depth(inode); | ||
908 | + eh = path[depth].p_hdr; | ||
909 | + ex = path[depth].p_ext; | ||
910 | + | ||
911 | + err = ext4_ext_get_access(handle, inode, path + depth); | ||
912 | + if (err) | ||
913 | + goto out; | ||
914 | + /* first mark the extent as initialized */ | ||
915 | + ext4_ext_mark_initialized(ex); | ||
916 | + | ||
917 | + /* | ||
918 | + * We have to see if it can be merged with the extent | ||
919 | + * on the left. | ||
920 | + */ | ||
921 | + if (ex > EXT_FIRST_EXTENT(eh)) { | ||
922 | + /* | ||
923 | + * To merge left, pass "ex - 1" to try_to_merge(), | ||
924 | + * since it merges towards right _only_. | ||
925 | + */ | ||
926 | + ret = ext4_ext_try_to_merge(inode, path, ex - 1); | ||
927 | + if (ret) { | ||
928 | + err = ext4_ext_correct_indexes(handle, inode, path); | ||
929 | + if (err) | ||
930 | + goto out; | ||
931 | + depth = ext_depth(inode); | ||
932 | + ex--; | ||
933 | + } | ||
934 | + } | ||
935 | + /* | ||
936 | + * Try to Merge towards right. | ||
937 | + */ | ||
938 | + ret = ext4_ext_try_to_merge(inode, path, ex); | ||
939 | + if (ret) { | ||
940 | + err = ext4_ext_correct_indexes(handle, inode, path); | ||
941 | + if (err) | ||
942 | + goto out; | ||
943 | + depth = ext_depth(inode); | ||
944 | + } | ||
945 | + /* Mark modified extent as dirty */ | ||
946 | + err = ext4_ext_dirty(handle, inode, path + depth); | ||
947 | +out: | ||
948 | + ext4_ext_show_leaf(inode, path); | ||
949 | + return err; | ||
950 | +} | ||
951 | + | ||
952 | +static int | ||
953 | +ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, | ||
954 | + ext4_lblk_t iblock, unsigned int max_blocks, | ||
955 | + struct ext4_ext_path *path, int flags, | ||
956 | + unsigned int allocated, struct buffer_head *bh_result, | ||
957 | + ext4_fsblk_t newblock) | ||
958 | +{ | ||
959 | + int ret = 0; | ||
960 | + int err = 0; | ||
961 | + ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio; | ||
962 | + | ||
963 | + ext_debug("ext4_ext_handle_uninitialized_extents: inode %lu, logical" | ||
964 | + "block %llu, max_blocks %u, flags %d, allocated %u", | ||
965 | + inode->i_ino, (unsigned long long)iblock, max_blocks, | ||
966 | + flags, allocated); | ||
967 | + ext4_ext_show_leaf(inode, path); | ||
968 | |||
969 | + /* DIO get_block() before submit the IO, split the extent */ | ||
970 | + if (flags == EXT4_GET_BLOCKS_DIO_CREATE_EXT) { | ||
971 | + ret = ext4_split_unwritten_extents(handle, | ||
972 | + inode, path, iblock, | ||
973 | + max_blocks, flags); | ||
974 | + /* | ||
975 | + * Flag the inode(non aio case) or end_io struct (aio case) | ||
976 | + * that this IO needs to convertion to written when IO is | ||
977 | + * completed | ||
978 | + */ | ||
979 | + if (io) | ||
980 | + io->flag = DIO_AIO_UNWRITTEN; | ||
981 | + else | ||
982 | + EXT4_I(inode)->i_state |= EXT4_STATE_DIO_UNWRITTEN; | ||
983 | + goto out; | ||
984 | + } | ||
985 | + /* async DIO end_io complete, convert the filled extent to written */ | ||
986 | + if (flags == EXT4_GET_BLOCKS_DIO_CONVERT_EXT) { | ||
987 | + ret = ext4_convert_unwritten_extents_dio(handle, inode, | ||
988 | + path); | ||
989 | + if (ret >= 0) | ||
990 | + ext4_update_inode_fsync_trans(handle, inode, 1); | ||
991 | + goto out2; | ||
992 | + } | ||
993 | + /* buffered IO case */ | ||
994 | + /* | ||
995 | + * repeat fallocate creation request | ||
996 | + * we already have an unwritten extent | ||
997 | + */ | ||
998 | + if (flags & EXT4_GET_BLOCKS_UNINIT_EXT) | ||
999 | + goto map_out; | ||
1000 | + | ||
1001 | + /* buffered READ or buffered write_begin() lookup */ | ||
1002 | + if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) { | ||
1003 | + /* | ||
1004 | + * We have blocks reserved already. We | ||
1005 | + * return allocated blocks so that delalloc | ||
1006 | + * won't do block reservation for us. But | ||
1007 | + * the buffer head will be unmapped so that | ||
1008 | + * a read from the block returns 0s. | ||
1009 | + */ | ||
1010 | + set_buffer_unwritten(bh_result); | ||
1011 | + goto out1; | ||
1012 | + } | ||
1013 | + | ||
1014 | + /* buffered write, writepage time, convert*/ | ||
1015 | + ret = ext4_ext_convert_to_initialized(handle, inode, | ||
1016 | + path, iblock, | ||
1017 | + max_blocks); | ||
1018 | + if (ret >= 0) | ||
1019 | + ext4_update_inode_fsync_trans(handle, inode, 1); | ||
1020 | +out: | ||
1021 | + if (ret <= 0) { | ||
1022 | + err = ret; | ||
1023 | + goto out2; | ||
1024 | + } else | ||
1025 | + allocated = ret; | ||
1026 | + set_buffer_new(bh_result); | ||
1027 | +map_out: | ||
1028 | + set_buffer_mapped(bh_result); | ||
1029 | +out1: | ||
1030 | + if (allocated > max_blocks) | ||
1031 | + allocated = max_blocks; | ||
1032 | + ext4_ext_show_leaf(inode, path); | ||
1033 | + bh_result->b_bdev = inode->i_sb->s_bdev; | ||
1034 | + bh_result->b_blocknr = newblock; | ||
1035 | +out2: | ||
1036 | + if (path) { | ||
1037 | + ext4_ext_drop_refs(path); | ||
1038 | + kfree(path); | ||
1039 | + } | ||
1040 | + return err ? err : allocated; | ||
1041 | +} | ||
1042 | /* | ||
1043 | * Block allocation/map/preallocation routine for extents based files | ||
1044 | * | ||
1045 | @@ -2784,6 +3126,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, | ||
1046 | int err = 0, depth, ret, cache_type; | ||
1047 | unsigned int allocated = 0; | ||
1048 | struct ext4_allocation_request ar; | ||
1049 | + ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio; | ||
1050 | |||
1051 | __clear_bit(BH_New, &bh_result->b_state); | ||
1052 | ext_debug("blocks %u/%u requested for inode %u\n", | ||
1053 | @@ -2859,33 +3202,10 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, | ||
1054 | EXT4_EXT_CACHE_EXTENT); | ||
1055 | goto out; | ||
1056 | } | ||
1057 | - if (flags & EXT4_GET_BLOCKS_UNINIT_EXT) | ||
1058 | - goto out; | ||
1059 | - if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) { | ||
1060 | - if (allocated > max_blocks) | ||
1061 | - allocated = max_blocks; | ||
1062 | - /* | ||
1063 | - * We have blocks reserved already. We | ||
1064 | - * return allocated blocks so that delalloc | ||
1065 | - * won't do block reservation for us. But | ||
1066 | - * the buffer head will be unmapped so that | ||
1067 | - * a read from the block returns 0s. | ||
1068 | - */ | ||
1069 | - set_buffer_unwritten(bh_result); | ||
1070 | - bh_result->b_bdev = inode->i_sb->s_bdev; | ||
1071 | - bh_result->b_blocknr = newblock; | ||
1072 | - goto out2; | ||
1073 | - } | ||
1074 | - | ||
1075 | - ret = ext4_ext_convert_to_initialized(handle, inode, | ||
1076 | - path, iblock, | ||
1077 | - max_blocks); | ||
1078 | - if (ret <= 0) { | ||
1079 | - err = ret; | ||
1080 | - goto out2; | ||
1081 | - } else | ||
1082 | - allocated = ret; | ||
1083 | - goto outnew; | ||
1084 | + ret = ext4_ext_handle_uninitialized_extents(handle, | ||
1085 | + inode, iblock, max_blocks, path, | ||
1086 | + flags, allocated, bh_result, newblock); | ||
1087 | + return ret; | ||
1088 | } | ||
1089 | } | ||
1090 | |||
1091 | @@ -2956,9 +3276,27 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, | ||
1092 | /* try to insert new extent into found leaf and return */ | ||
1093 | ext4_ext_store_pblock(&newex, newblock); | ||
1094 | newex.ee_len = cpu_to_le16(ar.len); | ||
1095 | - if (flags & EXT4_GET_BLOCKS_UNINIT_EXT) /* Mark uninitialized */ | ||
1096 | + /* Mark uninitialized */ | ||
1097 | + if (flags & EXT4_GET_BLOCKS_UNINIT_EXT){ | ||
1098 | ext4_ext_mark_uninitialized(&newex); | ||
1099 | - err = ext4_ext_insert_extent(handle, inode, path, &newex); | ||
1100 | + /* | ||
1101 | + * io_end structure was created for every async | ||
1102 | + * direct IO write to the middle of the file. | ||
1103 | + * To avoid unecessary convertion for every aio dio rewrite | ||
1104 | + * to the mid of file, here we flag the IO that is really | ||
1105 | + * need the convertion. | ||
1106 | + * For non asycn direct IO case, flag the inode state | ||
1107 | + * that we need to perform convertion when IO is done. | ||
1108 | + */ | ||
1109 | + if (flags == EXT4_GET_BLOCKS_DIO_CREATE_EXT) { | ||
1110 | + if (io) | ||
1111 | + io->flag = DIO_AIO_UNWRITTEN; | ||
1112 | + else | ||
1113 | + EXT4_I(inode)->i_state |= | ||
1114 | + EXT4_STATE_DIO_UNWRITTEN;; | ||
1115 | + } | ||
1116 | + } | ||
1117 | + err = ext4_ext_insert_extent(handle, inode, path, &newex, flags); | ||
1118 | if (err) { | ||
1119 | /* free data blocks we just allocated */ | ||
1120 | /* not a good idea to call discard here directly, | ||
1121 | @@ -2972,13 +3310,18 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, | ||
1122 | /* previous routine could use block we allocated */ | ||
1123 | newblock = ext_pblock(&newex); | ||
1124 | allocated = ext4_ext_get_actual_len(&newex); | ||
1125 | -outnew: | ||
1126 | set_buffer_new(bh_result); | ||
1127 | |||
1128 | - /* Cache only when it is _not_ an uninitialized extent */ | ||
1129 | - if ((flags & EXT4_GET_BLOCKS_UNINIT_EXT) == 0) | ||
1130 | + /* | ||
1131 | + * Cache the extent and update transaction to commit on fdatasync only | ||
1132 | + * when it is _not_ an uninitialized extent. | ||
1133 | + */ | ||
1134 | + if ((flags & EXT4_GET_BLOCKS_UNINIT_EXT) == 0) { | ||
1135 | ext4_ext_put_in_cache(inode, iblock, allocated, newblock, | ||
1136 | EXT4_EXT_CACHE_EXTENT); | ||
1137 | + ext4_update_inode_fsync_trans(handle, inode, 1); | ||
1138 | + } else | ||
1139 | + ext4_update_inode_fsync_trans(handle, inode, 0); | ||
1140 | out: | ||
1141 | if (allocated > max_blocks) | ||
1142 | allocated = max_blocks; | ||
1143 | @@ -3171,6 +3514,64 @@ retry: | ||
1144 | } | ||
1145 | |||
1146 | /* | ||
1147 | + * This function convert a range of blocks to written extents | ||
1148 | + * The caller of this function will pass the start offset and the size. | ||
1149 | + * all unwritten extents within this range will be converted to | ||
1150 | + * written extents. | ||
1151 | + * | ||
1152 | + * This function is called from the direct IO end io call back | ||
1153 | + * function, to convert the fallocated extents after IO is completed. | ||
1154 | + * Returns 0 on success. | ||
1155 | + */ | ||
1156 | +int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset, | ||
1157 | + loff_t len) | ||
1158 | +{ | ||
1159 | + handle_t *handle; | ||
1160 | + ext4_lblk_t block; | ||
1161 | + unsigned int max_blocks; | ||
1162 | + int ret = 0; | ||
1163 | + int ret2 = 0; | ||
1164 | + struct buffer_head map_bh; | ||
1165 | + unsigned int credits, blkbits = inode->i_blkbits; | ||
1166 | + | ||
1167 | + block = offset >> blkbits; | ||
1168 | + /* | ||
1169 | + * We can't just convert len to max_blocks because | ||
1170 | + * If blocksize = 4096 offset = 3072 and len = 2048 | ||
1171 | + */ | ||
1172 | + max_blocks = (EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits) | ||
1173 | + - block; | ||
1174 | + /* | ||
1175 | + * credits to insert 1 extent into extent tree | ||
1176 | + */ | ||
1177 | + credits = ext4_chunk_trans_blocks(inode, max_blocks); | ||
1178 | + while (ret >= 0 && ret < max_blocks) { | ||
1179 | + block = block + ret; | ||
1180 | + max_blocks = max_blocks - ret; | ||
1181 | + handle = ext4_journal_start(inode, credits); | ||
1182 | + if (IS_ERR(handle)) { | ||
1183 | + ret = PTR_ERR(handle); | ||
1184 | + break; | ||
1185 | + } | ||
1186 | + map_bh.b_state = 0; | ||
1187 | + ret = ext4_get_blocks(handle, inode, block, | ||
1188 | + max_blocks, &map_bh, | ||
1189 | + EXT4_GET_BLOCKS_DIO_CONVERT_EXT); | ||
1190 | + if (ret <= 0) { | ||
1191 | + WARN_ON(ret <= 0); | ||
1192 | + printk(KERN_ERR "%s: ext4_ext_get_blocks " | ||
1193 | + "returned error inode#%lu, block=%u, " | ||
1194 | + "max_blocks=%u", __func__, | ||
1195 | + inode->i_ino, block, max_blocks); | ||
1196 | + } | ||
1197 | + ext4_mark_inode_dirty(handle, inode); | ||
1198 | + ret2 = ext4_journal_stop(handle); | ||
1199 | + if (ret <= 0 || ret2 ) | ||
1200 | + break; | ||
1201 | + } | ||
1202 | + return ret > 0 ? ret2 : ret; | ||
1203 | +} | ||
1204 | +/* | ||
1205 | * Callback function called for each extent to gather FIEMAP information. | ||
1206 | */ | ||
1207 | static int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path, | ||
1208 | @@ -3308,10 +3709,8 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, | ||
1209 | * Walk the extent tree gathering extent information. | ||
1210 | * ext4_ext_fiemap_cb will push extents back to user. | ||
1211 | */ | ||
1212 | - down_read(&EXT4_I(inode)->i_data_sem); | ||
1213 | error = ext4_ext_walk_space(inode, start_blk, len_blks, | ||
1214 | ext4_ext_fiemap_cb, fieinfo); | ||
1215 | - up_read(&EXT4_I(inode)->i_data_sem); | ||
1216 | } | ||
1217 | |||
1218 | return error; | ||
1219 | diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c | ||
1220 | index 83cf641..d6049e4 100644 | ||
1221 | --- a/fs/ext4/fsync.c | ||
1222 | +++ b/fs/ext4/fsync.c | ||
1223 | @@ -44,27 +44,37 @@ | ||
1224 | * | ||
1225 | * What we do is just kick off a commit and wait on it. This will snapshot the | ||
1226 | * inode to disk. | ||
1227 | + * | ||
1228 | + * i_mutex lock is held when entering and exiting this function | ||
1229 | */ | ||
1230 | |||
1231 | int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync) | ||
1232 | { | ||
1233 | struct inode *inode = dentry->d_inode; | ||
1234 | + struct ext4_inode_info *ei = EXT4_I(inode); | ||
1235 | journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; | ||
1236 | - int ret = 0; | ||
1237 | + int ret; | ||
1238 | + tid_t commit_tid; | ||
1239 | |||
1240 | J_ASSERT(ext4_journal_current_handle() == NULL); | ||
1241 | |||
1242 | trace_ext4_sync_file(file, dentry, datasync); | ||
1243 | |||
1244 | + if (inode->i_sb->s_flags & MS_RDONLY) | ||
1245 | + return 0; | ||
1246 | + | ||
1247 | + ret = flush_aio_dio_completed_IO(inode); | ||
1248 | + if (ret < 0) | ||
1249 | + return ret; | ||
1250 | + | ||
1251 | + if (!journal) | ||
1252 | + return simple_fsync(file, dentry, datasync); | ||
1253 | + | ||
1254 | /* | ||
1255 | - * data=writeback: | ||
1256 | + * data=writeback,ordered: | ||
1257 | * The caller's filemap_fdatawrite()/wait will sync the data. | ||
1258 | - * sync_inode() will sync the metadata | ||
1259 | - * | ||
1260 | - * data=ordered: | ||
1261 | - * The caller's filemap_fdatawrite() will write the data and | ||
1262 | - * sync_inode() will write the inode if it is dirty. Then the caller's | ||
1263 | - * filemap_fdatawait() will wait on the pages. | ||
1264 | + * Metadata is in the journal, we wait for proper transaction to | ||
1265 | + * commit here. | ||
1266 | * | ||
1267 | * data=journal: | ||
1268 | * filemap_fdatawrite won't do anything (the buffers are clean). | ||
1269 | @@ -74,27 +84,13 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync) | ||
1270 | * (they were dirtied by commit). But that's OK - the blocks are | ||
1271 | * safe in-journal, which is all fsync() needs to ensure. | ||
1272 | */ | ||
1273 | - if (ext4_should_journal_data(inode)) { | ||
1274 | - ret = ext4_force_commit(inode->i_sb); | ||
1275 | - goto out; | ||
1276 | - } | ||
1277 | - | ||
1278 | - if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) | ||
1279 | - goto out; | ||
1280 | + if (ext4_should_journal_data(inode)) | ||
1281 | + return ext4_force_commit(inode->i_sb); | ||
1282 | |||
1283 | - /* | ||
1284 | - * The VFS has written the file data. If the inode is unaltered | ||
1285 | - * then we need not start a commit. | ||
1286 | - */ | ||
1287 | - if (inode->i_state & (I_DIRTY_SYNC|I_DIRTY_DATASYNC)) { | ||
1288 | - struct writeback_control wbc = { | ||
1289 | - .sync_mode = WB_SYNC_ALL, | ||
1290 | - .nr_to_write = 0, /* sys_fsync did this */ | ||
1291 | - }; | ||
1292 | - ret = sync_inode(inode, &wbc); | ||
1293 | - if (journal && (journal->j_flags & JBD2_BARRIER)) | ||
1294 | - blkdev_issue_flush(inode->i_sb->s_bdev, NULL); | ||
1295 | - } | ||
1296 | -out: | ||
1297 | + commit_tid = datasync ? ei->i_datasync_tid : ei->i_sync_tid; | ||
1298 | + if (jbd2_log_start_commit(journal, commit_tid)) | ||
1299 | + jbd2_log_wait_commit(journal, commit_tid); | ||
1300 | + else if (journal->j_flags & JBD2_BARRIER) | ||
1301 | + blkdev_issue_flush(inode->i_sb->s_bdev, NULL); | ||
1302 | return ret; | ||
1303 | } | ||
1304 | diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c | ||
1305 | index f9c642b..38b2154 100644 | ||
1306 | --- a/fs/ext4/inode.c | ||
1307 | +++ b/fs/ext4/inode.c | ||
1308 | @@ -37,6 +37,7 @@ | ||
1309 | #include <linux/namei.h> | ||
1310 | #include <linux/uio.h> | ||
1311 | #include <linux/bio.h> | ||
1312 | +#include <linux/workqueue.h> | ||
1313 | |||
1314 | #include "ext4_jbd2.h" | ||
1315 | #include "xattr.h" | ||
1316 | @@ -192,11 +193,25 @@ static int try_to_extend_transaction(handle_t *handle, struct inode *inode) | ||
1317 | * so before we call here everything must be consistently dirtied against | ||
1318 | * this transaction. | ||
1319 | */ | ||
1320 | -static int ext4_journal_test_restart(handle_t *handle, struct inode *inode) | ||
1321 | +int ext4_truncate_restart_trans(handle_t *handle, struct inode *inode, | ||
1322 | + int nblocks) | ||
1323 | { | ||
1324 | + int ret; | ||
1325 | + | ||
1326 | + /* | ||
1327 | + * Drop i_data_sem to avoid deadlock with ext4_get_blocks At this | ||
1328 | + * moment, get_block can be called only for blocks inside i_size since | ||
1329 | + * page cache has been already dropped and writes are blocked by | ||
1330 | + * i_mutex. So we can safely drop the i_data_sem here. | ||
1331 | + */ | ||
1332 | BUG_ON(EXT4_JOURNAL(inode) == NULL); | ||
1333 | jbd_debug(2, "restarting handle %p\n", handle); | ||
1334 | - return ext4_journal_restart(handle, blocks_for_truncate(inode)); | ||
1335 | + up_write(&EXT4_I(inode)->i_data_sem); | ||
1336 | + ret = ext4_journal_restart(handle, blocks_for_truncate(inode)); | ||
1337 | + down_write(&EXT4_I(inode)->i_data_sem); | ||
1338 | + ext4_discard_preallocations(inode); | ||
1339 | + | ||
1340 | + return ret; | ||
1341 | } | ||
1342 | |||
1343 | /* | ||
1344 | @@ -551,15 +566,21 @@ static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind) | ||
1345 | * | ||
1346 | * Normally this function find the preferred place for block allocation, | ||
1347 | * returns it. | ||
1348 | + * Because this is only used for non-extent files, we limit the block nr | ||
1349 | + * to 32 bits. | ||
1350 | */ | ||
1351 | static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block, | ||
1352 | Indirect *partial) | ||
1353 | { | ||
1354 | + ext4_fsblk_t goal; | ||
1355 | + | ||
1356 | /* | ||
1357 | * XXX need to get goal block from mballoc's data structures | ||
1358 | */ | ||
1359 | |||
1360 | - return ext4_find_near(inode, partial); | ||
1361 | + goal = ext4_find_near(inode, partial); | ||
1362 | + goal = goal & EXT4_MAX_BLOCK_FILE_PHYS; | ||
1363 | + return goal; | ||
1364 | } | ||
1365 | |||
1366 | /** | ||
1367 | @@ -640,6 +661,8 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode, | ||
1368 | if (*err) | ||
1369 | goto failed_out; | ||
1370 | |||
1371 | + BUG_ON(current_block + count > EXT4_MAX_BLOCK_FILE_PHYS); | ||
1372 | + | ||
1373 | target -= count; | ||
1374 | /* allocate blocks for indirect blocks */ | ||
1375 | while (index < indirect_blks && count) { | ||
1376 | @@ -674,6 +697,7 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode, | ||
1377 | ar.flags = EXT4_MB_HINT_DATA; | ||
1378 | |||
1379 | current_block = ext4_mb_new_blocks(handle, &ar, err); | ||
1380 | + BUG_ON(current_block + ar.len > EXT4_MAX_BLOCK_FILE_PHYS); | ||
1381 | |||
1382 | if (*err && (target == blks)) { | ||
1383 | /* | ||
1384 | @@ -998,10 +1022,12 @@ static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode, | ||
1385 | if (!err) | ||
1386 | err = ext4_splice_branch(handle, inode, iblock, | ||
1387 | partial, indirect_blks, count); | ||
1388 | - else | ||
1389 | + if (err) | ||
1390 | goto cleanup; | ||
1391 | |||
1392 | set_buffer_new(bh_result); | ||
1393 | + | ||
1394 | + ext4_update_inode_fsync_trans(handle, inode, 1); | ||
1395 | got_it: | ||
1396 | map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-1].key)); | ||
1397 | if (count > blocks_to_boundary) | ||
1398 | @@ -1029,7 +1055,7 @@ qsize_t ext4_get_reserved_space(struct inode *inode) | ||
1399 | EXT4_I(inode)->i_reserved_meta_blocks; | ||
1400 | spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); | ||
1401 | |||
1402 | - return total; | ||
1403 | + return (total << inode->i_blkbits); | ||
1404 | } | ||
1405 | /* | ||
1406 | * Calculate the number of metadata blocks need to reserve | ||
1407 | @@ -1109,22 +1135,79 @@ static void ext4_da_update_reserve_space(struct inode *inode, int used) | ||
1408 | ext4_discard_preallocations(inode); | ||
1409 | } | ||
1410 | |||
1411 | -static int check_block_validity(struct inode *inode, sector_t logical, | ||
1412 | - sector_t phys, int len) | ||
1413 | +static int check_block_validity(struct inode *inode, const char *msg, | ||
1414 | + sector_t logical, sector_t phys, int len) | ||
1415 | { | ||
1416 | if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), phys, len)) { | ||
1417 | - ext4_error(inode->i_sb, "check_block_validity", | ||
1418 | + ext4_error(inode->i_sb, msg, | ||
1419 | "inode #%lu logical block %llu mapped to %llu " | ||
1420 | "(size %d)", inode->i_ino, | ||
1421 | (unsigned long long) logical, | ||
1422 | (unsigned long long) phys, len); | ||
1423 | - WARN_ON(1); | ||
1424 | return -EIO; | ||
1425 | } | ||
1426 | return 0; | ||
1427 | } | ||
1428 | |||
1429 | /* | ||
1430 | + * Return the number of contiguous dirty pages in a given inode | ||
1431 | + * starting at page frame idx. | ||
1432 | + */ | ||
1433 | +static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx, | ||
1434 | + unsigned int max_pages) | ||
1435 | +{ | ||
1436 | + struct address_space *mapping = inode->i_mapping; | ||
1437 | + pgoff_t index; | ||
1438 | + struct pagevec pvec; | ||
1439 | + pgoff_t num = 0; | ||
1440 | + int i, nr_pages, done = 0; | ||
1441 | + | ||
1442 | + if (max_pages == 0) | ||
1443 | + return 0; | ||
1444 | + pagevec_init(&pvec, 0); | ||
1445 | + while (!done) { | ||
1446 | + index = idx; | ||
1447 | + nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, | ||
1448 | + PAGECACHE_TAG_DIRTY, | ||
1449 | + (pgoff_t)PAGEVEC_SIZE); | ||
1450 | + if (nr_pages == 0) | ||
1451 | + break; | ||
1452 | + for (i = 0; i < nr_pages; i++) { | ||
1453 | + struct page *page = pvec.pages[i]; | ||
1454 | + struct buffer_head *bh, *head; | ||
1455 | + | ||
1456 | + lock_page(page); | ||
1457 | + if (unlikely(page->mapping != mapping) || | ||
1458 | + !PageDirty(page) || | ||
1459 | + PageWriteback(page) || | ||
1460 | + page->index != idx) { | ||
1461 | + done = 1; | ||
1462 | + unlock_page(page); | ||
1463 | + break; | ||
1464 | + } | ||
1465 | + if (page_has_buffers(page)) { | ||
1466 | + bh = head = page_buffers(page); | ||
1467 | + do { | ||
1468 | + if (!buffer_delay(bh) && | ||
1469 | + !buffer_unwritten(bh)) | ||
1470 | + done = 1; | ||
1471 | + bh = bh->b_this_page; | ||
1472 | + } while (!done && (bh != head)); | ||
1473 | + } | ||
1474 | + unlock_page(page); | ||
1475 | + if (done) | ||
1476 | + break; | ||
1477 | + idx++; | ||
1478 | + num++; | ||
1479 | + if (num >= max_pages) | ||
1480 | + break; | ||
1481 | + } | ||
1482 | + pagevec_release(&pvec); | ||
1483 | + } | ||
1484 | + return num; | ||
1485 | +} | ||
1486 | + | ||
1487 | +/* | ||
1488 | * The ext4_get_blocks() function tries to look up the requested blocks, | ||
1489 | * and returns if the blocks are already mapped. | ||
1490 | * | ||
1491 | @@ -1155,6 +1238,9 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block, | ||
1492 | clear_buffer_mapped(bh); | ||
1493 | clear_buffer_unwritten(bh); | ||
1494 | |||
1495 | + ext_debug("ext4_get_blocks(): inode %lu, flag %d, max_blocks %u," | ||
1496 | + "logical block %lu\n", inode->i_ino, flags, max_blocks, | ||
1497 | + (unsigned long)block); | ||
1498 | /* | ||
1499 | * Try to see if we can get the block without requesting a new | ||
1500 | * file system block. | ||
1501 | @@ -1170,8 +1256,8 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block, | ||
1502 | up_read((&EXT4_I(inode)->i_data_sem)); | ||
1503 | |||
1504 | if (retval > 0 && buffer_mapped(bh)) { | ||
1505 | - int ret = check_block_validity(inode, block, | ||
1506 | - bh->b_blocknr, retval); | ||
1507 | + int ret = check_block_validity(inode, "file system corruption", | ||
1508 | + block, bh->b_blocknr, retval); | ||
1509 | if (ret != 0) | ||
1510 | return ret; | ||
1511 | } | ||
1512 | @@ -1235,8 +1321,7 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block, | ||
1513 | * i_data's format changing. Force the migrate | ||
1514 | * to fail by clearing migrate flags | ||
1515 | */ | ||
1516 | - EXT4_I(inode)->i_flags = EXT4_I(inode)->i_flags & | ||
1517 | - ~EXT4_EXT_MIGRATE; | ||
1518 | + EXT4_I(inode)->i_state &= ~EXT4_STATE_EXT_MIGRATE; | ||
1519 | } | ||
1520 | } | ||
1521 | |||
1522 | @@ -1252,8 +1337,9 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block, | ||
1523 | |||
1524 | up_write((&EXT4_I(inode)->i_data_sem)); | ||
1525 | if (retval > 0 && buffer_mapped(bh)) { | ||
1526 | - int ret = check_block_validity(inode, block, | ||
1527 | - bh->b_blocknr, retval); | ||
1528 | + int ret = check_block_validity(inode, "file system " | ||
1529 | + "corruption after allocation", | ||
1530 | + block, bh->b_blocknr, retval); | ||
1531 | if (ret != 0) | ||
1532 | return ret; | ||
1533 | } | ||
1534 | @@ -1451,6 +1537,16 @@ static int do_journal_get_write_access(handle_t *handle, | ||
1535 | return ext4_journal_get_write_access(handle, bh); | ||
1536 | } | ||
1537 | |||
1538 | +/* | ||
1539 | + * Truncate blocks that were not used by write. We have to truncate the | ||
1540 | + * pagecache as well so that corresponding buffers get properly unmapped. | ||
1541 | + */ | ||
1542 | +static void ext4_truncate_failed_write(struct inode *inode) | ||
1543 | +{ | ||
1544 | + truncate_inode_pages(inode->i_mapping, inode->i_size); | ||
1545 | + ext4_truncate(inode); | ||
1546 | +} | ||
1547 | + | ||
1548 | static int ext4_write_begin(struct file *file, struct address_space *mapping, | ||
1549 | loff_t pos, unsigned len, unsigned flags, | ||
1550 | struct page **pagep, void **fsdata) | ||
1551 | @@ -1516,7 +1612,7 @@ retry: | ||
1552 | |||
1553 | ext4_journal_stop(handle); | ||
1554 | if (pos + len > inode->i_size) { | ||
1555 | - ext4_truncate(inode); | ||
1556 | + ext4_truncate_failed_write(inode); | ||
1557 | /* | ||
1558 | * If truncate failed early the inode might | ||
1559 | * still be on the orphan list; we need to | ||
1560 | @@ -1626,7 +1722,7 @@ static int ext4_ordered_write_end(struct file *file, | ||
1561 | ret = ret2; | ||
1562 | |||
1563 | if (pos + len > inode->i_size) { | ||
1564 | - ext4_truncate(inode); | ||
1565 | + ext4_truncate_failed_write(inode); | ||
1566 | /* | ||
1567 | * If truncate failed early the inode might still be | ||
1568 | * on the orphan list; we need to make sure the inode | ||
1569 | @@ -1668,7 +1764,7 @@ static int ext4_writeback_write_end(struct file *file, | ||
1570 | ret = ret2; | ||
1571 | |||
1572 | if (pos + len > inode->i_size) { | ||
1573 | - ext4_truncate(inode); | ||
1574 | + ext4_truncate_failed_write(inode); | ||
1575 | /* | ||
1576 | * If truncate failed early the inode might still be | ||
1577 | * on the orphan list; we need to make sure the inode | ||
1578 | @@ -1731,7 +1827,7 @@ static int ext4_journalled_write_end(struct file *file, | ||
1579 | if (!ret) | ||
1580 | ret = ret2; | ||
1581 | if (pos + len > inode->i_size) { | ||
1582 | - ext4_truncate(inode); | ||
1583 | + ext4_truncate_failed_write(inode); | ||
1584 | /* | ||
1585 | * If truncate failed early the inode might still be | ||
1586 | * on the orphan list; we need to make sure the inode | ||
1587 | @@ -1776,11 +1872,11 @@ repeat: | ||
1588 | |||
1589 | if (ext4_claim_free_blocks(sbi, total)) { | ||
1590 | spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); | ||
1591 | + vfs_dq_release_reservation_block(inode, total); | ||
1592 | if (ext4_should_retry_alloc(inode->i_sb, &retries)) { | ||
1593 | yield(); | ||
1594 | goto repeat; | ||
1595 | } | ||
1596 | - vfs_dq_release_reservation_block(inode, total); | ||
1597 | return -ENOSPC; | ||
1598 | } | ||
1599 | EXT4_I(inode)->i_reserved_data_blocks += nrblocks; | ||
1600 | @@ -1860,22 +1956,6 @@ static void ext4_da_page_release_reservation(struct page *page, | ||
1601 | } | ||
1602 | |||
1603 | /* | ||
1604 | - * Delayed allocation stuff | ||
1605 | - */ | ||
1606 | - | ||
1607 | -struct mpage_da_data { | ||
1608 | - struct inode *inode; | ||
1609 | - sector_t b_blocknr; /* start block number of extent */ | ||
1610 | - size_t b_size; /* size of extent */ | ||
1611 | - unsigned long b_state; /* state of the extent */ | ||
1612 | - unsigned long first_page, next_page; /* extent of pages */ | ||
1613 | - struct writeback_control *wbc; | ||
1614 | - int io_done; | ||
1615 | - int pages_written; | ||
1616 | - int retval; | ||
1617 | -}; | ||
1618 | - | ||
1619 | -/* | ||
1620 | * mpage_da_submit_io - walks through extent of pages and try to write | ||
1621 | * them with writepage() call back | ||
1622 | * | ||
1623 | @@ -2717,7 +2797,7 @@ static int ext4_da_writepages_trans_blocks(struct inode *inode) | ||
1624 | * number of contiguous block. So we will limit | ||
1625 | * number of contiguous block to a sane value | ||
1626 | */ | ||
1627 | - if (!(inode->i_flags & EXT4_EXTENTS_FL) && | ||
1628 | + if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) && | ||
1629 | (max_blocks > EXT4_MAX_TRANS_DATA)) | ||
1630 | max_blocks = EXT4_MAX_TRANS_DATA; | ||
1631 | |||
1632 | @@ -2735,8 +2815,11 @@ static int ext4_da_writepages(struct address_space *mapping, | ||
1633 | int no_nrwrite_index_update; | ||
1634 | int pages_written = 0; | ||
1635 | long pages_skipped; | ||
1636 | + unsigned int max_pages; | ||
1637 | int range_cyclic, cycled = 1, io_done = 0; | ||
1638 | - int needed_blocks, ret = 0, nr_to_writebump = 0; | ||
1639 | + int needed_blocks, ret = 0; | ||
1640 | + long desired_nr_to_write, nr_to_writebump = 0; | ||
1641 | + loff_t range_start = wbc->range_start; | ||
1642 | struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb); | ||
1643 | |||
1644 | trace_ext4_da_writepages(inode, wbc); | ||
1645 | @@ -2762,16 +2845,6 @@ static int ext4_da_writepages(struct address_space *mapping, | ||
1646 | if (unlikely(sbi->s_mount_flags & EXT4_MF_FS_ABORTED)) | ||
1647 | return -EROFS; | ||
1648 | |||
1649 | - /* | ||
1650 | - * Make sure nr_to_write is >= sbi->s_mb_stream_request | ||
1651 | - * This make sure small files blocks are allocated in | ||
1652 | - * single attempt. This ensure that small files | ||
1653 | - * get less fragmented. | ||
1654 | - */ | ||
1655 | - if (wbc->nr_to_write < sbi->s_mb_stream_request) { | ||
1656 | - nr_to_writebump = sbi->s_mb_stream_request - wbc->nr_to_write; | ||
1657 | - wbc->nr_to_write = sbi->s_mb_stream_request; | ||
1658 | - } | ||
1659 | if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) | ||
1660 | range_whole = 1; | ||
1661 | |||
1662 | @@ -2786,6 +2859,36 @@ static int ext4_da_writepages(struct address_space *mapping, | ||
1663 | } else | ||
1664 | index = wbc->range_start >> PAGE_CACHE_SHIFT; | ||
1665 | |||
1666 | + /* | ||
1667 | + * This works around two forms of stupidity. The first is in | ||
1668 | + * the writeback code, which caps the maximum number of pages | ||
1669 | + * written to be 1024 pages. This is wrong on multiple | ||
1670 | + * levels; different architectues have a different page size, | ||
1671 | + * which changes the maximum amount of data which gets | ||
1672 | + * written. Secondly, 4 megabytes is way too small. XFS | ||
1673 | + * forces this value to be 16 megabytes by multiplying | ||
1674 | + * nr_to_write parameter by four, and then relies on its | ||
1675 | + * allocator to allocate larger extents to make them | ||
1676 | + * contiguous. Unfortunately this brings us to the second | ||
1677 | + * stupidity, which is that ext4's mballoc code only allocates | ||
1678 | + * at most 2048 blocks. So we force contiguous writes up to | ||
1679 | + * the number of dirty blocks in the inode, or | ||
1680 | + * sbi->max_writeback_mb_bump whichever is smaller. | ||
1681 | + */ | ||
1682 | + max_pages = sbi->s_max_writeback_mb_bump << (20 - PAGE_CACHE_SHIFT); | ||
1683 | + if (!range_cyclic && range_whole) | ||
1684 | + desired_nr_to_write = wbc->nr_to_write * 8; | ||
1685 | + else | ||
1686 | + desired_nr_to_write = ext4_num_dirty_pages(inode, index, | ||
1687 | + max_pages); | ||
1688 | + if (desired_nr_to_write > max_pages) | ||
1689 | + desired_nr_to_write = max_pages; | ||
1690 | + | ||
1691 | + if (wbc->nr_to_write < desired_nr_to_write) { | ||
1692 | + nr_to_writebump = desired_nr_to_write - wbc->nr_to_write; | ||
1693 | + wbc->nr_to_write = desired_nr_to_write; | ||
1694 | + } | ||
1695 | + | ||
1696 | mpd.wbc = wbc; | ||
1697 | mpd.inode = mapping->host; | ||
1698 | |||
1699 | @@ -2904,7 +3007,9 @@ retry: | ||
1700 | out_writepages: | ||
1701 | if (!no_nrwrite_index_update) | ||
1702 | wbc->no_nrwrite_index_update = 0; | ||
1703 | - wbc->nr_to_write -= nr_to_writebump; | ||
1704 | + if (wbc->nr_to_write > nr_to_writebump) | ||
1705 | + wbc->nr_to_write -= nr_to_writebump; | ||
1706 | + wbc->range_start = range_start; | ||
1707 | trace_ext4_da_writepages_result(inode, wbc, ret, pages_written); | ||
1708 | return ret; | ||
1709 | } | ||
1710 | @@ -2994,7 +3099,7 @@ retry: | ||
1711 | * i_size_read because we hold i_mutex. | ||
1712 | */ | ||
1713 | if (pos + len > inode->i_size) | ||
1714 | - ext4_truncate(inode); | ||
1715 | + ext4_truncate_failed_write(inode); | ||
1716 | } | ||
1717 | |||
1718 | if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) | ||
1719 | @@ -3259,6 +3364,8 @@ static int ext4_releasepage(struct page *page, gfp_t wait) | ||
1720 | } | ||
1721 | |||
1722 | /* | ||
1723 | + * O_DIRECT for ext3 (or indirect map) based files | ||
1724 | + * | ||
1725 | * If the O_DIRECT write will extend the file then add this inode to the | ||
1726 | * orphan list. So recovery will truncate it back to the original size | ||
1727 | * if the machine crashes during the write. | ||
1728 | @@ -3267,7 +3374,7 @@ static int ext4_releasepage(struct page *page, gfp_t wait) | ||
1729 | * crashes then stale disk data _may_ be exposed inside the file. But current | ||
1730 | * VFS code falls back into buffered path in that case so we are safe. | ||
1731 | */ | ||
1732 | -static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb, | ||
1733 | +static ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb, | ||
1734 | const struct iovec *iov, loff_t offset, | ||
1735 | unsigned long nr_segs) | ||
1736 | { | ||
1737 | @@ -3278,6 +3385,7 @@ static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb, | ||
1738 | ssize_t ret; | ||
1739 | int orphan = 0; | ||
1740 | size_t count = iov_length(iov, nr_segs); | ||
1741 | + int retries = 0; | ||
1742 | |||
1743 | if (rw == WRITE) { | ||
1744 | loff_t final_size = offset + count; | ||
1745 | @@ -3300,9 +3408,12 @@ static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb, | ||
1746 | } | ||
1747 | } | ||
1748 | |||
1749 | +retry: | ||
1750 | ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, | ||
1751 | offset, nr_segs, | ||
1752 | ext4_get_block, NULL); | ||
1753 | + if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) | ||
1754 | + goto retry; | ||
1755 | |||
1756 | if (orphan) { | ||
1757 | int err; | ||
1758 | @@ -3341,6 +3452,364 @@ out: | ||
1759 | return ret; | ||
1760 | } | ||
1761 | |||
1762 | +static int ext4_get_block_dio_write(struct inode *inode, sector_t iblock, | ||
1763 | + struct buffer_head *bh_result, int create) | ||
1764 | +{ | ||
1765 | + handle_t *handle = NULL; | ||
1766 | + int ret = 0; | ||
1767 | + unsigned max_blocks = bh_result->b_size >> inode->i_blkbits; | ||
1768 | + int dio_credits; | ||
1769 | + | ||
1770 | + ext4_debug("ext4_get_block_dio_write: inode %lu, create flag %d\n", | ||
1771 | + inode->i_ino, create); | ||
1772 | + /* | ||
1773 | + * DIO VFS code passes create = 0 flag for write to | ||
1774 | + * the middle of file. It does this to avoid block | ||
1775 | + * allocation for holes, to prevent expose stale data | ||
1776 | + * out when there is parallel buffered read (which does | ||
1777 | + * not hold the i_mutex lock) while direct IO write has | ||
1778 | + * not completed. DIO request on holes finally falls back | ||
1779 | + * to buffered IO for this reason. | ||
1780 | + * | ||
1781 | + * For ext4 extent based file, since we support fallocate, | ||
1782 | + * new allocated extent as uninitialized, for holes, we | ||
1783 | + * could fallocate blocks for holes, thus parallel | ||
1784 | + * buffered IO read will zero out the page when read on | ||
1785 | + * a hole while parallel DIO write to the hole has not completed. | ||
1786 | + * | ||
1787 | + * when we come here, we know it's a direct IO write to | ||
1788 | + * to the middle of file (<i_size) | ||
1789 | + * so it's safe to override the create flag from VFS. | ||
1790 | + */ | ||
1791 | + create = EXT4_GET_BLOCKS_DIO_CREATE_EXT; | ||
1792 | + | ||
1793 | + if (max_blocks > DIO_MAX_BLOCKS) | ||
1794 | + max_blocks = DIO_MAX_BLOCKS; | ||
1795 | + dio_credits = ext4_chunk_trans_blocks(inode, max_blocks); | ||
1796 | + handle = ext4_journal_start(inode, dio_credits); | ||
1797 | + if (IS_ERR(handle)) { | ||
1798 | + ret = PTR_ERR(handle); | ||
1799 | + goto out; | ||
1800 | + } | ||
1801 | + ret = ext4_get_blocks(handle, inode, iblock, max_blocks, bh_result, | ||
1802 | + create); | ||
1803 | + if (ret > 0) { | ||
1804 | + bh_result->b_size = (ret << inode->i_blkbits); | ||
1805 | + ret = 0; | ||
1806 | + } | ||
1807 | + ext4_journal_stop(handle); | ||
1808 | +out: | ||
1809 | + return ret; | ||
1810 | +} | ||
1811 | + | ||
1812 | +static void ext4_free_io_end(ext4_io_end_t *io) | ||
1813 | +{ | ||
1814 | + BUG_ON(!io); | ||
1815 | + iput(io->inode); | ||
1816 | + kfree(io); | ||
1817 | +} | ||
1818 | +static void dump_aio_dio_list(struct inode * inode) | ||
1819 | +{ | ||
1820 | +#ifdef EXT4_DEBUG | ||
1821 | + struct list_head *cur, *before, *after; | ||
1822 | + ext4_io_end_t *io, *io0, *io1; | ||
1823 | + | ||
1824 | + if (list_empty(&EXT4_I(inode)->i_aio_dio_complete_list)){ | ||
1825 | + ext4_debug("inode %lu aio dio list is empty\n", inode->i_ino); | ||
1826 | + return; | ||
1827 | + } | ||
1828 | + | ||
1829 | + ext4_debug("Dump inode %lu aio_dio_completed_IO list \n", inode->i_ino); | ||
1830 | + list_for_each_entry(io, &EXT4_I(inode)->i_aio_dio_complete_list, list){ | ||
1831 | + cur = &io->list; | ||
1832 | + before = cur->prev; | ||
1833 | + io0 = container_of(before, ext4_io_end_t, list); | ||
1834 | + after = cur->next; | ||
1835 | + io1 = container_of(after, ext4_io_end_t, list); | ||
1836 | + | ||
1837 | + ext4_debug("io 0x%p from inode %lu,prev 0x%p,next 0x%p\n", | ||
1838 | + io, inode->i_ino, io0, io1); | ||
1839 | + } | ||
1840 | +#endif | ||
1841 | +} | ||
1842 | + | ||
1843 | +/* | ||
1844 | + * check a range of space and convert unwritten extents to written. | ||
1845 | + */ | ||
1846 | +static int ext4_end_aio_dio_nolock(ext4_io_end_t *io) | ||
1847 | +{ | ||
1848 | + struct inode *inode = io->inode; | ||
1849 | + loff_t offset = io->offset; | ||
1850 | + size_t size = io->size; | ||
1851 | + int ret = 0; | ||
1852 | + | ||
1853 | + ext4_debug("end_aio_dio_onlock: io 0x%p from inode %lu,list->next 0x%p," | ||
1854 | + "list->prev 0x%p\n", | ||
1855 | + io, inode->i_ino, io->list.next, io->list.prev); | ||
1856 | + | ||
1857 | + if (list_empty(&io->list)) | ||
1858 | + return ret; | ||
1859 | + | ||
1860 | + if (io->flag != DIO_AIO_UNWRITTEN) | ||
1861 | + return ret; | ||
1862 | + | ||
1863 | + if (offset + size <= i_size_read(inode)) | ||
1864 | + ret = ext4_convert_unwritten_extents(inode, offset, size); | ||
1865 | + | ||
1866 | + if (ret < 0) { | ||
1867 | + printk(KERN_EMERG "%s: failed to convert unwritten" | ||
1868 | + "extents to written extents, error is %d" | ||
1869 | + " io is still on inode %lu aio dio list\n", | ||
1870 | + __func__, ret, inode->i_ino); | ||
1871 | + return ret; | ||
1872 | + } | ||
1873 | + | ||
1874 | + /* clear the DIO AIO unwritten flag */ | ||
1875 | + io->flag = 0; | ||
1876 | + return ret; | ||
1877 | +} | ||
1878 | +/* | ||
1879 | + * work on completed aio dio IO, to convert unwritten extents to extents | ||
1880 | + */ | ||
1881 | +static void ext4_end_aio_dio_work(struct work_struct *work) | ||
1882 | +{ | ||
1883 | + ext4_io_end_t *io = container_of(work, ext4_io_end_t, work); | ||
1884 | + struct inode *inode = io->inode; | ||
1885 | + int ret = 0; | ||
1886 | + | ||
1887 | + mutex_lock(&inode->i_mutex); | ||
1888 | + ret = ext4_end_aio_dio_nolock(io); | ||
1889 | + if (ret >= 0) { | ||
1890 | + if (!list_empty(&io->list)) | ||
1891 | + list_del_init(&io->list); | ||
1892 | + ext4_free_io_end(io); | ||
1893 | + } | ||
1894 | + mutex_unlock(&inode->i_mutex); | ||
1895 | +} | ||
1896 | +/* | ||
1897 | + * This function is called from ext4_sync_file(). | ||
1898 | + * | ||
1899 | + * When AIO DIO IO is completed, the work to convert unwritten | ||
1900 | + * extents to written is queued on workqueue but may not get immediately | ||
1901 | + * scheduled. When fsync is called, we need to ensure the | ||
1902 | + * conversion is complete before fsync returns. | ||
1903 | + * The inode keeps track of a list of completed AIO from DIO path | ||
1904 | + * that might needs to do the conversion. This function walks through | ||
1905 | + * the list and convert the related unwritten extents to written. | ||
1906 | + */ | ||
1907 | +int flush_aio_dio_completed_IO(struct inode *inode) | ||
1908 | +{ | ||
1909 | + ext4_io_end_t *io; | ||
1910 | + int ret = 0; | ||
1911 | + int ret2 = 0; | ||
1912 | + | ||
1913 | + if (list_empty(&EXT4_I(inode)->i_aio_dio_complete_list)) | ||
1914 | + return ret; | ||
1915 | + | ||
1916 | + dump_aio_dio_list(inode); | ||
1917 | + while (!list_empty(&EXT4_I(inode)->i_aio_dio_complete_list)){ | ||
1918 | + io = list_entry(EXT4_I(inode)->i_aio_dio_complete_list.next, | ||
1919 | + ext4_io_end_t, list); | ||
1920 | + /* | ||
1921 | + * Calling ext4_end_aio_dio_nolock() to convert completed | ||
1922 | + * IO to written. | ||
1923 | + * | ||
1924 | + * When ext4_sync_file() is called, run_queue() may already | ||
1925 | + * about to flush the work corresponding to this io structure. | ||
1926 | + * It will be upset if it founds the io structure related | ||
1927 | + * to the work-to-be schedule is freed. | ||
1928 | + * | ||
1929 | + * Thus we need to keep the io structure still valid here after | ||
1930 | + * convertion finished. The io structure has a flag to | ||
1931 | + * avoid double converting from both fsync and background work | ||
1932 | + * queue work. | ||
1933 | + */ | ||
1934 | + ret = ext4_end_aio_dio_nolock(io); | ||
1935 | + if (ret < 0) | ||
1936 | + ret2 = ret; | ||
1937 | + else | ||
1938 | + list_del_init(&io->list); | ||
1939 | + } | ||
1940 | + return (ret2 < 0) ? ret2 : 0; | ||
1941 | +} | ||
1942 | + | ||
1943 | +static ext4_io_end_t *ext4_init_io_end (struct inode *inode) | ||
1944 | +{ | ||
1945 | + ext4_io_end_t *io = NULL; | ||
1946 | + | ||
1947 | + io = kmalloc(sizeof(*io), GFP_NOFS); | ||
1948 | + | ||
1949 | + if (io) { | ||
1950 | + igrab(inode); | ||
1951 | + io->inode = inode; | ||
1952 | + io->flag = 0; | ||
1953 | + io->offset = 0; | ||
1954 | + io->size = 0; | ||
1955 | + io->error = 0; | ||
1956 | + INIT_WORK(&io->work, ext4_end_aio_dio_work); | ||
1957 | + INIT_LIST_HEAD(&io->list); | ||
1958 | + } | ||
1959 | + | ||
1960 | + return io; | ||
1961 | +} | ||
1962 | + | ||
1963 | +static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, | ||
1964 | + ssize_t size, void *private) | ||
1965 | +{ | ||
1966 | + ext4_io_end_t *io_end = iocb->private; | ||
1967 | + struct workqueue_struct *wq; | ||
1968 | + | ||
1969 | + /* if not async direct IO or dio with 0 bytes write, just return */ | ||
1970 | + if (!io_end || !size) | ||
1971 | + return; | ||
1972 | + | ||
1973 | + ext_debug("ext4_end_io_dio(): io_end 0x%p" | ||
1974 | + "for inode %lu, iocb 0x%p, offset %llu, size %llu\n", | ||
1975 | + iocb->private, io_end->inode->i_ino, iocb, offset, | ||
1976 | + size); | ||
1977 | + | ||
1978 | + /* if not aio dio with unwritten extents, just free io and return */ | ||
1979 | + if (io_end->flag != DIO_AIO_UNWRITTEN){ | ||
1980 | + ext4_free_io_end(io_end); | ||
1981 | + iocb->private = NULL; | ||
1982 | + return; | ||
1983 | + } | ||
1984 | + | ||
1985 | + io_end->offset = offset; | ||
1986 | + io_end->size = size; | ||
1987 | + wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq; | ||
1988 | + | ||
1989 | + /* queue the work to convert unwritten extents to written */ | ||
1990 | + queue_work(wq, &io_end->work); | ||
1991 | + | ||
1992 | + /* Add the io_end to per-inode completed aio dio list*/ | ||
1993 | + list_add_tail(&io_end->list, | ||
1994 | + &EXT4_I(io_end->inode)->i_aio_dio_complete_list); | ||
1995 | + iocb->private = NULL; | ||
1996 | +} | ||
1997 | +/* | ||
1998 | + * For ext4 extent files, ext4 will do direct-io write to holes, | ||
1999 | + * preallocated extents, and those write extend the file, no need to | ||
2000 | + * fall back to buffered IO. | ||
2001 | + * | ||
2002 | + * For holes, we fallocate those blocks, mark them as unintialized | ||
2003 | + * If those blocks were preallocated, we mark sure they are splited, but | ||
2004 | + * still keep the range to write as unintialized. | ||
2005 | + * | ||
2006 | + * The unwrritten extents will be converted to written when DIO is completed. | ||
2007 | + * For async direct IO, since the IO may still pending when return, we | ||
2008 | + * set up an end_io call back function, which will do the convertion | ||
2009 | + * when async direct IO completed. | ||
2010 | + * | ||
2011 | + * If the O_DIRECT write will extend the file then add this inode to the | ||
2012 | + * orphan list. So recovery will truncate it back to the original size | ||
2013 | + * if the machine crashes during the write. | ||
2014 | + * | ||
2015 | + */ | ||
2016 | +static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb, | ||
2017 | + const struct iovec *iov, loff_t offset, | ||
2018 | + unsigned long nr_segs) | ||
2019 | +{ | ||
2020 | + struct file *file = iocb->ki_filp; | ||
2021 | + struct inode *inode = file->f_mapping->host; | ||
2022 | + ssize_t ret; | ||
2023 | + size_t count = iov_length(iov, nr_segs); | ||
2024 | + | ||
2025 | + loff_t final_size = offset + count; | ||
2026 | + if (rw == WRITE && final_size <= inode->i_size) { | ||
2027 | + /* | ||
2028 | + * We could direct write to holes and fallocate. | ||
2029 | + * | ||
2030 | + * Allocated blocks to fill the hole are marked as uninitialized | ||
2031 | + * to prevent paralel buffered read to expose the stale data | ||
2032 | + * before DIO complete the data IO. | ||
2033 | + * | ||
2034 | + * As to previously fallocated extents, ext4 get_block | ||
2035 | + * will just simply mark the buffer mapped but still | ||
2036 | + * keep the extents uninitialized. | ||
2037 | + * | ||
2038 | + * for non AIO case, we will convert those unwritten extents | ||
2039 | + * to written after return back from blockdev_direct_IO. | ||
2040 | + * | ||
2041 | + * for async DIO, the conversion needs to be defered when | ||
2042 | + * the IO is completed. The ext4 end_io callback function | ||
2043 | + * will be called to take care of the conversion work. | ||
2044 | + * Here for async case, we allocate an io_end structure to | ||
2045 | + * hook to the iocb. | ||
2046 | + */ | ||
2047 | + iocb->private = NULL; | ||
2048 | + EXT4_I(inode)->cur_aio_dio = NULL; | ||
2049 | + if (!is_sync_kiocb(iocb)) { | ||
2050 | + iocb->private = ext4_init_io_end(inode); | ||
2051 | + if (!iocb->private) | ||
2052 | + return -ENOMEM; | ||
2053 | + /* | ||
2054 | + * we save the io structure for current async | ||
2055 | + * direct IO, so that later ext4_get_blocks() | ||
2056 | + * could flag the io structure whether there | ||
2057 | + * is a unwritten extents needs to be converted | ||
2058 | + * when IO is completed. | ||
2059 | + */ | ||
2060 | + EXT4_I(inode)->cur_aio_dio = iocb->private; | ||
2061 | + } | ||
2062 | + | ||
2063 | + ret = blockdev_direct_IO(rw, iocb, inode, | ||
2064 | + inode->i_sb->s_bdev, iov, | ||
2065 | + offset, nr_segs, | ||
2066 | + ext4_get_block_dio_write, | ||
2067 | + ext4_end_io_dio); | ||
2068 | + if (iocb->private) | ||
2069 | + EXT4_I(inode)->cur_aio_dio = NULL; | ||
2070 | + /* | ||
2071 | + * The io_end structure takes a reference to the inode, | ||
2072 | + * that structure needs to be destroyed and the | ||
2073 | + * reference to the inode need to be dropped, when IO is | ||
2074 | + * complete, even with 0 byte write, or failed. | ||
2075 | + * | ||
2076 | + * In the successful AIO DIO case, the io_end structure will be | ||
2077 | + * desctroyed and the reference to the inode will be dropped | ||
2078 | + * after the end_io call back function is called. | ||
2079 | + * | ||
2080 | + * In the case there is 0 byte write, or error case, since | ||
2081 | + * VFS direct IO won't invoke the end_io call back function, | ||
2082 | + * we need to free the end_io structure here. | ||
2083 | + */ | ||
2084 | + if (ret != -EIOCBQUEUED && ret <= 0 && iocb->private) { | ||
2085 | + ext4_free_io_end(iocb->private); | ||
2086 | + iocb->private = NULL; | ||
2087 | + } else if (ret > 0 && (EXT4_I(inode)->i_state & | ||
2088 | + EXT4_STATE_DIO_UNWRITTEN)) { | ||
2089 | + int err; | ||
2090 | + /* | ||
2091 | + * for non AIO case, since the IO is already | ||
2092 | + * completed, we could do the convertion right here | ||
2093 | + */ | ||
2094 | + err = ext4_convert_unwritten_extents(inode, | ||
2095 | + offset, ret); | ||
2096 | + if (err < 0) | ||
2097 | + ret = err; | ||
2098 | + EXT4_I(inode)->i_state &= ~EXT4_STATE_DIO_UNWRITTEN; | ||
2099 | + } | ||
2100 | + return ret; | ||
2101 | + } | ||
2102 | + | ||
2103 | + /* for write the the end of file case, we fall back to old way */ | ||
2104 | + return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs); | ||
2105 | +} | ||
2106 | + | ||
2107 | +static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb, | ||
2108 | + const struct iovec *iov, loff_t offset, | ||
2109 | + unsigned long nr_segs) | ||
2110 | +{ | ||
2111 | + struct file *file = iocb->ki_filp; | ||
2112 | + struct inode *inode = file->f_mapping->host; | ||
2113 | + | ||
2114 | + if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) | ||
2115 | + return ext4_ext_direct_IO(rw, iocb, iov, offset, nr_segs); | ||
2116 | + | ||
2117 | + return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs); | ||
2118 | +} | ||
2119 | + | ||
2120 | /* | ||
2121 | * Pages can be marked dirty completely asynchronously from ext4's journalling | ||
2122 | * activity. By filemap_sync_pte(), try_to_unmap_one(), etc. We cannot do | ||
2123 | @@ -3653,13 +4122,16 @@ static void ext4_clear_blocks(handle_t *handle, struct inode *inode, | ||
2124 | __le32 *last) | ||
2125 | { | ||
2126 | __le32 *p; | ||
2127 | + int is_metadata = S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode); | ||
2128 | + | ||
2129 | if (try_to_extend_transaction(handle, inode)) { | ||
2130 | if (bh) { | ||
2131 | BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); | ||
2132 | ext4_handle_dirty_metadata(handle, inode, bh); | ||
2133 | } | ||
2134 | ext4_mark_inode_dirty(handle, inode); | ||
2135 | - ext4_journal_test_restart(handle, inode); | ||
2136 | + ext4_truncate_restart_trans(handle, inode, | ||
2137 | + blocks_for_truncate(inode)); | ||
2138 | if (bh) { | ||
2139 | BUFFER_TRACE(bh, "retaking write access"); | ||
2140 | ext4_journal_get_write_access(handle, bh); | ||
2141 | @@ -3682,11 +4154,11 @@ static void ext4_clear_blocks(handle_t *handle, struct inode *inode, | ||
2142 | |||
2143 | *p = 0; | ||
2144 | tbh = sb_find_get_block(inode->i_sb, nr); | ||
2145 | - ext4_forget(handle, 0, inode, tbh, nr); | ||
2146 | + ext4_forget(handle, is_metadata, inode, tbh, nr); | ||
2147 | } | ||
2148 | } | ||
2149 | |||
2150 | - ext4_free_blocks(handle, inode, block_to_free, count, 0); | ||
2151 | + ext4_free_blocks(handle, inode, block_to_free, count, is_metadata); | ||
2152 | } | ||
2153 | |||
2154 | /** | ||
2155 | @@ -3870,7 +4342,8 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode, | ||
2156 | return; | ||
2157 | if (try_to_extend_transaction(handle, inode)) { | ||
2158 | ext4_mark_inode_dirty(handle, inode); | ||
2159 | - ext4_journal_test_restart(handle, inode); | ||
2160 | + ext4_truncate_restart_trans(handle, inode, | ||
2161 | + blocks_for_truncate(inode)); | ||
2162 | } | ||
2163 | |||
2164 | ext4_free_blocks(handle, inode, nr, 1, 1); | ||
2165 | @@ -3958,8 +4431,7 @@ void ext4_truncate(struct inode *inode) | ||
2166 | if (!ext4_can_truncate(inode)) | ||
2167 | return; | ||
2168 | |||
2169 | - if (ei->i_disksize && inode->i_size == 0 && | ||
2170 | - !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC)) | ||
2171 | + if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC)) | ||
2172 | ei->i_state |= EXT4_STATE_DA_ALLOC_CLOSE; | ||
2173 | |||
2174 | if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) { | ||
2175 | @@ -4313,8 +4785,8 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) | ||
2176 | struct ext4_iloc iloc; | ||
2177 | struct ext4_inode *raw_inode; | ||
2178 | struct ext4_inode_info *ei; | ||
2179 | - struct buffer_head *bh; | ||
2180 | struct inode *inode; | ||
2181 | + journal_t *journal = EXT4_SB(sb)->s_journal; | ||
2182 | long ret; | ||
2183 | int block; | ||
2184 | |||
2185 | @@ -4325,11 +4797,11 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) | ||
2186 | return inode; | ||
2187 | |||
2188 | ei = EXT4_I(inode); | ||
2189 | + iloc.bh = 0; | ||
2190 | |||
2191 | ret = __ext4_get_inode_loc(inode, &iloc, 0); | ||
2192 | if (ret < 0) | ||
2193 | goto bad_inode; | ||
2194 | - bh = iloc.bh; | ||
2195 | raw_inode = ext4_raw_inode(&iloc); | ||
2196 | inode->i_mode = le16_to_cpu(raw_inode->i_mode); | ||
2197 | inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low); | ||
2198 | @@ -4352,7 +4824,6 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) | ||
2199 | if (inode->i_mode == 0 || | ||
2200 | !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) { | ||
2201 | /* this inode is deleted */ | ||
2202 | - brelse(bh); | ||
2203 | ret = -ESTALE; | ||
2204 | goto bad_inode; | ||
2205 | } | ||
2206 | @@ -4380,11 +4851,35 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) | ||
2207 | ei->i_data[block] = raw_inode->i_block[block]; | ||
2208 | INIT_LIST_HEAD(&ei->i_orphan); | ||
2209 | |||
2210 | + /* | ||
2211 | + * Set transaction id's of transactions that have to be committed | ||
2212 | + * to finish f[data]sync. We set them to currently running transaction | ||
2213 | + * as we cannot be sure that the inode or some of its metadata isn't | ||
2214 | + * part of the transaction - the inode could have been reclaimed and | ||
2215 | + * now it is reread from disk. | ||
2216 | + */ | ||
2217 | + if (journal) { | ||
2218 | + transaction_t *transaction; | ||
2219 | + tid_t tid; | ||
2220 | + | ||
2221 | + spin_lock(&journal->j_state_lock); | ||
2222 | + if (journal->j_running_transaction) | ||
2223 | + transaction = journal->j_running_transaction; | ||
2224 | + else | ||
2225 | + transaction = journal->j_committing_transaction; | ||
2226 | + if (transaction) | ||
2227 | + tid = transaction->t_tid; | ||
2228 | + else | ||
2229 | + tid = journal->j_commit_sequence; | ||
2230 | + spin_unlock(&journal->j_state_lock); | ||
2231 | + ei->i_sync_tid = tid; | ||
2232 | + ei->i_datasync_tid = tid; | ||
2233 | + } | ||
2234 | + | ||
2235 | if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { | ||
2236 | ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize); | ||
2237 | if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize > | ||
2238 | EXT4_INODE_SIZE(inode->i_sb)) { | ||
2239 | - brelse(bh); | ||
2240 | ret = -EIO; | ||
2241 | goto bad_inode; | ||
2242 | } | ||
2243 | @@ -4416,10 +4911,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) | ||
2244 | |||
2245 | ret = 0; | ||
2246 | if (ei->i_file_acl && | ||
2247 | - ((ei->i_file_acl < | ||
2248 | - (le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block) + | ||
2249 | - EXT4_SB(sb)->s_gdb_count)) || | ||
2250 | - (ei->i_file_acl >= ext4_blocks_count(EXT4_SB(sb)->s_es)))) { | ||
2251 | + !ext4_data_block_valid(EXT4_SB(sb), ei->i_file_acl, 1)) { | ||
2252 | ext4_error(sb, __func__, | ||
2253 | "bad extended attribute block %llu in inode #%lu", | ||
2254 | ei->i_file_acl, inode->i_ino); | ||
2255 | @@ -4437,10 +4929,8 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) | ||
2256 | /* Validate block references which are part of inode */ | ||
2257 | ret = ext4_check_inode_blockref(inode); | ||
2258 | } | ||
2259 | - if (ret) { | ||
2260 | - brelse(bh); | ||
2261 | + if (ret) | ||
2262 | goto bad_inode; | ||
2263 | - } | ||
2264 | |||
2265 | if (S_ISREG(inode->i_mode)) { | ||
2266 | inode->i_op = &ext4_file_inode_operations; | ||
2267 | @@ -4468,7 +4958,6 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) | ||
2268 | init_special_inode(inode, inode->i_mode, | ||
2269 | new_decode_dev(le32_to_cpu(raw_inode->i_block[1]))); | ||
2270 | } else { | ||
2271 | - brelse(bh); | ||
2272 | ret = -EIO; | ||
2273 | ext4_error(inode->i_sb, __func__, | ||
2274 | "bogus i_mode (%o) for inode=%lu", | ||
2275 | @@ -4481,6 +4970,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) | ||
2276 | return inode; | ||
2277 | |||
2278 | bad_inode: | ||
2279 | + brelse(iloc.bh); | ||
2280 | iget_failed(inode); | ||
2281 | return ERR_PTR(ret); | ||
2282 | } | ||
2283 | @@ -4581,8 +5071,7 @@ static int ext4_do_update_inode(handle_t *handle, | ||
2284 | if (ext4_inode_blocks_set(handle, raw_inode, ei)) | ||
2285 | goto out_brelse; | ||
2286 | raw_inode->i_dtime = cpu_to_le32(ei->i_dtime); | ||
2287 | - /* clear the migrate flag in the raw_inode */ | ||
2288 | - raw_inode->i_flags = cpu_to_le32(ei->i_flags & ~EXT4_EXT_MIGRATE); | ||
2289 | + raw_inode->i_flags = cpu_to_le32(ei->i_flags); | ||
2290 | if (EXT4_SB(inode->i_sb)->s_es->s_creator_os != | ||
2291 | cpu_to_le32(EXT4_OS_HURD)) | ||
2292 | raw_inode->i_file_acl_high = | ||
2293 | @@ -4641,6 +5130,7 @@ static int ext4_do_update_inode(handle_t *handle, | ||
2294 | err = rc; | ||
2295 | ei->i_state &= ~EXT4_STATE_NEW; | ||
2296 | |||
2297 | + ext4_update_inode_fsync_trans(handle, inode, 0); | ||
2298 | out_brelse: | ||
2299 | brelse(bh); | ||
2300 | ext4_std_error(inode->i_sb, err); | ||
2301 | @@ -4684,19 +5174,40 @@ out_brelse: | ||
2302 | */ | ||
2303 | int ext4_write_inode(struct inode *inode, int wait) | ||
2304 | { | ||
2305 | + int err; | ||
2306 | + | ||
2307 | if (current->flags & PF_MEMALLOC) | ||
2308 | return 0; | ||
2309 | |||
2310 | - if (ext4_journal_current_handle()) { | ||
2311 | - jbd_debug(1, "called recursively, non-PF_MEMALLOC!\n"); | ||
2312 | - dump_stack(); | ||
2313 | - return -EIO; | ||
2314 | - } | ||
2315 | + if (EXT4_SB(inode->i_sb)->s_journal) { | ||
2316 | + if (ext4_journal_current_handle()) { | ||
2317 | + jbd_debug(1, "called recursively, non-PF_MEMALLOC!\n"); | ||
2318 | + dump_stack(); | ||
2319 | + return -EIO; | ||
2320 | + } | ||
2321 | |||
2322 | - if (!wait) | ||
2323 | - return 0; | ||
2324 | + if (!wait) | ||
2325 | + return 0; | ||
2326 | + | ||
2327 | + err = ext4_force_commit(inode->i_sb); | ||
2328 | + } else { | ||
2329 | + struct ext4_iloc iloc; | ||
2330 | |||
2331 | - return ext4_force_commit(inode->i_sb); | ||
2332 | + err = ext4_get_inode_loc(inode, &iloc); | ||
2333 | + if (err) | ||
2334 | + return err; | ||
2335 | + if (wait) | ||
2336 | + sync_dirty_buffer(iloc.bh); | ||
2337 | + if (buffer_req(iloc.bh) && !buffer_uptodate(iloc.bh)) { | ||
2338 | + ext4_error(inode->i_sb, __func__, | ||
2339 | + "IO error syncing inode, " | ||
2340 | + "inode=%lu, block=%llu", | ||
2341 | + inode->i_ino, | ||
2342 | + (unsigned long long)iloc.bh->b_blocknr); | ||
2343 | + err = -EIO; | ||
2344 | + } | ||
2345 | + } | ||
2346 | + return err; | ||
2347 | } | ||
2348 | |||
2349 | /* | ||
2350 | @@ -4739,8 +5250,8 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) | ||
2351 | |||
2352 | /* (user+group)*(old+new) structure, inode write (sb, | ||
2353 | * inode block, ? - but truncate inode update has it) */ | ||
2354 | - handle = ext4_journal_start(inode, 2*(EXT4_QUOTA_INIT_BLOCKS(inode->i_sb)+ | ||
2355 | - EXT4_QUOTA_DEL_BLOCKS(inode->i_sb))+3); | ||
2356 | + handle = ext4_journal_start(inode, (EXT4_MAXQUOTAS_INIT_BLOCKS(inode->i_sb)+ | ||
2357 | + EXT4_MAXQUOTAS_DEL_BLOCKS(inode->i_sb))+3); | ||
2358 | if (IS_ERR(handle)) { | ||
2359 | error = PTR_ERR(handle); | ||
2360 | goto err_out; | ||
2361 | @@ -5137,24 +5648,13 @@ void ext4_dirty_inode(struct inode *inode) | ||
2362 | handle_t *current_handle = ext4_journal_current_handle(); | ||
2363 | handle_t *handle; | ||
2364 | |||
2365 | - if (!ext4_handle_valid(current_handle)) { | ||
2366 | - ext4_mark_inode_dirty(current_handle, inode); | ||
2367 | - return; | ||
2368 | - } | ||
2369 | - | ||
2370 | handle = ext4_journal_start(inode, 2); | ||
2371 | if (IS_ERR(handle)) | ||
2372 | goto out; | ||
2373 | - if (current_handle && | ||
2374 | - current_handle->h_transaction != handle->h_transaction) { | ||
2375 | - /* This task has a transaction open against a different fs */ | ||
2376 | - printk(KERN_EMERG "%s: transactions do not match!\n", | ||
2377 | - __func__); | ||
2378 | - } else { | ||
2379 | - jbd_debug(5, "marking dirty. outer handle=%p\n", | ||
2380 | - current_handle); | ||
2381 | - ext4_mark_inode_dirty(handle, inode); | ||
2382 | - } | ||
2383 | + | ||
2384 | + jbd_debug(5, "marking dirty. outer handle=%p\n", current_handle); | ||
2385 | + ext4_mark_inode_dirty(handle, inode); | ||
2386 | + | ||
2387 | ext4_journal_stop(handle); | ||
2388 | out: | ||
2389 | return; | ||
2390 | @@ -5281,12 +5781,21 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) | ||
2391 | else | ||
2392 | len = PAGE_CACHE_SIZE; | ||
2393 | |||
2394 | + lock_page(page); | ||
2395 | + /* | ||
2396 | + * return if we have all the buffers mapped. This avoid | ||
2397 | + * the need to call write_begin/write_end which does a | ||
2398 | + * journal_start/journal_stop which can block and take | ||
2399 | + * long time | ||
2400 | + */ | ||
2401 | if (page_has_buffers(page)) { | ||
2402 | - /* return if we have all the buffers mapped */ | ||
2403 | if (!walk_page_buffers(NULL, page_buffers(page), 0, len, NULL, | ||
2404 | - ext4_bh_unmapped)) | ||
2405 | + ext4_bh_unmapped)) { | ||
2406 | + unlock_page(page); | ||
2407 | goto out_unlock; | ||
2408 | + } | ||
2409 | } | ||
2410 | + unlock_page(page); | ||
2411 | /* | ||
2412 | * OK, we need to fill the hole... Do write_begin write_end | ||
2413 | * to do block allocation/reservation.We are not holding | ||
2414 | diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c | ||
2415 | index 7050a9c..b63d193 100644 | ||
2416 | --- a/fs/ext4/ioctl.c | ||
2417 | +++ b/fs/ext4/ioctl.c | ||
2418 | @@ -221,32 +221,38 @@ setversion_out: | ||
2419 | struct file *donor_filp; | ||
2420 | int err; | ||
2421 | |||
2422 | + if (!(filp->f_mode & FMODE_READ) || | ||
2423 | + !(filp->f_mode & FMODE_WRITE)) | ||
2424 | + return -EBADF; | ||
2425 | + | ||
2426 | if (copy_from_user(&me, | ||
2427 | (struct move_extent __user *)arg, sizeof(me))) | ||
2428 | return -EFAULT; | ||
2429 | + me.moved_len = 0; | ||
2430 | |||
2431 | donor_filp = fget(me.donor_fd); | ||
2432 | if (!donor_filp) | ||
2433 | return -EBADF; | ||
2434 | |||
2435 | - if (!capable(CAP_DAC_OVERRIDE)) { | ||
2436 | - if ((current->real_cred->fsuid != inode->i_uid) || | ||
2437 | - !(inode->i_mode & S_IRUSR) || | ||
2438 | - !(donor_filp->f_dentry->d_inode->i_mode & | ||
2439 | - S_IRUSR)) { | ||
2440 | - fput(donor_filp); | ||
2441 | - return -EACCES; | ||
2442 | - } | ||
2443 | + if (!(donor_filp->f_mode & FMODE_WRITE)) { | ||
2444 | + err = -EBADF; | ||
2445 | + goto mext_out; | ||
2446 | } | ||
2447 | |||
2448 | + err = mnt_want_write(filp->f_path.mnt); | ||
2449 | + if (err) | ||
2450 | + goto mext_out; | ||
2451 | + | ||
2452 | err = ext4_move_extents(filp, donor_filp, me.orig_start, | ||
2453 | me.donor_start, me.len, &me.moved_len); | ||
2454 | - fput(donor_filp); | ||
2455 | + mnt_drop_write(filp->f_path.mnt); | ||
2456 | + if (me.moved_len > 0) | ||
2457 | + file_remove_suid(donor_filp); | ||
2458 | |||
2459 | - if (!err) | ||
2460 | - if (copy_to_user((struct move_extent *)arg, | ||
2461 | - &me, sizeof(me))) | ||
2462 | - return -EFAULT; | ||
2463 | + if (copy_to_user((struct move_extent *)arg, &me, sizeof(me))) | ||
2464 | + err = -EFAULT; | ||
2465 | +mext_out: | ||
2466 | + fput(donor_filp); | ||
2467 | return err; | ||
2468 | } | ||
2469 | |||
2470 | diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c | ||
2471 | index cd25846..099fd47 100644 | ||
2472 | --- a/fs/ext4/mballoc.c | ||
2473 | +++ b/fs/ext4/mballoc.c | ||
2474 | @@ -908,6 +908,97 @@ out: | ||
2475 | return err; | ||
2476 | } | ||
2477 | |||
2478 | +static noinline_for_stack | ||
2479 | +int ext4_mb_init_group(struct super_block *sb, ext4_group_t group) | ||
2480 | +{ | ||
2481 | + | ||
2482 | + int ret = 0; | ||
2483 | + void *bitmap; | ||
2484 | + int blocks_per_page; | ||
2485 | + int block, pnum, poff; | ||
2486 | + int num_grp_locked = 0; | ||
2487 | + struct ext4_group_info *this_grp; | ||
2488 | + struct ext4_sb_info *sbi = EXT4_SB(sb); | ||
2489 | + struct inode *inode = sbi->s_buddy_cache; | ||
2490 | + struct page *page = NULL, *bitmap_page = NULL; | ||
2491 | + | ||
2492 | + mb_debug("init group %lu\n", group); | ||
2493 | + blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; | ||
2494 | + this_grp = ext4_get_group_info(sb, group); | ||
2495 | + /* | ||
2496 | + * This ensures we don't add group | ||
2497 | + * to this buddy cache via resize | ||
2498 | + */ | ||
2499 | + num_grp_locked = ext4_mb_get_buddy_cache_lock(sb, group); | ||
2500 | + if (!EXT4_MB_GRP_NEED_INIT(this_grp)) { | ||
2501 | + /* | ||
2502 | + * somebody initialized the group | ||
2503 | + * return without doing anything | ||
2504 | + */ | ||
2505 | + ret = 0; | ||
2506 | + goto err; | ||
2507 | + } | ||
2508 | + /* | ||
2509 | + * the buddy cache inode stores the block bitmap | ||
2510 | + * and buddy information in consecutive blocks. | ||
2511 | + * So for each group we need two blocks. | ||
2512 | + */ | ||
2513 | + block = group * 2; | ||
2514 | + pnum = block / blocks_per_page; | ||
2515 | + poff = block % blocks_per_page; | ||
2516 | + page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); | ||
2517 | + if (page) { | ||
2518 | + BUG_ON(page->mapping != inode->i_mapping); | ||
2519 | + ret = ext4_mb_init_cache(page, NULL); | ||
2520 | + if (ret) { | ||
2521 | + unlock_page(page); | ||
2522 | + goto err; | ||
2523 | + } | ||
2524 | + unlock_page(page); | ||
2525 | + } | ||
2526 | + if (page == NULL || !PageUptodate(page)) { | ||
2527 | + ret = -EIO; | ||
2528 | + goto err; | ||
2529 | + } | ||
2530 | + mark_page_accessed(page); | ||
2531 | + bitmap_page = page; | ||
2532 | + bitmap = page_address(page) + (poff * sb->s_blocksize); | ||
2533 | + | ||
2534 | + /* init buddy cache */ | ||
2535 | + block++; | ||
2536 | + pnum = block / blocks_per_page; | ||
2537 | + poff = block % blocks_per_page; | ||
2538 | + page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); | ||
2539 | + if (page == bitmap_page) { | ||
2540 | + /* | ||
2541 | + * If both the bitmap and buddy are in | ||
2542 | + * the same page we don't need to force | ||
2543 | + * init the buddy | ||
2544 | + */ | ||
2545 | + unlock_page(page); | ||
2546 | + } else if (page) { | ||
2547 | + BUG_ON(page->mapping != inode->i_mapping); | ||
2548 | + ret = ext4_mb_init_cache(page, bitmap); | ||
2549 | + if (ret) { | ||
2550 | + unlock_page(page); | ||
2551 | + goto err; | ||
2552 | + } | ||
2553 | + unlock_page(page); | ||
2554 | + } | ||
2555 | + if (page == NULL || !PageUptodate(page)) { | ||
2556 | + ret = -EIO; | ||
2557 | + goto err; | ||
2558 | + } | ||
2559 | + mark_page_accessed(page); | ||
2560 | +err: | ||
2561 | + ext4_mb_put_buddy_cache_lock(sb, group, num_grp_locked); | ||
2562 | + if (bitmap_page) | ||
2563 | + page_cache_release(bitmap_page); | ||
2564 | + if (page) | ||
2565 | + page_cache_release(page); | ||
2566 | + return ret; | ||
2567 | +} | ||
2568 | + | ||
2569 | static noinline_for_stack int | ||
2570 | ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, | ||
2571 | struct ext4_buddy *e4b) | ||
2572 | @@ -941,8 +1032,26 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, | ||
2573 | * groups mapped by the page is blocked | ||
2574 | * till we are done with allocation | ||
2575 | */ | ||
2576 | +repeat_load_buddy: | ||
2577 | down_read(e4b->alloc_semp); | ||
2578 | |||
2579 | + if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) { | ||
2580 | + /* we need to check for group need init flag | ||
2581 | + * with alloc_semp held so that we can be sure | ||
2582 | + * that new blocks didn't get added to the group | ||
2583 | + * when we are loading the buddy cache | ||
2584 | + */ | ||
2585 | + up_read(e4b->alloc_semp); | ||
2586 | + /* | ||
2587 | + * we need full data about the group | ||
2588 | + * to make a good selection | ||
2589 | + */ | ||
2590 | + ret = ext4_mb_init_group(sb, group); | ||
2591 | + if (ret) | ||
2592 | + return ret; | ||
2593 | + goto repeat_load_buddy; | ||
2594 | + } | ||
2595 | + | ||
2596 | /* | ||
2597 | * the buddy cache inode stores the block bitmap | ||
2598 | * and buddy information in consecutive blocks. | ||
2599 | @@ -1360,7 +1469,7 @@ static void ext4_mb_use_best_found(struct ext4_allocation_context *ac, | ||
2600 | ac->alloc_semp = e4b->alloc_semp; | ||
2601 | e4b->alloc_semp = NULL; | ||
2602 | /* store last allocated for subsequent stream allocation */ | ||
2603 | - if ((ac->ac_flags & EXT4_MB_HINT_DATA)) { | ||
2604 | + if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) { | ||
2605 | spin_lock(&sbi->s_md_lock); | ||
2606 | sbi->s_mb_last_group = ac->ac_f_ex.fe_group; | ||
2607 | sbi->s_mb_last_start = ac->ac_f_ex.fe_start; | ||
2608 | @@ -1837,97 +1946,6 @@ void ext4_mb_put_buddy_cache_lock(struct super_block *sb, | ||
2609 | |||
2610 | } | ||
2611 | |||
2612 | -static noinline_for_stack | ||
2613 | -int ext4_mb_init_group(struct super_block *sb, ext4_group_t group) | ||
2614 | -{ | ||
2615 | - | ||
2616 | - int ret; | ||
2617 | - void *bitmap; | ||
2618 | - int blocks_per_page; | ||
2619 | - int block, pnum, poff; | ||
2620 | - int num_grp_locked = 0; | ||
2621 | - struct ext4_group_info *this_grp; | ||
2622 | - struct ext4_sb_info *sbi = EXT4_SB(sb); | ||
2623 | - struct inode *inode = sbi->s_buddy_cache; | ||
2624 | - struct page *page = NULL, *bitmap_page = NULL; | ||
2625 | - | ||
2626 | - mb_debug("init group %lu\n", group); | ||
2627 | - blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; | ||
2628 | - this_grp = ext4_get_group_info(sb, group); | ||
2629 | - /* | ||
2630 | - * This ensures we don't add group | ||
2631 | - * to this buddy cache via resize | ||
2632 | - */ | ||
2633 | - num_grp_locked = ext4_mb_get_buddy_cache_lock(sb, group); | ||
2634 | - if (!EXT4_MB_GRP_NEED_INIT(this_grp)) { | ||
2635 | - /* | ||
2636 | - * somebody initialized the group | ||
2637 | - * return without doing anything | ||
2638 | - */ | ||
2639 | - ret = 0; | ||
2640 | - goto err; | ||
2641 | - } | ||
2642 | - /* | ||
2643 | - * the buddy cache inode stores the block bitmap | ||
2644 | - * and buddy information in consecutive blocks. | ||
2645 | - * So for each group we need two blocks. | ||
2646 | - */ | ||
2647 | - block = group * 2; | ||
2648 | - pnum = block / blocks_per_page; | ||
2649 | - poff = block % blocks_per_page; | ||
2650 | - page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); | ||
2651 | - if (page) { | ||
2652 | - BUG_ON(page->mapping != inode->i_mapping); | ||
2653 | - ret = ext4_mb_init_cache(page, NULL); | ||
2654 | - if (ret) { | ||
2655 | - unlock_page(page); | ||
2656 | - goto err; | ||
2657 | - } | ||
2658 | - unlock_page(page); | ||
2659 | - } | ||
2660 | - if (page == NULL || !PageUptodate(page)) { | ||
2661 | - ret = -EIO; | ||
2662 | - goto err; | ||
2663 | - } | ||
2664 | - mark_page_accessed(page); | ||
2665 | - bitmap_page = page; | ||
2666 | - bitmap = page_address(page) + (poff * sb->s_blocksize); | ||
2667 | - | ||
2668 | - /* init buddy cache */ | ||
2669 | - block++; | ||
2670 | - pnum = block / blocks_per_page; | ||
2671 | - poff = block % blocks_per_page; | ||
2672 | - page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); | ||
2673 | - if (page == bitmap_page) { | ||
2674 | - /* | ||
2675 | - * If both the bitmap and buddy are in | ||
2676 | - * the same page we don't need to force | ||
2677 | - * init the buddy | ||
2678 | - */ | ||
2679 | - unlock_page(page); | ||
2680 | - } else if (page) { | ||
2681 | - BUG_ON(page->mapping != inode->i_mapping); | ||
2682 | - ret = ext4_mb_init_cache(page, bitmap); | ||
2683 | - if (ret) { | ||
2684 | - unlock_page(page); | ||
2685 | - goto err; | ||
2686 | - } | ||
2687 | - unlock_page(page); | ||
2688 | - } | ||
2689 | - if (page == NULL || !PageUptodate(page)) { | ||
2690 | - ret = -EIO; | ||
2691 | - goto err; | ||
2692 | - } | ||
2693 | - mark_page_accessed(page); | ||
2694 | -err: | ||
2695 | - ext4_mb_put_buddy_cache_lock(sb, group, num_grp_locked); | ||
2696 | - if (bitmap_page) | ||
2697 | - page_cache_release(bitmap_page); | ||
2698 | - if (page) | ||
2699 | - page_cache_release(page); | ||
2700 | - return ret; | ||
2701 | -} | ||
2702 | - | ||
2703 | static noinline_for_stack int | ||
2704 | ext4_mb_regular_allocator(struct ext4_allocation_context *ac) | ||
2705 | { | ||
2706 | @@ -1938,11 +1956,14 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac) | ||
2707 | struct ext4_sb_info *sbi; | ||
2708 | struct super_block *sb; | ||
2709 | struct ext4_buddy e4b; | ||
2710 | - loff_t size, isize; | ||
2711 | |||
2712 | sb = ac->ac_sb; | ||
2713 | sbi = EXT4_SB(sb); | ||
2714 | ngroups = ext4_get_groups_count(sb); | ||
2715 | + /* non-extent files are limited to low blocks/groups */ | ||
2716 | + if (!(EXT4_I(ac->ac_inode)->i_flags & EXT4_EXTENTS_FL)) | ||
2717 | + ngroups = sbi->s_blockfile_groups; | ||
2718 | + | ||
2719 | BUG_ON(ac->ac_status == AC_STATUS_FOUND); | ||
2720 | |||
2721 | /* first, try the goal */ | ||
2722 | @@ -1974,20 +1995,16 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac) | ||
2723 | } | ||
2724 | |||
2725 | bsbits = ac->ac_sb->s_blocksize_bits; | ||
2726 | - /* if stream allocation is enabled, use global goal */ | ||
2727 | - size = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len; | ||
2728 | - isize = i_size_read(ac->ac_inode) >> bsbits; | ||
2729 | - if (size < isize) | ||
2730 | - size = isize; | ||
2731 | |||
2732 | - if (size < sbi->s_mb_stream_request && | ||
2733 | - (ac->ac_flags & EXT4_MB_HINT_DATA)) { | ||
2734 | + /* if stream allocation is enabled, use global goal */ | ||
2735 | + if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) { | ||
2736 | /* TBD: may be hot point */ | ||
2737 | spin_lock(&sbi->s_md_lock); | ||
2738 | ac->ac_g_ex.fe_group = sbi->s_mb_last_group; | ||
2739 | ac->ac_g_ex.fe_start = sbi->s_mb_last_start; | ||
2740 | spin_unlock(&sbi->s_md_lock); | ||
2741 | } | ||
2742 | + | ||
2743 | /* Let's just scan groups to find more-less suitable blocks */ | ||
2744 | cr = ac->ac_2order ? 0 : 1; | ||
2745 | /* | ||
2746 | @@ -2015,27 +2032,6 @@ repeat: | ||
2747 | if (grp->bb_free == 0) | ||
2748 | continue; | ||
2749 | |||
2750 | - /* | ||
2751 | - * if the group is already init we check whether it is | ||
2752 | - * a good group and if not we don't load the buddy | ||
2753 | - */ | ||
2754 | - if (EXT4_MB_GRP_NEED_INIT(grp)) { | ||
2755 | - /* | ||
2756 | - * we need full data about the group | ||
2757 | - * to make a good selection | ||
2758 | - */ | ||
2759 | - err = ext4_mb_init_group(sb, group); | ||
2760 | - if (err) | ||
2761 | - goto out; | ||
2762 | - } | ||
2763 | - | ||
2764 | - /* | ||
2765 | - * If the particular group doesn't satisfy our | ||
2766 | - * criteria we continue with the next group | ||
2767 | - */ | ||
2768 | - if (!ext4_mb_good_group(ac, group, cr)) | ||
2769 | - continue; | ||
2770 | - | ||
2771 | err = ext4_mb_load_buddy(sb, group, &e4b); | ||
2772 | if (err) | ||
2773 | goto out; | ||
2774 | @@ -2571,13 +2567,11 @@ static int ext4_mb_init_backend(struct super_block *sb) | ||
2775 | { | ||
2776 | ext4_group_t ngroups = ext4_get_groups_count(sb); | ||
2777 | ext4_group_t i; | ||
2778 | - int metalen; | ||
2779 | struct ext4_sb_info *sbi = EXT4_SB(sb); | ||
2780 | struct ext4_super_block *es = sbi->s_es; | ||
2781 | int num_meta_group_infos; | ||
2782 | int num_meta_group_infos_max; | ||
2783 | int array_size; | ||
2784 | - struct ext4_group_info **meta_group_info; | ||
2785 | struct ext4_group_desc *desc; | ||
2786 | |||
2787 | /* This is the number of blocks used by GDT */ | ||
2788 | @@ -2622,22 +2616,6 @@ static int ext4_mb_init_backend(struct super_block *sb) | ||
2789 | goto err_freesgi; | ||
2790 | } | ||
2791 | EXT4_I(sbi->s_buddy_cache)->i_disksize = 0; | ||
2792 | - | ||
2793 | - metalen = sizeof(*meta_group_info) << EXT4_DESC_PER_BLOCK_BITS(sb); | ||
2794 | - for (i = 0; i < num_meta_group_infos; i++) { | ||
2795 | - if ((i + 1) == num_meta_group_infos) | ||
2796 | - metalen = sizeof(*meta_group_info) * | ||
2797 | - (ngroups - | ||
2798 | - (i << EXT4_DESC_PER_BLOCK_BITS(sb))); | ||
2799 | - meta_group_info = kmalloc(metalen, GFP_KERNEL); | ||
2800 | - if (meta_group_info == NULL) { | ||
2801 | - printk(KERN_ERR "EXT4-fs: can't allocate mem for a " | ||
2802 | - "buddy group\n"); | ||
2803 | - goto err_freemeta; | ||
2804 | - } | ||
2805 | - sbi->s_group_info[i] = meta_group_info; | ||
2806 | - } | ||
2807 | - | ||
2808 | for (i = 0; i < ngroups; i++) { | ||
2809 | desc = ext4_get_group_desc(sb, i, NULL); | ||
2810 | if (desc == NULL) { | ||
2811 | @@ -2655,7 +2633,6 @@ err_freebuddy: | ||
2812 | while (i-- > 0) | ||
2813 | kfree(ext4_get_group_info(sb, i)); | ||
2814 | i = num_meta_group_infos; | ||
2815 | -err_freemeta: | ||
2816 | while (i-- > 0) | ||
2817 | kfree(sbi->s_group_info[i]); | ||
2818 | iput(sbi->s_buddy_cache); | ||
2819 | @@ -2833,7 +2810,6 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn) | ||
2820 | struct ext4_group_info *db; | ||
2821 | int err, count = 0, count2 = 0; | ||
2822 | struct ext4_free_data *entry; | ||
2823 | - ext4_fsblk_t discard_block; | ||
2824 | struct list_head *l, *ltmp; | ||
2825 | |||
2826 | list_for_each_safe(l, ltmp, &txn->t_private_list) { | ||
2827 | @@ -2863,13 +2839,19 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn) | ||
2828 | page_cache_release(e4b.bd_bitmap_page); | ||
2829 | } | ||
2830 | ext4_unlock_group(sb, entry->group); | ||
2831 | - discard_block = (ext4_fsblk_t) entry->group * EXT4_BLOCKS_PER_GROUP(sb) | ||
2832 | - + entry->start_blk | ||
2833 | - + le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); | ||
2834 | - trace_ext4_discard_blocks(sb, (unsigned long long)discard_block, | ||
2835 | - entry->count); | ||
2836 | - sb_issue_discard(sb, discard_block, entry->count); | ||
2837 | - | ||
2838 | + if (test_opt(sb, DISCARD)) { | ||
2839 | + ext4_fsblk_t discard_block; | ||
2840 | + struct ext4_super_block *es = EXT4_SB(sb)->s_es; | ||
2841 | + | ||
2842 | + discard_block = (ext4_fsblk_t)entry->group * | ||
2843 | + EXT4_BLOCKS_PER_GROUP(sb) | ||
2844 | + + entry->start_blk | ||
2845 | + + le32_to_cpu(es->s_first_data_block); | ||
2846 | + trace_ext4_discard_blocks(sb, | ||
2847 | + (unsigned long long)discard_block, | ||
2848 | + entry->count); | ||
2849 | + sb_issue_discard(sb, discard_block, entry->count); | ||
2850 | + } | ||
2851 | kmem_cache_free(ext4_free_ext_cachep, entry); | ||
2852 | ext4_mb_release_desc(&e4b); | ||
2853 | } | ||
2854 | @@ -3276,6 +3258,24 @@ static void ext4_mb_collect_stats(struct ext4_allocation_context *ac) | ||
2855 | } | ||
2856 | |||
2857 | /* | ||
2858 | + * Called on failure; free up any blocks from the inode PA for this | ||
2859 | + * context. We don't need this for MB_GROUP_PA because we only change | ||
2860 | + * pa_free in ext4_mb_release_context(), but on failure, we've already | ||
2861 | + * zeroed out ac->ac_b_ex.fe_len, so group_pa->pa_free is not changed. | ||
2862 | + */ | ||
2863 | +static void ext4_discard_allocated_blocks(struct ext4_allocation_context *ac) | ||
2864 | +{ | ||
2865 | + struct ext4_prealloc_space *pa = ac->ac_pa; | ||
2866 | + int len; | ||
2867 | + | ||
2868 | + if (pa && pa->pa_type == MB_INODE_PA) { | ||
2869 | + len = ac->ac_b_ex.fe_len; | ||
2870 | + pa->pa_free += len; | ||
2871 | + } | ||
2872 | + | ||
2873 | +} | ||
2874 | + | ||
2875 | +/* | ||
2876 | * use blocks preallocated to inode | ||
2877 | */ | ||
2878 | static void ext4_mb_use_inode_pa(struct ext4_allocation_context *ac, | ||
2879 | @@ -3382,6 +3382,11 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac) | ||
2880 | ac->ac_o_ex.fe_logical >= pa->pa_lstart + pa->pa_len) | ||
2881 | continue; | ||
2882 | |||
2883 | + /* non-extent files can't have physical blocks past 2^32 */ | ||
2884 | + if (!(EXT4_I(ac->ac_inode)->i_flags & EXT4_EXTENTS_FL) && | ||
2885 | + pa->pa_pstart + pa->pa_len > EXT4_MAX_BLOCK_FILE_PHYS) | ||
2886 | + continue; | ||
2887 | + | ||
2888 | /* found preallocated blocks, use them */ | ||
2889 | spin_lock(&pa->pa_lock); | ||
2890 | if (pa->pa_deleted == 0 && pa->pa_free) { | ||
2891 | @@ -4174,16 +4179,26 @@ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac) | ||
2892 | if (!(ac->ac_flags & EXT4_MB_HINT_DATA)) | ||
2893 | return; | ||
2894 | |||
2895 | + if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY)) | ||
2896 | + return; | ||
2897 | + | ||
2898 | size = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len; | ||
2899 | - isize = i_size_read(ac->ac_inode) >> bsbits; | ||
2900 | - size = max(size, isize); | ||
2901 | + isize = (i_size_read(ac->ac_inode) + ac->ac_sb->s_blocksize - 1) | ||
2902 | + >> bsbits; | ||
2903 | |||
2904 | - /* don't use group allocation for large files */ | ||
2905 | - if (size >= sbi->s_mb_stream_request) | ||
2906 | + if ((size == isize) && | ||
2907 | + !ext4_fs_is_busy(sbi) && | ||
2908 | + (atomic_read(&ac->ac_inode->i_writecount) == 0)) { | ||
2909 | + ac->ac_flags |= EXT4_MB_HINT_NOPREALLOC; | ||
2910 | return; | ||
2911 | + } | ||
2912 | |||
2913 | - if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY)) | ||
2914 | + /* don't use group allocation for large files */ | ||
2915 | + size = max(size, isize); | ||
2916 | + if (size >= sbi->s_mb_stream_request) { | ||
2917 | + ac->ac_flags |= EXT4_MB_STREAM_ALLOC; | ||
2918 | return; | ||
2919 | + } | ||
2920 | |||
2921 | BUG_ON(ac->ac_lg != NULL); | ||
2922 | /* | ||
2923 | @@ -4549,6 +4564,7 @@ repeat: | ||
2924 | ac->ac_status = AC_STATUS_CONTINUE; | ||
2925 | goto repeat; | ||
2926 | } else if (*errp) { | ||
2927 | + ext4_discard_allocated_blocks(ac); | ||
2928 | ac->ac_b_ex.fe_len = 0; | ||
2929 | ar->len = 0; | ||
2930 | ext4_mb_show_ac(ac); | ||
2931 | diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c | ||
2932 | index 313a50b..8646149 100644 | ||
2933 | --- a/fs/ext4/migrate.c | ||
2934 | +++ b/fs/ext4/migrate.c | ||
2935 | @@ -75,7 +75,7 @@ static int finish_range(handle_t *handle, struct inode *inode, | ||
2936 | goto err_out; | ||
2937 | } | ||
2938 | } | ||
2939 | - retval = ext4_ext_insert_extent(handle, inode, path, &newext); | ||
2940 | + retval = ext4_ext_insert_extent(handle, inode, path, &newext, 0); | ||
2941 | err_out: | ||
2942 | if (path) { | ||
2943 | ext4_ext_drop_refs(path); | ||
2944 | @@ -238,7 +238,7 @@ static int extend_credit_for_blkdel(handle_t *handle, struct inode *inode) | ||
2945 | * So allocate a credit of 3. We may update | ||
2946 | * quota (user and group). | ||
2947 | */ | ||
2948 | - needed = 3 + 2*EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb); | ||
2949 | + needed = 3 + EXT4_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb); | ||
2950 | |||
2951 | if (ext4_journal_extend(handle, needed) != 0) | ||
2952 | retval = ext4_journal_restart(handle, needed); | ||
2953 | @@ -353,17 +353,16 @@ static int ext4_ext_swap_inode_data(handle_t *handle, struct inode *inode, | ||
2954 | |||
2955 | down_write(&EXT4_I(inode)->i_data_sem); | ||
2956 | /* | ||
2957 | - * if EXT4_EXT_MIGRATE is cleared a block allocation | ||
2958 | + * if EXT4_STATE_EXT_MIGRATE is cleared a block allocation | ||
2959 | * happened after we started the migrate. We need to | ||
2960 | * fail the migrate | ||
2961 | */ | ||
2962 | - if (!(EXT4_I(inode)->i_flags & EXT4_EXT_MIGRATE)) { | ||
2963 | + if (!(EXT4_I(inode)->i_state & EXT4_STATE_EXT_MIGRATE)) { | ||
2964 | retval = -EAGAIN; | ||
2965 | up_write(&EXT4_I(inode)->i_data_sem); | ||
2966 | goto err_out; | ||
2967 | } else | ||
2968 | - EXT4_I(inode)->i_flags = EXT4_I(inode)->i_flags & | ||
2969 | - ~EXT4_EXT_MIGRATE; | ||
2970 | + EXT4_I(inode)->i_state &= ~EXT4_STATE_EXT_MIGRATE; | ||
2971 | /* | ||
2972 | * We have the extent map build with the tmp inode. | ||
2973 | * Now copy the i_data across | ||
2974 | @@ -478,7 +477,7 @@ int ext4_ext_migrate(struct inode *inode) | ||
2975 | handle = ext4_journal_start(inode, | ||
2976 | EXT4_DATA_TRANS_BLOCKS(inode->i_sb) + | ||
2977 | EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 + | ||
2978 | - 2 * EXT4_QUOTA_INIT_BLOCKS(inode->i_sb) | ||
2979 | + EXT4_MAXQUOTAS_INIT_BLOCKS(inode->i_sb) | ||
2980 | + 1); | ||
2981 | if (IS_ERR(handle)) { | ||
2982 | retval = PTR_ERR(handle); | ||
2983 | @@ -517,14 +516,15 @@ int ext4_ext_migrate(struct inode *inode) | ||
2984 | * when we add extents we extent the journal | ||
2985 | */ | ||
2986 | /* | ||
2987 | - * Even though we take i_mutex we can still cause block allocation | ||
2988 | - * via mmap write to holes. If we have allocated new blocks we fail | ||
2989 | - * migrate. New block allocation will clear EXT4_EXT_MIGRATE flag. | ||
2990 | - * The flag is updated with i_data_sem held to prevent racing with | ||
2991 | - * block allocation. | ||
2992 | + * Even though we take i_mutex we can still cause block | ||
2993 | + * allocation via mmap write to holes. If we have allocated | ||
2994 | + * new blocks we fail migrate. New block allocation will | ||
2995 | + * clear EXT4_STATE_EXT_MIGRATE flag. The flag is updated | ||
2996 | + * with i_data_sem held to prevent racing with block | ||
2997 | + * allocation. | ||
2998 | */ | ||
2999 | down_read((&EXT4_I(inode)->i_data_sem)); | ||
3000 | - EXT4_I(inode)->i_flags = EXT4_I(inode)->i_flags | EXT4_EXT_MIGRATE; | ||
3001 | + EXT4_I(inode)->i_state |= EXT4_STATE_EXT_MIGRATE; | ||
3002 | up_read((&EXT4_I(inode)->i_data_sem)); | ||
3003 | |||
3004 | handle = ext4_journal_start(inode, 1); | ||
3005 | @@ -618,7 +618,7 @@ err_out: | ||
3006 | tmp_inode->i_nlink = 0; | ||
3007 | |||
3008 | ext4_journal_stop(handle); | ||
3009 | - | ||
3010 | + unlock_new_inode(tmp_inode); | ||
3011 | iput(tmp_inode); | ||
3012 | |||
3013 | return retval; | ||
3014 | diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c | ||
3015 | index bbf2dd9..9a573a6 100644 | ||
3016 | --- a/fs/ext4/move_extent.c | ||
3017 | +++ b/fs/ext4/move_extent.c | ||
3018 | @@ -19,14 +19,31 @@ | ||
3019 | #include "ext4_extents.h" | ||
3020 | #include "ext4.h" | ||
3021 | |||
3022 | -#define get_ext_path(path, inode, block, ret) \ | ||
3023 | - do { \ | ||
3024 | - path = ext4_ext_find_extent(inode, block, path); \ | ||
3025 | - if (IS_ERR(path)) { \ | ||
3026 | - ret = PTR_ERR(path); \ | ||
3027 | - path = NULL; \ | ||
3028 | - } \ | ||
3029 | - } while (0) | ||
3030 | +/** | ||
3031 | + * get_ext_path - Find an extent path for designated logical block number. | ||
3032 | + * | ||
3033 | + * @inode: an inode which is searched | ||
3034 | + * @lblock: logical block number to find an extent path | ||
3035 | + * @path: pointer to an extent path pointer (for output) | ||
3036 | + * | ||
3037 | + * ext4_ext_find_extent wrapper. Return 0 on success, or a negative error value | ||
3038 | + * on failure. | ||
3039 | + */ | ||
3040 | +static inline int | ||
3041 | +get_ext_path(struct inode *inode, ext4_lblk_t lblock, | ||
3042 | + struct ext4_ext_path **path) | ||
3043 | +{ | ||
3044 | + int ret = 0; | ||
3045 | + | ||
3046 | + *path = ext4_ext_find_extent(inode, lblock, *path); | ||
3047 | + if (IS_ERR(*path)) { | ||
3048 | + ret = PTR_ERR(*path); | ||
3049 | + *path = NULL; | ||
3050 | + } else if ((*path)[ext_depth(inode)].p_ext == NULL) | ||
3051 | + ret = -ENODATA; | ||
3052 | + | ||
3053 | + return ret; | ||
3054 | +} | ||
3055 | |||
3056 | /** | ||
3057 | * copy_extent_status - Copy the extent's initialization status | ||
3058 | @@ -60,12 +77,14 @@ static int | ||
3059 | mext_next_extent(struct inode *inode, struct ext4_ext_path *path, | ||
3060 | struct ext4_extent **extent) | ||
3061 | { | ||
3062 | + struct ext4_extent_header *eh; | ||
3063 | int ppos, leaf_ppos = path->p_depth; | ||
3064 | |||
3065 | ppos = leaf_ppos; | ||
3066 | if (EXT_LAST_EXTENT(path[ppos].p_hdr) > path[ppos].p_ext) { | ||
3067 | /* leaf block */ | ||
3068 | *extent = ++path[ppos].p_ext; | ||
3069 | + path[ppos].p_block = ext_pblock(path[ppos].p_ext); | ||
3070 | return 0; | ||
3071 | } | ||
3072 | |||
3073 | @@ -102,9 +121,18 @@ mext_next_extent(struct inode *inode, struct ext4_ext_path *path, | ||
3074 | ext_block_hdr(path[cur_ppos+1].p_bh); | ||
3075 | } | ||
3076 | |||
3077 | + path[leaf_ppos].p_ext = *extent = NULL; | ||
3078 | + | ||
3079 | + eh = path[leaf_ppos].p_hdr; | ||
3080 | + if (le16_to_cpu(eh->eh_entries) == 0) | ||
3081 | + /* empty leaf is found */ | ||
3082 | + return -ENODATA; | ||
3083 | + | ||
3084 | /* leaf block */ | ||
3085 | path[leaf_ppos].p_ext = *extent = | ||
3086 | EXT_FIRST_EXTENT(path[leaf_ppos].p_hdr); | ||
3087 | + path[leaf_ppos].p_block = | ||
3088 | + ext_pblock(path[leaf_ppos].p_ext); | ||
3089 | return 0; | ||
3090 | } | ||
3091 | } | ||
3092 | @@ -113,47 +141,43 @@ mext_next_extent(struct inode *inode, struct ext4_ext_path *path, | ||
3093 | } | ||
3094 | |||
3095 | /** | ||
3096 | - * mext_double_down_read - Acquire two inodes' read semaphore | ||
3097 | + * mext_check_null_inode - NULL check for two inodes | ||
3098 | * | ||
3099 | - * @orig_inode: original inode structure | ||
3100 | - * @donor_inode: donor inode structure | ||
3101 | - * Acquire read semaphore of the two inodes (orig and donor) by i_ino order. | ||
3102 | + * If inode1 or inode2 is NULL, return -EIO. Otherwise, return 0. | ||
3103 | */ | ||
3104 | -static void | ||
3105 | -mext_double_down_read(struct inode *orig_inode, struct inode *donor_inode) | ||
3106 | +static int | ||
3107 | +mext_check_null_inode(struct inode *inode1, struct inode *inode2, | ||
3108 | + const char *function) | ||
3109 | { | ||
3110 | - struct inode *first = orig_inode, *second = donor_inode; | ||
3111 | - | ||
3112 | - BUG_ON(orig_inode == NULL || donor_inode == NULL); | ||
3113 | - | ||
3114 | - /* | ||
3115 | - * Use the inode number to provide the stable locking order instead | ||
3116 | - * of its address, because the C language doesn't guarantee you can | ||
3117 | - * compare pointers that don't come from the same array. | ||
3118 | - */ | ||
3119 | - if (donor_inode->i_ino < orig_inode->i_ino) { | ||
3120 | - first = donor_inode; | ||
3121 | - second = orig_inode; | ||
3122 | + int ret = 0; | ||
3123 | + | ||
3124 | + if (inode1 == NULL) { | ||
3125 | + ext4_error(inode2->i_sb, function, | ||
3126 | + "Both inodes should not be NULL: " | ||
3127 | + "inode1 NULL inode2 %lu", inode2->i_ino); | ||
3128 | + ret = -EIO; | ||
3129 | + } else if (inode2 == NULL) { | ||
3130 | + ext4_error(inode1->i_sb, function, | ||
3131 | + "Both inodes should not be NULL: " | ||
3132 | + "inode1 %lu inode2 NULL", inode1->i_ino); | ||
3133 | + ret = -EIO; | ||
3134 | } | ||
3135 | - | ||
3136 | - down_read(&EXT4_I(first)->i_data_sem); | ||
3137 | - down_read(&EXT4_I(second)->i_data_sem); | ||
3138 | + return ret; | ||
3139 | } | ||
3140 | |||
3141 | /** | ||
3142 | - * mext_double_down_write - Acquire two inodes' write semaphore | ||
3143 | + * double_down_write_data_sem - Acquire two inodes' write lock of i_data_sem | ||
3144 | * | ||
3145 | * @orig_inode: original inode structure | ||
3146 | * @donor_inode: donor inode structure | ||
3147 | - * Acquire write semaphore of the two inodes (orig and donor) by i_ino order. | ||
3148 | + * Acquire write lock of i_data_sem of the two inodes (orig and donor) by | ||
3149 | + * i_ino order. | ||
3150 | */ | ||
3151 | static void | ||
3152 | -mext_double_down_write(struct inode *orig_inode, struct inode *donor_inode) | ||
3153 | +double_down_write_data_sem(struct inode *orig_inode, struct inode *donor_inode) | ||
3154 | { | ||
3155 | struct inode *first = orig_inode, *second = donor_inode; | ||
3156 | |||
3157 | - BUG_ON(orig_inode == NULL || donor_inode == NULL); | ||
3158 | - | ||
3159 | /* | ||
3160 | * Use the inode number to provide the stable locking order instead | ||
3161 | * of its address, because the C language doesn't guarantee you can | ||
3162 | @@ -165,37 +189,19 @@ mext_double_down_write(struct inode *orig_inode, struct inode *donor_inode) | ||
3163 | } | ||
3164 | |||
3165 | down_write(&EXT4_I(first)->i_data_sem); | ||
3166 | - down_write(&EXT4_I(second)->i_data_sem); | ||
3167 | -} | ||
3168 | - | ||
3169 | -/** | ||
3170 | - * mext_double_up_read - Release two inodes' read semaphore | ||
3171 | - * | ||
3172 | - * @orig_inode: original inode structure to be released its lock first | ||
3173 | - * @donor_inode: donor inode structure to be released its lock second | ||
3174 | - * Release read semaphore of two inodes (orig and donor). | ||
3175 | - */ | ||
3176 | -static void | ||
3177 | -mext_double_up_read(struct inode *orig_inode, struct inode *donor_inode) | ||
3178 | -{ | ||
3179 | - BUG_ON(orig_inode == NULL || donor_inode == NULL); | ||
3180 | - | ||
3181 | - up_read(&EXT4_I(orig_inode)->i_data_sem); | ||
3182 | - up_read(&EXT4_I(donor_inode)->i_data_sem); | ||
3183 | + down_write_nested(&EXT4_I(second)->i_data_sem, SINGLE_DEPTH_NESTING); | ||
3184 | } | ||
3185 | |||
3186 | /** | ||
3187 | - * mext_double_up_write - Release two inodes' write semaphore | ||
3188 | + * double_up_write_data_sem - Release two inodes' write lock of i_data_sem | ||
3189 | * | ||
3190 | * @orig_inode: original inode structure to be released its lock first | ||
3191 | * @donor_inode: donor inode structure to be released its lock second | ||
3192 | - * Release write semaphore of two inodes (orig and donor). | ||
3193 | + * Release write lock of i_data_sem of two inodes (orig and donor). | ||
3194 | */ | ||
3195 | static void | ||
3196 | -mext_double_up_write(struct inode *orig_inode, struct inode *donor_inode) | ||
3197 | +double_up_write_data_sem(struct inode *orig_inode, struct inode *donor_inode) | ||
3198 | { | ||
3199 | - BUG_ON(orig_inode == NULL || donor_inode == NULL); | ||
3200 | - | ||
3201 | up_write(&EXT4_I(orig_inode)->i_data_sem); | ||
3202 | up_write(&EXT4_I(donor_inode)->i_data_sem); | ||
3203 | } | ||
3204 | @@ -283,23 +289,23 @@ mext_insert_across_blocks(handle_t *handle, struct inode *orig_inode, | ||
3205 | } | ||
3206 | |||
3207 | if (new_flag) { | ||
3208 | - get_ext_path(orig_path, orig_inode, eblock, err); | ||
3209 | - if (orig_path == NULL) | ||
3210 | + err = get_ext_path(orig_inode, eblock, &orig_path); | ||
3211 | + if (err) | ||
3212 | goto out; | ||
3213 | |||
3214 | if (ext4_ext_insert_extent(handle, orig_inode, | ||
3215 | - orig_path, new_ext)) | ||
3216 | + orig_path, new_ext, 0)) | ||
3217 | goto out; | ||
3218 | } | ||
3219 | |||
3220 | if (end_flag) { | ||
3221 | - get_ext_path(orig_path, orig_inode, | ||
3222 | - le32_to_cpu(end_ext->ee_block) - 1, err); | ||
3223 | - if (orig_path == NULL) | ||
3224 | + err = get_ext_path(orig_inode, | ||
3225 | + le32_to_cpu(end_ext->ee_block) - 1, &orig_path); | ||
3226 | + if (err) | ||
3227 | goto out; | ||
3228 | |||
3229 | if (ext4_ext_insert_extent(handle, orig_inode, | ||
3230 | - orig_path, end_ext)) | ||
3231 | + orig_path, end_ext, 0)) | ||
3232 | goto out; | ||
3233 | } | ||
3234 | out: | ||
3235 | @@ -519,7 +525,15 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode, | ||
3236 | * oext |-----------| | ||
3237 | * new_ext |-------| | ||
3238 | */ | ||
3239 | - BUG_ON(le32_to_cpu(oext->ee_block) + oext_alen - 1 < new_ext_end); | ||
3240 | + if (le32_to_cpu(oext->ee_block) + oext_alen - 1 < new_ext_end) { | ||
3241 | + ext4_error(orig_inode->i_sb, __func__, | ||
3242 | + "new_ext_end(%u) should be less than or equal to " | ||
3243 | + "oext->ee_block(%u) + oext_alen(%d) - 1", | ||
3244 | + new_ext_end, le32_to_cpu(oext->ee_block), | ||
3245 | + oext_alen); | ||
3246 | + ret = -EIO; | ||
3247 | + goto out; | ||
3248 | + } | ||
3249 | |||
3250 | /* | ||
3251 | * Case: new_ext is smaller than original extent | ||
3252 | @@ -543,6 +557,7 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode, | ||
3253 | |||
3254 | ret = mext_insert_extents(handle, orig_inode, orig_path, o_start, | ||
3255 | o_end, &start_ext, &new_ext, &end_ext); | ||
3256 | +out: | ||
3257 | return ret; | ||
3258 | } | ||
3259 | |||
3260 | @@ -554,8 +569,10 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode, | ||
3261 | * @orig_off: block offset of original inode | ||
3262 | * @donor_off: block offset of donor inode | ||
3263 | * @max_count: the maximun length of extents | ||
3264 | + * | ||
3265 | + * Return 0 on success, or a negative error value on failure. | ||
3266 | */ | ||
3267 | -static void | ||
3268 | +static int | ||
3269 | mext_calc_swap_extents(struct ext4_extent *tmp_dext, | ||
3270 | struct ext4_extent *tmp_oext, | ||
3271 | ext4_lblk_t orig_off, ext4_lblk_t donor_off, | ||
3272 | @@ -564,6 +581,19 @@ mext_calc_swap_extents(struct ext4_extent *tmp_dext, | ||
3273 | ext4_lblk_t diff, orig_diff; | ||
3274 | struct ext4_extent dext_old, oext_old; | ||
3275 | |||
3276 | + BUG_ON(orig_off != donor_off); | ||
3277 | + | ||
3278 | + /* original and donor extents have to cover the same block offset */ | ||
3279 | + if (orig_off < le32_to_cpu(tmp_oext->ee_block) || | ||
3280 | + le32_to_cpu(tmp_oext->ee_block) + | ||
3281 | + ext4_ext_get_actual_len(tmp_oext) - 1 < orig_off) | ||
3282 | + return -ENODATA; | ||
3283 | + | ||
3284 | + if (orig_off < le32_to_cpu(tmp_dext->ee_block) || | ||
3285 | + le32_to_cpu(tmp_dext->ee_block) + | ||
3286 | + ext4_ext_get_actual_len(tmp_dext) - 1 < orig_off) | ||
3287 | + return -ENODATA; | ||
3288 | + | ||
3289 | dext_old = *tmp_dext; | ||
3290 | oext_old = *tmp_oext; | ||
3291 | |||
3292 | @@ -591,6 +621,8 @@ mext_calc_swap_extents(struct ext4_extent *tmp_dext, | ||
3293 | |||
3294 | copy_extent_status(&oext_old, tmp_dext); | ||
3295 | copy_extent_status(&dext_old, tmp_oext); | ||
3296 | + | ||
3297 | + return 0; | ||
3298 | } | ||
3299 | |||
3300 | /** | ||
3301 | @@ -601,6 +633,7 @@ mext_calc_swap_extents(struct ext4_extent *tmp_dext, | ||
3302 | * @donor_inode: donor inode | ||
3303 | * @from: block offset of orig_inode | ||
3304 | * @count: block count to be replaced | ||
3305 | + * @err: pointer to save return value | ||
3306 | * | ||
3307 | * Replace original inode extents and donor inode extents page by page. | ||
3308 | * We implement this replacement in the following three steps: | ||
3309 | @@ -611,33 +644,33 @@ mext_calc_swap_extents(struct ext4_extent *tmp_dext, | ||
3310 | * 3. Change the block information of donor inode to point at the saved | ||
3311 | * original inode blocks in the dummy extents. | ||
3312 | * | ||
3313 | - * Return 0 on success, or a negative error value on failure. | ||
3314 | + * Return replaced block count. | ||
3315 | */ | ||
3316 | static int | ||
3317 | mext_replace_branches(handle_t *handle, struct inode *orig_inode, | ||
3318 | struct inode *donor_inode, ext4_lblk_t from, | ||
3319 | - ext4_lblk_t count) | ||
3320 | + ext4_lblk_t count, int *err) | ||
3321 | { | ||
3322 | struct ext4_ext_path *orig_path = NULL; | ||
3323 | struct ext4_ext_path *donor_path = NULL; | ||
3324 | struct ext4_extent *oext, *dext; | ||
3325 | struct ext4_extent tmp_dext, tmp_oext; | ||
3326 | ext4_lblk_t orig_off = from, donor_off = from; | ||
3327 | - int err = 0; | ||
3328 | int depth; | ||
3329 | int replaced_count = 0; | ||
3330 | int dext_alen; | ||
3331 | |||
3332 | - mext_double_down_write(orig_inode, donor_inode); | ||
3333 | + /* Protect extent trees against block allocations via delalloc */ | ||
3334 | + double_down_write_data_sem(orig_inode, donor_inode); | ||
3335 | |||
3336 | /* Get the original extent for the block "orig_off" */ | ||
3337 | - get_ext_path(orig_path, orig_inode, orig_off, err); | ||
3338 | - if (orig_path == NULL) | ||
3339 | + *err = get_ext_path(orig_inode, orig_off, &orig_path); | ||
3340 | + if (*err) | ||
3341 | goto out; | ||
3342 | |||
3343 | /* Get the donor extent for the head */ | ||
3344 | - get_ext_path(donor_path, donor_inode, donor_off, err); | ||
3345 | - if (donor_path == NULL) | ||
3346 | + *err = get_ext_path(donor_inode, donor_off, &donor_path); | ||
3347 | + if (*err) | ||
3348 | goto out; | ||
3349 | depth = ext_depth(orig_inode); | ||
3350 | oext = orig_path[depth].p_ext; | ||
3351 | @@ -647,24 +680,39 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode, | ||
3352 | dext = donor_path[depth].p_ext; | ||
3353 | tmp_dext = *dext; | ||
3354 | |||
3355 | - mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off, | ||
3356 | + *err = mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off, | ||
3357 | donor_off, count); | ||
3358 | + if (*err) | ||
3359 | + goto out; | ||
3360 | |||
3361 | /* Loop for the donor extents */ | ||
3362 | while (1) { | ||
3363 | /* The extent for donor must be found. */ | ||
3364 | - BUG_ON(!dext || donor_off != le32_to_cpu(tmp_dext.ee_block)); | ||
3365 | + if (!dext) { | ||
3366 | + ext4_error(donor_inode->i_sb, __func__, | ||
3367 | + "The extent for donor must be found"); | ||
3368 | + *err = -EIO; | ||
3369 | + goto out; | ||
3370 | + } else if (donor_off != le32_to_cpu(tmp_dext.ee_block)) { | ||
3371 | + ext4_error(donor_inode->i_sb, __func__, | ||
3372 | + "Donor offset(%u) and the first block of donor " | ||
3373 | + "extent(%u) should be equal", | ||
3374 | + donor_off, | ||
3375 | + le32_to_cpu(tmp_dext.ee_block)); | ||
3376 | + *err = -EIO; | ||
3377 | + goto out; | ||
3378 | + } | ||
3379 | |||
3380 | /* Set donor extent to orig extent */ | ||
3381 | - err = mext_leaf_block(handle, orig_inode, | ||
3382 | + *err = mext_leaf_block(handle, orig_inode, | ||
3383 | orig_path, &tmp_dext, &orig_off); | ||
3384 | - if (err < 0) | ||
3385 | + if (*err) | ||
3386 | goto out; | ||
3387 | |||
3388 | /* Set orig extent to donor extent */ | ||
3389 | - err = mext_leaf_block(handle, donor_inode, | ||
3390 | + *err = mext_leaf_block(handle, donor_inode, | ||
3391 | donor_path, &tmp_oext, &donor_off); | ||
3392 | - if (err < 0) | ||
3393 | + if (*err) | ||
3394 | goto out; | ||
3395 | |||
3396 | dext_alen = ext4_ext_get_actual_len(&tmp_dext); | ||
3397 | @@ -678,36 +726,26 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode, | ||
3398 | |||
3399 | if (orig_path) | ||
3400 | ext4_ext_drop_refs(orig_path); | ||
3401 | - get_ext_path(orig_path, orig_inode, orig_off, err); | ||
3402 | - if (orig_path == NULL) | ||
3403 | + *err = get_ext_path(orig_inode, orig_off, &orig_path); | ||
3404 | + if (*err) | ||
3405 | goto out; | ||
3406 | depth = ext_depth(orig_inode); | ||
3407 | oext = orig_path[depth].p_ext; | ||
3408 | - if (le32_to_cpu(oext->ee_block) + | ||
3409 | - ext4_ext_get_actual_len(oext) <= orig_off) { | ||
3410 | - err = 0; | ||
3411 | - goto out; | ||
3412 | - } | ||
3413 | tmp_oext = *oext; | ||
3414 | |||
3415 | if (donor_path) | ||
3416 | ext4_ext_drop_refs(donor_path); | ||
3417 | - get_ext_path(donor_path, donor_inode, | ||
3418 | - donor_off, err); | ||
3419 | - if (donor_path == NULL) | ||
3420 | + *err = get_ext_path(donor_inode, donor_off, &donor_path); | ||
3421 | + if (*err) | ||
3422 | goto out; | ||
3423 | depth = ext_depth(donor_inode); | ||
3424 | dext = donor_path[depth].p_ext; | ||
3425 | - if (le32_to_cpu(dext->ee_block) + | ||
3426 | - ext4_ext_get_actual_len(dext) <= donor_off) { | ||
3427 | - err = 0; | ||
3428 | - goto out; | ||
3429 | - } | ||
3430 | tmp_dext = *dext; | ||
3431 | |||
3432 | - mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off, | ||
3433 | - donor_off, | ||
3434 | - count - replaced_count); | ||
3435 | + *err = mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off, | ||
3436 | + donor_off, count - replaced_count); | ||
3437 | + if (*err) | ||
3438 | + goto out; | ||
3439 | } | ||
3440 | |||
3441 | out: | ||
3442 | @@ -720,8 +758,12 @@ out: | ||
3443 | kfree(donor_path); | ||
3444 | } | ||
3445 | |||
3446 | - mext_double_up_write(orig_inode, donor_inode); | ||
3447 | - return err; | ||
3448 | + ext4_ext_invalidate_cache(orig_inode); | ||
3449 | + ext4_ext_invalidate_cache(donor_inode); | ||
3450 | + | ||
3451 | + double_up_write_data_sem(orig_inode, donor_inode); | ||
3452 | + | ||
3453 | + return replaced_count; | ||
3454 | } | ||
3455 | |||
3456 | /** | ||
3457 | @@ -733,16 +775,17 @@ out: | ||
3458 | * @data_offset_in_page: block index where data swapping starts | ||
3459 | * @block_len_in_page: the number of blocks to be swapped | ||
3460 | * @uninit: orig extent is uninitialized or not | ||
3461 | + * @err: pointer to save return value | ||
3462 | * | ||
3463 | * Save the data in original inode blocks and replace original inode extents | ||
3464 | * with donor inode extents by calling mext_replace_branches(). | ||
3465 | - * Finally, write out the saved data in new original inode blocks. Return 0 | ||
3466 | - * on success, or a negative error value on failure. | ||
3467 | + * Finally, write out the saved data in new original inode blocks. Return | ||
3468 | + * replaced block count. | ||
3469 | */ | ||
3470 | static int | ||
3471 | -move_extent_par_page(struct file *o_filp, struct inode *donor_inode, | ||
3472 | +move_extent_per_page(struct file *o_filp, struct inode *donor_inode, | ||
3473 | pgoff_t orig_page_offset, int data_offset_in_page, | ||
3474 | - int block_len_in_page, int uninit) | ||
3475 | + int block_len_in_page, int uninit, int *err) | ||
3476 | { | ||
3477 | struct inode *orig_inode = o_filp->f_dentry->d_inode; | ||
3478 | struct address_space *mapping = orig_inode->i_mapping; | ||
3479 | @@ -754,9 +797,11 @@ move_extent_par_page(struct file *o_filp, struct inode *donor_inode, | ||
3480 | long long offs = orig_page_offset << PAGE_CACHE_SHIFT; | ||
3481 | unsigned long blocksize = orig_inode->i_sb->s_blocksize; | ||
3482 | unsigned int w_flags = 0; | ||
3483 | - unsigned int tmp_data_len, data_len; | ||
3484 | + unsigned int tmp_data_size, data_size, replaced_size; | ||
3485 | void *fsdata; | ||
3486 | - int ret, i, jblocks; | ||
3487 | + int i, jblocks; | ||
3488 | + int err2 = 0; | ||
3489 | + int replaced_count = 0; | ||
3490 | int blocks_per_page = PAGE_CACHE_SIZE >> orig_inode->i_blkbits; | ||
3491 | |||
3492 | /* | ||
3493 | @@ -766,8 +811,8 @@ move_extent_par_page(struct file *o_filp, struct inode *donor_inode, | ||
3494 | jblocks = ext4_writepage_trans_blocks(orig_inode) * 2; | ||
3495 | handle = ext4_journal_start(orig_inode, jblocks); | ||
3496 | if (IS_ERR(handle)) { | ||
3497 | - ret = PTR_ERR(handle); | ||
3498 | - return ret; | ||
3499 | + *err = PTR_ERR(handle); | ||
3500 | + return 0; | ||
3501 | } | ||
3502 | |||
3503 | if (segment_eq(get_fs(), KERNEL_DS)) | ||
3504 | @@ -783,39 +828,36 @@ move_extent_par_page(struct file *o_filp, struct inode *donor_inode, | ||
3505 | * Just swap data blocks between orig and donor. | ||
3506 | */ | ||
3507 | if (uninit) { | ||
3508 | - ret = mext_replace_branches(handle, orig_inode, | ||
3509 | - donor_inode, orig_blk_offset, | ||
3510 | - block_len_in_page); | ||
3511 | - | ||
3512 | - /* Clear the inode cache not to refer to the old data */ | ||
3513 | - ext4_ext_invalidate_cache(orig_inode); | ||
3514 | - ext4_ext_invalidate_cache(donor_inode); | ||
3515 | + replaced_count = mext_replace_branches(handle, orig_inode, | ||
3516 | + donor_inode, orig_blk_offset, | ||
3517 | + block_len_in_page, err); | ||
3518 | goto out2; | ||
3519 | } | ||
3520 | |||
3521 | offs = (long long)orig_blk_offset << orig_inode->i_blkbits; | ||
3522 | |||
3523 | - /* Calculate data_len */ | ||
3524 | + /* Calculate data_size */ | ||
3525 | if ((orig_blk_offset + block_len_in_page - 1) == | ||
3526 | ((orig_inode->i_size - 1) >> orig_inode->i_blkbits)) { | ||
3527 | /* Replace the last block */ | ||
3528 | - tmp_data_len = orig_inode->i_size & (blocksize - 1); | ||
3529 | + tmp_data_size = orig_inode->i_size & (blocksize - 1); | ||
3530 | /* | ||
3531 | - * If data_len equal zero, it shows data_len is multiples of | ||
3532 | + * If data_size equal zero, it shows data_size is multiples of | ||
3533 | * blocksize. So we set appropriate value. | ||
3534 | */ | ||
3535 | - if (tmp_data_len == 0) | ||
3536 | - tmp_data_len = blocksize; | ||
3537 | + if (tmp_data_size == 0) | ||
3538 | + tmp_data_size = blocksize; | ||
3539 | |||
3540 | - data_len = tmp_data_len + | ||
3541 | + data_size = tmp_data_size + | ||
3542 | ((block_len_in_page - 1) << orig_inode->i_blkbits); | ||
3543 | - } else { | ||
3544 | - data_len = block_len_in_page << orig_inode->i_blkbits; | ||
3545 | - } | ||
3546 | + } else | ||
3547 | + data_size = block_len_in_page << orig_inode->i_blkbits; | ||
3548 | + | ||
3549 | + replaced_size = data_size; | ||
3550 | |||
3551 | - ret = a_ops->write_begin(o_filp, mapping, offs, data_len, w_flags, | ||
3552 | + *err = a_ops->write_begin(o_filp, mapping, offs, data_size, w_flags, | ||
3553 | &page, &fsdata); | ||
3554 | - if (unlikely(ret < 0)) | ||
3555 | + if (unlikely(*err < 0)) | ||
3556 | goto out; | ||
3557 | |||
3558 | if (!PageUptodate(page)) { | ||
3559 | @@ -836,14 +878,17 @@ move_extent_par_page(struct file *o_filp, struct inode *donor_inode, | ||
3560 | /* Release old bh and drop refs */ | ||
3561 | try_to_release_page(page, 0); | ||
3562 | |||
3563 | - ret = mext_replace_branches(handle, orig_inode, donor_inode, | ||
3564 | - orig_blk_offset, block_len_in_page); | ||
3565 | - if (ret < 0) | ||
3566 | - goto out; | ||
3567 | - | ||
3568 | - /* Clear the inode cache not to refer to the old data */ | ||
3569 | - ext4_ext_invalidate_cache(orig_inode); | ||
3570 | - ext4_ext_invalidate_cache(donor_inode); | ||
3571 | + replaced_count = mext_replace_branches(handle, orig_inode, donor_inode, | ||
3572 | + orig_blk_offset, block_len_in_page, | ||
3573 | + &err2); | ||
3574 | + if (err2) { | ||
3575 | + if (replaced_count) { | ||
3576 | + block_len_in_page = replaced_count; | ||
3577 | + replaced_size = | ||
3578 | + block_len_in_page << orig_inode->i_blkbits; | ||
3579 | + } else | ||
3580 | + goto out; | ||
3581 | + } | ||
3582 | |||
3583 | if (!page_has_buffers(page)) | ||
3584 | create_empty_buffers(page, 1 << orig_inode->i_blkbits, 0); | ||
3585 | @@ -853,16 +898,16 @@ move_extent_par_page(struct file *o_filp, struct inode *donor_inode, | ||
3586 | bh = bh->b_this_page; | ||
3587 | |||
3588 | for (i = 0; i < block_len_in_page; i++) { | ||
3589 | - ret = ext4_get_block(orig_inode, | ||
3590 | + *err = ext4_get_block(orig_inode, | ||
3591 | (sector_t)(orig_blk_offset + i), bh, 0); | ||
3592 | - if (ret < 0) | ||
3593 | + if (*err < 0) | ||
3594 | goto out; | ||
3595 | |||
3596 | if (bh->b_this_page != NULL) | ||
3597 | bh = bh->b_this_page; | ||
3598 | } | ||
3599 | |||
3600 | - ret = a_ops->write_end(o_filp, mapping, offs, data_len, data_len, | ||
3601 | + *err = a_ops->write_end(o_filp, mapping, offs, data_size, replaced_size, | ||
3602 | page, fsdata); | ||
3603 | page = NULL; | ||
3604 | |||
3605 | @@ -871,11 +916,15 @@ out: | ||
3606 | if (PageLocked(page)) | ||
3607 | unlock_page(page); | ||
3608 | page_cache_release(page); | ||
3609 | + ext4_journal_stop(handle); | ||
3610 | } | ||
3611 | out2: | ||
3612 | ext4_journal_stop(handle); | ||
3613 | |||
3614 | - return ret < 0 ? ret : 0; | ||
3615 | + if (err2) | ||
3616 | + *err = err2; | ||
3617 | + | ||
3618 | + return replaced_count; | ||
3619 | } | ||
3620 | |||
3621 | /** | ||
3622 | @@ -886,7 +935,6 @@ out2: | ||
3623 | * @orig_start: logical start offset in block for orig | ||
3624 | * @donor_start: logical start offset in block for donor | ||
3625 | * @len: the number of blocks to be moved | ||
3626 | - * @moved_len: moved block length | ||
3627 | * | ||
3628 | * Check the arguments of ext4_move_extents() whether the files can be | ||
3629 | * exchanged with each other. | ||
3630 | @@ -894,9 +942,13 @@ out2: | ||
3631 | */ | ||
3632 | static int | ||
3633 | mext_check_arguments(struct inode *orig_inode, | ||
3634 | - struct inode *donor_inode, __u64 orig_start, | ||
3635 | - __u64 donor_start, __u64 *len, __u64 moved_len) | ||
3636 | + struct inode *donor_inode, __u64 orig_start, | ||
3637 | + __u64 donor_start, __u64 *len) | ||
3638 | { | ||
3639 | + ext4_lblk_t orig_blocks, donor_blocks; | ||
3640 | + unsigned int blkbits = orig_inode->i_blkbits; | ||
3641 | + unsigned int blocksize = 1 << blkbits; | ||
3642 | + | ||
3643 | /* Regular file check */ | ||
3644 | if (!S_ISREG(orig_inode->i_mode) || !S_ISREG(donor_inode->i_mode)) { | ||
3645 | ext4_debug("ext4 move extent: The argument files should be " | ||
3646 | @@ -905,6 +957,13 @@ mext_check_arguments(struct inode *orig_inode, | ||
3647 | return -EINVAL; | ||
3648 | } | ||
3649 | |||
3650 | + if (donor_inode->i_mode & (S_ISUID|S_ISGID)) { | ||
3651 | + ext4_debug("ext4 move extent: suid or sgid is set" | ||
3652 | + " to donor file [ino:orig %lu, donor %lu]\n", | ||
3653 | + orig_inode->i_ino, donor_inode->i_ino); | ||
3654 | + return -EINVAL; | ||
3655 | + } | ||
3656 | + | ||
3657 | /* Ext4 move extent does not support swapfile */ | ||
3658 | if (IS_SWAPFILE(orig_inode) || IS_SWAPFILE(donor_inode)) { | ||
3659 | ext4_debug("ext4 move extent: The argument files should " | ||
3660 | @@ -921,14 +980,6 @@ mext_check_arguments(struct inode *orig_inode, | ||
3661 | return -EINVAL; | ||
3662 | } | ||
3663 | |||
3664 | - /* orig and donor should be different file */ | ||
3665 | - if (orig_inode->i_ino == donor_inode->i_ino) { | ||
3666 | - ext4_debug("ext4 move extent: The argument files should not " | ||
3667 | - "be same file [ino:orig %lu, donor %lu]\n", | ||
3668 | - orig_inode->i_ino, donor_inode->i_ino); | ||
3669 | - return -EINVAL; | ||
3670 | - } | ||
3671 | - | ||
3672 | /* Ext4 move extent supports only extent based file */ | ||
3673 | if (!(EXT4_I(orig_inode)->i_flags & EXT4_EXTENTS_FL)) { | ||
3674 | ext4_debug("ext4 move extent: orig file is not extents " | ||
3675 | @@ -953,13 +1004,6 @@ mext_check_arguments(struct inode *orig_inode, | ||
3676 | return -EINVAL; | ||
3677 | } | ||
3678 | |||
3679 | - if (moved_len) { | ||
3680 | - ext4_debug("ext4 move extent: moved_len should be 0 " | ||
3681 | - "[ino:orig %lu, donor %lu]\n", orig_inode->i_ino, | ||
3682 | - donor_inode->i_ino); | ||
3683 | - return -EINVAL; | ||
3684 | - } | ||
3685 | - | ||
3686 | if ((orig_start > MAX_DEFRAG_SIZE) || | ||
3687 | (donor_start > MAX_DEFRAG_SIZE) || | ||
3688 | (*len > MAX_DEFRAG_SIZE) || | ||
3689 | @@ -971,43 +1015,47 @@ mext_check_arguments(struct inode *orig_inode, | ||
3690 | } | ||
3691 | |||
3692 | if (orig_inode->i_size > donor_inode->i_size) { | ||
3693 | - if (orig_start >= donor_inode->i_size) { | ||
3694 | + donor_blocks = (donor_inode->i_size + blocksize - 1) >> blkbits; | ||
3695 | + /* TODO: eliminate this artificial restriction */ | ||
3696 | + if (orig_start >= donor_blocks) { | ||
3697 | ext4_debug("ext4 move extent: orig start offset " | ||
3698 | - "[%llu] should be less than donor file size " | ||
3699 | - "[%lld] [ino:orig %lu, donor_inode %lu]\n", | ||
3700 | - orig_start, donor_inode->i_size, | ||
3701 | + "[%llu] should be less than donor file blocks " | ||
3702 | + "[%u] [ino:orig %lu, donor %lu]\n", | ||
3703 | + orig_start, donor_blocks, | ||
3704 | orig_inode->i_ino, donor_inode->i_ino); | ||
3705 | return -EINVAL; | ||
3706 | } | ||
3707 | |||
3708 | - if (orig_start + *len > donor_inode->i_size) { | ||
3709 | + /* TODO: eliminate this artificial restriction */ | ||
3710 | + if (orig_start + *len > donor_blocks) { | ||
3711 | ext4_debug("ext4 move extent: End offset [%llu] should " | ||
3712 | - "be less than donor file size [%lld]." | ||
3713 | - "So adjust length from %llu to %lld " | ||
3714 | + "be less than donor file blocks [%u]." | ||
3715 | + "So adjust length from %llu to %llu " | ||
3716 | "[ino:orig %lu, donor %lu]\n", | ||
3717 | - orig_start + *len, donor_inode->i_size, | ||
3718 | - *len, donor_inode->i_size - orig_start, | ||
3719 | + orig_start + *len, donor_blocks, | ||
3720 | + *len, donor_blocks - orig_start, | ||
3721 | orig_inode->i_ino, donor_inode->i_ino); | ||
3722 | - *len = donor_inode->i_size - orig_start; | ||
3723 | + *len = donor_blocks - orig_start; | ||
3724 | } | ||
3725 | } else { | ||
3726 | - if (orig_start >= orig_inode->i_size) { | ||
3727 | + orig_blocks = (orig_inode->i_size + blocksize - 1) >> blkbits; | ||
3728 | + if (orig_start >= orig_blocks) { | ||
3729 | ext4_debug("ext4 move extent: start offset [%llu] " | ||
3730 | - "should be less than original file size " | ||
3731 | - "[%lld] [inode:orig %lu, donor %lu]\n", | ||
3732 | - orig_start, orig_inode->i_size, | ||
3733 | + "should be less than original file blocks " | ||
3734 | + "[%u] [ino:orig %lu, donor %lu]\n", | ||
3735 | + orig_start, orig_blocks, | ||
3736 | orig_inode->i_ino, donor_inode->i_ino); | ||
3737 | return -EINVAL; | ||
3738 | } | ||
3739 | |||
3740 | - if (orig_start + *len > orig_inode->i_size) { | ||
3741 | + if (orig_start + *len > orig_blocks) { | ||
3742 | ext4_debug("ext4 move extent: Adjust length " | ||
3743 | - "from %llu to %lld. Because it should be " | ||
3744 | - "less than original file size " | ||
3745 | + "from %llu to %llu. Because it should be " | ||
3746 | + "less than original file blocks " | ||
3747 | "[ino:orig %lu, donor %lu]\n", | ||
3748 | - *len, orig_inode->i_size - orig_start, | ||
3749 | + *len, orig_blocks - orig_start, | ||
3750 | orig_inode->i_ino, donor_inode->i_ino); | ||
3751 | - *len = orig_inode->i_size - orig_start; | ||
3752 | + *len = orig_blocks - orig_start; | ||
3753 | } | ||
3754 | } | ||
3755 | |||
3756 | @@ -1027,18 +1075,23 @@ mext_check_arguments(struct inode *orig_inode, | ||
3757 | * @inode1: the inode structure | ||
3758 | * @inode2: the inode structure | ||
3759 | * | ||
3760 | - * Lock two inodes' i_mutex by i_ino order. This function is moved from | ||
3761 | - * fs/inode.c. | ||
3762 | + * Lock two inodes' i_mutex by i_ino order. | ||
3763 | + * If inode1 or inode2 is NULL, return -EIO. Otherwise, return 0. | ||
3764 | */ | ||
3765 | -static void | ||
3766 | +static int | ||
3767 | mext_inode_double_lock(struct inode *inode1, struct inode *inode2) | ||
3768 | { | ||
3769 | - if (inode1 == NULL || inode2 == NULL || inode1 == inode2) { | ||
3770 | - if (inode1) | ||
3771 | - mutex_lock(&inode1->i_mutex); | ||
3772 | - else if (inode2) | ||
3773 | - mutex_lock(&inode2->i_mutex); | ||
3774 | - return; | ||
3775 | + int ret = 0; | ||
3776 | + | ||
3777 | + BUG_ON(inode1 == NULL && inode2 == NULL); | ||
3778 | + | ||
3779 | + ret = mext_check_null_inode(inode1, inode2, __func__); | ||
3780 | + if (ret < 0) | ||
3781 | + goto out; | ||
3782 | + | ||
3783 | + if (inode1 == inode2) { | ||
3784 | + mutex_lock(&inode1->i_mutex); | ||
3785 | + goto out; | ||
3786 | } | ||
3787 | |||
3788 | if (inode1->i_ino < inode2->i_ino) { | ||
3789 | @@ -1048,6 +1101,9 @@ mext_inode_double_lock(struct inode *inode1, struct inode *inode2) | ||
3790 | mutex_lock_nested(&inode2->i_mutex, I_MUTEX_PARENT); | ||
3791 | mutex_lock_nested(&inode1->i_mutex, I_MUTEX_CHILD); | ||
3792 | } | ||
3793 | + | ||
3794 | +out: | ||
3795 | + return ret; | ||
3796 | } | ||
3797 | |||
3798 | /** | ||
3799 | @@ -1056,17 +1112,28 @@ mext_inode_double_lock(struct inode *inode1, struct inode *inode2) | ||
3800 | * @inode1: the inode that is released first | ||
3801 | * @inode2: the inode that is released second | ||
3802 | * | ||
3803 | - * This function is moved from fs/inode.c. | ||
3804 | + * If inode1 or inode2 is NULL, return -EIO. Otherwise, return 0. | ||
3805 | */ | ||
3806 | |||
3807 | -static void | ||
3808 | +static int | ||
3809 | mext_inode_double_unlock(struct inode *inode1, struct inode *inode2) | ||
3810 | { | ||
3811 | + int ret = 0; | ||
3812 | + | ||
3813 | + BUG_ON(inode1 == NULL && inode2 == NULL); | ||
3814 | + | ||
3815 | + ret = mext_check_null_inode(inode1, inode2, __func__); | ||
3816 | + if (ret < 0) | ||
3817 | + goto out; | ||
3818 | + | ||
3819 | if (inode1) | ||
3820 | mutex_unlock(&inode1->i_mutex); | ||
3821 | |||
3822 | if (inode2 && inode2 != inode1) | ||
3823 | mutex_unlock(&inode2->i_mutex); | ||
3824 | + | ||
3825 | +out: | ||
3826 | + return ret; | ||
3827 | } | ||
3828 | |||
3829 | /** | ||
3830 | @@ -1123,70 +1190,84 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, | ||
3831 | ext4_lblk_t block_end, seq_start, add_blocks, file_end, seq_blocks = 0; | ||
3832 | ext4_lblk_t rest_blocks; | ||
3833 | pgoff_t orig_page_offset = 0, seq_end_page; | ||
3834 | - int ret, depth, last_extent = 0; | ||
3835 | + int ret1, ret2, depth, last_extent = 0; | ||
3836 | int blocks_per_page = PAGE_CACHE_SIZE >> orig_inode->i_blkbits; | ||
3837 | int data_offset_in_page; | ||
3838 | int block_len_in_page; | ||
3839 | int uninit; | ||
3840 | |||
3841 | - /* protect orig and donor against a truncate */ | ||
3842 | - mext_inode_double_lock(orig_inode, donor_inode); | ||
3843 | + /* orig and donor should be different file */ | ||
3844 | + if (orig_inode->i_ino == donor_inode->i_ino) { | ||
3845 | + ext4_debug("ext4 move extent: The argument files should not " | ||
3846 | + "be same file [ino:orig %lu, donor %lu]\n", | ||
3847 | + orig_inode->i_ino, donor_inode->i_ino); | ||
3848 | + return -EINVAL; | ||
3849 | + } | ||
3850 | + | ||
3851 | + /* Protect orig and donor inodes against a truncate */ | ||
3852 | + ret1 = mext_inode_double_lock(orig_inode, donor_inode); | ||
3853 | + if (ret1 < 0) | ||
3854 | + return ret1; | ||
3855 | |||
3856 | - mext_double_down_read(orig_inode, donor_inode); | ||
3857 | + /* Protect extent tree against block allocations via delalloc */ | ||
3858 | + double_down_write_data_sem(orig_inode, donor_inode); | ||
3859 | /* Check the filesystem environment whether move_extent can be done */ | ||
3860 | - ret = mext_check_arguments(orig_inode, donor_inode, orig_start, | ||
3861 | - donor_start, &len, *moved_len); | ||
3862 | - mext_double_up_read(orig_inode, donor_inode); | ||
3863 | - if (ret) | ||
3864 | - goto out2; | ||
3865 | + ret1 = mext_check_arguments(orig_inode, donor_inode, orig_start, | ||
3866 | + donor_start, &len); | ||
3867 | + if (ret1) | ||
3868 | + goto out; | ||
3869 | |||
3870 | file_end = (i_size_read(orig_inode) - 1) >> orig_inode->i_blkbits; | ||
3871 | block_end = block_start + len - 1; | ||
3872 | if (file_end < block_end) | ||
3873 | len -= block_end - file_end; | ||
3874 | |||
3875 | - get_ext_path(orig_path, orig_inode, block_start, ret); | ||
3876 | - if (orig_path == NULL) | ||
3877 | - goto out2; | ||
3878 | + ret1 = get_ext_path(orig_inode, block_start, &orig_path); | ||
3879 | + if (ret1) | ||
3880 | + goto out; | ||
3881 | |||
3882 | /* Get path structure to check the hole */ | ||
3883 | - get_ext_path(holecheck_path, orig_inode, block_start, ret); | ||
3884 | - if (holecheck_path == NULL) | ||
3885 | + ret1 = get_ext_path(orig_inode, block_start, &holecheck_path); | ||
3886 | + if (ret1) | ||
3887 | goto out; | ||
3888 | |||
3889 | depth = ext_depth(orig_inode); | ||
3890 | ext_cur = holecheck_path[depth].p_ext; | ||
3891 | - if (ext_cur == NULL) { | ||
3892 | - ret = -EINVAL; | ||
3893 | - goto out; | ||
3894 | - } | ||
3895 | |||
3896 | /* | ||
3897 | - * Get proper extent whose ee_block is beyond block_start | ||
3898 | - * if block_start was within the hole. | ||
3899 | + * Get proper starting location of block replacement if block_start was | ||
3900 | + * within the hole. | ||
3901 | */ | ||
3902 | if (le32_to_cpu(ext_cur->ee_block) + | ||
3903 | ext4_ext_get_actual_len(ext_cur) - 1 < block_start) { | ||
3904 | + /* | ||
3905 | + * The hole exists between extents or the tail of | ||
3906 | + * original file. | ||
3907 | + */ | ||
3908 | last_extent = mext_next_extent(orig_inode, | ||
3909 | holecheck_path, &ext_cur); | ||
3910 | if (last_extent < 0) { | ||
3911 | - ret = last_extent; | ||
3912 | + ret1 = last_extent; | ||
3913 | goto out; | ||
3914 | } | ||
3915 | last_extent = mext_next_extent(orig_inode, orig_path, | ||
3916 | &ext_dummy); | ||
3917 | if (last_extent < 0) { | ||
3918 | - ret = last_extent; | ||
3919 | + ret1 = last_extent; | ||
3920 | goto out; | ||
3921 | } | ||
3922 | - } | ||
3923 | - seq_start = block_start; | ||
3924 | + seq_start = le32_to_cpu(ext_cur->ee_block); | ||
3925 | + } else if (le32_to_cpu(ext_cur->ee_block) > block_start) | ||
3926 | + /* The hole exists at the beginning of original file. */ | ||
3927 | + seq_start = le32_to_cpu(ext_cur->ee_block); | ||
3928 | + else | ||
3929 | + seq_start = block_start; | ||
3930 | |||
3931 | /* No blocks within the specified range. */ | ||
3932 | if (le32_to_cpu(ext_cur->ee_block) > block_end) { | ||
3933 | ext4_debug("ext4 move extent: The specified range of file " | ||
3934 | "may be the hole\n"); | ||
3935 | - ret = -EINVAL; | ||
3936 | + ret1 = -EINVAL; | ||
3937 | goto out; | ||
3938 | } | ||
3939 | |||
3940 | @@ -1206,7 +1287,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, | ||
3941 | last_extent = mext_next_extent(orig_inode, holecheck_path, | ||
3942 | &ext_cur); | ||
3943 | if (last_extent < 0) { | ||
3944 | - ret = last_extent; | ||
3945 | + ret1 = last_extent; | ||
3946 | break; | ||
3947 | } | ||
3948 | add_blocks = ext4_ext_get_actual_len(ext_cur); | ||
3949 | @@ -1246,29 +1327,39 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, | ||
3950 | seq_start = le32_to_cpu(ext_cur->ee_block); | ||
3951 | rest_blocks = seq_blocks; | ||
3952 | |||
3953 | - /* Discard preallocations of two inodes */ | ||
3954 | - down_write(&EXT4_I(orig_inode)->i_data_sem); | ||
3955 | - ext4_discard_preallocations(orig_inode); | ||
3956 | - up_write(&EXT4_I(orig_inode)->i_data_sem); | ||
3957 | - | ||
3958 | - down_write(&EXT4_I(donor_inode)->i_data_sem); | ||
3959 | - ext4_discard_preallocations(donor_inode); | ||
3960 | - up_write(&EXT4_I(donor_inode)->i_data_sem); | ||
3961 | + /* | ||
3962 | + * Up semaphore to avoid following problems: | ||
3963 | + * a. transaction deadlock among ext4_journal_start, | ||
3964 | + * ->write_begin via pagefault, and jbd2_journal_commit | ||
3965 | + * b. racing with ->readpage, ->write_begin, and ext4_get_block | ||
3966 | + * in move_extent_per_page | ||
3967 | + */ | ||
3968 | + double_up_write_data_sem(orig_inode, donor_inode); | ||
3969 | |||
3970 | while (orig_page_offset <= seq_end_page) { | ||
3971 | |||
3972 | /* Swap original branches with new branches */ | ||
3973 | - ret = move_extent_par_page(o_filp, donor_inode, | ||
3974 | + block_len_in_page = move_extent_per_page( | ||
3975 | + o_filp, donor_inode, | ||
3976 | orig_page_offset, | ||
3977 | data_offset_in_page, | ||
3978 | - block_len_in_page, uninit); | ||
3979 | - if (ret < 0) | ||
3980 | - goto out; | ||
3981 | - orig_page_offset++; | ||
3982 | + block_len_in_page, uninit, | ||
3983 | + &ret1); | ||
3984 | + | ||
3985 | /* Count how many blocks we have exchanged */ | ||
3986 | *moved_len += block_len_in_page; | ||
3987 | - BUG_ON(*moved_len > len); | ||
3988 | + if (ret1 < 0) | ||
3989 | + break; | ||
3990 | + if (*moved_len > len) { | ||
3991 | + ext4_error(orig_inode->i_sb, __func__, | ||
3992 | + "We replaced blocks too much! " | ||
3993 | + "sum of replaced: %llu requested: %llu", | ||
3994 | + *moved_len, len); | ||
3995 | + ret1 = -EIO; | ||
3996 | + break; | ||
3997 | + } | ||
3998 | |||
3999 | + orig_page_offset++; | ||
4000 | data_offset_in_page = 0; | ||
4001 | rest_blocks -= block_len_in_page; | ||
4002 | if (rest_blocks > blocks_per_page) | ||
4003 | @@ -1277,20 +1368,23 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, | ||
4004 | block_len_in_page = rest_blocks; | ||
4005 | } | ||
4006 | |||
4007 | + double_down_write_data_sem(orig_inode, donor_inode); | ||
4008 | + if (ret1 < 0) | ||
4009 | + break; | ||
4010 | + | ||
4011 | /* Decrease buffer counter */ | ||
4012 | if (holecheck_path) | ||
4013 | ext4_ext_drop_refs(holecheck_path); | ||
4014 | - get_ext_path(holecheck_path, orig_inode, | ||
4015 | - seq_start, ret); | ||
4016 | - if (holecheck_path == NULL) | ||
4017 | + ret1 = get_ext_path(orig_inode, seq_start, &holecheck_path); | ||
4018 | + if (ret1) | ||
4019 | break; | ||
4020 | depth = holecheck_path->p_depth; | ||
4021 | |||
4022 | /* Decrease buffer counter */ | ||
4023 | if (orig_path) | ||
4024 | ext4_ext_drop_refs(orig_path); | ||
4025 | - get_ext_path(orig_path, orig_inode, seq_start, ret); | ||
4026 | - if (orig_path == NULL) | ||
4027 | + ret1 = get_ext_path(orig_inode, seq_start, &orig_path); | ||
4028 | + if (ret1) | ||
4029 | break; | ||
4030 | |||
4031 | ext_cur = holecheck_path[depth].p_ext; | ||
4032 | @@ -1299,6 +1393,11 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, | ||
4033 | |||
4034 | } | ||
4035 | out: | ||
4036 | + if (*moved_len) { | ||
4037 | + ext4_discard_preallocations(orig_inode); | ||
4038 | + ext4_discard_preallocations(donor_inode); | ||
4039 | + } | ||
4040 | + | ||
4041 | if (orig_path) { | ||
4042 | ext4_ext_drop_refs(orig_path); | ||
4043 | kfree(orig_path); | ||
4044 | @@ -1307,14 +1406,13 @@ out: | ||
4045 | ext4_ext_drop_refs(holecheck_path); | ||
4046 | kfree(holecheck_path); | ||
4047 | } | ||
4048 | -out2: | ||
4049 | - mext_inode_double_unlock(orig_inode, donor_inode); | ||
4050 | - | ||
4051 | - if (ret) | ||
4052 | - return ret; | ||
4053 | + double_up_write_data_sem(orig_inode, donor_inode); | ||
4054 | + ret2 = mext_inode_double_unlock(orig_inode, donor_inode); | ||
4055 | |||
4056 | - /* All of the specified blocks must be exchanged in succeed */ | ||
4057 | - BUG_ON(*moved_len != len); | ||
4058 | + if (ret1) | ||
4059 | + return ret1; | ||
4060 | + else if (ret2) | ||
4061 | + return ret2; | ||
4062 | |||
4063 | return 0; | ||
4064 | } | ||
4065 | diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c | ||
4066 | index de04013..9dcd686 100644 | ||
4067 | --- a/fs/ext4/namei.c | ||
4068 | +++ b/fs/ext4/namei.c | ||
4069 | @@ -1292,9 +1292,6 @@ errout: | ||
4070 | * add_dirent_to_buf will attempt search the directory block for | ||
4071 | * space. It will return -ENOSPC if no space is available, and -EIO | ||
4072 | * and -EEXIST if directory entry already exists. | ||
4073 | - * | ||
4074 | - * NOTE! bh is NOT released in the case where ENOSPC is returned. In | ||
4075 | - * all other cases bh is released. | ||
4076 | */ | ||
4077 | static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry, | ||
4078 | struct inode *inode, struct ext4_dir_entry_2 *de, | ||
4079 | @@ -1315,14 +1312,10 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry, | ||
4080 | top = bh->b_data + blocksize - reclen; | ||
4081 | while ((char *) de <= top) { | ||
4082 | if (!ext4_check_dir_entry("ext4_add_entry", dir, de, | ||
4083 | - bh, offset)) { | ||
4084 | - brelse(bh); | ||
4085 | + bh, offset)) | ||
4086 | return -EIO; | ||
4087 | - } | ||
4088 | - if (ext4_match(namelen, name, de)) { | ||
4089 | - brelse(bh); | ||
4090 | + if (ext4_match(namelen, name, de)) | ||
4091 | return -EEXIST; | ||
4092 | - } | ||
4093 | nlen = EXT4_DIR_REC_LEN(de->name_len); | ||
4094 | rlen = ext4_rec_len_from_disk(de->rec_len, blocksize); | ||
4095 | if ((de->inode? rlen - nlen: rlen) >= reclen) | ||
4096 | @@ -1337,7 +1330,6 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry, | ||
4097 | err = ext4_journal_get_write_access(handle, bh); | ||
4098 | if (err) { | ||
4099 | ext4_std_error(dir->i_sb, err); | ||
4100 | - brelse(bh); | ||
4101 | return err; | ||
4102 | } | ||
4103 | |||
4104 | @@ -1377,7 +1369,6 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry, | ||
4105 | err = ext4_handle_dirty_metadata(handle, dir, bh); | ||
4106 | if (err) | ||
4107 | ext4_std_error(dir->i_sb, err); | ||
4108 | - brelse(bh); | ||
4109 | return 0; | ||
4110 | } | ||
4111 | |||
4112 | @@ -1471,7 +1462,9 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry, | ||
4113 | if (!(de)) | ||
4114 | return retval; | ||
4115 | |||
4116 | - return add_dirent_to_buf(handle, dentry, inode, de, bh); | ||
4117 | + retval = add_dirent_to_buf(handle, dentry, inode, de, bh); | ||
4118 | + brelse(bh); | ||
4119 | + return retval; | ||
4120 | } | ||
4121 | |||
4122 | /* | ||
4123 | @@ -1514,8 +1507,10 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, | ||
4124 | if(!bh) | ||
4125 | return retval; | ||
4126 | retval = add_dirent_to_buf(handle, dentry, inode, NULL, bh); | ||
4127 | - if (retval != -ENOSPC) | ||
4128 | + if (retval != -ENOSPC) { | ||
4129 | + brelse(bh); | ||
4130 | return retval; | ||
4131 | + } | ||
4132 | |||
4133 | if (blocks == 1 && !dx_fallback && | ||
4134 | EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_DIR_INDEX)) | ||
4135 | @@ -1528,7 +1523,9 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, | ||
4136 | de = (struct ext4_dir_entry_2 *) bh->b_data; | ||
4137 | de->inode = 0; | ||
4138 | de->rec_len = ext4_rec_len_to_disk(blocksize, blocksize); | ||
4139 | - return add_dirent_to_buf(handle, dentry, inode, de, bh); | ||
4140 | + retval = add_dirent_to_buf(handle, dentry, inode, de, bh); | ||
4141 | + brelse(bh); | ||
4142 | + return retval; | ||
4143 | } | ||
4144 | |||
4145 | /* | ||
4146 | @@ -1561,10 +1558,8 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, | ||
4147 | goto journal_error; | ||
4148 | |||
4149 | err = add_dirent_to_buf(handle, dentry, inode, NULL, bh); | ||
4150 | - if (err != -ENOSPC) { | ||
4151 | - bh = NULL; | ||
4152 | + if (err != -ENOSPC) | ||
4153 | goto cleanup; | ||
4154 | - } | ||
4155 | |||
4156 | /* Block full, should compress but for now just split */ | ||
4157 | dxtrace(printk(KERN_DEBUG "using %u of %u node entries\n", | ||
4158 | @@ -1590,9 +1585,9 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, | ||
4159 | goto cleanup; | ||
4160 | node2 = (struct dx_node *)(bh2->b_data); | ||
4161 | entries2 = node2->entries; | ||
4162 | + memset(&node2->fake, 0, sizeof(struct fake_dirent)); | ||
4163 | node2->fake.rec_len = ext4_rec_len_to_disk(sb->s_blocksize, | ||
4164 | sb->s_blocksize); | ||
4165 | - node2->fake.inode = 0; | ||
4166 | BUFFER_TRACE(frame->bh, "get_write_access"); | ||
4167 | err = ext4_journal_get_write_access(handle, frame->bh); | ||
4168 | if (err) | ||
4169 | @@ -1657,7 +1652,6 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, | ||
4170 | if (!de) | ||
4171 | goto cleanup; | ||
4172 | err = add_dirent_to_buf(handle, dentry, inode, de, bh); | ||
4173 | - bh = NULL; | ||
4174 | goto cleanup; | ||
4175 | |||
4176 | journal_error: | ||
4177 | @@ -1775,7 +1769,7 @@ static int ext4_create(struct inode *dir, struct dentry *dentry, int mode, | ||
4178 | retry: | ||
4179 | handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + | ||
4180 | EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 + | ||
4181 | - 2*EXT4_QUOTA_INIT_BLOCKS(dir->i_sb)); | ||
4182 | + EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb)); | ||
4183 | if (IS_ERR(handle)) | ||
4184 | return PTR_ERR(handle); | ||
4185 | |||
4186 | @@ -1809,7 +1803,7 @@ static int ext4_mknod(struct inode *dir, struct dentry *dentry, | ||
4187 | retry: | ||
4188 | handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + | ||
4189 | EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 + | ||
4190 | - 2*EXT4_QUOTA_INIT_BLOCKS(dir->i_sb)); | ||
4191 | + EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb)); | ||
4192 | if (IS_ERR(handle)) | ||
4193 | return PTR_ERR(handle); | ||
4194 | |||
4195 | @@ -1846,7 +1840,7 @@ static int ext4_mkdir(struct inode *dir, struct dentry *dentry, int mode) | ||
4196 | retry: | ||
4197 | handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + | ||
4198 | EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 + | ||
4199 | - 2*EXT4_QUOTA_INIT_BLOCKS(dir->i_sb)); | ||
4200 | + EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb)); | ||
4201 | if (IS_ERR(handle)) | ||
4202 | return PTR_ERR(handle); | ||
4203 | |||
4204 | @@ -2068,7 +2062,8 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode) | ||
4205 | struct ext4_iloc iloc; | ||
4206 | int err = 0; | ||
4207 | |||
4208 | - if (!ext4_handle_valid(handle)) | ||
4209 | + /* ext4_handle_valid() assumes a valid handle_t pointer */ | ||
4210 | + if (handle && !ext4_handle_valid(handle)) | ||
4211 | return 0; | ||
4212 | |||
4213 | mutex_lock(&EXT4_SB(inode->i_sb)->s_orphan_lock); | ||
4214 | @@ -2258,7 +2253,7 @@ static int ext4_symlink(struct inode *dir, | ||
4215 | retry: | ||
4216 | handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + | ||
4217 | EXT4_INDEX_EXTRA_TRANS_BLOCKS + 5 + | ||
4218 | - 2*EXT4_QUOTA_INIT_BLOCKS(dir->i_sb)); | ||
4219 | + EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb)); | ||
4220 | if (IS_ERR(handle)) | ||
4221 | return PTR_ERR(handle); | ||
4222 | |||
4223 | @@ -2310,7 +2305,7 @@ static int ext4_link(struct dentry *old_dentry, | ||
4224 | struct inode *inode = old_dentry->d_inode; | ||
4225 | int err, retries = 0; | ||
4226 | |||
4227 | - if (EXT4_DIR_LINK_MAX(inode)) | ||
4228 | + if (inode->i_nlink >= EXT4_LINK_MAX) | ||
4229 | return -EMLINK; | ||
4230 | |||
4231 | /* | ||
4232 | @@ -2413,7 +2408,7 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, | ||
4233 | goto end_rename; | ||
4234 | retval = -EMLINK; | ||
4235 | if (!new_inode && new_dir != old_dir && | ||
4236 | - new_dir->i_nlink >= EXT4_LINK_MAX) | ||
4237 | + EXT4_DIR_LINK_MAX(new_dir)) | ||
4238 | goto end_rename; | ||
4239 | } | ||
4240 | if (!new_bh) { | ||
4241 | diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c | ||
4242 | index 68b0351..96302cd 100644 | ||
4243 | --- a/fs/ext4/resize.c | ||
4244 | +++ b/fs/ext4/resize.c | ||
4245 | @@ -247,7 +247,7 @@ static int setup_new_group_blocks(struct super_block *sb, | ||
4246 | goto exit_bh; | ||
4247 | |||
4248 | if (IS_ERR(gdb = bclean(handle, sb, block))) { | ||
4249 | - err = PTR_ERR(bh); | ||
4250 | + err = PTR_ERR(gdb); | ||
4251 | goto exit_bh; | ||
4252 | } | ||
4253 | ext4_handle_dirty_metadata(handle, NULL, gdb); | ||
4254 | diff --git a/fs/ext4/super.c b/fs/ext4/super.c | ||
4255 | index 8f4f079..ed38f25 100644 | ||
4256 | --- a/fs/ext4/super.c | ||
4257 | +++ b/fs/ext4/super.c | ||
4258 | @@ -45,6 +45,7 @@ | ||
4259 | #include "ext4_jbd2.h" | ||
4260 | #include "xattr.h" | ||
4261 | #include "acl.h" | ||
4262 | +#include "mballoc.h" | ||
4263 | |||
4264 | #define CREATE_TRACE_POINTS | ||
4265 | #include <trace/events/ext4.h> | ||
4266 | @@ -188,6 +189,36 @@ void ext4_itable_unused_set(struct super_block *sb, | ||
4267 | bg->bg_itable_unused_hi = cpu_to_le16(count >> 16); | ||
4268 | } | ||
4269 | |||
4270 | + | ||
4271 | +/* Just increment the non-pointer handle value */ | ||
4272 | +static handle_t *ext4_get_nojournal(void) | ||
4273 | +{ | ||
4274 | + handle_t *handle = current->journal_info; | ||
4275 | + unsigned long ref_cnt = (unsigned long)handle; | ||
4276 | + | ||
4277 | + BUG_ON(ref_cnt >= EXT4_NOJOURNAL_MAX_REF_COUNT); | ||
4278 | + | ||
4279 | + ref_cnt++; | ||
4280 | + handle = (handle_t *)ref_cnt; | ||
4281 | + | ||
4282 | + current->journal_info = handle; | ||
4283 | + return handle; | ||
4284 | +} | ||
4285 | + | ||
4286 | + | ||
4287 | +/* Decrement the non-pointer handle value */ | ||
4288 | +static void ext4_put_nojournal(handle_t *handle) | ||
4289 | +{ | ||
4290 | + unsigned long ref_cnt = (unsigned long)handle; | ||
4291 | + | ||
4292 | + BUG_ON(ref_cnt == 0); | ||
4293 | + | ||
4294 | + ref_cnt--; | ||
4295 | + handle = (handle_t *)ref_cnt; | ||
4296 | + | ||
4297 | + current->journal_info = handle; | ||
4298 | +} | ||
4299 | + | ||
4300 | /* | ||
4301 | * Wrappers for jbd2_journal_start/end. | ||
4302 | * | ||
4303 | @@ -214,11 +245,7 @@ handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks) | ||
4304 | } | ||
4305 | return jbd2_journal_start(journal, nblocks); | ||
4306 | } | ||
4307 | - /* | ||
4308 | - * We're not journaling, return the appropriate indication. | ||
4309 | - */ | ||
4310 | - current->journal_info = EXT4_NOJOURNAL_HANDLE; | ||
4311 | - return current->journal_info; | ||
4312 | + return ext4_get_nojournal(); | ||
4313 | } | ||
4314 | |||
4315 | /* | ||
4316 | @@ -234,11 +261,7 @@ int __ext4_journal_stop(const char *where, handle_t *handle) | ||
4317 | int rc; | ||
4318 | |||
4319 | if (!ext4_handle_valid(handle)) { | ||
4320 | - /* | ||
4321 | - * Do this here since we don't call jbd2_journal_stop() in | ||
4322 | - * no-journal mode. | ||
4323 | - */ | ||
4324 | - current->journal_info = NULL; | ||
4325 | + ext4_put_nojournal(handle); | ||
4326 | return 0; | ||
4327 | } | ||
4328 | sb = handle->h_transaction->t_journal->j_private; | ||
4329 | @@ -344,7 +367,8 @@ static const char *ext4_decode_error(struct super_block *sb, int errno, | ||
4330 | errstr = "Out of memory"; | ||
4331 | break; | ||
4332 | case -EROFS: | ||
4333 | - if (!sb || EXT4_SB(sb)->s_journal->j_flags & JBD2_ABORT) | ||
4334 | + if (!sb || (EXT4_SB(sb)->s_journal && | ||
4335 | + EXT4_SB(sb)->s_journal->j_flags & JBD2_ABORT)) | ||
4336 | errstr = "Journal has aborted"; | ||
4337 | else | ||
4338 | errstr = "Readonly filesystem"; | ||
4339 | @@ -578,15 +602,14 @@ static void ext4_put_super(struct super_block *sb) | ||
4340 | struct ext4_super_block *es = sbi->s_es; | ||
4341 | int i, err; | ||
4342 | |||
4343 | + flush_workqueue(sbi->dio_unwritten_wq); | ||
4344 | + destroy_workqueue(sbi->dio_unwritten_wq); | ||
4345 | + | ||
4346 | lock_super(sb); | ||
4347 | lock_kernel(); | ||
4348 | if (sb->s_dirt) | ||
4349 | ext4_commit_super(sb, 1); | ||
4350 | |||
4351 | - ext4_release_system_zone(sb); | ||
4352 | - ext4_mb_release(sb); | ||
4353 | - ext4_ext_release(sb); | ||
4354 | - ext4_xattr_put_super(sb); | ||
4355 | if (sbi->s_journal) { | ||
4356 | err = jbd2_journal_destroy(sbi->s_journal); | ||
4357 | sbi->s_journal = NULL; | ||
4358 | @@ -594,6 +617,12 @@ static void ext4_put_super(struct super_block *sb) | ||
4359 | ext4_abort(sb, __func__, | ||
4360 | "Couldn't clean up the journal"); | ||
4361 | } | ||
4362 | + | ||
4363 | + ext4_release_system_zone(sb); | ||
4364 | + ext4_mb_release(sb); | ||
4365 | + ext4_ext_release(sb); | ||
4366 | + ext4_xattr_put_super(sb); | ||
4367 | + | ||
4368 | if (!(sb->s_flags & MS_RDONLY)) { | ||
4369 | EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); | ||
4370 | es->s_state = cpu_to_le16(sbi->s_mount_state); | ||
4371 | @@ -682,6 +711,10 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) | ||
4372 | ei->i_allocated_meta_blocks = 0; | ||
4373 | ei->i_delalloc_reserved_flag = 0; | ||
4374 | spin_lock_init(&(ei->i_block_reservation_lock)); | ||
4375 | + INIT_LIST_HEAD(&ei->i_aio_dio_complete_list); | ||
4376 | + ei->cur_aio_dio = NULL; | ||
4377 | + ei->i_sync_tid = 0; | ||
4378 | + ei->i_datasync_tid = 0; | ||
4379 | |||
4380 | return &ei->vfs_inode; | ||
4381 | } | ||
4382 | @@ -877,6 +910,12 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs) | ||
4383 | if (test_opt(sb, NO_AUTO_DA_ALLOC)) | ||
4384 | seq_puts(seq, ",noauto_da_alloc"); | ||
4385 | |||
4386 | + if (test_opt(sb, DISCARD)) | ||
4387 | + seq_puts(seq, ",discard"); | ||
4388 | + | ||
4389 | + if (test_opt(sb, NOLOAD)) | ||
4390 | + seq_puts(seq, ",norecovery"); | ||
4391 | + | ||
4392 | ext4_show_quota_options(seq, sb); | ||
4393 | |||
4394 | return 0; | ||
4395 | @@ -1057,7 +1096,8 @@ enum { | ||
4396 | Opt_usrquota, Opt_grpquota, Opt_i_version, | ||
4397 | Opt_stripe, Opt_delalloc, Opt_nodelalloc, | ||
4398 | Opt_block_validity, Opt_noblock_validity, | ||
4399 | - Opt_inode_readahead_blks, Opt_journal_ioprio | ||
4400 | + Opt_inode_readahead_blks, Opt_journal_ioprio, | ||
4401 | + Opt_discard, Opt_nodiscard, | ||
4402 | }; | ||
4403 | |||
4404 | static const match_table_t tokens = { | ||
4405 | @@ -1082,6 +1122,7 @@ static const match_table_t tokens = { | ||
4406 | {Opt_acl, "acl"}, | ||
4407 | {Opt_noacl, "noacl"}, | ||
4408 | {Opt_noload, "noload"}, | ||
4409 | + {Opt_noload, "norecovery"}, | ||
4410 | {Opt_nobh, "nobh"}, | ||
4411 | {Opt_bh, "bh"}, | ||
4412 | {Opt_commit, "commit=%u"}, | ||
4413 | @@ -1123,6 +1164,8 @@ static const match_table_t tokens = { | ||
4414 | {Opt_auto_da_alloc, "auto_da_alloc=%u"}, | ||
4415 | {Opt_auto_da_alloc, "auto_da_alloc"}, | ||
4416 | {Opt_noauto_da_alloc, "noauto_da_alloc"}, | ||
4417 | + {Opt_discard, "discard"}, | ||
4418 | + {Opt_nodiscard, "nodiscard"}, | ||
4419 | {Opt_err, NULL}, | ||
4420 | }; | ||
4421 | |||
4422 | @@ -1551,6 +1594,12 @@ set_qf_format: | ||
4423 | else | ||
4424 | set_opt(sbi->s_mount_opt,NO_AUTO_DA_ALLOC); | ||
4425 | break; | ||
4426 | + case Opt_discard: | ||
4427 | + set_opt(sbi->s_mount_opt, DISCARD); | ||
4428 | + break; | ||
4429 | + case Opt_nodiscard: | ||
4430 | + clear_opt(sbi->s_mount_opt, DISCARD); | ||
4431 | + break; | ||
4432 | default: | ||
4433 | ext4_msg(sb, KERN_ERR, | ||
4434 | "Unrecognized mount option \"%s\" " | ||
4435 | @@ -1666,14 +1715,14 @@ static int ext4_fill_flex_info(struct super_block *sb) | ||
4436 | size_t size; | ||
4437 | int i; | ||
4438 | |||
4439 | - if (!sbi->s_es->s_log_groups_per_flex) { | ||
4440 | + sbi->s_log_groups_per_flex = sbi->s_es->s_log_groups_per_flex; | ||
4441 | + groups_per_flex = 1 << sbi->s_log_groups_per_flex; | ||
4442 | + | ||
4443 | + if (groups_per_flex < 2) { | ||
4444 | sbi->s_log_groups_per_flex = 0; | ||
4445 | return 1; | ||
4446 | } | ||
4447 | |||
4448 | - sbi->s_log_groups_per_flex = sbi->s_es->s_log_groups_per_flex; | ||
4449 | - groups_per_flex = 1 << sbi->s_log_groups_per_flex; | ||
4450 | - | ||
4451 | /* We allocate both existing and potentially added groups */ | ||
4452 | flex_group_count = ((sbi->s_groups_count + groups_per_flex - 1) + | ||
4453 | ((le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) + 1) << | ||
4454 | @@ -1695,12 +1744,12 @@ static int ext4_fill_flex_info(struct super_block *sb) | ||
4455 | gdp = ext4_get_group_desc(sb, i, NULL); | ||
4456 | |||
4457 | flex_group = ext4_flex_group(sbi, i); | ||
4458 | - atomic_set(&sbi->s_flex_groups[flex_group].free_inodes, | ||
4459 | - ext4_free_inodes_count(sb, gdp)); | ||
4460 | - atomic_set(&sbi->s_flex_groups[flex_group].free_blocks, | ||
4461 | - ext4_free_blks_count(sb, gdp)); | ||
4462 | - atomic_set(&sbi->s_flex_groups[flex_group].used_dirs, | ||
4463 | - ext4_used_dirs_count(sb, gdp)); | ||
4464 | + atomic_add(ext4_free_inodes_count(sb, gdp), | ||
4465 | + &sbi->s_flex_groups[flex_group].free_inodes); | ||
4466 | + atomic_add(ext4_free_blks_count(sb, gdp), | ||
4467 | + &sbi->s_flex_groups[flex_group].free_blocks); | ||
4468 | + atomic_add(ext4_used_dirs_count(sb, gdp), | ||
4469 | + &sbi->s_flex_groups[flex_group].used_dirs); | ||
4470 | } | ||
4471 | |||
4472 | return 1; | ||
4473 | @@ -2197,6 +2246,7 @@ EXT4_RW_ATTR_SBI_UI(mb_min_to_scan, s_mb_min_to_scan); | ||
4474 | EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs); | ||
4475 | EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request); | ||
4476 | EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc); | ||
4477 | +EXT4_RW_ATTR_SBI_UI(max_writeback_mb_bump, s_max_writeback_mb_bump); | ||
4478 | |||
4479 | static struct attribute *ext4_attrs[] = { | ||
4480 | ATTR_LIST(delayed_allocation_blocks), | ||
4481 | @@ -2210,6 +2260,7 @@ static struct attribute *ext4_attrs[] = { | ||
4482 | ATTR_LIST(mb_order2_req), | ||
4483 | ATTR_LIST(mb_stream_req), | ||
4484 | ATTR_LIST(mb_group_prealloc), | ||
4485 | + ATTR_LIST(max_writeback_mb_bump), | ||
4486 | NULL, | ||
4487 | }; | ||
4488 | |||
4489 | @@ -2253,6 +2304,49 @@ static struct kobj_type ext4_ktype = { | ||
4490 | .release = ext4_sb_release, | ||
4491 | }; | ||
4492 | |||
4493 | +/* | ||
4494 | + * Check whether this filesystem can be mounted based on | ||
4495 | + * the features present and the RDONLY/RDWR mount requested. | ||
4496 | + * Returns 1 if this filesystem can be mounted as requested, | ||
4497 | + * 0 if it cannot be. | ||
4498 | + */ | ||
4499 | +static int ext4_feature_set_ok(struct super_block *sb, int readonly) | ||
4500 | +{ | ||
4501 | + if (EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT4_FEATURE_INCOMPAT_SUPP)) { | ||
4502 | + ext4_msg(sb, KERN_ERR, | ||
4503 | + "Couldn't mount because of " | ||
4504 | + "unsupported optional features (%x)", | ||
4505 | + (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_incompat) & | ||
4506 | + ~EXT4_FEATURE_INCOMPAT_SUPP)); | ||
4507 | + return 0; | ||
4508 | + } | ||
4509 | + | ||
4510 | + if (readonly) | ||
4511 | + return 1; | ||
4512 | + | ||
4513 | + /* Check that feature set is OK for a read-write mount */ | ||
4514 | + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT4_FEATURE_RO_COMPAT_SUPP)) { | ||
4515 | + ext4_msg(sb, KERN_ERR, "couldn't mount RDWR because of " | ||
4516 | + "unsupported optional features (%x)", | ||
4517 | + (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_ro_compat) & | ||
4518 | + ~EXT4_FEATURE_RO_COMPAT_SUPP)); | ||
4519 | + return 0; | ||
4520 | + } | ||
4521 | + /* | ||
4522 | + * Large file size enabled file system can only be mounted | ||
4523 | + * read-write on 32-bit systems if kernel is built with CONFIG_LBDAF | ||
4524 | + */ | ||
4525 | + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_HUGE_FILE)) { | ||
4526 | + if (sizeof(blkcnt_t) < sizeof(u64)) { | ||
4527 | + ext4_msg(sb, KERN_ERR, "Filesystem with huge files " | ||
4528 | + "cannot be mounted RDWR without " | ||
4529 | + "CONFIG_LBDAF"); | ||
4530 | + return 0; | ||
4531 | + } | ||
4532 | + } | ||
4533 | + return 1; | ||
4534 | +} | ||
4535 | + | ||
4536 | static int ext4_fill_super(struct super_block *sb, void *data, int silent) | ||
4537 | __releases(kernel_lock) | ||
4538 | __acquires(kernel_lock) | ||
4539 | @@ -2274,7 +2368,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) | ||
4540 | unsigned int db_count; | ||
4541 | unsigned int i; | ||
4542 | int needs_recovery, has_huge_files; | ||
4543 | - int features; | ||
4544 | __u64 blocks_count; | ||
4545 | int err; | ||
4546 | unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO; | ||
4547 | @@ -2401,39 +2494,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) | ||
4548 | * previously didn't change the revision level when setting the flags, | ||
4549 | * so there is a chance incompat flags are set on a rev 0 filesystem. | ||
4550 | */ | ||
4551 | - features = EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT4_FEATURE_INCOMPAT_SUPP); | ||
4552 | - if (features) { | ||
4553 | - ext4_msg(sb, KERN_ERR, | ||
4554 | - "Couldn't mount because of " | ||
4555 | - "unsupported optional features (%x)", | ||
4556 | - (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_incompat) & | ||
4557 | - ~EXT4_FEATURE_INCOMPAT_SUPP)); | ||
4558 | - goto failed_mount; | ||
4559 | - } | ||
4560 | - features = EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT4_FEATURE_RO_COMPAT_SUPP); | ||
4561 | - if (!(sb->s_flags & MS_RDONLY) && features) { | ||
4562 | - ext4_msg(sb, KERN_ERR, | ||
4563 | - "Couldn't mount RDWR because of " | ||
4564 | - "unsupported optional features (%x)", | ||
4565 | - (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_ro_compat) & | ||
4566 | - ~EXT4_FEATURE_RO_COMPAT_SUPP)); | ||
4567 | + if (!ext4_feature_set_ok(sb, (sb->s_flags & MS_RDONLY))) | ||
4568 | goto failed_mount; | ||
4569 | - } | ||
4570 | - has_huge_files = EXT4_HAS_RO_COMPAT_FEATURE(sb, | ||
4571 | - EXT4_FEATURE_RO_COMPAT_HUGE_FILE); | ||
4572 | - if (has_huge_files) { | ||
4573 | - /* | ||
4574 | - * Large file size enabled file system can only be | ||
4575 | - * mount if kernel is build with CONFIG_LBDAF | ||
4576 | - */ | ||
4577 | - if (sizeof(root->i_blocks) < sizeof(u64) && | ||
4578 | - !(sb->s_flags & MS_RDONLY)) { | ||
4579 | - ext4_msg(sb, KERN_ERR, "Filesystem with huge " | ||
4580 | - "files cannot be mounted read-write " | ||
4581 | - "without CONFIG_LBDAF"); | ||
4582 | - goto failed_mount; | ||
4583 | - } | ||
4584 | - } | ||
4585 | + | ||
4586 | blocksize = BLOCK_SIZE << le32_to_cpu(es->s_log_block_size); | ||
4587 | |||
4588 | if (blocksize < EXT4_MIN_BLOCK_SIZE || | ||
4589 | @@ -2469,6 +2532,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) | ||
4590 | } | ||
4591 | } | ||
4592 | |||
4593 | + has_huge_files = EXT4_HAS_RO_COMPAT_FEATURE(sb, | ||
4594 | + EXT4_FEATURE_RO_COMPAT_HUGE_FILE); | ||
4595 | sbi->s_bitmap_maxbytes = ext4_max_bitmap_size(sb->s_blocksize_bits, | ||
4596 | has_huge_files); | ||
4597 | sb->s_maxbytes = ext4_max_size(sb->s_blocksize_bits, has_huge_files); | ||
4598 | @@ -2549,12 +2614,19 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) | ||
4599 | goto failed_mount; | ||
4600 | } | ||
4601 | |||
4602 | - if (ext4_blocks_count(es) > | ||
4603 | - (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) { | ||
4604 | + /* | ||
4605 | + * Test whether we have more sectors than will fit in sector_t, | ||
4606 | + * and whether the max offset is addressable by the page cache. | ||
4607 | + */ | ||
4608 | + if ((ext4_blocks_count(es) > | ||
4609 | + (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) || | ||
4610 | + (ext4_blocks_count(es) > | ||
4611 | + (pgoff_t)(~0ULL) >> (PAGE_CACHE_SHIFT - sb->s_blocksize_bits))) { | ||
4612 | ext4_msg(sb, KERN_ERR, "filesystem" | ||
4613 | - " too large to mount safely"); | ||
4614 | + " too large to mount safely on this system"); | ||
4615 | if (sizeof(sector_t) < 8) | ||
4616 | ext4_msg(sb, KERN_WARNING, "CONFIG_LBDAF not enabled"); | ||
4617 | + ret = -EFBIG; | ||
4618 | goto failed_mount; | ||
4619 | } | ||
4620 | |||
4621 | @@ -2595,6 +2667,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) | ||
4622 | goto failed_mount; | ||
4623 | } | ||
4624 | sbi->s_groups_count = blocks_count; | ||
4625 | + sbi->s_blockfile_groups = min_t(ext4_group_t, sbi->s_groups_count, | ||
4626 | + (EXT4_MAX_BLOCK_FILE_PHYS / EXT4_BLOCKS_PER_GROUP(sb))); | ||
4627 | db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) / | ||
4628 | EXT4_DESC_PER_BLOCK(sb); | ||
4629 | sbi->s_group_desc = kmalloc(db_count * sizeof(struct buffer_head *), | ||
4630 | @@ -2656,6 +2730,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) | ||
4631 | } | ||
4632 | |||
4633 | sbi->s_stripe = ext4_get_stripe_size(sbi); | ||
4634 | + sbi->s_max_writeback_mb_bump = 128; | ||
4635 | |||
4636 | /* | ||
4637 | * set up enough so that it can read an inode | ||
4638 | @@ -2781,6 +2856,12 @@ no_journal: | ||
4639 | clear_opt(sbi->s_mount_opt, NOBH); | ||
4640 | } | ||
4641 | } | ||
4642 | + EXT4_SB(sb)->dio_unwritten_wq = create_workqueue("ext4-dio-unwritten"); | ||
4643 | + if (!EXT4_SB(sb)->dio_unwritten_wq) { | ||
4644 | + printk(KERN_ERR "EXT4-fs: failed to create DIO workqueue\n"); | ||
4645 | + goto failed_mount_wq; | ||
4646 | + } | ||
4647 | + | ||
4648 | /* | ||
4649 | * The jbd2_journal_load will have done any necessary log recovery, | ||
4650 | * so we can safely mount the rest of the filesystem now. | ||
4651 | @@ -2893,6 +2974,8 @@ cantfind_ext4: | ||
4652 | |||
4653 | failed_mount4: | ||
4654 | ext4_msg(sb, KERN_ERR, "mount failed"); | ||
4655 | + destroy_workqueue(EXT4_SB(sb)->dio_unwritten_wq); | ||
4656 | +failed_mount_wq: | ||
4657 | ext4_release_system_zone(sb); | ||
4658 | if (sbi->s_journal) { | ||
4659 | jbd2_journal_destroy(sbi->s_journal); | ||
4660 | @@ -3208,7 +3291,18 @@ static int ext4_commit_super(struct super_block *sb, int sync) | ||
4661 | clear_buffer_write_io_error(sbh); | ||
4662 | set_buffer_uptodate(sbh); | ||
4663 | } | ||
4664 | - es->s_wtime = cpu_to_le32(get_seconds()); | ||
4665 | + /* | ||
4666 | + * If the file system is mounted read-only, don't update the | ||
4667 | + * superblock write time. This avoids updating the superblock | ||
4668 | + * write time when we are mounting the root file system | ||
4669 | + * read/only but we need to replay the journal; at that point, | ||
4670 | + * for people who are east of GMT and who make their clock | ||
4671 | + * tick in localtime for Windows bug-for-bug compatibility, | ||
4672 | + * the clock is set in the future, and this will cause e2fsck | ||
4673 | + * to complain and force a full file system check. | ||
4674 | + */ | ||
4675 | + if (!(sb->s_flags & MS_RDONLY)) | ||
4676 | + es->s_wtime = cpu_to_le32(get_seconds()); | ||
4677 | es->s_kbytes_written = | ||
4678 | cpu_to_le64(EXT4_SB(sb)->s_kbytes_written + | ||
4679 | ((part_stat_read(sb->s_bdev->bd_part, sectors[1]) - | ||
4680 | @@ -3333,11 +3427,13 @@ static int ext4_sync_fs(struct super_block *sb, int wait) | ||
4681 | { | ||
4682 | int ret = 0; | ||
4683 | tid_t target; | ||
4684 | + struct ext4_sb_info *sbi = EXT4_SB(sb); | ||
4685 | |||
4686 | trace_ext4_sync_fs(sb, wait); | ||
4687 | - if (jbd2_journal_start_commit(EXT4_SB(sb)->s_journal, &target)) { | ||
4688 | + flush_workqueue(sbi->dio_unwritten_wq); | ||
4689 | + if (jbd2_journal_start_commit(sbi->s_journal, &target)) { | ||
4690 | if (wait) | ||
4691 | - jbd2_log_wait_commit(EXT4_SB(sb)->s_journal, target); | ||
4692 | + jbd2_log_wait_commit(sbi->s_journal, target); | ||
4693 | } | ||
4694 | return ret; | ||
4695 | } | ||
4696 | @@ -3477,18 +3573,11 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) | ||
4697 | if (sbi->s_journal) | ||
4698 | ext4_mark_recovery_complete(sb, es); | ||
4699 | } else { | ||
4700 | - int ret; | ||
4701 | - if ((ret = EXT4_HAS_RO_COMPAT_FEATURE(sb, | ||
4702 | - ~EXT4_FEATURE_RO_COMPAT_SUPP))) { | ||
4703 | - ext4_msg(sb, KERN_WARNING, "couldn't " | ||
4704 | - "remount RDWR because of unsupported " | ||
4705 | - "optional features (%x)", | ||
4706 | - (le32_to_cpu(sbi->s_es->s_feature_ro_compat) & | ||
4707 | - ~EXT4_FEATURE_RO_COMPAT_SUPP)); | ||
4708 | + /* Make sure we can mount this feature set readwrite */ | ||
4709 | + if (!ext4_feature_set_ok(sb, 0)) { | ||
4710 | err = -EROFS; | ||
4711 | goto restore_opts; | ||
4712 | } | ||
4713 | - | ||
4714 | /* | ||
4715 | * Make sure the group descriptor checksums | ||
4716 | * are sane. If they aren't, refuse to remount r/w. | ||
4717 | @@ -3624,13 +3713,11 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf) | ||
4718 | buf->f_blocks = ext4_blocks_count(es) - sbi->s_overhead_last; | ||
4719 | buf->f_bfree = percpu_counter_sum_positive(&sbi->s_freeblocks_counter) - | ||
4720 | percpu_counter_sum_positive(&sbi->s_dirtyblocks_counter); | ||
4721 | - ext4_free_blocks_count_set(es, buf->f_bfree); | ||
4722 | buf->f_bavail = buf->f_bfree - ext4_r_blocks_count(es); | ||
4723 | if (buf->f_bfree < ext4_r_blocks_count(es)) | ||
4724 | buf->f_bavail = 0; | ||
4725 | buf->f_files = le32_to_cpu(es->s_inodes_count); | ||
4726 | buf->f_ffree = percpu_counter_sum_positive(&sbi->s_freeinodes_counter); | ||
4727 | - es->s_free_inodes_count = cpu_to_le32(buf->f_ffree); | ||
4728 | buf->f_namelen = EXT4_NAME_LEN; | ||
4729 | fsid = le64_to_cpup((void *)es->s_uuid) ^ | ||
4730 | le64_to_cpup((void *)es->s_uuid + sizeof(u64)); | ||
4731 | diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c | ||
4732 | index 62b31c2..0257019 100644 | ||
4733 | --- a/fs/ext4/xattr.c | ||
4734 | +++ b/fs/ext4/xattr.c | ||
4735 | @@ -810,12 +810,23 @@ inserted: | ||
4736 | get_bh(new_bh); | ||
4737 | } else { | ||
4738 | /* We need to allocate a new block */ | ||
4739 | - ext4_fsblk_t goal = ext4_group_first_block_no(sb, | ||
4740 | + ext4_fsblk_t goal, block; | ||
4741 | + | ||
4742 | + goal = ext4_group_first_block_no(sb, | ||
4743 | EXT4_I(inode)->i_block_group); | ||
4744 | - ext4_fsblk_t block = ext4_new_meta_blocks(handle, inode, | ||
4745 | + | ||
4746 | + /* non-extent files can't have physical blocks past 2^32 */ | ||
4747 | + if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) | ||
4748 | + goal = goal & EXT4_MAX_BLOCK_FILE_PHYS; | ||
4749 | + | ||
4750 | + block = ext4_new_meta_blocks(handle, inode, | ||
4751 | goal, NULL, &error); | ||
4752 | if (error) | ||
4753 | goto cleanup; | ||
4754 | + | ||
4755 | + if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) | ||
4756 | + BUG_ON(block > EXT4_MAX_BLOCK_FILE_PHYS); | ||
4757 | + | ||
4758 | ea_idebug(inode, "creating block %d", block); | ||
4759 | |||
4760 | new_bh = sb_getblk(sb, block); | ||
4761 | @@ -977,6 +988,10 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, | ||
4762 | if (error) | ||
4763 | goto cleanup; | ||
4764 | |||
4765 | + error = ext4_journal_get_write_access(handle, is.iloc.bh); | ||
4766 | + if (error) | ||
4767 | + goto cleanup; | ||
4768 | + | ||
4769 | if (EXT4_I(inode)->i_state & EXT4_STATE_NEW) { | ||
4770 | struct ext4_inode *raw_inode = ext4_raw_inode(&is.iloc); | ||
4771 | memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size); | ||
4772 | @@ -1002,9 +1017,6 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, | ||
4773 | if (flags & XATTR_CREATE) | ||
4774 | goto cleanup; | ||
4775 | } | ||
4776 | - error = ext4_journal_get_write_access(handle, is.iloc.bh); | ||
4777 | - if (error) | ||
4778 | - goto cleanup; | ||
4779 | if (!value) { | ||
4780 | if (!is.s.not_found) | ||
4781 | error = ext4_xattr_ibody_set(handle, inode, &i, &is); | ||
4782 | diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c | ||
4783 | index 7b4088b..8cf902a 100644 | ||
4784 | --- a/fs/jbd2/commit.c | ||
4785 | +++ b/fs/jbd2/commit.c | ||
4786 | @@ -636,6 +636,10 @@ void jbd2_journal_commit_transaction(journal_t *journal) | ||
4787 | JBUFFER_TRACE(jh, "ph3: write metadata"); | ||
4788 | flags = jbd2_journal_write_metadata_buffer(commit_transaction, | ||
4789 | jh, &new_jh, blocknr); | ||
4790 | + if (flags < 0) { | ||
4791 | + jbd2_journal_abort(journal, flags); | ||
4792 | + continue; | ||
4793 | + } | ||
4794 | set_bit(BH_JWrite, &jh2bh(new_jh)->b_state); | ||
4795 | wbuf[bufs++] = jh2bh(new_jh); | ||
4796 | |||
4797 | diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c | ||
4798 | index e378cb3..4b74149 100644 | ||
4799 | --- a/fs/jbd2/journal.c | ||
4800 | +++ b/fs/jbd2/journal.c | ||
4801 | @@ -78,6 +78,7 @@ EXPORT_SYMBOL(jbd2_journal_errno); | ||
4802 | EXPORT_SYMBOL(jbd2_journal_ack_err); | ||
4803 | EXPORT_SYMBOL(jbd2_journal_clear_err); | ||
4804 | EXPORT_SYMBOL(jbd2_log_wait_commit); | ||
4805 | +EXPORT_SYMBOL(jbd2_log_start_commit); | ||
4806 | EXPORT_SYMBOL(jbd2_journal_start_commit); | ||
4807 | EXPORT_SYMBOL(jbd2_journal_force_commit_nested); | ||
4808 | EXPORT_SYMBOL(jbd2_journal_wipe); | ||
4809 | @@ -361,6 +362,10 @@ repeat: | ||
4810 | |||
4811 | jbd_unlock_bh_state(bh_in); | ||
4812 | tmp = jbd2_alloc(bh_in->b_size, GFP_NOFS); | ||
4813 | + if (!tmp) { | ||
4814 | + jbd2_journal_put_journal_head(new_jh); | ||
4815 | + return -ENOMEM; | ||
4816 | + } | ||
4817 | jbd_lock_bh_state(bh_in); | ||
4818 | if (jh_in->b_frozen_data) { | ||
4819 | jbd2_free(tmp, bh_in->b_size); | ||
4820 | @@ -1187,6 +1192,12 @@ static int journal_reset(journal_t *journal) | ||
4821 | |||
4822 | first = be32_to_cpu(sb->s_first); | ||
4823 | last = be32_to_cpu(sb->s_maxlen); | ||
4824 | + if (first + JBD2_MIN_JOURNAL_BLOCKS > last + 1) { | ||
4825 | + printk(KERN_ERR "JBD: Journal too short (blocks %llu-%llu).\n", | ||
4826 | + first, last); | ||
4827 | + journal_fail_superblock(journal); | ||
4828 | + return -EINVAL; | ||
4829 | + } | ||
4830 | |||
4831 | journal->j_first = first; | ||
4832 | journal->j_last = last; | ||
4833 | diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c | ||
4834 | index 6213ac7..a051270 100644 | ||
4835 | --- a/fs/jbd2/transaction.c | ||
4836 | +++ b/fs/jbd2/transaction.c | ||
4837 | @@ -57,7 +57,7 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction) | ||
4838 | INIT_LIST_HEAD(&transaction->t_private_list); | ||
4839 | |||
4840 | /* Set up the commit timer for the new transaction. */ | ||
4841 | - journal->j_commit_timer.expires = round_jiffies(transaction->t_expires); | ||
4842 | + journal->j_commit_timer.expires = round_jiffies_up(transaction->t_expires); | ||
4843 | add_timer(&journal->j_commit_timer); | ||
4844 | |||
4845 | J_ASSERT(journal->j_running_transaction == NULL); | ||
4846 | @@ -238,6 +238,8 @@ repeat_locked: | ||
4847 | __jbd2_log_space_left(journal)); | ||
4848 | spin_unlock(&transaction->t_handle_lock); | ||
4849 | spin_unlock(&journal->j_state_lock); | ||
4850 | + | ||
4851 | + lock_map_acquire(&handle->h_lockdep_map); | ||
4852 | out: | ||
4853 | if (unlikely(new_transaction)) /* It's usually NULL */ | ||
4854 | kfree(new_transaction); | ||
4855 | @@ -303,8 +305,6 @@ handle_t *jbd2_journal_start(journal_t *journal, int nblocks) | ||
4856 | handle = ERR_PTR(err); | ||
4857 | goto out; | ||
4858 | } | ||
4859 | - | ||
4860 | - lock_map_acquire(&handle->h_lockdep_map); | ||
4861 | out: | ||
4862 | return handle; | ||
4863 | } | ||
4864 | @@ -426,6 +426,7 @@ int jbd2_journal_restart(handle_t *handle, int nblocks) | ||
4865 | __jbd2_log_start_commit(journal, transaction->t_tid); | ||
4866 | spin_unlock(&journal->j_state_lock); | ||
4867 | |||
4868 | + lock_map_release(&handle->h_lockdep_map); | ||
4869 | handle->h_buffer_credits = nblocks; | ||
4870 | ret = start_this_handle(journal, handle); | ||
4871 | return ret; | ||
4872 | diff --git a/include/linux/sched.h b/include/linux/sched.h | ||
4873 | index 0f1ea4a..d3e910b 100644 | ||
4874 | --- a/include/linux/sched.h | ||
4875 | +++ b/include/linux/sched.h | ||
4876 | @@ -1999,11 +1999,18 @@ static inline int is_si_special(const struct siginfo *info) | ||
4877 | return info <= SEND_SIG_FORCED; | ||
4878 | } | ||
4879 | |||
4880 | -/* True if we are on the alternate signal stack. */ | ||
4881 | - | ||
4882 | +/* | ||
4883 | + * True if we are on the alternate signal stack. | ||
4884 | + */ | ||
4885 | static inline int on_sig_stack(unsigned long sp) | ||
4886 | { | ||
4887 | - return (sp - current->sas_ss_sp < current->sas_ss_size); | ||
4888 | +#ifdef CONFIG_STACK_GROWSUP | ||
4889 | + return sp >= current->sas_ss_sp && | ||
4890 | + sp - current->sas_ss_sp < current->sas_ss_size; | ||
4891 | +#else | ||
4892 | + return sp > current->sas_ss_sp && | ||
4893 | + sp - current->sas_ss_sp <= current->sas_ss_size; | ||
4894 | +#endif | ||
4895 | } | ||
4896 | |||
4897 | static inline int sas_ss_flags(unsigned long sp) | ||
4898 | diff --git a/include/scsi/osd_protocol.h b/include/scsi/osd_protocol.h | ||
4899 | index 2cc8e8b..6856612 100644 | ||
4900 | --- a/include/scsi/osd_protocol.h | ||
4901 | +++ b/include/scsi/osd_protocol.h | ||
4902 | @@ -17,6 +17,7 @@ | ||
4903 | #define __OSD_PROTOCOL_H__ | ||
4904 | |||
4905 | #include <linux/types.h> | ||
4906 | +#include <linux/kernel.h> | ||
4907 | #include <asm/unaligned.h> | ||
4908 | #include <scsi/scsi.h> | ||
4909 | |||
4910 | diff --git a/include/scsi/scsi_host.h b/include/scsi/scsi_host.h | ||
4911 | index b62a097..6cc72e2 100644 | ||
4912 | --- a/include/scsi/scsi_host.h | ||
4913 | +++ b/include/scsi/scsi_host.h | ||
4914 | @@ -677,6 +677,12 @@ struct Scsi_Host { | ||
4915 | void *shost_data; | ||
4916 | |||
4917 | /* | ||
4918 | + * Points to the physical bus device we'd use to do DMA | ||
4919 | + * Needed just in case we have virtual hosts. | ||
4920 | + */ | ||
4921 | + struct device *dma_dev; | ||
4922 | + | ||
4923 | + /* | ||
4924 | * We should ensure that this is aligned, both for better performance | ||
4925 | * and also because some compilers (m68k) don't automatically force | ||
4926 | * alignment to a long boundary. | ||
4927 | @@ -720,7 +726,9 @@ extern int scsi_queue_work(struct Scsi_Host *, struct work_struct *); | ||
4928 | extern void scsi_flush_work(struct Scsi_Host *); | ||
4929 | |||
4930 | extern struct Scsi_Host *scsi_host_alloc(struct scsi_host_template *, int); | ||
4931 | -extern int __must_check scsi_add_host(struct Scsi_Host *, struct device *); | ||
4932 | +extern int __must_check scsi_add_host_with_dma(struct Scsi_Host *, | ||
4933 | + struct device *, | ||
4934 | + struct device *); | ||
4935 | extern void scsi_scan_host(struct Scsi_Host *); | ||
4936 | extern void scsi_rescan_device(struct device *); | ||
4937 | extern void scsi_remove_host(struct Scsi_Host *); | ||
4938 | @@ -731,6 +739,12 @@ extern const char *scsi_host_state_name(enum scsi_host_state); | ||
4939 | |||
4940 | extern u64 scsi_calculate_bounce_limit(struct Scsi_Host *); | ||
4941 | |||
4942 | +static inline int __must_check scsi_add_host(struct Scsi_Host *host, | ||
4943 | + struct device *dev) | ||
4944 | +{ | ||
4945 | + return scsi_add_host_with_dma(host, dev, dev); | ||
4946 | +} | ||
4947 | + | ||
4948 | static inline struct device *scsi_get_device(struct Scsi_Host *shost) | ||
4949 | { | ||
4950 | return shost->shost_gendev.parent; | ||
4951 | diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h | ||
4952 | index 7d8b5bc..824979e 100644 | ||
4953 | --- a/include/trace/events/ext4.h | ||
4954 | +++ b/include/trace/events/ext4.h | ||
4955 | @@ -5,10 +5,12 @@ | ||
4956 | #define _TRACE_EXT4_H | ||
4957 | |||
4958 | #include <linux/writeback.h> | ||
4959 | -#include "../../../fs/ext4/ext4.h" | ||
4960 | -#include "../../../fs/ext4/mballoc.h" | ||
4961 | #include <linux/tracepoint.h> | ||
4962 | |||
4963 | +struct ext4_allocation_context; | ||
4964 | +struct ext4_allocation_request; | ||
4965 | +struct ext4_prealloc_space; | ||
4966 | + | ||
4967 | TRACE_EVENT(ext4_free_inode, | ||
4968 | TP_PROTO(struct inode *inode), | ||
4969 | |||
4970 | @@ -229,6 +231,7 @@ TRACE_EVENT(ext4_da_writepages, | ||
4971 | __field( char, for_reclaim ) | ||
4972 | __field( char, for_writepages ) | ||
4973 | __field( char, range_cyclic ) | ||
4974 | + __field( pgoff_t, writeback_index ) | ||
4975 | ), | ||
4976 | |||
4977 | TP_fast_assign( | ||
4978 | @@ -243,14 +246,51 @@ TRACE_EVENT(ext4_da_writepages, | ||
4979 | __entry->for_reclaim = wbc->for_reclaim; | ||
4980 | __entry->for_writepages = wbc->for_writepages; | ||
4981 | __entry->range_cyclic = wbc->range_cyclic; | ||
4982 | + __entry->writeback_index = inode->i_mapping->writeback_index; | ||
4983 | ), | ||
4984 | |||
4985 | - TP_printk("dev %s ino %lu nr_t_write %ld pages_skipped %ld range_start %llu range_end %llu nonblocking %d for_kupdate %d for_reclaim %d for_writepages %d range_cyclic %d", | ||
4986 | - jbd2_dev_to_name(__entry->dev), __entry->ino, __entry->nr_to_write, | ||
4987 | + TP_printk("dev %s ino %lu nr_to_write %ld pages_skipped %ld range_start %llu range_end %llu nonblocking %d for_kupdate %d for_reclaim %d for_writepages %d range_cyclic %d writeback_index %lu", | ||
4988 | + jbd2_dev_to_name(__entry->dev), | ||
4989 | + (unsigned long) __entry->ino, __entry->nr_to_write, | ||
4990 | __entry->pages_skipped, __entry->range_start, | ||
4991 | __entry->range_end, __entry->nonblocking, | ||
4992 | __entry->for_kupdate, __entry->for_reclaim, | ||
4993 | - __entry->for_writepages, __entry->range_cyclic) | ||
4994 | + __entry->for_writepages, __entry->range_cyclic, | ||
4995 | + (unsigned long) __entry->writeback_index) | ||
4996 | +); | ||
4997 | + | ||
4998 | +TRACE_EVENT(ext4_da_write_pages, | ||
4999 | + TP_PROTO(struct inode *inode, struct mpage_da_data *mpd), | ||
5000 | + | ||
5001 | + TP_ARGS(inode, mpd), | ||
5002 | + | ||
5003 | + TP_STRUCT__entry( | ||
5004 | + __field( dev_t, dev ) | ||
5005 | + __field( ino_t, ino ) | ||
5006 | + __field( __u64, b_blocknr ) | ||
5007 | + __field( __u32, b_size ) | ||
5008 | + __field( __u32, b_state ) | ||
5009 | + __field( unsigned long, first_page ) | ||
5010 | + __field( int, io_done ) | ||
5011 | + __field( int, pages_written ) | ||
5012 | + ), | ||
5013 | + | ||
5014 | + TP_fast_assign( | ||
5015 | + __entry->dev = inode->i_sb->s_dev; | ||
5016 | + __entry->ino = inode->i_ino; | ||
5017 | + __entry->b_blocknr = mpd->b_blocknr; | ||
5018 | + __entry->b_size = mpd->b_size; | ||
5019 | + __entry->b_state = mpd->b_state; | ||
5020 | + __entry->first_page = mpd->first_page; | ||
5021 | + __entry->io_done = mpd->io_done; | ||
5022 | + __entry->pages_written = mpd->pages_written; | ||
5023 | + ), | ||
5024 | + | ||
5025 | + TP_printk("dev %s ino %lu b_blocknr %llu b_size %u b_state 0x%04x first_page %lu io_done %d pages_written %d", | ||
5026 | + jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino, | ||
5027 | + __entry->b_blocknr, __entry->b_size, | ||
5028 | + __entry->b_state, __entry->first_page, | ||
5029 | + __entry->io_done, __entry->pages_written) | ||
5030 | ); | ||
5031 | |||
5032 | TRACE_EVENT(ext4_da_writepages_result, | ||
5033 | @@ -268,6 +308,7 @@ TRACE_EVENT(ext4_da_writepages_result, | ||
5034 | __field( char, encountered_congestion ) | ||
5035 | __field( char, more_io ) | ||
5036 | __field( char, no_nrwrite_index_update ) | ||
5037 | + __field( pgoff_t, writeback_index ) | ||
5038 | ), | ||
5039 | |||
5040 | TP_fast_assign( | ||
5041 | @@ -279,13 +320,16 @@ TRACE_EVENT(ext4_da_writepages_result, | ||
5042 | __entry->encountered_congestion = wbc->encountered_congestion; | ||
5043 | __entry->more_io = wbc->more_io; | ||
5044 | __entry->no_nrwrite_index_update = wbc->no_nrwrite_index_update; | ||
5045 | + __entry->writeback_index = inode->i_mapping->writeback_index; | ||
5046 | ), | ||
5047 | |||
5048 | - TP_printk("dev %s ino %lu ret %d pages_written %d pages_skipped %ld congestion %d more_io %d no_nrwrite_index_update %d", | ||
5049 | - jbd2_dev_to_name(__entry->dev), __entry->ino, __entry->ret, | ||
5050 | + TP_printk("dev %s ino %lu ret %d pages_written %d pages_skipped %ld congestion %d more_io %d no_nrwrite_index_update %d writeback_index %lu", | ||
5051 | + jbd2_dev_to_name(__entry->dev), | ||
5052 | + (unsigned long) __entry->ino, __entry->ret, | ||
5053 | __entry->pages_written, __entry->pages_skipped, | ||
5054 | __entry->encountered_congestion, __entry->more_io, | ||
5055 | - __entry->no_nrwrite_index_update) | ||
5056 | + __entry->no_nrwrite_index_update, | ||
5057 | + (unsigned long) __entry->writeback_index) | ||
5058 | ); | ||
5059 | |||
5060 | TRACE_EVENT(ext4_da_write_begin, |