diff --git a/Documentation/filesystems/ext4.txt b/Documentation/filesystems/ext4.txt index 7be02ac..32c3da4 100644 --- a/Documentation/filesystems/ext4.txt +++ b/Documentation/filesystems/ext4.txt @@ -153,8 +153,8 @@ journal_dev=devnum When the external journal device's major/minor numbers identified through its new major/minor numbers encoded in devnum. -noload Don't load the journal on mounting. Note that - if the filesystem was not unmounted cleanly, +norecovery Don't load the journal on mounting. Note that +noload if the filesystem was not unmounted cleanly, skipping the journal replay will lead to the filesystem containing inconsistencies that can lead to any number of problems. @@ -338,6 +338,12 @@ noauto_da_alloc replacing existing files via patterns such as system crashes before the delayed allocation blocks are forced to disk. +discard Controls whether ext4 should issue discard/TRIM +nodiscard(*) commands to the underlying block device when + blocks are freed. This is useful for SSD devices + and sparse/thinly-provisioned LUNs, but it is off + by default until sufficient testing has been done. + Data Mode ========= There are 3 different data modes: diff --git a/drivers/scsi/hosts.c b/drivers/scsi/hosts.c index 5fd2da4..28a753d 100644 --- a/drivers/scsi/hosts.c +++ b/drivers/scsi/hosts.c @@ -180,14 +180,20 @@ void scsi_remove_host(struct Scsi_Host *shost) EXPORT_SYMBOL(scsi_remove_host); /** - * scsi_add_host - add a scsi host + * scsi_add_host_with_dma - add a scsi host with dma device * @shost: scsi host pointer to add * @dev: a struct device of type scsi class + * @dma_dev: dma device for the host + * + * Note: You rarely need to worry about this unless you're in a + * virtualised host environments, so use the simpler scsi_add_host() + * function instead. * * Return value: * 0 on success / != 0 for error **/ -int scsi_add_host(struct Scsi_Host *shost, struct device *dev) +int scsi_add_host_with_dma(struct Scsi_Host *shost, struct device *dev, + struct device *dma_dev) { struct scsi_host_template *sht = shost->hostt; int error = -EINVAL; @@ -207,6 +213,7 @@ int scsi_add_host(struct Scsi_Host *shost, struct device *dev) if (!shost->shost_gendev.parent) shost->shost_gendev.parent = dev ? dev : &platform_bus; + shost->dma_dev = dma_dev; error = device_add(&shost->shost_gendev); if (error) @@ -262,7 +269,7 @@ int scsi_add_host(struct Scsi_Host *shost, struct device *dev) fail: return error; } -EXPORT_SYMBOL(scsi_add_host); +EXPORT_SYMBOL(scsi_add_host_with_dma); static void scsi_host_dev_release(struct device *dev) { diff --git a/drivers/scsi/lpfc/lpfc_init.c b/drivers/scsi/lpfc/lpfc_init.c index fc67cc6..cf13ff2 100644 --- a/drivers/scsi/lpfc/lpfc_init.c +++ b/drivers/scsi/lpfc/lpfc_init.c @@ -2384,7 +2384,7 @@ lpfc_create_port(struct lpfc_hba *phba, int instance, struct device *dev) vport->els_tmofunc.function = lpfc_els_timeout; vport->els_tmofunc.data = (unsigned long)vport; - error = scsi_add_host(shost, dev); + error = scsi_add_host_with_dma(shost, dev, &phba->pcidev->dev); if (error) goto out_put_shost; diff --git a/drivers/scsi/megaraid/megaraid_sas.c b/drivers/scsi/megaraid/megaraid_sas.c index 7dc3d18..7a838c8 100644 --- a/drivers/scsi/megaraid/megaraid_sas.c +++ b/drivers/scsi/megaraid/megaraid_sas.c @@ -3032,7 +3032,7 @@ megasas_mgmt_fw_ioctl(struct megasas_instance *instance, int error = 0, i; void *sense = NULL; dma_addr_t sense_handle; - u32 *sense_ptr; + unsigned long *sense_ptr; memset(kbuff_arr, 0, sizeof(kbuff_arr)); @@ -3109,7 +3109,7 @@ megasas_mgmt_fw_ioctl(struct megasas_instance *instance, } sense_ptr = - (u32 *) ((unsigned long)cmd->frame + ioc->sense_off); + (unsigned long *) ((unsigned long)cmd->frame + ioc->sense_off); *sense_ptr = sense_handle; } @@ -3140,8 +3140,8 @@ megasas_mgmt_fw_ioctl(struct megasas_instance *instance, * sense_ptr points to the location that has the user * sense buffer address */ - sense_ptr = (u32 *) ((unsigned long)ioc->frame.raw + - ioc->sense_off); + sense_ptr = (unsigned long *) ((unsigned long)ioc->frame.raw + + ioc->sense_off); if (copy_to_user((void __user *)((unsigned long)(*sense_ptr)), sense, ioc->sense_len)) { diff --git a/drivers/scsi/qla2xxx/qla_attr.c b/drivers/scsi/qla2xxx/qla_attr.c index 0f87962..67e016d 100644 --- a/drivers/scsi/qla2xxx/qla_attr.c +++ b/drivers/scsi/qla2xxx/qla_attr.c @@ -1654,7 +1654,8 @@ qla24xx_vport_create(struct fc_vport *fc_vport, bool disable) fc_vport_set_state(fc_vport, FC_VPORT_LINKDOWN); } - if (scsi_add_host(vha->host, &fc_vport->dev)) { + if (scsi_add_host_with_dma(vha->host, &fc_vport->dev, + &ha->pdev->dev)) { DEBUG15(printk("scsi(%ld): scsi_add_host failure for VP[%d].\n", vha->host_no, vha->vp_idx)); goto vport_create_failed_2; diff --git a/drivers/scsi/scsi_lib_dma.c b/drivers/scsi/scsi_lib_dma.c index ac6855c..dcd1285 100644 --- a/drivers/scsi/scsi_lib_dma.c +++ b/drivers/scsi/scsi_lib_dma.c @@ -23,7 +23,7 @@ int scsi_dma_map(struct scsi_cmnd *cmd) int nseg = 0; if (scsi_sg_count(cmd)) { - struct device *dev = cmd->device->host->shost_gendev.parent; + struct device *dev = cmd->device->host->dma_dev; nseg = dma_map_sg(dev, scsi_sglist(cmd), scsi_sg_count(cmd), cmd->sc_data_direction); @@ -41,7 +41,7 @@ EXPORT_SYMBOL(scsi_dma_map); void scsi_dma_unmap(struct scsi_cmnd *cmd) { if (scsi_sg_count(cmd)) { - struct device *dev = cmd->device->host->shost_gendev.parent; + struct device *dev = cmd->device->host->dma_dev; dma_unmap_sg(dev, scsi_sglist(cmd), scsi_sg_count(cmd), cmd->sc_data_direction); diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index e2126d7..34bb797 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -761,7 +761,13 @@ static unsigned long ext4_bg_num_gdb_meta(struct super_block *sb, static unsigned long ext4_bg_num_gdb_nometa(struct super_block *sb, ext4_group_t group) { - return ext4_bg_has_super(sb, group) ? EXT4_SB(sb)->s_gdb_count : 0; + if (!ext4_bg_has_super(sb, group)) + return 0; + + if (EXT4_HAS_INCOMPAT_FEATURE(sb,EXT4_FEATURE_INCOMPAT_META_BG)) + return le32_to_cpu(EXT4_SB(sb)->s_es->s_first_meta_bg); + else + return EXT4_SB(sb)->s_gdb_count; } /** diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c index 50784ef..dc79b75 100644 --- a/fs/ext4/block_validity.c +++ b/fs/ext4/block_validity.c @@ -160,7 +160,7 @@ int ext4_setup_system_zone(struct super_block *sb) if (ext4_bg_has_super(sb, i) && ((i < 5) || ((i % flex_size) == 0))) add_system_zone(sbi, ext4_group_first_block_no(sb, i), - sbi->s_gdb_count + 1); + ext4_bg_num_gdb(sb, i) + 1); gdp = ext4_get_group_desc(sb, i, NULL); ret = add_system_zone(sbi, ext4_block_bitmap(sb, gdp), 1); if (ret) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 9714db3..3b8321b 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -88,6 +88,8 @@ typedef unsigned int ext4_group_t; #define EXT4_MB_HINT_TRY_GOAL 512 /* blocks already pre-reserved by delayed allocation */ #define EXT4_MB_DELALLOC_RESERVED 1024 +/* We are doing stream allocation */ +#define EXT4_MB_STREAM_ALLOC 2048 struct ext4_allocation_request { @@ -111,6 +113,33 @@ struct ext4_allocation_request { unsigned int flags; }; +#define DIO_AIO_UNWRITTEN 0x1 +typedef struct ext4_io_end { + struct list_head list; /* per-file finished AIO list */ + struct inode *inode; /* file being written to */ + unsigned int flag; /* sync IO or AIO */ + int error; /* I/O error code */ + ext4_lblk_t offset; /* offset in the file */ + size_t size; /* size of the extent */ + struct work_struct work; /* data work queue */ +} ext4_io_end_t; + +/* + * Delayed allocation stuff + */ + +struct mpage_da_data { + struct inode *inode; + sector_t b_blocknr; /* start block number of extent */ + size_t b_size; /* size of extent */ + unsigned long b_state; /* state of the extent */ + unsigned long first_page, next_page; /* extent of pages */ + struct writeback_control *wbc; + int io_done; + int pages_written; + int retval; +}; + /* * Special inodes numbers */ @@ -251,7 +280,6 @@ struct flex_groups { #define EXT4_TOPDIR_FL 0x00020000 /* Top of directory hierarchies*/ #define EXT4_HUGE_FILE_FL 0x00040000 /* Set to each huge file */ #define EXT4_EXTENTS_FL 0x00080000 /* Inode uses extents */ -#define EXT4_EXT_MIGRATE 0x00100000 /* Inode is migrating */ #define EXT4_RESERVED_FL 0x80000000 /* reserved for ext4 lib */ #define EXT4_FL_USER_VISIBLE 0x000BDFFF /* User visible flags */ @@ -289,6 +317,8 @@ static inline __u32 ext4_mask_flags(umode_t mode, __u32 flags) #define EXT4_STATE_XATTR 0x00000004 /* has in-inode xattrs */ #define EXT4_STATE_NO_EXPAND 0x00000008 /* No space for expansion */ #define EXT4_STATE_DA_ALLOC_CLOSE 0x00000010 /* Alloc DA blks on close */ +#define EXT4_STATE_EXT_MIGRATE 0x00000020 /* Inode is migrating */ +#define EXT4_STATE_DIO_UNWRITTEN 0x00000040 /* need convert on dio done*/ /* Used to pass group descriptor data when online resize is done */ struct ext4_new_group_input { @@ -330,7 +360,16 @@ struct ext4_new_group_data { /* Call ext4_da_update_reserve_space() after successfully allocating the blocks */ #define EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE 0x0008 - + /* caller is from the direct IO path, request to creation of an + unitialized extents if not allocated, split the uninitialized + extent if blocks has been preallocated already*/ +#define EXT4_GET_BLOCKS_DIO 0x0010 +#define EXT4_GET_BLOCKS_CONVERT 0x0020 +#define EXT4_GET_BLOCKS_DIO_CREATE_EXT (EXT4_GET_BLOCKS_DIO|\ + EXT4_GET_BLOCKS_CREATE_UNINIT_EXT) + /* Convert extent to initialized after direct IO complete */ +#define EXT4_GET_BLOCKS_DIO_CONVERT_EXT (EXT4_GET_BLOCKS_CONVERT|\ + EXT4_GET_BLOCKS_DIO_CREATE_EXT) /* * ioctl commands @@ -386,6 +425,9 @@ struct ext4_mount_options { #endif }; +/* Max physical block we can addres w/o extents */ +#define EXT4_MAX_BLOCK_FILE_PHYS 0xFFFFFFFF + /* * Structure of an inode on the disk */ @@ -481,8 +523,8 @@ struct move_extent { static inline __le32 ext4_encode_extra_time(struct timespec *time) { return cpu_to_le32((sizeof(time->tv_sec) > 4 ? - time->tv_sec >> 32 : 0) | - ((time->tv_nsec << 2) & EXT4_NSEC_MASK)); + (time->tv_sec >> 32) & EXT4_EPOCH_MASK : 0) | + ((time->tv_nsec << EXT4_EPOCH_BITS) & EXT4_NSEC_MASK)); } static inline void ext4_decode_extra_time(struct timespec *time, __le32 extra) @@ -490,7 +532,7 @@ static inline void ext4_decode_extra_time(struct timespec *time, __le32 extra) if (sizeof(time->tv_sec) > 4) time->tv_sec |= (__u64)(le32_to_cpu(extra) & EXT4_EPOCH_MASK) << 32; - time->tv_nsec = (le32_to_cpu(extra) & EXT4_NSEC_MASK) >> 2; + time->tv_nsec = (le32_to_cpu(extra) & EXT4_NSEC_MASK) >> EXT4_EPOCH_BITS; } #define EXT4_INODE_SET_XTIME(xtime, inode, raw_inode) \ @@ -653,6 +695,18 @@ struct ext4_inode_info { __u16 i_extra_isize; spinlock_t i_block_reservation_lock; + + /* completed async DIOs that might need unwritten extents handling */ + struct list_head i_aio_dio_complete_list; + /* current io_end structure for async DIO write*/ + ext4_io_end_t *cur_aio_dio; + + /* + * Transactions that contain inode's metadata needed to complete + * fsync and fdatasync, respectively. + */ + tid_t i_sync_tid; + tid_t i_datasync_tid; }; /* @@ -700,6 +754,7 @@ struct ext4_inode_info { #define EXT4_MOUNT_DELALLOC 0x8000000 /* Delalloc support */ #define EXT4_MOUNT_DATA_ERR_ABORT 0x10000000 /* Abort on file data write */ #define EXT4_MOUNT_BLOCK_VALIDITY 0x20000000 /* Block validity checking */ +#define EXT4_MOUNT_DISCARD 0x40000000 /* Issue DISCARD requests */ #define clear_opt(o, opt) o &= ~EXT4_MOUNT_##opt #define set_opt(o, opt) o |= EXT4_MOUNT_##opt @@ -841,6 +896,7 @@ struct ext4_sb_info { unsigned long s_gdb_count; /* Number of group descriptor blocks */ unsigned long s_desc_per_block; /* Number of group descriptors per block */ ext4_group_t s_groups_count; /* Number of groups in the fs */ + ext4_group_t s_blockfile_groups;/* Groups acceptable for non-extent files */ unsigned long s_overhead_last; /* Last calculated overhead */ unsigned long s_blocks_last; /* Last seen block count */ loff_t s_bitmap_maxbytes; /* max bytes for bitmap files */ @@ -923,6 +979,7 @@ struct ext4_sb_info { unsigned int s_mb_stats; unsigned int s_mb_order2_reqs; unsigned int s_mb_group_prealloc; + unsigned int s_max_writeback_mb_bump; /* where last allocation was done - for stream allocation */ unsigned long s_mb_last_group; unsigned long s_mb_last_start; @@ -950,6 +1007,7 @@ struct ext4_sb_info { atomic_t s_mb_lost_chunks; atomic_t s_mb_preallocated; atomic_t s_mb_discarded; + atomic_t s_lock_busy; /* locality groups */ struct ext4_locality_group *s_locality_groups; @@ -960,6 +1018,9 @@ struct ext4_sb_info { unsigned int s_log_groups_per_flex; struct flex_groups *s_flex_groups; + + /* workqueue for dio unwritten */ + struct workqueue_struct *dio_unwritten_wq; }; static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb) @@ -1367,6 +1428,7 @@ extern int ext4_change_inode_journal_flag(struct inode *, int); extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *); extern int ext4_can_truncate(struct inode *inode); extern void ext4_truncate(struct inode *); +extern int ext4_truncate_restart_trans(handle_t *, struct inode *, int nblocks); extern void ext4_set_inode_flags(struct inode *); extern void ext4_get_inode_flags(struct ext4_inode_info *); extern int ext4_alloc_da_blocks(struct inode *inode); @@ -1378,7 +1440,7 @@ extern int ext4_block_truncate_page(handle_t *handle, struct address_space *mapping, loff_t from); extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); extern qsize_t ext4_get_reserved_space(struct inode *inode); - +extern int flush_aio_dio_completed_IO(struct inode *inode); /* ioctl.c */ extern long ext4_ioctl(struct file *, unsigned int, unsigned long); extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long); @@ -1591,15 +1653,42 @@ struct ext4_group_info { #define EXT4_MB_GRP_NEED_INIT(grp) \ (test_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &((grp)->bb_state))) +#define EXT4_MAX_CONTENTION 8 +#define EXT4_CONTENTION_THRESHOLD 2 + static inline spinlock_t *ext4_group_lock_ptr(struct super_block *sb, ext4_group_t group) { return bgl_lock_ptr(EXT4_SB(sb)->s_blockgroup_lock, group); } +/* + * Returns true if the filesystem is busy enough that attempts to + * access the block group locks has run into contention. + */ +static inline int ext4_fs_is_busy(struct ext4_sb_info *sbi) +{ + return (atomic_read(&sbi->s_lock_busy) > EXT4_CONTENTION_THRESHOLD); +} + static inline void ext4_lock_group(struct super_block *sb, ext4_group_t group) { - spin_lock(ext4_group_lock_ptr(sb, group)); + spinlock_t *lock = ext4_group_lock_ptr(sb, group); + if (spin_trylock(lock)) + /* + * We're able to grab the lock right away, so drop the + * lock contention counter. + */ + atomic_add_unless(&EXT4_SB(sb)->s_lock_busy, -1, 0); + else { + /* + * The lock is busy, so bump the contention counter, + * and then wait on the spin lock. + */ + atomic_add_unless(&EXT4_SB(sb)->s_lock_busy, 1, + EXT4_MAX_CONTENTION); + spin_lock(lock); + } } static inline void ext4_unlock_group(struct super_block *sb, @@ -1650,6 +1739,8 @@ extern void ext4_ext_init(struct super_block *); extern void ext4_ext_release(struct super_block *); extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len); +extern int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset, + loff_t len); extern int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block, unsigned int max_blocks, struct buffer_head *bh, int flags); diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h index 20a8410..1c2db3f 100644 --- a/fs/ext4/ext4_extents.h +++ b/fs/ext4/ext4_extents.h @@ -220,6 +220,11 @@ static inline int ext4_ext_get_actual_len(struct ext4_extent *ext) (le16_to_cpu(ext->ee_len) - EXT_INIT_MAX_LEN)); } +static inline void ext4_ext_mark_initialized(struct ext4_extent *ext) +{ + ext->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ext)); +} + extern int ext4_ext_calc_metadata_amount(struct inode *inode, int blocks); extern ext4_fsblk_t ext_pblock(struct ext4_extent *ex); extern ext4_fsblk_t idx_pblock(struct ext4_extent_idx *); @@ -235,7 +240,7 @@ extern int ext4_ext_try_to_merge(struct inode *inode, struct ext4_ext_path *path, struct ext4_extent *); extern unsigned int ext4_ext_check_overlap(struct inode *, struct ext4_extent *, struct ext4_ext_path *); -extern int ext4_ext_insert_extent(handle_t *, struct inode *, struct ext4_ext_path *, struct ext4_extent *); +extern int ext4_ext_insert_extent(handle_t *, struct inode *, struct ext4_ext_path *, struct ext4_extent *, int); extern int ext4_ext_walk_space(struct inode *, ext4_lblk_t, ext4_lblk_t, ext_prepare_callback, void *); extern struct ext4_ext_path *ext4_ext_find_extent(struct inode *, ext4_lblk_t, diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c index eb27fd0..6a94099 100644 --- a/fs/ext4/ext4_jbd2.c +++ b/fs/ext4/ext4_jbd2.c @@ -44,7 +44,7 @@ int __ext4_journal_forget(const char *where, handle_t *handle, handle, err); } else - brelse(bh); + bforget(bh); return err; } @@ -60,7 +60,7 @@ int __ext4_journal_revoke(const char *where, handle_t *handle, handle, err); } else - brelse(bh); + bforget(bh); return err; } @@ -89,7 +89,10 @@ int __ext4_handle_dirty_metadata(const char *where, handle_t *handle, ext4_journal_abort_handle(where, __func__, bh, handle, err); } else { - mark_buffer_dirty(bh); + if (inode && bh) + mark_buffer_dirty_inode(bh, inode); + else + mark_buffer_dirty(bh); if (inode && inode_needs_sync(inode)) { sync_dirty_buffer(bh); if (buffer_req(bh) && !buffer_uptodate(bh)) { diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h index 139fb8c..1892a77 100644 --- a/fs/ext4/ext4_jbd2.h +++ b/fs/ext4/ext4_jbd2.h @@ -49,7 +49,7 @@ #define EXT4_DATA_TRANS_BLOCKS(sb) (EXT4_SINGLEDATA_TRANS_BLOCKS(sb) + \ EXT4_XATTR_TRANS_BLOCKS - 2 + \ - 2*EXT4_QUOTA_TRANS_BLOCKS(sb)) + EXT4_MAXQUOTAS_TRANS_BLOCKS(sb)) /* * Define the number of metadata blocks we need to account to modify data. @@ -57,7 +57,7 @@ * This include super block, inode block, quota blocks and xattr blocks */ #define EXT4_META_TRANS_BLOCKS(sb) (EXT4_XATTR_TRANS_BLOCKS + \ - 2*EXT4_QUOTA_TRANS_BLOCKS(sb)) + EXT4_MAXQUOTAS_TRANS_BLOCKS(sb)) /* Delete operations potentially hit one directory's namespace plus an * entire inode, plus arbitrary amounts of bitmap/indirection data. Be @@ -92,6 +92,7 @@ * but inode, sb and group updates are done only once */ #define EXT4_QUOTA_INIT_BLOCKS(sb) (test_opt(sb, QUOTA) ? (DQUOT_INIT_ALLOC*\ (EXT4_SINGLEDATA_TRANS_BLOCKS(sb)-3)+3+DQUOT_INIT_REWRITE) : 0) + #define EXT4_QUOTA_DEL_BLOCKS(sb) (test_opt(sb, QUOTA) ? (DQUOT_DEL_ALLOC*\ (EXT4_SINGLEDATA_TRANS_BLOCKS(sb)-3)+3+DQUOT_DEL_REWRITE) : 0) #else @@ -99,6 +100,9 @@ #define EXT4_QUOTA_INIT_BLOCKS(sb) 0 #define EXT4_QUOTA_DEL_BLOCKS(sb) 0 #endif +#define EXT4_MAXQUOTAS_TRANS_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_TRANS_BLOCKS(sb)) +#define EXT4_MAXQUOTAS_INIT_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_INIT_BLOCKS(sb)) +#define EXT4_MAXQUOTAS_DEL_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_DEL_BLOCKS(sb)) int ext4_mark_iloc_dirty(handle_t *handle, @@ -161,11 +165,13 @@ int __ext4_handle_dirty_metadata(const char *where, handle_t *handle, handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks); int __ext4_journal_stop(const char *where, handle_t *handle); -#define EXT4_NOJOURNAL_HANDLE ((handle_t *) 0x1) +#define EXT4_NOJOURNAL_MAX_REF_COUNT ((unsigned long) 4096) +/* Note: Do not use this for NULL handles. This is only to determine if + * a properly allocated handle is using a journal or not. */ static inline int ext4_handle_valid(handle_t *handle) { - if (handle == EXT4_NOJOURNAL_HANDLE) + if ((unsigned long)handle < EXT4_NOJOURNAL_MAX_REF_COUNT) return 0; return 1; } @@ -252,6 +258,19 @@ static inline int ext4_jbd2_file_inode(handle_t *handle, struct inode *inode) return 0; } +static inline void ext4_update_inode_fsync_trans(handle_t *handle, + struct inode *inode, + int datasync) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + + if (ext4_handle_valid(handle)) { + ei->i_sync_tid = handle->h_transaction->t_tid; + if (datasync) + ei->i_datasync_tid = handle->h_transaction->t_tid; + } +} + /* super.c */ int ext4_force_commit(struct super_block *sb); diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 73ebfb4..24fb20b 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -93,7 +93,9 @@ static void ext4_idx_store_pblock(struct ext4_extent_idx *ix, ext4_fsblk_t pb) ix->ei_leaf_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & 0xffff); } -static int ext4_ext_journal_restart(handle_t *handle, int needed) +static int ext4_ext_truncate_extend_restart(handle_t *handle, + struct inode *inode, + int needed) { int err; @@ -104,7 +106,14 @@ static int ext4_ext_journal_restart(handle_t *handle, int needed) err = ext4_journal_extend(handle, needed); if (err <= 0) return err; - return ext4_journal_restart(handle, needed); + err = ext4_truncate_restart_trans(handle, inode, needed); + /* + * We have dropped i_data_sem so someone might have cached again + * an extent we are going to truncate. + */ + ext4_ext_invalidate_cache(inode); + + return err; } /* @@ -701,7 +710,7 @@ err: * insert new index [@logical;@ptr] into the block at @curp; * check where to insert: before @curp or after @curp */ -static int ext4_ext_insert_index(handle_t *handle, struct inode *inode, +int ext4_ext_insert_index(handle_t *handle, struct inode *inode, struct ext4_ext_path *curp, int logical, ext4_fsblk_t ptr) { @@ -1563,7 +1572,7 @@ out: */ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode, struct ext4_ext_path *path, - struct ext4_extent *newext) + struct ext4_extent *newext, int flag) { struct ext4_extent_header *eh; struct ext4_extent *ex, *fex; @@ -1579,7 +1588,8 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode, BUG_ON(path[depth].p_hdr == NULL); /* try to insert block into found extent and return */ - if (ex && ext4_can_extents_be_merged(inode, ex, newext)) { + if (ex && (flag != EXT4_GET_BLOCKS_DIO_CREATE_EXT) + && ext4_can_extents_be_merged(inode, ex, newext)) { ext_debug("append %d block to %d:%d (from %llu)\n", ext4_ext_get_actual_len(newext), le32_to_cpu(ex->ee_block), @@ -1694,7 +1704,8 @@ has_space: merge: /* try to merge extents to the right */ - ext4_ext_try_to_merge(inode, path, nearex); + if (flag != EXT4_GET_BLOCKS_DIO_CREATE_EXT) + ext4_ext_try_to_merge(inode, path, nearex); /* try to merge extents to the left */ @@ -1731,7 +1742,9 @@ int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block, while (block < last && block != EXT_MAX_BLOCK) { num = last - block; /* find extent for this block */ + down_read(&EXT4_I(inode)->i_data_sem); path = ext4_ext_find_extent(inode, block, path); + up_read(&EXT4_I(inode)->i_data_sem); if (IS_ERR(path)) { err = PTR_ERR(path); path = NULL; @@ -2044,7 +2057,7 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode, ext_debug("free last %u blocks starting %llu\n", num, start); for (i = 0; i < num; i++) { bh = sb_find_get_block(inode->i_sb, start + i); - ext4_forget(handle, 0, inode, bh, start + i); + ext4_forget(handle, metadata, inode, bh, start + i); } ext4_free_blocks(handle, inode, start, num, metadata); } else if (from == le32_to_cpu(ex->ee_block) @@ -2136,9 +2149,9 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, correct_index = 1; credits += (ext_depth(inode)) + 1; } - credits += 2 * EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb); + credits += EXT4_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb); - err = ext4_ext_journal_restart(handle, credits); + err = ext4_ext_truncate_extend_restart(handle, inode, credits); if (err) goto out; @@ -2461,7 +2474,6 @@ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex) } #define EXT4_EXT_ZERO_LEN 7 - /* * This function is called by ext4_ext_get_blocks() if someone tries to write * to an uninitialized extent. It may result in splitting the uninitialized @@ -2554,7 +2566,8 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, ex3->ee_block = cpu_to_le32(iblock); ext4_ext_store_pblock(ex3, newblock); ex3->ee_len = cpu_to_le16(allocated); - err = ext4_ext_insert_extent(handle, inode, path, ex3); + err = ext4_ext_insert_extent(handle, inode, path, + ex3, 0); if (err == -ENOSPC) { err = ext4_ext_zeroout(inode, &orig_ex); if (err) @@ -2610,7 +2623,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, ext4_ext_store_pblock(ex3, newblock + max_blocks); ex3->ee_len = cpu_to_le16(allocated - max_blocks); ext4_ext_mark_uninitialized(ex3); - err = ext4_ext_insert_extent(handle, inode, path, ex3); + err = ext4_ext_insert_extent(handle, inode, path, ex3, 0); if (err == -ENOSPC) { err = ext4_ext_zeroout(inode, &orig_ex); if (err) @@ -2728,7 +2741,191 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, err = ext4_ext_dirty(handle, inode, path + depth); goto out; insert: - err = ext4_ext_insert_extent(handle, inode, path, &newex); + err = ext4_ext_insert_extent(handle, inode, path, &newex, 0); + if (err == -ENOSPC) { + err = ext4_ext_zeroout(inode, &orig_ex); + if (err) + goto fix_extent_len; + /* update the extent length and mark as initialized */ + ex->ee_block = orig_ex.ee_block; + ex->ee_len = orig_ex.ee_len; + ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); + ext4_ext_dirty(handle, inode, path + depth); + /* zero out the first half */ + return allocated; + } else if (err) + goto fix_extent_len; +out: + return err ? err : allocated; + +fix_extent_len: + ex->ee_block = orig_ex.ee_block; + ex->ee_len = orig_ex.ee_len; + ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); + ext4_ext_mark_uninitialized(ex); + ext4_ext_dirty(handle, inode, path + depth); + return err; +} + +/* + * This function is called by ext4_ext_get_blocks() from + * ext4_get_blocks_dio_write() when DIO to write + * to an uninitialized extent. + * + * Writing to an uninitized extent may result in splitting the uninitialized + * extent into multiple /intialized unintialized extents (up to three) + * There are three possibilities: + * a> There is no split required: Entire extent should be uninitialized + * b> Splits in two extents: Write is happening at either end of the extent + * c> Splits in three extents: Somone is writing in middle of the extent + * + * One of more index blocks maybe needed if the extent tree grow after + * the unintialized extent split. To prevent ENOSPC occur at the IO + * complete, we need to split the uninitialized extent before DIO submit + * the IO. The uninitilized extent called at this time will be split + * into three uninitialized extent(at most). After IO complete, the part + * being filled will be convert to initialized by the end_io callback function + * via ext4_convert_unwritten_extents(). + * + * Returns the size of uninitialized extent to be written on success. + */ +static int ext4_split_unwritten_extents(handle_t *handle, + struct inode *inode, + struct ext4_ext_path *path, + ext4_lblk_t iblock, + unsigned int max_blocks, + int flags) +{ + struct ext4_extent *ex, newex, orig_ex; + struct ext4_extent *ex1 = NULL; + struct ext4_extent *ex2 = NULL; + struct ext4_extent *ex3 = NULL; + struct ext4_extent_header *eh; + ext4_lblk_t ee_block; + unsigned int allocated, ee_len, depth; + ext4_fsblk_t newblock; + int err = 0; + + ext_debug("ext4_split_unwritten_extents: inode %lu," + "iblock %llu, max_blocks %u\n", inode->i_ino, + (unsigned long long)iblock, max_blocks); + depth = ext_depth(inode); + eh = path[depth].p_hdr; + ex = path[depth].p_ext; + ee_block = le32_to_cpu(ex->ee_block); + ee_len = ext4_ext_get_actual_len(ex); + allocated = ee_len - (iblock - ee_block); + newblock = iblock - ee_block + ext_pblock(ex); + ex2 = ex; + orig_ex.ee_block = ex->ee_block; + orig_ex.ee_len = cpu_to_le16(ee_len); + ext4_ext_store_pblock(&orig_ex, ext_pblock(ex)); + + /* + * If the uninitialized extent begins at the same logical + * block where the write begins, and the write completely + * covers the extent, then we don't need to split it. + */ + if ((iblock == ee_block) && (allocated <= max_blocks)) + return allocated; + + err = ext4_ext_get_access(handle, inode, path + depth); + if (err) + goto out; + /* ex1: ee_block to iblock - 1 : uninitialized */ + if (iblock > ee_block) { + ex1 = ex; + ex1->ee_len = cpu_to_le16(iblock - ee_block); + ext4_ext_mark_uninitialized(ex1); + ex2 = &newex; + } + /* + * for sanity, update the length of the ex2 extent before + * we insert ex3, if ex1 is NULL. This is to avoid temporary + * overlap of blocks. + */ + if (!ex1 && allocated > max_blocks) + ex2->ee_len = cpu_to_le16(max_blocks); + /* ex3: to ee_block + ee_len : uninitialised */ + if (allocated > max_blocks) { + unsigned int newdepth; + ex3 = &newex; + ex3->ee_block = cpu_to_le32(iblock + max_blocks); + ext4_ext_store_pblock(ex3, newblock + max_blocks); + ex3->ee_len = cpu_to_le16(allocated - max_blocks); + ext4_ext_mark_uninitialized(ex3); + err = ext4_ext_insert_extent(handle, inode, path, ex3, flags); + if (err == -ENOSPC) { + err = ext4_ext_zeroout(inode, &orig_ex); + if (err) + goto fix_extent_len; + /* update the extent length and mark as initialized */ + ex->ee_block = orig_ex.ee_block; + ex->ee_len = orig_ex.ee_len; + ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); + ext4_ext_dirty(handle, inode, path + depth); + /* zeroed the full extent */ + /* blocks available from iblock */ + return allocated; + + } else if (err) + goto fix_extent_len; + /* + * The depth, and hence eh & ex might change + * as part of the insert above. + */ + newdepth = ext_depth(inode); + /* + * update the extent length after successful insert of the + * split extent + */ + orig_ex.ee_len = cpu_to_le16(ee_len - + ext4_ext_get_actual_len(ex3)); + depth = newdepth; + ext4_ext_drop_refs(path); + path = ext4_ext_find_extent(inode, iblock, path); + if (IS_ERR(path)) { + err = PTR_ERR(path); + goto out; + } + eh = path[depth].p_hdr; + ex = path[depth].p_ext; + if (ex2 != &newex) + ex2 = ex; + + err = ext4_ext_get_access(handle, inode, path + depth); + if (err) + goto out; + + allocated = max_blocks; + } + /* + * If there was a change of depth as part of the + * insertion of ex3 above, we need to update the length + * of the ex1 extent again here + */ + if (ex1 && ex1 != ex) { + ex1 = ex; + ex1->ee_len = cpu_to_le16(iblock - ee_block); + ext4_ext_mark_uninitialized(ex1); + ex2 = &newex; + } + /* + * ex2: iblock to iblock + maxblocks-1 : to be direct IO written, + * uninitialised still. + */ + ex2->ee_block = cpu_to_le32(iblock); + ext4_ext_store_pblock(ex2, newblock); + ex2->ee_len = cpu_to_le16(allocated); + ext4_ext_mark_uninitialized(ex2); + if (ex2 != ex) + goto insert; + /* Mark modified extent as dirty */ + err = ext4_ext_dirty(handle, inode, path + depth); + ext_debug("out here\n"); + goto out; +insert: + err = ext4_ext_insert_extent(handle, inode, path, &newex, flags); if (err == -ENOSPC) { err = ext4_ext_zeroout(inode, &orig_ex); if (err) @@ -2743,6 +2940,7 @@ insert: } else if (err) goto fix_extent_len; out: + ext4_ext_show_leaf(inode, path); return err ? err : allocated; fix_extent_len: @@ -2753,7 +2951,151 @@ fix_extent_len: ext4_ext_dirty(handle, inode, path + depth); return err; } +static int ext4_convert_unwritten_extents_dio(handle_t *handle, + struct inode *inode, + struct ext4_ext_path *path) +{ + struct ext4_extent *ex; + struct ext4_extent_header *eh; + int depth; + int err = 0; + int ret = 0; + + depth = ext_depth(inode); + eh = path[depth].p_hdr; + ex = path[depth].p_ext; + + err = ext4_ext_get_access(handle, inode, path + depth); + if (err) + goto out; + /* first mark the extent as initialized */ + ext4_ext_mark_initialized(ex); + + /* + * We have to see if it can be merged with the extent + * on the left. + */ + if (ex > EXT_FIRST_EXTENT(eh)) { + /* + * To merge left, pass "ex - 1" to try_to_merge(), + * since it merges towards right _only_. + */ + ret = ext4_ext_try_to_merge(inode, path, ex - 1); + if (ret) { + err = ext4_ext_correct_indexes(handle, inode, path); + if (err) + goto out; + depth = ext_depth(inode); + ex--; + } + } + /* + * Try to Merge towards right. + */ + ret = ext4_ext_try_to_merge(inode, path, ex); + if (ret) { + err = ext4_ext_correct_indexes(handle, inode, path); + if (err) + goto out; + depth = ext_depth(inode); + } + /* Mark modified extent as dirty */ + err = ext4_ext_dirty(handle, inode, path + depth); +out: + ext4_ext_show_leaf(inode, path); + return err; +} + +static int +ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, + ext4_lblk_t iblock, unsigned int max_blocks, + struct ext4_ext_path *path, int flags, + unsigned int allocated, struct buffer_head *bh_result, + ext4_fsblk_t newblock) +{ + int ret = 0; + int err = 0; + ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio; + + ext_debug("ext4_ext_handle_uninitialized_extents: inode %lu, logical" + "block %llu, max_blocks %u, flags %d, allocated %u", + inode->i_ino, (unsigned long long)iblock, max_blocks, + flags, allocated); + ext4_ext_show_leaf(inode, path); + /* DIO get_block() before submit the IO, split the extent */ + if (flags == EXT4_GET_BLOCKS_DIO_CREATE_EXT) { + ret = ext4_split_unwritten_extents(handle, + inode, path, iblock, + max_blocks, flags); + /* + * Flag the inode(non aio case) or end_io struct (aio case) + * that this IO needs to convertion to written when IO is + * completed + */ + if (io) + io->flag = DIO_AIO_UNWRITTEN; + else + EXT4_I(inode)->i_state |= EXT4_STATE_DIO_UNWRITTEN; + goto out; + } + /* async DIO end_io complete, convert the filled extent to written */ + if (flags == EXT4_GET_BLOCKS_DIO_CONVERT_EXT) { + ret = ext4_convert_unwritten_extents_dio(handle, inode, + path); + if (ret >= 0) + ext4_update_inode_fsync_trans(handle, inode, 1); + goto out2; + } + /* buffered IO case */ + /* + * repeat fallocate creation request + * we already have an unwritten extent + */ + if (flags & EXT4_GET_BLOCKS_UNINIT_EXT) + goto map_out; + + /* buffered READ or buffered write_begin() lookup */ + if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) { + /* + * We have blocks reserved already. We + * return allocated blocks so that delalloc + * won't do block reservation for us. But + * the buffer head will be unmapped so that + * a read from the block returns 0s. + */ + set_buffer_unwritten(bh_result); + goto out1; + } + + /* buffered write, writepage time, convert*/ + ret = ext4_ext_convert_to_initialized(handle, inode, + path, iblock, + max_blocks); + if (ret >= 0) + ext4_update_inode_fsync_trans(handle, inode, 1); +out: + if (ret <= 0) { + err = ret; + goto out2; + } else + allocated = ret; + set_buffer_new(bh_result); +map_out: + set_buffer_mapped(bh_result); +out1: + if (allocated > max_blocks) + allocated = max_blocks; + ext4_ext_show_leaf(inode, path); + bh_result->b_bdev = inode->i_sb->s_bdev; + bh_result->b_blocknr = newblock; +out2: + if (path) { + ext4_ext_drop_refs(path); + kfree(path); + } + return err ? err : allocated; +} /* * Block allocation/map/preallocation routine for extents based files * @@ -2784,6 +3126,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, int err = 0, depth, ret, cache_type; unsigned int allocated = 0; struct ext4_allocation_request ar; + ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio; __clear_bit(BH_New, &bh_result->b_state); ext_debug("blocks %u/%u requested for inode %u\n", @@ -2859,33 +3202,10 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, EXT4_EXT_CACHE_EXTENT); goto out; } - if (flags & EXT4_GET_BLOCKS_UNINIT_EXT) - goto out; - if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) { - if (allocated > max_blocks) - allocated = max_blocks; - /* - * We have blocks reserved already. We - * return allocated blocks so that delalloc - * won't do block reservation for us. But - * the buffer head will be unmapped so that - * a read from the block returns 0s. - */ - set_buffer_unwritten(bh_result); - bh_result->b_bdev = inode->i_sb->s_bdev; - bh_result->b_blocknr = newblock; - goto out2; - } - - ret = ext4_ext_convert_to_initialized(handle, inode, - path, iblock, - max_blocks); - if (ret <= 0) { - err = ret; - goto out2; - } else - allocated = ret; - goto outnew; + ret = ext4_ext_handle_uninitialized_extents(handle, + inode, iblock, max_blocks, path, + flags, allocated, bh_result, newblock); + return ret; } } @@ -2956,9 +3276,27 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, /* try to insert new extent into found leaf and return */ ext4_ext_store_pblock(&newex, newblock); newex.ee_len = cpu_to_le16(ar.len); - if (flags & EXT4_GET_BLOCKS_UNINIT_EXT) /* Mark uninitialized */ + /* Mark uninitialized */ + if (flags & EXT4_GET_BLOCKS_UNINIT_EXT){ ext4_ext_mark_uninitialized(&newex); - err = ext4_ext_insert_extent(handle, inode, path, &newex); + /* + * io_end structure was created for every async + * direct IO write to the middle of the file. + * To avoid unecessary convertion for every aio dio rewrite + * to the mid of file, here we flag the IO that is really + * need the convertion. + * For non asycn direct IO case, flag the inode state + * that we need to perform convertion when IO is done. + */ + if (flags == EXT4_GET_BLOCKS_DIO_CREATE_EXT) { + if (io) + io->flag = DIO_AIO_UNWRITTEN; + else + EXT4_I(inode)->i_state |= + EXT4_STATE_DIO_UNWRITTEN;; + } + } + err = ext4_ext_insert_extent(handle, inode, path, &newex, flags); if (err) { /* free data blocks we just allocated */ /* not a good idea to call discard here directly, @@ -2972,13 +3310,18 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, /* previous routine could use block we allocated */ newblock = ext_pblock(&newex); allocated = ext4_ext_get_actual_len(&newex); -outnew: set_buffer_new(bh_result); - /* Cache only when it is _not_ an uninitialized extent */ - if ((flags & EXT4_GET_BLOCKS_UNINIT_EXT) == 0) + /* + * Cache the extent and update transaction to commit on fdatasync only + * when it is _not_ an uninitialized extent. + */ + if ((flags & EXT4_GET_BLOCKS_UNINIT_EXT) == 0) { ext4_ext_put_in_cache(inode, iblock, allocated, newblock, EXT4_EXT_CACHE_EXTENT); + ext4_update_inode_fsync_trans(handle, inode, 1); + } else + ext4_update_inode_fsync_trans(handle, inode, 0); out: if (allocated > max_blocks) allocated = max_blocks; @@ -3171,6 +3514,64 @@ retry: } /* + * This function convert a range of blocks to written extents + * The caller of this function will pass the start offset and the size. + * all unwritten extents within this range will be converted to + * written extents. + * + * This function is called from the direct IO end io call back + * function, to convert the fallocated extents after IO is completed. + * Returns 0 on success. + */ +int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset, + loff_t len) +{ + handle_t *handle; + ext4_lblk_t block; + unsigned int max_blocks; + int ret = 0; + int ret2 = 0; + struct buffer_head map_bh; + unsigned int credits, blkbits = inode->i_blkbits; + + block = offset >> blkbits; + /* + * We can't just convert len to max_blocks because + * If blocksize = 4096 offset = 3072 and len = 2048 + */ + max_blocks = (EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits) + - block; + /* + * credits to insert 1 extent into extent tree + */ + credits = ext4_chunk_trans_blocks(inode, max_blocks); + while (ret >= 0 && ret < max_blocks) { + block = block + ret; + max_blocks = max_blocks - ret; + handle = ext4_journal_start(inode, credits); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + break; + } + map_bh.b_state = 0; + ret = ext4_get_blocks(handle, inode, block, + max_blocks, &map_bh, + EXT4_GET_BLOCKS_DIO_CONVERT_EXT); + if (ret <= 0) { + WARN_ON(ret <= 0); + printk(KERN_ERR "%s: ext4_ext_get_blocks " + "returned error inode#%lu, block=%u, " + "max_blocks=%u", __func__, + inode->i_ino, block, max_blocks); + } + ext4_mark_inode_dirty(handle, inode); + ret2 = ext4_journal_stop(handle); + if (ret <= 0 || ret2 ) + break; + } + return ret > 0 ? ret2 : ret; +} +/* * Callback function called for each extent to gather FIEMAP information. */ static int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path, @@ -3308,10 +3709,8 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, * Walk the extent tree gathering extent information. * ext4_ext_fiemap_cb will push extents back to user. */ - down_read(&EXT4_I(inode)->i_data_sem); error = ext4_ext_walk_space(inode, start_blk, len_blks, ext4_ext_fiemap_cb, fieinfo); - up_read(&EXT4_I(inode)->i_data_sem); } return error; diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c index 83cf641..d6049e4 100644 --- a/fs/ext4/fsync.c +++ b/fs/ext4/fsync.c @@ -44,27 +44,37 @@ * * What we do is just kick off a commit and wait on it. This will snapshot the * inode to disk. + * + * i_mutex lock is held when entering and exiting this function */ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync) { struct inode *inode = dentry->d_inode; + struct ext4_inode_info *ei = EXT4_I(inode); journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; - int ret = 0; + int ret; + tid_t commit_tid; J_ASSERT(ext4_journal_current_handle() == NULL); trace_ext4_sync_file(file, dentry, datasync); + if (inode->i_sb->s_flags & MS_RDONLY) + return 0; + + ret = flush_aio_dio_completed_IO(inode); + if (ret < 0) + return ret; + + if (!journal) + return simple_fsync(file, dentry, datasync); + /* - * data=writeback: + * data=writeback,ordered: * The caller's filemap_fdatawrite()/wait will sync the data. - * sync_inode() will sync the metadata - * - * data=ordered: - * The caller's filemap_fdatawrite() will write the data and - * sync_inode() will write the inode if it is dirty. Then the caller's - * filemap_fdatawait() will wait on the pages. + * Metadata is in the journal, we wait for proper transaction to + * commit here. * * data=journal: * filemap_fdatawrite won't do anything (the buffers are clean). @@ -74,27 +84,13 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync) * (they were dirtied by commit). But that's OK - the blocks are * safe in-journal, which is all fsync() needs to ensure. */ - if (ext4_should_journal_data(inode)) { - ret = ext4_force_commit(inode->i_sb); - goto out; - } - - if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) - goto out; + if (ext4_should_journal_data(inode)) + return ext4_force_commit(inode->i_sb); - /* - * The VFS has written the file data. If the inode is unaltered - * then we need not start a commit. - */ - if (inode->i_state & (I_DIRTY_SYNC|I_DIRTY_DATASYNC)) { - struct writeback_control wbc = { - .sync_mode = WB_SYNC_ALL, - .nr_to_write = 0, /* sys_fsync did this */ - }; - ret = sync_inode(inode, &wbc); - if (journal && (journal->j_flags & JBD2_BARRIER)) - blkdev_issue_flush(inode->i_sb->s_bdev, NULL); - } -out: + commit_tid = datasync ? ei->i_datasync_tid : ei->i_sync_tid; + if (jbd2_log_start_commit(journal, commit_tid)) + jbd2_log_wait_commit(journal, commit_tid); + else if (journal->j_flags & JBD2_BARRIER) + blkdev_issue_flush(inode->i_sb->s_bdev, NULL); return ret; } diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index f9c642b..38b2154 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -37,6 +37,7 @@ #include #include #include +#include #include "ext4_jbd2.h" #include "xattr.h" @@ -192,11 +193,25 @@ static int try_to_extend_transaction(handle_t *handle, struct inode *inode) * so before we call here everything must be consistently dirtied against * this transaction. */ -static int ext4_journal_test_restart(handle_t *handle, struct inode *inode) +int ext4_truncate_restart_trans(handle_t *handle, struct inode *inode, + int nblocks) { + int ret; + + /* + * Drop i_data_sem to avoid deadlock with ext4_get_blocks At this + * moment, get_block can be called only for blocks inside i_size since + * page cache has been already dropped and writes are blocked by + * i_mutex. So we can safely drop the i_data_sem here. + */ BUG_ON(EXT4_JOURNAL(inode) == NULL); jbd_debug(2, "restarting handle %p\n", handle); - return ext4_journal_restart(handle, blocks_for_truncate(inode)); + up_write(&EXT4_I(inode)->i_data_sem); + ret = ext4_journal_restart(handle, blocks_for_truncate(inode)); + down_write(&EXT4_I(inode)->i_data_sem); + ext4_discard_preallocations(inode); + + return ret; } /* @@ -551,15 +566,21 @@ static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind) * * Normally this function find the preferred place for block allocation, * returns it. + * Because this is only used for non-extent files, we limit the block nr + * to 32 bits. */ static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block, Indirect *partial) { + ext4_fsblk_t goal; + /* * XXX need to get goal block from mballoc's data structures */ - return ext4_find_near(inode, partial); + goal = ext4_find_near(inode, partial); + goal = goal & EXT4_MAX_BLOCK_FILE_PHYS; + return goal; } /** @@ -640,6 +661,8 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode, if (*err) goto failed_out; + BUG_ON(current_block + count > EXT4_MAX_BLOCK_FILE_PHYS); + target -= count; /* allocate blocks for indirect blocks */ while (index < indirect_blks && count) { @@ -674,6 +697,7 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode, ar.flags = EXT4_MB_HINT_DATA; current_block = ext4_mb_new_blocks(handle, &ar, err); + BUG_ON(current_block + ar.len > EXT4_MAX_BLOCK_FILE_PHYS); if (*err && (target == blks)) { /* @@ -998,10 +1022,12 @@ static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode, if (!err) err = ext4_splice_branch(handle, inode, iblock, partial, indirect_blks, count); - else + if (err) goto cleanup; set_buffer_new(bh_result); + + ext4_update_inode_fsync_trans(handle, inode, 1); got_it: map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-1].key)); if (count > blocks_to_boundary) @@ -1029,7 +1055,7 @@ qsize_t ext4_get_reserved_space(struct inode *inode) EXT4_I(inode)->i_reserved_meta_blocks; spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); - return total; + return (total << inode->i_blkbits); } /* * Calculate the number of metadata blocks need to reserve @@ -1109,22 +1135,79 @@ static void ext4_da_update_reserve_space(struct inode *inode, int used) ext4_discard_preallocations(inode); } -static int check_block_validity(struct inode *inode, sector_t logical, - sector_t phys, int len) +static int check_block_validity(struct inode *inode, const char *msg, + sector_t logical, sector_t phys, int len) { if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), phys, len)) { - ext4_error(inode->i_sb, "check_block_validity", + ext4_error(inode->i_sb, msg, "inode #%lu logical block %llu mapped to %llu " "(size %d)", inode->i_ino, (unsigned long long) logical, (unsigned long long) phys, len); - WARN_ON(1); return -EIO; } return 0; } /* + * Return the number of contiguous dirty pages in a given inode + * starting at page frame idx. + */ +static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx, + unsigned int max_pages) +{ + struct address_space *mapping = inode->i_mapping; + pgoff_t index; + struct pagevec pvec; + pgoff_t num = 0; + int i, nr_pages, done = 0; + + if (max_pages == 0) + return 0; + pagevec_init(&pvec, 0); + while (!done) { + index = idx; + nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, + PAGECACHE_TAG_DIRTY, + (pgoff_t)PAGEVEC_SIZE); + if (nr_pages == 0) + break; + for (i = 0; i < nr_pages; i++) { + struct page *page = pvec.pages[i]; + struct buffer_head *bh, *head; + + lock_page(page); + if (unlikely(page->mapping != mapping) || + !PageDirty(page) || + PageWriteback(page) || + page->index != idx) { + done = 1; + unlock_page(page); + break; + } + if (page_has_buffers(page)) { + bh = head = page_buffers(page); + do { + if (!buffer_delay(bh) && + !buffer_unwritten(bh)) + done = 1; + bh = bh->b_this_page; + } while (!done && (bh != head)); + } + unlock_page(page); + if (done) + break; + idx++; + num++; + if (num >= max_pages) + break; + } + pagevec_release(&pvec); + } + return num; +} + +/* * The ext4_get_blocks() function tries to look up the requested blocks, * and returns if the blocks are already mapped. * @@ -1155,6 +1238,9 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block, clear_buffer_mapped(bh); clear_buffer_unwritten(bh); + ext_debug("ext4_get_blocks(): inode %lu, flag %d, max_blocks %u," + "logical block %lu\n", inode->i_ino, flags, max_blocks, + (unsigned long)block); /* * Try to see if we can get the block without requesting a new * file system block. @@ -1170,8 +1256,8 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block, up_read((&EXT4_I(inode)->i_data_sem)); if (retval > 0 && buffer_mapped(bh)) { - int ret = check_block_validity(inode, block, - bh->b_blocknr, retval); + int ret = check_block_validity(inode, "file system corruption", + block, bh->b_blocknr, retval); if (ret != 0) return ret; } @@ -1235,8 +1321,7 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block, * i_data's format changing. Force the migrate * to fail by clearing migrate flags */ - EXT4_I(inode)->i_flags = EXT4_I(inode)->i_flags & - ~EXT4_EXT_MIGRATE; + EXT4_I(inode)->i_state &= ~EXT4_STATE_EXT_MIGRATE; } } @@ -1252,8 +1337,9 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block, up_write((&EXT4_I(inode)->i_data_sem)); if (retval > 0 && buffer_mapped(bh)) { - int ret = check_block_validity(inode, block, - bh->b_blocknr, retval); + int ret = check_block_validity(inode, "file system " + "corruption after allocation", + block, bh->b_blocknr, retval); if (ret != 0) return ret; } @@ -1451,6 +1537,16 @@ static int do_journal_get_write_access(handle_t *handle, return ext4_journal_get_write_access(handle, bh); } +/* + * Truncate blocks that were not used by write. We have to truncate the + * pagecache as well so that corresponding buffers get properly unmapped. + */ +static void ext4_truncate_failed_write(struct inode *inode) +{ + truncate_inode_pages(inode->i_mapping, inode->i_size); + ext4_truncate(inode); +} + static int ext4_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata) @@ -1516,7 +1612,7 @@ retry: ext4_journal_stop(handle); if (pos + len > inode->i_size) { - ext4_truncate(inode); + ext4_truncate_failed_write(inode); /* * If truncate failed early the inode might * still be on the orphan list; we need to @@ -1626,7 +1722,7 @@ static int ext4_ordered_write_end(struct file *file, ret = ret2; if (pos + len > inode->i_size) { - ext4_truncate(inode); + ext4_truncate_failed_write(inode); /* * If truncate failed early the inode might still be * on the orphan list; we need to make sure the inode @@ -1668,7 +1764,7 @@ static int ext4_writeback_write_end(struct file *file, ret = ret2; if (pos + len > inode->i_size) { - ext4_truncate(inode); + ext4_truncate_failed_write(inode); /* * If truncate failed early the inode might still be * on the orphan list; we need to make sure the inode @@ -1731,7 +1827,7 @@ static int ext4_journalled_write_end(struct file *file, if (!ret) ret = ret2; if (pos + len > inode->i_size) { - ext4_truncate(inode); + ext4_truncate_failed_write(inode); /* * If truncate failed early the inode might still be * on the orphan list; we need to make sure the inode @@ -1776,11 +1872,11 @@ repeat: if (ext4_claim_free_blocks(sbi, total)) { spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); + vfs_dq_release_reservation_block(inode, total); if (ext4_should_retry_alloc(inode->i_sb, &retries)) { yield(); goto repeat; } - vfs_dq_release_reservation_block(inode, total); return -ENOSPC; } EXT4_I(inode)->i_reserved_data_blocks += nrblocks; @@ -1860,22 +1956,6 @@ static void ext4_da_page_release_reservation(struct page *page, } /* - * Delayed allocation stuff - */ - -struct mpage_da_data { - struct inode *inode; - sector_t b_blocknr; /* start block number of extent */ - size_t b_size; /* size of extent */ - unsigned long b_state; /* state of the extent */ - unsigned long first_page, next_page; /* extent of pages */ - struct writeback_control *wbc; - int io_done; - int pages_written; - int retval; -}; - -/* * mpage_da_submit_io - walks through extent of pages and try to write * them with writepage() call back * @@ -2717,7 +2797,7 @@ static int ext4_da_writepages_trans_blocks(struct inode *inode) * number of contiguous block. So we will limit * number of contiguous block to a sane value */ - if (!(inode->i_flags & EXT4_EXTENTS_FL) && + if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) && (max_blocks > EXT4_MAX_TRANS_DATA)) max_blocks = EXT4_MAX_TRANS_DATA; @@ -2735,8 +2815,11 @@ static int ext4_da_writepages(struct address_space *mapping, int no_nrwrite_index_update; int pages_written = 0; long pages_skipped; + unsigned int max_pages; int range_cyclic, cycled = 1, io_done = 0; - int needed_blocks, ret = 0, nr_to_writebump = 0; + int needed_blocks, ret = 0; + long desired_nr_to_write, nr_to_writebump = 0; + loff_t range_start = wbc->range_start; struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb); trace_ext4_da_writepages(inode, wbc); @@ -2762,16 +2845,6 @@ static int ext4_da_writepages(struct address_space *mapping, if (unlikely(sbi->s_mount_flags & EXT4_MF_FS_ABORTED)) return -EROFS; - /* - * Make sure nr_to_write is >= sbi->s_mb_stream_request - * This make sure small files blocks are allocated in - * single attempt. This ensure that small files - * get less fragmented. - */ - if (wbc->nr_to_write < sbi->s_mb_stream_request) { - nr_to_writebump = sbi->s_mb_stream_request - wbc->nr_to_write; - wbc->nr_to_write = sbi->s_mb_stream_request; - } if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) range_whole = 1; @@ -2786,6 +2859,36 @@ static int ext4_da_writepages(struct address_space *mapping, } else index = wbc->range_start >> PAGE_CACHE_SHIFT; + /* + * This works around two forms of stupidity. The first is in + * the writeback code, which caps the maximum number of pages + * written to be 1024 pages. This is wrong on multiple + * levels; different architectues have a different page size, + * which changes the maximum amount of data which gets + * written. Secondly, 4 megabytes is way too small. XFS + * forces this value to be 16 megabytes by multiplying + * nr_to_write parameter by four, and then relies on its + * allocator to allocate larger extents to make them + * contiguous. Unfortunately this brings us to the second + * stupidity, which is that ext4's mballoc code only allocates + * at most 2048 blocks. So we force contiguous writes up to + * the number of dirty blocks in the inode, or + * sbi->max_writeback_mb_bump whichever is smaller. + */ + max_pages = sbi->s_max_writeback_mb_bump << (20 - PAGE_CACHE_SHIFT); + if (!range_cyclic && range_whole) + desired_nr_to_write = wbc->nr_to_write * 8; + else + desired_nr_to_write = ext4_num_dirty_pages(inode, index, + max_pages); + if (desired_nr_to_write > max_pages) + desired_nr_to_write = max_pages; + + if (wbc->nr_to_write < desired_nr_to_write) { + nr_to_writebump = desired_nr_to_write - wbc->nr_to_write; + wbc->nr_to_write = desired_nr_to_write; + } + mpd.wbc = wbc; mpd.inode = mapping->host; @@ -2904,7 +3007,9 @@ retry: out_writepages: if (!no_nrwrite_index_update) wbc->no_nrwrite_index_update = 0; - wbc->nr_to_write -= nr_to_writebump; + if (wbc->nr_to_write > nr_to_writebump) + wbc->nr_to_write -= nr_to_writebump; + wbc->range_start = range_start; trace_ext4_da_writepages_result(inode, wbc, ret, pages_written); return ret; } @@ -2994,7 +3099,7 @@ retry: * i_size_read because we hold i_mutex. */ if (pos + len > inode->i_size) - ext4_truncate(inode); + ext4_truncate_failed_write(inode); } if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) @@ -3259,6 +3364,8 @@ static int ext4_releasepage(struct page *page, gfp_t wait) } /* + * O_DIRECT for ext3 (or indirect map) based files + * * If the O_DIRECT write will extend the file then add this inode to the * orphan list. So recovery will truncate it back to the original size * if the machine crashes during the write. @@ -3267,7 +3374,7 @@ static int ext4_releasepage(struct page *page, gfp_t wait) * crashes then stale disk data _may_ be exposed inside the file. But current * VFS code falls back into buffered path in that case so we are safe. */ -static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb, +static ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, loff_t offset, unsigned long nr_segs) { @@ -3278,6 +3385,7 @@ static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb, ssize_t ret; int orphan = 0; size_t count = iov_length(iov, nr_segs); + int retries = 0; if (rw == WRITE) { loff_t final_size = offset + count; @@ -3300,9 +3408,12 @@ static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb, } } +retry: ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, offset, nr_segs, ext4_get_block, NULL); + if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) + goto retry; if (orphan) { int err; @@ -3341,6 +3452,364 @@ out: return ret; } +static int ext4_get_block_dio_write(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) +{ + handle_t *handle = NULL; + int ret = 0; + unsigned max_blocks = bh_result->b_size >> inode->i_blkbits; + int dio_credits; + + ext4_debug("ext4_get_block_dio_write: inode %lu, create flag %d\n", + inode->i_ino, create); + /* + * DIO VFS code passes create = 0 flag for write to + * the middle of file. It does this to avoid block + * allocation for holes, to prevent expose stale data + * out when there is parallel buffered read (which does + * not hold the i_mutex lock) while direct IO write has + * not completed. DIO request on holes finally falls back + * to buffered IO for this reason. + * + * For ext4 extent based file, since we support fallocate, + * new allocated extent as uninitialized, for holes, we + * could fallocate blocks for holes, thus parallel + * buffered IO read will zero out the page when read on + * a hole while parallel DIO write to the hole has not completed. + * + * when we come here, we know it's a direct IO write to + * to the middle of file ( DIO_MAX_BLOCKS) + max_blocks = DIO_MAX_BLOCKS; + dio_credits = ext4_chunk_trans_blocks(inode, max_blocks); + handle = ext4_journal_start(inode, dio_credits); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + goto out; + } + ret = ext4_get_blocks(handle, inode, iblock, max_blocks, bh_result, + create); + if (ret > 0) { + bh_result->b_size = (ret << inode->i_blkbits); + ret = 0; + } + ext4_journal_stop(handle); +out: + return ret; +} + +static void ext4_free_io_end(ext4_io_end_t *io) +{ + BUG_ON(!io); + iput(io->inode); + kfree(io); +} +static void dump_aio_dio_list(struct inode * inode) +{ +#ifdef EXT4_DEBUG + struct list_head *cur, *before, *after; + ext4_io_end_t *io, *io0, *io1; + + if (list_empty(&EXT4_I(inode)->i_aio_dio_complete_list)){ + ext4_debug("inode %lu aio dio list is empty\n", inode->i_ino); + return; + } + + ext4_debug("Dump inode %lu aio_dio_completed_IO list \n", inode->i_ino); + list_for_each_entry(io, &EXT4_I(inode)->i_aio_dio_complete_list, list){ + cur = &io->list; + before = cur->prev; + io0 = container_of(before, ext4_io_end_t, list); + after = cur->next; + io1 = container_of(after, ext4_io_end_t, list); + + ext4_debug("io 0x%p from inode %lu,prev 0x%p,next 0x%p\n", + io, inode->i_ino, io0, io1); + } +#endif +} + +/* + * check a range of space and convert unwritten extents to written. + */ +static int ext4_end_aio_dio_nolock(ext4_io_end_t *io) +{ + struct inode *inode = io->inode; + loff_t offset = io->offset; + size_t size = io->size; + int ret = 0; + + ext4_debug("end_aio_dio_onlock: io 0x%p from inode %lu,list->next 0x%p," + "list->prev 0x%p\n", + io, inode->i_ino, io->list.next, io->list.prev); + + if (list_empty(&io->list)) + return ret; + + if (io->flag != DIO_AIO_UNWRITTEN) + return ret; + + if (offset + size <= i_size_read(inode)) + ret = ext4_convert_unwritten_extents(inode, offset, size); + + if (ret < 0) { + printk(KERN_EMERG "%s: failed to convert unwritten" + "extents to written extents, error is %d" + " io is still on inode %lu aio dio list\n", + __func__, ret, inode->i_ino); + return ret; + } + + /* clear the DIO AIO unwritten flag */ + io->flag = 0; + return ret; +} +/* + * work on completed aio dio IO, to convert unwritten extents to extents + */ +static void ext4_end_aio_dio_work(struct work_struct *work) +{ + ext4_io_end_t *io = container_of(work, ext4_io_end_t, work); + struct inode *inode = io->inode; + int ret = 0; + + mutex_lock(&inode->i_mutex); + ret = ext4_end_aio_dio_nolock(io); + if (ret >= 0) { + if (!list_empty(&io->list)) + list_del_init(&io->list); + ext4_free_io_end(io); + } + mutex_unlock(&inode->i_mutex); +} +/* + * This function is called from ext4_sync_file(). + * + * When AIO DIO IO is completed, the work to convert unwritten + * extents to written is queued on workqueue but may not get immediately + * scheduled. When fsync is called, we need to ensure the + * conversion is complete before fsync returns. + * The inode keeps track of a list of completed AIO from DIO path + * that might needs to do the conversion. This function walks through + * the list and convert the related unwritten extents to written. + */ +int flush_aio_dio_completed_IO(struct inode *inode) +{ + ext4_io_end_t *io; + int ret = 0; + int ret2 = 0; + + if (list_empty(&EXT4_I(inode)->i_aio_dio_complete_list)) + return ret; + + dump_aio_dio_list(inode); + while (!list_empty(&EXT4_I(inode)->i_aio_dio_complete_list)){ + io = list_entry(EXT4_I(inode)->i_aio_dio_complete_list.next, + ext4_io_end_t, list); + /* + * Calling ext4_end_aio_dio_nolock() to convert completed + * IO to written. + * + * When ext4_sync_file() is called, run_queue() may already + * about to flush the work corresponding to this io structure. + * It will be upset if it founds the io structure related + * to the work-to-be schedule is freed. + * + * Thus we need to keep the io structure still valid here after + * convertion finished. The io structure has a flag to + * avoid double converting from both fsync and background work + * queue work. + */ + ret = ext4_end_aio_dio_nolock(io); + if (ret < 0) + ret2 = ret; + else + list_del_init(&io->list); + } + return (ret2 < 0) ? ret2 : 0; +} + +static ext4_io_end_t *ext4_init_io_end (struct inode *inode) +{ + ext4_io_end_t *io = NULL; + + io = kmalloc(sizeof(*io), GFP_NOFS); + + if (io) { + igrab(inode); + io->inode = inode; + io->flag = 0; + io->offset = 0; + io->size = 0; + io->error = 0; + INIT_WORK(&io->work, ext4_end_aio_dio_work); + INIT_LIST_HEAD(&io->list); + } + + return io; +} + +static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, + ssize_t size, void *private) +{ + ext4_io_end_t *io_end = iocb->private; + struct workqueue_struct *wq; + + /* if not async direct IO or dio with 0 bytes write, just return */ + if (!io_end || !size) + return; + + ext_debug("ext4_end_io_dio(): io_end 0x%p" + "for inode %lu, iocb 0x%p, offset %llu, size %llu\n", + iocb->private, io_end->inode->i_ino, iocb, offset, + size); + + /* if not aio dio with unwritten extents, just free io and return */ + if (io_end->flag != DIO_AIO_UNWRITTEN){ + ext4_free_io_end(io_end); + iocb->private = NULL; + return; + } + + io_end->offset = offset; + io_end->size = size; + wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq; + + /* queue the work to convert unwritten extents to written */ + queue_work(wq, &io_end->work); + + /* Add the io_end to per-inode completed aio dio list*/ + list_add_tail(&io_end->list, + &EXT4_I(io_end->inode)->i_aio_dio_complete_list); + iocb->private = NULL; +} +/* + * For ext4 extent files, ext4 will do direct-io write to holes, + * preallocated extents, and those write extend the file, no need to + * fall back to buffered IO. + * + * For holes, we fallocate those blocks, mark them as unintialized + * If those blocks were preallocated, we mark sure they are splited, but + * still keep the range to write as unintialized. + * + * The unwrritten extents will be converted to written when DIO is completed. + * For async direct IO, since the IO may still pending when return, we + * set up an end_io call back function, which will do the convertion + * when async direct IO completed. + * + * If the O_DIRECT write will extend the file then add this inode to the + * orphan list. So recovery will truncate it back to the original size + * if the machine crashes during the write. + * + */ +static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb, + const struct iovec *iov, loff_t offset, + unsigned long nr_segs) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file->f_mapping->host; + ssize_t ret; + size_t count = iov_length(iov, nr_segs); + + loff_t final_size = offset + count; + if (rw == WRITE && final_size <= inode->i_size) { + /* + * We could direct write to holes and fallocate. + * + * Allocated blocks to fill the hole are marked as uninitialized + * to prevent paralel buffered read to expose the stale data + * before DIO complete the data IO. + * + * As to previously fallocated extents, ext4 get_block + * will just simply mark the buffer mapped but still + * keep the extents uninitialized. + * + * for non AIO case, we will convert those unwritten extents + * to written after return back from blockdev_direct_IO. + * + * for async DIO, the conversion needs to be defered when + * the IO is completed. The ext4 end_io callback function + * will be called to take care of the conversion work. + * Here for async case, we allocate an io_end structure to + * hook to the iocb. + */ + iocb->private = NULL; + EXT4_I(inode)->cur_aio_dio = NULL; + if (!is_sync_kiocb(iocb)) { + iocb->private = ext4_init_io_end(inode); + if (!iocb->private) + return -ENOMEM; + /* + * we save the io structure for current async + * direct IO, so that later ext4_get_blocks() + * could flag the io structure whether there + * is a unwritten extents needs to be converted + * when IO is completed. + */ + EXT4_I(inode)->cur_aio_dio = iocb->private; + } + + ret = blockdev_direct_IO(rw, iocb, inode, + inode->i_sb->s_bdev, iov, + offset, nr_segs, + ext4_get_block_dio_write, + ext4_end_io_dio); + if (iocb->private) + EXT4_I(inode)->cur_aio_dio = NULL; + /* + * The io_end structure takes a reference to the inode, + * that structure needs to be destroyed and the + * reference to the inode need to be dropped, when IO is + * complete, even with 0 byte write, or failed. + * + * In the successful AIO DIO case, the io_end structure will be + * desctroyed and the reference to the inode will be dropped + * after the end_io call back function is called. + * + * In the case there is 0 byte write, or error case, since + * VFS direct IO won't invoke the end_io call back function, + * we need to free the end_io structure here. + */ + if (ret != -EIOCBQUEUED && ret <= 0 && iocb->private) { + ext4_free_io_end(iocb->private); + iocb->private = NULL; + } else if (ret > 0 && (EXT4_I(inode)->i_state & + EXT4_STATE_DIO_UNWRITTEN)) { + int err; + /* + * for non AIO case, since the IO is already + * completed, we could do the convertion right here + */ + err = ext4_convert_unwritten_extents(inode, + offset, ret); + if (err < 0) + ret = err; + EXT4_I(inode)->i_state &= ~EXT4_STATE_DIO_UNWRITTEN; + } + return ret; + } + + /* for write the the end of file case, we fall back to old way */ + return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs); +} + +static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb, + const struct iovec *iov, loff_t offset, + unsigned long nr_segs) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file->f_mapping->host; + + if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) + return ext4_ext_direct_IO(rw, iocb, iov, offset, nr_segs); + + return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs); +} + /* * Pages can be marked dirty completely asynchronously from ext4's journalling * activity. By filemap_sync_pte(), try_to_unmap_one(), etc. We cannot do @@ -3653,13 +4122,16 @@ static void ext4_clear_blocks(handle_t *handle, struct inode *inode, __le32 *last) { __le32 *p; + int is_metadata = S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode); + if (try_to_extend_transaction(handle, inode)) { if (bh) { BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); ext4_handle_dirty_metadata(handle, inode, bh); } ext4_mark_inode_dirty(handle, inode); - ext4_journal_test_restart(handle, inode); + ext4_truncate_restart_trans(handle, inode, + blocks_for_truncate(inode)); if (bh) { BUFFER_TRACE(bh, "retaking write access"); ext4_journal_get_write_access(handle, bh); @@ -3682,11 +4154,11 @@ static void ext4_clear_blocks(handle_t *handle, struct inode *inode, *p = 0; tbh = sb_find_get_block(inode->i_sb, nr); - ext4_forget(handle, 0, inode, tbh, nr); + ext4_forget(handle, is_metadata, inode, tbh, nr); } } - ext4_free_blocks(handle, inode, block_to_free, count, 0); + ext4_free_blocks(handle, inode, block_to_free, count, is_metadata); } /** @@ -3870,7 +4342,8 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode, return; if (try_to_extend_transaction(handle, inode)) { ext4_mark_inode_dirty(handle, inode); - ext4_journal_test_restart(handle, inode); + ext4_truncate_restart_trans(handle, inode, + blocks_for_truncate(inode)); } ext4_free_blocks(handle, inode, nr, 1, 1); @@ -3958,8 +4431,7 @@ void ext4_truncate(struct inode *inode) if (!ext4_can_truncate(inode)) return; - if (ei->i_disksize && inode->i_size == 0 && - !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC)) + if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC)) ei->i_state |= EXT4_STATE_DA_ALLOC_CLOSE; if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) { @@ -4313,8 +4785,8 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) struct ext4_iloc iloc; struct ext4_inode *raw_inode; struct ext4_inode_info *ei; - struct buffer_head *bh; struct inode *inode; + journal_t *journal = EXT4_SB(sb)->s_journal; long ret; int block; @@ -4325,11 +4797,11 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) return inode; ei = EXT4_I(inode); + iloc.bh = 0; ret = __ext4_get_inode_loc(inode, &iloc, 0); if (ret < 0) goto bad_inode; - bh = iloc.bh; raw_inode = ext4_raw_inode(&iloc); inode->i_mode = le16_to_cpu(raw_inode->i_mode); inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low); @@ -4352,7 +4824,6 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) if (inode->i_mode == 0 || !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) { /* this inode is deleted */ - brelse(bh); ret = -ESTALE; goto bad_inode; } @@ -4380,11 +4851,35 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) ei->i_data[block] = raw_inode->i_block[block]; INIT_LIST_HEAD(&ei->i_orphan); + /* + * Set transaction id's of transactions that have to be committed + * to finish f[data]sync. We set them to currently running transaction + * as we cannot be sure that the inode or some of its metadata isn't + * part of the transaction - the inode could have been reclaimed and + * now it is reread from disk. + */ + if (journal) { + transaction_t *transaction; + tid_t tid; + + spin_lock(&journal->j_state_lock); + if (journal->j_running_transaction) + transaction = journal->j_running_transaction; + else + transaction = journal->j_committing_transaction; + if (transaction) + tid = transaction->t_tid; + else + tid = journal->j_commit_sequence; + spin_unlock(&journal->j_state_lock); + ei->i_sync_tid = tid; + ei->i_datasync_tid = tid; + } + if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize); if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize > EXT4_INODE_SIZE(inode->i_sb)) { - brelse(bh); ret = -EIO; goto bad_inode; } @@ -4416,10 +4911,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) ret = 0; if (ei->i_file_acl && - ((ei->i_file_acl < - (le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block) + - EXT4_SB(sb)->s_gdb_count)) || - (ei->i_file_acl >= ext4_blocks_count(EXT4_SB(sb)->s_es)))) { + !ext4_data_block_valid(EXT4_SB(sb), ei->i_file_acl, 1)) { ext4_error(sb, __func__, "bad extended attribute block %llu in inode #%lu", ei->i_file_acl, inode->i_ino); @@ -4437,10 +4929,8 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) /* Validate block references which are part of inode */ ret = ext4_check_inode_blockref(inode); } - if (ret) { - brelse(bh); + if (ret) goto bad_inode; - } if (S_ISREG(inode->i_mode)) { inode->i_op = &ext4_file_inode_operations; @@ -4468,7 +4958,6 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) init_special_inode(inode, inode->i_mode, new_decode_dev(le32_to_cpu(raw_inode->i_block[1]))); } else { - brelse(bh); ret = -EIO; ext4_error(inode->i_sb, __func__, "bogus i_mode (%o) for inode=%lu", @@ -4481,6 +4970,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) return inode; bad_inode: + brelse(iloc.bh); iget_failed(inode); return ERR_PTR(ret); } @@ -4581,8 +5071,7 @@ static int ext4_do_update_inode(handle_t *handle, if (ext4_inode_blocks_set(handle, raw_inode, ei)) goto out_brelse; raw_inode->i_dtime = cpu_to_le32(ei->i_dtime); - /* clear the migrate flag in the raw_inode */ - raw_inode->i_flags = cpu_to_le32(ei->i_flags & ~EXT4_EXT_MIGRATE); + raw_inode->i_flags = cpu_to_le32(ei->i_flags); if (EXT4_SB(inode->i_sb)->s_es->s_creator_os != cpu_to_le32(EXT4_OS_HURD)) raw_inode->i_file_acl_high = @@ -4641,6 +5130,7 @@ static int ext4_do_update_inode(handle_t *handle, err = rc; ei->i_state &= ~EXT4_STATE_NEW; + ext4_update_inode_fsync_trans(handle, inode, 0); out_brelse: brelse(bh); ext4_std_error(inode->i_sb, err); @@ -4684,19 +5174,40 @@ out_brelse: */ int ext4_write_inode(struct inode *inode, int wait) { + int err; + if (current->flags & PF_MEMALLOC) return 0; - if (ext4_journal_current_handle()) { - jbd_debug(1, "called recursively, non-PF_MEMALLOC!\n"); - dump_stack(); - return -EIO; - } + if (EXT4_SB(inode->i_sb)->s_journal) { + if (ext4_journal_current_handle()) { + jbd_debug(1, "called recursively, non-PF_MEMALLOC!\n"); + dump_stack(); + return -EIO; + } - if (!wait) - return 0; + if (!wait) + return 0; + + err = ext4_force_commit(inode->i_sb); + } else { + struct ext4_iloc iloc; - return ext4_force_commit(inode->i_sb); + err = ext4_get_inode_loc(inode, &iloc); + if (err) + return err; + if (wait) + sync_dirty_buffer(iloc.bh); + if (buffer_req(iloc.bh) && !buffer_uptodate(iloc.bh)) { + ext4_error(inode->i_sb, __func__, + "IO error syncing inode, " + "inode=%lu, block=%llu", + inode->i_ino, + (unsigned long long)iloc.bh->b_blocknr); + err = -EIO; + } + } + return err; } /* @@ -4739,8 +5250,8 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) /* (user+group)*(old+new) structure, inode write (sb, * inode block, ? - but truncate inode update has it) */ - handle = ext4_journal_start(inode, 2*(EXT4_QUOTA_INIT_BLOCKS(inode->i_sb)+ - EXT4_QUOTA_DEL_BLOCKS(inode->i_sb))+3); + handle = ext4_journal_start(inode, (EXT4_MAXQUOTAS_INIT_BLOCKS(inode->i_sb)+ + EXT4_MAXQUOTAS_DEL_BLOCKS(inode->i_sb))+3); if (IS_ERR(handle)) { error = PTR_ERR(handle); goto err_out; @@ -5137,24 +5648,13 @@ void ext4_dirty_inode(struct inode *inode) handle_t *current_handle = ext4_journal_current_handle(); handle_t *handle; - if (!ext4_handle_valid(current_handle)) { - ext4_mark_inode_dirty(current_handle, inode); - return; - } - handle = ext4_journal_start(inode, 2); if (IS_ERR(handle)) goto out; - if (current_handle && - current_handle->h_transaction != handle->h_transaction) { - /* This task has a transaction open against a different fs */ - printk(KERN_EMERG "%s: transactions do not match!\n", - __func__); - } else { - jbd_debug(5, "marking dirty. outer handle=%p\n", - current_handle); - ext4_mark_inode_dirty(handle, inode); - } + + jbd_debug(5, "marking dirty. outer handle=%p\n", current_handle); + ext4_mark_inode_dirty(handle, inode); + ext4_journal_stop(handle); out: return; @@ -5281,12 +5781,21 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) else len = PAGE_CACHE_SIZE; + lock_page(page); + /* + * return if we have all the buffers mapped. This avoid + * the need to call write_begin/write_end which does a + * journal_start/journal_stop which can block and take + * long time + */ if (page_has_buffers(page)) { - /* return if we have all the buffers mapped */ if (!walk_page_buffers(NULL, page_buffers(page), 0, len, NULL, - ext4_bh_unmapped)) + ext4_bh_unmapped)) { + unlock_page(page); goto out_unlock; + } } + unlock_page(page); /* * OK, we need to fill the hole... Do write_begin write_end * to do block allocation/reservation.We are not holding diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 7050a9c..b63d193 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -221,32 +221,38 @@ setversion_out: struct file *donor_filp; int err; + if (!(filp->f_mode & FMODE_READ) || + !(filp->f_mode & FMODE_WRITE)) + return -EBADF; + if (copy_from_user(&me, (struct move_extent __user *)arg, sizeof(me))) return -EFAULT; + me.moved_len = 0; donor_filp = fget(me.donor_fd); if (!donor_filp) return -EBADF; - if (!capable(CAP_DAC_OVERRIDE)) { - if ((current->real_cred->fsuid != inode->i_uid) || - !(inode->i_mode & S_IRUSR) || - !(donor_filp->f_dentry->d_inode->i_mode & - S_IRUSR)) { - fput(donor_filp); - return -EACCES; - } + if (!(donor_filp->f_mode & FMODE_WRITE)) { + err = -EBADF; + goto mext_out; } + err = mnt_want_write(filp->f_path.mnt); + if (err) + goto mext_out; + err = ext4_move_extents(filp, donor_filp, me.orig_start, me.donor_start, me.len, &me.moved_len); - fput(donor_filp); + mnt_drop_write(filp->f_path.mnt); + if (me.moved_len > 0) + file_remove_suid(donor_filp); - if (!err) - if (copy_to_user((struct move_extent *)arg, - &me, sizeof(me))) - return -EFAULT; + if (copy_to_user((struct move_extent *)arg, &me, sizeof(me))) + err = -EFAULT; +mext_out: + fput(donor_filp); return err; } diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index cd25846..099fd47 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -908,6 +908,97 @@ out: return err; } +static noinline_for_stack +int ext4_mb_init_group(struct super_block *sb, ext4_group_t group) +{ + + int ret = 0; + void *bitmap; + int blocks_per_page; + int block, pnum, poff; + int num_grp_locked = 0; + struct ext4_group_info *this_grp; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct inode *inode = sbi->s_buddy_cache; + struct page *page = NULL, *bitmap_page = NULL; + + mb_debug("init group %lu\n", group); + blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; + this_grp = ext4_get_group_info(sb, group); + /* + * This ensures we don't add group + * to this buddy cache via resize + */ + num_grp_locked = ext4_mb_get_buddy_cache_lock(sb, group); + if (!EXT4_MB_GRP_NEED_INIT(this_grp)) { + /* + * somebody initialized the group + * return without doing anything + */ + ret = 0; + goto err; + } + /* + * the buddy cache inode stores the block bitmap + * and buddy information in consecutive blocks. + * So for each group we need two blocks. + */ + block = group * 2; + pnum = block / blocks_per_page; + poff = block % blocks_per_page; + page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); + if (page) { + BUG_ON(page->mapping != inode->i_mapping); + ret = ext4_mb_init_cache(page, NULL); + if (ret) { + unlock_page(page); + goto err; + } + unlock_page(page); + } + if (page == NULL || !PageUptodate(page)) { + ret = -EIO; + goto err; + } + mark_page_accessed(page); + bitmap_page = page; + bitmap = page_address(page) + (poff * sb->s_blocksize); + + /* init buddy cache */ + block++; + pnum = block / blocks_per_page; + poff = block % blocks_per_page; + page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); + if (page == bitmap_page) { + /* + * If both the bitmap and buddy are in + * the same page we don't need to force + * init the buddy + */ + unlock_page(page); + } else if (page) { + BUG_ON(page->mapping != inode->i_mapping); + ret = ext4_mb_init_cache(page, bitmap); + if (ret) { + unlock_page(page); + goto err; + } + unlock_page(page); + } + if (page == NULL || !PageUptodate(page)) { + ret = -EIO; + goto err; + } + mark_page_accessed(page); +err: + ext4_mb_put_buddy_cache_lock(sb, group, num_grp_locked); + if (bitmap_page) + page_cache_release(bitmap_page); + if (page) + page_cache_release(page); + return ret; +} + static noinline_for_stack int ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, struct ext4_buddy *e4b) @@ -941,8 +1032,26 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, * groups mapped by the page is blocked * till we are done with allocation */ +repeat_load_buddy: down_read(e4b->alloc_semp); + if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) { + /* we need to check for group need init flag + * with alloc_semp held so that we can be sure + * that new blocks didn't get added to the group + * when we are loading the buddy cache + */ + up_read(e4b->alloc_semp); + /* + * we need full data about the group + * to make a good selection + */ + ret = ext4_mb_init_group(sb, group); + if (ret) + return ret; + goto repeat_load_buddy; + } + /* * the buddy cache inode stores the block bitmap * and buddy information in consecutive blocks. @@ -1360,7 +1469,7 @@ static void ext4_mb_use_best_found(struct ext4_allocation_context *ac, ac->alloc_semp = e4b->alloc_semp; e4b->alloc_semp = NULL; /* store last allocated for subsequent stream allocation */ - if ((ac->ac_flags & EXT4_MB_HINT_DATA)) { + if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) { spin_lock(&sbi->s_md_lock); sbi->s_mb_last_group = ac->ac_f_ex.fe_group; sbi->s_mb_last_start = ac->ac_f_ex.fe_start; @@ -1837,97 +1946,6 @@ void ext4_mb_put_buddy_cache_lock(struct super_block *sb, } -static noinline_for_stack -int ext4_mb_init_group(struct super_block *sb, ext4_group_t group) -{ - - int ret; - void *bitmap; - int blocks_per_page; - int block, pnum, poff; - int num_grp_locked = 0; - struct ext4_group_info *this_grp; - struct ext4_sb_info *sbi = EXT4_SB(sb); - struct inode *inode = sbi->s_buddy_cache; - struct page *page = NULL, *bitmap_page = NULL; - - mb_debug("init group %lu\n", group); - blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; - this_grp = ext4_get_group_info(sb, group); - /* - * This ensures we don't add group - * to this buddy cache via resize - */ - num_grp_locked = ext4_mb_get_buddy_cache_lock(sb, group); - if (!EXT4_MB_GRP_NEED_INIT(this_grp)) { - /* - * somebody initialized the group - * return without doing anything - */ - ret = 0; - goto err; - } - /* - * the buddy cache inode stores the block bitmap - * and buddy information in consecutive blocks. - * So for each group we need two blocks. - */ - block = group * 2; - pnum = block / blocks_per_page; - poff = block % blocks_per_page; - page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); - if (page) { - BUG_ON(page->mapping != inode->i_mapping); - ret = ext4_mb_init_cache(page, NULL); - if (ret) { - unlock_page(page); - goto err; - } - unlock_page(page); - } - if (page == NULL || !PageUptodate(page)) { - ret = -EIO; - goto err; - } - mark_page_accessed(page); - bitmap_page = page; - bitmap = page_address(page) + (poff * sb->s_blocksize); - - /* init buddy cache */ - block++; - pnum = block / blocks_per_page; - poff = block % blocks_per_page; - page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); - if (page == bitmap_page) { - /* - * If both the bitmap and buddy are in - * the same page we don't need to force - * init the buddy - */ - unlock_page(page); - } else if (page) { - BUG_ON(page->mapping != inode->i_mapping); - ret = ext4_mb_init_cache(page, bitmap); - if (ret) { - unlock_page(page); - goto err; - } - unlock_page(page); - } - if (page == NULL || !PageUptodate(page)) { - ret = -EIO; - goto err; - } - mark_page_accessed(page); -err: - ext4_mb_put_buddy_cache_lock(sb, group, num_grp_locked); - if (bitmap_page) - page_cache_release(bitmap_page); - if (page) - page_cache_release(page); - return ret; -} - static noinline_for_stack int ext4_mb_regular_allocator(struct ext4_allocation_context *ac) { @@ -1938,11 +1956,14 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac) struct ext4_sb_info *sbi; struct super_block *sb; struct ext4_buddy e4b; - loff_t size, isize; sb = ac->ac_sb; sbi = EXT4_SB(sb); ngroups = ext4_get_groups_count(sb); + /* non-extent files are limited to low blocks/groups */ + if (!(EXT4_I(ac->ac_inode)->i_flags & EXT4_EXTENTS_FL)) + ngroups = sbi->s_blockfile_groups; + BUG_ON(ac->ac_status == AC_STATUS_FOUND); /* first, try the goal */ @@ -1974,20 +1995,16 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac) } bsbits = ac->ac_sb->s_blocksize_bits; - /* if stream allocation is enabled, use global goal */ - size = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len; - isize = i_size_read(ac->ac_inode) >> bsbits; - if (size < isize) - size = isize; - if (size < sbi->s_mb_stream_request && - (ac->ac_flags & EXT4_MB_HINT_DATA)) { + /* if stream allocation is enabled, use global goal */ + if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) { /* TBD: may be hot point */ spin_lock(&sbi->s_md_lock); ac->ac_g_ex.fe_group = sbi->s_mb_last_group; ac->ac_g_ex.fe_start = sbi->s_mb_last_start; spin_unlock(&sbi->s_md_lock); } + /* Let's just scan groups to find more-less suitable blocks */ cr = ac->ac_2order ? 0 : 1; /* @@ -2015,27 +2032,6 @@ repeat: if (grp->bb_free == 0) continue; - /* - * if the group is already init we check whether it is - * a good group and if not we don't load the buddy - */ - if (EXT4_MB_GRP_NEED_INIT(grp)) { - /* - * we need full data about the group - * to make a good selection - */ - err = ext4_mb_init_group(sb, group); - if (err) - goto out; - } - - /* - * If the particular group doesn't satisfy our - * criteria we continue with the next group - */ - if (!ext4_mb_good_group(ac, group, cr)) - continue; - err = ext4_mb_load_buddy(sb, group, &e4b); if (err) goto out; @@ -2571,13 +2567,11 @@ static int ext4_mb_init_backend(struct super_block *sb) { ext4_group_t ngroups = ext4_get_groups_count(sb); ext4_group_t i; - int metalen; struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_super_block *es = sbi->s_es; int num_meta_group_infos; int num_meta_group_infos_max; int array_size; - struct ext4_group_info **meta_group_info; struct ext4_group_desc *desc; /* This is the number of blocks used by GDT */ @@ -2622,22 +2616,6 @@ static int ext4_mb_init_backend(struct super_block *sb) goto err_freesgi; } EXT4_I(sbi->s_buddy_cache)->i_disksize = 0; - - metalen = sizeof(*meta_group_info) << EXT4_DESC_PER_BLOCK_BITS(sb); - for (i = 0; i < num_meta_group_infos; i++) { - if ((i + 1) == num_meta_group_infos) - metalen = sizeof(*meta_group_info) * - (ngroups - - (i << EXT4_DESC_PER_BLOCK_BITS(sb))); - meta_group_info = kmalloc(metalen, GFP_KERNEL); - if (meta_group_info == NULL) { - printk(KERN_ERR "EXT4-fs: can't allocate mem for a " - "buddy group\n"); - goto err_freemeta; - } - sbi->s_group_info[i] = meta_group_info; - } - for (i = 0; i < ngroups; i++) { desc = ext4_get_group_desc(sb, i, NULL); if (desc == NULL) { @@ -2655,7 +2633,6 @@ err_freebuddy: while (i-- > 0) kfree(ext4_get_group_info(sb, i)); i = num_meta_group_infos; -err_freemeta: while (i-- > 0) kfree(sbi->s_group_info[i]); iput(sbi->s_buddy_cache); @@ -2833,7 +2810,6 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn) struct ext4_group_info *db; int err, count = 0, count2 = 0; struct ext4_free_data *entry; - ext4_fsblk_t discard_block; struct list_head *l, *ltmp; list_for_each_safe(l, ltmp, &txn->t_private_list) { @@ -2863,13 +2839,19 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn) page_cache_release(e4b.bd_bitmap_page); } ext4_unlock_group(sb, entry->group); - discard_block = (ext4_fsblk_t) entry->group * EXT4_BLOCKS_PER_GROUP(sb) - + entry->start_blk - + le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); - trace_ext4_discard_blocks(sb, (unsigned long long)discard_block, - entry->count); - sb_issue_discard(sb, discard_block, entry->count); - + if (test_opt(sb, DISCARD)) { + ext4_fsblk_t discard_block; + struct ext4_super_block *es = EXT4_SB(sb)->s_es; + + discard_block = (ext4_fsblk_t)entry->group * + EXT4_BLOCKS_PER_GROUP(sb) + + entry->start_blk + + le32_to_cpu(es->s_first_data_block); + trace_ext4_discard_blocks(sb, + (unsigned long long)discard_block, + entry->count); + sb_issue_discard(sb, discard_block, entry->count); + } kmem_cache_free(ext4_free_ext_cachep, entry); ext4_mb_release_desc(&e4b); } @@ -3276,6 +3258,24 @@ static void ext4_mb_collect_stats(struct ext4_allocation_context *ac) } /* + * Called on failure; free up any blocks from the inode PA for this + * context. We don't need this for MB_GROUP_PA because we only change + * pa_free in ext4_mb_release_context(), but on failure, we've already + * zeroed out ac->ac_b_ex.fe_len, so group_pa->pa_free is not changed. + */ +static void ext4_discard_allocated_blocks(struct ext4_allocation_context *ac) +{ + struct ext4_prealloc_space *pa = ac->ac_pa; + int len; + + if (pa && pa->pa_type == MB_INODE_PA) { + len = ac->ac_b_ex.fe_len; + pa->pa_free += len; + } + +} + +/* * use blocks preallocated to inode */ static void ext4_mb_use_inode_pa(struct ext4_allocation_context *ac, @@ -3382,6 +3382,11 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac) ac->ac_o_ex.fe_logical >= pa->pa_lstart + pa->pa_len) continue; + /* non-extent files can't have physical blocks past 2^32 */ + if (!(EXT4_I(ac->ac_inode)->i_flags & EXT4_EXTENTS_FL) && + pa->pa_pstart + pa->pa_len > EXT4_MAX_BLOCK_FILE_PHYS) + continue; + /* found preallocated blocks, use them */ spin_lock(&pa->pa_lock); if (pa->pa_deleted == 0 && pa->pa_free) { @@ -4174,16 +4179,26 @@ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac) if (!(ac->ac_flags & EXT4_MB_HINT_DATA)) return; + if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY)) + return; + size = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len; - isize = i_size_read(ac->ac_inode) >> bsbits; - size = max(size, isize); + isize = (i_size_read(ac->ac_inode) + ac->ac_sb->s_blocksize - 1) + >> bsbits; - /* don't use group allocation for large files */ - if (size >= sbi->s_mb_stream_request) + if ((size == isize) && + !ext4_fs_is_busy(sbi) && + (atomic_read(&ac->ac_inode->i_writecount) == 0)) { + ac->ac_flags |= EXT4_MB_HINT_NOPREALLOC; return; + } - if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY)) + /* don't use group allocation for large files */ + size = max(size, isize); + if (size >= sbi->s_mb_stream_request) { + ac->ac_flags |= EXT4_MB_STREAM_ALLOC; return; + } BUG_ON(ac->ac_lg != NULL); /* @@ -4549,6 +4564,7 @@ repeat: ac->ac_status = AC_STATUS_CONTINUE; goto repeat; } else if (*errp) { + ext4_discard_allocated_blocks(ac); ac->ac_b_ex.fe_len = 0; ar->len = 0; ext4_mb_show_ac(ac); diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c index 313a50b..8646149 100644 --- a/fs/ext4/migrate.c +++ b/fs/ext4/migrate.c @@ -75,7 +75,7 @@ static int finish_range(handle_t *handle, struct inode *inode, goto err_out; } } - retval = ext4_ext_insert_extent(handle, inode, path, &newext); + retval = ext4_ext_insert_extent(handle, inode, path, &newext, 0); err_out: if (path) { ext4_ext_drop_refs(path); @@ -238,7 +238,7 @@ static int extend_credit_for_blkdel(handle_t *handle, struct inode *inode) * So allocate a credit of 3. We may update * quota (user and group). */ - needed = 3 + 2*EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb); + needed = 3 + EXT4_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb); if (ext4_journal_extend(handle, needed) != 0) retval = ext4_journal_restart(handle, needed); @@ -353,17 +353,16 @@ static int ext4_ext_swap_inode_data(handle_t *handle, struct inode *inode, down_write(&EXT4_I(inode)->i_data_sem); /* - * if EXT4_EXT_MIGRATE is cleared a block allocation + * if EXT4_STATE_EXT_MIGRATE is cleared a block allocation * happened after we started the migrate. We need to * fail the migrate */ - if (!(EXT4_I(inode)->i_flags & EXT4_EXT_MIGRATE)) { + if (!(EXT4_I(inode)->i_state & EXT4_STATE_EXT_MIGRATE)) { retval = -EAGAIN; up_write(&EXT4_I(inode)->i_data_sem); goto err_out; } else - EXT4_I(inode)->i_flags = EXT4_I(inode)->i_flags & - ~EXT4_EXT_MIGRATE; + EXT4_I(inode)->i_state &= ~EXT4_STATE_EXT_MIGRATE; /* * We have the extent map build with the tmp inode. * Now copy the i_data across @@ -478,7 +477,7 @@ int ext4_ext_migrate(struct inode *inode) handle = ext4_journal_start(inode, EXT4_DATA_TRANS_BLOCKS(inode->i_sb) + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 + - 2 * EXT4_QUOTA_INIT_BLOCKS(inode->i_sb) + EXT4_MAXQUOTAS_INIT_BLOCKS(inode->i_sb) + 1); if (IS_ERR(handle)) { retval = PTR_ERR(handle); @@ -517,14 +516,15 @@ int ext4_ext_migrate(struct inode *inode) * when we add extents we extent the journal */ /* - * Even though we take i_mutex we can still cause block allocation - * via mmap write to holes. If we have allocated new blocks we fail - * migrate. New block allocation will clear EXT4_EXT_MIGRATE flag. - * The flag is updated with i_data_sem held to prevent racing with - * block allocation. + * Even though we take i_mutex we can still cause block + * allocation via mmap write to holes. If we have allocated + * new blocks we fail migrate. New block allocation will + * clear EXT4_STATE_EXT_MIGRATE flag. The flag is updated + * with i_data_sem held to prevent racing with block + * allocation. */ down_read((&EXT4_I(inode)->i_data_sem)); - EXT4_I(inode)->i_flags = EXT4_I(inode)->i_flags | EXT4_EXT_MIGRATE; + EXT4_I(inode)->i_state |= EXT4_STATE_EXT_MIGRATE; up_read((&EXT4_I(inode)->i_data_sem)); handle = ext4_journal_start(inode, 1); @@ -618,7 +618,7 @@ err_out: tmp_inode->i_nlink = 0; ext4_journal_stop(handle); - + unlock_new_inode(tmp_inode); iput(tmp_inode); return retval; diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index bbf2dd9..9a573a6 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -19,14 +19,31 @@ #include "ext4_extents.h" #include "ext4.h" -#define get_ext_path(path, inode, block, ret) \ - do { \ - path = ext4_ext_find_extent(inode, block, path); \ - if (IS_ERR(path)) { \ - ret = PTR_ERR(path); \ - path = NULL; \ - } \ - } while (0) +/** + * get_ext_path - Find an extent path for designated logical block number. + * + * @inode: an inode which is searched + * @lblock: logical block number to find an extent path + * @path: pointer to an extent path pointer (for output) + * + * ext4_ext_find_extent wrapper. Return 0 on success, or a negative error value + * on failure. + */ +static inline int +get_ext_path(struct inode *inode, ext4_lblk_t lblock, + struct ext4_ext_path **path) +{ + int ret = 0; + + *path = ext4_ext_find_extent(inode, lblock, *path); + if (IS_ERR(*path)) { + ret = PTR_ERR(*path); + *path = NULL; + } else if ((*path)[ext_depth(inode)].p_ext == NULL) + ret = -ENODATA; + + return ret; +} /** * copy_extent_status - Copy the extent's initialization status @@ -60,12 +77,14 @@ static int mext_next_extent(struct inode *inode, struct ext4_ext_path *path, struct ext4_extent **extent) { + struct ext4_extent_header *eh; int ppos, leaf_ppos = path->p_depth; ppos = leaf_ppos; if (EXT_LAST_EXTENT(path[ppos].p_hdr) > path[ppos].p_ext) { /* leaf block */ *extent = ++path[ppos].p_ext; + path[ppos].p_block = ext_pblock(path[ppos].p_ext); return 0; } @@ -102,9 +121,18 @@ mext_next_extent(struct inode *inode, struct ext4_ext_path *path, ext_block_hdr(path[cur_ppos+1].p_bh); } + path[leaf_ppos].p_ext = *extent = NULL; + + eh = path[leaf_ppos].p_hdr; + if (le16_to_cpu(eh->eh_entries) == 0) + /* empty leaf is found */ + return -ENODATA; + /* leaf block */ path[leaf_ppos].p_ext = *extent = EXT_FIRST_EXTENT(path[leaf_ppos].p_hdr); + path[leaf_ppos].p_block = + ext_pblock(path[leaf_ppos].p_ext); return 0; } } @@ -113,47 +141,43 @@ mext_next_extent(struct inode *inode, struct ext4_ext_path *path, } /** - * mext_double_down_read - Acquire two inodes' read semaphore + * mext_check_null_inode - NULL check for two inodes * - * @orig_inode: original inode structure - * @donor_inode: donor inode structure - * Acquire read semaphore of the two inodes (orig and donor) by i_ino order. + * If inode1 or inode2 is NULL, return -EIO. Otherwise, return 0. */ -static void -mext_double_down_read(struct inode *orig_inode, struct inode *donor_inode) +static int +mext_check_null_inode(struct inode *inode1, struct inode *inode2, + const char *function) { - struct inode *first = orig_inode, *second = donor_inode; - - BUG_ON(orig_inode == NULL || donor_inode == NULL); - - /* - * Use the inode number to provide the stable locking order instead - * of its address, because the C language doesn't guarantee you can - * compare pointers that don't come from the same array. - */ - if (donor_inode->i_ino < orig_inode->i_ino) { - first = donor_inode; - second = orig_inode; + int ret = 0; + + if (inode1 == NULL) { + ext4_error(inode2->i_sb, function, + "Both inodes should not be NULL: " + "inode1 NULL inode2 %lu", inode2->i_ino); + ret = -EIO; + } else if (inode2 == NULL) { + ext4_error(inode1->i_sb, function, + "Both inodes should not be NULL: " + "inode1 %lu inode2 NULL", inode1->i_ino); + ret = -EIO; } - - down_read(&EXT4_I(first)->i_data_sem); - down_read(&EXT4_I(second)->i_data_sem); + return ret; } /** - * mext_double_down_write - Acquire two inodes' write semaphore + * double_down_write_data_sem - Acquire two inodes' write lock of i_data_sem * * @orig_inode: original inode structure * @donor_inode: donor inode structure - * Acquire write semaphore of the two inodes (orig and donor) by i_ino order. + * Acquire write lock of i_data_sem of the two inodes (orig and donor) by + * i_ino order. */ static void -mext_double_down_write(struct inode *orig_inode, struct inode *donor_inode) +double_down_write_data_sem(struct inode *orig_inode, struct inode *donor_inode) { struct inode *first = orig_inode, *second = donor_inode; - BUG_ON(orig_inode == NULL || donor_inode == NULL); - /* * Use the inode number to provide the stable locking order instead * of its address, because the C language doesn't guarantee you can @@ -165,37 +189,19 @@ mext_double_down_write(struct inode *orig_inode, struct inode *donor_inode) } down_write(&EXT4_I(first)->i_data_sem); - down_write(&EXT4_I(second)->i_data_sem); -} - -/** - * mext_double_up_read - Release two inodes' read semaphore - * - * @orig_inode: original inode structure to be released its lock first - * @donor_inode: donor inode structure to be released its lock second - * Release read semaphore of two inodes (orig and donor). - */ -static void -mext_double_up_read(struct inode *orig_inode, struct inode *donor_inode) -{ - BUG_ON(orig_inode == NULL || donor_inode == NULL); - - up_read(&EXT4_I(orig_inode)->i_data_sem); - up_read(&EXT4_I(donor_inode)->i_data_sem); + down_write_nested(&EXT4_I(second)->i_data_sem, SINGLE_DEPTH_NESTING); } /** - * mext_double_up_write - Release two inodes' write semaphore + * double_up_write_data_sem - Release two inodes' write lock of i_data_sem * * @orig_inode: original inode structure to be released its lock first * @donor_inode: donor inode structure to be released its lock second - * Release write semaphore of two inodes (orig and donor). + * Release write lock of i_data_sem of two inodes (orig and donor). */ static void -mext_double_up_write(struct inode *orig_inode, struct inode *donor_inode) +double_up_write_data_sem(struct inode *orig_inode, struct inode *donor_inode) { - BUG_ON(orig_inode == NULL || donor_inode == NULL); - up_write(&EXT4_I(orig_inode)->i_data_sem); up_write(&EXT4_I(donor_inode)->i_data_sem); } @@ -283,23 +289,23 @@ mext_insert_across_blocks(handle_t *handle, struct inode *orig_inode, } if (new_flag) { - get_ext_path(orig_path, orig_inode, eblock, err); - if (orig_path == NULL) + err = get_ext_path(orig_inode, eblock, &orig_path); + if (err) goto out; if (ext4_ext_insert_extent(handle, orig_inode, - orig_path, new_ext)) + orig_path, new_ext, 0)) goto out; } if (end_flag) { - get_ext_path(orig_path, orig_inode, - le32_to_cpu(end_ext->ee_block) - 1, err); - if (orig_path == NULL) + err = get_ext_path(orig_inode, + le32_to_cpu(end_ext->ee_block) - 1, &orig_path); + if (err) goto out; if (ext4_ext_insert_extent(handle, orig_inode, - orig_path, end_ext)) + orig_path, end_ext, 0)) goto out; } out: @@ -519,7 +525,15 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode, * oext |-----------| * new_ext |-------| */ - BUG_ON(le32_to_cpu(oext->ee_block) + oext_alen - 1 < new_ext_end); + if (le32_to_cpu(oext->ee_block) + oext_alen - 1 < new_ext_end) { + ext4_error(orig_inode->i_sb, __func__, + "new_ext_end(%u) should be less than or equal to " + "oext->ee_block(%u) + oext_alen(%d) - 1", + new_ext_end, le32_to_cpu(oext->ee_block), + oext_alen); + ret = -EIO; + goto out; + } /* * Case: new_ext is smaller than original extent @@ -543,6 +557,7 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode, ret = mext_insert_extents(handle, orig_inode, orig_path, o_start, o_end, &start_ext, &new_ext, &end_ext); +out: return ret; } @@ -554,8 +569,10 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode, * @orig_off: block offset of original inode * @donor_off: block offset of donor inode * @max_count: the maximun length of extents + * + * Return 0 on success, or a negative error value on failure. */ -static void +static int mext_calc_swap_extents(struct ext4_extent *tmp_dext, struct ext4_extent *tmp_oext, ext4_lblk_t orig_off, ext4_lblk_t donor_off, @@ -564,6 +581,19 @@ mext_calc_swap_extents(struct ext4_extent *tmp_dext, ext4_lblk_t diff, orig_diff; struct ext4_extent dext_old, oext_old; + BUG_ON(orig_off != donor_off); + + /* original and donor extents have to cover the same block offset */ + if (orig_off < le32_to_cpu(tmp_oext->ee_block) || + le32_to_cpu(tmp_oext->ee_block) + + ext4_ext_get_actual_len(tmp_oext) - 1 < orig_off) + return -ENODATA; + + if (orig_off < le32_to_cpu(tmp_dext->ee_block) || + le32_to_cpu(tmp_dext->ee_block) + + ext4_ext_get_actual_len(tmp_dext) - 1 < orig_off) + return -ENODATA; + dext_old = *tmp_dext; oext_old = *tmp_oext; @@ -591,6 +621,8 @@ mext_calc_swap_extents(struct ext4_extent *tmp_dext, copy_extent_status(&oext_old, tmp_dext); copy_extent_status(&dext_old, tmp_oext); + + return 0; } /** @@ -601,6 +633,7 @@ mext_calc_swap_extents(struct ext4_extent *tmp_dext, * @donor_inode: donor inode * @from: block offset of orig_inode * @count: block count to be replaced + * @err: pointer to save return value * * Replace original inode extents and donor inode extents page by page. * We implement this replacement in the following three steps: @@ -611,33 +644,33 @@ mext_calc_swap_extents(struct ext4_extent *tmp_dext, * 3. Change the block information of donor inode to point at the saved * original inode blocks in the dummy extents. * - * Return 0 on success, or a negative error value on failure. + * Return replaced block count. */ static int mext_replace_branches(handle_t *handle, struct inode *orig_inode, struct inode *donor_inode, ext4_lblk_t from, - ext4_lblk_t count) + ext4_lblk_t count, int *err) { struct ext4_ext_path *orig_path = NULL; struct ext4_ext_path *donor_path = NULL; struct ext4_extent *oext, *dext; struct ext4_extent tmp_dext, tmp_oext; ext4_lblk_t orig_off = from, donor_off = from; - int err = 0; int depth; int replaced_count = 0; int dext_alen; - mext_double_down_write(orig_inode, donor_inode); + /* Protect extent trees against block allocations via delalloc */ + double_down_write_data_sem(orig_inode, donor_inode); /* Get the original extent for the block "orig_off" */ - get_ext_path(orig_path, orig_inode, orig_off, err); - if (orig_path == NULL) + *err = get_ext_path(orig_inode, orig_off, &orig_path); + if (*err) goto out; /* Get the donor extent for the head */ - get_ext_path(donor_path, donor_inode, donor_off, err); - if (donor_path == NULL) + *err = get_ext_path(donor_inode, donor_off, &donor_path); + if (*err) goto out; depth = ext_depth(orig_inode); oext = orig_path[depth].p_ext; @@ -647,24 +680,39 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode, dext = donor_path[depth].p_ext; tmp_dext = *dext; - mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off, + *err = mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off, donor_off, count); + if (*err) + goto out; /* Loop for the donor extents */ while (1) { /* The extent for donor must be found. */ - BUG_ON(!dext || donor_off != le32_to_cpu(tmp_dext.ee_block)); + if (!dext) { + ext4_error(donor_inode->i_sb, __func__, + "The extent for donor must be found"); + *err = -EIO; + goto out; + } else if (donor_off != le32_to_cpu(tmp_dext.ee_block)) { + ext4_error(donor_inode->i_sb, __func__, + "Donor offset(%u) and the first block of donor " + "extent(%u) should be equal", + donor_off, + le32_to_cpu(tmp_dext.ee_block)); + *err = -EIO; + goto out; + } /* Set donor extent to orig extent */ - err = mext_leaf_block(handle, orig_inode, + *err = mext_leaf_block(handle, orig_inode, orig_path, &tmp_dext, &orig_off); - if (err < 0) + if (*err) goto out; /* Set orig extent to donor extent */ - err = mext_leaf_block(handle, donor_inode, + *err = mext_leaf_block(handle, donor_inode, donor_path, &tmp_oext, &donor_off); - if (err < 0) + if (*err) goto out; dext_alen = ext4_ext_get_actual_len(&tmp_dext); @@ -678,36 +726,26 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode, if (orig_path) ext4_ext_drop_refs(orig_path); - get_ext_path(orig_path, orig_inode, orig_off, err); - if (orig_path == NULL) + *err = get_ext_path(orig_inode, orig_off, &orig_path); + if (*err) goto out; depth = ext_depth(orig_inode); oext = orig_path[depth].p_ext; - if (le32_to_cpu(oext->ee_block) + - ext4_ext_get_actual_len(oext) <= orig_off) { - err = 0; - goto out; - } tmp_oext = *oext; if (donor_path) ext4_ext_drop_refs(donor_path); - get_ext_path(donor_path, donor_inode, - donor_off, err); - if (donor_path == NULL) + *err = get_ext_path(donor_inode, donor_off, &donor_path); + if (*err) goto out; depth = ext_depth(donor_inode); dext = donor_path[depth].p_ext; - if (le32_to_cpu(dext->ee_block) + - ext4_ext_get_actual_len(dext) <= donor_off) { - err = 0; - goto out; - } tmp_dext = *dext; - mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off, - donor_off, - count - replaced_count); + *err = mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off, + donor_off, count - replaced_count); + if (*err) + goto out; } out: @@ -720,8 +758,12 @@ out: kfree(donor_path); } - mext_double_up_write(orig_inode, donor_inode); - return err; + ext4_ext_invalidate_cache(orig_inode); + ext4_ext_invalidate_cache(donor_inode); + + double_up_write_data_sem(orig_inode, donor_inode); + + return replaced_count; } /** @@ -733,16 +775,17 @@ out: * @data_offset_in_page: block index where data swapping starts * @block_len_in_page: the number of blocks to be swapped * @uninit: orig extent is uninitialized or not + * @err: pointer to save return value * * Save the data in original inode blocks and replace original inode extents * with donor inode extents by calling mext_replace_branches(). - * Finally, write out the saved data in new original inode blocks. Return 0 - * on success, or a negative error value on failure. + * Finally, write out the saved data in new original inode blocks. Return + * replaced block count. */ static int -move_extent_par_page(struct file *o_filp, struct inode *donor_inode, +move_extent_per_page(struct file *o_filp, struct inode *donor_inode, pgoff_t orig_page_offset, int data_offset_in_page, - int block_len_in_page, int uninit) + int block_len_in_page, int uninit, int *err) { struct inode *orig_inode = o_filp->f_dentry->d_inode; struct address_space *mapping = orig_inode->i_mapping; @@ -754,9 +797,11 @@ move_extent_par_page(struct file *o_filp, struct inode *donor_inode, long long offs = orig_page_offset << PAGE_CACHE_SHIFT; unsigned long blocksize = orig_inode->i_sb->s_blocksize; unsigned int w_flags = 0; - unsigned int tmp_data_len, data_len; + unsigned int tmp_data_size, data_size, replaced_size; void *fsdata; - int ret, i, jblocks; + int i, jblocks; + int err2 = 0; + int replaced_count = 0; int blocks_per_page = PAGE_CACHE_SIZE >> orig_inode->i_blkbits; /* @@ -766,8 +811,8 @@ move_extent_par_page(struct file *o_filp, struct inode *donor_inode, jblocks = ext4_writepage_trans_blocks(orig_inode) * 2; handle = ext4_journal_start(orig_inode, jblocks); if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - return ret; + *err = PTR_ERR(handle); + return 0; } if (segment_eq(get_fs(), KERNEL_DS)) @@ -783,39 +828,36 @@ move_extent_par_page(struct file *o_filp, struct inode *donor_inode, * Just swap data blocks between orig and donor. */ if (uninit) { - ret = mext_replace_branches(handle, orig_inode, - donor_inode, orig_blk_offset, - block_len_in_page); - - /* Clear the inode cache not to refer to the old data */ - ext4_ext_invalidate_cache(orig_inode); - ext4_ext_invalidate_cache(donor_inode); + replaced_count = mext_replace_branches(handle, orig_inode, + donor_inode, orig_blk_offset, + block_len_in_page, err); goto out2; } offs = (long long)orig_blk_offset << orig_inode->i_blkbits; - /* Calculate data_len */ + /* Calculate data_size */ if ((orig_blk_offset + block_len_in_page - 1) == ((orig_inode->i_size - 1) >> orig_inode->i_blkbits)) { /* Replace the last block */ - tmp_data_len = orig_inode->i_size & (blocksize - 1); + tmp_data_size = orig_inode->i_size & (blocksize - 1); /* - * If data_len equal zero, it shows data_len is multiples of + * If data_size equal zero, it shows data_size is multiples of * blocksize. So we set appropriate value. */ - if (tmp_data_len == 0) - tmp_data_len = blocksize; + if (tmp_data_size == 0) + tmp_data_size = blocksize; - data_len = tmp_data_len + + data_size = tmp_data_size + ((block_len_in_page - 1) << orig_inode->i_blkbits); - } else { - data_len = block_len_in_page << orig_inode->i_blkbits; - } + } else + data_size = block_len_in_page << orig_inode->i_blkbits; + + replaced_size = data_size; - ret = a_ops->write_begin(o_filp, mapping, offs, data_len, w_flags, + *err = a_ops->write_begin(o_filp, mapping, offs, data_size, w_flags, &page, &fsdata); - if (unlikely(ret < 0)) + if (unlikely(*err < 0)) goto out; if (!PageUptodate(page)) { @@ -836,14 +878,17 @@ move_extent_par_page(struct file *o_filp, struct inode *donor_inode, /* Release old bh and drop refs */ try_to_release_page(page, 0); - ret = mext_replace_branches(handle, orig_inode, donor_inode, - orig_blk_offset, block_len_in_page); - if (ret < 0) - goto out; - - /* Clear the inode cache not to refer to the old data */ - ext4_ext_invalidate_cache(orig_inode); - ext4_ext_invalidate_cache(donor_inode); + replaced_count = mext_replace_branches(handle, orig_inode, donor_inode, + orig_blk_offset, block_len_in_page, + &err2); + if (err2) { + if (replaced_count) { + block_len_in_page = replaced_count; + replaced_size = + block_len_in_page << orig_inode->i_blkbits; + } else + goto out; + } if (!page_has_buffers(page)) create_empty_buffers(page, 1 << orig_inode->i_blkbits, 0); @@ -853,16 +898,16 @@ move_extent_par_page(struct file *o_filp, struct inode *donor_inode, bh = bh->b_this_page; for (i = 0; i < block_len_in_page; i++) { - ret = ext4_get_block(orig_inode, + *err = ext4_get_block(orig_inode, (sector_t)(orig_blk_offset + i), bh, 0); - if (ret < 0) + if (*err < 0) goto out; if (bh->b_this_page != NULL) bh = bh->b_this_page; } - ret = a_ops->write_end(o_filp, mapping, offs, data_len, data_len, + *err = a_ops->write_end(o_filp, mapping, offs, data_size, replaced_size, page, fsdata); page = NULL; @@ -871,11 +916,15 @@ out: if (PageLocked(page)) unlock_page(page); page_cache_release(page); + ext4_journal_stop(handle); } out2: ext4_journal_stop(handle); - return ret < 0 ? ret : 0; + if (err2) + *err = err2; + + return replaced_count; } /** @@ -886,7 +935,6 @@ out2: * @orig_start: logical start offset in block for orig * @donor_start: logical start offset in block for donor * @len: the number of blocks to be moved - * @moved_len: moved block length * * Check the arguments of ext4_move_extents() whether the files can be * exchanged with each other. @@ -894,9 +942,13 @@ out2: */ static int mext_check_arguments(struct inode *orig_inode, - struct inode *donor_inode, __u64 orig_start, - __u64 donor_start, __u64 *len, __u64 moved_len) + struct inode *donor_inode, __u64 orig_start, + __u64 donor_start, __u64 *len) { + ext4_lblk_t orig_blocks, donor_blocks; + unsigned int blkbits = orig_inode->i_blkbits; + unsigned int blocksize = 1 << blkbits; + /* Regular file check */ if (!S_ISREG(orig_inode->i_mode) || !S_ISREG(donor_inode->i_mode)) { ext4_debug("ext4 move extent: The argument files should be " @@ -905,6 +957,13 @@ mext_check_arguments(struct inode *orig_inode, return -EINVAL; } + if (donor_inode->i_mode & (S_ISUID|S_ISGID)) { + ext4_debug("ext4 move extent: suid or sgid is set" + " to donor file [ino:orig %lu, donor %lu]\n", + orig_inode->i_ino, donor_inode->i_ino); + return -EINVAL; + } + /* Ext4 move extent does not support swapfile */ if (IS_SWAPFILE(orig_inode) || IS_SWAPFILE(donor_inode)) { ext4_debug("ext4 move extent: The argument files should " @@ -921,14 +980,6 @@ mext_check_arguments(struct inode *orig_inode, return -EINVAL; } - /* orig and donor should be different file */ - if (orig_inode->i_ino == donor_inode->i_ino) { - ext4_debug("ext4 move extent: The argument files should not " - "be same file [ino:orig %lu, donor %lu]\n", - orig_inode->i_ino, donor_inode->i_ino); - return -EINVAL; - } - /* Ext4 move extent supports only extent based file */ if (!(EXT4_I(orig_inode)->i_flags & EXT4_EXTENTS_FL)) { ext4_debug("ext4 move extent: orig file is not extents " @@ -953,13 +1004,6 @@ mext_check_arguments(struct inode *orig_inode, return -EINVAL; } - if (moved_len) { - ext4_debug("ext4 move extent: moved_len should be 0 " - "[ino:orig %lu, donor %lu]\n", orig_inode->i_ino, - donor_inode->i_ino); - return -EINVAL; - } - if ((orig_start > MAX_DEFRAG_SIZE) || (donor_start > MAX_DEFRAG_SIZE) || (*len > MAX_DEFRAG_SIZE) || @@ -971,43 +1015,47 @@ mext_check_arguments(struct inode *orig_inode, } if (orig_inode->i_size > donor_inode->i_size) { - if (orig_start >= donor_inode->i_size) { + donor_blocks = (donor_inode->i_size + blocksize - 1) >> blkbits; + /* TODO: eliminate this artificial restriction */ + if (orig_start >= donor_blocks) { ext4_debug("ext4 move extent: orig start offset " - "[%llu] should be less than donor file size " - "[%lld] [ino:orig %lu, donor_inode %lu]\n", - orig_start, donor_inode->i_size, + "[%llu] should be less than donor file blocks " + "[%u] [ino:orig %lu, donor %lu]\n", + orig_start, donor_blocks, orig_inode->i_ino, donor_inode->i_ino); return -EINVAL; } - if (orig_start + *len > donor_inode->i_size) { + /* TODO: eliminate this artificial restriction */ + if (orig_start + *len > donor_blocks) { ext4_debug("ext4 move extent: End offset [%llu] should " - "be less than donor file size [%lld]." - "So adjust length from %llu to %lld " + "be less than donor file blocks [%u]." + "So adjust length from %llu to %llu " "[ino:orig %lu, donor %lu]\n", - orig_start + *len, donor_inode->i_size, - *len, donor_inode->i_size - orig_start, + orig_start + *len, donor_blocks, + *len, donor_blocks - orig_start, orig_inode->i_ino, donor_inode->i_ino); - *len = donor_inode->i_size - orig_start; + *len = donor_blocks - orig_start; } } else { - if (orig_start >= orig_inode->i_size) { + orig_blocks = (orig_inode->i_size + blocksize - 1) >> blkbits; + if (orig_start >= orig_blocks) { ext4_debug("ext4 move extent: start offset [%llu] " - "should be less than original file size " - "[%lld] [inode:orig %lu, donor %lu]\n", - orig_start, orig_inode->i_size, + "should be less than original file blocks " + "[%u] [ino:orig %lu, donor %lu]\n", + orig_start, orig_blocks, orig_inode->i_ino, donor_inode->i_ino); return -EINVAL; } - if (orig_start + *len > orig_inode->i_size) { + if (orig_start + *len > orig_blocks) { ext4_debug("ext4 move extent: Adjust length " - "from %llu to %lld. Because it should be " - "less than original file size " + "from %llu to %llu. Because it should be " + "less than original file blocks " "[ino:orig %lu, donor %lu]\n", - *len, orig_inode->i_size - orig_start, + *len, orig_blocks - orig_start, orig_inode->i_ino, donor_inode->i_ino); - *len = orig_inode->i_size - orig_start; + *len = orig_blocks - orig_start; } } @@ -1027,18 +1075,23 @@ mext_check_arguments(struct inode *orig_inode, * @inode1: the inode structure * @inode2: the inode structure * - * Lock two inodes' i_mutex by i_ino order. This function is moved from - * fs/inode.c. + * Lock two inodes' i_mutex by i_ino order. + * If inode1 or inode2 is NULL, return -EIO. Otherwise, return 0. */ -static void +static int mext_inode_double_lock(struct inode *inode1, struct inode *inode2) { - if (inode1 == NULL || inode2 == NULL || inode1 == inode2) { - if (inode1) - mutex_lock(&inode1->i_mutex); - else if (inode2) - mutex_lock(&inode2->i_mutex); - return; + int ret = 0; + + BUG_ON(inode1 == NULL && inode2 == NULL); + + ret = mext_check_null_inode(inode1, inode2, __func__); + if (ret < 0) + goto out; + + if (inode1 == inode2) { + mutex_lock(&inode1->i_mutex); + goto out; } if (inode1->i_ino < inode2->i_ino) { @@ -1048,6 +1101,9 @@ mext_inode_double_lock(struct inode *inode1, struct inode *inode2) mutex_lock_nested(&inode2->i_mutex, I_MUTEX_PARENT); mutex_lock_nested(&inode1->i_mutex, I_MUTEX_CHILD); } + +out: + return ret; } /** @@ -1056,17 +1112,28 @@ mext_inode_double_lock(struct inode *inode1, struct inode *inode2) * @inode1: the inode that is released first * @inode2: the inode that is released second * - * This function is moved from fs/inode.c. + * If inode1 or inode2 is NULL, return -EIO. Otherwise, return 0. */ -static void +static int mext_inode_double_unlock(struct inode *inode1, struct inode *inode2) { + int ret = 0; + + BUG_ON(inode1 == NULL && inode2 == NULL); + + ret = mext_check_null_inode(inode1, inode2, __func__); + if (ret < 0) + goto out; + if (inode1) mutex_unlock(&inode1->i_mutex); if (inode2 && inode2 != inode1) mutex_unlock(&inode2->i_mutex); + +out: + return ret; } /** @@ -1123,70 +1190,84 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, ext4_lblk_t block_end, seq_start, add_blocks, file_end, seq_blocks = 0; ext4_lblk_t rest_blocks; pgoff_t orig_page_offset = 0, seq_end_page; - int ret, depth, last_extent = 0; + int ret1, ret2, depth, last_extent = 0; int blocks_per_page = PAGE_CACHE_SIZE >> orig_inode->i_blkbits; int data_offset_in_page; int block_len_in_page; int uninit; - /* protect orig and donor against a truncate */ - mext_inode_double_lock(orig_inode, donor_inode); + /* orig and donor should be different file */ + if (orig_inode->i_ino == donor_inode->i_ino) { + ext4_debug("ext4 move extent: The argument files should not " + "be same file [ino:orig %lu, donor %lu]\n", + orig_inode->i_ino, donor_inode->i_ino); + return -EINVAL; + } + + /* Protect orig and donor inodes against a truncate */ + ret1 = mext_inode_double_lock(orig_inode, donor_inode); + if (ret1 < 0) + return ret1; - mext_double_down_read(orig_inode, donor_inode); + /* Protect extent tree against block allocations via delalloc */ + double_down_write_data_sem(orig_inode, donor_inode); /* Check the filesystem environment whether move_extent can be done */ - ret = mext_check_arguments(orig_inode, donor_inode, orig_start, - donor_start, &len, *moved_len); - mext_double_up_read(orig_inode, donor_inode); - if (ret) - goto out2; + ret1 = mext_check_arguments(orig_inode, donor_inode, orig_start, + donor_start, &len); + if (ret1) + goto out; file_end = (i_size_read(orig_inode) - 1) >> orig_inode->i_blkbits; block_end = block_start + len - 1; if (file_end < block_end) len -= block_end - file_end; - get_ext_path(orig_path, orig_inode, block_start, ret); - if (orig_path == NULL) - goto out2; + ret1 = get_ext_path(orig_inode, block_start, &orig_path); + if (ret1) + goto out; /* Get path structure to check the hole */ - get_ext_path(holecheck_path, orig_inode, block_start, ret); - if (holecheck_path == NULL) + ret1 = get_ext_path(orig_inode, block_start, &holecheck_path); + if (ret1) goto out; depth = ext_depth(orig_inode); ext_cur = holecheck_path[depth].p_ext; - if (ext_cur == NULL) { - ret = -EINVAL; - goto out; - } /* - * Get proper extent whose ee_block is beyond block_start - * if block_start was within the hole. + * Get proper starting location of block replacement if block_start was + * within the hole. */ if (le32_to_cpu(ext_cur->ee_block) + ext4_ext_get_actual_len(ext_cur) - 1 < block_start) { + /* + * The hole exists between extents or the tail of + * original file. + */ last_extent = mext_next_extent(orig_inode, holecheck_path, &ext_cur); if (last_extent < 0) { - ret = last_extent; + ret1 = last_extent; goto out; } last_extent = mext_next_extent(orig_inode, orig_path, &ext_dummy); if (last_extent < 0) { - ret = last_extent; + ret1 = last_extent; goto out; } - } - seq_start = block_start; + seq_start = le32_to_cpu(ext_cur->ee_block); + } else if (le32_to_cpu(ext_cur->ee_block) > block_start) + /* The hole exists at the beginning of original file. */ + seq_start = le32_to_cpu(ext_cur->ee_block); + else + seq_start = block_start; /* No blocks within the specified range. */ if (le32_to_cpu(ext_cur->ee_block) > block_end) { ext4_debug("ext4 move extent: The specified range of file " "may be the hole\n"); - ret = -EINVAL; + ret1 = -EINVAL; goto out; } @@ -1206,7 +1287,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, last_extent = mext_next_extent(orig_inode, holecheck_path, &ext_cur); if (last_extent < 0) { - ret = last_extent; + ret1 = last_extent; break; } add_blocks = ext4_ext_get_actual_len(ext_cur); @@ -1246,29 +1327,39 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, seq_start = le32_to_cpu(ext_cur->ee_block); rest_blocks = seq_blocks; - /* Discard preallocations of two inodes */ - down_write(&EXT4_I(orig_inode)->i_data_sem); - ext4_discard_preallocations(orig_inode); - up_write(&EXT4_I(orig_inode)->i_data_sem); - - down_write(&EXT4_I(donor_inode)->i_data_sem); - ext4_discard_preallocations(donor_inode); - up_write(&EXT4_I(donor_inode)->i_data_sem); + /* + * Up semaphore to avoid following problems: + * a. transaction deadlock among ext4_journal_start, + * ->write_begin via pagefault, and jbd2_journal_commit + * b. racing with ->readpage, ->write_begin, and ext4_get_block + * in move_extent_per_page + */ + double_up_write_data_sem(orig_inode, donor_inode); while (orig_page_offset <= seq_end_page) { /* Swap original branches with new branches */ - ret = move_extent_par_page(o_filp, donor_inode, + block_len_in_page = move_extent_per_page( + o_filp, donor_inode, orig_page_offset, data_offset_in_page, - block_len_in_page, uninit); - if (ret < 0) - goto out; - orig_page_offset++; + block_len_in_page, uninit, + &ret1); + /* Count how many blocks we have exchanged */ *moved_len += block_len_in_page; - BUG_ON(*moved_len > len); + if (ret1 < 0) + break; + if (*moved_len > len) { + ext4_error(orig_inode->i_sb, __func__, + "We replaced blocks too much! " + "sum of replaced: %llu requested: %llu", + *moved_len, len); + ret1 = -EIO; + break; + } + orig_page_offset++; data_offset_in_page = 0; rest_blocks -= block_len_in_page; if (rest_blocks > blocks_per_page) @@ -1277,20 +1368,23 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, block_len_in_page = rest_blocks; } + double_down_write_data_sem(orig_inode, donor_inode); + if (ret1 < 0) + break; + /* Decrease buffer counter */ if (holecheck_path) ext4_ext_drop_refs(holecheck_path); - get_ext_path(holecheck_path, orig_inode, - seq_start, ret); - if (holecheck_path == NULL) + ret1 = get_ext_path(orig_inode, seq_start, &holecheck_path); + if (ret1) break; depth = holecheck_path->p_depth; /* Decrease buffer counter */ if (orig_path) ext4_ext_drop_refs(orig_path); - get_ext_path(orig_path, orig_inode, seq_start, ret); - if (orig_path == NULL) + ret1 = get_ext_path(orig_inode, seq_start, &orig_path); + if (ret1) break; ext_cur = holecheck_path[depth].p_ext; @@ -1299,6 +1393,11 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, } out: + if (*moved_len) { + ext4_discard_preallocations(orig_inode); + ext4_discard_preallocations(donor_inode); + } + if (orig_path) { ext4_ext_drop_refs(orig_path); kfree(orig_path); @@ -1307,14 +1406,13 @@ out: ext4_ext_drop_refs(holecheck_path); kfree(holecheck_path); } -out2: - mext_inode_double_unlock(orig_inode, donor_inode); - - if (ret) - return ret; + double_up_write_data_sem(orig_inode, donor_inode); + ret2 = mext_inode_double_unlock(orig_inode, donor_inode); - /* All of the specified blocks must be exchanged in succeed */ - BUG_ON(*moved_len != len); + if (ret1) + return ret1; + else if (ret2) + return ret2; return 0; } diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index de04013..9dcd686 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1292,9 +1292,6 @@ errout: * add_dirent_to_buf will attempt search the directory block for * space. It will return -ENOSPC if no space is available, and -EIO * and -EEXIST if directory entry already exists. - * - * NOTE! bh is NOT released in the case where ENOSPC is returned. In - * all other cases bh is released. */ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry, struct inode *inode, struct ext4_dir_entry_2 *de, @@ -1315,14 +1312,10 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry, top = bh->b_data + blocksize - reclen; while ((char *) de <= top) { if (!ext4_check_dir_entry("ext4_add_entry", dir, de, - bh, offset)) { - brelse(bh); + bh, offset)) return -EIO; - } - if (ext4_match(namelen, name, de)) { - brelse(bh); + if (ext4_match(namelen, name, de)) return -EEXIST; - } nlen = EXT4_DIR_REC_LEN(de->name_len); rlen = ext4_rec_len_from_disk(de->rec_len, blocksize); if ((de->inode? rlen - nlen: rlen) >= reclen) @@ -1337,7 +1330,6 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry, err = ext4_journal_get_write_access(handle, bh); if (err) { ext4_std_error(dir->i_sb, err); - brelse(bh); return err; } @@ -1377,7 +1369,6 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry, err = ext4_handle_dirty_metadata(handle, dir, bh); if (err) ext4_std_error(dir->i_sb, err); - brelse(bh); return 0; } @@ -1471,7 +1462,9 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry, if (!(de)) return retval; - return add_dirent_to_buf(handle, dentry, inode, de, bh); + retval = add_dirent_to_buf(handle, dentry, inode, de, bh); + brelse(bh); + return retval; } /* @@ -1514,8 +1507,10 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, if(!bh) return retval; retval = add_dirent_to_buf(handle, dentry, inode, NULL, bh); - if (retval != -ENOSPC) + if (retval != -ENOSPC) { + brelse(bh); return retval; + } if (blocks == 1 && !dx_fallback && EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_DIR_INDEX)) @@ -1528,7 +1523,9 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, de = (struct ext4_dir_entry_2 *) bh->b_data; de->inode = 0; de->rec_len = ext4_rec_len_to_disk(blocksize, blocksize); - return add_dirent_to_buf(handle, dentry, inode, de, bh); + retval = add_dirent_to_buf(handle, dentry, inode, de, bh); + brelse(bh); + return retval; } /* @@ -1561,10 +1558,8 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, goto journal_error; err = add_dirent_to_buf(handle, dentry, inode, NULL, bh); - if (err != -ENOSPC) { - bh = NULL; + if (err != -ENOSPC) goto cleanup; - } /* Block full, should compress but for now just split */ dxtrace(printk(KERN_DEBUG "using %u of %u node entries\n", @@ -1590,9 +1585,9 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, goto cleanup; node2 = (struct dx_node *)(bh2->b_data); entries2 = node2->entries; + memset(&node2->fake, 0, sizeof(struct fake_dirent)); node2->fake.rec_len = ext4_rec_len_to_disk(sb->s_blocksize, sb->s_blocksize); - node2->fake.inode = 0; BUFFER_TRACE(frame->bh, "get_write_access"); err = ext4_journal_get_write_access(handle, frame->bh); if (err) @@ -1657,7 +1652,6 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, if (!de) goto cleanup; err = add_dirent_to_buf(handle, dentry, inode, de, bh); - bh = NULL; goto cleanup; journal_error: @@ -1775,7 +1769,7 @@ static int ext4_create(struct inode *dir, struct dentry *dentry, int mode, retry: handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 + - 2*EXT4_QUOTA_INIT_BLOCKS(dir->i_sb)); + EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb)); if (IS_ERR(handle)) return PTR_ERR(handle); @@ -1809,7 +1803,7 @@ static int ext4_mknod(struct inode *dir, struct dentry *dentry, retry: handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 + - 2*EXT4_QUOTA_INIT_BLOCKS(dir->i_sb)); + EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb)); if (IS_ERR(handle)) return PTR_ERR(handle); @@ -1846,7 +1840,7 @@ static int ext4_mkdir(struct inode *dir, struct dentry *dentry, int mode) retry: handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 + - 2*EXT4_QUOTA_INIT_BLOCKS(dir->i_sb)); + EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb)); if (IS_ERR(handle)) return PTR_ERR(handle); @@ -2068,7 +2062,8 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode) struct ext4_iloc iloc; int err = 0; - if (!ext4_handle_valid(handle)) + /* ext4_handle_valid() assumes a valid handle_t pointer */ + if (handle && !ext4_handle_valid(handle)) return 0; mutex_lock(&EXT4_SB(inode->i_sb)->s_orphan_lock); @@ -2258,7 +2253,7 @@ static int ext4_symlink(struct inode *dir, retry: handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 5 + - 2*EXT4_QUOTA_INIT_BLOCKS(dir->i_sb)); + EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb)); if (IS_ERR(handle)) return PTR_ERR(handle); @@ -2310,7 +2305,7 @@ static int ext4_link(struct dentry *old_dentry, struct inode *inode = old_dentry->d_inode; int err, retries = 0; - if (EXT4_DIR_LINK_MAX(inode)) + if (inode->i_nlink >= EXT4_LINK_MAX) return -EMLINK; /* @@ -2413,7 +2408,7 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, goto end_rename; retval = -EMLINK; if (!new_inode && new_dir != old_dir && - new_dir->i_nlink >= EXT4_LINK_MAX) + EXT4_DIR_LINK_MAX(new_dir)) goto end_rename; } if (!new_bh) { diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 68b0351..96302cd 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -247,7 +247,7 @@ static int setup_new_group_blocks(struct super_block *sb, goto exit_bh; if (IS_ERR(gdb = bclean(handle, sb, block))) { - err = PTR_ERR(bh); + err = PTR_ERR(gdb); goto exit_bh; } ext4_handle_dirty_metadata(handle, NULL, gdb); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 8f4f079..ed38f25 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -45,6 +45,7 @@ #include "ext4_jbd2.h" #include "xattr.h" #include "acl.h" +#include "mballoc.h" #define CREATE_TRACE_POINTS #include @@ -188,6 +189,36 @@ void ext4_itable_unused_set(struct super_block *sb, bg->bg_itable_unused_hi = cpu_to_le16(count >> 16); } + +/* Just increment the non-pointer handle value */ +static handle_t *ext4_get_nojournal(void) +{ + handle_t *handle = current->journal_info; + unsigned long ref_cnt = (unsigned long)handle; + + BUG_ON(ref_cnt >= EXT4_NOJOURNAL_MAX_REF_COUNT); + + ref_cnt++; + handle = (handle_t *)ref_cnt; + + current->journal_info = handle; + return handle; +} + + +/* Decrement the non-pointer handle value */ +static void ext4_put_nojournal(handle_t *handle) +{ + unsigned long ref_cnt = (unsigned long)handle; + + BUG_ON(ref_cnt == 0); + + ref_cnt--; + handle = (handle_t *)ref_cnt; + + current->journal_info = handle; +} + /* * Wrappers for jbd2_journal_start/end. * @@ -214,11 +245,7 @@ handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks) } return jbd2_journal_start(journal, nblocks); } - /* - * We're not journaling, return the appropriate indication. - */ - current->journal_info = EXT4_NOJOURNAL_HANDLE; - return current->journal_info; + return ext4_get_nojournal(); } /* @@ -234,11 +261,7 @@ int __ext4_journal_stop(const char *where, handle_t *handle) int rc; if (!ext4_handle_valid(handle)) { - /* - * Do this here since we don't call jbd2_journal_stop() in - * no-journal mode. - */ - current->journal_info = NULL; + ext4_put_nojournal(handle); return 0; } sb = handle->h_transaction->t_journal->j_private; @@ -344,7 +367,8 @@ static const char *ext4_decode_error(struct super_block *sb, int errno, errstr = "Out of memory"; break; case -EROFS: - if (!sb || EXT4_SB(sb)->s_journal->j_flags & JBD2_ABORT) + if (!sb || (EXT4_SB(sb)->s_journal && + EXT4_SB(sb)->s_journal->j_flags & JBD2_ABORT)) errstr = "Journal has aborted"; else errstr = "Readonly filesystem"; @@ -578,15 +602,14 @@ static void ext4_put_super(struct super_block *sb) struct ext4_super_block *es = sbi->s_es; int i, err; + flush_workqueue(sbi->dio_unwritten_wq); + destroy_workqueue(sbi->dio_unwritten_wq); + lock_super(sb); lock_kernel(); if (sb->s_dirt) ext4_commit_super(sb, 1); - ext4_release_system_zone(sb); - ext4_mb_release(sb); - ext4_ext_release(sb); - ext4_xattr_put_super(sb); if (sbi->s_journal) { err = jbd2_journal_destroy(sbi->s_journal); sbi->s_journal = NULL; @@ -594,6 +617,12 @@ static void ext4_put_super(struct super_block *sb) ext4_abort(sb, __func__, "Couldn't clean up the journal"); } + + ext4_release_system_zone(sb); + ext4_mb_release(sb); + ext4_ext_release(sb); + ext4_xattr_put_super(sb); + if (!(sb->s_flags & MS_RDONLY)) { EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); es->s_state = cpu_to_le16(sbi->s_mount_state); @@ -682,6 +711,10 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) ei->i_allocated_meta_blocks = 0; ei->i_delalloc_reserved_flag = 0; spin_lock_init(&(ei->i_block_reservation_lock)); + INIT_LIST_HEAD(&ei->i_aio_dio_complete_list); + ei->cur_aio_dio = NULL; + ei->i_sync_tid = 0; + ei->i_datasync_tid = 0; return &ei->vfs_inode; } @@ -877,6 +910,12 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs) if (test_opt(sb, NO_AUTO_DA_ALLOC)) seq_puts(seq, ",noauto_da_alloc"); + if (test_opt(sb, DISCARD)) + seq_puts(seq, ",discard"); + + if (test_opt(sb, NOLOAD)) + seq_puts(seq, ",norecovery"); + ext4_show_quota_options(seq, sb); return 0; @@ -1057,7 +1096,8 @@ enum { Opt_usrquota, Opt_grpquota, Opt_i_version, Opt_stripe, Opt_delalloc, Opt_nodelalloc, Opt_block_validity, Opt_noblock_validity, - Opt_inode_readahead_blks, Opt_journal_ioprio + Opt_inode_readahead_blks, Opt_journal_ioprio, + Opt_discard, Opt_nodiscard, }; static const match_table_t tokens = { @@ -1082,6 +1122,7 @@ static const match_table_t tokens = { {Opt_acl, "acl"}, {Opt_noacl, "noacl"}, {Opt_noload, "noload"}, + {Opt_noload, "norecovery"}, {Opt_nobh, "nobh"}, {Opt_bh, "bh"}, {Opt_commit, "commit=%u"}, @@ -1123,6 +1164,8 @@ static const match_table_t tokens = { {Opt_auto_da_alloc, "auto_da_alloc=%u"}, {Opt_auto_da_alloc, "auto_da_alloc"}, {Opt_noauto_da_alloc, "noauto_da_alloc"}, + {Opt_discard, "discard"}, + {Opt_nodiscard, "nodiscard"}, {Opt_err, NULL}, }; @@ -1551,6 +1594,12 @@ set_qf_format: else set_opt(sbi->s_mount_opt,NO_AUTO_DA_ALLOC); break; + case Opt_discard: + set_opt(sbi->s_mount_opt, DISCARD); + break; + case Opt_nodiscard: + clear_opt(sbi->s_mount_opt, DISCARD); + break; default: ext4_msg(sb, KERN_ERR, "Unrecognized mount option \"%s\" " @@ -1666,14 +1715,14 @@ static int ext4_fill_flex_info(struct super_block *sb) size_t size; int i; - if (!sbi->s_es->s_log_groups_per_flex) { + sbi->s_log_groups_per_flex = sbi->s_es->s_log_groups_per_flex; + groups_per_flex = 1 << sbi->s_log_groups_per_flex; + + if (groups_per_flex < 2) { sbi->s_log_groups_per_flex = 0; return 1; } - sbi->s_log_groups_per_flex = sbi->s_es->s_log_groups_per_flex; - groups_per_flex = 1 << sbi->s_log_groups_per_flex; - /* We allocate both existing and potentially added groups */ flex_group_count = ((sbi->s_groups_count + groups_per_flex - 1) + ((le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) + 1) << @@ -1695,12 +1744,12 @@ static int ext4_fill_flex_info(struct super_block *sb) gdp = ext4_get_group_desc(sb, i, NULL); flex_group = ext4_flex_group(sbi, i); - atomic_set(&sbi->s_flex_groups[flex_group].free_inodes, - ext4_free_inodes_count(sb, gdp)); - atomic_set(&sbi->s_flex_groups[flex_group].free_blocks, - ext4_free_blks_count(sb, gdp)); - atomic_set(&sbi->s_flex_groups[flex_group].used_dirs, - ext4_used_dirs_count(sb, gdp)); + atomic_add(ext4_free_inodes_count(sb, gdp), + &sbi->s_flex_groups[flex_group].free_inodes); + atomic_add(ext4_free_blks_count(sb, gdp), + &sbi->s_flex_groups[flex_group].free_blocks); + atomic_add(ext4_used_dirs_count(sb, gdp), + &sbi->s_flex_groups[flex_group].used_dirs); } return 1; @@ -2197,6 +2246,7 @@ EXT4_RW_ATTR_SBI_UI(mb_min_to_scan, s_mb_min_to_scan); EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs); EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request); EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc); +EXT4_RW_ATTR_SBI_UI(max_writeback_mb_bump, s_max_writeback_mb_bump); static struct attribute *ext4_attrs[] = { ATTR_LIST(delayed_allocation_blocks), @@ -2210,6 +2260,7 @@ static struct attribute *ext4_attrs[] = { ATTR_LIST(mb_order2_req), ATTR_LIST(mb_stream_req), ATTR_LIST(mb_group_prealloc), + ATTR_LIST(max_writeback_mb_bump), NULL, }; @@ -2253,6 +2304,49 @@ static struct kobj_type ext4_ktype = { .release = ext4_sb_release, }; +/* + * Check whether this filesystem can be mounted based on + * the features present and the RDONLY/RDWR mount requested. + * Returns 1 if this filesystem can be mounted as requested, + * 0 if it cannot be. + */ +static int ext4_feature_set_ok(struct super_block *sb, int readonly) +{ + if (EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT4_FEATURE_INCOMPAT_SUPP)) { + ext4_msg(sb, KERN_ERR, + "Couldn't mount because of " + "unsupported optional features (%x)", + (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_incompat) & + ~EXT4_FEATURE_INCOMPAT_SUPP)); + return 0; + } + + if (readonly) + return 1; + + /* Check that feature set is OK for a read-write mount */ + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT4_FEATURE_RO_COMPAT_SUPP)) { + ext4_msg(sb, KERN_ERR, "couldn't mount RDWR because of " + "unsupported optional features (%x)", + (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_ro_compat) & + ~EXT4_FEATURE_RO_COMPAT_SUPP)); + return 0; + } + /* + * Large file size enabled file system can only be mounted + * read-write on 32-bit systems if kernel is built with CONFIG_LBDAF + */ + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_HUGE_FILE)) { + if (sizeof(blkcnt_t) < sizeof(u64)) { + ext4_msg(sb, KERN_ERR, "Filesystem with huge files " + "cannot be mounted RDWR without " + "CONFIG_LBDAF"); + return 0; + } + } + return 1; +} + static int ext4_fill_super(struct super_block *sb, void *data, int silent) __releases(kernel_lock) __acquires(kernel_lock) @@ -2274,7 +2368,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) unsigned int db_count; unsigned int i; int needs_recovery, has_huge_files; - int features; __u64 blocks_count; int err; unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO; @@ -2401,39 +2494,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) * previously didn't change the revision level when setting the flags, * so there is a chance incompat flags are set on a rev 0 filesystem. */ - features = EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT4_FEATURE_INCOMPAT_SUPP); - if (features) { - ext4_msg(sb, KERN_ERR, - "Couldn't mount because of " - "unsupported optional features (%x)", - (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_incompat) & - ~EXT4_FEATURE_INCOMPAT_SUPP)); - goto failed_mount; - } - features = EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT4_FEATURE_RO_COMPAT_SUPP); - if (!(sb->s_flags & MS_RDONLY) && features) { - ext4_msg(sb, KERN_ERR, - "Couldn't mount RDWR because of " - "unsupported optional features (%x)", - (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_ro_compat) & - ~EXT4_FEATURE_RO_COMPAT_SUPP)); + if (!ext4_feature_set_ok(sb, (sb->s_flags & MS_RDONLY))) goto failed_mount; - } - has_huge_files = EXT4_HAS_RO_COMPAT_FEATURE(sb, - EXT4_FEATURE_RO_COMPAT_HUGE_FILE); - if (has_huge_files) { - /* - * Large file size enabled file system can only be - * mount if kernel is build with CONFIG_LBDAF - */ - if (sizeof(root->i_blocks) < sizeof(u64) && - !(sb->s_flags & MS_RDONLY)) { - ext4_msg(sb, KERN_ERR, "Filesystem with huge " - "files cannot be mounted read-write " - "without CONFIG_LBDAF"); - goto failed_mount; - } - } + blocksize = BLOCK_SIZE << le32_to_cpu(es->s_log_block_size); if (blocksize < EXT4_MIN_BLOCK_SIZE || @@ -2469,6 +2532,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) } } + has_huge_files = EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_HUGE_FILE); sbi->s_bitmap_maxbytes = ext4_max_bitmap_size(sb->s_blocksize_bits, has_huge_files); sb->s_maxbytes = ext4_max_size(sb->s_blocksize_bits, has_huge_files); @@ -2549,12 +2614,19 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) goto failed_mount; } - if (ext4_blocks_count(es) > - (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) { + /* + * Test whether we have more sectors than will fit in sector_t, + * and whether the max offset is addressable by the page cache. + */ + if ((ext4_blocks_count(es) > + (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) || + (ext4_blocks_count(es) > + (pgoff_t)(~0ULL) >> (PAGE_CACHE_SHIFT - sb->s_blocksize_bits))) { ext4_msg(sb, KERN_ERR, "filesystem" - " too large to mount safely"); + " too large to mount safely on this system"); if (sizeof(sector_t) < 8) ext4_msg(sb, KERN_WARNING, "CONFIG_LBDAF not enabled"); + ret = -EFBIG; goto failed_mount; } @@ -2595,6 +2667,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) goto failed_mount; } sbi->s_groups_count = blocks_count; + sbi->s_blockfile_groups = min_t(ext4_group_t, sbi->s_groups_count, + (EXT4_MAX_BLOCK_FILE_PHYS / EXT4_BLOCKS_PER_GROUP(sb))); db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) / EXT4_DESC_PER_BLOCK(sb); sbi->s_group_desc = kmalloc(db_count * sizeof(struct buffer_head *), @@ -2656,6 +2730,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) } sbi->s_stripe = ext4_get_stripe_size(sbi); + sbi->s_max_writeback_mb_bump = 128; /* * set up enough so that it can read an inode @@ -2781,6 +2856,12 @@ no_journal: clear_opt(sbi->s_mount_opt, NOBH); } } + EXT4_SB(sb)->dio_unwritten_wq = create_workqueue("ext4-dio-unwritten"); + if (!EXT4_SB(sb)->dio_unwritten_wq) { + printk(KERN_ERR "EXT4-fs: failed to create DIO workqueue\n"); + goto failed_mount_wq; + } + /* * The jbd2_journal_load will have done any necessary log recovery, * so we can safely mount the rest of the filesystem now. @@ -2893,6 +2974,8 @@ cantfind_ext4: failed_mount4: ext4_msg(sb, KERN_ERR, "mount failed"); + destroy_workqueue(EXT4_SB(sb)->dio_unwritten_wq); +failed_mount_wq: ext4_release_system_zone(sb); if (sbi->s_journal) { jbd2_journal_destroy(sbi->s_journal); @@ -3208,7 +3291,18 @@ static int ext4_commit_super(struct super_block *sb, int sync) clear_buffer_write_io_error(sbh); set_buffer_uptodate(sbh); } - es->s_wtime = cpu_to_le32(get_seconds()); + /* + * If the file system is mounted read-only, don't update the + * superblock write time. This avoids updating the superblock + * write time when we are mounting the root file system + * read/only but we need to replay the journal; at that point, + * for people who are east of GMT and who make their clock + * tick in localtime for Windows bug-for-bug compatibility, + * the clock is set in the future, and this will cause e2fsck + * to complain and force a full file system check. + */ + if (!(sb->s_flags & MS_RDONLY)) + es->s_wtime = cpu_to_le32(get_seconds()); es->s_kbytes_written = cpu_to_le64(EXT4_SB(sb)->s_kbytes_written + ((part_stat_read(sb->s_bdev->bd_part, sectors[1]) - @@ -3333,11 +3427,13 @@ static int ext4_sync_fs(struct super_block *sb, int wait) { int ret = 0; tid_t target; + struct ext4_sb_info *sbi = EXT4_SB(sb); trace_ext4_sync_fs(sb, wait); - if (jbd2_journal_start_commit(EXT4_SB(sb)->s_journal, &target)) { + flush_workqueue(sbi->dio_unwritten_wq); + if (jbd2_journal_start_commit(sbi->s_journal, &target)) { if (wait) - jbd2_log_wait_commit(EXT4_SB(sb)->s_journal, target); + jbd2_log_wait_commit(sbi->s_journal, target); } return ret; } @@ -3477,18 +3573,11 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) if (sbi->s_journal) ext4_mark_recovery_complete(sb, es); } else { - int ret; - if ((ret = EXT4_HAS_RO_COMPAT_FEATURE(sb, - ~EXT4_FEATURE_RO_COMPAT_SUPP))) { - ext4_msg(sb, KERN_WARNING, "couldn't " - "remount RDWR because of unsupported " - "optional features (%x)", - (le32_to_cpu(sbi->s_es->s_feature_ro_compat) & - ~EXT4_FEATURE_RO_COMPAT_SUPP)); + /* Make sure we can mount this feature set readwrite */ + if (!ext4_feature_set_ok(sb, 0)) { err = -EROFS; goto restore_opts; } - /* * Make sure the group descriptor checksums * are sane. If they aren't, refuse to remount r/w. @@ -3624,13 +3713,11 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_blocks = ext4_blocks_count(es) - sbi->s_overhead_last; buf->f_bfree = percpu_counter_sum_positive(&sbi->s_freeblocks_counter) - percpu_counter_sum_positive(&sbi->s_dirtyblocks_counter); - ext4_free_blocks_count_set(es, buf->f_bfree); buf->f_bavail = buf->f_bfree - ext4_r_blocks_count(es); if (buf->f_bfree < ext4_r_blocks_count(es)) buf->f_bavail = 0; buf->f_files = le32_to_cpu(es->s_inodes_count); buf->f_ffree = percpu_counter_sum_positive(&sbi->s_freeinodes_counter); - es->s_free_inodes_count = cpu_to_le32(buf->f_ffree); buf->f_namelen = EXT4_NAME_LEN; fsid = le64_to_cpup((void *)es->s_uuid) ^ le64_to_cpup((void *)es->s_uuid + sizeof(u64)); diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 62b31c2..0257019 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -810,12 +810,23 @@ inserted: get_bh(new_bh); } else { /* We need to allocate a new block */ - ext4_fsblk_t goal = ext4_group_first_block_no(sb, + ext4_fsblk_t goal, block; + + goal = ext4_group_first_block_no(sb, EXT4_I(inode)->i_block_group); - ext4_fsblk_t block = ext4_new_meta_blocks(handle, inode, + + /* non-extent files can't have physical blocks past 2^32 */ + if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) + goal = goal & EXT4_MAX_BLOCK_FILE_PHYS; + + block = ext4_new_meta_blocks(handle, inode, goal, NULL, &error); if (error) goto cleanup; + + if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) + BUG_ON(block > EXT4_MAX_BLOCK_FILE_PHYS); + ea_idebug(inode, "creating block %d", block); new_bh = sb_getblk(sb, block); @@ -977,6 +988,10 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, if (error) goto cleanup; + error = ext4_journal_get_write_access(handle, is.iloc.bh); + if (error) + goto cleanup; + if (EXT4_I(inode)->i_state & EXT4_STATE_NEW) { struct ext4_inode *raw_inode = ext4_raw_inode(&is.iloc); memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size); @@ -1002,9 +1017,6 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, if (flags & XATTR_CREATE) goto cleanup; } - error = ext4_journal_get_write_access(handle, is.iloc.bh); - if (error) - goto cleanup; if (!value) { if (!is.s.not_found) error = ext4_xattr_ibody_set(handle, inode, &i, &is); diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 7b4088b..8cf902a 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -636,6 +636,10 @@ void jbd2_journal_commit_transaction(journal_t *journal) JBUFFER_TRACE(jh, "ph3: write metadata"); flags = jbd2_journal_write_metadata_buffer(commit_transaction, jh, &new_jh, blocknr); + if (flags < 0) { + jbd2_journal_abort(journal, flags); + continue; + } set_bit(BH_JWrite, &jh2bh(new_jh)->b_state); wbuf[bufs++] = jh2bh(new_jh); diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index e378cb3..4b74149 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -78,6 +78,7 @@ EXPORT_SYMBOL(jbd2_journal_errno); EXPORT_SYMBOL(jbd2_journal_ack_err); EXPORT_SYMBOL(jbd2_journal_clear_err); EXPORT_SYMBOL(jbd2_log_wait_commit); +EXPORT_SYMBOL(jbd2_log_start_commit); EXPORT_SYMBOL(jbd2_journal_start_commit); EXPORT_SYMBOL(jbd2_journal_force_commit_nested); EXPORT_SYMBOL(jbd2_journal_wipe); @@ -361,6 +362,10 @@ repeat: jbd_unlock_bh_state(bh_in); tmp = jbd2_alloc(bh_in->b_size, GFP_NOFS); + if (!tmp) { + jbd2_journal_put_journal_head(new_jh); + return -ENOMEM; + } jbd_lock_bh_state(bh_in); if (jh_in->b_frozen_data) { jbd2_free(tmp, bh_in->b_size); @@ -1187,6 +1192,12 @@ static int journal_reset(journal_t *journal) first = be32_to_cpu(sb->s_first); last = be32_to_cpu(sb->s_maxlen); + if (first + JBD2_MIN_JOURNAL_BLOCKS > last + 1) { + printk(KERN_ERR "JBD: Journal too short (blocks %llu-%llu).\n", + first, last); + journal_fail_superblock(journal); + return -EINVAL; + } journal->j_first = first; journal->j_last = last; diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 6213ac7..a051270 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -57,7 +57,7 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction) INIT_LIST_HEAD(&transaction->t_private_list); /* Set up the commit timer for the new transaction. */ - journal->j_commit_timer.expires = round_jiffies(transaction->t_expires); + journal->j_commit_timer.expires = round_jiffies_up(transaction->t_expires); add_timer(&journal->j_commit_timer); J_ASSERT(journal->j_running_transaction == NULL); @@ -238,6 +238,8 @@ repeat_locked: __jbd2_log_space_left(journal)); spin_unlock(&transaction->t_handle_lock); spin_unlock(&journal->j_state_lock); + + lock_map_acquire(&handle->h_lockdep_map); out: if (unlikely(new_transaction)) /* It's usually NULL */ kfree(new_transaction); @@ -303,8 +305,6 @@ handle_t *jbd2_journal_start(journal_t *journal, int nblocks) handle = ERR_PTR(err); goto out; } - - lock_map_acquire(&handle->h_lockdep_map); out: return handle; } @@ -426,6 +426,7 @@ int jbd2_journal_restart(handle_t *handle, int nblocks) __jbd2_log_start_commit(journal, transaction->t_tid); spin_unlock(&journal->j_state_lock); + lock_map_release(&handle->h_lockdep_map); handle->h_buffer_credits = nblocks; ret = start_this_handle(journal, handle); return ret; diff --git a/include/linux/sched.h b/include/linux/sched.h index 0f1ea4a..d3e910b 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1999,11 +1999,18 @@ static inline int is_si_special(const struct siginfo *info) return info <= SEND_SIG_FORCED; } -/* True if we are on the alternate signal stack. */ - +/* + * True if we are on the alternate signal stack. + */ static inline int on_sig_stack(unsigned long sp) { - return (sp - current->sas_ss_sp < current->sas_ss_size); +#ifdef CONFIG_STACK_GROWSUP + return sp >= current->sas_ss_sp && + sp - current->sas_ss_sp < current->sas_ss_size; +#else + return sp > current->sas_ss_sp && + sp - current->sas_ss_sp <= current->sas_ss_size; +#endif } static inline int sas_ss_flags(unsigned long sp) diff --git a/include/scsi/osd_protocol.h b/include/scsi/osd_protocol.h index 2cc8e8b..6856612 100644 --- a/include/scsi/osd_protocol.h +++ b/include/scsi/osd_protocol.h @@ -17,6 +17,7 @@ #define __OSD_PROTOCOL_H__ #include +#include #include #include diff --git a/include/scsi/scsi_host.h b/include/scsi/scsi_host.h index b62a097..6cc72e2 100644 --- a/include/scsi/scsi_host.h +++ b/include/scsi/scsi_host.h @@ -677,6 +677,12 @@ struct Scsi_Host { void *shost_data; /* + * Points to the physical bus device we'd use to do DMA + * Needed just in case we have virtual hosts. + */ + struct device *dma_dev; + + /* * We should ensure that this is aligned, both for better performance * and also because some compilers (m68k) don't automatically force * alignment to a long boundary. @@ -720,7 +726,9 @@ extern int scsi_queue_work(struct Scsi_Host *, struct work_struct *); extern void scsi_flush_work(struct Scsi_Host *); extern struct Scsi_Host *scsi_host_alloc(struct scsi_host_template *, int); -extern int __must_check scsi_add_host(struct Scsi_Host *, struct device *); +extern int __must_check scsi_add_host_with_dma(struct Scsi_Host *, + struct device *, + struct device *); extern void scsi_scan_host(struct Scsi_Host *); extern void scsi_rescan_device(struct device *); extern void scsi_remove_host(struct Scsi_Host *); @@ -731,6 +739,12 @@ extern const char *scsi_host_state_name(enum scsi_host_state); extern u64 scsi_calculate_bounce_limit(struct Scsi_Host *); +static inline int __must_check scsi_add_host(struct Scsi_Host *host, + struct device *dev) +{ + return scsi_add_host_with_dma(host, dev, dev); +} + static inline struct device *scsi_get_device(struct Scsi_Host *shost) { return shost->shost_gendev.parent; diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h index 7d8b5bc..824979e 100644 --- a/include/trace/events/ext4.h +++ b/include/trace/events/ext4.h @@ -5,10 +5,12 @@ #define _TRACE_EXT4_H #include -#include "../../../fs/ext4/ext4.h" -#include "../../../fs/ext4/mballoc.h" #include +struct ext4_allocation_context; +struct ext4_allocation_request; +struct ext4_prealloc_space; + TRACE_EVENT(ext4_free_inode, TP_PROTO(struct inode *inode), @@ -229,6 +231,7 @@ TRACE_EVENT(ext4_da_writepages, __field( char, for_reclaim ) __field( char, for_writepages ) __field( char, range_cyclic ) + __field( pgoff_t, writeback_index ) ), TP_fast_assign( @@ -243,14 +246,51 @@ TRACE_EVENT(ext4_da_writepages, __entry->for_reclaim = wbc->for_reclaim; __entry->for_writepages = wbc->for_writepages; __entry->range_cyclic = wbc->range_cyclic; + __entry->writeback_index = inode->i_mapping->writeback_index; ), - TP_printk("dev %s ino %lu nr_t_write %ld pages_skipped %ld range_start %llu range_end %llu nonblocking %d for_kupdate %d for_reclaim %d for_writepages %d range_cyclic %d", - jbd2_dev_to_name(__entry->dev), __entry->ino, __entry->nr_to_write, + TP_printk("dev %s ino %lu nr_to_write %ld pages_skipped %ld range_start %llu range_end %llu nonblocking %d for_kupdate %d for_reclaim %d for_writepages %d range_cyclic %d writeback_index %lu", + jbd2_dev_to_name(__entry->dev), + (unsigned long) __entry->ino, __entry->nr_to_write, __entry->pages_skipped, __entry->range_start, __entry->range_end, __entry->nonblocking, __entry->for_kupdate, __entry->for_reclaim, - __entry->for_writepages, __entry->range_cyclic) + __entry->for_writepages, __entry->range_cyclic, + (unsigned long) __entry->writeback_index) +); + +TRACE_EVENT(ext4_da_write_pages, + TP_PROTO(struct inode *inode, struct mpage_da_data *mpd), + + TP_ARGS(inode, mpd), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( ino_t, ino ) + __field( __u64, b_blocknr ) + __field( __u32, b_size ) + __field( __u32, b_state ) + __field( unsigned long, first_page ) + __field( int, io_done ) + __field( int, pages_written ) + ), + + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + __entry->b_blocknr = mpd->b_blocknr; + __entry->b_size = mpd->b_size; + __entry->b_state = mpd->b_state; + __entry->first_page = mpd->first_page; + __entry->io_done = mpd->io_done; + __entry->pages_written = mpd->pages_written; + ), + + TP_printk("dev %s ino %lu b_blocknr %llu b_size %u b_state 0x%04x first_page %lu io_done %d pages_written %d", + jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino, + __entry->b_blocknr, __entry->b_size, + __entry->b_state, __entry->first_page, + __entry->io_done, __entry->pages_written) ); TRACE_EVENT(ext4_da_writepages_result, @@ -268,6 +308,7 @@ TRACE_EVENT(ext4_da_writepages_result, __field( char, encountered_congestion ) __field( char, more_io ) __field( char, no_nrwrite_index_update ) + __field( pgoff_t, writeback_index ) ), TP_fast_assign( @@ -279,13 +320,16 @@ TRACE_EVENT(ext4_da_writepages_result, __entry->encountered_congestion = wbc->encountered_congestion; __entry->more_io = wbc->more_io; __entry->no_nrwrite_index_update = wbc->no_nrwrite_index_update; + __entry->writeback_index = inode->i_mapping->writeback_index; ), - TP_printk("dev %s ino %lu ret %d pages_written %d pages_skipped %ld congestion %d more_io %d no_nrwrite_index_update %d", - jbd2_dev_to_name(__entry->dev), __entry->ino, __entry->ret, + TP_printk("dev %s ino %lu ret %d pages_written %d pages_skipped %ld congestion %d more_io %d no_nrwrite_index_update %d writeback_index %lu", + jbd2_dev_to_name(__entry->dev), + (unsigned long) __entry->ino, __entry->ret, __entry->pages_written, __entry->pages_skipped, __entry->encountered_congestion, __entry->more_io, - __entry->no_nrwrite_index_update) + __entry->no_nrwrite_index_update, + (unsigned long) __entry->writeback_index) ); TRACE_EVENT(ext4_da_write_begin,