Magellan Linux

Annotation of /trunk/kernel26-alx/patches-2.6.27-r3/0118-2.6.27.19-all-fixes.patch



Revision 1176
Thu Oct 14 15:11:06 2010 UTC by niro
File size: 80833 bytes
-2.6.27-alx-r3: new magellan 0.5.2 kernel
1 niro 1176 diff --git a/arch/powerpc/kernel/align.c b/arch/powerpc/kernel/align.c
2     index 5af4e9b..ada0692 100644
3     --- a/arch/powerpc/kernel/align.c
4     +++ b/arch/powerpc/kernel/align.c
5     @@ -646,11 +646,16 @@ static int emulate_vsx(unsigned char __user *addr, unsigned int reg,
6     unsigned int areg, struct pt_regs *regs,
7     unsigned int flags, unsigned int length)
8     {
9     - char *ptr = (char *) &current->thread.TS_FPR(reg);
10     + char *ptr;
11     int ret = 0;
12    
13     flush_vsx_to_thread(current);
14    
15     + if (reg < 32)
16     + ptr = (char *) &current->thread.TS_FPR(reg);
17     + else
18     + ptr = (char *) &current->thread.vr[reg - 32];
19     +
20     if (flags & ST)
21     ret = __copy_to_user(addr, ptr, length);
22     else {
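The fix above matters because VSX registers 0-31 overlay the FP registers while registers 32-63 overlay the VMX registers, so taking TS_FPR(reg) unconditionally corrupted state for reg >= 32. A minimal user-space sketch of the same select-by-index rule; the struct and its fields are invented stand-ins, not the kernel's thread_struct:

#include <stdio.h>

struct fake_thread {
    double fpr[32];           /* stand-in for the TS_FPR() save area */
    unsigned char vr[32][16]; /* stand-in for the 128-bit VMX registers */
};

static char *vsx_reg_ptr(struct fake_thread *t, unsigned int reg)
{
    if (reg < 32)
        return (char *)&t->fpr[reg];   /* VSX 0-31 live in FP state */
    return (char *)&t->vr[reg - 32];   /* VSX 32-63 live in VMX state */
}

int main(void)
{
    struct fake_thread t = { { 0 } };
    printf("reg 5  -> FP  area %p\n", (void *)vsx_reg_ptr(&t, 5));
    printf("reg 40 -> VMX area %p\n", (void *)vsx_reg_ptr(&t, 40));
    return 0;
}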
23     diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
24     index 5b719a0..7c3b8dc 100644
25     --- a/arch/x86/mm/pageattr.c
26     +++ b/arch/x86/mm/pageattr.c
27     @@ -619,6 +619,13 @@ static int __change_page_attr(struct cpa_data *cpa, int primary)
28     unsigned int level;
29     pte_t *kpte, old_pte;
30    
31     + /*
32     + * If we're called with lazy mmu updates enabled, the
33     + * in-memory pte state may be stale. Flush pending updates to
34     + * bring them up to date.
35     + */
36     + arch_flush_lazy_mmu_mode();
37     +
38     repeat:
39     kpte = lookup_address(address, &level);
40     if (!kpte)
41     @@ -836,6 +843,13 @@ static int change_page_attr_set_clr(unsigned long addr, int numpages,
42     else
43     cpa_flush_all(cache);
44    
45     + /*
46     + * If we've been called with lazy mmu updates enabled, then
47     + * make sure that everything gets flushed out before we
48     + * return.
49     + */
50     + arch_flush_lazy_mmu_mode();
51     +
52     out:
53     cpa_fill_pool(NULL);
54    
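Both hunks apply one rule: when PTE writes may be queued (as under Xen's lazy MMU mode), flush the queue before reading page-table state and again before returning. A hedged single-process model of that staleness hazard; every name below is invented for the demo and only the shape matches the kernel path:

#include <stdio.h>

static int shadow_value;    /* the "in-memory" state readers consult */
static int pending_value;   /* a queued, not-yet-applied update      */
static int pending;

static void lazy_write(int v) { pending_value = v; pending = 1; }

/* rough analogue of arch_flush_lazy_mmu_mode() */
static void flush_lazy_updates(void)
{
    if (pending) { shadow_value = pending_value; pending = 0; }
}

static int read_value(void)
{
    flush_lazy_updates();   /* flush before trusting the state */
    return shadow_value;
}

int main(void)
{
    lazy_write(42);
    int stale = shadow_value;   /* what the pre-patch code would have seen */
    int fresh = read_value();
    printf("stale=%d fresh=%d\n", stale, fresh);
    return 0;
}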
55     diff --git a/drivers/ata/pata_via.c b/drivers/ata/pata_via.c
56     index c5be6a1..b6f55e8 100644
57     --- a/drivers/ata/pata_via.c
58     +++ b/drivers/ata/pata_via.c
59     @@ -111,7 +111,8 @@ static const struct via_isa_bridge {
60     { "vt8237s", PCI_DEVICE_ID_VIA_8237S, 0x00, 0x2f, VIA_UDMA_133 | VIA_BAD_AST },
61     { "vt8251", PCI_DEVICE_ID_VIA_8251, 0x00, 0x2f, VIA_UDMA_133 | VIA_BAD_AST },
62     { "cx700", PCI_DEVICE_ID_VIA_CX700, 0x00, 0x2f, VIA_UDMA_133 | VIA_BAD_AST | VIA_SATA_PATA },
63     - { "vt6410", PCI_DEVICE_ID_VIA_6410, 0x00, 0x2f, VIA_UDMA_133 | VIA_BAD_AST | VIA_NO_ENABLES},
64     + { "vt6410", PCI_DEVICE_ID_VIA_6410, 0x00, 0x2f, VIA_UDMA_133 | VIA_BAD_AST | VIA_NO_ENABLES },
65     + { "vt6415", PCI_DEVICE_ID_VIA_6415, 0x00, 0x2f, VIA_UDMA_133 | VIA_BAD_AST | VIA_NO_ENABLES },
66     { "vt8237a", PCI_DEVICE_ID_VIA_8237A, 0x00, 0x2f, VIA_UDMA_133 | VIA_BAD_AST },
67     { "vt8237", PCI_DEVICE_ID_VIA_8237, 0x00, 0x2f, VIA_UDMA_133 | VIA_BAD_AST },
68     { "vt8235", PCI_DEVICE_ID_VIA_8235, 0x00, 0x2f, VIA_UDMA_133 | VIA_BAD_AST },
69     @@ -594,6 +595,7 @@ static int via_reinit_one(struct pci_dev *pdev)
70     #endif
71    
72     static const struct pci_device_id via[] = {
73     + { PCI_VDEVICE(VIA, 0x0415), },
74     { PCI_VDEVICE(VIA, 0x0571), },
75     { PCI_VDEVICE(VIA, 0x0581), },
76     { PCI_VDEVICE(VIA, 0x1571), },
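The VT6415 support above is purely table-driven: one entry in the via_isa_bridge list and one in the pci_device_id table. A sketch of that sentinel-terminated lookup pattern, with made-up IDs standing in for the PCI constants:

#include <stdio.h>

struct id_entry { unsigned short device; const char *name; };

static const struct id_entry ids[] = {
    { 0x0415, "vt6415" },   /* the newly added entry */
    { 0x0571, "vt82xx" },
    { 0 }                   /* sentinel terminates the table */
};

static const char *lookup(unsigned short dev)
{
    const struct id_entry *e;
    for (e = ids; e->device; e++)
        if (e->device == dev)
            return e->name;
    return "unknown";
}

int main(void)
{
    printf("0x0415 -> %s\n", lookup(0x0415));
    return 0;
}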
77     diff --git a/drivers/ata/sata_nv.c b/drivers/ata/sata_nv.c
78     index 89e3b7f..8b6f9c0 100644
79     --- a/drivers/ata/sata_nv.c
80     +++ b/drivers/ata/sata_nv.c
81     @@ -421,19 +421,21 @@ static struct ata_port_operations nv_generic_ops = {
82     .hardreset = ATA_OP_NULL,
83     };
84    
85     -/* OSDL bz3352 reports that nf2/3 controllers can't determine device
86     - * signature reliably. Also, the following thread reports detection
87     - * failure on cold boot with the standard debouncing timing.
88     +/* nf2 is rife with hardreset related problems.
89     + *
90     + * kernel bz#3352 reports nf2/3 controllers can't determine device
91     + * signature reliably. The following thread reports detection failure
92     + * on cold boot with the standard debouncing timing.
93     *
94     * http://thread.gmane.org/gmane.linux.ide/34098
95     *
96     - * Debounce with hotplug timing and request follow-up SRST.
97     + * And bz#12176 reports that hardreset simply doesn't work on nf2.
98     + * Give up on it and just don't do hardreset.
99     */
100     static struct ata_port_operations nv_nf2_ops = {
101     - .inherits = &nv_common_ops,
102     + .inherits = &nv_generic_ops,
103     .freeze = nv_nf2_freeze,
104     .thaw = nv_nf2_thaw,
105     - .hardreset = nv_noclassify_hardreset,
106     };
107    
108     /* For initial probing after boot and hot plugging, hardreset mostly
109     diff --git a/drivers/bluetooth/btsdio.c b/drivers/bluetooth/btsdio.c
110     index 58630cc..f2ada0c 100644
111     --- a/drivers/bluetooth/btsdio.c
112     +++ b/drivers/bluetooth/btsdio.c
113     @@ -91,6 +91,7 @@ static int btsdio_tx_packet(struct btsdio_data *data, struct sk_buff *skb)
114    
115     err = sdio_writesb(data->func, REG_TDAT, skb->data, skb->len);
116     if (err < 0) {
117     + skb_pull(skb, 4);
118     sdio_writeb(data->func, 0x01, REG_PC_WRT, NULL);
119     return err;
120     }
121     @@ -152,7 +153,7 @@ static int btsdio_rx_packet(struct btsdio_data *data)
122    
123     err = sdio_readsb(data->func, skb->data, REG_RDAT, len - 4);
124     if (err < 0) {
125     - kfree(skb);
126     + kfree_skb(skb);
127     return err;
128     }
129    
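The two btsdio fixes encode two error-path rules: undo the 4-byte header the driver pushed before handing the skb back, and free an skb with kfree_skb() rather than plain kfree(). A toy buffer with push/pull semantics makes the first rule concrete; the type and helpers are invented, not the sk_buff API:

#include <stdio.h>
#include <string.h>

struct minibuf {
    unsigned char room[64];
    unsigned char *data;
    size_t len;
};

static void mb_init(struct minibuf *b, size_t headroom)
{
    b->data = b->room + headroom;
    b->len = 0;
}

static unsigned char *mb_push(struct minibuf *b, size_t n)  /* like skb_push */
{
    b->data -= n; b->len += n; return b->data;
}

static void mb_pull(struct minibuf *b, size_t n)            /* like skb_pull */
{
    b->data += n; b->len -= n;
}

static int try_send(struct minibuf *b) { (void)b; return -1; /* simulated I/O error */ }

int main(void)
{
    struct minibuf b;

    mb_init(&b, 8);
    memset(mb_push(&b, 4), 0, 4);   /* prepend a 4-byte transport header */
    if (try_send(&b) < 0)
        mb_pull(&b, 4);             /* restore the buffer before bailing out */
    printf("len after failed send: %zu\n", b.len);
    return 0;
}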
130     diff --git a/drivers/net/3c505.c b/drivers/net/3c505.c
131     index fdfb2b2..ae8e36c 100644
132     --- a/drivers/net/3c505.c
133     +++ b/drivers/net/3c505.c
134     @@ -493,21 +493,27 @@ static bool receive_pcb(struct net_device *dev, pcb_struct * pcb)
135     }
136     /* read the data */
137     spin_lock_irqsave(&adapter->lock, flags);
138     - i = 0;
139     - do {
140     - j = 0;
141     - while (((stat = get_status(dev->base_addr)) & ACRF) == 0 && j++ < 20000);
142     - pcb->data.raw[i++] = inb_command(dev->base_addr);
143     - if (i > MAX_PCB_DATA)
144     - INVALID_PCB_MSG(i);
145     - } while ((stat & ASF_PCB_MASK) != ASF_PCB_END && j < 20000);
146     + for (i = 0; i < MAX_PCB_DATA; i++) {
147     + for (j = 0; j < 20000; j++) {
148     + stat = get_status(dev->base_addr);
149     + if (stat & ACRF)
150     + break;
151     + }
152     + pcb->data.raw[i] = inb_command(dev->base_addr);
153     + if ((stat & ASF_PCB_MASK) == ASF_PCB_END || j >= 20000)
154     + break;
155     + }
156     spin_unlock_irqrestore(&adapter->lock, flags);
157     + if (i >= MAX_PCB_DATA) {
158     + INVALID_PCB_MSG(i);
159     + return false;
160     + }
161     if (j >= 20000) {
162     TIMEOUT_MSG(__LINE__);
163     return false;
164     }
165     - /* woops, the last "data" byte was really the length! */
166     - total_length = pcb->data.raw[--i];
167     + /* the last "data" byte was really the length! */
168     + total_length = pcb->data.raw[i];
169    
170     /* safety check total length vs data length */
171     if (total_length != (pcb->length + 2)) {
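The rewrite above replaces a do/while that could write one slot past pcb->data.raw with two bounded for loops plus post-loop overflow and timeout checks. A hedged reduction of that shape with a stubbed status source; constants and helpers are invented:

#include <stdio.h>

#define MAX_DATA 8
#define MAX_SPIN 1000

/* stubbed hardware: a byte stream whose 4th byte is flagged as last */
static int get_byte(int i, int *end) { *end = (i == 3); return 0x40 + i; }

int main(void)
{
    unsigned char data[MAX_DATA];
    int i, j = 0, end = 0;

    for (i = 0; i < MAX_DATA; i++) {
        for (j = 0; j < MAX_SPIN; j++)
            break;                    /* would poll for ACRF; stub is ready */
        data[i] = (unsigned char)get_byte(i, &end);
        if (end || j >= MAX_SPIN)
            break;                    /* end-of-PCB or timeout: stop */
    }
    if (i >= MAX_DATA) { puts("invalid PCB: overrun"); return 1; }
    if (j >= MAX_SPIN) { puts("timeout"); return 1; }
    printf("%d data bytes; trailing length byte = %u\n", i, data[i]);
    return 0;
}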
172     diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c
173     index c3edcdc..2d90a3c 100644
174     --- a/drivers/pci/intel-iommu.c
175     +++ b/drivers/pci/intel-iommu.c
176     @@ -72,6 +72,8 @@ static struct deferred_flush_tables *deferred_flush;
177     /* bitmap for indexing intel_iommus */
178     static int g_num_of_iommus;
179    
180     +static int rwbf_quirk = 0;
181     +
182     static DEFINE_SPINLOCK(async_umap_flush_lock);
183     static LIST_HEAD(unmaps_to_do);
184    
185     @@ -527,7 +529,7 @@ static void iommu_flush_write_buffer(struct intel_iommu *iommu)
186     u32 val;
187     unsigned long flag;
188    
189     - if (!cap_rwbf(iommu->cap))
190     + if (!rwbf_quirk && !cap_rwbf(iommu->cap))
191     return;
192     val = iommu->gcmd | DMA_GCMD_WBF;
193    
194     @@ -2453,3 +2455,12 @@ int __init intel_iommu_init(void)
195     return 0;
196     }
197    
198     +static void __devinit quirk_iommu_rwbf(struct pci_dev *dev)
199     +{
200     + /* Mobile 4 Series Chipset neglects to set RWBF capability,
201     + but needs it */
202     + printk(KERN_INFO "DMAR: Forcing write-buffer flush capability\n");
203     + rwbf_quirk = 1;
204     +}
205     +
206     +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
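The quirk is just a global override consulted next to the hardware capability bit: flush when the unit advertises RWBF, or when a PCI fixup for the known-broken chipset set the flag. In miniature, with invented names and a fake capability word:

#include <stdio.h>

static int rwbf_quirk;                 /* set by the PCI fixup for 0x2a40 */

static int cap_rwbf(unsigned long cap) { return (int)(cap & 0x10); }

static void flush_write_buffer(unsigned long cap)
{
    if (!rwbf_quirk && !cap_rwbf(cap))
        return;                        /* neither hardware nor quirk needs it */
    puts("flushing IOMMU write buffer");
}

int main(void)
{
    flush_write_buffer(0);             /* no cap, no quirk: nothing happens */
    rwbf_quirk = 1;                    /* what quirk_iommu_rwbf() does */
    flush_write_buffer(0);             /* quirk forces the flush */
    return 0;
}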
207     diff --git a/drivers/scsi/libiscsi.c b/drivers/scsi/libiscsi.c
208     index 299e075..55ac5c3 100644
209     --- a/drivers/scsi/libiscsi.c
210     +++ b/drivers/scsi/libiscsi.c
211     @@ -1844,6 +1844,7 @@ void iscsi_pool_free(struct iscsi_pool *q)
212     kfree(q->pool[i]);
213     if (q->pool)
214     kfree(q->pool);
215     + kfree(q->queue);
216     }
217     EXPORT_SYMBOL_GPL(iscsi_pool_free);
218    
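The one-line fix closes an alloc/free asymmetry: iscsi_pool_init() allocates both the item array and a queue, but the old free path released only the items. A hedged sketch of the symmetric pair; the struct is a stand-in, not struct iscsi_pool:

#include <stdio.h>
#include <stdlib.h>

struct pool { void **items; void *queue; int max; };

static int pool_init(struct pool *q, int max, size_t item_size)
{
    int i;
    q->max = max;
    q->items = calloc(max, sizeof(*q->items));
    q->queue = malloc(max * sizeof(void *));  /* the piece the old code leaked */
    if (!q->items || !q->queue)
        return -1;
    for (i = 0; i < max; i++)
        if (!(q->items[i] = malloc(item_size)))
            return -1;
    return 0;
}

static void pool_free(struct pool *q)
{
    int i;
    for (i = 0; i < q->max; i++)
        free(q->items[i]);
    free(q->items);
    free(q->queue);     /* counterpart of the kfree(q->queue) added above */
}

int main(void)
{
    struct pool p;
    if (pool_init(&p, 4, 32) == 0)
        pool_free(&p);
    puts("done");
    return 0;
}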
219     diff --git a/fs/ext2/super.c b/fs/ext2/super.c
220     index fd88c7b..2ebc0c4 100644
221     --- a/fs/ext2/super.c
222     +++ b/fs/ext2/super.c
223     @@ -1177,9 +1177,12 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data)
224     es = sbi->s_es;
225     if (((sbi->s_mount_opt & EXT2_MOUNT_XIP) !=
226     (old_mount_opt & EXT2_MOUNT_XIP)) &&
227     - invalidate_inodes(sb))
228     - ext2_warning(sb, __func__, "busy inodes while remounting "\
229     - "xip remain in cache (no functional problem)");
230     + invalidate_inodes(sb)) {
231     + ext2_warning(sb, __func__, "refusing change of xip flag "
232     + "with busy inodes while remounting");
233     + sbi->s_mount_opt &= ~EXT2_MOUNT_XIP;
234     + sbi->s_mount_opt |= old_mount_opt & EXT2_MOUNT_XIP;
235     + }
236     if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY))
237     return 0;
238     if (*flags & MS_RDONLY) {
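The remount fix turns a warning into a refusal: when invalidate_inodes() reports busy inodes, the XIP bit is restored from the pre-remount copy with a clear-then-or. The bit dance in isolation, flag value invented:

#include <stdio.h>

#define MOUNT_XIP 0x04

int main(void)
{
    unsigned int old_opts = MOUNT_XIP;   /* xip was on before remount */
    unsigned int opts = 0;               /* remount asked to turn it off */

    /* pretend invalidate_inodes() failed: put the old bit back */
    opts &= ~MOUNT_XIP;
    opts |= old_opts & MOUNT_XIP;

    printf("xip still %s\n", (opts & MOUNT_XIP) ? "on" : "off");
    return 0;
}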
239     diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
240     index e9fa960..8b7c776 100644
241     --- a/fs/ext4/balloc.c
242     +++ b/fs/ext4/balloc.c
243     @@ -20,6 +20,7 @@
244     #include "ext4.h"
245     #include "ext4_jbd2.h"
246     #include "group.h"
247     +#include "mballoc.h"
248    
249     /*
250     * balloc.c contains the blocks allocation and deallocation routines
251     @@ -318,18 +319,41 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
252     block_group, bitmap_blk);
253     return NULL;
254     }
255     - if (bh_uptodate_or_lock(bh))
256     +
257     + if (bitmap_uptodate(bh))
258     return bh;
259    
260     + lock_buffer(bh);
261     + if (bitmap_uptodate(bh)) {
262     + unlock_buffer(bh);
263     + return bh;
264     + }
265     spin_lock(sb_bgl_lock(EXT4_SB(sb), block_group));
266     if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
267     ext4_init_block_bitmap(sb, bh, block_group, desc);
268     + set_bitmap_uptodate(bh);
269     set_buffer_uptodate(bh);
270     unlock_buffer(bh);
271     spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group));
272     return bh;
273     }
274     spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group));
275     + if (buffer_uptodate(bh)) {
276     + /*
277     + * if the group is not uninit and bh is uptodate,
278     + * the bitmap is also uptodate
279     + */
280     + set_bitmap_uptodate(bh);
281     + unlock_buffer(bh);
282     + return bh;
283     + }
284     + /*
285     + * submit the buffer_head for read. We can
286     + * safely mark the bitmap as uptodate now.
287     + * We do it here so the bitmap uptodate bit
288     + * gets set with the buffer lock held.
289     + */
290     + set_bitmap_uptodate(bh);
291     if (bh_submit_read(bh) < 0) {
292     put_bh(bh);
293     ext4_error(sb, __func__,
294     @@ -837,6 +861,136 @@ error_return:
295     }
296    
297     /**
298     + * ext4_add_groupblocks() -- Add given blocks to an existing group
299     + * @handle: handle to this transaction
300     + * @sb: super block
301     + * @block: start physical block to add to the block group
302     + * @count: number of blocks to add
303     + *
304     + * This marks the blocks as free in the bitmap. We ask the
305     + * mballoc to reload the buddy after this by setting group
306     + * EXT4_GROUP_INFO_NEED_INIT_BIT flag
307     + */
308     +void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
309     + ext4_fsblk_t block, unsigned long count)
310     +{
311     + struct buffer_head *bitmap_bh = NULL;
312     + struct buffer_head *gd_bh;
313     + ext4_group_t block_group;
314     + ext4_grpblk_t bit;
315     + unsigned long i;
316     + struct ext4_group_desc *desc;
317     + struct ext4_super_block *es;
318     + struct ext4_sb_info *sbi;
319     + int err = 0, ret;
320     + ext4_grpblk_t blocks_freed;
321     + struct ext4_group_info *grp;
322     +
323     + sbi = EXT4_SB(sb);
324     + es = sbi->s_es;
325     + ext4_debug("Adding block(s) %llu-%llu\n", block, block + count - 1);
326     +
327     + ext4_get_group_no_and_offset(sb, block, &block_group, &bit);
328     + grp = ext4_get_group_info(sb, block_group);
329     + /*
330     + * Check to see if we are freeing blocks across a group
331     + * boundary.
332     + */
333     + if (bit + count > EXT4_BLOCKS_PER_GROUP(sb))
334     + goto error_return;
335     +
336     + bitmap_bh = ext4_read_block_bitmap(sb, block_group);
337     + if (!bitmap_bh)
338     + goto error_return;
339     + desc = ext4_get_group_desc(sb, block_group, &gd_bh);
340     + if (!desc)
341     + goto error_return;
342     +
343     + if (in_range(ext4_block_bitmap(sb, desc), block, count) ||
344     + in_range(ext4_inode_bitmap(sb, desc), block, count) ||
345     + in_range(block, ext4_inode_table(sb, desc), sbi->s_itb_per_group) ||
346     + in_range(block + count - 1, ext4_inode_table(sb, desc),
347     + sbi->s_itb_per_group)) {
348     + ext4_error(sb, __func__,
349     + "Adding blocks in system zones - "
350     + "Block = %llu, count = %lu",
351     + block, count);
352     + goto error_return;
353     + }
354     +
355     + /*
356     + * We are about to add blocks to the bitmap,
357     + * so we need undo access.
358     + */
359     + BUFFER_TRACE(bitmap_bh, "getting undo access");
360     + err = ext4_journal_get_undo_access(handle, bitmap_bh);
361     + if (err)
362     + goto error_return;
363     +
364     + /*
365     + * We are about to modify some metadata. Call the journal APIs
366     + * to unshare ->b_data if a currently-committing transaction is
367     + * using it
368     + */
369     + BUFFER_TRACE(gd_bh, "get_write_access");
370     + err = ext4_journal_get_write_access(handle, gd_bh);
371     + if (err)
372     + goto error_return;
373     + /*
374     + * make sure we don't allow a parallel init on other groups in the
375     + * same buddy cache
376     + */
377     + down_write(&grp->alloc_sem);
378     + for (i = 0, blocks_freed = 0; i < count; i++) {
379     + BUFFER_TRACE(bitmap_bh, "clear bit");
380     + if (!ext4_clear_bit_atomic(sb_bgl_lock(sbi, block_group),
381     + bit + i, bitmap_bh->b_data)) {
382     + ext4_error(sb, __func__,
383     + "bit already cleared for block %llu",
384     + (ext4_fsblk_t)(block + i));
385     + BUFFER_TRACE(bitmap_bh, "bit already cleared");
386     + } else {
387     + blocks_freed++;
388     + }
389     + }
390     + spin_lock(sb_bgl_lock(sbi, block_group));
391     + le16_add_cpu(&desc->bg_free_blocks_count, blocks_freed);
392     + desc->bg_checksum = ext4_group_desc_csum(sbi, block_group, desc);
393     + spin_unlock(sb_bgl_lock(sbi, block_group));
394     + percpu_counter_add(&sbi->s_freeblocks_counter, blocks_freed);
395     +
396     + if (sbi->s_log_groups_per_flex) {
397     + ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
398     + spin_lock(sb_bgl_lock(sbi, flex_group));
399     + sbi->s_flex_groups[flex_group].free_blocks += blocks_freed;
400     + spin_unlock(sb_bgl_lock(sbi, flex_group));
401     + }
402     + /*
403     + * request to reload the buddy with the
404     + * new bitmap information
405     + */
406     + set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state));
407     + ext4_mb_update_group_info(grp, blocks_freed);
408     + up_write(&grp->alloc_sem);
409     +
410     + /* We dirtied the bitmap block */
411     + BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
412     + err = ext4_journal_dirty_metadata(handle, bitmap_bh);
413     +
414     + /* And the group descriptor block */
415     + BUFFER_TRACE(gd_bh, "dirtied group descriptor block");
416     + ret = ext4_journal_dirty_metadata(handle, gd_bh);
417     + if (!err)
418     + err = ret;
419     + sb->s_dirt = 1;
420     +
421     +error_return:
422     + brelse(bitmap_bh);
423     + ext4_std_error(sb, err);
424     + return;
425     +}
426     +
427     +/**
428     * ext4_free_blocks() -- Free given blocks and update quota
429     * @handle: handle for this transaction
430     * @inode: inode
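The new bitmap reader above is double-checked initialization: test bitmap_uptodate() cheaply, take the buffer lock, test again, and only then do the expensive init or read, setting the flag while the lock is held. A pthread rendition of the same shape (glossing over memory-ordering details that the kernel's buffer lock provides); all names invented:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t buf_lock = PTHREAD_MUTEX_INITIALIZER;
static int uptodate;     /* analogue of the BH_BITMAP_UPTODATE bit */
static int bitmap;

static int read_bitmap(void)
{
    if (uptodate)                  /* fast path: no lock taken */
        return bitmap;

    pthread_mutex_lock(&buf_lock); /* analogue of lock_buffer() */
    if (!uptodate) {               /* re-check: a racer may have won */
        bitmap = 0xff;             /* the expensive init / disk read */
        uptodate = 1;              /* published with the lock held */
    }
    pthread_mutex_unlock(&buf_lock);
    return bitmap;
}

int main(void)
{
    printf("bitmap = 0x%x\n", read_bitmap());
    return 0;
}

(Compile with -pthread.)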
431     diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
432     index 4829dac..85f58af 100644
433     --- a/fs/ext4/ext4.h
434     +++ b/fs/ext4/ext4.h
435     @@ -19,6 +19,7 @@
436     #include <linux/types.h>
437     #include <linux/blkdev.h>
438     #include <linux/magic.h>
439     +#include <linux/jbd2.h>
440     #include "ext4_i.h"
441    
442     /*
443     @@ -889,6 +890,9 @@ static inline __le16 ext4_rec_len_to_disk(unsigned len)
444     #define DX_HASH_LEGACY 0
445     #define DX_HASH_HALF_MD4 1
446     #define DX_HASH_TEA 2
447     +#define DX_HASH_LEGACY_UNSIGNED 3
448     +#define DX_HASH_HALF_MD4_UNSIGNED 4
449     +#define DX_HASH_TEA_UNSIGNED 5
450    
451     #ifdef __KERNEL__
452    
453     @@ -988,9 +992,11 @@ extern ext4_fsblk_t ext4_has_free_blocks(struct ext4_sb_info *sbi,
454     ext4_fsblk_t nblocks);
455     extern void ext4_free_blocks (handle_t *handle, struct inode *inode,
456     ext4_fsblk_t block, unsigned long count, int metadata);
457     -extern void ext4_free_blocks_sb (handle_t *handle, struct super_block *sb,
458     - ext4_fsblk_t block, unsigned long count,
459     +extern void ext4_free_blocks_sb(handle_t *handle, struct super_block *sb,
460     + ext4_fsblk_t block, unsigned long count,
461     unsigned long *pdquot_freed_blocks);
462     +extern void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
463     + ext4_fsblk_t block, unsigned long count);
464     extern ext4_fsblk_t ext4_count_free_blocks (struct super_block *);
465     extern void ext4_check_blocks_bitmap (struct super_block *);
466     extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb,
467     @@ -1038,12 +1044,13 @@ extern int __init init_ext4_mballoc(void);
468     extern void exit_ext4_mballoc(void);
469     extern void ext4_mb_free_blocks(handle_t *, struct inode *,
470     unsigned long, unsigned long, int, unsigned long *);
471     -extern int ext4_mb_add_more_groupinfo(struct super_block *sb,
472     +extern int ext4_mb_add_groupinfo(struct super_block *sb,
473     ext4_group_t i, struct ext4_group_desc *desc);
474     extern void ext4_mb_update_group_info(struct ext4_group_info *grp,
475     ext4_grpblk_t add);
476     -
477     -
478     +extern int ext4_mb_get_buddy_cache_lock(struct super_block *, ext4_group_t);
479     +extern void ext4_mb_put_buddy_cache_lock(struct super_block *,
480     + ext4_group_t, int);
481     /* inode.c */
482     int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode,
483     struct buffer_head *bh, ext4_fsblk_t blocknr);
484     @@ -1167,8 +1174,11 @@ static inline void ext4_r_blocks_count_set(struct ext4_super_block *es,
485    
486     static inline loff_t ext4_isize(struct ext4_inode *raw_inode)
487     {
488     - return ((loff_t)le32_to_cpu(raw_inode->i_size_high) << 32) |
489     - le32_to_cpu(raw_inode->i_size_lo);
490     + if (S_ISREG(le16_to_cpu(raw_inode->i_mode)))
491     + return ((loff_t)le32_to_cpu(raw_inode->i_size_high) << 32) |
492     + le32_to_cpu(raw_inode->i_size_lo);
493     + else
494     + return (loff_t) le32_to_cpu(raw_inode->i_size_lo);
495     }
496    
497     static inline void ext4_isize_set(struct ext4_inode *raw_inode, loff_t i_size)
498     @@ -1244,6 +1254,23 @@ extern int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode,
499     sector_t block, unsigned long max_blocks,
500     struct buffer_head *bh, int create,
501     int extend_disksize, int flag);
502     +/*
503     + * Add new method to test whether block and inode bitmaps are properly
504     + * initialized. With uninit_bg reading the block from disk is not enough
505     + * to mark the bitmap uptodate. We need to also zero-out the bitmap
506     + */
507     +#define BH_BITMAP_UPTODATE BH_JBDPrivateStart
508     +
509     +static inline int bitmap_uptodate(struct buffer_head *bh)
510     +{
511     + return (buffer_uptodate(bh) &&
512     + test_bit(BH_BITMAP_UPTODATE, &(bh)->b_state));
513     +}
514     +static inline void set_bitmap_uptodate(struct buffer_head *bh)
515     +{
516     + set_bit(BH_BITMAP_UPTODATE, &(bh)->b_state);
517     +}
518     +
519     #endif /* __KERNEL__ */
520    
521     #endif /* _EXT4_H */
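The ext4_isize() change reads i_size_high only for regular files, since for other inode types that on-disk word carries different data. The 64-bit reassembly on its own, as a hedged helper:

#include <stdint.h>
#include <stdio.h>

static int64_t isize(uint32_t size_lo, uint32_t size_hi, int is_reg)
{
    if (is_reg)
        return ((int64_t)size_hi << 32) | size_lo;
    return (int64_t)size_lo;   /* directories etc.: low word only */
}

int main(void)
{
    printf("%lld\n", (long long)isize(0x1000, 1, 1)); /* 4 GiB + 4 KiB */
    printf("%lld\n", (long long)isize(0x1000, 1, 0)); /* high word ignored */
    return 0;
}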
522     diff --git a/fs/ext4/ext4_sb.h b/fs/ext4/ext4_sb.h
523     index 6300226..f20df8a 100644
524     --- a/fs/ext4/ext4_sb.h
525     +++ b/fs/ext4/ext4_sb.h
526     @@ -56,6 +56,7 @@ struct ext4_sb_info {
527     u32 s_next_generation;
528     u32 s_hash_seed[4];
529     int s_def_hash_version;
530     + int s_hash_unsigned; /* 3 if hash should be unsigned, 0 if not */
531     struct percpu_counter s_freeblocks_counter;
532     struct percpu_counter s_freeinodes_counter;
533     struct percpu_counter s_dirs_counter;
534     @@ -102,7 +103,8 @@ struct ext4_sb_info {
535     struct list_head s_committed_transaction;
536     spinlock_t s_md_lock;
537     tid_t s_last_transaction;
538     - unsigned short *s_mb_offsets, *s_mb_maxs;
539     + unsigned short *s_mb_offsets;
540     + unsigned int *s_mb_maxs;
541    
542     /* tunables */
543     unsigned long s_stripe;
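The s_mb_maxs widening above is needed once per-group block counts can exceed 65535 (plausibly with 64KiB block sizes), where an unsigned short silently truncates. The truncation in two lines:

#include <stdio.h>

int main(void)
{
    unsigned short narrow = (unsigned short)(1UL << 16); /* wraps to 0 */
    unsigned int   wide   = 1U << 16;

    printf("unsigned short: %u, unsigned int: %u\n", narrow, wide);
    return 0;
}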
544     diff --git a/fs/ext4/hash.c b/fs/ext4/hash.c
545     index 1d6329d..bd7d14d 100644
546     --- a/fs/ext4/hash.c
547     +++ b/fs/ext4/hash.c
548     @@ -35,23 +35,71 @@ static void TEA_transform(__u32 buf[4], __u32 const in[])
549    
550    
551     /* The old legacy hash */
552     -static __u32 dx_hack_hash (const char *name, int len)
553     +static __u32 dx_hack_hash_unsigned(const char *name, int len)
554     {
555     - __u32 hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9;
556     + __u32 hash, hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9;
557     + const unsigned char *ucp = (const unsigned char *) name;
558     +
559     + while (len--) {
560     + hash = hash1 + (hash0 ^ (((int) *ucp++) * 7152373));
561     +
562     + if (hash & 0x80000000)
563     + hash -= 0x7fffffff;
564     + hash1 = hash0;
565     + hash0 = hash;
566     + }
567     + return hash0 << 1;
568     +}
569     +
570     +static __u32 dx_hack_hash_signed(const char *name, int len)
571     +{
572     + __u32 hash, hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9;
573     + const signed char *scp = (const signed char *) name;
574     +
575     while (len--) {
576     - __u32 hash = hash1 + (hash0 ^ (*name++ * 7152373));
577     + hash = hash1 + (hash0 ^ (((int) *scp++) * 7152373));
578    
579     - if (hash & 0x80000000) hash -= 0x7fffffff;
580     + if (hash & 0x80000000)
581     + hash -= 0x7fffffff;
582     hash1 = hash0;
583     hash0 = hash;
584     }
585     - return (hash0 << 1);
586     + return hash0 << 1;
587     }
588    
589     -static void str2hashbuf(const char *msg, int len, __u32 *buf, int num)
590     +static void str2hashbuf_signed(const char *msg, int len, __u32 *buf, int num)
591     {
592     __u32 pad, val;
593     int i;
594     + const signed char *scp = (const signed char *) msg;
595     +
596     + pad = (__u32)len | ((__u32)len << 8);
597     + pad |= pad << 16;
598     +
599     + val = pad;
600     + if (len > num*4)
601     + len = num * 4;
602     + for (i = 0; i < len; i++) {
603     + if ((i % 4) == 0)
604     + val = pad;
605     + val = ((int) scp[i]) + (val << 8);
606     + if ((i % 4) == 3) {
607     + *buf++ = val;
608     + val = pad;
609     + num--;
610     + }
611     + }
612     + if (--num >= 0)
613     + *buf++ = val;
614     + while (--num >= 0)
615     + *buf++ = pad;
616     +}
617     +
618     +static void str2hashbuf_unsigned(const char *msg, int len, __u32 *buf, int num)
619     +{
620     + __u32 pad, val;
621     + int i;
622     + const unsigned char *ucp = (const unsigned char *) msg;
623    
624     pad = (__u32)len | ((__u32)len << 8);
625     pad |= pad << 16;
626     @@ -62,7 +110,7 @@ static void str2hashbuf(const char *msg, int len, __u32 *buf, int num)
627     for (i=0; i < len; i++) {
628     if ((i % 4) == 0)
629     val = pad;
630     - val = msg[i] + (val << 8);
631     + val = ((int) ucp[i]) + (val << 8);
632     if ((i % 4) == 3) {
633     *buf++ = val;
634     val = pad;
635     @@ -95,6 +143,8 @@ int ext4fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo)
636     const char *p;
637     int i;
638     __u32 in[8], buf[4];
639     + void (*str2hashbuf)(const char *, int, __u32 *, int) =
640     + str2hashbuf_signed;
641    
642     /* Initialize the default seed for the hash checksum functions */
643     buf[0] = 0x67452301;
644     @@ -113,13 +163,18 @@ int ext4fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo)
645     }
646    
647     switch (hinfo->hash_version) {
648     + case DX_HASH_LEGACY_UNSIGNED:
649     + hash = dx_hack_hash_unsigned(name, len);
650     + break;
651     case DX_HASH_LEGACY:
652     - hash = dx_hack_hash(name, len);
653     + hash = dx_hack_hash_signed(name, len);
654     break;
655     + case DX_HASH_HALF_MD4_UNSIGNED:
656     + str2hashbuf = str2hashbuf_unsigned;
657     case DX_HASH_HALF_MD4:
658     p = name;
659     while (len > 0) {
660     - str2hashbuf(p, len, in, 8);
661     + (*str2hashbuf)(p, len, in, 8);
662     half_md4_transform(buf, in);
663     len -= 32;
664     p += 32;
665     @@ -127,10 +182,12 @@ int ext4fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo)
666     minor_hash = buf[2];
667     hash = buf[1];
668     break;
669     + case DX_HASH_TEA_UNSIGNED:
670     + str2hashbuf = str2hashbuf_unsigned;
671     case DX_HASH_TEA:
672     p = name;
673     while (len > 0) {
674     - str2hashbuf(p, len, in, 4);
675     + (*str2hashbuf)(p, len, in, 4);
676     TEA_transform(buf, in);
677     len -= 16;
678     p += 16;
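Why the hunks split every hash into _signed and _unsigned variants: the historical code hashed plain char, which is signed on x86 but unsigned on ARM and PowerPC, so filenames containing bytes >= 0x80 hashed differently per architecture and htree directories became unportable. The divergence in miniature, reimplementing the legacy hash loop outside the kernel:

#include <stdio.h>

static unsigned int hack_hash(const void *name, int len, int is_signed)
{
    unsigned int hash, hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9;
    const unsigned char *u = name;
    const signed char *s = name;

    while (len--) {
        int c = is_signed ? (int)*s++ : (int)*u++;
        hash = hash1 + (hash0 ^ (c * 7152373u));
        if (hash & 0x80000000u)
            hash -= 0x7fffffff;
        hash1 = hash0;
        hash0 = hash;
    }
    return hash0 << 1;
}

int main(void)
{
    const char name[] = "\xc3\xa9";   /* UTF-8 "e acute": bytes >= 0x80 */
    printf("signed   char hash: %08x\n", hack_hash(name, 2, 1));
    printf("unsigned char hash: %08x\n", hack_hash(name, 2, 0));
    return 0;
}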
679     diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
680     index 9805924..b994854 100644
681     --- a/fs/ext4/ialloc.c
682     +++ b/fs/ext4/ialloc.c
683     @@ -84,7 +84,7 @@ unsigned ext4_init_inode_bitmap(struct super_block *sb, struct buffer_head *bh,
684     }
685    
686     memset(bh->b_data, 0, (EXT4_INODES_PER_GROUP(sb) + 7) / 8);
687     - mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), EXT4_BLOCKS_PER_GROUP(sb),
688     + mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), sb->s_blocksize * 8,
689     bh->b_data);
690    
691     return EXT4_INODES_PER_GROUP(sb);
692     @@ -115,18 +115,40 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
693     block_group, bitmap_blk);
694     return NULL;
695     }
696     - if (bh_uptodate_or_lock(bh))
697     + if (bitmap_uptodate(bh))
698     return bh;
699    
700     + lock_buffer(bh);
701     + if (bitmap_uptodate(bh)) {
702     + unlock_buffer(bh);
703     + return bh;
704     + }
705     spin_lock(sb_bgl_lock(EXT4_SB(sb), block_group));
706     if (desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) {
707     ext4_init_inode_bitmap(sb, bh, block_group, desc);
708     + set_bitmap_uptodate(bh);
709     set_buffer_uptodate(bh);
710     unlock_buffer(bh);
711     spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group));
712     return bh;
713     }
714     spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group));
715     + if (buffer_uptodate(bh)) {
716     + /*
717     + * if the group is not uninit and bh is uptodate,
718     + * the bitmap is also uptodate
719     + */
720     + set_bitmap_uptodate(bh);
721     + unlock_buffer(bh);
722     + return bh;
723     + }
724     + /*
725     + * submit the buffer_head for read. We can
726     + * safely mark the bitmap as uptodate now.
727     + * We do it here so the bitmap uptodate bit
728     + * gets set with the buffer lock held.
729     + */
730     + set_bitmap_uptodate(bh);
731     if (bh_submit_read(bh) < 0) {
732     put_bh(bh);
733     ext4_error(sb, __func__,
734     @@ -567,6 +589,77 @@ static int find_group_other(struct super_block *sb, struct inode *parent,
735     }
736    
737     /*
738     + * claim the inode from the inode bitmap. If the group
739     + * is uninit we need to take the group's sb_bgl_lock
740     + * and clear the uninit flag. The inode bitmap update
741     + * and group desc uninit flag clear should be done
742     + * after holding sb_bgl_lock so that ext4_read_inode_bitmap
743     + * doesn't race with the ext4_claim_inode
744     + */
745     +static int ext4_claim_inode(struct super_block *sb,
746     + struct buffer_head *inode_bitmap_bh,
747     + unsigned long ino, ext4_group_t group, int mode)
748     +{
749     + int free = 0, retval = 0;
750     + struct ext4_sb_info *sbi = EXT4_SB(sb);
751     + struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group, NULL);
752     +
753     + spin_lock(sb_bgl_lock(sbi, group));
754     + if (ext4_set_bit(ino, inode_bitmap_bh->b_data)) {
755     + /* not a free inode */
756     + retval = 1;
757     + goto err_ret;
758     + }
759     + ino++;
760     + if ((group == 0 && ino < EXT4_FIRST_INO(sb)) ||
761     + ino > EXT4_INODES_PER_GROUP(sb)) {
762     + spin_unlock(sb_bgl_lock(sbi, group));
763     + ext4_error(sb, __func__,
764     + "reserved inode or inode > inodes count - "
765     + "block_group = %lu, inode=%lu", group,
766     + ino + group * EXT4_INODES_PER_GROUP(sb));
767     + return 1;
768     + }
769     + /* If we didn't allocate from within the initialized part of the inode
770     + * table then we need to initialize up to this inode. */
771     + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) {
772     +
773     + if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) {
774     + gdp->bg_flags &= cpu_to_le16(~EXT4_BG_INODE_UNINIT);
775     + /* When marking the block group with
776     + * ~EXT4_BG_INODE_UNINIT we don't want to depend
777     + * on the value of bg_itable_unused even though
778     + * mke2fs could have initialized the same for us.
779     + * Instead we calculated the value below
780     + */
781     +
782     + free = 0;
783     + } else {
784     + free = EXT4_INODES_PER_GROUP(sb) -
785     + le16_to_cpu(gdp->bg_itable_unused);
786     + }
787     +
788     + /*
789     + * Check the relative inode number against the last used
790     + * relative inode number in this group. if it is greater
791     + * we need to update the bg_itable_unused count
792     + *
793     + */
794     + if (ino > free)
795     + gdp->bg_itable_unused =
796     + cpu_to_le16(EXT4_INODES_PER_GROUP(sb) - ino);
797     + }
798     + le16_add_cpu(&gdp->bg_free_inodes_count, -1);
799     + if (S_ISDIR(mode)) {
800     + le16_add_cpu(&gdp->bg_used_dirs_count, 1);
801     + }
802     + gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp);
803     +err_ret:
804     + spin_unlock(sb_bgl_lock(sbi, group));
805     + return retval;
806     +}
807     +
808     +/*
809     * There are two policies for allocating an inode. If the new inode is
810     * a directory, then a forward search is made for a block group with both
811     * free space and a low directory-to-inode ratio; if that fails, then of
812     @@ -649,8 +742,12 @@ repeat_in_this_group:
813     if (err)
814     goto fail;
815    
816     - if (!ext4_set_bit_atomic(sb_bgl_lock(sbi, group),
817     - ino, bitmap_bh->b_data)) {
818     + BUFFER_TRACE(bh2, "get_write_access");
819     + err = ext4_journal_get_write_access(handle, bh2);
820     + if (err)
821     + goto fail;
822     + if (!ext4_claim_inode(sb, bitmap_bh,
823     + ino, group, mode)) {
824     /* we won it */
825     BUFFER_TRACE(bitmap_bh,
826     "call ext4_journal_dirty_metadata");
827     @@ -658,10 +755,13 @@ repeat_in_this_group:
828     bitmap_bh);
829     if (err)
830     goto fail;
831     + /* zero bit is inode number 1*/
832     + ino++;
833     goto got;
834     }
835     /* we lost it */
836     jbd2_journal_release_buffer(handle, bitmap_bh);
837     + jbd2_journal_release_buffer(handle, bh2);
838    
839     if (++ino < EXT4_INODES_PER_GROUP(sb))
840     goto repeat_in_this_group;
841     @@ -681,21 +781,6 @@ repeat_in_this_group:
842     goto out;
843    
844     got:
845     - ino++;
846     - if ((group == 0 && ino < EXT4_FIRST_INO(sb)) ||
847     - ino > EXT4_INODES_PER_GROUP(sb)) {
848     - ext4_error(sb, __func__,
849     - "reserved inode or inode > inodes count - "
850     - "block_group = %lu, inode=%lu", group,
851     - ino + group * EXT4_INODES_PER_GROUP(sb));
852     - err = -EIO;
853     - goto fail;
854     - }
855     -
856     - BUFFER_TRACE(bh2, "get_write_access");
857     - err = ext4_journal_get_write_access(handle, bh2);
858     - if (err) goto fail;
859     -
860     /* We may have to initialize the block bitmap if it isn't already */
861     if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM) &&
862     gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
863     @@ -730,47 +815,10 @@ got:
864     if (err)
865     goto fail;
866     }
867     -
868     - spin_lock(sb_bgl_lock(sbi, group));
869     - /* If we didn't allocate from within the initialized part of the inode
870     - * table then we need to initialize up to this inode. */
871     - if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) {
872     - if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) {
873     - gdp->bg_flags &= cpu_to_le16(~EXT4_BG_INODE_UNINIT);
874     -
875     - /* When marking the block group with
876     - * ~EXT4_BG_INODE_UNINIT we don't want to depend
877     - * on the value of bg_itable_unused even though
878     - * mke2fs could have initialized the same for us.
879     - * Instead we calculated the value below
880     - */
881     -
882     - free = 0;
883     - } else {
884     - free = EXT4_INODES_PER_GROUP(sb) -
885     - le16_to_cpu(gdp->bg_itable_unused);
886     - }
887     -
888     - /*
889     - * Check the relative inode number against the last used
890     - * relative inode number in this group. if it is greater
891     - * we need to update the bg_itable_unused count
892     - *
893     - */
894     - if (ino > free)
895     - gdp->bg_itable_unused =
896     - cpu_to_le16(EXT4_INODES_PER_GROUP(sb) - ino);
897     - }
898     -
899     - le16_add_cpu(&gdp->bg_free_inodes_count, -1);
900     - if (S_ISDIR(mode)) {
901     - le16_add_cpu(&gdp->bg_used_dirs_count, 1);
902     - }
903     - gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp);
904     - spin_unlock(sb_bgl_lock(sbi, group));
905     - BUFFER_TRACE(bh2, "call ext4_journal_dirty_metadata");
906     + BUFFER_TRACE(bh2, "call ext4_handle_dirty_metadata");
907     err = ext4_journal_dirty_metadata(handle, bh2);
908     - if (err) goto fail;
909     + if (err)
910     + goto fail;
911    
912     percpu_counter_dec(&sbi->s_freeinodes_counter);
913     if (S_ISDIR(mode))
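ext4_claim_inode() above folds the bitmap test-and-set and the group-descriptor updates into one sb_bgl_lock critical section, so a racing reader can never observe the bit set while the counters are stale. A pthread sketch of claim-under-one-lock; names and sizes are invented:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t group_lock = PTHREAD_MUTEX_INITIALIZER;
static unsigned long bitmap;   /* one bit per inode in the group */
static int free_inodes = 16;

static int claim_inode(int ino)
{
    int claimed = 0;

    pthread_mutex_lock(&group_lock);
    if (!(bitmap & (1UL << ino))) {
        bitmap |= 1UL << ino;  /* the test-and-set */
        free_inodes--;         /* descriptor update in the same section */
        claimed = 1;
    }
    pthread_mutex_unlock(&group_lock);
    return claimed;
}

int main(void)
{
    int first = claim_inode(3);
    int second = claim_inode(3);

    printf("first=%d second=%d free_left=%d\n", first, second, free_inodes);
    return 0;
}

(Compile with -pthread.)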
914     diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
915     index d77f674..6e7f085 100644
916     --- a/fs/ext4/inode.c
917     +++ b/fs/ext4/inode.c
918     @@ -351,9 +351,9 @@ static int ext4_block_to_path(struct inode *inode,
919     final = ptrs;
920     } else {
921     ext4_warning(inode->i_sb, "ext4_block_to_path",
922     - "block %lu > max",
923     + "block %lu > max in inode %lu",
924     i_block + direct_blocks +
925     - indirect_blocks + double_blocks);
926     + indirect_blocks + double_blocks, inode->i_ino);
927     }
928     if (boundary)
929     *boundary = final - 1 - (i_block & (ptrs - 1));
930     @@ -1648,18 +1648,25 @@ struct mpage_da_data {
931     */
932     static int mpage_da_submit_io(struct mpage_da_data *mpd)
933     {
934     - struct address_space *mapping = mpd->inode->i_mapping;
935     - int ret = 0, err, nr_pages, i;
936     - unsigned long index, end;
937     + long pages_skipped;
938     struct pagevec pvec;
939     + unsigned long index, end;
940     + int ret = 0, err, nr_pages, i;
941     + struct inode *inode = mpd->inode;
942     + struct address_space *mapping = inode->i_mapping;
943    
944     BUG_ON(mpd->next_page <= mpd->first_page);
945     - pagevec_init(&pvec, 0);
946     + /*
947     + * We need to start from the first_page to the next_page - 1
948     + * to make sure we also write the mapped dirty buffer_heads.
949     + * If we look at mpd->lbh.b_blocknr we would only be looking
950     + * at the currently mapped buffer_heads.
951     + */
952     index = mpd->first_page;
953     end = mpd->next_page - 1;
954    
955     + pagevec_init(&pvec, 0);
956     while (index <= end) {
957     - /* XXX: optimize tail */
958     nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
959     if (nr_pages == 0)
960     break;
961     @@ -1671,6 +1678,10 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd)
962     break;
963     index++;
964    
965     + BUG_ON(!PageLocked(page));
966     + BUG_ON(PageWriteback(page));
967     +
968     + pages_skipped = mpd->wbc->pages_skipped;
969     err = mapping->a_ops->writepage(page, mpd->wbc);
970     if (!err)
971     mpd->pages_written++;
972     @@ -1991,11 +2002,29 @@ static int __mpage_da_writepage(struct page *page,
973     bh = head;
974     do {
975     BUG_ON(buffer_locked(bh));
976     + /*
977     + * We need to try to allocate
978     + * unmapped blocks in the same page.
979     + * Otherwise we won't make progress
980     + * with the page in ext4_da_writepage
981     + */
982     if (buffer_dirty(bh) &&
983     (!buffer_mapped(bh) || buffer_delay(bh))) {
984     mpage_add_bh_to_extent(mpd, logical, bh);
985     if (mpd->io_done)
986     return MPAGE_DA_EXTENT_TAIL;
987     + } else if (buffer_dirty(bh) && (buffer_mapped(bh))) {
988     + /*
989     + * mapped dirty buffer. We need to update
990     + * the b_state because we look at
991     + * b_state in mpage_da_map_blocks. We don't
992     + * update b_size because if we find an
993     + * unmapped buffer_head later we need to
994     + * use the b_state flag of that buffer_head.
995     + */
996     + if (mpd->lbh.b_size == 0)
997     + mpd->lbh.b_state =
998     + bh->b_state & BH_FLAGS;
999     }
1000     logical++;
1001     } while ((bh = bh->b_this_page) != head);
1002     @@ -2298,6 +2327,20 @@ static int ext4_da_writepages(struct address_space *mapping,
1003     */
1004     if (!mapping->nrpages || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
1005     return 0;
1006     +
1007     + /*
1008     + * If the filesystem has aborted, it is read-only, so return
1009     + * right away instead of dumping stack traces later on that
1010     + * will obscure the real source of the problem. We test
1011     + * EXT4_MOUNT_ABORT instead of sb->s_flag's MS_RDONLY because
1012     + * the latter could be true if the filesystem is mounted
1013     + * read-only, and in that case, ext4_da_writepages should
1014     + * *never* be called, so if that ever happens, we would want
1015     + * the stack trace.
1016     + */
1017     + if (unlikely(sbi->s_mount_opt & EXT4_MOUNT_ABORT))
1018     + return -EROFS;
1019     +
1020     /*
1021     * Make sure nr_to_write is >= sbi->s_mb_stream_request
1022     * This make sure small files blocks are allocated in
1023     @@ -2336,7 +2379,7 @@ restart_loop:
1024     handle = ext4_journal_start(inode, needed_blocks);
1025     if (IS_ERR(handle)) {
1026     ret = PTR_ERR(handle);
1027     - printk(KERN_EMERG "%s: jbd2_start: "
1028     + printk(KERN_CRIT "%s: jbd2_start: "
1029     "%ld pages, ino %lu; err %d\n", __func__,
1030     wbc->nr_to_write, inode->i_ino, ret);
1031     dump_stack();
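The writepages hunk adds a fail-fast guard: if the filesystem has already aborted, return -EROFS at the entry point instead of producing confusing stack dumps deeper in the journal code. The guard pattern reduced to a flag check, names invented:

#include <errno.h>
#include <stdio.h>

#define MOUNT_ABORT 0x200   /* invented stand-in for EXT4_MOUNT_ABORT */

static unsigned int mount_opts;

static int da_writepages(void)
{
    if (mount_opts & MOUNT_ABORT)
        return -EROFS;      /* fail fast at the entry point */
    return 0;               /* ...normal writeback would follow */
}

int main(void)
{
    mount_opts |= MOUNT_ABORT;   /* simulate an aborted journal */
    printf("writepages -> %d (expected -%d)\n", da_writepages(), EROFS);
    return 0;
}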
1032     diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
1033     index ba86b56..dbf6c0e 100644
1034     --- a/fs/ext4/mballoc.c
1035     +++ b/fs/ext4/mballoc.c
1036     @@ -100,7 +100,7 @@
1037     * inode as:
1038     *
1039     * { page }
1040     - * [ group 0 buddy][ group 0 bitmap] [group 1][ group 1]...
1041     + * [ group 0 bitmap][ group 0 buddy] [group 1][ group 1]...
1042     *
1043     *
1044     * one block each for bitmap and buddy information. So for each group we
1045     @@ -330,6 +330,18 @@
1046     * object
1047     *
1048     */
1049     +static struct kmem_cache *ext4_pspace_cachep;
1050     +static struct kmem_cache *ext4_ac_cachep;
1051     +static struct kmem_cache *ext4_free_ext_cachep;
1052     +static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
1053     + ext4_group_t group);
1054     +static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
1055     + ext4_group_t group);
1056     +static int ext4_mb_init_per_dev_proc(struct super_block *sb);
1057     +static int ext4_mb_destroy_per_dev_proc(struct super_block *sb);
1058     +static void ext4_mb_free_committed_blocks(struct super_block *);
1059     +static void ext4_mb_poll_new_transaction(struct super_block *sb,
1060     + handle_t *handle);
1061    
1062     static inline void *mb_correct_addr_and_bit(int *bit, void *addr)
1063     {
1064     @@ -718,7 +730,7 @@ static void ext4_mb_generate_buddy(struct super_block *sb,
1065     * stored in the inode as
1066     *
1067     * { page }
1068     - * [ group 0 buddy][ group 0 bitmap] [group 1][ group 1]...
1069     + * [ group 0 bitmap][ group 0 buddy] [group 1][ group 1]...
1070     *
1071     *
1072     * one block each for bitmap and buddy information.
1073     @@ -784,20 +796,42 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
1074     if (bh[i] == NULL)
1075     goto out;
1076    
1077     - if (bh_uptodate_or_lock(bh[i]))
1078     + if (bitmap_uptodate(bh[i]))
1079     continue;
1080    
1081     + lock_buffer(bh[i]);
1082     + if (bitmap_uptodate(bh[i])) {
1083     + unlock_buffer(bh[i]);
1084     + continue;
1085     + }
1086     spin_lock(sb_bgl_lock(EXT4_SB(sb), first_group + i));
1087     if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
1088     ext4_init_block_bitmap(sb, bh[i],
1089     first_group + i, desc);
1090     + set_bitmap_uptodate(bh[i]);
1091     set_buffer_uptodate(bh[i]);
1092     unlock_buffer(bh[i]);
1093     spin_unlock(sb_bgl_lock(EXT4_SB(sb), first_group + i));
1094     continue;
1095     }
1096     spin_unlock(sb_bgl_lock(EXT4_SB(sb), first_group + i));
1097     + if (buffer_uptodate(bh[i])) {
1098     + /*
1099     + * if the group is not uninit and bh is uptodate,
1100     + * the bitmap is also uptodate
1101     + */
1102     + set_bitmap_uptodate(bh[i]);
1103     + unlock_buffer(bh[i]);
1104     + continue;
1105     + }
1106     get_bh(bh[i]);
1107     + /*
1108     + * submit the buffer_head for read. We can
1109     + * safely mark the bitmap as uptodate now.
1110     + * We do it here so the bitmap uptodate bit
1111     + * gets set with the buffer lock held.
1112     + */
1113     + set_bitmap_uptodate(bh[i]);
1114     bh[i]->b_end_io = end_buffer_read_sync;
1115     submit_bh(READ, bh[i]);
1116     mb_debug("read bitmap for group %lu\n", first_group + i);
1117     @@ -814,6 +848,8 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
1118    
1119     err = 0;
1120     first_block = page->index * blocks_per_page;
1121     + /* init the page */
1122     + memset(page_address(page), 0xff, PAGE_CACHE_SIZE);
1123     for (i = 0; i < blocks_per_page; i++) {
1124     int group;
1125     struct ext4_group_info *grinfo;
1126     @@ -840,7 +876,6 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
1127     BUG_ON(incore == NULL);
1128     mb_debug("put buddy for group %u in page %lu/%x\n",
1129     group, page->index, i * blocksize);
1130     - memset(data, 0xff, blocksize);
1131     grinfo = ext4_get_group_info(sb, group);
1132     grinfo->bb_fragments = 0;
1133     memset(grinfo->bb_counters, 0,
1134     @@ -848,7 +883,9 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
1135     /*
1136     * incore got set to the group block bitmap below
1137     */
1138     + ext4_lock_group(sb, group);
1139     ext4_mb_generate_buddy(sb, data, incore, group);
1140     + ext4_unlock_group(sb, group);
1141     incore = NULL;
1142     } else {
1143     /* this is block of bitmap */
1144     @@ -862,6 +899,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
1145    
1146     /* mark all preallocated blks used in in-core bitmap */
1147     ext4_mb_generate_from_pa(sb, data, group);
1148     + ext4_mb_generate_from_freelist(sb, data, group);
1149     ext4_unlock_group(sb, group);
1150    
1151     /* set incore so that the buddy information can be
1152     @@ -886,18 +924,20 @@ static noinline_for_stack int
1153     ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
1154     struct ext4_buddy *e4b)
1155     {
1156     - struct ext4_sb_info *sbi = EXT4_SB(sb);
1157     - struct inode *inode = sbi->s_buddy_cache;
1158     int blocks_per_page;
1159     int block;
1160     int pnum;
1161     int poff;
1162     struct page *page;
1163     int ret;
1164     + struct ext4_group_info *grp;
1165     + struct ext4_sb_info *sbi = EXT4_SB(sb);
1166     + struct inode *inode = sbi->s_buddy_cache;
1167    
1168     mb_debug("load group %lu\n", group);
1169    
1170     blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
1171     + grp = ext4_get_group_info(sb, group);
1172    
1173     e4b->bd_blkbits = sb->s_blocksize_bits;
1174     e4b->bd_info = ext4_get_group_info(sb, group);
1175     @@ -905,6 +945,15 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
1176     e4b->bd_group = group;
1177     e4b->bd_buddy_page = NULL;
1178     e4b->bd_bitmap_page = NULL;
1179     + e4b->alloc_semp = &grp->alloc_sem;
1180     +
1181     + /* Take the read lock on the group alloc
1182     + * sem. This makes sure a parallel
1183     + * ext4_mb_init_group happening on other
1184     + * groups mapped by the page is blocked
1185     + * till we are done with allocation
1186     + */
1187     + down_read(e4b->alloc_semp);
1188    
1189     /*
1190     * the buddy cache inode stores the block bitmap
1191     @@ -920,6 +969,14 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
1192     page = find_get_page(inode->i_mapping, pnum);
1193     if (page == NULL || !PageUptodate(page)) {
1194     if (page)
1195     + /*
1196     + * drop the page reference and try
1197     + * to get the page with lock. If we
1198     + * are not uptodate that implies
1199     + * somebody just created the page but
1200     + * is yet to initialize the same. So
1201     + * wait for it to initialize.
1202     + */
1203     page_cache_release(page);
1204     page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
1205     if (page) {
1206     @@ -985,6 +1042,9 @@ err:
1207     page_cache_release(e4b->bd_buddy_page);
1208     e4b->bd_buddy = NULL;
1209     e4b->bd_bitmap = NULL;
1210     +
1211     + /* Done with the buddy cache */
1212     + up_read(e4b->alloc_semp);
1213     return ret;
1214     }
1215    
1216     @@ -994,6 +1054,9 @@ static void ext4_mb_release_desc(struct ext4_buddy *e4b)
1217     page_cache_release(e4b->bd_bitmap_page);
1218     if (e4b->bd_buddy_page)
1219     page_cache_release(e4b->bd_buddy_page);
1220     + /* Done with the buddy cache */
1221     + if (e4b->alloc_semp)
1222     + up_read(e4b->alloc_semp);
1223     }
1224    
1225    
1226     @@ -1031,7 +1094,10 @@ static void mb_clear_bits(spinlock_t *lock, void *bm, int cur, int len)
1227     cur += 32;
1228     continue;
1229     }
1230     - mb_clear_bit_atomic(lock, cur, bm);
1231     + if (lock)
1232     + mb_clear_bit_atomic(lock, cur, bm);
1233     + else
1234     + mb_clear_bit(cur, bm);
1235     cur++;
1236     }
1237     }
1238     @@ -1049,7 +1115,10 @@ static void mb_set_bits(spinlock_t *lock, void *bm, int cur, int len)
1239     cur += 32;
1240     continue;
1241     }
1242     - mb_set_bit_atomic(lock, cur, bm);
1243     + if (lock)
1244     + mb_set_bit_atomic(lock, cur, bm);
1245     + else
1246     + mb_set_bit(cur, bm);
1247     cur++;
1248     }
1249     }
1250     @@ -1296,13 +1365,20 @@ static void ext4_mb_use_best_found(struct ext4_allocation_context *ac,
1251     ac->ac_tail = ret & 0xffff;
1252     ac->ac_buddy = ret >> 16;
1253    
1254     - /* XXXXXXX: SUCH A HORRIBLE **CK */
1255     - /*FIXME!! Why ? */
1256     + /*
1257     + * take the page reference. We want the page to be pinned
1258     + * so that we don't get an ext4_mb_init_cache call for this
1259     + * group until we update the bitmap. That would mean we
1260     + * double allocate blocks. The reference is dropped
1261     + * in ext4_mb_release_context
1262     + */
1263     ac->ac_bitmap_page = e4b->bd_bitmap_page;
1264     get_page(ac->ac_bitmap_page);
1265     ac->ac_buddy_page = e4b->bd_buddy_page;
1266     get_page(ac->ac_buddy_page);
1267     -
1268     + /* on allocation we use ac to track the held semaphore */
1269     + ac->alloc_semp = e4b->alloc_semp;
1270     + e4b->alloc_semp = NULL;
1271     /* store last allocated for subsequent stream allocation */
1272     if ((ac->ac_flags & EXT4_MB_HINT_DATA)) {
1273     spin_lock(&sbi->s_md_lock);
1274     @@ -1326,6 +1402,8 @@ static void ext4_mb_check_limits(struct ext4_allocation_context *ac,
1275     struct ext4_free_extent ex;
1276     int max;
1277    
1278     + if (ac->ac_status == AC_STATUS_FOUND)
1279     + return;
1280     /*
1281     * We don't want to scan for a whole year
1282     */
1283     @@ -1692,6 +1770,173 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac,
1284     return 0;
1285     }
1286    
1287     +/*
1288     + * lock the group_info alloc_sem of all the groups
1289     + * belonging to the same buddy cache page. This
1290     + * makes sure other parallel operations on the buddy
1291     + * cache don't happen while holding the buddy cache
1292     + * lock
1293     + */
1294     +int ext4_mb_get_buddy_cache_lock(struct super_block *sb, ext4_group_t group)
1295     +{
1296     + int i;
1297     + int block, pnum;
1298     + int blocks_per_page;
1299     + int groups_per_page;
1300     + ext4_group_t first_group;
1301     + struct ext4_group_info *grp;
1302     +
1303     + blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
1304     + /*
1305     + * the buddy cache inode stores the block bitmap
1306     + * and buddy information in consecutive blocks.
1307     + * So for each group we need two blocks.
1308     + */
1309     + block = group * 2;
1310     + pnum = block / blocks_per_page;
1311     + first_group = pnum * blocks_per_page / 2;
1312     +
1313     + groups_per_page = blocks_per_page >> 1;
1314     + if (groups_per_page == 0)
1315     + groups_per_page = 1;
1316     + /* read all groups the page covers into the cache */
1317     + for (i = 0; i < groups_per_page; i++) {
1318     +
1319     + if ((first_group + i) >= EXT4_SB(sb)->s_groups_count)
1320     + break;
1321     + grp = ext4_get_group_info(sb, first_group + i);
1322     + /* take each group's write allocation
1323     + * semaphore. This makes sure there is
1324     + * no block allocation going on in any
1325     + * of those groups
1326     + */
1327     + down_write(&grp->alloc_sem);
1328     + }
1329     + return i;
1330     +}
1331     +
1332     +void ext4_mb_put_buddy_cache_lock(struct super_block *sb,
1333     + ext4_group_t group, int locked_group)
1334     +{
1335     + int i;
1336     + int block, pnum;
1337     + int blocks_per_page;
1338     + ext4_group_t first_group;
1339     + struct ext4_group_info *grp;
1340     +
1341     + blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
1342     + /*
1343     + * the buddy cache inode stores the block bitmap
1344     + * and buddy information in consecutive blocks.
1345     + * So for each group we need two blocks.
1346     + */
1347     + block = group * 2;
1348     + pnum = block / blocks_per_page;
1349     + first_group = pnum * blocks_per_page / 2;
1350     + /* release locks on all the groups */
1351     + for (i = 0; i < locked_group; i++) {
1352     +
1353     + grp = ext4_get_group_info(sb, first_group + i);
1354     + /* release each group's write allocation
1355     + * semaphore, taken in
1356     + * ext4_mb_get_buddy_cache_lock, so block
1357     + * allocation can resume in those groups
1358     + */
1359     + up_write(&grp->alloc_sem);
1360     + }
1361     +
1362     +}
1363     +
1364     +static int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
1365     +{
1366     +
1367     + int ret;
1368     + void *bitmap;
1369     + int blocks_per_page;
1370     + int block, pnum, poff;
1371     + int num_grp_locked = 0;
1372     + struct ext4_group_info *this_grp;
1373     + struct ext4_sb_info *sbi = EXT4_SB(sb);
1374     + struct inode *inode = sbi->s_buddy_cache;
1375     + struct page *page = NULL, *bitmap_page = NULL;
1376     +
1377     + mb_debug("init group %lu\n", group);
1378     + blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
1379     + this_grp = ext4_get_group_info(sb, group);
1380     + /*
1381     + * This ensures we don't add group
1382     + * to this buddy cache via resize
1383     + */
1384     + num_grp_locked = ext4_mb_get_buddy_cache_lock(sb, group);
1385     + if (!EXT4_MB_GRP_NEED_INIT(this_grp)) {
1386     + /*
1387     + * somebody initialized the group
1388     + * return without doing anything
1389     + */
1390     + ret = 0;
1391     + goto err;
1392     + }
1393     + /*
1394     + * the buddy cache inode stores the block bitmap
1395     + * and buddy information in consecutive blocks.
1396     + * So for each group we need two blocks.
1397     + */
1398     + block = group * 2;
1399     + pnum = block / blocks_per_page;
1400     + poff = block % blocks_per_page;
1401     + page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
1402     + if (page) {
1403     + BUG_ON(page->mapping != inode->i_mapping);
1404     + ret = ext4_mb_init_cache(page, NULL);
1405     + if (ret) {
1406     + unlock_page(page);
1407     + goto err;
1408     + }
1409     + unlock_page(page);
1410     + }
1411     + if (page == NULL || !PageUptodate(page)) {
1412     + ret = -EIO;
1413     + goto err;
1414     + }
1415     + mark_page_accessed(page);
1416     + bitmap_page = page;
1417     + bitmap = page_address(page) + (poff * sb->s_blocksize);
1418     +
1419     + /* init buddy cache */
1420     + block++;
1421     + pnum = block / blocks_per_page;
1422     + poff = block % blocks_per_page;
1423     + page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
1424     + if (page == bitmap_page) {
1425     + /*
1426     + * If both the bitmap and buddy are in
1427     + * the same page we don't need to force
1428     + * init the buddy
1429     + */
1430     + unlock_page(page);
1431     + } else if (page) {
1432     + BUG_ON(page->mapping != inode->i_mapping);
1433     + ret = ext4_mb_init_cache(page, bitmap);
1434     + if (ret) {
1435     + unlock_page(page);
1436     + goto err;
1437     + }
1438     + unlock_page(page);
1439     + }
1440     + if (page == NULL || !PageUptodate(page)) {
1441     + ret = -EIO;
1442     + goto err;
1443     + }
1444     + mark_page_accessed(page);
1445     +err:
1446     + ext4_mb_put_buddy_cache_lock(sb, group, num_grp_locked);
1447     + if (bitmap_page)
1448     + page_cache_release(bitmap_page);
1449     + if (page)
1450     + page_cache_release(page);
1451     + return ret;
1452     +}
1453     +
1454     static noinline_for_stack int
1455     ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
1456     {
1457     @@ -1775,7 +2020,7 @@ repeat:
1458     group = 0;
1459    
1460     /* quick check to skip empty groups */
1461     - grp = ext4_get_group_info(ac->ac_sb, group);
1462     + grp = ext4_get_group_info(sb, group);
1463     if (grp->bb_free == 0)
1464     continue;
1465    
1466     @@ -1788,10 +2033,9 @@ repeat:
1467     * we need full data about the group
1468     * to make a good selection
1469     */
1470     - err = ext4_mb_load_buddy(sb, group, &e4b);
1471     + err = ext4_mb_init_group(sb, group);
1472     if (err)
1473     goto out;
1474     - ext4_mb_release_desc(&e4b);
1475     }
1476    
1477     /*
1478     @@ -2299,6 +2543,8 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
1479     }
1480    
1481     INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list);
1482     + init_rwsem(&meta_group_info[i]->alloc_sem);
1483     + meta_group_info[i]->bb_free_root.rb_node = NULL;
1484    
1485     #ifdef DOUBLE_CHECK
1486     {
1487     @@ -2325,54 +2571,6 @@ exit_meta_group_info:
1488     } /* ext4_mb_add_groupinfo */
1489    
1490     /*
1491     - * Add a group to the existing groups.
1492     - * This function is used for online resize
1493     - */
1494     -int ext4_mb_add_more_groupinfo(struct super_block *sb, ext4_group_t group,
1495     - struct ext4_group_desc *desc)
1496     -{
1497     - struct ext4_sb_info *sbi = EXT4_SB(sb);
1498     - struct inode *inode = sbi->s_buddy_cache;
1499     - int blocks_per_page;
1500     - int block;
1501     - int pnum;
1502     - struct page *page;
1503     - int err;
1504     -
1505     - /* Add group based on group descriptor*/
1506     - err = ext4_mb_add_groupinfo(sb, group, desc);
1507     - if (err)
1508     - return err;
1509     -
1510     - /*
1511     - * Cache pages containing dynamic mb_alloc datas (buddy and bitmap
1512     - * datas) are set not up to date so that they will be re-initilaized
1513     - * during the next call to ext4_mb_load_buddy
1514     - */
1515     -
1516     - /* Set buddy page as not up to date */
1517     - blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
1518     - block = group * 2;
1519     - pnum = block / blocks_per_page;
1520     - page = find_get_page(inode->i_mapping, pnum);
1521     - if (page != NULL) {
1522     - ClearPageUptodate(page);
1523     - page_cache_release(page);
1524     - }
1525     -
1526     - /* Set bitmap page as not up to date */
1527     - block++;
1528     - pnum = block / blocks_per_page;
1529     - page = find_get_page(inode->i_mapping, pnum);
1530     - if (page != NULL) {
1531     - ClearPageUptodate(page);
1532     - page_cache_release(page);
1533     - }
1534     -
1535     - return 0;
1536     -}
1537     -
1538     -/*
1539     * Update an existing group.
1540     * This function is used for online resize
1541     */
1542     @@ -2495,6 +2693,8 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
1543     clear_opt(sbi->s_mount_opt, MBALLOC);
1544     return -ENOMEM;
1545     }
1546     +
1547     + i = (sb->s_blocksize_bits + 2) * sizeof(unsigned int);
1548     sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL);
1549     if (sbi->s_mb_maxs == NULL) {
1550     clear_opt(sbi->s_mount_opt, MBALLOC);
1551     @@ -2658,13 +2858,11 @@ int ext4_mb_release(struct super_block *sb)
1552     static noinline_for_stack void
1553     ext4_mb_free_committed_blocks(struct super_block *sb)
1554     {
1555     - struct ext4_sb_info *sbi = EXT4_SB(sb);
1556     - int err;
1557     - int i;
1558     - int count = 0;
1559     - int count2 = 0;
1560     - struct ext4_free_metadata *md;
1561     struct ext4_buddy e4b;
1562     + struct ext4_group_info *db;
1563     + struct ext4_sb_info *sbi = EXT4_SB(sb);
1564     + int err, count = 0, count2 = 0;
1565     + struct ext4_free_data *entry;
1566    
1567     if (list_empty(&sbi->s_committed_transaction))
1568     return;
1569     @@ -2672,44 +2870,46 @@ ext4_mb_free_committed_blocks(struct super_block *sb)
1570     /* there is committed blocks to be freed yet */
1571     do {
1572     /* get next array of blocks */
1573     - md = NULL;
1574     + entry = NULL;
1575     spin_lock(&sbi->s_md_lock);
1576     if (!list_empty(&sbi->s_committed_transaction)) {
1577     - md = list_entry(sbi->s_committed_transaction.next,
1578     - struct ext4_free_metadata, list);
1579     - list_del(&md->list);
1580     + entry = list_entry(sbi->s_committed_transaction.next,
1581     + struct ext4_free_data, list);
1582     + list_del(&entry->list);
1583     }
1584     spin_unlock(&sbi->s_md_lock);
1585    
1586     - if (md == NULL)
1587     + if (entry == NULL)
1588     break;
1589    
1590     mb_debug("gonna free %u blocks in group %lu (0x%p):",
1591     - md->num, md->group, md);
1592     + entry->count, entry->group, entry);
1593    
1594     - err = ext4_mb_load_buddy(sb, md->group, &e4b);
1595     + err = ext4_mb_load_buddy(sb, entry->group, &e4b);
1596     /* we expect to find existing buddy because it's pinned */
1597     BUG_ON(err != 0);
1598    
1599     + db = e4b.bd_info;
1600     /* there are blocks to put in buddy to make them really free */
1601     - count += md->num;
1602     + count += entry->count;
1603     count2++;
1604     - ext4_lock_group(sb, md->group);
1605     - for (i = 0; i < md->num; i++) {
1606     - mb_debug(" %u", md->blocks[i]);
1607     - mb_free_blocks(NULL, &e4b, md->blocks[i], 1);
1608     + ext4_lock_group(sb, entry->group);
1609     + /* Take it out of the per-group rb tree */
1610     + rb_erase(&entry->node, &(db->bb_free_root));
1611     + mb_free_blocks(NULL, &e4b, entry->start_blk, entry->count);
1612     +
1613     + if (!db->bb_free_root.rb_node) {
1614     + /* No more items in the per-group rb tree;
1615     + * balance refcounts from ext4_mb_free_metadata()
1616     + */
1617     + page_cache_release(e4b.bd_buddy_page);
1618     + page_cache_release(e4b.bd_bitmap_page);
1619     }
1620     - mb_debug("\n");
1621     - ext4_unlock_group(sb, md->group);
1622     -
1623     - /* balance refcounts from ext4_mb_free_metadata() */
1624     - page_cache_release(e4b.bd_buddy_page);
1625     - page_cache_release(e4b.bd_bitmap_page);
1626     + ext4_unlock_group(sb, entry->group);
1627    
1628     - kfree(md);
1629     + kmem_cache_free(ext4_free_ext_cachep, entry);
1630     ext4_mb_release_desc(&e4b);
1631     -
1632     - } while (md);
1633     + } while (1);
1634    
1635     mb_debug("freed %u blocks in %u structures\n", count, count2);
1636     }
1637     @@ -2864,6 +3064,16 @@ int __init init_ext4_mballoc(void)
1638     kmem_cache_destroy(ext4_pspace_cachep);
1639     return -ENOMEM;
1640     }
1641     +
1642     + ext4_free_ext_cachep =
1643     + kmem_cache_create("ext4_free_block_extents",
1644     + sizeof(struct ext4_free_data),
1645     + 0, SLAB_RECLAIM_ACCOUNT, NULL);
1646     + if (ext4_free_ext_cachep == NULL) {
1647     + kmem_cache_destroy(ext4_pspace_cachep);
1648     + kmem_cache_destroy(ext4_ac_cachep);
1649     + return -ENOMEM;
1650     + }
1651     #ifdef CONFIG_PROC_FS
1652     proc_root_ext4 = proc_mkdir("fs/ext4", NULL);
1653     if (proc_root_ext4 == NULL)
1654     @@ -2880,6 +3090,7 @@ void exit_ext4_mballoc(void)
1655     #ifdef CONFIG_PROC_FS
1656     remove_proc_entry("fs/ext4", NULL);
1657     #endif
1658     + kmem_cache_destroy(ext4_free_ext_cachep);
1659     }
1660    
1661    
1662     @@ -2941,8 +3152,8 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
1663     in_range(block + len - 1, ext4_inode_table(sb, gdp),
1664     EXT4_SB(sb)->s_itb_per_group)) {
1665     ext4_error(sb, __func__,
1666     - "Allocating block in system zone - block = %llu",
1667     - block);
1668     + "Allocating block %llu in system zone of group %lu\n",
1669     + block, ac->ac_b_ex.fe_group);
1670     /* File system mounted not to panic on error
1671     * Fix the bitmap and repeat the block allocation
1672     * We leak some of the blocks here.
1673     @@ -2964,10 +3175,9 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
1674     }
1675     }
1676     #endif
1677     - mb_set_bits(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group), bitmap_bh->b_data,
1678     - ac->ac_b_ex.fe_start, ac->ac_b_ex.fe_len);
1679     -
1680     spin_lock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group));
1681     + mb_set_bits(NULL, bitmap_bh->b_data,
1682     + ac->ac_b_ex.fe_start, ac->ac_b_ex.fe_len);
1683     if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
1684     gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
1685     gdp->bg_free_blocks_count =
1686     @@ -3400,10 +3610,37 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
1687     ac->ac_criteria = 20;
1688     return 1;
1689     }
1690     +
1691     return 0;
1692     }
1693    
1694     /*
1695     + * This function walks through all blocks freed in the group
1696     + * but not yet committed and marks them used in the in-core bitmap.
1697     + * The buddy must be generated from this bitmap.
1698     + * Must be called with the ext4 group lock held (ext4_lock_group)
1699     + */
1700     +static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
1701     + ext4_group_t group)
1702     +{
1703     + struct rb_node *n;
1704     + struct ext4_group_info *grp;
1705     + struct ext4_free_data *entry;
1706     +
1707     + grp = ext4_get_group_info(sb, group);
1708     + n = rb_first(&(grp->bb_free_root));
1709     +
1710     + while (n) {
1711     + entry = rb_entry(n, struct ext4_free_data, node);
1712     + mb_set_bits(sb_bgl_lock(EXT4_SB(sb), group),
1713     + bitmap, entry->start_blk,
1714     + entry->count);
1715     + n = rb_next(n);
1716     + }
1717     + return;
1718     +}
1719     +
1720     +/*
1721     * the function goes through all preallocation in this group and marks them
1722     * used in in-core bitmap. buddy must be generated from this bitmap
1723     * Need to be called with ext4 group lock (ext4_lock_group)
1724     @@ -4166,6 +4403,7 @@ ext4_mb_initialize_context(struct ext4_allocation_context *ac,
1725     ac->ac_pa = NULL;
1726     ac->ac_bitmap_page = NULL;
1727     ac->ac_buddy_page = NULL;
1728     + ac->alloc_semp = NULL;
1729     ac->ac_lg = NULL;
1730    
1731     /* we have to define context: we'll we work with a file or
1732     @@ -4346,6 +4584,8 @@ static int ext4_mb_release_context(struct ext4_allocation_context *ac)
1733     }
1734     ext4_mb_put_pa(ac, ac->ac_sb, pa);
1735     }
1736     + if (ac->alloc_semp)
1737     + up_read(ac->alloc_semp);
1738     if (ac->ac_bitmap_page)
1739     page_cache_release(ac->ac_bitmap_page);
1740     if (ac->ac_buddy_page)
1741     @@ -4449,10 +4689,14 @@ repeat:
1742     ac->ac_o_ex.fe_len < ac->ac_b_ex.fe_len)
1743     ext4_mb_new_preallocation(ac);
1744     }
1745     -
1746     if (likely(ac->ac_status == AC_STATUS_FOUND)) {
1747     *errp = ext4_mb_mark_diskspace_used(ac, handle);
1748     if (*errp == -EAGAIN) {
1749     + /*
1750     + * drop the reference that we took
1751     + * in ext4_mb_use_best_found
1752     + */
1753     + ext4_mb_release_context(ac);
1754     ac->ac_b_ex.fe_group = 0;
1755     ac->ac_b_ex.fe_start = 0;
1756     ac->ac_b_ex.fe_len = 0;
1757     @@ -4517,65 +4761,97 @@ static void ext4_mb_poll_new_transaction(struct super_block *sb,
1758     ext4_mb_free_committed_blocks(sb);
1759     }
1760    
1761     +/*
1762     + * We can merge two free data extents only if the physical blocks
1763     + * are contiguous, AND the extents were freed by the same transaction,
1764     + * AND the blocks are associated with the same group.
1765     + */
1766     +static int can_merge(struct ext4_free_data *entry1,
1767     + struct ext4_free_data *entry2)
1768     +{
1769     + if ((entry1->t_tid == entry2->t_tid) &&
1770     + (entry1->group == entry2->group) &&
1771     + ((entry1->start_blk + entry1->count) == entry2->start_blk))
1772     + return 1;
1773     + return 0;
1774     +}
1775     +
1776     static noinline_for_stack int
1777     ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
1778     - ext4_group_t group, ext4_grpblk_t block, int count)
1779     + struct ext4_free_data *new_entry)
1780     {
1781     + ext4_grpblk_t block;
1782     + struct ext4_free_data *entry;
1783     struct ext4_group_info *db = e4b->bd_info;
1784     struct super_block *sb = e4b->bd_sb;
1785     struct ext4_sb_info *sbi = EXT4_SB(sb);
1786     - struct ext4_free_metadata *md;
1787     - int i;
1788     + struct rb_node **n = &db->bb_free_root.rb_node, *node;
1789     + struct rb_node *parent = NULL, *new_node;
1790    
1791     BUG_ON(e4b->bd_bitmap_page == NULL);
1792     BUG_ON(e4b->bd_buddy_page == NULL);
1793    
1794     - ext4_lock_group(sb, group);
1795     - for (i = 0; i < count; i++) {
1796     - md = db->bb_md_cur;
1797     - if (md && db->bb_tid != handle->h_transaction->t_tid) {
1798     - db->bb_md_cur = NULL;
1799     - md = NULL;
1800     + new_node = &new_entry->node;
1801     + block = new_entry->start_blk;
1802     +
1803     + if (!*n) {
1804     + /* first free block extent. We need to
1805     + * protect the buddy cache from being freed,
1806     + * otherwise we'll refresh it from
1807     + * on-disk bitmap and lose not-yet-available
1808     + * blocks */
1809     + page_cache_get(e4b->bd_buddy_page);
1810     + page_cache_get(e4b->bd_bitmap_page);
1811     + }
1812     + while (*n) {
1813     + parent = *n;
1814     + entry = rb_entry(parent, struct ext4_free_data, node);
1815     + if (block < entry->start_blk)
1816     + n = &(*n)->rb_left;
1817     + else if (block >= (entry->start_blk + entry->count))
1818     + n = &(*n)->rb_right;
1819     + else {
1820     + ext4_error(sb, __func__,
1821     + "Double free of blocks %d (%d %d)\n",
1822     + block, entry->start_blk, entry->count);
1823     + return 0;
1824     }
1825     + }
1826    
1827     - if (md == NULL) {
1828     - ext4_unlock_group(sb, group);
1829     - md = kmalloc(sizeof(*md), GFP_NOFS);
1830     - if (md == NULL)
1831     - return -ENOMEM;
1832     - md->num = 0;
1833     - md->group = group;
1834     -
1835     - ext4_lock_group(sb, group);
1836     - if (db->bb_md_cur == NULL) {
1837     - spin_lock(&sbi->s_md_lock);
1838     - list_add(&md->list, &sbi->s_active_transaction);
1839     - spin_unlock(&sbi->s_md_lock);
1840     - /* protect buddy cache from being freed,
1841     - * otherwise we'll refresh it from
1842     - * on-disk bitmap and lose not-yet-available
1843     - * blocks */
1844     - page_cache_get(e4b->bd_buddy_page);
1845     - page_cache_get(e4b->bd_bitmap_page);
1846     - db->bb_md_cur = md;
1847     - db->bb_tid = handle->h_transaction->t_tid;
1848     - mb_debug("new md 0x%p for group %lu\n",
1849     - md, md->group);
1850     - } else {
1851     - kfree(md);
1852     - md = db->bb_md_cur;
1853     - }
1854     + rb_link_node(new_node, parent, n);
1855     + rb_insert_color(new_node, &db->bb_free_root);
1856     +
1857     + /* Now see if the extent can be merged with its left and right neighbors */
1858     + node = rb_prev(new_node);
1859     + if (node) {
1860     + entry = rb_entry(node, struct ext4_free_data, node);
1861     + if (can_merge(entry, new_entry)) {
1862     + new_entry->start_blk = entry->start_blk;
1863     + new_entry->count += entry->count;
1864     + rb_erase(node, &(db->bb_free_root));
1865     + spin_lock(&sbi->s_md_lock);
1866     + list_del(&entry->list);
1867     + spin_unlock(&sbi->s_md_lock);
1868     + kmem_cache_free(ext4_free_ext_cachep, entry);
1869     }
1870     + }
1871    
1872     - BUG_ON(md->num >= EXT4_BB_MAX_BLOCKS);
1873     - md->blocks[md->num] = block + i;
1874     - md->num++;
1875     - if (md->num == EXT4_BB_MAX_BLOCKS) {
1876     - /* no more space, put full container on a sb's list */
1877     - db->bb_md_cur = NULL;
1878     + node = rb_next(new_node);
1879     + if (node) {
1880     + entry = rb_entry(node, struct ext4_free_data, node);
1881     + if (can_merge(new_entry, entry)) {
1882     + new_entry->count += entry->count;
1883     + rb_erase(node, &(db->bb_free_root));
1884     + spin_lock(&sbi->s_md_lock);
1885     + list_del(&entry->list);
1886     + spin_unlock(&sbi->s_md_lock);
1887     + kmem_cache_free(ext4_free_ext_cachep, entry);
1888     }
1889     }
1890     - ext4_unlock_group(sb, group);
1891     + /* Add the extent to the active_transaction list */
1892     + spin_lock(&sbi->s_md_lock);
1893     + list_add(&new_entry->list, &sbi->s_active_transaction);
1894     + spin_unlock(&sbi->s_md_lock);
1895     return 0;
1896     }
1897    
1898     @@ -4675,11 +4951,6 @@ do_more:
1899     err = ext4_journal_get_write_access(handle, gd_bh);
1900     if (err)
1901     goto error_return;
1902     -
1903     - err = ext4_mb_load_buddy(sb, block_group, &e4b);
1904     - if (err)
1905     - goto error_return;
1906     -
1907     #ifdef AGGRESSIVE_CHECK
1908     {
1909     int i;
1910     @@ -4687,13 +4958,6 @@ do_more:
1911     BUG_ON(!mb_test_bit(bit + i, bitmap_bh->b_data));
1912     }
1913     #endif
1914     - mb_clear_bits(sb_bgl_lock(sbi, block_group), bitmap_bh->b_data,
1915     - bit, count);
1916     -
1917     - /* We dirtied the bitmap block */
1918     - BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
1919     - err = ext4_journal_dirty_metadata(handle, bitmap_bh);
1920     -
1921     if (ac) {
1922     ac->ac_b_ex.fe_group = block_group;
1923     ac->ac_b_ex.fe_start = bit;
1924     @@ -4701,12 +4965,33 @@ do_more:
1925     ext4_mb_store_history(ac);
1926     }
1927    
1928     + err = ext4_mb_load_buddy(sb, block_group, &e4b);
1929     + if (err)
1930     + goto error_return;
1931     if (metadata) {
1932     - /* blocks being freed are metadata. these blocks shouldn't
1933     - * be used until this transaction is committed */
1934     - ext4_mb_free_metadata(handle, &e4b, block_group, bit, count);
1935     + struct ext4_free_data *new_entry;
1936     + /*
1937     + * blocks being freed are metadata. these blocks shouldn't
1938     + * be used until this transaction is committed
1939     + */
1940     + new_entry = kmem_cache_alloc(ext4_free_ext_cachep, GFP_NOFS);
1941     + new_entry->start_blk = bit;
1942     + new_entry->group = block_group;
1943     + new_entry->count = count;
1944     + new_entry->t_tid = handle->h_transaction->t_tid;
1945     + ext4_lock_group(sb, block_group);
1946     + mb_clear_bits(sb_bgl_lock(sbi, block_group), bitmap_bh->b_data,
1947     + bit, count);
1948     + ext4_mb_free_metadata(handle, &e4b, new_entry);
1949     + ext4_unlock_group(sb, block_group);
1950     } else {
1951     ext4_lock_group(sb, block_group);
1952     + /* We need to update group_info->bb_free and the bitmap
1953     + * with the group lock held; generate_buddy looks at
1954     + * them with the group lock held
1955     + */
1956     + mb_clear_bits(sb_bgl_lock(sbi, block_group), bitmap_bh->b_data,
1957     + bit, count);
1958     mb_free_blocks(inode, &e4b, bit, count);
1959     ext4_mb_return_to_preallocation(inode, &e4b, block, count);
1960     ext4_unlock_group(sb, block_group);
1961     @@ -4729,6 +5014,10 @@ do_more:
1962    
1963     *freed += count;
1964    
1965     + /* We dirtied the bitmap block */
1966     + BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
1967     + err = ext4_journal_dirty_metadata(handle, bitmap_bh);
1968     +
1969     /* And the group descriptor block */
1970     BUFFER_TRACE(gd_bh, "dirtied group descriptor block");
1971     ret = ext4_journal_dirty_metadata(handle, gd_bh);
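
Taken together, the mballoc.c hunks above replace the fixed-size ext4_free_metadata block array with per-group rb-trees of ext4_free_data extents that coalesce when adjacent. A self-contained userspace sketch of the merge rule, using a sorted singly linked list in place of the rbtree; free_rec and insert_free_rec are illustrative names, not kernel API:

#include <stdio.h>
#include <stdlib.h>

struct free_rec {
    unsigned group, start, count, tid;
    struct free_rec *next;
};

/* same rule as can_merge(): adjacent, same group, same transaction */
static int can_merge(const struct free_rec *a, const struct free_rec *b)
{
    return a->tid == b->tid && a->group == b->group &&
           a->start + a->count == b->start;
}

static struct free_rec *insert_free_rec(struct free_rec *head,
                                        struct free_rec *n)
{
    struct free_rec *prev = NULL, *cur = head;

    while (cur && cur->start < n->start) {   /* keep sorted by start */
        prev = cur;
        cur = cur->next;
    }
    n->next = cur;
    if (prev)
        prev->next = n;
    else
        head = n;

    if (n->next && can_merge(n, n->next)) {  /* absorb right neighbour */
        struct free_rec *r = n->next;
        n->count += r->count;
        n->next = r->next;
        free(r);
    }
    if (prev && can_merge(prev, n)) {        /* fold into left neighbour */
        prev->count += n->count;
        prev->next = n->next;
        free(n);
    }
    return head;
}

int main(void)
{
    struct free_rec *head = NULL, *r;
    unsigned ext[3][2] = { {10, 5}, {20, 4}, {15, 5} };  /* start, count */
    int i;

    for (i = 0; i < 3; i++) {
        struct free_rec *n = calloc(1, sizeof(*n));
        n->start = ext[i][0];
        n->count = ext[i][1];
        n->tid = 1;                          /* same transaction and group */
        head = insert_free_rec(head, n);
    }
    for (r = head; r; r = r->next)
        printf("extent: start=%u count=%u\n", r->start, r->count);
    return 0;
}

Run, the three inserts collapse into a single extent (start=10 count=14), mirroring what can_merge() plus the rb_prev()/rb_next() checks achieve in the patch.
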
1972     diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
1973     index c7c9906..0a28dd3 100644
1974     --- a/fs/ext4/mballoc.h
1975     +++ b/fs/ext4/mballoc.h
1976     @@ -18,6 +18,7 @@
1977     #include <linux/pagemap.h>
1978     #include <linux/seq_file.h>
1979     #include <linux/version.h>
1980     +#include <linux/mutex.h>
1981     #include "ext4_jbd2.h"
1982     #include "ext4.h"
1983     #include "group.h"
1984     @@ -96,25 +97,27 @@
1985     */
1986     #define MB_DEFAULT_GROUP_PREALLOC 512
1987    
1988     -static struct kmem_cache *ext4_pspace_cachep;
1989     -static struct kmem_cache *ext4_ac_cachep;
1990     +struct ext4_free_data {
1991     + /* this links the free block information from group_info */
1992     + struct rb_node node;
1993    
1994     -#ifdef EXT4_BB_MAX_BLOCKS
1995     -#undef EXT4_BB_MAX_BLOCKS
1996     -#endif
1997     -#define EXT4_BB_MAX_BLOCKS 30
1998     + /* this links the free block information from ext4_sb_info */
1999     + struct list_head list;
2000    
2001     -struct ext4_free_metadata {
2002     + /* group to which this free block extent belongs */
2003     ext4_group_t group;
2004     - unsigned short num;
2005     - ext4_grpblk_t blocks[EXT4_BB_MAX_BLOCKS];
2006     - struct list_head list;
2007     +
2008     + /* free block extent */
2009     + ext4_grpblk_t start_blk;
2010     + ext4_grpblk_t count;
2011     +
2012     + /* transaction which freed this extent */
2013     + tid_t t_tid;
2014     };
2015    
2016     struct ext4_group_info {
2017     unsigned long bb_state;
2018     - unsigned long bb_tid;
2019     - struct ext4_free_metadata *bb_md_cur;
2020     + struct rb_root bb_free_root;
2021     unsigned short bb_first_free;
2022     unsigned short bb_free;
2023     unsigned short bb_fragments;
2024     @@ -122,6 +125,7 @@ struct ext4_group_info {
2025     #ifdef DOUBLE_CHECK
2026     void *bb_bitmap;
2027     #endif
2028     + struct rw_semaphore alloc_sem;
2029     unsigned short bb_counters[];
2030     };
2031    
2032     @@ -209,6 +213,11 @@ struct ext4_allocation_context {
2033     __u8 ac_op; /* operation, for history only */
2034     struct page *ac_bitmap_page;
2035     struct page *ac_buddy_page;
2036     + /*
2037     + * pointer to the held semaphore upon successful
2038     + * block allocation
2039     + */
2040     + struct rw_semaphore *alloc_semp;
2041     struct ext4_prealloc_space *ac_pa;
2042     struct ext4_locality_group *ac_lg;
2043     };
2044     @@ -242,6 +251,7 @@ struct ext4_buddy {
2045     struct super_block *bd_sb;
2046     __u16 bd_blkbits;
2047     ext4_group_t bd_group;
2048     + struct rw_semaphore *alloc_semp;
2049     };
2050     #define EXT4_MB_BITMAP(e4b) ((e4b)->bd_bitmap)
2051     #define EXT4_MB_BUDDY(e4b) ((e4b)->bd_buddy)
2052     @@ -251,8 +261,6 @@ static inline void ext4_mb_store_history(struct ext4_allocation_context *ac)
2053     {
2054     return;
2055     }
2056     -#else
2057     -static void ext4_mb_store_history(struct ext4_allocation_context *ac);
2058     #endif
2059    
2060     #define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1)
2061     @@ -260,19 +268,6 @@ static void ext4_mb_store_history(struct ext4_allocation_context *ac);
2062     static struct proc_dir_entry *proc_root_ext4;
2063     struct buffer_head *read_block_bitmap(struct super_block *, ext4_group_t);
2064    
2065     -static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
2066     - ext4_group_t group);
2067     -static void ext4_mb_poll_new_transaction(struct super_block *, handle_t *);
2068     -static void ext4_mb_free_committed_blocks(struct super_block *);
2069     -static void ext4_mb_return_to_preallocation(struct inode *inode,
2070     - struct ext4_buddy *e4b, sector_t block,
2071     - int count);
2072     -static void ext4_mb_put_pa(struct ext4_allocation_context *,
2073     - struct super_block *, struct ext4_prealloc_space *pa);
2074     -static int ext4_mb_init_per_dev_proc(struct super_block *sb);
2075     -static int ext4_mb_destroy_per_dev_proc(struct super_block *sb);
2076     -
2077     -
2078     static inline void ext4_lock_group(struct super_block *sb, ext4_group_t group)
2079     {
2080     struct ext4_group_info *grinfo = ext4_get_group_info(sb, group);
2081     @@ -297,7 +292,7 @@ static inline int ext4_is_group_locked(struct super_block *sb,
2082     &(grinfo->bb_state));
2083     }
2084    
2085     -static ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb,
2086     +static inline ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb,
2087     struct ext4_free_extent *fex)
2088     {
2089     ext4_fsblk_t block;
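
The new alloc_sem field in ext4_group_info serializes online resize and group initialization (which take the semaphore exclusively) against block allocation (which holds it shared until ext4_mb_release_context drops it). A rough POSIX rwlock sketch of that pattern; group_alloc and group_resize are hypothetical stand-ins, not the kernel functions:

#include <pthread.h>
#include <stdio.h>

static pthread_rwlock_t alloc_sem = PTHREAD_RWLOCK_INITIALIZER;

static void group_alloc(void)
{
    pthread_rwlock_rdlock(&alloc_sem);   /* many allocators may run at once */
    /* ... pick blocks from the buddy cache ... */
    pthread_rwlock_unlock(&alloc_sem);
}

static void group_resize(void)
{
    pthread_rwlock_wrlock(&alloc_sem);   /* exclusive: buddy pages rebuilt */
    /* ... reinitialize bitmap and buddy pages ... */
    pthread_rwlock_unlock(&alloc_sem);
}

int main(void)
{
    group_alloc();
    group_resize();
    puts("lock pattern exercised");
    return 0;
}
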
2090     diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
2091     index d626533..4f3628f 100644
2092     --- a/fs/ext4/namei.c
2093     +++ b/fs/ext4/namei.c
2094     @@ -371,6 +371,8 @@ dx_probe(struct dentry *dentry, struct inode *dir,
2095     goto fail;
2096     }
2097     hinfo->hash_version = root->info.hash_version;
2098     + if (hinfo->hash_version <= DX_HASH_TEA)
2099     + hinfo->hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned;
2100     hinfo->seed = EXT4_SB(dir->i_sb)->s_hash_seed;
2101     if (dentry)
2102     ext4fs_dirhash(dentry->d_name.name, dentry->d_name.len, hinfo);
2103     @@ -640,6 +642,9 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
2104     dir = dir_file->f_path.dentry->d_inode;
2105     if (!(EXT4_I(dir)->i_flags & EXT4_INDEX_FL)) {
2106     hinfo.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version;
2107     + if (hinfo.hash_version <= DX_HASH_TEA)
2108     + hinfo.hash_version +=
2109     + EXT4_SB(dir->i_sb)->s_hash_unsigned;
2110     hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed;
2111     count = htree_dirblock_to_tree(dir_file, dir, 0, &hinfo,
2112     start_hash, start_minor_hash);
2113     @@ -1377,7 +1382,7 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
2114     struct fake_dirent *fde;
2115    
2116     blocksize = dir->i_sb->s_blocksize;
2117     - dxtrace(printk("Creating index\n"));
2118     + dxtrace(printk(KERN_DEBUG "Creating index: inode %lu\n", dir->i_ino));
2119     retval = ext4_journal_get_write_access(handle, bh);
2120     if (retval) {
2121     ext4_std_error(dir->i_sb, retval);
2122     @@ -1386,6 +1391,20 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
2123     }
2124     root = (struct dx_root *) bh->b_data;
2125    
2126     + /* The 0th block becomes the root, move the dirents out */
2127     + fde = &root->dotdot;
2128     + de = (struct ext4_dir_entry_2 *)((char *)fde +
2129     + ext4_rec_len_from_disk(fde->rec_len));
2130     + if ((char *) de >= (((char *) root) + blocksize)) {
2131     + ext4_error(dir->i_sb, __func__,
2132     + "invalid rec_len for '..' in inode %lu",
2133     + dir->i_ino);
2134     + brelse(bh);
2135     + return -EIO;
2136     + }
2137     + len = ((char *) root) + blocksize - (char *) de;
2138     +
2139     + /* Allocate new block for the 0th block's dirents */
2140     bh2 = ext4_append (handle, dir, &block, &retval);
2141     if (!(bh2)) {
2142     brelse(bh);
2143     @@ -1394,11 +1413,6 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
2144     EXT4_I(dir)->i_flags |= EXT4_INDEX_FL;
2145     data1 = bh2->b_data;
2146    
2147     - /* The 0th block becomes the root, move the dirents out */
2148     - fde = &root->dotdot;
2149     - de = (struct ext4_dir_entry_2 *)((char *)fde +
2150     - ext4_rec_len_from_disk(fde->rec_len));
2151     - len = ((char *) root) + blocksize - (char *) de;
2152     memcpy (data1, de, len);
2153     de = (struct ext4_dir_entry_2 *) data1;
2154     top = data1 + len;
2155     @@ -1418,6 +1432,8 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
2156    
2157     /* Initialize as for dx_probe */
2158     hinfo.hash_version = root->info.hash_version;
2159     + if (hinfo.hash_version <= DX_HASH_TEA)
2160     + hinfo.hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned;
2161     hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed;
2162     ext4fs_dirhash(name, namelen, &hinfo);
2163     frame = frames;
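
The namei.c hunks choose between the signed- and unsigned-char variants of the legacy dirhash functions: the unsigned twins sit three slots above their signed counterparts, so adding s_hash_unsigned (0 or 3) to any version <= DX_HASH_TEA selects the right one. A small sketch of that selection, with enum values mirroring ext4's:

#include <stdio.h>

enum { DX_HASH_LEGACY, DX_HASH_HALF_MD4, DX_HASH_TEA,
       DX_HASH_LEGACY_UNSIGNED, DX_HASH_HALF_MD4_UNSIGNED,
       DX_HASH_TEA_UNSIGNED };

static int effective_hash(int on_disk_version, int s_hash_unsigned)
{
    if (on_disk_version <= DX_HASH_TEA)
        on_disk_version += s_hash_unsigned;  /* 0, or 3 on unsigned-char platforms */
    return on_disk_version;
}

int main(void)
{
    /* with s_hash_unsigned == 3, TEA (2) becomes TEA_UNSIGNED (5) */
    printf("%d -> %d\n", DX_HASH_TEA, effective_hash(DX_HASH_TEA, 3));
    return 0;
}
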
2164     diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
2165     index 3922a8b..0070431 100644
2166     --- a/fs/ext4/resize.c
2167     +++ b/fs/ext4/resize.c
2168     @@ -284,11 +284,9 @@ static int setup_new_group_blocks(struct super_block *sb,
2169     if ((err = extend_or_restart_transaction(handle, 2, bh)))
2170     goto exit_bh;
2171    
2172     - mark_bitmap_end(input->blocks_count, EXT4_BLOCKS_PER_GROUP(sb),
2173     - bh->b_data);
2174     + mark_bitmap_end(input->blocks_count, sb->s_blocksize * 8, bh->b_data);
2175     ext4_journal_dirty_metadata(handle, bh);
2176     brelse(bh);
2177     -
2178     /* Mark unused entries in inode bitmap used */
2179     ext4_debug("clear inode bitmap %#04llx (+%llu)\n",
2180     input->inode_bitmap, input->inode_bitmap - start);
2181     @@ -297,7 +295,7 @@ static int setup_new_group_blocks(struct super_block *sb,
2182     goto exit_journal;
2183     }
2184    
2185     - mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), EXT4_BLOCKS_PER_GROUP(sb),
2186     + mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), sb->s_blocksize * 8,
2187     bh->b_data);
2188     ext4_journal_dirty_metadata(handle, bh);
2189     exit_bh:
2190     @@ -747,6 +745,7 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
2191     struct inode *inode = NULL;
2192     handle_t *handle;
2193     int gdb_off, gdb_num;
2194     + int num_grp_locked = 0;
2195     int err, err2;
2196    
2197     gdb_num = input->group / EXT4_DESC_PER_BLOCK(sb);
2198     @@ -787,6 +786,7 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
2199     }
2200     }
2201    
2202     +
2203     if ((err = verify_group_input(sb, input)))
2204     goto exit_put;
2205    
2206     @@ -855,15 +855,18 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
2207     * using the new disk blocks.
2208     */
2209    
2210     + num_grp_locked = ext4_mb_get_buddy_cache_lock(sb, input->group);
2211     /* Update group descriptor block for new group */
2212     gdp = (struct ext4_group_desc *)((char *)primary->b_data +
2213     gdb_off * EXT4_DESC_SIZE(sb));
2214    
2215     + memset(gdp, 0, EXT4_DESC_SIZE(sb));
2216     ext4_block_bitmap_set(sb, gdp, input->block_bitmap); /* LV FIXME */
2217     ext4_inode_bitmap_set(sb, gdp, input->inode_bitmap); /* LV FIXME */
2218     ext4_inode_table_set(sb, gdp, input->inode_table); /* LV FIXME */
2219     gdp->bg_free_blocks_count = cpu_to_le16(input->free_blocks_count);
2220     gdp->bg_free_inodes_count = cpu_to_le16(EXT4_INODES_PER_GROUP(sb));
2221     + gdp->bg_flags = cpu_to_le16(EXT4_BG_INODE_ZEROED);
2222     gdp->bg_checksum = ext4_group_desc_csum(sbi, input->group, gdp);
2223    
2224     /*
2225     @@ -871,9 +874,11 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
2226     * descriptor
2227     */
2228     if (test_opt(sb, MBALLOC)) {
2229     - err = ext4_mb_add_more_groupinfo(sb, input->group, gdp);
2230     - if (err)
2231     + err = ext4_mb_add_groupinfo(sb, input->group, gdp);
2232     + if (err) {
2233     + ext4_mb_put_buddy_cache_lock(sb, input->group, num_grp_locked);
2234     goto exit_journal;
2235     + }
2236     }
2237     /*
2238     * Make the new blocks and inodes valid next. We do this before
2239     @@ -915,6 +920,7 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
2240    
2241     /* Update the global fs size fields */
2242     sbi->s_groups_count++;
2243     + ext4_mb_put_buddy_cache_lock(sb, input->group, num_grp_locked);
2244    
2245     ext4_journal_dirty_metadata(handle, primary);
2246    
2247     @@ -976,9 +982,7 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
2248     struct buffer_head * bh;
2249     handle_t *handle;
2250     int err;
2251     - unsigned long freed_blocks;
2252     ext4_group_t group;
2253     - struct ext4_group_info *grp;
2254    
2255     /* We don't need to worry about locking wrt other resizers just
2256     * yet: we're going to revalidate es->s_blocks_count after
2257     @@ -1077,50 +1081,13 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
2258     unlock_super(sb);
2259     ext4_debug("freeing blocks %llu through %llu\n", o_blocks_count,
2260     o_blocks_count + add);
2261     - ext4_free_blocks_sb(handle, sb, o_blocks_count, add, &freed_blocks);
2262     + /* We add the blocks to the bitmap and set the group need init bit */
2263     + ext4_add_groupblocks(handle, sb, o_blocks_count, add);
2264     ext4_debug("freed blocks %llu through %llu\n", o_blocks_count,
2265     o_blocks_count + add);
2266     if ((err = ext4_journal_stop(handle)))
2267     goto exit_put;
2268    
2269     - /*
2270     - * Mark mballoc pages as not up to date so that they will be updated
2271     - * next time they are loaded by ext4_mb_load_buddy.
2272     - */
2273     - if (test_opt(sb, MBALLOC)) {
2274     - struct ext4_sb_info *sbi = EXT4_SB(sb);
2275     - struct inode *inode = sbi->s_buddy_cache;
2276     - int blocks_per_page;
2277     - int block;
2278     - int pnum;
2279     - struct page *page;
2280     -
2281     - /* Set buddy page as not up to date */
2282     - blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
2283     - block = group * 2;
2284     - pnum = block / blocks_per_page;
2285     - page = find_get_page(inode->i_mapping, pnum);
2286     - if (page != NULL) {
2287     - ClearPageUptodate(page);
2288     - page_cache_release(page);
2289     - }
2290     -
2291     - /* Set bitmap page as not up to date */
2292     - block++;
2293     - pnum = block / blocks_per_page;
2294     - page = find_get_page(inode->i_mapping, pnum);
2295     - if (page != NULL) {
2296     - ClearPageUptodate(page);
2297     - page_cache_release(page);
2298     - }
2299     -
2300     - /* Get the info on the last group */
2301     - grp = ext4_get_group_info(sb, group);
2302     -
2303     - /* Update free blocks in group info */
2304     - ext4_mb_update_group_info(grp, add);
2305     - }
2306     -
2307     if (test_opt(sb, DEBUG))
2308     printk(KERN_DEBUG "EXT4-fs: extended group to %llu blocks\n",
2309     ext4_blocks_count(es));
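
setup_new_group_blocks() now pads the new group's bitmaps out to the full bitmap block (sb->s_blocksize * 8 bits) rather than only to EXT4_BLOCKS_PER_GROUP(sb), so trailing bits can never be handed out as allocatable. A toy sketch of what mark_bitmap_end() does, assuming a byte-addressed bitmap rather than the kernel's word-based one:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

static void mark_bitmap_end(int used_bits, int total_bits, uint8_t *map)
{
    int i;
    for (i = used_bits; i < total_bits; i++)
        map[i / 8] |= 1u << (i % 8);        /* mark trailing bits as in use */
}

int main(void)
{
    uint8_t map[8];                         /* one 64-bit toy bitmap "block" */
    memset(map, 0, sizeof(map));
    mark_bitmap_end(50, 64, map);           /* only 50 entries are valid */
    printf("last byte: 0x%02x\n", map[7]);  /* -> 0xff: padding is set */
    return 0;
}
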
2310     diff --git a/fs/ext4/super.c b/fs/ext4/super.c
2311     index 7726e8e..5e4491d 100644
2312     --- a/fs/ext4/super.c
2313     +++ b/fs/ext4/super.c
2314     @@ -1493,7 +1493,6 @@ static int ext4_fill_flex_info(struct super_block *sb)
2315     ext4_group_t flex_group_count;
2316     ext4_group_t flex_group;
2317     int groups_per_flex = 0;
2318     - __u64 block_bitmap = 0;
2319     int i;
2320    
2321     if (!sbi->s_es->s_log_groups_per_flex) {
2322     @@ -1516,9 +1515,6 @@ static int ext4_fill_flex_info(struct super_block *sb)
2323     goto failed;
2324     }
2325    
2326     - gdp = ext4_get_group_desc(sb, 1, &bh);
2327     - block_bitmap = ext4_block_bitmap(sb, gdp) - 1;
2328     -
2329     for (i = 0; i < sbi->s_groups_count; i++) {
2330     gdp = ext4_get_group_desc(sb, i, &bh);
2331    
2332     @@ -1920,8 +1916,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2333     struct inode *root;
2334     int ret = -EINVAL;
2335     int blocksize;
2336     - int db_count;
2337     - int i;
2338     + unsigned int db_count;
2339     + unsigned int i;
2340     int needs_recovery;
2341     __le32 features;
2342     __u64 blocks_count;
2343     @@ -2172,6 +2168,18 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2344     for (i = 0; i < 4; i++)
2345     sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]);
2346     sbi->s_def_hash_version = es->s_def_hash_version;
2347     + i = le32_to_cpu(es->s_flags);
2348     + if (i & EXT2_FLAGS_UNSIGNED_HASH)
2349     + sbi->s_hash_unsigned = 3;
2350     + else if ((i & EXT2_FLAGS_SIGNED_HASH) == 0) {
2351     +#ifdef __CHAR_UNSIGNED__
2352     + es->s_flags |= cpu_to_le32(EXT2_FLAGS_UNSIGNED_HASH);
2353     + sbi->s_hash_unsigned = 3;
2354     +#else
2355     + es->s_flags |= cpu_to_le32(EXT2_FLAGS_SIGNED_HASH);
2356     +#endif
2357     + sb->s_dirt = 1;
2358     + }
2359    
2360     if (sbi->s_blocks_per_group > blocksize * 8) {
2361     printk(KERN_ERR
2362     @@ -2199,20 +2207,30 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2363     if (EXT4_BLOCKS_PER_GROUP(sb) == 0)
2364     goto cantfind_ext4;
2365    
2366     - /* ensure blocks_count calculation below doesn't sign-extend */
2367     - if (ext4_blocks_count(es) + EXT4_BLOCKS_PER_GROUP(sb) <
2368     - le32_to_cpu(es->s_first_data_block) + 1) {
2369     - printk(KERN_WARNING "EXT4-fs: bad geometry: block count %llu, "
2370     - "first data block %u, blocks per group %lu\n",
2371     - ext4_blocks_count(es),
2372     - le32_to_cpu(es->s_first_data_block),
2373     - EXT4_BLOCKS_PER_GROUP(sb));
2374     + /*
2375     + * It makes no sense for the first data block to be beyond the end
2376     + * of the filesystem.
2377     + */
2378     + if (le32_to_cpu(es->s_first_data_block) >= ext4_blocks_count(es)) {
2379     + printk(KERN_WARNING "EXT4-fs: bad geometry: first data "
2380     + "block %u is beyond end of filesystem (%llu)\n",
2381     + le32_to_cpu(es->s_first_data_block),
2382     + ext4_blocks_count(es));
2383     goto failed_mount;
2384     }
2385     blocks_count = (ext4_blocks_count(es) -
2386     le32_to_cpu(es->s_first_data_block) +
2387     EXT4_BLOCKS_PER_GROUP(sb) - 1);
2388     do_div(blocks_count, EXT4_BLOCKS_PER_GROUP(sb));
2389     + if (blocks_count > ((uint64_t)1<<32) - EXT4_DESC_PER_BLOCK(sb)) {
2390     + printk(KERN_WARNING "EXT4-fs: groups count too large: %u "
2391     + "(block count %llu, first data block %u, "
2392     + "blocks per group %lu)\n", sbi->s_groups_count,
2393     + ext4_blocks_count(es),
2394     + le32_to_cpu(es->s_first_data_block),
2395     + EXT4_BLOCKS_PER_GROUP(sb));
2396     + goto failed_mount;
2397     + }
2398     sbi->s_groups_count = blocks_count;
2399     db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) /
2400     EXT4_DESC_PER_BLOCK(sb);
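
The super.c hunks replace the old sign-extension guard with two explicit geometry checks: the first data block must lie inside the filesystem, and the ceil-divided group count must fit a 32-bit ext4_group_t. A hedged userspace sketch of the same arithmetic; check_geometry and its constants are illustrative:

#include <inttypes.h>
#include <stdio.h>

static int check_geometry(uint64_t blocks_count, uint32_t first_data_block,
                          uint32_t blocks_per_group, uint32_t desc_per_block)
{
    uint64_t groups;

    if (first_data_block >= blocks_count)
        return -1;                           /* first data block past the end */

    groups = (blocks_count - first_data_block +
              blocks_per_group - 1) / blocks_per_group;
    if (groups > ((uint64_t)1 << 32) - desc_per_block)
        return -1;                           /* group count would overflow */
    return 0;
}

int main(void)
{
    /* 2^28 blocks of 4K, 32768 blocks/group, 128 descriptors/block */
    printf("%d\n", check_geometry(1ULL << 28, 1, 32768, 128)); /* 0: sane */
    printf("%d\n", check_geometry(100, 200, 32768, 128));      /* -1: bogus */
    return 0;
}
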
2401     diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
2402     index 6caf22d..b1f0756 100644
2403     --- a/fs/jbd2/commit.c
2404     +++ b/fs/jbd2/commit.c
2405     @@ -24,6 +24,7 @@
2406     #include <linux/crc32.h>
2407     #include <linux/writeback.h>
2408     #include <linux/backing-dev.h>
2409     +#include <linux/bio.h>
2410    
2411     /*
2412     * Default IO end handler for temporary BJ_IO buffer_heads.
2413     @@ -170,12 +171,34 @@ static int journal_submit_commit_record(journal_t *journal,
2414     * This function along with journal_submit_commit_record
2415     * allows to write the commit record asynchronously.
2416     */
2417     -static int journal_wait_on_commit_record(struct buffer_head *bh)
2418     +static int journal_wait_on_commit_record(journal_t *journal,
2419     + struct buffer_head *bh)
2420     {
2421     int ret = 0;
2422    
2423     +retry:
2424     clear_buffer_dirty(bh);
2425     wait_on_buffer(bh);
2426     + if (buffer_eopnotsupp(bh) && (journal->j_flags & JBD2_BARRIER)) {
2427     + printk(KERN_WARNING
2428     + "JBD2: wait_on_commit_record: sync failed on %s - "
2429     + "disabling barriers\n", journal->j_devname);
2430     + spin_lock(&journal->j_state_lock);
2431     + journal->j_flags &= ~JBD2_BARRIER;
2432     + spin_unlock(&journal->j_state_lock);
2433     +
2434     + lock_buffer(bh);
2435     + clear_buffer_dirty(bh);
2436     + set_buffer_uptodate(bh);
2437     + bh->b_end_io = journal_end_buffer_io_sync;
2438     +
2439     + ret = submit_bh(WRITE_SYNC, bh);
2440     + if (ret) {
2441     + unlock_buffer(bh);
2442     + return ret;
2443     + }
2444     + goto retry;
2445     + }
2446    
2447     if (unlikely(!buffer_uptodate(bh)))
2448     ret = -EIO;
2449     @@ -795,7 +818,7 @@ wait_for_iobuf:
2450     __jbd2_journal_abort_hard(journal);
2451     }
2452     if (!err && !is_journal_aborted(journal))
2453     - err = journal_wait_on_commit_record(cbh);
2454     + err = journal_wait_on_commit_record(journal, cbh);
2455    
2456     if (err)
2457     jbd2_journal_abort(journal, err);
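
The jbd2 change retries a failed barrier write: on EOPNOTSUPP it clears JBD2_BARRIER once and reissues the commit record as a plain synchronous write. A userspace sketch of that fallback loop; submit_record() is a hypothetical stand-in for submit_bh():

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

static int submit_record(bool barrier)
{
    return barrier ? -EOPNOTSUPP : 0;   /* device rejects barrier writes */
}

static int write_commit_record(bool *use_barrier)
{
    int ret = submit_record(*use_barrier);

    if (ret == -EOPNOTSUPP && *use_barrier) {
        fprintf(stderr, "sync failed - disabling barriers\n");
        *use_barrier = false;           /* sticky, like clearing JBD2_BARRIER */
        ret = submit_record(false);     /* retry without the barrier */
    }
    return ret;
}

int main(void)
{
    bool barrier = true;
    printf("ret=%d barrier=%d\n", write_commit_record(&barrier), barrier);
    return 0;
}
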
2458     diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
2459     index 66c3499..0e1bd70 100644
2460     --- a/include/linux/jbd2.h
2461     +++ b/include/linux/jbd2.h
2462     @@ -308,7 +308,8 @@ void buffer_assertion_failure(struct buffer_head *bh);
2463     int val = (expr); \
2464     if (!val) { \
2465     printk(KERN_ERR \
2466     - "EXT3-fs unexpected failure: %s;\n",# expr); \
2467     + "JBD2 unexpected failure: %s: %s;\n", \
2468     + __func__, #expr); \
2469     printk(KERN_ERR why "\n"); \
2470     } \
2471     val; \
2472     @@ -329,6 +330,7 @@ enum jbd_state_bits {
2473     BH_State, /* Pins most journal_head state */
2474     BH_JournalHead, /* Pins bh->b_private and jh->b_bh */
2475     BH_Unshadow, /* Dummy bit, for BJ_Shadow wakeup filtering */
2476     + BH_JBDPrivateStart, /* First bit available for private use by FS */
2477     };
2478    
2479     BUFFER_FNS(JBD, jbd)
2480     diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
2481     index 794e546..e7e7c7d 100644
2482     --- a/include/linux/pci_ids.h
2483     +++ b/include/linux/pci_ids.h
2484     @@ -1301,6 +1301,7 @@
2485     #define PCI_DEVICE_ID_VIA_VT3351 0x0351
2486     #define PCI_DEVICE_ID_VIA_VT3364 0x0364
2487     #define PCI_DEVICE_ID_VIA_8371_0 0x0391
2488     +#define PCI_DEVICE_ID_VIA_6415 0x0415
2489     #define PCI_DEVICE_ID_VIA_8501_0 0x0501
2490     #define PCI_DEVICE_ID_VIA_82C561 0x0561
2491     #define PCI_DEVICE_ID_VIA_82C586_1 0x0571
2492     diff --git a/include/linux/pid.h b/include/linux/pid.h
2493     index d7e98ff..93997c9 100644
2494     --- a/include/linux/pid.h
2495     +++ b/include/linux/pid.h
2496     @@ -123,6 +123,24 @@ extern struct pid *alloc_pid(struct pid_namespace *ns);
2497     extern void free_pid(struct pid *pid);
2498    
2499     /*
2500     + * ns_of_pid() returns the pid namespace in which the specified pid was
2501     + * allocated.
2502     + *
2503     + * NOTE:
2504     + * ns_of_pid() is expected to be called for a process (task) that has
2505     + * an attached 'struct pid' (see attach_pid(), detach_pid()), i.e. @pid
2506     + * is expected to be non-NULL. If @pid is NULL, caller should handle
2507     + * the resulting NULL pid-ns.
2508     + */
2509     +static inline struct pid_namespace *ns_of_pid(struct pid *pid)
2510     +{
2511     + struct pid_namespace *ns = NULL;
2512     + if (pid)
2513     + ns = pid->numbers[pid->level].ns;
2514     + return ns;
2515     +}
2516     +
2517     +/*
2518     * the helpers to get the pid's id seen from different namespaces
2519     *
2520     * pid_nr() : global id, i.e. the id seen from the init namespace;
2521     diff --git a/ipc/mqueue.c b/ipc/mqueue.c
2522     index a58bfad..ca502aa 100644
2523     --- a/ipc/mqueue.c
2524     +++ b/ipc/mqueue.c
2525     @@ -498,7 +498,8 @@ static void __do_notify(struct mqueue_inode_info *info)
2526     sig_i.si_errno = 0;
2527     sig_i.si_code = SI_MESGQ;
2528     sig_i.si_value = info->notify.sigev_value;
2529     - sig_i.si_pid = task_tgid_vnr(current);
2530     + sig_i.si_pid = task_tgid_nr_ns(current,
2531     + ns_of_pid(info->notify_owner));
2532     sig_i.si_uid = current->uid;
2533    
2534     kill_pid_info(info->notify.sigev_signo,