Magellan Linux

Contents of /trunk/kernel26-alx/patches-2.6.27-r3/0118-2.6.27.19-all-fixes.patch



Revision 1176
Thu Oct 14 15:11:06 2010 UTC by niro
File size: 80833 bytes
-2.6.27-alx-r3: new magellan 0.5.2 kernel
1 diff --git a/arch/powerpc/kernel/align.c b/arch/powerpc/kernel/align.c
2 index 5af4e9b..ada0692 100644
3 --- a/arch/powerpc/kernel/align.c
4 +++ b/arch/powerpc/kernel/align.c
5 @@ -646,11 +646,16 @@ static int emulate_vsx(unsigned char __user *addr, unsigned int reg,
6 unsigned int areg, struct pt_regs *regs,
7 unsigned int flags, unsigned int length)
8 {
9 - char *ptr = (char *) &current->thread.TS_FPR(reg);
10 + char *ptr;
11 int ret = 0;
12
13 flush_vsx_to_thread(current);
14
15 + if (reg < 32)
16 + ptr = (char *) &current->thread.TS_FPR(reg);
17 + else
18 + ptr = (char *) &current->thread.vr[reg - 32];
19 +
20 if (flags & ST)
21 ret = __copy_to_user(addr, ptr, length);
22 else {
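
For context on the hunk above: emulate_vsx() previously derived its pointer from the FPR array alone, so VSX registers 32-63 indexed far past the end of the thread's FP save area. A minimal userspace sketch of the split the patch introduces (fpr/vr here are illustrative stand-ins, not the kernel's thread state):

    #include <stdio.h>
    #include <string.h>

    /* Illustrative register files: VSX regs 0-31 overlay the FPRs,
     * regs 32-63 overlay the vector registers. */
    static double fpr[32];
    static unsigned char vr[32][16];

    static void *vsx_reg_ptr(unsigned int reg)
    {
        if (reg < 32)
            return &fpr[reg];        /* low half lives in the FPRs */
        return &vr[reg - 32];        /* high half lives in the VRs */
    }

    int main(void)
    {
        /* without the split, reg 40 would have indexed fpr[40],
         * eight doubles past the end of the array */
        memset(vsx_reg_ptr(40), 0xab, 16);
        printf("vr[8][0] = %#x\n", vr[8][0]);    /* prints 0xab */
        return 0;
    }
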
23 diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
24 index 5b719a0..7c3b8dc 100644
25 --- a/arch/x86/mm/pageattr.c
26 +++ b/arch/x86/mm/pageattr.c
27 @@ -619,6 +619,13 @@ static int __change_page_attr(struct cpa_data *cpa, int primary)
28 unsigned int level;
29 pte_t *kpte, old_pte;
30
31 + /*
32 + * If we're called with lazy mmu updates enabled, the
33 + * in-memory pte state may be stale. Flush pending updates to
34 + * bring them up to date.
35 + */
36 + arch_flush_lazy_mmu_mode();
37 +
38 repeat:
39 kpte = lookup_address(address, &level);
40 if (!kpte)
41 @@ -836,6 +843,13 @@ static int change_page_attr_set_clr(unsigned long addr, int numpages,
42 else
43 cpa_flush_all(cache);
44
45 + /*
46 + * If we've been called with lazy mmu updates enabled, then
47 + * make sure that everything gets flushed out before we
48 + * return.
49 + */
50 + arch_flush_lazy_mmu_mode();
51 +
52 out:
53 cpa_fill_pool(NULL);
54
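
The two arch_flush_lazy_mmu_mode() calls bracket the attribute change: pending batched PTE updates (e.g. under a paravirt hypervisor) are pushed out before the in-memory PTEs are read, and again before returning. A toy userspace model of that flush-before-read discipline, with invented names standing in for the lazy-MMU machinery:

    #include <stdio.h>

    /* Toy deferred-update store: writes queue up and only land on
     * flush, like PTE writes batched in lazy MMU mode. */
    static int committed;        /* state that readers observe */
    static int pending;          /* queued update, not yet visible */
    static int have_pending;

    static void lazy_write(int v)
    {
        pending = v;
        have_pending = 1;
    }

    static void flush_lazy_mode(void)    /* arch_flush_lazy_mmu_mode() role */
    {
        if (have_pending) {
            committed = pending;
            have_pending = 0;
        }
    }

    static int read_state(void)
    {
        flush_lazy_mode();    /* flush first, or we read stale state */
        return committed;
    }

    int main(void)
    {
        lazy_write(42);
        printf("state = %d\n", read_state());    /* 42, not a stale 0 */
        return 0;
    }
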
55 diff --git a/drivers/ata/pata_via.c b/drivers/ata/pata_via.c
56 index c5be6a1..b6f55e8 100644
57 --- a/drivers/ata/pata_via.c
58 +++ b/drivers/ata/pata_via.c
59 @@ -111,7 +111,8 @@ static const struct via_isa_bridge {
60 { "vt8237s", PCI_DEVICE_ID_VIA_8237S, 0x00, 0x2f, VIA_UDMA_133 | VIA_BAD_AST },
61 { "vt8251", PCI_DEVICE_ID_VIA_8251, 0x00, 0x2f, VIA_UDMA_133 | VIA_BAD_AST },
62 { "cx700", PCI_DEVICE_ID_VIA_CX700, 0x00, 0x2f, VIA_UDMA_133 | VIA_BAD_AST | VIA_SATA_PATA },
63 - { "vt6410", PCI_DEVICE_ID_VIA_6410, 0x00, 0x2f, VIA_UDMA_133 | VIA_BAD_AST | VIA_NO_ENABLES},
64 + { "vt6410", PCI_DEVICE_ID_VIA_6410, 0x00, 0x2f, VIA_UDMA_133 | VIA_BAD_AST | VIA_NO_ENABLES },
65 + { "vt6415", PCI_DEVICE_ID_VIA_6415, 0x00, 0x2f, VIA_UDMA_133 | VIA_BAD_AST | VIA_NO_ENABLES },
66 { "vt8237a", PCI_DEVICE_ID_VIA_8237A, 0x00, 0x2f, VIA_UDMA_133 | VIA_BAD_AST },
67 { "vt8237", PCI_DEVICE_ID_VIA_8237, 0x00, 0x2f, VIA_UDMA_133 | VIA_BAD_AST },
68 { "vt8235", PCI_DEVICE_ID_VIA_8235, 0x00, 0x2f, VIA_UDMA_133 | VIA_BAD_AST },
69 @@ -594,6 +595,7 @@ static int via_reinit_one(struct pci_dev *pdev)
70 #endif
71
72 static const struct pci_device_id via[] = {
73 + { PCI_VDEVICE(VIA, 0x0415), },
74 { PCI_VDEVICE(VIA, 0x0571), },
75 { PCI_VDEVICE(VIA, 0x0581), },
76 { PCI_VDEVICE(VIA, 0x1571), },
77 diff --git a/drivers/ata/sata_nv.c b/drivers/ata/sata_nv.c
78 index 89e3b7f..8b6f9c0 100644
79 --- a/drivers/ata/sata_nv.c
80 +++ b/drivers/ata/sata_nv.c
81 @@ -421,19 +421,21 @@ static struct ata_port_operations nv_generic_ops = {
82 .hardreset = ATA_OP_NULL,
83 };
84
85 -/* OSDL bz3352 reports that nf2/3 controllers can't determine device
86 - * signature reliably. Also, the following thread reports detection
87 - * failure on cold boot with the standard debouncing timing.
88 +/* nf2 is rife with hardreset related problems.
89 + *
90 + * kernel bz#3352 reports nf2/3 controllers can't determine device
91 + * signature reliably. The following thread reports detection failure
92 + * on cold boot with the standard debouncing timing.
93 *
94 * http://thread.gmane.org/gmane.linux.ide/34098
95 *
96 - * Debounce with hotplug timing and request follow-up SRST.
97 + * And bz#12176 reports that hardreset simply doesn't work on nf2.
98 + * Give up on it and just don't do hardreset.
99 */
100 static struct ata_port_operations nv_nf2_ops = {
101 - .inherits = &nv_common_ops,
102 + .inherits = &nv_generic_ops,
103 .freeze = nv_nf2_freeze,
104 .thaw = nv_nf2_thaw,
105 - .hardreset = nv_noclassify_hardreset,
106 };
107
108 /* For initial probing after boot and hot plugging, hardreset mostly
109 diff --git a/drivers/bluetooth/btsdio.c b/drivers/bluetooth/btsdio.c
110 index 58630cc..f2ada0c 100644
111 --- a/drivers/bluetooth/btsdio.c
112 +++ b/drivers/bluetooth/btsdio.c
113 @@ -91,6 +91,7 @@ static int btsdio_tx_packet(struct btsdio_data *data, struct sk_buff *skb)
114
115 err = sdio_writesb(data->func, REG_TDAT, skb->data, skb->len);
116 if (err < 0) {
117 + skb_pull(skb, 4);
118 sdio_writeb(data->func, 0x01, REG_PC_WRT, NULL);
119 return err;
120 }
121 @@ -152,7 +153,7 @@ static int btsdio_rx_packet(struct btsdio_data *data)
122
123 err = sdio_readsb(data->func, skb->data, REG_RDAT, len - 4);
124 if (err < 0) {
125 - kfree(skb);
126 + kfree_skb(skb);
127 return err;
128 }
129
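
Two distinct fixes in this file: on a failed sdio_writesb() the skb, which had a 4-byte header pushed onto it earlier in btsdio_tx_packet(), is now skb_pull()ed back so a requeued packet does not grow a second header; and the receive error path frees with kfree_skb() instead of raw kfree(), since an sk_buff is a refcounted object that must die through its own destructor. The push/undo pattern in a self-contained sketch (struct buf is a stand-in, not the kernel's sk_buff API):

    #include <stdio.h>

    /* Buffer whose data pointer can move over reserved headroom,
     * the way sk_buffs allow headers to be pushed and pulled. */
    struct buf {
        unsigned char room[64];
        unsigned char *data;
        unsigned int len;
    };

    static void buf_push(struct buf *b, unsigned int n) { b->data -= n; b->len += n; }
    static void buf_pull(struct buf *b, unsigned int n) { b->data += n; b->len -= n; }

    /* fake transmit that always fails, like sdio_writesb() erroring */
    static int xmit(struct buf *b) { (void)b; return -1; }

    static int tx_packet(struct buf *b)
    {
        buf_push(b, 4);          /* prepend a 4-byte transport header */
        if (xmit(b) < 0) {
            buf_pull(b, 4);      /* undo the push before a retry,
                                  * as the added skb_pull() does */
            return -1;
        }
        return 0;
    }

    int main(void)
    {
        struct buf b;
        unsigned char *orig;

        b.data = b.room + 8;
        b.len = 10;
        orig = b.data;
        tx_packet(&b);
        printf("data pointer restored: %s\n", b.data == orig ? "yes" : "no");
        return 0;
    }
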
130 diff --git a/drivers/net/3c505.c b/drivers/net/3c505.c
131 index fdfb2b2..ae8e36c 100644
132 --- a/drivers/net/3c505.c
133 +++ b/drivers/net/3c505.c
134 @@ -493,21 +493,27 @@ static bool receive_pcb(struct net_device *dev, pcb_struct * pcb)
135 }
136 /* read the data */
137 spin_lock_irqsave(&adapter->lock, flags);
138 - i = 0;
139 - do {
140 - j = 0;
141 - while (((stat = get_status(dev->base_addr)) & ACRF) == 0 && j++ < 20000);
142 - pcb->data.raw[i++] = inb_command(dev->base_addr);
143 - if (i > MAX_PCB_DATA)
144 - INVALID_PCB_MSG(i);
145 - } while ((stat & ASF_PCB_MASK) != ASF_PCB_END && j < 20000);
146 + for (i = 0; i < MAX_PCB_DATA; i++) {
147 + for (j = 0; j < 20000; j++) {
148 + stat = get_status(dev->base_addr);
149 + if (stat & ACRF)
150 + break;
151 + }
152 + pcb->data.raw[i] = inb_command(dev->base_addr);
153 + if ((stat & ASF_PCB_MASK) == ASF_PCB_END || j >= 20000)
154 + break;
155 + }
156 spin_unlock_irqrestore(&adapter->lock, flags);
157 + if (i >= MAX_PCB_DATA) {
158 + INVALID_PCB_MSG(i);
159 + return false;
160 + }
161 if (j >= 20000) {
162 TIMEOUT_MSG(__LINE__);
163 return false;
164 }
165 - /* woops, the last "data" byte was really the length! */
166 - total_length = pcb->data.raw[--i];
167 + /* the last "data" byte was really the length! */
168 + total_length = pcb->data.raw[i];
169
170 /* safety check total length vs data length */
171 if (total_length != (pcb->length + 2)) {
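
The old do/while wrote a byte before checking bounds, so a misbehaving board could overrun pcb->data.raw, and INVALID_PCB_MSG() only fired after the damage was done. The rewrite bounds both loops and fails cleanly. The same shape as a runnable sketch against a simulated device (MAX_DATA, the status bits, and the fake device are all illustrative):

    #include <stdio.h>

    #define MAX_DATA 8
    #define MAX_SPIN 100

    /* Simulated device: a short message whose last byte is the length,
     * with an "end" status bit raised while that byte is pending. */
    static const unsigned char msg[] = { 'h', 'i', 2 };
    static unsigned int pos;

    static int get_status(void) { return 1 | (pos == 2 ? 2 : 0); }
    static unsigned char read_byte(void) { return msg[pos++]; }

    static int read_pcb(unsigned char *out, int *outlen)
    {
        int i, j = 0, stat = 0;

        for (i = 0; i < MAX_DATA; i++) {
            for (j = 0; j < MAX_SPIN; j++) {    /* bounded busy-wait */
                stat = get_status();
                if (stat & 1)                   /* data ready */
                    break;
            }
            out[i] = read_byte();
            if ((stat & 2) || j >= MAX_SPIN)    /* end-of-PCB or timeout */
                break;
        }
        if (i >= MAX_DATA)     /* buffer exhausted: fail, never overrun */
            return -1;
        if (j >= MAX_SPIN)     /* device never became ready */
            return -1;
        *outlen = out[i];      /* the last "data" byte is really the length */
        return 0;
    }

    int main(void)
    {
        unsigned char buf[MAX_DATA];
        int len;

        if (read_pcb(buf, &len) == 0)
            printf("payload length %d, first byte '%c'\n", len, buf[0]);
        return 0;
    }
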
172 diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c
173 index c3edcdc..2d90a3c 100644
174 --- a/drivers/pci/intel-iommu.c
175 +++ b/drivers/pci/intel-iommu.c
176 @@ -72,6 +72,8 @@ static struct deferred_flush_tables *deferred_flush;
177 /* bitmap for indexing intel_iommus */
178 static int g_num_of_iommus;
179
180 +static int rwbf_quirk = 0;
181 +
182 static DEFINE_SPINLOCK(async_umap_flush_lock);
183 static LIST_HEAD(unmaps_to_do);
184
185 @@ -527,7 +529,7 @@ static void iommu_flush_write_buffer(struct intel_iommu *iommu)
186 u32 val;
187 unsigned long flag;
188
189 - if (!cap_rwbf(iommu->cap))
190 + if (!rwbf_quirk && !cap_rwbf(iommu->cap))
191 return;
192 val = iommu->gcmd | DMA_GCMD_WBF;
193
194 @@ -2453,3 +2455,12 @@ int __init intel_iommu_init(void)
195 return 0;
196 }
197
198 +static void __devinit quirk_iommu_rwbf(struct pci_dev *dev)
199 +{
200 + /* Mobile 4 Series Chipset neglects to set RWBF capability,
201 + but needs it */
202 + printk(KERN_INFO "DMAR: Forcing write-buffer flush capability\n");
203 + rwbf_quirk = 1;
204 +}
205 +
206 +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
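
This is the usual PCI quirk shape: a fixup keyed on vendor/device IDs flips a module-level flag, and the capability test consults the flag alongside the hardware bit, because the Mobile 4 Series chipset under-reports its RWBF requirement. A compressed userspace rendition of the dispatch (in the kernel, DECLARE_PCI_FIXUP_HEADER does the registration; the table here is the sketch's own):

    #include <stdio.h>

    static int rwbf_quirk;    /* set by a quirk, read by the driver */

    struct quirk {
        unsigned short vendor, device;
        void (*fixup)(void);
    };

    static void force_wbf(void) { rwbf_quirk = 1; }

    /* 0x8086/0x2a40 is the Mobile 4 Series host bridge from the patch */
    static const struct quirk quirks[] = {
        { 0x8086, 0x2a40, force_wbf },
    };

    static void apply_quirks(unsigned short vendor, unsigned short device)
    {
        unsigned int i;

        for (i = 0; i < sizeof(quirks) / sizeof(quirks[0]); i++)
            if (quirks[i].vendor == vendor && quirks[i].device == device)
                quirks[i].fixup();
    }

    static int need_write_buffer_flush(int cap_rwbf)
    {
        return rwbf_quirk || cap_rwbf;    /* quirk overrides the cap bit */
    }

    int main(void)
    {
        apply_quirks(0x8086, 0x2a40);
        printf("flush needed: %d\n", need_write_buffer_flush(0));    /* 1 */
        return 0;
    }
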
207 diff --git a/drivers/scsi/libiscsi.c b/drivers/scsi/libiscsi.c
208 index 299e075..55ac5c3 100644
209 --- a/drivers/scsi/libiscsi.c
210 +++ b/drivers/scsi/libiscsi.c
211 @@ -1844,6 +1844,7 @@ void iscsi_pool_free(struct iscsi_pool *q)
212 kfree(q->pool[i]);
213 if (q->pool)
214 kfree(q->pool);
215 + kfree(q->queue);
216 }
217 EXPORT_SYMBOL_GPL(iscsi_pool_free);
218
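
The one-liner plugs a leak: iscsi_pool_init() sets up the item allocations, the pool pointer array, and the fifo object stored in q->queue, but the free path only released the first two. (The `if (q->pool)` guard above is also redundant, since kfree(NULL) is a no-op.) The restored alloc/free symmetry, sketched in userspace with plain malloc/free and field names borrowed from the patch:

    #include <stdlib.h>

    struct pool {
        void **pool;     /* array of item pointers */
        void *queue;     /* fifo object built over the array */
        int max;
    };

    static int pool_init(struct pool *q, int max, size_t item_size)
    {
        int i;

        q->max = max;
        q->pool = calloc(max, sizeof(*q->pool));
        q->queue = malloc(max * sizeof(void *));
        if (!q->pool || !q->queue)
            return -1;   /* partial-failure cleanup elided for brevity */
        for (i = 0; i < max; i++)
            q->pool[i] = malloc(item_size);
        return 0;
    }

    static void pool_free(struct pool *q)
    {
        int i;

        for (i = 0; i < q->max; i++)
            free(q->pool[i]);
        free(q->pool);     /* free(NULL) is a no-op, no guard needed */
        free(q->queue);    /* the counterpart of the added kfree() */
    }

    int main(void)
    {
        struct pool q;

        if (pool_init(&q, 4, 32) == 0)
            pool_free(&q);
        return 0;
    }
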
219 diff --git a/fs/ext2/super.c b/fs/ext2/super.c
220 index fd88c7b..2ebc0c4 100644
221 --- a/fs/ext2/super.c
222 +++ b/fs/ext2/super.c
223 @@ -1177,9 +1177,12 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data)
224 es = sbi->s_es;
225 if (((sbi->s_mount_opt & EXT2_MOUNT_XIP) !=
226 (old_mount_opt & EXT2_MOUNT_XIP)) &&
227 - invalidate_inodes(sb))
228 - ext2_warning(sb, __func__, "busy inodes while remounting "\
229 - "xip remain in cache (no functional problem)");
230 + invalidate_inodes(sb)) {
231 + ext2_warning(sb, __func__, "refusing change of xip flag "
232 + "with busy inodes while remounting");
233 + sbi->s_mount_opt &= ~EXT2_MOUNT_XIP;
234 + sbi->s_mount_opt |= old_mount_opt & EXT2_MOUNT_XIP;
235 + }
236 if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY))
237 return 0;
238 if (*flags & MS_RDONLY) {
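
Instead of merely warning, the remount path now refuses the XIP flip while inodes are busy and restores the old bit, so the recorded mount options never claim a state the filesystem is not actually in. The revert step in miniature (OPT_XIP and remount() are invented for this sketch):

    #include <stdio.h>

    #define OPT_XIP 0x1

    static unsigned int remount(unsigned int opts, unsigned int old_opts,
                                int busy_inodes)
    {
        if (((opts ^ old_opts) & OPT_XIP) && busy_inodes) {
            fprintf(stderr, "refusing xip change with busy inodes\n");
            opts &= ~OPT_XIP;
            opts |= old_opts & OPT_XIP;    /* put the old setting back */
        }
        return opts;
    }

    int main(void)
    {
        printf("opts = %#x\n", remount(OPT_XIP, 0, 1));    /* 0: refused */
        return 0;
    }
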
239 diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
240 index e9fa960..8b7c776 100644
241 --- a/fs/ext4/balloc.c
242 +++ b/fs/ext4/balloc.c
243 @@ -20,6 +20,7 @@
244 #include "ext4.h"
245 #include "ext4_jbd2.h"
246 #include "group.h"
247 +#include "mballoc.h"
248
249 /*
250 * balloc.c contains the blocks allocation and deallocation routines
251 @@ -318,18 +319,41 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
252 block_group, bitmap_blk);
253 return NULL;
254 }
255 - if (bh_uptodate_or_lock(bh))
256 +
257 + if (bitmap_uptodate(bh))
258 return bh;
259
260 + lock_buffer(bh);
261 + if (bitmap_uptodate(bh)) {
262 + unlock_buffer(bh);
263 + return bh;
264 + }
265 spin_lock(sb_bgl_lock(EXT4_SB(sb), block_group));
266 if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
267 ext4_init_block_bitmap(sb, bh, block_group, desc);
268 + set_bitmap_uptodate(bh);
269 set_buffer_uptodate(bh);
270 unlock_buffer(bh);
271 spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group));
272 return bh;
273 }
274 spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group));
275 + if (buffer_uptodate(bh)) {
276 + /*
277 + * if not uninit, and bh is uptodate,
278 + * the bitmap is also uptodate
279 + */
280 + set_bitmap_uptodate(bh);
281 + unlock_buffer(bh);
282 + return bh;
283 + }
284 + /*
285 + * submit the buffer_head for read. We can
286 + * safely mark the bitmap as uptodate now.
287 + * We do it here so the bitmap uptodate bit
288 + * gets set with the buffer lock held.
289 + */
290 + set_bitmap_uptodate(bh);
291 if (bh_submit_read(bh) < 0) {
292 put_bh(bh);
293 ext4_error(sb, __func__,
294 @@ -837,6 +861,136 @@ error_return:
295 }
296
297 /**
298 + * ext4_add_groupblocks() -- Add given blocks to an existing group
299 + * @handle: handle to this transaction
300 + * @sb: super block
301 + * @block: start physical block to add to the block group
302 + * @count: number of blocks to free
303 + *
304 + * This marks the blocks as free in the bitmap. We ask the
305 + * mballoc to reload the buddy after this by setting group
306 + * EXT4_GROUP_INFO_NEED_INIT_BIT flag
307 + */
308 +void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
309 + ext4_fsblk_t block, unsigned long count)
310 +{
311 + struct buffer_head *bitmap_bh = NULL;
312 + struct buffer_head *gd_bh;
313 + ext4_group_t block_group;
314 + ext4_grpblk_t bit;
315 + unsigned long i;
316 + struct ext4_group_desc *desc;
317 + struct ext4_super_block *es;
318 + struct ext4_sb_info *sbi;
319 + int err = 0, ret;
320 + ext4_grpblk_t blocks_freed;
321 + struct ext4_group_info *grp;
322 +
323 + sbi = EXT4_SB(sb);
324 + es = sbi->s_es;
325 + ext4_debug("Adding block(s) %llu-%llu\n", block, block + count - 1);
326 +
327 + ext4_get_group_no_and_offset(sb, block, &block_group, &bit);
328 + grp = ext4_get_group_info(sb, block_group);
329 + /*
330 + * Check to see if we are freeing blocks across a group
331 + * boundary.
332 + */
333 + if (bit + count > EXT4_BLOCKS_PER_GROUP(sb))
334 + goto error_return;
335 +
336 + bitmap_bh = ext4_read_block_bitmap(sb, block_group);
337 + if (!bitmap_bh)
338 + goto error_return;
339 + desc = ext4_get_group_desc(sb, block_group, &gd_bh);
340 + if (!desc)
341 + goto error_return;
342 +
343 + if (in_range(ext4_block_bitmap(sb, desc), block, count) ||
344 + in_range(ext4_inode_bitmap(sb, desc), block, count) ||
345 + in_range(block, ext4_inode_table(sb, desc), sbi->s_itb_per_group) ||
346 + in_range(block + count - 1, ext4_inode_table(sb, desc),
347 + sbi->s_itb_per_group)) {
348 + ext4_error(sb, __func__,
349 + "Adding blocks in system zones - "
350 + "Block = %llu, count = %lu",
351 + block, count);
352 + goto error_return;
353 + }
354 +
355 + /*
356 + * We are about to add blocks to the bitmap,
357 + * so we need undo access.
358 + */
359 + BUFFER_TRACE(bitmap_bh, "getting undo access");
360 + err = ext4_journal_get_undo_access(handle, bitmap_bh);
361 + if (err)
362 + goto error_return;
363 +
364 + /*
365 + * We are about to modify some metadata. Call the journal APIs
366 + * to unshare ->b_data if a currently-committing transaction is
367 + * using it
368 + */
369 + BUFFER_TRACE(gd_bh, "get_write_access");
370 + err = ext4_journal_get_write_access(handle, gd_bh);
371 + if (err)
372 + goto error_return;
373 + /*
374 + * make sure we don't allow a parallel init on other groups in the
375 + * same buddy cache
376 + */
377 + down_write(&grp->alloc_sem);
378 + for (i = 0, blocks_freed = 0; i < count; i++) {
379 + BUFFER_TRACE(bitmap_bh, "clear bit");
380 + if (!ext4_clear_bit_atomic(sb_bgl_lock(sbi, block_group),
381 + bit + i, bitmap_bh->b_data)) {
382 + ext4_error(sb, __func__,
383 + "bit already cleared for block %llu",
384 + (ext4_fsblk_t)(block + i));
385 + BUFFER_TRACE(bitmap_bh, "bit already cleared");
386 + } else {
387 + blocks_freed++;
388 + }
389 + }
390 + spin_lock(sb_bgl_lock(sbi, block_group));
391 + le16_add_cpu(&desc->bg_free_blocks_count, blocks_freed);
392 + desc->bg_checksum = ext4_group_desc_csum(sbi, block_group, desc);
393 + spin_unlock(sb_bgl_lock(sbi, block_group));
394 + percpu_counter_add(&sbi->s_freeblocks_counter, blocks_freed);
395 +
396 + if (sbi->s_log_groups_per_flex) {
397 + ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
398 + spin_lock(sb_bgl_lock(sbi, flex_group));
399 + sbi->s_flex_groups[flex_group].free_blocks += blocks_freed;
400 + spin_unlock(sb_bgl_lock(sbi, flex_group));
401 + }
402 + /*
403 + * request to reload the buddy with the
404 + * new bitmap information
405 + */
406 + set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state));
407 + ext4_mb_update_group_info(grp, blocks_freed);
408 + up_write(&grp->alloc_sem);
409 +
410 + /* We dirtied the bitmap block */
411 + BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
412 + err = ext4_journal_dirty_metadata(handle, bitmap_bh);
413 +
414 + /* And the group descriptor block */
415 + BUFFER_TRACE(gd_bh, "dirtied group descriptor block");
416 + ret = ext4_journal_dirty_metadata(handle, gd_bh);
417 + if (!err)
418 + err = ret;
419 + sb->s_dirt = 1;
420 +
421 +error_return:
422 + brelse(bitmap_bh);
423 + ext4_std_error(sb, err);
424 + return;
425 +}
426 +
427 +/**
428 * ext4_free_blocks() -- Free given blocks and update quota
429 * @handle: handle for this transaction
430 * @inode: inode
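
The recurring shape in this hunk, and again below in ialloc.c and mballoc.c, is check, lock, re-check on a dedicated BH_BITMAP_UPTODATE bit: with uninit_bg a buffer can be uptodate as far as the block layer is concerned while the bitmap content still has to be synthesized, so buffer_uptodate() alone stops being a safe test. An approximation that compiles and runs, with a mutex standing in for the buffer lock and a C11 atomic for the kernel's buffer-state bitops (build with -pthread):

    #include <pthread.h>
    #include <stdatomic.h>
    #include <stdio.h>

    static pthread_mutex_t buffer_lock = PTHREAD_MUTEX_INITIALIZER;
    static atomic_int bitmap_uptodate;    /* the BH_BITMAP_UPTODATE role */
    static int bitmap[4];
    static int init_count;

    static void read_bitmap(void)
    {
        if (atomic_load(&bitmap_uptodate))      /* fast path, lock-free */
            return;
        pthread_mutex_lock(&buffer_lock);
        if (atomic_load(&bitmap_uptodate)) {    /* re-check under the lock */
            pthread_mutex_unlock(&buffer_lock);
            return;
        }
        for (int i = 0; i < 4; i++)             /* "initialize" the bitmap */
            bitmap[i] = ~0;
        init_count++;
        atomic_store(&bitmap_uptodate, 1);      /* publish while locked */
        pthread_mutex_unlock(&buffer_lock);
    }

    static void *worker(void *arg)
    {
        (void)arg;
        read_bitmap();
        return NULL;
    }

    int main(void)
    {
        pthread_t t[4];

        for (int i = 0; i < 4; i++)
            pthread_create(&t[i], NULL, worker, NULL);
        for (int i = 0; i < 4; i++)
            pthread_join(t[i], NULL);
        printf("initialized %d time(s)\n", init_count);    /* always 1 */
        return 0;
    }
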
431 diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
432 index 4829dac..85f58af 100644
433 --- a/fs/ext4/ext4.h
434 +++ b/fs/ext4/ext4.h
435 @@ -19,6 +19,7 @@
436 #include <linux/types.h>
437 #include <linux/blkdev.h>
438 #include <linux/magic.h>
439 +#include <linux/jbd2.h>
440 #include "ext4_i.h"
441
442 /*
443 @@ -889,6 +890,9 @@ static inline __le16 ext4_rec_len_to_disk(unsigned len)
444 #define DX_HASH_LEGACY 0
445 #define DX_HASH_HALF_MD4 1
446 #define DX_HASH_TEA 2
447 +#define DX_HASH_LEGACY_UNSIGNED 3
448 +#define DX_HASH_HALF_MD4_UNSIGNED 4
449 +#define DX_HASH_TEA_UNSIGNED 5
450
451 #ifdef __KERNEL__
452
453 @@ -988,9 +992,11 @@ extern ext4_fsblk_t ext4_has_free_blocks(struct ext4_sb_info *sbi,
454 ext4_fsblk_t nblocks);
455 extern void ext4_free_blocks (handle_t *handle, struct inode *inode,
456 ext4_fsblk_t block, unsigned long count, int metadata);
457 -extern void ext4_free_blocks_sb (handle_t *handle, struct super_block *sb,
458 - ext4_fsblk_t block, unsigned long count,
459 +extern void ext4_free_blocks_sb(handle_t *handle, struct super_block *sb,
460 + ext4_fsblk_t block, unsigned long count,
461 unsigned long *pdquot_freed_blocks);
462 +extern void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
463 + ext4_fsblk_t block, unsigned long count);
464 extern ext4_fsblk_t ext4_count_free_blocks (struct super_block *);
465 extern void ext4_check_blocks_bitmap (struct super_block *);
466 extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb,
467 @@ -1038,12 +1044,13 @@ extern int __init init_ext4_mballoc(void);
468 extern void exit_ext4_mballoc(void);
469 extern void ext4_mb_free_blocks(handle_t *, struct inode *,
470 unsigned long, unsigned long, int, unsigned long *);
471 -extern int ext4_mb_add_more_groupinfo(struct super_block *sb,
472 +extern int ext4_mb_add_groupinfo(struct super_block *sb,
473 ext4_group_t i, struct ext4_group_desc *desc);
474 extern void ext4_mb_update_group_info(struct ext4_group_info *grp,
475 ext4_grpblk_t add);
476 -
477 -
478 +extern int ext4_mb_get_buddy_cache_lock(struct super_block *, ext4_group_t);
479 +extern void ext4_mb_put_buddy_cache_lock(struct super_block *,
480 + ext4_group_t, int);
481 /* inode.c */
482 int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode,
483 struct buffer_head *bh, ext4_fsblk_t blocknr);
484 @@ -1167,8 +1174,11 @@ static inline void ext4_r_blocks_count_set(struct ext4_super_block *es,
485
486 static inline loff_t ext4_isize(struct ext4_inode *raw_inode)
487 {
488 - return ((loff_t)le32_to_cpu(raw_inode->i_size_high) << 32) |
489 - le32_to_cpu(raw_inode->i_size_lo);
490 + if (S_ISREG(le16_to_cpu(raw_inode->i_mode)))
491 + return ((loff_t)le32_to_cpu(raw_inode->i_size_high) << 32) |
492 + le32_to_cpu(raw_inode->i_size_lo);
493 + else
494 + return (loff_t) le32_to_cpu(raw_inode->i_size_lo);
495 }
496
497 static inline void ext4_isize_set(struct ext4_inode *raw_inode, loff_t i_size)
498 @@ -1244,6 +1254,23 @@ extern int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode,
499 sector_t block, unsigned long max_blocks,
500 struct buffer_head *bh, int create,
501 int extend_disksize, int flag);
502 +/*
503 + * Add new method to test whether block and inode bitmaps are properly
504 + * initialized. With uninit_bg, reading the block from disk is not enough
505 + * to mark the bitmap uptodate. We need to also zero-out the bitmap
506 + */
507 +#define BH_BITMAP_UPTODATE BH_JBDPrivateStart
508 +
509 +static inline int bitmap_uptodate(struct buffer_head *bh)
510 +{
511 + return (buffer_uptodate(bh) &&
512 + test_bit(BH_BITMAP_UPTODATE, &(bh)->b_state));
513 +}
514 +static inline void set_bitmap_uptodate(struct buffer_head *bh)
515 +{
516 + set_bit(BH_BITMAP_UPTODATE, &(bh)->b_state);
517 +}
518 +
519 #endif /* __KERNEL__ */
520
521 #endif /* _EXT4_H */
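
Easy to miss among the hash constants: the ext4_isize() change matters because i_size_high shares its on-disk slot with i_dir_acl, so it only carries size bits for regular files; reading it for directories or device nodes manufactures absurd multi-gigabyte sizes. The logic with plain integers (S_ISREG_MODE is a hand-rolled stand-in for the real S_ISREG):

    #include <stdint.h>
    #include <stdio.h>

    #define S_ISREG_MODE(m) (((m) & 0170000) == 0100000)

    static int64_t inode_size(uint16_t mode, uint32_t size_lo,
                              uint32_t size_high)
    {
        if (S_ISREG_MODE(mode))    /* only regular files use the high word */
            return ((int64_t)size_high << 32) | size_lo;
        return (int64_t)size_lo;
    }

    int main(void)
    {
        /* a directory whose "high size" slot actually holds an ACL block */
        printf("%lld\n", (long long)inode_size(0040755, 4096, 1234)); /* 4096 */
        printf("%lld\n", (long long)inode_size(0100644, 4096, 1));    /* 2^32+4096 */
        return 0;
    }
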
522 diff --git a/fs/ext4/ext4_sb.h b/fs/ext4/ext4_sb.h
523 index 6300226..f20df8a 100644
524 --- a/fs/ext4/ext4_sb.h
525 +++ b/fs/ext4/ext4_sb.h
526 @@ -56,6 +56,7 @@ struct ext4_sb_info {
527 u32 s_next_generation;
528 u32 s_hash_seed[4];
529 int s_def_hash_version;
530 + int s_hash_unsigned; /* 3 if hash should be unsigned, 0 if not */
531 struct percpu_counter s_freeblocks_counter;
532 struct percpu_counter s_freeinodes_counter;
533 struct percpu_counter s_dirs_counter;
534 @@ -102,7 +103,8 @@ struct ext4_sb_info {
535 struct list_head s_committed_transaction;
536 spinlock_t s_md_lock;
537 tid_t s_last_transaction;
538 - unsigned short *s_mb_offsets, *s_mb_maxs;
539 + unsigned short *s_mb_offsets;
540 + unsigned int *s_mb_maxs;
541
542 /* tunables */
543 unsigned long s_stripe;
544 diff --git a/fs/ext4/hash.c b/fs/ext4/hash.c
545 index 1d6329d..bd7d14d 100644
546 --- a/fs/ext4/hash.c
547 +++ b/fs/ext4/hash.c
548 @@ -35,23 +35,71 @@ static void TEA_transform(__u32 buf[4], __u32 const in[])
549
550
551 /* The old legacy hash */
552 -static __u32 dx_hack_hash (const char *name, int len)
553 +static __u32 dx_hack_hash_unsigned(const char *name, int len)
554 {
555 - __u32 hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9;
556 + __u32 hash, hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9;
557 + const unsigned char *ucp = (const unsigned char *) name;
558 +
559 + while (len--) {
560 + hash = hash1 + (hash0 ^ (((int) *ucp++) * 7152373));
561 +
562 + if (hash & 0x80000000)
563 + hash -= 0x7fffffff;
564 + hash1 = hash0;
565 + hash0 = hash;
566 + }
567 + return hash0 << 1;
568 +}
569 +
570 +static __u32 dx_hack_hash_signed(const char *name, int len)
571 +{
572 + __u32 hash, hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9;
573 + const signed char *scp = (const signed char *) name;
574 +
575 while (len--) {
576 - __u32 hash = hash1 + (hash0 ^ (*name++ * 7152373));
577 + hash = hash1 + (hash0 ^ (((int) *scp++) * 7152373));
578
579 - if (hash & 0x80000000) hash -= 0x7fffffff;
580 + if (hash & 0x80000000)
581 + hash -= 0x7fffffff;
582 hash1 = hash0;
583 hash0 = hash;
584 }
585 - return (hash0 << 1);
586 + return hash0 << 1;
587 }
588
589 -static void str2hashbuf(const char *msg, int len, __u32 *buf, int num)
590 +static void str2hashbuf_signed(const char *msg, int len, __u32 *buf, int num)
591 {
592 __u32 pad, val;
593 int i;
594 + const signed char *scp = (const signed char *) msg;
595 +
596 + pad = (__u32)len | ((__u32)len << 8);
597 + pad |= pad << 16;
598 +
599 + val = pad;
600 + if (len > num*4)
601 + len = num * 4;
602 + for (i = 0; i < len; i++) {
603 + if ((i % 4) == 0)
604 + val = pad;
605 + val = ((int) scp[i]) + (val << 8);
606 + if ((i % 4) == 3) {
607 + *buf++ = val;
608 + val = pad;
609 + num--;
610 + }
611 + }
612 + if (--num >= 0)
613 + *buf++ = val;
614 + while (--num >= 0)
615 + *buf++ = pad;
616 +}
617 +
618 +static void str2hashbuf_unsigned(const char *msg, int len, __u32 *buf, int num)
619 +{
620 + __u32 pad, val;
621 + int i;
622 + const unsigned char *ucp = (const unsigned char *) msg;
623
624 pad = (__u32)len | ((__u32)len << 8);
625 pad |= pad << 16;
626 @@ -62,7 +110,7 @@ static void str2hashbuf(const char *msg, int len, __u32 *buf, int num)
627 for (i=0; i < len; i++) {
628 if ((i % 4) == 0)
629 val = pad;
630 - val = msg[i] + (val << 8);
631 + val = ((int) ucp[i]) + (val << 8);
632 if ((i % 4) == 3) {
633 *buf++ = val;
634 val = pad;
635 @@ -95,6 +143,8 @@ int ext4fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo)
636 const char *p;
637 int i;
638 __u32 in[8], buf[4];
639 + void (*str2hashbuf)(const char *, int, __u32 *, int) =
640 + str2hashbuf_signed;
641
642 /* Initialize the default seed for the hash checksum functions */
643 buf[0] = 0x67452301;
644 @@ -113,13 +163,18 @@ int ext4fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo)
645 }
646
647 switch (hinfo->hash_version) {
648 + case DX_HASH_LEGACY_UNSIGNED:
649 + hash = dx_hack_hash_unsigned(name, len);
650 + break;
651 case DX_HASH_LEGACY:
652 - hash = dx_hack_hash(name, len);
653 + hash = dx_hack_hash_signed(name, len);
654 break;
655 + case DX_HASH_HALF_MD4_UNSIGNED:
656 + str2hashbuf = str2hashbuf_unsigned;
657 case DX_HASH_HALF_MD4:
658 p = name;
659 while (len > 0) {
660 - str2hashbuf(p, len, in, 8);
661 + (*str2hashbuf)(p, len, in, 8);
662 half_md4_transform(buf, in);
663 len -= 32;
664 p += 32;
665 @@ -127,10 +182,12 @@ int ext4fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo)
666 minor_hash = buf[2];
667 hash = buf[1];
668 break;
669 + case DX_HASH_TEA_UNSIGNED:
670 + str2hashbuf = str2hashbuf_unsigned;
671 case DX_HASH_TEA:
672 p = name;
673 while (len > 0) {
674 - str2hashbuf(p, len, in, 4);
675 + (*str2hashbuf)(p, len, in, 4);
676 TEA_transform(buf, in);
677 len -= 16;
678 p += 16;
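
The reason for the whole signed/unsigned split: the legacy hashes fed `char` directly into arithmetic, so any filename byte >= 0x80 hashed differently depending on whether the platform's char is signed, breaking htree directory lookups when a disk moved between, say, x86 and ARM. Both variants extracted into a standalone program so the divergence is visible (the two functions mirror the patched hash.c byte for byte):

    #include <stdio.h>

    typedef unsigned int u32;

    static u32 dx_hack_hash_signed(const char *name, int len)
    {
        u32 hash, hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9;
        const signed char *scp = (const signed char *)name;

        while (len--) {
            hash = hash1 + (hash0 ^ (((int)*scp++) * 7152373));
            if (hash & 0x80000000)
                hash -= 0x7fffffff;
            hash1 = hash0;
            hash0 = hash;
        }
        return hash0 << 1;
    }

    static u32 dx_hack_hash_unsigned(const char *name, int len)
    {
        u32 hash, hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9;
        const unsigned char *ucp = (const unsigned char *)name;

        while (len--) {
            hash = hash1 + (hash0 ^ (((int)*ucp++) * 7152373));
            if (hash & 0x80000000)
                hash -= 0x7fffffff;
            hash1 = hash0;
            hash0 = hash;
        }
        return hash0 << 1;
    }

    int main(void)
    {
        const char ascii[] = "hello";
        const char high[] = "caf\xc3\xa9";    /* UTF-8 bytes >= 0x80 */

        /* 7-bit names agree; high-bit names diverge */
        printf("ascii: %08x %08x\n",
               dx_hack_hash_signed(ascii, 5), dx_hack_hash_unsigned(ascii, 5));
        printf("high:  %08x %08x\n",
               dx_hack_hash_signed(high, 5), dx_hack_hash_unsigned(high, 5));
        return 0;
    }
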
679 diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
680 index 9805924..b994854 100644
681 --- a/fs/ext4/ialloc.c
682 +++ b/fs/ext4/ialloc.c
683 @@ -84,7 +84,7 @@ unsigned ext4_init_inode_bitmap(struct super_block *sb, struct buffer_head *bh,
684 }
685
686 memset(bh->b_data, 0, (EXT4_INODES_PER_GROUP(sb) + 7) / 8);
687 - mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), EXT4_BLOCKS_PER_GROUP(sb),
688 + mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), sb->s_blocksize * 8,
689 bh->b_data);
690
691 return EXT4_INODES_PER_GROUP(sb);
692 @@ -115,18 +115,40 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
693 block_group, bitmap_blk);
694 return NULL;
695 }
696 - if (bh_uptodate_or_lock(bh))
697 + if (bitmap_uptodate(bh))
698 return bh;
699
700 + lock_buffer(bh);
701 + if (bitmap_uptodate(bh)) {
702 + unlock_buffer(bh);
703 + return bh;
704 + }
705 spin_lock(sb_bgl_lock(EXT4_SB(sb), block_group));
706 if (desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) {
707 ext4_init_inode_bitmap(sb, bh, block_group, desc);
708 + set_bitmap_uptodate(bh);
709 set_buffer_uptodate(bh);
710 unlock_buffer(bh);
711 spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group));
712 return bh;
713 }
714 spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group));
715 + if (buffer_uptodate(bh)) {
716 + /*
717 + * if not uninit, and bh is uptodate,
718 + * the bitmap is also uptodate
719 + */
720 + set_bitmap_uptodate(bh);
721 + unlock_buffer(bh);
722 + return bh;
723 + }
724 + /*
725 + * submit the buffer_head for read. We can
726 + * safely mark the bitmap as uptodate now.
727 + * We do it here so the bitmap uptodate bit
728 + * gets set with the buffer lock held.
729 + */
730 + set_bitmap_uptodate(bh);
731 if (bh_submit_read(bh) < 0) {
732 put_bh(bh);
733 ext4_error(sb, __func__,
734 @@ -567,6 +589,77 @@ static int find_group_other(struct super_block *sb, struct inode *parent,
735 }
736
737 /*
738 + * claim the inode from the inode bitmap. If the group
739 + * is uninit we need to take the group's sb_bgl_lock
740 + * and clear the uninit flag. The inode bitmap update
741 + * and group desc uninit flag clear should be done
742 + * after holding sb_bgl_lock so that ext4_read_inode_bitmap
743 + * doesn't race with the ext4_claim_inode
744 + */
745 +static int ext4_claim_inode(struct super_block *sb,
746 + struct buffer_head *inode_bitmap_bh,
747 + unsigned long ino, ext4_group_t group, int mode)
748 +{
749 + int free = 0, retval = 0;
750 + struct ext4_sb_info *sbi = EXT4_SB(sb);
751 + struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group, NULL);
752 +
753 + spin_lock(sb_bgl_lock(sbi, group));
754 + if (ext4_set_bit(ino, inode_bitmap_bh->b_data)) {
755 + /* not a free inode */
756 + retval = 1;
757 + goto err_ret;
758 + }
759 + ino++;
760 + if ((group == 0 && ino < EXT4_FIRST_INO(sb)) ||
761 + ino > EXT4_INODES_PER_GROUP(sb)) {
762 + spin_unlock(sb_bgl_lock(sbi, group));
763 + ext4_error(sb, __func__,
764 + "reserved inode or inode > inodes count - "
765 + "block_group = %lu, inode=%lu", group,
766 + ino + group * EXT4_INODES_PER_GROUP(sb));
767 + return 1;
768 + }
769 + /* If we didn't allocate from within the initialized part of the inode
770 + * table then we need to initialize up to this inode. */
771 + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) {
772 +
773 + if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) {
774 + gdp->bg_flags &= cpu_to_le16(~EXT4_BG_INODE_UNINIT);
775 + /* When marking the block group with
776 + * ~EXT4_BG_INODE_UNINIT we don't want to depend
777 + * on the value of bg_itable_unused even though
778 + * mke2fs could have initialized the same for us.
779 + * Instead we calculated the value below
780 + */
781 +
782 + free = 0;
783 + } else {
784 + free = EXT4_INODES_PER_GROUP(sb) -
785 + le16_to_cpu(gdp->bg_itable_unused);
786 + }
787 +
788 + /*
789 + * Check the relative inode number against the last used
790 + * relative inode number in this group. if it is greater
791 + * we need to update the bg_itable_unused count
792 + *
793 + */
794 + if (ino > free)
795 + gdp->bg_itable_unused =
796 + cpu_to_le16(EXT4_INODES_PER_GROUP(sb) - ino);
797 + }
798 + le16_add_cpu(&gdp->bg_free_inodes_count, -1);
799 + if (S_ISDIR(mode)) {
800 + le16_add_cpu(&gdp->bg_used_dirs_count, 1);
801 + }
802 + gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp);
803 +err_ret:
804 + spin_unlock(sb_bgl_lock(sbi, group));
805 + return retval;
806 +}
807 +
808 +/*
809 * There are two policies for allocating an inode. If the new inode is
810 * a directory, then a forward search is made for a block group with both
811 * free space and a low directory-to-inode ratio; if that fails, then of
812 @@ -649,8 +742,12 @@ repeat_in_this_group:
813 if (err)
814 goto fail;
815
816 - if (!ext4_set_bit_atomic(sb_bgl_lock(sbi, group),
817 - ino, bitmap_bh->b_data)) {
818 + BUFFER_TRACE(bh2, "get_write_access");
819 + err = ext4_journal_get_write_access(handle, bh2);
820 + if (err)
821 + goto fail;
822 + if (!ext4_claim_inode(sb, bitmap_bh,
823 + ino, group, mode)) {
824 /* we won it */
825 BUFFER_TRACE(bitmap_bh,
826 "call ext4_journal_dirty_metadata");
827 @@ -658,10 +755,13 @@ repeat_in_this_group:
828 bitmap_bh);
829 if (err)
830 goto fail;
831 + /* zero bit is inode number 1 */
832 + ino++;
833 goto got;
834 }
835 /* we lost it */
836 jbd2_journal_release_buffer(handle, bitmap_bh);
837 + jbd2_journal_release_buffer(handle, bh2);
838
839 if (++ino < EXT4_INODES_PER_GROUP(sb))
840 goto repeat_in_this_group;
841 @@ -681,21 +781,6 @@ repeat_in_this_group:
842 goto out;
843
844 got:
845 - ino++;
846 - if ((group == 0 && ino < EXT4_FIRST_INO(sb)) ||
847 - ino > EXT4_INODES_PER_GROUP(sb)) {
848 - ext4_error(sb, __func__,
849 - "reserved inode or inode > inodes count - "
850 - "block_group = %lu, inode=%lu", group,
851 - ino + group * EXT4_INODES_PER_GROUP(sb));
852 - err = -EIO;
853 - goto fail;
854 - }
855 -
856 - BUFFER_TRACE(bh2, "get_write_access");
857 - err = ext4_journal_get_write_access(handle, bh2);
858 - if (err) goto fail;
859 -
860 /* We may have to initialize the block bitmap if it isn't already */
861 if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM) &&
862 gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
863 @@ -730,47 +815,10 @@ got:
864 if (err)
865 goto fail;
866 }
867 -
868 - spin_lock(sb_bgl_lock(sbi, group));
869 - /* If we didn't allocate from within the initialized part of the inode
870 - * table then we need to initialize up to this inode. */
871 - if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) {
872 - if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) {
873 - gdp->bg_flags &= cpu_to_le16(~EXT4_BG_INODE_UNINIT);
874 -
875 - /* When marking the block group with
876 - * ~EXT4_BG_INODE_UNINIT we don't want to depend
877 - * on the value of bg_itable_unused even though
878 - * mke2fs could have initialized the same for us.
879 - * Instead we calculated the value below
880 - */
881 -
882 - free = 0;
883 - } else {
884 - free = EXT4_INODES_PER_GROUP(sb) -
885 - le16_to_cpu(gdp->bg_itable_unused);
886 - }
887 -
888 - /*
889 - * Check the relative inode number against the last used
890 - * relative inode number in this group. if it is greater
891 - * we need to update the bg_itable_unused count
892 - *
893 - */
894 - if (ino > free)
895 - gdp->bg_itable_unused =
896 - cpu_to_le16(EXT4_INODES_PER_GROUP(sb) - ino);
897 - }
898 -
899 - le16_add_cpu(&gdp->bg_free_inodes_count, -1);
900 - if (S_ISDIR(mode)) {
901 - le16_add_cpu(&gdp->bg_used_dirs_count, 1);
902 - }
903 - gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp);
904 - spin_unlock(sb_bgl_lock(sbi, group));
905 - BUFFER_TRACE(bh2, "call ext4_journal_dirty_metadata");
906 + BUFFER_TRACE(bh2, "call ext4_handle_dirty_metadata");
907 err = ext4_journal_dirty_metadata(handle, bh2);
908 - if (err) goto fail;
909 + if (err)
910 + goto fail;
911
912 percpu_counter_dec(&sbi->s_freeinodes_counter);
913 if (S_ISDIR(mode))
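
ext4_claim_inode() exists so the bitmap bit, bg_free_inodes_count, the INODE_UNINIT flag, and the group checksum all change under a single sb_bgl_lock hold; before, the bit was set atomically but the bookkeeping trailed behind, and a concurrent ext4_read_inode_bitmap() could see the flag and the bitmap disagree. The invariant boiled down to a few lines (claim_inode() and its globals are illustrative only):

    #include <pthread.h>
    #include <stdio.h>

    static pthread_mutex_t group_lock = PTHREAD_MUTEX_INITIALIZER;
    static unsigned long bitmap;
    static int free_inodes = 64;
    static int group_uninit = 1;

    /* returns 0 if we won the inode, 1 if it was already taken */
    static int claim_inode(int ino)
    {
        int taken;

        pthread_mutex_lock(&group_lock);
        taken = (bitmap >> ino) & 1;
        if (!taken) {
            bitmap |= 1UL << ino;
            group_uninit = 0;    /* flag and bitmap change together */
            free_inodes--;
        }
        pthread_mutex_unlock(&group_lock);
        return taken;
    }

    int main(void)
    {
        printf("first claim: %d\n", claim_inode(5));     /* 0: won it */
        printf("second claim: %d\n", claim_inode(5));    /* 1: lost it */
        printf("free: %d uninit: %d\n", free_inodes, group_uninit);
        return 0;
    }
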
914 diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
915 index d77f674..6e7f085 100644
916 --- a/fs/ext4/inode.c
917 +++ b/fs/ext4/inode.c
918 @@ -351,9 +351,9 @@ static int ext4_block_to_path(struct inode *inode,
919 final = ptrs;
920 } else {
921 ext4_warning(inode->i_sb, "ext4_block_to_path",
922 - "block %lu > max",
923 + "block %lu > max in inode %lu",
924 i_block + direct_blocks +
925 - indirect_blocks + double_blocks);
926 + indirect_blocks + double_blocks, inode->i_ino);
927 }
928 if (boundary)
929 *boundary = final - 1 - (i_block & (ptrs - 1));
930 @@ -1648,18 +1648,25 @@ struct mpage_da_data {
931 */
932 static int mpage_da_submit_io(struct mpage_da_data *mpd)
933 {
934 - struct address_space *mapping = mpd->inode->i_mapping;
935 - int ret = 0, err, nr_pages, i;
936 - unsigned long index, end;
937 + long pages_skipped;
938 struct pagevec pvec;
939 + unsigned long index, end;
940 + int ret = 0, err, nr_pages, i;
941 + struct inode *inode = mpd->inode;
942 + struct address_space *mapping = inode->i_mapping;
943
944 BUG_ON(mpd->next_page <= mpd->first_page);
945 - pagevec_init(&pvec, 0);
946 + /*
947 + * We need to start from the first_page to the next_page - 1
948 + * to make sure we also write the mapped dirty buffer_heads.
949 + * If we look at mpd->lbh.b_blocknr we would only be looking
950 + * at the currently mapped buffer_heads.
951 + */
952 index = mpd->first_page;
953 end = mpd->next_page - 1;
954
955 + pagevec_init(&pvec, 0);
956 while (index <= end) {
957 - /* XXX: optimize tail */
958 nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
959 if (nr_pages == 0)
960 break;
961 @@ -1671,6 +1678,10 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd)
962 break;
963 index++;
964
965 + BUG_ON(!PageLocked(page));
966 + BUG_ON(PageWriteback(page));
967 +
968 + pages_skipped = mpd->wbc->pages_skipped;
969 err = mapping->a_ops->writepage(page, mpd->wbc);
970 if (!err)
971 mpd->pages_written++;
972 @@ -1991,11 +2002,29 @@ static int __mpage_da_writepage(struct page *page,
973 bh = head;
974 do {
975 BUG_ON(buffer_locked(bh));
976 + /*
977 + * We need to try to allocate
978 + * unmapped blocks in the same page.
979 + * Otherwise we won't make progress
980 + * with the page in ext4_da_writepage
981 + */
982 if (buffer_dirty(bh) &&
983 (!buffer_mapped(bh) || buffer_delay(bh))) {
984 mpage_add_bh_to_extent(mpd, logical, bh);
985 if (mpd->io_done)
986 return MPAGE_DA_EXTENT_TAIL;
987 + } else if (buffer_dirty(bh) && (buffer_mapped(bh))) {
988 + /*
989 + * mapped dirty buffer. We need to update
990 + * the b_state because we look at
991 + * b_state in mpage_da_map_blocks. We don't
992 + * update b_size because if we find an
993 + * unmapped buffer_head later we need to
994 + * use the b_state flag of that buffer_head.
995 + */
996 + if (mpd->lbh.b_size == 0)
997 + mpd->lbh.b_state =
998 + bh->b_state & BH_FLAGS;
999 }
1000 logical++;
1001 } while ((bh = bh->b_this_page) != head);
1002 @@ -2298,6 +2327,20 @@ static int ext4_da_writepages(struct address_space *mapping,
1003 */
1004 if (!mapping->nrpages || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
1005 return 0;
1006 +
1007 + /*
1008 + * If the filesystem has aborted, it is read-only, so return
1009 + * right away instead of dumping stack traces later on that
1010 + * will obscure the real source of the problem. We test
1011 + * EXT4_MOUNT_ABORT instead of sb->s_flag's MS_RDONLY because
1012 + * the latter could be true if the filesystem is mounted
1013 + * read-only, and in that case, ext4_da_writepages should
1014 + * *never* be called, so if that ever happens, we would want
1015 + * the stack trace.
1016 + */
1017 + if (unlikely(sbi->s_mount_opt & EXT4_MOUNT_ABORT))
1018 + return -EROFS;
1019 +
1020 /*
1021 * Make sure nr_to_write is >= sbi->s_mb_stream_request
1022 * This make sure small files blocks are allocated in
1023 @@ -2336,7 +2379,7 @@ restart_loop:
1024 handle = ext4_journal_start(inode, needed_blocks);
1025 if (IS_ERR(handle)) {
1026 ret = PTR_ERR(handle);
1027 - printk(KERN_EMERG "%s: jbd2_start: "
1028 + printk(KERN_CRIT "%s: jbd2_start: "
1029 "%ld pages, ino %lu; err %d\n", __func__,
1030 wbc->nr_to_write, inode->i_ino, ret);
1031 dump_stack();
1032 diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
1033 index ba86b56..dbf6c0e 100644
1034 --- a/fs/ext4/mballoc.c
1035 +++ b/fs/ext4/mballoc.c
1036 @@ -100,7 +100,7 @@
1037 * inode as:
1038 *
1039 * { page }
1040 - * [ group 0 buddy][ group 0 bitmap] [group 1][ group 1]...
1041 + * [ group 0 bitmap][ group 0 buddy] [group 1][ group 1]...
1042 *
1043 *
1044 * one block each for bitmap and buddy information. So for each group we
1045 @@ -330,6 +330,18 @@
1046 * object
1047 *
1048 */
1049 +static struct kmem_cache *ext4_pspace_cachep;
1050 +static struct kmem_cache *ext4_ac_cachep;
1051 +static struct kmem_cache *ext4_free_ext_cachep;
1052 +static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
1053 + ext4_group_t group);
1054 +static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
1055 + ext4_group_t group);
1056 +static int ext4_mb_init_per_dev_proc(struct super_block *sb);
1057 +static int ext4_mb_destroy_per_dev_proc(struct super_block *sb);
1058 +static void ext4_mb_free_committed_blocks(struct super_block *);
1059 +static void ext4_mb_poll_new_transaction(struct super_block *sb,
1060 + handle_t *handle);
1061
1062 static inline void *mb_correct_addr_and_bit(int *bit, void *addr)
1063 {
1064 @@ -718,7 +730,7 @@ static void ext4_mb_generate_buddy(struct super_block *sb,
1065 * stored in the inode as
1066 *
1067 * { page }
1068 - * [ group 0 buddy][ group 0 bitmap] [group 1][ group 1]...
1069 + * [ group 0 bitmap][ group 0 buddy] [group 1][ group 1]...
1070 *
1071 *
1072 * one block each for bitmap and buddy information.
1073 @@ -784,20 +796,42 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
1074 if (bh[i] == NULL)
1075 goto out;
1076
1077 - if (bh_uptodate_or_lock(bh[i]))
1078 + if (bitmap_uptodate(bh[i]))
1079 continue;
1080
1081 + lock_buffer(bh[i]);
1082 + if (bitmap_uptodate(bh[i])) {
1083 + unlock_buffer(bh[i]);
1084 + continue;
1085 + }
1086 spin_lock(sb_bgl_lock(EXT4_SB(sb), first_group + i));
1087 if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
1088 ext4_init_block_bitmap(sb, bh[i],
1089 first_group + i, desc);
1090 + set_bitmap_uptodate(bh[i]);
1091 set_buffer_uptodate(bh[i]);
1092 unlock_buffer(bh[i]);
1093 spin_unlock(sb_bgl_lock(EXT4_SB(sb), first_group + i));
1094 continue;
1095 }
1096 spin_unlock(sb_bgl_lock(EXT4_SB(sb), first_group + i));
1097 + if (buffer_uptodate(bh[i])) {
1098 + /*
1099 + * if not uninit, and bh is uptodate,
1100 + * the bitmap is also uptodate
1101 + */
1102 + set_bitmap_uptodate(bh[i]);
1103 + unlock_buffer(bh[i]);
1104 + continue;
1105 + }
1106 get_bh(bh[i]);
1107 + /*
1108 + * submit the buffer_head for read. We can
1109 + * safely mark the bitmap as uptodate now.
1110 + * We do it here so the bitmap uptodate bit
1111 + * gets set with the buffer lock held.
1112 + */
1113 + set_bitmap_uptodate(bh[i]);
1114 bh[i]->b_end_io = end_buffer_read_sync;
1115 submit_bh(READ, bh[i]);
1116 mb_debug("read bitmap for group %lu\n", first_group + i);
1117 @@ -814,6 +848,8 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
1118
1119 err = 0;
1120 first_block = page->index * blocks_per_page;
1121 + /* init the page */
1122 + memset(page_address(page), 0xff, PAGE_CACHE_SIZE);
1123 for (i = 0; i < blocks_per_page; i++) {
1124 int group;
1125 struct ext4_group_info *grinfo;
1126 @@ -840,7 +876,6 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
1127 BUG_ON(incore == NULL);
1128 mb_debug("put buddy for group %u in page %lu/%x\n",
1129 group, page->index, i * blocksize);
1130 - memset(data, 0xff, blocksize);
1131 grinfo = ext4_get_group_info(sb, group);
1132 grinfo->bb_fragments = 0;
1133 memset(grinfo->bb_counters, 0,
1134 @@ -848,7 +883,9 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
1135 /*
1136 * incore got set to the group block bitmap below
1137 */
1138 + ext4_lock_group(sb, group);
1139 ext4_mb_generate_buddy(sb, data, incore, group);
1140 + ext4_unlock_group(sb, group);
1141 incore = NULL;
1142 } else {
1143 /* this is block of bitmap */
1144 @@ -862,6 +899,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
1145
1146 /* mark all preallocated blks used in in-core bitmap */
1147 ext4_mb_generate_from_pa(sb, data, group);
1148 + ext4_mb_generate_from_freelist(sb, data, group);
1149 ext4_unlock_group(sb, group);
1150
1151 /* set incore so that the buddy information can be
1152 @@ -886,18 +924,20 @@ static noinline_for_stack int
1153 ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
1154 struct ext4_buddy *e4b)
1155 {
1156 - struct ext4_sb_info *sbi = EXT4_SB(sb);
1157 - struct inode *inode = sbi->s_buddy_cache;
1158 int blocks_per_page;
1159 int block;
1160 int pnum;
1161 int poff;
1162 struct page *page;
1163 int ret;
1164 + struct ext4_group_info *grp;
1165 + struct ext4_sb_info *sbi = EXT4_SB(sb);
1166 + struct inode *inode = sbi->s_buddy_cache;
1167
1168 mb_debug("load group %lu\n", group);
1169
1170 blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
1171 + grp = ext4_get_group_info(sb, group);
1172
1173 e4b->bd_blkbits = sb->s_blocksize_bits;
1174 e4b->bd_info = ext4_get_group_info(sb, group);
1175 @@ -905,6 +945,15 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
1176 e4b->bd_group = group;
1177 e4b->bd_buddy_page = NULL;
1178 e4b->bd_bitmap_page = NULL;
1179 + e4b->alloc_semp = &grp->alloc_sem;
1180 +
1181 + /* Take the read lock on the group alloc
1182 + * sem. This would make sure a parallel
1183 + * ext4_mb_init_group happening on other
1184 + * groups mapped by the page is blocked
1185 + * till we are done with allocation
1186 + */
1187 + down_read(e4b->alloc_semp);
1188
1189 /*
1190 * the buddy cache inode stores the block bitmap
1191 @@ -920,6 +969,14 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
1192 page = find_get_page(inode->i_mapping, pnum);
1193 if (page == NULL || !PageUptodate(page)) {
1194 if (page)
1195 + /*
1196 + * drop the page reference and try
1197 + * to get the page with lock. If we
1198 + * are not uptodate that implies
1199 + * somebody just created the page but
1200 + * has not yet initialized it. So
1201 + * wait for it to initialize.
1202 + */
1203 page_cache_release(page);
1204 page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
1205 if (page) {
1206 @@ -985,6 +1042,9 @@ err:
1207 page_cache_release(e4b->bd_buddy_page);
1208 e4b->bd_buddy = NULL;
1209 e4b->bd_bitmap = NULL;
1210 +
1211 + /* Done with the buddy cache */
1212 + up_read(e4b->alloc_semp);
1213 return ret;
1214 }
1215
1216 @@ -994,6 +1054,9 @@ static void ext4_mb_release_desc(struct ext4_buddy *e4b)
1217 page_cache_release(e4b->bd_bitmap_page);
1218 if (e4b->bd_buddy_page)
1219 page_cache_release(e4b->bd_buddy_page);
1220 + /* Done with the buddy cache */
1221 + if (e4b->alloc_semp)
1222 + up_read(e4b->alloc_semp);
1223 }
1224
1225
1226 @@ -1031,7 +1094,10 @@ static void mb_clear_bits(spinlock_t *lock, void *bm, int cur, int len)
1227 cur += 32;
1228 continue;
1229 }
1230 - mb_clear_bit_atomic(lock, cur, bm);
1231 + if (lock)
1232 + mb_clear_bit_atomic(lock, cur, bm);
1233 + else
1234 + mb_clear_bit(cur, bm);
1235 cur++;
1236 }
1237 }
1238 @@ -1049,7 +1115,10 @@ static void mb_set_bits(spinlock_t *lock, void *bm, int cur, int len)
1239 cur += 32;
1240 continue;
1241 }
1242 - mb_set_bit_atomic(lock, cur, bm);
1243 + if (lock)
1244 + mb_set_bit_atomic(lock, cur, bm);
1245 + else
1246 + mb_set_bit(cur, bm);
1247 cur++;
1248 }
1249 }
1250 @@ -1296,13 +1365,20 @@ static void ext4_mb_use_best_found(struct ext4_allocation_context *ac,
1251 ac->ac_tail = ret & 0xffff;
1252 ac->ac_buddy = ret >> 16;
1253
1254 - /* XXXXXXX: SUCH A HORRIBLE **CK */
1255 - /*FIXME!! Why ? */
1256 + /*
1257 + * take the page reference. We want the page to be pinned
1258 + * so that we don't get a ext4_mb_init_cache_call for this
1259 + * group until we update the bitmap. That would mean we
1260 + * double allocate blocks. The reference is dropped
1261 + * in ext4_mb_release_context
1262 + */
1263 ac->ac_bitmap_page = e4b->bd_bitmap_page;
1264 get_page(ac->ac_bitmap_page);
1265 ac->ac_buddy_page = e4b->bd_buddy_page;
1266 get_page(ac->ac_buddy_page);
1267 -
1268 + /* on allocation we use ac to track the held semaphore */
1269 + ac->alloc_semp = e4b->alloc_semp;
1270 + e4b->alloc_semp = NULL;
1271 /* store last allocated for subsequent stream allocation */
1272 if ((ac->ac_flags & EXT4_MB_HINT_DATA)) {
1273 spin_lock(&sbi->s_md_lock);
1274 @@ -1326,6 +1402,8 @@ static void ext4_mb_check_limits(struct ext4_allocation_context *ac,
1275 struct ext4_free_extent ex;
1276 int max;
1277
1278 + if (ac->ac_status == AC_STATUS_FOUND)
1279 + return;
1280 /*
1281 * We don't want to scan for a whole year
1282 */
1283 @@ -1692,6 +1770,173 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac,
1284 return 0;
1285 }
1286
1287 +/*
1288 + * lock the group_info alloc_sem of all the groups
1289 + * belonging to the same buddy cache page. This
1290 + * makes sure other parallel operations on the buddy
1291 + * cache don't happen while holding the buddy cache
1292 + * lock
1293 + */
1294 +int ext4_mb_get_buddy_cache_lock(struct super_block *sb, ext4_group_t group)
1295 +{
1296 + int i;
1297 + int block, pnum;
1298 + int blocks_per_page;
1299 + int groups_per_page;
1300 + ext4_group_t first_group;
1301 + struct ext4_group_info *grp;
1302 +
1303 + blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
1304 + /*
1305 + * the buddy cache inode stores the block bitmap
1306 + * and buddy information in consecutive blocks.
1307 + * So for each group we need two blocks.
1308 + */
1309 + block = group * 2;
1310 + pnum = block / blocks_per_page;
1311 + first_group = pnum * blocks_per_page / 2;
1312 +
1313 + groups_per_page = blocks_per_page >> 1;
1314 + if (groups_per_page == 0)
1315 + groups_per_page = 1;
1316 + /* read all groups the page covers into the cache */
1317 + for (i = 0; i < groups_per_page; i++) {
1318 +
1319 + if ((first_group + i) >= EXT4_SB(sb)->s_groups_count)
1320 + break;
1321 + grp = ext4_get_group_info(sb, first_group + i);
1322 + /* take each group's write allocation
1323 + * semaphore. This makes sure there is
1324 + * no block allocation going on in any
1325 + * of those groups
1326 + */
1327 + down_write(&grp->alloc_sem);
1328 + }
1329 + return i;
1330 +}
1331 +
1332 +void ext4_mb_put_buddy_cache_lock(struct super_block *sb,
1333 + ext4_group_t group, int locked_group)
1334 +{
1335 + int i;
1336 + int block, pnum;
1337 + int blocks_per_page;
1338 + ext4_group_t first_group;
1339 + struct ext4_group_info *grp;
1340 +
1341 + blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
1342 + /*
1343 + * the buddy cache inode stores the block bitmap
1344 + * and buddy information in consecutive blocks.
1345 + * So for each group we need two blocks.
1346 + */
1347 + block = group * 2;
1348 + pnum = block / blocks_per_page;
1349 + first_group = pnum * blocks_per_page / 2;
1350 + /* release locks on all the groups */
1351 + for (i = 0; i < locked_group; i++) {
1352 +
1353 + grp = ext4_get_group_info(sb, first_group + i);
1354 + /* release each group's write allocation
1355 + * semaphore, taken in
1356 + * ext4_mb_get_buddy_cache_lock, so block
1357 + * allocation in those groups can resume
1358 + */
1359 + up_write(&grp->alloc_sem);
1360 + }
1361 +
1362 +}
1363 +
1364 +static int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
1365 +{
1366 +
1367 + int ret;
1368 + void *bitmap;
1369 + int blocks_per_page;
1370 + int block, pnum, poff;
1371 + int num_grp_locked = 0;
1372 + struct ext4_group_info *this_grp;
1373 + struct ext4_sb_info *sbi = EXT4_SB(sb);
1374 + struct inode *inode = sbi->s_buddy_cache;
1375 + struct page *page = NULL, *bitmap_page = NULL;
1376 +
1377 + mb_debug("init group %lu\n", group);
1378 + blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
1379 + this_grp = ext4_get_group_info(sb, group);
1380 + /*
1381 + * This ensures we don't add group
1382 + * to this buddy cache via resize
1383 + */
1384 + num_grp_locked = ext4_mb_get_buddy_cache_lock(sb, group);
1385 + if (!EXT4_MB_GRP_NEED_INIT(this_grp)) {
1386 + /*
1387 + * somebody initialized the group
1388 + * return without doing anything
1389 + */
1390 + ret = 0;
1391 + goto err;
1392 + }
1393 + /*
1394 + * the buddy cache inode stores the block bitmap
1395 + * and buddy information in consecutive blocks.
1396 + * So for each group we need two blocks.
1397 + */
1398 + block = group * 2;
1399 + pnum = block / blocks_per_page;
1400 + poff = block % blocks_per_page;
1401 + page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
1402 + if (page) {
1403 + BUG_ON(page->mapping != inode->i_mapping);
1404 + ret = ext4_mb_init_cache(page, NULL);
1405 + if (ret) {
1406 + unlock_page(page);
1407 + goto err;
1408 + }
1409 + unlock_page(page);
1410 + }
1411 + if (page == NULL || !PageUptodate(page)) {
1412 + ret = -EIO;
1413 + goto err;
1414 + }
1415 + mark_page_accessed(page);
1416 + bitmap_page = page;
1417 + bitmap = page_address(page) + (poff * sb->s_blocksize);
1418 +
1419 + /* init buddy cache */
1420 + block++;
1421 + pnum = block / blocks_per_page;
1422 + poff = block % blocks_per_page;
1423 + page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
1424 + if (page == bitmap_page) {
1425 + /*
1426 + * If both the bitmap and buddy are in
1427 + * the same page we don't need to force
1428 + * init the buddy
1429 + */
1430 + unlock_page(page);
1431 + } else if (page) {
1432 + BUG_ON(page->mapping != inode->i_mapping);
1433 + ret = ext4_mb_init_cache(page, bitmap);
1434 + if (ret) {
1435 + unlock_page(page);
1436 + goto err;
1437 + }
1438 + unlock_page(page);
1439 + }
1440 + if (page == NULL || !PageUptodate(page)) {
1441 + ret = -EIO;
1442 + goto err;
1443 + }
1444 + mark_page_accessed(page);
1445 +err:
1446 + ext4_mb_put_buddy_cache_lock(sb, group, num_grp_locked);
1447 + if (bitmap_page)
1448 + page_cache_release(bitmap_page);
1449 + if (page)
1450 + page_cache_release(page);
1451 + return ret;
1452 +}
1453 +
1454 static noinline_for_stack int
1455 ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
1456 {
1457 @@ -1775,7 +2020,7 @@ repeat:
1458 group = 0;
1459
1460 /* quick check to skip empty groups */
1461 - grp = ext4_get_group_info(ac->ac_sb, group);
1462 + grp = ext4_get_group_info(sb, group);
1463 if (grp->bb_free == 0)
1464 continue;
1465
1466 @@ -1788,10 +2033,9 @@ repeat:
1467 * we need full data about the group
1468 * to make a good selection
1469 */
1470 - err = ext4_mb_load_buddy(sb, group, &e4b);
1471 + err = ext4_mb_init_group(sb, group);
1472 if (err)
1473 goto out;
1474 - ext4_mb_release_desc(&e4b);
1475 }
1476
1477 /*
1478 @@ -2299,6 +2543,8 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
1479 }
1480
1481 INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list);
1482 + init_rwsem(&meta_group_info[i]->alloc_sem);
1483 + meta_group_info[i]->bb_free_root.rb_node = NULL;
1484
1485 #ifdef DOUBLE_CHECK
1486 {
1487 @@ -2325,54 +2571,6 @@ exit_meta_group_info:
1488 } /* ext4_mb_add_groupinfo */
1489
1490 /*
1491 - * Add a group to the existing groups.
1492 - * This function is used for online resize
1493 - */
1494 -int ext4_mb_add_more_groupinfo(struct super_block *sb, ext4_group_t group,
1495 - struct ext4_group_desc *desc)
1496 -{
1497 - struct ext4_sb_info *sbi = EXT4_SB(sb);
1498 - struct inode *inode = sbi->s_buddy_cache;
1499 - int blocks_per_page;
1500 - int block;
1501 - int pnum;
1502 - struct page *page;
1503 - int err;
1504 -
1505 - /* Add group based on group descriptor*/
1506 - err = ext4_mb_add_groupinfo(sb, group, desc);
1507 - if (err)
1508 - return err;
1509 -
1510 - /*
1511 - * Cache pages containing dynamic mb_alloc datas (buddy and bitmap
1512 - * datas) are set not up to date so that they will be re-initilaized
1513 - * during the next call to ext4_mb_load_buddy
1514 - */
1515 -
1516 - /* Set buddy page as not up to date */
1517 - blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
1518 - block = group * 2;
1519 - pnum = block / blocks_per_page;
1520 - page = find_get_page(inode->i_mapping, pnum);
1521 - if (page != NULL) {
1522 - ClearPageUptodate(page);
1523 - page_cache_release(page);
1524 - }
1525 -
1526 - /* Set bitmap page as not up to date */
1527 - block++;
1528 - pnum = block / blocks_per_page;
1529 - page = find_get_page(inode->i_mapping, pnum);
1530 - if (page != NULL) {
1531 - ClearPageUptodate(page);
1532 - page_cache_release(page);
1533 - }
1534 -
1535 - return 0;
1536 -}
1537 -
1538 -/*
1539 * Update an existing group.
1540 * This function is used for online resize
1541 */
1542 @@ -2495,6 +2693,8 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
1543 clear_opt(sbi->s_mount_opt, MBALLOC);
1544 return -ENOMEM;
1545 }
1546 +
1547 + i = (sb->s_blocksize_bits + 2) * sizeof(unsigned int);
1548 sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL);
1549 if (sbi->s_mb_maxs == NULL) {
1550 clear_opt(sbi->s_mount_opt, MBALLOC);
1551 @@ -2658,13 +2858,11 @@ int ext4_mb_release(struct super_block *sb)
1552 static noinline_for_stack void
1553 ext4_mb_free_committed_blocks(struct super_block *sb)
1554 {
1555 - struct ext4_sb_info *sbi = EXT4_SB(sb);
1556 - int err;
1557 - int i;
1558 - int count = 0;
1559 - int count2 = 0;
1560 - struct ext4_free_metadata *md;
1561 struct ext4_buddy e4b;
1562 + struct ext4_group_info *db;
1563 + struct ext4_sb_info *sbi = EXT4_SB(sb);
1564 + int err, count = 0, count2 = 0;
1565 + struct ext4_free_data *entry;
1566
1567 if (list_empty(&sbi->s_committed_transaction))
1568 return;
1569 @@ -2672,44 +2870,46 @@ ext4_mb_free_committed_blocks(struct super_block *sb)
1570 /* there is committed blocks to be freed yet */
1571 do {
1572 /* get next array of blocks */
1573 - md = NULL;
1574 + entry = NULL;
1575 spin_lock(&sbi->s_md_lock);
1576 if (!list_empty(&sbi->s_committed_transaction)) {
1577 - md = list_entry(sbi->s_committed_transaction.next,
1578 - struct ext4_free_metadata, list);
1579 - list_del(&md->list);
1580 + entry = list_entry(sbi->s_committed_transaction.next,
1581 + struct ext4_free_data, list);
1582 + list_del(&entry->list);
1583 }
1584 spin_unlock(&sbi->s_md_lock);
1585
1586 - if (md == NULL)
1587 + if (entry == NULL)
1588 break;
1589
1590 mb_debug("gonna free %u blocks in group %lu (0x%p):",
1591 - md->num, md->group, md);
1592 + entry->count, entry->group, entry);
1593
1594 - err = ext4_mb_load_buddy(sb, md->group, &e4b);
1595 + err = ext4_mb_load_buddy(sb, entry->group, &e4b);
1596 /* we expect to find existing buddy because it's pinned */
1597 BUG_ON(err != 0);
1598
1599 + db = e4b.bd_info;
1600 /* there are blocks to put in buddy to make them really free */
1601 - count += md->num;
1602 + count += entry->count;
1603 count2++;
1604 - ext4_lock_group(sb, md->group);
1605 - for (i = 0; i < md->num; i++) {
1606 - mb_debug(" %u", md->blocks[i]);
1607 - mb_free_blocks(NULL, &e4b, md->blocks[i], 1);
1608 + ext4_lock_group(sb, entry->group);
1609 + /* Take it out of per group rb tree */
1610 + rb_erase(&entry->node, &(db->bb_free_root));
1611 + mb_free_blocks(NULL, &e4b, entry->start_blk, entry->count);
1612 +
1613 + if (!db->bb_free_root.rb_node) {
1614 + /* No more items in the per group rb tree
1615 + * balance refcounts from ext4_mb_free_metadata()
1616 + */
1617 + page_cache_release(e4b.bd_buddy_page);
1618 + page_cache_release(e4b.bd_bitmap_page);
1619 }
1620 - mb_debug("\n");
1621 - ext4_unlock_group(sb, md->group);
1622 -
1623 - /* balance refcounts from ext4_mb_free_metadata() */
1624 - page_cache_release(e4b.bd_buddy_page);
1625 - page_cache_release(e4b.bd_bitmap_page);
1626 + ext4_unlock_group(sb, entry->group);
1627
1628 - kfree(md);
1629 + kmem_cache_free(ext4_free_ext_cachep, entry);
1630 ext4_mb_release_desc(&e4b);
1631 -
1632 - } while (md);
1633 + } while (1);
1634
1635 mb_debug("freed %u blocks in %u structures\n", count, count2);
1636 }
1637 @@ -2864,6 +3064,16 @@ int __init init_ext4_mballoc(void)
1638 kmem_cache_destroy(ext4_pspace_cachep);
1639 return -ENOMEM;
1640 }
1641 +
1642 + ext4_free_ext_cachep =
1643 + kmem_cache_create("ext4_free_block_extents",
1644 + sizeof(struct ext4_free_data),
1645 + 0, SLAB_RECLAIM_ACCOUNT, NULL);
1646 + if (ext4_free_ext_cachep == NULL) {
1647 + kmem_cache_destroy(ext4_pspace_cachep);
1648 + kmem_cache_destroy(ext4_ac_cachep);
1649 + return -ENOMEM;
1650 + }
1651 #ifdef CONFIG_PROC_FS
1652 proc_root_ext4 = proc_mkdir("fs/ext4", NULL);
1653 if (proc_root_ext4 == NULL)
1654 @@ -2880,6 +3090,7 @@ void exit_ext4_mballoc(void)
1655 #ifdef CONFIG_PROC_FS
1656 remove_proc_entry("fs/ext4", NULL);
1657 #endif
1658 + kmem_cache_destroy(ext4_free_ext_cachep);
1659 }
1660
1661
1662 @@ -2941,8 +3152,8 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
1663 in_range(block + len - 1, ext4_inode_table(sb, gdp),
1664 EXT4_SB(sb)->s_itb_per_group)) {
1665 ext4_error(sb, __func__,
1666 - "Allocating block in system zone - block = %llu",
1667 - block);
1668 + "Allocating block %llu in system zone of group %lu\n",
1669 + block, ac->ac_b_ex.fe_group);
1670 /* File system mounted not to panic on error
1671 * Fix the bitmap and repeat the block allocation
1672 * We leak some of the blocks here.
1673 @@ -2964,10 +3175,9 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
1674 }
1675 }
1676 #endif
1677 - mb_set_bits(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group), bitmap_bh->b_data,
1678 - ac->ac_b_ex.fe_start, ac->ac_b_ex.fe_len);
1679 -
1680 spin_lock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group));
1681 + mb_set_bits(NULL, bitmap_bh->b_data,
1682 + ac->ac_b_ex.fe_start, ac->ac_b_ex.fe_len);
1683 if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
1684 gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
1685 gdp->bg_free_blocks_count =
1686 @@ -3400,10 +3610,37 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
1687 ac->ac_criteria = 20;
1688 return 1;
1689 }
1690 +
1691 return 0;
1692 }
1693
1694 /*
1695 + * the function goes through all blocks freed in the group
1696 + * but not yet committed and marks them used in the in-core bitmap;
1697 + * the buddy must be generated from this bitmap.
1698 + * Must be called with the ext4 group lock held (ext4_lock_group)
1699 + */
1700 +static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
1701 + ext4_group_t group)
1702 +{
1703 + struct rb_node *n;
1704 + struct ext4_group_info *grp;
1705 + struct ext4_free_data *entry;
1706 +
1707 + grp = ext4_get_group_info(sb, group);
1708 + n = rb_first(&(grp->bb_free_root));
1709 +
1710 + while (n) {
1711 + entry = rb_entry(n, struct ext4_free_data, node);
1712 + mb_set_bits(sb_bgl_lock(EXT4_SB(sb), group),
1713 + bitmap, entry->start_blk,
1714 + entry->count);
1715 + n = rb_next(n);
1716 + }
1717 + return;
1718 +}
1719 +
1720 +/*
1721 * the function goes through all preallocations in this group and marks them
1722 * used in the in-core bitmap; the buddy must be generated from this bitmap.
1723 * Must be called with the ext4 group lock held (ext4_lock_group)
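
ext4_mb_generate_from_freelist() above walks the per-group rb tree with rb_first()/rb_next() and marks every not-yet-committed extent busy in the in-core bitmap, so a buddy rebuilt from that bitmap cannot hand the blocks out again. A userspace sketch of the effect, with a plain array of pending extents standing in for the rb tree:

#include <stdio.h>
#include <string.h>

struct extent { unsigned int start, count; };

/* Set bits [start, start + count) in a byte-addressed bitmap. */
static void set_bits(unsigned char *bitmap, unsigned int start, unsigned int count)
{
    unsigned int b;

    for (b = start; b < start + count; b++)
        bitmap[b / 8] |= 1u << (b % 8);
}

int main(void)
{
    unsigned char bitmap[16];                  /* an in-core bitmap of 128 blocks */
    struct extent pending[] = { { 3, 4 }, { 40, 2 } };
    unsigned int i;

    memset(bitmap, 0, sizeof(bitmap));
    for (i = 0; i < 2; i++)                    /* the rb_first()/rb_next() walk */
        set_bits(bitmap, pending[i].start, pending[i].count);

    printf("byte 0 = 0x%02x\n", bitmap[0]);    /* bits 3..6 set: prints 0x78 */
    return 0;
}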
1724 @@ -4166,6 +4403,7 @@ ext4_mb_initialize_context(struct ext4_allocation_context *ac,
1725 ac->ac_pa = NULL;
1726 ac->ac_bitmap_page = NULL;
1727 ac->ac_buddy_page = NULL;
1728 + ac->alloc_semp = NULL;
1729 ac->ac_lg = NULL;
1730
1731 /* we have to define context: we'll we work with a file or
1732 @@ -4346,6 +4584,8 @@ static int ext4_mb_release_context(struct ext4_allocation_context *ac)
1733 }
1734 ext4_mb_put_pa(ac, ac->ac_sb, pa);
1735 }
1736 + if (ac->alloc_semp)
1737 + up_read(ac->alloc_semp);
1738 if (ac->ac_bitmap_page)
1739 page_cache_release(ac->ac_bitmap_page);
1740 if (ac->ac_buddy_page)
1741 @@ -4449,10 +4689,14 @@ repeat:
1742 ac->ac_o_ex.fe_len < ac->ac_b_ex.fe_len)
1743 ext4_mb_new_preallocation(ac);
1744 }
1745 -
1746 if (likely(ac->ac_status == AC_STATUS_FOUND)) {
1747 *errp = ext4_mb_mark_diskspace_used(ac, handle);
1748 if (*errp == -EAGAIN) {
1749 + /*
1750 + * drop the reference that we took
1751 + * in ext4_mb_use_best_found
1752 + */
1753 + ext4_mb_release_context(ac);
1754 ac->ac_b_ex.fe_group = 0;
1755 ac->ac_b_ex.fe_start = 0;
1756 ac->ac_b_ex.fe_len = 0;
1757 @@ -4517,65 +4761,97 @@ static void ext4_mb_poll_new_transaction(struct super_block *sb,
1758 ext4_mb_free_committed_blocks(sb);
1759 }
1760
1761 +/*
1762 + * We can merge two free data extents only if the physical blocks
1763 + * are contiguous, AND the extents were freed by the same transaction,
1764 + * AND the blocks are associated with the same group.
1765 + */
1766 +static int can_merge(struct ext4_free_data *entry1,
1767 + struct ext4_free_data *entry2)
1768 +{
1769 + if ((entry1->t_tid == entry2->t_tid) &&
1770 + (entry1->group == entry2->group) &&
1771 + ((entry1->start_blk + entry1->count) == entry2->start_blk))
1772 + return 1;
1773 + return 0;
1774 +}
1775 +
1776 static noinline_for_stack int
1777 ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
1778 - ext4_group_t group, ext4_grpblk_t block, int count)
1779 + struct ext4_free_data *new_entry)
1780 {
1781 + ext4_grpblk_t block;
1782 + struct ext4_free_data *entry;
1783 struct ext4_group_info *db = e4b->bd_info;
1784 struct super_block *sb = e4b->bd_sb;
1785 struct ext4_sb_info *sbi = EXT4_SB(sb);
1786 - struct ext4_free_metadata *md;
1787 - int i;
1788 + struct rb_node **n = &db->bb_free_root.rb_node, *node;
1789 + struct rb_node *parent = NULL, *new_node;
1790
1791 BUG_ON(e4b->bd_bitmap_page == NULL);
1792 BUG_ON(e4b->bd_buddy_page == NULL);
1793
1794 - ext4_lock_group(sb, group);
1795 - for (i = 0; i < count; i++) {
1796 - md = db->bb_md_cur;
1797 - if (md && db->bb_tid != handle->h_transaction->t_tid) {
1798 - db->bb_md_cur = NULL;
1799 - md = NULL;
1800 + new_node = &new_entry->node;
1801 + block = new_entry->start_blk;
1802 +
1803 + if (!*n) {
1804 + /* First free block extent. We need to
1805 + * protect the buddy cache from being freed,
1806 + * otherwise we'll refresh it from the
1807 + * on-disk bitmap and lose not-yet-available
1808 + * blocks */
1809 + page_cache_get(e4b->bd_buddy_page);
1810 + page_cache_get(e4b->bd_bitmap_page);
1811 + }
1812 + while (*n) {
1813 + parent = *n;
1814 + entry = rb_entry(parent, struct ext4_free_data, node);
1815 + if (block < entry->start_blk)
1816 + n = &(*n)->rb_left;
1817 + else if (block >= (entry->start_blk + entry->count))
1818 + n = &(*n)->rb_right;
1819 + else {
1820 + ext4_error(sb, __func__,
1821 + "Double free of blocks %d (%d %d)\n",
1822 + block, entry->start_blk, entry->count);
1823 + return 0;
1824 }
1825 + }
1826
1827 - if (md == NULL) {
1828 - ext4_unlock_group(sb, group);
1829 - md = kmalloc(sizeof(*md), GFP_NOFS);
1830 - if (md == NULL)
1831 - return -ENOMEM;
1832 - md->num = 0;
1833 - md->group = group;
1834 -
1835 - ext4_lock_group(sb, group);
1836 - if (db->bb_md_cur == NULL) {
1837 - spin_lock(&sbi->s_md_lock);
1838 - list_add(&md->list, &sbi->s_active_transaction);
1839 - spin_unlock(&sbi->s_md_lock);
1840 - /* protect buddy cache from being freed,
1841 - * otherwise we'll refresh it from
1842 - * on-disk bitmap and lose not-yet-available
1843 - * blocks */
1844 - page_cache_get(e4b->bd_buddy_page);
1845 - page_cache_get(e4b->bd_bitmap_page);
1846 - db->bb_md_cur = md;
1847 - db->bb_tid = handle->h_transaction->t_tid;
1848 - mb_debug("new md 0x%p for group %lu\n",
1849 - md, md->group);
1850 - } else {
1851 - kfree(md);
1852 - md = db->bb_md_cur;
1853 - }
1854 + rb_link_node(new_node, parent, n);
1855 + rb_insert_color(new_node, &db->bb_free_root);
1856 +
1857 + /* Now try to see if the extent can be merged to the left and right */
1858 + node = rb_prev(new_node);
1859 + if (node) {
1860 + entry = rb_entry(node, struct ext4_free_data, node);
1861 + if (can_merge(entry, new_entry)) {
1862 + new_entry->start_blk = entry->start_blk;
1863 + new_entry->count += entry->count;
1864 + rb_erase(node, &(db->bb_free_root));
1865 + spin_lock(&sbi->s_md_lock);
1866 + list_del(&entry->list);
1867 + spin_unlock(&sbi->s_md_lock);
1868 + kmem_cache_free(ext4_free_ext_cachep, entry);
1869 }
1870 + }
1871
1872 - BUG_ON(md->num >= EXT4_BB_MAX_BLOCKS);
1873 - md->blocks[md->num] = block + i;
1874 - md->num++;
1875 - if (md->num == EXT4_BB_MAX_BLOCKS) {
1876 - /* no more space, put full container on a sb's list */
1877 - db->bb_md_cur = NULL;
1878 + node = rb_next(new_node);
1879 + if (node) {
1880 + entry = rb_entry(node, struct ext4_free_data, node);
1881 + if (can_merge(new_entry, entry)) {
1882 + new_entry->count += entry->count;
1883 + rb_erase(node, &(db->bb_free_root));
1884 + spin_lock(&sbi->s_md_lock);
1885 + list_del(&entry->list);
1886 + spin_unlock(&sbi->s_md_lock);
1887 + kmem_cache_free(ext4_free_ext_cachep, entry);
1888 }
1889 }
1890 - ext4_unlock_group(sb, group);
1891 + /* Add the extent to active_transaction list */
1892 + spin_lock(&sbi->s_md_lock);
1893 + list_add(&new_entry->list, &sbi->s_active_transaction);
1894 + spin_unlock(&sbi->s_md_lock);
1895 return 0;
1896 }
1897
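ext4_mb_free_metadata() above is a textbook rb-tree insertion (walk **n down to a leaf, rb_link_node(), rb_insert_color()) followed by merge attempts against the in-order predecessor and successor; an insertion that lands inside an existing extent is reported as a double free. A userspace sketch of the same insert-and-merge logic over a sorted singly linked list standing in for the rb tree (the t_tid/group checks from can_merge() are omitted for brevity):

#include <stdio.h>
#include <stdlib.h>

struct extent {
    unsigned int start, count;
    struct extent *next;
};

/* Insert [start, start + count) keeping the list sorted and merging
 * with contiguous neighbours; returns -1 on overlap (a double free). */
static int insert_extent(struct extent **head, unsigned int start, unsigned int count)
{
    struct extent **pp = head, *prev = NULL, *next, *e;

    while (*pp && (*pp)->start + (*pp)->count <= start) {
        prev = *pp;
        pp = &(*pp)->next;
    }
    next = *pp;
    if (next && start + count > next->start)
        return -1;                      /* overlap: double free */

    if (prev && prev->start + prev->count == start) {
        prev->count += count;           /* merge with the left neighbour */
        if (next && prev->start + prev->count == next->start) {
            prev->count += next->count; /* ...and with the right one */
            prev->next = next->next;
            free(next);
        }
        return 0;
    }
    if (next && start + count == next->start) {
        next->start = start;            /* merge with the right neighbour */
        next->count += count;
        return 0;
    }
    e = malloc(sizeof(*e));             /* no merge: link a new node */
    e->start = start;
    e->count = count;
    e->next = next;
    *pp = e;
    return 0;
}

int main(void)
{
    struct extent *head = NULL, *e;

    insert_extent(&head, 10, 5);
    insert_extent(&head, 20, 5);
    insert_extent(&head, 15, 5);        /* bridges both: one extent [10, 25) */
    if (insert_extent(&head, 12, 2))
        printf("double free detected\n");
    for (e = head; e; e = e->next)
        printf("[%u, %u)\n", e->start, e->start + e->count);
    return 0;
}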
1898 @@ -4675,11 +4951,6 @@ do_more:
1899 err = ext4_journal_get_write_access(handle, gd_bh);
1900 if (err)
1901 goto error_return;
1902 -
1903 - err = ext4_mb_load_buddy(sb, block_group, &e4b);
1904 - if (err)
1905 - goto error_return;
1906 -
1907 #ifdef AGGRESSIVE_CHECK
1908 {
1909 int i;
1910 @@ -4687,13 +4958,6 @@ do_more:
1911 BUG_ON(!mb_test_bit(bit + i, bitmap_bh->b_data));
1912 }
1913 #endif
1914 - mb_clear_bits(sb_bgl_lock(sbi, block_group), bitmap_bh->b_data,
1915 - bit, count);
1916 -
1917 - /* We dirtied the bitmap block */
1918 - BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
1919 - err = ext4_journal_dirty_metadata(handle, bitmap_bh);
1920 -
1921 if (ac) {
1922 ac->ac_b_ex.fe_group = block_group;
1923 ac->ac_b_ex.fe_start = bit;
1924 @@ -4701,12 +4965,33 @@ do_more:
1925 ext4_mb_store_history(ac);
1926 }
1927
1928 + err = ext4_mb_load_buddy(sb, block_group, &e4b);
1929 + if (err)
1930 + goto error_return;
1931 if (metadata) {
1932 - /* blocks being freed are metadata. these blocks shouldn't
1933 - * be used until this transaction is committed */
1934 - ext4_mb_free_metadata(handle, &e4b, block_group, bit, count);
1935 + struct ext4_free_data *new_entry;
1936 + /*
1937 + * blocks being freed are metadata. these blocks shouldn't
1938 + * be used until this transaction is committed
1939 + */
1940 + new_entry = kmem_cache_alloc(ext4_free_ext_cachep, GFP_NOFS);
1941 + new_entry->start_blk = bit;
1942 + new_entry->group = block_group;
1943 + new_entry->count = count;
1944 + new_entry->t_tid = handle->h_transaction->t_tid;
1945 + ext4_lock_group(sb, block_group);
1946 + mb_clear_bits(sb_bgl_lock(sbi, block_group), bitmap_bh->b_data,
1947 + bit, count);
1948 + ext4_mb_free_metadata(handle, &e4b, new_entry);
1949 + ext4_unlock_group(sb, block_group);
1950 } else {
1951 ext4_lock_group(sb, block_group);
1952 + /* Need to update group_info->bb_free and the bitmap
1953 + * with the group lock held; generate_buddy looks at
1954 + * them with the group lock held
1955 + */
1956 + mb_clear_bits(sb_bgl_lock(sbi, block_group), bitmap_bh->b_data,
1957 + bit, count);
1958 mb_free_blocks(inode, &e4b, bit, count);
1959 ext4_mb_return_to_preallocation(inode, &e4b, block, count);
1960 ext4_unlock_group(sb, block_group);
1961 @@ -4729,6 +5014,10 @@ do_more:
1962
1963 *freed += count;
1964
1965 + /* We dirtied the bitmap block */
1966 + BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
1967 + err = ext4_journal_dirty_metadata(handle, bitmap_bh);
1968 +
1969 /* And the group descriptor block */
1970 BUFFER_TRACE(gd_bh, "dirtied group descriptor block");
1971 ret = ext4_journal_dirty_metadata(handle, gd_bh);
1972 diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
1973 index c7c9906..0a28dd3 100644
1974 --- a/fs/ext4/mballoc.h
1975 +++ b/fs/ext4/mballoc.h
1976 @@ -18,6 +18,7 @@
1977 #include <linux/pagemap.h>
1978 #include <linux/seq_file.h>
1979 #include <linux/version.h>
1980 +#include <linux/mutex.h>
1981 #include "ext4_jbd2.h"
1982 #include "ext4.h"
1983 #include "group.h"
1984 @@ -96,25 +97,27 @@
1985 */
1986 #define MB_DEFAULT_GROUP_PREALLOC 512
1987
1988 -static struct kmem_cache *ext4_pspace_cachep;
1989 -static struct kmem_cache *ext4_ac_cachep;
1990 +struct ext4_free_data {
1991 + /* this links the free block information from group_info */
1992 + struct rb_node node;
1993
1994 -#ifdef EXT4_BB_MAX_BLOCKS
1995 -#undef EXT4_BB_MAX_BLOCKS
1996 -#endif
1997 -#define EXT4_BB_MAX_BLOCKS 30
1998 + /* this links the free block information from ext4_sb_info */
1999 + struct list_head list;
2000
2001 -struct ext4_free_metadata {
2002 + /* group to which the free block extent belongs */
2003 ext4_group_t group;
2004 - unsigned short num;
2005 - ext4_grpblk_t blocks[EXT4_BB_MAX_BLOCKS];
2006 - struct list_head list;
2007 +
2008 + /* free block extent */
2009 + ext4_grpblk_t start_blk;
2010 + ext4_grpblk_t count;
2011 +
2012 + /* transaction which freed this extent */
2013 + tid_t t_tid;
2014 };
2015
2016 struct ext4_group_info {
2017 unsigned long bb_state;
2018 - unsigned long bb_tid;
2019 - struct ext4_free_metadata *bb_md_cur;
2020 + struct rb_root bb_free_root;
2021 unsigned short bb_first_free;
2022 unsigned short bb_free;
2023 unsigned short bb_fragments;
2024 @@ -122,6 +125,7 @@ struct ext4_group_info {
2025 #ifdef DOUBLE_CHECK
2026 void *bb_bitmap;
2027 #endif
2028 + struct rw_semaphore alloc_sem;
2029 unsigned short bb_counters[];
2030 };
2031
2032 @@ -209,6 +213,11 @@ struct ext4_allocation_context {
2033 __u8 ac_op; /* operation, for history only */
2034 struct page *ac_bitmap_page;
2035 struct page *ac_buddy_page;
2036 + /*
2037 + * pointer to the held semaphore upon successful
2038 + * block allocation
2039 + */
2040 + struct rw_semaphore *alloc_semp;
2041 struct ext4_prealloc_space *ac_pa;
2042 struct ext4_locality_group *ac_lg;
2043 };
2044 @@ -242,6 +251,7 @@ struct ext4_buddy {
2045 struct super_block *bd_sb;
2046 __u16 bd_blkbits;
2047 ext4_group_t bd_group;
2048 + struct rw_semaphore *alloc_semp;
2049 };
2050 #define EXT4_MB_BITMAP(e4b) ((e4b)->bd_bitmap)
2051 #define EXT4_MB_BUDDY(e4b) ((e4b)->bd_buddy)
2052 @@ -251,8 +261,6 @@ static inline void ext4_mb_store_history(struct ext4_allocation_context *ac)
2053 {
2054 return;
2055 }
2056 -#else
2057 -static void ext4_mb_store_history(struct ext4_allocation_context *ac);
2058 #endif
2059
2060 #define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1)
2061 @@ -260,19 +268,6 @@ static void ext4_mb_store_history(struct ext4_allocation_context *ac);
2062 static struct proc_dir_entry *proc_root_ext4;
2063 struct buffer_head *read_block_bitmap(struct super_block *, ext4_group_t);
2064
2065 -static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
2066 - ext4_group_t group);
2067 -static void ext4_mb_poll_new_transaction(struct super_block *, handle_t *);
2068 -static void ext4_mb_free_committed_blocks(struct super_block *);
2069 -static void ext4_mb_return_to_preallocation(struct inode *inode,
2070 - struct ext4_buddy *e4b, sector_t block,
2071 - int count);
2072 -static void ext4_mb_put_pa(struct ext4_allocation_context *,
2073 - struct super_block *, struct ext4_prealloc_space *pa);
2074 -static int ext4_mb_init_per_dev_proc(struct super_block *sb);
2075 -static int ext4_mb_destroy_per_dev_proc(struct super_block *sb);
2076 -
2077 -
2078 static inline void ext4_lock_group(struct super_block *sb, ext4_group_t group)
2079 {
2080 struct ext4_group_info *grinfo = ext4_get_group_info(sb, group);
2081 @@ -297,7 +292,7 @@ static inline int ext4_is_group_locked(struct super_block *sb,
2082 &(grinfo->bb_state));
2083 }
2084
2085 -static ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb,
2086 +static inline ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb,
2087 struct ext4_free_extent *fex)
2088 {
2089 ext4_fsblk_t block;
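
struct ext4_free_data above embeds both an rb_node (the per-group index) and a list_head (the per-superblock transaction list), so a single allocation sits on two containers at once; rb_entry() and list_entry() recover the outer structure from the embedded member. A minimal sketch of the container_of() arithmetic they expand to, using an illustrative node type in place of rb_node:

#include <stddef.h>
#include <stdio.h>

struct node { struct node *left, *right; };  /* stand-in for rb_node */

struct free_data {
    struct node node;                        /* what the tree links together */
    unsigned int start_blk, count;
};

#define container_of(ptr, type, member) \
    ((type *)((char *)(ptr) - offsetof(type, member)))

int main(void)
{
    struct free_data fd = { .start_blk = 100, .count = 8 };
    struct node *n = &fd.node;               /* the tree only ever sees this */
    struct free_data *back = container_of(n, struct free_data, node);

    printf("%u %u\n", back->start_blk, back->count);  /* prints: 100 8 */
    return 0;
}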
2090 diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
2091 index d626533..4f3628f 100644
2092 --- a/fs/ext4/namei.c
2093 +++ b/fs/ext4/namei.c
2094 @@ -371,6 +371,8 @@ dx_probe(struct dentry *dentry, struct inode *dir,
2095 goto fail;
2096 }
2097 hinfo->hash_version = root->info.hash_version;
2098 + if (hinfo->hash_version <= DX_HASH_TEA)
2099 + hinfo->hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned;
2100 hinfo->seed = EXT4_SB(dir->i_sb)->s_hash_seed;
2101 if (dentry)
2102 ext4fs_dirhash(dentry->d_name.name, dentry->d_name.len, hinfo);
2103 @@ -640,6 +642,9 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
2104 dir = dir_file->f_path.dentry->d_inode;
2105 if (!(EXT4_I(dir)->i_flags & EXT4_INDEX_FL)) {
2106 hinfo.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version;
2107 + if (hinfo.hash_version <= DX_HASH_TEA)
2108 + hinfo.hash_version +=
2109 + EXT4_SB(dir->i_sb)->s_hash_unsigned;
2110 hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed;
2111 count = htree_dirblock_to_tree(dir_file, dir, 0, &hinfo,
2112 start_hash, start_minor_hash);
2113 @@ -1377,7 +1382,7 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
2114 struct fake_dirent *fde;
2115
2116 blocksize = dir->i_sb->s_blocksize;
2117 - dxtrace(printk("Creating index\n"));
2118 + dxtrace(printk(KERN_DEBUG "Creating index: inode %lu\n", dir->i_ino));
2119 retval = ext4_journal_get_write_access(handle, bh);
2120 if (retval) {
2121 ext4_std_error(dir->i_sb, retval);
2122 @@ -1386,6 +1391,20 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
2123 }
2124 root = (struct dx_root *) bh->b_data;
2125
2126 + /* The 0th block becomes the root; move the dirents out */
2127 + fde = &root->dotdot;
2128 + de = (struct ext4_dir_entry_2 *)((char *)fde +
2129 + ext4_rec_len_from_disk(fde->rec_len));
2130 + if ((char *) de >= (((char *) root) + blocksize)) {
2131 + ext4_error(dir->i_sb, __func__,
2132 + "invalid rec_len for '..' in inode %lu",
2133 + dir->i_ino);
2134 + brelse(bh);
2135 + return -EIO;
2136 + }
2137 + len = ((char *) root) + blocksize - (char *) de;
2138 +
2139 + /* Allocate new block for the 0th block's dirents */
2140 bh2 = ext4_append (handle, dir, &block, &retval);
2141 if (!(bh2)) {
2142 brelse(bh);
2143 @@ -1394,11 +1413,6 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
2144 EXT4_I(dir)->i_flags |= EXT4_INDEX_FL;
2145 data1 = bh2->b_data;
2146
2147 - /* The 0th block becomes the root, move the dirents out */
2148 - fde = &root->dotdot;
2149 - de = (struct ext4_dir_entry_2 *)((char *)fde +
2150 - ext4_rec_len_from_disk(fde->rec_len));
2151 - len = ((char *) root) + blocksize - (char *) de;
2152 memcpy (data1, de, len);
2153 de = (struct ext4_dir_entry_2 *) data1;
2154 top = data1 + len;
2155 @@ -1418,6 +1432,8 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
2156
2157 /* Initialize as for dx_probe */
2158 hinfo.hash_version = root->info.hash_version;
2159 + if (hinfo.hash_version <= DX_HASH_TEA)
2160 + hinfo.hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned;
2161 hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed;
2162 ext4fs_dirhash(name, namelen, &hinfo);
2163 frame = frames;
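
The hash_version adjustments in the namei.c hunks exist because the legacy htree hashes folded filename bytes through plain char, whose signedness differs across architectures, so the same name could land in different directory blocks on x86 and PowerPC. The superblock now records which convention the filesystem uses (s_hash_unsigned), and hash versions up to DX_HASH_TEA are shifted to the matching signed or unsigned variant. A toy fold (not the real TEA/half-MD4 hash) showing the divergence for bytes above 0x7f on a platform where char is signed:

#include <stdio.h>

/* Toy fold, NOT the real hash: only the char-vs-unsigned-char
 * promotion matters for the point being made. */
static unsigned int fold_signed(const char *s)
{
    unsigned int h = 0;
    while (*s)
        h = h * 31 + *s++;              /* byte promotes through (signed) char */
    return h;
}

static unsigned int fold_unsigned(const char *s)
{
    unsigned int h = 0;
    while (*s)
        h = h * 31 + (unsigned char)*s++;
    return h;
}

int main(void)
{
    const char name[] = "caf\xc3\xa9";  /* UTF-8 bytes above 0x7f */
    printf("signed=%08x unsigned=%08x\n",
           fold_signed(name), fold_unsigned(name));
    return 0;
}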
2164 diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
2165 index 3922a8b..0070431 100644
2166 --- a/fs/ext4/resize.c
2167 +++ b/fs/ext4/resize.c
2168 @@ -284,11 +284,9 @@ static int setup_new_group_blocks(struct super_block *sb,
2169 if ((err = extend_or_restart_transaction(handle, 2, bh)))
2170 goto exit_bh;
2171
2172 - mark_bitmap_end(input->blocks_count, EXT4_BLOCKS_PER_GROUP(sb),
2173 - bh->b_data);
2174 + mark_bitmap_end(input->blocks_count, sb->s_blocksize * 8, bh->b_data);
2175 ext4_journal_dirty_metadata(handle, bh);
2176 brelse(bh);
2177 -
2178 /* Mark unused entries in inode bitmap used */
2179 ext4_debug("clear inode bitmap %#04llx (+%llu)\n",
2180 input->inode_bitmap, input->inode_bitmap - start);
2181 @@ -297,7 +295,7 @@ static int setup_new_group_blocks(struct super_block *sb,
2182 goto exit_journal;
2183 }
2184
2185 - mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), EXT4_BLOCKS_PER_GROUP(sb),
2186 + mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), sb->s_blocksize * 8,
2187 bh->b_data);
2188 ext4_journal_dirty_metadata(handle, bh);
2189 exit_bh:
2190 @@ -747,6 +745,7 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
2191 struct inode *inode = NULL;
2192 handle_t *handle;
2193 int gdb_off, gdb_num;
2194 + int num_grp_locked = 0;
2195 int err, err2;
2196
2197 gdb_num = input->group / EXT4_DESC_PER_BLOCK(sb);
2198 @@ -787,6 +786,7 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
2199 }
2200 }
2201
2202 +
2203 if ((err = verify_group_input(sb, input)))
2204 goto exit_put;
2205
2206 @@ -855,15 +855,18 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
2207 * using the new disk blocks.
2208 */
2209
2210 + num_grp_locked = ext4_mb_get_buddy_cache_lock(sb, input->group);
2211 /* Update group descriptor block for new group */
2212 gdp = (struct ext4_group_desc *)((char *)primary->b_data +
2213 gdb_off * EXT4_DESC_SIZE(sb));
2214
2215 + memset(gdp, 0, EXT4_DESC_SIZE(sb));
2216 ext4_block_bitmap_set(sb, gdp, input->block_bitmap); /* LV FIXME */
2217 ext4_inode_bitmap_set(sb, gdp, input->inode_bitmap); /* LV FIXME */
2218 ext4_inode_table_set(sb, gdp, input->inode_table); /* LV FIXME */
2219 gdp->bg_free_blocks_count = cpu_to_le16(input->free_blocks_count);
2220 gdp->bg_free_inodes_count = cpu_to_le16(EXT4_INODES_PER_GROUP(sb));
2221 + gdp->bg_flags = cpu_to_le16(EXT4_BG_INODE_ZEROED);
2222 gdp->bg_checksum = ext4_group_desc_csum(sbi, input->group, gdp);
2223
2224 /*
2225 @@ -871,9 +874,11 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
2226 * descriptor
2227 */
2228 if (test_opt(sb, MBALLOC)) {
2229 - err = ext4_mb_add_more_groupinfo(sb, input->group, gdp);
2230 - if (err)
2231 + err = ext4_mb_add_groupinfo(sb, input->group, gdp);
2232 + if (err) {
2233 + ext4_mb_put_buddy_cache_lock(sb, input->group, num_grp_locked);
2234 goto exit_journal;
2235 + }
2236 }
2237 /*
2238 * Make the new blocks and inodes valid next. We do this before
2239 @@ -915,6 +920,7 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
2240
2241 /* Update the global fs size fields */
2242 sbi->s_groups_count++;
2243 + ext4_mb_put_buddy_cache_lock(sb, input->group, num_grp_locked);
2244
2245 ext4_journal_dirty_metadata(handle, primary);
2246
2247 @@ -976,9 +982,7 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
2248 struct buffer_head * bh;
2249 handle_t *handle;
2250 int err;
2251 - unsigned long freed_blocks;
2252 ext4_group_t group;
2253 - struct ext4_group_info *grp;
2254
2255 /* We don't need to worry about locking wrt other resizers just
2256 * yet: we're going to revalidate es->s_blocks_count after
2257 @@ -1077,50 +1081,13 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
2258 unlock_super(sb);
2259 ext4_debug("freeing blocks %llu through %llu\n", o_blocks_count,
2260 o_blocks_count + add);
2261 - ext4_free_blocks_sb(handle, sb, o_blocks_count, add, &freed_blocks);
2262 + /* We add the blocks to the bitmap and set the group's need-init bit */
2263 + ext4_add_groupblocks(handle, sb, o_blocks_count, add);
2264 ext4_debug("freed blocks %llu through %llu\n", o_blocks_count,
2265 o_blocks_count + add);
2266 if ((err = ext4_journal_stop(handle)))
2267 goto exit_put;
2268
2269 - /*
2270 - * Mark mballoc pages as not up to date so that they will be updated
2271 - * next time they are loaded by ext4_mb_load_buddy.
2272 - */
2273 - if (test_opt(sb, MBALLOC)) {
2274 - struct ext4_sb_info *sbi = EXT4_SB(sb);
2275 - struct inode *inode = sbi->s_buddy_cache;
2276 - int blocks_per_page;
2277 - int block;
2278 - int pnum;
2279 - struct page *page;
2280 -
2281 - /* Set buddy page as not up to date */
2282 - blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
2283 - block = group * 2;
2284 - pnum = block / blocks_per_page;
2285 - page = find_get_page(inode->i_mapping, pnum);
2286 - if (page != NULL) {
2287 - ClearPageUptodate(page);
2288 - page_cache_release(page);
2289 - }
2290 -
2291 - /* Set bitmap page as not up to date */
2292 - block++;
2293 - pnum = block / blocks_per_page;
2294 - page = find_get_page(inode->i_mapping, pnum);
2295 - if (page != NULL) {
2296 - ClearPageUptodate(page);
2297 - page_cache_release(page);
2298 - }
2299 -
2300 - /* Get the info on the last group */
2301 - grp = ext4_get_group_info(sb, group);
2302 -
2303 - /* Update free blocks in group info */
2304 - ext4_mb_update_group_info(grp, add);
2305 - }
2306 -
2307 if (test_opt(sb, DEBUG))
2308 printk(KERN_DEBUG "EXT4-fs: extended group to %llu blocks\n",
2309 ext4_blocks_count(es));
2310 diff --git a/fs/ext4/super.c b/fs/ext4/super.c
2311 index 7726e8e..5e4491d 100644
2312 --- a/fs/ext4/super.c
2313 +++ b/fs/ext4/super.c
2314 @@ -1493,7 +1493,6 @@ static int ext4_fill_flex_info(struct super_block *sb)
2315 ext4_group_t flex_group_count;
2316 ext4_group_t flex_group;
2317 int groups_per_flex = 0;
2318 - __u64 block_bitmap = 0;
2319 int i;
2320
2321 if (!sbi->s_es->s_log_groups_per_flex) {
2322 @@ -1516,9 +1515,6 @@ static int ext4_fill_flex_info(struct super_block *sb)
2323 goto failed;
2324 }
2325
2326 - gdp = ext4_get_group_desc(sb, 1, &bh);
2327 - block_bitmap = ext4_block_bitmap(sb, gdp) - 1;
2328 -
2329 for (i = 0; i < sbi->s_groups_count; i++) {
2330 gdp = ext4_get_group_desc(sb, i, &bh);
2331
2332 @@ -1920,8 +1916,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2333 struct inode *root;
2334 int ret = -EINVAL;
2335 int blocksize;
2336 - int db_count;
2337 - int i;
2338 + unsigned int db_count;
2339 + unsigned int i;
2340 int needs_recovery;
2341 __le32 features;
2342 __u64 blocks_count;
2343 @@ -2172,6 +2168,18 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2344 for (i = 0; i < 4; i++)
2345 sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]);
2346 sbi->s_def_hash_version = es->s_def_hash_version;
2347 + i = le32_to_cpu(es->s_flags);
2348 + if (i & EXT2_FLAGS_UNSIGNED_HASH)
2349 + sbi->s_hash_unsigned = 3;
2350 + else if ((i & EXT2_FLAGS_SIGNED_HASH) == 0) {
2351 +#ifdef __CHAR_UNSIGNED__
2352 + es->s_flags |= cpu_to_le32(EXT2_FLAGS_UNSIGNED_HASH);
2353 + sbi->s_hash_unsigned = 3;
2354 +#else
2355 + es->s_flags |= cpu_to_le32(EXT2_FLAGS_SIGNED_HASH);
2356 +#endif
2357 + sb->s_dirt = 1;
2358 + }
2359
2360 if (sbi->s_blocks_per_group > blocksize * 8) {
2361 printk(KERN_ERR
2362 @@ -2199,20 +2207,30 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2363 if (EXT4_BLOCKS_PER_GROUP(sb) == 0)
2364 goto cantfind_ext4;
2365
2366 - /* ensure blocks_count calculation below doesn't sign-extend */
2367 - if (ext4_blocks_count(es) + EXT4_BLOCKS_PER_GROUP(sb) <
2368 - le32_to_cpu(es->s_first_data_block) + 1) {
2369 - printk(KERN_WARNING "EXT4-fs: bad geometry: block count %llu, "
2370 - "first data block %u, blocks per group %lu\n",
2371 - ext4_blocks_count(es),
2372 - le32_to_cpu(es->s_first_data_block),
2373 - EXT4_BLOCKS_PER_GROUP(sb));
2374 + /*
2375 + * It makes no sense for the first data block to be beyond the end
2376 + * of the filesystem.
2377 + */
2378 + if (le32_to_cpu(es->s_first_data_block) >= ext4_blocks_count(es)) {
2379 + printk(KERN_WARNING "EXT4-fs: bad geometry: first data "
2380 + "block %u is beyond end of filesystem (%llu)\n",
2381 + le32_to_cpu(es->s_first_data_block),
2382 + ext4_blocks_count(es));
2383 goto failed_mount;
2384 }
2385 blocks_count = (ext4_blocks_count(es) -
2386 le32_to_cpu(es->s_first_data_block) +
2387 EXT4_BLOCKS_PER_GROUP(sb) - 1);
2388 do_div(blocks_count, EXT4_BLOCKS_PER_GROUP(sb));
2389 + if (blocks_count > ((uint64_t)1<<32) - EXT4_DESC_PER_BLOCK(sb)) {
2390 + printk(KERN_WARNING "EXT4-fs: groups count too large: %u "
2391 + "(block count %llu, first data block %u, "
2392 + "blocks per group %lu)\n", sbi->s_groups_count,
2393 + ext4_blocks_count(es),
2394 + le32_to_cpu(es->s_first_data_block),
2395 + EXT4_BLOCKS_PER_GROUP(sb));
2396 + goto failed_mount;
2397 + }
2398 sbi->s_groups_count = blocks_count;
2399 db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) /
2400 EXT4_DESC_PER_BLOCK(sb);
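
The new geometry checks in ext4_fill_super() reject a first data block past the end of the device and a group count that would overflow the 32-bit group numbers used elsewhere. A worked version of the same ceiling division and bound, with made-up but plausible values:

#include <inttypes.h>
#include <stdio.h>

int main(void)
{
    uint64_t blocks_count     = 4294967296ULL;  /* 2^32 blocks: 16 TiB at 4 KiB */
    uint32_t first_data_block = 0;
    uint32_t blocks_per_group = 32768;          /* 8 * 4096-byte blocksize */
    uint32_t desc_per_block   = 128;            /* 4096 / 32-byte descriptors */

    /* ceil((blocks - first_data_block) / blocks_per_group) */
    uint64_t groups = (blocks_count - first_data_block +
                       blocks_per_group - 1) / blocks_per_group;

    /* ext4_fill_super() refuses to mount when this bound is exceeded */
    uint64_t limit = ((uint64_t)1 << 32) - desc_per_block;

    printf("groups=%" PRIu64 " limit=%" PRIu64 " ok=%d\n",
           groups, limit, groups <= limit);
    return 0;
}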
2401 diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
2402 index 6caf22d..b1f0756 100644
2403 --- a/fs/jbd2/commit.c
2404 +++ b/fs/jbd2/commit.c
2405 @@ -24,6 +24,7 @@
2406 #include <linux/crc32.h>
2407 #include <linux/writeback.h>
2408 #include <linux/backing-dev.h>
2409 +#include <linux/bio.h>
2410
2411 /*
2412 * Default IO end handler for temporary BJ_IO buffer_heads.
2413 @@ -170,12 +171,34 @@ static int journal_submit_commit_record(journal_t *journal,
2414 * This function along with journal_submit_commit_record
2415 * allows to write the commit record asynchronously.
2416 */
2417 -static int journal_wait_on_commit_record(struct buffer_head *bh)
2418 +static int journal_wait_on_commit_record(journal_t *journal,
2419 + struct buffer_head *bh)
2420 {
2421 int ret = 0;
2422
2423 +retry:
2424 clear_buffer_dirty(bh);
2425 wait_on_buffer(bh);
2426 + if (buffer_eopnotsupp(bh) && (journal->j_flags & JBD2_BARRIER)) {
2427 + printk(KERN_WARNING
2428 + "JBD2: wait_on_commit_record: sync failed on %s - "
2429 + "disabling barriers\n", journal->j_devname);
2430 + spin_lock(&journal->j_state_lock);
2431 + journal->j_flags &= ~JBD2_BARRIER;
2432 + spin_unlock(&journal->j_state_lock);
2433 +
2434 + lock_buffer(bh);
2435 + clear_buffer_dirty(bh);
2436 + set_buffer_uptodate(bh);
2437 + bh->b_end_io = journal_end_buffer_io_sync;
2438 +
2439 + ret = submit_bh(WRITE_SYNC, bh);
2440 + if (ret) {
2441 + unlock_buffer(bh);
2442 + return ret;
2443 + }
2444 + goto retry;
2445 + }
2446
2447 if (unlikely(!buffer_uptodate(bh)))
2448 ret = -EIO;
2449 @@ -795,7 +818,7 @@ wait_for_iobuf:
2450 __jbd2_journal_abort_hard(journal);
2451 }
2452 if (!err && !is_journal_aborted(journal))
2453 - err = journal_wait_on_commit_record(cbh);
2454 + err = journal_wait_on_commit_record(journal, cbh);
2455
2456 if (err)
2457 jbd2_journal_abort(journal, err);
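
journal_wait_on_commit_record() now recovers from barrier-incapable devices: if the commit block comes back with EOPNOTSUPP, the JBD2_BARRIER flag is cleared for the whole journal and the same buffer is resubmitted as an ordinary write. A userspace sketch of that retry-without-barrier control flow, where submit() simulates submit_bh() and the device's capability:

#include <errno.h>
#include <stdio.h>

#define JF_BARRIER 1

static int device_supports_barriers = 0;     /* pretend the device lacks them */

static int submit(int use_barrier)
{
    if (use_barrier && !device_supports_barriers)
        return -EOPNOTSUPP;
    return 0;
}

static int write_commit_record(unsigned int *jflags)
{
    int ret;
retry:
    ret = submit(*jflags & JF_BARRIER);
    if (ret == -EOPNOTSUPP && (*jflags & JF_BARRIER)) {
        fprintf(stderr, "barrier sync failed - disabling barriers\n");
        *jflags &= ~JF_BARRIER;              /* journal-wide, like JBD2_BARRIER */
        goto retry;                          /* resubmit as a plain write */
    }
    return ret;
}

int main(void)
{
    unsigned int jflags = JF_BARRIER;
    return write_commit_record(&jflags);
}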
2458 diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
2459 index 66c3499..0e1bd70 100644
2460 --- a/include/linux/jbd2.h
2461 +++ b/include/linux/jbd2.h
2462 @@ -308,7 +308,8 @@ void buffer_assertion_failure(struct buffer_head *bh);
2463 int val = (expr); \
2464 if (!val) { \
2465 printk(KERN_ERR \
2466 - "EXT3-fs unexpected failure: %s;\n",# expr); \
2467 + "JBD2 unexpected failure: %s: %s;\n", \
2468 + __func__, #expr); \
2469 printk(KERN_ERR why "\n"); \
2470 } \
2471 val; \
2472 @@ -329,6 +330,7 @@ enum jbd_state_bits {
2473 BH_State, /* Pins most journal_head state */
2474 BH_JournalHead, /* Pins bh->b_private and jh->b_bh */
2475 BH_Unshadow, /* Dummy bit, for BJ_Shadow wakeup filtering */
2476 + BH_JBDPrivateStart, /* First bit available for private use by FS */
2477 };
2478
2479 BUFFER_FNS(JBD, jbd)
2480 diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
2481 index 794e546..e7e7c7d 100644
2482 --- a/include/linux/pci_ids.h
2483 +++ b/include/linux/pci_ids.h
2484 @@ -1301,6 +1301,7 @@
2485 #define PCI_DEVICE_ID_VIA_VT3351 0x0351
2486 #define PCI_DEVICE_ID_VIA_VT3364 0x0364
2487 #define PCI_DEVICE_ID_VIA_8371_0 0x0391
2488 +#define PCI_DEVICE_ID_VIA_6415 0x0415
2489 #define PCI_DEVICE_ID_VIA_8501_0 0x0501
2490 #define PCI_DEVICE_ID_VIA_82C561 0x0561
2491 #define PCI_DEVICE_ID_VIA_82C586_1 0x0571
2492 diff --git a/include/linux/pid.h b/include/linux/pid.h
2493 index d7e98ff..93997c9 100644
2494 --- a/include/linux/pid.h
2495 +++ b/include/linux/pid.h
2496 @@ -123,6 +123,24 @@ extern struct pid *alloc_pid(struct pid_namespace *ns);
2497 extern void free_pid(struct pid *pid);
2498
2499 /*
2500 + * ns_of_pid() returns the pid namespace in which the specified pid was
2501 + * allocated.
2502 + *
2503 + * NOTE:
2504 + * ns_of_pid() is expected to be called for a process (task) that has
2505 + * an attached 'struct pid' (see attach_pid(), detach_pid()), i.e. @pid
2506 + * is expected to be non-NULL. If @pid is NULL, caller should handle
2507 + * the resulting NULL pid-ns.
2508 + */
2509 +static inline struct pid_namespace *ns_of_pid(struct pid *pid)
2510 +{
2511 + struct pid_namespace *ns = NULL;
2512 + if (pid)
2513 + ns = pid->numbers[pid->level].ns;
2514 + return ns;
2515 +}
2516 +
2517 +/*
2518 * the helpers to get the pid's id seen from different namespaces
2519 *
2520 * pid_nr() : global id, i.e. the id seen from the init namespace;
2521 diff --git a/ipc/mqueue.c b/ipc/mqueue.c
2522 index a58bfad..ca502aa 100644
2523 --- a/ipc/mqueue.c
2524 +++ b/ipc/mqueue.c
2525 @@ -498,7 +498,8 @@ static void __do_notify(struct mqueue_inode_info *info)
2526 sig_i.si_errno = 0;
2527 sig_i.si_code = SI_MESGQ;
2528 sig_i.si_value = info->notify.sigev_value;
2529 - sig_i.si_pid = task_tgid_vnr(current);
2530 + sig_i.si_pid = task_tgid_nr_ns(current,
2531 + ns_of_pid(info->notify_owner));
2532 sig_i.si_uid = current->uid;
2533
2534 kill_pid_info(info->notify.sigev_signo,