Annotation of /trunk/kernel26-alx/patches-2.6.27-r3/0118-2.6.27.19-all-fixes.patch
Parent Directory | Revision Log
Revision 1176 -
(hide annotations)
(download)
Thu Oct 14 15:11:06 2010 UTC (13 years, 11 months ago) by niro
File size: 80833 byte(s)
-2.6.27-alx-r3: new magellan 0.5.2 kernel
1 | niro | 1176 | diff --git a/arch/powerpc/kernel/align.c b/arch/powerpc/kernel/align.c |
2 | index 5af4e9b..ada0692 100644 | ||
3 | --- a/arch/powerpc/kernel/align.c | ||
4 | +++ b/arch/powerpc/kernel/align.c | ||
5 | @@ -646,11 +646,16 @@ static int emulate_vsx(unsigned char __user *addr, unsigned int reg, | ||
6 | unsigned int areg, struct pt_regs *regs, | ||
7 | unsigned int flags, unsigned int length) | ||
8 | { | ||
9 | - char *ptr = (char *) &current->thread.TS_FPR(reg); |
10 | + char *ptr; | ||
11 | int ret = 0; | ||
12 | |||
13 | flush_vsx_to_thread(current); | ||
14 | |||
15 | + if (reg < 32) | ||
16 | + ptr = (char *) &current->thread.TS_FPR(reg); | ||
17 | + else | ||
18 | + ptr = (char *) &current->thread.vr[reg - 32]; | ||
19 | + | ||
20 | if (flags & ST) | ||
21 | ret = __copy_to_user(addr, ptr, length); | ||
22 | else { | ||
23 | diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c | ||
24 | index 5b719a0..7c3b8dc 100644 | ||
25 | --- a/arch/x86/mm/pageattr.c | ||
26 | +++ b/arch/x86/mm/pageattr.c | ||
27 | @@ -619,6 +619,13 @@ static int __change_page_attr(struct cpa_data *cpa, int primary) | ||
28 | unsigned int level; | ||
29 | pte_t *kpte, old_pte; | ||
30 | |||
31 | + /* | ||
32 | + * If we're called with lazy mmu updates enabled, the | ||
33 | + * in-memory pte state may be stale. Flush pending updates to | ||
34 | + * bring them up to date. | ||
35 | + */ | ||
36 | + arch_flush_lazy_mmu_mode(); | ||
37 | + | ||
38 | repeat: | ||
39 | kpte = lookup_address(address, &level); | ||
40 | if (!kpte) | ||
41 | @@ -836,6 +843,13 @@ static int change_page_attr_set_clr(unsigned long addr, int numpages, | ||
42 | else | ||
43 | cpa_flush_all(cache); | ||
44 | |||
45 | + /* | ||
46 | + * If we've been called with lazy mmu updates enabled, then | ||
47 | + * make sure that everything gets flushed out before we | ||
48 | + * return. | ||
49 | + */ | ||
50 | + arch_flush_lazy_mmu_mode(); | ||
51 | + | ||
52 | out: | ||
53 | cpa_fill_pool(NULL); | ||
54 | |||
55 | diff --git a/drivers/ata/pata_via.c b/drivers/ata/pata_via.c | ||
56 | index c5be6a1..b6f55e8 100644 | ||
57 | --- a/drivers/ata/pata_via.c | ||
58 | +++ b/drivers/ata/pata_via.c | ||
59 | @@ -111,7 +111,8 @@ static const struct via_isa_bridge { | ||
60 | { "vt8237s", PCI_DEVICE_ID_VIA_8237S, 0x00, 0x2f, VIA_UDMA_133 | VIA_BAD_AST }, | ||
61 | { "vt8251", PCI_DEVICE_ID_VIA_8251, 0x00, 0x2f, VIA_UDMA_133 | VIA_BAD_AST }, | ||
62 | { "cx700", PCI_DEVICE_ID_VIA_CX700, 0x00, 0x2f, VIA_UDMA_133 | VIA_BAD_AST | VIA_SATA_PATA }, | ||
63 | - { "vt6410", PCI_DEVICE_ID_VIA_6410, 0x00, 0x2f, VIA_UDMA_133 | VIA_BAD_AST | VIA_NO_ENABLES}, | ||
64 | + { "vt6410", PCI_DEVICE_ID_VIA_6410, 0x00, 0x2f, VIA_UDMA_133 | VIA_BAD_AST | VIA_NO_ENABLES }, | ||
65 | + { "vt6415", PCI_DEVICE_ID_VIA_6415, 0x00, 0x2f, VIA_UDMA_133 | VIA_BAD_AST | VIA_NO_ENABLES }, | ||
66 | { "vt8237a", PCI_DEVICE_ID_VIA_8237A, 0x00, 0x2f, VIA_UDMA_133 | VIA_BAD_AST }, | ||
67 | { "vt8237", PCI_DEVICE_ID_VIA_8237, 0x00, 0x2f, VIA_UDMA_133 | VIA_BAD_AST }, | ||
68 | { "vt8235", PCI_DEVICE_ID_VIA_8235, 0x00, 0x2f, VIA_UDMA_133 | VIA_BAD_AST }, | ||
69 | @@ -594,6 +595,7 @@ static int via_reinit_one(struct pci_dev *pdev) | ||
70 | #endif | ||
71 | |||
72 | static const struct pci_device_id via[] = { | ||
73 | + { PCI_VDEVICE(VIA, 0x0415), }, | ||
74 | { PCI_VDEVICE(VIA, 0x0571), }, | ||
75 | { PCI_VDEVICE(VIA, 0x0581), }, | ||
76 | { PCI_VDEVICE(VIA, 0x1571), }, | ||
77 | diff --git a/drivers/ata/sata_nv.c b/drivers/ata/sata_nv.c | ||
78 | index 89e3b7f..8b6f9c0 100644 | ||
79 | --- a/drivers/ata/sata_nv.c | ||
80 | +++ b/drivers/ata/sata_nv.c | ||
81 | @@ -421,19 +421,21 @@ static struct ata_port_operations nv_generic_ops = { | ||
82 | .hardreset = ATA_OP_NULL, | ||
83 | }; | ||
84 | |||
85 | -/* OSDL bz3352 reports that nf2/3 controllers can't determine device | ||
86 | - * signature reliably. Also, the following thread reports detection | ||
87 | - * failure on cold boot with the standard debouncing timing. | ||
88 | +/* nf2 is ripe with hardreset related problems. | ||
89 | + * | ||
90 | + * kernel bz#3352 reports nf2/3 controllers can't determine device | ||
91 | + * signature reliably. The following thread reports detection failure | ||
92 | + * on cold boot with the standard debouncing timing. | ||
93 | * | ||
94 | * http://thread.gmane.org/gmane.linux.ide/34098 | ||
95 | * | ||
96 | - * Debounce with hotplug timing and request follow-up SRST. | ||
97 | + * And bz#12176 reports that hardreset simply doesn't work on nf2. | ||
98 | + * Give up on it and just don't do hardreset. | ||
99 | */ | ||
100 | static struct ata_port_operations nv_nf2_ops = { | ||
101 | - .inherits = &nv_common_ops, | ||
102 | + .inherits = &nv_generic_ops, | ||
103 | .freeze = nv_nf2_freeze, | ||
104 | .thaw = nv_nf2_thaw, | ||
105 | - .hardreset = nv_noclassify_hardreset, | ||
106 | }; | ||
107 | |||
108 | /* For initial probing after boot and hot plugging, hardreset mostly | ||
109 | diff --git a/drivers/bluetooth/btsdio.c b/drivers/bluetooth/btsdio.c | ||
110 | index 58630cc..f2ada0c 100644 | ||
111 | --- a/drivers/bluetooth/btsdio.c | ||
112 | +++ b/drivers/bluetooth/btsdio.c | ||
113 | @@ -91,6 +91,7 @@ static int btsdio_tx_packet(struct btsdio_data *data, struct sk_buff *skb) | ||
114 | |||
115 | err = sdio_writesb(data->func, REG_TDAT, skb->data, skb->len); | ||
116 | if (err < 0) { | ||
117 | + skb_pull(skb, 4); | ||
118 | sdio_writeb(data->func, 0x01, REG_PC_WRT, NULL); | ||
119 | return err; | ||
120 | } | ||
121 | @@ -152,7 +153,7 @@ static int btsdio_rx_packet(struct btsdio_data *data) | ||
122 | |||
123 | err = sdio_readsb(data->func, skb->data, REG_RDAT, len - 4); | ||
124 | if (err < 0) { | ||
125 | - kfree(skb); | ||
126 | + kfree_skb(skb); | ||
127 | return err; | ||
128 | } | ||
129 | |||
130 | diff --git a/drivers/net/3c505.c b/drivers/net/3c505.c | ||
131 | index fdfb2b2..ae8e36c 100644 | ||
132 | --- a/drivers/net/3c505.c | ||
133 | +++ b/drivers/net/3c505.c | ||
134 | @@ -493,21 +493,27 @@ static bool receive_pcb(struct net_device *dev, pcb_struct * pcb) | ||
135 | } | ||
136 | /* read the data */ | ||
137 | spin_lock_irqsave(&adapter->lock, flags); | ||
138 | - i = 0; | ||
139 | - do { | ||
140 | - j = 0; | ||
141 | - while (((stat = get_status(dev->base_addr)) & ACRF) == 0 && j++ < 20000); | ||
142 | - pcb->data.raw[i++] = inb_command(dev->base_addr); | ||
143 | - if (i > MAX_PCB_DATA) | ||
144 | - INVALID_PCB_MSG(i); | ||
145 | - } while ((stat & ASF_PCB_MASK) != ASF_PCB_END && j < 20000); | ||
146 | + for (i = 0; i < MAX_PCB_DATA; i++) { | ||
147 | + for (j = 0; j < 20000; j++) { | ||
148 | + stat = get_status(dev->base_addr); | ||
149 | + if (stat & ACRF) | ||
150 | + break; | ||
151 | + } | ||
152 | + pcb->data.raw[i] = inb_command(dev->base_addr); | ||
153 | + if ((stat & ASF_PCB_MASK) == ASF_PCB_END || j >= 20000) | ||
154 | + break; | ||
155 | + } | ||
156 | spin_unlock_irqrestore(&adapter->lock, flags); | ||
157 | + if (i >= MAX_PCB_DATA) { | ||
158 | + INVALID_PCB_MSG(i); | ||
159 | + return false; | ||
160 | + } | ||
161 | if (j >= 20000) { | ||
162 | TIMEOUT_MSG(__LINE__); | ||
163 | return false; | ||
164 | } | ||
165 | - /* woops, the last "data" byte was really the length! */ | ||
166 | - total_length = pcb->data.raw[--i]; | ||
167 | + /* the last "data" byte was really the length! */ | ||
168 | + total_length = pcb->data.raw[i]; | ||
169 | |||
170 | /* safety check total length vs data length */ | ||
171 | if (total_length != (pcb->length + 2)) { | ||
172 | diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c | ||
173 | index c3edcdc..2d90a3c 100644 | ||
174 | --- a/drivers/pci/intel-iommu.c | ||
175 | +++ b/drivers/pci/intel-iommu.c | ||
176 | @@ -72,6 +72,8 @@ static struct deferred_flush_tables *deferred_flush; | ||
177 | /* bitmap for indexing intel_iommus */ | ||
178 | static int g_num_of_iommus; | ||
179 | |||
180 | +static int rwbf_quirk = 0; | ||
181 | + | ||
182 | static DEFINE_SPINLOCK(async_umap_flush_lock); | ||
183 | static LIST_HEAD(unmaps_to_do); | ||
184 | |||
185 | @@ -527,7 +529,7 @@ static void iommu_flush_write_buffer(struct intel_iommu *iommu) | ||
186 | u32 val; | ||
187 | unsigned long flag; | ||
188 | |||
189 | - if (!cap_rwbf(iommu->cap)) | ||
190 | + if (!rwbf_quirk && !cap_rwbf(iommu->cap)) | ||
191 | return; | ||
192 | val = iommu->gcmd | DMA_GCMD_WBF; | ||
193 | |||
194 | @@ -2453,3 +2455,12 @@ int __init intel_iommu_init(void) | ||
195 | return 0; | ||
196 | } | ||
197 | |||
198 | +static void __devinit quirk_iommu_rwbf(struct pci_dev *dev) | ||
199 | +{ | ||
200 | + /* Mobile 4 Series Chipset neglects to set RWBF capability, | ||
201 | + but needs it */ | ||
202 | + printk(KERN_INFO "DMAR: Forcing write-buffer flush capability\n"); | ||
203 | + rwbf_quirk = 1; | ||
204 | +} | ||
205 | + | ||
206 | +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf); | ||
207 | diff --git a/drivers/scsi/libiscsi.c b/drivers/scsi/libiscsi.c | ||
208 | index 299e075..55ac5c3 100644 | ||
209 | --- a/drivers/scsi/libiscsi.c | ||
210 | +++ b/drivers/scsi/libiscsi.c | ||
211 | @@ -1844,6 +1844,7 @@ void iscsi_pool_free(struct iscsi_pool *q) | ||
212 | kfree(q->pool[i]); | ||
213 | if (q->pool) | ||
214 | kfree(q->pool); | ||
215 | + kfree(q->queue); | ||
216 | } | ||
217 | EXPORT_SYMBOL_GPL(iscsi_pool_free); | ||
218 | |||
219 | diff --git a/fs/ext2/super.c b/fs/ext2/super.c | ||
220 | index fd88c7b..2ebc0c4 100644 | ||
221 | --- a/fs/ext2/super.c | ||
222 | +++ b/fs/ext2/super.c | ||
223 | @@ -1177,9 +1177,12 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data) | ||
224 | es = sbi->s_es; | ||
225 | if (((sbi->s_mount_opt & EXT2_MOUNT_XIP) != | ||
226 | (old_mount_opt & EXT2_MOUNT_XIP)) && | ||
227 | - invalidate_inodes(sb)) | ||
228 | - ext2_warning(sb, __func__, "busy inodes while remounting "\ | ||
229 | - "xip remain in cache (no functional problem)"); | ||
230 | + invalidate_inodes(sb)) { | ||
231 | + ext2_warning(sb, __func__, "refusing change of xip flag " | ||
232 | + "with busy inodes while remounting"); | ||
233 | + sbi->s_mount_opt &= ~EXT2_MOUNT_XIP; | ||
234 | + sbi->s_mount_opt |= old_mount_opt & EXT2_MOUNT_XIP; | ||
235 | + } | ||
236 | if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) | ||
237 | return 0; | ||
238 | if (*flags & MS_RDONLY) { | ||
239 | diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c | ||
240 | index e9fa960..8b7c776 100644 | ||
241 | --- a/fs/ext4/balloc.c | ||
242 | +++ b/fs/ext4/balloc.c | ||
243 | @@ -20,6 +20,7 @@ | ||
244 | #include "ext4.h" | ||
245 | #include "ext4_jbd2.h" | ||
246 | #include "group.h" | ||
247 | +#include "mballoc.h" | ||
248 | |||
249 | /* | ||
250 | * balloc.c contains the blocks allocation and deallocation routines | ||
251 | @@ -318,18 +319,41 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group) | ||
252 | block_group, bitmap_blk); | ||
253 | return NULL; | ||
254 | } | ||
255 | - if (bh_uptodate_or_lock(bh)) | ||
256 | + | ||
257 | + if (bitmap_uptodate(bh)) | ||
258 | return bh; | ||
259 | |||
260 | + lock_buffer(bh); | ||
261 | + if (bitmap_uptodate(bh)) { | ||
262 | + unlock_buffer(bh); | ||
263 | + return bh; | ||
264 | + } | ||
265 | spin_lock(sb_bgl_lock(EXT4_SB(sb), block_group)); | ||
266 | if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { | ||
267 | ext4_init_block_bitmap(sb, bh, block_group, desc); | ||
268 | + set_bitmap_uptodate(bh); | ||
269 | set_buffer_uptodate(bh); | ||
270 | unlock_buffer(bh); | ||
271 | spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group)); | ||
272 | return bh; | ||
273 | } | ||
274 | spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group)); | ||
275 | + if (buffer_uptodate(bh)) { | ||
276 | + /* | ||
277 | + * if not uninit if bh is uptodate, | ||
278 | + * bitmap is also uptodate | ||
279 | + */ | ||
280 | + set_bitmap_uptodate(bh); | ||
281 | + unlock_buffer(bh); | ||
282 | + return bh; | ||
283 | + } | ||
284 | + /* | ||
285 | + * submit the buffer_head for read. We can | ||
286 | + * safely mark the bitmap as uptodate now. | ||
287 | + * We do it here so the bitmap uptodate bit | ||
288 | + * get set with buffer lock held. | ||
289 | + */ | ||
290 | + set_bitmap_uptodate(bh); | ||
291 | if (bh_submit_read(bh) < 0) { | ||
292 | put_bh(bh); | ||
293 | ext4_error(sb, __func__, | ||
294 | @@ -837,6 +861,136 @@ error_return: | ||
295 | } | ||
296 | |||
297 | /** | ||
298 | + * ext4_add_groupblocks() -- Add given blocks to an existing group | ||
299 | + * @handle: handle to this transaction | ||
300 | + * @sb: super block | ||
301 | + * @block: start physcial block to add to the block group | ||
302 | + * @count: number of blocks to free | ||
303 | + * | ||
304 | + * This marks the blocks as free in the bitmap. We ask the | ||
305 | + * mballoc to reload the buddy after this by setting group | ||
306 | + * EXT4_GROUP_INFO_NEED_INIT_BIT flag | ||
307 | + */ | ||
308 | +void ext4_add_groupblocks(handle_t *handle, struct super_block *sb, | ||
309 | + ext4_fsblk_t block, unsigned long count) | ||
310 | +{ | ||
311 | + struct buffer_head *bitmap_bh = NULL; | ||
312 | + struct buffer_head *gd_bh; | ||
313 | + ext4_group_t block_group; | ||
314 | + ext4_grpblk_t bit; | ||
315 | + unsigned long i; | ||
316 | + struct ext4_group_desc *desc; | ||
317 | + struct ext4_super_block *es; | ||
318 | + struct ext4_sb_info *sbi; | ||
319 | + int err = 0, ret; | ||
320 | + ext4_grpblk_t blocks_freed; | ||
321 | + struct ext4_group_info *grp; | ||
322 | + | ||
323 | + sbi = EXT4_SB(sb); | ||
324 | + es = sbi->s_es; | ||
325 | + ext4_debug("Adding block(s) %llu-%llu\n", block, block + count - 1); | ||
326 | + | ||
327 | + ext4_get_group_no_and_offset(sb, block, &block_group, &bit); | ||
328 | + grp = ext4_get_group_info(sb, block_group); | ||
329 | + /* | ||
330 | + * Check to see if we are freeing blocks across a group | ||
331 | + * boundary. | ||
332 | + */ | ||
333 | + if (bit + count > EXT4_BLOCKS_PER_GROUP(sb)) | ||
334 | + goto error_return; | ||
335 | + | ||
336 | + bitmap_bh = ext4_read_block_bitmap(sb, block_group); | ||
337 | + if (!bitmap_bh) | ||
338 | + goto error_return; | ||
339 | + desc = ext4_get_group_desc(sb, block_group, &gd_bh); | ||
340 | + if (!desc) | ||
341 | + goto error_return; | ||
342 | + | ||
343 | + if (in_range(ext4_block_bitmap(sb, desc), block, count) || | ||
344 | + in_range(ext4_inode_bitmap(sb, desc), block, count) || | ||
345 | + in_range(block, ext4_inode_table(sb, desc), sbi->s_itb_per_group) || | ||
346 | + in_range(block + count - 1, ext4_inode_table(sb, desc), | ||
347 | + sbi->s_itb_per_group)) { | ||
348 | + ext4_error(sb, __func__, | ||
349 | + "Adding blocks in system zones - " | ||
350 | + "Block = %llu, count = %lu", | ||
351 | + block, count); | ||
352 | + goto error_return; | ||
353 | + } | ||
354 | + | ||
355 | + /* | ||
356 | + * We are about to add blocks to the bitmap, | ||
357 | + * so we need undo access. | ||
358 | + */ | ||
359 | + BUFFER_TRACE(bitmap_bh, "getting undo access"); | ||
360 | + err = ext4_journal_get_undo_access(handle, bitmap_bh); | ||
361 | + if (err) | ||
362 | + goto error_return; | ||
363 | + | ||
364 | + /* | ||
365 | + * We are about to modify some metadata. Call the journal APIs | ||
366 | + * to unshare ->b_data if a currently-committing transaction is | ||
367 | + * using it | ||
368 | + */ | ||
369 | + BUFFER_TRACE(gd_bh, "get_write_access"); | ||
370 | + err = ext4_journal_get_write_access(handle, gd_bh); | ||
371 | + if (err) | ||
372 | + goto error_return; | ||
373 | + /* | ||
374 | + * make sure we don't allow a parallel init on other groups in the | ||
375 | + * same buddy cache | ||
376 | + */ | ||
377 | + down_write(&grp->alloc_sem); | ||
378 | + for (i = 0, blocks_freed = 0; i < count; i++) { | ||
379 | + BUFFER_TRACE(bitmap_bh, "clear bit"); | ||
380 | + if (!ext4_clear_bit_atomic(sb_bgl_lock(sbi, block_group), | ||
381 | + bit + i, bitmap_bh->b_data)) { | ||
382 | + ext4_error(sb, __func__, | ||
383 | + "bit already cleared for block %llu", | ||
384 | + (ext4_fsblk_t)(block + i)); | ||
385 | + BUFFER_TRACE(bitmap_bh, "bit already cleared"); | ||
386 | + } else { | ||
387 | + blocks_freed++; | ||
388 | + } | ||
389 | + } | ||
390 | + spin_lock(sb_bgl_lock(sbi, block_group)); | ||
391 | + le16_add_cpu(&desc->bg_free_blocks_count, blocks_freed); | ||
392 | + desc->bg_checksum = ext4_group_desc_csum(sbi, block_group, desc); | ||
393 | + spin_unlock(sb_bgl_lock(sbi, block_group)); | ||
394 | + percpu_counter_add(&sbi->s_freeblocks_counter, blocks_freed); | ||
395 | + | ||
396 | + if (sbi->s_log_groups_per_flex) { | ||
397 | + ext4_group_t flex_group = ext4_flex_group(sbi, block_group); | ||
398 | + spin_lock(sb_bgl_lock(sbi, flex_group)); | ||
399 | + sbi->s_flex_groups[flex_group].free_blocks += blocks_freed; | ||
400 | + spin_unlock(sb_bgl_lock(sbi, flex_group)); | ||
401 | + } | ||
402 | + /* | ||
403 | + * request to reload the buddy with the | ||
404 | + * new bitmap information | ||
405 | + */ | ||
406 | + set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state)); | ||
407 | + ext4_mb_update_group_info(grp, blocks_freed); | ||
408 | + up_write(&grp->alloc_sem); | ||
409 | + | ||
410 | + /* We dirtied the bitmap block */ | ||
411 | + BUFFER_TRACE(bitmap_bh, "dirtied bitmap block"); | ||
412 | + err = ext4_journal_dirty_metadata(handle, bitmap_bh); | ||
413 | + | ||
414 | + /* And the group descriptor block */ | ||
415 | + BUFFER_TRACE(gd_bh, "dirtied group descriptor block"); | ||
416 | + ret = ext4_journal_dirty_metadata(handle, gd_bh); | ||
417 | + if (!err) | ||
418 | + err = ret; | ||
419 | + sb->s_dirt = 1; | ||
420 | + | ||
421 | +error_return: | ||
422 | + brelse(bitmap_bh); | ||
423 | + ext4_std_error(sb, err); | ||
424 | + return; | ||
425 | +} | ||
426 | + | ||
427 | +/** | ||
428 | * ext4_free_blocks() -- Free given blocks and update quota | ||
429 | * @handle: handle for this transaction | ||
430 | * @inode: inode | ||
431 | diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h | ||
432 | index 4829dac..85f58af 100644 | ||
433 | --- a/fs/ext4/ext4.h | ||
434 | +++ b/fs/ext4/ext4.h | ||
435 | @@ -19,6 +19,7 @@ | ||
436 | #include <linux/types.h> | ||
437 | #include <linux/blkdev.h> | ||
438 | #include <linux/magic.h> | ||
439 | +#include <linux/jbd2.h> | ||
440 | #include "ext4_i.h" | ||
441 | |||
442 | /* | ||
443 | @@ -889,6 +890,9 @@ static inline __le16 ext4_rec_len_to_disk(unsigned len) | ||
444 | #define DX_HASH_LEGACY 0 | ||
445 | #define DX_HASH_HALF_MD4 1 | ||
446 | #define DX_HASH_TEA 2 | ||
447 | +#define DX_HASH_LEGACY_UNSIGNED 3 | ||
448 | +#define DX_HASH_HALF_MD4_UNSIGNED 4 | ||
449 | +#define DX_HASH_TEA_UNSIGNED 5 | ||
450 | |||
451 | #ifdef __KERNEL__ | ||
452 | |||
453 | @@ -988,9 +992,11 @@ extern ext4_fsblk_t ext4_has_free_blocks(struct ext4_sb_info *sbi, | ||
454 | ext4_fsblk_t nblocks); | ||
455 | extern void ext4_free_blocks (handle_t *handle, struct inode *inode, | ||
456 | ext4_fsblk_t block, unsigned long count, int metadata); | ||
457 | -extern void ext4_free_blocks_sb (handle_t *handle, struct super_block *sb, | ||
458 | - ext4_fsblk_t block, unsigned long count, | ||
459 | +extern void ext4_free_blocks_sb(handle_t *handle, struct super_block *sb, | ||
460 | + ext4_fsblk_t block, unsigned long count, | ||
461 | unsigned long *pdquot_freed_blocks); | ||
462 | +extern void ext4_add_groupblocks(handle_t *handle, struct super_block *sb, | ||
463 | + ext4_fsblk_t block, unsigned long count); | ||
464 | extern ext4_fsblk_t ext4_count_free_blocks (struct super_block *); | ||
465 | extern void ext4_check_blocks_bitmap (struct super_block *); | ||
466 | extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb, | ||
467 | @@ -1038,12 +1044,13 @@ extern int __init init_ext4_mballoc(void); | ||
468 | extern void exit_ext4_mballoc(void); | ||
469 | extern void ext4_mb_free_blocks(handle_t *, struct inode *, | ||
470 | unsigned long, unsigned long, int, unsigned long *); | ||
471 | -extern int ext4_mb_add_more_groupinfo(struct super_block *sb, | ||
472 | +extern int ext4_mb_add_groupinfo(struct super_block *sb, | ||
473 | ext4_group_t i, struct ext4_group_desc *desc); | ||
474 | extern void ext4_mb_update_group_info(struct ext4_group_info *grp, | ||
475 | ext4_grpblk_t add); | ||
476 | - | ||
477 | - | ||
478 | +extern int ext4_mb_get_buddy_cache_lock(struct super_block *, ext4_group_t); | ||
479 | +extern void ext4_mb_put_buddy_cache_lock(struct super_block *, | ||
480 | + ext4_group_t, int); | ||
481 | /* inode.c */ | ||
482 | int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode, | ||
483 | struct buffer_head *bh, ext4_fsblk_t blocknr); | ||
484 | @@ -1167,8 +1174,11 @@ static inline void ext4_r_blocks_count_set(struct ext4_super_block *es, | ||
485 | |||
486 | static inline loff_t ext4_isize(struct ext4_inode *raw_inode) | ||
487 | { | ||
488 | - return ((loff_t)le32_to_cpu(raw_inode->i_size_high) << 32) | | ||
489 | - le32_to_cpu(raw_inode->i_size_lo); | ||
490 | + if (S_ISREG(le16_to_cpu(raw_inode->i_mode))) | ||
491 | + return ((loff_t)le32_to_cpu(raw_inode->i_size_high) << 32) | | ||
492 | + le32_to_cpu(raw_inode->i_size_lo); | ||
493 | + else | ||
494 | + return (loff_t) le32_to_cpu(raw_inode->i_size_lo); | ||
495 | } | ||
496 | |||
497 | static inline void ext4_isize_set(struct ext4_inode *raw_inode, loff_t i_size) | ||
498 | @@ -1244,6 +1254,23 @@ extern int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, | ||
499 | sector_t block, unsigned long max_blocks, | ||
500 | struct buffer_head *bh, int create, | ||
501 | int extend_disksize, int flag); | ||
502 | +/* | ||
503 | + * Add new method to test wether block and inode bitmaps are properly | ||
504 | + * initialized. With uninit_bg reading the block from disk is not enough | ||
505 | + * to mark the bitmap uptodate. We need to also zero-out the bitmap | ||
506 | + */ | ||
507 | +#define BH_BITMAP_UPTODATE BH_JBDPrivateStart | ||
508 | + | ||
509 | +static inline int bitmap_uptodate(struct buffer_head *bh) | ||
510 | +{ | ||
511 | + return (buffer_uptodate(bh) && | ||
512 | + test_bit(BH_BITMAP_UPTODATE, &(bh)->b_state)); | ||
513 | +} | ||
514 | +static inline void set_bitmap_uptodate(struct buffer_head *bh) | ||
515 | +{ | ||
516 | + set_bit(BH_BITMAP_UPTODATE, &(bh)->b_state); | ||
517 | +} | ||
518 | + | ||
519 | #endif /* __KERNEL__ */ | ||
520 | |||
521 | #endif /* _EXT4_H */ | ||
522 | diff --git a/fs/ext4/ext4_sb.h b/fs/ext4/ext4_sb.h | ||
523 | index 6300226..f20df8a 100644 | ||
524 | --- a/fs/ext4/ext4_sb.h | ||
525 | +++ b/fs/ext4/ext4_sb.h | ||
526 | @@ -56,6 +56,7 @@ struct ext4_sb_info { | ||
527 | u32 s_next_generation; | ||
528 | u32 s_hash_seed[4]; | ||
529 | int s_def_hash_version; | ||
530 | + int s_hash_unsigned; /* 3 if hash should be signed, 0 if not */ | ||
531 | struct percpu_counter s_freeblocks_counter; | ||
532 | struct percpu_counter s_freeinodes_counter; | ||
533 | struct percpu_counter s_dirs_counter; | ||
534 | @@ -102,7 +103,8 @@ struct ext4_sb_info { | ||
535 | struct list_head s_committed_transaction; | ||
536 | spinlock_t s_md_lock; | ||
537 | tid_t s_last_transaction; | ||
538 | - unsigned short *s_mb_offsets, *s_mb_maxs; | ||
539 | + unsigned short *s_mb_offsets; | ||
540 | + unsigned int *s_mb_maxs; | ||
541 | |||
542 | /* tunables */ | ||
543 | unsigned long s_stripe; | ||
544 | diff --git a/fs/ext4/hash.c b/fs/ext4/hash.c | ||
545 | index 1d6329d..bd7d14d 100644 | ||
546 | --- a/fs/ext4/hash.c | ||
547 | +++ b/fs/ext4/hash.c | ||
548 | @@ -35,23 +35,71 @@ static void TEA_transform(__u32 buf[4], __u32 const in[]) | ||
549 | |||
550 | |||
551 | /* The old legacy hash */ | ||
552 | -static __u32 dx_hack_hash (const char *name, int len) | ||
553 | +static __u32 dx_hack_hash_unsigned(const char *name, int len) | ||
554 | { | ||
555 | - __u32 hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9; | ||
556 | + __u32 hash, hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9; | ||
557 | + const unsigned char *ucp = (const unsigned char *) name; | ||
558 | + | ||
559 | + while (len--) { | ||
560 | + hash = hash1 + (hash0 ^ (((int) *ucp++) * 7152373)); | ||
561 | + | ||
562 | + if (hash & 0x80000000) | ||
563 | + hash -= 0x7fffffff; | ||
564 | + hash1 = hash0; | ||
565 | + hash0 = hash; | ||
566 | + } | ||
567 | + return hash0 << 1; | ||
568 | +} | ||
569 | + | ||
570 | +static __u32 dx_hack_hash_signed(const char *name, int len) | ||
571 | +{ | ||
572 | + __u32 hash, hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9; | ||
573 | + const signed char *scp = (const signed char *) name; | ||
574 | + | ||
575 | while (len--) { | ||
576 | - __u32 hash = hash1 + (hash0 ^ (*name++ * 7152373)); | ||
577 | + hash = hash1 + (hash0 ^ (((int) *scp++) * 7152373)); | ||
578 | |||
579 | - if (hash & 0x80000000) hash -= 0x7fffffff; | ||
580 | + if (hash & 0x80000000) | ||
581 | + hash -= 0x7fffffff; | ||
582 | hash1 = hash0; | ||
583 | hash0 = hash; | ||
584 | } | ||
585 | - return (hash0 << 1); | ||
586 | + return hash0 << 1; | ||
587 | } | ||
588 | |||
589 | -static void str2hashbuf(const char *msg, int len, __u32 *buf, int num) | ||
590 | +static void str2hashbuf_signed(const char *msg, int len, __u32 *buf, int num) | ||
591 | { | ||
592 | __u32 pad, val; | ||
593 | int i; | ||
594 | + const signed char *scp = (const signed char *) msg; | ||
595 | + | ||
596 | + pad = (__u32)len | ((__u32)len << 8); | ||
597 | + pad |= pad << 16; | ||
598 | + | ||
599 | + val = pad; | ||
600 | + if (len > num*4) | ||
601 | + len = num * 4; | ||
602 | + for (i = 0; i < len; i++) { | ||
603 | + if ((i % 4) == 0) | ||
604 | + val = pad; | ||
605 | + val = ((int) scp[i]) + (val << 8); | ||
606 | + if ((i % 4) == 3) { | ||
607 | + *buf++ = val; | ||
608 | + val = pad; | ||
609 | + num--; | ||
610 | + } | ||
611 | + } | ||
612 | + if (--num >= 0) | ||
613 | + *buf++ = val; | ||
614 | + while (--num >= 0) | ||
615 | + *buf++ = pad; | ||
616 | +} | ||
617 | + | ||
618 | +static void str2hashbuf_unsigned(const char *msg, int len, __u32 *buf, int num) | ||
619 | +{ | ||
620 | + __u32 pad, val; | ||
621 | + int i; | ||
622 | + const unsigned char *ucp = (const unsigned char *) msg; | ||
623 | |||
624 | pad = (__u32)len | ((__u32)len << 8); | ||
625 | pad |= pad << 16; | ||
626 | @@ -62,7 +110,7 @@ static void str2hashbuf(const char *msg, int len, __u32 *buf, int num) | ||
627 | for (i=0; i < len; i++) { | ||
628 | if ((i % 4) == 0) | ||
629 | val = pad; | ||
630 | - val = msg[i] + (val << 8); | ||
631 | + val = ((int) ucp[i]) + (val << 8); | ||
632 | if ((i % 4) == 3) { | ||
633 | *buf++ = val; | ||
634 | val = pad; | ||
635 | @@ -95,6 +143,8 @@ int ext4fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo) | ||
636 | const char *p; | ||
637 | int i; | ||
638 | __u32 in[8], buf[4]; | ||
639 | + void (*str2hashbuf)(const char *, int, __u32 *, int) = | ||
640 | + str2hashbuf_signed; | ||
641 | |||
642 | /* Initialize the default seed for the hash checksum functions */ | ||
643 | buf[0] = 0x67452301; | ||
644 | @@ -113,13 +163,18 @@ int ext4fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo) | ||
645 | } | ||
646 | |||
647 | switch (hinfo->hash_version) { | ||
648 | + case DX_HASH_LEGACY_UNSIGNED: | ||
649 | + hash = dx_hack_hash_unsigned(name, len); | ||
650 | + break; | ||
651 | case DX_HASH_LEGACY: | ||
652 | - hash = dx_hack_hash(name, len); | ||
653 | + hash = dx_hack_hash_signed(name, len); | ||
654 | break; | ||
655 | + case DX_HASH_HALF_MD4_UNSIGNED: | ||
656 | + str2hashbuf = str2hashbuf_unsigned; | ||
657 | case DX_HASH_HALF_MD4: | ||
658 | p = name; | ||
659 | while (len > 0) { | ||
660 | - str2hashbuf(p, len, in, 8); | ||
661 | + (*str2hashbuf)(p, len, in, 8); | ||
662 | half_md4_transform(buf, in); | ||
663 | len -= 32; | ||
664 | p += 32; | ||
665 | @@ -127,10 +182,12 @@ int ext4fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo) | ||
666 | minor_hash = buf[2]; | ||
667 | hash = buf[1]; | ||
668 | break; | ||
669 | + case DX_HASH_TEA_UNSIGNED: | ||
670 | + str2hashbuf = str2hashbuf_unsigned; | ||
671 | case DX_HASH_TEA: | ||
672 | p = name; | ||
673 | while (len > 0) { | ||
674 | - str2hashbuf(p, len, in, 4); | ||
675 | + (*str2hashbuf)(p, len, in, 4); | ||
676 | TEA_transform(buf, in); | ||
677 | len -= 16; | ||
678 | p += 16; | ||
679 | diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c | ||
680 | index 9805924..b994854 100644 | ||
681 | --- a/fs/ext4/ialloc.c | ||
682 | +++ b/fs/ext4/ialloc.c | ||
683 | @@ -84,7 +84,7 @@ unsigned ext4_init_inode_bitmap(struct super_block *sb, struct buffer_head *bh, | ||
684 | } | ||
685 | |||
686 | memset(bh->b_data, 0, (EXT4_INODES_PER_GROUP(sb) + 7) / 8); | ||
687 | - mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), EXT4_BLOCKS_PER_GROUP(sb), | ||
688 | + mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), sb->s_blocksize * 8, | ||
689 | bh->b_data); | ||
690 | |||
691 | return EXT4_INODES_PER_GROUP(sb); | ||
692 | @@ -115,18 +115,40 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group) | ||
693 | block_group, bitmap_blk); | ||
694 | return NULL; | ||
695 | } | ||
696 | - if (bh_uptodate_or_lock(bh)) | ||
697 | + if (bitmap_uptodate(bh)) | ||
698 | return bh; | ||
699 | |||
700 | + lock_buffer(bh); | ||
701 | + if (bitmap_uptodate(bh)) { | ||
702 | + unlock_buffer(bh); | ||
703 | + return bh; | ||
704 | + } | ||
705 | spin_lock(sb_bgl_lock(EXT4_SB(sb), block_group)); | ||
706 | if (desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) { | ||
707 | ext4_init_inode_bitmap(sb, bh, block_group, desc); | ||
708 | + set_bitmap_uptodate(bh); | ||
709 | set_buffer_uptodate(bh); | ||
710 | unlock_buffer(bh); | ||
711 | spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group)); | ||
712 | return bh; | ||
713 | } | ||
714 | spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group)); | ||
715 | + if (buffer_uptodate(bh)) { | ||
716 | + /* | ||
717 | + * if not uninit if bh is uptodate, | ||
718 | + * bitmap is also uptodate | ||
719 | + */ | ||
720 | + set_bitmap_uptodate(bh); | ||
721 | + unlock_buffer(bh); | ||
722 | + return bh; | ||
723 | + } | ||
724 | + /* | ||
725 | + * submit the buffer_head for read. We can | ||
726 | + * safely mark the bitmap as uptodate now. | ||
727 | + * We do it here so the bitmap uptodate bit | ||
728 | + * get set with buffer lock held. | ||
729 | + */ | ||
730 | + set_bitmap_uptodate(bh); | ||
731 | if (bh_submit_read(bh) < 0) { | ||
732 | put_bh(bh); | ||
733 | ext4_error(sb, __func__, | ||
734 | @@ -567,6 +589,77 @@ static int find_group_other(struct super_block *sb, struct inode *parent, | ||
735 | } | ||
736 | |||
737 | /* | ||
738 | + * claim the inode from the inode bitmap. If the group | ||
739 | + * is uninit we need to take the groups's sb_bgl_lock | ||
740 | + * and clear the uninit flag. The inode bitmap update | ||
741 | + * and group desc uninit flag clear should be done | ||
742 | + * after holding sb_bgl_lock so that ext4_read_inode_bitmap | ||
743 | + * doesn't race with the ext4_claim_inode | ||
744 | + */ | ||
745 | +static int ext4_claim_inode(struct super_block *sb, | ||
746 | + struct buffer_head *inode_bitmap_bh, | ||
747 | + unsigned long ino, ext4_group_t group, int mode) | ||
748 | +{ | ||
749 | + int free = 0, retval = 0; | ||
750 | + struct ext4_sb_info *sbi = EXT4_SB(sb); | ||
751 | + struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group, NULL); | ||
752 | + | ||
753 | + spin_lock(sb_bgl_lock(sbi, group)); | ||
754 | + if (ext4_set_bit(ino, inode_bitmap_bh->b_data)) { | ||
755 | + /* not a free inode */ | ||
756 | + retval = 1; | ||
757 | + goto err_ret; | ||
758 | + } | ||
759 | + ino++; | ||
760 | + if ((group == 0 && ino < EXT4_FIRST_INO(sb)) || | ||
761 | + ino > EXT4_INODES_PER_GROUP(sb)) { | ||
762 | + spin_unlock(sb_bgl_lock(sbi, group)); | ||
763 | + ext4_error(sb, __func__, | ||
764 | + "reserved inode or inode > inodes count - " | ||
765 | + "block_group = %lu, inode=%lu", group, | ||
766 | + ino + group * EXT4_INODES_PER_GROUP(sb)); | ||
767 | + return 1; | ||
768 | + } | ||
769 | + /* If we didn't allocate from within the initialized part of the inode | ||
770 | + * table then we need to initialize up to this inode. */ | ||
771 | + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) { | ||
772 | + | ||
773 | + if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) { | ||
774 | + gdp->bg_flags &= cpu_to_le16(~EXT4_BG_INODE_UNINIT); | ||
775 | + /* When marking the block group with | ||
776 | + * ~EXT4_BG_INODE_UNINIT we don't want to depend | ||
777 | + * on the value of bg_itable_unused even though | ||
778 | + * mke2fs could have initialized the same for us. | ||
779 | + * Instead we calculated the value below | ||
780 | + */ | ||
781 | + | ||
782 | + free = 0; | ||
783 | + } else { | ||
784 | + free = EXT4_INODES_PER_GROUP(sb) - | ||
785 | + le16_to_cpu(gdp->bg_itable_unused); | ||
786 | + } | ||
787 | + | ||
788 | + /* | ||
789 | + * Check the relative inode number against the last used | ||
790 | + * relative inode number in this group. if it is greater | ||
791 | + * we need to update the bg_itable_unused count | ||
792 | + * | ||
793 | + */ | ||
794 | + if (ino > free) | ||
795 | + gdp->bg_itable_unused = | ||
796 | + cpu_to_le16(EXT4_INODES_PER_GROUP(sb) - ino); | ||
797 | + } | ||
798 | + le16_add_cpu(&gdp->bg_free_inodes_count, -1); | ||
799 | + if (S_ISDIR(mode)) { | ||
800 | + le16_add_cpu(&gdp->bg_used_dirs_count, 1); | ||
801 | + } | ||
802 | + gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp); | ||
803 | +err_ret: | ||
804 | + spin_unlock(sb_bgl_lock(sbi, group)); | ||
805 | + return retval; | ||
806 | +} | ||
807 | + | ||
808 | +/* | ||
809 | * There are two policies for allocating an inode. If the new inode is | ||
810 | * a directory, then a forward search is made for a block group with both | ||
811 | * free space and a low directory-to-inode ratio; if that fails, then of | ||
812 | @@ -649,8 +742,12 @@ repeat_in_this_group: | ||
813 | if (err) | ||
814 | goto fail; | ||
815 | |||
816 | - if (!ext4_set_bit_atomic(sb_bgl_lock(sbi, group), | ||
817 | - ino, bitmap_bh->b_data)) { | ||
818 | + BUFFER_TRACE(bh2, "get_write_access"); | ||
819 | + err = ext4_journal_get_write_access(handle, bh2); | ||
820 | + if (err) | ||
821 | + goto fail; | ||
822 | + if (!ext4_claim_inode(sb, bitmap_bh, | ||
823 | + ino, group, mode)) { | ||
824 | /* we won it */ | ||
825 | BUFFER_TRACE(bitmap_bh, | ||
826 | "call ext4_journal_dirty_metadata"); | ||
827 | @@ -658,10 +755,13 @@ repeat_in_this_group: | ||
828 | bitmap_bh); | ||
829 | if (err) | ||
830 | goto fail; | ||
831 | + /* zero bit is inode number 1*/ | ||
832 | + ino++; | ||
833 | goto got; | ||
834 | } | ||
835 | /* we lost it */ | ||
836 | jbd2_journal_release_buffer(handle, bitmap_bh); | ||
837 | + jbd2_journal_release_buffer(handle, bh2); | ||
838 | |||
839 | if (++ino < EXT4_INODES_PER_GROUP(sb)) | ||
840 | goto repeat_in_this_group; | ||
841 | @@ -681,21 +781,6 @@ repeat_in_this_group: | ||
842 | goto out; | ||
843 | |||
844 | got: | ||
845 | - ino++; | ||
846 | - if ((group == 0 && ino < EXT4_FIRST_INO(sb)) || | ||
847 | - ino > EXT4_INODES_PER_GROUP(sb)) { | ||
848 | - ext4_error(sb, __func__, | ||
849 | - "reserved inode or inode > inodes count - " | ||
850 | - "block_group = %lu, inode=%lu", group, | ||
851 | - ino + group * EXT4_INODES_PER_GROUP(sb)); | ||
852 | - err = -EIO; | ||
853 | - goto fail; | ||
854 | - } | ||
855 | - | ||
856 | - BUFFER_TRACE(bh2, "get_write_access"); | ||
857 | - err = ext4_journal_get_write_access(handle, bh2); | ||
858 | - if (err) goto fail; | ||
859 | - | ||
860 | /* We may have to initialize the block bitmap if it isn't already */ | ||
861 | if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM) && | ||
862 | gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { | ||
863 | @@ -730,47 +815,10 @@ got: | ||
864 | if (err) | ||
865 | goto fail; | ||
866 | } | ||
867 | - | ||
868 | - spin_lock(sb_bgl_lock(sbi, group)); | ||
869 | - /* If we didn't allocate from within the initialized part of the inode | ||
870 | - * table then we need to initialize up to this inode. */ | ||
871 | - if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) { | ||
872 | - if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) { | ||
873 | - gdp->bg_flags &= cpu_to_le16(~EXT4_BG_INODE_UNINIT); | ||
874 | - | ||
875 | - /* When marking the block group with | ||
876 | - * ~EXT4_BG_INODE_UNINIT we don't want to depend | ||
877 | - * on the value of bg_itable_unused even though | ||
878 | - * mke2fs could have initialized the same for us. | ||
879 | - * Instead we calculated the value below | ||
880 | - */ | ||
881 | - | ||
882 | - free = 0; | ||
883 | - } else { | ||
884 | - free = EXT4_INODES_PER_GROUP(sb) - | ||
885 | - le16_to_cpu(gdp->bg_itable_unused); | ||
886 | - } | ||
887 | - | ||
888 | - /* | ||
889 | - * Check the relative inode number against the last used | ||
890 | - * relative inode number in this group. if it is greater | ||
891 | - * we need to update the bg_itable_unused count | ||
892 | - * | ||
893 | - */ | ||
894 | - if (ino > free) | ||
895 | - gdp->bg_itable_unused = | ||
896 | - cpu_to_le16(EXT4_INODES_PER_GROUP(sb) - ino); | ||
897 | - } | ||
898 | - | ||
899 | - le16_add_cpu(&gdp->bg_free_inodes_count, -1); | ||
900 | - if (S_ISDIR(mode)) { | ||
901 | - le16_add_cpu(&gdp->bg_used_dirs_count, 1); | ||
902 | - } | ||
903 | - gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp); | ||
904 | - spin_unlock(sb_bgl_lock(sbi, group)); | ||
905 | - BUFFER_TRACE(bh2, "call ext4_journal_dirty_metadata"); | ||
906 | + BUFFER_TRACE(bh2, "call ext4_handle_dirty_metadata"); | ||
907 | err = ext4_journal_dirty_metadata(handle, bh2); | ||
908 | - if (err) goto fail; | ||
909 | + if (err) | ||
910 | + goto fail; | ||
911 | |||
912 | percpu_counter_dec(&sbi->s_freeinodes_counter); | ||
913 | if (S_ISDIR(mode)) | ||
914 | diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c | ||
915 | index d77f674..6e7f085 100644 | ||
916 | --- a/fs/ext4/inode.c | ||
917 | +++ b/fs/ext4/inode.c | ||
918 | @@ -351,9 +351,9 @@ static int ext4_block_to_path(struct inode *inode, | ||
919 | final = ptrs; | ||
920 | } else { | ||
921 | ext4_warning(inode->i_sb, "ext4_block_to_path", | ||
922 | - "block %lu > max", | ||
923 | + "block %lu > max in inode %lu", | ||
924 | i_block + direct_blocks + | ||
925 | - indirect_blocks + double_blocks); | ||
926 | + indirect_blocks + double_blocks, inode->i_ino); | ||
927 | } | ||
928 | if (boundary) | ||
929 | *boundary = final - 1 - (i_block & (ptrs - 1)); | ||
930 | @@ -1648,18 +1648,25 @@ struct mpage_da_data { | ||
931 | */ | ||
932 | static int mpage_da_submit_io(struct mpage_da_data *mpd) | ||
933 | { | ||
934 | - struct address_space *mapping = mpd->inode->i_mapping; | ||
935 | - int ret = 0, err, nr_pages, i; | ||
936 | - unsigned long index, end; | ||
937 | + long pages_skipped; | ||
938 | struct pagevec pvec; | ||
939 | + unsigned long index, end; | ||
940 | + int ret = 0, err, nr_pages, i; | ||
941 | + struct inode *inode = mpd->inode; | ||
942 | + struct address_space *mapping = inode->i_mapping; | ||
943 | |||
944 | BUG_ON(mpd->next_page <= mpd->first_page); | ||
945 | - pagevec_init(&pvec, 0); | ||
946 | + /* | ||
947 | + * We need to start from the first_page to the next_page - 1 | ||
948 | + * to make sure we also write the mapped dirty buffer_heads. | ||
949 | + * If we look at mpd->lbh.b_blocknr we would only be looking | ||
950 | + * at the currently mapped buffer_heads. | ||
951 | + */ | ||
952 | index = mpd->first_page; | ||
953 | end = mpd->next_page - 1; | ||
954 | |||
955 | + pagevec_init(&pvec, 0); | ||
956 | while (index <= end) { | ||
957 | - /* XXX: optimize tail */ | ||
958 | nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE); | ||
959 | if (nr_pages == 0) | ||
960 | break; | ||
961 | @@ -1671,6 +1678,10 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd) | ||
962 | break; | ||
963 | index++; | ||
964 | |||
965 | + BUG_ON(!PageLocked(page)); | ||
966 | + BUG_ON(PageWriteback(page)); | ||
967 | + | ||
968 | + pages_skipped = mpd->wbc->pages_skipped; | ||
969 | err = mapping->a_ops->writepage(page, mpd->wbc); | ||
970 | if (!err) | ||
971 | mpd->pages_written++; | ||
972 | @@ -1991,11 +2002,29 @@ static int __mpage_da_writepage(struct page *page, | ||
973 | bh = head; | ||
974 | do { | ||
975 | BUG_ON(buffer_locked(bh)); | ||
976 | + /* | ||
977 | + * We need to try to allocate | ||
978 | + * unmapped blocks in the same page. | ||
979 | + * Otherwise we won't make progress | ||
980 | + * with the page in ext4_da_writepage | ||
981 | + */ | ||
982 | if (buffer_dirty(bh) && | ||
983 | (!buffer_mapped(bh) || buffer_delay(bh))) { | ||
984 | mpage_add_bh_to_extent(mpd, logical, bh); | ||
985 | if (mpd->io_done) | ||
986 | return MPAGE_DA_EXTENT_TAIL; | ||
987 | + } else if (buffer_dirty(bh) && (buffer_mapped(bh))) { | ||
988 | + /* | ||
989 | + * mapped dirty buffer. We need to update | ||
990 | + * the b_state because we look at | ||
991 | + * b_state in mpage_da_map_blocks. We don't | ||
992 | + * update b_size because if we find an | ||
993 | + * unmapped buffer_head later we need to | ||
994 | + * use the b_state flag of that buffer_head. | ||
995 | + */ | ||
996 | + if (mpd->lbh.b_size == 0) | ||
997 | + mpd->lbh.b_state = | ||
998 | + bh->b_state & BH_FLAGS; | ||
999 | } | ||
1000 | logical++; | ||
1001 | } while ((bh = bh->b_this_page) != head); | ||
1002 | @@ -2298,6 +2327,20 @@ static int ext4_da_writepages(struct address_space *mapping, | ||
1003 | */ | ||
1004 | if (!mapping->nrpages || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) | ||
1005 | return 0; | ||
1006 | + | ||
1007 | + /* | ||
1008 | + * If the filesystem has aborted, it is read-only, so return | ||
1009 | + * right away instead of dumping stack traces later on that | ||
1010 | + * will obscure the real source of the problem. We test | ||
1011 | + * EXT4_MOUNT_ABORT instead of sb->s_flag's MS_RDONLY because | ||
1012 | + * the latter could be true if the filesystem is mounted | ||
1013 | + * read-only, and in that case, ext4_da_writepages should | ||
1014 | + * *never* be called, so if that ever happens, we would want | ||
1015 | + * the stack trace. | ||
1016 | + */ | ||
1017 | + if (unlikely(sbi->s_mount_opt & EXT4_MOUNT_ABORT)) | ||
1018 | + return -EROFS; | ||
1019 | + | ||
1020 | /* | ||
1021 | * Make sure nr_to_write is >= sbi->s_mb_stream_request | ||
1022 | * This make sure small files blocks are allocated in | ||
1023 | @@ -2336,7 +2379,7 @@ restart_loop: | ||
1024 | handle = ext4_journal_start(inode, needed_blocks); | ||
1025 | if (IS_ERR(handle)) { | ||
1026 | ret = PTR_ERR(handle); | ||
1027 | - printk(KERN_EMERG "%s: jbd2_start: " | ||
1028 | + printk(KERN_CRIT "%s: jbd2_start: " | ||
1029 | "%ld pages, ino %lu; err %d\n", __func__, | ||
1030 | wbc->nr_to_write, inode->i_ino, ret); | ||
1031 | dump_stack(); | ||
1032 | diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c | ||
1033 | index ba86b56..dbf6c0e 100644 | ||
1034 | --- a/fs/ext4/mballoc.c | ||
1035 | +++ b/fs/ext4/mballoc.c | ||
1036 | @@ -100,7 +100,7 @@ | ||
1037 | * inode as: | ||
1038 | * | ||
1039 | * { page } | ||
1040 | - * [ group 0 buddy][ group 0 bitmap] [group 1][ group 1]... | ||
1041 | + * [ group 0 bitmap][ group 0 buddy] [group 1][ group 1]... | ||
1042 | * | ||
1043 | * | ||
1044 | * one block each for bitmap and buddy information. So for each group we | ||
1045 | @@ -330,6 +330,18 @@ | ||
1046 | * object | ||
1047 | * | ||
1048 | */ | ||
1049 | +static struct kmem_cache *ext4_pspace_cachep; | ||
1050 | +static struct kmem_cache *ext4_ac_cachep; | ||
1051 | +static struct kmem_cache *ext4_free_ext_cachep; | ||
1052 | +static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, | ||
1053 | + ext4_group_t group); | ||
1054 | +static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap, | ||
1055 | + ext4_group_t group); | ||
1056 | +static int ext4_mb_init_per_dev_proc(struct super_block *sb); | ||
1057 | +static int ext4_mb_destroy_per_dev_proc(struct super_block *sb); | ||
1058 | +static void ext4_mb_free_committed_blocks(struct super_block *); | ||
1059 | +static void ext4_mb_poll_new_transaction(struct super_block *sb, | ||
1060 | + handle_t *handle); | ||
1061 | |||
1062 | static inline void *mb_correct_addr_and_bit(int *bit, void *addr) | ||
1063 | { | ||
1064 | @@ -718,7 +730,7 @@ static void ext4_mb_generate_buddy(struct super_block *sb, | ||
1065 | * stored in the inode as | ||
1066 | * | ||
1067 | * { page } | ||
1068 | - * [ group 0 buddy][ group 0 bitmap] [group 1][ group 1]... | ||
1069 | + * [ group 0 bitmap][ group 0 buddy] [group 1][ group 1]... | ||
1070 | * | ||
1071 | * | ||
1072 | * one block each for bitmap and buddy information. | ||
1073 | @@ -784,20 +796,42 @@ static int ext4_mb_init_cache(struct page *page, char *incore) | ||
1074 | if (bh[i] == NULL) | ||
1075 | goto out; | ||
1076 | |||
1077 | - if (bh_uptodate_or_lock(bh[i])) | ||
1078 | + if (bitmap_uptodate(bh[i])) | ||
1079 | continue; | ||
1080 | |||
1081 | + lock_buffer(bh[i]); | ||
1082 | + if (bitmap_uptodate(bh[i])) { | ||
1083 | + unlock_buffer(bh[i]); | ||
1084 | + continue; | ||
1085 | + } | ||
1086 | spin_lock(sb_bgl_lock(EXT4_SB(sb), first_group + i)); | ||
1087 | if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { | ||
1088 | ext4_init_block_bitmap(sb, bh[i], | ||
1089 | first_group + i, desc); | ||
1090 | + set_bitmap_uptodate(bh[i]); | ||
1091 | set_buffer_uptodate(bh[i]); | ||
1092 | unlock_buffer(bh[i]); | ||
1093 | spin_unlock(sb_bgl_lock(EXT4_SB(sb), first_group + i)); | ||
1094 | continue; | ||
1095 | } | ||
1096 | spin_unlock(sb_bgl_lock(EXT4_SB(sb), first_group + i)); | ||
1097 | + if (buffer_uptodate(bh[i])) { | ||
1098 | + /* | ||
1099 | + * if not uninit if bh is uptodate, | ||
1100 | + * bitmap is also uptodate | ||
1101 | + */ | ||
1102 | + set_bitmap_uptodate(bh[i]); | ||
1103 | + unlock_buffer(bh[i]); | ||
1104 | + continue; | ||
1105 | + } | ||
1106 | get_bh(bh[i]); | ||
1107 | + /* | ||
1108 | + * submit the buffer_head for read. We can | ||
1109 | + * safely mark the bitmap as uptodate now. | ||
1110 | + * We do it here so the bitmap uptodate bit | ||
1111 | + * get set with buffer lock held. | ||
1112 | + */ | ||
1113 | + set_bitmap_uptodate(bh[i]); | ||
1114 | bh[i]->b_end_io = end_buffer_read_sync; | ||
1115 | submit_bh(READ, bh[i]); | ||
1116 | mb_debug("read bitmap for group %lu\n", first_group + i); | ||
1117 | @@ -814,6 +848,8 @@ static int ext4_mb_init_cache(struct page *page, char *incore) | ||
1118 | |||
1119 | err = 0; | ||
1120 | first_block = page->index * blocks_per_page; | ||
1121 | + /* init the page */ | ||
1122 | + memset(page_address(page), 0xff, PAGE_CACHE_SIZE); | ||
1123 | for (i = 0; i < blocks_per_page; i++) { | ||
1124 | int group; | ||
1125 | struct ext4_group_info *grinfo; | ||
1126 | @@ -840,7 +876,6 @@ static int ext4_mb_init_cache(struct page *page, char *incore) | ||
1127 | BUG_ON(incore == NULL); | ||
1128 | mb_debug("put buddy for group %u in page %lu/%x\n", | ||
1129 | group, page->index, i * blocksize); | ||
1130 | - memset(data, 0xff, blocksize); | ||
1131 | grinfo = ext4_get_group_info(sb, group); | ||
1132 | grinfo->bb_fragments = 0; | ||
1133 | memset(grinfo->bb_counters, 0, | ||
1134 | @@ -848,7 +883,9 @@ static int ext4_mb_init_cache(struct page *page, char *incore) | ||
1135 | /* | ||
1136 | * incore got set to the group block bitmap below | ||
1137 | */ | ||
1138 | + ext4_lock_group(sb, group); | ||
1139 | ext4_mb_generate_buddy(sb, data, incore, group); | ||
1140 | + ext4_unlock_group(sb, group); | ||
1141 | incore = NULL; | ||
1142 | } else { | ||
1143 | /* this is block of bitmap */ | ||
1144 | @@ -862,6 +899,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore) | ||
1145 | |||
1146 | /* mark all preallocated blks used in in-core bitmap */ | ||
1147 | ext4_mb_generate_from_pa(sb, data, group); | ||
1148 | + ext4_mb_generate_from_freelist(sb, data, group); | ||
1149 | ext4_unlock_group(sb, group); | ||
1150 | |||
1151 | /* set incore so that the buddy information can be | ||
1152 | @@ -886,18 +924,20 @@ static noinline_for_stack int | ||
1153 | ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, | ||
1154 | struct ext4_buddy *e4b) | ||
1155 | { | ||
1156 | - struct ext4_sb_info *sbi = EXT4_SB(sb); | ||
1157 | - struct inode *inode = sbi->s_buddy_cache; | ||
1158 | int blocks_per_page; | ||
1159 | int block; | ||
1160 | int pnum; | ||
1161 | int poff; | ||
1162 | struct page *page; | ||
1163 | int ret; | ||
1164 | + struct ext4_group_info *grp; | ||
1165 | + struct ext4_sb_info *sbi = EXT4_SB(sb); | ||
1166 | + struct inode *inode = sbi->s_buddy_cache; | ||
1167 | |||
1168 | mb_debug("load group %lu\n", group); | ||
1169 | |||
1170 | blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; | ||
1171 | + grp = ext4_get_group_info(sb, group); | ||
1172 | |||
1173 | e4b->bd_blkbits = sb->s_blocksize_bits; | ||
1174 | e4b->bd_info = ext4_get_group_info(sb, group); | ||
1175 | @@ -905,6 +945,15 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, | ||
1176 | e4b->bd_group = group; | ||
1177 | e4b->bd_buddy_page = NULL; | ||
1178 | e4b->bd_bitmap_page = NULL; | ||
1179 | + e4b->alloc_semp = &grp->alloc_sem; | ||
1180 | + | ||
1181 | + /* Take the read lock on the group alloc | ||
1182 | + * sem. This would make sure a parallel | ||
1183 | + * ext4_mb_init_group happening on other | ||
1184 | + * groups mapped by the page is blocked | ||
1185 | + * till we are done with allocation | ||
1186 | + */ | ||
1187 | + down_read(e4b->alloc_semp); | ||
1188 | |||
1189 | /* | ||
1190 | * the buddy cache inode stores the block bitmap | ||
1191 | @@ -920,6 +969,14 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, | ||
1192 | page = find_get_page(inode->i_mapping, pnum); | ||
1193 | if (page == NULL || !PageUptodate(page)) { | ||
1194 | if (page) | ||
1195 | + /* | ||
1196 | + * drop the page reference and try | ||
1197 | + * to get the page with lock. If we | ||
1198 | + * are not uptodate that implies | ||
1199 | + * somebody just created the page but | ||
1200 | + * is yet to initialize the same. So | ||
1201 | + * wait for it to initialize. | ||
1202 | + */ | ||
1203 | page_cache_release(page); | ||
1204 | page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); | ||
1205 | if (page) { | ||
1206 | @@ -985,6 +1042,9 @@ err: | ||
1207 | page_cache_release(e4b->bd_buddy_page); | ||
1208 | e4b->bd_buddy = NULL; | ||
1209 | e4b->bd_bitmap = NULL; | ||
1210 | + | ||
1211 | + /* Done with the buddy cache */ | ||
1212 | + up_read(e4b->alloc_semp); | ||
1213 | return ret; | ||
1214 | } | ||
1215 | |||
1216 | @@ -994,6 +1054,9 @@ static void ext4_mb_release_desc(struct ext4_buddy *e4b) | ||
1217 | page_cache_release(e4b->bd_bitmap_page); | ||
1218 | if (e4b->bd_buddy_page) | ||
1219 | page_cache_release(e4b->bd_buddy_page); | ||
1220 | + /* Done with the buddy cache */ | ||
1221 | + if (e4b->alloc_semp) | ||
1222 | + up_read(e4b->alloc_semp); | ||
1223 | } | ||
1224 | |||
1225 | |||
1226 | @@ -1031,7 +1094,10 @@ static void mb_clear_bits(spinlock_t *lock, void *bm, int cur, int len) | ||
1227 | cur += 32; | ||
1228 | continue; | ||
1229 | } | ||
1230 | - mb_clear_bit_atomic(lock, cur, bm); | ||
1231 | + if (lock) | ||
1232 | + mb_clear_bit_atomic(lock, cur, bm); | ||
1233 | + else | ||
1234 | + mb_clear_bit(cur, bm); | ||
1235 | cur++; | ||
1236 | } | ||
1237 | } | ||
1238 | @@ -1049,7 +1115,10 @@ static void mb_set_bits(spinlock_t *lock, void *bm, int cur, int len) | ||
1239 | cur += 32; | ||
1240 | continue; | ||
1241 | } | ||
1242 | - mb_set_bit_atomic(lock, cur, bm); | ||
1243 | + if (lock) | ||
1244 | + mb_set_bit_atomic(lock, cur, bm); | ||
1245 | + else | ||
1246 | + mb_set_bit(cur, bm); | ||
1247 | cur++; | ||
1248 | } | ||
1249 | } | ||
1250 | @@ -1296,13 +1365,20 @@ static void ext4_mb_use_best_found(struct ext4_allocation_context *ac, | ||
1251 | ac->ac_tail = ret & 0xffff; | ||
1252 | ac->ac_buddy = ret >> 16; | ||
1253 | |||
1254 | - /* XXXXXXX: SUCH A HORRIBLE **CK */ | ||
1255 | - /*FIXME!! Why ? */ | ||
1256 | + /* | ||
1257 | + * take the page reference. We want the page to be pinned | ||
1258 | + * so that we don't get a ext4_mb_init_cache_call for this | ||
1259 | + * group until we update the bitmap. That would mean we | ||
1260 | + * double allocate blocks. The reference is dropped | ||
1261 | + * in ext4_mb_release_context | ||
1262 | + */ | ||
1263 | ac->ac_bitmap_page = e4b->bd_bitmap_page; | ||
1264 | get_page(ac->ac_bitmap_page); | ||
1265 | ac->ac_buddy_page = e4b->bd_buddy_page; | ||
1266 | get_page(ac->ac_buddy_page); | ||
1267 | - | ||
1268 | + /* on allocation we use ac to track the held semaphore */ | ||
1269 | + ac->alloc_semp = e4b->alloc_semp; | ||
1270 | + e4b->alloc_semp = NULL; | ||
1271 | /* store last allocated for subsequent stream allocation */ | ||
1272 | if ((ac->ac_flags & EXT4_MB_HINT_DATA)) { | ||
1273 | spin_lock(&sbi->s_md_lock); | ||
1274 | @@ -1326,6 +1402,8 @@ static void ext4_mb_check_limits(struct ext4_allocation_context *ac, | ||
1275 | struct ext4_free_extent ex; | ||
1276 | int max; | ||
1277 | |||
1278 | + if (ac->ac_status == AC_STATUS_FOUND) | ||
1279 | + return; | ||
1280 | /* | ||
1281 | * We don't want to scan for a whole year | ||
1282 | */ | ||
1283 | @@ -1692,6 +1770,173 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac, | ||
1284 | return 0; | ||
1285 | } | ||
1286 | |||
1287 | +/* | ||
1288 | + * lock the group_info alloc_sem of all the groups | ||
1289 | + * belonging to the same buddy cache page. This | ||
1290 | + * make sure other parallel operation on the buddy | ||
1291 | + * cache doesn't happen whild holding the buddy cache | ||
1292 | + * lock | ||
1293 | + */ | ||
1294 | +int ext4_mb_get_buddy_cache_lock(struct super_block *sb, ext4_group_t group) | ||
1295 | +{ | ||
1296 | + int i; | ||
1297 | + int block, pnum; | ||
1298 | + int blocks_per_page; | ||
1299 | + int groups_per_page; | ||
1300 | + ext4_group_t first_group; | ||
1301 | + struct ext4_group_info *grp; | ||
1302 | + | ||
1303 | + blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; | ||
1304 | + /* | ||
1305 | + * the buddy cache inode stores the block bitmap | ||
1306 | + * and buddy information in consecutive blocks. | ||
1307 | + * So for each group we need two blocks. | ||
1308 | + */ | ||
1309 | + block = group * 2; | ||
1310 | + pnum = block / blocks_per_page; | ||
1311 | + first_group = pnum * blocks_per_page / 2; | ||
1312 | + | ||
1313 | + groups_per_page = blocks_per_page >> 1; | ||
1314 | + if (groups_per_page == 0) | ||
1315 | + groups_per_page = 1; | ||
1316 | + /* read all groups the page covers into the cache */ | ||
1317 | + for (i = 0; i < groups_per_page; i++) { | ||
1318 | + | ||
1319 | + if ((first_group + i) >= EXT4_SB(sb)->s_groups_count) | ||
1320 | + break; | ||
1321 | + grp = ext4_get_group_info(sb, first_group + i); | ||
1322 | + /* take all groups write allocation | ||
1323 | + * semaphore. This make sure there is | ||
1324 | + * no block allocation going on in any | ||
1325 | + * of that groups | ||
1326 | + */ | ||
1327 | + down_write(&grp->alloc_sem); | ||
1328 | + } | ||
1329 | + return i; | ||
1330 | +} | ||
1331 | + | ||
1332 | +void ext4_mb_put_buddy_cache_lock(struct super_block *sb, | ||
1333 | + ext4_group_t group, int locked_group) | ||
1334 | +{ | ||
1335 | + int i; | ||
1336 | + int block, pnum; | ||
1337 | + int blocks_per_page; | ||
1338 | + ext4_group_t first_group; | ||
1339 | + struct ext4_group_info *grp; | ||
1340 | + | ||
1341 | + blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; | ||
1342 | + /* | ||
1343 | + * the buddy cache inode stores the block bitmap | ||
1344 | + * and buddy information in consecutive blocks. | ||
1345 | + * So for each group we need two blocks. | ||
1346 | + */ | ||
1347 | + block = group * 2; | ||
1348 | + pnum = block / blocks_per_page; | ||
1349 | + first_group = pnum * blocks_per_page / 2; | ||
1350 | + /* release locks on all the groups */ | ||
1351 | + for (i = 0; i < locked_group; i++) { | ||
1352 | + | ||
1353 | + grp = ext4_get_group_info(sb, first_group + i); | ||
1354 | + /* take all groups write allocation | ||
1355 | + * semaphore. This make sure there is | ||
1356 | + * no block allocation going on in any | ||
1357 | + * of that groups | ||
1358 | + */ | ||
1359 | + up_write(&grp->alloc_sem); | ||
1360 | + } | ||
1361 | + | ||
1362 | +} | ||
1363 | + | ||
1364 | +static int ext4_mb_init_group(struct super_block *sb, ext4_group_t group) | ||
1365 | +{ | ||
1366 | + | ||
1367 | + int ret; | ||
1368 | + void *bitmap; | ||
1369 | + int blocks_per_page; | ||
1370 | + int block, pnum, poff; | ||
1371 | + int num_grp_locked = 0; | ||
1372 | + struct ext4_group_info *this_grp; | ||
1373 | + struct ext4_sb_info *sbi = EXT4_SB(sb); | ||
1374 | + struct inode *inode = sbi->s_buddy_cache; | ||
1375 | + struct page *page = NULL, *bitmap_page = NULL; | ||
1376 | + | ||
1377 | + mb_debug("init group %lu\n", group); | ||
1378 | + blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; | ||
1379 | + this_grp = ext4_get_group_info(sb, group); | ||
1380 | + /* | ||
1381 | + * This ensures we don't add group | ||
1382 | + * to this buddy cache via resize | ||
1383 | + */ | ||
1384 | + num_grp_locked = ext4_mb_get_buddy_cache_lock(sb, group); | ||
1385 | + if (!EXT4_MB_GRP_NEED_INIT(this_grp)) { | ||
1386 | + /* | ||
1387 | + * somebody initialized the group | ||
1388 | + * return without doing anything | ||
1389 | + */ | ||
1390 | + ret = 0; | ||
1391 | + goto err; | ||
1392 | + } | ||
1393 | + /* | ||
1394 | + * the buddy cache inode stores the block bitmap | ||
1395 | + * and buddy information in consecutive blocks. | ||
1396 | + * So for each group we need two blocks. | ||
1397 | + */ | ||
1398 | + block = group * 2; | ||
1399 | + pnum = block / blocks_per_page; | ||
1400 | + poff = block % blocks_per_page; | ||
1401 | + page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); | ||
1402 | + if (page) { | ||
1403 | + BUG_ON(page->mapping != inode->i_mapping); | ||
1404 | + ret = ext4_mb_init_cache(page, NULL); | ||
1405 | + if (ret) { | ||
1406 | + unlock_page(page); | ||
1407 | + goto err; | ||
1408 | + } | ||
1409 | + unlock_page(page); | ||
1410 | + } | ||
1411 | + if (page == NULL || !PageUptodate(page)) { | ||
1412 | + ret = -EIO; | ||
1413 | + goto err; | ||
1414 | + } | ||
1415 | + mark_page_accessed(page); | ||
1416 | + bitmap_page = page; | ||
1417 | + bitmap = page_address(page) + (poff * sb->s_blocksize); | ||
1418 | + | ||
1419 | + /* init buddy cache */ | ||
1420 | + block++; | ||
1421 | + pnum = block / blocks_per_page; | ||
1422 | + poff = block % blocks_per_page; | ||
1423 | + page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); | ||
1424 | + if (page == bitmap_page) { | ||
1425 | + /* | ||
1426 | + * If both the bitmap and buddy are in | ||
1427 | + * the same page we don't need to force | ||
1428 | + * init the buddy | ||
1429 | + */ | ||
1430 | + unlock_page(page); | ||
1431 | + } else if (page) { | ||
1432 | + BUG_ON(page->mapping != inode->i_mapping); | ||
1433 | + ret = ext4_mb_init_cache(page, bitmap); | ||
1434 | + if (ret) { | ||
1435 | + unlock_page(page); | ||
1436 | + goto err; | ||
1437 | + } | ||
1438 | + unlock_page(page); | ||
1439 | + } | ||
1440 | + if (page == NULL || !PageUptodate(page)) { | ||
1441 | + ret = -EIO; | ||
1442 | + goto err; | ||
1443 | + } | ||
1444 | + mark_page_accessed(page); | ||
1445 | +err: | ||
1446 | + ext4_mb_put_buddy_cache_lock(sb, group, num_grp_locked); | ||
1447 | + if (bitmap_page) | ||
1448 | + page_cache_release(bitmap_page); | ||
1449 | + if (page) | ||
1450 | + page_cache_release(page); | ||
1451 | + return ret; | ||
1452 | +} | ||
1453 | + | ||
1454 | static noinline_for_stack int | ||
1455 | ext4_mb_regular_allocator(struct ext4_allocation_context *ac) | ||
1456 | { | ||
1457 | @@ -1775,7 +2020,7 @@ repeat: | ||
1458 | group = 0; | ||
1459 | |||
1460 | /* quick check to skip empty groups */ | ||
1461 | - grp = ext4_get_group_info(ac->ac_sb, group); | ||
1462 | + grp = ext4_get_group_info(sb, group); | ||
1463 | if (grp->bb_free == 0) | ||
1464 | continue; | ||
1465 | |||
1466 | @@ -1788,10 +2033,9 @@ repeat: | ||
1467 | * we need full data about the group | ||
1468 | * to make a good selection | ||
1469 | */ | ||
1470 | - err = ext4_mb_load_buddy(sb, group, &e4b); | ||
1471 | + err = ext4_mb_init_group(sb, group); | ||
1472 | if (err) | ||
1473 | goto out; | ||
1474 | - ext4_mb_release_desc(&e4b); | ||
1475 | } | ||
1476 | |||
1477 | /* | ||
1478 | @@ -2299,6 +2543,8 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group, | ||
1479 | } | ||
1480 | |||
1481 | INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list); | ||
1482 | + init_rwsem(&meta_group_info[i]->alloc_sem); | ||
1483 | + meta_group_info[i]->bb_free_root.rb_node = NULL;; | ||
1484 | |||
1485 | #ifdef DOUBLE_CHECK | ||
1486 | { | ||
1487 | @@ -2325,54 +2571,6 @@ exit_meta_group_info: | ||
1488 | } /* ext4_mb_add_groupinfo */ | ||
1489 | |||
1490 | /* | ||
1491 | - * Add a group to the existing groups. | ||
1492 | - * This function is used for online resize | ||
1493 | - */ | ||
1494 | -int ext4_mb_add_more_groupinfo(struct super_block *sb, ext4_group_t group, | ||
1495 | - struct ext4_group_desc *desc) | ||
1496 | -{ | ||
1497 | - struct ext4_sb_info *sbi = EXT4_SB(sb); | ||
1498 | - struct inode *inode = sbi->s_buddy_cache; | ||
1499 | - int blocks_per_page; | ||
1500 | - int block; | ||
1501 | - int pnum; | ||
1502 | - struct page *page; | ||
1503 | - int err; | ||
1504 | - | ||
1505 | - /* Add group based on group descriptor*/ | ||
1506 | - err = ext4_mb_add_groupinfo(sb, group, desc); | ||
1507 | - if (err) | ||
1508 | - return err; | ||
1509 | - | ||
1510 | - /* | ||
1511 | - * Cache pages containing dynamic mb_alloc datas (buddy and bitmap | ||
1512 | - * datas) are set not up to date so that they will be re-initilaized | ||
1513 | - * during the next call to ext4_mb_load_buddy | ||
1514 | - */ | ||
1515 | - | ||
1516 | - /* Set buddy page as not up to date */ | ||
1517 | - blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; | ||
1518 | - block = group * 2; | ||
1519 | - pnum = block / blocks_per_page; | ||
1520 | - page = find_get_page(inode->i_mapping, pnum); | ||
1521 | - if (page != NULL) { | ||
1522 | - ClearPageUptodate(page); | ||
1523 | - page_cache_release(page); | ||
1524 | - } | ||
1525 | - | ||
1526 | - /* Set bitmap page as not up to date */ | ||
1527 | - block++; | ||
1528 | - pnum = block / blocks_per_page; | ||
1529 | - page = find_get_page(inode->i_mapping, pnum); | ||
1530 | - if (page != NULL) { | ||
1531 | - ClearPageUptodate(page); | ||
1532 | - page_cache_release(page); | ||
1533 | - } | ||
1534 | - | ||
1535 | - return 0; | ||
1536 | -} | ||
1537 | - | ||
1538 | -/* | ||
1539 | * Update an existing group. | ||
1540 | * This function is used for online resize | ||
1541 | */ | ||
1542 | @@ -2495,6 +2693,8 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery) | ||
1543 | clear_opt(sbi->s_mount_opt, MBALLOC); | ||
1544 | return -ENOMEM; | ||
1545 | } | ||
1546 | + | ||
1547 | + i = (sb->s_blocksize_bits + 2) * sizeof(unsigned int); | ||
1548 | sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL); | ||
1549 | if (sbi->s_mb_maxs == NULL) { | ||
1550 | clear_opt(sbi->s_mount_opt, MBALLOC); | ||
1551 | @@ -2658,13 +2858,11 @@ int ext4_mb_release(struct super_block *sb) | ||
1552 | static noinline_for_stack void | ||
1553 | ext4_mb_free_committed_blocks(struct super_block *sb) | ||
1554 | { | ||
1555 | - struct ext4_sb_info *sbi = EXT4_SB(sb); | ||
1556 | - int err; | ||
1557 | - int i; | ||
1558 | - int count = 0; | ||
1559 | - int count2 = 0; | ||
1560 | - struct ext4_free_metadata *md; | ||
1561 | struct ext4_buddy e4b; | ||
1562 | + struct ext4_group_info *db; | ||
1563 | + struct ext4_sb_info *sbi = EXT4_SB(sb); | ||
1564 | + int err, count = 0, count2 = 0; | ||
1565 | + struct ext4_free_data *entry; | ||
1566 | |||
1567 | if (list_empty(&sbi->s_committed_transaction)) | ||
1568 | return; | ||
1569 | @@ -2672,44 +2870,46 @@ ext4_mb_free_committed_blocks(struct super_block *sb) | ||
1570 | /* there is committed blocks to be freed yet */ | ||
1571 | do { | ||
1572 | /* get next array of blocks */ | ||
1573 | - md = NULL; | ||
1574 | + entry = NULL; | ||
1575 | spin_lock(&sbi->s_md_lock); | ||
1576 | if (!list_empty(&sbi->s_committed_transaction)) { | ||
1577 | - md = list_entry(sbi->s_committed_transaction.next, | ||
1578 | - struct ext4_free_metadata, list); | ||
1579 | - list_del(&md->list); | ||
1580 | + entry = list_entry(sbi->s_committed_transaction.next, | ||
1581 | + struct ext4_free_data, list); | ||
1582 | + list_del(&entry->list); | ||
1583 | } | ||
1584 | spin_unlock(&sbi->s_md_lock); | ||
1585 | |||
1586 | - if (md == NULL) | ||
1587 | + if (entry == NULL) | ||
1588 | break; | ||
1589 | |||
1590 | mb_debug("gonna free %u blocks in group %lu (0x%p):", | ||
1591 | - md->num, md->group, md); | ||
1592 | + entry->count, entry->group, entry); | ||
1593 | |||
1594 | - err = ext4_mb_load_buddy(sb, md->group, &e4b); | ||
1595 | + err = ext4_mb_load_buddy(sb, entry->group, &e4b); | ||
1596 | /* we expect to find existing buddy because it's pinned */ | ||
1597 | BUG_ON(err != 0); | ||
1598 | |||
1599 | + db = e4b.bd_info; | ||
1600 | /* there are blocks to put in buddy to make them really free */ | ||
1601 | - count += md->num; | ||
1602 | + count += entry->count; | ||
1603 | count2++; | ||
1604 | - ext4_lock_group(sb, md->group); | ||
1605 | - for (i = 0; i < md->num; i++) { | ||
1606 | - mb_debug(" %u", md->blocks[i]); | ||
1607 | - mb_free_blocks(NULL, &e4b, md->blocks[i], 1); | ||
1608 | + ext4_lock_group(sb, entry->group); | ||
1609 | + /* Take it out of per group rb tree */ | ||
1610 | + rb_erase(&entry->node, &(db->bb_free_root)); | ||
1611 | + mb_free_blocks(NULL, &e4b, entry->start_blk, entry->count); | ||
1612 | + | ||
1613 | + if (!db->bb_free_root.rb_node) { | ||
1614 | + /* No more items in the per group rb tree | ||
1615 | + * balance refcounts from ext4_mb_free_metadata() | ||
1616 | + */ | ||
1617 | + page_cache_release(e4b.bd_buddy_page); | ||
1618 | + page_cache_release(e4b.bd_bitmap_page); | ||
1619 | } | ||
1620 | - mb_debug("\n"); | ||
1621 | - ext4_unlock_group(sb, md->group); | ||
1622 | - | ||
1623 | - /* balance refcounts from ext4_mb_free_metadata() */ | ||
1624 | - page_cache_release(e4b.bd_buddy_page); | ||
1625 | - page_cache_release(e4b.bd_bitmap_page); | ||
1626 | + ext4_unlock_group(sb, entry->group); | ||
1627 | |||
1628 | - kfree(md); | ||
1629 | + kmem_cache_free(ext4_free_ext_cachep, entry); | ||
1630 | ext4_mb_release_desc(&e4b); | ||
1631 | - | ||
1632 | - } while (md); | ||
1633 | + } while (1); | ||
1634 | |||
1635 | mb_debug("freed %u blocks in %u structures\n", count, count2); | ||
1636 | } | ||
1637 | @@ -2864,6 +3064,16 @@ int __init init_ext4_mballoc(void) | ||
1638 | kmem_cache_destroy(ext4_pspace_cachep); | ||
1639 | return -ENOMEM; | ||
1640 | } | ||
1641 | + | ||
1642 | + ext4_free_ext_cachep = | ||
1643 | + kmem_cache_create("ext4_free_block_extents", | ||
1644 | + sizeof(struct ext4_free_data), | ||
1645 | + 0, SLAB_RECLAIM_ACCOUNT, NULL); | ||
1646 | + if (ext4_free_ext_cachep == NULL) { | ||
1647 | + kmem_cache_destroy(ext4_pspace_cachep); | ||
1648 | + kmem_cache_destroy(ext4_ac_cachep); | ||
1649 | + return -ENOMEM; | ||
1650 | + } | ||
1651 | #ifdef CONFIG_PROC_FS | ||
1652 | proc_root_ext4 = proc_mkdir("fs/ext4", NULL); | ||
1653 | if (proc_root_ext4 == NULL) | ||
1654 | @@ -2880,6 +3090,7 @@ void exit_ext4_mballoc(void) | ||
1655 | #ifdef CONFIG_PROC_FS | ||
1656 | remove_proc_entry("fs/ext4", NULL); | ||
1657 | #endif | ||
1658 | + kmem_cache_destroy(ext4_free_ext_cachep); | ||
1659 | } | ||
1660 | |||
1661 | |||
1662 | @@ -2941,8 +3152,8 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, | ||
1663 | in_range(block + len - 1, ext4_inode_table(sb, gdp), | ||
1664 | EXT4_SB(sb)->s_itb_per_group)) { | ||
1665 | ext4_error(sb, __func__, | ||
1666 | - "Allocating block in system zone - block = %llu", | ||
1667 | - block); | ||
1668 | + "Allocating block %llu in system zone of %lu group\n", | ||
1669 | + block, ac->ac_b_ex.fe_group); | ||
1670 | /* File system mounted not to panic on error | ||
1671 | * Fix the bitmap and repeat the block allocation | ||
1672 | * We leak some of the blocks here. | ||
1673 | @@ -2964,10 +3175,9 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, | ||
1674 | } | ||
1675 | } | ||
1676 | #endif | ||
1677 | - mb_set_bits(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group), bitmap_bh->b_data, | ||
1678 | - ac->ac_b_ex.fe_start, ac->ac_b_ex.fe_len); | ||
1679 | - | ||
1680 | spin_lock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group)); | ||
1681 | + mb_set_bits(NULL, bitmap_bh->b_data, | ||
1682 | + ac->ac_b_ex.fe_start, ac->ac_b_ex.fe_len); | ||
1683 | if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { | ||
1684 | gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT); | ||
1685 | gdp->bg_free_blocks_count = | ||
1686 | @@ -3400,10 +3610,37 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac) | ||
1687 | ac->ac_criteria = 20; | ||
1688 | return 1; | ||
1689 | } | ||
1690 | + | ||
1691 | return 0; | ||
1692 | } | ||
1693 | |||
1694 | /* | ||
1695 | + * the function goes through all block freed in the group | ||
1696 | + * but not yet committed and marks them used in in-core bitmap. | ||
1697 | + * buddy must be generated from this bitmap | ||
1698 | + * Need to be called with ext4 group lock (ext4_lock_group) | ||
1699 | + */ | ||
1700 | +static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap, | ||
1701 | + ext4_group_t group) | ||
1702 | +{ | ||
1703 | + struct rb_node *n; | ||
1704 | + struct ext4_group_info *grp; | ||
1705 | + struct ext4_free_data *entry; | ||
1706 | + | ||
1707 | + grp = ext4_get_group_info(sb, group); | ||
1708 | + n = rb_first(&(grp->bb_free_root)); | ||
1709 | + | ||
1710 | + while (n) { | ||
1711 | + entry = rb_entry(n, struct ext4_free_data, node); | ||
1712 | + mb_set_bits(sb_bgl_lock(EXT4_SB(sb), group), | ||
1713 | + bitmap, entry->start_blk, | ||
1714 | + entry->count); | ||
1715 | + n = rb_next(n); | ||
1716 | + } | ||
1717 | + return; | ||
1718 | +} | ||
1719 | + | ||
1720 | +/* | ||
1721 | * the function goes through all preallocation in this group and marks them | ||
1722 | * used in in-core bitmap. buddy must be generated from this bitmap | ||
1723 | * Need to be called with ext4 group lock (ext4_lock_group) | ||
1724 | @@ -4166,6 +4403,7 @@ ext4_mb_initialize_context(struct ext4_allocation_context *ac, | ||
1725 | ac->ac_pa = NULL; | ||
1726 | ac->ac_bitmap_page = NULL; | ||
1727 | ac->ac_buddy_page = NULL; | ||
1728 | + ac->alloc_semp = NULL; | ||
1729 | ac->ac_lg = NULL; | ||
1730 | |||
1731 | /* we have to define context: we'll we work with a file or | ||
1732 | @@ -4346,6 +4584,8 @@ static int ext4_mb_release_context(struct ext4_allocation_context *ac) | ||
1733 | } | ||
1734 | ext4_mb_put_pa(ac, ac->ac_sb, pa); | ||
1735 | } | ||
1736 | + if (ac->alloc_semp) | ||
1737 | + up_read(ac->alloc_semp); | ||
1738 | if (ac->ac_bitmap_page) | ||
1739 | page_cache_release(ac->ac_bitmap_page); | ||
1740 | if (ac->ac_buddy_page) | ||
1741 | @@ -4449,10 +4689,14 @@ repeat: | ||
1742 | ac->ac_o_ex.fe_len < ac->ac_b_ex.fe_len) | ||
1743 | ext4_mb_new_preallocation(ac); | ||
1744 | } | ||
1745 | - | ||
1746 | if (likely(ac->ac_status == AC_STATUS_FOUND)) { | ||
1747 | *errp = ext4_mb_mark_diskspace_used(ac, handle); | ||
1748 | if (*errp == -EAGAIN) { | ||
1749 | + /* | ||
1750 | + * drop the reference that we took | ||
1751 | + * in ext4_mb_use_best_found | ||
1752 | + */ | ||
1753 | + ext4_mb_release_context(ac); | ||
1754 | ac->ac_b_ex.fe_group = 0; | ||
1755 | ac->ac_b_ex.fe_start = 0; | ||
1756 | ac->ac_b_ex.fe_len = 0; | ||
1757 | @@ -4517,65 +4761,97 @@ static void ext4_mb_poll_new_transaction(struct super_block *sb, | ||
1758 | ext4_mb_free_committed_blocks(sb); | ||
1759 | } | ||
1760 | |||
1761 | +/* | ||
1762 | + * We can merge two free data extents only if the physical blocks | ||
1763 | + * are contiguous, AND the extents were freed by the same transaction, | ||
1764 | + * AND the blocks are associated with the same group. | ||
1765 | + */ | ||
1766 | +static int can_merge(struct ext4_free_data *entry1, | ||
1767 | + struct ext4_free_data *entry2) | ||
1768 | +{ | ||
1769 | + if ((entry1->t_tid == entry2->t_tid) && | ||
1770 | + (entry1->group == entry2->group) && | ||
1771 | + ((entry1->start_blk + entry1->count) == entry2->start_blk)) | ||
1772 | + return 1; | ||
1773 | + return 0; | ||
1774 | +} | ||
1775 | + | ||
1776 | static noinline_for_stack int | ||
1777 | ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, | ||
1778 | - ext4_group_t group, ext4_grpblk_t block, int count) | ||
1779 | + struct ext4_free_data *new_entry) | ||
1780 | { | ||
1781 | + ext4_grpblk_t block; | ||
1782 | + struct ext4_free_data *entry; | ||
1783 | struct ext4_group_info *db = e4b->bd_info; | ||
1784 | struct super_block *sb = e4b->bd_sb; | ||
1785 | struct ext4_sb_info *sbi = EXT4_SB(sb); | ||
1786 | - struct ext4_free_metadata *md; | ||
1787 | - int i; | ||
1788 | + struct rb_node **n = &db->bb_free_root.rb_node, *node; | ||
1789 | + struct rb_node *parent = NULL, *new_node; | ||
1790 | |||
1791 | BUG_ON(e4b->bd_bitmap_page == NULL); | ||
1792 | BUG_ON(e4b->bd_buddy_page == NULL); | ||
1793 | |||
1794 | - ext4_lock_group(sb, group); | ||
1795 | - for (i = 0; i < count; i++) { | ||
1796 | - md = db->bb_md_cur; | ||
1797 | - if (md && db->bb_tid != handle->h_transaction->t_tid) { | ||
1798 | - db->bb_md_cur = NULL; | ||
1799 | - md = NULL; | ||
1800 | + new_node = &new_entry->node; | ||
1801 | + block = new_entry->start_blk; | ||
1802 | + | ||
1803 | + if (!*n) { | ||
1804 | + /* first free block exent. We need to | ||
1805 | + protect buddy cache from being freed, | ||
1806 | + * otherwise we'll refresh it from | ||
1807 | + * on-disk bitmap and lose not-yet-available | ||
1808 | + * blocks */ | ||
1809 | + page_cache_get(e4b->bd_buddy_page); | ||
1810 | + page_cache_get(e4b->bd_bitmap_page); | ||
1811 | + } | ||
1812 | + while (*n) { | ||
1813 | + parent = *n; | ||
1814 | + entry = rb_entry(parent, struct ext4_free_data, node); | ||
1815 | + if (block < entry->start_blk) | ||
1816 | + n = &(*n)->rb_left; | ||
1817 | + else if (block >= (entry->start_blk + entry->count)) | ||
1818 | + n = &(*n)->rb_right; | ||
1819 | + else { | ||
1820 | + ext4_error(sb, __func__, | ||
1821 | + "Double free of blocks %d (%d %d)\n", | ||
1822 | + block, entry->start_blk, entry->count); | ||
1823 | + return 0; | ||
1824 | } | ||
1825 | + } | ||
1826 | |||
1827 | - if (md == NULL) { | ||
1828 | - ext4_unlock_group(sb, group); | ||
1829 | - md = kmalloc(sizeof(*md), GFP_NOFS); | ||
1830 | - if (md == NULL) | ||
1831 | - return -ENOMEM; | ||
1832 | - md->num = 0; | ||
1833 | - md->group = group; | ||
1834 | - | ||
1835 | - ext4_lock_group(sb, group); | ||
1836 | - if (db->bb_md_cur == NULL) { | ||
1837 | - spin_lock(&sbi->s_md_lock); | ||
1838 | - list_add(&md->list, &sbi->s_active_transaction); | ||
1839 | - spin_unlock(&sbi->s_md_lock); | ||
1840 | - /* protect buddy cache from being freed, | ||
1841 | - * otherwise we'll refresh it from | ||
1842 | - * on-disk bitmap and lose not-yet-available | ||
1843 | - * blocks */ | ||
1844 | - page_cache_get(e4b->bd_buddy_page); | ||
1845 | - page_cache_get(e4b->bd_bitmap_page); | ||
1846 | - db->bb_md_cur = md; | ||
1847 | - db->bb_tid = handle->h_transaction->t_tid; | ||
1848 | - mb_debug("new md 0x%p for group %lu\n", | ||
1849 | - md, md->group); | ||
1850 | - } else { | ||
1851 | - kfree(md); | ||
1852 | - md = db->bb_md_cur; | ||
1853 | - } | ||
1854 | + rb_link_node(new_node, parent, n); | ||
1855 | + rb_insert_color(new_node, &db->bb_free_root); | ||
1856 | + | ||
1857 | + /* Now try to see the extent can be merged to left and right */ | ||
1858 | + node = rb_prev(new_node); | ||
1859 | + if (node) { | ||
1860 | + entry = rb_entry(node, struct ext4_free_data, node); | ||
1861 | + if (can_merge(entry, new_entry)) { | ||
1862 | + new_entry->start_blk = entry->start_blk; | ||
1863 | + new_entry->count += entry->count; | ||
1864 | + rb_erase(node, &(db->bb_free_root)); | ||
1865 | + spin_lock(&sbi->s_md_lock); | ||
1866 | + list_del(&entry->list); | ||
1867 | + spin_unlock(&sbi->s_md_lock); | ||
1868 | + kmem_cache_free(ext4_free_ext_cachep, entry); | ||
1869 | } | ||
1870 | + } | ||
1871 | |||
1872 | - BUG_ON(md->num >= EXT4_BB_MAX_BLOCKS); | ||
1873 | - md->blocks[md->num] = block + i; | ||
1874 | - md->num++; | ||
1875 | - if (md->num == EXT4_BB_MAX_BLOCKS) { | ||
1876 | - /* no more space, put full container on a sb's list */ | ||
1877 | - db->bb_md_cur = NULL; | ||
1878 | + node = rb_next(new_node); | ||
1879 | + if (node) { | ||
1880 | + entry = rb_entry(node, struct ext4_free_data, node); | ||
1881 | + if (can_merge(new_entry, entry)) { | ||
1882 | + new_entry->count += entry->count; | ||
1883 | + rb_erase(node, &(db->bb_free_root)); | ||
1884 | + spin_lock(&sbi->s_md_lock); | ||
1885 | + list_del(&entry->list); | ||
1886 | + spin_unlock(&sbi->s_md_lock); | ||
1887 | + kmem_cache_free(ext4_free_ext_cachep, entry); | ||
1888 | } | ||
1889 | } | ||
1890 | - ext4_unlock_group(sb, group); | ||
1891 | + /* Add the extent to active_transaction list */ | ||
1892 | + spin_lock(&sbi->s_md_lock); | ||
1893 | + list_add(&new_entry->list, &sbi->s_active_transaction); | ||
1894 | + spin_unlock(&sbi->s_md_lock); | ||
1895 | return 0; | ||
1896 | } | ||
1897 | |||
1898 | @@ -4675,11 +4951,6 @@ do_more: | ||
1899 | err = ext4_journal_get_write_access(handle, gd_bh); | ||
1900 | if (err) | ||
1901 | goto error_return; | ||
1902 | - | ||
1903 | - err = ext4_mb_load_buddy(sb, block_group, &e4b); | ||
1904 | - if (err) | ||
1905 | - goto error_return; | ||
1906 | - | ||
1907 | #ifdef AGGRESSIVE_CHECK | ||
1908 | { | ||
1909 | int i; | ||
1910 | @@ -4687,13 +4958,6 @@ do_more: | ||
1911 | BUG_ON(!mb_test_bit(bit + i, bitmap_bh->b_data)); | ||
1912 | } | ||
1913 | #endif | ||
1914 | - mb_clear_bits(sb_bgl_lock(sbi, block_group), bitmap_bh->b_data, | ||
1915 | - bit, count); | ||
1916 | - | ||
1917 | - /* We dirtied the bitmap block */ | ||
1918 | - BUFFER_TRACE(bitmap_bh, "dirtied bitmap block"); | ||
1919 | - err = ext4_journal_dirty_metadata(handle, bitmap_bh); | ||
1920 | - | ||
1921 | if (ac) { | ||
1922 | ac->ac_b_ex.fe_group = block_group; | ||
1923 | ac->ac_b_ex.fe_start = bit; | ||
1924 | @@ -4701,12 +4965,33 @@ do_more: | ||
1925 | ext4_mb_store_history(ac); | ||
1926 | } | ||
1927 | |||
1928 | + err = ext4_mb_load_buddy(sb, block_group, &e4b); | ||
1929 | + if (err) | ||
1930 | + goto error_return; | ||
1931 | if (metadata) { | ||
1932 | - /* blocks being freed are metadata. these blocks shouldn't | ||
1933 | - * be used until this transaction is committed */ | ||
1934 | - ext4_mb_free_metadata(handle, &e4b, block_group, bit, count); | ||
1935 | + struct ext4_free_data *new_entry; | ||
1936 | + /* | ||
1937 | + * blocks being freed are metadata. these blocks shouldn't | ||
1938 | + * be used until this transaction is committed | ||
1939 | + */ | ||
1940 | + new_entry = kmem_cache_alloc(ext4_free_ext_cachep, GFP_NOFS); | ||
1941 | + new_entry->start_blk = bit; | ||
1942 | + new_entry->group = block_group; | ||
1943 | + new_entry->count = count; | ||
1944 | + new_entry->t_tid = handle->h_transaction->t_tid; | ||
1945 | + ext4_lock_group(sb, block_group); | ||
1946 | + mb_clear_bits(sb_bgl_lock(sbi, block_group), bitmap_bh->b_data, | ||
1947 | + bit, count); | ||
1948 | + ext4_mb_free_metadata(handle, &e4b, new_entry); | ||
1949 | + ext4_unlock_group(sb, block_group); | ||
1950 | } else { | ||
1951 | ext4_lock_group(sb, block_group); | ||
1952 | + /* need to update group_info->bb_free and bitmap | ||
1953 | + * with group lock held. generate_buddy look at | ||
1954 | + * them with group lock_held | ||
1955 | + */ | ||
1956 | + mb_clear_bits(sb_bgl_lock(sbi, block_group), bitmap_bh->b_data, | ||
1957 | + bit, count); | ||
1958 | mb_free_blocks(inode, &e4b, bit, count); | ||
1959 | ext4_mb_return_to_preallocation(inode, &e4b, block, count); | ||
1960 | ext4_unlock_group(sb, block_group); | ||
1961 | @@ -4729,6 +5014,10 @@ do_more: | ||
1962 | |||
1963 | *freed += count; | ||
1964 | |||
1965 | + /* We dirtied the bitmap block */ | ||
1966 | + BUFFER_TRACE(bitmap_bh, "dirtied bitmap block"); | ||
1967 | + err = ext4_journal_dirty_metadata(handle, bitmap_bh); | ||
1968 | + | ||
1969 | /* And the group descriptor block */ | ||
1970 | BUFFER_TRACE(gd_bh, "dirtied group descriptor block"); | ||
1971 | ret = ext4_journal_dirty_metadata(handle, gd_bh); | ||
1972 | diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h | ||
1973 | index c7c9906..0a28dd3 100644 | ||
1974 | --- a/fs/ext4/mballoc.h | ||
1975 | +++ b/fs/ext4/mballoc.h | ||
1976 | @@ -18,6 +18,7 @@ | ||
1977 | #include <linux/pagemap.h> | ||
1978 | #include <linux/seq_file.h> | ||
1979 | #include <linux/version.h> | ||
1980 | +#include <linux/mutex.h> | ||
1981 | #include "ext4_jbd2.h" | ||
1982 | #include "ext4.h" | ||
1983 | #include "group.h" | ||
1984 | @@ -96,25 +97,27 @@ | ||
1985 | */ | ||
1986 | #define MB_DEFAULT_GROUP_PREALLOC 512 | ||
1987 | |||
1988 | -static struct kmem_cache *ext4_pspace_cachep; | ||
1989 | -static struct kmem_cache *ext4_ac_cachep; | ||
1990 | +struct ext4_free_data { | ||
1991 | + /* this links the free block information from group_info */ | ||
1992 | + struct rb_node node; | ||
1993 | |||
1994 | -#ifdef EXT4_BB_MAX_BLOCKS | ||
1995 | -#undef EXT4_BB_MAX_BLOCKS | ||
1996 | -#endif | ||
1997 | -#define EXT4_BB_MAX_BLOCKS 30 | ||
1998 | + /* this links the free block information from ext4_sb_info */ | ||
1999 | + struct list_head list; | ||
2000 | |||
2001 | -struct ext4_free_metadata { | ||
2002 | + /* group which free block extent belongs */ | ||
2003 | ext4_group_t group; | ||
2004 | - unsigned short num; | ||
2005 | - ext4_grpblk_t blocks[EXT4_BB_MAX_BLOCKS]; | ||
2006 | - struct list_head list; | ||
2007 | + | ||
2008 | + /* free block extent */ | ||
2009 | + ext4_grpblk_t start_blk; | ||
2010 | + ext4_grpblk_t count; | ||
2011 | + | ||
2012 | + /* transaction which freed this extent */ | ||
2013 | + tid_t t_tid; | ||
2014 | }; | ||
2015 | |||
2016 | struct ext4_group_info { | ||
2017 | unsigned long bb_state; | ||
2018 | - unsigned long bb_tid; | ||
2019 | - struct ext4_free_metadata *bb_md_cur; | ||
2020 | + struct rb_root bb_free_root; | ||
2021 | unsigned short bb_first_free; | ||
2022 | unsigned short bb_free; | ||
2023 | unsigned short bb_fragments; | ||
2024 | @@ -122,6 +125,7 @@ struct ext4_group_info { | ||
2025 | #ifdef DOUBLE_CHECK | ||
2026 | void *bb_bitmap; | ||
2027 | #endif | ||
2028 | + struct rw_semaphore alloc_sem; | ||
2029 | unsigned short bb_counters[]; | ||
2030 | }; | ||
2031 | |||
2032 | @@ -209,6 +213,11 @@ struct ext4_allocation_context { | ||
2033 | __u8 ac_op; /* operation, for history only */ | ||
2034 | struct page *ac_bitmap_page; | ||
2035 | struct page *ac_buddy_page; | ||
2036 | + /* | ||
2037 | + * pointer to the held semaphore upon successful | ||
2038 | + * block allocation | ||
2039 | + */ | ||
2040 | + struct rw_semaphore *alloc_semp; | ||
2041 | struct ext4_prealloc_space *ac_pa; | ||
2042 | struct ext4_locality_group *ac_lg; | ||
2043 | }; | ||
2044 | @@ -242,6 +251,7 @@ struct ext4_buddy { | ||
2045 | struct super_block *bd_sb; | ||
2046 | __u16 bd_blkbits; | ||
2047 | ext4_group_t bd_group; | ||
2048 | + struct rw_semaphore *alloc_semp; | ||
2049 | }; | ||
2050 | #define EXT4_MB_BITMAP(e4b) ((e4b)->bd_bitmap) | ||
2051 | #define EXT4_MB_BUDDY(e4b) ((e4b)->bd_buddy) | ||
2052 | @@ -251,8 +261,6 @@ static inline void ext4_mb_store_history(struct ext4_allocation_context *ac) | ||
2053 | { | ||
2054 | return; | ||
2055 | } | ||
2056 | -#else | ||
2057 | -static void ext4_mb_store_history(struct ext4_allocation_context *ac); | ||
2058 | #endif | ||
2059 | |||
2060 | #define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1) | ||
2061 | @@ -260,19 +268,6 @@ static void ext4_mb_store_history(struct ext4_allocation_context *ac); | ||
2062 | static struct proc_dir_entry *proc_root_ext4; | ||
2063 | struct buffer_head *read_block_bitmap(struct super_block *, ext4_group_t); | ||
2064 | |||
2065 | -static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, | ||
2066 | - ext4_group_t group); | ||
2067 | -static void ext4_mb_poll_new_transaction(struct super_block *, handle_t *); | ||
2068 | -static void ext4_mb_free_committed_blocks(struct super_block *); | ||
2069 | -static void ext4_mb_return_to_preallocation(struct inode *inode, | ||
2070 | - struct ext4_buddy *e4b, sector_t block, | ||
2071 | - int count); | ||
2072 | -static void ext4_mb_put_pa(struct ext4_allocation_context *, | ||
2073 | - struct super_block *, struct ext4_prealloc_space *pa); | ||
2074 | -static int ext4_mb_init_per_dev_proc(struct super_block *sb); | ||
2075 | -static int ext4_mb_destroy_per_dev_proc(struct super_block *sb); | ||
2076 | - | ||
2077 | - | ||
2078 | static inline void ext4_lock_group(struct super_block *sb, ext4_group_t group) | ||
2079 | { | ||
2080 | struct ext4_group_info *grinfo = ext4_get_group_info(sb, group); | ||
2081 | @@ -297,7 +292,7 @@ static inline int ext4_is_group_locked(struct super_block *sb, | ||
2082 | &(grinfo->bb_state)); | ||
2083 | } | ||
2084 | |||
2085 | -static ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb, | ||
2086 | +static inline ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb, | ||
2087 | struct ext4_free_extent *fex) | ||
2088 | { | ||
2089 | ext4_fsblk_t block; | ||
2090 | diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c | ||
2091 | index d626533..4f3628f 100644 | ||
2092 | --- a/fs/ext4/namei.c | ||
2093 | +++ b/fs/ext4/namei.c | ||
2094 | @@ -371,6 +371,8 @@ dx_probe(struct dentry *dentry, struct inode *dir, | ||
2095 | goto fail; | ||
2096 | } | ||
2097 | hinfo->hash_version = root->info.hash_version; | ||
2098 | + if (hinfo->hash_version <= DX_HASH_TEA) | ||
2099 | + hinfo->hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned; | ||
2100 | hinfo->seed = EXT4_SB(dir->i_sb)->s_hash_seed; | ||
2101 | if (dentry) | ||
2102 | ext4fs_dirhash(dentry->d_name.name, dentry->d_name.len, hinfo); | ||
2103 | @@ -640,6 +642,9 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash, | ||
2104 | dir = dir_file->f_path.dentry->d_inode; | ||
2105 | if (!(EXT4_I(dir)->i_flags & EXT4_INDEX_FL)) { | ||
2106 | hinfo.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version; | ||
2107 | + if (hinfo.hash_version <= DX_HASH_TEA) | ||
2108 | + hinfo.hash_version += | ||
2109 | + EXT4_SB(dir->i_sb)->s_hash_unsigned; | ||
2110 | hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed; | ||
2111 | count = htree_dirblock_to_tree(dir_file, dir, 0, &hinfo, | ||
2112 | start_hash, start_minor_hash); | ||
2113 | @@ -1377,7 +1382,7 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry, | ||
2114 | struct fake_dirent *fde; | ||
2115 | |||
2116 | blocksize = dir->i_sb->s_blocksize; | ||
2117 | - dxtrace(printk("Creating index\n")); | ||
2118 | + dxtrace(printk(KERN_DEBUG "Creating index: inode %lu\n", dir->i_ino)); | ||
2119 | retval = ext4_journal_get_write_access(handle, bh); | ||
2120 | if (retval) { | ||
2121 | ext4_std_error(dir->i_sb, retval); | ||
2122 | @@ -1386,6 +1391,20 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry, | ||
2123 | } | ||
2124 | root = (struct dx_root *) bh->b_data; | ||
2125 | |||
2126 | + /* The 0th block becomes the root, move the dirents out */ | ||
2127 | + fde = &root->dotdot; | ||
2128 | + de = (struct ext4_dir_entry_2 *)((char *)fde + | ||
2129 | + ext4_rec_len_from_disk(fde->rec_len)); | ||
2130 | + if ((char *) de >= (((char *) root) + blocksize)) { | ||
2131 | + ext4_error(dir->i_sb, __func__, | ||
2132 | + "invalid rec_len for '..' in inode %lu", | ||
2133 | + dir->i_ino); | ||
2134 | + brelse(bh); | ||
2135 | + return -EIO; | ||
2136 | + } | ||
2137 | + len = ((char *) root) + blocksize - (char *) de; | ||
2138 | + | ||
2139 | + /* Allocate new block for the 0th block's dirents */ | ||
2140 | bh2 = ext4_append (handle, dir, &block, &retval); | ||
2141 | if (!(bh2)) { | ||
2142 | brelse(bh); | ||
2143 | @@ -1394,11 +1413,6 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry, | ||
2144 | EXT4_I(dir)->i_flags |= EXT4_INDEX_FL; | ||
2145 | data1 = bh2->b_data; | ||
2146 | |||
2147 | - /* The 0th block becomes the root, move the dirents out */ | ||
2148 | - fde = &root->dotdot; | ||
2149 | - de = (struct ext4_dir_entry_2 *)((char *)fde + | ||
2150 | - ext4_rec_len_from_disk(fde->rec_len)); | ||
2151 | - len = ((char *) root) + blocksize - (char *) de; | ||
2152 | memcpy (data1, de, len); | ||
2153 | de = (struct ext4_dir_entry_2 *) data1; | ||
2154 | top = data1 + len; | ||
2155 | @@ -1418,6 +1432,8 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry, | ||
2156 | |||
2157 | /* Initialize as for dx_probe */ | ||
2158 | hinfo.hash_version = root->info.hash_version; | ||
2159 | + if (hinfo.hash_version <= DX_HASH_TEA) | ||
2160 | + hinfo.hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned; | ||
2161 | hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed; | ||
2162 | ext4fs_dirhash(name, namelen, &hinfo); | ||
2163 | frame = frames; | ||
2164 | diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c | ||
2165 | index 3922a8b..0070431 100644 | ||
2166 | --- a/fs/ext4/resize.c | ||
2167 | +++ b/fs/ext4/resize.c | ||
2168 | @@ -284,11 +284,9 @@ static int setup_new_group_blocks(struct super_block *sb, | ||
2169 | if ((err = extend_or_restart_transaction(handle, 2, bh))) | ||
2170 | goto exit_bh; | ||
2171 | |||
2172 | - mark_bitmap_end(input->blocks_count, EXT4_BLOCKS_PER_GROUP(sb), | ||
2173 | - bh->b_data); | ||
2174 | + mark_bitmap_end(input->blocks_count, sb->s_blocksize * 8, bh->b_data); | ||
2175 | ext4_journal_dirty_metadata(handle, bh); | ||
2176 | brelse(bh); | ||
2177 | - | ||
2178 | /* Mark unused entries in inode bitmap used */ | ||
2179 | ext4_debug("clear inode bitmap %#04llx (+%llu)\n", | ||
2180 | input->inode_bitmap, input->inode_bitmap - start); | ||
2181 | @@ -297,7 +295,7 @@ static int setup_new_group_blocks(struct super_block *sb, | ||
2182 | goto exit_journal; | ||
2183 | } | ||
2184 | |||
2185 | - mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), EXT4_BLOCKS_PER_GROUP(sb), | ||
2186 | + mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), sb->s_blocksize * 8, | ||
2187 | bh->b_data); | ||
2188 | ext4_journal_dirty_metadata(handle, bh); | ||
2189 | exit_bh: | ||
2190 | @@ -747,6 +745,7 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) | ||
2191 | struct inode *inode = NULL; | ||
2192 | handle_t *handle; | ||
2193 | int gdb_off, gdb_num; | ||
2194 | + int num_grp_locked = 0; | ||
2195 | int err, err2; | ||
2196 | |||
2197 | gdb_num = input->group / EXT4_DESC_PER_BLOCK(sb); | ||
2198 | @@ -787,6 +786,7 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) | ||
2199 | } | ||
2200 | } | ||
2201 | |||
2202 | + | ||
2203 | if ((err = verify_group_input(sb, input))) | ||
2204 | goto exit_put; | ||
2205 | |||
2206 | @@ -855,15 +855,18 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) | ||
2207 | * using the new disk blocks. | ||
2208 | */ | ||
2209 | |||
2210 | + num_grp_locked = ext4_mb_get_buddy_cache_lock(sb, input->group); | ||
2211 | /* Update group descriptor block for new group */ | ||
2212 | gdp = (struct ext4_group_desc *)((char *)primary->b_data + | ||
2213 | gdb_off * EXT4_DESC_SIZE(sb)); | ||
2214 | |||
2215 | + memset(gdp, 0, EXT4_DESC_SIZE(sb)); | ||
2216 | ext4_block_bitmap_set(sb, gdp, input->block_bitmap); /* LV FIXME */ | ||
2217 | ext4_inode_bitmap_set(sb, gdp, input->inode_bitmap); /* LV FIXME */ | ||
2218 | ext4_inode_table_set(sb, gdp, input->inode_table); /* LV FIXME */ | ||
2219 | gdp->bg_free_blocks_count = cpu_to_le16(input->free_blocks_count); | ||
2220 | gdp->bg_free_inodes_count = cpu_to_le16(EXT4_INODES_PER_GROUP(sb)); | ||
2221 | + gdp->bg_flags = cpu_to_le16(EXT4_BG_INODE_ZEROED); | ||
2222 | gdp->bg_checksum = ext4_group_desc_csum(sbi, input->group, gdp); | ||
2223 | |||
2224 | /* | ||
2225 | @@ -871,9 +874,11 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) | ||
2226 | * descriptor | ||
2227 | */ | ||
2228 | if (test_opt(sb, MBALLOC)) { | ||
2229 | - err = ext4_mb_add_more_groupinfo(sb, input->group, gdp); | ||
2230 | - if (err) | ||
2231 | + err = ext4_mb_add_groupinfo(sb, input->group, gdp); | ||
2232 | + if (err) { | ||
2233 | + ext4_mb_put_buddy_cache_lock(sb, input->group, num_grp_locked); | ||
2234 | goto exit_journal; | ||
2235 | + } | ||
2236 | } | ||
2237 | /* | ||
2238 | * Make the new blocks and inodes valid next. We do this before | ||
2239 | @@ -915,6 +920,7 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) | ||
2240 | |||
2241 | /* Update the global fs size fields */ | ||
2242 | sbi->s_groups_count++; | ||
2243 | + ext4_mb_put_buddy_cache_lock(sb, input->group, num_grp_locked); | ||
2244 | |||
2245 | ext4_journal_dirty_metadata(handle, primary); | ||
2246 | |||
2247 | @@ -976,9 +982,7 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es, | ||
2248 | struct buffer_head * bh; | ||
2249 | handle_t *handle; | ||
2250 | int err; | ||
2251 | - unsigned long freed_blocks; | ||
2252 | ext4_group_t group; | ||
2253 | - struct ext4_group_info *grp; | ||
2254 | |||
2255 | /* We don't need to worry about locking wrt other resizers just | ||
2256 | * yet: we're going to revalidate es->s_blocks_count after | ||
2257 | @@ -1077,50 +1081,13 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es, | ||
2258 | unlock_super(sb); | ||
2259 | ext4_debug("freeing blocks %llu through %llu\n", o_blocks_count, | ||
2260 | o_blocks_count + add); | ||
2261 | - ext4_free_blocks_sb(handle, sb, o_blocks_count, add, &freed_blocks); | ||
2262 | + /* We add the blocks to the bitmap and set the group need init bit */ | ||
2263 | + ext4_add_groupblocks(handle, sb, o_blocks_count, add); | ||
2264 | ext4_debug("freed blocks %llu through %llu\n", o_blocks_count, | ||
2265 | o_blocks_count + add); | ||
2266 | if ((err = ext4_journal_stop(handle))) | ||
2267 | goto exit_put; | ||
2268 | |||
2269 | - /* | ||
2270 | - * Mark mballoc pages as not up to date so that they will be updated | ||
2271 | - * next time they are loaded by ext4_mb_load_buddy. | ||
2272 | - */ | ||
2273 | - if (test_opt(sb, MBALLOC)) { | ||
2274 | - struct ext4_sb_info *sbi = EXT4_SB(sb); | ||
2275 | - struct inode *inode = sbi->s_buddy_cache; | ||
2276 | - int blocks_per_page; | ||
2277 | - int block; | ||
2278 | - int pnum; | ||
2279 | - struct page *page; | ||
2280 | - | ||
2281 | - /* Set buddy page as not up to date */ | ||
2282 | - blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; | ||
2283 | - block = group * 2; | ||
2284 | - pnum = block / blocks_per_page; | ||
2285 | - page = find_get_page(inode->i_mapping, pnum); | ||
2286 | - if (page != NULL) { | ||
2287 | - ClearPageUptodate(page); | ||
2288 | - page_cache_release(page); | ||
2289 | - } | ||
2290 | - | ||
2291 | - /* Set bitmap page as not up to date */ | ||
2292 | - block++; | ||
2293 | - pnum = block / blocks_per_page; | ||
2294 | - page = find_get_page(inode->i_mapping, pnum); | ||
2295 | - if (page != NULL) { | ||
2296 | - ClearPageUptodate(page); | ||
2297 | - page_cache_release(page); | ||
2298 | - } | ||
2299 | - | ||
2300 | - /* Get the info on the last group */ | ||
2301 | - grp = ext4_get_group_info(sb, group); | ||
2302 | - | ||
2303 | - /* Update free blocks in group info */ | ||
2304 | - ext4_mb_update_group_info(grp, add); | ||
2305 | - } | ||
2306 | - | ||
2307 | if (test_opt(sb, DEBUG)) | ||
2308 | printk(KERN_DEBUG "EXT4-fs: extended group to %llu blocks\n", | ||
2309 | ext4_blocks_count(es)); | ||
2310 | diff --git a/fs/ext4/super.c b/fs/ext4/super.c | ||
2311 | index 7726e8e..5e4491d 100644 | ||
2312 | --- a/fs/ext4/super.c | ||
2313 | +++ b/fs/ext4/super.c | ||
2314 | @@ -1493,7 +1493,6 @@ static int ext4_fill_flex_info(struct super_block *sb) | ||
2315 | ext4_group_t flex_group_count; | ||
2316 | ext4_group_t flex_group; | ||
2317 | int groups_per_flex = 0; | ||
2318 | - __u64 block_bitmap = 0; | ||
2319 | int i; | ||
2320 | |||
2321 | if (!sbi->s_es->s_log_groups_per_flex) { | ||
2322 | @@ -1516,9 +1515,6 @@ static int ext4_fill_flex_info(struct super_block *sb) | ||
2323 | goto failed; | ||
2324 | } | ||
2325 | |||
2326 | - gdp = ext4_get_group_desc(sb, 1, &bh); | ||
2327 | - block_bitmap = ext4_block_bitmap(sb, gdp) - 1; | ||
2328 | - | ||
2329 | for (i = 0; i < sbi->s_groups_count; i++) { | ||
2330 | gdp = ext4_get_group_desc(sb, i, &bh); | ||
2331 | |||
2332 | @@ -1920,8 +1916,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) | ||
2333 | struct inode *root; | ||
2334 | int ret = -EINVAL; | ||
2335 | int blocksize; | ||
2336 | - int db_count; | ||
2337 | - int i; | ||
2338 | + unsigned int db_count; | ||
2339 | + unsigned int i; | ||
2340 | int needs_recovery; | ||
2341 | __le32 features; | ||
2342 | __u64 blocks_count; | ||
2343 | @@ -2172,6 +2168,18 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) | ||
2344 | for (i = 0; i < 4; i++) | ||
2345 | sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]); | ||
2346 | sbi->s_def_hash_version = es->s_def_hash_version; | ||
2347 | + i = le32_to_cpu(es->s_flags); | ||
2348 | + if (i & EXT2_FLAGS_UNSIGNED_HASH) | ||
2349 | + sbi->s_hash_unsigned = 3; | ||
2350 | + else if ((i & EXT2_FLAGS_SIGNED_HASH) == 0) { | ||
2351 | +#ifdef __CHAR_UNSIGNED__ | ||
2352 | + es->s_flags |= cpu_to_le32(EXT2_FLAGS_UNSIGNED_HASH); | ||
2353 | + sbi->s_hash_unsigned = 3; | ||
2354 | +#else | ||
2355 | + es->s_flags |= cpu_to_le32(EXT2_FLAGS_SIGNED_HASH); | ||
2356 | +#endif | ||
2357 | + sb->s_dirt = 1; | ||
2358 | + } | ||
2359 | |||
2360 | if (sbi->s_blocks_per_group > blocksize * 8) { | ||
2361 | printk(KERN_ERR | ||
2362 | @@ -2199,20 +2207,30 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) | ||
2363 | if (EXT4_BLOCKS_PER_GROUP(sb) == 0) | ||
2364 | goto cantfind_ext4; | ||
2365 | |||
2366 | - /* ensure blocks_count calculation below doesn't sign-extend */ | ||
2367 | - if (ext4_blocks_count(es) + EXT4_BLOCKS_PER_GROUP(sb) < | ||
2368 | - le32_to_cpu(es->s_first_data_block) + 1) { | ||
2369 | - printk(KERN_WARNING "EXT4-fs: bad geometry: block count %llu, " | ||
2370 | - "first data block %u, blocks per group %lu\n", | ||
2371 | - ext4_blocks_count(es), | ||
2372 | - le32_to_cpu(es->s_first_data_block), | ||
2373 | - EXT4_BLOCKS_PER_GROUP(sb)); | ||
2374 | + /* | ||
2375 | + * It makes no sense for the first data block to be beyond the end | ||
2376 | + * of the filesystem. | ||
2377 | + */ | ||
2378 | + if (le32_to_cpu(es->s_first_data_block) >= ext4_blocks_count(es)) { | ||
2379 | + printk(KERN_WARNING "EXT4-fs: bad geometry: first data" | ||
2380 | + "block %u is beyond end of filesystem (%llu)\n", | ||
2381 | + le32_to_cpu(es->s_first_data_block), | ||
2382 | + ext4_blocks_count(es)); | ||
2383 | goto failed_mount; | ||
2384 | } | ||
2385 | blocks_count = (ext4_blocks_count(es) - | ||
2386 | le32_to_cpu(es->s_first_data_block) + | ||
2387 | EXT4_BLOCKS_PER_GROUP(sb) - 1); | ||
2388 | do_div(blocks_count, EXT4_BLOCKS_PER_GROUP(sb)); | ||
2389 | + if (blocks_count > ((uint64_t)1<<32) - EXT4_DESC_PER_BLOCK(sb)) { | ||
2390 | + printk(KERN_WARNING "EXT4-fs: groups count too large: %u " | ||
2391 | + "(block count %llu, first data block %u, " | ||
2392 | + "blocks per group %lu)\n", sbi->s_groups_count, | ||
2393 | + ext4_blocks_count(es), | ||
2394 | + le32_to_cpu(es->s_first_data_block), | ||
2395 | + EXT4_BLOCKS_PER_GROUP(sb)); | ||
2396 | + goto failed_mount; | ||
2397 | + } | ||
2398 | sbi->s_groups_count = blocks_count; | ||
2399 | db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) / | ||
2400 | EXT4_DESC_PER_BLOCK(sb); | ||
2401 | diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c | ||
2402 | index 6caf22d..b1f0756 100644 | ||
2403 | --- a/fs/jbd2/commit.c | ||
2404 | +++ b/fs/jbd2/commit.c | ||
2405 | @@ -24,6 +24,7 @@ | ||
2406 | #include <linux/crc32.h> | ||
2407 | #include <linux/writeback.h> | ||
2408 | #include <linux/backing-dev.h> | ||
2409 | +#include <linux/bio.h> | ||
2410 | |||
2411 | /* | ||
2412 | * Default IO end handler for temporary BJ_IO buffer_heads. | ||
2413 | @@ -170,12 +171,34 @@ static int journal_submit_commit_record(journal_t *journal, | ||
2414 | * This function along with journal_submit_commit_record | ||
2415 | * allows to write the commit record asynchronously. | ||
2416 | */ | ||
2417 | -static int journal_wait_on_commit_record(struct buffer_head *bh) | ||
2418 | +static int journal_wait_on_commit_record(journal_t *journal, | ||
2419 | + struct buffer_head *bh) | ||
2420 | { | ||
2421 | int ret = 0; | ||
2422 | |||
2423 | +retry: | ||
2424 | clear_buffer_dirty(bh); | ||
2425 | wait_on_buffer(bh); | ||
2426 | + if (buffer_eopnotsupp(bh) && (journal->j_flags & JBD2_BARRIER)) { | ||
2427 | + printk(KERN_WARNING | ||
2428 | + "JBD2: wait_on_commit_record: sync failed on %s - " | ||
2429 | + "disabling barriers\n", journal->j_devname); | ||
2430 | + spin_lock(&journal->j_state_lock); | ||
2431 | + journal->j_flags &= ~JBD2_BARRIER; | ||
2432 | + spin_unlock(&journal->j_state_lock); | ||
2433 | + | ||
2434 | + lock_buffer(bh); | ||
2435 | + clear_buffer_dirty(bh); | ||
2436 | + set_buffer_uptodate(bh); | ||
2437 | + bh->b_end_io = journal_end_buffer_io_sync; | ||
2438 | + | ||
2439 | + ret = submit_bh(WRITE_SYNC, bh); | ||
2440 | + if (ret) { | ||
2441 | + unlock_buffer(bh); | ||
2442 | + return ret; | ||
2443 | + } | ||
2444 | + goto retry; | ||
2445 | + } | ||
2446 | |||
2447 | if (unlikely(!buffer_uptodate(bh))) | ||
2448 | ret = -EIO; | ||
2449 | @@ -795,7 +818,7 @@ wait_for_iobuf: | ||
2450 | __jbd2_journal_abort_hard(journal); | ||
2451 | } | ||
2452 | if (!err && !is_journal_aborted(journal)) | ||
2453 | - err = journal_wait_on_commit_record(cbh); | ||
2454 | + err = journal_wait_on_commit_record(journal, cbh); | ||
2455 | |||
2456 | if (err) | ||
2457 | jbd2_journal_abort(journal, err); | ||
2458 | diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h | ||
2459 | index 66c3499..0e1bd70 100644 | ||
2460 | --- a/include/linux/jbd2.h | ||
2461 | +++ b/include/linux/jbd2.h | ||
2462 | @@ -308,7 +308,8 @@ void buffer_assertion_failure(struct buffer_head *bh); | ||
2463 | int val = (expr); \ | ||
2464 | if (!val) { \ | ||
2465 | printk(KERN_ERR \ | ||
2466 | - "EXT3-fs unexpected failure: %s;\n",# expr); \ | ||
2467 | + "JBD2 unexpected failure: %s: %s;\n", \ | ||
2468 | + __func__, #expr); \ | ||
2469 | printk(KERN_ERR why "\n"); \ | ||
2470 | } \ | ||
2471 | val; \ | ||
2472 | @@ -329,6 +330,7 @@ enum jbd_state_bits { | ||
2473 | BH_State, /* Pins most journal_head state */ | ||
2474 | BH_JournalHead, /* Pins bh->b_private and jh->b_bh */ | ||
2475 | BH_Unshadow, /* Dummy bit, for BJ_Shadow wakeup filtering */ | ||
2476 | + BH_JBDPrivateStart, /* First bit available for private use by FS */ | ||
2477 | }; | ||
2478 | |||
2479 | BUFFER_FNS(JBD, jbd) | ||
2480 | diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h | ||
2481 | index 794e546..e7e7c7d 100644 | ||
2482 | --- a/include/linux/pci_ids.h | ||
2483 | +++ b/include/linux/pci_ids.h | ||
2484 | @@ -1301,6 +1301,7 @@ | ||
2485 | #define PCI_DEVICE_ID_VIA_VT3351 0x0351 | ||
2486 | #define PCI_DEVICE_ID_VIA_VT3364 0x0364 | ||
2487 | #define PCI_DEVICE_ID_VIA_8371_0 0x0391 | ||
2488 | +#define PCI_DEVICE_ID_VIA_6415 0x0415 | ||
2489 | #define PCI_DEVICE_ID_VIA_8501_0 0x0501 | ||
2490 | #define PCI_DEVICE_ID_VIA_82C561 0x0561 | ||
2491 | #define PCI_DEVICE_ID_VIA_82C586_1 0x0571 | ||
2492 | diff --git a/include/linux/pid.h b/include/linux/pid.h | ||
2493 | index d7e98ff..93997c9 100644 | ||
2494 | --- a/include/linux/pid.h | ||
2495 | +++ b/include/linux/pid.h | ||
2496 | @@ -123,6 +123,24 @@ extern struct pid *alloc_pid(struct pid_namespace *ns); | ||
2497 | extern void free_pid(struct pid *pid); | ||
2498 | |||
2499 | /* | ||
2500 | + * ns_of_pid() returns the pid namespace in which the specified pid was | ||
2501 | + * allocated. | ||
2502 | + * | ||
2503 | + * NOTE: | ||
2504 | + * ns_of_pid() is expected to be called for a process (task) that has | ||
2505 | + * an attached 'struct pid' (see attach_pid(), detach_pid()) i.e @pid | ||
2506 | + * is expected to be non-NULL. If @pid is NULL, caller should handle | ||
2507 | + * the resulting NULL pid-ns. | ||
2508 | + */ | ||
2509 | +static inline struct pid_namespace *ns_of_pid(struct pid *pid) | ||
2510 | +{ | ||
2511 | + struct pid_namespace *ns = NULL; | ||
2512 | + if (pid) | ||
2513 | + ns = pid->numbers[pid->level].ns; | ||
2514 | + return ns; | ||
2515 | +} | ||
2516 | + | ||
2517 | +/* | ||
2518 | * the helpers to get the pid's id seen from different namespaces | ||
2519 | * | ||
2520 | * pid_nr() : global id, i.e. the id seen from the init namespace; | ||
2521 | diff --git a/ipc/mqueue.c b/ipc/mqueue.c | ||
2522 | index a58bfad..ca502aa 100644 | ||
2523 | --- a/ipc/mqueue.c | ||
2524 | +++ b/ipc/mqueue.c | ||
2525 | @@ -498,7 +498,8 @@ static void __do_notify(struct mqueue_inode_info *info) | ||
2526 | sig_i.si_errno = 0; | ||
2527 | sig_i.si_code = SI_MESGQ; | ||
2528 | sig_i.si_value = info->notify.sigev_value; | ||
2529 | - sig_i.si_pid = task_tgid_vnr(current); | ||
2530 | + sig_i.si_pid = task_tgid_nr_ns(current, | ||
2531 | + ns_of_pid(info->notify_owner)); | ||
2532 | sig_i.si_uid = current->uid; | ||
2533 | |||
2534 | kill_pid_info(info->notify.sigev_signo, |