Annotation of /trunk/kernel26-magellan/patches-2.6.17-r6/0028-2.6.17-fs-fcache-v2.1.patch
Parent Directory | Revision Log
Revision 105 -
(hide annotations)
(download)
Sun Mar 11 16:17:56 2007 UTC (17 years, 6 months ago) by niro
File size: 46406 byte(s)
2.6.17-magellan-r6
1 | niro | 105 | A frontend cache for a block device. The purpose is to speedup a |
2 | fairly random but repeated read work load, like the boot of a system. | ||
3 | |||
4 | Signed-off-by: Jens Axboe <axboe@suse.de> | ||
5 | --- | ||
6 | block/ll_rw_blk.c | 11 | ||
7 | drivers/block/Kconfig | 6 | ||
8 | drivers/block/Makefile | 1 | ||
9 | drivers/block/fcache.c | 1475 ++++++++++++++++++++++++++++++++++++++++++++++++ | ||
10 | fs/ext3/super.c | 81 ++ | ||
11 | include/linux/bio.h | 9 | ||
12 | include/linux/ext3_fs.h | 14 | ||
13 | 7 files changed, 1587 insertions(+), 10 deletions(-) | ||
14 | |||
15 | Index: linux-ck-dev/block/ll_rw_blk.c | ||
16 | =================================================================== | ||
17 | --- linux-ck-dev.orig/block/ll_rw_blk.c 2006-06-18 15:20:10.000000000 +1000 | ||
18 | +++ linux-ck-dev/block/ll_rw_blk.c 2006-06-18 15:25:27.000000000 +1000 | ||
19 | @@ -2817,12 +2817,10 @@ static void init_request_from_bio(struct | ||
20 | */ | ||
21 | if (bio_rw_ahead(bio) || bio_failfast(bio)) | ||
22 | req->flags |= REQ_FAILFAST; | ||
23 | - | ||
24 | - /* | ||
25 | - * REQ_BARRIER implies no merging, but lets make it explicit | ||
26 | - */ | ||
27 | if (unlikely(bio_barrier(bio))) | ||
28 | - req->flags |= (REQ_HARDBARRIER | REQ_NOMERGE); | ||
29 | + req->flags |= REQ_HARDBARRIER; | ||
30 | + if (!bio_mergeable(bio)) | ||
31 | + req->flags |= REQ_NOMERGE; | ||
32 | |||
33 | req->errors = 0; | ||
34 | req->hard_sector = req->sector = bio->bi_sector; | ||
35 | @@ -2870,7 +2868,7 @@ static int __make_request(request_queue_ | ||
36 | |||
37 | spin_lock_irq(q->queue_lock); | ||
38 | |||
39 | - if (unlikely(barrier) || elv_queue_empty(q)) | ||
40 | + if (!bio_mergeable(bio) || elv_queue_empty(q)) | ||
41 | goto get_rq; | ||
42 | |||
43 | el_ret = elv_merge(q, &req, bio); | ||
44 | @@ -3109,6 +3107,7 @@ void submit_bio(int rw, struct bio *bio) | ||
45 | |||
46 | BIO_BUG_ON(!bio->bi_size); | ||
47 | BIO_BUG_ON(!bio->bi_io_vec); | ||
48 | + BIO_BUG_ON(bio->bi_next); | ||
49 | bio->bi_rw |= rw; | ||
50 | if (rw & WRITE) | ||
51 | mod_page_state(pgpgout, count); | ||
52 | Index: linux-ck-dev/drivers/block/Kconfig | ||
53 | =================================================================== | ||
54 | --- linux-ck-dev.orig/drivers/block/Kconfig 2006-06-18 15:20:10.000000000 +1000 | ||
55 | +++ linux-ck-dev/drivers/block/Kconfig 2006-06-18 15:25:27.000000000 +1000 | ||
56 | @@ -456,4 +456,10 @@ config ATA_OVER_ETH | ||
57 | This driver provides Support for ATA over Ethernet block | ||
58 | devices like the Coraid EtherDrive (R) Storage Blade. | ||
59 | |||
60 | +config BLK_FCACHE | ||
61 | + bool "Boot frontend cache driver" | ||
62 | + help | ||
63 | + This driver puts the data needed for a boot sequentially in a | ||
64 | + defined place, taking all seeks out of the boot process. | ||
65 | + | ||
66 | endmenu | ||
67 | Index: linux-ck-dev/drivers/block/Makefile | ||
68 | =================================================================== | ||
69 | --- linux-ck-dev.orig/drivers/block/Makefile 2006-06-18 15:20:10.000000000 +1000 | ||
70 | +++ linux-ck-dev/drivers/block/Makefile 2006-06-18 15:25:27.000000000 +1000 | ||
71 | @@ -5,6 +5,7 @@ | ||
72 | # Rewritten to use lists instead of if-statements. | ||
73 | # | ||
74 | |||
75 | +obj-$(CONFIG_BLK_FCACHE) += fcache.o | ||
76 | obj-$(CONFIG_MAC_FLOPPY) += swim3.o | ||
77 | obj-$(CONFIG_BLK_DEV_FD) += floppy.o | ||
78 | obj-$(CONFIG_AMIGA_FLOPPY) += amiflop.o | ||
79 | Index: linux-ck-dev/drivers/block/fcache.c | ||
80 | =================================================================== | ||
81 | --- /dev/null 1970-01-01 00:00:00.000000000 +0000 | ||
82 | +++ linux-ck-dev/drivers/block/fcache.c 2006-06-18 15:25:27.000000000 +1000 | ||
83 | @@ -0,0 +1,1475 @@ | ||
84 | +/* | ||
85 | + * A frontend cache for a block device. The purpose is to speedup a | ||
86 | + * fairly random but repeated read work load, like the boot of a system. | ||
87 | + * | ||
88 | + * When run in priming mode, fcache allocates and writes data read from | ||
89 | + * the source drive to our extent cache in the order in which they are | ||
90 | + * accessed. When later run in non-priming mode, data accessed in the same | ||
91 | + * order will be linearly available in the cache. | ||
92 | + * | ||
93 | + * Performance when priming is slower than non-fcache usage would be. If | ||
94 | + * the fcache is located on another disk, the hit should be small. If the | ||
95 | + * fcache is located on the same disk (another partition), it runs | ||
96 | + * at about half the speed. Non-priming performance should be fairly | ||
97 | + * similar on same/other disk. | ||
98 | + * | ||
99 | + * On-disk format is as follows: | ||
100 | + * Block0: header | ||
101 | + * Block1..X extent maps | ||
102 | + * BlockX+1..Y extent data | ||
103 | + * | ||
104 | + * Copyright (C) 2006 Jens Axboe <axboe@suse.de> | ||
105 | + * | ||
106 | + */ | ||
107 | +#include <linux/config.h> | ||
108 | +#include <linux/module.h> | ||
109 | +#include <linux/moduleparam.h> | ||
110 | +#include <linux/sched.h> | ||
111 | +#include <linux/blkdev.h> | ||
112 | +#include <linux/prio_tree.h> | ||
113 | +#include <linux/buffer_head.h> | ||
114 | +#include <linux/slab.h> | ||
115 | + | ||
116 | +#define FCACHE_MAGIC 0x61786663 | ||
117 | +#define FCACHE_VERSION 0x02 | ||
118 | + | ||
119 | +#define FCACHE_HEADER_BLOCK 0 | ||
120 | +#define FCACHE_EXTENT_BLOCK 1 | ||
121 | + | ||
122 | +#undef FCACHE_PAGES_PROTECTED | ||
123 | + | ||
124 | +struct fcache_dev { | ||
125 | + struct block_device *bdev; | ||
126 | + struct block_device *fs_bdev; | ||
127 | + make_request_fn *mfn; | ||
128 | + struct prio_tree_root prio_root; | ||
129 | + unsigned long next_cache_block; | ||
130 | + unsigned long nr_extents; | ||
131 | + unsigned long max_extents; | ||
132 | + unsigned int old_bs; | ||
133 | + spinlock_t lock; | ||
134 | + | ||
135 | + sector_t cache_start_sector; | ||
136 | + unsigned long cache_blocks; | ||
137 | + sector_t fs_start_sector; | ||
138 | + sector_t fs_sectors; | ||
139 | + | ||
140 | + unsigned long flags; | ||
141 | + int priming; | ||
142 | + int serial; | ||
143 | + int chop_ios; | ||
144 | + | ||
145 | + struct list_head list; | ||
146 | + struct work_struct work; | ||
147 | + | ||
148 | + /* | ||
149 | + * stats | ||
150 | + */ | ||
151 | + unsigned int ios[2]; | ||
152 | + unsigned int hits; | ||
153 | + unsigned int misses; | ||
154 | + unsigned int overwrites; | ||
155 | +}; | ||
156 | + | ||
157 | +enum { | ||
158 | + FDEV_F_DOWN = 0, | ||
159 | +}; | ||
160 | + | ||
161 | +static struct fcache_dev fcache_dev; | ||
162 | + | ||
163 | +static int disable; | ||
164 | +module_param(disable, int, 0444); | ||
165 | + | ||
166 | +struct fcache_endio_data { | ||
167 | + struct fcache_dev *fdev; | ||
168 | + sector_t fs_sector; | ||
169 | + unsigned int fs_size; | ||
170 | + sector_t cache_sector; | ||
171 | + atomic_t completions; | ||
172 | + struct bio *bio; | ||
173 | + int io_error; | ||
174 | + struct list_head list; | ||
175 | +}; | ||
176 | + | ||
177 | +/* | ||
178 | + * Maps a file system block to the fcache | ||
179 | + */ | ||
180 | +struct fcache_extent { | ||
181 | + sector_t fs_sector; /* real device offset */ | ||
182 | + unsigned int fs_size; /* extent length */ | ||
183 | + sector_t cache_sector; /* cache device offset */ | ||
184 | + | ||
185 | + struct prio_tree_node prio_node; | ||
186 | +}; | ||
187 | + | ||
188 | +/* | ||
189 | + * Header on fcache device - will take up the first page of data, so | ||
190 | + * plenty of room to go around. | ||
191 | + */ | ||
192 | +struct fcache_header { | ||
193 | + u32 magic; /* fcache magic */ | ||
194 | + u32 version; /* fcache version */ | ||
195 | + u32 nr_extents; /* nr of extents in cache */ | ||
196 | + u32 max_extents; /* max nr of extents available */ | ||
197 | + u32 serial; /* fs and cache serial */ | ||
198 | + u32 extent_offset; /* where extents start */ | ||
199 | + u64 fs_start_sector; /* where fs starts */ | ||
200 | + u64 fs_sectors; /* how big fs is */ | ||
201 | + char fs_dev[BDEVNAME_SIZE]; /* fs partition */ | ||
202 | + u64 cache_blocks; /* number of blocks in cache */ | ||
203 | + u64 cache_blocks_used; /* used blocks in cache */ | ||
204 | + u16 sector_t_size; /* user space helper */ | ||
205 | + u16 extent_size; /* user space helper */ | ||
206 | +}; | ||
207 | + | ||
208 | +#define BLOCK_SHIFT (PAGE_SHIFT - 9) | ||
209 | + | ||
210 | +static struct kmem_cache *fcache_slab; | ||
211 | +static struct kmem_cache *fcache_fed_slab; | ||
212 | +static mempool_t *fed_pool; | ||
213 | +static struct workqueue_struct *fcache_workqueue; | ||
214 | + | ||
215 | +static int fcache_rw_page_endio(struct bio *bio, unsigned int bytes, int err) | ||
216 | +{ | ||
217 | + if (bio->bi_size) | ||
218 | + return 1; | ||
219 | + | ||
220 | + complete(bio->bi_private); | ||
221 | + return 0; | ||
222 | +} | ||
223 | + | ||
224 | +/* | ||
225 | + * Writes out a page of data and waits for it to complete. | ||
226 | + */ | ||
227 | +static int fcache_rw_page(struct fcache_dev *fdev, sector_t index, | ||
228 | + struct page *page, int rw) | ||
229 | +{ | ||
230 | + DECLARE_COMPLETION(wait); | ||
231 | + struct bio *bio; | ||
232 | + int ret = 0; | ||
233 | + | ||
234 | + bio = bio_alloc(GFP_KERNEL, 1); | ||
235 | + | ||
236 | + bio->bi_sector = index << BLOCK_SHIFT; | ||
237 | + bio->bi_bdev = fdev->bdev; | ||
238 | + bio->bi_rw |= (1 << BIO_RW_SYNC); | ||
239 | + bio->bi_end_io = fcache_rw_page_endio; | ||
240 | + bio->bi_private = &wait; | ||
241 | + | ||
242 | + bio_add_page(bio, page, PAGE_SIZE, 0); | ||
243 | + submit_bio(rw, bio); | ||
244 | + | ||
245 | + wait_for_completion(&wait); | ||
246 | + | ||
247 | + if (!bio_flagged(bio, BIO_UPTODATE)) | ||
248 | + ret = -EIO; | ||
249 | + | ||
250 | + bio_put(bio); | ||
251 | + return ret; | ||
252 | +} | ||
253 | + | ||
254 | +static inline void fcache_fill_header(struct fcache_dev *fdev, | ||
255 | + struct fcache_header *header, | ||
256 | + unsigned int nr_extents) | ||
257 | +{ | ||
258 | + /* | ||
259 | + * See how many pages we need for extent headers, then we know where | ||
260 | + * to start putting data. Assume worst case of 1 page per extent, and | ||
261 | + * reserve the first page for the header. | ||
262 | + */ | ||
263 | + | ||
264 | + header->magic = FCACHE_MAGIC; | ||
265 | + header->version = FCACHE_VERSION; | ||
266 | + header->nr_extents = nr_extents; | ||
267 | + header->max_extents = ((fdev->cache_blocks - 1) * PAGE_SIZE) / (PAGE_SIZE - sizeof(struct fcache_extent)); | ||
268 | + header->serial = fdev->serial; | ||
269 | + | ||
270 | + header->extent_offset = 1 + (header->max_extents * sizeof(struct fcache_extent) / PAGE_SIZE); | ||
271 | + | ||
272 | + header->fs_start_sector = fdev->fs_start_sector; | ||
273 | + header->fs_sectors = fdev->fs_sectors; | ||
274 | + bdevname(fdev->fs_bdev, header->fs_dev); | ||
275 | + header->cache_blocks = fdev->cache_blocks; | ||
276 | + header->cache_blocks_used = fdev->next_cache_block; | ||
277 | + header->sector_t_size = sizeof(sector_t); | ||
278 | + header->extent_size = sizeof(struct fcache_extent); | ||
279 | +} | ||
280 | + | ||
281 | +static int fcache_write_new_header(struct fcache_dev *fdev) | ||
282 | +{ | ||
283 | + struct fcache_header *header; | ||
284 | + struct page *page; | ||
285 | + int ret; | ||
286 | + | ||
287 | + page = alloc_page(GFP_HIGHUSER); | ||
288 | + if (unlikely(!page)) | ||
289 | + return -ENOMEM; | ||
290 | + | ||
291 | + header = kmap_atomic(page, KM_USER0); | ||
292 | + clear_page(header); | ||
293 | + fcache_fill_header(fdev, header, 0); | ||
294 | + fdev->next_cache_block = header->extent_offset; | ||
295 | + fdev->max_extents = header->max_extents; | ||
296 | + kunmap_atomic(header, KM_USER0); | ||
297 | + | ||
298 | + printk("fcache: new header: first block %lu, max %lu\n", | ||
299 | + fdev->next_cache_block, fdev->max_extents); | ||
300 | + ret = fcache_rw_page(fdev, FCACHE_HEADER_BLOCK, page, WRITE); | ||
301 | + __free_page(page); | ||
302 | + return ret; | ||
303 | +} | ||
304 | + | ||
305 | +static void fcache_free_prio_tree(struct fcache_dev *fdev) | ||
306 | +{ | ||
307 | + struct fcache_extent *fe; | ||
308 | + struct prio_tree_iter iter; | ||
309 | + struct prio_tree_node *node; | ||
310 | + | ||
311 | + /* | ||
312 | + * Now prune and free tree, wish there was a better way... | ||
313 | + */ | ||
314 | + do { | ||
315 | + prio_tree_iter_init(&iter, &fdev->prio_root, 0, ULONG_MAX); | ||
316 | + | ||
317 | + node = prio_tree_next(&iter); | ||
318 | + if (!node) | ||
319 | + break; | ||
320 | + | ||
321 | + fe = prio_tree_entry(node, struct fcache_extent, prio_node); | ||
322 | + prio_tree_remove(&fdev->prio_root, node); | ||
323 | + kmem_cache_free(fcache_slab, fe); | ||
324 | + } while (1); | ||
325 | +} | ||
326 | + | ||
327 | +/* | ||
328 | + * First clear the header, write extents, then write real header. | ||
329 | + */ | ||
330 | +static int fcache_write_extents(struct fcache_dev *fdev) | ||
331 | +{ | ||
332 | + struct fcache_header *header; | ||
333 | + sector_t index, sectors; | ||
334 | + unsigned int nr_extents, this_extents; | ||
335 | + struct fcache_extent *fe; | ||
336 | + struct prio_tree_iter iter; | ||
337 | + struct prio_tree_node *node; | ||
338 | + struct page *page; | ||
339 | + void *p; | ||
340 | + int ret; | ||
341 | + | ||
342 | + page = alloc_page(GFP_KERNEL); | ||
343 | + if (unlikely(!page)) | ||
344 | + return -ENOMEM; | ||
345 | + | ||
346 | + header = page_address(page); | ||
347 | + clear_page(header); | ||
348 | + fcache_fill_header(fdev, header, 0); | ||
349 | + ret = fcache_rw_page(fdev, FCACHE_HEADER_BLOCK, page, WRITE); | ||
350 | + if (ret) | ||
351 | + goto err; | ||
352 | + | ||
353 | + /* | ||
354 | + * Now write the extents in page size chunks. | ||
355 | + */ | ||
356 | + p = page_address(page); | ||
357 | + clear_page(p); | ||
358 | + index = FCACHE_EXTENT_BLOCK; | ||
359 | + sectors = 0; | ||
360 | + this_extents = nr_extents = 0; | ||
361 | + | ||
362 | + prio_tree_iter_init(&iter, &fdev->prio_root, 0, ULONG_MAX); | ||
363 | + | ||
364 | + do { | ||
365 | + node = prio_tree_next(&iter); | ||
366 | + if (!node) | ||
367 | + break; | ||
368 | + | ||
369 | + fe = prio_tree_entry(node, struct fcache_extent, prio_node); | ||
370 | + nr_extents++; | ||
371 | + this_extents++; | ||
372 | + sectors += fe->fs_size >> 9; | ||
373 | + memcpy(p, fe, sizeof(*fe)); | ||
374 | + p += sizeof(*fe); | ||
375 | + if ((this_extents + 1) * sizeof(*fe) > PAGE_SIZE) { | ||
376 | + ret = fcache_rw_page(fdev, index, page, WRITE); | ||
377 | + if (ret) | ||
378 | + break; | ||
379 | + | ||
380 | + this_extents = 0; | ||
381 | + index++; | ||
382 | + p = page_address(page); | ||
383 | + } | ||
384 | + } while (1); | ||
385 | + | ||
386 | + if (this_extents) | ||
387 | + ret = fcache_rw_page(fdev, index, page, WRITE); | ||
388 | + | ||
389 | + fdev->nr_extents = nr_extents; | ||
390 | + printk("fcache: wrote %d extents, holding %llu sectors of data\n", | ||
391 | + nr_extents, (unsigned long long) sectors); | ||
392 | +err: | ||
393 | + __free_page(page); | ||
394 | + return ret; | ||
395 | +} | ||
396 | + | ||
397 | +static int fcache_write_header(struct fcache_dev *fdev) | ||
398 | +{ | ||
399 | + struct page *page; | ||
400 | + int ret; | ||
401 | + | ||
402 | + page = alloc_page(GFP_KERNEL); | ||
403 | + if (unlikely(!page)) | ||
404 | + return -ENOMEM; | ||
405 | + | ||
406 | + ret = fcache_rw_page(fdev, FCACHE_HEADER_BLOCK, page, READ); | ||
407 | + if (!ret) { | ||
408 | + struct fcache_header *header = page_address(page); | ||
409 | + | ||
410 | + fcache_fill_header(fdev, header, fdev->nr_extents); | ||
411 | + ret = fcache_rw_page(fdev, FCACHE_HEADER_BLOCK, page, WRITE); | ||
412 | + printk("fcache: wrote header (extents=%lu,serial=%d)\n", | ||
413 | + fdev->nr_extents, fdev->serial); | ||
414 | + } | ||
415 | + | ||
416 | + __free_page(page); | ||
417 | + return ret; | ||
418 | +} | ||
419 | + | ||
420 | +static void fcache_tree_link(struct fcache_dev *fdev, struct fcache_extent *fe) | ||
421 | +{ | ||
422 | + struct prio_tree_node *node = &fe->prio_node; | ||
423 | + unsigned long flags; | ||
424 | + | ||
425 | + INIT_PRIO_TREE_NODE(node); | ||
426 | + node->start = fe->fs_sector; | ||
427 | + node->last = fe->fs_sector + (fe->fs_size >> 9) - 1; | ||
428 | + | ||
429 | + spin_lock_irqsave(&fdev->lock, flags); | ||
430 | + prio_tree_insert(&fdev->prio_root, node); | ||
431 | + spin_unlock_irqrestore(&fdev->lock, flags); | ||
432 | +} | ||
433 | + | ||
434 | +#define MAX_FE 16 | ||
435 | + | ||
436 | +/* | ||
437 | + * Lookup the range of a given request in the prio tree. Used for both | ||
438 | + * looking up a range covering a read operation to be served from cache, | ||
439 | + * and to lookup potential conflicts from a new write with an existing | ||
440 | + * extent. | ||
441 | + */ | ||
442 | +static int fcache_lookup_extent(struct fcache_dev *fdev, sector_t offset, | ||
443 | + unsigned int bytes, struct fcache_extent **map) | ||
444 | +{ | ||
445 | + sector_t end_sector = offset + (bytes >> 9) - 1; | ||
446 | + struct prio_tree_node *node; | ||
447 | + struct prio_tree_iter iter; | ||
448 | + int i = 0; | ||
449 | + | ||
450 | + prio_tree_iter_init(&iter, &fdev->prio_root, offset, end_sector); | ||
451 | + | ||
452 | + /* | ||
453 | + * We only need to lock, if we are priming. The prio tree does | ||
454 | + * not change when in normal mode. | ||
455 | + */ | ||
456 | + if (fdev->priming) | ||
457 | + spin_lock_irq(&fdev->lock); | ||
458 | + | ||
459 | + do { | ||
460 | + node = prio_tree_next(&iter); | ||
461 | + if (!node) | ||
462 | + break; | ||
463 | + | ||
464 | + map[i] = prio_tree_entry(node, struct fcache_extent, prio_node); | ||
465 | + } while (++i < MAX_FE); | ||
466 | + | ||
467 | + if (fdev->priming) | ||
468 | + spin_unlock_irq(&fdev->lock); | ||
469 | + | ||
470 | + return i; | ||
471 | +} | ||
472 | + | ||
473 | +/* | ||
474 | + * Our data write is done, now insert the fcache extents into the rbtree. | ||
475 | + */ | ||
476 | +static int fcache_instantiate_extent(struct fcache_dev *fdev, | ||
477 | + struct fcache_endio_data *fed) | ||
478 | +{ | ||
479 | + struct fcache_extent *fe; | ||
480 | + | ||
481 | + fe = kmem_cache_alloc(fcache_slab, GFP_ATOMIC); | ||
482 | + if (fe) { | ||
483 | + fe->fs_sector = fed->fs_sector; | ||
484 | + fe->fs_size = fed->fs_size; | ||
485 | + fe->cache_sector = fed->cache_sector; | ||
486 | + | ||
487 | + fcache_tree_link(fdev, fe); | ||
488 | + return 0; | ||
489 | + } | ||
490 | + | ||
491 | + return -ENOMEM; | ||
492 | +} | ||
493 | + | ||
494 | +/* | ||
495 | + * Hang on to the bio and its pages - ideally we would want to ensure | ||
496 | + * that the page data doesn't change between calling this function and | ||
497 | + * fcache_put_bio_pages() as well... | ||
498 | + */ | ||
499 | +static void fcache_get_bio_pages(struct fcache_dev *fdev, struct bio *bio) | ||
500 | +{ | ||
501 | + /* | ||
502 | + * Currently stubbed out, as we cannot end the bio read before | ||
503 | + * the write completes without also making sure that the pages | ||
504 | + * don't get reused for something else in the mean time. | ||
505 | + */ | ||
506 | +#ifdef FCACHE_PAGES_PROTECTED | ||
507 | + struct bio_vec *bvec; | ||
508 | + int i; | ||
509 | + | ||
510 | + bio_get(bio); | ||
511 | + | ||
512 | + __bio_for_each_segment(bvec, bio, i, 0) | ||
513 | + get_page(bvec->bv_page); | ||
514 | +#endif | ||
515 | +} | ||
516 | + | ||
517 | +static void fcache_put_bio_pages(struct fcache_dev *fdev, struct bio *bio) | ||
518 | +{ | ||
519 | +#ifdef FCACHE_PAGES_PROTECTED | ||
520 | + struct bio_vec *bvec; | ||
521 | + int i; | ||
522 | + | ||
523 | + __bio_for_each_segment(bvec, bio, i, 0) | ||
524 | + put_page(bvec->bv_page); | ||
525 | + | ||
526 | + bio_put(bio); | ||
527 | +#endif | ||
528 | +} | ||
529 | + | ||
530 | +static void fcache_chop_write_done(struct fcache_endio_data *fed) | ||
531 | +{ | ||
532 | + /* | ||
533 | + * Last io completes. | ||
534 | + */ | ||
535 | + if (atomic_dec_and_test(&fed->completions)) { | ||
536 | + struct fcache_dev *fdev = fed->fdev; | ||
537 | + struct bio *bio = fed->bio; | ||
538 | + | ||
539 | + /* | ||
540 | + * Release our reference to the original bio and | ||
541 | + * its pages. | ||
542 | + */ | ||
543 | + fcache_put_bio_pages(fdev, bio); | ||
544 | + | ||
545 | + /* | ||
546 | + * End the read! | ||
547 | + */ | ||
548 | + bio_endio(bio, bio->bi_size, 0); | ||
549 | + | ||
550 | + /* | ||
551 | + * All done, now add extent to our list if io completed ok. | ||
552 | + */ | ||
553 | + if (!fed->io_error) | ||
554 | + fcache_instantiate_extent(fdev, fed); | ||
555 | + | ||
556 | + mempool_free(fed, fed_pool); | ||
557 | + } | ||
558 | +} | ||
559 | + | ||
560 | +/* | ||
561 | + * Our data write to the cache completes, we can free our clone and | ||
562 | + * instantiate the extent block. | ||
563 | + */ | ||
564 | +static int fcache_extent_write_endio(struct bio *bio, unsigned int bytes, | ||
565 | + int err) | ||
566 | +{ | ||
567 | + struct fcache_endio_data *fed; | ||
568 | + | ||
569 | + if (bio->bi_size) | ||
570 | + return 1; | ||
571 | + | ||
572 | + fed = bio->bi_private; | ||
573 | + | ||
574 | + if (!bio_flagged(bio, BIO_UPTODATE)) | ||
575 | + fed->io_error = -EIO; | ||
576 | + | ||
577 | + bio_put(bio); | ||
578 | + fcache_chop_write_done(fed); | ||
579 | + return 0; | ||
580 | +} | ||
581 | + | ||
582 | +static void fcache_chop_read_done(struct fcache_endio_data *fed) | ||
583 | +{ | ||
584 | + if (atomic_dec_and_test(&fed->completions)) { | ||
585 | + struct bio *bio = fed->bio; | ||
586 | + | ||
587 | + bio_endio(bio, bio->bi_size, fed->io_error); | ||
588 | + mempool_free(fed, fed_pool); | ||
589 | + } | ||
590 | +} | ||
591 | + | ||
592 | +static int fcache_chop_read_endio(struct bio *bio, unsigned int bytes, int err) | ||
593 | +{ | ||
594 | + struct fcache_endio_data *fed; | ||
595 | + | ||
596 | + if (bio->bi_size) | ||
597 | + return 1; | ||
598 | + | ||
599 | + fed = bio->bi_private; | ||
600 | + | ||
601 | + if (!bio_flagged(bio, BIO_UPTODATE)) | ||
602 | + fed->io_error = -EIO; | ||
603 | + | ||
604 | + bio_put(bio); | ||
605 | + fcache_chop_read_done(fed); | ||
606 | + return 0; | ||
607 | +} | ||
608 | + | ||
609 | +typedef void (chopper_done_t) (struct fcache_endio_data *); | ||
610 | + | ||
611 | +/* | ||
612 | + * This is our io chopper - it hacks a bio into smaller pieces, suitable | ||
613 | + * for the target device. Caller supplies suitable end_io and done functions. | ||
614 | + */ | ||
615 | +static void fcache_io_chopper(struct fcache_dev *fdev, | ||
616 | + struct fcache_endio_data *fed, | ||
617 | + bio_end_io_t *endio, chopper_done_t *done, int rw) | ||
618 | +{ | ||
619 | + struct bio *bio = NULL; | ||
620 | + struct bio_vec *bv; | ||
621 | + unsigned int total_bytes; | ||
622 | + sector_t sector; | ||
623 | + int i, vecs; | ||
624 | + | ||
625 | + /* | ||
626 | + * Make sure 'fed' doesn't disappear while we are still issuing | ||
627 | + * ios, the artificial reference is dropped at the end. | ||
628 | + */ | ||
629 | + atomic_set(&fed->completions, 1); | ||
630 | + | ||
631 | + sector = fed->cache_sector; | ||
632 | + total_bytes = fed->fs_size; | ||
633 | + vecs = fed->bio->bi_vcnt; | ||
634 | + bio_for_each_segment(bv, fed->bio, i) { | ||
635 | + unsigned int len; | ||
636 | + | ||
637 | + if (!total_bytes) | ||
638 | + break; | ||
639 | + | ||
640 | + len = bv->bv_len; | ||
641 | + if (len > total_bytes) | ||
642 | + len = total_bytes; | ||
643 | + | ||
644 | + do { | ||
645 | + unsigned int l; | ||
646 | + | ||
647 | + if (!bio) { | ||
648 | + bio = bio_alloc(GFP_NOFS, vecs); | ||
649 | + | ||
650 | + bio->bi_sector = sector; | ||
651 | + bio->bi_bdev = fdev->bdev; | ||
652 | + bio->bi_end_io = endio; | ||
653 | + bio->bi_private = fed; | ||
654 | + } | ||
655 | + | ||
656 | + /* | ||
657 | + * If successful, break out of this loop and move on. | ||
658 | + */ | ||
659 | + l = bio_add_page(bio, bv->bv_page, len, bv->bv_offset); | ||
660 | + if (l == len) | ||
661 | + break; | ||
662 | + | ||
663 | + BUG_ON(!bio->bi_size); | ||
664 | + | ||
665 | + /* | ||
666 | + * We could not add this page, submit what we have | ||
667 | + * and alloc a new bio. | ||
668 | + */ | ||
669 | + atomic_inc(&fed->completions); | ||
670 | + submit_bio(rw, bio); | ||
671 | + bio = NULL; | ||
672 | + } while (1); | ||
673 | + | ||
674 | + total_bytes -= len; | ||
675 | + sector += len >> 9; | ||
676 | + vecs--; | ||
677 | + } | ||
678 | + | ||
679 | + if (bio) { | ||
680 | + atomic_inc(&fed->completions); | ||
681 | + submit_bio(rw, bio); | ||
682 | + } | ||
683 | + | ||
684 | + /* | ||
685 | + * Drop our reference to fed. | ||
686 | + */ | ||
687 | + done(fed); | ||
688 | +} | ||
689 | + | ||
690 | +/* | ||
691 | + * cache device has similar or higher queue restrictions than the fs | ||
692 | + * device - in that case, we can resubmit the bio to the device directly. | ||
693 | + */ | ||
694 | +static void fcache_direct_cache_write(struct fcache_dev *fdev, | ||
695 | + struct fcache_endio_data *fed) | ||
696 | +{ | ||
697 | + struct bio *bio = bio_clone(fed->bio, GFP_NOFS); | ||
698 | + | ||
699 | + bio->bi_sector = fed->cache_sector; | ||
700 | + bio->bi_bdev = fdev->bdev; | ||
701 | + bio->bi_end_io = fcache_extent_write_endio; | ||
702 | + bio->bi_private = fed; | ||
703 | + | ||
704 | + atomic_set(&fed->completions, 1); | ||
705 | + submit_bio(WRITE, bio); | ||
706 | +} | ||
707 | + | ||
708 | +/* | ||
709 | + * cache device has more conservative restrictions than the fs device. | ||
710 | + * The safest approach is to split up the bio and let bio_add_page() | ||
711 | + * decide when it's time to submit the pieces. | ||
712 | + */ | ||
713 | +static void fcache_submit_cache_write(struct fcache_dev *fdev, | ||
714 | + struct fcache_endio_data *fed) | ||
715 | +{ | ||
716 | + if (!fdev->chop_ios) | ||
717 | + fcache_direct_cache_write(fdev, fed); | ||
718 | + else | ||
719 | + fcache_io_chopper(fdev, fed, fcache_extent_write_endio, | ||
720 | + fcache_chop_write_done, WRITE); | ||
721 | +} | ||
722 | + | ||
723 | +/* | ||
724 | + * We punt work to fcache_work() whenever we need do work that blocks. The | ||
725 | + * only thing that this thread handles is submitting the extent write | ||
726 | + * when the real read has completed. We used to do the extent instantiation | ||
727 | + * here as well, but fcache_extent_write_endio handles that now. | ||
728 | + */ | ||
729 | +static void fcache_work(void *data) | ||
730 | +{ | ||
731 | + struct fcache_dev *fdev = data; | ||
732 | + | ||
733 | + do { | ||
734 | + struct fcache_endio_data *fed = NULL; | ||
735 | + struct bio *bio; | ||
736 | + | ||
737 | + spin_lock_irq(&fdev->lock); | ||
738 | + if (!list_empty(&fdev->list)) { | ||
739 | + fed = list_entry(fdev->list.next, struct fcache_endio_data,list); | ||
740 | + list_del_init(&fed->list); | ||
741 | + } | ||
742 | + spin_unlock_irq(&fdev->lock); | ||
743 | + | ||
744 | + if (!fed) | ||
745 | + break; | ||
746 | + | ||
747 | + bio = fed->bio; | ||
748 | + | ||
749 | + if (fed->io_error) { | ||
750 | + printk(KERN_ERR "fcache: read error from device\n"); | ||
751 | + bio_endio(bio, bio->bi_size, fed->io_error); | ||
752 | + continue; | ||
753 | + } | ||
754 | + | ||
755 | + /* | ||
756 | + * Get a ref on the original bio and pages, then | ||
757 | + * we should be able to signal completion of the READ | ||
758 | + * without waiting for the write to finish first. | ||
759 | + */ | ||
760 | + fcache_get_bio_pages(fdev, bio); | ||
761 | + | ||
762 | + /* | ||
763 | + * Submit the read data as cache writes. | ||
764 | + */ | ||
765 | + fcache_submit_cache_write(fdev, fed); | ||
766 | + | ||
767 | + /* | ||
768 | + * If fcache_get_bio_pages() could protect the pages from | ||
769 | + * being changed, we could end the io here instead of in | ||
770 | + * fcache_extent_fed_completes(). | ||
771 | + */ | ||
772 | + } while (1); | ||
773 | +} | ||
774 | + | ||
775 | +/* | ||
776 | + * Align bio to start at extent and stop sooner if extent is short. Must | ||
777 | + * be called cautiously - it's only allowed to modify the bio if this is | ||
778 | + * a clone and a write request, reads must be fully aligned and only | ||
779 | + * possibly require a starting offset modification. | ||
780 | + */ | ||
781 | +static void fcache_bio_align(struct bio *bio, struct fcache_extent *fe) | ||
782 | +{ | ||
783 | + struct bio_vec *bvec; | ||
784 | + sector_t start, end; | ||
785 | + sector_t org_start, org_end; | ||
786 | + unsigned int org_size, org_idx; | ||
787 | + int i; | ||
788 | + | ||
789 | + start = bio->bi_sector; | ||
790 | + bio->bi_sector = fe->cache_sector; | ||
791 | + | ||
792 | + /* | ||
793 | + * Nothing to do, perfectly aligned. | ||
794 | + */ | ||
795 | + if (start == fe->fs_sector && bio->bi_size == fe->fs_size) | ||
796 | + return; | ||
797 | + | ||
798 | + org_start = bio->bi_sector; | ||
799 | + org_end = bio->bi_sector + (bio->bi_size >> 9); | ||
800 | + org_size = bio->bi_size; | ||
801 | + org_idx = bio->bi_idx; | ||
802 | + | ||
803 | + /* | ||
804 | + * Adjust beginning. | ||
805 | + */ | ||
806 | + if (start > fe->fs_sector) | ||
807 | + bio->bi_sector += (start - fe->fs_sector); | ||
808 | + else if (start < fe->fs_sector) { | ||
809 | + sector_t diff = fe->fs_sector - start; | ||
810 | + int idx = 0; | ||
811 | + | ||
812 | + BUG_ON(!(bio->bi_flags & (1 << BIO_CLONED))); | ||
813 | + BUG_ON(bio_data_dir(bio) != WRITE); | ||
814 | + | ||
815 | + /* | ||
816 | + * Adjust where bio starts | ||
817 | + */ | ||
818 | + __bio_for_each_segment(bvec, bio, i, 0) { | ||
819 | + unsigned int bsec = bvec->bv_len >> 9; | ||
820 | + unsigned int this_diff = bsec; | ||
821 | + | ||
822 | + if (!diff) | ||
823 | + break; | ||
824 | + if (this_diff > diff) | ||
825 | + this_diff = diff; | ||
826 | + | ||
827 | + bio->bi_sector += this_diff; | ||
828 | + bio->bi_size -= (this_diff << 9); | ||
829 | + | ||
830 | + /* | ||
831 | + * Bigger than this chunk, skip ahead. | ||
832 | + */ | ||
833 | + if (this_diff == bsec) { | ||
834 | + idx++; | ||
835 | + diff -= this_diff; | ||
836 | + continue; | ||
837 | + } | ||
838 | + | ||
839 | + /* | ||
840 | + * Adjust this bvec | ||
841 | + */ | ||
842 | + bvec->bv_offset += (this_diff << 9); | ||
843 | + bvec->bv_len -= (this_diff << 9); | ||
844 | + break; | ||
845 | + } | ||
846 | + bio->bi_idx += idx; | ||
847 | + } | ||
848 | + | ||
849 | + /* | ||
850 | + * Goes beyond the end, shrink size. | ||
851 | + */ | ||
852 | + end = bio->bi_sector + (bio->bi_size >> 9); | ||
853 | + if (end > fe->cache_sector + (fe->fs_size >> 9)) { | ||
854 | + sector_t diff = end - (fe->cache_sector + (fe->fs_size >> 9)); | ||
855 | + int vecs = 0; | ||
856 | + | ||
857 | + BUG_ON(!(bio->bi_flags & (1 << BIO_CLONED))); | ||
858 | + BUG_ON(bio_data_dir(bio) != WRITE); | ||
859 | + | ||
860 | + /* | ||
861 | + * This is __bio_for_each_segment_reverse(). | ||
862 | + */ | ||
863 | + for (i = bio->bi_vcnt - 1; i >= bio->bi_idx; i--) { | ||
864 | + struct bio_vec *bvec = &bio->bi_io_vec[i]; | ||
865 | + unsigned int bsec = bvec->bv_len >> 9; | ||
866 | + unsigned int this_diff = bsec; | ||
867 | + | ||
868 | + if (!diff) | ||
869 | + break; | ||
870 | + if (this_diff > diff) | ||
871 | + this_diff = diff; | ||
872 | + | ||
873 | + bio->bi_size -= (this_diff << 9); | ||
874 | + | ||
875 | + /* | ||
876 | + * Bigger than this chunk, skip ahead. | ||
877 | + */ | ||
878 | + if (this_diff == bsec) { | ||
879 | + vecs++; | ||
880 | + diff -= this_diff; | ||
881 | + continue; | ||
882 | + } | ||
883 | + | ||
884 | + /* | ||
885 | + * Adjust this bvec | ||
886 | + */ | ||
887 | + bvec->bv_len -= (this_diff << 9); | ||
888 | + break; | ||
889 | + } | ||
890 | + bio->bi_vcnt -= vecs; | ||
891 | + } | ||
892 | + | ||
893 | + BUG_ON(bio->bi_sector < fe->cache_sector); | ||
894 | + BUG_ON(bio->bi_sector + (bio->bi_size >> 9) > fe->cache_sector + (fe->fs_size >> 9)); | ||
895 | + | ||
896 | + /* | ||
897 | + * Invalidate the segment counts, we changed the bio layout. | ||
898 | + */ | ||
899 | + bio->bi_flags &= ~(1 << BIO_SEG_VALID); | ||
900 | + bio->bi_flags |= (1 << BIO_NOMERGE); | ||
901 | +} | ||
902 | + | ||
903 | +static int fcache_overwrite_endio(struct bio *bio, unsigned int bytes, int err) | ||
904 | +{ | ||
905 | + if (bio->bi_size) | ||
906 | + return 1; | ||
907 | + | ||
908 | + if (!bio_flagged(bio, BIO_UPTODATE)) { | ||
909 | + struct fcache_dev *fdev = bio->bi_private; | ||
910 | + | ||
911 | + printk(KERN_ERR "fcache: overwrite error, cache off\n"); | ||
912 | + set_bit(FDEV_F_DOWN, &fdev->flags); | ||
913 | + } | ||
914 | + | ||
915 | + bio_put(bio); | ||
916 | + return 0; | ||
917 | +} | ||
918 | + | ||
919 | +/* | ||
920 | + * Schedule overwrite of some existing block(s). | ||
921 | + */ | ||
922 | +static int fcache_overwrite_extent(struct fcache_dev *fdev, | ||
923 | + struct fcache_extent *fe, struct bio *bio) | ||
924 | +{ | ||
925 | + struct bio *clone = bio_clone(bio, GFP_NOFS); | ||
926 | + | ||
927 | + clone->bi_bdev = fdev->bdev; | ||
928 | + clone->bi_end_io = fcache_overwrite_endio; | ||
929 | + clone->bi_private = fdev; | ||
930 | + fcache_bio_align(clone, fe); | ||
931 | + submit_bio(WRITE, clone); | ||
932 | + return 0; | ||
933 | +} | ||
934 | + | ||
935 | +/* | ||
936 | + * Our real data read is complete. Kick our process context handler so it | ||
937 | + * can submit the write to our cache. | ||
938 | + */ | ||
939 | +static int fcache_extent_endio(struct bio *bio, unsigned int bytes, int err) | ||
940 | +{ | ||
941 | + struct fcache_dev *fdev; | ||
942 | + struct fcache_endio_data *fed; | ||
943 | + unsigned long flags; | ||
944 | + | ||
945 | + if (bio->bi_size) | ||
946 | + return 1; | ||
947 | + | ||
948 | + fed = bio->bi_private; | ||
949 | + | ||
950 | + if (!bio_flagged(bio, BIO_UPTODATE)) | ||
951 | + fed->io_error = -EIO; | ||
952 | + | ||
953 | + bio_put(bio); | ||
954 | + | ||
955 | + fdev = fed->fdev; | ||
956 | + spin_lock_irqsave(&fdev->lock, flags); | ||
957 | + list_add_tail(&fed->list, &fdev->list); | ||
958 | + spin_unlock_irqrestore(&fdev->lock, flags); | ||
959 | + queue_work(fcache_workqueue, &fdev->work); | ||
960 | + return 0; | ||
961 | +} | ||
962 | + | ||
963 | +/* | ||
964 | + * This initiates adding an extent to our list. We do this by cloning the | ||
965 | + * original bio and submitting that to the real device and when that completes | ||
966 | + * we write that out to the cache device and instantiate the extent. | ||
967 | + */ | ||
968 | +static int fcache_add_extent(struct fcache_dev *fdev, struct bio *bio) | ||
969 | +{ | ||
970 | + struct fcache_endio_data *fed; | ||
971 | + struct bio *clone; | ||
972 | + | ||
973 | + fed = mempool_alloc(fed_pool, GFP_NOIO); | ||
974 | + | ||
975 | + fed->fdev = fdev; | ||
976 | + fed->fs_sector = bio->bi_sector; | ||
977 | + fed->fs_size = bio->bi_size; | ||
978 | + fed->cache_sector = -1; | ||
979 | + fed->bio = bio; | ||
980 | + fed->io_error = 0; | ||
981 | + INIT_LIST_HEAD(&fed->list); | ||
982 | + | ||
983 | + /* | ||
984 | + * Allocate/assign an extent block for this range | ||
985 | + */ | ||
986 | + spin_lock_irq(&fdev->lock); | ||
987 | + if (fdev->nr_extents < fdev->max_extents) { | ||
988 | + unsigned long nr = (bio->bi_size + PAGE_SIZE - 1) >> PAGE_SHIFT; | ||
989 | + | ||
990 | + if (fdev->next_cache_block + nr <= fdev->cache_blocks) { | ||
991 | + fdev->nr_extents++; | ||
992 | + fed->cache_sector = fdev->next_cache_block << BLOCK_SHIFT; | ||
993 | + fdev->next_cache_block += nr; | ||
994 | + } | ||
995 | + } | ||
996 | + spin_unlock_irq(&fdev->lock); | ||
997 | + | ||
998 | + /* | ||
999 | + * Ran out of room | ||
1000 | + */ | ||
1001 | + if (fed->cache_sector == -1) { | ||
1002 | + printk(KERN_ERR "fcache: ran out of space, priming now off\n"); | ||
1003 | + fdev->priming = 0; | ||
1004 | + mempool_free(fed, fed_pool); | ||
1005 | + return -ENOENT; | ||
1006 | + } | ||
1007 | + | ||
1008 | + clone = bio_clone(bio, GFP_NOFS); | ||
1009 | + clone->bi_private = fed; | ||
1010 | + clone->bi_end_io = fcache_extent_endio; | ||
1011 | + clone->bi_rw |= (1 << BIO_RW_SYNC); | ||
1012 | + | ||
1013 | + generic_make_request(clone); | ||
1014 | + return 0; | ||
1015 | +} | ||
1016 | + | ||
1017 | +static int fcache_parse_extents(struct fcache_dev *fdev, void *addr, | ||
1018 | + unsigned int max_extents) | ||
1019 | +{ | ||
1020 | + int nr_extents = PAGE_SIZE / sizeof(struct fcache_extent); | ||
1021 | + int extents_read; | ||
1022 | + | ||
1023 | + if (nr_extents > max_extents) | ||
1024 | + nr_extents = max_extents; | ||
1025 | + | ||
1026 | + extents_read = 0; | ||
1027 | + while (nr_extents) { | ||
1028 | + struct fcache_extent *fe, *__fe = addr; | ||
1029 | + | ||
1030 | + fe = kmem_cache_alloc(fcache_slab, GFP_KERNEL); | ||
1031 | + if (unlikely(!fe)) | ||
1032 | + return -ENOMEM; | ||
1033 | + | ||
1034 | + memset(fe, 0, sizeof(*fe)); | ||
1035 | + fe->fs_sector = __fe->fs_sector; | ||
1036 | + fe->fs_size = __fe->fs_size; | ||
1037 | + fe->cache_sector = __fe->cache_sector; | ||
1038 | + | ||
1039 | + fcache_tree_link(fdev, fe); | ||
1040 | + | ||
1041 | + nr_extents--; | ||
1042 | + extents_read++; | ||
1043 | + addr += sizeof(*fe); | ||
1044 | + } | ||
1045 | + | ||
1046 | + return extents_read; | ||
1047 | +} | ||
1048 | + | ||
1049 | +static int fcache_read_extents(struct fcache_dev *fdev) | ||
1050 | +{ | ||
1051 | + unsigned int nr_extents = fdev->nr_extents; | ||
1052 | + int ret, extents, total_extents; | ||
1053 | + struct page *page; | ||
1054 | + sector_t index; | ||
1055 | + void *p; | ||
1056 | + | ||
1057 | + page = alloc_page(GFP_KERNEL); | ||
1058 | + if (unlikely(!page)) | ||
1059 | + return -ENOMEM; | ||
1060 | + | ||
1061 | + ret = 0; | ||
1062 | + total_extents = 0; | ||
1063 | + index = FCACHE_EXTENT_BLOCK; | ||
1064 | + while (nr_extents) { | ||
1065 | + ret = fcache_rw_page(fdev, index, page, READ); | ||
1066 | + if (ret) | ||
1067 | + break; | ||
1068 | + | ||
1069 | + p = page_address(page); | ||
1070 | + extents = fcache_parse_extents(fdev, p, nr_extents); | ||
1071 | + | ||
1072 | + if (extents < 0) { | ||
1073 | + ret = extents; | ||
1074 | + break; | ||
1075 | + } | ||
1076 | + | ||
1077 | + index++; | ||
1078 | + nr_extents -= extents; | ||
1079 | + total_extents += extents; | ||
1080 | + } | ||
1081 | + | ||
1082 | + __free_page(page); | ||
1083 | + | ||
1084 | + if (ret) | ||
1085 | + return ret; | ||
1086 | + | ||
1087 | + return total_extents; | ||
1088 | +} | ||
1089 | + | ||
1090 | +/* | ||
1091 | + * Read an existing fcache header from the device, and then proceed to | ||
1092 | + * reading and adding the extents to our prio tree. | ||
1093 | + */ | ||
1094 | +static int fcache_load_header(struct fcache_dev *fdev, int serial) | ||
1095 | +{ | ||
1096 | + struct fcache_header *header = NULL; | ||
1097 | + struct page *page; | ||
1098 | + int ret, wrong_serial = 0; | ||
1099 | + char b[BDEVNAME_SIZE]; | ||
1100 | + | ||
1101 | + page = alloc_page(GFP_HIGHUSER); | ||
1102 | + if (unlikely(!page)) | ||
1103 | + return -ENOMEM; | ||
1104 | + | ||
1105 | + ret = fcache_rw_page(fdev, FCACHE_HEADER_BLOCK, page, READ); | ||
1106 | + if (unlikely(ret)) | ||
1107 | + goto err; | ||
1108 | + | ||
1109 | + ret = -EINVAL; | ||
1110 | + header = kmap_atomic(page, KM_USER0); | ||
1111 | + if (header->magic != FCACHE_MAGIC) { | ||
1112 | + printk(KERN_ERR "fcache: bad magic %x\n", header->magic); | ||
1113 | + goto err; | ||
1114 | + } | ||
1115 | + if (header->version != FCACHE_VERSION) { | ||
1116 | + printk(KERN_ERR "fcache: bad version %d\n", header->version); | ||
1117 | + goto err; | ||
1118 | + } | ||
1119 | + if (strcmp(bdevname(fdev->fs_bdev, b), header->fs_dev)) { | ||
1120 | + printk(KERN_ERR "fcache: device mismatch (%s/%s\n", b, | ||
1121 | + header->fs_dev); | ||
1122 | + goto err; | ||
1123 | + } | ||
1124 | + if (header->fs_start_sector != fdev->fs_start_sector || | ||
1125 | + header->fs_sectors != fdev->fs_sectors) { | ||
1126 | + printk(KERN_ERR "fcache: fs appears to have changed size\n"); | ||
1127 | + goto err; | ||
1128 | + } | ||
1129 | + | ||
1130 | + fdev->nr_extents = header->nr_extents; | ||
1131 | + fdev->max_extents = header->max_extents; | ||
1132 | + | ||
1133 | + /* | ||
1134 | + * Don't fail on out-of-date serial, just warn that the user needs | ||
1135 | + * to prime the cache again. Until then we'll just bypass the cache. | ||
1136 | + */ | ||
1137 | + if (header->serial != serial) { | ||
1138 | + printk(KERN_ERR "fcache: found serial %d, expected %d.\n", | ||
1139 | + header->serial, serial); | ||
1140 | + printk(KERN_ERR "fcache: reprime the cache!\n"); | ||
1141 | + wrong_serial = 1; | ||
1142 | + } | ||
1143 | + | ||
1144 | + fdev->serial = header->serial; | ||
1145 | + kunmap_atomic(header, KM_USER0); | ||
1146 | + __free_page(page); | ||
1147 | + | ||
1148 | + if (!wrong_serial) { | ||
1149 | + printk("fcache: header looks valid (extents=%ld extents, serial=%u)\n", fdev->nr_extents, fdev->serial); | ||
1150 | + | ||
1151 | + ret = fcache_read_extents(fdev); | ||
1152 | + printk("fcache: loaded %d extents\n", ret); | ||
1153 | + | ||
1154 | + /* | ||
1155 | + * If we don't find all the extents we require, fail. | ||
1156 | + */ | ||
1157 | + if (ret != fdev->nr_extents) { | ||
1158 | + fcache_free_prio_tree(fdev); | ||
1159 | + ret = -EINVAL; | ||
1160 | + } else | ||
1161 | + ret = 0; | ||
1162 | + } | ||
1163 | + | ||
1164 | + return ret; | ||
1165 | +err: | ||
1166 | + __free_page(page); | ||
1167 | + if (header) | ||
1168 | + kunmap_atomic(header, KM_USER0); | ||
1169 | + return ret; | ||
1170 | +} | ||
1171 | + | ||
1172 | +/* | ||
1173 | + * We use this range to decide when to log an io to the target device. | ||
1174 | + */ | ||
1175 | +static void fcache_fill_fs_size(struct fcache_dev *fdev) | ||
1176 | +{ | ||
1177 | + struct block_device *bdev = fdev->fs_bdev; | ||
1178 | + | ||
1179 | + /* | ||
1180 | + * Partition or whole device? | ||
1181 | + */ | ||
1182 | + if (bdev != bdev->bd_contains) { | ||
1183 | + struct hd_struct *p = bdev->bd_part; | ||
1184 | + | ||
1185 | + fdev->fs_start_sector = p->start_sect; | ||
1186 | + fdev->fs_sectors = p->nr_sects; | ||
1187 | + } else { | ||
1188 | + fdev->fs_start_sector = 0; | ||
1189 | + fdev->fs_sectors = bdev->bd_inode->i_size >> 9; | ||
1190 | + } | ||
1191 | +} | ||
1192 | + | ||
1193 | +static void fcache_fill_cache_size(struct fcache_dev *fdev) | ||
1194 | +{ | ||
1195 | + struct block_device *bdev = fdev->bdev; | ||
1196 | + | ||
1197 | + /* | ||
1198 | + * Partition or whole device? | ||
1199 | + */ | ||
1200 | + if (bdev != bdev->bd_contains) { | ||
1201 | + struct hd_struct *p = bdev->bd_part; | ||
1202 | + | ||
1203 | + fdev->cache_start_sector = p->start_sect; | ||
1204 | + fdev->cache_blocks = p->nr_sects >> BLOCK_SHIFT; | ||
1205 | + } else { | ||
1206 | + fdev->cache_start_sector = 0; | ||
1207 | + fdev->cache_blocks = bdev->bd_inode->i_size >> PAGE_SHIFT; | ||
1208 | + } | ||
1209 | +} | ||
1210 | + | ||
1211 | +/* | ||
1212 | + * This is a read request, check if we have that block. If we do, then | ||
1213 | + * just redirect. If not, pass it through. | ||
1214 | + */ | ||
1215 | +static int fcache_read_request(struct fcache_dev *fdev, request_queue_t *q, | ||
1216 | + struct bio *bio) | ||
1217 | +{ | ||
1218 | + struct fcache_extent *extents[MAX_FE]; | ||
1219 | + struct fcache_extent *fe; | ||
1220 | + int i, nr; | ||
1221 | + | ||
1222 | + /* | ||
1223 | + * Not there, redirect to original but schedule adding this extent | ||
1224 | + * to our list if we are priming. | ||
1225 | + */ | ||
1226 | + nr = fcache_lookup_extent(fdev, bio->bi_sector, bio->bi_size, extents); | ||
1227 | + if (!nr) { | ||
1228 | + if (fdev->priming && !fcache_add_extent(fdev, bio)) | ||
1229 | + return 0; | ||
1230 | + | ||
1231 | + fdev->misses++; | ||
1232 | + return fdev->mfn(q, bio); | ||
1233 | + } | ||
1234 | + | ||
1235 | + /* | ||
1236 | + * If range is at least as big, we use our cache. If not, cop out | ||
1237 | + * and just submit to real device. | ||
1238 | + */ | ||
1239 | + for (i = 0; i < nr; i++) { | ||
1240 | + sector_t end_fe, end_bi; | ||
1241 | + fe = extents[i]; | ||
1242 | + | ||
1243 | + end_fe = fe->fs_sector + (fe->fs_size >> 9); | ||
1244 | + end_bi = bio->bi_sector + (bio->bi_size >> 9); | ||
1245 | + | ||
1246 | + /* | ||
1247 | + * match! | ||
1248 | + */ | ||
1249 | + if (bio->bi_sector >= fe->fs_sector && end_bi <= end_fe) | ||
1250 | + break; | ||
1251 | + | ||
1252 | + fe = NULL; | ||
1253 | + } | ||
1254 | + | ||
1255 | + /* | ||
1256 | + * Nopes, send to real device. | ||
1257 | + */ | ||
1258 | + if (!fe) { | ||
1259 | + fdev->misses++; | ||
1260 | + return fdev->mfn(q, bio); | ||
1261 | + } | ||
1262 | + | ||
1263 | + /* | ||
1264 | + * Perfect, adjust start offset if it isn't aligned. | ||
1265 | + */ | ||
1266 | + fdev->hits++; | ||
1267 | + fcache_bio_align(bio, fe); | ||
1268 | + | ||
1269 | + /* | ||
1270 | + * If we don't have to chop it up, just let generic_make_request() | ||
1271 | + * handle the stacking. Otherwise, return handled and pass to chopper. | ||
1272 | + */ | ||
1273 | + if (fdev->chop_ios) { | ||
1274 | + struct fcache_endio_data *fed; | ||
1275 | + | ||
1276 | + fed = mempool_alloc(fed_pool, GFP_NOIO); | ||
1277 | + | ||
1278 | + fed->fdev = fdev; | ||
1279 | + fed->cache_sector = bio->bi_sector; | ||
1280 | + fed->fs_size = bio->bi_size; | ||
1281 | + fed->bio = bio; | ||
1282 | + fed->io_error = 0; | ||
1283 | + fcache_io_chopper(fdev, fed, fcache_chop_read_endio, | ||
1284 | + fcache_chop_read_done, READ); | ||
1285 | + return 0; | ||
1286 | + } | ||
1287 | + | ||
1288 | + bio->bi_bdev = fdev->bdev; | ||
1289 | + return 1; | ||
1290 | +} | ||
1291 | + | ||
1292 | +/* | ||
1293 | + * If we are priming the cache, always add this block. If not, then we still | ||
1294 | + * need to overwrite this block if it's in our cache. | ||
1295 | + */ | ||
1296 | +static int fcache_write_request(struct fcache_dev *fdev, request_queue_t *q, | ||
1297 | + struct bio *bio) | ||
1298 | +{ | ||
1299 | + struct fcache_extent *extents[MAX_FE]; | ||
1300 | + struct fcache_extent *fe; | ||
1301 | + sector_t start = bio->bi_sector; | ||
1302 | + int i, nr; | ||
1303 | + | ||
1304 | +repeat: | ||
1305 | + nr = fcache_lookup_extent(fdev, bio->bi_sector, bio->bi_size, extents); | ||
1306 | + | ||
1307 | + /* | ||
1308 | + * Find out what to overwrite, if anything. | ||
1309 | + */ | ||
1310 | + for (i = 0; i < nr; i++) { | ||
1311 | + fe = extents[i]; | ||
1312 | + fdev->overwrites++; | ||
1313 | + fcache_overwrite_extent(fdev, fe, bio); | ||
1314 | + } | ||
1315 | + | ||
1316 | + /* | ||
1317 | + * If i == MAX_FE, there _may_ be more extents. Repeat lookup, start | ||
1318 | + * from the end of last request. | ||
1319 | + */ | ||
1320 | + if (i == MAX_FE) { | ||
1321 | + fe = extents[i - 1]; | ||
1322 | + start = fe->fs_sector + (fe->fs_size >> 9); | ||
1323 | + goto repeat; | ||
1324 | + } | ||
1325 | + | ||
1326 | + return fdev->mfn(q, bio); | ||
1327 | +} | ||
1328 | + | ||
1329 | +/* | ||
1330 | + * This is the only case where we resubmit an io to the device but don't | ||
1331 | + * want to count it as part of io we log. | ||
1332 | + */ | ||
1333 | +#define fcache_bio_seen(bio) ((bio)->bi_end_io == fcache_extent_endio) | ||
1334 | + | ||
1335 | +static int fcache_make_request(request_queue_t *q, struct bio *bio) | ||
1336 | +{ | ||
1337 | + struct fcache_dev *fdev = &fcache_dev; | ||
1338 | + | ||
1339 | + /* | ||
1340 | + * If it's in the sector range we are monitoring and the device isn't | ||
1341 | + * being shutdown, then pass it on. Assume a bio doesn't span into | ||
1342 | + * the next partition, so don't bother accounting for size. | ||
1343 | + */ | ||
1344 | + if ((bio->bi_sector >= fdev->fs_start_sector) && | ||
1345 | + (bio->bi_sector < (fdev->fs_start_sector + fdev->fs_sectors)) && | ||
1346 | + !test_bit(FDEV_F_DOWN, &fdev->flags) && | ||
1347 | + !fcache_bio_seen(bio)) { | ||
1348 | + | ||
1349 | + fdev->ios[bio_data_dir(bio)]++; | ||
1350 | + | ||
1351 | + if (bio_data_dir(bio) == READ) | ||
1352 | + return fcache_read_request(fdev, q, bio); | ||
1353 | + | ||
1354 | + return fcache_write_request(fdev, q, bio); | ||
1355 | + } | ||
1356 | + | ||
1357 | + /* | ||
1358 | + * Pass through to original make_request_fn. | ||
1359 | + */ | ||
1360 | + return fdev->mfn(q, bio); | ||
1361 | +} | ||
1362 | + | ||
1363 | +/* | ||
1364 | + * Attach the cache device 'bdev' to 'fdev'. | ||
1365 | + */ | ||
1366 | +static int fcache_setup_dev(struct fcache_dev *fdev, | ||
1367 | + struct block_device *fs_bdev, | ||
1368 | + struct block_device *bdev, | ||
1369 | + int priming, int serial) | ||
1370 | +{ | ||
1371 | + request_queue_t *fs_q, *cache_q; | ||
1372 | + char b[BDEVNAME_SIZE]; | ||
1373 | + int ret; | ||
1374 | + | ||
1375 | + memset(fdev, 0, sizeof(*fdev)); | ||
1376 | + INIT_PRIO_TREE_ROOT(&fdev->prio_root); | ||
1377 | + spin_lock_init(&fdev->lock); | ||
1378 | + INIT_LIST_HEAD(&fdev->list); | ||
1379 | + INIT_WORK(&fdev->work, fcache_work, fdev); | ||
1380 | + fdev->priming = priming; | ||
1381 | + fdev->fs_bdev = fs_bdev; | ||
1382 | + fdev->bdev = bdev; | ||
1383 | + | ||
1384 | + ret = -EINVAL; | ||
1385 | + | ||
1386 | + fs_q = bdev_get_queue(fs_bdev); | ||
1387 | + cache_q = bdev_get_queue(bdev); | ||
1388 | + if (!fs_q || !cache_q) | ||
1389 | + goto out; | ||
1390 | + | ||
1391 | + /* | ||
1392 | + * Chop up outgoing ios, if the target is a different queue. We could | ||
1393 | + * look closer at limits, but it's fragile and pretty pointless. | ||
1394 | + */ | ||
1395 | + if (fs_q != cache_q) | ||
1396 | + fdev->chop_ios = 1; | ||
1397 | + | ||
1398 | + ret = bd_claim(bdev, fcache_setup_dev); | ||
1399 | + if (ret < 0) | ||
1400 | + goto out; | ||
1401 | + | ||
1402 | + ret = block_size(bdev); | ||
1403 | + if (ret != PAGE_SIZE) { | ||
1404 | + fdev->old_bs = ret; | ||
1405 | + ret = set_blocksize(bdev, PAGE_SIZE); | ||
1406 | + if (ret < 0) | ||
1407 | + goto out_release; | ||
1408 | + } else | ||
1409 | + ret = 0; | ||
1410 | + | ||
1411 | + fcache_fill_cache_size(fdev); | ||
1412 | + fcache_fill_fs_size(fdev); | ||
1413 | + | ||
1414 | + if (priming) { | ||
1415 | + fdev->serial = serial; | ||
1416 | + ret = fcache_write_new_header(fdev); | ||
1417 | + } else | ||
1418 | + ret = fcache_load_header(fdev, serial); | ||
1419 | + | ||
1420 | + if (!ret) { | ||
1421 | + printk("fcache: %s opened successfully (%spriming)\n", | ||
1422 | + bdevname(bdev, b), | ||
1423 | + priming ? "" : "not "); | ||
1424 | + return 0; | ||
1425 | + } | ||
1426 | + | ||
1427 | +out_release: | ||
1428 | + bd_release(fdev->bdev); | ||
1429 | +out: | ||
1430 | + blkdev_put(fdev->bdev); | ||
1431 | + fdev->bdev = NULL; | ||
1432 | + return ret; | ||
1433 | +} | ||
1434 | + | ||
1435 | +/* | ||
1436 | + * Return fdev->bdev to its original state. | ||
1437 | + */ | ||
1438 | +static void fcache_shutdown_dev(struct fcache_dev *fdev, | ||
1439 | + struct block_device *bdev) | ||
1440 | +{ | ||
1441 | + if (fdev->bdev) { | ||
1442 | + if (fdev->mfn) { | ||
1443 | + request_queue_t *q = bdev_get_queue(bdev); | ||
1444 | + | ||
1445 | + (void) xchg(&q->make_request_fn, fdev->mfn); | ||
1446 | + } | ||
1447 | + sync_blockdev(fdev->bdev); | ||
1448 | + if (fdev->old_bs) | ||
1449 | + set_blocksize(fdev->bdev, fdev->old_bs); | ||
1450 | + | ||
1451 | + bd_release(fdev->bdev); | ||
1452 | + blkdev_put(fdev->bdev); | ||
1453 | + fdev->bdev = NULL; | ||
1454 | + INIT_PRIO_TREE_ROOT(&fdev->prio_root); | ||
1455 | + } | ||
1456 | +} | ||
1457 | + | ||
1458 | +/* | ||
1459 | + * bdev is the file system device, cache_dev is the device we want to store | ||
1460 | + * the cache on. | ||
1461 | + */ | ||
1462 | +int fcache_dev_open(struct block_device *bdev, unsigned long cache_dev, | ||
1463 | + int priming, int serial) | ||
1464 | +{ | ||
1465 | + struct block_device *fcache_bdev; | ||
1466 | + request_queue_t *q; | ||
1467 | + int ret; | ||
1468 | + | ||
1469 | + if (disable) | ||
1470 | + return 0; | ||
1471 | + if (fcache_dev.bdev) | ||
1472 | + return -EBUSY; | ||
1473 | + | ||
1474 | + fcache_bdev = open_by_devnum(cache_dev, FMODE_READ|FMODE_WRITE); | ||
1475 | + if (IS_ERR(fcache_bdev)) | ||
1476 | + return PTR_ERR(fcache_bdev); | ||
1477 | + | ||
1478 | + ret = fcache_setup_dev(&fcache_dev, bdev, fcache_bdev, priming, serial); | ||
1479 | + if (ret) | ||
1480 | + return ret; | ||
1481 | + | ||
1482 | + q = bdev_get_queue(bdev); | ||
1483 | + fcache_dev.mfn = xchg(&q->make_request_fn, fcache_make_request); | ||
1484 | + return 0; | ||
1485 | +} | ||
1486 | + | ||
1487 | +EXPORT_SYMBOL(fcache_dev_open); | ||
1488 | + | ||
1489 | +void fcache_dev_close(struct block_device *bdev, int serial) | ||
1490 | +{ | ||
1491 | + struct fcache_dev *fdev = &fcache_dev; | ||
1492 | + | ||
1493 | + if (disable) | ||
1494 | + return; | ||
1495 | + | ||
1496 | + if (!fdev->bdev) | ||
1497 | + return; | ||
1498 | + | ||
1499 | + printk("fcache: ios r/w %u/%u, hits %u, misses %u, overwrites %u\n", | ||
1500 | + fdev->ios[0], fdev->ios[1], fdev->hits, | ||
1501 | + fdev->misses, fdev->overwrites); | ||
1502 | + fdev->serial = serial; | ||
1503 | + | ||
1504 | + sync_blockdev(bdev); | ||
1505 | + set_bit(FDEV_F_DOWN, &fdev->flags); | ||
1506 | + | ||
1507 | + if (fdev->priming) | ||
1508 | + fcache_write_extents(fdev); | ||
1509 | + | ||
1510 | + fcache_write_header(fdev); | ||
1511 | + fcache_free_prio_tree(fdev); | ||
1512 | + fcache_shutdown_dev(fdev, bdev); | ||
1513 | +} | ||
1514 | + | ||
1515 | +EXPORT_SYMBOL(fcache_dev_close); | ||
1516 | + | ||
1517 | +static int fcache_init(void) | ||
1518 | +{ | ||
1519 | + fcache_slab = kmem_cache_create("fcache", sizeof(struct fcache_extent), | ||
1520 | + 0, 0, NULL, NULL); | ||
1521 | + if (!fcache_slab) | ||
1522 | + return -ENOMEM; | ||
1523 | + | ||
1524 | + fcache_fed_slab = kmem_cache_create("fcache_fed", | ||
1525 | + sizeof(struct fcache_endio_data), 0, 0, | ||
1526 | + NULL, NULL); | ||
1527 | + if (!fcache_fed_slab) { | ||
1528 | + kmem_cache_destroy(fcache_slab); | ||
1529 | + return -ENOMEM; | ||
1530 | + } | ||
1531 | + | ||
1532 | + fed_pool = mempool_create_slab_pool(1, fcache_fed_slab); | ||
1533 | + if (!fed_pool) { | ||
1534 | + kmem_cache_destroy(fcache_slab); | ||
1535 | + kmem_cache_destroy(fcache_fed_slab); | ||
1536 | + return -ENOMEM; | ||
1537 | + } | ||
1538 | + | ||
1539 | + fcache_workqueue = create_singlethread_workqueue("fcached"); | ||
1540 | + if (!fcache_workqueue) | ||
1541 | + panic("fcache: failed to create fcached\n"); | ||
1542 | + | ||
1543 | + return 0; | ||
1544 | +} | ||
1545 | + | ||
1546 | +static void fcache_exit(void) | ||
1547 | +{ | ||
1548 | + destroy_workqueue(fcache_workqueue); | ||
1549 | + kmem_cache_destroy(fcache_slab); | ||
1550 | + kmem_cache_destroy(fcache_fed_slab); | ||
1551 | + mempool_destroy(fed_pool); | ||
1552 | +} | ||
1553 | + | ||
1554 | +MODULE_AUTHOR("Jens Axboe <axboe@suse.de>"); | ||
1555 | +MODULE_LICENSE("GPL"); | ||
1556 | + | ||
1557 | +module_init(fcache_init); | ||
1558 | +module_exit(fcache_exit); | ||
1559 | Index: linux-ck-dev/fs/ext3/super.c | ||
1560 | =================================================================== | ||
1561 | --- linux-ck-dev.orig/fs/ext3/super.c 2006-06-18 15:20:10.000000000 +1000 | ||
1562 | +++ linux-ck-dev/fs/ext3/super.c 2006-06-18 15:25:27.000000000 +1000 | ||
1563 | @@ -384,11 +384,43 @@ static void dump_orphan_list(struct supe | ||
1564 | } | ||
1565 | } | ||
1566 | |||
1567 | +extern int fcache_dev_open(struct block_device *, unsigned long, int, int); | ||
1568 | +extern int fcache_dev_close(struct block_device *, int); | ||
1569 | + | ||
1570 | +static void ext3_close_fcache(struct super_block *sb) | ||
1571 | +{ | ||
1572 | + struct ext3_sb_info *sbi = EXT3_SB(sb); | ||
1573 | + struct ext3_super_block *es = sbi->s_es; | ||
1574 | + int serial = le16_to_cpu(es->s_mnt_count); | ||
1575 | + | ||
1576 | + fcache_dev_close(sb->s_bdev, serial); | ||
1577 | +} | ||
1578 | + | ||
1579 | +static int ext3_open_fcache(struct super_block *sb, unsigned long cachedev) | ||
1580 | +{ | ||
1581 | + struct ext3_sb_info *sbi = EXT3_SB(sb); | ||
1582 | + struct ext3_super_block *es = sbi->s_es; | ||
1583 | + int priming = test_opt(sb, FCACHEPRIME); | ||
1584 | + int serial = le16_to_cpu(es->s_mnt_count); | ||
1585 | + int ret; | ||
1586 | + | ||
1587 | + ret = fcache_dev_open(sb->s_bdev, cachedev, priming, serial); | ||
1588 | + if (!ret) { | ||
1589 | + set_opt(sbi->s_mount_opt, FCACHE); | ||
1590 | + return 0; | ||
1591 | + } | ||
1592 | + | ||
1593 | + printk(KERN_ERR "ext3: failed to open fcache (err=%d)\n", ret); | ||
1594 | + return ret; | ||
1595 | +} | ||
1596 | + | ||
1597 | static void ext3_put_super (struct super_block * sb) | ||
1598 | { | ||
1599 | struct ext3_sb_info *sbi = EXT3_SB(sb); | ||
1600 | struct ext3_super_block *es = sbi->s_es; | ||
1601 | - int i; | ||
1602 | + int i, has_fcache; | ||
1603 | + | ||
1604 | + has_fcache = test_opt(sb, FCACHE); | ||
1605 | |||
1606 | ext3_xattr_put_super(sb); | ||
1607 | journal_destroy(sbi->s_journal); | ||
1608 | @@ -431,6 +463,8 @@ static void ext3_put_super (struct super | ||
1609 | invalidate_bdev(sbi->journal_bdev, 0); | ||
1610 | ext3_blkdev_remove(sbi); | ||
1611 | } | ||
1612 | + if (has_fcache) | ||
1613 | + ext3_close_fcache(sb); | ||
1614 | sb->s_fs_info = NULL; | ||
1615 | kfree(sbi); | ||
1616 | return; | ||
1617 | @@ -635,7 +669,7 @@ enum { | ||
1618 | Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, | ||
1619 | Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota, | ||
1620 | Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota, | ||
1621 | - Opt_grpquota | ||
1622 | + Opt_grpquota, Opt_fcache_dev, Opt_fcache_prime, | ||
1623 | }; | ||
1624 | |||
1625 | static match_table_t tokens = { | ||
1626 | @@ -684,6 +718,8 @@ static match_table_t tokens = { | ||
1627 | {Opt_quota, "quota"}, | ||
1628 | {Opt_usrquota, "usrquota"}, | ||
1629 | {Opt_barrier, "barrier=%u"}, | ||
1630 | + {Opt_fcache_dev, "fcache_dev=%s"}, | ||
1631 | + {Opt_fcache_prime, "fcache_prime=%u"}, | ||
1632 | {Opt_err, NULL}, | ||
1633 | {Opt_resize, "resize"}, | ||
1634 | }; | ||
1635 | @@ -710,6 +746,7 @@ static unsigned long get_sb_block(void * | ||
1636 | |||
1637 | static int parse_options (char *options, struct super_block *sb, | ||
1638 | unsigned long *inum, unsigned long *journal_devnum, | ||
1639 | + unsigned long *fcache_devnum, | ||
1640 | unsigned long *n_blocks_count, int is_remount) | ||
1641 | { | ||
1642 | struct ext3_sb_info *sbi = EXT3_SB(sb); | ||
1643 | @@ -1012,6 +1049,29 @@ clear_qf_name: | ||
1644 | case Opt_nobh: | ||
1645 | set_opt(sbi->s_mount_opt, NOBH); | ||
1646 | break; | ||
1647 | + case Opt_fcache_dev: { | ||
1648 | + int maj, min; | ||
1649 | + char *p, *pm; | ||
1650 | + | ||
1651 | + if (!fcache_devnum) | ||
1652 | + break; | ||
1653 | + p = match_strdup(&args[0]); | ||
1654 | + if (!p) | ||
1655 | + return 0; | ||
1656 | + maj = simple_strtol(p, &pm, 10); | ||
1657 | + min = simple_strtol(pm + 1, NULL, 10); | ||
1658 | + *fcache_devnum = maj << MINORBITS | min; | ||
1659 | + kfree(p); | ||
1660 | + break; | ||
1661 | + } | ||
1662 | + case Opt_fcache_prime: | ||
1663 | + if (match_int(&args[0], &option)) | ||
1664 | + return 0; | ||
1665 | + if (option) | ||
1666 | + set_opt(sbi->s_mount_opt, FCACHEPRIME); | ||
1667 | + else | ||
1668 | + clear_opt(sbi->s_mount_opt, FCACHEPRIME); | ||
1669 | + break; | ||
1670 | default: | ||
1671 | printk (KERN_ERR | ||
1672 | "EXT3-fs: Unrecognized mount option \"%s\" " | ||
1673 | @@ -1346,6 +1406,7 @@ static int ext3_fill_super (struct super | ||
1674 | unsigned long offset = 0; | ||
1675 | unsigned long journal_inum = 0; | ||
1676 | unsigned long journal_devnum = 0; | ||
1677 | + unsigned long fcache_devnum = 0; | ||
1678 | unsigned long def_mount_opts; | ||
1679 | struct inode *root; | ||
1680 | int blocksize; | ||
1681 | @@ -1353,6 +1414,7 @@ static int ext3_fill_super (struct super | ||
1682 | int db_count; | ||
1683 | int i; | ||
1684 | int needs_recovery; | ||
1685 | + int fcache = 0; | ||
1686 | __le32 features; | ||
1687 | |||
1688 | sbi = kmalloc(sizeof(*sbi), GFP_KERNEL); | ||
1689 | @@ -1427,7 +1489,7 @@ static int ext3_fill_super (struct super | ||
1690 | set_opt(sbi->s_mount_opt, RESERVATION); | ||
1691 | |||
1692 | if (!parse_options ((char *) data, sb, &journal_inum, &journal_devnum, | ||
1693 | - NULL, 0)) | ||
1694 | + &fcache_devnum, NULL, 0)) | ||
1695 | goto failed_mount; | ||
1696 | |||
1697 | sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | | ||
1698 | @@ -1651,6 +1713,9 @@ static int ext3_fill_super (struct super | ||
1699 | goto failed_mount2; | ||
1700 | } | ||
1701 | |||
1702 | + if (fcache_devnum) | ||
1703 | + fcache = ext3_open_fcache(sb, fcache_devnum); | ||
1704 | + | ||
1705 | /* We have now updated the journal if required, so we can | ||
1706 | * validate the data journaling mode. */ | ||
1707 | switch (test_opt(sb, DATA_FLAGS)) { | ||
1708 | @@ -1740,6 +1805,8 @@ cantfind_ext3: | ||
1709 | goto failed_mount; | ||
1710 | |||
1711 | failed_mount3: | ||
1712 | + if (!fcache) | ||
1713 | + ext3_close_fcache(sb); | ||
1714 | journal_destroy(sbi->s_journal); | ||
1715 | failed_mount2: | ||
1716 | for (i = 0; i < db_count; i++) | ||
1717 | @@ -2205,6 +2272,7 @@ static int ext3_remount (struct super_bl | ||
1718 | struct ext3_sb_info *sbi = EXT3_SB(sb); | ||
1719 | unsigned long n_blocks_count = 0; | ||
1720 | unsigned long old_sb_flags; | ||
1721 | + unsigned long fcache_devnum = 0; | ||
1722 | struct ext3_mount_options old_opts; | ||
1723 | int err; | ||
1724 | #ifdef CONFIG_QUOTA | ||
1725 | @@ -2226,7 +2294,7 @@ static int ext3_remount (struct super_bl | ||
1726 | /* | ||
1727 | * Allow the "check" option to be passed as a remount option. | ||
1728 | */ | ||
1729 | - if (!parse_options(data, sb, NULL, NULL, &n_blocks_count, 1)) { | ||
1730 | + if (!parse_options(data, sb, NULL, NULL, &fcache_devnum, &n_blocks_count, 1)) { | ||
1731 | err = -EINVAL; | ||
1732 | goto restore_opts; | ||
1733 | } | ||
1734 | @@ -2241,6 +2309,11 @@ static int ext3_remount (struct super_bl | ||
1735 | |||
1736 | ext3_init_journal_params(sb, sbi->s_journal); | ||
1737 | |||
1738 | + if (fcache_devnum) { | ||
1739 | + ext3_close_fcache(sb); | ||
1740 | + ext3_open_fcache(sb, fcache_devnum); | ||
1741 | + } | ||
1742 | + | ||
1743 | if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY) || | ||
1744 | n_blocks_count > le32_to_cpu(es->s_blocks_count)) { | ||
1745 | if (sbi->s_mount_opt & EXT3_MOUNT_ABORT) { | ||
1746 | Index: linux-ck-dev/include/linux/bio.h | ||
1747 | =================================================================== | ||
1748 | --- linux-ck-dev.orig/include/linux/bio.h 2006-06-18 15:20:10.000000000 +1000 | ||
1749 | +++ linux-ck-dev/include/linux/bio.h 2006-06-18 15:25:27.000000000 +1000 | ||
1750 | @@ -124,6 +124,7 @@ struct bio { | ||
1751 | #define BIO_BOUNCED 5 /* bio is a bounce bio */ | ||
1752 | #define BIO_USER_MAPPED 6 /* contains user pages */ | ||
1753 | #define BIO_EOPNOTSUPP 7 /* not supported */ | ||
1754 | +#define BIO_NOMERGE 8 /* bio not mergeable */ | ||
1755 | #define bio_flagged(bio, flag) ((bio)->bi_flags & (1 << (flag))) | ||
1756 | |||
1757 | /* | ||
1758 | @@ -179,6 +180,14 @@ struct bio { | ||
1759 | #define bio_failfast(bio) ((bio)->bi_rw & (1 << BIO_RW_FAILFAST)) | ||
1760 | #define bio_rw_ahead(bio) ((bio)->bi_rw & (1 << BIO_RW_AHEAD)) | ||
1761 | |||
1762 | +static inline int bio_mergeable(struct bio *bio) | ||
1763 | +{ | ||
1764 | + if (!bio_barrier(bio) && !bio->bi_idx && !bio_flagged(bio, BIO_NOMERGE)) | ||
1765 | + return 1; | ||
1766 | + | ||
1767 | + return 0; | ||
1768 | +} | ||
1769 | + | ||
1770 | /* | ||
1771 | * will die | ||
1772 | */ | ||
1773 | Index: linux-ck-dev/include/linux/ext3_fs.h | ||
1774 | =================================================================== | ||
1775 | --- linux-ck-dev.orig/include/linux/ext3_fs.h 2006-06-18 15:20:10.000000000 +1000 | ||
1776 | +++ linux-ck-dev/include/linux/ext3_fs.h 2006-06-18 15:25:27.000000000 +1000 | ||
1777 | @@ -376,6 +376,8 @@ struct ext3_inode { | ||
1778 | #define EXT3_MOUNT_QUOTA 0x80000 /* Some quota option set */ | ||
1779 | #define EXT3_MOUNT_USRQUOTA 0x100000 /* "old" user quota */ | ||
1780 | #define EXT3_MOUNT_GRPQUOTA 0x200000 /* "old" group quota */ | ||
1781 | +#define EXT3_MOUNT_FCACHE 0x400000 /* using fcache */ | ||
1782 | +#define EXT3_MOUNT_FCACHEPRIME 0x800000 /* priming fcache */ | ||
1783 | |||
1784 | /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */ | ||
1785 | #ifndef _LINUX_EXT2_FS_H | ||
1786 | @@ -847,6 +849,18 @@ extern struct inode_operations ext3_spec | ||
1787 | extern struct inode_operations ext3_symlink_inode_operations; | ||
1788 | extern struct inode_operations ext3_fast_symlink_inode_operations; | ||
1789 | |||
1790 | +#ifndef CONFIG_BLK_FCACHE | ||
1791 | +static inline int fcache_dev_open(struct block_device *bdev, | ||
1792 | + unsigned long cache_dev, int priming, int serial) | ||
1793 | +{ | ||
1794 | + return -ENODEV; | ||
1795 | +} | ||
1796 | + | ||
1797 | +static inline int fcache_dev_close(struct block_device *bdev, int serial) | ||
1798 | +{ | ||
1799 | + return 0; | ||
1800 | +} | ||
1801 | +#endif /* !CONFIG_BLK_FCACHE */ | ||
1802 | |||
1803 | #endif /* __KERNEL__ */ | ||
1804 |