Magellan Linux

Contents of /trunk/kernel26-magellan/patches-2.6.17-r6/0028-2.6.17-fs-fcache-v2.1.patch



Revision 105
Sun Mar 11 16:17:56 2007 UTC by niro
File size: 46406 byte(s)
2.6.17-magellan-r6

1 A frontend cache for a block device. The purpose is to speed up a
2 fairly random but repeated read workload, like the boot of a system.
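
Attaching the cache happens at mount time through the new ext3 mount
options added below. As a rough illustration (the device numbers are just
an example), a first boot could prime the cache with something like
mount -t ext3 -o fcache_dev=8:17,fcache_prime=1 /dev/sda1 /mnt, and later
boots would drop fcache_prime=1 so reads get served from the primed cache.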
3
4 Signed-off-by: Jens Axboe <axboe@suse.de>
5 ---
6 block/ll_rw_blk.c | 11
7 drivers/block/Kconfig | 6
8 drivers/block/Makefile | 1
9 drivers/block/fcache.c | 1475 ++++++++++++++++++++++++++++++++++++++++++++++++
10 fs/ext3/super.c | 81 ++
11 include/linux/bio.h | 9
12 include/linux/ext3_fs.h | 14
13 7 files changed, 1587 insertions(+), 10 deletions(-)
14
15 Index: linux-ck-dev/block/ll_rw_blk.c
16 ===================================================================
17 --- linux-ck-dev.orig/block/ll_rw_blk.c 2006-06-18 15:20:10.000000000 +1000
18 +++ linux-ck-dev/block/ll_rw_blk.c 2006-06-18 15:25:27.000000000 +1000
19 @@ -2817,12 +2817,10 @@ static void init_request_from_bio(struct
20 */
21 if (bio_rw_ahead(bio) || bio_failfast(bio))
22 req->flags |= REQ_FAILFAST;
23 -
24 - /*
25 - * REQ_BARRIER implies no merging, but lets make it explicit
26 - */
27 if (unlikely(bio_barrier(bio)))
28 - req->flags |= (REQ_HARDBARRIER | REQ_NOMERGE);
29 + req->flags |= REQ_HARDBARRIER;
30 + if (!bio_mergeable(bio))
31 + req->flags |= REQ_NOMERGE;
32
33 req->errors = 0;
34 req->hard_sector = req->sector = bio->bi_sector;
35 @@ -2870,7 +2868,7 @@ static int __make_request(request_queue_
36
37 spin_lock_irq(q->queue_lock);
38
39 - if (unlikely(barrier) || elv_queue_empty(q))
40 + if (!bio_mergeable(bio) || elv_queue_empty(q))
41 goto get_rq;
42
43 el_ret = elv_merge(q, &req, bio);
44 @@ -3109,6 +3107,7 @@ void submit_bio(int rw, struct bio *bio)
45
46 BIO_BUG_ON(!bio->bi_size);
47 BIO_BUG_ON(!bio->bi_io_vec);
48 + BIO_BUG_ON(bio->bi_next);
49 bio->bi_rw |= rw;
50 if (rw & WRITE)
51 mod_page_state(pgpgout, count);
52 Index: linux-ck-dev/drivers/block/Kconfig
53 ===================================================================
54 --- linux-ck-dev.orig/drivers/block/Kconfig 2006-06-18 15:20:10.000000000 +1000
55 +++ linux-ck-dev/drivers/block/Kconfig 2006-06-18 15:25:27.000000000 +1000
56 @@ -456,4 +456,10 @@ config ATA_OVER_ETH
57 This driver provides Support for ATA over Ethernet block
58 devices like the Coraid EtherDrive (R) Storage Blade.
59
60 +config BLK_FCACHE
61 + bool "Boot frontend cache driver"
62 + help
63 + This driver puts the data needed for a boot sequentially in a
64 + defined place, taking all seeks out of the boot process.
65 +
66 endmenu
67 Index: linux-ck-dev/drivers/block/Makefile
68 ===================================================================
69 --- linux-ck-dev.orig/drivers/block/Makefile 2006-06-18 15:20:10.000000000 +1000
70 +++ linux-ck-dev/drivers/block/Makefile 2006-06-18 15:25:27.000000000 +1000
71 @@ -5,6 +5,7 @@
72 # Rewritten to use lists instead of if-statements.
73 #
74
75 +obj-$(CONFIG_BLK_FCACHE) += fcache.o
76 obj-$(CONFIG_MAC_FLOPPY) += swim3.o
77 obj-$(CONFIG_BLK_DEV_FD) += floppy.o
78 obj-$(CONFIG_AMIGA_FLOPPY) += amiflop.o
79 Index: linux-ck-dev/drivers/block/fcache.c
80 ===================================================================
81 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
82 +++ linux-ck-dev/drivers/block/fcache.c 2006-06-18 15:25:27.000000000 +1000
83 @@ -0,0 +1,1475 @@
84 +/*
85 + * A frontend cache for a block device. The purpose is to speed up a
86 + * fairly random but repeated read workload, like the boot of a system.
87 + *
88 + * When run in priming mode, fcache allocates and writes data read from
89 + * the source drive to our extent cache in the order in which they are
90 + * accessed. When later run in non-priming mode, data accessed in the same
91 + * order will be linearly available in the cache.
92 + *
93 + * Performance when priming is slower than non-fcache usage would be. If
94 + * the fcache is located on another disk, the hit should be small. If
95 + * the fcache is located on the same disk (another partition), it runs
96 + * at about half the speed. Non-priming performance should be fairly
97 + * similar on same/other disk.
98 + *
99 + * On-disk format is as follows:
100 + * Block0: header
101 + * Block1..X extent maps
102 + * BlockX+1..Y extent data
103 + *
104 + * Copyright (C) 2006 Jens Axboe <axboe@suse.de>
105 + *
106 + */
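+/*
+ * Rough worked example of the layout above (numbers are illustrative
+ * only and depend on PAGE_SIZE and sizeof(struct fcache_extent)): with
+ * 4KB pages and a 64-byte on-disk extent record, one extent-map block
+ * holds 64 extents, so the header's extent_offset comes out at
+ * 1 + max_extents / 64 and the first data block follows the last map
+ * block.
+ */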
107 +#include <linux/config.h>
108 +#include <linux/module.h>
109 +#include <linux/moduleparam.h>
110 +#include <linux/sched.h>
111 +#include <linux/blkdev.h>
112 +#include <linux/prio_tree.h>
113 +#include <linux/buffer_head.h>
114 +#include <linux/slab.h>
115 +
116 +#define FCACHE_MAGIC 0x61786663
117 +#define FCACHE_VERSION 0x02
118 +
119 +#define FCACHE_HEADER_BLOCK 0
120 +#define FCACHE_EXTENT_BLOCK 1
121 +
122 +#undef FCACHE_PAGES_PROTECTED
123 +
124 +struct fcache_dev {
125 + struct block_device *bdev;
126 + struct block_device *fs_bdev;
127 + make_request_fn *mfn;
128 + struct prio_tree_root prio_root;
129 + unsigned long next_cache_block;
130 + unsigned long nr_extents;
131 + unsigned long max_extents;
132 + unsigned int old_bs;
133 + spinlock_t lock;
134 +
135 + sector_t cache_start_sector;
136 + unsigned long cache_blocks;
137 + sector_t fs_start_sector;
138 + sector_t fs_sectors;
139 +
140 + unsigned long flags;
141 + int priming;
142 + int serial;
143 + int chop_ios;
144 +
145 + struct list_head list;
146 + struct work_struct work;
147 +
148 + /*
149 + * stats
150 + */
151 + unsigned int ios[2];
152 + unsigned int hits;
153 + unsigned int misses;
154 + unsigned int overwrites;
155 +};
156 +
157 +enum {
158 + FDEV_F_DOWN = 0,
159 +};
160 +
161 +static struct fcache_dev fcache_dev;
162 +
163 +static int disable;
164 +module_param(disable, int, 0444);
165 +
166 +struct fcache_endio_data {
167 + struct fcache_dev *fdev;
168 + sector_t fs_sector;
169 + unsigned int fs_size;
170 + sector_t cache_sector;
171 + atomic_t completions;
172 + struct bio *bio;
173 + int io_error;
174 + struct list_head list;
175 +};
176 +
177 +/*
178 + * Maps a file system block to the fcache
179 + */
180 +struct fcache_extent {
181 + sector_t fs_sector; /* real device offset */
182 + unsigned int fs_size; /* extent length */
183 + sector_t cache_sector; /* cache device offset */
184 +
185 + struct prio_tree_node prio_node;
186 +};
187 +
188 +/*
189 + * Header on fcache device - will take up the first page of data, so
190 + * plenty of room to go around.
191 + */
192 +struct fcache_header {
193 + u32 magic; /* fcache magic */
194 + u32 version; /* fcache version */
195 + u32 nr_extents; /* nr of extents in cache */
196 + u32 max_extents; /* max nr of extents available */
197 + u32 serial; /* fs and cache serial */
198 + u32 extent_offset; /* where extents start */
199 + u64 fs_start_sector; /* where fs starts */
200 + u64 fs_sectors; /* how big fs is */
201 + char fs_dev[BDEVNAME_SIZE]; /* fs partition */
202 + u64 cache_blocks; /* number of blocks in cache */
203 + u64 cache_blocks_used; /* used blocks in cache */
204 + u16 sector_t_size; /* user space helper */
205 + u16 extent_size; /* user space helper */
206 +};
207 +
208 +#define BLOCK_SHIFT (PAGE_SHIFT - 9)
209 +
210 +static struct kmem_cache *fcache_slab;
211 +static struct kmem_cache *fcache_fed_slab;
212 +static mempool_t *fed_pool;
213 +static struct workqueue_struct *fcache_workqueue;
214 +
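+/*
+ * Completion handler for fcache_rw_page(): when the whole bio has
+ * completed, wake up the caller sleeping in fcache_rw_page().
+ */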
215 +static int fcache_rw_page_endio(struct bio *bio, unsigned int bytes, int err)
216 +{
217 + if (bio->bi_size)
218 + return 1;
219 +
220 + complete(bio->bi_private);
221 + return 0;
222 +}
223 +
224 +/*
225 + * Writes out a page of data and waits for it to complete.
226 + */
227 +static int fcache_rw_page(struct fcache_dev *fdev, sector_t index,
228 + struct page *page, int rw)
229 +{
230 + DECLARE_COMPLETION(wait);
231 + struct bio *bio;
232 + int ret = 0;
233 +
234 + bio = bio_alloc(GFP_KERNEL, 1);
235 +
236 + bio->bi_sector = index << BLOCK_SHIFT;
237 + bio->bi_bdev = fdev->bdev;
238 + bio->bi_rw |= (1 << BIO_RW_SYNC);
239 + bio->bi_end_io = fcache_rw_page_endio;
240 + bio->bi_private = &wait;
241 +
242 + bio_add_page(bio, page, PAGE_SIZE, 0);
243 + submit_bio(rw, bio);
244 +
245 + wait_for_completion(&wait);
246 +
247 + if (!bio_flagged(bio, BIO_UPTODATE))
248 + ret = -EIO;
249 +
250 + bio_put(bio);
251 + return ret;
252 +}
253 +
254 +static inline void fcache_fill_header(struct fcache_dev *fdev,
255 + struct fcache_header *header,
256 + unsigned int nr_extents)
257 +{
258 + /*
259 + * See how many pages we need for extent headers, then we know where
260 + * to start putting data. Assume worst case of 1 page per extent, and
261 + * reserve the first page for the header.
262 + */
263 +
264 + header->magic = FCACHE_MAGIC;
265 + header->version = FCACHE_VERSION;
266 + header->nr_extents = nr_extents;
267 + header->max_extents = ((fdev->cache_blocks - 1) * PAGE_SIZE) / (PAGE_SIZE - sizeof(struct fcache_extent));
268 + header->serial = fdev->serial;
269 +
270 + header->extent_offset = 1 + (header->max_extents * sizeof(struct fcache_extent) / PAGE_SIZE);
271 +
272 + header->fs_start_sector = fdev->fs_start_sector;
273 + header->fs_sectors = fdev->fs_sectors;
274 + bdevname(fdev->fs_bdev, header->fs_dev);
275 + header->cache_blocks = fdev->cache_blocks;
276 + header->cache_blocks_used = fdev->next_cache_block;
277 + header->sector_t_size = sizeof(sector_t);
278 + header->extent_size = sizeof(struct fcache_extent);
279 +}
280 +
281 +static int fcache_write_new_header(struct fcache_dev *fdev)
282 +{
283 + struct fcache_header *header;
284 + struct page *page;
285 + int ret;
286 +
287 + page = alloc_page(GFP_HIGHUSER);
288 + if (unlikely(!page))
289 + return -ENOMEM;
290 +
291 + header = kmap_atomic(page, KM_USER0);
292 + clear_page(header);
293 + fcache_fill_header(fdev, header, 0);
294 + fdev->next_cache_block = header->extent_offset;
295 + fdev->max_extents = header->max_extents;
296 + kunmap_atomic(header, KM_USER0);
297 +
298 + printk("fcache: new header: first block %lu, max %lu\n",
299 + fdev->next_cache_block, fdev->max_extents);
300 + ret = fcache_rw_page(fdev, FCACHE_HEADER_BLOCK, page, WRITE);
301 + __free_page(page);
302 + return ret;
303 +}
304 +
305 +static void fcache_free_prio_tree(struct fcache_dev *fdev)
306 +{
307 + struct fcache_extent *fe;
308 + struct prio_tree_iter iter;
309 + struct prio_tree_node *node;
310 +
311 + /*
312 + * Now prune and free tree, wish there was a better way...
313 + */
314 + do {
315 + prio_tree_iter_init(&iter, &fdev->prio_root, 0, ULONG_MAX);
316 +
317 + node = prio_tree_next(&iter);
318 + if (!node)
319 + break;
320 +
321 + fe = prio_tree_entry(node, struct fcache_extent, prio_node);
322 + prio_tree_remove(&fdev->prio_root, node);
323 + kmem_cache_free(fcache_slab, fe);
324 + } while (1);
325 +}
326 +
327 +/*
328 + * First clear the header, write extents, then write real header.
329 + */
330 +static int fcache_write_extents(struct fcache_dev *fdev)
331 +{
332 + struct fcache_header *header;
333 + sector_t index, sectors;
334 + unsigned int nr_extents, this_extents;
335 + struct fcache_extent *fe;
336 + struct prio_tree_iter iter;
337 + struct prio_tree_node *node;
338 + struct page *page;
339 + void *p;
340 + int ret;
341 +
342 + page = alloc_page(GFP_KERNEL);
343 + if (unlikely(!page))
344 + return -ENOMEM;
345 +
346 + header = page_address(page);
347 + clear_page(header);
348 + fcache_fill_header(fdev, header, 0);
349 + ret = fcache_rw_page(fdev, FCACHE_HEADER_BLOCK, page, WRITE);
350 + if (ret)
351 + goto err;
352 +
353 + /*
354 + * Now write the extents in page size chunks.
355 + */
356 + p = page_address(page);
357 + clear_page(p);
358 + index = FCACHE_EXTENT_BLOCK;
359 + sectors = 0;
360 + this_extents = nr_extents = 0;
361 +
362 + prio_tree_iter_init(&iter, &fdev->prio_root, 0, ULONG_MAX);
363 +
364 + do {
365 + node = prio_tree_next(&iter);
366 + if (!node)
367 + break;
368 +
369 + fe = prio_tree_entry(node, struct fcache_extent, prio_node);
370 + nr_extents++;
371 + this_extents++;
372 + sectors += fe->fs_size >> 9;
373 + memcpy(p, fe, sizeof(*fe));
374 + p += sizeof(*fe);
375 + if ((this_extents + 1) * sizeof(*fe) > PAGE_SIZE) {
376 + ret = fcache_rw_page(fdev, index, page, WRITE);
377 + if (ret)
378 + break;
379 +
380 + this_extents = 0;
381 + index++;
382 + p = page_address(page);
383 + }
384 + } while (1);
385 +
386 + if (this_extents)
387 + ret = fcache_rw_page(fdev, index, page, WRITE);
388 +
389 + fdev->nr_extents = nr_extents;
390 + printk("fcache: wrote %d extents, holding %llu sectors of data\n",
391 + nr_extents, (unsigned long long) sectors);
392 +err:
393 + __free_page(page);
394 + return ret;
395 +}
396 +
397 +static int fcache_write_header(struct fcache_dev *fdev)
398 +{
399 + struct page *page;
400 + int ret;
401 +
402 + page = alloc_page(GFP_KERNEL);
403 + if (unlikely(!page))
404 + return -ENOMEM;
405 +
406 + ret = fcache_rw_page(fdev, FCACHE_HEADER_BLOCK, page, READ);
407 + if (!ret) {
408 + struct fcache_header *header = page_address(page);
409 +
410 + fcache_fill_header(fdev, header, fdev->nr_extents);
411 + ret = fcache_rw_page(fdev, FCACHE_HEADER_BLOCK, page, WRITE);
412 + printk("fcache: wrote header (extents=%lu,serial=%d)\n",
413 + fdev->nr_extents, fdev->serial);
414 + }
415 +
416 + __free_page(page);
417 + return ret;
418 +}
419 +
420 +static void fcache_tree_link(struct fcache_dev *fdev, struct fcache_extent *fe)
421 +{
422 + struct prio_tree_node *node = &fe->prio_node;
423 + unsigned long flags;
424 +
425 + INIT_PRIO_TREE_NODE(node);
426 + node->start = fe->fs_sector;
427 + node->last = fe->fs_sector + (fe->fs_size >> 9) - 1;
428 +
429 + spin_lock_irqsave(&fdev->lock, flags);
430 + prio_tree_insert(&fdev->prio_root, node);
431 + spin_unlock_irqrestore(&fdev->lock, flags);
432 +}
433 +
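+/*
+ * Upper bound on how many overlapping extents one fcache_lookup_extent()
+ * call returns; fcache_write_request() repeats the lookup if it is hit.
+ */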
434 +#define MAX_FE 16
435 +
436 +/*
437 + * Lookup the range of a given request in the prio tree. Used for both
438 + * looking up a range covering a read operation to be served from cache,
439 + * and to lookup potential conflicts from a new write with an existing
440 + * extent.
441 + */
442 +static int fcache_lookup_extent(struct fcache_dev *fdev, sector_t offset,
443 + unsigned int bytes, struct fcache_extent **map)
444 +{
445 + sector_t end_sector = offset + (bytes >> 9) - 1;
446 + struct prio_tree_node *node;
447 + struct prio_tree_iter iter;
448 + int i = 0;
449 +
450 + prio_tree_iter_init(&iter, &fdev->prio_root, offset, end_sector);
451 +
452 + /*
453 + * We only need to lock, if we are priming. The prio tree does
454 + * not change when in normal mode.
455 + */
456 + if (fdev->priming)
457 + spin_lock_irq(&fdev->lock);
458 +
459 + do {
460 + node = prio_tree_next(&iter);
461 + if (!node)
462 + break;
463 +
464 + map[i] = prio_tree_entry(node, struct fcache_extent, prio_node);
465 + } while (++i < MAX_FE);
466 +
467 + if (fdev->priming)
468 + spin_unlock_irq(&fdev->lock);
469 +
470 + return i;
471 +}
472 +
473 +/*
474 + * Our data write is done, now insert the fcache extents into the rbtree.
475 + */
476 +static int fcache_instantiate_extent(struct fcache_dev *fdev,
477 + struct fcache_endio_data *fed)
478 +{
479 + struct fcache_extent *fe;
480 +
481 + fe = kmem_cache_alloc(fcache_slab, GFP_ATOMIC);
482 + if (fe) {
483 + fe->fs_sector = fed->fs_sector;
484 + fe->fs_size = fed->fs_size;
485 + fe->cache_sector = fed->cache_sector;
486 +
487 + fcache_tree_link(fdev, fe);
488 + return 0;
489 + }
490 +
491 + return -ENOMEM;
492 +}
493 +
494 +/*
495 + * Hang on to the bio and its pages - ideally we would want to ensure
496 + * that the page data doesn't change between calling this function and
497 + * fcache_put_bio_pages() as well...
498 + */
499 +static void fcache_get_bio_pages(struct fcache_dev *fdev, struct bio *bio)
500 +{
501 + /*
502 + * Currently stubbed out, as we cannot end the bio read before
503 + * the write completes without also making sure that the pages
504 + * don't get reused for something else in the mean time.
505 + */
506 +#ifdef FCACHE_PAGES_PROTECTED
507 + struct bio_vec *bvec;
508 + int i;
509 +
510 + bio_get(bio);
511 +
512 + __bio_for_each_segment(bvec, bio, i, 0)
513 + get_page(bvec->bv_page);
514 +#endif
515 +}
516 +
517 +static void fcache_put_bio_pages(struct fcache_dev *fdev, struct bio *bio)
518 +{
519 +#ifdef FCACHE_PAGES_PROTECTED
520 + struct bio_vec *bvec;
521 + int i;
522 +
523 + __bio_for_each_segment(bvec, bio, i, 0)
524 + put_page(bvec->bv_page);
525 +
526 + bio_put(bio);
527 +#endif
528 +}
529 +
530 +static void fcache_chop_write_done(struct fcache_endio_data *fed)
531 +{
532 + /*
533 + * Last io completes.
534 + */
535 + if (atomic_dec_and_test(&fed->completions)) {
536 + struct fcache_dev *fdev = fed->fdev;
537 + struct bio *bio = fed->bio;
538 +
539 + /*
540 + * Release our reference to the original bio and
541 + * its pages.
542 + */
543 + fcache_put_bio_pages(fdev, bio);
544 +
545 + /*
546 + * End the read!
547 + */
548 + bio_endio(bio, bio->bi_size, 0);
549 +
550 + /*
551 + * All done, now add extent to our list if io completed ok.
552 + */
553 + if (!fed->io_error)
554 + fcache_instantiate_extent(fdev, fed);
555 +
556 + mempool_free(fed, fed_pool);
557 + }
558 +}
559 +
560 +/*
561 + * Our data write to the cache completes, we can free our clone and
562 + * instantiate the extent block.
563 + */
564 +static int fcache_extent_write_endio(struct bio *bio, unsigned int bytes,
565 + int err)
566 +{
567 + struct fcache_endio_data *fed;
568 +
569 + if (bio->bi_size)
570 + return 1;
571 +
572 + fed = bio->bi_private;
573 +
574 + if (!bio_flagged(bio, BIO_UPTODATE))
575 + fed->io_error = -EIO;
576 +
577 + bio_put(bio);
578 + fcache_chop_write_done(fed);
579 + return 0;
580 +}
581 +
582 +static void fcache_chop_read_done(struct fcache_endio_data *fed)
583 +{
584 + if (atomic_dec_and_test(&fed->completions)) {
585 + struct bio *bio = fed->bio;
586 +
587 + bio_endio(bio, bio->bi_size, fed->io_error);
588 + mempool_free(fed, fed_pool);
589 + }
590 +}
591 +
592 +static int fcache_chop_read_endio(struct bio *bio, unsigned int bytes, int err)
593 +{
594 + struct fcache_endio_data *fed;
595 +
596 + if (bio->bi_size)
597 + return 1;
598 +
599 + fed = bio->bi_private;
600 +
601 + if (!bio_flagged(bio, BIO_UPTODATE))
602 + fed->io_error = -EIO;
603 +
604 + bio_put(bio);
605 + fcache_chop_read_done(fed);
606 + return 0;
607 +}
608 +
609 +typedef void (chopper_done_t) (struct fcache_endio_data *);
610 +
611 +/*
612 + * This is our io chopper - it hacks a bio into smaller pieces, suitable
613 + * for the target device. Caller supplies suitable end_io and done functions.
614 + */
615 +static void fcache_io_chopper(struct fcache_dev *fdev,
616 + struct fcache_endio_data *fed,
617 + bio_end_io_t *endio, chopper_done_t *done, int rw)
618 +{
619 + struct bio *bio = NULL;
620 + struct bio_vec *bv;
621 + unsigned int total_bytes;
622 + sector_t sector;
623 + int i, vecs;
624 +
625 + /*
626 + * Make sure 'fed' doesn't disappear while we are still issuing
627 + * ios, the artificial reference is dropped at the end.
628 + */
629 + atomic_set(&fed->completions, 1);
630 +
631 + sector = fed->cache_sector;
632 + total_bytes = fed->fs_size;
633 + vecs = fed->bio->bi_vcnt;
634 + bio_for_each_segment(bv, fed->bio, i) {
635 + unsigned int len;
636 +
637 + if (!total_bytes)
638 + break;
639 +
640 + len = bv->bv_len;
641 + if (len > total_bytes)
642 + len = total_bytes;
643 +
644 + do {
645 + unsigned int l;
646 +
647 + if (!bio) {
648 + bio = bio_alloc(GFP_NOFS, vecs);
649 +
650 + bio->bi_sector = sector;
651 + bio->bi_bdev = fdev->bdev;
652 + bio->bi_end_io = endio;
653 + bio->bi_private = fed;
654 + }
655 +
656 + /*
657 + * If successful, break out of this loop and move on.
658 + */
659 + l = bio_add_page(bio, bv->bv_page, len, bv->bv_offset);
660 + if (l == len)
661 + break;
662 +
663 + BUG_ON(!bio->bi_size);
664 +
665 + /*
666 + * We could not add this page, submit what we have
667 + * and alloc a new bio.
668 + */
669 + atomic_inc(&fed->completions);
670 + submit_bio(rw, bio);
671 + bio = NULL;
672 + } while (1);
673 +
674 + total_bytes -= len;
675 + sector += len >> 9;
676 + vecs--;
677 + }
678 +
679 + if (bio) {
680 + atomic_inc(&fed->completions);
681 + submit_bio(rw, bio);
682 + }
683 +
684 + /*
685 + * Drop our reference to fed.
686 + */
687 + done(fed);
688 +}
689 +
690 +/*
691 + * cache device has queue limits similar to or higher than the fs
692 + * device - in that case, we can resubmit the bio to the device directly.
693 + */
694 +static void fcache_direct_cache_write(struct fcache_dev *fdev,
695 + struct fcache_endio_data *fed)
696 +{
697 + struct bio *bio = bio_clone(fed->bio, GFP_NOFS);
698 +
699 + bio->bi_sector = fed->cache_sector;
700 + bio->bi_bdev = fdev->bdev;
701 + bio->bi_end_io = fcache_extent_write_endio;
702 + bio->bi_private = fed;
703 +
704 + atomic_set(&fed->completions, 1);
705 + submit_bio(WRITE, bio);
706 +}
707 +
708 +/*
709 + * cache device has more conservative restrictions than the fs device.
710 + * The safest approach is to split up the bio and let bio_add_page()
711 + * decide when it's time to submit the pieces.
712 + */
713 +static void fcache_submit_cache_write(struct fcache_dev *fdev,
714 + struct fcache_endio_data *fed)
715 +{
716 + if (!fdev->chop_ios)
717 + fcache_direct_cache_write(fdev, fed);
718 + else
719 + fcache_io_chopper(fdev, fed, fcache_extent_write_endio,
720 + fcache_chop_write_done, WRITE);
721 +}
722 +
723 +/*
724 + * We punt work to fcache_work() whenever we need to do work that blocks. The
725 + * only thing that this thread handles is submitting the extent write
726 + * when the real read has completed. We used to do the extent instantiation
727 + * here as well, but fcache_extent_write_endio handles that now.
728 + */
729 +static void fcache_work(void *data)
730 +{
731 + struct fcache_dev *fdev = data;
732 +
733 + do {
734 + struct fcache_endio_data *fed = NULL;
735 + struct bio *bio;
736 +
737 + spin_lock_irq(&fdev->lock);
738 + if (!list_empty(&fdev->list)) {
739 + fed = list_entry(fdev->list.next, struct fcache_endio_data, list);
740 + list_del_init(&fed->list);
741 + }
742 + spin_unlock_irq(&fdev->lock);
743 +
744 + if (!fed)
745 + break;
746 +
747 + bio = fed->bio;
748 +
749 + if (fed->io_error) {
750 + printk(KERN_ERR "fcache: read error from device\n");
751 + bio_endio(bio, bio->bi_size, fed->io_error);
752 + continue;
753 + }
754 +
755 + /*
756 + * Get a ref on the original bio and pages, then
757 + * we should be able to signal completion of the READ
758 + * without waiting for the write to finish first.
759 + */
760 + fcache_get_bio_pages(fdev, bio);
761 +
762 + /*
763 + * Submit the read data as cache writes.
764 + */
765 + fcache_submit_cache_write(fdev, fed);
766 +
767 + /*
768 + * If fcache_get_bio_pages() could protect the pages from
769 + * being changed, we could end the io here instead of in
770 + * fcache_extent_fed_completes().
771 + */
772 + } while (1);
773 +}
774 +
775 +/*
776 + * Align bio to start at extent and stop sooner if extent is short. Must
777 + * be called cautiously - it's only allowed to modify the bio if this is
778 + * a clone and a write request, reads must be fully aligned and only
779 + * possibly require a starting offset modification.
780 + */
781 +static void fcache_bio_align(struct bio *bio, struct fcache_extent *fe)
782 +{
783 + struct bio_vec *bvec;
784 + sector_t start, end;
785 + sector_t org_start, org_end;
786 + unsigned int org_size, org_idx;
787 + int i;
788 +
789 + start = bio->bi_sector;
790 + bio->bi_sector = fe->cache_sector;
791 +
792 + /*
793 + * Nothing to do, perfectly aligned.
794 + */
795 + if (start == fe->fs_sector && bio->bi_size == fe->fs_size)
796 + return;
797 +
798 + org_start = bio->bi_sector;
799 + org_end = bio->bi_sector + (bio->bi_size >> 9);
800 + org_size = bio->bi_size;
801 + org_idx = bio->bi_idx;
802 +
803 + /*
804 + * Adjust beginning.
805 + */
806 + if (start > fe->fs_sector)
807 + bio->bi_sector += (start - fe->fs_sector);
808 + else if (start < fe->fs_sector) {
809 + sector_t diff = fe->fs_sector - start;
810 + int idx = 0;
811 +
812 + BUG_ON(!(bio->bi_flags & (1 << BIO_CLONED)));
813 + BUG_ON(bio_data_dir(bio) != WRITE);
814 +
815 + /*
816 + * Adjust where bio starts
817 + */
818 + __bio_for_each_segment(bvec, bio, i, 0) {
819 + unsigned int bsec = bvec->bv_len >> 9;
820 + unsigned int this_diff = bsec;
821 +
822 + if (!diff)
823 + break;
824 + if (this_diff > diff)
825 + this_diff = diff;
826 +
827 + bio->bi_sector += this_diff;
828 + bio->bi_size -= (this_diff << 9);
829 +
830 + /*
831 + * Bigger than this chunk, skip ahead.
832 + */
833 + if (this_diff == bsec) {
834 + idx++;
835 + diff -= this_diff;
836 + continue;
837 + }
838 +
839 + /*
840 + * Adjust this bvec
841 + */
842 + bvec->bv_offset += (this_diff << 9);
843 + bvec->bv_len -= (this_diff << 9);
844 + break;
845 + }
846 + bio->bi_idx += idx;
847 + }
848 +
849 + /*
850 + * Goes beyond the end, shrink size.
851 + */
852 + end = bio->bi_sector + (bio->bi_size >> 9);
853 + if (end > fe->cache_sector + (fe->fs_size >> 9)) {
854 + sector_t diff = end - (fe->cache_sector + (fe->fs_size >> 9));
855 + int vecs = 0;
856 +
857 + BUG_ON(!(bio->bi_flags & (1 << BIO_CLONED)));
858 + BUG_ON(bio_data_dir(bio) != WRITE);
859 +
860 + /*
861 + * This is __bio_for_each_segment_reverse().
862 + */
863 + for (i = bio->bi_vcnt - 1; i >= bio->bi_idx; i--) {
864 + struct bio_vec *bvec = &bio->bi_io_vec[i];
865 + unsigned int bsec = bvec->bv_len >> 9;
866 + unsigned int this_diff = bsec;
867 +
868 + if (!diff)
869 + break;
870 + if (this_diff > diff)
871 + this_diff = diff;
872 +
873 + bio->bi_size -= (this_diff << 9);
874 +
875 + /*
876 + * Bigger than this chunk, skip ahead.
877 + */
878 + if (this_diff == bsec) {
879 + vecs++;
880 + diff -= this_diff;
881 + continue;
882 + }
883 +
884 + /*
885 + * Adjust this bvec
886 + */
887 + bvec->bv_len -= (this_diff << 9);
888 + break;
889 + }
890 + bio->bi_vcnt -= vecs;
891 + }
892 +
893 + BUG_ON(bio->bi_sector < fe->cache_sector);
894 + BUG_ON(bio->bi_sector + (bio->bi_size >> 9) > fe->cache_sector + (fe->fs_size >> 9));
895 +
896 + /*
897 + * Invalidate the segment counts, we changed the bio layout.
898 + */
899 + bio->bi_flags &= ~(1 << BIO_SEG_VALID);
900 + bio->bi_flags |= (1 << BIO_NOMERGE);
901 +}
902 +
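+/*
+ * Completion handler for the cache overwrites issued by
+ * fcache_overwrite_extent(). A write error leaves the cached copy
+ * stale, so take the whole cache device down.
+ */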
903 +static int fcache_overwrite_endio(struct bio *bio, unsigned int bytes, int err)
904 +{
905 + if (bio->bi_size)
906 + return 1;
907 +
908 + if (!bio_flagged(bio, BIO_UPTODATE)) {
909 + struct fcache_dev *fdev = bio->bi_private;
910 +
911 + printk(KERN_ERR "fcache: overwrite error, cache off\n");
912 + set_bit(FDEV_F_DOWN, &fdev->flags);
913 + }
914 +
915 + bio_put(bio);
916 + return 0;
917 +}
918 +
919 +/*
920 + * Schedule overwrite of some existing block(s).
921 + */
922 +static int fcache_overwrite_extent(struct fcache_dev *fdev,
923 + struct fcache_extent *fe, struct bio *bio)
924 +{
925 + struct bio *clone = bio_clone(bio, GFP_NOFS);
926 +
927 + clone->bi_bdev = fdev->bdev;
928 + clone->bi_end_io = fcache_overwrite_endio;
929 + clone->bi_private = fdev;
930 + fcache_bio_align(clone, fe);
931 + submit_bio(WRITE, clone);
932 + return 0;
933 +}
934 +
935 +/*
936 + * Our real data read is complete. Kick our process context handler so it
937 + * can submit the write to our cache.
938 + */
939 +static int fcache_extent_endio(struct bio *bio, unsigned int bytes, int err)
940 +{
941 + struct fcache_dev *fdev;
942 + struct fcache_endio_data *fed;
943 + unsigned long flags;
944 +
945 + if (bio->bi_size)
946 + return 1;
947 +
948 + fed = bio->bi_private;
949 +
950 + if (!bio_flagged(bio, BIO_UPTODATE))
951 + fed->io_error = -EIO;
952 +
953 + bio_put(bio);
954 +
955 + fdev = fed->fdev;
956 + spin_lock_irqsave(&fdev->lock, flags);
957 + list_add_tail(&fed->list, &fdev->list);
958 + spin_unlock_irqrestore(&fdev->lock, flags);
959 + queue_work(fcache_workqueue, &fdev->work);
960 + return 0;
961 +}
962 +
963 +/*
964 + * This initiates adding an extent to our list. We do this by cloning the
965 + * original bio and submitting that to the real device and when that completes
966 + * we write that out to the cache device and instantiate the extent.
967 + */
968 +static int fcache_add_extent(struct fcache_dev *fdev, struct bio *bio)
969 +{
970 + struct fcache_endio_data *fed;
971 + struct bio *clone;
972 +
973 + fed = mempool_alloc(fed_pool, GFP_NOIO);
974 +
975 + fed->fdev = fdev;
976 + fed->fs_sector = bio->bi_sector;
977 + fed->fs_size = bio->bi_size;
978 + fed->cache_sector = -1;
979 + fed->bio = bio;
980 + fed->io_error = 0;
981 + INIT_LIST_HEAD(&fed->list);
982 +
983 + /*
984 + * Allocate/assign an extent block for this range
985 + */
986 + spin_lock_irq(&fdev->lock);
987 + if (fdev->nr_extents < fdev->max_extents) {
988 + unsigned long nr = (bio->bi_size + PAGE_SIZE - 1) >> PAGE_SHIFT;
989 +
990 + if (fdev->next_cache_block + nr <= fdev->cache_blocks) {
991 + fdev->nr_extents++;
992 + fed->cache_sector = fdev->next_cache_block << BLOCK_SHIFT;
993 + fdev->next_cache_block += nr;
994 + }
995 + }
996 + spin_unlock_irq(&fdev->lock);
997 +
998 + /*
999 + * Ran out of room
1000 + */
1001 + if (fed->cache_sector == -1) {
1002 + printk(KERN_ERR "fcache: ran out of space, priming now off\n");
1003 + fdev->priming = 0;
1004 + mempool_free(fed, fed_pool);
1005 + return -ENOENT;
1006 + }
1007 +
1008 + clone = bio_clone(bio, GFP_NOFS);
1009 + clone->bi_private = fed;
1010 + clone->bi_end_io = fcache_extent_endio;
1011 + clone->bi_rw |= (1 << BIO_RW_SYNC);
1012 +
1013 + generic_make_request(clone);
1014 + return 0;
1015 +}
1016 +
1017 +static int fcache_parse_extents(struct fcache_dev *fdev, void *addr,
1018 + unsigned int max_extents)
1019 +{
1020 + int nr_extents = PAGE_SIZE / sizeof(struct fcache_extent);
1021 + int extents_read;
1022 +
1023 + if (nr_extents > max_extents)
1024 + nr_extents = max_extents;
1025 +
1026 + extents_read = 0;
1027 + while (nr_extents) {
1028 + struct fcache_extent *fe, *__fe = addr;
1029 +
1030 + fe = kmem_cache_alloc(fcache_slab, GFP_KERNEL);
1031 + if (unlikely(!fe))
1032 + return -ENOMEM;
1033 +
1034 + memset(fe, 0, sizeof(*fe));
1035 + fe->fs_sector = __fe->fs_sector;
1036 + fe->fs_size = __fe->fs_size;
1037 + fe->cache_sector = __fe->cache_sector;
1038 +
1039 + fcache_tree_link(fdev, fe);
1040 +
1041 + nr_extents--;
1042 + extents_read++;
1043 + addr += sizeof(*fe);
1044 + }
1045 +
1046 + return extents_read;
1047 +}
1048 +
1049 +static int fcache_read_extents(struct fcache_dev *fdev)
1050 +{
1051 + unsigned int nr_extents = fdev->nr_extents;
1052 + int ret, extents, total_extents;
1053 + struct page *page;
1054 + sector_t index;
1055 + void *p;
1056 +
1057 + page = alloc_page(GFP_KERNEL);
1058 + if (unlikely(!page))
1059 + return -ENOMEM;
1060 +
1061 + ret = 0;
1062 + total_extents = 0;
1063 + index = FCACHE_EXTENT_BLOCK;
1064 + while (nr_extents) {
1065 + ret = fcache_rw_page(fdev, index, page, READ);
1066 + if (ret)
1067 + break;
1068 +
1069 + p = page_address(page);
1070 + extents = fcache_parse_extents(fdev, p, nr_extents);
1071 +
1072 + if (extents < 0) {
1073 + ret = extents;
1074 + break;
1075 + }
1076 +
1077 + index++;
1078 + nr_extents -= extents;
1079 + total_extents += extents;
1080 + }
1081 +
1082 + __free_page(page);
1083 +
1084 + if (ret)
1085 + return ret;
1086 +
1087 + return total_extents;
1088 +}
1089 +
1090 +/*
1091 + * Read an existing fcache header from the device, and then proceed to
1092 + * reading and adding the extents to our prio tree.
1093 + */
1094 +static int fcache_load_header(struct fcache_dev *fdev, int serial)
1095 +{
1096 + struct fcache_header *header = NULL;
1097 + struct page *page;
1098 + int ret, wrong_serial = 0;
1099 + char b[BDEVNAME_SIZE];
1100 +
1101 + page = alloc_page(GFP_HIGHUSER);
1102 + if (unlikely(!page))
1103 + return -ENOMEM;
1104 +
1105 + ret = fcache_rw_page(fdev, FCACHE_HEADER_BLOCK, page, READ);
1106 + if (unlikely(ret))
1107 + goto err;
1108 +
1109 + ret = -EINVAL;
1110 + header = kmap_atomic(page, KM_USER0);
1111 + if (header->magic != FCACHE_MAGIC) {
1112 + printk(KERN_ERR "fcache: bad magic %x\n", header->magic);
1113 + goto err;
1114 + }
1115 + if (header->version != FCACHE_VERSION) {
1116 + printk(KERN_ERR "fcache: bad version %d\n", header->version);
1117 + goto err;
1118 + }
1119 + if (strcmp(bdevname(fdev->fs_bdev, b), header->fs_dev)) {
1120 + printk(KERN_ERR "fcache: device mismatch (%s/%s)\n", b,
1121 + header->fs_dev);
1122 + goto err;
1123 + }
1124 + if (header->fs_start_sector != fdev->fs_start_sector ||
1125 + header->fs_sectors != fdev->fs_sectors) {
1126 + printk(KERN_ERR "fcache: fs appears to have changed size\n");
1127 + goto err;
1128 + }
1129 +
1130 + fdev->nr_extents = header->nr_extents;
1131 + fdev->max_extents = header->max_extents;
1132 +
1133 + /*
1134 + * Don't fail on out-of-date serial, just warn that the user needs
1135 + * to prime the cache again. Until then we'll just bypass the cache.
1136 + */
1137 + if (header->serial != serial) {
1138 + printk(KERN_ERR "fcache: found serial %d, expected %d.\n",
1139 + header->serial, serial);
1140 + printk(KERN_ERR "fcache: reprime the cache!\n");
1141 + wrong_serial = 1;
1142 + }
1143 +
1144 + fdev->serial = header->serial;
1145 + kunmap_atomic(header, KM_USER0);
1146 + __free_page(page);
1147 +
1148 + if (!wrong_serial) {
1149 + printk("fcache: header looks valid (extents=%lu, serial=%d)\n", fdev->nr_extents, fdev->serial);
1150 +
1151 + ret = fcache_read_extents(fdev);
1152 + printk("fcache: loaded %d extents\n", ret);
1153 +
1154 + /*
1155 + * If we don't find all the extents we require, fail.
1156 + */
1157 + if (ret != fdev->nr_extents) {
1158 + fcache_free_prio_tree(fdev);
1159 + ret = -EINVAL;
1160 + } else
1161 + ret = 0;
1162 + }
1163 +
1164 + return ret;
1165 +err:
1166 + __free_page(page);
1167 + if (header)
1168 + kunmap_atomic(header, KM_USER0);
1169 + return ret;
1170 +}
1171 +
1172 +/*
1173 + * We use this range to decide when to log an io to the target device.
1174 + */
1175 +static void fcache_fill_fs_size(struct fcache_dev *fdev)
1176 +{
1177 + struct block_device *bdev = fdev->fs_bdev;
1178 +
1179 + /*
1180 + * Partition or whole device?
1181 + */
1182 + if (bdev != bdev->bd_contains) {
1183 + struct hd_struct *p = bdev->bd_part;
1184 +
1185 + fdev->fs_start_sector = p->start_sect;
1186 + fdev->fs_sectors = p->nr_sects;
1187 + } else {
1188 + fdev->fs_start_sector = 0;
1189 + fdev->fs_sectors = bdev->bd_inode->i_size >> 9;
1190 + }
1191 +}
1192 +
1193 +static void fcache_fill_cache_size(struct fcache_dev *fdev)
1194 +{
1195 + struct block_device *bdev = fdev->bdev;
1196 +
1197 + /*
1198 + * Partition or whole device?
1199 + */
1200 + if (bdev != bdev->bd_contains) {
1201 + struct hd_struct *p = bdev->bd_part;
1202 +
1203 + fdev->cache_start_sector = p->start_sect;
1204 + fdev->cache_blocks = p->nr_sects >> BLOCK_SHIFT;
1205 + } else {
1206 + fdev->cache_start_sector = 0;
1207 + fdev->cache_blocks = bdev->bd_inode->i_size >> PAGE_SHIFT;
1208 + }
1209 +}
1210 +
1211 +/*
1212 + * This is a read request, check if we have that block. If we do, then
1213 + * just redirect. If not, pass it through.
1214 + */
1215 +static int fcache_read_request(struct fcache_dev *fdev, request_queue_t *q,
1216 + struct bio *bio)
1217 +{
1218 + struct fcache_extent *extents[MAX_FE];
1219 + struct fcache_extent *fe;
1220 + int i, nr;
1221 +
1222 + /*
1223 + * Not there, redirect to original but schedule adding this extent
1224 + * to our list if we are priming.
1225 + */
1226 + nr = fcache_lookup_extent(fdev, bio->bi_sector, bio->bi_size, extents);
1227 + if (!nr) {
1228 + if (fdev->priming && !fcache_add_extent(fdev, bio))
1229 + return 0;
1230 +
1231 + fdev->misses++;
1232 + return fdev->mfn(q, bio);
1233 + }
1234 +
1235 + /*
1236 + * If range is at least as big, we use our cache. If not, cop out
1237 + * and just submit to real device.
1238 + */
1239 + for (i = 0; i < nr; i++) {
1240 + sector_t end_fe, end_bi;
1241 + fe = extents[i];
1242 +
1243 + end_fe = fe->fs_sector + (fe->fs_size >> 9);
1244 + end_bi = bio->bi_sector + (bio->bi_size >> 9);
1245 +
1246 + /*
1247 + * match!
1248 + */
1249 + if (bio->bi_sector >= fe->fs_sector && end_bi <= end_fe)
1250 + break;
1251 +
1252 + fe = NULL;
1253 + }
1254 +
1255 + /*
1256 + * Nopes, send to real device.
1257 + */
1258 + if (!fe) {
1259 + fdev->misses++;
1260 + return fdev->mfn(q, bio);
1261 + }
1262 +
1263 + /*
1264 + * Perfect, adjust start offset if it isn't aligned.
1265 + */
1266 + fdev->hits++;
1267 + fcache_bio_align(bio, fe);
1268 +
1269 + /*
1270 + * If we don't have to chop it up, just let generic_make_request()
1271 + * handle the stacking. Otherwise, return handled and pass to chopper.
1272 + */
1273 + if (fdev->chop_ios) {
1274 + struct fcache_endio_data *fed;
1275 +
1276 + fed = mempool_alloc(fed_pool, GFP_NOIO);
1277 +
1278 + fed->fdev = fdev;
1279 + fed->cache_sector = bio->bi_sector;
1280 + fed->fs_size = bio->bi_size;
1281 + fed->bio = bio;
1282 + fed->io_error = 0;
1283 + fcache_io_chopper(fdev, fed, fcache_chop_read_endio,
1284 + fcache_chop_read_done, READ);
1285 + return 0;
1286 + }
1287 +
1288 + bio->bi_bdev = fdev->bdev;
1289 + return 1;
1290 +}
1291 +
1292 +/*
1293 + * If we are priming the cache, always add this block. If not, then we still
1294 + * need to overwrite this block if it's in our cache.
1295 + */
1296 +static int fcache_write_request(struct fcache_dev *fdev, request_queue_t *q,
1297 + struct bio *bio)
1298 +{
1299 + struct fcache_extent *extents[MAX_FE];
1300 + struct fcache_extent *fe;
1301 + sector_t start = bio->bi_sector;
1302 + int i, nr;
1303 +
1304 +repeat:
1305 + nr = fcache_lookup_extent(fdev, start, bio->bi_size - ((start - bio->bi_sector) << 9), extents);
1306 +
1307 + /*
1308 + * Find out what to overwrite, if anything.
1309 + */
1310 + for (i = 0; i < nr; i++) {
1311 + fe = extents[i];
1312 + fdev->overwrites++;
1313 + fcache_overwrite_extent(fdev, fe, bio);
1314 + }
1315 +
1316 + /*
1317 + * If i == MAX_FE, there _may_ be more extents. Repeat lookup, start
1318 + * from the end of last request.
1319 + */
1320 + if (i == MAX_FE) {
1321 + fe = extents[i - 1];
1322 + start = fe->fs_sector + (fe->fs_size >> 9);
1323 + goto repeat;
1324 + }
1325 +
1326 + return fdev->mfn(q, bio);
1327 +}
1328 +
1329 +/*
1330 + * This is the only case where we resubmit an io to the device but don't
1331 + * want to count it as part of io we log.
1332 + */
1333 +#define fcache_bio_seen(bio) ((bio)->bi_end_io == fcache_extent_endio)
1334 +
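+/*
+ * Installed as the fs device's make_request_fn while the cache is
+ * attached (see fcache_dev_open()). Anything outside the monitored
+ * sector range, or arriving while the cache is being shut down, is
+ * passed straight through to the original make_request_fn.
+ */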
1335 +static int fcache_make_request(request_queue_t *q, struct bio *bio)
1336 +{
1337 + struct fcache_dev *fdev = &fcache_dev;
1338 +
1339 + /*
1340 + * If it's in the sector range we are monitoring and the device isn't
1341 + * being shutdown, then pass it on. Assume a bio doesn't span into
1342 + * the next partition, so don't bother accounting for size.
1343 + */
1344 + if ((bio->bi_sector >= fdev->fs_start_sector) &&
1345 + (bio->bi_sector < (fdev->fs_start_sector + fdev->fs_sectors)) &&
1346 + !test_bit(FDEV_F_DOWN, &fdev->flags) &&
1347 + !fcache_bio_seen(bio)) {
1348 +
1349 + fdev->ios[bio_data_dir(bio)]++;
1350 +
1351 + if (bio_data_dir(bio) == READ)
1352 + return fcache_read_request(fdev, q, bio);
1353 +
1354 + return fcache_write_request(fdev, q, bio);
1355 + }
1356 +
1357 + /*
1358 + * Pass through to original make_request_fn.
1359 + */
1360 + return fdev->mfn(q, bio);
1361 +}
1362 +
1363 +/*
1364 + * Attach the cache device 'bdev' to 'fdev'.
1365 + */
1366 +static int fcache_setup_dev(struct fcache_dev *fdev,
1367 + struct block_device *fs_bdev,
1368 + struct block_device *bdev,
1369 + int priming, int serial)
1370 +{
1371 + request_queue_t *fs_q, *cache_q;
1372 + char b[BDEVNAME_SIZE];
1373 + int ret;
1374 +
1375 + memset(fdev, 0, sizeof(*fdev));
1376 + INIT_PRIO_TREE_ROOT(&fdev->prio_root);
1377 + spin_lock_init(&fdev->lock);
1378 + INIT_LIST_HEAD(&fdev->list);
1379 + INIT_WORK(&fdev->work, fcache_work, fdev);
1380 + fdev->priming = priming;
1381 + fdev->fs_bdev = fs_bdev;
1382 + fdev->bdev = bdev;
1383 +
1384 + ret = -EINVAL;
1385 +
1386 + fs_q = bdev_get_queue(fs_bdev);
1387 + cache_q = bdev_get_queue(bdev);
1388 + if (!fs_q || !cache_q)
1389 + goto out;
1390 +
1391 + /*
1392 + * Chop up outgoing ios, if the target is a different queue. We could
1393 + * look closer at limits, but it's fragile and pretty pointless.
1394 + */
1395 + if (fs_q != cache_q)
1396 + fdev->chop_ios = 1;
1397 +
1398 + ret = bd_claim(bdev, fcache_setup_dev);
1399 + if (ret < 0)
1400 + goto out;
1401 +
1402 + ret = block_size(bdev);
1403 + if (ret != PAGE_SIZE) {
1404 + fdev->old_bs = ret;
1405 + ret = set_blocksize(bdev, PAGE_SIZE);
1406 + if (ret < 0)
1407 + goto out_release;
1408 + } else
1409 + ret = 0;
1410 +
1411 + fcache_fill_cache_size(fdev);
1412 + fcache_fill_fs_size(fdev);
1413 +
1414 + if (priming) {
1415 + fdev->serial = serial;
1416 + ret = fcache_write_new_header(fdev);
1417 + } else
1418 + ret = fcache_load_header(fdev, serial);
1419 +
1420 + if (!ret) {
1421 + printk("fcache: %s opened successfully (%spriming)\n",
1422 + bdevname(bdev, b),
1423 + priming ? "" : "not ");
1424 + return 0;
1425 + }
1426 +
1427 +out_release:
1428 + bd_release(fdev->bdev);
1429 +out:
1430 + blkdev_put(fdev->bdev);
1431 + fdev->bdev = NULL;
1432 + return ret;
1433 +}
1434 +
1435 +/*
1436 + * Return fdev->bdev to its original state.
1437 + */
1438 +static void fcache_shutdown_dev(struct fcache_dev *fdev,
1439 + struct block_device *bdev)
1440 +{
1441 + if (fdev->bdev) {
1442 + if (fdev->mfn) {
1443 + request_queue_t *q = bdev_get_queue(bdev);
1444 +
1445 + (void) xchg(&q->make_request_fn, fdev->mfn);
1446 + }
1447 + sync_blockdev(fdev->bdev);
1448 + if (fdev->old_bs)
1449 + set_blocksize(fdev->bdev, fdev->old_bs);
1450 +
1451 + bd_release(fdev->bdev);
1452 + blkdev_put(fdev->bdev);
1453 + fdev->bdev = NULL;
1454 + INIT_PRIO_TREE_ROOT(&fdev->prio_root);
1455 + }
1456 +}
1457 +
1458 +/*
1459 + * bdev is the file system device, cache_dev is the device we want to store
1460 + * the cache on.
1461 + */
1462 +int fcache_dev_open(struct block_device *bdev, unsigned long cache_dev,
1463 + int priming, int serial)
1464 +{
1465 + struct block_device *fcache_bdev;
1466 + request_queue_t *q;
1467 + int ret;
1468 +
1469 + if (disable)
1470 + return 0;
1471 + if (fcache_dev.bdev)
1472 + return -EBUSY;
1473 +
1474 + fcache_bdev = open_by_devnum(cache_dev, FMODE_READ|FMODE_WRITE);
1475 + if (IS_ERR(fcache_bdev))
1476 + return PTR_ERR(fcache_bdev);
1477 +
1478 + ret = fcache_setup_dev(&fcache_dev, bdev, fcache_bdev, priming, serial);
1479 + if (ret)
1480 + return ret;
1481 +
1482 + q = bdev_get_queue(bdev);
1483 + fcache_dev.mfn = xchg(&q->make_request_fn, fcache_make_request);
1484 + return 0;
1485 +}
1486 +
1487 +EXPORT_SYMBOL(fcache_dev_open);
1488 +
1489 +void fcache_dev_close(struct block_device *bdev, int serial)
1490 +{
1491 + struct fcache_dev *fdev = &fcache_dev;
1492 +
1493 + if (disable)
1494 + return;
1495 +
1496 + if (!fdev->bdev)
1497 + return;
1498 +
1499 + printk("fcache: ios r/w %u/%u, hits %u, misses %u, overwrites %u\n",
1500 + fdev->ios[0], fdev->ios[1], fdev->hits,
1501 + fdev->misses, fdev->overwrites);
1502 + fdev->serial = serial;
1503 +
1504 + sync_blockdev(bdev);
1505 + set_bit(FDEV_F_DOWN, &fdev->flags);
1506 +
1507 + if (fdev->priming)
1508 + fcache_write_extents(fdev);
1509 +
1510 + fcache_write_header(fdev);
1511 + fcache_free_prio_tree(fdev);
1512 + fcache_shutdown_dev(fdev, bdev);
1513 +}
1514 +
1515 +EXPORT_SYMBOL(fcache_dev_close);
1516 +
1517 +static int fcache_init(void)
1518 +{
1519 + fcache_slab = kmem_cache_create("fcache", sizeof(struct fcache_extent),
1520 + 0, 0, NULL, NULL);
1521 + if (!fcache_slab)
1522 + return -ENOMEM;
1523 +
1524 + fcache_fed_slab = kmem_cache_create("fcache_fed",
1525 + sizeof(struct fcache_endio_data), 0, 0,
1526 + NULL, NULL);
1527 + if (!fcache_fed_slab) {
1528 + kmem_cache_destroy(fcache_slab);
1529 + return -ENOMEM;
1530 + }
1531 +
1532 + fed_pool = mempool_create_slab_pool(1, fcache_fed_slab);
1533 + if (!fed_pool) {
1534 + kmem_cache_destroy(fcache_slab);
1535 + kmem_cache_destroy(fcache_fed_slab);
1536 + return -ENOMEM;
1537 + }
1538 +
1539 + fcache_workqueue = create_singlethread_workqueue("fcached");
1540 + if (!fcache_workqueue)
1541 + panic("fcache: failed to create fcached\n");
1542 +
1543 + return 0;
1544 +}
1545 +
1546 +static void fcache_exit(void)
1547 +{
1548 + destroy_workqueue(fcache_workqueue);
1549 + kmem_cache_destroy(fcache_slab);
1550 + kmem_cache_destroy(fcache_fed_slab);
1551 + mempool_destroy(fed_pool);
1552 +}
1553 +
1554 +MODULE_AUTHOR("Jens Axboe <axboe@suse.de>");
1555 +MODULE_LICENSE("GPL");
1556 +
1557 +module_init(fcache_init);
1558 +module_exit(fcache_exit);
1559 Index: linux-ck-dev/fs/ext3/super.c
1560 ===================================================================
1561 --- linux-ck-dev.orig/fs/ext3/super.c 2006-06-18 15:20:10.000000000 +1000
1562 +++ linux-ck-dev/fs/ext3/super.c 2006-06-18 15:25:27.000000000 +1000
1563 @@ -384,11 +384,43 @@ static void dump_orphan_list(struct supe
1564 }
1565 }
1566
1567 +extern int fcache_dev_open(struct block_device *, unsigned long, int, int);
1568 +extern int fcache_dev_close(struct block_device *, int);
1569 +
1570 +static void ext3_close_fcache(struct super_block *sb)
1571 +{
1572 + struct ext3_sb_info *sbi = EXT3_SB(sb);
1573 + struct ext3_super_block *es = sbi->s_es;
1574 + int serial = le16_to_cpu(es->s_mnt_count);
1575 +
1576 + fcache_dev_close(sb->s_bdev, serial);
1577 +}
1578 +
1579 +static int ext3_open_fcache(struct super_block *sb, unsigned long cachedev)
1580 +{
1581 + struct ext3_sb_info *sbi = EXT3_SB(sb);
1582 + struct ext3_super_block *es = sbi->s_es;
1583 + int priming = test_opt(sb, FCACHEPRIME);
1584 + int serial = le16_to_cpu(es->s_mnt_count);
1585 + int ret;
1586 +
1587 + ret = fcache_dev_open(sb->s_bdev, cachedev, priming, serial);
1588 + if (!ret) {
1589 + set_opt(sbi->s_mount_opt, FCACHE);
1590 + return 0;
1591 + }
1592 +
1593 + printk(KERN_ERR "ext3: failed to open fcache (err=%d)\n", ret);
1594 + return ret;
1595 +}
1596 +
1597 static void ext3_put_super (struct super_block * sb)
1598 {
1599 struct ext3_sb_info *sbi = EXT3_SB(sb);
1600 struct ext3_super_block *es = sbi->s_es;
1601 - int i;
1602 + int i, has_fcache;
1603 +
1604 + has_fcache = test_opt(sb, FCACHE);
1605
1606 ext3_xattr_put_super(sb);
1607 journal_destroy(sbi->s_journal);
1608 @@ -431,6 +463,8 @@ static void ext3_put_super (struct super
1609 invalidate_bdev(sbi->journal_bdev, 0);
1610 ext3_blkdev_remove(sbi);
1611 }
1612 + if (has_fcache)
1613 + ext3_close_fcache(sb);
1614 sb->s_fs_info = NULL;
1615 kfree(sbi);
1616 return;
1617 @@ -635,7 +669,7 @@ enum {
1618 Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
1619 Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota,
1620 Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota,
1621 - Opt_grpquota
1622 + Opt_grpquota, Opt_fcache_dev, Opt_fcache_prime,
1623 };
1624
1625 static match_table_t tokens = {
1626 @@ -684,6 +718,8 @@ static match_table_t tokens = {
1627 {Opt_quota, "quota"},
1628 {Opt_usrquota, "usrquota"},
1629 {Opt_barrier, "barrier=%u"},
1630 + {Opt_fcache_dev, "fcache_dev=%s"},
1631 + {Opt_fcache_prime, "fcache_prime=%u"},
1632 {Opt_err, NULL},
1633 {Opt_resize, "resize"},
1634 };
1635 @@ -710,6 +746,7 @@ static unsigned long get_sb_block(void *
1636
1637 static int parse_options (char *options, struct super_block *sb,
1638 unsigned long *inum, unsigned long *journal_devnum,
1639 + unsigned long *fcache_devnum,
1640 unsigned long *n_blocks_count, int is_remount)
1641 {
1642 struct ext3_sb_info *sbi = EXT3_SB(sb);
1643 @@ -1012,6 +1049,29 @@ clear_qf_name:
1644 case Opt_nobh:
1645 set_opt(sbi->s_mount_opt, NOBH);
1646 break;
1647 + case Opt_fcache_dev: {
1648 + int maj, min;
1649 + char *p, *pm;
1650 +
1651 + if (!fcache_devnum)
1652 + break;
1653 + p = match_strdup(&args[0]);
1654 + if (!p)
1655 + return 0;
1656 + maj = simple_strtol(p, &pm, 10);
1657 + min = simple_strtol(pm + 1, NULL, 10);
1658 + *fcache_devnum = maj << MINORBITS | min;
1659 + kfree(p);
1660 + break;
1661 + }
1662 + case Opt_fcache_prime:
1663 + if (match_int(&args[0], &option))
1664 + return 0;
1665 + if (option)
1666 + set_opt(sbi->s_mount_opt, FCACHEPRIME);
1667 + else
1668 + clear_opt(sbi->s_mount_opt, FCACHEPRIME);
1669 + break;
1670 default:
1671 printk (KERN_ERR
1672 "EXT3-fs: Unrecognized mount option \"%s\" "
1673 @@ -1346,6 +1406,7 @@ static int ext3_fill_super (struct super
1674 unsigned long offset = 0;
1675 unsigned long journal_inum = 0;
1676 unsigned long journal_devnum = 0;
1677 + unsigned long fcache_devnum = 0;
1678 unsigned long def_mount_opts;
1679 struct inode *root;
1680 int blocksize;
1681 @@ -1353,6 +1414,7 @@ static int ext3_fill_super (struct super
1682 int db_count;
1683 int i;
1684 int needs_recovery;
1685 + int fcache = 0;
1686 __le32 features;
1687
1688 sbi = kmalloc(sizeof(*sbi), GFP_KERNEL);
1689 @@ -1427,7 +1489,7 @@ static int ext3_fill_super (struct super
1690 set_opt(sbi->s_mount_opt, RESERVATION);
1691
1692 if (!parse_options ((char *) data, sb, &journal_inum, &journal_devnum,
1693 - NULL, 0))
1694 + &fcache_devnum, NULL, 0))
1695 goto failed_mount;
1696
1697 sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
1698 @@ -1651,6 +1713,9 @@ static int ext3_fill_super (struct super
1699 goto failed_mount2;
1700 }
1701
1702 + if (fcache_devnum)
1703 + fcache = ext3_open_fcache(sb, fcache_devnum);
1704 +
1705 /* We have now updated the journal if required, so we can
1706 * validate the data journaling mode. */
1707 switch (test_opt(sb, DATA_FLAGS)) {
1708 @@ -1740,6 +1805,8 @@ cantfind_ext3:
1709 goto failed_mount;
1710
1711 failed_mount3:
1712 + if (!fcache)
1713 + ext3_close_fcache(sb);
1714 journal_destroy(sbi->s_journal);
1715 failed_mount2:
1716 for (i = 0; i < db_count; i++)
1717 @@ -2205,6 +2272,7 @@ static int ext3_remount (struct super_bl
1718 struct ext3_sb_info *sbi = EXT3_SB(sb);
1719 unsigned long n_blocks_count = 0;
1720 unsigned long old_sb_flags;
1721 + unsigned long fcache_devnum = 0;
1722 struct ext3_mount_options old_opts;
1723 int err;
1724 #ifdef CONFIG_QUOTA
1725 @@ -2226,7 +2294,7 @@ static int ext3_remount (struct super_bl
1726 /*
1727 * Allow the "check" option to be passed as a remount option.
1728 */
1729 - if (!parse_options(data, sb, NULL, NULL, &n_blocks_count, 1)) {
1730 + if (!parse_options(data, sb, NULL, NULL, &fcache_devnum, &n_blocks_count, 1)) {
1731 err = -EINVAL;
1732 goto restore_opts;
1733 }
1734 @@ -2241,6 +2309,11 @@ static int ext3_remount (struct super_bl
1735
1736 ext3_init_journal_params(sb, sbi->s_journal);
1737
1738 + if (fcache_devnum) {
1739 + ext3_close_fcache(sb);
1740 + ext3_open_fcache(sb, fcache_devnum);
1741 + }
1742 +
1743 if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY) ||
1744 n_blocks_count > le32_to_cpu(es->s_blocks_count)) {
1745 if (sbi->s_mount_opt & EXT3_MOUNT_ABORT) {
1746 Index: linux-ck-dev/include/linux/bio.h
1747 ===================================================================
1748 --- linux-ck-dev.orig/include/linux/bio.h 2006-06-18 15:20:10.000000000 +1000
1749 +++ linux-ck-dev/include/linux/bio.h 2006-06-18 15:25:27.000000000 +1000
1750 @@ -124,6 +124,7 @@ struct bio {
1751 #define BIO_BOUNCED 5 /* bio is a bounce bio */
1752 #define BIO_USER_MAPPED 6 /* contains user pages */
1753 #define BIO_EOPNOTSUPP 7 /* not supported */
1754 +#define BIO_NOMERGE 8 /* bio not mergeable */
1755 #define bio_flagged(bio, flag) ((bio)->bi_flags & (1 << (flag)))
1756
1757 /*
1758 @@ -179,6 +180,14 @@ struct bio {
1759 #define bio_failfast(bio) ((bio)->bi_rw & (1 << BIO_RW_FAILFAST))
1760 #define bio_rw_ahead(bio) ((bio)->bi_rw & (1 << BIO_RW_AHEAD))
1761
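+/*
+ * A bio can be merged into a request unless it is a barrier, has had its
+ * start index advanced (bi_idx != 0), or is explicitly flagged
+ * BIO_NOMERGE (fcache sets this after realigning a bio).
+ */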
1762 +static inline int bio_mergeable(struct bio *bio)
1763 +{
1764 + if (!bio_barrier(bio) && !bio->bi_idx && !bio_flagged(bio, BIO_NOMERGE))
1765 + return 1;
1766 +
1767 + return 0;
1768 +}
1769 +
1770 /*
1771 * will die
1772 */
1773 Index: linux-ck-dev/include/linux/ext3_fs.h
1774 ===================================================================
1775 --- linux-ck-dev.orig/include/linux/ext3_fs.h 2006-06-18 15:20:10.000000000 +1000
1776 +++ linux-ck-dev/include/linux/ext3_fs.h 2006-06-18 15:25:27.000000000 +1000
1777 @@ -376,6 +376,8 @@ struct ext3_inode {
1778 #define EXT3_MOUNT_QUOTA 0x80000 /* Some quota option set */
1779 #define EXT3_MOUNT_USRQUOTA 0x100000 /* "old" user quota */
1780 #define EXT3_MOUNT_GRPQUOTA 0x200000 /* "old" group quota */
1781 +#define EXT3_MOUNT_FCACHE 0x400000 /* using fcache */
1782 +#define EXT3_MOUNT_FCACHEPRIME 0x800000 /* priming fcache */
1783
1784 /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */
1785 #ifndef _LINUX_EXT2_FS_H
1786 @@ -847,6 +849,18 @@ extern struct inode_operations ext3_spec
1787 extern struct inode_operations ext3_symlink_inode_operations;
1788 extern struct inode_operations ext3_fast_symlink_inode_operations;
1789
1790 +#ifndef CONFIG_BLK_FCACHE
1791 +static inline int fcache_dev_open(struct block_device *bdev,
1792 + unsigned long cache_dev, int priming, int serial)
1793 +{
1794 + return -ENODEV;
1795 +}
1796 +
1797 +static inline int fcache_dev_close(struct block_device *bdev, int serial)
1798 +{
1799 + return 0;
1800 +}
1801 +#endif /* CONFIG_BLK_FCACHE */
1802
1803 #endif /* __KERNEL__ */
1804