Magellan Linux

Annotation of /trunk/kernel26-alx/patches-2.6.17-r7/0028-2.6.17-fs-fcache-v2.1.patch



Revision 199
Fri May 18 11:04:36 2007 UTC by niro
File size: 46406 bytes
-import

1 niro 199 A frontend cache for a block device. The purpose is to speed up a
2     fairly random but repeated read workload, like the boot of a system.
3    
4     Signed-off-by: Jens Axboe <axboe@suse.de>
5     ---
6     block/ll_rw_blk.c | 11
7     drivers/block/Kconfig | 6
8     drivers/block/Makefile | 1
9     drivers/block/fcache.c | 1475 ++++++++++++++++++++++++++++++++++++++++++++++++
10     fs/ext3/super.c | 81 ++
11     include/linux/bio.h | 9
12     include/linux/ext3_fs.h | 14
13     7 files changed, 1587 insertions(+), 10 deletions(-)
14    
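
For context: the ext3 changes in this patch add two mount options, fcache_dev=<major:minor>
(the partition that holds the cache) and fcache_prime=<0|1> (whether this boot should prime
the cache). Below is a minimal user-space sketch of a priming mount; the device names and
numbers are illustrative only and not part of the patch, and the major/minor pair is split
on a single separator character such as ':', as parsed by the Opt_fcache_dev handler added
to fs/ext3/super.c.

/*
 * Hypothetical priming mount: /dev/sda1 is the ext3 file system and the
 * cache partition has device number 8:17 (for example /dev/sdb1). Both
 * devices are made up; only the option names come from the patch.
 */
#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
	const char *opts = "fcache_dev=8:17,fcache_prime=1";

	if (mount("/dev/sda1", "/mnt", "ext3", 0, opts) != 0) {
		perror("mount");
		return 1;
	}
	printf("ext3 mounted with fcache priming enabled\n");
	return 0;
}

On a later boot the same mount would be done with fcache_prime=0, so reads that fall inside
the recorded extents are redirected to the sequentially laid out cache partition instead of
seeking across the file system device.
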
15     Index: linux-ck-dev/block/ll_rw_blk.c
16     ===================================================================
17     --- linux-ck-dev.orig/block/ll_rw_blk.c 2006-06-18 15:20:10.000000000 +1000
18     +++ linux-ck-dev/block/ll_rw_blk.c 2006-06-18 15:25:27.000000000 +1000
19     @@ -2817,12 +2817,10 @@ static void init_request_from_bio(struct
20     */
21     if (bio_rw_ahead(bio) || bio_failfast(bio))
22     req->flags |= REQ_FAILFAST;
23     -
24     - /*
25     - * REQ_BARRIER implies no merging, but lets make it explicit
26     - */
27     if (unlikely(bio_barrier(bio)))
28     - req->flags |= (REQ_HARDBARRIER | REQ_NOMERGE);
29     + req->flags |= REQ_HARDBARRIER;
30     + if (!bio_mergeable(bio))
31     + req->flags |= REQ_NOMERGE;
32    
33     req->errors = 0;
34     req->hard_sector = req->sector = bio->bi_sector;
35     @@ -2870,7 +2868,7 @@ static int __make_request(request_queue_
36    
37     spin_lock_irq(q->queue_lock);
38    
39     - if (unlikely(barrier) || elv_queue_empty(q))
40     + if (!bio_mergeable(bio) || elv_queue_empty(q))
41     goto get_rq;
42    
43     el_ret = elv_merge(q, &req, bio);
44     @@ -3109,6 +3107,7 @@ void submit_bio(int rw, struct bio *bio)
45    
46     BIO_BUG_ON(!bio->bi_size);
47     BIO_BUG_ON(!bio->bi_io_vec);
48     + BIO_BUG_ON(bio->bi_next);
49     bio->bi_rw |= rw;
50     if (rw & WRITE)
51     mod_page_state(pgpgout, count);
52     Index: linux-ck-dev/drivers/block/Kconfig
53     ===================================================================
54     --- linux-ck-dev.orig/drivers/block/Kconfig 2006-06-18 15:20:10.000000000 +1000
55     +++ linux-ck-dev/drivers/block/Kconfig 2006-06-18 15:25:27.000000000 +1000
56     @@ -456,4 +456,10 @@ config ATA_OVER_ETH
57     This driver provides Support for ATA over Ethernet block
58     devices like the Coraid EtherDrive (R) Storage Blade.
59    
60     +config BLK_FCACHE
61     + bool "Boot frontend cache driver"
62     + help
63     + This driver puts the data needed for a boot sequentially in a
64     + defined place, taking all seeks out of the boot process.
65     +
66     endmenu
67     Index: linux-ck-dev/drivers/block/Makefile
68     ===================================================================
69     --- linux-ck-dev.orig/drivers/block/Makefile 2006-06-18 15:20:10.000000000 +1000
70     +++ linux-ck-dev/drivers/block/Makefile 2006-06-18 15:25:27.000000000 +1000
71     @@ -5,6 +5,7 @@
72     # Rewritten to use lists instead of if-statements.
73     #
74    
75     +obj-$(CONFIG_BLK_FCACHE) += fcache.o
76     obj-$(CONFIG_MAC_FLOPPY) += swim3.o
77     obj-$(CONFIG_BLK_DEV_FD) += floppy.o
78     obj-$(CONFIG_AMIGA_FLOPPY) += amiflop.o
79     Index: linux-ck-dev/drivers/block/fcache.c
80     ===================================================================
81     --- /dev/null 1970-01-01 00:00:00.000000000 +0000
82     +++ linux-ck-dev/drivers/block/fcache.c 2006-06-18 15:25:27.000000000 +1000
83     @@ -0,0 +1,1475 @@
84     +/*
85     + * A frontend cache for a block device. The purpose is to speed up a
86     + * fairly random but repeated read workload, like the boot of a system.
87     + *
88     + * When run in priming mode, fcache allocates and writes data read from
89     + * the source drive to our extent cache in the order in which they are
90     + * accessed. When later run in non-priming mode, data accessed in the same
91     + * order will be linearly available in the cache.
92     + *
93     + * Performance when priming is slower than non-fcache usage would be. If
94     + * the fcache is located on another disk, the hit should be small. If the
95     + * fcache is located on the same disk (another partition), it runs
96     + * at about half the speed. Non-priming performance should be fairly
97     + * similar on same/other disk.
98     + *
99     + * On-disk format is as follows:
100     + * Block0: header
101     + * Block1..X extent maps
102     + * BlockX+1..Y extent data
103     + *
104     + * Copyright (C) 2006 Jens Axboe <axboe@suse.de>
105     + *
106     + */
107     +#include <linux/config.h>
108     +#include <linux/module.h>
109     +#include <linux/moduleparam.h>
110     +#include <linux/sched.h>
111     +#include <linux/blkdev.h>
112     +#include <linux/prio_tree.h>
113     +#include <linux/buffer_head.h>
114     +#include <linux/slab.h>
115     +
116     +#define FCACHE_MAGIC 0x61786663
117     +#define FCACHE_VERSION 0x02
118     +
119     +#define FCACHE_HEADER_BLOCK 0
120     +#define FCACHE_EXTENT_BLOCK 1
121     +
122     +#undef FCACHE_PAGES_PROTECTED
123     +
124     +struct fcache_dev {
125     + struct block_device *bdev;
126     + struct block_device *fs_bdev;
127     + make_request_fn *mfn;
128     + struct prio_tree_root prio_root;
129     + unsigned long next_cache_block;
130     + unsigned long nr_extents;
131     + unsigned long max_extents;
132     + unsigned int old_bs;
133     + spinlock_t lock;
134     +
135     + sector_t cache_start_sector;
136     + unsigned long cache_blocks;
137     + sector_t fs_start_sector;
138     + sector_t fs_sectors;
139     +
140     + unsigned long flags;
141     + int priming;
142     + int serial;
143     + int chop_ios;
144     +
145     + struct list_head list;
146     + struct work_struct work;
147     +
148     + /*
149     + * stats
150     + */
151     + unsigned int ios[2];
152     + unsigned int hits;
153     + unsigned int misses;
154     + unsigned int overwrites;
155     +};
156     +
157     +enum {
158     + FDEV_F_DOWN = 0,
159     +};
160     +
161     +static struct fcache_dev fcache_dev;
162     +
163     +static int disable;
164     +module_param(disable, int, 0444);
165     +
166     +struct fcache_endio_data {
167     + struct fcache_dev *fdev;
168     + sector_t fs_sector;
169     + unsigned int fs_size;
170     + sector_t cache_sector;
171     + atomic_t completions;
172     + struct bio *bio;
173     + int io_error;
174     + struct list_head list;
175     +};
176     +
177     +/*
178     + * Maps a file system block to the fcache
179     + */
180     +struct fcache_extent {
181     + sector_t fs_sector; /* real device offset */
182     + unsigned int fs_size; /* extent length */
183     + sector_t cache_sector; /* cache device offset */
184     +
185     + struct prio_tree_node prio_node;
186     +};
187     +
188     +/*
189     + * Header on fcache device - will take up the first page of data, so
190     + * plenty of room to go around.
191     + */
192     +struct fcache_header {
193     + u32 magic; /* fcache magic */
194     + u32 version; /* fcache version */
195     + u32 nr_extents; /* nr of extents in cache */
196     + u32 max_extents; /* max nr of extents available */
197     + u32 serial; /* fs and cache serial */
198     + u32 extent_offset; /* where extents start */
199     + u64 fs_start_sector; /* where fs starts */
200     + u64 fs_sectors; /* how big fs is */
201     + char fs_dev[BDEVNAME_SIZE]; /* fs partition */
202     + u64 cache_blocks; /* number of blocks in cache */
203     + u64 cache_blocks_used; /* used blocks in cache */
204     + u16 sector_t_size; /* user space helper */
205     + u16 extent_size; /* user space helper */
206     +};
207     +
208     +#define BLOCK_SHIFT (PAGE_SHIFT - 9)
209     +
210     +static struct kmem_cache *fcache_slab;
211     +static struct kmem_cache *fcache_fed_slab;
212     +static mempool_t *fed_pool;
213     +static struct workqueue_struct *fcache_workqueue;
214     +
215     +static int fcache_rw_page_endio(struct bio *bio, unsigned int bytes, int err)
216     +{
217     + if (bio->bi_size)
218     + return 1;
219     +
220     + complete(bio->bi_private);
221     + return 0;
222     +}
223     +
224     +/*
225     + * Writes out a page of data and waits for it to complete.
226     + */
227     +static int fcache_rw_page(struct fcache_dev *fdev, sector_t index,
228     + struct page *page, int rw)
229     +{
230     + DECLARE_COMPLETION(wait);
231     + struct bio *bio;
232     + int ret = 0;
233     +
234     + bio = bio_alloc(GFP_KERNEL, 1);
235     +
236     + bio->bi_sector = index << BLOCK_SHIFT;
237     + bio->bi_bdev = fdev->bdev;
238     + bio->bi_rw |= (1 << BIO_RW_SYNC);
239     + bio->bi_end_io = fcache_rw_page_endio;
240     + bio->bi_private = &wait;
241     +
242     + bio_add_page(bio, page, PAGE_SIZE, 0);
243     + submit_bio(rw, bio);
244     +
245     + wait_for_completion(&wait);
246     +
247     + if (!bio_flagged(bio, BIO_UPTODATE))
248     + ret = -EIO;
249     +
250     + bio_put(bio);
251     + return ret;
252     +}
253     +
254     +static inline void fcache_fill_header(struct fcache_dev *fdev,
255     + struct fcache_header *header,
256     + unsigned int nr_extents)
257     +{
258     + /*
259     + * See how many pages we need for extent headers, then we know where
260     + * to start putting data. Assume worst case of 1 page per extent, and
261     + * reserve the first page for the header.
262     + */
263     +
264     + header->magic = FCACHE_MAGIC;
265     + header->version = FCACHE_VERSION;
266     + header->nr_extents = nr_extents;
267     + header->max_extents = ((fdev->cache_blocks - 1) * PAGE_SIZE) / (PAGE_SIZE - sizeof(struct fcache_extent));
268     + header->serial = fdev->serial;
269     +
270     + header->extent_offset = 1 + (header->max_extents * sizeof(struct fcache_extent) / PAGE_SIZE);
271     +
272     + header->fs_start_sector = fdev->fs_start_sector;
273     + header->fs_sectors = fdev->fs_sectors;
274     + bdevname(fdev->fs_bdev, header->fs_dev);
275     + header->cache_blocks = fdev->cache_blocks;
276     + header->cache_blocks_used = fdev->next_cache_block;
277     + header->sector_t_size = sizeof(sector_t);
278     + header->extent_size = sizeof(struct fcache_extent);
279     +}
280     +
281     +static int fcache_write_new_header(struct fcache_dev *fdev)
282     +{
283     + struct fcache_header *header;
284     + struct page *page;
285     + int ret;
286     +
287     + page = alloc_page(GFP_HIGHUSER);
288     + if (unlikely(!page))
289     + return -ENOMEM;
290     +
291     + header = kmap_atomic(page, KM_USER0);
292     + clear_page(header);
293     + fcache_fill_header(fdev, header, 0);
294     + fdev->next_cache_block = header->extent_offset;
295     + fdev->max_extents = header->max_extents;
296     + kunmap_atomic(header, KM_USER0);
297     +
298     + printk("fcache: new header: first block %lu, max %lu\n",
299     + fdev->next_cache_block, fdev->max_extents);
300     + ret = fcache_rw_page(fdev, FCACHE_HEADER_BLOCK, page, WRITE);
301     + __free_page(page);
302     + return ret;
303     +}
304     +
305     +static void fcache_free_prio_tree(struct fcache_dev *fdev)
306     +{
307     + struct fcache_extent *fe;
308     + struct prio_tree_iter iter;
309     + struct prio_tree_node *node;
310     +
311     + /*
312     + * Now prune and free tree, wish there was a better way...
313     + */
314     + do {
315     + prio_tree_iter_init(&iter, &fdev->prio_root, 0, ULONG_MAX);
316     +
317     + node = prio_tree_next(&iter);
318     + if (!node)
319     + break;
320     +
321     + fe = prio_tree_entry(node, struct fcache_extent, prio_node);
322     + prio_tree_remove(&fdev->prio_root, node);
323     + kmem_cache_free(fcache_slab, fe);
324     + } while (1);
325     +}
326     +
327     +/*
328     + * First clear the header, write extents, then write real header.
329     + */
330     +static int fcache_write_extents(struct fcache_dev *fdev)
331     +{
332     + struct fcache_header *header;
333     + sector_t index, sectors;
334     + unsigned int nr_extents, this_extents;
335     + struct fcache_extent *fe;
336     + struct prio_tree_iter iter;
337     + struct prio_tree_node *node;
338     + struct page *page;
339     + void *p;
340     + int ret;
341     +
342     + page = alloc_page(GFP_KERNEL);
343     + if (unlikely(!page))
344     + return -ENOMEM;
345     +
346     + header = page_address(page);
347     + clear_page(header);
348     + fcache_fill_header(fdev, header, 0);
349     + ret = fcache_rw_page(fdev, FCACHE_HEADER_BLOCK, page, WRITE);
350     + if (ret)
351     + goto err;
352     +
353     + /*
354     + * Now write the extents in page size chunks.
355     + */
356     + p = page_address(page);
357     + clear_page(p);
358     + index = FCACHE_EXTENT_BLOCK;
359     + sectors = 0;
360     + this_extents = nr_extents = 0;
361     +
362     + prio_tree_iter_init(&iter, &fdev->prio_root, 0, ULONG_MAX);
363     +
364     + do {
365     + node = prio_tree_next(&iter);
366     + if (!node)
367     + break;
368     +
369     + fe = prio_tree_entry(node, struct fcache_extent, prio_node);
370     + nr_extents++;
371     + this_extents++;
372     + sectors += fe->fs_size >> 9;
373     + memcpy(p, fe, sizeof(*fe));
374     + p += sizeof(*fe);
375     + if ((this_extents + 1) * sizeof(*fe) > PAGE_SIZE) {
376     + ret = fcache_rw_page(fdev, index, page, WRITE);
377     + if (ret)
378     + break;
379     +
380     + this_extents = 0;
381     + index++;
382     + p = page_address(page);
383     + }
384     + } while (1);
385     +
386     + if (this_extents)
387     + ret = fcache_rw_page(fdev, index, page, WRITE);
388     +
389     + fdev->nr_extents = nr_extents;
390     + printk("fcache: wrote %d extents, holding %llu sectors of data\n",
391     + nr_extents, (unsigned long long) sectors);
392     +err:
393     + __free_page(page);
394     + return ret;
395     +}
396     +
397     +static int fcache_write_header(struct fcache_dev *fdev)
398     +{
399     + struct page *page;
400     + int ret;
401     +
402     + page = alloc_page(GFP_KERNEL);
403     + if (unlikely(!page))
404     + return -ENOMEM;
405     +
406     + ret = fcache_rw_page(fdev, FCACHE_HEADER_BLOCK, page, READ);
407     + if (!ret) {
408     + struct fcache_header *header = page_address(page);
409     +
410     + fcache_fill_header(fdev, header, fdev->nr_extents);
411     + ret = fcache_rw_page(fdev, FCACHE_HEADER_BLOCK, page, WRITE);
412     + printk("fcache: wrote header (extents=%lu,serial=%d)\n",
413     + fdev->nr_extents, fdev->serial);
414     + }
415     +
416     + __free_page(page);
417     + return ret;
418     +}
419     +
420     +static void fcache_tree_link(struct fcache_dev *fdev, struct fcache_extent *fe)
421     +{
422     + struct prio_tree_node *node = &fe->prio_node;
423     + unsigned long flags;
424     +
425     + INIT_PRIO_TREE_NODE(node);
426     + node->start = fe->fs_sector;
427     + node->last = fe->fs_sector + (fe->fs_size >> 9) - 1;
428     +
429     + spin_lock_irqsave(&fdev->lock, flags);
430     + prio_tree_insert(&fdev->prio_root, node);
431     + spin_unlock_irqrestore(&fdev->lock, flags);
432     +}
433     +
434     +#define MAX_FE 16
435     +
436     +/*
437     + * Lookup the range of a given request in the prio tree. Used for both
438     + * looking up a range covering a read operation to be served from cache,
439     + * and to lookup potential conflicts from a new write with an existing
440     + * extent.
441     + */
442     +static int fcache_lookup_extent(struct fcache_dev *fdev, sector_t offset,
443     + unsigned int bytes, struct fcache_extent **map)
444     +{
445     + sector_t end_sector = offset + (bytes >> 9) - 1;
446     + struct prio_tree_node *node;
447     + struct prio_tree_iter iter;
448     + int i = 0;
449     +
450     + prio_tree_iter_init(&iter, &fdev->prio_root, offset, end_sector);
451     +
452     + /*
453     + * We only need to lock, if we are priming. The prio tree does
454     + * not change when in normal mode.
455     + */
456     + if (fdev->priming)
457     + spin_lock_irq(&fdev->lock);
458     +
459     + do {
460     + node = prio_tree_next(&iter);
461     + if (!node)
462     + break;
463     +
464     + map[i] = prio_tree_entry(node, struct fcache_extent, prio_node);
465     + } while (++i < MAX_FE);
466     +
467     + if (fdev->priming)
468     + spin_unlock_irq(&fdev->lock);
469     +
470     + return i;
471     +}
472     +
473     +/*
474     + * Our data write is done, now insert the fcache extents into the rbtree.
475     + */
476     +static int fcache_instantiate_extent(struct fcache_dev *fdev,
477     + struct fcache_endio_data *fed)
478     +{
479     + struct fcache_extent *fe;
480     +
481     + fe = kmem_cache_alloc(fcache_slab, GFP_ATOMIC);
482     + if (fe) {
483     + fe->fs_sector = fed->fs_sector;
484     + fe->fs_size = fed->fs_size;
485     + fe->cache_sector = fed->cache_sector;
486     +
487     + fcache_tree_link(fdev, fe);
488     + return 0;
489     + }
490     +
491     + return -ENOMEM;
492     +}
493     +
494     +/*
495     + * Hang on to the bio and its pages - ideally we would want to ensure
496     + * that the page data doesn't change between calling this function and
497     + * fcache_put_bio_pages() as well...
498     + */
499     +static void fcache_get_bio_pages(struct fcache_dev *fdev, struct bio *bio)
500     +{
501     + /*
502     + * Currently stubbed out, as we cannot end the bio read before
503     + * the write completes without also making sure that the pages
504     + * don't get reused for something else in the mean time.
505     + */
506     +#ifdef FCACHE_PAGES_PROTECTED
507     + struct bio_vec *bvec;
508     + int i;
509     +
510     + bio_get(bio);
511     +
512     + __bio_for_each_segment(bvec, bio, i, 0)
513     + get_page(bvec->bv_page);
514     +#endif
515     +}
516     +
517     +static void fcache_put_bio_pages(struct fcache_dev *fdev, struct bio *bio)
518     +{
519     +#ifdef FCACHE_PAGES_PROTECTED
520     + struct bio_vec *bvec;
521     + int i;
522     +
523     + __bio_for_each_segment(bvec, bio, i, 0)
524     + put_page(bvec->bv_page);
525     +
526     + bio_put(bio);
527     +#endif
528     +}
529     +
530     +static void fcache_chop_write_done(struct fcache_endio_data *fed)
531     +{
532     + /*
533     + * Last io completes.
534     + */
535     + if (atomic_dec_and_test(&fed->completions)) {
536     + struct fcache_dev *fdev = fed->fdev;
537     + struct bio *bio = fed->bio;
538     +
539     + /*
540     + * Release our reference to the original bio and
541     + * its pages.
542     + */
543     + fcache_put_bio_pages(fdev, bio);
544     +
545     + /*
546     + * End the read!
547     + */
548     + bio_endio(bio, bio->bi_size, 0);
549     +
550     + /*
551     + * All done, now add extent to our list if io completed ok.
552     + */
553     + if (!fed->io_error)
554     + fcache_instantiate_extent(fdev, fed);
555     +
556     + mempool_free(fed, fed_pool);
557     + }
558     +}
559     +
560     +/*
561     + * Our data write to the cache completes, we can free our clone and
562     + * instantiate the extent block.
563     + */
564     +static int fcache_extent_write_endio(struct bio *bio, unsigned int bytes,
565     + int err)
566     +{
567     + struct fcache_endio_data *fed;
568     +
569     + if (bio->bi_size)
570     + return 1;
571     +
572     + fed = bio->bi_private;
573     +
574     + if (!bio_flagged(bio, BIO_UPTODATE))
575     + fed->io_error = -EIO;
576     +
577     + bio_put(bio);
578     + fcache_chop_write_done(fed);
579     + return 0;
580     +}
581     +
582     +static void fcache_chop_read_done(struct fcache_endio_data *fed)
583     +{
584     + if (atomic_dec_and_test(&fed->completions)) {
585     + struct bio *bio = fed->bio;
586     +
587     + bio_endio(bio, bio->bi_size, fed->io_error);
588     + mempool_free(fed, fed_pool);
589     + }
590     +}
591     +
592     +static int fcache_chop_read_endio(struct bio *bio, unsigned int bytes, int err)
593     +{
594     + struct fcache_endio_data *fed;
595     +
596     + if (bio->bi_size)
597     + return 1;
598     +
599     + fed = bio->bi_private;
600     +
601     + if (!bio_flagged(bio, BIO_UPTODATE))
602     + fed->io_error = -EIO;
603     +
604     + bio_put(bio);
605     + fcache_chop_read_done(fed);
606     + return 0;
607     +}
608     +
609     +typedef void (chopper_done_t) (struct fcache_endio_data *);
610     +
611     +/*
612     + * This is our io chopper - it hacks a bio into smaller pieces, suitable
613     + * for the target device. Caller supplies suitable end_io and done functions.
614     + */
615     +static void fcache_io_chopper(struct fcache_dev *fdev,
616     + struct fcache_endio_data *fed,
617     + bio_end_io_t *endio, chopper_done_t *done, int rw)
618     +{
619     + struct bio *bio = NULL;
620     + struct bio_vec *bv;
621     + unsigned int total_bytes;
622     + sector_t sector;
623     + int i, vecs;
624     +
625     + /*
626     + * Make sure 'fed' doesn't disappear while we are still issuing
627     + * ios, the artificial reference is dropped at the end.
628     + */
629     + atomic_set(&fed->completions, 1);
630     +
631     + sector = fed->cache_sector;
632     + total_bytes = fed->fs_size;
633     + vecs = fed->bio->bi_vcnt;
634     + bio_for_each_segment(bv, fed->bio, i) {
635     + unsigned int len;
636     +
637     + if (!total_bytes)
638     + break;
639     +
640     + len = bv->bv_len;
641     + if (len > total_bytes)
642     + len = total_bytes;
643     +
644     + do {
645     + unsigned int l;
646     +
647     + if (!bio) {
648     + bio = bio_alloc(GFP_NOFS, vecs);
649     +
650     + bio->bi_sector = sector;
651     + bio->bi_bdev = fdev->bdev;
652     + bio->bi_end_io = endio;
653     + bio->bi_private = fed;
654     + }
655     +
656     + /*
657     + * If successful, break out of this loop and move on.
658     + */
659     + l = bio_add_page(bio, bv->bv_page, len, bv->bv_offset);
660     + if (l == len)
661     + break;
662     +
663     + BUG_ON(!bio->bi_size);
664     +
665     + /*
666     + * We could not add this page, submit what we have
667     + * and alloc a new bio.
668     + */
669     + atomic_inc(&fed->completions);
670     + submit_bio(rw, bio);
671     + bio = NULL;
672     + } while (1);
673     +
674     + total_bytes -= len;
675     + sector += len >> 9;
676     + vecs--;
677     + }
678     +
679     + if (bio) {
680     + atomic_inc(&fed->completions);
681     + submit_bio(rw, bio);
682     + }
683     +
684     + /*
685     + * Drop our reference to fed.
686     + */
687     + done(fed);
688     +}
689     +
690     +/*
691     + * cache device has similar or higher queue restrictions than the fs
692     + * device - in that case, we can resubmit the bio to the device directly.
693     + */
694     +static void fcache_direct_cache_write(struct fcache_dev *fdev,
695     + struct fcache_endio_data *fed)
696     +{
697     + struct bio *bio = bio_clone(fed->bio, GFP_NOFS);
698     +
699     + bio->bi_sector = fed->cache_sector;
700     + bio->bi_bdev = fdev->bdev;
701     + bio->bi_end_io = fcache_extent_write_endio;
702     + bio->bi_private = fed;
703     +
704     + atomic_set(&fed->completions, 1);
705     + submit_bio(WRITE, bio);
706     +}
707     +
708     +/*
709     + * cache device has more conservative restrictions than the fs device.
710     + * The safest approach is to split up the bio and let bio_add_page()
711     + * decide when it's time to submit the pieces.
712     + */
713     +static void fcache_submit_cache_write(struct fcache_dev *fdev,
714     + struct fcache_endio_data *fed)
715     +{
716     + if (!fdev->chop_ios)
717     + fcache_direct_cache_write(fdev, fed);
718     + else
719     + fcache_io_chopper(fdev, fed, fcache_extent_write_endio,
720     + fcache_chop_write_done, WRITE);
721     +}
722     +
723     +/*
724     + * We punt work to fcache_work() whenever we need do work that blocks. The
725     + * only thing that this thread handles is submitting the extent write
726     + * when the real read has completed. We used to do the extent instantiation
727     + * here as well, but fcache_extent_write_endio handles that now.
728     + */
729     +static void fcache_work(void *data)
730     +{
731     + struct fcache_dev *fdev = data;
732     +
733     + do {
734     + struct fcache_endio_data *fed = NULL;
735     + struct bio *bio;
736     +
737     + spin_lock_irq(&fdev->lock);
738     + if (!list_empty(&fdev->list)) {
739     + fed = list_entry(fdev->list.next, struct fcache_endio_data,list);
740     + list_del_init(&fed->list);
741     + }
742     + spin_unlock_irq(&fdev->lock);
743     +
744     + if (!fed)
745     + break;
746     +
747     + bio = fed->bio;
748     +
749     + if (fed->io_error) {
750     + printk(KERN_ERR "fcache: read error from device\n");
751     + bio_endio(bio, bio->bi_size, fed->io_error);
752     + continue;
753     + }
754     +
755     + /*
756     + * Get a ref on the original bio and pages, then
757     + * we should be able to signal completion of the READ
758     + * without waiting for the write to finish first.
759     + */
760     + fcache_get_bio_pages(fdev, bio);
761     +
762     + /*
763     + * Submit the read data as cache writes.
764     + */
765     + fcache_submit_cache_write(fdev, fed);
766     +
767     + /*
768     + * If fcache_get_bio_pages() could protect the pages from
769     + * being changed, we could end the io here instead of in
770     + * fcache_chop_write_done().
771     + */
772     + } while (1);
773     +}
774     +
775     +/*
776     + * Align bio to start at extent and stop sooner if extent is short. Must
777     + * be called cautiously - it's only allowed to modify the bio if this is
778     + * a clone and a write request, reads must be fully aligned and only
779     + * possibly require a starting offset modification.
780     + */
781     +static void fcache_bio_align(struct bio *bio, struct fcache_extent *fe)
782     +{
783     + struct bio_vec *bvec;
784     + sector_t start, end;
785     + sector_t org_start, org_end;
786     + unsigned int org_size, org_idx;
787     + int i;
788     +
789     + start = bio->bi_sector;
790     + bio->bi_sector = fe->cache_sector;
791     +
792     + /*
793     + * Nothing to do, perfectly aligned.
794     + */
795     + if (start == fe->fs_sector && bio->bi_size == fe->fs_size)
796     + return;
797     +
798     + org_start = bio->bi_sector;
799     + org_end = bio->bi_sector + (bio->bi_size >> 9);
800     + org_size = bio->bi_size;
801     + org_idx = bio->bi_idx;
802     +
803     + /*
804     + * Adjust beginning.
805     + */
806     + if (start > fe->fs_sector)
807     + bio->bi_sector += (start - fe->fs_sector);
808     + else if (start < fe->fs_sector) {
809     + sector_t diff = fe->fs_sector - start;
810     + int idx = 0;
811     +
812     + BUG_ON(!(bio->bi_flags & (1 << BIO_CLONED)));
813     + BUG_ON(bio_data_dir(bio) != WRITE);
814     +
815     + /*
816     + * Adjust where bio starts
817     + */
818     + __bio_for_each_segment(bvec, bio, i, 0) {
819     + unsigned int bsec = bvec->bv_len >> 9;
820     + unsigned int this_diff = bsec;
821     +
822     + if (!diff)
823     + break;
824     + if (this_diff > diff)
825     + this_diff = diff;
826     +
827     + bio->bi_sector += this_diff;
828     + bio->bi_size -= (this_diff << 9);
829     +
830     + /*
831     + * Bigger than this chunk, skip ahead.
832     + */
833     + if (this_diff == bsec) {
834     + idx++;
835     + diff -= this_diff;
836     + continue;
837     + }
838     +
839     + /*
840     + * Adjust this bvec
841     + */
842     + bvec->bv_offset += (this_diff << 9);
843     + bvec->bv_len -= (this_diff << 9);
844     + break;
845     + }
846     + bio->bi_idx += idx;
847     + }
848     +
849     + /*
850     + * Goes beyond the end, shrink size.
851     + */
852     + end = bio->bi_sector + (bio->bi_size >> 9);
853     + if (end > fe->cache_sector + (fe->fs_size >> 9)) {
854     + sector_t diff = end - (fe->cache_sector + (fe->fs_size >> 9));
855     + int vecs = 0;
856     +
857     + BUG_ON(!(bio->bi_flags & (1 << BIO_CLONED)));
858     + BUG_ON(bio_data_dir(bio) != WRITE);
859     +
860     + /*
861     + * This is __bio_for_each_segment_reverse().
862     + */
863     + for (i = bio->bi_vcnt - 1; i >= bio->bi_idx; i--) {
864     + struct bio_vec *bvec = &bio->bi_io_vec[i];
865     + unsigned int bsec = bvec->bv_len >> 9;
866     + unsigned int this_diff = bsec;
867     +
868     + if (!diff)
869     + break;
870     + if (this_diff > diff)
871     + this_diff = diff;
872     +
873     + bio->bi_size -= (this_diff << 9);
874     +
875     + /*
876     + * Bigger than this chunk, skip ahead.
877     + */
878     + if (this_diff == bsec) {
879     + vecs++;
880     + diff -= this_diff;
881     + continue;
882     + }
883     +
884     + /*
885     + * Adjust this bvec
886     + */
887     + bvec->bv_len -= (this_diff << 9);
888     + break;
889     + }
890     + bio->bi_vcnt -= vecs;
891     + }
892     +
893     + BUG_ON(bio->bi_sector < fe->cache_sector);
894     + BUG_ON(bio->bi_sector + (bio->bi_size >> 9) > fe->cache_sector + (fe->fs_size >> 9));
895     +
896     + /*
897     + * Invalidate the segment counts, we changed the bio layout.
898     + */
899     + bio->bi_flags &= ~(1 << BIO_SEG_VALID);
900     + bio->bi_flags |= (1 << BIO_NOMERGE);
901     +}
902     +
903     +static int fcache_overwrite_endio(struct bio *bio, unsigned int bytes, int err)
904     +{
905     + if (bio->bi_size)
906     + return 1;
907     +
908     + if (!bio_flagged(bio, BIO_UPTODATE)) {
909     + struct fcache_dev *fdev = bio->bi_private;
910     +
911     + printk(KERN_ERR "fcache: overwrite error, cache off\n");
912     + set_bit(FDEV_F_DOWN, &fdev->flags);
913     + }
914     +
915     + bio_put(bio);
916     + return 0;
917     +}
918     +
919     +/*
920     + * Schedule overwrite of some existing block(s).
921     + */
922     +static int fcache_overwrite_extent(struct fcache_dev *fdev,
923     + struct fcache_extent *fe, struct bio *bio)
924     +{
925     + struct bio *clone = bio_clone(bio, GFP_NOFS);
926     +
927     + clone->bi_bdev = fdev->bdev;
928     + clone->bi_end_io = fcache_overwrite_endio;
929     + clone->bi_private = fdev;
930     + fcache_bio_align(clone, fe);
931     + submit_bio(WRITE, clone);
932     + return 0;
933     +}
934     +
935     +/*
936     + * Our real data read is complete. Kick our process context handler so it
937     + * can submit the write to our cache.
938     + */
939     +static int fcache_extent_endio(struct bio *bio, unsigned int bytes, int err)
940     +{
941     + struct fcache_dev *fdev;
942     + struct fcache_endio_data *fed;
943     + unsigned long flags;
944     +
945     + if (bio->bi_size)
946     + return 1;
947     +
948     + fed = bio->bi_private;
949     +
950     + if (!bio_flagged(bio, BIO_UPTODATE))
951     + fed->io_error = -EIO;
952     +
953     + bio_put(bio);
954     +
955     + fdev = fed->fdev;
956     + spin_lock_irqsave(&fdev->lock, flags);
957     + list_add_tail(&fed->list, &fdev->list);
958     + spin_unlock_irqrestore(&fdev->lock, flags);
959     + queue_work(fcache_workqueue, &fdev->work);
960     + return 0;
961     +}
962     +
963     +/*
964     + * This initiates adding an extent to our list. We do this by cloning the
965     + * original bio and submitting that to the real device and when that completes
966     + * we write that out to the cache device and instantiate the extent.
967     + */
968     +static int fcache_add_extent(struct fcache_dev *fdev, struct bio *bio)
969     +{
970     + struct fcache_endio_data *fed;
971     + struct bio *clone;
972     +
973     + fed = mempool_alloc(fed_pool, GFP_NOIO);
974     +
975     + fed->fdev = fdev;
976     + fed->fs_sector = bio->bi_sector;
977     + fed->fs_size = bio->bi_size;
978     + fed->cache_sector = -1;
979     + fed->bio = bio;
980     + fed->io_error = 0;
981     + INIT_LIST_HEAD(&fed->list);
982     +
983     + /*
984     + * Allocate/assign an extent block for this range
985     + */
986     + spin_lock_irq(&fdev->lock);
987     + if (fdev->nr_extents < fdev->max_extents) {
988     + unsigned long nr = (bio->bi_size + PAGE_SIZE - 1) >> PAGE_SHIFT;
989     +
990     + if (fdev->next_cache_block + nr <= fdev->cache_blocks) {
991     + fdev->nr_extents++;
992     + fed->cache_sector = fdev->next_cache_block << BLOCK_SHIFT;
993     + fdev->next_cache_block += nr;
994     + }
995     + }
996     + spin_unlock_irq(&fdev->lock);
997     +
998     + /*
999     + * Ran out of room
1000     + */
1001     + if (fed->cache_sector == -1) {
1002     + printk(KERN_ERR "fcache: ran out of space, priming now off\n");
1003     + fdev->priming = 0;
1004     + mempool_free(fed, fed_pool);
1005     + return -ENOENT;
1006     + }
1007     +
1008     + clone = bio_clone(bio, GFP_NOFS);
1009     + clone->bi_private = fed;
1010     + clone->bi_end_io = fcache_extent_endio;
1011     + clone->bi_rw |= (1 << BIO_RW_SYNC);
1012     +
1013     + generic_make_request(clone);
1014     + return 0;
1015     +}
1016     +
1017     +static int fcache_parse_extents(struct fcache_dev *fdev, void *addr,
1018     + unsigned int max_extents)
1019     +{
1020     + int nr_extents = PAGE_SIZE / sizeof(struct fcache_extent);
1021     + int extents_read;
1022     +
1023     + if (nr_extents > max_extents)
1024     + nr_extents = max_extents;
1025     +
1026     + extents_read = 0;
1027     + while (nr_extents) {
1028     + struct fcache_extent *fe, *__fe = addr;
1029     +
1030     + fe = kmem_cache_alloc(fcache_slab, GFP_KERNEL);
1031     + if (unlikely(!fe))
1032     + return -ENOMEM;
1033     +
1034     + memset(fe, 0, sizeof(*fe));
1035     + fe->fs_sector = __fe->fs_sector;
1036     + fe->fs_size = __fe->fs_size;
1037     + fe->cache_sector = __fe->cache_sector;
1038     +
1039     + fcache_tree_link(fdev, fe);
1040     +
1041     + nr_extents--;
1042     + extents_read++;
1043     + addr += sizeof(*fe);
1044     + }
1045     +
1046     + return extents_read;
1047     +}
1048     +
1049     +static int fcache_read_extents(struct fcache_dev *fdev)
1050     +{
1051     + unsigned int nr_extents = fdev->nr_extents;
1052     + int ret, extents, total_extents;
1053     + struct page *page;
1054     + sector_t index;
1055     + void *p;
1056     +
1057     + page = alloc_page(GFP_KERNEL);
1058     + if (unlikely(!page))
1059     + return -ENOMEM;
1060     +
1061     + ret = 0;
1062     + total_extents = 0;
1063     + index = FCACHE_EXTENT_BLOCK;
1064     + while (nr_extents) {
1065     + ret = fcache_rw_page(fdev, index, page, READ);
1066     + if (ret)
1067     + break;
1068     +
1069     + p = page_address(page);
1070     + extents = fcache_parse_extents(fdev, p, nr_extents);
1071     +
1072     + if (extents < 0) {
1073     + ret = extents;
1074     + break;
1075     + }
1076     +
1077     + index++;
1078     + nr_extents -= extents;
1079     + total_extents += extents;
1080     + }
1081     +
1082     + __free_page(page);
1083     +
1084     + if (ret)
1085     + return ret;
1086     +
1087     + return total_extents;
1088     +}
1089     +
1090     +/*
1091     + * Read an existing fcache header from the device, and then proceed to
1092     + * reading and adding the extents to our prio tree.
1093     + */
1094     +static int fcache_load_header(struct fcache_dev *fdev, int serial)
1095     +{
1096     + struct fcache_header *header = NULL;
1097     + struct page *page;
1098     + int ret, wrong_serial = 0;
1099     + char b[BDEVNAME_SIZE];
1100     +
1101     + page = alloc_page(GFP_HIGHUSER);
1102     + if (unlikely(!page))
1103     + return -ENOMEM;
1104     +
1105     + ret = fcache_rw_page(fdev, FCACHE_HEADER_BLOCK, page, READ);
1106     + if (unlikely(ret))
1107     + goto err;
1108     +
1109     + ret = -EINVAL;
1110     + header = kmap_atomic(page, KM_USER0);
1111     + if (header->magic != FCACHE_MAGIC) {
1112     + printk(KERN_ERR "fcache: bad magic %x\n", header->magic);
1113     + goto err;
1114     + }
1115     + if (header->version != FCACHE_VERSION) {
1116     + printk(KERN_ERR "fcache: bad version %d\n", header->version);
1117     + goto err;
1118     + }
1119     + if (strcmp(bdevname(fdev->fs_bdev, b), header->fs_dev)) {
1120     + printk(KERN_ERR "fcache: device mismatch (%s/%s)\n", b,
1121     + header->fs_dev);
1122     + goto err;
1123     + }
1124     + if (header->fs_start_sector != fdev->fs_start_sector ||
1125     + header->fs_sectors != fdev->fs_sectors) {
1126     + printk(KERN_ERR "fcache: fs appears to have changed size\n");
1127     + goto err;
1128     + }
1129     +
1130     + fdev->nr_extents = header->nr_extents;
1131     + fdev->max_extents = header->max_extents;
1132     +
1133     + /*
1134     + * Don't fail on out-of-date serial, just warn that the user needs
1135     + * to prime the cache again. Until then we'll just bypass the cache.
1136     + */
1137     + if (header->serial != serial) {
1138     + printk(KERN_ERR "fcache: found serial %d, expected %d.\n",
1139     + header->serial, serial);
1140     + printk(KERN_ERR "fcache: reprime the cache!\n");
1141     + wrong_serial = 1;
1142     + }
1143     +
1144     + fdev->serial = header->serial;
1145     + kunmap_atomic(header, KM_USER0);
1146     + __free_page(page);
1147     +
1148     + if (!wrong_serial) {
1149     + printk("fcache: header looks valid (extents=%lu, serial=%d)\n", fdev->nr_extents, fdev->serial);
1150     +
1151     + ret = fcache_read_extents(fdev);
1152     + printk("fcache: loaded %d extents\n", ret);
1153     +
1154     + /*
1155     + * If we don't find all the extents we require, fail.
1156     + */
1157     + if (ret != fdev->nr_extents) {
1158     + fcache_free_prio_tree(fdev);
1159     + ret = -EINVAL;
1160     + } else
1161     + ret = 0;
1162     + }
1163     +
1164     + return ret;
1165     +err:
1166     + if (header)
1167     + kunmap_atomic(header, KM_USER0);
1168     + __free_page(page);
1169     + return ret;
1170     +}
1171     +
1172     +/*
1173     + * We use this range to decide when to log an io to the target device.
1174     + */
1175     +static void fcache_fill_fs_size(struct fcache_dev *fdev)
1176     +{
1177     + struct block_device *bdev = fdev->fs_bdev;
1178     +
1179     + /*
1180     + * Partition or whole device?
1181     + */
1182     + if (bdev != bdev->bd_contains) {
1183     + struct hd_struct *p = bdev->bd_part;
1184     +
1185     + fdev->fs_start_sector = p->start_sect;
1186     + fdev->fs_sectors = p->nr_sects;
1187     + } else {
1188     + fdev->fs_start_sector = 0;
1189     + fdev->fs_sectors = bdev->bd_inode->i_size >> 9;
1190     + }
1191     +}
1192     +
1193     +static void fcache_fill_cache_size(struct fcache_dev *fdev)
1194     +{
1195     + struct block_device *bdev = fdev->bdev;
1196     +
1197     + /*
1198     + * Partition or whole device?
1199     + */
1200     + if (bdev != bdev->bd_contains) {
1201     + struct hd_struct *p = bdev->bd_part;
1202     +
1203     + fdev->cache_start_sector = p->start_sect;
1204     + fdev->cache_blocks = p->nr_sects >> BLOCK_SHIFT;
1205     + } else {
1206     + fdev->cache_start_sector = 0;
1207     + fdev->cache_blocks = bdev->bd_inode->i_size >> PAGE_SHIFT;
1208     + }
1209     +}
1210     +
1211     +/*
1212     + * This is a read request, check if we have that block. If we do, then
1213     + * just redirect. If not, pass it through.
1214     + */
1215     +static int fcache_read_request(struct fcache_dev *fdev, request_queue_t *q,
1216     + struct bio *bio)
1217     +{
1218     + struct fcache_extent *extents[MAX_FE];
1219     + struct fcache_extent *fe;
1220     + int i, nr;
1221     +
1222     + /*
1223     + * Not there, redirect to original but schedule adding this extent
1224     + * to our list if we are priming.
1225     + */
1226     + nr = fcache_lookup_extent(fdev, bio->bi_sector, bio->bi_size, extents);
1227     + if (!nr) {
1228     + if (fdev->priming && !fcache_add_extent(fdev, bio))
1229     + return 0;
1230     +
1231     + fdev->misses++;
1232     + return fdev->mfn(q, bio);
1233     + }
1234     +
1235     + /*
1236     + * If range is at least as big, we use our cache. If not, cop out
1237     + * and just submit to real device.
1238     + */
1239     + for (i = 0; i < nr; i++) {
1240     + sector_t end_fe, end_bi;
1241     + fe = extents[i];
1242     +
1243     + end_fe = fe->fs_sector + (fe->fs_size >> 9);
1244     + end_bi = bio->bi_sector + (bio->bi_size >> 9);
1245     +
1246     + /*
1247     + * match!
1248     + */
1249     + if (bio->bi_sector >= fe->fs_sector && end_bi <= end_fe)
1250     + break;
1251     +
1252     + fe = NULL;
1253     + }
1254     +
1255     + /*
1256     + * Nopes, send to real device.
1257     + */
1258     + if (!fe) {
1259     + fdev->misses++;
1260     + return fdev->mfn(q, bio);
1261     + }
1262     +
1263     + /*
1264     + * Perfect, adjust start offset if it isn't aligned.
1265     + */
1266     + fdev->hits++;
1267     + fcache_bio_align(bio, fe);
1268     +
1269     + /*
1270     + * If we don't have to chop it up, just let generic_make_request()
1271     + * handle the stacking. Otherwise, return handled and pass to chopper.
1272     + */
1273     + if (fdev->chop_ios) {
1274     + struct fcache_endio_data *fed;
1275     +
1276     + fed = mempool_alloc(fed_pool, GFP_NOIO);
1277     +
1278     + fed->fdev = fdev;
1279     + fed->cache_sector = bio->bi_sector;
1280     + fed->fs_size = bio->bi_size;
1281     + fed->bio = bio;
1282     + fed->io_error = 0;
1283     + fcache_io_chopper(fdev, fed, fcache_chop_read_endio,
1284     + fcache_chop_read_done, READ);
1285     + return 0;
1286     + }
1287     +
1288     + bio->bi_bdev = fdev->bdev;
1289     + return 1;
1290     +}
1291     +
1292     +/*
1293     + * If we are priming the cache, always add this block. If not, then we still
1294     + * need to overwrite this block if it's in our cache.
1295     + */
1296     +static int fcache_write_request(struct fcache_dev *fdev, request_queue_t *q,
1297     + struct bio *bio)
1298     +{
1299     + struct fcache_extent *extents[MAX_FE];
1300     + struct fcache_extent *fe;
1301     + sector_t start = bio->bi_sector;
1302     + int i, nr;
1303     +
1304     +repeat:
1305     + nr = fcache_lookup_extent(fdev, bio->bi_sector, bio->bi_size, extents);
1306     +
1307     + /*
1308     + * Find out what to overwrite, if anything.
1309     + */
1310     + for (i = 0; i < nr; i++) {
1311     + fe = extents[i];
1312     + fdev->overwrites++;
1313     + fcache_overwrite_extent(fdev, fe, bio);
1314     + }
1315     +
1316     + /*
1317     + * If i == MAX_FE, there _may_ be more extents. Repeat lookup, start
1318     + * from the end of last request.
1319     + */
1320     + if (i == MAX_FE) {
1321     + fe = extents[i - 1];
1322     + start = fe->fs_sector + (fe->fs_size >> 9);
1323     + goto repeat;
1324     + }
1325     +
1326     + return fdev->mfn(q, bio);
1327     +}
1328     +
1329     +/*
1330     + * This is the only case where we resubmit an io to the device but don't
1331     + * want to count it as part of io we log.
1332     + */
1333     +#define fcache_bio_seen(bio) ((bio)->bi_end_io == fcache_extent_endio)
1334     +
1335     +static int fcache_make_request(request_queue_t *q, struct bio *bio)
1336     +{
1337     + struct fcache_dev *fdev = &fcache_dev;
1338     +
1339     + /*
1340     + * If it's in the sector range we are monitoring and the device isn't
1341     + * being shut down, then pass it on. Assume a bio doesn't span into
1342     + * the next partition, so don't bother accounting for size.
1343     + */
1344     + if ((bio->bi_sector >= fdev->fs_start_sector) &&
1345     + (bio->bi_sector < (fdev->fs_start_sector + fdev->fs_sectors)) &&
1346     + !test_bit(FDEV_F_DOWN, &fdev->flags) &&
1347     + !fcache_bio_seen(bio)) {
1348     +
1349     + fdev->ios[bio_data_dir(bio)]++;
1350     +
1351     + if (bio_data_dir(bio) == READ)
1352     + return fcache_read_request(fdev, q, bio);
1353     +
1354     + return fcache_write_request(fdev, q, bio);
1355     + }
1356     +
1357     + /*
1358     + * Pass through to original make_request_fn.
1359     + */
1360     + return fdev->mfn(q, bio);
1361     +}
1362     +
1363     +/*
1364     + * Attach the cache device 'bdev' to 'fdev'.
1365     + */
1366     +static int fcache_setup_dev(struct fcache_dev *fdev,
1367     + struct block_device *fs_bdev,
1368     + struct block_device *bdev,
1369     + int priming, int serial)
1370     +{
1371     + request_queue_t *fs_q, *cache_q;
1372     + char b[BDEVNAME_SIZE];
1373     + int ret;
1374     +
1375     + memset(fdev, 0, sizeof(*fdev));
1376     + INIT_PRIO_TREE_ROOT(&fdev->prio_root);
1377     + spin_lock_init(&fdev->lock);
1378     + INIT_LIST_HEAD(&fdev->list);
1379     + INIT_WORK(&fdev->work, fcache_work, fdev);
1380     + fdev->priming = priming;
1381     + fdev->fs_bdev = fs_bdev;
1382     + fdev->bdev = bdev;
1383     +
1384     + ret = -EINVAL;
1385     +
1386     + fs_q = bdev_get_queue(fs_bdev);
1387     + cache_q = bdev_get_queue(bdev);
1388     + if (!fs_q || !cache_q)
1389     + goto out;
1390     +
1391     + /*
1392     + * Chop up outgoing ios, if the target is a different queue. We could
1393     + * look closer at limits, but it's fragile and pretty pointless.
1394     + */
1395     + if (fs_q != cache_q)
1396     + fdev->chop_ios = 1;
1397     +
1398     + ret = bd_claim(bdev, fcache_setup_dev);
1399     + if (ret < 0)
1400     + goto out;
1401     +
1402     + ret = block_size(bdev);
1403     + if (ret != PAGE_SIZE) {
1404     + fdev->old_bs = ret;
1405     + ret = set_blocksize(bdev, PAGE_SIZE);
1406     + if (ret < 0)
1407     + goto out_release;
1408     + } else
1409     + ret = 0;
1410     +
1411     + fcache_fill_cache_size(fdev);
1412     + fcache_fill_fs_size(fdev);
1413     +
1414     + if (priming) {
1415     + fdev->serial = serial;
1416     + ret = fcache_write_new_header(fdev);
1417     + } else
1418     + ret = fcache_load_header(fdev, serial);
1419     +
1420     + if (!ret) {
1421     + printk("fcache: %s opened successfully (%spriming)\n",
1422     + bdevname(bdev, b),
1423     + priming ? "" : "not ");
1424     + return 0;
1425     + }
1426     +
1427     +out_release:
1428     + bd_release(fdev->bdev);
1429     +out:
1430     + blkdev_put(fdev->bdev);
1431     + fdev->bdev = NULL;
1432     + return ret;
1433     +}
1434     +
1435     +/*
1436     + * Return fdev->bdev to its original state.
1437     + */
1438     +static void fcache_shutdown_dev(struct fcache_dev *fdev,
1439     + struct block_device *bdev)
1440     +{
1441     + if (fdev->bdev) {
1442     + if (fdev->mfn) {
1443     + request_queue_t *q = bdev_get_queue(bdev);
1444     +
1445     + (void) xchg(&q->make_request_fn, fdev->mfn);
1446     + }
1447     + sync_blockdev(fdev->bdev);
1448     + if (fdev->old_bs)
1449     + set_blocksize(fdev->bdev, fdev->old_bs);
1450     +
1451     + bd_release(fdev->bdev);
1452     + blkdev_put(fdev->bdev);
1453     + fdev->bdev = NULL;
1454     + INIT_PRIO_TREE_ROOT(&fdev->prio_root);
1455     + }
1456     +}
1457     +
1458     +/*
1459     + * bdev is the file system device, cache_dev is the device we want to store
1460     + * the cache on.
1461     + */
1462     +int fcache_dev_open(struct block_device *bdev, unsigned long cache_dev,
1463     + int priming, int serial)
1464     +{
1465     + struct block_device *fcache_bdev;
1466     + request_queue_t *q;
1467     + int ret;
1468     +
1469     + if (disable)
1470     + return 0;
1471     + if (fcache_dev.bdev)
1472     + return -EBUSY;
1473     +
1474     + fcache_bdev = open_by_devnum(cache_dev, FMODE_READ|FMODE_WRITE);
1475     + if (IS_ERR(fcache_bdev))
1476     + return PTR_ERR(fcache_bdev);
1477     +
1478     + ret = fcache_setup_dev(&fcache_dev, bdev, fcache_bdev, priming, serial);
1479     + if (ret)
1480     + return ret;
1481     +
1482     + q = bdev_get_queue(bdev);
1483     + fcache_dev.mfn = xchg(&q->make_request_fn, fcache_make_request);
1484     + return 0;
1485     +}
1486     +
1487     +EXPORT_SYMBOL(fcache_dev_open);
1488     +
1489     +void fcache_dev_close(struct block_device *bdev, int serial)
1490     +{
1491     + struct fcache_dev *fdev = &fcache_dev;
1492     +
1493     + if (disable)
1494     + return;
1495     +
1496     + if (!fdev->bdev)
1497     + return;
1498     +
1499     + printk("fcache: ios r/w %u/%u, hits %u, misses %u, overwrites %u\n",
1500     + fdev->ios[0], fdev->ios[1], fdev->hits,
1501     + fdev->misses, fdev->overwrites);
1502     + fdev->serial = serial;
1503     +
1504     + sync_blockdev(bdev);
1505     + set_bit(FDEV_F_DOWN, &fdev->flags);
1506     +
1507     + if (fdev->priming)
1508     + fcache_write_extents(fdev);
1509     +
1510     + fcache_write_header(fdev);
1511     + fcache_free_prio_tree(fdev);
1512     + fcache_shutdown_dev(fdev, bdev);
1513     +}
1514     +
1515     +EXPORT_SYMBOL(fcache_dev_close);
1516     +
1517     +static int fcache_init(void)
1518     +{
1519     + fcache_slab = kmem_cache_create("fcache", sizeof(struct fcache_extent),
1520     + 0, 0, NULL, NULL);
1521     + if (!fcache_slab)
1522     + return -ENOMEM;
1523     +
1524     + fcache_fed_slab = kmem_cache_create("fcache_fed",
1525     + sizeof(struct fcache_endio_data), 0, 0,
1526     + NULL, NULL);
1527     + if (!fcache_fed_slab) {
1528     + kmem_cache_destroy(fcache_slab);
1529     + return -ENOMEM;
1530     + }
1531     +
1532     + fed_pool = mempool_create_slab_pool(1, fcache_fed_slab);
1533     + if (!fed_pool) {
1534     + kmem_cache_destroy(fcache_slab);
1535     + kmem_cache_destroy(fcache_fed_slab);
1536     + return -ENOMEM;
1537     + }
1538     +
1539     + fcache_workqueue = create_singlethread_workqueue("fcached");
1540     + if (!fcache_workqueue)
1541     + panic("fcache: failed to create fcached\n");
1542     +
1543     + return 0;
1544     +}
1545     +
1546     +static void fcache_exit(void)
1547     +{
1548     + destroy_workqueue(fcache_workqueue);
1549     + mempool_destroy(fed_pool);
1550     + kmem_cache_destroy(fcache_slab);
1551     + kmem_cache_destroy(fcache_fed_slab);
1552     +}
1553     +
1554     +MODULE_AUTHOR("Jens Axboe <axboe@suse.de>");
1555     +MODULE_LICENSE("GPL");
1556     +
1557     +module_init(fcache_init);
1558     +module_exit(fcache_exit);
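
A note on the on-disk format defined above (FCACHE_MAGIC, FCACHE_VERSION and struct
fcache_header in drivers/block/fcache.c): the header lives in the first page of the cache
partition and starts with six u32 fields. The sketch below is a hypothetical user-space
inspector that decodes only those leading fields; the offsets of the later u64 fields and
the fs_dev name depend on the kernel's structure padding, so they are deliberately left
alone, and the default device path is just an example.

/*
 * Read the first 24 bytes of block 0 and interpret them as the leading
 * u32 fields of struct fcache_header: magic, version, nr_extents,
 * max_extents, serial, extent_offset.
 */
#include <stdio.h>
#include <stdint.h>
#include <fcntl.h>
#include <unistd.h>

#define FCACHE_MAGIC	0x61786663u
#define FCACHE_VERSION	0x02u

int main(int argc, char **argv)
{
	const char *dev = argc > 1 ? argv[1] : "/dev/sdb1";	/* example path */
	uint32_t hdr[6];
	int fd;

	fd = open(dev, O_RDONLY);
	if (fd < 0 || pread(fd, hdr, sizeof(hdr), 0) != (ssize_t) sizeof(hdr)) {
		perror(dev);
		return 1;
	}
	close(fd);

	if (hdr[0] != FCACHE_MAGIC || hdr[1] != FCACHE_VERSION) {
		fprintf(stderr, "%s: no valid fcache v2 header\n", dev);
		return 1;
	}
	printf("extents %u/%u, serial %u, extent maps from block 1, data from block %u\n",
	       hdr[2], hdr[3], hdr[4], hdr[5]);
	return 0;
}
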
1559     Index: linux-ck-dev/fs/ext3/super.c
1560     ===================================================================
1561     --- linux-ck-dev.orig/fs/ext3/super.c 2006-06-18 15:20:10.000000000 +1000
1562     +++ linux-ck-dev/fs/ext3/super.c 2006-06-18 15:25:27.000000000 +1000
1563     @@ -384,11 +384,43 @@ static void dump_orphan_list(struct supe
1564     }
1565     }
1566    
1567     +extern int fcache_dev_open(struct block_device *, unsigned long, int, int);
1568     +extern void fcache_dev_close(struct block_device *, int);
1569     +
1570     +static void ext3_close_fcache(struct super_block *sb)
1571     +{
1572     + struct ext3_sb_info *sbi = EXT3_SB(sb);
1573     + struct ext3_super_block *es = sbi->s_es;
1574     + int serial = le16_to_cpu(es->s_mnt_count);
1575     +
1576     + fcache_dev_close(sb->s_bdev, serial);
1577     +}
1578     +
1579     +static int ext3_open_fcache(struct super_block *sb, unsigned long cachedev)
1580     +{
1581     + struct ext3_sb_info *sbi = EXT3_SB(sb);
1582     + struct ext3_super_block *es = sbi->s_es;
1583     + int priming = test_opt(sb, FCACHEPRIME);
1584     + int serial = le16_to_cpu(es->s_mnt_count);
1585     + int ret;
1586     +
1587     + ret = fcache_dev_open(sb->s_bdev, cachedev, priming, serial);
1588     + if (!ret) {
1589     + set_opt(sbi->s_mount_opt, FCACHE);
1590     + return 0;
1591     + }
1592     +
1593     + printk(KERN_ERR "ext3: failed to open fcache (err=%d)\n", ret);
1594     + return ret;
1595     +}
1596     +
1597     static void ext3_put_super (struct super_block * sb)
1598     {
1599     struct ext3_sb_info *sbi = EXT3_SB(sb);
1600     struct ext3_super_block *es = sbi->s_es;
1601     - int i;
1602     + int i, has_fcache;
1603     +
1604     + has_fcache = test_opt(sb, FCACHE);
1605    
1606     ext3_xattr_put_super(sb);
1607     journal_destroy(sbi->s_journal);
1608     @@ -431,6 +463,8 @@ static void ext3_put_super (struct super
1609     invalidate_bdev(sbi->journal_bdev, 0);
1610     ext3_blkdev_remove(sbi);
1611     }
1612     + if (has_fcache)
1613     + ext3_close_fcache(sb);
1614     sb->s_fs_info = NULL;
1615     kfree(sbi);
1616     return;
1617     @@ -635,7 +669,7 @@ enum {
1618     Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
1619     Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota,
1620     Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota,
1621     - Opt_grpquota
1622     + Opt_grpquota, Opt_fcache_dev, Opt_fcache_prime,
1623     };
1624    
1625     static match_table_t tokens = {
1626     @@ -684,6 +718,8 @@ static match_table_t tokens = {
1627     {Opt_quota, "quota"},
1628     {Opt_usrquota, "usrquota"},
1629     {Opt_barrier, "barrier=%u"},
1630     + {Opt_fcache_dev, "fcache_dev=%s"},
1631     + {Opt_fcache_prime, "fcache_prime=%u"},
1632     {Opt_err, NULL},
1633     {Opt_resize, "resize"},
1634     };
1635     @@ -710,6 +746,7 @@ static unsigned long get_sb_block(void *
1636    
1637     static int parse_options (char *options, struct super_block *sb,
1638     unsigned long *inum, unsigned long *journal_devnum,
1639     + unsigned long *fcache_devnum,
1640     unsigned long *n_blocks_count, int is_remount)
1641     {
1642     struct ext3_sb_info *sbi = EXT3_SB(sb);
1643     @@ -1012,6 +1049,29 @@ clear_qf_name:
1644     case Opt_nobh:
1645     set_opt(sbi->s_mount_opt, NOBH);
1646     break;
1647     + case Opt_fcache_dev: {
1648     + int maj, min;
1649     + char *p, *pm;
1650     +
1651     + if (!fcache_devnum)
1652     + break;
1653     + p = match_strdup(&args[0]);
1654     + if (!p)
1655     + return 0;
1656     + maj = simple_strtol(p, &pm, 10);
1657     + min = simple_strtol(pm + 1, NULL, 10);
1658     + *fcache_devnum = maj << MINORBITS | min;
1659     + kfree(p);
1660     + break;
1661     + }
1662     + case Opt_fcache_prime:
1663     + if (match_int(&args[0], &option))
1664     + return 0;
1665     + if (option)
1666     + set_opt(sbi->s_mount_opt, FCACHEPRIME);
1667     + else
1668     + clear_opt(sbi->s_mount_opt, FCACHEPRIME);
1669     + break;
1670     default:
1671     printk (KERN_ERR
1672     "EXT3-fs: Unrecognized mount option \"%s\" "
1673     @@ -1346,6 +1406,7 @@ static int ext3_fill_super (struct super
1674     unsigned long offset = 0;
1675     unsigned long journal_inum = 0;
1676     unsigned long journal_devnum = 0;
1677     + unsigned long fcache_devnum = 0;
1678     unsigned long def_mount_opts;
1679     struct inode *root;
1680     int blocksize;
1681     @@ -1353,6 +1414,7 @@ static int ext3_fill_super (struct super
1682     int db_count;
1683     int i;
1684     int needs_recovery;
1685     + int fcache = 0;
1686     __le32 features;
1687    
1688     sbi = kmalloc(sizeof(*sbi), GFP_KERNEL);
1689     @@ -1427,7 +1489,7 @@ static int ext3_fill_super (struct super
1690     set_opt(sbi->s_mount_opt, RESERVATION);
1691    
1692     if (!parse_options ((char *) data, sb, &journal_inum, &journal_devnum,
1693     - NULL, 0))
1694     + &fcache_devnum, NULL, 0))
1695     goto failed_mount;
1696    
1697     sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
1698     @@ -1651,6 +1713,9 @@ static int ext3_fill_super (struct super
1699     goto failed_mount2;
1700     }
1701    
1702     + if (fcache_devnum)
1703     + fcache = ext3_open_fcache(sb, fcache_devnum);
1704     +
1705     /* We have now updated the journal if required, so we can
1706     * validate the data journaling mode. */
1707     switch (test_opt(sb, DATA_FLAGS)) {
1708     @@ -1740,6 +1805,8 @@ cantfind_ext3:
1709     goto failed_mount;
1710    
1711     failed_mount3:
1712     + if (!fcache)
1713     + ext3_close_fcache(sb);
1714     journal_destroy(sbi->s_journal);
1715     failed_mount2:
1716     for (i = 0; i < db_count; i++)
1717     @@ -2205,6 +2272,7 @@ static int ext3_remount (struct super_bl
1718     struct ext3_sb_info *sbi = EXT3_SB(sb);
1719     unsigned long n_blocks_count = 0;
1720     unsigned long old_sb_flags;
1721     + unsigned long fcache_devnum = 0;
1722     struct ext3_mount_options old_opts;
1723     int err;
1724     #ifdef CONFIG_QUOTA
1725     @@ -2226,7 +2294,7 @@ static int ext3_remount (struct super_bl
1726     /*
1727     * Allow the "check" option to be passed as a remount option.
1728     */
1729     - if (!parse_options(data, sb, NULL, NULL, &n_blocks_count, 1)) {
1730     + if (!parse_options(data, sb, NULL, NULL, &fcache_devnum, &n_blocks_count, 1)) {
1731     err = -EINVAL;
1732     goto restore_opts;
1733     }
1734     @@ -2241,6 +2309,11 @@ static int ext3_remount (struct super_bl
1735    
1736     ext3_init_journal_params(sb, sbi->s_journal);
1737    
1738     + if (fcache_devnum) {
1739     + ext3_close_fcache(sb);
1740     + ext3_open_fcache(sb, fcache_devnum);
1741     + }
1742     +
1743     if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY) ||
1744     n_blocks_count > le32_to_cpu(es->s_blocks_count)) {
1745     if (sbi->s_mount_opt & EXT3_MOUNT_ABORT) {
1746     Index: linux-ck-dev/include/linux/bio.h
1747     ===================================================================
1748     --- linux-ck-dev.orig/include/linux/bio.h 2006-06-18 15:20:10.000000000 +1000
1749     +++ linux-ck-dev/include/linux/bio.h 2006-06-18 15:25:27.000000000 +1000
1750     @@ -124,6 +124,7 @@ struct bio {
1751     #define BIO_BOUNCED 5 /* bio is a bounce bio */
1752     #define BIO_USER_MAPPED 6 /* contains user pages */
1753     #define BIO_EOPNOTSUPP 7 /* not supported */
1754     +#define BIO_NOMERGE 8 /* bio not mergeable */
1755     #define bio_flagged(bio, flag) ((bio)->bi_flags & (1 << (flag)))
1756    
1757     /*
1758     @@ -179,6 +180,14 @@ struct bio {
1759     #define bio_failfast(bio) ((bio)->bi_rw & (1 << BIO_RW_FAILFAST))
1760     #define bio_rw_ahead(bio) ((bio)->bi_rw & (1 << BIO_RW_AHEAD))
1761    
1762     +static inline int bio_mergeable(struct bio *bio)
1763     +{
1764     + if (!bio_barrier(bio) && !bio->bi_idx && !bio_flagged(bio, BIO_NOMERGE))
1765     + return 1;
1766     +
1767     + return 0;
1768     +}
1769     +
1770     /*
1771     * will die
1772     */
1773     Index: linux-ck-dev/include/linux/ext3_fs.h
1774     ===================================================================
1775     --- linux-ck-dev.orig/include/linux/ext3_fs.h 2006-06-18 15:20:10.000000000 +1000
1776     +++ linux-ck-dev/include/linux/ext3_fs.h 2006-06-18 15:25:27.000000000 +1000
1777     @@ -376,6 +376,8 @@ struct ext3_inode {
1778     #define EXT3_MOUNT_QUOTA 0x80000 /* Some quota option set */
1779     #define EXT3_MOUNT_USRQUOTA 0x100000 /* "old" user quota */
1780     #define EXT3_MOUNT_GRPQUOTA 0x200000 /* "old" group quota */
1781     +#define EXT3_MOUNT_FCACHE 0x400000 /* using fcache */
1782     +#define EXT3_MOUNT_FCACHEPRIME 0x800000 /* priming fcache */
1783    
1784     /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */
1785     #ifndef _LINUX_EXT2_FS_H
1786     @@ -847,6 +849,18 @@ extern struct inode_operations ext3_spec
1787     extern struct inode_operations ext3_symlink_inode_operations;
1788     extern struct inode_operations ext3_fast_symlink_inode_operations;
1789    
1790     +#ifndef CONFIG_BLK_FCACHE
1791     +static inline int fcache_dev_open(struct block_device *bdev,
1792     + unsigned long cache_dev, int priming, int serial)
1793     +{
1794     + return -ENODEV;
1795     +}
1796     +
1797     +static inline void fcache_dev_close(struct block_device *bdev,
1798     + int serial)
1799     +{
1800     +}
1801     +#endif /* CONFIG_BLK_FCACHE */
1802    
1803     #endif /* __KERNEL__ */
1804
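
A closing note on the read path: fcache_read_request() serves a read from the cache only
when a single stored extent completely covers the request; partial overlaps fall back to
the real device and count as misses. The sketch below restates that coverage test in
user-space C with made-up numbers (sectors are 512 bytes, matching the >> 9 shifts used
throughout the patch).

/*
 * Hit test as in fcache_read_request(): a request of req_bytes starting at
 * req_sector is a hit only if one extent [fs_sector, fs_sector + fs_size)
 * fully contains it. The example extent and requests are invented.
 */
#include <stdio.h>
#include <stdint.h>

struct extent {
	uint64_t fs_sector;	/* start on the real device, 512-byte sectors */
	uint32_t fs_size;	/* length in bytes */
};

static int covers(const struct extent *fe, uint64_t req_sector, uint32_t req_bytes)
{
	uint64_t end_fe = fe->fs_sector + (fe->fs_size >> 9);
	uint64_t end_bi = req_sector + (req_bytes >> 9);

	return req_sector >= fe->fs_sector && end_bi <= end_fe;
}

int main(void)
{
	struct extent fe = { .fs_sector = 1000, .fs_size = 64 * 1024 };

	printf("8KiB read at sector 1008: %s\n", covers(&fe, 1008, 8192) ? "hit" : "miss");
	printf("8KiB read at sector 1120: %s\n", covers(&fe, 1120, 8192) ? "hit" : "miss");
	return 0;
}
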