Magellan Linux

Annotation of /trunk/kernel26-magellan/patches-2.6.16-r10/0029-2.6.16-adaptive-readahead-11.patch



Revision 70
Thu May 11 19:09:22 2006 UTC by niro
File size: 82008 byte(s)
import

1 niro 70 ---
2     Documentation/sysctl/vm.txt | 36 +
3     drivers/block/loop.c | 6
4     fs/mpage.c | 4
5     fs/nfsd/vfs.c | 6
6     include/linux/fs.h | 41 -
7     include/linux/mm.h | 31
8     include/linux/page-flags.h | 5
9     include/linux/radix-tree.h | 82 ++
10     include/linux/sysctl.h | 2
11     include/linux/writeback.h | 6
12     kernel/sysctl.c | 28
13     lib/radix-tree.c | 208 +++++-
14     mm/Kconfig | 55 +
15     mm/filemap.c | 86 ++
16     mm/memory.c | 1
17     mm/page-writeback.c | 2
18     mm/page_alloc.c | 2
19     mm/readahead.c | 1519 +++++++++++++++++++++++++++++++++++++++++++-
20     mm/swap.c | 2
21     mm/vmscan.c | 3
22     20 files changed, 2062 insertions(+), 63 deletions(-)
23    
24     Index: linux-2.6.16-ck1/Documentation/sysctl/vm.txt
25     ===================================================================
26     --- linux-2.6.16-ck1.orig/Documentation/sysctl/vm.txt 2006-03-20 20:47:01.000000000 +1100
27     +++ linux-2.6.16-ck1/Documentation/sysctl/vm.txt 2006-03-20 20:47:04.000000000 +1100
28     @@ -30,6 +30,8 @@ Currently, these files are in /proc/sys/
29     - zone_reclaim_mode
30     - zone_reclaim_interval
31     - swap_prefetch
32     +- readahead_ratio
33     +- readahead_hit_rate
34    
35     ==============================================================
36    
37     @@ -204,3 +206,37 @@ swap_prefetch unset and then it is enabl
38     prefetched.
39    
40     The default value is 1.
41     +
42     +==============================================================
43     +
44     +readahead_ratio
45     +
46     +This limits readahead size to a percentage of the thrashing-threshold,
47     +which is dynamically estimated from the _past_ read speed and
48     +system load and used to deduce the _future_ readahead request size.
49     +
50     +Set it to a smaller value if you do not have enough memory for all the
51     +concurrent readers, or if the I/O load fluctuates a lot. But if there is
52     +plenty of memory (>2MB per reader), enlarging it may help speed up reads.
53     +
54     +readahead_ratio also selects the readahead logic:
55     +0: disable readahead totally
56     +1-9: select the stock readahead logic
57     +10-inf: select the adaptive readahead logic
58     +
59     +The default value is 50; reasonable values would be 50-100.
60     +
61     +==============================================================
62     +
63     +readahead_hit_rate
64     +
65     +This is the maximum allowed value of (readahead-pages : accessed-pages).
66     +It is useful only when (readahead_ratio >= 10). If the previous readahead
67     +request had a poor hit rate, the kernel will be reluctant to do the next
68     +readahead.
69     +
70     +A larger value helps catch more sparse access patterns. Be aware that
71     +readahead of sparse patterns sacrifices memory for speed.
72     +
73     +The default value is 2.
74     +It is recommended to keep the value below (max-readahead-pages / 8).
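
For reference, once the patch is applied these two tunables appear with the
other files under /proc/sys/vm, so they can be adjusted at run time like any
other VM sysctl (example values only; the defaults documented above are 50
and 2):

    echo 100 > /proc/sys/vm/readahead_ratio     # >= 10 selects the adaptive logic
    echo 4   > /proc/sys/vm/readahead_hit_rate  # tolerate sparser access patterns
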
75     Index: linux-2.6.16-ck1/drivers/block/loop.c
76     ===================================================================
77     --- linux-2.6.16-ck1.orig/drivers/block/loop.c 2006-03-20 20:46:23.000000000 +1100
78     +++ linux-2.6.16-ck1/drivers/block/loop.c 2006-03-20 20:47:04.000000000 +1100
79     @@ -779,6 +779,12 @@ static int loop_set_fd(struct loop_devic
80     mapping = file->f_mapping;
81     inode = mapping->host;
82    
83     + /*
84     + * The upper layer should already do proper look-ahead,
85     + * one more look-ahead here only ruins the cache hit rate.
86     + */
87     + file->f_ra.flags |= RA_FLAG_NO_LOOKAHEAD;
88     +
89     if (!(file->f_mode & FMODE_WRITE))
90     lo_flags |= LO_FLAGS_READ_ONLY;
91    
92     Index: linux-2.6.16-ck1/fs/mpage.c
93     ===================================================================
94     --- linux-2.6.16-ck1.orig/fs/mpage.c 2006-03-20 20:46:23.000000000 +1100
95     +++ linux-2.6.16-ck1/fs/mpage.c 2006-03-20 20:47:04.000000000 +1100
96     @@ -343,8 +343,10 @@ mpage_readpages(struct address_space *ma
97     bio = do_mpage_readpage(bio, page,
98     nr_pages - page_idx,
99     &last_block_in_bio, get_block);
100     - if (!pagevec_add(&lru_pvec, page))
101     + if (!pagevec_add(&lru_pvec, page)) {
102     + cond_resched();
103     __pagevec_lru_add(&lru_pvec);
104     + }
105     } else {
106     page_cache_release(page);
107     }
108     Index: linux-2.6.16-ck1/fs/nfsd/vfs.c
109     ===================================================================
110     --- linux-2.6.16-ck1.orig/fs/nfsd/vfs.c 2006-03-20 20:46:23.000000000 +1100
111     +++ linux-2.6.16-ck1/fs/nfsd/vfs.c 2006-03-20 20:47:04.000000000 +1100
112     @@ -833,10 +833,14 @@ nfsd_vfs_read(struct svc_rqst *rqstp, st
113     #endif
114    
115     /* Get readahead parameters */
116     - ra = nfsd_get_raparms(inode->i_sb->s_dev, inode->i_ino);
117     + if (prefer_adaptive_readahead())
118     + ra = NULL;
119     + else
120     + ra = nfsd_get_raparms(inode->i_sb->s_dev, inode->i_ino);
121    
122     if (ra && ra->p_set)
123     file->f_ra = ra->p_ra;
124     + file->f_ra.flags |= RA_FLAG_NFSD;
125    
126     if (file->f_op->sendfile) {
127     svc_pushback_unused_pages(rqstp);
128     Index: linux-2.6.16-ck1/include/linux/fs.h
129     ===================================================================
130     --- linux-2.6.16-ck1.orig/include/linux/fs.h 2006-03-20 20:46:23.000000000 +1100
131     +++ linux-2.6.16-ck1/include/linux/fs.h 2006-03-20 20:47:04.000000000 +1100
132     @@ -600,19 +600,40 @@ struct fown_struct {
133     * Track a single file's readahead state
134     */
135     struct file_ra_state {
136     - unsigned long start; /* Current window */
137     - unsigned long size;
138     - unsigned long flags; /* ra flags RA_FLAG_xxx*/
139     - unsigned long cache_hit; /* cache hit count*/
140     - unsigned long prev_page; /* Cache last read() position */
141     - unsigned long ahead_start; /* Ahead window */
142     - unsigned long ahead_size;
143     - unsigned long ra_pages; /* Maximum readahead window */
144     - unsigned long mmap_hit; /* Cache hit stat for mmap accesses */
145     - unsigned long mmap_miss; /* Cache miss stat for mmap accesses */
146     + union {
147     + struct { /* conventional read-ahead */
148     + unsigned long start; /* Current window */
149     + unsigned long size;
150     + unsigned long ahead_start; /* Ahead window */
151     + unsigned long ahead_size;
152     + unsigned long cache_hit; /* cache hit count */
153     + };
154     +#ifdef CONFIG_ADAPTIVE_READAHEAD
155     + struct { /* adaptive read-ahead */
156     + pgoff_t la_index;
157     + pgoff_t ra_index;
158     + pgoff_t lookahead_index;
159     + pgoff_t readahead_index;
160     + unsigned long age;
161     + uint64_t cache_hits;
162     + };
163     +#endif
164     + };
165     +
166     + /* mmap read-around */
167     + unsigned long mmap_hit; /* Cache hit stat for mmap accesses */
168     + unsigned long mmap_miss; /* Cache miss stat for mmap accesses */
169     +
170     + /* common ones */
171     + unsigned long flags; /* ra flags RA_FLAG_xxx*/
172     + unsigned long prev_page; /* Cache last read() position */
173     + unsigned long ra_pages; /* Maximum readahead window */
174     };
175     #define RA_FLAG_MISS 0x01 /* a cache miss occurred against this file */
176     #define RA_FLAG_INCACHE 0x02 /* file is already in cache */
177     +#define RA_FLAG_MMAP (1UL<<31) /* mmaped page access */
178     +#define RA_FLAG_NO_LOOKAHEAD (1UL<<30) /* disable look-ahead */
179     +#define RA_FLAG_NFSD (1UL<<29) /* request from nfsd */
180    
181     struct file {
182     /*
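
One detail worth noting about the layout above (an illustrative sketch, not
part of the patch): the new RA_FLAG_* bits deliberately sit at the top of
ra->flags, because the adaptive logic in the mm/readahead.c hunk below (see
ra_account() and RA_CLASS_SHIFT/RA_CLASS_MASK) appears to keep the readahead
class of the current and the previous request in the low bits of the same
word, so the two uses do not collide. The helper names below are hypothetical:

/* Sketch only: how class bits and flag bits share file_ra_state.flags.
 * RA_CLASS_SHIFT/RA_CLASS_MASK come from the mm/readahead.c hunk below. */
static inline unsigned long ra_class_current(struct file_ra_state *ra)
{
	return ra->flags & RA_CLASS_MASK;			/* bits 0-3 */
}

static inline unsigned long ra_class_previous(struct file_ra_state *ra)
{
	return (ra->flags >> RA_CLASS_SHIFT) & RA_CLASS_MASK;	/* bits 4-7 */
}
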
183     Index: linux-2.6.16-ck1/include/linux/mm.h
184     ===================================================================
185     --- linux-2.6.16-ck1.orig/include/linux/mm.h 2006-03-20 20:46:23.000000000 +1100
186     +++ linux-2.6.16-ck1/include/linux/mm.h 2006-03-20 20:47:04.000000000 +1100
187     @@ -954,7 +954,11 @@ extern int filemap_populate(struct vm_ar
188     int write_one_page(struct page *page, int wait);
189    
190     /* readahead.c */
191     +#ifdef CONFIG_ADAPTIVE_READAHEAD
192     +#define VM_MAX_READAHEAD 1024 /* kbytes */
193     +#else
194     #define VM_MAX_READAHEAD 128 /* kbytes */
195     +#endif
196     #define VM_MIN_READAHEAD 16 /* kbytes (includes current page) */
197     #define VM_MAX_CACHE_HIT 256 /* max pages in a row in cache before
198     * turning readahead off */
199     @@ -971,6 +975,33 @@ unsigned long page_cache_readahead(struc
200     void handle_ra_miss(struct address_space *mapping,
201     struct file_ra_state *ra, pgoff_t offset);
202     unsigned long max_sane_readahead(unsigned long nr);
203     +unsigned long
204     +page_cache_readahead_adaptive(struct address_space *mapping,
205     + struct file_ra_state *ra, struct file *filp,
206     + struct page *prev_page, struct page *page,
207     + pgoff_t first_index, pgoff_t index, pgoff_t last_index);
208     +
209     +#ifdef CONFIG_ADAPTIVE_READAHEAD
210     +void fastcall readahead_cache_hit(struct file_ra_state *ra, struct page *page);
211     +extern int readahead_ratio;
212     +#else
213     +#define readahead_cache_hit(ra, page) do { } while (0)
214     +#define readahead_ratio 1
215     +#endif /* CONFIG_ADAPTIVE_READAHEAD */
216     +
217     +static inline int prefer_adaptive_readahead(void)
218     +{
219     + return readahead_ratio >= 10;
220     +}
221     +
222     +DECLARE_PER_CPU(unsigned long, readahead_aging);
223     +static inline void inc_readahead_aging(void)
224     +{
225     + if (prefer_adaptive_readahead()) {
226     + per_cpu(readahead_aging, get_cpu())++;
227     + put_cpu();
228     + }
229     +}
230    
231     /* Do stack extension */
232     extern int expand_stack(struct vm_area_struct *vma, unsigned long address);
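
To make the split above concrete, here is a condensed, hypothetical sketch
(not part of the patch) of how a read path chooses between the two engines.
It is a simplified mirror of what the mm/filemap.c hunks further down
actually do, using only the declarations above plus PageReadahead() from the
page-flags.h hunk below; the function name is invented for illustration:

/*
 * Sketch only: readahead_ratio selects the engine -- 0 disables readahead,
 * 1-9 keeps the stock logic, >= 10 enables the adaptive logic.
 */
static struct page *readahead_dispatch_sketch(struct address_space *mapping,
		struct file_ra_state *ra, struct file *filp,
		struct page *prev_page, pgoff_t first_index,
		pgoff_t index, pgoff_t last_index)
{
	struct page *page;

	if (!prefer_adaptive_readahead())	/* stock path, as before */
		page_cache_readahead(mapping, ra, filp, index,
				     last_index - index);

	page = find_get_page(mapping, index);

	/* adaptive path: invoked on a cache miss, or when the look-ahead
	 * mark (PG_readahead) set by a previous chunk is hit */
	if (prefer_adaptive_readahead() &&
	    (page == NULL || PageReadahead(page)))
		page_cache_readahead_adaptive(mapping, ra, filp,
					      prev_page, page,
					      first_index, index, last_index);
	return page;
}
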
233     Index: linux-2.6.16-ck1/include/linux/page-flags.h
234     ===================================================================
235     --- linux-2.6.16-ck1.orig/include/linux/page-flags.h 2006-03-20 20:46:23.000000000 +1100
236     +++ linux-2.6.16-ck1/include/linux/page-flags.h 2006-03-20 20:47:04.000000000 +1100
237     @@ -75,6 +75,7 @@
238     #define PG_reclaim 17 /* To be reclaimed asap */
239     #define PG_nosave_free 18 /* Free, should not be written */
240     #define PG_uncached 19 /* Page has been mapped as uncached */
241     +#define PG_readahead 20 /* Reminder to do readahead */
242    
243     /*
244     * Global page accounting. One instance per CPU. Only unsigned longs are
245     @@ -344,6 +345,10 @@ extern void __mod_page_state_offset(unsi
246     #define SetPageUncached(page) set_bit(PG_uncached, &(page)->flags)
247     #define ClearPageUncached(page) clear_bit(PG_uncached, &(page)->flags)
248    
249     +#define PageReadahead(page) test_bit(PG_readahead, &(page)->flags)
250     +#define __SetPageReadahead(page) __set_bit(PG_readahead, &(page)->flags)
251     +#define TestClearPageReadahead(page) test_and_clear_bit(PG_readahead, &(page)->flags)
252     +
253     struct page; /* forward declaration */
254    
255     int test_clear_page_dirty(struct page *page);
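
PG_readahead is the "look-ahead mark" of the adaptive logic. A condensed
sketch of how the two sides of the patch use it (producer in mm/readahead.c,
consumer in mm/filemap.c, both shown in full further below):

/* Producer side (see __do_page_cache_readahead() below): tag exactly one
 * page, lookahead_size pages before the end of the submitted chunk. */
if (page_idx == nr_to_read - lookahead_size)
	__SetPageReadahead(page);

/* Consumer side (see do_generic_mapping_read() below): hitting the marked
 * page triggers the next readahead chunk before the reader runs dry. */
if (PageReadahead(page))
	page_cache_readahead_adaptive(mapping, &ra, filp, prev_page, page,
				      first_index, index, last_index);
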
256     Index: linux-2.6.16-ck1/include/linux/radix-tree.h
257     ===================================================================
258     --- linux-2.6.16-ck1.orig/include/linux/radix-tree.h 2006-03-20 20:46:23.000000000 +1100
259     +++ linux-2.6.16-ck1/include/linux/radix-tree.h 2006-03-20 20:47:04.000000000 +1100
260     @@ -23,12 +23,24 @@
261     #include <linux/preempt.h>
262     #include <linux/types.h>
263    
264     +#define RADIX_TREE_MAP_SHIFT 6
265     +#define RADIX_TREE_MAP_SIZE (1UL << RADIX_TREE_MAP_SHIFT)
266     +#define RADIX_TREE_MAP_MASK (RADIX_TREE_MAP_SIZE-1)
267     +
268     struct radix_tree_root {
269     unsigned int height;
270     gfp_t gfp_mask;
271     struct radix_tree_node *rnode;
272     };
273    
274     +/*
275     + * Lookaside cache to support access patterns with strong locality.
276     + */
277     +struct radix_tree_cache {
278     + unsigned long first_index;
279     + struct radix_tree_node *tree_node;
280     +};
281     +
282     #define RADIX_TREE_INIT(mask) { \
283     .height = 0, \
284     .gfp_mask = (mask), \
285     @@ -46,9 +58,18 @@ do { \
286     } while (0)
287    
288     int radix_tree_insert(struct radix_tree_root *, unsigned long, void *);
289     -void *radix_tree_lookup(struct radix_tree_root *, unsigned long);
290     -void **radix_tree_lookup_slot(struct radix_tree_root *, unsigned long);
291     +void *radix_tree_lookup_node(struct radix_tree_root *, unsigned long,
292     + unsigned int);
293     +void **radix_tree_lookup_slot(struct radix_tree_root *root, unsigned long);
294     void *radix_tree_delete(struct radix_tree_root *, unsigned long);
295     +unsigned int radix_tree_cache_count(struct radix_tree_cache *cache);
296     +void *radix_tree_cache_lookup_node(struct radix_tree_root *root,
297     + struct radix_tree_cache *cache,
298     + unsigned long index, unsigned int level);
299     +unsigned long radix_tree_scan_hole_backward(struct radix_tree_root *root,
300     + unsigned long index, unsigned long max_scan);
301     +unsigned long radix_tree_scan_hole(struct radix_tree_root *root,
302     + unsigned long index, unsigned long max_scan);
303     unsigned int
304     radix_tree_gang_lookup(struct radix_tree_root *root, void **results,
305     unsigned long first_index, unsigned int max_items);
306     @@ -70,4 +91,61 @@ static inline void radix_tree_preload_en
307     preempt_enable();
308     }
309    
310     +/**
311     + * radix_tree_lookup - perform lookup operation on a radix tree
312     + * @root: radix tree root
313     + * @index: index key
314     + *
315     + * Lookup the item at the position @index in the radix tree @root.
316     + */
317     +static inline void *radix_tree_lookup(struct radix_tree_root *root,
318     + unsigned long index)
319     +{
320     + return radix_tree_lookup_node(root, index, 0);
321     +}
322     +
323     +/**
324     + * radix_tree_cache_init - init a look-aside cache
325     + * @cache: look-aside cache
326     + *
327     + * Init the radix tree look-aside cache @cache.
328     + */
329     +static inline void radix_tree_cache_init(struct radix_tree_cache *cache)
330     +{
331     + cache->first_index = RADIX_TREE_MAP_MASK;
332     + cache->tree_node = NULL;
333     +}
334     +
335     +/**
336     + * radix_tree_cache_lookup - cached lookup on a radix tree
337     + * @root: radix tree root
338     + * @cache: look-aside cache
339     + * @index: index key
340     + *
341     + * Lookup the item at the position @index in the radix tree @root,
342     + * and make use of @cache to speed up the lookup process.
343     + */
344     +static inline void *radix_tree_cache_lookup(struct radix_tree_root *root,
345     + struct radix_tree_cache *cache,
346     + unsigned long index)
347     +{
348     + return radix_tree_cache_lookup_node(root, cache, index, 0);
349     +}
350     +
351     +static inline unsigned int radix_tree_cache_size(struct radix_tree_cache *cache)
352     +{
353     + return RADIX_TREE_MAP_SIZE;
354     +}
355     +
356     +static inline int radix_tree_cache_full(struct radix_tree_cache *cache)
357     +{
358     + return radix_tree_cache_count(cache) == radix_tree_cache_size(cache);
359     +}
360     +
361     +static inline unsigned long
362     +radix_tree_cache_first_index(struct radix_tree_cache *cache)
363     +{
364     + return cache->first_index;
365     +}
366     +
367     #endif /* _LINUX_RADIX_TREE_H */
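
As a usage illustration (a hypothetical sketch, not part of the patch), a
scan over a file's page tree can keep one radix_tree_cache across
consecutive indices, provided it stays under mapping->tree_lock; the
lib/radix-tree.c hunk below notes that the cache becomes invalid once the
lock is dropped:

/* Sketch only: count how many of the nr pages starting at start are
 * already present in the page cache, reusing the look-aside cache so
 * that consecutive lookups need not walk the tree from the root. */
static unsigned long count_cached_pages(struct address_space *mapping,
					pgoff_t start, unsigned long nr)
{
	struct radix_tree_cache cache;
	unsigned long i, hits = 0;

	radix_tree_cache_init(&cache);
	read_lock_irq(&mapping->tree_lock);	/* cache only valid under the lock */
	for (i = 0; i < nr; i++)
		if (radix_tree_cache_lookup(&mapping->page_tree,
					    &cache, start + i))
			hits++;
	read_unlock_irq(&mapping->tree_lock);

	return hits;
}
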
368     Index: linux-2.6.16-ck1/include/linux/sysctl.h
369     ===================================================================
370     --- linux-2.6.16-ck1.orig/include/linux/sysctl.h 2006-03-20 20:46:56.000000000 +1100
371     +++ linux-2.6.16-ck1/include/linux/sysctl.h 2006-03-20 20:47:04.000000000 +1100
372     @@ -191,6 +191,8 @@ enum
373     VM_ZONE_RECLAIM_INTERVAL=32, /* time period to wait after reclaim failure */
374     VM_SWAP_PREFETCH=33, /* swap prefetch */
375     VM_HARDMAPLIMIT=34, /* Make mapped a hard limit */
376     + VM_READAHEAD_RATIO=35, /* percent of read-ahead size to thrashing-threshold */
377     + VM_READAHEAD_HIT_RATE=36, /* one accessed page legitimizes so many read-ahead pages */
378     };
379    
380    
381     Index: linux-2.6.16-ck1/include/linux/writeback.h
382     ===================================================================
383     --- linux-2.6.16-ck1.orig/include/linux/writeback.h 2006-03-20 20:46:23.000000000 +1100
384     +++ linux-2.6.16-ck1/include/linux/writeback.h 2006-03-20 20:47:04.000000000 +1100
385     @@ -85,6 +85,12 @@ void laptop_io_completion(void);
386     void laptop_sync_completion(void);
387     void throttle_vm_writeout(void);
388    
389     +extern struct timer_list laptop_mode_wb_timer;
390     +static inline int laptop_spinned_down(void)
391     +{
392     + return !timer_pending(&laptop_mode_wb_timer);
393     +}
394     +
395     /* These are exported to sysctl. */
396     extern int dirty_background_ratio;
397     extern int vm_dirty_ratio;
398     Index: linux-2.6.16-ck1/kernel/sysctl.c
399     ===================================================================
400     --- linux-2.6.16-ck1.orig/kernel/sysctl.c 2006-03-20 20:46:56.000000000 +1100
401     +++ linux-2.6.16-ck1/kernel/sysctl.c 2006-03-20 20:47:04.000000000 +1100
402     @@ -74,6 +74,12 @@ extern int pid_max_min, pid_max_max;
403     extern int sysctl_drop_caches;
404     extern int percpu_pagelist_fraction;
405    
406     +#if defined(CONFIG_ADAPTIVE_READAHEAD)
407     +extern int readahead_ratio;
408     +extern int readahead_hit_rate;
409     +static int one = 1;
410     +#endif
411     +
412     #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86)
413     int unknown_nmi_panic;
414     extern int proc_unknown_nmi_panic(ctl_table *, int, struct file *,
415     @@ -961,6 +967,28 @@ static ctl_table vm_table[] = {
416     .proc_handler = &proc_dointvec,
417     },
418     #endif
419     +#ifdef CONFIG_ADAPTIVE_READAHEAD
420     + {
421     + .ctl_name = VM_READAHEAD_RATIO,
422     + .procname = "readahead_ratio",
423     + .data = &readahead_ratio,
424     + .maxlen = sizeof(readahead_ratio),
425     + .mode = 0644,
426     + .proc_handler = &proc_dointvec,
427     + .strategy = &sysctl_intvec,
428     + .extra1 = &zero,
429     + },
430     + {
431     + .ctl_name = VM_READAHEAD_HIT_RATE,
432     + .procname = "readahead_hit_rate",
433     + .data = &readahead_hit_rate,
434     + .maxlen = sizeof(readahead_hit_rate),
435     + .mode = 0644,
436     + .proc_handler = &proc_dointvec,
437     + .strategy = &sysctl_intvec,
438     + .extra1 = &one,
439     + },
440     +#endif
441     { .ctl_name = 0 }
442     };
443    
444     Index: linux-2.6.16-ck1/lib/radix-tree.c
445     ===================================================================
446     --- linux-2.6.16-ck1.orig/lib/radix-tree.c 2006-03-20 20:46:23.000000000 +1100
447     +++ linux-2.6.16-ck1/lib/radix-tree.c 2006-03-20 20:47:04.000000000 +1100
448     @@ -32,16 +32,7 @@
449     #include <linux/bitops.h>
450    
451    
452     -#ifdef __KERNEL__
453     -#define RADIX_TREE_MAP_SHIFT 6
454     -#else
455     -#define RADIX_TREE_MAP_SHIFT 3 /* For more stressful testing */
456     -#endif
457     #define RADIX_TREE_TAGS 2
458     -
459     -#define RADIX_TREE_MAP_SIZE (1UL << RADIX_TREE_MAP_SHIFT)
460     -#define RADIX_TREE_MAP_MASK (RADIX_TREE_MAP_SIZE-1)
461     -
462     #define RADIX_TREE_TAG_LONGS \
463     ((RADIX_TREE_MAP_SIZE + BITS_PER_LONG - 1) / BITS_PER_LONG)
464    
465     @@ -286,32 +277,89 @@ int radix_tree_insert(struct radix_tree_
466     }
467     EXPORT_SYMBOL(radix_tree_insert);
468    
469     -static inline void **__lookup_slot(struct radix_tree_root *root,
470     - unsigned long index)
471     +/**
472     + * radix_tree_lookup_node - low level lookup routine
473     + * @root: radix tree root
474     + * @index: index key
475     + * @level: stop at that many levels from the tree leaf
476     + *
477     + * Lookup the item at the position @index in the radix tree @root.
478     + * The return value is:
479     + * @level == 0: page at @index;
480     + * @level == 1: the corresponding bottom level tree node;
481     + * @level < height: (@level-1)th parent node of the bottom node
482     + * that contains @index;
483     + * @level >= height: the root node.
484     + */
485     +void *radix_tree_lookup_node(struct radix_tree_root *root,
486     + unsigned long index, unsigned int level)
487     {
488     unsigned int height, shift;
489     - struct radix_tree_node **slot;
490     + struct radix_tree_node *slot;
491    
492     height = root->height;
493     if (index > radix_tree_maxindex(height))
494     return NULL;
495    
496     shift = (height-1) * RADIX_TREE_MAP_SHIFT;
497     - slot = &root->rnode;
498     + slot = root->rnode;
499    
500     - while (height > 0) {
501     - if (*slot == NULL)
502     + while (height > level) {
503     + if (slot == NULL)
504     return NULL;
505    
506     - slot = (struct radix_tree_node **)
507     - ((*slot)->slots +
508     - ((index >> shift) & RADIX_TREE_MAP_MASK));
509     + slot = slot->slots[(index >> shift) & RADIX_TREE_MAP_MASK];
510     shift -= RADIX_TREE_MAP_SHIFT;
511     height--;
512     }
513    
514     - return (void **)slot;
515     + return slot;
516     +}
517     +EXPORT_SYMBOL(radix_tree_lookup_node);
518     +
519     +/**
520     + * radix_tree_cache_lookup_node - cached lookup node
521     + * @root: radix tree root
522     + * @cache: look-aside cache
523     + * @index: index key
524     + *
525     + * Lookup the item at the position @index in the radix tree @root,
526     + * and return the node @level levels from the bottom in the search path.
527     + *
528     + * @cache stores the last accessed upper level tree node by this
529     + * function, and is always checked first before searching in the tree.
530     + * It can improve speed for access patterns with strong locality.
531     + *
532     + * NOTE:
533     + * - The cache becomes invalid on leaving the lock;
534     + * - Do not intermix calls with different @level.
535     + */
536     +void *radix_tree_cache_lookup_node(struct radix_tree_root *root,
537     + struct radix_tree_cache *cache,
538     + unsigned long index, unsigned int level)
539     +{
540     + struct radix_tree_node *node;
541     + unsigned long i;
542     + unsigned long mask;
543     +
544     + if (level >= root->height)
545     + return root->rnode;
546     +
547     + i = ((index >> (level * RADIX_TREE_MAP_SHIFT)) & RADIX_TREE_MAP_MASK);
548     + mask = ~((RADIX_TREE_MAP_SIZE << (level * RADIX_TREE_MAP_SHIFT)) - 1);
549     +
550     + if ((index & mask) == cache->first_index)
551     + return cache->tree_node->slots[i];
552     +
553     + node = radix_tree_lookup_node(root, index, level + 1);
554     + if (!node)
555     + return 0;
556     +
557     + cache->tree_node = node;
558     + cache->first_index = (index & mask);
559     + return node->slots[i];
560     }
561     +EXPORT_SYMBOL(radix_tree_cache_lookup_node);
562    
563     /**
564     * radix_tree_lookup_slot - lookup a slot in a radix tree
565     @@ -323,25 +371,131 @@ static inline void **__lookup_slot(struc
566     */
567     void **radix_tree_lookup_slot(struct radix_tree_root *root, unsigned long index)
568     {
569     - return __lookup_slot(root, index);
570     + struct radix_tree_node *node;
571     +
572     + node = radix_tree_lookup_node(root, index, 1);
573     + return node->slots + (index & RADIX_TREE_MAP_MASK);
574     }
575     EXPORT_SYMBOL(radix_tree_lookup_slot);
576    
577     /**
578     - * radix_tree_lookup - perform lookup operation on a radix tree
579     + * radix_tree_cache_count - items in the cached node
580     + * @cache: radix tree look-aside cache
581     + *
582     + * Query the number of items contained in the cached node.
583     + */
584     +unsigned int radix_tree_cache_count(struct radix_tree_cache *cache)
585     +{
586     + if (!(cache->first_index & RADIX_TREE_MAP_MASK))
587     + return cache->tree_node->count;
588     + else
589     + return 0;
590     +}
591     +EXPORT_SYMBOL(radix_tree_cache_count);
592     +
593     +/**
594     + * radix_tree_scan_hole_backward - scan backward for hole
595     * @root: radix tree root
596     * @index: index key
597     + * @max_scan: advice on max items to scan (it may scan a little more)
598     *
599     - * Lookup the item at the position @index in the radix tree @root.
600     + * Scan backward from @index for a hole/empty item, stop when
601     + * - hit hole
602     + * - @max_scan or more items scanned
603     + * - hit index 0
604     + *
605     + * Return the corresponding index.
606     */
607     -void *radix_tree_lookup(struct radix_tree_root *root, unsigned long index)
608     +unsigned long radix_tree_scan_hole_backward(struct radix_tree_root *root,
609     + unsigned long index, unsigned long max_scan)
610     {
611     - void **slot;
612     + struct radix_tree_cache cache;
613     + struct radix_tree_node *node;
614     + unsigned long origin;
615     + int i;
616     +
617     + origin = index;
618     + radix_tree_cache_init(&cache);
619     +
620     + while (origin - index < max_scan) {
621     + node = radix_tree_cache_lookup_node(root, &cache, index, 1);
622     + if (!node)
623     + break;
624     +
625     + if (node->count == RADIX_TREE_MAP_SIZE) {
626     + index = (index - RADIX_TREE_MAP_SIZE) |
627     + RADIX_TREE_MAP_MASK;
628     + goto check_underflow;
629     + }
630     +
631     + for (i = index & RADIX_TREE_MAP_MASK; i >= 0; i--, index--) {
632     + if (!node->slots[i])
633     + goto out;
634     + }
635     +
636     +check_underflow:
637     + if (unlikely(index == ULONG_MAX)) {
638     + index = 0;
639     + break;
640     + }
641     + }
642     +
643     +out:
644     + return index;
645     +}
646     +EXPORT_SYMBOL(radix_tree_scan_hole_backward);
647    
648     - slot = __lookup_slot(root, index);
649     - return slot != NULL ? *slot : NULL;
650     +/**
651     + * radix_tree_scan_hole - scan for hole
652     + * @root: radix tree root
653     + * @index: index key
654     + * @max_scan: advice on max items to scan (it may scan a little more)
655     + *
656     + * Scan forward from @index for a hole/empty item, stop when
657     + * - hit hole
658     + * - hit EOF
659     + * - hit index ULONG_MAX
660     + * - @max_scan or more items scanned
661     + *
662     + * Return the corresponding index.
663     + */
664     +unsigned long radix_tree_scan_hole(struct radix_tree_root *root,
665     + unsigned long index, unsigned long max_scan)
666     +{
667     + struct radix_tree_cache cache;
668     + struct radix_tree_node *node;
669     + unsigned long origin;
670     + int i;
671     +
672     + origin = index;
673     + radix_tree_cache_init(&cache);
674     +
675     + while (index - origin < max_scan) {
676     + node = radix_tree_cache_lookup_node(root, &cache, index, 1);
677     + if (!node)
678     + break;
679     +
680     + if (node->count == RADIX_TREE_MAP_SIZE) {
681     + index = (index | RADIX_TREE_MAP_MASK) + 1;
682     + goto check_overflow;
683     + }
684     +
685     + for (i = index & RADIX_TREE_MAP_MASK; i < RADIX_TREE_MAP_SIZE;
686     + i++, index++) {
687     + if (!node->slots[i])
688     + goto out;
689     + }
690     +
691     +check_overflow:
692     + if (unlikely(!index)) {
693     + index = ULONG_MAX;
694     + break;
695     + }
696     + }
697     +out:
698     + return index;
699     }
700     -EXPORT_SYMBOL(radix_tree_lookup);
701     +EXPORT_SYMBOL(radix_tree_scan_hole);
702    
703     /**
704     * radix_tree_tag_set - set a tag on a radix tree node
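
For illustration (a hypothetical sketch, not part of the patch),
radix_tree_scan_hole() gives callers a cheap way to ask how far a cached
run extends from a given index:

/* Sketch only: length of the contiguous cached run starting at index,
 * limited to roughly max_scan pages; 0 means index itself is a hole. */
static unsigned long cached_run_length(struct address_space *mapping,
				       pgoff_t index, unsigned long max_scan)
{
	unsigned long hole;

	read_lock_irq(&mapping->tree_lock);
	hole = radix_tree_scan_hole(&mapping->page_tree, index, max_scan);
	read_unlock_irq(&mapping->tree_lock);

	return hole - index;
}
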
705     Index: linux-2.6.16-ck1/mm/Kconfig
706     ===================================================================
707     --- linux-2.6.16-ck1.orig/mm/Kconfig 2006-03-20 20:46:23.000000000 +1100
708     +++ linux-2.6.16-ck1/mm/Kconfig 2006-03-20 20:47:04.000000000 +1100
709     @@ -139,3 +139,58 @@ config SPLIT_PTLOCK_CPUS
710     config MIGRATION
711     def_bool y if NUMA || SPARSEMEM || DISCONTIGMEM
712     depends on SWAP
713     +
714     +#
715     +# Adaptive file readahead
716     +#
717     +config ADAPTIVE_READAHEAD
718     + bool "Adaptive file readahead (EXPERIMENTAL)"
719     + default n
720     + depends on EXPERIMENTAL
721     + help
722     + Readahead is a technique employed by the kernel in an attempt
723     + to improve file reading performance. If the kernel has reason
724     + to believe that a particular file is being read sequentially,
725     + it will attempt to read blocks from the file into memory before
726     + the application requests them. When readahead works, it speeds
727     + up the system's throughput, since the reading application does
728     + not have to wait for its requests. When readahead fails, however,
729     + it generates useless I/O and occupies memory pages that are
730     + needed for some other purpose.
731     +
732     + Normally, the kernel uses a stock readahead logic that is well
733     + understood and well tuned. This option enables a much more complex
734     + and feature-rich one. It is more aggressive and memory-efficient in
735     + doing readahead, and supports some less common access patterns such
736     + as reading backward and reading sparsely. However, due to the great
737     + diversity of real-world applications, it might not fit everyone.
738     +
739     + Please refer to Documentation/sysctl/vm.txt for tunable parameters.
740     +
741     + Say Y here if you are building a kernel for file servers.
742     + Say N if you are unsure.
743     +
744     +config DEBUG_READAHEAD
745     + bool "Readahead debug and accounting"
746     + default n
747     + depends on ADAPTIVE_READAHEAD
748     + select DEBUG_FS
749     + help
750     + This option injects extra code to dump detailed debug traces and to
751     + do readahead event accounting.
752     +
753     + To actually get the data:
754     +
755     + mkdir /debug
756     + mount -t debugfs none /debug
757     +
758     + After that you can do the following:
759     +
760     + echo > /debug/readahead/events # reset the counters
761     + cat /debug/readahead/events # check the counters
762     +
763     + echo 1 > /debug/readahead/debug_level # show printk traces
764     + echo 2 > /debug/readahead/debug_level # show verbose printk traces
765     + echo 0 > /debug/readahead/debug_level # stop filling my kern.log
766     +
767     + Say N, unless you have readahead performance problems.
768     Index: linux-2.6.16-ck1/mm/filemap.c
769     ===================================================================
770     --- linux-2.6.16-ck1.orig/mm/filemap.c 2006-03-20 20:46:23.000000000 +1100
771     +++ linux-2.6.16-ck1/mm/filemap.c 2006-03-20 20:47:04.000000000 +1100
772     @@ -42,6 +42,12 @@ static ssize_t
773     generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
774     loff_t offset, unsigned long nr_segs);
775    
776     +#ifdef CONFIG_DEBUG_READAHEAD
777     +extern u32 readahead_debug_level;
778     +#else
779     +#define readahead_debug_level 0
780     +#endif /* CONFIG_DEBUG_READAHEAD */
781     +
782     /*
783     * Shared mappings implemented 30.11.1994. It's not fully working yet,
784     * though.
785     @@ -746,10 +752,12 @@ void do_generic_mapping_read(struct addr
786     unsigned long prev_index;
787     loff_t isize;
788     struct page *cached_page;
789     + struct page *prev_page;
790     int error;
791     struct file_ra_state ra = *_ra;
792    
793     cached_page = NULL;
794     + prev_page = NULL;
795     index = *ppos >> PAGE_CACHE_SHIFT;
796     next_index = index;
797     prev_index = ra.prev_page;
798     @@ -760,6 +768,10 @@ void do_generic_mapping_read(struct addr
799     if (!isize)
800     goto out;
801    
802     + if (readahead_debug_level >= 5)
803     + printk(KERN_DEBUG "read-file(ino=%lu, req=%lu+%lu)\n",
804     + inode->i_ino, index, last_index - index);
805     +
806     end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
807     for (;;) {
808     struct page *page;
809     @@ -778,16 +790,45 @@ void do_generic_mapping_read(struct addr
810     nr = nr - offset;
811    
812     cond_resched();
813     - if (index == next_index)
814     +
815     + if (!prefer_adaptive_readahead() && index == next_index)
816     next_index = page_cache_readahead(mapping, &ra, filp,
817     index, last_index - index);
818    
819     find_page:
820     page = find_get_page(mapping, index);
821     + if (prefer_adaptive_readahead()) {
822     + if (unlikely(page == NULL)) {
823     + ra.prev_page = prev_index;
824     + page_cache_readahead_adaptive(mapping, &ra,
825     + filp, prev_page, NULL,
826     + *ppos >> PAGE_CACHE_SHIFT,
827     + index, last_index);
828     + page = find_get_page(mapping, index);
829     + } else if (PageReadahead(page)) {
830     + ra.prev_page = prev_index;
831     + page_cache_readahead_adaptive(mapping, &ra,
832     + filp, prev_page, page,
833     + *ppos >> PAGE_CACHE_SHIFT,
834     + index, last_index);
835     + }
836     + }
837     if (unlikely(page == NULL)) {
838     - handle_ra_miss(mapping, &ra, index);
839     + if (!prefer_adaptive_readahead())
840     + handle_ra_miss(mapping, &ra, index);
841     goto no_cached_page;
842     }
843     +
844     + if (prev_page)
845     + page_cache_release(prev_page);
846     + prev_page = page;
847     +
848     + readahead_cache_hit(&ra, page);
849     + if (readahead_debug_level >= 7)
850     + printk(KERN_DEBUG "read-page(ino=%lu, idx=%lu, io=%s)\n",
851     + inode->i_ino, index,
852     + PageUptodate(page) ? "hit" : "miss");
853     +
854     if (!PageUptodate(page))
855     goto page_not_up_to_date;
856     page_ok:
857     @@ -822,7 +863,6 @@ page_ok:
858     index += offset >> PAGE_CACHE_SHIFT;
859     offset &= ~PAGE_CACHE_MASK;
860    
861     - page_cache_release(page);
862     if (ret == nr && desc->count)
863     continue;
864     goto out;
865     @@ -834,7 +874,6 @@ page_not_up_to_date:
866     /* Did it get unhashed before we got the lock? */
867     if (!page->mapping) {
868     unlock_page(page);
869     - page_cache_release(page);
870     continue;
871     }
872    
873     @@ -864,7 +903,6 @@ readpage:
874     * invalidate_inode_pages got it
875     */
876     unlock_page(page);
877     - page_cache_release(page);
878     goto find_page;
879     }
880     unlock_page(page);
881     @@ -885,7 +923,6 @@ readpage:
882     isize = i_size_read(inode);
883     end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
884     if (unlikely(!isize || index > end_index)) {
885     - page_cache_release(page);
886     goto out;
887     }
888    
889     @@ -894,7 +931,6 @@ readpage:
890     if (index == end_index) {
891     nr = ((isize - 1) & ~PAGE_CACHE_MASK) + 1;
892     if (nr <= offset) {
893     - page_cache_release(page);
894     goto out;
895     }
896     }
897     @@ -904,7 +940,6 @@ readpage:
898     readpage_error:
899     /* UHHUH! A synchronous read error occurred. Report it */
900     desc->error = error;
901     - page_cache_release(page);
902     goto out;
903    
904     no_cached_page:
905     @@ -929,15 +964,22 @@ no_cached_page:
906     }
907     page = cached_page;
908     cached_page = NULL;
909     + if (prev_page)
910     + page_cache_release(prev_page);
911     + prev_page = page;
912     goto readpage;
913     }
914    
915     out:
916     *_ra = ra;
917     + if (prefer_adaptive_readahead())
918     + _ra->prev_page = prev_index;
919    
920     *ppos = ((loff_t) index << PAGE_CACHE_SHIFT) + offset;
921     if (cached_page)
922     page_cache_release(cached_page);
923     + if (prev_page)
924     + page_cache_release(prev_page);
925     if (filp)
926     file_accessed(filp);
927     }
928     @@ -1216,6 +1258,7 @@ struct page *filemap_nopage(struct vm_ar
929     unsigned long size, pgoff;
930     int did_readaround = 0, majmin = VM_FAULT_MINOR;
931    
932     + ra->flags |= RA_FLAG_MMAP;
933     pgoff = ((address-area->vm_start) >> PAGE_CACHE_SHIFT) + area->vm_pgoff;
934    
935     retry_all:
936     @@ -1233,7 +1276,7 @@ retry_all:
937     *
938     * For sequential accesses, we use the generic readahead logic.
939     */
940     - if (VM_SequentialReadHint(area))
941     + if (!prefer_adaptive_readahead() && VM_SequentialReadHint(area))
942     page_cache_readahead(mapping, ra, file, pgoff, 1);
943    
944     /*
945     @@ -1241,11 +1284,24 @@ retry_all:
946     */
947     retry_find:
948     page = find_get_page(mapping, pgoff);
949     + if (prefer_adaptive_readahead() && VM_SequentialReadHint(area)) {
950     + if (!page) {
951     + page_cache_readahead_adaptive(mapping, ra,
952     + file, NULL, NULL,
953     + pgoff, pgoff, pgoff + 1);
954     + page = find_get_page(mapping, pgoff);
955     + } else if (PageReadahead(page)) {
956     + page_cache_readahead_adaptive(mapping, ra,
957     + file, NULL, page,
958     + pgoff, pgoff, pgoff + 1);
959     + }
960     + }
961     if (!page) {
962     unsigned long ra_pages;
963    
964     if (VM_SequentialReadHint(area)) {
965     - handle_ra_miss(mapping, ra, pgoff);
966     + if (!prefer_adaptive_readahead())
967     + handle_ra_miss(mapping, ra, pgoff);
968     goto no_cached_page;
969     }
970     ra->mmap_miss++;
971     @@ -1282,6 +1338,14 @@ retry_find:
972     if (!did_readaround)
973     ra->mmap_hit++;
974    
975     + readahead_cache_hit(ra, page);
976     + if (readahead_debug_level >= 6)
977     + printk(KERN_DEBUG "read-mmap(ino=%lu, idx=%lu, hint=%s, io=%s)\n",
978     + inode->i_ino, pgoff,
979     + VM_RandomReadHint(area) ? "random" :
980     + (VM_SequentialReadHint(area) ? "sequential" : "none"),
981     + PageUptodate(page) ? "hit" : "miss");
982     +
983     /*
984     * Ok, found a page in the page cache, now we need to check
985     * that it's up-to-date.
986     @@ -1296,6 +1360,8 @@ success:
987     mark_page_accessed(page);
988     if (type)
989     *type = majmin;
990     + if (prefer_adaptive_readahead())
991     + ra->prev_page = page->index;
992     return page;
993    
994     outside_data_content:
995     Index: linux-2.6.16-ck1/mm/memory.c
996     ===================================================================
997     --- linux-2.6.16-ck1.orig/mm/memory.c 2006-03-20 20:46:23.000000000 +1100
998     +++ linux-2.6.16-ck1/mm/memory.c 2006-03-20 20:47:04.000000000 +1100
999     @@ -1993,6 +1993,7 @@ static int do_anonymous_page(struct mm_s
1000     page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
1001     if (!pte_none(*page_table))
1002     goto release;
1003     + inc_readahead_aging();
1004     inc_mm_counter(mm, anon_rss);
1005     lru_cache_add_active(page);
1006     page_add_new_anon_rmap(page, vma, address);
1007     Index: linux-2.6.16-ck1/mm/page-writeback.c
1008     ===================================================================
1009     --- linux-2.6.16-ck1.orig/mm/page-writeback.c 2006-03-20 20:46:53.000000000 +1100
1010     +++ linux-2.6.16-ck1/mm/page-writeback.c 2006-03-20 20:47:04.000000000 +1100
1011     @@ -370,7 +370,7 @@ static void wb_timer_fn(unsigned long un
1012     static void laptop_timer_fn(unsigned long unused);
1013    
1014     static DEFINE_TIMER(wb_timer, wb_timer_fn, 0, 0);
1015     -static DEFINE_TIMER(laptop_mode_wb_timer, laptop_timer_fn, 0, 0);
1016     +DEFINE_TIMER(laptop_mode_wb_timer, laptop_timer_fn, 0, 0);
1017    
1018     /*
1019     * Periodic writeback of "old" data.
1020     Index: linux-2.6.16-ck1/mm/page_alloc.c
1021     ===================================================================
1022     --- linux-2.6.16-ck1.orig/mm/page_alloc.c 2006-03-20 20:46:59.000000000 +1100
1023     +++ linux-2.6.16-ck1/mm/page_alloc.c 2006-03-20 20:47:04.000000000 +1100
1024     @@ -532,7 +532,7 @@ static int prep_new_page(struct page *pa
1025     if (PageReserved(page))
1026     return 1;
1027    
1028     - page->flags &= ~(1 << PG_uptodate | 1 << PG_error |
1029     + page->flags &= ~(1 << PG_uptodate | 1 << PG_error | 1 << PG_readahead |
1030     1 << PG_referenced | 1 << PG_arch_1 |
1031     1 << PG_checked | 1 << PG_mappedtodisk);
1032     set_page_private(page, 0);
1033     Index: linux-2.6.16-ck1/mm/readahead.c
1034     ===================================================================
1035     --- linux-2.6.16-ck1.orig/mm/readahead.c 2006-03-20 20:46:23.000000000 +1100
1036     +++ linux-2.6.16-ck1/mm/readahead.c 2006-03-20 20:47:04.000000000 +1100
1037     @@ -14,6 +14,300 @@
1038     #include <linux/blkdev.h>
1039     #include <linux/backing-dev.h>
1040     #include <linux/pagevec.h>
1041     +#include <linux/writeback.h>
1042     +#include <linux/nfsd/const.h>
1043     +#include <asm/div64.h>
1044     +
1045     +/* The default max/min read-ahead pages. */
1046     +#define KB(size) (((size)*1024 + PAGE_CACHE_SIZE-1) / PAGE_CACHE_SIZE)
1047     +#define MAX_RA_PAGES KB(VM_MAX_READAHEAD)
1048     +#define MIN_RA_PAGES KB(VM_MIN_READAHEAD)
1049     +#define MIN_NFSD_PAGES KB(NFSSVC_MAXBLKSIZE/1024)
1050     +
1051     +#define next_page(pg) (list_entry((pg)->lru.prev, struct page, lru))
1052     +#define prev_page(pg) (list_entry((pg)->lru.next, struct page, lru))
1053     +
1054     +#ifdef CONFIG_ADAPTIVE_READAHEAD
1055     +/*
1056     + * Adaptive read-ahead parameters.
1057     + */
1058     +
1059     +/* In laptop mode, poll delayed look-ahead on every ## pages read. */
1060     +#define LAPTOP_POLL_INTERVAL 16
1061     +
1062     +/* Set look-ahead size to 1/# of the thrashing-threshold. */
1063     +#define LOOKAHEAD_RATIO 8
1064     +
1065     +/* Set read-ahead size to ##% of the thrashing-threshold. */
1066     +int readahead_ratio = 50;
1067     +EXPORT_SYMBOL(readahead_ratio);
1068     +
1069     +/* Readahead as long as cache hit ratio keeps above 1/##. */
1070     +int readahead_hit_rate = 2;
1071     +EXPORT_SYMBOL(readahead_hit_rate);
1072     +
1073     +/*
1074     + * Measures the aging process of cold pages.
1075     + * Mainly increased on fresh page references to make it smooth.
1076     + */
1077     +DEFINE_PER_CPU(unsigned long, readahead_aging);
1078     +EXPORT_PER_CPU_SYMBOL(readahead_aging);
1079     +
1080     +/*
1081     + * Detailed classification of read-ahead behaviors.
1082     + */
1083     +#define RA_CLASS_SHIFT 4
1084     +#define RA_CLASS_MASK ((1 << RA_CLASS_SHIFT) - 1)
1085     +enum ra_class {
1086     + RA_CLASS_ALL,
1087     + RA_CLASS_NEWFILE,
1088     + RA_CLASS_STATE,
1089     + RA_CLASS_CONTEXT,
1090     + RA_CLASS_CONTEXT_AGGRESSIVE,
1091     + RA_CLASS_BACKWARD,
1092     + RA_CLASS_THRASHING,
1093     + RA_CLASS_SEEK,
1094     + RA_CLASS_END,
1095     +};
1096     +#endif /* CONFIG_ADAPTIVE_READAHEAD */
1097     +
1098     +/*
1099     + * Read-ahead events accounting.
1100     + */
1101     +#ifdef CONFIG_DEBUG_READAHEAD
1102     +#include <linux/init.h>
1103     +#include <linux/jiffies.h>
1104     +#include <linux/debugfs.h>
1105     +#include <linux/seq_file.h>
1106     +
1107     +#define DEBUG_READAHEAD_RADIXTREE
1108     +
1109     +/* Read-ahead events to be accounted. */
1110     +enum ra_event {
1111     + RA_EVENT_CACHE_MISS, /* read cache misses */
1112     + RA_EVENT_READRANDOM, /* random reads */
1113     + RA_EVENT_IO_CONGESTION, /* io congestion */
1114     + RA_EVENT_IO_CACHE_HIT, /* canceled io due to cache hit */
1115     + RA_EVENT_IO_BLOCK, /* read on locked page */
1116     +
1117     + RA_EVENT_READAHEAD, /* read-ahead issued */
1118     + RA_EVENT_READAHEAD_HIT, /* read-ahead page hit */
1119     + RA_EVENT_LOOKAHEAD, /* look-ahead issued */
1120     + RA_EVENT_LOOKAHEAD_HIT, /* look-ahead mark hit */
1121     + RA_EVENT_LOOKAHEAD_NOACTION, /* look-ahead mark ignored */
1122     + RA_EVENT_READAHEAD_MMAP, /* read-ahead for memory mapped file */
1123     + RA_EVENT_READAHEAD_EOF, /* read-ahead reaches EOF */
1124     + RA_EVENT_READAHEAD_SHRINK, /* ra_size under previous la_size */
1125     + RA_EVENT_READAHEAD_THRASHING, /* read-ahead thrashing happened */
1126     + RA_EVENT_READAHEAD_MUTILATE, /* read-ahead request mutilated */
1127     + RA_EVENT_READAHEAD_RESCUE, /* read-ahead rescued */
1128     +
1129     + RA_EVENT_END
1130     +};
1131     +
1132     +static const char * const ra_event_name[] = {
1133     + "cache_miss",
1134     + "read_random",
1135     + "io_congestion",
1136     + "io_cache_hit",
1137     + "io_block",
1138     + "readahead",
1139     + "readahead_hit",
1140     + "lookahead",
1141     + "lookahead_hit",
1142     + "lookahead_ignore",
1143     + "readahead_mmap",
1144     + "readahead_eof",
1145     + "readahead_shrink",
1146     + "readahead_thrash",
1147     + "readahead_mutilt",
1148     + "readahead_rescue",
1149     +};
1150     +
1151     +static const char * const ra_class_name[] = {
1152     + "total",
1153     + "newfile",
1154     + "state",
1155     + "context",
1156     + "contexta",
1157     + "backward",
1158     + "onthrash",
1159     + "onraseek",
1160     + "none",
1161     +};
1162     +
1163     +static unsigned long ra_events[RA_CLASS_END+1][RA_EVENT_END+1][2];
1164     +
1165     +static inline void ra_account(struct file_ra_state *ra,
1166     + enum ra_event e, int pages)
1167     +{
1168     + enum ra_class c;
1169     +
1170     + if (e == RA_EVENT_READAHEAD_HIT && pages < 0) {
1171     + c = (ra->flags >> RA_CLASS_SHIFT) & RA_CLASS_MASK;
1172     + pages = -pages;
1173     + } else if (ra)
1174     + c = ra->flags & RA_CLASS_MASK;
1175     + else
1176     + c = RA_CLASS_END;
1177     +
1178     + if (!c)
1179     + c = RA_CLASS_END;
1180     +
1181     + ra_events[c][e][0] += 1;
1182     + ra_events[c][e][1] += pages;
1183     +
1184     + if (e == RA_EVENT_READAHEAD)
1185     + ra_events[c][RA_EVENT_END][1] += pages * pages;
1186     +}
1187     +
1188     +static int ra_events_show(struct seq_file *s, void *_)
1189     +{
1190     + int i;
1191     + int c;
1192     + int e;
1193     + static const char event_fmt[] = "%-16s";
1194     + static const char class_fmt[] = "%10s";
1195     + static const char item_fmt[] = "%10lu";
1196     + static const char percent_format[] = "%9lu%%";
1197     + static const char * const table_name[] = {
1198     + "[table requests]",
1199     + "[table pages]",
1200     + "[table summary]"};
1201     +
1202     + for (i = 0; i <= 1; i++) {
1203     + for (e = 0; e <= RA_EVENT_END; e++) {
1204     + ra_events[0][e][i] = 0;
1205     + for (c = 1; c < RA_CLASS_END; c++)
1206     + ra_events[0][e][i] += ra_events[c][e][i];
1207     + }
1208     +
1209     + seq_printf(s, event_fmt, table_name[i]);
1210     + for (c = 0; c <= RA_CLASS_END; c++)
1211     + seq_printf(s, class_fmt, ra_class_name[c]);
1212     + seq_puts(s, "\n");
1213     +
1214     + for (e = 0; e < RA_EVENT_END; e++) {
1215     + if (e == RA_EVENT_READAHEAD_HIT && i == 0)
1216     + continue;
1217     + if (e == RA_EVENT_IO_BLOCK && i == 1)
1218     + continue;
1219     +
1220     + seq_printf(s, event_fmt, ra_event_name[e]);
1221     + for (c = 0; c <= RA_CLASS_END; c++)
1222     + seq_printf(s, item_fmt, ra_events[c][e][i]);
1223     + seq_puts(s, "\n");
1224     + }
1225     + seq_puts(s, "\n");
1226     + }
1227     +
1228     + seq_printf(s, event_fmt, table_name[2]);
1229     + for (c = 0; c <= RA_CLASS_END; c++)
1230     + seq_printf(s, class_fmt, ra_class_name[c]);
1231     + seq_puts(s, "\n");
1232     +
1233     + seq_printf(s, event_fmt, "random_rate");
1234     + for (c = 0; c <= RA_CLASS_END; c++)
1235     + seq_printf(s, percent_format,
1236     + (ra_events[c][RA_EVENT_READRANDOM][0] * 100) /
1237     + ((ra_events[c][RA_EVENT_READRANDOM][0] +
1238     + ra_events[c][RA_EVENT_READAHEAD][0]) | 1));
1239     + seq_puts(s, "\n");
1240     +
1241     + seq_printf(s, event_fmt, "ra_hit_rate");
1242     + for (c = 0; c <= RA_CLASS_END; c++)
1243     + seq_printf(s, percent_format,
1244     + (ra_events[c][RA_EVENT_READAHEAD_HIT][1] * 100) /
1245     + (ra_events[c][RA_EVENT_READAHEAD][1] | 1));
1246     + seq_puts(s, "\n");
1247     +
1248     + seq_printf(s, event_fmt, "la_hit_rate");
1249     + for (c = 0; c <= RA_CLASS_END; c++)
1250     + seq_printf(s, percent_format,
1251     + (ra_events[c][RA_EVENT_LOOKAHEAD_HIT][0] * 100) /
1252     + (ra_events[c][RA_EVENT_LOOKAHEAD][0] | 1));
1253     + seq_puts(s, "\n");
1254     +
1255     + seq_printf(s, event_fmt, "var_ra_size");
1256     + for (c = 0; c <= RA_CLASS_END; c++)
1257     + seq_printf(s, item_fmt,
1258     + (ra_events[c][RA_EVENT_END][1] -
1259     + ra_events[c][RA_EVENT_READAHEAD][1] *
1260     + (ra_events[c][RA_EVENT_READAHEAD][1] /
1261     + (ra_events[c][RA_EVENT_READAHEAD][0] | 1))) /
1262     + (ra_events[c][RA_EVENT_READAHEAD][0] | 1));
1263     + seq_puts(s, "\n");
1264     +
1265     + seq_printf(s, event_fmt, "avg_ra_size");
1266     + for (c = 0; c <= RA_CLASS_END; c++)
1267     + seq_printf(s, item_fmt,
1268     + (ra_events[c][RA_EVENT_READAHEAD][1] +
1269     + ra_events[c][RA_EVENT_READAHEAD][0] / 2) /
1270     + (ra_events[c][RA_EVENT_READAHEAD][0] | 1));
1271     + seq_puts(s, "\n");
1272     +
1273     + seq_printf(s, event_fmt, "avg_la_size");
1274     + for (c = 0; c <= RA_CLASS_END; c++)
1275     + seq_printf(s, item_fmt,
1276     + (ra_events[c][RA_EVENT_LOOKAHEAD][1] +
1277     + ra_events[c][RA_EVENT_LOOKAHEAD][0] / 2) /
1278     + (ra_events[c][RA_EVENT_LOOKAHEAD][0] | 1));
1279     + seq_puts(s, "\n");
1280     +
1281     + return 0;
1282     +}
1283     +
1284     +static int ra_events_open(struct inode *inode, struct file *file)
1285     +{
1286     + return single_open(file, ra_events_show, NULL);
1287     +}
1288     +
1289     +static ssize_t ra_events_write(struct file *file, const char __user *buf,
1290     + size_t size, loff_t *offset)
1291     +{
1292     + memset(ra_events, 0, sizeof(ra_events));
1293     + return 1;
1294     +}
1295     +
1296     +struct file_operations ra_events_fops = {
1297     + .owner = THIS_MODULE,
1298     + .open = ra_events_open,
1299     + .write = ra_events_write,
1300     + .read = seq_read,
1301     + .llseek = seq_lseek,
1302     + .release = single_release,
1303     +};
1304     +
1305     +u32 readahead_debug_level = 0;
1306     +u32 disable_stateful_method = 0;
1307     +
1308     +static int __init readahead_init(void)
1309     +{
1310     + struct dentry *root;
1311     +
1312     + root = debugfs_create_dir("readahead", NULL);
1313     +
1314     + debugfs_create_file("events", 0644, root, NULL, &ra_events_fops);
1315     +
1316     + debugfs_create_u32("debug_level", 0644, root, &readahead_debug_level);
1317     + debugfs_create_bool("disable_stateful_method", 0644, root,
1318     + &disable_stateful_method);
1319     +
1320     + return 0;
1321     +}
1322     +
1323     +module_init(readahead_init)
1324     +#else
1325     +#define ra_account(ra, e, pages) do { } while (0)
1326     +#define readahead_debug_level (0)
1327     +#define disable_stateful_method (0)
1328     +#endif /* CONFIG_DEBUG_READAHEAD */
1329     +
1330     +#define dprintk(args...) \
1331     + do { if (readahead_debug_level >= 1) printk(KERN_DEBUG args); } while(0)
1332     +#define ddprintk(args...) \
1333     + do { if (readahead_debug_level >= 2) printk(KERN_DEBUG args); } while(0)
1334     +
1335    
1336     void default_unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
1337     {
1338     @@ -21,7 +315,7 @@ void default_unplug_io_fn(struct backing
1339     EXPORT_SYMBOL(default_unplug_io_fn);
1340    
1341     struct backing_dev_info default_backing_dev_info = {
1342     - .ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE,
1343     + .ra_pages = MAX_RA_PAGES,
1344     .state = 0,
1345     .capabilities = BDI_CAP_MAP_COPY,
1346     .unplug_io_fn = default_unplug_io_fn,
1347     @@ -49,7 +343,7 @@ static inline unsigned long get_max_read
1348    
1349     static inline unsigned long get_min_readahead(struct file_ra_state *ra)
1350     {
1351     - return (VM_MIN_READAHEAD * 1024) / PAGE_CACHE_SIZE;
1352     + return MIN_RA_PAGES;
1353     }
1354    
1355     static inline void ra_off(struct file_ra_state *ra)
1356     @@ -134,8 +428,10 @@ int read_cache_pages(struct address_spac
1357     continue;
1358     }
1359     ret = filler(data, page);
1360     - if (!pagevec_add(&lru_pvec, page))
1361     + if (!pagevec_add(&lru_pvec, page)) {
1362     + cond_resched();
1363     __pagevec_lru_add(&lru_pvec);
1364     + }
1365     if (ret) {
1366     while (!list_empty(pages)) {
1367     struct page *victim;
1368     @@ -173,8 +469,10 @@ static int read_pages(struct address_spa
1369     page->index, GFP_KERNEL)) {
1370     ret = mapping->a_ops->readpage(filp, page);
1371     if (ret != AOP_TRUNCATED_PAGE) {
1372     - if (!pagevec_add(&lru_pvec, page))
1373     + if (!pagevec_add(&lru_pvec, page)) {
1374     + cond_resched();
1375     __pagevec_lru_add(&lru_pvec);
1376     + }
1377     continue;
1378     } /* else fall through to release */
1379     }
1380     @@ -257,7 +555,8 @@ out:
1381     */
1382     static int
1383     __do_page_cache_readahead(struct address_space *mapping, struct file *filp,
1384     - pgoff_t offset, unsigned long nr_to_read)
1385     + pgoff_t offset, unsigned long nr_to_read,
1386     + unsigned long lookahead_size)
1387     {
1388     struct inode *inode = mapping->host;
1389     struct page *page;
1390     @@ -270,7 +569,7 @@ __do_page_cache_readahead(struct address
1391     if (isize == 0)
1392     goto out;
1393    
1394     - end_index = ((isize - 1) >> PAGE_CACHE_SHIFT);
1395     + end_index = ((isize - 1) >> PAGE_CACHE_SHIFT);
1396    
1397     /*
1398     * Preallocate as many pages as we will need.
1399     @@ -287,12 +586,15 @@ __do_page_cache_readahead(struct address
1400     continue;
1401    
1402     read_unlock_irq(&mapping->tree_lock);
1403     + cond_resched();
1404     page = page_cache_alloc_cold(mapping);
1405     read_lock_irq(&mapping->tree_lock);
1406     if (!page)
1407     break;
1408     page->index = page_offset;
1409     list_add(&page->lru, &page_pool);
1410     + if (page_idx == nr_to_read - lookahead_size)
1411     + __SetPageReadahead(page);
1412     ret++;
1413     }
1414     read_unlock_irq(&mapping->tree_lock);
1415     @@ -329,7 +631,7 @@ int force_page_cache_readahead(struct ad
1416     if (this_chunk > nr_to_read)
1417     this_chunk = nr_to_read;
1418     err = __do_page_cache_readahead(mapping, filp,
1419     - offset, this_chunk);
1420     + offset, this_chunk, 0);
1421     if (err < 0) {
1422     ret = err;
1423     break;
1424     @@ -338,6 +640,9 @@ int force_page_cache_readahead(struct ad
1425     offset += this_chunk;
1426     nr_to_read -= this_chunk;
1427     }
1428     +
1429     + ra_account(NULL, RA_EVENT_READAHEAD, ret);
1430     +
1431     return ret;
1432     }
1433    
1434     @@ -373,10 +678,16 @@ static inline int check_ra_success(struc
1435     int do_page_cache_readahead(struct address_space *mapping, struct file *filp,
1436     pgoff_t offset, unsigned long nr_to_read)
1437     {
1438     + unsigned long ret;
1439     +
1440     if (bdi_read_congested(mapping->backing_dev_info))
1441     return -1;
1442    
1443     - return __do_page_cache_readahead(mapping, filp, offset, nr_to_read);
1444     + ret = __do_page_cache_readahead(mapping, filp, offset, nr_to_read, 0);
1445     +
1446     + ra_account(NULL, RA_EVENT_READAHEAD, ret);
1447     +
1448     + return ret;
1449     }
1450    
1451     /*
1452     @@ -396,7 +707,11 @@ blockable_page_cache_readahead(struct ad
1453     if (!block && bdi_read_congested(mapping->backing_dev_info))
1454     return 0;
1455    
1456     - actual = __do_page_cache_readahead(mapping, filp, offset, nr_to_read);
1457     + actual = __do_page_cache_readahead(mapping, filp, offset, nr_to_read, 0);
1458     +
1459     + ra_account(NULL, RA_EVENT_READAHEAD, actual);
1460     + dprintk("blockable-readahead(ino=%lu, ra=%lu+%lu) = %d\n",
1461     + mapping->host->i_ino, offset, nr_to_read, actual);
1462    
1463     return check_ra_success(ra, nr_to_read, actual);
1464     }
1465     @@ -442,7 +757,7 @@ static int make_ahead_window(struct addr
1466     * @req_size: hint: total size of the read which the caller is performing in
1467     * PAGE_CACHE_SIZE units
1468     *
1469     - * page_cache_readahead() is the main function. If performs the adaptive
1470     + * page_cache_readahead() is the main function. It performs the adaptive
1471     * readahead window size management and submits the readahead I/O.
1472     *
1473     * Note that @filp is purely used for passing on to the ->readpage[s]()
1474     @@ -572,3 +887,1187 @@ unsigned long max_sane_readahead(unsigne
1475     __get_zone_counts(&active, &inactive, &free, NODE_DATA(numa_node_id()));
1476     return min(nr, (inactive + free) / 2);
1477     }
1478     +
1479     +/*
1480     + * Adaptive read-ahead.
1481     + *
1482     + * Good read patterns are compact both in space and time. The read-ahead logic
1483     + * tries to grant larger read-ahead size to better readers under the constraint
1484     + * of system memory and load pressure.
1485     + *
1486     + * It employs two methods to estimate the maximum thrashing-safe read-ahead size:
1487     + * 1. state based - the default one
1488     + * 2. context based - the failsafe one
1489     + * The integration of the dual methods has the merit of being agile and robust.
1490     + * It makes the overall design clean: special cases are handled in general by
1491     + * the stateless method, leaving the stateful one simple and fast.
1492     + *
1493     + * To improve throughput and decrease read delay, the logic 'looks ahead'.
1494     + * In most read-ahead chunks, one page will be selected and tagged with
1495     + * PG_readahead. Later when the page with PG_readahead is read, the logic
1496     + * will be notified to submit the next read-ahead chunk in advance.
1497     + *
1498     + * a read-ahead chunk
1499     + * +-----------------------------------------+
1500     + * | # PG_readahead |
1501     + * +-----------------------------------------+
1502     + * ^ When this page is read, notify me for the next read-ahead.
1503     + *
1504     + *
1505     + * Here are some variable names used frequently:
1506     + *
1507     + * |<------- la_size ------>|
1508     + * +-----------------------------------------+
1509     + * | # |
1510     + * +-----------------------------------------+
1511     + * ra_index -->|<---------------- ra_size -------------->|
1512     + *
1513     + */
1514     +
1515     +#ifdef CONFIG_ADAPTIVE_READAHEAD
1516     +
1517     +/*
1518     + * The nature of read-ahead allows false tests to occur occasionally.
1519     + * Here we just do not bother to call get_page(); it's meaningless anyway.
1520     + */
1521     +static inline struct page *__find_page(struct address_space *mapping,
1522     + pgoff_t offset)
1523     +{
1524     + return radix_tree_lookup(&mapping->page_tree, offset);
1525     +}
1526     +
1527     +static inline struct page *find_page(struct address_space *mapping,
1528     + pgoff_t offset)
1529     +{
1530     + struct page *page;
1531     +
1532     + read_lock_irq(&mapping->tree_lock);
1533     + page = __find_page(mapping, offset);
1534     + read_unlock_irq(&mapping->tree_lock);
1535     + return page;
1536     +}
1537     +
1538     +/*
1539     + * Move pages in danger (of thrashing) to the head of inactive_list.
1540     + * Not expected to happen frequently.
1541     + */
1542     +static unsigned long rescue_pages(struct page *page, unsigned long nr_pages)
1543     +{
1544     + int pgrescue;
1545     + pgoff_t index;
1546     + struct zone *zone;
1547     + struct address_space *mapping;
1548     +
1549     + BUG_ON(!nr_pages || !page);
1550     + pgrescue = 0;
1551     + index = page_index(page);
1552     + mapping = page_mapping(page);
1553     +
1554     + dprintk("rescue_pages(ino=%lu, index=%lu nr=%lu)\n",
1555     + mapping->host->i_ino, index, nr_pages);
1556     +
1557     + for (;;) {
1558     + zone = page_zone(page);
1559     + spin_lock_irq(&zone->lru_lock);
1560     +
1561     + if (!PageLRU(page))
1562     + goto out_unlock;
1563     +
1564     + while (page_mapping(page) == mapping &&
1565     + page_index(page) == index) {
1566     + struct page *the_page = page;
1567     + page = next_page(page);
1568     + if (!PageActive(the_page) &&
1569     + !PageLocked(the_page) &&
1570     + page_count(the_page) == 1) {
1571     + list_move(&the_page->lru, &zone->inactive_list);
1572     + pgrescue++;
1573     + }
1574     + index++;
1575     + if (!--nr_pages)
1576     + goto out_unlock;
1577     + }
1578     +
1579     + spin_unlock_irq(&zone->lru_lock);
1580     +
1581     + cond_resched();
1582     + page = find_page(mapping, index);
1583     + if (!page)
1584     + goto out;
1585     + }
1586     +out_unlock:
1587     + spin_unlock_irq(&zone->lru_lock);
1588     +out:
1589     + ra_account(NULL, RA_EVENT_READAHEAD_RESCUE, pgrescue);
1590     + return nr_pages;
1591     +}
1592     +
1593     +/*
1594     + * Set a new look-ahead mark at @new_index.
1595     + * Return 0 if the new mark is successfully set.
1596     + */
1597     +static inline int renew_lookahead(struct address_space *mapping,
1598     + struct file_ra_state *ra,
1599     + pgoff_t index, pgoff_t new_index)
1600     +{
1601     + struct page *page;
1602     +
1603     + if (index == ra->lookahead_index &&
1604     + new_index >= ra->readahead_index)
1605     + return 1;
1606     +
1607     + page = find_page(mapping, new_index);
1608     + if (!page)
1609     + return 1;
1610     +
1611     + __SetPageReadahead(page);
1612     + if (ra->lookahead_index == index)
1613     + ra->lookahead_index = new_index;
1614     +
1615     + return 0;
1616     +}
1617     +
1618     +/*
1619     + * State based calculation of read-ahead request.
1620     + *
1621     + * This figure shows the meaning of file_ra_state members:
1622     + *
1623     + * chunk A chunk B
1624     + * +---------------------------+-------------------------------------------+
1625     + * | # | # |
1626     + * +---------------------------+-------------------------------------------+
1627     + * ^ ^ ^ ^
1628     + * la_index ra_index lookahead_index readahead_index
1629     + */
1630     +
1631     +/*
1632     + * The node's effective length of inactive_list(s).
1633     + */
1634     +static unsigned long node_free_and_cold_pages(void)
1635     +{
1636     + unsigned int i;
1637     + unsigned long sum = 0;
1638     + struct zone *zones = NODE_DATA(numa_node_id())->node_zones;
1639     +
1640     + for (i = 0; i < MAX_NR_ZONES; i++)
1641     + sum += zones[i].nr_inactive +
1642     + zones[i].free_pages - zones[i].pages_low;
1643     +
1644     + return sum;
1645     +}
1646     +
1647     +/*
1648     + * The node's accumulated aging activities.
1649     + */
1650     +static unsigned long node_readahead_aging(void)
1651     +{
1652     + unsigned long cpu;
1653     + unsigned long sum = 0;
1654     + cpumask_t mask = node_to_cpumask(numa_node_id());
1655     +
1656     + for_each_cpu_mask(cpu, mask)
1657     + sum += per_cpu(readahead_aging, cpu);
1658     +
1659     + return sum;
1660     +}
1661     +
1662     +/*
1663     + * The 64-bit cache_hits stores three accumulated values and a counter value.
1664     + * MSB LSB
1665     + * 3333333333333333 : 2222222222222222 : 1111111111111111 : 0000000000000000
1666     + */
1667     +static inline int ra_cache_hit(struct file_ra_state *ra, int nr)
1668     +{
1669     + return (ra->cache_hits >> (nr * 16)) & 0xFFFF;
1670     +}
1671     +
1672     +/*
1673     + * Conceptual code:
1674     + * ra_cache_hit(ra, 1) += ra_cache_hit(ra, 0);
1675     + * ra_cache_hit(ra, 0) = 0;
1676     + */
1677     +static inline void ra_addup_cache_hit(struct file_ra_state *ra)
1678     +{
1679     + int n;
1680     +
1681     + n = ra_cache_hit(ra, 0);
1682     + ra->cache_hits -= n;
1683     + n <<= 16;
1684     + ra->cache_hits += n;
1685     +}
1686     +
1687     +/*
1688     + * The read-ahead is deemed successful if cache-hit-rate >= 1/readahead_hit_rate.
1689     + */
1690     +static inline int ra_cache_hit_ok(struct file_ra_state *ra)
1691     +{
1692     + return ra_cache_hit(ra, 0) * readahead_hit_rate >=
1693     + (ra->lookahead_index - ra->la_index);
1694     +}
1695     +
1696     +/*
1697     + * Check if @index falls in the @ra request.
1698     + */
1699     +static inline int ra_has_index(struct file_ra_state *ra, pgoff_t index)
1700     +{
1701     + if (index < ra->la_index || index >= ra->readahead_index)
1702     + return 0;
1703     +
1704     + if (index >= ra->ra_index)
1705     + return 1;
1706     + else
1707     + return -1;
1708     +}
1709     +
1710     +/*
1711     + * Which method is issuing this read-ahead?
1712     + */
1713     +static inline void ra_set_class(struct file_ra_state *ra,
1714     + enum ra_class ra_class)
1715     +{
1716     + unsigned long flags_mask;
1717     + unsigned long flags;
1718     + unsigned long old_ra_class;
1719     +
1720     + flags_mask = ~(RA_CLASS_MASK | (RA_CLASS_MASK << RA_CLASS_SHIFT));
1721     + flags = ra->flags & flags_mask;
1722     +
1723     + old_ra_class = (ra->flags & RA_CLASS_MASK) << RA_CLASS_SHIFT;
1724     +
1725     + ra->flags = flags | old_ra_class | ra_class;
1726     +
1727     + ra_addup_cache_hit(ra);
1728     + if (ra_class != RA_CLASS_STATE)
1729     + ra->cache_hits <<= 16;
1730     +
1731     + ra->age = node_readahead_aging();
1732     +}
1733     +
1734     +/*
1735     + * Where is the old read-ahead and look-ahead?
1736     + */
1737     +static inline void ra_set_index(struct file_ra_state *ra,
1738     + pgoff_t la_index, pgoff_t ra_index)
1739     +{
1740     + ra->la_index = la_index;
1741     + ra->ra_index = ra_index;
1742     +}
1743     +
1744     +/*
1745     + * Where is the new read-ahead and look-ahead?
1746     + */
1747     +static inline void ra_set_size(struct file_ra_state *ra,
1748     + unsigned long ra_size, unsigned long la_size)
1749     +{
1750     + /* Disable look-ahead for loopback file. */
1751     + if (unlikely(ra->flags & RA_FLAG_NO_LOOKAHEAD))
1752     + la_size = 0;
1753     +
1754     + ra->readahead_index = ra->ra_index + ra_size;
1755     + ra->lookahead_index = ra->readahead_index - la_size;
1756     +}
1757     +
1758     +/*
1759     + * Submit IO for the read-ahead request in file_ra_state.
1760     + */
1761     +static int ra_dispatch(struct file_ra_state *ra,
1762     + struct address_space *mapping, struct file *filp)
1763     +{
1764     + pgoff_t eof_index;
1765     + unsigned long ra_size;
1766     + unsigned long la_size;
1767     + int actual;
1768     + enum ra_class ra_class;
1769     +
1770     + ra_class = (ra->flags & RA_CLASS_MASK);
1771     + BUG_ON(ra_class == 0 || ra_class > RA_CLASS_END);
1772     +
1773     + eof_index = ((i_size_read(mapping->host) - 1) >> PAGE_CACHE_SHIFT) + 1;
1774     + ra_size = ra->readahead_index - ra->ra_index;
1775     + la_size = ra->readahead_index - ra->lookahead_index;
1776     +
1777     + /* Snap to EOF. */
1778     + if (unlikely(ra->ra_index >= eof_index))
1779     + return 0;
1780     + if (ra->readahead_index + ra_size / 2 > eof_index) {
1781     + if (ra_class == RA_CLASS_CONTEXT_AGGRESSIVE &&
1782     + eof_index > ra->lookahead_index + 1)
1783     + la_size = eof_index - ra->lookahead_index;
1784     + else
1785     + la_size = 0;
1786     + ra_size = eof_index - ra->ra_index;
1787     + ra_set_size(ra, ra_size, la_size);
1788     + }
1789     +
1790     + actual = __do_page_cache_readahead(mapping, filp,
1791     + ra->ra_index, ra_size, la_size);
1792     +
1793     +#ifdef CONFIG_DEBUG_READAHEAD
1794     + if (ra->flags & RA_FLAG_MMAP)
1795     + ra_account(ra, RA_EVENT_READAHEAD_MMAP, actual);
1796     + if (ra->readahead_index == eof_index)
1797     + ra_account(ra, RA_EVENT_READAHEAD_EOF, actual);
1798     + if (la_size)
1799     + ra_account(ra, RA_EVENT_LOOKAHEAD, la_size);
1800     + if (ra_size > actual)
1801     + ra_account(ra, RA_EVENT_IO_CACHE_HIT, ra_size - actual);
1802     + ra_account(ra, RA_EVENT_READAHEAD, actual);
1803     +
1804     + if (!ra->ra_index && filp->f_dentry->d_inode) {
1805     + char *fn;
1806     + static char path[1024];
1807     + unsigned long size;
1808     +
1809     + size = (i_size_read(filp->f_dentry->d_inode)+1023)/1024;
1810     + fn = d_path(filp->f_dentry, filp->f_vfsmnt, path, 1000);
1811     + if (!IS_ERR(fn))
1812     + ddprintk("ino %lu is %s size %luK by %s(%d)\n",
1813     + filp->f_dentry->d_inode->i_ino,
1814     + fn, size,
1815     + current->comm, current->pid);
1816     + }
1817     +
1818     + dprintk("readahead-%s(ino=%lu, index=%lu, ra=%lu+%lu-%lu) = %d\n",
1819     + ra_class_name[ra_class],
1820     + mapping->host->i_ino, ra->la_index,
1821     + ra->ra_index, ra_size, la_size, actual);
1822     +#endif /* CONFIG_DEBUG_READAHEAD */
1823     +
1824     + return actual;
1825     +}
1826     +
1827     +/*
1828     + * Determine the ra request from primitive values.
1829     + *
1830     + * It applies the following rules:
1831     + * - Subtract the old look-ahead from ra_size to get the real safe read-ahead;
1832     + * - Set new la_size according to the (still large) ra_size;
1833     + * - Apply upper limits;
1834     + * - Make sure stream_shift is not too small.
1835     + * (So that the next global_shift will not be too small.)
1836     + *
1837     + * Input:
1838     + * ra_size stores the estimated thrashing-threshold.
1839     + * la_size stores the look-ahead size of previous request.
1840     + */
1841     +static inline int adjust_rala(unsigned long ra_max,
1842     + unsigned long *ra_size, unsigned long *la_size)
1843     +{
1844     + unsigned long stream_shift = *la_size;
1845     +
1846     + if (*ra_size > *la_size)
1847     + *ra_size -= *la_size;
1848     + else {
1849     + ra_account(NULL, RA_EVENT_READAHEAD_SHRINK, *ra_size);
1850     + return 0;
1851     + }
1852     +
1853     + *la_size = *ra_size / LOOKAHEAD_RATIO;
1854     +
1855     + if (*ra_size > ra_max)
1856     + *ra_size = ra_max;
1857     + if (*la_size > *ra_size)
1858     + *la_size = *ra_size;
1859     +
1860     + stream_shift += (*ra_size - *la_size);
1861     + if (stream_shift < *ra_size / 4)
1862     + *la_size -= (*ra_size / 4 - stream_shift);
1863     +
1864     + return 1;
1865     +}
1866     +
1867     +/*
1868     + * The function estimates two values:
1869     + * 1. thrashing-threshold for the current stream
1870     + * It is returned to make the next read-ahead request.
1871     + * 2. the remaining safe space for the current chunk
1872     + * It will be checked to ensure that the current chunk is safe.
1873     + *
1874     + * The computation is quite accurate under heavy load, but fluctuates
1875     + * more under light load (with a small global_shift), so the growth speed of
1876     + * ra_size must be limited, and a moderately large stream_shift must be ensured.
1877     + *
1878     + * This figure illustrates the formula used in the function:
1879     + * While the stream reads stream_shift pages inside the chunks,
1880     + * the chunks are shifted global_shift pages inside inactive_list.
1881     + *
1882     + * chunk A chunk B
1883     + * |<=============== global_shift ================|
1884     + * +-------------+ +-------------------+ |
1885     + * | # | | # | inactive_list |
1886     + * +-------------+ +-------------------+ head |
1887     + * |---->| |---------->|
1888     + * | |
1889     + * +-- stream_shift --+
1890     + */
1891     +static inline unsigned long compute_thrashing_threshold(
1892     + struct file_ra_state *ra,
1893     + unsigned long *remain)
1894     +{
1895     + unsigned long global_size;
1896     + unsigned long global_shift;
1897     + unsigned long stream_shift;
1898     + unsigned long ra_size;
1899     + uint64_t ll;
1900     +
1901     + global_size = node_free_and_cold_pages();
1902     + global_shift = node_readahead_aging() - ra->age;
1903     + global_shift |= 1UL;
1904     + stream_shift = ra_cache_hit(ra, 0);
1905     +
1906     + ll = (uint64_t) stream_shift * (global_size >> 9) * readahead_ratio * 5;
1907     + do_div(ll, global_shift);
1908     + ra_size = ll;
1909     +
1910     + if (global_size > global_shift) {
1911     + ll = (uint64_t) stream_shift * (global_size - global_shift);
1912     + do_div(ll, global_shift);
1913     + *remain = ll;
1914     + } else
1915     + *remain = 0;
1916     +
1917     + ddprintk("compute_thrashing_threshold: "
1918     + "at %lu ra %lu=%lu*%lu/%lu, remain %lu for %lu\n",
1919     + ra->readahead_index, ra_size,
1920     + stream_shift, global_size, global_shift,
1921     + *remain, ra->readahead_index - ra->lookahead_index);
1922     +
1923     + return ra_size;
1924     +}
1925     +
1926     +/*
1927     + * Main function for file_ra_state based read-ahead.
1928     + */
1929     +static inline unsigned long
1930     +state_based_readahead(struct address_space *mapping, struct file *filp,
1931     + struct file_ra_state *ra,
1932     + struct page *page, pgoff_t index,
1933     + unsigned long ra_size, unsigned long ra_max)
1934     +{
1935     + unsigned long ra_old;
1936     + unsigned long la_size;
1937     + unsigned long remain_space;
1938     + unsigned long growth_limit;
1939     +
1940     + la_size = ra->readahead_index - index;
1941     + ra_old = ra->readahead_index - ra->ra_index;
1942     + growth_limit = ra_size + ra_max / 16 +
1943     + (2 + readahead_ratio / 64) * ra_old;
1944     + ra_size = compute_thrashing_threshold(ra, &remain_space);
1945     +
1946     + if (page && remain_space <= la_size && la_size > 1) {
1947     + rescue_pages(page, la_size);
1948     + return 0;
1949     + }
1950     +
1951     + if (!adjust_rala(min(ra_max, growth_limit), &ra_size, &la_size))
1952     + return 0;
1953     +
1954     + ra_set_class(ra, RA_CLASS_STATE);
1955     + ra_set_index(ra, index, ra->readahead_index);
1956     + ra_set_size(ra, ra_size, la_size);
1957     +
1958     + return ra_dispatch(ra, mapping, filp);
1959     +}
1960     +
1961     +/*
1962     + * Page cache context based estimation of read-ahead/look-ahead size/index.
1963     + *
1964     + * The logic first looks around to find the start point of next read-ahead,
1965     + * and then, if necessary, looks backward in the inactive_list to get an
1966     + * estimation of the thrashing-threshold.
1967     + *
1968     + * The estimation theory can be illustrated with figure:
1969     + *
1970     + * chunk A chunk B chunk C head
1971     + *
1972     + * l01 l11 l12 l21 l22
1973     + *| |-->|-->| |------>|-->| |------>|
1974     + *| +-------+ +-----------+ +-------------+ |
1975     + *| | # | | # | | # | |
1976     + *| +-------+ +-----------+ +-------------+ |
1977     + *| |<==============|<===========================|<============================|
1978     + * L0 L1 L2
1979     + *
1980     + * Let f(l) = L be a map from
1981     + * l: the number of pages read by the stream
1982     + * to
1983     + * L: the number of pages pushed into inactive_list in the mean time
1984     + * then
1985     + * f(l01) <= L0
1986     + * f(l11 + l12) = L1
1987     + * f(l21 + l22) = L2
1988     + * ...
1989     + * f(l01 + l11 + ...) <= Sum(L0 + L1 + ...)
1990     + * <= Length(inactive_list) = f(thrashing-threshold)
1991     + *
1992     + * So the count of continuous history pages left in the inactive_list is always
1993     + * a lower bound on the true thrashing-threshold.
1994     + */
1995     +
1996     +#define PAGE_REFCNT_0 0
1997     +#define PAGE_REFCNT_1 (1 << PG_referenced)
1998     +#define PAGE_REFCNT_2 (1 << PG_active)
1999     +#define PAGE_REFCNT_3 ((1 << PG_active) | (1 << PG_referenced))
2000     +#define PAGE_REFCNT_MASK PAGE_REFCNT_3
2001     +
2002     +/*
2003     + * STATUS REFERENCE COUNT
2004     + * __ 0
2005     + * _R PAGE_REFCNT_1
2006     + * A_ PAGE_REFCNT_2
2007     + * AR PAGE_REFCNT_3
2008     + *
2009     + * A/R: Active / Referenced
2010     + */
2011     +static inline unsigned long page_refcnt(struct page *page)
2012     +{
2013     + return page->flags & PAGE_REFCNT_MASK;
2014     +}
2015     +
2016     +/*
2017     + * STATUS REFERENCE COUNT TYPE
2018     + * __ 0 fresh
2019     + * _R PAGE_REFCNT_1 stale
2020     + * A_ PAGE_REFCNT_2 disturbed once
2021     + * AR PAGE_REFCNT_3 disturbed twice
2022     + *
2023     + * A/R: Active / Referenced
2024     + */
2025     +static inline unsigned long cold_page_refcnt(struct page *page)
2026     +{
2027     + if (!page || PageActive(page))
2028     + return 0;
2029     +
2030     + return page_refcnt(page);
2031     +}
2032     +
2033     +static inline char page_refcnt_symbol(struct page *page)
2034     +{
2035     + if (!page)
2036     + return 'X';
2037     +
2038     + switch (page_refcnt(page)) {
2039     + case 0:
2040     + return '_';
2041     + case PAGE_REFCNT_1:
2042     + return '-';
2043     + case PAGE_REFCNT_2:
2044     + return '=';
2045     + case PAGE_REFCNT_3:
2046     + return '#';
2047     + default:
2048     + return '?';
2049     + }
2050     +}
2051     +
2052     +/*
2053     + * Count/estimate cache hits in range [first_index, last_index].
2054     + * The estimation is simple and optimistic.
2055     + */
2056     +static int count_cache_hit(struct address_space *mapping,
2057     + pgoff_t first_index, pgoff_t last_index)
2058     +{
2059     + struct page *page;
2060     + int size = last_index - first_index + 1;
2061     + int count = 0;
2062     + int i;
2063     +
2064     + cond_resched();
2065     + read_lock_irq(&mapping->tree_lock);
2066     +
2067     + /*
2068     + * The first page may well be the chunk head and has been accessed,
2069     + * so it is index 0 that makes the estimation optimistic. This
2070     + * behavior guarantees a readahead when (size < ra_max) and
2071     + * (readahead_hit_rate >= 16).
2072     + */
2073     + for (i = 0; i < 16;) {
2074     + page = __find_page(mapping, first_index +
2075     + size * ((i++ * 29) & 15) / 16);
2076     + if (cold_page_refcnt(page) >= PAGE_REFCNT_1 && ++count >= 2)
2077     + break;
2078     + }
2079     +
2080     + read_unlock_irq(&mapping->tree_lock);
2081     +
2082     + return size * count / i;
2083     +}
2084     +
2085     +/*
2086     + * Look back and check history pages to estimate thrashing-threshold.
2087     + */
2088     +static unsigned long query_page_cache_segment(struct address_space *mapping,
2089     + struct file_ra_state *ra,
2090     + unsigned long *remain, pgoff_t offset,
2091     + unsigned long ra_min, unsigned long ra_max)
2092     +{
2093     + pgoff_t index;
2094     + unsigned long count;
2095     + unsigned long nr_lookback;
2096     + struct radix_tree_cache cache;
2097     +
2098     + /*
2099     + * Scan backward and check the nearest @ra_max pages.
2100     + * The count here determines ra_size.
2101     + */
2102     + cond_resched();
2103     + read_lock_irq(&mapping->tree_lock);
2104     + index = radix_tree_scan_hole_backward(&mapping->page_tree,
2105     + offset, ra_max);
2106     +#ifdef DEBUG_READAHEAD_RADIXTREE
2107     + WARN_ON(index > offset);
2108     + if (index != offset)
2109     + WARN_ON(!__find_page(mapping, index + 1));
2110     + if (index && offset - index < ra_max)
2111     + WARN_ON(__find_page(mapping, index));
2112     +#endif
2113     + read_unlock_irq(&mapping->tree_lock);
2114     +
2115     + *remain = offset - index;
2116     +
2117     + if (offset == ra->readahead_index && ra_cache_hit_ok(ra))
2118     + count = *remain;
2119     + else if (count_cache_hit(mapping, index + 1, offset) *
2120     + readahead_hit_rate >= *remain)
2121     + count = *remain;
2122     + else
2123     + count = ra_min;
2124     +
2125     + /*
2126     + * Unnecessary to count more?
2127     + */
2128     + if (count < ra_max)
2129     + goto out;
2130     +
2131     + if (unlikely(ra->flags & RA_FLAG_NO_LOOKAHEAD))
2132     + goto out;
2133     +
2134     + /*
2135     + * Check the far pages coarsely.
2136     + * The big count here helps increase la_size.
2137     + */
2138     + nr_lookback = ra_max * (LOOKAHEAD_RATIO + 1) *
2139     + 100 / (readahead_ratio + 1);
2140     +
2141     + cond_resched();
2142     + radix_tree_cache_init(&cache);
2143     + read_lock_irq(&mapping->tree_lock);
2144     + for (count += ra_max; count < nr_lookback; count += ra_max) {
2145     + struct radix_tree_node *node;
2146     + node = radix_tree_cache_lookup_node(&mapping->page_tree,
2147     + &cache, offset - count, 1);
2148     +#ifdef DEBUG_READAHEAD_RADIXTREE
2149     + if (node != radix_tree_lookup_node(&mapping->page_tree,
2150     + offset - count, 1))
2151     + BUG();
2152     +#endif
2153     + if (!node)
2154     + break;
2155     + }
2156     + read_unlock_irq(&mapping->tree_lock);
2157     +
2158     +out:
2159     + /*
2160     + * For sequential read that extends from index 0, the counted value
2161     + * may well be far under the true threshold, so return it unmodified
2162     + * for further processing in adjust_rala_aggressive().
2163     + */
2164     + if (count >= offset)
2165     + count = offset;
2166     + else
2167     + count = max(ra_min, count * readahead_ratio / 100);
2168     +
2169     + ddprintk("query_page_cache_segment: "
2170     + "ino=%lu, idx=%lu, count=%lu, remain=%lu\n",
2171     + mapping->host->i_ino, offset, count, *remain);
2172     +
2173     + return count;
2174     +}
2175     +
2176     +/*
2177     + * Find past-the-end index of the segment before @index.
2178     + */
2179     +static inline pgoff_t find_segtail_backward(struct address_space *mapping,
2180     + pgoff_t index, unsigned long max_scan)
2181     +{
2182     + struct radix_tree_cache cache;
2183     + struct page *page;
2184     + pgoff_t origin;
2185     +
2186     + origin = index;
2187     + if (max_scan > index)
2188     + max_scan = index;
2189     +
2190     + cond_resched();
2191     + radix_tree_cache_init(&cache);
2192     + read_lock_irq(&mapping->tree_lock);
2193     + for (; origin - index < max_scan;) {
2194     + page = radix_tree_cache_lookup(&mapping->page_tree,
2195     + &cache, --index);
2196     + if (page) {
2197     + read_unlock_irq(&mapping->tree_lock);
2198     + return index + 1;
2199     + }
2200     + }
2201     + read_unlock_irq(&mapping->tree_lock);
2202     +
2203     + return 0;
2204     +}
2205     +
2206     +/*
2207     + * Find past-the-end index of the segment at @index.
2208     + */
2209     +static inline pgoff_t find_segtail(struct address_space *mapping,
2210     + pgoff_t index, unsigned long max_scan)
2211     +{
2212     + pgoff_t ra_index;
2213     +
2214     + cond_resched();
2215     + read_lock_irq(&mapping->tree_lock);
2216     + ra_index = radix_tree_scan_hole(&mapping->page_tree, index, max_scan);
2217     +#ifdef DEBUG_READAHEAD_RADIXTREE
2218     + BUG_ON(!__find_page(mapping, index));
2219     + WARN_ON(ra_index < index);
2220     + if (ra_index != index && !__find_page(mapping, ra_index - 1))
2221     + printk(KERN_ERR "radix_tree_scan_hole(index=%lu ra_index=%lu "
2222     + "max_scan=%lu nrpages=%lu) fooled!\n",
2223     + index, ra_index, max_scan, mapping->nrpages);
2224     + if (ra_index != ~0UL && ra_index - index < max_scan)
2225     + WARN_ON(__find_page(mapping, ra_index));
2226     +#endif
2227     + read_unlock_irq(&mapping->tree_lock);
2228     +
2229     + if (ra_index <= index + max_scan)
2230     + return ra_index;
2231     + else
2232     + return 0;
2233     +}
2234     +
2235     +/*
2236     + * Determine the request parameters for context based read-ahead that extends
2237     + * from start of file.
2238     + *
2239     + * The major weakness of the stateless method is perhaps the slow growth of
2240     + * ra_size. The logic tries to make up for this in the important case of
2241     + * sequential reads that extend from start of file. In this case, the ra_size
2242     + * is not chosen to make the whole next chunk safe (as in normal ones). Only
2243     + * half of it is safe. The added 'unsafe' half is the look-ahead part. It
2244     + * is expected to be safeguarded by rescue_pages() when the previous chunks are
2245     + * lost.
2246     + */
2247     +static inline int adjust_rala_aggressive(unsigned long ra_max,
2248     + unsigned long *ra_size, unsigned long *la_size)
2249     +{
2250     + pgoff_t index = *ra_size;
2251     +
2252     + *ra_size -= min(*ra_size, *la_size);
2253     + *ra_size = *ra_size * readahead_ratio / 100;
2254     + *la_size = index * readahead_ratio / 100;
2255     + *ra_size += *la_size;
2256     +
2257     + if (*ra_size > ra_max)
2258     + *ra_size = ra_max;
2259     + if (*la_size > *ra_size)
2260     + *la_size = *ra_size;
2261     +
2262     + return 1;
2263     +}
2264     +
2265     +/*
2266     + * Main function for page context based read-ahead.
2267     + */
2268     +static inline int
2269     +try_context_based_readahead(struct address_space *mapping,
2270     + struct file_ra_state *ra, struct page *prev_page,
2271     + struct page *page, pgoff_t index,
2272     + unsigned long ra_min, unsigned long ra_max)
2273     +{
2274     + pgoff_t ra_index;
2275     + unsigned long ra_size;
2276     + unsigned long la_size;
2277     + unsigned long remain_pages;
2278     +
2279     + /* Where to start read-ahead?
2280     + * NFSv3 daemons may process adjacent requests in parallel,
2281     + * leading to many locally disordered, globally sequential reads.
2282     + * So do not require nearby history pages to be present or accessed.
2283     + */
2284     + if (page) {
2285     + ra_index = find_segtail(mapping, index, ra_max * 5 / 4);
2286     + if (!ra_index)
2287     + return -1;
2288     + } else if (prev_page || find_page(mapping, index - 1)) {
2289     + ra_index = index;
2290     + } else if (readahead_hit_rate > 1) {
2291     + ra_index = find_segtail_backward(mapping, index,
2292     + readahead_hit_rate + ra_min);
2293     + if (!ra_index)
2294     + return 0;
2295     + ra_min += 2 * (index - ra_index);
2296     + index = ra_index; /* pretend the request starts here */
2297     + } else
2298     + return 0;
2299     +
2300     + ra_size = query_page_cache_segment(mapping, ra, &remain_pages,
2301     + index, ra_min, ra_max);
2302     +
2303     + la_size = ra_index - index;
2304     + if (page && remain_pages <= la_size &&
2305     + remain_pages < index && la_size > 1) {
2306     + rescue_pages(page, la_size);
2307     + return -1;
2308     + }
2309     +
2310     + if (ra_size == index) {
2311     + if (!adjust_rala_aggressive(ra_max, &ra_size, &la_size))
2312     + return -1;
2313     + ra_set_class(ra, RA_CLASS_CONTEXT_AGGRESSIVE);
2314     + } else {
2315     + if (!adjust_rala(ra_max, &ra_size, &la_size))
2316     + return -1;
2317     + ra_set_class(ra, RA_CLASS_CONTEXT);
2318     + }
2319     +
2320     + ra_set_index(ra, index, ra_index);
2321     + ra_set_size(ra, ra_size, la_size);
2322     +
2323     + return 1;
2324     +}
2325     +
2326     +/*
2327     + * Read-ahead on start of file.
2328     + *
2329     + * The strategies here are most important for small files.
2330     + * 1. Set a moderately large read-ahead size;
2331     + * 2. Issue the next read-ahead request as soon as possible.
2332     + *
2333     + * But be careful: some applications dip into only the very head
2334     + * of a file. The most important thing is to prevent them from triggering the
2335     + * next (much larger) read-ahead request, which leads to lots of cache misses.
2336     + * Two pages should be enough for them; correct me if I'm wrong.
2337     + */
2338     +static inline unsigned long
2339     +newfile_readahead(struct address_space *mapping,
2340     + struct file *filp, struct file_ra_state *ra,
2341     + unsigned long req_size, unsigned long ra_min)
2342     +{
2343     + unsigned long ra_size;
2344     + unsigned long la_size;
2345     +
2346     + if (req_size > ra_min) /* larger value risks thrashing */
2347     + req_size = ra_min;
2348     +
2349     + if (unlikely(ra->flags & RA_FLAG_NFSD)) {
2350     + ra_size = MIN_NFSD_PAGES;
2351     + la_size = 0;
2352     + } else {
2353     + ra_size = 4 * req_size;
2354     + la_size = 2 * req_size;
2355     + }
2356     +
2357     + ra_set_class(ra, RA_CLASS_NEWFILE);
2358     + ra_set_index(ra, 0, 0);
2359     + ra_set_size(ra, ra_size, la_size);
2360     +
2361     + return ra_dispatch(ra, mapping, filp);
2362     +}
2363     +
2364     +/*
2365     + * Backward prefetching.
2366     + * No look-ahead or thrashing-threshold estimation for the stepping-backward
2367     + * pattern: they should be unnecessary.
2368     + */
2369     +static inline int
2370     +try_read_backward(struct file_ra_state *ra, pgoff_t begin_index,
2371     + unsigned long ra_size, unsigned long ra_max)
2372     +{
2373     + pgoff_t end_index;
2374     +
2375     + /* Are we reading backward? */
2376     + if (begin_index > ra->prev_page)
2377     + return 0;
2378     +
2379     + if ((ra->flags & RA_CLASS_MASK) == RA_CLASS_BACKWARD &&
2380     + ra_has_index(ra, ra->prev_page)) {
2381     + ra_size += 2 * ra_cache_hit(ra, 0);
2382     + end_index = ra->la_index;
2383     + } else {
2384     + ra_size += ra_size + ra_size * (readahead_hit_rate - 1) / 2;
2385     + end_index = ra->prev_page;
2386     + }
2387     +
2388     + if (ra_size > ra_max)
2389     + ra_size = ra_max;
2390     +
2391     + /* Read traces close enough to be covered by the prefetching? */
2392     + if (end_index > begin_index + ra_size)
2393     + return 0;
2394     +
2395     + begin_index = end_index - ra_size;
2396     +
2397     + ra_set_class(ra, RA_CLASS_BACKWARD);
2398     + ra_set_index(ra, begin_index, begin_index);
2399     + ra_set_size(ra, ra_size, 0);
2400     +
2401     + return 1;
2402     +}
2403     +
2404     +/*
2405     + * Readahead thrashing recovery.
2406     + */
2407     +static inline unsigned long
2408     +thrashing_recovery_readahead(struct address_space *mapping,
2409     + struct file *filp, struct file_ra_state *ra,
2410     + pgoff_t index, unsigned long ra_max)
2411     +{
2412     + unsigned long ra_size;
2413     +
2414     + if (readahead_debug_level && find_page(mapping, index - 1))
2415     + ra_account(ra, RA_EVENT_READAHEAD_MUTILATE,
2416     + ra->readahead_index - index);
2417     + ra_account(ra, RA_EVENT_READAHEAD_THRASHING,
2418     + ra->readahead_index - index);
2419     +
2420     + /*
2421     + * Some thrashing occurs in (ra_index, la_index], in which case the
2422     + * old read-ahead chunk is lost soon after the new one is allocated.
2423     + * Ensure that we recover all needed pages in the old chunk.
2424     + */
2425     + if (index < ra->ra_index)
2426     + ra_size = ra->ra_index - index;
2427     + else {
2428     + /* After thrashing, we know the exact thrashing-threshold. */
2429     + ra_size = ra_cache_hit(ra, 0);
2430     +
2431     + /* And we'd better be a bit conservative. */
2432     + ra_size = ra_size * 3 / 4;
2433     + }
2434     +
2435     + if (ra_size > ra_max)
2436     + ra_size = ra_max;
2437     +
2438     + ra_set_class(ra, RA_CLASS_THRASHING);
2439     + ra_set_index(ra, index, index);
2440     + ra_set_size(ra, ra_size, ra_size / LOOKAHEAD_RATIO);
2441     +
2442     + return ra_dispatch(ra, mapping, filp);
2443     +}
2444     +
2445     +/*
2446     + * If there was a previous sequential read, another sequential read is
2447     + * likely at the new position.
2448     + * Databases are known to have this seek-and-read-one-block pattern.
2449     + */
2450     +static inline int
2451     +try_readahead_on_seek(struct file_ra_state *ra, pgoff_t index,
2452     + unsigned long ra_size, unsigned long ra_max)
2453     +{
2454     + unsigned long hit0 = ra_cache_hit(ra, 0);
2455     + unsigned long hit1 = ra_cache_hit(ra, 1) + hit0;
2456     + unsigned long hit2 = ra_cache_hit(ra, 2);
2457     + unsigned long hit3 = ra_cache_hit(ra, 3);
2458     +
2459     + /* Is there a previous read-ahead request? */
2460     + if (!ra_has_index(ra, ra->prev_page))
2461     + return 0;
2462     +
2463     + /* Do the previous read-ahead sequences have similar sizes? */
2464     + if (!(ra_size < hit1 && hit1 > hit2 / 2 &&
2465     + hit2 > hit3 / 2 &&
2466     + hit3 > hit1 / 2))
2467     + return 0;
2468     +
2469     + hit1 = max(hit1, hit2);
2470     +
2471     + /* Follow the same prefetching direction. */
2472     + if ((ra->flags & RA_CLASS_MASK) == RA_CLASS_BACKWARD)
2473     + index = ((index > hit1 - ra_size) ? index - hit1 + ra_size : 0);
2474     +
2475     + ra_size = min(hit1, ra_max);
2476     +
2477     + ra_set_class(ra, RA_CLASS_SEEK);
2478     + ra_set_index(ra, index, index);
2479     + ra_set_size(ra, ra_size, 0);
2480     +
2481     + return 1;
2482     +}
2483     +
2484     +/*
2485     + * ra_min is mainly determined by the size of cache memory.
2486     + * Table of concrete numbers for 4KB page size:
2487     + * inactive + free (MB): 4 8 16 32 64 128 256 512 1024
2488     + * ra_min (KB): 16 16 16 16 20 24 32 48 64
2489     + */
2490     +static inline void get_readahead_bounds(struct file_ra_state *ra,
2491     + unsigned long *ra_min,
2492     + unsigned long *ra_max)
2493     +{
2494     + unsigned long pages;
2495     +
2496     + pages = max_sane_readahead(KB(1024*1024));
2497     + *ra_max = min(min(pages, 0xFFFFUL), ra->ra_pages);
2498     + *ra_min = min(min(MIN_RA_PAGES + (pages>>13), KB(128)), *ra_max/2);
2499     +}
2500     +
2501     +/**
2502     + * page_cache_readahead_adaptive - adaptive read-ahead main function
2503     + * @mapping, @ra, @filp: the same as page_cache_readahead()
2504     + * @prev_page: the page at @index-1, may be NULL to let the function find it
2505     + * @page: the page at @index, or NULL if non-present
2506     + * @begin_index, @index, @end_index: offsets into @mapping
2507     + * [@begin_index, @end_index) is the read the caller is performing
2508     + * @index indicates the page to be read now
2509     + *
2510     + * page_cache_readahead_adaptive() is the entry point of the adaptive
2511     + * read-ahead logic. It tries a set of methods in turn to determine the
2512     + * appropriate readahead action and submits the readahead I/O.
2513     + *
2514     + * The caller is expected to point ra->prev_page to the previously accessed
2515     + * page, and to call it on two conditions:
2516     + * 1. @page == NULL
2517     + * A cache miss happened, some pages have to be read in
2518     + * 2. @page != NULL && PageReadahead(@page)
2519     + * A look-ahead mark encountered, this is set by a previous read-ahead
2520     + * invocation to instruct the caller to give the function a chance to
2521     + * check up and do next read-ahead in advance.
2522     + */
2523     +unsigned long
2524     +page_cache_readahead_adaptive(struct address_space *mapping,
2525     + struct file_ra_state *ra, struct file *filp,
2526     + struct page *prev_page, struct page *page,
2527     + pgoff_t begin_index, pgoff_t index, pgoff_t end_index)
2528     +{
2529     + unsigned long size;
2530     + unsigned long ra_min;
2531     + unsigned long ra_max;
2532     + int ret;
2533     +
2534     + might_sleep();
2535     +
2536     + if (page) {
2537     + if (!TestClearPageReadahead(page))
2538     + return 0;
2539     + if (bdi_read_congested(mapping->backing_dev_info)) {
2540     + ra_account(ra, RA_EVENT_IO_CONGESTION,
2541     + end_index - index);
2542     + return 0;
2543     + }
2544     + if (laptop_mode && laptop_spinned_down()) {
2545     + if (!renew_lookahead(mapping, ra, index,
2546     + index + LAPTOP_POLL_INTERVAL))
2547     + return 0;
2548     + }
2549     + }
2550     +
2551     + if (page)
2552     + ra_account(ra, RA_EVENT_LOOKAHEAD_HIT,
2553     + ra->readahead_index - ra->lookahead_index);
2554     + else if (index)
2555     + ra_account(ra, RA_EVENT_CACHE_MISS, end_index - begin_index);
2556     +
2557     + size = end_index - index;
2558     + get_readahead_bounds(ra, &ra_min, &ra_max);
2559     +
2560     + /* readahead disabled? */
2561     + if (unlikely(!ra_max || !readahead_ratio)) {
2562     + size = max_sane_readahead(size);
2563     + goto readit;
2564     + }
2565     +
2566     + /*
2567     + * Start of file.
2568     + */
2569     + if (index == 0)
2570     + return newfile_readahead(mapping, filp, ra, end_index, ra_min);
2571     +
2572     + /*
2573     + * State based sequential read-ahead.
2574     + */
2575     + if (!disable_stateful_method &&
2576     + index == ra->lookahead_index && ra_cache_hit_ok(ra))
2577     + return state_based_readahead(mapping, filp, ra, page,
2578     + index, size, ra_max);
2579     +
2580     + /*
2581     + * Recover from possible thrashing.
2582     + */
2583     + if (!page && index == ra->prev_page + 1 && ra_has_index(ra, index))
2584     + return thrashing_recovery_readahead(mapping, filp, ra,
2585     + index, ra_max);
2586     +
2587     + /*
2588     + * Backward read-ahead.
2589     + */
2590     + if (!page && begin_index == index &&
2591     + try_read_backward(ra, index, size, ra_max))
2592     + return ra_dispatch(ra, mapping, filp);
2593     +
2594     + /*
2595     + * Context based sequential read-ahead.
2596     + */
2597     + ret = try_context_based_readahead(mapping, ra, prev_page, page,
2598     + index, ra_min, ra_max);
2599     + if (ret > 0)
2600     + return ra_dispatch(ra, mapping, filp);
2601     + if (ret < 0)
2602     + return 0;
2603     +
2604     + /* No action at look-ahead time? */
2605     + if (page) {
2606     + ra_account(ra, RA_EVENT_LOOKAHEAD_NOACTION,
2607     + ra->readahead_index - index);
2608     + return 0;
2609     + }
2610     +
2611     + /*
2612     + * Random read that follows a sequential one.
2613     + */
2614     + if (try_readahead_on_seek(ra, index, size, ra_max))
2615     + return ra_dispatch(ra, mapping, filp);
2616     +
2617     + /*
2618     + * Random read.
2619     + */
2620     + if (size > ra_max)
2621     + size = ra_max;
2622     +
2623     +readit:
2624     + size = __do_page_cache_readahead(mapping, filp, index, size, 0);
2625     +
2626     + ra_account(ra, RA_EVENT_READRANDOM, size);
2627     + dprintk("readrandom(ino=%lu, pages=%lu, index=%lu-%lu-%lu) = %lu\n",
2628     + mapping->host->i_ino, mapping->nrpages,
2629     + begin_index, index, end_index, size);
2630     +
2631     + return size;
2632     +}
2633     +
2634     +/**
2635     + * readahead_cache_hit - adaptive read-ahead feedback function
2636     + * @ra: file_ra_state which holds the readahead state
2637     + * @page: the page just accessed
2638     + *
2639     + * readahead_cache_hit() is the feedback route of the adaptive read-ahead
2640     + * logic. It must be called on every access on the read-ahead pages.
2641     + * logic. It must be called on every access to the read-ahead pages.
2642     +void fastcall readahead_cache_hit(struct file_ra_state *ra, struct page *page)
2643     +{
2644     + if (PageActive(page) || PageReferenced(page))
2645     + return;
2646     +
2647     + if (!PageUptodate(page))
2648     + ra_account(ra, RA_EVENT_IO_BLOCK, 1);
2649     +
2650     + if (!ra_has_index(ra, page->index))
2651     + return;
2652     +
2653     + ra->cache_hits++;
2654     +
2655     + if (page->index >= ra->ra_index)
2656     + ra_account(ra, RA_EVENT_READAHEAD_HIT, 1);
2657     + else
2658     + ra_account(ra, RA_EVENT_READAHEAD_HIT, -1);
2659     +}
2660     +
2661     +#endif /* CONFIG_ADAPTIVE_READAHEAD */
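
The ra_cache_hit()/ra_addup_cache_hit() helpers above pack four 16-bit hit counters
into the 64-bit ra->cache_hits word: slot 0 is the live counter, slots 1-3 hold older
accumulated values, and ra_set_class() folds slot 0 into slot 1 and then shifts the
word one slot deeper (dropping the oldest) whenever the read-ahead class changes to
anything but RA_CLASS_STATE. The stand-alone C sketch below only models that
arithmetic in user space; the demo_* names and the sample hit count are invented for
illustration and are not part of the patch.

    /*
     * Minimal user-space sketch of the cache_hits packing used by
     * ra_cache_hit()/ra_addup_cache_hit() above.  Slot 0 is the live
     * counter; slots 1-3 hold older, accumulated values.
     */
    #include <stdio.h>
    #include <stdint.h>

    struct demo_ra_state {
    	uint64_t cache_hits;
    };

    /* Extract the 16-bit counter in slot @nr (0 = most recent). */
    static unsigned int demo_cache_hit(struct demo_ra_state *ra, int nr)
    {
    	return (ra->cache_hits >> (nr * 16)) & 0xFFFF;
    }

    /* Fold slot 0 into slot 1 and clear slot 0, as ra_addup_cache_hit() does. */
    static void demo_addup_cache_hit(struct demo_ra_state *ra)
    {
    	uint64_t n = demo_cache_hit(ra, 0);

    	ra->cache_hits -= n;		/* clear slot 0 */
    	ra->cache_hits += n << 16;	/* accumulate into slot 1 */
    }

    int main(void)
    {
    	struct demo_ra_state ra = { .cache_hits = 0 };
    	int i;

    	for (i = 0; i < 300; i++)	/* 300 hits on the current chunk */
    		ra.cache_hits++;

    	demo_addup_cache_hit(&ra);
    	ra.cache_hits <<= 16;		/* class switch, as in ra_set_class() */

    	printf("slot0=%u slot1=%u slot2=%u slot3=%u\n",
    	       demo_cache_hit(&ra, 0), demo_cache_hit(&ra, 1),
    	       demo_cache_hit(&ra, 2), demo_cache_hit(&ra, 3));
    	return 0;
    }

With this input the 300 hits accumulated in slot 0 end up in slot 2 after the fold and
the class-switch shift, which is how older histories get pushed one slot deeper.
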
2662     Index: linux-2.6.16-ck1/mm/swap.c
2663     ===================================================================
2664     --- linux-2.6.16-ck1.orig/mm/swap.c 2006-03-20 20:46:55.000000000 +1100
2665     +++ linux-2.6.16-ck1/mm/swap.c 2006-03-20 20:47:04.000000000 +1100
2666     @@ -128,6 +128,8 @@ void fastcall mark_page_accessed(struct
2667     ClearPageReferenced(page);
2668     } else if (!PageReferenced(page)) {
2669     SetPageReferenced(page);
2670     + if (PageLRU(page))
2671     + inc_readahead_aging();
2672     }
2673     }
2674    
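
The inc_readahead_aging() hook added above (and its twin in mm/vmscan.c below) feeds
the per-CPU readahead_aging counters that node_readahead_aging() sums up; from those,
compute_thrashing_threshold() in the mm/readahead.c part of this patch derives its
estimate out of stream_shift, global_size and global_shift. The user-space sketch
below merely reruns that arithmetic with invented numbers (512MB worth of free plus
inactive 4KB pages, readahead_ratio taken as 50 for the example) to show the order of
magnitude involved; it is an illustration, not kernel code.

    /*
     * Worked example of the estimate in compute_thrashing_threshold():
     *
     *   ra_size ~= stream_shift * (global_size >> 9) * readahead_ratio * 5
     *              ------------------------------------------------------
     *                                  global_shift
     *
     * i.e. roughly (readahead_ratio/100) * stream_shift * global_size /
     * global_shift, since 5/512 is about 1/102.  All numbers are invented.
     */
    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
    	uint64_t stream_shift = 256;	/* pages this stream read since the last estimate */
    	uint64_t global_size = 131072;	/* free + inactive pages (512MB of 4KB pages) */
    	uint64_t global_shift = 8192;	/* pages aged on this node in the meantime */
    	unsigned int readahead_ratio = 50;	/* example value of the sysctl */

    	uint64_t ra_size = stream_shift * (global_size >> 9)
    				* readahead_ratio * 5 / global_shift;

    	/* remaining safe space, assuming global_size > global_shift
    	 * as in the kernel code's main branch */
    	uint64_t remain = stream_shift * (global_size - global_shift) / global_shift;

    	printf("estimated thrashing threshold: %llu pages\n",
    	       (unsigned long long)ra_size);
    	printf("remaining safe space: %llu pages\n",
    	       (unsigned long long)remain);
    	return 0;
    }

With these inputs the estimate comes out to 2000 pages, with roughly 3840 pages of
remaining safe space; heavier aging (a larger global_shift) shrinks both values.
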
2675     Index: linux-2.6.16-ck1/mm/vmscan.c
2676     ===================================================================
2677     --- linux-2.6.16-ck1.orig/mm/vmscan.c 2006-03-20 20:47:00.000000000 +1100
2678     +++ linux-2.6.16-ck1/mm/vmscan.c 2006-03-20 20:47:04.000000000 +1100
2679     @@ -458,6 +458,9 @@ static int shrink_list(struct list_head
2680     if (PageWriteback(page))
2681     goto keep_locked;
2682    
2683     + if (!PageReferenced(page))
2684     + inc_readahead_aging();
2685     +
2686     referenced = page_referenced(page, 1);
2687     /* In active use or really unfreeable? Activate it. */
2688     if (referenced && page_mapping_inuse(page))
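
Finally, a toy model of the look-ahead pipelining that the mm/readahead.c additions
implement: __do_page_cache_readahead() tags the page at offset
nr_to_read - lookahead_size with PG_readahead, and page_cache_readahead_adaptive() is
invoked again when the reader reaches the tagged page, so the next chunk is submitted
before the current one is exhausted. The simulation below is a deliberately
simplified, stand-alone sketch; the arrays, sizes and the window-doubling rule are
invented for illustration (the real logic sizes the next chunk from the thrashing
threshold).

    /*
     * Toy model of PG_readahead look-ahead pipelining: each chunk tags one
     * page, and hitting the tagged page triggers the next chunk.
     */
    #include <stdio.h>

    #define FILE_PAGES 64

    static int cached[FILE_PAGES];	/* 1 if the page is already "in cache" */
    static int lookahead[FILE_PAGES];	/* 1 if the page carries the look-ahead mark */

    /* Read in @nr pages at @start and tag the look-ahead page, like
     * __do_page_cache_readahead() with a nonzero lookahead_size. */
    static void do_readahead(int start, int nr, int la_size)
    {
    	int i;

    	for (i = start; i < start + nr && i < FILE_PAGES; i++)
    		cached[i] = 1;
    	if (start + nr - la_size < FILE_PAGES)
    		lookahead[start + nr - la_size] = 1;
    	printf("readahead: pages %d-%d, mark at %d\n",
    	       start, start + nr - 1, start + nr - la_size);
    }

    int main(void)
    {
    	int index, ra_size = 8, la_size = 4;

    	do_readahead(0, ra_size, la_size);	/* initial chunk */

    	for (index = 0; index < 32; index++) {
    		if (!cached[index])
    			printf("cache miss at %d (should not happen here)\n", index);
    		if (lookahead[index]) {		/* look-ahead mark hit */
    			lookahead[index] = 0;
    			ra_size *= 2;	/* grow the window; the real logic uses the
    					 * thrashing threshold instead */
    			do_readahead(index + la_size, ra_size, la_size);
    		}
    	}
    	return 0;
    }

Running it shows each chunk being issued exactly at the look-ahead mark of the previous
one, with no cache misses in between, which is the pipelining effect the patch aims for.
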