Magellan Linux

Contents of /trunk/kernel26-magellan/patches-2.6.16-r10/0029-2.6.16-adaptive-readahead-11.patch



Revision 70 - Thu May 11 19:09:22 2006 UTC by niro
File size: 82008 byte(s)
Log message: import

1 ---
2 Documentation/sysctl/vm.txt | 36 +
3 drivers/block/loop.c | 6
4 fs/mpage.c | 4
5 fs/nfsd/vfs.c | 6
6 include/linux/fs.h | 41 -
7 include/linux/mm.h | 31
8 include/linux/page-flags.h | 5
9 include/linux/radix-tree.h | 82 ++
10 include/linux/sysctl.h | 2
11 include/linux/writeback.h | 6
12 kernel/sysctl.c | 28
13 lib/radix-tree.c | 208 +++++-
14 mm/Kconfig | 55 +
15 mm/filemap.c | 86 ++
16 mm/memory.c | 1
17 mm/page-writeback.c | 2
18 mm/page_alloc.c | 2
19 mm/readahead.c | 1519 +++++++++++++++++++++++++++++++++++++++++++-
20 mm/swap.c | 2
21 mm/vmscan.c | 3
22 20 files changed, 2062 insertions(+), 63 deletions(-)
23
24 Index: linux-2.6.16-ck1/Documentation/sysctl/vm.txt
25 ===================================================================
26 --- linux-2.6.16-ck1.orig/Documentation/sysctl/vm.txt 2006-03-20 20:47:01.000000000 +1100
27 +++ linux-2.6.16-ck1/Documentation/sysctl/vm.txt 2006-03-20 20:47:04.000000000 +1100
28 @@ -30,6 +30,8 @@ Currently, these files are in /proc/sys/
29 - zone_reclaim_mode
30 - zone_reclaim_interval
31 - swap_prefetch
32 +- readahead_ratio
33 +- readahead_hit_rate
34
35 ==============================================================
36
37 @@ -204,3 +206,37 @@ swap_prefetch unset and then it is enabl
38 prefetched.
39
40 The default value is 1.
41 +
42 +==============================================================
43 +
44 +readahead_ratio
45 +
46 +This limits readahead size to a percentage of the thrashing-threshold,
47 +which is dynamically estimated from the _past_ read speed and system
48 +load, in order to deduce the _future_ readahead request size.
49 +
50 +Set it to a smaller value if there is not enough memory for all the
51 +concurrent readers, or if the I/O load fluctuates a lot. But if there is
52 +plenty of memory (>2MB per reader), enlarging it may help speed up reads.
53 +
54 +readahead_ratio also selects the readahead logic:
55 +0: disable readahead totally
56 +1-9: select the stock readahead logic
57 +10-inf: select the adaptive readahead logic
58 +
59 +The default value is 50; reasonable values would be 50-100.
60 +
61 +==============================================================
62 +
63 +readahead_hit_rate
64 +
65 +This is the maximum allowed value of the (readahead-pages : accessed-pages)
66 +ratio. It is useful only when (readahead_ratio >= 10). If the previous
67 +readahead request had a poor hit rate, the kernel will be reluctant to
68 +issue the next readahead.
69 +
70 +A larger value helps catch more sparse access patterns. Be aware that
71 +readahead of sparse patterns trades memory for speed.
72 +
73 +The default value is 2.
74 +It is recommended to keep the value below (max-readahead-pages / 8).
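
As a rough illustration (editorial sketch, not part of the patch): once the patch is applied and CONFIG_ADAPTIVE_READAHEAD is enabled, the two tunables above appear under /proc/sys/vm. A small userspace helper could select the adaptive logic and set the hit-rate limit as follows, using only the paths and value ranges documented above:

#include <stdio.h>

/* Sketch only: write one integer to a sysctl file under /proc/sys. */
static int write_sysctl(const char *path, int value)
{
        FILE *f = fopen(path, "w");

        if (!f)
                return -1;
        fprintf(f, "%d\n", value);
        return fclose(f);
}

int main(void)
{
        /* >= 10 selects the adaptive readahead logic; 50-100 is reasonable */
        write_sysctl("/proc/sys/vm/readahead_ratio", 50);
        /* keep this below (max-readahead-pages / 8), as recommended above */
        write_sysctl("/proc/sys/vm/readahead_hit_rate", 2);
        return 0;
}
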
75 Index: linux-2.6.16-ck1/drivers/block/loop.c
76 ===================================================================
77 --- linux-2.6.16-ck1.orig/drivers/block/loop.c 2006-03-20 20:46:23.000000000 +1100
78 +++ linux-2.6.16-ck1/drivers/block/loop.c 2006-03-20 20:47:04.000000000 +1100
79 @@ -779,6 +779,12 @@ static int loop_set_fd(struct loop_devic
80 mapping = file->f_mapping;
81 inode = mapping->host;
82
83 + /*
84 + * The upper layer should already do proper look-ahead,
85 + * one more look-ahead here only ruins the cache hit rate.
86 + */
87 + file->f_ra.flags |= RA_FLAG_NO_LOOKAHEAD;
88 +
89 if (!(file->f_mode & FMODE_WRITE))
90 lo_flags |= LO_FLAGS_READ_ONLY;
91
92 Index: linux-2.6.16-ck1/fs/mpage.c
93 ===================================================================
94 --- linux-2.6.16-ck1.orig/fs/mpage.c 2006-03-20 20:46:23.000000000 +1100
95 +++ linux-2.6.16-ck1/fs/mpage.c 2006-03-20 20:47:04.000000000 +1100
96 @@ -343,8 +343,10 @@ mpage_readpages(struct address_space *ma
97 bio = do_mpage_readpage(bio, page,
98 nr_pages - page_idx,
99 &last_block_in_bio, get_block);
100 - if (!pagevec_add(&lru_pvec, page))
101 + if (!pagevec_add(&lru_pvec, page)) {
102 + cond_resched();
103 __pagevec_lru_add(&lru_pvec);
104 + }
105 } else {
106 page_cache_release(page);
107 }
108 Index: linux-2.6.16-ck1/fs/nfsd/vfs.c
109 ===================================================================
110 --- linux-2.6.16-ck1.orig/fs/nfsd/vfs.c 2006-03-20 20:46:23.000000000 +1100
111 +++ linux-2.6.16-ck1/fs/nfsd/vfs.c 2006-03-20 20:47:04.000000000 +1100
112 @@ -833,10 +833,14 @@ nfsd_vfs_read(struct svc_rqst *rqstp, st
113 #endif
114
115 /* Get readahead parameters */
116 - ra = nfsd_get_raparms(inode->i_sb->s_dev, inode->i_ino);
117 + if (prefer_adaptive_readahead())
118 + ra = NULL;
119 + else
120 + ra = nfsd_get_raparms(inode->i_sb->s_dev, inode->i_ino);
121
122 if (ra && ra->p_set)
123 file->f_ra = ra->p_ra;
124 + file->f_ra.flags |= RA_FLAG_NFSD;
125
126 if (file->f_op->sendfile) {
127 svc_pushback_unused_pages(rqstp);
128 Index: linux-2.6.16-ck1/include/linux/fs.h
129 ===================================================================
130 --- linux-2.6.16-ck1.orig/include/linux/fs.h 2006-03-20 20:46:23.000000000 +1100
131 +++ linux-2.6.16-ck1/include/linux/fs.h 2006-03-20 20:47:04.000000000 +1100
132 @@ -600,19 +600,40 @@ struct fown_struct {
133 * Track a single file's readahead state
134 */
135 struct file_ra_state {
136 - unsigned long start; /* Current window */
137 - unsigned long size;
138 - unsigned long flags; /* ra flags RA_FLAG_xxx*/
139 - unsigned long cache_hit; /* cache hit count*/
140 - unsigned long prev_page; /* Cache last read() position */
141 - unsigned long ahead_start; /* Ahead window */
142 - unsigned long ahead_size;
143 - unsigned long ra_pages; /* Maximum readahead window */
144 - unsigned long mmap_hit; /* Cache hit stat for mmap accesses */
145 - unsigned long mmap_miss; /* Cache miss stat for mmap accesses */
146 + union {
147 + struct { /* conventional read-ahead */
148 + unsigned long start; /* Current window */
149 + unsigned long size;
150 + unsigned long ahead_start; /* Ahead window */
151 + unsigned long ahead_size;
152 + unsigned long cache_hit; /* cache hit count */
153 + };
154 +#ifdef CONFIG_ADAPTIVE_READAHEAD
155 + struct { /* adaptive read-ahead */
156 + pgoff_t la_index;
157 + pgoff_t ra_index;
158 + pgoff_t lookahead_index;
159 + pgoff_t readahead_index;
160 + unsigned long age;
161 + uint64_t cache_hits;
162 + };
163 +#endif
164 + };
165 +
166 + /* mmap read-around */
167 + unsigned long mmap_hit; /* Cache hit stat for mmap accesses */
168 + unsigned long mmap_miss; /* Cache miss stat for mmap accesses */
169 +
170 + /* common ones */
171 + unsigned long flags; /* ra flags RA_FLAG_xxx*/
172 + unsigned long prev_page; /* Cache last read() position */
173 + unsigned long ra_pages; /* Maximum readahead window */
174 };
175 #define RA_FLAG_MISS 0x01 /* a cache miss occured against this file */
176 #define RA_FLAG_INCACHE 0x02 /* file is already in cache */
177 +#define RA_FLAG_MMAP (1UL<<31) /* mmaped page access */
178 +#define RA_FLAG_NO_LOOKAHEAD (1UL<<30) /* disable look-ahead */
179 +#define RA_FLAG_NFSD (1UL<<29) /* request from nfsd */
180
181 struct file {
182 /*
183 Index: linux-2.6.16-ck1/include/linux/mm.h
184 ===================================================================
185 --- linux-2.6.16-ck1.orig/include/linux/mm.h 2006-03-20 20:46:23.000000000 +1100
186 +++ linux-2.6.16-ck1/include/linux/mm.h 2006-03-20 20:47:04.000000000 +1100
187 @@ -954,7 +954,11 @@ extern int filemap_populate(struct vm_ar
188 int write_one_page(struct page *page, int wait);
189
190 /* readahead.c */
191 +#ifdef CONFIG_ADAPTIVE_READAHEAD
192 +#define VM_MAX_READAHEAD 1024 /* kbytes */
193 +#else
194 #define VM_MAX_READAHEAD 128 /* kbytes */
195 +#endif
196 #define VM_MIN_READAHEAD 16 /* kbytes (includes current page) */
197 #define VM_MAX_CACHE_HIT 256 /* max pages in a row in cache before
198 * turning readahead off */
199 @@ -971,6 +975,33 @@ unsigned long page_cache_readahead(struc
200 void handle_ra_miss(struct address_space *mapping,
201 struct file_ra_state *ra, pgoff_t offset);
202 unsigned long max_sane_readahead(unsigned long nr);
203 +unsigned long
204 +page_cache_readahead_adaptive(struct address_space *mapping,
205 + struct file_ra_state *ra, struct file *filp,
206 + struct page *prev_page, struct page *page,
207 + pgoff_t first_index, pgoff_t index, pgoff_t last_index);
208 +
209 +#ifdef CONFIG_ADAPTIVE_READAHEAD
210 +void fastcall readahead_cache_hit(struct file_ra_state *ra, struct page *page);
211 +extern int readahead_ratio;
212 +#else
213 +#define readahead_cache_hit(ra, page) do { } while (0)
214 +#define readahead_ratio 1
215 +#endif /* CONFIG_ADAPTIVE_READAHEAD */
216 +
217 +static inline int prefer_adaptive_readahead(void)
218 +{
219 + return readahead_ratio >= 10;
220 +}
221 +
222 +DECLARE_PER_CPU(unsigned long, readahead_aging);
223 +static inline void inc_readahead_aging(void)
224 +{
225 + if (prefer_adaptive_readahead()) {
226 + per_cpu(readahead_aging, get_cpu())++;
227 + put_cpu();
228 + }
229 +}
230
231 /* Do stack extension */
232 extern int expand_stack(struct vm_area_struct *vma, unsigned long address);
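
For orientation, a condensed sketch (illustration only, not part of the patch) of how a read path is expected to pick between the stock and the adaptive logic using the hooks declared above. The helper name and surrounding variables are placeholders; the real wiring is done later in this patch by do_generic_mapping_read() and filemap_nopage() in mm/filemap.c, with error handling and reference bookkeeping that are omitted here:

#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/pagemap.h>

/* Condensed sketch of the dual readahead call pattern (hypothetical helper). */
static struct page *readahead_and_find(struct address_space *mapping,
                                       struct file_ra_state *ra,
                                       struct file *filp,
                                       struct page *prev_page,
                                       pgoff_t first_index, pgoff_t index,
                                       pgoff_t last_index)
{
        struct page *page;

        if (!prefer_adaptive_readahead()) {
                /* readahead_ratio < 10: stock logic, unchanged */
                page_cache_readahead(mapping, ra, filp, index,
                                     last_index - index);
                return find_get_page(mapping, index);
        }

        page = find_get_page(mapping, index);
        if (page == NULL) {
                /* cache miss: submit read-ahead now, then retry the lookup */
                page_cache_readahead_adaptive(mapping, ra, filp, prev_page,
                                              NULL, first_index, index,
                                              last_index);
                page = find_get_page(mapping, index);
        } else if (PageReadahead(page)) {
                /* PG_readahead mark hit: pipeline the next chunk early */
                page_cache_readahead_adaptive(mapping, ra, filp, prev_page,
                                              page, first_index, index,
                                              last_index);
        }
        return page;
}
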
233 Index: linux-2.6.16-ck1/include/linux/page-flags.h
234 ===================================================================
235 --- linux-2.6.16-ck1.orig/include/linux/page-flags.h 2006-03-20 20:46:23.000000000 +1100
236 +++ linux-2.6.16-ck1/include/linux/page-flags.h 2006-03-20 20:47:04.000000000 +1100
237 @@ -75,6 +75,7 @@
238 #define PG_reclaim 17 /* To be reclaimed asap */
239 #define PG_nosave_free 18 /* Free, should not be written */
240 #define PG_uncached 19 /* Page has been mapped as uncached */
241 +#define PG_readahead 20 /* Reminder to do readahead */
242
243 /*
244 * Global page accounting. One instance per CPU. Only unsigned longs are
245 @@ -344,6 +345,10 @@ extern void __mod_page_state_offset(unsi
246 #define SetPageUncached(page) set_bit(PG_uncached, &(page)->flags)
247 #define ClearPageUncached(page) clear_bit(PG_uncached, &(page)->flags)
248
249 +#define PageReadahead(page) test_bit(PG_readahead, &(page)->flags)
250 +#define __SetPageReadahead(page) __set_bit(PG_readahead, &(page)->flags)
251 +#define TestClearPageReadahead(page) test_and_clear_bit(PG_readahead, &(page)->flags)
252 +
253 struct page; /* forward declaration */
254
255 int test_clear_page_dirty(struct page *page);
256 Index: linux-2.6.16-ck1/include/linux/radix-tree.h
257 ===================================================================
258 --- linux-2.6.16-ck1.orig/include/linux/radix-tree.h 2006-03-20 20:46:23.000000000 +1100
259 +++ linux-2.6.16-ck1/include/linux/radix-tree.h 2006-03-20 20:47:04.000000000 +1100
260 @@ -23,12 +23,24 @@
261 #include <linux/preempt.h>
262 #include <linux/types.h>
263
264 +#define RADIX_TREE_MAP_SHIFT 6
265 +#define RADIX_TREE_MAP_SIZE (1UL << RADIX_TREE_MAP_SHIFT)
266 +#define RADIX_TREE_MAP_MASK (RADIX_TREE_MAP_SIZE-1)
267 +
268 struct radix_tree_root {
269 unsigned int height;
270 gfp_t gfp_mask;
271 struct radix_tree_node *rnode;
272 };
273
274 +/*
275 + * Lookaside cache to support access patterns with strong locality.
276 + */
277 +struct radix_tree_cache {
278 + unsigned long first_index;
279 + struct radix_tree_node *tree_node;
280 +};
281 +
282 #define RADIX_TREE_INIT(mask) { \
283 .height = 0, \
284 .gfp_mask = (mask), \
285 @@ -46,9 +58,18 @@ do { \
286 } while (0)
287
288 int radix_tree_insert(struct radix_tree_root *, unsigned long, void *);
289 -void *radix_tree_lookup(struct radix_tree_root *, unsigned long);
290 -void **radix_tree_lookup_slot(struct radix_tree_root *, unsigned long);
291 +void *radix_tree_lookup_node(struct radix_tree_root *, unsigned long,
292 + unsigned int);
293 +void **radix_tree_lookup_slot(struct radix_tree_root *root, unsigned long);
294 void *radix_tree_delete(struct radix_tree_root *, unsigned long);
295 +unsigned int radix_tree_cache_count(struct radix_tree_cache *cache);
296 +void *radix_tree_cache_lookup_node(struct radix_tree_root *root,
297 + struct radix_tree_cache *cache,
298 + unsigned long index, unsigned int level);
299 +unsigned long radix_tree_scan_hole_backward(struct radix_tree_root *root,
300 + unsigned long index, unsigned long max_scan);
301 +unsigned long radix_tree_scan_hole(struct radix_tree_root *root,
302 + unsigned long index, unsigned long max_scan);
303 unsigned int
304 radix_tree_gang_lookup(struct radix_tree_root *root, void **results,
305 unsigned long first_index, unsigned int max_items);
306 @@ -70,4 +91,61 @@ static inline void radix_tree_preload_en
307 preempt_enable();
308 }
309
310 +/**
311 + * radix_tree_lookup - perform lookup operation on a radix tree
312 + * @root: radix tree root
313 + * @index: index key
314 + *
315 + * Lookup the item at the position @index in the radix tree @root.
316 + */
317 +static inline void *radix_tree_lookup(struct radix_tree_root *root,
318 + unsigned long index)
319 +{
320 + return radix_tree_lookup_node(root, index, 0);
321 +}
322 +
323 +/**
324 + * radix_tree_cache_init - init a look-aside cache
325 + * @cache: look-aside cache
326 + *
327 + * Init the radix tree look-aside cache @cache.
328 + */
329 +static inline void radix_tree_cache_init(struct radix_tree_cache *cache)
330 +{
331 + cache->first_index = RADIX_TREE_MAP_MASK;
332 + cache->tree_node = NULL;
333 +}
334 +
335 +/**
336 + * radix_tree_cache_lookup - cached lookup on a radix tree
337 + * @root: radix tree root
338 + * @cache: look-aside cache
339 + * @index: index key
340 + *
341 + * Lookup the item at the position @index in the radix tree @root,
342 + * and make use of @cache to speedup the lookup process.
343 + */
344 +static inline void *radix_tree_cache_lookup(struct radix_tree_root *root,
345 + struct radix_tree_cache *cache,
346 + unsigned long index)
347 +{
348 + return radix_tree_cache_lookup_node(root, cache, index, 0);
349 +}
350 +
351 +static inline unsigned int radix_tree_cache_size(struct radix_tree_cache *cache)
352 +{
353 + return RADIX_TREE_MAP_SIZE;
354 +}
355 +
356 +static inline int radix_tree_cache_full(struct radix_tree_cache *cache)
357 +{
358 + return radix_tree_cache_count(cache) == radix_tree_cache_size(cache);
359 +}
360 +
361 +static inline unsigned long
362 +radix_tree_cache_first_index(struct radix_tree_cache *cache)
363 +{
364 + return cache->first_index;
365 +}
366 +
367 #endif /* _LINUX_RADIX_TREE_H */
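
A hedged usage sketch (illustration only, not part of the patch) of the look-aside cache introduced above. The caller below is hypothetical, but it follows the stated contract: the cache is initialised once and then used only while mapping->tree_lock is held, per the NOTE at radix_tree_cache_lookup_node() in lib/radix-tree.c further down in this patch:

#include <linux/fs.h>
#include <linux/radix-tree.h>

/* Sketch: count how many pages in [start, start + nr) are present in the
 * page cache.  The look-aside cache avoids a full tree descent per index
 * when consecutive indices fall into the same bottom-level node. */
static unsigned long count_cached_pages(struct address_space *mapping,
                                        unsigned long start, unsigned long nr)
{
        struct radix_tree_cache cache;
        unsigned long index;
        unsigned long present = 0;

        radix_tree_cache_init(&cache);
        for (index = start; index < start + nr; index++)
                if (radix_tree_cache_lookup(&mapping->page_tree,
                                            &cache, index))
                        present++;
        return present;
}
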
368 Index: linux-2.6.16-ck1/include/linux/sysctl.h
369 ===================================================================
370 --- linux-2.6.16-ck1.orig/include/linux/sysctl.h 2006-03-20 20:46:56.000000000 +1100
371 +++ linux-2.6.16-ck1/include/linux/sysctl.h 2006-03-20 20:47:04.000000000 +1100
372 @@ -191,6 +191,8 @@ enum
373 VM_ZONE_RECLAIM_INTERVAL=32, /* time period to wait after reclaim failure */
374 VM_SWAP_PREFETCH=33, /* swap prefetch */
375 VM_HARDMAPLIMIT=34, /* Make mapped a hard limit */
376 + VM_READAHEAD_RATIO=35, /* percent of read-ahead size to thrashing-threshold */
377 + VM_READAHEAD_HIT_RATE=36, /* one accessed page legitimizes so many read-ahead pages */
378 };
379
380
381 Index: linux-2.6.16-ck1/include/linux/writeback.h
382 ===================================================================
383 --- linux-2.6.16-ck1.orig/include/linux/writeback.h 2006-03-20 20:46:23.000000000 +1100
384 +++ linux-2.6.16-ck1/include/linux/writeback.h 2006-03-20 20:47:04.000000000 +1100
385 @@ -85,6 +85,12 @@ void laptop_io_completion(void);
386 void laptop_sync_completion(void);
387 void throttle_vm_writeout(void);
388
389 +extern struct timer_list laptop_mode_wb_timer;
390 +static inline int laptop_spinned_down(void)
391 +{
392 + return !timer_pending(&laptop_mode_wb_timer);
393 +}
394 +
395 /* These are exported to sysctl. */
396 extern int dirty_background_ratio;
397 extern int vm_dirty_ratio;
398 Index: linux-2.6.16-ck1/kernel/sysctl.c
399 ===================================================================
400 --- linux-2.6.16-ck1.orig/kernel/sysctl.c 2006-03-20 20:46:56.000000000 +1100
401 +++ linux-2.6.16-ck1/kernel/sysctl.c 2006-03-20 20:47:04.000000000 +1100
402 @@ -74,6 +74,12 @@ extern int pid_max_min, pid_max_max;
403 extern int sysctl_drop_caches;
404 extern int percpu_pagelist_fraction;
405
406 +#if defined(CONFIG_ADAPTIVE_READAHEAD)
407 +extern int readahead_ratio;
408 +extern int readahead_hit_rate;
409 +static int one = 1;
410 +#endif
411 +
412 #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86)
413 int unknown_nmi_panic;
414 extern int proc_unknown_nmi_panic(ctl_table *, int, struct file *,
415 @@ -961,6 +967,28 @@ static ctl_table vm_table[] = {
416 .proc_handler = &proc_dointvec,
417 },
418 #endif
419 +#ifdef CONFIG_ADAPTIVE_READAHEAD
420 + {
421 + .ctl_name = VM_READAHEAD_RATIO,
422 + .procname = "readahead_ratio",
423 + .data = &readahead_ratio,
424 + .maxlen = sizeof(readahead_ratio),
425 + .mode = 0644,
426 + .proc_handler = &proc_dointvec,
427 + .strategy = &sysctl_intvec,
428 + .extra1 = &zero,
429 + },
430 + {
431 + .ctl_name = VM_READAHEAD_HIT_RATE,
432 + .procname = "readahead_hit_rate",
433 + .data = &readahead_hit_rate,
434 + .maxlen = sizeof(readahead_hit_rate),
435 + .mode = 0644,
436 + .proc_handler = &proc_dointvec,
437 + .strategy = &sysctl_intvec,
438 + .extra1 = &one,
439 + },
440 +#endif
441 { .ctl_name = 0 }
442 };
443
444 Index: linux-2.6.16-ck1/lib/radix-tree.c
445 ===================================================================
446 --- linux-2.6.16-ck1.orig/lib/radix-tree.c 2006-03-20 20:46:23.000000000 +1100
447 +++ linux-2.6.16-ck1/lib/radix-tree.c 2006-03-20 20:47:04.000000000 +1100
448 @@ -32,16 +32,7 @@
449 #include <linux/bitops.h>
450
451
452 -#ifdef __KERNEL__
453 -#define RADIX_TREE_MAP_SHIFT 6
454 -#else
455 -#define RADIX_TREE_MAP_SHIFT 3 /* For more stressful testing */
456 -#endif
457 #define RADIX_TREE_TAGS 2
458 -
459 -#define RADIX_TREE_MAP_SIZE (1UL << RADIX_TREE_MAP_SHIFT)
460 -#define RADIX_TREE_MAP_MASK (RADIX_TREE_MAP_SIZE-1)
461 -
462 #define RADIX_TREE_TAG_LONGS \
463 ((RADIX_TREE_MAP_SIZE + BITS_PER_LONG - 1) / BITS_PER_LONG)
464
465 @@ -286,32 +277,89 @@ int radix_tree_insert(struct radix_tree_
466 }
467 EXPORT_SYMBOL(radix_tree_insert);
468
469 -static inline void **__lookup_slot(struct radix_tree_root *root,
470 - unsigned long index)
471 +/**
472 + * radix_tree_lookup_node - low level lookup routine
473 + * @root: radix tree root
474 + * @index: index key
475 + * @level: stop at that many levels from the tree leaf
476 + *
477 + * Lookup the item at the position @index in the radix tree @root.
478 + * The return value is:
479 + * @level == 0: page at @index;
480 + * @level == 1: the corresponding bottom level tree node;
481 + * @level < height: (@level-1)th parent node of the bottom node
482 + * that contains @index;
483 + * @level >= height: the root node.
484 + */
485 +void *radix_tree_lookup_node(struct radix_tree_root *root,
486 + unsigned long index, unsigned int level)
487 {
488 unsigned int height, shift;
489 - struct radix_tree_node **slot;
490 + struct radix_tree_node *slot;
491
492 height = root->height;
493 if (index > radix_tree_maxindex(height))
494 return NULL;
495
496 shift = (height-1) * RADIX_TREE_MAP_SHIFT;
497 - slot = &root->rnode;
498 + slot = root->rnode;
499
500 - while (height > 0) {
501 - if (*slot == NULL)
502 + while (height > level) {
503 + if (slot == NULL)
504 return NULL;
505
506 - slot = (struct radix_tree_node **)
507 - ((*slot)->slots +
508 - ((index >> shift) & RADIX_TREE_MAP_MASK));
509 + slot = slot->slots[(index >> shift) & RADIX_TREE_MAP_MASK];
510 shift -= RADIX_TREE_MAP_SHIFT;
511 height--;
512 }
513
514 - return (void **)slot;
515 + return slot;
516 +}
517 +EXPORT_SYMBOL(radix_tree_lookup_node);
518 +
519 +/**
520 + * radix_tree_cache_lookup_node - cached lookup node
521 + * @root: radix tree root
522 + * @cache: look-aside cache
523 + * @index: index key
524 + *
525 + * Lookup the item at the position @index in the radix tree @root,
526 + * and return the node @level levels from the bottom in the search path.
527 + *
528 + * @cache stores the last accessed upper level tree node by this
529 + * function, and is always checked first before searching in the tree.
530 + * It can improve speed for access patterns with strong locality.
531 + *
532 + * NOTE:
533 + * - The cache becomes invalid on leaving the lock;
534 + * - Do not intermix calls with different @level.
535 + */
536 +void *radix_tree_cache_lookup_node(struct radix_tree_root *root,
537 + struct radix_tree_cache *cache,
538 + unsigned long index, unsigned int level)
539 +{
540 + struct radix_tree_node *node;
541 + unsigned long i;
542 + unsigned long mask;
543 +
544 + if (level >= root->height)
545 + return root->rnode;
546 +
547 + i = ((index >> (level * RADIX_TREE_MAP_SHIFT)) & RADIX_TREE_MAP_MASK);
548 + mask = ~((RADIX_TREE_MAP_SIZE << (level * RADIX_TREE_MAP_SHIFT)) - 1);
549 +
550 + if ((index & mask) == cache->first_index)
551 + return cache->tree_node->slots[i];
552 +
553 + node = radix_tree_lookup_node(root, index, level + 1);
554 + if (!node)
555 + return 0;
556 +
557 + cache->tree_node = node;
558 + cache->first_index = (index & mask);
559 + return node->slots[i];
560 }
561 +EXPORT_SYMBOL(radix_tree_cache_lookup_node);
562
563 /**
564 * radix_tree_lookup_slot - lookup a slot in a radix tree
565 @@ -323,25 +371,131 @@ static inline void **__lookup_slot(struc
566 */
567 void **radix_tree_lookup_slot(struct radix_tree_root *root, unsigned long index)
568 {
569 - return __lookup_slot(root, index);
570 + struct radix_tree_node *node;
571 +
572 + node = radix_tree_lookup_node(root, index, 1);
573 + return node->slots + (index & RADIX_TREE_MAP_MASK);
574 }
575 EXPORT_SYMBOL(radix_tree_lookup_slot);
576
577 /**
578 - * radix_tree_lookup - perform lookup operation on a radix tree
579 + * radix_tree_cache_count - items in the cached node
580 + * @cache: radix tree look-aside cache
581 + *
582 + * Query the number of items contained in the cached node.
583 + */
584 +unsigned int radix_tree_cache_count(struct radix_tree_cache *cache)
585 +{
586 + if (!(cache->first_index & RADIX_TREE_MAP_MASK))
587 + return cache->tree_node->count;
588 + else
589 + return 0;
590 +}
591 +EXPORT_SYMBOL(radix_tree_cache_count);
592 +
593 +/**
594 + * radix_tree_scan_hole_backward - scan backward for hole
595 * @root: radix tree root
596 * @index: index key
597 + * @max_scan: advice on max items to scan (it may scan a little more)
598 *
599 - * Lookup the item at the position @index in the radix tree @root.
600 + * Scan backward from @index for a hole/empty item, stop when
601 + * - hit hole
602 + * - @max_scan or more items scanned
603 + * - hit index 0
604 + *
605 + * Return the corresponding index.
606 */
607 -void *radix_tree_lookup(struct radix_tree_root *root, unsigned long index)
608 +unsigned long radix_tree_scan_hole_backward(struct radix_tree_root *root,
609 + unsigned long index, unsigned long max_scan)
610 {
611 - void **slot;
612 + struct radix_tree_cache cache;
613 + struct radix_tree_node *node;
614 + unsigned long origin;
615 + int i;
616 +
617 + origin = index;
618 + radix_tree_cache_init(&cache);
619 +
620 + while (origin - index < max_scan) {
621 + node = radix_tree_cache_lookup_node(root, &cache, index, 1);
622 + if (!node)
623 + break;
624 +
625 + if (node->count == RADIX_TREE_MAP_SIZE) {
626 + index = (index - RADIX_TREE_MAP_SIZE) |
627 + RADIX_TREE_MAP_MASK;
628 + goto check_underflow;
629 + }
630 +
631 + for (i = index & RADIX_TREE_MAP_MASK; i >= 0; i--, index--) {
632 + if (!node->slots[i])
633 + goto out;
634 + }
635 +
636 +check_underflow:
637 + if (unlikely(index == ULONG_MAX)) {
638 + index = 0;
639 + break;
640 + }
641 + }
642 +
643 +out:
644 + return index;
645 +}
646 +EXPORT_SYMBOL(radix_tree_scan_hole_backward);
647
648 - slot = __lookup_slot(root, index);
649 - return slot != NULL ? *slot : NULL;
650 +/**
651 + * radix_tree_scan_hole - scan for hole
652 + * @root: radix tree root
653 + * @index: index key
654 + * @max_scan: advice on max items to scan (it may scan a little more)
655 + *
656 + * Scan forward from @index for a hole/empty item, stop when
657 + * - hit hole
658 + * - hit EOF
659 + * - hit index ULONG_MAX
660 + * - @max_scan or more items scanned
661 + *
662 + * Return the corresponding index.
663 + */
664 +unsigned long radix_tree_scan_hole(struct radix_tree_root *root,
665 + unsigned long index, unsigned long max_scan)
666 +{
667 + struct radix_tree_cache cache;
668 + struct radix_tree_node *node;
669 + unsigned long origin;
670 + int i;
671 +
672 + origin = index;
673 + radix_tree_cache_init(&cache);
674 +
675 + while (index - origin < max_scan) {
676 + node = radix_tree_cache_lookup_node(root, &cache, index, 1);
677 + if (!node)
678 + break;
679 +
680 + if (node->count == RADIX_TREE_MAP_SIZE) {
681 + index = (index | RADIX_TREE_MAP_MASK) + 1;
682 + goto check_overflow;
683 + }
684 +
685 + for (i = index & RADIX_TREE_MAP_MASK; i < RADIX_TREE_MAP_SIZE;
686 + i++, index++) {
687 + if (!node->slots[i])
688 + goto out;
689 + }
690 +
691 +check_overflow:
692 + if (unlikely(!index)) {
693 + index = ULONG_MAX;
694 + break;
695 + }
696 + }
697 +out:
698 + return index;
699 }
700 -EXPORT_SYMBOL(radix_tree_lookup);
701 +EXPORT_SYMBOL(radix_tree_scan_hole);
702
703 /**
704 * radix_tree_tag_set - set a tag on a radix tree node
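
Another hedged sketch (illustration only, not part of the patch): the hole-scanning helpers above let a caller measure how far a contiguous run of cached pages extends, which is the kind of page-cache query the context-based read-ahead in mm/readahead.c relies on. The helper name is hypothetical; the caller is assumed to hold mapping->tree_lock (read side suffices):

#include <linux/fs.h>
#include <linux/radix-tree.h>

/* Sketch: number of consecutively cached pages starting at 'index',
 * scanning at most about 'max_scan' items.  Returns 0 if 'index' itself
 * is not cached. */
static unsigned long cached_run_length(struct address_space *mapping,
                                       unsigned long index,
                                       unsigned long max_scan)
{
        unsigned long hole;

        hole = radix_tree_scan_hole(&mapping->page_tree, index, max_scan);
        return hole - index;
}
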
705 Index: linux-2.6.16-ck1/mm/Kconfig
706 ===================================================================
707 --- linux-2.6.16-ck1.orig/mm/Kconfig 2006-03-20 20:46:23.000000000 +1100
708 +++ linux-2.6.16-ck1/mm/Kconfig 2006-03-20 20:47:04.000000000 +1100
709 @@ -139,3 +139,58 @@ config SPLIT_PTLOCK_CPUS
710 config MIGRATION
711 def_bool y if NUMA || SPARSEMEM || DISCONTIGMEM
712 depends on SWAP
713 +
714 +#
715 +# Adaptive file readahead
716 +#
717 +config ADAPTIVE_READAHEAD
718 + bool "Adaptive file readahead (EXPERIMENTAL)"
719 + default n
720 + depends on EXPERIMENTAL
721 + help
722 + Readahead is a technique employed by the kernel in an attempt
723 + to improve file reading performance. If the kernel has reason
724 + to believe that a particular file is being read sequentially,
725 + it will attempt to read blocks from the file into memory before
726 + the application requests them. When readahead works, it speeds
727 + up the system's throughput, since the reading application does
728 + not have to wait for its requests. When readahead fails, however,
729 + it generates useless I/O and occupies memory pages which are
730 + needed for some other purpose.
731 +
732 + Normally, the kernel uses a stock readahead logic that is well
733 + understood and well tuned. This option enables a much more complex
734 + and feature-rich one. It is more aggressive and memory-efficient in
735 + doing readahead, and supports some less-common access patterns such
736 + as reading backward and reading sparsely. However, due to the great
737 + diversity of real world applications, it might not fit everyone.
738 +
739 + Please refer to Documentation/sysctl/vm.txt for tunable parameters.
740 +
741 + Say Y here if you are building a kernel for file servers.
742 + Say N if you are unsure.
743 +
744 +config DEBUG_READAHEAD
745 + bool "Readahead debug and accounting"
746 + default n
747 + depends on ADAPTIVE_READAHEAD
748 + select DEBUG_FS
749 + help
750 + This option injects extra code to dump detailed debug traces and to
751 + account readahead events.
752 +
753 + To actually get the data:
754 +
755 + mkdir /debug
756 + mount -t debugfs none /debug
757 +
758 + After that you can do the following:
759 +
760 + echo > /debug/readahead/events # reset the counters
761 + cat /debug/readahead/events # check the counters
762 +
763 + echo 1 > /debug/readahead/debug_level # show printk traces
764 + echo 2 > /debug/readahead/debug_level # show verbose printk traces
765 + echo 0 > /debug/readahead/debug_level # stop filling my kern.log
766 +
767 + Say N, unless you have readahead performance problems.
768 Index: linux-2.6.16-ck1/mm/filemap.c
769 ===================================================================
770 --- linux-2.6.16-ck1.orig/mm/filemap.c 2006-03-20 20:46:23.000000000 +1100
771 +++ linux-2.6.16-ck1/mm/filemap.c 2006-03-20 20:47:04.000000000 +1100
772 @@ -42,6 +42,12 @@ static ssize_t
773 generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
774 loff_t offset, unsigned long nr_segs);
775
776 +#ifdef CONFIG_DEBUG_READAHEAD
777 +extern u32 readahead_debug_level;
778 +#else
779 +#define readahead_debug_level 0
780 +#endif /* CONFIG_DEBUG_READAHEAD */
781 +
782 /*
783 * Shared mappings implemented 30.11.1994. It's not fully working yet,
784 * though.
785 @@ -746,10 +752,12 @@ void do_generic_mapping_read(struct addr
786 unsigned long prev_index;
787 loff_t isize;
788 struct page *cached_page;
789 + struct page *prev_page;
790 int error;
791 struct file_ra_state ra = *_ra;
792
793 cached_page = NULL;
794 + prev_page = NULL;
795 index = *ppos >> PAGE_CACHE_SHIFT;
796 next_index = index;
797 prev_index = ra.prev_page;
798 @@ -760,6 +768,10 @@ void do_generic_mapping_read(struct addr
799 if (!isize)
800 goto out;
801
802 + if (readahead_debug_level >= 5)
803 + printk(KERN_DEBUG "read-file(ino=%lu, req=%lu+%lu)\n",
804 + inode->i_ino, index, last_index - index);
805 +
806 end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
807 for (;;) {
808 struct page *page;
809 @@ -778,16 +790,45 @@ void do_generic_mapping_read(struct addr
810 nr = nr - offset;
811
812 cond_resched();
813 - if (index == next_index)
814 +
815 + if (!prefer_adaptive_readahead() && index == next_index)
816 next_index = page_cache_readahead(mapping, &ra, filp,
817 index, last_index - index);
818
819 find_page:
820 page = find_get_page(mapping, index);
821 + if (prefer_adaptive_readahead()) {
822 + if (unlikely(page == NULL)) {
823 + ra.prev_page = prev_index;
824 + page_cache_readahead_adaptive(mapping, &ra,
825 + filp, prev_page, NULL,
826 + *ppos >> PAGE_CACHE_SHIFT,
827 + index, last_index);
828 + page = find_get_page(mapping, index);
829 + } else if (PageReadahead(page)) {
830 + ra.prev_page = prev_index;
831 + page_cache_readahead_adaptive(mapping, &ra,
832 + filp, prev_page, page,
833 + *ppos >> PAGE_CACHE_SHIFT,
834 + index, last_index);
835 + }
836 + }
837 if (unlikely(page == NULL)) {
838 - handle_ra_miss(mapping, &ra, index);
839 + if (!prefer_adaptive_readahead())
840 + handle_ra_miss(mapping, &ra, index);
841 goto no_cached_page;
842 }
843 +
844 + if (prev_page)
845 + page_cache_release(prev_page);
846 + prev_page = page;
847 +
848 + readahead_cache_hit(&ra, page);
849 + if (readahead_debug_level >= 7)
850 + printk(KERN_DEBUG "read-page(ino=%lu, idx=%lu, io=%s)\n",
851 + inode->i_ino, index,
852 + PageUptodate(page) ? "hit" : "miss");
853 +
854 if (!PageUptodate(page))
855 goto page_not_up_to_date;
856 page_ok:
857 @@ -822,7 +863,6 @@ page_ok:
858 index += offset >> PAGE_CACHE_SHIFT;
859 offset &= ~PAGE_CACHE_MASK;
860
861 - page_cache_release(page);
862 if (ret == nr && desc->count)
863 continue;
864 goto out;
865 @@ -834,7 +874,6 @@ page_not_up_to_date:
866 /* Did it get unhashed before we got the lock? */
867 if (!page->mapping) {
868 unlock_page(page);
869 - page_cache_release(page);
870 continue;
871 }
872
873 @@ -864,7 +903,6 @@ readpage:
874 * invalidate_inode_pages got it
875 */
876 unlock_page(page);
877 - page_cache_release(page);
878 goto find_page;
879 }
880 unlock_page(page);
881 @@ -885,7 +923,6 @@ readpage:
882 isize = i_size_read(inode);
883 end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
884 if (unlikely(!isize || index > end_index)) {
885 - page_cache_release(page);
886 goto out;
887 }
888
889 @@ -894,7 +931,6 @@ readpage:
890 if (index == end_index) {
891 nr = ((isize - 1) & ~PAGE_CACHE_MASK) + 1;
892 if (nr <= offset) {
893 - page_cache_release(page);
894 goto out;
895 }
896 }
897 @@ -904,7 +940,6 @@ readpage:
898 readpage_error:
899 /* UHHUH! A synchronous read error occurred. Report it */
900 desc->error = error;
901 - page_cache_release(page);
902 goto out;
903
904 no_cached_page:
905 @@ -929,15 +964,22 @@ no_cached_page:
906 }
907 page = cached_page;
908 cached_page = NULL;
909 + if (prev_page)
910 + page_cache_release(prev_page);
911 + prev_page = page;
912 goto readpage;
913 }
914
915 out:
916 *_ra = ra;
917 + if (prefer_adaptive_readahead())
918 + _ra->prev_page = prev_index;
919
920 *ppos = ((loff_t) index << PAGE_CACHE_SHIFT) + offset;
921 if (cached_page)
922 page_cache_release(cached_page);
923 + if (prev_page)
924 + page_cache_release(prev_page);
925 if (filp)
926 file_accessed(filp);
927 }
928 @@ -1216,6 +1258,7 @@ struct page *filemap_nopage(struct vm_ar
929 unsigned long size, pgoff;
930 int did_readaround = 0, majmin = VM_FAULT_MINOR;
931
932 + ra->flags |= RA_FLAG_MMAP;
933 pgoff = ((address-area->vm_start) >> PAGE_CACHE_SHIFT) + area->vm_pgoff;
934
935 retry_all:
936 @@ -1233,7 +1276,7 @@ retry_all:
937 *
938 * For sequential accesses, we use the generic readahead logic.
939 */
940 - if (VM_SequentialReadHint(area))
941 + if (!prefer_adaptive_readahead() && VM_SequentialReadHint(area))
942 page_cache_readahead(mapping, ra, file, pgoff, 1);
943
944 /*
945 @@ -1241,11 +1284,24 @@ retry_all:
946 */
947 retry_find:
948 page = find_get_page(mapping, pgoff);
949 + if (prefer_adaptive_readahead() && VM_SequentialReadHint(area)) {
950 + if (!page) {
951 + page_cache_readahead_adaptive(mapping, ra,
952 + file, NULL, NULL,
953 + pgoff, pgoff, pgoff + 1);
954 + page = find_get_page(mapping, pgoff);
955 + } else if (PageReadahead(page)) {
956 + page_cache_readahead_adaptive(mapping, ra,
957 + file, NULL, page,
958 + pgoff, pgoff, pgoff + 1);
959 + }
960 + }
961 if (!page) {
962 unsigned long ra_pages;
963
964 if (VM_SequentialReadHint(area)) {
965 - handle_ra_miss(mapping, ra, pgoff);
966 + if (!prefer_adaptive_readahead())
967 + handle_ra_miss(mapping, ra, pgoff);
968 goto no_cached_page;
969 }
970 ra->mmap_miss++;
971 @@ -1282,6 +1338,14 @@ retry_find:
972 if (!did_readaround)
973 ra->mmap_hit++;
974
975 + readahead_cache_hit(ra, page);
976 + if (readahead_debug_level >= 6)
977 + printk(KERN_DEBUG "read-mmap(ino=%lu, idx=%lu, hint=%s, io=%s)\n",
978 + inode->i_ino, pgoff,
979 + VM_RandomReadHint(area) ? "random" :
980 + (VM_SequentialReadHint(area) ? "sequential" : "none"),
981 + PageUptodate(page) ? "hit" : "miss");
982 +
983 /*
984 * Ok, found a page in the page cache, now we need to check
985 * that it's up-to-date.
986 @@ -1296,6 +1360,8 @@ success:
987 mark_page_accessed(page);
988 if (type)
989 *type = majmin;
990 + if (prefer_adaptive_readahead())
991 + ra->prev_page = page->index;
992 return page;
993
994 outside_data_content:
995 Index: linux-2.6.16-ck1/mm/memory.c
996 ===================================================================
997 --- linux-2.6.16-ck1.orig/mm/memory.c 2006-03-20 20:46:23.000000000 +1100
998 +++ linux-2.6.16-ck1/mm/memory.c 2006-03-20 20:47:04.000000000 +1100
999 @@ -1993,6 +1993,7 @@ static int do_anonymous_page(struct mm_s
1000 page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
1001 if (!pte_none(*page_table))
1002 goto release;
1003 + inc_readahead_aging();
1004 inc_mm_counter(mm, anon_rss);
1005 lru_cache_add_active(page);
1006 page_add_new_anon_rmap(page, vma, address);
1007 Index: linux-2.6.16-ck1/mm/page-writeback.c
1008 ===================================================================
1009 --- linux-2.6.16-ck1.orig/mm/page-writeback.c 2006-03-20 20:46:53.000000000 +1100
1010 +++ linux-2.6.16-ck1/mm/page-writeback.c 2006-03-20 20:47:04.000000000 +1100
1011 @@ -370,7 +370,7 @@ static void wb_timer_fn(unsigned long un
1012 static void laptop_timer_fn(unsigned long unused);
1013
1014 static DEFINE_TIMER(wb_timer, wb_timer_fn, 0, 0);
1015 -static DEFINE_TIMER(laptop_mode_wb_timer, laptop_timer_fn, 0, 0);
1016 +DEFINE_TIMER(laptop_mode_wb_timer, laptop_timer_fn, 0, 0);
1017
1018 /*
1019 * Periodic writeback of "old" data.
1020 Index: linux-2.6.16-ck1/mm/page_alloc.c
1021 ===================================================================
1022 --- linux-2.6.16-ck1.orig/mm/page_alloc.c 2006-03-20 20:46:59.000000000 +1100
1023 +++ linux-2.6.16-ck1/mm/page_alloc.c 2006-03-20 20:47:04.000000000 +1100
1024 @@ -532,7 +532,7 @@ static int prep_new_page(struct page *pa
1025 if (PageReserved(page))
1026 return 1;
1027
1028 - page->flags &= ~(1 << PG_uptodate | 1 << PG_error |
1029 + page->flags &= ~(1 << PG_uptodate | 1 << PG_error | 1 << PG_readahead |
1030 1 << PG_referenced | 1 << PG_arch_1 |
1031 1 << PG_checked | 1 << PG_mappedtodisk);
1032 set_page_private(page, 0);
1033 Index: linux-2.6.16-ck1/mm/readahead.c
1034 ===================================================================
1035 --- linux-2.6.16-ck1.orig/mm/readahead.c 2006-03-20 20:46:23.000000000 +1100
1036 +++ linux-2.6.16-ck1/mm/readahead.c 2006-03-20 20:47:04.000000000 +1100
1037 @@ -14,6 +14,300 @@
1038 #include <linux/blkdev.h>
1039 #include <linux/backing-dev.h>
1040 #include <linux/pagevec.h>
1041 +#include <linux/writeback.h>
1042 +#include <linux/nfsd/const.h>
1043 +#include <asm/div64.h>
1044 +
1045 +/* The default max/min read-ahead pages. */
1046 +#define KB(size) (((size)*1024 + PAGE_CACHE_SIZE-1) / PAGE_CACHE_SIZE)
1047 +#define MAX_RA_PAGES KB(VM_MAX_READAHEAD)
1048 +#define MIN_RA_PAGES KB(VM_MIN_READAHEAD)
1049 +#define MIN_NFSD_PAGES KB(NFSSVC_MAXBLKSIZE/1024)
1050 +
1051 +#define next_page(pg) (list_entry((pg)->lru.prev, struct page, lru))
1052 +#define prev_page(pg) (list_entry((pg)->lru.next, struct page, lru))
1053 +
1054 +#ifdef CONFIG_ADAPTIVE_READAHEAD
1055 +/*
1056 + * Adaptive read-ahead parameters.
1057 + */
1058 +
1059 +/* In laptop mode, poll delayed look-ahead on every ## pages read. */
1060 +#define LAPTOP_POLL_INTERVAL 16
1061 +
1062 +/* Set look-ahead size to 1/# of the thrashing-threshold. */
1063 +#define LOOKAHEAD_RATIO 8
1064 +
1065 +/* Set read-ahead size to ##% of the thrashing-threshold. */
1066 +int readahead_ratio = 50;
1067 +EXPORT_SYMBOL(readahead_ratio);
1068 +
1069 +/* Readahead as long as cache hit ratio keeps above 1/##. */
1070 +int readahead_hit_rate = 2;
1071 +EXPORT_SYMBOL(readahead_hit_rate);
1072 +
1073 +/*
1074 + * Measures the aging process of cold pages.
1075 + * Mainly increased on fresh page references to make it smooth.
1076 + */
1077 +DEFINE_PER_CPU(unsigned long, readahead_aging);
1078 +EXPORT_PER_CPU_SYMBOL(readahead_aging);
1079 +
1080 +/*
1081 + * Detailed classification of read-ahead behaviors.
1082 + */
1083 +#define RA_CLASS_SHIFT 4
1084 +#define RA_CLASS_MASK ((1 << RA_CLASS_SHIFT) - 1)
1085 +enum ra_class {
1086 + RA_CLASS_ALL,
1087 + RA_CLASS_NEWFILE,
1088 + RA_CLASS_STATE,
1089 + RA_CLASS_CONTEXT,
1090 + RA_CLASS_CONTEXT_AGGRESSIVE,
1091 + RA_CLASS_BACKWARD,
1092 + RA_CLASS_THRASHING,
1093 + RA_CLASS_SEEK,
1094 + RA_CLASS_END,
1095 +};
1096 +#endif /* CONFIG_ADAPTIVE_READAHEAD */
1097 +
1098 +/*
1099 + * Read-ahead events accounting.
1100 + */
1101 +#ifdef CONFIG_DEBUG_READAHEAD
1102 +#include <linux/init.h>
1103 +#include <linux/jiffies.h>
1104 +#include <linux/debugfs.h>
1105 +#include <linux/seq_file.h>
1106 +
1107 +#define DEBUG_READAHEAD_RADIXTREE
1108 +
1109 +/* Read-ahead events to be accounted. */
1110 +enum ra_event {
1111 + RA_EVENT_CACHE_MISS, /* read cache misses */
1112 + RA_EVENT_READRANDOM, /* random reads */
1113 + RA_EVENT_IO_CONGESTION, /* io congestion */
1114 + RA_EVENT_IO_CACHE_HIT, /* canceled io due to cache hit */
1115 + RA_EVENT_IO_BLOCK, /* read on locked page */
1116 +
1117 + RA_EVENT_READAHEAD, /* read-ahead issued */
1118 + RA_EVENT_READAHEAD_HIT, /* read-ahead page hit */
1119 + RA_EVENT_LOOKAHEAD, /* look-ahead issued */
1120 + RA_EVENT_LOOKAHEAD_HIT, /* look-ahead mark hit */
1121 + RA_EVENT_LOOKAHEAD_NOACTION, /* look-ahead mark ignored */
1122 + RA_EVENT_READAHEAD_MMAP, /* read-ahead for memory mapped file */
1123 + RA_EVENT_READAHEAD_EOF, /* read-ahead reaches EOF */
1124 + RA_EVENT_READAHEAD_SHRINK, /* ra_size under previous la_size */
1125 + RA_EVENT_READAHEAD_THRASHING, /* read-ahead thrashing happened */
1126 + RA_EVENT_READAHEAD_MUTILATE, /* read-ahead request mutilated */
1127 + RA_EVENT_READAHEAD_RESCUE, /* read-ahead rescued */
1128 +
1129 + RA_EVENT_END
1130 +};
1131 +
1132 +static const char * const ra_event_name[] = {
1133 + "cache_miss",
1134 + "read_random",
1135 + "io_congestion",
1136 + "io_cache_hit",
1137 + "io_block",
1138 + "readahead",
1139 + "readahead_hit",
1140 + "lookahead",
1141 + "lookahead_hit",
1142 + "lookahead_ignore",
1143 + "readahead_mmap",
1144 + "readahead_eof",
1145 + "readahead_shrink",
1146 + "readahead_thrash",
1147 + "readahead_mutilt",
1148 + "readahead_rescue",
1149 +};
1150 +
1151 +static const char * const ra_class_name[] = {
1152 + "total",
1153 + "newfile",
1154 + "state",
1155 + "context",
1156 + "contexta",
1157 + "backward",
1158 + "onthrash",
1159 + "onraseek",
1160 + "none",
1161 +};
1162 +
1163 +static unsigned long ra_events[RA_CLASS_END+1][RA_EVENT_END+1][2];
1164 +
1165 +static inline void ra_account(struct file_ra_state *ra,
1166 + enum ra_event e, int pages)
1167 +{
1168 + enum ra_class c;
1169 +
1170 + if (e == RA_EVENT_READAHEAD_HIT && pages < 0) {
1171 + c = (ra->flags >> RA_CLASS_SHIFT) & RA_CLASS_MASK;
1172 + pages = -pages;
1173 + } else if (ra)
1174 + c = ra->flags & RA_CLASS_MASK;
1175 + else
1176 + c = RA_CLASS_END;
1177 +
1178 + if (!c)
1179 + c = RA_CLASS_END;
1180 +
1181 + ra_events[c][e][0] += 1;
1182 + ra_events[c][e][1] += pages;
1183 +
1184 + if (e == RA_EVENT_READAHEAD)
1185 + ra_events[c][RA_EVENT_END][1] += pages * pages;
1186 +}
1187 +
1188 +static int ra_events_show(struct seq_file *s, void *_)
1189 +{
1190 + int i;
1191 + int c;
1192 + int e;
1193 + static const char event_fmt[] = "%-16s";
1194 + static const char class_fmt[] = "%10s";
1195 + static const char item_fmt[] = "%10lu";
1196 + static const char percent_format[] = "%9lu%%";
1197 + static const char * const table_name[] = {
1198 + "[table requests]",
1199 + "[table pages]",
1200 + "[table summary]"};
1201 +
1202 + for (i = 0; i <= 1; i++) {
1203 + for (e = 0; e <= RA_EVENT_END; e++) {
1204 + ra_events[0][e][i] = 0;
1205 + for (c = 1; c < RA_CLASS_END; c++)
1206 + ra_events[0][e][i] += ra_events[c][e][i];
1207 + }
1208 +
1209 + seq_printf(s, event_fmt, table_name[i]);
1210 + for (c = 0; c <= RA_CLASS_END; c++)
1211 + seq_printf(s, class_fmt, ra_class_name[c]);
1212 + seq_puts(s, "\n");
1213 +
1214 + for (e = 0; e < RA_EVENT_END; e++) {
1215 + if (e == RA_EVENT_READAHEAD_HIT && i == 0)
1216 + continue;
1217 + if (e == RA_EVENT_IO_BLOCK && i == 1)
1218 + continue;
1219 +
1220 + seq_printf(s, event_fmt, ra_event_name[e]);
1221 + for (c = 0; c <= RA_CLASS_END; c++)
1222 + seq_printf(s, item_fmt, ra_events[c][e][i]);
1223 + seq_puts(s, "\n");
1224 + }
1225 + seq_puts(s, "\n");
1226 + }
1227 +
1228 + seq_printf(s, event_fmt, table_name[2]);
1229 + for (c = 0; c <= RA_CLASS_END; c++)
1230 + seq_printf(s, class_fmt, ra_class_name[c]);
1231 + seq_puts(s, "\n");
1232 +
1233 + seq_printf(s, event_fmt, "random_rate");
1234 + for (c = 0; c <= RA_CLASS_END; c++)
1235 + seq_printf(s, percent_format,
1236 + (ra_events[c][RA_EVENT_READRANDOM][0] * 100) /
1237 + ((ra_events[c][RA_EVENT_READRANDOM][0] +
1238 + ra_events[c][RA_EVENT_READAHEAD][0]) | 1));
1239 + seq_puts(s, "\n");
1240 +
1241 + seq_printf(s, event_fmt, "ra_hit_rate");
1242 + for (c = 0; c <= RA_CLASS_END; c++)
1243 + seq_printf(s, percent_format,
1244 + (ra_events[c][RA_EVENT_READAHEAD_HIT][1] * 100) /
1245 + (ra_events[c][RA_EVENT_READAHEAD][1] | 1));
1246 + seq_puts(s, "\n");
1247 +
1248 + seq_printf(s, event_fmt, "la_hit_rate");
1249 + for (c = 0; c <= RA_CLASS_END; c++)
1250 + seq_printf(s, percent_format,
1251 + (ra_events[c][RA_EVENT_LOOKAHEAD_HIT][0] * 100) /
1252 + (ra_events[c][RA_EVENT_LOOKAHEAD][0] | 1));
1253 + seq_puts(s, "\n");
1254 +
1255 + seq_printf(s, event_fmt, "var_ra_size");
1256 + for (c = 0; c <= RA_CLASS_END; c++)
1257 + seq_printf(s, item_fmt,
1258 + (ra_events[c][RA_EVENT_END][1] -
1259 + ra_events[c][RA_EVENT_READAHEAD][1] *
1260 + (ra_events[c][RA_EVENT_READAHEAD][1] /
1261 + (ra_events[c][RA_EVENT_READAHEAD][0] | 1))) /
1262 + (ra_events[c][RA_EVENT_READAHEAD][0] | 1));
1263 + seq_puts(s, "\n");
1264 +
1265 + seq_printf(s, event_fmt, "avg_ra_size");
1266 + for (c = 0; c <= RA_CLASS_END; c++)
1267 + seq_printf(s, item_fmt,
1268 + (ra_events[c][RA_EVENT_READAHEAD][1] +
1269 + ra_events[c][RA_EVENT_READAHEAD][0] / 2) /
1270 + (ra_events[c][RA_EVENT_READAHEAD][0] | 1));
1271 + seq_puts(s, "\n");
1272 +
1273 + seq_printf(s, event_fmt, "avg_la_size");
1274 + for (c = 0; c <= RA_CLASS_END; c++)
1275 + seq_printf(s, item_fmt,
1276 + (ra_events[c][RA_EVENT_LOOKAHEAD][1] +
1277 + ra_events[c][RA_EVENT_LOOKAHEAD][0] / 2) /
1278 + (ra_events[c][RA_EVENT_LOOKAHEAD][0] | 1));
1279 + seq_puts(s, "\n");
1280 +
1281 + return 0;
1282 +}
1283 +
1284 +static int ra_events_open(struct inode *inode, struct file *file)
1285 +{
1286 + return single_open(file, ra_events_show, NULL);
1287 +}
1288 +
1289 +static ssize_t ra_events_write(struct file *file, const char __user *buf,
1290 + size_t size, loff_t *offset)
1291 +{
1292 + memset(ra_events, 0, sizeof(ra_events));
1293 + return 1;
1294 +}
1295 +
1296 +struct file_operations ra_events_fops = {
1297 + .owner = THIS_MODULE,
1298 + .open = ra_events_open,
1299 + .write = ra_events_write,
1300 + .read = seq_read,
1301 + .llseek = seq_lseek,
1302 + .release = single_release,
1303 +};
1304 +
1305 +u32 readahead_debug_level = 0;
1306 +u32 disable_stateful_method = 0;
1307 +
1308 +static int __init readahead_init(void)
1309 +{
1310 + struct dentry *root;
1311 +
1312 + root = debugfs_create_dir("readahead", NULL);
1313 +
1314 + debugfs_create_file("events", 0644, root, NULL, &ra_events_fops);
1315 +
1316 + debugfs_create_u32("debug_level", 0644, root, &readahead_debug_level);
1317 + debugfs_create_bool("disable_stateful_method", 0644, root,
1318 + &disable_stateful_method);
1319 +
1320 + return 0;
1321 +}
1322 +
1323 +module_init(readahead_init)
1324 +#else
1325 +#define ra_account(ra, e, pages) do { } while (0)
1326 +#define readahead_debug_level (0)
1327 +#define disable_stateful_method (0)
1328 +#endif /* CONFIG_DEBUG_READAHEAD */
1329 +
1330 +#define dprintk(args...) \
1331 + do { if (readahead_debug_level >= 1) printk(KERN_DEBUG args); } while(0)
1332 +#define ddprintk(args...) \
1333 + do { if (readahead_debug_level >= 2) printk(KERN_DEBUG args); } while(0)
1334 +
1335
1336 void default_unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
1337 {
1338 @@ -21,7 +315,7 @@ void default_unplug_io_fn(struct backing
1339 EXPORT_SYMBOL(default_unplug_io_fn);
1340
1341 struct backing_dev_info default_backing_dev_info = {
1342 - .ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE,
1343 + .ra_pages = MAX_RA_PAGES,
1344 .state = 0,
1345 .capabilities = BDI_CAP_MAP_COPY,
1346 .unplug_io_fn = default_unplug_io_fn,
1347 @@ -49,7 +343,7 @@ static inline unsigned long get_max_read
1348
1349 static inline unsigned long get_min_readahead(struct file_ra_state *ra)
1350 {
1351 - return (VM_MIN_READAHEAD * 1024) / PAGE_CACHE_SIZE;
1352 + return MIN_RA_PAGES;
1353 }
1354
1355 static inline void ra_off(struct file_ra_state *ra)
1356 @@ -134,8 +428,10 @@ int read_cache_pages(struct address_spac
1357 continue;
1358 }
1359 ret = filler(data, page);
1360 - if (!pagevec_add(&lru_pvec, page))
1361 + if (!pagevec_add(&lru_pvec, page)) {
1362 + cond_resched();
1363 __pagevec_lru_add(&lru_pvec);
1364 + }
1365 if (ret) {
1366 while (!list_empty(pages)) {
1367 struct page *victim;
1368 @@ -173,8 +469,10 @@ static int read_pages(struct address_spa
1369 page->index, GFP_KERNEL)) {
1370 ret = mapping->a_ops->readpage(filp, page);
1371 if (ret != AOP_TRUNCATED_PAGE) {
1372 - if (!pagevec_add(&lru_pvec, page))
1373 + if (!pagevec_add(&lru_pvec, page)) {
1374 + cond_resched();
1375 __pagevec_lru_add(&lru_pvec);
1376 + }
1377 continue;
1378 } /* else fall through to release */
1379 }
1380 @@ -257,7 +555,8 @@ out:
1381 */
1382 static int
1383 __do_page_cache_readahead(struct address_space *mapping, struct file *filp,
1384 - pgoff_t offset, unsigned long nr_to_read)
1385 + pgoff_t offset, unsigned long nr_to_read,
1386 + unsigned long lookahead_size)
1387 {
1388 struct inode *inode = mapping->host;
1389 struct page *page;
1390 @@ -270,7 +569,7 @@ __do_page_cache_readahead(struct address
1391 if (isize == 0)
1392 goto out;
1393
1394 - end_index = ((isize - 1) >> PAGE_CACHE_SHIFT);
1395 + end_index = ((isize - 1) >> PAGE_CACHE_SHIFT);
1396
1397 /*
1398 * Preallocate as many pages as we will need.
1399 @@ -287,12 +586,15 @@ __do_page_cache_readahead(struct address
1400 continue;
1401
1402 read_unlock_irq(&mapping->tree_lock);
1403 + cond_resched();
1404 page = page_cache_alloc_cold(mapping);
1405 read_lock_irq(&mapping->tree_lock);
1406 if (!page)
1407 break;
1408 page->index = page_offset;
1409 list_add(&page->lru, &page_pool);
1410 + if (page_idx == nr_to_read - lookahead_size)
1411 + __SetPageReadahead(page);
1412 ret++;
1413 }
1414 read_unlock_irq(&mapping->tree_lock);
1415 @@ -329,7 +631,7 @@ int force_page_cache_readahead(struct ad
1416 if (this_chunk > nr_to_read)
1417 this_chunk = nr_to_read;
1418 err = __do_page_cache_readahead(mapping, filp,
1419 - offset, this_chunk);
1420 + offset, this_chunk, 0);
1421 if (err < 0) {
1422 ret = err;
1423 break;
1424 @@ -338,6 +640,9 @@ int force_page_cache_readahead(struct ad
1425 offset += this_chunk;
1426 nr_to_read -= this_chunk;
1427 }
1428 +
1429 + ra_account(NULL, RA_EVENT_READAHEAD, ret);
1430 +
1431 return ret;
1432 }
1433
1434 @@ -373,10 +678,16 @@ static inline int check_ra_success(struc
1435 int do_page_cache_readahead(struct address_space *mapping, struct file *filp,
1436 pgoff_t offset, unsigned long nr_to_read)
1437 {
1438 + unsigned long ret;
1439 +
1440 if (bdi_read_congested(mapping->backing_dev_info))
1441 return -1;
1442
1443 - return __do_page_cache_readahead(mapping, filp, offset, nr_to_read);
1444 + ret = __do_page_cache_readahead(mapping, filp, offset, nr_to_read, 0);
1445 +
1446 + ra_account(NULL, RA_EVENT_READAHEAD, ret);
1447 +
1448 + return ret;
1449 }
1450
1451 /*
1452 @@ -396,7 +707,11 @@ blockable_page_cache_readahead(struct ad
1453 if (!block && bdi_read_congested(mapping->backing_dev_info))
1454 return 0;
1455
1456 - actual = __do_page_cache_readahead(mapping, filp, offset, nr_to_read);
1457 + actual = __do_page_cache_readahead(mapping, filp, offset, nr_to_read, 0);
1458 +
1459 + ra_account(NULL, RA_EVENT_READAHEAD, actual);
1460 + dprintk("blockable-readahead(ino=%lu, ra=%lu+%lu) = %d\n",
1461 + mapping->host->i_ino, offset, nr_to_read, actual);
1462
1463 return check_ra_success(ra, nr_to_read, actual);
1464 }
1465 @@ -442,7 +757,7 @@ static int make_ahead_window(struct addr
1466 * @req_size: hint: total size of the read which the caller is performing in
1467 * PAGE_CACHE_SIZE units
1468 *
1469 - * page_cache_readahead() is the main function. If performs the adaptive
1470 + * page_cache_readahead() is the main function. It performs the adaptive
1471 * readahead window size management and submits the readahead I/O.
1472 *
1473 * Note that @filp is purely used for passing on to the ->readpage[s]()
1474 @@ -572,3 +887,1187 @@ unsigned long max_sane_readahead(unsigne
1475 __get_zone_counts(&active, &inactive, &free, NODE_DATA(numa_node_id()));
1476 return min(nr, (inactive + free) / 2);
1477 }
1478 +
1479 +/*
1480 + * Adaptive read-ahead.
1481 + *
1482 + * Good read patterns are compact both in space and time. The read-ahead logic
1483 + * tries to grant larger read-ahead size to better readers under the constraint
1484 + * of system memory and load pressure.
1485 + *
1486 + * It employs two methods to estimate the max thrashing safe read-ahead size:
1487 + * 1. state based - the default one
1488 + * 2. context based - the failsafe one
1489 + * The integration of the dual methods has the merit of being agile and robust.
1490 + * It makes the overall design clean: special cases are handled in general by
1491 + * the stateless method, leaving the stateful one simple and fast.
1492 + *
1493 + * To improve throughput and decrease read delay, the logic 'looks ahead'.
1494 + * In most read-ahead chunks, one page will be selected and tagged with
1495 + * PG_readahead. Later when the page with PG_readahead is read, the logic
1496 + * will be notified to submit the next read-ahead chunk in advance.
1497 + *
1498 + * a read-ahead chunk
1499 + * +-----------------------------------------+
1500 + * | # PG_readahead |
1501 + * +-----------------------------------------+
1502 + * ^ When this page is read, notify me for the next read-ahead.
1503 + *
1504 + *
1505 + * Here are some variable names used frequently:
1506 + *
1507 + * |<------- la_size ------>|
1508 + * +-----------------------------------------+
1509 + * | # |
1510 + * +-----------------------------------------+
1511 + * ra_index -->|<---------------- ra_size -------------->|
1512 + *
1513 + */
1514 +
1515 +#ifdef CONFIG_ADAPTIVE_READAHEAD
1516 +
1517 +/*
1518 + * The nature of read-ahead allows false tests to occur occasionally.
1519 + * Here we just do not bother to call get_page(), it's meaningless anyway.
1520 + */
1521 +static inline struct page *__find_page(struct address_space *mapping,
1522 + pgoff_t offset)
1523 +{
1524 + return radix_tree_lookup(&mapping->page_tree, offset);
1525 +}
1526 +
1527 +static inline struct page *find_page(struct address_space *mapping,
1528 + pgoff_t offset)
1529 +{
1530 + struct page *page;
1531 +
1532 + read_lock_irq(&mapping->tree_lock);
1533 + page = __find_page(mapping, offset);
1534 + read_unlock_irq(&mapping->tree_lock);
1535 + return page;
1536 +}
1537 +
1538 +/*
1539 + * Move pages in danger (of thrashing) to the head of inactive_list.
1540 + * Not expected to happen frequently.
1541 + */
1542 +static unsigned long rescue_pages(struct page *page, unsigned long nr_pages)
1543 +{
1544 + int pgrescue;
1545 + pgoff_t index;
1546 + struct zone *zone;
1547 + struct address_space *mapping;
1548 +
1549 + BUG_ON(!nr_pages || !page);
1550 + pgrescue = 0;
1551 + index = page_index(page);
1552 + mapping = page_mapping(page);
1553 +
1554 + dprintk("rescue_pages(ino=%lu, index=%lu nr=%lu)\n",
1555 + mapping->host->i_ino, index, nr_pages);
1556 +
1557 + for(;;) {
1558 + zone = page_zone(page);
1559 + spin_lock_irq(&zone->lru_lock);
1560 +
1561 + if (!PageLRU(page))
1562 + goto out_unlock;
1563 +
1564 + while (page_mapping(page) == mapping &&
1565 + page_index(page) == index) {
1566 + struct page *the_page = page;
1567 + page = next_page(page);
1568 + if (!PageActive(the_page) &&
1569 + !PageLocked(the_page) &&
1570 + page_count(the_page) == 1) {
1571 + list_move(&the_page->lru, &zone->inactive_list);
1572 + pgrescue++;
1573 + }
1574 + index++;
1575 + if (!--nr_pages)
1576 + goto out_unlock;
1577 + }
1578 +
1579 + spin_unlock_irq(&zone->lru_lock);
1580 +
1581 + cond_resched();
1582 + page = find_page(mapping, index);
1583 + if (!page)
1584 + goto out;
1585 + }
1586 +out_unlock:
1587 + spin_unlock_irq(&zone->lru_lock);
1588 +out:
1589 + ra_account(NULL, RA_EVENT_READAHEAD_RESCUE, pgrescue);
1590 + return nr_pages;
1591 +}
1592 +
1593 +/*
1594 + * Set a new look-ahead mark at @new_index.
1595 + * Return 0 if the new mark is successfully set.
1596 + */
1597 +static inline int renew_lookahead(struct address_space *mapping,
1598 + struct file_ra_state *ra,
1599 + pgoff_t index, pgoff_t new_index)
1600 +{
1601 + struct page *page;
1602 +
1603 + if (index == ra->lookahead_index &&
1604 + new_index >= ra->readahead_index)
1605 + return 1;
1606 +
1607 + page = find_page(mapping, new_index);
1608 + if (!page)
1609 + return 1;
1610 +
1611 + __SetPageReadahead(page);
1612 + if (ra->lookahead_index == index)
1613 + ra->lookahead_index = new_index;
1614 +
1615 + return 0;
1616 +}
1617 +
1618 +/*
1619 + * State based calculation of read-ahead request.
1620 + *
1621 + * This figure shows the meaning of file_ra_state members:
1622 + *
1623 + * chunk A chunk B
1624 + * +---------------------------+-------------------------------------------+
1625 + * | # | # |
1626 + * +---------------------------+-------------------------------------------+
1627 + * ^ ^ ^ ^
1628 + * la_index ra_index lookahead_index readahead_index
1629 + */
1630 +
1631 +/*
1632 + * The node's effective length of inactive_list(s).
1633 + */
1634 +static unsigned long node_free_and_cold_pages(void)
1635 +{
1636 + unsigned int i;
1637 + unsigned long sum = 0;
1638 + struct zone *zones = NODE_DATA(numa_node_id())->node_zones;
1639 +
1640 + for (i = 0; i < MAX_NR_ZONES; i++)
1641 + sum += zones[i].nr_inactive +
1642 + zones[i].free_pages - zones[i].pages_low;
1643 +
1644 + return sum;
1645 +}
1646 +
1647 +/*
1648 + * The node's accumulated aging activities.
1649 + */
1650 +static unsigned long node_readahead_aging(void)
1651 +{
1652 + unsigned long cpu;
1653 + unsigned long sum = 0;
1654 + cpumask_t mask = node_to_cpumask(numa_node_id());
1655 +
1656 + for_each_cpu_mask(cpu, mask)
1657 + sum += per_cpu(readahead_aging, cpu);
1658 +
1659 + return sum;
1660 +}
1661 +
1662 +/*
1663 + * The 64-bit cache_hits field stores three accumulated values and a counter value.
1664 + * MSB LSB
1665 + * 3333333333333333 : 2222222222222222 : 1111111111111111 : 0000000000000000
1666 + */
1667 +static inline int ra_cache_hit(struct file_ra_state *ra, int nr)
1668 +{
1669 + return (ra->cache_hits >> (nr * 16)) & 0xFFFF;
1670 +}
1671 +
1672 +/*
1673 + * Conceptual code:
1674 + * ra_cache_hit(ra, 1) += ra_cache_hit(ra, 0);
1675 + * ra_cache_hit(ra, 0) = 0;
1676 + */
1677 +static inline void ra_addup_cache_hit(struct file_ra_state *ra)
1678 +{
1679 + int n;
1680 +
1681 + n = ra_cache_hit(ra, 0);
1682 + ra->cache_hits -= n;
1683 + n <<= 16;
1684 + ra->cache_hits += n;
1685 +}
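
A minimal userspace sketch of the same arithmetic, assuming only the layout documented in the comment above (four 16-bit slots in a 64-bit word, slot 0 being the live counter); hit_slot() and addup_hits() are hypothetical stand-ins for ra_cache_hit() and ra_addup_cache_hit(), and the final shift mirrors what ra_set_class() below does when switching to a non-state class:

#include <stdint.h>
#include <stdio.h>

/* Read slot @nr (0..3) of the packed 64-bit hit counter. */
static unsigned int hit_slot(uint64_t hits, int nr)
{
	return (hits >> (nr * 16)) & 0xFFFF;
}

/* Fold the live counter (slot 0) into the accumulated slot 1. */
static uint64_t addup_hits(uint64_t hits)
{
	unsigned int n = hit_slot(hits, 0);

	hits -= n;			/* clear slot 0 */
	hits += (uint64_t)n << 16;	/* add it to slot 1 */
	return hits;
}

int main(void)
{
	uint64_t hits = 0;

	hits += 5;		/* five hits recorded in slot 0 */
	hits = addup_hits(hits);
	hits <<= 16;		/* age the history, as a class switch does */

	printf("slot0=%u slot1=%u slot2=%u\n",
	       hit_slot(hits, 0), hit_slot(hits, 1), hit_slot(hits, 2));
	/* prints: slot0=0 slot1=0 slot2=5 */
	return 0;
}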
1686 +
1687 +/*
1688 + * The read-ahead is deemed a success if cache-hit-rate >= 1/readahead_hit_rate.
1689 + */
1690 +static inline int ra_cache_hit_ok(struct file_ra_state *ra)
1691 +{
1692 + return ra_cache_hit(ra, 0) * readahead_hit_rate >=
1693 + (ra->lookahead_index - ra->la_index);
1694 +}
1695 +
1696 +/*
1697 + * Check if @index falls in the @ra request.
1698 + */
1699 +static inline int ra_has_index(struct file_ra_state *ra, pgoff_t index)
1700 +{
1701 + if (index < ra->la_index || index >= ra->readahead_index)
1702 + return 0;
1703 +
1704 + if (index >= ra->ra_index)
1705 + return 1;
1706 + else
1707 + return -1;
1708 +}
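
For reference, a tiny userspace sketch of the tri-state contract above, assuming only the index layout from the earlier figure (la_index <= ra_index <= readahead_index); the struct and the window_has_index() name are illustrative, not the kernel's:

#include <stdio.h>

struct ra_window { unsigned long la_index, ra_index, readahead_index; };

/* 0: outside the request; -1: in chunk A's tail; 1: in chunk B. */
static int window_has_index(const struct ra_window *w, unsigned long index)
{
	if (index < w->la_index || index >= w->readahead_index)
		return 0;
	return index >= w->ra_index ? 1 : -1;
}

int main(void)
{
	struct ra_window w = { .la_index = 100, .ra_index = 116,
			       .readahead_index = 148 };

	printf("%d %d %d\n",
	       window_has_index(&w, 90),	/* 0: before the request */
	       window_has_index(&w, 110),	/* -1: old chunk, past la_index */
	       window_has_index(&w, 120));	/* 1: new read-ahead chunk */
	return 0;
}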
1709 +
1710 +/*
1711 + * Which method is issuing this read-ahead?
1712 + */
1713 +static inline void ra_set_class(struct file_ra_state *ra,
1714 + enum ra_class ra_class)
1715 +{
1716 + unsigned long flags_mask;
1717 + unsigned long flags;
1718 + unsigned long old_ra_class;
1719 +
1720 + flags_mask = ~(RA_CLASS_MASK | (RA_CLASS_MASK << RA_CLASS_SHIFT));
1721 + flags = ra->flags & flags_mask;
1722 +
1723 + old_ra_class = (ra->flags & RA_CLASS_MASK) << RA_CLASS_SHIFT;
1724 +
1725 + ra->flags = flags | old_ra_class | ra_class;
1726 +
1727 + ra_addup_cache_hit(ra);
1728 + if (ra_class != RA_CLASS_STATE)
1729 + ra->cache_hits <<= 16;
1730 +
1731 + ra->age = node_readahead_aging();
1732 +}
1733 +
1734 +/*
1735 + * Where is the old read-ahead and look-ahead?
1736 + */
1737 +static inline void ra_set_index(struct file_ra_state *ra,
1738 + pgoff_t la_index, pgoff_t ra_index)
1739 +{
1740 + ra->la_index = la_index;
1741 + ra->ra_index = ra_index;
1742 +}
1743 +
1744 +/*
1745 + * Where is the new read-ahead and look-ahead?
1746 + */
1747 +static inline void ra_set_size(struct file_ra_state *ra,
1748 + unsigned long ra_size, unsigned long la_size)
1749 +{
1750 + /* Disable look-ahead for loopback file. */
1751 + if (unlikely(ra->flags & RA_FLAG_NO_LOOKAHEAD))
1752 + la_size = 0;
1753 +
1754 + ra->readahead_index = ra->ra_index + ra_size;
1755 + ra->lookahead_index = ra->readahead_index - la_size;
1756 +}
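
A small sketch of how the two setters above lay out the four indices of the earlier figure, using a plain struct in place of file_ra_state (the field names mirror the kernel's, everything else is illustrative):

#include <stdio.h>

struct ra_state {
	unsigned long la_index, ra_index;
	unsigned long lookahead_index, readahead_index;
};

static void set_index(struct ra_state *ra, unsigned long la, unsigned long start)
{
	ra->la_index = la;
	ra->ra_index = start;
}

static void set_size(struct ra_state *ra, unsigned long ra_size, unsigned long la_size)
{
	ra->readahead_index = ra->ra_index + ra_size;
	ra->lookahead_index = ra->readahead_index - la_size;
}

int main(void)
{
	struct ra_state ra;

	/* New chunk of 32 pages at page 116, look-ahead mark 8 pages early. */
	set_index(&ra, 100, 116);
	set_size(&ra, 32, 8);
	printf("la=%lu ra=%lu lookahead=%lu readahead=%lu\n",
	       ra.la_index, ra.ra_index, ra.lookahead_index, ra.readahead_index);
	/* prints: la=100 ra=116 lookahead=140 readahead=148 */
	return 0;
}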
1757 +
1758 +/*
1759 + * Submit IO for the read-ahead request in file_ra_state.
1760 + */
1761 +static int ra_dispatch(struct file_ra_state *ra,
1762 + struct address_space *mapping, struct file *filp)
1763 +{
1764 + pgoff_t eof_index;
1765 + unsigned long ra_size;
1766 + unsigned long la_size;
1767 + int actual;
1768 + enum ra_class ra_class;
1769 +
1770 + ra_class = (ra->flags & RA_CLASS_MASK);
1771 + BUG_ON(ra_class == 0 || ra_class > RA_CLASS_END);
1772 +
1773 + eof_index = ((i_size_read(mapping->host) - 1) >> PAGE_CACHE_SHIFT) + 1;
1774 + ra_size = ra->readahead_index - ra->ra_index;
1775 + la_size = ra->readahead_index - ra->lookahead_index;
1776 +
1777 + /* Snap to EOF. */
1778 + if (unlikely(ra->ra_index >= eof_index))
1779 + return 0;
1780 + if (ra->readahead_index + ra_size / 2 > eof_index) {
1781 + if (ra_class == RA_CLASS_CONTEXT_AGGRESSIVE &&
1782 + eof_index > ra->lookahead_index + 1)
1783 + la_size = eof_index - ra->lookahead_index;
1784 + else
1785 + la_size = 0;
1786 + ra_size = eof_index - ra->ra_index;
1787 + ra_set_size(ra, ra_size, la_size);
1788 + }
1789 +
1790 + actual = __do_page_cache_readahead(mapping, filp,
1791 + ra->ra_index, ra_size, la_size);
1792 +
1793 +#ifdef CONFIG_DEBUG_READAHEAD
1794 + if (ra->flags & RA_FLAG_MMAP)
1795 + ra_account(ra, RA_EVENT_READAHEAD_MMAP, actual);
1796 + if (ra->readahead_index == eof_index)
1797 + ra_account(ra, RA_EVENT_READAHEAD_EOF, actual);
1798 + if (la_size)
1799 + ra_account(ra, RA_EVENT_LOOKAHEAD, la_size);
1800 + if (ra_size > actual)
1801 + ra_account(ra, RA_EVENT_IO_CACHE_HIT, ra_size - actual);
1802 + ra_account(ra, RA_EVENT_READAHEAD, actual);
1803 +
1804 + if (!ra->ra_index && filp->f_dentry->d_inode) {
1805 + char *fn;
1806 + static char path[1024];
1807 + unsigned long size;
1808 +
1809 + size = (i_size_read(filp->f_dentry->d_inode)+1023)/1024;
1810 + fn = d_path(filp->f_dentry, filp->f_vfsmnt, path, 1000);
1811 + if (!IS_ERR(fn))
1812 + ddprintk("ino %lu is %s size %luK by %s(%d)\n",
1813 + filp->f_dentry->d_inode->i_ino,
1814 + fn, size,
1815 + current->comm, current->pid);
1816 + }
1817 +
1818 + dprintk("readahead-%s(ino=%lu, index=%lu, ra=%lu+%lu-%lu) = %d\n",
1819 + ra_class_name[ra_class],
1820 + mapping->host->i_ino, ra->la_index,
1821 + ra->ra_index, ra_size, la_size, actual);
1822 +#endif /* CONFIG_DEBUG_READAHEAD */
1823 +
1824 + return actual;
1825 +}
1826 +
1827 +/*
1828 + * Determine the ra request from primitive values.
1829 + *
1830 + * It applies the following rules:
1831 + * - Subtract the old look-ahead from ra_size to get the real safe read-ahead;
1832 + * - Set new la_size according to the (still large) ra_size;
1833 + * - Apply upper limits;
1834 + * - Make sure stream_shift is not too small.
1835 + * (So that the next global_shift will not be too small.)
1836 + *
1837 + * Input:
1838 + * ra_size stores the estimated thrashing-threshold.
1839 + * la_size stores the look-ahead size of previous request.
1840 + */
1841 +static inline int adjust_rala(unsigned long ra_max,
1842 + unsigned long *ra_size, unsigned long *la_size)
1843 +{
1844 + unsigned long stream_shift = *la_size;
1845 +
1846 + if (*ra_size > *la_size)
1847 + *ra_size -= *la_size;
1848 + else {
1849 + ra_account(NULL, RA_EVENT_READAHEAD_SHRINK, *ra_size);
1850 + return 0;
1851 + }
1852 +
1853 + *la_size = *ra_size / LOOKAHEAD_RATIO;
1854 +
1855 + if (*ra_size > ra_max)
1856 + *ra_size = ra_max;
1857 + if (*la_size > *ra_size)
1858 + *la_size = *ra_size;
1859 +
1860 + stream_shift += (*ra_size - *la_size);
1861 + if (stream_shift < *ra_size / 4)
1862 + *la_size -= (*ra_size / 4 - stream_shift);
1863 +
1864 + return 1;
1865 +}
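
The rules above can be followed on plain numbers. The sketch below mirrors the same steps in userspace; the LOOKAHEAD_RATIO value of 8 is an assumption made only for the arithmetic (the real constant is defined elsewhere in this patch):

#include <stdio.h>

#define LOOKAHEAD_RATIO 8	/* assumed value, only for this example */

/* Mirror of the adjustment rules documented above, on plain integers. */
static int adjust(unsigned long ra_max, unsigned long *ra_size, unsigned long *la_size)
{
	unsigned long stream_shift = *la_size;

	if (*ra_size <= *la_size)
		return 0;	/* the kernel also accounts a SHRINK event here */
	*ra_size -= *la_size;

	*la_size = *ra_size / LOOKAHEAD_RATIO;

	if (*ra_size > ra_max)
		*ra_size = ra_max;
	if (*la_size > *ra_size)
		*la_size = *ra_size;

	stream_shift += *ra_size - *la_size;
	if (stream_shift < *ra_size / 4)
		*la_size -= *ra_size / 4 - stream_shift;

	return 1;
}

int main(void)
{
	unsigned long ra = 80, la = 16;	/* threshold 80, old look-ahead 16 */

	if (adjust(256, &ra, &la))
		printf("ra_size=%lu la_size=%lu\n", ra, la);
	/* prints: ra_size=64 la_size=8 */
	return 0;
}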
1866 +
1867 +/*
1868 + * The function estimates two values:
1869 + * 1. thrashing-threshold for the current stream
1870 + * It is returned to make the next read-ahead request.
1871 + * 2. the remaining safe space for the current chunk
1872 + * It will be checked to ensure that the current chunk is safe.
1873 + *
1874 + * The computation is fairly accurate under heavy load, but fluctuates more
1875 + * under light load (with a small global_shift), so the growth of ra_size
1876 + * must be limited, and a moderately large stream_shift must be ensured.
1877 + *
1878 + * This figure illustrates the formula used in the function:
1879 + * While the stream reads stream_shift pages inside the chunks,
1880 + * the chunks are shifted global_shift pages inside inactive_list.
1881 + *
1882 + * chunk A chunk B
1883 + * |<=============== global_shift ================|
1884 + * +-------------+ +-------------------+ |
1885 + * | # | | # | inactive_list |
1886 + * +-------------+ +-------------------+ head |
1887 + * |---->| |---------->|
1888 + * | |
1889 + * +-- stream_shift --+
1890 + */
1891 +static inline unsigned long compute_thrashing_threshold(
1892 + struct file_ra_state *ra,
1893 + unsigned long *remain)
1894 +{
1895 + unsigned long global_size;
1896 + unsigned long global_shift;
1897 + unsigned long stream_shift;
1898 + unsigned long ra_size;
1899 + uint64_t ll;
1900 +
1901 + global_size = node_free_and_cold_pages();
1902 + global_shift = node_readahead_aging() - ra->age;
1903 + global_shift |= 1UL;
1904 + stream_shift = ra_cache_hit(ra, 0);
1905 +
1906 + ll = (uint64_t) stream_shift * (global_size >> 9) * readahead_ratio * 5;
1907 + do_div(ll, global_shift);
1908 + ra_size = ll;
1909 +
1910 + if (global_size > global_shift) {
1911 + ll = (uint64_t) stream_shift * (global_size - global_shift);
1912 + do_div(ll, global_shift);
1913 + *remain = ll;
1914 + } else
1915 + *remain = 0;
1916 +
1917 + ddprintk("compute_thrashing_threshold: "
1918 + "at %lu ra %lu=%lu*%lu/%lu, remain %lu for %lu\n",
1919 + ra->readahead_index, ra_size,
1920 + stream_shift, global_size, global_shift,
1921 + *remain, ra->readahead_index - ra->lookahead_index);
1922 +
1923 + return ra_size;
1924 +}
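
To see the proportions involved, here is the same formula evaluated on made-up numbers in userspace. Note that (global_size >> 9) * readahead_ratio * 5 is roughly global_size * readahead_ratio / 100, i.e. the threshold is the stream's share of the cold pages, scaled by the readahead_ratio percentage:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t stream_shift = 32;	/* pages the stream consumed        */
	uint64_t global_shift = 4096;	/* pages aged in the mean time      */
	uint64_t global_size = 262144;	/* free + inactive pages (1GB/4KB)  */
	unsigned readahead_ratio = 50;	/* the documented default           */

	uint64_t ra_size = stream_shift * (global_size >> 9) *
			   readahead_ratio * 5 / global_shift;
	uint64_t remain = stream_shift * (global_size - global_shift) /
			  global_shift;

	printf("thrashing threshold ~= %llu pages, remaining space ~= %llu pages\n",
	       (unsigned long long)ra_size, (unsigned long long)remain);
	/* threshold = 32*512*50*5/4096 = 1000 pages; remain = 2016 pages */
	return 0;
}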
1925 +
1926 +/*
1927 + * Main function for file_ra_state based read-ahead.
1928 + */
1929 +static inline unsigned long
1930 +state_based_readahead(struct address_space *mapping, struct file *filp,
1931 + struct file_ra_state *ra,
1932 + struct page *page, pgoff_t index,
1933 + unsigned long ra_size, unsigned long ra_max)
1934 +{
1935 + unsigned long ra_old;
1936 + unsigned long la_size;
1937 + unsigned long remain_space;
1938 + unsigned long growth_limit;
1939 +
1940 + la_size = ra->readahead_index - index;
1941 + ra_old = ra->readahead_index - ra->ra_index;
1942 + growth_limit = ra_size + ra_max / 16 +
1943 + (2 + readahead_ratio / 64) * ra_old;
1944 + ra_size = compute_thrashing_threshold(ra, &remain_space);
1945 +
1946 + if (page && remain_space <= la_size && la_size > 1) {
1947 + rescue_pages(page, la_size);
1948 + return 0;
1949 + }
1950 +
1951 + if (!adjust_rala(min(ra_max, growth_limit), &ra_size, &la_size))
1952 + return 0;
1953 +
1954 + ra_set_class(ra, RA_CLASS_STATE);
1955 + ra_set_index(ra, index, ra->readahead_index);
1956 + ra_set_size(ra, ra_size, la_size);
1957 +
1958 + return ra_dispatch(ra, mapping, filp);
1959 +}
1960 +
1961 +/*
1962 + * Page cache context based estimation of read-ahead/look-ahead size/index.
1963 + *
1964 + * The logic first looks around to find the start point of next read-ahead,
1965 + * and then, if necessary, looks backward in the inactive_list to get an
1966 + * estimation of the thrashing-threshold.
1967 + *
1968 + * The estimation theory can be illustrated with figure:
1969 + *
1970 + * chunk A chunk B chunk C head
1971 + *
1972 + * l01 l11 l12 l21 l22
1973 + *| |-->|-->| |------>|-->| |------>|
1974 + *| +-------+ +-----------+ +-------------+ |
1975 + *| | # | | # | | # | |
1976 + *| +-------+ +-----------+ +-------------+ |
1977 + *| |<==============|<===========================|<============================|
1978 + * L0 L1 L2
1979 + *
1980 + * Let f(l) = L be a map from
1981 + * l: the number of pages read by the stream
1982 + * to
1983 + * L: the number of pages pushed into inactive_list in the mean time
1984 + * then
1985 + * f(l01) <= L0
1986 + * f(l11 + l12) = L1
1987 + * f(l21 + l22) = L2
1988 + * ...
1989 + * f(l01 + l11 + ...) <= Sum(L0 + L1 + ...)
1990 + * <= Length(inactive_list) = f(thrashing-threshold)
1991 + *
1992 + * So the count of continuous history pages left in the inactive_list is always
1993 + * a lower-bound estimate of the true thrashing-threshold.
1994 + */
1995 +
1996 +#define PAGE_REFCNT_0 0
1997 +#define PAGE_REFCNT_1 (1 << PG_referenced)
1998 +#define PAGE_REFCNT_2 (1 << PG_active)
1999 +#define PAGE_REFCNT_3 ((1 << PG_active) | (1 << PG_referenced))
2000 +#define PAGE_REFCNT_MASK PAGE_REFCNT_3
2001 +
2002 +/*
2003 + * STATUS REFERENCE COUNT
2004 + * __ 0
2005 + * _R PAGE_REFCNT_1
2006 + * A_ PAGE_REFCNT_2
2007 + * AR PAGE_REFCNT_3
2008 + *
2009 + * A/R: Active / Referenced
2010 + */
2011 +static inline unsigned long page_refcnt(struct page *page)
2012 +{
2013 + return page->flags & PAGE_REFCNT_MASK;
2014 +}
2015 +
2016 +/*
2017 + * STATUS REFERENCE COUNT TYPE
2018 + * __ 0 fresh
2019 + * _R PAGE_REFCNT_1 stale
2020 + * A_ PAGE_REFCNT_2 disturbed once
2021 + * AR PAGE_REFCNT_3 disturbed twice
2022 + *
2023 + * A/R: Active / Referenced
2024 + */
2025 +static inline unsigned long cold_page_refcnt(struct page *page)
2026 +{
2027 + if (!page || PageActive(page))
2028 + return 0;
2029 +
2030 + return page_refcnt(page);
2031 +}
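
A compact userspace sketch of the two-bit "reference count" read straight out of page->flags; the bit positions below are placeholders (the real PG_referenced/PG_active values come from include/linux/page-flags.h), only the encoding of the four states matters:

#include <stdio.h>

#define PG_referenced 2		/* placeholder bit, not the kernel value */
#define PG_active     6		/* placeholder bit, not the kernel value */

#define REFCNT_1    (1UL << PG_referenced)
#define REFCNT_2    (1UL << PG_active)
#define REFCNT_3    (REFCNT_1 | REFCNT_2)
#define REFCNT_MASK REFCNT_3

static const char *status(unsigned long flags)
{
	switch (flags & REFCNT_MASK) {
	case 0:		return "__ fresh";
	case REFCNT_1:	return "_R stale";
	case REFCNT_2:	return "A_ disturbed once";
	default:	return "AR disturbed twice";
	}
}

int main(void)
{
	printf("%s\n", status(0));
	printf("%s\n", status(REFCNT_1));
	printf("%s\n", status(REFCNT_3));
	return 0;
}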
2032 +
2033 +static inline char page_refcnt_symbol(struct page *page)
2034 +{
2035 + if (!page)
2036 + return 'X';
2037 +
2038 + switch (page_refcnt(page)) {
2039 + case 0:
2040 + return '_';
2041 + case PAGE_REFCNT_1:
2042 + return '-';
2043 + case PAGE_REFCNT_2:
2044 + return '=';
2045 + case PAGE_REFCNT_3:
2046 + return '#';
2047 + default:
2048 + return '?';
2049 + }
2050 +}
2051 +
2052 +/*
2053 + * Count/estimate cache hits in range [first_index, last_index].
2054 + * The estimation is simple and optimistic.
2055 + */
2056 +static int count_cache_hit(struct address_space *mapping,
2057 + pgoff_t first_index, pgoff_t last_index)
2058 +{
2059 + struct page *page;
2060 + int size = last_index - first_index + 1;
2061 + int count = 0;
2062 + int i;
2063 +
2064 + cond_resched();
2065 + read_lock_irq(&mapping->tree_lock);
2066 +
2067 + /*
2068 +	 * The first page may well be the chunk head and have been accessed,
2069 +	 * so sampling index 0 makes the estimation optimistic. This
2070 + * behavior guarantees a readahead when (size < ra_max) and
2071 + * (readahead_hit_rate >= 16).
2072 + */
2073 + for (i = 0; i < 16;) {
2074 + page = __find_page(mapping, first_index +
2075 + size * ((i++ * 29) & 15) / 16);
2076 + if (cold_page_refcnt(page) >= PAGE_REFCNT_1 && ++count >= 2)
2077 + break;
2078 + }
2079 +
2080 + read_unlock_irq(&mapping->tree_lock);
2081 +
2082 + return size * count / i;
2083 +}
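
The sampling trick above relies on 29 being odd and hence coprime with 16: (i * 29) & 15 walks all sixteen residues in a shuffled order, so the probes land in sixteen evenly spaced sub-ranges of the segment. A throwaway userspace sketch that prints the probe offsets:

#include <stdio.h>

int main(void)
{
	int size = 160;		/* pages in [first_index, last_index] */
	int i;

	for (i = 0; i < 16; i++)
		printf("probe %2d -> offset %3d\n",
		       i, size * ((i * 29) & 15) / 16);
	/* the sixteen offsets cover the whole segment, in shuffled order */
	return 0;
}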
2084 +
2085 +/*
2086 + * Look back and check history pages to estimate thrashing-threshold.
2087 + */
2088 +static unsigned long query_page_cache_segment(struct address_space *mapping,
2089 + struct file_ra_state *ra,
2090 + unsigned long *remain, pgoff_t offset,
2091 + unsigned long ra_min, unsigned long ra_max)
2092 +{
2093 + pgoff_t index;
2094 + unsigned long count;
2095 + unsigned long nr_lookback;
2096 + struct radix_tree_cache cache;
2097 +
2098 + /*
2099 +	 * Scan backward and check the nearest @ra_max pages.
2100 + * The count here determines ra_size.
2101 + */
2102 + cond_resched();
2103 + read_lock_irq(&mapping->tree_lock);
2104 + index = radix_tree_scan_hole_backward(&mapping->page_tree,
2105 + offset, ra_max);
2106 +#ifdef DEBUG_READAHEAD_RADIXTREE
2107 + WARN_ON(index > offset);
2108 + if (index != offset)
2109 + WARN_ON(!__find_page(mapping, index + 1));
2110 + if (index && offset - index < ra_max)
2111 + WARN_ON(__find_page(mapping, index));
2112 +#endif
2113 + read_unlock_irq(&mapping->tree_lock);
2114 +
2115 + *remain = offset - index;
2116 +
2117 + if (offset == ra->readahead_index && ra_cache_hit_ok(ra))
2118 + count = *remain;
2119 + else if (count_cache_hit(mapping, index + 1, offset) *
2120 + readahead_hit_rate >= *remain)
2121 + count = *remain;
2122 + else
2123 + count = ra_min;
2124 +
2125 + /*
2126 + * Unnecessary to count more?
2127 + */
2128 + if (count < ra_max)
2129 + goto out;
2130 +
2131 + if (unlikely(ra->flags & RA_FLAG_NO_LOOKAHEAD))
2132 + goto out;
2133 +
2134 + /*
2135 + * Check the far pages coarsely.
2136 + * The big count here helps increase la_size.
2137 + */
2138 + nr_lookback = ra_max * (LOOKAHEAD_RATIO + 1) *
2139 + 100 / (readahead_ratio + 1);
2140 +
2141 + cond_resched();
2142 + radix_tree_cache_init(&cache);
2143 + read_lock_irq(&mapping->tree_lock);
2144 + for (count += ra_max; count < nr_lookback; count += ra_max) {
2145 + struct radix_tree_node *node;
2146 + node = radix_tree_cache_lookup_node(&mapping->page_tree,
2147 + &cache, offset - count, 1);
2148 +#ifdef DEBUG_READAHEAD_RADIXTREE
2149 + if (node != radix_tree_lookup_node(&mapping->page_tree,
2150 + offset - count, 1))
2151 + BUG();
2152 +#endif
2153 + if (!node)
2154 + break;
2155 + }
2156 + read_unlock_irq(&mapping->tree_lock);
2157 +
2158 +out:
2159 + /*
2160 +	 * For a sequential read that extends from index 0, the counted value
2161 +	 * may well be far under the true threshold, so return it unmodified
2162 +	 * for further processing in adjust_rala_aggressive().
2163 + */
2164 + if (count >= offset)
2165 + count = offset;
2166 + else
2167 + count = max(ra_min, count * readahead_ratio / 100);
2168 +
2169 + ddprintk("query_page_cache_segment: "
2170 + "ino=%lu, idx=%lu, count=%lu, remain=%lu\n",
2171 + mapping->host->i_ino, offset, count, *remain);
2172 +
2173 + return count;
2174 +}
2175 +
2176 +/*
2177 + * Find past-the-end index of the segment before @index.
2178 + */
2179 +static inline pgoff_t find_segtail_backward(struct address_space *mapping,
2180 + pgoff_t index, unsigned long max_scan)
2181 +{
2182 + struct radix_tree_cache cache;
2183 + struct page *page;
2184 + pgoff_t origin;
2185 +
2186 + origin = index;
2187 + if (max_scan > index)
2188 + max_scan = index;
2189 +
2190 + cond_resched();
2191 + radix_tree_cache_init(&cache);
2192 + read_lock_irq(&mapping->tree_lock);
2193 + for (; origin - index < max_scan;) {
2194 + page = radix_tree_cache_lookup(&mapping->page_tree,
2195 + &cache, --index);
2196 + if (page) {
2197 + read_unlock_irq(&mapping->tree_lock);
2198 + return index + 1;
2199 + }
2200 + }
2201 + read_unlock_irq(&mapping->tree_lock);
2202 +
2203 + return 0;
2204 +}
2205 +
2206 +/*
2207 + * Find past-the-end index of the segment at @index.
2208 + */
2209 +static inline pgoff_t find_segtail(struct address_space *mapping,
2210 + pgoff_t index, unsigned long max_scan)
2211 +{
2212 + pgoff_t ra_index;
2213 +
2214 + cond_resched();
2215 + read_lock_irq(&mapping->tree_lock);
2216 + ra_index = radix_tree_scan_hole(&mapping->page_tree, index, max_scan);
2217 +#ifdef DEBUG_READAHEAD_RADIXTREE
2218 + BUG_ON(!__find_page(mapping, index));
2219 + WARN_ON(ra_index < index);
2220 + if (ra_index != index && !__find_page(mapping, ra_index - 1))
2221 + printk(KERN_ERR "radix_tree_scan_hole(index=%lu ra_index=%lu "
2222 + "max_scan=%lu nrpages=%lu) fooled!\n",
2223 + index, ra_index, max_scan, mapping->nrpages);
2224 + if (ra_index != ~0UL && ra_index - index < max_scan)
2225 + WARN_ON(__find_page(mapping, ra_index));
2226 +#endif
2227 + read_unlock_irq(&mapping->tree_lock);
2228 +
2229 + if (ra_index <= index + max_scan)
2230 + return ra_index;
2231 + else
2232 + return 0;
2233 +}
2234 +
2235 +/*
2236 + * Determine the request parameters for context based read-ahead that extends
2237 + * from start of file.
2238 + *
2239 + * The major weakness of stateless method is perhaps the slow grow up speed of
2240 + * The major weakness of the stateless method is perhaps the slow growth of
2241 + * ra_size. The logic tries to make up for this in the important case of
2242 + * sequential reads that extend from the start of file. In this case, ra_size
2243 + * is not chosen to make the whole next chunk safe (as in normal ones): only
2244 + * half of it is safe. The added 'unsafe' half is the look-ahead part. It
2245 + * lost.
2246 + */
2247 +static inline int adjust_rala_aggressive(unsigned long ra_max,
2248 + unsigned long *ra_size, unsigned long *la_size)
2249 +{
2250 + pgoff_t index = *ra_size;
2251 +
2252 + *ra_size -= min(*ra_size, *la_size);
2253 + *ra_size = *ra_size * readahead_ratio / 100;
2254 + *la_size = index * readahead_ratio / 100;
2255 + *ra_size += *la_size;
2256 +
2257 + if (*ra_size > ra_max)
2258 + *ra_size = ra_max;
2259 + if (*la_size > *ra_size)
2260 + *la_size = *ra_size;
2261 +
2262 + return 1;
2263 +}
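
On plain numbers, with readahead_ratio at its documented default of 50, the aggressive adjustment above works out as in this userspace sketch (in the caller, the aggressive path is taken when ra_size equals the current index, so the input ra_size here is the file offset):

#include <stdio.h>

/* Mirror of the aggressive adjustment above on plain numbers. */
static void adjust_aggressive(unsigned long ra_max, unsigned readahead_ratio,
			      unsigned long *ra_size, unsigned long *la_size)
{
	unsigned long index = *ra_size;

	*ra_size -= (*la_size < *ra_size) ? *la_size : *ra_size;
	*ra_size = *ra_size * readahead_ratio / 100;
	*la_size = index * readahead_ratio / 100;
	*ra_size += *la_size;

	if (*ra_size > ra_max)
		*ra_size = ra_max;
	if (*la_size > *ra_size)
		*la_size = *ra_size;
}

int main(void)
{
	unsigned long ra = 200, la = 20;  /* 200 pages read so far, 20 cached ahead */

	adjust_aggressive(256, 50, &ra, &la);
	printf("ra_size=%lu la_size=%lu\n", ra, la);
	/* prints: ra_size=190 la_size=100 -- roughly half the chunk is look-ahead */
	return 0;
}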
2264 +
2265 +/*
2266 + * Main function for page context based read-ahead.
2267 + */
2268 +static inline int
2269 +try_context_based_readahead(struct address_space *mapping,
2270 + struct file_ra_state *ra, struct page *prev_page,
2271 + struct page *page, pgoff_t index,
2272 + unsigned long ra_min, unsigned long ra_max)
2273 +{
2274 + pgoff_t ra_index;
2275 + unsigned long ra_size;
2276 + unsigned long la_size;
2277 + unsigned long remain_pages;
2278 +
2279 + /* Where to start read-ahead?
2280 + * NFSv3 daemons may process adjacent requests in parallel,
2281 + * leading to many locally disordered, globally sequential reads.
2282 + * So do not require nearby history pages to be present or accessed.
2283 + */
2284 + if (page) {
2285 + ra_index = find_segtail(mapping, index, ra_max * 5 / 4);
2286 + if (!ra_index)
2287 + return -1;
2288 + } else if (prev_page || find_page(mapping, index - 1)) {
2289 + ra_index = index;
2290 + } else if (readahead_hit_rate > 1) {
2291 + ra_index = find_segtail_backward(mapping, index,
2292 + readahead_hit_rate + ra_min);
2293 + if (!ra_index)
2294 + return 0;
2295 + ra_min += 2 * (index - ra_index);
2296 + index = ra_index; /* pretend the request starts here */
2297 + } else
2298 + return 0;
2299 +
2300 + ra_size = query_page_cache_segment(mapping, ra, &remain_pages,
2301 + index, ra_min, ra_max);
2302 +
2303 + la_size = ra_index - index;
2304 + if (page && remain_pages <= la_size &&
2305 + remain_pages < index && la_size > 1) {
2306 + rescue_pages(page, la_size);
2307 + return -1;
2308 + }
2309 +
2310 + if (ra_size == index) {
2311 + if (!adjust_rala_aggressive(ra_max, &ra_size, &la_size))
2312 + return -1;
2313 + ra_set_class(ra, RA_CLASS_CONTEXT_AGGRESSIVE);
2314 + } else {
2315 + if (!adjust_rala(ra_max, &ra_size, &la_size))
2316 + return -1;
2317 + ra_set_class(ra, RA_CLASS_CONTEXT);
2318 + }
2319 +
2320 + ra_set_index(ra, index, ra_index);
2321 + ra_set_size(ra, ra_size, la_size);
2322 +
2323 + return 1;
2324 +}
2325 +
2326 +/*
2327 + * Read-ahead on start of file.
2328 + *
2329 + * The strategies here are most important for small files.
2330 + * 1. Set a moderately large read-ahead size;
2331 + * 2. Issue the next read-ahead request as soon as possible.
2332 + *
2333 + * But be careful, there are some applications that dip into only the very head
2334 + * of a file. The most important thing is to prevent them from triggering the
2335 + * next (much larger) read-ahead request, which leads to lots of cache misses.
2336 + * Two pages should be enough for them, correct me if I'm wrong.
2337 + */
2338 +static inline unsigned long
2339 +newfile_readahead(struct address_space *mapping,
2340 + struct file *filp, struct file_ra_state *ra,
2341 + unsigned long req_size, unsigned long ra_min)
2342 +{
2343 + unsigned long ra_size;
2344 + unsigned long la_size;
2345 +
2346 + if (req_size > ra_min) /* larger value risks thrashing */
2347 + req_size = ra_min;
2348 +
2349 + if (unlikely(ra->flags & RA_FLAG_NFSD)) {
2350 + ra_size = MIN_NFSD_PAGES;
2351 + la_size = 0;
2352 + } else {
2353 + ra_size = 4 * req_size;
2354 + la_size = 2 * req_size;
2355 + }
2356 +
2357 + ra_set_class(ra, RA_CLASS_NEWFILE);
2358 + ra_set_index(ra, 0, 0);
2359 + ra_set_size(ra, ra_size, la_size);
2360 +
2361 + return ra_dispatch(ra, mapping, filp);
2362 +}
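
On plain numbers the sizing above is simple: the first request is clamped to ra_min, quadrupled for the read-ahead, and a look-ahead mark is placed half-way through the chunk. A throwaway sketch (the ra_min value is made up; the NFSD branch and MIN_NFSD_PAGES are not modelled here):

#include <stdio.h>

int main(void)
{
	unsigned long ra_min = 8;	/* pages, a typical small-memory bound */
	unsigned long req = 32;		/* pages the caller asked for */

	if (req > ra_min)		/* larger value risks thrashing */
		req = ra_min;
	printf("ra_size=%lu la_size=%lu (look-ahead mark at page %lu)\n",
	       4 * req, 2 * req, 4 * req - 2 * req);
	/* prints: ra_size=32 la_size=16 (look-ahead mark at page 16) */
	return 0;
}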
2363 +
2364 +/*
2365 + * Backward prefetching.
2366 + * No look-ahead or thrashing-threshold estimation for the stepping-backward
2367 + * pattern: they should be unnecessary.
2368 + */
2369 +static inline int
2370 +try_read_backward(struct file_ra_state *ra, pgoff_t begin_index,
2371 + unsigned long ra_size, unsigned long ra_max)
2372 +{
2373 + pgoff_t end_index;
2374 +
2375 + /* Are we reading backward? */
2376 + if (begin_index > ra->prev_page)
2377 + return 0;
2378 +
2379 + if ((ra->flags & RA_CLASS_MASK) == RA_CLASS_BACKWARD &&
2380 + ra_has_index(ra, ra->prev_page)) {
2381 + ra_size += 2 * ra_cache_hit(ra, 0);
2382 + end_index = ra->la_index;
2383 + } else {
2384 + ra_size += ra_size + ra_size * (readahead_hit_rate - 1) / 2;
2385 + end_index = ra->prev_page;
2386 + }
2387 +
2388 + if (ra_size > ra_max)
2389 + ra_size = ra_max;
2390 +
2391 + /* Read traces close enough to be covered by the prefetching? */
2392 + if (end_index > begin_index + ra_size)
2393 + return 0;
2394 +
2395 + begin_index = end_index - ra_size;
2396 +
2397 + ra_set_class(ra, RA_CLASS_BACKWARD);
2398 + ra_set_index(ra, begin_index, begin_index);
2399 + ra_set_size(ra, ra_size, 0);
2400 +
2401 + return 1;
2402 +}
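
For the first time a backward pattern is detected (the non-RA_CLASS_BACKWARD branch above), the window arithmetic works out as in this userspace sketch, with readahead_hit_rate assumed at its documented default of 2 and all other numbers made up:

#include <stdio.h>

int main(void)
{
	unsigned long prev_page = 1000;		/* last page accessed */
	unsigned long begin_index = 990;	/* current read starts here */
	unsigned long ra_size = 8;		/* pages in the current read */
	unsigned long ra_max = 256;
	unsigned readahead_hit_rate = 2;

	unsigned long end_index = prev_page;

	ra_size += ra_size + ra_size * (readahead_hit_rate - 1) / 2;
	if (ra_size > ra_max)
		ra_size = ra_max;

	if (end_index > begin_index + ra_size) {
		printf("traces too far apart, no backward readahead\n");
		return 0;
	}
	printf("prefetch [%lu, %lu)\n", end_index - ra_size, end_index);
	/* prints: prefetch [980, 1000) */
	return 0;
}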
2403 +
2404 +/*
2405 + * Readahead thrashing recovery.
2406 + */
2407 +static inline unsigned long
2408 +thrashing_recovery_readahead(struct address_space *mapping,
2409 + struct file *filp, struct file_ra_state *ra,
2410 + pgoff_t index, unsigned long ra_max)
2411 +{
2412 + unsigned long ra_size;
2413 +
2414 + if (readahead_debug_level && find_page(mapping, index - 1))
2415 + ra_account(ra, RA_EVENT_READAHEAD_MUTILATE,
2416 + ra->readahead_index - index);
2417 + ra_account(ra, RA_EVENT_READAHEAD_THRASHING,
2418 + ra->readahead_index - index);
2419 +
2420 + /*
2421 +	 * Some thrashing occurs in (ra_index, la_index], in which case the
2422 + * old read-ahead chunk is lost soon after the new one is allocated.
2423 + * Ensure that we recover all needed pages in the old chunk.
2424 + */
2425 + if (index < ra->ra_index)
2426 + ra_size = ra->ra_index - index;
2427 + else {
2428 + /* After thrashing, we know the exact thrashing-threshold. */
2429 + ra_size = ra_cache_hit(ra, 0);
2430 +
2431 + /* And we'd better be a bit conservative. */
2432 + ra_size = ra_size * 3 / 4;
2433 + }
2434 +
2435 + if (ra_size > ra_max)
2436 + ra_size = ra_max;
2437 +
2438 + ra_set_class(ra, RA_CLASS_THRASHING);
2439 + ra_set_index(ra, index, index);
2440 + ra_set_size(ra, ra_size, ra_size / LOOKAHEAD_RATIO);
2441 +
2442 + return ra_dispatch(ra, mapping, filp);
2443 +}
2444 +
2445 +/*
2446 + * If the previous access was a sequential read, the read at the new
2447 + * position is likely to be another sequential read.
2448 + * Databases are known to have this seek-and-read-one-block pattern.
2449 + */
2450 +static inline int
2451 +try_readahead_on_seek(struct file_ra_state *ra, pgoff_t index,
2452 + unsigned long ra_size, unsigned long ra_max)
2453 +{
2454 + unsigned long hit0 = ra_cache_hit(ra, 0);
2455 + unsigned long hit1 = ra_cache_hit(ra, 1) + hit0;
2456 + unsigned long hit2 = ra_cache_hit(ra, 2);
2457 + unsigned long hit3 = ra_cache_hit(ra, 3);
2458 +
2459 + /* There's a previous read-ahead request? */
2460 + if (!ra_has_index(ra, ra->prev_page))
2461 + return 0;
2462 +
2463 +	/* The previous read-ahead sequences have similar sizes? */
2464 + if (!(ra_size < hit1 && hit1 > hit2 / 2 &&
2465 + hit2 > hit3 / 2 &&
2466 + hit3 > hit1 / 2))
2467 + return 0;
2468 +
2469 + hit1 = max(hit1, hit2);
2470 +
2471 + /* Follow the same prefetching direction. */
2472 + if ((ra->flags & RA_CLASS_MASK) == RA_CLASS_BACKWARD)
2473 + index = ((index > hit1 - ra_size) ? index - hit1 + ra_size : 0);
2474 +
2475 + ra_size = min(hit1, ra_max);
2476 +
2477 + ra_set_class(ra, RA_CLASS_SEEK);
2478 + ra_set_index(ra, index, index);
2479 + ra_set_size(ra, ra_size, 0);
2480 +
2481 + return 1;
2482 +}
2483 +
2484 +/*
2485 + * ra_min is mainly determined by the size of cache memory.
2486 + * Table of concrete numbers for 4KB page size:
2487 + * inactive + free (MB): 4 8 16 32 64 128 256 512 1024
2488 + * ra_min (KB): 16 16 16 16 20 24 32 48 64
2489 + */
2490 +static inline void get_readahead_bounds(struct file_ra_state *ra,
2491 + unsigned long *ra_min,
2492 + unsigned long *ra_max)
2493 +{
2494 + unsigned long pages;
2495 +
2496 + pages = max_sane_readahead(KB(1024*1024));
2497 + *ra_max = min(min(pages, 0xFFFFUL), ra->ra_pages);
2498 + *ra_min = min(min(MIN_RA_PAGES + (pages>>13), KB(128)), *ra_max/2);
2499 +}
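
A userspace sketch of the clamping above, under stated assumptions: KB(x) is taken to mean "x KB expressed in 4KB pages", MIN_RA_PAGES and the per-file ra_pages get placeholder values (both are defined elsewhere), and max_sane_readahead() is replaced by a fixed bound:

#include <stdio.h>

#define KB(x)        ((x) / 4)	/* assumes 4KB pages */
#define MIN_RA_PAGES 4		/* placeholder, not the kernel constant */

int main(void)
{
	unsigned long ra_pages = 32;		/* per-file default window */
	unsigned long pages = KB(64 * 1024);	/* assumed sane-readahead bound: 64MB */
	unsigned long ra_max, ra_min;

	ra_max = pages < 0xFFFFUL ? pages : 0xFFFFUL;
	ra_max = ra_max < ra_pages ? ra_max : ra_pages;

	ra_min = MIN_RA_PAGES + (pages >> 13);
	if (ra_min > KB(128))
		ra_min = KB(128);
	if (ra_min > ra_max / 2)
		ra_min = ra_max / 2;

	printf("ra_min=%lu pages, ra_max=%lu pages\n", ra_min, ra_max);
	/* prints: ra_min=6 pages, ra_max=32 pages */
	return 0;
}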
2500 +
2501 +/**
2502 + * page_cache_readahead_adaptive - adaptive read-ahead main function
2503 + * @mapping, @ra, @filp: the same as page_cache_readahead()
2504 + * @prev_page: the page at @index-1, may be NULL to let the function find it
2505 + * @page: the page at @index, or NULL if non-present
2506 + * @begin_index, @index, @end_index: offsets into @mapping
2507 + * [@begin_index, @end_index) is the read the caller is performing
2508 + * @index indicates the page to be read now
2509 + *
2510 + * page_cache_readahead_adaptive() is the entry point of the adaptive
2511 + * read-ahead logic. It tries a set of methods in turn to determine the
2512 + * appropriate readahead action and submits the readahead I/O.
2513 + *
2514 + * The caller is expected to point ra->prev_page to the previously accessed
2515 + * page, and to call it under two conditions:
2516 + * 1. @page == NULL
2517 + * A cache miss happened, some pages have to be read in
2518 + * 2. @page != NULL && PageReadahead(@page)
2519 + *	A look-ahead mark was encountered; it was set by a previous read-ahead
2520 + *	invocation to instruct the caller to give this function a chance to
2521 + *	check up and do the next read-ahead in advance.
2522 + */
2523 +unsigned long
2524 +page_cache_readahead_adaptive(struct address_space *mapping,
2525 + struct file_ra_state *ra, struct file *filp,
2526 + struct page *prev_page, struct page *page,
2527 + pgoff_t begin_index, pgoff_t index, pgoff_t end_index)
2528 +{
2529 + unsigned long size;
2530 + unsigned long ra_min;
2531 + unsigned long ra_max;
2532 + int ret;
2533 +
2534 + might_sleep();
2535 +
2536 + if (page) {
2537 +		if (!TestClearPageReadahead(page))
2538 + return 0;
2539 + if (bdi_read_congested(mapping->backing_dev_info)) {
2540 + ra_account(ra, RA_EVENT_IO_CONGESTION,
2541 + end_index - index);
2542 + return 0;
2543 + }
2544 + if (laptop_mode && laptop_spinned_down()) {
2545 + if (!renew_lookahead(mapping, ra, index,
2546 + index + LAPTOP_POLL_INTERVAL))
2547 + return 0;
2548 + }
2549 + }
2550 +
2551 + if (page)
2552 + ra_account(ra, RA_EVENT_LOOKAHEAD_HIT,
2553 + ra->readahead_index - ra->lookahead_index);
2554 + else if (index)
2555 + ra_account(ra, RA_EVENT_CACHE_MISS, end_index - begin_index);
2556 +
2557 + size = end_index - index;
2558 + get_readahead_bounds(ra, &ra_min, &ra_max);
2559 +
2560 + /* readahead disabled? */
2561 + if (unlikely(!ra_max || !readahead_ratio)) {
2562 + size = max_sane_readahead(size);
2563 + goto readit;
2564 + }
2565 +
2566 + /*
2567 + * Start of file.
2568 + */
2569 + if (index == 0)
2570 + return newfile_readahead(mapping, filp, ra, end_index, ra_min);
2571 +
2572 + /*
2573 + * State based sequential read-ahead.
2574 + */
2575 + if (!disable_stateful_method &&
2576 + index == ra->lookahead_index && ra_cache_hit_ok(ra))
2577 + return state_based_readahead(mapping, filp, ra, page,
2578 + index, size, ra_max);
2579 +
2580 + /*
2581 + * Recover from possible thrashing.
2582 + */
2583 + if (!page && index == ra->prev_page + 1 && ra_has_index(ra, index))
2584 + return thrashing_recovery_readahead(mapping, filp, ra,
2585 + index, ra_max);
2586 +
2587 + /*
2588 + * Backward read-ahead.
2589 + */
2590 + if (!page && begin_index == index &&
2591 + try_read_backward(ra, index, size, ra_max))
2592 + return ra_dispatch(ra, mapping, filp);
2593 +
2594 + /*
2595 + * Context based sequential read-ahead.
2596 + */
2597 + ret = try_context_based_readahead(mapping, ra, prev_page, page,
2598 + index, ra_min, ra_max);
2599 + if (ret > 0)
2600 + return ra_dispatch(ra, mapping, filp);
2601 + if (ret < 0)
2602 + return 0;
2603 +
2604 +	/* No action at look-ahead time? */
2605 + if (page) {
2606 + ra_account(ra, RA_EVENT_LOOKAHEAD_NOACTION,
2607 + ra->readahead_index - index);
2608 + return 0;
2609 + }
2610 +
2611 + /*
2612 + * Random read that follows a sequential one.
2613 + */
2614 + if (try_readahead_on_seek(ra, index, size, ra_max))
2615 + return ra_dispatch(ra, mapping, filp);
2616 +
2617 + /*
2618 + * Random read.
2619 + */
2620 + if (size > ra_max)
2621 + size = ra_max;
2622 +
2623 +readit:
2624 + size = __do_page_cache_readahead(mapping, filp, index, size, 0);
2625 +
2626 + ra_account(ra, RA_EVENT_READRANDOM, size);
2627 + dprintk("readrandom(ino=%lu, pages=%lu, index=%lu-%lu-%lu) = %lu\n",
2628 + mapping->host->i_ino, mapping->nrpages,
2629 + begin_index, index, end_index, size);
2630 +
2631 + return size;
2632 +}
2633 +
2634 +/**
2635 + * readahead_cache_hit - adaptive read-ahead feedback function
2636 + * @ra: file_ra_state which holds the readahead state
2637 + * @page: the page just accessed
2638 + *
2639 + * readahead_cache_hit() is the feedback route of the adaptive read-ahead
2640 + * logic. It must be called on every access on the read-ahead pages.
2641 + */
2642 +void fastcall readahead_cache_hit(struct file_ra_state *ra, struct page *page)
2643 +{
2644 + if (PageActive(page) || PageReferenced(page))
2645 + return;
2646 +
2647 + if (!PageUptodate(page))
2648 + ra_account(ra, RA_EVENT_IO_BLOCK, 1);
2649 +
2650 + if (!ra_has_index(ra, page->index))
2651 + return;
2652 +
2653 + ra->cache_hits++;
2654 +
2655 + if (page->index >= ra->ra_index)
2656 + ra_account(ra, RA_EVENT_READAHEAD_HIT, 1);
2657 + else
2658 + ra_account(ra, RA_EVENT_READAHEAD_HIT, -1);
2659 +}
2660 +
2661 +#endif /* CONFIG_ADAPTIVE_READAHEAD */
2662 Index: linux-2.6.16-ck1/mm/swap.c
2663 ===================================================================
2664 --- linux-2.6.16-ck1.orig/mm/swap.c 2006-03-20 20:46:55.000000000 +1100
2665 +++ linux-2.6.16-ck1/mm/swap.c 2006-03-20 20:47:04.000000000 +1100
2666 @@ -128,6 +128,8 @@ void fastcall mark_page_accessed(struct
2667 ClearPageReferenced(page);
2668 } else if (!PageReferenced(page)) {
2669 SetPageReferenced(page);
2670 + if (PageLRU(page))
2671 + inc_readahead_aging();
2672 }
2673 }
2674
2675 Index: linux-2.6.16-ck1/mm/vmscan.c
2676 ===================================================================
2677 --- linux-2.6.16-ck1.orig/mm/vmscan.c 2006-03-20 20:47:00.000000000 +1100
2678 +++ linux-2.6.16-ck1/mm/vmscan.c 2006-03-20 20:47:04.000000000 +1100
2679 @@ -458,6 +458,9 @@ static int shrink_list(struct list_head
2680 if (PageWriteback(page))
2681 goto keep_locked;
2682
2683 + if (!PageReferenced(page))
2684 + inc_readahead_aging();
2685 +
2686 referenced = page_referenced(page, 1);
2687 /* In active use or really unfreeable? Activate it. */
2688 if (referenced && page_mapping_inuse(page))