Contents of /trunk/kernel26-magellan/patches-2.6.16-r12/0029-2.6.16-adaptive-readahead-11.patch
Revision 72
Mon Jun 5 09:25:38 2006 UTC (18 years, 3 months ago) by niro
File size: 82008 byte(s)
ver bump to 2.6.16-r12: updated to linux-2.6.16.19; updated to ck11
1 | --- |
2 | Documentation/sysctl/vm.txt | 36 + |
3 | drivers/block/loop.c | 6 |
4 | fs/mpage.c | 4 |
5 | fs/nfsd/vfs.c | 6 |
6 | include/linux/fs.h | 41 - |
7 | include/linux/mm.h | 31 |
8 | include/linux/page-flags.h | 5 |
9 | include/linux/radix-tree.h | 82 ++ |
10 | include/linux/sysctl.h | 2 |
11 | include/linux/writeback.h | 6 |
12 | kernel/sysctl.c | 28 |
13 | lib/radix-tree.c | 208 +++++- |
14 | mm/Kconfig | 55 + |
15 | mm/filemap.c | 86 ++ |
16 | mm/memory.c | 1 |
17 | mm/page-writeback.c | 2 |
18 | mm/page_alloc.c | 2 |
19 | mm/readahead.c | 1519 +++++++++++++++++++++++++++++++++++++++++++- |
20 | mm/swap.c | 2 |
21 | mm/vmscan.c | 3 |
22 | 20 files changed, 2062 insertions(+), 63 deletions(-) |
23 | |
24 | Index: linux-2.6.16-ck1/Documentation/sysctl/vm.txt |
25 | =================================================================== |
26 | --- linux-2.6.16-ck1.orig/Documentation/sysctl/vm.txt 2006-03-20 20:47:01.000000000 +1100 |
27 | +++ linux-2.6.16-ck1/Documentation/sysctl/vm.txt 2006-03-20 20:47:04.000000000 +1100 |
28 | @@ -30,6 +30,8 @@ Currently, these files are in /proc/sys/ |
29 | - zone_reclaim_mode |
30 | - zone_reclaim_interval |
31 | - swap_prefetch |
32 | +- readahead_ratio |
33 | +- readahead_hit_rate |
34 | |
35 | ============================================================== |
36 | |
37 | @@ -204,3 +206,37 @@ swap_prefetch unset and then it is enabl |
38 | prefetched. |
39 | |
40 | The default value is 1. |
41 | + |
42 | +============================================================== |
43 | + |
44 | +readahead_ratio |
45 | + |
46 | +This limits the readahead size to a percentage of the thrashing-threshold, |
47 | +which is dynamically estimated from the _past_ read speed and |
48 | +system load, and is used to deduce the _future_ readahead request size. |
49 | + |
50 | +Set it to a smaller value if you do not have enough memory for all the |
51 | +concurrent readers, or if the I/O load fluctuates a lot. But if there is |
52 | +plenty of memory (>2MB per reader), enlarging it may help speed up reads. |
53 | + |
54 | +readahead_ratio also selects the readahead logic: |
55 | +0: disable readahead totally |
56 | +1-9: select the stock readahead logic |
57 | +10-inf: select the adaptive readahead logic |
58 | + |
59 | +The default value is 50; reasonable values would be 50-100. |
60 | + |
61 | +============================================================== |
62 | + |
63 | +readahead_hit_rate |
64 | + |
65 | +This is the max allowed value of (readahead-pages : accessed-pages). |
66 | +Useful only when (readahead_ratio >= 10). If the previous readahead |
67 | +request has a bad hit rate, the kernel will be reluctant to do the next |
68 | +readahead. |
69 | + |
70 | +A larger value helps catch more sparse access patterns. Be aware that |
71 | +readahead of sparse patterns sacrifices memory for speed. |
72 | + |
73 | +The default value is 2. |
74 | +It is recommended to keep the value below (max-readahead-pages / 8). |
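Taken together, the two tunables above bound every request: readahead_ratio scales the estimated thrashing-threshold into a size cap, while readahead_hit_rate vetoes further readahead when too few previously read-ahead pages were actually accessed. A minimal sketch of that relation (the helper names and the thrashing_threshold variable are illustrative, not part of this patch):

    /*
     * Illustrative only: how the two sysctls above shape a request.
     * 'thrashing_threshold' stands in for the patch's dynamic estimate.
     */
    static unsigned long ra_request_size(unsigned long thrashing_threshold,
                                         unsigned long max_ra_pages)
    {
            unsigned long size = thrashing_threshold * readahead_ratio / 100;

            return min(size, max_ra_pages);
    }

    static int ra_hit_rate_ok(unsigned long ra_pages, unsigned long hit_pages)
    {
            /* allow at most readahead_hit_rate readahead pages per hit */
            return ra_pages <= hit_pages * readahead_hit_rate;
    }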
75 | Index: linux-2.6.16-ck1/drivers/block/loop.c |
76 | =================================================================== |
77 | --- linux-2.6.16-ck1.orig/drivers/block/loop.c 2006-03-20 20:46:23.000000000 +1100 |
78 | +++ linux-2.6.16-ck1/drivers/block/loop.c 2006-03-20 20:47:04.000000000 +1100 |
79 | @@ -779,6 +779,12 @@ static int loop_set_fd(struct loop_devic |
80 | mapping = file->f_mapping; |
81 | inode = mapping->host; |
82 | |
83 | + /* |
84 | + * The upper layer should already do proper look-ahead, |
85 | + * one more look-ahead here only ruins the cache hit rate. |
86 | + */ |
87 | + file->f_ra.flags |= RA_FLAG_NO_LOOKAHEAD; |
88 | + |
89 | if (!(file->f_mode & FMODE_WRITE)) |
90 | lo_flags |= LO_FLAGS_READ_ONLY; |
91 | |
92 | Index: linux-2.6.16-ck1/fs/mpage.c |
93 | =================================================================== |
94 | --- linux-2.6.16-ck1.orig/fs/mpage.c 2006-03-20 20:46:23.000000000 +1100 |
95 | +++ linux-2.6.16-ck1/fs/mpage.c 2006-03-20 20:47:04.000000000 +1100 |
96 | @@ -343,8 +343,10 @@ mpage_readpages(struct address_space *ma |
97 | bio = do_mpage_readpage(bio, page, |
98 | nr_pages - page_idx, |
99 | &last_block_in_bio, get_block); |
100 | - if (!pagevec_add(&lru_pvec, page)) |
101 | + if (!pagevec_add(&lru_pvec, page)) { |
102 | + cond_resched(); |
103 | __pagevec_lru_add(&lru_pvec); |
104 | + } |
105 | } else { |
106 | page_cache_release(page); |
107 | } |
108 | Index: linux-2.6.16-ck1/fs/nfsd/vfs.c |
109 | =================================================================== |
110 | --- linux-2.6.16-ck1.orig/fs/nfsd/vfs.c 2006-03-20 20:46:23.000000000 +1100 |
111 | +++ linux-2.6.16-ck1/fs/nfsd/vfs.c 2006-03-20 20:47:04.000000000 +1100 |
112 | @@ -833,10 +833,14 @@ nfsd_vfs_read(struct svc_rqst *rqstp, st |
113 | #endif |
114 | |
115 | /* Get readahead parameters */ |
116 | - ra = nfsd_get_raparms(inode->i_sb->s_dev, inode->i_ino); |
117 | + if (prefer_adaptive_readahead()) |
118 | + ra = NULL; |
119 | + else |
120 | + ra = nfsd_get_raparms(inode->i_sb->s_dev, inode->i_ino); |
121 | |
122 | if (ra && ra->p_set) |
123 | file->f_ra = ra->p_ra; |
124 | + file->f_ra.flags |= RA_FLAG_NFSD; |
125 | |
126 | if (file->f_op->sendfile) { |
127 | svc_pushback_unused_pages(rqstp); |
128 | Index: linux-2.6.16-ck1/include/linux/fs.h |
129 | =================================================================== |
130 | --- linux-2.6.16-ck1.orig/include/linux/fs.h 2006-03-20 20:46:23.000000000 +1100 |
131 | +++ linux-2.6.16-ck1/include/linux/fs.h 2006-03-20 20:47:04.000000000 +1100 |
132 | @@ -600,19 +600,40 @@ struct fown_struct { |
133 | * Track a single file's readahead state |
134 | */ |
135 | struct file_ra_state { |
136 | - unsigned long start; /* Current window */ |
137 | - unsigned long size; |
138 | - unsigned long flags; /* ra flags RA_FLAG_xxx*/ |
139 | - unsigned long cache_hit; /* cache hit count*/ |
140 | - unsigned long prev_page; /* Cache last read() position */ |
141 | - unsigned long ahead_start; /* Ahead window */ |
142 | - unsigned long ahead_size; |
143 | - unsigned long ra_pages; /* Maximum readahead window */ |
144 | - unsigned long mmap_hit; /* Cache hit stat for mmap accesses */ |
145 | - unsigned long mmap_miss; /* Cache miss stat for mmap accesses */ |
146 | + union { |
147 | + struct { /* conventional read-ahead */ |
148 | + unsigned long start; /* Current window */ |
149 | + unsigned long size; |
150 | + unsigned long ahead_start; /* Ahead window */ |
151 | + unsigned long ahead_size; |
152 | + unsigned long cache_hit; /* cache hit count */ |
153 | + }; |
154 | +#ifdef CONFIG_ADAPTIVE_READAHEAD |
155 | + struct { /* adaptive read-ahead */ |
156 | + pgoff_t la_index; |
157 | + pgoff_t ra_index; |
158 | + pgoff_t lookahead_index; |
159 | + pgoff_t readahead_index; |
160 | + unsigned long age; |
161 | + uint64_t cache_hits; |
162 | + }; |
163 | +#endif |
164 | + }; |
165 | + |
166 | + /* mmap read-around */ |
167 | + unsigned long mmap_hit; /* Cache hit stat for mmap accesses */ |
168 | + unsigned long mmap_miss; /* Cache miss stat for mmap accesses */ |
169 | + |
170 | + /* common ones */ |
171 | + unsigned long flags; /* ra flags RA_FLAG_xxx*/ |
172 | + unsigned long prev_page; /* Cache last read() position */ |
173 | + unsigned long ra_pages; /* Maximum readahead window */ |
174 | }; |
175 | #define RA_FLAG_MISS 0x01 /* a cache miss occured against this file */ |
176 | #define RA_FLAG_INCACHE 0x02 /* file is already in cache */ |
177 | +#define RA_FLAG_MMAP (1UL<<31) /* mmaped page access */ |
178 | +#define RA_FLAG_NO_LOOKAHEAD (1UL<<30) /* disable look-ahead */ |
179 | +#define RA_FLAG_NFSD (1UL<<29) /* request from nfsd */ |
180 | |
181 | struct file { |
182 | /* |
183 | Index: linux-2.6.16-ck1/include/linux/mm.h |
184 | =================================================================== |
185 | --- linux-2.6.16-ck1.orig/include/linux/mm.h 2006-03-20 20:46:23.000000000 +1100 |
186 | +++ linux-2.6.16-ck1/include/linux/mm.h 2006-03-20 20:47:04.000000000 +1100 |
187 | @@ -954,7 +954,11 @@ extern int filemap_populate(struct vm_ar |
188 | int write_one_page(struct page *page, int wait); |
189 | |
190 | /* readahead.c */ |
191 | +#ifdef CONFIG_ADAPTIVE_READAHEAD |
192 | +#define VM_MAX_READAHEAD 1024 /* kbytes */ |
193 | +#else |
194 | #define VM_MAX_READAHEAD 128 /* kbytes */ |
195 | +#endif |
196 | #define VM_MIN_READAHEAD 16 /* kbytes (includes current page) */ |
197 | #define VM_MAX_CACHE_HIT 256 /* max pages in a row in cache before |
198 | * turning readahead off */ |
199 | @@ -971,6 +975,33 @@ unsigned long page_cache_readahead(struc |
200 | void handle_ra_miss(struct address_space *mapping, |
201 | struct file_ra_state *ra, pgoff_t offset); |
202 | unsigned long max_sane_readahead(unsigned long nr); |
203 | +unsigned long |
204 | +page_cache_readahead_adaptive(struct address_space *mapping, |
205 | + struct file_ra_state *ra, struct file *filp, |
206 | + struct page *prev_page, struct page *page, |
207 | + pgoff_t first_index, pgoff_t index, pgoff_t last_index); |
208 | + |
209 | +#ifdef CONFIG_ADAPTIVE_READAHEAD |
210 | +void fastcall readahead_cache_hit(struct file_ra_state *ra, struct page *page); |
211 | +extern int readahead_ratio; |
212 | +#else |
213 | +#define readahead_cache_hit(ra, page) do { } while (0) |
214 | +#define readahead_ratio 1 |
215 | +#endif /* CONFIG_ADAPTIVE_READAHEAD */ |
216 | + |
217 | +static inline int prefer_adaptive_readahead(void) |
218 | +{ |
219 | + return readahead_ratio >= 10; |
220 | +} |
221 | + |
222 | +DECLARE_PER_CPU(unsigned long, readahead_aging); |
223 | +static inline void inc_readahead_aging(void) |
224 | +{ |
225 | + if (prefer_adaptive_readahead()) { |
226 | + per_cpu(readahead_aging, get_cpu())++; |
227 | + put_cpu(); |
228 | + } |
229 | +} |
230 | |
231 | /* Do stack extension */ |
232 | extern int expand_stack(struct vm_area_struct *vma, unsigned long address); |
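inc_readahead_aging() above only bumps a per-CPU counter; whatever estimates the thrashing-threshold needs the system-wide total. A sketch of how such a counter is typically summed (the helper name is illustrative; the patch performs its own equivalent aggregation in mm/readahead.c):

    /* Illustrative: total the per-CPU aging counter declared above. */
    static unsigned long readahead_aging_sum(void)
    {
            unsigned long sum = 0;
            int cpu;

            for_each_online_cpu(cpu)
                    sum += per_cpu(readahead_aging, cpu);

            return sum;     /* contributions of offline CPUs ignored here */
    }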
233 | Index: linux-2.6.16-ck1/include/linux/page-flags.h |
234 | =================================================================== |
235 | --- linux-2.6.16-ck1.orig/include/linux/page-flags.h 2006-03-20 20:46:23.000000000 +1100 |
236 | +++ linux-2.6.16-ck1/include/linux/page-flags.h 2006-03-20 20:47:04.000000000 +1100 |
237 | @@ -75,6 +75,7 @@ |
238 | #define PG_reclaim 17 /* To be reclaimed asap */ |
239 | #define PG_nosave_free 18 /* Free, should not be written */ |
240 | #define PG_uncached 19 /* Page has been mapped as uncached */ |
241 | +#define PG_readahead 20 /* Reminder to do readahead */ |
242 | |
243 | /* |
244 | * Global page accounting. One instance per CPU. Only unsigned longs are |
245 | @@ -344,6 +345,10 @@ extern void __mod_page_state_offset(unsi |
246 | #define SetPageUncached(page) set_bit(PG_uncached, &(page)->flags) |
247 | #define ClearPageUncached(page) clear_bit(PG_uncached, &(page)->flags) |
248 | |
249 | +#define PageReadahead(page) test_bit(PG_readahead, &(page)->flags) |
250 | +#define __SetPageReadahead(page) __set_bit(PG_readahead, &(page)->flags) |
251 | +#define TestClearPageReadahead(page) test_and_clear_bit(PG_readahead, &(page)->flags) |
252 | + |
253 | struct page; /* forward declaration */ |
254 | |
255 | int test_clear_page_dirty(struct page *page); |
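PG_readahead is a one-shot marker: __do_page_cache_readahead() (see the mm/readahead.c hunk below) tags one page per chunk with __SetPageReadahead(), and the first reader to find the mark triggers the next chunk. A condensed sketch of the consuming side, mirroring the mmap path in the mm/filemap.c hunk below (arguments simplified):

    struct page *page = find_get_page(mapping, index);

    if (page && PageReadahead(page)) {
            /* look-ahead mark hit: submit the next chunk in advance */
            page_cache_readahead_adaptive(mapping, ra, filp, NULL, page,
                                          index, index, index + 1);
    }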
256 | Index: linux-2.6.16-ck1/include/linux/radix-tree.h |
257 | =================================================================== |
258 | --- linux-2.6.16-ck1.orig/include/linux/radix-tree.h 2006-03-20 20:46:23.000000000 +1100 |
259 | +++ linux-2.6.16-ck1/include/linux/radix-tree.h 2006-03-20 20:47:04.000000000 +1100 |
260 | @@ -23,12 +23,24 @@ |
261 | #include <linux/preempt.h> |
262 | #include <linux/types.h> |
263 | |
264 | +#define RADIX_TREE_MAP_SHIFT 6 |
265 | +#define RADIX_TREE_MAP_SIZE (1UL << RADIX_TREE_MAP_SHIFT) |
266 | +#define RADIX_TREE_MAP_MASK (RADIX_TREE_MAP_SIZE-1) |
267 | + |
268 | struct radix_tree_root { |
269 | unsigned int height; |
270 | gfp_t gfp_mask; |
271 | struct radix_tree_node *rnode; |
272 | }; |
273 | |
274 | +/* |
275 | + * Lookaside cache to support access patterns with strong locality. |
276 | + */ |
277 | +struct radix_tree_cache { |
278 | + unsigned long first_index; |
279 | + struct radix_tree_node *tree_node; |
280 | +}; |
281 | + |
282 | #define RADIX_TREE_INIT(mask) { \ |
283 | .height = 0, \ |
284 | .gfp_mask = (mask), \ |
285 | @@ -46,9 +58,18 @@ do { \ |
286 | } while (0) |
287 | |
288 | int radix_tree_insert(struct radix_tree_root *, unsigned long, void *); |
289 | -void *radix_tree_lookup(struct radix_tree_root *, unsigned long); |
290 | -void **radix_tree_lookup_slot(struct radix_tree_root *, unsigned long); |
291 | +void *radix_tree_lookup_node(struct radix_tree_root *, unsigned long, |
292 | + unsigned int); |
293 | +void **radix_tree_lookup_slot(struct radix_tree_root *root, unsigned long); |
294 | void *radix_tree_delete(struct radix_tree_root *, unsigned long); |
295 | +unsigned int radix_tree_cache_count(struct radix_tree_cache *cache); |
296 | +void *radix_tree_cache_lookup_node(struct radix_tree_root *root, |
297 | + struct radix_tree_cache *cache, |
298 | + unsigned long index, unsigned int level); |
299 | +unsigned long radix_tree_scan_hole_backward(struct radix_tree_root *root, |
300 | + unsigned long index, unsigned long max_scan); |
301 | +unsigned long radix_tree_scan_hole(struct radix_tree_root *root, |
302 | + unsigned long index, unsigned long max_scan); |
303 | unsigned int |
304 | radix_tree_gang_lookup(struct radix_tree_root *root, void **results, |
305 | unsigned long first_index, unsigned int max_items); |
306 | @@ -70,4 +91,61 @@ static inline void radix_tree_preload_en |
307 | preempt_enable(); |
308 | } |
309 | |
310 | +/** |
311 | + * radix_tree_lookup - perform lookup operation on a radix tree |
312 | + * @root: radix tree root |
313 | + * @index: index key |
314 | + * |
315 | + * Lookup the item at the position @index in the radix tree @root. |
316 | + */ |
317 | +static inline void *radix_tree_lookup(struct radix_tree_root *root, |
318 | + unsigned long index) |
319 | +{ |
320 | + return radix_tree_lookup_node(root, index, 0); |
321 | +} |
322 | + |
323 | +/** |
324 | + * radix_tree_cache_init - init a look-aside cache |
325 | + * @cache: look-aside cache |
326 | + * |
327 | + * Init the radix tree look-aside cache @cache. |
328 | + */ |
329 | +static inline void radix_tree_cache_init(struct radix_tree_cache *cache) |
330 | +{ |
331 | + cache->first_index = RADIX_TREE_MAP_MASK; |
332 | + cache->tree_node = NULL; |
333 | +} |
334 | + |
335 | +/** |
336 | + * radix_tree_cache_lookup - cached lookup on a radix tree |
337 | + * @root: radix tree root |
338 | + * @cache: look-aside cache |
339 | + * @index: index key |
340 | + * |
341 | + * Lookup the item at the position @index in the radix tree @root, |
342 | + * and make use of @cache to speedup the lookup process. |
343 | + */ |
344 | +static inline void *radix_tree_cache_lookup(struct radix_tree_root *root, |
345 | + struct radix_tree_cache *cache, |
346 | + unsigned long index) |
347 | +{ |
348 | + return radix_tree_cache_lookup_node(root, cache, index, 0); |
349 | +} |
350 | + |
351 | +static inline unsigned int radix_tree_cache_size(struct radix_tree_cache *cache) |
352 | +{ |
353 | + return RADIX_TREE_MAP_SIZE; |
354 | +} |
355 | + |
356 | +static inline int radix_tree_cache_full(struct radix_tree_cache *cache) |
357 | +{ |
358 | + return radix_tree_cache_count(cache) == radix_tree_cache_size(cache); |
359 | +} |
360 | + |
361 | +static inline unsigned long |
362 | +radix_tree_cache_first_index(struct radix_tree_cache *cache) |
363 | +{ |
364 | + return cache->first_index; |
365 | +} |
366 | + |
367 | #endif /* _LINUX_RADIX_TREE_H */ |
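The look-aside cache pays off when successive lookups land in the same leaf node, e.g. when scanning consecutive page-cache indices. A usage sketch that honors the documented locking rule (the cache is only valid while the tree lock is held; 'start' and 'end' are placeholders):

    struct radix_tree_cache cache;
    pgoff_t i;

    radix_tree_cache_init(&cache);
    read_lock_irq(&mapping->tree_lock);
    for (i = start; i < end; i++) {
            struct page *page;

            page = radix_tree_cache_lookup(&mapping->page_tree, &cache, i);
            if (!page)
                    break;                  /* hit a hole */
            /* ... inspect page under the lock ... */
    }
    read_unlock_irq(&mapping->tree_lock);   /* cache is invalid from here on */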
368 | Index: linux-2.6.16-ck1/include/linux/sysctl.h |
369 | =================================================================== |
370 | --- linux-2.6.16-ck1.orig/include/linux/sysctl.h 2006-03-20 20:46:56.000000000 +1100 |
371 | +++ linux-2.6.16-ck1/include/linux/sysctl.h 2006-03-20 20:47:04.000000000 +1100 |
372 | @@ -191,6 +191,8 @@ enum |
373 | VM_ZONE_RECLAIM_INTERVAL=32, /* time period to wait after reclaim failure */ |
374 | VM_SWAP_PREFETCH=33, /* swap prefetch */ |
375 | VM_HARDMAPLIMIT=34, /* Make mapped a hard limit */ |
376 | + VM_READAHEAD_RATIO=35, /* percent of read-ahead size to thrashing-threshold */ |
377 | + VM_READAHEAD_HIT_RATE=36, /* one accessed page legitimizes so many read-ahead pages */ |
378 | }; |
379 | |
380 | |
381 | Index: linux-2.6.16-ck1/include/linux/writeback.h |
382 | =================================================================== |
383 | --- linux-2.6.16-ck1.orig/include/linux/writeback.h 2006-03-20 20:46:23.000000000 +1100 |
384 | +++ linux-2.6.16-ck1/include/linux/writeback.h 2006-03-20 20:47:04.000000000 +1100 |
385 | @@ -85,6 +85,12 @@ void laptop_io_completion(void); |
386 | void laptop_sync_completion(void); |
387 | void throttle_vm_writeout(void); |
388 | |
389 | +extern struct timer_list laptop_mode_wb_timer; |
390 | +static inline int laptop_spinned_down(void) |
391 | +{ |
392 | + return !timer_pending(&laptop_mode_wb_timer); |
393 | +} |
394 | + |
395 | /* These are exported to sysctl. */ |
396 | extern int dirty_background_ratio; |
397 | extern int vm_dirty_ratio; |
398 | Index: linux-2.6.16-ck1/kernel/sysctl.c |
399 | =================================================================== |
400 | --- linux-2.6.16-ck1.orig/kernel/sysctl.c 2006-03-20 20:46:56.000000000 +1100 |
401 | +++ linux-2.6.16-ck1/kernel/sysctl.c 2006-03-20 20:47:04.000000000 +1100 |
402 | @@ -74,6 +74,12 @@ extern int pid_max_min, pid_max_max; |
403 | extern int sysctl_drop_caches; |
404 | extern int percpu_pagelist_fraction; |
405 | |
406 | +#if defined(CONFIG_ADAPTIVE_READAHEAD) |
407 | +extern int readahead_ratio; |
408 | +extern int readahead_hit_rate; |
409 | +static int one = 1; |
410 | +#endif |
411 | + |
412 | #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) |
413 | int unknown_nmi_panic; |
414 | extern int proc_unknown_nmi_panic(ctl_table *, int, struct file *, |
415 | @@ -961,6 +967,28 @@ static ctl_table vm_table[] = { |
416 | .proc_handler = &proc_dointvec, |
417 | }, |
418 | #endif |
419 | +#ifdef CONFIG_ADAPTIVE_READAHEAD |
420 | + { |
421 | + .ctl_name = VM_READAHEAD_RATIO, |
422 | + .procname = "readahead_ratio", |
423 | + .data = &readahead_ratio, |
424 | + .maxlen = sizeof(readahead_ratio), |
425 | + .mode = 0644, |
426 | + .proc_handler = &proc_dointvec, |
427 | + .strategy = &sysctl_intvec, |
428 | + .extra1 = &zero, |
429 | + }, |
430 | + { |
431 | + .ctl_name = VM_READAHEAD_HIT_RATE, |
432 | + .procname = "readahead_hit_rate", |
433 | + .data = &readahead_hit_rate, |
434 | + .maxlen = sizeof(readahead_hit_rate), |
435 | + .mode = 0644, |
436 | + .proc_handler = &proc_dointvec, |
437 | + .strategy = &sysctl_intvec, |
438 | + .extra1 = &one, |
439 | + }, |
440 | +#endif |
441 | { .ctl_name = 0 } |
442 | }; |
443 | |
444 | Index: linux-2.6.16-ck1/lib/radix-tree.c |
445 | =================================================================== |
446 | --- linux-2.6.16-ck1.orig/lib/radix-tree.c 2006-03-20 20:46:23.000000000 +1100 |
447 | +++ linux-2.6.16-ck1/lib/radix-tree.c 2006-03-20 20:47:04.000000000 +1100 |
448 | @@ -32,16 +32,7 @@ |
449 | #include <linux/bitops.h> |
450 | |
451 | |
452 | -#ifdef __KERNEL__ |
453 | -#define RADIX_TREE_MAP_SHIFT 6 |
454 | -#else |
455 | -#define RADIX_TREE_MAP_SHIFT 3 /* For more stressful testing */ |
456 | -#endif |
457 | #define RADIX_TREE_TAGS 2 |
458 | - |
459 | -#define RADIX_TREE_MAP_SIZE (1UL << RADIX_TREE_MAP_SHIFT) |
460 | -#define RADIX_TREE_MAP_MASK (RADIX_TREE_MAP_SIZE-1) |
461 | - |
462 | #define RADIX_TREE_TAG_LONGS \ |
463 | ((RADIX_TREE_MAP_SIZE + BITS_PER_LONG - 1) / BITS_PER_LONG) |
464 | |
465 | @@ -286,32 +277,89 @@ int radix_tree_insert(struct radix_tree_ |
466 | } |
467 | EXPORT_SYMBOL(radix_tree_insert); |
468 | |
469 | -static inline void **__lookup_slot(struct radix_tree_root *root, |
470 | - unsigned long index) |
471 | +/** |
472 | + * radix_tree_lookup_node - low level lookup routine |
473 | + * @root: radix tree root |
474 | + * @index: index key |
475 | + * @level: stop at that many levels from the tree leaf |
476 | + * |
477 | + * Lookup the item at the position @index in the radix tree @root. |
478 | + * The return value is: |
479 | + * @level == 0: page at @index; |
480 | + * @level == 1: the corresponding bottom level tree node; |
481 | + * @level < height: (@level-1)th parent node of the bottom node |
482 | + * that contains @index; |
483 | + * @level >= height: the root node. |
484 | + */ |
485 | +void *radix_tree_lookup_node(struct radix_tree_root *root, |
486 | + unsigned long index, unsigned int level) |
487 | { |
488 | unsigned int height, shift; |
489 | - struct radix_tree_node **slot; |
490 | + struct radix_tree_node *slot; |
491 | |
492 | height = root->height; |
493 | if (index > radix_tree_maxindex(height)) |
494 | return NULL; |
495 | |
496 | shift = (height-1) * RADIX_TREE_MAP_SHIFT; |
497 | - slot = &root->rnode; |
498 | + slot = root->rnode; |
499 | |
500 | - while (height > 0) { |
501 | - if (*slot == NULL) |
502 | + while (height > level) { |
503 | + if (slot == NULL) |
504 | return NULL; |
505 | |
506 | - slot = (struct radix_tree_node **) |
507 | - ((*slot)->slots + |
508 | - ((index >> shift) & RADIX_TREE_MAP_MASK)); |
509 | + slot = slot->slots[(index >> shift) & RADIX_TREE_MAP_MASK]; |
510 | shift -= RADIX_TREE_MAP_SHIFT; |
511 | height--; |
512 | } |
513 | |
514 | - return (void **)slot; |
515 | + return slot; |
516 | +} |
517 | +EXPORT_SYMBOL(radix_tree_lookup_node); |
518 | + |
519 | +/** |
520 | + * radix_tree_cache_lookup_node - cached lookup node |
521 | + * @root: radix tree root |
522 | + * @cache: look-aside cache |
523 | + * @index: index key |
524 | + * |
525 | + * Lookup the item at the position @index in the radix tree @root, |
526 | + * and return the node @level levels from the bottom in the search path. |
527 | + * |
528 | + * @cache stores the last accessed upper level tree node by this |
529 | + * function, and is always checked first before searching in the tree. |
530 | + * It can improve speed for access patterns with strong locality. |
531 | + * |
532 | + * NOTE: |
533 | + * - The cache becomes invalid on leaving the lock; |
534 | + * - Do not intermix calls with different @level. |
535 | + */ |
536 | +void *radix_tree_cache_lookup_node(struct radix_tree_root *root, |
537 | + struct radix_tree_cache *cache, |
538 | + unsigned long index, unsigned int level) |
539 | +{ |
540 | + struct radix_tree_node *node; |
541 | + unsigned long i; |
542 | + unsigned long mask; |
543 | + |
544 | + if (level >= root->height) |
545 | + return root->rnode; |
546 | + |
547 | + i = ((index >> (level * RADIX_TREE_MAP_SHIFT)) & RADIX_TREE_MAP_MASK); |
548 | + mask = ~((RADIX_TREE_MAP_SIZE << (level * RADIX_TREE_MAP_SHIFT)) - 1); |
549 | + |
550 | + if ((index & mask) == cache->first_index) |
551 | + return cache->tree_node->slots[i]; |
552 | + |
553 | + node = radix_tree_lookup_node(root, index, level + 1); |
554 | + if (!node) |
555 | + return 0; |
556 | + |
557 | + cache->tree_node = node; |
558 | + cache->first_index = (index & mask); |
559 | + return node->slots[i]; |
560 | } |
561 | +EXPORT_SYMBOL(radix_tree_cache_lookup_node); |
562 | |
563 | /** |
564 | * radix_tree_lookup_slot - lookup a slot in a radix tree |
565 | @@ -323,25 +371,131 @@ static inline void **__lookup_slot(struc |
566 | */ |
567 | void **radix_tree_lookup_slot(struct radix_tree_root *root, unsigned long index) |
568 | { |
569 | - return __lookup_slot(root, index); |
570 | + struct radix_tree_node *node; |
571 | + |
572 | + node = radix_tree_lookup_node(root, index, 1); |
573 | + return node->slots + (index & RADIX_TREE_MAP_MASK); |
574 | } |
575 | EXPORT_SYMBOL(radix_tree_lookup_slot); |
576 | |
577 | /** |
578 | - * radix_tree_lookup - perform lookup operation on a radix tree |
579 | + * radix_tree_cache_count - items in the cached node |
580 | + * @cache: radix tree look-aside cache |
581 | + * |
582 | + * Query the number of items contained in the cached node. |
583 | + */ |
584 | +unsigned int radix_tree_cache_count(struct radix_tree_cache *cache) |
585 | +{ |
586 | + if (!(cache->first_index & RADIX_TREE_MAP_MASK)) |
587 | + return cache->tree_node->count; |
588 | + else |
589 | + return 0; |
590 | +} |
591 | +EXPORT_SYMBOL(radix_tree_cache_count); |
592 | + |
593 | +/** |
594 | + * radix_tree_scan_hole_backward - scan backward for hole |
595 | * @root: radix tree root |
596 | * @index: index key |
597 | + * @max_scan: advice on max items to scan (it may scan a little more) |
598 | * |
599 | - * Lookup the item at the position @index in the radix tree @root. |
600 | + * Scan backward from @index for a hole/empty item, stop when |
601 | + * - hit hole |
602 | + * - @max_scan or more items scanned |
603 | + * - hit index 0 |
604 | + * |
605 | + * Return the corresponding index. |
606 | */ |
607 | -void *radix_tree_lookup(struct radix_tree_root *root, unsigned long index) |
608 | +unsigned long radix_tree_scan_hole_backward(struct radix_tree_root *root, |
609 | + unsigned long index, unsigned long max_scan) |
610 | { |
611 | - void **slot; |
612 | + struct radix_tree_cache cache; |
613 | + struct radix_tree_node *node; |
614 | + unsigned long origin; |
615 | + int i; |
616 | + |
617 | + origin = index; |
618 | + radix_tree_cache_init(&cache); |
619 | + |
620 | + while (origin - index < max_scan) { |
621 | + node = radix_tree_cache_lookup_node(root, &cache, index, 1); |
622 | + if (!node) |
623 | + break; |
624 | + |
625 | + if (node->count == RADIX_TREE_MAP_SIZE) { |
626 | + index = (index - RADIX_TREE_MAP_SIZE) | |
627 | + RADIX_TREE_MAP_MASK; |
628 | + goto check_underflow; |
629 | + } |
630 | + |
631 | + for (i = index & RADIX_TREE_MAP_MASK; i >= 0; i--, index--) { |
632 | + if (!node->slots[i]) |
633 | + goto out; |
634 | + } |
635 | + |
636 | +check_underflow: |
637 | + if (unlikely(index == ULONG_MAX)) { |
638 | + index = 0; |
639 | + break; |
640 | + } |
641 | + } |
642 | + |
643 | +out: |
644 | + return index; |
645 | +} |
646 | +EXPORT_SYMBOL(radix_tree_scan_hole_backward); |
647 | |
648 | - slot = __lookup_slot(root, index); |
649 | - return slot != NULL ? *slot : NULL; |
650 | +/** |
651 | + * radix_tree_scan_hole - scan for hole |
652 | + * @root: radix tree root |
653 | + * @index: index key |
654 | + * @max_scan: advice on max items to scan (it may scan a little more) |
655 | + * |
656 | + * Scan forward from @index for a hole/empty item, stop when |
657 | + * - hit hole |
658 | + * - hit EOF |
659 | + * - hit index ULONG_MAX |
660 | + * - @max_scan or more items scanned |
661 | + * |
662 | + * Return the corresponding index. |
663 | + */ |
664 | +unsigned long radix_tree_scan_hole(struct radix_tree_root *root, |
665 | + unsigned long index, unsigned long max_scan) |
666 | +{ |
667 | + struct radix_tree_cache cache; |
668 | + struct radix_tree_node *node; |
669 | + unsigned long origin; |
670 | + int i; |
671 | + |
672 | + origin = index; |
673 | + radix_tree_cache_init(&cache); |
674 | + |
675 | + while (index - origin < max_scan) { |
676 | + node = radix_tree_cache_lookup_node(root, &cache, index, 1); |
677 | + if (!node) |
678 | + break; |
679 | + |
680 | + if (node->count == RADIX_TREE_MAP_SIZE) { |
681 | + index = (index | RADIX_TREE_MAP_MASK) + 1; |
682 | + goto check_overflow; |
683 | + } |
684 | + |
685 | + for (i = index & RADIX_TREE_MAP_MASK; i < RADIX_TREE_MAP_SIZE; |
686 | + i++, index++) { |
687 | + if (!node->slots[i]) |
688 | + goto out; |
689 | + } |
690 | + |
691 | +check_overflow: |
692 | + if (unlikely(!index)) { |
693 | + index = ULONG_MAX; |
694 | + break; |
695 | + } |
696 | + } |
697 | +out: |
698 | + return index; |
699 | } |
700 | -EXPORT_SYMBOL(radix_tree_lookup); |
701 | +EXPORT_SYMBOL(radix_tree_scan_hole); |
702 | |
703 | /** |
704 | * radix_tree_tag_set - set a tag on a radix tree node |
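With the scan helpers above, the context-based method can size up a cached run in one call instead of probing page by page. For instance (illustrative; 'max' bounds the scan as advised by @max_scan):

    /*
     * After this call, pages [index, end) are all present in the page
     * cache; 'end' is the first hole at or after 'index'.
     */
    unsigned long end = radix_tree_scan_hole(&mapping->page_tree,
                                             index, max);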
705 | Index: linux-2.6.16-ck1/mm/Kconfig |
706 | =================================================================== |
707 | --- linux-2.6.16-ck1.orig/mm/Kconfig 2006-03-20 20:46:23.000000000 +1100 |
708 | +++ linux-2.6.16-ck1/mm/Kconfig 2006-03-20 20:47:04.000000000 +1100 |
709 | @@ -139,3 +139,58 @@ config SPLIT_PTLOCK_CPUS |
710 | config MIGRATION |
711 | def_bool y if NUMA || SPARSEMEM || DISCONTIGMEM |
712 | depends on SWAP |
713 | + |
714 | +# |
715 | +# Adaptive file readahead |
716 | +# |
717 | +config ADAPTIVE_READAHEAD |
718 | + bool "Adaptive file readahead (EXPERIMENTAL)" |
719 | + default n |
720 | + depends on EXPERIMENTAL |
721 | + help |
722 | + Readahead is a technique employed by the kernel in an attempt |
723 | + to improve file reading performance. If the kernel has reason |
724 | + to believe that a particular file is being read sequentially, |
725 | + it will attempt to read blocks from the file into memory before |
726 | + the application requests them. When readahead works, it speeds |
727 | + up the system's throughput, since the reading application does |
728 | + not have to wait for its requests. When readahead fails, it |
729 | + instead generates useless I/O and occupies memory pages which |
730 | + are needed for some other purpose. |
731 | + |
732 | + Normally, the kernel uses a stock readahead logic that is well |
733 | + understood and well tuned. This option enables a much more complex |
734 | + and feature-rich one. It is more aggressive and memory-efficient in |
735 | + doing readahead, and supports some less-common access patterns such |
736 | + as reading backward and reading sparsely. However, due to the great |
737 | + diversity of real-world applications, it might not fit everyone. |
738 | + |
739 | + Please refer to Documentation/sysctl/vm.txt for tunable parameters. |
740 | + |
741 | + Say Y here if you are building a kernel for file servers. |
742 | + Say N if you are unsure. |
743 | + |
744 | +config DEBUG_READAHEAD |
745 | + bool "Readahead debug and accounting" |
746 | + default n |
747 | + depends on ADAPTIVE_READAHEAD |
748 | + select DEBUG_FS |
749 | + help |
750 | + This option injects extra code to dump detailed debug traces and to |
751 | + perform readahead event accounting. |
752 | + |
753 | + To actually get the data: |
754 | + |
755 | + mkdir /debug |
756 | + mount -t debugfs none /debug |
757 | + |
758 | + After that you can do the following: |
759 | + |
760 | + echo > /debug/readahead/events # reset the counters |
761 | + cat /debug/readahead/events # check the counters |
762 | + |
763 | + echo 1 > /debug/readahead/debug_level # show printk traces |
764 | + echo 2 > /debug/readahead/debug_level # show verbose printk traces |
765 | + echo 0 > /debug/readahead/debug_level # stop filling my kern.log |
766 | + |
767 | + Say N, unless you have readahead performance problems. |
768 | Index: linux-2.6.16-ck1/mm/filemap.c |
769 | =================================================================== |
770 | --- linux-2.6.16-ck1.orig/mm/filemap.c 2006-03-20 20:46:23.000000000 +1100 |
771 | +++ linux-2.6.16-ck1/mm/filemap.c 2006-03-20 20:47:04.000000000 +1100 |
772 | @@ -42,6 +42,12 @@ static ssize_t |
773 | generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, |
774 | loff_t offset, unsigned long nr_segs); |
775 | |
776 | +#ifdef CONFIG_DEBUG_READAHEAD |
777 | +extern u32 readahead_debug_level; |
778 | +#else |
779 | +#define readahead_debug_level 0 |
780 | +#endif /* CONFIG_DEBUG_READAHEAD */ |
781 | + |
782 | /* |
783 | * Shared mappings implemented 30.11.1994. It's not fully working yet, |
784 | * though. |
785 | @@ -746,10 +752,12 @@ void do_generic_mapping_read(struct addr |
786 | unsigned long prev_index; |
787 | loff_t isize; |
788 | struct page *cached_page; |
789 | + struct page *prev_page; |
790 | int error; |
791 | struct file_ra_state ra = *_ra; |
792 | |
793 | cached_page = NULL; |
794 | + prev_page = NULL; |
795 | index = *ppos >> PAGE_CACHE_SHIFT; |
796 | next_index = index; |
797 | prev_index = ra.prev_page; |
798 | @@ -760,6 +768,10 @@ void do_generic_mapping_read(struct addr |
799 | if (!isize) |
800 | goto out; |
801 | |
802 | + if (readahead_debug_level >= 5) |
803 | + printk(KERN_DEBUG "read-file(ino=%lu, req=%lu+%lu)\n", |
804 | + inode->i_ino, index, last_index - index); |
805 | + |
806 | end_index = (isize - 1) >> PAGE_CACHE_SHIFT; |
807 | for (;;) { |
808 | struct page *page; |
809 | @@ -778,16 +790,45 @@ void do_generic_mapping_read(struct addr |
810 | nr = nr - offset; |
811 | |
812 | cond_resched(); |
813 | - if (index == next_index) |
814 | + |
815 | + if (!prefer_adaptive_readahead() && index == next_index) |
816 | next_index = page_cache_readahead(mapping, &ra, filp, |
817 | index, last_index - index); |
818 | |
819 | find_page: |
820 | page = find_get_page(mapping, index); |
821 | + if (prefer_adaptive_readahead()) { |
822 | + if (unlikely(page == NULL)) { |
823 | + ra.prev_page = prev_index; |
824 | + page_cache_readahead_adaptive(mapping, &ra, |
825 | + filp, prev_page, NULL, |
826 | + *ppos >> PAGE_CACHE_SHIFT, |
827 | + index, last_index); |
828 | + page = find_get_page(mapping, index); |
829 | + } else if (PageReadahead(page)) { |
830 | + ra.prev_page = prev_index; |
831 | + page_cache_readahead_adaptive(mapping, &ra, |
832 | + filp, prev_page, page, |
833 | + *ppos >> PAGE_CACHE_SHIFT, |
834 | + index, last_index); |
835 | + } |
836 | + } |
837 | if (unlikely(page == NULL)) { |
838 | - handle_ra_miss(mapping, &ra, index); |
839 | + if (!prefer_adaptive_readahead()) |
840 | + handle_ra_miss(mapping, &ra, index); |
841 | goto no_cached_page; |
842 | } |
843 | + |
844 | + if (prev_page) |
845 | + page_cache_release(prev_page); |
846 | + prev_page = page; |
847 | + |
848 | + readahead_cache_hit(&ra, page); |
849 | + if (readahead_debug_level >= 7) |
850 | + printk(KERN_DEBUG "read-page(ino=%lu, idx=%lu, io=%s)\n", |
851 | + inode->i_ino, index, |
852 | + PageUptodate(page) ? "hit" : "miss"); |
853 | + |
854 | if (!PageUptodate(page)) |
855 | goto page_not_up_to_date; |
856 | page_ok: |
857 | @@ -822,7 +863,6 @@ page_ok: |
858 | index += offset >> PAGE_CACHE_SHIFT; |
859 | offset &= ~PAGE_CACHE_MASK; |
860 | |
861 | - page_cache_release(page); |
862 | if (ret == nr && desc->count) |
863 | continue; |
864 | goto out; |
865 | @@ -834,7 +874,6 @@ page_not_up_to_date: |
866 | /* Did it get unhashed before we got the lock? */ |
867 | if (!page->mapping) { |
868 | unlock_page(page); |
869 | - page_cache_release(page); |
870 | continue; |
871 | } |
872 | |
873 | @@ -864,7 +903,6 @@ readpage: |
874 | * invalidate_inode_pages got it |
875 | */ |
876 | unlock_page(page); |
877 | - page_cache_release(page); |
878 | goto find_page; |
879 | } |
880 | unlock_page(page); |
881 | @@ -885,7 +923,6 @@ readpage: |
882 | isize = i_size_read(inode); |
883 | end_index = (isize - 1) >> PAGE_CACHE_SHIFT; |
884 | if (unlikely(!isize || index > end_index)) { |
885 | - page_cache_release(page); |
886 | goto out; |
887 | } |
888 | |
889 | @@ -894,7 +931,6 @@ readpage: |
890 | if (index == end_index) { |
891 | nr = ((isize - 1) & ~PAGE_CACHE_MASK) + 1; |
892 | if (nr <= offset) { |
893 | - page_cache_release(page); |
894 | goto out; |
895 | } |
896 | } |
897 | @@ -904,7 +940,6 @@ readpage: |
898 | readpage_error: |
899 | /* UHHUH! A synchronous read error occurred. Report it */ |
900 | desc->error = error; |
901 | - page_cache_release(page); |
902 | goto out; |
903 | |
904 | no_cached_page: |
905 | @@ -929,15 +964,22 @@ no_cached_page: |
906 | } |
907 | page = cached_page; |
908 | cached_page = NULL; |
909 | + if (prev_page) |
910 | + page_cache_release(prev_page); |
911 | + prev_page = page; |
912 | goto readpage; |
913 | } |
914 | |
915 | out: |
916 | *_ra = ra; |
917 | + if (prefer_adaptive_readahead()) |
918 | + _ra->prev_page = prev_index; |
919 | |
920 | *ppos = ((loff_t) index << PAGE_CACHE_SHIFT) + offset; |
921 | if (cached_page) |
922 | page_cache_release(cached_page); |
923 | + if (prev_page) |
924 | + page_cache_release(prev_page); |
925 | if (filp) |
926 | file_accessed(filp); |
927 | } |
928 | @@ -1216,6 +1258,7 @@ struct page *filemap_nopage(struct vm_ar |
929 | unsigned long size, pgoff; |
930 | int did_readaround = 0, majmin = VM_FAULT_MINOR; |
931 | |
932 | + ra->flags |= RA_FLAG_MMAP; |
933 | pgoff = ((address-area->vm_start) >> PAGE_CACHE_SHIFT) + area->vm_pgoff; |
934 | |
935 | retry_all: |
936 | @@ -1233,7 +1276,7 @@ retry_all: |
937 | * |
938 | * For sequential accesses, we use the generic readahead logic. |
939 | */ |
940 | - if (VM_SequentialReadHint(area)) |
941 | + if (!prefer_adaptive_readahead() && VM_SequentialReadHint(area)) |
942 | page_cache_readahead(mapping, ra, file, pgoff, 1); |
943 | |
944 | /* |
945 | @@ -1241,11 +1284,24 @@ retry_all: |
946 | */ |
947 | retry_find: |
948 | page = find_get_page(mapping, pgoff); |
949 | + if (prefer_adaptive_readahead() && VM_SequentialReadHint(area)) { |
950 | + if (!page) { |
951 | + page_cache_readahead_adaptive(mapping, ra, |
952 | + file, NULL, NULL, |
953 | + pgoff, pgoff, pgoff + 1); |
954 | + page = find_get_page(mapping, pgoff); |
955 | + } else if (PageReadahead(page)) { |
956 | + page_cache_readahead_adaptive(mapping, ra, |
957 | + file, NULL, page, |
958 | + pgoff, pgoff, pgoff + 1); |
959 | + } |
960 | + } |
961 | if (!page) { |
962 | unsigned long ra_pages; |
963 | |
964 | if (VM_SequentialReadHint(area)) { |
965 | - handle_ra_miss(mapping, ra, pgoff); |
966 | + if (!prefer_adaptive_readahead()) |
967 | + handle_ra_miss(mapping, ra, pgoff); |
968 | goto no_cached_page; |
969 | } |
970 | ra->mmap_miss++; |
971 | @@ -1282,6 +1338,14 @@ retry_find: |
972 | if (!did_readaround) |
973 | ra->mmap_hit++; |
974 | |
975 | + readahead_cache_hit(ra, page); |
976 | + if (readahead_debug_level >= 6) |
977 | + printk(KERN_DEBUG "read-mmap(ino=%lu, idx=%lu, hint=%s, io=%s)\n", |
978 | + inode->i_ino, pgoff, |
979 | + VM_RandomReadHint(area) ? "random" : |
980 | + (VM_SequentialReadHint(area) ? "sequential" : "none"), |
981 | + PageUptodate(page) ? "hit" : "miss"); |
982 | + |
983 | /* |
984 | * Ok, found a page in the page cache, now we need to check |
985 | * that it's up-to-date. |
986 | @@ -1296,6 +1360,8 @@ success: |
987 | mark_page_accessed(page); |
988 | if (type) |
989 | *type = majmin; |
990 | + if (prefer_adaptive_readahead()) |
991 | + ra->prev_page = page->index; |
992 | return page; |
993 | |
994 | outside_data_content: |
995 | Index: linux-2.6.16-ck1/mm/memory.c |
996 | =================================================================== |
997 | --- linux-2.6.16-ck1.orig/mm/memory.c 2006-03-20 20:46:23.000000000 +1100 |
998 | +++ linux-2.6.16-ck1/mm/memory.c 2006-03-20 20:47:04.000000000 +1100 |
999 | @@ -1993,6 +1993,7 @@ static int do_anonymous_page(struct mm_s |
1000 | page_table = pte_offset_map_lock(mm, pmd, address, &ptl); |
1001 | if (!pte_none(*page_table)) |
1002 | goto release; |
1003 | + inc_readahead_aging(); |
1004 | inc_mm_counter(mm, anon_rss); |
1005 | lru_cache_add_active(page); |
1006 | page_add_new_anon_rmap(page, vma, address); |
1007 | Index: linux-2.6.16-ck1/mm/page-writeback.c |
1008 | =================================================================== |
1009 | --- linux-2.6.16-ck1.orig/mm/page-writeback.c 2006-03-20 20:46:53.000000000 +1100 |
1010 | +++ linux-2.6.16-ck1/mm/page-writeback.c 2006-03-20 20:47:04.000000000 +1100 |
1011 | @@ -370,7 +370,7 @@ static void wb_timer_fn(unsigned long un |
1012 | static void laptop_timer_fn(unsigned long unused); |
1013 | |
1014 | static DEFINE_TIMER(wb_timer, wb_timer_fn, 0, 0); |
1015 | -static DEFINE_TIMER(laptop_mode_wb_timer, laptop_timer_fn, 0, 0); |
1016 | +DEFINE_TIMER(laptop_mode_wb_timer, laptop_timer_fn, 0, 0); |
1017 | |
1018 | /* |
1019 | * Periodic writeback of "old" data. |
1020 | Index: linux-2.6.16-ck1/mm/page_alloc.c |
1021 | =================================================================== |
1022 | --- linux-2.6.16-ck1.orig/mm/page_alloc.c 2006-03-20 20:46:59.000000000 +1100 |
1023 | +++ linux-2.6.16-ck1/mm/page_alloc.c 2006-03-20 20:47:04.000000000 +1100 |
1024 | @@ -532,7 +532,7 @@ static int prep_new_page(struct page *pa |
1025 | if (PageReserved(page)) |
1026 | return 1; |
1027 | |
1028 | - page->flags &= ~(1 << PG_uptodate | 1 << PG_error | |
1029 | + page->flags &= ~(1 << PG_uptodate | 1 << PG_error | 1 << PG_readahead | |
1030 | 1 << PG_referenced | 1 << PG_arch_1 | |
1031 | 1 << PG_checked | 1 << PG_mappedtodisk); |
1032 | set_page_private(page, 0); |
1033 | Index: linux-2.6.16-ck1/mm/readahead.c |
1034 | =================================================================== |
1035 | --- linux-2.6.16-ck1.orig/mm/readahead.c 2006-03-20 20:46:23.000000000 +1100 |
1036 | +++ linux-2.6.16-ck1/mm/readahead.c 2006-03-20 20:47:04.000000000 +1100 |
1037 | @@ -14,6 +14,300 @@ |
1038 | #include <linux/blkdev.h> |
1039 | #include <linux/backing-dev.h> |
1040 | #include <linux/pagevec.h> |
1041 | +#include <linux/writeback.h> |
1042 | +#include <linux/nfsd/const.h> |
1043 | +#include <asm/div64.h> |
1044 | + |
1045 | +/* The default max/min read-ahead pages. */ |
1046 | +#define KB(size) (((size)*1024 + PAGE_CACHE_SIZE-1) / PAGE_CACHE_SIZE) |
1047 | +#define MAX_RA_PAGES KB(VM_MAX_READAHEAD) |
1048 | +#define MIN_RA_PAGES KB(VM_MIN_READAHEAD) |
1049 | +#define MIN_NFSD_PAGES KB(NFSSVC_MAXBLKSIZE/1024) |
1050 | + |
1051 | +#define next_page(pg) (list_entry((pg)->lru.prev, struct page, lru)) |
1052 | +#define prev_page(pg) (list_entry((pg)->lru.next, struct page, lru)) |
1053 | + |
1054 | +#ifdef CONFIG_ADAPTIVE_READAHEAD |
1055 | +/* |
1056 | + * Adaptive read-ahead parameters. |
1057 | + */ |
1058 | + |
1059 | +/* In laptop mode, poll delayed look-ahead on every ## pages read. */ |
1060 | +#define LAPTOP_POLL_INTERVAL 16 |
1061 | + |
1062 | +/* Set look-ahead size to 1/# of the thrashing-threshold. */ |
1063 | +#define LOOKAHEAD_RATIO 8 |
1064 | + |
1065 | +/* Set read-ahead size to ##% of the thrashing-threshold. */ |
1066 | +int readahead_ratio = 50; |
1067 | +EXPORT_SYMBOL(readahead_ratio); |
1068 | + |
1069 | +/* Readahead as long as cache hit ratio keeps above 1/##. */ |
1070 | +int readahead_hit_rate = 2; |
1071 | +EXPORT_SYMBOL(readahead_hit_rate); |
1072 | + |
1073 | +/* |
1074 | + * Measures the aging process of cold pages. |
1075 | + * Mainly increased on fresh page references to make it smooth. |
1076 | + */ |
1077 | +DEFINE_PER_CPU(unsigned long, readahead_aging); |
1078 | +EXPORT_PER_CPU_SYMBOL(readahead_aging); |
1079 | + |
1080 | +/* |
1081 | + * Detailed classification of read-ahead behaviors. |
1082 | + */ |
1083 | +#define RA_CLASS_SHIFT 4 |
1084 | +#define RA_CLASS_MASK ((1 << RA_CLASS_SHIFT) - 1) |
1085 | +enum ra_class { |
1086 | + RA_CLASS_ALL, |
1087 | + RA_CLASS_NEWFILE, |
1088 | + RA_CLASS_STATE, |
1089 | + RA_CLASS_CONTEXT, |
1090 | + RA_CLASS_CONTEXT_AGGRESSIVE, |
1091 | + RA_CLASS_BACKWARD, |
1092 | + RA_CLASS_THRASHING, |
1093 | + RA_CLASS_SEEK, |
1094 | + RA_CLASS_END, |
1095 | +}; |
1096 | +#endif /* CONFIG_ADAPTIVE_READAHEAD */ |
1097 | + |
1098 | +/* |
1099 | + * Read-ahead events accounting. |
1100 | + */ |
1101 | +#ifdef CONFIG_DEBUG_READAHEAD |
1102 | +#include <linux/init.h> |
1103 | +#include <linux/jiffies.h> |
1104 | +#include <linux/debugfs.h> |
1105 | +#include <linux/seq_file.h> |
1106 | + |
1107 | +#define DEBUG_READAHEAD_RADIXTREE |
1108 | + |
1109 | +/* Read-ahead events to be accounted. */ |
1110 | +enum ra_event { |
1111 | + RA_EVENT_CACHE_MISS, /* read cache misses */ |
1112 | + RA_EVENT_READRANDOM, /* random reads */ |
1113 | + RA_EVENT_IO_CONGESTION, /* io congestion */ |
1114 | + RA_EVENT_IO_CACHE_HIT, /* canceled io due to cache hit */ |
1115 | + RA_EVENT_IO_BLOCK, /* read on locked page */ |
1116 | + |
1117 | + RA_EVENT_READAHEAD, /* read-ahead issued */ |
1118 | + RA_EVENT_READAHEAD_HIT, /* read-ahead page hit */ |
1119 | + RA_EVENT_LOOKAHEAD, /* look-ahead issued */ |
1120 | + RA_EVENT_LOOKAHEAD_HIT, /* look-ahead mark hit */ |
1121 | + RA_EVENT_LOOKAHEAD_NOACTION, /* look-ahead mark ignored */ |
1122 | + RA_EVENT_READAHEAD_MMAP, /* read-ahead for memory mapped file */ |
1123 | + RA_EVENT_READAHEAD_EOF, /* read-ahead reaches EOF */ |
1124 | + RA_EVENT_READAHEAD_SHRINK, /* ra_size under previous la_size */ |
1125 | + RA_EVENT_READAHEAD_THRASHING, /* read-ahead thrashing happened */ |
1126 | + RA_EVENT_READAHEAD_MUTILATE, /* read-ahead request mutilated */ |
1127 | + RA_EVENT_READAHEAD_RESCUE, /* read-ahead rescued */ |
1128 | + |
1129 | + RA_EVENT_END |
1130 | +}; |
1131 | + |
1132 | +static const char * const ra_event_name[] = { |
1133 | + "cache_miss", |
1134 | + "read_random", |
1135 | + "io_congestion", |
1136 | + "io_cache_hit", |
1137 | + "io_block", |
1138 | + "readahead", |
1139 | + "readahead_hit", |
1140 | + "lookahead", |
1141 | + "lookahead_hit", |
1142 | + "lookahead_ignore", |
1143 | + "readahead_mmap", |
1144 | + "readahead_eof", |
1145 | + "readahead_shrink", |
1146 | + "readahead_thrash", |
1147 | + "readahead_mutilt", |
1148 | + "readahead_rescue", |
1149 | +}; |
1150 | + |
1151 | +static const char * const ra_class_name[] = { |
1152 | + "total", |
1153 | + "newfile", |
1154 | + "state", |
1155 | + "context", |
1156 | + "contexta", |
1157 | + "backward", |
1158 | + "onthrash", |
1159 | + "onraseek", |
1160 | + "none", |
1161 | +}; |
1162 | + |
1163 | +static unsigned long ra_events[RA_CLASS_END+1][RA_EVENT_END+1][2]; |
1164 | + |
1165 | +static inline void ra_account(struct file_ra_state *ra, |
1166 | + enum ra_event e, int pages) |
1167 | +{ |
1168 | + enum ra_class c; |
1169 | + |
1170 | + if (e == RA_EVENT_READAHEAD_HIT && pages < 0) { |
1171 | + c = (ra->flags >> RA_CLASS_SHIFT) & RA_CLASS_MASK; |
1172 | + pages = -pages; |
1173 | + } else if (ra) |
1174 | + c = ra->flags & RA_CLASS_MASK; |
1175 | + else |
1176 | + c = RA_CLASS_END; |
1177 | + |
1178 | + if (!c) |
1179 | + c = RA_CLASS_END; |
1180 | + |
1181 | + ra_events[c][e][0] += 1; |
1182 | + ra_events[c][e][1] += pages; |
1183 | + |
1184 | + if (e == RA_EVENT_READAHEAD) |
1185 | + ra_events[c][RA_EVENT_END][1] += pages * pages; |
1186 | +} |
1187 | + |
1188 | +static int ra_events_show(struct seq_file *s, void *_) |
1189 | +{ |
1190 | + int i; |
1191 | + int c; |
1192 | + int e; |
1193 | + static const char event_fmt[] = "%-16s"; |
1194 | + static const char class_fmt[] = "%10s"; |
1195 | + static const char item_fmt[] = "%10lu"; |
1196 | + static const char percent_format[] = "%9lu%%"; |
1197 | + static const char * const table_name[] = { |
1198 | + "[table requests]", |
1199 | + "[table pages]", |
1200 | + "[table summary]"}; |
1201 | + |
1202 | + for (i = 0; i <= 1; i++) { |
1203 | + for (e = 0; e <= RA_EVENT_END; e++) { |
1204 | + ra_events[0][e][i] = 0; |
1205 | + for (c = 1; c < RA_CLASS_END; c++) |
1206 | + ra_events[0][e][i] += ra_events[c][e][i]; |
1207 | + } |
1208 | + |
1209 | + seq_printf(s, event_fmt, table_name[i]); |
1210 | + for (c = 0; c <= RA_CLASS_END; c++) |
1211 | + seq_printf(s, class_fmt, ra_class_name[c]); |
1212 | + seq_puts(s, "\n"); |
1213 | + |
1214 | + for (e = 0; e < RA_EVENT_END; e++) { |
1215 | + if (e == RA_EVENT_READAHEAD_HIT && i == 0) |
1216 | + continue; |
1217 | + if (e == RA_EVENT_IO_BLOCK && i == 1) |
1218 | + continue; |
1219 | + |
1220 | + seq_printf(s, event_fmt, ra_event_name[e]); |
1221 | + for (c = 0; c <= RA_CLASS_END; c++) |
1222 | + seq_printf(s, item_fmt, ra_events[c][e][i]); |
1223 | + seq_puts(s, "\n"); |
1224 | + } |
1225 | + seq_puts(s, "\n"); |
1226 | + } |
1227 | + |
1228 | + seq_printf(s, event_fmt, table_name[2]); |
1229 | + for (c = 0; c <= RA_CLASS_END; c++) |
1230 | + seq_printf(s, class_fmt, ra_class_name[c]); |
1231 | + seq_puts(s, "\n"); |
1232 | + |
1233 | + seq_printf(s, event_fmt, "random_rate"); |
1234 | + for (c = 0; c <= RA_CLASS_END; c++) |
1235 | + seq_printf(s, percent_format, |
1236 | + (ra_events[c][RA_EVENT_READRANDOM][0] * 100) / |
1237 | + ((ra_events[c][RA_EVENT_READRANDOM][0] + |
1238 | + ra_events[c][RA_EVENT_READAHEAD][0]) | 1)); |
1239 | + seq_puts(s, "\n"); |
1240 | + |
1241 | + seq_printf(s, event_fmt, "ra_hit_rate"); |
1242 | + for (c = 0; c <= RA_CLASS_END; c++) |
1243 | + seq_printf(s, percent_format, |
1244 | + (ra_events[c][RA_EVENT_READAHEAD_HIT][1] * 100) / |
1245 | + (ra_events[c][RA_EVENT_READAHEAD][1] | 1)); |
1246 | + seq_puts(s, "\n"); |
1247 | + |
1248 | + seq_printf(s, event_fmt, "la_hit_rate"); |
1249 | + for (c = 0; c <= RA_CLASS_END; c++) |
1250 | + seq_printf(s, percent_format, |
1251 | + (ra_events[c][RA_EVENT_LOOKAHEAD_HIT][0] * 100) / |
1252 | + (ra_events[c][RA_EVENT_LOOKAHEAD][0] | 1)); |
1253 | + seq_puts(s, "\n"); |
1254 | + |
1255 | + seq_printf(s, event_fmt, "var_ra_size"); |
1256 | + for (c = 0; c <= RA_CLASS_END; c++) |
1257 | + seq_printf(s, item_fmt, |
1258 | + (ra_events[c][RA_EVENT_END][1] - |
1259 | + ra_events[c][RA_EVENT_READAHEAD][1] * |
1260 | + (ra_events[c][RA_EVENT_READAHEAD][1] / |
1261 | + (ra_events[c][RA_EVENT_READAHEAD][0] | 1))) / |
1262 | + (ra_events[c][RA_EVENT_READAHEAD][0] | 1)); |
1263 | + seq_puts(s, "\n"); |
1264 | + |
1265 | + seq_printf(s, event_fmt, "avg_ra_size"); |
1266 | + for (c = 0; c <= RA_CLASS_END; c++) |
1267 | + seq_printf(s, item_fmt, |
1268 | + (ra_events[c][RA_EVENT_READAHEAD][1] + |
1269 | + ra_events[c][RA_EVENT_READAHEAD][0] / 2) / |
1270 | + (ra_events[c][RA_EVENT_READAHEAD][0] | 1)); |
1271 | + seq_puts(s, "\n"); |
1272 | + |
1273 | + seq_printf(s, event_fmt, "avg_la_size"); |
1274 | + for (c = 0; c <= RA_CLASS_END; c++) |
1275 | + seq_printf(s, item_fmt, |
1276 | + (ra_events[c][RA_EVENT_LOOKAHEAD][1] + |
1277 | + ra_events[c][RA_EVENT_LOOKAHEAD][0] / 2) / |
1278 | + (ra_events[c][RA_EVENT_LOOKAHEAD][0] | 1)); |
1279 | + seq_puts(s, "\n"); |
1280 | + |
1281 | + return 0; |
1282 | +} |
1283 | + |
1284 | +static int ra_events_open(struct inode *inode, struct file *file) |
1285 | +{ |
1286 | + return single_open(file, ra_events_show, NULL); |
1287 | +} |
1288 | + |
1289 | +static ssize_t ra_events_write(struct file *file, const char __user *buf, |
1290 | + size_t size, loff_t *offset) |
1291 | +{ |
1292 | + memset(ra_events, 0, sizeof(ra_events)); |
1293 | + return 1; |
1294 | +} |
1295 | + |
1296 | +struct file_operations ra_events_fops = { |
1297 | + .owner = THIS_MODULE, |
1298 | + .open = ra_events_open, |
1299 | + .write = ra_events_write, |
1300 | + .read = seq_read, |
1301 | + .llseek = seq_lseek, |
1302 | + .release = single_release, |
1303 | +}; |
1304 | + |
1305 | +u32 readahead_debug_level = 0; |
1306 | +u32 disable_stateful_method = 0; |
1307 | + |
1308 | +static int __init readahead_init(void) |
1309 | +{ |
1310 | + struct dentry *root; |
1311 | + |
1312 | + root = debugfs_create_dir("readahead", NULL); |
1313 | + |
1314 | + debugfs_create_file("events", 0644, root, NULL, &ra_events_fops); |
1315 | + |
1316 | + debugfs_create_u32("debug_level", 0644, root, &readahead_debug_level); |
1317 | + debugfs_create_bool("disable_stateful_method", 0644, root, |
1318 | + &disable_stateful_method); |
1319 | + |
1320 | + return 0; |
1321 | +} |
1322 | + |
1323 | +module_init(readahead_init) |
1324 | +#else |
1325 | +#define ra_account(ra, e, pages) do { } while (0) |
1326 | +#define readahead_debug_level (0) |
1327 | +#define disable_stateful_method (0) |
1328 | +#endif /* CONFIG_DEBUG_READAHEAD */ |
1329 | + |
1330 | +#define dprintk(args...) \ |
1331 | + do { if (readahead_debug_level >= 1) printk(KERN_DEBUG args); } while(0) |
1332 | +#define ddprintk(args...) \ |
1333 | + do { if (readahead_debug_level >= 2) printk(KERN_DEBUG args); } while(0) |
1334 | + |
1335 | |
1336 | void default_unplug_io_fn(struct backing_dev_info *bdi, struct page *page) |
1337 | { |
1338 | @@ -21,7 +315,7 @@ void default_unplug_io_fn(struct backing |
1339 | EXPORT_SYMBOL(default_unplug_io_fn); |
1340 | |
1341 | struct backing_dev_info default_backing_dev_info = { |
1342 | - .ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE, |
1343 | + .ra_pages = MAX_RA_PAGES, |
1344 | .state = 0, |
1345 | .capabilities = BDI_CAP_MAP_COPY, |
1346 | .unplug_io_fn = default_unplug_io_fn, |
1347 | @@ -49,7 +343,7 @@ static inline unsigned long get_max_read |
1348 | |
1349 | static inline unsigned long get_min_readahead(struct file_ra_state *ra) |
1350 | { |
1351 | - return (VM_MIN_READAHEAD * 1024) / PAGE_CACHE_SIZE; |
1352 | + return MIN_RA_PAGES; |
1353 | } |
1354 | |
1355 | static inline void ra_off(struct file_ra_state *ra) |
1356 | @@ -134,8 +428,10 @@ int read_cache_pages(struct address_spac |
1357 | continue; |
1358 | } |
1359 | ret = filler(data, page); |
1360 | - if (!pagevec_add(&lru_pvec, page)) |
1361 | + if (!pagevec_add(&lru_pvec, page)) { |
1362 | + cond_resched(); |
1363 | __pagevec_lru_add(&lru_pvec); |
1364 | + } |
1365 | if (ret) { |
1366 | while (!list_empty(pages)) { |
1367 | struct page *victim; |
1368 | @@ -173,8 +469,10 @@ static int read_pages(struct address_spa |
1369 | page->index, GFP_KERNEL)) { |
1370 | ret = mapping->a_ops->readpage(filp, page); |
1371 | if (ret != AOP_TRUNCATED_PAGE) { |
1372 | - if (!pagevec_add(&lru_pvec, page)) |
1373 | + if (!pagevec_add(&lru_pvec, page)) { |
1374 | + cond_resched(); |
1375 | __pagevec_lru_add(&lru_pvec); |
1376 | + } |
1377 | continue; |
1378 | } /* else fall through to release */ |
1379 | } |
1380 | @@ -257,7 +555,8 @@ out: |
1381 | */ |
1382 | static int |
1383 | __do_page_cache_readahead(struct address_space *mapping, struct file *filp, |
1384 | - pgoff_t offset, unsigned long nr_to_read) |
1385 | + pgoff_t offset, unsigned long nr_to_read, |
1386 | + unsigned long lookahead_size) |
1387 | { |
1388 | struct inode *inode = mapping->host; |
1389 | struct page *page; |
1390 | @@ -270,7 +569,7 @@ __do_page_cache_readahead(struct address |
1391 | if (isize == 0) |
1392 | goto out; |
1393 | |
1394 | - end_index = ((isize - 1) >> PAGE_CACHE_SHIFT); |
1395 | + end_index = ((isize - 1) >> PAGE_CACHE_SHIFT); |
1396 | |
1397 | /* |
1398 | * Preallocate as many pages as we will need. |
1399 | @@ -287,12 +586,15 @@ __do_page_cache_readahead(struct address |
1400 | continue; |
1401 | |
1402 | read_unlock_irq(&mapping->tree_lock); |
1403 | + cond_resched(); |
1404 | page = page_cache_alloc_cold(mapping); |
1405 | read_lock_irq(&mapping->tree_lock); |
1406 | if (!page) |
1407 | break; |
1408 | page->index = page_offset; |
1409 | list_add(&page->lru, &page_pool); |
1410 | + if (page_idx == nr_to_read - lookahead_size) |
1411 | + __SetPageReadahead(page); |
1412 | ret++; |
1413 | } |
1414 | read_unlock_irq(&mapping->tree_lock); |
1415 | @@ -329,7 +631,7 @@ int force_page_cache_readahead(struct ad |
1416 | if (this_chunk > nr_to_read) |
1417 | this_chunk = nr_to_read; |
1418 | err = __do_page_cache_readahead(mapping, filp, |
1419 | - offset, this_chunk); |
1420 | + offset, this_chunk, 0); |
1421 | if (err < 0) { |
1422 | ret = err; |
1423 | break; |
1424 | @@ -338,6 +640,9 @@ int force_page_cache_readahead(struct ad |
1425 | offset += this_chunk; |
1426 | nr_to_read -= this_chunk; |
1427 | } |
1428 | + |
1429 | + ra_account(NULL, RA_EVENT_READAHEAD, ret); |
1430 | + |
1431 | return ret; |
1432 | } |
1433 | |
1434 | @@ -373,10 +678,16 @@ static inline int check_ra_success(struc |
1435 | int do_page_cache_readahead(struct address_space *mapping, struct file *filp, |
1436 | pgoff_t offset, unsigned long nr_to_read) |
1437 | { |
1438 | + unsigned long ret; |
1439 | + |
1440 | if (bdi_read_congested(mapping->backing_dev_info)) |
1441 | return -1; |
1442 | |
1443 | - return __do_page_cache_readahead(mapping, filp, offset, nr_to_read); |
1444 | + ret = __do_page_cache_readahead(mapping, filp, offset, nr_to_read, 0); |
1445 | + |
1446 | + ra_account(NULL, RA_EVENT_READAHEAD, ret); |
1447 | + |
1448 | + return ret; |
1449 | } |
1450 | |
1451 | /* |
1452 | @@ -396,7 +707,11 @@ blockable_page_cache_readahead(struct ad |
1453 | if (!block && bdi_read_congested(mapping->backing_dev_info)) |
1454 | return 0; |
1455 | |
1456 | - actual = __do_page_cache_readahead(mapping, filp, offset, nr_to_read); |
1457 | + actual = __do_page_cache_readahead(mapping, filp, offset, nr_to_read, 0); |
1458 | + |
1459 | + ra_account(NULL, RA_EVENT_READAHEAD, actual); |
1460 | + dprintk("blockable-readahead(ino=%lu, ra=%lu+%lu) = %d\n", |
1461 | + mapping->host->i_ino, offset, nr_to_read, actual); |
1462 | |
1463 | return check_ra_success(ra, nr_to_read, actual); |
1464 | } |
1465 | @@ -442,7 +757,7 @@ static int make_ahead_window(struct addr |
1466 | * @req_size: hint: total size of the read which the caller is performing in |
1467 | * PAGE_CACHE_SIZE units |
1468 | * |
1469 | - * page_cache_readahead() is the main function. If performs the adaptive |
1470 | + * page_cache_readahead() is the main function. It performs the adaptive |
1471 | * readahead window size management and submits the readahead I/O. |
1472 | * |
1473 | * Note that @filp is purely used for passing on to the ->readpage[s]() |
1474 | @@ -572,3 +887,1187 @@ unsigned long max_sane_readahead(unsigne |
1475 | __get_zone_counts(&active, &inactive, &free, NODE_DATA(numa_node_id())); |
1476 | return min(nr, (inactive + free) / 2); |
1477 | } |
1478 | + |
1479 | +/* |
1480 | + * Adaptive read-ahead. |
1481 | + * |
1482 | + * Good read patterns are compact both in space and time. The read-ahead logic |
1483 | + * tries to grant a larger read-ahead size to better readers under the constraint |
1484 | + * of system memory and load pressure. |
1485 | + * |
1486 | + * It employs two methods to estimate the max thrashing safe read-ahead size: |
1487 | + * 1. state based - the default one |
1488 | + * 2. context based - the failsafe one |
1489 | + * The integration of the dual methods has the merit of being agile and robust. |
1490 | + * It makes the overall design clean: special cases are handled in general by |
1491 | + * the stateless method, leaving the stateful one simple and fast. |
1492 | + * |
1493 | + * To improve throughput and decrease read delay, the logic 'looks ahead'. |
1494 | + * In most read-ahead chunks, one page will be selected and tagged with |
1495 | + * PG_readahead. Later when the page with PG_readahead is read, the logic |
1496 | + * will be notified to submit the next read-ahead chunk in advance. |
1497 | + * |
1498 | + * a read-ahead chunk |
1499 | + * +-----------------------------------------+ |
1500 | + * | # PG_readahead | |
1501 | + * +-----------------------------------------+ |
1502 | + * ^ When this page is read, notify me for the next read-ahead. |
1503 | + * |
1504 | + * |
1505 | + * Here are some variable names used frequently: |
1506 | + * |
1507 | + * |<------- la_size ------>| |
1508 | + * +-----------------------------------------+ |
1509 | + * | # | |
1510 | + * +-----------------------------------------+ |
1511 | + * ra_index -->|<---------------- ra_size -------------->| |
1512 | + * |
1513 | + */ |
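+/*
+ * An example of the pipelining, with made-up numbers: chunk A covers
+ * pages 0-15 and carries the PG_readahead mark on page 8.  When the
+ * reader reaches page 8, the next chunk (say pages 16-47) is submitted
+ * at once, so the read at page 16 need not block on a cache miss.
+ */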
1514 | + |
1515 | +#ifdef CONFIG_ADAPTIVE_READAHEAD |
1516 | + |
1517 | +/* |
1518 | + * The nature of read-ahead allows false tests to occur occasionally. |
1519 | + * Here we just do not bother to call get_page(); it would be meaningless anyway. |
1520 | + */ |
1521 | +static inline struct page *__find_page(struct address_space *mapping, |
1522 | + pgoff_t offset) |
1523 | +{ |
1524 | + return radix_tree_lookup(&mapping->page_tree, offset); |
1525 | +} |
1526 | + |
1527 | +static inline struct page *find_page(struct address_space *mapping, |
1528 | + pgoff_t offset) |
1529 | +{ |
1530 | + struct page *page; |
1531 | + |
1532 | + read_lock_irq(&mapping->tree_lock); |
1533 | + page = __find_page(mapping, offset); |
1534 | + read_unlock_irq(&mapping->tree_lock); |
1535 | + return page; |
1536 | +} |
1537 | + |
1538 | +/* |
1539 | + * Move pages in danger (of thrashing) to the head of inactive_list. |
1540 | + * Not expected to happen frequently. |
1541 | + */ |
1542 | +static unsigned long rescue_pages(struct page *page, unsigned long nr_pages) |
1543 | +{ |
1544 | + int pgrescue; |
1545 | + pgoff_t index; |
1546 | + struct zone *zone; |
1547 | + struct address_space *mapping; |
1548 | + |
1549 | + BUG_ON(!nr_pages || !page); |
1550 | + pgrescue = 0; |
1551 | + index = page_index(page); |
1552 | + mapping = page_mapping(page); |
1553 | + |
1554 | + dprintk("rescue_pages(ino=%lu, index=%lu nr=%lu)\n", |
1555 | + mapping->host->i_ino, index, nr_pages); |
1556 | + |
1557 | + for (;;) { |
1558 | + zone = page_zone(page); |
1559 | + spin_lock_irq(&zone->lru_lock); |
1560 | + |
1561 | + if (!PageLRU(page)) |
1562 | + goto out_unlock; |
1563 | + |
1564 | + while (page_mapping(page) == mapping && |
1565 | + page_index(page) == index) { |
1566 | + struct page *the_page = page; |
1567 | + page = next_page(page); |
1568 | + if (!PageActive(the_page) && |
1569 | + !PageLocked(the_page) && |
1570 | + page_count(the_page) == 1) { |
1571 | + list_move(&the_page->lru, &zone->inactive_list); |
1572 | + pgrescue++; |
1573 | + } |
1574 | + index++; |
1575 | + if (!--nr_pages) |
1576 | + goto out_unlock; |
1577 | + } |
1578 | + |
1579 | + spin_unlock_irq(&zone->lru_lock); |
1580 | + |
1581 | + cond_resched(); |
1582 | + page = find_page(mapping, index); |
1583 | + if (!page) |
1584 | + goto out; |
1585 | + } |
1586 | +out_unlock: |
1587 | + spin_unlock_irq(&zone->lru_lock); |
1588 | +out: |
1589 | + ra_account(NULL, RA_EVENT_READAHEAD_RESCUE, pgrescue); |
1590 | + return nr_pages; |
1591 | +} |
1592 | + |
1593 | +/* |
1594 | + * Set a new look-ahead mark at @new_index. |
1595 | + * Return 0 if the new mark is successfully set. |
1596 | + */ |
1597 | +static inline int renew_lookahead(struct address_space *mapping, |
1598 | + struct file_ra_state *ra, |
1599 | + pgoff_t index, pgoff_t new_index) |
1600 | +{ |
1601 | + struct page *page; |
1602 | + |
1603 | + if (index == ra->lookahead_index && |
1604 | + new_index >= ra->readahead_index) |
1605 | + return 1; |
1606 | + |
1607 | + page = find_page(mapping, new_index); |
1608 | + if (!page) |
1609 | + return 1; |
1610 | + |
1611 | + __SetPageReadahead(page); |
1612 | + if (ra->lookahead_index == index) |
1613 | + ra->lookahead_index = new_index; |
1614 | + |
1615 | + return 0; |
1616 | +} |
1617 | + |
1618 | +/* |
1619 | + * State based calculation of read-ahead request. |
1620 | + * |
1621 | + * This figure shows the meaning of file_ra_state members: |
1622 | + * |
1623 | + * chunk A chunk B |
1624 | + * +---------------------------+-------------------------------------------+ |
1625 | + * | # | # | |
1626 | + * +---------------------------+-------------------------------------------+ |
1627 | + * ^ ^ ^ ^ |
1628 | + * la_index ra_index lookahead_index readahead_index |
1629 | + */ |
1630 | + |
1631 | +/* |
1632 | + * The node's effective length of inactive_list(s). |
1633 | + */ |
1634 | +static unsigned long node_free_and_cold_pages(void) |
1635 | +{ |
1636 | + unsigned int i; |
1637 | + unsigned long sum = 0; |
1638 | + struct zone *zones = NODE_DATA(numa_node_id())->node_zones; |
1639 | + |
1640 | + for (i = 0; i < MAX_NR_ZONES; i++) |
1641 | + sum += zones[i].nr_inactive + |
1642 | + zones[i].free_pages - zones[i].pages_low; |
1643 | + |
1644 | + return sum; |
1645 | +} |
1646 | + |
1647 | +/* |
1648 | + * The node's accumulated aging activities. |
1649 | + */ |
1650 | +static unsigned long node_readahead_aging(void) |
1651 | +{ |
1652 | + unsigned long cpu; |
1653 | + unsigned long sum = 0; |
1654 | + cpumask_t mask = node_to_cpumask(numa_node_id()); |
1655 | + |
1656 | + for_each_cpu_mask(cpu, mask) |
1657 | + sum += per_cpu(readahead_aging, cpu); |
1658 | + |
1659 | + return sum; |
1660 | +} |
1661 | + |
1662 | +/* |
1663 | + * The 64bit cache_hits stores three accumulated values and a counter value. |
1664 | + * MSB LSB |
1665 | + * 3333333333333333 : 2222222222222222 : 1111111111111111 : 0000000000000000 |
1666 | + */ |
1667 | +static inline int ra_cache_hit(struct file_ra_state *ra, int nr) |
1668 | +{ |
1669 | + return (ra->cache_hits >> (nr * 16)) & 0xFFFF; |
1670 | +} |
1671 | + |
1672 | +/* |
1673 | + * Conceptual code: |
1674 | + * ra_cache_hit(ra, 1) += ra_cache_hit(ra, 0); |
1675 | + * ra_cache_hit(ra, 0) = 0; |
1676 | + */ |
1677 | +static inline void ra_addup_cache_hit(struct file_ra_state *ra) |
1678 | +{ |
1679 | + int n; |
1680 | + |
1681 | + n = ra_cache_hit(ra, 0); |
1682 | + ra->cache_hits -= n; |
1683 | + n <<= 16; |
1684 | + ra->cache_hits += n; |
1685 | +} |
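+/*
+ * A worked example of the packing above, with hypothetical values:
+ * cache_hits == 0x0005000300020004 holds the accumulators hit3=5,
+ * hit2=3, hit1=2 and the current counter hit0=4.  ra_cache_hit(ra, 0)
+ * extracts 4; ra_addup_cache_hit() folds the counter into slot 1,
+ * yielding 0x0005000300060000.  For non-state classes, ra_set_class()
+ * additionally shifts the whole word left by 16 bits, retiring the
+ * oldest accumulator and opening a fresh counter slot.
+ */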
1686 | + |
1687 | +/* |
1688 | + * The read-ahead is deemed successful if cache-hit-rate >= 1/readahead_hit_rate. |
1689 | + */ |
1690 | +static inline int ra_cache_hit_ok(struct file_ra_state *ra) |
1691 | +{ |
1692 | + return ra_cache_hit(ra, 0) * readahead_hit_rate >= |
1693 | + (ra->lookahead_index - ra->la_index); |
1694 | +} |
1695 | + |
1696 | +/* |
1697 | + * Check if @index falls in the @ra request. |
1698 | + */ |
1699 | +static inline int ra_has_index(struct file_ra_state *ra, pgoff_t index) |
1700 | +{ |
1701 | + if (index < ra->la_index || index >= ra->readahead_index) |
1702 | + return 0; |
1703 | + |
1704 | + if (index >= ra->ra_index) |
1705 | + return 1; |
1706 | + else |
1707 | + return -1; |
1708 | +} |
1709 | + |
1710 | +/* |
1711 | + * Which method is issuing this read-ahead? |
1712 | + */ |
1713 | +static inline void ra_set_class(struct file_ra_state *ra, |
1714 | + enum ra_class ra_class) |
1715 | +{ |
1716 | + unsigned long flags_mask; |
1717 | + unsigned long flags; |
1718 | + unsigned long old_ra_class; |
1719 | + |
1720 | + flags_mask = ~(RA_CLASS_MASK | (RA_CLASS_MASK << RA_CLASS_SHIFT)); |
1721 | + flags = ra->flags & flags_mask; |
1722 | + |
1723 | + old_ra_class = (ra->flags & RA_CLASS_MASK) << RA_CLASS_SHIFT; |
1724 | + |
1725 | + ra->flags = flags | old_ra_class | ra_class; |
1726 | + |
1727 | + ra_addup_cache_hit(ra); |
1728 | + if (ra_class != RA_CLASS_STATE) |
1729 | + ra->cache_hits <<= 16; |
1730 | + |
1731 | + ra->age = node_readahead_aging(); |
1732 | +} |
1733 | + |
1734 | +/* |
1735 | + * Where is the old read-ahead and look-ahead? |
1736 | + */ |
1737 | +static inline void ra_set_index(struct file_ra_state *ra, |
1738 | + pgoff_t la_index, pgoff_t ra_index) |
1739 | +{ |
1740 | + ra->la_index = la_index; |
1741 | + ra->ra_index = ra_index; |
1742 | +} |
1743 | + |
1744 | +/* |
1745 | + * Where is the new read-ahead and look-ahead? |
1746 | + */ |
1747 | +static inline void ra_set_size(struct file_ra_state *ra, |
1748 | + unsigned long ra_size, unsigned long la_size) |
1749 | +{ |
1750 | + /* Disable look-ahead for loopback file. */ |
1751 | + if (unlikely(ra->flags & RA_FLAG_NO_LOOKAHEAD)) |
1752 | + la_size = 0; |
1753 | + |
1754 | + ra->readahead_index = ra->ra_index + ra_size; |
1755 | + ra->lookahead_index = ra->readahead_index - la_size; |
1756 | +} |
1757 | + |
1758 | +/* |
1759 | + * Submit IO for the read-ahead request in file_ra_state. |
1760 | + */ |
1761 | +static int ra_dispatch(struct file_ra_state *ra, |
1762 | + struct address_space *mapping, struct file *filp) |
1763 | +{ |
1764 | + pgoff_t eof_index; |
1765 | + unsigned long ra_size; |
1766 | + unsigned long la_size; |
1767 | + int actual; |
1768 | + enum ra_class ra_class; |
1769 | + |
1770 | + ra_class = (ra->flags & RA_CLASS_MASK); |
1771 | + BUG_ON(ra_class == 0 || ra_class > RA_CLASS_END); |
1772 | + |
1773 | + eof_index = ((i_size_read(mapping->host) - 1) >> PAGE_CACHE_SHIFT) + 1; |
1774 | + ra_size = ra->readahead_index - ra->ra_index; |
1775 | + la_size = ra->readahead_index - ra->lookahead_index; |
1776 | + |
1777 | + /* Snap to EOF. */ |
1778 | + if (unlikely(ra->ra_index >= eof_index)) |
1779 | + return 0; |
1780 | + if (ra->readahead_index + ra_size / 2 > eof_index) { |
1781 | + if (ra_class == RA_CLASS_CONTEXT_AGGRESSIVE && |
1782 | + eof_index > ra->lookahead_index + 1) |
1783 | + la_size = eof_index - ra->lookahead_index; |
1784 | + else |
1785 | + la_size = 0; |
1786 | + ra_size = eof_index - ra->ra_index; |
1787 | + ra_set_size(ra, ra_size, la_size); |
1788 | + } |
1789 | + |
1790 | + actual = __do_page_cache_readahead(mapping, filp, |
1791 | + ra->ra_index, ra_size, la_size); |
1792 | + |
1793 | +#ifdef CONFIG_DEBUG_READAHEAD |
1794 | + if (ra->flags & RA_FLAG_MMAP) |
1795 | + ra_account(ra, RA_EVENT_READAHEAD_MMAP, actual); |
1796 | + if (ra->readahead_index == eof_index) |
1797 | + ra_account(ra, RA_EVENT_READAHEAD_EOF, actual); |
1798 | + if (la_size) |
1799 | + ra_account(ra, RA_EVENT_LOOKAHEAD, la_size); |
1800 | + if (ra_size > actual) |
1801 | + ra_account(ra, RA_EVENT_IO_CACHE_HIT, ra_size - actual); |
1802 | + ra_account(ra, RA_EVENT_READAHEAD, actual); |
1803 | + |
1804 | + if (!ra->ra_index && filp->f_dentry->d_inode) { |
1805 | + char *fn; |
1806 | + static char path[1024]; |
1807 | + unsigned long size; |
1808 | + |
1809 | + size = (i_size_read(filp->f_dentry->d_inode)+1023)/1024; |
1810 | + fn = d_path(filp->f_dentry, filp->f_vfsmnt, path, 1000); |
1811 | + if (!IS_ERR(fn)) |
1812 | + ddprintk("ino %lu is %s size %luK by %s(%d)\n", |
1813 | + filp->f_dentry->d_inode->i_ino, |
1814 | + fn, size, |
1815 | + current->comm, current->pid); |
1816 | + } |
1817 | + |
1818 | + dprintk("readahead-%s(ino=%lu, index=%lu, ra=%lu+%lu-%lu) = %d\n", |
1819 | + ra_class_name[ra_class], |
1820 | + mapping->host->i_ino, ra->la_index, |
1821 | + ra->ra_index, ra_size, la_size, actual); |
1822 | +#endif /* CONFIG_DEBUG_READAHEAD */ |
1823 | + |
1824 | + return actual; |
1825 | +} |
1826 | + |
1827 | +/* |
1828 | + * Determine the ra request from primitive values. |
1829 | + * |
1830 | + * It applies the following rules: |
1831 | + * - Subtract the old look-ahead from ra_size to get the real safe read-ahead; |
1832 | + * - Set new la_size according to the (still large) ra_size; |
1833 | + * - Apply upper limits; |
1834 | + * - Make sure stream_shift is not too small. |
1835 | + * (So that the next global_shift will not be too small.) |
1836 | + * |
1837 | + * Input: |
1838 | + * ra_size stores the estimated thrashing-threshold. |
1839 | + * la_size stores the look-ahead size of previous request. |
1840 | + */ |
1841 | +static inline int adjust_rala(unsigned long ra_max, |
1842 | + unsigned long *ra_size, unsigned long *la_size) |
1843 | +{ |
1844 | + unsigned long stream_shift = *la_size; |
1845 | + |
1846 | + if (*ra_size > *la_size) |
1847 | + *ra_size -= *la_size; |
1848 | + else { |
1849 | + ra_account(NULL, RA_EVENT_READAHEAD_SHRINK, *ra_size); |
1850 | + return 0; |
1851 | + } |
1852 | + |
1853 | + *la_size = *ra_size / LOOKAHEAD_RATIO; |
1854 | + |
1855 | + if (*ra_size > ra_max) |
1856 | + *ra_size = ra_max; |
1857 | + if (*la_size > *ra_size) |
1858 | + *la_size = *ra_size; |
1859 | + |
1860 | + stream_shift += (*ra_size - *la_size); |
1861 | + if (stream_shift < *ra_size / 4) |
1862 | + *la_size -= (*ra_size / 4 - stream_shift); |
1863 | + |
1864 | + return 1; |
1865 | +} |
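+/*
+ * A worked example of the rules above, assuming LOOKAHEAD_RATIO == 8
+ * and hypothetical sizes: with an estimated threshold *ra_size = 120,
+ * an old look-ahead *la_size = 40 and ra_max = 256:
+ * - the safe read-ahead becomes 120 - 40 = 80 pages;
+ * - the new look-ahead becomes 80 / 8 = 10 pages;
+ * - both fit the upper limits, and stream_shift = 40 + (80 - 10) = 110
+ *   is well above 80 / 4, so no further trimming of la_size is needed.
+ */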
1866 | + |
1867 | +/* |
1868 | + * The function estimates two values: |
1869 | + * 1. thrashing-threshold for the current stream |
1870 | + * It is returned to make the next read-ahead request. |
1871 | + * 2. the remaining safe space for the current chunk |
1872 | + * It will be checked to ensure that the current chunk is safe. |
1873 | + * |
1874 | + * The computation will be pretty accurate under heavy load, and will fluctuate |
1875 | + * more under light load (with a small global_shift), so the growth speed of |
1876 | + * ra_size must be limited, and a moderately large stream_shift must be ensured. |
1877 | + * |
1878 | + * This figure illustrates the formula used in the function: |
1879 | + * While the stream reads stream_shift pages inside the chunks, |
1880 | + * the chunks are shifted global_shift pages inside inactive_list. |
1881 | + * |
1882 | + * chunk A chunk B |
1883 | + * |<=============== global_shift ================| |
1884 | + * +-------------+ +-------------------+ | |
1885 | + * | # | | # | inactive_list | |
1886 | + * +-------------+ +-------------------+ head | |
1887 | + * |---->| |---------->| |
1888 | + * | | |
1889 | + * +-- stream_shift --+ |
1890 | + */ |
1891 | +static inline unsigned long compute_thrashing_threshold( |
1892 | + struct file_ra_state *ra, |
1893 | + unsigned long *remain) |
1894 | +{ |
1895 | + unsigned long global_size; |
1896 | + unsigned long global_shift; |
1897 | + unsigned long stream_shift; |
1898 | + unsigned long ra_size; |
1899 | + uint64_t ll; |
1900 | + |
1901 | + global_size = node_free_and_cold_pages(); |
1902 | + global_shift = node_readahead_aging() - ra->age; |
1903 | + global_shift |= 1UL; |
1904 | + stream_shift = ra_cache_hit(ra, 0); |
1905 | + |
1906 | + ll = (uint64_t) stream_shift * (global_size >> 9) * readahead_ratio * 5; |
1907 | + do_div(ll, global_shift); |
1908 | + ra_size = ll; |
1909 | + |
1910 | + if (global_size > global_shift) { |
1911 | + ll = (uint64_t) stream_shift * (global_size - global_shift); |
1912 | + do_div(ll, global_shift); |
1913 | + *remain = ll; |
1914 | + } else |
1915 | + *remain = 0; |
1916 | + |
1917 | + ddprintk("compute_thrashing_threshold: " |
1918 | + "at %lu ra %lu=%lu*%lu/%lu, remain %lu for %lu\n", |
1919 | + ra->readahead_index, ra_size, |
1920 | + stream_shift, global_size, global_shift, |
1921 | + *remain, ra->readahead_index - ra->lookahead_index); |
1922 | + |
1923 | + return ra_size; |
1924 | +} |
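+/*
+ * A worked example of the formula above, with hypothetical numbers:
+ * global_size = 65536 pages, global_shift = 8192, stream_shift = 256
+ * and readahead_ratio = 50.  The exact threshold would be
+ * 256 * 65536 / 8192 = 2048 pages; since (* readahead_ratio * 5 >> 9)
+ * closely approximates (* readahead_ratio / 100), the computed ra_size
+ * is about half of that: 256 * 128 * 250 / 8192 = 1000 pages.  The
+ * remaining space is 256 * (65536 - 8192) / 8192 = 1792 pages.
+ */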
1925 | + |
1926 | +/* |
1927 | + * Main function for file_ra_state based read-ahead. |
1928 | + */ |
1929 | +static inline unsigned long |
1930 | +state_based_readahead(struct address_space *mapping, struct file *filp, |
1931 | + struct file_ra_state *ra, |
1932 | + struct page *page, pgoff_t index, |
1933 | + unsigned long ra_size, unsigned long ra_max) |
1934 | +{ |
1935 | + unsigned long ra_old; |
1936 | + unsigned long la_size; |
1937 | + unsigned long remain_space; |
1938 | + unsigned long growth_limit; |
1939 | + |
1940 | + la_size = ra->readahead_index - index; |
1941 | + ra_old = ra->readahead_index - ra->ra_index; |
1942 | + growth_limit = ra_size + ra_max / 16 + |
1943 | + (2 + readahead_ratio / 64) * ra_old; |
1944 | + ra_size = compute_thrashing_threshold(ra, &remain_space); |
1945 | + |
1946 | + if (page && remain_space <= la_size && la_size > 1) { |
1947 | + rescue_pages(page, la_size); |
1948 | + return 0; |
1949 | + } |
1950 | + |
1951 | + if (!adjust_rala(min(ra_max, growth_limit), &ra_size, &la_size)) |
1952 | + return 0; |
1953 | + |
1954 | + ra_set_class(ra, RA_CLASS_STATE); |
1955 | + ra_set_index(ra, index, ra->readahead_index); |
1956 | + ra_set_size(ra, ra_size, la_size); |
1957 | + |
1958 | + return ra_dispatch(ra, mapping, filp); |
1959 | +} |
1960 | + |
1961 | +/* |
1962 | + * Page cache context based estimation of read-ahead/look-ahead size/index. |
1963 | + * |
1964 | + * The logic first looks around to find the start point of next read-ahead, |
1965 | + * and then, if necessary, looks backward in the inactive_list to get an |
1966 | + * estimation of the thrashing-threshold. |
1967 | + * |
1968 | + * The estimation theory can be illustrated with figure: |
1969 | + * |
1970 | + * chunk A chunk B chunk C head |
1971 | + * |
1972 | + * l01 l11 l12 l21 l22 |
1973 | + *| |-->|-->| |------>|-->| |------>| |
1974 | + *| +-------+ +-----------+ +-------------+ | |
1975 | + *| | # | | # | | # | | |
1976 | + *| +-------+ +-----------+ +-------------+ | |
1977 | + *| |<==============|<===========================|<============================| |
1978 | + * L0 L1 L2 |
1979 | + * |
1980 | + * Let f(l) = L be a map from |
1981 | + * l: the number of pages read by the stream |
1982 | + * to |
1983 | + * L: the number of pages pushed into inactive_list in the mean time |
1984 | + * then |
1985 | + * f(l01) <= L0 |
1986 | + * f(l11 + l12) = L1 |
1987 | + * f(l21 + l22) = L2 |
1988 | + * ... |
1989 | + * f(l01 + l11 + ...) <= Sum(L0 + L1 + ...) |
1990 | + * <= Length(inactive_list) = f(thrashing-threshold) |
1991 | + * |
1992 | + * So the count of continuous history pages left in the inactive_list is |
1993 | + * always a lower bound of the true thrashing-threshold. |
1994 | + */ |
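+/*
+ * For illustration: if the nearest 100 history pages of a stream all
+ * survive in the inactive_list, the stream has shown it can read 100
+ * pages before they travel the length of the list, so 100 pages is a
+ * safe, if conservative, estimate of its thrashing-threshold.
+ */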
1995 | + |
1996 | +#define PAGE_REFCNT_0 0 |
1997 | +#define PAGE_REFCNT_1 (1 << PG_referenced) |
1998 | +#define PAGE_REFCNT_2 (1 << PG_active) |
1999 | +#define PAGE_REFCNT_3 ((1 << PG_active) | (1 << PG_referenced)) |
2000 | +#define PAGE_REFCNT_MASK PAGE_REFCNT_3 |
2001 | + |
2002 | +/* |
2003 | + * STATUS REFERENCE COUNT |
2004 | + * __ 0 |
2005 | + * _R PAGE_REFCNT_1 |
2006 | + * A_ PAGE_REFCNT_2 |
2007 | + * AR PAGE_REFCNT_3 |
2008 | + * |
2009 | + * A/R: Active / Referenced |
2010 | + */ |
2011 | +static inline unsigned long page_refcnt(struct page *page) |
2012 | +{ |
2013 | + return page->flags & PAGE_REFCNT_MASK; |
2014 | +} |
2015 | + |
2016 | +/* |
2017 | + * STATUS REFERENCE COUNT TYPE |
2018 | + * __ 0 fresh |
2019 | + * _R PAGE_REFCNT_1 stale |
2020 | + * A_ PAGE_REFCNT_2 disturbed once |
2021 | + * AR PAGE_REFCNT_3 disturbed twice |
2022 | + * |
2023 | + * A/R: Active / Referenced |
2024 | + */ |
2025 | +static inline unsigned long cold_page_refcnt(struct page *page) |
2026 | +{ |
2027 | + if (!page || PageActive(page)) |
2028 | + return 0; |
2029 | + |
2030 | + return page_refcnt(page); |
2031 | +} |
2032 | + |
2033 | +static inline char page_refcnt_symbol(struct page *page) |
2034 | +{ |
2035 | + if (!page) |
2036 | + return 'X'; |
2037 | + |
2038 | + switch (page_refcnt(page)) { |
2039 | + case 0: |
2040 | + return '_'; |
2041 | + case PAGE_REFCNT_1: |
2042 | + return '-'; |
2043 | + case PAGE_REFCNT_2: |
2044 | + return '='; |
2045 | + case PAGE_REFCNT_3: |
2046 | + return '#'; |
2047 | + default: |
2048 | + return '?'; |
2049 | + } |
2050 | +} |
2051 | + |
2052 | +/* |
2053 | + * Count/estimate cache hits in range [first_index, last_index]. |
2054 | + * The estimation is simple and optimistic. |
2055 | + */ |
2056 | +static int count_cache_hit(struct address_space *mapping, |
2057 | + pgoff_t first_index, pgoff_t last_index) |
2058 | +{ |
2059 | + struct page *page; |
2060 | + int size = last_index - first_index + 1; |
2061 | + int count = 0; |
2062 | + int i; |
2063 | + |
2064 | + cond_resched(); |
2065 | + read_lock_irq(&mapping->tree_lock); |
2066 | + |
2067 | + /* |
2068 | + * The first page may well be the chunk head and have been accessed, |
2069 | + * so it is index 0 that makes the estimation optimistic. This |
2070 | + * behavior guarantees a readahead when (size < ra_max) and |
2071 | + * (readahead_hit_rate >= 16). |
2072 | + */ |
2073 | + for (i = 0; i < 16;) { |
2074 | + page = __find_page(mapping, first_index + |
2075 | + size * ((i++ * 29) & 15) / 16); |
2076 | + if (cold_page_refcnt(page) >= PAGE_REFCNT_1 && ++count >= 2) |
2077 | + break; |
2078 | + } |
2079 | + |
2080 | + read_unlock_irq(&mapping->tree_lock); |
2081 | + |
2082 | + return size * count / i; |
2083 | +} |
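+/*
+ * The probe pattern above is a cheap pseudo-random sample: since 29 is
+ * odd, (i * 29) & 15 walks through every residue 0..15 exactly once,
+ * so up to 16 positions spread over the range are tested.  E.g. for
+ * size = 32 the probed offsets begin 0, 26, 20, 14, 8, 2, 28, ...
+ * The loop stops early once two referenced cold pages are found, and
+ * the hit count is then scaled to the whole range as size * count / i.
+ */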
2084 | + |
2085 | +/* |
2086 | + * Look back and check history pages to estimate thrashing-threshold. |
2087 | + */ |
2088 | +static unsigned long query_page_cache_segment(struct address_space *mapping, |
2089 | + struct file_ra_state *ra, |
2090 | + unsigned long *remain, pgoff_t offset, |
2091 | + unsigned long ra_min, unsigned long ra_max) |
2092 | +{ |
2093 | + pgoff_t index; |
2094 | + unsigned long count; |
2095 | + unsigned long nr_lookback; |
2096 | + struct radix_tree_cache cache; |
2097 | + |
2098 | + /* |
2099 | + * Scan backward and check the nearest @ra_max pages. |
2100 | + * The count here determines ra_size. |
2101 | + */ |
2102 | + cond_resched(); |
2103 | + read_lock_irq(&mapping->tree_lock); |
2104 | + index = radix_tree_scan_hole_backward(&mapping->page_tree, |
2105 | + offset, ra_max); |
2106 | +#ifdef DEBUG_READAHEAD_RADIXTREE |
2107 | + WARN_ON(index > offset); |
2108 | + if (index != offset) |
2109 | + WARN_ON(!__find_page(mapping, index + 1)); |
2110 | + if (index && offset - index < ra_max) |
2111 | + WARN_ON(__find_page(mapping, index)); |
2112 | +#endif |
2113 | + read_unlock_irq(&mapping->tree_lock); |
2114 | + |
2115 | + *remain = offset - index; |
2116 | + |
2117 | + if (offset == ra->readahead_index && ra_cache_hit_ok(ra)) |
2118 | + count = *remain; |
2119 | + else if (count_cache_hit(mapping, index + 1, offset) * |
2120 | + readahead_hit_rate >= *remain) |
2121 | + count = *remain; |
2122 | + else |
2123 | + count = ra_min; |
2124 | + |
2125 | + /* |
2126 | + * Unnecessary to count more? |
2127 | + */ |
2128 | + if (count < ra_max) |
2129 | + goto out; |
2130 | + |
2131 | + if (unlikely(ra->flags & RA_FLAG_NO_LOOKAHEAD)) |
2132 | + goto out; |
2133 | + |
2134 | + /* |
2135 | + * Check the far pages coarsely. |
2136 | + * The big count here helps increase la_size. |
2137 | + */ |
2138 | + nr_lookback = ra_max * (LOOKAHEAD_RATIO + 1) * |
2139 | + 100 / (readahead_ratio + 1); |
2140 | + |
2141 | + cond_resched(); |
2142 | + radix_tree_cache_init(&cache); |
2143 | + read_lock_irq(&mapping->tree_lock); |
2144 | + for (count += ra_max; count < nr_lookback; count += ra_max) { |
2145 | + struct radix_tree_node *node; |
2146 | + node = radix_tree_cache_lookup_node(&mapping->page_tree, |
2147 | + &cache, offset - count, 1); |
2148 | +#ifdef DEBUG_READAHEAD_RADIXTREE |
2149 | + if (node != radix_tree_lookup_node(&mapping->page_tree, |
2150 | + offset - count, 1)) |
2151 | + BUG(); |
2152 | +#endif |
2153 | + if (!node) |
2154 | + break; |
2155 | + } |
2156 | + read_unlock_irq(&mapping->tree_lock); |
2157 | + |
2158 | +out: |
2159 | + /* |
2160 | + * For a sequential read that extends from index 0, the counted value |
2161 | + * may well be far under the true threshold, so return it unmodified |
2162 | + * for further processing in adjust_rala_aggressive(). |
2163 | + */ |
2164 | + if (count >= offset) |
2165 | + count = offset; |
2166 | + else |
2167 | + count = max(ra_min, count * readahead_ratio / 100); |
2168 | + |
2169 | + ddprintk("query_page_cache_segment: " |
2170 | + "ino=%lu, idx=%lu, count=%lu, remain=%lu\n", |
2171 | + mapping->host->i_ino, offset, count, *remain); |
2172 | + |
2173 | + return count; |
2174 | +} |
2175 | + |
2176 | +/* |
2177 | + * Find past-the-end index of the segment before @index. |
2178 | + */ |
2179 | +static inline pgoff_t find_segtail_backward(struct address_space *mapping, |
2180 | + pgoff_t index, unsigned long max_scan) |
2181 | +{ |
2182 | + struct radix_tree_cache cache; |
2183 | + struct page *page; |
2184 | + pgoff_t origin; |
2185 | + |
2186 | + origin = index; |
2187 | + if (max_scan > index) |
2188 | + max_scan = index; |
2189 | + |
2190 | + cond_resched(); |
2191 | + radix_tree_cache_init(&cache); |
2192 | + read_lock_irq(&mapping->tree_lock); |
2193 | + for (; origin - index < max_scan;) { |
2194 | + page = radix_tree_cache_lookup(&mapping->page_tree, |
2195 | + &cache, --index); |
2196 | + if (page) { |
2197 | + read_unlock_irq(&mapping->tree_lock); |
2198 | + return index + 1; |
2199 | + } |
2200 | + } |
2201 | + read_unlock_irq(&mapping->tree_lock); |
2202 | + |
2203 | + return 0; |
2204 | +} |
2205 | + |
2206 | +/* |
2207 | + * Find past-the-end index of the segment at @index. |
2208 | + */ |
2209 | +static inline pgoff_t find_segtail(struct address_space *mapping, |
2210 | + pgoff_t index, unsigned long max_scan) |
2211 | +{ |
2212 | + pgoff_t ra_index; |
2213 | + |
2214 | + cond_resched(); |
2215 | + read_lock_irq(&mapping->tree_lock); |
2216 | + ra_index = radix_tree_scan_hole(&mapping->page_tree, index, max_scan); |
2217 | +#ifdef DEBUG_READAHEAD_RADIXTREE |
2218 | + BUG_ON(!__find_page(mapping, index)); |
2219 | + WARN_ON(ra_index < index); |
2220 | + if (ra_index != index && !__find_page(mapping, ra_index - 1)) |
2221 | + printk(KERN_ERR "radix_tree_scan_hole(index=%lu ra_index=%lu " |
2222 | + "max_scan=%lu nrpages=%lu) fooled!\n", |
2223 | + index, ra_index, max_scan, mapping->nrpages); |
2224 | + if (ra_index != ~0UL && ra_index - index < max_scan) |
2225 | + WARN_ON(__find_page(mapping, ra_index)); |
2226 | +#endif |
2227 | + read_unlock_irq(&mapping->tree_lock); |
2228 | + |
2229 | + if (ra_index <= index + max_scan) |
2230 | + return ra_index; |
2231 | + else |
2232 | + return 0; |
2233 | +} |
2234 | + |
2235 | +/* |
2236 | + * Determine the request parameters for context based read-ahead that extends |
2237 | + * from start of file. |
2238 | + * |
2239 | + * The major weakness of the stateless method is perhaps the slow growth of |
2240 | + * ra_size. The logic tries to make up for this in the important case of |
2241 | + * sequential reads that extend from the start of file. In this case, ra_size |
2242 | + * is not chosen to make the whole next chunk safe (as in normal ones): only |
2243 | + * half of it is. The added 'unsafe' half is the look-ahead part. It is |
2244 | + * expected to be safeguarded by rescue_pages() when the previous chunks are |
2245 | + * lost. |
2246 | + */ |
2247 | +static inline int adjust_rala_aggressive(unsigned long ra_max, |
2248 | + unsigned long *ra_size, unsigned long *la_size) |
2249 | +{ |
2250 | + pgoff_t index = *ra_size; |
2251 | + |
2252 | + *ra_size -= min(*ra_size, *la_size); |
2253 | + *ra_size = *ra_size * readahead_ratio / 100; |
2254 | + *la_size = index * readahead_ratio / 100; |
2255 | + *ra_size += *la_size; |
2256 | + |
2257 | + if (*ra_size > ra_max) |
2258 | + *ra_size = ra_max; |
2259 | + if (*la_size > *ra_size) |
2260 | + *la_size = *ra_size; |
2261 | + |
2262 | + return 1; |
2263 | +} |
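+/*
+ * A worked example of the aggressive sizing, with hypothetical values
+ * and readahead_ratio = 50: a stream has read *ra_size = 200 pages
+ * from the start of file, with an old look-ahead *la_size = 20.
+ * The safe half becomes (200 - 20) * 50 / 100 = 90 pages, the new
+ * look-ahead 200 * 50 / 100 = 100 pages, so ra_size = 190 in total,
+ * of which only the first 90 are known to be thrashing safe.
+ */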
2264 | + |
2265 | +/* |
2266 | + * Main function for page context based read-ahead. |
2267 | + */ |
2268 | +static inline int |
2269 | +try_context_based_readahead(struct address_space *mapping, |
2270 | + struct file_ra_state *ra, struct page *prev_page, |
2271 | + struct page *page, pgoff_t index, |
2272 | + unsigned long ra_min, unsigned long ra_max) |
2273 | +{ |
2274 | + pgoff_t ra_index; |
2275 | + unsigned long ra_size; |
2276 | + unsigned long la_size; |
2277 | + unsigned long remain_pages; |
2278 | + |
2279 | + /* Where to start read-ahead? |
2280 | + * NFSv3 daemons may process adjacent requests in parallel, |
2281 | + * leading to many locally disordered, globally sequential reads. |
2282 | + * So do not require nearby history pages to be present or accessed. |
2283 | + */ |
2284 | + if (page) { |
2285 | + ra_index = find_segtail(mapping, index, ra_max * 5 / 4); |
2286 | + if (!ra_index) |
2287 | + return -1; |
2288 | + } else if (prev_page || find_page(mapping, index - 1)) { |
2289 | + ra_index = index; |
2290 | + } else if (readahead_hit_rate > 1) { |
2291 | + ra_index = find_segtail_backward(mapping, index, |
2292 | + readahead_hit_rate + ra_min); |
2293 | + if (!ra_index) |
2294 | + return 0; |
2295 | + ra_min += 2 * (index - ra_index); |
2296 | + index = ra_index; /* pretend the request starts here */ |
2297 | + } else |
2298 | + return 0; |
2299 | + |
2300 | + ra_size = query_page_cache_segment(mapping, ra, &remain_pages, |
2301 | + index, ra_min, ra_max); |
2302 | + |
2303 | + la_size = ra_index - index; |
2304 | + if (page && remain_pages <= la_size && |
2305 | + remain_pages < index && la_size > 1) { |
2306 | + rescue_pages(page, la_size); |
2307 | + return -1; |
2308 | + } |
2309 | + |
2310 | + if (ra_size == index) { |
2311 | + if (!adjust_rala_aggressive(ra_max, &ra_size, &la_size)) |
2312 | + return -1; |
2313 | + ra_set_class(ra, RA_CLASS_CONTEXT_AGGRESSIVE); |
2314 | + } else { |
2315 | + if (!adjust_rala(ra_max, &ra_size, &la_size)) |
2316 | + return -1; |
2317 | + ra_set_class(ra, RA_CLASS_CONTEXT); |
2318 | + } |
2319 | + |
2320 | + ra_set_index(ra, index, ra_index); |
2321 | + ra_set_size(ra, ra_size, la_size); |
2322 | + |
2323 | + return 1; |
2324 | +} |
2325 | + |
2326 | +/* |
2327 | + * Read-ahead on start of file. |
2328 | + * |
2329 | + * The strategies here are most important for small files. |
2330 | + * 1. Set a moderately large read-ahead size; |
2331 | + * 2. Issue the next read-ahead request as soon as possible. |
2332 | + * |
2333 | + * But be careful: there are some applications that dip into only the very head |
2334 | + * of a file. The most important thing is to prevent them from triggering the |
2335 | + * next (much larger) read-ahead request, which leads to lots of cache misses. |
2336 | + * Two pages should be enough for them, correct me if I'm wrong. |
2337 | + */ |
2338 | +static inline unsigned long |
2339 | +newfile_readahead(struct address_space *mapping, |
2340 | + struct file *filp, struct file_ra_state *ra, |
2341 | + unsigned long req_size, unsigned long ra_min) |
2342 | +{ |
2343 | + unsigned long ra_size; |
2344 | + unsigned long la_size; |
2345 | + |
2346 | + if (req_size > ra_min) /* larger value risks thrashing */ |
2347 | + req_size = ra_min; |
2348 | + |
2349 | + if (unlikely(ra->flags & RA_FLAG_NFSD)) { |
2350 | + ra_size = MIN_NFSD_PAGES; |
2351 | + la_size = 0; |
2352 | + } else { |
2353 | + ra_size = 4 * req_size; |
2354 | + la_size = 2 * req_size; |
2355 | + } |
2356 | + |
2357 | + ra_set_class(ra, RA_CLASS_NEWFILE); |
2358 | + ra_set_index(ra, 0, 0); |
2359 | + ra_set_size(ra, ra_size, la_size); |
2360 | + |
2361 | + return ra_dispatch(ra, mapping, filp); |
2362 | +} |
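+/*
+ * For example (hypothetical sizes): on the first 16KB read of a file
+ * (req_size = 4 pages) with ra_min >= 4, a non-NFSD reader gets
+ * ra_size = 16 and la_size = 8, i.e. pages 0-15 are read in and the
+ * PG_readahead mark is placed on page 8, half way into the chunk.
+ */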
2363 | + |
2364 | +/* |
2365 | + * Backward prefetching. |
2366 | + * No look-ahead or thrashing-threshold estimation for the stepping-backward |
2367 | + * pattern: they should be unnecessary. |
2368 | + */ |
2369 | +static inline int |
2370 | +try_read_backward(struct file_ra_state *ra, pgoff_t begin_index, |
2371 | + unsigned long ra_size, unsigned long ra_max) |
2372 | +{ |
2373 | + pgoff_t end_index; |
2374 | + |
2375 | + /* Are we reading backward? */ |
2376 | + if (begin_index > ra->prev_page) |
2377 | + return 0; |
2378 | + |
2379 | + if ((ra->flags & RA_CLASS_MASK) == RA_CLASS_BACKWARD && |
2380 | + ra_has_index(ra, ra->prev_page)) { |
2381 | + ra_size += 2 * ra_cache_hit(ra, 0); |
2382 | + end_index = ra->la_index; |
2383 | + } else { |
2384 | + ra_size += ra_size + ra_size * (readahead_hit_rate - 1) / 2; |
2385 | + end_index = ra->prev_page; |
2386 | + } |
2387 | + |
2388 | + if (ra_size > ra_max) |
2389 | + ra_size = ra_max; |
2390 | + |
2391 | + /* Read traces close enough to be covered by the prefetching? */ |
2392 | + if (end_index > begin_index + ra_size) |
2393 | + return 0; |
2394 | + |
2395 | + begin_index = end_index - ra_size; |
2396 | + |
2397 | + ra_set_class(ra, RA_CLASS_BACKWARD); |
2398 | + ra_set_index(ra, begin_index, begin_index); |
2399 | + ra_set_size(ra, ra_size, 0); |
2400 | + |
2401 | + return 1; |
2402 | +} |
2403 | + |
2404 | +/* |
2405 | + * Readahead thrashing recovery. |
2406 | + */ |
2407 | +static inline unsigned long |
2408 | +thrashing_recovery_readahead(struct address_space *mapping, |
2409 | + struct file *filp, struct file_ra_state *ra, |
2410 | + pgoff_t index, unsigned long ra_max) |
2411 | +{ |
2412 | + unsigned long ra_size; |
2413 | + |
2414 | + if (readahead_debug_level && find_page(mapping, index - 1)) |
2415 | + ra_account(ra, RA_EVENT_READAHEAD_MUTILATE, |
2416 | + ra->readahead_index - index); |
2417 | + ra_account(ra, RA_EVENT_READAHEAD_THRASHING, |
2418 | + ra->readahead_index - index); |
2419 | + |
2420 | + /* |
2421 | + * Some thrashing occurs in (ra_index, la_index], in which case the |
2422 | + * old read-ahead chunk is lost soon after the new one is allocated. |
2423 | + * Ensure that we recover all needed pages in the old chunk. |
2424 | + */ |
2425 | + if (index < ra->ra_index) |
2426 | + ra_size = ra->ra_index - index; |
2427 | + else { |
2428 | + /* After thrashing, we know the exact thrashing-threshold. */ |
2429 | + ra_size = ra_cache_hit(ra, 0); |
2430 | + |
2431 | + /* And we'd better be a bit conservative. */ |
2432 | + ra_size = ra_size * 3 / 4; |
2433 | + } |
2434 | + |
2435 | + if (ra_size > ra_max) |
2436 | + ra_size = ra_max; |
2437 | + |
2438 | + ra_set_class(ra, RA_CLASS_THRASHING); |
2439 | + ra_set_index(ra, index, index); |
2440 | + ra_set_size(ra, ra_size, ra_size / LOOKAHEAD_RATIO); |
2441 | + |
2442 | + return ra_dispatch(ra, mapping, filp); |
2443 | +} |
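+/*
+ * Illustration with made-up indexes: the thrashed request had
+ * ra_index = 100 and readahead_index = 200.  A miss at index = 90
+ * (below ra_index) re-reads 100 - 90 = 10 pages to recover the tail
+ * of the old chunk; a miss at index = 150 instead sizes the new chunk
+ * from the measured hit count, scaled by 3/4 to be conservative.
+ */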
2444 | + |
2445 | +/* |
2446 | + * If there was a previous sequential read, another sequential read is |
2447 | + * likely to happen at the new position. |
2448 | + * Databases are known to have this seek-and-read-one-block pattern. |
2449 | + */ |
2450 | +static inline int |
2451 | +try_readahead_on_seek(struct file_ra_state *ra, pgoff_t index, |
2452 | + unsigned long ra_size, unsigned long ra_max) |
2453 | +{ |
2454 | + unsigned long hit0 = ra_cache_hit(ra, 0); |
2455 | + unsigned long hit1 = ra_cache_hit(ra, 1) + hit0; |
2456 | + unsigned long hit2 = ra_cache_hit(ra, 2); |
2457 | + unsigned long hit3 = ra_cache_hit(ra, 3); |
2458 | + |
2459 | + /* There's a previous read-ahead request? */ |
2460 | + if (!ra_has_index(ra, ra->prev_page)) |
2461 | + return 0; |
2462 | + |
2463 | + /* The previous read-ahead sequences have similar sizes? */ |
2464 | + if (!(ra_size < hit1 && hit1 > hit2 / 2 && |
2465 | + hit2 > hit3 / 2 && |
2466 | + hit3 > hit1 / 2)) |
2467 | + return 0; |
2468 | + |
2469 | + hit1 = max(hit1, hit2); |
2470 | + |
2471 | + /* Follow the same prefetching direction. */ |
2472 | + if ((ra->flags & RA_CLASS_MASK) == RA_CLASS_BACKWARD) |
2473 | + index = ((index > hit1 - ra_size) ? index - hit1 + ra_size : 0); |
2474 | + |
2475 | + ra_size = min(hit1, ra_max); |
2476 | + |
2477 | + ra_set_class(ra, RA_CLASS_SEEK); |
2478 | + ra_set_index(ra, index, index); |
2479 | + ra_set_size(ra, ra_size, 0); |
2480 | + |
2481 | + return 1; |
2482 | +} |
2483 | + |
2484 | +/* |
2485 | + * ra_min is mainly determined by the size of cache memory. |
2486 | + * Table of concrete numbers for 4KB page size: |
2487 | + * inactive + free (MB): 4 8 16 32 64 128 256 512 1024 |
2488 | + * ra_min (KB): 16 16 16 16 20 24 32 48 64 |
2489 | + */ |
2490 | +static inline void get_readahead_bounds(struct file_ra_state *ra, |
2491 | + unsigned long *ra_min, |
2492 | + unsigned long *ra_max) |
2493 | +{ |
2494 | + unsigned long pages; |
2495 | + |
2496 | + pages = max_sane_readahead(KB(1024*1024)); |
2497 | + *ra_max = min(min(pages, 0xFFFFUL), ra->ra_pages); |
2498 | + *ra_min = min(min(MIN_RA_PAGES + (pages>>13), KB(128)), *ra_max/2); |
2499 | +} |
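+/*
+ * A worked example of the table above, assuming 4KB pages,
+ * MIN_RA_PAGES = 4 and a default 128KB (32 page) ra->ra_pages:
+ * with 512MB inactive + free, max_sane_readahead() yields
+ * pages = 65536, so *ra_max = min(65536, 0xFFFF, 32) = 32 pages
+ * and *ra_min = min(4 + (65536 >> 13), KB(128), 32 / 2) = 12 pages
+ * = 48KB, matching the 512MB column.
+ */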
2500 | + |
2501 | +/** |
2502 | + * page_cache_readahead_adaptive - adaptive read-ahead main function |
2503 | + * @mapping, @ra, @filp: the same as page_cache_readahead() |
2504 | + * @prev_page: the page at @index-1, may be NULL to let the function find it |
2505 | + * @page: the page at @index, or NULL if non-present |
2506 | + * @begin_index, @index, @end_index: offsets into @mapping |
2507 | + * [@begin_index, @end_index) is the read the caller is performing |
2508 | + * @index indicates the page to be read now |
2509 | + * |
2510 | + * page_cache_readahead_adaptive() is the entry point of the adaptive |
2511 | + * read-ahead logic. It tries a set of methods in turn to determine the |
2512 | + * appropriate readahead action and submits the readahead I/O. |
2513 | + * |
2514 | + * The caller is expected to point ra->prev_page to the previously accessed |
2515 | + * page, and to call it on two conditions: |
2516 | + * 1. @page == NULL |
2517 | + * A cache miss happened, some pages have to be read in |
2518 | + * 2. @page != NULL && PageReadahead(@page) |
2519 | + * A look-ahead mark encountered, this is set by a previous read-ahead |
2520 | + * invocation to instruct the caller to give the function a chance to |
2521 | + * check up and do next read-ahead in advance. |
2522 | + */ |
2523 | +unsigned long |
2524 | +page_cache_readahead_adaptive(struct address_space *mapping, |
2525 | + struct file_ra_state *ra, struct file *filp, |
2526 | + struct page *prev_page, struct page *page, |
2527 | + pgoff_t begin_index, pgoff_t index, pgoff_t end_index) |
2528 | +{ |
2529 | + unsigned long size; |
2530 | + unsigned long ra_min; |
2531 | + unsigned long ra_max; |
2532 | + int ret; |
2533 | + |
2534 | + might_sleep(); |
2535 | + |
2536 | + if (page) { |
2537 | + if (!TestClearPageReadahead(page)) |
2538 | + return 0; |
2539 | + if (bdi_read_congested(mapping->backing_dev_info)) { |
2540 | + ra_account(ra, RA_EVENT_IO_CONGESTION, |
2541 | + end_index - index); |
2542 | + return 0; |
2543 | + } |
2544 | + if (laptop_mode && laptop_spinned_down()) { |
2545 | + if (!renew_lookahead(mapping, ra, index, |
2546 | + index + LAPTOP_POLL_INTERVAL)) |
2547 | + return 0; |
2548 | + } |
2549 | + } |
2550 | + |
2551 | + if (page) |
2552 | + ra_account(ra, RA_EVENT_LOOKAHEAD_HIT, |
2553 | + ra->readahead_index - ra->lookahead_index); |
2554 | + else if (index) |
2555 | + ra_account(ra, RA_EVENT_CACHE_MISS, end_index - begin_index); |
2556 | + |
2557 | + size = end_index - index; |
2558 | + get_readahead_bounds(ra, &ra_min, &ra_max); |
2559 | + |
2560 | + /* readahead disabled? */ |
2561 | + if (unlikely(!ra_max || !readahead_ratio)) { |
2562 | + size = max_sane_readahead(size); |
2563 | + goto readit; |
2564 | + } |
2565 | + |
2566 | + /* |
2567 | + * Start of file. |
2568 | + */ |
2569 | + if (index == 0) |
2570 | + return newfile_readahead(mapping, filp, ra, end_index, ra_min); |
2571 | + |
2572 | + /* |
2573 | + * State based sequential read-ahead. |
2574 | + */ |
2575 | + if (!disable_stateful_method && |
2576 | + index == ra->lookahead_index && ra_cache_hit_ok(ra)) |
2577 | + return state_based_readahead(mapping, filp, ra, page, |
2578 | + index, size, ra_max); |
2579 | + |
2580 | + /* |
2581 | + * Recover from possible thrashing. |
2582 | + */ |
2583 | + if (!page && index == ra->prev_page + 1 && ra_has_index(ra, index)) |
2584 | + return thrashing_recovery_readahead(mapping, filp, ra, |
2585 | + index, ra_max); |
2586 | + |
2587 | + /* |
2588 | + * Backward read-ahead. |
2589 | + */ |
2590 | + if (!page && begin_index == index && |
2591 | + try_read_backward(ra, index, size, ra_max)) |
2592 | + return ra_dispatch(ra, mapping, filp); |
2593 | + |
2594 | + /* |
2595 | + * Context based sequential read-ahead. |
2596 | + */ |
2597 | + ret = try_context_based_readahead(mapping, ra, prev_page, page, |
2598 | + index, ra_min, ra_max); |
2599 | + if (ret > 0) |
2600 | + return ra_dispatch(ra, mapping, filp); |
2601 | + if (ret < 0) |
2602 | + return 0; |
2603 | + |
2604 | + /* No action at look-ahead time? */ |
2605 | + if (page) { |
2606 | + ra_account(ra, RA_EVENT_LOOKAHEAD_NOACTION, |
2607 | + ra->readahead_index - index); |
2608 | + return 0; |
2609 | + } |
2610 | + |
2611 | + /* |
2612 | + * Random read that follows a sequential one. |
2613 | + */ |
2614 | + if (try_readahead_on_seek(ra, index, size, ra_max)) |
2615 | + return ra_dispatch(ra, mapping, filp); |
2616 | + |
2617 | + /* |
2618 | + * Random read. |
2619 | + */ |
2620 | + if (size > ra_max) |
2621 | + size = ra_max; |
2622 | + |
2623 | +readit: |
2624 | + size = __do_page_cache_readahead(mapping, filp, index, size, 0); |
2625 | + |
2626 | + ra_account(ra, RA_EVENT_READRANDOM, size); |
2627 | + dprintk("readrandom(ino=%lu, pages=%lu, index=%lu-%lu-%lu) = %lu\n", |
2628 | + mapping->host->i_ino, mapping->nrpages, |
2629 | + begin_index, index, end_index, size); |
2630 | + |
2631 | + return size; |
2632 | +} |
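+/*
+ * A minimal sketch of a call site, assuming a do_generic_mapping_read()
+ * style read loop (the actual wiring lives in the mm/filemap.c hunks of
+ * this patch):
+ *
+ *	page = find_get_page(mapping, index);
+ *	if (!page || PageReadahead(page))
+ *		page_cache_readahead_adaptive(mapping, &filp->f_ra, filp,
+ *					prev_page, page,
+ *					begin_index, index, end_index);
+ *	if (page)
+ *		readahead_cache_hit(&filp->f_ra, page);
+ *
+ * i.e. invoke it on a cache miss or on a look-ahead mark, and report
+ * every access via readahead_cache_hit().
+ */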
2633 | + |
2634 | +/** |
2635 | + * readahead_cache_hit - adaptive read-ahead feedback function |
2636 | + * @ra: file_ra_state which holds the readahead state |
2637 | + * @page: the page just accessed |
2638 | + * |
2639 | + * readahead_cache_hit() is the feedback route of the adaptive read-ahead |
2640 | + * logic. It must be called on every access on the read-ahead pages. |
2641 | + */ |
2642 | +void fastcall readahead_cache_hit(struct file_ra_state *ra, struct page *page) |
2643 | +{ |
2644 | + if (PageActive(page) || PageReferenced(page)) |
2645 | + return; |
2646 | + |
2647 | + if (!PageUptodate(page)) |
2648 | + ra_account(ra, RA_EVENT_IO_BLOCK, 1); |
2649 | + |
2650 | + if (!ra_has_index(ra, page->index)) |
2651 | + return; |
2652 | + |
2653 | + ra->cache_hits++; |
2654 | + |
2655 | + if (page->index >= ra->ra_index) |
2656 | + ra_account(ra, RA_EVENT_READAHEAD_HIT, 1); |
2657 | + else |
2658 | + ra_account(ra, RA_EVENT_READAHEAD_HIT, -1); |
2659 | +} |
2660 | + |
2661 | +#endif /* CONFIG_ADAPTIVE_READAHEAD */ |
2662 | Index: linux-2.6.16-ck1/mm/swap.c |
2663 | =================================================================== |
2664 | --- linux-2.6.16-ck1.orig/mm/swap.c 2006-03-20 20:46:55.000000000 +1100 |
2665 | +++ linux-2.6.16-ck1/mm/swap.c 2006-03-20 20:47:04.000000000 +1100 |
2666 | @@ -128,6 +128,8 @@ void fastcall mark_page_accessed(struct |
2667 | ClearPageReferenced(page); |
2668 | } else if (!PageReferenced(page)) { |
2669 | SetPageReferenced(page); |
2670 | + if (PageLRU(page)) |
2671 | + inc_readahead_aging(); |
2672 | } |
2673 | } |
2674 | |
2675 | Index: linux-2.6.16-ck1/mm/vmscan.c |
2676 | =================================================================== |
2677 | --- linux-2.6.16-ck1.orig/mm/vmscan.c 2006-03-20 20:47:00.000000000 +1100 |
2678 | +++ linux-2.6.16-ck1/mm/vmscan.c 2006-03-20 20:47:04.000000000 +1100 |
2679 | @@ -458,6 +458,9 @@ static int shrink_list(struct list_head |
2680 | if (PageWriteback(page)) |
2681 | goto keep_locked; |
2682 | |
2683 | + if (!PageReferenced(page)) |
2684 | + inc_readahead_aging(); |
2685 | + |
2686 | referenced = page_referenced(page, 1); |
2687 | /* In active use or really unfreeable? Activate it. */ |
2688 | if (referenced && page_mapping_inuse(page)) |