Magellan Linux

Contents of /trunk/kernel26-magellan/patches-2.6.16-r12/0019-2.6.16-mm-swap_prefetch-30.patch



Revision 72
Mon Jun 5 09:25:38 2006 UTC by niro
File size: 28305 byte(s)
ver bump to 2.6.16-r12:
- updated to linux-2.6.16.19
- updated to ck11

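What follows is the swap prefetch patch by Con Kolivas (as carried in the -ck and -mm trees): it adds a CONFIG_SWAP_PREFETCH option, a kprefetchd kernel thread that trickles recently swapped-out pages back into the swap cache while the vm is idle, and a vm.swap_prefetch sysctl (default 1) exposed as /proc/sys/vm/swap_prefetch. As an illustration of that sysctl interface only (not part of the patch), a minimal userspace sketch for reading and optionally toggling it could look like the following, assuming a kernel built with this patch and CONFIG_SWAP_PREFETCH=y:

/*
 * Minimal sketch: read and optionally set /proc/sys/vm/swap_prefetch,
 * the toggle added by this patch. Pass 0 to disable or 1 to enable.
 */
#include <stdio.h>
#include <stdlib.h>

int main(int argc, char **argv)
{
	const char *path = "/proc/sys/vm/swap_prefetch";
	FILE *f;
	int val;

	if (argc > 1) {
		f = fopen(path, "w");
		if (!f) {
			perror(path);
			return 1;
		}
		fprintf(f, "%d\n", atoi(argv[1]));
		fclose(f);
	}

	f = fopen(path, "r");
	if (!f) {
		perror(path);
		return 1;
	}
	if (fscanf(f, "%d", &val) == 1)
		printf("vm.swap_prefetch = %d\n", val);
	fclose(f);
	return 0;
}

The same thing can of course be done from a shell with echo 0 > /proc/sys/vm/swap_prefetch or sysctl -w vm.swap_prefetch=0.
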
1 Documentation/sysctl/vm.txt | 11
2 include/linux/mm_inline.h | 7
3 include/linux/swap-prefetch.h | 55 ++++
4 include/linux/swap.h | 2
5 include/linux/sysctl.h | 1
6 init/Kconfig | 22 +
7 kernel/sysctl.c | 11
8 mm/Makefile | 1
9 mm/swap.c | 44 +++
10 mm/swap_prefetch.c | 574 ++++++++++++++++++++++++++++++++++++++++++
11 mm/swap_state.c | 11
12 mm/vmscan.c | 6
13 12 files changed, 744 insertions(+), 1 deletion(-)
14
15 Index: linux-2.6.16-ck1/Documentation/sysctl/vm.txt
16 ===================================================================
17 --- linux-2.6.16-ck1.orig/Documentation/sysctl/vm.txt 2006-03-20 20:46:24.000000000 +1100
18 +++ linux-2.6.16-ck1/Documentation/sysctl/vm.txt 2006-03-20 20:46:55.000000000 +1100
19 @@ -29,6 +29,7 @@ Currently, these files are in /proc/sys/
20 - drop-caches
21 - zone_reclaim_mode
22 - zone_reclaim_interval
23 +- swap_prefetch
24
25 ==============================================================
26
27 @@ -178,3 +179,13 @@ Time is set in seconds and set by defaul
28 Reduce the interval if undesired off node allocations occur. However, too
29 frequent scans will have a negative impact on off node allocation performance.
30
31 +==============================================================
32 +
33 +swap_prefetch
34 +
35 +This enables or disables the swap prefetching feature. When the virtual
36 +memory subsystem has been extremely idle for at least 5 seconds it will start
37 +copying pages back from swap into the swapcache, keeping a copy in swap. In
38 +practice it can take many minutes before the vm is idle enough.
39 +
40 +The default value is 1.
41 Index: linux-2.6.16-ck1/include/linux/swap.h
42 ===================================================================
43 --- linux-2.6.16-ck1.orig/include/linux/swap.h 2006-03-20 20:46:24.000000000 +1100
44 +++ linux-2.6.16-ck1/include/linux/swap.h 2006-03-20 20:46:55.000000000 +1100
45 @@ -164,6 +164,7 @@ extern unsigned int nr_free_pagecache_pa
46 /* linux/mm/swap.c */
47 extern void FASTCALL(lru_cache_add(struct page *));
48 extern void FASTCALL(lru_cache_add_active(struct page *));
49 +extern void FASTCALL(lru_cache_add_tail(struct page *));
50 extern void FASTCALL(activate_page(struct page *));
51 extern void FASTCALL(mark_page_accessed(struct page *));
52 extern void lru_add_drain(void);
53 @@ -235,6 +236,7 @@ extern void free_pages_and_swap_cache(st
54 extern struct page * lookup_swap_cache(swp_entry_t);
55 extern struct page * read_swap_cache_async(swp_entry_t, struct vm_area_struct *vma,
56 unsigned long addr);
57 +extern int add_to_swap_cache(struct page *page, swp_entry_t entry);
58 /* linux/mm/swapfile.c */
59 extern long total_swap_pages;
60 extern unsigned int nr_swapfiles;
61 Index: linux-2.6.16-ck1/include/linux/sysctl.h
62 ===================================================================
63 --- linux-2.6.16-ck1.orig/include/linux/sysctl.h 2006-03-20 20:46:50.000000000 +1100
64 +++ linux-2.6.16-ck1/include/linux/sysctl.h 2006-03-20 20:46:55.000000000 +1100
65 @@ -189,6 +189,7 @@ enum
66 VM_PERCPU_PAGELIST_FRACTION=30,/* int: fraction of pages in each percpu_pagelist */
67 VM_ZONE_RECLAIM_MODE=31, /* reclaim local zone memory before going off node */
68 VM_ZONE_RECLAIM_INTERVAL=32, /* time period to wait after reclaim failure */
69 + VM_SWAP_PREFETCH=33, /* swap prefetch */
70 };
71
72
73 Index: linux-2.6.16-ck1/init/Kconfig
74 ===================================================================
75 --- linux-2.6.16-ck1.orig/init/Kconfig 2006-03-20 20:46:24.000000000 +1100
76 +++ linux-2.6.16-ck1/init/Kconfig 2006-03-20 20:46:55.000000000 +1100
77 @@ -92,6 +92,28 @@ config SWAP
78 used to provide more virtual memory than the actual RAM present
79 in your computer. If unsure say Y.
80
81 +config SWAP_PREFETCH
82 + bool "Support for prefetching swapped memory"
83 + depends on SWAP
84 + default y
85 + ---help---
86 + This option will allow the kernel to prefetch swapped memory pages
87 + when idle. The pages will be kept on both swap and in swap_cache
88 + thus avoiding the need for further I/O if either ram or swap space
89 + is required.
90 +
91 + What this will do on workstations is slowly bring applications
92 + that have been swapped out after memory intensive workloads back into
93 + physical ram if you have free ram at a later stage and the machine
94 + is relatively idle. This means that when you come back to your
95 + computer after leaving it idle for a while, applications will come
96 + to life faster. Note that your swap usage will appear to increase,
97 + but these are cached pages that can be dropped freely by the vm, and
98 + it should stabilise around a maximum of 50% swap usage.
99 +
100 + Workstations and multiuser workstation servers will most likely want
101 + to say Y.
102 +
103 config SYSVIPC
104 bool "System V IPC"
105 ---help---
106 Index: linux-2.6.16-ck1/kernel/sysctl.c
107 ===================================================================
108 --- linux-2.6.16-ck1.orig/kernel/sysctl.c 2006-03-20 20:46:50.000000000 +1100
109 +++ linux-2.6.16-ck1/kernel/sysctl.c 2006-03-20 20:46:55.000000000 +1100
110 @@ -23,6 +23,7 @@
111 #include <linux/mm.h>
112 #include <linux/swap.h>
113 #include <linux/slab.h>
114 +#include <linux/swap-prefetch.h>
115 #include <linux/sysctl.h>
116 #include <linux/proc_fs.h>
117 #include <linux/capability.h>
118 @@ -942,6 +943,16 @@ static ctl_table vm_table[] = {
119 .strategy = &sysctl_jiffies,
120 },
121 #endif
122 +#ifdef CONFIG_SWAP_PREFETCH
123 + {
124 + .ctl_name = VM_SWAP_PREFETCH,
125 + .procname = "swap_prefetch",
126 + .data = &swap_prefetch,
127 + .maxlen = sizeof(swap_prefetch),
128 + .mode = 0644,
129 + .proc_handler = &proc_dointvec,
130 + },
131 +#endif
132 { .ctl_name = 0 }
133 };
134
135 Index: linux-2.6.16-ck1/mm/Makefile
136 ===================================================================
137 --- linux-2.6.16-ck1.orig/mm/Makefile 2006-03-20 20:46:24.000000000 +1100
138 +++ linux-2.6.16-ck1/mm/Makefile 2006-03-20 20:46:55.000000000 +1100
139 @@ -13,6 +13,7 @@ obj-y := bootmem.o filemap.o mempool.o
140 prio_tree.o util.o $(mmu-y)
141
142 obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o thrash.o
143 +obj-$(CONFIG_SWAP_PREFETCH) += swap_prefetch.o
144 obj-$(CONFIG_HUGETLBFS) += hugetlb.o
145 obj-$(CONFIG_NUMA) += mempolicy.o
146 obj-$(CONFIG_SPARSEMEM) += sparse.o
147 Index: linux-2.6.16-ck1/mm/swap.c
148 ===================================================================
149 --- linux-2.6.16-ck1.orig/mm/swap.c 2006-03-20 20:46:24.000000000 +1100
150 +++ linux-2.6.16-ck1/mm/swap.c 2006-03-20 20:46:55.000000000 +1100
151 @@ -17,6 +17,7 @@
152 #include <linux/sched.h>
153 #include <linux/kernel_stat.h>
154 #include <linux/swap.h>
155 +#include <linux/swap-prefetch.h>
156 #include <linux/mman.h>
157 #include <linux/pagemap.h>
158 #include <linux/pagevec.h>
159 @@ -382,6 +383,46 @@ void __pagevec_lru_add_active(struct pag
160 pagevec_reinit(pvec);
161 }
162
163 +static inline void __pagevec_lru_add_tail(struct pagevec *pvec)
164 +{
165 + int i;
166 + struct zone *zone = NULL;
167 +
168 + for (i = 0; i < pagevec_count(pvec); i++) {
169 + struct page *page = pvec->pages[i];
170 + struct zone *pagezone = page_zone(page);
171 +
172 + if (pagezone != zone) {
173 + if (zone)
174 + spin_unlock_irq(&zone->lru_lock);
175 + zone = pagezone;
176 + spin_lock_irq(&zone->lru_lock);
177 + }
178 + BUG_ON(PageLRU(page));
179 + SetPageLRU(page);
180 + add_page_to_inactive_list_tail(zone, page);
181 + }
182 + if (zone)
183 + spin_unlock_irq(&zone->lru_lock);
184 + release_pages(pvec->pages, pvec->nr, pvec->cold);
185 + pagevec_reinit(pvec);
186 +}
187 +
188 +/*
189 + * Function used uniquely to put pages back to the lru at the end of the
190 + * inactive list to preserve the lru order. Currently only used by swap
191 + * prefetch.
192 + */
193 +void fastcall lru_cache_add_tail(struct page *page)
194 +{
195 + struct pagevec *pvec = &get_cpu_var(lru_add_pvecs);
196 +
197 + page_cache_get(page);
198 + if (!pagevec_add(pvec, page))
199 + __pagevec_lru_add_tail(pvec);
200 + put_cpu_var(lru_add_pvecs);
201 +}
202 +
203 /*
204 * Try to drop buffers from the pages in a pagevec
205 */
206 @@ -536,5 +577,8 @@ void __init swap_setup(void)
207 * Right now other parts of the system means that we
208 * _really_ don't want to cluster much more
209 */
210 +
211 + prepare_swap_prefetch();
212 +
213 hotcpu_notifier(cpu_swap_callback, 0);
214 }
215 Index: linux-2.6.16-ck1/mm/swap_prefetch.c
216 ===================================================================
217 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
218 +++ linux-2.6.16-ck1/mm/swap_prefetch.c 2006-03-20 20:46:55.000000000 +1100
219 @@ -0,0 +1,574 @@
220 +/*
221 + * linux/mm/swap_prefetch.c
222 + *
223 + * Copyright (C) 2005-2006 Con Kolivas
224 + *
225 + * Written by Con Kolivas <kernel@kolivas.org>
226 + *
227 + * This program is free software; you can redistribute it and/or modify
228 + * it under the terms of the GNU General Public License version 2 as
229 + * published by the Free Software Foundation.
230 + */
231 +
232 +#include <linux/fs.h>
233 +#include <linux/mm.h>
234 +#include <linux/swap.h>
235 +#include <linux/swap-prefetch.h>
236 +#include <linux/ioprio.h>
237 +#include <linux/kthread.h>
238 +#include <linux/pagemap.h>
239 +#include <linux/syscalls.h>
240 +#include <linux/writeback.h>
241 +
242 +/*
243 + * Time to delay prefetching if vm is busy or prefetching unsuccessful. There
244 + * needs to be at least this duration of idle time, meaning in practice it can
245 + * be much longer.
246 + */
247 +#define PREFETCH_DELAY (HZ * 5)
248 +
249 +/* sysctl - enable/disable swap prefetching */
250 +int swap_prefetch __read_mostly = 1;
251 +
252 +struct swapped_root {
253 + unsigned long busy; /* vm busy */
254 + spinlock_t lock; /* protects all data */
255 + struct list_head list; /* MRU list of swapped pages */
256 + struct radix_tree_root swap_tree; /* Lookup tree of pages */
257 + unsigned int count; /* Number of entries */
258 + unsigned int maxcount; /* Maximum entries allowed */
259 + kmem_cache_t *cache; /* Of struct swapped_entry */
260 +};
261 +
262 +static struct swapped_root swapped = {
263 + .lock = SPIN_LOCK_UNLOCKED,
264 + .list = LIST_HEAD_INIT(swapped.list),
265 + .swap_tree = RADIX_TREE_INIT(GFP_ATOMIC),
266 +};
267 +
268 +static task_t *kprefetchd_task;
269 +
270 +/*
271 + * We check to see that no part of the vm is busy. If it is, this will interrupt
272 + * trickle_swap and wait another PREFETCH_DELAY. Purposefully racy.
273 + */
274 +inline void delay_swap_prefetch(void)
275 +{
276 + if (!test_bit(0, &swapped.busy))
277 + __set_bit(0, &swapped.busy);
278 +}
279 +
280 +/*
281 + * Drop behind accounting which keeps a list of the most recently used swap
282 + * entries.
283 + */
284 +void add_to_swapped_list(struct page *page)
285 +{
286 + struct swapped_entry *entry;
287 + unsigned long index;
288 + int wakeup;
289 +
290 + if (!swap_prefetch)
291 + return;
292 +
293 + wakeup = 0;
294 +
295 + spin_lock(&swapped.lock);
296 + if (swapped.count >= swapped.maxcount) {
297 + /*
298 + * We limit the number of entries to 2/3 of physical ram.
299 + * Once the number of entries exceeds this we start removing
300 + * the least recently used entries.
301 + */
302 + entry = list_entry(swapped.list.next,
303 + struct swapped_entry, swapped_list);
304 + radix_tree_delete(&swapped.swap_tree, entry->swp_entry.val);
305 + list_del(&entry->swapped_list);
306 + swapped.count--;
307 + } else {
308 + entry = kmem_cache_alloc(swapped.cache, GFP_ATOMIC);
309 + if (unlikely(!entry))
310 + /* bad, can't allocate more mem */
311 + goto out_locked;
312 + }
313 +
314 + index = page_private(page);
315 + entry->swp_entry.val = index;
316 + /*
317 + * On numa we need to store the node id to ensure that we prefetch to
318 + * the same node it came from.
319 + */
320 + store_swap_entry_node(entry, page);
321 +
322 + if (likely(!radix_tree_insert(&swapped.swap_tree, index, entry))) {
323 + /*
324 + * If this is the first entry, kprefetchd needs to be
325 + * (re)started.
326 + */
327 + if (!swapped.count)
328 + wakeup = 1;
329 + list_add(&entry->swapped_list, &swapped.list);
330 + swapped.count++;
331 + }
332 +
333 +out_locked:
334 + spin_unlock(&swapped.lock);
335 +
336 + /* Do the wakeup outside the lock to shorten lock hold time. */
337 + if (wakeup)
338 + wake_up_process(kprefetchd_task);
339 +
340 + return;
341 +}
342 +
343 +/*
344 + * Removes entries from the swapped_list. The radix tree allows us to quickly
345 + * look up the entry from the index without having to iterate over the whole
346 + * list.
347 + */
348 +void remove_from_swapped_list(const unsigned long index)
349 +{
350 + struct swapped_entry *entry;
351 + unsigned long flags;
352 +
353 + if (list_empty(&swapped.list))
354 + return;
355 +
356 + spin_lock_irqsave(&swapped.lock, flags);
357 + entry = radix_tree_delete(&swapped.swap_tree, index);
358 + if (likely(entry)) {
359 + list_del_init(&entry->swapped_list);
360 + swapped.count--;
361 + kmem_cache_free(swapped.cache, entry);
362 + }
363 + spin_unlock_irqrestore(&swapped.lock, flags);
364 +}
365 +
366 +enum trickle_return {
367 + TRICKLE_SUCCESS,
368 + TRICKLE_FAILED,
369 + TRICKLE_DELAY,
370 +};
371 +
372 +struct node_stats {
373 + unsigned long last_free;
374 + /* Free ram after a cycle of prefetching */
375 + unsigned long current_free;
376 + /* Free ram on this cycle of checking prefetch_suitable */
377 + unsigned long prefetch_watermark;
378 + /* Maximum amount we will prefetch to */
379 + unsigned long highfree[MAX_NR_ZONES];
380 + /* The amount of free ram before we start prefetching */
381 + unsigned long lowfree[MAX_NR_ZONES];
382 + /* The amount of free ram where we will stop prefetching */
383 + unsigned long *pointfree[MAX_NR_ZONES];
384 + /* highfree or lowfree depending on whether we've hit a watermark */
385 +};
386 +
387 +/*
388 + * prefetch_stats stores the free ram data of each node and this is used to
389 + * determine if a node is suitable for prefetching into.
390 + */
391 +struct prefetch_stats {
392 + nodemask_t prefetch_nodes;
393 + /* Which nodes are currently suited to prefetching */
394 + unsigned long prefetched_pages;
395 + /* Total pages we've prefetched on this wakeup of kprefetchd */
396 + struct node_stats node[MAX_NUMNODES];
397 +};
398 +
399 +static struct prefetch_stats sp_stat;
400 +
401 +/*
402 + * This tries to read a swp_entry_t into swap cache for swap prefetching.
403 + * If it returns TRICKLE_DELAY we should delay further prefetching.
404 + */
405 +static enum trickle_return trickle_swap_cache_async(const swp_entry_t entry,
406 + const int node)
407 +{
408 + enum trickle_return ret = TRICKLE_FAILED;
409 + struct page *page;
410 +
411 + read_lock_irq(&swapper_space.tree_lock);
412 + /* Entry may already exist */
413 + page = radix_tree_lookup(&swapper_space.page_tree, entry.val);
414 + read_unlock_irq(&swapper_space.tree_lock);
415 + if (page) {
416 + remove_from_swapped_list(entry.val);
417 + goto out;
418 + }
419 +
420 + /*
421 + * Get a new page to read from swap. We have already checked the
422 + * watermarks so __alloc_pages will not call on reclaim.
423 + */
424 + page = alloc_pages_node(node, GFP_HIGHUSER & ~__GFP_WAIT, 0);
425 + if (unlikely(!page)) {
426 + ret = TRICKLE_DELAY;
427 + goto out;
428 + }
429 +
430 + if (add_to_swap_cache(page, entry)) {
431 + /* Failed to add to swap cache */
432 + goto out_release;
433 + }
434 +
435 + /* Add them to the tail of the inactive list to preserve LRU order */
436 + lru_cache_add_tail(page);
437 + if (unlikely(swap_readpage(NULL, page))) {
438 + ret = TRICKLE_DELAY;
439 + goto out_release;
440 + }
441 +
442 + sp_stat.prefetched_pages++;
443 + sp_stat.node[node].last_free--;
444 +
445 + ret = TRICKLE_SUCCESS;
446 +out_release:
447 + page_cache_release(page);
448 +out:
449 + return ret;
450 +}
451 +
452 +static void clear_last_prefetch_free(void)
453 +{
454 + int node;
455 +
456 + /*
457 + * Reset the nodes suitable for prefetching to all nodes. We could
458 + * update the data to take into account memory hotplug if desired.
459 + */
460 + sp_stat.prefetch_nodes = node_online_map;
461 + for_each_node_mask(node, sp_stat.prefetch_nodes) {
462 + struct node_stats *ns = &sp_stat.node[node];
463 +
464 + ns->last_free = 0;
465 + }
466 +}
467 +
468 +static void clear_current_prefetch_free(void)
469 +{
470 + int node;
471 +
472 + sp_stat.prefetch_nodes = node_online_map;
473 + for_each_node_mask(node, sp_stat.prefetch_nodes) {
474 + struct node_stats *ns = &sp_stat.node[node];
475 +
476 + ns->current_free = 0;
477 + }
478 +}
479 +
480 +/*
481 + * This updates the high and low watermarks of amount of free ram in each
482 + * node used to start and stop prefetching. We prefetch from pages_high * 4
483 + * down to pages_high * 3.
484 + */
485 +static void examine_free_limits(void)
486 +{
487 + struct zone *z;
488 +
489 + for_each_zone(z) {
490 + struct node_stats *ns;
491 + int idx;
492 +
493 + if (!populated_zone(z))
494 + continue;
495 +
496 + ns = &sp_stat.node[z->zone_pgdat->node_id];
497 + idx = zone_idx(z);
498 + ns->lowfree[idx] = z->pages_high * 3 + z->lowmem_reserve[idx];
499 + ns->highfree[idx] = ns->lowfree[idx] + z->pages_high;
500 +
501 + if (z->free_pages > ns->highfree[idx]) {
502 + /*
503 + * We've gotten above the high watermark of free pages
504 + * so we can start prefetching till we get to the low
505 + * watermark.
506 + */
507 + ns->pointfree[idx] = &ns->lowfree[idx];
508 + }
509 + }
510 +}
511 +
512 +/*
513 + * We want to be absolutely certain it's ok to start prefetching.
514 + */
515 +static int prefetch_suitable(void)
516 +{
517 + unsigned long limit;
518 + struct zone *z;
519 + int node, ret = 0, test_pagestate = 0;
520 +
521 + /* Purposefully racy */
522 + if (test_bit(0, &swapped.busy)) {
523 + __clear_bit(0, &swapped.busy);
524 + goto out;
525 + }
526 +
527 + /*
528 + * get_page_state and above_background_load are expensive so we only
529 + * perform them every SWAP_CLUSTER_MAX prefetched_pages.
530 + * We test to see if we're above_background_load as disk activity
531 + * even at low priority can cause interrupt induced scheduling
532 + * latencies.
533 + */
534 + if (!(sp_stat.prefetched_pages % SWAP_CLUSTER_MAX)) {
535 + if (above_background_load())
536 + goto out;
537 + test_pagestate = 1;
538 + }
539 +
540 + clear_current_prefetch_free();
541 +
542 + /*
543 + * Have some hysteresis between where page reclaiming and prefetching
544 + * will occur to prevent ping-ponging between them.
545 + */
546 + for_each_zone(z) {
547 + struct node_stats *ns;
548 + unsigned long free;
549 + int idx;
550 +
551 + if (!populated_zone(z))
552 + continue;
553 +
554 + node = z->zone_pgdat->node_id;
555 + ns = &sp_stat.node[node];
556 + idx = zone_idx(z);
557 +
558 + free = z->free_pages;
559 + if (free < *ns->pointfree[idx]) {
560 + /*
561 + * Free pages have dropped below the low watermark so
562 + * we won't start prefetching again till we hit the
563 + * high watermark of free pages.
564 + */
565 + ns->pointfree[idx] = &ns->highfree[idx];
566 + node_clear(node, sp_stat.prefetch_nodes);
567 + continue;
568 + }
569 + ns->current_free += free;
570 + }
571 +
572 + /*
573 + * We iterate over each node testing to see if it is suitable for
574 + * prefetching and clear the nodemask if it is not.
575 + */
576 + for_each_node_mask(node, sp_stat.prefetch_nodes) {
577 + struct node_stats *ns = &sp_stat.node[node];
578 + struct page_state ps;
579 +
580 + /*
581 + * We check to see that pages are not being allocated
582 + * elsewhere at any significant rate, implying any
583 + * degree of memory pressure (e.g. during file reads).
584 + */
585 + if (ns->last_free) {
586 + if (ns->current_free + SWAP_CLUSTER_MAX <
587 + ns->last_free) {
588 + ns->last_free = ns->current_free;
589 + node_clear(node,
590 + sp_stat.prefetch_nodes);
591 + continue;
592 + }
593 + } else
594 + ns->last_free = ns->current_free;
595 +
596 + if (!test_pagestate)
597 + continue;
598 +
599 + get_page_state_node(&ps, node);
600 +
601 + /* We shouldn't prefetch when we are doing writeback */
602 + if (ps.nr_writeback) {
603 + node_clear(node, sp_stat.prefetch_nodes);
604 + continue;
605 + }
606 +
607 + /*
608 + * If >2/3 of the ram on this node is mapped, slab, swapcache or
609 + * dirty, we need to leave some free for pagecache.
610 + * Note that currently nr_slab is inaccurate on numa because
611 + * nr_slab is incremented on the node doing the accounting
612 + * even if the slab is being allocated on a remote node. This
613 + * would be expensive to fix and not of great significance.
614 + */
615 + limit = ps.nr_mapped + ps.nr_slab + ps.nr_dirty +
616 + ps.nr_unstable + total_swapcache_pages;
617 + if (limit > ns->prefetch_watermark) {
618 + node_clear(node, sp_stat.prefetch_nodes);
619 + continue;
620 + }
621 + }
622 +
623 + if (nodes_empty(sp_stat.prefetch_nodes))
624 + goto out;
625 +
626 + /* Survived all that? Hooray we can prefetch! */
627 + ret = 1;
628 +out:
629 + return ret;
630 +}
631 +
632 +/*
633 + * Get previous swapped entry when iterating over all entries. swapped.lock
634 + * should be held and we should already ensure that entry exists.
635 + */
636 +static inline struct swapped_entry *prev_swapped_entry
637 + (struct swapped_entry *entry)
638 +{
639 + return list_entry(entry->swapped_list.prev->prev,
640 + struct swapped_entry, swapped_list);
641 +}
642 +
643 +/*
644 + * trickle_swap is the main function that initiates the swap prefetching. It
645 + * first checks to see if the busy flag is set, and does not prefetch if it
646 + * is, as the flag implies we are low on memory or currently swapping in.
647 + * Otherwise it runs until prefetch_suitable fails, which occurs when the
648 + * vm is busy, we prefetch to the watermark, the list is empty, or we have
649 + * iterated over all entries.
650 + */
651 +static enum trickle_return trickle_swap(void)
652 +{
653 + enum trickle_return ret = TRICKLE_DELAY;
654 + struct swapped_entry *entry;
655 +
656 + /*
657 + * If laptop_mode is enabled, don't prefetch, to avoid hard drives
658 + * doing unnecessary spin-ups.
659 + */
660 + if (!swap_prefetch || laptop_mode)
661 + return ret;
662 +
663 + examine_free_limits();
664 + entry = NULL;
665 +
666 + for ( ; ; ) {
667 + swp_entry_t swp_entry;
668 + int node;
669 +
670 + if (!prefetch_suitable())
671 + break;
672 +
673 + spin_lock(&swapped.lock);
674 + if (list_empty(&swapped.list)) {
675 + ret = TRICKLE_FAILED;
676 + spin_unlock(&swapped.lock);
677 + break;
678 + }
679 +
680 + if (!entry) {
681 + /*
682 + * This sets the entry for the first iteration. It
683 + * also is a safeguard against the entry disappearing
684 + * while the lock is not held.
685 + */
686 + entry = list_entry(swapped.list.prev,
687 + struct swapped_entry, swapped_list);
688 + } else if (entry->swapped_list.prev == swapped.list.next) {
689 + /*
690 + * If we have iterated over all entries and there are
691 + * still entries that weren't swapped out, there may
692 + * be a reason we could not swap them back in, so
693 + * delay attempting further prefetching.
694 + */
695 + spin_unlock(&swapped.lock);
696 + break;
697 + }
698 +
699 + node = get_swap_entry_node(entry);
700 + if (!node_isset(node, sp_stat.prefetch_nodes)) {
701 + /*
702 + * We found an entry that belongs to a node that is
703 + * not suitable for prefetching so skip it.
704 + */
705 + entry = prev_swapped_entry(entry);
706 + spin_unlock(&swapped.lock);
707 + continue;
708 + }
709 + swp_entry = entry->swp_entry;
710 + entry = prev_swapped_entry(entry);
711 + spin_unlock(&swapped.lock);
712 +
713 + if (trickle_swap_cache_async(swp_entry, node) == TRICKLE_DELAY)
714 + break;
715 + }
716 +
717 + if (sp_stat.prefetched_pages) {
718 + lru_add_drain();
719 + sp_stat.prefetched_pages = 0;
720 + }
721 + return ret;
722 +}
723 +
724 +static int kprefetchd(void *__unused)
725 +{
726 + set_user_nice(current, 19);
727 + /* Set ioprio to lowest if supported by i/o scheduler */
728 + sys_ioprio_set(IOPRIO_WHO_PROCESS, 0, IOPRIO_CLASS_IDLE);
729 +
730 + do {
731 + try_to_freeze();
732 +
733 + /*
734 + * TRICKLE_FAILED implies no entries left - we do not schedule
735 + * a wakeup, and further delay the next one.
736 + */
737 + if (trickle_swap() == TRICKLE_FAILED) {
738 + set_current_state(TASK_INTERRUPTIBLE);
739 + schedule();
740 + }
741 + clear_last_prefetch_free();
742 + schedule_timeout_interruptible(PREFETCH_DELAY);
743 + } while (!kthread_should_stop());
744 +
745 + return 0;
746 +}
747 +
748 +/*
749 + * Create kmem cache for swapped entries
750 + */
751 +void __init prepare_swap_prefetch(void)
752 +{
753 + struct zone *zone;
754 +
755 + swapped.cache = kmem_cache_create("swapped_entry",
756 + sizeof(struct swapped_entry), 0, SLAB_PANIC, NULL, NULL);
757 +
758 + /*
759 + * Set max number of entries to 2/3 the size of physical ram as we
760 + * only ever prefetch to consume 2/3 of the ram.
761 + */
762 + swapped.maxcount = nr_free_pagecache_pages() / 3 * 2;
763 +
764 + for_each_zone(zone) {
765 + unsigned long present;
766 + struct node_stats *ns;
767 + int idx;
768 +
769 + present = zone->present_pages;
770 + if (!present)
771 + continue;
772 +
773 + ns = &sp_stat.node[zone->zone_pgdat->node_id];
774 + ns->prefetch_watermark += present / 3 * 2;
775 + idx = zone_idx(zone);
776 + ns->pointfree[idx] = &ns->highfree[idx];
777 + }
778 +}
779 +
780 +static int __init kprefetchd_init(void)
781 +{
782 + kprefetchd_task = kthread_run(kprefetchd, NULL, "kprefetchd");
783 +
784 + return 0;
785 +}
786 +
787 +static void __exit kprefetchd_exit(void)
788 +{
789 + kthread_stop(kprefetchd_task);
790 +}
791 +
792 +module_init(kprefetchd_init);
793 +module_exit(kprefetchd_exit);
794 Index: linux-2.6.16-ck1/mm/swap_state.c
795 ===================================================================
796 --- linux-2.6.16-ck1.orig/mm/swap_state.c 2006-03-20 20:46:24.000000000 +1100
797 +++ linux-2.6.16-ck1/mm/swap_state.c 2006-03-20 20:46:55.000000000 +1100
798 @@ -10,6 +10,7 @@
799 #include <linux/mm.h>
800 #include <linux/kernel_stat.h>
801 #include <linux/swap.h>
802 +#include <linux/swap-prefetch.h>
803 #include <linux/init.h>
804 #include <linux/pagemap.h>
805 #include <linux/buffer_head.h>
806 @@ -81,6 +82,7 @@ static int __add_to_swap_cache(struct pa
807 error = radix_tree_insert(&swapper_space.page_tree,
808 entry.val, page);
809 if (!error) {
810 + remove_from_swapped_list(entry.val);
811 page_cache_get(page);
812 SetPageLocked(page);
813 SetPageSwapCache(page);
814 @@ -94,11 +96,12 @@ static int __add_to_swap_cache(struct pa
815 return error;
816 }
817
818 -static int add_to_swap_cache(struct page *page, swp_entry_t entry)
819 +int add_to_swap_cache(struct page *page, swp_entry_t entry)
820 {
821 int error;
822
823 if (!swap_duplicate(entry)) {
824 + remove_from_swapped_list(entry.val);
825 INC_CACHE_INFO(noent_race);
826 return -ENOENT;
827 }
828 @@ -147,6 +150,9 @@ int add_to_swap(struct page * page, gfp_
829 swp_entry_t entry;
830 int err;
831
832 + /* Swap prefetching is delayed if we're swapping pages */
833 + delay_swap_prefetch();
834 +
835 if (!PageLocked(page))
836 BUG();
837
838 @@ -320,6 +326,9 @@ struct page *read_swap_cache_async(swp_e
839 struct page *found_page, *new_page = NULL;
840 int err;
841
842 + /* Swap prefetching is delayed if we're already reading from swap */
843 + delay_swap_prefetch();
844 +
845 do {
846 /*
847 * First check the swap cache. Since this is normally
848 Index: linux-2.6.16-ck1/mm/vmscan.c
849 ===================================================================
850 --- linux-2.6.16-ck1.orig/mm/vmscan.c 2006-03-20 20:46:24.000000000 +1100
851 +++ linux-2.6.16-ck1/mm/vmscan.c 2006-03-20 20:46:55.000000000 +1100
852 @@ -16,6 +16,7 @@
853 #include <linux/slab.h>
854 #include <linux/kernel_stat.h>
855 #include <linux/swap.h>
856 +#include <linux/swap-prefetch.h>
857 #include <linux/pagemap.h>
858 #include <linux/init.h>
859 #include <linux/highmem.h>
860 @@ -396,6 +397,7 @@ static int remove_mapping(struct address
861
862 if (PageSwapCache(page)) {
863 swp_entry_t swap = { .val = page_private(page) };
864 + add_to_swapped_list(page);
865 __delete_from_swap_cache(page);
866 write_unlock_irq(&mapping->tree_lock);
867 swap_free(swap);
868 @@ -1442,6 +1444,8 @@ int try_to_free_pages(struct zone **zone
869 sc.may_writepage = !laptop_mode;
870 sc.may_swap = 1;
871
872 + delay_swap_prefetch();
873 +
874 inc_page_state(allocstall);
875
876 for (i = 0; zones[i] != NULL; i++) {
877 @@ -1788,6 +1792,8 @@ int shrink_all_memory(int nr_pages)
878 .reclaimed_slab = 0,
879 };
880
881 + delay_swap_prefetch();
882 +
883 current->reclaim_state = &reclaim_state;
884 for_each_pgdat(pgdat) {
885 int freed;
886 Index: linux-2.6.16-ck1/include/linux/mm_inline.h
887 ===================================================================
888 --- linux-2.6.16-ck1.orig/include/linux/mm_inline.h 2006-03-20 20:46:24.000000000 +1100
889 +++ linux-2.6.16-ck1/include/linux/mm_inline.h 2006-03-20 20:46:55.000000000 +1100
890 @@ -14,6 +14,13 @@ add_page_to_inactive_list(struct zone *z
891 }
892
893 static inline void
894 +add_page_to_inactive_list_tail(struct zone *zone, struct page *page)
895 +{
896 + list_add_tail(&page->lru, &zone->inactive_list);
897 + zone->nr_inactive++;
898 +}
899 +
900 +static inline void
901 del_page_from_active_list(struct zone *zone, struct page *page)
902 {
903 list_del(&page->lru);
904 Index: linux-2.6.16-ck1/include/linux/swap-prefetch.h
905 ===================================================================
906 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
907 +++ linux-2.6.16-ck1/include/linux/swap-prefetch.h 2006-03-20 20:46:55.000000000 +1100
908 @@ -0,0 +1,55 @@
909 +#ifndef SWAP_PREFETCH_H_INCLUDED
910 +#define SWAP_PREFETCH_H_INCLUDED
911 +
912 +#ifdef CONFIG_SWAP_PREFETCH
913 +/* mm/swap_prefetch.c */
914 +extern int swap_prefetch;
915 +struct swapped_entry {
916 + swp_entry_t swp_entry; /* The actual swap entry */
917 + struct list_head swapped_list; /* Linked list of entries */
918 +#if MAX_NUMNODES > 1
919 + int node; /* Node id */
920 +#endif
921 +} __attribute__((packed));
922 +
923 +static inline void store_swap_entry_node(struct swapped_entry *entry,
924 + struct page *page)
925 +{
926 +#if MAX_NUMNODES > 1
927 + entry->node = page_to_nid(page);
928 +#endif
929 +}
930 +
931 +static inline int get_swap_entry_node(struct swapped_entry *entry)
932 +{
933 +#if MAX_NUMNODES > 1
934 + return entry->node;
935 +#else
936 + return 0;
937 +#endif
938 +}
939 +
940 +extern void add_to_swapped_list(struct page *page);
941 +extern void remove_from_swapped_list(const unsigned long index);
942 +extern void delay_swap_prefetch(void);
943 +extern void prepare_swap_prefetch(void);
944 +
945 +#else /* CONFIG_SWAP_PREFETCH */
946 +static inline void add_to_swapped_list(struct page *__unused)
947 +{
948 +}
949 +
950 +static inline void prepare_swap_prefetch(void)
951 +{
952 +}
953 +
954 +static inline void remove_from_swapped_list(const unsigned long __unused)
955 +{
956 +}
957 +
958 +static inline void delay_swap_prefetch(void)
959 +{
960 +}
961 +#endif /* CONFIG_SWAP_PREFETCH */
962 +
963 +#endif /* SWAP_PREFETCH_H_INCLUDED */
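
One way to observe the feature at work (again an illustration, not part of the patch): with swap prefetch enabled and the machine left idle, kprefetchd shows up in the process list and the SwapCached figure in /proc/meminfo grows as pages are trickled back into the swap cache. A small polling sketch, assuming the usual /proc/meminfo layout:

/*
 * Poll SwapCached in /proc/meminfo to watch kprefetchd trickle pages
 * back into the swap cache while the machine is idle.
 */
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	char line[128];

	for (;;) {
		FILE *f = fopen("/proc/meminfo", "r");

		if (!f) {
			perror("/proc/meminfo");
			return 1;
		}
		while (fgets(line, sizeof(line), f)) {
			if (!strncmp(line, "SwapCached:", 11)) {
				fputs(line, stdout);
				break;
			}
		}
		fclose(f);
		sleep(5);	/* PREFETCH_DELAY in the patch is HZ * 5 */
	}
	return 0;
}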