Magellan Linux

Annotation of /trunk/kernel26-magellan/patches-2.6.16-r12/0019-2.6.16-mm-swap_prefetch-30.patch



Revision 72
Mon Jun 5 09:25:38 2006 UTC by niro
File size: 28305 byte(s)
ver bump to 2.6.16-r12:
- updated to linux-2.6.16.19
- updated to ck11

 Documentation/sysctl/vm.txt   |   11
 include/linux/mm_inline.h     |    7
 include/linux/swap-prefetch.h |   55 ++++
 include/linux/swap.h          |    2
 include/linux/sysctl.h        |    1
 init/Kconfig                  |   22 +
 kernel/sysctl.c               |   11
 mm/Makefile                   |    1
 mm/swap.c                     |   44 +++
 mm/swap_prefetch.c            |  574 ++++++++++++++++++++++++++++++++++++++++++
 mm/swap_state.c               |   11
 mm/vmscan.c                   |    6
 12 files changed, 744 insertions(+), 1 deletion(-)

Index: linux-2.6.16-ck1/Documentation/sysctl/vm.txt
===================================================================
--- linux-2.6.16-ck1.orig/Documentation/sysctl/vm.txt 2006-03-20 20:46:24.000000000 +1100
+++ linux-2.6.16-ck1/Documentation/sysctl/vm.txt 2006-03-20 20:46:55.000000000 +1100
@@ -29,6 +29,7 @@ Currently, these files are in /proc/sys/
 - drop-caches
 - zone_reclaim_mode
 - zone_reclaim_interval
+- swap_prefetch

 ==============================================================

@@ -178,3 +179,13 @@ Time is set in seconds and set by defaul
 Reduce the interval if undesired off node allocations occur. However, too
 frequent scans will have a negative impact onoff node allocation performance.

+==============================================================
+
+swap_prefetch
+
+This enables or disables the swap prefetching feature. When the virtual
+memory subsystem has been extremely idle for at least 5 seconds it will start
+copying back pages from swap into the swapcache and keep a copy in swap. In
+practice it can take many minutes before the vm is idle enough.
+
+The default value is 1.
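The tunable documented above appears as /proc/sys/vm/swap_prefetch once the kernel is built with CONFIG_SWAP_PREFETCH (the procname comes from the kernel/sysctl.c hunk further down). Purely as an illustration, and not part of the patch itself, a minimal userspace toggle could look like the sketch below; the file name toggle_swap_prefetch.c and the bare-bones error handling are editorial choices.

/* toggle_swap_prefetch.c - minimal sketch, not part of the patch.
 * Reads the current vm.swap_prefetch value and optionally writes a new one.
 * Assumes procfs is mounted at /proc and the kernel was built with
 * CONFIG_SWAP_PREFETCH so /proc/sys/vm/swap_prefetch exists.
 */
#include <stdio.h>
#include <stdlib.h>

int main(int argc, char **argv)
{
	const char *path = "/proc/sys/vm/swap_prefetch";
	FILE *f = fopen(path, "r");
	int val;

	if (!f || fscanf(f, "%d", &val) != 1) {
		perror(path);
		return 1;
	}
	fclose(f);
	printf("current vm.swap_prefetch = %d\n", val);

	if (argc > 1) {		/* e.g. ./toggle_swap_prefetch 0 */
		f = fopen(path, "w");
		if (!f || fprintf(f, "%d\n", atoi(argv[1])) < 0) {
			perror(path);
			return 1;
		}
		fclose(f);
	}
	return 0;
}

Run with no argument it only prints the current value; passing 0 or 1 writes that value back (writing the sysctl normally requires root).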
Index: linux-2.6.16-ck1/include/linux/swap.h
===================================================================
--- linux-2.6.16-ck1.orig/include/linux/swap.h 2006-03-20 20:46:24.000000000 +1100
+++ linux-2.6.16-ck1/include/linux/swap.h 2006-03-20 20:46:55.000000000 +1100
@@ -164,6 +164,7 @@ extern unsigned int nr_free_pagecache_pa
 /* linux/mm/swap.c */
 extern void FASTCALL(lru_cache_add(struct page *));
 extern void FASTCALL(lru_cache_add_active(struct page *));
+extern void FASTCALL(lru_cache_add_tail(struct page *));
 extern void FASTCALL(activate_page(struct page *));
 extern void FASTCALL(mark_page_accessed(struct page *));
 extern void lru_add_drain(void);
@@ -235,6 +236,7 @@ extern void free_pages_and_swap_cache(st
 extern struct page * lookup_swap_cache(swp_entry_t);
 extern struct page * read_swap_cache_async(swp_entry_t, struct vm_area_struct *vma,
 unsigned long addr);
+extern int add_to_swap_cache(struct page *page, swp_entry_t entry);
 /* linux/mm/swapfile.c */
 extern long total_swap_pages;
 extern unsigned int nr_swapfiles;
Index: linux-2.6.16-ck1/include/linux/sysctl.h
===================================================================
--- linux-2.6.16-ck1.orig/include/linux/sysctl.h 2006-03-20 20:46:50.000000000 +1100
+++ linux-2.6.16-ck1/include/linux/sysctl.h 2006-03-20 20:46:55.000000000 +1100
@@ -189,6 +189,7 @@ enum
 VM_PERCPU_PAGELIST_FRACTION=30,/* int: fraction of pages in each percpu_pagelist */
 VM_ZONE_RECLAIM_MODE=31, /* reclaim local zone memory before going off node */
 VM_ZONE_RECLAIM_INTERVAL=32, /* time period to wait after reclaim failure */
+ VM_SWAP_PREFETCH=33, /* swap prefetch */
 };


Index: linux-2.6.16-ck1/init/Kconfig
===================================================================
--- linux-2.6.16-ck1.orig/init/Kconfig 2006-03-20 20:46:24.000000000 +1100
+++ linux-2.6.16-ck1/init/Kconfig 2006-03-20 20:46:55.000000000 +1100
@@ -92,6 +92,28 @@ config SWAP
 used to provide more virtual memory than the actual RAM present
 in your computer. If unsure say Y.

+config SWAP_PREFETCH
+ bool "Support for prefetching swapped memory"
+ depends on SWAP
+ default y
+ ---help---
+ This option will allow the kernel to prefetch swapped memory pages
+ when idle. The pages will be kept on both swap and in swap_cache
+ thus avoiding the need for further I/O if either ram or swap space
+ is required.
+
+ What this will do on workstations is slowly bring back applications
+ that have swapped out after memory intensive workloads back into
+ physical ram if you have free ram at a later stage and the machine
+ is relatively idle. This means that when you come back to your
+ computer after leaving it idle for a while, applications will come
+ to life faster. Note that your swap usage will appear to increase
+ but these are cached pages, can be dropped freely by the vm, and it
+ should stabilise around 50% swap usage maximum.
+
+ Workstations and multiuser workstation servers will most likely want
+ to say Y.
+
 config SYSVIPC
 bool "System V IPC"
 ---help---
Index: linux-2.6.16-ck1/kernel/sysctl.c
===================================================================
--- linux-2.6.16-ck1.orig/kernel/sysctl.c 2006-03-20 20:46:50.000000000 +1100
+++ linux-2.6.16-ck1/kernel/sysctl.c 2006-03-20 20:46:55.000000000 +1100
@@ -23,6 +23,7 @@
 #include <linux/mm.h>
 #include <linux/swap.h>
 #include <linux/slab.h>
+#include <linux/swap-prefetch.h>
 #include <linux/sysctl.h>
 #include <linux/proc_fs.h>
 #include <linux/capability.h>
@@ -942,6 +943,16 @@ static ctl_table vm_table[] = {
 .strategy = &sysctl_jiffies,
 },
 #endif
+#ifdef CONFIG_SWAP_PREFETCH
+ {
+ .ctl_name = VM_SWAP_PREFETCH,
+ .procname = "swap_prefetch",
+ .data = &swap_prefetch,
+ .maxlen = sizeof(swap_prefetch),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+#endif
 { .ctl_name = 0 }
 };

Index: linux-2.6.16-ck1/mm/Makefile
===================================================================
--- linux-2.6.16-ck1.orig/mm/Makefile 2006-03-20 20:46:24.000000000 +1100
+++ linux-2.6.16-ck1/mm/Makefile 2006-03-20 20:46:55.000000000 +1100
@@ -13,6 +13,7 @@ obj-y := bootmem.o filemap.o mempool.o
 prio_tree.o util.o $(mmu-y)

 obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o thrash.o
+obj-$(CONFIG_SWAP_PREFETCH) += swap_prefetch.o
 obj-$(CONFIG_HUGETLBFS) += hugetlb.o
 obj-$(CONFIG_NUMA) += mempolicy.o
 obj-$(CONFIG_SPARSEMEM) += sparse.o
Index: linux-2.6.16-ck1/mm/swap.c
===================================================================
--- linux-2.6.16-ck1.orig/mm/swap.c 2006-03-20 20:46:24.000000000 +1100
+++ linux-2.6.16-ck1/mm/swap.c 2006-03-20 20:46:55.000000000 +1100
@@ -17,6 +17,7 @@
 #include <linux/sched.h>
 #include <linux/kernel_stat.h>
 #include <linux/swap.h>
+#include <linux/swap-prefetch.h>
 #include <linux/mman.h>
 #include <linux/pagemap.h>
 #include <linux/pagevec.h>
@@ -382,6 +383,46 @@ void __pagevec_lru_add_active(struct pag
 pagevec_reinit(pvec);
 }

+static inline void __pagevec_lru_add_tail(struct pagevec *pvec)
+{
+ int i;
+ struct zone *zone = NULL;
+
+ for (i = 0; i < pagevec_count(pvec); i++) {
+ struct page *page = pvec->pages[i];
+ struct zone *pagezone = page_zone(page);
+
+ if (pagezone != zone) {
+ if (zone)
+ spin_unlock_irq(&zone->lru_lock);
+ zone = pagezone;
+ spin_lock_irq(&zone->lru_lock);
+ }
+ BUG_ON(PageLRU(page));
+ SetPageLRU(page);
+ add_page_to_inactive_list_tail(zone, page);
+ }
+ if (zone)
+ spin_unlock_irq(&zone->lru_lock);
+ release_pages(pvec->pages, pvec->nr, pvec->cold);
+ pagevec_reinit(pvec);
+}
+
+/*
+ * Function used uniquely to put pages back to the lru at the end of the
+ * inactive list to preserve the lru order. Currently only used by swap
+ * prefetch.
+ */
+void fastcall lru_cache_add_tail(struct page *page)
+{
+ struct pagevec *pvec = &get_cpu_var(lru_add_pvecs);
+
+ page_cache_get(page);
+ if (!pagevec_add(pvec, page))
+ __pagevec_lru_add_tail(pvec);
+ put_cpu_var(lru_add_pvecs);
+}
+
 /*
 * Try to drop buffers from the pages in a pagevec
 */
@@ -536,5 +577,8 @@ void __init swap_setup(void)
 * Right now other parts of the system means that we
 * _really_ don't want to cluster much more
 */
+
+ prepare_swap_prefetch();
+
 hotcpu_notifier(cpu_swap_callback, 0);
 }
Index: linux-2.6.16-ck1/mm/swap_prefetch.c
===================================================================
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6.16-ck1/mm/swap_prefetch.c 2006-03-20 20:46:55.000000000 +1100
@@ -0,0 +1,574 @@
+/*
+ * linux/mm/swap_prefetch.c
+ *
+ * Copyright (C) 2005-2006 Con Kolivas
+ *
+ * Written by Con Kolivas <kernel@kolivas.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/swap.h>
+#include <linux/swap-prefetch.h>
+#include <linux/ioprio.h>
+#include <linux/kthread.h>
+#include <linux/pagemap.h>
+#include <linux/syscalls.h>
+#include <linux/writeback.h>
+
+/*
+ * Time to delay prefetching if vm is busy or prefetching unsuccessful. There
+ * needs to be at least this duration of idle time meaning in practice it can
+ * be much longer
+ */
+#define PREFETCH_DELAY (HZ * 5)
+
+/* sysctl - enable/disable swap prefetching */
+int swap_prefetch __read_mostly = 1;
+
+struct swapped_root {
+ unsigned long busy; /* vm busy */
+ spinlock_t lock; /* protects all data */
+ struct list_head list; /* MRU list of swapped pages */
+ struct radix_tree_root swap_tree; /* Lookup tree of pages */
+ unsigned int count; /* Number of entries */
+ unsigned int maxcount; /* Maximum entries allowed */
+ kmem_cache_t *cache; /* Of struct swapped_entry */
+};
+
+static struct swapped_root swapped = {
+ .lock = SPIN_LOCK_UNLOCKED,
+ .list = LIST_HEAD_INIT(swapped.list),
+ .swap_tree = RADIX_TREE_INIT(GFP_ATOMIC),
+};
+
+static task_t *kprefetchd_task;
+
+/*
+ * We check to see no part of the vm is busy. If it is this will interrupt
+ * trickle_swap and wait another PREFETCH_DELAY. Purposefully racy.
+ */
+inline void delay_swap_prefetch(void)
+{
+ if (!test_bit(0, &swapped.busy))
+ __set_bit(0, &swapped.busy);
+}
+
+/*
+ * Drop behind accounting which keeps a list of the most recently used swap
+ * entries.
+ */
+void add_to_swapped_list(struct page *page)
+{
+ struct swapped_entry *entry;
+ unsigned long index;
+ int wakeup;
+
+ if (!swap_prefetch)
+ return;
+
+ wakeup = 0;
+
+ spin_lock(&swapped.lock);
+ if (swapped.count >= swapped.maxcount) {
+ /*
+ * We limit the number of entries to 2/3 of physical ram.
+ * Once the number of entries exceeds this we start removing
+ * the least recently used entries.
+ */
+ entry = list_entry(swapped.list.next,
+ struct swapped_entry, swapped_list);
+ radix_tree_delete(&swapped.swap_tree, entry->swp_entry.val);
+ list_del(&entry->swapped_list);
+ swapped.count--;
+ } else {
+ entry = kmem_cache_alloc(swapped.cache, GFP_ATOMIC);
+ if (unlikely(!entry))
+ /* bad, can't allocate more mem */
+ goto out_locked;
+ }
+
+ index = page_private(page);
+ entry->swp_entry.val = index;
+ /*
+ * On numa we need to store the node id to ensure that we prefetch to
+ * the same node it came from.
+ */
+ store_swap_entry_node(entry, page);
+
+ if (likely(!radix_tree_insert(&swapped.swap_tree, index, entry))) {
+ /*
+ * If this is the first entry, kprefetchd needs to be
+ * (re)started.
+ */
+ if (!swapped.count)
+ wakeup = 1;
+ list_add(&entry->swapped_list, &swapped.list);
+ swapped.count++;
+ }
+
+out_locked:
+ spin_unlock(&swapped.lock);
+
+ /* Do the wakeup outside the lock to shorten lock hold time. */
+ if (wakeup)
+ wake_up_process(kprefetchd_task);
+
+ return;
+}
+
+/*
+ * Removes entries from the swapped_list. The radix tree allows us to quickly
+ * look up the entry from the index without having to iterate over the whole
+ * list.
+ */
+void remove_from_swapped_list(const unsigned long index)
+{
+ struct swapped_entry *entry;
+ unsigned long flags;
+
+ if (list_empty(&swapped.list))
+ return;
+
+ spin_lock_irqsave(&swapped.lock, flags);
+ entry = radix_tree_delete(&swapped.swap_tree, index);
+ if (likely(entry)) {
+ list_del_init(&entry->swapped_list);
+ swapped.count--;
+ kmem_cache_free(swapped.cache, entry);
+ }
+ spin_unlock_irqrestore(&swapped.lock, flags);
+}
+
+enum trickle_return {
+ TRICKLE_SUCCESS,
+ TRICKLE_FAILED,
+ TRICKLE_DELAY,
+};
+
+struct node_stats {
+ unsigned long last_free;
+ /* Free ram after a cycle of prefetching */
+ unsigned long current_free;
+ /* Free ram on this cycle of checking prefetch_suitable */
+ unsigned long prefetch_watermark;
+ /* Maximum amount we will prefetch to */
+ unsigned long highfree[MAX_NR_ZONES];
+ /* The amount of free ram before we start prefetching */
+ unsigned long lowfree[MAX_NR_ZONES];
+ /* The amount of free ram where we will stop prefetching */
+ unsigned long *pointfree[MAX_NR_ZONES];
+ /* highfree or lowfree depending on whether we've hit a watermark */
+};
+
+/*
+ * prefetch_stats stores the free ram data of each node and this is used to
+ * determine if a node is suitable for prefetching into.
+ */
+struct prefetch_stats {
+ nodemask_t prefetch_nodes;
+ /* Which nodes are currently suited to prefetching */
+ unsigned long prefetched_pages;
+ /* Total pages we've prefetched on this wakeup of kprefetchd */
+ struct node_stats node[MAX_NUMNODES];
+};
+
+static struct prefetch_stats sp_stat;
+
+/*
+ * This tries to read a swp_entry_t into swap cache for swap prefetching.
+ * If it returns TRICKLE_DELAY we should delay further prefetching.
+ */
+static enum trickle_return trickle_swap_cache_async(const swp_entry_t entry,
+ const int node)
+{
+ enum trickle_return ret = TRICKLE_FAILED;
+ struct page *page;
+
+ read_lock_irq(&swapper_space.tree_lock);
+ /* Entry may already exist */
+ page = radix_tree_lookup(&swapper_space.page_tree, entry.val);
+ read_unlock_irq(&swapper_space.tree_lock);
+ if (page) {
+ remove_from_swapped_list(entry.val);
+ goto out;
+ }
+
+ /*
+ * Get a new page to read from swap. We have already checked the
+ * watermarks so __alloc_pages will not call on reclaim.
+ */
+ page = alloc_pages_node(node, GFP_HIGHUSER & ~__GFP_WAIT, 0);
+ if (unlikely(!page)) {
+ ret = TRICKLE_DELAY;
+ goto out;
+ }
+
+ if (add_to_swap_cache(page, entry)) {
+ /* Failed to add to swap cache */
+ goto out_release;
+ }
+
+ /* Add them to the tail of the inactive list to preserve LRU order */
+ lru_cache_add_tail(page);
+ if (unlikely(swap_readpage(NULL, page))) {
+ ret = TRICKLE_DELAY;
+ goto out_release;
+ }
+
+ sp_stat.prefetched_pages++;
+ sp_stat.node[node].last_free--;
+
+ ret = TRICKLE_SUCCESS;
+out_release:
+ page_cache_release(page);
+out:
+ return ret;
+}
+
+static void clear_last_prefetch_free(void)
+{
+ int node;
+
+ /*
+ * Reset the nodes suitable for prefetching to all nodes. We could
+ * update the data to take into account memory hotplug if desired..
+ */
+ sp_stat.prefetch_nodes = node_online_map;
+ for_each_node_mask(node, sp_stat.prefetch_nodes) {
+ struct node_stats *ns = &sp_stat.node[node];
+
+ ns->last_free = 0;
+ }
+}
+
+static void clear_current_prefetch_free(void)
+{
+ int node;
+
+ sp_stat.prefetch_nodes = node_online_map;
+ for_each_node_mask(node, sp_stat.prefetch_nodes) {
+ struct node_stats *ns = &sp_stat.node[node];
+
+ ns->current_free = 0;
+ }
+}
+
+/*
+ * This updates the high and low watermarks of amount of free ram in each
+ * node used to start and stop prefetching. We prefetch from pages_high * 4
+ * down to pages_high * 3.
+ */
+static void examine_free_limits(void)
+{
+ struct zone *z;
+
+ for_each_zone(z) {
+ struct node_stats *ns;
+ int idx;
+
+ if (!populated_zone(z))
+ continue;
+
+ ns = &sp_stat.node[z->zone_pgdat->node_id];
+ idx = zone_idx(z);
+ ns->lowfree[idx] = z->pages_high * 3 + z->lowmem_reserve[idx];
+ ns->highfree[idx] = ns->lowfree[idx] + z->pages_high;
+
+ if (z->free_pages > ns->highfree[idx]) {
+ /*
+ * We've gotten above the high watermark of free pages
+ * so we can start prefetching till we get to the low
+ * watermark.
+ */
+ ns->pointfree[idx] = &ns->lowfree[idx];
+ }
+ }
+}
+
+/*
+ * We want to be absolutely certain it's ok to start prefetching.
+ */
+static int prefetch_suitable(void)
+{
+ unsigned long limit;
+ struct zone *z;
+ int node, ret = 0, test_pagestate = 0;
+
+ /* Purposefully racy */
+ if (test_bit(0, &swapped.busy)) {
+ __clear_bit(0, &swapped.busy);
+ goto out;
+ }
+
+ /*
+ * get_page_state and above_background_load are expensive so we only
+ * perform them every SWAP_CLUSTER_MAX prefetched_pages.
+ * We test to see if we're above_background_load as disk activity
+ * even at low priority can cause interrupt induced scheduling
+ * latencies.
+ */
+ if (!(sp_stat.prefetched_pages % SWAP_CLUSTER_MAX)) {
+ if (above_background_load())
+ goto out;
+ test_pagestate = 1;
+ }
+
+ clear_current_prefetch_free();
+
+ /*
+ * Have some hysteresis between where page reclaiming and prefetching
+ * will occur to prevent ping-ponging between them.
+ */
+ for_each_zone(z) {
+ struct node_stats *ns;
+ unsigned long free;
+ int idx;
+
+ if (!populated_zone(z))
+ continue;
+
+ node = z->zone_pgdat->node_id;
+ ns = &sp_stat.node[node];
+ idx = zone_idx(z);
+
+ free = z->free_pages;
+ if (free < *ns->pointfree[idx]) {
+ /*
+ * Free pages have dropped below the low watermark so
+ * we won't start prefetching again till we hit the
+ * high watermark of free pages.
+ */
+ ns->pointfree[idx] = &ns->highfree[idx];
+ node_clear(node, sp_stat.prefetch_nodes);
+ continue;
+ }
+ ns->current_free += free;
+ }
+
+ /*
+ * We iterate over each node testing to see if it is suitable for
+ * prefetching and clear the nodemask if it is not.
+ */
+ for_each_node_mask(node, sp_stat.prefetch_nodes) {
+ struct node_stats *ns = &sp_stat.node[node];
+ struct page_state ps;
+
+ /*
+ * We check to see that pages are not being allocated
+ * elsewhere at any significant rate implying any
+ * degree of memory pressure (eg during file reads)
+ */
+ if (ns->last_free) {
+ if (ns->current_free + SWAP_CLUSTER_MAX <
+ ns->last_free) {
+ ns->last_free = ns->current_free;
+ node_clear(node,
+ sp_stat.prefetch_nodes);
+ continue;
+ }
+ } else
+ ns->last_free = ns->current_free;
+
+ if (!test_pagestate)
+ continue;
+
+ get_page_state_node(&ps, node);
+
+ /* We shouldn't prefetch when we are doing writeback */
+ if (ps.nr_writeback) {
+ node_clear(node, sp_stat.prefetch_nodes);
+ continue;
+ }
+
+ /*
+ * >2/3 of the ram on this node is mapped, slab, swapcache or
+ * dirty, we need to leave some free for pagecache.
+ * Note that currently nr_slab is innacurate on numa because
+ * nr_slab is incremented on the node doing the accounting
+ * even if the slab is being allocated on a remote node. This
+ * would be expensive to fix and not of great significance.
+ */
+ limit = ps.nr_mapped + ps.nr_slab + ps.nr_dirty +
+ ps.nr_unstable + total_swapcache_pages;
+ if (limit > ns->prefetch_watermark) {
+ node_clear(node, sp_stat.prefetch_nodes);
+ continue;
+ }
+ }
+
+ if (nodes_empty(sp_stat.prefetch_nodes))
+ goto out;
+
+ /* Survived all that? Hooray we can prefetch! */
+ ret = 1;
+out:
+ return ret;
+}
+
+/*
+ * Get previous swapped entry when iterating over all entries. swapped.lock
+ * should be held and we should already ensure that entry exists.
+ */
+static inline struct swapped_entry *prev_swapped_entry
+ (struct swapped_entry *entry)
+{
+ return list_entry(entry->swapped_list.prev->prev,
+ struct swapped_entry, swapped_list);
+}
+
+/*
+ * trickle_swap is the main function that initiates the swap prefetching. It
+ * first checks to see if the busy flag is set, and does not prefetch if it
+ * is, as the flag implied we are low on memory or swapping in currently.
+ * Otherwise it runs until prefetch_suitable fails which occurs when the
+ * vm is busy, we prefetch to the watermark, or the list is empty or we have
+ * iterated over all entries
+ */
+static enum trickle_return trickle_swap(void)
+{
+ enum trickle_return ret = TRICKLE_DELAY;
+ struct swapped_entry *entry;
+
+ /*
+ * If laptop_mode is enabled don't prefetch to avoid hard drives
+ * doing unnecessary spin-ups
+ */
+ if (!swap_prefetch || laptop_mode)
+ return ret;
+
+ examine_free_limits();
+ entry = NULL;
+
+ for ( ; ; ) {
+ swp_entry_t swp_entry;
+ int node;
+
+ if (!prefetch_suitable())
+ break;
+
+ spin_lock(&swapped.lock);
+ if (list_empty(&swapped.list)) {
+ ret = TRICKLE_FAILED;
+ spin_unlock(&swapped.lock);
+ break;
+ }
+
+ if (!entry) {
+ /*
+ * This sets the entry for the first iteration. It
+ * also is a safeguard against the entry disappearing
+ * while the lock is not held.
+ */
+ entry = list_entry(swapped.list.prev,
+ struct swapped_entry, swapped_list);
+ } else if (entry->swapped_list.prev == swapped.list.next) {
+ /*
+ * If we have iterated over all entries and there are
+ * still entries that weren't swapped out there may
+ * be a reason we could not swap them back in so
+ * delay attempting further prefetching.
+ */
+ spin_unlock(&swapped.lock);
+ break;
+ }
+
+ node = get_swap_entry_node(entry);
+ if (!node_isset(node, sp_stat.prefetch_nodes)) {
+ /*
+ * We found an entry that belongs to a node that is
+ * not suitable for prefetching so skip it.
+ */
+ entry = prev_swapped_entry(entry);
+ spin_unlock(&swapped.lock);
+ continue;
+ }
+ swp_entry = entry->swp_entry;
+ entry = prev_swapped_entry(entry);
+ spin_unlock(&swapped.lock);
+
+ if (trickle_swap_cache_async(swp_entry, node) == TRICKLE_DELAY)
+ break;
+ }
+
+ if (sp_stat.prefetched_pages) {
+ lru_add_drain();
+ sp_stat.prefetched_pages = 0;
+ }
+ return ret;
+}
+
+static int kprefetchd(void *__unused)
+{
+ set_user_nice(current, 19);
+ /* Set ioprio to lowest if supported by i/o scheduler */
+ sys_ioprio_set(IOPRIO_WHO_PROCESS, 0, IOPRIO_CLASS_IDLE);
+
+ do {
+ try_to_freeze();
+
+ /*
+ * TRICKLE_FAILED implies no entries left - we do not schedule
+ * a wakeup, and further delay the next one.
+ */
+ if (trickle_swap() == TRICKLE_FAILED) {
+ set_current_state(TASK_INTERRUPTIBLE);
+ schedule();
+ }
+ clear_last_prefetch_free();
+ schedule_timeout_interruptible(PREFETCH_DELAY);
+ } while (!kthread_should_stop());
+
+ return 0;
+}
+
+/*
+ * Create kmem cache for swapped entries
+ */
+void __init prepare_swap_prefetch(void)
+{
+ struct zone *zone;
+
+ swapped.cache = kmem_cache_create("swapped_entry",
+ sizeof(struct swapped_entry), 0, SLAB_PANIC, NULL, NULL);
+
+ /*
+ * Set max number of entries to 2/3 the size of physical ram as we
+ * only ever prefetch to consume 2/3 of the ram.
+ */
+ swapped.maxcount = nr_free_pagecache_pages() / 3 * 2;
+
+ for_each_zone(zone) {
+ unsigned long present;
+ struct node_stats *ns;
+ int idx;
+
+ present = zone->present_pages;
+ if (!present)
+ continue;
+
+ ns = &sp_stat.node[zone->zone_pgdat->node_id];
+ ns->prefetch_watermark += present / 3 * 2;
+ idx = zone_idx(zone);
+ ns->pointfree[idx] = &ns->highfree[idx];
+ }
+}
+
+static int __init kprefetchd_init(void)
+{
+ kprefetchd_task = kthread_run(kprefetchd, NULL, "kprefetchd");
+
+ return 0;
+}
+
+static void __exit kprefetchd_exit(void)
+{
+ kthread_stop(kprefetchd_task);
+}
+
+module_init(kprefetchd_init);
+module_exit(kprefetchd_exit);
Index: linux-2.6.16-ck1/mm/swap_state.c
===================================================================
--- linux-2.6.16-ck1.orig/mm/swap_state.c 2006-03-20 20:46:24.000000000 +1100
+++ linux-2.6.16-ck1/mm/swap_state.c 2006-03-20 20:46:55.000000000 +1100
@@ -10,6 +10,7 @@
 #include <linux/mm.h>
 #include <linux/kernel_stat.h>
 #include <linux/swap.h>
+#include <linux/swap-prefetch.h>
 #include <linux/init.h>
 #include <linux/pagemap.h>
 #include <linux/buffer_head.h>
@@ -81,6 +82,7 @@ static int __add_to_swap_cache(struct pa
 error = radix_tree_insert(&swapper_space.page_tree,
 entry.val, page);
 if (!error) {
+ remove_from_swapped_list(entry.val);
 page_cache_get(page);
 SetPageLocked(page);
 SetPageSwapCache(page);
@@ -94,11 +96,12 @@ static int __add_to_swap_cache(struct pa
 return error;
 }

-static int add_to_swap_cache(struct page *page, swp_entry_t entry)
+int add_to_swap_cache(struct page *page, swp_entry_t entry)
 {
 int error;

 if (!swap_duplicate(entry)) {
+ remove_from_swapped_list(entry.val);
 INC_CACHE_INFO(noent_race);
 return -ENOENT;
 }
@@ -147,6 +150,9 @@ int add_to_swap(struct page * page, gfp_
 swp_entry_t entry;
 int err;

+ /* Swap prefetching is delayed if we're swapping pages */
+ delay_swap_prefetch();
+
 if (!PageLocked(page))
 BUG();

@@ -320,6 +326,9 @@ struct page *read_swap_cache_async(swp_e
 struct page *found_page, *new_page = NULL;
 int err;

+ /* Swap prefetching is delayed if we're already reading from swap */
+ delay_swap_prefetch();
+
 do {
 /*
 * First check the swap cache. Since this is normally
Index: linux-2.6.16-ck1/mm/vmscan.c
===================================================================
--- linux-2.6.16-ck1.orig/mm/vmscan.c 2006-03-20 20:46:24.000000000 +1100
+++ linux-2.6.16-ck1/mm/vmscan.c 2006-03-20 20:46:55.000000000 +1100
@@ -16,6 +16,7 @@
 #include <linux/slab.h>
 #include <linux/kernel_stat.h>
 #include <linux/swap.h>
+#include <linux/swap-prefetch.h>
 #include <linux/pagemap.h>
 #include <linux/init.h>
 #include <linux/highmem.h>
@@ -396,6 +397,7 @@ static int remove_mapping(struct address

 if (PageSwapCache(page)) {
 swp_entry_t swap = { .val = page_private(page) };
+ add_to_swapped_list(page);
 __delete_from_swap_cache(page);
 write_unlock_irq(&mapping->tree_lock);
 swap_free(swap);
@@ -1442,6 +1444,8 @@ int try_to_free_pages(struct zone **zone
 sc.may_writepage = !laptop_mode;
 sc.may_swap = 1;

+ delay_swap_prefetch();
+
 inc_page_state(allocstall);

 for (i = 0; zones[i] != NULL; i++) {
@@ -1788,6 +1792,8 @@ int shrink_all_memory(int nr_pages)
 .reclaimed_slab = 0,
 };

+ delay_swap_prefetch();
+
 current->reclaim_state = &reclaim_state;
 for_each_pgdat(pgdat) {
 int freed;
Index: linux-2.6.16-ck1/include/linux/mm_inline.h
===================================================================
--- linux-2.6.16-ck1.orig/include/linux/mm_inline.h 2006-03-20 20:46:24.000000000 +1100
+++ linux-2.6.16-ck1/include/linux/mm_inline.h 2006-03-20 20:46:55.000000000 +1100
@@ -14,6 +14,13 @@ add_page_to_inactive_list(struct zone *z
 }

 static inline void
+add_page_to_inactive_list_tail(struct zone *zone, struct page *page)
+{
+ list_add_tail(&page->lru, &zone->inactive_list);
+ zone->nr_inactive++;
+}
+
+static inline void
 del_page_from_active_list(struct zone *zone, struct page *page)
 {
 list_del(&page->lru);
Index: linux-2.6.16-ck1/include/linux/swap-prefetch.h
===================================================================
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6.16-ck1/include/linux/swap-prefetch.h 2006-03-20 20:46:55.000000000 +1100
@@ -0,0 +1,55 @@
+#ifndef SWAP_PREFETCH_H_INCLUDED
+#define SWAP_PREFETCH_H_INCLUDED
+
+#ifdef CONFIG_SWAP_PREFETCH
+/* mm/swap_prefetch.c */
+extern int swap_prefetch;
+struct swapped_entry {
+ swp_entry_t swp_entry; /* The actual swap entry */
+ struct list_head swapped_list; /* Linked list of entries */
+#if MAX_NUMNODES > 1
+ int node; /* Node id */
+#endif
+} __attribute__((packed));
+
+static inline void store_swap_entry_node(struct swapped_entry *entry,
+ struct page *page)
+{
+#if MAX_NUMNODES > 1
+ entry->node = page_to_nid(page);
+#endif
+}
+
+static inline int get_swap_entry_node(struct swapped_entry *entry)
+{
+#if MAX_NUMNODES > 1
+ return entry->node;
+#else
+ return 0;
+#endif
+}
+
+extern void add_to_swapped_list(struct page *page);
+extern void remove_from_swapped_list(const unsigned long index);
+extern void delay_swap_prefetch(void);
+extern void prepare_swap_prefetch(void);
+
+#else /* CONFIG_SWAP_PREFETCH */
+static inline void add_to_swapped_list(struct page *__unused)
+{
+}
+
+static inline void prepare_swap_prefetch(void)
+{
+}
+
+static inline void remove_from_swapped_list(const unsigned long __unused)
+{
+}
+
+static inline void delay_swap_prefetch(void)
+{
+}
+#endif /* CONFIG_SWAP_PREFETCH */
+
+#endif /* SWAP_PREFETCH_H_INCLUDED */