Magellan Linux

Annotation of /trunk/kernel26-alx/patches-2.6.20-r6/0015-2.6.20-mm-swap_prefetch-34.patch

Revision 1175 - Thu Oct 14 12:15:46 2010 UTC by niro
File size: 31305 byte(s)
-2.6.20-alx-r6 new magellan 0.5.2 kernel
1 niro 1175 Implement swap prefetching when the vm is relatively idle and there is free
2     ram available.  The code is based on some preliminary code by Thomas
3     Schlichter.
4    
5     This stores swapped entries in a list ordered most recently used and in a
6     radix tree.  It creates a low priority kernel thread running at nice 19
7     to do the prefetching at a later stage.
8    
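For illustration only, here is a minimal userspace C analogue of that bookkeeping
(it is not the patch's code): swapped entries sit on a most-recently-used list and
are also indexed for cheap lookup by swap entry value.  A fixed-size hash table
stands in for the kernel radix tree, and all names below are hypothetical.

/* Userspace sketch: MRU list of swapped entries plus a hash index that
 * stands in for the kernel radix tree.  All names are hypothetical. */
#include <stdio.h>
#include <stdlib.h>

#define NBUCKETS 1024

struct swapped_entry {
	unsigned long index;                 /* swap entry value */
	struct swapped_entry *prev, *next;   /* MRU list links */
	struct swapped_entry *hash_next;     /* hash chain (radix tree stand-in) */
};

static struct swapped_entry *mru_head;       /* most recently swapped out */
static struct swapped_entry *hash[NBUCKETS];

static unsigned long bucket(unsigned long index)
{
	return index % NBUCKETS;
}

/* Roughly what add_to_swapped_list() does: remember the entry and put it at
 * the head of the MRU list so stale entries age toward the tail. */
static void add_swapped(unsigned long index)
{
	struct swapped_entry *e = calloc(1, sizeof(*e));

	if (!e)
		return;
	e->index = index;
	e->next = mru_head;
	if (mru_head)
		mru_head->prev = e;
	mru_head = e;
	e->hash_next = hash[bucket(index)];
	hash[bucket(index)] = e;
}

/* Roughly what remove_from_swapped_list() does: look the entry up by index
 * without walking the whole list, then unlink it from both structures. */
static void remove_swapped(unsigned long index)
{
	struct swapped_entry **pp = &hash[bucket(index)], *e;

	while ((e = *pp) && e->index != index)
		pp = &e->hash_next;
	if (!e)
		return;
	*pp = e->hash_next;
	if (e->prev)
		e->prev->next = e->next;
	else
		mru_head = e->next;
	if (e->next)
		e->next->prev = e->prev;
	free(e);
}

int main(void)
{
	add_swapped(42);
	add_swapped(7);
	remove_swapped(42);
	printf("most recent entry: %lu\n", mru_head ? mru_head->index : 0);
	return 0;
}
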
9     Once pages have been added to the swapped list, a timer is started that
10     tests every 5 seconds for conditions suitable to prefetch swap pages.
11     Suitable conditions are defined as no pages being swapped in or out and no
12     watermark tests failing.  Significant amounts of dirtied ram and changes in
13     free ram representing disk writes or reads also prevent prefetching.
14    
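As a rough sketch of how that busyness is signalled (modelled on the patch's
delay_swap_prefetch()/prefetch_suitable() pair, but not its code): the swap-in,
swap-out and reclaim paths set a "vm busy" flag, and the prefetch side
test-and-clears it before doing any work, deliberately without strict locking.
The userspace C fragment below uses a C11 atomic and hypothetical names.

/* Userspace sketch of the "vm busy" flag; all names are hypothetical. */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_bool vm_busy;

/* Called from the hot paths (swap-in, swap-out and reclaim in the patch);
 * deliberately cheap and racy - a missed update only delays prefetching. */
static void mark_vm_busy(void)
{
	atomic_store_explicit(&vm_busy, true, memory_order_relaxed);
}

/* Called by the prefetch side each cycle: if anything marked the vm busy
 * since the last check, clear the flag and skip this cycle. */
static bool idle_enough_to_prefetch(void)
{
	return !atomic_exchange_explicit(&vm_busy, false, memory_order_relaxed);
}

int main(void)
{
	printf("%d\n", idle_enough_to_prefetch());  /* 1: idle, would prefetch */
	mark_vm_busy();
	printf("%d\n", idle_enough_to_prefetch());  /* 0: busy, skip this cycle */
	printf("%d\n", idle_enough_to_prefetch());  /* 1: flag was cleared above */
	return 0;
}
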
15     It then checks for spare ram, requiring at least 3 * pages_high free pages
16     per zone, and if this test succeeds it prefetches pages from swap into
17     the swap cache.  The pages are added to the tail of the inactive list to
18     preserve LRU ordering.
19    
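A small worked example of those per-zone limits, assuming the hysteresis the
patch sets up (prefetching starts only above roughly 4 * pages_high free pages
and stops again below 3 * pages_high); this is illustrative C, not kernel code.

/* Illustrative only: per-zone start/stop thresholds for prefetching. */
#include <stdio.h>

int main(void)
{
	unsigned long pages_high = 1000;               /* example zone watermark */
	unsigned long lowfree = pages_high * 3;        /* stop prefetching below this */
	unsigned long highfree = lowfree + pages_high; /* start prefetching above this */

	printf("start above %lu free pages, stop below %lu\n", highfree, lowfree);
	return 0;
}
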
20     Pages are prefetched until the list is empty or the vm is seen as busy
21     according to the previously described criteria.  On numa, the node id is
22     stored with each entry and an appropriate zonelist based on it is used
23     when allocating ram.
24    
25     The pages are copied to the swap cache and kept on backing store.  This
26     allows pressure on either physical ram or swap to be relieved readily,
27     without further I/O, by dropping whichever copy is not needed.
28    
29     Prefetching can be enabled/disabled via the tunable
30     /proc/sys/vm/swap_prefetch, which is initially set to 1 (enabled).
31    
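As a usage sketch only (assuming a kernel built with CONFIG_SWAP_PREFETCH), the
tunable can be read and flipped through procfs; from a root shell,
echo 0 > /proc/sys/vm/swap_prefetch does the same thing as the small C program
below.

/* Read and optionally set /proc/sys/vm/swap_prefetch.  Usage sketch only;
 * requires CONFIG_SWAP_PREFETCH, and writing requires root. */
#include <stdio.h>

#define TUNABLE "/proc/sys/vm/swap_prefetch"

int main(int argc, char **argv)
{
	FILE *f = fopen(TUNABLE, "r");
	int val;

	if (!f || fscanf(f, "%d", &val) != 1) {
		perror(TUNABLE);
		return 1;
	}
	fclose(f);
	printf("swap_prefetch is %d (%s)\n", val, val ? "enabled" : "disabled");

	if (argc > 1) {			/* e.g. pass "0" as argv[1] to disable */
		f = fopen(TUNABLE, "w");
		if (!f || fprintf(f, "%s\n", argv[1]) < 0) {
			perror(TUNABLE);
			return 1;
		}
		fclose(f);
	}
	return 0;
}
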
32     Enabling laptop_mode disables swap prefetching to prevent unnecessary disk
33     spin-ups.
34    
35     In testing on modern pc hardware this results in a 5-fold speedup in the
36     wall-clock activation time of the firefox browser after a worst case
37     complete swap-out of the browser on a static web page.
38    
39     From: Ingo Molnar <mingo@elte.hu>
40    
41       Fix potential swap-prefetch deadlock, found by the locking correctness
42       validator.
43    
44     Signed-off-by: Con Kolivas <kernel@kolivas.org>
45     Signed-off-by: Ingo Molnar <mingo@elte.hu>
46     Signed-off-by: Andrew Morton <akpm@osdl.org>
47    
48     Documentation/sysctl/vm.txt | 12
49     include/linux/mm_inline.h | 7
50     include/linux/swap-prefetch.h | 55 +++
51     include/linux/swap.h | 2
52     include/linux/sysctl.h | 1
53     init/Kconfig | 22 +
54     kernel/sysctl.c | 11
55     mm/Makefile | 1
56     mm/swap.c | 48 +++
57     mm/swap_prefetch.c | 581 ++++++++++++++++++++++++++++++++++++++++++
58     mm/swap_state.c | 11
59     mm/vmscan.c | 6
60     12 files changed, 756 insertions(+), 1 deletion(-)
61    
62     Index: linux-2.6.20-ck1/Documentation/sysctl/vm.txt
63     ===================================================================
64     --- linux-2.6.20-ck1.orig/Documentation/sysctl/vm.txt 2007-02-05 22:51:59.000000000 +1100
65     +++ linux-2.6.20-ck1/Documentation/sysctl/vm.txt 2007-02-16 19:01:33.000000000 +1100
66     @@ -31,6 +31,7 @@ Currently, these files are in /proc/sys/
67     - min_unmapped_ratio
68     - min_slab_ratio
69     - panic_on_oom
70     +- swap_prefetch
71    
72     ==============================================================
73    
74     @@ -205,3 +206,14 @@ rather than killing rogue processes, set
75    
76     The default value is 0.
77    
78     +==============================================================
79     +
80     +swap_prefetch
81     +
82     +This enables or disables the swap prefetching feature. When the virtual
83     +memory subsystem has been extremely idle for at least 5 seconds it will start
84     +copying back pages from swap into the swapcache and keep a copy in swap. In
85     +practice it can take many minutes before the vm is idle enough.
86     +
87     +The default value is 1.
88     +
89     Index: linux-2.6.20-ck1/include/linux/swap.h
90     ===================================================================
91     --- linux-2.6.20-ck1.orig/include/linux/swap.h 2007-02-05 22:52:04.000000000 +1100
92     +++ linux-2.6.20-ck1/include/linux/swap.h 2007-02-16 19:01:33.000000000 +1100
93     @@ -178,6 +178,7 @@ extern unsigned int nr_free_pagecache_pa
94     /* linux/mm/swap.c */
95     extern void FASTCALL(lru_cache_add(struct page *));
96     extern void FASTCALL(lru_cache_add_active(struct page *));
97     +extern void FASTCALL(lru_cache_add_tail(struct page *));
98     extern void FASTCALL(activate_page(struct page *));
99     extern void FASTCALL(mark_page_accessed(struct page *));
100     extern void lru_add_drain(void);
101     @@ -235,6 +236,7 @@ extern void free_pages_and_swap_cache(st
102     extern struct page * lookup_swap_cache(swp_entry_t);
103     extern struct page * read_swap_cache_async(swp_entry_t, struct vm_area_struct *vma,
104     unsigned long addr);
105     +extern int add_to_swap_cache(struct page *page, swp_entry_t entry);
106     /* linux/mm/swapfile.c */
107     extern long total_swap_pages;
108     extern unsigned int nr_swapfiles;
109     Index: linux-2.6.20-ck1/include/linux/sysctl.h
110     ===================================================================
111     --- linux-2.6.20-ck1.orig/include/linux/sysctl.h 2007-02-05 22:52:04.000000000 +1100
112     +++ linux-2.6.20-ck1/include/linux/sysctl.h 2007-02-16 19:01:33.000000000 +1100
113     @@ -202,6 +202,7 @@ enum
114     VM_PANIC_ON_OOM=33, /* panic at out-of-memory */
115     VM_VDSO_ENABLED=34, /* map VDSO into new processes? */
116     VM_MIN_SLAB=35, /* Percent pages ignored by zone reclaim */
117     + VM_SWAP_PREFETCH=36, /* swap prefetch */
118     };
119    
120    
121     Index: linux-2.6.20-ck1/init/Kconfig
122     ===================================================================
123     --- linux-2.6.20-ck1.orig/init/Kconfig 2007-02-05 22:52:04.000000000 +1100
124     +++ linux-2.6.20-ck1/init/Kconfig 2007-02-16 19:01:33.000000000 +1100
125     @@ -101,6 +101,28 @@ config SWAP
126     used to provide more virtual memory than the actual RAM present
127     in your computer. If unsure say Y.
128    
129     +config SWAP_PREFETCH
130     + bool "Support for prefetching swapped memory"
131     + depends on SWAP
132     + default y
133     + ---help---
134     + This option will allow the kernel to prefetch swapped memory pages
135     + when idle. The pages will be kept on both swap and in swap_cache
136     + thus avoiding the need for further I/O if either ram or swap space
137     + is required.
138     +
139     + What this will do on workstations is slowly bring back applications
140     + that have swapped out after memory intensive workloads back into
141     + physical ram if you have free ram at a later stage and the machine
142     + is relatively idle. This means that when you come back to your
143     + computer after leaving it idle for a while, applications will come
144     + to life faster. Note that your swap usage will appear to increase
145     + but these are cached pages, can be dropped freely by the vm, and it
146     + should stabilise around 50% swap usage maximum.
147     +
148     + Workstations and multiuser workstation servers will most likely want
149     + to say Y.
150     +
151     config SYSVIPC
152     bool "System V IPC"
153     ---help---
154     Index: linux-2.6.20-ck1/kernel/sysctl.c
155     ===================================================================
156     --- linux-2.6.20-ck1.orig/kernel/sysctl.c 2007-02-16 19:01:31.000000000 +1100
157     +++ linux-2.6.20-ck1/kernel/sysctl.c 2007-02-16 19:01:33.000000000 +1100
158     @@ -22,6 +22,7 @@
159     #include <linux/mm.h>
160     #include <linux/swap.h>
161     #include <linux/slab.h>
162     +#include <linux/swap-prefetch.h>
163     #include <linux/sysctl.h>
164     #include <linux/proc_fs.h>
165     #include <linux/capability.h>
166     @@ -1064,6 +1065,16 @@ static ctl_table vm_table[] = {
167     .extra1 = &zero,
168     },
169     #endif
170     +#ifdef CONFIG_SWAP_PREFETCH
171     + {
172     + .ctl_name = VM_SWAP_PREFETCH,
173     + .procname = "swap_prefetch",
174     + .data = &swap_prefetch,
175     + .maxlen = sizeof(swap_prefetch),
176     + .mode = 0644,
177     + .proc_handler = &proc_dointvec,
178     + },
179     +#endif
180     { .ctl_name = 0 }
181     };
182    
183     Index: linux-2.6.20-ck1/mm/Makefile
184     ===================================================================
185     --- linux-2.6.20-ck1.orig/mm/Makefile 2006-11-30 11:30:41.000000000 +1100
186     +++ linux-2.6.20-ck1/mm/Makefile 2007-02-16 19:01:33.000000000 +1100
187     @@ -17,6 +17,7 @@ ifeq ($(CONFIG_MMU)$(CONFIG_BLOCK),yy)
188     obj-y += bounce.o
189     endif
190     obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o thrash.o
191     +obj-$(CONFIG_SWAP_PREFETCH) += swap_prefetch.o
192     obj-$(CONFIG_HUGETLBFS) += hugetlb.o
193     obj-$(CONFIG_NUMA) += mempolicy.o
194     obj-$(CONFIG_SPARSEMEM) += sparse.o
195     Index: linux-2.6.20-ck1/mm/swap.c
196     ===================================================================
197     --- linux-2.6.20-ck1.orig/mm/swap.c 2007-02-05 22:52:04.000000000 +1100
198     +++ linux-2.6.20-ck1/mm/swap.c 2007-02-16 19:01:33.000000000 +1100
199     @@ -17,6 +17,7 @@
200     #include <linux/sched.h>
201     #include <linux/kernel_stat.h>
202     #include <linux/swap.h>
203     +#include <linux/swap-prefetch.h>
204     #include <linux/mman.h>
205     #include <linux/pagemap.h>
206     #include <linux/pagevec.h>
207     @@ -176,6 +177,7 @@ EXPORT_SYMBOL(mark_page_accessed);
208     */
209     static DEFINE_PER_CPU(struct pagevec, lru_add_pvecs) = { 0, };
210     static DEFINE_PER_CPU(struct pagevec, lru_add_active_pvecs) = { 0, };
211     +static DEFINE_PER_CPU(struct pagevec, lru_add_tail_pvecs) = { 0, };
212    
213     void fastcall lru_cache_add(struct page *page)
214     {
215     @@ -197,6 +199,31 @@ void fastcall lru_cache_add_active(struc
216     put_cpu_var(lru_add_active_pvecs);
217     }
218    
219     +static void __pagevec_lru_add_tail(struct pagevec *pvec)
220     +{
221     + int i;
222     + struct zone *zone = NULL;
223     +
224     + for (i = 0; i < pagevec_count(pvec); i++) {
225     + struct page *page = pvec->pages[i];
226     + struct zone *pagezone = page_zone(page);
227     +
228     + if (pagezone != zone) {
229     + if (zone)
230     + spin_unlock_irq(&zone->lru_lock);
231     + zone = pagezone;
232     + spin_lock_irq(&zone->lru_lock);
233     + }
234     + BUG_ON(PageLRU(page));
235     + SetPageLRU(page);
236     + add_page_to_inactive_list_tail(zone, page);
237     + }
238     + if (zone)
239     + spin_unlock_irq(&zone->lru_lock);
240     + release_pages(pvec->pages, pvec->nr, pvec->cold);
241     + pagevec_reinit(pvec);
242     +}
243     +
244     static void __lru_add_drain(int cpu)
245     {
246     struct pagevec *pvec = &per_cpu(lru_add_pvecs, cpu);
247     @@ -207,6 +234,9 @@ static void __lru_add_drain(int cpu)
248     pvec = &per_cpu(lru_add_active_pvecs, cpu);
249     if (pagevec_count(pvec))
250     __pagevec_lru_add_active(pvec);
251     + pvec = &per_cpu(lru_add_tail_pvecs, cpu);
252     + if (pagevec_count(pvec))
253     + __pagevec_lru_add_tail(pvec);
254     }
255    
256     void lru_add_drain(void)
257     @@ -403,6 +433,21 @@ void __pagevec_lru_add_active(struct pag
258     }
259    
260     /*
261     + * Function used uniquely to put pages back to the lru at the end of the
262     + * inactive list to preserve the lru order. Currently only used by swap
263     + * prefetch.
264     + */
265     +void fastcall lru_cache_add_tail(struct page *page)
266     +{
267     + struct pagevec *pvec = &get_cpu_var(lru_add_tail_pvecs);
268     +
269     + page_cache_get(page);
270     + if (!pagevec_add(pvec, page))
271     + __pagevec_lru_add_tail(pvec);
272     + put_cpu_var(lru_add_tail_pvecs);
273     +}
274     +
275     +/*
276     * Try to drop buffers from the pages in a pagevec
277     */
278     void pagevec_strip(struct pagevec *pvec)
279     @@ -514,6 +559,9 @@ void __init swap_setup(void)
280     * Right now other parts of the system means that we
281     * _really_ don't want to cluster much more
282     */
283     +
284     + prepare_swap_prefetch();
285     +
286     #ifdef CONFIG_HOTPLUG_CPU
287     hotcpu_notifier(cpu_swap_callback, 0);
288     #endif
289     Index: linux-2.6.20-ck1/mm/swap_prefetch.c
290     ===================================================================
291     --- /dev/null 1970-01-01 00:00:00.000000000 +0000
292     +++ linux-2.6.20-ck1/mm/swap_prefetch.c 2007-02-16 19:01:33.000000000 +1100
293     @@ -0,0 +1,581 @@
294     +/*
295     + * linux/mm/swap_prefetch.c
296     + *
297     + * Copyright (C) 2005-2006 Con Kolivas
298     + *
299     + * Written by Con Kolivas <kernel@kolivas.org>
300     + *
301     + * This program is free software; you can redistribute it and/or modify
302     + * it under the terms of the GNU General Public License version 2 as
303     + * published by the Free Software Foundation.
304     + */
305     +
306     +#include <linux/fs.h>
307     +#include <linux/mm.h>
308     +#include <linux/swap.h>
309     +#include <linux/swap-prefetch.h>
310     +#include <linux/ioprio.h>
311     +#include <linux/kthread.h>
312     +#include <linux/pagemap.h>
313     +#include <linux/syscalls.h>
314     +#include <linux/writeback.h>
315     +#include <linux/vmstat.h>
316     +#include <linux/freezer.h>
317     +
318     +/*
319     + * Time to delay prefetching if vm is busy or prefetching unsuccessful. There
320     + * needs to be at least this duration of idle time meaning in practice it can
321     + * be much longer
322     + */
323     +#define PREFETCH_DELAY (HZ * 5)
324     +
325     +/* sysctl - enable/disable swap prefetching */
326     +int swap_prefetch __read_mostly = 1;
327     +
328     +struct swapped_root {
329     + unsigned long busy; /* vm busy */
330     + spinlock_t lock; /* protects all data */
331     + struct list_head list; /* MRU list of swapped pages */
332     + struct radix_tree_root swap_tree; /* Lookup tree of pages */
333     + unsigned int count; /* Number of entries */
334     + unsigned int maxcount; /* Maximum entries allowed */
335     + struct kmem_cache *cache; /* Of struct swapped_entry */
336     +};
337     +
338     +static struct swapped_root swapped = {
339     + .lock = SPIN_LOCK_UNLOCKED,
340     + .list = LIST_HEAD_INIT(swapped.list),
341     + .swap_tree = RADIX_TREE_INIT(GFP_ATOMIC),
342     +};
343     +
344     +static struct task_struct *kprefetchd_task;
345     +
346     +/*
347     + * We check to see no part of the vm is busy. If it is this will interrupt
348     + * trickle_swap and wait another PREFETCH_DELAY. Purposefully racy.
349     + */
350     +inline void delay_swap_prefetch(void)
351     +{
352     + if (!test_bit(0, &swapped.busy))
353     + __set_bit(0, &swapped.busy);
354     +}
355     +
356     +/*
357     + * Drop behind accounting which keeps a list of the most recently used swap
358     + * entries.
359     + */
360     +void add_to_swapped_list(struct page *page)
361     +{
362     + struct swapped_entry *entry;
363     + unsigned long index, flags;
364     + int wakeup;
365     +
366     + if (!swap_prefetch)
367     + return;
368     +
369     + wakeup = 0;
370     +
371     + spin_lock_irqsave(&swapped.lock, flags);
372     + if (swapped.count >= swapped.maxcount) {
373     + /*
374     + * We limit the number of entries to 2/3 of physical ram.
375     + * Once the number of entries exceeds this we start removing
376     + * the least recently used entries.
377     + */
378     + entry = list_entry(swapped.list.next,
379     + struct swapped_entry, swapped_list);
380     + radix_tree_delete(&swapped.swap_tree, entry->swp_entry.val);
381     + list_del(&entry->swapped_list);
382     + swapped.count--;
383     + } else {
384     + entry = kmem_cache_alloc(swapped.cache, GFP_ATOMIC);
385     + if (unlikely(!entry))
386     + /* bad, can't allocate more mem */
387     + goto out_locked;
388     + }
389     +
390     + index = page_private(page);
391     + entry->swp_entry.val = index;
392     + /*
393     + * On numa we need to store the node id to ensure that we prefetch to
394     + * the same node it came from.
395     + */
396     + store_swap_entry_node(entry, page);
397     +
398     + if (likely(!radix_tree_insert(&swapped.swap_tree, index, entry))) {
399     + /*
400     + * If this is the first entry, kprefetchd needs to be
401     + * (re)started.
402     + */
403     + if (!swapped.count)
404     + wakeup = 1;
405     + list_add(&entry->swapped_list, &swapped.list);
406     + swapped.count++;
407     + }
408     +
409     +out_locked:
410     + spin_unlock_irqrestore(&swapped.lock, flags);
411     +
412     + /* Do the wakeup outside the lock to shorten lock hold time. */
413     + if (wakeup)
414     + wake_up_process(kprefetchd_task);
415     +
416     + return;
417     +}
418     +
419     +/*
420     + * Removes entries from the swapped_list. The radix tree allows us to quickly
421     + * look up the entry from the index without having to iterate over the whole
422     + * list.
423     + */
424     +void remove_from_swapped_list(const unsigned long index)
425     +{
426     + struct swapped_entry *entry;
427     + unsigned long flags;
428     +
429     + if (list_empty(&swapped.list))
430     + return;
431     +
432     + spin_lock_irqsave(&swapped.lock, flags);
433     + entry = radix_tree_delete(&swapped.swap_tree, index);
434     + if (likely(entry)) {
435     + list_del_init(&entry->swapped_list);
436     + swapped.count--;
437     + kmem_cache_free(swapped.cache, entry);
438     + }
439     + spin_unlock_irqrestore(&swapped.lock, flags);
440     +}
441     +
442     +enum trickle_return {
443     + TRICKLE_SUCCESS,
444     + TRICKLE_FAILED,
445     + TRICKLE_DELAY,
446     +};
447     +
448     +struct node_stats {
449     + unsigned long last_free;
450     + /* Free ram after a cycle of prefetching */
451     + unsigned long current_free;
452     + /* Free ram on this cycle of checking prefetch_suitable */
453     + unsigned long prefetch_watermark;
454     + /* Maximum amount we will prefetch to */
455     + unsigned long highfree[MAX_NR_ZONES];
456     + /* The amount of free ram before we start prefetching */
457     + unsigned long lowfree[MAX_NR_ZONES];
458     + /* The amount of free ram where we will stop prefetching */
459     + unsigned long *pointfree[MAX_NR_ZONES];
460     + /* highfree or lowfree depending on whether we've hit a watermark */
461     +};
462     +
463     +/*
464     + * prefetch_stats stores the free ram data of each node and this is used to
465     + * determine if a node is suitable for prefetching into.
466     + */
467     +struct prefetch_stats {
468     + nodemask_t prefetch_nodes;
469     + /* Which nodes are currently suited to prefetching */
470     + unsigned long prefetched_pages;
471     + /* Total pages we've prefetched on this wakeup of kprefetchd */
472     + struct node_stats node[MAX_NUMNODES];
473     +};
474     +
475     +static struct prefetch_stats sp_stat;
476     +
477     +/*
478     + * This tries to read a swp_entry_t into swap cache for swap prefetching.
479     + * If it returns TRICKLE_DELAY we should delay further prefetching.
480     + */
481     +static enum trickle_return trickle_swap_cache_async(const swp_entry_t entry,
482     + const int node)
483     +{
484     + enum trickle_return ret = TRICKLE_FAILED;
485     + struct page *page;
486     +
487     + read_lock_irq(&swapper_space.tree_lock);
488     + /* Entry may already exist */
489     + page = radix_tree_lookup(&swapper_space.page_tree, entry.val);
490     + read_unlock_irq(&swapper_space.tree_lock);
491     + if (page) {
492     + remove_from_swapped_list(entry.val);
493     + goto out;
494     + }
495     +
496     + /*
497     + * Get a new page to read from swap. We have already checked the
498     + * watermarks so __alloc_pages will not call on reclaim.
499     + */
500     + page = alloc_pages_node(node, GFP_HIGHUSER & ~__GFP_WAIT, 0);
501     + if (unlikely(!page)) {
502     + ret = TRICKLE_DELAY;
503     + goto out;
504     + }
505     +
506     + if (add_to_swap_cache(page, entry)) {
507     + /* Failed to add to swap cache */
508     + goto out_release;
509     + }
510     +
511     + /* Add them to the tail of the inactive list to preserve LRU order */
512     + lru_cache_add_tail(page);
513     + if (unlikely(swap_readpage(NULL, page))) {
514     + ret = TRICKLE_DELAY;
515     + goto out_release;
516     + }
517     +
518     + sp_stat.prefetched_pages++;
519     + sp_stat.node[node].last_free--;
520     +
521     + ret = TRICKLE_SUCCESS;
522     +out_release:
523     + page_cache_release(page);
524     +out:
525     + return ret;
526     +}
527     +
528     +static void clear_last_prefetch_free(void)
529     +{
530     + int node;
531     +
532     + /*
533     + * Reset the nodes suitable for prefetching to all nodes. We could
534     + * update the data to take into account memory hotplug if desired..
535     + */
536     + sp_stat.prefetch_nodes = node_online_map;
537     + for_each_node_mask(node, sp_stat.prefetch_nodes) {
538     + struct node_stats *ns = &sp_stat.node[node];
539     +
540     + ns->last_free = 0;
541     + }
542     +}
543     +
544     +static void clear_current_prefetch_free(void)
545     +{
546     + int node;
547     +
548     + sp_stat.prefetch_nodes = node_online_map;
549     + for_each_node_mask(node, sp_stat.prefetch_nodes) {
550     + struct node_stats *ns = &sp_stat.node[node];
551     +
552     + ns->current_free = 0;
553     + }
554     +}
555     +
556     +/*
557     + * This updates the high and low watermarks of amount of free ram in each
558     + * node used to start and stop prefetching. We prefetch from pages_high * 4
559     + * down to pages_high * 3.
560     + */
561     +static void examine_free_limits(void)
562     +{
563     + struct zone *z;
564     +
565     + for_each_zone(z) {
566     + struct node_stats *ns;
567     + int idx;
568     +
569     + if (!populated_zone(z))
570     + continue;
571     +
572     + ns = &sp_stat.node[z->zone_pgdat->node_id];
573     + idx = zone_idx(z);
574     + ns->lowfree[idx] = z->pages_high * 3;
575     + ns->highfree[idx] = ns->lowfree[idx] + z->pages_high;
576     +
577     + if (z->free_pages > ns->highfree[idx]) {
578     + /*
579     + * We've gotten above the high watermark of free pages
580     + * so we can start prefetching till we get to the low
581     + * watermark.
582     + */
583     + ns->pointfree[idx] = &ns->lowfree[idx];
584     + }
585     + }
586     +}
587     +
588     +/*
589     + * We want to be absolutely certain it's ok to start prefetching.
590     + */
591     +static int prefetch_suitable(void)
592     +{
593     + unsigned long limit;
594     + struct zone *z;
595     + int node, ret = 0, test_pagestate = 0;
596     +
597     + /* Purposefully racy */
598     + if (test_bit(0, &swapped.busy)) {
599     + __clear_bit(0, &swapped.busy);
600     + goto out;
601     + }
602     +
603     + /*
604     + * get_page_state and above_background_load are expensive so we only
605     + * perform them every SWAP_CLUSTER_MAX prefetched_pages.
606     + * We test to see if we're above_background_load as disk activity
607     + * even at low priority can cause interrupt induced scheduling
608     + * latencies.
609     + */
610     + if (!(sp_stat.prefetched_pages % SWAP_CLUSTER_MAX)) {
611     + if (above_background_load())
612     + goto out;
613     + test_pagestate = 1;
614     + }
615     +
616     + clear_current_prefetch_free();
617     +
618     + /*
619     + * Have some hysteresis between where page reclaiming and prefetching
620     + * will occur to prevent ping-ponging between them.
621     + */
622     + for_each_zone(z) {
623     + struct node_stats *ns;
624     + unsigned long free;
625     + int idx;
626     +
627     + if (!populated_zone(z))
628     + continue;
629     +
630     + node = z->zone_pgdat->node_id;
631     + ns = &sp_stat.node[node];
632     + idx = zone_idx(z);
633     +
634     + free = z->free_pages;
635     + if (free < *ns->pointfree[idx]) {
636     + /*
637     + * Free pages have dropped below the low watermark so
638     + * we won't start prefetching again till we hit the
639     + * high watermark of free pages.
640     + */
641     + ns->pointfree[idx] = &ns->highfree[idx];
642     + node_clear(node, sp_stat.prefetch_nodes);
643     + continue;
644     + }
645     + ns->current_free += free;
646     + }
647     +
648     + /*
649     + * We iterate over each node testing to see if it is suitable for
650     + * prefetching and clear the nodemask if it is not.
651     + */
652     + for_each_node_mask(node, sp_stat.prefetch_nodes) {
653     + struct node_stats *ns = &sp_stat.node[node];
654     +
655     + /*
656     + * We check to see that pages are not being allocated
657     + * elsewhere at any significant rate implying any
658     + * degree of memory pressure (eg during file reads)
659     + */
660     + if (ns->last_free) {
661     + if (ns->current_free + SWAP_CLUSTER_MAX <
662     + ns->last_free) {
663     + ns->last_free = ns->current_free;
664     + node_clear(node,
665     + sp_stat.prefetch_nodes);
666     + continue;
667     + }
668     + } else
669     + ns->last_free = ns->current_free;
670     +
671     + if (!test_pagestate)
672     + continue;
673     +
674     + /* We shouldn't prefetch when we are doing writeback */
675     + if (node_page_state(node, NR_WRITEBACK)) {
676     + node_clear(node, sp_stat.prefetch_nodes);
677     + continue;
678     + }
679     +
680     + /*
681     + * >2/3 of the ram on this node is mapped, slab, swapcache or
682     + * dirty, we need to leave some free for pagecache.
683     + */
684     + limit = node_page_state(node, NR_FILE_PAGES);
685     + limit += node_page_state(node, NR_SLAB_RECLAIMABLE);
686     + limit += node_page_state(node, NR_SLAB_UNRECLAIMABLE);
687     + limit += node_page_state(node, NR_FILE_DIRTY);
688     + limit += node_page_state(node, NR_UNSTABLE_NFS);
689     + limit += total_swapcache_pages;
690     + if (limit > ns->prefetch_watermark) {
691     + node_clear(node, sp_stat.prefetch_nodes);
692     + continue;
693     + }
694     + }
695     +
696     + if (nodes_empty(sp_stat.prefetch_nodes))
697     + goto out;
698     +
699     + /* Survived all that? Hooray we can prefetch! */
700     + ret = 1;
701     +out:
702     + return ret;
703     +}
704     +
705     +/*
706     + * Get previous swapped entry when iterating over all entries. swapped.lock
707     + * should be held and we should already ensure that entry exists.
708     + */
709     +static inline struct swapped_entry *prev_swapped_entry
710     + (struct swapped_entry *entry)
711     +{
712     + return list_entry(entry->swapped_list.prev->prev,
713     + struct swapped_entry, swapped_list);
714     +}
715     +
716     +/*
717     + * trickle_swap is the main function that initiates the swap prefetching. It
718     + * first checks to see if the busy flag is set, and does not prefetch if it
719     + * is, as the flag implied we are low on memory or swapping in currently.
720     + * Otherwise it runs until prefetch_suitable fails which occurs when the
721     + * vm is busy, we prefetch to the watermark, or the list is empty or we have
722     + * iterated over all entries
723     + */
724     +static enum trickle_return trickle_swap(void)
725     +{
726     + enum trickle_return ret = TRICKLE_DELAY;
727     + struct swapped_entry *entry;
728     + unsigned long flags;
729     +
730     + /*
731     + * If laptop_mode is enabled don't prefetch to avoid hard drives
732     + * doing unnecessary spin-ups
733     + */
734     + if (!swap_prefetch || laptop_mode)
735     + return ret;
736     +
737     + examine_free_limits();
738     + entry = NULL;
739     +
740     + for ( ; ; ) {
741     + swp_entry_t swp_entry;
742     + int node;
743     +
744     + if (!prefetch_suitable())
745     + break;
746     +
747     + spin_lock_irqsave(&swapped.lock, flags);
748     + if (list_empty(&swapped.list)) {
749     + ret = TRICKLE_FAILED;
750     + spin_unlock_irqrestore(&swapped.lock, flags);
751     + break;
752     + }
753     +
754     + if (!entry) {
755     + /*
756     + * This sets the entry for the first iteration. It
757     + * also is a safeguard against the entry disappearing
758     + * while the lock is not held.
759     + */
760     + entry = list_entry(swapped.list.prev,
761     + struct swapped_entry, swapped_list);
762     + } else if (entry->swapped_list.prev == swapped.list.next) {
763     + /*
764     + * If we have iterated over all entries and there are
765     + * still entries that weren't swapped out there may
766     + * be a reason we could not swap them back in so
767     + * delay attempting further prefetching.
768     + */
769     + spin_unlock_irqrestore(&swapped.lock, flags);
770     + break;
771     + }
772     +
773     + node = get_swap_entry_node(entry);
774     + if (!node_isset(node, sp_stat.prefetch_nodes)) {
775     + /*
776     + * We found an entry that belongs to a node that is
777     + * not suitable for prefetching so skip it.
778     + */
779     + entry = prev_swapped_entry(entry);
780     + spin_unlock_irqrestore(&swapped.lock, flags);
781     + continue;
782     + }
783     + swp_entry = entry->swp_entry;
784     + entry = prev_swapped_entry(entry);
785     + spin_unlock_irqrestore(&swapped.lock, flags);
786     +
787     + if (trickle_swap_cache_async(swp_entry, node) == TRICKLE_DELAY)
788     + break;
789     + }
790     +
791     + if (sp_stat.prefetched_pages) {
792     + lru_add_drain();
793     + sp_stat.prefetched_pages = 0;
794     + }
795     + return ret;
796     +}
797     +
798     +static int kprefetchd(void *__unused)
799     +{
800     + struct sched_param param = { .sched_priority = 0 };
801     +
802     + sched_setscheduler(current, SCHED_BATCH, &param);
803     + set_user_nice(current, 19);
804     + /* Set ioprio to lowest if supported by i/o scheduler */
805     + sys_ioprio_set(IOPRIO_WHO_PROCESS, 0, IOPRIO_CLASS_IDLE);
806     +
807     + /* kprefetchd has nothing to do until it is woken up the first time */
808     + set_current_state(TASK_INTERRUPTIBLE);
809     + schedule();
810     +
811     + do {
812     + try_to_freeze();
813     +
814     + /*
815     + * TRICKLE_FAILED implies no entries left - we do not schedule
816     + * a wakeup, and further delay the next one.
817     + */
818     + if (trickle_swap() == TRICKLE_FAILED) {
819     + set_current_state(TASK_INTERRUPTIBLE);
820     + schedule();
821     + }
822     + clear_last_prefetch_free();
823     + schedule_timeout_interruptible(PREFETCH_DELAY);
824     + } while (!kthread_should_stop());
825     +
826     + return 0;
827     +}
828     +
829     +/*
830     + * Create kmem cache for swapped entries
831     + */
832     +void __init prepare_swap_prefetch(void)
833     +{
834     + struct zone *zone;
835     +
836     + swapped.cache = kmem_cache_create("swapped_entry",
837     + sizeof(struct swapped_entry), 0, SLAB_PANIC, NULL, NULL);
838     +
839     + /*
840     + * Set max number of entries to 2/3 the size of physical ram as we
841     + * only ever prefetch to consume 2/3 of the ram.
842     + */
843     + swapped.maxcount = nr_free_pagecache_pages() / 3 * 2;
844     +
845     + for_each_zone(zone) {
846     + unsigned long present;
847     + struct node_stats *ns;
848     + int idx;
849     +
850     + present = zone->present_pages;
851     + if (!present)
852     + continue;
853     +
854     + ns = &sp_stat.node[zone->zone_pgdat->node_id];
855     + ns->prefetch_watermark += present / 3 * 2;
856     + idx = zone_idx(zone);
857     + ns->pointfree[idx] = &ns->highfree[idx];
858     + }
859     +}
860     +
861     +static int __init kprefetchd_init(void)
862     +{
863     + kprefetchd_task = kthread_run(kprefetchd, NULL, "kprefetchd");
864     +
865     + return 0;
866     +}
867     +
868     +static void __exit kprefetchd_exit(void)
869     +{
870     + kthread_stop(kprefetchd_task);
871     +}
872     +
873     +module_init(kprefetchd_init);
874     +module_exit(kprefetchd_exit);
875     Index: linux-2.6.20-ck1/mm/swap_state.c
876     ===================================================================
877     --- linux-2.6.20-ck1.orig/mm/swap_state.c 2006-09-21 19:55:01.000000000 +1000
878     +++ linux-2.6.20-ck1/mm/swap_state.c 2007-02-16 19:01:33.000000000 +1100
879     @@ -10,6 +10,7 @@
880     #include <linux/mm.h>
881     #include <linux/kernel_stat.h>
882     #include <linux/swap.h>
883     +#include <linux/swap-prefetch.h>
884     #include <linux/init.h>
885     #include <linux/pagemap.h>
886     #include <linux/buffer_head.h>
887     @@ -82,6 +83,7 @@ static int __add_to_swap_cache(struct pa
888     error = radix_tree_insert(&swapper_space.page_tree,
889     entry.val, page);
890     if (!error) {
891     + remove_from_swapped_list(entry.val);
892     page_cache_get(page);
893     SetPageLocked(page);
894     SetPageSwapCache(page);
895     @@ -95,11 +97,12 @@ static int __add_to_swap_cache(struct pa
896     return error;
897     }
898    
899     -static int add_to_swap_cache(struct page *page, swp_entry_t entry)
900     +int add_to_swap_cache(struct page *page, swp_entry_t entry)
901     {
902     int error;
903    
904     if (!swap_duplicate(entry)) {
905     + remove_from_swapped_list(entry.val);
906     INC_CACHE_INFO(noent_race);
907     return -ENOENT;
908     }
909     @@ -148,6 +151,9 @@ int add_to_swap(struct page * page, gfp_
910     swp_entry_t entry;
911     int err;
912    
913     + /* Swap prefetching is delayed if we're swapping pages */
914     + delay_swap_prefetch();
915     +
916     BUG_ON(!PageLocked(page));
917    
918     for (;;) {
919     @@ -320,6 +326,9 @@ struct page *read_swap_cache_async(swp_e
920     struct page *found_page, *new_page = NULL;
921     int err;
922    
923     + /* Swap prefetching is delayed if we're already reading from swap */
924     + delay_swap_prefetch();
925     +
926     do {
927     /*
928     * First check the swap cache. Since this is normally
929     Index: linux-2.6.20-ck1/mm/vmscan.c
930     ===================================================================
931     --- linux-2.6.20-ck1.orig/mm/vmscan.c 2007-02-05 22:52:04.000000000 +1100
932     +++ linux-2.6.20-ck1/mm/vmscan.c 2007-02-16 19:01:33.000000000 +1100
933     @@ -16,6 +16,7 @@
934     #include <linux/slab.h>
935     #include <linux/kernel_stat.h>
936     #include <linux/swap.h>
937     +#include <linux/swap-prefetch.h>
938     #include <linux/pagemap.h>
939     #include <linux/init.h>
940     #include <linux/highmem.h>
941     @@ -424,6 +425,7 @@ int remove_mapping(struct address_space
942    
943     if (PageSwapCache(page)) {
944     swp_entry_t swap = { .val = page_private(page) };
945     + add_to_swapped_list(page);
946     __delete_from_swap_cache(page);
947     write_unlock_irq(&mapping->tree_lock);
948     swap_free(swap);
949     @@ -1029,6 +1031,8 @@ unsigned long try_to_free_pages(struct z
950     .swappiness = vm_swappiness,
951     };
952    
953     + delay_swap_prefetch();
954     +
955     count_vm_event(ALLOCSTALL);
956    
957     for (i = 0; zones[i] != NULL; i++) {
958     @@ -1375,6 +1379,8 @@ static unsigned long shrink_all_zones(un
959     struct zone *zone;
960     unsigned long nr_to_scan, ret = 0;
961    
962     + delay_swap_prefetch();
963     +
964     for_each_zone(zone) {
965    
966     if (!populated_zone(zone))
967     Index: linux-2.6.20-ck1/include/linux/mm_inline.h
968     ===================================================================
969     --- linux-2.6.20-ck1.orig/include/linux/mm_inline.h 2006-06-18 15:32:49.000000000 +1000
970     +++ linux-2.6.20-ck1/include/linux/mm_inline.h 2007-02-16 19:01:33.000000000 +1100
971     @@ -14,6 +14,13 @@ add_page_to_inactive_list(struct zone *z
972     }
973    
974     static inline void
975     +add_page_to_inactive_list_tail(struct zone *zone, struct page *page)
976     +{
977     + list_add_tail(&page->lru, &zone->inactive_list);
978     + zone->nr_inactive++;
979     +}
980     +
981     +static inline void
982     del_page_from_active_list(struct zone *zone, struct page *page)
983     {
984     list_del(&page->lru);
985     Index: linux-2.6.20-ck1/include/linux/swap-prefetch.h
986     ===================================================================
987     --- /dev/null 1970-01-01 00:00:00.000000000 +0000
988     +++ linux-2.6.20-ck1/include/linux/swap-prefetch.h 2007-02-16 19:01:33.000000000 +1100
989     @@ -0,0 +1,55 @@
990     +#ifndef SWAP_PREFETCH_H_INCLUDED
991     +#define SWAP_PREFETCH_H_INCLUDED
992     +
993     +#ifdef CONFIG_SWAP_PREFETCH
994     +/* mm/swap_prefetch.c */
995     +extern int swap_prefetch;
996     +struct swapped_entry {
997     + swp_entry_t swp_entry; /* The actual swap entry */
998     + struct list_head swapped_list; /* Linked list of entries */
999     +#if MAX_NUMNODES > 1
1000     + int node; /* Node id */
1001     +#endif
1002     +} __attribute__((packed));
1003     +
1004     +static inline void store_swap_entry_node(struct swapped_entry *entry,
1005     + struct page *page)
1006     +{
1007     +#if MAX_NUMNODES > 1
1008     + entry->node = page_to_nid(page);
1009     +#endif
1010     +}
1011     +
1012     +static inline int get_swap_entry_node(struct swapped_entry *entry)
1013     +{
1014     +#if MAX_NUMNODES > 1
1015     + return entry->node;
1016     +#else
1017     + return 0;
1018     +#endif
1019     +}
1020     +
1021     +extern void add_to_swapped_list(struct page *page);
1022     +extern void remove_from_swapped_list(const unsigned long index);
1023     +extern void delay_swap_prefetch(void);
1024     +extern void prepare_swap_prefetch(void);
1025     +
1026     +#else /* CONFIG_SWAP_PREFETCH */
1027     +static inline void add_to_swapped_list(struct page *__unused)
1028     +{
1029     +}
1030     +
1031     +static inline void prepare_swap_prefetch(void)
1032     +{
1033     +}
1034     +
1035     +static inline void remove_from_swapped_list(const unsigned long __unused)
1036     +{
1037     +}
1038     +
1039     +static inline void delay_swap_prefetch(void)
1040     +{
1041     +}
1042     +#endif /* CONFIG_SWAP_PREFETCH */
1043     +
1044     +#endif /* SWAP_PREFETCH_H_INCLUDED */