Contents of /trunk/kernel26-magellan/patches-2.6.20-r2/0015-2.6.20-mm-swap_prefetch-34.patch
Revision 108
Mon Mar 12 00:20:28 2007 UTC (17 years, 6 months ago) by niro
File size: 31305 byte(s)
ver bump to 2.6.20-r2;
- using linux-2.6.20.2
- 2.6.20-ck1 patch set
- 8.34.8 ati-drivers
- 1.0-9755 nvidia-drivers
- 1.2.0 ipw-drivers
- squashfs-3.0 support
- vesafb-tng 1.0-rc2
- fbsplash-0.9.2-r5 for linux-2.6.20-rc6
- removed zd1211 drivers (now in upstream tree)
- disabled paravirt_ops for sake of non gpl video drivers
1 | Implement swap prefetching when the vm is relatively idle and there is free |
2 | ram available. The code is based on some preliminary code by Thomas |
3 | Schlichter. |
4 | |
5 | This stores swapped entries in a list ordered by most recent use, together |
6 | with a radix tree for lookup. It creates a low priority kernel thread |
7 | running at nice 19 to do the prefetching at a later stage. |
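
In outline, the bookkeeping the patch adds looks roughly as follows. This is a condensed sketch of the structures defined in include/linux/swap-prefetch.h and mm/swap_prefetch.c further down (NUMA conditionals and packing attributes omitted), not a literal excerpt:

    struct swapped_entry {
            swp_entry_t      swp_entry;      /* the swap location of the page */
            struct list_head swapped_list;   /* position on the MRU list */
            int              node;           /* NUMA node the page came from */
    };

    struct swapped_root {
            spinlock_t             lock;       /* protects list, tree and count */
            struct list_head       list;       /* MRU list of swapped pages */
            struct radix_tree_root swap_tree;  /* swap index -> entry lookup */
            unsigned int           count;      /* current number of entries */
            unsigned int           maxcount;   /* capped at 2/3 of physical ram */
    };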
8 | |
9 | Once pages have been added to the swapped list, a timer is started that |
10 | tests every 5 seconds for conditions suitable for prefetching swap pages. |
11 | Suitable conditions are defined as no pages being swapped in or out and no |
12 | watermark tests failing. Significant amounts of dirtied ram, or changes in |
13 | free ram that indicate disk writes or reads, also prevent prefetching. |
14 | |
15 | It then checks that there is spare ram, looking for at least 3 * pages_high |
16 | free pages per zone; if that test succeeds, it prefetches pages from swap |
17 | into the swap cache. The pages are added to the tail of the inactive list |
18 | to preserve LRU ordering. |
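
As a concrete illustration of the hysteresis implemented in mm/swap_prefetch.c below: if a zone's pages_high watermark is 1,000 pages, prefetching into that zone only begins once it has more than roughly 4,000 free pages (pages_high * 4) and stops again when its free pages fall back to 3,000 (pages_high * 3).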
19 | |
20 | Pages are prefetched until the list is empty or the vm is seen as busy |
21 | according to the previously described criteria. On numa, the node id is |
22 | stored with each entry, and an appropriate zonelist based on it is used |
23 | when allocating ram. |
24 | |
25 | The pages are copied into the swap cache and also kept on backing store. |
26 | This means that under pressure on either physical ram or swap, free pages |
27 | can be found readily without further I/O. |
28 | |
29 | Prefetching can be enabled or disabled via the tunable |
30 | /proc/sys/vm/swap_prefetch, which is initially set to 1 (enabled). |
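
For illustration, the tunable behaves like any other /proc/sys file. A minimal, hypothetical userspace sketch that disables prefetching (assuming a kernel built with CONFIG_SWAP_PREFETCH) could look like this:

    /* sketch only: write "0" to /proc/sys/vm/swap_prefetch to disable prefetching */
    #include <stdio.h>

    int main(void)
    {
            FILE *f = fopen("/proc/sys/vm/swap_prefetch", "w");

            if (!f) {
                    perror("swap_prefetch");  /* absent if CONFIG_SWAP_PREFETCH is off */
                    return 1;
            }
            fputs("0\n", f);                  /* 0 = disabled, 1 = enabled (the default) */
            fclose(f);
            return 0;
    }

Writing 1 back re-enables it; since the patch registers the entry in the vm sysctl table, the same value is also reachable as vm.swap_prefetch through sysctl(8).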
31 | |
32 | Enabling laptop_mode disables swap prefetching to prevent unnecessary disk |
33 | spin-ups. |
34 | |
35 | In testing on modern pc hardware this speeds up the wall-clock activation |
36 | time of the firefox browser 5 fold after a worst-case complete swap-out of |
37 | the browser while it was displaying a static web page. |
38 | |
39 | From: Ingo Molnar <mingo@elte.hu> |
40 | |
41 | Fix potential swap-prefetch deadlock, found by the locking correctness |
42 | validator. |
43 | |
44 | Signed-off-by: Con Kolivas <kernel@kolivas.org> |
45 | Signed-off-by: Ingo Molnar <mingo@elte.hu> |
46 | Signed-off-by: Andrew Morton <akpm@osdl.org> |
47 | |
48 | Documentation/sysctl/vm.txt | 12 |
49 | include/linux/mm_inline.h | 7 |
50 | include/linux/swap-prefetch.h | 55 +++ |
51 | include/linux/swap.h | 2 |
52 | include/linux/sysctl.h | 1 |
53 | init/Kconfig | 22 + |
54 | kernel/sysctl.c | 11 |
55 | mm/Makefile | 1 |
56 | mm/swap.c | 48 +++ |
57 | mm/swap_prefetch.c | 581 ++++++++++++++++++++++++++++++++++++++++++ |
58 | mm/swap_state.c | 11 |
59 | mm/vmscan.c | 6 |
60 | 12 files changed, 756 insertions(+), 1 deletion(-) |
61 | |
62 | Index: linux-2.6.20-ck1/Documentation/sysctl/vm.txt |
63 | =================================================================== |
64 | --- linux-2.6.20-ck1.orig/Documentation/sysctl/vm.txt 2007-02-05 22:51:59.000000000 +1100 |
65 | +++ linux-2.6.20-ck1/Documentation/sysctl/vm.txt 2007-02-16 19:01:33.000000000 +1100 |
66 | @@ -31,6 +31,7 @@ Currently, these files are in /proc/sys/ |
67 | - min_unmapped_ratio |
68 | - min_slab_ratio |
69 | - panic_on_oom |
70 | +- swap_prefetch |
71 | |
72 | ============================================================== |
73 | |
74 | @@ -205,3 +206,14 @@ rather than killing rogue processes, set |
75 | |
76 | The default value is 0. |
77 | |
78 | +============================================================== |
79 | + |
80 | +swap_prefetch |
81 | + |
82 | +This enables or disables the swap prefetching feature. When the virtual |
83 | +memory subsystem has been extremely idle for at least 5 seconds it will start |
84 | +copying back pages from swap into the swapcache and keep a copy in swap. In |
85 | +practice it can take many minutes before the vm is idle enough. |
86 | + |
87 | +The default value is 1. |
88 | + |
89 | Index: linux-2.6.20-ck1/include/linux/swap.h |
90 | =================================================================== |
91 | --- linux-2.6.20-ck1.orig/include/linux/swap.h 2007-02-05 22:52:04.000000000 +1100 |
92 | +++ linux-2.6.20-ck1/include/linux/swap.h 2007-02-16 19:01:33.000000000 +1100 |
93 | @@ -178,6 +178,7 @@ extern unsigned int nr_free_pagecache_pa |
94 | /* linux/mm/swap.c */ |
95 | extern void FASTCALL(lru_cache_add(struct page *)); |
96 | extern void FASTCALL(lru_cache_add_active(struct page *)); |
97 | +extern void FASTCALL(lru_cache_add_tail(struct page *)); |
98 | extern void FASTCALL(activate_page(struct page *)); |
99 | extern void FASTCALL(mark_page_accessed(struct page *)); |
100 | extern void lru_add_drain(void); |
101 | @@ -235,6 +236,7 @@ extern void free_pages_and_swap_cache(st |
102 | extern struct page * lookup_swap_cache(swp_entry_t); |
103 | extern struct page * read_swap_cache_async(swp_entry_t, struct vm_area_struct *vma, |
104 | unsigned long addr); |
105 | +extern int add_to_swap_cache(struct page *page, swp_entry_t entry); |
106 | /* linux/mm/swapfile.c */ |
107 | extern long total_swap_pages; |
108 | extern unsigned int nr_swapfiles; |
109 | Index: linux-2.6.20-ck1/include/linux/sysctl.h |
110 | =================================================================== |
111 | --- linux-2.6.20-ck1.orig/include/linux/sysctl.h 2007-02-05 22:52:04.000000000 +1100 |
112 | +++ linux-2.6.20-ck1/include/linux/sysctl.h 2007-02-16 19:01:33.000000000 +1100 |
113 | @@ -202,6 +202,7 @@ enum |
114 | VM_PANIC_ON_OOM=33, /* panic at out-of-memory */ |
115 | VM_VDSO_ENABLED=34, /* map VDSO into new processes? */ |
116 | VM_MIN_SLAB=35, /* Percent pages ignored by zone reclaim */ |
117 | + VM_SWAP_PREFETCH=36, /* swap prefetch */ |
118 | }; |
119 | |
120 | |
121 | Index: linux-2.6.20-ck1/init/Kconfig |
122 | =================================================================== |
123 | --- linux-2.6.20-ck1.orig/init/Kconfig 2007-02-05 22:52:04.000000000 +1100 |
124 | +++ linux-2.6.20-ck1/init/Kconfig 2007-02-16 19:01:33.000000000 +1100 |
125 | @@ -101,6 +101,28 @@ config SWAP |
126 | used to provide more virtual memory than the actual RAM present |
127 | in your computer. If unsure say Y. |
128 | |
129 | +config SWAP_PREFETCH |
130 | + bool "Support for prefetching swapped memory" |
131 | + depends on SWAP |
132 | + default y |
133 | + ---help--- |
134 | + This option will allow the kernel to prefetch swapped memory pages |
135 | + when idle. The pages will be kept on both swap and in swap_cache |
136 | + thus avoiding the need for further I/O if either ram or swap space |
137 | + is required. |
138 | + |
139 | + What this will do on workstations is slowly bring back applications |
140 | + that have swapped out after memory intensive workloads back into |
141 | + physical ram if you have free ram at a later stage and the machine |
142 | + is relatively idle. This means that when you come back to your |
143 | + computer after leaving it idle for a while, applications will come |
144 | + to life faster. Note that your swap usage will appear to increase |
145 | + but these are cached pages, can be dropped freely by the vm, and it |
146 | + should stabilise around 50% swap usage maximum. |
147 | + |
148 | + Workstations and multiuser workstation servers will most likely want |
149 | + to say Y. |
150 | + |
151 | config SYSVIPC |
152 | bool "System V IPC" |
153 | ---help--- |
154 | Index: linux-2.6.20-ck1/kernel/sysctl.c |
155 | =================================================================== |
156 | --- linux-2.6.20-ck1.orig/kernel/sysctl.c 2007-02-16 19:01:31.000000000 +1100 |
157 | +++ linux-2.6.20-ck1/kernel/sysctl.c 2007-02-16 19:01:33.000000000 +1100 |
158 | @@ -22,6 +22,7 @@ |
159 | #include <linux/mm.h> |
160 | #include <linux/swap.h> |
161 | #include <linux/slab.h> |
162 | +#include <linux/swap-prefetch.h> |
163 | #include <linux/sysctl.h> |
164 | #include <linux/proc_fs.h> |
165 | #include <linux/capability.h> |
166 | @@ -1064,6 +1065,16 @@ static ctl_table vm_table[] = { |
167 | .extra1 = &zero, |
168 | }, |
169 | #endif |
170 | +#ifdef CONFIG_SWAP_PREFETCH |
171 | + { |
172 | + .ctl_name = VM_SWAP_PREFETCH, |
173 | + .procname = "swap_prefetch", |
174 | + .data = &swap_prefetch, |
175 | + .maxlen = sizeof(swap_prefetch), |
176 | + .mode = 0644, |
177 | + .proc_handler = &proc_dointvec, |
178 | + }, |
179 | +#endif |
180 | { .ctl_name = 0 } |
181 | }; |
182 | |
183 | Index: linux-2.6.20-ck1/mm/Makefile |
184 | =================================================================== |
185 | --- linux-2.6.20-ck1.orig/mm/Makefile 2006-11-30 11:30:41.000000000 +1100 |
186 | +++ linux-2.6.20-ck1/mm/Makefile 2007-02-16 19:01:33.000000000 +1100 |
187 | @@ -17,6 +17,7 @@ ifeq ($(CONFIG_MMU)$(CONFIG_BLOCK),yy) |
188 | obj-y += bounce.o |
189 | endif |
190 | obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o thrash.o |
191 | +obj-$(CONFIG_SWAP_PREFETCH) += swap_prefetch.o |
192 | obj-$(CONFIG_HUGETLBFS) += hugetlb.o |
193 | obj-$(CONFIG_NUMA) += mempolicy.o |
194 | obj-$(CONFIG_SPARSEMEM) += sparse.o |
195 | Index: linux-2.6.20-ck1/mm/swap.c |
196 | =================================================================== |
197 | --- linux-2.6.20-ck1.orig/mm/swap.c 2007-02-05 22:52:04.000000000 +1100 |
198 | +++ linux-2.6.20-ck1/mm/swap.c 2007-02-16 19:01:33.000000000 +1100 |
199 | @@ -17,6 +17,7 @@ |
200 | #include <linux/sched.h> |
201 | #include <linux/kernel_stat.h> |
202 | #include <linux/swap.h> |
203 | +#include <linux/swap-prefetch.h> |
204 | #include <linux/mman.h> |
205 | #include <linux/pagemap.h> |
206 | #include <linux/pagevec.h> |
207 | @@ -176,6 +177,7 @@ EXPORT_SYMBOL(mark_page_accessed); |
208 | */ |
209 | static DEFINE_PER_CPU(struct pagevec, lru_add_pvecs) = { 0, }; |
210 | static DEFINE_PER_CPU(struct pagevec, lru_add_active_pvecs) = { 0, }; |
211 | +static DEFINE_PER_CPU(struct pagevec, lru_add_tail_pvecs) = { 0, }; |
212 | |
213 | void fastcall lru_cache_add(struct page *page) |
214 | { |
215 | @@ -197,6 +199,31 @@ void fastcall lru_cache_add_active(struc |
216 | put_cpu_var(lru_add_active_pvecs); |
217 | } |
218 | |
219 | +static void __pagevec_lru_add_tail(struct pagevec *pvec) |
220 | +{ |
221 | + int i; |
222 | + struct zone *zone = NULL; |
223 | + |
224 | + for (i = 0; i < pagevec_count(pvec); i++) { |
225 | + struct page *page = pvec->pages[i]; |
226 | + struct zone *pagezone = page_zone(page); |
227 | + |
228 | + if (pagezone != zone) { |
229 | + if (zone) |
230 | + spin_unlock_irq(&zone->lru_lock); |
231 | + zone = pagezone; |
232 | + spin_lock_irq(&zone->lru_lock); |
233 | + } |
234 | + BUG_ON(PageLRU(page)); |
235 | + SetPageLRU(page); |
236 | + add_page_to_inactive_list_tail(zone, page); |
237 | + } |
238 | + if (zone) |
239 | + spin_unlock_irq(&zone->lru_lock); |
240 | + release_pages(pvec->pages, pvec->nr, pvec->cold); |
241 | + pagevec_reinit(pvec); |
242 | +} |
243 | + |
244 | static void __lru_add_drain(int cpu) |
245 | { |
246 | struct pagevec *pvec = &per_cpu(lru_add_pvecs, cpu); |
247 | @@ -207,6 +234,9 @@ static void __lru_add_drain(int cpu) |
248 | pvec = &per_cpu(lru_add_active_pvecs, cpu); |
249 | if (pagevec_count(pvec)) |
250 | __pagevec_lru_add_active(pvec); |
251 | + pvec = &per_cpu(lru_add_tail_pvecs, cpu); |
252 | + if (pagevec_count(pvec)) |
253 | + __pagevec_lru_add_tail(pvec); |
254 | } |
255 | |
256 | void lru_add_drain(void) |
257 | @@ -403,6 +433,21 @@ void __pagevec_lru_add_active(struct pag |
258 | } |
259 | |
260 | /* |
261 | + * Function used uniquely to put pages back to the lru at the end of the |
262 | + * inactive list to preserve the lru order. Currently only used by swap |
263 | + * prefetch. |
264 | + */ |
265 | +void fastcall lru_cache_add_tail(struct page *page) |
266 | +{ |
267 | + struct pagevec *pvec = &get_cpu_var(lru_add_tail_pvecs); |
268 | + |
269 | + page_cache_get(page); |
270 | + if (!pagevec_add(pvec, page)) |
271 | + __pagevec_lru_add_tail(pvec); |
272 | + put_cpu_var(lru_add_tail_pvecs); |
273 | +} |
274 | + |
275 | +/* |
276 | * Try to drop buffers from the pages in a pagevec |
277 | */ |
278 | void pagevec_strip(struct pagevec *pvec) |
279 | @@ -514,6 +559,9 @@ void __init swap_setup(void) |
280 | * Right now other parts of the system means that we |
281 | * _really_ don't want to cluster much more |
282 | */ |
283 | + |
284 | + prepare_swap_prefetch(); |
285 | + |
286 | #ifdef CONFIG_HOTPLUG_CPU |
287 | hotcpu_notifier(cpu_swap_callback, 0); |
288 | #endif |
289 | Index: linux-2.6.20-ck1/mm/swap_prefetch.c |
290 | =================================================================== |
291 | --- /dev/null 1970-01-01 00:00:00.000000000 +0000 |
292 | +++ linux-2.6.20-ck1/mm/swap_prefetch.c 2007-02-16 19:01:33.000000000 +1100 |
293 | @@ -0,0 +1,581 @@ |
294 | +/* |
295 | + * linux/mm/swap_prefetch.c |
296 | + * |
297 | + * Copyright (C) 2005-2006 Con Kolivas |
298 | + * |
299 | + * Written by Con Kolivas <kernel@kolivas.org> |
300 | + * |
301 | + * This program is free software; you can redistribute it and/or modify |
302 | + * it under the terms of the GNU General Public License version 2 as |
303 | + * published by the Free Software Foundation. |
304 | + */ |
305 | + |
306 | +#include <linux/fs.h> |
307 | +#include <linux/mm.h> |
308 | +#include <linux/swap.h> |
309 | +#include <linux/swap-prefetch.h> |
310 | +#include <linux/ioprio.h> |
311 | +#include <linux/kthread.h> |
312 | +#include <linux/pagemap.h> |
313 | +#include <linux/syscalls.h> |
314 | +#include <linux/writeback.h> |
315 | +#include <linux/vmstat.h> |
316 | +#include <linux/freezer.h> |
317 | + |
318 | +/* |
319 | + * Time to delay prefetching if vm is busy or prefetching unsuccessful. There |
320 | + * needs to be at least this duration of idle time meaning in practice it can |
321 | + * be much longer |
322 | + */ |
323 | +#define PREFETCH_DELAY (HZ * 5) |
324 | + |
325 | +/* sysctl - enable/disable swap prefetching */ |
326 | +int swap_prefetch __read_mostly = 1; |
327 | + |
328 | +struct swapped_root { |
329 | + unsigned long busy; /* vm busy */ |
330 | + spinlock_t lock; /* protects all data */ |
331 | + struct list_head list; /* MRU list of swapped pages */ |
332 | + struct radix_tree_root swap_tree; /* Lookup tree of pages */ |
333 | + unsigned int count; /* Number of entries */ |
334 | + unsigned int maxcount; /* Maximum entries allowed */ |
335 | + struct kmem_cache *cache; /* Of struct swapped_entry */ |
336 | +}; |
337 | + |
338 | +static struct swapped_root swapped = { |
339 | + .lock = SPIN_LOCK_UNLOCKED, |
340 | + .list = LIST_HEAD_INIT(swapped.list), |
341 | + .swap_tree = RADIX_TREE_INIT(GFP_ATOMIC), |
342 | +}; |
343 | + |
344 | +static struct task_struct *kprefetchd_task; |
345 | + |
346 | +/* |
347 | + * We check to see no part of the vm is busy. If it is this will interrupt |
348 | + * trickle_swap and wait another PREFETCH_DELAY. Purposefully racy. |
349 | + */ |
350 | +inline void delay_swap_prefetch(void) |
351 | +{ |
352 | + if (!test_bit(0, &swapped.busy)) |
353 | + __set_bit(0, &swapped.busy); |
354 | +} |
355 | + |
356 | +/* |
357 | + * Drop behind accounting which keeps a list of the most recently used swap |
358 | + * entries. |
359 | + */ |
360 | +void add_to_swapped_list(struct page *page) |
361 | +{ |
362 | + struct swapped_entry *entry; |
363 | + unsigned long index, flags; |
364 | + int wakeup; |
365 | + |
366 | + if (!swap_prefetch) |
367 | + return; |
368 | + |
369 | + wakeup = 0; |
370 | + |
371 | + spin_lock_irqsave(&swapped.lock, flags); |
372 | + if (swapped.count >= swapped.maxcount) { |
373 | + /* |
374 | + * We limit the number of entries to 2/3 of physical ram. |
375 | + * Once the number of entries exceeds this we start removing |
376 | + * the least recently used entries. |
377 | + */ |
378 | + entry = list_entry(swapped.list.next, |
379 | + struct swapped_entry, swapped_list); |
380 | + radix_tree_delete(&swapped.swap_tree, entry->swp_entry.val); |
381 | + list_del(&entry->swapped_list); |
382 | + swapped.count--; |
383 | + } else { |
384 | + entry = kmem_cache_alloc(swapped.cache, GFP_ATOMIC); |
385 | + if (unlikely(!entry)) |
386 | + /* bad, can't allocate more mem */ |
387 | + goto out_locked; |
388 | + } |
389 | + |
390 | + index = page_private(page); |
391 | + entry->swp_entry.val = index; |
392 | + /* |
393 | + * On numa we need to store the node id to ensure that we prefetch to |
394 | + * the same node it came from. |
395 | + */ |
396 | + store_swap_entry_node(entry, page); |
397 | + |
398 | + if (likely(!radix_tree_insert(&swapped.swap_tree, index, entry))) { |
399 | + /* |
400 | + * If this is the first entry, kprefetchd needs to be |
401 | + * (re)started. |
402 | + */ |
403 | + if (!swapped.count) |
404 | + wakeup = 1; |
405 | + list_add(&entry->swapped_list, &swapped.list); |
406 | + swapped.count++; |
407 | + } |
408 | + |
409 | +out_locked: |
410 | + spin_unlock_irqrestore(&swapped.lock, flags); |
411 | + |
412 | + /* Do the wakeup outside the lock to shorten lock hold time. */ |
413 | + if (wakeup) |
414 | + wake_up_process(kprefetchd_task); |
415 | + |
416 | + return; |
417 | +} |
418 | + |
419 | +/* |
420 | + * Removes entries from the swapped_list. The radix tree allows us to quickly |
421 | + * look up the entry from the index without having to iterate over the whole |
422 | + * list. |
423 | + */ |
424 | +void remove_from_swapped_list(const unsigned long index) |
425 | +{ |
426 | + struct swapped_entry *entry; |
427 | + unsigned long flags; |
428 | + |
429 | + if (list_empty(&swapped.list)) |
430 | + return; |
431 | + |
432 | + spin_lock_irqsave(&swapped.lock, flags); |
433 | + entry = radix_tree_delete(&swapped.swap_tree, index); |
434 | + if (likely(entry)) { |
435 | + list_del_init(&entry->swapped_list); |
436 | + swapped.count--; |
437 | + kmem_cache_free(swapped.cache, entry); |
438 | + } |
439 | + spin_unlock_irqrestore(&swapped.lock, flags); |
440 | +} |
441 | + |
442 | +enum trickle_return { |
443 | + TRICKLE_SUCCESS, |
444 | + TRICKLE_FAILED, |
445 | + TRICKLE_DELAY, |
446 | +}; |
447 | + |
448 | +struct node_stats { |
449 | + unsigned long last_free; |
450 | + /* Free ram after a cycle of prefetching */ |
451 | + unsigned long current_free; |
452 | + /* Free ram on this cycle of checking prefetch_suitable */ |
453 | + unsigned long prefetch_watermark; |
454 | + /* Maximum amount we will prefetch to */ |
455 | + unsigned long highfree[MAX_NR_ZONES]; |
456 | + /* The amount of free ram before we start prefetching */ |
457 | + unsigned long lowfree[MAX_NR_ZONES]; |
458 | + /* The amount of free ram where we will stop prefetching */ |
459 | + unsigned long *pointfree[MAX_NR_ZONES]; |
460 | + /* highfree or lowfree depending on whether we've hit a watermark */ |
461 | +}; |
462 | + |
463 | +/* |
464 | + * prefetch_stats stores the free ram data of each node and this is used to |
465 | + * determine if a node is suitable for prefetching into. |
466 | + */ |
467 | +struct prefetch_stats { |
468 | + nodemask_t prefetch_nodes; |
469 | + /* Which nodes are currently suited to prefetching */ |
470 | + unsigned long prefetched_pages; |
471 | + /* Total pages we've prefetched on this wakeup of kprefetchd */ |
472 | + struct node_stats node[MAX_NUMNODES]; |
473 | +}; |
474 | + |
475 | +static struct prefetch_stats sp_stat; |
476 | + |
477 | +/* |
478 | + * This tries to read a swp_entry_t into swap cache for swap prefetching. |
479 | + * If it returns TRICKLE_DELAY we should delay further prefetching. |
480 | + */ |
481 | +static enum trickle_return trickle_swap_cache_async(const swp_entry_t entry, |
482 | + const int node) |
483 | +{ |
484 | + enum trickle_return ret = TRICKLE_FAILED; |
485 | + struct page *page; |
486 | + |
487 | + read_lock_irq(&swapper_space.tree_lock); |
488 | + /* Entry may already exist */ |
489 | + page = radix_tree_lookup(&swapper_space.page_tree, entry.val); |
490 | + read_unlock_irq(&swapper_space.tree_lock); |
491 | + if (page) { |
492 | + remove_from_swapped_list(entry.val); |
493 | + goto out; |
494 | + } |
495 | + |
496 | + /* |
497 | + * Get a new page to read from swap. We have already checked the |
498 | + * watermarks so __alloc_pages will not call on reclaim. |
499 | + */ |
500 | + page = alloc_pages_node(node, GFP_HIGHUSER & ~__GFP_WAIT, 0); |
501 | + if (unlikely(!page)) { |
502 | + ret = TRICKLE_DELAY; |
503 | + goto out; |
504 | + } |
505 | + |
506 | + if (add_to_swap_cache(page, entry)) { |
507 | + /* Failed to add to swap cache */ |
508 | + goto out_release; |
509 | + } |
510 | + |
511 | + /* Add them to the tail of the inactive list to preserve LRU order */ |
512 | + lru_cache_add_tail(page); |
513 | + if (unlikely(swap_readpage(NULL, page))) { |
514 | + ret = TRICKLE_DELAY; |
515 | + goto out_release; |
516 | + } |
517 | + |
518 | + sp_stat.prefetched_pages++; |
519 | + sp_stat.node[node].last_free--; |
520 | + |
521 | + ret = TRICKLE_SUCCESS; |
522 | +out_release: |
523 | + page_cache_release(page); |
524 | +out: |
525 | + return ret; |
526 | +} |
527 | + |
528 | +static void clear_last_prefetch_free(void) |
529 | +{ |
530 | + int node; |
531 | + |
532 | + /* |
533 | + * Reset the nodes suitable for prefetching to all nodes. We could |
534 | + * update the data to take into account memory hotplug if desired.. |
535 | + */ |
536 | + sp_stat.prefetch_nodes = node_online_map; |
537 | + for_each_node_mask(node, sp_stat.prefetch_nodes) { |
538 | + struct node_stats *ns = &sp_stat.node[node]; |
539 | + |
540 | + ns->last_free = 0; |
541 | + } |
542 | +} |
543 | + |
544 | +static void clear_current_prefetch_free(void) |
545 | +{ |
546 | + int node; |
547 | + |
548 | + sp_stat.prefetch_nodes = node_online_map; |
549 | + for_each_node_mask(node, sp_stat.prefetch_nodes) { |
550 | + struct node_stats *ns = &sp_stat.node[node]; |
551 | + |
552 | + ns->current_free = 0; |
553 | + } |
554 | +} |
555 | + |
556 | +/* |
557 | + * This updates the high and low watermarks of amount of free ram in each |
558 | + * node used to start and stop prefetching. We prefetch from pages_high * 4 |
559 | + * down to pages_high * 3. |
560 | + */ |
561 | +static void examine_free_limits(void) |
562 | +{ |
563 | + struct zone *z; |
564 | + |
565 | + for_each_zone(z) { |
566 | + struct node_stats *ns; |
567 | + int idx; |
568 | + |
569 | + if (!populated_zone(z)) |
570 | + continue; |
571 | + |
572 | + ns = &sp_stat.node[z->zone_pgdat->node_id]; |
573 | + idx = zone_idx(z); |
574 | + ns->lowfree[idx] = z->pages_high * 3; |
575 | + ns->highfree[idx] = ns->lowfree[idx] + z->pages_high; |
576 | + |
577 | + if (z->free_pages > ns->highfree[idx]) { |
578 | + /* |
579 | + * We've gotten above the high watermark of free pages |
580 | + * so we can start prefetching till we get to the low |
581 | + * watermark. |
582 | + */ |
583 | + ns->pointfree[idx] = &ns->lowfree[idx]; |
584 | + } |
585 | + } |
586 | +} |
587 | + |
588 | +/* |
589 | + * We want to be absolutely certain it's ok to start prefetching. |
590 | + */ |
591 | +static int prefetch_suitable(void) |
592 | +{ |
593 | + unsigned long limit; |
594 | + struct zone *z; |
595 | + int node, ret = 0, test_pagestate = 0; |
596 | + |
597 | + /* Purposefully racy */ |
598 | + if (test_bit(0, &swapped.busy)) { |
599 | + __clear_bit(0, &swapped.busy); |
600 | + goto out; |
601 | + } |
602 | + |
603 | + /* |
604 | + * get_page_state and above_background_load are expensive so we only |
605 | + * perform them every SWAP_CLUSTER_MAX prefetched_pages. |
606 | + * We test to see if we're above_background_load as disk activity |
607 | + * even at low priority can cause interrupt induced scheduling |
608 | + * latencies. |
609 | + */ |
610 | + if (!(sp_stat.prefetched_pages % SWAP_CLUSTER_MAX)) { |
611 | + if (above_background_load()) |
612 | + goto out; |
613 | + test_pagestate = 1; |
614 | + } |
615 | + |
616 | + clear_current_prefetch_free(); |
617 | + |
618 | + /* |
619 | + * Have some hysteresis between where page reclaiming and prefetching |
620 | + * will occur to prevent ping-ponging between them. |
621 | + */ |
622 | + for_each_zone(z) { |
623 | + struct node_stats *ns; |
624 | + unsigned long free; |
625 | + int idx; |
626 | + |
627 | + if (!populated_zone(z)) |
628 | + continue; |
629 | + |
630 | + node = z->zone_pgdat->node_id; |
631 | + ns = &sp_stat.node[node]; |
632 | + idx = zone_idx(z); |
633 | + |
634 | + free = z->free_pages; |
635 | + if (free < *ns->pointfree[idx]) { |
636 | + /* |
637 | + * Free pages have dropped below the low watermark so |
638 | + * we won't start prefetching again till we hit the |
639 | + * high watermark of free pages. |
640 | + */ |
641 | + ns->pointfree[idx] = &ns->highfree[idx]; |
642 | + node_clear(node, sp_stat.prefetch_nodes); |
643 | + continue; |
644 | + } |
645 | + ns->current_free += free; |
646 | + } |
647 | + |
648 | + /* |
649 | + * We iterate over each node testing to see if it is suitable for |
650 | + * prefetching and clear the nodemask if it is not. |
651 | + */ |
652 | + for_each_node_mask(node, sp_stat.prefetch_nodes) { |
653 | + struct node_stats *ns = &sp_stat.node[node]; |
654 | + |
655 | + /* |
656 | + * We check to see that pages are not being allocated |
657 | + * elsewhere at any significant rate implying any |
658 | + * degree of memory pressure (eg during file reads) |
659 | + */ |
660 | + if (ns->last_free) { |
661 | + if (ns->current_free + SWAP_CLUSTER_MAX < |
662 | + ns->last_free) { |
663 | + ns->last_free = ns->current_free; |
664 | + node_clear(node, |
665 | + sp_stat.prefetch_nodes); |
666 | + continue; |
667 | + } |
668 | + } else |
669 | + ns->last_free = ns->current_free; |
670 | + |
671 | + if (!test_pagestate) |
672 | + continue; |
673 | + |
674 | + /* We shouldn't prefetch when we are doing writeback */ |
675 | + if (node_page_state(node, NR_WRITEBACK)) { |
676 | + node_clear(node, sp_stat.prefetch_nodes); |
677 | + continue; |
678 | + } |
679 | + |
680 | + /* |
681 | + * >2/3 of the ram on this node is mapped, slab, swapcache or |
682 | + * dirty, we need to leave some free for pagecache. |
683 | + */ |
684 | + limit = node_page_state(node, NR_FILE_PAGES); |
685 | + limit += node_page_state(node, NR_SLAB_RECLAIMABLE); |
686 | + limit += node_page_state(node, NR_SLAB_UNRECLAIMABLE); |
687 | + limit += node_page_state(node, NR_FILE_DIRTY); |
688 | + limit += node_page_state(node, NR_UNSTABLE_NFS); |
689 | + limit += total_swapcache_pages; |
690 | + if (limit > ns->prefetch_watermark) { |
691 | + node_clear(node, sp_stat.prefetch_nodes); |
692 | + continue; |
693 | + } |
694 | + } |
695 | + |
696 | + if (nodes_empty(sp_stat.prefetch_nodes)) |
697 | + goto out; |
698 | + |
699 | + /* Survived all that? Hooray we can prefetch! */ |
700 | + ret = 1; |
701 | +out: |
702 | + return ret; |
703 | +} |
704 | + |
705 | +/* |
706 | + * Get previous swapped entry when iterating over all entries. swapped.lock |
707 | + * should be held and we should already ensure that entry exists. |
708 | + */ |
709 | +static inline struct swapped_entry *prev_swapped_entry |
710 | + (struct swapped_entry *entry) |
711 | +{ |
712 | + return list_entry(entry->swapped_list.prev->prev, |
713 | + struct swapped_entry, swapped_list); |
714 | +} |
715 | + |
716 | +/* |
717 | + * trickle_swap is the main function that initiates the swap prefetching. It |
718 | + * first checks to see if the busy flag is set, and does not prefetch if it |
719 | + * is, as the flag implied we are low on memory or swapping in currently. |
720 | + * Otherwise it runs until prefetch_suitable fails which occurs when the |
721 | + * vm is busy, we prefetch to the watermark, or the list is empty or we have |
722 | + * iterated over all entries |
723 | + */ |
724 | +static enum trickle_return trickle_swap(void) |
725 | +{ |
726 | + enum trickle_return ret = TRICKLE_DELAY; |
727 | + struct swapped_entry *entry; |
728 | + unsigned long flags; |
729 | + |
730 | + /* |
731 | + * If laptop_mode is enabled don't prefetch to avoid hard drives |
732 | + * doing unnecessary spin-ups |
733 | + */ |
734 | + if (!swap_prefetch || laptop_mode) |
735 | + return ret; |
736 | + |
737 | + examine_free_limits(); |
738 | + entry = NULL; |
739 | + |
740 | + for ( ; ; ) { |
741 | + swp_entry_t swp_entry; |
742 | + int node; |
743 | + |
744 | + if (!prefetch_suitable()) |
745 | + break; |
746 | + |
747 | + spin_lock_irqsave(&swapped.lock, flags); |
748 | + if (list_empty(&swapped.list)) { |
749 | + ret = TRICKLE_FAILED; |
750 | + spin_unlock_irqrestore(&swapped.lock, flags); |
751 | + break; |
752 | + } |
753 | + |
754 | + if (!entry) { |
755 | + /* |
756 | + * This sets the entry for the first iteration. It |
757 | + * also is a safeguard against the entry disappearing |
758 | + * while the lock is not held. |
759 | + */ |
760 | + entry = list_entry(swapped.list.prev, |
761 | + struct swapped_entry, swapped_list); |
762 | + } else if (entry->swapped_list.prev == swapped.list.next) { |
763 | + /* |
764 | + * If we have iterated over all entries and there are |
765 | + * still entries that weren't swapped out there may |
766 | + * be a reason we could not swap them back in so |
767 | + * delay attempting further prefetching. |
768 | + */ |
769 | + spin_unlock_irqrestore(&swapped.lock, flags); |
770 | + break; |
771 | + } |
772 | + |
773 | + node = get_swap_entry_node(entry); |
774 | + if (!node_isset(node, sp_stat.prefetch_nodes)) { |
775 | + /* |
776 | + * We found an entry that belongs to a node that is |
777 | + * not suitable for prefetching so skip it. |
778 | + */ |
779 | + entry = prev_swapped_entry(entry); |
780 | + spin_unlock_irqrestore(&swapped.lock, flags); |
781 | + continue; |
782 | + } |
783 | + swp_entry = entry->swp_entry; |
784 | + entry = prev_swapped_entry(entry); |
785 | + spin_unlock_irqrestore(&swapped.lock, flags); |
786 | + |
787 | + if (trickle_swap_cache_async(swp_entry, node) == TRICKLE_DELAY) |
788 | + break; |
789 | + } |
790 | + |
791 | + if (sp_stat.prefetched_pages) { |
792 | + lru_add_drain(); |
793 | + sp_stat.prefetched_pages = 0; |
794 | + } |
795 | + return ret; |
796 | +} |
797 | + |
798 | +static int kprefetchd(void *__unused) |
799 | +{ |
800 | + struct sched_param param = { .sched_priority = 0 }; |
801 | + |
802 | + sched_setscheduler(current, SCHED_BATCH, ¶m); |
803 | + set_user_nice(current, 19); |
804 | + /* Set ioprio to lowest if supported by i/o scheduler */ |
805 | + sys_ioprio_set(IOPRIO_WHO_PROCESS, 0, IOPRIO_CLASS_IDLE); |
806 | + |
807 | + /* kprefetchd has nothing to do until it is woken up the first time */ |
808 | + set_current_state(TASK_INTERRUPTIBLE); |
809 | + schedule(); |
810 | + |
811 | + do { |
812 | + try_to_freeze(); |
813 | + |
814 | + /* |
815 | + * TRICKLE_FAILED implies no entries left - we do not schedule |
816 | + * a wakeup, and further delay the next one. |
817 | + */ |
818 | + if (trickle_swap() == TRICKLE_FAILED) { |
819 | + set_current_state(TASK_INTERRUPTIBLE); |
820 | + schedule(); |
821 | + } |
822 | + clear_last_prefetch_free(); |
823 | + schedule_timeout_interruptible(PREFETCH_DELAY); |
824 | + } while (!kthread_should_stop()); |
825 | + |
826 | + return 0; |
827 | +} |
828 | + |
829 | +/* |
830 | + * Create kmem cache for swapped entries |
831 | + */ |
832 | +void __init prepare_swap_prefetch(void) |
833 | +{ |
834 | + struct zone *zone; |
835 | + |
836 | + swapped.cache = kmem_cache_create("swapped_entry", |
837 | + sizeof(struct swapped_entry), 0, SLAB_PANIC, NULL, NULL); |
838 | + |
839 | + /* |
840 | + * Set max number of entries to 2/3 the size of physical ram as we |
841 | + * only ever prefetch to consume 2/3 of the ram. |
842 | + */ |
843 | + swapped.maxcount = nr_free_pagecache_pages() / 3 * 2; |
844 | + |
845 | + for_each_zone(zone) { |
846 | + unsigned long present; |
847 | + struct node_stats *ns; |
848 | + int idx; |
849 | + |
850 | + present = zone->present_pages; |
851 | + if (!present) |
852 | + continue; |
853 | + |
854 | + ns = &sp_stat.node[zone->zone_pgdat->node_id]; |
855 | + ns->prefetch_watermark += present / 3 * 2; |
856 | + idx = zone_idx(zone); |
857 | + ns->pointfree[idx] = &ns->highfree[idx]; |
858 | + } |
859 | +} |
860 | + |
861 | +static int __init kprefetchd_init(void) |
862 | +{ |
863 | + kprefetchd_task = kthread_run(kprefetchd, NULL, "kprefetchd"); |
864 | + |
865 | + return 0; |
866 | +} |
867 | + |
868 | +static void __exit kprefetchd_exit(void) |
869 | +{ |
870 | + kthread_stop(kprefetchd_task); |
871 | +} |
872 | + |
873 | +module_init(kprefetchd_init); |
874 | +module_exit(kprefetchd_exit); |
875 | Index: linux-2.6.20-ck1/mm/swap_state.c |
876 | =================================================================== |
877 | --- linux-2.6.20-ck1.orig/mm/swap_state.c 2006-09-21 19:55:01.000000000 +1000 |
878 | +++ linux-2.6.20-ck1/mm/swap_state.c 2007-02-16 19:01:33.000000000 +1100 |
879 | @@ -10,6 +10,7 @@ |
880 | #include <linux/mm.h> |
881 | #include <linux/kernel_stat.h> |
882 | #include <linux/swap.h> |
883 | +#include <linux/swap-prefetch.h> |
884 | #include <linux/init.h> |
885 | #include <linux/pagemap.h> |
886 | #include <linux/buffer_head.h> |
887 | @@ -82,6 +83,7 @@ static int __add_to_swap_cache(struct pa |
888 | error = radix_tree_insert(&swapper_space.page_tree, |
889 | entry.val, page); |
890 | if (!error) { |
891 | + remove_from_swapped_list(entry.val); |
892 | page_cache_get(page); |
893 | SetPageLocked(page); |
894 | SetPageSwapCache(page); |
895 | @@ -95,11 +97,12 @@ static int __add_to_swap_cache(struct pa |
896 | return error; |
897 | } |
898 | |
899 | -static int add_to_swap_cache(struct page *page, swp_entry_t entry) |
900 | +int add_to_swap_cache(struct page *page, swp_entry_t entry) |
901 | { |
902 | int error; |
903 | |
904 | if (!swap_duplicate(entry)) { |
905 | + remove_from_swapped_list(entry.val); |
906 | INC_CACHE_INFO(noent_race); |
907 | return -ENOENT; |
908 | } |
909 | @@ -148,6 +151,9 @@ int add_to_swap(struct page * page, gfp_ |
910 | swp_entry_t entry; |
911 | int err; |
912 | |
913 | + /* Swap prefetching is delayed if we're swapping pages */ |
914 | + delay_swap_prefetch(); |
915 | + |
916 | BUG_ON(!PageLocked(page)); |
917 | |
918 | for (;;) { |
919 | @@ -320,6 +326,9 @@ struct page *read_swap_cache_async(swp_e |
920 | struct page *found_page, *new_page = NULL; |
921 | int err; |
922 | |
923 | + /* Swap prefetching is delayed if we're already reading from swap */ |
924 | + delay_swap_prefetch(); |
925 | + |
926 | do { |
927 | /* |
928 | * First check the swap cache. Since this is normally |
929 | Index: linux-2.6.20-ck1/mm/vmscan.c |
930 | =================================================================== |
931 | --- linux-2.6.20-ck1.orig/mm/vmscan.c 2007-02-05 22:52:04.000000000 +1100 |
932 | +++ linux-2.6.20-ck1/mm/vmscan.c 2007-02-16 19:01:33.000000000 +1100 |
933 | @@ -16,6 +16,7 @@ |
934 | #include <linux/slab.h> |
935 | #include <linux/kernel_stat.h> |
936 | #include <linux/swap.h> |
937 | +#include <linux/swap-prefetch.h> |
938 | #include <linux/pagemap.h> |
939 | #include <linux/init.h> |
940 | #include <linux/highmem.h> |
941 | @@ -424,6 +425,7 @@ int remove_mapping(struct address_space |
942 | |
943 | if (PageSwapCache(page)) { |
944 | swp_entry_t swap = { .val = page_private(page) }; |
945 | + add_to_swapped_list(page); |
946 | __delete_from_swap_cache(page); |
947 | write_unlock_irq(&mapping->tree_lock); |
948 | swap_free(swap); |
949 | @@ -1029,6 +1031,8 @@ unsigned long try_to_free_pages(struct z |
950 | .swappiness = vm_swappiness, |
951 | }; |
952 | |
953 | + delay_swap_prefetch(); |
954 | + |
955 | count_vm_event(ALLOCSTALL); |
956 | |
957 | for (i = 0; zones[i] != NULL; i++) { |
958 | @@ -1375,6 +1379,8 @@ static unsigned long shrink_all_zones(un |
959 | struct zone *zone; |
960 | unsigned long nr_to_scan, ret = 0; |
961 | |
962 | + delay_swap_prefetch(); |
963 | + |
964 | for_each_zone(zone) { |
965 | |
966 | if (!populated_zone(zone)) |
967 | Index: linux-2.6.20-ck1/include/linux/mm_inline.h |
968 | =================================================================== |
969 | --- linux-2.6.20-ck1.orig/include/linux/mm_inline.h 2006-06-18 15:32:49.000000000 +1000 |
970 | +++ linux-2.6.20-ck1/include/linux/mm_inline.h 2007-02-16 19:01:33.000000000 +1100 |
971 | @@ -14,6 +14,13 @@ add_page_to_inactive_list(struct zone *z |
972 | } |
973 | |
974 | static inline void |
975 | +add_page_to_inactive_list_tail(struct zone *zone, struct page *page) |
976 | +{ |
977 | + list_add_tail(&page->lru, &zone->inactive_list); |
978 | + zone->nr_inactive++; |
979 | +} |
980 | + |
981 | +static inline void |
982 | del_page_from_active_list(struct zone *zone, struct page *page) |
983 | { |
984 | list_del(&page->lru); |
985 | Index: linux-2.6.20-ck1/include/linux/swap-prefetch.h |
986 | =================================================================== |
987 | --- /dev/null 1970-01-01 00:00:00.000000000 +0000 |
988 | +++ linux-2.6.20-ck1/include/linux/swap-prefetch.h 2007-02-16 19:01:33.000000000 +1100 |
989 | @@ -0,0 +1,55 @@ |
990 | +#ifndef SWAP_PREFETCH_H_INCLUDED |
991 | +#define SWAP_PREFETCH_H_INCLUDED |
992 | + |
993 | +#ifdef CONFIG_SWAP_PREFETCH |
994 | +/* mm/swap_prefetch.c */ |
995 | +extern int swap_prefetch; |
996 | +struct swapped_entry { |
997 | + swp_entry_t swp_entry; /* The actual swap entry */ |
998 | + struct list_head swapped_list; /* Linked list of entries */ |
999 | +#if MAX_NUMNODES > 1 |
1000 | + int node; /* Node id */ |
1001 | +#endif |
1002 | +} __attribute__((packed)); |
1003 | + |
1004 | +static inline void store_swap_entry_node(struct swapped_entry *entry, |
1005 | + struct page *page) |
1006 | +{ |
1007 | +#if MAX_NUMNODES > 1 |
1008 | + entry->node = page_to_nid(page); |
1009 | +#endif |
1010 | +} |
1011 | + |
1012 | +static inline int get_swap_entry_node(struct swapped_entry *entry) |
1013 | +{ |
1014 | +#if MAX_NUMNODES > 1 |
1015 | + return entry->node; |
1016 | +#else |
1017 | + return 0; |
1018 | +#endif |
1019 | +} |
1020 | + |
1021 | +extern void add_to_swapped_list(struct page *page); |
1022 | +extern void remove_from_swapped_list(const unsigned long index); |
1023 | +extern void delay_swap_prefetch(void); |
1024 | +extern void prepare_swap_prefetch(void); |
1025 | + |
1026 | +#else /* CONFIG_SWAP_PREFETCH */ |
1027 | +static inline void add_to_swapped_list(struct page *__unused) |
1028 | +{ |
1029 | +} |
1030 | + |
1031 | +static inline void prepare_swap_prefetch(void) |
1032 | +{ |
1033 | +} |
1034 | + |
1035 | +static inline void remove_from_swapped_list(const unsigned long __unused) |
1036 | +{ |
1037 | +} |
1038 | + |
1039 | +static inline void delay_swap_prefetch(void) |
1040 | +{ |
1041 | +} |
1042 | +#endif /* CONFIG_SWAP_PREFETCH */ |
1043 | + |
1044 | +#endif /* SWAP_PREFETCH_H_INCLUDED */ |