Annotation of /trunk/kernel26-alx/patches-2.6.21-r14/0024-2.6.21-mm-swap-prefetch-35-38.patch
Parent Directory | Revision Log
Revision 447 -
(hide annotations)
(download)
Tue Jan 22 17:55:52 2008 UTC (16 years, 8 months ago) by niro
File size: 12448 byte(s)
Tue Jan 22 17:55:52 2008 UTC (16 years, 8 months ago) by niro
File size: 12448 byte(s)
-2.6.21-alx-r14 - fixed some natsemi errors on wys terminals
1 | niro | 447 | --- |
2 | Documentation/sysctl/vm.txt | 5 - | ||
3 | mm/page_io.c | 2 | ||
4 | mm/swap_prefetch.c | 167 +++++++++++++++++++------------------------- | ||
5 | mm/swap_state.c | 2 | ||
6 | mm/vmscan.c | 1 | ||
7 | 5 files changed, 79 insertions(+), 98 deletions(-) | ||
8 | |||
9 | Index: linux-2.6.21-ck2/mm/page_io.c | ||
10 | =================================================================== | ||
11 | --- linux-2.6.21-ck2.orig/mm/page_io.c 2007-05-14 19:49:18.000000000 +1000 | ||
12 | +++ linux-2.6.21-ck2/mm/page_io.c 2007-05-14 19:49:57.000000000 +1000 | ||
13 | @@ -17,6 +17,7 @@ | ||
14 | #include <linux/bio.h> | ||
15 | #include <linux/swapops.h> | ||
16 | #include <linux/writeback.h> | ||
17 | +#include <linux/swap-prefetch.h> | ||
18 | #include <asm/pgtable.h> | ||
19 | |||
20 | static struct bio *get_swap_bio(gfp_t gfp_flags, pgoff_t index, | ||
21 | @@ -118,6 +119,7 @@ int swap_writepage(struct page *page, st | ||
22 | ret = -ENOMEM; | ||
23 | goto out; | ||
24 | } | ||
25 | + add_to_swapped_list(page); | ||
26 | if (wbc->sync_mode == WB_SYNC_ALL) | ||
27 | rw |= (1 << BIO_RW_SYNC); | ||
28 | count_vm_event(PSWPOUT); | ||
29 | Index: linux-2.6.21-ck2/mm/swap_state.c | ||
30 | =================================================================== | ||
31 | --- linux-2.6.21-ck2.orig/mm/swap_state.c 2007-05-14 19:49:55.000000000 +1000 | ||
32 | +++ linux-2.6.21-ck2/mm/swap_state.c 2007-05-14 19:49:57.000000000 +1000 | ||
33 | @@ -83,7 +83,6 @@ static int __add_to_swap_cache(struct pa | ||
34 | error = radix_tree_insert(&swapper_space.page_tree, | ||
35 | entry.val, page); | ||
36 | if (!error) { | ||
37 | - remove_from_swapped_list(entry.val); | ||
38 | page_cache_get(page); | ||
39 | SetPageLocked(page); | ||
40 | SetPageSwapCache(page); | ||
41 | @@ -102,7 +101,6 @@ int add_to_swap_cache(struct page *page, | ||
42 | int error; | ||
43 | |||
44 | if (!swap_duplicate(entry)) { | ||
45 | - remove_from_swapped_list(entry.val); | ||
46 | INC_CACHE_INFO(noent_race); | ||
47 | return -ENOENT; | ||
48 | } | ||
49 | Index: linux-2.6.21-ck2/mm/vmscan.c | ||
50 | =================================================================== | ||
51 | --- linux-2.6.21-ck2.orig/mm/vmscan.c 2007-05-14 19:49:56.000000000 +1000 | ||
52 | +++ linux-2.6.21-ck2/mm/vmscan.c 2007-05-14 19:49:57.000000000 +1000 | ||
53 | @@ -427,7 +427,6 @@ int remove_mapping(struct address_space | ||
54 | |||
55 | if (PageSwapCache(page)) { | ||
56 | swp_entry_t swap = { .val = page_private(page) }; | ||
57 | - add_to_swapped_list(page); | ||
58 | __delete_from_swap_cache(page); | ||
59 | write_unlock_irq(&mapping->tree_lock); | ||
60 | swap_free(swap); | ||
61 | Index: linux-2.6.21-ck2/mm/swap_prefetch.c | ||
62 | =================================================================== | ||
63 | --- linux-2.6.21-ck2.orig/mm/swap_prefetch.c 2007-05-14 19:49:55.000000000 +1000 | ||
64 | +++ linux-2.6.21-ck2/mm/swap_prefetch.c 2007-05-14 19:49:57.000000000 +1000 | ||
65 | @@ -27,7 +27,8 @@ | ||
66 | * needs to be at least this duration of idle time meaning in practice it can | ||
67 | * be much longer | ||
68 | */ | ||
69 | -#define PREFETCH_DELAY (HZ * 5) | ||
70 | +#define PREFETCH_DELAY (HZ * 5) | ||
71 | +#define DISABLED_PREFETCH_DELAY (HZ * 60) | ||
72 | |||
73 | /* sysctl - enable/disable swap prefetching */ | ||
74 | int swap_prefetch __read_mostly = 1; | ||
75 | @@ -61,19 +62,30 @@ inline void delay_swap_prefetch(void) | ||
76 | } | ||
77 | |||
78 | /* | ||
79 | + * If laptop_mode is enabled don't prefetch to avoid hard drives | ||
80 | + * doing unnecessary spin-ups unless swap_prefetch is explicitly | ||
81 | + * set to a higher value. | ||
82 | + */ | ||
83 | +static inline int prefetch_enabled(void) | ||
84 | +{ | ||
85 | + if (swap_prefetch <= laptop_mode) | ||
86 | + return 0; | ||
87 | + return 1; | ||
88 | +} | ||
89 | + | ||
90 | +static int wakeup_kprefetchd; | ||
91 | + | ||
92 | +/* | ||
93 | * Drop behind accounting which keeps a list of the most recently used swap | ||
94 | - * entries. | ||
95 | + * entries. Entries are removed lazily by kprefetchd. | ||
96 | */ | ||
97 | void add_to_swapped_list(struct page *page) | ||
98 | { | ||
99 | struct swapped_entry *entry; | ||
100 | unsigned long index, flags; | ||
101 | - int wakeup; | ||
102 | - | ||
103 | - if (!swap_prefetch) | ||
104 | - return; | ||
105 | |||
106 | - wakeup = 0; | ||
107 | + if (!prefetch_enabled()) | ||
108 | + goto out; | ||
109 | |||
110 | spin_lock_irqsave(&swapped.lock, flags); | ||
111 | if (swapped.count >= swapped.maxcount) { | ||
112 | @@ -103,23 +115,16 @@ void add_to_swapped_list(struct page *pa | ||
113 | store_swap_entry_node(entry, page); | ||
114 | |||
115 | if (likely(!radix_tree_insert(&swapped.swap_tree, index, entry))) { | ||
116 | - /* | ||
117 | - * If this is the first entry, kprefetchd needs to be | ||
118 | - * (re)started. | ||
119 | - */ | ||
120 | - if (!swapped.count) | ||
121 | - wakeup = 1; | ||
122 | list_add(&entry->swapped_list, &swapped.list); | ||
123 | swapped.count++; | ||
124 | - } | ||
125 | + } else | ||
126 | + kmem_cache_free(swapped.cache, entry); | ||
127 | |||
128 | out_locked: | ||
129 | spin_unlock_irqrestore(&swapped.lock, flags); | ||
130 | - | ||
131 | - /* Do the wakeup outside the lock to shorten lock hold time. */ | ||
132 | - if (wakeup) | ||
133 | +out: | ||
134 | + if (wakeup_kprefetchd) | ||
135 | wake_up_process(kprefetchd_task); | ||
136 | - | ||
137 | return; | ||
138 | } | ||
139 | |||
140 | @@ -139,7 +144,7 @@ void remove_from_swapped_list(const unsi | ||
141 | spin_lock_irqsave(&swapped.lock, flags); | ||
142 | entry = radix_tree_delete(&swapped.swap_tree, index); | ||
143 | if (likely(entry)) { | ||
144 | - list_del_init(&entry->swapped_list); | ||
145 | + list_del(&entry->swapped_list); | ||
146 | swapped.count--; | ||
147 | kmem_cache_free(swapped.cache, entry); | ||
148 | } | ||
149 | @@ -153,18 +158,18 @@ enum trickle_return { | ||
150 | }; | ||
151 | |||
152 | struct node_stats { | ||
153 | - unsigned long last_free; | ||
154 | /* Free ram after a cycle of prefetching */ | ||
155 | - unsigned long current_free; | ||
156 | + unsigned long last_free; | ||
157 | /* Free ram on this cycle of checking prefetch_suitable */ | ||
158 | - unsigned long prefetch_watermark; | ||
159 | + unsigned long current_free; | ||
160 | /* Maximum amount we will prefetch to */ | ||
161 | - unsigned long highfree[MAX_NR_ZONES]; | ||
162 | + unsigned long prefetch_watermark; | ||
163 | /* The amount of free ram before we start prefetching */ | ||
164 | - unsigned long lowfree[MAX_NR_ZONES]; | ||
165 | + unsigned long highfree[MAX_NR_ZONES]; | ||
166 | /* The amount of free ram where we will stop prefetching */ | ||
167 | - unsigned long *pointfree[MAX_NR_ZONES]; | ||
168 | + unsigned long lowfree[MAX_NR_ZONES]; | ||
169 | /* highfree or lowfree depending on whether we've hit a watermark */ | ||
170 | + unsigned long *pointfree[MAX_NR_ZONES]; | ||
171 | }; | ||
172 | |||
173 | /* | ||
174 | @@ -172,10 +177,10 @@ struct node_stats { | ||
175 | * determine if a node is suitable for prefetching into. | ||
176 | */ | ||
177 | struct prefetch_stats { | ||
178 | - nodemask_t prefetch_nodes; | ||
179 | /* Which nodes are currently suited to prefetching */ | ||
180 | - unsigned long prefetched_pages; | ||
181 | + nodemask_t prefetch_nodes; | ||
182 | /* Total pages we've prefetched on this wakeup of kprefetchd */ | ||
183 | + unsigned long prefetched_pages; | ||
184 | struct node_stats node[MAX_NUMNODES]; | ||
185 | }; | ||
186 | |||
187 | @@ -189,16 +194,15 @@ static enum trickle_return trickle_swap_ | ||
188 | const int node) | ||
189 | { | ||
190 | enum trickle_return ret = TRICKLE_FAILED; | ||
191 | + unsigned long flags; | ||
192 | struct page *page; | ||
193 | |||
194 | - read_lock_irq(&swapper_space.tree_lock); | ||
195 | + read_lock_irqsave(&swapper_space.tree_lock, flags); | ||
196 | /* Entry may already exist */ | ||
197 | page = radix_tree_lookup(&swapper_space.page_tree, entry.val); | ||
198 | - read_unlock_irq(&swapper_space.tree_lock); | ||
199 | - if (page) { | ||
200 | - remove_from_swapped_list(entry.val); | ||
201 | + read_unlock_irqrestore(&swapper_space.tree_lock, flags); | ||
202 | + if (page) | ||
203 | goto out; | ||
204 | - } | ||
205 | |||
206 | /* | ||
207 | * Get a new page to read from swap. We have already checked the | ||
208 | @@ -217,10 +221,8 @@ static enum trickle_return trickle_swap_ | ||
209 | |||
210 | /* Add them to the tail of the inactive list to preserve LRU order */ | ||
211 | lru_cache_add_tail(page); | ||
212 | - if (unlikely(swap_readpage(NULL, page))) { | ||
213 | - ret = TRICKLE_DELAY; | ||
214 | + if (unlikely(swap_readpage(NULL, page))) | ||
215 | goto out_release; | ||
216 | - } | ||
217 | |||
218 | sp_stat.prefetched_pages++; | ||
219 | sp_stat.node[node].last_free--; | ||
220 | @@ -229,6 +231,12 @@ static enum trickle_return trickle_swap_ | ||
221 | out_release: | ||
222 | page_cache_release(page); | ||
223 | out: | ||
224 | + /* | ||
225 | + * All entries are removed here lazily. This avoids the cost of | ||
226 | + * remove_from_swapped_list during normal swapin. Thus there are | ||
227 | + * usually many stale entries. | ||
228 | + */ | ||
229 | + remove_from_swapped_list(entry.val); | ||
230 | return ret; | ||
231 | } | ||
232 | |||
233 | @@ -410,17 +418,6 @@ out: | ||
234 | } | ||
235 | |||
236 | /* | ||
237 | - * Get previous swapped entry when iterating over all entries. swapped.lock | ||
238 | - * should be held and we should already ensure that entry exists. | ||
239 | - */ | ||
240 | -static inline struct swapped_entry *prev_swapped_entry | ||
241 | - (struct swapped_entry *entry) | ||
242 | -{ | ||
243 | - return list_entry(entry->swapped_list.prev->prev, | ||
244 | - struct swapped_entry, swapped_list); | ||
245 | -} | ||
246 | - | ||
247 | -/* | ||
248 | * trickle_swap is the main function that initiates the swap prefetching. It | ||
249 | * first checks to see if the busy flag is set, and does not prefetch if it | ||
250 | * is, as the flag implied we are low on memory or swapping in currently. | ||
251 | @@ -431,70 +428,49 @@ static inline struct swapped_entry *prev | ||
252 | static enum trickle_return trickle_swap(void) | ||
253 | { | ||
254 | enum trickle_return ret = TRICKLE_DELAY; | ||
255 | - struct swapped_entry *entry; | ||
256 | + struct swapped_entry *pos, *n; | ||
257 | unsigned long flags; | ||
258 | |||
259 | - /* | ||
260 | - * If laptop_mode is enabled don't prefetch to avoid hard drives | ||
261 | - * doing unnecessary spin-ups | ||
262 | - */ | ||
263 | - if (!swap_prefetch || laptop_mode) | ||
264 | + if (!prefetch_enabled()) | ||
265 | return ret; | ||
266 | |||
267 | examine_free_limits(); | ||
268 | - entry = NULL; | ||
269 | + if (!prefetch_suitable()) | ||
270 | + return ret; | ||
271 | + if (list_empty(&swapped.list)) | ||
272 | + return TRICKLE_FAILED; | ||
273 | |||
274 | - for ( ; ; ) { | ||
275 | + spin_lock_irqsave(&swapped.lock, flags); | ||
276 | + list_for_each_entry_safe_reverse(pos, n, &swapped.list, swapped_list) { | ||
277 | swp_entry_t swp_entry; | ||
278 | int node; | ||
279 | |||
280 | - if (!prefetch_suitable()) | ||
281 | - break; | ||
282 | + spin_unlock_irqrestore(&swapped.lock, flags); | ||
283 | + /* Yield to anything else running */ | ||
284 | + if (cond_resched() || !prefetch_suitable()) | ||
285 | + goto out_unlocked; | ||
286 | |||
287 | spin_lock_irqsave(&swapped.lock, flags); | ||
288 | - if (list_empty(&swapped.list)) { | ||
289 | - ret = TRICKLE_FAILED; | ||
290 | - spin_unlock_irqrestore(&swapped.lock, flags); | ||
291 | - break; | ||
292 | - } | ||
293 | - | ||
294 | - if (!entry) { | ||
295 | - /* | ||
296 | - * This sets the entry for the first iteration. It | ||
297 | - * also is a safeguard against the entry disappearing | ||
298 | - * while the lock is not held. | ||
299 | - */ | ||
300 | - entry = list_entry(swapped.list.prev, | ||
301 | - struct swapped_entry, swapped_list); | ||
302 | - } else if (entry->swapped_list.prev == swapped.list.next) { | ||
303 | - /* | ||
304 | - * If we have iterated over all entries and there are | ||
305 | - * still entries that weren't swapped out there may | ||
306 | - * be a reason we could not swap them back in so | ||
307 | - * delay attempting further prefetching. | ||
308 | - */ | ||
309 | - spin_unlock_irqrestore(&swapped.lock, flags); | ||
310 | - break; | ||
311 | - } | ||
312 | - | ||
313 | - node = get_swap_entry_node(entry); | ||
314 | + if (unlikely(!pos)) | ||
315 | + continue; | ||
316 | + node = get_swap_entry_node(pos); | ||
317 | if (!node_isset(node, sp_stat.prefetch_nodes)) { | ||
318 | /* | ||
319 | * We found an entry that belongs to a node that is | ||
320 | * not suitable for prefetching so skip it. | ||
321 | */ | ||
322 | - entry = prev_swapped_entry(entry); | ||
323 | - spin_unlock_irqrestore(&swapped.lock, flags); | ||
324 | continue; | ||
325 | } | ||
326 | - swp_entry = entry->swp_entry; | ||
327 | - entry = prev_swapped_entry(entry); | ||
328 | + swp_entry = pos->swp_entry; | ||
329 | spin_unlock_irqrestore(&swapped.lock, flags); | ||
330 | |||
331 | if (trickle_swap_cache_async(swp_entry, node) == TRICKLE_DELAY) | ||
332 | - break; | ||
333 | + goto out_unlocked; | ||
334 | + spin_lock_irqsave(&swapped.lock, flags); | ||
335 | } | ||
336 | + spin_unlock_irqrestore(&swapped.lock, flags); | ||
337 | |||
338 | +out_unlocked: | ||
339 | if (sp_stat.prefetched_pages) { | ||
340 | lru_add_drain(); | ||
341 | sp_stat.prefetched_pages = 0; | ||
342 | @@ -509,13 +485,14 @@ static int kprefetchd(void *__unused) | ||
343 | sched_setscheduler(current, SCHED_BATCH, ¶m); | ||
344 | set_user_nice(current, 19); | ||
345 | /* Set ioprio to lowest if supported by i/o scheduler */ | ||
346 | - sys_ioprio_set(IOPRIO_WHO_PROCESS, 0, IOPRIO_CLASS_IDLE); | ||
347 | + sys_ioprio_set(IOPRIO_WHO_PROCESS, IOPRIO_BE_NR - 1, IOPRIO_CLASS_BE); | ||
348 | |||
349 | /* kprefetchd has nothing to do until it is woken up the first time */ | ||
350 | + wakeup_kprefetchd = 1; | ||
351 | set_current_state(TASK_INTERRUPTIBLE); | ||
352 | schedule(); | ||
353 | |||
354 | - do { | ||
355 | + while (!kthread_should_stop()) { | ||
356 | try_to_freeze(); | ||
357 | |||
358 | /* | ||
359 | @@ -523,13 +500,17 @@ static int kprefetchd(void *__unused) | ||
360 | * a wakeup, and further delay the next one. | ||
361 | */ | ||
362 | if (trickle_swap() == TRICKLE_FAILED) { | ||
363 | + wakeup_kprefetchd = 1; | ||
364 | set_current_state(TASK_INTERRUPTIBLE); | ||
365 | schedule(); | ||
366 | - } | ||
367 | + } else | ||
368 | + wakeup_kprefetchd = 0; | ||
369 | clear_last_prefetch_free(); | ||
370 | - schedule_timeout_interruptible(PREFETCH_DELAY); | ||
371 | - } while (!kthread_should_stop()); | ||
372 | - | ||
373 | + if (!prefetch_enabled()) | ||
374 | + schedule_timeout_interruptible(DISABLED_PREFETCH_DELAY); | ||
375 | + else | ||
376 | + schedule_timeout_interruptible(PREFETCH_DELAY); | ||
377 | + } | ||
378 | return 0; | ||
379 | } | ||
380 | |||
381 | Index: linux-2.6.21-ck2/Documentation/sysctl/vm.txt | ||
382 | =================================================================== | ||
383 | --- linux-2.6.21-ck2.orig/Documentation/sysctl/vm.txt 2007-05-14 19:49:56.000000000 +1000 | ||
384 | +++ linux-2.6.21-ck2/Documentation/sysctl/vm.txt 2007-05-14 19:49:57.000000000 +1000 | ||
385 | @@ -236,7 +236,8 @@ swap_prefetch | ||
386 | This enables or disables the swap prefetching feature. When the virtual | ||
387 | memory subsystem has been extremely idle for at least 5 seconds it will start | ||
388 | copying back pages from swap into the swapcache and keep a copy in swap. In | ||
389 | -practice it can take many minutes before the vm is idle enough. | ||
390 | +practice it can take many minutes before the vm is idle enough. A value of 0 | ||
391 | +disables swap prefetching, 1 enables it unless laptop_mode is enabled, and 2 | ||
392 | +enables it even in the presence of laptop_mode. | ||
393 | |||
394 | The default value is 1. | ||
395 | - |