Contents of /alx-src/tags/kernel26-2.6.12-alx-r9/mm/page-writeback.c
Revision 630
Wed Mar 4 11:03:09 2009 UTC (15 years, 3 months ago) by niro
File MIME type: text/plain
File size: 23253 byte(s)
Tag kernel26-2.6.12-alx-r9

/*
 * mm/page-writeback.c.
 *
 * Copyright (C) 2002, Linus Torvalds.
 *
 * Contains functions related to writing back dirty pages at the
 * address_space level.
 *
 * 10Apr2002	akpm@zip.com.au
 *		Initial version
 */

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/spinlock.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/slab.h>
#include <linux/pagemap.h>
#include <linux/writeback.h>
#include <linux/init.h>
#include <linux/backing-dev.h>
#include <linux/blkdev.h>
#include <linux/mpage.h>
#include <linux/percpu.h>
#include <linux/notifier.h>
#include <linux/smp.h>
#include <linux/sysctl.h>
#include <linux/cpu.h>
#include <linux/syscalls.h>

/*
 * The maximum number of pages to writeout in a single bdflush/kupdate
 * operation.  We do this so we don't hold I_LOCK against an inode for
 * enormous amounts of time, which would block a userspace task which has
 * been forced to throttle against that inode.  Also, the code reevaluates
 * the dirty limits each time it has written this many pages.
 */
#define MAX_WRITEBACK_PAGES	1024

/*
 * After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited
 * will look to see if it needs to force writeback or throttling.
 */
static long ratelimit_pages = 32;

static long total_pages;	/* The total number of pages in the machine. */
static int dirty_exceeded;	/* Dirty mem may be over limit */

/*
 * When balance_dirty_pages decides that the caller needs to perform some
 * non-background writeback, this is how many pages it will attempt to write.
 * It should be somewhat larger than RATELIMIT_PAGES to ensure that reasonably
 * large amounts of I/O are submitted.
 */
static inline long sync_writeback_pages(void)
{
        return ratelimit_pages + ratelimit_pages / 2;
}
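
/*
 * Worked example (illustrative figures, not taken from this file): with
 * the 4MB cap that set_ratelimit() below applies on a 4KB-page machine,
 * ratelimit_pages is 1024, so sync_writeback_pages() returns
 * 1024 + 512 = 1536 pages - the "six megabyte chunks" mentioned in the
 * comment above set_ratelimit().
 */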

/* The following parameters are exported via /proc/sys/vm */

/*
 * Start background writeback (via pdflush) at this percentage
 */
int dirty_background_ratio = 10;

/*
 * The generator of dirty data starts writeback at this percentage
 */
int vm_dirty_ratio = 40;

/*
 * The interval between `kupdate'-style writebacks, in centiseconds
 * (hundredths of a second)
 */
int dirty_writeback_centisecs = 5 * 100;

/*
 * The longest number of centiseconds for which data is allowed to remain dirty
 */
int dirty_expire_centisecs = 30 * 100;

/*
 * Flag that makes the machine dump writes/reads and block dirtyings.
 */
int block_dump;

/*
 * Flag that puts the machine in "laptop mode".
 */
int laptop_mode;

EXPORT_SYMBOL(laptop_mode);

/* End of sysctl-exported parameters */


static void background_writeout(unsigned long _min_pages);

struct writeback_state
{
        unsigned long nr_dirty;
        unsigned long nr_unstable;
        unsigned long nr_mapped;
        unsigned long nr_writeback;
};

static void get_writeback_state(struct writeback_state *wbs)
{
        wbs->nr_dirty = read_page_state(nr_dirty);
        wbs->nr_unstable = read_page_state(nr_unstable);
        wbs->nr_mapped = read_page_state(nr_mapped);
        wbs->nr_writeback = read_page_state(nr_writeback);
}

/*
 * Work out the current dirty-memory clamping and background writeout
 * thresholds.
 *
 * The main aim here is to lower them aggressively if there is a lot of mapped
 * memory around, to avoid stressing page reclaim with lots of unreclaimable
 * pages.  It is better to clamp down on writers than to start swapping and
 * performing lots of scanning.
 *
 * We only allow 1/2 of the currently-unmapped memory to be dirtied.
 *
 * We don't permit the clamping level to fall below 5% - that is getting rather
 * excessive.
 *
 * We make sure that the background writeout level is below the adjusted
 * clamping level.
 */
static void
get_dirty_limits(struct writeback_state *wbs, long *pbackground, long *pdirty,
                struct address_space *mapping)
{
        int background_ratio;           /* Percentages */
        int dirty_ratio;
        int unmapped_ratio;
        long background;
        long dirty;
        unsigned long available_memory = total_pages;
        struct task_struct *tsk;

        get_writeback_state(wbs);

#ifdef CONFIG_HIGHMEM
        /*
         * If this mapping can only allocate from low memory,
         * we exclude high memory from our count.
         */
        if (mapping && !(mapping_gfp_mask(mapping) & __GFP_HIGHMEM))
                available_memory -= totalhigh_pages;
#endif

        unmapped_ratio = 100 - (wbs->nr_mapped * 100) / total_pages;

        dirty_ratio = vm_dirty_ratio;
        if (dirty_ratio > unmapped_ratio / 2)
                dirty_ratio = unmapped_ratio / 2;

        if (dirty_ratio < 5)
                dirty_ratio = 5;

        background_ratio = dirty_background_ratio;
        if (background_ratio >= dirty_ratio)
                background_ratio = dirty_ratio / 2;

        background = (background_ratio * available_memory) / 100;
        dirty = (dirty_ratio * available_memory) / 100;
        tsk = current;
        if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk)) {
                background += background / 4;
                dirty += dirty / 4;
        }
        *pbackground = background;
        *pdirty = dirty;
}
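
/*
 * Worked example (hypothetical figures): with total_pages = 100000 and
 * nr_mapped = 60000, unmapped_ratio is 100 - 60 = 40, so the default
 * vm_dirty_ratio of 40 is clamped to 40 / 2 = 20 while the default
 * dirty_background_ratio of 10 is left alone.  That yields
 * *pdirty = 20000 pages and *pbackground = 10000 pages; a
 * PF_LESS_THROTTLE or real-time caller gets a further 25% on top.
 */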

/*
 * balance_dirty_pages() must be called by processes which are generating dirty
 * data.  It looks at the number of dirty pages in the machine and will force
 * the caller to perform writeback if the system is over `vm_dirty_ratio'.
 * If we're over `background_thresh' then pdflush is woken to perform some
 * writeout.
 */
static void balance_dirty_pages(struct address_space *mapping)
{
        struct writeback_state wbs;
        long nr_reclaimable;
        long background_thresh;
        long dirty_thresh;
        unsigned long pages_written = 0;
        unsigned long write_chunk = sync_writeback_pages();

        struct backing_dev_info *bdi = mapping->backing_dev_info;

        for (;;) {
                struct writeback_control wbc = {
                        .bdi            = bdi,
                        .sync_mode      = WB_SYNC_NONE,
                        .older_than_this = NULL,
                        .nr_to_write    = write_chunk,
                };

                get_dirty_limits(&wbs, &background_thresh,
                                        &dirty_thresh, mapping);
                nr_reclaimable = wbs.nr_dirty + wbs.nr_unstable;
                if (nr_reclaimable + wbs.nr_writeback <= dirty_thresh)
                        break;

                dirty_exceeded = 1;

                /* Note: nr_reclaimable denotes nr_dirty + nr_unstable.
                 * Unstable writes are a feature of certain networked
                 * filesystems (e.g. NFS) in which data may have been
                 * written to the server's write cache, but has not yet
                 * been flushed to permanent storage.
                 */
                if (nr_reclaimable) {
                        writeback_inodes(&wbc);
                        get_dirty_limits(&wbs, &background_thresh,
                                        &dirty_thresh, mapping);
                        nr_reclaimable = wbs.nr_dirty + wbs.nr_unstable;
                        if (nr_reclaimable + wbs.nr_writeback <= dirty_thresh)
                                break;
                        pages_written += write_chunk - wbc.nr_to_write;
                        if (pages_written >= write_chunk)
                                break;          /* We've done our duty */
                }
                blk_congestion_wait(WRITE, HZ/10);
        }

        if (nr_reclaimable + wbs.nr_writeback <= dirty_thresh)
                dirty_exceeded = 0;

        if (writeback_in_progress(bdi))
                return;         /* pdflush is already working this queue */

        /*
         * In laptop mode, we wait until hitting the higher threshold before
         * starting background writeout, and then write out all the way down
         * to the lower threshold.  So slow writers cause minimal disk activity.
         *
         * In normal mode, we start background writeout at the lower
         * background_thresh, to keep the amount of dirty memory low.
         */
        if ((laptop_mode && pages_written) ||
             (!laptop_mode && (nr_reclaimable > background_thresh)))
                pdflush_operation(background_writeout, 0);
}

/**
 * balance_dirty_pages_ratelimited - balance dirty memory state
 * @mapping: address_space which was dirtied
 *
 * Processes which are dirtying memory should call in here once for each page
 * which was newly dirtied.  The function will periodically check the system's
 * dirty state and will initiate writeback if needed.
 *
 * On really big machines, get_writeback_state is expensive, so try to avoid
 * calling it too often (ratelimiting).  But once we're over the dirty memory
 * limit we decrease the ratelimiting by a lot, to prevent individual processes
 * from overshooting the limit by (ratelimit_pages) each.
 */
void balance_dirty_pages_ratelimited(struct address_space *mapping)
{
        static DEFINE_PER_CPU(int, ratelimits) = 0;
        long ratelimit;

        ratelimit = ratelimit_pages;
        if (dirty_exceeded)
                ratelimit = 8;

        /*
         * Check the rate limiting.  Also, we do not want to throttle real-time
         * tasks in balance_dirty_pages().  Period.
         */
        if (get_cpu_var(ratelimits)++ >= ratelimit) {
                __get_cpu_var(ratelimits) = 0;
                put_cpu_var(ratelimits);
                balance_dirty_pages(mapping);
                return;
        }
        put_cpu_var(ratelimits);
}
EXPORT_SYMBOL(balance_dirty_pages_ratelimited);
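
/*
 * Sketch of a typical call site (illustrative only - the real callers
 * are the generic write paths, e.g. generic_file_write()): after each
 * page is dirtied, the writer calls in here, and the per-CPU counter
 * above means balance_dirty_pages() itself runs at most once per
 * ratelimit_pages dirtyings:
 *
 *	set_page_dirty(page);
 *	balance_dirty_pages_ratelimited(mapping);
 */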

void throttle_vm_writeout(void)
{
        struct writeback_state wbs;
        long background_thresh;
        long dirty_thresh;

        for ( ; ; ) {
                get_dirty_limits(&wbs, &background_thresh, &dirty_thresh, NULL);

                /*
                 * Boost the allowable dirty threshold a bit for page
                 * allocators so they don't get DoS'ed by heavy writers
                 */
                dirty_thresh += dirty_thresh / 10;      /* wheeee... */

                if (wbs.nr_unstable + wbs.nr_writeback <= dirty_thresh)
                        break;
                blk_congestion_wait(WRITE, HZ/10);
        }
}
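
/*
 * Example of the boost above (illustrative figures): with
 * dirty_thresh = 20000 pages, a page allocator is only made to wait
 * once nr_unstable + nr_writeback exceeds 22000 pages, so reclaimers
 * are not stalled by writers sitting exactly at the normal limit.
 */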

/*
 * writeback at least _min_pages, and keep writing until the amount of dirty
 * memory is less than the background threshold, or until we're all clean.
 */
static void background_writeout(unsigned long _min_pages)
{
        long min_pages = _min_pages;
        struct writeback_control wbc = {
                .bdi            = NULL,
                .sync_mode      = WB_SYNC_NONE,
                .older_than_this = NULL,
                .nr_to_write    = 0,
                .nonblocking    = 1,
        };

        for ( ; ; ) {
                struct writeback_state wbs;
                long background_thresh;
                long dirty_thresh;

                get_dirty_limits(&wbs, &background_thresh, &dirty_thresh, NULL);
                if (wbs.nr_dirty + wbs.nr_unstable < background_thresh
                                && min_pages <= 0)
                        break;
                wbc.encountered_congestion = 0;
                wbc.nr_to_write = MAX_WRITEBACK_PAGES;
                wbc.pages_skipped = 0;
                writeback_inodes(&wbc);
                min_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
                if (wbc.nr_to_write > 0 || wbc.pages_skipped > 0) {
                        /* Wrote less than expected */
                        blk_congestion_wait(WRITE, HZ/10);
                        if (!wbc.encountered_congestion)
                                break;
                }
        }
}

/*
 * Start writeback of `nr_pages' pages.  If `nr_pages' is zero, write back
 * the whole world.  Returns 0 if a pdflush thread was dispatched.  Returns
 * -1 if all pdflush threads were busy.
 */
int wakeup_bdflush(long nr_pages)
{
        if (nr_pages == 0) {
                struct writeback_state wbs;

                get_writeback_state(&wbs);
                nr_pages = wbs.nr_dirty + wbs.nr_unstable;
        }
        return pdflush_operation(background_writeout, nr_pages);
}
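
/*
 * Usage note (behaviour as implemented above):
 *
 *	wakeup_bdflush(0);	- write back everything dirty or unstable
 *	wakeup_bdflush(1024);	- write back at least 1024 pages
 *
 * Either call returns -1 if every pdflush thread is already busy.
 */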

static void wb_timer_fn(unsigned long unused);
static void laptop_timer_fn(unsigned long unused);

static struct timer_list wb_timer =
                        TIMER_INITIALIZER(wb_timer_fn, 0, 0);
static struct timer_list laptop_mode_wb_timer =
                        TIMER_INITIALIZER(laptop_timer_fn, 0, 0);

/*
 * Periodic writeback of "old" data.
 *
 * Define "old": the first time one of an inode's pages is dirtied, we mark the
 * dirtying-time in the inode's address_space.  So this periodic writeback code
 * just walks the superblock inode list, writing back any inodes which are
 * older than a specific point in time.
 *
 * Try to run once per dirty_writeback_centisecs.  But if a writeback event
 * takes longer than a dirty_writeback_centisecs interval, then leave a
 * one-second gap.
 *
 * older_than_this takes precedence over nr_to_write.  So we'll only write back
 * all dirty pages if they are all attached to "old" mappings.
 */
static void wb_kupdate(unsigned long arg)
{
        unsigned long oldest_jif;
        unsigned long start_jif;
        unsigned long next_jif;
        long nr_to_write;
        struct writeback_state wbs;
        struct writeback_control wbc = {
                .bdi            = NULL,
                .sync_mode      = WB_SYNC_NONE,
                .older_than_this = &oldest_jif,
                .nr_to_write    = 0,
                .nonblocking    = 1,
                .for_kupdate    = 1,
        };

        sync_supers();

        get_writeback_state(&wbs);
        oldest_jif = jiffies - (dirty_expire_centisecs * HZ) / 100;
        start_jif = jiffies;
        next_jif = start_jif + (dirty_writeback_centisecs * HZ) / 100;
        nr_to_write = wbs.nr_dirty + wbs.nr_unstable +
                        (inodes_stat.nr_inodes - inodes_stat.nr_unused);
        while (nr_to_write > 0) {
                wbc.encountered_congestion = 0;
                wbc.nr_to_write = MAX_WRITEBACK_PAGES;
                writeback_inodes(&wbc);
                if (wbc.nr_to_write > 0) {
                        if (wbc.encountered_congestion)
                                blk_congestion_wait(WRITE, HZ/10);
                        else
                                break;  /* All the old data is written */
                }
                nr_to_write -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
        }
        if (time_before(next_jif, jiffies + HZ))
                next_jif = jiffies + HZ;
        if (dirty_writeback_centisecs)
                mod_timer(&wb_timer, next_jif);
}
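
/*
 * Timing example with the defaults above (dirty_writeback_centisecs=500,
 * dirty_expire_centisecs=3000): each run writes back inodes dirtied
 * before jiffies - 30*HZ and re-arms wb_timer for start_jif + 5*HZ,
 * unless the run itself took longer than the interval, in which case at
 * least a one-second gap is left before the next run.
 */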

/*
 * sysctl handler for /proc/sys/vm/dirty_writeback_centisecs
 */
int dirty_writeback_centisecs_handler(ctl_table *table, int write,
        struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
{
        proc_dointvec(table, write, file, buffer, length, ppos);
        if (dirty_writeback_centisecs) {
                mod_timer(&wb_timer,
                        jiffies + (dirty_writeback_centisecs * HZ) / 100);
        } else {
                del_timer(&wb_timer);
        }
        return 0;
}
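
/*
 * Example from userspace (assuming procfs is mounted in the usual place):
 *
 *	echo 100 > /proc/sys/vm/dirty_writeback_centisecs	- run every 1s
 *	echo 0 > /proc/sys/vm/dirty_writeback_centisecs		- disable kupdate
 *
 * Writing zero lands in the else branch above and deletes wb_timer.
 */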

static void wb_timer_fn(unsigned long unused)
{
        if (pdflush_operation(wb_kupdate, 0) < 0)
                mod_timer(&wb_timer, jiffies + HZ); /* delay 1 second */
}

static void laptop_flush(unsigned long unused)
{
        sys_sync();
}

static void laptop_timer_fn(unsigned long unused)
{
        pdflush_operation(laptop_flush, 0);
}

/*
 * We've spun up the disk and we're in laptop mode: schedule writeback
 * of all dirty data a few seconds from now.  If the flush is already scheduled
 * then push it back - the user is still using the disk.
 */
void laptop_io_completion(void)
{
        mod_timer(&laptop_mode_wb_timer, jiffies + laptop_mode * HZ);
}
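
/*
 * Example (illustrative): after "echo 5 > /proc/sys/vm/laptop_mode",
 * laptop_mode is 5, so every I/O completion pushes the full flush out
 * to jiffies + 5*HZ - i.e. it fires five seconds after the disk was
 * last used.
 */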

/*
 * We're in laptop mode and we've just synced.  The sync's writes will have
 * caused another writeback to be scheduled by laptop_io_completion.
 * Nothing needs to be written back anymore, so we unschedule the writeback.
 */
void laptop_sync_completion(void)
{
        del_timer(&laptop_mode_wb_timer);
}

/*
 * If ratelimit_pages is too high then we can get into dirty-data overload
 * if a large number of processes all perform writes at the same time.
 * If it is too low then SMP machines will call the (expensive)
 * get_writeback_state too often.
 *
 * Here we set ratelimit_pages to a level which ensures that when all CPUs are
 * dirtying in parallel, we cannot go more than 3% (1/32) over the dirty memory
 * thresholds before writeback cuts in.
 *
 * But the limit should not be set too high, because it also controls the
 * amount of memory which the balance_dirty_pages() caller has to write back.
 * If this is too large then the caller will block on the IO queue all the
 * time.  So limit it to four megabytes - the balance_dirty_pages() caller
 * will write six megabyte chunks, max.
 */
static void set_ratelimit(void)
{
        ratelimit_pages = total_pages / (num_online_cpus() * 32);
        if (ratelimit_pages < 16)
                ratelimit_pages = 16;
        if (ratelimit_pages * PAGE_CACHE_SIZE > 4096 * 1024)
                ratelimit_pages = (4096 * 1024) / PAGE_CACHE_SIZE;
}
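
/*
 * Worked example (hypothetical machine): 1GB of RAM is 262144 4KB
 * pages; with 4 CPUs online, 262144 / (4 * 32) = 2048, which the 4MB
 * cap then reduces to (4096 * 1024) / 4096 = 1024 pages per CPU.
 */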

static int
ratelimit_handler(struct notifier_block *self, unsigned long u, void *v)
{
        set_ratelimit();
        return 0;
}

static struct notifier_block ratelimit_nb = {
        .notifier_call  = ratelimit_handler,
        .next           = NULL,
};

/*
 * If the machine has a large highmem:lowmem ratio then scale back the default
 * dirty memory thresholds: allowing too much dirty highmem pins an excessive
 * number of buffer_heads.
 */
void __init page_writeback_init(void)
{
        long buffer_pages = nr_free_buffer_pages();
        long correction;

        total_pages = nr_free_pagecache_pages();

        correction = (100 * 4 * buffer_pages) / total_pages;

        if (correction < 100) {
                dirty_background_ratio *= correction;
                dirty_background_ratio /= 100;
                vm_dirty_ratio *= correction;
                vm_dirty_ratio /= 100;

                if (dirty_background_ratio <= 0)
                        dirty_background_ratio = 1;
                if (vm_dirty_ratio <= 0)
                        vm_dirty_ratio = 1;
        }
        mod_timer(&wb_timer, jiffies + (dirty_writeback_centisecs * HZ) / 100);
        set_ratelimit();
        register_cpu_notifier(&ratelimit_nb);
}

int do_writepages(struct address_space *mapping, struct writeback_control *wbc)
{
        if (wbc->nr_to_write <= 0)
                return 0;
        if (mapping->a_ops->writepages)
                return mapping->a_ops->writepages(mapping, wbc);
        return generic_writepages(mapping, wbc);
}

/**
 * write_one_page - write out a single page and optionally wait on I/O
 *
 * @page: the page to write
 * @wait: if true, wait on writeout
 *
 * The page must be locked by the caller and will be unlocked upon return.
 *
 * write_one_page() returns a negative error code if I/O failed.
 */
int write_one_page(struct page *page, int wait)
{
        struct address_space *mapping = page->mapping;
        int ret = 0;
        struct writeback_control wbc = {
                .sync_mode = WB_SYNC_ALL,
                .nr_to_write = 1,
        };

        BUG_ON(!PageLocked(page));

        if (wait)
                wait_on_page_writeback(page);

        if (clear_page_dirty_for_io(page)) {
                page_cache_get(page);
                ret = mapping->a_ops->writepage(page, &wbc);
                if (ret == 0 && wait) {
                        wait_on_page_writeback(page);
                        if (PageError(page))
                                ret = -EIO;
                }
                page_cache_release(page);
        } else {
                unlock_page(page);
        }
        return ret;
}
EXPORT_SYMBOL(write_one_page);
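
/*
 * Sketch of a caller (illustrative, not a real in-tree user): a simple
 * filesystem that has just modified a locked metadata page could force
 * it out synchronously with
 *
 *	err = write_one_page(page, 1);
 *
 * and the page comes back unlocked whether or not the write succeeded.
 */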

/*
 * For address_spaces which do not use buffers.  Just tag the page as dirty in
 * its radix tree.
 *
 * This is also used when a single buffer is being dirtied: we want to set the
 * page dirty in that case, but not all the buffers.  This is a "bottom-up"
 * dirtying, whereas __set_page_dirty_buffers() is a "top-down" dirtying.
 *
 * Most callers have locked the page, which pins the address_space in memory.
 * But zap_pte_range() does not lock the page, however in that case the
 * mapping is pinned by the vma's ->vm_file reference.
 *
 * We take care to handle the case where the page was truncated from the
 * mapping by re-checking page_mapping() inside tree_lock.
 */
int __set_page_dirty_nobuffers(struct page *page)
{
        int ret = 0;

        if (!TestSetPageDirty(page)) {
                struct address_space *mapping = page_mapping(page);
                struct address_space *mapping2;

                if (mapping) {
                        write_lock_irq(&mapping->tree_lock);
                        mapping2 = page_mapping(page);
                        if (mapping2) { /* Race with truncate? */
                                BUG_ON(mapping2 != mapping);
                                if (mapping_cap_account_dirty(mapping))
                                        inc_page_state(nr_dirty);
                                radix_tree_tag_set(&mapping->page_tree,
                                        page_index(page), PAGECACHE_TAG_DIRTY);
                        }
                        write_unlock_irq(&mapping->tree_lock);
                        if (mapping->host) {
                                /* !PageAnon && !swapper_space */
                                __mark_inode_dirty(mapping->host,
                                                        I_DIRTY_PAGES);
                        }
                }
        }
        return ret;
}
EXPORT_SYMBOL(__set_page_dirty_nobuffers);

/*
 * When a writepage implementation decides that it doesn't want to write this
 * page for some reason, it should redirty the locked page via
 * redirty_page_for_writepage() and it should then unlock the page and return 0
 */
int redirty_page_for_writepage(struct writeback_control *wbc, struct page *page)
{
        wbc->pages_skipped++;
        return __set_page_dirty_nobuffers(page);
}
EXPORT_SYMBOL(redirty_page_for_writepage);

/*
 * If the mapping doesn't provide a set_page_dirty a_op, then
 * just fall through and assume that it wants buffer_heads.
 */
int fastcall set_page_dirty(struct page *page)
{
        struct address_space *mapping = page_mapping(page);

        if (likely(mapping)) {
                int (*spd)(struct page *) = mapping->a_ops->set_page_dirty;
                if (spd)
                        return (*spd)(page);
                return __set_page_dirty_buffers(page);
        }
        if (!PageDirty(page))
                SetPageDirty(page);
        return 0;
}
EXPORT_SYMBOL(set_page_dirty);

/*
 * set_page_dirty() is racy if the caller has no reference against
 * page->mapping->host, and if the page is unlocked.  This is because another
 * CPU could truncate the page off the mapping and then free the mapping.
 *
 * Usually, the page _is_ locked, or the caller is a user-space process which
 * holds a reference on the inode by having an open file.
 *
 * In other cases, the page should be locked before running set_page_dirty().
 */
int set_page_dirty_lock(struct page *page)
{
        int ret;

        lock_page(page);
        ret = set_page_dirty(page);
        unlock_page(page);
        return ret;
}
EXPORT_SYMBOL(set_page_dirty_lock);

/*
 * Clear a page's dirty flag, while caring for dirty memory accounting.
 * Returns true if the page was previously dirty.
 */
int test_clear_page_dirty(struct page *page)
{
        struct address_space *mapping = page_mapping(page);
        unsigned long flags;

        if (mapping) {
                write_lock_irqsave(&mapping->tree_lock, flags);
                if (TestClearPageDirty(page)) {
                        radix_tree_tag_clear(&mapping->page_tree,
                                                page_index(page),
                                                PAGECACHE_TAG_DIRTY);
                        write_unlock_irqrestore(&mapping->tree_lock, flags);
                        if (mapping_cap_account_dirty(mapping))
                                dec_page_state(nr_dirty);
                        return 1;
                }
                write_unlock_irqrestore(&mapping->tree_lock, flags);
                return 0;
        }
        return TestClearPageDirty(page);
}
EXPORT_SYMBOL(test_clear_page_dirty);

/*
 * Clear a page's dirty flag, while caring for dirty memory accounting.
 * Returns true if the page was previously dirty.
 *
 * This is for preparing to put the page under writeout.  We leave the page
 * tagged as dirty in the radix tree so that a concurrent write-for-sync
 * can discover it via a PAGECACHE_TAG_DIRTY walk.  The ->writepage
 * implementation will run either set_page_writeback() or set_page_dirty(),
 * at which stage we bring the page's dirty flag and radix-tree dirty tag
 * back into sync.
 *
 * This incoherency between the page's dirty flag and radix-tree tag is
 * unfortunate, but it only exists while the page is locked.
 */
int clear_page_dirty_for_io(struct page *page)
{
        struct address_space *mapping = page_mapping(page);

        if (mapping) {
                if (TestClearPageDirty(page)) {
                        if (mapping_cap_account_dirty(mapping))
                                dec_page_state(nr_dirty);
                        return 1;
                }
                return 0;
        }
        return TestClearPageDirty(page);
}
EXPORT_SYMBOL(clear_page_dirty_for_io);

int test_clear_page_writeback(struct page *page)
{
        struct address_space *mapping = page_mapping(page);
        int ret;

        if (mapping) {
                unsigned long flags;

                write_lock_irqsave(&mapping->tree_lock, flags);
                ret = TestClearPageWriteback(page);
                if (ret)
                        radix_tree_tag_clear(&mapping->page_tree,
                                                page_index(page),
                                                PAGECACHE_TAG_WRITEBACK);
                write_unlock_irqrestore(&mapping->tree_lock, flags);
        } else {
                ret = TestClearPageWriteback(page);
        }
        return ret;
}

int test_set_page_writeback(struct page *page)
{
        struct address_space *mapping = page_mapping(page);
        int ret;

        if (mapping) {
                unsigned long flags;

                write_lock_irqsave(&mapping->tree_lock, flags);
                ret = TestSetPageWriteback(page);
                if (!ret)
                        radix_tree_tag_set(&mapping->page_tree,
                                                page_index(page),
                                                PAGECACHE_TAG_WRITEBACK);
                if (!PageDirty(page))
                        radix_tree_tag_clear(&mapping->page_tree,
                                                page_index(page),
                                                PAGECACHE_TAG_DIRTY);
                write_unlock_irqrestore(&mapping->tree_lock, flags);
        } else {
                ret = TestSetPageWriteback(page);
        }
        return ret;
}
EXPORT_SYMBOL(test_set_page_writeback);

/*
 * Return true if any of the pages in the mapping are tagged with the
 * passed tag.
 */
int mapping_tagged(struct address_space *mapping, int tag)
{
        unsigned long flags;
        int ret;

        read_lock_irqsave(&mapping->tree_lock, flags);
        ret = radix_tree_tagged(&mapping->page_tree, tag);
        read_unlock_irqrestore(&mapping->tree_lock, flags);
        return ret;
}
EXPORT_SYMBOL(mapping_tagged);