When reading from large files through the generic file read functions into page cache we can detect when a file is so large that it is unlikely to be fully cached in ram. Add a tunable /proc/sys/vm/tail_largefiles that puts the pages read from such files at the tail of the inactive list, to minimise their harm to currently mapped pages and pagecache, and enable it by default. Signed-off-by: Con Kolivas --- Documentation/filesystems/proc.txt | 8 +++++ Documentation/sysctl/vm.txt | 2 - kernel/sysctl.c | 9 ++++++ mm/filemap.c | 53 +++++++++++++++++++++++++++++++++++-- mm/swap.c | 3 -- 5 files changed, 70 insertions(+), 5 deletions(-) Index: linux-2.6.21-ck2/mm/filemap.c =================================================================== --- linux-2.6.21-ck2.orig/mm/filemap.c 2007-05-14 19:49:18.000000000 +1000 +++ linux-2.6.21-ck2/mm/filemap.c 2007-05-14 19:49:56.000000000 +1000 @@ -466,6 +466,16 @@ int add_to_page_cache_lru(struct page *p return ret; } +int add_to_page_cache_lru_tail(struct page *page, + struct address_space *mapping, pgoff_t offset, gfp_t gfp_mask) +{ + int ret = add_to_page_cache(page, mapping, offset, gfp_mask); + + if (ret == 0) + lru_cache_add_tail(page); + return ret; +} + #ifdef CONFIG_NUMA struct page *__page_cache_alloc(gfp_t gfp) { @@ -836,6 +846,34 @@ static void shrink_readahead_size_eio(st ra->ra_pages /= 4; } +/* + * Sysctl which determines whether pages read from large files are added to + * the tail of the inactive lru list. + */ +int vm_tail_largefiles __read_mostly = 1; + +static inline int nr_mapped(void) +{ + return global_page_state(NR_FILE_MAPPED) + + global_page_state(NR_ANON_PAGES); +} + +/* + * This examines how large a file is in pages and returns 1 if it is more + * than half the unmapped ram. Avoid the global_page_state() lookups in + * nr_mapped() unless the file already exceeds a sixth of vm_total_pages. 
+ */ +static int large_isize(unsigned long nr_pages) +{ + if (nr_pages * 6 > vm_total_pages) { + unsigned long unmapped_ram = vm_total_pages - nr_mapped(); + + if (nr_pages * 2 > unmapped_ram) + return 1; + } + return 0; +} + /** * do_generic_mapping_read - generic file read routine * @mapping: address_space to be read @@ -1044,8 +1082,19 @@ no_cached_page: goto out; } } - error = add_to_page_cache_lru(cached_page, mapping, - index, GFP_KERNEL); + + /* + * If we know the file is large we add the pages read to the + * end of the lru as we're unlikely to be able to cache the + * whole file in ram so make those pages the first to be + * dropped if not referenced soon. + */ + if (vm_tail_largefiles && large_isize(end_index)) + error = add_to_page_cache_lru_tail(cached_page, + mapping, index, GFP_KERNEL); + else + error = add_to_page_cache_lru(cached_page, mapping, + index, GFP_KERNEL); if (error) { if (error == -EEXIST) goto find_page; Index: linux-2.6.21-ck2/mm/swap.c =================================================================== --- linux-2.6.21-ck2.orig/mm/swap.c 2007-05-14 19:49:55.000000000 +1000 +++ linux-2.6.21-ck2/mm/swap.c 2007-05-14 19:49:56.000000000 +1000 @@ -434,8 +434,7 @@ void __pagevec_lru_add_active(struct pag /* * Function used uniquely to put pages back to the lru at the end of the - * inactive list to preserve the lru order. Currently only used by swap - * prefetch. + * inactive list to preserve the lru order. 
*/ void fastcall lru_cache_add_tail(struct page *page) { Index: linux-2.6.21-ck2/kernel/sysctl.c =================================================================== --- linux-2.6.21-ck2.orig/kernel/sysctl.c 2007-05-14 19:49:55.000000000 +1000 +++ linux-2.6.21-ck2/kernel/sysctl.c 2007-05-14 19:49:56.000000000 +1000 @@ -71,6 +71,7 @@ extern int suid_dumpable; extern char core_pattern[]; extern int pid_max; extern int min_free_kbytes; +extern int vm_tail_largefiles; extern int printk_ratelimit_jiffies; extern int printk_ratelimit_burst; extern int pid_max_min, pid_max_max; @@ -759,6 +760,14 @@ static ctl_table vm_table[] = { .mode = 0644, .proc_handler = &proc_dointvec, }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "tail_largefiles", + .data = &vm_tail_largefiles, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, #ifdef CONFIG_HUGETLB_PAGE { .ctl_name = VM_HUGETLB_PAGES, Index: linux-2.6.21-ck2/Documentation/filesystems/proc.txt =================================================================== --- linux-2.6.21-ck2.orig/Documentation/filesystems/proc.txt 2007-05-14 19:49:18.000000000 +1000 +++ linux-2.6.21-ck2/Documentation/filesystems/proc.txt 2007-05-14 19:49:56.000000000 +1000 @@ -1325,6 +1325,14 @@ To free pagecache, dentries and inodes: As this is a non-destructive operation and dirty objects are not freeable, the user should run `sync' first. +tail_largefiles +--------------- + +When enabled, pages read from large files go to the tail of the inactive list. +This means that any cache from reading large files is dropped very quickly, +preventing loss of mapped ram and useful pagecache when large files are read. +This does, however, make caching less effective when working with large files. 
+ 2.5 /proc/sys/dev - Device specific parameters ---------------------------------------------- Index: linux-2.6.21-ck2/Documentation/sysctl/vm.txt =================================================================== --- linux-2.6.21-ck2.orig/Documentation/sysctl/vm.txt 2007-05-14 19:49:55.000000000 +1000 +++ linux-2.6.21-ck2/Documentation/sysctl/vm.txt 2007-05-14 19:49:56.000000000 +1000 @@ -39,7 +39,7 @@ Currently, these files are in /proc/sys/ dirty_ratio, dirty_background_ratio, dirty_expire_centisecs, dirty_writeback_centisecs, vfs_cache_pressure, laptop_mode, -block_dump, swap_token_timeout, drop-caches: +block_dump, swap_token_timeout, drop-caches, tail_largefiles: See Documentation/filesystems/proc.txt