Turn the "swappiness" knob into one with well-defined semantics. Rename it "mapped" to correspond directly with the percentage of mapped RAM, or "applications" as users think of it. Currently, the swappiness algorithm can easily cause swapping during simple file copies, because the distress algorithm too easily overrides the swappiness value. Add a "hardmaplimit" tunable, on by default, which only allows the vm to override the "mapped" tunable when distress is at its greatest, to prevent false out-of-memory situations. Signed-off-by: Con Kolivas Documentation/sysctl/vm.txt | 23 +++++++++++++++++++++++ include/linux/swap.h | 3 ++- include/linux/sysctl.h | 2 +- kernel/sysctl.c | 16 ++++++++++++---- mm/vmscan.c | 27 ++++++++++++++++----------- 5 files changed, 54 insertions(+), 17 deletions(-) Index: linux-2.6.21-ck2/include/linux/swap.h =================================================================== --- linux-2.6.21-ck2.orig/include/linux/swap.h 2007-05-14 19:49:55.000000000 +1000 +++ linux-2.6.21-ck2/include/linux/swap.h 2007-05-14 19:49:55.000000000 +1000 @@ -191,7 +191,8 @@ extern void swap_setup(void); /* linux/mm/vmscan.c */ extern unsigned long try_to_free_pages(struct zone **, gfp_t); extern unsigned long shrink_all_memory(unsigned long nr_pages); -extern int vm_swappiness; +extern int vm_mapped; +extern int vm_hardmaplimit; extern int remove_mapping(struct address_space *mapping, struct page *page); extern long vm_total_pages; Index: linux-2.6.21-ck2/include/linux/sysctl.h =================================================================== --- linux-2.6.21-ck2.orig/include/linux/sysctl.h 2007-05-14 19:49:19.000000000 +1000 +++ linux-2.6.21-ck2/include/linux/sysctl.h 2007-05-14 19:49:55.000000000 +1000 @@ -190,7 +190,7 @@ enum VM_OVERCOMMIT_RATIO=16, /* percent of RAM to allow overcommit in */ VM_PAGEBUF=17, /* struct: Control pagebuf parameters */ VM_HUGETLB_PAGES=18, /* int: Number of available Huge Pages */ - VM_SWAPPINESS=19, /* Tendency to 
steal mapped memory */ + VM_MAPPED=19, /* percent mapped min while evicting cache */ VM_LOWMEM_RESERVE_RATIO=20,/* reservation ratio for lower memory zones */ VM_MIN_FREE_KBYTES=21, /* Minimum free kilobytes to maintain */ VM_MAX_MAP_COUNT=22, /* int: Maximum number of mmaps/address-space */ Index: linux-2.6.21-ck2/kernel/sysctl.c =================================================================== --- linux-2.6.21-ck2.orig/kernel/sysctl.c 2007-05-14 19:49:55.000000000 +1000 +++ linux-2.6.21-ck2/kernel/sysctl.c 2007-05-14 19:49:55.000000000 +1000 @@ -741,16 +741,24 @@ static ctl_table vm_table[] = { .proc_handler = &proc_dointvec, }, { - .ctl_name = VM_SWAPPINESS, - .procname = "swappiness", - .data = &vm_swappiness, - .maxlen = sizeof(vm_swappiness), + .ctl_name = CTL_UNNUMBERED, + .procname = "mapped", + .data = &vm_mapped, + .maxlen = sizeof(vm_mapped), .mode = 0644, .proc_handler = &proc_dointvec_minmax, .strategy = &sysctl_intvec, .extra1 = &zero, .extra2 = &one_hundred, }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "hardmaplimit", + .data = &vm_hardmaplimit, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, #ifdef CONFIG_HUGETLB_PAGE { .ctl_name = VM_HUGETLB_PAGES, Index: linux-2.6.21-ck2/mm/vmscan.c =================================================================== --- linux-2.6.21-ck2.orig/mm/vmscan.c 2007-05-14 19:49:55.000000000 +1000 +++ linux-2.6.21-ck2/mm/vmscan.c 2007-05-14 19:49:55.000000000 +1000 @@ -64,7 +64,7 @@ struct scan_control { * whole list at once. */ int swap_cluster_max; - int swappiness; + int mapped; int all_unreclaimable; }; @@ -111,9 +111,10 @@ struct shrinker { #endif /* - * From 0 .. 100. Higher means more swappy. + * From 0 .. 100. Lower means more swappy. 
*/ -int vm_swappiness = 60; +int vm_mapped __read_mostly = 66; +int vm_hardmaplimit __read_mostly = 1; long vm_total_pages; /* The total number of pages which the VM controls */ static LIST_HEAD(shrinker_list); @@ -809,10 +810,14 @@ static void shrink_active_list(unsigned * The distress ratio is important - we don't want to start * going oom. * - * A 100% value of vm_swappiness overrides this algorithm - * altogether. + * This distress value is ignored if we apply a hardmaplimit except + * in extreme distress. + * + * A 0% value of vm_mapped overrides this algorithm altogether. */ - swap_tendency = mapped_ratio / 2 + distress + sc->swappiness; + swap_tendency = mapped_ratio * 100 / (sc->mapped + 1); + if (!vm_hardmaplimit || distress == 100) + swap_tendency += distress; /* * Now use this metric to decide whether to start moving mapped @@ -1031,7 +1036,7 @@ unsigned long try_to_free_pages(struct z .may_writepage = !laptop_mode, .swap_cluster_max = SWAP_CLUSTER_MAX, .may_swap = 1, - .swappiness = vm_swappiness, + .mapped = vm_mapped, }; delay_swap_prefetch(); @@ -1138,7 +1143,7 @@ static unsigned long balance_pgdat(pg_da .gfp_mask = GFP_KERNEL, .may_swap = 1, .swap_cluster_max = SWAP_CLUSTER_MAX, - .swappiness = vm_swappiness, + .mapped = vm_mapped, }; /* * temp_priority is used to remember the scanning priority at which @@ -1446,7 +1451,7 @@ unsigned long shrink_all_memory(unsigned .may_swap = 0, .swap_cluster_max = nr_pages, .may_writepage = 1, - .swappiness = vm_swappiness, + .mapped = vm_mapped, }; current->reclaim_state = &reclaim_state; @@ -1481,7 +1486,7 @@ unsigned long shrink_all_memory(unsigned /* Force reclaiming mapped pages in the passes #3 and #4 */ if (pass > 2) { sc.may_swap = 1; - sc.swappiness = 100; + sc.mapped = 0; } for (prio = DEF_PRIORITY; prio >= 0; prio--) { @@ -1629,7 +1634,7 @@ static int __zone_reclaim(struct zone *z .swap_cluster_max = max_t(unsigned long, nr_pages, SWAP_CLUSTER_MAX), .gfp_mask = gfp_mask, - .swappiness = vm_swappiness, + 
.mapped = vm_mapped, }; unsigned long slab_reclaimable; Index: linux-2.6.21-ck2/Documentation/sysctl/vm.txt =================================================================== --- linux-2.6.21-ck2.orig/Documentation/sysctl/vm.txt 2007-05-14 19:49:55.000000000 +1000 +++ linux-2.6.21-ck2/Documentation/sysctl/vm.txt 2007-05-14 19:49:55.000000000 +1000 @@ -22,6 +22,8 @@ Currently, these files are in /proc/sys/ - dirty_background_ratio - dirty_expire_centisecs - dirty_writeback_centisecs +- hardmaplimit +- mapped - max_map_count - min_free_kbytes - laptop_mode @@ -87,6 +89,27 @@ for swap because we only cluster swap da ============================================================== +hardmaplimit: + +This flag makes the vm adhere to the mapped value as closely as possible, +except under the most extreme vm stress, where doing so would provoke an out +of memory condition (see mapped below). + +Enabled by default. + +============================================================== + +mapped: + +This is the percentage of ram that is filled with mapped pages (applications) +before the vm will start reclaiming mapped pages by moving them to swap. +It is altered by the relative stress of the vm at the time, so it is not +strictly adhered to, in order to avoid provoking out of memory kills. + +Set to 66 by default. + +============================================================== + max_map_count: This file contains the maximum number of memory map areas a process