Add the compute tunable for the staircase cpu scheduler. This modifies the
cpu scheduler behaviour to use significantly longer cpu timeslices and to
delay normal preemption, minimising the cpu cache-harming effects of
multiple concurrently running tasks. It increases cpu throughput at the
cost of significantly increased latencies.

Signed-off-by: Con Kolivas

 Documentation/sysctl/kernel.txt |   11 ++++++++++
 include/linux/sched.h           |    2 -
 kernel/sched.c                  |   41 +++++++++++++++++++++++++++++++---------
 kernel/sysctl.c                 |    8 +++++++
 4 files changed, 52 insertions(+), 10 deletions(-)

Index: linux-2.6.20-ck1/include/linux/sched.h
===================================================================
--- linux-2.6.20-ck1.orig/include/linux/sched.h	2007-02-16 19:01:30.000000000 +1100
+++ linux-2.6.20-ck1/include/linux/sched.h	2007-02-16 19:01:30.000000000 +1100
@@ -216,7 +216,7 @@ extern void show_stack(struct task_struc
 void io_schedule(void);
 long io_schedule_timeout(long timeout);
 
-extern int sched_interactive;
+extern int sched_interactive, sched_compute;
 extern void cpu_init (void);
 extern void trap_init(void);
Index: linux-2.6.20-ck1/kernel/sched.c
===================================================================
--- linux-2.6.20-ck1.orig/kernel/sched.c	2007-02-16 19:01:30.000000000 +1100
+++ linux-2.6.20-ck1/kernel/sched.c	2007-02-16 19:01:30.000000000 +1100
@@ -63,8 +63,17 @@
 /*
  * sched_interactive - sysctl which allows interactive tasks to have bonus
  * raise its priority.
+ * sched_compute - sysctl which enables long timeslices and delayed preemption
+ * for compute server usage.
  */
 int sched_interactive __read_mostly = 1;
+int sched_compute __read_mostly;
+
+/*
+ * CACHE_DELAY is the time preemption is delayed in sched_compute mode
+ * and is set to a nominal 10ms.
+ */
+#define CACHE_DELAY	(10 * (HZ) / 1001 + 1)
 
 /*
  * Convert user-nice values [ -20 ... 0 ... 19 ]
@@ -96,9 +105,10 @@ int sched_interactive __read_mostly = 1;
 
 /*
  * This is the time all tasks within the same priority round robin.
- * Set to a minimum of 6ms.
+ * Set to a minimum of 6ms. It is 10 times longer in compute mode.
  */
-#define RR_INTERVAL	((6 * HZ / 1001) + 1)
+#define _RR_INTERVAL	((6 * HZ / 1001) + 1)
+#define RR_INTERVAL	(_RR_INTERVAL * (1 + 9 * sched_compute))
 #define DEF_TIMESLICE	(RR_INTERVAL * 19)
 
 /*
@@ -132,6 +142,7 @@ struct rq {
 
 	/* Cached timestamp set by update_cpu_clock() */
 	unsigned long long most_recent_timestamp;
+	unsigned short cache_ticks, preempted;
 	struct task_struct *curr, *idle;
 	unsigned long next_balance;
 	struct mm_struct *prev_mm;
@@ -873,7 +884,7 @@ static inline int __normal_prio(struct t
 
 	best_bonus = bonus(p);
 	prio = MAX_RT_PRIO + best_bonus;
-	if (sched_interactive && !batch_task(p))
+	if (sched_interactive && !sched_compute && !batch_task(p))
 		prio -= p->bonus;
 
 	rr = rr_interval(p);
@@ -1347,14 +1358,22 @@ static inline int wake_idle(int cpu, str
 #endif
 
 /*
- * Check to see if p preempts rq->curr and resched if it does.
+ * Check to see if p preempts rq->curr and resched if it does. In compute
+ * mode we do not preempt for at least CACHE_DELAY and set rq->preempted.
  */
-static inline void preempt(const struct task_struct *p, struct rq *rq)
+static void fastcall preempt(const struct task_struct *p, struct rq *rq)
 {
-	if (TASK_PREEMPTS_CURR(p, rq))
-		resched_task(rq->curr);
-}
+	struct task_struct *curr = rq->curr;
+
+	if (p->prio >= curr->prio)
+		return;
+	if (!sched_compute || rq->cache_ticks >= CACHE_DELAY || !p->mm ||
+	    rt_task(p) || curr == rq->idle) {
+		resched_task(curr);
+		return;
+	}
+	rq->preempted = 1;
+}
 
 /***
  * try_to_wake_up - wake up a thread
@@ -3008,6 +3027,9 @@ static void task_running_tick(struct rq
 		time_slice_expired(p, rq);
 		goto out_unlock;
 	}
+	rq->cache_ticks++;
+	if (rq->preempted && rq->cache_ticks >= CACHE_DELAY)
+		set_tsk_need_resched(p);
 out_unlock:
 	spin_unlock(&rq->lock);
 }
@@ -3304,6 +3326,7 @@ switch_tasks:
 
 	sched_info_switch(prev, next);
 	if (likely(prev != next)) {
+		rq->preempted = rq->cache_ticks = 0;
 		next->timestamp = now;
 		rq->nr_switches++;
 		rq->curr = next;
@@ -6625,7 +6648,7 @@ void __init sched_init(void)
 		rq = cpu_rq(i);
 		spin_lock_init(&rq->lock);
 		lockdep_set_class(&rq->lock, &rq->rq_lock_key);
-		rq->nr_running = 0;
+		rq->nr_running = rq->cache_ticks = rq->preempted = 0;
 
 #ifdef CONFIG_SMP
 		rq->sd = NULL;
Index: linux-2.6.20-ck1/kernel/sysctl.c
===================================================================
--- linux-2.6.20-ck1.orig/kernel/sysctl.c	2007-02-16 19:01:30.000000000 +1100
+++ linux-2.6.20-ck1/kernel/sysctl.c	2007-02-16 19:01:30.000000000 +1100
@@ -684,6 +684,14 @@ static ctl_table kern_table[] = {
 		.mode		= 0644,
 		.proc_handler	= &proc_dointvec,
 	},
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "compute",
+		.data		= &sched_compute,
+		.maxlen		= sizeof (int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
 #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86)
 	{
 		.ctl_name	= KERN_UNKNOWN_NMI_PANIC,
Index: linux-2.6.20-ck1/Documentation/sysctl/kernel.txt
===================================================================
--- linux-2.6.20-ck1.orig/Documentation/sysctl/kernel.txt	2007-02-16 19:01:30.000000000 +1100
+++ linux-2.6.20-ck1/Documentation/sysctl/kernel.txt	2007-02-16 19:01:30.000000000 +1100
@@ -18,6 +18,7 @@ Currently, these files might (depending
 show up in /proc/sys/kernel:
 - acpi_video_flags
 - acct
+- compute
 - core_pattern
 - core_uses_pid
 - ctrl-alt-del
@@ -85,6 +86,16 @@ valid for 30 seconds.
 
 ==============================================================
 
+compute:
+
+This flag controls the long timeslice, delayed preemption mode in the
+cpu scheduler suitable for scientific computation applications. It
+leads to large latencies so is unsuitable for normal usage.
+
+Disabled by default.
+
+==============================================================
+
 core_pattern:
 
 core_pattern is used to specify a core dumpfile pattern name.
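
For reference, the numbers implied by the macros above: with compute enabled,
RR_INTERVAL grows to ten times its base value and preemption of a running user
task is deferred by CACHE_DELAY, nominally 10ms. The tunable is toggled at
runtime via /proc/sys/kernel/compute (e.g. echo 1 > /proc/sys/kernel/compute
to enable, 0 to disable). The standalone userspace sketch below is not part of
the patch; it merely evaluates the same expressions for a few example HZ
values (100, 250 and 1000 are assumptions chosen for illustration) to show the
resulting timeslice and preemption-delay figures in jiffies and milliseconds.

/*
 * Illustrative userspace sketch (not part of the patch): evaluate the
 * _RR_INTERVAL, RR_INTERVAL and CACHE_DELAY expressions from kernel/sched.c
 * for a few example HZ values, with sched_compute off and on.
 */
#include <stdio.h>

static void show(int hz, int sched_compute)
{
	int rr_base = (6 * hz / 1001) + 1;		/* _RR_INTERVAL */
	int rr = rr_base * (1 + 9 * sched_compute);	/* RR_INTERVAL */
	int cache_delay = 10 * hz / 1001 + 1;		/* CACHE_DELAY */

	printf("HZ=%4d compute=%d: RR_INTERVAL=%3d jiffies (~%d ms), "
	       "CACHE_DELAY=%2d jiffies (~%d ms)\n",
	       hz, sched_compute, rr, rr * 1000 / hz,
	       cache_delay, cache_delay * 1000 / hz);
}

int main(void)
{
	int hzs[] = { 100, 250, 1000 };
	int i, compute;

	for (i = 0; i < 3; i++)
		for (compute = 0; compute <= 1; compute++)
			show(hzs[i], compute);
	return 0;
}

At HZ=1000, for example, this prints a 6ms round robin interval normally and
60ms with compute enabled, with preemption delayed by roughly 10ms.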