Add the compute tunable for the staircase cpu scheduler. This modifies the
cpu scheduler behaviour to use significantly longer cpu timeslices and to
delay normal preemption, minimising the cpu cache-harming effects of
multiple concurrently running tasks. It increases cpu throughput at the
cost of significantly increased latencies.

Signed-off-by: Con Kolivas

 Documentation/sysctl/kernel.txt |   11 ++++++++++
 include/linux/sched.h           |    2 -
 kernel/sched.c                  |   41 +++++++++++++++++++++++++++++++---------
 kernel/sysctl.c                 |    8 +++++++
 4 files changed, 52 insertions(+), 10 deletions(-)

Index: linux-2.6.20-ck1/include/linux/sched.h
===================================================================
--- linux-2.6.20-ck1.orig/include/linux/sched.h	2007-02-16 19:01:30.000000000 +1100
+++ linux-2.6.20-ck1/include/linux/sched.h	2007-02-16 19:01:30.000000000 +1100
@@ -216,7 +216,7 @@ extern void show_stack(struct task_struc
 void io_schedule(void);
 long io_schedule_timeout(long timeout);
 
-extern int sched_interactive;
+extern int sched_interactive, sched_compute;
 extern void cpu_init (void);
 extern void trap_init(void);
Index: linux-2.6.20-ck1/kernel/sched.c
===================================================================
--- linux-2.6.20-ck1.orig/kernel/sched.c	2007-02-16 19:01:30.000000000 +1100
+++ linux-2.6.20-ck1/kernel/sched.c	2007-02-16 19:01:30.000000000 +1100
@@ -63,8 +63,17 @@
 /*
  * sched_interactive - sysctl which allows interactive tasks to have bonus
  * raise its priority.
+ * sched_compute - sysctl which enables long timeslices and delayed preemption
+ * for compute server usage.
  */
 int sched_interactive __read_mostly = 1;
+int sched_compute __read_mostly;
+
+/*
+ * CACHE_DELAY is the time preemption is delayed in sched_compute mode
+ * and is set to a nominal 10ms.
+ */
+#define CACHE_DELAY	(10 * (HZ) / 1001 + 1)
 
 /*
  * Convert user-nice values [ -20 ... 0 ... 19 ]
@@ -96,9 +105,10 @@ int sched_interactive __read_mostly = 1;
 
 /*
  * This is the time all tasks within the same priority round robin.
- * Set to a minimum of 6ms.
+ * Set to a minimum of 6ms. It is 10 times longer in compute mode.
  */
-#define RR_INTERVAL	((6 * HZ / 1001) + 1)
+#define _RR_INTERVAL	((6 * HZ / 1001) + 1)
+#define RR_INTERVAL	(_RR_INTERVAL * (1 + 9 * sched_compute))
 #define DEF_TIMESLICE	(RR_INTERVAL * 19)
 
 /*
@@ -132,6 +142,7 @@ struct rq {
 
 	/* Cached timestamp set by update_cpu_clock() */
 	unsigned long long most_recent_timestamp;
+	unsigned short cache_ticks, preempted;
 	struct task_struct *curr, *idle;
 	unsigned long next_balance;
 	struct mm_struct *prev_mm;
@@ -873,7 +884,7 @@ static inline int __normal_prio(struct t
 
 	best_bonus = bonus(p);
 	prio = MAX_RT_PRIO + best_bonus;
-	if (sched_interactive && !batch_task(p))
+	if (sched_interactive && !sched_compute && !batch_task(p))
 		prio -= p->bonus;
 
 	rr = rr_interval(p);
@@ -1347,14 +1358,22 @@ static inline int wake_idle(int cpu, str
 #endif
 
 /*
- * Check to see if p preempts rq->curr and resched if it does.
+ * Check to see if p preempts rq->curr and resched if it does. In compute
+ * mode we do not preempt for at least CACHE_DELAY and set rq->preempted.
  */
-static inline void preempt(const struct task_struct *p, struct rq *rq)
+static void fastcall preempt(const struct task_struct *p, struct rq *rq)
 {
-	if (TASK_PREEMPTS_CURR(p, rq))
-		resched_task(rq->curr);
-}
+	struct task_struct *curr = rq->curr;
+
+	if (p->prio >= curr->prio)
+		return;
+	if (!sched_compute || rq->cache_ticks >= CACHE_DELAY || !p->mm ||
+	    rt_task(p) || curr == rq->idle) {
+		resched_task(curr);
+		return;
+	}
+	rq->preempted = 1;
+}
 
 /***
  * try_to_wake_up - wake up a thread
@@ -3008,6 +3027,9 @@ static void task_running_tick(struct rq
 		time_slice_expired(p, rq);
 		goto out_unlock;
 	}
+	rq->cache_ticks++;
+	if (rq->preempted && rq->cache_ticks >= CACHE_DELAY)
+		set_tsk_need_resched(p);
 out_unlock:
 	spin_unlock(&rq->lock);
 }
@@ -3304,6 +3326,7 @@ switch_tasks:
 
 	sched_info_switch(prev, next);
 	if (likely(prev != next)) {
+		rq->preempted = rq->cache_ticks = 0;
 		next->timestamp = now;
 		rq->nr_switches++;
 		rq->curr = next;
@@ -6625,7 +6648,7 @@ void __init sched_init(void)
 		rq = cpu_rq(i);
 		spin_lock_init(&rq->lock);
 		lockdep_set_class(&rq->lock, &rq->rq_lock_key);
-		rq->nr_running = 0;
+		rq->nr_running = rq->cache_ticks = rq->preempted = 0;
 
 #ifdef CONFIG_SMP
 		rq->sd = NULL;
Index: linux-2.6.20-ck1/kernel/sysctl.c
===================================================================
--- linux-2.6.20-ck1.orig/kernel/sysctl.c	2007-02-16 19:01:30.000000000 +1100
+++ linux-2.6.20-ck1/kernel/sysctl.c	2007-02-16 19:01:30.000000000 +1100
@@ -684,6 +684,14 @@ static ctl_table kern_table[] = {
 		.mode		= 0644,
 		.proc_handler	= &proc_dointvec,
 	},
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "compute",
+		.data		= &sched_compute,
+		.maxlen		= sizeof (int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
 #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86)
 	{
 		.ctl_name	= KERN_UNKNOWN_NMI_PANIC,
Index: linux-2.6.20-ck1/Documentation/sysctl/kernel.txt
===================================================================
--- linux-2.6.20-ck1.orig/Documentation/sysctl/kernel.txt	2007-02-16 19:01:30.000000000 +1100
+++ linux-2.6.20-ck1/Documentation/sysctl/kernel.txt	2007-02-16 19:01:30.000000000 +1100
@@ -18,6 +18,7 @@ Currently, these files might (depending
 show up in /proc/sys/kernel:
 - acpi_video_flags
 - acct
+- compute
 - core_pattern
 - core_uses_pid
 - ctrl-alt-del
@@ -85,6 +86,16 @@ valid for 30 seconds.
 
 ==============================================================
 
+compute:
+
+This flag controls the long timeslice, delayed preemption mode in the
+cpu scheduler suitable for scientific computation applications. It
+leads to large latencies so is unsuitable for normal usage.
+
+Disabled by default.
+
+==============================================================
+
 core_pattern:
 
 core_pattern is used to specify a core dumpfile pattern name.
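
For reference, the numbers implied by the macros above: with compute enabled,
RR_INTERVAL grows to ten times its base value and preemption of a running user
task is deferred by CACHE_DELAY, nominally 10ms. The tunable is toggled at
runtime via /proc/sys/kernel/compute (e.g. echo 1 > /proc/sys/kernel/compute
to enable, 0 to disable). The standalone userspace sketch below is not part of
the patch; it merely evaluates the same expressions for a few example HZ
values (100, 250 and 1000 are assumptions chosen for illustration) to show the
resulting timeslice and preemption-delay figures in jiffies and milliseconds.

/*
 * Illustrative userspace sketch (not part of the patch): evaluate the
 * _RR_INTERVAL, RR_INTERVAL and CACHE_DELAY expressions from kernel/sched.c
 * for a few example HZ values, with sched_compute off and on.
 */
#include <stdio.h>

static void show(int hz, int sched_compute)
{
	int rr_base = (6 * hz / 1001) + 1;		/* _RR_INTERVAL */
	int rr = rr_base * (1 + 9 * sched_compute);	/* RR_INTERVAL */
	int cache_delay = 10 * hz / 1001 + 1;		/* CACHE_DELAY */

	printf("HZ=%4d compute=%d: RR_INTERVAL=%3d jiffies (~%d ms), "
	       "CACHE_DELAY=%2d jiffies (~%d ms)\n",
	       hz, sched_compute, rr, rr * 1000 / hz,
	       cache_delay, cache_delay * 1000 / hz);
}

int main(void)
{
	int hzs[] = { 100, 250, 1000 };
	int i, compute;

	for (i = 0; i < 3; i++)
		for (compute = 0; compute <= 1; compute++)
			show(hzs[i], compute);
	return 0;
}

At HZ=1000, for example, this prints a 6ms round robin interval normally and
60ms with compute enabled, with preemption delayed by roughly 10ms.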