Add the SCHED_ISO policy (isochronous), a starvation-free soft realtime policy available to unprivileged users. The amount of CPU time that SCHED_ISO tasks may run as realtime is configurable via the /proc/sys/kernel/iso_cpu tunable and is set to 80% (over 5 seconds) by default.

Signed-off-by: Con Kolivas

 Documentation/sysctl/kernel.txt |    9 ++++
 include/linux/sched.h           |   10 +++--
 kernel/sched.c                  |   77 ++++++++++++++++++++++++++++++++++++----
 kernel/sysctl.c                 |   25 +++++++++---
 4 files changed, 106 insertions(+), 15 deletions(-)
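For illustration only (not part of the patch): a minimal userspace sketch of how an unprivileged task could request SCHED_ISO once this change is applied. SCHED_ISO is not defined by the C library, so the sketch defines the value 4 added to include/linux/sched.h below; the file name isotest.c is arbitrary.

/*
 * isotest.c - request SCHED_ISO for the calling process (illustrative only).
 * Build with: gcc -Wall -o isotest isotest.c
 */
#include <sched.h>
#include <stdio.h>
#include <string.h>
#include <errno.h>

#ifndef SCHED_ISO
#define SCHED_ISO 4	/* value added to include/linux/sched.h by this patch */
#endif

int main(void)
{
	/* Priority must be 0 for non-realtime policies such as SCHED_ISO. */
	struct sched_param sp = { .sched_priority = 0 };

	if (sched_setscheduler(0, SCHED_ISO, &sp) == -1) {
		/* Expected to fail with EINVAL on kernels without this patch. */
		fprintf(stderr, "sched_setscheduler: %s\n", strerror(errno));
		return 1;
	}
	printf("policy is now %d (SCHED_ISO is %d)\n",
	       sched_getscheduler(0), SCHED_ISO);
	return 0;
}

Note that the sched_setscheduler() hunk below also means an unprivileged SCHED_FIFO or SCHED_RR request is no longer rejected with EPERM; it is silently downgraded to SCHED_ISO with a priority of 0.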
Index: linux-2.6.20-ck1/include/linux/sched.h
===================================================================
--- linux-2.6.20-ck1.orig/include/linux/sched.h	2007-02-16 19:01:30.000000000 +1100
+++ linux-2.6.20-ck1/include/linux/sched.h	2007-02-16 19:01:31.000000000 +1100
@@ -34,10 +34,11 @@
 #define SCHED_FIFO		1
 #define SCHED_RR		2
 #define SCHED_BATCH		3
+#define SCHED_ISO		4

 #ifdef __KERNEL__

-#define SCHED_MAX		SCHED_BATCH
+#define SCHED_MAX		SCHED_ISO
 #define SCHED_RANGE(policy)	((policy) <= SCHED_MAX)

 struct sched_param {
@@ -219,7 +220,7 @@ extern void show_stack(struct task_struc
 void io_schedule(void);
 long io_schedule_timeout(long timeout);

-extern int sched_interactive, sched_compute;
+extern int sched_interactive, sched_compute, sched_iso_cpu;

 extern void cpu_init (void);
 extern void trap_init(void);
@@ -526,16 +527,18 @@ struct signal_struct {

 #define MAX_USER_RT_PRIO	100
 #define MAX_RT_PRIO		MAX_USER_RT_PRIO
+#define ISO_PRIO		(MAX_RT_PRIO - 1)
 #define MAX_PRIO		(MAX_RT_PRIO + 40)
 #define MIN_USER_PRIO		(MAX_PRIO - 1)

-#define rt_prio(prio)		unlikely((prio) < MAX_RT_PRIO)
+#define rt_prio(prio)		unlikely((prio) < ISO_PRIO)
 #define rt_task(p)		rt_prio((p)->prio)
 #define batch_task(p)		(unlikely((p)->policy == SCHED_BATCH))
 #define is_rt_policy(policy)	((policy) == SCHED_FIFO || \
				(policy) == SCHED_RR)
 #define has_rt_policy(p)	unlikely(is_rt_policy((p)->policy))
+#define iso_task(p)		(unlikely((p)->policy == SCHED_ISO))

 /*
  * Some day this will be a full-fledged user tracking system..
@@ -1151,6 +1154,7 @@ static inline void put_task_struct(struc
 #define PF_SWAPWRITE	0x00800000	/* Allowed to write to swap */
 #define PF_SPREAD_PAGE	0x01000000	/* Spread page cache over cpuset */
 #define PF_SPREAD_SLAB	0x02000000	/* Spread some slab caches over cpuset */
+#define PF_ISOREF	0x04000000	/* SCHED_ISO task has used up quota */
 #define PF_MEMPOLICY	0x10000000	/* Non-default NUMA mempolicy */
 #define PF_MUTEX_TESTER	0x20000000	/* Thread belongs to the rt mutex tester */
 #define PF_NONSLEEP	0x40000000	/* Waiting on in kernel activity */
Index: linux-2.6.20-ck1/kernel/sched.c
===================================================================
--- linux-2.6.20-ck1.orig/kernel/sched.c	2007-02-16 19:01:30.000000000 +1100
+++ linux-2.6.20-ck1/kernel/sched.c	2007-02-16 19:01:31.000000000 +1100
@@ -65,10 +65,14 @@
  * raise its priority.
  * sched_compute - sysctl which enables long timeslices and delayed preemption
  * for compute server usage.
+ * sched_iso_cpu - sysctl which determines the cpu percentage SCHED_ISO tasks
+ * are allowed to run (over ISO_PERIOD seconds) as real time tasks.
  */
 int sched_interactive __read_mostly = 1;
 int sched_compute __read_mostly;
+int sched_iso_cpu __read_mostly = 80;
+#define ISO_PERIOD	(5 * HZ)

 /*
  * CACHE_DELAY is the time preemption is delayed in sched_compute mode
  * and is set to a nominal 10ms.
@@ -143,6 +147,8 @@ struct rq {
 	/* Cached timestamp set by update_cpu_clock() */
 	unsigned long long most_recent_timestamp;
 	unsigned short cache_ticks, preempted;
+	unsigned long iso_ticks;
+	unsigned short iso_refractory;
 	struct task_struct *curr, *idle;
 	unsigned long next_balance;
 	struct mm_struct *prev_mm;
@@ -878,6 +884,17 @@ static inline int __normal_prio(struct t
 	unsigned int full_slice, used_slice = 0;
 	unsigned int best_bonus, rr;

+	if (iso_task(p)) {
+		if (likely(!(p->flags & PF_ISOREF)))
+			/*
+			 * If SCHED_ISO tasks have not used up their real time
+			 * quota they have run just better than highest
+			 * SCHED_NORMAL priority. Otherwise they run as
+			 * SCHED_NORMAL.
+			 */
+			return ISO_PRIO;
+	}
+
 	full_slice = slice(p);
 	if (full_slice > p->slice)
 		used_slice = full_slice - p->slice;
@@ -2990,6 +3007,23 @@ static void time_slice_expired(struct ta
 	requeue_task(p, rq, effective_prio(p));
 }

+/*
+ * Test if SCHED_ISO tasks have run longer than their alloted period as RT
+ * tasks and set the refractory flag if necessary. There is 10% hysteresis
+ * for unsetting the flag.
+ */
+static inline unsigned int test_ret_isorefractory(struct rq *rq)
+{
+	if (likely(!rq->iso_refractory)) {
+		if (rq->iso_ticks / ISO_PERIOD > sched_iso_cpu)
+			rq->iso_refractory = 1;
+	} else {
+		if (rq->iso_ticks / ISO_PERIOD < (sched_iso_cpu * 90 / 100))
+			rq->iso_refractory = 0;
+	}
+	return rq->iso_refractory;
+}
+
 static void task_running_tick(struct rq *rq, struct task_struct *p)
 {
 	unsigned long debit;
@@ -2999,11 +3033,29 @@ static void task_running_tick(struct rq
 		set_tsk_need_resched(p);
 		return;
 	}
-	/* SCHED_FIFO tasks never run out of timeslice. */
-	if (unlikely(p->policy == SCHED_FIFO))
-		return;

 	spin_lock(&rq->lock);
+	if (unlikely((rt_task(p) || (iso_task(p) && !rq->iso_refractory)) &&
+	    p->mm)) {
+		if (rq->iso_ticks <= (ISO_PERIOD * 100) - 100)
+			rq->iso_ticks += 100;
+	} else
+		rq->iso_ticks = rq->iso_ticks * (ISO_PERIOD - 1) / ISO_PERIOD;
+
+	if (iso_task(p)) {
+		if (unlikely(test_ret_isorefractory(rq))) {
+			if (!(p->flags & PF_ISOREF)) {
+				set_tsk_need_resched(p);
+				p->flags |= PF_ISOREF;
+			}
+		} else
+			p->flags &= ~PF_ISOREF;
+	} else {
+		/* SCHED_FIFO tasks never run out of timeslice. */
+		if (unlikely(p->policy == SCHED_FIFO))
+			goto out_unlock;
+	}
+
 	debit = ns_diff(rq->most_recent_timestamp, p->timestamp);
 	p->ns_debit += debit;
 	if (p->ns_debit < NSJIFFY)
@@ -3122,7 +3174,7 @@ dependent_sleeper(int this_cpu, struct r
 	int ret = 0, i;

 	/* kernel/rt threads do not participate in dependent sleeping */
-	if (!p->mm || rt_task(p))
+	if (!p->mm || rt_task(p) || iso_task(p))
 		return 0;

 	for_each_domain(this_cpu, tmp) {
@@ -3159,7 +3211,7 @@ dependent_sleeper(int this_cpu, struct r
			 * task from using an unfair proportion of the
			 * physical cpu's resources. -ck
			 */
-			if (rt_task(smt_curr)) {
+			if (rt_task(smt_curr) || iso_task(smt_curr)) {
				/*
				 * With real time tasks we run non-rt tasks only
				 * per_cpu_gain% of the time.
@@ -3971,12 +4023,22 @@ static void __setscheduler(struct task_s
 int sched_setscheduler(struct task_struct *p, int policy,
		       struct sched_param *param)
 {
+	struct sched_param zero_param = { .sched_priority = 0 };
 	int queued, retval, oldprio, oldpolicy = -1;
 	unsigned long flags;
 	struct rq *rq;

 	/* may grab non-irq protected spin_locks */
 	BUG_ON(in_interrupt());
+	if (is_rt_policy(policy) && !capable(CAP_SYS_NICE)) {
+		/*
+		 * If the caller requested an RT policy without having the
+		 * necessary rights, we downgrade the policy to SCHED_ISO.
+		 * We also set the parameter to zero to pass the checks.
+		 */
+		policy = SCHED_ISO;
+		param = &zero_param;
+	}
 recheck:
 	/* double check policy once rq lock held */
 	if (policy < 0)
@@ -4501,6 +4563,7 @@ asmlinkage long sys_sched_get_priority_m
 		break;
 	case SCHED_NORMAL:
 	case SCHED_BATCH:
+	case SCHED_ISO:
 		ret = 0;
 		break;
 	}
@@ -4525,6 +4588,7 @@ asmlinkage long sys_sched_get_priority_m
 		break;
 	case SCHED_NORMAL:
 	case SCHED_BATCH:
+	case SCHED_ISO:
 		ret = 0;
 	}
 	return ret;
@@ -6647,7 +6711,8 @@ void __init sched_init(void)
 		rq = cpu_rq(i);
 		spin_lock_init(&rq->lock);
 		lockdep_set_class(&rq->lock, &rq->rq_lock_key);
-		rq->nr_running = rq->cache_ticks = rq->preempted = 0;
+		rq->nr_running = rq->cache_ticks = rq->preempted =
+			rq->iso_ticks = 0;

 #ifdef CONFIG_SMP
 		rq->sd = NULL;
Index: linux-2.6.20-ck1/kernel/sysctl.c
===================================================================
--- linux-2.6.20-ck1.orig/kernel/sysctl.c	2007-02-16 19:01:30.000000000 +1100
+++ linux-2.6.20-ck1/kernel/sysctl.c	2007-02-16 19:01:31.000000000 +1100
@@ -273,6 +273,14 @@ static ctl_table root_table[] = {
 	{ .ctl_name = 0 }
 };

+
+/*
+ * Constants for minimum and maximum testing.
+ * We use these as one-element integer vectors.
+ */
+static int zero;
+static int one_hundred = 100;
+
 static ctl_table kern_table[] = {
 	{
 		.ctl_name	= KERN_OSTYPE,
@@ -692,6 +700,17 @@ static ctl_table kern_table[] = {
 		.mode		= 0644,
 		.proc_handler	= &proc_dointvec,
 	},
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "iso_cpu",
+		.data		= &sched_iso_cpu,
+		.maxlen		= sizeof (int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_minmax,
+		.strategy	= &sysctl_intvec,
+		.extra1		= &zero,
+		.extra2		= &one_hundred,
+	},
 #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86)
 	{
 		.ctl_name	= KERN_UNKNOWN_NMI_PANIC,
@@ -800,12 +819,6 @@ static ctl_table kern_table[] = {
 	{ .ctl_name = 0 }
 };

-/* Constants for minimum and maximum testing in vm_table.
-   We use these as one-element integer vectors. */
-static int zero;
-static int one_hundred = 100;
-
-
 static ctl_table vm_table[] = {
 	{
 		.ctl_name	= VM_OVERCOMMIT_MEMORY,
Index: linux-2.6.20-ck1/Documentation/sysctl/kernel.txt
===================================================================
--- linux-2.6.20-ck1.orig/Documentation/sysctl/kernel.txt	2007-02-16 19:01:30.000000000 +1100
+++ linux-2.6.20-ck1/Documentation/sysctl/kernel.txt	2007-02-16 19:01:31.000000000 +1100
@@ -27,6 +27,7 @@ show up in /proc/sys/kernel:
 - hostname
 - hotplug
 - interactive
+- iso_cpu
 - java-appletviewer           [ binfmt_java, obsolete ]
 - java-interpreter            [ binfmt_java, obsolete ]
 - kstack_depth_to_print       [ X86 only ]
@@ -185,6 +186,14 @@ are obeyed if this tunable is disabled.

 ==============================================================

+iso_cpu:
+
+This sets the percentage cpu that the unprivileged SCHED_ISO tasks can
+run effectively at realtime priority, averaged over a rolling 3 seconds.
+Set to 80% by default.
+
+==============================================================
+
 l2cr: (PPC only)

 This flag controls the L2 cache of G3 processor boards. If
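Again purely illustrative (not part of the patch): a small sketch that reads the new iso_cpu tunable from procfs. The path /proc/sys/kernel/iso_cpu and the 0..100 clamp come from the kernel/sysctl.c entry above; the printed wording and error handling are my own.

/*
 * Read the current SCHED_ISO cpu percentage (illustrative only).
 * The value is clamped to 0..100 by the proc_dointvec_minmax handler
 * registered above, and root can change it with, for example,
 * "echo 70 > /proc/sys/kernel/iso_cpu".
 */
#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/sys/kernel/iso_cpu", "r");
	int iso_cpu;

	if (!f || fscanf(f, "%d", &iso_cpu) != 1) {
		/* Most likely a kernel without this patch applied. */
		perror("/proc/sys/kernel/iso_cpu");
		return 1;
	}
	printf("SCHED_ISO tasks may run as realtime for %d%% of cpu time\n",
	       iso_cpu);
	fclose(f);
	return 0;
}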