Add the SCHED_ISO policy (isochronous) which is a starvation free soft
realtime policy available to unprivileged users. The amount of cpu that
SCHED_ISO tasks will run as realtime is configurable by the tunable in
/proc/sys/kernel/iso_cpu and is set to 80% by default. The duration over
which its cpu usage is averaged is controlled by the tunable
/proc/sys/kernel/iso_period and is set to 5 (seconds) by default.

Signed-off-by: Con Kolivas

 Documentation/sysctl/kernel.txt |   21 +++++++
 include/linux/sched.h           |    8 ++
 kernel/sched.c                  |  115 +++++++++++++++++++++++++++++++++++++---
 kernel/sysctl.c                 |   24 ++++++++
 4 files changed, 160 insertions(+), 8 deletions(-)

Index: linux-2.6.21-ck2/include/linux/sched.h
===================================================================
--- linux-2.6.21-ck2.orig/include/linux/sched.h	2007-05-14 19:30:30.000000000 +1000
+++ linux-2.6.21-ck2/include/linux/sched.h	2007-05-14 19:30:31.000000000 +1000
@@ -34,10 +34,11 @@
 #define SCHED_FIFO		1
 #define SCHED_RR		2
 #define SCHED_BATCH		3
+#define SCHED_ISO		4
 
 #ifdef __KERNEL__
 
-#define SCHED_MAX		SCHED_BATCH
+#define SCHED_MAX		SCHED_ISO
 
 #define SCHED_RANGE(policy)	((policy) <= SCHED_MAX)
 struct sched_param {
@@ -525,15 +526,17 @@ struct signal_struct {
 #define MAX_USER_RT_PRIO	100
 #define MAX_RT_PRIO		MAX_USER_RT_PRIO
 #define PRIO_RANGE		(40)
+#define ISO_PRIO		(MAX_RT_PRIO - 1)
 
 #define MAX_PRIO		(MAX_RT_PRIO + PRIO_RANGE)
 
-#define rt_prio(prio)		unlikely((prio) < MAX_RT_PRIO)
+#define rt_prio(prio)		unlikely((prio) < ISO_PRIO)
 #define rt_task(p)		rt_prio((p)->prio)
 #define batch_task(p)		(unlikely((p)->policy == SCHED_BATCH))
 #define is_rt_policy(policy)	((policy) == SCHED_FIFO || \
 					(policy) == SCHED_RR)
 #define has_rt_policy(p)	unlikely(is_rt_policy((p)->policy))
+#define iso_task(p)		unlikely((p)->policy == SCHED_ISO)
 
 /*
  * Some day this will be a full-fledged user tracking system..
@@ -1166,6 +1169,7 @@ static inline void put_task_struct(struc
 #define PF_SWAPWRITE	0x00800000	/* Allowed to write to swap */
 #define PF_SPREAD_PAGE	0x01000000	/* Spread page cache over cpuset */
 #define PF_SPREAD_SLAB	0x02000000	/* Spread some slab caches over cpuset */
+#define PF_ISOREF	0x04000000	/* SCHED_ISO task has used up quota */
 #define PF_MEMPOLICY	0x10000000	/* Non-default NUMA mempolicy */
 #define PF_MUTEX_TESTER	0x20000000	/* Thread belongs to the rt mutex tester */
 
Index: linux-2.6.21-ck2/kernel/sched.c
===================================================================
--- linux-2.6.21-ck2.orig/kernel/sched.c	2007-05-14 19:30:30.000000000 +1000
+++ linux-2.6.21-ck2/kernel/sched.c	2007-05-14 19:30:31.000000000 +1000
@@ -104,6 +104,18 @@ int rr_interval __read_mostly = 8;
 int sched_interactive __read_mostly = 1;
 
 /*
+ * sched_iso_cpu - sysctl which determines the cpu percentage SCHED_ISO tasks
+ * are allowed to run (over ISO_PERIOD seconds) as real time tasks.
+ * sched_iso_period - sysctl which determines the number of seconds over
+ * which cpu usage of SCHED_ISO tasks is averaged to determine if they are
+ * exceeding their allowable bandwidth.
+*/
+int sched_iso_cpu __read_mostly = 80;
+int sched_iso_period __read_mostly = 5;
+
+#define ISO_PERIOD	((sched_iso_period * HZ) + 1)
+
+/*
  * This contains a bitmap for each dynamic priority level with empty slots
  * for the valid priorities each different nice level can have. It allows
  * us to stagger the slots where differing priorities run in a way that
@@ -200,6 +212,8 @@ struct rq {
 
 	/* How many times we have rotated the priority queue */
 	unsigned long prio_rotation;
+	unsigned long iso_ticks;
+	unsigned short iso_refractory;
 
 	atomic_t nr_iowait;
 
@@ -790,6 +804,11 @@ static inline void update_if_moved(struc
 }
 #endif
 
+static inline int isoprio_suitable(struct task_struct *p)
+{
+	return !(p->flags & PF_ISOREF);
+}
+
 /*
  * recalc_task_prio determines what priority a non rt_task will be
  * queued at. If the task has already been running during this runqueue's
@@ -806,6 +825,25 @@ static void recalc_task_prio(struct task
 	struct prio_array *array = rq->active;
 	int queue_prio;
 
+	if (iso_task(p)) {
+		if (isoprio_suitable(p)) {
+			/*
+			 * If SCHED_ISO tasks have not used up their real time
+			 * quota they have run just better than highest
+			 * SCHED_NORMAL priority. Otherwise they run as
+			 * SCHED_NORMAL.
+			 */
+			p->prio = p->normal_prio = ISO_PRIO;
+			p->array = rq->active;
+			if (p->time_slice <= 0)
+				p->time_slice = p->quota;
+			return;
+		} else if (p->prio == ISO_PRIO) {
+			/* Just about to be demoted to SCHED_NORMAL */
+			p->time_slice = 0;
+		}
+	}
+
 	update_if_moved(p, rq);
 	if (p->rotation == rq->prio_rotation) {
 		if (p->array == array) {
@@ -3180,18 +3218,65 @@ static void task_expired_entitlement(str
 	p->time_slice += overrun;
 }
 
+/*
+ * Test if SCHED_ISO tasks have run longer than their alloted period as RT
+ * tasks and set the refractory flag if necessary. There is 10% hysteresis
+ * for unsetting the flag.
+ */
+static unsigned int test_ret_isorefractory(struct rq *rq)
+{
+	if (likely(!rq->iso_refractory)) {
+		if (rq->iso_ticks / ISO_PERIOD > sched_iso_cpu)
+			rq->iso_refractory = 1;
+	} else {
+		if (rq->iso_ticks / ISO_PERIOD < (sched_iso_cpu * 90 / 100))
+			rq->iso_refractory = 0;
+	}
+	return rq->iso_refractory;
+}
+
+/* No SCHED_ISO task was running so decrease rq->iso_ticks */
+static inline void no_iso_tick(struct rq *rq)
+{
+	rq->iso_ticks = rq->iso_ticks * (ISO_PERIOD - 1) / ISO_PERIOD;
+}
+
 /* This manages tasks that have run out of timeslice during a scheduler_tick */
 static void task_running_tick(struct rq *rq, struct task_struct *p)
 {
+	/*
+	 * If a SCHED_ISO task is running we increment the iso_ticks. In
+	 * order to prevent SCHED_ISO tasks from causing starvation in the
+	 * presence of true RT tasks we account those as iso_ticks as well.
+	 */
+	if ((rt_task(p) || (iso_task(p) && !rq->iso_refractory))) {
+		if (rq->iso_ticks <= (ISO_PERIOD * 100) - 100)
+			rq->iso_ticks += 100;
+	} else
+		no_iso_tick(rq);
+
+	if (iso_task(p)) {
+		if (unlikely(test_ret_isorefractory(rq))) {
+			if (isoprio_suitable(p)) {
+				/*
+				 * SCHED_ISO task is running as RT and limit
+				 * has been hit. Set the PF_ISOREF flag and
+				 * force it to reschedule as SCHED_NORMAL
+				 * by zeroing its time_slice
+				 */
+				p->flags |= PF_ISOREF;
+				p->time_slice = 0;
+			}
+		} else
+			p->flags &= ~PF_ISOREF;
+	}
 	/* SCHED_FIFO tasks never run out of timeslice. */
 	if (p->time_slice > 0 || p->policy == SCHED_FIFO)
 		return;
 
 	/* p->time_slice <= 0 */
-	spin_lock(&rq->lock);
+	set_tsk_need_resched(p);
 	if (likely(task_queued(p)))
 		task_expired_entitlement(rq, p);
-	set_tsk_need_resched(p);
-	spin_unlock(&rq->lock);
 }
 
@@ -3207,8 +3292,12 @@ void scheduler_tick(void)
 
 	update_cpu_clock(p, rq, now, 1);
 
+	spin_lock(&rq->lock);
 	if (p != rq->idle)
 		task_running_tick(rq, p);
+	else
+		no_iso_tick(rq);
+	spin_unlock(&rq->lock);
 #ifdef CONFIG_SMP
 	update_load(rq);
 	if (time_after_eq(jiffies, rq->next_balance))
@@ -3285,7 +3374,8 @@ retry:
 	}
 	queue = array->queue + idx;
 	next = list_entry(queue->next, struct task_struct, run_list);
-	if (unlikely(next->time_slice <= 0)) {
+	if (unlikely(next->time_slice <= 0 && !(iso_task(next) &&
+	    isoprio_suitable(next)))) {
 		/*
 		 * Unlucky enough that this task ran out of time_slice
 		 * before it hit a scheduler_tick so it should have its
@@ -3377,7 +3467,7 @@ need_resched_nonpreemptible:
 	}
 
 	idx = sched_find_first_bit(rq->dyn_bitmap);
-	if (!rt_prio(idx))
+	if (likely(idx > ISO_PRIO))
 		next = next_dynamic_task(rq, idx);
 	else {
 		queue = rq->active->queue + idx;
@@ -4042,12 +4132,22 @@ static void __setscheduler(struct task_s
 int sched_setscheduler(struct task_struct *p, int policy,
 		       struct sched_param *param)
 {
+	struct sched_param zero_param = { .sched_priority = 0 };
 	int queued, retval, oldprio, oldpolicy = -1;
 	unsigned long flags;
 	struct rq *rq;
 
 	/* may grab non-irq protected spin_locks */
 	BUG_ON(in_interrupt());
+	if (is_rt_policy(policy) && !capable(CAP_SYS_NICE)) {
+		/*
+		 * If the caller requested an RT policy without having the
+		 * necessary rights, we downgrade the policy to SCHED_ISO.
+		 * We also set the parameter to zero to pass the checks.
+		 */
+		policy = SCHED_ISO;
+		param = &zero_param;
+	}
 recheck:
 	/* double check policy once rq lock held */
 	if (policy < 0)
@@ -4577,6 +4677,7 @@ asmlinkage long sys_sched_get_priority_m
 		break;
 	case SCHED_NORMAL:
 	case SCHED_BATCH:
+	case SCHED_ISO:
 		ret = 0;
 		break;
 	}
@@ -4601,6 +4702,7 @@ asmlinkage long sys_sched_get_priority_m
 		break;
 	case SCHED_NORMAL:
 	case SCHED_BATCH:
+	case SCHED_ISO:
 		ret = 0;
 	}
 	return ret;
@@ -6708,6 +6810,7 @@ void __init sched_init(void)
 		rq = cpu_rq(i);
 		spin_lock_init(&rq->lock);
 		lockdep_set_class(&rq->lock, &rq->rq_lock_key);
+		rq->iso_ticks = 0;
 		rq->nr_running = 0;
 		rq->prio_rotation = 0;
 		rq->active = rq->arrays;
@@ -6801,7 +6904,7 @@ void normalize_rt_tasks(void)
 
 	read_lock_irq(&tasklist_lock);
 	for_each_process(p) {
-		if (!rt_task(p))
+		if (!rt_task(p) && !iso_task(p))
 			continue;
 
 		spin_lock_irqsave(&p->pi_lock, flags);
Index: linux-2.6.21-ck2/Documentation/sysctl/kernel.txt
===================================================================
--- linux-2.6.21-ck2.orig/Documentation/sysctl/kernel.txt	2007-05-14 19:30:30.000000000 +1000
+++ linux-2.6.21-ck2/Documentation/sysctl/kernel.txt	2007-05-14 19:30:31.000000000 +1000
@@ -26,6 +26,8 @@ show up in /proc/sys/kernel:
 - hostname
 - hotplug
 - interactive
+- iso_cpu
+- iso_period
 - java-appletviewer           [ binfmt_java, obsolete ]
 - java-interpreter            [ binfmt_java, obsolete ]
 - kstack_depth_to_print       [ X86 only ]
@@ -181,6 +183,25 @@ Default value is 1 (enabled).
 
 ==============================================================
 
+iso_cpu:
+
+This sets the percentage cpu that the unprivileged SCHED_ISO tasks can
+run effectively at realtime priority, averaged over a rolling iso_period
+seconds.
+
+Set to 80 (percent) by default.
+
+==============================================================
+
+iso_period:
+
+This sets the number of seconds over which SCHED_ISO cpu usage is averaged
+to see if it exceeds its allocated cpu bandwidth.
+
+Set to 5 (seconds) by default.
+
+==============================================================
+
 l2cr: (PPC only)
 
 This flag controls the L2 cache of G3 processor boards. If
Index: linux-2.6.21-ck2/kernel/sysctl.c
===================================================================
--- linux-2.6.21-ck2.orig/kernel/sysctl.c	2007-05-14 19:30:30.000000000 +1000
+++ linux-2.6.21-ck2/kernel/sysctl.c	2007-05-14 19:30:31.000000000 +1000
@@ -78,6 +78,8 @@ extern int percpu_pagelist_fraction;
 extern int compat_log;
 extern int rr_interval;
 extern int sched_interactive;
+extern int sched_iso_cpu;
+extern int sched_iso_period;
 
 /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */
 static int maxolduid = 65535;
@@ -528,6 +530,28 @@ static ctl_table kern_table[] = {
 		.mode		= 0644,
 		.proc_handler	= &proc_dointvec,
 	},
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "iso_cpu",
+		.data		= &sched_iso_cpu,
+		.maxlen		= sizeof (int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_minmax,
+		.strategy	= &sysctl_intvec,
+		.extra1		= &zero,
+		.extra2		= &one_hundred,
+	},
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "iso_period",
+		.data		= &sched_iso_period,
+		.maxlen		= sizeof (int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_minmax,
+		.strategy	= &sysctl_intvec,
+		.extra1		= &one,
+		.extra2		= &one_hundred,
+	},
 #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86)
 	{
 		.ctl_name	= KERN_UNKNOWN_NMI_PANIC,
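
For illustration only, and not part of the patch: a minimal userspace sketch
of how an unprivileged task could request SCHED_ISO once this patch is
applied. The SCHED_ISO value of 4 is assumed to match the sched.h hunk above,
since userspace headers do not define it.

/*
 * Usage sketch only (not part of the patch): put the calling task into
 * SCHED_ISO. SCHED_ISO is assumed to be 4, matching the value added to
 * include/linux/sched.h above; glibc's <sched.h> does not provide it.
 */
#include <sched.h>
#include <stdio.h>
#include <string.h>
#include <errno.h>

#ifndef SCHED_ISO
#define SCHED_ISO 4
#endif

int main(void)
{
	/* Non-realtime policies require a static priority of 0. */
	struct sched_param sp = { .sched_priority = 0 };

	if (sched_setscheduler(0, SCHED_ISO, &sp) == -1) {
		fprintf(stderr, "sched_setscheduler: %s\n", strerror(errno));
		return 1;
	}

	/*
	 * The task now runs at ISO_PRIO (just below true RT priorities)
	 * until it exceeds iso_cpu percent of cpu averaged over iso_period
	 * seconds, after which it falls back to SCHED_NORMAL until its
	 * usage drops back under the limit (with 10% hysteresis).
	 */
	return 0;
}

Note that, per the sched_setscheduler() hunk above, an unprivileged request
for SCHED_FIFO or SCHED_RR is downgraded to SCHED_ISO rather than rejected,
so existing soft realtime applications pick up this behaviour unmodified. The
available bandwidth is tuned through /proc/sys/kernel/iso_cpu and
/proc/sys/kernel/iso_period as described in the kernel.txt hunk.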