Add the SCHED_IDLEPRIO scheduling policy. Tasks set to this policy are
given cpu time only if no other task at all wishes to have cpu time, thus
running effectively at idle priority. If they hold semaphores or mutexes,
or the system is going into suspend, they are scheduled as SCHED_NORMAL
nice 19 so that they cannot hold up other tasks or prevent the freezer
from completing.

Signed-off-by: Con Kolivas

---
 include/linux/sched.h |    5 +
 kernel/sched.c        |  145 ++++++++++++++++++++++++++++++++++++++++++++----
 2 files changed, 138 insertions(+), 12 deletions(-)
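For testing, something like the following can be used from userspace. This
is an illustrative sketch only, not part of the patch: SCHED_IDLEPRIO is
not known to glibc headers, so the policy number (5, from the sched.h hunk
below) is supplied by hand, and the file name idletest.c is made up.

/* idletest.c - run a busy loop at idle priority. Illustrative only. */
#include <errno.h>
#include <sched.h>
#include <stdio.h>
#include <string.h>

#ifndef SCHED_IDLEPRIO
#define SCHED_IDLEPRIO 5        /* value from the sched.h hunk below */
#endif

int main(void)
{
        struct sched_param param = { .sched_priority = 0 };

        if (sched_setscheduler(0, SCHED_IDLEPRIO, &param) == -1) {
                /* Fails with EINVAL on kernels without this patch
                 * (policy 5 is outside SCHED_RANGE there) and for
                 * kernel threads. */
                fprintf(stderr, "sched_setscheduler: %s\n", strerror(errno));
                return 1;
        }

        /* Spin; this should only consume cpu when nothing else runs. */
        for (;;)
                ;
}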
Index: linux-2.6.21-ck2/include/linux/sched.h
===================================================================
--- linux-2.6.21-ck2.orig/include/linux/sched.h	2007-05-14 19:30:31.000000000 +1000
+++ linux-2.6.21-ck2/include/linux/sched.h	2007-05-14 19:30:31.000000000 +1000
@@ -35,10 +35,11 @@
 #define SCHED_RR                2
 #define SCHED_BATCH             3
 #define SCHED_ISO               4
+#define SCHED_IDLEPRIO          5
 
 #ifdef __KERNEL__
 
-#define SCHED_MAX               SCHED_ISO
+#define SCHED_MAX               SCHED_IDLEPRIO
 #define SCHED_RANGE(policy)     ((policy) <= SCHED_MAX)
 
 struct sched_param {
@@ -537,6 +538,7 @@ struct signal_struct {
 			(policy) == SCHED_RR)
 #define has_rt_policy(p)        unlikely(is_rt_policy((p)->policy))
 #define iso_task(p)             unlikely((p)->policy == SCHED_ISO)
+#define idleprio_task(p)        unlikely((p)->policy == SCHED_IDLEPRIO)
 
 /*
  * Some day this will be a full-fledged user tracking system..
@@ -1173,6 +1175,7 @@ static inline void put_task_struct(struc
 #define PF_ISOREF       0x04000000      /* SCHED_ISO task has used up quota */
 #define PF_MEMPOLICY    0x10000000      /* Non-default NUMA mempolicy */
 #define PF_MUTEX_TESTER 0x20000000      /* Thread belongs to the rt mutex tester */
+#define PF_NONSLEEP     0x40000000      /* Waiting on in-kernel activity */
 
 /*
  * Only the _current_ task can read/write to tsk->flags, but other
Index: linux-2.6.21-ck2/kernel/sched.c
===================================================================
--- linux-2.6.21-ck2.orig/kernel/sched.c	2007-05-14 19:30:31.000000000 +1000
+++ linux-2.6.21-ck2/kernel/sched.c	2007-05-14 19:30:31.000000000 +1000
@@ -144,7 +144,7 @@ struct rq;
  */
 struct prio_array {
 	/* Tasks queued at each priority */
-	struct list_head queue[MAX_PRIO];
+	struct list_head queue[MAX_PRIO + 1];
 
 	/*
 	 * The bitmap of priorities queued for this array. While the expired
@@ -201,7 +201,7 @@ struct rq {
 	unsigned long next_balance;
 	struct mm_struct *prev_mm;
 
-	struct prio_array *active, *expired, arrays[2];
+	struct prio_array *active, *expired, *idleprio, arrays[2];
 	unsigned long *dyn_bitmap, *exp_bitmap;
 
 	/*
@@ -215,6 +215,8 @@ struct rq {
 	unsigned long iso_ticks;
 	unsigned short iso_refractory;
 
+	/* Number of idleprio tasks running */
+	unsigned long nr_idleprio;
 	atomic_t nr_iowait;
 
 #ifdef CONFIG_SMP
@@ -652,6 +654,17 @@ sched_info_switch(struct task_struct *pr
 #define sched_info_switch(t, next)	do { } while (0)
 #endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */
 
+static int idleprio_suitable(struct task_struct *p)
+{
+	return (!p->mutexes_held && !freezing(p) && !signal_pending(p) &&
+		!(p->flags & (PF_NONSLEEP | PF_EXITING)));
+}
+
+static int idleprio(const struct task_struct *p)
+{
+	return (p->prio == MAX_PRIO);
+}
+
 static inline int task_queued(struct task_struct *task)
 {
 	return !list_empty(&task->run_list);
@@ -668,7 +681,9 @@ static inline void set_dynamic_bit(struc
 static void dequeue_task(struct task_struct *p, struct rq *rq)
 {
 	list_del_init(&p->run_list);
-	if (list_empty(p->array->queue + p->prio))
+	if (idleprio_task(p) && idleprio(p))
+		rq->nr_idleprio--;
+	else if (list_empty(p->array->queue + p->prio))
 		__clear_bit(p->prio, p->array->prio_bitmap);
 }
 
@@ -809,6 +824,8 @@ static inline int isoprio_suitable(struc
 	return !(p->flags & PF_ISOREF);
 }
 
+static int task_timeslice(struct task_struct *p);
+
 /*
  * recalc_task_prio determines what priority a non rt_task will be
  * queued at. If the task has already been running during this runqueue's
@@ -842,6 +859,30 @@ static void recalc_task_prio(struct task
 			/* Just about to be demoted to SCHED_NORMAL */
 			p->time_slice = 0;
 		}
+	} else if (idleprio_task(p)) {
+		if (idleprio_suitable(p)) {
+			/*
+			 * If suitable, idleprio tasks are queued at MAX_PRIO
+			 * on the idleprio array only. Their time_slice is
+			 * their full task_timeslice as they cooperatively
+			 * multitask.
+			 */
+			p->prio = p->normal_prio = MAX_PRIO;
+			p->array = rq->idleprio;
+			if (p->time_slice <= 0)
+				p->time_slice = task_timeslice(p);
+			return;
+		}
+		/*
+		 * If unsuitable, idleprio tasks are queued as the equivalent
+		 * of nice 19 tasks on the expired array.
+		 */
+		p->flags &= ~PF_NONSLEEP;
+		p->prio = p->normal_prio = MAX_PRIO - 1;
+		p->array = rq->expired;
+		if (p->time_slice <= 0 || p->time_slice > p->quota)
+			p->time_slice = p->quota;
+		return;
 	}
 
 	update_if_moved(p, rq);
@@ -878,6 +919,8 @@ static inline void __enqueue_task(struct
 	else
 		recalc_task_prio(p, rq);
 
+	if (idleprio_task(p) && idleprio(p))
+		rq->nr_idleprio++;
 	sched_info_queued(p);
 	set_dynamic_bit(p, rq);
 }
@@ -942,6 +985,8 @@ static int task_timeslice(struct task_st
 
 static void set_load_weight(struct task_struct *p)
 {
+	int load_weight;
+
 	if (has_rt_policy(p)) {
 #ifdef CONFIG_SMP
 		if (p == task_rq(p)->migration_thread)
@@ -950,12 +995,19 @@ static void set_load_weight(struct task_
 			 * Giving its load any weight will skew balancing
 			 * adversely.
 			 */
-			p->load_weight = 0;
+			load_weight = 0;
 		else
 #endif
-			p->load_weight = RTPRIO_TO_LOAD_WEIGHT(p->rt_priority);
+			load_weight = RTPRIO_TO_LOAD_WEIGHT(p->rt_priority);
 	} else
-		p->load_weight = task_timeslice(p);
+		load_weight = task_timeslice(p);
+	/*
+	 * idleprio tasks have a much lower weight than SCHED_NORMAL tasks but
+	 * still need to be weighted to allow balancing to occur.
+	 */
+	if (likely(!idleprio_task(p)))
+		load_weight *= PRIO_RANGE;
+	p->load_weight = load_weight;
 }
 
 static inline void
@@ -1653,6 +1705,14 @@ out_activate:
 out_running:
 	p->state = TASK_RUNNING;
 out:
+	/*
+	 * Special case: when freezing, we need to reschedule idleprio tasks
+	 * as SCHED_NORMAL or else they'll never freeze.
+	 */
+	if (idleprio_task(p) && freezing(p) && idleprio(p)) {
+		dequeue_task(p, rq);
+		enqueue_task(p, rq);
+	}
 	task_rq_unlock(rq, &flags);
 
 	return success;
@@ -2263,7 +2323,9 @@ skip_bitmap:
 		idx = sched_find_first_bit(array->prio_bitmap);
 	else
 		idx = find_next_bit(array->prio_bitmap, MAX_PRIO, idx);
-	if (idx >= MAX_PRIO) {
+	if (idx == MAX_PRIO) {
+		if (array == busiest->idleprio && busiest->nr_idleprio)
+			goto found_idleprio;
 		if (array == busiest->expired) {
 			array = busiest->active;
 			goto new_array;
@@ -2271,6 +2333,7 @@ skip_bitmap:
 		goto out;
 	}
 
+found_idleprio:
 	head = array->queue + idx;
 	curr = head->prev;
 skip_queue:
@@ -2292,6 +2355,17 @@ skip_queue:
 	best_prio_seen |= idx == best_prio;
 	if (curr != head)
 		goto skip_queue;
+	if (idx == MAX_PRIO) {
+		/*
+		 * Occurs either when balancing idleprio tasks or when
+		 * there really are no more tasks to find.
+		 */
+		if (array == busiest->expired) {
+			array = busiest->active;
+			goto new_array;
+		}
+		goto out;
+	}
 	idx++;
 	goto skip_bitmap;
 }
@@ -2309,6 +2383,13 @@ skip_queue:
 		this_best_prio = idx;
 	if (curr != head)
 		goto skip_queue;
+	if (idx == MAX_PRIO) {
+		if (array == busiest->expired) {
+			array = busiest->active;
+			goto new_array;
+		}
+		goto out;
+	}
 	idx++;
 	goto skip_bitmap;
 }
@@ -3136,7 +3217,7 @@ void account_user_time(struct task_struc
 
 	/* Add user time to cpustat. */
 	tmp = cputime_to_cputime64(cputime);
-	if (TASK_NICE(p) > 0)
+	if (TASK_NICE(p) > 0 || idleprio_task(p))
 		cpustat->nice = cputime64_add(cpustat->nice, tmp);
 	else
 		cpustat->user = cputime64_add(cpustat->user, tmp);
@@ -3351,6 +3432,28 @@ static void reset_prio_levels(struct rq
 }
 
 /*
+ * The only tasks queued are SCHED_IDLEPRIO. Set the active array to the
+ * idleprio array if it isn't already active.
+ */
+static struct task_struct *next_idleprio_task(struct rq *rq)
+{
+	struct prio_array *array = rq->active;
+	struct list_head *queue;
+
+	if (array != rq->idleprio) {
+		rq->active = rq->idleprio;
+		rq->expired = array;
+		array = rq->active;
+		rq->exp_bitmap = rq->expired->prio_bitmap;
+		rq->dyn_bitmap = rq->active->prio_bitmap;
+	}
+	rq->prio_rotation++;
+	reset_prio_levels(rq);
+	queue = array->queue + MAX_PRIO;
+	return list_entry(queue->next, struct task_struct, run_list);
+}
+
+/*
  * next_dynamic_task finds the next suitable dynamic task.
  */
 static inline struct task_struct *next_dynamic_task(struct rq *rq, int idx)
@@ -3361,6 +3464,8 @@ static inline struct task_struct *next_d
 	int nstatic;
 
 retry:
+	if (unlikely(rq->nr_running == rq->nr_idleprio))
+		return next_idleprio_task(rq);
 	if (idx >= MAX_PRIO) {
 		/* There are no more tasks in the active array. Swap arrays */
 		array = rq->expired;
@@ -3451,8 +3556,10 @@ need_resched_nonpreemptible:
 				unlikely(signal_pending(prev))))
 			prev->state = TASK_RUNNING;
 		else {
-			if (prev->state == TASK_UNINTERRUPTIBLE)
+			if (prev->state == TASK_UNINTERRUPTIBLE) {
+				prev->flags |= PF_NONSLEEP;
 				rq->nr_uninterruptible++;
+			}
 			deactivate_task(prev, rq);
 		}
 	}
@@ -3994,7 +4101,8 @@ void set_user_nice(struct task_struct *p
 	 * If the task increased its priority or is running and
 	 * lowered its priority, then reschedule its CPU:
 	 */
-	if (delta < 0 || (delta > 0 && task_running(rq, p)))
+	if (delta < 0 || ((delta > 0 || idleprio_task(p)) &&
+	    task_running(rq, p)))
 		resched_task(rq->curr);
 }
 out_unlock:
@@ -4195,6 +4303,11 @@ recheck:
 			return -EPERM;
 	}
 
+	if (!(p->mm) && policy == SCHED_IDLEPRIO) {
+		/* Don't allow kernel threads to be SCHED_IDLEPRIO. */
+		return -EINVAL;
+	}
+
 	retval = security_task_setscheduler(p, policy, param);
 	if (retval)
 		return retval;
@@ -4520,12 +4633,18 @@ asmlinkage long sys_sched_yield(void)
 		struct prio_array *old_array = p->array;
 		int old_prio = p->prio;
 
+		if (idleprio_task(p)) {
+			dequeue_task(p, rq);
+			enqueue_task(p, rq);
+			goto out_release;
+		}
 		/* p->prio will be updated in requeue_task via queue_expired */
 		if (!rt_task(p))
 			p->array = rq->expired;
 		requeue_task(p, rq, old_array, old_prio);
 	}
 
+out_release:
 	/*
 	 * Since we are going to call schedule() anyway, there's
 	 * no need to preempt or enable interrupts:
@@ -4678,6 +4797,7 @@ asmlinkage long sys_sched_get_priority_m
 	case SCHED_NORMAL:
 	case SCHED_BATCH:
 	case SCHED_ISO:
+	case SCHED_IDLEPRIO:
 		ret = 0;
 		break;
 	}
@@ -4703,6 +4823,7 @@ asmlinkage long sys_sched_get_priority_m
 	case SCHED_NORMAL:
 	case SCHED_BATCH:
 	case SCHED_ISO:
+	case SCHED_IDLEPRIO:
 		ret = 0;
 	}
 	return ret;
@@ -6812,8 +6933,10 @@ void __init sched_init(void)
 		lockdep_set_class(&rq->lock, &rq->rq_lock_key);
 		rq->iso_ticks = 0;
 		rq->nr_running = 0;
+		rq->nr_idleprio = 0;
 		rq->prio_rotation = 0;
 		rq->active = rq->arrays;
+		rq->idleprio = rq->active;
 		rq->expired = rq->arrays + 1;
 		reset_prio_levels(rq);
 		rq->dyn_bitmap = rq->active->prio_bitmap;
@@ -6836,7 +6959,7 @@
 
 		for (j = 0; j < 2; j++) {
 			array = rq->arrays + j;
-			for (k = 0; k < MAX_PRIO; k++)
+			for (k = 0; k <= MAX_PRIO; k++)
 				INIT_LIST_HEAD(array->queue + k);
 			bitmap_zero(array->prio_bitmap, MAX_PRIO);
 			/* delimiter for bitsearch */
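Two behavioural notes that may not be obvious from the hunks above.
Demotion of an unsuitable idleprio task (mutexes held, freezing, signal
pending, PF_NONSLEEP) changes only its effective priority and array;
p->policy is never rewritten, so userspace continues to see
SCHED_IDLEPRIO and the task drops back to idle priority as soon as it
becomes suitable again. Also, account_user_time() charges all idleprio
cpu time to the nice field of cpustat, so idleprio usage shows up as
"ni" time in top(1) and vmstat. A hypothetical readback sketch
(idlecheck.c, illustrative only, not part of the patch):

/* idlecheck.c - report whether a pid's policy is SCHED_IDLEPRIO.
 * The policy is reported even while the task is temporarily being
 * scheduled as nice 19, since demotion never rewrites p->policy.
 */
#include <sched.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

#ifndef SCHED_IDLEPRIO
#define SCHED_IDLEPRIO 5        /* value from the sched.h hunk above */
#endif

int main(int argc, char **argv)
{
        pid_t pid = argc > 1 ? atoi(argv[1]) : getpid();
        int policy = sched_getscheduler(pid);

        if (policy == -1) {
                perror("sched_getscheduler");
                return 1;
        }
        printf("pid %d: policy %d%s\n", (int)pid, policy,
               policy == SCHED_IDLEPRIO ? " (SCHED_IDLEPRIO)" : "");
        return 0;
}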