Magellan Linux

Annotation of /trunk/kernel26-magellan/patches-2.6.17-r6/0003-2.6.17-smpnice-staircase-16.patch



Revision 105
Sun Mar 11 16:17:56 2007 UTC by niro
File size: 52028 bytes
2.6.17-magellan-r6

1 niro 105 Implement the "staircase" hybrid foreground-background single priority
2     array cpu scheduler policy.
3    
4     Signed-off-by: Con Kolivas <kernel@kolivas.org>
5    
6     fs/proc/array.c | 4
7     include/linux/sched.h | 21 -
8     kernel/exit.c | 1
9     kernel/sched.c | 1015 ++++++++++++++++++--------------------------------
10     4 files changed, 378 insertions(+), 663 deletions(-)
11    
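For orientation, here is a standalone sketch (not part of the patch; demo_task and main() are illustrative stand-ins) of the staircase priority arithmetic that effective_prio(), slice() and rr_interval() implement in the kernel/sched.c hunks below, using the same constants and assuming HZ=1000.

/*
 * Illustrative userspace sketch of the staircase priority arithmetic
 * from the kernel/sched.c hunks below. demo_task is a simplified
 * stand-in for task_struct; constants mirror the patch (HZ assumed 1000).
 */
#include <stdio.h>

#define HZ		1000
#define MAX_RT_PRIO	100
#define MAX_PRIO	(MAX_RT_PRIO + 40)
#define MIN_USER_PRIO	(MAX_PRIO - 1)
#define RR_INTERVAL	((6 * HZ / 1001) + 1)

struct demo_task {
	int nice;		/* -20 .. 19 */
	int bonus;		/* accrued interactivity bonus */
	unsigned int slice;	/* jiffies of slice remaining */
};

static int user_prio(const struct demo_task *p)
{
	return p->nice + 20;			/* TASK_USER_PRIO() */
}

static unsigned int rr_interval(const struct demo_task *p)
{
	if (p->nice < 0)
		return RR_INTERVAL * (20 - p->nice) / 20;
	return RR_INTERVAL;
}

static unsigned int slice(const struct demo_task *p)
{
	unsigned int rr = rr_interval(p);

	return rr + (39 - user_prio(p)) * rr;
}

/*
 * Dynamic priority: start at MAX_RT_PRIO + user_prio - bonus, then step
 * down one priority for every rr_interval of slice already consumed,
 * never dropping below MIN_USER_PRIO.
 */
static int effective_prio(const struct demo_task *p)
{
	unsigned int full = slice(p);
	unsigned int used = full > p->slice ? full - p->slice : 0;
	int prio = MAX_RT_PRIO + user_prio(p) - p->bonus;

	prio += used / rr_interval(p);
	return prio > MIN_USER_PRIO ? MIN_USER_PRIO : prio;
}

int main(void)
{
	struct demo_task t = { .nice = 0, .bonus = 5, .slice = 0 };

	t.slice = slice(&t);
	printf("nice %d, bonus %d: starts at prio %d with a %u jiffy slice\n",
	       t.nice, t.bonus, effective_prio(&t), t.slice);
	return 0;
}

The higher a task's accrued bonus, the higher the "stair" it starts on; each rr_interval of CPU used walks it one step back down.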
12     Index: linux-ck-dev/fs/proc/array.c
13     ===================================================================
14     --- linux-ck-dev.orig/fs/proc/array.c 2006-06-18 15:20:15.000000000 +1000
15     +++ linux-ck-dev/fs/proc/array.c 2006-06-18 15:21:50.000000000 +1000
16     @@ -165,7 +165,7 @@ static inline char * task_state(struct t
17     read_lock(&tasklist_lock);
18     buffer += sprintf(buffer,
19     "State:\t%s\n"
20     - "SleepAVG:\t%lu%%\n"
21     + "Bonus:\t%d\n"
22     "Tgid:\t%d\n"
23     "Pid:\t%d\n"
24     "PPid:\t%d\n"
25     @@ -173,7 +173,7 @@ static inline char * task_state(struct t
26     "Uid:\t%d\t%d\t%d\t%d\n"
27     "Gid:\t%d\t%d\t%d\t%d\n",
28     get_task_state(p),
29     - (p->sleep_avg/1024)*100/(1020000000/1024),
30     + p->bonus,
31     p->tgid,
32     p->pid, pid_alive(p) ? p->group_leader->real_parent->tgid : 0,
33     pid_alive(p) && p->ptrace ? p->parent->pid : 0,
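With the hunk above applied, /proc/<pid>/status reports a Bonus field where SleepAVG used to be. A minimal reader, assuming a kernel carrying this patch (the file path and field format come straight from the hunk), could look like:

/*
 * Minimal sketch: read the "Bonus:" field that the fs/proc/array.c hunk
 * above adds to /proc/<pid>/status. Only meaningful on a kernel that
 * carries this patch; stock kernels print SleepAVG instead.
 */
#include <stdio.h>

int main(void)
{
	char line[256];
	FILE *f = fopen("/proc/self/status", "r");

	if (!f) {
		perror("fopen");
		return 1;
	}
	while (fgets(line, sizeof(line), f)) {
		int bonus;

		if (sscanf(line, "Bonus:\t%d", &bonus) == 1) {
			printf("current bonus: %d\n", bonus);
			break;
		}
	}
	fclose(f);
	return 0;
}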
34     Index: linux-ck-dev/include/linux/sched.h
35     ===================================================================
36     --- linux-ck-dev.orig/include/linux/sched.h 2006-06-18 15:21:31.000000000 +1000
37     +++ linux-ck-dev/include/linux/sched.h 2006-06-18 15:21:50.000000000 +1000
38     @@ -483,6 +483,7 @@ struct signal_struct {
39     #define MAX_RT_PRIO MAX_USER_RT_PRIO
40    
41     #define MAX_PRIO (MAX_RT_PRIO + 40)
42     +#define MIN_USER_PRIO (MAX_PRIO - 1)
43    
44     #define rt_task(p) (unlikely((p)->prio < MAX_RT_PRIO))
45     #define batch_task(p) (unlikely((p)->policy == SCHED_BATCH))
46     @@ -518,7 +519,6 @@ extern struct user_struct *find_user(uid
47     extern struct user_struct root_user;
48     #define INIT_USER (&root_user)
49    
50     -typedef struct prio_array prio_array_t;
51     struct backing_dev_info;
52     struct reclaim_state;
53    
54     @@ -687,13 +687,6 @@ struct audit_context; /* See audit.c */
55     struct mempolicy;
56     struct pipe_inode_info;
57    
58     -enum sleep_type {
59     - SLEEP_NORMAL,
60     - SLEEP_NONINTERACTIVE,
61     - SLEEP_INTERACTIVE,
62     - SLEEP_INTERRUPTED,
63     -};
64     -
65     struct task_struct {
66     volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */
67     struct thread_info *thread_info;
68     @@ -711,19 +704,18 @@ struct task_struct {
69     int load_weight; /* for niceness load balancing purposes */
70     int prio, static_prio;
71     struct list_head run_list;
72     - prio_array_t *array;
73    
74     unsigned short ioprio;
75     unsigned int btrace_seq;
76    
77     - unsigned long sleep_avg;
78     - unsigned long long timestamp, last_ran;
79     + unsigned long long timestamp;
80     + unsigned long runtime, totalrun, ns_debit, systime;
81     + unsigned int bonus;
82     + unsigned int slice, time_slice;
83     unsigned long long sched_time; /* sched_clock time spent running */
84     - enum sleep_type sleep_type;
85    
86     unsigned long policy;
87     cpumask_t cpus_allowed;
88     - unsigned int time_slice, first_time_slice;
89    
90     #ifdef CONFIG_SCHEDSTATS
91     struct sched_info sched_info;
92     @@ -952,6 +944,8 @@ static inline void put_task_struct(struc
93     #define PF_SPREAD_PAGE 0x04000000 /* Spread page cache over cpuset */
94     #define PF_SPREAD_SLAB 0x08000000 /* Spread some slab caches over cpuset */
95     #define PF_MEMPOLICY 0x10000000 /* Non-default NUMA mempolicy */
96     +#define PF_NONSLEEP 0x20000000 /* Waiting on in kernel activity */
97     +#define PF_FORKED 0x40000000 /* Task just forked another process */
98    
99     /*
100     * Only the _current_ task can read/write to tsk->flags, but other
101     @@ -1073,7 +1067,6 @@ extern void FASTCALL(wake_up_new_task(st
102     static inline void kick_process(struct task_struct *tsk) { }
103     #endif
104     extern void FASTCALL(sched_fork(task_t * p, int clone_flags));
105     -extern void FASTCALL(sched_exit(task_t * p));
106    
107     extern int in_group_p(gid_t);
108     extern int in_egroup_p(gid_t);
109     Index: linux-ck-dev/kernel/exit.c
110     ===================================================================
111     --- linux-ck-dev.orig/kernel/exit.c 2006-06-18 15:21:00.000000000 +1000
112     +++ linux-ck-dev/kernel/exit.c 2006-06-18 15:21:50.000000000 +1000
113     @@ -170,7 +170,6 @@ repeat:
114     zap_leader = (leader->exit_signal == -1);
115     }
116    
117     - sched_exit(p);
118     write_unlock_irq(&tasklist_lock);
119     spin_unlock(&p->proc_lock);
120     proc_pid_flush(proc_dentry);
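The kernel/sched.c changes that follow lean on ns_diff()/longlimit() to turn 64-bit nanosecond clock deltas into values that still fit an unsigned long on 32-bit machines. A standalone sketch of that clamping (the 2^31 cap and the backwards-clock fallback are taken from the patch; GCC's __SIZEOF_LONG__ stands in for the kernel's BITS_PER_LONG):

/*
 * Sketch of the ns_diff()/longlimit() clamping from the kernel/sched.c
 * hunks below: 64-bit nanosecond deltas are capped at 2^31 when
 * unsigned long is 32 bits, and a backwards clock yields 1 ns.
 */
#include <stdio.h>

static unsigned long ns_diff(unsigned long long v1, unsigned long long v2)
{
	unsigned long long vdiff;

	if (v1 >= v2) {
		vdiff = v1 - v2;
#if __SIZEOF_LONG__ < 8
		if (vdiff > (1ULL << 31))
			vdiff = 1ULL << 31;
#endif
	} else {
		/* The clock can appear to step backwards; report 1 ns. */
		vdiff = 1;
	}
	return (unsigned long)vdiff;
}

int main(void)
{
	/* ~4.3 s in ns: clamped to 2^31 on 32-bit, returned as-is on 64-bit. */
	printf("%lu\n", ns_diff(4300000000ULL, 0));
	printf("%lu\n", ns_diff(100, 200));	/* backwards clock -> 1 */
	return 0;
}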
121     Index: linux-ck-dev/kernel/sched.c
122     ===================================================================
123     --- linux-ck-dev.orig/kernel/sched.c 2006-06-18 15:21:45.000000000 +1000
124     +++ linux-ck-dev/kernel/sched.c 2006-06-18 15:22:27.000000000 +1000
125     @@ -16,6 +16,9 @@
126     * by Davide Libenzi, preemptible kernel bits by Robert Love.
127     * 2003-09-03 Interactivity tuning by Con Kolivas.
128     * 2004-04-02 Scheduler domains code by Nick Piggin
129     + * 2006-06-18 Staircase scheduling policy by Con Kolivas with help
130     + * from William Lee Irwin III, Zwane Mwaikambo & Peter Williams.
131     + * Staircase v16
132     */
133    
134     #include <linux/mm.h>
135     @@ -75,131 +78,27 @@
136     /*
137     * Some helpers for converting nanosecond timing to jiffy resolution
138     */
139     -#define NS_TO_JIFFIES(TIME) ((TIME) / (1000000000 / HZ))
140     -#define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ))
141     -
142     -/*
143     - * These are the 'tuning knobs' of the scheduler:
144     - *
145     - * Minimum timeslice is 5 msecs (or 1 jiffy, whichever is larger),
146     - * default timeslice is 100 msecs, maximum timeslice is 800 msecs.
147     - * Timeslices get refilled after they expire.
148     - */
149     -#define MIN_TIMESLICE max(5 * HZ / 1000, 1)
150     -#define DEF_TIMESLICE (100 * HZ / 1000)
151     -#define ON_RUNQUEUE_WEIGHT 30
152     -#define CHILD_PENALTY 95
153     -#define PARENT_PENALTY 100
154     -#define EXIT_WEIGHT 3
155     -#define PRIO_BONUS_RATIO 25
156     -#define MAX_BONUS (MAX_USER_PRIO * PRIO_BONUS_RATIO / 100)
157     -#define INTERACTIVE_DELTA 2
158     -#define MAX_SLEEP_AVG (DEF_TIMESLICE * MAX_BONUS)
159     -#define STARVATION_LIMIT (MAX_SLEEP_AVG)
160     -#define NS_MAX_SLEEP_AVG (JIFFIES_TO_NS(MAX_SLEEP_AVG))
161     -
162     -/*
163     - * If a task is 'interactive' then we reinsert it in the active
164     - * array after it has expired its current timeslice. (it will not
165     - * continue to run immediately, it will still roundrobin with
166     - * other interactive tasks.)
167     - *
168     - * This part scales the interactivity limit depending on niceness.
169     - *
170     - * We scale it linearly, offset by the INTERACTIVE_DELTA delta.
171     - * Here are a few examples of different nice levels:
172     - *
173     - * TASK_INTERACTIVE(-20): [1,1,1,1,1,1,1,1,1,0,0]
174     - * TASK_INTERACTIVE(-10): [1,1,1,1,1,1,1,0,0,0,0]
175     - * TASK_INTERACTIVE( 0): [1,1,1,1,0,0,0,0,0,0,0]
176     - * TASK_INTERACTIVE( 10): [1,1,0,0,0,0,0,0,0,0,0]
177     - * TASK_INTERACTIVE( 19): [0,0,0,0,0,0,0,0,0,0,0]
178     - *
179     - * (the X axis represents the possible -5 ... 0 ... +5 dynamic
180     - * priority range a task can explore, a value of '1' means the
181     - * task is rated interactive.)
182     - *
183     - * Ie. nice +19 tasks can never get 'interactive' enough to be
184     - * reinserted into the active array. And only heavily CPU-hog nice -20
185     - * tasks will be expired. Default nice 0 tasks are somewhere between,
186     - * it takes some effort for them to get interactive, but it's not
187     - * too hard.
188     - */
189     -
190     -#define CURRENT_BONUS(p) \
191     - (NS_TO_JIFFIES((p)->sleep_avg) * MAX_BONUS / \
192     - MAX_SLEEP_AVG)
193     -
194     -#define GRANULARITY (10 * HZ / 1000 ? : 1)
195     -
196     -#ifdef CONFIG_SMP
197     -#define TIMESLICE_GRANULARITY(p) (GRANULARITY * \
198     - (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)) * \
199     - num_online_cpus())
200     -#else
201     -#define TIMESLICE_GRANULARITY(p) (GRANULARITY * \
202     - (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)))
203     -#endif
204     -
205     -#define SCALE(v1,v1_max,v2_max) \
206     - (v1) * (v2_max) / (v1_max)
207     -
208     -#define DELTA(p) \
209     - (SCALE(TASK_NICE(p) + 20, 40, MAX_BONUS) - 20 * MAX_BONUS / 40 + \
210     - INTERACTIVE_DELTA)
211     -
212     -#define TASK_INTERACTIVE(p) \
213     - ((p)->prio <= (p)->static_prio - DELTA(p))
214     -
215     -#define INTERACTIVE_SLEEP(p) \
216     - (JIFFIES_TO_NS(MAX_SLEEP_AVG * \
217     - (MAX_BONUS / 2 + DELTA((p)) + 1) / MAX_BONUS - 1))
218     -
219     +#define NSJIFFY (1000000000 / HZ) /* One jiffy in ns */
220     +#define NS_TO_JIFFIES(TIME) ((TIME) / NSJIFFY)
221     +#define JIFFIES_TO_NS(TIME) ((TIME) * NSJIFFY)
222     #define TASK_PREEMPTS_CURR(p, rq) \
223     ((p)->prio < (rq)->curr->prio)
224    
225     /*
226     - * task_timeslice() scales user-nice values [ -20 ... 0 ... 19 ]
227     - * to time slice values: [800ms ... 100ms ... 5ms]
228     - *
229     - * The higher a thread's priority, the bigger timeslices
230     - * it gets during one round of execution. But even the lowest
231     - * priority thread gets MIN_TIMESLICE worth of execution time.
232     + * This is the time all tasks within the same priority round robin.
233     + * Set to a minimum of 6ms.
234     */
235     +#define RR_INTERVAL ((6 * HZ / 1001) + 1)
236     +#define DEF_TIMESLICE (RR_INTERVAL * 19)
237    
238     -#define SCALE_PRIO(x, prio) \
239     - max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_TIMESLICE)
240     -
241     -static unsigned int static_prio_timeslice(int static_prio)
242     -{
243     - if (static_prio < NICE_TO_PRIO(0))
244     - return SCALE_PRIO(DEF_TIMESLICE * 4, static_prio);
245     - else
246     - return SCALE_PRIO(DEF_TIMESLICE, static_prio);
247     -}
248     -
249     -static inline unsigned int task_timeslice(task_t *p)
250     -{
251     - return static_prio_timeslice(p->static_prio);
252     -}
253     -
254     -#define task_hot(p, now, sd) ((long long) ((now) - (p)->last_ran) \
255     +#define task_hot(p, now, sd) ((long long) ((now) - (p)->timestamp) \
256     < (long long) (sd)->cache_hot_time)
257    
258     /*
259     * These are the runqueue data structures:
260     */
261     -
262     -#define BITMAP_SIZE ((((MAX_PRIO+1+7)/8)+sizeof(long)-1)/sizeof(long))
263     -
264     typedef struct runqueue runqueue_t;
265    
266     -struct prio_array {
267     - unsigned int nr_active;
268     - unsigned long bitmap[BITMAP_SIZE];
269     - struct list_head queue[MAX_PRIO];
270     -};
271     -
272     /*
273     * This is the main, per-CPU runqueue data structure.
274     *
275     @@ -229,12 +128,11 @@ struct runqueue {
276     */
277     unsigned long nr_uninterruptible;
278    
279     - unsigned long expired_timestamp;
280     unsigned long long timestamp_last_tick;
281     task_t *curr, *idle;
282     struct mm_struct *prev_mm;
283     - prio_array_t *active, *expired, arrays[2];
284     - int best_expired_prio;
285     + unsigned long bitmap[BITS_TO_LONGS(MAX_PRIO + 1)];
286     + struct list_head queue[MAX_PRIO];
287     atomic_t nr_iowait;
288    
289     #ifdef CONFIG_SMP
290     @@ -499,13 +397,7 @@ static inline runqueue_t *this_rq_lock(v
291    
292     #ifdef CONFIG_SCHEDSTATS
293     /*
294     - * Called when a process is dequeued from the active array and given
295     - * the cpu. We should note that with the exception of interactive
296     - * tasks, the expired queue will become the active queue after the active
297     - * queue is empty, without explicitly dequeuing and requeuing tasks in the
298     - * expired queue. (Interactive tasks may be requeued directly to the
299     - * active queue, thus delaying tasks in the expired queue from running;
300     - * see scheduler_tick()).
301     + * Called when a process is dequeued and given the cpu.
302     *
303     * This function is only called from sched_info_arrive(), rather than
304     * dequeue_task(). Even though a task may be queued and dequeued multiple
305     @@ -543,13 +435,11 @@ static void sched_info_arrive(task_t *t)
306     }
307    
308     /*
309     - * Called when a process is queued into either the active or expired
310     - * array. The time is noted and later used to determine how long we
311     - * had to wait for us to reach the cpu. Since the expired queue will
312     - * become the active queue after active queue is empty, without dequeuing
313     - * and requeuing any tasks, we are interested in queuing to either. It
314     - * is unusual but not impossible for tasks to be dequeued and immediately
315     - * requeued in the same or another array: this can happen in sched_yield(),
316     + * Called when a process is queued
317     + * The time is noted and later used to determine how long we had to wait for
318     + * us to reach the cpu.
319     + * It is unusual but not impossible for tasks to be dequeued and immediately
320     + * requeued: this can happen in sched_yield(),
321     * set_user_nice(), and even load_balance() as it moves tasks from runqueue
322     * to runqueue.
323     *
324     @@ -603,74 +493,81 @@ static inline void sched_info_switch(tas
325     #define sched_info_switch(t, next) do { } while (0)
326     #endif /* CONFIG_SCHEDSTATS */
327    
328     -/*
329     - * Adding/removing a task to/from a priority array:
330     - */
331     -static void dequeue_task(struct task_struct *p, prio_array_t *array)
332     +#if BITS_PER_LONG < 64
333     +static inline void longlimit(unsigned long long *longlong)
334     +{
335     + if (*longlong > (1 << 31))
336     + *longlong = 1 << 31;
337     +}
338     +#else
339     +static inline void longlimit(unsigned long long *__unused)
340     {
341     - array->nr_active--;
342     - list_del(&p->run_list);
343     - if (list_empty(array->queue + p->prio))
344     - __clear_bit(p->prio, array->bitmap);
345     +}
346     +#endif
347     +
348     +/* Get nanosecond clock difference without overflowing unsigned long. */
349     +static unsigned long ns_diff(unsigned long long v1, unsigned long long v2)
350     +{
351     + unsigned long long vdiff;
352     + if (likely(v1 >= v2)) {
353     + vdiff = v1 - v2;
354     + longlimit(&vdiff);
355     + } else {
356     + /*
357     + * Rarely the clock appears to go backwards. There should
358     + * always be a positive difference so return 1.
359     + */
360     + vdiff = 1;
361     + }
362     + return (unsigned long)vdiff;
363     }
364    
365     -static void enqueue_task(struct task_struct *p, prio_array_t *array)
366     +static inline int task_queued(const task_t *task)
367     {
368     - sched_info_queued(p);
369     - list_add_tail(&p->run_list, array->queue + p->prio);
370     - __set_bit(p->prio, array->bitmap);
371     - array->nr_active++;
372     - p->array = array;
373     + return !list_empty(&task->run_list);
374     }
375    
376     /*
377     - * Put task to the end of the run list without the overhead of dequeue
378     - * followed by enqueue.
379     + * Adding/removing a task to/from a runqueue:
380     */
381     -static void requeue_task(struct task_struct *p, prio_array_t *array)
382     +static void dequeue_task(task_t *p, runqueue_t *rq)
383     {
384     - list_move_tail(&p->run_list, array->queue + p->prio);
385     + list_del_init(&p->run_list);
386     + if (list_empty(rq->queue + p->prio))
387     + __clear_bit(p->prio, rq->bitmap);
388     + p->ns_debit = 0;
389     }
390    
391     -static inline void enqueue_task_head(struct task_struct *p, prio_array_t *array)
392     +static void enqueue_task(task_t *p, runqueue_t *rq)
393     {
394     - list_add(&p->run_list, array->queue + p->prio);
395     - __set_bit(p->prio, array->bitmap);
396     - array->nr_active++;
397     - p->array = array;
398     + list_add_tail(&p->run_list, rq->queue + p->prio);
399     + __set_bit(p->prio, rq->bitmap);
400     }
401    
402     /*
403     - * effective_prio - return the priority that is based on the static
404     - * priority but is modified by bonuses/penalties.
405     - *
406     - * We scale the actual sleep average [0 .... MAX_SLEEP_AVG]
407     - * into the -5 ... 0 ... +5 bonus/penalty range.
408     - *
409     - * We use 25% of the full 0...39 priority range so that:
410     - *
411     - * 1) nice +19 interactive tasks do not preempt nice 0 CPU hogs.
412     - * 2) nice -20 CPU hogs do not get preempted by nice 0 tasks.
413     - *
414     - * Both properties are important to certain workloads.
415     + * Put task to the end of the run list without the overhead of dequeue
416     + * followed by enqueue.
417     */
418     -static int effective_prio(task_t *p)
419     +static void requeue_task(task_t *p, runqueue_t *rq, const int prio)
420     {
421     - int bonus, prio;
422     -
423     - if (rt_task(p))
424     - return p->prio;
425     -
426     - bonus = CURRENT_BONUS(p) - MAX_BONUS / 2;
427     + list_move_tail(&p->run_list, rq->queue + prio);
428     + if (p->prio != prio) {
429     + if (list_empty(rq->queue + p->prio))
430     + __clear_bit(p->prio, rq->bitmap);
431     + p->prio = prio;
432     + __set_bit(prio, rq->bitmap);
433     + }
434     + p->ns_debit = 0;
435     +}
436    
437     - prio = p->static_prio - bonus;
438     - if (prio < MAX_RT_PRIO)
439     - prio = MAX_RT_PRIO;
440     - if (prio > MAX_PRIO-1)
441     - prio = MAX_PRIO-1;
442     - return prio;
443     +static inline void enqueue_task_head(task_t *p, runqueue_t *rq)
444     +{
445     + list_add(&p->run_list, rq->queue + p->prio);
446     + __set_bit(p->prio, rq->bitmap);
447     }
448    
449     +static unsigned int slice(const task_t *p);
450     +
451     /*
452     * To aid in avoiding the subversion of "niceness" due to uneven distribution
453     * of tasks with abnormal "nice" values across CPUs the contribution that
454     @@ -688,10 +585,9 @@ static int effective_prio(task_t *p)
455     #define TIME_SLICE_NICE_ZERO DEF_TIMESLICE
456     #define LOAD_WEIGHT(lp) \
457     (((lp) * SCHED_LOAD_SCALE) / TIME_SLICE_NICE_ZERO)
458     -#define PRIO_TO_LOAD_WEIGHT(prio) \
459     - LOAD_WEIGHT(static_prio_timeslice(prio))
460     -#define RTPRIO_TO_LOAD_WEIGHT(rp) \
461     - (PRIO_TO_LOAD_WEIGHT(MAX_RT_PRIO) + LOAD_WEIGHT(rp))
462     +#define TASK_LOAD_WEIGHT(p) LOAD_WEIGHT(slice(p))
463     +#define RTPRIO_TO_LOAD_WEIGHT(rp) \
464     + (LOAD_WEIGHT((RR_INTERVAL + 20 + (rp))))
465    
466     static void set_load_weight(task_t *p)
467     {
468     @@ -708,7 +604,7 @@ static void set_load_weight(task_t *p)
469     #endif
470     p->load_weight = RTPRIO_TO_LOAD_WEIGHT(p->rt_priority);
471     } else
472     - p->load_weight = PRIO_TO_LOAD_WEIGHT(p->static_prio);
473     + p->load_weight = TASK_LOAD_WEIGHT(p);
474     }
475    
476     static inline void inc_raw_weighted_load(runqueue_t *rq, const task_t *p)
477     @@ -736,13 +632,9 @@ static inline void dec_nr_running(task_t
478     /*
479     * __activate_task - move a task to the runqueue.
480     */
481     -static void __activate_task(task_t *p, runqueue_t *rq)
482     +static inline void __activate_task(task_t *p, runqueue_t *rq)
483     {
484     - prio_array_t *target = rq->active;
485     -
486     - if (batch_task(p))
487     - target = rq->expired;
488     - enqueue_task(p, target);
489     + enqueue_task(p, rq);
490     inc_nr_running(p, rq);
491     }
492    
493     @@ -751,85 +643,181 @@ static void __activate_task(task_t *p, r
494     */
495     static inline void __activate_idle_task(task_t *p, runqueue_t *rq)
496     {
497     - enqueue_task_head(p, rq->active);
498     + enqueue_task_head(p, rq);
499     inc_nr_running(p, rq);
500     }
501    
502     -static int recalc_task_prio(task_t *p, unsigned long long now)
503     +/*
504     + * Bonus - How much higher than its base priority an interactive task can run.
505     + */
506     +static inline unsigned int bonus(const task_t *p)
507     {
508     - /* Caller must always ensure 'now >= p->timestamp' */
509     - unsigned long long __sleep_time = now - p->timestamp;
510     - unsigned long sleep_time;
511     + return TASK_USER_PRIO(p);
512     +}
513    
514     - if (batch_task(p))
515     - sleep_time = 0;
516     +static unsigned int rr_interval(const task_t *p)
517     +{
518     + int nice = TASK_NICE(p);
519     +
520     + if (nice < 0 && !rt_task(p))
521     + return RR_INTERVAL * (20 - nice) / 20;
522     + return RR_INTERVAL;
523     +}
524     +
525     +/*
526     + * slice - the duration a task runs before getting requeued at its best
527     + * priority and has its bonus decremented.
528     + */
529     +static unsigned int slice(const task_t *p)
530     +{
531     + unsigned int slice, rr;
532     +
533     + slice = rr = rr_interval(p);
534     + if (likely(!rt_task(p)))
535     + slice += (39 - TASK_USER_PRIO(p)) * rr;
536     + return slice;
537     +}
538     +
539     +/*
540     + * We increase our bonus by sleeping more than the time we ran.
541     + * The ratio of sleep to run gives us the cpu% that we last ran and determines
542     + * the maximum bonus we can acquire.
543     + */
544     +static void inc_bonus(task_t *p, unsigned long totalrun, unsigned long sleep)
545     +{
546     + unsigned int best_bonus = sleep / (totalrun + 1);
547     +
548     + if (p->bonus >= best_bonus)
549     + return;
550     + best_bonus = bonus(p);
551     + if (p->bonus < best_bonus)
552     + p->bonus++;
553     +}
554     +
555     +static inline void dec_bonus(task_t *p)
556     +{
557     + if (p->bonus)
558     + p->bonus--;
559     +}
560     +
561     +static inline void slice_overrun(struct task_struct *p)
562     +{
563     + unsigned long ns_slice = JIFFIES_TO_NS(p->slice);
564     +
565     + do {
566     + p->totalrun -= ns_slice;
567     + dec_bonus(p);
568     + } while (unlikely(p->totalrun > ns_slice));
569     +}
570     +
571     +/*
572     + * effective_prio - dynamic priority dependent on bonus.
573     + * The priority normally decreases by one each RR_INTERVAL.
574     + * As the bonus increases the initial priority starts at a higher "stair" or
575     + * priority for longer.
576     + */
577     +static int effective_prio(const task_t *p)
578     +{
579     + int prio;
580     + unsigned int full_slice, used_slice = 0;
581     + unsigned int best_bonus, rr;
582     +
583     + if (rt_task(p))
584     + return p->prio;
585     +
586     + full_slice = slice(p);
587     + if (full_slice > p->slice)
588     + used_slice = full_slice - p->slice;
589     +
590     + best_bonus = bonus(p);
591     + prio = MAX_RT_PRIO + best_bonus;
592     + if (!batch_task(p))
593     + prio -= p->bonus;
594     +
595     + rr = rr_interval(p);
596     + prio += used_slice / rr;
597     + if (prio > MIN_USER_PRIO)
598     + prio = MIN_USER_PRIO;
599     + return prio;
600     +}
601     +
602     +static inline void continue_slice(task_t *p)
603     +{
604     + unsigned long total_run = NS_TO_JIFFIES(p->totalrun);
605     +
606     + if (unlikely(total_run >= p->slice))
607     + slice_overrun(p);
608     else {
609     - if (__sleep_time > NS_MAX_SLEEP_AVG)
610     - sleep_time = NS_MAX_SLEEP_AVG;
611     - else
612     - sleep_time = (unsigned long)__sleep_time;
613     + unsigned long remainder;
614     +
615     + p->slice -= total_run;
616     + remainder = p->slice % rr_interval(p);
617     + if (remainder)
618     + p->time_slice = remainder;
619     }
620     +}
621    
622     - if (likely(sleep_time > 0)) {
623     - /*
624     - * User tasks that sleep a long time are categorised as
625     - * idle. They will only have their sleep_avg increased to a
626     - * level that makes them just interactive priority to stay
627     - * active yet prevent them suddenly becoming cpu hogs and
628     - * starving other processes.
629     - */
630     - if (p->mm && sleep_time > INTERACTIVE_SLEEP(p)) {
631     - unsigned long ceiling;
632     +/*
633     + * recalc_task_prio - this checks for tasks that have run less than a full
634     + * slice and have woken up again soon after, or have just forked a
635     + * thread/process and make them continue their old slice instead of starting
636     + * a new one at high priority.
637     + */
638     +static inline void recalc_task_prio(task_t *p, const unsigned long long now)
639     +{
640     + unsigned long sleep_time;
641    
642     - ceiling = JIFFIES_TO_NS(MAX_SLEEP_AVG -
643     - DEF_TIMESLICE);
644     - if (p->sleep_avg < ceiling)
645     - p->sleep_avg = ceiling;
646     - } else {
647     - /*
648     - * Tasks waking from uninterruptible sleep are
649     - * limited in their sleep_avg rise as they
650     - * are likely to be waiting on I/O
651     - */
652     - if (p->sleep_type == SLEEP_NONINTERACTIVE && p->mm) {
653     - if (p->sleep_avg >= INTERACTIVE_SLEEP(p))
654     - sleep_time = 0;
655     - else if (p->sleep_avg + sleep_time >=
656     - INTERACTIVE_SLEEP(p)) {
657     - p->sleep_avg = INTERACTIVE_SLEEP(p);
658     - sleep_time = 0;
659     - }
660     - }
661     + /*
662     + * If this task has managed to run to its lowest priority then
663     + * decrease its bonus and requeue it now at best priority instead
664     + * of possibly flagging around lowest priority. Save up any systime
665     + * that may affect priority on the next reschedule.
666     + */
667     + if (p->slice > p->time_slice &&
668     + p->slice - NS_TO_JIFFIES(p->totalrun) < p->time_slice) {
669     + dec_bonus(p);
670     + p->totalrun = 0;
671     + return;
672     + }
673    
674     - /*
675     - * This code gives a bonus to interactive tasks.
676     - *
677     - * The boost works by updating the 'average sleep time'
678     - * value here, based on ->timestamp. The more time a
679     - * task spends sleeping, the higher the average gets -
680     - * and the higher the priority boost gets as well.
681     - */
682     - p->sleep_avg += sleep_time;
683     + /*
684     + * Add the total for this last scheduled run (p->runtime) and system
685     + * time (p->systime) done on behalf of p to the running total so far
686     + * used (p->totalrun).
687     + */
688     + p->totalrun += p->runtime + p->systime;
689     + sleep_time = ns_diff(now, p->timestamp);
690    
691     - if (p->sleep_avg > NS_MAX_SLEEP_AVG)
692     - p->sleep_avg = NS_MAX_SLEEP_AVG;
693     + if (p->systime > sleep_time || p->flags & PF_FORKED)
694     + sleep_time = 0;
695     + else {
696     + sleep_time -= p->systime;
697     + /*
698     + * We elevate priority by the amount of time we slept. If we
699     + * sleep longer than our running total and have not set the
700     + * PF_NONSLEEP flag we gain a bonus.
701     + */
702     + if (sleep_time >= p->totalrun) {
703     + if (!(p->flags & PF_NONSLEEP))
704     + inc_bonus(p, p->totalrun, sleep_time);
705     + p->totalrun = 0;
706     + return;
707     }
708     + p->totalrun -= sleep_time;
709     }
710     -
711     - return effective_prio(p);
712     + continue_slice(p);
713     }
714    
715     /*
716     * activate_task - move a task to the runqueue and do priority recalculation
717     *
718     - * Update all the scheduling statistics stuff. (sleep average
719     - * calculation, priority modifiers, etc.)
720     + * Update all the scheduling statistics stuff. (priority modifiers, etc.)
721     */
722     -static void activate_task(task_t *p, runqueue_t *rq, int local)
723     +static void activate_task(task_t *p, runqueue_t *rq, const int local)
724     {
725     - unsigned long long now;
726     + unsigned long long now = sched_clock();
727     + unsigned long rr = rr_interval(p);
728    
729     - now = sched_clock();
730     #ifdef CONFIG_SMP
731     if (!local) {
732     /* Compensate for drifting sched_clock */
733     @@ -838,45 +826,25 @@ static void activate_task(task_t *p, run
734     + rq->timestamp_last_tick;
735     }
736     #endif
737     -
738     - if (!rt_task(p))
739     - p->prio = recalc_task_prio(p, now);
740     -
741     - /*
742     - * This checks to make sure it's not an uninterruptible task
743     - * that is now waking up.
744     - */
745     - if (p->sleep_type == SLEEP_NORMAL) {
746     - /*
747     - * Tasks which were woken up by interrupts (ie. hw events)
748     - * are most likely of interactive nature. So we give them
749     - * the credit of extending their sleep time to the period
750     - * of time they spend on the runqueue, waiting for execution
751     - * on a CPU, first time around:
752     - */
753     - if (in_interrupt())
754     - p->sleep_type = SLEEP_INTERRUPTED;
755     - else {
756     - /*
757     - * Normal first-time wakeups get a credit too for
758     - * on-runqueue time, but it will be weighted down:
759     - */
760     - p->sleep_type = SLEEP_INTERACTIVE;
761     - }
762     + p->slice = slice(p);
763     + p->time_slice = p->slice % rr ? : rr;
764     + if (!rt_task(p)) {
765     + recalc_task_prio(p, now);
766     + p->prio = effective_prio(p);
767     + p->systime = 0;
768     + p->flags &= ~(PF_FORKED | PF_NONSLEEP);
769     }
770     p->timestamp = now;
771     -
772     __activate_task(p, rq);
773     }
774    
775     /*
776     * deactivate_task - remove a task from the runqueue.
777     */
778     -static void deactivate_task(struct task_struct *p, runqueue_t *rq)
779     +static void deactivate_task(task_t *p, runqueue_t *rq)
780     {
781     dec_nr_running(p, rq);
782     - dequeue_task(p, p->array);
783     - p->array = NULL;
784     + dequeue_task(p, rq);
785     }
786    
787     /*
788     @@ -952,7 +920,7 @@ static int migrate_task(task_t *p, int d
789     * If the task is not on a runqueue (and not running), then
790     * it is sufficient to simply update the task's cpu field.
791     */
792     - if (!p->array && !task_running(rq, p)) {
793     + if (!task_queued(p) && !task_running(rq, p)) {
794     set_task_cpu(p, dest_cpu);
795     return 0;
796     }
797     @@ -982,7 +950,7 @@ void wait_task_inactive(task_t *p)
798     repeat:
799     rq = task_rq_lock(p, &flags);
800     /* Must be off runqueue entirely, not preempted. */
801     - if (unlikely(p->array || task_running(rq, p))) {
802     + if (unlikely(task_queued(p) || task_running(rq, p))) {
803     /* If it's preempted, we yield. It could be a while. */
804     preempted = !task_running(rq, p);
805     task_rq_unlock(rq, &flags);
806     @@ -1234,6 +1202,15 @@ static inline int wake_idle(int cpu, tas
807     }
808     #endif
809    
810     +/*
811     + * Check to see if p preempts rq->curr and resched if it does.
812     + */
813     +static inline void preempt(const task_t *p, runqueue_t *rq)
814     +{
815     + if (TASK_PREEMPTS_CURR(p, rq))
816     + resched_task(rq->curr);
817     +}
818     +
819     /***
820     * try_to_wake_up - wake up a thread
821     * @p: the to-be-woken-up thread
822     @@ -1265,7 +1242,7 @@ static int try_to_wake_up(task_t *p, uns
823     if (!(old_state & state))
824     goto out;
825    
826     - if (p->array)
827     + if (task_queued(p))
828     goto out_running;
829    
830     cpu = task_cpu(p);
831     @@ -1356,7 +1333,7 @@ out_set_cpu:
832     old_state = p->state;
833     if (!(old_state & state))
834     goto out;
835     - if (p->array)
836     + if (task_queued(p))
837     goto out_running;
838    
839     this_cpu = smp_processor_id();
840     @@ -1365,25 +1342,9 @@ out_set_cpu:
841    
842     out_activate:
843     #endif /* CONFIG_SMP */
844     - if (old_state == TASK_UNINTERRUPTIBLE) {
845     + if (old_state == TASK_UNINTERRUPTIBLE)
846     rq->nr_uninterruptible--;
847     - /*
848     - * Tasks on involuntary sleep don't earn
849     - * sleep_avg beyond just interactive state.
850     - */
851     - p->sleep_type = SLEEP_NONINTERACTIVE;
852     - } else
853     -
854     - /*
855     - * Tasks that have marked their sleep as noninteractive get
856     - * woken up with their sleep average not weighted in an
857     - * interactive way.
858     - */
859     - if (old_state & TASK_NONINTERACTIVE)
860     - p->sleep_type = SLEEP_NONINTERACTIVE;
861     -
862    
863     - activate_task(p, rq, cpu == this_cpu);
864     /*
865     * Sync wakeups (i.e. those types of wakeups where the waker
866     * has indicated that it will leave the CPU in short order)
867     @@ -1392,10 +1353,9 @@ out_activate:
868     * the waker guarantees that the freshly woken up task is going
869     * to be considered on this CPU.)
870     */
871     - if (!sync || cpu != this_cpu) {
872     - if (TASK_PREEMPTS_CURR(p, rq))
873     - resched_task(rq->curr);
874     - }
875     + activate_task(p, rq, cpu == this_cpu);
876     + if (!sync || cpu != this_cpu)
877     + preempt(p, rq);
878     success = 1;
879    
880     out_running:
881     @@ -1440,7 +1400,6 @@ void fastcall sched_fork(task_t *p, int
882     */
883     p->state = TASK_RUNNING;
884     INIT_LIST_HEAD(&p->run_list);
885     - p->array = NULL;
886     #ifdef CONFIG_SCHEDSTATS
887     memset(&p->sched_info, 0, sizeof(p->sched_info));
888     #endif
889     @@ -1451,30 +1410,6 @@ void fastcall sched_fork(task_t *p, int
890     /* Want to start with kernel preemption disabled. */
891     task_thread_info(p)->preempt_count = 1;
892     #endif
893     - /*
894     - * Share the timeslice between parent and child, thus the
895     - * total amount of pending timeslices in the system doesn't change,
896     - * resulting in more scheduling fairness.
897     - */
898     - local_irq_disable();
899     - p->time_slice = (current->time_slice + 1) >> 1;
900     - /*
901     - * The remainder of the first timeslice might be recovered by
902     - * the parent if the child exits early enough.
903     - */
904     - p->first_time_slice = 1;
905     - current->time_slice >>= 1;
906     - p->timestamp = sched_clock();
907     - if (unlikely(!current->time_slice)) {
908     - /*
909     - * This case is rare, it happens when the parent has only
910     - * a single jiffy left from its timeslice. Taking the
911     - * runqueue lock is not a problem.
912     - */
913     - current->time_slice = 1;
914     - scheduler_tick();
915     - }
916     - local_irq_enable();
917     put_cpu();
918     }
919    
920     @@ -1496,37 +1431,20 @@ void fastcall wake_up_new_task(task_t *p
921     this_cpu = smp_processor_id();
922     cpu = task_cpu(p);
923    
924     - /*
925     - * We decrease the sleep average of forking parents
926     - * and children as well, to keep max-interactive tasks
927     - * from forking tasks that are max-interactive. The parent
928     - * (current) is done further down, under its lock.
929     - */
930     - p->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(p) *
931     - CHILD_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS);
932     -
933     - p->prio = effective_prio(p);
934     + /* Forked process gets no bonus to prevent fork bombs. */
935     + p->bonus = 0;
936     + current->flags |= PF_FORKED;
937    
938     if (likely(cpu == this_cpu)) {
939     + activate_task(p, rq, 1);
940     if (!(clone_flags & CLONE_VM)) {
941     /*
942     * The VM isn't cloned, so we're in a good position to
943     * do child-runs-first in anticipation of an exec. This
944     * usually avoids a lot of COW overhead.
945     */
946     - if (unlikely(!current->array))
947     - __activate_task(p, rq);
948     - else {
949     - p->prio = current->prio;
950     - list_add_tail(&p->run_list, &current->run_list);
951     - p->array = current->array;
952     - p->array->nr_active++;
953     - inc_nr_running(p, rq);
954     - }
955     set_need_resched();
956     - } else
957     - /* Run child last */
958     - __activate_task(p, rq);
959     + }
960     /*
961     * We skip the following code due to cpu == this_cpu
962     *
963     @@ -1543,53 +1461,19 @@ void fastcall wake_up_new_task(task_t *p
964     */
965     p->timestamp = (p->timestamp - this_rq->timestamp_last_tick)
966     + rq->timestamp_last_tick;
967     - __activate_task(p, rq);
968     - if (TASK_PREEMPTS_CURR(p, rq))
969     - resched_task(rq->curr);
970     + activate_task(p, rq, 0);
971     + preempt(p, rq);
972    
973     /*
974     * Parent and child are on different CPUs, now get the
975     - * parent runqueue to update the parent's ->sleep_avg:
976     + * parent runqueue to update the parent's ->flags:
977     */
978     task_rq_unlock(rq, &flags);
979     this_rq = task_rq_lock(current, &flags);
980     }
981     - current->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(current) *
982     - PARENT_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS);
983     task_rq_unlock(this_rq, &flags);
984     }
985    
986     -/*
987     - * Potentially available exiting-child timeslices are
988     - * retrieved here - this way the parent does not get
989     - * penalized for creating too many threads.
990     - *
991     - * (this cannot be used to 'generate' timeslices
992     - * artificially, because any timeslice recovered here
993     - * was given away by the parent in the first place.)
994     - */
995     -void fastcall sched_exit(task_t *p)
996     -{
997     - unsigned long flags;
998     - runqueue_t *rq;
999     -
1000     - /*
1001     - * If the child was a (relative-) CPU hog then decrease
1002     - * the sleep_avg of the parent as well.
1003     - */
1004     - rq = task_rq_lock(p->parent, &flags);
1005     - if (p->first_time_slice && task_cpu(p) == task_cpu(p->parent)) {
1006     - p->parent->time_slice += p->time_slice;
1007     - if (unlikely(p->parent->time_slice > task_timeslice(p)))
1008     - p->parent->time_slice = task_timeslice(p);
1009     - }
1010     - if (p->sleep_avg < p->parent->sleep_avg)
1011     - p->parent->sleep_avg = p->parent->sleep_avg /
1012     - (EXIT_WEIGHT + 1) * EXIT_WEIGHT + p->sleep_avg /
1013     - (EXIT_WEIGHT + 1);
1014     - task_rq_unlock(rq, &flags);
1015     -}
1016     -
1017     /**
1018     * prepare_task_switch - prepare to switch tasks
1019     * @rq: the runqueue preparing to switch
1020     @@ -1885,23 +1769,21 @@ void sched_exec(void)
1021     * pull_task - move a task from a remote runqueue to the local runqueue.
1022     * Both runqueues must be locked.
1023     */
1024     -static
1025     -void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p,
1026     - runqueue_t *this_rq, prio_array_t *this_array, int this_cpu)
1027     +static void pull_task(runqueue_t *src_rq, task_t *p, runqueue_t *this_rq,
1028     + const int this_cpu)
1029     {
1030     - dequeue_task(p, src_array);
1031     + dequeue_task(p, src_rq);
1032     dec_nr_running(p, src_rq);
1033     set_task_cpu(p, this_cpu);
1034     inc_nr_running(p, this_rq);
1035     - enqueue_task(p, this_array);
1036     + enqueue_task(p, this_rq);
1037     p->timestamp = (p->timestamp - src_rq->timestamp_last_tick)
1038     + this_rq->timestamp_last_tick;
1039     /*
1040     * Note that idle threads have a prio of MAX_PRIO, for this test
1041     * to be always true for them.
1042     */
1043     - if (TASK_PREEMPTS_CURR(p, this_rq))
1044     - resched_task(this_rq->curr);
1045     + preempt(p, this_rq);
1046     }
1047    
1048     /*
1049     @@ -1939,7 +1821,6 @@ int can_migrate_task(task_t *p, runqueue
1050     return 1;
1051     }
1052    
1053     -#define rq_best_prio(rq) min((rq)->curr->prio, (rq)->best_expired_prio)
1054     /*
1055     * move_tasks tries to move up to max_nr_move tasks and max_load_move weighted
1056     * load from busiest to this_rq, as part of a balancing operation within
1057     @@ -1952,7 +1833,6 @@ static int move_tasks(runqueue_t *this_r
1058     struct sched_domain *sd, enum idle_type idle,
1059     int *all_pinned)
1060     {
1061     - prio_array_t *array, *dst_array;
1062     struct list_head *head, *curr;
1063     int idx, pulled = 0, pinned = 0, this_best_prio, busiest_best_prio;
1064     int busiest_best_prio_seen;
1065     @@ -1965,8 +1845,8 @@ static int move_tasks(runqueue_t *this_r
1066    
1067     rem_load_move = max_load_move;
1068     pinned = 1;
1069     - this_best_prio = rq_best_prio(this_rq);
1070     - busiest_best_prio = rq_best_prio(busiest);
1071     + this_best_prio = this_rq->curr->prio;
1072     + busiest_best_prio = busiest->curr->prio;
1073     /*
1074     * Enable handling of the case where there is more than one task
1075     * with the best priority. If the current running task is one
1076     @@ -1976,38 +1856,17 @@ static int move_tasks(runqueue_t *this_r
1077     */
1078     busiest_best_prio_seen = busiest_best_prio == busiest->curr->prio;
1079    
1080     - /*
1081     - * We first consider expired tasks. Those will likely not be
1082     - * executed in the near future, and they are most likely to
1083     - * be cache-cold, thus switching CPUs has the least effect
1084     - * on them.
1085     - */
1086     - if (busiest->expired->nr_active) {
1087     - array = busiest->expired;
1088     - dst_array = this_rq->expired;
1089     - } else {
1090     - array = busiest->active;
1091     - dst_array = this_rq->active;
1092     - }
1093     -
1094     -new_array:
1095     /* Start searching at priority 0: */
1096     idx = 0;
1097     skip_bitmap:
1098     if (!idx)
1099     - idx = sched_find_first_bit(array->bitmap);
1100     + idx = sched_find_first_bit(busiest->bitmap);
1101     else
1102     - idx = find_next_bit(array->bitmap, MAX_PRIO, idx);
1103     - if (idx >= MAX_PRIO) {
1104     - if (array == busiest->expired && busiest->active->nr_active) {
1105     - array = busiest->active;
1106     - dst_array = this_rq->active;
1107     - goto new_array;
1108     - }
1109     + idx = find_next_bit(busiest->bitmap, MAX_PRIO, idx);
1110     + if (idx >= MAX_PRIO)
1111     goto out;
1112     - }
1113    
1114     - head = array->queue + idx;
1115     + head = busiest->queue + idx;
1116     curr = head->prev;
1117     skip_queue:
1118     tmp = list_entry(curr, task_t, run_list);
1119     @@ -2036,7 +1895,7 @@ skip_queue:
1120     schedstat_inc(sd, lb_hot_gained[idle]);
1121     #endif
1122    
1123     - pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu);
1124     + pull_task(busiest, tmp, this_rq, this_cpu);
1125     pulled++;
1126     rem_load_move -= tmp->load_weight;
1127    
1128     @@ -2585,15 +2444,13 @@ static void rebalance_tick(int this_cpu,
1129     continue;
1130    
1131     interval = sd->balance_interval;
1132     - if (idle != SCHED_IDLE)
1133     - interval *= sd->busy_factor;
1134    
1135     /* scale ms to jiffies */
1136     interval = msecs_to_jiffies(interval);
1137     if (unlikely(!interval))
1138     interval = 1;
1139    
1140     - if (j - sd->last_balance >= interval) {
1141     + if (idle != SCHED_IDLE || j - sd->last_balance >= interval) {
1142     if (load_balance(this_cpu, this_rq, sd, idle)) {
1143     /*
1144     * We've pulled tasks over so either we're no
1145     @@ -2667,22 +2524,6 @@ unsigned long long current_sched_time(co
1146     }
1147    
1148     /*
1149     - * We place interactive tasks back into the active array, if possible.
1150     - *
1151     - * To guarantee that this does not starve expired tasks we ignore the
1152     - * interactivity of a task if the first expired task had to wait more
1153     - * than a 'reasonable' amount of time. This deadline timeout is
1154     - * load-dependent, as the frequency of array switched decreases with
1155     - * increasing number of running tasks. We also ignore the interactivity
1156     - * if a better static_prio task has expired:
1157     - */
1158     -#define EXPIRED_STARVING(rq) \
1159     - ((STARVATION_LIMIT && ((rq)->expired_timestamp && \
1160     - (jiffies - (rq)->expired_timestamp >= \
1161     - STARVATION_LIMIT * ((rq)->nr_running) + 1))) || \
1162     - ((rq)->curr->static_prio > (rq)->best_expired_prio))
1163     -
1164     -/*
1165     * Account user cpu time to a process.
1166     * @p: the process that the cpu time gets accounted to
1167     * @hardirq_offset: the offset to subtract from hardirq_count()
1168     @@ -2730,6 +2571,8 @@ void account_system_time(struct task_str
1169     cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
1170     else
1171     cpustat->idle = cputime64_add(cpustat->idle, tmp);
1172     +
1173     + p->systime += NSJIFFY;
1174     /* Account for system time used */
1175     acct_update_integrals(p);
1176     }
1177     @@ -2755,18 +2598,23 @@ void account_steal_time(struct task_stru
1178     cpustat->steal = cputime64_add(cpustat->steal, tmp);
1179     }
1180    
1181     +static void time_slice_expired(task_t *p, runqueue_t *rq)
1182     +{
1183     + set_tsk_need_resched(p);
1184     + p->time_slice = rr_interval(p);
1185     + requeue_task(p, rq, effective_prio(p));
1186     +}
1187     +
1188     /*
1189     * This function gets called by the timer code, with HZ frequency.
1190     * We call it with interrupts disabled.
1191     - *
1192     - * It also gets called by the fork code, when changing the parent's
1193     - * timeslices.
1194     */
1195     void scheduler_tick(void)
1196     {
1197     int cpu = smp_processor_id();
1198     runqueue_t *rq = this_rq();
1199     task_t *p = current;
1200     + unsigned long debit;
1201     unsigned long long now = sched_clock();
1202    
1203     update_cpu_clock(p, rq, now);
1204     @@ -2781,73 +2629,37 @@ void scheduler_tick(void)
1205     }
1206    
1207     /* Task might have expired already, but not scheduled off yet */
1208     - if (p->array != rq->active) {
1209     + if (unlikely(!task_queued(p))) {
1210     set_tsk_need_resched(p);
1211     goto out;
1212     }
1213     + /* SCHED_FIFO tasks never run out of timeslice. */
1214     + if (unlikely(p->policy == SCHED_FIFO))
1215     + goto out;
1216     +
1217     spin_lock(&rq->lock);
1218     + debit = ns_diff(rq->timestamp_last_tick, p->timestamp);
1219     + p->ns_debit += debit;
1220     + if (p->ns_debit < NSJIFFY)
1221     + goto out_unlock;
1222     + p->ns_debit %= NSJIFFY;
1223     /*
1224     - * The task was running during this tick - update the
1225     - * time slice counter. Note: we do not update a thread's
1226     - * priority until it either goes to sleep or uses up its
1227     - * timeslice. This makes it possible for interactive tasks
1228     - * to use up their timeslices at their highest priority levels.
1229     + * Tasks lose bonus each time they use up a full slice().
1230     */
1231     - if (rt_task(p)) {
1232     - /*
1233     - * RR tasks need a special form of timeslice management.
1234     - * FIFO tasks have no timeslices.
1235     - */
1236     - if ((p->policy == SCHED_RR) && !--p->time_slice) {
1237     - p->time_slice = task_timeslice(p);
1238     - p->first_time_slice = 0;
1239     - set_tsk_need_resched(p);
1240     -
1241     - /* put it at the end of the queue: */
1242     - requeue_task(p, rq->active);
1243     - }
1244     + if (!--p->slice) {
1245     + dec_bonus(p);
1246     + p->totalrun = 0;
1247     + p->slice = slice(p);
1248     + time_slice_expired(p, rq);
1249     goto out_unlock;
1250     }
1251     + /*
1252     + * Tasks that run out of time_slice but still have slice left get
1253     + * requeued with a lower priority && RR_INTERVAL time_slice.
1254     + */
1255     if (!--p->time_slice) {
1256     - dequeue_task(p, rq->active);
1257     - set_tsk_need_resched(p);
1258     - p->prio = effective_prio(p);
1259     - p->time_slice = task_timeslice(p);
1260     - p->first_time_slice = 0;
1261     -
1262     - if (!rq->expired_timestamp)
1263     - rq->expired_timestamp = jiffies;
1264     - if (!TASK_INTERACTIVE(p) || EXPIRED_STARVING(rq)) {
1265     - enqueue_task(p, rq->expired);
1266     - if (p->static_prio < rq->best_expired_prio)
1267     - rq->best_expired_prio = p->static_prio;
1268     - } else
1269     - enqueue_task(p, rq->active);
1270     - } else {
1271     - /*
1272     - * Prevent a too long timeslice allowing a task to monopolize
1273     - * the CPU. We do this by splitting up the timeslice into
1274     - * smaller pieces.
1275     - *
1276     - * Note: this does not mean the task's timeslices expire or
1277     - * get lost in any way, they just might be preempted by
1278     - * another task of equal priority. (one with higher
1279     - * priority would have preempted this task already.) We
1280     - * requeue this task to the end of the list on this priority
1281     - * level, which is in essence a round-robin of tasks with
1282     - * equal priority.
1283     - *
1284     - * This only applies to tasks in the interactive
1285     - * delta range with at least TIMESLICE_GRANULARITY to requeue.
1286     - */
1287     - if (TASK_INTERACTIVE(p) && !((task_timeslice(p) -
1288     - p->time_slice) % TIMESLICE_GRANULARITY(p)) &&
1289     - (p->time_slice >= TIMESLICE_GRANULARITY(p)) &&
1290     - (p->array == rq->active)) {
1291     -
1292     - requeue_task(p, rq->active);
1293     - set_tsk_need_resched(p);
1294     - }
1295     + time_slice_expired(p, rq);
1296     + goto out_unlock;
1297     }
1298     out_unlock:
1299     spin_unlock(&rq->lock);
1300     @@ -2896,12 +2708,13 @@ static void wake_sleeping_dependent(int
1301    
1302     /*
1303     * number of 'lost' timeslices this task wont be able to fully
1304     - * utilize, if another task runs on a sibling. This models the
1305     + * utilise, if another task runs on a sibling. This models the
1306     * slowdown effect of other tasks running on siblings:
1307     */
1308     -static inline unsigned long smt_slice(task_t *p, struct sched_domain *sd)
1309     +static inline unsigned long
1310     +smt_slice(const task_t *p, const struct sched_domain *sd)
1311     {
1312     - return p->time_slice * (100 - sd->per_cpu_gain) / 100;
1313     + return p->slice * (100 - sd->per_cpu_gain) / 100;
1314     }
1315    
1316     /*
1317     @@ -2964,7 +2777,7 @@ static int dependent_sleeper(int this_cp
1318     } else
1319     if (smt_curr->static_prio < p->static_prio &&
1320     !TASK_PREEMPTS_CURR(p, smt_rq) &&
1321     - smt_slice(smt_curr, sd) > task_timeslice(p))
1322     + smt_slice(smt_curr, sd) > slice(p))
1323     ret = 1;
1324    
1325     unlock:
1326     @@ -3015,12 +2828,6 @@ EXPORT_SYMBOL(sub_preempt_count);
1327    
1328     #endif
1329    
1330     -static inline int interactive_sleep(enum sleep_type sleep_type)
1331     -{
1332     - return (sleep_type == SLEEP_INTERACTIVE ||
1333     - sleep_type == SLEEP_INTERRUPTED);
1334     -}
1335     -
1336     /*
1337     * schedule() is the main scheduler function.
1338     */
1339     @@ -3029,11 +2836,10 @@ asmlinkage void __sched schedule(void)
1340     long *switch_count;
1341     task_t *prev, *next;
1342     runqueue_t *rq;
1343     - prio_array_t *array;
1344     struct list_head *queue;
1345     unsigned long long now;
1346     - unsigned long run_time;
1347     - int cpu, idx, new_prio;
1348     + unsigned long debit;
1349     + int cpu, idx;
1350    
1351     /*
1352     * Test if we are atomic. Since do_exit() needs to call into
1353     @@ -3066,20 +2872,11 @@ need_resched_nonpreemptible:
1354    
1355     schedstat_inc(rq, sched_cnt);
1356     now = sched_clock();
1357     - if (likely((long long)(now - prev->timestamp) < NS_MAX_SLEEP_AVG)) {
1358     - run_time = now - prev->timestamp;
1359     - if (unlikely((long long)(now - prev->timestamp) < 0))
1360     - run_time = 0;
1361     - } else
1362     - run_time = NS_MAX_SLEEP_AVG;
1363     -
1364     - /*
1365     - * Tasks charged proportionately less run_time at high sleep_avg to
1366     - * delay them losing their interactive status
1367     - */
1368     - run_time /= (CURRENT_BONUS(prev) ? : 1);
1369    
1370     spin_lock_irq(&rq->lock);
1371     + prev->runtime = ns_diff(now, prev->timestamp);
1372     + debit = ns_diff(now, rq->timestamp_last_tick) % NSJIFFY;
1373     + prev->ns_debit += debit;
1374    
1375     if (unlikely(prev->flags & PF_DEAD))
1376     prev->state = EXIT_DEAD;
1377     @@ -3091,8 +2888,10 @@ need_resched_nonpreemptible:
1378     unlikely(signal_pending(prev))))
1379     prev->state = TASK_RUNNING;
1380     else {
1381     - if (prev->state == TASK_UNINTERRUPTIBLE)
1382     + if (prev->state == TASK_UNINTERRUPTIBLE) {
1383     + prev->flags |= PF_NONSLEEP;
1384     rq->nr_uninterruptible++;
1385     + }
1386     deactivate_task(prev, rq);
1387     }
1388     }
1389     @@ -3102,64 +2901,30 @@ need_resched_nonpreemptible:
1390     idle_balance(cpu, rq);
1391     if (!rq->nr_running) {
1392     next = rq->idle;
1393     - rq->expired_timestamp = 0;
1394     wake_sleeping_dependent(cpu);
1395     goto switch_tasks;
1396     }
1397     }
1398    
1399     - array = rq->active;
1400     - if (unlikely(!array->nr_active)) {
1401     - /*
1402     - * Switch the active and expired arrays.
1403     - */
1404     - schedstat_inc(rq, sched_switch);
1405     - rq->active = rq->expired;
1406     - rq->expired = array;
1407     - array = rq->active;
1408     - rq->expired_timestamp = 0;
1409     - rq->best_expired_prio = MAX_PRIO;
1410     - }
1411     -
1412     - idx = sched_find_first_bit(array->bitmap);
1413     - queue = array->queue + idx;
1414     + idx = sched_find_first_bit(rq->bitmap);
1415     + queue = rq->queue + idx;
1416     next = list_entry(queue->next, task_t, run_list);
1417    
1418     - if (!rt_task(next) && interactive_sleep(next->sleep_type)) {
1419     - unsigned long long delta = now - next->timestamp;
1420     - if (unlikely((long long)(now - next->timestamp) < 0))
1421     - delta = 0;
1422     -
1423     - if (next->sleep_type == SLEEP_INTERACTIVE)
1424     - delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128;
1425     -
1426     - array = next->array;
1427     - new_prio = recalc_task_prio(next, next->timestamp + delta);
1428     -
1429     - if (unlikely(next->prio != new_prio)) {
1430     - dequeue_task(next, array);
1431     - next->prio = new_prio;
1432     - enqueue_task(next, array);
1433     - }
1434     - }
1435     - next->sleep_type = SLEEP_NORMAL;
1436     if (dependent_sleeper(cpu, rq, next))
1437     next = rq->idle;
1438     + else {
1439     + prefetch(next);
1440     + prefetch_stack(next);
1441     + }
1442     switch_tasks:
1443     if (next == rq->idle)
1444     schedstat_inc(rq, sched_goidle);
1445     - prefetch(next);
1446     - prefetch_stack(next);
1447     + prev->timestamp = now;
1448     clear_tsk_need_resched(prev);
1449     rcu_qsctr_inc(task_cpu(prev));
1450    
1451     update_cpu_clock(prev, rq, now);
1452    
1453     - prev->sleep_avg -= run_time;
1454     - if ((long)prev->sleep_avg <= 0)
1455     - prev->sleep_avg = 0;
1456     - prev->timestamp = prev->last_ran = now;
1457     -
1458     sched_info_switch(prev, next);
1459     if (likely(prev != next)) {
1460     next->timestamp = now;
1461     @@ -3591,9 +3356,8 @@ EXPORT_SYMBOL(sleep_on_timeout);
1462     void set_user_nice(task_t *p, long nice)
1463     {
1464     unsigned long flags;
1465     - prio_array_t *array;
1466     runqueue_t *rq;
1467     - int old_prio, new_prio, delta;
1468     + int queued, old_prio, new_prio, delta;
1469    
1470     if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
1471     return;
1472     @@ -3612,9 +3376,8 @@ void set_user_nice(task_t *p, long nice)
1473     p->static_prio = NICE_TO_PRIO(nice);
1474     goto out_unlock;
1475     }
1476     - array = p->array;
1477     - if (array) {
1478     - dequeue_task(p, array);
1479     + if ((queued = task_queued(p))) {
1480     + dequeue_task(p, rq);
1481     dec_raw_weighted_load(rq, p);
1482     }
1483    
1484     @@ -3624,9 +3387,11 @@ void set_user_nice(task_t *p, long nice)
1485     p->static_prio = NICE_TO_PRIO(nice);
1486     set_load_weight(p);
1487     p->prio += delta;
1488     + if (p->bonus > bonus(p))
1489     + p->bonus= bonus(p);
1490    
1491     - if (array) {
1492     - enqueue_task(p, array);
1493     + if (queued) {
1494     + enqueue_task(p, rq);
1495     inc_raw_weighted_load(rq, p);
1496     /*
1497     * If the task increased its priority or is running and
1498     @@ -3750,19 +3515,13 @@ static inline task_t *find_process_by_pi
1499     /* Actually do priority change: must hold rq lock. */
1500     static void __setscheduler(struct task_struct *p, int policy, int prio)
1501     {
1502     - BUG_ON(p->array);
1503     + BUG_ON(task_queued(p));
1504     p->policy = policy;
1505     p->rt_priority = prio;
1506     if (policy != SCHED_NORMAL && policy != SCHED_BATCH) {
1507     p->prio = MAX_RT_PRIO-1 - p->rt_priority;
1508     - } else {
1509     + } else
1510     p->prio = p->static_prio;
1511     - /*
1512     - * SCHED_BATCH tasks are treated as perpetual CPU hogs:
1513     - */
1514     - if (policy == SCHED_BATCH)
1515     - p->sleep_avg = 0;
1516     - }
1517     set_load_weight(p);
1518     }
1519    
1520     @@ -3777,8 +3536,7 @@ int sched_setscheduler(struct task_struc
1521     struct sched_param *param)
1522     {
1523     int retval;
1524     - int oldprio, oldpolicy = -1;
1525     - prio_array_t *array;
1526     + int queued, oldprio, oldpolicy = -1;
1527     unsigned long flags;
1528     runqueue_t *rq;
1529    
1530     @@ -3840,12 +3598,11 @@ recheck:
1531     task_rq_unlock(rq, &flags);
1532     goto recheck;
1533     }
1534     - array = p->array;
1535     - if (array)
1536     + if ((queued = task_queued(p)))
1537     deactivate_task(p, rq);
1538     oldprio = p->prio;
1539     __setscheduler(p, policy, param->sched_priority);
1540     - if (array) {
1541     + if (queued) {
1542     __activate_task(p, rq);
1543     /*
1544     * Reschedule if we are currently running on this runqueue and
1545     @@ -3855,8 +3612,8 @@ recheck:
1546     if (task_running(rq, p)) {
1547     if (p->prio > oldprio)
1548     resched_task(rq->curr);
1549     - } else if (TASK_PREEMPTS_CURR(p, rq))
1550     - resched_task(rq->curr);
1551     + } else
1552     + preempt(p, rq);
1553     }
1554     task_rq_unlock(rq, &flags);
1555     return 0;
1556     @@ -4113,43 +3870,22 @@ asmlinkage long sys_sched_getaffinity(pi
1557    
1558     /**
1559     * sys_sched_yield - yield the current processor to other threads.
1560     - *
1561     - * this function yields the current CPU by moving the calling thread
1562     - * to the expired array. If there are no other threads running on this
1563     - * CPU then this function will return.
1564     + * This function yields the current CPU by dropping the priority of the
1565     + * calling task to the lowest user priority (RT tasks simply round-robin).
1566     */
1567     asmlinkage long sys_sched_yield(void)
1568     {
1569     + int newprio;
1570     runqueue_t *rq = this_rq_lock();
1571     - prio_array_t *array = current->array;
1572     - prio_array_t *target = rq->expired;
1573    
1574     + newprio = current->prio;
1575     schedstat_inc(rq, yld_cnt);
1576     - /*
1577     - * We implement yielding by moving the task into the expired
1578     - * queue.
1579     - *
1580     - * (special rule: RT tasks will just roundrobin in the active
1581     - * array.)
1582     - */
1583     - if (rt_task(current))
1584     - target = rq->active;
1585     + current->slice = slice(current);
1586     + current->time_slice = rr_interval(current);
1587     + if (likely(!rt_task(current)))
1588     + newprio = MIN_USER_PRIO;
1589    
1590     - if (array->nr_active == 1) {
1591     - schedstat_inc(rq, yld_act_empty);
1592     - if (!rq->expired->nr_active)
1593     - schedstat_inc(rq, yld_both_empty);
1594     - } else if (!rq->expired->nr_active)
1595     - schedstat_inc(rq, yld_exp_empty);
1596     -
1597     - if (array != target) {
1598     - dequeue_task(current, array);
1599     - enqueue_task(current, target);
1600     - } else
1601     - /*
1602     - * requeue_task is cheaper so perform that if possible.
1603     - */
1604     - requeue_task(current, array);
1605     + requeue_task(current, rq, newprio);
1606    
1607     /*
1608     * Since we are going to call schedule() anyway, there's
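Note (illustration, not part of the patch): in the new sys_sched_yield() above, a non-RT caller is given a fresh slice and requeued at the lowest user priority, while an RT caller keeps its priority and just round-robins. A tiny stand-alone sketch of that rule; MODEL_MAX_PRIO, MODEL_MIN_USER_PRIO and yield_prio() are assumed model names standing in for the patch's MIN_USER_PRIO handling, not kernel helpers.

#include <assert.h>
#include <stdbool.h>

#define MODEL_MAX_PRIO      140				/* illustrative value */
#define MODEL_MIN_USER_PRIO (MODEL_MAX_PRIO - 1)	/* assumed lowest user prio */

/* Priority a yielding task is requeued at in this model. */
static int yield_prio(int prio, bool rt)
{
	return rt ? prio : MODEL_MIN_USER_PRIO;
}

int main(void)
{
	assert(yield_prio(120, false) == MODEL_MIN_USER_PRIO);	/* nice-0 task drops */
	assert(yield_prio(50, true) == 50);			/* RT task keeps prio */
	return 0;
}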
1609     @@ -4358,7 +4094,7 @@ long sys_sched_rr_get_interval(pid_t pid
1610     goto out_unlock;
1611    
1612     jiffies_to_timespec(p->policy & SCHED_FIFO ?
1613     - 0 : task_timeslice(p), &t);
1614     + 0 : slice(p), &t);
1615     read_unlock(&tasklist_lock);
1616     retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
1617     out_nounlock:
1618     @@ -4481,8 +4217,6 @@ void __devinit init_idle(task_t *idle, i
1619     unsigned long flags;
1620    
1621     idle->timestamp = sched_clock();
1622     - idle->sleep_avg = 0;
1623     - idle->array = NULL;
1624     idle->prio = MAX_PRIO;
1625     idle->state = TASK_RUNNING;
1626     idle->cpus_allowed = cpumask_of_cpu(cpu);
1627     @@ -4599,7 +4333,7 @@ static void __migrate_task(struct task_s
1628     goto out;
1629    
1630     set_task_cpu(p, dest_cpu);
1631     - if (p->array) {
1632     + if (task_queued(p)) {
1633     /*
1634     * Sync timestamp with rq_dest's before activating.
1635     * The same thing could be achieved by doing this step
1636     @@ -4610,8 +4344,7 @@ static void __migrate_task(struct task_s
1637     + rq_dest->timestamp_last_tick;
1638     deactivate_task(p, rq_src);
1639     activate_task(p, rq_dest, 0);
1640     - if (TASK_PREEMPTS_CURR(p, rq_dest))
1641     - resched_task(rq_dest->curr);
1642     + preempt(p, rq_dest);
1643     }
1644    
1645     out:
1646     @@ -4825,7 +4558,7 @@ static void migrate_dead_tasks(unsigned
1647    
1648     for (arr = 0; arr < 2; arr++) {
1649     for (i = 0; i < MAX_PRIO; i++) {
1650     - struct list_head *list = &rq->arrays[arr].queue[i];
1651     + struct list_head *list = &rq->queue[i];
1652     while (!list_empty(list))
1653     migrate_dead(dead_cpu,
1654     list_entry(list->next, task_t,
1655     @@ -6226,17 +5959,13 @@ int in_sched_functions(unsigned long add
1656     void __init sched_init(void)
1657     {
1658     runqueue_t *rq;
1659     - int i, j, k;
1660     + int i, j;
1661    
1662     for_each_possible_cpu(i) {
1663     - prio_array_t *array;
1664    
1665     rq = cpu_rq(i);
1666     spin_lock_init(&rq->lock);
1667     rq->nr_running = 0;
1668     - rq->active = rq->arrays;
1669     - rq->expired = rq->arrays + 1;
1670     - rq->best_expired_prio = MAX_PRIO;
1671    
1672     #ifdef CONFIG_SMP
1673     rq->sd = NULL;
1674     @@ -6248,16 +5977,11 @@ void __init sched_init(void)
1675     INIT_LIST_HEAD(&rq->migration_queue);
1676     #endif
1677     atomic_set(&rq->nr_iowait, 0);
1678     -
1679     - for (j = 0; j < 2; j++) {
1680     - array = rq->arrays + j;
1681     - for (k = 0; k < MAX_PRIO; k++) {
1682     - INIT_LIST_HEAD(array->queue + k);
1683     - __clear_bit(k, array->bitmap);
1684     - }
1685     - // delimiter for bitsearch
1686     - __set_bit(MAX_PRIO, array->bitmap);
1687     - }
1688     + for (j = 0; j < MAX_PRIO; j++)
1689     + INIT_LIST_HEAD(&rq->queue[j]);
1690     + memset(rq->bitmap, 0, BITS_TO_LONGS(MAX_PRIO)*sizeof(long));
1691     + /* delimiter for bitsearch */
1692     + __set_bit(MAX_PRIO, rq->bitmap);
1693     }
1694    
1695     set_load_weight(&init_task);
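Note (illustration, not part of the patch): the initialisation above zeroes the bitmap and sets a single delimiter bit at MAX_PRIO, so a bit search over an empty runqueue terminates at the idle priority. A small stand-alone demonstration of that property; MODEL_MAX_PRIO is an illustrative value and find_first_set() is a plain linear scan used in place of sched_find_first_bit().

#include <stdio.h>
#include <string.h>

#define MODEL_MAX_PRIO 140			/* illustrative value */
#define BITS_PER_LONG  (8 * (int)sizeof(unsigned long))
#define NLONGS         ((MODEL_MAX_PRIO + 1 + BITS_PER_LONG - 1) / BITS_PER_LONG)

static int find_first_set(const unsigned long *map, int max)
{
	int i;

	for (i = 0; i <= max; i++)
		if (map[i / BITS_PER_LONG] & (1UL << (i % BITS_PER_LONG)))
			return i;
	return max;
}

int main(void)
{
	unsigned long bitmap[NLONGS];

	memset(bitmap, 0, sizeof(bitmap));
	/* delimiter for the bit search, as in the hunk above */
	bitmap[MODEL_MAX_PRIO / BITS_PER_LONG] |= 1UL << (MODEL_MAX_PRIO % BITS_PER_LONG);

	printf("first set bit on an empty runqueue: %d (the idle priority)\n",
	       find_first_set(bitmap, MODEL_MAX_PRIO));
	return 0;
}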
1696     @@ -6302,9 +6026,9 @@ EXPORT_SYMBOL(__might_sleep);
1697     void normalize_rt_tasks(void)
1698     {
1699     struct task_struct *p;
1700     - prio_array_t *array;
1701     unsigned long flags;
1702     runqueue_t *rq;
1703     + int queued;
1704    
1705     read_lock_irq(&tasklist_lock);
1706     for_each_process(p) {
1707     @@ -6313,11 +6037,10 @@ void normalize_rt_tasks(void)
1708    
1709     rq = task_rq_lock(p, &flags);
1710    
1711     - array = p->array;
1712     - if (array)
1713     + if ((queued = task_queued(p)))
1714     deactivate_task(p, task_rq(p));
1715     __setscheduler(p, SCHED_NORMAL, 0);
1716     - if (array) {
1717     + if (queued) {
1718     __activate_task(p, task_rq(p));
1719     resched_task(rq->curr);
1720     }