Magellan Linux

Annotation of /trunk/kernel26-alx/patches-2.6.20-r6/0001-2.6.20-sched-staircase-17.patch



Revision 1175
Thu Oct 14 12:15:46 2010 UTC by niro
File size: 53298 byte(s)
-2.6.20-alx-r6 new magellan 0.5.2 kernel
1 niro 1175 Implement the "staircase" hybrid foreground-background single priority
2     array cpu scheduler policy.
3    
4     Signed-off-by: Con Kolivas <kernel@kolivas.org>
5     ---
6     fs/proc/array.c       |    4
7     include/linux/sched.h |   20
8     kernel/exit.c         |    1
9     kernel/sched.c        | 1084 ++++++++++++++++++--------------------------------
10     4 files changed, 404 insertions(+), 705 deletions(-)
11    
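[Editorial note: to make the "staircase" mechanic easier to follow, below is a minimal user-space model of the arithmetic that the patch's rr_interval(), slice() and __normal_prio() introduce. It is an illustrative sketch, not kernel code: it assumes HZ=1000 and the usual MAX_RT_PRIO=100 / MAX_PRIO=140, and "struct task" with its nice/bonus/slice_left fields is a stand-in for the real task_struct (SCHED_BATCH handling and RT tasks are omitted).]

/* Stand-alone model of the staircase priority calculation (illustrative only). */
#include <stdio.h>

#define HZ            1000
#define MAX_RT_PRIO   100
#define MAX_PRIO      (MAX_RT_PRIO + 40)
#define MIN_USER_PRIO (MAX_PRIO - 1)
#define RR_INTERVAL   ((6 * HZ / 1001) + 1)	/* ~6ms expressed in jiffies */

struct task {				/* minimal stand-in for task_struct */
	int nice;			/* -20 .. 19 */
	unsigned int bonus;		/* 0 .. user_prio, earned by sleeping */
	unsigned int slice_left;	/* jiffies of the full slice still unused */
};

static unsigned int user_prio(const struct task *p)
{
	return p->nice + 20;		/* 0 .. 39 */
}

static unsigned int rr_interval(const struct task *p)
{
	if (p->nice < 0)
		return RR_INTERVAL * (20 - p->nice) / 20;
	return RR_INTERVAL;
}

static unsigned int slice(const struct task *p)
{
	unsigned int rr = rr_interval(p);

	return rr + (39 - user_prio(p)) * rr;
}

/*
 * Mirrors __normal_prio(): start "bonus" stairs above the base priority and
 * descend one priority level per rr_interval of cpu actually used.
 * (A real SCHED_BATCH task would not subtract the bonus.)
 */
static int normal_prio(const struct task *p)
{
	unsigned int full = slice(p), used = 0;
	int prio;

	if (full > p->slice_left)
		used = full - p->slice_left;
	prio = MAX_RT_PRIO + user_prio(p) - p->bonus + used / rr_interval(p);
	return prio > MIN_USER_PRIO ? MIN_USER_PRIO : prio;
}

int main(void)
{
	struct task p = { .nice = 0, .bonus = 10, .slice_left = 0 };
	unsigned int used;

	for (used = 0; used <= slice(&p); used += rr_interval(&p)) {
		p.slice_left = slice(&p) - used;
		printf("used %3u jiffies -> prio %d\n", used, normal_prio(&p));
	}
	return 0;
}

In this model a nice-0 task that has earned a bonus of 10 starts at priority 110 and drops one step per ~6ms rr_interval of cpu consumed, reaching 130 once its full 120-jiffy slice is gone; in the patch, sleeping then raises the bonus again via inc_bonus() so the task restarts higher on the staircase.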
12     Index: linux-2.6.20-ck1/fs/proc/array.c
13     ===================================================================
14     --- linux-2.6.20-ck1.orig/fs/proc/array.c 2007-02-05 22:52:03.000000000 +1100
15     +++ linux-2.6.20-ck1/fs/proc/array.c 2007-02-16 19:01:30.000000000 +1100
16     @@ -165,7 +165,7 @@ static inline char * task_state(struct t
17     rcu_read_lock();
18     buffer += sprintf(buffer,
19     "State:\t%s\n"
20     - "SleepAVG:\t%lu%%\n"
21     + "Bonus:\t%d\n"
22     "Tgid:\t%d\n"
23     "Pid:\t%d\n"
24     "PPid:\t%d\n"
25     @@ -173,7 +173,7 @@ static inline char * task_state(struct t
26     "Uid:\t%d\t%d\t%d\t%d\n"
27     "Gid:\t%d\t%d\t%d\t%d\n",
28     get_task_state(p),
29     - (p->sleep_avg/1024)*100/(1020000000/1024),
30     + p->bonus,
31     p->tgid, p->pid,
32     pid_alive(p) ? rcu_dereference(p->real_parent)->tgid : 0,
33     pid_alive(p) && p->ptrace ? rcu_dereference(p->parent)->pid : 0,
34     Index: linux-2.6.20-ck1/kernel/exit.c
35     ===================================================================
36     --- linux-2.6.20-ck1.orig/kernel/exit.c 2007-02-05 22:52:04.000000000 +1100
37     +++ linux-2.6.20-ck1/kernel/exit.c 2007-02-16 19:01:30.000000000 +1100
38     @@ -170,7 +170,6 @@ repeat:
39     zap_leader = (leader->exit_signal == -1);
40     }
41    
42     - sched_exit(p);
43     write_unlock_irq(&tasklist_lock);
44     proc_flush_task(p);
45     release_thread(p);
46     Index: linux-2.6.20-ck1/include/linux/sched.h
47     ===================================================================
48     --- linux-2.6.20-ck1.orig/include/linux/sched.h 2007-02-05 22:52:04.000000000 +1100
49     +++ linux-2.6.20-ck1/include/linux/sched.h 2007-02-16 19:01:30.000000000 +1100
50     @@ -524,6 +524,7 @@ struct signal_struct {
51     #define MAX_RT_PRIO MAX_USER_RT_PRIO
52    
53     #define MAX_PRIO (MAX_RT_PRIO + 40)
54     +#define MIN_USER_PRIO (MAX_PRIO - 1)
55    
56     #define rt_prio(prio) unlikely((prio) < MAX_RT_PRIO)
57     #define rt_task(p) rt_prio((p)->prio)
58     @@ -789,15 +790,6 @@ struct mempolicy;
59     struct pipe_inode_info;
60     struct uts_namespace;
61    
62     -enum sleep_type {
63     - SLEEP_NORMAL,
64     - SLEEP_NONINTERACTIVE,
65     - SLEEP_INTERACTIVE,
66     - SLEEP_INTERRUPTED,
67     -};
68     -
69     -struct prio_array;
70     -
71     struct task_struct {
72     volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */
73     struct thread_info *thread_info;
74     @@ -815,20 +807,19 @@ struct task_struct {
75     int load_weight; /* for niceness load balancing purposes */
76     int prio, static_prio, normal_prio;
77     struct list_head run_list;
78     - struct prio_array *array;
79    
80     unsigned short ioprio;
81     #ifdef CONFIG_BLK_DEV_IO_TRACE
82     unsigned int btrace_seq;
83     #endif
84     - unsigned long sleep_avg;
85     unsigned long long timestamp, last_ran;
86     + unsigned long runtime, totalrun, ns_debit, systime;
87     + unsigned int bonus;
88     + unsigned int slice, time_slice;
89     unsigned long long sched_time; /* sched_clock time spent running */
90     - enum sleep_type sleep_type;
91    
92     unsigned long policy;
93     cpumask_t cpus_allowed;
94     - unsigned int time_slice, first_time_slice;
95    
96     #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
97     struct sched_info sched_info;
98     @@ -1157,6 +1148,8 @@ static inline void put_task_struct(struc
99     #define PF_SPREAD_SLAB 0x02000000 /* Spread some slab caches over cpuset */
100     #define PF_MEMPOLICY 0x10000000 /* Non-default NUMA mempolicy */
101     #define PF_MUTEX_TESTER 0x20000000 /* Thread belongs to the rt mutex tester */
102     +#define PF_NONSLEEP 0x40000000 /* Waiting on in kernel activity */
103     +#define PF_FORKED 0x80000000 /* Task just forked another process */
104    
105     /*
106     * Only the _current_ task can read/write to tsk->flags, but other
107     @@ -1291,7 +1284,6 @@ extern void FASTCALL(wake_up_new_task(st
108     static inline void kick_process(struct task_struct *tsk) { }
109     #endif
110     extern void FASTCALL(sched_fork(struct task_struct * p, int clone_flags));
111     -extern void FASTCALL(sched_exit(struct task_struct * p));
112    
113     extern int in_group_p(gid_t);
114     extern int in_egroup_p(gid_t);
115     Index: linux-2.6.20-ck1/kernel/sched.c
116     ===================================================================
117     --- linux-2.6.20-ck1.orig/kernel/sched.c 2007-02-05 22:52:04.000000000 +1100
118     +++ linux-2.6.20-ck1/kernel/sched.c 2007-02-16 19:01:30.000000000 +1100
119     @@ -16,6 +16,10 @@
120     * by Davide Libenzi, preemptible kernel bits by Robert Love.
121     * 2003-09-03 Interactivity tuning by Con Kolivas.
122     * 2004-04-02 Scheduler domains code by Nick Piggin
123     + * 2007-02-14 Staircase scheduling policy by Con Kolivas with help
124     + * from William Lee Irwin III, Zwane Mwaikambo, Peter Williams
125     + * and Andreas Mohr.
126     + * Staircase v17
127     */
128    
129     #include <linux/mm.h>
130     @@ -77,123 +81,19 @@
131     /*
132     * Some helpers for converting nanosecond timing to jiffy resolution
133     */
134     -#define NS_TO_JIFFIES(TIME) ((TIME) / (1000000000 / HZ))
135     -#define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ))
136     -
137     -/*
138     - * These are the 'tuning knobs' of the scheduler:
139     - *
140     - * Minimum timeslice is 5 msecs (or 1 jiffy, whichever is larger),
141     - * default timeslice is 100 msecs, maximum timeslice is 800 msecs.
142     - * Timeslices get refilled after they expire.
143     - */
144     -#define MIN_TIMESLICE max(5 * HZ / 1000, 1)
145     -#define DEF_TIMESLICE (100 * HZ / 1000)
146     -#define ON_RUNQUEUE_WEIGHT 30
147     -#define CHILD_PENALTY 95
148     -#define PARENT_PENALTY 100
149     -#define EXIT_WEIGHT 3
150     -#define PRIO_BONUS_RATIO 25
151     -#define MAX_BONUS (MAX_USER_PRIO * PRIO_BONUS_RATIO / 100)
152     -#define INTERACTIVE_DELTA 2
153     -#define MAX_SLEEP_AVG (DEF_TIMESLICE * MAX_BONUS)
154     -#define STARVATION_LIMIT (MAX_SLEEP_AVG)
155     -#define NS_MAX_SLEEP_AVG (JIFFIES_TO_NS(MAX_SLEEP_AVG))
156     -
157     -/*
158     - * If a task is 'interactive' then we reinsert it in the active
159     - * array after it has expired its current timeslice. (it will not
160     - * continue to run immediately, it will still roundrobin with
161     - * other interactive tasks.)
162     - *
163     - * This part scales the interactivity limit depending on niceness.
164     - *
165     - * We scale it linearly, offset by the INTERACTIVE_DELTA delta.
166     - * Here are a few examples of different nice levels:
167     - *
168     - * TASK_INTERACTIVE(-20): [1,1,1,1,1,1,1,1,1,0,0]
169     - * TASK_INTERACTIVE(-10): [1,1,1,1,1,1,1,0,0,0,0]
170     - * TASK_INTERACTIVE( 0): [1,1,1,1,0,0,0,0,0,0,0]
171     - * TASK_INTERACTIVE( 10): [1,1,0,0,0,0,0,0,0,0,0]
172     - * TASK_INTERACTIVE( 19): [0,0,0,0,0,0,0,0,0,0,0]
173     - *
174     - * (the X axis represents the possible -5 ... 0 ... +5 dynamic
175     - * priority range a task can explore, a value of '1' means the
176     - * task is rated interactive.)
177     - *
178     - * Ie. nice +19 tasks can never get 'interactive' enough to be
179     - * reinserted into the active array. And only heavily CPU-hog nice -20
180     - * tasks will be expired. Default nice 0 tasks are somewhere between,
181     - * it takes some effort for them to get interactive, but it's not
182     - * too hard.
183     - */
184     -
185     -#define CURRENT_BONUS(p) \
186     - (NS_TO_JIFFIES((p)->sleep_avg) * MAX_BONUS / \
187     - MAX_SLEEP_AVG)
188     -
189     -#define GRANULARITY (10 * HZ / 1000 ? : 1)
190     -
191     -#ifdef CONFIG_SMP
192     -#define TIMESLICE_GRANULARITY(p) (GRANULARITY * \
193     - (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)) * \
194     - num_online_cpus())
195     -#else
196     -#define TIMESLICE_GRANULARITY(p) (GRANULARITY * \
197     - (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)))
198     -#endif
199     -
200     -#define SCALE(v1,v1_max,v2_max) \
201     - (v1) * (v2_max) / (v1_max)
202     -
203     -#define DELTA(p) \
204     - (SCALE(TASK_NICE(p) + 20, 40, MAX_BONUS) - 20 * MAX_BONUS / 40 + \
205     - INTERACTIVE_DELTA)
206     -
207     -#define TASK_INTERACTIVE(p) \
208     - ((p)->prio <= (p)->static_prio - DELTA(p))
209     -
210     -#define INTERACTIVE_SLEEP(p) \
211     - (JIFFIES_TO_NS(MAX_SLEEP_AVG * \
212     - (MAX_BONUS / 2 + DELTA((p)) + 1) / MAX_BONUS - 1))
213     +#define NSJIFFY (1000000000 / HZ) /* One jiffy in ns */
214     +#define NS_TO_JIFFIES(TIME) ((TIME) / NSJIFFY)
215     +#define JIFFIES_TO_NS(TIME) ((TIME) * NSJIFFY)
216    
217     #define TASK_PREEMPTS_CURR(p, rq) \
218     ((p)->prio < (rq)->curr->prio)
219    
220     -#define SCALE_PRIO(x, prio) \
221     - max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_TIMESLICE)
222     -
223     -static unsigned int static_prio_timeslice(int static_prio)
224     -{
225     - if (static_prio < NICE_TO_PRIO(0))
226     - return SCALE_PRIO(DEF_TIMESLICE * 4, static_prio);
227     - else
228     - return SCALE_PRIO(DEF_TIMESLICE, static_prio);
229     -}
230     -
231     /*
232     - * task_timeslice() scales user-nice values [ -20 ... 0 ... 19 ]
233     - * to time slice values: [800ms ... 100ms ... 5ms]
234     - *
235     - * The higher a thread's priority, the bigger timeslices
236     - * it gets during one round of execution. But even the lowest
237     - * priority thread gets MIN_TIMESLICE worth of execution time.
238     + * This is the time all tasks within the same priority round robin.
239     + * Set to a minimum of 6ms.
240     */
241     -
242     -static inline unsigned int task_timeslice(struct task_struct *p)
243     -{
244     - return static_prio_timeslice(p->static_prio);
245     -}
246     -
247     -/*
248     - * These are the runqueue data structures:
249     - */
250     -
251     -struct prio_array {
252     - unsigned int nr_active;
253     - DECLARE_BITMAP(bitmap, MAX_PRIO+1); /* include 1 bit for delimiter */
254     - struct list_head queue[MAX_PRIO];
255     -};
256     +#define RR_INTERVAL ((6 * HZ / 1001) + 1)
257     +#define DEF_TIMESLICE (RR_INTERVAL * 19)
258    
259     /*
260     * This is the main, per-CPU runqueue data structure.
261     @@ -224,14 +124,13 @@ struct rq {
262     */
263     unsigned long nr_uninterruptible;
264    
265     - unsigned long expired_timestamp;
266     /* Cached timestamp set by update_cpu_clock() */
267     unsigned long long most_recent_timestamp;
268     struct task_struct *curr, *idle;
269     unsigned long next_balance;
270     struct mm_struct *prev_mm;
271     - struct prio_array *active, *expired, arrays[2];
272     - int best_expired_prio;
273     + unsigned long bitmap[BITS_TO_LONGS(MAX_PRIO + 1)];
274     + struct list_head queue[MAX_PRIO];
275     atomic_t nr_iowait;
276    
277     #ifdef CONFIG_SMP
278     @@ -568,13 +467,7 @@ static inline struct rq *this_rq_lock(vo
279    
280     #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
281     /*
282     - * Called when a process is dequeued from the active array and given
283     - * the cpu. We should note that with the exception of interactive
284     - * tasks, the expired queue will become the active queue after the active
285     - * queue is empty, without explicitly dequeuing and requeuing tasks in the
286     - * expired queue. (Interactive tasks may be requeued directly to the
287     - * active queue, thus delaying tasks in the expired queue from running;
288     - * see scheduler_tick()).
289     + * Called when a process is dequeued and given the cpu.
290     *
291     * This function is only called from sched_info_arrive(), rather than
292     * dequeue_task(). Even though a task may be queued and dequeued multiple
293     @@ -607,13 +500,11 @@ static void sched_info_arrive(struct tas
294     }
295    
296     /*
297     - * Called when a process is queued into either the active or expired
298     - * array. The time is noted and later used to determine how long we
299     - * had to wait for us to reach the cpu. Since the expired queue will
300     - * become the active queue after active queue is empty, without dequeuing
301     - * and requeuing any tasks, we are interested in queuing to either. It
302     - * is unusual but not impossible for tasks to be dequeued and immediately
303     - * requeued in the same or another array: this can happen in sched_yield(),
304     + * Called when a process is queued.
305     + * The time is noted and later used to determine how long we had to wait for
306     + * us to reach the cpu.
307     + * It is unusual but not impossible for tasks to be dequeued and immediately
308     + * requeued: this can happen in sched_yield(),
309     * set_user_nice(), and even load_balance() as it moves tasks from runqueue
310     * to runqueue.
311     *
312     @@ -672,73 +563,81 @@ sched_info_switch(struct task_struct *pr
313     #define sched_info_switch(t, next) do { } while (0)
314     #endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */
315    
316     -/*
317     - * Adding/removing a task to/from a priority array:
318     - */
319     -static void dequeue_task(struct task_struct *p, struct prio_array *array)
320     +#if BITS_PER_LONG < 64
321     +static inline void longlimit(unsigned long long *longlong)
322     +{
323     + if (*longlong > (1 << 31))
324     + *longlong = 1 << 31;
325     +}
326     +#else
327     +static inline void longlimit(unsigned long long *__unused)
328     +{
329     +}
330     +#endif
331     +
332     +/* Get nanosecond clock difference without overflowing unsigned long. */
333     +static unsigned long ns_diff(unsigned long long v1, unsigned long long v2)
334     {
335     - array->nr_active--;
336     - list_del(&p->run_list);
337     - if (list_empty(array->queue + p->prio))
338     - __clear_bit(p->prio, array->bitmap);
339     + unsigned long long vdiff;
340     + if (likely(v1 >= v2)) {
341     + vdiff = v1 - v2;
342     + longlimit(&vdiff);
343     + } else {
344     + /*
345     + * Rarely the clock appears to go backwards. There should
346     + * always be a positive difference so return 1.
347     + */
348     + vdiff = 1;
349     + }
350     + return (unsigned long)vdiff;
351     }
352    
353     -static void enqueue_task(struct task_struct *p, struct prio_array *array)
354     +static inline int task_queued(struct task_struct *task)
355     {
356     - sched_info_queued(p);
357     - list_add_tail(&p->run_list, array->queue + p->prio);
358     - __set_bit(p->prio, array->bitmap);
359     - array->nr_active++;
360     - p->array = array;
361     + return !list_empty(&task->run_list);
362     }
363    
364     /*
365     - * Put task to the end of the run list without the overhead of dequeue
366     - * followed by enqueue.
367     + * Adding/removing a task to/from a runqueue:
368     */
369     -static void requeue_task(struct task_struct *p, struct prio_array *array)
370     +static void dequeue_task(struct task_struct *p, struct rq *rq)
371     {
372     - list_move_tail(&p->run_list, array->queue + p->prio);
373     + list_del_init(&p->run_list);
374     + if (list_empty(rq->queue + p->prio))
375     + __clear_bit(p->prio, rq->bitmap);
376     + p->ns_debit = 0;
377     }
378    
379     -static inline void
380     -enqueue_task_head(struct task_struct *p, struct prio_array *array)
381     +static void enqueue_task(struct task_struct *p, struct rq *rq)
382     {
383     - list_add(&p->run_list, array->queue + p->prio);
384     - __set_bit(p->prio, array->bitmap);
385     - array->nr_active++;
386     - p->array = array;
387     + list_add_tail(&p->run_list, rq->queue + p->prio);
388     + __set_bit(p->prio, rq->bitmap);
389     }
390    
391     /*
392     - * __normal_prio - return the priority that is based on the static
393     - * priority but is modified by bonuses/penalties.
394     - *
395     - * We scale the actual sleep average [0 .... MAX_SLEEP_AVG]
396     - * into the -5 ... 0 ... +5 bonus/penalty range.
397     - *
398     - * We use 25% of the full 0...39 priority range so that:
399     - *
400     - * 1) nice +19 interactive tasks do not preempt nice 0 CPU hogs.
401     - * 2) nice -20 CPU hogs do not get preempted by nice 0 tasks.
402     - *
403     - * Both properties are important to certain workloads.
404     + * Put task to the end of the run list without the overhead of dequeue
405     + * followed by enqueue.
406     */
407     -
408     -static inline int __normal_prio(struct task_struct *p)
409     +static void requeue_task(struct task_struct *p, struct rq *rq, const int prio)
410     {
411     - int bonus, prio;
412     -
413     - bonus = CURRENT_BONUS(p) - MAX_BONUS / 2;
414     + list_move_tail(&p->run_list, rq->queue + prio);
415     + if (p->prio != prio) {
416     + if (list_empty(rq->queue + p->prio))
417     + __clear_bit(p->prio, rq->bitmap);
418     + p->prio = prio;
419     + __set_bit(prio, rq->bitmap);
420     + }
421     + p->ns_debit = 0;
422     +}
423    
424     - prio = p->static_prio - bonus;
425     - if (prio < MAX_RT_PRIO)
426     - prio = MAX_RT_PRIO;
427     - if (prio > MAX_PRIO-1)
428     - prio = MAX_PRIO-1;
429     - return prio;
430     +static inline void enqueue_task_head(struct task_struct *p, struct rq *rq)
431     +{
432     + list_add(&p->run_list, rq->queue + p->prio);
433     + __set_bit(p->prio, rq->bitmap);
434     }
435    
436     +static unsigned int slice(const struct task_struct *p);
437     +
438     /*
439     * To aid in avoiding the subversion of "niceness" due to uneven distribution
440     * of tasks with abnormal "nice" values across CPUs the contribution that
441     @@ -756,10 +655,9 @@ static inline int __normal_prio(struct t
442     #define TIME_SLICE_NICE_ZERO DEF_TIMESLICE
443     #define LOAD_WEIGHT(lp) \
444     (((lp) * SCHED_LOAD_SCALE) / TIME_SLICE_NICE_ZERO)
445     -#define PRIO_TO_LOAD_WEIGHT(prio) \
446     - LOAD_WEIGHT(static_prio_timeslice(prio))
447     -#define RTPRIO_TO_LOAD_WEIGHT(rp) \
448     - (PRIO_TO_LOAD_WEIGHT(MAX_RT_PRIO) + LOAD_WEIGHT(rp))
449     +#define TASK_LOAD_WEIGHT(p) LOAD_WEIGHT(slice(p))
450     +#define RTPRIO_TO_LOAD_WEIGHT(rp) \
451     + (LOAD_WEIGHT((RR_INTERVAL + 20 + (rp))))
452    
453     static void set_load_weight(struct task_struct *p)
454     {
455     @@ -776,7 +674,7 @@ static void set_load_weight(struct task_
456     #endif
457     p->load_weight = RTPRIO_TO_LOAD_WEIGHT(p->rt_priority);
458     } else
459     - p->load_weight = PRIO_TO_LOAD_WEIGHT(p->static_prio);
460     + p->load_weight = TASK_LOAD_WEIGHT(p);
461     }
462    
463     static inline void
464     @@ -804,6 +702,182 @@ static inline void dec_nr_running(struct
465     }
466    
467     /*
468     + * __activate_task - move a task to the runqueue.
469     + */
470     +static inline void __activate_task(struct task_struct *p, struct rq *rq)
471     +{
472     + enqueue_task(p, rq);
473     + inc_nr_running(p, rq);
474     +}
475     +
476     +/*
477     + * __activate_idle_task - move idle task to the _front_ of runqueue.
478     + */
479     +static inline void __activate_idle_task(struct task_struct *p, struct rq *rq)
480     +{
481     + enqueue_task_head(p, rq);
482     + inc_nr_running(p, rq);
483     +}
484     +
485     +/*
486     + * Bonus - How much higher than its base priority an interactive task can run.
487     + */
488     +static inline unsigned int bonus(const struct task_struct *p)
489     +{
490     + return TASK_USER_PRIO(p);
491     +}
492     +
493     +static unsigned int rr_interval(const struct task_struct *p)
494     +{
495     + int nice = TASK_NICE(p);
496     +
497     + if (nice < 0 && !rt_task(p))
498     + return RR_INTERVAL * (20 - nice) / 20;
499     + return RR_INTERVAL;
500     +}
501     +
502     +/*
503     + * slice - the duration a task runs before getting requeued at its best
504     + * priority and has its bonus decremented.
505     + */
506     +static unsigned int slice(const struct task_struct *p)
507     +{
508     + unsigned int slice, rr;
509     +
510     + slice = rr = rr_interval(p);
511     + if (likely(!rt_task(p)))
512     + slice += (39 - TASK_USER_PRIO(p)) * rr;
513     + return slice;
514     +}
515     +
516     +/*
517     + * We increase our bonus by sleeping more than the time we ran.
518     + * The ratio of sleep to run gives us the cpu% that we last ran and determines
519     + * the maximum bonus we can acquire.
520     + */
521     +static void inc_bonus(struct task_struct *p, unsigned long totalrun, unsigned long sleep)
522     +{
523     + unsigned int best_bonus = sleep / (totalrun + 1);
524     +
525     + if (p->bonus >= best_bonus)
526     + return;
527     + best_bonus = bonus(p);
528     + if (p->bonus < best_bonus)
529     + p->bonus++;
530     +}
531     +
532     +static inline void dec_bonus(struct task_struct *p)
533     +{
534     + if (p->bonus)
535     + p->bonus--;
536     +}
537     +
538     +static inline void slice_overrun(struct task_struct *p)
539     +{
540     + unsigned long ns_slice = JIFFIES_TO_NS(p->slice);
541     +
542     + do {
543     + p->totalrun -= ns_slice;
544     + dec_bonus(p);
545     + } while (unlikely(p->totalrun > ns_slice));
546     +}
547     +
548     +static inline void continue_slice(struct task_struct *p)
549     +{
550     + unsigned long total_run = NS_TO_JIFFIES(p->totalrun);
551     +
552     + if (unlikely(total_run >= p->slice))
553     + slice_overrun(p);
554     + else {
555     + unsigned long remainder;
556     +
557     + p->slice -= total_run;
558     + remainder = p->slice % rr_interval(p);
559     + if (remainder)
560     + p->time_slice = remainder;
561     + }
562     +}
563     +
564     +/*
565     + * recalc_task_prio - this checks for tasks that have run less than a full
566     + * slice and have woken up again soon after, or have just forked a
567     + * thread/process and make them continue their old slice instead of starting
568     + * a new one at high priority.
569     + */
570     +static inline void recalc_task_prio(struct task_struct *p, const unsigned long long now)
571     +{
572     + unsigned long sleep_time;
573     +
574     + /*
575     + * If this task has managed to run to its lowest priority then
576     + * decrease its bonus and requeue it now at best priority instead
577     + * of possibly flagging around lowest priority. Save up any systime
578     + * that may affect priority on the next reschedule.
579     + */
580     + if (p->slice > p->time_slice &&
581     + p->slice - NS_TO_JIFFIES(p->totalrun) < p->time_slice) {
582     + dec_bonus(p);
583     + p->totalrun = 0;
584     + return;
585     + }
586     +
587     + /*
588     + * Add the total for this last scheduled run (p->runtime) and system
589     + * time (p->systime) done on behalf of p to the running total so far
590     + * used (p->totalrun).
591     + */
592     + p->totalrun += p->runtime + p->systime;
593     + sleep_time = ns_diff(now, p->timestamp);
594     +
595     + if (p->systime > sleep_time || p->flags & PF_FORKED)
596     + sleep_time = 0;
597     + else {
598     + sleep_time -= p->systime;
599     + /*
600     + * We elevate priority by the amount of time we slept. If we
601     + * sleep longer than our running total and have not set the
602     + * PF_NONSLEEP flag we gain a bonus.
603     + */
604     + if (sleep_time >= p->totalrun) {
605     + if (!(p->flags & PF_NONSLEEP))
606     + inc_bonus(p, p->totalrun, sleep_time);
607     + p->totalrun = 0;
608     + return;
609     + }
610     + p->totalrun -= sleep_time;
611     + }
612     + continue_slice(p);
613     +}
614     +
615     +/*
616     + * __normal_prio - dynamic priority dependent on bonus.
617     + * The priority normally decreases by one each RR_INTERVAL.
618     + * As the bonus increases the initial priority starts at a higher "stair" or
619     + * priority for longer.
620     + */
621     +static inline int __normal_prio(struct task_struct *p)
622     +{
623     + int prio;
624     + unsigned int full_slice, used_slice = 0;
625     + unsigned int best_bonus, rr;
626     +
627     + full_slice = slice(p);
628     + if (full_slice > p->slice)
629     + used_slice = full_slice - p->slice;
630     +
631     + best_bonus = bonus(p);
632     + prio = MAX_RT_PRIO + best_bonus;
633     + if (!batch_task(p))
634     + prio -= p->bonus;
635     +
636     + rr = rr_interval(p);
637     + prio += used_slice / rr;
638     + if (prio > MIN_USER_PRIO)
639     + prio = MIN_USER_PRIO;
640     + return prio;
641     +}
642     +
643     +/*
644     * Calculate the expected normal priority: i.e. priority
645     * without taking RT-inheritance into account. Might be
646     * boosted by interactivity modifiers. Changes upon fork,
647     @@ -842,111 +916,14 @@ static int effective_prio(struct task_st
648     }
649    
650     /*
651     - * __activate_task - move a task to the runqueue.
652     - */
653     -static void __activate_task(struct task_struct *p, struct rq *rq)
654     -{
655     - struct prio_array *target = rq->active;
656     -
657     - if (batch_task(p))
658     - target = rq->expired;
659     - enqueue_task(p, target);
660     - inc_nr_running(p, rq);
661     -}
662     -
663     -/*
664     - * __activate_idle_task - move idle task to the _front_ of runqueue.
665     - */
666     -static inline void __activate_idle_task(struct task_struct *p, struct rq *rq)
667     -{
668     - enqueue_task_head(p, rq->active);
669     - inc_nr_running(p, rq);
670     -}
671     -
672     -/*
673     - * Recalculate p->normal_prio and p->prio after having slept,
674     - * updating the sleep-average too:
675     - */
676     -static int recalc_task_prio(struct task_struct *p, unsigned long long now)
677     -{
678     - /* Caller must always ensure 'now >= p->timestamp' */
679     - unsigned long sleep_time = now - p->timestamp;
680     -
681     - if (batch_task(p))
682     - sleep_time = 0;
683     -
684     - if (likely(sleep_time > 0)) {
685     - /*
686     - * This ceiling is set to the lowest priority that would allow
687     - * a task to be reinserted into the active array on timeslice
688     - * completion.
689     - */
690     - unsigned long ceiling = INTERACTIVE_SLEEP(p);
691     -
692     - if (p->mm && sleep_time > ceiling && p->sleep_avg < ceiling) {
693     - /*
694     - * Prevents user tasks from achieving best priority
695     - * with one single large enough sleep.
696     - */
697     - p->sleep_avg = ceiling;
698     - /*
699     - * Using INTERACTIVE_SLEEP() as a ceiling places a
700     - * nice(0) task 1ms sleep away from promotion, and
701     - * gives it 700ms to round-robin with no chance of
702     - * being demoted. This is more than generous, so
703     - * mark this sleep as non-interactive to prevent the
704     - * on-runqueue bonus logic from intervening should
705     - * this task not receive cpu immediately.
706     - */
707     - p->sleep_type = SLEEP_NONINTERACTIVE;
708     - } else {
709     - /*
710     - * Tasks waking from uninterruptible sleep are
711     - * limited in their sleep_avg rise as they
712     - * are likely to be waiting on I/O
713     - */
714     - if (p->sleep_type == SLEEP_NONINTERACTIVE && p->mm) {
715     - if (p->sleep_avg >= ceiling)
716     - sleep_time = 0;
717     - else if (p->sleep_avg + sleep_time >=
718     - ceiling) {
719     - p->sleep_avg = ceiling;
720     - sleep_time = 0;
721     - }
722     - }
723     -
724     - /*
725     - * This code gives a bonus to interactive tasks.
726     - *
727     - * The boost works by updating the 'average sleep time'
728     - * value here, based on ->timestamp. The more time a
729     - * task spends sleeping, the higher the average gets -
730     - * and the higher the priority boost gets as well.
731     - */
732     - p->sleep_avg += sleep_time;
733     -
734     - }
735     - if (p->sleep_avg > NS_MAX_SLEEP_AVG)
736     - p->sleep_avg = NS_MAX_SLEEP_AVG;
737     - }
738     -
739     - return effective_prio(p);
740     -}
741     -
742     -/*
743     * activate_task - move a task to the runqueue and do priority recalculation
744     *
745     - * Update all the scheduling statistics stuff. (sleep average
746     - * calculation, priority modifiers, etc.)
747     */
748     static void activate_task(struct task_struct *p, struct rq *rq, int local)
749     {
750     - unsigned long long now;
751     -
752     - if (rt_task(p))
753     - goto out;
754     + unsigned long long now = sched_clock();
755     + unsigned long rr = rr_interval(p);
756    
757     - now = sched_clock();
758     #ifdef CONFIG_SMP
759     if (!local) {
760     /* Compensate for drifting sched_clock */
761     @@ -967,32 +944,15 @@ static void activate_task(struct task_st
762     (now - p->timestamp) >> 20);
763     }
764    
765     - p->prio = recalc_task_prio(p, now);
766     -
767     - /*
768     - * This checks to make sure it's not an uninterruptible task
769     - * that is now waking up.
770     - */
771     - if (p->sleep_type == SLEEP_NORMAL) {
772     - /*
773     - * Tasks which were woken up by interrupts (ie. hw events)
774     - * are most likely of interactive nature. So we give them
775     - * the credit of extending their sleep time to the period
776     - * of time they spend on the runqueue, waiting for execution
777     - * on a CPU, first time around:
778     - */
779     - if (in_interrupt())
780     - p->sleep_type = SLEEP_INTERRUPTED;
781     - else {
782     - /*
783     - * Normal first-time wakeups get a credit too for
784     - * on-runqueue time, but it will be weighted down:
785     - */
786     - p->sleep_type = SLEEP_INTERACTIVE;
787     - }
788     + p->slice = slice(p);
789     + p->time_slice = p->slice % rr ? : rr;
790     + if (!rt_task(p)) {
791     + recalc_task_prio(p, now);
792     + p->prio = effective_prio(p);
793     + p->systime = 0;
794     + p->flags &= ~(PF_FORKED | PF_NONSLEEP);
795     }
796     p->timestamp = now;
797     -out:
798     __activate_task(p, rq);
799     }
800    
801     @@ -1002,8 +962,7 @@ out:
802     static void deactivate_task(struct task_struct *p, struct rq *rq)
803     {
804     dec_nr_running(p, rq);
805     - dequeue_task(p, p->array);
806     - p->array = NULL;
807     + dequeue_task(p, rq);
808     }
809    
810     /*
811     @@ -1085,7 +1044,7 @@ migrate_task(struct task_struct *p, int
812     * If the task is not on a runqueue (and not running), then
813     * it is sufficient to simply update the task's cpu field.
814     */
815     - if (!p->array && !task_running(rq, p)) {
816     + if (!task_queued(p) && !task_running(rq, p)) {
817     set_task_cpu(p, dest_cpu);
818     return 0;
819     }
820     @@ -1116,7 +1075,7 @@ void wait_task_inactive(struct task_stru
821     repeat:
822     rq = task_rq_lock(p, &flags);
823     /* Must be off runqueue entirely, not preempted. */
824     - if (unlikely(p->array || task_running(rq, p))) {
825     + if (unlikely(task_queued(p) || task_running(rq, p))) {
826     /* If it's preempted, we yield. It could be a while. */
827     preempted = !task_running(rq, p);
828     task_rq_unlock(rq, &flags);
829     @@ -1381,6 +1340,16 @@ static inline int wake_idle(int cpu, str
830     }
831     #endif
832    
833     +/*
834     + * Check to see if p preempts rq->curr and resched if it does.
835     + */
836     +static inline void preempt(const struct task_struct *p, struct rq *rq)
837     +{
838     + if (TASK_PREEMPTS_CURR(p, rq))
839     + resched_task(rq->curr);
840     +}
841     +
842     +
843     /***
844     * try_to_wake_up - wake up a thread
845     * @p: the to-be-woken-up thread
846     @@ -1412,7 +1381,7 @@ static int try_to_wake_up(struct task_st
847     if (!(old_state & state))
848     goto out;
849    
850     - if (p->array)
851     + if (task_queued(p))
852     goto out_running;
853    
854     cpu = task_cpu(p);
855     @@ -1505,7 +1474,7 @@ out_set_cpu:
856     old_state = p->state;
857     if (!(old_state & state))
858     goto out;
859     - if (p->array)
860     + if (task_queued(p))
861     goto out_running;
862    
863     this_cpu = smp_processor_id();
864     @@ -1514,25 +1483,9 @@ out_set_cpu:
865    
866     out_activate:
867     #endif /* CONFIG_SMP */
868     - if (old_state == TASK_UNINTERRUPTIBLE) {
869     + if (old_state == TASK_UNINTERRUPTIBLE)
870     rq->nr_uninterruptible--;
871     - /*
872     - * Tasks on involuntary sleep don't earn
873     - * sleep_avg beyond just interactive state.
874     - */
875     - p->sleep_type = SLEEP_NONINTERACTIVE;
876     - } else
877     -
878     - /*
879     - * Tasks that have marked their sleep as noninteractive get
880     - * woken up with their sleep average not weighted in an
881     - * interactive way.
882     - */
883     - if (old_state & TASK_NONINTERACTIVE)
884     - p->sleep_type = SLEEP_NONINTERACTIVE;
885     -
886    
887     - activate_task(p, rq, cpu == this_cpu);
888     /*
889     * Sync wakeups (i.e. those types of wakeups where the waker
890     * has indicated that it will leave the CPU in short order)
891     @@ -1541,10 +1494,9 @@ out_activate:
892     * the waker guarantees that the freshly woken up task is going
893     * to be considered on this CPU.)
894     */
895     - if (!sync || cpu != this_cpu) {
896     - if (TASK_PREEMPTS_CURR(p, rq))
897     - resched_task(rq->curr);
898     - }
899     + activate_task(p, rq, cpu == this_cpu);
900     + if (!sync || cpu != this_cpu)
901     + preempt(p, rq);
902     success = 1;
903    
904     out_running:
905     @@ -1595,7 +1547,6 @@ void fastcall sched_fork(struct task_str
906     p->prio = current->normal_prio;
907    
908     INIT_LIST_HEAD(&p->run_list);
909     - p->array = NULL;
910     #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
911     if (unlikely(sched_info_on()))
912     memset(&p->sched_info, 0, sizeof(p->sched_info));
913     @@ -1607,30 +1558,6 @@ void fastcall sched_fork(struct task_str
914     /* Want to start with kernel preemption disabled. */
915     task_thread_info(p)->preempt_count = 1;
916     #endif
917     - /*
918     - * Share the timeslice between parent and child, thus the
919     - * total amount of pending timeslices in the system doesn't change,
920     - * resulting in more scheduling fairness.
921     - */
922     - local_irq_disable();
923     - p->time_slice = (current->time_slice + 1) >> 1;
924     - /*
925     - * The remainder of the first timeslice might be recovered by
926     - * the parent if the child exits early enough.
927     - */
928     - p->first_time_slice = 1;
929     - current->time_slice >>= 1;
930     - p->timestamp = sched_clock();
931     - if (unlikely(!current->time_slice)) {
932     - /*
933     - * This case is rare, it happens when the parent has only
934     - * a single jiffy left from its timeslice. Taking the
935     - * runqueue lock is not a problem.
936     - */
937     - current->time_slice = 1;
938     - task_running_tick(cpu_rq(cpu), current);
939     - }
940     - local_irq_enable();
941     put_cpu();
942     }
943    
944     @@ -1652,38 +1579,20 @@ void fastcall wake_up_new_task(struct ta
945     this_cpu = smp_processor_id();
946     cpu = task_cpu(p);
947    
948     - /*
949     - * We decrease the sleep average of forking parents
950     - * and children as well, to keep max-interactive tasks
951     - * from forking tasks that are max-interactive. The parent
952     - * (current) is done further down, under its lock.
953     - */
954     - p->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(p) *
955     - CHILD_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS);
956     -
957     - p->prio = effective_prio(p);
958     + /* Forked process gets no bonus to prevent fork bombs. */
959     + p->bonus = 0;
960     + current->flags |= PF_FORKED;
961    
962     if (likely(cpu == this_cpu)) {
963     + activate_task(p, rq, 1);
964     if (!(clone_flags & CLONE_VM)) {
965     /*
966     * The VM isn't cloned, so we're in a good position to
967     * do child-runs-first in anticipation of an exec. This
968     * usually avoids a lot of COW overhead.
969     */
970     - if (unlikely(!current->array))
971     - __activate_task(p, rq);
972     - else {
973     - p->prio = current->prio;
974     - p->normal_prio = current->normal_prio;
975     - list_add_tail(&p->run_list, &current->run_list);
976     - p->array = current->array;
977     - p->array->nr_active++;
978     - inc_nr_running(p, rq);
979     - }
980     set_need_resched();
981     - } else
982     - /* Run child last */
983     - __activate_task(p, rq);
984     + }
985     /*
986     * We skip the following code due to cpu == this_cpu
987     *
988     @@ -1700,53 +1609,19 @@ void fastcall wake_up_new_task(struct ta
989     */
990     p->timestamp = (p->timestamp - this_rq->most_recent_timestamp)
991     + rq->most_recent_timestamp;
992     - __activate_task(p, rq);
993     - if (TASK_PREEMPTS_CURR(p, rq))
994     - resched_task(rq->curr);
995     + activate_task(p, rq, 0);
996     + preempt(p, rq);
997    
998     /*
999     * Parent and child are on different CPUs, now get the
1000     - * parent runqueue to update the parent's ->sleep_avg:
1001     + * parent runqueue to update the parent's ->flags:
1002     */
1003     task_rq_unlock(rq, &flags);
1004     this_rq = task_rq_lock(current, &flags);
1005     }
1006     - current->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(current) *
1007     - PARENT_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS);
1008     task_rq_unlock(this_rq, &flags);
1009     }
1010    
1011     -/*
1012     - * Potentially available exiting-child timeslices are
1013     - * retrieved here - this way the parent does not get
1014     - * penalized for creating too many threads.
1015     - *
1016     - * (this cannot be used to 'generate' timeslices
1017     - * artificially, because any timeslice recovered here
1018     - * was given away by the parent in the first place.)
1019     - */
1020     -void fastcall sched_exit(struct task_struct *p)
1021     -{
1022     - unsigned long flags;
1023     - struct rq *rq;
1024     -
1025     - /*
1026     - * If the child was a (relative-) CPU hog then decrease
1027     - * the sleep_avg of the parent as well.
1028     - */
1029     - rq = task_rq_lock(p->parent, &flags);
1030     - if (p->first_time_slice && task_cpu(p) == task_cpu(p->parent)) {
1031     - p->parent->time_slice += p->time_slice;
1032     - if (unlikely(p->parent->time_slice > task_timeslice(p)))
1033     - p->parent->time_slice = task_timeslice(p);
1034     - }
1035     - if (p->sleep_avg < p->parent->sleep_avg)
1036     - p->parent->sleep_avg = p->parent->sleep_avg /
1037     - (EXIT_WEIGHT + 1) * EXIT_WEIGHT + p->sleep_avg /
1038     - (EXIT_WEIGHT + 1);
1039     - task_rq_unlock(rq, &flags);
1040     -}
1041     -
1042     /**
1043     * prepare_task_switch - prepare to switch tasks
1044     * @rq: the runqueue preparing to switch
1045     @@ -2068,23 +1943,21 @@ void sched_exec(void)
1046     * pull_task - move a task from a remote runqueue to the local runqueue.
1047     * Both runqueues must be locked.
1048     */
1049     -static void pull_task(struct rq *src_rq, struct prio_array *src_array,
1050     - struct task_struct *p, struct rq *this_rq,
1051     - struct prio_array *this_array, int this_cpu)
1052     +static void pull_task(struct rq *src_rq, struct task_struct *p,
1053     + struct rq *this_rq, int this_cpu)
1054     {
1055     - dequeue_task(p, src_array);
1056     + dequeue_task(p, src_rq);
1057     dec_nr_running(p, src_rq);
1058     set_task_cpu(p, this_cpu);
1059     inc_nr_running(p, this_rq);
1060     - enqueue_task(p, this_array);
1061     + enqueue_task(p, this_rq);
1062     p->timestamp = (p->timestamp - src_rq->most_recent_timestamp)
1063     + this_rq->most_recent_timestamp;
1064     /*
1065     * Note that idle threads have a prio of MAX_PRIO, for this test
1066     * to be always true for them.
1067     */
1068     - if (TASK_PREEMPTS_CURR(p, this_rq))
1069     - resched_task(this_rq->curr);
1070     + preempt(p, this_rq);
1071     }
1072    
1073     /*
1074     @@ -2127,8 +2000,6 @@ int can_migrate_task(struct task_struct
1075     return 1;
1076     }
1077    
1078     -#define rq_best_prio(rq) min((rq)->curr->prio, (rq)->best_expired_prio)
1079     -
1080     /*
1081     * move_tasks tries to move up to max_nr_move tasks and max_load_move weighted
1082     * load from busiest to this_rq, as part of a balancing operation within
1083     @@ -2143,7 +2014,6 @@ static int move_tasks(struct rq *this_rq
1084     {
1085     int idx, pulled = 0, pinned = 0, this_best_prio, best_prio,
1086     best_prio_seen, skip_for_load;
1087     - struct prio_array *array, *dst_array;
1088     struct list_head *head, *curr;
1089     struct task_struct *tmp;
1090     long rem_load_move;
1091     @@ -2153,8 +2023,8 @@ static int move_tasks(struct rq *this_rq
1092    
1093     rem_load_move = max_load_move;
1094     pinned = 1;
1095     - this_best_prio = rq_best_prio(this_rq);
1096     - best_prio = rq_best_prio(busiest);
1097     + this_best_prio = this_rq->curr->prio;
1098     + best_prio = busiest->curr->prio;
1099     /*
1100     * Enable handling of the case where there is more than one task
1101     * with the best priority. If the current running task is one
1102     @@ -2164,38 +2034,17 @@ static int move_tasks(struct rq *this_rq
1103     */
1104     best_prio_seen = best_prio == busiest->curr->prio;
1105    
1106     - /*
1107     - * We first consider expired tasks. Those will likely not be
1108     - * executed in the near future, and they are most likely to
1109     - * be cache-cold, thus switching CPUs has the least effect
1110     - * on them.
1111     - */
1112     - if (busiest->expired->nr_active) {
1113     - array = busiest->expired;
1114     - dst_array = this_rq->expired;
1115     - } else {
1116     - array = busiest->active;
1117     - dst_array = this_rq->active;
1118     - }
1119     -
1120     -new_array:
1121     /* Start searching at priority 0: */
1122     idx = 0;
1123     skip_bitmap:
1124     if (!idx)
1125     - idx = sched_find_first_bit(array->bitmap);
1126     + idx = sched_find_first_bit(busiest->bitmap);
1127     else
1128     - idx = find_next_bit(array->bitmap, MAX_PRIO, idx);
1129     - if (idx >= MAX_PRIO) {
1130     - if (array == busiest->expired && busiest->active->nr_active) {
1131     - array = busiest->active;
1132     - dst_array = this_rq->active;
1133     - goto new_array;
1134     - }
1135     + idx = find_next_bit(busiest->bitmap, MAX_PRIO, idx);
1136     + if (idx >= MAX_PRIO)
1137     goto out;
1138     - }
1139    
1140     - head = array->queue + idx;
1141     + head = busiest->queue + idx;
1142     curr = head->prev;
1143     skip_queue:
1144     tmp = list_entry(curr, struct task_struct, run_list);
1145     @@ -2220,7 +2069,7 @@ skip_queue:
1146     goto skip_bitmap;
1147     }
1148    
1149     - pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu);
1150     + pull_task(busiest, tmp, this_rq, this_cpu);
1151     pulled++;
1152     rem_load_move -= tmp->load_weight;
1153    
1154     @@ -3036,27 +2885,6 @@ unsigned long long current_sched_time(co
1155     }
1156    
1157     /*
1158     - * We place interactive tasks back into the active array, if possible.
1159     - *
1160     - * To guarantee that this does not starve expired tasks we ignore the
1161     - * interactivity of a task if the first expired task had to wait more
1162     - * than a 'reasonable' amount of time. This deadline timeout is
1163     - * load-dependent, as the frequency of array switched decreases with
1164     - * increasing number of running tasks. We also ignore the interactivity
1165     - * if a better static_prio task has expired:
1166     - */
1167     -static inline int expired_starving(struct rq *rq)
1168     -{
1169     - if (rq->curr->static_prio > rq->best_expired_prio)
1170     - return 1;
1171     - if (!STARVATION_LIMIT || !rq->expired_timestamp)
1172     - return 0;
1173     - if (jiffies - rq->expired_timestamp > STARVATION_LIMIT * rq->nr_running)
1174     - return 1;
1175     - return 0;
1176     -}
1177     -
1178     -/*
1179     * Account user cpu time to a process.
1180     * @p: the process that the cpu time gets accounted to
1181     * @hardirq_offset: the offset to subtract from hardirq_count()
1182     @@ -3104,6 +2932,7 @@ void account_system_time(struct task_str
1183     cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
1184     else
1185     cpustat->idle = cputime64_add(cpustat->idle, tmp);
1186     + p->systime += NSJIFFY;
1187     /* Account for system time used */
1188     acct_update_integrals(p);
1189     }
1190     @@ -3129,76 +2958,49 @@ void account_steal_time(struct task_stru
1191     cpustat->steal = cputime64_add(cpustat->steal, tmp);
1192     }
1193    
1194     +static void time_slice_expired(struct task_struct *p, struct rq *rq)
1195     +{
1196     + set_tsk_need_resched(p);
1197     + p->time_slice = rr_interval(p);
1198     + requeue_task(p, rq, effective_prio(p));
1199     +}
1200     +
1201     static void task_running_tick(struct rq *rq, struct task_struct *p)
1202     {
1203     - if (p->array != rq->active) {
1204     + unsigned long debit;
1205     +
1206     + if (unlikely(!task_queued(p))) {
1207     /* Task has expired but was not scheduled yet */
1208     set_tsk_need_resched(p);
1209     return;
1210     }
1211     + /* SCHED_FIFO tasks never run out of timeslice. */
1212     + if (unlikely(p->policy == SCHED_FIFO))
1213     + return;
1214     +
1215     spin_lock(&rq->lock);
1216     + debit = ns_diff(rq->most_recent_timestamp, p->timestamp);
1217     + p->ns_debit += debit;
1218     + if (p->ns_debit < NSJIFFY)
1219     + goto out_unlock;
1220     + p->ns_debit %= NSJIFFY;
1221     /*
1222     - * The task was running during this tick - update the
1223     - * time slice counter. Note: we do not update a thread's
1224     - * priority until it either goes to sleep or uses up its
1225     - * timeslice. This makes it possible for interactive tasks
1226     - * to use up their timeslices at their highest priority levels.
1227     + * Tasks lose bonus each time they use up a full slice().
1228     */
1229     - if (rt_task(p)) {
1230     - /*
1231     - * RR tasks need a special form of timeslice management.
1232     - * FIFO tasks have no timeslices.
1233     - */
1234     - if ((p->policy == SCHED_RR) && !--p->time_slice) {
1235     - p->time_slice = task_timeslice(p);
1236     - p->first_time_slice = 0;
1237     - set_tsk_need_resched(p);
1238     -
1239     - /* put it at the end of the queue: */
1240     - requeue_task(p, rq->active);
1241     - }
1242     + if (!--p->slice) {
1243     + dec_bonus(p);
1244     + p->totalrun = 0;
1245     + p->slice = slice(p);
1246     + time_slice_expired(p, rq);
1247     goto out_unlock;
1248     }
1249     + /*
1250     + * Tasks that run out of time_slice but still have slice left get
1251     + * requeued with a lower priority && RR_INTERVAL time_slice.
1252     + */
1253     if (!--p->time_slice) {
1254     - dequeue_task(p, rq->active);
1255     - set_tsk_need_resched(p);
1256     - p->prio = effective_prio(p);
1257     - p->time_slice = task_timeslice(p);
1258     - p->first_time_slice = 0;
1259     -
1260     - if (!rq->expired_timestamp)
1261     - rq->expired_timestamp = jiffies;
1262     - if (!TASK_INTERACTIVE(p) || expired_starving(rq)) {
1263     - enqueue_task(p, rq->expired);
1264     - if (p->static_prio < rq->best_expired_prio)
1265     - rq->best_expired_prio = p->static_prio;
1266     - } else
1267     - enqueue_task(p, rq->active);
1268     - } else {
1269     - /*
1270     - * Prevent a too long timeslice allowing a task to monopolize
1271     - * the CPU. We do this by splitting up the timeslice into
1272     - * smaller pieces.
1273     - *
1274     - * Note: this does not mean the task's timeslices expire or
1275     - * get lost in any way, they just might be preempted by
1276     - * another task of equal priority. (one with higher
1277     - * priority would have preempted this task already.) We
1278     - * requeue this task to the end of the list on this priority
1279     - * level, which is in essence a round-robin of tasks with
1280     - * equal priority.
1281     - *
1282     - * This only applies to tasks in the interactive
1283     - * delta range with at least TIMESLICE_GRANULARITY to requeue.
1284     - */
1285     - if (TASK_INTERACTIVE(p) && !((task_timeslice(p) -
1286     - p->time_slice) % TIMESLICE_GRANULARITY(p)) &&
1287     - (p->time_slice >= TIMESLICE_GRANULARITY(p)) &&
1288     - (p->array == rq->active)) {
1289     -
1290     - requeue_task(p, rq->active);
1291     - set_tsk_need_resched(p);
1292     - }
1293     + time_slice_expired(p, rq);
1294     + goto out_unlock;
1295     }
1296     out_unlock:
1297     spin_unlock(&rq->lock);
1298     @@ -3207,9 +3009,6 @@ out_unlock:
1299     /*
1300     * This function gets called by the timer code, with HZ frequency.
1301     * We call it with interrupts disabled.
1302     - *
1303     - * It also gets called by the fork code, when changing the parent's
1304     - * timeslices.
1305     */
1306     void scheduler_tick(void)
1307     {
1308     @@ -3273,13 +3072,13 @@ static void wake_sleeping_dependent(int
1309    
1310     /*
1311     * number of 'lost' timeslices this task wont be able to fully
1312     - * utilize, if another task runs on a sibling. This models the
1313     + * utilise, if another task runs on a sibling. This models the
1314     * slowdown effect of other tasks running on siblings:
1315     */
1316     static inline unsigned long
1317     smt_slice(struct task_struct *p, struct sched_domain *sd)
1318     {
1319     - return p->time_slice * (100 - sd->per_cpu_gain) / 100;
1320     + return p->slice * (100 - sd->per_cpu_gain) / 100;
1321     }
1322    
1323     /*
1324     @@ -3343,7 +3142,7 @@ dependent_sleeper(int this_cpu, struct r
1325     } else {
1326     if (smt_curr->static_prio < p->static_prio &&
1327     !TASK_PREEMPTS_CURR(p, smt_rq) &&
1328     - smt_slice(smt_curr, sd) > task_timeslice(p))
1329     + smt_slice(smt_curr, sd) > slice(p))
1330     ret = 1;
1331     }
1332     unlock:
1333     @@ -3400,25 +3199,18 @@ EXPORT_SYMBOL(sub_preempt_count);
1334    
1335     #endif
1336    
1337     -static inline int interactive_sleep(enum sleep_type sleep_type)
1338     -{
1339     - return (sleep_type == SLEEP_INTERACTIVE ||
1340     - sleep_type == SLEEP_INTERRUPTED);
1341     -}
1342     -
1343     /*
1344     * schedule() is the main scheduler function.
1345     */
1346     asmlinkage void __sched schedule(void)
1347     {
1348     struct task_struct *prev, *next;
1349     - struct prio_array *array;
1350     struct list_head *queue;
1351     unsigned long long now;
1352     - unsigned long run_time;
1353     - int cpu, idx, new_prio;
1354     long *switch_count;
1355     + unsigned long debit;
1356     struct rq *rq;
1357     + int cpu, idx;
1358    
1359     /*
1360     * Test if we are atomic. Since do_exit() needs to call into
1361     @@ -3454,20 +3246,11 @@ need_resched_nonpreemptible:
1362    
1363     schedstat_inc(rq, sched_cnt);
1364     now = sched_clock();
1365     - if (likely((long long)(now - prev->timestamp) < NS_MAX_SLEEP_AVG)) {
1366     - run_time = now - prev->timestamp;
1367     - if (unlikely((long long)(now - prev->timestamp) < 0))
1368     - run_time = 0;
1369     - } else
1370     - run_time = NS_MAX_SLEEP_AVG;
1371     -
1372     - /*
1373     - * Tasks charged proportionately less run_time at high sleep_avg to
1374     - * delay them losing their interactive status
1375     - */
1376     - run_time /= (CURRENT_BONUS(prev) ? : 1);
1377    
1378     spin_lock_irq(&rq->lock);
1379     + prev->runtime = ns_diff(now, prev->timestamp);
1380     + debit = ns_diff(now, rq->most_recent_timestamp) % NSJIFFY;
1381     + prev->ns_debit += debit;
1382    
1383     switch_count = &prev->nivcsw;
1384     if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
1385     @@ -3476,8 +3259,10 @@ need_resched_nonpreemptible:
1386     unlikely(signal_pending(prev))))
1387     prev->state = TASK_RUNNING;
1388     else {
1389     - if (prev->state == TASK_UNINTERRUPTIBLE)
1390     + if (prev->state == TASK_UNINTERRUPTIBLE) {
1391     + prev->flags |= PF_NONSLEEP;
1392     rq->nr_uninterruptible++;
1393     + }
1394     deactivate_task(prev, rq);
1395     }
1396     }
1397     @@ -3487,62 +3272,28 @@ need_resched_nonpreemptible:
1398     idle_balance(cpu, rq);
1399     if (!rq->nr_running) {
1400     next = rq->idle;
1401     - rq->expired_timestamp = 0;
1402     wake_sleeping_dependent(cpu);
1403     goto switch_tasks;
1404     }
1405     }
1406    
1407     - array = rq->active;
1408     - if (unlikely(!array->nr_active)) {
1409     - /*
1410     - * Switch the active and expired arrays.
1411     - */
1412     - schedstat_inc(rq, sched_switch);
1413     - rq->active = rq->expired;
1414     - rq->expired = array;
1415     - array = rq->active;
1416     - rq->expired_timestamp = 0;
1417     - rq->best_expired_prio = MAX_PRIO;
1418     - }
1419     -
1420     - idx = sched_find_first_bit(array->bitmap);
1421     - queue = array->queue + idx;
1422     + idx = sched_find_first_bit(rq->bitmap);
1423     + queue = rq->queue + idx;
1424     next = list_entry(queue->next, struct task_struct, run_list);
1425    
1426     - if (!rt_task(next) && interactive_sleep(next->sleep_type)) {
1427     - unsigned long long delta = now - next->timestamp;
1428     - if (unlikely((long long)(now - next->timestamp) < 0))
1429     - delta = 0;
1430     -
1431     - if (next->sleep_type == SLEEP_INTERACTIVE)
1432     - delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128;
1433     -
1434     - array = next->array;
1435     - new_prio = recalc_task_prio(next, next->timestamp + delta);
1436     -
1437     - if (unlikely(next->prio != new_prio)) {
1438     - dequeue_task(next, array);
1439     - next->prio = new_prio;
1440     - enqueue_task(next, array);
1441     - }
1442     - }
1443     - next->sleep_type = SLEEP_NORMAL;
1444     if (dependent_sleeper(cpu, rq, next))
1445     next = rq->idle;
1446     + else {
1447     + prefetch(next);
1448     + prefetch_stack(next);
1449     + }
1450     switch_tasks:
1451     if (next == rq->idle)
1452     schedstat_inc(rq, sched_goidle);
1453     - prefetch(next);
1454     - prefetch_stack(next);
1455     clear_tsk_need_resched(prev);
1456     rcu_qsctr_inc(task_cpu(prev));
1457    
1458     update_cpu_clock(prev, rq, now);
1459     -
1460     - prev->sleep_avg -= run_time;
1461     - if ((long)prev->sleep_avg <= 0)
1462     - prev->sleep_avg = 0;
1463     prev->timestamp = prev->last_ran = now;
1464    
1465     sched_info_switch(prev, next);
1466     @@ -3978,29 +3729,21 @@ EXPORT_SYMBOL(sleep_on_timeout);
1467     */
1468     void rt_mutex_setprio(struct task_struct *p, int prio)
1469     {
1470     - struct prio_array *array;
1471     unsigned long flags;
1472     + int queued, oldprio;
1473     struct rq *rq;
1474     - int oldprio;
1475    
1476     BUG_ON(prio < 0 || prio > MAX_PRIO);
1477    
1478     rq = task_rq_lock(p, &flags);
1479    
1480     oldprio = p->prio;
1481     - array = p->array;
1482     - if (array)
1483     - dequeue_task(p, array);
1484     + if ((queued = task_queued(p)))
1485     + dequeue_task(p, rq);
1486     p->prio = prio;
1487    
1488     - if (array) {
1489     - /*
1490     - * If changing to an RT priority then queue it
1491     - * in the active array!
1492     - */
1493     - if (rt_task(p))
1494     - array = rq->active;
1495     - enqueue_task(p, array);
1496     + if (queued) {
1497     + enqueue_task(p, rq);
1498     /*
1499     * Reschedule if we are currently running on this runqueue and
1500     * our priority decreased, or if we are not currently running on
1501     @@ -4009,8 +3752,8 @@ void rt_mutex_setprio(struct task_struct
1502     if (task_running(rq, p)) {
1503     if (p->prio > oldprio)
1504     resched_task(rq->curr);
1505     - } else if (TASK_PREEMPTS_CURR(p, rq))
1506     - resched_task(rq->curr);
1507     + } else
1508     + preempt(p, rq);
1509     }
1510     task_rq_unlock(rq, &flags);
1511     }
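
The PI-boost path is now symmetric with every other priority change: dequeue from the single array, retag p->prio, enqueue again, then either reschedule the task if it is the one currently running and was demoted, or test whether it should preempt whatever is running. A small decision-only sketch of that check; the "else" branch only approximates what preempt(p, rq) tests (the real helper is defined earlier in this patch), so the exact comparison is an assumption:

/* Decision-only sketch of the post-requeue check in rt_mutex_setprio().
 * fake_rq/curr_prio stand in for the runqueue state; the non-running branch
 * approximates preempt(p, rq) and is an assumption, not the patch's helper. */
#include <stdio.h>
#include <stdbool.h>

struct fake_rq { int curr_prio; };           /* priority of the task currently running */

static bool needs_resched(const struct fake_rq *rq,
			  int new_prio, int old_prio, bool p_is_running)
{
	if (p_is_running)
		return new_prio > old_prio;  /* demoted while running: give the CPU up */
	return new_prio < rq->curr_prio;     /* queued task may now preempt the runner */
}

int main(void)
{
	struct fake_rq rq = { .curr_prio = 120 };

	/* a lock waiter boosted from 120 to 99 should preempt a prio-120 runner */
	printf("boosted waiter preempts: %d\n", needs_resched(&rq, 99, 120, false));
	/* the running task dropping from 110 back to 130 should be rescheduled */
	printf("deboosted runner reschedules: %d\n", needs_resched(&rq, 130, 110, true));
	return 0;
}
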
1512     @@ -4019,8 +3762,7 @@ void rt_mutex_setprio(struct task_struct
1513    
1514     void set_user_nice(struct task_struct *p, long nice)
1515     {
1516     - struct prio_array *array;
1517     - int old_prio, delta;
1518     + int queued, old_prio, delta;
1519     unsigned long flags;
1520     struct rq *rq;
1521    
1522     @@ -4041,20 +3783,21 @@ void set_user_nice(struct task_struct *p
1523     p->static_prio = NICE_TO_PRIO(nice);
1524     goto out_unlock;
1525     }
1526     - array = p->array;
1527     - if (array) {
1528     - dequeue_task(p, array);
1529     + if ((queued = task_queued(p))) {
1530     + dequeue_task(p, rq);
1531     dec_raw_weighted_load(rq, p);
1532     }
1533    
1534     p->static_prio = NICE_TO_PRIO(nice);
1535     set_load_weight(p);
1536     old_prio = p->prio;
1537     + if (p->bonus > bonus(p))
1538     + p->bonus = bonus(p);
1539     p->prio = effective_prio(p);
1540     delta = p->prio - old_prio;
1541    
1542     - if (array) {
1543     - enqueue_task(p, array);
1544     + if (queued) {
1545     + enqueue_task(p, rq);
1546     inc_raw_weighted_load(rq, p);
1547     /*
1548     * If the task increased its priority or is running and
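
set_user_nice() gains one step besides the requeue: the accumulated bonus is capped so a renice cannot leave the task with more bonus than its new static priority would earn. A sketch of that clamp; NICE_TO_PRIO follows the usual kernel mapping quoted in sched.h, while max_bonus_for() is only a hypothetical stand-in for this patch's bonus(p):

/* Renice sketch: recompute static_prio from the nice value, then clamp the
 * banked bonus.  max_bonus_for() is an assumed formula, not the patch's
 * bonus(p). */
#include <stdio.h>

#define MAX_RT_PRIO         100
#define NICE_TO_PRIO(nice)  (MAX_RT_PRIO + (nice) + 20)
#define MIN_USER_PRIO       (MAX_RT_PRIO + 40 - 1)        /* 139 */

struct task { int static_prio; unsigned int bonus; };

static unsigned int max_bonus_for(const struct task *p)
{
	/* assumption: the cap shrinks as the task gets "nicer" */
	return (unsigned int)(MIN_USER_PRIO - p->static_prio);
}

static void renice(struct task *p, long nice)
{
	p->static_prio = NICE_TO_PRIO(nice);
	if (p->bonus > max_bonus_for(p))
		p->bonus = max_bonus_for(p);
}

int main(void)
{
	struct task p = { .static_prio = NICE_TO_PRIO(-10), .bonus = 25 };

	renice(&p, 10);                          /* nice -10 -> nice 10 */
	printf("static_prio=%d bonus=%u\n", p.static_prio, p.bonus);   /* 130, 9 */
	return 0;
}
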
1549     @@ -4177,18 +3920,13 @@ static inline struct task_struct *find_p
1550     /* Actually do priority change: must hold rq lock. */
1551     static void __setscheduler(struct task_struct *p, int policy, int prio)
1552     {
1553     - BUG_ON(p->array);
1554     + BUG_ON(task_queued(p));
1555    
1556     p->policy = policy;
1557     p->rt_priority = prio;
1558     p->normal_prio = normal_prio(p);
1559     /* we are holding p->pi_lock already */
1560     p->prio = rt_mutex_getprio(p);
1561     - /*
1562     - * SCHED_BATCH tasks are treated as perpetual CPU hogs:
1563     - */
1564     - if (policy == SCHED_BATCH)
1565     - p->sleep_avg = 0;
1566     set_load_weight(p);
1567     }
1568    
1569     @@ -4204,8 +3942,7 @@ static void __setscheduler(struct task_s
1570     int sched_setscheduler(struct task_struct *p, int policy,
1571     struct sched_param *param)
1572     {
1573     - int retval, oldprio, oldpolicy = -1;
1574     - struct prio_array *array;
1575     + int queued, retval, oldprio, oldpolicy = -1;
1576     unsigned long flags;
1577     struct rq *rq;
1578    
1579     @@ -4279,12 +4016,11 @@ recheck:
1580     spin_unlock_irqrestore(&p->pi_lock, flags);
1581     goto recheck;
1582     }
1583     - array = p->array;
1584     - if (array)
1585     + if ((queued = task_queued(p)))
1586     deactivate_task(p, rq);
1587     oldprio = p->prio;
1588     __setscheduler(p, policy, param->sched_priority);
1589     - if (array) {
1590     + if (queued) {
1591     __activate_task(p, rq);
1592     /*
1593     * Reschedule if we are currently running on this runqueue and
1594     @@ -4294,8 +4030,8 @@ recheck:
1595     if (task_running(rq, p)) {
1596     if (p->prio > oldprio)
1597     resched_task(rq->curr);
1598     - } else if (TASK_PREEMPTS_CURR(p, rq))
1599     - resched_task(rq->curr);
1600     + } else
1601     + preempt(p, rq);
1602     }
1603     __task_rq_unlock(rq);
1604     spin_unlock_irqrestore(&p->pi_lock, flags);
1605     @@ -4567,41 +4303,24 @@ asmlinkage long sys_sched_getaffinity(pi
1606     /**
1607     * sys_sched_yield - yield the current processor to other threads.
1608     *
1609     - * this function yields the current CPU by moving the calling thread
1610     - * to the expired array. If there are no other threads running on this
1611     - * CPU then this function will return.
1612     + * This function yields the current CPU by dropping the priority of current
1613     + * to the lowest priority.
1614     */
1615     asmlinkage long sys_sched_yield(void)
1616     {
1617     struct rq *rq = this_rq_lock();
1618     - struct prio_array *array = current->array, *target = rq->expired;
1619     + int newprio = current->prio;
1620    
1621     schedstat_inc(rq, yld_cnt);
1622     - /*
1623     - * We implement yielding by moving the task into the expired
1624     - * queue.
1625     - *
1626     - * (special rule: RT tasks will just roundrobin in the active
1627     - * array.)
1628     - */
1629     - if (rt_task(current))
1630     - target = rq->active;
1631    
1632     - if (array->nr_active == 1) {
1633     - schedstat_inc(rq, yld_act_empty);
1634     - if (!rq->expired->nr_active)
1635     - schedstat_inc(rq, yld_both_empty);
1636     - } else if (!rq->expired->nr_active)
1637     - schedstat_inc(rq, yld_exp_empty);
1638     -
1639     - if (array != target) {
1640     - dequeue_task(current, array);
1641     - enqueue_task(current, target);
1642     - } else
1643     - /*
1644     - * requeue_task is cheaper so perform that if possible.
1645     - */
1646     - requeue_task(current, array);
1647     + newprio = current->prio;
1648     + schedstat_inc(rq, yld_cnt);
1649     + current->slice = slice(current);
1650     + current->time_slice = rr_interval(current);
1651     + if (likely(!rt_task(current)))
1652     + newprio = MIN_USER_PRIO;
1653     +
1654     + requeue_task(current, rq, newprio);
1655    
1656     /*
1657     * Since we are going to call schedule() anyway, there's
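
sys_sched_yield() no longer needs an expired array to park the caller in: it refills the slice, and a SCHED_NORMAL yielder is requeued at MIN_USER_PRIO while an RT task keeps its priority and round-robins in place. (Note that the added lines re-assign newprio and call schedstat_inc(rq, yld_cnt) a second time next to the surviving context line; only the yield statistic count is affected.) A small sketch of the priority a yielder ends up at, using the constants quoted from the sched.h hunk earlier in this patch:

/* Yield-priority sketch: non-RT tasks drop to the lowest user priority,
 * RT tasks keep theirs.  Constants mirror MAX_RT_PRIO/MAX_PRIO/MIN_USER_PRIO. */
#include <stdio.h>
#include <stdbool.h>

#define MAX_RT_PRIO   100
#define MAX_PRIO      (MAX_RT_PRIO + 40)
#define MIN_USER_PRIO (MAX_PRIO - 1)

static int yield_prio(int prio)
{
	bool is_rt = prio < MAX_RT_PRIO;
	return is_rt ? prio : MIN_USER_PRIO;
}

int main(void)
{
	printf("nice-0 task at prio 120 yields to prio %d\n", yield_prio(120));  /* 139 */
	printf("RT task at prio 50 stays at prio %d\n", yield_prio(50));         /* 50  */
	return 0;
}
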
1658     @@ -4812,7 +4531,7 @@ long sys_sched_rr_get_interval(pid_t pid
1659     goto out_unlock;
1660    
1661     jiffies_to_timespec(p->policy == SCHED_FIFO ?
1662     - 0 : task_timeslice(p), &t);
1663     + 0 : slice(p), &t);
1664     read_unlock(&tasklist_lock);
1665     retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
1666     out_nounlock:
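
The interval reported to userspace is now slice(p) in jiffies, handed to jiffies_to_timespec() as before. For reference, a sketch of that conversion for an assumed HZ of 250; the in-kernel helper is the authoritative, HZ-independent version:

/* jiffies -> struct timespec sketch, assuming HZ = 250. */
#include <stdio.h>
#include <time.h>

#define HZ            250
#define NSEC_PER_SEC  1000000000L

static struct timespec jiffies_to_timespec_sketch(unsigned long j)
{
	struct timespec t;

	t.tv_sec  = j / HZ;
	t.tv_nsec = (j % HZ) * (NSEC_PER_SEC / HZ);
	return t;
}

int main(void)
{
	/* e.g. a 6-jiffy slice reported by slice(p) */
	struct timespec t = jiffies_to_timespec_sketch(6);

	printf("%ld.%09ld s\n", (long)t.tv_sec, t.tv_nsec);   /* 0.024000000 s */
	return 0;
}
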
1667     @@ -4941,8 +4660,6 @@ void __cpuinit init_idle(struct task_str
1668     unsigned long flags;
1669    
1670     idle->timestamp = sched_clock();
1671     - idle->sleep_avg = 0;
1672     - idle->array = NULL;
1673     idle->prio = idle->normal_prio = MAX_PRIO;
1674     idle->state = TASK_RUNNING;
1675     idle->cpus_allowed = cpumask_of_cpu(cpu);
1676     @@ -5062,7 +4779,7 @@ static int __migrate_task(struct task_st
1677     goto out;
1678    
1679     set_task_cpu(p, dest_cpu);
1680     - if (p->array) {
1681     + if (task_queued(p)) {
1682     /*
1683     * Sync timestamp with rq_dest's before activating.
1684     * The same thing could be achieved by doing this step
1685     @@ -5073,8 +4790,7 @@ static int __migrate_task(struct task_st
1686     + rq_dest->most_recent_timestamp;
1687     deactivate_task(p, rq_src);
1688     __activate_task(p, rq_dest);
1689     - if (TASK_PREEMPTS_CURR(p, rq_dest))
1690     - resched_task(rq_dest->curr);
1691     + preempt(p, rq_dest);
1692     }
1693     ret = 1;
1694     out:
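
When a queued task is migrated, its timestamp was recorded against the source CPU's clock, so before __activate_task() on the destination it is rebased onto rq_dest's clock, as the hunk above does with most_recent_timestamp. The arithmetic, with made-up numbers:

/* Timestamp rebase sketch: keep the task's offset from "now" constant while
 * switching from the source CPU's clock to the destination CPU's clock. */
#include <stdio.h>

int main(void)
{
	unsigned long long p_timestamp = 1000050;   /* task timestamp on the src clock */
	unsigned long long src_recent  = 1000100;   /* rq_src->most_recent_timestamp   */
	unsigned long long dest_recent = 2000300;   /* rq_dest->most_recent_timestamp  */

	/* Same ordering as the patch; unsigned wraparound keeps the result right
	 * even when the intermediate difference would be negative. */
	p_timestamp = p_timestamp - src_recent + dest_recent;
	printf("rebased timestamp: %llu\n", p_timestamp);   /* 2000250 */
	return 0;
}
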
1695     @@ -5303,7 +5019,7 @@ static void migrate_dead_tasks(unsigned
1696    
1697     for (arr = 0; arr < 2; arr++) {
1698     for (i = 0; i < MAX_PRIO; i++) {
1699     - struct list_head *list = &rq->arrays[arr].queue[i];
1700     + struct list_head *list = &rq->queue[i];
1701    
1702     while (!list_empty(list))
1703     migrate_dead(dead_cpu, list_entry(list->next,
1704     @@ -6894,19 +6610,16 @@ int in_sched_functions(unsigned long add
1705    
1706     void __init sched_init(void)
1707     {
1708     - int i, j, k;
1709     + int i;
1710    
1711     for_each_possible_cpu(i) {
1712     - struct prio_array *array;
1713     struct rq *rq;
1714     + int j;
1715    
1716     rq = cpu_rq(i);
1717     spin_lock_init(&rq->lock);
1718     lockdep_set_class(&rq->lock, &rq->rq_lock_key);
1719     rq->nr_running = 0;
1720     - rq->active = rq->arrays;
1721     - rq->expired = rq->arrays + 1;
1722     - rq->best_expired_prio = MAX_PRIO;
1723    
1724     #ifdef CONFIG_SMP
1725     rq->sd = NULL;
1726     @@ -6920,15 +6633,11 @@ void __init sched_init(void)
1727     #endif
1728     atomic_set(&rq->nr_iowait, 0);
1729    
1730     - for (j = 0; j < 2; j++) {
1731     - array = rq->arrays + j;
1732     - for (k = 0; k < MAX_PRIO; k++) {
1733     - INIT_LIST_HEAD(array->queue + k);
1734     - __clear_bit(k, array->bitmap);
1735     - }
1736     - // delimiter for bitsearch
1737     - __set_bit(MAX_PRIO, array->bitmap);
1738     - }
1739     + for (j = 0; j < MAX_PRIO; j++)
1740     + INIT_LIST_HEAD(&rq->queue[j]);
1741     + memset(rq->bitmap, 0, BITS_TO_LONGS(MAX_PRIO)*sizeof(long));
1742     + /* delimiter for bitsearch */
1743     + __set_bit(MAX_PRIO, rq->bitmap);
1744     }
1745    
1746     set_load_weight(&init_task);
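
Runqueue setup shrinks to the same single-array shape: one list head per priority, a zeroed bitmap, and a delimiter bit pinned at MAX_PRIO so sched_find_first_bit() always terminates. A standalone sketch of that initialisation; list_head is re-declared locally and the helpers are simplified stand-ins for the kernel macros:

/* Init sketch mirroring the sched_init() hunk: MAX_PRIO circular list heads,
 * a cleared bitmap, and the MAX_PRIO delimiter bit set once and never cleared. */
#include <stdio.h>
#include <string.h>

#define MAX_PRIO          140
#define BITS_PER_LONG     (8 * sizeof(unsigned long))
#define BITS_TO_LONGS(n)  (((n) + BITS_PER_LONG - 1) / BITS_PER_LONG)

struct list_head { struct list_head *next, *prev; };

struct runqueue {
	unsigned long    bitmap[BITS_TO_LONGS(MAX_PRIO + 1)];
	struct list_head queue[MAX_PRIO];
};

static void init_list_head(struct list_head *h)   /* stand-in for INIT_LIST_HEAD */
{
	h->next = h->prev = h;
}

static void rq_init(struct runqueue *rq)
{
	for (int j = 0; j < MAX_PRIO; j++)
		init_list_head(&rq->queue[j]);
	memset(rq->bitmap, 0, BITS_TO_LONGS(MAX_PRIO) * sizeof(long));
	/* delimiter for the bit search, as in __set_bit(MAX_PRIO, rq->bitmap) */
	rq->bitmap[MAX_PRIO / BITS_PER_LONG] |= 1UL << (MAX_PRIO % BITS_PER_LONG);
}

int main(void)
{
	static struct runqueue rq;

	rq_init(&rq);
	printf("queue[0] empty: %d, delimiter set: %d\n",
	       rq.queue[0].next == &rq.queue[0],
	       !!(rq.bitmap[MAX_PRIO / BITS_PER_LONG] &
		  (1UL << (MAX_PRIO % BITS_PER_LONG))));
	return 0;
}
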
1747     @@ -6984,10 +6693,10 @@ EXPORT_SYMBOL(__might_sleep);
1748     #ifdef CONFIG_MAGIC_SYSRQ
1749     void normalize_rt_tasks(void)
1750     {
1751     - struct prio_array *array;
1752     struct task_struct *p;
1753     unsigned long flags;
1754     struct rq *rq;
1755     + int queued;
1756    
1757     read_lock_irq(&tasklist_lock);
1758     for_each_process(p) {
1759     @@ -6997,11 +6706,10 @@ void normalize_rt_tasks(void)
1760     spin_lock_irqsave(&p->pi_lock, flags);
1761     rq = __task_rq_lock(p);
1762    
1763     - array = p->array;
1764     - if (array)
1765     + if ((queued = task_queued(p)))
1766     deactivate_task(p, task_rq(p));
1767     __setscheduler(p, SCHED_NORMAL, 0);
1768     - if (array) {
1769     + if (queued) {
1770     __activate_task(p, task_rq(p));
1771     resched_task(rq->curr);
1772     }