Magellan Linux

Annotation of /trunk/kernel26-magellan/patches-2.6.16-r12/0007-2.6.16-sched-staircase14.2.patch



Revision 72
Mon Jun 5 09:25:38 2006 UTC by niro
File size: 53449 byte(s)
Version bump to 2.6.16-r12:
- updated to linux-2.6.16.19
- updated to ck11

1 niro 72 fs/proc/array.c | 4
2     include/linux/sched.h | 13
3     include/linux/sysctl.h | 2
4     kernel/exit.c | 1
5     kernel/sched.c | 1022 ++++++++++++++++++-------------------------------
6     kernel/sysctl.c | 16
7     6 files changed, 406 insertions(+), 652 deletions(-)
8    
9     Index: linux-2.6.16-ck1/fs/proc/array.c
10     ===================================================================
11     --- linux-2.6.16-ck1.orig/fs/proc/array.c 2006-03-20 20:46:26.000000000 +1100
12     +++ linux-2.6.16-ck1/fs/proc/array.c 2006-03-20 20:46:48.000000000 +1100
13     @@ -165,7 +165,7 @@ static inline char * task_state(struct t
14     read_lock(&tasklist_lock);
15     buffer += sprintf(buffer,
16     "State:\t%s\n"
17     - "SleepAVG:\t%lu%%\n"
18     + "Bonus:\t%d\n"
19     "Tgid:\t%d\n"
20     "Pid:\t%d\n"
21     "PPid:\t%d\n"
22     @@ -173,7 +173,7 @@ static inline char * task_state(struct t
23     "Uid:\t%d\t%d\t%d\t%d\n"
24     "Gid:\t%d\t%d\t%d\t%d\n",
25     get_task_state(p),
26     - (p->sleep_avg/1024)*100/(1020000000/1024),
27     + p->bonus,
28     p->tgid,
29     p->pid, pid_alive(p) ? p->group_leader->real_parent->tgid : 0,
30     pid_alive(p) && p->ptrace ? p->parent->pid : 0,
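The hunk above replaces the SleepAVG line in /proc/<pid>/status with a Bonus line. As a quick way to see the new field, a minimal user-space reader (only meaningful on a kernel carrying this patch; on a vanilla kernel the loop simply finds no matching line):

/* read_bonus.c - print the Bonus: field this hunk adds to /proc/<pid>/status.
 * Only present on a kernel built with this patch. */
#include <stdio.h>
#include <string.h>

int main(void)
{
        char line[256];
        FILE *f = fopen("/proc/self/status", "r");

        if (!f) {
                perror("fopen");
                return 1;
        }
        while (fgets(line, sizeof(line), f)) {
                if (!strncmp(line, "Bonus:", 6)) {
                        fputs(line, stdout);
                        break;
                }
        }
        fclose(f);
        return 0;
}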
31     Index: linux-2.6.16-ck1/include/linux/sched.h
32     ===================================================================
33     --- linux-2.6.16-ck1.orig/include/linux/sched.h 2006-03-20 20:46:47.000000000 +1100
34     +++ linux-2.6.16-ck1/include/linux/sched.h 2006-03-20 20:46:48.000000000 +1100
35     @@ -200,6 +200,7 @@ extern void show_stack(struct task_struc
36    
37     void io_schedule(void);
38     long io_schedule_timeout(long timeout);
39     +extern int sched_interactive, sched_compute;
40    
41     extern void cpu_init (void);
42     extern void trap_init(void);
43     @@ -522,7 +523,6 @@ extern struct user_struct *find_user(uid
44     extern struct user_struct root_user;
45     #define INIT_USER (&root_user)
46    
47     -typedef struct prio_array prio_array_t;
48     struct backing_dev_info;
49     struct reclaim_state;
50    
51     @@ -723,18 +723,17 @@ struct task_struct {
52     int load_weight; /* for niceness load balancing purposes */
53     int prio, static_prio;
54     struct list_head run_list;
55     - prio_array_t *array;
56    
57     unsigned short ioprio;
58    
59     - unsigned long sleep_avg;
60     - unsigned long long timestamp, last_ran;
61     + unsigned long long timestamp;
62     + unsigned long runtime, totalrun, ns_debit;
63     + unsigned int bonus;
64     + unsigned int slice, time_slice;
65     unsigned long long sched_time; /* sched_clock time spent running */
66     - int activated;
67    
68     unsigned long policy;
69     cpumask_t cpus_allowed;
70     - unsigned int time_slice, first_time_slice;
71    
72     #ifdef CONFIG_SCHEDSTATS
73     struct sched_info sched_info;
74     @@ -948,6 +947,7 @@ static inline void put_task_struct(struc
75     #define PF_BORROWED_MM 0x00400000 /* I am a kthread doing use_mm */
76     #define PF_RANDOMIZE 0x00800000 /* randomize virtual address space */
77     #define PF_SWAPWRITE 0x01000000 /* Allowed to write to swap */
78     +#define PF_NONSLEEP 0x02000000 /* Waiting on in kernel activity */
79    
80     /*
81     * Only the _current_ task can read/write to tsk->flags, but other
82     @@ -1069,7 +1069,6 @@ extern void FASTCALL(wake_up_new_task(st
83     static inline void kick_process(struct task_struct *tsk) { }
84     #endif
85     extern void FASTCALL(sched_fork(task_t * p, int clone_flags));
86     -extern void FASTCALL(sched_exit(task_t * p));
87    
88     extern int in_group_p(gid_t);
89     extern int in_egroup_p(gid_t);
90     Index: linux-2.6.16-ck1/include/linux/sysctl.h
91     ===================================================================
92     --- linux-2.6.16-ck1.orig/include/linux/sysctl.h 2006-03-20 20:46:26.000000000 +1100
93     +++ linux-2.6.16-ck1/include/linux/sysctl.h 2006-03-20 20:46:48.000000000 +1100
94     @@ -148,6 +148,8 @@ enum
95     KERN_SPIN_RETRY=70, /* int: number of spinlock retries */
96     KERN_ACPI_VIDEO_FLAGS=71, /* int: flags for setting up video after ACPI sleep */
97     KERN_IA64_UNALIGNED=72, /* int: ia64 unaligned userland trap enable */
98     + KERN_INTERACTIVE=73, /* interactive tasks can have cpu bursts */
99     + KERN_COMPUTE=74, /* adjust timeslices for a compute server */
100     };
101    
102    
103     Index: linux-2.6.16-ck1/kernel/exit.c
104     ===================================================================
105     --- linux-2.6.16-ck1.orig/kernel/exit.c 2006-03-20 20:46:26.000000000 +1100
106     +++ linux-2.6.16-ck1/kernel/exit.c 2006-03-20 20:46:48.000000000 +1100
107     @@ -102,7 +102,6 @@ repeat:
108     zap_leader = (leader->exit_signal == -1);
109     }
110    
111     - sched_exit(p);
112     write_unlock_irq(&tasklist_lock);
113     spin_unlock(&p->proc_lock);
114     proc_pid_flush(proc_dentry);
115     Index: linux-2.6.16-ck1/kernel/sched.c
116     ===================================================================
117     --- linux-2.6.16-ck1.orig/kernel/sched.c 2006-03-20 20:46:46.000000000 +1100
118     +++ linux-2.6.16-ck1/kernel/sched.c 2006-03-20 20:46:48.000000000 +1100
119     @@ -16,6 +16,9 @@
120     * by Davide Libenzi, preemptible kernel bits by Robert Love.
121     * 2003-09-03 Interactivity tuning by Con Kolivas.
122     * 2004-04-02 Scheduler domains code by Nick Piggin
123     + * 2006-03-16 New staircase scheduling policy by Con Kolivas with help
124     + * from William Lee Irwin III, Zwane Mwaikambo & Peter Williams.
125     + * Staircase v14.2
126     */
127    
128     #include <linux/mm.h>
129     @@ -76,128 +79,27 @@
130     */
131     #define NS_TO_JIFFIES(TIME) ((TIME) / (1000000000 / HZ))
132     #define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ))
133     +#define NSJIFFY (1000000000 / HZ) /* One jiffy in ns */
134     +#define TASK_PREEMPTS_CURR(p, rq) ((p)->prio < (rq)->curr->prio)
135    
136     +int sched_compute __read_mostly = 0;
137     /*
138     - * These are the 'tuning knobs' of the scheduler:
139     - *
140     - * Minimum timeslice is 5 msecs (or 1 jiffy, whichever is larger),
141     - * default timeslice is 100 msecs, maximum timeslice is 800 msecs.
142     - * Timeslices get refilled after they expire.
143     - */
144     -#define MIN_TIMESLICE max(5 * HZ / 1000, 1)
145     -#define DEF_TIMESLICE (100 * HZ / 1000)
146     -#define ON_RUNQUEUE_WEIGHT 30
147     -#define CHILD_PENALTY 95
148     -#define PARENT_PENALTY 100
149     -#define EXIT_WEIGHT 3
150     -#define PRIO_BONUS_RATIO 25
151     -#define MAX_BONUS (MAX_USER_PRIO * PRIO_BONUS_RATIO / 100)
152     -#define INTERACTIVE_DELTA 2
153     -#define MAX_SLEEP_AVG (DEF_TIMESLICE * MAX_BONUS)
154     -#define STARVATION_LIMIT (MAX_SLEEP_AVG)
155     -#define NS_MAX_SLEEP_AVG (JIFFIES_TO_NS(MAX_SLEEP_AVG))
156     -
157     -/*
158     - * If a task is 'interactive' then we reinsert it in the active
159     - * array after it has expired its current timeslice. (it will not
160     - * continue to run immediately, it will still roundrobin with
161     - * other interactive tasks.)
162     - *
163     - * This part scales the interactivity limit depending on niceness.
164     - *
165     - * We scale it linearly, offset by the INTERACTIVE_DELTA delta.
166     - * Here are a few examples of different nice levels:
167     - *
168     - * TASK_INTERACTIVE(-20): [1,1,1,1,1,1,1,1,1,0,0]
169     - * TASK_INTERACTIVE(-10): [1,1,1,1,1,1,1,0,0,0,0]
170     - * TASK_INTERACTIVE( 0): [1,1,1,1,0,0,0,0,0,0,0]
171     - * TASK_INTERACTIVE( 10): [1,1,0,0,0,0,0,0,0,0,0]
172     - * TASK_INTERACTIVE( 19): [0,0,0,0,0,0,0,0,0,0,0]
173     - *
174     - * (the X axis represents the possible -5 ... 0 ... +5 dynamic
175     - * priority range a task can explore, a value of '1' means the
176     - * task is rated interactive.)
177     - *
178     - * Ie. nice +19 tasks can never get 'interactive' enough to be
179     - * reinserted into the active array. And only heavily CPU-hog nice -20
180     - * tasks will be expired. Default nice 0 tasks are somewhere between,
181     - * it takes some effort for them to get interactive, but it's not
182     - * too hard.
183     - */
184     -
185     -#define CURRENT_BONUS(p) \
186     - (NS_TO_JIFFIES((p)->sleep_avg) * MAX_BONUS / \
187     - MAX_SLEEP_AVG)
188     -
189     -#define GRANULARITY (10 * HZ / 1000 ? : 1)
190     -
191     -#ifdef CONFIG_SMP
192     -#define TIMESLICE_GRANULARITY(p) (GRANULARITY * \
193     - (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)) * \
194     - num_online_cpus())
195     -#else
196     -#define TIMESLICE_GRANULARITY(p) (GRANULARITY * \
197     - (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)))
198     -#endif
199     -
200     -#define SCALE(v1,v1_max,v2_max) \
201     - (v1) * (v2_max) / (v1_max)
202     -
203     -#define DELTA(p) \
204     - (SCALE(TASK_NICE(p), 40, MAX_BONUS) + INTERACTIVE_DELTA)
205     -
206     -#define TASK_INTERACTIVE(p) \
207     - ((p)->prio <= (p)->static_prio - DELTA(p))
208     -
209     -#define INTERACTIVE_SLEEP(p) \
210     - (JIFFIES_TO_NS(MAX_SLEEP_AVG * \
211     - (MAX_BONUS / 2 + DELTA((p)) + 1) / MAX_BONUS - 1))
212     -
213     -#define TASK_PREEMPTS_CURR(p, rq) \
214     - ((p)->prio < (rq)->curr->prio)
215     -
216     -/*
217     - * task_timeslice() scales user-nice values [ -20 ... 0 ... 19 ]
218     - * to time slice values: [800ms ... 100ms ... 5ms]
219     - *
220     - * The higher a thread's priority, the bigger timeslices
221     - * it gets during one round of execution. But even the lowest
222     - * priority thread gets MIN_TIMESLICE worth of execution time.
223     + *This is the time all tasks within the same priority round robin.
224     + *compute setting is reserved for dedicated computational scheduling
225     + *and has twenty times larger intervals. Set to a minimum of 6ms.
226     */
227     +#define _RR_INTERVAL ((6 * HZ / 1001) + 1)
228     +#define RR_INTERVAL() (_RR_INTERVAL * (1 + 16 * sched_compute))
229     +#define DEF_TIMESLICE (RR_INTERVAL() * 19)
230    
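As a rough illustration of the arithmetic behind _RR_INTERVAL, RR_INTERVAL() and DEF_TIMESLICE above, a small user-space sketch; HZ is a kernel build constant, here it is just looped over for illustration, and sched_compute mirrors the new sysctl:

/* rr_interval_calc.c - user-space sketch of the staircase timeslice macros. */
#include <stdio.h>

static int sched_compute;

static unsigned int rr_interval_jiffies(unsigned int hz)
{
        unsigned int rr = (6 * hz / 1001) + 1;          /* _RR_INTERVAL */
        return rr * (1 + 16 * sched_compute);           /* RR_INTERVAL() */
}

int main(void)
{
        unsigned int hz_values[] = { 100, 250, 1000 };

        for (int i = 0; i < 3; i++) {
                unsigned int hz = hz_values[i];
                for (sched_compute = 0; sched_compute <= 1; sched_compute++) {
                        unsigned int rr = rr_interval_jiffies(hz);
                        unsigned int def = rr * 19;     /* DEF_TIMESLICE */
                        printf("HZ=%4u compute=%d: RR_INTERVAL=%3u jiffies (%u ms), DEF_TIMESLICE=%u jiffies\n",
                               hz, sched_compute, rr, rr * 1000 / hz, def);
                }
        }
        return 0;
}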
231     -#define SCALE_PRIO(x, prio) \
232     - max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_TIMESLICE)
233     -
234     -static unsigned int static_prio_timeslice(int static_prio)
235     -{
236     - if (static_prio < NICE_TO_PRIO(0))
237     - return SCALE_PRIO(DEF_TIMESLICE * 4, static_prio);
238     - else
239     - return SCALE_PRIO(DEF_TIMESLICE, static_prio);
240     -}
241     -
242     -static inline unsigned int task_timeslice(task_t *p)
243     -{
244     - return static_prio_timeslice(p->static_prio);
245     -}
246     -
247     -#define task_hot(p, now, sd) ((long long) ((now) - (p)->last_ran) \
248     +#define task_hot(p, now, sd) ((long long) ((now) - (p)->timestamp) \
249     < (long long) (sd)->cache_hot_time)
250    
251     /*
252     * These are the runqueue data structures:
253     */
254     -
255     -#define BITMAP_SIZE ((((MAX_PRIO+1+7)/8)+sizeof(long)-1)/sizeof(long))
256     -
257     typedef struct runqueue runqueue_t;
258    
259     -struct prio_array {
260     - unsigned int nr_active;
261     - unsigned long bitmap[BITMAP_SIZE];
262     - struct list_head queue[MAX_PRIO];
263     -};
264     -
265     /*
266     * This is the main, per-CPU runqueue data structure.
267     *
268     @@ -227,12 +129,12 @@ struct runqueue {
269     */
270     unsigned long nr_uninterruptible;
271    
272     - unsigned long expired_timestamp;
273     unsigned long long timestamp_last_tick;
274     + unsigned int cache_ticks, preempted;
275     task_t *curr, *idle;
276     struct mm_struct *prev_mm;
277     - prio_array_t *active, *expired, arrays[2];
278     - int best_expired_prio;
279     + unsigned long bitmap[BITS_TO_LONGS(MAX_PRIO + 1)];
280     + struct list_head queue[MAX_PRIO];
281     atomic_t nr_iowait;
282    
283     #ifdef CONFIG_SMP
284     @@ -496,13 +398,7 @@ static inline runqueue_t *this_rq_lock(v
285    
286     #ifdef CONFIG_SCHEDSTATS
287     /*
288     - * Called when a process is dequeued from the active array and given
289     - * the cpu. We should note that with the exception of interactive
290     - * tasks, the expired queue will become the active queue after the active
291     - * queue is empty, without explicitly dequeuing and requeuing tasks in the
292     - * expired queue. (Interactive tasks may be requeued directly to the
293     - * active queue, thus delaying tasks in the expired queue from running;
294     - * see scheduler_tick()).
295     + * Called when a process is dequeued and given the cpu.
296     *
297     * This function is only called from sched_info_arrive(), rather than
298     * dequeue_task(). Even though a task may be queued and dequeued multiple
299     @@ -540,13 +436,11 @@ static void sched_info_arrive(task_t *t)
300     }
301    
302     /*
303     - * Called when a process is queued into either the active or expired
304     - * array. The time is noted and later used to determine how long we
305     - * had to wait for us to reach the cpu. Since the expired queue will
306     - * become the active queue after active queue is empty, without dequeuing
307     - * and requeuing any tasks, we are interested in queuing to either. It
308     - * is unusual but not impossible for tasks to be dequeued and immediately
309     - * requeued in the same or another array: this can happen in sched_yield(),
310     + * Called when a process is queued
311     + * The time is noted and later used to determine how long we had to wait for
312     + * us to reach the cpu.
313     + * It is unusual but not impossible for tasks to be dequeued and immediately
314     + * requeued: this can happen in sched_yield(),
315     * set_user_nice(), and even load_balance() as it moves tasks from runqueue
316     * to runqueue.
317     *
318     @@ -601,73 +495,67 @@ static inline void sched_info_switch(tas
319     #endif /* CONFIG_SCHEDSTATS */
320    
321     /*
322     - * Adding/removing a task to/from a priority array:
323     + * Get nanosecond clock difference without overflowing unsigned long.
324     */
325     -static void dequeue_task(struct task_struct *p, prio_array_t *array)
326     +static unsigned long ns_diff(const unsigned long long v1,
327     + const unsigned long long v2)
328     {
329     - array->nr_active--;
330     - list_del(&p->run_list);
331     - if (list_empty(array->queue + p->prio))
332     - __clear_bit(p->prio, array->bitmap);
333     + unsigned long long vdiff;
334     + if (likely(v1 > v2)) {
335     + vdiff = v1 - v2;
336     +#if BITS_PER_LONG < 64
337     + if (vdiff > (1 << 31))
338     + vdiff = 1 << 31;
339     +#endif
340     + } else {
341     + /*
342     + * Rarely the clock appears to go backwards. There should
343     + * always be a positive difference so return 1.
344     + */
345     + vdiff = 1;
346     + }
347     + return (unsigned long)vdiff;
348     }
349    
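A user-space sketch of the ns_diff() clamping above, with likely() and BITS_PER_LONG replaced by portable equivalents; it shows the normal case, the 32-bit cap and the backwards-clock fallback:

/* ns_diff_demo.c - sketch of the clamping behaviour of ns_diff(). */
#include <stdio.h>
#include <limits.h>

static unsigned long ns_diff(unsigned long long v1, unsigned long long v2)
{
        unsigned long long vdiff;

        if (v1 > v2) {
                vdiff = v1 - v2;
#if ULONG_MAX == 0xffffffffUL                   /* 32-bit long: cap at 2^31 ns */
                if (vdiff > (1ULL << 31))
                        vdiff = 1ULL << 31;
#endif
        } else {
                /* clock appeared to go backwards: report a minimal difference */
                vdiff = 1;
        }
        return (unsigned long)vdiff;
}

int main(void)
{
        printf("%lu\n", ns_diff(2000000ULL, 500000ULL));   /* 1500000 */
        printf("%lu\n", ns_diff(500000ULL, 2000000ULL));   /* 1 (clock went backwards) */
        printf("%lu\n", ns_diff(10000000000ULL, 0ULL));    /* full value on 64-bit, 2147483648 on 32-bit */
        return 0;
}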
350     -static void enqueue_task(struct task_struct *p, prio_array_t *array)
351     +static inline int task_queued(const task_t *task)
352     {
353     - sched_info_queued(p);
354     - list_add_tail(&p->run_list, array->queue + p->prio);
355     - __set_bit(p->prio, array->bitmap);
356     - array->nr_active++;
357     - p->array = array;
358     + return !list_empty(&task->run_list);
359     }
360    
361     /*
362     - * Put task to the end of the run list without the overhead of dequeue
363     - * followed by enqueue.
364     + * Adding/removing a task to/from a runqueue:
365     */
366     -static void requeue_task(struct task_struct *p, prio_array_t *array)
367     +static void fastcall dequeue_task(task_t *p, runqueue_t *rq)
368     {
369     - list_move_tail(&p->run_list, array->queue + p->prio);
370     + list_del_init(&p->run_list);
371     + if (list_empty(rq->queue + p->prio))
372     + __clear_bit(p->prio, rq->bitmap);
373     + p->ns_debit = 0;
374     }
375    
376     -static inline void enqueue_task_head(struct task_struct *p, prio_array_t *array)
377     +static void fastcall enqueue_task(task_t *p, runqueue_t *rq)
378     {
379     - list_add(&p->run_list, array->queue + p->prio);
380     - __set_bit(p->prio, array->bitmap);
381     - array->nr_active++;
382     - p->array = array;
383     + list_add_tail(&p->run_list, rq->queue + p->prio);
384     + __set_bit(p->prio, rq->bitmap);
385     }
386    
387     /*
388     - * effective_prio - return the priority that is based on the static
389     - * priority but is modified by bonuses/penalties.
390     - *
391     - * We scale the actual sleep average [0 .... MAX_SLEEP_AVG]
392     - * into the -5 ... 0 ... +5 bonus/penalty range.
393     - *
394     - * We use 25% of the full 0...39 priority range so that:
395     - *
396     - * 1) nice +19 interactive tasks do not preempt nice 0 CPU hogs.
397     - * 2) nice -20 CPU hogs do not get preempted by nice 0 tasks.
398     - *
399     - * Both properties are important to certain workloads.
400     + * Put task to the end of the run list without the overhead of dequeue
401     + * followed by enqueue.
402     */
403     -static int effective_prio(task_t *p)
404     +static inline void requeue_task(task_t *p, runqueue_t *rq)
405     {
406     - int bonus, prio;
407     -
408     - if (rt_task(p))
409     - return p->prio;
410     -
411     - bonus = CURRENT_BONUS(p) - MAX_BONUS / 2;
412     + list_move_tail(&p->run_list, rq->queue + p->prio);
413     +}
414    
415     - prio = p->static_prio - bonus;
416     - if (prio < MAX_RT_PRIO)
417     - prio = MAX_RT_PRIO;
418     - if (prio > MAX_PRIO-1)
419     - prio = MAX_PRIO-1;
420     - return prio;
421     +static inline void enqueue_task_head(task_t *p, runqueue_t *rq)
422     +{
423     + list_add(&p->run_list, rq->queue + p->prio);
424     + __set_bit(p->prio, rq->bitmap);
425     }
426    
427     +static unsigned int fastcall slice(const task_t *p);
428     +
429     /*
430     * To aid in avoiding the subversion of "niceness" due to uneven distribution
431     * of tasks with abnormal "nice" values across CPUs the contribution that
432     @@ -685,10 +573,9 @@ static int effective_prio(task_t *p)
433     #define TIME_SLICE_NICE_ZERO DEF_TIMESLICE
434     #define LOAD_WEIGHT(lp) \
435     (((lp) * SCHED_LOAD_SCALE) / TIME_SLICE_NICE_ZERO)
436     -#define PRIO_TO_LOAD_WEIGHT(prio) \
437     - LOAD_WEIGHT(static_prio_timeslice(prio))
438     -#define RTPRIO_TO_LOAD_WEIGHT(rp) \
439     - (PRIO_TO_LOAD_WEIGHT(MAX_RT_PRIO) + LOAD_WEIGHT(rp))
440     +#define TASK_LOAD_WEIGHT(p) LOAD_WEIGHT(slice(p))
441     +#define RTPRIO_TO_LOAD_WEIGHT(rp) \
442     + (LOAD_WEIGHT((RR_INTERVAL() + 20 + (rp))))
443    
444     static void set_load_weight(task_t *p)
445     {
446     @@ -705,7 +592,7 @@ static void set_load_weight(task_t *p)
447     #endif
448     p->load_weight = RTPRIO_TO_LOAD_WEIGHT(p->rt_priority);
449     } else
450     - p->load_weight = PRIO_TO_LOAD_WEIGHT(p->static_prio);
451     + p->load_weight = TASK_LOAD_WEIGHT(p);
452     }
453    
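A sketch of the reworked load-weight arithmetic above, assuming the 2.6.16 value of SCHED_LOAD_SCALE (128) and HZ=1000; the slice() values per nice level are taken from the new slice() function added further down in this patch:

/* load_weight_sketch.c - smpnice weights now derive from slice(), not from
 * the removed static_prio_timeslice().  All values are assumptions for
 * illustration: SCHED_LOAD_SCALE=128, RR_INTERVAL()=6 jiffies (HZ=1000). */
#include <stdio.h>

#define SCHED_LOAD_SCALE     128UL
#define RR_INTERVAL          6
#define TIME_SLICE_NICE_ZERO (RR_INTERVAL * 19)          /* DEF_TIMESLICE */

#define LOAD_WEIGHT(lp)           (((lp) * SCHED_LOAD_SCALE) / TIME_SLICE_NICE_ZERO)
#define RTPRIO_TO_LOAD_WEIGHT(rp) (LOAD_WEIGHT((RR_INTERVAL + 20 + (rp))))

int main(void)
{
        /* slice() in jiffies for a few nice levels (see slice() below) */
        struct { int nice; unsigned long slice; } s[] = {
                { -20, 480 }, { 0, 120 }, { 19, 6 },
        };

        for (int i = 0; i < 3; i++)
                printf("nice %3d (slice %3lu jiffies): load_weight=%lu\n",
                       s[i].nice, s[i].slice, LOAD_WEIGHT(s[i].slice));

        printf("SCHED_FIFO rt_priority 50:          load_weight=%lu\n",
               RTPRIO_TO_LOAD_WEIGHT(50));
        return 0;
}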
454     static inline void inc_raw_weighted_load(runqueue_t *rq, const task_t *p)
455     @@ -733,9 +620,9 @@ static inline void dec_nr_running(task_t
456     /*
457     * __activate_task - move a task to the runqueue.
458     */
459     -static inline void __activate_task(task_t *p, runqueue_t *rq)
460     +static void fastcall __activate_task(task_t *p, runqueue_t *rq)
461     {
462     - enqueue_task(p, rq->active);
463     + enqueue_task(p, rq);
464     inc_nr_running(p, rq);
465     }
466    
467     @@ -744,74 +631,157 @@ static inline void __activate_task(task_
468     */
469     static inline void __activate_idle_task(task_t *p, runqueue_t *rq)
470     {
471     - enqueue_task_head(p, rq->active);
472     + enqueue_task_head(p, rq);
473     inc_nr_running(p, rq);
474     }
475    
476     -static int recalc_task_prio(task_t *p, unsigned long long now)
477     +/*
478     + * Bonus - How much higher than its base priority an interactive task can run.
479     + */
480     +static inline unsigned int bonus(const task_t *p)
481     {
482     - /* Caller must always ensure 'now >= p->timestamp' */
483     - unsigned long long __sleep_time = now - p->timestamp;
484     - unsigned long sleep_time;
485     -
486     - if (unlikely(p->policy == SCHED_BATCH))
487     - sleep_time = 0;
488     - else {
489     - if (__sleep_time > NS_MAX_SLEEP_AVG)
490     - sleep_time = NS_MAX_SLEEP_AVG;
491     - else
492     - sleep_time = (unsigned long)__sleep_time;
493     - }
494     + return TASK_USER_PRIO(p);
495     +}
496    
497     - if (likely(sleep_time > 0)) {
498     - /*
499     - * User tasks that sleep a long time are categorised as
500     - * idle and will get just interactive status to stay active &
501     - * prevent them suddenly becoming cpu hogs and starving
502     - * other processes.
503     - */
504     - if (p->mm && p->activated != -1 &&
505     - sleep_time > INTERACTIVE_SLEEP(p)) {
506     - p->sleep_avg = JIFFIES_TO_NS(MAX_SLEEP_AVG -
507     - DEF_TIMESLICE);
508     - } else {
509     - /*
510     - * The lower the sleep avg a task has the more
511     - * rapidly it will rise with sleep time.
512     - */
513     - sleep_time *= (MAX_BONUS - CURRENT_BONUS(p)) ? : 1;
514     +static unsigned int fastcall rr_interval(const task_t *p)
515     +{
516     + int nice = TASK_NICE(p);
517    
518     - /*
519     - * Tasks waking from uninterruptible sleep are
520     - * limited in their sleep_avg rise as they
521     - * are likely to be waiting on I/O
522     - */
523     - if (p->activated == -1 && p->mm) {
524     - if (p->sleep_avg >= INTERACTIVE_SLEEP(p))
525     - sleep_time = 0;
526     - else if (p->sleep_avg + sleep_time >=
527     - INTERACTIVE_SLEEP(p)) {
528     - p->sleep_avg = INTERACTIVE_SLEEP(p);
529     - sleep_time = 0;
530     - }
531     - }
532     + if (nice < 0 && !rt_task(p))
533     + return RR_INTERVAL() * (20 - nice) / 20;
534     + return RR_INTERVAL();
535     +}
536    
537     - /*
538     - * This code gives a bonus to interactive tasks.
539     - *
540     - * The boost works by updating the 'average sleep time'
541     - * value here, based on ->timestamp. The more time a
542     - * task spends sleeping, the higher the average gets -
543     - * and the higher the priority boost gets as well.
544     - */
545     - p->sleep_avg += sleep_time;
546     +/*
547     + * slice - the duration a task runs before getting requeued at its best
548     + * priority and has its bonus decremented.
549     + */
550     +static unsigned int fastcall slice(const task_t *p)
551     +{
552     + unsigned int slice, rr;
553    
554     - if (p->sleep_avg > NS_MAX_SLEEP_AVG)
555     - p->sleep_avg = NS_MAX_SLEEP_AVG;
556     - }
557     + slice = rr = rr_interval(p);
558     + if (likely(!rt_task(p)))
559     + slice += (39 - TASK_USER_PRIO(p)) * rr;
560     + return slice;
561     +}
562     +
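To make the rr_interval()/slice() relationship above concrete, a sketch for SCHED_NORMAL tasks, assuming HZ=1000 (RR_INTERVAL() of 6 jiffies, roughly 6 ms) and sched_compute off; user_prio stands in for TASK_USER_PRIO(p), i.e. nice + 20:

/* slice_table.c - rr_interval() and slice() per nice level (sketch). */
#include <stdio.h>

#define RR_BASE_MS 6    /* RR_INTERVAL() for HZ=1000, sched_compute=0 */

static unsigned int rr_interval_ms(int nice)
{
        if (nice < 0)
                return RR_BASE_MS * (20 - nice) / 20;   /* nice -20 doubles it */
        return RR_BASE_MS;
}

static unsigned int slice_ms(int nice)
{
        unsigned int rr = rr_interval_ms(nice);
        int user_prio = nice + 20;                      /* TASK_USER_PRIO */

        return rr + (39 - user_prio) * rr;
}

int main(void)
{
        int nice_levels[] = { -20, -10, 0, 10, 19 };

        for (int i = 0; i < 5; i++) {
                int nice = nice_levels[i];
                printf("nice %3d: rr_interval=%2u ms, slice=%3u ms\n",
                       nice, rr_interval_ms(nice), slice_ms(nice));
        }
        return 0;
}

A nice-0 task gets a 120 ms slice made of 6 ms round-robin quanta, while nice 19 gets a single 6 ms quantum.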
563     +/*
564     + * We increase our bonus by sleeping more than the time we ran.
565     + * The ratio of sleep to run gives us the cpu% that we last ran and determines
566     + * the maximum bonus we can acquire.
567     + */
568     +static void fastcall inc_bonus(task_t *p, const unsigned long totalrun,
569     + const unsigned long sleep)
570     +{
571     + unsigned int best_bonus;
572     +
573     + best_bonus = sleep / (totalrun + 1);
574     + if (p->bonus >= best_bonus)
575     + return;
576     +
577     + p->bonus++;
578     + best_bonus = bonus(p);
579     + if (p->bonus > best_bonus)
580     + p->bonus = best_bonus;
581     +}
582     +
583     +static void dec_bonus(task_t *p)
584     +{
585     + if (p->bonus)
586     + p->bonus--;
587     +}
588     +
589     +/*
590     + * sched_interactive - sysctl which allows interactive tasks to have bonus
591     + * raise its priority.
592     + */
593     +int sched_interactive __read_mostly = 1;
594     +
595     +/*
596     + * effective_prio - dynamic priority dependent on bonus.
597     + * The priority normally decreases by one each RR_INTERVAL.
598     + * As the bonus increases the initial priority starts at a higher "stair" or
599     + * priority for longer.
600     + */
601     +static int effective_prio(const task_t *p)
602     +{
603     + int prio;
604     + unsigned int full_slice, used_slice = 0;
605     + unsigned int best_bonus, rr;
606     +
607     + if (rt_task(p))
608     + return p->prio;
609     +
610     + full_slice = slice(p);
611     + if (full_slice > p->slice)
612     + used_slice = full_slice - p->slice;
613     +
614     + best_bonus = bonus(p);
615     + prio = MAX_RT_PRIO + best_bonus;
616     + if (sched_interactive && !sched_compute && p->policy != SCHED_BATCH)
617     + prio -= p->bonus;
618     +
619     + rr = rr_interval(p);
620     + prio += used_slice / rr;
621     + if (prio > MAX_PRIO - 1)
622     + prio = MAX_PRIO - 1;
623     + return prio;
624     +}
625     +
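A user-space model of the staircase itself as implemented by effective_prio() above, for a SCHED_NORMAL task with sched_interactive=1 and sched_compute=0; the usual 2.6 values MAX_RT_PRIO=100 and MAX_PRIO=140 are assumed, and times are in milliseconds:

/* staircase_prio.c - dynamic priority vs. consumed slice (sketch). */
#include <stdio.h>

#define MAX_RT_PRIO 100
#define MAX_PRIO    140

static int effective_prio(int user_prio, unsigned int bonus,
                          unsigned int slice_left, unsigned int full_slice,
                          unsigned int rr)
{
        unsigned int used_slice = 0;
        int prio;

        if (full_slice > slice_left)
                used_slice = full_slice - slice_left;

        prio = MAX_RT_PRIO + user_prio - bonus;     /* bonus starts it higher */
        prio += used_slice / rr;                    /* one step down per rr used */
        if (prio > MAX_PRIO - 1)
                prio = MAX_PRIO - 1;
        return prio;
}

int main(void)
{
        /* nice-0 task (user_prio 20), 120 ms slice, 6 ms rr, bonus 5 */
        unsigned int full = 120, rr = 6;

        for (unsigned int left = full; ; left -= rr) {
                printf("slice left %3u ms -> prio %d\n",
                       left, effective_prio(20, 5, left, full, rr));
                if (left < rr)
                        break;
        }
        return 0;
}

Running it shows the priority stepping down by one every 6 ms of CPU consumed, which is the "staircase" the patch header refers to.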
626     +static inline void continue_slice(task_t *p)
627     +{
628     + unsigned long total_run = NS_TO_JIFFIES(p->totalrun);
629     +
630     + if (total_run >= p->slice) {
631     + p->totalrun -= JIFFIES_TO_NS(p->slice);
632     + dec_bonus(p);
633     + } else {
634     + unsigned int remainder;
635     +
636     + p->slice -= total_run;
637     + remainder = p->slice % rr_interval(p);
638     + if (remainder)
639     + p->time_slice = remainder;
640     }
641     +}
642    
643     - return effective_prio(p);
644     +/*
645     + * recalc_task_prio - this checks for tasks that run ultra short timeslices
646     + * or have just forked a thread/process and make them continue their old
647     + * slice instead of starting a new one at high priority.
648     + */
649     +static inline void recalc_task_prio(task_t *p, const unsigned long long now)
650     +{
651     + unsigned long sleep_time = ns_diff(now, p->timestamp);
652     +
653     + /*
654     + * Add the total for this last scheduled run (p->runtime) to the
655     + * running total so far used (p->totalrun).
656     + */
657     + p->totalrun += p->runtime;
658     +
659     + /*
660     + * If we sleep longer than our running total and have not set the
661     + * PF_NONSLEEP flag we gain a bonus.
662     + */
663     + if (sleep_time >= p->totalrun && !(p->flags & PF_NONSLEEP) &&
664     + !sched_compute) {
665     + inc_bonus(p, p->totalrun, sleep_time);
666     + p->totalrun = 0;
667     + return;
668     + }
669     +
670     + /*
671     + * If we have not set the PF_NONSLEEP flag we elevate priority by the
672     + * amount of time we slept.
673     + */
674     + if (p->flags & PF_NONSLEEP)
675     + p->flags &= ~PF_NONSLEEP;
676     + else
677     + p->totalrun -= sleep_time;
678     +
679     + continue_slice(p);
680     }
681    
682     /*
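A compressed model of the bookkeeping in recalc_task_prio(), inc_bonus() and continue_slice() above; the field names mirror the new task_struct members, sleep and run times are passed in directly as nanoseconds, and user_prio plays the role of bonus(p):

/* recalc_sketch.c - a task that sleeps at least as long as it has run (and
 * was not waiting on in-kernel work, PF_NONSLEEP) earns one bonus step,
 * capped by the sleep/run ratio and its user priority; otherwise the old
 * slice is continued.  This is a sketch, not the kernel code. */
#include <stdio.h>

struct task {
        unsigned int bonus;
        unsigned long long totalrun;    /* ns accumulated over recent runs */
        unsigned long long runtime;     /* ns of the last scheduled run */
        int nonsleep;                   /* stands in for PF_NONSLEEP */
        unsigned int user_prio;         /* bonus(p) == TASK_USER_PRIO(p) */
};

static void recalc(struct task *p, unsigned long long sleep_ns)
{
        p->totalrun += p->runtime;

        if (sleep_ns >= p->totalrun && !p->nonsleep) {
                unsigned int best = sleep_ns / (p->totalrun + 1);   /* inc_bonus() */
                if (p->bonus < best && p->bonus < p->user_prio)
                        p->bonus++;
                p->totalrun = 0;
                return;
        }
        if (p->nonsleep)
                p->nonsleep = 0;
        else
                p->totalrun -= sleep_ns;        /* sleep still offsets runtime */
        /* continue_slice() would now resume the old slice instead of a new one */
}

int main(void)
{
        struct task t = { .bonus = 0, .totalrun = 0, .runtime = 2000000,
                          .nonsleep = 0, .user_prio = 20 };

        for (int i = 0; i < 5; i++) {
                recalc(&t, 10000000);           /* ran 2 ms, slept 10 ms */
                printf("wakeup %d: bonus=%u totalrun=%llu\n",
                       i + 1, t.bonus, t.totalrun);
        }
        return 0;
}

With a 2 ms run / 10 ms sleep pattern the bonus climbs by one per wakeup until it hits the sleep/run ratio, matching the "cpu% that we last ran" comment on inc_bonus().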
683     @@ -820,11 +790,11 @@ static int recalc_task_prio(task_t *p, u
684     * Update all the scheduling statistics stuff. (sleep average
685     * calculation, priority modifiers, etc.)
686     */
687     -static void activate_task(task_t *p, runqueue_t *rq, int local)
688     +static void activate_task(task_t *p, runqueue_t *rq, const int local)
689     {
690     - unsigned long long now;
691     + unsigned long long now = sched_clock();
692     + unsigned long rr = rr_interval(p);
693    
694     - now = sched_clock();
695     #ifdef CONFIG_SMP
696     if (!local) {
697     /* Compensate for drifting sched_clock */
698     @@ -833,45 +803,24 @@ static void activate_task(task_t *p, run
699     + rq->timestamp_last_tick;
700     }
701     #endif
702     -
703     - if (!rt_task(p))
704     - p->prio = recalc_task_prio(p, now);
705     -
706     - /*
707     - * This checks to make sure it's not an uninterruptible task
708     - * that is now waking up.
709     - */
710     - if (!p->activated) {
711     - /*
712     - * Tasks which were woken up by interrupts (ie. hw events)
713     - * are most likely of interactive nature. So we give them
714     - * the credit of extending their sleep time to the period
715     - * of time they spend on the runqueue, waiting for execution
716     - * on a CPU, first time around:
717     - */
718     - if (in_interrupt())
719     - p->activated = 2;
720     - else {
721     - /*
722     - * Normal first-time wakeups get a credit too for
723     - * on-runqueue time, but it will be weighted down:
724     - */
725     - p->activated = 1;
726     - }
727     + p->slice = slice(p);
728     + p->time_slice = p->slice % rr ? : rr;
729     + if (!rt_task(p)) {
730     + recalc_task_prio(p, now);
731     + p->flags &= ~PF_NONSLEEP;
732     + p->prio = effective_prio(p);
733     }
734     p->timestamp = now;
735     -
736     __activate_task(p, rq);
737     }
738    
739     /*
740     * deactivate_task - remove a task from the runqueue.
741     */
742     -static void deactivate_task(struct task_struct *p, runqueue_t *rq)
743     +static void fastcall deactivate_task(task_t *p, runqueue_t *rq)
744     {
745     dec_nr_running(p, rq);
746     - dequeue_task(p, p->array);
747     - p->array = NULL;
748     + dequeue_task(p, rq);
749     }
750    
751     /*
752     @@ -947,7 +896,7 @@ static int migrate_task(task_t *p, int d
753     * If the task is not on a runqueue (and not running), then
754     * it is sufficient to simply update the task's cpu field.
755     */
756     - if (!p->array && !task_running(rq, p)) {
757     + if (!task_queued(p) && !task_running(rq, p)) {
758     set_task_cpu(p, dest_cpu);
759     return 0;
760     }
761     @@ -977,7 +926,7 @@ void wait_task_inactive(task_t *p)
762     repeat:
763     rq = task_rq_lock(p, &flags);
764     /* Must be off runqueue entirely, not preempted. */
765     - if (unlikely(p->array || task_running(rq, p))) {
766     + if (unlikely(task_queued(p) || task_running(rq, p))) {
767     /* If it's preempted, we yield. It could be a while. */
768     preempted = !task_running(rq, p);
769     task_rq_unlock(rq, &flags);
770     @@ -1228,6 +1177,26 @@ static inline int wake_idle(int cpu, tas
771     }
772     #endif
773    
774     +/*
775     + * CACHE_DELAY is the time preemption is delayed in sched_compute mode
776     + * and is set to a nominal 10ms.
777     + */
778     +#define CACHE_DELAY (10 * (HZ) / 1001 + 1)
779     +
780     +/*
781     + * Check to see if p preempts rq->curr and resched if it does. In compute
782     + * mode we do not preempt for at least CACHE_DELAY and set rq->preempted.
783     + */
784     +static void fastcall preempt(const task_t *p, runqueue_t *rq)
785     +{
786     + if (p->prio >= rq->curr->prio)
787     + return;
788     + if (!sched_compute || rq->cache_ticks >= CACHE_DELAY ||
789     + !p->mm || rt_task(p))
790     + resched_task(rq->curr);
791     + rq->preempted = 1;
792     +}
793     +
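The preemption rule above can be summarised as a small decision function; a sketch assuming HZ=1000 (so CACHE_DELAY works out to 10 ticks), with the task_struct and runqueue fields reduced to plain parameters:

/* preempt_sketch.c - in compute mode a higher-priority SCHED_NORMAL
 * userspace waker does not resched the running task until it has had
 * CACHE_DELAY ticks of cache warmth; kernel threads (!p->mm) and RT tasks
 * still preempt immediately. */
#include <stdio.h>

#define HZ          1000
#define CACHE_DELAY (10 * (HZ) / 1001 + 1)      /* nominal 10 ms in ticks */

static int resched_now(int p_prio, int curr_prio, int sched_compute,
                       unsigned int cache_ticks, int p_has_mm, int p_rt)
{
        if (p_prio >= curr_prio)
                return 0;                       /* does not preempt at all */
        if (!sched_compute || cache_ticks >= CACHE_DELAY || !p_has_mm || p_rt)
                return 1;                       /* resched_task(rq->curr) */
        return 0;                               /* deferred; rq->preempted noted */
}

int main(void)
{
        printf("interactive mode, higher prio:     %d\n",
               resched_now(110, 120, 0, 0, 1, 0));
        printf("compute mode, cold cache:          %d\n",
               resched_now(110, 120, 1, 2, 1, 0));
        printf("compute mode, %d ticks of warmth:  %d\n",
               CACHE_DELAY, resched_now(110, 120, 1, CACHE_DELAY, 1, 0));
        printf("compute mode, kernel-thread waker: %d\n",
               resched_now(110, 120, 1, 2, 0, 0));
        return 0;
}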
794     /***
795     * try_to_wake_up - wake up a thread
796     * @p: the to-be-woken-up thread
797     @@ -1259,7 +1228,7 @@ static int try_to_wake_up(task_t *p, uns
798     if (!(old_state & state))
799     goto out;
800    
801     - if (p->array)
802     + if (task_queued(p))
803     goto out_running;
804    
805     cpu = task_cpu(p);
806     @@ -1350,7 +1319,7 @@ out_set_cpu:
807     old_state = p->state;
808     if (!(old_state & state))
809     goto out;
810     - if (p->array)
811     + if (task_queued(p))
812     goto out_running;
813    
814     this_cpu = smp_processor_id();
815     @@ -1359,26 +1328,10 @@ out_set_cpu:
816    
817     out_activate:
818     #endif /* CONFIG_SMP */
819     - if (old_state == TASK_UNINTERRUPTIBLE) {
820     + if (old_state == TASK_UNINTERRUPTIBLE)
821     rq->nr_uninterruptible--;
822     - /*
823     - * Tasks on involuntary sleep don't earn
824     - * sleep_avg beyond just interactive state.
825     - */
826     - p->activated = -1;
827     - }
828    
829     /*
830     - * Tasks that have marked their sleep as noninteractive get
831     - * woken up without updating their sleep average. (i.e. their
832     - * sleep is handled in a priority-neutral manner, no priority
833     - * boost and no penalty.)
834     - */
835     - if (old_state & TASK_NONINTERACTIVE)
836     - __activate_task(p, rq);
837     - else
838     - activate_task(p, rq, cpu == this_cpu);
839     - /*
840     * Sync wakeups (i.e. those types of wakeups where the waker
841     * has indicated that it will leave the CPU in short order)
842     * don't trigger a preemption, if the woken up task will run on
843     @@ -1386,10 +1339,9 @@ out_activate:
844     * the waker guarantees that the freshly woken up task is going
845     * to be considered on this CPU.)
846     */
847     - if (!sync || cpu != this_cpu) {
848     - if (TASK_PREEMPTS_CURR(p, rq))
849     - resched_task(rq->curr);
850     - }
851     + activate_task(p, rq, cpu == this_cpu);
852     + if (!sync || cpu != this_cpu)
853     + preempt(p, rq);
854     success = 1;
855    
856     out_running:
857     @@ -1434,7 +1386,6 @@ void fastcall sched_fork(task_t *p, int
858     */
859     p->state = TASK_RUNNING;
860     INIT_LIST_HEAD(&p->run_list);
861     - p->array = NULL;
862     #ifdef CONFIG_SCHEDSTATS
863     memset(&p->sched_info, 0, sizeof(p->sched_info));
864     #endif
865     @@ -1445,30 +1396,6 @@ void fastcall sched_fork(task_t *p, int
866     /* Want to start with kernel preemption disabled. */
867     task_thread_info(p)->preempt_count = 1;
868     #endif
869     - /*
870     - * Share the timeslice between parent and child, thus the
871     - * total amount of pending timeslices in the system doesn't change,
872     - * resulting in more scheduling fairness.
873     - */
874     - local_irq_disable();
875     - p->time_slice = (current->time_slice + 1) >> 1;
876     - /*
877     - * The remainder of the first timeslice might be recovered by
878     - * the parent if the child exits early enough.
879     - */
880     - p->first_time_slice = 1;
881     - current->time_slice >>= 1;
882     - p->timestamp = sched_clock();
883     - if (unlikely(!current->time_slice)) {
884     - /*
885     - * This case is rare, it happens when the parent has only
886     - * a single jiffy left from its timeslice. Taking the
887     - * runqueue lock is not a problem.
888     - */
889     - current->time_slice = 1;
890     - scheduler_tick();
891     - }
892     - local_irq_enable();
893     put_cpu();
894     }
895    
896     @@ -1491,36 +1418,20 @@ void fastcall wake_up_new_task(task_t *p
897     cpu = task_cpu(p);
898    
899     /*
900     - * We decrease the sleep average of forking parents
901     - * and children as well, to keep max-interactive tasks
902     - * from forking tasks that are max-interactive. The parent
903     - * (current) is done further down, under its lock.
904     + * Forked process gets no bonus to prevent fork bombs.
905     */
906     - p->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(p) *
907     - CHILD_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS);
908     -
909     - p->prio = effective_prio(p);
910     + p->bonus = 0;
911    
912     if (likely(cpu == this_cpu)) {
913     - if (!(clone_flags & CLONE_VM)) {
914     + current->flags |= PF_NONSLEEP;
915     + activate_task(p, rq, 1);
916     + if (!(clone_flags & CLONE_VM))
917     /*
918     * The VM isn't cloned, so we're in a good position to
919     * do child-runs-first in anticipation of an exec. This
920     * usually avoids a lot of COW overhead.
921     */
922     - if (unlikely(!current->array))
923     - __activate_task(p, rq);
924     - else {
925     - p->prio = current->prio;
926     - list_add_tail(&p->run_list, &current->run_list);
927     - p->array = current->array;
928     - p->array->nr_active++;
929     - inc_nr_running(p, rq);
930     - }
931     set_need_resched();
932     - } else
933     - /* Run child last */
934     - __activate_task(p, rq);
935     /*
936     * We skip the following code due to cpu == this_cpu
937     *
938     @@ -1537,53 +1448,20 @@ void fastcall wake_up_new_task(task_t *p
939     */
940     p->timestamp = (p->timestamp - this_rq->timestamp_last_tick)
941     + rq->timestamp_last_tick;
942     - __activate_task(p, rq);
943     - if (TASK_PREEMPTS_CURR(p, rq))
944     - resched_task(rq->curr);
945     + activate_task(p, rq, 0);
946     + preempt(p, rq);
947    
948     /*
949     * Parent and child are on different CPUs, now get the
950     - * parent runqueue to update the parent's ->sleep_avg:
951     + * parent runqueue to update the parent's ->flags:
952     */
953     task_rq_unlock(rq, &flags);
954     this_rq = task_rq_lock(current, &flags);
955     + current->flags |= PF_NONSLEEP;
956     }
957     - current->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(current) *
958     - PARENT_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS);
959     task_rq_unlock(this_rq, &flags);
960     }
961    
962     -/*
963     - * Potentially available exiting-child timeslices are
964     - * retrieved here - this way the parent does not get
965     - * penalized for creating too many threads.
966     - *
967     - * (this cannot be used to 'generate' timeslices
968     - * artificially, because any timeslice recovered here
969     - * was given away by the parent in the first place.)
970     - */
971     -void fastcall sched_exit(task_t *p)
972     -{
973     - unsigned long flags;
974     - runqueue_t *rq;
975     -
976     - /*
977     - * If the child was a (relative-) CPU hog then decrease
978     - * the sleep_avg of the parent as well.
979     - */
980     - rq = task_rq_lock(p->parent, &flags);
981     - if (p->first_time_slice && task_cpu(p) == task_cpu(p->parent)) {
982     - p->parent->time_slice += p->time_slice;
983     - if (unlikely(p->parent->time_slice > task_timeslice(p)))
984     - p->parent->time_slice = task_timeslice(p);
985     - }
986     - if (p->sleep_avg < p->parent->sleep_avg)
987     - p->parent->sleep_avg = p->parent->sleep_avg /
988     - (EXIT_WEIGHT + 1) * EXIT_WEIGHT + p->sleep_avg /
989     - (EXIT_WEIGHT + 1);
990     - task_rq_unlock(rq, &flags);
991     -}
992     -
993     /**
994     * prepare_task_switch - prepare to switch tasks
995     * @rq: the runqueue preparing to switch
996     @@ -1855,32 +1733,28 @@ void sched_exec(void)
997     * pull_task - move a task from a remote runqueue to the local runqueue.
998     * Both runqueues must be locked.
999     */
1000     -static
1001     -void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p,
1002     - runqueue_t *this_rq, prio_array_t *this_array, int this_cpu)
1003     +static void pull_task(runqueue_t *src_rq, task_t *p, runqueue_t *this_rq,
1004     + const int this_cpu)
1005     {
1006     - dequeue_task(p, src_array);
1007     + dequeue_task(p, src_rq);
1008     dec_nr_running(p, src_rq);
1009     set_task_cpu(p, this_cpu);
1010     inc_nr_running(p, this_rq);
1011     - enqueue_task(p, this_array);
1012     + enqueue_task(p, this_rq);
1013     p->timestamp = (p->timestamp - src_rq->timestamp_last_tick)
1014     + this_rq->timestamp_last_tick;
1015     /*
1016     * Note that idle threads have a prio of MAX_PRIO, for this test
1017     * to be always true for them.
1018     */
1019     - if (TASK_PREEMPTS_CURR(p, this_rq))
1020     - resched_task(this_rq->curr);
1021     + preempt(p, this_rq);
1022     }
1023    
1024     /*
1025     * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
1026     */
1027     -static
1028     -int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu,
1029     - struct sched_domain *sd, enum idle_type idle,
1030     - int *all_pinned)
1031     +static int can_migrate_task(task_t *p, runqueue_t *rq, const int this_cpu,
1032     + struct sched_domain *sd, const enum idle_type idle, int *all_pinned)
1033     {
1034     /*
1035     * We do not migrate tasks that are:
1036     @@ -1921,7 +1795,6 @@ static int move_tasks(runqueue_t *this_r
1037     struct sched_domain *sd, enum idle_type idle,
1038     int *all_pinned)
1039     {
1040     - prio_array_t *array, *dst_array;
1041     struct list_head *head, *curr;
1042     int idx, pulled = 0, pinned = 0;
1043     long rem_load_move;
1044     @@ -1933,38 +1806,17 @@ static int move_tasks(runqueue_t *this_r
1045     rem_load_move = max_load_move;
1046     pinned = 1;
1047    
1048     - /*
1049     - * We first consider expired tasks. Those will likely not be
1050     - * executed in the near future, and they are most likely to
1051     - * be cache-cold, thus switching CPUs has the least effect
1052     - * on them.
1053     - */
1054     - if (busiest->expired->nr_active) {
1055     - array = busiest->expired;
1056     - dst_array = this_rq->expired;
1057     - } else {
1058     - array = busiest->active;
1059     - dst_array = this_rq->active;
1060     - }
1061     -
1062     -new_array:
1063     /* Start searching at priority 0: */
1064     idx = 0;
1065     skip_bitmap:
1066     if (!idx)
1067     - idx = sched_find_first_bit(array->bitmap);
1068     + idx = sched_find_first_bit(busiest->bitmap);
1069     else
1070     - idx = find_next_bit(array->bitmap, MAX_PRIO, idx);
1071     - if (idx >= MAX_PRIO) {
1072     - if (array == busiest->expired && busiest->active->nr_active) {
1073     - array = busiest->active;
1074     - dst_array = this_rq->active;
1075     - goto new_array;
1076     - }
1077     + idx = find_next_bit(busiest->bitmap, MAX_PRIO, idx);
1078     + if (idx >= MAX_PRIO)
1079     goto out;
1080     - }
1081    
1082     - head = array->queue + idx;
1083     + head = busiest->queue + idx;
1084     curr = head->prev;
1085     skip_queue:
1086     tmp = list_entry(curr, task_t, run_list);
1087     @@ -1984,7 +1836,7 @@ skip_queue:
1088     schedstat_inc(sd, lb_hot_gained[idle]);
1089     #endif
1090    
1091     - pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu);
1092     + pull_task(busiest, tmp, this_rq, this_cpu);
1093     pulled++;
1094     rem_load_move -= tmp->load_weight;
1095    
1096     @@ -2507,15 +2359,13 @@ static void rebalance_tick(int this_cpu,
1097     continue;
1098    
1099     interval = sd->balance_interval;
1100     - if (idle != SCHED_IDLE)
1101     - interval *= sd->busy_factor;
1102    
1103     /* scale ms to jiffies */
1104     interval = msecs_to_jiffies(interval);
1105     if (unlikely(!interval))
1106     interval = 1;
1107    
1108     - if (j - sd->last_balance >= interval) {
1109     + if (idle != SCHED_IDLE || j - sd->last_balance >= interval) {
1110     if (load_balance(this_cpu, this_rq, sd, idle)) {
1111     /*
1112     * We've pulled tasks over so either we're no
1113     @@ -2589,22 +2439,6 @@ unsigned long long current_sched_time(co
1114     }
1115    
1116     /*
1117     - * We place interactive tasks back into the active array, if possible.
1118     - *
1119     - * To guarantee that this does not starve expired tasks we ignore the
1120     - * interactivity of a task if the first expired task had to wait more
1121     - * than a 'reasonable' amount of time. This deadline timeout is
1122     - * load-dependent, as the frequency of array switched decreases with
1123     - * increasing number of running tasks. We also ignore the interactivity
1124     - * if a better static_prio task has expired:
1125     - */
1126     -#define EXPIRED_STARVING(rq) \
1127     - ((STARVATION_LIMIT && ((rq)->expired_timestamp && \
1128     - (jiffies - (rq)->expired_timestamp >= \
1129     - STARVATION_LIMIT * ((rq)->nr_running) + 1))) || \
1130     - ((rq)->curr->static_prio > (rq)->best_expired_prio))
1131     -
1132     -/*
1133     * Account user cpu time to a process.
1134     * @p: the process that the cpu time gets accounted to
1135     * @hardirq_offset: the offset to subtract from hardirq_count()
1136     @@ -2652,6 +2486,7 @@ void account_system_time(struct task_str
1137     cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
1138     else
1139     cpustat->idle = cputime64_add(cpustat->idle, tmp);
1140     +
1141     /* Account for system time used */
1142     acct_update_integrals(p);
1143     }
1144     @@ -2677,18 +2512,25 @@ void account_steal_time(struct task_stru
1145     cpustat->steal = cputime64_add(cpustat->steal, tmp);
1146     }
1147    
1148     +static void time_slice_expired(task_t *p, runqueue_t *rq)
1149     +{
1150     + set_tsk_need_resched(p);
1151     + dequeue_task(p, rq);
1152     + p->prio = effective_prio(p);
1153     + p->time_slice = rr_interval(p);
1154     + enqueue_task(p, rq);
1155     +}
1156     +
1157     /*
1158     * This function gets called by the timer code, with HZ frequency.
1159     * We call it with interrupts disabled.
1160     - *
1161     - * It also gets called by the fork code, when changing the parent's
1162     - * timeslices.
1163     */
1164     void scheduler_tick(void)
1165     {
1166     int cpu = smp_processor_id();
1167     runqueue_t *rq = this_rq();
1168     task_t *p = current;
1169     + unsigned long debit, expired_balance = rq->nr_running;
1170     unsigned long long now = sched_clock();
1171    
1172     update_cpu_clock(p, rq, now);
1173     @@ -2703,78 +2545,53 @@ void scheduler_tick(void)
1174     }
1175    
1176     /* Task might have expired already, but not scheduled off yet */
1177     - if (p->array != rq->active) {
1178     + if (unlikely(!task_queued(p))) {
1179     set_tsk_need_resched(p);
1180     goto out;
1181     }
1182     - spin_lock(&rq->lock);
1183     /*
1184     - * The task was running during this tick - update the
1185     - * time slice counter. Note: we do not update a thread's
1186     - * priority until it either goes to sleep or uses up its
1187     - * timeslice. This makes it possible for interactive tasks
1188     - * to use up their timeslices at their highest priority levels.
1189     + * SCHED_FIFO tasks never run out of timeslice.
1190     */
1191     - if (rt_task(p)) {
1192     - /*
1193     - * RR tasks need a special form of timeslice management.
1194     - * FIFO tasks have no timeslices.
1195     - */
1196     - if ((p->policy == SCHED_RR) && !--p->time_slice) {
1197     - p->time_slice = task_timeslice(p);
1198     - p->first_time_slice = 0;
1199     - set_tsk_need_resched(p);
1200     + if (unlikely(p->policy == SCHED_FIFO)) {
1201     + expired_balance = 0;
1202     + goto out;
1203     + }
1204    
1205     - /* put it at the end of the queue: */
1206     - requeue_task(p, rq->active);
1207     - }
1208     + spin_lock(&rq->lock);
1209     + debit = ns_diff(rq->timestamp_last_tick, p->timestamp);
1210     + p->ns_debit += debit;
1211     + if (p->ns_debit < NSJIFFY)
1212     + goto out_unlock;
1213     + p->ns_debit %= NSJIFFY;
1214     + /*
1215     + * Tasks lose bonus each time they use up a full slice().
1216     + */
1217     + if (!--p->slice) {
1218     + dec_bonus(p);
1219     + p->slice = slice(p);
1220     + time_slice_expired(p, rq);
1221     + p->totalrun = 0;
1222     goto out_unlock;
1223     }
1224     + /*
1225     + * Tasks that run out of time_slice but still have slice left get
1226     + * requeued with a lower priority && RR_INTERVAL time_slice.
1227     + */
1228     if (!--p->time_slice) {
1229     - dequeue_task(p, rq->active);
1230     + time_slice_expired(p, rq);
1231     + goto out_unlock;
1232     + }
1233     + rq->cache_ticks++;
1234     + if (rq->preempted && rq->cache_ticks >= CACHE_DELAY) {
1235     set_tsk_need_resched(p);
1236     - p->prio = effective_prio(p);
1237     - p->time_slice = task_timeslice(p);
1238     - p->first_time_slice = 0;
1239     -
1240     - if (!rq->expired_timestamp)
1241     - rq->expired_timestamp = jiffies;
1242     - if (!TASK_INTERACTIVE(p) || EXPIRED_STARVING(rq)) {
1243     - enqueue_task(p, rq->expired);
1244     - if (p->static_prio < rq->best_expired_prio)
1245     - rq->best_expired_prio = p->static_prio;
1246     - } else
1247     - enqueue_task(p, rq->active);
1248     - } else {
1249     - /*
1250     - * Prevent a too long timeslice allowing a task to monopolize
1251     - * the CPU. We do this by splitting up the timeslice into
1252     - * smaller pieces.
1253     - *
1254     - * Note: this does not mean the task's timeslices expire or
1255     - * get lost in any way, they just might be preempted by
1256     - * another task of equal priority. (one with higher
1257     - * priority would have preempted this task already.) We
1258     - * requeue this task to the end of the list on this priority
1259     - * level, which is in essence a round-robin of tasks with
1260     - * equal priority.
1261     - *
1262     - * This only applies to tasks in the interactive
1263     - * delta range with at least TIMESLICE_GRANULARITY to requeue.
1264     - */
1265     - if (TASK_INTERACTIVE(p) && !((task_timeslice(p) -
1266     - p->time_slice) % TIMESLICE_GRANULARITY(p)) &&
1267     - (p->time_slice >= TIMESLICE_GRANULARITY(p)) &&
1268     - (p->array == rq->active)) {
1269     -
1270     - requeue_task(p, rq->active);
1271     - set_tsk_need_resched(p);
1272     - }
1273     + goto out_unlock;
1274     }
1275     + expired_balance = 0;
1276     out_unlock:
1277     spin_unlock(&rq->lock);
1278     out:
1279     - rebalance_tick(cpu, rq, NOT_IDLE);
1280     + if (expired_balance > 1)
1281     + rebalance_tick(cpu, rq, NOT_IDLE);
1282     }
1283    
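A sketch of the per-tick accounting the reworked scheduler_tick() above performs: nanosecond debits accumulate in ns_debit and are only charged against time_slice and slice once a full jiffy is owed, so partial ticks are not lost. Values assume HZ=1000 and a nice-0 task (slice 120 jiffies, rr_interval 6):

/* tick_debit_sketch.c - simplified model of the slice/time_slice charging. */
#include <stdio.h>

#define HZ      1000
#define NSJIFFY (1000000000 / HZ)

struct task {
        unsigned long ns_debit;
        unsigned int slice;             /* jiffies left before a bonus is lost */
        unsigned int time_slice;        /* jiffies left at the current priority */
};

static void tick(struct task *p, unsigned long ran_ns, unsigned int rr)
{
        p->ns_debit += ran_ns;
        if (p->ns_debit < NSJIFFY)
                return;                 /* not a full jiffy owed yet */
        p->ns_debit %= NSJIFFY;

        if (!--p->slice) {
                /* dec_bonus(), refill slice, time_slice_expired() in the patch */
                printf("  slice exhausted: bonus decremented, full requeue\n");
                return;
        }
        if (!--p->time_slice) {
                /* time_slice_expired(): requeue one priority step lower */
                p->time_slice = rr;
                printf("  rr quota used: requeued one priority lower\n");
        }
}

int main(void)
{
        struct task t = { .ns_debit = 0, .slice = 120, .time_slice = 6 };

        /* the task only runs about 0.6 ms of each 1 ms tick */
        for (int i = 1; i <= 20; i++) {
                tick(&t, 600000, 6);
                printf("tick %2d: debit=%lu slice=%u time_slice=%u\n",
                       i, t.ns_debit, t.slice, t.time_slice);
        }
        return 0;
}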
1284     #ifdef CONFIG_SCHED_SMT
1285     @@ -2831,19 +2648,19 @@ static void wake_sleeping_dependent(int
1286    
1287     /*
1288     * number of 'lost' timeslices this task wont be able to fully
1289     - * utilize, if another task runs on a sibling. This models the
1290     + * utilise, if another task runs on a sibling. This models the
1291     * slowdown effect of other tasks running on siblings:
1292     */
1293     -static inline unsigned long smt_slice(task_t *p, struct sched_domain *sd)
1294     +static inline unsigned long smt_slice(const task_t *p,
1295     + const struct sched_domain *sd)
1296     {
1297     - return p->time_slice * (100 - sd->per_cpu_gain) / 100;
1298     + return p->slice * (100 - sd->per_cpu_gain) / 100;
1299     }
1300    
1301     static int dependent_sleeper(int this_cpu, runqueue_t *this_rq)
1302     {
1303     struct sched_domain *tmp, *sd = NULL;
1304     cpumask_t sibling_map;
1305     - prio_array_t *array;
1306     int ret = 0, i;
1307     task_t *p;
1308    
1309     @@ -2870,12 +2687,8 @@ static int dependent_sleeper(int this_cp
1310     */
1311     if (!this_rq->nr_running)
1312     goto out_unlock;
1313     - array = this_rq->active;
1314     - if (!array->nr_active)
1315     - array = this_rq->expired;
1316     - BUG_ON(!array->nr_active);
1317    
1318     - p = list_entry(array->queue[sched_find_first_bit(array->bitmap)].next,
1319     + p = list_entry(this_rq->queue[sched_find_first_bit(this_rq->bitmap)].next,
1320     task_t, run_list);
1321    
1322     for_each_cpu_mask(i, sibling_map) {
1323     @@ -2905,7 +2718,7 @@ static int dependent_sleeper(int this_cp
1324     } else
1325     if (smt_curr->static_prio < p->static_prio &&
1326     !TASK_PREEMPTS_CURR(p, smt_rq) &&
1327     - smt_slice(smt_curr, sd) > task_timeslice(p))
1328     + smt_slice(smt_curr, sd) > slice(p))
1329     ret = 1;
1330    
1331     check_smt_task:
1332     @@ -2928,7 +2741,7 @@ check_smt_task:
1333     resched_task(smt_curr);
1334     } else {
1335     if (TASK_PREEMPTS_CURR(p, smt_rq) &&
1336     - smt_slice(p, sd) > task_timeslice(smt_curr))
1337     + smt_slice(p, sd) > slice(smt_curr))
1338     resched_task(smt_curr);
1339     else
1340     wakeup_busy_runqueue(smt_rq);
1341     @@ -2990,11 +2803,10 @@ asmlinkage void __sched schedule(void)
1342     long *switch_count;
1343     task_t *prev, *next;
1344     runqueue_t *rq;
1345     - prio_array_t *array;
1346     struct list_head *queue;
1347     unsigned long long now;
1348     - unsigned long run_time;
1349     - int cpu, idx, new_prio;
1350     + unsigned long debit;
1351     + int cpu, idx;
1352    
1353     /*
1354     * Test if we are atomic. Since do_exit() needs to call into
1355     @@ -3029,20 +2841,11 @@ need_resched_nonpreemptible:
1356    
1357     schedstat_inc(rq, sched_cnt);
1358     now = sched_clock();
1359     - if (likely((long long)(now - prev->timestamp) < NS_MAX_SLEEP_AVG)) {
1360     - run_time = now - prev->timestamp;
1361     - if (unlikely((long long)(now - prev->timestamp) < 0))
1362     - run_time = 0;
1363     - } else
1364     - run_time = NS_MAX_SLEEP_AVG;
1365     -
1366     - /*
1367     - * Tasks charged proportionately less run_time at high sleep_avg to
1368     - * delay them losing their interactive status
1369     - */
1370     - run_time /= (CURRENT_BONUS(prev) ? : 1);
1371    
1372     spin_lock_irq(&rq->lock);
1373     + prev->runtime = ns_diff(now, prev->timestamp);
1374     + debit = ns_diff(now, rq->timestamp_last_tick) % NSJIFFY;
1375     + prev->ns_debit += debit;
1376    
1377     if (unlikely(prev->flags & PF_DEAD))
1378     prev->state = EXIT_DEAD;
1379     @@ -3054,8 +2857,10 @@ need_resched_nonpreemptible:
1380     unlikely(signal_pending(prev))))
1381     prev->state = TASK_RUNNING;
1382     else {
1383     - if (prev->state == TASK_UNINTERRUPTIBLE)
1384     + if (prev->state == TASK_UNINTERRUPTIBLE) {
1385     + prev->flags |= PF_NONSLEEP;
1386     rq->nr_uninterruptible++;
1387     + }
1388     deactivate_task(prev, rq);
1389     }
1390     }
1391     @@ -3066,7 +2871,6 @@ go_idle:
1392     idle_balance(cpu, rq);
1393     if (!rq->nr_running) {
1394     next = rq->idle;
1395     - rq->expired_timestamp = 0;
1396     wake_sleeping_dependent(cpu, rq);
1397     /*
1398     * wake_sleeping_dependent() might have released
1399     @@ -3090,45 +2894,15 @@ go_idle:
1400     goto go_idle;
1401     }
1402    
1403     - array = rq->active;
1404     - if (unlikely(!array->nr_active)) {
1405     - /*
1406     - * Switch the active and expired arrays.
1407     - */
1408     - schedstat_inc(rq, sched_switch);
1409     - rq->active = rq->expired;
1410     - rq->expired = array;
1411     - array = rq->active;
1412     - rq->expired_timestamp = 0;
1413     - rq->best_expired_prio = MAX_PRIO;
1414     - }
1415     -
1416     - idx = sched_find_first_bit(array->bitmap);
1417     - queue = array->queue + idx;
1418     + idx = sched_find_first_bit(rq->bitmap);
1419     + queue = rq->queue + idx;
1420     next = list_entry(queue->next, task_t, run_list);
1421    
1422     - if (!rt_task(next) && next->activated > 0) {
1423     - unsigned long long delta = now - next->timestamp;
1424     - if (unlikely((long long)(now - next->timestamp) < 0))
1425     - delta = 0;
1426     -
1427     - if (next->activated == 1)
1428     - delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128;
1429     -
1430     - array = next->array;
1431     - new_prio = recalc_task_prio(next, next->timestamp + delta);
1432     -
1433     - if (unlikely(next->prio != new_prio)) {
1434     - dequeue_task(next, array);
1435     - next->prio = new_prio;
1436     - enqueue_task(next, array);
1437     - } else
1438     - requeue_task(next, array);
1439     - }
1440     - next->activated = 0;
1441     switch_tasks:
1442     if (next == rq->idle)
1443     schedstat_inc(rq, sched_goidle);
1444     + prev->timestamp = now;
1445     +
1446     prefetch(next);
1447     prefetch_stack(next);
1448     clear_tsk_need_resched(prev);
1449     @@ -3136,13 +2910,10 @@ switch_tasks:
1450    
1451     update_cpu_clock(prev, rq, now);
1452    
1453     - prev->sleep_avg -= run_time;
1454     - if ((long)prev->sleep_avg <= 0)
1455     - prev->sleep_avg = 0;
1456     - prev->timestamp = prev->last_ran = now;
1457     -
1458     sched_info_switch(prev, next);
1459     if (likely(prev != next)) {
1460     + rq->preempted = 0;
1461     + rq->cache_ticks = 0;
1462     next->timestamp = now;
1463     rq->nr_switches++;
1464     rq->curr = next;
1465     @@ -3572,9 +3343,8 @@ EXPORT_SYMBOL(sleep_on_timeout);
1466     void set_user_nice(task_t *p, long nice)
1467     {
1468     unsigned long flags;
1469     - prio_array_t *array;
1470     runqueue_t *rq;
1471     - int old_prio, new_prio, delta;
1472     + int queued, old_prio, new_prio, delta;
1473    
1474     if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
1475     return;
1476     @@ -3593,9 +3363,8 @@ void set_user_nice(task_t *p, long nice)
1477     p->static_prio = NICE_TO_PRIO(nice);
1478     goto out_unlock;
1479     }
1480     - array = p->array;
1481     - if (array) {
1482     - dequeue_task(p, array);
1483     + if ((queued = task_queued(p))) {
1484     + dequeue_task(p, rq);
1485     dec_raw_weighted_load(rq, p);
1486     }
1487    
1488     @@ -3605,9 +3374,11 @@ void set_user_nice(task_t *p, long nice)
1489     p->static_prio = NICE_TO_PRIO(nice);
1490     set_load_weight(p);
1491     p->prio += delta;
1492     + if (p->bonus > bonus(p))
1493     + p->bonus = bonus(p);
1494    
1495     - if (array) {
1496     - enqueue_task(p, array);
1497     + if (queued) {
1498     + enqueue_task(p, rq);
1499     inc_raw_weighted_load(rq, p);
1500     /*
1501     * If the task increased its priority or is running and
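The dequeue/enqueue pair above is the pattern this patch uses everywhere a queued task changes priority (set_user_nice() here, sys_sched_yield() and normalize_rt_tasks() below): take the task off its current priority list, update ->prio, then put it back. A condensed sketch using the rq-based helpers visible in these hunks, assuming enqueue_task() also maintains rq->bitmap as the schedule() lookup implies:

    /* Sketch: move a task to a new priority list on its runqueue. */
    static void change_queued_prio(task_t *p, runqueue_t *rq, int new_prio)
    {
            if (!task_queued(p)) {          /* not on a runqueue: just record it */
                    p->prio = new_prio;
                    return;
            }
            dequeue_task(p, rq);            /* leave rq->queue[p->prio] */
            p->prio = new_prio;
            enqueue_task(p, rq);            /* join rq->queue[new_prio] */
    }

The added p->bonus clamp above simply caps any already-earned bonus at whatever bonus(p) now reports for the task.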
1502     @@ -3731,19 +3502,13 @@ static inline task_t *find_process_by_pi
1503     /* Actually do priority change: must hold rq lock. */
1504     static void __setscheduler(struct task_struct *p, int policy, int prio)
1505     {
1506     - BUG_ON(p->array);
1507     + BUG_ON(task_queued(p));
1508     p->policy = policy;
1509     p->rt_priority = prio;
1510     if (policy != SCHED_NORMAL && policy != SCHED_BATCH) {
1511     p->prio = MAX_RT_PRIO-1 - p->rt_priority;
1512     - } else {
1513     + } else
1514     p->prio = p->static_prio;
1515     - /*
1516     - * SCHED_BATCH tasks are treated as perpetual CPU hogs:
1517     - */
1518     - if (policy == SCHED_BATCH)
1519     - p->sleep_avg = 0;
1520     - }
1521     set_load_weight(p);
1522     }
1523    
1524     @@ -3758,8 +3523,7 @@ int sched_setscheduler(struct task_struc
1525     struct sched_param *param)
1526     {
1527     int retval;
1528     - int oldprio, oldpolicy = -1;
1529     - prio_array_t *array;
1530     + int queued, oldprio, oldpolicy = -1;
1531     unsigned long flags;
1532     runqueue_t *rq;
1533    
1534     @@ -3821,12 +3585,11 @@ recheck:
1535     task_rq_unlock(rq, &flags);
1536     goto recheck;
1537     }
1538     - array = p->array;
1539     - if (array)
1540     + if ((queued = task_queued(p)))
1541     deactivate_task(p, rq);
1542     oldprio = p->prio;
1543     __setscheduler(p, policy, param->sched_priority);
1544     - if (array) {
1545     + if (queued) {
1546     __activate_task(p, rq);
1547     /*
1548     * Reschedule if we are currently running on this runqueue and
1549     @@ -3836,8 +3599,8 @@ recheck:
1550     if (task_running(rq, p)) {
1551     if (p->prio > oldprio)
1552     resched_task(rq->curr);
1553     - } else if (TASK_PREEMPTS_CURR(p, rq))
1554     - resched_task(rq->curr);
1555     + } else
1556     + preempt(p, rq);
1557     }
1558     task_rq_unlock(rq, &flags);
1559     return 0;
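preempt(p, rq) here, and again in __migrate_task() below, replaces the open-coded TASK_PREEMPTS_CURR()/resched_task() pair. Its definition is outside this excerpt; the minimal form sketched below is only an assumption about what such a helper does (the real one may also weigh time slices or cache state):

    /* Sketch (assumed behaviour): ask for a reschedule if the newly runnable
     * task should displace whatever is currently running on this runqueue. */
    static void preempt_sketch(task_t *p, runqueue_t *rq)
    {
            if (p->prio < rq->curr->prio)   /* lower value = higher priority */
                    resched_task(rq->curr);
    }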
1560     @@ -4094,43 +3857,27 @@ asmlinkage long sys_sched_getaffinity(pi
1561    
1562     /**
1563     * sys_sched_yield - yield the current processor to other threads.
1564     - *
1565     - * this function yields the current CPU by moving the calling thread
1566     - * to the expired array. If there are no other threads running on this
1567     - * CPU then this function will return.
1568     + * This function yields the current CPU by dropping the priority of the
1569     + * current task to the lowest SCHED_NORMAL priority; RT tasks are requeued.
1570     */
1571     asmlinkage long sys_sched_yield(void)
1572     {
1573     + int newprio;
1574     runqueue_t *rq = this_rq_lock();
1575     - prio_array_t *array = current->array;
1576     - prio_array_t *target = rq->expired;
1577    
1578     + newprio = current->prio;
1579     schedstat_inc(rq, yld_cnt);
1580     - /*
1581     - * We implement yielding by moving the task into the expired
1582     - * queue.
1583     - *
1584     - * (special rule: RT tasks will just roundrobin in the active
1585     - * array.)
1586     - */
1587     - if (rt_task(current))
1588     - target = rq->active;
1589     -
1590     - if (array->nr_active == 1) {
1591     - schedstat_inc(rq, yld_act_empty);
1592     - if (!rq->expired->nr_active)
1593     - schedstat_inc(rq, yld_both_empty);
1594     - } else if (!rq->expired->nr_active)
1595     - schedstat_inc(rq, yld_exp_empty);
1596     -
1597     - if (array != target) {
1598     - dequeue_task(current, array);
1599     - enqueue_task(current, target);
1600     + current->slice = slice(current);
1601     + current->time_slice = rr_interval(current);
1602     + if (likely(!rt_task(current)))
1603     + newprio = MAX_PRIO - 1;
1604     +
1605     + if (newprio != current->prio) {
1606     + dequeue_task(current, rq);
1607     + current->prio = newprio;
1608     + enqueue_task(current, rq);
1609     } else
1610     - /*
1611     - * requeue_task is cheaper so perform that if possible.
1612     - */
1613     - requeue_task(current, array);
1614     + requeue_task(current, rq);
1615    
1616     /*
1617     * Since we are going to call schedule() anyway, there's
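With the arrays gone, yielding no longer means moving to the expired queue: the caller gets a fresh slice and rr_interval, and a non-RT task drops to MAX_PRIO - 1, the lowest SCHED_NORMAL priority, so every other runnable task on the CPU runs first; an RT task keeps its priority and is simply requeued behind its peers. From userspace the interface is unchanged, e.g. a polling loop that yields between checks:

    /* Userspace view: each sched_yield() pushes the caller behind all other
     * runnable SCHED_NORMAL tasks on its CPU until they have had a turn. */
    #include <sched.h>

    static void poll_with_yield(volatile int *flag)
    {
            while (!*flag)
                    sched_yield();
    }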
1618     @@ -4339,7 +4086,7 @@ long sys_sched_rr_get_interval(pid_t pid
1619     goto out_unlock;
1620    
1621     jiffies_to_timespec(p->policy & SCHED_FIFO ?
1622     - 0 : task_timeslice(p), &t);
1623     + 0 : slice(p), &t);
1624     read_unlock(&tasklist_lock);
1625     retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
1626     out_nounlock:
1627     @@ -4462,8 +4209,6 @@ void __devinit init_idle(task_t *idle, i
1628     unsigned long flags;
1629    
1630     idle->timestamp = sched_clock();
1631     - idle->sleep_avg = 0;
1632     - idle->array = NULL;
1633     idle->prio = MAX_PRIO;
1634     idle->state = TASK_RUNNING;
1635     idle->cpus_allowed = cpumask_of_cpu(cpu);
1636     @@ -4580,7 +4325,7 @@ static void __migrate_task(struct task_s
1637     goto out;
1638    
1639     set_task_cpu(p, dest_cpu);
1640     - if (p->array) {
1641     + if (task_queued(p)) {
1642     /*
1643     * Sync timestamp with rq_dest's before activating.
1644     * The same thing could be achieved by doing this step
1645     @@ -4591,8 +4336,7 @@ static void __migrate_task(struct task_s
1646     + rq_dest->timestamp_last_tick;
1647     deactivate_task(p, rq_src);
1648     activate_task(p, rq_dest, 0);
1649     - if (TASK_PREEMPTS_CURR(p, rq_dest))
1650     - resched_task(rq_dest->curr);
1651     + preempt(p, rq_dest);
1652     }
1653    
1654     out:
1655     @@ -4806,7 +4550,7 @@ static void migrate_dead_tasks(unsigned
1656    
1657     for (arr = 0; arr < 2; arr++) {
1658     for (i = 0; i < MAX_PRIO; i++) {
1659     - struct list_head *list = &rq->arrays[arr].queue[i];
1660     + struct list_head *list = &rq->queue[i];
1661     while (!list_empty(list))
1662     migrate_dead(dead_cpu,
1663     list_entry(list->next, task_t,
1664     @@ -6148,17 +5892,15 @@ int in_sched_functions(unsigned long add
1665     void __init sched_init(void)
1666     {
1667     runqueue_t *rq;
1668     - int i, j, k;
1669     + int i, j;
1670    
1671     for_each_cpu(i) {
1672     - prio_array_t *array;
1673    
1674     rq = cpu_rq(i);
1675     spin_lock_init(&rq->lock);
1676     rq->nr_running = 0;
1677     - rq->active = rq->arrays;
1678     - rq->expired = rq->arrays + 1;
1679     - rq->best_expired_prio = MAX_PRIO;
1680     + rq->cache_ticks = 0;
1681     + rq->preempted = 0;
1682    
1683     #ifdef CONFIG_SMP
1684     rq->sd = NULL;
1685     @@ -6170,16 +5912,13 @@ void __init sched_init(void)
1686     INIT_LIST_HEAD(&rq->migration_queue);
1687     #endif
1688     atomic_set(&rq->nr_iowait, 0);
1689     -
1690     - for (j = 0; j < 2; j++) {
1691     - array = rq->arrays + j;
1692     - for (k = 0; k < MAX_PRIO; k++) {
1693     - INIT_LIST_HEAD(array->queue + k);
1694     - __clear_bit(k, array->bitmap);
1695     - }
1696     - // delimiter for bitsearch
1697     - __set_bit(MAX_PRIO, array->bitmap);
1698     - }
1699     + for (j = 0; j < MAX_PRIO; j++)
1700     + INIT_LIST_HEAD(&rq->queue[j]);
1701     + memset(rq->bitmap, 0, BITS_TO_LONGS(MAX_PRIO)*sizeof(long));
1702     + /*
1703     + * delimiter for bitsearch
1704     + */
1705     + __set_bit(MAX_PRIO, rq->bitmap);
1706     }
1707    
1708     set_load_weight(&init_task);
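Per-CPU setup now builds a single priority array: one list head per priority, one bitmap zeroed with memset(), and bit MAX_PRIO set permanently so the bitmap search always terminates. The same initialisation, written as a standalone helper (not a function the patch defines):

    /* Sketch: initialise one priority array, i.e. MAX_PRIO list heads plus a
     * bitmap whose top bit acts as the bitsearch delimiter. */
    static void init_prio_array(struct list_head *queue, unsigned long *bitmap)
    {
            int j;

            for (j = 0; j < MAX_PRIO; j++)
                    INIT_LIST_HEAD(&queue[j]);
            bitmap_zero(bitmap, MAX_PRIO + 1);
            __set_bit(MAX_PRIO, bitmap);    /* never cleared by dequeue */
    }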
1709     @@ -6224,9 +5963,9 @@ EXPORT_SYMBOL(__might_sleep);
1710     void normalize_rt_tasks(void)
1711     {
1712     struct task_struct *p;
1713     - prio_array_t *array;
1714     unsigned long flags;
1715     runqueue_t *rq;
1716     + int queued;
1717    
1718     read_lock_irq(&tasklist_lock);
1719     for_each_process (p) {
1720     @@ -6235,11 +5974,10 @@ void normalize_rt_tasks(void)
1721    
1722     rq = task_rq_lock(p, &flags);
1723    
1724     - array = p->array;
1725     - if (array)
1726     + if ((queued = task_queued(p)))
1727     deactivate_task(p, task_rq(p));
1728     __setscheduler(p, SCHED_NORMAL, 0);
1729     - if (array) {
1730     + if (queued) {
1731     __activate_task(p, task_rq(p));
1732     resched_task(rq->curr);
1733     }
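The kernel/sysctl.c hunk below registers the two tunables declared in sched.h, sched_interactive and sched_compute, in kern_table; they therefore appear as /proc/sys/kernel/interactive and /proc/sys/kernel/compute, plain integers readable by everyone and writable by root (mode 0644, handled by proc_dointvec). For reference, a small userspace snippet that flips one of them; the path follows from the kern_table placement, while the meaning of the values is documented outside this excerpt:

    /* Enable compute mode by writing 1 to its sysctl file (root only). */
    #include <stdio.h>

    static int enable_compute(void)
    {
            FILE *f = fopen("/proc/sys/kernel/compute", "w");

            if (!f)
                    return -1;      /* patch not applied or insufficient rights */
            fputs("1\n", f);
            return fclose(f);
    }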
1734     Index: linux-2.6.16-ck1/kernel/sysctl.c
1735     ===================================================================
1736     --- linux-2.6.16-ck1.orig/kernel/sysctl.c 2006-03-20 20:46:26.000000000 +1100
1737     +++ linux-2.6.16-ck1/kernel/sysctl.c 2006-03-20 20:46:48.000000000 +1100
1738     @@ -623,6 +623,22 @@ static ctl_table kern_table[] = {
1739     .mode = 0444,
1740     .proc_handler = &proc_dointvec,
1741     },
1742     + {
1743     + .ctl_name = KERN_INTERACTIVE,
1744     + .procname = "interactive",
1745     + .data = &sched_interactive,
1746     + .maxlen = sizeof (int),
1747     + .mode = 0644,
1748     + .proc_handler = &proc_dointvec,
1749     + },
1750     + {
1751     + .ctl_name = KERN_COMPUTE,
1752     + .procname = "compute",
1753     + .data = &sched_compute,
1754     + .maxlen = sizeof (int),
1755     + .mode = 0644,
1756     + .proc_handler = &proc_dointvec,
1757     + },
1758     #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86)
1759     {
1760     .ctl_name = KERN_UNKNOWN_NMI_PANIC,