Magellan Linux

Contents of /trunk/kernel26-magellan/patches-2.6.16-r10/0007-2.6.16-sched-staircase14.2.patch



Revision 70
Thu May 11 19:09:22 2006 UTC by niro
File size: 53449 bytes
import

1 fs/proc/array.c | 4
2 include/linux/sched.h | 13
3 include/linux/sysctl.h | 2
4 kernel/exit.c | 1
5 kernel/sched.c | 1022 ++++++++++++++++++-------------------------------
6 kernel/sysctl.c | 16
7 6 files changed, 406 insertions(+), 652 deletions(-)
8
9 Index: linux-2.6.16-ck1/fs/proc/array.c
10 ===================================================================
11 --- linux-2.6.16-ck1.orig/fs/proc/array.c 2006-03-20 20:46:26.000000000 +1100
12 +++ linux-2.6.16-ck1/fs/proc/array.c 2006-03-20 20:46:48.000000000 +1100
13 @@ -165,7 +165,7 @@ static inline char * task_state(struct t
14 read_lock(&tasklist_lock);
15 buffer += sprintf(buffer,
16 "State:\t%s\n"
17 - "SleepAVG:\t%lu%%\n"
18 + "Bonus:\t%d\n"
19 "Tgid:\t%d\n"
20 "Pid:\t%d\n"
21 "PPid:\t%d\n"
22 @@ -173,7 +173,7 @@ static inline char * task_state(struct t
23 "Uid:\t%d\t%d\t%d\t%d\n"
24 "Gid:\t%d\t%d\t%d\t%d\n",
25 get_task_state(p),
26 - (p->sleep_avg/1024)*100/(1020000000/1024),
27 + p->bonus,
28 p->tgid,
29 p->pid, pid_alive(p) ? p->group_leader->real_parent->tgid : 0,
30 pid_alive(p) && p->ptrace ? p->parent->pid : 0,
31 Index: linux-2.6.16-ck1/include/linux/sched.h
32 ===================================================================
33 --- linux-2.6.16-ck1.orig/include/linux/sched.h 2006-03-20 20:46:47.000000000 +1100
34 +++ linux-2.6.16-ck1/include/linux/sched.h 2006-03-20 20:46:48.000000000 +1100
35 @@ -200,6 +200,7 @@ extern void show_stack(struct task_struc
36
37 void io_schedule(void);
38 long io_schedule_timeout(long timeout);
39 +extern int sched_interactive, sched_compute;
40
41 extern void cpu_init (void);
42 extern void trap_init(void);
43 @@ -522,7 +523,6 @@ extern struct user_struct *find_user(uid
44 extern struct user_struct root_user;
45 #define INIT_USER (&root_user)
46
47 -typedef struct prio_array prio_array_t;
48 struct backing_dev_info;
49 struct reclaim_state;
50
51 @@ -723,18 +723,17 @@ struct task_struct {
52 int load_weight; /* for niceness load balancing purposes */
53 int prio, static_prio;
54 struct list_head run_list;
55 - prio_array_t *array;
56
57 unsigned short ioprio;
58
59 - unsigned long sleep_avg;
60 - unsigned long long timestamp, last_ran;
61 + unsigned long long timestamp;
62 + unsigned long runtime, totalrun, ns_debit;
63 + unsigned int bonus;
64 + unsigned int slice, time_slice;
65 unsigned long long sched_time; /* sched_clock time spent running */
66 - int activated;
67
68 unsigned long policy;
69 cpumask_t cpus_allowed;
70 - unsigned int time_slice, first_time_slice;
71
72 #ifdef CONFIG_SCHEDSTATS
73 struct sched_info sched_info;
74 @@ -948,6 +947,7 @@ static inline void put_task_struct(struc
75 #define PF_BORROWED_MM 0x00400000 /* I am a kthread doing use_mm */
76 #define PF_RANDOMIZE 0x00800000 /* randomize virtual address space */
77 #define PF_SWAPWRITE 0x01000000 /* Allowed to write to swap */
78 +#define PF_NONSLEEP 0x02000000 /* Waiting on in kernel activity */
79
80 /*
81 * Only the _current_ task can read/write to tsk->flags, but other
82 @@ -1069,7 +1069,6 @@ extern void FASTCALL(wake_up_new_task(st
83 static inline void kick_process(struct task_struct *tsk) { }
84 #endif
85 extern void FASTCALL(sched_fork(task_t * p, int clone_flags));
86 -extern void FASTCALL(sched_exit(task_t * p));
87
88 extern int in_group_p(gid_t);
89 extern int in_egroup_p(gid_t);
90 Index: linux-2.6.16-ck1/include/linux/sysctl.h
91 ===================================================================
92 --- linux-2.6.16-ck1.orig/include/linux/sysctl.h 2006-03-20 20:46:26.000000000 +1100
93 +++ linux-2.6.16-ck1/include/linux/sysctl.h 2006-03-20 20:46:48.000000000 +1100
94 @@ -148,6 +148,8 @@ enum
95 KERN_SPIN_RETRY=70, /* int: number of spinlock retries */
96 KERN_ACPI_VIDEO_FLAGS=71, /* int: flags for setting up video after ACPI sleep */
97 KERN_IA64_UNALIGNED=72, /* int: ia64 unaligned userland trap enable */
98 + KERN_INTERACTIVE=73, /* interactive tasks can have cpu bursts */
99 + KERN_COMPUTE=74, /* adjust timeslices for a compute server */
100 };
101
102
103 Index: linux-2.6.16-ck1/kernel/exit.c
104 ===================================================================
105 --- linux-2.6.16-ck1.orig/kernel/exit.c 2006-03-20 20:46:26.000000000 +1100
106 +++ linux-2.6.16-ck1/kernel/exit.c 2006-03-20 20:46:48.000000000 +1100
107 @@ -102,7 +102,6 @@ repeat:
108 zap_leader = (leader->exit_signal == -1);
109 }
110
111 - sched_exit(p);
112 write_unlock_irq(&tasklist_lock);
113 spin_unlock(&p->proc_lock);
114 proc_pid_flush(proc_dentry);
115 Index: linux-2.6.16-ck1/kernel/sched.c
116 ===================================================================
117 --- linux-2.6.16-ck1.orig/kernel/sched.c 2006-03-20 20:46:46.000000000 +1100
118 +++ linux-2.6.16-ck1/kernel/sched.c 2006-03-20 20:46:48.000000000 +1100
119 @@ -16,6 +16,9 @@
120 * by Davide Libenzi, preemptible kernel bits by Robert Love.
121 * 2003-09-03 Interactivity tuning by Con Kolivas.
122 * 2004-04-02 Scheduler domains code by Nick Piggin
123 + * 2006-03-16 New staircase scheduling policy by Con Kolivas with help
124 + * from William Lee Irwin III, Zwane Mwaikambo & Peter Williams.
125 + * Staircase v14.2
126 */
127
128 #include <linux/mm.h>
129 @@ -76,128 +79,27 @@
130 */
131 #define NS_TO_JIFFIES(TIME) ((TIME) / (1000000000 / HZ))
132 #define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ))
133 +#define NSJIFFY (1000000000 / HZ) /* One jiffy in ns */
134 +#define TASK_PREEMPTS_CURR(p, rq) ((p)->prio < (rq)->curr->prio)
135
136 +int sched_compute __read_mostly = 0;
137 /*
138 - * These are the 'tuning knobs' of the scheduler:
139 - *
140 - * Minimum timeslice is 5 msecs (or 1 jiffy, whichever is larger),
141 - * default timeslice is 100 msecs, maximum timeslice is 800 msecs.
142 - * Timeslices get refilled after they expire.
143 - */
144 -#define MIN_TIMESLICE max(5 * HZ / 1000, 1)
145 -#define DEF_TIMESLICE (100 * HZ / 1000)
146 -#define ON_RUNQUEUE_WEIGHT 30
147 -#define CHILD_PENALTY 95
148 -#define PARENT_PENALTY 100
149 -#define EXIT_WEIGHT 3
150 -#define PRIO_BONUS_RATIO 25
151 -#define MAX_BONUS (MAX_USER_PRIO * PRIO_BONUS_RATIO / 100)
152 -#define INTERACTIVE_DELTA 2
153 -#define MAX_SLEEP_AVG (DEF_TIMESLICE * MAX_BONUS)
154 -#define STARVATION_LIMIT (MAX_SLEEP_AVG)
155 -#define NS_MAX_SLEEP_AVG (JIFFIES_TO_NS(MAX_SLEEP_AVG))
156 -
157 -/*
158 - * If a task is 'interactive' then we reinsert it in the active
159 - * array after it has expired its current timeslice. (it will not
160 - * continue to run immediately, it will still roundrobin with
161 - * other interactive tasks.)
162 - *
163 - * This part scales the interactivity limit depending on niceness.
164 - *
165 - * We scale it linearly, offset by the INTERACTIVE_DELTA delta.
166 - * Here are a few examples of different nice levels:
167 - *
168 - * TASK_INTERACTIVE(-20): [1,1,1,1,1,1,1,1,1,0,0]
169 - * TASK_INTERACTIVE(-10): [1,1,1,1,1,1,1,0,0,0,0]
170 - * TASK_INTERACTIVE( 0): [1,1,1,1,0,0,0,0,0,0,0]
171 - * TASK_INTERACTIVE( 10): [1,1,0,0,0,0,0,0,0,0,0]
172 - * TASK_INTERACTIVE( 19): [0,0,0,0,0,0,0,0,0,0,0]
173 - *
174 - * (the X axis represents the possible -5 ... 0 ... +5 dynamic
175 - * priority range a task can explore, a value of '1' means the
176 - * task is rated interactive.)
177 - *
178 - * Ie. nice +19 tasks can never get 'interactive' enough to be
179 - * reinserted into the active array. And only heavily CPU-hog nice -20
180 - * tasks will be expired. Default nice 0 tasks are somewhere between,
181 - * it takes some effort for them to get interactive, but it's not
182 - * too hard.
183 - */
184 -
185 -#define CURRENT_BONUS(p) \
186 - (NS_TO_JIFFIES((p)->sleep_avg) * MAX_BONUS / \
187 - MAX_SLEEP_AVG)
188 -
189 -#define GRANULARITY (10 * HZ / 1000 ? : 1)
190 -
191 -#ifdef CONFIG_SMP
192 -#define TIMESLICE_GRANULARITY(p) (GRANULARITY * \
193 - (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)) * \
194 - num_online_cpus())
195 -#else
196 -#define TIMESLICE_GRANULARITY(p) (GRANULARITY * \
197 - (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)))
198 -#endif
199 -
200 -#define SCALE(v1,v1_max,v2_max) \
201 - (v1) * (v2_max) / (v1_max)
202 -
203 -#define DELTA(p) \
204 - (SCALE(TASK_NICE(p), 40, MAX_BONUS) + INTERACTIVE_DELTA)
205 -
206 -#define TASK_INTERACTIVE(p) \
207 - ((p)->prio <= (p)->static_prio - DELTA(p))
208 -
209 -#define INTERACTIVE_SLEEP(p) \
210 - (JIFFIES_TO_NS(MAX_SLEEP_AVG * \
211 - (MAX_BONUS / 2 + DELTA((p)) + 1) / MAX_BONUS - 1))
212 -
213 -#define TASK_PREEMPTS_CURR(p, rq) \
214 - ((p)->prio < (rq)->curr->prio)
215 -
216 -/*
217 - * task_timeslice() scales user-nice values [ -20 ... 0 ... 19 ]
218 - * to time slice values: [800ms ... 100ms ... 5ms]
219 - *
220 - * The higher a thread's priority, the bigger timeslices
221 - * it gets during one round of execution. But even the lowest
222 - * priority thread gets MIN_TIMESLICE worth of execution time.
223 + *This is the time all tasks within the same priority round robin.
224 + *compute setting is reserved for dedicated computational scheduling
225 + *and has twenty times larger intervals. Set to a minimum of 6ms.
226 */
227 +#define _RR_INTERVAL ((6 * HZ / 1001) + 1)
228 +#define RR_INTERVAL() (_RR_INTERVAL * (1 + 16 * sched_compute))
229 +#define DEF_TIMESLICE (RR_INTERVAL() * 19)
230
231 -#define SCALE_PRIO(x, prio) \
232 - max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_TIMESLICE)
233 -
234 -static unsigned int static_prio_timeslice(int static_prio)
235 -{
236 - if (static_prio < NICE_TO_PRIO(0))
237 - return SCALE_PRIO(DEF_TIMESLICE * 4, static_prio);
238 - else
239 - return SCALE_PRIO(DEF_TIMESLICE, static_prio);
240 -}
241 -
242 -static inline unsigned int task_timeslice(task_t *p)
243 -{
244 - return static_prio_timeslice(p->static_prio);
245 -}
246 -
247 -#define task_hot(p, now, sd) ((long long) ((now) - (p)->last_ran) \
248 +#define task_hot(p, now, sd) ((long long) ((now) - (p)->timestamp) \
249 < (long long) (sd)->cache_hot_time)
250
251 /*
252 * These are the runqueue data structures:
253 */
254 -
255 -#define BITMAP_SIZE ((((MAX_PRIO+1+7)/8)+sizeof(long)-1)/sizeof(long))
256 -
257 typedef struct runqueue runqueue_t;
258
259 -struct prio_array {
260 - unsigned int nr_active;
261 - unsigned long bitmap[BITMAP_SIZE];
262 - struct list_head queue[MAX_PRIO];
263 -};
264 -
265 /*
266 * This is the main, per-CPU runqueue data structure.
267 *
268 @@ -227,12 +129,12 @@ struct runqueue {
269 */
270 unsigned long nr_uninterruptible;
271
272 - unsigned long expired_timestamp;
273 unsigned long long timestamp_last_tick;
274 + unsigned int cache_ticks, preempted;
275 task_t *curr, *idle;
276 struct mm_struct *prev_mm;
277 - prio_array_t *active, *expired, arrays[2];
278 - int best_expired_prio;
279 + unsigned long bitmap[BITS_TO_LONGS(MAX_PRIO + 1)];
280 + struct list_head queue[MAX_PRIO];
281 atomic_t nr_iowait;
282
283 #ifdef CONFIG_SMP
284 @@ -496,13 +398,7 @@ static inline runqueue_t *this_rq_lock(v
285
286 #ifdef CONFIG_SCHEDSTATS
287 /*
288 - * Called when a process is dequeued from the active array and given
289 - * the cpu. We should note that with the exception of interactive
290 - * tasks, the expired queue will become the active queue after the active
291 - * queue is empty, without explicitly dequeuing and requeuing tasks in the
292 - * expired queue. (Interactive tasks may be requeued directly to the
293 - * active queue, thus delaying tasks in the expired queue from running;
294 - * see scheduler_tick()).
295 + * Called when a process is dequeued and given the cpu.
296 *
297 * This function is only called from sched_info_arrive(), rather than
298 * dequeue_task(). Even though a task may be queued and dequeued multiple
299 @@ -540,13 +436,11 @@ static void sched_info_arrive(task_t *t)
300 }
301
302 /*
303 - * Called when a process is queued into either the active or expired
304 - * array. The time is noted and later used to determine how long we
305 - * had to wait for us to reach the cpu. Since the expired queue will
306 - * become the active queue after active queue is empty, without dequeuing
307 - * and requeuing any tasks, we are interested in queuing to either. It
308 - * is unusual but not impossible for tasks to be dequeued and immediately
309 - * requeued in the same or another array: this can happen in sched_yield(),
310 + * Called when a process is queued
311 + * The time is noted and later used to determine how long we had to wait for
312 + * us to reach the cpu.
313 + * It is unusual but not impossible for tasks to be dequeued and immediately
314 + * requeued: this can happen in sched_yield(),
315 * set_user_nice(), and even load_balance() as it moves tasks from runqueue
316 * to runqueue.
317 *
318 @@ -601,73 +495,67 @@ static inline void sched_info_switch(tas
319 #endif /* CONFIG_SCHEDSTATS */
320
321 /*
322 - * Adding/removing a task to/from a priority array:
323 + * Get nanosecond clock difference without overflowing unsigned long.
324 */
325 -static void dequeue_task(struct task_struct *p, prio_array_t *array)
326 +static unsigned long ns_diff(const unsigned long long v1,
327 + const unsigned long long v2)
328 {
329 - array->nr_active--;
330 - list_del(&p->run_list);
331 - if (list_empty(array->queue + p->prio))
332 - __clear_bit(p->prio, array->bitmap);
333 + unsigned long long vdiff;
334 + if (likely(v1 > v2)) {
335 + vdiff = v1 - v2;
336 +#if BITS_PER_LONG < 64
337 + if (vdiff > (1 << 31))
338 + vdiff = 1 << 31;
339 +#endif
340 + } else {
341 + /*
342 + * Rarely the clock appears to go backwards. There should
343 + * always be a positive difference so return 1.
344 + */
345 + vdiff = 1;
346 + }
347 + return (unsigned long)vdiff;
348 }
349
350 -static void enqueue_task(struct task_struct *p, prio_array_t *array)
351 +static inline int task_queued(const task_t *task)
352 {
353 - sched_info_queued(p);
354 - list_add_tail(&p->run_list, array->queue + p->prio);
355 - __set_bit(p->prio, array->bitmap);
356 - array->nr_active++;
357 - p->array = array;
358 + return !list_empty(&task->run_list);
359 }
360
361 /*
362 - * Put task to the end of the run list without the overhead of dequeue
363 - * followed by enqueue.
364 + * Adding/removing a task to/from a runqueue:
365 */
366 -static void requeue_task(struct task_struct *p, prio_array_t *array)
367 +static void fastcall dequeue_task(task_t *p, runqueue_t *rq)
368 {
369 - list_move_tail(&p->run_list, array->queue + p->prio);
370 + list_del_init(&p->run_list);
371 + if (list_empty(rq->queue + p->prio))
372 + __clear_bit(p->prio, rq->bitmap);
373 + p->ns_debit = 0;
374 }
375
376 -static inline void enqueue_task_head(struct task_struct *p, prio_array_t *array)
377 +static void fastcall enqueue_task(task_t *p, runqueue_t *rq)
378 {
379 - list_add(&p->run_list, array->queue + p->prio);
380 - __set_bit(p->prio, array->bitmap);
381 - array->nr_active++;
382 - p->array = array;
383 + list_add_tail(&p->run_list, rq->queue + p->prio);
384 + __set_bit(p->prio, rq->bitmap);
385 }
386
387 /*
388 - * effective_prio - return the priority that is based on the static
389 - * priority but is modified by bonuses/penalties.
390 - *
391 - * We scale the actual sleep average [0 .... MAX_SLEEP_AVG]
392 - * into the -5 ... 0 ... +5 bonus/penalty range.
393 - *
394 - * We use 25% of the full 0...39 priority range so that:
395 - *
396 - * 1) nice +19 interactive tasks do not preempt nice 0 CPU hogs.
397 - * 2) nice -20 CPU hogs do not get preempted by nice 0 tasks.
398 - *
399 - * Both properties are important to certain workloads.
400 + * Put task to the end of the run list without the overhead of dequeue
401 + * followed by enqueue.
402 */
403 -static int effective_prio(task_t *p)
404 +static inline void requeue_task(task_t *p, runqueue_t *rq)
405 {
406 - int bonus, prio;
407 -
408 - if (rt_task(p))
409 - return p->prio;
410 -
411 - bonus = CURRENT_BONUS(p) - MAX_BONUS / 2;
412 + list_move_tail(&p->run_list, rq->queue + p->prio);
413 +}
414
415 - prio = p->static_prio - bonus;
416 - if (prio < MAX_RT_PRIO)
417 - prio = MAX_RT_PRIO;
418 - if (prio > MAX_PRIO-1)
419 - prio = MAX_PRIO-1;
420 - return prio;
421 +static inline void enqueue_task_head(task_t *p, runqueue_t *rq)
422 +{
423 + list_add(&p->run_list, rq->queue + p->prio);
424 + __set_bit(p->prio, rq->bitmap);
425 }
426
427 +static unsigned int fastcall slice(const task_t *p);
428 +
429 /*
430 * To aid in avoiding the subversion of "niceness" due to uneven distribution
431 * of tasks with abnormal "nice" values across CPUs the contribution that
432 @@ -685,10 +573,9 @@ static int effective_prio(task_t *p)
433 #define TIME_SLICE_NICE_ZERO DEF_TIMESLICE
434 #define LOAD_WEIGHT(lp) \
435 (((lp) * SCHED_LOAD_SCALE) / TIME_SLICE_NICE_ZERO)
436 -#define PRIO_TO_LOAD_WEIGHT(prio) \
437 - LOAD_WEIGHT(static_prio_timeslice(prio))
438 -#define RTPRIO_TO_LOAD_WEIGHT(rp) \
439 - (PRIO_TO_LOAD_WEIGHT(MAX_RT_PRIO) + LOAD_WEIGHT(rp))
440 +#define TASK_LOAD_WEIGHT(p) LOAD_WEIGHT(slice(p))
441 +#define RTPRIO_TO_LOAD_WEIGHT(rp) \
442 + (LOAD_WEIGHT((RR_INTERVAL() + 20 + (rp))))
443
444 static void set_load_weight(task_t *p)
445 {
446 @@ -705,7 +592,7 @@ static void set_load_weight(task_t *p)
447 #endif
448 p->load_weight = RTPRIO_TO_LOAD_WEIGHT(p->rt_priority);
449 } else
450 - p->load_weight = PRIO_TO_LOAD_WEIGHT(p->static_prio);
451 + p->load_weight = TASK_LOAD_WEIGHT(p);
452 }
453
454 static inline void inc_raw_weighted_load(runqueue_t *rq, const task_t *p)
455 @@ -733,9 +620,9 @@ static inline void dec_nr_running(task_t
456 /*
457 * __activate_task - move a task to the runqueue.
458 */
459 -static inline void __activate_task(task_t *p, runqueue_t *rq)
460 +static void fastcall __activate_task(task_t *p, runqueue_t *rq)
461 {
462 - enqueue_task(p, rq->active);
463 + enqueue_task(p, rq);
464 inc_nr_running(p, rq);
465 }
466
467 @@ -744,74 +631,157 @@ static inline void __activate_task(task_
468 */
469 static inline void __activate_idle_task(task_t *p, runqueue_t *rq)
470 {
471 - enqueue_task_head(p, rq->active);
472 + enqueue_task_head(p, rq);
473 inc_nr_running(p, rq);
474 }
475
476 -static int recalc_task_prio(task_t *p, unsigned long long now)
477 +/*
478 + * Bonus - How much higher than its base priority an interactive task can run.
479 + */
480 +static inline unsigned int bonus(const task_t *p)
481 {
482 - /* Caller must always ensure 'now >= p->timestamp' */
483 - unsigned long long __sleep_time = now - p->timestamp;
484 - unsigned long sleep_time;
485 -
486 - if (unlikely(p->policy == SCHED_BATCH))
487 - sleep_time = 0;
488 - else {
489 - if (__sleep_time > NS_MAX_SLEEP_AVG)
490 - sleep_time = NS_MAX_SLEEP_AVG;
491 - else
492 - sleep_time = (unsigned long)__sleep_time;
493 - }
494 + return TASK_USER_PRIO(p);
495 +}
496
497 - if (likely(sleep_time > 0)) {
498 - /*
499 - * User tasks that sleep a long time are categorised as
500 - * idle and will get just interactive status to stay active &
501 - * prevent them suddenly becoming cpu hogs and starving
502 - * other processes.
503 - */
504 - if (p->mm && p->activated != -1 &&
505 - sleep_time > INTERACTIVE_SLEEP(p)) {
506 - p->sleep_avg = JIFFIES_TO_NS(MAX_SLEEP_AVG -
507 - DEF_TIMESLICE);
508 - } else {
509 - /*
510 - * The lower the sleep avg a task has the more
511 - * rapidly it will rise with sleep time.
512 - */
513 - sleep_time *= (MAX_BONUS - CURRENT_BONUS(p)) ? : 1;
514 +static unsigned int fastcall rr_interval(const task_t *p)
515 +{
516 + int nice = TASK_NICE(p);
517
518 - /*
519 - * Tasks waking from uninterruptible sleep are
520 - * limited in their sleep_avg rise as they
521 - * are likely to be waiting on I/O
522 - */
523 - if (p->activated == -1 && p->mm) {
524 - if (p->sleep_avg >= INTERACTIVE_SLEEP(p))
525 - sleep_time = 0;
526 - else if (p->sleep_avg + sleep_time >=
527 - INTERACTIVE_SLEEP(p)) {
528 - p->sleep_avg = INTERACTIVE_SLEEP(p);
529 - sleep_time = 0;
530 - }
531 - }
532 + if (nice < 0 && !rt_task(p))
533 + return RR_INTERVAL() * (20 - nice) / 20;
534 + return RR_INTERVAL();
535 +}
536
537 - /*
538 - * This code gives a bonus to interactive tasks.
539 - *
540 - * The boost works by updating the 'average sleep time'
541 - * value here, based on ->timestamp. The more time a
542 - * task spends sleeping, the higher the average gets -
543 - * and the higher the priority boost gets as well.
544 - */
545 - p->sleep_avg += sleep_time;
546 +/*
547 + * slice - the duration a task runs before getting requeued at its best
548 + * priority and has its bonus decremented.
549 + */
550 +static unsigned int fastcall slice(const task_t *p)
551 +{
552 + unsigned int slice, rr;
553
554 - if (p->sleep_avg > NS_MAX_SLEEP_AVG)
555 - p->sleep_avg = NS_MAX_SLEEP_AVG;
556 - }
557 + slice = rr = rr_interval(p);
558 + if (likely(!rt_task(p)))
559 + slice += (39 - TASK_USER_PRIO(p)) * rr;
560 + return slice;
561 +}
562 +
563 +/*
564 + * We increase our bonus by sleeping more than the time we ran.
565 + * The ratio of sleep to run gives us the cpu% that we last ran and determines
566 + * the maximum bonus we can acquire.
567 + */
568 +static void fastcall inc_bonus(task_t *p, const unsigned long totalrun,
569 + const unsigned long sleep)
570 +{
571 + unsigned int best_bonus;
572 +
573 + best_bonus = sleep / (totalrun + 1);
574 + if (p->bonus >= best_bonus)
575 + return;
576 +
577 + p->bonus++;
578 + best_bonus = bonus(p);
579 + if (p->bonus > best_bonus)
580 + p->bonus = best_bonus;
581 +}
582 +
583 +static void dec_bonus(task_t *p)
584 +{
585 + if (p->bonus)
586 + p->bonus--;
587 +}
588 +
589 +/*
590 + * sched_interactive - sysctl which allows interactive tasks to have bonus
591 + * raise its priority.
592 + */
593 +int sched_interactive __read_mostly = 1;
594 +
595 +/*
596 + * effective_prio - dynamic priority dependent on bonus.
597 + * The priority normally decreases by one each RR_INTERVAL.
598 + * As the bonus increases the initial priority starts at a higher "stair" or
599 + * priority for longer.
600 + */
601 +static int effective_prio(const task_t *p)
602 +{
603 + int prio;
604 + unsigned int full_slice, used_slice = 0;
605 + unsigned int best_bonus, rr;
606 +
607 + if (rt_task(p))
608 + return p->prio;
609 +
610 + full_slice = slice(p);
611 + if (full_slice > p->slice)
612 + used_slice = full_slice - p->slice;
613 +
614 + best_bonus = bonus(p);
615 + prio = MAX_RT_PRIO + best_bonus;
616 + if (sched_interactive && !sched_compute && p->policy != SCHED_BATCH)
617 + prio -= p->bonus;
618 +
619 + rr = rr_interval(p);
620 + prio += used_slice / rr;
621 + if (prio > MAX_PRIO - 1)
622 + prio = MAX_PRIO - 1;
623 + return prio;
624 +}
625 +
626 +static inline void continue_slice(task_t *p)
627 +{
628 + unsigned long total_run = NS_TO_JIFFIES(p->totalrun);
629 +
630 + if (total_run >= p->slice) {
631 + p->totalrun -= JIFFIES_TO_NS(p->slice);
632 + dec_bonus(p);
633 + } else {
634 + unsigned int remainder;
635 +
636 + p->slice -= total_run;
637 + remainder = p->slice % rr_interval(p);
638 + if (remainder)
639 + p->time_slice = remainder;
640 }
641 +}
642
643 - return effective_prio(p);
644 +/*
645 + * recalc_task_prio - this checks for tasks that run ultra short timeslices
646 + * or have just forked a thread/process and make them continue their old
647 + * slice instead of starting a new one at high priority.
648 + */
649 +static inline void recalc_task_prio(task_t *p, const unsigned long long now)
650 +{
651 + unsigned long sleep_time = ns_diff(now, p->timestamp);
652 +
653 + /*
654 + * Add the total for this last scheduled run (p->runtime) to the
655 + * running total so far used (p->totalrun).
656 + */
657 + p->totalrun += p->runtime;
658 +
659 + /*
660 + * If we sleep longer than our running total and have not set the
661 + * PF_NONSLEEP flag we gain a bonus.
662 + */
663 + if (sleep_time >= p->totalrun && !(p->flags & PF_NONSLEEP) &&
664 + !sched_compute) {
665 + inc_bonus(p, p->totalrun, sleep_time);
666 + p->totalrun = 0;
667 + return;
668 + }
669 +
670 + /*
671 + * If we have not set the PF_NONSLEEP flag we elevate priority by the
672 + * amount of time we slept.
673 + */
674 + if (p->flags & PF_NONSLEEP)
675 + p->flags &= ~PF_NONSLEEP;
676 + else
677 + p->totalrun -= sleep_time;
678 +
679 + continue_slice(p);
680 }
681
682 /*
683 @@ -820,11 +790,11 @@ static int recalc_task_prio(task_t *p, u
684 * Update all the scheduling statistics stuff. (sleep average
685 * calculation, priority modifiers, etc.)
686 */
687 -static void activate_task(task_t *p, runqueue_t *rq, int local)
688 +static void activate_task(task_t *p, runqueue_t *rq, const int local)
689 {
690 - unsigned long long now;
691 + unsigned long long now = sched_clock();
692 + unsigned long rr = rr_interval(p);
693
694 - now = sched_clock();
695 #ifdef CONFIG_SMP
696 if (!local) {
697 /* Compensate for drifting sched_clock */
698 @@ -833,45 +803,24 @@ static void activate_task(task_t *p, run
699 + rq->timestamp_last_tick;
700 }
701 #endif
702 -
703 - if (!rt_task(p))
704 - p->prio = recalc_task_prio(p, now);
705 -
706 - /*
707 - * This checks to make sure it's not an uninterruptible task
708 - * that is now waking up.
709 - */
710 - if (!p->activated) {
711 - /*
712 - * Tasks which were woken up by interrupts (ie. hw events)
713 - * are most likely of interactive nature. So we give them
714 - * the credit of extending their sleep time to the period
715 - * of time they spend on the runqueue, waiting for execution
716 - * on a CPU, first time around:
717 - */
718 - if (in_interrupt())
719 - p->activated = 2;
720 - else {
721 - /*
722 - * Normal first-time wakeups get a credit too for
723 - * on-runqueue time, but it will be weighted down:
724 - */
725 - p->activated = 1;
726 - }
727 + p->slice = slice(p);
728 + p->time_slice = p->slice % rr ? : rr;
729 + if (!rt_task(p)) {
730 + recalc_task_prio(p, now);
731 + p->flags &= ~PF_NONSLEEP;
732 + p->prio = effective_prio(p);
733 }
734 p->timestamp = now;
735 -
736 __activate_task(p, rq);
737 }
738
739 /*
740 * deactivate_task - remove a task from the runqueue.
741 */
742 -static void deactivate_task(struct task_struct *p, runqueue_t *rq)
743 +static void fastcall deactivate_task(task_t *p, runqueue_t *rq)
744 {
745 dec_nr_running(p, rq);
746 - dequeue_task(p, p->array);
747 - p->array = NULL;
748 + dequeue_task(p, rq);
749 }
750
751 /*
752 @@ -947,7 +896,7 @@ static int migrate_task(task_t *p, int d
753 * If the task is not on a runqueue (and not running), then
754 * it is sufficient to simply update the task's cpu field.
755 */
756 - if (!p->array && !task_running(rq, p)) {
757 + if (!task_queued(p) && !task_running(rq, p)) {
758 set_task_cpu(p, dest_cpu);
759 return 0;
760 }
761 @@ -977,7 +926,7 @@ void wait_task_inactive(task_t *p)
762 repeat:
763 rq = task_rq_lock(p, &flags);
764 /* Must be off runqueue entirely, not preempted. */
765 - if (unlikely(p->array || task_running(rq, p))) {
766 + if (unlikely(task_queued(p) || task_running(rq, p))) {
767 /* If it's preempted, we yield. It could be a while. */
768 preempted = !task_running(rq, p);
769 task_rq_unlock(rq, &flags);
770 @@ -1228,6 +1177,26 @@ static inline int wake_idle(int cpu, tas
771 }
772 #endif
773
774 +/*
775 + * CACHE_DELAY is the time preemption is delayed in sched_compute mode
776 + * and is set to a nominal 10ms.
777 + */
778 +#define CACHE_DELAY (10 * (HZ) / 1001 + 1)
779 +
780 +/*
781 + * Check to see if p preempts rq->curr and resched if it does. In compute
782 + * mode we do not preempt for at least CACHE_DELAY and set rq->preempted.
783 + */
784 +static void fastcall preempt(const task_t *p, runqueue_t *rq)
785 +{
786 + if (p->prio >= rq->curr->prio)
787 + return;
788 + if (!sched_compute || rq->cache_ticks >= CACHE_DELAY ||
789 + !p->mm || rt_task(p))
790 + resched_task(rq->curr);
791 + rq->preempted = 1;
792 +}
793 +
794 /***
795 * try_to_wake_up - wake up a thread
796 * @p: the to-be-woken-up thread
797 @@ -1259,7 +1228,7 @@ static int try_to_wake_up(task_t *p, uns
798 if (!(old_state & state))
799 goto out;
800
801 - if (p->array)
802 + if (task_queued(p))
803 goto out_running;
804
805 cpu = task_cpu(p);
806 @@ -1350,7 +1319,7 @@ out_set_cpu:
807 old_state = p->state;
808 if (!(old_state & state))
809 goto out;
810 - if (p->array)
811 + if (task_queued(p))
812 goto out_running;
813
814 this_cpu = smp_processor_id();
815 @@ -1359,26 +1328,10 @@ out_set_cpu:
816
817 out_activate:
818 #endif /* CONFIG_SMP */
819 - if (old_state == TASK_UNINTERRUPTIBLE) {
820 + if (old_state == TASK_UNINTERRUPTIBLE)
821 rq->nr_uninterruptible--;
822 - /*
823 - * Tasks on involuntary sleep don't earn
824 - * sleep_avg beyond just interactive state.
825 - */
826 - p->activated = -1;
827 - }
828
829 /*
830 - * Tasks that have marked their sleep as noninteractive get
831 - * woken up without updating their sleep average. (i.e. their
832 - * sleep is handled in a priority-neutral manner, no priority
833 - * boost and no penalty.)
834 - */
835 - if (old_state & TASK_NONINTERACTIVE)
836 - __activate_task(p, rq);
837 - else
838 - activate_task(p, rq, cpu == this_cpu);
839 - /*
840 * Sync wakeups (i.e. those types of wakeups where the waker
841 * has indicated that it will leave the CPU in short order)
842 * don't trigger a preemption, if the woken up task will run on
843 @@ -1386,10 +1339,9 @@ out_activate:
844 * the waker guarantees that the freshly woken up task is going
845 * to be considered on this CPU.)
846 */
847 - if (!sync || cpu != this_cpu) {
848 - if (TASK_PREEMPTS_CURR(p, rq))
849 - resched_task(rq->curr);
850 - }
851 + activate_task(p, rq, cpu == this_cpu);
852 + if (!sync || cpu != this_cpu)
853 + preempt(p, rq);
854 success = 1;
855
856 out_running:
857 @@ -1434,7 +1386,6 @@ void fastcall sched_fork(task_t *p, int
858 */
859 p->state = TASK_RUNNING;
860 INIT_LIST_HEAD(&p->run_list);
861 - p->array = NULL;
862 #ifdef CONFIG_SCHEDSTATS
863 memset(&p->sched_info, 0, sizeof(p->sched_info));
864 #endif
865 @@ -1445,30 +1396,6 @@ void fastcall sched_fork(task_t *p, int
866 /* Want to start with kernel preemption disabled. */
867 task_thread_info(p)->preempt_count = 1;
868 #endif
869 - /*
870 - * Share the timeslice between parent and child, thus the
871 - * total amount of pending timeslices in the system doesn't change,
872 - * resulting in more scheduling fairness.
873 - */
874 - local_irq_disable();
875 - p->time_slice = (current->time_slice + 1) >> 1;
876 - /*
877 - * The remainder of the first timeslice might be recovered by
878 - * the parent if the child exits early enough.
879 - */
880 - p->first_time_slice = 1;
881 - current->time_slice >>= 1;
882 - p->timestamp = sched_clock();
883 - if (unlikely(!current->time_slice)) {
884 - /*
885 - * This case is rare, it happens when the parent has only
886 - * a single jiffy left from its timeslice. Taking the
887 - * runqueue lock is not a problem.
888 - */
889 - current->time_slice = 1;
890 - scheduler_tick();
891 - }
892 - local_irq_enable();
893 put_cpu();
894 }
895
896 @@ -1491,36 +1418,20 @@ void fastcall wake_up_new_task(task_t *p
897 cpu = task_cpu(p);
898
899 /*
900 - * We decrease the sleep average of forking parents
901 - * and children as well, to keep max-interactive tasks
902 - * from forking tasks that are max-interactive. The parent
903 - * (current) is done further down, under its lock.
904 + * Forked process gets no bonus to prevent fork bombs.
905 */
906 - p->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(p) *
907 - CHILD_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS);
908 -
909 - p->prio = effective_prio(p);
910 + p->bonus = 0;
911
912 if (likely(cpu == this_cpu)) {
913 - if (!(clone_flags & CLONE_VM)) {
914 + current->flags |= PF_NONSLEEP;
915 + activate_task(p, rq, 1);
916 + if (!(clone_flags & CLONE_VM))
917 /*
918 * The VM isn't cloned, so we're in a good position to
919 * do child-runs-first in anticipation of an exec. This
920 * usually avoids a lot of COW overhead.
921 */
922 - if (unlikely(!current->array))
923 - __activate_task(p, rq);
924 - else {
925 - p->prio = current->prio;
926 - list_add_tail(&p->run_list, &current->run_list);
927 - p->array = current->array;
928 - p->array->nr_active++;
929 - inc_nr_running(p, rq);
930 - }
931 set_need_resched();
932 - } else
933 - /* Run child last */
934 - __activate_task(p, rq);
935 /*
936 * We skip the following code due to cpu == this_cpu
937 *
938 @@ -1537,53 +1448,20 @@ void fastcall wake_up_new_task(task_t *p
939 */
940 p->timestamp = (p->timestamp - this_rq->timestamp_last_tick)
941 + rq->timestamp_last_tick;
942 - __activate_task(p, rq);
943 - if (TASK_PREEMPTS_CURR(p, rq))
944 - resched_task(rq->curr);
945 + activate_task(p, rq, 0);
946 + preempt(p, rq);
947
948 /*
949 * Parent and child are on different CPUs, now get the
950 - * parent runqueue to update the parent's ->sleep_avg:
951 + * parent runqueue to update the parent's ->flags:
952 */
953 task_rq_unlock(rq, &flags);
954 this_rq = task_rq_lock(current, &flags);
955 + current->flags |= PF_NONSLEEP;
956 }
957 - current->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(current) *
958 - PARENT_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS);
959 task_rq_unlock(this_rq, &flags);
960 }
961
962 -/*
963 - * Potentially available exiting-child timeslices are
964 - * retrieved here - this way the parent does not get
965 - * penalized for creating too many threads.
966 - *
967 - * (this cannot be used to 'generate' timeslices
968 - * artificially, because any timeslice recovered here
969 - * was given away by the parent in the first place.)
970 - */
971 -void fastcall sched_exit(task_t *p)
972 -{
973 - unsigned long flags;
974 - runqueue_t *rq;
975 -
976 - /*
977 - * If the child was a (relative-) CPU hog then decrease
978 - * the sleep_avg of the parent as well.
979 - */
980 - rq = task_rq_lock(p->parent, &flags);
981 - if (p->first_time_slice && task_cpu(p) == task_cpu(p->parent)) {
982 - p->parent->time_slice += p->time_slice;
983 - if (unlikely(p->parent->time_slice > task_timeslice(p)))
984 - p->parent->time_slice = task_timeslice(p);
985 - }
986 - if (p->sleep_avg < p->parent->sleep_avg)
987 - p->parent->sleep_avg = p->parent->sleep_avg /
988 - (EXIT_WEIGHT + 1) * EXIT_WEIGHT + p->sleep_avg /
989 - (EXIT_WEIGHT + 1);
990 - task_rq_unlock(rq, &flags);
991 -}
992 -
993 /**
994 * prepare_task_switch - prepare to switch tasks
995 * @rq: the runqueue preparing to switch
996 @@ -1855,32 +1733,28 @@ void sched_exec(void)
997 * pull_task - move a task from a remote runqueue to the local runqueue.
998 * Both runqueues must be locked.
999 */
1000 -static
1001 -void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p,
1002 - runqueue_t *this_rq, prio_array_t *this_array, int this_cpu)
1003 +static void pull_task(runqueue_t *src_rq, task_t *p, runqueue_t *this_rq,
1004 + const int this_cpu)
1005 {
1006 - dequeue_task(p, src_array);
1007 + dequeue_task(p, src_rq);
1008 dec_nr_running(p, src_rq);
1009 set_task_cpu(p, this_cpu);
1010 inc_nr_running(p, this_rq);
1011 - enqueue_task(p, this_array);
1012 + enqueue_task(p, this_rq);
1013 p->timestamp = (p->timestamp - src_rq->timestamp_last_tick)
1014 + this_rq->timestamp_last_tick;
1015 /*
1016 * Note that idle threads have a prio of MAX_PRIO, for this test
1017 * to be always true for them.
1018 */
1019 - if (TASK_PREEMPTS_CURR(p, this_rq))
1020 - resched_task(this_rq->curr);
1021 + preempt(p, this_rq);
1022 }
1023
1024 /*
1025 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
1026 */
1027 -static
1028 -int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu,
1029 - struct sched_domain *sd, enum idle_type idle,
1030 - int *all_pinned)
1031 +static int can_migrate_task(task_t *p, runqueue_t *rq, const int this_cpu,
1032 + struct sched_domain *sd, const enum idle_type idle, int *all_pinned)
1033 {
1034 /*
1035 * We do not migrate tasks that are:
1036 @@ -1921,7 +1795,6 @@ static int move_tasks(runqueue_t *this_r
1037 struct sched_domain *sd, enum idle_type idle,
1038 int *all_pinned)
1039 {
1040 - prio_array_t *array, *dst_array;
1041 struct list_head *head, *curr;
1042 int idx, pulled = 0, pinned = 0;
1043 long rem_load_move;
1044 @@ -1933,38 +1806,17 @@ static int move_tasks(runqueue_t *this_r
1045 rem_load_move = max_load_move;
1046 pinned = 1;
1047
1048 - /*
1049 - * We first consider expired tasks. Those will likely not be
1050 - * executed in the near future, and they are most likely to
1051 - * be cache-cold, thus switching CPUs has the least effect
1052 - * on them.
1053 - */
1054 - if (busiest->expired->nr_active) {
1055 - array = busiest->expired;
1056 - dst_array = this_rq->expired;
1057 - } else {
1058 - array = busiest->active;
1059 - dst_array = this_rq->active;
1060 - }
1061 -
1062 -new_array:
1063 /* Start searching at priority 0: */
1064 idx = 0;
1065 skip_bitmap:
1066 if (!idx)
1067 - idx = sched_find_first_bit(array->bitmap);
1068 + idx = sched_find_first_bit(busiest->bitmap);
1069 else
1070 - idx = find_next_bit(array->bitmap, MAX_PRIO, idx);
1071 - if (idx >= MAX_PRIO) {
1072 - if (array == busiest->expired && busiest->active->nr_active) {
1073 - array = busiest->active;
1074 - dst_array = this_rq->active;
1075 - goto new_array;
1076 - }
1077 + idx = find_next_bit(busiest->bitmap, MAX_PRIO, idx);
1078 + if (idx >= MAX_PRIO)
1079 goto out;
1080 - }
1081
1082 - head = array->queue + idx;
1083 + head = busiest->queue + idx;
1084 curr = head->prev;
1085 skip_queue:
1086 tmp = list_entry(curr, task_t, run_list);
1087 @@ -1984,7 +1836,7 @@ skip_queue:
1088 schedstat_inc(sd, lb_hot_gained[idle]);
1089 #endif
1090
1091 - pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu);
1092 + pull_task(busiest, tmp, this_rq, this_cpu);
1093 pulled++;
1094 rem_load_move -= tmp->load_weight;
1095
1096 @@ -2507,15 +2359,13 @@ static void rebalance_tick(int this_cpu,
1097 continue;
1098
1099 interval = sd->balance_interval;
1100 - if (idle != SCHED_IDLE)
1101 - interval *= sd->busy_factor;
1102
1103 /* scale ms to jiffies */
1104 interval = msecs_to_jiffies(interval);
1105 if (unlikely(!interval))
1106 interval = 1;
1107
1108 - if (j - sd->last_balance >= interval) {
1109 + if (idle != SCHED_IDLE || j - sd->last_balance >= interval) {
1110 if (load_balance(this_cpu, this_rq, sd, idle)) {
1111 /*
1112 * We've pulled tasks over so either we're no
1113 @@ -2589,22 +2439,6 @@ unsigned long long current_sched_time(co
1114 }
1115
1116 /*
1117 - * We place interactive tasks back into the active array, if possible.
1118 - *
1119 - * To guarantee that this does not starve expired tasks we ignore the
1120 - * interactivity of a task if the first expired task had to wait more
1121 - * than a 'reasonable' amount of time. This deadline timeout is
1122 - * load-dependent, as the frequency of array switched decreases with
1123 - * increasing number of running tasks. We also ignore the interactivity
1124 - * if a better static_prio task has expired:
1125 - */
1126 -#define EXPIRED_STARVING(rq) \
1127 - ((STARVATION_LIMIT && ((rq)->expired_timestamp && \
1128 - (jiffies - (rq)->expired_timestamp >= \
1129 - STARVATION_LIMIT * ((rq)->nr_running) + 1))) || \
1130 - ((rq)->curr->static_prio > (rq)->best_expired_prio))
1131 -
1132 -/*
1133 * Account user cpu time to a process.
1134 * @p: the process that the cpu time gets accounted to
1135 * @hardirq_offset: the offset to subtract from hardirq_count()
1136 @@ -2652,6 +2486,7 @@ void account_system_time(struct task_str
1137 cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
1138 else
1139 cpustat->idle = cputime64_add(cpustat->idle, tmp);
1140 +
1141 /* Account for system time used */
1142 acct_update_integrals(p);
1143 }
1144 @@ -2677,18 +2512,25 @@ void account_steal_time(struct task_stru
1145 cpustat->steal = cputime64_add(cpustat->steal, tmp);
1146 }
1147
1148 +static void time_slice_expired(task_t *p, runqueue_t *rq)
1149 +{
1150 + set_tsk_need_resched(p);
1151 + dequeue_task(p, rq);
1152 + p->prio = effective_prio(p);
1153 + p->time_slice = rr_interval(p);
1154 + enqueue_task(p, rq);
1155 +}
1156 +
1157 /*
1158 * This function gets called by the timer code, with HZ frequency.
1159 * We call it with interrupts disabled.
1160 - *
1161 - * It also gets called by the fork code, when changing the parent's
1162 - * timeslices.
1163 */
1164 void scheduler_tick(void)
1165 {
1166 int cpu = smp_processor_id();
1167 runqueue_t *rq = this_rq();
1168 task_t *p = current;
1169 + unsigned long debit, expired_balance = rq->nr_running;
1170 unsigned long long now = sched_clock();
1171
1172 update_cpu_clock(p, rq, now);
1173 @@ -2703,78 +2545,53 @@ void scheduler_tick(void)
1174 }
1175
1176 /* Task might have expired already, but not scheduled off yet */
1177 - if (p->array != rq->active) {
1178 + if (unlikely(!task_queued(p))) {
1179 set_tsk_need_resched(p);
1180 goto out;
1181 }
1182 - spin_lock(&rq->lock);
1183 /*
1184 - * The task was running during this tick - update the
1185 - * time slice counter. Note: we do not update a thread's
1186 - * priority until it either goes to sleep or uses up its
1187 - * timeslice. This makes it possible for interactive tasks
1188 - * to use up their timeslices at their highest priority levels.
1189 + * SCHED_FIFO tasks never run out of timeslice.
1190 */
1191 - if (rt_task(p)) {
1192 - /*
1193 - * RR tasks need a special form of timeslice management.
1194 - * FIFO tasks have no timeslices.
1195 - */
1196 - if ((p->policy == SCHED_RR) && !--p->time_slice) {
1197 - p->time_slice = task_timeslice(p);
1198 - p->first_time_slice = 0;
1199 - set_tsk_need_resched(p);
1200 + if (unlikely(p->policy == SCHED_FIFO)) {
1201 + expired_balance = 0;
1202 + goto out;
1203 + }
1204
1205 - /* put it at the end of the queue: */
1206 - requeue_task(p, rq->active);
1207 - }
1208 + spin_lock(&rq->lock);
1209 + debit = ns_diff(rq->timestamp_last_tick, p->timestamp);
1210 + p->ns_debit += debit;
1211 + if (p->ns_debit < NSJIFFY)
1212 + goto out_unlock;
1213 + p->ns_debit %= NSJIFFY;
1214 + /*
1215 + * Tasks lose bonus each time they use up a full slice().
1216 + */
1217 + if (!--p->slice) {
1218 + dec_bonus(p);
1219 + p->slice = slice(p);
1220 + time_slice_expired(p, rq);
1221 + p->totalrun = 0;
1222 goto out_unlock;
1223 }
1224 + /*
1225 + * Tasks that run out of time_slice but still have slice left get
1226 + * requeued with a lower priority && RR_INTERVAL time_slice.
1227 + */
1228 if (!--p->time_slice) {
1229 - dequeue_task(p, rq->active);
1230 + time_slice_expired(p, rq);
1231 + goto out_unlock;
1232 + }
1233 + rq->cache_ticks++;
1234 + if (rq->preempted && rq->cache_ticks >= CACHE_DELAY) {
1235 set_tsk_need_resched(p);
1236 - p->prio = effective_prio(p);
1237 - p->time_slice = task_timeslice(p);
1238 - p->first_time_slice = 0;
1239 -
1240 - if (!rq->expired_timestamp)
1241 - rq->expired_timestamp = jiffies;
1242 - if (!TASK_INTERACTIVE(p) || EXPIRED_STARVING(rq)) {
1243 - enqueue_task(p, rq->expired);
1244 - if (p->static_prio < rq->best_expired_prio)
1245 - rq->best_expired_prio = p->static_prio;
1246 - } else
1247 - enqueue_task(p, rq->active);
1248 - } else {
1249 - /*
1250 - * Prevent a too long timeslice allowing a task to monopolize
1251 - * the CPU. We do this by splitting up the timeslice into
1252 - * smaller pieces.
1253 - *
1254 - * Note: this does not mean the task's timeslices expire or
1255 - * get lost in any way, they just might be preempted by
1256 - * another task of equal priority. (one with higher
1257 - * priority would have preempted this task already.) We
1258 - * requeue this task to the end of the list on this priority
1259 - * level, which is in essence a round-robin of tasks with
1260 - * equal priority.
1261 - *
1262 - * This only applies to tasks in the interactive
1263 - * delta range with at least TIMESLICE_GRANULARITY to requeue.
1264 - */
1265 - if (TASK_INTERACTIVE(p) && !((task_timeslice(p) -
1266 - p->time_slice) % TIMESLICE_GRANULARITY(p)) &&
1267 - (p->time_slice >= TIMESLICE_GRANULARITY(p)) &&
1268 - (p->array == rq->active)) {
1269 -
1270 - requeue_task(p, rq->active);
1271 - set_tsk_need_resched(p);
1272 - }
1273 + goto out_unlock;
1274 }
1275 + expired_balance = 0;
1276 out_unlock:
1277 spin_unlock(&rq->lock);
1278 out:
1279 - rebalance_tick(cpu, rq, NOT_IDLE);
1280 + if (expired_balance > 1)
1281 + rebalance_tick(cpu, rq, NOT_IDLE);
1282 }
1283
1284 #ifdef CONFIG_SCHED_SMT
1285 @@ -2831,19 +2648,19 @@ static void wake_sleeping_dependent(int
1286
1287 /*
1288 * number of 'lost' timeslices this task wont be able to fully
1289 - * utilize, if another task runs on a sibling. This models the
1290 + * utilise, if another task runs on a sibling. This models the
1291 * slowdown effect of other tasks running on siblings:
1292 */
1293 -static inline unsigned long smt_slice(task_t *p, struct sched_domain *sd)
1294 +static inline unsigned long smt_slice(const task_t *p,
1295 + const struct sched_domain *sd)
1296 {
1297 - return p->time_slice * (100 - sd->per_cpu_gain) / 100;
1298 + return p->slice * (100 - sd->per_cpu_gain) / 100;
1299 }
1300
1301 static int dependent_sleeper(int this_cpu, runqueue_t *this_rq)
1302 {
1303 struct sched_domain *tmp, *sd = NULL;
1304 cpumask_t sibling_map;
1305 - prio_array_t *array;
1306 int ret = 0, i;
1307 task_t *p;
1308
1309 @@ -2870,12 +2687,8 @@ static int dependent_sleeper(int this_cp
1310 */
1311 if (!this_rq->nr_running)
1312 goto out_unlock;
1313 - array = this_rq->active;
1314 - if (!array->nr_active)
1315 - array = this_rq->expired;
1316 - BUG_ON(!array->nr_active);
1317
1318 - p = list_entry(array->queue[sched_find_first_bit(array->bitmap)].next,
1319 + p = list_entry(this_rq->queue[sched_find_first_bit(this_rq->bitmap)].next,
1320 task_t, run_list);
1321
1322 for_each_cpu_mask(i, sibling_map) {
1323 @@ -2905,7 +2718,7 @@ static int dependent_sleeper(int this_cp
1324 } else
1325 if (smt_curr->static_prio < p->static_prio &&
1326 !TASK_PREEMPTS_CURR(p, smt_rq) &&
1327 - smt_slice(smt_curr, sd) > task_timeslice(p))
1328 + smt_slice(smt_curr, sd) > slice(p))
1329 ret = 1;
1330
1331 check_smt_task:
1332 @@ -2928,7 +2741,7 @@ check_smt_task:
1333 resched_task(smt_curr);
1334 } else {
1335 if (TASK_PREEMPTS_CURR(p, smt_rq) &&
1336 - smt_slice(p, sd) > task_timeslice(smt_curr))
1337 + smt_slice(p, sd) > slice(smt_curr))
1338 resched_task(smt_curr);
1339 else
1340 wakeup_busy_runqueue(smt_rq);
1341 @@ -2990,11 +2803,10 @@ asmlinkage void __sched schedule(void)
1342 long *switch_count;
1343 task_t *prev, *next;
1344 runqueue_t *rq;
1345 - prio_array_t *array;
1346 struct list_head *queue;
1347 unsigned long long now;
1348 - unsigned long run_time;
1349 - int cpu, idx, new_prio;
1350 + unsigned long debit;
1351 + int cpu, idx;
1352
1353 /*
1354 * Test if we are atomic. Since do_exit() needs to call into
1355 @@ -3029,20 +2841,11 @@ need_resched_nonpreemptible:
1356
1357 schedstat_inc(rq, sched_cnt);
1358 now = sched_clock();
1359 - if (likely((long long)(now - prev->timestamp) < NS_MAX_SLEEP_AVG)) {
1360 - run_time = now - prev->timestamp;
1361 - if (unlikely((long long)(now - prev->timestamp) < 0))
1362 - run_time = 0;
1363 - } else
1364 - run_time = NS_MAX_SLEEP_AVG;
1365 -
1366 - /*
1367 - * Tasks charged proportionately less run_time at high sleep_avg to
1368 - * delay them losing their interactive status
1369 - */
1370 - run_time /= (CURRENT_BONUS(prev) ? : 1);
1371
1372 spin_lock_irq(&rq->lock);
1373 + prev->runtime = ns_diff(now, prev->timestamp);
1374 + debit = ns_diff(now, rq->timestamp_last_tick) % NSJIFFY;
1375 + prev->ns_debit += debit;
1376
1377 if (unlikely(prev->flags & PF_DEAD))
1378 prev->state = EXIT_DEAD;
1379 @@ -3054,8 +2857,10 @@ need_resched_nonpreemptible:
1380 unlikely(signal_pending(prev))))
1381 prev->state = TASK_RUNNING;
1382 else {
1383 - if (prev->state == TASK_UNINTERRUPTIBLE)
1384 + if (prev->state == TASK_UNINTERRUPTIBLE) {
1385 + prev->flags |= PF_NONSLEEP;
1386 rq->nr_uninterruptible++;
1387 + }
1388 deactivate_task(prev, rq);
1389 }
1390 }
1391 @@ -3066,7 +2871,6 @@ go_idle:
1392 idle_balance(cpu, rq);
1393 if (!rq->nr_running) {
1394 next = rq->idle;
1395 - rq->expired_timestamp = 0;
1396 wake_sleeping_dependent(cpu, rq);
1397 /*
1398 * wake_sleeping_dependent() might have released
1399 @@ -3090,45 +2894,15 @@ go_idle:
1400 goto go_idle;
1401 }
1402
1403 - array = rq->active;
1404 - if (unlikely(!array->nr_active)) {
1405 - /*
1406 - * Switch the active and expired arrays.
1407 - */
1408 - schedstat_inc(rq, sched_switch);
1409 - rq->active = rq->expired;
1410 - rq->expired = array;
1411 - array = rq->active;
1412 - rq->expired_timestamp = 0;
1413 - rq->best_expired_prio = MAX_PRIO;
1414 - }
1415 -
1416 - idx = sched_find_first_bit(array->bitmap);
1417 - queue = array->queue + idx;
1418 + idx = sched_find_first_bit(rq->bitmap);
1419 + queue = rq->queue + idx;
1420 next = list_entry(queue->next, task_t, run_list);
1421
1422 - if (!rt_task(next) && next->activated > 0) {
1423 - unsigned long long delta = now - next->timestamp;
1424 - if (unlikely((long long)(now - next->timestamp) < 0))
1425 - delta = 0;
1426 -
1427 - if (next->activated == 1)
1428 - delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128;
1429 -
1430 - array = next->array;
1431 - new_prio = recalc_task_prio(next, next->timestamp + delta);
1432 -
1433 - if (unlikely(next->prio != new_prio)) {
1434 - dequeue_task(next, array);
1435 - next->prio = new_prio;
1436 - enqueue_task(next, array);
1437 - } else
1438 - requeue_task(next, array);
1439 - }
1440 - next->activated = 0;
1441 switch_tasks:
1442 if (next == rq->idle)
1443 schedstat_inc(rq, sched_goidle);
1444 + prev->timestamp = now;
1445 +
1446 prefetch(next);
1447 prefetch_stack(next);
1448 clear_tsk_need_resched(prev);
1449 @@ -3136,13 +2910,10 @@ switch_tasks:
1450
1451 update_cpu_clock(prev, rq, now);
1452
1453 - prev->sleep_avg -= run_time;
1454 - if ((long)prev->sleep_avg <= 0)
1455 - prev->sleep_avg = 0;
1456 - prev->timestamp = prev->last_ran = now;
1457 -
1458 sched_info_switch(prev, next);
1459 if (likely(prev != next)) {
1460 + rq->preempted = 0;
1461 + rq->cache_ticks = 0;
1462 next->timestamp = now;
1463 rq->nr_switches++;
1464 rq->curr = next;
1465 @@ -3572,9 +3343,8 @@ EXPORT_SYMBOL(sleep_on_timeout);
1466 void set_user_nice(task_t *p, long nice)
1467 {
1468 unsigned long flags;
1469 - prio_array_t *array;
1470 runqueue_t *rq;
1471 - int old_prio, new_prio, delta;
1472 + int queued, old_prio, new_prio, delta;
1473
1474 if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
1475 return;
1476 @@ -3593,9 +3363,8 @@ void set_user_nice(task_t *p, long nice)
1477 p->static_prio = NICE_TO_PRIO(nice);
1478 goto out_unlock;
1479 }
1480 - array = p->array;
1481 - if (array) {
1482 - dequeue_task(p, array);
1483 + if ((queued = task_queued(p))) {
1484 + dequeue_task(p, rq);
1485 dec_raw_weighted_load(rq, p);
1486 }
1487
1488 @@ -3605,9 +3374,11 @@ void set_user_nice(task_t *p, long nice)
1489 p->static_prio = NICE_TO_PRIO(nice);
1490 set_load_weight(p);
1491 p->prio += delta;
1492 + if (p->bonus > bonus(p))
1493 + p->bonus= bonus(p);
1494
1495 - if (array) {
1496 - enqueue_task(p, array);
1497 + if (queued) {
1498 + enqueue_task(p, rq);
1499 inc_raw_weighted_load(rq, p);
1500 /*
1501 * If the task increased its priority or is running and
1502 @@ -3731,19 +3502,13 @@ static inline task_t *find_process_by_pi
1503 /* Actually do priority change: must hold rq lock. */
1504 static void __setscheduler(struct task_struct *p, int policy, int prio)
1505 {
1506 - BUG_ON(p->array);
1507 + BUG_ON(task_queued(p));
1508 p->policy = policy;
1509 p->rt_priority = prio;
1510 if (policy != SCHED_NORMAL && policy != SCHED_BATCH) {
1511 p->prio = MAX_RT_PRIO-1 - p->rt_priority;
1512 - } else {
1513 + } else
1514 p->prio = p->static_prio;
1515 - /*
1516 - * SCHED_BATCH tasks are treated as perpetual CPU hogs:
1517 - */
1518 - if (policy == SCHED_BATCH)
1519 - p->sleep_avg = 0;
1520 - }
1521 set_load_weight(p);
1522 }
1523
1524 @@ -3758,8 +3523,7 @@ int sched_setscheduler(struct task_struc
1525 struct sched_param *param)
1526 {
1527 int retval;
1528 - int oldprio, oldpolicy = -1;
1529 - prio_array_t *array;
1530 + int queued, oldprio, oldpolicy = -1;
1531 unsigned long flags;
1532 runqueue_t *rq;
1533
1534 @@ -3821,12 +3585,11 @@ recheck:
1535 task_rq_unlock(rq, &flags);
1536 goto recheck;
1537 }
1538 - array = p->array;
1539 - if (array)
1540 + if ((queued = task_queued(p)))
1541 deactivate_task(p, rq);
1542 oldprio = p->prio;
1543 __setscheduler(p, policy, param->sched_priority);
1544 - if (array) {
1545 + if (queued) {
1546 __activate_task(p, rq);
1547 /*
1548 * Reschedule if we are currently running on this runqueue and
1549 @@ -3836,8 +3599,8 @@ recheck:
1550 if (task_running(rq, p)) {
1551 if (p->prio > oldprio)
1552 resched_task(rq->curr);
1553 - } else if (TASK_PREEMPTS_CURR(p, rq))
1554 - resched_task(rq->curr);
1555 + } else
1556 + preempt(p, rq);
1557 }
1558 task_rq_unlock(rq, &flags);
1559 return 0;
1560 @@ -4094,43 +3857,27 @@ asmlinkage long sys_sched_getaffinity(pi
1561
1562 /**
1563 * sys_sched_yield - yield the current processor to other threads.
1564 - *
1565 - * this function yields the current CPU by moving the calling thread
1566 - * to the expired array. If there are no other threads running on this
1567 - * CPU then this function will return.
1568 + * This function yields the current CPU by dropping the priority of current
1569 + * to the lowest priority.
1570 */
1571 asmlinkage long sys_sched_yield(void)
1572 {
1573 + int newprio;
1574 runqueue_t *rq = this_rq_lock();
1575 - prio_array_t *array = current->array;
1576 - prio_array_t *target = rq->expired;
1577
1578 + newprio = current->prio;
1579 schedstat_inc(rq, yld_cnt);
1580 - /*
1581 - * We implement yielding by moving the task into the expired
1582 - * queue.
1583 - *
1584 - * (special rule: RT tasks will just roundrobin in the active
1585 - * array.)
1586 - */
1587 - if (rt_task(current))
1588 - target = rq->active;
1589 -
1590 - if (array->nr_active == 1) {
1591 - schedstat_inc(rq, yld_act_empty);
1592 - if (!rq->expired->nr_active)
1593 - schedstat_inc(rq, yld_both_empty);
1594 - } else if (!rq->expired->nr_active)
1595 - schedstat_inc(rq, yld_exp_empty);
1596 -
1597 - if (array != target) {
1598 - dequeue_task(current, array);
1599 - enqueue_task(current, target);
1600 + current->slice = slice(current);
1601 + current->time_slice = rr_interval(current);
1602 + if (likely(!rt_task(current)))
1603 + newprio = MAX_PRIO - 1;
1604 +
1605 + if (newprio != current->prio) {
1606 + dequeue_task(current, rq);
1607 + current->prio = newprio;
1608 + enqueue_task(current, rq);
1609 } else
1610 - /*
1611 - * requeue_task is cheaper so perform that if possible.
1612 - */
1613 - requeue_task(current, array);
1614 + requeue_task(current, rq);
1615
1616 /*
1617 * Since we are going to call schedule() anyway, there's
1618 @@ -4339,7 +4086,7 @@ long sys_sched_rr_get_interval(pid_t pid
1619 goto out_unlock;
1620
1621 jiffies_to_timespec(p->policy & SCHED_FIFO ?
1622 - 0 : task_timeslice(p), &t);
1623 + 0 : slice(p), &t);
1624 read_unlock(&tasklist_lock);
1625 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
1626 out_nounlock:
1627 @@ -4462,8 +4209,6 @@ void __devinit init_idle(task_t *idle, i
1628 unsigned long flags;
1629
1630 idle->timestamp = sched_clock();
1631 - idle->sleep_avg = 0;
1632 - idle->array = NULL;
1633 idle->prio = MAX_PRIO;
1634 idle->state = TASK_RUNNING;
1635 idle->cpus_allowed = cpumask_of_cpu(cpu);
1636 @@ -4580,7 +4325,7 @@ static void __migrate_task(struct task_s
1637 goto out;
1638
1639 set_task_cpu(p, dest_cpu);
1640 - if (p->array) {
1641 + if (task_queued(p)) {
1642 /*
1643 * Sync timestamp with rq_dest's before activating.
1644 * The same thing could be achieved by doing this step
1645 @@ -4591,8 +4336,7 @@ static void __migrate_task(struct task_s
1646 + rq_dest->timestamp_last_tick;
1647 deactivate_task(p, rq_src);
1648 activate_task(p, rq_dest, 0);
1649 - if (TASK_PREEMPTS_CURR(p, rq_dest))
1650 - resched_task(rq_dest->curr);
1651 + preempt(p, rq_dest);
1652 }
1653
1654 out:
1655 @@ -4806,7 +4550,7 @@ static void migrate_dead_tasks(unsigned
1656
1657 for (arr = 0; arr < 2; arr++) {
1658 for (i = 0; i < MAX_PRIO; i++) {
1659 - struct list_head *list = &rq->arrays[arr].queue[i];
1660 + struct list_head *list = &rq->queue[i];
1661 while (!list_empty(list))
1662 migrate_dead(dead_cpu,
1663 list_entry(list->next, task_t,
1664 @@ -6148,17 +5892,15 @@ int in_sched_functions(unsigned long add
1665 void __init sched_init(void)
1666 {
1667 runqueue_t *rq;
1668 - int i, j, k;
1669 + int i, j;
1670
1671 for_each_cpu(i) {
1672 - prio_array_t *array;
1673
1674 rq = cpu_rq(i);
1675 spin_lock_init(&rq->lock);
1676 rq->nr_running = 0;
1677 - rq->active = rq->arrays;
1678 - rq->expired = rq->arrays + 1;
1679 - rq->best_expired_prio = MAX_PRIO;
1680 + rq->cache_ticks = 0;
1681 + rq->preempted = 0;
1682
1683 #ifdef CONFIG_SMP
1684 rq->sd = NULL;
1685 @@ -6170,16 +5912,13 @@ void __init sched_init(void)
1686 INIT_LIST_HEAD(&rq->migration_queue);
1687 #endif
1688 atomic_set(&rq->nr_iowait, 0);
1689 -
1690 - for (j = 0; j < 2; j++) {
1691 - array = rq->arrays + j;
1692 - for (k = 0; k < MAX_PRIO; k++) {
1693 - INIT_LIST_HEAD(array->queue + k);
1694 - __clear_bit(k, array->bitmap);
1695 - }
1696 - // delimiter for bitsearch
1697 - __set_bit(MAX_PRIO, array->bitmap);
1698 - }
1699 + for (j = 0; j < MAX_PRIO; j++)
1700 + INIT_LIST_HEAD(&rq->queue[j]);
1701 + memset(rq->bitmap, 0, BITS_TO_LONGS(MAX_PRIO)*sizeof(long));
1702 + /*
1703 + * delimiter for bitsearch
1704 + */
1705 + __set_bit(MAX_PRIO, rq->bitmap);
1706 }
1707
1708 set_load_weight(&init_task);
1709 @@ -6224,9 +5963,9 @@ EXPORT_SYMBOL(__might_sleep);
1710 void normalize_rt_tasks(void)
1711 {
1712 struct task_struct *p;
1713 - prio_array_t *array;
1714 unsigned long flags;
1715 runqueue_t *rq;
1716 + int queued;
1717
1718 read_lock_irq(&tasklist_lock);
1719 for_each_process (p) {
1720 @@ -6235,11 +5974,10 @@ void normalize_rt_tasks(void)
1721
1722 rq = task_rq_lock(p, &flags);
1723
1724 - array = p->array;
1725 - if (array)
1726 + if ((queued = task_queued(p)))
1727 deactivate_task(p, task_rq(p));
1728 __setscheduler(p, SCHED_NORMAL, 0);
1729 - if (array) {
1730 + if (queued) {
1731 __activate_task(p, task_rq(p));
1732 resched_task(rq->curr);
1733 }
1734 Index: linux-2.6.16-ck1/kernel/sysctl.c
1735 ===================================================================
1736 --- linux-2.6.16-ck1.orig/kernel/sysctl.c 2006-03-20 20:46:26.000000000 +1100
1737 +++ linux-2.6.16-ck1/kernel/sysctl.c 2006-03-20 20:46:48.000000000 +1100
1738 @@ -623,6 +623,22 @@ static ctl_table kern_table[] = {
1739 .mode = 0444,
1740 .proc_handler = &proc_dointvec,
1741 },
1742 + {
1743 + .ctl_name = KERN_INTERACTIVE,
1744 + .procname = "interactive",
1745 + .data = &sched_interactive,
1746 + .maxlen = sizeof (int),
1747 + .mode = 0644,
1748 + .proc_handler = &proc_dointvec,
1749 + },
1750 + {
1751 + .ctl_name = KERN_COMPUTE,
1752 + .procname = "compute",
1753 + .data = &sched_compute,
1754 + .maxlen = sizeof (int),
1755 + .mode = 0644,
1756 + .proc_handler = &proc_dointvec,
1757 + },
1758 #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86)
1759 {
1760 .ctl_name = KERN_UNKNOWN_NMI_PANIC,
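
For reference, the arithmetic the kernel/sched.c hunks above introduce (RR_INTERVAL, rr_interval(), slice(), bonus() and effective_prio()) can be exercised outside the kernel. The following is a minimal, self-contained C sketch, not part of the patch, written under assumptions: the 2.6.16 defaults HZ = 1000, MAX_RT_PRIO = 100 and MAX_PRIO = 140 are hard-coded, the task is reduced to a three-field struct, and the rt-task and SCHED_BATCH branches are omitted.

/*
 * Standalone sketch of the staircase v14.2 priority arithmetic.
 * Illustrative only; constants mirror the 2.6.16 defaults and the
 * task is reduced to the fields the calculation needs.
 */
#include <stdio.h>

#define HZ            1000
#define MAX_RT_PRIO   100
#define MAX_PRIO      140

static int sched_compute = 0;     /* sysctl kernel.compute     */
static int sched_interactive = 1; /* sysctl kernel.interactive */

#define _RR_INTERVAL  ((6 * HZ / 1001) + 1)
#define RR_INTERVAL() (_RR_INTERVAL * (1 + 16 * sched_compute))

struct task {
        int nice;           /* -20 .. 19                            */
        unsigned int bonus; /* earned by sleeping, 0 .. user_prio   */
        unsigned int slice; /* jiffies of slice() not yet used      */
};

static unsigned int user_prio(const struct task *p)
{
        return p->nice + 20;                    /* TASK_USER_PRIO() */
}

static unsigned int rr_interval(const struct task *p)
{
        if (p->nice < 0)
                return RR_INTERVAL() * (20 - p->nice) / 20;
        return RR_INTERVAL();
}

static unsigned int slice(const struct task *p)
{
        unsigned int rr = rr_interval(p);

        return rr + (39 - user_prio(p)) * rr;   /* one rr per stair */
}

static int effective_prio(const struct task *p)
{
        unsigned int full = slice(p), used = 0;
        int prio;

        if (full > p->slice)
                used = full - p->slice;
        prio = MAX_RT_PRIO + user_prio(p);
        if (sched_interactive && !sched_compute)
                prio -= p->bonus;               /* start on a higher stair */
        prio += used / rr_interval(p);          /* descend one step per rr */
        return prio > MAX_PRIO - 1 ? MAX_PRIO - 1 : prio;
}

int main(void)
{
        struct task p = { .nice = 0, .bonus = 5, .slice = 0 };
        unsigned int used;

        p.slice = slice(&p);
        printf("nice 0, bonus 5: slice=%u jiffies, rr=%u\n",
               p.slice, rr_interval(&p));
        for (used = 0; used <= slice(&p); used += rr_interval(&p)) {
                p.slice = slice(&p) - used;
                printf("  used %3u jiffies -> prio %d\n",
                       used, effective_prio(&p));
        }
        return 0;
}

For a nice-0 task the sketch prints a 120-jiffy slice that drops one priority level every 6 jiffies of CPU used, which is the "staircase" shape the patch comments describe. With the patch applied, the two tunables it declares are exposed via sysctl as kernel.interactive (default 1) and kernel.compute (default 0), per the kernel/sysctl.c hunk above.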