Magellan Linux

Contents of /trunk/kernel26-alx/patches-2.6.17-r7/0003-2.6.17-smpnice-staircase-16.patch



Revision 199
Fri May 18 11:04:36 2007 UTC by niro
File size: 52028 byte(s)
-import

1 Implement the "staircase" hybrid foreground-background single priority
2 array cpu scheduler policy.
3
4 Signed-off-by: Con Kolivas <kernel@kolivas.org>
5
6 fs/proc/array.c | 4
7 include/linux/sched.h | 21 -
8 kernel/exit.c | 1
9 kernel/sched.c | 1015 ++++++++++++++++++--------------------------------
10 4 files changed, 378 insertions(+), 663 deletions(-)
11
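Before the per-file hunks, a minimal userspace sketch of the staircase priority arithmetic this patch introduces; it mirrors rr_interval(), slice() and effective_prio() from kernel/sched.c under the assumption of HZ=1000 and the MAX_RT_PRIO/MAX_PRIO values from include/linux/sched.h, and is an illustrative model rather than part of the patch itself:

#include <stdio.h>

#define HZ              1000
#define MAX_RT_PRIO     100
#define MAX_PRIO        (MAX_RT_PRIO + 40)
#define MIN_USER_PRIO   (MAX_PRIO - 1)
#define RR_INTERVAL     ((6 * HZ / 1001) + 1)

/* Round robin interval in jiffies; negative nice gets a longer interval. */
static unsigned int rr_interval(int nice)
{
        if (nice < 0)
                return RR_INTERVAL * (20 - nice) / 20;
        return RR_INTERVAL;
}

/* Full slice before the task is requeued at best priority (non-RT tasks). */
static unsigned int slice(int nice)
{
        unsigned int rr = rr_interval(nice);

        return rr + (39 - (nice + 20)) * rr;
}

/*
 * Dynamic priority after 'used' jiffies of the slice have been consumed and
 * 'bonus' steps have been earned; the priority drops one step ("stair") per
 * rr_interval() and is clamped at MIN_USER_PRIO.
 */
static int effective_prio(int nice, unsigned int used, unsigned int bonus)
{
        int prio = MAX_RT_PRIO + (nice + 20) - bonus + used / rr_interval(nice);

        return prio > MIN_USER_PRIO ? MIN_USER_PRIO : prio;
}

int main(void)
{
        unsigned int used;

        /* A nice 0 task with no bonus walks down the staircase one step
         * per RR_INTERVAL until it reaches the lowest user priority. */
        for (used = 0; used <= slice(0); used += rr_interval(0))
                printf("used %3u jiffies -> prio %d\n",
                       used, effective_prio(0, used, 0));
        return 0;
}
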
12 Index: linux-ck-dev/fs/proc/array.c
13 ===================================================================
14 --- linux-ck-dev.orig/fs/proc/array.c 2006-06-18 15:20:15.000000000 +1000
15 +++ linux-ck-dev/fs/proc/array.c 2006-06-18 15:21:50.000000000 +1000
16 @@ -165,7 +165,7 @@ static inline char * task_state(struct t
17 read_lock(&tasklist_lock);
18 buffer += sprintf(buffer,
19 "State:\t%s\n"
20 - "SleepAVG:\t%lu%%\n"
21 + "Bonus:\t%d\n"
22 "Tgid:\t%d\n"
23 "Pid:\t%d\n"
24 "PPid:\t%d\n"
25 @@ -173,7 +173,7 @@ static inline char * task_state(struct t
26 "Uid:\t%d\t%d\t%d\t%d\n"
27 "Gid:\t%d\t%d\t%d\t%d\n",
28 get_task_state(p),
29 - (p->sleep_avg/1024)*100/(1020000000/1024),
30 + p->bonus,
31 p->tgid,
32 p->pid, pid_alive(p) ? p->group_leader->real_parent->tgid : 0,
33 pid_alive(p) && p->ptrace ? p->parent->pid : 0,
34 Index: linux-ck-dev/include/linux/sched.h
35 ===================================================================
36 --- linux-ck-dev.orig/include/linux/sched.h 2006-06-18 15:21:31.000000000 +1000
37 +++ linux-ck-dev/include/linux/sched.h 2006-06-18 15:21:50.000000000 +1000
38 @@ -483,6 +483,7 @@ struct signal_struct {
39 #define MAX_RT_PRIO MAX_USER_RT_PRIO
40
41 #define MAX_PRIO (MAX_RT_PRIO + 40)
42 +#define MIN_USER_PRIO (MAX_PRIO - 1)
43
44 #define rt_task(p) (unlikely((p)->prio < MAX_RT_PRIO))
45 #define batch_task(p) (unlikely((p)->policy == SCHED_BATCH))
46 @@ -518,7 +519,6 @@ extern struct user_struct *find_user(uid
47 extern struct user_struct root_user;
48 #define INIT_USER (&root_user)
49
50 -typedef struct prio_array prio_array_t;
51 struct backing_dev_info;
52 struct reclaim_state;
53
54 @@ -687,13 +687,6 @@ struct audit_context; /* See audit.c */
55 struct mempolicy;
56 struct pipe_inode_info;
57
58 -enum sleep_type {
59 - SLEEP_NORMAL,
60 - SLEEP_NONINTERACTIVE,
61 - SLEEP_INTERACTIVE,
62 - SLEEP_INTERRUPTED,
63 -};
64 -
65 struct task_struct {
66 volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */
67 struct thread_info *thread_info;
68 @@ -711,19 +704,18 @@ struct task_struct {
69 int load_weight; /* for niceness load balancing purposes */
70 int prio, static_prio;
71 struct list_head run_list;
72 - prio_array_t *array;
73
74 unsigned short ioprio;
75 unsigned int btrace_seq;
76
77 - unsigned long sleep_avg;
78 - unsigned long long timestamp, last_ran;
79 + unsigned long long timestamp;
80 + unsigned long runtime, totalrun, ns_debit, systime;
81 + unsigned int bonus;
82 + unsigned int slice, time_slice;
83 unsigned long long sched_time; /* sched_clock time spent running */
84 - enum sleep_type sleep_type;
85
86 unsigned long policy;
87 cpumask_t cpus_allowed;
88 - unsigned int time_slice, first_time_slice;
89
90 #ifdef CONFIG_SCHEDSTATS
91 struct sched_info sched_info;
92 @@ -952,6 +944,8 @@ static inline void put_task_struct(struc
93 #define PF_SPREAD_PAGE 0x04000000 /* Spread page cache over cpuset */
94 #define PF_SPREAD_SLAB 0x08000000 /* Spread some slab caches over cpuset */
95 #define PF_MEMPOLICY 0x10000000 /* Non-default NUMA mempolicy */
96 +#define PF_NONSLEEP 0x20000000 /* Waiting on in kernel activity */
97 +#define PF_FORKED 0x40000000 /* Task just forked another process */
98
99 /*
100 * Only the _current_ task can read/write to tsk->flags, but other
101 @@ -1073,7 +1067,6 @@ extern void FASTCALL(wake_up_new_task(st
102 static inline void kick_process(struct task_struct *tsk) { }
103 #endif
104 extern void FASTCALL(sched_fork(task_t * p, int clone_flags));
105 -extern void FASTCALL(sched_exit(task_t * p));
106
107 extern int in_group_p(gid_t);
108 extern int in_egroup_p(gid_t);
109 Index: linux-ck-dev/kernel/exit.c
110 ===================================================================
111 --- linux-ck-dev.orig/kernel/exit.c 2006-06-18 15:21:00.000000000 +1000
112 +++ linux-ck-dev/kernel/exit.c 2006-06-18 15:21:50.000000000 +1000
113 @@ -170,7 +170,6 @@ repeat:
114 zap_leader = (leader->exit_signal == -1);
115 }
116
117 - sched_exit(p);
118 write_unlock_irq(&tasklist_lock);
119 spin_unlock(&p->proc_lock);
120 proc_pid_flush(proc_dentry);
121 Index: linux-ck-dev/kernel/sched.c
122 ===================================================================
123 --- linux-ck-dev.orig/kernel/sched.c 2006-06-18 15:21:45.000000000 +1000
124 +++ linux-ck-dev/kernel/sched.c 2006-06-18 15:22:27.000000000 +1000
125 @@ -16,6 +16,9 @@
126 * by Davide Libenzi, preemptible kernel bits by Robert Love.
127 * 2003-09-03 Interactivity tuning by Con Kolivas.
128 * 2004-04-02 Scheduler domains code by Nick Piggin
129 + * 2006-06-18 Staircase scheduling policy by Con Kolivas with help
130 + * from William Lee Irwin III, Zwane Mwaikambo & Peter Williams.
131 + * Staircase v16
132 */
133
134 #include <linux/mm.h>
135 @@ -75,131 +78,27 @@
136 /*
137 * Some helpers for converting nanosecond timing to jiffy resolution
138 */
139 -#define NS_TO_JIFFIES(TIME) ((TIME) / (1000000000 / HZ))
140 -#define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ))
141 -
142 -/*
143 - * These are the 'tuning knobs' of the scheduler:
144 - *
145 - * Minimum timeslice is 5 msecs (or 1 jiffy, whichever is larger),
146 - * default timeslice is 100 msecs, maximum timeslice is 800 msecs.
147 - * Timeslices get refilled after they expire.
148 - */
149 -#define MIN_TIMESLICE max(5 * HZ / 1000, 1)
150 -#define DEF_TIMESLICE (100 * HZ / 1000)
151 -#define ON_RUNQUEUE_WEIGHT 30
152 -#define CHILD_PENALTY 95
153 -#define PARENT_PENALTY 100
154 -#define EXIT_WEIGHT 3
155 -#define PRIO_BONUS_RATIO 25
156 -#define MAX_BONUS (MAX_USER_PRIO * PRIO_BONUS_RATIO / 100)
157 -#define INTERACTIVE_DELTA 2
158 -#define MAX_SLEEP_AVG (DEF_TIMESLICE * MAX_BONUS)
159 -#define STARVATION_LIMIT (MAX_SLEEP_AVG)
160 -#define NS_MAX_SLEEP_AVG (JIFFIES_TO_NS(MAX_SLEEP_AVG))
161 -
162 -/*
163 - * If a task is 'interactive' then we reinsert it in the active
164 - * array after it has expired its current timeslice. (it will not
165 - * continue to run immediately, it will still roundrobin with
166 - * other interactive tasks.)
167 - *
168 - * This part scales the interactivity limit depending on niceness.
169 - *
170 - * We scale it linearly, offset by the INTERACTIVE_DELTA delta.
171 - * Here are a few examples of different nice levels:
172 - *
173 - * TASK_INTERACTIVE(-20): [1,1,1,1,1,1,1,1,1,0,0]
174 - * TASK_INTERACTIVE(-10): [1,1,1,1,1,1,1,0,0,0,0]
175 - * TASK_INTERACTIVE( 0): [1,1,1,1,0,0,0,0,0,0,0]
176 - * TASK_INTERACTIVE( 10): [1,1,0,0,0,0,0,0,0,0,0]
177 - * TASK_INTERACTIVE( 19): [0,0,0,0,0,0,0,0,0,0,0]
178 - *
179 - * (the X axis represents the possible -5 ... 0 ... +5 dynamic
180 - * priority range a task can explore, a value of '1' means the
181 - * task is rated interactive.)
182 - *
183 - * Ie. nice +19 tasks can never get 'interactive' enough to be
184 - * reinserted into the active array. And only heavily CPU-hog nice -20
185 - * tasks will be expired. Default nice 0 tasks are somewhere between,
186 - * it takes some effort for them to get interactive, but it's not
187 - * too hard.
188 - */
189 -
190 -#define CURRENT_BONUS(p) \
191 - (NS_TO_JIFFIES((p)->sleep_avg) * MAX_BONUS / \
192 - MAX_SLEEP_AVG)
193 -
194 -#define GRANULARITY (10 * HZ / 1000 ? : 1)
195 -
196 -#ifdef CONFIG_SMP
197 -#define TIMESLICE_GRANULARITY(p) (GRANULARITY * \
198 - (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)) * \
199 - num_online_cpus())
200 -#else
201 -#define TIMESLICE_GRANULARITY(p) (GRANULARITY * \
202 - (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)))
203 -#endif
204 -
205 -#define SCALE(v1,v1_max,v2_max) \
206 - (v1) * (v2_max) / (v1_max)
207 -
208 -#define DELTA(p) \
209 - (SCALE(TASK_NICE(p) + 20, 40, MAX_BONUS) - 20 * MAX_BONUS / 40 + \
210 - INTERACTIVE_DELTA)
211 -
212 -#define TASK_INTERACTIVE(p) \
213 - ((p)->prio <= (p)->static_prio - DELTA(p))
214 -
215 -#define INTERACTIVE_SLEEP(p) \
216 - (JIFFIES_TO_NS(MAX_SLEEP_AVG * \
217 - (MAX_BONUS / 2 + DELTA((p)) + 1) / MAX_BONUS - 1))
218 -
219 +#define NSJIFFY (1000000000 / HZ) /* One jiffy in ns */
220 +#define NS_TO_JIFFIES(TIME) ((TIME) / NSJIFFY)
221 +#define JIFFIES_TO_NS(TIME) ((TIME) * NSJIFFY)
222 #define TASK_PREEMPTS_CURR(p, rq) \
223 ((p)->prio < (rq)->curr->prio)
224
225 /*
226 - * task_timeslice() scales user-nice values [ -20 ... 0 ... 19 ]
227 - * to time slice values: [800ms ... 100ms ... 5ms]
228 - *
229 - * The higher a thread's priority, the bigger timeslices
230 - * it gets during one round of execution. But even the lowest
231 - * priority thread gets MIN_TIMESLICE worth of execution time.
232 + * This is the time all tasks within the same priority round robin.
233 + * Set to a minimum of 6ms.
234 */
235 +#define RR_INTERVAL ((6 * HZ / 1001) + 1)
236 +#define DEF_TIMESLICE (RR_INTERVAL * 19)
237
238 -#define SCALE_PRIO(x, prio) \
239 - max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_TIMESLICE)
240 -
241 -static unsigned int static_prio_timeslice(int static_prio)
242 -{
243 - if (static_prio < NICE_TO_PRIO(0))
244 - return SCALE_PRIO(DEF_TIMESLICE * 4, static_prio);
245 - else
246 - return SCALE_PRIO(DEF_TIMESLICE, static_prio);
247 -}
248 -
249 -static inline unsigned int task_timeslice(task_t *p)
250 -{
251 - return static_prio_timeslice(p->static_prio);
252 -}
253 -
254 -#define task_hot(p, now, sd) ((long long) ((now) - (p)->last_ran) \
255 +#define task_hot(p, now, sd) ((long long) ((now) - (p)->timestamp) \
256 < (long long) (sd)->cache_hot_time)
257
258 /*
259 * These are the runqueue data structures:
260 */
261 -
262 -#define BITMAP_SIZE ((((MAX_PRIO+1+7)/8)+sizeof(long)-1)/sizeof(long))
263 -
264 typedef struct runqueue runqueue_t;
265
266 -struct prio_array {
267 - unsigned int nr_active;
268 - unsigned long bitmap[BITMAP_SIZE];
269 - struct list_head queue[MAX_PRIO];
270 -};
271 -
272 /*
273 * This is the main, per-CPU runqueue data structure.
274 *
275 @@ -229,12 +128,11 @@ struct runqueue {
276 */
277 unsigned long nr_uninterruptible;
278
279 - unsigned long expired_timestamp;
280 unsigned long long timestamp_last_tick;
281 task_t *curr, *idle;
282 struct mm_struct *prev_mm;
283 - prio_array_t *active, *expired, arrays[2];
284 - int best_expired_prio;
285 + unsigned long bitmap[BITS_TO_LONGS(MAX_PRIO + 1)];
286 + struct list_head queue[MAX_PRIO];
287 atomic_t nr_iowait;
288
289 #ifdef CONFIG_SMP
290 @@ -499,13 +397,7 @@ static inline runqueue_t *this_rq_lock(v
291
292 #ifdef CONFIG_SCHEDSTATS
293 /*
294 - * Called when a process is dequeued from the active array and given
295 - * the cpu. We should note that with the exception of interactive
296 - * tasks, the expired queue will become the active queue after the active
297 - * queue is empty, without explicitly dequeuing and requeuing tasks in the
298 - * expired queue. (Interactive tasks may be requeued directly to the
299 - * active queue, thus delaying tasks in the expired queue from running;
300 - * see scheduler_tick()).
301 + * Called when a process is dequeued and given the cpu.
302 *
303 * This function is only called from sched_info_arrive(), rather than
304 * dequeue_task(). Even though a task may be queued and dequeued multiple
305 @@ -543,13 +435,11 @@ static void sched_info_arrive(task_t *t)
306 }
307
308 /*
309 - * Called when a process is queued into either the active or expired
310 - * array. The time is noted and later used to determine how long we
311 - * had to wait for us to reach the cpu. Since the expired queue will
312 - * become the active queue after active queue is empty, without dequeuing
313 - * and requeuing any tasks, we are interested in queuing to either. It
314 - * is unusual but not impossible for tasks to be dequeued and immediately
315 - * requeued in the same or another array: this can happen in sched_yield(),
316 + * Called when a process is queued
317 + * The time is noted and later used to determine how long we had to wait for
318 + * us to reach the cpu.
319 + * It is unusual but not impossible for tasks to be dequeued and immediately
320 + * requeued: this can happen in sched_yield(),
321 * set_user_nice(), and even load_balance() as it moves tasks from runqueue
322 * to runqueue.
323 *
324 @@ -603,74 +493,81 @@ static inline void sched_info_switch(tas
325 #define sched_info_switch(t, next) do { } while (0)
326 #endif /* CONFIG_SCHEDSTATS */
327
328 -/*
329 - * Adding/removing a task to/from a priority array:
330 - */
331 -static void dequeue_task(struct task_struct *p, prio_array_t *array)
332 +#if BITS_PER_LONG < 64
333 +static inline void longlimit(unsigned long long *longlong)
334 +{
335 + if (*longlong > (1 << 31))
336 + *longlong = 1 << 31;
337 +}
338 +#else
339 +static inline void longlimit(unsigned long long *__unused)
340 {
341 - array->nr_active--;
342 - list_del(&p->run_list);
343 - if (list_empty(array->queue + p->prio))
344 - __clear_bit(p->prio, array->bitmap);
345 +}
346 +#endif
347 +
348 +/* Get nanosecond clock difference without overflowing unsigned long. */
349 +static unsigned long ns_diff(unsigned long long v1, unsigned long long v2)
350 +{
351 + unsigned long long vdiff;
352 + if (likely(v1 >= v2)) {
353 + vdiff = v1 - v2;
354 + longlimit(&vdiff);
355 + } else {
356 + /*
357 + * Rarely the clock appears to go backwards. There should
358 + * always be a positive difference so return 1.
359 + */
360 + vdiff = 1;
361 + }
362 + return (unsigned long)vdiff;
363 }
364
365 -static void enqueue_task(struct task_struct *p, prio_array_t *array)
366 +static inline int task_queued(const task_t *task)
367 {
368 - sched_info_queued(p);
369 - list_add_tail(&p->run_list, array->queue + p->prio);
370 - __set_bit(p->prio, array->bitmap);
371 - array->nr_active++;
372 - p->array = array;
373 + return !list_empty(&task->run_list);
374 }
375
376 /*
377 - * Put task to the end of the run list without the overhead of dequeue
378 - * followed by enqueue.
379 + * Adding/removing a task to/from a runqueue:
380 */
381 -static void requeue_task(struct task_struct *p, prio_array_t *array)
382 +static void dequeue_task(task_t *p, runqueue_t *rq)
383 {
384 - list_move_tail(&p->run_list, array->queue + p->prio);
385 + list_del_init(&p->run_list);
386 + if (list_empty(rq->queue + p->prio))
387 + __clear_bit(p->prio, rq->bitmap);
388 + p->ns_debit = 0;
389 }
390
391 -static inline void enqueue_task_head(struct task_struct *p, prio_array_t *array)
392 +static void enqueue_task(task_t *p, runqueue_t *rq)
393 {
394 - list_add(&p->run_list, array->queue + p->prio);
395 - __set_bit(p->prio, array->bitmap);
396 - array->nr_active++;
397 - p->array = array;
398 + list_add_tail(&p->run_list, rq->queue + p->prio);
399 + __set_bit(p->prio, rq->bitmap);
400 }
401
402 /*
403 - * effective_prio - return the priority that is based on the static
404 - * priority but is modified by bonuses/penalties.
405 - *
406 - * We scale the actual sleep average [0 .... MAX_SLEEP_AVG]
407 - * into the -5 ... 0 ... +5 bonus/penalty range.
408 - *
409 - * We use 25% of the full 0...39 priority range so that:
410 - *
411 - * 1) nice +19 interactive tasks do not preempt nice 0 CPU hogs.
412 - * 2) nice -20 CPU hogs do not get preempted by nice 0 tasks.
413 - *
414 - * Both properties are important to certain workloads.
415 + * Put task to the end of the run list without the overhead of dequeue
416 + * followed by enqueue.
417 */
418 -static int effective_prio(task_t *p)
419 +static void requeue_task(task_t *p, runqueue_t *rq, const int prio)
420 {
421 - int bonus, prio;
422 -
423 - if (rt_task(p))
424 - return p->prio;
425 -
426 - bonus = CURRENT_BONUS(p) - MAX_BONUS / 2;
427 + list_move_tail(&p->run_list, rq->queue + prio);
428 + if (p->prio != prio) {
429 + if (list_empty(rq->queue + p->prio))
430 + __clear_bit(p->prio, rq->bitmap);
431 + p->prio = prio;
432 + __set_bit(prio, rq->bitmap);
433 + }
434 + p->ns_debit = 0;
435 +}
436
437 - prio = p->static_prio - bonus;
438 - if (prio < MAX_RT_PRIO)
439 - prio = MAX_RT_PRIO;
440 - if (prio > MAX_PRIO-1)
441 - prio = MAX_PRIO-1;
442 - return prio;
443 +static inline void enqueue_task_head(task_t *p, runqueue_t *rq)
444 +{
445 + list_add(&p->run_list, rq->queue + p->prio);
446 + __set_bit(p->prio, rq->bitmap);
447 }
448
449 +static unsigned int slice(const task_t *p);
450 +
451 /*
452 * To aid in avoiding the subversion of "niceness" due to uneven distribution
453 * of tasks with abnormal "nice" values across CPUs the contribution that
454 @@ -688,10 +585,9 @@ static int effective_prio(task_t *p)
455 #define TIME_SLICE_NICE_ZERO DEF_TIMESLICE
456 #define LOAD_WEIGHT(lp) \
457 (((lp) * SCHED_LOAD_SCALE) / TIME_SLICE_NICE_ZERO)
458 -#define PRIO_TO_LOAD_WEIGHT(prio) \
459 - LOAD_WEIGHT(static_prio_timeslice(prio))
460 -#define RTPRIO_TO_LOAD_WEIGHT(rp) \
461 - (PRIO_TO_LOAD_WEIGHT(MAX_RT_PRIO) + LOAD_WEIGHT(rp))
462 +#define TASK_LOAD_WEIGHT(p) LOAD_WEIGHT(slice(p))
463 +#define RTPRIO_TO_LOAD_WEIGHT(rp) \
464 + (LOAD_WEIGHT((RR_INTERVAL + 20 + (rp))))
465
466 static void set_load_weight(task_t *p)
467 {
468 @@ -708,7 +604,7 @@ static void set_load_weight(task_t *p)
469 #endif
470 p->load_weight = RTPRIO_TO_LOAD_WEIGHT(p->rt_priority);
471 } else
472 - p->load_weight = PRIO_TO_LOAD_WEIGHT(p->static_prio);
473 + p->load_weight = TASK_LOAD_WEIGHT(p);
474 }
475
476 static inline void inc_raw_weighted_load(runqueue_t *rq, const task_t *p)
477 @@ -736,13 +632,9 @@ static inline void dec_nr_running(task_t
478 /*
479 * __activate_task - move a task to the runqueue.
480 */
481 -static void __activate_task(task_t *p, runqueue_t *rq)
482 +static inline void __activate_task(task_t *p, runqueue_t *rq)
483 {
484 - prio_array_t *target = rq->active;
485 -
486 - if (batch_task(p))
487 - target = rq->expired;
488 - enqueue_task(p, target);
489 + enqueue_task(p, rq);
490 inc_nr_running(p, rq);
491 }
492
493 @@ -751,85 +643,181 @@ static void __activate_task(task_t *p, r
494 */
495 static inline void __activate_idle_task(task_t *p, runqueue_t *rq)
496 {
497 - enqueue_task_head(p, rq->active);
498 + enqueue_task_head(p, rq);
499 inc_nr_running(p, rq);
500 }
501
502 -static int recalc_task_prio(task_t *p, unsigned long long now)
503 +/*
504 + * Bonus - How much higher than its base priority an interactive task can run.
505 + */
506 +static inline unsigned int bonus(const task_t *p)
507 {
508 - /* Caller must always ensure 'now >= p->timestamp' */
509 - unsigned long long __sleep_time = now - p->timestamp;
510 - unsigned long sleep_time;
511 + return TASK_USER_PRIO(p);
512 +}
513
514 - if (batch_task(p))
515 - sleep_time = 0;
516 +static unsigned int rr_interval(const task_t *p)
517 +{
518 + int nice = TASK_NICE(p);
519 +
520 + if (nice < 0 && !rt_task(p))
521 + return RR_INTERVAL * (20 - nice) / 20;
522 + return RR_INTERVAL;
523 +}
524 +
525 +/*
526 + * slice - the duration a task runs before getting requeued at its best
527 + * priority and has its bonus decremented.
528 + */
529 +static unsigned int slice(const task_t *p)
530 +{
531 + unsigned int slice, rr;
532 +
533 + slice = rr = rr_interval(p);
534 + if (likely(!rt_task(p)))
535 + slice += (39 - TASK_USER_PRIO(p)) * rr;
536 + return slice;
537 +}
538 +
539 +/*
540 + * We increase our bonus by sleeping more than the time we ran.
541 + * The ratio of sleep to run gives us the cpu% that we last ran and determines
542 + * the maximum bonus we can acquire.
543 + */
544 +static void inc_bonus(task_t *p, unsigned long totalrun, unsigned long sleep)
545 +{
546 + unsigned int best_bonus = sleep / (totalrun + 1);
547 +
548 + if (p->bonus >= best_bonus)
549 + return;
550 + best_bonus = bonus(p);
551 + if (p->bonus < best_bonus)
552 + p->bonus++;
553 +}
554 +
555 +static inline void dec_bonus(task_t *p)
556 +{
557 + if (p->bonus)
558 + p->bonus--;
559 +}
560 +
561 +static inline void slice_overrun(struct task_struct *p)
562 +{
563 + unsigned long ns_slice = JIFFIES_TO_NS(p->slice);
564 +
565 + do {
566 + p->totalrun -= ns_slice;
567 + dec_bonus(p);
568 + } while (unlikely(p->totalrun > ns_slice));
569 +}
570 +
571 +/*
572 + * effective_prio - dynamic priority dependent on bonus.
573 + * The priority normally decreases by one each RR_INTERVAL.
574 + * As the bonus increases the initial priority starts at a higher "stair" or
575 + * priority for longer.
576 + */
577 +static int effective_prio(const task_t *p)
578 +{
579 + int prio;
580 + unsigned int full_slice, used_slice = 0;
581 + unsigned int best_bonus, rr;
582 +
583 + if (rt_task(p))
584 + return p->prio;
585 +
586 + full_slice = slice(p);
587 + if (full_slice > p->slice)
588 + used_slice = full_slice - p->slice;
589 +
590 + best_bonus = bonus(p);
591 + prio = MAX_RT_PRIO + best_bonus;
592 + if (!batch_task(p))
593 + prio -= p->bonus;
594 +
595 + rr = rr_interval(p);
596 + prio += used_slice / rr;
597 + if (prio > MIN_USER_PRIO)
598 + prio = MIN_USER_PRIO;
599 + return prio;
600 +}
601 +
602 +static inline void continue_slice(task_t *p)
603 +{
604 + unsigned long total_run = NS_TO_JIFFIES(p->totalrun);
605 +
606 + if (unlikely(total_run >= p->slice))
607 + slice_overrun(p);
608 else {
609 - if (__sleep_time > NS_MAX_SLEEP_AVG)
610 - sleep_time = NS_MAX_SLEEP_AVG;
611 - else
612 - sleep_time = (unsigned long)__sleep_time;
613 + unsigned long remainder;
614 +
615 + p->slice -= total_run;
616 + remainder = p->slice % rr_interval(p);
617 + if (remainder)
618 + p->time_slice = remainder;
619 }
620 +}
621
622 - if (likely(sleep_time > 0)) {
623 - /*
624 - * User tasks that sleep a long time are categorised as
625 - * idle. They will only have their sleep_avg increased to a
626 - * level that makes them just interactive priority to stay
627 - * active yet prevent them suddenly becoming cpu hogs and
628 - * starving other processes.
629 - */
630 - if (p->mm && sleep_time > INTERACTIVE_SLEEP(p)) {
631 - unsigned long ceiling;
632 +/*
633 + * recalc_task_prio - this checks for tasks that have run less than a full
634 + * slice and have woken up again soon after, or have just forked a
635 + * thread/process and make them continue their old slice instead of starting
636 + * a new one at high priority.
637 + */
638 +static inline void recalc_task_prio(task_t *p, const unsigned long long now)
639 +{
640 + unsigned long sleep_time;
641
642 - ceiling = JIFFIES_TO_NS(MAX_SLEEP_AVG -
643 - DEF_TIMESLICE);
644 - if (p->sleep_avg < ceiling)
645 - p->sleep_avg = ceiling;
646 - } else {
647 - /*
648 - * Tasks waking from uninterruptible sleep are
649 - * limited in their sleep_avg rise as they
650 - * are likely to be waiting on I/O
651 - */
652 - if (p->sleep_type == SLEEP_NONINTERACTIVE && p->mm) {
653 - if (p->sleep_avg >= INTERACTIVE_SLEEP(p))
654 - sleep_time = 0;
655 - else if (p->sleep_avg + sleep_time >=
656 - INTERACTIVE_SLEEP(p)) {
657 - p->sleep_avg = INTERACTIVE_SLEEP(p);
658 - sleep_time = 0;
659 - }
660 - }
661 + /*
662 + * If this task has managed to run to its lowest priority then
663 + * decrease its bonus and requeue it now at best priority instead
664 + * of possibly flagging around lowest priority. Save up any systime
665 + * that may affect priority on the next reschedule.
666 + */
667 + if (p->slice > p->time_slice &&
668 + p->slice - NS_TO_JIFFIES(p->totalrun) < p->time_slice) {
669 + dec_bonus(p);
670 + p->totalrun = 0;
671 + return;
672 + }
673
674 - /*
675 - * This code gives a bonus to interactive tasks.
676 - *
677 - * The boost works by updating the 'average sleep time'
678 - * value here, based on ->timestamp. The more time a
679 - * task spends sleeping, the higher the average gets -
680 - * and the higher the priority boost gets as well.
681 - */
682 - p->sleep_avg += sleep_time;
683 + /*
684 + * Add the total for this last scheduled run (p->runtime) and system
685 + * time (p->systime) done on behalf of p to the running total so far
686 + * used (p->totalrun).
687 + */
688 + p->totalrun += p->runtime + p->systime;
689 + sleep_time = ns_diff(now, p->timestamp);
690
691 - if (p->sleep_avg > NS_MAX_SLEEP_AVG)
692 - p->sleep_avg = NS_MAX_SLEEP_AVG;
693 + if (p->systime > sleep_time || p->flags & PF_FORKED)
694 + sleep_time = 0;
695 + else {
696 + sleep_time -= p->systime;
697 + /*
698 + * We elevate priority by the amount of time we slept. If we
699 + * sleep longer than our running total and have not set the
700 + * PF_NONSLEEP flag we gain a bonus.
701 + */
702 + if (sleep_time >= p->totalrun) {
703 + if (!(p->flags & PF_NONSLEEP))
704 + inc_bonus(p, p->totalrun, sleep_time);
705 + p->totalrun = 0;
706 + return;
707 }
708 + p->totalrun -= sleep_time;
709 }
710 -
711 - return effective_prio(p);
712 + continue_slice(p);
713 }
714
715 /*
716 * activate_task - move a task to the runqueue and do priority recalculation
717 *
718 - * Update all the scheduling statistics stuff. (sleep average
719 - * calculation, priority modifiers, etc.)
720 + * Update all the scheduling statistics stuff. (priority modifiers, etc.)
721 */
722 -static void activate_task(task_t *p, runqueue_t *rq, int local)
723 +static void activate_task(task_t *p, runqueue_t *rq, const int local)
724 {
725 - unsigned long long now;
726 + unsigned long long now = sched_clock();
727 + unsigned long rr = rr_interval(p);
728
729 - now = sched_clock();
730 #ifdef CONFIG_SMP
731 if (!local) {
732 /* Compensate for drifting sched_clock */
733 @@ -838,45 +826,25 @@ static void activate_task(task_t *p, run
734 + rq->timestamp_last_tick;
735 }
736 #endif
737 -
738 - if (!rt_task(p))
739 - p->prio = recalc_task_prio(p, now);
740 -
741 - /*
742 - * This checks to make sure it's not an uninterruptible task
743 - * that is now waking up.
744 - */
745 - if (p->sleep_type == SLEEP_NORMAL) {
746 - /*
747 - * Tasks which were woken up by interrupts (ie. hw events)
748 - * are most likely of interactive nature. So we give them
749 - * the credit of extending their sleep time to the period
750 - * of time they spend on the runqueue, waiting for execution
751 - * on a CPU, first time around:
752 - */
753 - if (in_interrupt())
754 - p->sleep_type = SLEEP_INTERRUPTED;
755 - else {
756 - /*
757 - * Normal first-time wakeups get a credit too for
758 - * on-runqueue time, but it will be weighted down:
759 - */
760 - p->sleep_type = SLEEP_INTERACTIVE;
761 - }
762 + p->slice = slice(p);
763 + p->time_slice = p->slice % rr ? : rr;
764 + if (!rt_task(p)) {
765 + recalc_task_prio(p, now);
766 + p->prio = effective_prio(p);
767 + p->systime = 0;
768 + p->flags &= ~(PF_FORKED | PF_NONSLEEP);
769 }
770 p->timestamp = now;
771 -
772 __activate_task(p, rq);
773 }
774
775 /*
776 * deactivate_task - remove a task from the runqueue.
777 */
778 -static void deactivate_task(struct task_struct *p, runqueue_t *rq)
779 +static void deactivate_task(task_t *p, runqueue_t *rq)
780 {
781 dec_nr_running(p, rq);
782 - dequeue_task(p, p->array);
783 - p->array = NULL;
784 + dequeue_task(p, rq);
785 }
786
787 /*
788 @@ -952,7 +920,7 @@ static int migrate_task(task_t *p, int d
789 * If the task is not on a runqueue (and not running), then
790 * it is sufficient to simply update the task's cpu field.
791 */
792 - if (!p->array && !task_running(rq, p)) {
793 + if (!task_queued(p) && !task_running(rq, p)) {
794 set_task_cpu(p, dest_cpu);
795 return 0;
796 }
797 @@ -982,7 +950,7 @@ void wait_task_inactive(task_t *p)
798 repeat:
799 rq = task_rq_lock(p, &flags);
800 /* Must be off runqueue entirely, not preempted. */
801 - if (unlikely(p->array || task_running(rq, p))) {
802 + if (unlikely(task_queued(p) || task_running(rq, p))) {
803 /* If it's preempted, we yield. It could be a while. */
804 preempted = !task_running(rq, p);
805 task_rq_unlock(rq, &flags);
806 @@ -1234,6 +1202,15 @@ static inline int wake_idle(int cpu, tas
807 }
808 #endif
809
810 +/*
811 + * Check to see if p preempts rq->curr and resched if it does.
812 + */
813 +static inline void preempt(const task_t *p, runqueue_t *rq)
814 +{
815 + if (TASK_PREEMPTS_CURR(p, rq))
816 + resched_task(rq->curr);
817 +}
818 +
819 /***
820 * try_to_wake_up - wake up a thread
821 * @p: the to-be-woken-up thread
822 @@ -1265,7 +1242,7 @@ static int try_to_wake_up(task_t *p, uns
823 if (!(old_state & state))
824 goto out;
825
826 - if (p->array)
827 + if (task_queued(p))
828 goto out_running;
829
830 cpu = task_cpu(p);
831 @@ -1356,7 +1333,7 @@ out_set_cpu:
832 old_state = p->state;
833 if (!(old_state & state))
834 goto out;
835 - if (p->array)
836 + if (task_queued(p))
837 goto out_running;
838
839 this_cpu = smp_processor_id();
840 @@ -1365,25 +1342,9 @@ out_set_cpu:
841
842 out_activate:
843 #endif /* CONFIG_SMP */
844 - if (old_state == TASK_UNINTERRUPTIBLE) {
845 + if (old_state == TASK_UNINTERRUPTIBLE)
846 rq->nr_uninterruptible--;
847 - /*
848 - * Tasks on involuntary sleep don't earn
849 - * sleep_avg beyond just interactive state.
850 - */
851 - p->sleep_type = SLEEP_NONINTERACTIVE;
852 - } else
853 -
854 - /*
855 - * Tasks that have marked their sleep as noninteractive get
856 - * woken up with their sleep average not weighted in an
857 - * interactive way.
858 - */
859 - if (old_state & TASK_NONINTERACTIVE)
860 - p->sleep_type = SLEEP_NONINTERACTIVE;
861 -
862
863 - activate_task(p, rq, cpu == this_cpu);
864 /*
865 * Sync wakeups (i.e. those types of wakeups where the waker
866 * has indicated that it will leave the CPU in short order)
867 @@ -1392,10 +1353,9 @@ out_activate:
868 * the waker guarantees that the freshly woken up task is going
869 * to be considered on this CPU.)
870 */
871 - if (!sync || cpu != this_cpu) {
872 - if (TASK_PREEMPTS_CURR(p, rq))
873 - resched_task(rq->curr);
874 - }
875 + activate_task(p, rq, cpu == this_cpu);
876 + if (!sync || cpu != this_cpu)
877 + preempt(p, rq);
878 success = 1;
879
880 out_running:
881 @@ -1440,7 +1400,6 @@ void fastcall sched_fork(task_t *p, int
882 */
883 p->state = TASK_RUNNING;
884 INIT_LIST_HEAD(&p->run_list);
885 - p->array = NULL;
886 #ifdef CONFIG_SCHEDSTATS
887 memset(&p->sched_info, 0, sizeof(p->sched_info));
888 #endif
889 @@ -1451,30 +1410,6 @@ void fastcall sched_fork(task_t *p, int
890 /* Want to start with kernel preemption disabled. */
891 task_thread_info(p)->preempt_count = 1;
892 #endif
893 - /*
894 - * Share the timeslice between parent and child, thus the
895 - * total amount of pending timeslices in the system doesn't change,
896 - * resulting in more scheduling fairness.
897 - */
898 - local_irq_disable();
899 - p->time_slice = (current->time_slice + 1) >> 1;
900 - /*
901 - * The remainder of the first timeslice might be recovered by
902 - * the parent if the child exits early enough.
903 - */
904 - p->first_time_slice = 1;
905 - current->time_slice >>= 1;
906 - p->timestamp = sched_clock();
907 - if (unlikely(!current->time_slice)) {
908 - /*
909 - * This case is rare, it happens when the parent has only
910 - * a single jiffy left from its timeslice. Taking the
911 - * runqueue lock is not a problem.
912 - */
913 - current->time_slice = 1;
914 - scheduler_tick();
915 - }
916 - local_irq_enable();
917 put_cpu();
918 }
919
920 @@ -1496,37 +1431,20 @@ void fastcall wake_up_new_task(task_t *p
921 this_cpu = smp_processor_id();
922 cpu = task_cpu(p);
923
924 - /*
925 - * We decrease the sleep average of forking parents
926 - * and children as well, to keep max-interactive tasks
927 - * from forking tasks that are max-interactive. The parent
928 - * (current) is done further down, under its lock.
929 - */
930 - p->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(p) *
931 - CHILD_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS);
932 -
933 - p->prio = effective_prio(p);
934 + /* Forked process gets no bonus to prevent fork bombs. */
935 + p->bonus = 0;
936 + current->flags |= PF_FORKED;
937
938 if (likely(cpu == this_cpu)) {
939 + activate_task(p, rq, 1);
940 if (!(clone_flags & CLONE_VM)) {
941 /*
942 * The VM isn't cloned, so we're in a good position to
943 * do child-runs-first in anticipation of an exec. This
944 * usually avoids a lot of COW overhead.
945 */
946 - if (unlikely(!current->array))
947 - __activate_task(p, rq);
948 - else {
949 - p->prio = current->prio;
950 - list_add_tail(&p->run_list, &current->run_list);
951 - p->array = current->array;
952 - p->array->nr_active++;
953 - inc_nr_running(p, rq);
954 - }
955 set_need_resched();
956 - } else
957 - /* Run child last */
958 - __activate_task(p, rq);
959 + }
960 /*
961 * We skip the following code due to cpu == this_cpu
962 *
963 @@ -1543,53 +1461,19 @@ void fastcall wake_up_new_task(task_t *p
964 */
965 p->timestamp = (p->timestamp - this_rq->timestamp_last_tick)
966 + rq->timestamp_last_tick;
967 - __activate_task(p, rq);
968 - if (TASK_PREEMPTS_CURR(p, rq))
969 - resched_task(rq->curr);
970 + activate_task(p, rq, 0);
971 + preempt(p, rq);
972
973 /*
974 * Parent and child are on different CPUs, now get the
975 - * parent runqueue to update the parent's ->sleep_avg:
976 + * parent runqueue to update the parent's ->flags:
977 */
978 task_rq_unlock(rq, &flags);
979 this_rq = task_rq_lock(current, &flags);
980 }
981 - current->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(current) *
982 - PARENT_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS);
983 task_rq_unlock(this_rq, &flags);
984 }
985
986 -/*
987 - * Potentially available exiting-child timeslices are
988 - * retrieved here - this way the parent does not get
989 - * penalized for creating too many threads.
990 - *
991 - * (this cannot be used to 'generate' timeslices
992 - * artificially, because any timeslice recovered here
993 - * was given away by the parent in the first place.)
994 - */
995 -void fastcall sched_exit(task_t *p)
996 -{
997 - unsigned long flags;
998 - runqueue_t *rq;
999 -
1000 - /*
1001 - * If the child was a (relative-) CPU hog then decrease
1002 - * the sleep_avg of the parent as well.
1003 - */
1004 - rq = task_rq_lock(p->parent, &flags);
1005 - if (p->first_time_slice && task_cpu(p) == task_cpu(p->parent)) {
1006 - p->parent->time_slice += p->time_slice;
1007 - if (unlikely(p->parent->time_slice > task_timeslice(p)))
1008 - p->parent->time_slice = task_timeslice(p);
1009 - }
1010 - if (p->sleep_avg < p->parent->sleep_avg)
1011 - p->parent->sleep_avg = p->parent->sleep_avg /
1012 - (EXIT_WEIGHT + 1) * EXIT_WEIGHT + p->sleep_avg /
1013 - (EXIT_WEIGHT + 1);
1014 - task_rq_unlock(rq, &flags);
1015 -}
1016 -
1017 /**
1018 * prepare_task_switch - prepare to switch tasks
1019 * @rq: the runqueue preparing to switch
1020 @@ -1885,23 +1769,21 @@ void sched_exec(void)
1021 * pull_task - move a task from a remote runqueue to the local runqueue.
1022 * Both runqueues must be locked.
1023 */
1024 -static
1025 -void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p,
1026 - runqueue_t *this_rq, prio_array_t *this_array, int this_cpu)
1027 +static void pull_task(runqueue_t *src_rq, task_t *p, runqueue_t *this_rq,
1028 + const int this_cpu)
1029 {
1030 - dequeue_task(p, src_array);
1031 + dequeue_task(p, src_rq);
1032 dec_nr_running(p, src_rq);
1033 set_task_cpu(p, this_cpu);
1034 inc_nr_running(p, this_rq);
1035 - enqueue_task(p, this_array);
1036 + enqueue_task(p, this_rq);
1037 p->timestamp = (p->timestamp - src_rq->timestamp_last_tick)
1038 + this_rq->timestamp_last_tick;
1039 /*
1040 * Note that idle threads have a prio of MAX_PRIO, for this test
1041 * to be always true for them.
1042 */
1043 - if (TASK_PREEMPTS_CURR(p, this_rq))
1044 - resched_task(this_rq->curr);
1045 + preempt(p, this_rq);
1046 }
1047
1048 /*
1049 @@ -1939,7 +1821,6 @@ int can_migrate_task(task_t *p, runqueue
1050 return 1;
1051 }
1052
1053 -#define rq_best_prio(rq) min((rq)->curr->prio, (rq)->best_expired_prio)
1054 /*
1055 * move_tasks tries to move up to max_nr_move tasks and max_load_move weighted
1056 * load from busiest to this_rq, as part of a balancing operation within
1057 @@ -1952,7 +1833,6 @@ static int move_tasks(runqueue_t *this_r
1058 struct sched_domain *sd, enum idle_type idle,
1059 int *all_pinned)
1060 {
1061 - prio_array_t *array, *dst_array;
1062 struct list_head *head, *curr;
1063 int idx, pulled = 0, pinned = 0, this_best_prio, busiest_best_prio;
1064 int busiest_best_prio_seen;
1065 @@ -1965,8 +1845,8 @@ static int move_tasks(runqueue_t *this_r
1066
1067 rem_load_move = max_load_move;
1068 pinned = 1;
1069 - this_best_prio = rq_best_prio(this_rq);
1070 - busiest_best_prio = rq_best_prio(busiest);
1071 + this_best_prio = this_rq->curr->prio;
1072 + busiest_best_prio = busiest->curr->prio;
1073 /*
1074 * Enable handling of the case where there is more than one task
1075 * with the best priority. If the current running task is one
1076 @@ -1976,38 +1856,17 @@ static int move_tasks(runqueue_t *this_r
1077 */
1078 busiest_best_prio_seen = busiest_best_prio == busiest->curr->prio;
1079
1080 - /*
1081 - * We first consider expired tasks. Those will likely not be
1082 - * executed in the near future, and they are most likely to
1083 - * be cache-cold, thus switching CPUs has the least effect
1084 - * on them.
1085 - */
1086 - if (busiest->expired->nr_active) {
1087 - array = busiest->expired;
1088 - dst_array = this_rq->expired;
1089 - } else {
1090 - array = busiest->active;
1091 - dst_array = this_rq->active;
1092 - }
1093 -
1094 -new_array:
1095 /* Start searching at priority 0: */
1096 idx = 0;
1097 skip_bitmap:
1098 if (!idx)
1099 - idx = sched_find_first_bit(array->bitmap);
1100 + idx = sched_find_first_bit(busiest->bitmap);
1101 else
1102 - idx = find_next_bit(array->bitmap, MAX_PRIO, idx);
1103 - if (idx >= MAX_PRIO) {
1104 - if (array == busiest->expired && busiest->active->nr_active) {
1105 - array = busiest->active;
1106 - dst_array = this_rq->active;
1107 - goto new_array;
1108 - }
1109 + idx = find_next_bit(busiest->bitmap, MAX_PRIO, idx);
1110 + if (idx >= MAX_PRIO)
1111 goto out;
1112 - }
1113
1114 - head = array->queue + idx;
1115 + head = busiest->queue + idx;
1116 curr = head->prev;
1117 skip_queue:
1118 tmp = list_entry(curr, task_t, run_list);
1119 @@ -2036,7 +1895,7 @@ skip_queue:
1120 schedstat_inc(sd, lb_hot_gained[idle]);
1121 #endif
1122
1123 - pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu);
1124 + pull_task(busiest, tmp, this_rq, this_cpu);
1125 pulled++;
1126 rem_load_move -= tmp->load_weight;
1127
1128 @@ -2585,15 +2444,13 @@ static void rebalance_tick(int this_cpu,
1129 continue;
1130
1131 interval = sd->balance_interval;
1132 - if (idle != SCHED_IDLE)
1133 - interval *= sd->busy_factor;
1134
1135 /* scale ms to jiffies */
1136 interval = msecs_to_jiffies(interval);
1137 if (unlikely(!interval))
1138 interval = 1;
1139
1140 - if (j - sd->last_balance >= interval) {
1141 + if (idle != SCHED_IDLE || j - sd->last_balance >= interval) {
1142 if (load_balance(this_cpu, this_rq, sd, idle)) {
1143 /*
1144 * We've pulled tasks over so either we're no
1145 @@ -2667,22 +2524,6 @@ unsigned long long current_sched_time(co
1146 }
1147
1148 /*
1149 - * We place interactive tasks back into the active array, if possible.
1150 - *
1151 - * To guarantee that this does not starve expired tasks we ignore the
1152 - * interactivity of a task if the first expired task had to wait more
1153 - * than a 'reasonable' amount of time. This deadline timeout is
1154 - * load-dependent, as the frequency of array switched decreases with
1155 - * increasing number of running tasks. We also ignore the interactivity
1156 - * if a better static_prio task has expired:
1157 - */
1158 -#define EXPIRED_STARVING(rq) \
1159 - ((STARVATION_LIMIT && ((rq)->expired_timestamp && \
1160 - (jiffies - (rq)->expired_timestamp >= \
1161 - STARVATION_LIMIT * ((rq)->nr_running) + 1))) || \
1162 - ((rq)->curr->static_prio > (rq)->best_expired_prio))
1163 -
1164 -/*
1165 * Account user cpu time to a process.
1166 * @p: the process that the cpu time gets accounted to
1167 * @hardirq_offset: the offset to subtract from hardirq_count()
1168 @@ -2730,6 +2571,8 @@ void account_system_time(struct task_str
1169 cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
1170 else
1171 cpustat->idle = cputime64_add(cpustat->idle, tmp);
1172 +
1173 + p->systime += NSJIFFY;
1174 /* Account for system time used */
1175 acct_update_integrals(p);
1176 }
1177 @@ -2755,18 +2598,23 @@ void account_steal_time(struct task_stru
1178 cpustat->steal = cputime64_add(cpustat->steal, tmp);
1179 }
1180
1181 +static void time_slice_expired(task_t *p, runqueue_t *rq)
1182 +{
1183 + set_tsk_need_resched(p);
1184 + p->time_slice = rr_interval(p);
1185 + requeue_task(p, rq, effective_prio(p));
1186 +}
1187 +
1188 /*
1189 * This function gets called by the timer code, with HZ frequency.
1190 * We call it with interrupts disabled.
1191 - *
1192 - * It also gets called by the fork code, when changing the parent's
1193 - * timeslices.
1194 */
1195 void scheduler_tick(void)
1196 {
1197 int cpu = smp_processor_id();
1198 runqueue_t *rq = this_rq();
1199 task_t *p = current;
1200 + unsigned long debit;
1201 unsigned long long now = sched_clock();
1202
1203 update_cpu_clock(p, rq, now);
1204 @@ -2781,73 +2629,37 @@ void scheduler_tick(void)
1205 }
1206
1207 /* Task might have expired already, but not scheduled off yet */
1208 - if (p->array != rq->active) {
1209 + if (unlikely(!task_queued(p))) {
1210 set_tsk_need_resched(p);
1211 goto out;
1212 }
1213 + /* SCHED_FIFO tasks never run out of timeslice. */
1214 + if (unlikely(p->policy == SCHED_FIFO))
1215 + goto out;
1216 +
1217 spin_lock(&rq->lock);
1218 + debit = ns_diff(rq->timestamp_last_tick, p->timestamp);
1219 + p->ns_debit += debit;
1220 + if (p->ns_debit < NSJIFFY)
1221 + goto out_unlock;
1222 + p->ns_debit %= NSJIFFY;
1223 /*
1224 - * The task was running during this tick - update the
1225 - * time slice counter. Note: we do not update a thread's
1226 - * priority until it either goes to sleep or uses up its
1227 - * timeslice. This makes it possible for interactive tasks
1228 - * to use up their timeslices at their highest priority levels.
1229 + * Tasks lose bonus each time they use up a full slice().
1230 */
1231 - if (rt_task(p)) {
1232 - /*
1233 - * RR tasks need a special form of timeslice management.
1234 - * FIFO tasks have no timeslices.
1235 - */
1236 - if ((p->policy == SCHED_RR) && !--p->time_slice) {
1237 - p->time_slice = task_timeslice(p);
1238 - p->first_time_slice = 0;
1239 - set_tsk_need_resched(p);
1240 -
1241 - /* put it at the end of the queue: */
1242 - requeue_task(p, rq->active);
1243 - }
1244 + if (!--p->slice) {
1245 + dec_bonus(p);
1246 + p->totalrun = 0;
1247 + p->slice = slice(p);
1248 + time_slice_expired(p, rq);
1249 goto out_unlock;
1250 }
1251 + /*
1252 + * Tasks that run out of time_slice but still have slice left get
1253 + * requeued with a lower priority && RR_INTERVAL time_slice.
1254 + */
1255 if (!--p->time_slice) {
1256 - dequeue_task(p, rq->active);
1257 - set_tsk_need_resched(p);
1258 - p->prio = effective_prio(p);
1259 - p->time_slice = task_timeslice(p);
1260 - p->first_time_slice = 0;
1261 -
1262 - if (!rq->expired_timestamp)
1263 - rq->expired_timestamp = jiffies;
1264 - if (!TASK_INTERACTIVE(p) || EXPIRED_STARVING(rq)) {
1265 - enqueue_task(p, rq->expired);
1266 - if (p->static_prio < rq->best_expired_prio)
1267 - rq->best_expired_prio = p->static_prio;
1268 - } else
1269 - enqueue_task(p, rq->active);
1270 - } else {
1271 - /*
1272 - * Prevent a too long timeslice allowing a task to monopolize
1273 - * the CPU. We do this by splitting up the timeslice into
1274 - * smaller pieces.
1275 - *
1276 - * Note: this does not mean the task's timeslices expire or
1277 - * get lost in any way, they just might be preempted by
1278 - * another task of equal priority. (one with higher
1279 - * priority would have preempted this task already.) We
1280 - * requeue this task to the end of the list on this priority
1281 - * level, which is in essence a round-robin of tasks with
1282 - * equal priority.
1283 - *
1284 - * This only applies to tasks in the interactive
1285 - * delta range with at least TIMESLICE_GRANULARITY to requeue.
1286 - */
1287 - if (TASK_INTERACTIVE(p) && !((task_timeslice(p) -
1288 - p->time_slice) % TIMESLICE_GRANULARITY(p)) &&
1289 - (p->time_slice >= TIMESLICE_GRANULARITY(p)) &&
1290 - (p->array == rq->active)) {
1291 -
1292 - requeue_task(p, rq->active);
1293 - set_tsk_need_resched(p);
1294 - }
1295 + time_slice_expired(p, rq);
1296 + goto out_unlock;
1297 }
1298 out_unlock:
1299 spin_unlock(&rq->lock);
1300 @@ -2896,12 +2708,13 @@ static void wake_sleeping_dependent(int
1301
1302 /*
1303 * number of 'lost' timeslices this task wont be able to fully
1304 - * utilize, if another task runs on a sibling. This models the
1305 + * utilise, if another task runs on a sibling. This models the
1306 * slowdown effect of other tasks running on siblings:
1307 */
1308 -static inline unsigned long smt_slice(task_t *p, struct sched_domain *sd)
1309 +static inline unsigned long
1310 +smt_slice(const task_t *p, const struct sched_domain *sd)
1311 {
1312 - return p->time_slice * (100 - sd->per_cpu_gain) / 100;
1313 + return p->slice * (100 - sd->per_cpu_gain) / 100;
1314 }
1315
1316 /*
1317 @@ -2964,7 +2777,7 @@ static int dependent_sleeper(int this_cp
1318 } else
1319 if (smt_curr->static_prio < p->static_prio &&
1320 !TASK_PREEMPTS_CURR(p, smt_rq) &&
1321 - smt_slice(smt_curr, sd) > task_timeslice(p))
1322 + smt_slice(smt_curr, sd) > slice(p))
1323 ret = 1;
1324
1325 unlock:
1326 @@ -3015,12 +2828,6 @@ EXPORT_SYMBOL(sub_preempt_count);
1327
1328 #endif
1329
1330 -static inline int interactive_sleep(enum sleep_type sleep_type)
1331 -{
1332 - return (sleep_type == SLEEP_INTERACTIVE ||
1333 - sleep_type == SLEEP_INTERRUPTED);
1334 -}
1335 -
1336 /*
1337 * schedule() is the main scheduler function.
1338 */
1339 @@ -3029,11 +2836,10 @@ asmlinkage void __sched schedule(void)
1340 long *switch_count;
1341 task_t *prev, *next;
1342 runqueue_t *rq;
1343 - prio_array_t *array;
1344 struct list_head *queue;
1345 unsigned long long now;
1346 - unsigned long run_time;
1347 - int cpu, idx, new_prio;
1348 + unsigned long debit;
1349 + int cpu, idx;
1350
1351 /*
1352 * Test if we are atomic. Since do_exit() needs to call into
1353 @@ -3066,20 +2872,11 @@ need_resched_nonpreemptible:
1354
1355 schedstat_inc(rq, sched_cnt);
1356 now = sched_clock();
1357 - if (likely((long long)(now - prev->timestamp) < NS_MAX_SLEEP_AVG)) {
1358 - run_time = now - prev->timestamp;
1359 - if (unlikely((long long)(now - prev->timestamp) < 0))
1360 - run_time = 0;
1361 - } else
1362 - run_time = NS_MAX_SLEEP_AVG;
1363 -
1364 - /*
1365 - * Tasks charged proportionately less run_time at high sleep_avg to
1366 - * delay them losing their interactive status
1367 - */
1368 - run_time /= (CURRENT_BONUS(prev) ? : 1);
1369
1370 spin_lock_irq(&rq->lock);
1371 + prev->runtime = ns_diff(now, prev->timestamp);
1372 + debit = ns_diff(now, rq->timestamp_last_tick) % NSJIFFY;
1373 + prev->ns_debit += debit;
1374
1375 if (unlikely(prev->flags & PF_DEAD))
1376 prev->state = EXIT_DEAD;
1377 @@ -3091,8 +2888,10 @@ need_resched_nonpreemptible:
1378 unlikely(signal_pending(prev))))
1379 prev->state = TASK_RUNNING;
1380 else {
1381 - if (prev->state == TASK_UNINTERRUPTIBLE)
1382 + if (prev->state == TASK_UNINTERRUPTIBLE) {
1383 + prev->flags |= PF_NONSLEEP;
1384 rq->nr_uninterruptible++;
1385 + }
1386 deactivate_task(prev, rq);
1387 }
1388 }
1389 @@ -3102,64 +2901,30 @@ need_resched_nonpreemptible:
1390 idle_balance(cpu, rq);
1391 if (!rq->nr_running) {
1392 next = rq->idle;
1393 - rq->expired_timestamp = 0;
1394 wake_sleeping_dependent(cpu);
1395 goto switch_tasks;
1396 }
1397 }
1398
1399 - array = rq->active;
1400 - if (unlikely(!array->nr_active)) {
1401 - /*
1402 - * Switch the active and expired arrays.
1403 - */
1404 - schedstat_inc(rq, sched_switch);
1405 - rq->active = rq->expired;
1406 - rq->expired = array;
1407 - array = rq->active;
1408 - rq->expired_timestamp = 0;
1409 - rq->best_expired_prio = MAX_PRIO;
1410 - }
1411 -
1412 - idx = sched_find_first_bit(array->bitmap);
1413 - queue = array->queue + idx;
1414 + idx = sched_find_first_bit(rq->bitmap);
1415 + queue = rq->queue + idx;
1416 next = list_entry(queue->next, task_t, run_list);
1417
1418 - if (!rt_task(next) && interactive_sleep(next->sleep_type)) {
1419 - unsigned long long delta = now - next->timestamp;
1420 - if (unlikely((long long)(now - next->timestamp) < 0))
1421 - delta = 0;
1422 -
1423 - if (next->sleep_type == SLEEP_INTERACTIVE)
1424 - delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128;
1425 -
1426 - array = next->array;
1427 - new_prio = recalc_task_prio(next, next->timestamp + delta);
1428 -
1429 - if (unlikely(next->prio != new_prio)) {
1430 - dequeue_task(next, array);
1431 - next->prio = new_prio;
1432 - enqueue_task(next, array);
1433 - }
1434 - }
1435 - next->sleep_type = SLEEP_NORMAL;
1436 if (dependent_sleeper(cpu, rq, next))
1437 next = rq->idle;
1438 + else {
1439 + prefetch(next);
1440 + prefetch_stack(next);
1441 + }
1442 switch_tasks:
1443 if (next == rq->idle)
1444 schedstat_inc(rq, sched_goidle);
1445 - prefetch(next);
1446 - prefetch_stack(next);
1447 + prev->timestamp = now;
1448 clear_tsk_need_resched(prev);
1449 rcu_qsctr_inc(task_cpu(prev));
1450
1451 update_cpu_clock(prev, rq, now);
1452
1453 - prev->sleep_avg -= run_time;
1454 - if ((long)prev->sleep_avg <= 0)
1455 - prev->sleep_avg = 0;
1456 - prev->timestamp = prev->last_ran = now;
1457 -
1458 sched_info_switch(prev, next);
1459 if (likely(prev != next)) {
1460 next->timestamp = now;
1461 @@ -3591,9 +3356,8 @@ EXPORT_SYMBOL(sleep_on_timeout);
1462 void set_user_nice(task_t *p, long nice)
1463 {
1464 unsigned long flags;
1465 - prio_array_t *array;
1466 runqueue_t *rq;
1467 - int old_prio, new_prio, delta;
1468 + int queued, old_prio, new_prio, delta;
1469
1470 if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
1471 return;
1472 @@ -3612,9 +3376,8 @@ void set_user_nice(task_t *p, long nice)
1473 p->static_prio = NICE_TO_PRIO(nice);
1474 goto out_unlock;
1475 }
1476 - array = p->array;
1477 - if (array) {
1478 - dequeue_task(p, array);
1479 + if ((queued = task_queued(p))) {
1480 + dequeue_task(p, rq);
1481 dec_raw_weighted_load(rq, p);
1482 }
1483
1484 @@ -3624,9 +3387,11 @@ void set_user_nice(task_t *p, long nice)
1485 p->static_prio = NICE_TO_PRIO(nice);
1486 set_load_weight(p);
1487 p->prio += delta;
1488 + if (p->bonus > bonus(p))
1489 + p->bonus= bonus(p);
1490
1491 - if (array) {
1492 - enqueue_task(p, array);
1493 + if (queued) {
1494 + enqueue_task(p, rq);
1495 inc_raw_weighted_load(rq, p);
1496 /*
1497 * If the task increased its priority or is running and
1498 @@ -3750,19 +3515,13 @@ static inline task_t *find_process_by_pi
1499 /* Actually do priority change: must hold rq lock. */
1500 static void __setscheduler(struct task_struct *p, int policy, int prio)
1501 {
1502 - BUG_ON(p->array);
1503 + BUG_ON(task_queued(p));
1504 p->policy = policy;
1505 p->rt_priority = prio;
1506 if (policy != SCHED_NORMAL && policy != SCHED_BATCH) {
1507 p->prio = MAX_RT_PRIO-1 - p->rt_priority;
1508 - } else {
1509 + } else
1510 p->prio = p->static_prio;
1511 - /*
1512 - * SCHED_BATCH tasks are treated as perpetual CPU hogs:
1513 - */
1514 - if (policy == SCHED_BATCH)
1515 - p->sleep_avg = 0;
1516 - }
1517 set_load_weight(p);
1518 }
1519
1520 @@ -3777,8 +3536,7 @@ int sched_setscheduler(struct task_struc
1521 struct sched_param *param)
1522 {
1523 int retval;
1524 - int oldprio, oldpolicy = -1;
1525 - prio_array_t *array;
1526 + int queued, oldprio, oldpolicy = -1;
1527 unsigned long flags;
1528 runqueue_t *rq;
1529
1530 @@ -3840,12 +3598,11 @@ recheck:
1531 task_rq_unlock(rq, &flags);
1532 goto recheck;
1533 }
1534 - array = p->array;
1535 - if (array)
1536 + if ((queued = task_queued(p)))
1537 deactivate_task(p, rq);
1538 oldprio = p->prio;
1539 __setscheduler(p, policy, param->sched_priority);
1540 - if (array) {
1541 + if (queued) {
1542 __activate_task(p, rq);
1543 /*
1544 * Reschedule if we are currently running on this runqueue and
1545 @@ -3855,8 +3612,8 @@ recheck:
1546 if (task_running(rq, p)) {
1547 if (p->prio > oldprio)
1548 resched_task(rq->curr);
1549 - } else if (TASK_PREEMPTS_CURR(p, rq))
1550 - resched_task(rq->curr);
1551 + } else
1552 + preempt(p, rq);
1553 }
1554 task_rq_unlock(rq, &flags);
1555 return 0;
1556 @@ -4113,43 +3870,22 @@ asmlinkage long sys_sched_getaffinity(pi
1557
1558 /**
1559 * sys_sched_yield - yield the current processor to other threads.
1560 - *
1561 - * this function yields the current CPU by moving the calling thread
1562 - * to the expired array. If there are no other threads running on this
1563 - * CPU then this function will return.
1564 + * This function yields the current CPU by dropping the priority of current
1565 + * to the lowest priority.
1566 */
1567 asmlinkage long sys_sched_yield(void)
1568 {
1569 + int newprio;
1570 runqueue_t *rq = this_rq_lock();
1571 - prio_array_t *array = current->array;
1572 - prio_array_t *target = rq->expired;
1573
1574 + newprio = current->prio;
1575 schedstat_inc(rq, yld_cnt);
1576 - /*
1577 - * We implement yielding by moving the task into the expired
1578 - * queue.
1579 - *
1580 - * (special rule: RT tasks will just roundrobin in the active
1581 - * array.)
1582 - */
1583 - if (rt_task(current))
1584 - target = rq->active;
1585 + current->slice = slice(current);
1586 + current->time_slice = rr_interval(current);
1587 + if (likely(!rt_task(current)))
1588 + newprio = MIN_USER_PRIO;
1589
1590 - if (array->nr_active == 1) {
1591 - schedstat_inc(rq, yld_act_empty);
1592 - if (!rq->expired->nr_active)
1593 - schedstat_inc(rq, yld_both_empty);
1594 - } else if (!rq->expired->nr_active)
1595 - schedstat_inc(rq, yld_exp_empty);
1596 -
1597 - if (array != target) {
1598 - dequeue_task(current, array);
1599 - enqueue_task(current, target);
1600 - } else
1601 - /*
1602 - * requeue_task is cheaper so perform that if possible.
1603 - */
1604 - requeue_task(current, array);
1605 + requeue_task(current, rq, newprio);
1606
1607 /*
1608 * Since we are going to call schedule() anyway, there's
1609 @@ -4358,7 +4094,7 @@ long sys_sched_rr_get_interval(pid_t pid
1610 goto out_unlock;
1611
1612 jiffies_to_timespec(p->policy & SCHED_FIFO ?
1613 - 0 : task_timeslice(p), &t);
1614 + 0 : slice(p), &t);
1615 read_unlock(&tasklist_lock);
1616 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
1617 out_nounlock:
1618 @@ -4481,8 +4217,6 @@ void __devinit init_idle(task_t *idle, i
1619 unsigned long flags;
1620
1621 idle->timestamp = sched_clock();
1622 - idle->sleep_avg = 0;
1623 - idle->array = NULL;
1624 idle->prio = MAX_PRIO;
1625 idle->state = TASK_RUNNING;
1626 idle->cpus_allowed = cpumask_of_cpu(cpu);
1627 @@ -4599,7 +4333,7 @@ static void __migrate_task(struct task_s
1628 goto out;
1629
1630 set_task_cpu(p, dest_cpu);
1631 - if (p->array) {
1632 + if (task_queued(p)) {
1633 /*
1634 * Sync timestamp with rq_dest's before activating.
1635 * The same thing could be achieved by doing this step
1636 @@ -4610,8 +4344,7 @@ static void __migrate_task(struct task_s
1637 + rq_dest->timestamp_last_tick;
1638 deactivate_task(p, rq_src);
1639 activate_task(p, rq_dest, 0);
1640 - if (TASK_PREEMPTS_CURR(p, rq_dest))
1641 - resched_task(rq_dest->curr);
1642 + preempt(p, rq_dest);
1643 }
1644
1645 out:
1646 @@ -4825,7 +4558,7 @@ static void migrate_dead_tasks(unsigned
1647
1648 for (arr = 0; arr < 2; arr++) {
1649 for (i = 0; i < MAX_PRIO; i++) {
1650 - struct list_head *list = &rq->arrays[arr].queue[i];
1651 + struct list_head *list = &rq->queue[i];
1652 while (!list_empty(list))
1653 migrate_dead(dead_cpu,
1654 list_entry(list->next, task_t,
1655 @@ -6226,17 +5959,13 @@ int in_sched_functions(unsigned long add
1656 void __init sched_init(void)
1657 {
1658 runqueue_t *rq;
1659 - int i, j, k;
1660 + int i, j;
1661
1662 for_each_possible_cpu(i) {
1663 - prio_array_t *array;
1664
1665 rq = cpu_rq(i);
1666 spin_lock_init(&rq->lock);
1667 rq->nr_running = 0;
1668 - rq->active = rq->arrays;
1669 - rq->expired = rq->arrays + 1;
1670 - rq->best_expired_prio = MAX_PRIO;
1671
1672 #ifdef CONFIG_SMP
1673 rq->sd = NULL;
1674 @@ -6248,16 +5977,11 @@ void __init sched_init(void)
1675 INIT_LIST_HEAD(&rq->migration_queue);
1676 #endif
1677 atomic_set(&rq->nr_iowait, 0);
1678 -
1679 - for (j = 0; j < 2; j++) {
1680 - array = rq->arrays + j;
1681 - for (k = 0; k < MAX_PRIO; k++) {
1682 - INIT_LIST_HEAD(array->queue + k);
1683 - __clear_bit(k, array->bitmap);
1684 - }
1685 - // delimiter for bitsearch
1686 - __set_bit(MAX_PRIO, array->bitmap);
1687 - }
1688 + for (j = 0; j < MAX_PRIO; j++)
1689 + INIT_LIST_HEAD(&rq->queue[j]);
1690 + memset(rq->bitmap, 0, BITS_TO_LONGS(MAX_PRIO)*sizeof(long));
1691 + /* delimiter for bitsearch */
1692 + __set_bit(MAX_PRIO, rq->bitmap);
1693 }
1694
1695 set_load_weight(&init_task);
1696 @@ -6302,9 +6026,9 @@ EXPORT_SYMBOL(__might_sleep);
1697 void normalize_rt_tasks(void)
1698 {
1699 struct task_struct *p;
1700 - prio_array_t *array;
1701 unsigned long flags;
1702 runqueue_t *rq;
1703 + int queued;
1704
1705 read_lock_irq(&tasklist_lock);
1706 for_each_process(p) {
1707 @@ -6313,11 +6037,10 @@ void normalize_rt_tasks(void)
1708
1709 rq = task_rq_lock(p, &flags);
1710
1711 - array = p->array;
1712 - if (array)
1713 + if ((queued = task_queued(p)))
1714 deactivate_task(p, task_rq(p));
1715 __setscheduler(p, SCHED_NORMAL, 0);
1716 - if (array) {
1717 + if (queued) {
1718 __activate_task(p, task_rq(p));
1719 resched_task(rq->curr);
1720 }
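
As a companion to the hunks above, a hedged sketch of the bonus earn/decay rules (inc_bonus()/dec_bonus()): the sleep-to-run ratio gates how much bonus can be accrued, one step per qualifying wakeup, while each fully used slice erodes it. The struct, the cap of 20 (the TASK_USER_PRIO of a nice 0 task) and the example numbers are illustrative assumptions, not kernel code:

#include <stdio.h>

/* Toy stand-in for the fields the patch adds to task_struct. */
struct task_model {
        unsigned int bonus;
        unsigned int max_bonus;     /* stands in for bonus(p), i.e. TASK_USER_PRIO(p) */
};

/* Bonus rises only if the task slept longer relative to its run time
 * than its current bonus, and never past its cap. */
static void inc_bonus(struct task_model *t, unsigned long totalrun_ns,
                      unsigned long sleep_ns)
{
        unsigned int best_bonus = sleep_ns / (totalrun_ns + 1);

        if (t->bonus >= best_bonus)
                return;
        if (t->bonus < t->max_bonus)
                t->bonus++;
}

/* Each fully used slice costs one bonus step. */
static void dec_bonus(struct task_model *t)
{
        if (t->bonus)
                t->bonus--;
}

int main(void)
{
        struct task_model t = { .bonus = 0, .max_bonus = 20 };  /* nice 0 task */
        int i;

        /* Sleeping roughly 30x longer than running gains one bonus step
         * per wakeup until the cap is reached. */
        for (i = 0; i < 25; i++)
                inc_bonus(&t, 1000000, 30000000);
        printf("after sleeping: bonus = %u\n", t.bonus);

        /* Using up full slices erodes the bonus again. */
        for (i = 0; i < 5; i++)
                dec_bonus(&t);
        printf("after 5 full slices: bonus = %u\n", t.bonus);
        return 0;
}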