Magellan Linux

Contents of /trunk/kernel26-alx/patches-2.6.20-r6/0001-2.6.20-sched-staircase-17.patch



Revision 1175
Thu Oct 14 12:15:46 2010 UTC by niro
File size: 53298 bytes
-2.6.20-alx-r6 new magellan 0.5.2 kernel
1 Implement the "staircase" hybrid foreground-background single priority
2 array cpu scheduler policy.
3
4 Signed-off-by: Con Kolivas <kernel@kolivas.org>
5 ---
6 fs/proc/array.c | 4
7 include/linux/sched.h | 20
8 kernel/exit.c | 1
9 kernel/sched.c | 1084 ++++++++++++++++++--------------------------------
10 4 files changed, 404 insertions(+), 705 deletions(-)
11
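Before the per-file hunks, a brief sketch of the arithmetic this patch introduces may help when reading the kernel/sched.c changes: a task's dynamic priority starts at MAX_RT_PRIO plus its nice-derived maximum bonus, is reduced (i.e. improved) by the bonus it has actually earned through sleep, and then steps down one priority level for every rr_interval() of CPU consumed from its slice(). The stand-alone user-space sketch below mirrors the helpers added by the patch (rr_interval, slice, bonus, __normal_prio); it is illustrative only, not part of the patch, and assumes HZ=1000 with the stock 2.6.20 values MAX_RT_PRIO=100 and MAX_PRIO=140. The USER_PRIO macro and the main() harness are likewise assumptions made for the example.

/* Illustrative sketch of the staircase priority math (not part of the patch).
 * Assumes HZ=1000, MAX_RT_PRIO=100, MAX_PRIO=140, and a non-batch, non-RT task. */
#include <stdio.h>

#define HZ              1000
#define MAX_RT_PRIO     100
#define MAX_PRIO        (MAX_RT_PRIO + 40)
#define MIN_USER_PRIO   (MAX_PRIO - 1)
#define RR_INTERVAL     ((6 * HZ / 1001) + 1)   /* ~6ms expressed in jiffies */
#define USER_PRIO(nice) ((nice) + 20)           /* 0..39, mirrors TASK_USER_PRIO() */

static unsigned int rr_interval(int nice)
{
        if (nice < 0)
                return RR_INTERVAL * (20 - nice) / 20;  /* negative nice gets a longer interval */
        return RR_INTERVAL;
}

static unsigned int slice(int nice)
{
        unsigned int rr = rr_interval(nice);

        return rr + (39 - USER_PRIO(nice)) * rr;        /* full staircase: one rr per priority step */
}

/* Dynamic priority after 'used' jiffies of the slice have been consumed,
 * given the bonus (0..USER_PRIO(nice)) the task has earned by sleeping. */
static int normal_prio(int nice, unsigned int used, unsigned int task_bonus)
{
        int prio = MAX_RT_PRIO + USER_PRIO(nice) - task_bonus;

        prio += used / rr_interval(nice);               /* descend one stair per rr_interval */
        return prio > MIN_USER_PRIO ? MIN_USER_PRIO : prio;
}

int main(void)
{
        int nice = 0;
        unsigned int used;

        /* A fully interactive nice-0 task walking down its staircase. */
        for (used = 0; used <= slice(nice); used += rr_interval(nice))
                printf("nice %d, used %3u jiffies -> prio %d\n",
                       nice, used, normal_prio(nice, used, USER_PRIO(nice)));
        return 0;
}

For a nice-0 task holding the maximum bonus of 20 this walks from priority 100 down to 120 over its 120-jiffy slice; with no bonus it starts at 120 and bottoms out at MIN_USER_PRIO, which is the behaviour encoded in the __normal_prio() hunk of the kernel/sched.c changes below.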
12 Index: linux-2.6.20-ck1/fs/proc/array.c
13 ===================================================================
14 --- linux-2.6.20-ck1.orig/fs/proc/array.c 2007-02-05 22:52:03.000000000 +1100
15 +++ linux-2.6.20-ck1/fs/proc/array.c 2007-02-16 19:01:30.000000000 +1100
16 @@ -165,7 +165,7 @@ static inline char * task_state(struct t
17 rcu_read_lock();
18 buffer += sprintf(buffer,
19 "State:\t%s\n"
20 - "SleepAVG:\t%lu%%\n"
21 + "Bonus:\t%d\n"
22 "Tgid:\t%d\n"
23 "Pid:\t%d\n"
24 "PPid:\t%d\n"
25 @@ -173,7 +173,7 @@ static inline char * task_state(struct t
26 "Uid:\t%d\t%d\t%d\t%d\n"
27 "Gid:\t%d\t%d\t%d\t%d\n",
28 get_task_state(p),
29 - (p->sleep_avg/1024)*100/(1020000000/1024),
30 + p->bonus,
31 p->tgid, p->pid,
32 pid_alive(p) ? rcu_dereference(p->real_parent)->tgid : 0,
33 pid_alive(p) && p->ptrace ? rcu_dereference(p->parent)->pid : 0,
34 Index: linux-2.6.20-ck1/kernel/exit.c
35 ===================================================================
36 --- linux-2.6.20-ck1.orig/kernel/exit.c 2007-02-05 22:52:04.000000000 +1100
37 +++ linux-2.6.20-ck1/kernel/exit.c 2007-02-16 19:01:30.000000000 +1100
38 @@ -170,7 +170,6 @@ repeat:
39 zap_leader = (leader->exit_signal == -1);
40 }
41
42 - sched_exit(p);
43 write_unlock_irq(&tasklist_lock);
44 proc_flush_task(p);
45 release_thread(p);
46 Index: linux-2.6.20-ck1/include/linux/sched.h
47 ===================================================================
48 --- linux-2.6.20-ck1.orig/include/linux/sched.h 2007-02-05 22:52:04.000000000 +1100
49 +++ linux-2.6.20-ck1/include/linux/sched.h 2007-02-16 19:01:30.000000000 +1100
50 @@ -524,6 +524,7 @@ struct signal_struct {
51 #define MAX_RT_PRIO MAX_USER_RT_PRIO
52
53 #define MAX_PRIO (MAX_RT_PRIO + 40)
54 +#define MIN_USER_PRIO (MAX_PRIO - 1)
55
56 #define rt_prio(prio) unlikely((prio) < MAX_RT_PRIO)
57 #define rt_task(p) rt_prio((p)->prio)
58 @@ -789,15 +790,6 @@ struct mempolicy;
59 struct pipe_inode_info;
60 struct uts_namespace;
61
62 -enum sleep_type {
63 - SLEEP_NORMAL,
64 - SLEEP_NONINTERACTIVE,
65 - SLEEP_INTERACTIVE,
66 - SLEEP_INTERRUPTED,
67 -};
68 -
69 -struct prio_array;
70 -
71 struct task_struct {
72 volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */
73 struct thread_info *thread_info;
74 @@ -815,20 +807,19 @@ struct task_struct {
75 int load_weight; /* for niceness load balancing purposes */
76 int prio, static_prio, normal_prio;
77 struct list_head run_list;
78 - struct prio_array *array;
79
80 unsigned short ioprio;
81 #ifdef CONFIG_BLK_DEV_IO_TRACE
82 unsigned int btrace_seq;
83 #endif
84 - unsigned long sleep_avg;
85 unsigned long long timestamp, last_ran;
86 + unsigned long runtime, totalrun, ns_debit, systime;
87 + unsigned int bonus;
88 + unsigned int slice, time_slice;
89 unsigned long long sched_time; /* sched_clock time spent running */
90 - enum sleep_type sleep_type;
91
92 unsigned long policy;
93 cpumask_t cpus_allowed;
94 - unsigned int time_slice, first_time_slice;
95
96 #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
97 struct sched_info sched_info;
98 @@ -1157,6 +1148,8 @@ static inline void put_task_struct(struc
99 #define PF_SPREAD_SLAB 0x02000000 /* Spread some slab caches over cpuset */
100 #define PF_MEMPOLICY 0x10000000 /* Non-default NUMA mempolicy */
101 #define PF_MUTEX_TESTER 0x20000000 /* Thread belongs to the rt mutex tester */
102 +#define PF_NONSLEEP 0x40000000 /* Waiting on in kernel activity */
103 +#define PF_FORKED 0x80000000 /* Task just forked another process */
104
105 /*
106 * Only the _current_ task can read/write to tsk->flags, but other
107 @@ -1291,7 +1284,6 @@ extern void FASTCALL(wake_up_new_task(st
108 static inline void kick_process(struct task_struct *tsk) { }
109 #endif
110 extern void FASTCALL(sched_fork(struct task_struct * p, int clone_flags));
111 -extern void FASTCALL(sched_exit(struct task_struct * p));
112
113 extern int in_group_p(gid_t);
114 extern int in_egroup_p(gid_t);
115 Index: linux-2.6.20-ck1/kernel/sched.c
116 ===================================================================
117 --- linux-2.6.20-ck1.orig/kernel/sched.c 2007-02-05 22:52:04.000000000 +1100
118 +++ linux-2.6.20-ck1/kernel/sched.c 2007-02-16 19:01:30.000000000 +1100
119 @@ -16,6 +16,10 @@
120 * by Davide Libenzi, preemptible kernel bits by Robert Love.
121 * 2003-09-03 Interactivity tuning by Con Kolivas.
122 * 2004-04-02 Scheduler domains code by Nick Piggin
123 + * 2007-02-14 Staircase scheduling policy by Con Kolivas with help
124 + * from William Lee Irwin III, Zwane Mwaikambo, Peter Williams
125 + * and Andreas Mohr.
126 + * Staircase v17
127 */
128
129 #include <linux/mm.h>
130 @@ -77,123 +81,19 @@
131 /*
132 * Some helpers for converting nanosecond timing to jiffy resolution
133 */
134 -#define NS_TO_JIFFIES(TIME) ((TIME) / (1000000000 / HZ))
135 -#define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ))
136 -
137 -/*
138 - * These are the 'tuning knobs' of the scheduler:
139 - *
140 - * Minimum timeslice is 5 msecs (or 1 jiffy, whichever is larger),
141 - * default timeslice is 100 msecs, maximum timeslice is 800 msecs.
142 - * Timeslices get refilled after they expire.
143 - */
144 -#define MIN_TIMESLICE max(5 * HZ / 1000, 1)
145 -#define DEF_TIMESLICE (100 * HZ / 1000)
146 -#define ON_RUNQUEUE_WEIGHT 30
147 -#define CHILD_PENALTY 95
148 -#define PARENT_PENALTY 100
149 -#define EXIT_WEIGHT 3
150 -#define PRIO_BONUS_RATIO 25
151 -#define MAX_BONUS (MAX_USER_PRIO * PRIO_BONUS_RATIO / 100)
152 -#define INTERACTIVE_DELTA 2
153 -#define MAX_SLEEP_AVG (DEF_TIMESLICE * MAX_BONUS)
154 -#define STARVATION_LIMIT (MAX_SLEEP_AVG)
155 -#define NS_MAX_SLEEP_AVG (JIFFIES_TO_NS(MAX_SLEEP_AVG))
156 -
157 -/*
158 - * If a task is 'interactive' then we reinsert it in the active
159 - * array after it has expired its current timeslice. (it will not
160 - * continue to run immediately, it will still roundrobin with
161 - * other interactive tasks.)
162 - *
163 - * This part scales the interactivity limit depending on niceness.
164 - *
165 - * We scale it linearly, offset by the INTERACTIVE_DELTA delta.
166 - * Here are a few examples of different nice levels:
167 - *
168 - * TASK_INTERACTIVE(-20): [1,1,1,1,1,1,1,1,1,0,0]
169 - * TASK_INTERACTIVE(-10): [1,1,1,1,1,1,1,0,0,0,0]
170 - * TASK_INTERACTIVE( 0): [1,1,1,1,0,0,0,0,0,0,0]
171 - * TASK_INTERACTIVE( 10): [1,1,0,0,0,0,0,0,0,0,0]
172 - * TASK_INTERACTIVE( 19): [0,0,0,0,0,0,0,0,0,0,0]
173 - *
174 - * (the X axis represents the possible -5 ... 0 ... +5 dynamic
175 - * priority range a task can explore, a value of '1' means the
176 - * task is rated interactive.)
177 - *
178 - * Ie. nice +19 tasks can never get 'interactive' enough to be
179 - * reinserted into the active array. And only heavily CPU-hog nice -20
180 - * tasks will be expired. Default nice 0 tasks are somewhere between,
181 - * it takes some effort for them to get interactive, but it's not
182 - * too hard.
183 - */
184 -
185 -#define CURRENT_BONUS(p) \
186 - (NS_TO_JIFFIES((p)->sleep_avg) * MAX_BONUS / \
187 - MAX_SLEEP_AVG)
188 -
189 -#define GRANULARITY (10 * HZ / 1000 ? : 1)
190 -
191 -#ifdef CONFIG_SMP
192 -#define TIMESLICE_GRANULARITY(p) (GRANULARITY * \
193 - (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)) * \
194 - num_online_cpus())
195 -#else
196 -#define TIMESLICE_GRANULARITY(p) (GRANULARITY * \
197 - (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)))
198 -#endif
199 -
200 -#define SCALE(v1,v1_max,v2_max) \
201 - (v1) * (v2_max) / (v1_max)
202 -
203 -#define DELTA(p) \
204 - (SCALE(TASK_NICE(p) + 20, 40, MAX_BONUS) - 20 * MAX_BONUS / 40 + \
205 - INTERACTIVE_DELTA)
206 -
207 -#define TASK_INTERACTIVE(p) \
208 - ((p)->prio <= (p)->static_prio - DELTA(p))
209 -
210 -#define INTERACTIVE_SLEEP(p) \
211 - (JIFFIES_TO_NS(MAX_SLEEP_AVG * \
212 - (MAX_BONUS / 2 + DELTA((p)) + 1) / MAX_BONUS - 1))
213 +#define NSJIFFY (1000000000 / HZ) /* One jiffy in ns */
214 +#define NS_TO_JIFFIES(TIME) ((TIME) / NSJIFFY)
215 +#define JIFFIES_TO_NS(TIME) ((TIME) * NSJIFFY)
216
217 #define TASK_PREEMPTS_CURR(p, rq) \
218 ((p)->prio < (rq)->curr->prio)
219
220 -#define SCALE_PRIO(x, prio) \
221 - max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_TIMESLICE)
222 -
223 -static unsigned int static_prio_timeslice(int static_prio)
224 -{
225 - if (static_prio < NICE_TO_PRIO(0))
226 - return SCALE_PRIO(DEF_TIMESLICE * 4, static_prio);
227 - else
228 - return SCALE_PRIO(DEF_TIMESLICE, static_prio);
229 -}
230 -
231 /*
232 - * task_timeslice() scales user-nice values [ -20 ... 0 ... 19 ]
233 - * to time slice values: [800ms ... 100ms ... 5ms]
234 - *
235 - * The higher a thread's priority, the bigger timeslices
236 - * it gets during one round of execution. But even the lowest
237 - * priority thread gets MIN_TIMESLICE worth of execution time.
238 + * This is the time all tasks within the same priority round robin.
239 + * Set to a minimum of 6ms.
240 */
241 -
242 -static inline unsigned int task_timeslice(struct task_struct *p)
243 -{
244 - return static_prio_timeslice(p->static_prio);
245 -}
246 -
247 -/*
248 - * These are the runqueue data structures:
249 - */
250 -
251 -struct prio_array {
252 - unsigned int nr_active;
253 - DECLARE_BITMAP(bitmap, MAX_PRIO+1); /* include 1 bit for delimiter */
254 - struct list_head queue[MAX_PRIO];
255 -};
256 +#define RR_INTERVAL ((6 * HZ / 1001) + 1)
257 +#define DEF_TIMESLICE (RR_INTERVAL * 19)
258
259 /*
260 * This is the main, per-CPU runqueue data structure.
261 @@ -224,14 +124,13 @@ struct rq {
262 */
263 unsigned long nr_uninterruptible;
264
265 - unsigned long expired_timestamp;
266 /* Cached timestamp set by update_cpu_clock() */
267 unsigned long long most_recent_timestamp;
268 struct task_struct *curr, *idle;
269 unsigned long next_balance;
270 struct mm_struct *prev_mm;
271 - struct prio_array *active, *expired, arrays[2];
272 - int best_expired_prio;
273 + unsigned long bitmap[BITS_TO_LONGS(MAX_PRIO + 1)];
274 + struct list_head queue[MAX_PRIO];
275 atomic_t nr_iowait;
276
277 #ifdef CONFIG_SMP
278 @@ -568,13 +467,7 @@ static inline struct rq *this_rq_lock(vo
279
280 #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
281 /*
282 - * Called when a process is dequeued from the active array and given
283 - * the cpu. We should note that with the exception of interactive
284 - * tasks, the expired queue will become the active queue after the active
285 - * queue is empty, without explicitly dequeuing and requeuing tasks in the
286 - * expired queue. (Interactive tasks may be requeued directly to the
287 - * active queue, thus delaying tasks in the expired queue from running;
288 - * see scheduler_tick()).
289 + * Called when a process is dequeued and given the cpu.
290 *
291 * This function is only called from sched_info_arrive(), rather than
292 * dequeue_task(). Even though a task may be queued and dequeued multiple
293 @@ -607,13 +500,11 @@ static void sched_info_arrive(struct tas
294 }
295
296 /*
297 - * Called when a process is queued into either the active or expired
298 - * array. The time is noted and later used to determine how long we
299 - * had to wait for us to reach the cpu. Since the expired queue will
300 - * become the active queue after active queue is empty, without dequeuing
301 - * and requeuing any tasks, we are interested in queuing to either. It
302 - * is unusual but not impossible for tasks to be dequeued and immediately
303 - * requeued in the same or another array: this can happen in sched_yield(),
304 + * Called when a process is queued.
305 + * The time is noted and later used to determine how long we had to wait for
306 + * us to reach the cpu.
307 + * It is unusual but not impossible for tasks to be dequeued and immediately
308 + * requeued: this can happen in sched_yield(),
309 * set_user_nice(), and even load_balance() as it moves tasks from runqueue
310 * to runqueue.
311 *
312 @@ -672,73 +563,81 @@ sched_info_switch(struct task_struct *pr
313 #define sched_info_switch(t, next) do { } while (0)
314 #endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */
315
316 -/*
317 - * Adding/removing a task to/from a priority array:
318 - */
319 -static void dequeue_task(struct task_struct *p, struct prio_array *array)
320 +#if BITS_PER_LONG < 64
321 +static inline void longlimit(unsigned long long *longlong)
322 +{
323 + if (*longlong > (1 << 31))
324 + *longlong = 1 << 31;
325 +}
326 +#else
327 +static inline void longlimit(unsigned long long *__unused)
328 +{
329 +}
330 +#endif
331 +
332 +/* Get nanosecond clock difference without overflowing unsigned long. */
333 +static unsigned long ns_diff(unsigned long long v1, unsigned long long v2)
334 {
335 - array->nr_active--;
336 - list_del(&p->run_list);
337 - if (list_empty(array->queue + p->prio))
338 - __clear_bit(p->prio, array->bitmap);
339 + unsigned long long vdiff;
340 + if (likely(v1 >= v2)) {
341 + vdiff = v1 - v2;
342 + longlimit(&vdiff);
343 + } else {
344 + /*
345 + * Rarely the clock appears to go backwards. There should
346 + * always be a positive difference so return 1.
347 + */
348 + vdiff = 1;
349 + }
350 + return (unsigned long)vdiff;
351 }
352
353 -static void enqueue_task(struct task_struct *p, struct prio_array *array)
354 +static inline int task_queued(struct task_struct *task)
355 {
356 - sched_info_queued(p);
357 - list_add_tail(&p->run_list, array->queue + p->prio);
358 - __set_bit(p->prio, array->bitmap);
359 - array->nr_active++;
360 - p->array = array;
361 + return !list_empty(&task->run_list);
362 }
363
364 /*
365 - * Put task to the end of the run list without the overhead of dequeue
366 - * followed by enqueue.
367 + * Adding/removing a task to/from a runqueue:
368 */
369 -static void requeue_task(struct task_struct *p, struct prio_array *array)
370 +static void dequeue_task(struct task_struct *p, struct rq *rq)
371 {
372 - list_move_tail(&p->run_list, array->queue + p->prio);
373 + list_del_init(&p->run_list);
374 + if (list_empty(rq->queue + p->prio))
375 + __clear_bit(p->prio, rq->bitmap);
376 + p->ns_debit = 0;
377 }
378
379 -static inline void
380 -enqueue_task_head(struct task_struct *p, struct prio_array *array)
381 +static void enqueue_task(struct task_struct *p, struct rq *rq)
382 {
383 - list_add(&p->run_list, array->queue + p->prio);
384 - __set_bit(p->prio, array->bitmap);
385 - array->nr_active++;
386 - p->array = array;
387 + list_add_tail(&p->run_list, rq->queue + p->prio);
388 + __set_bit(p->prio, rq->bitmap);
389 }
390
391 /*
392 - * __normal_prio - return the priority that is based on the static
393 - * priority but is modified by bonuses/penalties.
394 - *
395 - * We scale the actual sleep average [0 .... MAX_SLEEP_AVG]
396 - * into the -5 ... 0 ... +5 bonus/penalty range.
397 - *
398 - * We use 25% of the full 0...39 priority range so that:
399 - *
400 - * 1) nice +19 interactive tasks do not preempt nice 0 CPU hogs.
401 - * 2) nice -20 CPU hogs do not get preempted by nice 0 tasks.
402 - *
403 - * Both properties are important to certain workloads.
404 + * Put task to the end of the run list without the overhead of dequeue
405 + * followed by enqueue.
406 */
407 -
408 -static inline int __normal_prio(struct task_struct *p)
409 +static void requeue_task(struct task_struct *p, struct rq *rq, const int prio)
410 {
411 - int bonus, prio;
412 -
413 - bonus = CURRENT_BONUS(p) - MAX_BONUS / 2;
414 + list_move_tail(&p->run_list, rq->queue + prio);
415 + if (p->prio != prio) {
416 + if (list_empty(rq->queue + p->prio))
417 + __clear_bit(p->prio, rq->bitmap);
418 + p->prio = prio;
419 + __set_bit(prio, rq->bitmap);
420 + }
421 + p->ns_debit = 0;
422 +}
423
424 - prio = p->static_prio - bonus;
425 - if (prio < MAX_RT_PRIO)
426 - prio = MAX_RT_PRIO;
427 - if (prio > MAX_PRIO-1)
428 - prio = MAX_PRIO-1;
429 - return prio;
430 +static inline void enqueue_task_head(struct task_struct *p, struct rq *rq)
431 +{
432 + list_add(&p->run_list, rq->queue + p->prio);
433 + __set_bit(p->prio, rq->bitmap);
434 }
435
436 +static unsigned int slice(const struct task_struct *p);
437 +
438 /*
439 * To aid in avoiding the subversion of "niceness" due to uneven distribution
440 * of tasks with abnormal "nice" values across CPUs the contribution that
441 @@ -756,10 +655,9 @@ static inline int __normal_prio(struct t
442 #define TIME_SLICE_NICE_ZERO DEF_TIMESLICE
443 #define LOAD_WEIGHT(lp) \
444 (((lp) * SCHED_LOAD_SCALE) / TIME_SLICE_NICE_ZERO)
445 -#define PRIO_TO_LOAD_WEIGHT(prio) \
446 - LOAD_WEIGHT(static_prio_timeslice(prio))
447 -#define RTPRIO_TO_LOAD_WEIGHT(rp) \
448 - (PRIO_TO_LOAD_WEIGHT(MAX_RT_PRIO) + LOAD_WEIGHT(rp))
449 +#define TASK_LOAD_WEIGHT(p) LOAD_WEIGHT(slice(p))
450 +#define RTPRIO_TO_LOAD_WEIGHT(rp) \
451 + (LOAD_WEIGHT((RR_INTERVAL + 20 + (rp))))
452
453 static void set_load_weight(struct task_struct *p)
454 {
455 @@ -776,7 +674,7 @@ static void set_load_weight(struct task_
456 #endif
457 p->load_weight = RTPRIO_TO_LOAD_WEIGHT(p->rt_priority);
458 } else
459 - p->load_weight = PRIO_TO_LOAD_WEIGHT(p->static_prio);
460 + p->load_weight = TASK_LOAD_WEIGHT(p);
461 }
462
463 static inline void
464 @@ -804,6 +702,182 @@ static inline void dec_nr_running(struct
465 }
466
467 /*
468 + * __activate_task - move a task to the runqueue.
469 + */
470 +static inline void __activate_task(struct task_struct *p, struct rq *rq)
471 +{
472 + enqueue_task(p, rq);
473 + inc_nr_running(p, rq);
474 +}
475 +
476 +/*
477 + * __activate_idle_task - move idle task to the _front_ of runqueue.
478 + */
479 +static inline void __activate_idle_task(struct task_struct *p, struct rq *rq)
480 +{
481 + enqueue_task_head(p, rq);
482 + inc_nr_running(p, rq);
483 +}
484 +
485 +/*
486 + * Bonus - How much higher than its base priority an interactive task can run.
487 + */
488 +static inline unsigned int bonus(const struct task_struct *p)
489 +{
490 + return TASK_USER_PRIO(p);
491 +}
492 +
493 +static unsigned int rr_interval(const struct task_struct *p)
494 +{
495 + int nice = TASK_NICE(p);
496 +
497 + if (nice < 0 && !rt_task(p))
498 + return RR_INTERVAL * (20 - nice) / 20;
499 + return RR_INTERVAL;
500 +}
501 +
502 +/*
503 + * slice - the duration a task runs before getting requeued at its best
504 + * priority and has its bonus decremented.
505 + */
506 +static unsigned int slice(const struct task_struct *p)
507 +{
508 + unsigned int slice, rr;
509 +
510 + slice = rr = rr_interval(p);
511 + if (likely(!rt_task(p)))
512 + slice += (39 - TASK_USER_PRIO(p)) * rr;
513 + return slice;
514 +}
515 +
516 +/*
517 + * We increase our bonus by sleeping more than the time we ran.
518 + * The ratio of sleep to run gives us the cpu% that we last ran and determines
519 + * the maximum bonus we can acquire.
520 + */
521 +static void inc_bonus(struct task_struct *p, unsigned long totalrun, unsigned long sleep)
522 +{
523 + unsigned int best_bonus = sleep / (totalrun + 1);
524 +
525 + if (p->bonus >= best_bonus)
526 + return;
527 + best_bonus = bonus(p);
528 + if (p->bonus < best_bonus)
529 + p->bonus++;
530 +}
531 +
532 +static inline void dec_bonus(struct task_struct *p)
533 +{
534 + if (p->bonus)
535 + p->bonus--;
536 +}
537 +
538 +static inline void slice_overrun(struct task_struct *p)
539 +{
540 + unsigned long ns_slice = JIFFIES_TO_NS(p->slice);
541 +
542 + do {
543 + p->totalrun -= ns_slice;
544 + dec_bonus(p);
545 + } while (unlikely(p->totalrun > ns_slice));
546 +}
547 +
548 +static inline void continue_slice(struct task_struct *p)
549 +{
550 + unsigned long total_run = NS_TO_JIFFIES(p->totalrun);
551 +
552 + if (unlikely(total_run >= p->slice))
553 + slice_overrun(p);
554 + else {
555 + unsigned long remainder;
556 +
557 + p->slice -= total_run;
558 + remainder = p->slice % rr_interval(p);
559 + if (remainder)
560 + p->time_slice = remainder;
561 + }
562 +}
563 +
564 +/*
565 + * recalc_task_prio - this checks for tasks that have run less than a full
566 + * slice and have woken up again soon after, or have just forked a
567 + * thread/process and make them continue their old slice instead of starting
568 + * a new one at high priority.
569 + */
570 +static inline void recalc_task_prio(struct task_struct *p, const unsigned long long now)
571 +{
572 + unsigned long sleep_time;
573 +
574 + /*
575 + * If this task has managed to run to its lowest priority then
576 + * decrease its bonus and requeue it now at best priority instead
577 + * of possibly flagging around lowest priority. Save up any systime
578 + * that may affect priority on the next reschedule.
579 + */
580 + if (p->slice > p->time_slice &&
581 + p->slice - NS_TO_JIFFIES(p->totalrun) < p->time_slice) {
582 + dec_bonus(p);
583 + p->totalrun = 0;
584 + return;
585 + }
586 +
587 + /*
588 + * Add the total for this last scheduled run (p->runtime) and system
589 + * time (p->systime) done on behalf of p to the running total so far
590 + * used (p->totalrun).
591 + */
592 + p->totalrun += p->runtime + p->systime;
593 + sleep_time = ns_diff(now, p->timestamp);
594 +
595 + if (p->systime > sleep_time || p->flags & PF_FORKED)
596 + sleep_time = 0;
597 + else {
598 + sleep_time -= p->systime;
599 + /*
600 + * We elevate priority by the amount of time we slept. If we
601 + * sleep longer than our running total and have not set the
602 + * PF_NONSLEEP flag we gain a bonus.
603 + */
604 + if (sleep_time >= p->totalrun) {
605 + if (!(p->flags & PF_NONSLEEP))
606 + inc_bonus(p, p->totalrun, sleep_time);
607 + p->totalrun = 0;
608 + return;
609 + }
610 + p->totalrun -= sleep_time;
611 + }
612 + continue_slice(p);
613 +}
614 +
615 +/*
616 + * __normal_prio - dynamic priority dependent on bonus.
617 + * The priority normally decreases by one each RR_INTERVAL.
618 + * As the bonus increases the initial priority starts at a higher "stair" or
619 + * priority for longer.
620 + */
621 +static inline int __normal_prio(struct task_struct *p)
622 +{
623 + int prio;
624 + unsigned int full_slice, used_slice = 0;
625 + unsigned int best_bonus, rr;
626 +
627 + full_slice = slice(p);
628 + if (full_slice > p->slice)
629 + used_slice = full_slice - p->slice;
630 +
631 + best_bonus = bonus(p);
632 + prio = MAX_RT_PRIO + best_bonus;
633 + if (!batch_task(p))
634 + prio -= p->bonus;
635 +
636 + rr = rr_interval(p);
637 + prio += used_slice / rr;
638 + if (prio > MIN_USER_PRIO)
639 + prio = MIN_USER_PRIO;
640 + return prio;
641 +}
642 +
643 +/*
644 * Calculate the expected normal priority: i.e. priority
645 * without taking RT-inheritance into account. Might be
646 * boosted by interactivity modifiers. Changes upon fork,
647 @@ -842,111 +916,14 @@ static int effective_prio(struct task_st
648 }
649
650 /*
651 - * __activate_task - move a task to the runqueue.
652 - */
653 -static void __activate_task(struct task_struct *p, struct rq *rq)
654 -{
655 - struct prio_array *target = rq->active;
656 -
657 - if (batch_task(p))
658 - target = rq->expired;
659 - enqueue_task(p, target);
660 - inc_nr_running(p, rq);
661 -}
662 -
663 -/*
664 - * __activate_idle_task - move idle task to the _front_ of runqueue.
665 - */
666 -static inline void __activate_idle_task(struct task_struct *p, struct rq *rq)
667 -{
668 - enqueue_task_head(p, rq->active);
669 - inc_nr_running(p, rq);
670 -}
671 -
672 -/*
673 - * Recalculate p->normal_prio and p->prio after having slept,
674 - * updating the sleep-average too:
675 - */
676 -static int recalc_task_prio(struct task_struct *p, unsigned long long now)
677 -{
678 - /* Caller must always ensure 'now >= p->timestamp' */
679 - unsigned long sleep_time = now - p->timestamp;
680 -
681 - if (batch_task(p))
682 - sleep_time = 0;
683 -
684 - if (likely(sleep_time > 0)) {
685 - /*
686 - * This ceiling is set to the lowest priority that would allow
687 - * a task to be reinserted into the active array on timeslice
688 - * completion.
689 - */
690 - unsigned long ceiling = INTERACTIVE_SLEEP(p);
691 -
692 - if (p->mm && sleep_time > ceiling && p->sleep_avg < ceiling) {
693 - /*
694 - * Prevents user tasks from achieving best priority
695 - * with one single large enough sleep.
696 - */
697 - p->sleep_avg = ceiling;
698 - /*
699 - * Using INTERACTIVE_SLEEP() as a ceiling places a
700 - * nice(0) task 1ms sleep away from promotion, and
701 - * gives it 700ms to round-robin with no chance of
702 - * being demoted. This is more than generous, so
703 - * mark this sleep as non-interactive to prevent the
704 - * on-runqueue bonus logic from intervening should
705 - * this task not receive cpu immediately.
706 - */
707 - p->sleep_type = SLEEP_NONINTERACTIVE;
708 - } else {
709 - /*
710 - * Tasks waking from uninterruptible sleep are
711 - * limited in their sleep_avg rise as they
712 - * are likely to be waiting on I/O
713 - */
714 - if (p->sleep_type == SLEEP_NONINTERACTIVE && p->mm) {
715 - if (p->sleep_avg >= ceiling)
716 - sleep_time = 0;
717 - else if (p->sleep_avg + sleep_time >=
718 - ceiling) {
719 - p->sleep_avg = ceiling;
720 - sleep_time = 0;
721 - }
722 - }
723 -
724 - /*
725 - * This code gives a bonus to interactive tasks.
726 - *
727 - * The boost works by updating the 'average sleep time'
728 - * value here, based on ->timestamp. The more time a
729 - * task spends sleeping, the higher the average gets -
730 - * and the higher the priority boost gets as well.
731 - */
732 - p->sleep_avg += sleep_time;
733 -
734 - }
735 - if (p->sleep_avg > NS_MAX_SLEEP_AVG)
736 - p->sleep_avg = NS_MAX_SLEEP_AVG;
737 - }
738 -
739 - return effective_prio(p);
740 -}
741 -
742 -/*
743 * activate_task - move a task to the runqueue and do priority recalculation
744 *
745 - * Update all the scheduling statistics stuff. (sleep average
746 - * calculation, priority modifiers, etc.)
747 */
748 static void activate_task(struct task_struct *p, struct rq *rq, int local)
749 {
750 - unsigned long long now;
751 -
752 - if (rt_task(p))
753 - goto out;
754 + unsigned long long now = sched_clock();
755 + unsigned long rr = rr_interval(p);
756
757 - now = sched_clock();
758 #ifdef CONFIG_SMP
759 if (!local) {
760 /* Compensate for drifting sched_clock */
761 @@ -967,32 +944,15 @@ static void activate_task(struct task_st
762 (now - p->timestamp) >> 20);
763 }
764
765 - p->prio = recalc_task_prio(p, now);
766 -
767 - /*
768 - * This checks to make sure it's not an uninterruptible task
769 - * that is now waking up.
770 - */
771 - if (p->sleep_type == SLEEP_NORMAL) {
772 - /*
773 - * Tasks which were woken up by interrupts (ie. hw events)
774 - * are most likely of interactive nature. So we give them
775 - * the credit of extending their sleep time to the period
776 - * of time they spend on the runqueue, waiting for execution
777 - * on a CPU, first time around:
778 - */
779 - if (in_interrupt())
780 - p->sleep_type = SLEEP_INTERRUPTED;
781 - else {
782 - /*
783 - * Normal first-time wakeups get a credit too for
784 - * on-runqueue time, but it will be weighted down:
785 - */
786 - p->sleep_type = SLEEP_INTERACTIVE;
787 - }
788 + p->slice = slice(p);
789 + p->time_slice = p->slice % rr ? : rr;
790 + if (!rt_task(p)) {
791 + recalc_task_prio(p, now);
792 + p->prio = effective_prio(p);
793 + p->systime = 0;
794 + p->flags &= ~(PF_FORKED | PF_NONSLEEP);
795 }
796 p->timestamp = now;
797 -out:
798 __activate_task(p, rq);
799 }
800
801 @@ -1002,8 +962,7 @@ out:
802 static void deactivate_task(struct task_struct *p, struct rq *rq)
803 {
804 dec_nr_running(p, rq);
805 - dequeue_task(p, p->array);
806 - p->array = NULL;
807 + dequeue_task(p, rq);
808 }
809
810 /*
811 @@ -1085,7 +1044,7 @@ migrate_task(struct task_struct *p, int
812 * If the task is not on a runqueue (and not running), then
813 * it is sufficient to simply update the task's cpu field.
814 */
815 - if (!p->array && !task_running(rq, p)) {
816 + if (!task_queued(p) && !task_running(rq, p)) {
817 set_task_cpu(p, dest_cpu);
818 return 0;
819 }
820 @@ -1116,7 +1075,7 @@ void wait_task_inactive(struct task_stru
821 repeat:
822 rq = task_rq_lock(p, &flags);
823 /* Must be off runqueue entirely, not preempted. */
824 - if (unlikely(p->array || task_running(rq, p))) {
825 + if (unlikely(task_queued(p) || task_running(rq, p))) {
826 /* If it's preempted, we yield. It could be a while. */
827 preempted = !task_running(rq, p);
828 task_rq_unlock(rq, &flags);
829 @@ -1381,6 +1340,16 @@ static inline int wake_idle(int cpu, str
830 }
831 #endif
832
833 +/*
834 + * Check to see if p preempts rq->curr and resched if it does.
835 + */
836 +static inline void preempt(const struct task_struct *p, struct rq *rq)
837 +{
838 + if (TASK_PREEMPTS_CURR(p, rq))
839 + resched_task(rq->curr);
840 +}
841 +
842 +
843 /***
844 * try_to_wake_up - wake up a thread
845 * @p: the to-be-woken-up thread
846 @@ -1412,7 +1381,7 @@ static int try_to_wake_up(struct task_st
847 if (!(old_state & state))
848 goto out;
849
850 - if (p->array)
851 + if (task_queued(p))
852 goto out_running;
853
854 cpu = task_cpu(p);
855 @@ -1505,7 +1474,7 @@ out_set_cpu:
856 old_state = p->state;
857 if (!(old_state & state))
858 goto out;
859 - if (p->array)
860 + if (task_queued(p))
861 goto out_running;
862
863 this_cpu = smp_processor_id();
864 @@ -1514,25 +1483,9 @@ out_set_cpu:
865
866 out_activate:
867 #endif /* CONFIG_SMP */
868 - if (old_state == TASK_UNINTERRUPTIBLE) {
869 + if (old_state == TASK_UNINTERRUPTIBLE)
870 rq->nr_uninterruptible--;
871 - /*
872 - * Tasks on involuntary sleep don't earn
873 - * sleep_avg beyond just interactive state.
874 - */
875 - p->sleep_type = SLEEP_NONINTERACTIVE;
876 - } else
877 -
878 - /*
879 - * Tasks that have marked their sleep as noninteractive get
880 - * woken up with their sleep average not weighted in an
881 - * interactive way.
882 - */
883 - if (old_state & TASK_NONINTERACTIVE)
884 - p->sleep_type = SLEEP_NONINTERACTIVE;
885 -
886
887 - activate_task(p, rq, cpu == this_cpu);
888 /*
889 * Sync wakeups (i.e. those types of wakeups where the waker
890 * has indicated that it will leave the CPU in short order)
891 @@ -1541,10 +1494,9 @@ out_activate:
892 * the waker guarantees that the freshly woken up task is going
893 * to be considered on this CPU.)
894 */
895 - if (!sync || cpu != this_cpu) {
896 - if (TASK_PREEMPTS_CURR(p, rq))
897 - resched_task(rq->curr);
898 - }
899 + activate_task(p, rq, cpu == this_cpu);
900 + if (!sync || cpu != this_cpu)
901 + preempt(p, rq);
902 success = 1;
903
904 out_running:
905 @@ -1595,7 +1547,6 @@ void fastcall sched_fork(struct task_str
906 p->prio = current->normal_prio;
907
908 INIT_LIST_HEAD(&p->run_list);
909 - p->array = NULL;
910 #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
911 if (unlikely(sched_info_on()))
912 memset(&p->sched_info, 0, sizeof(p->sched_info));
913 @@ -1607,30 +1558,6 @@ void fastcall sched_fork(struct task_str
914 /* Want to start with kernel preemption disabled. */
915 task_thread_info(p)->preempt_count = 1;
916 #endif
917 - /*
918 - * Share the timeslice between parent and child, thus the
919 - * total amount of pending timeslices in the system doesn't change,
920 - * resulting in more scheduling fairness.
921 - */
922 - local_irq_disable();
923 - p->time_slice = (current->time_slice + 1) >> 1;
924 - /*
925 - * The remainder of the first timeslice might be recovered by
926 - * the parent if the child exits early enough.
927 - */
928 - p->first_time_slice = 1;
929 - current->time_slice >>= 1;
930 - p->timestamp = sched_clock();
931 - if (unlikely(!current->time_slice)) {
932 - /*
933 - * This case is rare, it happens when the parent has only
934 - * a single jiffy left from its timeslice. Taking the
935 - * runqueue lock is not a problem.
936 - */
937 - current->time_slice = 1;
938 - task_running_tick(cpu_rq(cpu), current);
939 - }
940 - local_irq_enable();
941 put_cpu();
942 }
943
944 @@ -1652,38 +1579,20 @@ void fastcall wake_up_new_task(struct ta
945 this_cpu = smp_processor_id();
946 cpu = task_cpu(p);
947
948 - /*
949 - * We decrease the sleep average of forking parents
950 - * and children as well, to keep max-interactive tasks
951 - * from forking tasks that are max-interactive. The parent
952 - * (current) is done further down, under its lock.
953 - */
954 - p->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(p) *
955 - CHILD_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS);
956 -
957 - p->prio = effective_prio(p);
958 + /* Forked process gets no bonus to prevent fork bombs. */
959 + p->bonus = 0;
960 + current->flags |= PF_FORKED;
961
962 if (likely(cpu == this_cpu)) {
963 + activate_task(p, rq, 1);
964 if (!(clone_flags & CLONE_VM)) {
965 /*
966 * The VM isn't cloned, so we're in a good position to
967 * do child-runs-first in anticipation of an exec. This
968 * usually avoids a lot of COW overhead.
969 */
970 - if (unlikely(!current->array))
971 - __activate_task(p, rq);
972 - else {
973 - p->prio = current->prio;
974 - p->normal_prio = current->normal_prio;
975 - list_add_tail(&p->run_list, &current->run_list);
976 - p->array = current->array;
977 - p->array->nr_active++;
978 - inc_nr_running(p, rq);
979 - }
980 set_need_resched();
981 - } else
982 - /* Run child last */
983 - __activate_task(p, rq);
984 + }
985 /*
986 * We skip the following code due to cpu == this_cpu
987 *
988 @@ -1700,53 +1609,19 @@ void fastcall wake_up_new_task(struct ta
989 */
990 p->timestamp = (p->timestamp - this_rq->most_recent_timestamp)
991 + rq->most_recent_timestamp;
992 - __activate_task(p, rq);
993 - if (TASK_PREEMPTS_CURR(p, rq))
994 - resched_task(rq->curr);
995 + activate_task(p, rq, 0);
996 + preempt(p, rq);
997
998 /*
999 * Parent and child are on different CPUs, now get the
1000 - * parent runqueue to update the parent's ->sleep_avg:
1001 + * parent runqueue to update the parent's ->flags:
1002 */
1003 task_rq_unlock(rq, &flags);
1004 this_rq = task_rq_lock(current, &flags);
1005 }
1006 - current->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(current) *
1007 - PARENT_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS);
1008 task_rq_unlock(this_rq, &flags);
1009 }
1010
1011 -/*
1012 - * Potentially available exiting-child timeslices are
1013 - * retrieved here - this way the parent does not get
1014 - * penalized for creating too many threads.
1015 - *
1016 - * (this cannot be used to 'generate' timeslices
1017 - * artificially, because any timeslice recovered here
1018 - * was given away by the parent in the first place.)
1019 - */
1020 -void fastcall sched_exit(struct task_struct *p)
1021 -{
1022 - unsigned long flags;
1023 - struct rq *rq;
1024 -
1025 - /*
1026 - * If the child was a (relative-) CPU hog then decrease
1027 - * the sleep_avg of the parent as well.
1028 - */
1029 - rq = task_rq_lock(p->parent, &flags);
1030 - if (p->first_time_slice && task_cpu(p) == task_cpu(p->parent)) {
1031 - p->parent->time_slice += p->time_slice;
1032 - if (unlikely(p->parent->time_slice > task_timeslice(p)))
1033 - p->parent->time_slice = task_timeslice(p);
1034 - }
1035 - if (p->sleep_avg < p->parent->sleep_avg)
1036 - p->parent->sleep_avg = p->parent->sleep_avg /
1037 - (EXIT_WEIGHT + 1) * EXIT_WEIGHT + p->sleep_avg /
1038 - (EXIT_WEIGHT + 1);
1039 - task_rq_unlock(rq, &flags);
1040 -}
1041 -
1042 /**
1043 * prepare_task_switch - prepare to switch tasks
1044 * @rq: the runqueue preparing to switch
1045 @@ -2068,23 +1943,21 @@ void sched_exec(void)
1046 * pull_task - move a task from a remote runqueue to the local runqueue.
1047 * Both runqueues must be locked.
1048 */
1049 -static void pull_task(struct rq *src_rq, struct prio_array *src_array,
1050 - struct task_struct *p, struct rq *this_rq,
1051 - struct prio_array *this_array, int this_cpu)
1052 +static void pull_task(struct rq *src_rq, struct task_struct *p,
1053 + struct rq *this_rq, int this_cpu)
1054 {
1055 - dequeue_task(p, src_array);
1056 + dequeue_task(p, src_rq);
1057 dec_nr_running(p, src_rq);
1058 set_task_cpu(p, this_cpu);
1059 inc_nr_running(p, this_rq);
1060 - enqueue_task(p, this_array);
1061 + enqueue_task(p, this_rq);
1062 p->timestamp = (p->timestamp - src_rq->most_recent_timestamp)
1063 + this_rq->most_recent_timestamp;
1064 /*
1065 * Note that idle threads have a prio of MAX_PRIO, for this test
1066 * to be always true for them.
1067 */
1068 - if (TASK_PREEMPTS_CURR(p, this_rq))
1069 - resched_task(this_rq->curr);
1070 + preempt(p, this_rq);
1071 }
1072
1073 /*
1074 @@ -2127,8 +2000,6 @@ int can_migrate_task(struct task_struct
1075 return 1;
1076 }
1077
1078 -#define rq_best_prio(rq) min((rq)->curr->prio, (rq)->best_expired_prio)
1079 -
1080 /*
1081 * move_tasks tries to move up to max_nr_move tasks and max_load_move weighted
1082 * load from busiest to this_rq, as part of a balancing operation within
1083 @@ -2143,7 +2014,6 @@ static int move_tasks(struct rq *this_rq
1084 {
1085 int idx, pulled = 0, pinned = 0, this_best_prio, best_prio,
1086 best_prio_seen, skip_for_load;
1087 - struct prio_array *array, *dst_array;
1088 struct list_head *head, *curr;
1089 struct task_struct *tmp;
1090 long rem_load_move;
1091 @@ -2153,8 +2023,8 @@ static int move_tasks(struct rq *this_rq
1092
1093 rem_load_move = max_load_move;
1094 pinned = 1;
1095 - this_best_prio = rq_best_prio(this_rq);
1096 - best_prio = rq_best_prio(busiest);
1097 + this_best_prio = this_rq->curr->prio;
1098 + best_prio = busiest->curr->prio;
1099 /*
1100 * Enable handling of the case where there is more than one task
1101 * with the best priority. If the current running task is one
1102 @@ -2164,38 +2034,17 @@ static int move_tasks(struct rq *this_rq
1103 */
1104 best_prio_seen = best_prio == busiest->curr->prio;
1105
1106 - /*
1107 - * We first consider expired tasks. Those will likely not be
1108 - * executed in the near future, and they are most likely to
1109 - * be cache-cold, thus switching CPUs has the least effect
1110 - * on them.
1111 - */
1112 - if (busiest->expired->nr_active) {
1113 - array = busiest->expired;
1114 - dst_array = this_rq->expired;
1115 - } else {
1116 - array = busiest->active;
1117 - dst_array = this_rq->active;
1118 - }
1119 -
1120 -new_array:
1121 /* Start searching at priority 0: */
1122 idx = 0;
1123 skip_bitmap:
1124 if (!idx)
1125 - idx = sched_find_first_bit(array->bitmap);
1126 + idx = sched_find_first_bit(busiest->bitmap);
1127 else
1128 - idx = find_next_bit(array->bitmap, MAX_PRIO, idx);
1129 - if (idx >= MAX_PRIO) {
1130 - if (array == busiest->expired && busiest->active->nr_active) {
1131 - array = busiest->active;
1132 - dst_array = this_rq->active;
1133 - goto new_array;
1134 - }
1135 + idx = find_next_bit(busiest->bitmap, MAX_PRIO, idx);
1136 + if (idx >= MAX_PRIO)
1137 goto out;
1138 - }
1139
1140 - head = array->queue + idx;
1141 + head = busiest->queue + idx;
1142 curr = head->prev;
1143 skip_queue:
1144 tmp = list_entry(curr, struct task_struct, run_list);
1145 @@ -2220,7 +2069,7 @@ skip_queue:
1146 goto skip_bitmap;
1147 }
1148
1149 - pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu);
1150 + pull_task(busiest, tmp, this_rq, this_cpu);
1151 pulled++;
1152 rem_load_move -= tmp->load_weight;
1153
1154 @@ -3036,27 +2885,6 @@ unsigned long long current_sched_time(co
1155 }
1156
1157 /*
1158 - * We place interactive tasks back into the active array, if possible.
1159 - *
1160 - * To guarantee that this does not starve expired tasks we ignore the
1161 - * interactivity of a task if the first expired task had to wait more
1162 - * than a 'reasonable' amount of time. This deadline timeout is
1163 - * load-dependent, as the frequency of array switched decreases with
1164 - * increasing number of running tasks. We also ignore the interactivity
1165 - * if a better static_prio task has expired:
1166 - */
1167 -static inline int expired_starving(struct rq *rq)
1168 -{
1169 - if (rq->curr->static_prio > rq->best_expired_prio)
1170 - return 1;
1171 - if (!STARVATION_LIMIT || !rq->expired_timestamp)
1172 - return 0;
1173 - if (jiffies - rq->expired_timestamp > STARVATION_LIMIT * rq->nr_running)
1174 - return 1;
1175 - return 0;
1176 -}
1177 -
1178 -/*
1179 * Account user cpu time to a process.
1180 * @p: the process that the cpu time gets accounted to
1181 * @hardirq_offset: the offset to subtract from hardirq_count()
1182 @@ -3104,6 +2932,7 @@ void account_system_time(struct task_str
1183 cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
1184 else
1185 cpustat->idle = cputime64_add(cpustat->idle, tmp);
1186 + p->systime += NSJIFFY;
1187 /* Account for system time used */
1188 acct_update_integrals(p);
1189 }
1190 @@ -3129,76 +2958,49 @@ void account_steal_time(struct task_stru
1191 cpustat->steal = cputime64_add(cpustat->steal, tmp);
1192 }
1193
1194 +static void time_slice_expired(struct task_struct *p, struct rq *rq)
1195 +{
1196 + set_tsk_need_resched(p);
1197 + p->time_slice = rr_interval(p);
1198 + requeue_task(p, rq, effective_prio(p));
1199 +}
1200 +
1201 static void task_running_tick(struct rq *rq, struct task_struct *p)
1202 {
1203 - if (p->array != rq->active) {
1204 + unsigned long debit;
1205 +
1206 + if (unlikely(!task_queued(p))) {
1207 /* Task has expired but was not scheduled yet */
1208 set_tsk_need_resched(p);
1209 return;
1210 }
1211 + /* SCHED_FIFO tasks never run out of timeslice. */
1212 + if (unlikely(p->policy == SCHED_FIFO))
1213 + return;
1214 +
1215 spin_lock(&rq->lock);
1216 + debit = ns_diff(rq->most_recent_timestamp, p->timestamp);
1217 + p->ns_debit += debit;
1218 + if (p->ns_debit < NSJIFFY)
1219 + goto out_unlock;
1220 + p->ns_debit %= NSJIFFY;
1221 /*
1222 - * The task was running during this tick - update the
1223 - * time slice counter. Note: we do not update a thread's
1224 - * priority until it either goes to sleep or uses up its
1225 - * timeslice. This makes it possible for interactive tasks
1226 - * to use up their timeslices at their highest priority levels.
1227 + * Tasks lose bonus each time they use up a full slice().
1228 */
1229 - if (rt_task(p)) {
1230 - /*
1231 - * RR tasks need a special form of timeslice management.
1232 - * FIFO tasks have no timeslices.
1233 - */
1234 - if ((p->policy == SCHED_RR) && !--p->time_slice) {
1235 - p->time_slice = task_timeslice(p);
1236 - p->first_time_slice = 0;
1237 - set_tsk_need_resched(p);
1238 -
1239 - /* put it at the end of the queue: */
1240 - requeue_task(p, rq->active);
1241 - }
1242 + if (!--p->slice) {
1243 + dec_bonus(p);
1244 + p->totalrun = 0;
1245 + p->slice = slice(p);
1246 + time_slice_expired(p, rq);
1247 goto out_unlock;
1248 }
1249 + /*
1250 + * Tasks that run out of time_slice but still have slice left get
1251 + * requeued with a lower priority && RR_INTERVAL time_slice.
1252 + */
1253 if (!--p->time_slice) {
1254 - dequeue_task(p, rq->active);
1255 - set_tsk_need_resched(p);
1256 - p->prio = effective_prio(p);
1257 - p->time_slice = task_timeslice(p);
1258 - p->first_time_slice = 0;
1259 -
1260 - if (!rq->expired_timestamp)
1261 - rq->expired_timestamp = jiffies;
1262 - if (!TASK_INTERACTIVE(p) || expired_starving(rq)) {
1263 - enqueue_task(p, rq->expired);
1264 - if (p->static_prio < rq->best_expired_prio)
1265 - rq->best_expired_prio = p->static_prio;
1266 - } else
1267 - enqueue_task(p, rq->active);
1268 - } else {
1269 - /*
1270 - * Prevent a too long timeslice allowing a task to monopolize
1271 - * the CPU. We do this by splitting up the timeslice into
1272 - * smaller pieces.
1273 - *
1274 - * Note: this does not mean the task's timeslices expire or
1275 - * get lost in any way, they just might be preempted by
1276 - * another task of equal priority. (one with higher
1277 - * priority would have preempted this task already.) We
1278 - * requeue this task to the end of the list on this priority
1279 - * level, which is in essence a round-robin of tasks with
1280 - * equal priority.
1281 - *
1282 - * This only applies to tasks in the interactive
1283 - * delta range with at least TIMESLICE_GRANULARITY to requeue.
1284 - */
1285 - if (TASK_INTERACTIVE(p) && !((task_timeslice(p) -
1286 - p->time_slice) % TIMESLICE_GRANULARITY(p)) &&
1287 - (p->time_slice >= TIMESLICE_GRANULARITY(p)) &&
1288 - (p->array == rq->active)) {
1289 -
1290 - requeue_task(p, rq->active);
1291 - set_tsk_need_resched(p);
1292 - }
1293 + time_slice_expired(p, rq);
1294 + goto out_unlock;
1295 }
1296 out_unlock:
1297 spin_unlock(&rq->lock);
1298 @@ -3207,9 +3009,6 @@ out_unlock:
1299 /*
1300 * This function gets called by the timer code, with HZ frequency.
1301 * We call it with interrupts disabled.
1302 - *
1303 - * It also gets called by the fork code, when changing the parent's
1304 - * timeslices.
1305 */
1306 void scheduler_tick(void)
1307 {
1308 @@ -3273,13 +3072,13 @@ static void wake_sleeping_dependent(int
1309
1310 /*
1311 * number of 'lost' timeslices this task wont be able to fully
1312 - * utilize, if another task runs on a sibling. This models the
1313 + * utilise, if another task runs on a sibling. This models the
1314 * slowdown effect of other tasks running on siblings:
1315 */
1316 static inline unsigned long
1317 smt_slice(struct task_struct *p, struct sched_domain *sd)
1318 {
1319 - return p->time_slice * (100 - sd->per_cpu_gain) / 100;
1320 + return p->slice * (100 - sd->per_cpu_gain) / 100;
1321 }
1322
1323 /*
1324 @@ -3343,7 +3142,7 @@ dependent_sleeper(int this_cpu, struct r
1325 } else {
1326 if (smt_curr->static_prio < p->static_prio &&
1327 !TASK_PREEMPTS_CURR(p, smt_rq) &&
1328 - smt_slice(smt_curr, sd) > task_timeslice(p))
1329 + smt_slice(smt_curr, sd) > slice(p))
1330 ret = 1;
1331 }
1332 unlock:
1333 @@ -3400,25 +3199,18 @@ EXPORT_SYMBOL(sub_preempt_count);
1334
1335 #endif
1336
1337 -static inline int interactive_sleep(enum sleep_type sleep_type)
1338 -{
1339 - return (sleep_type == SLEEP_INTERACTIVE ||
1340 - sleep_type == SLEEP_INTERRUPTED);
1341 -}
1342 -
1343 /*
1344 * schedule() is the main scheduler function.
1345 */
1346 asmlinkage void __sched schedule(void)
1347 {
1348 struct task_struct *prev, *next;
1349 - struct prio_array *array;
1350 struct list_head *queue;
1351 unsigned long long now;
1352 - unsigned long run_time;
1353 - int cpu, idx, new_prio;
1354 long *switch_count;
1355 + unsigned long debit;
1356 struct rq *rq;
1357 + int cpu, idx;
1358
1359 /*
1360 * Test if we are atomic. Since do_exit() needs to call into
1361 @@ -3454,20 +3246,11 @@ need_resched_nonpreemptible:
1362
1363 schedstat_inc(rq, sched_cnt);
1364 now = sched_clock();
1365 - if (likely((long long)(now - prev->timestamp) < NS_MAX_SLEEP_AVG)) {
1366 - run_time = now - prev->timestamp;
1367 - if (unlikely((long long)(now - prev->timestamp) < 0))
1368 - run_time = 0;
1369 - } else
1370 - run_time = NS_MAX_SLEEP_AVG;
1371 -
1372 - /*
1373 - * Tasks charged proportionately less run_time at high sleep_avg to
1374 - * delay them losing their interactive status
1375 - */
1376 - run_time /= (CURRENT_BONUS(prev) ? : 1);
1377
1378 spin_lock_irq(&rq->lock);
1379 + prev->runtime = ns_diff(now, prev->timestamp);
1380 + debit = ns_diff(now, rq->most_recent_timestamp) % NSJIFFY;
1381 + prev->ns_debit += debit;
1382
1383 switch_count = &prev->nivcsw;
1384 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
1385 @@ -3476,8 +3259,10 @@ need_resched_nonpreemptible:
1386 unlikely(signal_pending(prev))))
1387 prev->state = TASK_RUNNING;
1388 else {
1389 - if (prev->state == TASK_UNINTERRUPTIBLE)
1390 + if (prev->state == TASK_UNINTERRUPTIBLE) {
1391 + prev->flags |= PF_NONSLEEP;
1392 rq->nr_uninterruptible++;
1393 + }
1394 deactivate_task(prev, rq);
1395 }
1396 }
1397 @@ -3487,62 +3272,28 @@ need_resched_nonpreemptible:
1398 idle_balance(cpu, rq);
1399 if (!rq->nr_running) {
1400 next = rq->idle;
1401 - rq->expired_timestamp = 0;
1402 wake_sleeping_dependent(cpu);
1403 goto switch_tasks;
1404 }
1405 }
1406
1407 - array = rq->active;
1408 - if (unlikely(!array->nr_active)) {
1409 - /*
1410 - * Switch the active and expired arrays.
1411 - */
1412 - schedstat_inc(rq, sched_switch);
1413 - rq->active = rq->expired;
1414 - rq->expired = array;
1415 - array = rq->active;
1416 - rq->expired_timestamp = 0;
1417 - rq->best_expired_prio = MAX_PRIO;
1418 - }
1419 -
1420 - idx = sched_find_first_bit(array->bitmap);
1421 - queue = array->queue + idx;
1422 + idx = sched_find_first_bit(rq->bitmap);
1423 + queue = rq->queue + idx;
1424 next = list_entry(queue->next, struct task_struct, run_list);
1425
1426 - if (!rt_task(next) && interactive_sleep(next->sleep_type)) {
1427 - unsigned long long delta = now - next->timestamp;
1428 - if (unlikely((long long)(now - next->timestamp) < 0))
1429 - delta = 0;
1430 -
1431 - if (next->sleep_type == SLEEP_INTERACTIVE)
1432 - delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128;
1433 -
1434 - array = next->array;
1435 - new_prio = recalc_task_prio(next, next->timestamp + delta);
1436 -
1437 - if (unlikely(next->prio != new_prio)) {
1438 - dequeue_task(next, array);
1439 - next->prio = new_prio;
1440 - enqueue_task(next, array);
1441 - }
1442 - }
1443 - next->sleep_type = SLEEP_NORMAL;
1444 if (dependent_sleeper(cpu, rq, next))
1445 next = rq->idle;
1446 + else {
1447 + prefetch(next);
1448 + prefetch_stack(next);
1449 + }
1450 switch_tasks:
1451 if (next == rq->idle)
1452 schedstat_inc(rq, sched_goidle);
1453 - prefetch(next);
1454 - prefetch_stack(next);
1455 clear_tsk_need_resched(prev);
1456 rcu_qsctr_inc(task_cpu(prev));
1457
1458 update_cpu_clock(prev, rq, now);
1459 -
1460 - prev->sleep_avg -= run_time;
1461 - if ((long)prev->sleep_avg <= 0)
1462 - prev->sleep_avg = 0;
1463 prev->timestamp = prev->last_ran = now;
1464
1465 sched_info_switch(prev, next);
1466 @@ -3978,29 +3729,21 @@ EXPORT_SYMBOL(sleep_on_timeout);
1467 */
1468 void rt_mutex_setprio(struct task_struct *p, int prio)
1469 {
1470 - struct prio_array *array;
1471 unsigned long flags;
1472 + int queued, oldprio;
1473 struct rq *rq;
1474 - int oldprio;
1475
1476 BUG_ON(prio < 0 || prio > MAX_PRIO);
1477
1478 rq = task_rq_lock(p, &flags);
1479
1480 oldprio = p->prio;
1481 - array = p->array;
1482 - if (array)
1483 - dequeue_task(p, array);
1484 + if ((queued = task_queued(p)))
1485 + dequeue_task(p, rq);
1486 p->prio = prio;
1487
1488 - if (array) {
1489 - /*
1490 - * If changing to an RT priority then queue it
1491 - * in the active array!
1492 - */
1493 - if (rt_task(p))
1494 - array = rq->active;
1495 - enqueue_task(p, array);
1496 + if (queued) {
1497 + enqueue_task(p, rq);
1498 /*
1499 * Reschedule if we are currently running on this runqueue and
1500 * our priority decreased, or if we are not currently running on
1501 @@ -4009,8 +3752,8 @@ void rt_mutex_setprio(struct task_struct
1502 if (task_running(rq, p)) {
1503 if (p->prio > oldprio)
1504 resched_task(rq->curr);
1505 - } else if (TASK_PREEMPTS_CURR(p, rq))
1506 - resched_task(rq->curr);
1507 + } else
1508 + preempt(p, rq);
1509 }
1510 task_rq_unlock(rq, &flags);
1511 }
1512 @@ -4019,8 +3762,7 @@ void rt_mutex_setprio(struct task_struct
1513
1514 void set_user_nice(struct task_struct *p, long nice)
1515 {
1516 - struct prio_array *array;
1517 - int old_prio, delta;
1518 + int queued, old_prio,delta;
1519 unsigned long flags;
1520 struct rq *rq;
1521
1522 @@ -4041,20 +3783,21 @@ void set_user_nice(struct task_struct *p
1523 p->static_prio = NICE_TO_PRIO(nice);
1524 goto out_unlock;
1525 }
1526 - array = p->array;
1527 - if (array) {
1528 - dequeue_task(p, array);
1529 + if ((queued = task_queued(p))) {
1530 + dequeue_task(p, rq);
1531 dec_raw_weighted_load(rq, p);
1532 }
1533
1534 p->static_prio = NICE_TO_PRIO(nice);
1535 set_load_weight(p);
1536 old_prio = p->prio;
1537 + if (p->bonus > bonus(p))
1538 + p->bonus= bonus(p);
1539 p->prio = effective_prio(p);
1540 delta = p->prio - old_prio;
1541
1542 - if (array) {
1543 - enqueue_task(p, array);
1544 + if (queued) {
1545 + enqueue_task(p, rq);
1546 inc_raw_weighted_load(rq, p);
1547 /*
1548 * If the task increased its priority or is running and
1549 @@ -4177,18 +3920,13 @@ static inline struct task_struct *find_p
1550 /* Actually do priority change: must hold rq lock. */
1551 static void __setscheduler(struct task_struct *p, int policy, int prio)
1552 {
1553 - BUG_ON(p->array);
1554 + BUG_ON(task_queued(p));
1555
1556 p->policy = policy;
1557 p->rt_priority = prio;
1558 p->normal_prio = normal_prio(p);
1559 /* we are holding p->pi_lock already */
1560 p->prio = rt_mutex_getprio(p);
1561 - /*
1562 - * SCHED_BATCH tasks are treated as perpetual CPU hogs:
1563 - */
1564 - if (policy == SCHED_BATCH)
1565 - p->sleep_avg = 0;
1566 set_load_weight(p);
1567 }
1568
1569 @@ -4204,8 +3942,7 @@ static void __setscheduler(struct task_s
1570 int sched_setscheduler(struct task_struct *p, int policy,
1571 struct sched_param *param)
1572 {
1573 - int retval, oldprio, oldpolicy = -1;
1574 - struct prio_array *array;
1575 + int queued, retval, oldprio, oldpolicy = -1;
1576 unsigned long flags;
1577 struct rq *rq;
1578
1579 @@ -4279,12 +4016,11 @@ recheck:
1580 spin_unlock_irqrestore(&p->pi_lock, flags);
1581 goto recheck;
1582 }
1583 - array = p->array;
1584 - if (array)
1585 + if ((queued = task_queued(p)))
1586 deactivate_task(p, rq);
1587 oldprio = p->prio;
1588 __setscheduler(p, policy, param->sched_priority);
1589 - if (array) {
1590 + if (queued) {
1591 __activate_task(p, rq);
1592 /*
1593 * Reschedule if we are currently running on this runqueue and
1594 @@ -4294,8 +4030,8 @@ recheck:
1595 if (task_running(rq, p)) {
1596 if (p->prio > oldprio)
1597 resched_task(rq->curr);
1598 - } else if (TASK_PREEMPTS_CURR(p, rq))
1599 - resched_task(rq->curr);
1600 + } else
1601 + preempt(p, rq);
1602 }
1603 __task_rq_unlock(rq);
1604 spin_unlock_irqrestore(&p->pi_lock, flags);
1605 @@ -4567,41 +4303,24 @@ asmlinkage long sys_sched_getaffinity(pi
1606 /**
1607 * sys_sched_yield - yield the current processor to other threads.
1608 *
1609 - * this function yields the current CPU by moving the calling thread
1610 - * to the expired array. If there are no other threads running on this
1611 - * CPU then this function will return.
1612 + * This function yields the current CPU by dropping the priority of current
1613 + * to the lowest priority.
1614 */
1615 asmlinkage long sys_sched_yield(void)
1616 {
1617 struct rq *rq = this_rq_lock();
1618 - struct prio_array *array = current->array, *target = rq->expired;
1619 + int newprio = current->prio;
1620
1621 schedstat_inc(rq, yld_cnt);
1622 - /*
1623 - * We implement yielding by moving the task into the expired
1624 - * queue.
1625 - *
1626 - * (special rule: RT tasks will just roundrobin in the active
1627 - * array.)
1628 - */
1629 - if (rt_task(current))
1630 - target = rq->active;
1631
1632 - if (array->nr_active == 1) {
1633 - schedstat_inc(rq, yld_act_empty);
1634 - if (!rq->expired->nr_active)
1635 - schedstat_inc(rq, yld_both_empty);
1636 - } else if (!rq->expired->nr_active)
1637 - schedstat_inc(rq, yld_exp_empty);
1638 -
1639 - if (array != target) {
1640 - dequeue_task(current, array);
1641 - enqueue_task(current, target);
1642 - } else
1643 - /*
1644 - * requeue_task is cheaper so perform that if possible.
1645 - */
1646 - requeue_task(current, array);
1647 + newprio = current->prio;
1648 + schedstat_inc(rq, yld_cnt);
1649 + current->slice = slice(current);
1650 + current->time_slice = rr_interval(current);
1651 + if (likely(!rt_task(current)))
1652 + newprio = MIN_USER_PRIO;
1653 +
1654 + requeue_task(current, rq, newprio);
1655
1656 /*
1657 * Since we are going to call schedule() anyway, there's
1658 @@ -4812,7 +4531,7 @@ long sys_sched_rr_get_interval(pid_t pid
1659 goto out_unlock;
1660
1661 jiffies_to_timespec(p->policy == SCHED_FIFO ?
1662 - 0 : task_timeslice(p), &t);
1663 + 0 : slice(p), &t);
1664 read_unlock(&tasklist_lock);
1665 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
1666 out_nounlock:
1667 @@ -4941,8 +4660,6 @@ void __cpuinit init_idle(struct task_str
1668 unsigned long flags;
1669
1670 idle->timestamp = sched_clock();
1671 - idle->sleep_avg = 0;
1672 - idle->array = NULL;
1673 idle->prio = idle->normal_prio = MAX_PRIO;
1674 idle->state = TASK_RUNNING;
1675 idle->cpus_allowed = cpumask_of_cpu(cpu);
1676 @@ -5062,7 +4779,7 @@ static int __migrate_task(struct task_st
1677 goto out;
1678
1679 set_task_cpu(p, dest_cpu);
1680 - if (p->array) {
1681 + if (task_queued(p)) {
1682 /*
1683 * Sync timestamp with rq_dest's before activating.
1684 * The same thing could be achieved by doing this step
1685 @@ -5073,8 +4790,7 @@ static int __migrate_task(struct task_st
1686 + rq_dest->most_recent_timestamp;
1687 deactivate_task(p, rq_src);
1688 __activate_task(p, rq_dest);
1689 - if (TASK_PREEMPTS_CURR(p, rq_dest))
1690 - resched_task(rq_dest->curr);
1691 + preempt(p, rq_dest);
1692 }
1693 ret = 1;
1694 out:
1695 @@ -5303,7 +5019,7 @@ static void migrate_dead_tasks(unsigned
1696
1697 for (arr = 0; arr < 2; arr++) {
1698 for (i = 0; i < MAX_PRIO; i++) {
1699 - struct list_head *list = &rq->arrays[arr].queue[i];
1700 + struct list_head *list = &rq->queue[i];
1701
1702 while (!list_empty(list))
1703 migrate_dead(dead_cpu, list_entry(list->next,
1704 @@ -6894,19 +6610,16 @@ int in_sched_functions(unsigned long add
1705
1706 void __init sched_init(void)
1707 {
1708 - int i, j, k;
1709 + int i;
1710
1711 for_each_possible_cpu(i) {
1712 - struct prio_array *array;
1713 struct rq *rq;
1714 + int j;
1715
1716 rq = cpu_rq(i);
1717 spin_lock_init(&rq->lock);
1718 lockdep_set_class(&rq->lock, &rq->rq_lock_key);
1719 rq->nr_running = 0;
1720 - rq->active = rq->arrays;
1721 - rq->expired = rq->arrays + 1;
1722 - rq->best_expired_prio = MAX_PRIO;
1723
1724 #ifdef CONFIG_SMP
1725 rq->sd = NULL;
1726 @@ -6920,15 +6633,11 @@ void __init sched_init(void)
1727 #endif
1728 atomic_set(&rq->nr_iowait, 0);
1729
1730 - for (j = 0; j < 2; j++) {
1731 - array = rq->arrays + j;
1732 - for (k = 0; k < MAX_PRIO; k++) {
1733 - INIT_LIST_HEAD(array->queue + k);
1734 - __clear_bit(k, array->bitmap);
1735 - }
1736 - // delimiter for bitsearch
1737 - __set_bit(MAX_PRIO, array->bitmap);
1738 - }
1739 + for (j = 0; j < MAX_PRIO; j++)
1740 + INIT_LIST_HEAD(&rq->queue[j]);
1741 + memset(rq->bitmap, 0, BITS_TO_LONGS(MAX_PRIO)*sizeof(long));
1742 + /* delimiter for bitsearch */
1743 + __set_bit(MAX_PRIO, rq->bitmap);
1744 }
1745
1746 set_load_weight(&init_task);
1747 @@ -6984,10 +6693,10 @@ EXPORT_SYMBOL(__might_sleep);
1748 #ifdef CONFIG_MAGIC_SYSRQ
1749 void normalize_rt_tasks(void)
1750 {
1751 - struct prio_array *array;
1752 struct task_struct *p;
1753 unsigned long flags;
1754 struct rq *rq;
1755 + int queued;
1756
1757 read_lock_irq(&tasklist_lock);
1758 for_each_process(p) {
1759 @@ -6997,11 +6706,10 @@ void normalize_rt_tasks(void)
1760 spin_lock_irqsave(&p->pi_lock, flags);
1761 rq = __task_rq_lock(p);
1762
1763 - array = p->array;
1764 - if (array)
1765 + if ((queued = task_queued(p)))
1766 deactivate_task(p, task_rq(p));
1767 __setscheduler(p, SCHED_NORMAL, 0);
1768 - if (array) {
1769 + if (queued) {
1770 __activate_task(p, task_rq(p));
1771 resched_task(rq->curr);
1772 }