Annotation of /trunk/kernel26-alx/patches-2.6.17-r6/0003-2.6.17-smpnice-staircase-16.patch

Revision 199 - Fri May 18 11:04:36 2007 UTC (17 years, 4 months ago) by niro
File size: 52028 byte(s)

-import
Implement the "staircase" hybrid foreground-background single priority
array cpu scheduler policy.

Signed-off-by: Con Kolivas <kernel@kolivas.org>

 fs/proc/array.c       |    4
 include/linux/sched.h |   21 -
 kernel/exit.c         |    1
 kernel/sched.c        | 1015 ++++++++++++++++++-------------------------------
 4 files changed, 378 insertions(+), 663 deletions(-)

Index: linux-ck-dev/fs/proc/array.c
===================================================================
--- linux-ck-dev.orig/fs/proc/array.c	2006-06-18 15:20:15.000000000 +1000
+++ linux-ck-dev/fs/proc/array.c	2006-06-18 15:21:50.000000000 +1000
@@ -165,7 +165,7 @@ static inline char * task_state(struct t
 	read_lock(&tasklist_lock);
 	buffer += sprintf(buffer,
 		"State:\t%s\n"
-		"SleepAVG:\t%lu%%\n"
+		"Bonus:\t%d\n"
 		"Tgid:\t%d\n"
 		"Pid:\t%d\n"
 		"PPid:\t%d\n"
@@ -173,7 +173,7 @@ static inline char * task_state(struct t
 		"Uid:\t%d\t%d\t%d\t%d\n"
 		"Gid:\t%d\t%d\t%d\t%d\n",
 		get_task_state(p),
-		(p->sleep_avg/1024)*100/(1020000000/1024),
+		p->bonus,
 		p->tgid,
 		p->pid, pid_alive(p) ? p->group_leader->real_parent->tgid : 0,
 		pid_alive(p) && p->ptrace ? p->parent->pid : 0,
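
The user-visible effect of this hunk: /proc/<pid>/status now carries a
"Bonus:" line (the task's interactivity bonus, 0..39) where the vanilla
kernel scaled the nanosecond sleep_avg down to a "SleepAVG:" percentage.
A quick user-space check (illustration only; works on patched and vanilla
kernels alike):

/* bonus_read.c - illustration only; prints whichever of the two
 * fields the running kernel provides (Bonus: on staircase,
 * SleepAVG: on vanilla 2.6.17). */
#include <stdio.h>
#include <string.h>

int main(void)
{
	FILE *f = fopen("/proc/self/status", "r");
	char line[256];

	if (!f) {
		perror("fopen");
		return 1;
	}
	while (fgets(line, sizeof(line), f))
		if (!strncmp(line, "Bonus:", 6) ||
		    !strncmp(line, "SleepAVG:", 9))
			fputs(line, stdout);
	fclose(f);
	return 0;
}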
Index: linux-ck-dev/include/linux/sched.h
===================================================================
--- linux-ck-dev.orig/include/linux/sched.h	2006-06-18 15:21:31.000000000 +1000
+++ linux-ck-dev/include/linux/sched.h	2006-06-18 15:21:50.000000000 +1000
@@ -483,6 +483,7 @@ struct signal_struct {
 #define MAX_RT_PRIO	MAX_USER_RT_PRIO
 
 #define MAX_PRIO	(MAX_RT_PRIO + 40)
+#define MIN_USER_PRIO	(MAX_PRIO - 1)
 
 #define rt_task(p)	(unlikely((p)->prio < MAX_RT_PRIO))
 #define batch_task(p)	(unlikely((p)->policy == SCHED_BATCH))
@@ -518,7 +519,6 @@ extern struct user_struct *find_user(uid
 extern struct user_struct root_user;
 #define INIT_USER (&root_user)
 
-typedef struct prio_array prio_array_t;
 struct backing_dev_info;
 struct reclaim_state;
 
@@ -687,13 +687,6 @@ struct audit_context;	/* See audit.c */
 struct mempolicy;
 struct pipe_inode_info;
 
-enum sleep_type {
-	SLEEP_NORMAL,
-	SLEEP_NONINTERACTIVE,
-	SLEEP_INTERACTIVE,
-	SLEEP_INTERRUPTED,
-};
-
 struct task_struct {
 	volatile long state;	/* -1 unrunnable, 0 runnable, >0 stopped */
 	struct thread_info *thread_info;
@@ -711,19 +704,18 @@ struct task_struct {
 	int load_weight;	/* for niceness load balancing purposes */
 	int prio, static_prio;
 	struct list_head run_list;
-	prio_array_t *array;
 
 	unsigned short ioprio;
 	unsigned int btrace_seq;
 
-	unsigned long sleep_avg;
-	unsigned long long timestamp, last_ran;
+	unsigned long long timestamp;
+	unsigned long runtime, totalrun, ns_debit, systime;
+	unsigned int bonus;
+	unsigned int slice, time_slice;
 	unsigned long long sched_time; /* sched_clock time spent running */
-	enum sleep_type sleep_type;
 
 	unsigned long policy;
 	cpumask_t cpus_allowed;
-	unsigned int time_slice, first_time_slice;
 
 #ifdef CONFIG_SCHEDSTATS
 	struct sched_info sched_info;
@@ -952,6 +944,8 @@ static inline void put_task_struct(struc
 #define PF_SPREAD_PAGE	0x04000000	/* Spread page cache over cpuset */
 #define PF_SPREAD_SLAB	0x08000000	/* Spread some slab caches over cpuset */
 #define PF_MEMPOLICY	0x10000000	/* Non-default NUMA mempolicy */
+#define PF_NONSLEEP	0x20000000	/* Waiting on in kernel activity */
+#define PF_FORKED	0x40000000	/* Task just forked another process */
 
 /*
  * Only the _current_ task can read/write to tsk->flags, but other
@@ -1073,7 +1067,6 @@ extern void FASTCALL(wake_up_new_task(st
 static inline void kick_process(struct task_struct *tsk) { }
 #endif
 extern void FASTCALL(sched_fork(task_t * p, int clone_flags));
-extern void FASTCALL(sched_exit(task_t * p));
 
 extern int in_group_p(gid_t);
 extern int in_egroup_p(gid_t);
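
For orientation, a sketch of the resulting priority layout (illustration
only; MAX_USER_RT_PRIO is taken at its stock value of 100, which is
defined outside this hunk): MIN_USER_PRIO pins the lowest stair, and the
two new PF_ flags take the free bits above PF_MEMPOLICY.

/* prio_layout.c - illustration only; stock 2.6.17 values assumed. */
#include <stdio.h>

#define MAX_USER_RT_PRIO	100
#define MAX_RT_PRIO		MAX_USER_RT_PRIO
#define MAX_PRIO		(MAX_RT_PRIO + 40)
#define MIN_USER_PRIO		(MAX_PRIO - 1)	/* new in this patch */

#define PF_MEMPOLICY		0x10000000
#define PF_NONSLEEP		0x20000000	/* new in this patch */
#define PF_FORKED		0x40000000	/* new in this patch */

int main(void)
{
	printf("rt priorities:   0..%d\n", MAX_RT_PRIO - 1);	/* 0..99 */
	printf("user priorities: %d..%d\n", MAX_RT_PRIO, MIN_USER_PRIO);
	printf("new flags above PF_MEMPOLICY (0x%08x): 0x%08x 0x%08x\n",
	       PF_MEMPOLICY, PF_NONSLEEP, PF_FORKED);
	return 0;
}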
Index: linux-ck-dev/kernel/exit.c
===================================================================
--- linux-ck-dev.orig/kernel/exit.c	2006-06-18 15:21:00.000000000 +1000
+++ linux-ck-dev/kernel/exit.c	2006-06-18 15:21:50.000000000 +1000
@@ -170,7 +170,6 @@ repeat:
 		zap_leader = (leader->exit_signal == -1);
 	}
 
-	sched_exit(p);
 	write_unlock_irq(&tasklist_lock);
 	spin_unlock(&p->proc_lock);
 	proc_pid_flush(proc_dentry);
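
With fork-time timeslice sharing gone from sched_fork() (see the
kernel/sched.c section below), there is nothing for the parent to reclaim
when a child exits, so the sched_exit() call and its definition can go.
The section below also leans on a pair of small clock helpers; here is a
user-space restatement (illustration only) of longlimit()/ns_diff()
showing the clamping behaviour on 32-bit, where an unsigned long cannot
hold a full nanosecond difference:

/* ns_diff_sketch.c - illustration only; restates the longlimit()/
 * ns_diff() helpers added below.  1ULL keeps the shift portable. */
#include <stdio.h>
#include <limits.h>

static void longlimit(unsigned long long *longlong)
{
#if ULONG_MAX < ULLONG_MAX	/* i.e. BITS_PER_LONG < 64 */
	if (*longlong > (1ULL << 31))
		*longlong = 1ULL << 31;
#endif
}

/* Nanosecond clock difference that fits an unsigned long and is
 * always positive, even if the clock appears to step backwards. */
static unsigned long ns_diff(unsigned long long v1, unsigned long long v2)
{
	unsigned long long vdiff;

	if (v1 >= v2) {
		vdiff = v1 - v2;
		longlimit(&vdiff);
	} else
		vdiff = 1;
	return (unsigned long)vdiff;
}

int main(void)
{
	/* 4s difference: clamped to 2^31 ns on 32-bit, exact on 64-bit. */
	printf("%lu\n", ns_diff(5000000000ULL, 1000000000ULL));
	/* Backwards clock: reported as 1ns. */
	printf("%lu\n", ns_diff(100ULL, 200ULL));
	return 0;
}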
121 | Index: linux-ck-dev/kernel/sched.c | ||
122 | =================================================================== | ||
123 | --- linux-ck-dev.orig/kernel/sched.c 2006-06-18 15:21:45.000000000 +1000 | ||
124 | +++ linux-ck-dev/kernel/sched.c 2006-06-18 15:22:27.000000000 +1000 | ||
125 | @@ -16,6 +16,9 @@ | ||
126 | * by Davide Libenzi, preemptible kernel bits by Robert Love. | ||
127 | * 2003-09-03 Interactivity tuning by Con Kolivas. | ||
128 | * 2004-04-02 Scheduler domains code by Nick Piggin | ||
129 | + * 2006-06-18 Staircase scheduling policy by Con Kolivas with help | ||
130 | + * from William Lee Irwin III, Zwane Mwaikambo & Peter Williams. | ||
131 | + * Staircase v16 | ||
132 | */ | ||
133 | |||
134 | #include <linux/mm.h> | ||
135 | @@ -75,131 +78,27 @@ | ||
136 | /* | ||
137 | * Some helpers for converting nanosecond timing to jiffy resolution | ||
138 | */ | ||
139 | -#define NS_TO_JIFFIES(TIME) ((TIME) / (1000000000 / HZ)) | ||
140 | -#define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ)) | ||
141 | - | ||
142 | -/* | ||
143 | - * These are the 'tuning knobs' of the scheduler: | ||
144 | - * | ||
145 | - * Minimum timeslice is 5 msecs (or 1 jiffy, whichever is larger), | ||
146 | - * default timeslice is 100 msecs, maximum timeslice is 800 msecs. | ||
147 | - * Timeslices get refilled after they expire. | ||
148 | - */ | ||
149 | -#define MIN_TIMESLICE max(5 * HZ / 1000, 1) | ||
150 | -#define DEF_TIMESLICE (100 * HZ / 1000) | ||
151 | -#define ON_RUNQUEUE_WEIGHT 30 | ||
152 | -#define CHILD_PENALTY 95 | ||
153 | -#define PARENT_PENALTY 100 | ||
154 | -#define EXIT_WEIGHT 3 | ||
155 | -#define PRIO_BONUS_RATIO 25 | ||
156 | -#define MAX_BONUS (MAX_USER_PRIO * PRIO_BONUS_RATIO / 100) | ||
157 | -#define INTERACTIVE_DELTA 2 | ||
158 | -#define MAX_SLEEP_AVG (DEF_TIMESLICE * MAX_BONUS) | ||
159 | -#define STARVATION_LIMIT (MAX_SLEEP_AVG) | ||
160 | -#define NS_MAX_SLEEP_AVG (JIFFIES_TO_NS(MAX_SLEEP_AVG)) | ||
161 | - | ||
162 | -/* | ||
163 | - * If a task is 'interactive' then we reinsert it in the active | ||
164 | - * array after it has expired its current timeslice. (it will not | ||
165 | - * continue to run immediately, it will still roundrobin with | ||
166 | - * other interactive tasks.) | ||
167 | - * | ||
168 | - * This part scales the interactivity limit depending on niceness. | ||
169 | - * | ||
170 | - * We scale it linearly, offset by the INTERACTIVE_DELTA delta. | ||
171 | - * Here are a few examples of different nice levels: | ||
172 | - * | ||
173 | - * TASK_INTERACTIVE(-20): [1,1,1,1,1,1,1,1,1,0,0] | ||
174 | - * TASK_INTERACTIVE(-10): [1,1,1,1,1,1,1,0,0,0,0] | ||
175 | - * TASK_INTERACTIVE( 0): [1,1,1,1,0,0,0,0,0,0,0] | ||
176 | - * TASK_INTERACTIVE( 10): [1,1,0,0,0,0,0,0,0,0,0] | ||
177 | - * TASK_INTERACTIVE( 19): [0,0,0,0,0,0,0,0,0,0,0] | ||
178 | - * | ||
179 | - * (the X axis represents the possible -5 ... 0 ... +5 dynamic | ||
180 | - * priority range a task can explore, a value of '1' means the | ||
181 | - * task is rated interactive.) | ||
182 | - * | ||
183 | - * Ie. nice +19 tasks can never get 'interactive' enough to be | ||
184 | - * reinserted into the active array. And only heavily CPU-hog nice -20 | ||
185 | - * tasks will be expired. Default nice 0 tasks are somewhere between, | ||
186 | - * it takes some effort for them to get interactive, but it's not | ||
187 | - * too hard. | ||
188 | - */ | ||
189 | - | ||
190 | -#define CURRENT_BONUS(p) \ | ||
191 | - (NS_TO_JIFFIES((p)->sleep_avg) * MAX_BONUS / \ | ||
192 | - MAX_SLEEP_AVG) | ||
193 | - | ||
194 | -#define GRANULARITY (10 * HZ / 1000 ? : 1) | ||
195 | - | ||
196 | -#ifdef CONFIG_SMP | ||
197 | -#define TIMESLICE_GRANULARITY(p) (GRANULARITY * \ | ||
198 | - (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)) * \ | ||
199 | - num_online_cpus()) | ||
200 | -#else | ||
201 | -#define TIMESLICE_GRANULARITY(p) (GRANULARITY * \ | ||
202 | - (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1))) | ||
203 | -#endif | ||
204 | - | ||
205 | -#define SCALE(v1,v1_max,v2_max) \ | ||
206 | - (v1) * (v2_max) / (v1_max) | ||
207 | - | ||
208 | -#define DELTA(p) \ | ||
209 | - (SCALE(TASK_NICE(p) + 20, 40, MAX_BONUS) - 20 * MAX_BONUS / 40 + \ | ||
210 | - INTERACTIVE_DELTA) | ||
211 | - | ||
212 | -#define TASK_INTERACTIVE(p) \ | ||
213 | - ((p)->prio <= (p)->static_prio - DELTA(p)) | ||
214 | - | ||
215 | -#define INTERACTIVE_SLEEP(p) \ | ||
216 | - (JIFFIES_TO_NS(MAX_SLEEP_AVG * \ | ||
217 | - (MAX_BONUS / 2 + DELTA((p)) + 1) / MAX_BONUS - 1)) | ||
218 | - | ||
219 | +#define NSJIFFY (1000000000 / HZ) /* One jiffy in ns */ | ||
220 | +#define NS_TO_JIFFIES(TIME) ((TIME) / NSJIFFY) | ||
221 | +#define JIFFIES_TO_NS(TIME) ((TIME) * NSJIFFY) | ||
222 | #define TASK_PREEMPTS_CURR(p, rq) \ | ||
223 | ((p)->prio < (rq)->curr->prio) | ||
224 | |||
225 | /* | ||
226 | - * task_timeslice() scales user-nice values [ -20 ... 0 ... 19 ] | ||
227 | - * to time slice values: [800ms ... 100ms ... 5ms] | ||
228 | - * | ||
229 | - * The higher a thread's priority, the bigger timeslices | ||
230 | - * it gets during one round of execution. But even the lowest | ||
231 | - * priority thread gets MIN_TIMESLICE worth of execution time. | ||
232 | + * This is the time all tasks within the same priority round robin. | ||
233 | + * Set to a minimum of 6ms. | ||
234 | */ | ||
235 | +#define RR_INTERVAL ((6 * HZ / 1001) + 1) | ||
236 | +#define DEF_TIMESLICE (RR_INTERVAL * 19) | ||
237 | |||
238 | -#define SCALE_PRIO(x, prio) \ | ||
239 | - max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_TIMESLICE) | ||
240 | - | ||
241 | -static unsigned int static_prio_timeslice(int static_prio) | ||
242 | -{ | ||
243 | - if (static_prio < NICE_TO_PRIO(0)) | ||
244 | - return SCALE_PRIO(DEF_TIMESLICE * 4, static_prio); | ||
245 | - else | ||
246 | - return SCALE_PRIO(DEF_TIMESLICE, static_prio); | ||
247 | -} | ||
248 | - | ||
249 | -static inline unsigned int task_timeslice(task_t *p) | ||
250 | -{ | ||
251 | - return static_prio_timeslice(p->static_prio); | ||
252 | -} | ||
253 | - | ||
254 | -#define task_hot(p, now, sd) ((long long) ((now) - (p)->last_ran) \ | ||
255 | +#define task_hot(p, now, sd) ((long long) ((now) - (p)->timestamp) \ | ||
256 | < (long long) (sd)->cache_hot_time) | ||
257 | |||
258 | /* | ||
259 | * These are the runqueue data structures: | ||
260 | */ | ||
261 | - | ||
262 | -#define BITMAP_SIZE ((((MAX_PRIO+1+7)/8)+sizeof(long)-1)/sizeof(long)) | ||
263 | - | ||
264 | typedef struct runqueue runqueue_t; | ||
265 | |||
266 | -struct prio_array { | ||
267 | - unsigned int nr_active; | ||
268 | - unsigned long bitmap[BITMAP_SIZE]; | ||
269 | - struct list_head queue[MAX_PRIO]; | ||
270 | -}; | ||
271 | - | ||
272 | /* | ||
273 | * This is the main, per-CPU runqueue data structure. | ||
274 | * | ||
275 | @@ -229,12 +128,11 @@ struct runqueue { | ||
276 | */ | ||
277 | unsigned long nr_uninterruptible; | ||
278 | |||
279 | - unsigned long expired_timestamp; | ||
280 | unsigned long long timestamp_last_tick; | ||
281 | task_t *curr, *idle; | ||
282 | struct mm_struct *prev_mm; | ||
283 | - prio_array_t *active, *expired, arrays[2]; | ||
284 | - int best_expired_prio; | ||
285 | + unsigned long bitmap[BITS_TO_LONGS(MAX_PRIO + 1)]; | ||
286 | + struct list_head queue[MAX_PRIO]; | ||
287 | atomic_t nr_iowait; | ||
288 | |||
289 | #ifdef CONFIG_SMP | ||
290 | @@ -499,13 +397,7 @@ static inline runqueue_t *this_rq_lock(v | ||
291 | |||
292 | #ifdef CONFIG_SCHEDSTATS | ||
293 | /* | ||
294 | - * Called when a process is dequeued from the active array and given | ||
295 | - * the cpu. We should note that with the exception of interactive | ||
296 | - * tasks, the expired queue will become the active queue after the active | ||
297 | - * queue is empty, without explicitly dequeuing and requeuing tasks in the | ||
298 | - * expired queue. (Interactive tasks may be requeued directly to the | ||
299 | - * active queue, thus delaying tasks in the expired queue from running; | ||
300 | - * see scheduler_tick()). | ||
301 | + * Called when a process is dequeued and given the cpu. | ||
302 | * | ||
303 | * This function is only called from sched_info_arrive(), rather than | ||
304 | * dequeue_task(). Even though a task may be queued and dequeued multiple | ||
305 | @@ -543,13 +435,11 @@ static void sched_info_arrive(task_t *t) | ||
306 | } | ||
307 | |||
308 | /* | ||
309 | - * Called when a process is queued into either the active or expired | ||
310 | - * array. The time is noted and later used to determine how long we | ||
311 | - * had to wait for us to reach the cpu. Since the expired queue will | ||
312 | - * become the active queue after active queue is empty, without dequeuing | ||
313 | - * and requeuing any tasks, we are interested in queuing to either. It | ||
314 | - * is unusual but not impossible for tasks to be dequeued and immediately | ||
315 | - * requeued in the same or another array: this can happen in sched_yield(), | ||
316 | + * Called when a process is queued | ||
317 | + * The time is noted and later used to determine how long we had to wait for | ||
318 | + * us to reach the cpu. | ||
319 | + * It is unusual but not impossible for tasks to be dequeued and immediately | ||
320 | + * requeued: this can happen in sched_yield(), | ||
321 | * set_user_nice(), and even load_balance() as it moves tasks from runqueue | ||
322 | * to runqueue. | ||
323 | * | ||
324 | @@ -603,74 +493,81 @@ static inline void sched_info_switch(tas | ||
325 | #define sched_info_switch(t, next) do { } while (0) | ||
326 | #endif /* CONFIG_SCHEDSTATS */ | ||
327 | |||
328 | -/* | ||
329 | - * Adding/removing a task to/from a priority array: | ||
330 | - */ | ||
331 | -static void dequeue_task(struct task_struct *p, prio_array_t *array) | ||
332 | +#if BITS_PER_LONG < 64 | ||
333 | +static inline void longlimit(unsigned long long *longlong) | ||
334 | +{ | ||
335 | + if (*longlong > (1 << 31)) | ||
336 | + *longlong = 1 << 31; | ||
337 | +} | ||
338 | +#else | ||
339 | +static inline void longlimit(unsigned long long *__unused) | ||
340 | { | ||
341 | - array->nr_active--; | ||
342 | - list_del(&p->run_list); | ||
343 | - if (list_empty(array->queue + p->prio)) | ||
344 | - __clear_bit(p->prio, array->bitmap); | ||
345 | +} | ||
346 | +#endif | ||
347 | + | ||
348 | +/* Get nanosecond clock difference without overflowing unsigned long. */ | ||
349 | +static unsigned long ns_diff(unsigned long long v1, unsigned long long v2) | ||
350 | +{ | ||
351 | + unsigned long long vdiff; | ||
352 | + if (likely(v1 >= v2)) { | ||
353 | + vdiff = v1 - v2; | ||
354 | + longlimit(&vdiff); | ||
355 | + } else { | ||
356 | + /* | ||
357 | + * Rarely the clock appears to go backwards. There should | ||
358 | + * always be a positive difference so return 1. | ||
359 | + */ | ||
360 | + vdiff = 1; | ||
361 | + } | ||
362 | + return (unsigned long)vdiff; | ||
363 | } | ||
364 | |||
365 | -static void enqueue_task(struct task_struct *p, prio_array_t *array) | ||
366 | +static inline int task_queued(const task_t *task) | ||
367 | { | ||
368 | - sched_info_queued(p); | ||
369 | - list_add_tail(&p->run_list, array->queue + p->prio); | ||
370 | - __set_bit(p->prio, array->bitmap); | ||
371 | - array->nr_active++; | ||
372 | - p->array = array; | ||
373 | + return !list_empty(&task->run_list); | ||
374 | } | ||
375 | |||
376 | /* | ||
377 | - * Put task to the end of the run list without the overhead of dequeue | ||
378 | - * followed by enqueue. | ||
379 | + * Adding/removing a task to/from a runqueue: | ||
380 | */ | ||
381 | -static void requeue_task(struct task_struct *p, prio_array_t *array) | ||
382 | +static void dequeue_task(task_t *p, runqueue_t *rq) | ||
383 | { | ||
384 | - list_move_tail(&p->run_list, array->queue + p->prio); | ||
385 | + list_del_init(&p->run_list); | ||
386 | + if (list_empty(rq->queue + p->prio)) | ||
387 | + __clear_bit(p->prio, rq->bitmap); | ||
388 | + p->ns_debit = 0; | ||
389 | } | ||
390 | |||
391 | -static inline void enqueue_task_head(struct task_struct *p, prio_array_t *array) | ||
392 | +static void enqueue_task(task_t *p, runqueue_t *rq) | ||
393 | { | ||
394 | - list_add(&p->run_list, array->queue + p->prio); | ||
395 | - __set_bit(p->prio, array->bitmap); | ||
396 | - array->nr_active++; | ||
397 | - p->array = array; | ||
398 | + list_add_tail(&p->run_list, rq->queue + p->prio); | ||
399 | + __set_bit(p->prio, rq->bitmap); | ||
400 | } | ||
401 | |||
402 | /* | ||
403 | - * effective_prio - return the priority that is based on the static | ||
404 | - * priority but is modified by bonuses/penalties. | ||
405 | - * | ||
406 | - * We scale the actual sleep average [0 .... MAX_SLEEP_AVG] | ||
407 | - * into the -5 ... 0 ... +5 bonus/penalty range. | ||
408 | - * | ||
409 | - * We use 25% of the full 0...39 priority range so that: | ||
410 | - * | ||
411 | - * 1) nice +19 interactive tasks do not preempt nice 0 CPU hogs. | ||
412 | - * 2) nice -20 CPU hogs do not get preempted by nice 0 tasks. | ||
413 | - * | ||
414 | - * Both properties are important to certain workloads. | ||
415 | + * Put task to the end of the run list without the overhead of dequeue | ||
416 | + * followed by enqueue. | ||
417 | */ | ||
418 | -static int effective_prio(task_t *p) | ||
419 | +static void requeue_task(task_t *p, runqueue_t *rq, const int prio) | ||
420 | { | ||
421 | - int bonus, prio; | ||
422 | - | ||
423 | - if (rt_task(p)) | ||
424 | - return p->prio; | ||
425 | - | ||
426 | - bonus = CURRENT_BONUS(p) - MAX_BONUS / 2; | ||
427 | + list_move_tail(&p->run_list, rq->queue + prio); | ||
428 | + if (p->prio != prio) { | ||
429 | + if (list_empty(rq->queue + p->prio)) | ||
430 | + __clear_bit(p->prio, rq->bitmap); | ||
431 | + p->prio = prio; | ||
432 | + __set_bit(prio, rq->bitmap); | ||
433 | + } | ||
434 | + p->ns_debit = 0; | ||
435 | +} | ||
436 | |||
437 | - prio = p->static_prio - bonus; | ||
438 | - if (prio < MAX_RT_PRIO) | ||
439 | - prio = MAX_RT_PRIO; | ||
440 | - if (prio > MAX_PRIO-1) | ||
441 | - prio = MAX_PRIO-1; | ||
442 | - return prio; | ||
443 | +static inline void enqueue_task_head(task_t *p, runqueue_t *rq) | ||
444 | +{ | ||
445 | + list_add(&p->run_list, rq->queue + p->prio); | ||
446 | + __set_bit(p->prio, rq->bitmap); | ||
447 | } | ||
448 | |||
449 | +static unsigned int slice(const task_t *p); | ||
450 | + | ||
451 | /* | ||
452 | * To aid in avoiding the subversion of "niceness" due to uneven distribution | ||
453 | * of tasks with abnormal "nice" values across CPUs the contribution that | ||
454 | @@ -688,10 +585,9 @@ static int effective_prio(task_t *p) | ||
455 | #define TIME_SLICE_NICE_ZERO DEF_TIMESLICE | ||
456 | #define LOAD_WEIGHT(lp) \ | ||
457 | (((lp) * SCHED_LOAD_SCALE) / TIME_SLICE_NICE_ZERO) | ||
458 | -#define PRIO_TO_LOAD_WEIGHT(prio) \ | ||
459 | - LOAD_WEIGHT(static_prio_timeslice(prio)) | ||
460 | -#define RTPRIO_TO_LOAD_WEIGHT(rp) \ | ||
461 | - (PRIO_TO_LOAD_WEIGHT(MAX_RT_PRIO) + LOAD_WEIGHT(rp)) | ||
462 | +#define TASK_LOAD_WEIGHT(p) LOAD_WEIGHT(slice(p)) | ||
463 | +#define RTPRIO_TO_LOAD_WEIGHT(rp) \ | ||
464 | + (LOAD_WEIGHT((RR_INTERVAL + 20 + (rp)))) | ||
465 | |||
466 | static void set_load_weight(task_t *p) | ||
467 | { | ||
468 | @@ -708,7 +604,7 @@ static void set_load_weight(task_t *p) | ||
469 | #endif | ||
470 | p->load_weight = RTPRIO_TO_LOAD_WEIGHT(p->rt_priority); | ||
471 | } else | ||
472 | - p->load_weight = PRIO_TO_LOAD_WEIGHT(p->static_prio); | ||
473 | + p->load_weight = TASK_LOAD_WEIGHT(p); | ||
474 | } | ||
475 | |||
476 | static inline void inc_raw_weighted_load(runqueue_t *rq, const task_t *p) | ||
477 | @@ -736,13 +632,9 @@ static inline void dec_nr_running(task_t | ||
478 | /* | ||
479 | * __activate_task - move a task to the runqueue. | ||
480 | */ | ||
481 | -static void __activate_task(task_t *p, runqueue_t *rq) | ||
482 | +static inline void __activate_task(task_t *p, runqueue_t *rq) | ||
483 | { | ||
484 | - prio_array_t *target = rq->active; | ||
485 | - | ||
486 | - if (batch_task(p)) | ||
487 | - target = rq->expired; | ||
488 | - enqueue_task(p, target); | ||
489 | + enqueue_task(p, rq); | ||
490 | inc_nr_running(p, rq); | ||
491 | } | ||
492 | |||
493 | @@ -751,85 +643,181 @@ static void __activate_task(task_t *p, r | ||
494 | */ | ||
495 | static inline void __activate_idle_task(task_t *p, runqueue_t *rq) | ||
496 | { | ||
497 | - enqueue_task_head(p, rq->active); | ||
498 | + enqueue_task_head(p, rq); | ||
499 | inc_nr_running(p, rq); | ||
500 | } | ||
501 | |||
502 | -static int recalc_task_prio(task_t *p, unsigned long long now) | ||
503 | +/* | ||
504 | + * Bonus - How much higher than its base priority an interactive task can run. | ||
505 | + */ | ||
506 | +static inline unsigned int bonus(const task_t *p) | ||
507 | { | ||
508 | - /* Caller must always ensure 'now >= p->timestamp' */ | ||
509 | - unsigned long long __sleep_time = now - p->timestamp; | ||
510 | - unsigned long sleep_time; | ||
511 | + return TASK_USER_PRIO(p); | ||
512 | +} | ||
513 | |||
514 | - if (batch_task(p)) | ||
515 | - sleep_time = 0; | ||
516 | +static unsigned int rr_interval(const task_t *p) | ||
517 | +{ | ||
518 | + int nice = TASK_NICE(p); | ||
519 | + | ||
520 | + if (nice < 0 && !rt_task(p)) | ||
521 | + return RR_INTERVAL * (20 - nice) / 20; | ||
522 | + return RR_INTERVAL; | ||
523 | +} | ||
524 | + | ||
525 | +/* | ||
526 | + * slice - the duration a task runs before getting requeued at its best | ||
527 | + * priority and has its bonus decremented. | ||
528 | + */ | ||
529 | +static unsigned int slice(const task_t *p) | ||
530 | +{ | ||
531 | + unsigned int slice, rr; | ||
532 | + | ||
533 | + slice = rr = rr_interval(p); | ||
534 | + if (likely(!rt_task(p))) | ||
535 | + slice += (39 - TASK_USER_PRIO(p)) * rr; | ||
536 | + return slice; | ||
537 | +} | ||
538 | + | ||
539 | +/* | ||
540 | + * We increase our bonus by sleeping more than the time we ran. | ||
541 | + * The ratio of sleep to run gives us the cpu% that we last ran and determines | ||
542 | + * the maximum bonus we can acquire. | ||
543 | + */ | ||
544 | +static void inc_bonus(task_t *p, unsigned long totalrun, unsigned long sleep) | ||
545 | +{ | ||
546 | + unsigned int best_bonus = sleep / (totalrun + 1); | ||
547 | + | ||
548 | + if (p->bonus >= best_bonus) | ||
549 | + return; | ||
550 | + best_bonus = bonus(p); | ||
551 | + if (p->bonus < best_bonus) | ||
552 | + p->bonus++; | ||
553 | +} | ||
554 | + | ||
555 | +static inline void dec_bonus(task_t *p) | ||
556 | +{ | ||
557 | + if (p->bonus) | ||
558 | + p->bonus--; | ||
559 | +} | ||
560 | + | ||
561 | +static inline void slice_overrun(struct task_struct *p) | ||
562 | +{ | ||
563 | + unsigned long ns_slice = JIFFIES_TO_NS(p->slice); | ||
564 | + | ||
565 | + do { | ||
566 | + p->totalrun -= ns_slice; | ||
567 | + dec_bonus(p); | ||
568 | + } while (unlikely(p->totalrun > ns_slice)); | ||
569 | +} | ||
570 | + | ||
571 | +/* | ||
572 | + * effective_prio - dynamic priority dependent on bonus. | ||
573 | + * The priority normally decreases by one each RR_INTERVAL. | ||
574 | + * As the bonus increases the initial priority starts at a higher "stair" or | ||
575 | + * priority for longer. | ||
576 | + */ | ||
577 | +static int effective_prio(const task_t *p) | ||
578 | +{ | ||
579 | + int prio; | ||
580 | + unsigned int full_slice, used_slice = 0; | ||
581 | + unsigned int best_bonus, rr; | ||
582 | + | ||
583 | + if (rt_task(p)) | ||
584 | + return p->prio; | ||
585 | + | ||
586 | + full_slice = slice(p); | ||
587 | + if (full_slice > p->slice) | ||
588 | + used_slice = full_slice - p->slice; | ||
589 | + | ||
590 | + best_bonus = bonus(p); | ||
591 | + prio = MAX_RT_PRIO + best_bonus; | ||
592 | + if (!batch_task(p)) | ||
593 | + prio -= p->bonus; | ||
594 | + | ||
595 | + rr = rr_interval(p); | ||
596 | + prio += used_slice / rr; | ||
597 | + if (prio > MIN_USER_PRIO) | ||
598 | + prio = MIN_USER_PRIO; | ||
599 | + return prio; | ||
600 | +} | ||
601 | + | ||
602 | +static inline void continue_slice(task_t *p) | ||
603 | +{ | ||
604 | + unsigned long total_run = NS_TO_JIFFIES(p->totalrun); | ||
605 | + | ||
606 | + if (unlikely(total_run >= p->slice)) | ||
607 | + slice_overrun(p); | ||
608 | else { | ||
609 | - if (__sleep_time > NS_MAX_SLEEP_AVG) | ||
610 | - sleep_time = NS_MAX_SLEEP_AVG; | ||
611 | - else | ||
612 | - sleep_time = (unsigned long)__sleep_time; | ||
613 | + unsigned long remainder; | ||
614 | + | ||
615 | + p->slice -= total_run; | ||
616 | + remainder = p->slice % rr_interval(p); | ||
617 | + if (remainder) | ||
618 | + p->time_slice = remainder; | ||
619 | } | ||
620 | +} | ||
621 | |||
622 | - if (likely(sleep_time > 0)) { | ||
623 | - /* | ||
624 | - * User tasks that sleep a long time are categorised as | ||
625 | - * idle. They will only have their sleep_avg increased to a | ||
626 | - * level that makes them just interactive priority to stay | ||
627 | - * active yet prevent them suddenly becoming cpu hogs and | ||
628 | - * starving other processes. | ||
629 | - */ | ||
630 | - if (p->mm && sleep_time > INTERACTIVE_SLEEP(p)) { | ||
631 | - unsigned long ceiling; | ||
632 | +/* | ||
633 | + * recalc_task_prio - this checks for tasks that have run less than a full | ||
634 | + * slice and have woken up again soon after, or have just forked a | ||
635 | + * thread/process and make them continue their old slice instead of starting | ||
636 | + * a new one at high priority. | ||
637 | + */ | ||
638 | +static inline void recalc_task_prio(task_t *p, const unsigned long long now) | ||
639 | +{ | ||
640 | + unsigned long sleep_time; | ||
641 | |||
642 | - ceiling = JIFFIES_TO_NS(MAX_SLEEP_AVG - | ||
643 | - DEF_TIMESLICE); | ||
644 | - if (p->sleep_avg < ceiling) | ||
645 | - p->sleep_avg = ceiling; | ||
646 | - } else { | ||
647 | - /* | ||
648 | - * Tasks waking from uninterruptible sleep are | ||
649 | - * limited in their sleep_avg rise as they | ||
650 | - * are likely to be waiting on I/O | ||
651 | - */ | ||
652 | - if (p->sleep_type == SLEEP_NONINTERACTIVE && p->mm) { | ||
653 | - if (p->sleep_avg >= INTERACTIVE_SLEEP(p)) | ||
654 | - sleep_time = 0; | ||
655 | - else if (p->sleep_avg + sleep_time >= | ||
656 | - INTERACTIVE_SLEEP(p)) { | ||
657 | - p->sleep_avg = INTERACTIVE_SLEEP(p); | ||
658 | - sleep_time = 0; | ||
659 | - } | ||
660 | - } | ||
661 | + /* | ||
662 | + * If this task has managed to run to its lowest priority then | ||
663 | + * decrease its bonus and requeue it now at best priority instead | ||
664 | + * of possibly flagging around lowest priority. Save up any systime | ||
665 | + * that may affect priority on the next reschedule. | ||
666 | + */ | ||
667 | + if (p->slice > p->time_slice && | ||
668 | + p->slice - NS_TO_JIFFIES(p->totalrun) < p->time_slice) { | ||
669 | + dec_bonus(p); | ||
670 | + p->totalrun = 0; | ||
671 | + return; | ||
672 | + } | ||
673 | |||
674 | - /* | ||
675 | - * This code gives a bonus to interactive tasks. | ||
676 | - * | ||
677 | - * The boost works by updating the 'average sleep time' | ||
678 | - * value here, based on ->timestamp. The more time a | ||
679 | - * task spends sleeping, the higher the average gets - | ||
680 | - * and the higher the priority boost gets as well. | ||
681 | - */ | ||
682 | - p->sleep_avg += sleep_time; | ||
683 | + /* | ||
684 | + * Add the total for this last scheduled run (p->runtime) and system | ||
685 | + * time (p->systime) done on behalf of p to the running total so far | ||
686 | + * used (p->totalrun). | ||
687 | + */ | ||
688 | + p->totalrun += p->runtime + p->systime; | ||
689 | + sleep_time = ns_diff(now, p->timestamp); | ||
690 | |||
691 | - if (p->sleep_avg > NS_MAX_SLEEP_AVG) | ||
692 | - p->sleep_avg = NS_MAX_SLEEP_AVG; | ||
693 | + if (p->systime > sleep_time || p->flags & PF_FORKED) | ||
694 | + sleep_time = 0; | ||
695 | + else { | ||
696 | + sleep_time -= p->systime; | ||
697 | + /* | ||
698 | + * We elevate priority by the amount of time we slept. If we | ||
699 | + * sleep longer than our running total and have not set the | ||
700 | + * PF_NONSLEEP flag we gain a bonus. | ||
701 | + */ | ||
702 | + if (sleep_time >= p->totalrun) { | ||
703 | + if (!(p->flags & PF_NONSLEEP)) | ||
704 | + inc_bonus(p, p->totalrun, sleep_time); | ||
705 | + p->totalrun = 0; | ||
706 | + return; | ||
707 | } | ||
708 | + p->totalrun -= sleep_time; | ||
709 | } | ||
710 | - | ||
711 | - return effective_prio(p); | ||
712 | + continue_slice(p); | ||
713 | } | ||
714 | |||
715 | /* | ||
716 | * activate_task - move a task to the runqueue and do priority recalculation | ||
717 | * | ||
718 | - * Update all the scheduling statistics stuff. (sleep average | ||
719 | - * calculation, priority modifiers, etc.) | ||
720 | + * Update all the scheduling statistics stuff. (priority modifiers, etc.) | ||
721 | */ | ||
722 | -static void activate_task(task_t *p, runqueue_t *rq, int local) | ||
723 | +static void activate_task(task_t *p, runqueue_t *rq, const int local) | ||
724 | { | ||
725 | - unsigned long long now; | ||
726 | + unsigned long long now = sched_clock(); | ||
727 | + unsigned long rr = rr_interval(p); | ||
728 | |||
729 | - now = sched_clock(); | ||
730 | #ifdef CONFIG_SMP | ||
731 | if (!local) { | ||
732 | /* Compensate for drifting sched_clock */ | ||
733 | @@ -838,45 +826,25 @@ static void activate_task(task_t *p, run | ||
734 | + rq->timestamp_last_tick; | ||
735 | } | ||
736 | #endif | ||
737 | - | ||
738 | - if (!rt_task(p)) | ||
739 | - p->prio = recalc_task_prio(p, now); | ||
740 | - | ||
741 | - /* | ||
742 | - * This checks to make sure it's not an uninterruptible task | ||
743 | - * that is now waking up. | ||
744 | - */ | ||
745 | - if (p->sleep_type == SLEEP_NORMAL) { | ||
746 | - /* | ||
747 | - * Tasks which were woken up by interrupts (ie. hw events) | ||
748 | - * are most likely of interactive nature. So we give them | ||
749 | - * the credit of extending their sleep time to the period | ||
750 | - * of time they spend on the runqueue, waiting for execution | ||
751 | - * on a CPU, first time around: | ||
752 | - */ | ||
753 | - if (in_interrupt()) | ||
754 | - p->sleep_type = SLEEP_INTERRUPTED; | ||
755 | - else { | ||
756 | - /* | ||
757 | - * Normal first-time wakeups get a credit too for | ||
758 | - * on-runqueue time, but it will be weighted down: | ||
759 | - */ | ||
760 | - p->sleep_type = SLEEP_INTERACTIVE; | ||
761 | - } | ||
762 | + p->slice = slice(p); | ||
763 | + p->time_slice = p->slice % rr ? : rr; | ||
764 | + if (!rt_task(p)) { | ||
765 | + recalc_task_prio(p, now); | ||
766 | + p->prio = effective_prio(p); | ||
767 | + p->systime = 0; | ||
768 | + p->flags &= ~(PF_FORKED | PF_NONSLEEP); | ||
769 | } | ||
770 | p->timestamp = now; | ||
771 | - | ||
772 | __activate_task(p, rq); | ||
773 | } | ||
774 | |||
775 | /* | ||
776 | * deactivate_task - remove a task from the runqueue. | ||
777 | */ | ||
778 | -static void deactivate_task(struct task_struct *p, runqueue_t *rq) | ||
779 | +static void deactivate_task(task_t *p, runqueue_t *rq) | ||
780 | { | ||
781 | dec_nr_running(p, rq); | ||
782 | - dequeue_task(p, p->array); | ||
783 | - p->array = NULL; | ||
784 | + dequeue_task(p, rq); | ||
785 | } | ||
786 | |||
787 | /* | ||
788 | @@ -952,7 +920,7 @@ static int migrate_task(task_t *p, int d | ||
789 | * If the task is not on a runqueue (and not running), then | ||
790 | * it is sufficient to simply update the task's cpu field. | ||
791 | */ | ||
792 | - if (!p->array && !task_running(rq, p)) { | ||
793 | + if (!task_queued(p) && !task_running(rq, p)) { | ||
794 | set_task_cpu(p, dest_cpu); | ||
795 | return 0; | ||
796 | } | ||
797 | @@ -982,7 +950,7 @@ void wait_task_inactive(task_t *p) | ||
798 | repeat: | ||
799 | rq = task_rq_lock(p, &flags); | ||
800 | /* Must be off runqueue entirely, not preempted. */ | ||
801 | - if (unlikely(p->array || task_running(rq, p))) { | ||
802 | + if (unlikely(task_queued(p) || task_running(rq, p))) { | ||
803 | /* If it's preempted, we yield. It could be a while. */ | ||
804 | preempted = !task_running(rq, p); | ||
805 | task_rq_unlock(rq, &flags); | ||
806 | @@ -1234,6 +1202,15 @@ static inline int wake_idle(int cpu, tas | ||
807 | } | ||
808 | #endif | ||
809 | |||
810 | +/* | ||
811 | + * Check to see if p preempts rq->curr and resched if it does. | ||
812 | + */ | ||
813 | +static inline void preempt(const task_t *p, runqueue_t *rq) | ||
814 | +{ | ||
815 | + if (TASK_PREEMPTS_CURR(p, rq)) | ||
816 | + resched_task(rq->curr); | ||
817 | +} | ||
818 | + | ||
819 | /*** | ||
820 | * try_to_wake_up - wake up a thread | ||
821 | * @p: the to-be-woken-up thread | ||
822 | @@ -1265,7 +1242,7 @@ static int try_to_wake_up(task_t *p, uns | ||
823 | if (!(old_state & state)) | ||
824 | goto out; | ||
825 | |||
826 | - if (p->array) | ||
827 | + if (task_queued(p)) | ||
828 | goto out_running; | ||
829 | |||
830 | cpu = task_cpu(p); | ||
831 | @@ -1356,7 +1333,7 @@ out_set_cpu: | ||
832 | old_state = p->state; | ||
833 | if (!(old_state & state)) | ||
834 | goto out; | ||
835 | - if (p->array) | ||
836 | + if (task_queued(p)) | ||
837 | goto out_running; | ||
838 | |||
839 | this_cpu = smp_processor_id(); | ||
840 | @@ -1365,25 +1342,9 @@ out_set_cpu: | ||
841 | |||
842 | out_activate: | ||
843 | #endif /* CONFIG_SMP */ | ||
844 | - if (old_state == TASK_UNINTERRUPTIBLE) { | ||
845 | + if (old_state == TASK_UNINTERRUPTIBLE) | ||
846 | rq->nr_uninterruptible--; | ||
847 | - /* | ||
848 | - * Tasks on involuntary sleep don't earn | ||
849 | - * sleep_avg beyond just interactive state. | ||
850 | - */ | ||
851 | - p->sleep_type = SLEEP_NONINTERACTIVE; | ||
852 | - } else | ||
853 | - | ||
854 | - /* | ||
855 | - * Tasks that have marked their sleep as noninteractive get | ||
856 | - * woken up with their sleep average not weighted in an | ||
857 | - * interactive way. | ||
858 | - */ | ||
859 | - if (old_state & TASK_NONINTERACTIVE) | ||
860 | - p->sleep_type = SLEEP_NONINTERACTIVE; | ||
861 | - | ||
862 | |||
863 | - activate_task(p, rq, cpu == this_cpu); | ||
864 | /* | ||
865 | * Sync wakeups (i.e. those types of wakeups where the waker | ||
866 | * has indicated that it will leave the CPU in short order) | ||
867 | @@ -1392,10 +1353,9 @@ out_activate: | ||
868 | * the waker guarantees that the freshly woken up task is going | ||
869 | * to be considered on this CPU.) | ||
870 | */ | ||
871 | - if (!sync || cpu != this_cpu) { | ||
872 | - if (TASK_PREEMPTS_CURR(p, rq)) | ||
873 | - resched_task(rq->curr); | ||
874 | - } | ||
875 | + activate_task(p, rq, cpu == this_cpu); | ||
876 | + if (!sync || cpu != this_cpu) | ||
877 | + preempt(p, rq); | ||
878 | success = 1; | ||
879 | |||
880 | out_running: | ||
881 | @@ -1440,7 +1400,6 @@ void fastcall sched_fork(task_t *p, int | ||
882 | */ | ||
883 | p->state = TASK_RUNNING; | ||
884 | INIT_LIST_HEAD(&p->run_list); | ||
885 | - p->array = NULL; | ||
886 | #ifdef CONFIG_SCHEDSTATS | ||
887 | memset(&p->sched_info, 0, sizeof(p->sched_info)); | ||
888 | #endif | ||
889 | @@ -1451,30 +1410,6 @@ void fastcall sched_fork(task_t *p, int | ||
890 | /* Want to start with kernel preemption disabled. */ | ||
891 | task_thread_info(p)->preempt_count = 1; | ||
892 | #endif | ||
893 | - /* | ||
894 | - * Share the timeslice between parent and child, thus the | ||
895 | - * total amount of pending timeslices in the system doesn't change, | ||
896 | - * resulting in more scheduling fairness. | ||
897 | - */ | ||
898 | - local_irq_disable(); | ||
899 | - p->time_slice = (current->time_slice + 1) >> 1; | ||
900 | - /* | ||
901 | - * The remainder of the first timeslice might be recovered by | ||
902 | - * the parent if the child exits early enough. | ||
903 | - */ | ||
904 | - p->first_time_slice = 1; | ||
905 | - current->time_slice >>= 1; | ||
906 | - p->timestamp = sched_clock(); | ||
907 | - if (unlikely(!current->time_slice)) { | ||
908 | - /* | ||
909 | - * This case is rare, it happens when the parent has only | ||
910 | - * a single jiffy left from its timeslice. Taking the | ||
911 | - * runqueue lock is not a problem. | ||
912 | - */ | ||
913 | - current->time_slice = 1; | ||
914 | - scheduler_tick(); | ||
915 | - } | ||
916 | - local_irq_enable(); | ||
917 | put_cpu(); | ||
918 | } | ||
919 | |||
920 | @@ -1496,37 +1431,20 @@ void fastcall wake_up_new_task(task_t *p | ||
921 | this_cpu = smp_processor_id(); | ||
922 | cpu = task_cpu(p); | ||
923 | |||
924 | - /* | ||
925 | - * We decrease the sleep average of forking parents | ||
926 | - * and children as well, to keep max-interactive tasks | ||
927 | - * from forking tasks that are max-interactive. The parent | ||
928 | - * (current) is done further down, under its lock. | ||
929 | - */ | ||
930 | - p->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(p) * | ||
931 | - CHILD_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS); | ||
932 | - | ||
933 | - p->prio = effective_prio(p); | ||
934 | + /* Forked process gets no bonus to prevent fork bombs. */ | ||
935 | + p->bonus = 0; | ||
936 | + current->flags |= PF_FORKED; | ||
937 | |||
938 | if (likely(cpu == this_cpu)) { | ||
939 | + activate_task(p, rq, 1); | ||
940 | if (!(clone_flags & CLONE_VM)) { | ||
941 | /* | ||
942 | * The VM isn't cloned, so we're in a good position to | ||
943 | * do child-runs-first in anticipation of an exec. This | ||
944 | * usually avoids a lot of COW overhead. | ||
945 | */ | ||
946 | - if (unlikely(!current->array)) | ||
947 | - __activate_task(p, rq); | ||
948 | - else { | ||
949 | - p->prio = current->prio; | ||
950 | - list_add_tail(&p->run_list, ¤t->run_list); | ||
951 | - p->array = current->array; | ||
952 | - p->array->nr_active++; | ||
953 | - inc_nr_running(p, rq); | ||
954 | - } | ||
955 | set_need_resched(); | ||
956 | - } else | ||
957 | - /* Run child last */ | ||
958 | - __activate_task(p, rq); | ||
959 | + } | ||
960 | /* | ||
961 | * We skip the following code due to cpu == this_cpu | ||
962 | * | ||
963 | @@ -1543,53 +1461,19 @@ void fastcall wake_up_new_task(task_t *p | ||
964 | */ | ||
965 | p->timestamp = (p->timestamp - this_rq->timestamp_last_tick) | ||
966 | + rq->timestamp_last_tick; | ||
967 | - __activate_task(p, rq); | ||
968 | - if (TASK_PREEMPTS_CURR(p, rq)) | ||
969 | - resched_task(rq->curr); | ||
970 | + activate_task(p, rq, 0); | ||
971 | + preempt(p, rq); | ||
972 | |||
973 | /* | ||
974 | * Parent and child are on different CPUs, now get the | ||
975 | - * parent runqueue to update the parent's ->sleep_avg: | ||
976 | + * parent runqueue to update the parent's ->flags: | ||
977 | */ | ||
978 | task_rq_unlock(rq, &flags); | ||
979 | this_rq = task_rq_lock(current, &flags); | ||
980 | } | ||
981 | - current->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(current) * | ||
982 | - PARENT_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS); | ||
983 | task_rq_unlock(this_rq, &flags); | ||
984 | } | ||
985 | |||
986 | -/* | ||
987 | - * Potentially available exiting-child timeslices are | ||
988 | - * retrieved here - this way the parent does not get | ||
989 | - * penalized for creating too many threads. | ||
990 | - * | ||
991 | - * (this cannot be used to 'generate' timeslices | ||
992 | - * artificially, because any timeslice recovered here | ||
993 | - * was given away by the parent in the first place.) | ||
994 | - */ | ||
995 | -void fastcall sched_exit(task_t *p) | ||
996 | -{ | ||
997 | - unsigned long flags; | ||
998 | - runqueue_t *rq; | ||
999 | - | ||
1000 | - /* | ||
1001 | - * If the child was a (relative-) CPU hog then decrease | ||
1002 | - * the sleep_avg of the parent as well. | ||
1003 | - */ | ||
1004 | - rq = task_rq_lock(p->parent, &flags); | ||
1005 | - if (p->first_time_slice && task_cpu(p) == task_cpu(p->parent)) { | ||
1006 | - p->parent->time_slice += p->time_slice; | ||
1007 | - if (unlikely(p->parent->time_slice > task_timeslice(p))) | ||
1008 | - p->parent->time_slice = task_timeslice(p); | ||
1009 | - } | ||
1010 | - if (p->sleep_avg < p->parent->sleep_avg) | ||
1011 | - p->parent->sleep_avg = p->parent->sleep_avg / | ||
1012 | - (EXIT_WEIGHT + 1) * EXIT_WEIGHT + p->sleep_avg / | ||
1013 | - (EXIT_WEIGHT + 1); | ||
1014 | - task_rq_unlock(rq, &flags); | ||
1015 | -} | ||
1016 | - | ||
1017 | /** | ||
1018 | * prepare_task_switch - prepare to switch tasks | ||
1019 | * @rq: the runqueue preparing to switch | ||
1020 | @@ -1885,23 +1769,21 @@ void sched_exec(void) | ||
1021 | * pull_task - move a task from a remote runqueue to the local runqueue. | ||
1022 | * Both runqueues must be locked. | ||
1023 | */ | ||
1024 | -static | ||
1025 | -void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p, | ||
1026 | - runqueue_t *this_rq, prio_array_t *this_array, int this_cpu) | ||
1027 | +static void pull_task(runqueue_t *src_rq, task_t *p, runqueue_t *this_rq, | ||
1028 | + const int this_cpu) | ||
1029 | { | ||
1030 | - dequeue_task(p, src_array); | ||
1031 | + dequeue_task(p, src_rq); | ||
1032 | dec_nr_running(p, src_rq); | ||
1033 | set_task_cpu(p, this_cpu); | ||
1034 | inc_nr_running(p, this_rq); | ||
1035 | - enqueue_task(p, this_array); | ||
1036 | + enqueue_task(p, this_rq); | ||
1037 | p->timestamp = (p->timestamp - src_rq->timestamp_last_tick) | ||
1038 | + this_rq->timestamp_last_tick; | ||
1039 | /* | ||
1040 | * Note that idle threads have a prio of MAX_PRIO, for this test | ||
1041 | * to be always true for them. | ||
1042 | */ | ||
1043 | - if (TASK_PREEMPTS_CURR(p, this_rq)) | ||
1044 | - resched_task(this_rq->curr); | ||
1045 | + preempt(p, this_rq); | ||
1046 | } | ||
1047 | |||
1048 | /* | ||
1049 | @@ -1939,7 +1821,6 @@ int can_migrate_task(task_t *p, runqueue | ||
1050 | return 1; | ||
1051 | } | ||
1052 | |||
1053 | -#define rq_best_prio(rq) min((rq)->curr->prio, (rq)->best_expired_prio) | ||
1054 | /* | ||
1055 | * move_tasks tries to move up to max_nr_move tasks and max_load_move weighted | ||
1056 | * load from busiest to this_rq, as part of a balancing operation within | ||
1057 | @@ -1952,7 +1833,6 @@ static int move_tasks(runqueue_t *this_r | ||
1058 | struct sched_domain *sd, enum idle_type idle, | ||
1059 | int *all_pinned) | ||
1060 | { | ||
1061 | - prio_array_t *array, *dst_array; | ||
1062 | struct list_head *head, *curr; | ||
1063 | int idx, pulled = 0, pinned = 0, this_best_prio, busiest_best_prio; | ||
1064 | int busiest_best_prio_seen; | ||
1065 | @@ -1965,8 +1845,8 @@ static int move_tasks(runqueue_t *this_r | ||
1066 | |||
1067 | rem_load_move = max_load_move; | ||
1068 | pinned = 1; | ||
1069 | - this_best_prio = rq_best_prio(this_rq); | ||
1070 | - busiest_best_prio = rq_best_prio(busiest); | ||
1071 | + this_best_prio = this_rq->curr->prio; | ||
1072 | + busiest_best_prio = busiest->curr->prio; | ||
1073 | /* | ||
1074 | * Enable handling of the case where there is more than one task | ||
1075 | * with the best priority. If the current running task is one | ||
1076 | @@ -1976,38 +1856,17 @@ static int move_tasks(runqueue_t *this_r | ||
1077 | */ | ||
1078 | busiest_best_prio_seen = busiest_best_prio == busiest->curr->prio; | ||
1079 | |||
1080 | - /* | ||
1081 | - * We first consider expired tasks. Those will likely not be | ||
1082 | - * executed in the near future, and they are most likely to | ||
1083 | - * be cache-cold, thus switching CPUs has the least effect | ||
1084 | - * on them. | ||
1085 | - */ | ||
1086 | - if (busiest->expired->nr_active) { | ||
1087 | - array = busiest->expired; | ||
1088 | - dst_array = this_rq->expired; | ||
1089 | - } else { | ||
1090 | - array = busiest->active; | ||
1091 | - dst_array = this_rq->active; | ||
1092 | - } | ||
1093 | - | ||
1094 | -new_array: | ||
1095 | /* Start searching at priority 0: */ | ||
1096 | idx = 0; | ||
1097 | skip_bitmap: | ||
1098 | if (!idx) | ||
1099 | - idx = sched_find_first_bit(array->bitmap); | ||
1100 | + idx = sched_find_first_bit(busiest->bitmap); | ||
1101 | else | ||
1102 | - idx = find_next_bit(array->bitmap, MAX_PRIO, idx); | ||
1103 | - if (idx >= MAX_PRIO) { | ||
1104 | - if (array == busiest->expired && busiest->active->nr_active) { | ||
1105 | - array = busiest->active; | ||
1106 | - dst_array = this_rq->active; | ||
1107 | - goto new_array; | ||
1108 | - } | ||
1109 | + idx = find_next_bit(busiest->bitmap, MAX_PRIO, idx); | ||
1110 | + if (idx >= MAX_PRIO) | ||
1111 | goto out; | ||
1112 | - } | ||
1113 | |||
1114 | - head = array->queue + idx; | ||
1115 | + head = busiest->queue + idx; | ||
1116 | curr = head->prev; | ||
1117 | skip_queue: | ||
1118 | tmp = list_entry(curr, task_t, run_list); | ||
1119 | @@ -2036,7 +1895,7 @@ skip_queue: | ||
1120 | schedstat_inc(sd, lb_hot_gained[idle]); | ||
1121 | #endif | ||
1122 | |||
1123 | - pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu); | ||
1124 | + pull_task(busiest, tmp, this_rq, this_cpu); | ||
1125 | pulled++; | ||
1126 | rem_load_move -= tmp->load_weight; | ||
1127 | |||
1128 | @@ -2585,15 +2444,13 @@ static void rebalance_tick(int this_cpu, | ||
1129 | continue; | ||
1130 | |||
1131 | interval = sd->balance_interval; | ||
1132 | - if (idle != SCHED_IDLE) | ||
1133 | - interval *= sd->busy_factor; | ||
1134 | |||
1135 | /* scale ms to jiffies */ | ||
1136 | interval = msecs_to_jiffies(interval); | ||
1137 | if (unlikely(!interval)) | ||
1138 | interval = 1; | ||
1139 | |||
1140 | - if (j - sd->last_balance >= interval) { | ||
1141 | + if (idle != SCHED_IDLE || j - sd->last_balance >= interval) { | ||
1142 | if (load_balance(this_cpu, this_rq, sd, idle)) { | ||
1143 | /* | ||
1144 | * We've pulled tasks over so either we're no | ||
1145 | @@ -2667,22 +2524,6 @@ unsigned long long current_sched_time(co | ||
1146 | } | ||
1147 | |||
1148 | /* | ||
1149 | - * We place interactive tasks back into the active array, if possible. | ||
1150 | - * | ||
1151 | - * To guarantee that this does not starve expired tasks we ignore the | ||
1152 | - * interactivity of a task if the first expired task had to wait more | ||
1153 | - * than a 'reasonable' amount of time. This deadline timeout is | ||
1154 | - * load-dependent, as the frequency of array switched decreases with | ||
1155 | - * increasing number of running tasks. We also ignore the interactivity | ||
1156 | - * if a better static_prio task has expired: | ||
1157 | - */ | ||
1158 | -#define EXPIRED_STARVING(rq) \ | ||
1159 | - ((STARVATION_LIMIT && ((rq)->expired_timestamp && \ | ||
1160 | - (jiffies - (rq)->expired_timestamp >= \ | ||
1161 | - STARVATION_LIMIT * ((rq)->nr_running) + 1))) || \ | ||
1162 | - ((rq)->curr->static_prio > (rq)->best_expired_prio)) | ||
1163 | - | ||
1164 | -/* | ||
1165 | * Account user cpu time to a process. | ||
1166 | * @p: the process that the cpu time gets accounted to | ||
1167 | * @hardirq_offset: the offset to subtract from hardirq_count() | ||
1168 | @@ -2730,6 +2571,8 @@ void account_system_time(struct task_str | ||
1169 | cpustat->iowait = cputime64_add(cpustat->iowait, tmp); | ||
1170 | else | ||
1171 | cpustat->idle = cputime64_add(cpustat->idle, tmp); | ||
1172 | + | ||
1173 | + p->systime += NSJIFFY; | ||
1174 | /* Account for system time used */ | ||
1175 | acct_update_integrals(p); | ||
1176 | } | ||
1177 | @@ -2755,18 +2598,23 @@ void account_steal_time(struct task_stru | ||
1178 | cpustat->steal = cputime64_add(cpustat->steal, tmp); | ||
1179 | } | ||
1180 | |||
1181 | +static void time_slice_expired(task_t *p, runqueue_t *rq) | ||
1182 | +{ | ||
1183 | + set_tsk_need_resched(p); | ||
1184 | + p->time_slice = rr_interval(p); | ||
1185 | + requeue_task(p, rq, effective_prio(p)); | ||
1186 | +} | ||
1187 | + | ||
1188 | /* | ||
1189 | * This function gets called by the timer code, with HZ frequency. | ||
1190 | * We call it with interrupts disabled. | ||
1191 | - * | ||
1192 | - * It also gets called by the fork code, when changing the parent's | ||
1193 | - * timeslices. | ||
1194 | */ | ||
1195 | void scheduler_tick(void) | ||
1196 | { | ||
1197 | int cpu = smp_processor_id(); | ||
1198 | runqueue_t *rq = this_rq(); | ||
1199 | task_t *p = current; | ||
1200 | + unsigned long debit; | ||
1201 | unsigned long long now = sched_clock(); | ||
1202 | |||
1203 | update_cpu_clock(p, rq, now); | ||
1204 | @@ -2781,73 +2629,37 @@ void scheduler_tick(void) | ||
1205 | } | ||
1206 | |||
1207 | /* Task might have expired already, but not scheduled off yet */ | ||
1208 | - if (p->array != rq->active) { | ||
1209 | + if (unlikely(!task_queued(p))) { | ||
1210 | set_tsk_need_resched(p); | ||
1211 | goto out; | ||
1212 | } | ||
1213 | + /* SCHED_FIFO tasks never run out of timeslice. */ | ||
1214 | + if (unlikely(p->policy == SCHED_FIFO)) | ||
1215 | + goto out; | ||
1216 | + | ||
1217 | spin_lock(&rq->lock); | ||
1218 | + debit = ns_diff(rq->timestamp_last_tick, p->timestamp); | ||
1219 | + p->ns_debit += debit; | ||
1220 | + if (p->ns_debit < NSJIFFY) | ||
1221 | + goto out_unlock; | ||
1222 | + p->ns_debit %= NSJIFFY; | ||
1223 | /* | ||
1224 | - * The task was running during this tick - update the | ||
1225 | - * time slice counter. Note: we do not update a thread's | ||
1226 | - * priority until it either goes to sleep or uses up its | ||
1227 | - * timeslice. This makes it possible for interactive tasks | ||
1228 | - * to use up their timeslices at their highest priority levels. | ||
1229 | + * Tasks lose bonus each time they use up a full slice(). | ||
1230 | */ | ||
1231 | - if (rt_task(p)) { | ||
1232 | - /* | ||
1233 | - * RR tasks need a special form of timeslice management. | ||
1234 | - * FIFO tasks have no timeslices. | ||
1235 | - */ | ||
1236 | - if ((p->policy == SCHED_RR) && !--p->time_slice) { | ||
1237 | - p->time_slice = task_timeslice(p); | ||
1238 | - p->first_time_slice = 0; | ||
1239 | - set_tsk_need_resched(p); | ||
1240 | - | ||
1241 | - /* put it at the end of the queue: */ | ||
1242 | - requeue_task(p, rq->active); | ||
1243 | - } | ||
1244 | + if (!--p->slice) { | ||
1245 | + dec_bonus(p); | ||
1246 | + p->totalrun = 0; | ||
1247 | + p->slice = slice(p); | ||
1248 | + time_slice_expired(p, rq); | ||
1249 | goto out_unlock; | ||
1250 | } | ||
1251 | + /* | ||
1252 | + * Tasks that run out of time_slice but still have slice left get | ||
1253 | + * requeued with a lower priority && RR_INTERVAL time_slice. | ||
1254 | + */ | ||
1255 | if (!--p->time_slice) { | ||
1256 | - dequeue_task(p, rq->active); | ||
1257 | - set_tsk_need_resched(p); | ||
1258 | - p->prio = effective_prio(p); | ||
1259 | - p->time_slice = task_timeslice(p); | ||
1260 | - p->first_time_slice = 0; | ||
1261 | - | ||
1262 | - if (!rq->expired_timestamp) | ||
1263 | - rq->expired_timestamp = jiffies; | ||
1264 | - if (!TASK_INTERACTIVE(p) || EXPIRED_STARVING(rq)) { | ||
1265 | - enqueue_task(p, rq->expired); | ||
1266 | - if (p->static_prio < rq->best_expired_prio) | ||
1267 | - rq->best_expired_prio = p->static_prio; | ||
1268 | - } else | ||
1269 | - enqueue_task(p, rq->active); | ||
1270 | - } else { | ||
1271 | - /* | ||
1272 | - * Prevent a too long timeslice allowing a task to monopolize | ||
1273 | - * the CPU. We do this by splitting up the timeslice into | ||
1274 | - * smaller pieces. | ||
1275 | - * | ||
1276 | - * Note: this does not mean the task's timeslices expire or | ||
1277 | - * get lost in any way, they just might be preempted by | ||
1278 | - * another task of equal priority. (one with higher | ||
1279 | - * priority would have preempted this task already.) We | ||
1280 | - * requeue this task to the end of the list on this priority | ||
1281 | - * level, which is in essence a round-robin of tasks with | ||
1282 | - * equal priority. | ||
1283 | - * | ||
1284 | - * This only applies to tasks in the interactive | ||
1285 | - * delta range with at least TIMESLICE_GRANULARITY to requeue. | ||
1286 | - */ | ||
1287 | - if (TASK_INTERACTIVE(p) && !((task_timeslice(p) - | ||
1288 | - p->time_slice) % TIMESLICE_GRANULARITY(p)) && | ||
1289 | - (p->time_slice >= TIMESLICE_GRANULARITY(p)) && | ||
1290 | - (p->array == rq->active)) { | ||
1291 | - | ||
1292 | - requeue_task(p, rq->active); | ||
1293 | - set_tsk_need_resched(p); | ||
1294 | - } | ||
1295 | + time_slice_expired(p, rq); | ||
1296 | + goto out_unlock; | ||
1297 | } | ||
1298 | out_unlock: | ||
1299 | spin_unlock(&rq->lock); | ||
1300 | @@ -2896,12 +2708,13 @@ static void wake_sleeping_dependent(int | ||
1301 | |||
1302 | /* | ||
1303 | * number of 'lost' timeslices this task wont be able to fully | ||
1304 | - * utilize, if another task runs on a sibling. This models the | ||
1305 | + * utilise, if another task runs on a sibling. This models the | ||
1306 | * slowdown effect of other tasks running on siblings: | ||
1307 | */ | ||
1308 | -static inline unsigned long smt_slice(task_t *p, struct sched_domain *sd) | ||
1309 | +static inline unsigned long | ||
1310 | +smt_slice(const task_t *p, const struct sched_domain *sd) | ||
1311 | { | ||
1312 | - return p->time_slice * (100 - sd->per_cpu_gain) / 100; | ||
1313 | + return p->slice * (100 - sd->per_cpu_gain) / 100; | ||
1314 | } | ||
1315 | |||
1316 | /* | ||
1317 | @@ -2964,7 +2777,7 @@ static int dependent_sleeper(int this_cp | ||
1318 | } else | ||
1319 | if (smt_curr->static_prio < p->static_prio && | ||
1320 | !TASK_PREEMPTS_CURR(p, smt_rq) && | ||
1321 | - smt_slice(smt_curr, sd) > task_timeslice(p)) | ||
1322 | + smt_slice(smt_curr, sd) > slice(p)) | ||
1323 | ret = 1; | ||
1324 | |||
1325 | unlock: | ||
1326 | @@ -3015,12 +2828,6 @@ EXPORT_SYMBOL(sub_preempt_count); | ||
1327 | |||
1328 | #endif | ||
1329 | |||
1330 | -static inline int interactive_sleep(enum sleep_type sleep_type) | ||
1331 | -{ | ||
1332 | - return (sleep_type == SLEEP_INTERACTIVE || | ||
1333 | - sleep_type == SLEEP_INTERRUPTED); | ||
1334 | -} | ||
1335 | - | ||
1336 | /* | ||
1337 | * schedule() is the main scheduler function. | ||
1338 | */ | ||
1339 | @@ -3029,11 +2836,10 @@ asmlinkage void __sched schedule(void) | ||
1340 | long *switch_count; | ||
1341 | task_t *prev, *next; | ||
1342 | runqueue_t *rq; | ||
1343 | - prio_array_t *array; | ||
1344 | struct list_head *queue; | ||
1345 | unsigned long long now; | ||
1346 | - unsigned long run_time; | ||
1347 | - int cpu, idx, new_prio; | ||
1348 | + unsigned long debit; | ||
1349 | + int cpu, idx; | ||
1350 | |||
1351 | /* | ||
1352 | * Test if we are atomic. Since do_exit() needs to call into | ||
@@ -3066,20 +2872,11 @@ need_resched_nonpreemptible:

 	schedstat_inc(rq, sched_cnt);
 	now = sched_clock();
-	if (likely((long long)(now - prev->timestamp) < NS_MAX_SLEEP_AVG)) {
-		run_time = now - prev->timestamp;
-		if (unlikely((long long)(now - prev->timestamp) < 0))
-			run_time = 0;
-	} else
-		run_time = NS_MAX_SLEEP_AVG;
-
-	/*
-	 * Tasks charged proportionately less run_time at high sleep_avg to
-	 * delay them losing their interactive status
-	 */
-	run_time /= (CURRENT_BONUS(prev) ? : 1);

 	spin_lock_irq(&rq->lock);
+	prev->runtime = ns_diff(now, prev->timestamp);
+	debit = ns_diff(now, rq->timestamp_last_tick) % NSJIFFY;
+	prev->ns_debit += debit;

 	if (unlikely(prev->flags & PF_DEAD))
 		prev->state = EXIT_DEAD;
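The replacement bookkeeping is easy to model. ns_diff() is assumed, per the helpers added earlier in this patch, to be a subtraction clamped at zero (covering the negative-delta case the deleted code tested explicitly), and the modulo by NSJIFFY banks the sub-jiffy remainder the tick accounting has not yet charged. A sketch with an assumed HZ=1000:

typedef unsigned long long u64;

#define NSJIFFY 1000000ULL      /* assumed: one jiffy in ns at HZ=1000 */

static u64 ns_diff(u64 a, u64 b)
{
        return a > b ? a - b : 0;       /* clock skew can make a < b */
}

static void account_deschedule(u64 now, u64 task_timestamp,
                               u64 last_tick, u64 *runtime, u64 *ns_debit)
{
        *runtime = ns_diff(now, task_timestamp);
        *ns_debit += ns_diff(now, last_tick) % NSJIFFY;
}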
@@ -3091,8 +2888,10 @@ need_resched_nonpreemptible:
 				unlikely(signal_pending(prev))))
 			prev->state = TASK_RUNNING;
 		else {
-			if (prev->state == TASK_UNINTERRUPTIBLE)
+			if (prev->state == TASK_UNINTERRUPTIBLE) {
+				prev->flags |= PF_NONSLEEP;
 				rq->nr_uninterruptible++;
+			}
 			deactivate_task(prev, rq);
 		}
 	}
@@ -3102,64 +2901,30 @@ need_resched_nonpreemptible:
 		idle_balance(cpu, rq);
 		if (!rq->nr_running) {
 			next = rq->idle;
-			rq->expired_timestamp = 0;
 			wake_sleeping_dependent(cpu);
 			goto switch_tasks;
 		}
 	}

-	array = rq->active;
-	if (unlikely(!array->nr_active)) {
-		/*
-		 * Switch the active and expired arrays.
-		 */
-		schedstat_inc(rq, sched_switch);
-		rq->active = rq->expired;
-		rq->expired = array;
-		array = rq->active;
-		rq->expired_timestamp = 0;
-		rq->best_expired_prio = MAX_PRIO;
-	}
-
-	idx = sched_find_first_bit(array->bitmap);
-	queue = array->queue + idx;
+	idx = sched_find_first_bit(rq->bitmap);
+	queue = rq->queue + idx;
 	next = list_entry(queue->next, task_t, run_list);

-	if (!rt_task(next) && interactive_sleep(next->sleep_type)) {
-		unsigned long long delta = now - next->timestamp;
-		if (unlikely((long long)(now - next->timestamp) < 0))
-			delta = 0;
-
-		if (next->sleep_type == SLEEP_INTERACTIVE)
-			delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128;
-
-		array = next->array;
-		new_prio = recalc_task_prio(next, next->timestamp + delta);
-
-		if (unlikely(next->prio != new_prio)) {
-			dequeue_task(next, array);
-			next->prio = new_prio;
-			enqueue_task(next, array);
-		}
-	}
-	next->sleep_type = SLEEP_NORMAL;
 	if (dependent_sleeper(cpu, rq, next))
 		next = rq->idle;
+	else {
+		prefetch(next);
+		prefetch_stack(next);
+	}
 switch_tasks:
 	if (next == rq->idle)
 		schedstat_inc(rq, sched_goidle);
-	prefetch(next);
-	prefetch_stack(next);
+	prev->timestamp = now;
 	clear_tsk_need_resched(prev);
 	rcu_qsctr_inc(task_cpu(prev));

 	update_cpu_clock(prev, rq, now);

-	prev->sleep_avg -= run_time;
-	if ((long)prev->sleep_avg <= 0)
-		prev->sleep_avg = 0;
-	prev->timestamp = prev->last_ran = now;
-
 	sched_info_switch(prev, next);
 	if (likely(prev != next)) {
 		next->timestamp = now;
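With the expired array and the sleep-type reweighting gone, picking the next task is a single bitmap search over one list per priority. A self-contained model of that mechanism, using a byte per bit and a linear scan in place of sched_find_first_bit():

#include <stdio.h>
#include <string.h>

#define MAX_PRIO 140

/* byte-per-bit for clarity; the kernel packs this into longs */
static unsigned char bitmap[MAX_PRIO + 1];

static int find_first_set(void) /* stand-in for sched_find_first_bit() */
{
        int i = 0;

        while (!bitmap[i])
                i++;            /* the delimiter bit guarantees termination */
        return i;
}

int main(void)
{
        memset(bitmap, 0, sizeof(bitmap));
        bitmap[MAX_PRIO] = 1;   /* delimiter for bitsearch */
        printf("empty rq   -> %d\n", find_first_set());  /* 140: pick idle */
        bitmap[115] = 1;        /* a task queued at prio 115 */
        printf("one queued -> %d\n", find_first_set());  /* 115 */
        return 0;
}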
@@ -3591,9 +3356,8 @@ EXPORT_SYMBOL(sleep_on_timeout);
 void set_user_nice(task_t *p, long nice)
 {
 	unsigned long flags;
-	prio_array_t *array;
 	runqueue_t *rq;
-	int old_prio, new_prio, delta;
+	int queued, old_prio, new_prio, delta;

 	if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
 		return;
@@ -3612,9 +3376,8 @@ void set_user_nice(task_t *p, long nice)
 		p->static_prio = NICE_TO_PRIO(nice);
 		goto out_unlock;
 	}
-	array = p->array;
-	if (array) {
-		dequeue_task(p, array);
+	if ((queued = task_queued(p))) {
+		dequeue_task(p, rq);
 		dec_raw_weighted_load(rq, p);
 	}

@@ -3624,9 +3387,11 @@ void set_user_nice(task_t *p, long nice)
 	p->static_prio = NICE_TO_PRIO(nice);
 	set_load_weight(p);
 	p->prio += delta;
+	if (p->bonus > bonus(p))
+		p->bonus = bonus(p);

-	if (array) {
-		enqueue_task(p, array);
+	if (queued) {
+		enqueue_task(p, rq);
 		inc_raw_weighted_load(rq, p);
 		/*
 		 * If the task increased its priority or is running and
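The shape of this hunk recurs through the whole patch: the old membership test p->array != NULL becomes task_queued(p), and any priority-affecting update is bracketed by dequeue/enqueue against the runqueue itself. A compilable miniature of the pattern; every type and the bonus() formula are stand-ins, not the patch's definitions:

struct task { int static_prio, prio, bonus, queued; };
struct rq { int nr_running; };

static int task_queued(const struct task *p) { return p->queued; }
static void dequeue_task(struct task *p, struct rq *rq) { p->queued = 0; rq->nr_running--; }
static void enqueue_task(struct task *p, struct rq *rq) { p->queued = 1; rq->nr_running++; }
/* stand-in: some decreasing function of static_prio */
static int bonus(const struct task *p) { return (140 - p->static_prio) / 4; }

static void renice(struct task *p, struct rq *rq, int new_static_prio)
{
        int queued = task_queued(p);

        if (queued)
                dequeue_task(p, rq);
        p->prio += new_static_prio - p->static_prio;
        p->static_prio = new_static_prio;
        if (p->bonus > bonus(p))        /* clamp banked bonus to new limit */
                p->bonus = bonus(p);
        if (queued)
                enqueue_task(p, rq);
}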
@@ -3750,19 +3515,13 @@ static inline task_t *find_process_by_pi
 /* Actually do priority change: must hold rq lock. */
 static void __setscheduler(struct task_struct *p, int policy, int prio)
 {
-	BUG_ON(p->array);
+	BUG_ON(task_queued(p));
 	p->policy = policy;
 	p->rt_priority = prio;
 	if (policy != SCHED_NORMAL && policy != SCHED_BATCH) {
 		p->prio = MAX_RT_PRIO-1 - p->rt_priority;
-	} else {
+	} else
 		p->prio = p->static_prio;
-		/*
-		 * SCHED_BATCH tasks are treated as perpetual CPU hogs:
-		 */
-		if (policy == SCHED_BATCH)
-			p->sleep_avg = 0;
-	}
 	set_load_weight(p);
 }

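The retained RT branch inverts rt_priority onto the global prio scale, where a lower prio value means more important. With MAX_RT_PRIO at its stock value of 100, rt_priority 1 maps to prio 98 and rt_priority 99 to prio 0; a quick arithmetic check:

#include <stdio.h>

#define MAX_RT_PRIO 100 /* stock value in this kernel series */

int main(void)
{
        int rt_priority;

        for (rt_priority = 1; rt_priority <= 99; rt_priority += 98)
                printf("rt_priority %2d -> prio %2d\n",
                       rt_priority, MAX_RT_PRIO - 1 - rt_priority);
        return 0;
}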
@@ -3777,8 +3536,7 @@ int sched_setscheduler(struct task_struc
 		       struct sched_param *param)
 {
 	int retval;
-	int oldprio, oldpolicy = -1;
-	prio_array_t *array;
+	int queued, oldprio, oldpolicy = -1;
 	unsigned long flags;
 	runqueue_t *rq;

@@ -3840,12 +3598,11 @@ recheck:
 		task_rq_unlock(rq, &flags);
 		goto recheck;
 	}
-	array = p->array;
-	if (array)
+	if ((queued = task_queued(p)))
 		deactivate_task(p, rq);
 	oldprio = p->prio;
 	__setscheduler(p, policy, param->sched_priority);
-	if (array) {
+	if (queued) {
 		__activate_task(p, rq);
 		/*
 		 * Reschedule if we are currently running on this runqueue and
@@ -3855,8 +3612,8 @@ recheck:
 		if (task_running(rq, p)) {
 			if (p->prio > oldprio)
 				resched_task(rq->curr);
-		} else if (TASK_PREEMPTS_CURR(p, rq))
-			resched_task(rq->curr);
+		} else
+			preempt(p, rq);
 	}
 	task_rq_unlock(rq, &flags);
 	return 0;
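preempt() itself is introduced earlier in the patch, outside this excerpt; a guess at its minimal shape, folding the old TASK_PREEMPTS_CURR test and resched_task() call into one helper (the real version presumably handles more cases):

struct task { int prio; };
struct rq { struct task *curr; };

static void resched_task(struct task *t)
{
        (void)t;        /* would set TIF_NEED_RESCHED in the kernel */
}

/* sketch only, not the patch's actual definition */
static void preempt(struct task *p, struct rq *rq)
{
        if (p->prio < rq->curr->prio)   /* old TASK_PREEMPTS_CURR(p, rq) */
                resched_task(rq->curr);
}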
@@ -4113,43 +3870,22 @@ asmlinkage long sys_sched_getaffinity(pi

 /**
  * sys_sched_yield - yield the current processor to other threads.
- *
- * this function yields the current CPU by moving the calling thread
- * to the expired array. If there are no other threads running on this
- * CPU then this function will return.
+ * This function yields the current CPU by dropping the priority of current
+ * to the lowest priority.
  */
 asmlinkage long sys_sched_yield(void)
 {
+	int newprio;
 	runqueue_t *rq = this_rq_lock();
-	prio_array_t *array = current->array;
-	prio_array_t *target = rq->expired;

+	newprio = current->prio;
 	schedstat_inc(rq, yld_cnt);
-	/*
-	 * We implement yielding by moving the task into the expired
-	 * queue.
-	 *
-	 * (special rule: RT tasks will just roundrobin in the active
-	 * array.)
-	 */
-	if (rt_task(current))
-		target = rq->active;
+	current->slice = slice(current);
+	current->time_slice = rr_interval(current);
+	if (likely(!rt_task(current)))
+		newprio = MIN_USER_PRIO;

-	if (array->nr_active == 1) {
-		schedstat_inc(rq, yld_act_empty);
-		if (!rq->expired->nr_active)
-			schedstat_inc(rq, yld_both_empty);
-	} else if (!rq->expired->nr_active)
-		schedstat_inc(rq, yld_exp_empty);
-
-	if (array != target) {
-		dequeue_task(current, array);
-		enqueue_task(current, target);
-	} else
-		/*
-		 * requeue_task is cheaper so perform that if possible.
-		 */
-		requeue_task(current, array);
+	requeue_task(current, rq, newprio);

 	/*
 	 * Since we are going to call schedule() anyway, there's
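The new yield is cheap to model: refill both slice counters, then requeue at the bottom of the user-priority range so every other runnable task goes first, while RT tasks keep their priority and merely round-robin. Constants and refill values below are assumptions of this sketch:

#define MAX_PRIO        140
#define MIN_USER_PRIO   (MAX_PRIO - 1)  /* assumed: lowest user prio */

struct task { int prio, time_slice, slice, rt; };

static void yield_model(struct task *p)
{
        p->slice = 100;         /* stand-in for slice(p) */
        p->time_slice = 10;     /* stand-in for rr_interval(p) */
        if (!p->rt)
                p->prio = MIN_USER_PRIO;
        /* requeue_task(p, rq, p->prio) then schedule() follow */
}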
@@ -4358,7 +4094,7 @@ long sys_sched_rr_get_interval(pid_t pid
 		goto out_unlock;

 	jiffies_to_timespec(p->policy & SCHED_FIFO ?
-				0 : task_timeslice(p), &t);
+				0 : slice(p), &t);
 	read_unlock(&tasklist_lock);
 	retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
 out_nounlock:
@@ -4481,8 +4217,6 @@ void __devinit init_idle(task_t *idle, i
 	unsigned long flags;

 	idle->timestamp = sched_clock();
-	idle->sleep_avg = 0;
-	idle->array = NULL;
 	idle->prio = MAX_PRIO;
 	idle->state = TASK_RUNNING;
 	idle->cpus_allowed = cpumask_of_cpu(cpu);
@@ -4599,7 +4333,7 @@ static void __migrate_task(struct task_s
 		goto out;

 	set_task_cpu(p, dest_cpu);
-	if (p->array) {
+	if (task_queued(p)) {
 		/*
 		 * Sync timestamp with rq_dest's before activating.
 		 * The same thing could be achieved by doing this step
@@ -4610,8 +4344,7 @@ static void __migrate_task(struct task_s
 				+ rq_dest->timestamp_last_tick;
 		deactivate_task(p, rq_src);
 		activate_task(p, rq_dest, 0);
-		if (TASK_PREEMPTS_CURR(p, rq_dest))
-			resched_task(rq_dest->curr);
+		preempt(p, rq_dest);
 	}

 out:
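The timestamp sync the surrounding comment describes (the "+ rq_dest->timestamp_last_tick" continuation above) re-expresses the task's timestamp against the destination runqueue's clock, since per-CPU clocks drift apart and sched_clock() values are only comparable on one CPU. In isolation:

typedef unsigned long long u64;

/* keep the task the same "age" on the destination CPU's clock */
static u64 rebase_timestamp(u64 ts, u64 src_last_tick, u64 dest_last_tick)
{
        return ts - src_last_tick + dest_last_tick;
}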
@@ -4825,7 +4558,7 @@ static void migrate_dead_tasks(unsigned

 	for (arr = 0; arr < 2; arr++) {
 		for (i = 0; i < MAX_PRIO; i++) {
-			struct list_head *list = &rq->arrays[arr].queue[i];
+			struct list_head *list = &rq->queue[i];
 			while (!list_empty(list))
 				migrate_dead(dead_cpu,
 					     list_entry(list->next, task_t,
@@ -6226,17 +5959,13 @@ int in_sched_functions(unsigned long add
 void __init sched_init(void)
 {
 	runqueue_t *rq;
-	int i, j, k;
+	int i, j;

 	for_each_possible_cpu(i) {
-		prio_array_t *array;

 		rq = cpu_rq(i);
 		spin_lock_init(&rq->lock);
 		rq->nr_running = 0;
-		rq->active = rq->arrays;
-		rq->expired = rq->arrays + 1;
-		rq->best_expired_prio = MAX_PRIO;

 #ifdef CONFIG_SMP
 		rq->sd = NULL;
@@ -6248,16 +5977,11 @@ void __init sched_init(void)
 		INIT_LIST_HEAD(&rq->migration_queue);
 #endif
 		atomic_set(&rq->nr_iowait, 0);
-
-		for (j = 0; j < 2; j++) {
-			array = rq->arrays + j;
-			for (k = 0; k < MAX_PRIO; k++) {
-				INIT_LIST_HEAD(array->queue + k);
-				__clear_bit(k, array->bitmap);
-			}
-			// delimiter for bitsearch
-			__set_bit(MAX_PRIO, array->bitmap);
-		}
+		for (j = 0; j < MAX_PRIO; j++)
+			INIT_LIST_HEAD(&rq->queue[j]);
+		memset(rq->bitmap, 0, BITS_TO_LONGS(MAX_PRIO)*sizeof(long));
+		/* delimiter for bitsearch */
+		__set_bit(MAX_PRIO, rq->bitmap);
 	}

 	set_load_weight(&init_task);
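Runqueue init is now one list head per priority plus a single bitmap, with a permanently-set delimiter bit at MAX_PRIO so searches over an empty queue terminate at the idle priority. A freestanding rendering of just the sizing and delimiter logic, with the macros spelled out:

#include <string.h>

#define MAX_PRIO 140
#define BITS_PER_LONG (8 * sizeof(long))
#define BITS_TO_LONGS(n) (((n) + BITS_PER_LONG - 1) / BITS_PER_LONG)

/* sized for MAX_PRIO + 1 so the delimiter bit itself fits */
static unsigned long bitmap[BITS_TO_LONGS(MAX_PRIO + 1)];

static void init_bitmap(void)
{
        memset(bitmap, 0, BITS_TO_LONGS(MAX_PRIO) * sizeof(long));
        /* delimiter for bitsearch, as in the hunk above */
        bitmap[MAX_PRIO / BITS_PER_LONG] |= 1UL << (MAX_PRIO % BITS_PER_LONG);
}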
@@ -6302,9 +6026,9 @@ EXPORT_SYMBOL(__might_sleep);
 void normalize_rt_tasks(void)
 {
 	struct task_struct *p;
-	prio_array_t *array;
 	unsigned long flags;
 	runqueue_t *rq;
+	int queued;

 	read_lock_irq(&tasklist_lock);
 	for_each_process(p) {
@@ -6313,11 +6037,10 @@ void normalize_rt_tasks(void)

 		rq = task_rq_lock(p, &flags);

-		array = p->array;
-		if (array)
+		if ((queued = task_queued(p)))
 			deactivate_task(p, task_rq(p));
 		__setscheduler(p, SCHED_NORMAL, 0);
-		if (array) {
+		if (queued) {
 			__activate_task(p, task_rq(p));
 			resched_task(rq->curr);
 		}