Contents of /trunk/kernel26-alx/patches-2.6.17-r5/0003-2.6.17-smpnice-staircase-16.patch
Revision 199
Fri May 18 11:04:36 2007 UTC by niro
File size: 52028 byte(s)
-import
1 | Implement the "staircase" hybrid foreground-background single priority |
2 | array cpu scheduler policy. |
3 | |
4 | Signed-off-by: Con Kolivas <kernel@kolivas.org> |
5 | |
6 | fs/proc/array.c | 4 |
7 | include/linux/sched.h | 21 - |
8 | kernel/exit.c | 1 |
9 | kernel/sched.c | 1015 ++++++++++++++++++-------------------------------- |
10 | 4 files changed, 378 insertions(+), 663 deletions(-) |
11 | |
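Before the per-file changes, a compact model of the arithmetic this patch introduces in kernel/sched.c (rr_interval(), slice() and effective_prio()). This is a user-space sketch for illustration only, not code from the patch; it assumes HZ=1000 and a non-RT, non-batch task, with the constants copied from the hunks that follow, and it takes the consumed portion of the slice directly as 'used' rather than deriving it from full_slice - p->slice as the kernel code does:

#include <stdio.h>

#define HZ            1000
#define RR_INTERVAL   ((6 * HZ / 1001) + 1)	/* 6 jiffies at HZ=1000 */
#define MAX_RT_PRIO   100
#define MIN_USER_PRIO (MAX_RT_PRIO + 40 - 1)	/* MAX_PRIO - 1 = 139 */

static int user_prio(int nice)			/* TASK_USER_PRIO: 0..39 */
{
	return nice + 20;
}

static unsigned int rr_interval(int nice)	/* mirrors rr_interval(p) */
{
	if (nice < 0)
		return RR_INTERVAL * (20 - nice) / 20;
	return RR_INTERVAL;
}

static unsigned int slice(int nice)		/* mirrors slice(p) */
{
	unsigned int rr = rr_interval(nice);

	return rr + (39 - user_prio(nice)) * rr;
}

/* Dynamic priority once 'used' jiffies of the slice are consumed. */
static int effective_prio(int nice, unsigned int bonus, unsigned int used)
{
	int prio = MAX_RT_PRIO + user_prio(nice) - bonus;

	prio += used / rr_interval(nice);
	return prio > MIN_USER_PRIO ? MIN_USER_PRIO : prio;
}

int main(void)
{
	int nice = 0;
	unsigned int used, full = slice(nice);

	for (used = 0; used <= full; used += rr_interval(nice))
		printf("used %3u/%u jiffies -> prio %d\n",
		       used, full, effective_prio(nice, 0, used));
	return 0;
}

Compiled and run, this prints the descent from priority 120 down to the MIN_USER_PRIO floor of 139 for a nice-0 task with no bonus, one step per 6-jiffy RR_INTERVAL across its 120-jiffy slice - the "staircase". A nonzero bonus starts the task on a higher stair; negative nice both widens each step (longer rr_interval) and lengthens the whole slice.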
12 | Index: linux-ck-dev/fs/proc/array.c |
13 | =================================================================== |
14 | --- linux-ck-dev.orig/fs/proc/array.c 2006-06-18 15:20:15.000000000 +1000 |
15 | +++ linux-ck-dev/fs/proc/array.c 2006-06-18 15:21:50.000000000 +1000 |
16 | @@ -165,7 +165,7 @@ static inline char * task_state(struct t |
17 | read_lock(&tasklist_lock); |
18 | buffer += sprintf(buffer, |
19 | "State:\t%s\n" |
20 | - "SleepAVG:\t%lu%%\n" |
21 | + "Bonus:\t%d\n" |
22 | "Tgid:\t%d\n" |
23 | "Pid:\t%d\n" |
24 | "PPid:\t%d\n" |
25 | @@ -173,7 +173,7 @@ static inline char * task_state(struct t |
26 | "Uid:\t%d\t%d\t%d\t%d\n" |
27 | "Gid:\t%d\t%d\t%d\t%d\n", |
28 | get_task_state(p), |
29 | - (p->sleep_avg/1024)*100/(1020000000/1024), |
30 | + p->bonus, |
31 | p->tgid, |
32 | p->pid, pid_alive(p) ? p->group_leader->real_parent->tgid : 0, |
33 | pid_alive(p) && p->ptrace ? p->parent->pid : 0, |
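The hunk above swaps the SleepAVG percentage in /proc/<pid>/status (the old expression scaled the nanosecond sleep_avg against a roughly 1.02 s ceiling to 0-100%) for the raw bonus counter. A hypothetical user-space check, assuming a kernel with this patch applied - on a stock 2.6.17 the loop finds "SleepAVG:" instead and prints nothing:

#include <stdio.h>
#include <string.h>

int main(void)
{
	char line[256];
	FILE *f = fopen("/proc/self/status", "r");

	if (!f)
		return 1;
	while (fgets(line, sizeof(line), f)) {
		if (!strncmp(line, "Bonus:", 6)) {	/* patched kernels only */
			fputs(line, stdout);		/* e.g. "Bonus:  7" */
			break;
		}
	}
	fclose(f);
	return 0;
}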
34 | Index: linux-ck-dev/include/linux/sched.h |
35 | =================================================================== |
36 | --- linux-ck-dev.orig/include/linux/sched.h 2006-06-18 15:21:31.000000000 +1000 |
37 | +++ linux-ck-dev/include/linux/sched.h 2006-06-18 15:21:50.000000000 +1000 |
38 | @@ -483,6 +483,7 @@ struct signal_struct { |
39 | #define MAX_RT_PRIO MAX_USER_RT_PRIO |
40 | |
41 | #define MAX_PRIO (MAX_RT_PRIO + 40) |
42 | +#define MIN_USER_PRIO (MAX_PRIO - 1) |
43 | |
44 | #define rt_task(p) (unlikely((p)->prio < MAX_RT_PRIO)) |
45 | #define batch_task(p) (unlikely((p)->policy == SCHED_BATCH)) |
46 | @@ -518,7 +519,6 @@ extern struct user_struct *find_user(uid |
47 | extern struct user_struct root_user; |
48 | #define INIT_USER (&root_user) |
49 | |
50 | -typedef struct prio_array prio_array_t; |
51 | struct backing_dev_info; |
52 | struct reclaim_state; |
53 | |
54 | @@ -687,13 +687,6 @@ struct audit_context; /* See audit.c */ |
55 | struct mempolicy; |
56 | struct pipe_inode_info; |
57 | |
58 | -enum sleep_type { |
59 | - SLEEP_NORMAL, |
60 | - SLEEP_NONINTERACTIVE, |
61 | - SLEEP_INTERACTIVE, |
62 | - SLEEP_INTERRUPTED, |
63 | -}; |
64 | - |
65 | struct task_struct { |
66 | volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */ |
67 | struct thread_info *thread_info; |
68 | @@ -711,19 +704,18 @@ struct task_struct { |
69 | int load_weight; /* for niceness load balancing purposes */ |
70 | int prio, static_prio; |
71 | struct list_head run_list; |
72 | - prio_array_t *array; |
73 | |
74 | unsigned short ioprio; |
75 | unsigned int btrace_seq; |
76 | |
77 | - unsigned long sleep_avg; |
78 | - unsigned long long timestamp, last_ran; |
79 | + unsigned long long timestamp; |
80 | + unsigned long runtime, totalrun, ns_debit, systime; |
81 | + unsigned int bonus; |
82 | + unsigned int slice, time_slice; |
83 | unsigned long long sched_time; /* sched_clock time spent running */ |
84 | - enum sleep_type sleep_type; |
85 | |
86 | unsigned long policy; |
87 | cpumask_t cpus_allowed; |
88 | - unsigned int time_slice, first_time_slice; |
89 | |
90 | #ifdef CONFIG_SCHEDSTATS |
91 | struct sched_info sched_info; |
92 | @@ -952,6 +944,8 @@ static inline void put_task_struct(struc |
93 | #define PF_SPREAD_PAGE 0x04000000 /* Spread page cache over cpuset */ |
94 | #define PF_SPREAD_SLAB 0x08000000 /* Spread some slab caches over cpuset */ |
95 | #define PF_MEMPOLICY 0x10000000 /* Non-default NUMA mempolicy */ |
96 | +#define PF_NONSLEEP 0x20000000 /* Waiting on in kernel activity */ |
97 | +#define PF_FORKED 0x40000000 /* Task just forked another process */ |
98 | |
99 | /* |
100 | * Only the _current_ task can read/write to tsk->flags, but other |
101 | @@ -1073,7 +1067,6 @@ extern void FASTCALL(wake_up_new_task(st |
102 | static inline void kick_process(struct task_struct *tsk) { } |
103 | #endif |
104 | extern void FASTCALL(sched_fork(task_t * p, int clone_flags)); |
105 | -extern void FASTCALL(sched_exit(task_t * p)); |
106 | |
107 | extern int in_group_p(gid_t); |
108 | extern int in_egroup_p(gid_t); |
109 | Index: linux-ck-dev/kernel/exit.c |
110 | =================================================================== |
111 | --- linux-ck-dev.orig/kernel/exit.c 2006-06-18 15:21:00.000000000 +1000 |
112 | +++ linux-ck-dev/kernel/exit.c 2006-06-18 15:21:50.000000000 +1000 |
113 | @@ -170,7 +170,6 @@ repeat: |
114 | zap_leader = (leader->exit_signal == -1); |
115 | } |
116 | |
117 | - sched_exit(p); |
118 | write_unlock_irq(&tasklist_lock); |
119 | spin_unlock(&p->proc_lock); |
120 | proc_pid_flush(proc_dentry); |
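From here the bulk of the patch rewrites kernel/sched.c: the active/expired prio_array pair is deleted and the runqueue keeps a single bitmap plus one list head per priority, so picking the next task is one find-first-set-bit. A minimal user-space sketch of that lookup (not the kernel code; the GCC builtin __builtin_ctzl stands in for sched_find_first_bit, and the constants mirror MAX_PRIO in the hunks below):

#include <stdio.h>

#define MAX_PRIO      140			/* MAX_RT_PRIO + 40 */
#define BITS_PER_LONG (8 * (int)sizeof(unsigned long))
#define BITMAP_LONGS  ((MAX_PRIO + 1 + BITS_PER_LONG - 1) / BITS_PER_LONG)

static unsigned long bitmap[BITMAP_LONGS];	/* rq->bitmap analogue */

static void set_queued(int prio)
{
	bitmap[prio / BITS_PER_LONG] |= 1UL << (prio % BITS_PER_LONG);
}

/* Stand-in for sched_find_first_bit(): lowest set bit = best priority. */
static int find_first_prio(void)
{
	for (int i = 0; i < BITMAP_LONGS; i++)
		if (bitmap[i])
			return i * BITS_PER_LONG + __builtin_ctzl(bitmap[i]);
	return MAX_PRIO;			/* empty: idle */
}

int main(void)
{
	set_queued(124);
	set_queued(110);
	printf("next prio: %d\n", find_first_prio());	/* prints 110 */
	return 0;
}

With tasks queued at priorities 124 and 110 the sketch reports 110; an empty bitmap falls through to MAX_PRIO, which is how the idle task wins by default.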
121 | Index: linux-ck-dev/kernel/sched.c |
122 | =================================================================== |
123 | --- linux-ck-dev.orig/kernel/sched.c 2006-06-18 15:21:45.000000000 +1000 |
124 | +++ linux-ck-dev/kernel/sched.c 2006-06-18 15:22:27.000000000 +1000 |
125 | @@ -16,6 +16,9 @@ |
126 | * by Davide Libenzi, preemptible kernel bits by Robert Love. |
127 | * 2003-09-03 Interactivity tuning by Con Kolivas. |
128 | * 2004-04-02 Scheduler domains code by Nick Piggin |
129 | + * 2006-06-18 Staircase scheduling policy by Con Kolivas with help |
130 | + * from William Lee Irwin III, Zwane Mwaikambo & Peter Williams. |
131 | + * Staircase v16 |
132 | */ |
133 | |
134 | #include <linux/mm.h> |
135 | @@ -75,131 +78,27 @@ |
136 | /* |
137 | * Some helpers for converting nanosecond timing to jiffy resolution |
138 | */ |
139 | -#define NS_TO_JIFFIES(TIME) ((TIME) / (1000000000 / HZ)) |
140 | -#define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ)) |
141 | - |
142 | -/* |
143 | - * These are the 'tuning knobs' of the scheduler: |
144 | - * |
145 | - * Minimum timeslice is 5 msecs (or 1 jiffy, whichever is larger), |
146 | - * default timeslice is 100 msecs, maximum timeslice is 800 msecs. |
147 | - * Timeslices get refilled after they expire. |
148 | - */ |
149 | -#define MIN_TIMESLICE max(5 * HZ / 1000, 1) |
150 | -#define DEF_TIMESLICE (100 * HZ / 1000) |
151 | -#define ON_RUNQUEUE_WEIGHT 30 |
152 | -#define CHILD_PENALTY 95 |
153 | -#define PARENT_PENALTY 100 |
154 | -#define EXIT_WEIGHT 3 |
155 | -#define PRIO_BONUS_RATIO 25 |
156 | -#define MAX_BONUS (MAX_USER_PRIO * PRIO_BONUS_RATIO / 100) |
157 | -#define INTERACTIVE_DELTA 2 |
158 | -#define MAX_SLEEP_AVG (DEF_TIMESLICE * MAX_BONUS) |
159 | -#define STARVATION_LIMIT (MAX_SLEEP_AVG) |
160 | -#define NS_MAX_SLEEP_AVG (JIFFIES_TO_NS(MAX_SLEEP_AVG)) |
161 | - |
162 | -/* |
163 | - * If a task is 'interactive' then we reinsert it in the active |
164 | - * array after it has expired its current timeslice. (it will not |
165 | - * continue to run immediately, it will still roundrobin with |
166 | - * other interactive tasks.) |
167 | - * |
168 | - * This part scales the interactivity limit depending on niceness. |
169 | - * |
170 | - * We scale it linearly, offset by the INTERACTIVE_DELTA delta. |
171 | - * Here are a few examples of different nice levels: |
172 | - * |
173 | - * TASK_INTERACTIVE(-20): [1,1,1,1,1,1,1,1,1,0,0] |
174 | - * TASK_INTERACTIVE(-10): [1,1,1,1,1,1,1,0,0,0,0] |
175 | - * TASK_INTERACTIVE( 0): [1,1,1,1,0,0,0,0,0,0,0] |
176 | - * TASK_INTERACTIVE( 10): [1,1,0,0,0,0,0,0,0,0,0] |
177 | - * TASK_INTERACTIVE( 19): [0,0,0,0,0,0,0,0,0,0,0] |
178 | - * |
179 | - * (the X axis represents the possible -5 ... 0 ... +5 dynamic |
180 | - * priority range a task can explore, a value of '1' means the |
181 | - * task is rated interactive.) |
182 | - * |
183 | - * Ie. nice +19 tasks can never get 'interactive' enough to be |
184 | - * reinserted into the active array. And only heavily CPU-hog nice -20 |
185 | - * tasks will be expired. Default nice 0 tasks are somewhere between, |
186 | - * it takes some effort for them to get interactive, but it's not |
187 | - * too hard. |
188 | - */ |
189 | - |
190 | -#define CURRENT_BONUS(p) \ |
191 | - (NS_TO_JIFFIES((p)->sleep_avg) * MAX_BONUS / \ |
192 | - MAX_SLEEP_AVG) |
193 | - |
194 | -#define GRANULARITY (10 * HZ / 1000 ? : 1) |
195 | - |
196 | -#ifdef CONFIG_SMP |
197 | -#define TIMESLICE_GRANULARITY(p) (GRANULARITY * \ |
198 | - (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)) * \ |
199 | - num_online_cpus()) |
200 | -#else |
201 | -#define TIMESLICE_GRANULARITY(p) (GRANULARITY * \ |
202 | - (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1))) |
203 | -#endif |
204 | - |
205 | -#define SCALE(v1,v1_max,v2_max) \ |
206 | - (v1) * (v2_max) / (v1_max) |
207 | - |
208 | -#define DELTA(p) \ |
209 | - (SCALE(TASK_NICE(p) + 20, 40, MAX_BONUS) - 20 * MAX_BONUS / 40 + \ |
210 | - INTERACTIVE_DELTA) |
211 | - |
212 | -#define TASK_INTERACTIVE(p) \ |
213 | - ((p)->prio <= (p)->static_prio - DELTA(p)) |
214 | - |
215 | -#define INTERACTIVE_SLEEP(p) \ |
216 | - (JIFFIES_TO_NS(MAX_SLEEP_AVG * \ |
217 | - (MAX_BONUS / 2 + DELTA((p)) + 1) / MAX_BONUS - 1)) |
218 | - |
219 | +#define NSJIFFY (1000000000 / HZ) /* One jiffy in ns */ |
220 | +#define NS_TO_JIFFIES(TIME) ((TIME) / NSJIFFY) |
221 | +#define JIFFIES_TO_NS(TIME) ((TIME) * NSJIFFY) |
222 | #define TASK_PREEMPTS_CURR(p, rq) \ |
223 | ((p)->prio < (rq)->curr->prio) |
224 | |
225 | /* |
226 | - * task_timeslice() scales user-nice values [ -20 ... 0 ... 19 ] |
227 | - * to time slice values: [800ms ... 100ms ... 5ms] |
228 | - * |
229 | - * The higher a thread's priority, the bigger timeslices |
230 | - * it gets during one round of execution. But even the lowest |
231 | - * priority thread gets MIN_TIMESLICE worth of execution time. |
232 | + * This is the time all tasks within the same priority round robin. |
233 | + * Set to a minimum of 6ms. |
234 | */ |
235 | +#define RR_INTERVAL ((6 * HZ / 1001) + 1) |
236 | +#define DEF_TIMESLICE (RR_INTERVAL * 19) |
237 | |
238 | -#define SCALE_PRIO(x, prio) \ |
239 | - max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_TIMESLICE) |
240 | - |
241 | -static unsigned int static_prio_timeslice(int static_prio) |
242 | -{ |
243 | - if (static_prio < NICE_TO_PRIO(0)) |
244 | - return SCALE_PRIO(DEF_TIMESLICE * 4, static_prio); |
245 | - else |
246 | - return SCALE_PRIO(DEF_TIMESLICE, static_prio); |
247 | -} |
248 | - |
249 | -static inline unsigned int task_timeslice(task_t *p) |
250 | -{ |
251 | - return static_prio_timeslice(p->static_prio); |
252 | -} |
253 | - |
254 | -#define task_hot(p, now, sd) ((long long) ((now) - (p)->last_ran) \ |
255 | +#define task_hot(p, now, sd) ((long long) ((now) - (p)->timestamp) \ |
256 | < (long long) (sd)->cache_hot_time) |
257 | |
258 | /* |
259 | * These are the runqueue data structures: |
260 | */ |
261 | - |
262 | -#define BITMAP_SIZE ((((MAX_PRIO+1+7)/8)+sizeof(long)-1)/sizeof(long)) |
263 | - |
264 | typedef struct runqueue runqueue_t; |
265 | |
266 | -struct prio_array { |
267 | - unsigned int nr_active; |
268 | - unsigned long bitmap[BITMAP_SIZE]; |
269 | - struct list_head queue[MAX_PRIO]; |
270 | -}; |
271 | - |
272 | /* |
273 | * This is the main, per-CPU runqueue data structure. |
274 | * |
275 | @@ -229,12 +128,11 @@ struct runqueue { |
276 | */ |
277 | unsigned long nr_uninterruptible; |
278 | |
279 | - unsigned long expired_timestamp; |
280 | unsigned long long timestamp_last_tick; |
281 | task_t *curr, *idle; |
282 | struct mm_struct *prev_mm; |
283 | - prio_array_t *active, *expired, arrays[2]; |
284 | - int best_expired_prio; |
285 | + unsigned long bitmap[BITS_TO_LONGS(MAX_PRIO + 1)]; |
286 | + struct list_head queue[MAX_PRIO]; |
287 | atomic_t nr_iowait; |
288 | |
289 | #ifdef CONFIG_SMP |
290 | @@ -499,13 +397,7 @@ static inline runqueue_t *this_rq_lock(v |
291 | |
292 | #ifdef CONFIG_SCHEDSTATS |
293 | /* |
294 | - * Called when a process is dequeued from the active array and given |
295 | - * the cpu. We should note that with the exception of interactive |
296 | - * tasks, the expired queue will become the active queue after the active |
297 | - * queue is empty, without explicitly dequeuing and requeuing tasks in the |
298 | - * expired queue. (Interactive tasks may be requeued directly to the |
299 | - * active queue, thus delaying tasks in the expired queue from running; |
300 | - * see scheduler_tick()). |
301 | + * Called when a process is dequeued and given the cpu. |
302 | * |
303 | * This function is only called from sched_info_arrive(), rather than |
304 | * dequeue_task(). Even though a task may be queued and dequeued multiple |
305 | @@ -543,13 +435,11 @@ static void sched_info_arrive(task_t *t) |
306 | } |
307 | |
308 | /* |
309 | - * Called when a process is queued into either the active or expired |
310 | - * array. The time is noted and later used to determine how long we |
311 | - * had to wait for us to reach the cpu. Since the expired queue will |
312 | - * become the active queue after active queue is empty, without dequeuing |
313 | - * and requeuing any tasks, we are interested in queuing to either. It |
314 | - * is unusual but not impossible for tasks to be dequeued and immediately |
315 | - * requeued in the same or another array: this can happen in sched_yield(), |
316 | + * Called when a process is queued. |
317 | + * The time is noted and later used to determine how long we had to wait for |
318 | + * us to reach the cpu. |
319 | + * It is unusual but not impossible for tasks to be dequeued and immediately |
320 | + * requeued: this can happen in sched_yield(), |
321 | * set_user_nice(), and even load_balance() as it moves tasks from runqueue |
322 | * to runqueue. |
323 | * |
324 | @@ -603,74 +493,81 @@ static inline void sched_info_switch(tas |
325 | #define sched_info_switch(t, next) do { } while (0) |
326 | #endif /* CONFIG_SCHEDSTATS */ |
327 | |
328 | -/* |
329 | - * Adding/removing a task to/from a priority array: |
330 | - */ |
331 | -static void dequeue_task(struct task_struct *p, prio_array_t *array) |
332 | +#if BITS_PER_LONG < 64 |
333 | +static inline void longlimit(unsigned long long *longlong) |
334 | +{ |
335 | + if (*longlong > (1 << 31)) |
336 | + *longlong = 1 << 31; |
337 | +} |
338 | +#else |
339 | +static inline void longlimit(unsigned long long *__unused) |
340 | { |
341 | - array->nr_active--; |
342 | - list_del(&p->run_list); |
343 | - if (list_empty(array->queue + p->prio)) |
344 | - __clear_bit(p->prio, array->bitmap); |
345 | +} |
346 | +#endif |
347 | + |
348 | +/* Get nanosecond clock difference without overflowing unsigned long. */ |
349 | +static unsigned long ns_diff(unsigned long long v1, unsigned long long v2) |
350 | +{ |
351 | + unsigned long long vdiff; |
352 | + if (likely(v1 >= v2)) { |
353 | + vdiff = v1 - v2; |
354 | + longlimit(&vdiff); |
355 | + } else { |
356 | + /* |
357 | + * Rarely the clock appears to go backwards. There should |
358 | + * always be a positive difference so return 1. |
359 | + */ |
360 | + vdiff = 1; |
361 | + } |
362 | + return (unsigned long)vdiff; |
363 | } |
364 | |
365 | -static void enqueue_task(struct task_struct *p, prio_array_t *array) |
366 | +static inline int task_queued(const task_t *task) |
367 | { |
368 | - sched_info_queued(p); |
369 | - list_add_tail(&p->run_list, array->queue + p->prio); |
370 | - __set_bit(p->prio, array->bitmap); |
371 | - array->nr_active++; |
372 | - p->array = array; |
373 | + return !list_empty(&task->run_list); |
374 | } |
375 | |
376 | /* |
377 | - * Put task to the end of the run list without the overhead of dequeue |
378 | - * followed by enqueue. |
379 | + * Adding/removing a task to/from a runqueue: |
380 | */ |
381 | -static void requeue_task(struct task_struct *p, prio_array_t *array) |
382 | +static void dequeue_task(task_t *p, runqueue_t *rq) |
383 | { |
384 | - list_move_tail(&p->run_list, array->queue + p->prio); |
385 | + list_del_init(&p->run_list); |
386 | + if (list_empty(rq->queue + p->prio)) |
387 | + __clear_bit(p->prio, rq->bitmap); |
388 | + p->ns_debit = 0; |
389 | } |
390 | |
391 | -static inline void enqueue_task_head(struct task_struct *p, prio_array_t *array) |
392 | +static void enqueue_task(task_t *p, runqueue_t *rq) |
393 | { |
394 | - list_add(&p->run_list, array->queue + p->prio); |
395 | - __set_bit(p->prio, array->bitmap); |
396 | - array->nr_active++; |
397 | - p->array = array; |
398 | + list_add_tail(&p->run_list, rq->queue + p->prio); |
399 | + __set_bit(p->prio, rq->bitmap); |
400 | } |
401 | |
402 | /* |
403 | - * effective_prio - return the priority that is based on the static |
404 | - * priority but is modified by bonuses/penalties. |
405 | - * |
406 | - * We scale the actual sleep average [0 .... MAX_SLEEP_AVG] |
407 | - * into the -5 ... 0 ... +5 bonus/penalty range. |
408 | - * |
409 | - * We use 25% of the full 0...39 priority range so that: |
410 | - * |
411 | - * 1) nice +19 interactive tasks do not preempt nice 0 CPU hogs. |
412 | - * 2) nice -20 CPU hogs do not get preempted by nice 0 tasks. |
413 | - * |
414 | - * Both properties are important to certain workloads. |
415 | + * Put task to the end of the run list without the overhead of dequeue |
416 | + * followed by enqueue. |
417 | */ |
418 | -static int effective_prio(task_t *p) |
419 | +static void requeue_task(task_t *p, runqueue_t *rq, const int prio) |
420 | { |
421 | - int bonus, prio; |
422 | - |
423 | - if (rt_task(p)) |
424 | - return p->prio; |
425 | - |
426 | - bonus = CURRENT_BONUS(p) - MAX_BONUS / 2; |
427 | + list_move_tail(&p->run_list, rq->queue + prio); |
428 | + if (p->prio != prio) { |
429 | + if (list_empty(rq->queue + p->prio)) |
430 | + __clear_bit(p->prio, rq->bitmap); |
431 | + p->prio = prio; |
432 | + __set_bit(prio, rq->bitmap); |
433 | + } |
434 | + p->ns_debit = 0; |
435 | +} |
436 | |
437 | - prio = p->static_prio - bonus; |
438 | - if (prio < MAX_RT_PRIO) |
439 | - prio = MAX_RT_PRIO; |
440 | - if (prio > MAX_PRIO-1) |
441 | - prio = MAX_PRIO-1; |
442 | - return prio; |
443 | +static inline void enqueue_task_head(task_t *p, runqueue_t *rq) |
444 | +{ |
445 | + list_add(&p->run_list, rq->queue + p->prio); |
446 | + __set_bit(p->prio, rq->bitmap); |
447 | } |
448 | |
449 | +static unsigned int slice(const task_t *p); |
450 | + |
451 | /* |
452 | * To aid in avoiding the subversion of "niceness" due to uneven distribution |
453 | * of tasks with abnormal "nice" values across CPUs the contribution that |
454 | @@ -688,10 +585,9 @@ static int effective_prio(task_t *p) |
455 | #define TIME_SLICE_NICE_ZERO DEF_TIMESLICE |
456 | #define LOAD_WEIGHT(lp) \ |
457 | (((lp) * SCHED_LOAD_SCALE) / TIME_SLICE_NICE_ZERO) |
458 | -#define PRIO_TO_LOAD_WEIGHT(prio) \ |
459 | - LOAD_WEIGHT(static_prio_timeslice(prio)) |
460 | -#define RTPRIO_TO_LOAD_WEIGHT(rp) \ |
461 | - (PRIO_TO_LOAD_WEIGHT(MAX_RT_PRIO) + LOAD_WEIGHT(rp)) |
462 | +#define TASK_LOAD_WEIGHT(p) LOAD_WEIGHT(slice(p)) |
463 | +#define RTPRIO_TO_LOAD_WEIGHT(rp) \ |
464 | + (LOAD_WEIGHT((RR_INTERVAL + 20 + (rp)))) |
465 | |
466 | static void set_load_weight(task_t *p) |
467 | { |
468 | @@ -708,7 +604,7 @@ static void set_load_weight(task_t *p) |
469 | #endif |
470 | p->load_weight = RTPRIO_TO_LOAD_WEIGHT(p->rt_priority); |
471 | } else |
472 | - p->load_weight = PRIO_TO_LOAD_WEIGHT(p->static_prio); |
473 | + p->load_weight = TASK_LOAD_WEIGHT(p); |
474 | } |
475 | |
476 | static inline void inc_raw_weighted_load(runqueue_t *rq, const task_t *p) |
477 | @@ -736,13 +632,9 @@ static inline void dec_nr_running(task_t |
478 | /* |
479 | * __activate_task - move a task to the runqueue. |
480 | */ |
481 | -static void __activate_task(task_t *p, runqueue_t *rq) |
482 | +static inline void __activate_task(task_t *p, runqueue_t *rq) |
483 | { |
484 | - prio_array_t *target = rq->active; |
485 | - |
486 | - if (batch_task(p)) |
487 | - target = rq->expired; |
488 | - enqueue_task(p, target); |
489 | + enqueue_task(p, rq); |
490 | inc_nr_running(p, rq); |
491 | } |
492 | |
493 | @@ -751,85 +643,181 @@ static void __activate_task(task_t *p, r |
494 | */ |
495 | static inline void __activate_idle_task(task_t *p, runqueue_t *rq) |
496 | { |
497 | - enqueue_task_head(p, rq->active); |
498 | + enqueue_task_head(p, rq); |
499 | inc_nr_running(p, rq); |
500 | } |
501 | |
502 | -static int recalc_task_prio(task_t *p, unsigned long long now) |
503 | +/* |
504 | + * Bonus - How much higher than its base priority an interactive task can run. |
505 | + */ |
506 | +static inline unsigned int bonus(const task_t *p) |
507 | { |
508 | - /* Caller must always ensure 'now >= p->timestamp' */ |
509 | - unsigned long long __sleep_time = now - p->timestamp; |
510 | - unsigned long sleep_time; |
511 | + return TASK_USER_PRIO(p); |
512 | +} |
513 | |
514 | - if (batch_task(p)) |
515 | - sleep_time = 0; |
516 | +static unsigned int rr_interval(const task_t *p) |
517 | +{ |
518 | + int nice = TASK_NICE(p); |
519 | + |
520 | + if (nice < 0 && !rt_task(p)) |
521 | + return RR_INTERVAL * (20 - nice) / 20; |
522 | + return RR_INTERVAL; |
523 | +} |
524 | + |
525 | +/* |
526 | + * slice - the duration a task runs before getting requeued at its best |
527 | + * priority and has its bonus decremented. |
528 | + */ |
529 | +static unsigned int slice(const task_t *p) |
530 | +{ |
531 | + unsigned int slice, rr; |
532 | + |
533 | + slice = rr = rr_interval(p); |
534 | + if (likely(!rt_task(p))) |
535 | + slice += (39 - TASK_USER_PRIO(p)) * rr; |
536 | + return slice; |
537 | +} |
538 | + |
539 | +/* |
540 | + * We increase our bonus by sleeping more than the time we ran. |
541 | + * The ratio of sleep to run gives us the cpu% that we last ran and determines |
542 | + * the maximum bonus we can acquire. |
543 | + */ |
544 | +static void inc_bonus(task_t *p, unsigned long totalrun, unsigned long sleep) |
545 | +{ |
546 | + unsigned int best_bonus = sleep / (totalrun + 1); |
547 | + |
548 | + if (p->bonus >= best_bonus) |
549 | + return; |
550 | + best_bonus = bonus(p); |
551 | + if (p->bonus < best_bonus) |
552 | + p->bonus++; |
553 | +} |
554 | + |
555 | +static inline void dec_bonus(task_t *p) |
556 | +{ |
557 | + if (p->bonus) |
558 | + p->bonus--; |
559 | +} |
560 | + |
561 | +static inline void slice_overrun(struct task_struct *p) |
562 | +{ |
563 | + unsigned long ns_slice = JIFFIES_TO_NS(p->slice); |
564 | + |
565 | + do { |
566 | + p->totalrun -= ns_slice; |
567 | + dec_bonus(p); |
568 | + } while (unlikely(p->totalrun > ns_slice)); |
569 | +} |
570 | + |
571 | +/* |
572 | + * effective_prio - dynamic priority dependent on bonus. |
573 | + * The priority normally decreases by one each RR_INTERVAL. |
574 | + * As the bonus increases the initial priority starts at a higher "stair" or |
575 | + * priority for longer. |
576 | + */ |
577 | +static int effective_prio(const task_t *p) |
578 | +{ |
579 | + int prio; |
580 | + unsigned int full_slice, used_slice = 0; |
581 | + unsigned int best_bonus, rr; |
582 | + |
583 | + if (rt_task(p)) |
584 | + return p->prio; |
585 | + |
586 | + full_slice = slice(p); |
587 | + if (full_slice > p->slice) |
588 | + used_slice = full_slice - p->slice; |
589 | + |
590 | + best_bonus = bonus(p); |
591 | + prio = MAX_RT_PRIO + best_bonus; |
592 | + if (!batch_task(p)) |
593 | + prio -= p->bonus; |
594 | + |
595 | + rr = rr_interval(p); |
596 | + prio += used_slice / rr; |
597 | + if (prio > MIN_USER_PRIO) |
598 | + prio = MIN_USER_PRIO; |
599 | + return prio; |
600 | +} |
601 | + |
602 | +static inline void continue_slice(task_t *p) |
603 | +{ |
604 | + unsigned long total_run = NS_TO_JIFFIES(p->totalrun); |
605 | + |
606 | + if (unlikely(total_run >= p->slice)) |
607 | + slice_overrun(p); |
608 | else { |
609 | - if (__sleep_time > NS_MAX_SLEEP_AVG) |
610 | - sleep_time = NS_MAX_SLEEP_AVG; |
611 | - else |
612 | - sleep_time = (unsigned long)__sleep_time; |
613 | + unsigned long remainder; |
614 | + |
615 | + p->slice -= total_run; |
616 | + remainder = p->slice % rr_interval(p); |
617 | + if (remainder) |
618 | + p->time_slice = remainder; |
619 | } |
620 | +} |
621 | |
622 | - if (likely(sleep_time > 0)) { |
623 | - /* |
624 | - * User tasks that sleep a long time are categorised as |
625 | - * idle. They will only have their sleep_avg increased to a |
626 | - * level that makes them just interactive priority to stay |
627 | - * active yet prevent them suddenly becoming cpu hogs and |
628 | - * starving other processes. |
629 | - */ |
630 | - if (p->mm && sleep_time > INTERACTIVE_SLEEP(p)) { |
631 | - unsigned long ceiling; |
632 | +/* |
633 | + * recalc_task_prio - this checks for tasks that have run less than a full |
634 | + * slice and have woken up again soon after, or have just forked a |
635 | + * thread/process and make them continue their old slice instead of starting |
636 | + * a new one at high priority. |
637 | + */ |
638 | +static inline void recalc_task_prio(task_t *p, const unsigned long long now) |
639 | +{ |
640 | + unsigned long sleep_time; |
641 | |
642 | - ceiling = JIFFIES_TO_NS(MAX_SLEEP_AVG - |
643 | - DEF_TIMESLICE); |
644 | - if (p->sleep_avg < ceiling) |
645 | - p->sleep_avg = ceiling; |
646 | - } else { |
647 | - /* |
648 | - * Tasks waking from uninterruptible sleep are |
649 | - * limited in their sleep_avg rise as they |
650 | - * are likely to be waiting on I/O |
651 | - */ |
652 | - if (p->sleep_type == SLEEP_NONINTERACTIVE && p->mm) { |
653 | - if (p->sleep_avg >= INTERACTIVE_SLEEP(p)) |
654 | - sleep_time = 0; |
655 | - else if (p->sleep_avg + sleep_time >= |
656 | - INTERACTIVE_SLEEP(p)) { |
657 | - p->sleep_avg = INTERACTIVE_SLEEP(p); |
658 | - sleep_time = 0; |
659 | - } |
660 | - } |
661 | + /* |
662 | + * If this task has managed to run to its lowest priority then |
663 | + * decrease its bonus and requeue it now at best priority instead |
664 | + * of possibly flagging around lowest priority. Save up any systime |
665 | + * that may affect priority on the next reschedule. |
666 | + */ |
667 | + if (p->slice > p->time_slice && |
668 | + p->slice - NS_TO_JIFFIES(p->totalrun) < p->time_slice) { |
669 | + dec_bonus(p); |
670 | + p->totalrun = 0; |
671 | + return; |
672 | + } |
673 | |
674 | - /* |
675 | - * This code gives a bonus to interactive tasks. |
676 | - * |
677 | - * The boost works by updating the 'average sleep time' |
678 | - * value here, based on ->timestamp. The more time a |
679 | - * task spends sleeping, the higher the average gets - |
680 | - * and the higher the priority boost gets as well. |
681 | - */ |
682 | - p->sleep_avg += sleep_time; |
683 | + /* |
684 | + * Add the total for this last scheduled run (p->runtime) and system |
685 | + * time (p->systime) done on behalf of p to the running total so far |
686 | + * used (p->totalrun). |
687 | + */ |
688 | + p->totalrun += p->runtime + p->systime; |
689 | + sleep_time = ns_diff(now, p->timestamp); |
690 | |
691 | - if (p->sleep_avg > NS_MAX_SLEEP_AVG) |
692 | - p->sleep_avg = NS_MAX_SLEEP_AVG; |
693 | + if (p->systime > sleep_time || p->flags & PF_FORKED) |
694 | + sleep_time = 0; |
695 | + else { |
696 | + sleep_time -= p->systime; |
697 | + /* |
698 | + * We elevate priority by the amount of time we slept. If we |
699 | + * sleep longer than our running total and have not set the |
700 | + * PF_NONSLEEP flag we gain a bonus. |
701 | + */ |
702 | + if (sleep_time >= p->totalrun) { |
703 | + if (!(p->flags & PF_NONSLEEP)) |
704 | + inc_bonus(p, p->totalrun, sleep_time); |
705 | + p->totalrun = 0; |
706 | + return; |
707 | } |
708 | + p->totalrun -= sleep_time; |
709 | } |
710 | - |
711 | - return effective_prio(p); |
712 | + continue_slice(p); |
713 | } |
714 | |
715 | /* |
716 | * activate_task - move a task to the runqueue and do priority recalculation |
717 | * |
718 | - * Update all the scheduling statistics stuff. (sleep average |
719 | - * calculation, priority modifiers, etc.) |
720 | + * Update all the scheduling statistics stuff. (priority modifiers, etc.) |
721 | */ |
722 | -static void activate_task(task_t *p, runqueue_t *rq, int local) |
723 | +static void activate_task(task_t *p, runqueue_t *rq, const int local) |
724 | { |
725 | - unsigned long long now; |
726 | + unsigned long long now = sched_clock(); |
727 | + unsigned long rr = rr_interval(p); |
728 | |
729 | - now = sched_clock(); |
730 | #ifdef CONFIG_SMP |
731 | if (!local) { |
732 | /* Compensate for drifting sched_clock */ |
733 | @@ -838,45 +826,25 @@ static void activate_task(task_t *p, run |
734 | + rq->timestamp_last_tick; |
735 | } |
736 | #endif |
737 | - |
738 | - if (!rt_task(p)) |
739 | - p->prio = recalc_task_prio(p, now); |
740 | - |
741 | - /* |
742 | - * This checks to make sure it's not an uninterruptible task |
743 | - * that is now waking up. |
744 | - */ |
745 | - if (p->sleep_type == SLEEP_NORMAL) { |
746 | - /* |
747 | - * Tasks which were woken up by interrupts (ie. hw events) |
748 | - * are most likely of interactive nature. So we give them |
749 | - * the credit of extending their sleep time to the period |
750 | - * of time they spend on the runqueue, waiting for execution |
751 | - * on a CPU, first time around: |
752 | - */ |
753 | - if (in_interrupt()) |
754 | - p->sleep_type = SLEEP_INTERRUPTED; |
755 | - else { |
756 | - /* |
757 | - * Normal first-time wakeups get a credit too for |
758 | - * on-runqueue time, but it will be weighted down: |
759 | - */ |
760 | - p->sleep_type = SLEEP_INTERACTIVE; |
761 | - } |
762 | + p->slice = slice(p); |
763 | + p->time_slice = p->slice % rr ? : rr; |
764 | + if (!rt_task(p)) { |
765 | + recalc_task_prio(p, now); |
766 | + p->prio = effective_prio(p); |
767 | + p->systime = 0; |
768 | + p->flags &= ~(PF_FORKED | PF_NONSLEEP); |
769 | } |
770 | p->timestamp = now; |
771 | - |
772 | __activate_task(p, rq); |
773 | } |
774 | |
775 | /* |
776 | * deactivate_task - remove a task from the runqueue. |
777 | */ |
778 | -static void deactivate_task(struct task_struct *p, runqueue_t *rq) |
779 | +static void deactivate_task(task_t *p, runqueue_t *rq) |
780 | { |
781 | dec_nr_running(p, rq); |
782 | - dequeue_task(p, p->array); |
783 | - p->array = NULL; |
784 | + dequeue_task(p, rq); |
785 | } |
786 | |
787 | /* |
788 | @@ -952,7 +920,7 @@ static int migrate_task(task_t *p, int d |
789 | * If the task is not on a runqueue (and not running), then |
790 | * it is sufficient to simply update the task's cpu field. |
791 | */ |
792 | - if (!p->array && !task_running(rq, p)) { |
793 | + if (!task_queued(p) && !task_running(rq, p)) { |
794 | set_task_cpu(p, dest_cpu); |
795 | return 0; |
796 | } |
797 | @@ -982,7 +950,7 @@ void wait_task_inactive(task_t *p) |
798 | repeat: |
799 | rq = task_rq_lock(p, &flags); |
800 | /* Must be off runqueue entirely, not preempted. */ |
801 | - if (unlikely(p->array || task_running(rq, p))) { |
802 | + if (unlikely(task_queued(p) || task_running(rq, p))) { |
803 | /* If it's preempted, we yield. It could be a while. */ |
804 | preempted = !task_running(rq, p); |
805 | task_rq_unlock(rq, &flags); |
806 | @@ -1234,6 +1202,15 @@ static inline int wake_idle(int cpu, tas |
807 | } |
808 | #endif |
809 | |
810 | +/* |
811 | + * Check to see if p preempts rq->curr and resched if it does. |
812 | + */ |
813 | +static inline void preempt(const task_t *p, runqueue_t *rq) |
814 | +{ |
815 | + if (TASK_PREEMPTS_CURR(p, rq)) |
816 | + resched_task(rq->curr); |
817 | +} |
818 | + |
819 | /*** |
820 | * try_to_wake_up - wake up a thread |
821 | * @p: the to-be-woken-up thread |
822 | @@ -1265,7 +1242,7 @@ static int try_to_wake_up(task_t *p, uns |
823 | if (!(old_state & state)) |
824 | goto out; |
825 | |
826 | - if (p->array) |
827 | + if (task_queued(p)) |
828 | goto out_running; |
829 | |
830 | cpu = task_cpu(p); |
831 | @@ -1356,7 +1333,7 @@ out_set_cpu: |
832 | old_state = p->state; |
833 | if (!(old_state & state)) |
834 | goto out; |
835 | - if (p->array) |
836 | + if (task_queued(p)) |
837 | goto out_running; |
838 | |
839 | this_cpu = smp_processor_id(); |
840 | @@ -1365,25 +1342,9 @@ out_set_cpu: |
841 | |
842 | out_activate: |
843 | #endif /* CONFIG_SMP */ |
844 | - if (old_state == TASK_UNINTERRUPTIBLE) { |
845 | + if (old_state == TASK_UNINTERRUPTIBLE) |
846 | rq->nr_uninterruptible--; |
847 | - /* |
848 | - * Tasks on involuntary sleep don't earn |
849 | - * sleep_avg beyond just interactive state. |
850 | - */ |
851 | - p->sleep_type = SLEEP_NONINTERACTIVE; |
852 | - } else |
853 | - |
854 | - /* |
855 | - * Tasks that have marked their sleep as noninteractive get |
856 | - * woken up with their sleep average not weighted in an |
857 | - * interactive way. |
858 | - */ |
859 | - if (old_state & TASK_NONINTERACTIVE) |
860 | - p->sleep_type = SLEEP_NONINTERACTIVE; |
861 | - |
862 | |
863 | - activate_task(p, rq, cpu == this_cpu); |
864 | /* |
865 | * Sync wakeups (i.e. those types of wakeups where the waker |
866 | * has indicated that it will leave the CPU in short order) |
867 | @@ -1392,10 +1353,9 @@ out_activate: |
868 | * the waker guarantees that the freshly woken up task is going |
869 | * to be considered on this CPU.) |
870 | */ |
871 | - if (!sync || cpu != this_cpu) { |
872 | - if (TASK_PREEMPTS_CURR(p, rq)) |
873 | - resched_task(rq->curr); |
874 | - } |
875 | + activate_task(p, rq, cpu == this_cpu); |
876 | + if (!sync || cpu != this_cpu) |
877 | + preempt(p, rq); |
878 | success = 1; |
879 | |
880 | out_running: |
881 | @@ -1440,7 +1400,6 @@ void fastcall sched_fork(task_t *p, int |
882 | */ |
883 | p->state = TASK_RUNNING; |
884 | INIT_LIST_HEAD(&p->run_list); |
885 | - p->array = NULL; |
886 | #ifdef CONFIG_SCHEDSTATS |
887 | memset(&p->sched_info, 0, sizeof(p->sched_info)); |
888 | #endif |
889 | @@ -1451,30 +1410,6 @@ void fastcall sched_fork(task_t *p, int |
890 | /* Want to start with kernel preemption disabled. */ |
891 | task_thread_info(p)->preempt_count = 1; |
892 | #endif |
893 | - /* |
894 | - * Share the timeslice between parent and child, thus the |
895 | - * total amount of pending timeslices in the system doesn't change, |
896 | - * resulting in more scheduling fairness. |
897 | - */ |
898 | - local_irq_disable(); |
899 | - p->time_slice = (current->time_slice + 1) >> 1; |
900 | - /* |
901 | - * The remainder of the first timeslice might be recovered by |
902 | - * the parent if the child exits early enough. |
903 | - */ |
904 | - p->first_time_slice = 1; |
905 | - current->time_slice >>= 1; |
906 | - p->timestamp = sched_clock(); |
907 | - if (unlikely(!current->time_slice)) { |
908 | - /* |
909 | - * This case is rare, it happens when the parent has only |
910 | - * a single jiffy left from its timeslice. Taking the |
911 | - * runqueue lock is not a problem. |
912 | - */ |
913 | - current->time_slice = 1; |
914 | - scheduler_tick(); |
915 | - } |
916 | - local_irq_enable(); |
917 | put_cpu(); |
918 | } |
919 | |
920 | @@ -1496,37 +1431,20 @@ void fastcall wake_up_new_task(task_t *p |
921 | this_cpu = smp_processor_id(); |
922 | cpu = task_cpu(p); |
923 | |
924 | - /* |
925 | - * We decrease the sleep average of forking parents |
926 | - * and children as well, to keep max-interactive tasks |
927 | - * from forking tasks that are max-interactive. The parent |
928 | - * (current) is done further down, under its lock. |
929 | - */ |
930 | - p->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(p) * |
931 | - CHILD_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS); |
932 | - |
933 | - p->prio = effective_prio(p); |
934 | + /* Forked process gets no bonus to prevent fork bombs. */ |
935 | + p->bonus = 0; |
936 | + current->flags |= PF_FORKED; |
937 | |
938 | if (likely(cpu == this_cpu)) { |
939 | + activate_task(p, rq, 1); |
940 | if (!(clone_flags & CLONE_VM)) { |
941 | /* |
942 | * The VM isn't cloned, so we're in a good position to |
943 | * do child-runs-first in anticipation of an exec. This |
944 | * usually avoids a lot of COW overhead. |
945 | */ |
946 | - if (unlikely(!current->array)) |
947 | - __activate_task(p, rq); |
948 | - else { |
949 | - p->prio = current->prio; |
950 | - list_add_tail(&p->run_list, ¤t->run_list); |
951 | - p->array = current->array; |
952 | - p->array->nr_active++; |
953 | - inc_nr_running(p, rq); |
954 | - } |
955 | set_need_resched(); |
956 | - } else |
957 | - /* Run child last */ |
958 | - __activate_task(p, rq); |
959 | + } |
960 | /* |
961 | * We skip the following code due to cpu == this_cpu |
962 | * |
963 | @@ -1543,53 +1461,19 @@ void fastcall wake_up_new_task(task_t *p |
964 | */ |
965 | p->timestamp = (p->timestamp - this_rq->timestamp_last_tick) |
966 | + rq->timestamp_last_tick; |
967 | - __activate_task(p, rq); |
968 | - if (TASK_PREEMPTS_CURR(p, rq)) |
969 | - resched_task(rq->curr); |
970 | + activate_task(p, rq, 0); |
971 | + preempt(p, rq); |
972 | |
973 | /* |
974 | * Parent and child are on different CPUs, now get the |
975 | - * parent runqueue to update the parent's ->sleep_avg: |
976 | + * parent runqueue to update the parent's ->flags: |
977 | */ |
978 | task_rq_unlock(rq, &flags); |
979 | this_rq = task_rq_lock(current, &flags); |
980 | } |
981 | - current->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(current) * |
982 | - PARENT_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS); |
983 | task_rq_unlock(this_rq, &flags); |
984 | } |
985 | |
986 | -/* |
987 | - * Potentially available exiting-child timeslices are |
988 | - * retrieved here - this way the parent does not get |
989 | - * penalized for creating too many threads. |
990 | - * |
991 | - * (this cannot be used to 'generate' timeslices |
992 | - * artificially, because any timeslice recovered here |
993 | - * was given away by the parent in the first place.) |
994 | - */ |
995 | -void fastcall sched_exit(task_t *p) |
996 | -{ |
997 | - unsigned long flags; |
998 | - runqueue_t *rq; |
999 | - |
1000 | - /* |
1001 | - * If the child was a (relative-) CPU hog then decrease |
1002 | - * the sleep_avg of the parent as well. |
1003 | - */ |
1004 | - rq = task_rq_lock(p->parent, &flags); |
1005 | - if (p->first_time_slice && task_cpu(p) == task_cpu(p->parent)) { |
1006 | - p->parent->time_slice += p->time_slice; |
1007 | - if (unlikely(p->parent->time_slice > task_timeslice(p))) |
1008 | - p->parent->time_slice = task_timeslice(p); |
1009 | - } |
1010 | - if (p->sleep_avg < p->parent->sleep_avg) |
1011 | - p->parent->sleep_avg = p->parent->sleep_avg / |
1012 | - (EXIT_WEIGHT + 1) * EXIT_WEIGHT + p->sleep_avg / |
1013 | - (EXIT_WEIGHT + 1); |
1014 | - task_rq_unlock(rq, &flags); |
1015 | -} |
1016 | - |
1017 | /** |
1018 | * prepare_task_switch - prepare to switch tasks |
1019 | * @rq: the runqueue preparing to switch |
1020 | @@ -1885,23 +1769,21 @@ void sched_exec(void) |
1021 | * pull_task - move a task from a remote runqueue to the local runqueue. |
1022 | * Both runqueues must be locked. |
1023 | */ |
1024 | -static |
1025 | -void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p, |
1026 | - runqueue_t *this_rq, prio_array_t *this_array, int this_cpu) |
1027 | +static void pull_task(runqueue_t *src_rq, task_t *p, runqueue_t *this_rq, |
1028 | + const int this_cpu) |
1029 | { |
1030 | - dequeue_task(p, src_array); |
1031 | + dequeue_task(p, src_rq); |
1032 | dec_nr_running(p, src_rq); |
1033 | set_task_cpu(p, this_cpu); |
1034 | inc_nr_running(p, this_rq); |
1035 | - enqueue_task(p, this_array); |
1036 | + enqueue_task(p, this_rq); |
1037 | p->timestamp = (p->timestamp - src_rq->timestamp_last_tick) |
1038 | + this_rq->timestamp_last_tick; |
1039 | /* |
1040 | * Note that idle threads have a prio of MAX_PRIO, for this test |
1041 | * to be always true for them. |
1042 | */ |
1043 | - if (TASK_PREEMPTS_CURR(p, this_rq)) |
1044 | - resched_task(this_rq->curr); |
1045 | + preempt(p, this_rq); |
1046 | } |
1047 | |
1048 | /* |
1049 | @@ -1939,7 +1821,6 @@ int can_migrate_task(task_t *p, runqueue |
1050 | return 1; |
1051 | } |
1052 | |
1053 | -#define rq_best_prio(rq) min((rq)->curr->prio, (rq)->best_expired_prio) |
1054 | /* |
1055 | * move_tasks tries to move up to max_nr_move tasks and max_load_move weighted |
1056 | * load from busiest to this_rq, as part of a balancing operation within |
1057 | @@ -1952,7 +1833,6 @@ static int move_tasks(runqueue_t *this_r |
1058 | struct sched_domain *sd, enum idle_type idle, |
1059 | int *all_pinned) |
1060 | { |
1061 | - prio_array_t *array, *dst_array; |
1062 | struct list_head *head, *curr; |
1063 | int idx, pulled = 0, pinned = 0, this_best_prio, busiest_best_prio; |
1064 | int busiest_best_prio_seen; |
1065 | @@ -1965,8 +1845,8 @@ static int move_tasks(runqueue_t *this_r |
1066 | |
1067 | rem_load_move = max_load_move; |
1068 | pinned = 1; |
1069 | - this_best_prio = rq_best_prio(this_rq); |
1070 | - busiest_best_prio = rq_best_prio(busiest); |
1071 | + this_best_prio = this_rq->curr->prio; |
1072 | + busiest_best_prio = busiest->curr->prio; |
1073 | /* |
1074 | * Enable handling of the case where there is more than one task |
1075 | * with the best priority. If the current running task is one |
1076 | @@ -1976,38 +1856,17 @@ static int move_tasks(runqueue_t *this_r |
1077 | */ |
1078 | busiest_best_prio_seen = busiest_best_prio == busiest->curr->prio; |
1079 | |
1080 | - /* |
1081 | - * We first consider expired tasks. Those will likely not be |
1082 | - * executed in the near future, and they are most likely to |
1083 | - * be cache-cold, thus switching CPUs has the least effect |
1084 | - * on them. |
1085 | - */ |
1086 | - if (busiest->expired->nr_active) { |
1087 | - array = busiest->expired; |
1088 | - dst_array = this_rq->expired; |
1089 | - } else { |
1090 | - array = busiest->active; |
1091 | - dst_array = this_rq->active; |
1092 | - } |
1093 | - |
1094 | -new_array: |
1095 | /* Start searching at priority 0: */ |
1096 | idx = 0; |
1097 | skip_bitmap: |
1098 | if (!idx) |
1099 | - idx = sched_find_first_bit(array->bitmap); |
1100 | + idx = sched_find_first_bit(busiest->bitmap); |
1101 | else |
1102 | - idx = find_next_bit(array->bitmap, MAX_PRIO, idx); |
1103 | - if (idx >= MAX_PRIO) { |
1104 | - if (array == busiest->expired && busiest->active->nr_active) { |
1105 | - array = busiest->active; |
1106 | - dst_array = this_rq->active; |
1107 | - goto new_array; |
1108 | - } |
1109 | + idx = find_next_bit(busiest->bitmap, MAX_PRIO, idx); |
1110 | + if (idx >= MAX_PRIO) |
1111 | goto out; |
1112 | - } |
1113 | |
1114 | - head = array->queue + idx; |
1115 | + head = busiest->queue + idx; |
1116 | curr = head->prev; |
1117 | skip_queue: |
1118 | tmp = list_entry(curr, task_t, run_list); |
1119 | @@ -2036,7 +1895,7 @@ skip_queue: |
1120 | schedstat_inc(sd, lb_hot_gained[idle]); |
1121 | #endif |
1122 | |
1123 | - pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu); |
1124 | + pull_task(busiest, tmp, this_rq, this_cpu); |
1125 | pulled++; |
1126 | rem_load_move -= tmp->load_weight; |
1127 | |
1128 | @@ -2585,15 +2444,13 @@ static void rebalance_tick(int this_cpu, |
1129 | continue; |
1130 | |
1131 | interval = sd->balance_interval; |
1132 | - if (idle != SCHED_IDLE) |
1133 | - interval *= sd->busy_factor; |
1134 | |
1135 | /* scale ms to jiffies */ |
1136 | interval = msecs_to_jiffies(interval); |
1137 | if (unlikely(!interval)) |
1138 | interval = 1; |
1139 | |
1140 | - if (j - sd->last_balance >= interval) { |
1141 | + if (idle != SCHED_IDLE || j - sd->last_balance >= interval) { |
1142 | if (load_balance(this_cpu, this_rq, sd, idle)) { |
1143 | /* |
1144 | * We've pulled tasks over so either we're no |
1145 | @@ -2667,22 +2524,6 @@ unsigned long long current_sched_time(co |
1146 | } |
1147 | |
1148 | /* |
1149 | - * We place interactive tasks back into the active array, if possible. |
1150 | - * |
1151 | - * To guarantee that this does not starve expired tasks we ignore the |
1152 | - * interactivity of a task if the first expired task had to wait more |
1153 | - * than a 'reasonable' amount of time. This deadline timeout is |
1154 | - * load-dependent, as the frequency of array switched decreases with |
1155 | - * increasing number of running tasks. We also ignore the interactivity |
1156 | - * if a better static_prio task has expired: |
1157 | - */ |
1158 | -#define EXPIRED_STARVING(rq) \ |
1159 | - ((STARVATION_LIMIT && ((rq)->expired_timestamp && \ |
1160 | - (jiffies - (rq)->expired_timestamp >= \ |
1161 | - STARVATION_LIMIT * ((rq)->nr_running) + 1))) || \ |
1162 | - ((rq)->curr->static_prio > (rq)->best_expired_prio)) |
1163 | - |
1164 | -/* |
1165 | * Account user cpu time to a process. |
1166 | * @p: the process that the cpu time gets accounted to |
1167 | * @hardirq_offset: the offset to subtract from hardirq_count() |
1168 | @@ -2730,6 +2571,8 @@ void account_system_time(struct task_str |
1169 | cpustat->iowait = cputime64_add(cpustat->iowait, tmp); |
1170 | else |
1171 | cpustat->idle = cputime64_add(cpustat->idle, tmp); |
1172 | + |
1173 | + p->systime += NSJIFFY; |
1174 | /* Account for system time used */ |
1175 | acct_update_integrals(p); |
1176 | } |
1177 | @@ -2755,18 +2598,23 @@ void account_steal_time(struct task_stru |
1178 | cpustat->steal = cputime64_add(cpustat->steal, tmp); |
1179 | } |
1180 | |
1181 | +static void time_slice_expired(task_t *p, runqueue_t *rq) |
1182 | +{ |
1183 | + set_tsk_need_resched(p); |
1184 | + p->time_slice = rr_interval(p); |
1185 | + requeue_task(p, rq, effective_prio(p)); |
1186 | +} |
1187 | + |
1188 | /* |
1189 | * This function gets called by the timer code, with HZ frequency. |
1190 | * We call it with interrupts disabled. |
1191 | - * |
1192 | - * It also gets called by the fork code, when changing the parent's |
1193 | - * timeslices. |
1194 | */ |
1195 | void scheduler_tick(void) |
1196 | { |
1197 | int cpu = smp_processor_id(); |
1198 | runqueue_t *rq = this_rq(); |
1199 | task_t *p = current; |
1200 | + unsigned long debit; |
1201 | unsigned long long now = sched_clock(); |
1202 | |
1203 | update_cpu_clock(p, rq, now); |
1204 | @@ -2781,73 +2629,37 @@ void scheduler_tick(void) |
1205 | } |
1206 | |
1207 | /* Task might have expired already, but not scheduled off yet */ |
1208 | - if (p->array != rq->active) { |
1209 | + if (unlikely(!task_queued(p))) { |
1210 | set_tsk_need_resched(p); |
1211 | goto out; |
1212 | } |
1213 | + /* SCHED_FIFO tasks never run out of timeslice. */ |
1214 | + if (unlikely(p->policy == SCHED_FIFO)) |
1215 | + goto out; |
1216 | + |
1217 | spin_lock(&rq->lock); |
1218 | + debit = ns_diff(rq->timestamp_last_tick, p->timestamp); |
1219 | + p->ns_debit += debit; |
1220 | + if (p->ns_debit < NSJIFFY) |
1221 | + goto out_unlock; |
1222 | + p->ns_debit %= NSJIFFY; |
1223 | /* |
1224 | - * The task was running during this tick - update the |
1225 | - * time slice counter. Note: we do not update a thread's |
1226 | - * priority until it either goes to sleep or uses up its |
1227 | - * timeslice. This makes it possible for interactive tasks |
1228 | - * to use up their timeslices at their highest priority levels. |
1229 | + * Tasks lose bonus each time they use up a full slice(). |
1230 | */ |
1231 | - if (rt_task(p)) { |
1232 | - /* |
1233 | - * RR tasks need a special form of timeslice management. |
1234 | - * FIFO tasks have no timeslices. |
1235 | - */ |
1236 | - if ((p->policy == SCHED_RR) && !--p->time_slice) { |
1237 | - p->time_slice = task_timeslice(p); |
1238 | - p->first_time_slice = 0; |
1239 | - set_tsk_need_resched(p); |
1240 | - |
1241 | - /* put it at the end of the queue: */ |
1242 | - requeue_task(p, rq->active); |
1243 | - } |
1244 | + if (!--p->slice) { |
1245 | + dec_bonus(p); |
1246 | + p->totalrun = 0; |
1247 | + p->slice = slice(p); |
1248 | + time_slice_expired(p, rq); |
1249 | goto out_unlock; |
1250 | } |
1251 | + /* |
1252 | + * Tasks that run out of time_slice but still have slice left get |
1253 | + * requeued with a lower priority && RR_INTERVAL time_slice. |
1254 | + */ |
1255 | if (!--p->time_slice) { |
1256 | - dequeue_task(p, rq->active); |
1257 | - set_tsk_need_resched(p); |
1258 | - p->prio = effective_prio(p); |
1259 | - p->time_slice = task_timeslice(p); |
1260 | - p->first_time_slice = 0; |
1261 | - |
1262 | - if (!rq->expired_timestamp) |
1263 | - rq->expired_timestamp = jiffies; |
1264 | - if (!TASK_INTERACTIVE(p) || EXPIRED_STARVING(rq)) { |
1265 | - enqueue_task(p, rq->expired); |
1266 | - if (p->static_prio < rq->best_expired_prio) |
1267 | - rq->best_expired_prio = p->static_prio; |
1268 | - } else |
1269 | - enqueue_task(p, rq->active); |
1270 | - } else { |
1271 | - /* |
1272 | - * Prevent a too long timeslice allowing a task to monopolize |
1273 | - * the CPU. We do this by splitting up the timeslice into |
1274 | - * smaller pieces. |
1275 | - * |
1276 | - * Note: this does not mean the task's timeslices expire or |
1277 | - * get lost in any way, they just might be preempted by |
1278 | - * another task of equal priority. (one with higher |
1279 | - * priority would have preempted this task already.) We |
1280 | - * requeue this task to the end of the list on this priority |
1281 | - * level, which is in essence a round-robin of tasks with |
1282 | - * equal priority. |
1283 | - * |
1284 | - * This only applies to tasks in the interactive |
1285 | - * delta range with at least TIMESLICE_GRANULARITY to requeue. |
1286 | - */ |
1287 | - if (TASK_INTERACTIVE(p) && !((task_timeslice(p) - |
1288 | - p->time_slice) % TIMESLICE_GRANULARITY(p)) && |
1289 | - (p->time_slice >= TIMESLICE_GRANULARITY(p)) && |
1290 | - (p->array == rq->active)) { |
1291 | - |
1292 | - requeue_task(p, rq->active); |
1293 | - set_tsk_need_resched(p); |
1294 | - } |
1295 | + time_slice_expired(p, rq); |
1296 | + goto out_unlock; |
1297 | } |
1298 | out_unlock: |
1299 | spin_unlock(&rq->lock); |
1300 | @@ -2896,12 +2708,13 @@ static void wake_sleeping_dependent(int |
1301 | |
1302 | /* |
1303 | * number of 'lost' timeslices this task wont be able to fully |
1304 | - * utilize, if another task runs on a sibling. This models the |
1305 | + * utilise, if another task runs on a sibling. This models the |
1306 | * slowdown effect of other tasks running on siblings: |
1307 | */ |
1308 | -static inline unsigned long smt_slice(task_t *p, struct sched_domain *sd) |
1309 | +static inline unsigned long |
1310 | +smt_slice(const task_t *p, const struct sched_domain *sd) |
1311 | { |
1312 | - return p->time_slice * (100 - sd->per_cpu_gain) / 100; |
1313 | + return p->slice * (100 - sd->per_cpu_gain) / 100; |
1314 | } |
1315 | |
1316 | /* |
1317 | @@ -2964,7 +2777,7 @@ static int dependent_sleeper(int this_cp |
1318 | } else |
1319 | if (smt_curr->static_prio < p->static_prio && |
1320 | !TASK_PREEMPTS_CURR(p, smt_rq) && |
1321 | - smt_slice(smt_curr, sd) > task_timeslice(p)) |
1322 | + smt_slice(smt_curr, sd) > slice(p)) |
1323 | ret = 1; |
1324 | |
1325 | unlock: |
1326 | @@ -3015,12 +2828,6 @@ EXPORT_SYMBOL(sub_preempt_count); |
1327 | |
1328 | #endif |
1329 | |
1330 | -static inline int interactive_sleep(enum sleep_type sleep_type) |
1331 | -{ |
1332 | - return (sleep_type == SLEEP_INTERACTIVE || |
1333 | - sleep_type == SLEEP_INTERRUPTED); |
1334 | -} |
1335 | - |
1336 | /* |
1337 | * schedule() is the main scheduler function. |
1338 | */ |
1339 | @@ -3029,11 +2836,10 @@ asmlinkage void __sched schedule(void) |
1340 | long *switch_count; |
1341 | task_t *prev, *next; |
1342 | runqueue_t *rq; |
1343 | - prio_array_t *array; |
1344 | struct list_head *queue; |
1345 | unsigned long long now; |
1346 | - unsigned long run_time; |
1347 | - int cpu, idx, new_prio; |
1348 | + unsigned long debit; |
1349 | + int cpu, idx; |
1350 | |
1351 | /* |
1352 | * Test if we are atomic. Since do_exit() needs to call into |
1353 | @@ -3066,20 +2872,11 @@ need_resched_nonpreemptible: |
1354 | |
1355 | schedstat_inc(rq, sched_cnt); |
1356 | now = sched_clock(); |
1357 | - if (likely((long long)(now - prev->timestamp) < NS_MAX_SLEEP_AVG)) { |
1358 | - run_time = now - prev->timestamp; |
1359 | - if (unlikely((long long)(now - prev->timestamp) < 0)) |
1360 | - run_time = 0; |
1361 | - } else |
1362 | - run_time = NS_MAX_SLEEP_AVG; |
1363 | - |
1364 | - /* |
1365 | - * Tasks charged proportionately less run_time at high sleep_avg to |
1366 | - * delay them losing their interactive status |
1367 | - */ |
1368 | - run_time /= (CURRENT_BONUS(prev) ? : 1); |
1369 | |
1370 | spin_lock_irq(&rq->lock); |
1371 | + prev->runtime = ns_diff(now, prev->timestamp); |
1372 | + debit = ns_diff(now, rq->timestamp_last_tick) % NSJIFFY; |
1373 | + prev->ns_debit += debit; |
1374 | |
1375 | if (unlikely(prev->flags & PF_DEAD)) |
1376 | prev->state = EXIT_DEAD; |
1377 | @@ -3091,8 +2888,10 @@ need_resched_nonpreemptible: |
1378 | unlikely(signal_pending(prev)))) |
1379 | prev->state = TASK_RUNNING; |
1380 | else { |
1381 | - if (prev->state == TASK_UNINTERRUPTIBLE) |
1382 | + if (prev->state == TASK_UNINTERRUPTIBLE) { |
1383 | + prev->flags |= PF_NONSLEEP; |
1384 | rq->nr_uninterruptible++; |
1385 | + } |
1386 | deactivate_task(prev, rq); |
1387 | } |
1388 | } |
1389 | @@ -3102,64 +2901,30 @@ need_resched_nonpreemptible: |
1390 | idle_balance(cpu, rq); |
1391 | if (!rq->nr_running) { |
1392 | next = rq->idle; |
1393 | - rq->expired_timestamp = 0; |
1394 | wake_sleeping_dependent(cpu); |
1395 | goto switch_tasks; |
1396 | } |
1397 | } |
1398 | |
1399 | - array = rq->active; |
1400 | - if (unlikely(!array->nr_active)) { |
1401 | - /* |
1402 | - * Switch the active and expired arrays. |
1403 | - */ |
1404 | - schedstat_inc(rq, sched_switch); |
1405 | - rq->active = rq->expired; |
1406 | - rq->expired = array; |
1407 | - array = rq->active; |
1408 | - rq->expired_timestamp = 0; |
1409 | - rq->best_expired_prio = MAX_PRIO; |
1410 | - } |
1411 | - |
1412 | - idx = sched_find_first_bit(array->bitmap); |
1413 | - queue = array->queue + idx; |
1414 | + idx = sched_find_first_bit(rq->bitmap); |
1415 | + queue = rq->queue + idx; |
1416 | next = list_entry(queue->next, task_t, run_list); |
1417 | |
1418 | - if (!rt_task(next) && interactive_sleep(next->sleep_type)) { |
1419 | - unsigned long long delta = now - next->timestamp; |
1420 | - if (unlikely((long long)(now - next->timestamp) < 0)) |
1421 | - delta = 0; |
1422 | - |
1423 | - if (next->sleep_type == SLEEP_INTERACTIVE) |
1424 | - delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128; |
1425 | - |
1426 | - array = next->array; |
1427 | - new_prio = recalc_task_prio(next, next->timestamp + delta); |
1428 | - |
1429 | - if (unlikely(next->prio != new_prio)) { |
1430 | - dequeue_task(next, array); |
1431 | - next->prio = new_prio; |
1432 | - enqueue_task(next, array); |
1433 | - } |
1434 | - } |
1435 | - next->sleep_type = SLEEP_NORMAL; |
1436 | if (dependent_sleeper(cpu, rq, next)) |
1437 | next = rq->idle; |
1438 | + else { |
1439 | + prefetch(next); |
1440 | + prefetch_stack(next); |
1441 | + } |
1442 | switch_tasks: |
1443 | if (next == rq->idle) |
1444 | schedstat_inc(rq, sched_goidle); |
1445 | - prefetch(next); |
1446 | - prefetch_stack(next); |
1447 | + prev->timestamp = now; |
1448 | clear_tsk_need_resched(prev); |
1449 | rcu_qsctr_inc(task_cpu(prev)); |
1450 | |
1451 | update_cpu_clock(prev, rq, now); |
1452 | |
1453 | - prev->sleep_avg -= run_time; |
1454 | - if ((long)prev->sleep_avg <= 0) |
1455 | - prev->sleep_avg = 0; |
1456 | - prev->timestamp = prev->last_ran = now; |
1457 | - |
1458 | sched_info_switch(prev, next); |
1459 | if (likely(prev != next)) { |
1460 | next->timestamp = now; |
1461 | @@ -3591,9 +3356,8 @@ EXPORT_SYMBOL(sleep_on_timeout); |
1462 | void set_user_nice(task_t *p, long nice) |
1463 | { |
1464 | unsigned long flags; |
1465 | - prio_array_t *array; |
1466 | runqueue_t *rq; |
1467 | - int old_prio, new_prio, delta; |
1468 | + int queued, old_prio, new_prio, delta; |
1469 | |
1470 | if (TASK_NICE(p) == nice || nice < -20 || nice > 19) |
1471 | return; |
1472 | @@ -3612,9 +3376,8 @@ void set_user_nice(task_t *p, long nice) |
1473 | p->static_prio = NICE_TO_PRIO(nice); |
1474 | goto out_unlock; |
1475 | } |
1476 | - array = p->array; |
1477 | - if (array) { |
1478 | - dequeue_task(p, array); |
1479 | + if ((queued = task_queued(p))) { |
1480 | + dequeue_task(p, rq); |
1481 | dec_raw_weighted_load(rq, p); |
1482 | } |
1483 | |
1484 | @@ -3624,9 +3387,11 @@ void set_user_nice(task_t *p, long nice) |
1485 | p->static_prio = NICE_TO_PRIO(nice); |
1486 | set_load_weight(p); |
1487 | p->prio += delta; |
1488 | + if (p->bonus > bonus(p)) |
1489 | + p->bonus = bonus(p); |
1490 | |
1491 | - if (array) { |
1492 | - enqueue_task(p, array); |
1493 | + if (queued) { |
1494 | + enqueue_task(p, rq); |
1495 | inc_raw_weighted_load(rq, p); |
1496 | /* |
1497 | * If the task increased its priority or is running and |
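
set_user_nice() now keys off task_queued() rather than a cached p->array
pointer and requeues against the runqueue itself. The dynamic priority
shifts by the same delta as the static one, and the interactivity bonus is
clamped to the new nice level's maximum so a reniced task cannot keep a
larger bonus earned under its old weight. The delta arithmetic, worked
through standalone with mainline's NICE_TO_PRIO() mapping (the dynamic prio
value 118 is illustrative):

    #include <stdio.h>

    #define MAX_RT_PRIO 100
    #define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20) /* as in sched.h */

    int main(void)
    {
        int old_static = NICE_TO_PRIO(0);    /* 120 */
        int new_static = NICE_TO_PRIO(-10);  /* 110 */
        int delta = new_static - old_static; /* -10 */
        int prio = 118;                      /* dynamic prio, bonus applied */

        prio += delta;                 /* set_user_nice(): p->prio += delta */
        printf("renice 0 -> -10: static %d -> %d, dynamic -> %d\n",
               old_static, new_static, prio);
        return 0;
    }
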
1498 | @@ -3750,19 +3515,13 @@ static inline task_t *find_process_by_pi |
1499 | /* Actually do priority change: must hold rq lock. */ |
1500 | static void __setscheduler(struct task_struct *p, int policy, int prio) |
1501 | { |
1502 | - BUG_ON(p->array); |
1503 | + BUG_ON(task_queued(p)); |
1504 | p->policy = policy; |
1505 | p->rt_priority = prio; |
1506 | if (policy != SCHED_NORMAL && policy != SCHED_BATCH) { |
1507 | p->prio = MAX_RT_PRIO-1 - p->rt_priority; |
1508 | - } else { |
1509 | + } else |
1510 | p->prio = p->static_prio; |
1511 | - /* |
1512 | - * SCHED_BATCH tasks are treated as perpetual CPU hogs: |
1513 | - */ |
1514 | - if (policy == SCHED_BATCH) |
1515 | - p->sleep_avg = 0; |
1516 | - } |
1517 | set_load_weight(p); |
1518 | } |
1519 | |
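
__setscheduler() drops the SCHED_BATCH sleep_avg reset, staircase having no
sleep_avg to zero; the mapping itself is unchanged: RT policies invert
rt_priority into 0..MAX_RT_PRIO-1 (higher rt_priority, lower prio value),
everything else runs at static_prio. A standalone check of the inversion,
taking MAX_RT_PRIO as 100 per sched.h:

    #include <assert.h>
    #include <stdio.h>

    #define MAX_RT_PRIO 100

    /* same arithmetic as the RT branch of __setscheduler() */
    static int rt_to_prio(int rt_priority)
    {
        return MAX_RT_PRIO - 1 - rt_priority;
    }

    int main(void)
    {
        assert(rt_to_prio(99) == 0);  /* highest RT priority -> prio 0  */
        assert(rt_to_prio(1) == 98);  /* lowest RT priority  -> prio 98 */
        printf("SCHED_FIFO rt_priority 50 -> prio %d\n", rt_to_prio(50));
        return 0;
    }
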
1520 | @@ -3777,8 +3536,7 @@ int sched_setscheduler(struct task_struc |
1521 | struct sched_param *param) |
1522 | { |
1523 | int retval; |
1524 | - int oldprio, oldpolicy = -1; |
1525 | - prio_array_t *array; |
1526 | + int queued, oldprio, oldpolicy = -1; |
1527 | unsigned long flags; |
1528 | runqueue_t *rq; |
1529 | |
1530 | @@ -3840,12 +3598,11 @@ recheck: |
1531 | task_rq_unlock(rq, &flags); |
1532 | goto recheck; |
1533 | } |
1534 | - array = p->array; |
1535 | - if (array) |
1536 | + if ((queued = task_queued(p))) |
1537 | deactivate_task(p, rq); |
1538 | oldprio = p->prio; |
1539 | __setscheduler(p, policy, param->sched_priority); |
1540 | - if (array) { |
1541 | + if (queued) { |
1542 | __activate_task(p, rq); |
1543 | /* |
1544 | * Reschedule if we are currently running on this runqueue and |
1545 | @@ -3855,8 +3612,8 @@ recheck: |
1546 | if (task_running(rq, p)) { |
1547 | if (p->prio > oldprio) |
1548 | resched_task(rq->curr); |
1549 | - } else if (TASK_PREEMPTS_CURR(p, rq)) |
1550 | - resched_task(rq->curr); |
1551 | + } else |
1552 | + preempt(p, rq); |
1553 | } |
1554 | task_rq_unlock(rq, &flags); |
1555 | return 0; |
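
The open-coded TASK_PREEMPTS_CURR(p, rq) test plus resched_task(rq->curr)
becomes one call to preempt(p, rq), a helper this patch defines earlier in
kernel/sched.c. Its body is not visible in this hunk; the sketch below
assumes it keeps the mainline macro's semantics (a lower prio value
preempts) and uses illustrative-only types:

    #include <stdio.h>

    /* illustrative stand-ins; in the patch these are task_t / runqueue_t */
    struct task { int prio; };
    struct rq   { struct task *curr; };

    static void preempt_check(struct task *p, struct rq *rq)
    {
        if (p->prio < rq->curr->prio)  /* lower value == higher priority */
            printf("resched_task(curr): prio %d preempts prio %d\n",
                   p->prio, rq->curr->prio);
    }

    int main(void)
    {
        struct task curr = { .prio = 120 }, woken = { .prio = 110 };
        struct rq rq = { .curr = &curr };
        preempt_check(&woken, &rq);
        return 0;
    }
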
1556 | @@ -4113,43 +3870,22 @@ asmlinkage long sys_sched_getaffinity(pi |
1557 | |
1558 | /** |
1559 | * sys_sched_yield - yield the current processor to other threads. |
1560 | - * |
1561 | - * this function yields the current CPU by moving the calling thread |
1562 | - * to the expired array. If there are no other threads running on this |
1563 | - * CPU then this function will return. |
1564 | + * This function yields the current CPU by dropping the calling task to
1565 | + * the lowest user priority (RT tasks round-robin at their own priority).
1566 | */ |
1567 | asmlinkage long sys_sched_yield(void) |
1568 | { |
1569 | + int newprio; |
1570 | runqueue_t *rq = this_rq_lock(); |
1571 | - prio_array_t *array = current->array; |
1572 | - prio_array_t *target = rq->expired; |
1573 | |
1574 | + newprio = current->prio; |
1575 | schedstat_inc(rq, yld_cnt); |
1576 | - /* |
1577 | - * We implement yielding by moving the task into the expired |
1578 | - * queue. |
1579 | - * |
1580 | - * (special rule: RT tasks will just roundrobin in the active |
1581 | - * array.) |
1582 | - */ |
1583 | - if (rt_task(current)) |
1584 | - target = rq->active; |
1585 | + current->slice = slice(current); |
1586 | + current->time_slice = rr_interval(current); |
1587 | + if (likely(!rt_task(current))) |
1588 | + newprio = MIN_USER_PRIO; |
1589 | |
1590 | - if (array->nr_active == 1) { |
1591 | - schedstat_inc(rq, yld_act_empty); |
1592 | - if (!rq->expired->nr_active) |
1593 | - schedstat_inc(rq, yld_both_empty); |
1594 | - } else if (!rq->expired->nr_active) |
1595 | - schedstat_inc(rq, yld_exp_empty); |
1596 | - |
1597 | - if (array != target) { |
1598 | - dequeue_task(current, array); |
1599 | - enqueue_task(current, target); |
1600 | - } else |
1601 | - /* |
1602 | - * requeue_task is cheaper so perform that if possible. |
1603 | - */ |
1604 | - requeue_task(current, array); |
1605 | + requeue_task(current, rq, newprio); |
1606 | |
1607 | /* |
1608 | * Since we are going to call schedule() anyway, there's |
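
Yielding no longer means hopping to an expired array; the caller's slice and
round-robin interval are refreshed and a non-RT caller is requeued at
MIN_USER_PRIO, so every other runnable task gets the CPU first, while RT
tasks simply round-robin at their own priority. From user space nothing
changes; the standard sched_yield(2) exercises the new path:

    #include <sched.h>
    #include <stdio.h>

    int main(void)
    {
        /* under this policy a SCHED_NORMAL yielder drops to MIN_USER_PRIO,
         * so the call returns only after other runnable tasks have run */
        if (sched_yield() != 0) {
            perror("sched_yield");
            return 1;
        }
        puts("yielded and resumed");
        return 0;
    }

Mainline achieved a similar effect by moving the yielder to the expired
array, per the comment the hunk deletes; here the single array makes a
priority drop the natural equivalent.
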
1609 | @@ -4358,7 +4094,7 @@ long sys_sched_rr_get_interval(pid_t pid |
1610 | goto out_unlock; |
1611 | |
1612 | jiffies_to_timespec(p->policy & SCHED_FIFO ? |
1613 | - 0 : task_timeslice(p), &t); |
1614 | + 0 : slice(p), &t); |
1615 | read_unlock(&tasklist_lock); |
1616 | retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; |
1617 | out_nounlock: |
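
The interval reported to user space is now the staircase slice(p) rather
than mainline's task_timeslice(p); the syscall's interface and units are
untouched. A standard usage example against the unchanged API:

    #include <sched.h>
    #include <stdio.h>
    #include <time.h>

    int main(void)
    {
        struct timespec ts;

        /* pid 0 == calling thread; reports 0 for SCHED_FIFO, else the
         * timeslice, which under this patch is slice(p) */
        if (sched_rr_get_interval(0, &ts) != 0) {
            perror("sched_rr_get_interval");
            return 1;
        }
        printf("timeslice: %ld.%09ld s\n", (long)ts.tv_sec, ts.tv_nsec);
        return 0;
    }
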
1618 | @@ -4481,8 +4217,6 @@ void __devinit init_idle(task_t *idle, i |
1619 | unsigned long flags; |
1620 | |
1621 | idle->timestamp = sched_clock(); |
1622 | - idle->sleep_avg = 0; |
1623 | - idle->array = NULL; |
1624 | idle->prio = MAX_PRIO; |
1625 | idle->state = TASK_RUNNING; |
1626 | idle->cpus_allowed = cpumask_of_cpu(cpu); |
1627 | @@ -4599,7 +4333,7 @@ static void __migrate_task(struct task_s |
1628 | goto out; |
1629 | |
1630 | set_task_cpu(p, dest_cpu); |
1631 | - if (p->array) { |
1632 | + if (task_queued(p)) { |
1633 | /* |
1634 | * Sync timestamp with rq_dest's before activating. |
1635 | * The same thing could be achieved by doing this step |
1636 | @@ -4610,8 +4344,7 @@ static void __migrate_task(struct task_s |
1637 | + rq_dest->timestamp_last_tick; |
1638 | deactivate_task(p, rq_src); |
1639 | activate_task(p, rq_dest, 0); |
1640 | - if (TASK_PREEMPTS_CURR(p, rq_dest)) |
1641 | - resched_task(rq_dest->curr); |
1642 | + preempt(p, rq_dest); |
1643 | } |
1644 | |
1645 | out: |
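
__migrate_task() keeps mainline's clock handling: before reactivation the
task's timestamp is rebased from the source runqueue's clock to the
destination's through their timestamp_last_tick values, and the new
preempt() helper then tests the arrival against the destination's current
task. The rebase in isolation (names mirror the patch):

    #include <stdio.h>

    typedef unsigned long long u64;

    /* p->timestamp = p->timestamp - rq_src->timestamp_last_tick
     *                             + rq_dest->timestamp_last_tick; */
    static u64 rebase(u64 timestamp, u64 src_last_tick, u64 dest_last_tick)
    {
        return timestamp - src_last_tick + dest_last_tick;
    }

    int main(void)
    {
        /* a task stamped 500ns after the source CPU's last tick keeps that
         * 500ns offset relative to the destination CPU's last tick */
        printf("%llu\n", rebase(10500, 10000, 20000));  /* -> 20500 */
        return 0;
    }
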
1646 | @@ -4825,7 +4558,7 @@ static void migrate_dead_tasks(unsigned |
1647 | |
1648 | for (arr = 0; arr < 2; arr++) { |
1649 | for (i = 0; i < MAX_PRIO; i++) { |
1650 | - struct list_head *list = &rq->arrays[arr].queue[i]; |
1651 | + struct list_head *list = &rq->queue[i]; |
1652 | while (!list_empty(list)) |
1653 | migrate_dead(dead_cpu, |
1654 | list_entry(list->next, task_t, |
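
Note that the outer arr loop survives here as unmodified diff context, so
migrate_dead_tasks() now walks the single remaining queue twice; the second
pass is a harmless no-op, since every list was drained on the first. A toy
demonstration of why the redundancy is benign:

    #include <stdio.h>

    int main(void)
    {
        int queue[3] = { 2, 1, 0 };      /* tasks left per priority list */

        /* the retained loop makes two passes over the one queue; the
         * second finds every list already empty and migrates nothing */
        for (int arr = 0; arr < 2; arr++) {
            int moved = 0;
            for (int i = 0; i < 3; i++)
                while (queue[i]) { queue[i]--; moved++; }
            printf("pass %d migrated %d tasks\n", arr, moved);
        }
        return 0;
    }
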
1655 | @@ -6226,17 +5959,13 @@ int in_sched_functions(unsigned long add |
1656 | void __init sched_init(void) |
1657 | { |
1658 | runqueue_t *rq; |
1659 | - int i, j, k; |
1660 | + int i, j; |
1661 | |
1662 | for_each_possible_cpu(i) { |
1663 | - prio_array_t *array; |
1664 | |
1665 | rq = cpu_rq(i); |
1666 | spin_lock_init(&rq->lock); |
1667 | rq->nr_running = 0; |
1668 | - rq->active = rq->arrays; |
1669 | - rq->expired = rq->arrays + 1; |
1670 | - rq->best_expired_prio = MAX_PRIO; |
1671 | |
1672 | #ifdef CONFIG_SMP |
1673 | rq->sd = NULL; |
1674 | @@ -6248,16 +5977,11 @@ void __init sched_init(void) |
1675 | INIT_LIST_HEAD(&rq->migration_queue); |
1676 | #endif |
1677 | atomic_set(&rq->nr_iowait, 0); |
1678 | - |
1679 | - for (j = 0; j < 2; j++) { |
1680 | - array = rq->arrays + j; |
1681 | - for (k = 0; k < MAX_PRIO; k++) { |
1682 | - INIT_LIST_HEAD(array->queue + k); |
1683 | - __clear_bit(k, array->bitmap); |
1684 | - } |
1685 | - // delimiter for bitsearch |
1686 | - __set_bit(MAX_PRIO, array->bitmap); |
1687 | - } |
1688 | + for (j = 0; j < MAX_PRIO; j++) |
1689 | + INIT_LIST_HEAD(&rq->queue[j]); |
1690 | + memset(rq->bitmap, 0, BITS_TO_LONGS(MAX_PRIO)*sizeof(long)); |
1691 | + /* delimiter for bitsearch */ |
1692 | + __set_bit(MAX_PRIO, rq->bitmap); |
1693 | } |
1694 | |
1695 | set_load_weight(&init_task); |
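
Per-CPU setup shrinks to one set of MAX_PRIO list heads plus a single
bitmap, zeroed and then given its delimiter bit at MAX_PRIO. The memset
covers BITS_TO_LONGS(MAX_PRIO) longs, which with MAX_PRIO at 140 rounds up
far enough to include the delimiter bit for both 32- and 64-bit longs, as
this standalone check shows:

    #include <stdio.h>

    #define BITS_PER_LONG    (8 * sizeof(long))
    #define BITS_TO_LONGS(n) (((n) + BITS_PER_LONG - 1) / BITS_PER_LONG)
    #define MAX_PRIO         140

    int main(void)
    {
        /* the delimiter lives at bit MAX_PRIO, i.e. bit 140; the round-up
         * to whole longs covers it: 3*64 = 192 bits, or 5*32 = 160 bits */
        printf("longs: %zu, bits covered: %zu\n",
               BITS_TO_LONGS(MAX_PRIO),
               BITS_TO_LONGS(MAX_PRIO) * BITS_PER_LONG);
        return 0;
    }
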
1696 | @@ -6302,9 +6026,9 @@ EXPORT_SYMBOL(__might_sleep); |
1697 | void normalize_rt_tasks(void) |
1698 | { |
1699 | struct task_struct *p; |
1700 | - prio_array_t *array; |
1701 | unsigned long flags; |
1702 | runqueue_t *rq; |
1703 | + int queued; |
1704 | |
1705 | read_lock_irq(&tasklist_lock); |
1706 | for_each_process(p) { |
1707 | @@ -6313,11 +6037,10 @@ void normalize_rt_tasks(void) |
1708 | |
1709 | rq = task_rq_lock(p, &flags); |
1710 | |
1711 | - array = p->array; |
1712 | - if (array) |
1713 | + if ((queued = task_queued(p))) |
1714 | deactivate_task(p, task_rq(p)); |
1715 | __setscheduler(p, SCHED_NORMAL, 0); |
1716 | - if (array) { |
1717 | + if (queued) { |
1718 | __activate_task(p, task_rq(p)); |
1719 | resched_task(rq->curr); |
1720 | } |