Annotation of /trunk/kernel26-magellan/patches-2.6.20-r3/0001-2.6.20-sched-staircase-17.patch

Revision 132 - Fri Apr 27 15:24:56 2007 UTC by niro
File size: 53298 byte(s)
files for 2.6.20-r4
Implement the "staircase" hybrid foreground-background single priority
array cpu scheduler policy.

Signed-off-by: Con Kolivas <kernel@kolivas.org>
---
 fs/proc/array.c       |    4
 include/linux/sched.h |   20
 kernel/exit.c         |    1
 kernel/sched.c        | 1084 ++++++++++++++++++--------------------
 4 files changed, 404 insertions(+), 705 deletions(-)

Index: linux-2.6.20-ck1/fs/proc/array.c
===================================================================
--- linux-2.6.20-ck1.orig/fs/proc/array.c 2007-02-05 22:52:03.000000000 +1100
+++ linux-2.6.20-ck1/fs/proc/array.c 2007-02-16 19:01:30.000000000 +1100
@@ -165,7 +165,7 @@ static inline char * task_state(struct t
 rcu_read_lock();
 buffer += sprintf(buffer,
 "State:\t%s\n"
- "SleepAVG:\t%lu%%\n"
+ "Bonus:\t%d\n"
 "Tgid:\t%d\n"
 "Pid:\t%d\n"
 "PPid:\t%d\n"
@@ -173,7 +173,7 @@ static inline char * task_state(struct t
 "Uid:\t%d\t%d\t%d\t%d\n"
 "Gid:\t%d\t%d\t%d\t%d\n",
 get_task_state(p),
- (p->sleep_avg/1024)*100/(1020000000/1024),
+ p->bonus,
 p->tgid, p->pid,
 pid_alive(p) ? rcu_dereference(p->real_parent)->tgid : 0,
 pid_alive(p) && p->ptrace ? rcu_dereference(p->parent)->pid : 0,
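With this hunk the SleepAVG line of /proc/<pid>/status becomes a Bonus line. A hedged sketch (assumes a staircase-patched kernel; on a vanilla kernel no Bonus line exists) of reading it back:

#include <stdio.h>
#include <string.h>

int main(void)
{
	char line[256];
	FILE *f = fopen("/proc/self/status", "r");

	if (!f)
		return 1;
	/* Print the "Bonus:" line that replaces "SleepAVG:" on patched kernels. */
	while (fgets(line, sizeof(line), f))
		if (!strncmp(line, "Bonus:", 6))
			fputs(line, stdout);
	fclose(f);
	return 0;
}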
Index: linux-2.6.20-ck1/kernel/exit.c
===================================================================
--- linux-2.6.20-ck1.orig/kernel/exit.c 2007-02-05 22:52:04.000000000 +1100
+++ linux-2.6.20-ck1/kernel/exit.c 2007-02-16 19:01:30.000000000 +1100
@@ -170,7 +170,6 @@ repeat:
 zap_leader = (leader->exit_signal == -1);
 }

- sched_exit(p);
 write_unlock_irq(&tasklist_lock);
 proc_flush_task(p);
 release_thread(p);
Index: linux-2.6.20-ck1/include/linux/sched.h
===================================================================
--- linux-2.6.20-ck1.orig/include/linux/sched.h 2007-02-05 22:52:04.000000000 +1100
+++ linux-2.6.20-ck1/include/linux/sched.h 2007-02-16 19:01:30.000000000 +1100
@@ -524,6 +524,7 @@ struct signal_struct {
 #define MAX_RT_PRIO MAX_USER_RT_PRIO

 #define MAX_PRIO (MAX_RT_PRIO + 40)
+#define MIN_USER_PRIO (MAX_PRIO - 1)

 #define rt_prio(prio) unlikely((prio) < MAX_RT_PRIO)
 #define rt_task(p) rt_prio((p)->prio)
@@ -789,15 +790,6 @@ struct mempolicy;
 struct pipe_inode_info;
 struct uts_namespace;

-enum sleep_type {
- SLEEP_NORMAL,
- SLEEP_NONINTERACTIVE,
- SLEEP_INTERACTIVE,
- SLEEP_INTERRUPTED,
-};
-
-struct prio_array;
-
 struct task_struct {
 volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */
 struct thread_info *thread_info;
@@ -815,20 +807,19 @@ struct task_struct {
 int load_weight; /* for niceness load balancing purposes */
 int prio, static_prio, normal_prio;
 struct list_head run_list;
- struct prio_array *array;

 unsigned short ioprio;
 #ifdef CONFIG_BLK_DEV_IO_TRACE
 unsigned int btrace_seq;
 #endif
- unsigned long sleep_avg;
 unsigned long long timestamp, last_ran;
+ unsigned long runtime, totalrun, ns_debit, systime;
+ unsigned int bonus;
+ unsigned int slice, time_slice;
 unsigned long long sched_time; /* sched_clock time spent running */
- enum sleep_type sleep_type;

 unsigned long policy;
 cpumask_t cpus_allowed;
- unsigned int time_slice, first_time_slice;

 #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
 struct sched_info sched_info;
@@ -1157,6 +1148,8 @@ static inline void put_task_struct(struc
 #define PF_SPREAD_SLAB 0x02000000 /* Spread some slab caches over cpuset */
 #define PF_MEMPOLICY 0x10000000 /* Non-default NUMA mempolicy */
 #define PF_MUTEX_TESTER 0x20000000 /* Thread belongs to the rt mutex tester */
+#define PF_NONSLEEP 0x40000000 /* Waiting on in kernel activity */
+#define PF_FORKED 0x80000000 /* Task just forked another process */

 /*
 * Only the _current_ task can read/write to tsk->flags, but other
@@ -1291,7 +1284,6 @@ extern void FASTCALL(wake_up_new_task(st
 static inline void kick_process(struct task_struct *tsk) { }
 #endif
 extern void FASTCALL(sched_fork(struct task_struct * p, int clone_flags));
-extern void FASTCALL(sched_exit(struct task_struct * p));

 extern int in_group_p(gid_t);
 extern int in_egroup_p(gid_t);
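The kernel/sched.c changes below add a longlimit()/ns_diff() helper pair that clamps nanosecond differences so they fit an unsigned long on 32-bit builds. A standalone sketch of that behaviour (the 2^31 cap and the backwards-clock fallback mirror the patch; the test values are illustrative only):

#include <stdio.h>

/* Mirrors longlimit()/ns_diff() from the patch for BITS_PER_LONG < 64. */
static unsigned long ns_diff(unsigned long long v1, unsigned long long v2)
{
	unsigned long long vdiff;

	if (v1 >= v2) {
		vdiff = v1 - v2;
		if (vdiff > (1ULL << 31))	/* longlimit() cap */
			vdiff = 1ULL << 31;
	} else {
		/* Rarely the clock appears to go backwards; return a positive diff. */
		vdiff = 1;
	}
	return (unsigned long)vdiff;
}

int main(void)
{
	printf("%lu\n", ns_diff(5000000000ULL, 1000000000ULL));	/* 4e9ns, capped to 2147483648 */
	printf("%lu\n", ns_diff(1000ULL, 2000ULL));		/* backwards clock -> 1 */
	return 0;
}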
115 | Index: linux-2.6.20-ck1/kernel/sched.c | ||
116 | =================================================================== | ||
117 | --- linux-2.6.20-ck1.orig/kernel/sched.c 2007-02-05 22:52:04.000000000 +1100 | ||
118 | +++ linux-2.6.20-ck1/kernel/sched.c 2007-02-16 19:01:30.000000000 +1100 | ||
119 | @@ -16,6 +16,10 @@ | ||
120 | * by Davide Libenzi, preemptible kernel bits by Robert Love. | ||
121 | * 2003-09-03 Interactivity tuning by Con Kolivas. | ||
122 | * 2004-04-02 Scheduler domains code by Nick Piggin | ||
123 | + * 2007-02-14 Staircase scheduling policy by Con Kolivas with help | ||
124 | + * from William Lee Irwin III, Zwane Mwaikambo, Peter Williams | ||
125 | + * and Andreas Mohr. | ||
126 | + * Staircase v17 | ||
127 | */ | ||
128 | |||
129 | #include <linux/mm.h> | ||
130 | @@ -77,123 +81,19 @@ | ||
131 | /* | ||
132 | * Some helpers for converting nanosecond timing to jiffy resolution | ||
133 | */ | ||
134 | -#define NS_TO_JIFFIES(TIME) ((TIME) / (1000000000 / HZ)) | ||
135 | -#define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ)) | ||
136 | - | ||
137 | -/* | ||
138 | - * These are the 'tuning knobs' of the scheduler: | ||
139 | - * | ||
140 | - * Minimum timeslice is 5 msecs (or 1 jiffy, whichever is larger), | ||
141 | - * default timeslice is 100 msecs, maximum timeslice is 800 msecs. | ||
142 | - * Timeslices get refilled after they expire. | ||
143 | - */ | ||
144 | -#define MIN_TIMESLICE max(5 * HZ / 1000, 1) | ||
145 | -#define DEF_TIMESLICE (100 * HZ / 1000) | ||
146 | -#define ON_RUNQUEUE_WEIGHT 30 | ||
147 | -#define CHILD_PENALTY 95 | ||
148 | -#define PARENT_PENALTY 100 | ||
149 | -#define EXIT_WEIGHT 3 | ||
150 | -#define PRIO_BONUS_RATIO 25 | ||
151 | -#define MAX_BONUS (MAX_USER_PRIO * PRIO_BONUS_RATIO / 100) | ||
152 | -#define INTERACTIVE_DELTA 2 | ||
153 | -#define MAX_SLEEP_AVG (DEF_TIMESLICE * MAX_BONUS) | ||
154 | -#define STARVATION_LIMIT (MAX_SLEEP_AVG) | ||
155 | -#define NS_MAX_SLEEP_AVG (JIFFIES_TO_NS(MAX_SLEEP_AVG)) | ||
156 | - | ||
157 | -/* | ||
158 | - * If a task is 'interactive' then we reinsert it in the active | ||
159 | - * array after it has expired its current timeslice. (it will not | ||
160 | - * continue to run immediately, it will still roundrobin with | ||
161 | - * other interactive tasks.) | ||
162 | - * | ||
163 | - * This part scales the interactivity limit depending on niceness. | ||
164 | - * | ||
165 | - * We scale it linearly, offset by the INTERACTIVE_DELTA delta. | ||
166 | - * Here are a few examples of different nice levels: | ||
167 | - * | ||
168 | - * TASK_INTERACTIVE(-20): [1,1,1,1,1,1,1,1,1,0,0] | ||
169 | - * TASK_INTERACTIVE(-10): [1,1,1,1,1,1,1,0,0,0,0] | ||
170 | - * TASK_INTERACTIVE( 0): [1,1,1,1,0,0,0,0,0,0,0] | ||
171 | - * TASK_INTERACTIVE( 10): [1,1,0,0,0,0,0,0,0,0,0] | ||
172 | - * TASK_INTERACTIVE( 19): [0,0,0,0,0,0,0,0,0,0,0] | ||
173 | - * | ||
174 | - * (the X axis represents the possible -5 ... 0 ... +5 dynamic | ||
175 | - * priority range a task can explore, a value of '1' means the | ||
176 | - * task is rated interactive.) | ||
177 | - * | ||
178 | - * Ie. nice +19 tasks can never get 'interactive' enough to be | ||
179 | - * reinserted into the active array. And only heavily CPU-hog nice -20 | ||
180 | - * tasks will be expired. Default nice 0 tasks are somewhere between, | ||
181 | - * it takes some effort for them to get interactive, but it's not | ||
182 | - * too hard. | ||
183 | - */ | ||
184 | - | ||
185 | -#define CURRENT_BONUS(p) \ | ||
186 | - (NS_TO_JIFFIES((p)->sleep_avg) * MAX_BONUS / \ | ||
187 | - MAX_SLEEP_AVG) | ||
188 | - | ||
189 | -#define GRANULARITY (10 * HZ / 1000 ? : 1) | ||
190 | - | ||
191 | -#ifdef CONFIG_SMP | ||
192 | -#define TIMESLICE_GRANULARITY(p) (GRANULARITY * \ | ||
193 | - (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)) * \ | ||
194 | - num_online_cpus()) | ||
195 | -#else | ||
196 | -#define TIMESLICE_GRANULARITY(p) (GRANULARITY * \ | ||
197 | - (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1))) | ||
198 | -#endif | ||
199 | - | ||
200 | -#define SCALE(v1,v1_max,v2_max) \ | ||
201 | - (v1) * (v2_max) / (v1_max) | ||
202 | - | ||
203 | -#define DELTA(p) \ | ||
204 | - (SCALE(TASK_NICE(p) + 20, 40, MAX_BONUS) - 20 * MAX_BONUS / 40 + \ | ||
205 | - INTERACTIVE_DELTA) | ||
206 | - | ||
207 | -#define TASK_INTERACTIVE(p) \ | ||
208 | - ((p)->prio <= (p)->static_prio - DELTA(p)) | ||
209 | - | ||
210 | -#define INTERACTIVE_SLEEP(p) \ | ||
211 | - (JIFFIES_TO_NS(MAX_SLEEP_AVG * \ | ||
212 | - (MAX_BONUS / 2 + DELTA((p)) + 1) / MAX_BONUS - 1)) | ||
213 | +#define NSJIFFY (1000000000 / HZ) /* One jiffy in ns */ | ||
214 | +#define NS_TO_JIFFIES(TIME) ((TIME) / NSJIFFY) | ||
215 | +#define JIFFIES_TO_NS(TIME) ((TIME) * NSJIFFY) | ||
216 | |||
217 | #define TASK_PREEMPTS_CURR(p, rq) \ | ||
218 | ((p)->prio < (rq)->curr->prio) | ||
219 | |||
220 | -#define SCALE_PRIO(x, prio) \ | ||
221 | - max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_TIMESLICE) | ||
222 | - | ||
223 | -static unsigned int static_prio_timeslice(int static_prio) | ||
224 | -{ | ||
225 | - if (static_prio < NICE_TO_PRIO(0)) | ||
226 | - return SCALE_PRIO(DEF_TIMESLICE * 4, static_prio); | ||
227 | - else | ||
228 | - return SCALE_PRIO(DEF_TIMESLICE, static_prio); | ||
229 | -} | ||
230 | - | ||
231 | /* | ||
232 | - * task_timeslice() scales user-nice values [ -20 ... 0 ... 19 ] | ||
233 | - * to time slice values: [800ms ... 100ms ... 5ms] | ||
234 | - * | ||
235 | - * The higher a thread's priority, the bigger timeslices | ||
236 | - * it gets during one round of execution. But even the lowest | ||
237 | - * priority thread gets MIN_TIMESLICE worth of execution time. | ||
238 | + * This is the time all tasks within the same priority round robin. | ||
239 | + * Set to a minimum of 6ms. | ||
240 | */ | ||
241 | - | ||
242 | -static inline unsigned int task_timeslice(struct task_struct *p) | ||
243 | -{ | ||
244 | - return static_prio_timeslice(p->static_prio); | ||
245 | -} | ||
246 | - | ||
247 | -/* | ||
248 | - * These are the runqueue data structures: | ||
249 | - */ | ||
250 | - | ||
251 | -struct prio_array { | ||
252 | - unsigned int nr_active; | ||
253 | - DECLARE_BITMAP(bitmap, MAX_PRIO+1); /* include 1 bit for delimiter */ | ||
254 | - struct list_head queue[MAX_PRIO]; | ||
255 | -}; | ||
256 | +#define RR_INTERVAL ((6 * HZ / 1001) + 1) | ||
257 | +#define DEF_TIMESLICE (RR_INTERVAL * 19) | ||
258 | |||
259 | /* | ||
260 | * This is the main, per-CPU runqueue data structure. | ||
261 | @@ -224,14 +124,13 @@ struct rq { | ||
262 | */ | ||
263 | unsigned long nr_uninterruptible; | ||
264 | |||
265 | - unsigned long expired_timestamp; | ||
266 | /* Cached timestamp set by update_cpu_clock() */ | ||
267 | unsigned long long most_recent_timestamp; | ||
268 | struct task_struct *curr, *idle; | ||
269 | unsigned long next_balance; | ||
270 | struct mm_struct *prev_mm; | ||
271 | - struct prio_array *active, *expired, arrays[2]; | ||
272 | - int best_expired_prio; | ||
273 | + unsigned long bitmap[BITS_TO_LONGS(MAX_PRIO + 1)]; | ||
274 | + struct list_head queue[MAX_PRIO]; | ||
275 | atomic_t nr_iowait; | ||
276 | |||
277 | #ifdef CONFIG_SMP | ||
278 | @@ -568,13 +467,7 @@ static inline struct rq *this_rq_lock(vo | ||
279 | |||
280 | #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) | ||
281 | /* | ||
282 | - * Called when a process is dequeued from the active array and given | ||
283 | - * the cpu. We should note that with the exception of interactive | ||
284 | - * tasks, the expired queue will become the active queue after the active | ||
285 | - * queue is empty, without explicitly dequeuing and requeuing tasks in the | ||
286 | - * expired queue. (Interactive tasks may be requeued directly to the | ||
287 | - * active queue, thus delaying tasks in the expired queue from running; | ||
288 | - * see scheduler_tick()). | ||
289 | + * Called when a process is dequeued and given the cpu. | ||
290 | * | ||
291 | * This function is only called from sched_info_arrive(), rather than | ||
292 | * dequeue_task(). Even though a task may be queued and dequeued multiple | ||
293 | @@ -607,13 +500,11 @@ static void sched_info_arrive(struct tas | ||
294 | } | ||
295 | |||
296 | /* | ||
297 | - * Called when a process is queued into either the active or expired | ||
298 | - * array. The time is noted and later used to determine how long we | ||
299 | - * had to wait for us to reach the cpu. Since the expired queue will | ||
300 | - * become the active queue after active queue is empty, without dequeuing | ||
301 | - * and requeuing any tasks, we are interested in queuing to either. It | ||
302 | - * is unusual but not impossible for tasks to be dequeued and immediately | ||
303 | - * requeued in the same or another array: this can happen in sched_yield(), | ||
304 | + * Called when a process is queued. | ||
305 | + * The time is noted and later used to determine how long we had to wait for | ||
306 | + * us to reach the cpu. | ||
307 | + * It is unusual but not impossible for tasks to be dequeued and immediately | ||
308 | + * requeued: this can happen in sched_yield(), | ||
309 | * set_user_nice(), and even load_balance() as it moves tasks from runqueue | ||
310 | * to runqueue. | ||
311 | * | ||
312 | @@ -672,73 +563,81 @@ sched_info_switch(struct task_struct *pr | ||
313 | #define sched_info_switch(t, next) do { } while (0) | ||
314 | #endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */ | ||
315 | |||
316 | -/* | ||
317 | - * Adding/removing a task to/from a priority array: | ||
318 | - */ | ||
319 | -static void dequeue_task(struct task_struct *p, struct prio_array *array) | ||
320 | +#if BITS_PER_LONG < 64 | ||
321 | +static inline void longlimit(unsigned long long *longlong) | ||
322 | +{ | ||
323 | + if (*longlong > (1 << 31)) | ||
324 | + *longlong = 1 << 31; | ||
325 | +} | ||
326 | +#else | ||
327 | +static inline void longlimit(unsigned long long *__unused) | ||
328 | +{ | ||
329 | +} | ||
330 | +#endif | ||
331 | + | ||
332 | +/* Get nanosecond clock difference without overflowing unsigned long. */ | ||
333 | +static unsigned long ns_diff(unsigned long long v1, unsigned long long v2) | ||
334 | { | ||
335 | - array->nr_active--; | ||
336 | - list_del(&p->run_list); | ||
337 | - if (list_empty(array->queue + p->prio)) | ||
338 | - __clear_bit(p->prio, array->bitmap); | ||
339 | + unsigned long long vdiff; | ||
340 | + if (likely(v1 >= v2)) { | ||
341 | + vdiff = v1 - v2; | ||
342 | + longlimit(&vdiff); | ||
343 | + } else { | ||
344 | + /* | ||
345 | + * Rarely the clock appears to go backwards. There should | ||
346 | + * always be a positive difference so return 1. | ||
347 | + */ | ||
348 | + vdiff = 1; | ||
349 | + } | ||
350 | + return (unsigned long)vdiff; | ||
351 | } | ||
352 | |||
353 | -static void enqueue_task(struct task_struct *p, struct prio_array *array) | ||
354 | +static inline int task_queued(struct task_struct *task) | ||
355 | { | ||
356 | - sched_info_queued(p); | ||
357 | - list_add_tail(&p->run_list, array->queue + p->prio); | ||
358 | - __set_bit(p->prio, array->bitmap); | ||
359 | - array->nr_active++; | ||
360 | - p->array = array; | ||
361 | + return !list_empty(&task->run_list); | ||
362 | } | ||
363 | |||
364 | /* | ||
365 | - * Put task to the end of the run list without the overhead of dequeue | ||
366 | - * followed by enqueue. | ||
367 | + * Adding/removing a task to/from a runqueue: | ||
368 | */ | ||
369 | -static void requeue_task(struct task_struct *p, struct prio_array *array) | ||
370 | +static void dequeue_task(struct task_struct *p, struct rq *rq) | ||
371 | { | ||
372 | - list_move_tail(&p->run_list, array->queue + p->prio); | ||
373 | + list_del_init(&p->run_list); | ||
374 | + if (list_empty(rq->queue + p->prio)) | ||
375 | + __clear_bit(p->prio, rq->bitmap); | ||
376 | + p->ns_debit = 0; | ||
377 | } | ||
378 | |||
379 | -static inline void | ||
380 | -enqueue_task_head(struct task_struct *p, struct prio_array *array) | ||
381 | +static void enqueue_task(struct task_struct *p, struct rq *rq) | ||
382 | { | ||
383 | - list_add(&p->run_list, array->queue + p->prio); | ||
384 | - __set_bit(p->prio, array->bitmap); | ||
385 | - array->nr_active++; | ||
386 | - p->array = array; | ||
387 | + list_add_tail(&p->run_list, rq->queue + p->prio); | ||
388 | + __set_bit(p->prio, rq->bitmap); | ||
389 | } | ||
390 | |||
391 | /* | ||
392 | - * __normal_prio - return the priority that is based on the static | ||
393 | - * priority but is modified by bonuses/penalties. | ||
394 | - * | ||
395 | - * We scale the actual sleep average [0 .... MAX_SLEEP_AVG] | ||
396 | - * into the -5 ... 0 ... +5 bonus/penalty range. | ||
397 | - * | ||
398 | - * We use 25% of the full 0...39 priority range so that: | ||
399 | - * | ||
400 | - * 1) nice +19 interactive tasks do not preempt nice 0 CPU hogs. | ||
401 | - * 2) nice -20 CPU hogs do not get preempted by nice 0 tasks. | ||
402 | - * | ||
403 | - * Both properties are important to certain workloads. | ||
404 | + * Put task to the end of the run list without the overhead of dequeue | ||
405 | + * followed by enqueue. | ||
406 | */ | ||
407 | - | ||
408 | -static inline int __normal_prio(struct task_struct *p) | ||
409 | +static void requeue_task(struct task_struct *p, struct rq *rq, const int prio) | ||
410 | { | ||
411 | - int bonus, prio; | ||
412 | - | ||
413 | - bonus = CURRENT_BONUS(p) - MAX_BONUS / 2; | ||
414 | + list_move_tail(&p->run_list, rq->queue + prio); | ||
415 | + if (p->prio != prio) { | ||
416 | + if (list_empty(rq->queue + p->prio)) | ||
417 | + __clear_bit(p->prio, rq->bitmap); | ||
418 | + p->prio = prio; | ||
419 | + __set_bit(prio, rq->bitmap); | ||
420 | + } | ||
421 | + p->ns_debit = 0; | ||
422 | +} | ||
423 | |||
424 | - prio = p->static_prio - bonus; | ||
425 | - if (prio < MAX_RT_PRIO) | ||
426 | - prio = MAX_RT_PRIO; | ||
427 | - if (prio > MAX_PRIO-1) | ||
428 | - prio = MAX_PRIO-1; | ||
429 | - return prio; | ||
430 | +static inline void enqueue_task_head(struct task_struct *p, struct rq *rq) | ||
431 | +{ | ||
432 | + list_add(&p->run_list, rq->queue + p->prio); | ||
433 | + __set_bit(p->prio, rq->bitmap); | ||
434 | } | ||
435 | |||
436 | +static unsigned int slice(const struct task_struct *p); | ||
437 | + | ||
438 | /* | ||
439 | * To aid in avoiding the subversion of "niceness" due to uneven distribution | ||
440 | * of tasks with abnormal "nice" values across CPUs the contribution that | ||
441 | @@ -756,10 +655,9 @@ static inline int __normal_prio(struct t | ||
442 | #define TIME_SLICE_NICE_ZERO DEF_TIMESLICE | ||
443 | #define LOAD_WEIGHT(lp) \ | ||
444 | (((lp) * SCHED_LOAD_SCALE) / TIME_SLICE_NICE_ZERO) | ||
445 | -#define PRIO_TO_LOAD_WEIGHT(prio) \ | ||
446 | - LOAD_WEIGHT(static_prio_timeslice(prio)) | ||
447 | -#define RTPRIO_TO_LOAD_WEIGHT(rp) \ | ||
448 | - (PRIO_TO_LOAD_WEIGHT(MAX_RT_PRIO) + LOAD_WEIGHT(rp)) | ||
449 | +#define TASK_LOAD_WEIGHT(p) LOAD_WEIGHT(slice(p)) | ||
450 | +#define RTPRIO_TO_LOAD_WEIGHT(rp) \ | ||
451 | + (LOAD_WEIGHT((RR_INTERVAL + 20 + (rp)))) | ||
452 | |||
453 | static void set_load_weight(struct task_struct *p) | ||
454 | { | ||
455 | @@ -776,7 +674,7 @@ static void set_load_weight(struct task_ | ||
456 | #endif | ||
457 | p->load_weight = RTPRIO_TO_LOAD_WEIGHT(p->rt_priority); | ||
458 | } else | ||
459 | - p->load_weight = PRIO_TO_LOAD_WEIGHT(p->static_prio); | ||
460 | + p->load_weight = TASK_LOAD_WEIGHT(p); | ||
461 | } | ||
462 | |||
463 | static inline void | ||
464 | @@ -804,6 +702,182 @@ static inline void dec_nr_running(struct | ||
465 | } | ||
466 | |||
467 | /* | ||
468 | + * __activate_task - move a task to the runqueue. | ||
469 | + */ | ||
470 | +static inline void __activate_task(struct task_struct *p, struct rq *rq) | ||
471 | +{ | ||
472 | + enqueue_task(p, rq); | ||
473 | + inc_nr_running(p, rq); | ||
474 | +} | ||
475 | + | ||
476 | +/* | ||
477 | + * __activate_idle_task - move idle task to the _front_ of runqueue. | ||
478 | + */ | ||
479 | +static inline void __activate_idle_task(struct task_struct *p, struct rq *rq) | ||
480 | +{ | ||
481 | + enqueue_task_head(p, rq); | ||
482 | + inc_nr_running(p, rq); | ||
483 | +} | ||
484 | + | ||
485 | +/* | ||
486 | + * Bonus - How much higher than its base priority an interactive task can run. | ||
487 | + */ | ||
488 | +static inline unsigned int bonus(const struct task_struct *p) | ||
489 | +{ | ||
490 | + return TASK_USER_PRIO(p); | ||
491 | +} | ||
492 | + | ||
493 | +static unsigned int rr_interval(const struct task_struct *p) | ||
494 | +{ | ||
495 | + int nice = TASK_NICE(p); | ||
496 | + | ||
497 | + if (nice < 0 && !rt_task(p)) | ||
498 | + return RR_INTERVAL * (20 - nice) / 20; | ||
499 | + return RR_INTERVAL; | ||
500 | +} | ||
501 | + | ||
502 | +/* | ||
503 | + * slice - the duration a task runs before getting requeued at its best | ||
504 | + * priority and has its bonus decremented. | ||
505 | + */ | ||
506 | +static unsigned int slice(const struct task_struct *p) | ||
507 | +{ | ||
508 | + unsigned int slice, rr; | ||
509 | + | ||
510 | + slice = rr = rr_interval(p); | ||
511 | + if (likely(!rt_task(p))) | ||
512 | + slice += (39 - TASK_USER_PRIO(p)) * rr; | ||
513 | + return slice; | ||
514 | +} | ||
515 | + | ||
516 | +/* | ||
517 | + * We increase our bonus by sleeping more than the time we ran. | ||
518 | + * The ratio of sleep to run gives us the cpu% that we last ran and determines | ||
519 | + * the maximum bonus we can acquire. | ||
520 | + */ | ||
521 | +static void inc_bonus(struct task_struct *p, unsigned long totalrun, unsigned long sleep) | ||
522 | +{ | ||
523 | + unsigned int best_bonus = sleep / (totalrun + 1); | ||
524 | + | ||
525 | + if (p->bonus >= best_bonus) | ||
526 | + return; | ||
527 | + best_bonus = bonus(p); | ||
528 | + if (p->bonus < best_bonus) | ||
529 | + p->bonus++; | ||
530 | +} | ||
531 | + | ||
532 | +static inline void dec_bonus(struct task_struct *p) | ||
533 | +{ | ||
534 | + if (p->bonus) | ||
535 | + p->bonus--; | ||
536 | +} | ||
537 | + | ||
538 | +static inline void slice_overrun(struct task_struct *p) | ||
539 | +{ | ||
540 | + unsigned long ns_slice = JIFFIES_TO_NS(p->slice); | ||
541 | + | ||
542 | + do { | ||
543 | + p->totalrun -= ns_slice; | ||
544 | + dec_bonus(p); | ||
545 | + } while (unlikely(p->totalrun > ns_slice)); | ||
546 | +} | ||
547 | + | ||
548 | +static inline void continue_slice(struct task_struct *p) | ||
549 | +{ | ||
550 | + unsigned long total_run = NS_TO_JIFFIES(p->totalrun); | ||
551 | + | ||
552 | + if (unlikely(total_run >= p->slice)) | ||
553 | + slice_overrun(p); | ||
554 | + else { | ||
555 | + unsigned long remainder; | ||
556 | + | ||
557 | + p->slice -= total_run; | ||
558 | + remainder = p->slice % rr_interval(p); | ||
559 | + if (remainder) | ||
560 | + p->time_slice = remainder; | ||
561 | + } | ||
562 | +} | ||
563 | + | ||
564 | +/* | ||
565 | + * recalc_task_prio - this checks for tasks that have run less than a full | ||
566 | + * slice and have woken up again soon after, or have just forked a | ||
567 | + * thread/process and make them continue their old slice instead of starting | ||
568 | + * a new one at high priority. | ||
569 | + */ | ||
570 | +static inline void recalc_task_prio(struct task_struct *p, const unsigned long long now) | ||
571 | +{ | ||
572 | + unsigned long sleep_time; | ||
573 | + | ||
574 | + /* | ||
575 | + * If this task has managed to run to its lowest priority then | ||
576 | + * decrease its bonus and requeue it now at best priority instead | ||
577 | + * of possibly flagging around lowest priority. Save up any systime | ||
578 | + * that may affect priority on the next reschedule. | ||
579 | + */ | ||
580 | + if (p->slice > p->time_slice && | ||
581 | + p->slice - NS_TO_JIFFIES(p->totalrun) < p->time_slice) { | ||
582 | + dec_bonus(p); | ||
583 | + p->totalrun = 0; | ||
584 | + return; | ||
585 | + } | ||
586 | + | ||
587 | + /* | ||
588 | + * Add the total for this last scheduled run (p->runtime) and system | ||
589 | + * time (p->systime) done on behalf of p to the running total so far | ||
590 | + * used (p->totalrun). | ||
591 | + */ | ||
592 | + p->totalrun += p->runtime + p->systime; | ||
593 | + sleep_time = ns_diff(now, p->timestamp); | ||
594 | + | ||
595 | + if (p->systime > sleep_time || p->flags & PF_FORKED) | ||
596 | + sleep_time = 0; | ||
597 | + else { | ||
598 | + sleep_time -= p->systime; | ||
599 | + /* | ||
600 | + * We elevate priority by the amount of time we slept. If we | ||
601 | + * sleep longer than our running total and have not set the | ||
602 | + * PF_NONSLEEP flag we gain a bonus. | ||
603 | + */ | ||
604 | + if (sleep_time >= p->totalrun) { | ||
605 | + if (!(p->flags & PF_NONSLEEP)) | ||
606 | + inc_bonus(p, p->totalrun, sleep_time); | ||
607 | + p->totalrun = 0; | ||
608 | + return; | ||
609 | + } | ||
610 | + p->totalrun -= sleep_time; | ||
611 | + } | ||
612 | + continue_slice(p); | ||
613 | +} | ||
614 | + | ||
615 | +/* | ||
616 | + * __normal_prio - dynamic priority dependent on bonus. | ||
617 | + * The priority normally decreases by one each RR_INTERVAL. | ||
618 | + * As the bonus increases the initial priority starts at a higher "stair" or | ||
619 | + * priority for longer. | ||
620 | + */ | ||
621 | +static inline int __normal_prio(struct task_struct *p) | ||
622 | +{ | ||
623 | + int prio; | ||
624 | + unsigned int full_slice, used_slice = 0; | ||
625 | + unsigned int best_bonus, rr; | ||
626 | + | ||
627 | + full_slice = slice(p); | ||
628 | + if (full_slice > p->slice) | ||
629 | + used_slice = full_slice - p->slice; | ||
630 | + | ||
631 | + best_bonus = bonus(p); | ||
632 | + prio = MAX_RT_PRIO + best_bonus; | ||
633 | + if (!batch_task(p)) | ||
634 | + prio -= p->bonus; | ||
635 | + | ||
636 | + rr = rr_interval(p); | ||
637 | + prio += used_slice / rr; | ||
638 | + if (prio > MIN_USER_PRIO) | ||
639 | + prio = MIN_USER_PRIO; | ||
640 | + return prio; | ||
641 | +} | ||
642 | + | ||
643 | +/* | ||
644 | * Calculate the expected normal priority: i.e. priority | ||
645 | * without taking RT-inheritance into account. Might be | ||
646 | * boosted by interactivity modifiers. Changes upon fork, | ||
647 | @@ -842,111 +916,14 @@ static int effective_prio(struct task_st | ||
648 | } | ||
649 | |||
650 | /* | ||
651 | - * __activate_task - move a task to the runqueue. | ||
652 | - */ | ||
653 | -static void __activate_task(struct task_struct *p, struct rq *rq) | ||
654 | -{ | ||
655 | - struct prio_array *target = rq->active; | ||
656 | - | ||
657 | - if (batch_task(p)) | ||
658 | - target = rq->expired; | ||
659 | - enqueue_task(p, target); | ||
660 | - inc_nr_running(p, rq); | ||
661 | -} | ||
662 | - | ||
663 | -/* | ||
664 | - * __activate_idle_task - move idle task to the _front_ of runqueue. | ||
665 | - */ | ||
666 | -static inline void __activate_idle_task(struct task_struct *p, struct rq *rq) | ||
667 | -{ | ||
668 | - enqueue_task_head(p, rq->active); | ||
669 | - inc_nr_running(p, rq); | ||
670 | -} | ||
671 | - | ||
672 | -/* | ||
673 | - * Recalculate p->normal_prio and p->prio after having slept, | ||
674 | - * updating the sleep-average too: | ||
675 | - */ | ||
676 | -static int recalc_task_prio(struct task_struct *p, unsigned long long now) | ||
677 | -{ | ||
678 | - /* Caller must always ensure 'now >= p->timestamp' */ | ||
679 | - unsigned long sleep_time = now - p->timestamp; | ||
680 | - | ||
681 | - if (batch_task(p)) | ||
682 | - sleep_time = 0; | ||
683 | - | ||
684 | - if (likely(sleep_time > 0)) { | ||
685 | - /* | ||
686 | - * This ceiling is set to the lowest priority that would allow | ||
687 | - * a task to be reinserted into the active array on timeslice | ||
688 | - * completion. | ||
689 | - */ | ||
690 | - unsigned long ceiling = INTERACTIVE_SLEEP(p); | ||
691 | - | ||
692 | - if (p->mm && sleep_time > ceiling && p->sleep_avg < ceiling) { | ||
693 | - /* | ||
694 | - * Prevents user tasks from achieving best priority | ||
695 | - * with one single large enough sleep. | ||
696 | - */ | ||
697 | - p->sleep_avg = ceiling; | ||
698 | - /* | ||
699 | - * Using INTERACTIVE_SLEEP() as a ceiling places a | ||
700 | - * nice(0) task 1ms sleep away from promotion, and | ||
701 | - * gives it 700ms to round-robin with no chance of | ||
702 | - * being demoted. This is more than generous, so | ||
703 | - * mark this sleep as non-interactive to prevent the | ||
704 | - * on-runqueue bonus logic from intervening should | ||
705 | - * this task not receive cpu immediately. | ||
706 | - */ | ||
707 | - p->sleep_type = SLEEP_NONINTERACTIVE; | ||
708 | - } else { | ||
709 | - /* | ||
710 | - * Tasks waking from uninterruptible sleep are | ||
711 | - * limited in their sleep_avg rise as they | ||
712 | - * are likely to be waiting on I/O | ||
713 | - */ | ||
714 | - if (p->sleep_type == SLEEP_NONINTERACTIVE && p->mm) { | ||
715 | - if (p->sleep_avg >= ceiling) | ||
716 | - sleep_time = 0; | ||
717 | - else if (p->sleep_avg + sleep_time >= | ||
718 | - ceiling) { | ||
719 | - p->sleep_avg = ceiling; | ||
720 | - sleep_time = 0; | ||
721 | - } | ||
722 | - } | ||
723 | - | ||
724 | - /* | ||
725 | - * This code gives a bonus to interactive tasks. | ||
726 | - * | ||
727 | - * The boost works by updating the 'average sleep time' | ||
728 | - * value here, based on ->timestamp. The more time a | ||
729 | - * task spends sleeping, the higher the average gets - | ||
730 | - * and the higher the priority boost gets as well. | ||
731 | - */ | ||
732 | - p->sleep_avg += sleep_time; | ||
733 | - | ||
734 | - } | ||
735 | - if (p->sleep_avg > NS_MAX_SLEEP_AVG) | ||
736 | - p->sleep_avg = NS_MAX_SLEEP_AVG; | ||
737 | - } | ||
738 | - | ||
739 | - return effective_prio(p); | ||
740 | -} | ||
741 | - | ||
742 | -/* | ||
743 | * activate_task - move a task to the runqueue and do priority recalculation | ||
744 | * | ||
745 | - * Update all the scheduling statistics stuff. (sleep average | ||
746 | - * calculation, priority modifiers, etc.) | ||
747 | */ | ||
748 | static void activate_task(struct task_struct *p, struct rq *rq, int local) | ||
749 | { | ||
750 | - unsigned long long now; | ||
751 | - | ||
752 | - if (rt_task(p)) | ||
753 | - goto out; | ||
754 | + unsigned long long now = sched_clock(); | ||
755 | + unsigned long rr = rr_interval(p); | ||
756 | |||
757 | - now = sched_clock(); | ||
758 | #ifdef CONFIG_SMP | ||
759 | if (!local) { | ||
760 | /* Compensate for drifting sched_clock */ | ||
761 | @@ -967,32 +944,15 @@ static void activate_task(struct task_st | ||
762 | (now - p->timestamp) >> 20); | ||
763 | } | ||
764 | |||
765 | - p->prio = recalc_task_prio(p, now); | ||
766 | - | ||
767 | - /* | ||
768 | - * This checks to make sure it's not an uninterruptible task | ||
769 | - * that is now waking up. | ||
770 | - */ | ||
771 | - if (p->sleep_type == SLEEP_NORMAL) { | ||
772 | - /* | ||
773 | - * Tasks which were woken up by interrupts (ie. hw events) | ||
774 | - * are most likely of interactive nature. So we give them | ||
775 | - * the credit of extending their sleep time to the period | ||
776 | - * of time they spend on the runqueue, waiting for execution | ||
777 | - * on a CPU, first time around: | ||
778 | - */ | ||
779 | - if (in_interrupt()) | ||
780 | - p->sleep_type = SLEEP_INTERRUPTED; | ||
781 | - else { | ||
782 | - /* | ||
783 | - * Normal first-time wakeups get a credit too for | ||
784 | - * on-runqueue time, but it will be weighted down: | ||
785 | - */ | ||
786 | - p->sleep_type = SLEEP_INTERACTIVE; | ||
787 | - } | ||
788 | + p->slice = slice(p); | ||
789 | + p->time_slice = p->slice % rr ? : rr; | ||
790 | + if (!rt_task(p)) { | ||
791 | + recalc_task_prio(p, now); | ||
792 | + p->prio = effective_prio(p); | ||
793 | + p->systime = 0; | ||
794 | + p->flags &= ~(PF_FORKED | PF_NONSLEEP); | ||
795 | } | ||
796 | p->timestamp = now; | ||
797 | -out: | ||
798 | __activate_task(p, rq); | ||
799 | } | ||
800 | |||
801 | @@ -1002,8 +962,7 @@ out: | ||
802 | static void deactivate_task(struct task_struct *p, struct rq *rq) | ||
803 | { | ||
804 | dec_nr_running(p, rq); | ||
805 | - dequeue_task(p, p->array); | ||
806 | - p->array = NULL; | ||
807 | + dequeue_task(p, rq); | ||
808 | } | ||
809 | |||
810 | /* | ||
811 | @@ -1085,7 +1044,7 @@ migrate_task(struct task_struct *p, int | ||
812 | * If the task is not on a runqueue (and not running), then | ||
813 | * it is sufficient to simply update the task's cpu field. | ||
814 | */ | ||
815 | - if (!p->array && !task_running(rq, p)) { | ||
816 | + if (!task_queued(p) && !task_running(rq, p)) { | ||
817 | set_task_cpu(p, dest_cpu); | ||
818 | return 0; | ||
819 | } | ||
820 | @@ -1116,7 +1075,7 @@ void wait_task_inactive(struct task_stru | ||
821 | repeat: | ||
822 | rq = task_rq_lock(p, &flags); | ||
823 | /* Must be off runqueue entirely, not preempted. */ | ||
824 | - if (unlikely(p->array || task_running(rq, p))) { | ||
825 | + if (unlikely(task_queued(p) || task_running(rq, p))) { | ||
826 | /* If it's preempted, we yield. It could be a while. */ | ||
827 | preempted = !task_running(rq, p); | ||
828 | task_rq_unlock(rq, &flags); | ||
829 | @@ -1381,6 +1340,16 @@ static inline int wake_idle(int cpu, str | ||
830 | } | ||
831 | #endif | ||
832 | |||
833 | +/* | ||
834 | + * Check to see if p preempts rq->curr and resched if it does. | ||
835 | + */ | ||
836 | +static inline void preempt(const struct task_struct *p, struct rq *rq) | ||
837 | +{ | ||
838 | + if (TASK_PREEMPTS_CURR(p, rq)) | ||
839 | + resched_task(rq->curr); | ||
840 | +} | ||
841 | + | ||
842 | + | ||
843 | /*** | ||
844 | * try_to_wake_up - wake up a thread | ||
845 | * @p: the to-be-woken-up thread | ||
846 | @@ -1412,7 +1381,7 @@ static int try_to_wake_up(struct task_st | ||
847 | if (!(old_state & state)) | ||
848 | goto out; | ||
849 | |||
850 | - if (p->array) | ||
851 | + if (task_queued(p)) | ||
852 | goto out_running; | ||
853 | |||
854 | cpu = task_cpu(p); | ||
855 | @@ -1505,7 +1474,7 @@ out_set_cpu: | ||
856 | old_state = p->state; | ||
857 | if (!(old_state & state)) | ||
858 | goto out; | ||
859 | - if (p->array) | ||
860 | + if (task_queued(p)) | ||
861 | goto out_running; | ||
862 | |||
863 | this_cpu = smp_processor_id(); | ||
864 | @@ -1514,25 +1483,9 @@ out_set_cpu: | ||
865 | |||
866 | out_activate: | ||
867 | #endif /* CONFIG_SMP */ | ||
868 | - if (old_state == TASK_UNINTERRUPTIBLE) { | ||
869 | + if (old_state == TASK_UNINTERRUPTIBLE) | ||
870 | rq->nr_uninterruptible--; | ||
871 | - /* | ||
872 | - * Tasks on involuntary sleep don't earn | ||
873 | - * sleep_avg beyond just interactive state. | ||
874 | - */ | ||
875 | - p->sleep_type = SLEEP_NONINTERACTIVE; | ||
876 | - } else | ||
877 | - | ||
878 | - /* | ||
879 | - * Tasks that have marked their sleep as noninteractive get | ||
880 | - * woken up with their sleep average not weighted in an | ||
881 | - * interactive way. | ||
882 | - */ | ||
883 | - if (old_state & TASK_NONINTERACTIVE) | ||
884 | - p->sleep_type = SLEEP_NONINTERACTIVE; | ||
885 | - | ||
886 | |||
887 | - activate_task(p, rq, cpu == this_cpu); | ||
888 | /* | ||
889 | * Sync wakeups (i.e. those types of wakeups where the waker | ||
890 | * has indicated that it will leave the CPU in short order) | ||
891 | @@ -1541,10 +1494,9 @@ out_activate: | ||
892 | * the waker guarantees that the freshly woken up task is going | ||
893 | * to be considered on this CPU.) | ||
894 | */ | ||
895 | - if (!sync || cpu != this_cpu) { | ||
896 | - if (TASK_PREEMPTS_CURR(p, rq)) | ||
897 | - resched_task(rq->curr); | ||
898 | - } | ||
899 | + activate_task(p, rq, cpu == this_cpu); | ||
900 | + if (!sync || cpu != this_cpu) | ||
901 | + preempt(p, rq); | ||
902 | success = 1; | ||
903 | |||
904 | out_running: | ||
905 | @@ -1595,7 +1547,6 @@ void fastcall sched_fork(struct task_str | ||
906 | p->prio = current->normal_prio; | ||
907 | |||
908 | INIT_LIST_HEAD(&p->run_list); | ||
909 | - p->array = NULL; | ||
910 | #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) | ||
911 | if (unlikely(sched_info_on())) | ||
912 | memset(&p->sched_info, 0, sizeof(p->sched_info)); | ||
913 | @@ -1607,30 +1558,6 @@ void fastcall sched_fork(struct task_str | ||
914 | /* Want to start with kernel preemption disabled. */ | ||
915 | task_thread_info(p)->preempt_count = 1; | ||
916 | #endif | ||
917 | - /* | ||
918 | - * Share the timeslice between parent and child, thus the | ||
919 | - * total amount of pending timeslices in the system doesn't change, | ||
920 | - * resulting in more scheduling fairness. | ||
921 | - */ | ||
922 | - local_irq_disable(); | ||
923 | - p->time_slice = (current->time_slice + 1) >> 1; | ||
924 | - /* | ||
925 | - * The remainder of the first timeslice might be recovered by | ||
926 | - * the parent if the child exits early enough. | ||
927 | - */ | ||
928 | - p->first_time_slice = 1; | ||
929 | - current->time_slice >>= 1; | ||
930 | - p->timestamp = sched_clock(); | ||
931 | - if (unlikely(!current->time_slice)) { | ||
932 | - /* | ||
933 | - * This case is rare, it happens when the parent has only | ||
934 | - * a single jiffy left from its timeslice. Taking the | ||
935 | - * runqueue lock is not a problem. | ||
936 | - */ | ||
937 | - current->time_slice = 1; | ||
938 | - task_running_tick(cpu_rq(cpu), current); | ||
939 | - } | ||
940 | - local_irq_enable(); | ||
941 | put_cpu(); | ||
942 | } | ||
943 | |||
944 | @@ -1652,38 +1579,20 @@ void fastcall wake_up_new_task(struct ta | ||
945 | this_cpu = smp_processor_id(); | ||
946 | cpu = task_cpu(p); | ||
947 | |||
948 | - /* | ||
949 | - * We decrease the sleep average of forking parents | ||
950 | - * and children as well, to keep max-interactive tasks | ||
951 | - * from forking tasks that are max-interactive. The parent | ||
952 | - * (current) is done further down, under its lock. | ||
953 | - */ | ||
954 | - p->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(p) * | ||
955 | - CHILD_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS); | ||
956 | - | ||
957 | - p->prio = effective_prio(p); | ||
958 | + /* Forked process gets no bonus to prevent fork bombs. */ | ||
959 | + p->bonus = 0; | ||
960 | + current->flags |= PF_FORKED; | ||
961 | |||
962 | if (likely(cpu == this_cpu)) { | ||
963 | + activate_task(p, rq, 1); | ||
964 | if (!(clone_flags & CLONE_VM)) { | ||
965 | /* | ||
966 | * The VM isn't cloned, so we're in a good position to | ||
967 | * do child-runs-first in anticipation of an exec. This | ||
968 | * usually avoids a lot of COW overhead. | ||
969 | */ | ||
970 | - if (unlikely(!current->array)) | ||
971 | - __activate_task(p, rq); | ||
972 | - else { | ||
973 | - p->prio = current->prio; | ||
974 | - p->normal_prio = current->normal_prio; | ||
975 | - list_add_tail(&p->run_list, ¤t->run_list); | ||
976 | - p->array = current->array; | ||
977 | - p->array->nr_active++; | ||
978 | - inc_nr_running(p, rq); | ||
979 | - } | ||
980 | set_need_resched(); | ||
981 | - } else | ||
982 | - /* Run child last */ | ||
983 | - __activate_task(p, rq); | ||
984 | + } | ||
985 | /* | ||
986 | * We skip the following code due to cpu == this_cpu | ||
987 | * | ||
988 | @@ -1700,53 +1609,19 @@ void fastcall wake_up_new_task(struct ta | ||
989 | */ | ||
990 | p->timestamp = (p->timestamp - this_rq->most_recent_timestamp) | ||
991 | + rq->most_recent_timestamp; | ||
992 | - __activate_task(p, rq); | ||
993 | - if (TASK_PREEMPTS_CURR(p, rq)) | ||
994 | - resched_task(rq->curr); | ||
995 | + activate_task(p, rq, 0); | ||
996 | + preempt(p, rq); | ||
997 | |||
998 | /* | ||
999 | * Parent and child are on different CPUs, now get the | ||
1000 | - * parent runqueue to update the parent's ->sleep_avg: | ||
1001 | + * parent runqueue to update the parent's ->flags: | ||
1002 | */ | ||
1003 | task_rq_unlock(rq, &flags); | ||
1004 | this_rq = task_rq_lock(current, &flags); | ||
1005 | } | ||
1006 | - current->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(current) * | ||
1007 | - PARENT_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS); | ||
1008 | task_rq_unlock(this_rq, &flags); | ||
1009 | } | ||
1010 | |||
1011 | -/* | ||
1012 | - * Potentially available exiting-child timeslices are | ||
1013 | - * retrieved here - this way the parent does not get | ||
1014 | - * penalized for creating too many threads. | ||
1015 | - * | ||
1016 | - * (this cannot be used to 'generate' timeslices | ||
1017 | - * artificially, because any timeslice recovered here | ||
1018 | - * was given away by the parent in the first place.) | ||
1019 | - */ | ||
1020 | -void fastcall sched_exit(struct task_struct *p) | ||
1021 | -{ | ||
1022 | - unsigned long flags; | ||
1023 | - struct rq *rq; | ||
1024 | - | ||
1025 | - /* | ||
1026 | - * If the child was a (relative-) CPU hog then decrease | ||
1027 | - * the sleep_avg of the parent as well. | ||
1028 | - */ | ||
1029 | - rq = task_rq_lock(p->parent, &flags); | ||
1030 | - if (p->first_time_slice && task_cpu(p) == task_cpu(p->parent)) { | ||
1031 | - p->parent->time_slice += p->time_slice; | ||
1032 | - if (unlikely(p->parent->time_slice > task_timeslice(p))) | ||
1033 | - p->parent->time_slice = task_timeslice(p); | ||
1034 | - } | ||
1035 | - if (p->sleep_avg < p->parent->sleep_avg) | ||
1036 | - p->parent->sleep_avg = p->parent->sleep_avg / | ||
1037 | - (EXIT_WEIGHT + 1) * EXIT_WEIGHT + p->sleep_avg / | ||
1038 | - (EXIT_WEIGHT + 1); | ||
1039 | - task_rq_unlock(rq, &flags); | ||
1040 | -} | ||
1041 | - | ||
1042 | /** | ||
1043 | * prepare_task_switch - prepare to switch tasks | ||
1044 | * @rq: the runqueue preparing to switch | ||
1045 | @@ -2068,23 +1943,21 @@ void sched_exec(void) | ||
1046 | * pull_task - move a task from a remote runqueue to the local runqueue. | ||
1047 | * Both runqueues must be locked. | ||
1048 | */ | ||
1049 | -static void pull_task(struct rq *src_rq, struct prio_array *src_array, | ||
1050 | - struct task_struct *p, struct rq *this_rq, | ||
1051 | - struct prio_array *this_array, int this_cpu) | ||
1052 | +static void pull_task(struct rq *src_rq, struct task_struct *p, | ||
1053 | + struct rq *this_rq, int this_cpu) | ||
1054 | { | ||
1055 | - dequeue_task(p, src_array); | ||
1056 | + dequeue_task(p, src_rq); | ||
1057 | dec_nr_running(p, src_rq); | ||
1058 | set_task_cpu(p, this_cpu); | ||
1059 | inc_nr_running(p, this_rq); | ||
1060 | - enqueue_task(p, this_array); | ||
1061 | + enqueue_task(p, this_rq); | ||
1062 | p->timestamp = (p->timestamp - src_rq->most_recent_timestamp) | ||
1063 | + this_rq->most_recent_timestamp; | ||
1064 | /* | ||
1065 | * Note that idle threads have a prio of MAX_PRIO, for this test | ||
1066 | * to be always true for them. | ||
1067 | */ | ||
1068 | - if (TASK_PREEMPTS_CURR(p, this_rq)) | ||
1069 | - resched_task(this_rq->curr); | ||
1070 | + preempt(p, this_rq); | ||
1071 | } | ||
1072 | |||
1073 | /* | ||
1074 | @@ -2127,8 +2000,6 @@ int can_migrate_task(struct task_struct | ||
1075 | return 1; | ||
1076 | } | ||
1077 | |||
1078 | -#define rq_best_prio(rq) min((rq)->curr->prio, (rq)->best_expired_prio) | ||
1079 | - | ||
1080 | /* | ||
1081 | * move_tasks tries to move up to max_nr_move tasks and max_load_move weighted | ||
1082 | * load from busiest to this_rq, as part of a balancing operation within | ||
1083 | @@ -2143,7 +2014,6 @@ static int move_tasks(struct rq *this_rq | ||
1084 | { | ||
1085 | int idx, pulled = 0, pinned = 0, this_best_prio, best_prio, | ||
1086 | best_prio_seen, skip_for_load; | ||
1087 | - struct prio_array *array, *dst_array; | ||
1088 | struct list_head *head, *curr; | ||
1089 | struct task_struct *tmp; | ||
1090 | long rem_load_move; | ||
1091 | @@ -2153,8 +2023,8 @@ static int move_tasks(struct rq *this_rq | ||
1092 | |||
1093 | rem_load_move = max_load_move; | ||
1094 | pinned = 1; | ||
1095 | - this_best_prio = rq_best_prio(this_rq); | ||
1096 | - best_prio = rq_best_prio(busiest); | ||
1097 | + this_best_prio = this_rq->curr->prio; | ||
1098 | + best_prio = busiest->curr->prio; | ||
1099 | /* | ||
1100 | * Enable handling of the case where there is more than one task | ||
1101 | * with the best priority. If the current running task is one | ||
1102 | @@ -2164,38 +2034,17 @@ static int move_tasks(struct rq *this_rq | ||
1103 | */ | ||
1104 | best_prio_seen = best_prio == busiest->curr->prio; | ||
1105 | |||
1106 | - /* | ||
1107 | - * We first consider expired tasks. Those will likely not be | ||
1108 | - * executed in the near future, and they are most likely to | ||
1109 | - * be cache-cold, thus switching CPUs has the least effect | ||
1110 | - * on them. | ||
1111 | - */ | ||
1112 | - if (busiest->expired->nr_active) { | ||
1113 | - array = busiest->expired; | ||
1114 | - dst_array = this_rq->expired; | ||
1115 | - } else { | ||
1116 | - array = busiest->active; | ||
1117 | - dst_array = this_rq->active; | ||
1118 | - } | ||
1119 | - | ||
1120 | -new_array: | ||
1121 | /* Start searching at priority 0: */ | ||
1122 | idx = 0; | ||
1123 | skip_bitmap: | ||
1124 | if (!idx) | ||
1125 | - idx = sched_find_first_bit(array->bitmap); | ||
1126 | + idx = sched_find_first_bit(busiest->bitmap); | ||
1127 | else | ||
1128 | - idx = find_next_bit(array->bitmap, MAX_PRIO, idx); | ||
1129 | - if (idx >= MAX_PRIO) { | ||
1130 | - if (array == busiest->expired && busiest->active->nr_active) { | ||
1131 | - array = busiest->active; | ||
1132 | - dst_array = this_rq->active; | ||
1133 | - goto new_array; | ||
1134 | - } | ||
1135 | + idx = find_next_bit(busiest->bitmap, MAX_PRIO, idx); | ||
1136 | + if (idx >= MAX_PRIO) | ||
1137 | goto out; | ||
1138 | - } | ||
1139 | |||
1140 | - head = array->queue + idx; | ||
1141 | + head = busiest->queue + idx; | ||
1142 | curr = head->prev; | ||
1143 | skip_queue: | ||
1144 | tmp = list_entry(curr, struct task_struct, run_list); | ||
1145 | @@ -2220,7 +2069,7 @@ skip_queue: | ||
1146 | goto skip_bitmap; | ||
1147 | } | ||
1148 | |||
1149 | - pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu); | ||
1150 | + pull_task(busiest, tmp, this_rq, this_cpu); | ||
1151 | pulled++; | ||
1152 | rem_load_move -= tmp->load_weight; | ||
1153 | |||
1154 | @@ -3036,27 +2885,6 @@ unsigned long long current_sched_time(co | ||
1155 | } | ||
1156 | |||
1157 | /* | ||
1158 | - * We place interactive tasks back into the active array, if possible. | ||
1159 | - * | ||
1160 | - * To guarantee that this does not starve expired tasks we ignore the | ||
1161 | - * interactivity of a task if the first expired task had to wait more | ||
1162 | - * than a 'reasonable' amount of time. This deadline timeout is | ||
1163 | - * load-dependent, as the frequency of array switched decreases with | ||
1164 | - * increasing number of running tasks. We also ignore the interactivity | ||
1165 | - * if a better static_prio task has expired: | ||
1166 | - */ | ||
1167 | -static inline int expired_starving(struct rq *rq) | ||
1168 | -{ | ||
1169 | - if (rq->curr->static_prio > rq->best_expired_prio) | ||
1170 | - return 1; | ||
1171 | - if (!STARVATION_LIMIT || !rq->expired_timestamp) | ||
1172 | - return 0; | ||
1173 | - if (jiffies - rq->expired_timestamp > STARVATION_LIMIT * rq->nr_running) | ||
1174 | - return 1; | ||
1175 | - return 0; | ||
1176 | -} | ||
1177 | - | ||
1178 | -/* | ||
1179 | * Account user cpu time to a process. | ||
1180 | * @p: the process that the cpu time gets accounted to | ||
1181 | * @hardirq_offset: the offset to subtract from hardirq_count() | ||
1182 | @@ -3104,6 +2932,7 @@ void account_system_time(struct task_str | ||
1183 | cpustat->iowait = cputime64_add(cpustat->iowait, tmp); | ||
1184 | else | ||
1185 | cpustat->idle = cputime64_add(cpustat->idle, tmp); | ||
1186 | + p->systime += NSJIFFY; | ||
1187 | /* Account for system time used */ | ||
1188 | acct_update_integrals(p); | ||
1189 | } | ||
1190 | @@ -3129,76 +2958,49 @@ void account_steal_time(struct task_stru | ||
1191 | cpustat->steal = cputime64_add(cpustat->steal, tmp); | ||
1192 | } | ||
1193 | |||
1194 | +static void time_slice_expired(struct task_struct *p, struct rq *rq) | ||
1195 | +{ | ||
1196 | + set_tsk_need_resched(p); | ||
1197 | + p->time_slice = rr_interval(p); | ||
1198 | + requeue_task(p, rq, effective_prio(p)); | ||
1199 | +} | ||
1200 | + | ||
1201 | static void task_running_tick(struct rq *rq, struct task_struct *p) | ||
1202 | { | ||
1203 | - if (p->array != rq->active) { | ||
1204 | + unsigned long debit; | ||
1205 | + | ||
1206 | + if (unlikely(!task_queued(p))) { | ||
1207 | /* Task has expired but was not scheduled yet */ | ||
1208 | set_tsk_need_resched(p); | ||
1209 | return; | ||
1210 | } | ||
1211 | + /* SCHED_FIFO tasks never run out of timeslice. */ | ||
1212 | + if (unlikely(p->policy == SCHED_FIFO)) | ||
1213 | + return; | ||
1214 | + | ||
1215 | spin_lock(&rq->lock); | ||
1216 | + debit = ns_diff(rq->most_recent_timestamp, p->timestamp); | ||
1217 | + p->ns_debit += debit; | ||
1218 | + if (p->ns_debit < NSJIFFY) | ||
1219 | + goto out_unlock; | ||
1220 | + p->ns_debit %= NSJIFFY; | ||
1221 | /* | ||
1222 | - * The task was running during this tick - update the | ||
1223 | - * time slice counter. Note: we do not update a thread's | ||
1224 | - * priority until it either goes to sleep or uses up its | ||
1225 | - * timeslice. This makes it possible for interactive tasks | ||
1226 | - * to use up their timeslices at their highest priority levels. | ||
1227 | + * Tasks lose bonus each time they use up a full slice(). | ||
1228 | */ | ||
1229 | - if (rt_task(p)) { | ||
1230 | - /* | ||
1231 | - * RR tasks need a special form of timeslice management. | ||
1232 | - * FIFO tasks have no timeslices. | ||
1233 | - */ | ||
1234 | - if ((p->policy == SCHED_RR) && !--p->time_slice) { | ||
1235 | - p->time_slice = task_timeslice(p); | ||
1236 | - p->first_time_slice = 0; | ||
1237 | - set_tsk_need_resched(p); | ||
1238 | - | ||
1239 | - /* put it at the end of the queue: */ | ||
1240 | - requeue_task(p, rq->active); | ||
1241 | - } | ||
1242 | + if (!--p->slice) { | ||
1243 | + dec_bonus(p); | ||
1244 | + p->totalrun = 0; | ||
1245 | + p->slice = slice(p); | ||
1246 | + time_slice_expired(p, rq); | ||
1247 | goto out_unlock; | ||
1248 | } | ||
1249 | + /* | ||
1250 | + * Tasks that run out of time_slice but still have slice left get | ||
1251 | + * requeued with a lower priority && RR_INTERVAL time_slice. | ||
1252 | + */ | ||
1253 | if (!--p->time_slice) { | ||
1254 | - dequeue_task(p, rq->active); | ||
1255 | - set_tsk_need_resched(p); | ||
1256 | - p->prio = effective_prio(p); | ||
1257 | - p->time_slice = task_timeslice(p); | ||
1258 | - p->first_time_slice = 0; | ||
1259 | - | ||
1260 | - if (!rq->expired_timestamp) | ||
1261 | - rq->expired_timestamp = jiffies; | ||
1262 | - if (!TASK_INTERACTIVE(p) || expired_starving(rq)) { | ||
1263 | - enqueue_task(p, rq->expired); | ||
1264 | - if (p->static_prio < rq->best_expired_prio) | ||
1265 | - rq->best_expired_prio = p->static_prio; | ||
1266 | - } else | ||
1267 | - enqueue_task(p, rq->active); | ||
1268 | - } else { | ||
1269 | - /* | ||
1270 | - * Prevent a too long timeslice allowing a task to monopolize | ||
1271 | - * the CPU. We do this by splitting up the timeslice into | ||
1272 | - * smaller pieces. | ||
1273 | - * | ||
1274 | - * Note: this does not mean the task's timeslices expire or | ||
1275 | - * get lost in any way, they just might be preempted by | ||
1276 | - * another task of equal priority. (one with higher | ||
1277 | - * priority would have preempted this task already.) We | ||
1278 | - * requeue this task to the end of the list on this priority | ||
1279 | - * level, which is in essence a round-robin of tasks with | ||
1280 | - * equal priority. | ||
1281 | - * | ||
1282 | - * This only applies to tasks in the interactive | ||
1283 | - * delta range with at least TIMESLICE_GRANULARITY to requeue. | ||
1284 | - */ | ||
1285 | - if (TASK_INTERACTIVE(p) && !((task_timeslice(p) - | ||
1286 | - p->time_slice) % TIMESLICE_GRANULARITY(p)) && | ||
1287 | - (p->time_slice >= TIMESLICE_GRANULARITY(p)) && | ||
1288 | - (p->array == rq->active)) { | ||
1289 | - | ||
1290 | - requeue_task(p, rq->active); | ||
1291 | - set_tsk_need_resched(p); | ||
1292 | - } | ||
1293 | + time_slice_expired(p, rq); | ||
1294 | + goto out_unlock; | ||
1295 | } | ||
1296 | out_unlock: | ||
1297 | spin_unlock(&rq->lock); | ||
1298 | @@ -3207,9 +3009,6 @@ out_unlock: | ||
1299 | /* | ||
1300 | * This function gets called by the timer code, with HZ frequency. | ||
1301 | * We call it with interrupts disabled. | ||
1302 | - * | ||
1303 | - * It also gets called by the fork code, when changing the parent's | ||
1304 | - * timeslices. | ||
1305 | */ | ||
1306 | void scheduler_tick(void) | ||
1307 | { | ||
1308 | @@ -3273,13 +3072,13 @@ static void wake_sleeping_dependent(int | ||
1309 | |||
1310 | /* | ||
1311 | * number of 'lost' timeslices this task wont be able to fully | ||
1312 | - * utilize, if another task runs on a sibling. This models the | ||
1313 | + * utilise, if another task runs on a sibling. This models the | ||
1314 | * slowdown effect of other tasks running on siblings: | ||
1315 | */ | ||
1316 | static inline unsigned long | ||
1317 | smt_slice(struct task_struct *p, struct sched_domain *sd) | ||
1318 | { | ||
1319 | - return p->time_slice * (100 - sd->per_cpu_gain) / 100; | ||
1320 | + return p->slice * (100 - sd->per_cpu_gain) / 100; | ||
1321 | } | ||
1322 | |||
1323 | /* | ||
1324 | @@ -3343,7 +3142,7 @@ dependent_sleeper(int this_cpu, struct r | ||
1325 | } else { | ||
1326 | if (smt_curr->static_prio < p->static_prio && | ||
1327 | !TASK_PREEMPTS_CURR(p, smt_rq) && | ||
1328 | - smt_slice(smt_curr, sd) > task_timeslice(p)) | ||
1329 | + smt_slice(smt_curr, sd) > slice(p)) | ||
1330 | ret = 1; | ||
1331 | } | ||
1332 | unlock: | ||
1333 | @@ -3400,25 +3199,18 @@ EXPORT_SYMBOL(sub_preempt_count); | ||
1334 | |||
1335 | #endif | ||
1336 | |||
1337 | -static inline int interactive_sleep(enum sleep_type sleep_type) | ||
1338 | -{ | ||
1339 | - return (sleep_type == SLEEP_INTERACTIVE || | ||
1340 | - sleep_type == SLEEP_INTERRUPTED); | ||
1341 | -} | ||
1342 | - | ||
1343 | /* | ||
1344 | * schedule() is the main scheduler function. | ||
1345 | */ | ||
1346 | asmlinkage void __sched schedule(void) | ||
1347 | { | ||
1348 | struct task_struct *prev, *next; | ||
1349 | - struct prio_array *array; | ||
1350 | struct list_head *queue; | ||
1351 | unsigned long long now; | ||
1352 | - unsigned long run_time; | ||
1353 | - int cpu, idx, new_prio; | ||
1354 | long *switch_count; | ||
1355 | + unsigned long debit; | ||
1356 | struct rq *rq; | ||
1357 | + int cpu, idx; | ||
1358 | |||
1359 | /* | ||
1360 | * Test if we are atomic. Since do_exit() needs to call into | ||
1361 | @@ -3454,20 +3246,11 @@ need_resched_nonpreemptible: | ||
1362 | |||
1363 | schedstat_inc(rq, sched_cnt); | ||
1364 | now = sched_clock(); | ||
1365 | - if (likely((long long)(now - prev->timestamp) < NS_MAX_SLEEP_AVG)) { | ||
1366 | - run_time = now - prev->timestamp; | ||
1367 | - if (unlikely((long long)(now - prev->timestamp) < 0)) | ||
1368 | - run_time = 0; | ||
1369 | - } else | ||
1370 | - run_time = NS_MAX_SLEEP_AVG; | ||
1371 | - | ||
1372 | - /* | ||
1373 | - * Tasks charged proportionately less run_time at high sleep_avg to | ||
1374 | - * delay them losing their interactive status | ||
1375 | - */ | ||
1376 | - run_time /= (CURRENT_BONUS(prev) ? : 1); | ||
1377 | |||
1378 | spin_lock_irq(&rq->lock); | ||
1379 | + prev->runtime = ns_diff(now, prev->timestamp); | ||
1380 | + debit = ns_diff(now, rq->most_recent_timestamp) % NSJIFFY; | ||
1381 | + prev->ns_debit += debit; | ||
1382 | |||
1383 | switch_count = &prev->nivcsw; | ||
1384 | if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { | ||
1385 | @@ -3476,8 +3259,10 @@ need_resched_nonpreemptible: | ||
1386 | unlikely(signal_pending(prev)))) | ||
1387 | prev->state = TASK_RUNNING; | ||
1388 | else { | ||
1389 | - if (prev->state == TASK_UNINTERRUPTIBLE) | ||
1390 | + if (prev->state == TASK_UNINTERRUPTIBLE) { | ||
1391 | + prev->flags |= PF_NONSLEEP; | ||
1392 | rq->nr_uninterruptible++; | ||
1393 | + } | ||
1394 | deactivate_task(prev, rq); | ||
1395 | } | ||
1396 | } | ||
1397 | @@ -3487,62 +3272,28 @@ need_resched_nonpreemptible: | ||
1398 | idle_balance(cpu, rq); | ||
1399 | if (!rq->nr_running) { | ||
1400 | next = rq->idle; | ||
1401 | - rq->expired_timestamp = 0; | ||
1402 | wake_sleeping_dependent(cpu); | ||
1403 | goto switch_tasks; | ||
1404 | } | ||
1405 | } | ||
1406 | |||
1407 | - array = rq->active; | ||
1408 | - if (unlikely(!array->nr_active)) { | ||
1409 | - /* | ||
1410 | - * Switch the active and expired arrays. | ||
1411 | - */ | ||
1412 | - schedstat_inc(rq, sched_switch); | ||
1413 | - rq->active = rq->expired; | ||
1414 | - rq->expired = array; | ||
1415 | - array = rq->active; | ||
1416 | - rq->expired_timestamp = 0; | ||
1417 | - rq->best_expired_prio = MAX_PRIO; | ||
1418 | - } | ||
1419 | - | ||
1420 | - idx = sched_find_first_bit(array->bitmap); | ||
1421 | - queue = array->queue + idx; | ||
1422 | + idx = sched_find_first_bit(rq->bitmap); | ||
1423 | + queue = rq->queue + idx; | ||
1424 | next = list_entry(queue->next, struct task_struct, run_list); | ||
1425 | |||
1426 | - if (!rt_task(next) && interactive_sleep(next->sleep_type)) { | ||
1427 | - unsigned long long delta = now - next->timestamp; | ||
1428 | - if (unlikely((long long)(now - next->timestamp) < 0)) | ||
1429 | - delta = 0; | ||
1430 | - | ||
1431 | - if (next->sleep_type == SLEEP_INTERACTIVE) | ||
1432 | - delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128; | ||
1433 | - | ||
1434 | - array = next->array; | ||
1435 | - new_prio = recalc_task_prio(next, next->timestamp + delta); | ||
1436 | - | ||
1437 | - if (unlikely(next->prio != new_prio)) { | ||
1438 | - dequeue_task(next, array); | ||
1439 | - next->prio = new_prio; | ||
1440 | - enqueue_task(next, array); | ||
1441 | - } | ||
1442 | - } | ||
1443 | - next->sleep_type = SLEEP_NORMAL; | ||
1444 | if (dependent_sleeper(cpu, rq, next)) | ||
1445 | next = rq->idle; | ||
1446 | + else { | ||
1447 | + prefetch(next); | ||
1448 | + prefetch_stack(next); | ||
1449 | + } | ||
1450 | switch_tasks: | ||
1451 | if (next == rq->idle) | ||
1452 | schedstat_inc(rq, sched_goidle); | ||
1453 | - prefetch(next); | ||
1454 | - prefetch_stack(next); | ||
1455 | clear_tsk_need_resched(prev); | ||
1456 | rcu_qsctr_inc(task_cpu(prev)); | ||
1457 | |||
1458 | update_cpu_clock(prev, rq, now); | ||
1459 | - | ||
1460 | - prev->sleep_avg -= run_time; | ||
1461 | - if ((long)prev->sleep_avg <= 0) | ||
1462 | - prev->sleep_avg = 0; | ||
1463 | prev->timestamp = prev->last_ran = now; | ||
1464 | |||
1465 | sched_info_switch(prev, next); | ||
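
The ns_debit bookkeeping added above charges prev for the sub-jiffy remainder that the tick has not yet accounted. A stand-alone worked example, assuming NSJIFFY is the number of nanoseconds per jiffy (1e9/HZ) and ns_diff() a zero-clamped subtraction, as the patch defines them in earlier hunks:

    #include <stdio.h>

    #define HZ 1000
    #define NSJIFFY (1000000000ULL / HZ)    /* ns per jiffy; assumed definition */

    /* zero-clamped subtraction, matching how ns_diff() presumably behaves */
    static unsigned long long ns_diff(unsigned long long a, unsigned long long b)
    {
            return a > b ? a - b : 0;
    }

    int main(void)
    {
            unsigned long long now = 5123456789ULL;            /* arbitrary ns */
            unsigned long long most_recent = 5120000000ULL;

            /* 3456789 ns elapsed: three whole jiffies were already charged
             * by the tick; the 456789 ns remainder becomes ns_debit */
            printf("debit = %llu ns\n", ns_diff(now, most_recent) % NSJIFFY);
            return 0;
    }
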
1466 | @@ -3978,29 +3729,21 @@ EXPORT_SYMBOL(sleep_on_timeout); | ||
1467 | */ | ||
1468 | void rt_mutex_setprio(struct task_struct *p, int prio) | ||
1469 | { | ||
1470 | - struct prio_array *array; | ||
1471 | unsigned long flags; | ||
1472 | + int queued, oldprio; | ||
1473 | struct rq *rq; | ||
1474 | - int oldprio; | ||
1475 | |||
1476 | BUG_ON(prio < 0 || prio > MAX_PRIO); | ||
1477 | |||
1478 | rq = task_rq_lock(p, &flags); | ||
1479 | |||
1480 | oldprio = p->prio; | ||
1481 | - array = p->array; | ||
1482 | - if (array) | ||
1483 | - dequeue_task(p, array); | ||
1484 | + if ((queued = task_queued(p))) | ||
1485 | + dequeue_task(p, rq); | ||
1486 | p->prio = prio; | ||
1487 | |||
1488 | - if (array) { | ||
1489 | - /* | ||
1490 | - * If changing to an RT priority then queue it | ||
1491 | - * in the active array! | ||
1492 | - */ | ||
1493 | - if (rt_task(p)) | ||
1494 | - array = rq->active; | ||
1495 | - enqueue_task(p, array); | ||
1496 | + if (queued) { | ||
1497 | + enqueue_task(p, rq); | ||
1498 | /* | ||
1499 | * Reschedule if we are currently running on this runqueue and | ||
1500 | * our priority decreased, or if we are not currently running on | ||
1501 | @@ -4009,8 +3752,8 @@ void rt_mutex_setprio(struct task_struct | ||
1502 | if (task_running(rq, p)) { | ||
1503 | if (p->prio > oldprio) | ||
1504 | resched_task(rq->curr); | ||
1505 | - } else if (TASK_PREEMPTS_CURR(p, rq)) | ||
1506 | - resched_task(rq->curr); | ||
1507 | + } else | ||
1508 | + preempt(p, rq); | ||
1509 | } | ||
1510 | task_rq_unlock(rq, &flags); | ||
1511 | } | ||
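
Here and in the sched_setscheduler() hunk further down, the open-coded TASK_PREEMPTS_CURR test is folded into a preempt() helper defined earlier in the patch. A plausible minimal form, shown for orientation only:

    /* sketch: kick the runqueue's current task off the CPU if p should run */
    static void preempt(struct task_struct *p, struct rq *rq)
    {
            if (TASK_PREEMPTS_CURR(p, rq))
                    resched_task(rq->curr);
    }
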
1512 | @@ -4019,8 +3762,7 @@ void rt_mutex_setprio(struct task_struct | ||
1513 | |||
1514 | void set_user_nice(struct task_struct *p, long nice) | ||
1515 | { | ||
1516 | - struct prio_array *array; | ||
1517 | - int old_prio, delta; | ||
1518 | + int queued, old_prio, delta; | ||
1519 | unsigned long flags; | ||
1520 | struct rq *rq; | ||
1521 | |||
1522 | @@ -4041,20 +3783,21 @@ void set_user_nice(struct task_struct *p | ||
1523 | p->static_prio = NICE_TO_PRIO(nice); | ||
1524 | goto out_unlock; | ||
1525 | } | ||
1526 | - array = p->array; | ||
1527 | - if (array) { | ||
1528 | - dequeue_task(p, array); | ||
1529 | + if ((queued = task_queued(p))) { | ||
1530 | + dequeue_task(p, rq); | ||
1531 | dec_raw_weighted_load(rq, p); | ||
1532 | } | ||
1533 | |||
1534 | p->static_prio = NICE_TO_PRIO(nice); | ||
1535 | set_load_weight(p); | ||
1536 | old_prio = p->prio; | ||
1537 | + if (p->bonus > bonus(p)) | ||
1538 | + p->bonus = bonus(p); | ||
1539 | p->prio = effective_prio(p); | ||
1540 | delta = p->prio - old_prio; | ||
1541 | |||
1542 | - if (array) { | ||
1543 | - enqueue_task(p, array); | ||
1544 | + if (queued) { | ||
1545 | + enqueue_task(p, rq); | ||
1546 | inc_raw_weighted_load(rq, p); | ||
1547 | /* | ||
1548 | * If the task increased its priority or is running and | ||
1549 | @@ -4177,18 +3920,13 @@ static inline struct task_struct *find_p | ||
1550 | /* Actually do priority change: must hold rq lock. */ | ||
1551 | static void __setscheduler(struct task_struct *p, int policy, int prio) | ||
1552 | { | ||
1553 | - BUG_ON(p->array); | ||
1554 | + BUG_ON(task_queued(p)); | ||
1555 | |||
1556 | p->policy = policy; | ||
1557 | p->rt_priority = prio; | ||
1558 | p->normal_prio = normal_prio(p); | ||
1559 | /* we are holding p->pi_lock already */ | ||
1560 | p->prio = rt_mutex_getprio(p); | ||
1561 | - /* | ||
1562 | - * SCHED_BATCH tasks are treated as perpetual CPU hogs: | ||
1563 | - */ | ||
1564 | - if (policy == SCHED_BATCH) | ||
1565 | - p->sleep_avg = 0; | ||
1566 | set_load_weight(p); | ||
1567 | } | ||
1568 | |||
1569 | @@ -4204,8 +3942,7 @@ static void __setscheduler(struct task_s | ||
1570 | int sched_setscheduler(struct task_struct *p, int policy, | ||
1571 | struct sched_param *param) | ||
1572 | { | ||
1573 | - int retval, oldprio, oldpolicy = -1; | ||
1574 | - struct prio_array *array; | ||
1575 | + int queued, retval, oldprio, oldpolicy = -1; | ||
1576 | unsigned long flags; | ||
1577 | struct rq *rq; | ||
1578 | |||
1579 | @@ -4279,12 +4016,11 @@ recheck: | ||
1580 | spin_unlock_irqrestore(&p->pi_lock, flags); | ||
1581 | goto recheck; | ||
1582 | } | ||
1583 | - array = p->array; | ||
1584 | - if (array) | ||
1585 | + if ((queued = task_queued(p))) | ||
1586 | deactivate_task(p, rq); | ||
1587 | oldprio = p->prio; | ||
1588 | __setscheduler(p, policy, param->sched_priority); | ||
1589 | - if (array) { | ||
1590 | + if (queued) { | ||
1591 | __activate_task(p, rq); | ||
1592 | /* | ||
1593 | * Reschedule if we are currently running on this runqueue and | ||
1594 | @@ -4294,8 +4030,8 @@ recheck: | ||
1595 | if (task_running(rq, p)) { | ||
1596 | if (p->prio > oldprio) | ||
1597 | resched_task(rq->curr); | ||
1598 | - } else if (TASK_PREEMPTS_CURR(p, rq)) | ||
1599 | - resched_task(rq->curr); | ||
1600 | + } else | ||
1601 | + preempt(p, rq); | ||
1602 | } | ||
1603 | __task_rq_unlock(rq); | ||
1604 | spin_unlock_irqrestore(&p->pi_lock, flags); | ||
1605 | @@ -4567,41 +4303,24 @@ asmlinkage long sys_sched_getaffinity(pi | ||
1606 | /** | ||
1607 | * sys_sched_yield - yield the current processor to other threads. | ||
1608 | * | ||
1609 | - * this function yields the current CPU by moving the calling thread | ||
1610 | - * to the expired array. If there are no other threads running on this | ||
1611 | - * CPU then this function will return. | ||
1612 | + * This function yields the current CPU by dropping the priority of | ||
1613 | + * current to the lowest user priority. | ||
1614 | */ | ||
1615 | asmlinkage long sys_sched_yield(void) | ||
1616 | { | ||
1617 | struct rq *rq = this_rq_lock(); | ||
1618 | - struct prio_array *array = current->array, *target = rq->expired; | ||
1619 | + int newprio = current->prio; | ||
1620 | |||
1621 | schedstat_inc(rq, yld_cnt); | ||
1622 | - /* | ||
1623 | - * We implement yielding by moving the task into the expired | ||
1624 | - * queue. | ||
1625 | - * | ||
1626 | - * (special rule: RT tasks will just roundrobin in the active | ||
1627 | - * array.) | ||
1628 | - */ | ||
1629 | - if (rt_task(current)) | ||
1630 | - target = rq->active; | ||
1631 | |||
1632 | - if (array->nr_active == 1) { | ||
1633 | - schedstat_inc(rq, yld_act_empty); | ||
1634 | - if (!rq->expired->nr_active) | ||
1635 | - schedstat_inc(rq, yld_both_empty); | ||
1636 | - } else if (!rq->expired->nr_active) | ||
1637 | - schedstat_inc(rq, yld_exp_empty); | ||
1638 | - | ||
1639 | - if (array != target) { | ||
1640 | - dequeue_task(current, array); | ||
1641 | - enqueue_task(current, target); | ||
1642 | - } else | ||
1643 | - /* | ||
1644 | - * requeue_task is cheaper so perform that if possible. | ||
1645 | - */ | ||
1646 | - requeue_task(current, array); | ||
1649 | + current->slice = slice(current); | ||
1650 | + current->time_slice = rr_interval(current); | ||
1651 | + if (likely(!rt_task(current))) | ||
1652 | + newprio = MIN_USER_PRIO; | ||
1653 | + | ||
1654 | + requeue_task(current, rq, newprio); | ||
1655 | |||
1656 | /* | ||
1657 | * Since we are going to call schedule() anyway, there's | ||
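
The rewritten yield gives the task a fresh slice and, for SCHED_NORMAL tasks, drops it to MIN_USER_PRIO (the new lowest user priority from the sched.h hunk), so it runs again only once nothing better is queued; RT tasks keep their priority and merely round-robin. The requeue_task() call takes the target priority explicitly; a plausible single-array form, assuming the run_list/bitmap layout this patch introduces:

    /* sketch: move p to the tail of the target priority list and keep the
     * find-first-bit bitmap consistent */
    static void requeue_task(struct task_struct *p, struct rq *rq, int prio)
    {
            list_move_tail(&p->run_list, rq->queue + prio);
            if (p->prio != prio) {
                    if (list_empty(rq->queue + p->prio))
                            __clear_bit(p->prio, rq->bitmap);
                    p->prio = prio;
                    __set_bit(prio, rq->bitmap);
            }
    }
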
1658 | @@ -4812,7 +4531,7 @@ long sys_sched_rr_get_interval(pid_t pid | ||
1659 | goto out_unlock; | ||
1660 | |||
1661 | jiffies_to_timespec(p->policy == SCHED_FIFO ? | ||
1662 | - 0 : task_timeslice(p), &t); | ||
1663 | + 0 : slice(p), &t); | ||
1664 | read_unlock(&tasklist_lock); | ||
1665 | retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; | ||
1666 | out_nounlock: | ||
1667 | @@ -4941,8 +4660,6 @@ void __cpuinit init_idle(struct task_str | ||
1668 | unsigned long flags; | ||
1669 | |||
1670 | idle->timestamp = sched_clock(); | ||
1671 | - idle->sleep_avg = 0; | ||
1672 | - idle->array = NULL; | ||
1673 | idle->prio = idle->normal_prio = MAX_PRIO; | ||
1674 | idle->state = TASK_RUNNING; | ||
1675 | idle->cpus_allowed = cpumask_of_cpu(cpu); | ||
1676 | @@ -5062,7 +4779,7 @@ static int __migrate_task(struct task_st | ||
1677 | goto out; | ||
1678 | |||
1679 | set_task_cpu(p, dest_cpu); | ||
1680 | - if (p->array) { | ||
1681 | + if (task_queued(p)) { | ||
1682 | /* | ||
1683 | * Sync timestamp with rq_dest's before activating. | ||
1684 | * The same thing could be achieved by doing this step | ||
1685 | @@ -5073,8 +4790,7 @@ static int __migrate_task(struct task_st | ||
1686 | + rq_dest->most_recent_timestamp; | ||
1687 | deactivate_task(p, rq_src); | ||
1688 | __activate_task(p, rq_dest); | ||
1689 | - if (TASK_PREEMPTS_CURR(p, rq_dest)) | ||
1690 | - resched_task(rq_dest->curr); | ||
1691 | + preempt(p, rq_dest); | ||
1692 | } | ||
1693 | ret = 1; | ||
1694 | out: | ||
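
The timestamp rebase in this hunk is plain clock-skew arithmetic between two runqueues' clocks. With assumed readings: a task that last ran at source-time 9.2s, while the source clock reads 10.0s and the destination clock 12.5s, lands at destination-time 9.2 - 10.0 + 12.5 = 11.7s, preserving its 0.8s of accumulated sleep:

    /* illustrative values in nanoseconds */
    unsigned long long src_now = 10000000000ULL;    /* 10.0 s */
    unsigned long long dst_now = 12500000000ULL;    /* 12.5 s */
    unsigned long long stamp   =  9200000000ULL;    /*  9.2 s */

    stamp = stamp - src_now + dst_now;              /* 11.7 s on dest clock */
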
1695 | @@ -5303,7 +5019,7 @@ static void migrate_dead_tasks(unsigned | ||
1696 | |||
1697 | for (arr = 0; arr < 2; arr++) { | ||
1698 | for (i = 0; i < MAX_PRIO; i++) { | ||
1699 | - struct list_head *list = &rq->arrays[arr].queue[i]; | ||
1700 | + struct list_head *list = &rq->queue[i]; | ||
1701 | |||
1702 | while (!list_empty(list)) | ||
1703 | migrate_dead(dead_cpu, list_entry(list->next, | ||
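
With a single queue array the outer arr loop above becomes vestigial: both iterations walk the same lists, harmlessly, since the first pass empties them. A single pass is all the new layout needs, sketched:

    /* sketch: drain every priority list of the dead CPU's runqueue */
    for (i = 0; i < MAX_PRIO; i++) {
            struct list_head *list = &rq->queue[i];

            while (!list_empty(list))
                    migrate_dead(dead_cpu, list_entry(list->next,
                                 struct task_struct, run_list));
    }
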
1704 | @@ -6894,19 +6610,16 @@ int in_sched_functions(unsigned long add | ||
1705 | |||
1706 | void __init sched_init(void) | ||
1707 | { | ||
1708 | - int i, j, k; | ||
1709 | + int i; | ||
1710 | |||
1711 | for_each_possible_cpu(i) { | ||
1712 | - struct prio_array *array; | ||
1713 | struct rq *rq; | ||
1714 | + int j; | ||
1715 | |||
1716 | rq = cpu_rq(i); | ||
1717 | spin_lock_init(&rq->lock); | ||
1718 | lockdep_set_class(&rq->lock, &rq->rq_lock_key); | ||
1719 | rq->nr_running = 0; | ||
1720 | - rq->active = rq->arrays; | ||
1721 | - rq->expired = rq->arrays + 1; | ||
1722 | - rq->best_expired_prio = MAX_PRIO; | ||
1723 | |||
1724 | #ifdef CONFIG_SMP | ||
1725 | rq->sd = NULL; | ||
1726 | @@ -6920,15 +6633,11 @@ void __init sched_init(void) | ||
1727 | #endif | ||
1728 | atomic_set(&rq->nr_iowait, 0); | ||
1729 | |||
1730 | - for (j = 0; j < 2; j++) { | ||
1731 | - array = rq->arrays + j; | ||
1732 | - for (k = 0; k < MAX_PRIO; k++) { | ||
1733 | - INIT_LIST_HEAD(array->queue + k); | ||
1734 | - __clear_bit(k, array->bitmap); | ||
1735 | - } | ||
1736 | - // delimiter for bitsearch | ||
1737 | - __set_bit(MAX_PRIO, array->bitmap); | ||
1738 | - } | ||
1739 | + for (j = 0; j < MAX_PRIO; j++) | ||
1740 | + INIT_LIST_HEAD(&rq->queue[j]); | ||
1741 | + memset(rq->bitmap, 0, BITS_TO_LONGS(MAX_PRIO)*sizeof(long)); | ||
1742 | + /* delimiter for bitsearch */ | ||
1743 | + __set_bit(MAX_PRIO, rq->bitmap); | ||
1744 | } | ||
1745 | |||
1746 | set_load_weight(&init_task); | ||
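
The initialisation above implies the reshaped runqueue: one list head per priority plus a bitmap with one spare delimiter bit so sched_find_first_bit() always terminates. Roughly, with every field other than queue and bitmap an assumption:

    /* sketch of the single-array runqueue this patch converges on */
    struct rq {
            spinlock_t lock;
            unsigned long nr_running;
            DECLARE_BITMAP(bitmap, MAX_PRIO + 1);   /* +1: delimiter bit */
            struct list_head queue[MAX_PRIO];
            /* load, timestamps, SMP balancing fields, statistics, ... */
    };
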
1747 | @@ -6984,10 +6693,10 @@ EXPORT_SYMBOL(__might_sleep); | ||
1748 | #ifdef CONFIG_MAGIC_SYSRQ | ||
1749 | void normalize_rt_tasks(void) | ||
1750 | { | ||
1751 | - struct prio_array *array; | ||
1752 | struct task_struct *p; | ||
1753 | unsigned long flags; | ||
1754 | struct rq *rq; | ||
1755 | + int queued; | ||
1756 | |||
1757 | read_lock_irq(&tasklist_lock); | ||
1758 | for_each_process(p) { | ||
1759 | @@ -6997,11 +6706,10 @@ void normalize_rt_tasks(void) | ||
1760 | spin_lock_irqsave(&p->pi_lock, flags); | ||
1761 | rq = __task_rq_lock(p); | ||
1762 | |||
1763 | - array = p->array; | ||
1764 | - if (array) | ||
1765 | + if ((queued = task_queued(p))) | ||
1766 | deactivate_task(p, task_rq(p)); | ||
1767 | __setscheduler(p, SCHED_NORMAL, 0); | ||
1768 | - if (array) { | ||
1769 | + if (queued) { | ||
1770 | __activate_task(p, task_rq(p)); | ||
1771 | resched_task(rq->curr); | ||
1772 | } |