Contents of /trunk/kernel26-alx/patches-2.6.20-r6/0001-2.6.20-sched-staircase-17.patch
Revision 1175 - Thu Oct 14 12:15:46 2010 UTC by niro
File size: 53298 byte(s)
-2.6.20-alx-r6 new magellan 0.5.2 kernel
Implement the "staircase" hybrid foreground-background single priority
array cpu scheduler policy.

Signed-off-by: Con Kolivas <kernel@kolivas.org>
---
 fs/proc/array.c       |    4
 include/linux/sched.h |   20
 kernel/exit.c         |    1
 kernel/sched.c        | 1084 ++++++++++++++++++--------------------
 4 files changed, 404 insertions(+), 705 deletions(-)
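The core of the policy is easiest to see in the new rr_interval(), slice()
and __normal_prio() helpers in the kernel/sched.c hunks below: a woken task
is given a full slice(), starts at priority MAX_RT_PRIO + bonus(p) - p->bonus,
and descends one priority "stair" for every rr_interval() of CPU it consumes
until the slice is used up. The following standalone userspace sketch replays
that arithmetic so the staircase can be seen directly; the formulas are copied
from the patch, while HZ = 1000 and the main() driver are assumptions made
only for this demo.

/*
 * Illustrative replay of the staircase priority arithmetic from this
 * patch (not part of the patch itself). Formulas follow rr_interval(),
 * slice() and __normal_prio() below; HZ and the driver are assumed.
 */
#include <stdio.h>

#define HZ            1000                  /* assumed tick rate */
#define MAX_RT_PRIO   100
#define MAX_PRIO      (MAX_RT_PRIO + 40)
#define MIN_USER_PRIO (MAX_PRIO - 1)
#define RR_INTERVAL   ((6 * HZ / 1001) + 1) /* 6 jiffies at HZ=1000 */

static unsigned int rr_interval(int nice)
{
	/* negative nice earns a proportionally longer round robin interval */
	if (nice < 0)
		return RR_INTERVAL * (20 - nice) / 20;
	return RR_INTERVAL;
}

static unsigned int slice(int user_prio, int nice)
{
	/* one rr interval, plus one more per priority level below nice -20 */
	return rr_interval(nice) + (39 - user_prio) * rr_interval(nice);
}

static int normal_prio(int user_prio, int nice, unsigned int task_bonus,
		       unsigned int used_slice)
{
	/* start task_bonus stairs above the base, descend one per interval */
	int prio = MAX_RT_PRIO + user_prio - task_bonus
		   + used_slice / rr_interval(nice);

	return prio > MIN_USER_PRIO ? MIN_USER_PRIO : prio;
}

int main(void)
{
	int nice = 0, user_prio = 20; /* a nice-0 task: bonus(p) == 20 */
	unsigned int used, full = slice(user_prio, nice);

	for (used = 0; used < full; used += rr_interval(nice))
		printf("used %3u/%u jiffies -> prio %d\n",
		       used, full, normal_prio(user_prio, nice, 0, used));
	return 0;
}

For a nice-0 CPU hog with no bonus this prints priority 120 stepping down to
139 across its 120-jiffy slice; a task that has earned p->bonus simply begins
the same descent that many stairs higher, which is the whole
foreground-versus-background mechanism.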
Index: linux-2.6.20-ck1/fs/proc/array.c
===================================================================
--- linux-2.6.20-ck1.orig/fs/proc/array.c 2007-02-05 22:52:03.000000000 +1100
+++ linux-2.6.20-ck1/fs/proc/array.c 2007-02-16 19:01:30.000000000 +1100
@@ -165,7 +165,7 @@ static inline char * task_state(struct t
rcu_read_lock();
buffer += sprintf(buffer,
"State:\t%s\n"
- "SleepAVG:\t%lu%%\n"
+ "Bonus:\t%d\n"
"Tgid:\t%d\n"
"Pid:\t%d\n"
"PPid:\t%d\n"
@@ -173,7 +173,7 @@ static inline char * task_state(struct t
"Uid:\t%d\t%d\t%d\t%d\n"
"Gid:\t%d\t%d\t%d\t%d\n",
get_task_state(p),
- (p->sleep_avg/1024)*100/(1020000000/1024),
+ p->bonus,
p->tgid, p->pid,
pid_alive(p) ? rcu_dereference(p->real_parent)->tgid : 0,
pid_alive(p) && p->ptrace ? rcu_dereference(p->parent)->pid : 0,
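The user-visible effect of the fs/proc/array.c change above is that
/proc/<pid>/status reports the task's staircase bonus in place of the old
SleepAVG percentage. A quick illustrative check (not part of the patch) that
works on either kernel:

/* Print whichever of the two fields the running kernel exposes:
 * "Bonus:" on a staircase kernel, "SleepAVG:" on a vanilla one. */
#include <stdio.h>
#include <string.h>

int main(void)
{
	char line[256];
	FILE *f = fopen("/proc/self/status", "r");

	if (!f) {
		perror("fopen");
		return 1;
	}
	while (fgets(line, sizeof(line), f))
		if (!strncmp(line, "Bonus:", 6) ||
		    !strncmp(line, "SleepAVG:", 9))
			fputs(line, stdout);
	fclose(f);
	return 0;
}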
Index: linux-2.6.20-ck1/kernel/exit.c
===================================================================
--- linux-2.6.20-ck1.orig/kernel/exit.c 2007-02-05 22:52:04.000000000 +1100
+++ linux-2.6.20-ck1/kernel/exit.c 2007-02-16 19:01:30.000000000 +1100
@@ -170,7 +170,6 @@ repeat:
zap_leader = (leader->exit_signal == -1);
}

- sched_exit(p);
write_unlock_irq(&tasklist_lock);
proc_flush_task(p);
release_thread(p);
Index: linux-2.6.20-ck1/include/linux/sched.h
===================================================================
--- linux-2.6.20-ck1.orig/include/linux/sched.h 2007-02-05 22:52:04.000000000 +1100
+++ linux-2.6.20-ck1/include/linux/sched.h 2007-02-16 19:01:30.000000000 +1100
@@ -524,6 +524,7 @@ struct signal_struct {
#define MAX_RT_PRIO MAX_USER_RT_PRIO

#define MAX_PRIO (MAX_RT_PRIO + 40)
+#define MIN_USER_PRIO (MAX_PRIO - 1)

#define rt_prio(prio) unlikely((prio) < MAX_RT_PRIO)
#define rt_task(p) rt_prio((p)->prio)
@@ -789,15 +790,6 @@ struct mempolicy;
struct pipe_inode_info;
struct uts_namespace;

-enum sleep_type {
- SLEEP_NORMAL,
- SLEEP_NONINTERACTIVE,
- SLEEP_INTERACTIVE,
- SLEEP_INTERRUPTED,
-};
-
-struct prio_array;
-
struct task_struct {
volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */
struct thread_info *thread_info;
@@ -815,20 +807,19 @@ struct task_struct {
int load_weight; /* for niceness load balancing purposes */
int prio, static_prio, normal_prio;
struct list_head run_list;
- struct prio_array *array;

unsigned short ioprio;
#ifdef CONFIG_BLK_DEV_IO_TRACE
unsigned int btrace_seq;
#endif
- unsigned long sleep_avg;
unsigned long long timestamp, last_ran;
+ unsigned long runtime, totalrun, ns_debit, systime;
+ unsigned int bonus;
+ unsigned int slice, time_slice;
unsigned long long sched_time; /* sched_clock time spent running */
- enum sleep_type sleep_type;

unsigned long policy;
cpumask_t cpus_allowed;
- unsigned int time_slice, first_time_slice;

#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
struct sched_info sched_info;
@@ -1157,6 +1148,8 @@ static inline void put_task_struct(struc
#define PF_SPREAD_SLAB 0x02000000 /* Spread some slab caches over cpuset */
#define PF_MEMPOLICY 0x10000000 /* Non-default NUMA mempolicy */
#define PF_MUTEX_TESTER 0x20000000 /* Thread belongs to the rt mutex tester */
+#define PF_NONSLEEP 0x40000000 /* Waiting on in kernel activity */
+#define PF_FORKED 0x80000000 /* Task just forked another process */

/*
* Only the _current_ task can read/write to tsk->flags, but other
@@ -1291,7 +1284,6 @@ extern void FASTCALL(wake_up_new_task(st
static inline void kick_process(struct task_struct *tsk) { }
#endif
extern void FASTCALL(sched_fork(struct task_struct * p, int clone_flags));
-extern void FASTCALL(sched_exit(struct task_struct * p));

extern int in_group_p(gid_t);
extern int in_egroup_p(gid_t);
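The two task flags added above occupy the top two bits of the 32-bit flags
word. A minimal sanity sketch (values copied from the hunk above, plus the
three pre-existing flags shown in its context lines; not part of the patch)
confirms they are distinct single bits with no overlap:

/* Compile-and-run check of the new PF_ bits added by this patch. */
#include <assert.h>

#define PF_SPREAD_SLAB  0x02000000 /* pre-existing, from the context lines */
#define PF_MEMPOLICY    0x10000000
#define PF_MUTEX_TESTER 0x20000000
#define PF_NONSLEEP     0x40000000 /* new: waiting on in-kernel activity */
#define PF_FORKED       0x80000000 /* new: task just forked another process */

int main(void)
{
	unsigned long existing = PF_SPREAD_SLAB | PF_MEMPOLICY | PF_MUTEX_TESTER;

	assert((PF_NONSLEEP & (PF_NONSLEEP - 1)) == 0); /* single bit */
	assert((PF_FORKED & (PF_FORKED - 1)) == 0);     /* single bit */
	assert(!(PF_NONSLEEP & existing) && !(PF_FORKED & existing));
	assert(!(PF_NONSLEEP & PF_FORKED));
	return 0;
}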
115 | Index: linux-2.6.20-ck1/kernel/sched.c |
116 | =================================================================== |
117 | --- linux-2.6.20-ck1.orig/kernel/sched.c 2007-02-05 22:52:04.000000000 +1100 |
118 | +++ linux-2.6.20-ck1/kernel/sched.c 2007-02-16 19:01:30.000000000 +1100 |
119 | @@ -16,6 +16,10 @@ |
120 | * by Davide Libenzi, preemptible kernel bits by Robert Love. |
121 | * 2003-09-03 Interactivity tuning by Con Kolivas. |
122 | * 2004-04-02 Scheduler domains code by Nick Piggin |
123 | + * 2007-02-14 Staircase scheduling policy by Con Kolivas with help |
124 | + * from William Lee Irwin III, Zwane Mwaikambo, Peter Williams |
125 | + * and Andreas Mohr. |
126 | + * Staircase v17 |
127 | */ |
128 | |
129 | #include <linux/mm.h> |
130 | @@ -77,123 +81,19 @@ |
131 | /* |
132 | * Some helpers for converting nanosecond timing to jiffy resolution |
133 | */ |
134 | -#define NS_TO_JIFFIES(TIME) ((TIME) / (1000000000 / HZ)) |
135 | -#define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ)) |
136 | - |
137 | -/* |
138 | - * These are the 'tuning knobs' of the scheduler: |
139 | - * |
140 | - * Minimum timeslice is 5 msecs (or 1 jiffy, whichever is larger), |
141 | - * default timeslice is 100 msecs, maximum timeslice is 800 msecs. |
142 | - * Timeslices get refilled after they expire. |
143 | - */ |
144 | -#define MIN_TIMESLICE max(5 * HZ / 1000, 1) |
145 | -#define DEF_TIMESLICE (100 * HZ / 1000) |
146 | -#define ON_RUNQUEUE_WEIGHT 30 |
147 | -#define CHILD_PENALTY 95 |
148 | -#define PARENT_PENALTY 100 |
149 | -#define EXIT_WEIGHT 3 |
150 | -#define PRIO_BONUS_RATIO 25 |
151 | -#define MAX_BONUS (MAX_USER_PRIO * PRIO_BONUS_RATIO / 100) |
152 | -#define INTERACTIVE_DELTA 2 |
153 | -#define MAX_SLEEP_AVG (DEF_TIMESLICE * MAX_BONUS) |
154 | -#define STARVATION_LIMIT (MAX_SLEEP_AVG) |
155 | -#define NS_MAX_SLEEP_AVG (JIFFIES_TO_NS(MAX_SLEEP_AVG)) |
156 | - |
157 | -/* |
158 | - * If a task is 'interactive' then we reinsert it in the active |
159 | - * array after it has expired its current timeslice. (it will not |
160 | - * continue to run immediately, it will still roundrobin with |
161 | - * other interactive tasks.) |
162 | - * |
163 | - * This part scales the interactivity limit depending on niceness. |
164 | - * |
165 | - * We scale it linearly, offset by the INTERACTIVE_DELTA delta. |
166 | - * Here are a few examples of different nice levels: |
167 | - * |
168 | - * TASK_INTERACTIVE(-20): [1,1,1,1,1,1,1,1,1,0,0] |
169 | - * TASK_INTERACTIVE(-10): [1,1,1,1,1,1,1,0,0,0,0] |
170 | - * TASK_INTERACTIVE( 0): [1,1,1,1,0,0,0,0,0,0,0] |
171 | - * TASK_INTERACTIVE( 10): [1,1,0,0,0,0,0,0,0,0,0] |
172 | - * TASK_INTERACTIVE( 19): [0,0,0,0,0,0,0,0,0,0,0] |
173 | - * |
174 | - * (the X axis represents the possible -5 ... 0 ... +5 dynamic |
175 | - * priority range a task can explore, a value of '1' means the |
176 | - * task is rated interactive.) |
177 | - * |
178 | - * Ie. nice +19 tasks can never get 'interactive' enough to be |
179 | - * reinserted into the active array. And only heavily CPU-hog nice -20 |
180 | - * tasks will be expired. Default nice 0 tasks are somewhere between, |
181 | - * it takes some effort for them to get interactive, but it's not |
182 | - * too hard. |
183 | - */ |
184 | - |
185 | -#define CURRENT_BONUS(p) \ |
186 | - (NS_TO_JIFFIES((p)->sleep_avg) * MAX_BONUS / \ |
187 | - MAX_SLEEP_AVG) |
188 | - |
189 | -#define GRANULARITY (10 * HZ / 1000 ? : 1) |
190 | - |
191 | -#ifdef CONFIG_SMP |
192 | -#define TIMESLICE_GRANULARITY(p) (GRANULARITY * \ |
193 | - (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)) * \ |
194 | - num_online_cpus()) |
195 | -#else |
196 | -#define TIMESLICE_GRANULARITY(p) (GRANULARITY * \ |
197 | - (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1))) |
198 | -#endif |
199 | - |
200 | -#define SCALE(v1,v1_max,v2_max) \ |
201 | - (v1) * (v2_max) / (v1_max) |
202 | - |
203 | -#define DELTA(p) \ |
204 | - (SCALE(TASK_NICE(p) + 20, 40, MAX_BONUS) - 20 * MAX_BONUS / 40 + \ |
205 | - INTERACTIVE_DELTA) |
206 | - |
207 | -#define TASK_INTERACTIVE(p) \ |
208 | - ((p)->prio <= (p)->static_prio - DELTA(p)) |
209 | - |
210 | -#define INTERACTIVE_SLEEP(p) \ |
211 | - (JIFFIES_TO_NS(MAX_SLEEP_AVG * \ |
212 | - (MAX_BONUS / 2 + DELTA((p)) + 1) / MAX_BONUS - 1)) |
213 | +#define NSJIFFY (1000000000 / HZ) /* One jiffy in ns */ |
214 | +#define NS_TO_JIFFIES(TIME) ((TIME) / NSJIFFY) |
215 | +#define JIFFIES_TO_NS(TIME) ((TIME) * NSJIFFY) |
216 | |
217 | #define TASK_PREEMPTS_CURR(p, rq) \ |
218 | ((p)->prio < (rq)->curr->prio) |
219 | |
220 | -#define SCALE_PRIO(x, prio) \ |
221 | - max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_TIMESLICE) |
222 | - |
223 | -static unsigned int static_prio_timeslice(int static_prio) |
224 | -{ |
225 | - if (static_prio < NICE_TO_PRIO(0)) |
226 | - return SCALE_PRIO(DEF_TIMESLICE * 4, static_prio); |
227 | - else |
228 | - return SCALE_PRIO(DEF_TIMESLICE, static_prio); |
229 | -} |
230 | - |
231 | /* |
232 | - * task_timeslice() scales user-nice values [ -20 ... 0 ... 19 ] |
233 | - * to time slice values: [800ms ... 100ms ... 5ms] |
234 | - * |
235 | - * The higher a thread's priority, the bigger timeslices |
236 | - * it gets during one round of execution. But even the lowest |
237 | - * priority thread gets MIN_TIMESLICE worth of execution time. |
238 | + * This is the time all tasks within the same priority round robin. |
239 | + * Set to a minimum of 6ms. |
240 | */ |
241 | - |
242 | -static inline unsigned int task_timeslice(struct task_struct *p) |
243 | -{ |
244 | - return static_prio_timeslice(p->static_prio); |
245 | -} |
246 | - |
247 | -/* |
248 | - * These are the runqueue data structures: |
249 | - */ |
250 | - |
251 | -struct prio_array { |
252 | - unsigned int nr_active; |
253 | - DECLARE_BITMAP(bitmap, MAX_PRIO+1); /* include 1 bit for delimiter */ |
254 | - struct list_head queue[MAX_PRIO]; |
255 | -}; |
256 | +#define RR_INTERVAL ((6 * HZ / 1001) + 1) |
257 | +#define DEF_TIMESLICE (RR_INTERVAL * 19) |
258 | |
259 | /* |
260 | * This is the main, per-CPU runqueue data structure. |
261 | @@ -224,14 +124,13 @@ struct rq { |
262 | */ |
263 | unsigned long nr_uninterruptible; |
264 | |
265 | - unsigned long expired_timestamp; |
266 | /* Cached timestamp set by update_cpu_clock() */ |
267 | unsigned long long most_recent_timestamp; |
268 | struct task_struct *curr, *idle; |
269 | unsigned long next_balance; |
270 | struct mm_struct *prev_mm; |
271 | - struct prio_array *active, *expired, arrays[2]; |
272 | - int best_expired_prio; |
273 | + unsigned long bitmap[BITS_TO_LONGS(MAX_PRIO + 1)]; |
274 | + struct list_head queue[MAX_PRIO]; |
275 | atomic_t nr_iowait; |
276 | |
277 | #ifdef CONFIG_SMP |
278 | @@ -568,13 +467,7 @@ static inline struct rq *this_rq_lock(vo |
279 | |
280 | #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) |
281 | /* |
282 | - * Called when a process is dequeued from the active array and given |
283 | - * the cpu. We should note that with the exception of interactive |
284 | - * tasks, the expired queue will become the active queue after the active |
285 | - * queue is empty, without explicitly dequeuing and requeuing tasks in the |
286 | - * expired queue. (Interactive tasks may be requeued directly to the |
287 | - * active queue, thus delaying tasks in the expired queue from running; |
288 | - * see scheduler_tick()). |
289 | + * Called when a process is dequeued and given the cpu. |
290 | * |
291 | * This function is only called from sched_info_arrive(), rather than |
292 | * dequeue_task(). Even though a task may be queued and dequeued multiple |
293 | @@ -607,13 +500,11 @@ static void sched_info_arrive(struct tas |
294 | } |
295 | |
296 | /* |
297 | - * Called when a process is queued into either the active or expired |
298 | - * array. The time is noted and later used to determine how long we |
299 | - * had to wait for us to reach the cpu. Since the expired queue will |
300 | - * become the active queue after active queue is empty, without dequeuing |
301 | - * and requeuing any tasks, we are interested in queuing to either. It |
302 | - * is unusual but not impossible for tasks to be dequeued and immediately |
303 | - * requeued in the same or another array: this can happen in sched_yield(), |
304 | + * Called when a process is queued. |
305 | + * The time is noted and later used to determine how long we had to wait for |
306 | + * us to reach the cpu. |
307 | + * It is unusual but not impossible for tasks to be dequeued and immediately |
308 | + * requeued: this can happen in sched_yield(), |
309 | * set_user_nice(), and even load_balance() as it moves tasks from runqueue |
310 | * to runqueue. |
311 | * |
312 | @@ -672,73 +563,81 @@ sched_info_switch(struct task_struct *pr |
313 | #define sched_info_switch(t, next) do { } while (0) |
314 | #endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */ |
315 | |
316 | -/* |
317 | - * Adding/removing a task to/from a priority array: |
318 | - */ |
319 | -static void dequeue_task(struct task_struct *p, struct prio_array *array) |
320 | +#if BITS_PER_LONG < 64 |
321 | +static inline void longlimit(unsigned long long *longlong) |
322 | +{ |
323 | + if (*longlong > (1 << 31)) |
324 | + *longlong = 1 << 31; |
325 | +} |
326 | +#else |
327 | +static inline void longlimit(unsigned long long *__unused) |
328 | +{ |
329 | +} |
330 | +#endif |
331 | + |
332 | +/* Get nanosecond clock difference without overflowing unsigned long. */ |
333 | +static unsigned long ns_diff(unsigned long long v1, unsigned long long v2) |
334 | { |
335 | - array->nr_active--; |
336 | - list_del(&p->run_list); |
337 | - if (list_empty(array->queue + p->prio)) |
338 | - __clear_bit(p->prio, array->bitmap); |
339 | + unsigned long long vdiff; |
340 | + if (likely(v1 >= v2)) { |
341 | + vdiff = v1 - v2; |
342 | + longlimit(&vdiff); |
343 | + } else { |
344 | + /* |
345 | + * Rarely the clock appears to go backwards. There should |
346 | + * always be a positive difference so return 1. |
347 | + */ |
348 | + vdiff = 1; |
349 | + } |
350 | + return (unsigned long)vdiff; |
351 | } |
352 | |
353 | -static void enqueue_task(struct task_struct *p, struct prio_array *array) |
354 | +static inline int task_queued(struct task_struct *task) |
355 | { |
356 | - sched_info_queued(p); |
357 | - list_add_tail(&p->run_list, array->queue + p->prio); |
358 | - __set_bit(p->prio, array->bitmap); |
359 | - array->nr_active++; |
360 | - p->array = array; |
361 | + return !list_empty(&task->run_list); |
362 | } |
363 | |
364 | /* |
365 | - * Put task to the end of the run list without the overhead of dequeue |
366 | - * followed by enqueue. |
367 | + * Adding/removing a task to/from a runqueue: |
368 | */ |
369 | -static void requeue_task(struct task_struct *p, struct prio_array *array) |
370 | +static void dequeue_task(struct task_struct *p, struct rq *rq) |
371 | { |
372 | - list_move_tail(&p->run_list, array->queue + p->prio); |
373 | + list_del_init(&p->run_list); |
374 | + if (list_empty(rq->queue + p->prio)) |
375 | + __clear_bit(p->prio, rq->bitmap); |
376 | + p->ns_debit = 0; |
377 | } |
378 | |
379 | -static inline void |
380 | -enqueue_task_head(struct task_struct *p, struct prio_array *array) |
381 | +static void enqueue_task(struct task_struct *p, struct rq *rq) |
382 | { |
383 | - list_add(&p->run_list, array->queue + p->prio); |
384 | - __set_bit(p->prio, array->bitmap); |
385 | - array->nr_active++; |
386 | - p->array = array; |
387 | + list_add_tail(&p->run_list, rq->queue + p->prio); |
388 | + __set_bit(p->prio, rq->bitmap); |
389 | } |
390 | |
391 | /* |
392 | - * __normal_prio - return the priority that is based on the static |
393 | - * priority but is modified by bonuses/penalties. |
394 | - * |
395 | - * We scale the actual sleep average [0 .... MAX_SLEEP_AVG] |
396 | - * into the -5 ... 0 ... +5 bonus/penalty range. |
397 | - * |
398 | - * We use 25% of the full 0...39 priority range so that: |
399 | - * |
400 | - * 1) nice +19 interactive tasks do not preempt nice 0 CPU hogs. |
401 | - * 2) nice -20 CPU hogs do not get preempted by nice 0 tasks. |
402 | - * |
403 | - * Both properties are important to certain workloads. |
404 | + * Put task to the end of the run list without the overhead of dequeue |
405 | + * followed by enqueue. |
406 | */ |
407 | - |
408 | -static inline int __normal_prio(struct task_struct *p) |
409 | +static void requeue_task(struct task_struct *p, struct rq *rq, const int prio) |
410 | { |
411 | - int bonus, prio; |
412 | - |
413 | - bonus = CURRENT_BONUS(p) - MAX_BONUS / 2; |
414 | + list_move_tail(&p->run_list, rq->queue + prio); |
415 | + if (p->prio != prio) { |
416 | + if (list_empty(rq->queue + p->prio)) |
417 | + __clear_bit(p->prio, rq->bitmap); |
418 | + p->prio = prio; |
419 | + __set_bit(prio, rq->bitmap); |
420 | + } |
421 | + p->ns_debit = 0; |
422 | +} |
423 | |
424 | - prio = p->static_prio - bonus; |
425 | - if (prio < MAX_RT_PRIO) |
426 | - prio = MAX_RT_PRIO; |
427 | - if (prio > MAX_PRIO-1) |
428 | - prio = MAX_PRIO-1; |
429 | - return prio; |
430 | +static inline void enqueue_task_head(struct task_struct *p, struct rq *rq) |
431 | +{ |
432 | + list_add(&p->run_list, rq->queue + p->prio); |
433 | + __set_bit(p->prio, rq->bitmap); |
434 | } |
435 | |
436 | +static unsigned int slice(const struct task_struct *p); |
437 | + |
438 | /* |
439 | * To aid in avoiding the subversion of "niceness" due to uneven distribution |
440 | * of tasks with abnormal "nice" values across CPUs the contribution that |
441 | @@ -756,10 +655,9 @@ static inline int __normal_prio(struct t |
442 | #define TIME_SLICE_NICE_ZERO DEF_TIMESLICE |
443 | #define LOAD_WEIGHT(lp) \ |
444 | (((lp) * SCHED_LOAD_SCALE) / TIME_SLICE_NICE_ZERO) |
445 | -#define PRIO_TO_LOAD_WEIGHT(prio) \ |
446 | - LOAD_WEIGHT(static_prio_timeslice(prio)) |
447 | -#define RTPRIO_TO_LOAD_WEIGHT(rp) \ |
448 | - (PRIO_TO_LOAD_WEIGHT(MAX_RT_PRIO) + LOAD_WEIGHT(rp)) |
449 | +#define TASK_LOAD_WEIGHT(p) LOAD_WEIGHT(slice(p)) |
450 | +#define RTPRIO_TO_LOAD_WEIGHT(rp) \ |
451 | + (LOAD_WEIGHT((RR_INTERVAL + 20 + (rp)))) |
452 | |
453 | static void set_load_weight(struct task_struct *p) |
454 | { |
455 | @@ -776,7 +674,7 @@ static void set_load_weight(struct task_ |
456 | #endif |
457 | p->load_weight = RTPRIO_TO_LOAD_WEIGHT(p->rt_priority); |
458 | } else |
459 | - p->load_weight = PRIO_TO_LOAD_WEIGHT(p->static_prio); |
460 | + p->load_weight = TASK_LOAD_WEIGHT(p); |
461 | } |
462 | |
463 | static inline void |
464 | @@ -804,6 +702,182 @@ static inline void dec_nr_running(struct |
465 | } |
466 | |
467 | /* |
468 | + * __activate_task - move a task to the runqueue. |
469 | + */ |
470 | +static inline void __activate_task(struct task_struct *p, struct rq *rq) |
471 | +{ |
472 | + enqueue_task(p, rq); |
473 | + inc_nr_running(p, rq); |
474 | +} |
475 | + |
476 | +/* |
477 | + * __activate_idle_task - move idle task to the _front_ of runqueue. |
478 | + */ |
479 | +static inline void __activate_idle_task(struct task_struct *p, struct rq *rq) |
480 | +{ |
481 | + enqueue_task_head(p, rq); |
482 | + inc_nr_running(p, rq); |
483 | +} |
484 | + |
485 | +/* |
486 | + * Bonus - How much higher than its base priority an interactive task can run. |
487 | + */ |
488 | +static inline unsigned int bonus(const struct task_struct *p) |
489 | +{ |
490 | + return TASK_USER_PRIO(p); |
491 | +} |
492 | + |
493 | +static unsigned int rr_interval(const struct task_struct *p) |
494 | +{ |
495 | + int nice = TASK_NICE(p); |
496 | + |
497 | + if (nice < 0 && !rt_task(p)) |
498 | + return RR_INTERVAL * (20 - nice) / 20; |
499 | + return RR_INTERVAL; |
500 | +} |
501 | + |
502 | +/* |
503 | + * slice - the duration a task runs before getting requeued at its best |
504 | + * priority and has its bonus decremented. |
505 | + */ |
506 | +static unsigned int slice(const struct task_struct *p) |
507 | +{ |
508 | + unsigned int slice, rr; |
509 | + |
510 | + slice = rr = rr_interval(p); |
511 | + if (likely(!rt_task(p))) |
512 | + slice += (39 - TASK_USER_PRIO(p)) * rr; |
513 | + return slice; |
514 | +} |
515 | + |
516 | +/* |
517 | + * We increase our bonus by sleeping more than the time we ran. |
518 | + * The ratio of sleep to run gives us the cpu% that we last ran and determines |
519 | + * the maximum bonus we can acquire. |
520 | + */ |
521 | +static void inc_bonus(struct task_struct *p, unsigned long totalrun, unsigned long sleep) |
522 | +{ |
523 | + unsigned int best_bonus = sleep / (totalrun + 1); |
524 | + |
525 | + if (p->bonus >= best_bonus) |
526 | + return; |
527 | + best_bonus = bonus(p); |
528 | + if (p->bonus < best_bonus) |
529 | + p->bonus++; |
530 | +} |
531 | + |
532 | +static inline void dec_bonus(struct task_struct *p) |
533 | +{ |
534 | + if (p->bonus) |
535 | + p->bonus--; |
536 | +} |
537 | + |
538 | +static inline void slice_overrun(struct task_struct *p) |
539 | +{ |
540 | + unsigned long ns_slice = JIFFIES_TO_NS(p->slice); |
541 | + |
542 | + do { |
543 | + p->totalrun -= ns_slice; |
544 | + dec_bonus(p); |
545 | + } while (unlikely(p->totalrun > ns_slice)); |
546 | +} |
547 | + |
548 | +static inline void continue_slice(struct task_struct *p) |
549 | +{ |
550 | + unsigned long total_run = NS_TO_JIFFIES(p->totalrun); |
551 | + |
552 | + if (unlikely(total_run >= p->slice)) |
553 | + slice_overrun(p); |
554 | + else { |
555 | + unsigned long remainder; |
556 | + |
557 | + p->slice -= total_run; |
558 | + remainder = p->slice % rr_interval(p); |
559 | + if (remainder) |
560 | + p->time_slice = remainder; |
561 | + } |
562 | +} |
563 | + |
564 | +/* |
565 | + * recalc_task_prio - this checks for tasks that have run less than a full |
566 | + * slice and have woken up again soon after, or have just forked a |
567 | + * thread/process and make them continue their old slice instead of starting |
568 | + * a new one at high priority. |
569 | + */ |
570 | +static inline void recalc_task_prio(struct task_struct *p, const unsigned long long now) |
571 | +{ |
572 | + unsigned long sleep_time; |
573 | + |
574 | + /* |
575 | + * If this task has managed to run to its lowest priority then |
576 | + * decrease its bonus and requeue it now at best priority instead |
577 | + * of possibly flagging around lowest priority. Save up any systime |
578 | + * that may affect priority on the next reschedule. |
579 | + */ |
580 | + if (p->slice > p->time_slice && |
581 | + p->slice - NS_TO_JIFFIES(p->totalrun) < p->time_slice) { |
582 | + dec_bonus(p); |
583 | + p->totalrun = 0; |
584 | + return; |
585 | + } |
586 | + |
587 | + /* |
588 | + * Add the total for this last scheduled run (p->runtime) and system |
589 | + * time (p->systime) done on behalf of p to the running total so far |
590 | + * used (p->totalrun). |
591 | + */ |
592 | + p->totalrun += p->runtime + p->systime; |
593 | + sleep_time = ns_diff(now, p->timestamp); |
594 | + |
595 | + if (p->systime > sleep_time || p->flags & PF_FORKED) |
596 | + sleep_time = 0; |
597 | + else { |
598 | + sleep_time -= p->systime; |
599 | + /* |
600 | + * We elevate priority by the amount of time we slept. If we |
601 | + * sleep longer than our running total and have not set the |
602 | + * PF_NONSLEEP flag we gain a bonus. |
603 | + */ |
604 | + if (sleep_time >= p->totalrun) { |
605 | + if (!(p->flags & PF_NONSLEEP)) |
606 | + inc_bonus(p, p->totalrun, sleep_time); |
607 | + p->totalrun = 0; |
608 | + return; |
609 | + } |
610 | + p->totalrun -= sleep_time; |
611 | + } |
612 | + continue_slice(p); |
613 | +} |
614 | + |
615 | +/* |
616 | + * __normal_prio - dynamic priority dependent on bonus. |
617 | + * The priority normally decreases by one each RR_INTERVAL. |
618 | + * As the bonus increases the initial priority starts at a higher "stair" or |
619 | + * priority for longer. |
620 | + */ |
621 | +static inline int __normal_prio(struct task_struct *p) |
622 | +{ |
623 | + int prio; |
624 | + unsigned int full_slice, used_slice = 0; |
625 | + unsigned int best_bonus, rr; |
626 | + |
627 | + full_slice = slice(p); |
628 | + if (full_slice > p->slice) |
629 | + used_slice = full_slice - p->slice; |
630 | + |
631 | + best_bonus = bonus(p); |
632 | + prio = MAX_RT_PRIO + best_bonus; |
633 | + if (!batch_task(p)) |
634 | + prio -= p->bonus; |
635 | + |
636 | + rr = rr_interval(p); |
637 | + prio += used_slice / rr; |
638 | + if (prio > MIN_USER_PRIO) |
639 | + prio = MIN_USER_PRIO; |
640 | + return prio; |
641 | +} |
642 | + |
643 | +/* |
644 | * Calculate the expected normal priority: i.e. priority |
645 | * without taking RT-inheritance into account. Might be |
646 | * boosted by interactivity modifiers. Changes upon fork, |
647 | @@ -842,111 +916,14 @@ static int effective_prio(struct task_st |
648 | } |
649 | |
650 | /* |
651 | - * __activate_task - move a task to the runqueue. |
652 | - */ |
653 | -static void __activate_task(struct task_struct *p, struct rq *rq) |
654 | -{ |
655 | - struct prio_array *target = rq->active; |
656 | - |
657 | - if (batch_task(p)) |
658 | - target = rq->expired; |
659 | - enqueue_task(p, target); |
660 | - inc_nr_running(p, rq); |
661 | -} |
662 | - |
663 | -/* |
664 | - * __activate_idle_task - move idle task to the _front_ of runqueue. |
665 | - */ |
666 | -static inline void __activate_idle_task(struct task_struct *p, struct rq *rq) |
667 | -{ |
668 | - enqueue_task_head(p, rq->active); |
669 | - inc_nr_running(p, rq); |
670 | -} |
671 | - |
672 | -/* |
673 | - * Recalculate p->normal_prio and p->prio after having slept, |
674 | - * updating the sleep-average too: |
675 | - */ |
676 | -static int recalc_task_prio(struct task_struct *p, unsigned long long now) |
677 | -{ |
678 | - /* Caller must always ensure 'now >= p->timestamp' */ |
679 | - unsigned long sleep_time = now - p->timestamp; |
680 | - |
681 | - if (batch_task(p)) |
682 | - sleep_time = 0; |
683 | - |
684 | - if (likely(sleep_time > 0)) { |
685 | - /* |
686 | - * This ceiling is set to the lowest priority that would allow |
687 | - * a task to be reinserted into the active array on timeslice |
688 | - * completion. |
689 | - */ |
690 | - unsigned long ceiling = INTERACTIVE_SLEEP(p); |
691 | - |
692 | - if (p->mm && sleep_time > ceiling && p->sleep_avg < ceiling) { |
693 | - /* |
694 | - * Prevents user tasks from achieving best priority |
695 | - * with one single large enough sleep. |
696 | - */ |
697 | - p->sleep_avg = ceiling; |
698 | - /* |
699 | - * Using INTERACTIVE_SLEEP() as a ceiling places a |
700 | - * nice(0) task 1ms sleep away from promotion, and |
701 | - * gives it 700ms to round-robin with no chance of |
702 | - * being demoted. This is more than generous, so |
703 | - * mark this sleep as non-interactive to prevent the |
704 | - * on-runqueue bonus logic from intervening should |
705 | - * this task not receive cpu immediately. |
706 | - */ |
707 | - p->sleep_type = SLEEP_NONINTERACTIVE; |
708 | - } else { |
709 | - /* |
710 | - * Tasks waking from uninterruptible sleep are |
711 | - * limited in their sleep_avg rise as they |
712 | - * are likely to be waiting on I/O |
713 | - */ |
714 | - if (p->sleep_type == SLEEP_NONINTERACTIVE && p->mm) { |
715 | - if (p->sleep_avg >= ceiling) |
716 | - sleep_time = 0; |
717 | - else if (p->sleep_avg + sleep_time >= |
718 | - ceiling) { |
719 | - p->sleep_avg = ceiling; |
720 | - sleep_time = 0; |
721 | - } |
722 | - } |
723 | - |
724 | - /* |
725 | - * This code gives a bonus to interactive tasks. |
726 | - * |
727 | - * The boost works by updating the 'average sleep time' |
728 | - * value here, based on ->timestamp. The more time a |
729 | - * task spends sleeping, the higher the average gets - |
730 | - * and the higher the priority boost gets as well. |
731 | - */ |
732 | - p->sleep_avg += sleep_time; |
733 | - |
734 | - } |
735 | - if (p->sleep_avg > NS_MAX_SLEEP_AVG) |
736 | - p->sleep_avg = NS_MAX_SLEEP_AVG; |
737 | - } |
738 | - |
739 | - return effective_prio(p); |
740 | -} |
741 | - |
742 | -/* |
743 | * activate_task - move a task to the runqueue and do priority recalculation |
744 | * |
745 | - * Update all the scheduling statistics stuff. (sleep average |
746 | - * calculation, priority modifiers, etc.) |
747 | */ |
748 | static void activate_task(struct task_struct *p, struct rq *rq, int local) |
749 | { |
750 | - unsigned long long now; |
751 | - |
752 | - if (rt_task(p)) |
753 | - goto out; |
754 | + unsigned long long now = sched_clock(); |
755 | + unsigned long rr = rr_interval(p); |
756 | |
757 | - now = sched_clock(); |
758 | #ifdef CONFIG_SMP |
759 | if (!local) { |
760 | /* Compensate for drifting sched_clock */ |
761 | @@ -967,32 +944,15 @@ static void activate_task(struct task_st |
762 | (now - p->timestamp) >> 20); |
763 | } |
764 | |
765 | - p->prio = recalc_task_prio(p, now); |
766 | - |
767 | - /* |
768 | - * This checks to make sure it's not an uninterruptible task |
769 | - * that is now waking up. |
770 | - */ |
771 | - if (p->sleep_type == SLEEP_NORMAL) { |
772 | - /* |
773 | - * Tasks which were woken up by interrupts (ie. hw events) |
774 | - * are most likely of interactive nature. So we give them |
775 | - * the credit of extending their sleep time to the period |
776 | - * of time they spend on the runqueue, waiting for execution |
777 | - * on a CPU, first time around: |
778 | - */ |
779 | - if (in_interrupt()) |
780 | - p->sleep_type = SLEEP_INTERRUPTED; |
781 | - else { |
782 | - /* |
783 | - * Normal first-time wakeups get a credit too for |
784 | - * on-runqueue time, but it will be weighted down: |
785 | - */ |
786 | - p->sleep_type = SLEEP_INTERACTIVE; |
787 | - } |
788 | + p->slice = slice(p); |
789 | + p->time_slice = p->slice % rr ? : rr; |
790 | + if (!rt_task(p)) { |
791 | + recalc_task_prio(p, now); |
792 | + p->prio = effective_prio(p); |
793 | + p->systime = 0; |
794 | + p->flags &= ~(PF_FORKED | PF_NONSLEEP); |
795 | } |
796 | p->timestamp = now; |
797 | -out: |
798 | __activate_task(p, rq); |
799 | } |
800 | |
801 | @@ -1002,8 +962,7 @@ out: |
802 | static void deactivate_task(struct task_struct *p, struct rq *rq) |
803 | { |
804 | dec_nr_running(p, rq); |
805 | - dequeue_task(p, p->array); |
806 | - p->array = NULL; |
807 | + dequeue_task(p, rq); |
808 | } |
809 | |
810 | /* |
811 | @@ -1085,7 +1044,7 @@ migrate_task(struct task_struct *p, int |
812 | * If the task is not on a runqueue (and not running), then |
813 | * it is sufficient to simply update the task's cpu field. |
814 | */ |
815 | - if (!p->array && !task_running(rq, p)) { |
816 | + if (!task_queued(p) && !task_running(rq, p)) { |
817 | set_task_cpu(p, dest_cpu); |
818 | return 0; |
819 | } |
820 | @@ -1116,7 +1075,7 @@ void wait_task_inactive(struct task_stru |
821 | repeat: |
822 | rq = task_rq_lock(p, &flags); |
823 | /* Must be off runqueue entirely, not preempted. */ |
824 | - if (unlikely(p->array || task_running(rq, p))) { |
825 | + if (unlikely(task_queued(p) || task_running(rq, p))) { |
826 | /* If it's preempted, we yield. It could be a while. */ |
827 | preempted = !task_running(rq, p); |
828 | task_rq_unlock(rq, &flags); |
829 | @@ -1381,6 +1340,16 @@ static inline int wake_idle(int cpu, str |
830 | } |
831 | #endif |
832 | |
833 | +/* |
834 | + * Check to see if p preempts rq->curr and resched if it does. |
835 | + */ |
836 | +static inline void preempt(const struct task_struct *p, struct rq *rq) |
837 | +{ |
838 | + if (TASK_PREEMPTS_CURR(p, rq)) |
839 | + resched_task(rq->curr); |
840 | +} |
841 | + |
842 | + |
843 | /*** |
844 | * try_to_wake_up - wake up a thread |
845 | * @p: the to-be-woken-up thread |
846 | @@ -1412,7 +1381,7 @@ static int try_to_wake_up(struct task_st |
847 | if (!(old_state & state)) |
848 | goto out; |
849 | |
850 | - if (p->array) |
851 | + if (task_queued(p)) |
852 | goto out_running; |
853 | |
854 | cpu = task_cpu(p); |
855 | @@ -1505,7 +1474,7 @@ out_set_cpu: |
856 | old_state = p->state; |
857 | if (!(old_state & state)) |
858 | goto out; |
859 | - if (p->array) |
860 | + if (task_queued(p)) |
861 | goto out_running; |
862 | |
863 | this_cpu = smp_processor_id(); |
864 | @@ -1514,25 +1483,9 @@ out_set_cpu: |
865 | |
866 | out_activate: |
867 | #endif /* CONFIG_SMP */ |
868 | - if (old_state == TASK_UNINTERRUPTIBLE) { |
869 | + if (old_state == TASK_UNINTERRUPTIBLE) |
870 | rq->nr_uninterruptible--; |
871 | - /* |
872 | - * Tasks on involuntary sleep don't earn |
873 | - * sleep_avg beyond just interactive state. |
874 | - */ |
875 | - p->sleep_type = SLEEP_NONINTERACTIVE; |
876 | - } else |
877 | - |
878 | - /* |
879 | - * Tasks that have marked their sleep as noninteractive get |
880 | - * woken up with their sleep average not weighted in an |
881 | - * interactive way. |
882 | - */ |
883 | - if (old_state & TASK_NONINTERACTIVE) |
884 | - p->sleep_type = SLEEP_NONINTERACTIVE; |
885 | - |
886 | |
887 | - activate_task(p, rq, cpu == this_cpu); |
888 | /* |
889 | * Sync wakeups (i.e. those types of wakeups where the waker |
890 | * has indicated that it will leave the CPU in short order) |
891 | @@ -1541,10 +1494,9 @@ out_activate: |
892 | * the waker guarantees that the freshly woken up task is going |
893 | * to be considered on this CPU.) |
894 | */ |
895 | - if (!sync || cpu != this_cpu) { |
896 | - if (TASK_PREEMPTS_CURR(p, rq)) |
897 | - resched_task(rq->curr); |
898 | - } |
899 | + activate_task(p, rq, cpu == this_cpu); |
900 | + if (!sync || cpu != this_cpu) |
901 | + preempt(p, rq); |
902 | success = 1; |
903 | |
904 | out_running: |
905 | @@ -1595,7 +1547,6 @@ void fastcall sched_fork(struct task_str |
906 | p->prio = current->normal_prio; |
907 | |
908 | INIT_LIST_HEAD(&p->run_list); |
909 | - p->array = NULL; |
910 | #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) |
911 | if (unlikely(sched_info_on())) |
912 | memset(&p->sched_info, 0, sizeof(p->sched_info)); |
913 | @@ -1607,30 +1558,6 @@ void fastcall sched_fork(struct task_str |
914 | /* Want to start with kernel preemption disabled. */ |
915 | task_thread_info(p)->preempt_count = 1; |
916 | #endif |
917 | - /* |
918 | - * Share the timeslice between parent and child, thus the |
919 | - * total amount of pending timeslices in the system doesn't change, |
920 | - * resulting in more scheduling fairness. |
921 | - */ |
922 | - local_irq_disable(); |
923 | - p->time_slice = (current->time_slice + 1) >> 1; |
924 | - /* |
925 | - * The remainder of the first timeslice might be recovered by |
926 | - * the parent if the child exits early enough. |
927 | - */ |
928 | - p->first_time_slice = 1; |
929 | - current->time_slice >>= 1; |
930 | - p->timestamp = sched_clock(); |
931 | - if (unlikely(!current->time_slice)) { |
932 | - /* |
933 | - * This case is rare, it happens when the parent has only |
934 | - * a single jiffy left from its timeslice. Taking the |
935 | - * runqueue lock is not a problem. |
936 | - */ |
937 | - current->time_slice = 1; |
938 | - task_running_tick(cpu_rq(cpu), current); |
939 | - } |
940 | - local_irq_enable(); |
941 | put_cpu(); |
942 | } |
943 | |
944 | @@ -1652,38 +1579,20 @@ void fastcall wake_up_new_task(struct ta |
945 | this_cpu = smp_processor_id(); |
946 | cpu = task_cpu(p); |
947 | |
948 | - /* |
949 | - * We decrease the sleep average of forking parents |
950 | - * and children as well, to keep max-interactive tasks |
951 | - * from forking tasks that are max-interactive. The parent |
952 | - * (current) is done further down, under its lock. |
953 | - */ |
954 | - p->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(p) * |
955 | - CHILD_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS); |
956 | - |
957 | - p->prio = effective_prio(p); |
958 | + /* Forked process gets no bonus to prevent fork bombs. */ |
959 | + p->bonus = 0; |
960 | + current->flags |= PF_FORKED; |
961 | |
962 | if (likely(cpu == this_cpu)) { |
963 | + activate_task(p, rq, 1); |
964 | if (!(clone_flags & CLONE_VM)) { |
965 | /* |
966 | * The VM isn't cloned, so we're in a good position to |
967 | * do child-runs-first in anticipation of an exec. This |
968 | * usually avoids a lot of COW overhead. |
969 | */ |
970 | - if (unlikely(!current->array)) |
971 | - __activate_task(p, rq); |
972 | - else { |
973 | - p->prio = current->prio; |
974 | - p->normal_prio = current->normal_prio; |
975 | - list_add_tail(&p->run_list, ¤t->run_list); |
976 | - p->array = current->array; |
977 | - p->array->nr_active++; |
978 | - inc_nr_running(p, rq); |
979 | - } |
980 | set_need_resched(); |
981 | - } else |
982 | - /* Run child last */ |
983 | - __activate_task(p, rq); |
984 | + } |
985 | /* |
986 | * We skip the following code due to cpu == this_cpu |
987 | * |
988 | @@ -1700,53 +1609,19 @@ void fastcall wake_up_new_task(struct ta |
989 | */ |
990 | p->timestamp = (p->timestamp - this_rq->most_recent_timestamp) |
991 | + rq->most_recent_timestamp; |
992 | - __activate_task(p, rq); |
993 | - if (TASK_PREEMPTS_CURR(p, rq)) |
994 | - resched_task(rq->curr); |
995 | + activate_task(p, rq, 0); |
996 | + preempt(p, rq); |
997 | |
998 | /* |
999 | * Parent and child are on different CPUs, now get the |
1000 | - * parent runqueue to update the parent's ->sleep_avg: |
1001 | + * parent runqueue to update the parent's ->flags: |
1002 | */ |
1003 | task_rq_unlock(rq, &flags); |
1004 | this_rq = task_rq_lock(current, &flags); |
1005 | } |
1006 | - current->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(current) * |
1007 | - PARENT_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS); |
1008 | task_rq_unlock(this_rq, &flags); |
1009 | } |
1010 | |
1011 | -/* |
1012 | - * Potentially available exiting-child timeslices are |
1013 | - * retrieved here - this way the parent does not get |
1014 | - * penalized for creating too many threads. |
1015 | - * |
1016 | - * (this cannot be used to 'generate' timeslices |
1017 | - * artificially, because any timeslice recovered here |
1018 | - * was given away by the parent in the first place.) |
1019 | - */ |
1020 | -void fastcall sched_exit(struct task_struct *p) |
1021 | -{ |
1022 | - unsigned long flags; |
1023 | - struct rq *rq; |
1024 | - |
1025 | - /* |
1026 | - * If the child was a (relative-) CPU hog then decrease |
1027 | - * the sleep_avg of the parent as well. |
1028 | - */ |
1029 | - rq = task_rq_lock(p->parent, &flags); |
1030 | - if (p->first_time_slice && task_cpu(p) == task_cpu(p->parent)) { |
1031 | - p->parent->time_slice += p->time_slice; |
1032 | - if (unlikely(p->parent->time_slice > task_timeslice(p))) |
1033 | - p->parent->time_slice = task_timeslice(p); |
1034 | - } |
1035 | - if (p->sleep_avg < p->parent->sleep_avg) |
1036 | - p->parent->sleep_avg = p->parent->sleep_avg / |
1037 | - (EXIT_WEIGHT + 1) * EXIT_WEIGHT + p->sleep_avg / |
1038 | - (EXIT_WEIGHT + 1); |
1039 | - task_rq_unlock(rq, &flags); |
1040 | -} |
1041 | - |
1042 | /** |
1043 | * prepare_task_switch - prepare to switch tasks |
1044 | * @rq: the runqueue preparing to switch |
1045 | @@ -2068,23 +1943,21 @@ void sched_exec(void) |
1046 | * pull_task - move a task from a remote runqueue to the local runqueue. |
1047 | * Both runqueues must be locked. |
1048 | */ |
1049 | -static void pull_task(struct rq *src_rq, struct prio_array *src_array, |
1050 | - struct task_struct *p, struct rq *this_rq, |
1051 | - struct prio_array *this_array, int this_cpu) |
1052 | +static void pull_task(struct rq *src_rq, struct task_struct *p, |
1053 | + struct rq *this_rq, int this_cpu) |
1054 | { |
1055 | - dequeue_task(p, src_array); |
1056 | + dequeue_task(p, src_rq); |
1057 | dec_nr_running(p, src_rq); |
1058 | set_task_cpu(p, this_cpu); |
1059 | inc_nr_running(p, this_rq); |
1060 | - enqueue_task(p, this_array); |
1061 | + enqueue_task(p, this_rq); |
1062 | p->timestamp = (p->timestamp - src_rq->most_recent_timestamp) |
1063 | + this_rq->most_recent_timestamp; |
1064 | /* |
1065 | * Note that idle threads have a prio of MAX_PRIO, for this test |
1066 | * to be always true for them. |
1067 | */ |
1068 | - if (TASK_PREEMPTS_CURR(p, this_rq)) |
1069 | - resched_task(this_rq->curr); |
1070 | + preempt(p, this_rq); |
1071 | } |
1072 | |
1073 | /* |
1074 | @@ -2127,8 +2000,6 @@ int can_migrate_task(struct task_struct |
1075 | return 1; |
1076 | } |
1077 | |
1078 | -#define rq_best_prio(rq) min((rq)->curr->prio, (rq)->best_expired_prio) |
1079 | - |
1080 | /* |
1081 | * move_tasks tries to move up to max_nr_move tasks and max_load_move weighted |
1082 | * load from busiest to this_rq, as part of a balancing operation within |
1083 | @@ -2143,7 +2014,6 @@ static int move_tasks(struct rq *this_rq |
1084 | { |
1085 | int idx, pulled = 0, pinned = 0, this_best_prio, best_prio, |
1086 | best_prio_seen, skip_for_load; |
1087 | - struct prio_array *array, *dst_array; |
1088 | struct list_head *head, *curr; |
1089 | struct task_struct *tmp; |
1090 | long rem_load_move; |
1091 | @@ -2153,8 +2023,8 @@ static int move_tasks(struct rq *this_rq |
1092 | |
1093 | rem_load_move = max_load_move; |
1094 | pinned = 1; |
1095 | - this_best_prio = rq_best_prio(this_rq); |
1096 | - best_prio = rq_best_prio(busiest); |
1097 | + this_best_prio = this_rq->curr->prio; |
1098 | + best_prio = busiest->curr->prio; |
1099 | /* |
1100 | * Enable handling of the case where there is more than one task |
1101 | * with the best priority. If the current running task is one |
1102 | @@ -2164,38 +2034,17 @@ static int move_tasks(struct rq *this_rq |
1103 | */ |
1104 | best_prio_seen = best_prio == busiest->curr->prio; |
1105 | |
1106 | - /* |
1107 | - * We first consider expired tasks. Those will likely not be |
1108 | - * executed in the near future, and they are most likely to |
1109 | - * be cache-cold, thus switching CPUs has the least effect |
1110 | - * on them. |
1111 | - */ |
1112 | - if (busiest->expired->nr_active) { |
1113 | - array = busiest->expired; |
1114 | - dst_array = this_rq->expired; |
1115 | - } else { |
1116 | - array = busiest->active; |
1117 | - dst_array = this_rq->active; |
1118 | - } |
1119 | - |
1120 | -new_array: |
1121 | /* Start searching at priority 0: */ |
1122 | idx = 0; |
1123 | skip_bitmap: |
1124 | if (!idx) |
1125 | - idx = sched_find_first_bit(array->bitmap); |
1126 | + idx = sched_find_first_bit(busiest->bitmap); |
1127 | else |
1128 | - idx = find_next_bit(array->bitmap, MAX_PRIO, idx); |
1129 | - if (idx >= MAX_PRIO) { |
1130 | - if (array == busiest->expired && busiest->active->nr_active) { |
1131 | - array = busiest->active; |
1132 | - dst_array = this_rq->active; |
1133 | - goto new_array; |
1134 | - } |
1135 | + idx = find_next_bit(busiest->bitmap, MAX_PRIO, idx); |
1136 | + if (idx >= MAX_PRIO) |
1137 | goto out; |
1138 | - } |
1139 | |
1140 | - head = array->queue + idx; |
1141 | + head = busiest->queue + idx; |
1142 | curr = head->prev; |
1143 | skip_queue: |
1144 | tmp = list_entry(curr, struct task_struct, run_list); |
1145 | @@ -2220,7 +2069,7 @@ skip_queue: |
1146 | goto skip_bitmap; |
1147 | } |
1148 | |
1149 | - pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu); |
1150 | + pull_task(busiest, tmp, this_rq, this_cpu); |
1151 | pulled++; |
1152 | rem_load_move -= tmp->load_weight; |
1153 | |
1154 | @@ -3036,27 +2885,6 @@ unsigned long long current_sched_time(co |
1155 | } |
1156 | |
1157 | /* |
1158 | - * We place interactive tasks back into the active array, if possible. |
1159 | - * |
1160 | - * To guarantee that this does not starve expired tasks we ignore the |
1161 | - * interactivity of a task if the first expired task had to wait more |
1162 | - * than a 'reasonable' amount of time. This deadline timeout is |
1163 | - * load-dependent, as the frequency of array switched decreases with |
1164 | - * increasing number of running tasks. We also ignore the interactivity |
1165 | - * if a better static_prio task has expired: |
1166 | - */ |
1167 | -static inline int expired_starving(struct rq *rq) |
1168 | -{ |
1169 | - if (rq->curr->static_prio > rq->best_expired_prio) |
1170 | - return 1; |
1171 | - if (!STARVATION_LIMIT || !rq->expired_timestamp) |
1172 | - return 0; |
1173 | - if (jiffies - rq->expired_timestamp > STARVATION_LIMIT * rq->nr_running) |
1174 | - return 1; |
1175 | - return 0; |
1176 | -} |
1177 | - |
1178 | -/* |
1179 | * Account user cpu time to a process. |
1180 | * @p: the process that the cpu time gets accounted to |
1181 | * @hardirq_offset: the offset to subtract from hardirq_count() |
1182 | @@ -3104,6 +2932,7 @@ void account_system_time(struct task_str |
1183 | cpustat->iowait = cputime64_add(cpustat->iowait, tmp); |
1184 | else |
1185 | cpustat->idle = cputime64_add(cpustat->idle, tmp); |
1186 | + p->systime += NSJIFFY; |
1187 | /* Account for system time used */ |
1188 | acct_update_integrals(p); |
1189 | } |
1190 | @@ -3129,76 +2958,49 @@ void account_steal_time(struct task_stru |
1191 | cpustat->steal = cputime64_add(cpustat->steal, tmp); |
1192 | } |
1193 | |
1194 | +static void time_slice_expired(struct task_struct *p, struct rq *rq) |
1195 | +{ |
1196 | + set_tsk_need_resched(p); |
1197 | + p->time_slice = rr_interval(p); |
1198 | + requeue_task(p, rq, effective_prio(p)); |
1199 | +} |
1200 | + |
1201 | static void task_running_tick(struct rq *rq, struct task_struct *p) |
1202 | { |
1203 | - if (p->array != rq->active) { |
1204 | + unsigned long debit; |
1205 | + |
1206 | + if (unlikely(!task_queued(p))) { |
1207 | /* Task has expired but was not scheduled yet */ |
1208 | set_tsk_need_resched(p); |
1209 | return; |
1210 | } |
1211 | + /* SCHED_FIFO tasks never run out of timeslice. */ |
1212 | + if (unlikely(p->policy == SCHED_FIFO)) |
1213 | + return; |
1214 | + |
1215 | spin_lock(&rq->lock); |
1216 | + debit = ns_diff(rq->most_recent_timestamp, p->timestamp); |
1217 | + p->ns_debit += debit; |
1218 | + if (p->ns_debit < NSJIFFY) |
1219 | + goto out_unlock; |
1220 | + p->ns_debit %= NSJIFFY; |
1221 | /* |
1222 | - * The task was running during this tick - update the |
1223 | - * time slice counter. Note: we do not update a thread's |
1224 | - * priority until it either goes to sleep or uses up its |
1225 | - * timeslice. This makes it possible for interactive tasks |
1226 | - * to use up their timeslices at their highest priority levels. |
1227 | + * Tasks lose bonus each time they use up a full slice(). |
1228 | */ |
1229 | - if (rt_task(p)) { |
1230 | - /* |
1231 | - * RR tasks need a special form of timeslice management. |
1232 | - * FIFO tasks have no timeslices. |
1233 | - */ |
1234 | - if ((p->policy == SCHED_RR) && !--p->time_slice) { |
1235 | - p->time_slice = task_timeslice(p); |
1236 | - p->first_time_slice = 0; |
1237 | - set_tsk_need_resched(p); |
1238 | - |
1239 | - /* put it at the end of the queue: */ |
1240 | - requeue_task(p, rq->active); |
1241 | - } |
1242 | + if (!--p->slice) { |
1243 | + dec_bonus(p); |
1244 | + p->totalrun = 0; |
1245 | + p->slice = slice(p); |
1246 | + time_slice_expired(p, rq); |
1247 | goto out_unlock; |
1248 | } |
1249 | + /* |
1250 | + * Tasks that run out of time_slice but still have slice left get |
1251 | + * requeued with a lower priority && RR_INTERVAL time_slice. |
1252 | + */ |
1253 | if (!--p->time_slice) { |
1254 | - dequeue_task(p, rq->active); |
1255 | - set_tsk_need_resched(p); |
1256 | - p->prio = effective_prio(p); |
1257 | - p->time_slice = task_timeslice(p); |
1258 | - p->first_time_slice = 0; |
1259 | - |
1260 | - if (!rq->expired_timestamp) |
1261 | - rq->expired_timestamp = jiffies; |
1262 | - if (!TASK_INTERACTIVE(p) || expired_starving(rq)) { |
1263 | - enqueue_task(p, rq->expired); |
1264 | - if (p->static_prio < rq->best_expired_prio) |
1265 | - rq->best_expired_prio = p->static_prio; |
1266 | - } else |
1267 | - enqueue_task(p, rq->active); |
1268 | - } else { |
1269 | - /* |
1270 | - * Prevent a too long timeslice allowing a task to monopolize |
1271 | - * the CPU. We do this by splitting up the timeslice into |
1272 | - * smaller pieces. |
1273 | - * |
1274 | - * Note: this does not mean the task's timeslices expire or |
1275 | - * get lost in any way, they just might be preempted by |
1276 | - * another task of equal priority. (one with higher |
1277 | - * priority would have preempted this task already.) We |
1278 | - * requeue this task to the end of the list on this priority |
1279 | - * level, which is in essence a round-robin of tasks with |
1280 | - * equal priority. |
1281 | - * |
1282 | - * This only applies to tasks in the interactive |
1283 | - * delta range with at least TIMESLICE_GRANULARITY to requeue. |
1284 | - */ |
1285 | - if (TASK_INTERACTIVE(p) && !((task_timeslice(p) - |
1286 | - p->time_slice) % TIMESLICE_GRANULARITY(p)) && |
1287 | - (p->time_slice >= TIMESLICE_GRANULARITY(p)) && |
1288 | - (p->array == rq->active)) { |
1289 | - |
1290 | - requeue_task(p, rq->active); |
1291 | - set_tsk_need_resched(p); |
1292 | - } |
1293 | + time_slice_expired(p, rq); |
1294 | + goto out_unlock; |
1295 | } |
1296 | out_unlock: |
1297 | spin_unlock(&rq->lock); |
1298 | @@ -3207,9 +3009,6 @@ out_unlock: |
1299 | /* |
1300 | * This function gets called by the timer code, with HZ frequency. |
1301 | * We call it with interrupts disabled. |
1302 | - * |
1303 | - * It also gets called by the fork code, when changing the parent's |
1304 | - * timeslices. |
1305 | */ |
1306 | void scheduler_tick(void) |
1307 | { |
1308 | @@ -3273,13 +3072,13 @@ static void wake_sleeping_dependent(int |
1309 | |
1310 | /* |
1311 | * number of 'lost' timeslices this task wont be able to fully |
1312 | - * utilize, if another task runs on a sibling. This models the |
1313 | + * utilise, if another task runs on a sibling. This models the |
1314 | * slowdown effect of other tasks running on siblings: |
1315 | */ |
1316 | static inline unsigned long |
1317 | smt_slice(struct task_struct *p, struct sched_domain *sd) |
1318 | { |
1319 | - return p->time_slice * (100 - sd->per_cpu_gain) / 100; |
1320 | + return p->slice * (100 - sd->per_cpu_gain) / 100; |
1321 | } |
1322 | |
1323 | /* |
1324 | @@ -3343,7 +3142,7 @@ dependent_sleeper(int this_cpu, struct r |
1325 | } else { |
1326 | if (smt_curr->static_prio < p->static_prio && |
1327 | !TASK_PREEMPTS_CURR(p, smt_rq) && |
1328 | - smt_slice(smt_curr, sd) > task_timeslice(p)) |
1329 | + smt_slice(smt_curr, sd) > slice(p)) |
1330 | ret = 1; |
1331 | } |
1332 | unlock: |
1333 | @@ -3400,25 +3199,18 @@ EXPORT_SYMBOL(sub_preempt_count); |
1334 | |
1335 | #endif |
1336 | |
1337 | -static inline int interactive_sleep(enum sleep_type sleep_type) |
1338 | -{ |
1339 | - return (sleep_type == SLEEP_INTERACTIVE || |
1340 | - sleep_type == SLEEP_INTERRUPTED); |
1341 | -} |
1342 | - |
1343 | /* |
1344 | * schedule() is the main scheduler function. |
1345 | */ |
1346 | asmlinkage void __sched schedule(void) |
1347 | { |
1348 | struct task_struct *prev, *next; |
1349 | - struct prio_array *array; |
1350 | struct list_head *queue; |
1351 | unsigned long long now; |
1352 | - unsigned long run_time; |
1353 | - int cpu, idx, new_prio; |
1354 | long *switch_count; |
1355 | + unsigned long debit; |
1356 | struct rq *rq; |
1357 | + int cpu, idx; |
1358 | |
1359 | /* |
1360 | * Test if we are atomic. Since do_exit() needs to call into |
1361 | @@ -3454,20 +3246,11 @@ need_resched_nonpreemptible: |
1362 | |
1363 | schedstat_inc(rq, sched_cnt); |
1364 | now = sched_clock(); |
1365 | - if (likely((long long)(now - prev->timestamp) < NS_MAX_SLEEP_AVG)) { |
1366 | - run_time = now - prev->timestamp; |
1367 | - if (unlikely((long long)(now - prev->timestamp) < 0)) |
1368 | - run_time = 0; |
1369 | - } else |
1370 | - run_time = NS_MAX_SLEEP_AVG; |
1371 | - |
1372 | - /* |
1373 | - * Tasks charged proportionately less run_time at high sleep_avg to |
1374 | - * delay them losing their interactive status |
1375 | - */ |
1376 | - run_time /= (CURRENT_BONUS(prev) ? : 1); |
1377 | |
1378 | spin_lock_irq(&rq->lock); |
1379 | + prev->runtime = ns_diff(now, prev->timestamp); |
1380 | + debit = ns_diff(now, rq->most_recent_timestamp) % NSJIFFY; |
1381 | + prev->ns_debit += debit; |
1382 | |
1383 | switch_count = &prev->nivcsw; |
1384 | if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { |
1385 | @@ -3476,8 +3259,10 @@ need_resched_nonpreemptible: |
1386 | unlikely(signal_pending(prev)))) |
1387 | prev->state = TASK_RUNNING; |
1388 | else { |
1389 | - if (prev->state == TASK_UNINTERRUPTIBLE) |
1390 | + if (prev->state == TASK_UNINTERRUPTIBLE) { |
1391 | + prev->flags |= PF_NONSLEEP; |
1392 | rq->nr_uninterruptible++; |
1393 | + } |
1394 | deactivate_task(prev, rq); |
1395 | } |
1396 | } |
1397 | @@ -3487,62 +3272,28 @@ need_resched_nonpreemptible: |
1398 | idle_balance(cpu, rq); |
1399 | if (!rq->nr_running) { |
1400 | next = rq->idle; |
1401 | - rq->expired_timestamp = 0; |
1402 | wake_sleeping_dependent(cpu); |
1403 | goto switch_tasks; |
1404 | } |
1405 | } |
1406 | |
1407 | - array = rq->active; |
1408 | - if (unlikely(!array->nr_active)) { |
1409 | - /* |
1410 | - * Switch the active and expired arrays. |
1411 | - */ |
1412 | - schedstat_inc(rq, sched_switch); |
1413 | - rq->active = rq->expired; |
1414 | - rq->expired = array; |
1415 | - array = rq->active; |
1416 | - rq->expired_timestamp = 0; |
1417 | - rq->best_expired_prio = MAX_PRIO; |
1418 | - } |
1419 | - |
1420 | - idx = sched_find_first_bit(array->bitmap); |
1421 | - queue = array->queue + idx; |
1422 | + idx = sched_find_first_bit(rq->bitmap); |
1423 | + queue = rq->queue + idx; |
1424 | next = list_entry(queue->next, struct task_struct, run_list); |
1425 | |
1426 | - if (!rt_task(next) && interactive_sleep(next->sleep_type)) { |
1427 | - unsigned long long delta = now - next->timestamp; |
1428 | - if (unlikely((long long)(now - next->timestamp) < 0)) |
1429 | - delta = 0; |
1430 | - |
1431 | - if (next->sleep_type == SLEEP_INTERACTIVE) |
1432 | - delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128; |
1433 | - |
1434 | - array = next->array; |
1435 | - new_prio = recalc_task_prio(next, next->timestamp + delta); |
1436 | - |
1437 | - if (unlikely(next->prio != new_prio)) { |
1438 | - dequeue_task(next, array); |
1439 | - next->prio = new_prio; |
1440 | - enqueue_task(next, array); |
1441 | - } |
1442 | - } |
1443 | - next->sleep_type = SLEEP_NORMAL; |
1444 | if (dependent_sleeper(cpu, rq, next)) |
1445 | next = rq->idle; |
1446 | + else { |
1447 | + prefetch(next); |
1448 | + prefetch_stack(next); |
1449 | + } |
1450 | switch_tasks: |
1451 | if (next == rq->idle) |
1452 | schedstat_inc(rq, sched_goidle); |
1453 | - prefetch(next); |
1454 | - prefetch_stack(next); |
1455 | clear_tsk_need_resched(prev); |
1456 | rcu_qsctr_inc(task_cpu(prev)); |
1457 | |
1458 | update_cpu_clock(prev, rq, now); |
1459 | - |
1460 | - prev->sleep_avg -= run_time; |
1461 | - if ((long)prev->sleep_avg <= 0) |
1462 | - prev->sleep_avg = 0; |
1463 | prev->timestamp = prev->last_ran = now; |
1464 | |
1465 | sched_info_switch(prev, next); |
1466 | @@ -3978,29 +3729,21 @@ EXPORT_SYMBOL(sleep_on_timeout); |
1467 | */ |
1468 | void rt_mutex_setprio(struct task_struct *p, int prio) |
1469 | { |
1470 | - struct prio_array *array; |
1471 | unsigned long flags; |
1472 | + int queued, oldprio; |
1473 | struct rq *rq; |
1474 | - int oldprio; |
1475 | |
1476 | BUG_ON(prio < 0 || prio > MAX_PRIO); |
1477 | |
1478 | rq = task_rq_lock(p, &flags); |
1479 | |
1480 | oldprio = p->prio; |
1481 | - array = p->array; |
1482 | - if (array) |
1483 | - dequeue_task(p, array); |
1484 | + if ((queued = task_queued(p))) |
1485 | + dequeue_task(p, rq); |
1486 | p->prio = prio; |
1487 | |
1488 | - if (array) { |
1489 | - /* |
1490 | - * If changing to an RT priority then queue it |
1491 | - * in the active array! |
1492 | - */ |
1493 | - if (rt_task(p)) |
1494 | - array = rq->active; |
1495 | - enqueue_task(p, array); |
1496 | + if (queued) { |
1497 | + enqueue_task(p, rq); |
1498 | /* |
1499 | * Reschedule if we are currently running on this runqueue and |
1500 | * our priority decreased, or if we are not currently running on |
1501 | @@ -4009,8 +3752,8 @@ void rt_mutex_setprio(struct task_struct |
1502 | if (task_running(rq, p)) { |
1503 | if (p->prio > oldprio) |
1504 | resched_task(rq->curr); |
1505 | - } else if (TASK_PREEMPTS_CURR(p, rq)) |
1506 | - resched_task(rq->curr); |
1507 | + } else |
1508 | + preempt(p, rq); |
1509 | } |
1510 | task_rq_unlock(rq, &flags); |
1511 | } |
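The shape of rt_mutex_setprio() above (dequeue if queued, change the priority, re-enqueue, consider preemption) recurs in nearly every hunk that follows. Condensed into one hypothetical helper, with the task_running() special case of the real function left out for brevity:

	static void reprioritize(struct task_struct *p, struct rq *rq, int prio)
	{
		int queued = task_queued(p);

		if (queued)
			dequeue_task(p, rq);	/* off the single array */
		p->prio = prio;
		if (queued) {
			enqueue_task(p, rq);	/* on again at the new level */
			preempt(p, rq);		/* resched rq->curr if outranked */
		}
	}

The mainline special case that forced RT tasks into the active array disappears because there is only one array for enqueue_task() to use.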
1512 | @@ -4019,8 +3762,7 @@ void rt_mutex_setprio(struct task_struct |
1513 | |
1514 | void set_user_nice(struct task_struct *p, long nice) |
1515 | { |
1516 | - struct prio_array *array; |
1517 | - int old_prio, delta; |
1518 | +	int queued, old_prio, delta;
1519 | unsigned long flags; |
1520 | struct rq *rq; |
1521 | |
1522 | @@ -4041,20 +3783,21 @@ void set_user_nice(struct task_struct *p |
1523 | p->static_prio = NICE_TO_PRIO(nice); |
1524 | goto out_unlock; |
1525 | } |
1526 | - array = p->array; |
1527 | - if (array) { |
1528 | - dequeue_task(p, array); |
1529 | + if ((queued = task_queued(p))) { |
1530 | + dequeue_task(p, rq); |
1531 | dec_raw_weighted_load(rq, p); |
1532 | } |
1533 | |
1534 | p->static_prio = NICE_TO_PRIO(nice); |
1535 | set_load_weight(p); |
1536 | old_prio = p->prio; |
1537 | + if (p->bonus > bonus(p)) |
1538 | +		p->bonus = bonus(p);
1539 | p->prio = effective_prio(p); |
1540 | delta = p->prio - old_prio; |
1541 | |
1542 | - if (array) { |
1543 | - enqueue_task(p, array); |
1544 | + if (queued) { |
1545 | + enqueue_task(p, rq); |
1546 | inc_raw_weighted_load(rq, p); |
1547 | /* |
1548 | * If the task increased its priority or is running and |
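The two added bonus lines cap any interactivity bonus accrued at the old nice level. Assuming bonus(p), defined elsewhere in the patch, returns the largest bonus the task may hold at its current static priority, the pair is simply a clamp:

	p->bonus = min(p->bonus, bonus(p));	/* same effect as the two lines above */

Without the clamp, a task reniced to a less favourable level could keep a bonus it could no longer have earned there.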
1549 | @@ -4177,18 +3920,13 @@ static inline struct task_struct *find_p |
1550 | /* Actually do priority change: must hold rq lock. */ |
1551 | static void __setscheduler(struct task_struct *p, int policy, int prio) |
1552 | { |
1553 | - BUG_ON(p->array); |
1554 | + BUG_ON(task_queued(p)); |
1555 | |
1556 | p->policy = policy; |
1557 | p->rt_priority = prio; |
1558 | p->normal_prio = normal_prio(p); |
1559 | /* we are holding p->pi_lock already */ |
1560 | p->prio = rt_mutex_getprio(p); |
1561 | - /* |
1562 | - * SCHED_BATCH tasks are treated as perpetual CPU hogs: |
1563 | - */ |
1564 | - if (policy == SCHED_BATCH) |
1565 | - p->sleep_avg = 0; |
1566 | set_load_weight(p); |
1567 | } |
1568 | |
1569 | @@ -4204,8 +3942,7 @@ static void __setscheduler(struct task_s |
1570 | int sched_setscheduler(struct task_struct *p, int policy, |
1571 | struct sched_param *param) |
1572 | { |
1573 | - int retval, oldprio, oldpolicy = -1; |
1574 | - struct prio_array *array; |
1575 | + int queued, retval, oldprio, oldpolicy = -1; |
1576 | unsigned long flags; |
1577 | struct rq *rq; |
1578 | |
1579 | @@ -4279,12 +4016,11 @@ recheck: |
1580 | spin_unlock_irqrestore(&p->pi_lock, flags); |
1581 | goto recheck; |
1582 | } |
1583 | - array = p->array; |
1584 | - if (array) |
1585 | + if ((queued = task_queued(p))) |
1586 | deactivate_task(p, rq); |
1587 | oldprio = p->prio; |
1588 | __setscheduler(p, policy, param->sched_priority); |
1589 | - if (array) { |
1590 | + if (queued) { |
1591 | __activate_task(p, rq); |
1592 | /* |
1593 | * Reschedule if we are currently running on this runqueue and |
1594 | @@ -4294,8 +4030,8 @@ recheck: |
1595 | if (task_running(rq, p)) { |
1596 | if (p->prio > oldprio) |
1597 | resched_task(rq->curr); |
1598 | - } else if (TASK_PREEMPTS_CURR(p, rq)) |
1599 | - resched_task(rq->curr); |
1600 | + } else |
1601 | + preempt(p, rq); |
1602 | } |
1603 | __task_rq_unlock(rq); |
1604 | spin_unlock_irqrestore(&p->pi_lock, flags); |
1605 | @@ -4567,41 +4303,24 @@ asmlinkage long sys_sched_getaffinity(pi |
1606 | /** |
1607 | * sys_sched_yield - yield the current processor to other threads. |
1608 | * |
1609 | - * this function yields the current CPU by moving the calling thread |
1610 | - * to the expired array. If there are no other threads running on this |
1611 | - * CPU then this function will return. |
1612 | + * This function yields the current CPU by dropping the priority of the
1613 | + * calling task to the lowest user priority; RT tasks simply requeue.
1614 | */ |
1615 | asmlinkage long sys_sched_yield(void) |
1616 | { |
1617 | struct rq *rq = this_rq_lock(); |
1618 | - struct prio_array *array = current->array, *target = rq->expired; |
1619 | + int newprio = current->prio; |
1620 | |
1621 | schedstat_inc(rq, yld_cnt); |
1622 | - /* |
1623 | - * We implement yielding by moving the task into the expired |
1624 | - * queue. |
1625 | - * |
1626 | - * (special rule: RT tasks will just roundrobin in the active |
1627 | - * array.) |
1628 | - */ |
1629 | - if (rt_task(current)) |
1630 | - target = rq->active; |
1631 | |
1632 | - if (array->nr_active == 1) { |
1633 | - schedstat_inc(rq, yld_act_empty); |
1634 | - if (!rq->expired->nr_active) |
1635 | - schedstat_inc(rq, yld_both_empty); |
1636 | - } else if (!rq->expired->nr_active) |
1637 | - schedstat_inc(rq, yld_exp_empty); |
1638 | - |
1639 | - if (array != target) { |
1640 | - dequeue_task(current, array); |
1641 | - enqueue_task(current, target); |
1642 | - } else |
1643 | - /* |
1644 | - * requeue_task is cheaper so perform that if possible. |
1645 | - */ |
1646 | - requeue_task(current, array); |
1647 | +	/* Grant a fresh slice and quantum, then requeue non-RT tasks at */
1648 | +	/* the lowest user priority so all other runnable tasks go first. */
1649 | + current->slice = slice(current); |
1650 | + current->time_slice = rr_interval(current); |
1651 | + if (likely(!rt_task(current))) |
1652 | + newprio = MIN_USER_PRIO; |
1653 | + |
1654 | + requeue_task(current, rq, newprio); |
1655 | |
1656 | /* |
1657 | * Since we are going to call schedule() anyway, there's |
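Read as a whole, the new yield path does three things: refresh the slice, refresh the round-robin quantum, and requeue. An equivalent illustrative helper (yield_requeue is not a name from the patch):

	static void yield_requeue(struct task_struct *p, struct rq *rq)
	{
		/* RT tasks keep their priority and simply round-robin */
		int newprio = rt_task(p) ? p->prio : MIN_USER_PRIO;

		p->slice = slice(p);		/* fresh full slice */
		p->time_slice = rr_interval(p);	/* fresh RR quantum */
		requeue_task(p, rq, newprio);
	}

Requeueing at MIN_USER_PRIO is a far stronger yield than mainline's move-to-expired: every other runnable SCHED_NORMAL task on the runqueue, regardless of nice level, runs before the yielder is considered again.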
1658 | @@ -4812,7 +4531,7 @@ long sys_sched_rr_get_interval(pid_t pid |
1659 | goto out_unlock; |
1660 | |
1661 | jiffies_to_timespec(p->policy == SCHED_FIFO ? |
1662 | - 0 : task_timeslice(p), &t); |
1663 | + 0 : slice(p), &t); |
1664 | read_unlock(&tasklist_lock); |
1665 | retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; |
1666 | out_nounlock: |
1667 | @@ -4941,8 +4660,6 @@ void __cpuinit init_idle(struct task_str |
1668 | unsigned long flags; |
1669 | |
1670 | idle->timestamp = sched_clock(); |
1671 | - idle->sleep_avg = 0; |
1672 | - idle->array = NULL; |
1673 | idle->prio = idle->normal_prio = MAX_PRIO; |
1674 | idle->state = TASK_RUNNING; |
1675 | idle->cpus_allowed = cpumask_of_cpu(cpu); |
1676 | @@ -5062,7 +4779,7 @@ static int __migrate_task(struct task_st |
1677 | goto out; |
1678 | |
1679 | set_task_cpu(p, dest_cpu); |
1680 | - if (p->array) { |
1681 | + if (task_queued(p)) { |
1682 | /* |
1683 | * Sync timestamp with rq_dest's before activating. |
1684 | * The same thing could be achieved by doing this step |
1685 | @@ -5073,8 +4790,7 @@ static int __migrate_task(struct task_st |
1686 | + rq_dest->most_recent_timestamp; |
1687 | deactivate_task(p, rq_src); |
1688 | __activate_task(p, rq_dest); |
1689 | - if (TASK_PREEMPTS_CURR(p, rq_dest)) |
1690 | - resched_task(rq_dest->curr); |
1691 | + preempt(p, rq_dest); |
1692 | } |
1693 | ret = 1; |
1694 | out: |
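Only the tail of the timestamp rebase survives inside the hunk window. Reconstructed from the surrounding context (the first statement below is the mainline code the patch leaves in place), the migration sequence is:

	p->timestamp = p->timestamp - rq_src->most_recent_timestamp
			+ rq_dest->most_recent_timestamp;
	deactivate_task(p, rq_src);
	__activate_task(p, rq_dest);
	preempt(p, rq_dest);	/* resched rq_dest->curr if p outranks it */

The only behavioural change is the last line: the open-coded TASK_PREEMPTS_CURR/resched_task pair becomes the shared preempt() helper, as in the setprio and setscheduler paths above.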
1695 | @@ -5303,7 +5019,7 @@ static void migrate_dead_tasks(unsigned |
1696 | |
1697 | for (arr = 0; arr < 2; arr++) { |
1698 | for (i = 0; i < MAX_PRIO; i++) { |
1699 | - struct list_head *list = &rq->arrays[arr].queue[i]; |
1700 | + struct list_head *list = &rq->queue[i]; |
1701 | |
1702 | while (!list_empty(list)) |
1703 | migrate_dead(dead_cpu, list_entry(list->next, |
1704 | @@ -6894,19 +6610,16 @@ int in_sched_functions(unsigned long add |
1705 | |
1706 | void __init sched_init(void) |
1707 | { |
1708 | - int i, j, k; |
1709 | + int i; |
1710 | |
1711 | for_each_possible_cpu(i) { |
1712 | - struct prio_array *array; |
1713 | struct rq *rq; |
1714 | + int j; |
1715 | |
1716 | rq = cpu_rq(i); |
1717 | spin_lock_init(&rq->lock); |
1718 | lockdep_set_class(&rq->lock, &rq->rq_lock_key); |
1719 | rq->nr_running = 0; |
1720 | - rq->active = rq->arrays; |
1721 | - rq->expired = rq->arrays + 1; |
1722 | - rq->best_expired_prio = MAX_PRIO; |
1723 | |
1724 | #ifdef CONFIG_SMP |
1725 | rq->sd = NULL; |
1726 | @@ -6920,15 +6633,11 @@ void __init sched_init(void) |
1727 | #endif |
1728 | atomic_set(&rq->nr_iowait, 0); |
1729 | |
1730 | - for (j = 0; j < 2; j++) { |
1731 | - array = rq->arrays + j; |
1732 | - for (k = 0; k < MAX_PRIO; k++) { |
1733 | - INIT_LIST_HEAD(array->queue + k); |
1734 | - __clear_bit(k, array->bitmap); |
1735 | - } |
1736 | - // delimiter for bitsearch |
1737 | - __set_bit(MAX_PRIO, array->bitmap); |
1738 | - } |
1739 | + for (j = 0; j < MAX_PRIO; j++) |
1740 | + INIT_LIST_HEAD(&rq->queue[j]); |
1741 | +	memset(rq->bitmap, 0, BITS_TO_LONGS(MAX_PRIO) * sizeof(long));
1742 | + /* delimiter for bitsearch */ |
1743 | + __set_bit(MAX_PRIO, rq->bitmap); |
1744 | } |
1745 | |
1746 | set_load_weight(&init_task); |
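The per-CPU queue setup is now short enough to restate whole. A self-contained sketch using the same simplified staircase_rq stand-in as the earlier pick_next sketch (init_queues is an illustrative name, not in the patch):

	static void init_queues(struct staircase_rq *rq)
	{
		int j;

		for (j = 0; j < MAX_PRIO; j++)
			INIT_LIST_HEAD(&rq->queue[j]);
		bitmap_zero(rq->bitmap, MAX_PRIO + 1);
		/* Delimiter for bitsearch: bit MAX_PRIO stays set forever,
		 * so sched_find_first_bit() always terminates even when
		 * every queue is empty. */
		__set_bit(MAX_PRIO, rq->bitmap);
	}

Dropping the rq->arrays pair also removes best_expired_prio and the old two-array j/k double loop, which is where most of this hunk's deleted lines come from.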
1747 | @@ -6984,10 +6693,10 @@ EXPORT_SYMBOL(__might_sleep); |
1748 | #ifdef CONFIG_MAGIC_SYSRQ |
1749 | void normalize_rt_tasks(void) |
1750 | { |
1751 | - struct prio_array *array; |
1752 | struct task_struct *p; |
1753 | unsigned long flags; |
1754 | struct rq *rq; |
1755 | + int queued; |
1756 | |
1757 | read_lock_irq(&tasklist_lock); |
1758 | for_each_process(p) { |
1759 | @@ -6997,11 +6706,10 @@ void normalize_rt_tasks(void) |
1760 | spin_lock_irqsave(&p->pi_lock, flags); |
1761 | rq = __task_rq_lock(p); |
1762 | |
1763 | - array = p->array; |
1764 | - if (array) |
1765 | + if ((queued = task_queued(p))) |
1766 | deactivate_task(p, task_rq(p)); |
1767 | __setscheduler(p, SCHED_NORMAL, 0); |
1768 | - if (array) { |
1769 | + if (queued) { |
1770 | __activate_task(p, task_rq(p)); |
1771 | resched_task(rq->curr); |
1772 | } |