Contents of /trunk/kernel26-magellan/patches-2.6.16-r12/0007-2.6.16-sched-staircase14.2.patch
Revision 72
Mon Jun 5 09:25:38 2006 UTC (18 years, 3 months ago) by niro
File size: 53449 byte(s)
ver bump to 2.6.16-r12:
- updated to linux-2.6.16.19
- updated to ck11
1 | fs/proc/array.c | 4 |
2 | include/linux/sched.h | 13 |
3 | include/linux/sysctl.h | 2 |
4 | kernel/exit.c | 1 |
5 | kernel/sched.c | 1022 ++++++++++++++++++------------------------------- |
6 | kernel/sysctl.c | 16 |
7 | 6 files changed, 406 insertions(+), 652 deletions(-) |
8 | |
9 | Index: linux-2.6.16-ck1/fs/proc/array.c |
10 | =================================================================== |
11 | --- linux-2.6.16-ck1.orig/fs/proc/array.c 2006-03-20 20:46:26.000000000 +1100 |
12 | +++ linux-2.6.16-ck1/fs/proc/array.c 2006-03-20 20:46:48.000000000 +1100 |
13 | @@ -165,7 +165,7 @@ static inline char * task_state(struct t |
14 | read_lock(&tasklist_lock); |
15 | buffer += sprintf(buffer, |
16 | "State:\t%s\n" |
17 | - "SleepAVG:\t%lu%%\n" |
18 | + "Bonus:\t%d\n" |
19 | "Tgid:\t%d\n" |
20 | "Pid:\t%d\n" |
21 | "PPid:\t%d\n" |
22 | @@ -173,7 +173,7 @@ static inline char * task_state(struct t |
23 | "Uid:\t%d\t%d\t%d\t%d\n" |
24 | "Gid:\t%d\t%d\t%d\t%d\n", |
25 | get_task_state(p), |
26 | - (p->sleep_avg/1024)*100/(1020000000/1024), |
27 | + p->bonus, |
28 | p->tgid, |
29 | p->pid, pid_alive(p) ? p->group_leader->real_parent->tgid : 0, |
30 | pid_alive(p) && p->ptrace ? p->parent->pid : 0, |
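Note: with this hunk, /proc/<pid>/status reports the scheduler's interactivity state directly. The old SleepAVG line scaled p->sleep_avg (nanoseconds, capped at roughly 1020000000) down to a 0-100 percentage; the new Bonus line prints p->bonus verbatim, an integer between 0 and the task's TASK_USER_PRIO (at most 39). Illustrative output, not taken from a real run:

    State:  S (sleeping)
    Bonus:  7
    Tgid:   1234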
31 | Index: linux-2.6.16-ck1/include/linux/sched.h |
32 | =================================================================== |
33 | --- linux-2.6.16-ck1.orig/include/linux/sched.h 2006-03-20 20:46:47.000000000 +1100 |
34 | +++ linux-2.6.16-ck1/include/linux/sched.h 2006-03-20 20:46:48.000000000 +1100 |
35 | @@ -200,6 +200,7 @@ extern void show_stack(struct task_struc |
36 | |
37 | void io_schedule(void); |
38 | long io_schedule_timeout(long timeout); |
39 | +extern int sched_interactive, sched_compute; |
40 | |
41 | extern void cpu_init (void); |
42 | extern void trap_init(void); |
43 | @@ -522,7 +523,6 @@ extern struct user_struct *find_user(uid |
44 | extern struct user_struct root_user; |
45 | #define INIT_USER (&root_user) |
46 | |
47 | -typedef struct prio_array prio_array_t; |
48 | struct backing_dev_info; |
49 | struct reclaim_state; |
50 | |
51 | @@ -723,18 +723,17 @@ struct task_struct { |
52 | int load_weight; /* for niceness load balancing purposes */ |
53 | int prio, static_prio; |
54 | struct list_head run_list; |
55 | - prio_array_t *array; |
56 | |
57 | unsigned short ioprio; |
58 | |
59 | - unsigned long sleep_avg; |
60 | - unsigned long long timestamp, last_ran; |
61 | + unsigned long long timestamp; |
62 | + unsigned long runtime, totalrun, ns_debit; |
63 | + unsigned int bonus; |
64 | + unsigned int slice, time_slice; |
65 | unsigned long long sched_time; /* sched_clock time spent running */ |
66 | - int activated; |
67 | |
68 | unsigned long policy; |
69 | cpumask_t cpus_allowed; |
70 | - unsigned int time_slice, first_time_slice; |
71 | |
72 | #ifdef CONFIG_SCHEDSTATS |
73 | struct sched_info sched_info; |
74 | @@ -948,6 +947,7 @@ static inline void put_task_struct(struc |
75 | #define PF_BORROWED_MM 0x00400000 /* I am a kthread doing use_mm */ |
76 | #define PF_RANDOMIZE 0x00800000 /* randomize virtual address space */ |
77 | #define PF_SWAPWRITE 0x01000000 /* Allowed to write to swap */ |
78 | +#define PF_NONSLEEP 0x02000000 /* Waiting on in kernel activity */ |
79 | |
80 | /* |
81 | * Only the _current_ task can read/write to tsk->flags, but other |
82 | @@ -1069,7 +1069,6 @@ extern void FASTCALL(wake_up_new_task(st |
83 | static inline void kick_process(struct task_struct *tsk) { } |
84 | #endif |
85 | extern void FASTCALL(sched_fork(task_t * p, int clone_flags)); |
86 | -extern void FASTCALL(sched_exit(task_t * p)); |
87 | |
88 | extern int in_group_p(gid_t); |
89 | extern int in_egroup_p(gid_t); |
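Taken together, the task_struct changes swap the old interactivity state {sleep_avg, last_ran, activated, first_time_slice} for nanosecond run accounting (runtime for the last scheduled run, totalrun for the accumulated total, ns_debit for the sub-jiffy charge remainder), the earned priority boost (bonus), and a two-level quantum: slice, the whole-staircase budget, alongside the per-stair time_slice.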
90 | Index: linux-2.6.16-ck1/include/linux/sysctl.h |
91 | =================================================================== |
92 | --- linux-2.6.16-ck1.orig/include/linux/sysctl.h 2006-03-20 20:46:26.000000000 +1100 |
93 | +++ linux-2.6.16-ck1/include/linux/sysctl.h 2006-03-20 20:46:48.000000000 +1100 |
94 | @@ -148,6 +148,8 @@ enum |
95 | KERN_SPIN_RETRY=70, /* int: number of spinlock retries */ |
96 | KERN_ACPI_VIDEO_FLAGS=71, /* int: flags for setting up video after ACPI sleep */ |
97 | KERN_IA64_UNALIGNED=72, /* int: ia64 unaligned userland trap enable */ |
98 | + KERN_INTERACTIVE=73, /* interactive tasks can have cpu bursts */ |
99 | + KERN_COMPUTE=74, /* adjust timeslices for a compute server */ |
100 | }; |
101 | |
102 | |
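These two enum values back the knobs registered by the kernel/sysctl.c hunk (counted in the diffstat above but not shown in this excerpt); in the ck patch set they surface as /proc/sys/kernel/interactive and /proc/sys/kernel/compute. The paths are inferred from the patch set's conventions rather than visible here.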
103 | Index: linux-2.6.16-ck1/kernel/exit.c |
104 | =================================================================== |
105 | --- linux-2.6.16-ck1.orig/kernel/exit.c 2006-03-20 20:46:26.000000000 +1100 |
106 | +++ linux-2.6.16-ck1/kernel/exit.c 2006-03-20 20:46:48.000000000 +1100 |
107 | @@ -102,7 +102,6 @@ repeat: |
108 | zap_leader = (leader->exit_signal == -1); |
109 | } |
110 | |
111 | - sched_exit(p); |
112 | write_unlock_irq(&tasklist_lock); |
113 | spin_unlock(&p->proc_lock); |
114 | proc_pid_flush(proc_dentry); |
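Removing the sched_exit() call is safe because the mechanism it serviced is also gone: as the sched_fork hunk below shows, a child no longer borrows half of its parent's time_slice at fork, so there is no first-timeslice remainder to hand back at exit.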
115 | Index: linux-2.6.16-ck1/kernel/sched.c |
116 | =================================================================== |
117 | --- linux-2.6.16-ck1.orig/kernel/sched.c 2006-03-20 20:46:46.000000000 +1100 |
118 | +++ linux-2.6.16-ck1/kernel/sched.c 2006-03-20 20:46:48.000000000 +1100 |
119 | @@ -16,6 +16,9 @@ |
120 | * by Davide Libenzi, preemptible kernel bits by Robert Love. |
121 | * 2003-09-03 Interactivity tuning by Con Kolivas. |
122 | * 2004-04-02 Scheduler domains code by Nick Piggin |
123 | + * 2006-03-16 New staircase scheduling policy by Con Kolivas with help |
124 | + * from William Lee Irwin III, Zwane Mwaikambo & Peter Williams. |
125 | + * Staircase v14.2 |
126 | */ |
127 | |
128 | #include <linux/mm.h> |
129 | @@ -76,128 +79,27 @@ |
130 | */ |
131 | #define NS_TO_JIFFIES(TIME) ((TIME) / (1000000000 / HZ)) |
132 | #define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ)) |
133 | +#define NSJIFFY (1000000000 / HZ) /* One jiffy in ns */ |
134 | +#define TASK_PREEMPTS_CURR(p, rq) ((p)->prio < (rq)->curr->prio) |
135 | |
136 | +int sched_compute __read_mostly = 0; |
137 | /* |
138 | - * These are the 'tuning knobs' of the scheduler: |
139 | - * |
140 | - * Minimum timeslice is 5 msecs (or 1 jiffy, whichever is larger), |
141 | - * default timeslice is 100 msecs, maximum timeslice is 800 msecs. |
142 | - * Timeslices get refilled after they expire. |
143 | - */ |
144 | -#define MIN_TIMESLICE max(5 * HZ / 1000, 1) |
145 | -#define DEF_TIMESLICE (100 * HZ / 1000) |
146 | -#define ON_RUNQUEUE_WEIGHT 30 |
147 | -#define CHILD_PENALTY 95 |
148 | -#define PARENT_PENALTY 100 |
149 | -#define EXIT_WEIGHT 3 |
150 | -#define PRIO_BONUS_RATIO 25 |
151 | -#define MAX_BONUS (MAX_USER_PRIO * PRIO_BONUS_RATIO / 100) |
152 | -#define INTERACTIVE_DELTA 2 |
153 | -#define MAX_SLEEP_AVG (DEF_TIMESLICE * MAX_BONUS) |
154 | -#define STARVATION_LIMIT (MAX_SLEEP_AVG) |
155 | -#define NS_MAX_SLEEP_AVG (JIFFIES_TO_NS(MAX_SLEEP_AVG)) |
156 | - |
157 | -/* |
158 | - * If a task is 'interactive' then we reinsert it in the active |
159 | - * array after it has expired its current timeslice. (it will not |
160 | - * continue to run immediately, it will still roundrobin with |
161 | - * other interactive tasks.) |
162 | - * |
163 | - * This part scales the interactivity limit depending on niceness. |
164 | - * |
165 | - * We scale it linearly, offset by the INTERACTIVE_DELTA delta. |
166 | - * Here are a few examples of different nice levels: |
167 | - * |
168 | - * TASK_INTERACTIVE(-20): [1,1,1,1,1,1,1,1,1,0,0] |
169 | - * TASK_INTERACTIVE(-10): [1,1,1,1,1,1,1,0,0,0,0] |
170 | - * TASK_INTERACTIVE( 0): [1,1,1,1,0,0,0,0,0,0,0] |
171 | - * TASK_INTERACTIVE( 10): [1,1,0,0,0,0,0,0,0,0,0] |
172 | - * TASK_INTERACTIVE( 19): [0,0,0,0,0,0,0,0,0,0,0] |
173 | - * |
174 | - * (the X axis represents the possible -5 ... 0 ... +5 dynamic |
175 | - * priority range a task can explore, a value of '1' means the |
176 | - * task is rated interactive.) |
177 | - * |
178 | - * Ie. nice +19 tasks can never get 'interactive' enough to be |
179 | - * reinserted into the active array. And only heavily CPU-hog nice -20 |
180 | - * tasks will be expired. Default nice 0 tasks are somewhere between, |
181 | - * it takes some effort for them to get interactive, but it's not |
182 | - * too hard. |
183 | - */ |
184 | - |
185 | -#define CURRENT_BONUS(p) \ |
186 | - (NS_TO_JIFFIES((p)->sleep_avg) * MAX_BONUS / \ |
187 | - MAX_SLEEP_AVG) |
188 | - |
189 | -#define GRANULARITY (10 * HZ / 1000 ? : 1) |
190 | - |
191 | -#ifdef CONFIG_SMP |
192 | -#define TIMESLICE_GRANULARITY(p) (GRANULARITY * \ |
193 | - (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)) * \ |
194 | - num_online_cpus()) |
195 | -#else |
196 | -#define TIMESLICE_GRANULARITY(p) (GRANULARITY * \ |
197 | - (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1))) |
198 | -#endif |
199 | - |
200 | -#define SCALE(v1,v1_max,v2_max) \ |
201 | - (v1) * (v2_max) / (v1_max) |
202 | - |
203 | -#define DELTA(p) \ |
204 | - (SCALE(TASK_NICE(p), 40, MAX_BONUS) + INTERACTIVE_DELTA) |
205 | - |
206 | -#define TASK_INTERACTIVE(p) \ |
207 | - ((p)->prio <= (p)->static_prio - DELTA(p)) |
208 | - |
209 | -#define INTERACTIVE_SLEEP(p) \ |
210 | - (JIFFIES_TO_NS(MAX_SLEEP_AVG * \ |
211 | - (MAX_BONUS / 2 + DELTA((p)) + 1) / MAX_BONUS - 1)) |
212 | - |
213 | -#define TASK_PREEMPTS_CURR(p, rq) \ |
214 | - ((p)->prio < (rq)->curr->prio) |
215 | - |
216 | -/* |
217 | - * task_timeslice() scales user-nice values [ -20 ... 0 ... 19 ] |
218 | - * to time slice values: [800ms ... 100ms ... 5ms] |
219 | - * |
220 | - * The higher a thread's priority, the bigger timeslices |
221 | - * it gets during one round of execution. But even the lowest |
222 | - * priority thread gets MIN_TIMESLICE worth of execution time. |
223 | + *This is the time all tasks within the same priority round robin. |
224 | + *compute setting is reserved for dedicated computational scheduling |
225 | + *and has twenty times larger intervals. Set to a minimum of 6ms. |
226 | */ |
227 | +#define _RR_INTERVAL ((6 * HZ / 1001) + 1) |
228 | +#define RR_INTERVAL() (_RR_INTERVAL * (1 + 16 * sched_compute)) |
229 | +#define DEF_TIMESLICE (RR_INTERVAL() * 19) |
230 | |
231 | -#define SCALE_PRIO(x, prio) \ |
232 | - max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_TIMESLICE) |
233 | - |
234 | -static unsigned int static_prio_timeslice(int static_prio) |
235 | -{ |
236 | - if (static_prio < NICE_TO_PRIO(0)) |
237 | - return SCALE_PRIO(DEF_TIMESLICE * 4, static_prio); |
238 | - else |
239 | - return SCALE_PRIO(DEF_TIMESLICE, static_prio); |
240 | -} |
241 | - |
242 | -static inline unsigned int task_timeslice(task_t *p) |
243 | -{ |
244 | - return static_prio_timeslice(p->static_prio); |
245 | -} |
246 | - |
247 | -#define task_hot(p, now, sd) ((long long) ((now) - (p)->last_ran) \ |
248 | +#define task_hot(p, now, sd) ((long long) ((now) - (p)->timestamp) \ |
249 | < (long long) (sd)->cache_hot_time) |
250 | |
251 | /* |
252 | * These are the runqueue data structures: |
253 | */ |
254 | - |
255 | -#define BITMAP_SIZE ((((MAX_PRIO+1+7)/8)+sizeof(long)-1)/sizeof(long)) |
256 | - |
257 | typedef struct runqueue runqueue_t; |
258 | |
259 | -struct prio_array { |
260 | - unsigned int nr_active; |
261 | - unsigned long bitmap[BITMAP_SIZE]; |
262 | - struct list_head queue[MAX_PRIO]; |
263 | -}; |
264 | - |
265 | /* |
266 | * This is the main, per-CPU runqueue data structure. |
267 | * |
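To make the interval arithmetic concrete, here is a minimal user-space sketch (editorial, not part of the patch; it assumes HZ=1000 and the sched_compute default of 0) that evaluates the same macros:

#include <stdio.h>

#define HZ 1000					/* assumed kernel config */
#define sched_compute 0				/* sysctl default */
#define _RR_INTERVAL ((6 * HZ / 1001) + 1)
#define RR_INTERVAL() (_RR_INTERVAL * (1 + 16 * sched_compute))
#define DEF_TIMESLICE (RR_INTERVAL() * 19)

int main(void)
{
	/* At HZ=1000: _RR_INTERVAL = 6 jiffies (6ms), DEF_TIMESLICE = 114.
	 * Redefining sched_compute to 1 scales the interval 17x to 102
	 * jiffies, roughly the "twenty times larger" the comment cites. */
	printf("RR_INTERVAL=%d jiffies, DEF_TIMESLICE=%d jiffies\n",
	       RR_INTERVAL(), DEF_TIMESLICE);
	return 0;
}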
268 | @@ -227,12 +129,12 @@ struct runqueue { |
269 | */ |
270 | unsigned long nr_uninterruptible; |
271 | |
272 | - unsigned long expired_timestamp; |
273 | unsigned long long timestamp_last_tick; |
274 | + unsigned int cache_ticks, preempted; |
275 | task_t *curr, *idle; |
276 | struct mm_struct *prev_mm; |
277 | - prio_array_t *active, *expired, arrays[2]; |
278 | - int best_expired_prio; |
279 | + unsigned long bitmap[BITS_TO_LONGS(MAX_PRIO + 1)]; |
280 | + struct list_head queue[MAX_PRIO]; |
281 | atomic_t nr_iowait; |
282 | |
283 | #ifdef CONFIG_SMP |
284 | @@ -496,13 +398,7 @@ static inline runqueue_t *this_rq_lock(v |
285 | |
286 | #ifdef CONFIG_SCHEDSTATS |
287 | /* |
288 | - * Called when a process is dequeued from the active array and given |
289 | - * the cpu. We should note that with the exception of interactive |
290 | - * tasks, the expired queue will become the active queue after the active |
291 | - * queue is empty, without explicitly dequeuing and requeuing tasks in the |
292 | - * expired queue. (Interactive tasks may be requeued directly to the |
293 | - * active queue, thus delaying tasks in the expired queue from running; |
294 | - * see scheduler_tick()). |
295 | + * Called when a process is dequeued and given the cpu. |
296 | * |
297 | * This function is only called from sched_info_arrive(), rather than |
298 | * dequeue_task(). Even though a task may be queued and dequeued multiple |
299 | @@ -540,13 +436,11 @@ static void sched_info_arrive(task_t *t) |
300 | } |
301 | |
302 | /* |
303 | - * Called when a process is queued into either the active or expired |
304 | - * array. The time is noted and later used to determine how long we |
305 | - * had to wait for us to reach the cpu. Since the expired queue will |
306 | - * become the active queue after active queue is empty, without dequeuing |
307 | - * and requeuing any tasks, we are interested in queuing to either. It |
308 | - * is unusual but not impossible for tasks to be dequeued and immediately |
309 | - * requeued in the same or another array: this can happen in sched_yield(), |
310 | + * Called when a process is queued |
311 | + * The time is noted and later used to determine how long we had to wait for |
312 | + * us to reach the cpu. |
313 | + * It is unusual but not impossible for tasks to be dequeued and immediately |
314 | + * requeued: this can happen in sched_yield(), |
315 | * set_user_nice(), and even load_balance() as it moves tasks from runqueue |
316 | * to runqueue. |
317 | * |
318 | @@ -601,73 +495,67 @@ static inline void sched_info_switch(tas |
319 | #endif /* CONFIG_SCHEDSTATS */ |
320 | |
321 | /* |
322 | - * Adding/removing a task to/from a priority array: |
323 | + * Get nanosecond clock difference without overflowing unsigned long. |
324 | */ |
325 | -static void dequeue_task(struct task_struct *p, prio_array_t *array) |
326 | +static unsigned long ns_diff(const unsigned long long v1, |
327 | + const unsigned long long v2) |
328 | { |
329 | - array->nr_active--; |
330 | - list_del(&p->run_list); |
331 | - if (list_empty(array->queue + p->prio)) |
332 | - __clear_bit(p->prio, array->bitmap); |
333 | + unsigned long long vdiff; |
334 | + if (likely(v1 > v2)) { |
335 | + vdiff = v1 - v2; |
336 | +#if BITS_PER_LONG < 64 |
337 | + if (vdiff > (1 << 31)) |
338 | + vdiff = 1 << 31; |
339 | +#endif |
340 | + } else { |
341 | + /* |
342 | + * Rarely the clock appears to go backwards. There should |
343 | + * always be a positive difference so return 1. |
344 | + */ |
345 | + vdiff = 1; |
346 | + } |
347 | + return (unsigned long)vdiff; |
348 | } |
349 | |
350 | -static void enqueue_task(struct task_struct *p, prio_array_t *array) |
351 | +static inline int task_queued(const task_t *task) |
352 | { |
353 | - sched_info_queued(p); |
354 | - list_add_tail(&p->run_list, array->queue + p->prio); |
355 | - __set_bit(p->prio, array->bitmap); |
356 | - array->nr_active++; |
357 | - p->array = array; |
358 | + return !list_empty(&task->run_list); |
359 | } |
360 | |
361 | /* |
362 | - * Put task to the end of the run list without the overhead of dequeue |
363 | - * followed by enqueue. |
364 | + * Adding/removing a task to/from a runqueue: |
365 | */ |
366 | -static void requeue_task(struct task_struct *p, prio_array_t *array) |
367 | +static void fastcall dequeue_task(task_t *p, runqueue_t *rq) |
368 | { |
369 | - list_move_tail(&p->run_list, array->queue + p->prio); |
370 | + list_del_init(&p->run_list); |
371 | + if (list_empty(rq->queue + p->prio)) |
372 | + __clear_bit(p->prio, rq->bitmap); |
373 | + p->ns_debit = 0; |
374 | } |
375 | |
376 | -static inline void enqueue_task_head(struct task_struct *p, prio_array_t *array) |
377 | +static void fastcall enqueue_task(task_t *p, runqueue_t *rq) |
378 | { |
379 | - list_add(&p->run_list, array->queue + p->prio); |
380 | - __set_bit(p->prio, array->bitmap); |
381 | - array->nr_active++; |
382 | - p->array = array; |
383 | + list_add_tail(&p->run_list, rq->queue + p->prio); |
384 | + __set_bit(p->prio, rq->bitmap); |
385 | } |
386 | |
387 | /* |
388 | - * effective_prio - return the priority that is based on the static |
389 | - * priority but is modified by bonuses/penalties. |
390 | - * |
391 | - * We scale the actual sleep average [0 .... MAX_SLEEP_AVG] |
392 | - * into the -5 ... 0 ... +5 bonus/penalty range. |
393 | - * |
394 | - * We use 25% of the full 0...39 priority range so that: |
395 | - * |
396 | - * 1) nice +19 interactive tasks do not preempt nice 0 CPU hogs. |
397 | - * 2) nice -20 CPU hogs do not get preempted by nice 0 tasks. |
398 | - * |
399 | - * Both properties are important to certain workloads. |
400 | + * Put task to the end of the run list without the overhead of dequeue |
401 | + * followed by enqueue. |
402 | */ |
403 | -static int effective_prio(task_t *p) |
404 | +static inline void requeue_task(task_t *p, runqueue_t *rq) |
405 | { |
406 | - int bonus, prio; |
407 | - |
408 | - if (rt_task(p)) |
409 | - return p->prio; |
410 | - |
411 | - bonus = CURRENT_BONUS(p) - MAX_BONUS / 2; |
412 | + list_move_tail(&p->run_list, rq->queue + p->prio); |
413 | +} |
414 | |
415 | - prio = p->static_prio - bonus; |
416 | - if (prio < MAX_RT_PRIO) |
417 | - prio = MAX_RT_PRIO; |
418 | - if (prio > MAX_PRIO-1) |
419 | - prio = MAX_PRIO-1; |
420 | - return prio; |
421 | +static inline void enqueue_task_head(task_t *p, runqueue_t *rq) |
422 | +{ |
423 | + list_add(&p->run_list, rq->queue + p->prio); |
424 | + __set_bit(p->prio, rq->bitmap); |
425 | } |
426 | |
427 | +static unsigned int fastcall slice(const task_t *p); |
428 | + |
429 | /* |
430 | * To aid in avoiding the subversion of "niceness" due to uneven distribution |
431 | * of tasks with abnormal "nice" values across CPUs the contribution that |
432 | @@ -685,10 +573,9 @@ static int effective_prio(task_t *p) |
433 | #define TIME_SLICE_NICE_ZERO DEF_TIMESLICE |
434 | #define LOAD_WEIGHT(lp) \ |
435 | (((lp) * SCHED_LOAD_SCALE) / TIME_SLICE_NICE_ZERO) |
436 | -#define PRIO_TO_LOAD_WEIGHT(prio) \ |
437 | - LOAD_WEIGHT(static_prio_timeslice(prio)) |
438 | -#define RTPRIO_TO_LOAD_WEIGHT(rp) \ |
439 | - (PRIO_TO_LOAD_WEIGHT(MAX_RT_PRIO) + LOAD_WEIGHT(rp)) |
440 | +#define TASK_LOAD_WEIGHT(p) LOAD_WEIGHT(slice(p)) |
441 | +#define RTPRIO_TO_LOAD_WEIGHT(rp) \ |
442 | + (LOAD_WEIGHT((RR_INTERVAL() + 20 + (rp)))) |
443 | |
444 | static void set_load_weight(task_t *p) |
445 | { |
446 | @@ -705,7 +592,7 @@ static void set_load_weight(task_t *p) |
447 | #endif |
448 | p->load_weight = RTPRIO_TO_LOAD_WEIGHT(p->rt_priority); |
449 | } else |
450 | - p->load_weight = PRIO_TO_LOAD_WEIGHT(p->static_prio); |
451 | + p->load_weight = TASK_LOAD_WEIGHT(p); |
452 | } |
453 | |
454 | static inline void inc_raw_weighted_load(runqueue_t *rq, const task_t *p) |
455 | @@ -733,9 +620,9 @@ static inline void dec_nr_running(task_t |
456 | /* |
457 | * __activate_task - move a task to the runqueue. |
458 | */ |
459 | -static inline void __activate_task(task_t *p, runqueue_t *rq) |
460 | +static void fastcall __activate_task(task_t *p, runqueue_t *rq) |
461 | { |
462 | - enqueue_task(p, rq->active); |
463 | + enqueue_task(p, rq); |
464 | inc_nr_running(p, rq); |
465 | } |
466 | |
467 | @@ -744,74 +631,157 @@ static inline void __activate_task(task_ |
468 | */ |
469 | static inline void __activate_idle_task(task_t *p, runqueue_t *rq) |
470 | { |
471 | - enqueue_task_head(p, rq->active); |
472 | + enqueue_task_head(p, rq); |
473 | inc_nr_running(p, rq); |
474 | } |
475 | |
476 | -static int recalc_task_prio(task_t *p, unsigned long long now) |
477 | +/* |
478 | + * Bonus - How much higher than its base priority an interactive task can run. |
479 | + */ |
480 | +static inline unsigned int bonus(const task_t *p) |
481 | { |
482 | - /* Caller must always ensure 'now >= p->timestamp' */ |
483 | - unsigned long long __sleep_time = now - p->timestamp; |
484 | - unsigned long sleep_time; |
485 | - |
486 | - if (unlikely(p->policy == SCHED_BATCH)) |
487 | - sleep_time = 0; |
488 | - else { |
489 | - if (__sleep_time > NS_MAX_SLEEP_AVG) |
490 | - sleep_time = NS_MAX_SLEEP_AVG; |
491 | - else |
492 | - sleep_time = (unsigned long)__sleep_time; |
493 | - } |
494 | + return TASK_USER_PRIO(p); |
495 | +} |
496 | |
497 | - if (likely(sleep_time > 0)) { |
498 | - /* |
499 | - * User tasks that sleep a long time are categorised as |
500 | - * idle and will get just interactive status to stay active & |
501 | - * prevent them suddenly becoming cpu hogs and starving |
502 | - * other processes. |
503 | - */ |
504 | - if (p->mm && p->activated != -1 && |
505 | - sleep_time > INTERACTIVE_SLEEP(p)) { |
506 | - p->sleep_avg = JIFFIES_TO_NS(MAX_SLEEP_AVG - |
507 | - DEF_TIMESLICE); |
508 | - } else { |
509 | - /* |
510 | - * The lower the sleep avg a task has the more |
511 | - * rapidly it will rise with sleep time. |
512 | - */ |
513 | - sleep_time *= (MAX_BONUS - CURRENT_BONUS(p)) ? : 1; |
514 | +static unsigned int fastcall rr_interval(const task_t *p) |
515 | +{ |
516 | + int nice = TASK_NICE(p); |
517 | |
518 | - /* |
519 | - * Tasks waking from uninterruptible sleep are |
520 | - * limited in their sleep_avg rise as they |
521 | - * are likely to be waiting on I/O |
522 | - */ |
523 | - if (p->activated == -1 && p->mm) { |
524 | - if (p->sleep_avg >= INTERACTIVE_SLEEP(p)) |
525 | - sleep_time = 0; |
526 | - else if (p->sleep_avg + sleep_time >= |
527 | - INTERACTIVE_SLEEP(p)) { |
528 | - p->sleep_avg = INTERACTIVE_SLEEP(p); |
529 | - sleep_time = 0; |
530 | - } |
531 | - } |
532 | + if (nice < 0 && !rt_task(p)) |
533 | + return RR_INTERVAL() * (20 - nice) / 20; |
534 | + return RR_INTERVAL(); |
535 | +} |
536 | |
537 | - /* |
538 | - * This code gives a bonus to interactive tasks. |
539 | - * |
540 | - * The boost works by updating the 'average sleep time' |
541 | - * value here, based on ->timestamp. The more time a |
542 | - * task spends sleeping, the higher the average gets - |
543 | - * and the higher the priority boost gets as well. |
544 | - */ |
545 | - p->sleep_avg += sleep_time; |
546 | +/* |
547 | + * slice - the duration a task runs before getting requeued at its best |
548 | + * priority and has its bonus decremented. |
549 | + */ |
550 | +static unsigned int fastcall slice(const task_t *p) |
551 | +{ |
552 | + unsigned int slice, rr; |
553 | |
554 | - if (p->sleep_avg > NS_MAX_SLEEP_AVG) |
555 | - p->sleep_avg = NS_MAX_SLEEP_AVG; |
556 | - } |
557 | + slice = rr = rr_interval(p); |
558 | + if (likely(!rt_task(p))) |
559 | + slice += (39 - TASK_USER_PRIO(p)) * rr; |
560 | + return slice; |
561 | +} |
562 | + |
563 | +/* |
564 | + * We increase our bonus by sleeping more than the time we ran. |
565 | + * The ratio of sleep to run gives us the cpu% that we last ran and determines |
566 | + * the maximum bonus we can acquire. |
567 | + */ |
568 | +static void fastcall inc_bonus(task_t *p, const unsigned long totalrun, |
569 | + const unsigned long sleep) |
570 | +{ |
571 | + unsigned int best_bonus; |
572 | + |
573 | + best_bonus = sleep / (totalrun + 1); |
574 | + if (p->bonus >= best_bonus) |
575 | + return; |
576 | + |
577 | + p->bonus++; |
578 | + best_bonus = bonus(p); |
579 | + if (p->bonus > best_bonus) |
580 | + p->bonus = best_bonus; |
581 | +} |
582 | + |
583 | +static void dec_bonus(task_t *p) |
584 | +{ |
585 | + if (p->bonus) |
586 | + p->bonus--; |
587 | +} |
588 | + |
589 | +/* |
590 | + * sched_interactive - sysctl which allows interactive tasks to have bonus |
591 | + * raise its priority. |
592 | + */ |
593 | +int sched_interactive __read_mostly = 1; |
594 | + |
595 | +/* |
596 | + * effective_prio - dynamic priority dependent on bonus. |
597 | + * The priority normally decreases by one each RR_INTERVAL. |
598 | + * As the bonus increases the initial priority starts at a higher "stair" or |
599 | + * priority for longer. |
600 | + */ |
601 | +static int effective_prio(const task_t *p) |
602 | +{ |
603 | + int prio; |
604 | + unsigned int full_slice, used_slice = 0; |
605 | + unsigned int best_bonus, rr; |
606 | + |
607 | + if (rt_task(p)) |
608 | + return p->prio; |
609 | + |
610 | + full_slice = slice(p); |
611 | + if (full_slice > p->slice) |
612 | + used_slice = full_slice - p->slice; |
613 | + |
614 | + best_bonus = bonus(p); |
615 | + prio = MAX_RT_PRIO + best_bonus; |
616 | + if (sched_interactive && !sched_compute && p->policy != SCHED_BATCH) |
617 | + prio -= p->bonus; |
618 | + |
619 | + rr = rr_interval(p); |
620 | + prio += used_slice / rr; |
621 | + if (prio > MAX_PRIO - 1) |
622 | + prio = MAX_PRIO - 1; |
623 | + return prio; |
624 | +} |
625 | + |
626 | +static inline void continue_slice(task_t *p) |
627 | +{ |
628 | + unsigned long total_run = NS_TO_JIFFIES(p->totalrun); |
629 | + |
630 | + if (total_run >= p->slice) { |
631 | + p->totalrun -= JIFFIES_TO_NS(p->slice); |
632 | + dec_bonus(p); |
633 | + } else { |
634 | + unsigned int remainder; |
635 | + |
636 | + p->slice -= total_run; |
637 | + remainder = p->slice % rr_interval(p); |
638 | + if (remainder) |
639 | + p->time_slice = remainder; |
640 | } |
641 | +} |
642 | |
643 | - return effective_prio(p); |
644 | +/* |
645 | + * recalc_task_prio - this checks for tasks that run ultra short timeslices |
646 | + * or have just forked a thread/process and make them continue their old |
647 | + * slice instead of starting a new one at high priority. |
648 | + */ |
649 | +static inline void recalc_task_prio(task_t *p, const unsigned long long now) |
650 | +{ |
651 | + unsigned long sleep_time = ns_diff(now, p->timestamp); |
652 | + |
653 | + /* |
654 | + * Add the total for this last scheduled run (p->runtime) to the |
655 | + * running total so far used (p->totalrun). |
656 | + */ |
657 | + p->totalrun += p->runtime; |
658 | + |
659 | + /* |
660 | + * If we sleep longer than our running total and have not set the |
661 | + * PF_NONSLEEP flag we gain a bonus. |
662 | + */ |
663 | + if (sleep_time >= p->totalrun && !(p->flags & PF_NONSLEEP) && |
664 | + !sched_compute) { |
665 | + inc_bonus(p, p->totalrun, sleep_time); |
666 | + p->totalrun = 0; |
667 | + return; |
668 | + } |
669 | + |
670 | + /* |
671 | + * If we have not set the PF_NONSLEEP flag we elevate priority by the |
672 | + * amount of time we slept. |
673 | + */ |
674 | + if (p->flags & PF_NONSLEEP) |
675 | + p->flags &= ~PF_NONSLEEP; |
676 | + else |
677 | + p->totalrun -= sleep_time; |
678 | + |
679 | + continue_slice(p); |
680 | } |
681 | |
682 | /* |
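Putting slice(), bonus() and effective_prio() together: a nice-0 task has TASK_USER_PRIO 20, so its full slice is 20 round-robin intervals and its best attainable bonus is 20. The sketch below is editorial, not from the patch; it hard-codes MAX_RT_PRIO=100, MAX_PRIO=140 and RR_INTERVAL()=6 (HZ=1000), and assumes sched_interactive=1 with sched_compute=0:

#include <stdio.h>

#define MAX_RT_PRIO 100
#define MAX_PRIO 140
#define RR 6			/* RR_INTERVAL() at HZ=1000 */

static unsigned int slice_of(unsigned int user_prio)	/* 0..39 */
{
	return RR + (39 - user_prio) * RR;	/* nice 0 -> 20 * RR */
}

static int prio_of(unsigned int user_prio, unsigned int bonus,
		   unsigned int slice_left)
{
	unsigned int used = slice_of(user_prio) - slice_left;
	int prio = MAX_RT_PRIO + user_prio - bonus + used / RR;
	return prio > MAX_PRIO - 1 ? MAX_PRIO - 1 : prio;
}

int main(void)
{
	/* A fully interactive nice-0 task (bonus 20) enters at prio 100
	 * and descends one stair per 6-jiffy interval; a CPU hog that
	 * has lost all bonus starts 20 stairs lower, at prio 120. */
	for (unsigned int left = slice_of(20); left; left -= RR)
		printf("slice left %3u -> prio %d\n", left,
		       prio_of(20, 20, left));
	return 0;
}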
683 | @@ -820,11 +790,11 @@ static int recalc_task_prio(task_t *p, u |
684 | * Update all the scheduling statistics stuff. (sleep average |
685 | * calculation, priority modifiers, etc.) |
686 | */ |
687 | -static void activate_task(task_t *p, runqueue_t *rq, int local) |
688 | +static void activate_task(task_t *p, runqueue_t *rq, const int local) |
689 | { |
690 | - unsigned long long now; |
691 | + unsigned long long now = sched_clock(); |
692 | + unsigned long rr = rr_interval(p); |
693 | |
694 | - now = sched_clock(); |
695 | #ifdef CONFIG_SMP |
696 | if (!local) { |
697 | /* Compensate for drifting sched_clock */ |
698 | @@ -833,45 +803,24 @@ static void activate_task(task_t *p, run |
699 | + rq->timestamp_last_tick; |
700 | } |
701 | #endif |
702 | - |
703 | - if (!rt_task(p)) |
704 | - p->prio = recalc_task_prio(p, now); |
705 | - |
706 | - /* |
707 | - * This checks to make sure it's not an uninterruptible task |
708 | - * that is now waking up. |
709 | - */ |
710 | - if (!p->activated) { |
711 | - /* |
712 | - * Tasks which were woken up by interrupts (ie. hw events) |
713 | - * are most likely of interactive nature. So we give them |
714 | - * the credit of extending their sleep time to the period |
715 | - * of time they spend on the runqueue, waiting for execution |
716 | - * on a CPU, first time around: |
717 | - */ |
718 | - if (in_interrupt()) |
719 | - p->activated = 2; |
720 | - else { |
721 | - /* |
722 | - * Normal first-time wakeups get a credit too for |
723 | - * on-runqueue time, but it will be weighted down: |
724 | - */ |
725 | - p->activated = 1; |
726 | - } |
727 | + p->slice = slice(p); |
728 | + p->time_slice = p->slice % rr ? : rr; |
729 | + if (!rt_task(p)) { |
730 | + recalc_task_prio(p, now); |
731 | + p->flags &= ~PF_NONSLEEP; |
732 | + p->prio = effective_prio(p); |
733 | } |
734 | p->timestamp = now; |
735 | - |
736 | __activate_task(p, rq); |
737 | } |
738 | |
739 | /* |
740 | * deactivate_task - remove a task from the runqueue. |
741 | */ |
742 | -static void deactivate_task(struct task_struct *p, runqueue_t *rq) |
743 | +static void fastcall deactivate_task(task_t *p, runqueue_t *rq) |
744 | { |
745 | dec_nr_running(p, rq); |
746 | - dequeue_task(p, p->array); |
747 | - p->array = NULL; |
748 | + dequeue_task(p, rq); |
749 | } |
750 | |
751 | /* |
752 | @@ -947,7 +896,7 @@ static int migrate_task(task_t *p, int d |
753 | * If the task is not on a runqueue (and not running), then |
754 | * it is sufficient to simply update the task's cpu field. |
755 | */ |
756 | - if (!p->array && !task_running(rq, p)) { |
757 | + if (!task_queued(p) && !task_running(rq, p)) { |
758 | set_task_cpu(p, dest_cpu); |
759 | return 0; |
760 | } |
761 | @@ -977,7 +926,7 @@ void wait_task_inactive(task_t *p) |
762 | repeat: |
763 | rq = task_rq_lock(p, &flags); |
764 | /* Must be off runqueue entirely, not preempted. */ |
765 | - if (unlikely(p->array || task_running(rq, p))) { |
766 | + if (unlikely(task_queued(p) || task_running(rq, p))) { |
767 | /* If it's preempted, we yield. It could be a while. */ |
768 | preempted = !task_running(rq, p); |
769 | task_rq_unlock(rq, &flags); |
770 | @@ -1228,6 +1177,26 @@ static inline int wake_idle(int cpu, tas |
771 | } |
772 | #endif |
773 | |
774 | +/* |
775 | + * CACHE_DELAY is the time preemption is delayed in sched_compute mode |
776 | + * and is set to a nominal 10ms. |
777 | + */ |
778 | +#define CACHE_DELAY (10 * (HZ) / 1001 + 1) |
779 | + |
780 | +/* |
781 | + * Check to see if p preempts rq->curr and resched if it does. In compute |
782 | + * mode we do not preempt for at least CACHE_DELAY and set rq->preempted. |
783 | + */ |
784 | +static void fastcall preempt(const task_t *p, runqueue_t *rq) |
785 | +{ |
786 | + if (p->prio >= rq->curr->prio) |
787 | + return; |
788 | + if (!sched_compute || rq->cache_ticks >= CACHE_DELAY || |
789 | + !p->mm || rt_task(p)) |
790 | + resched_task(rq->curr); |
791 | + rq->preempted = 1; |
792 | +} |
793 | + |
794 | /*** |
795 | * try_to_wake_up - wake up a thread |
796 | * @p: the to-be-woken-up thread |
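For scale, CACHE_DELAY evaluates to 10 jiffies (10ms) at HZ=1000 and 3 jiffies (12ms) at HZ=250. In compute mode a higher-priority wakeup therefore does not reschedule the running task until it has had at least CACHE_DELAY ticks of cache-warm execution (rq->cache_ticks, zeroed on every context switch), unless the woken task lacks an mm (a kernel thread) or is real-time; rq->preempted is recorded either way, so scheduler_tick() can deliver the deferred preemption once the delay has elapsed.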
797 | @@ -1259,7 +1228,7 @@ static int try_to_wake_up(task_t *p, uns |
798 | if (!(old_state & state)) |
799 | goto out; |
800 | |
801 | - if (p->array) |
802 | + if (task_queued(p)) |
803 | goto out_running; |
804 | |
805 | cpu = task_cpu(p); |
806 | @@ -1350,7 +1319,7 @@ out_set_cpu: |
807 | old_state = p->state; |
808 | if (!(old_state & state)) |
809 | goto out; |
810 | - if (p->array) |
811 | + if (task_queued(p)) |
812 | goto out_running; |
813 | |
814 | this_cpu = smp_processor_id(); |
815 | @@ -1359,26 +1328,10 @@ out_set_cpu: |
816 | |
817 | out_activate: |
818 | #endif /* CONFIG_SMP */ |
819 | - if (old_state == TASK_UNINTERRUPTIBLE) { |
820 | + if (old_state == TASK_UNINTERRUPTIBLE) |
821 | rq->nr_uninterruptible--; |
822 | - /* |
823 | - * Tasks on involuntary sleep don't earn |
824 | - * sleep_avg beyond just interactive state. |
825 | - */ |
826 | - p->activated = -1; |
827 | - } |
828 | |
829 | /* |
830 | - * Tasks that have marked their sleep as noninteractive get |
831 | - * woken up without updating their sleep average. (i.e. their |
832 | - * sleep is handled in a priority-neutral manner, no priority |
833 | - * boost and no penalty.) |
834 | - */ |
835 | - if (old_state & TASK_NONINTERACTIVE) |
836 | - __activate_task(p, rq); |
837 | - else |
838 | - activate_task(p, rq, cpu == this_cpu); |
839 | - /* |
840 | * Sync wakeups (i.e. those types of wakeups where the waker |
841 | * has indicated that it will leave the CPU in short order) |
842 | * don't trigger a preemption, if the woken up task will run on |
843 | @@ -1386,10 +1339,9 @@ out_activate: |
844 | * the waker guarantees that the freshly woken up task is going |
845 | * to be considered on this CPU.) |
846 | */ |
847 | - if (!sync || cpu != this_cpu) { |
848 | - if (TASK_PREEMPTS_CURR(p, rq)) |
849 | - resched_task(rq->curr); |
850 | - } |
851 | + activate_task(p, rq, cpu == this_cpu); |
852 | + if (!sync || cpu != this_cpu) |
853 | + preempt(p, rq); |
854 | success = 1; |
855 | |
856 | out_running: |
857 | @@ -1434,7 +1386,6 @@ void fastcall sched_fork(task_t *p, int |
858 | */ |
859 | p->state = TASK_RUNNING; |
860 | INIT_LIST_HEAD(&p->run_list); |
861 | - p->array = NULL; |
862 | #ifdef CONFIG_SCHEDSTATS |
863 | memset(&p->sched_info, 0, sizeof(p->sched_info)); |
864 | #endif |
865 | @@ -1445,30 +1396,6 @@ void fastcall sched_fork(task_t *p, int |
866 | /* Want to start with kernel preemption disabled. */ |
867 | task_thread_info(p)->preempt_count = 1; |
868 | #endif |
869 | - /* |
870 | - * Share the timeslice between parent and child, thus the |
871 | - * total amount of pending timeslices in the system doesn't change, |
872 | - * resulting in more scheduling fairness. |
873 | - */ |
874 | - local_irq_disable(); |
875 | - p->time_slice = (current->time_slice + 1) >> 1; |
876 | - /* |
877 | - * The remainder of the first timeslice might be recovered by |
878 | - * the parent if the child exits early enough. |
879 | - */ |
880 | - p->first_time_slice = 1; |
881 | - current->time_slice >>= 1; |
882 | - p->timestamp = sched_clock(); |
883 | - if (unlikely(!current->time_slice)) { |
884 | - /* |
885 | - * This case is rare, it happens when the parent has only |
886 | - * a single jiffy left from its timeslice. Taking the |
887 | - * runqueue lock is not a problem. |
888 | - */ |
889 | - current->time_slice = 1; |
890 | - scheduler_tick(); |
891 | - } |
892 | - local_irq_enable(); |
893 | put_cpu(); |
894 | } |
895 | |
896 | @@ -1491,36 +1418,20 @@ void fastcall wake_up_new_task(task_t *p |
897 | cpu = task_cpu(p); |
898 | |
899 | /* |
900 | - * We decrease the sleep average of forking parents |
901 | - * and children as well, to keep max-interactive tasks |
902 | - * from forking tasks that are max-interactive. The parent |
903 | - * (current) is done further down, under its lock. |
904 | + * Forked process gets no bonus to prevent fork bombs. |
905 | */ |
906 | - p->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(p) * |
907 | - CHILD_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS); |
908 | - |
909 | - p->prio = effective_prio(p); |
910 | + p->bonus = 0; |
911 | |
912 | if (likely(cpu == this_cpu)) { |
913 | - if (!(clone_flags & CLONE_VM)) { |
914 | + current->flags |= PF_NONSLEEP; |
915 | + activate_task(p, rq, 1); |
916 | + if (!(clone_flags & CLONE_VM)) |
917 | /* |
918 | * The VM isn't cloned, so we're in a good position to |
919 | * do child-runs-first in anticipation of an exec. This |
920 | * usually avoids a lot of COW overhead. |
921 | */ |
922 | - if (unlikely(!current->array)) |
923 | - __activate_task(p, rq); |
924 | - else { |
925 | - p->prio = current->prio; |
926 | - list_add_tail(&p->run_list, ¤t->run_list); |
927 | - p->array = current->array; |
928 | - p->array->nr_active++; |
929 | - inc_nr_running(p, rq); |
930 | - } |
931 | set_need_resched(); |
932 | - } else |
933 | - /* Run child last */ |
934 | - __activate_task(p, rq); |
935 | /* |
936 | * We skip the following code due to cpu == this_cpu |
937 | * |
938 | @@ -1537,53 +1448,20 @@ void fastcall wake_up_new_task(task_t *p |
939 | */ |
940 | p->timestamp = (p->timestamp - this_rq->timestamp_last_tick) |
941 | + rq->timestamp_last_tick; |
942 | - __activate_task(p, rq); |
943 | - if (TASK_PREEMPTS_CURR(p, rq)) |
944 | - resched_task(rq->curr); |
945 | + activate_task(p, rq, 0); |
946 | + preempt(p, rq); |
947 | |
948 | /* |
949 | * Parent and child are on different CPUs, now get the |
950 | - * parent runqueue to update the parent's ->sleep_avg: |
951 | + * parent runqueue to update the parent's ->flags: |
952 | */ |
953 | task_rq_unlock(rq, &flags); |
954 | this_rq = task_rq_lock(current, &flags); |
955 | + current->flags |= PF_NONSLEEP; |
956 | } |
957 | - current->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(current) * |
958 | - PARENT_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS); |
959 | task_rq_unlock(this_rq, &flags); |
960 | } |
961 | |
962 | -/* |
963 | - * Potentially available exiting-child timeslices are |
964 | - * retrieved here - this way the parent does not get |
965 | - * penalized for creating too many threads. |
966 | - * |
967 | - * (this cannot be used to 'generate' timeslices |
968 | - * artificially, because any timeslice recovered here |
969 | - * was given away by the parent in the first place.) |
970 | - */ |
971 | -void fastcall sched_exit(task_t *p) |
972 | -{ |
973 | - unsigned long flags; |
974 | - runqueue_t *rq; |
975 | - |
976 | - /* |
977 | - * If the child was a (relative-) CPU hog then decrease |
978 | - * the sleep_avg of the parent as well. |
979 | - */ |
980 | - rq = task_rq_lock(p->parent, &flags); |
981 | - if (p->first_time_slice && task_cpu(p) == task_cpu(p->parent)) { |
982 | - p->parent->time_slice += p->time_slice; |
983 | - if (unlikely(p->parent->time_slice > task_timeslice(p))) |
984 | - p->parent->time_slice = task_timeslice(p); |
985 | - } |
986 | - if (p->sleep_avg < p->parent->sleep_avg) |
987 | - p->parent->sleep_avg = p->parent->sleep_avg / |
988 | - (EXIT_WEIGHT + 1) * EXIT_WEIGHT + p->sleep_avg / |
989 | - (EXIT_WEIGHT + 1); |
990 | - task_rq_unlock(rq, &flags); |
991 | -} |
992 | - |
993 | /** |
994 | * prepare_task_switch - prepare to switch tasks |
995 | * @rq: the runqueue preparing to switch |
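Two measures replace the old sleep_avg fork penalties here: the child starts with bonus = 0 and must earn interactivity from scratch, and the parent is flagged PF_NONSLEEP so that its next pass through recalc_task_prio() treats the time spent forking as run time rather than bonus-earning sleep.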
996 | @@ -1855,32 +1733,28 @@ void sched_exec(void) |
997 | * pull_task - move a task from a remote runqueue to the local runqueue. |
998 | * Both runqueues must be locked. |
999 | */ |
1000 | -static |
1001 | -void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p, |
1002 | - runqueue_t *this_rq, prio_array_t *this_array, int this_cpu) |
1003 | +static void pull_task(runqueue_t *src_rq, task_t *p, runqueue_t *this_rq, |
1004 | + const int this_cpu) |
1005 | { |
1006 | - dequeue_task(p, src_array); |
1007 | + dequeue_task(p, src_rq); |
1008 | dec_nr_running(p, src_rq); |
1009 | set_task_cpu(p, this_cpu); |
1010 | inc_nr_running(p, this_rq); |
1011 | - enqueue_task(p, this_array); |
1012 | + enqueue_task(p, this_rq); |
1013 | p->timestamp = (p->timestamp - src_rq->timestamp_last_tick) |
1014 | + this_rq->timestamp_last_tick; |
1015 | /* |
1016 | * Note that idle threads have a prio of MAX_PRIO, for this test |
1017 | * to be always true for them. |
1018 | */ |
1019 | - if (TASK_PREEMPTS_CURR(p, this_rq)) |
1020 | - resched_task(this_rq->curr); |
1021 | + preempt(p, this_rq); |
1022 | } |
1023 | |
1024 | /* |
1025 | * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? |
1026 | */ |
1027 | -static |
1028 | -int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu, |
1029 | - struct sched_domain *sd, enum idle_type idle, |
1030 | - int *all_pinned) |
1031 | +static int can_migrate_task(task_t *p, runqueue_t *rq, const int this_cpu, |
1032 | + struct sched_domain *sd, const enum idle_type idle, int *all_pinned) |
1033 | { |
1034 | /* |
1035 | * We do not migrate tasks that are: |
1036 | @@ -1921,7 +1795,6 @@ static int move_tasks(runqueue_t *this_r |
1037 | struct sched_domain *sd, enum idle_type idle, |
1038 | int *all_pinned) |
1039 | { |
1040 | - prio_array_t *array, *dst_array; |
1041 | struct list_head *head, *curr; |
1042 | int idx, pulled = 0, pinned = 0; |
1043 | long rem_load_move; |
1044 | @@ -1933,38 +1806,17 @@ static int move_tasks(runqueue_t *this_r |
1045 | rem_load_move = max_load_move; |
1046 | pinned = 1; |
1047 | |
1048 | - /* |
1049 | - * We first consider expired tasks. Those will likely not be |
1050 | - * executed in the near future, and they are most likely to |
1051 | - * be cache-cold, thus switching CPUs has the least effect |
1052 | - * on them. |
1053 | - */ |
1054 | - if (busiest->expired->nr_active) { |
1055 | - array = busiest->expired; |
1056 | - dst_array = this_rq->expired; |
1057 | - } else { |
1058 | - array = busiest->active; |
1059 | - dst_array = this_rq->active; |
1060 | - } |
1061 | - |
1062 | -new_array: |
1063 | /* Start searching at priority 0: */ |
1064 | idx = 0; |
1065 | skip_bitmap: |
1066 | if (!idx) |
1067 | - idx = sched_find_first_bit(array->bitmap); |
1068 | + idx = sched_find_first_bit(busiest->bitmap); |
1069 | else |
1070 | - idx = find_next_bit(array->bitmap, MAX_PRIO, idx); |
1071 | - if (idx >= MAX_PRIO) { |
1072 | - if (array == busiest->expired && busiest->active->nr_active) { |
1073 | - array = busiest->active; |
1074 | - dst_array = this_rq->active; |
1075 | - goto new_array; |
1076 | - } |
1077 | + idx = find_next_bit(busiest->bitmap, MAX_PRIO, idx); |
1078 | + if (idx >= MAX_PRIO) |
1079 | goto out; |
1080 | - } |
1081 | |
1082 | - head = array->queue + idx; |
1083 | + head = busiest->queue + idx; |
1084 | curr = head->prev; |
1085 | skip_queue: |
1086 | tmp = list_entry(curr, task_t, run_list); |
1087 | @@ -1984,7 +1836,7 @@ skip_queue: |
1088 | schedstat_inc(sd, lb_hot_gained[idle]); |
1089 | #endif |
1090 | |
1091 | - pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu); |
1092 | + pull_task(busiest, tmp, this_rq, this_cpu); |
1093 | pulled++; |
1094 | rem_load_move -= tmp->load_weight; |
1095 | |
1096 | @@ -2507,15 +2359,13 @@ static void rebalance_tick(int this_cpu, |
1097 | continue; |
1098 | |
1099 | interval = sd->balance_interval; |
1100 | - if (idle != SCHED_IDLE) |
1101 | - interval *= sd->busy_factor; |
1102 | |
1103 | /* scale ms to jiffies */ |
1104 | interval = msecs_to_jiffies(interval); |
1105 | if (unlikely(!interval)) |
1106 | interval = 1; |
1107 | |
1108 | - if (j - sd->last_balance >= interval) { |
1109 | + if (idle != SCHED_IDLE || j - sd->last_balance >= interval) { |
1110 | if (load_balance(this_cpu, this_rq, sd, idle)) { |
1111 | /* |
1112 | * We've pulled tasks over so either we're no |
1113 | @@ -2589,22 +2439,6 @@ unsigned long long current_sched_time(co |
1114 | } |
1115 | |
1116 | /* |
1117 | - * We place interactive tasks back into the active array, if possible. |
1118 | - * |
1119 | - * To guarantee that this does not starve expired tasks we ignore the |
1120 | - * interactivity of a task if the first expired task had to wait more |
1121 | - * than a 'reasonable' amount of time. This deadline timeout is |
1122 | - * load-dependent, as the frequency of array switched decreases with |
1123 | - * increasing number of running tasks. We also ignore the interactivity |
1124 | - * if a better static_prio task has expired: |
1125 | - */ |
1126 | -#define EXPIRED_STARVING(rq) \ |
1127 | - ((STARVATION_LIMIT && ((rq)->expired_timestamp && \ |
1128 | - (jiffies - (rq)->expired_timestamp >= \ |
1129 | - STARVATION_LIMIT * ((rq)->nr_running) + 1))) || \ |
1130 | - ((rq)->curr->static_prio > (rq)->best_expired_prio)) |
1131 | - |
1132 | -/* |
1133 | * Account user cpu time to a process. |
1134 | * @p: the process that the cpu time gets accounted to |
1135 | * @hardirq_offset: the offset to subtract from hardirq_count() |
1136 | @@ -2652,6 +2486,7 @@ void account_system_time(struct task_str |
1137 | cpustat->iowait = cputime64_add(cpustat->iowait, tmp); |
1138 | else |
1139 | cpustat->idle = cputime64_add(cpustat->idle, tmp); |
1140 | + |
1141 | /* Account for system time used */ |
1142 | acct_update_integrals(p); |
1143 | } |
1144 | @@ -2677,18 +2512,25 @@ void account_steal_time(struct task_stru |
1145 | cpustat->steal = cputime64_add(cpustat->steal, tmp); |
1146 | } |
1147 | |
1148 | +static void time_slice_expired(task_t *p, runqueue_t *rq) |
1149 | +{ |
1150 | + set_tsk_need_resched(p); |
1151 | + dequeue_task(p, rq); |
1152 | + p->prio = effective_prio(p); |
1153 | + p->time_slice = rr_interval(p); |
1154 | + enqueue_task(p, rq); |
1155 | +} |
1156 | + |
1157 | /* |
1158 | * This function gets called by the timer code, with HZ frequency. |
1159 | * We call it with interrupts disabled. |
1160 | - * |
1161 | - * It also gets called by the fork code, when changing the parent's |
1162 | - * timeslices. |
1163 | */ |
1164 | void scheduler_tick(void) |
1165 | { |
1166 | int cpu = smp_processor_id(); |
1167 | runqueue_t *rq = this_rq(); |
1168 | task_t *p = current; |
1169 | + unsigned long debit, expired_balance = rq->nr_running; |
1170 | unsigned long long now = sched_clock(); |
1171 | |
1172 | update_cpu_clock(p, rq, now); |
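time_slice_expired() is now the single expiry path: the task is dequeued, its dynamic priority recomputed via effective_prio() (one stair lower, since more of its slice has been used), its round-robin quantum refilled to rr_interval(), and it is re-enqueued at the tail of the new priority's list. Expiry is positional; there is no expired array to switch.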
1173 | @@ -2703,78 +2545,53 @@ void scheduler_tick(void) |
1174 | } |
1175 | |
1176 | /* Task might have expired already, but not scheduled off yet */ |
1177 | - if (p->array != rq->active) { |
1178 | + if (unlikely(!task_queued(p))) { |
1179 | set_tsk_need_resched(p); |
1180 | goto out; |
1181 | } |
1182 | - spin_lock(&rq->lock); |
1183 | /* |
1184 | - * The task was running during this tick - update the |
1185 | - * time slice counter. Note: we do not update a thread's |
1186 | - * priority until it either goes to sleep or uses up its |
1187 | - * timeslice. This makes it possible for interactive tasks |
1188 | - * to use up their timeslices at their highest priority levels. |
1189 | + * SCHED_FIFO tasks never run out of timeslice. |
1190 | */ |
1191 | - if (rt_task(p)) { |
1192 | - /* |
1193 | - * RR tasks need a special form of timeslice management. |
1194 | - * FIFO tasks have no timeslices. |
1195 | - */ |
1196 | - if ((p->policy == SCHED_RR) && !--p->time_slice) { |
1197 | - p->time_slice = task_timeslice(p); |
1198 | - p->first_time_slice = 0; |
1199 | - set_tsk_need_resched(p); |
1200 | + if (unlikely(p->policy == SCHED_FIFO)) { |
1201 | + expired_balance = 0; |
1202 | + goto out; |
1203 | + } |
1204 | |
1205 | - /* put it at the end of the queue: */ |
1206 | - requeue_task(p, rq->active); |
1207 | - } |
1208 | + spin_lock(&rq->lock); |
1209 | + debit = ns_diff(rq->timestamp_last_tick, p->timestamp); |
1210 | + p->ns_debit += debit; |
1211 | + if (p->ns_debit < NSJIFFY) |
1212 | + goto out_unlock; |
1213 | + p->ns_debit %= NSJIFFY; |
1214 | + /* |
1215 | + * Tasks lose bonus each time they use up a full slice(). |
1216 | + */ |
1217 | + if (!--p->slice) { |
1218 | + dec_bonus(p); |
1219 | + p->slice = slice(p); |
1220 | + time_slice_expired(p, rq); |
1221 | + p->totalrun = 0; |
1222 | goto out_unlock; |
1223 | } |
1224 | + /* |
1225 | + * Tasks that run out of time_slice but still have slice left get |
1226 | + * requeued with a lower priority && RR_INTERVAL time_slice. |
1227 | + */ |
1228 | if (!--p->time_slice) { |
1229 | - dequeue_task(p, rq->active); |
1230 | + time_slice_expired(p, rq); |
1231 | + goto out_unlock; |
1232 | + } |
1233 | + rq->cache_ticks++; |
1234 | + if (rq->preempted && rq->cache_ticks >= CACHE_DELAY) { |
1235 | set_tsk_need_resched(p); |
1236 | - p->prio = effective_prio(p); |
1237 | - p->time_slice = task_timeslice(p); |
1238 | - p->first_time_slice = 0; |
1239 | - |
1240 | - if (!rq->expired_timestamp) |
1241 | - rq->expired_timestamp = jiffies; |
1242 | - if (!TASK_INTERACTIVE(p) || EXPIRED_STARVING(rq)) { |
1243 | - enqueue_task(p, rq->expired); |
1244 | - if (p->static_prio < rq->best_expired_prio) |
1245 | - rq->best_expired_prio = p->static_prio; |
1246 | - } else |
1247 | - enqueue_task(p, rq->active); |
1248 | - } else { |
1249 | - /* |
1250 | - * Prevent a too long timeslice allowing a task to monopolize |
1251 | - * the CPU. We do this by splitting up the timeslice into |
1252 | - * smaller pieces. |
1253 | - * |
1254 | - * Note: this does not mean the task's timeslices expire or |
1255 | - * get lost in any way, they just might be preempted by |
1256 | - * another task of equal priority. (one with higher |
1257 | - * priority would have preempted this task already.) We |
1258 | - * requeue this task to the end of the list on this priority |
1259 | - * level, which is in essence a round-robin of tasks with |
1260 | - * equal priority. |
1261 | - * |
1262 | - * This only applies to tasks in the interactive |
1263 | - * delta range with at least TIMESLICE_GRANULARITY to requeue. |
1264 | - */ |
1265 | - if (TASK_INTERACTIVE(p) && !((task_timeslice(p) - |
1266 | - p->time_slice) % TIMESLICE_GRANULARITY(p)) && |
1267 | - (p->time_slice >= TIMESLICE_GRANULARITY(p)) && |
1268 | - (p->array == rq->active)) { |
1269 | - |
1270 | - requeue_task(p, rq->active); |
1271 | - set_tsk_need_resched(p); |
1272 | - } |
1273 | + goto out_unlock; |
1274 | } |
1275 | + expired_balance = 0; |
1276 | out_unlock: |
1277 | spin_unlock(&rq->lock); |
1278 | out: |
1279 | - rebalance_tick(cpu, rq, NOT_IDLE); |
1280 | + if (expired_balance > 1) |
1281 | + rebalance_tick(cpu, rq, NOT_IDLE); |
1282 | } |
1283 | |
1284 | #ifdef CONFIG_SCHED_SMT |
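The tick path above charges CPU time in whole jiffies while carrying nanosecond remainders in p->ns_debit (topped up both here and in schedule()). A self-contained sketch of that accumulate/threshold/modulo pattern (editorial; assumes HZ=1000):

#include <stdio.h>

#define HZ 1000
#define NSJIFFY (1000000000 / HZ)

/* Returns 1 when a full jiffy has accumulated and should be charged
 * against time_slice/slice; keeps the sub-jiffy remainder for later. */
static int charge_debit(unsigned long *ns_debit, unsigned long ran_ns)
{
	*ns_debit += ran_ns;
	if (*ns_debit < NSJIFFY)
		return 0;
	*ns_debit %= NSJIFFY;
	return 1;
}

int main(void)
{
	unsigned long debit = 0;
	/* Three ticks observing ~400us of run time each: the jiffy is
	 * charged on the third call, once 1.2ms of debit has built up. */
	for (int i = 0; i < 3; i++)
		printf("charge=%d debit=%lu\n",
		       charge_debit(&debit, 400000), debit);
	return 0;
}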
1285 | @@ -2831,19 +2648,19 @@ static void wake_sleeping_dependent(int |
1286 | |
1287 | /* |
1288 | * number of 'lost' timeslices this task wont be able to fully |
1289 | - * utilize, if another task runs on a sibling. This models the |
1290 | + * utilise, if another task runs on a sibling. This models the |
1291 | * slowdown effect of other tasks running on siblings: |
1292 | */ |
1293 | -static inline unsigned long smt_slice(task_t *p, struct sched_domain *sd) |
1294 | +static inline unsigned long smt_slice(const task_t *p, |
1295 | + const struct sched_domain *sd) |
1296 | { |
1297 | - return p->time_slice * (100 - sd->per_cpu_gain) / 100; |
1298 | + return p->slice * (100 - sd->per_cpu_gain) / 100; |
1299 | } |
1300 | |
1301 | static int dependent_sleeper(int this_cpu, runqueue_t *this_rq) |
1302 | { |
1303 | struct sched_domain *tmp, *sd = NULL; |
1304 | cpumask_t sibling_map; |
1305 | - prio_array_t *array; |
1306 | int ret = 0, i; |
1307 | task_t *p; |
1308 | |
1309 | @@ -2870,12 +2687,8 @@ static int dependent_sleeper(int this_cp |
1310 | */ |
1311 | if (!this_rq->nr_running) |
1312 | goto out_unlock; |
1313 | - array = this_rq->active; |
1314 | - if (!array->nr_active) |
1315 | - array = this_rq->expired; |
1316 | - BUG_ON(!array->nr_active); |
1317 | |
1318 | - p = list_entry(array->queue[sched_find_first_bit(array->bitmap)].next, |
1319 | + p = list_entry(this_rq->queue[sched_find_first_bit(this_rq->bitmap)].next, |
1320 | task_t, run_list); |
1321 | |
1322 | for_each_cpu_mask(i, sibling_map) { |
1323 | @@ -2905,7 +2718,7 @@ static int dependent_sleeper(int this_cp |
1324 | } else |
1325 | if (smt_curr->static_prio < p->static_prio && |
1326 | !TASK_PREEMPTS_CURR(p, smt_rq) && |
1327 | - smt_slice(smt_curr, sd) > task_timeslice(p)) |
1328 | + smt_slice(smt_curr, sd) > slice(p)) |
1329 | ret = 1; |
1330 | |
1331 | check_smt_task: |
1332 | @@ -2928,7 +2741,7 @@ check_smt_task: |
1333 | resched_task(smt_curr); |
1334 | } else { |
1335 | if (TASK_PREEMPTS_CURR(p, smt_rq) && |
1336 | - smt_slice(p, sd) > task_timeslice(smt_curr)) |
1337 | + smt_slice(p, sd) > slice(smt_curr)) |
1338 | resched_task(smt_curr); |
1339 | else |
1340 | wakeup_busy_runqueue(smt_rq); |
1341 | @@ -2990,11 +2803,10 @@ asmlinkage void __sched schedule(void) |
1342 | long *switch_count; |
1343 | task_t *prev, *next; |
1344 | runqueue_t *rq; |
1345 | - prio_array_t *array; |
1346 | struct list_head *queue; |
1347 | unsigned long long now; |
1348 | - unsigned long run_time; |
1349 | - int cpu, idx, new_prio; |
1350 | + unsigned long debit; |
1351 | + int cpu, idx; |
1352 | |
1353 | /* |
1354 | * Test if we are atomic. Since do_exit() needs to call into |
1355 | @@ -3029,20 +2841,11 @@ need_resched_nonpreemptible: |
1356 | |
1357 | schedstat_inc(rq, sched_cnt); |
1358 | now = sched_clock(); |
1359 | - if (likely((long long)(now - prev->timestamp) < NS_MAX_SLEEP_AVG)) { |
1360 | - run_time = now - prev->timestamp; |
1361 | - if (unlikely((long long)(now - prev->timestamp) < 0)) |
1362 | - run_time = 0; |
1363 | - } else |
1364 | - run_time = NS_MAX_SLEEP_AVG; |
1365 | - |
1366 | - /* |
1367 | - * Tasks charged proportionately less run_time at high sleep_avg to |
1368 | - * delay them losing their interactive status |
1369 | - */ |
1370 | - run_time /= (CURRENT_BONUS(prev) ? : 1); |
1371 | |
1372 | spin_lock_irq(&rq->lock); |
1373 | + prev->runtime = ns_diff(now, prev->timestamp); |
1374 | + debit = ns_diff(now, rq->timestamp_last_tick) % NSJIFFY; |
1375 | + prev->ns_debit += debit; |
1376 | |
1377 | if (unlikely(prev->flags & PF_DEAD)) |
1378 | prev->state = EXIT_DEAD; |
1379 | @@ -3054,8 +2857,10 @@ need_resched_nonpreemptible: |
1380 | unlikely(signal_pending(prev)))) |
1381 | prev->state = TASK_RUNNING; |
1382 | else { |
1383 | - if (prev->state == TASK_UNINTERRUPTIBLE) |
1384 | + if (prev->state == TASK_UNINTERRUPTIBLE) { |
1385 | + prev->flags |= PF_NONSLEEP; |
1386 | rq->nr_uninterruptible++; |
1387 | + } |
1388 | deactivate_task(prev, rq); |
1389 | } |
1390 | } |
1391 | @@ -3066,7 +2871,6 @@ go_idle: |
1392 | idle_balance(cpu, rq); |
1393 | if (!rq->nr_running) { |
1394 | next = rq->idle; |
1395 | - rq->expired_timestamp = 0; |
1396 | wake_sleeping_dependent(cpu, rq); |
1397 | /* |
1398 | * wake_sleeping_dependent() might have released |
1399 | @@ -3090,45 +2894,15 @@ go_idle: |
1400 | goto go_idle; |
1401 | } |
1402 | |
1403 | - array = rq->active; |
1404 | - if (unlikely(!array->nr_active)) { |
1405 | - /* |
1406 | - * Switch the active and expired arrays. |
1407 | - */ |
1408 | - schedstat_inc(rq, sched_switch); |
1409 | - rq->active = rq->expired; |
1410 | - rq->expired = array; |
1411 | - array = rq->active; |
1412 | - rq->expired_timestamp = 0; |
1413 | - rq->best_expired_prio = MAX_PRIO; |
1414 | - } |
1415 | - |
1416 | - idx = sched_find_first_bit(array->bitmap); |
1417 | - queue = array->queue + idx; |
1418 | + idx = sched_find_first_bit(rq->bitmap); |
1419 | + queue = rq->queue + idx; |
1420 | next = list_entry(queue->next, task_t, run_list); |
1421 | |
1422 | - if (!rt_task(next) && next->activated > 0) { |
1423 | - unsigned long long delta = now - next->timestamp; |
1424 | - if (unlikely((long long)(now - next->timestamp) < 0)) |
1425 | - delta = 0; |
1426 | - |
1427 | - if (next->activated == 1) |
1428 | - delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128; |
1429 | - |
1430 | - array = next->array; |
1431 | - new_prio = recalc_task_prio(next, next->timestamp + delta); |
1432 | - |
1433 | - if (unlikely(next->prio != new_prio)) { |
1434 | - dequeue_task(next, array); |
1435 | - next->prio = new_prio; |
1436 | - enqueue_task(next, array); |
1437 | - } else |
1438 | - requeue_task(next, array); |
1439 | - } |
1440 | - next->activated = 0; |
1441 | switch_tasks: |
1442 | if (next == rq->idle) |
1443 | schedstat_inc(rq, sched_goidle); |
1444 | + prev->timestamp = now; |
1445 | + |
1446 | prefetch(next); |
1447 | prefetch_stack(next); |
1448 | clear_tsk_need_resched(prev); |
1449 | @@ -3136,13 +2910,10 @@ switch_tasks: |
1450 | |
1451 | update_cpu_clock(prev, rq, now); |
1452 | |
1453 | - prev->sleep_avg -= run_time; |
1454 | - if ((long)prev->sleep_avg <= 0) |
1455 | - prev->sleep_avg = 0; |
1456 | - prev->timestamp = prev->last_ran = now; |
1457 | - |
1458 | sched_info_switch(prev, next); |
1459 | if (likely(prev != next)) { |
1460 | + rq->preempted = 0; |
1461 | + rq->cache_ticks = 0; |
1462 | next->timestamp = now; |
1463 | rq->nr_switches++; |
1464 | rq->curr = next; |
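Pick-next in schedule() is thus a single O(1) lookup: sched_find_first_bit() over one MAX_PRIO-bit bitmap, then the head of that priority's list. There is no expired array left to swap in; starvation control instead comes from time_slice_expired() re-enqueueing runnable tasks at progressively lower priorities as they consume their slices.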
1465 | @@ -3572,9 +3343,8 @@ EXPORT_SYMBOL(sleep_on_timeout); |
1466 | void set_user_nice(task_t *p, long nice) |
1467 | { |
1468 | unsigned long flags; |
1469 | - prio_array_t *array; |
1470 | runqueue_t *rq; |
1471 | - int old_prio, new_prio, delta; |
1472 | + int queued, old_prio, new_prio, delta; |
1473 | |
1474 | if (TASK_NICE(p) == nice || nice < -20 || nice > 19) |
1475 | return; |
1476 | @@ -3593,9 +3363,8 @@ void set_user_nice(task_t *p, long nice) |
1477 | p->static_prio = NICE_TO_PRIO(nice); |
1478 | goto out_unlock; |
1479 | } |
1480 | - array = p->array; |
1481 | - if (array) { |
1482 | - dequeue_task(p, array); |
1483 | + if ((queued = task_queued(p))) { |
1484 | + dequeue_task(p, rq); |
1485 | dec_raw_weighted_load(rq, p); |
1486 | } |
1487 | |
1488 | @@ -3605,9 +3374,11 @@ void set_user_nice(task_t *p, long nice) |
1489 | p->static_prio = NICE_TO_PRIO(nice); |
1490 | set_load_weight(p); |
1491 | p->prio += delta; |
1492 | + if (p->bonus > bonus(p)) |
1493 | + p->bonus = bonus(p); |
1494 | |
1495 | - if (array) { |
1496 | - enqueue_task(p, array); |
1497 | + if (queued) { |
1498 | + enqueue_task(p, rq); |
1499 | inc_raw_weighted_load(rq, p); |
1500 | /* |
1501 | * If the task increased its priority or is running and |
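[editor's note] set_user_nice() now keys off task_queued() instead of a cached prio_array_t pointer, but the discipline is unchanged: a queued task must leave the lists before ->prio is touched and re-enter afterwards, or the bitmap and lists fall out of sync. A condensed sketch of the pattern (the wrapper name is hypothetical; task_queued/dequeue_task/enqueue_task are the patch's helpers):

/* Hypothetical wrapper showing the dequeue -> modify -> enqueue pattern
 * used above. Caller must hold the runqueue lock. */
static void change_prio_queued(task_t *p, runqueue_t *rq, int new_prio)
{
	int queued = task_queued(p);

	if (queued)
		dequeue_task(p, rq);	/* may clear this level's bitmap bit */
	p->prio = new_prio;
	if (queued)
		enqueue_task(p, rq);	/* sets the bit for the new level */
}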
1502 | @@ -3731,19 +3502,13 @@ static inline task_t *find_process_by_pi |
1503 | /* Actually do priority change: must hold rq lock. */ |
1504 | static void __setscheduler(struct task_struct *p, int policy, int prio) |
1505 | { |
1506 | - BUG_ON(p->array); |
1507 | + BUG_ON(task_queued(p)); |
1508 | p->policy = policy; |
1509 | p->rt_priority = prio; |
1510 | if (policy != SCHED_NORMAL && policy != SCHED_BATCH) { |
1511 | p->prio = MAX_RT_PRIO-1 - p->rt_priority; |
1512 | - } else { |
1513 | + } else |
1514 | p->prio = p->static_prio; |
1515 | - /* |
1516 | - * SCHED_BATCH tasks are treated as perpetual CPU hogs: |
1517 | - */ |
1518 | - if (policy == SCHED_BATCH) |
1519 | - p->sleep_avg = 0; |
1520 | - } |
1521 | set_load_weight(p); |
1522 | } |
1523 | |
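[editor's note] For reference, __setscheduler() folds both scheduling classes onto the single 0..MAX_PRIO-1 scale (mainline constants; lower value = better priority):

/* RT:     prio = MAX_RT_PRIO-1 - rt_priority      ->   0..98  (rt_priority 99..1)
 * NORMAL: prio = static_prio = NICE_TO_PRIO(nice) -> 100..139 (nice -20..19)
 * Hence every RT task precedes every SCHED_NORMAL task in the bitmap search. */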
1524 | @@ -3758,8 +3523,7 @@ int sched_setscheduler(struct task_struc |
1525 | struct sched_param *param) |
1526 | { |
1527 | int retval; |
1528 | - int oldprio, oldpolicy = -1; |
1529 | - prio_array_t *array; |
1530 | + int queued, oldprio, oldpolicy = -1; |
1531 | unsigned long flags; |
1532 | runqueue_t *rq; |
1533 | |
1534 | @@ -3821,12 +3585,11 @@ recheck: |
1535 | task_rq_unlock(rq, &flags); |
1536 | goto recheck; |
1537 | } |
1538 | - array = p->array; |
1539 | - if (array) |
1540 | + if ((queued = task_queued(p))) |
1541 | deactivate_task(p, rq); |
1542 | oldprio = p->prio; |
1543 | __setscheduler(p, policy, param->sched_priority); |
1544 | - if (array) { |
1545 | + if (queued) { |
1546 | __activate_task(p, rq); |
1547 | /* |
1548 | * Reschedule if we are currently running on this runqueue and |
1549 | @@ -3836,8 +3599,8 @@ recheck: |
1550 | if (task_running(rq, p)) { |
1551 | if (p->prio > oldprio) |
1552 | resched_task(rq->curr); |
1553 | - } else if (TASK_PREEMPTS_CURR(p, rq)) |
1554 | - resched_task(rq->curr); |
1555 | + } else |
1556 | + preempt(p, rq); |
1557 | } |
1558 | task_rq_unlock(rq, &flags); |
1559 | return 0; |
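[editor's note] preempt() replaces the open-coded TASK_PREEMPTS_CURR()/resched_task() pairs here and in __migrate_task() below. Its definition sits in an earlier hunk; at minimum it must behave like the sketch that follows, though the real helper presumably also consults the compute-mode batching state (rq->cache_ticks, rq->preempted) initialised in sched_init():

/* Minimal sketch of what preempt() must do (illustrative guess; the
 * patch's real helper is defined earlier in the file): kick the current
 * task only if the candidate would run at a better priority. */
static void sketch_preempt(task_t *p, runqueue_t *rq)
{
	if (p->prio < rq->curr->prio)
		resched_task(rq->curr);
}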
1560 | @@ -4094,43 +3857,27 @@ asmlinkage long sys_sched_getaffinity(pi |
1561 | |
1562 | /** |
1563 | * sys_sched_yield - yield the current processor to other threads. |
1564 | - * |
1565 | - * this function yields the current CPU by moving the calling thread |
1566 | - * to the expired array. If there are no other threads running on this |
1567 | - * CPU then this function will return. |
1568 | + * This function yields the current CPU by requeueing the calling task at |
1569 | + * the lowest priority (RT tasks simply round-robin at their own priority). |
1570 | */ |
1571 | asmlinkage long sys_sched_yield(void) |
1572 | { |
1573 | + int newprio; |
1574 | runqueue_t *rq = this_rq_lock(); |
1575 | - prio_array_t *array = current->array; |
1576 | - prio_array_t *target = rq->expired; |
1577 | |
1578 | + newprio = current->prio; |
1579 | schedstat_inc(rq, yld_cnt); |
1580 | - /* |
1581 | - * We implement yielding by moving the task into the expired |
1582 | - * queue. |
1583 | - * |
1584 | - * (special rule: RT tasks will just roundrobin in the active |
1585 | - * array.) |
1586 | - */ |
1587 | - if (rt_task(current)) |
1588 | - target = rq->active; |
1589 | - |
1590 | - if (array->nr_active == 1) { |
1591 | - schedstat_inc(rq, yld_act_empty); |
1592 | - if (!rq->expired->nr_active) |
1593 | - schedstat_inc(rq, yld_both_empty); |
1594 | - } else if (!rq->expired->nr_active) |
1595 | - schedstat_inc(rq, yld_exp_empty); |
1596 | - |
1597 | - if (array != target) { |
1598 | - dequeue_task(current, array); |
1599 | - enqueue_task(current, target); |
1600 | + current->slice = slice(current); |
1601 | + current->time_slice = rr_interval(current); |
1602 | + if (likely(!rt_task(current))) |
1603 | + newprio = MAX_PRIO - 1; |
1604 | + |
1605 | + if (newprio != current->prio) { |
1606 | + dequeue_task(current, rq); |
1607 | + current->prio = newprio; |
1608 | + enqueue_task(current, rq); |
1609 | } else |
1610 | - /* |
1611 | - * requeue_task is cheaper so perform that if possible. |
1612 | - */ |
1613 | - requeue_task(current, array); |
1614 | + requeue_task(current, rq); |
1615 | |
1616 | /* |
1617 | * Since we are going to call schedule() anyway, there's |
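[editor's note] Behaviourally, a SCHED_NORMAL yielder is requeued at MAX_PRIO-1 with a fresh slice, so it only resumes once every other runnable task on that CPU has had a chance to run. A small userspace check of that semantic (assumes a busy sibling task exists to yield to):

/* Userspace illustration: under this scheduler sched_yield() demotes a
 * SCHED_NORMAL caller to the lowest priority rather than merely rotating
 * it within its own priority level. */
#include <sched.h>
#include <stdio.h>

int main(void)
{
	if (sched_yield() != 0)
		perror("sched_yield");
	puts("resumed: other runnable tasks ran first");
	return 0;
}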
1618 | @@ -4339,7 +4086,7 @@ long sys_sched_rr_get_interval(pid_t pid |
1619 | goto out_unlock; |
1620 | |
1621 | jiffies_to_timespec(p->policy & SCHED_FIFO ? |
1622 | - 0 : task_timeslice(p), &t); |
1623 | + 0 : slice(p), &t); |
1624 | read_unlock(&tasklist_lock); |
1625 | retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; |
1626 | out_nounlock: |
1627 | @@ -4462,8 +4209,6 @@ void __devinit init_idle(task_t *idle, i |
1628 | unsigned long flags; |
1629 | |
1630 | idle->timestamp = sched_clock(); |
1631 | - idle->sleep_avg = 0; |
1632 | - idle->array = NULL; |
1633 | idle->prio = MAX_PRIO; |
1634 | idle->state = TASK_RUNNING; |
1635 | idle->cpus_allowed = cpumask_of_cpu(cpu); |
1636 | @@ -4580,7 +4325,7 @@ static void __migrate_task(struct task_s |
1637 | goto out; |
1638 | |
1639 | set_task_cpu(p, dest_cpu); |
1640 | - if (p->array) { |
1641 | + if (task_queued(p)) { |
1642 | /* |
1643 | * Sync timestamp with rq_dest's before activating. |
1644 | * The same thing could be achieved by doing this step |
1645 | @@ -4591,8 +4336,7 @@ static void __migrate_task(struct task_s |
1646 | + rq_dest->timestamp_last_tick; |
1647 | deactivate_task(p, rq_src); |
1648 | activate_task(p, rq_dest, 0); |
1649 | - if (TASK_PREEMPTS_CURR(p, rq_dest)) |
1650 | - resched_task(rq_dest->curr); |
1651 | + preempt(p, rq_dest); |
1652 | } |
1653 | |
1654 | out: |
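[editor's note] sched_clock() is per-CPU and the clocks may be offset, so __migrate_task() rebases the task's timestamp into the destination runqueue's timebase before activating it; the hunk above shows only the tail of that expression. In full (as in mainline 2.6.16):

	/* rebase p->timestamp from the source CPU's clock to the destination's */
	p->timestamp = p->timestamp - rq_src->timestamp_last_tick
				    + rq_dest->timestamp_last_tick;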
1655 | @@ -4806,7 +4550,7 @@ static void migrate_dead_tasks(unsigned |
1656 | |
1657 | for (arr = 0; arr < 2; arr++) { |
1658 | for (i = 0; i < MAX_PRIO; i++) { |
1659 | - struct list_head *list = &rq->arrays[arr].queue[i]; |
1660 | + struct list_head *list = &rq->queue[i]; |
1661 | while (!list_empty(list)) |
1662 | migrate_dead(dead_cpu, |
1663 | list_entry(list->next, task_t, |
1664 | @@ -6148,17 +5892,15 @@ int in_sched_functions(unsigned long add |
1665 | void __init sched_init(void) |
1666 | { |
1667 | runqueue_t *rq; |
1668 | - int i, j, k; |
1669 | + int i, j; |
1670 | |
1671 | for_each_cpu(i) { |
1672 | - prio_array_t *array; |
1673 | |
1674 | rq = cpu_rq(i); |
1675 | spin_lock_init(&rq->lock); |
1676 | rq->nr_running = 0; |
1677 | - rq->active = rq->arrays; |
1678 | - rq->expired = rq->arrays + 1; |
1679 | - rq->best_expired_prio = MAX_PRIO; |
1680 | + rq->cache_ticks = 0; |
1681 | + rq->preempted = 0; |
1682 | |
1683 | #ifdef CONFIG_SMP |
1684 | rq->sd = NULL; |
1685 | @@ -6170,16 +5912,13 @@ void __init sched_init(void) |
1686 | INIT_LIST_HEAD(&rq->migration_queue); |
1687 | #endif |
1688 | atomic_set(&rq->nr_iowait, 0); |
1689 | - |
1690 | - for (j = 0; j < 2; j++) { |
1691 | - array = rq->arrays + j; |
1692 | - for (k = 0; k < MAX_PRIO; k++) { |
1693 | - INIT_LIST_HEAD(array->queue + k); |
1694 | - __clear_bit(k, array->bitmap); |
1695 | - } |
1696 | - // delimiter for bitsearch |
1697 | - __set_bit(MAX_PRIO, array->bitmap); |
1698 | - } |
1699 | + for (j = 0; j < MAX_PRIO; j++) |
1700 | + INIT_LIST_HEAD(&rq->queue[j]); |
1701 | + memset(rq->bitmap, 0, BITS_TO_LONGS(MAX_PRIO)*sizeof(long)); |
1702 | + /* |
1703 | + * delimiter for bitsearch |
1704 | + */ |
1705 | + __set_bit(MAX_PRIO, rq->bitmap); |
1706 | } |
1707 | |
1708 | set_load_weight(&init_task); |
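[editor's note] The permanently set bit at MAX_PRIO bounds the bit search: scanning an otherwise empty bitmap returns MAX_PRIO instead of running off the end of the word array, which also gives callers a cheap emptiness test, e.g.:

/* Illustrative emptiness test enabled by the delimiter bit. */
static inline int sketch_rq_empty(runqueue_t *rq)
{
	return sched_find_first_bit(rq->bitmap) >= MAX_PRIO;
}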
1709 | @@ -6224,9 +5963,9 @@ EXPORT_SYMBOL(__might_sleep); |
1710 | void normalize_rt_tasks(void) |
1711 | { |
1712 | struct task_struct *p; |
1713 | - prio_array_t *array; |
1714 | unsigned long flags; |
1715 | runqueue_t *rq; |
1716 | + int queued; |
1717 | |
1718 | read_lock_irq(&tasklist_lock); |
1719 | for_each_process (p) { |
1720 | @@ -6235,11 +5974,10 @@ void normalize_rt_tasks(void) |
1721 | |
1722 | rq = task_rq_lock(p, &flags); |
1723 | |
1724 | - array = p->array; |
1725 | - if (array) |
1726 | + if ((queued = task_queued(p))) |
1727 | deactivate_task(p, task_rq(p)); |
1728 | __setscheduler(p, SCHED_NORMAL, 0); |
1729 | - if (array) { |
1730 | + if (queued) { |
1731 | __activate_task(p, task_rq(p)); |
1732 | resched_task(rq->curr); |
1733 | } |
1734 | Index: linux-2.6.16-ck1/kernel/sysctl.c |
1735 | =================================================================== |
1736 | --- linux-2.6.16-ck1.orig/kernel/sysctl.c 2006-03-20 20:46:26.000000000 +1100 |
1737 | +++ linux-2.6.16-ck1/kernel/sysctl.c 2006-03-20 20:46:48.000000000 +1100 |
1738 | @@ -623,6 +623,22 @@ static ctl_table kern_table[] = { |
1739 | .mode = 0444, |
1740 | .proc_handler = &proc_dointvec, |
1741 | }, |
1742 | + { |
1743 | + .ctl_name = KERN_INTERACTIVE, |
1744 | + .procname = "interactive", |
1745 | + .data = &sched_interactive, |
1746 | + .maxlen = sizeof (int), |
1747 | + .mode = 0644, |
1748 | + .proc_handler = &proc_dointvec, |
1749 | + }, |
1750 | + { |
1751 | + .ctl_name = KERN_COMPUTE, |
1752 | + .procname = "compute", |
1753 | + .data = &sched_compute, |
1754 | + .maxlen = sizeof (int), |
1755 | + .mode = 0644, |
1756 | + .proc_handler = &proc_dointvec, |
1757 | + }, |
1758 | #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) |
1759 | { |
1760 | .ctl_name = KERN_UNKNOWN_NMI_PANIC, |
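[editor's note] With these entries in place the tunables surface as /proc/sys/kernel/interactive and /proc/sys/kernel/compute (the KERN_INTERACTIVE/KERN_COMPUTE names come from the include/linux/sysctl.h hunk counted in this patch's diffstat). Root can toggle them at runtime, e.g. `echo 0 > /proc/sys/kernel/interactive` to disable the interactivity estimator, or `echo 1 > /proc/sys/kernel/compute` to enable compute mode, which per the ck documentation trades latency for longer timeslices and deferred preemption.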