Annotation of /trunk/kernel26-magellan/patches-2.6.16-r12/0007-2.6.16-sched-staircase14.2.patch
Revision 72
Mon Jun 5 09:25:38 2006 UTC (18 years, 3 months ago) by niro
File size: 53449 byte(s)
Version bump to 2.6.16-r12: updated to linux-2.6.16.19; updated to ck11
1 | niro | 72 | fs/proc/array.c | 4 |
2 | include/linux/sched.h | 13 | ||
3 | include/linux/sysctl.h | 2 | ||
4 | kernel/exit.c | 1 | ||
5 | kernel/sched.c | 1022 ++++++++++++++++++------------------------------- | ||
6 | kernel/sysctl.c | 16 | ||
7 | 6 files changed, 406 insertions(+), 652 deletions(-) | ||
8 | |||
9 | Index: linux-2.6.16-ck1/fs/proc/array.c | ||
10 | =================================================================== | ||
11 | --- linux-2.6.16-ck1.orig/fs/proc/array.c 2006-03-20 20:46:26.000000000 +1100 | ||
12 | +++ linux-2.6.16-ck1/fs/proc/array.c 2006-03-20 20:46:48.000000000 +1100 | ||
13 | @@ -165,7 +165,7 @@ static inline char * task_state(struct t | ||
14 | read_lock(&tasklist_lock); | ||
15 | buffer += sprintf(buffer, | ||
16 | "State:\t%s\n" | ||
17 | - "SleepAVG:\t%lu%%\n" | ||
18 | + "Bonus:\t%d\n" | ||
19 | "Tgid:\t%d\n" | ||
20 | "Pid:\t%d\n" | ||
21 | "PPid:\t%d\n" | ||
22 | @@ -173,7 +173,7 @@ static inline char * task_state(struct t | ||
23 | "Uid:\t%d\t%d\t%d\t%d\n" | ||
24 | "Gid:\t%d\t%d\t%d\t%d\n", | ||
25 | get_task_state(p), | ||
26 | - (p->sleep_avg/1024)*100/(1020000000/1024), | ||
27 | + p->bonus, | ||
28 | p->tgid, | ||
29 | p->pid, pid_alive(p) ? p->group_leader->real_parent->tgid : 0, | ||
30 | pid_alive(p) && p->ptrace ? p->parent->pid : 0, | ||
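
The /proc/<pid>/status hunk above swaps the old SleepAVG percentage for the raw staircase bonus value. A userspace sketch (not part of the patch) that reads the new field back, assuming a kernel built with this patch:

/* read_bonus.c - print the staircase "Bonus:" line for a pid (sketch,
 * not part of the patch; needs a kernel with this patch applied). */
#include <stdio.h>
#include <string.h>

int main(int argc, char **argv)
{
    char path[64], line[256];
    FILE *f;

    if (argc < 2) {
        fprintf(stderr, "usage: %s <pid>\n", argv[0]);
        return 1;
    }
    snprintf(path, sizeof(path), "/proc/%s/status", argv[1]);
    f = fopen(path, "r");
    if (!f) {
        perror(path);
        return 1;
    }
    while (fgets(line, sizeof(line), f))
        if (!strncmp(line, "Bonus:", 6))
            fputs(line, stdout);    /* e.g. "Bonus:  7" */
    fclose(f);
    return 0;
}
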
31 | Index: linux-2.6.16-ck1/include/linux/sched.h | ||
32 | =================================================================== | ||
33 | --- linux-2.6.16-ck1.orig/include/linux/sched.h 2006-03-20 20:46:47.000000000 +1100 | ||
34 | +++ linux-2.6.16-ck1/include/linux/sched.h 2006-03-20 20:46:48.000000000 +1100 | ||
35 | @@ -200,6 +200,7 @@ extern void show_stack(struct task_struc | ||
36 | |||
37 | void io_schedule(void); | ||
38 | long io_schedule_timeout(long timeout); | ||
39 | +extern int sched_interactive, sched_compute; | ||
40 | |||
41 | extern void cpu_init (void); | ||
42 | extern void trap_init(void); | ||
43 | @@ -522,7 +523,6 @@ extern struct user_struct *find_user(uid | ||
44 | extern struct user_struct root_user; | ||
45 | #define INIT_USER (&root_user) | ||
46 | |||
47 | -typedef struct prio_array prio_array_t; | ||
48 | struct backing_dev_info; | ||
49 | struct reclaim_state; | ||
50 | |||
51 | @@ -723,18 +723,17 @@ struct task_struct { | ||
52 | int load_weight; /* for niceness load balancing purposes */ | ||
53 | int prio, static_prio; | ||
54 | struct list_head run_list; | ||
55 | - prio_array_t *array; | ||
56 | |||
57 | unsigned short ioprio; | ||
58 | |||
59 | - unsigned long sleep_avg; | ||
60 | - unsigned long long timestamp, last_ran; | ||
61 | + unsigned long long timestamp; | ||
62 | + unsigned long runtime, totalrun, ns_debit; | ||
63 | + unsigned int bonus; | ||
64 | + unsigned int slice, time_slice; | ||
65 | unsigned long long sched_time; /* sched_clock time spent running */ | ||
66 | - int activated; | ||
67 | |||
68 | unsigned long policy; | ||
69 | cpumask_t cpus_allowed; | ||
70 | - unsigned int time_slice, first_time_slice; | ||
71 | |||
72 | #ifdef CONFIG_SCHEDSTATS | ||
73 | struct sched_info sched_info; | ||
74 | @@ -948,6 +947,7 @@ static inline void put_task_struct(struc | ||
75 | #define PF_BORROWED_MM 0x00400000 /* I am a kthread doing use_mm */ | ||
76 | #define PF_RANDOMIZE 0x00800000 /* randomize virtual address space */ | ||
77 | #define PF_SWAPWRITE 0x01000000 /* Allowed to write to swap */ | ||
78 | +#define PF_NONSLEEP 0x02000000 /* Waiting on in kernel activity */ | ||
79 | |||
80 | /* | ||
81 | * Only the _current_ task can read/write to tsk->flags, but other | ||
82 | @@ -1069,7 +1069,6 @@ extern void FASTCALL(wake_up_new_task(st | ||
83 | static inline void kick_process(struct task_struct *tsk) { } | ||
84 | #endif | ||
85 | extern void FASTCALL(sched_fork(task_t * p, int clone_flags)); | ||
86 | -extern void FASTCALL(sched_exit(task_t * p)); | ||
87 | |||
88 | extern int in_group_p(gid_t); | ||
89 | extern int in_egroup_p(gid_t); | ||
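
PF_NONSLEEP, added above, marks a task whose sleep was forced by in-kernel activity rather than a voluntary wait; recalc_task_prio() in the kernel/sched.c hunks below uses it to withhold interactivity credit for such sleeps. A schematic of that test-and-clear pattern (sketch, not part of the patch):

/* Schematic of the PF_NONSLEEP test-and-clear done by recalc_task_prio()
 * further down in this patch (userspace sketch). */
#include <stdio.h>

#define PF_NONSLEEP 0x02000000  /* waiting on in-kernel activity */

struct task_sketch {
    unsigned long flags;
};

static int sleep_earns_bonus(struct task_sketch *p)
{
    if (p->flags & PF_NONSLEEP) {
        p->flags &= ~PF_NONSLEEP;   /* one-shot: cleared once consumed */
        return 0;   /* kernel-forced sleep: no interactivity credit */
    }
    return 1;       /* voluntary sleep may raise the bonus */
}

int main(void)
{
    struct task_sketch p = { PF_NONSLEEP };
    int first = sleep_earns_bonus(&p);
    int second = sleep_earns_bonus(&p);

    printf("%d %d\n", first, second);   /* 0 1: the flag was one-shot */
    return 0;
}
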
90 | Index: linux-2.6.16-ck1/include/linux/sysctl.h | ||
91 | =================================================================== | ||
92 | --- linux-2.6.16-ck1.orig/include/linux/sysctl.h 2006-03-20 20:46:26.000000000 +1100 | ||
93 | +++ linux-2.6.16-ck1/include/linux/sysctl.h 2006-03-20 20:46:48.000000000 +1100 | ||
94 | @@ -148,6 +148,8 @@ enum | ||
95 | KERN_SPIN_RETRY=70, /* int: number of spinlock retries */ | ||
96 | KERN_ACPI_VIDEO_FLAGS=71, /* int: flags for setting up video after ACPI sleep */ | ||
97 | KERN_IA64_UNALIGNED=72, /* int: ia64 unaligned userland trap enable */ | ||
98 | + KERN_INTERACTIVE=73, /* interactive tasks can have cpu bursts */ | ||
99 | + KERN_COMPUTE=74, /* adjust timeslices for a compute server */ | ||
100 | }; | ||
101 | |||
102 | |||
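
The two new sysctl ids are wired into ctl_table entries in kernel/sysctl.c (that hunk is counted in the diffstat but falls outside this excerpt). Assuming they are exposed as /proc/sys/kernel/interactive and /proc/sys/kernel/compute - names inferred from the sched_interactive/sched_compute variables, so verify on a running kernel - a sketch that flips compute mode:

/* toggle_compute.c - flip staircase compute-server mode (sketch).
 * The proc path is an assumption inferred from the sched_compute
 * variable name; check your kernel before relying on it. */
#include <stdio.h>

int main(int argc, char **argv)
{
    FILE *f = fopen("/proc/sys/kernel/compute", "w");

    if (!f) {
        perror("compute sysctl");
        return 1;
    }
    fputs(argc > 1 ? argv[1] : "1", f);  /* "1" = compute mode on */
    fclose(f);
    return 0;
}
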
103 | Index: linux-2.6.16-ck1/kernel/exit.c | ||
104 | =================================================================== | ||
105 | --- linux-2.6.16-ck1.orig/kernel/exit.c 2006-03-20 20:46:26.000000000 +1100 | ||
106 | +++ linux-2.6.16-ck1/kernel/exit.c 2006-03-20 20:46:48.000000000 +1100 | ||
107 | @@ -102,7 +102,6 @@ repeat: | ||
108 | zap_leader = (leader->exit_signal == -1); | ||
109 | } | ||
110 | |||
111 | - sched_exit(p); | ||
112 | write_unlock_irq(&tasklist_lock); | ||
113 | spin_unlock(&p->proc_lock); | ||
114 | proc_pid_flush(proc_dentry); | ||
115 | Index: linux-2.6.16-ck1/kernel/sched.c | ||
116 | =================================================================== | ||
117 | --- linux-2.6.16-ck1.orig/kernel/sched.c 2006-03-20 20:46:46.000000000 +1100 | ||
118 | +++ linux-2.6.16-ck1/kernel/sched.c 2006-03-20 20:46:48.000000000 +1100 | ||
119 | @@ -16,6 +16,9 @@ | ||
120 | * by Davide Libenzi, preemptible kernel bits by Robert Love. | ||
121 | * 2003-09-03 Interactivity tuning by Con Kolivas. | ||
122 | * 2004-04-02 Scheduler domains code by Nick Piggin | ||
123 | + * 2006-03-16 New staircase scheduling policy by Con Kolivas with help | ||
124 | + * from William Lee Irwin III, Zwane Mwaikambo & Peter Williams. | ||
125 | + * Staircase v14.2 | ||
126 | */ | ||
127 | |||
128 | #include <linux/mm.h> | ||
129 | @@ -76,128 +79,27 @@ | ||
130 | */ | ||
131 | #define NS_TO_JIFFIES(TIME) ((TIME) / (1000000000 / HZ)) | ||
132 | #define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ)) | ||
133 | +#define NSJIFFY (1000000000 / HZ) /* One jiffy in ns */ | ||
134 | +#define TASK_PREEMPTS_CURR(p, rq) ((p)->prio < (rq)->curr->prio) | ||
135 | |||
136 | +int sched_compute __read_mostly = 0; | ||
137 | /* | ||
138 | - * These are the 'tuning knobs' of the scheduler: | ||
139 | - * | ||
140 | - * Minimum timeslice is 5 msecs (or 1 jiffy, whichever is larger), | ||
141 | - * default timeslice is 100 msecs, maximum timeslice is 800 msecs. | ||
142 | - * Timeslices get refilled after they expire. | ||
143 | - */ | ||
144 | -#define MIN_TIMESLICE max(5 * HZ / 1000, 1) | ||
145 | -#define DEF_TIMESLICE (100 * HZ / 1000) | ||
146 | -#define ON_RUNQUEUE_WEIGHT 30 | ||
147 | -#define CHILD_PENALTY 95 | ||
148 | -#define PARENT_PENALTY 100 | ||
149 | -#define EXIT_WEIGHT 3 | ||
150 | -#define PRIO_BONUS_RATIO 25 | ||
151 | -#define MAX_BONUS (MAX_USER_PRIO * PRIO_BONUS_RATIO / 100) | ||
152 | -#define INTERACTIVE_DELTA 2 | ||
153 | -#define MAX_SLEEP_AVG (DEF_TIMESLICE * MAX_BONUS) | ||
154 | -#define STARVATION_LIMIT (MAX_SLEEP_AVG) | ||
155 | -#define NS_MAX_SLEEP_AVG (JIFFIES_TO_NS(MAX_SLEEP_AVG)) | ||
156 | - | ||
157 | -/* | ||
158 | - * If a task is 'interactive' then we reinsert it in the active | ||
159 | - * array after it has expired its current timeslice. (it will not | ||
160 | - * continue to run immediately, it will still roundrobin with | ||
161 | - * other interactive tasks.) | ||
162 | - * | ||
163 | - * This part scales the interactivity limit depending on niceness. | ||
164 | - * | ||
165 | - * We scale it linearly, offset by the INTERACTIVE_DELTA delta. | ||
166 | - * Here are a few examples of different nice levels: | ||
167 | - * | ||
168 | - * TASK_INTERACTIVE(-20): [1,1,1,1,1,1,1,1,1,0,0] | ||
169 | - * TASK_INTERACTIVE(-10): [1,1,1,1,1,1,1,0,0,0,0] | ||
170 | - * TASK_INTERACTIVE( 0): [1,1,1,1,0,0,0,0,0,0,0] | ||
171 | - * TASK_INTERACTIVE( 10): [1,1,0,0,0,0,0,0,0,0,0] | ||
172 | - * TASK_INTERACTIVE( 19): [0,0,0,0,0,0,0,0,0,0,0] | ||
173 | - * | ||
174 | - * (the X axis represents the possible -5 ... 0 ... +5 dynamic | ||
175 | - * priority range a task can explore, a value of '1' means the | ||
176 | - * task is rated interactive.) | ||
177 | - * | ||
178 | - * Ie. nice +19 tasks can never get 'interactive' enough to be | ||
179 | - * reinserted into the active array. And only heavily CPU-hog nice -20 | ||
180 | - * tasks will be expired. Default nice 0 tasks are somewhere between, | ||
181 | - * it takes some effort for them to get interactive, but it's not | ||
182 | - * too hard. | ||
183 | - */ | ||
184 | - | ||
185 | -#define CURRENT_BONUS(p) \ | ||
186 | - (NS_TO_JIFFIES((p)->sleep_avg) * MAX_BONUS / \ | ||
187 | - MAX_SLEEP_AVG) | ||
188 | - | ||
189 | -#define GRANULARITY (10 * HZ / 1000 ? : 1) | ||
190 | - | ||
191 | -#ifdef CONFIG_SMP | ||
192 | -#define TIMESLICE_GRANULARITY(p) (GRANULARITY * \ | ||
193 | - (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)) * \ | ||
194 | - num_online_cpus()) | ||
195 | -#else | ||
196 | -#define TIMESLICE_GRANULARITY(p) (GRANULARITY * \ | ||
197 | - (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1))) | ||
198 | -#endif | ||
199 | - | ||
200 | -#define SCALE(v1,v1_max,v2_max) \ | ||
201 | - (v1) * (v2_max) / (v1_max) | ||
202 | - | ||
203 | -#define DELTA(p) \ | ||
204 | - (SCALE(TASK_NICE(p), 40, MAX_BONUS) + INTERACTIVE_DELTA) | ||
205 | - | ||
206 | -#define TASK_INTERACTIVE(p) \ | ||
207 | - ((p)->prio <= (p)->static_prio - DELTA(p)) | ||
208 | - | ||
209 | -#define INTERACTIVE_SLEEP(p) \ | ||
210 | - (JIFFIES_TO_NS(MAX_SLEEP_AVG * \ | ||
211 | - (MAX_BONUS / 2 + DELTA((p)) + 1) / MAX_BONUS - 1)) | ||
212 | - | ||
213 | -#define TASK_PREEMPTS_CURR(p, rq) \ | ||
214 | - ((p)->prio < (rq)->curr->prio) | ||
215 | - | ||
216 | -/* | ||
217 | - * task_timeslice() scales user-nice values [ -20 ... 0 ... 19 ] | ||
218 | - * to time slice values: [800ms ... 100ms ... 5ms] | ||
219 | - * | ||
220 | - * The higher a thread's priority, the bigger timeslices | ||
221 | - * it gets during one round of execution. But even the lowest | ||
222 | - * priority thread gets MIN_TIMESLICE worth of execution time. | ||
223 | + * This is the time over which all tasks of the same priority round robin. | ||
224 | + * The compute setting is reserved for dedicated computational scheduling | ||
225 | + * and has twenty times larger intervals. Set to a minimum of 6ms. | ||
226 | */ | ||
227 | +#define _RR_INTERVAL ((6 * HZ / 1001) + 1) | ||
228 | +#define RR_INTERVAL() (_RR_INTERVAL * (1 + 16 * sched_compute)) | ||
229 | +#define DEF_TIMESLICE (RR_INTERVAL() * 19) | ||
230 | |||
231 | -#define SCALE_PRIO(x, prio) \ | ||
232 | - max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_TIMESLICE) | ||
233 | - | ||
234 | -static unsigned int static_prio_timeslice(int static_prio) | ||
235 | -{ | ||
236 | - if (static_prio < NICE_TO_PRIO(0)) | ||
237 | - return SCALE_PRIO(DEF_TIMESLICE * 4, static_prio); | ||
238 | - else | ||
239 | - return SCALE_PRIO(DEF_TIMESLICE, static_prio); | ||
240 | -} | ||
241 | - | ||
242 | -static inline unsigned int task_timeslice(task_t *p) | ||
243 | -{ | ||
244 | - return static_prio_timeslice(p->static_prio); | ||
245 | -} | ||
246 | - | ||
247 | -#define task_hot(p, now, sd) ((long long) ((now) - (p)->last_ran) \ | ||
248 | +#define task_hot(p, now, sd) ((long long) ((now) - (p)->timestamp) \ | ||
249 | < (long long) (sd)->cache_hot_time) | ||
250 | |||
251 | /* | ||
252 | * These are the runqueue data structures: | ||
253 | */ | ||
254 | - | ||
255 | -#define BITMAP_SIZE ((((MAX_PRIO+1+7)/8)+sizeof(long)-1)/sizeof(long)) | ||
256 | - | ||
257 | typedef struct runqueue runqueue_t; | ||
258 | |||
259 | -struct prio_array { | ||
260 | - unsigned int nr_active; | ||
261 | - unsigned long bitmap[BITMAP_SIZE]; | ||
262 | - struct list_head queue[MAX_PRIO]; | ||
263 | -}; | ||
264 | - | ||
265 | /* | ||
266 | * This is the main, per-CPU runqueue data structure. | ||
267 | * | ||
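
With the interactivity-estimator macros deleted, the timing model reduces to the three definitions above. A userspace sketch (not part of the patch) tabulating what they evaluate to; note the arithmetic gives one jiffy (10ms) at HZ=100 and six jiffies (6ms) at HZ=1000, matching the "minimum of 6ms" comment, and that compute mode multiplies the interval by 17:

/* rr_math.c - evaluate the staircase interval macros for a given HZ
 * (userspace sketch; HZ and sched_compute are stand-ins). */
#include <stdio.h>

static int sched_compute;

#define _RR_INTERVAL(hz)   ((6 * (hz) / 1001) + 1)
#define RR_INTERVAL(hz)    (_RR_INTERVAL(hz) * (1 + 16 * sched_compute))
#define DEF_TIMESLICE(hz)  (RR_INTERVAL(hz) * 19)

int main(void)
{
    int hz;

    for (sched_compute = 0; sched_compute <= 1; sched_compute++)
        for (hz = 100; hz <= 1000; hz *= 10)
            printf("HZ=%4d compute=%d: rr=%3d jiffies, def_slice=%4d\n",
                   hz, sched_compute,
                   RR_INTERVAL(hz), DEF_TIMESLICE(hz));
    return 0;
}
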
268 | @@ -227,12 +129,12 @@ struct runqueue { | ||
269 | */ | ||
270 | unsigned long nr_uninterruptible; | ||
271 | |||
272 | - unsigned long expired_timestamp; | ||
273 | unsigned long long timestamp_last_tick; | ||
274 | + unsigned int cache_ticks, preempted; | ||
275 | task_t *curr, *idle; | ||
276 | struct mm_struct *prev_mm; | ||
277 | - prio_array_t *active, *expired, arrays[2]; | ||
278 | - int best_expired_prio; | ||
279 | + unsigned long bitmap[BITS_TO_LONGS(MAX_PRIO + 1)]; | ||
280 | + struct list_head queue[MAX_PRIO]; | ||
281 | atomic_t nr_iowait; | ||
282 | |||
283 | #ifdef CONFIG_SMP | ||
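
The runqueue hunk above drops the active/expired prio_array pair in favour of a single bitmap plus one list per priority. A minimal model of the O(1) pick this enables (sketch, not part of the patch), using the same find-first-set scan schedule() performs further down:

/* Minimal model of the single-array runqueue (userspace sketch):
 * one bit per priority plus one FIFO list per priority. */
#include <stdio.h>
#include <string.h>

#define MAX_PRIO    140
#define BITS        (8 * sizeof(unsigned long))

struct rq_sketch {
    unsigned long bitmap[(MAX_PRIO + BITS) / BITS];
    /* the real code keeps struct list_head queue[MAX_PRIO] here */
};

static int find_first_prio(const struct rq_sketch *rq)
{
    unsigned int i;

    for (i = 0; i < MAX_PRIO; i++)  /* models sched_find_first_bit() */
        if (rq->bitmap[i / BITS] & (1UL << (i % BITS)))
            return i;
    return MAX_PRIO;                /* empty: run the idle task */
}

int main(void)
{
    struct rq_sketch rq;

    memset(&rq, 0, sizeof(rq));
    rq.bitmap[120 / BITS] |= 1UL << (120 % BITS);   /* nice 0, no bonus */
    rq.bitmap[110 / BITS] |= 1UL << (110 % BITS);   /* higher priority */
    printf("next prio to run: %d\n", find_first_prio(&rq));    /* 110 */
    return 0;
}
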
284 | @@ -496,13 +398,7 @@ static inline runqueue_t *this_rq_lock(v | ||
285 | |||
286 | #ifdef CONFIG_SCHEDSTATS | ||
287 | /* | ||
288 | - * Called when a process is dequeued from the active array and given | ||
289 | - * the cpu. We should note that with the exception of interactive | ||
290 | - * tasks, the expired queue will become the active queue after the active | ||
291 | - * queue is empty, without explicitly dequeuing and requeuing tasks in the | ||
292 | - * expired queue. (Interactive tasks may be requeued directly to the | ||
293 | - * active queue, thus delaying tasks in the expired queue from running; | ||
294 | - * see scheduler_tick()). | ||
295 | + * Called when a process is dequeued and given the cpu. | ||
296 | * | ||
297 | * This function is only called from sched_info_arrive(), rather than | ||
298 | * dequeue_task(). Even though a task may be queued and dequeued multiple | ||
299 | @@ -540,13 +436,11 @@ static void sched_info_arrive(task_t *t) | ||
300 | } | ||
301 | |||
302 | /* | ||
303 | - * Called when a process is queued into either the active or expired | ||
304 | - * array. The time is noted and later used to determine how long we | ||
305 | - * had to wait for us to reach the cpu. Since the expired queue will | ||
306 | - * become the active queue after active queue is empty, without dequeuing | ||
307 | - * and requeuing any tasks, we are interested in queuing to either. It | ||
308 | - * is unusual but not impossible for tasks to be dequeued and immediately | ||
309 | - * requeued in the same or another array: this can happen in sched_yield(), | ||
310 | + * Called when a process is queued | ||
311 | + * The time is noted and later used to determine how long we had to wait for | ||
312 | + * us to reach the cpu. | ||
313 | + * It is unusual but not impossible for tasks to be dequeued and immediately | ||
314 | + * requeued: this can happen in sched_yield(), | ||
315 | * set_user_nice(), and even load_balance() as it moves tasks from runqueue | ||
316 | * to runqueue. | ||
317 | * | ||
318 | @@ -601,73 +495,67 @@ static inline void sched_info_switch(tas | ||
319 | #endif /* CONFIG_SCHEDSTATS */ | ||
320 | |||
321 | /* | ||
322 | - * Adding/removing a task to/from a priority array: | ||
323 | + * Get nanosecond clock difference without overflowing unsigned long. | ||
324 | */ | ||
325 | -static void dequeue_task(struct task_struct *p, prio_array_t *array) | ||
326 | +static unsigned long ns_diff(const unsigned long long v1, | ||
327 | + const unsigned long long v2) | ||
328 | { | ||
329 | - array->nr_active--; | ||
330 | - list_del(&p->run_list); | ||
331 | - if (list_empty(array->queue + p->prio)) | ||
332 | - __clear_bit(p->prio, array->bitmap); | ||
333 | + unsigned long long vdiff; | ||
334 | + if (likely(v1 > v2)) { | ||
335 | + vdiff = v1 - v2; | ||
336 | +#if BITS_PER_LONG < 64 | ||
337 | + if (vdiff > (1 << 31)) | ||
338 | + vdiff = 1 << 31; | ||
339 | +#endif | ||
340 | + } else { | ||
341 | + /* | ||
342 | + * Rarely the clock appears to go backwards. There should | ||
343 | + * always be a positive difference so return 1. | ||
344 | + */ | ||
345 | + vdiff = 1; | ||
346 | + } | ||
347 | + return (unsigned long)vdiff; | ||
348 | } | ||
349 | |||
350 | -static void enqueue_task(struct task_struct *p, prio_array_t *array) | ||
351 | +static inline int task_queued(const task_t *task) | ||
352 | { | ||
353 | - sched_info_queued(p); | ||
354 | - list_add_tail(&p->run_list, array->queue + p->prio); | ||
355 | - __set_bit(p->prio, array->bitmap); | ||
356 | - array->nr_active++; | ||
357 | - p->array = array; | ||
358 | + return !list_empty(&task->run_list); | ||
359 | } | ||
360 | |||
361 | /* | ||
362 | - * Put task to the end of the run list without the overhead of dequeue | ||
363 | - * followed by enqueue. | ||
364 | + * Adding/removing a task to/from a runqueue: | ||
365 | */ | ||
366 | -static void requeue_task(struct task_struct *p, prio_array_t *array) | ||
367 | +static void fastcall dequeue_task(task_t *p, runqueue_t *rq) | ||
368 | { | ||
369 | - list_move_tail(&p->run_list, array->queue + p->prio); | ||
370 | + list_del_init(&p->run_list); | ||
371 | + if (list_empty(rq->queue + p->prio)) | ||
372 | + __clear_bit(p->prio, rq->bitmap); | ||
373 | + p->ns_debit = 0; | ||
374 | } | ||
375 | |||
376 | -static inline void enqueue_task_head(struct task_struct *p, prio_array_t *array) | ||
377 | +static void fastcall enqueue_task(task_t *p, runqueue_t *rq) | ||
378 | { | ||
379 | - list_add(&p->run_list, array->queue + p->prio); | ||
380 | - __set_bit(p->prio, array->bitmap); | ||
381 | - array->nr_active++; | ||
382 | - p->array = array; | ||
383 | + list_add_tail(&p->run_list, rq->queue + p->prio); | ||
384 | + __set_bit(p->prio, rq->bitmap); | ||
385 | } | ||
386 | |||
387 | /* | ||
388 | - * effective_prio - return the priority that is based on the static | ||
389 | - * priority but is modified by bonuses/penalties. | ||
390 | - * | ||
391 | - * We scale the actual sleep average [0 .... MAX_SLEEP_AVG] | ||
392 | - * into the -5 ... 0 ... +5 bonus/penalty range. | ||
393 | - * | ||
394 | - * We use 25% of the full 0...39 priority range so that: | ||
395 | - * | ||
396 | - * 1) nice +19 interactive tasks do not preempt nice 0 CPU hogs. | ||
397 | - * 2) nice -20 CPU hogs do not get preempted by nice 0 tasks. | ||
398 | - * | ||
399 | - * Both properties are important to certain workloads. | ||
400 | + * Put task to the end of the run list without the overhead of dequeue | ||
401 | + * followed by enqueue. | ||
402 | */ | ||
403 | -static int effective_prio(task_t *p) | ||
404 | +static inline void requeue_task(task_t *p, runqueue_t *rq) | ||
405 | { | ||
406 | - int bonus, prio; | ||
407 | - | ||
408 | - if (rt_task(p)) | ||
409 | - return p->prio; | ||
410 | - | ||
411 | - bonus = CURRENT_BONUS(p) - MAX_BONUS / 2; | ||
412 | + list_move_tail(&p->run_list, rq->queue + p->prio); | ||
413 | +} | ||
414 | |||
415 | - prio = p->static_prio - bonus; | ||
416 | - if (prio < MAX_RT_PRIO) | ||
417 | - prio = MAX_RT_PRIO; | ||
418 | - if (prio > MAX_PRIO-1) | ||
419 | - prio = MAX_PRIO-1; | ||
420 | - return prio; | ||
421 | +static inline void enqueue_task_head(task_t *p, runqueue_t *rq) | ||
422 | +{ | ||
423 | + list_add(&p->run_list, rq->queue + p->prio); | ||
424 | + __set_bit(p->prio, rq->bitmap); | ||
425 | } | ||
426 | |||
427 | +static unsigned int fastcall slice(const task_t *p); | ||
428 | + | ||
429 | /* | ||
430 | * To aid in avoiding the subversion of "niceness" due to uneven distribution | ||
431 | * of tasks with abnormal "nice" values across CPUs the contribution that | ||
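
ns_diff() above defends against a clock that appears to step backwards (returning 1 so callers always see a positive difference) and, on 32-bit, clamps differences that would overflow unsigned long. A standalone check of that behaviour (sketch; the patch's compile-time #if BITS_PER_LONG guard becomes a runtime sizeof test here):

/* ns_diff() behaviour check (userspace sketch of the function above). */
#include <stdio.h>

static unsigned long ns_diff(unsigned long long v1, unsigned long long v2)
{
    unsigned long long vdiff;

    if (v1 > v2) {
        vdiff = v1 - v2;
        if (sizeof(long) < 8 && vdiff > (1UL << 31))
            vdiff = 1UL << 31;  /* clamp so the cast below is safe */
    } else {
        vdiff = 1;  /* clock went backwards: smallest positive value */
    }
    return (unsigned long)vdiff;
}

int main(void)
{
    printf("%lu\n", ns_diff(2000, 500));        /* 1500 */
    printf("%lu\n", ns_diff(500, 2000));        /* 1 */
    printf("%lu\n", ns_diff(1ULL << 40, 0));    /* clamped on 32-bit */
    return 0;
}
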
432 | @@ -685,10 +573,9 @@ static int effective_prio(task_t *p) | ||
433 | #define TIME_SLICE_NICE_ZERO DEF_TIMESLICE | ||
434 | #define LOAD_WEIGHT(lp) \ | ||
435 | (((lp) * SCHED_LOAD_SCALE) / TIME_SLICE_NICE_ZERO) | ||
436 | -#define PRIO_TO_LOAD_WEIGHT(prio) \ | ||
437 | - LOAD_WEIGHT(static_prio_timeslice(prio)) | ||
438 | -#define RTPRIO_TO_LOAD_WEIGHT(rp) \ | ||
439 | - (PRIO_TO_LOAD_WEIGHT(MAX_RT_PRIO) + LOAD_WEIGHT(rp)) | ||
440 | +#define TASK_LOAD_WEIGHT(p) LOAD_WEIGHT(slice(p)) | ||
441 | +#define RTPRIO_TO_LOAD_WEIGHT(rp) \ | ||
442 | + (LOAD_WEIGHT((RR_INTERVAL() + 20 + (rp)))) | ||
443 | |||
444 | static void set_load_weight(task_t *p) | ||
445 | { | ||
446 | @@ -705,7 +592,7 @@ static void set_load_weight(task_t *p) | ||
447 | #endif | ||
448 | p->load_weight = RTPRIO_TO_LOAD_WEIGHT(p->rt_priority); | ||
449 | } else | ||
450 | - p->load_weight = PRIO_TO_LOAD_WEIGHT(p->static_prio); | ||
451 | + p->load_weight = TASK_LOAD_WEIGHT(p); | ||
452 | } | ||
453 | |||
454 | static inline void inc_raw_weighted_load(runqueue_t *rq, const task_t *p) | ||
455 | @@ -733,9 +620,9 @@ static inline void dec_nr_running(task_t | ||
456 | /* | ||
457 | * __activate_task - move a task to the runqueue. | ||
458 | */ | ||
459 | -static inline void __activate_task(task_t *p, runqueue_t *rq) | ||
460 | +static void fastcall __activate_task(task_t *p, runqueue_t *rq) | ||
461 | { | ||
462 | - enqueue_task(p, rq->active); | ||
463 | + enqueue_task(p, rq); | ||
464 | inc_nr_running(p, rq); | ||
465 | } | ||
466 | |||
467 | @@ -744,74 +631,157 @@ static inline void __activate_task(task_ | ||
468 | */ | ||
469 | static inline void __activate_idle_task(task_t *p, runqueue_t *rq) | ||
470 | { | ||
471 | - enqueue_task_head(p, rq->active); | ||
472 | + enqueue_task_head(p, rq); | ||
473 | inc_nr_running(p, rq); | ||
474 | } | ||
475 | |||
476 | -static int recalc_task_prio(task_t *p, unsigned long long now) | ||
477 | +/* | ||
478 | + * Bonus - How much higher than its base priority an interactive task can run. | ||
479 | + */ | ||
480 | +static inline unsigned int bonus(const task_t *p) | ||
481 | { | ||
482 | - /* Caller must always ensure 'now >= p->timestamp' */ | ||
483 | - unsigned long long __sleep_time = now - p->timestamp; | ||
484 | - unsigned long sleep_time; | ||
485 | - | ||
486 | - if (unlikely(p->policy == SCHED_BATCH)) | ||
487 | - sleep_time = 0; | ||
488 | - else { | ||
489 | - if (__sleep_time > NS_MAX_SLEEP_AVG) | ||
490 | - sleep_time = NS_MAX_SLEEP_AVG; | ||
491 | - else | ||
492 | - sleep_time = (unsigned long)__sleep_time; | ||
493 | - } | ||
494 | + return TASK_USER_PRIO(p); | ||
495 | +} | ||
496 | |||
497 | - if (likely(sleep_time > 0)) { | ||
498 | - /* | ||
499 | - * User tasks that sleep a long time are categorised as | ||
500 | - * idle and will get just interactive status to stay active & | ||
501 | - * prevent them suddenly becoming cpu hogs and starving | ||
502 | - * other processes. | ||
503 | - */ | ||
504 | - if (p->mm && p->activated != -1 && | ||
505 | - sleep_time > INTERACTIVE_SLEEP(p)) { | ||
506 | - p->sleep_avg = JIFFIES_TO_NS(MAX_SLEEP_AVG - | ||
507 | - DEF_TIMESLICE); | ||
508 | - } else { | ||
509 | - /* | ||
510 | - * The lower the sleep avg a task has the more | ||
511 | - * rapidly it will rise with sleep time. | ||
512 | - */ | ||
513 | - sleep_time *= (MAX_BONUS - CURRENT_BONUS(p)) ? : 1; | ||
514 | +static unsigned int fastcall rr_interval(const task_t *p) | ||
515 | +{ | ||
516 | + int nice = TASK_NICE(p); | ||
517 | |||
518 | - /* | ||
519 | - * Tasks waking from uninterruptible sleep are | ||
520 | - * limited in their sleep_avg rise as they | ||
521 | - * are likely to be waiting on I/O | ||
522 | - */ | ||
523 | - if (p->activated == -1 && p->mm) { | ||
524 | - if (p->sleep_avg >= INTERACTIVE_SLEEP(p)) | ||
525 | - sleep_time = 0; | ||
526 | - else if (p->sleep_avg + sleep_time >= | ||
527 | - INTERACTIVE_SLEEP(p)) { | ||
528 | - p->sleep_avg = INTERACTIVE_SLEEP(p); | ||
529 | - sleep_time = 0; | ||
530 | - } | ||
531 | - } | ||
532 | + if (nice < 0 && !rt_task(p)) | ||
533 | + return RR_INTERVAL() * (20 - nice) / 20; | ||
534 | + return RR_INTERVAL(); | ||
535 | +} | ||
536 | |||
537 | - /* | ||
538 | - * This code gives a bonus to interactive tasks. | ||
539 | - * | ||
540 | - * The boost works by updating the 'average sleep time' | ||
541 | - * value here, based on ->timestamp. The more time a | ||
542 | - * task spends sleeping, the higher the average gets - | ||
543 | - * and the higher the priority boost gets as well. | ||
544 | - */ | ||
545 | - p->sleep_avg += sleep_time; | ||
546 | +/* | ||
547 | + * slice - the duration a task runs before getting requeued at its best | ||
548 | + * priority and has its bonus decremented. | ||
549 | + */ | ||
550 | +static unsigned int fastcall slice(const task_t *p) | ||
551 | +{ | ||
552 | + unsigned int slice, rr; | ||
553 | |||
554 | - if (p->sleep_avg > NS_MAX_SLEEP_AVG) | ||
555 | - p->sleep_avg = NS_MAX_SLEEP_AVG; | ||
556 | - } | ||
557 | + slice = rr = rr_interval(p); | ||
558 | + if (likely(!rt_task(p))) | ||
559 | + slice += (39 - TASK_USER_PRIO(p)) * rr; | ||
560 | + return slice; | ||
561 | +} | ||
562 | + | ||
563 | +/* | ||
564 | + * We increase our bonus by sleeping more than the time we ran. | ||
565 | + * The ratio of sleep to run gives us the cpu% that we last ran and determines | ||
566 | + * the maximum bonus we can acquire. | ||
567 | + */ | ||
568 | +static void fastcall inc_bonus(task_t *p, const unsigned long totalrun, | ||
569 | + const unsigned long sleep) | ||
570 | +{ | ||
571 | + unsigned int best_bonus; | ||
572 | + | ||
573 | + best_bonus = sleep / (totalrun + 1); | ||
574 | + if (p->bonus >= best_bonus) | ||
575 | + return; | ||
576 | + | ||
577 | + p->bonus++; | ||
578 | + best_bonus = bonus(p); | ||
579 | + if (p->bonus > best_bonus) | ||
580 | + p->bonus = best_bonus; | ||
581 | +} | ||
582 | + | ||
583 | +static void dec_bonus(task_t *p) | ||
584 | +{ | ||
585 | + if (p->bonus) | ||
586 | + p->bonus--; | ||
587 | +} | ||
588 | + | ||
589 | +/* | ||
590 | + * sched_interactive - sysctl which allows interactive tasks to have their | ||
591 | + * bonus raise their priority. | ||
592 | + */ | ||
593 | +int sched_interactive __read_mostly = 1; | ||
594 | + | ||
595 | +/* | ||
596 | + * effective_prio - dynamic priority dependent on bonus. | ||
597 | + * The priority normally decreases by one each RR_INTERVAL. | ||
598 | + * As the bonus increases the initial priority starts at a higher "stair" or | ||
599 | + * priority for longer. | ||
600 | + */ | ||
601 | +static int effective_prio(const task_t *p) | ||
602 | +{ | ||
603 | + int prio; | ||
604 | + unsigned int full_slice, used_slice = 0; | ||
605 | + unsigned int best_bonus, rr; | ||
606 | + | ||
607 | + if (rt_task(p)) | ||
608 | + return p->prio; | ||
609 | + | ||
610 | + full_slice = slice(p); | ||
611 | + if (full_slice > p->slice) | ||
612 | + used_slice = full_slice - p->slice; | ||
613 | + | ||
614 | + best_bonus = bonus(p); | ||
615 | + prio = MAX_RT_PRIO + best_bonus; | ||
616 | + if (sched_interactive && !sched_compute && p->policy != SCHED_BATCH) | ||
617 | + prio -= p->bonus; | ||
618 | + | ||
619 | + rr = rr_interval(p); | ||
620 | + prio += used_slice / rr; | ||
621 | + if (prio > MAX_PRIO - 1) | ||
622 | + prio = MAX_PRIO - 1; | ||
623 | + return prio; | ||
624 | +} | ||
625 | + | ||
626 | +static inline void continue_slice(task_t *p) | ||
627 | +{ | ||
628 | + unsigned long total_run = NS_TO_JIFFIES(p->totalrun); | ||
629 | + | ||
630 | + if (total_run >= p->slice) { | ||
631 | + p->totalrun -= JIFFIES_TO_NS(p->slice); | ||
632 | + dec_bonus(p); | ||
633 | + } else { | ||
634 | + unsigned int remainder; | ||
635 | + | ||
636 | + p->slice -= total_run; | ||
637 | + remainder = p->slice % rr_interval(p); | ||
638 | + if (remainder) | ||
639 | + p->time_slice = remainder; | ||
640 | } | ||
641 | +} | ||
642 | |||
643 | - return effective_prio(p); | ||
644 | +/* | ||
645 | + * recalc_task_prio - this checks for tasks that run ultra short timeslices | ||
646 | + * or have just forked a thread/process and makes them continue their old | ||
647 | + * slice instead of starting a new one at high priority. | ||
648 | + */ | ||
649 | +static inline void recalc_task_prio(task_t *p, const unsigned long long now) | ||
650 | +{ | ||
651 | + unsigned long sleep_time = ns_diff(now, p->timestamp); | ||
652 | + | ||
653 | + /* | ||
654 | + * Add the total for this last scheduled run (p->runtime) to the | ||
655 | + * running total so far used (p->totalrun). | ||
656 | + */ | ||
657 | + p->totalrun += p->runtime; | ||
658 | + | ||
659 | + /* | ||
660 | + * If we sleep longer than our running total and have not set the | ||
661 | + * PF_NONSLEEP flag we gain a bonus. | ||
662 | + */ | ||
663 | + if (sleep_time >= p->totalrun && !(p->flags & PF_NONSLEEP) && | ||
664 | + !sched_compute) { | ||
665 | + inc_bonus(p, p->totalrun, sleep_time); | ||
666 | + p->totalrun = 0; | ||
667 | + return; | ||
668 | + } | ||
669 | + | ||
670 | + /* | ||
671 | + * If we have not set the PF_NONSLEEP flag we elevate priority by the | ||
672 | + * amount of time we slept. | ||
673 | + */ | ||
674 | + if (p->flags & PF_NONSLEEP) | ||
675 | + p->flags &= ~PF_NONSLEEP; | ||
676 | + else | ||
677 | + p->totalrun -= sleep_time; | ||
678 | + | ||
679 | + continue_slice(p); | ||
680 | } | ||
681 | |||
682 | /* | ||
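
Taken together, rr_interval(), slice() and effective_prio() above are the staircase proper: a task starts at MAX_RT_PRIO + TASK_USER_PRIO(p) - p->bonus and falls one priority for every rr_interval() of CPU it uses, bottoming out at MAX_PRIO-1. A userspace sketch tabulating the descent (constants mirror mainline 2.6: MAX_RT_PRIO 100, MAX_PRIO 140; HZ assumed 1000, sched_interactive on, bonus 0):

/* staircase.c - tabulate the priority descent of slice()/effective_prio()
 * (userspace sketch; constants mirror the patch, HZ fixed at 1000). */
#include <stdio.h>

#define HZ          1000
#define MAX_RT_PRIO 100
#define MAX_PRIO    140
#define RR_INTERVAL ((6 * HZ / 1001) + 1)   /* sched_compute == 0 */

static int user_prio(int nice) { return nice + 20; }    /* TASK_USER_PRIO */

static unsigned int rr_interval(int nice)
{
    if (nice < 0)
        return RR_INTERVAL * (20 - nice) / 20;
    return RR_INTERVAL;
}

static unsigned int slice(int nice)
{
    unsigned int rr = rr_interval(nice);

    return rr + (39 - user_prio(nice)) * rr;
}

static int effective_prio(int nice, unsigned int used, unsigned int bonus)
{
    int prio = MAX_RT_PRIO + user_prio(nice) - bonus;

    prio += used / rr_interval(nice);
    return prio > MAX_PRIO - 1 ? MAX_PRIO - 1 : prio;
}

int main(void)
{
    int nices[] = { -10, 0, 10 }, i;
    unsigned int used;

    for (i = 0; i < 3; i++) {
        int nice = nices[i];

        printf("nice %3d: slice=%3u jiffies, prio steps:", nice, slice(nice));
        for (used = 0; used < slice(nice); used += rr_interval(nice))
            printf(" %d", effective_prio(nice, used, 0));
        printf("\n");
    }
    return 0;
}

The table makes the "staircase" visible: negative nice both widens each step (larger rr_interval) and adds steps (larger slice), so such a task starts higher and stays above the crowd for longer before converging on MAX_PRIO-1.
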
683 | @@ -820,11 +790,11 @@ static int recalc_task_prio(task_t *p, u | ||
684 | * Update all the scheduling statistics stuff. (sleep average | ||
685 | * calculation, priority modifiers, etc.) | ||
686 | */ | ||
687 | -static void activate_task(task_t *p, runqueue_t *rq, int local) | ||
688 | +static void activate_task(task_t *p, runqueue_t *rq, const int local) | ||
689 | { | ||
690 | - unsigned long long now; | ||
691 | + unsigned long long now = sched_clock(); | ||
692 | + unsigned long rr = rr_interval(p); | ||
693 | |||
694 | - now = sched_clock(); | ||
695 | #ifdef CONFIG_SMP | ||
696 | if (!local) { | ||
697 | /* Compensate for drifting sched_clock */ | ||
698 | @@ -833,45 +803,24 @@ static void activate_task(task_t *p, run | ||
699 | + rq->timestamp_last_tick; | ||
700 | } | ||
701 | #endif | ||
702 | - | ||
703 | - if (!rt_task(p)) | ||
704 | - p->prio = recalc_task_prio(p, now); | ||
705 | - | ||
706 | - /* | ||
707 | - * This checks to make sure it's not an uninterruptible task | ||
708 | - * that is now waking up. | ||
709 | - */ | ||
710 | - if (!p->activated) { | ||
711 | - /* | ||
712 | - * Tasks which were woken up by interrupts (ie. hw events) | ||
713 | - * are most likely of interactive nature. So we give them | ||
714 | - * the credit of extending their sleep time to the period | ||
715 | - * of time they spend on the runqueue, waiting for execution | ||
716 | - * on a CPU, first time around: | ||
717 | - */ | ||
718 | - if (in_interrupt()) | ||
719 | - p->activated = 2; | ||
720 | - else { | ||
721 | - /* | ||
722 | - * Normal first-time wakeups get a credit too for | ||
723 | - * on-runqueue time, but it will be weighted down: | ||
724 | - */ | ||
725 | - p->activated = 1; | ||
726 | - } | ||
727 | + p->slice = slice(p); | ||
728 | + p->time_slice = p->slice % rr ? : rr; | ||
729 | + if (!rt_task(p)) { | ||
730 | + recalc_task_prio(p, now); | ||
731 | + p->flags &= ~PF_NONSLEEP; | ||
732 | + p->prio = effective_prio(p); | ||
733 | } | ||
734 | p->timestamp = now; | ||
735 | - | ||
736 | __activate_task(p, rq); | ||
737 | } | ||
738 | |||
739 | /* | ||
740 | * deactivate_task - remove a task from the runqueue. | ||
741 | */ | ||
742 | -static void deactivate_task(struct task_struct *p, runqueue_t *rq) | ||
743 | +static void fastcall deactivate_task(task_t *p, runqueue_t *rq) | ||
744 | { | ||
745 | dec_nr_running(p, rq); | ||
746 | - dequeue_task(p, p->array); | ||
747 | - p->array = NULL; | ||
748 | + dequeue_task(p, rq); | ||
749 | } | ||
750 | |||
751 | /* | ||
752 | @@ -947,7 +896,7 @@ static int migrate_task(task_t *p, int d | ||
753 | * If the task is not on a runqueue (and not running), then | ||
754 | * it is sufficient to simply update the task's cpu field. | ||
755 | */ | ||
756 | - if (!p->array && !task_running(rq, p)) { | ||
757 | + if (!task_queued(p) && !task_running(rq, p)) { | ||
758 | set_task_cpu(p, dest_cpu); | ||
759 | return 0; | ||
760 | } | ||
761 | @@ -977,7 +926,7 @@ void wait_task_inactive(task_t *p) | ||
762 | repeat: | ||
763 | rq = task_rq_lock(p, &flags); | ||
764 | /* Must be off runqueue entirely, not preempted. */ | ||
765 | - if (unlikely(p->array || task_running(rq, p))) { | ||
766 | + if (unlikely(task_queued(p) || task_running(rq, p))) { | ||
767 | /* If it's preempted, we yield. It could be a while. */ | ||
768 | preempted = !task_running(rq, p); | ||
769 | task_rq_unlock(rq, &flags); | ||
770 | @@ -1228,6 +1177,26 @@ static inline int wake_idle(int cpu, tas | ||
771 | } | ||
772 | #endif | ||
773 | |||
774 | +/* | ||
775 | + * CACHE_DELAY is the time preemption is delayed in sched_compute mode | ||
776 | + * and is set to a nominal 10ms. | ||
777 | + */ | ||
778 | +#define CACHE_DELAY (10 * (HZ) / 1001 + 1) | ||
779 | + | ||
780 | +/* | ||
781 | + * Check to see if p preempts rq->curr and resched if it does. In compute | ||
782 | + * mode we do not preempt for at least CACHE_DELAY and set rq->preempted. | ||
783 | + */ | ||
784 | +static void fastcall preempt(const task_t *p, runqueue_t *rq) | ||
785 | +{ | ||
786 | + if (p->prio >= rq->curr->prio) | ||
787 | + return; | ||
788 | + if (!sched_compute || rq->cache_ticks >= CACHE_DELAY || | ||
789 | + !p->mm || rt_task(p)) | ||
790 | + resched_task(rq->curr); | ||
791 | + rq->preempted = 1; | ||
792 | +} | ||
793 | + | ||
794 | /*** | ||
795 | * try_to_wake_up - wake up a thread | ||
796 | * @p: the to-be-woken-up thread | ||
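
preempt() above resched's immediately unless compute mode is on and the running task has had fewer than CACHE_DELAY ticks of cache warmth (kernel threads and RT wakers always preempt); otherwise it only marks rq->preempted for scheduler_tick() to honour later. What the nominal 10ms works out to (sketch, not part of the patch):

/* CACHE_DELAY arithmetic for common HZ values (userspace sketch). */
#include <stdio.h>

#define CACHE_DELAY(hz) (10 * (hz) / 1001 + 1)  /* nominally 10ms */

int main(void)
{
    int hz[] = { 100, 250, 1000 }, i;

    for (i = 0; i < 3; i++)
        printf("HZ=%4d: CACHE_DELAY=%2d ticks (~%d ms)\n",
               hz[i], CACHE_DELAY(hz[i]),
               CACHE_DELAY(hz[i]) * 1000 / hz[i]);
    return 0;
}
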
797 | @@ -1259,7 +1228,7 @@ static int try_to_wake_up(task_t *p, uns | ||
798 | if (!(old_state & state)) | ||
799 | goto out; | ||
800 | |||
801 | - if (p->array) | ||
802 | + if (task_queued(p)) | ||
803 | goto out_running; | ||
804 | |||
805 | cpu = task_cpu(p); | ||
806 | @@ -1350,7 +1319,7 @@ out_set_cpu: | ||
807 | old_state = p->state; | ||
808 | if (!(old_state & state)) | ||
809 | goto out; | ||
810 | - if (p->array) | ||
811 | + if (task_queued(p)) | ||
812 | goto out_running; | ||
813 | |||
814 | this_cpu = smp_processor_id(); | ||
815 | @@ -1359,26 +1328,10 @@ out_set_cpu: | ||
816 | |||
817 | out_activate: | ||
818 | #endif /* CONFIG_SMP */ | ||
819 | - if (old_state == TASK_UNINTERRUPTIBLE) { | ||
820 | + if (old_state == TASK_UNINTERRUPTIBLE) | ||
821 | rq->nr_uninterruptible--; | ||
822 | - /* | ||
823 | - * Tasks on involuntary sleep don't earn | ||
824 | - * sleep_avg beyond just interactive state. | ||
825 | - */ | ||
826 | - p->activated = -1; | ||
827 | - } | ||
828 | |||
829 | /* | ||
830 | - * Tasks that have marked their sleep as noninteractive get | ||
831 | - * woken up without updating their sleep average. (i.e. their | ||
832 | - * sleep is handled in a priority-neutral manner, no priority | ||
833 | - * boost and no penalty.) | ||
834 | - */ | ||
835 | - if (old_state & TASK_NONINTERACTIVE) | ||
836 | - __activate_task(p, rq); | ||
837 | - else | ||
838 | - activate_task(p, rq, cpu == this_cpu); | ||
839 | - /* | ||
840 | * Sync wakeups (i.e. those types of wakeups where the waker | ||
841 | * has indicated that it will leave the CPU in short order) | ||
842 | * don't trigger a preemption, if the woken up task will run on | ||
843 | @@ -1386,10 +1339,9 @@ out_activate: | ||
844 | * the waker guarantees that the freshly woken up task is going | ||
845 | * to be considered on this CPU.) | ||
846 | */ | ||
847 | - if (!sync || cpu != this_cpu) { | ||
848 | - if (TASK_PREEMPTS_CURR(p, rq)) | ||
849 | - resched_task(rq->curr); | ||
850 | - } | ||
851 | + activate_task(p, rq, cpu == this_cpu); | ||
852 | + if (!sync || cpu != this_cpu) | ||
853 | + preempt(p, rq); | ||
854 | success = 1; | ||
855 | |||
856 | out_running: | ||
857 | @@ -1434,7 +1386,6 @@ void fastcall sched_fork(task_t *p, int | ||
858 | */ | ||
859 | p->state = TASK_RUNNING; | ||
860 | INIT_LIST_HEAD(&p->run_list); | ||
861 | - p->array = NULL; | ||
862 | #ifdef CONFIG_SCHEDSTATS | ||
863 | memset(&p->sched_info, 0, sizeof(p->sched_info)); | ||
864 | #endif | ||
865 | @@ -1445,30 +1396,6 @@ void fastcall sched_fork(task_t *p, int | ||
866 | /* Want to start with kernel preemption disabled. */ | ||
867 | task_thread_info(p)->preempt_count = 1; | ||
868 | #endif | ||
869 | - /* | ||
870 | - * Share the timeslice between parent and child, thus the | ||
871 | - * total amount of pending timeslices in the system doesn't change, | ||
872 | - * resulting in more scheduling fairness. | ||
873 | - */ | ||
874 | - local_irq_disable(); | ||
875 | - p->time_slice = (current->time_slice + 1) >> 1; | ||
876 | - /* | ||
877 | - * The remainder of the first timeslice might be recovered by | ||
878 | - * the parent if the child exits early enough. | ||
879 | - */ | ||
880 | - p->first_time_slice = 1; | ||
881 | - current->time_slice >>= 1; | ||
882 | - p->timestamp = sched_clock(); | ||
883 | - if (unlikely(!current->time_slice)) { | ||
884 | - /* | ||
885 | - * This case is rare, it happens when the parent has only | ||
886 | - * a single jiffy left from its timeslice. Taking the | ||
887 | - * runqueue lock is not a problem. | ||
888 | - */ | ||
889 | - current->time_slice = 1; | ||
890 | - scheduler_tick(); | ||
891 | - } | ||
892 | - local_irq_enable(); | ||
893 | put_cpu(); | ||
894 | } | ||
895 | |||
896 | @@ -1491,36 +1418,20 @@ void fastcall wake_up_new_task(task_t *p | ||
897 | cpu = task_cpu(p); | ||
898 | |||
899 | /* | ||
900 | - * We decrease the sleep average of forking parents | ||
901 | - * and children as well, to keep max-interactive tasks | ||
902 | - * from forking tasks that are max-interactive. The parent | ||
903 | - * (current) is done further down, under its lock. | ||
904 | + * Forked process gets no bonus to prevent fork bombs. | ||
905 | */ | ||
906 | - p->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(p) * | ||
907 | - CHILD_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS); | ||
908 | - | ||
909 | - p->prio = effective_prio(p); | ||
910 | + p->bonus = 0; | ||
911 | |||
912 | if (likely(cpu == this_cpu)) { | ||
913 | - if (!(clone_flags & CLONE_VM)) { | ||
914 | + current->flags |= PF_NONSLEEP; | ||
915 | + activate_task(p, rq, 1); | ||
916 | + if (!(clone_flags & CLONE_VM)) | ||
917 | /* | ||
918 | * The VM isn't cloned, so we're in a good position to | ||
919 | * do child-runs-first in anticipation of an exec. This | ||
920 | * usually avoids a lot of COW overhead. | ||
921 | */ | ||
922 | - if (unlikely(!current->array)) | ||
923 | - __activate_task(p, rq); | ||
924 | - else { | ||
925 | - p->prio = current->prio; | ||
926 | - list_add_tail(&p->run_list, ¤t->run_list); | ||
927 | - p->array = current->array; | ||
928 | - p->array->nr_active++; | ||
929 | - inc_nr_running(p, rq); | ||
930 | - } | ||
931 | set_need_resched(); | ||
932 | - } else | ||
933 | - /* Run child last */ | ||
934 | - __activate_task(p, rq); | ||
935 | /* | ||
936 | * We skip the following code due to cpu == this_cpu | ||
937 | * | ||
938 | @@ -1537,53 +1448,20 @@ void fastcall wake_up_new_task(task_t *p | ||
939 | */ | ||
940 | p->timestamp = (p->timestamp - this_rq->timestamp_last_tick) | ||
941 | + rq->timestamp_last_tick; | ||
942 | - __activate_task(p, rq); | ||
943 | - if (TASK_PREEMPTS_CURR(p, rq)) | ||
944 | - resched_task(rq->curr); | ||
945 | + activate_task(p, rq, 0); | ||
946 | + preempt(p, rq); | ||
947 | |||
948 | /* | ||
949 | * Parent and child are on different CPUs, now get the | ||
950 | - * parent runqueue to update the parent's ->sleep_avg: | ||
951 | + * parent runqueue to update the parent's ->flags: | ||
952 | */ | ||
953 | task_rq_unlock(rq, &flags); | ||
954 | this_rq = task_rq_lock(current, &flags); | ||
955 | + current->flags |= PF_NONSLEEP; | ||
956 | } | ||
957 | - current->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(current) * | ||
958 | - PARENT_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS); | ||
959 | task_rq_unlock(this_rq, &flags); | ||
960 | } | ||
961 | |||
962 | -/* | ||
963 | - * Potentially available exiting-child timeslices are | ||
964 | - * retrieved here - this way the parent does not get | ||
965 | - * penalized for creating too many threads. | ||
966 | - * | ||
967 | - * (this cannot be used to 'generate' timeslices | ||
968 | - * artificially, because any timeslice recovered here | ||
969 | - * was given away by the parent in the first place.) | ||
970 | - */ | ||
971 | -void fastcall sched_exit(task_t *p) | ||
972 | -{ | ||
973 | - unsigned long flags; | ||
974 | - runqueue_t *rq; | ||
975 | - | ||
976 | - /* | ||
977 | - * If the child was a (relative-) CPU hog then decrease | ||
978 | - * the sleep_avg of the parent as well. | ||
979 | - */ | ||
980 | - rq = task_rq_lock(p->parent, &flags); | ||
981 | - if (p->first_time_slice && task_cpu(p) == task_cpu(p->parent)) { | ||
982 | - p->parent->time_slice += p->time_slice; | ||
983 | - if (unlikely(p->parent->time_slice > task_timeslice(p))) | ||
984 | - p->parent->time_slice = task_timeslice(p); | ||
985 | - } | ||
986 | - if (p->sleep_avg < p->parent->sleep_avg) | ||
987 | - p->parent->sleep_avg = p->parent->sleep_avg / | ||
988 | - (EXIT_WEIGHT + 1) * EXIT_WEIGHT + p->sleep_avg / | ||
989 | - (EXIT_WEIGHT + 1); | ||
990 | - task_rq_unlock(rq, &flags); | ||
991 | -} | ||
992 | - | ||
993 | /** | ||
994 | * prepare_task_switch - prepare to switch tasks | ||
995 | * @rq: the runqueue preparing to switch | ||
996 | @@ -1855,32 +1733,28 @@ void sched_exec(void) | ||
997 | * pull_task - move a task from a remote runqueue to the local runqueue. | ||
998 | * Both runqueues must be locked. | ||
999 | */ | ||
1000 | -static | ||
1001 | -void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p, | ||
1002 | - runqueue_t *this_rq, prio_array_t *this_array, int this_cpu) | ||
1003 | +static void pull_task(runqueue_t *src_rq, task_t *p, runqueue_t *this_rq, | ||
1004 | + const int this_cpu) | ||
1005 | { | ||
1006 | - dequeue_task(p, src_array); | ||
1007 | + dequeue_task(p, src_rq); | ||
1008 | dec_nr_running(p, src_rq); | ||
1009 | set_task_cpu(p, this_cpu); | ||
1010 | inc_nr_running(p, this_rq); | ||
1011 | - enqueue_task(p, this_array); | ||
1012 | + enqueue_task(p, this_rq); | ||
1013 | p->timestamp = (p->timestamp - src_rq->timestamp_last_tick) | ||
1014 | + this_rq->timestamp_last_tick; | ||
1015 | /* | ||
1016 | * Note that idle threads have a prio of MAX_PRIO, for this test | ||
1017 | * to be always true for them. | ||
1018 | */ | ||
1019 | - if (TASK_PREEMPTS_CURR(p, this_rq)) | ||
1020 | - resched_task(this_rq->curr); | ||
1021 | + preempt(p, this_rq); | ||
1022 | } | ||
1023 | |||
1024 | /* | ||
1025 | * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? | ||
1026 | */ | ||
1027 | -static | ||
1028 | -int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu, | ||
1029 | - struct sched_domain *sd, enum idle_type idle, | ||
1030 | - int *all_pinned) | ||
1031 | +static int can_migrate_task(task_t *p, runqueue_t *rq, const int this_cpu, | ||
1032 | + struct sched_domain *sd, const enum idle_type idle, int *all_pinned) | ||
1033 | { | ||
1034 | /* | ||
1035 | * We do not migrate tasks that are: | ||
1036 | @@ -1921,7 +1795,6 @@ static int move_tasks(runqueue_t *this_r | ||
1037 | struct sched_domain *sd, enum idle_type idle, | ||
1038 | int *all_pinned) | ||
1039 | { | ||
1040 | - prio_array_t *array, *dst_array; | ||
1041 | struct list_head *head, *curr; | ||
1042 | int idx, pulled = 0, pinned = 0; | ||
1043 | long rem_load_move; | ||
1044 | @@ -1933,38 +1806,17 @@ static int move_tasks(runqueue_t *this_r | ||
1045 | rem_load_move = max_load_move; | ||
1046 | pinned = 1; | ||
1047 | |||
1048 | - /* | ||
1049 | - * We first consider expired tasks. Those will likely not be | ||
1050 | - * executed in the near future, and they are most likely to | ||
1051 | - * be cache-cold, thus switching CPUs has the least effect | ||
1052 | - * on them. | ||
1053 | - */ | ||
1054 | - if (busiest->expired->nr_active) { | ||
1055 | - array = busiest->expired; | ||
1056 | - dst_array = this_rq->expired; | ||
1057 | - } else { | ||
1058 | - array = busiest->active; | ||
1059 | - dst_array = this_rq->active; | ||
1060 | - } | ||
1061 | - | ||
1062 | -new_array: | ||
1063 | /* Start searching at priority 0: */ | ||
1064 | idx = 0; | ||
1065 | skip_bitmap: | ||
1066 | if (!idx) | ||
1067 | - idx = sched_find_first_bit(array->bitmap); | ||
1068 | + idx = sched_find_first_bit(busiest->bitmap); | ||
1069 | else | ||
1070 | - idx = find_next_bit(array->bitmap, MAX_PRIO, idx); | ||
1071 | - if (idx >= MAX_PRIO) { | ||
1072 | - if (array == busiest->expired && busiest->active->nr_active) { | ||
1073 | - array = busiest->active; | ||
1074 | - dst_array = this_rq->active; | ||
1075 | - goto new_array; | ||
1076 | - } | ||
1077 | + idx = find_next_bit(busiest->bitmap, MAX_PRIO, idx); | ||
1078 | + if (idx >= MAX_PRIO) | ||
1079 | goto out; | ||
1080 | - } | ||
1081 | |||
1082 | - head = array->queue + idx; | ||
1083 | + head = busiest->queue + idx; | ||
1084 | curr = head->prev; | ||
1085 | skip_queue: | ||
1086 | tmp = list_entry(curr, task_t, run_list); | ||
1087 | @@ -1984,7 +1836,7 @@ skip_queue: | ||
1088 | schedstat_inc(sd, lb_hot_gained[idle]); | ||
1089 | #endif | ||
1090 | |||
1091 | - pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu); | ||
1092 | + pull_task(busiest, tmp, this_rq, this_cpu); | ||
1093 | pulled++; | ||
1094 | rem_load_move -= tmp->load_weight; | ||
1095 | |||
1096 | @@ -2507,15 +2359,13 @@ static void rebalance_tick(int this_cpu, | ||
1097 | continue; | ||
1098 | |||
1099 | interval = sd->balance_interval; | ||
1100 | - if (idle != SCHED_IDLE) | ||
1101 | - interval *= sd->busy_factor; | ||
1102 | |||
1103 | /* scale ms to jiffies */ | ||
1104 | interval = msecs_to_jiffies(interval); | ||
1105 | if (unlikely(!interval)) | ||
1106 | interval = 1; | ||
1107 | |||
1108 | - if (j - sd->last_balance >= interval) { | ||
1109 | + if (idle != SCHED_IDLE || j - sd->last_balance >= interval) { | ||
1110 | if (load_balance(this_cpu, this_rq, sd, idle)) { | ||
1111 | /* | ||
1112 | * We've pulled tasks over so either we're no | ||
1113 | @@ -2589,22 +2439,6 @@ unsigned long long current_sched_time(co | ||
1114 | } | ||
1115 | |||
1116 | /* | ||
1117 | - * We place interactive tasks back into the active array, if possible. | ||
1118 | - * | ||
1119 | - * To guarantee that this does not starve expired tasks we ignore the | ||
1120 | - * interactivity of a task if the first expired task had to wait more | ||
1121 | - * than a 'reasonable' amount of time. This deadline timeout is | ||
1122 | - * load-dependent, as the frequency of array switched decreases with | ||
1123 | - * increasing number of running tasks. We also ignore the interactivity | ||
1124 | - * if a better static_prio task has expired: | ||
1125 | - */ | ||
1126 | -#define EXPIRED_STARVING(rq) \ | ||
1127 | - ((STARVATION_LIMIT && ((rq)->expired_timestamp && \ | ||
1128 | - (jiffies - (rq)->expired_timestamp >= \ | ||
1129 | - STARVATION_LIMIT * ((rq)->nr_running) + 1))) || \ | ||
1130 | - ((rq)->curr->static_prio > (rq)->best_expired_prio)) | ||
1131 | - | ||
1132 | -/* | ||
1133 | * Account user cpu time to a process. | ||
1134 | * @p: the process that the cpu time gets accounted to | ||
1135 | * @hardirq_offset: the offset to subtract from hardirq_count() | ||
1136 | @@ -2652,6 +2486,7 @@ void account_system_time(struct task_str | ||
1137 | cpustat->iowait = cputime64_add(cpustat->iowait, tmp); | ||
1138 | else | ||
1139 | cpustat->idle = cputime64_add(cpustat->idle, tmp); | ||
1140 | + | ||
1141 | /* Account for system time used */ | ||
1142 | acct_update_integrals(p); | ||
1143 | } | ||
1144 | @@ -2677,18 +2512,25 @@ void account_steal_time(struct task_stru | ||
1145 | cpustat->steal = cputime64_add(cpustat->steal, tmp); | ||
1146 | } | ||
1147 | |||
1148 | +static void time_slice_expired(task_t *p, runqueue_t *rq) | ||
1149 | +{ | ||
1150 | + set_tsk_need_resched(p); | ||
1151 | + dequeue_task(p, rq); | ||
1152 | + p->prio = effective_prio(p); | ||
1153 | + p->time_slice = rr_interval(p); | ||
1154 | + enqueue_task(p, rq); | ||
1155 | +} | ||
1156 | + | ||
1157 | /* | ||
1158 | * This function gets called by the timer code, with HZ frequency. | ||
1159 | * We call it with interrupts disabled. | ||
1160 | - * | ||
1161 | - * It also gets called by the fork code, when changing the parent's | ||
1162 | - * timeslices. | ||
1163 | */ | ||
1164 | void scheduler_tick(void) | ||
1165 | { | ||
1166 | int cpu = smp_processor_id(); | ||
1167 | runqueue_t *rq = this_rq(); | ||
1168 | task_t *p = current; | ||
1169 | + unsigned long debit, expired_balance = rq->nr_running; | ||
1170 | unsigned long long now = sched_clock(); | ||
1171 | |||
1172 | update_cpu_clock(p, rq, now); | ||
1173 | @@ -2703,78 +2545,53 @@ void scheduler_tick(void) | ||
1174 | } | ||
1175 | |||
1176 | /* Task might have expired already, but not scheduled off yet */ | ||
1177 | - if (p->array != rq->active) { | ||
1178 | + if (unlikely(!task_queued(p))) { | ||
1179 | set_tsk_need_resched(p); | ||
1180 | goto out; | ||
1181 | } | ||
1182 | - spin_lock(&rq->lock); | ||
1183 | /* | ||
1184 | - * The task was running during this tick - update the | ||
1185 | - * time slice counter. Note: we do not update a thread's | ||
1186 | - * priority until it either goes to sleep or uses up its | ||
1187 | - * timeslice. This makes it possible for interactive tasks | ||
1188 | - * to use up their timeslices at their highest priority levels. | ||
1189 | + * SCHED_FIFO tasks never run out of timeslice. | ||
1190 | */ | ||
1191 | - if (rt_task(p)) { | ||
1192 | - /* | ||
1193 | - * RR tasks need a special form of timeslice management. | ||
1194 | - * FIFO tasks have no timeslices. | ||
1195 | - */ | ||
1196 | - if ((p->policy == SCHED_RR) && !--p->time_slice) { | ||
1197 | - p->time_slice = task_timeslice(p); | ||
1198 | - p->first_time_slice = 0; | ||
1199 | - set_tsk_need_resched(p); | ||
1200 | + if (unlikely(p->policy == SCHED_FIFO)) { | ||
1201 | + expired_balance = 0; | ||
1202 | + goto out; | ||
1203 | + } | ||
1204 | |||
1205 | - /* put it at the end of the queue: */ | ||
1206 | - requeue_task(p, rq->active); | ||
1207 | - } | ||
1208 | + spin_lock(&rq->lock); | ||
1209 | + debit = ns_diff(rq->timestamp_last_tick, p->timestamp); | ||
1210 | + p->ns_debit += debit; | ||
1211 | + if (p->ns_debit < NSJIFFY) | ||
1212 | + goto out_unlock; | ||
1213 | + p->ns_debit %= NSJIFFY; | ||
1214 | + /* | ||
1215 | + * Tasks lose bonus each time they use up a full slice(). | ||
1216 | + */ | ||
1217 | + if (!--p->slice) { | ||
1218 | + dec_bonus(p); | ||
1219 | + p->slice = slice(p); | ||
1220 | + time_slice_expired(p, rq); | ||
1221 | + p->totalrun = 0; | ||
1222 | goto out_unlock; | ||
1223 | } | ||
1224 | + /* | ||
1225 | + * Tasks that run out of time_slice but still have slice left get | ||
1226 | + * requeued with a lower priority && RR_INTERVAL time_slice. | ||
1227 | + */ | ||
1228 | if (!--p->time_slice) { | ||
1229 | - dequeue_task(p, rq->active); | ||
1230 | + time_slice_expired(p, rq); | ||
1231 | + goto out_unlock; | ||
1232 | + } | ||
1233 | + rq->cache_ticks++; | ||
1234 | + if (rq->preempted && rq->cache_ticks >= CACHE_DELAY) { | ||
1235 | set_tsk_need_resched(p); | ||
1236 | - p->prio = effective_prio(p); | ||
1237 | - p->time_slice = task_timeslice(p); | ||
1238 | - p->first_time_slice = 0; | ||
1239 | - | ||
1240 | - if (!rq->expired_timestamp) | ||
1241 | - rq->expired_timestamp = jiffies; | ||
1242 | - if (!TASK_INTERACTIVE(p) || EXPIRED_STARVING(rq)) { | ||
1243 | - enqueue_task(p, rq->expired); | ||
1244 | - if (p->static_prio < rq->best_expired_prio) | ||
1245 | - rq->best_expired_prio = p->static_prio; | ||
1246 | - } else | ||
1247 | - enqueue_task(p, rq->active); | ||
1248 | - } else { | ||
1249 | - /* | ||
1250 | - * Prevent a too long timeslice allowing a task to monopolize | ||
1251 | - * the CPU. We do this by splitting up the timeslice into | ||
1252 | - * smaller pieces. | ||
1253 | - * | ||
1254 | - * Note: this does not mean the task's timeslices expire or | ||
1255 | - * get lost in any way, they just might be preempted by | ||
1256 | - * another task of equal priority. (one with higher | ||
1257 | - * priority would have preempted this task already.) We | ||
1258 | - * requeue this task to the end of the list on this priority | ||
1259 | - * level, which is in essence a round-robin of tasks with | ||
1260 | - * equal priority. | ||
1261 | - * | ||
1262 | - * This only applies to tasks in the interactive | ||
1263 | - * delta range with at least TIMESLICE_GRANULARITY to requeue. | ||
1264 | - */ | ||
1265 | - if (TASK_INTERACTIVE(p) && !((task_timeslice(p) - | ||
1266 | - p->time_slice) % TIMESLICE_GRANULARITY(p)) && | ||
1267 | - (p->time_slice >= TIMESLICE_GRANULARITY(p)) && | ||
1268 | - (p->array == rq->active)) { | ||
1269 | - | ||
1270 | - requeue_task(p, rq->active); | ||
1271 | - set_tsk_need_resched(p); | ||
1272 | - } | ||
1273 | + goto out_unlock; | ||
1274 | } | ||
1275 | + expired_balance = 0; | ||
1276 | out_unlock: | ||
1277 | spin_unlock(&rq->lock); | ||
1278 | out: | ||
1279 | - rebalance_tick(cpu, rq, NOT_IDLE); | ||
1280 | + if (expired_balance > 1) | ||
1281 | + rebalance_tick(cpu, rq, NOT_IDLE); | ||
1282 | } | ||
1283 | |||
1284 | #ifdef CONFIG_SCHED_SMT | ||
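
The reworked scheduler_tick() above does three things: it converts accumulated nanosecond debit into whole jiffies before charging anything, decrements p->slice once per charged jiffy and drops the task a full stair (dec_bonus() plus a fresh slice) when the slice is exhausted, and requeues at the next stair each time time_slice runs out. A condensed single-task model of that flow (sketch, not part of the patch; jiffy-granular, nice-0 constants for HZ=1000):

/* tick_model.c - condensed model of the staircase scheduler_tick()
 * accounting path (userspace sketch; one task). */
#include <stdio.h>

struct t {
    unsigned int slice, time_slice, bonus;
};

/* one timer tick against the running task; returns 1 on requeue */
static int tick(struct t *p, unsigned int rr)
{
    if (!--p->slice) {          /* full slice used: drop a stair */
        if (p->bonus)
            p->bonus--;
        p->slice = 120;         /* slice(p) for nice 0, HZ=1000 */
        p->time_slice = rr;
        return 1;               /* time_slice_expired(): requeue */
    }
    if (!--p->time_slice) {     /* rr interval used: next stair */
        p->time_slice = rr;
        return 1;
    }
    return 0;
}

int main(void)
{
    struct t p = { .slice = 120, .time_slice = 6, .bonus = 5 };
    unsigned int j, requeues = 0;

    for (j = 0; j < 120; j++)
        requeues += tick(&p, 6);
    printf("requeues over one full slice: %u, bonus now %u\n",
           requeues, p.bonus);      /* 20 requeues, bonus 4 */
    return 0;
}
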
1285 | @@ -2831,19 +2648,19 @@ static void wake_sleeping_dependent(int | ||
1286 | |||
1287 | /* | ||
1288 | * number of 'lost' timeslices this task wont be able to fully | ||
1289 | - * utilize, if another task runs on a sibling. This models the | ||
1290 | + * utilise, if another task runs on a sibling. This models the | ||
1291 | * slowdown effect of other tasks running on siblings: | ||
1292 | */ | ||
1293 | -static inline unsigned long smt_slice(task_t *p, struct sched_domain *sd) | ||
1294 | +static inline unsigned long smt_slice(const task_t *p, | ||
1295 | + const struct sched_domain *sd) | ||
1296 | { | ||
1297 | - return p->time_slice * (100 - sd->per_cpu_gain) / 100; | ||
1298 | + return p->slice * (100 - sd->per_cpu_gain) / 100; | ||
1299 | } | ||
1300 | |||
1301 | static int dependent_sleeper(int this_cpu, runqueue_t *this_rq) | ||
1302 | { | ||
1303 | struct sched_domain *tmp, *sd = NULL; | ||
1304 | cpumask_t sibling_map; | ||
1305 | - prio_array_t *array; | ||
1306 | int ret = 0, i; | ||
1307 | task_t *p; | ||
1308 | |||
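
smt_slice() above now scales the task's full slice() rather than its remaining time_slice by the SMT per-CPU gain. With a per_cpu_gain of 25 (an assumption here - it is configured per sched-domain, 25 being the usual SMT default), a busy sibling is modelled as costing a quarter of the slice (sketch):

/* smt_slice arithmetic (userspace sketch, not part of the patch). */
#include <stdio.h>

static unsigned long smt_slice(unsigned long slice, int per_cpu_gain)
{
    return slice * (100 - per_cpu_gain) / 100;
}

int main(void)
{
    /* nice-0 slice of 120 jiffies, per_cpu_gain assumed 25 */
    printf("effective slice beside a busy sibling: %lu jiffies\n",
           smt_slice(120, 25));     /* -> 90 */
    return 0;
}
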
1309 | @@ -2870,12 +2687,8 @@ static int dependent_sleeper(int this_cp | ||
1310 | */ | ||
1311 | if (!this_rq->nr_running) | ||
1312 | goto out_unlock; | ||
1313 | - array = this_rq->active; | ||
1314 | - if (!array->nr_active) | ||
1315 | - array = this_rq->expired; | ||
1316 | - BUG_ON(!array->nr_active); | ||
1317 | |||
1318 | - p = list_entry(array->queue[sched_find_first_bit(array->bitmap)].next, | ||
1319 | + p = list_entry(this_rq->queue[sched_find_first_bit(this_rq->bitmap)].next, | ||
1320 | task_t, run_list); | ||
1321 | |||
1322 | for_each_cpu_mask(i, sibling_map) { | ||
1323 | @@ -2905,7 +2718,7 @@ static int dependent_sleeper(int this_cp | ||
1324 | } else | ||
1325 | if (smt_curr->static_prio < p->static_prio && | ||
1326 | !TASK_PREEMPTS_CURR(p, smt_rq) && | ||
1327 | - smt_slice(smt_curr, sd) > task_timeslice(p)) | ||
1328 | + smt_slice(smt_curr, sd) > slice(p)) | ||
1329 | ret = 1; | ||
1330 | |||
1331 | check_smt_task: | ||
1332 | @@ -2928,7 +2741,7 @@ check_smt_task: | ||
1333 | resched_task(smt_curr); | ||
1334 | } else { | ||
1335 | if (TASK_PREEMPTS_CURR(p, smt_rq) && | ||
1336 | - smt_slice(p, sd) > task_timeslice(smt_curr)) | ||
1337 | + smt_slice(p, sd) > slice(smt_curr)) | ||
1338 | resched_task(smt_curr); | ||
1339 | else | ||
1340 | wakeup_busy_runqueue(smt_rq); | ||
1341 | @@ -2990,11 +2803,10 @@ asmlinkage void __sched schedule(void) | ||
1342 | long *switch_count; | ||
1343 | task_t *prev, *next; | ||
1344 | runqueue_t *rq; | ||
1345 | - prio_array_t *array; | ||
1346 | struct list_head *queue; | ||
1347 | unsigned long long now; | ||
1348 | - unsigned long run_time; | ||
1349 | - int cpu, idx, new_prio; | ||
1350 | + unsigned long debit; | ||
1351 | + int cpu, idx; | ||
1352 | |||
1353 | /* | ||
1354 | * Test if we are atomic. Since do_exit() needs to call into | ||
1355 | @@ -3029,20 +2841,11 @@ need_resched_nonpreemptible: | ||
1356 | |||
1357 | schedstat_inc(rq, sched_cnt); | ||
1358 | now = sched_clock(); | ||
1359 | - if (likely((long long)(now - prev->timestamp) < NS_MAX_SLEEP_AVG)) { | ||
1360 | - run_time = now - prev->timestamp; | ||
1361 | - if (unlikely((long long)(now - prev->timestamp) < 0)) | ||
1362 | - run_time = 0; | ||
1363 | - } else | ||
1364 | - run_time = NS_MAX_SLEEP_AVG; | ||
1365 | - | ||
1366 | - /* | ||
1367 | - * Tasks charged proportionately less run_time at high sleep_avg to | ||
1368 | - * delay them losing their interactive status | ||
1369 | - */ | ||
1370 | - run_time /= (CURRENT_BONUS(prev) ? : 1); | ||
1371 | |||
1372 | spin_lock_irq(&rq->lock); | ||
1373 | + prev->runtime = ns_diff(now, prev->timestamp); | ||
1374 | + debit = ns_diff(now, rq->timestamp_last_tick) % NSJIFFY; | ||
1375 | + prev->ns_debit += debit; | ||
1376 | |||
1377 | if (unlikely(prev->flags & PF_DEAD)) | ||
1378 | prev->state = EXIT_DEAD; | ||
1379 | @@ -3054,8 +2857,10 @@ need_resched_nonpreemptible: | ||
1380 | unlikely(signal_pending(prev)))) | ||
1381 | prev->state = TASK_RUNNING; | ||
1382 | else { | ||
1383 | - if (prev->state == TASK_UNINTERRUPTIBLE) | ||
1384 | + if (prev->state == TASK_UNINTERRUPTIBLE) { | ||
1385 | + prev->flags |= PF_NONSLEEP; | ||
1386 | rq->nr_uninterruptible++; | ||
1387 | + } | ||
1388 | deactivate_task(prev, rq); | ||
1389 | } | ||
1390 | } | ||
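
In place of the sleep_avg-scaled run_time charging, schedule() now records prev's raw runtime and carries the sub-jiffy remainder since the last tick as ns_debit, so CPU time too short to be ticked is not lost. A self-contained sketch, assuming ns_diff() is a zero-clamped difference and NSJIFFY one jiffy in nanoseconds (both are defined earlier in this patch; HZ = 250 is an assumption):

    #include <stdio.h>

    #define NSJIFFY (1000000000ULL / 250)  /* assumed: ns per jiffy, HZ=250 */

    /* Assumed shape of ns_diff(): difference clamped at zero. */
    static unsigned long long ns_diff(unsigned long long a,
                                      unsigned long long b)
    {
            return a > b ? a - b : 0;
    }

    int main(void)
    {
            unsigned long long now = 123456789ULL, last_tick = 123000000ULL;
            /* the sub-jiffy remainder since the last tick becomes debit */
            unsigned long long debit = ns_diff(now, last_tick) % NSJIFFY;

            printf("debit carried forward: %llu ns\n", debit);
            return 0;
    }
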
1391 | @@ -3066,7 +2871,6 @@ go_idle: | ||
1392 | idle_balance(cpu, rq); | ||
1393 | if (!rq->nr_running) { | ||
1394 | next = rq->idle; | ||
1395 | - rq->expired_timestamp = 0; | ||
1396 | wake_sleeping_dependent(cpu, rq); | ||
1397 | /* | ||
1398 | * wake_sleeping_dependent() might have released | ||
1399 | @@ -3090,45 +2894,15 @@ go_idle: | ||
1400 | goto go_idle; | ||
1401 | } | ||
1402 | |||
1403 | - array = rq->active; | ||
1404 | - if (unlikely(!array->nr_active)) { | ||
1405 | - /* | ||
1406 | - * Switch the active and expired arrays. | ||
1407 | - */ | ||
1408 | - schedstat_inc(rq, sched_switch); | ||
1409 | - rq->active = rq->expired; | ||
1410 | - rq->expired = array; | ||
1411 | - array = rq->active; | ||
1412 | - rq->expired_timestamp = 0; | ||
1413 | - rq->best_expired_prio = MAX_PRIO; | ||
1414 | - } | ||
1415 | - | ||
1416 | - idx = sched_find_first_bit(array->bitmap); | ||
1417 | - queue = array->queue + idx; | ||
1418 | + idx = sched_find_first_bit(rq->bitmap); | ||
1419 | + queue = rq->queue + idx; | ||
1420 | next = list_entry(queue->next, task_t, run_list); | ||
1421 | |||
1422 | - if (!rt_task(next) && next->activated > 0) { | ||
1423 | - unsigned long long delta = now - next->timestamp; | ||
1424 | - if (unlikely((long long)(now - next->timestamp) < 0)) | ||
1425 | - delta = 0; | ||
1426 | - | ||
1427 | - if (next->activated == 1) | ||
1428 | - delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128; | ||
1429 | - | ||
1430 | - array = next->array; | ||
1431 | - new_prio = recalc_task_prio(next, next->timestamp + delta); | ||
1432 | - | ||
1433 | - if (unlikely(next->prio != new_prio)) { | ||
1434 | - dequeue_task(next, array); | ||
1435 | - next->prio = new_prio; | ||
1436 | - enqueue_task(next, array); | ||
1437 | - } else | ||
1438 | - requeue_task(next, array); | ||
1439 | - } | ||
1440 | - next->activated = 0; | ||
1441 | switch_tasks: | ||
1442 | if (next == rq->idle) | ||
1443 | schedstat_inc(rq, sched_goidle); | ||
1444 | + prev->timestamp = now; | ||
1445 | + | ||
1446 | prefetch(next); | ||
1447 | prefetch_stack(next); | ||
1448 | clear_tsk_need_resched(prev); | ||
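
With the active/expired pair gone there is no array switch to perform: picking the next task is a single find-first-bit over one bitmap. A self-contained model, with sched_find_first_bit() replaced by a plain scan for clarity and MAX_PRIO = 140 matching the kernel's priority range:

    #include <stdio.h>
    #include <string.h>

    #define MAX_PRIO 140
    #define BPL (8 * (int)sizeof(long))    /* bits per long */

    static unsigned long bitmap[(MAX_PRIO + 1 + 8 * sizeof(long) - 1) /
                                (8 * sizeof(long))];

    static void set_prio(int p)
    {
            bitmap[p / BPL] |= 1UL << (p % BPL);
    }

    /* Stand-in for sched_find_first_bit(): lowest set bit wins. */
    static int first_prio(void)
    {
            for (int p = 0; p <= MAX_PRIO; p++)
                    if (bitmap[p / BPL] & (1UL << (p % BPL)))
                            return p;
            return MAX_PRIO;
    }

    int main(void)
    {
            memset(bitmap, 0, sizeof(bitmap));
            set_prio(MAX_PRIO);     /* delimiter bit: empty queue -> idle */
            set_prio(115);          /* one queued task at priority 115 */
            printf("next: %d\n", first_prio());   /* prints 115 */
            return 0;
    }
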
1449 | @@ -3136,13 +2910,10 @@ switch_tasks: | ||
1450 | |||
1451 | update_cpu_clock(prev, rq, now); | ||
1452 | |||
1453 | - prev->sleep_avg -= run_time; | ||
1454 | - if ((long)prev->sleep_avg <= 0) | ||
1455 | - prev->sleep_avg = 0; | ||
1456 | - prev->timestamp = prev->last_ran = now; | ||
1457 | - | ||
1458 | sched_info_switch(prev, next); | ||
1459 | if (likely(prev != next)) { | ||
1460 | + rq->preempted = 0; | ||
1461 | + rq->cache_ticks = 0; | ||
1462 | next->timestamp = now; | ||
1463 | rq->nr_switches++; | ||
1464 | rq->curr = next; | ||
1465 | @@ -3572,9 +3343,8 @@ EXPORT_SYMBOL(sleep_on_timeout); | ||
1466 | void set_user_nice(task_t *p, long nice) | ||
1467 | { | ||
1468 | unsigned long flags; | ||
1469 | - prio_array_t *array; | ||
1470 | runqueue_t *rq; | ||
1471 | - int old_prio, new_prio, delta; | ||
1472 | + int queued, old_prio, new_prio, delta; | ||
1473 | |||
1474 | if (TASK_NICE(p) == nice || nice < -20 || nice > 19) | ||
1475 | return; | ||
1476 | @@ -3593,9 +3363,8 @@ void set_user_nice(task_t *p, long nice) | ||
1477 | p->static_prio = NICE_TO_PRIO(nice); | ||
1478 | goto out_unlock; | ||
1479 | } | ||
1480 | - array = p->array; | ||
1481 | - if (array) { | ||
1482 | - dequeue_task(p, array); | ||
1483 | + if ((queued = task_queued(p))) { | ||
1484 | + dequeue_task(p, rq); | ||
1485 | dec_raw_weighted_load(rq, p); | ||
1486 | } | ||
1487 | |||
1488 | @@ -3605,9 +3374,11 @@ void set_user_nice(task_t *p, long nice) | ||
1489 | p->static_prio = NICE_TO_PRIO(nice); | ||
1490 | set_load_weight(p); | ||
1491 | p->prio += delta; | ||
1492 | + if (p->bonus > bonus(p)) | ||
1493 | + p->bonus = bonus(p); | ||
1494 | |||
1495 | - if (array) { | ||
1496 | - enqueue_task(p, array); | ||
1497 | + if (queued) { | ||
1498 | + enqueue_task(p, rq); | ||
1499 | inc_raw_weighted_load(rq, p); | ||
1500 | /* | ||
1501 | * If the task increased its priority or is running and | ||
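
A subtlety in this hunk: besides requeueing, set_user_nice() now trims the task's accrued interactivity bonus so it never exceeds what bonus(p), defined earlier in the patch, permits at the new static priority. The clamp itself is trivial:

    #include <stdio.h>

    /* Illustrative clamp; max_at_prio stands in for the patch's bonus(p). */
    static unsigned int clamp_bonus(unsigned int cur, unsigned int max_at_prio)
    {
            return cur > max_at_prio ? max_at_prio : cur;
    }

    int main(void)
    {
            printf("%u\n", clamp_bonus(9, 5));  /* prints 5 */
            return 0;
    }
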
1502 | @@ -3731,19 +3502,13 @@ static inline task_t *find_process_by_pi | ||
1503 | /* Actually do priority change: must hold rq lock. */ | ||
1504 | static void __setscheduler(struct task_struct *p, int policy, int prio) | ||
1505 | { | ||
1506 | - BUG_ON(p->array); | ||
1507 | + BUG_ON(task_queued(p)); | ||
1508 | p->policy = policy; | ||
1509 | p->rt_priority = prio; | ||
1510 | if (policy != SCHED_NORMAL && policy != SCHED_BATCH) { | ||
1511 | p->prio = MAX_RT_PRIO-1 - p->rt_priority; | ||
1512 | - } else { | ||
1513 | + } else | ||
1514 | p->prio = p->static_prio; | ||
1515 | - /* | ||
1516 | - * SCHED_BATCH tasks are treated as perpetual CPU hogs: | ||
1517 | - */ | ||
1518 | - if (policy == SCHED_BATCH) | ||
1519 | - p->sleep_avg = 0; | ||
1520 | - } | ||
1521 | set_load_weight(p); | ||
1522 | } | ||
1523 | |||
1524 | @@ -3758,8 +3523,7 @@ int sched_setscheduler(struct task_struc | ||
1525 | struct sched_param *param) | ||
1526 | { | ||
1527 | int retval; | ||
1528 | - int oldprio, oldpolicy = -1; | ||
1529 | - prio_array_t *array; | ||
1530 | + int queued, oldprio, oldpolicy = -1; | ||
1531 | unsigned long flags; | ||
1532 | runqueue_t *rq; | ||
1533 | |||
1534 | @@ -3821,12 +3585,11 @@ recheck: | ||
1535 | task_rq_unlock(rq, &flags); | ||
1536 | goto recheck; | ||
1537 | } | ||
1538 | - array = p->array; | ||
1539 | - if (array) | ||
1540 | + if ((queued = task_queued(p))) | ||
1541 | deactivate_task(p, rq); | ||
1542 | oldprio = p->prio; | ||
1543 | __setscheduler(p, policy, param->sched_priority); | ||
1544 | - if (array) { | ||
1545 | + if (queued) { | ||
1546 | __activate_task(p, rq); | ||
1547 | /* | ||
1548 | * Reschedule if we are currently running on this runqueue and | ||
1549 | @@ -3836,8 +3599,8 @@ recheck: | ||
1550 | if (task_running(rq, p)) { | ||
1551 | if (p->prio > oldprio) | ||
1552 | resched_task(rq->curr); | ||
1553 | - } else if (TASK_PREEMPTS_CURR(p, rq)) | ||
1554 | - resched_task(rq->curr); | ||
1555 | + } else | ||
1556 | + preempt(p, rq); | ||
1557 | } | ||
1558 | task_rq_unlock(rq, &flags); | ||
1559 | return 0; | ||
1560 | @@ -4094,43 +3857,27 @@ asmlinkage long sys_sched_getaffinity(pi | ||
1561 | |||
1562 | /** | ||
1563 | * sys_sched_yield - yield the current processor to other threads. | ||
1564 | - * | ||
1565 | - * this function yields the current CPU by moving the calling thread | ||
1566 | - * to the expired array. If there are no other threads running on this | ||
1567 | - * CPU then this function will return. | ||
1568 | + * This function yields the current CPU by dropping the priority of the | ||
1569 | + * calling task to the lowest (non-RT) priority. | ||
1570 | */ | ||
1571 | asmlinkage long sys_sched_yield(void) | ||
1572 | { | ||
1573 | + int newprio; | ||
1574 | runqueue_t *rq = this_rq_lock(); | ||
1575 | - prio_array_t *array = current->array; | ||
1576 | - prio_array_t *target = rq->expired; | ||
1577 | |||
1578 | + newprio = current->prio; | ||
1579 | schedstat_inc(rq, yld_cnt); | ||
1580 | - /* | ||
1581 | - * We implement yielding by moving the task into the expired | ||
1582 | - * queue. | ||
1583 | - * | ||
1584 | - * (special rule: RT tasks will just roundrobin in the active | ||
1585 | - * array.) | ||
1586 | - */ | ||
1587 | - if (rt_task(current)) | ||
1588 | - target = rq->active; | ||
1589 | - | ||
1590 | - if (array->nr_active == 1) { | ||
1591 | - schedstat_inc(rq, yld_act_empty); | ||
1592 | - if (!rq->expired->nr_active) | ||
1593 | - schedstat_inc(rq, yld_both_empty); | ||
1594 | - } else if (!rq->expired->nr_active) | ||
1595 | - schedstat_inc(rq, yld_exp_empty); | ||
1596 | - | ||
1597 | - if (array != target) { | ||
1598 | - dequeue_task(current, array); | ||
1599 | - enqueue_task(current, target); | ||
1600 | + current->slice = slice(current); | ||
1601 | + current->time_slice = rr_interval(current); | ||
1602 | + if (likely(!rt_task(current))) | ||
1603 | + newprio = MAX_PRIO - 1; | ||
1604 | + | ||
1605 | + if (newprio != current->prio) { | ||
1606 | + dequeue_task(current, rq); | ||
1607 | + current->prio = newprio; | ||
1608 | + enqueue_task(current, rq); | ||
1609 | } else | ||
1610 | - /* | ||
1611 | - * requeue_task is cheaper so perform that if possible. | ||
1612 | - */ | ||
1613 | - requeue_task(current, array); | ||
1614 | + requeue_task(current, rq); | ||
1615 | |||
1616 | /* | ||
1617 | * Since we are going to call schedule() anyway, there's | ||
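
The rewritten yield path refreshes both slice counters and, for non-RT tasks, requeues the caller at MAX_PRIO - 1, the lowest SCHED_NORMAL priority; RT tasks keep their priority and simply move behind their peers. A small model of the priority choice (the constants match the kernel's priority layout):

    #include <stdio.h>

    #define MAX_RT_PRIO 100
    #define MAX_PRIO 140

    /* Model of the yield decision: RT tasks round-robin in place,
     * everything else drops to the bottom of the single queue. */
    static int yield_prio(int prio)
    {
            return prio < MAX_RT_PRIO ? prio : MAX_PRIO - 1;
    }

    int main(void)
    {
            printf("RT 50 -> %d, nice-0 120 -> %d\n",
                   yield_prio(50), yield_prio(120));   /* 50, 139 */
            return 0;
    }
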
1618 | @@ -4339,7 +4086,7 @@ long sys_sched_rr_get_interval(pid_t pid | ||
1619 | goto out_unlock; | ||
1620 | |||
1621 | jiffies_to_timespec(p->policy & SCHED_FIFO ? | ||
1622 | - 0 : task_timeslice(p), &t); | ||
1623 | + 0 : slice(p), &t); | ||
1624 | read_unlock(&tasklist_lock); | ||
1625 | retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; | ||
1626 | out_nounlock: | ||
1627 | @@ -4462,8 +4209,6 @@ void __devinit init_idle(task_t *idle, i | ||
1628 | unsigned long flags; | ||
1629 | |||
1630 | idle->timestamp = sched_clock(); | ||
1631 | - idle->sleep_avg = 0; | ||
1632 | - idle->array = NULL; | ||
1633 | idle->prio = MAX_PRIO; | ||
1634 | idle->state = TASK_RUNNING; | ||
1635 | idle->cpus_allowed = cpumask_of_cpu(cpu); | ||
1636 | @@ -4580,7 +4325,7 @@ static void __migrate_task(struct task_s | ||
1637 | goto out; | ||
1638 | |||
1639 | set_task_cpu(p, dest_cpu); | ||
1640 | - if (p->array) { | ||
1641 | + if (task_queued(p)) { | ||
1642 | /* | ||
1643 | * Sync timestamp with rq_dest's before activating. | ||
1644 | * The same thing could be achieved by doing this step | ||
1645 | @@ -4591,8 +4336,7 @@ static void __migrate_task(struct task_s | ||
1646 | + rq_dest->timestamp_last_tick; | ||
1647 | deactivate_task(p, rq_src); | ||
1648 | activate_task(p, rq_dest, 0); | ||
1649 | - if (TASK_PREEMPTS_CURR(p, rq_dest)) | ||
1650 | - resched_task(rq_dest->curr); | ||
1651 | + preempt(p, rq_dest); | ||
1652 | } | ||
1653 | |||
1654 | out: | ||
1655 | @@ -4806,7 +4550,7 @@ static void migrate_dead_tasks(unsigned | ||
1656 | |||
1657 | for (arr = 0; arr < 2; arr++) { | ||
1658 | for (i = 0; i < MAX_PRIO; i++) { | ||
1659 | - struct list_head *list = &rq->arrays[arr].queue[i]; | ||
1660 | + struct list_head *list = &rq->queue[i]; | ||
1661 | while (!list_empty(list)) | ||
1662 | migrate_dead(dead_cpu, | ||
1663 | list_entry(list->next, task_t, | ||
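
Note that the surrounding for (arr = 0; arr < 2; arr++) loop survives this conversion even though both iterations now walk the same single rq->queue; the second pass only re-scans lists the first has already drained, so it is harmless but redundant. A single pass would suffice, roughly as in this hypothetical kernel-style sketch (not part of the patch):

    for (i = 0; i < MAX_PRIO; i++) {
            struct list_head *list = &rq->queue[i];

            while (!list_empty(list))
                    migrate_dead(dead_cpu, list_entry(list->next,
                                                      task_t, run_list));
    }
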
1664 | @@ -6148,17 +5892,15 @@ int in_sched_functions(unsigned long add | ||
1665 | void __init sched_init(void) | ||
1666 | { | ||
1667 | runqueue_t *rq; | ||
1668 | - int i, j, k; | ||
1669 | + int i, j; | ||
1670 | |||
1671 | for_each_cpu(i) { | ||
1672 | - prio_array_t *array; | ||
1673 | |||
1674 | rq = cpu_rq(i); | ||
1675 | spin_lock_init(&rq->lock); | ||
1676 | rq->nr_running = 0; | ||
1677 | - rq->active = rq->arrays; | ||
1678 | - rq->expired = rq->arrays + 1; | ||
1679 | - rq->best_expired_prio = MAX_PRIO; | ||
1680 | + rq->cache_ticks = 0; | ||
1681 | + rq->preempted = 0; | ||
1682 | |||
1683 | #ifdef CONFIG_SMP | ||
1684 | rq->sd = NULL; | ||
1685 | @@ -6170,16 +5912,13 @@ void __init sched_init(void) | ||
1686 | INIT_LIST_HEAD(&rq->migration_queue); | ||
1687 | #endif | ||
1688 | atomic_set(&rq->nr_iowait, 0); | ||
1689 | - | ||
1690 | - for (j = 0; j < 2; j++) { | ||
1691 | - array = rq->arrays + j; | ||
1692 | - for (k = 0; k < MAX_PRIO; k++) { | ||
1693 | - INIT_LIST_HEAD(array->queue + k); | ||
1694 | - __clear_bit(k, array->bitmap); | ||
1695 | - } | ||
1696 | - // delimiter for bitsearch | ||
1697 | - __set_bit(MAX_PRIO, array->bitmap); | ||
1698 | - } | ||
1699 | + for (j = 0; j < MAX_PRIO; j++) | ||
1700 | + INIT_LIST_HEAD(&rq->queue[j]); | ||
1701 | + memset(rq->bitmap, 0, BITS_TO_LONGS(MAX_PRIO)*sizeof(long)); | ||
1702 | + /* | ||
1703 | + * delimiter for bitsearch | ||
1704 | + */ | ||
1705 | + __set_bit(MAX_PRIO, rq->bitmap); | ||
1706 | } | ||
1707 | |||
1708 | set_load_weight(&init_task); | ||
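
Runqueue setup is correspondingly simpler: one list head per priority, one zeroed bitmap, and a permanently set bit at MAX_PRIO as the search delimiter, so an empty queue resolves to MAX_PRIO, which only the idle task occupies. For reference, the bitmap sizing (BITS_TO_LONGS as in the kernel):

    #include <stdio.h>

    #define MAX_PRIO 140
    #define BITS_TO_LONGS(n) \
            (((n) + 8 * sizeof(long) - 1) / (8 * sizeof(long)))

    int main(void)
    {
            /* 141 bits (0..MAX_PRIO, delimiter included) fit in three
             * longs on a 64-bit machine, five on a 32-bit one. */
            printf("%zu longs\n", BITS_TO_LONGS(MAX_PRIO + 1));
            return 0;
    }
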
1709 | @@ -6224,9 +5963,9 @@ EXPORT_SYMBOL(__might_sleep); | ||
1710 | void normalize_rt_tasks(void) | ||
1711 | { | ||
1712 | struct task_struct *p; | ||
1713 | - prio_array_t *array; | ||
1714 | unsigned long flags; | ||
1715 | runqueue_t *rq; | ||
1716 | + int queued; | ||
1717 | |||
1718 | read_lock_irq(&tasklist_lock); | ||
1719 | for_each_process (p) { | ||
1720 | @@ -6235,11 +5974,10 @@ void normalize_rt_tasks(void) | ||
1721 | |||
1722 | rq = task_rq_lock(p, &flags); | ||
1723 | |||
1724 | - array = p->array; | ||
1725 | - if (array) | ||
1726 | + if ((queued = task_queued(p))) | ||
1727 | deactivate_task(p, task_rq(p)); | ||
1728 | __setscheduler(p, SCHED_NORMAL, 0); | ||
1729 | - if (array) { | ||
1730 | + if (queued) { | ||
1731 | __activate_task(p, task_rq(p)); | ||
1732 | resched_task(rq->curr); | ||
1733 | } | ||
1734 | Index: linux-2.6.16-ck1/kernel/sysctl.c | ||
1735 | =================================================================== | ||
1736 | --- linux-2.6.16-ck1.orig/kernel/sysctl.c 2006-03-20 20:46:26.000000000 +1100 | ||
1737 | +++ linux-2.6.16-ck1/kernel/sysctl.c 2006-03-20 20:46:48.000000000 +1100 | ||
1738 | @@ -623,6 +623,22 @@ static ctl_table kern_table[] = { | ||
1739 | .mode = 0444, | ||
1740 | .proc_handler = &proc_dointvec, | ||
1741 | }, | ||
1742 | + { | ||
1743 | + .ctl_name = KERN_INTERACTIVE, | ||
1744 | + .procname = "interactive", | ||
1745 | + .data = &sched_interactive, | ||
1746 | + .maxlen = sizeof (int), | ||
1747 | + .mode = 0644, | ||
1748 | + .proc_handler = &proc_dointvec, | ||
1749 | + }, | ||
1750 | + { | ||
1751 | + .ctl_name = KERN_COMPUTE, | ||
1752 | + .procname = "compute", | ||
1753 | + .data = &sched_compute, | ||
1754 | + .maxlen = sizeof (int), | ||
1755 | + .mode = 0644, | ||
1756 | + .proc_handler = &proc_dointvec, | ||
1757 | + }, | ||
1758 | #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) | ||
1759 | { | ||
1760 | .ctl_name = KERN_UNKNOWN_NMI_PANIC, |
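
Once built in, the two staircase knobs surface as /proc/sys/kernel/interactive and /proc/sys/kernel/compute and can be flipped at runtime, e.g. sysctl -w kernel.compute=1 (or echo 1 > /proc/sys/kernel/compute) to trade latency for throughput, and sysctl -w kernel.interactive=0 to disable the interactivity estimator. In the -ck documentation interactive defaults to on and compute to off; treat those defaults as assumptions here rather than something this hunk shows.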