Annotation of /trunk/kernel26-magellan/patches-2.6.16-r12/0001-2.6.16-sched-implement-smpnice.patch
Revision 72
Mon Jun 5 09:25:38 2006 UTC (18 years, 3 months ago) by niro
File size: 22925 byte(s)
ver bump to 2.6.16-r12: - updated to linux-2.6.16.19 - updated to ck11

From: Peter Williams <pwil3058@bigpond.net.au>

Problem:

The introduction of separate run queues per CPU has brought with it "nice"
enforcement problems that are best described by a simple example.

For the sake of argument, suppose that on a single CPU machine with a
nice==19 hard spinner and a nice==0 hard spinner running, the nice==0 task
gets 95% of the CPU and the nice==19 task gets 5% of the CPU. Now suppose
that there is a system with 2 CPUs, 2 nice==19 hard spinners and 2 nice==0
hard spinners running. The user of this system would be entitled to expect
that the nice==0 tasks each get 95% of a CPU and the nice==19 tasks only
get 5% each. However, whether this expectation is met is pretty much down
to luck, as there are four equally likely distributions of the tasks to the
CPUs that the load balancing code will consider to be balanced, with loads
of 2.0 for each CPU. Two of these distributions put one nice==0 and one
nice==19 task on each CPU, and in these circumstances the user's
expectations will be met. The other two put both nice==0 tasks on one CPU
and both nice==19 tasks on the other, in which case each task gets 50% of a
CPU and the user's expectations will not be met.

Solution:

The solution to this problem that is implemented in the attached patch is
to use weighted loads when determining if the system is balanced and, when
an imbalance is detected, to move an amount of weighted load between run
queues (as opposed to a number of tasks) to restore the balance. Once
again, the easiest way to explain why both of these measures are necessary
is to use a simple example. Suppose (in a slight variation of the above
example) that we have a two CPU system with 4 nice==0 and 4 nice==19 hard
spinning tasks running, and that the 4 nice==0 tasks are on one CPU and the
4 nice==19 tasks are on the other CPU. The weighted loads for the two CPUs
would be 4.0 and 0.2 respectively, and the load balancing code would move 2
tasks, resulting in one CPU with a load of 2.0 and the other with a load of
2.2. If this were considered a big enough imbalance to justify moving a
task, and that task were moved using the current move_tasks(), then it
would move the highest priority task that it found, resulting in one CPU
with a load of 3.0 and the other with a load of 1.2; that would in turn
cause a task to be moved in the opposite direction, and so on -- an
infinite loop. If, on the other hand, an amount of load to be moved is
calculated from the imbalance (in this case 0.1) and move_tasks() skips
tasks until it finds ones whose contributions to the weighted load are no
larger than this amount, it would move two of the nice==19 tasks, resulting
in a system with 2 nice==0 and 2 nice==19 tasks on each CPU and loads of
2.1 for each CPU.
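
The arithmetic of this example can be sketched in a small standalone
program (illustrative only, not kernel code; the 20:1 weight ratio between
a nice==0 and a nice==19 task is taken from the weighting described in the
notes below, and the loop stands in for the new move_tasks() behaviour):

/*
 * Illustrative sketch of the example above; not kernel code.  Loads are
 * kept in units of 1/20 of a nice==0 task, so a nice==0 spinner weighs 20
 * and a nice==19 spinner weighs 1 (the 20:1 ratio the patch uses for
 * these two nice levels).
 */
#include <stdio.h>

#define W_NICE0  20
#define W_NICE19 1

static void show(const char *what, int cpu0, int cpu1)
{
        printf("%-22s cpu0=%.2f cpu1=%.2f\n", what, cpu0 / 20.0, cpu1 / 20.0);
}

int main(void)
{
        /* Start: 4 nice==0 spinners on CPU0, 4 nice==19 spinners on CPU1. */
        int cpu0 = 4 * W_NICE0;         /* weighted load 4.0 */
        int cpu1 = 4 * W_NICE19;        /* weighted load 0.2 */
        int rem_load_move, moved = 0;

        show("start:", cpu0, cpu1);

        /* Moving two tasks by count alone leaves 2.0 vs 2.2 ... */
        cpu0 -= 2 * W_NICE0;
        cpu1 += 2 * W_NICE0;
        show("after moving 2 tasks:", cpu0, cpu1);

        /*
         * ... and the old move_tasks() would now pull the highest priority
         * task (weight 1.0) straight back, giving 3.0 vs 1.2 and an endless
         * ping-pong.  With a weighted imbalance (0.1 here) move_tasks()
         * skips any task whose load_weight exceeds what is left to move, so
         * it pulls two nice==19 tasks instead and both CPUs settle at 2.1.
         */
        rem_load_move = (cpu1 - cpu0) / 2;
        while (rem_load_move >= W_NICE19) {
                cpu1 -= W_NICE19;
                cpu0 += W_NICE19;
                rem_load_move -= W_NICE19;
                moved++;
        }
        show("after weighted move:", cpu0, cpu1);
        printf("nice==19 tasks moved: %d\n", moved);
        return 0;
}

Compiled with any C compiler, it prints the 4.0/0.2, 2.0/2.2 and 2.1/2.1
figures quoted above.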

One of the advantages of this mechanism is that, on a system where all
tasks have nice==0, the load balancing calculations are mathematically
identical to those of the current load balancing code.

Notes:

struct task_struct:

has a new field, load_weight, which (in a trade-off of space for speed)
stores the contribution that this task makes to a CPU's weighted load when
it is runnable.
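
That contribution is derived from the task's nice value (or real-time
priority) by the NICE_TO_LOAD_PRIO()/LOAD_WEIGHT() macros added in the
kernel/sched.c hunk below. As a rough illustration, the same macros can be
exercised in a standalone program; SCHED_LOAD_SCALE is assumed to be 128
here (its 2.6.16 value), so a nice==0 task contributes exactly
SCHED_LOAD_SCALE:

/*
 * Standalone illustration of the nice -> load_weight mapping added by this
 * patch (the macros are copied from the kernel/sched.c hunk below).
 * SCHED_LOAD_SCALE is assumed to be 128, its value in 2.6.16.
 */
#include <stdio.h>

#define SCHED_LOAD_SCALE        128UL

#define NICE_TO_LOAD_PRIO(nice) \
        ((nice >= 0) ? (20 - (nice)) : (20 + (nice) * (nice)))
#define LOAD_WEIGHT(lp) \
        (((lp) * SCHED_LOAD_SCALE) / NICE_TO_LOAD_PRIO(0))
#define NICE_TO_LOAD_WEIGHT(nice) LOAD_WEIGHT(NICE_TO_LOAD_PRIO(nice))

int main(void)
{
        int nice;

        for (nice = -20; nice <= 19; nice += 13)
                printf("nice %3d -> load_weight %4lu (%.2f x a nice==0 task)\n",
                       nice, NICE_TO_LOAD_WEIGHT(nice),
                       (double)NICE_TO_LOAD_WEIGHT(nice) / NICE_TO_LOAD_WEIGHT(0));
        return 0;
}

Note that the integer division makes a nice==19 task contribute 6/128 of a
nice==0 task, close to the 1/20 ratio used in the examples above.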

struct runqueue:

has a new field, raw_weighted_load, which is the sum of the load_weight
values for the currently runnable tasks on this run queue. This field
always needs to be updated when nr_running is updated, so two new inline
functions, inc_nr_running() and dec_nr_running(), have been created to make
sure that this happens. This also offers a convenient way to optimize away
this part of the smpnice mechanism when CONFIG_SMP is not defined.

int try_to_wake_up():

in this function the value SCHED_LOAD_SCALE is used to represent the load
contribution of a single task in various calculations in the code that
decides which CPU to put the waking task on. While this would be valid on
a system where the nice values for the runnable tasks were distributed
evenly around zero, it will lead to anomalous load balancing if the
distribution is skewed in either direction. To overcome this problem
SCHED_LOAD_SCALE has been replaced by the load_weight for the relevant task
or by the average load_weight per task for the queue in question (as
appropriate).

int move_tasks():

The modifications to this function were complicated by the fact that
active_load_balance() uses it to move exactly one task without checking
whether an imbalance actually exists. This precluded the simple
overloading of max_nr_move with max_load_move and necessitated the addition
of the latter as an extra argument to the function. The internal
implementation is then modified to move up to max_nr_move tasks and
max_load_move of weighted load. This slightly complicates the code where
move_tasks() is called and, if active_load_balance() is ever changed to not
use move_tasks(), the implementation of move_tasks() should be simplified
accordingly.

struct sched_group *find_busiest_group():

Similar to try_to_wake_up(), there are places in this function where
SCHED_LOAD_SCALE is used to represent the load contribution of a single
task and the same issues arise. A similar solution is adopted, except that
it is now the average per-task contribution to a group's load (as opposed
to a run queue's) that is required. As this value is not directly
available from the group, it is calculated on the fly as the queues in the
group are visited when determining the busiest group.

A key change to this function is that it is no longer necessary to scale
down *imbalance on exit, as move_tasks() now uses the load in its scaled
form.
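
A simplified sketch of that on-the-fly averaging follows (the made-up
arrays stand in for the group's run queues and are not kernel data
structures; the threshold check mirrors the "bump *imbalance" reasoning in
the hunk below):

/*
 * Simplified sketch of how find_busiest_group() derives the average
 * per-task weighted load of a group while walking the group's CPUs.
 */
#include <stdio.h>

int main(void)
{
        /* raw_weighted_load and nr_running for each CPU in the busiest group */
        unsigned long raw_weighted_load[] = { 256, 134, 390 };
        unsigned long nr_running[]        = {   2,   3,   4 };
        unsigned long sum_weighted_load = 0, sum_nr_running = 0;
        unsigned long busiest_load_per_task, imbalance = 70;
        int i;

        for (i = 0; i < 3; i++) {       /* "as the queues are visited" */
                sum_weighted_load += raw_weighted_load[i];
                sum_nr_running += nr_running[i];
        }
        busiest_load_per_task = sum_weighted_load / sum_nr_running;

        printf("average per-task load in group: %lu\n", busiest_load_per_task);

        /*
         * If the computed imbalance is smaller than this average there is
         * no guarantee that any task will fit under it, so the new code
         * considers bumping *imbalance up to busiest_load_per_task to force
         * at least one task to move.
         */
        if (imbalance < busiest_load_per_task)
                printf("imbalance %lu < %lu: may be bumped so one task can move\n",
                       imbalance, busiest_load_per_task);
        return 0;
}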

void set_user_nice():

has been modified to update the task's load_weight field when its nice
value changes and also to ensure that its run queue's raw_weighted_load
field is updated if the task is runnable.

Signed-off-by: Peter Williams <pwil3058@bigpond.com.au>
Cc: "Siddha, Suresh B" <suresh.b.siddha@intel.com>
Cc: "Chen, Kenneth W" <kenneth.w.chen@intel.com>
Acked-by: Ingo Molnar <mingo@elte.hu>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Con Kolivas <kernel@kolivas.org>
Cc: John Hawkes <hawkes@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>

 include/linux/sched.h |    3
 kernel/sched.c        |  230 +++++++++++++++++++++++++++++++++++++++-----------
 2 files changed, 183 insertions(+), 50 deletions(-)

Index: linux-2.6.16-ck1/include/linux/sched.h
===================================================================
--- linux-2.6.16-ck1.orig/include/linux/sched.h 2006-03-20 20:46:27.000000000 +1100
+++ linux-2.6.16-ck1/include/linux/sched.h 2006-03-20 20:46:44.000000000 +1100
@@ -702,6 +702,9 @@ struct task_struct {
 int oncpu;
 #endif
 int prio, static_prio;
+#ifdef CONFIG_SMP
+ int load_weight; /* for load balancing purposes */
+#endif
 struct list_head run_list;
 prio_array_t *array;

Index: linux-2.6.16-ck1/kernel/sched.c
===================================================================
--- linux-2.6.16-ck1.orig/kernel/sched.c 2006-03-20 20:46:27.000000000 +1100
+++ linux-2.6.16-ck1/kernel/sched.c 2006-03-20 20:46:44.000000000 +1100
@@ -208,6 +208,7 @@ struct runqueue {
 */
 unsigned long nr_running;
 #ifdef CONFIG_SMP
+ unsigned long raw_weighted_load;
 unsigned long cpu_load[3];
 #endif
 unsigned long long nr_switches;
@@ -661,13 +662,85 @@ static int effective_prio(task_t *p)
 return prio;
 }

+#ifdef CONFIG_SMP
+/*
+ * To aid in avoiding the subversion of "niceness" due to uneven distribution
+ * of tasks with abnormal "nice" values across CPUs the contribution that
+ * each task makes to its run queue's load is weighted according to its
+ * scheduling class and "nice" value.
+ */
+
+/*
+ * Priority weight for load balancing ranges from 1/20 (nice==19) to 459/20 (RT
+ * priority of 100).
+ */
+#define NICE_TO_LOAD_PRIO(nice) \
+ ((nice >= 0) ? (20 - (nice)) : (20 + (nice) * (nice)))
+#define LOAD_WEIGHT(lp) \
+ (((lp) * SCHED_LOAD_SCALE) / NICE_TO_LOAD_PRIO(0))
+#define NICE_TO_LOAD_WEIGHT(nice) LOAD_WEIGHT(NICE_TO_LOAD_PRIO(nice))
+#define PRIO_TO_LOAD_WEIGHT(prio) NICE_TO_LOAD_WEIGHT(PRIO_TO_NICE(prio))
+#define RTPRIO_TO_LOAD_WEIGHT(rp) \
+ LOAD_WEIGHT(NICE_TO_LOAD_PRIO(-20) + (rp))
+
+static inline void set_load_weight(task_t *p)
+{
+ if (rt_task(p)) {
+ if (p == task_rq(p)->migration_thread)
+ /*
+ * The migration thread does the actual balancing.
+ * Giving its load any weight will skew balancing
+ * adversely.
+ */
+ p->load_weight = 0;
+ else
+ p->load_weight = RTPRIO_TO_LOAD_WEIGHT(p->rt_priority);
+ } else
+ p->load_weight = PRIO_TO_LOAD_WEIGHT(p->static_prio);
+}
+
+static inline void inc_raw_weighted_load(runqueue_t *rq, const task_t *p)
+{
+ rq->raw_weighted_load += p->load_weight;
+}
+
+static inline void dec_raw_weighted_load(runqueue_t *rq, const task_t *p)
+{
+ rq->raw_weighted_load -= p->load_weight;
+}
+#else
+static inline void set_load_weight(task_t *p)
+{
+}
+
+static inline void inc_raw_weighted_load(runqueue_t *rq, const task_t *p)
+{
+}
+
+static inline void dec_raw_weighted_load(runqueue_t *rq, const task_t *p)
+{
+}
+#endif
+
+static inline void inc_nr_running(task_t *p, runqueue_t *rq)
+{
+ rq->nr_running++;
+ inc_raw_weighted_load(rq, p);
+}
+
+static inline void dec_nr_running(task_t *p, runqueue_t *rq)
+{
+ rq->nr_running--;
+ dec_raw_weighted_load(rq, p);
+}
+
 /*
 * __activate_task - move a task to the runqueue.
 */
 static inline void __activate_task(task_t *p, runqueue_t *rq)
 {
 enqueue_task(p, rq->active);
- rq->nr_running++;
+ inc_nr_running(p, rq);
 }

 /*
@@ -676,7 +749,7 @@ static inline void __activate_task(task_
 static inline void __activate_idle_task(task_t *p, runqueue_t *rq)
 {
 enqueue_task_head(p, rq->active);
- rq->nr_running++;
+ inc_nr_running(p, rq);
 }

 static int recalc_task_prio(task_t *p, unsigned long long now)
@@ -800,7 +873,7 @@ static void activate_task(task_t *p, run
 */
 static void deactivate_task(struct task_struct *p, runqueue_t *rq)
 {
- rq->nr_running--;
+ dec_nr_running(p, rq);
 dequeue_task(p, p->array);
 p->array = NULL;
 }
@@ -939,7 +1012,8 @@ void kick_process(task_t *p)
 }

 /*
- * Return a low guess at the load of a migration-source cpu.
+ * Return a low guess at the load of a migration-source cpu weighted
+ * according to the scheduling class and "nice" value.
 *
 * We want to under-estimate the load of migration sources, to
 * balance conservatively.
@@ -947,24 +1021,36 @@ void kick_process(task_t *p)
 static inline unsigned long source_load(int cpu, int type)
 {
 runqueue_t *rq = cpu_rq(cpu);
- unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE;
+
 if (type == 0)
- return load_now;
+ return rq->raw_weighted_load;

- return min(rq->cpu_load[type-1], load_now);
+ return min(rq->cpu_load[type-1], rq->raw_weighted_load);
 }

 /*
- * Return a high guess at the load of a migration-target cpu
+ * Return a high guess at the load of a migration-target cpu weighted
+ * according to the scheduling class and "nice" value.
 */
 static inline unsigned long target_load(int cpu, int type)
 {
 runqueue_t *rq = cpu_rq(cpu);
- unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE;
+
 if (type == 0)
- return load_now;
+ return rq->raw_weighted_load;

- return max(rq->cpu_load[type-1], load_now);
+ return max(rq->cpu_load[type-1], rq->raw_weighted_load);
+}
+
+/*
+ * Return the average load per task on the cpu's run queue
+ */
+static inline unsigned long cpu_avg_load_per_task(int cpu)
+{
+ runqueue_t *rq = cpu_rq(cpu);
+ unsigned long n = rq->nr_running;
+
+ return n ? rq->raw_weighted_load / n : rq->raw_weighted_load;
 }

 /*
@@ -1216,17 +1302,19 @@ static int try_to_wake_up(task_t *p, uns

 if (this_sd->flags & SD_WAKE_AFFINE) {
 unsigned long tl = this_load;
+ unsigned long tl_per_task = cpu_avg_load_per_task(this_cpu);
+
 /*
 * If sync wakeup then subtract the (maximum possible)
 * effect of the currently running task from the load
 * of the current CPU:
 */
 if (sync)
- tl -= SCHED_LOAD_SCALE;
+ tl -= current->load_weight;

 if ((tl <= load &&
- tl + target_load(cpu, idx) <= SCHED_LOAD_SCALE) ||
- 100*(tl + SCHED_LOAD_SCALE) <= imbalance*load) {
+ tl + target_load(cpu, idx) <= tl_per_task) ||
+ 100*(tl + p->load_weight) <= imbalance*load) {
 /*
 * This domain has SD_WAKE_AFFINE and
 * p is cache cold in this domain, and
@@ -1425,7 +1513,7 @@ void fastcall wake_up_new_task(task_t *p
 list_add_tail(&p->run_list, &current->run_list);
 p->array = current->array;
 p->array->nr_active++;
- rq->nr_running++;
+ inc_nr_running(p, rq);
 }
 set_need_resched();
 } else
@@ -1770,9 +1858,9 @@ void pull_task(runqueue_t *src_rq, prio_
 runqueue_t *this_rq, prio_array_t *this_array, int this_cpu)
 {
 dequeue_task(p, src_array);
- src_rq->nr_running--;
+ dec_nr_running(p, src_rq);
 set_task_cpu(p, this_cpu);
- this_rq->nr_running++;
+ inc_nr_running(p, this_rq);
 enqueue_task(p, this_array);
 p->timestamp = (p->timestamp - src_rq->timestamp_last_tick)
 + this_rq->timestamp_last_tick;
@@ -1820,24 +1908,27 @@ int can_migrate_task(task_t *p, runqueue
 }

 /*
- * move_tasks tries to move up to max_nr_move tasks from busiest to this_rq,
- * as part of a balancing operation within "domain". Returns the number of
- * tasks moved.
+ * move_tasks tries to move up to max_nr_move tasks and max_load_move weighted
+ * load from busiest to this_rq, as part of a balancing operation within
+ * "domain". Returns the number of tasks moved.
 *
 * Called with both runqueues locked.
 */
 static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest,
- unsigned long max_nr_move, struct sched_domain *sd,
- enum idle_type idle, int *all_pinned)
+ unsigned long max_nr_move, unsigned long max_load_move,
+ struct sched_domain *sd, enum idle_type idle,
+ int *all_pinned)
 {
 prio_array_t *array, *dst_array;
 struct list_head *head, *curr;
 int idx, pulled = 0, pinned = 0;
+ long rem_load_move;
 task_t *tmp;

- if (max_nr_move == 0)
+ if (max_nr_move == 0 || max_load_move == 0)
 goto out;

+ rem_load_move = max_load_move;
 pinned = 1;

 /*
@@ -1878,7 +1969,8 @@ skip_queue:

 curr = curr->prev;

- if (!can_migrate_task(tmp, busiest, this_cpu, sd, idle, &pinned)) {
+ if (tmp->load_weight > rem_load_move ||
+ !can_migrate_task(tmp, busiest, this_cpu, sd, idle, &pinned)) {
 if (curr != head)
 goto skip_queue;
 idx++;
@@ -1892,9 +1984,13 @@ skip_queue:

 pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu);
 pulled++;
+ rem_load_move -= tmp->load_weight;

- /* We only want to steal up to the prescribed number of tasks. */
- if (pulled < max_nr_move) {
+ /*
+ * We only want to steal up to the prescribed number of tasks
+ * and the prescribed amount of weighted load.
+ */
+ if (pulled < max_nr_move && rem_load_move > 0) {
 if (curr != head)
 goto skip_queue;
 idx++;
@@ -1915,7 +2011,7 @@ out:

 /*
 * find_busiest_group finds and returns the busiest CPU group within the
- * domain. It calculates and returns the number of tasks which should be
+ * domain. It calculates and returns the amount of weighted load which should be
 * moved to restore balance via the imbalance parameter.
 */
 static struct sched_group *
@@ -1925,9 +2021,13 @@ find_busiest_group(struct sched_domain *
 struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
 unsigned long max_load, avg_load, total_load, this_load, total_pwr;
 unsigned long max_pull;
+ unsigned long busiest_load_per_task, busiest_nr_running;
+ unsigned long this_load_per_task, this_nr_running;
 int load_idx;

 max_load = this_load = total_load = total_pwr = 0;
+ busiest_load_per_task = busiest_nr_running = 0;
+ this_load_per_task = this_nr_running = 0;
 if (idle == NOT_IDLE)
 load_idx = sd->busy_idx;
 else if (idle == NEWLY_IDLE)
@@ -1939,13 +2039,16 @@ find_busiest_group(struct sched_domain *
 unsigned long load;
 int local_group;
 int i;
+ unsigned long sum_nr_running, sum_weighted_load;

 local_group = cpu_isset(this_cpu, group->cpumask);

 /* Tally up the load of all CPUs in the group */
- avg_load = 0;
+ sum_weighted_load = sum_nr_running = avg_load = 0;

 for_each_cpu_mask(i, group->cpumask) {
+ runqueue_t *rq = cpu_rq(i);
+
 if (*sd_idle && !idle_cpu(i))
 *sd_idle = 0;

@@ -1956,6 +2059,8 @@ find_busiest_group(struct sched_domain *
 load = source_load(i, load_idx);

 avg_load += load;
+ sum_nr_running += rq->nr_running;
+ sum_weighted_load += rq->raw_weighted_load;
 }

 total_load += avg_load;
@@ -1967,14 +2072,18 @@ find_busiest_group(struct sched_domain *
 if (local_group) {
 this_load = avg_load;
 this = group;
+ this_nr_running = sum_nr_running;
+ this_load_per_task = sum_weighted_load;
 } else if (avg_load > max_load) {
 max_load = avg_load;
 busiest = group;
+ busiest_nr_running = sum_nr_running;
+ busiest_load_per_task = sum_weighted_load;
 }
 group = group->next;
 } while (group != sd->groups);

- if (!busiest || this_load >= max_load || max_load <= SCHED_LOAD_SCALE)
+ if (!busiest || this_load >= max_load || busiest_nr_running <= 1)
 goto out_balanced;

 avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr;
@@ -1983,6 +2092,7 @@ find_busiest_group(struct sched_domain *
 100*max_load <= sd->imbalance_pct*this_load)
 goto out_balanced;

+ busiest_load_per_task /= busiest_nr_running;
 /*
 * We're trying to get all the cpus to the average_load, so we don't
 * want to push ourselves above the average load, nor do we wish to
@@ -1996,19 +2106,25 @@ find_busiest_group(struct sched_domain *
 */

 /* Don't want to pull so many tasks that a group would go idle */
- max_pull = min(max_load - avg_load, max_load - SCHED_LOAD_SCALE);
+ max_pull = min(max_load - avg_load, max_load - busiest_load_per_task);

 /* How much load to actually move to equalise the imbalance */
 *imbalance = min(max_pull * busiest->cpu_power,
 (avg_load - this_load) * this->cpu_power)
 / SCHED_LOAD_SCALE;

- if (*imbalance < SCHED_LOAD_SCALE) {
+ /*
+ * if *imbalance is less than the average load per runnable task
+ * there is no gaurantee that any tasks will be moved so we'll have
+ * a think about bumping its value to force at least one task to be
+ * moved
+ */
+ if (*imbalance < busiest_load_per_task) {
 unsigned long pwr_now = 0, pwr_move = 0;
 unsigned long tmp;

- if (max_load - this_load >= SCHED_LOAD_SCALE*2) {
- *imbalance = 1;
+ if (max_load - this_load >= busiest_load_per_task*2) {
+ *imbalance = busiest_load_per_task;
 return busiest;
 }

@@ -2018,35 +2134,39 @@ find_busiest_group(struct sched_domain *
 * moving them.
 */

- pwr_now += busiest->cpu_power*min(SCHED_LOAD_SCALE, max_load);
- pwr_now += this->cpu_power*min(SCHED_LOAD_SCALE, this_load);
+ pwr_now += busiest->cpu_power *
+ min(busiest_load_per_task, max_load);
+ if (this_nr_running)
+ this_load_per_task /= this_nr_running;
+ pwr_now += this->cpu_power *
+ min(this_load_per_task, this_load);
 pwr_now /= SCHED_LOAD_SCALE;

 /* Amount of load we'd subtract */
- tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/busiest->cpu_power;
+ tmp = busiest_load_per_task*SCHED_LOAD_SCALE/busiest->cpu_power;
 if (max_load > tmp)
- pwr_move += busiest->cpu_power*min(SCHED_LOAD_SCALE,
- max_load - tmp);
+ pwr_move += busiest->cpu_power *
+ min(busiest_load_per_task, max_load - tmp);

 /* Amount of load we'd add */
 if (max_load*busiest->cpu_power <
- SCHED_LOAD_SCALE*SCHED_LOAD_SCALE)
+ busiest_load_per_task*SCHED_LOAD_SCALE)
 tmp = max_load*busiest->cpu_power/this->cpu_power;
 else
- tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/this->cpu_power;
- pwr_move += this->cpu_power*min(SCHED_LOAD_SCALE, this_load + tmp);
+ tmp = busiest_load_per_task*SCHED_LOAD_SCALE/this->cpu_power;
+ pwr_move += this->cpu_power*min(this_load_per_task, this_load + tmp);
 pwr_move /= SCHED_LOAD_SCALE;

 /* Move if we gain throughput */
- if (pwr_move <= pwr_now)
+ if (pwr_move > pwr_now)
+ *imbalance = busiest_load_per_task;
+ /* or if there's a reasonable chance that *imbalance is big
+ * enough to cause a move
+ */
+ else if (*imbalance <= busiest_load_per_task / 2)
 goto out_balanced;
-
- *imbalance = 1;
- return busiest;
 }

- /* Get rid of the scaling factor, rounding down as we divide */
- *imbalance = *imbalance / SCHED_LOAD_SCALE;
 return busiest;

 out_balanced:
@@ -2083,6 +2203,7 @@ static runqueue_t *find_busiest_queue(st
 */
 #define MAX_PINNED_INTERVAL 512

+#define minus_1_or_zero(n) ((n) > 0 ? (n) - 1 : 0)
 /*
 * Check this_cpu to ensure it is balanced within domain. Attempt to move
 * tasks if there is an imbalance.
@@ -2130,6 +2251,7 @@ static int load_balance(int this_cpu, ru
 */
 double_rq_lock(this_rq, busiest);
 nr_moved = move_tasks(this_rq, this_cpu, busiest,
+ minus_1_or_zero(busiest->nr_running),
 imbalance, sd, idle, &all_pinned);
 double_rq_unlock(this_rq, busiest);

@@ -2248,6 +2370,7 @@ static int load_balance_newidle(int this
 /* Attempt to move tasks */
 double_lock_balance(this_rq, busiest);
 nr_moved = move_tasks(this_rq, this_cpu, busiest,
+ minus_1_or_zero(busiest->nr_running),
 imbalance, sd, NEWLY_IDLE, NULL);
 spin_unlock(&busiest->lock);
 }
@@ -2328,7 +2451,8 @@ static void active_load_balance(runqueue

 schedstat_inc(sd, alb_cnt);

- if (move_tasks(target_rq, target_cpu, busiest_rq, 1, sd, SCHED_IDLE, NULL))
+ if (move_tasks(target_rq, target_cpu, busiest_rq, 1,
+ RTPRIO_TO_LOAD_WEIGHT(100), sd, SCHED_IDLE, NULL))
 schedstat_inc(sd, alb_pushed);
 else
 schedstat_inc(sd, alb_failed);
@@ -2356,7 +2480,7 @@ static void rebalance_tick(int this_cpu,
 struct sched_domain *sd;
 int i;

- this_load = this_rq->nr_running * SCHED_LOAD_SCALE;
+ this_load = this_rq->raw_weighted_load;
 /* Update our load */
 for (i = 0; i < 3; i++) {
 unsigned long new_load = this_load;
@@ -3466,17 +3590,21 @@ void set_user_nice(task_t *p, long nice)
 goto out_unlock;
 }
 array = p->array;
- if (array)
+ if (array) {
 dequeue_task(p, array);
+ dec_raw_weighted_load(rq, p);
+ }

 old_prio = p->prio;
 new_prio = NICE_TO_PRIO(nice);
 delta = new_prio - old_prio;
 p->static_prio = NICE_TO_PRIO(nice);
+ set_load_weight(p);
 p->prio += delta;

 if (array) {
 enqueue_task(p, array);
+ inc_raw_weighted_load(rq, p);
 /*
 * If the task increased its priority or is running and
 * lowered its priority, then reschedule its CPU:
@@ -3612,6 +3740,7 @@ static void __setscheduler(struct task_s
 if (policy == SCHED_BATCH)
 p->sleep_avg = 0;
 }
+ set_load_weight(p);
 }

 /**
@@ -6049,6 +6178,7 @@ void __init sched_init(void)
 }
 }

+ set_load_weight(&init_task);
 /*
 * The boot idle thread does lazy MMU switching as well:
 */