Magellan Linux

Contents of /trunk/kernel26-alx/patches-2.6.17-r7/0001-2.6.17-sched-implement-smpnice.patch



Revision 199
Fri May 18 11:04:36 2007 UTC by niro
File size: 22845 byte(s)
-import

1
2 To aid in avoiding the subversion of "niceness" due to uneven distribution
3 of tasks with abnormal "nice" values across CPUs the contribution that
4 each task makes to its run queue's load is weighted according to its
5 scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a
6 scaled version of the new time slice allocation that they receive on time
7 slice expiry etc.
8
9 Signed-off-by: Peter Williams <pwil3058@bigpond.com.au>
10 Signed-off-by: Con Kolivas <kernel@kolivas.org>
11
12 ---
13 include/linux/sched.h | 8 -
14 kernel/sched.c | 313 +++++++++++++++++++++++++++++++++++++++-----------
15 2 files changed, 253 insertions(+), 68 deletions(-)
16
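
    [Editor's note: the following sketch is not part of the patch. It is ordinary
    user-space C that mirrors the macros the patch adds to kernel/sched.c
    (SCALE_PRIO(), static_prio_timeslice(), LOAD_WEIGHT(), PRIO_TO_LOAD_WEIGHT())
    so the description above can be checked with concrete numbers. It assumes the
    2.6.17 constants MAX_PRIO=140, MAX_RT_PRIO=100, SCHED_LOAD_SCALE=128 and, with
    HZ=1000, DEF_TIMESLICE=100 and MIN_TIMESLICE=5 jiffies. Under this weighting a
    nice 0 task contributes SCHED_LOAD_SCALE (128) to its run queue's
    raw_weighted_load, a nice -20 task contributes 1024, and a nice 19 task only 6.]

    /*
     * Illustrative sketch only -- a user-space rendering of the load-weight
     * macros this patch adds to kernel/sched.c, with 2.6.17 constants and
     * HZ assumed to be 1000 (so timeslices are in milliseconds/jiffies).
     */
    #include <stdio.h>

    #define MAX_RT_PRIO        100
    #define MAX_PRIO           140
    #define MAX_USER_PRIO      40                  /* MAX_PRIO - MAX_RT_PRIO */
    #define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20)
    #define MIN_TIMESLICE      5                   /* 5 ms at HZ=1000 */
    #define DEF_TIMESLICE      100                 /* 100 ms at HZ=1000 */
    #define SCHED_LOAD_SCALE   128UL

    /* kernel uses max(); a ternary keeps this sketch self-contained */
    #define SCALE_PRIO(x, prio) \
            (((x) * (MAX_PRIO - (prio)) / (MAX_USER_PRIO / 2)) > MIN_TIMESLICE ? \
             ((x) * (MAX_PRIO - (prio)) / (MAX_USER_PRIO / 2)) : MIN_TIMESLICE)

    static unsigned int static_prio_timeslice(int static_prio)
    {
            /* negative nice gets 4x the base slice, nice >= 0 the plain slice */
            if (static_prio < NICE_TO_PRIO(0))
                    return SCALE_PRIO(DEF_TIMESLICE * 4, static_prio);
            else
                    return SCALE_PRIO(DEF_TIMESLICE, static_prio);
    }

    /* TIME_SLICE_NICE_ZERO == DEF_TIMESLICE, as the patch's comment assumes */
    #define LOAD_WEIGHT(lp)           (((lp) * SCHED_LOAD_SCALE) / DEF_TIMESLICE)
    #define PRIO_TO_LOAD_WEIGHT(prio) LOAD_WEIGHT(static_prio_timeslice(prio))

    int main(void)
    {
            static const int nices[] = { -20, -10, 0, 10, 19 };
            unsigned int i;

            for (i = 0; i < sizeof(nices) / sizeof(nices[0]); i++) {
                    int prio = NICE_TO_PRIO(nices[i]);

                    printf("nice %3d -> timeslice %3ums, load_weight %4lu\n",
                           nices[i], static_prio_timeslice(prio),
                           PRIO_TO_LOAD_WEIGHT(prio));
            }
            return 0;
    }
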
17 Index: linux-ck-dev/include/linux/sched.h
18 ===================================================================
19 --- linux-ck-dev.orig/include/linux/sched.h 2006-06-18 15:20:15.000000000 +1000
20 +++ linux-ck-dev/include/linux/sched.h 2006-06-18 15:21:31.000000000 +1000
21 @@ -102,6 +102,7 @@ extern unsigned long nr_running(void);
22 extern unsigned long nr_uninterruptible(void);
23 extern unsigned long nr_active(void);
24 extern unsigned long nr_iowait(void);
25 +extern unsigned long weighted_cpuload(const int cpu);
26
27 #include <linux/time.h>
28 #include <linux/param.h>
29 @@ -547,9 +548,9 @@ enum idle_type
30 /*
31 * sched-domains (multiprocessor balancing) declarations:
32 */
33 -#ifdef CONFIG_SMP
34 #define SCHED_LOAD_SCALE 128UL /* increase resolution of load */
35
36 +#ifdef CONFIG_SMP
37 #define SD_LOAD_BALANCE 1 /* Do load balancing on this domain. */
38 #define SD_BALANCE_NEWIDLE 2 /* Balance when about to become idle */
39 #define SD_BALANCE_EXEC 4 /* Balance on exec */
40 @@ -702,9 +703,12 @@ struct task_struct {
41
42 int lock_depth; /* BKL lock depth */
43
44 -#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
45 +#ifdef CONFIG_SMP
46 +#ifdef __ARCH_WANT_UNLOCKED_CTXSW
47 int oncpu;
48 #endif
49 +#endif
50 + int load_weight; /* for niceness load balancing purposes */
51 int prio, static_prio;
52 struct list_head run_list;
53 prio_array_t *array;
54 Index: linux-ck-dev/kernel/sched.c
55 ===================================================================
56 --- linux-ck-dev.orig/kernel/sched.c 2006-06-18 15:20:15.000000000 +1000
57 +++ linux-ck-dev/kernel/sched.c 2006-06-18 15:21:31.000000000 +1000
58 @@ -168,15 +168,21 @@
59 */
60
61 #define SCALE_PRIO(x, prio) \
62 - max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO/2), MIN_TIMESLICE)
63 + max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_TIMESLICE)
64
65 -static unsigned int task_timeslice(task_t *p)
66 +static unsigned int static_prio_timeslice(int static_prio)
67 {
68 - if (p->static_prio < NICE_TO_PRIO(0))
69 - return SCALE_PRIO(DEF_TIMESLICE*4, p->static_prio);
70 + if (static_prio < NICE_TO_PRIO(0))
71 + return SCALE_PRIO(DEF_TIMESLICE * 4, static_prio);
72 else
73 - return SCALE_PRIO(DEF_TIMESLICE, p->static_prio);
74 + return SCALE_PRIO(DEF_TIMESLICE, static_prio);
75 }
76 +
77 +static inline unsigned int task_timeslice(task_t *p)
78 +{
79 + return static_prio_timeslice(p->static_prio);
80 +}
81 +
82 #define task_hot(p, now, sd) ((long long) ((now) - (p)->last_ran) \
83 < (long long) (sd)->cache_hot_time)
84
85 @@ -209,6 +215,7 @@ struct runqueue {
86 * remote CPUs use both these fields when doing load calculation.
87 */
88 unsigned long nr_running;
89 + unsigned long raw_weighted_load;
90 #ifdef CONFIG_SMP
91 unsigned long cpu_load[3];
92 #endif
93 @@ -665,6 +672,68 @@ static int effective_prio(task_t *p)
94 }
95
96 /*
97 + * To aid in avoiding the subversion of "niceness" due to uneven distribution
98 + * of tasks with abnormal "nice" values across CPUs the contribution that
99 + * each task makes to its run queue's load is weighted according to its
100 + * scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a
101 + * scaled version of the new time slice allocation that they receive on time
102 + * slice expiry etc.
103 + */
104 +
105 +/*
106 + * Assume: static_prio_timeslice(NICE_TO_PRIO(0)) == DEF_TIMESLICE
107 + * If static_prio_timeslice() is ever changed to break this assumption then
108 + * this code will need modification
109 + */
110 +#define TIME_SLICE_NICE_ZERO DEF_TIMESLICE
111 +#define LOAD_WEIGHT(lp) \
112 + (((lp) * SCHED_LOAD_SCALE) / TIME_SLICE_NICE_ZERO)
113 +#define PRIO_TO_LOAD_WEIGHT(prio) \
114 + LOAD_WEIGHT(static_prio_timeslice(prio))
115 +#define RTPRIO_TO_LOAD_WEIGHT(rp) \
116 + (PRIO_TO_LOAD_WEIGHT(MAX_RT_PRIO) + LOAD_WEIGHT(rp))
117 +
118 +static void set_load_weight(task_t *p)
119 +{
120 + if (rt_task(p)) {
121 +#ifdef CONFIG_SMP
122 + if (p == task_rq(p)->migration_thread)
123 + /*
124 + * The migration thread does the actual balancing.
125 + * Giving its load any weight will skew balancing
126 + * adversely.
127 + */
128 + p->load_weight = 0;
129 + else
130 +#endif
131 + p->load_weight = RTPRIO_TO_LOAD_WEIGHT(p->rt_priority);
132 + } else
133 + p->load_weight = PRIO_TO_LOAD_WEIGHT(p->static_prio);
134 +}
135 +
136 +static inline void inc_raw_weighted_load(runqueue_t *rq, const task_t *p)
137 +{
138 + rq->raw_weighted_load += p->load_weight;
139 +}
140 +
141 +static inline void dec_raw_weighted_load(runqueue_t *rq, const task_t *p)
142 +{
143 + rq->raw_weighted_load -= p->load_weight;
144 +}
145 +
146 +static inline void inc_nr_running(task_t *p, runqueue_t *rq)
147 +{
148 + rq->nr_running++;
149 + inc_raw_weighted_load(rq, p);
150 +}
151 +
152 +static inline void dec_nr_running(task_t *p, runqueue_t *rq)
153 +{
154 + rq->nr_running--;
155 + dec_raw_weighted_load(rq, p);
156 +}
157 +
158 +/*
159 * __activate_task - move a task to the runqueue.
160 */
161 static void __activate_task(task_t *p, runqueue_t *rq)
162 @@ -674,7 +743,7 @@ static void __activate_task(task_t *p, r
163 if (batch_task(p))
164 target = rq->expired;
165 enqueue_task(p, target);
166 - rq->nr_running++;
167 + inc_nr_running(p, rq);
168 }
169
170 /*
171 @@ -683,7 +752,7 @@ static void __activate_task(task_t *p, r
172 static inline void __activate_idle_task(task_t *p, runqueue_t *rq)
173 {
174 enqueue_task_head(p, rq->active);
175 - rq->nr_running++;
176 + inc_nr_running(p, rq);
177 }
178
179 static int recalc_task_prio(task_t *p, unsigned long long now)
180 @@ -805,7 +874,7 @@ static void activate_task(task_t *p, run
181 */
182 static void deactivate_task(struct task_struct *p, runqueue_t *rq)
183 {
184 - rq->nr_running--;
185 + dec_nr_running(p, rq);
186 dequeue_task(p, p->array);
187 p->array = NULL;
188 }
189 @@ -855,6 +924,12 @@ inline int task_curr(const task_t *p)
190 return cpu_curr(task_cpu(p)) == p;
191 }
192
193 +/* Used instead of source_load when we know the type == 0 */
194 +unsigned long weighted_cpuload(const int cpu)
195 +{
196 + return cpu_rq(cpu)->raw_weighted_load;
197 +}
198 +
199 #ifdef CONFIG_SMP
200 typedef struct {
201 struct list_head list;
202 @@ -944,7 +1019,8 @@ void kick_process(task_t *p)
203 }
204
205 /*
206 - * Return a low guess at the load of a migration-source cpu.
207 + * Return a low guess at the load of a migration-source cpu weighted
208 + * according to the scheduling class and "nice" value.
209 *
210 * We want to under-estimate the load of migration sources, to
211 * balance conservatively.
212 @@ -952,24 +1028,36 @@ void kick_process(task_t *p)
213 static inline unsigned long source_load(int cpu, int type)
214 {
215 runqueue_t *rq = cpu_rq(cpu);
216 - unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE;
217 +
218 if (type == 0)
219 - return load_now;
220 + return rq->raw_weighted_load;
221
222 - return min(rq->cpu_load[type-1], load_now);
223 + return min(rq->cpu_load[type-1], rq->raw_weighted_load);
224 }
225
226 /*
227 - * Return a high guess at the load of a migration-target cpu
228 + * Return a high guess at the load of a migration-target cpu weighted
229 + * according to the scheduling class and "nice" value.
230 */
231 static inline unsigned long target_load(int cpu, int type)
232 {
233 runqueue_t *rq = cpu_rq(cpu);
234 - unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE;
235 +
236 if (type == 0)
237 - return load_now;
238 + return rq->raw_weighted_load;
239 +
240 + return max(rq->cpu_load[type-1], rq->raw_weighted_load);
241 +}
242 +
243 +/*
244 + * Return the average load per task on the cpu's run queue
245 + */
246 +static inline unsigned long cpu_avg_load_per_task(int cpu)
247 +{
248 + runqueue_t *rq = cpu_rq(cpu);
249 + unsigned long n = rq->nr_running;
250
251 - return max(rq->cpu_load[type-1], load_now);
252 + return n ? rq->raw_weighted_load / n : SCHED_LOAD_SCALE;
253 }
254
255 /*
256 @@ -1042,7 +1130,7 @@ find_idlest_cpu(struct sched_group *grou
257 cpus_and(tmp, group->cpumask, p->cpus_allowed);
258
259 for_each_cpu_mask(i, tmp) {
260 - load = source_load(i, 0);
261 + load = weighted_cpuload(i);
262
263 if (load < min_load || (load == min_load && i == this_cpu)) {
264 min_load = load;
265 @@ -1221,17 +1309,19 @@ static int try_to_wake_up(task_t *p, uns
266
267 if (this_sd->flags & SD_WAKE_AFFINE) {
268 unsigned long tl = this_load;
269 + unsigned long tl_per_task = cpu_avg_load_per_task(this_cpu);
270 +
271 /*
272 * If sync wakeup then subtract the (maximum possible)
273 * effect of the currently running task from the load
274 * of the current CPU:
275 */
276 if (sync)
277 - tl -= SCHED_LOAD_SCALE;
278 + tl -= current->load_weight;
279
280 if ((tl <= load &&
281 - tl + target_load(cpu, idx) <= SCHED_LOAD_SCALE) ||
282 - 100*(tl + SCHED_LOAD_SCALE) <= imbalance*load) {
283 + tl + target_load(cpu, idx) <= tl_per_task) ||
284 + 100*(tl + p->load_weight) <= imbalance*load) {
285 /*
286 * This domain has SD_WAKE_AFFINE and
287 * p is cache cold in this domain, and
288 @@ -1430,7 +1520,7 @@ void fastcall wake_up_new_task(task_t *p
289 list_add_tail(&p->run_list, &current->run_list);
290 p->array = current->array;
291 p->array->nr_active++;
292 - rq->nr_running++;
293 + inc_nr_running(p, rq);
294 }
295 set_need_resched();
296 } else
297 @@ -1799,9 +1889,9 @@ void pull_task(runqueue_t *src_rq, prio_
298 runqueue_t *this_rq, prio_array_t *this_array, int this_cpu)
299 {
300 dequeue_task(p, src_array);
301 - src_rq->nr_running--;
302 + dec_nr_running(p, src_rq);
303 set_task_cpu(p, this_cpu);
304 - this_rq->nr_running++;
305 + inc_nr_running(p, this_rq);
306 enqueue_task(p, this_array);
307 p->timestamp = (p->timestamp - src_rq->timestamp_last_tick)
308 + this_rq->timestamp_last_tick;
309 @@ -1848,26 +1938,42 @@ int can_migrate_task(task_t *p, runqueue
310 return 1;
311 }
312
313 +#define rq_best_prio(rq) min((rq)->curr->prio, (rq)->best_expired_prio)
314 /*
315 - * move_tasks tries to move up to max_nr_move tasks from busiest to this_rq,
316 - * as part of a balancing operation within "domain". Returns the number of
317 - * tasks moved.
318 + * move_tasks tries to move up to max_nr_move tasks and max_load_move weighted
319 + * load from busiest to this_rq, as part of a balancing operation within
320 + * "domain". Returns the number of tasks moved.
321 *
322 * Called with both runqueues locked.
323 */
324 static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest,
325 - unsigned long max_nr_move, struct sched_domain *sd,
326 - enum idle_type idle, int *all_pinned)
327 + unsigned long max_nr_move, unsigned long max_load_move,
328 + struct sched_domain *sd, enum idle_type idle,
329 + int *all_pinned)
330 {
331 prio_array_t *array, *dst_array;
332 struct list_head *head, *curr;
333 - int idx, pulled = 0, pinned = 0;
334 + int idx, pulled = 0, pinned = 0, this_best_prio, busiest_best_prio;
335 + int busiest_best_prio_seen;
336 + int skip_for_load; /* skip the task based on weighted load issues */
337 + long rem_load_move;
338 task_t *tmp;
339
340 - if (max_nr_move == 0)
341 + if (max_nr_move == 0 || max_load_move == 0)
342 goto out;
343
344 + rem_load_move = max_load_move;
345 pinned = 1;
346 + this_best_prio = rq_best_prio(this_rq);
347 + busiest_best_prio = rq_best_prio(busiest);
348 + /*
349 + * Enable handling of the case where there is more than one task
350 + * with the best priority. If the current running task is one
351 + * of those with prio==busiest_best_prio we know it won't be moved
352 + * and therefore it's safe to override the skip (based on load) of
353 + * any task we find with that prio.
354 + */
355 + busiest_best_prio_seen = busiest_best_prio == busiest->curr->prio;
356
357 /*
358 * We first consider expired tasks. Those will likely not be
359 @@ -1907,7 +2013,17 @@ skip_queue:
360
361 curr = curr->prev;
362
363 - if (!can_migrate_task(tmp, busiest, this_cpu, sd, idle, &pinned)) {
364 + /*
365 + * To help distribute high priority tasks across CPUs we don't
366 + * skip a task if it will be the highest priority task (i.e. smallest
367 + * prio value) on its new queue regardless of its load weight
368 + */
369 + skip_for_load = tmp->load_weight > rem_load_move;
370 + if (skip_for_load && idx < this_best_prio)
371 + skip_for_load = !busiest_best_prio_seen && idx == busiest_best_prio;
372 + if (skip_for_load ||
373 + !can_migrate_task(tmp, busiest, this_cpu, sd, idle, &pinned)) {
374 + busiest_best_prio_seen |= idx == busiest_best_prio;
375 if (curr != head)
376 goto skip_queue;
377 idx++;
378 @@ -1921,9 +2037,15 @@ skip_queue:
379
380 pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu);
381 pulled++;
382 + rem_load_move -= tmp->load_weight;
383
384 - /* We only want to steal up to the prescribed number of tasks. */
385 - if (pulled < max_nr_move) {
386 + /*
387 + * We only want to steal up to the prescribed number of tasks
388 + * and the prescribed amount of weighted load.
389 + */
390 + if (pulled < max_nr_move && rem_load_move > 0) {
391 + if (idx < this_best_prio)
392 + this_best_prio = idx;
393 if (curr != head)
394 goto skip_queue;
395 idx++;
396 @@ -1944,7 +2066,7 @@ out:
397
398 /*
399 * find_busiest_group finds and returns the busiest CPU group within the
400 - * domain. It calculates and returns the number of tasks which should be
401 + * domain. It calculates and returns the amount of weighted load which should be
402 * moved to restore balance via the imbalance parameter.
403 */
404 static struct sched_group *
405 @@ -1954,9 +2076,13 @@ find_busiest_group(struct sched_domain *
406 struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
407 unsigned long max_load, avg_load, total_load, this_load, total_pwr;
408 unsigned long max_pull;
409 + unsigned long busiest_load_per_task, busiest_nr_running;
410 + unsigned long this_load_per_task, this_nr_running;
411 int load_idx;
412
413 max_load = this_load = total_load = total_pwr = 0;
414 + busiest_load_per_task = busiest_nr_running = 0;
415 + this_load_per_task = this_nr_running = 0;
416 if (idle == NOT_IDLE)
417 load_idx = sd->busy_idx;
418 else if (idle == NEWLY_IDLE)
419 @@ -1968,13 +2094,17 @@ find_busiest_group(struct sched_domain *
420 unsigned long load;
421 int local_group;
422 int i;
423 + unsigned long sum_nr_running, sum_weighted_load;
424 + unsigned int nr_loaded_cpus = 0; /* where nr_running > 1 */
425
426 local_group = cpu_isset(this_cpu, group->cpumask);
427
428 /* Tally up the load of all CPUs in the group */
429 - avg_load = 0;
430 + sum_weighted_load = sum_nr_running = avg_load = 0;
431
432 for_each_cpu_mask(i, group->cpumask) {
433 + runqueue_t *rq = cpu_rq(i);
434 +
435 if (*sd_idle && !idle_cpu(i))
436 *sd_idle = 0;
437
438 @@ -1985,6 +2115,10 @@ find_busiest_group(struct sched_domain *
439 load = source_load(i, load_idx);
440
441 avg_load += load;
442 + sum_nr_running += rq->nr_running;
443 + if (rq->nr_running > 1)
444 + ++nr_loaded_cpus;
445 + sum_weighted_load += rq->raw_weighted_load;
446 }
447
448 total_load += avg_load;
449 @@ -1996,14 +2130,19 @@ find_busiest_group(struct sched_domain *
450 if (local_group) {
451 this_load = avg_load;
452 this = group;
453 - } else if (avg_load > max_load) {
454 + this_nr_running = sum_nr_running;
455 + this_load_per_task = sum_weighted_load;
456 + } else if (avg_load > max_load &&
457 + sum_nr_running > group->cpu_power / SCHED_LOAD_SCALE) {
458 max_load = avg_load;
459 busiest = group;
460 + busiest_nr_running = sum_nr_running;
461 + busiest_load_per_task = sum_weighted_load;
462 }
463 group = group->next;
464 } while (group != sd->groups);
465
466 - if (!busiest || this_load >= max_load || max_load <= SCHED_LOAD_SCALE)
467 + if (!busiest || this_load >= max_load || busiest_nr_running == 0)
468 goto out_balanced;
469
470 avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr;
471 @@ -2012,6 +2151,7 @@ find_busiest_group(struct sched_domain *
472 100*max_load <= sd->imbalance_pct*this_load)
473 goto out_balanced;
474
475 + busiest_load_per_task /= busiest_nr_running;
476 /*
477 * We're trying to get all the cpus to the average_load, so we don't
478 * want to push ourselves above the average load, nor do we wish to
479 @@ -2023,21 +2163,50 @@ find_busiest_group(struct sched_domain *
480 * by pulling tasks to us. Be careful of negative numbers as they'll
481 * appear as very large values with unsigned longs.
482 */
483 + if (max_load <= busiest_load_per_task)
484 + goto out_balanced;
485 +
486 + /*
487 + * In the presence of smp nice balancing, certain scenarios can have
488 + * max load less than avg load (as we skip the groups at or below
489 + * its cpu_power, while calculating max_load..)
490 + */
491 + if (max_load < avg_load) {
492 + *imbalance = 0;
493 + goto small_imbalance;
494 + }
495
496 /* Don't want to pull so many tasks that a group would go idle */
497 - max_pull = min(max_load - avg_load, max_load - SCHED_LOAD_SCALE);
498 + max_pull = min(max_load - avg_load, max_load - busiest_load_per_task);
499
500 /* How much load to actually move to equalise the imbalance */
501 *imbalance = min(max_pull * busiest->cpu_power,
502 (avg_load - this_load) * this->cpu_power)
503 / SCHED_LOAD_SCALE;
504
505 - if (*imbalance < SCHED_LOAD_SCALE) {
506 - unsigned long pwr_now = 0, pwr_move = 0;
507 + /*
508 + * if *imbalance is less than the average load per runnable task
509 + * there is no guarantee that any tasks will be moved so we'll have
510 + * a think about bumping its value to force at least one task to be
511 + * moved
512 + */
513 + if (*imbalance < busiest_load_per_task) {
514 + unsigned long pwr_now, pwr_move;
515 unsigned long tmp;
516 + unsigned int imbn;
517
518 - if (max_load - this_load >= SCHED_LOAD_SCALE*2) {
519 - *imbalance = 1;
520 +small_imbalance:
521 + pwr_move = pwr_now = 0;
522 + imbn = 2;
523 + if (this_nr_running) {
524 + this_load_per_task /= this_nr_running;
525 + if (busiest_load_per_task > this_load_per_task)
526 + imbn = 1;
527 + } else
528 + this_load_per_task = SCHED_LOAD_SCALE;
529 +
530 + if (max_load - this_load >= busiest_load_per_task * imbn) {
531 + *imbalance = busiest_load_per_task;
532 return busiest;
533 }
534
535 @@ -2047,35 +2216,34 @@ find_busiest_group(struct sched_domain *
536 * moving them.
537 */
538
539 - pwr_now += busiest->cpu_power*min(SCHED_LOAD_SCALE, max_load);
540 - pwr_now += this->cpu_power*min(SCHED_LOAD_SCALE, this_load);
541 + pwr_now += busiest->cpu_power *
542 + min(busiest_load_per_task, max_load);
543 + pwr_now += this->cpu_power *
544 + min(this_load_per_task, this_load);
545 pwr_now /= SCHED_LOAD_SCALE;
546
547 /* Amount of load we'd subtract */
548 - tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/busiest->cpu_power;
549 + tmp = busiest_load_per_task*SCHED_LOAD_SCALE/busiest->cpu_power;
550 if (max_load > tmp)
551 - pwr_move += busiest->cpu_power*min(SCHED_LOAD_SCALE,
552 - max_load - tmp);
553 + pwr_move += busiest->cpu_power *
554 + min(busiest_load_per_task, max_load - tmp);
555
556 /* Amount of load we'd add */
557 if (max_load*busiest->cpu_power <
558 - SCHED_LOAD_SCALE*SCHED_LOAD_SCALE)
559 + busiest_load_per_task*SCHED_LOAD_SCALE)
560 tmp = max_load*busiest->cpu_power/this->cpu_power;
561 else
562 - tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/this->cpu_power;
563 - pwr_move += this->cpu_power*min(SCHED_LOAD_SCALE, this_load + tmp);
564 + tmp = busiest_load_per_task*SCHED_LOAD_SCALE/this->cpu_power;
565 + pwr_move += this->cpu_power*min(this_load_per_task, this_load + tmp);
566 pwr_move /= SCHED_LOAD_SCALE;
567
568 /* Move if we gain throughput */
569 if (pwr_move <= pwr_now)
570 goto out_balanced;
571
572 - *imbalance = 1;
573 - return busiest;
574 + *imbalance = busiest_load_per_task;
575 }
576
577 - /* Get rid of the scaling factor, rounding down as we divide */
578 - *imbalance = *imbalance / SCHED_LOAD_SCALE;
579 return busiest;
580
581 out_balanced:
582 @@ -2088,18 +2256,21 @@ out_balanced:
583 * find_busiest_queue - find the busiest runqueue among the cpus in group.
584 */
585 static runqueue_t *find_busiest_queue(struct sched_group *group,
586 - enum idle_type idle)
587 + enum idle_type idle, unsigned long imbalance)
588 {
589 - unsigned long load, max_load = 0;
590 - runqueue_t *busiest = NULL;
591 + unsigned long max_load = 0;
592 + runqueue_t *busiest = NULL, *rqi;
593 int i;
594
595 for_each_cpu_mask(i, group->cpumask) {
596 - load = source_load(i, 0);
597 + rqi = cpu_rq(i);
598 +
599 + if (rqi->nr_running == 1 && rqi->raw_weighted_load > imbalance)
600 + continue;
601
602 - if (load > max_load) {
603 - max_load = load;
604 - busiest = cpu_rq(i);
605 + if (rqi->raw_weighted_load > max_load) {
606 + max_load = rqi->raw_weighted_load;
607 + busiest = rqi;
608 }
609 }
610
611 @@ -2112,6 +2283,7 @@ static runqueue_t *find_busiest_queue(st
612 */
613 #define MAX_PINNED_INTERVAL 512
614
615 +#define minus_1_or_zero(n) ((n) > 0 ? (n) - 1 : 0)
616 /*
617 * Check this_cpu to ensure it is balanced within domain. Attempt to move
618 * tasks if there is an imbalance.
619 @@ -2139,7 +2311,7 @@ static int load_balance(int this_cpu, ru
620 goto out_balanced;
621 }
622
623 - busiest = find_busiest_queue(group, idle);
624 + busiest = find_busiest_queue(group, idle, imbalance);
625 if (!busiest) {
626 schedstat_inc(sd, lb_nobusyq[idle]);
627 goto out_balanced;
628 @@ -2159,6 +2331,7 @@ static int load_balance(int this_cpu, ru
629 */
630 double_rq_lock(this_rq, busiest);
631 nr_moved = move_tasks(this_rq, this_cpu, busiest,
632 + minus_1_or_zero(busiest->nr_running),
633 imbalance, sd, idle, &all_pinned);
634 double_rq_unlock(this_rq, busiest);
635
636 @@ -2262,7 +2435,7 @@ static int load_balance_newidle(int this
637 goto out_balanced;
638 }
639
640 - busiest = find_busiest_queue(group, NEWLY_IDLE);
641 + busiest = find_busiest_queue(group, NEWLY_IDLE, imbalance);
642 if (!busiest) {
643 schedstat_inc(sd, lb_nobusyq[NEWLY_IDLE]);
644 goto out_balanced;
645 @@ -2277,6 +2450,7 @@ static int load_balance_newidle(int this
646 /* Attempt to move tasks */
647 double_lock_balance(this_rq, busiest);
648 nr_moved = move_tasks(this_rq, this_cpu, busiest,
649 + minus_1_or_zero(busiest->nr_running),
650 imbalance, sd, NEWLY_IDLE, NULL);
651 spin_unlock(&busiest->lock);
652 }
653 @@ -2357,7 +2531,8 @@ static void active_load_balance(runqueue
654
655 schedstat_inc(sd, alb_cnt);
656
657 - if (move_tasks(target_rq, target_cpu, busiest_rq, 1, sd, SCHED_IDLE, NULL))
658 + if (move_tasks(target_rq, target_cpu, busiest_rq, 1,
659 + RTPRIO_TO_LOAD_WEIGHT(100), sd, SCHED_IDLE, NULL))
660 schedstat_inc(sd, alb_pushed);
661 else
662 schedstat_inc(sd, alb_failed);
663 @@ -2385,7 +2560,7 @@ static void rebalance_tick(int this_cpu,
664 struct sched_domain *sd;
665 int i;
666
667 - this_load = this_rq->nr_running * SCHED_LOAD_SCALE;
668 + this_load = this_rq->raw_weighted_load;
669 /* Update our load */
670 for (i = 0; i < 3; i++) {
671 unsigned long new_load = this_load;
672 @@ -3498,17 +3673,21 @@ void set_user_nice(task_t *p, long nice)
673 goto out_unlock;
674 }
675 array = p->array;
676 - if (array)
677 + if (array) {
678 dequeue_task(p, array);
679 + dec_raw_weighted_load(rq, p);
680 + }
681
682 old_prio = p->prio;
683 new_prio = NICE_TO_PRIO(nice);
684 delta = new_prio - old_prio;
685 p->static_prio = NICE_TO_PRIO(nice);
686 + set_load_weight(p);
687 p->prio += delta;
688
689 if (array) {
690 enqueue_task(p, array);
691 + inc_raw_weighted_load(rq, p);
692 /*
693 * If the task increased its priority or is running and
694 * lowered its priority, then reschedule its CPU:
695 @@ -3644,6 +3823,7 @@ static void __setscheduler(struct task_s
696 if (policy == SCHED_BATCH)
697 p->sleep_avg = 0;
698 }
699 + set_load_weight(p);
700 }
701
702 /**
703 @@ -6141,6 +6321,7 @@ void __init sched_init(void)
704 }
705 }
706
707 + set_load_weight(&init_task);
708 /*
709 * The boot idle thread does lazy MMU switching as well:
710 */
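
    [Editor's note: to see why the switch from nr_running * SCHED_LOAD_SCALE to
    raw_weighted_load matters in the balancing paths above (source_load(),
    target_load(), find_idlest_cpu(), find_busiest_queue()), here is a small
    user-space model -- again only a sketch, not kernel code. Each fake CPU's
    load is the sum of its runnable tasks' load weights, mirroring
    weighted_cpuload(); the weight values are the PRIO_TO_LOAD_WEIGHT() results
    from the earlier sketch (HZ=1000 assumed). A CPU running one nice -20 task
    (weight 1024) now reports a heavier load than a CPU running three nice 0
    tasks (3 x 128 = 384), so the "idlest" choice flips relative to the old
    unweighted nr_running-based load.]

    /*
     * Sketch only: a user-space model of weighted_cpuload() / the
     * find_idlest_cpu() decision after this patch.
     */
    #include <stdio.h>

    struct fake_cpu {
            const char *name;
            const unsigned long *weights;   /* load_weight of each runnable task */
            int nr_running;
    };

    /* sum of per-task weights == rq->raw_weighted_load in the patch */
    static unsigned long weighted_cpuload(const struct fake_cpu *cpu)
    {
            unsigned long load = 0;
            int i;

            for (i = 0; i < cpu->nr_running; i++)
                    load += cpu->weights[i];
            return load;
    }

    int main(void)
    {
            /* cpu0: one nice -20 task; cpu1: three nice 0 tasks */
            static const unsigned long cpu0_tasks[] = { 1024 };
            static const unsigned long cpu1_tasks[] = { 128, 128, 128 };
            struct fake_cpu cpus[] = {
                    { "cpu0", cpu0_tasks, 1 },
                    { "cpu1", cpu1_tasks, 3 },
            };
            unsigned long min_load = weighted_cpuload(&cpus[0]);
            int i, idlest = 0;

            for (i = 0; i < 2; i++) {
                    unsigned long load = weighted_cpuload(&cpus[i]);

                    printf("%s: nr_running=%d raw_weighted_load=%lu\n",
                           cpus[i].name, cpus[i].nr_running, load);
                    if (load < min_load) {
                            min_load = load;
                            idlest = i;
                    }
            }
            /* before the patch cpu0 (fewer tasks) won; with weighting it is cpu1 */
            printf("idlest by weighted load: %s\n", cpus[idlest].name);
            return 0;
    }
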