Magellan Linux

Annotation of /trunk/kernel26-alx/patches-2.6.17-r5/0001-2.6.17-sched-implement-smpnice.patch



Revision 199 - Fri May 18 11:04:36 2007 UTC by niro
File size: 22845 byte(s)
-import

1 niro 199
2     To aid in avoiding the subversion of "niceness" due to uneven distribution
3     of tasks with abnormal "nice" values across CPUs, the contribution that
4     each task makes to its run queue's load is weighted according to its
5     scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a
6     scaled version of the new time slice allocation that they receive on time
7     slice expiry etc. (see the standalone sketch after the patch)
8    
9     Signed-off-by: Peter Williams <pwil3058@bigpond.com.au>
10     Signed-off-by: Con Kolivas <kernel@kolivas.org>
11    
12     ---
13     include/linux/sched.h | 8 -
14     kernel/sched.c | 313 +++++++++++++++++++++++++++++++++++++++-----------
15     2 files changed, 253 insertions(+), 68 deletions(-)
16    
17     Index: linux-ck-dev/include/linux/sched.h
18     ===================================================================
19     --- linux-ck-dev.orig/include/linux/sched.h 2006-06-18 15:20:15.000000000 +1000
20     +++ linux-ck-dev/include/linux/sched.h 2006-06-18 15:21:31.000000000 +1000
21     @@ -102,6 +102,7 @@ extern unsigned long nr_running(void);
22     extern unsigned long nr_uninterruptible(void);
23     extern unsigned long nr_active(void);
24     extern unsigned long nr_iowait(void);
25     +extern unsigned long weighted_cpuload(const int cpu);
26    
27     #include <linux/time.h>
28     #include <linux/param.h>
29     @@ -547,9 +548,9 @@ enum idle_type
30     /*
31     * sched-domains (multiprocessor balancing) declarations:
32     */
33     -#ifdef CONFIG_SMP
34     #define SCHED_LOAD_SCALE 128UL /* increase resolution of load */
35    
36     +#ifdef CONFIG_SMP
37     #define SD_LOAD_BALANCE 1 /* Do load balancing on this domain. */
38     #define SD_BALANCE_NEWIDLE 2 /* Balance when about to become idle */
39     #define SD_BALANCE_EXEC 4 /* Balance on exec */
40     @@ -702,9 +703,12 @@ struct task_struct {
41    
42     int lock_depth; /* BKL lock depth */
43    
44     -#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
45     +#ifdef CONFIG_SMP
46     +#ifdef __ARCH_WANT_UNLOCKED_CTXSW
47     int oncpu;
48     #endif
49     +#endif
50     + int load_weight; /* for niceness load balancing purposes */
51     int prio, static_prio;
52     struct list_head run_list;
53     prio_array_t *array;
54     Index: linux-ck-dev/kernel/sched.c
55     ===================================================================
56     --- linux-ck-dev.orig/kernel/sched.c 2006-06-18 15:20:15.000000000 +1000
57     +++ linux-ck-dev/kernel/sched.c 2006-06-18 15:21:31.000000000 +1000
58     @@ -168,15 +168,21 @@
59     */
60    
61     #define SCALE_PRIO(x, prio) \
62     - max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO/2), MIN_TIMESLICE)
63     + max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_TIMESLICE)
64    
65     -static unsigned int task_timeslice(task_t *p)
66     +static unsigned int static_prio_timeslice(int static_prio)
67     {
68     - if (p->static_prio < NICE_TO_PRIO(0))
69     - return SCALE_PRIO(DEF_TIMESLICE*4, p->static_prio);
70     + if (static_prio < NICE_TO_PRIO(0))
71     + return SCALE_PRIO(DEF_TIMESLICE * 4, static_prio);
72     else
73     - return SCALE_PRIO(DEF_TIMESLICE, p->static_prio);
74     + return SCALE_PRIO(DEF_TIMESLICE, static_prio);
75     }
76     +
77     +static inline unsigned int task_timeslice(task_t *p)
78     +{
79     + return static_prio_timeslice(p->static_prio);
80     +}
81     +
82     #define task_hot(p, now, sd) ((long long) ((now) - (p)->last_ran) \
83     < (long long) (sd)->cache_hot_time)
84    
85     @@ -209,6 +215,7 @@ struct runqueue {
86     * remote CPUs use both these fields when doing load calculation.
87     */
88     unsigned long nr_running;
89     + unsigned long raw_weighted_load;
90     #ifdef CONFIG_SMP
91     unsigned long cpu_load[3];
92     #endif
93     @@ -665,6 +672,68 @@ static int effective_prio(task_t *p)
94     }
95    
96     /*
97     + * To aid in avoiding the subversion of "niceness" due to uneven distribution
98     + * of tasks with abnormal "nice" values across CPUs, the contribution that
99     + * each task makes to its run queue's load is weighted according to its
100     + * scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a
101     + * scaled version of the new time slice allocation that they receive on time
102     + * slice expiry etc.
103     + */
104     +
105     +/*
106     + * Assume: static_prio_timeslice(NICE_TO_PRIO(0)) == DEF_TIMESLICE
107     + * If static_prio_timeslice() is ever changed to break this assumption then
108     + * this code will need modification
109     + */
110     +#define TIME_SLICE_NICE_ZERO DEF_TIMESLICE
111     +#define LOAD_WEIGHT(lp) \
112     + (((lp) * SCHED_LOAD_SCALE) / TIME_SLICE_NICE_ZERO)
113     +#define PRIO_TO_LOAD_WEIGHT(prio) \
114     + LOAD_WEIGHT(static_prio_timeslice(prio))
115     +#define RTPRIO_TO_LOAD_WEIGHT(rp) \
116     + (PRIO_TO_LOAD_WEIGHT(MAX_RT_PRIO) + LOAD_WEIGHT(rp))
117     +
118     +static void set_load_weight(task_t *p)
119     +{
120     + if (rt_task(p)) {
121     +#ifdef CONFIG_SMP
122     + if (p == task_rq(p)->migration_thread)
123     + /*
124     + * The migration thread does the actual balancing.
125     + * Giving its load any weight will skew balancing
126     + * adversely.
127     + */
128     + p->load_weight = 0;
129     + else
130     +#endif
131     + p->load_weight = RTPRIO_TO_LOAD_WEIGHT(p->rt_priority);
132     + } else
133     + p->load_weight = PRIO_TO_LOAD_WEIGHT(p->static_prio);
134     +}
135     +
136     +static inline void inc_raw_weighted_load(runqueue_t *rq, const task_t *p)
137     +{
138     + rq->raw_weighted_load += p->load_weight;
139     +}
140     +
141     +static inline void dec_raw_weighted_load(runqueue_t *rq, const task_t *p)
142     +{
143     + rq->raw_weighted_load -= p->load_weight;
144     +}
145     +
146     +static inline void inc_nr_running(task_t *p, runqueue_t *rq)
147     +{
148     + rq->nr_running++;
149     + inc_raw_weighted_load(rq, p);
150     +}
151     +
152     +static inline void dec_nr_running(task_t *p, runqueue_t *rq)
153     +{
154     + rq->nr_running--;
155     + dec_raw_weighted_load(rq, p);
156     +}
157     +
158     +/*
159     * __activate_task - move a task to the runqueue.
160     */
161     static void __activate_task(task_t *p, runqueue_t *rq)
162     @@ -674,7 +743,7 @@ static void __activate_task(task_t *p, r
163     if (batch_task(p))
164     target = rq->expired;
165     enqueue_task(p, target);
166     - rq->nr_running++;
167     + inc_nr_running(p, rq);
168     }
169    
170     /*
171     @@ -683,7 +752,7 @@ static void __activate_task(task_t *p, r
172     static inline void __activate_idle_task(task_t *p, runqueue_t *rq)
173     {
174     enqueue_task_head(p, rq->active);
175     - rq->nr_running++;
176     + inc_nr_running(p, rq);
177     }
178    
179     static int recalc_task_prio(task_t *p, unsigned long long now)
180     @@ -805,7 +874,7 @@ static void activate_task(task_t *p, run
181     */
182     static void deactivate_task(struct task_struct *p, runqueue_t *rq)
183     {
184     - rq->nr_running--;
185     + dec_nr_running(p, rq);
186     dequeue_task(p, p->array);
187     p->array = NULL;
188     }
189     @@ -855,6 +924,12 @@ inline int task_curr(const task_t *p)
190     return cpu_curr(task_cpu(p)) == p;
191     }
192    
193     +/* Used instead of source_load when we know the type == 0 */
194     +unsigned long weighted_cpuload(const int cpu)
195     +{
196     + return cpu_rq(cpu)->raw_weighted_load;
197     +}
198     +
199     #ifdef CONFIG_SMP
200     typedef struct {
201     struct list_head list;
202     @@ -944,7 +1019,8 @@ void kick_process(task_t *p)
203     }
204    
205     /*
206     - * Return a low guess at the load of a migration-source cpu.
207     + * Return a low guess at the load of a migration-source cpu weighted
208     + * according to the scheduling class and "nice" value.
209     *
210     * We want to under-estimate the load of migration sources, to
211     * balance conservatively.
212     @@ -952,24 +1028,36 @@ void kick_process(task_t *p)
213     static inline unsigned long source_load(int cpu, int type)
214     {
215     runqueue_t *rq = cpu_rq(cpu);
216     - unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE;
217     +
218     if (type == 0)
219     - return load_now;
220     + return rq->raw_weighted_load;
221    
222     - return min(rq->cpu_load[type-1], load_now);
223     + return min(rq->cpu_load[type-1], rq->raw_weighted_load);
224     }
225    
226     /*
227     - * Return a high guess at the load of a migration-target cpu
228     + * Return a high guess at the load of a migration-target cpu weighted
229     + * according to the scheduling class and "nice" value.
230     */
231     static inline unsigned long target_load(int cpu, int type)
232     {
233     runqueue_t *rq = cpu_rq(cpu);
234     - unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE;
235     +
236     if (type == 0)
237     - return load_now;
238     + return rq->raw_weighted_load;
239     +
240     + return max(rq->cpu_load[type-1], rq->raw_weighted_load);
241     +}
242     +
243     +/*
244     + * Return the average load per task on the cpu's run queue
245     + */
246     +static inline unsigned long cpu_avg_load_per_task(int cpu)
247     +{
248     + runqueue_t *rq = cpu_rq(cpu);
249     + unsigned long n = rq->nr_running;
250    
251     - return max(rq->cpu_load[type-1], load_now);
252     + return n ? rq->raw_weighted_load / n : SCHED_LOAD_SCALE;
253     }
254    
255     /*
256     @@ -1042,7 +1130,7 @@ find_idlest_cpu(struct sched_group *grou
257     cpus_and(tmp, group->cpumask, p->cpus_allowed);
258    
259     for_each_cpu_mask(i, tmp) {
260     - load = source_load(i, 0);
261     + load = weighted_cpuload(i);
262    
263     if (load < min_load || (load == min_load && i == this_cpu)) {
264     min_load = load;
265     @@ -1221,17 +1309,19 @@ static int try_to_wake_up(task_t *p, uns
266    
267     if (this_sd->flags & SD_WAKE_AFFINE) {
268     unsigned long tl = this_load;
269     + unsigned long tl_per_task = cpu_avg_load_per_task(this_cpu);
270     +
271     /*
272     * If sync wakeup then subtract the (maximum possible)
273     * effect of the currently running task from the load
274     * of the current CPU:
275     */
276     if (sync)
277     - tl -= SCHED_LOAD_SCALE;
278     + tl -= current->load_weight;
279    
280     if ((tl <= load &&
281     - tl + target_load(cpu, idx) <= SCHED_LOAD_SCALE) ||
282     - 100*(tl + SCHED_LOAD_SCALE) <= imbalance*load) {
283     + tl + target_load(cpu, idx) <= tl_per_task) ||
284     + 100*(tl + p->load_weight) <= imbalance*load) {
285     /*
286     * This domain has SD_WAKE_AFFINE and
287     * p is cache cold in this domain, and
288     @@ -1430,7 +1520,7 @@ void fastcall wake_up_new_task(task_t *p
289     list_add_tail(&p->run_list, &current->run_list);
290     p->array = current->array;
291     p->array->nr_active++;
292     - rq->nr_running++;
293     + inc_nr_running(p, rq);
294     }
295     set_need_resched();
296     } else
297     @@ -1799,9 +1889,9 @@ void pull_task(runqueue_t *src_rq, prio_
298     runqueue_t *this_rq, prio_array_t *this_array, int this_cpu)
299     {
300     dequeue_task(p, src_array);
301     - src_rq->nr_running--;
302     + dec_nr_running(p, src_rq);
303     set_task_cpu(p, this_cpu);
304     - this_rq->nr_running++;
305     + inc_nr_running(p, this_rq);
306     enqueue_task(p, this_array);
307     p->timestamp = (p->timestamp - src_rq->timestamp_last_tick)
308     + this_rq->timestamp_last_tick;
309     @@ -1848,26 +1938,42 @@ int can_migrate_task(task_t *p, runqueue
310     return 1;
311     }
312    
313     +#define rq_best_prio(rq) min((rq)->curr->prio, (rq)->best_expired_prio)
314     /*
315     - * move_tasks tries to move up to max_nr_move tasks from busiest to this_rq,
316     - * as part of a balancing operation within "domain". Returns the number of
317     - * tasks moved.
318     + * move_tasks tries to move up to max_nr_move tasks and max_load_move weighted
319     + * load from busiest to this_rq, as part of a balancing operation within
320     + * "domain". Returns the number of tasks moved.
321     *
322     * Called with both runqueues locked.
323     */
324     static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest,
325     - unsigned long max_nr_move, struct sched_domain *sd,
326     - enum idle_type idle, int *all_pinned)
327     + unsigned long max_nr_move, unsigned long max_load_move,
328     + struct sched_domain *sd, enum idle_type idle,
329     + int *all_pinned)
330     {
331     prio_array_t *array, *dst_array;
332     struct list_head *head, *curr;
333     - int idx, pulled = 0, pinned = 0;
334     + int idx, pulled = 0, pinned = 0, this_best_prio, busiest_best_prio;
335     + int busiest_best_prio_seen;
336     + int skip_for_load; /* skip the task based on weighted load issues */
337     + long rem_load_move;
338     task_t *tmp;
339    
340     - if (max_nr_move == 0)
341     + if (max_nr_move == 0 || max_load_move == 0)
342     goto out;
343    
344     + rem_load_move = max_load_move;
345     pinned = 1;
346     + this_best_prio = rq_best_prio(this_rq);
347     + busiest_best_prio = rq_best_prio(busiest);
348     + /*
349     + * Enable handling of the case where there is more than one task
350     + * with the best priority. If the current running task is one
351     + * of those with prio==busiest_best_prio we know it won't be moved
352     + * and therefore it's safe to override the skip (based on load) of
353     + * any task we find with that prio.
354     + */
355     + busiest_best_prio_seen = busiest_best_prio == busiest->curr->prio;
356    
357     /*
358     * We first consider expired tasks. Those will likely not be
359     @@ -1907,7 +2013,17 @@ skip_queue:
360    
361     curr = curr->prev;
362    
363     - if (!can_migrate_task(tmp, busiest, this_cpu, sd, idle, &pinned)) {
364     + /*
365     + * To help distribute high priority tasks across CPUs we don't
366     + * skip a task if it will be the highest priority task (i.e. smallest
367     + * prio value) on its new queue regardless of its load weight
368     + */
369     + skip_for_load = tmp->load_weight > rem_load_move;
370     + if (skip_for_load && idx < this_best_prio)
371     + skip_for_load = !busiest_best_prio_seen && idx == busiest_best_prio;
372     + if (skip_for_load ||
373     + !can_migrate_task(tmp, busiest, this_cpu, sd, idle, &pinned)) {
374     + busiest_best_prio_seen |= idx == busiest_best_prio;
375     if (curr != head)
376     goto skip_queue;
377     idx++;
378     @@ -1921,9 +2037,15 @@ skip_queue:
379    
380     pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu);
381     pulled++;
382     + rem_load_move -= tmp->load_weight;
383    
384     - /* We only want to steal up to the prescribed number of tasks. */
385     - if (pulled < max_nr_move) {
386     + /*
387     + * We only want to steal up to the prescribed number of tasks
388     + * and the prescribed amount of weighted load.
389     + */
390     + if (pulled < max_nr_move && rem_load_move > 0) {
391     + if (idx < this_best_prio)
392     + this_best_prio = idx;
393     if (curr != head)
394     goto skip_queue;
395     idx++;
396     @@ -1944,7 +2066,7 @@ out:
397    
398     /*
399     * find_busiest_group finds and returns the busiest CPU group within the
400     - * domain. It calculates and returns the number of tasks which should be
401     + * domain. It calculates and returns the amount of weighted load which should be
402     * moved to restore balance via the imbalance parameter.
403     */
404     static struct sched_group *
405     @@ -1954,9 +2076,13 @@ find_busiest_group(struct sched_domain *
406     struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
407     unsigned long max_load, avg_load, total_load, this_load, total_pwr;
408     unsigned long max_pull;
409     + unsigned long busiest_load_per_task, busiest_nr_running;
410     + unsigned long this_load_per_task, this_nr_running;
411     int load_idx;
412    
413     max_load = this_load = total_load = total_pwr = 0;
414     + busiest_load_per_task = busiest_nr_running = 0;
415     + this_load_per_task = this_nr_running = 0;
416     if (idle == NOT_IDLE)
417     load_idx = sd->busy_idx;
418     else if (idle == NEWLY_IDLE)
419     @@ -1968,13 +2094,17 @@ find_busiest_group(struct sched_domain *
420     unsigned long load;
421     int local_group;
422     int i;
423     + unsigned long sum_nr_running, sum_weighted_load;
424     + unsigned int nr_loaded_cpus = 0; /* where nr_running > 1 */
425    
426     local_group = cpu_isset(this_cpu, group->cpumask);
427    
428     /* Tally up the load of all CPUs in the group */
429     - avg_load = 0;
430     + sum_weighted_load = sum_nr_running = avg_load = 0;
431    
432     for_each_cpu_mask(i, group->cpumask) {
433     + runqueue_t *rq = cpu_rq(i);
434     +
435     if (*sd_idle && !idle_cpu(i))
436     *sd_idle = 0;
437    
438     @@ -1985,6 +2115,10 @@ find_busiest_group(struct sched_domain *
439     load = source_load(i, load_idx);
440    
441     avg_load += load;
442     + sum_nr_running += rq->nr_running;
443     + if (rq->nr_running > 1)
444     + ++nr_loaded_cpus;
445     + sum_weighted_load += rq->raw_weighted_load;
446     }
447    
448     total_load += avg_load;
449     @@ -1996,14 +2130,19 @@ find_busiest_group(struct sched_domain *
450     if (local_group) {
451     this_load = avg_load;
452     this = group;
453     - } else if (avg_load > max_load) {
454     + this_nr_running = sum_nr_running;
455     + this_load_per_task = sum_weighted_load;
456     + } else if (avg_load > max_load &&
457     + sum_nr_running > group->cpu_power / SCHED_LOAD_SCALE) {
458     max_load = avg_load;
459     busiest = group;
460     + busiest_nr_running = sum_nr_running;
461     + busiest_load_per_task = sum_weighted_load;
462     }
463     group = group->next;
464     } while (group != sd->groups);
465    
466     - if (!busiest || this_load >= max_load || max_load <= SCHED_LOAD_SCALE)
467     + if (!busiest || this_load >= max_load || busiest_nr_running == 0)
468     goto out_balanced;
469    
470     avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr;
471     @@ -2012,6 +2151,7 @@ find_busiest_group(struct sched_domain *
472     100*max_load <= sd->imbalance_pct*this_load)
473     goto out_balanced;
474    
475     + busiest_load_per_task /= busiest_nr_running;
476     /*
477     * We're trying to get all the cpus to the average_load, so we don't
478     * want to push ourselves above the average load, nor do we wish to
479     @@ -2023,21 +2163,50 @@ find_busiest_group(struct sched_domain *
480     * by pulling tasks to us. Be careful of negative numbers as they'll
481     * appear as very large values with unsigned longs.
482     */
483     + if (max_load <= busiest_load_per_task)
484     + goto out_balanced;
485     +
486     + /*
487     + * In the presence of smp nice balancing, certain scenarios can have
488     + * max load less than avg load (as we skip the groups at or below
489     + * their cpu_power while calculating max_load)
490     + */
491     + if (max_load < avg_load) {
492     + *imbalance = 0;
493     + goto small_imbalance;
494     + }
495    
496     /* Don't want to pull so many tasks that a group would go idle */
497     - max_pull = min(max_load - avg_load, max_load - SCHED_LOAD_SCALE);
498     + max_pull = min(max_load - avg_load, max_load - busiest_load_per_task);
499    
500     /* How much load to actually move to equalise the imbalance */
501     *imbalance = min(max_pull * busiest->cpu_power,
502     (avg_load - this_load) * this->cpu_power)
503     / SCHED_LOAD_SCALE;
504    
505     - if (*imbalance < SCHED_LOAD_SCALE) {
506     - unsigned long pwr_now = 0, pwr_move = 0;
507     + /*
508     + * if *imbalance is less than the average load per runnable task
509     + * there is no guarantee that any tasks will be moved, so we'll have
510     + * a think about bumping its value to force at least one task to be
511     + * moved
512     + */
513     + if (*imbalance < busiest_load_per_task) {
514     + unsigned long pwr_now, pwr_move;
515     unsigned long tmp;
516     + unsigned int imbn;
517    
518     - if (max_load - this_load >= SCHED_LOAD_SCALE*2) {
519     - *imbalance = 1;
520     +small_imbalance:
521     + pwr_move = pwr_now = 0;
522     + imbn = 2;
523     + if (this_nr_running) {
524     + this_load_per_task /= this_nr_running;
525     + if (busiest_load_per_task > this_load_per_task)
526     + imbn = 1;
527     + } else
528     + this_load_per_task = SCHED_LOAD_SCALE;
529     +
530     + if (max_load - this_load >= busiest_load_per_task * imbn) {
531     + *imbalance = busiest_load_per_task;
532     return busiest;
533     }
534    
535     @@ -2047,35 +2216,34 @@ find_busiest_group(struct sched_domain *
536     * moving them.
537     */
538    
539     - pwr_now += busiest->cpu_power*min(SCHED_LOAD_SCALE, max_load);
540     - pwr_now += this->cpu_power*min(SCHED_LOAD_SCALE, this_load);
541     + pwr_now += busiest->cpu_power *
542     + min(busiest_load_per_task, max_load);
543     + pwr_now += this->cpu_power *
544     + min(this_load_per_task, this_load);
545     pwr_now /= SCHED_LOAD_SCALE;
546    
547     /* Amount of load we'd subtract */
548     - tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/busiest->cpu_power;
549     + tmp = busiest_load_per_task*SCHED_LOAD_SCALE/busiest->cpu_power;
550     if (max_load > tmp)
551     - pwr_move += busiest->cpu_power*min(SCHED_LOAD_SCALE,
552     - max_load - tmp);
553     + pwr_move += busiest->cpu_power *
554     + min(busiest_load_per_task, max_load - tmp);
555    
556     /* Amount of load we'd add */
557     if (max_load*busiest->cpu_power <
558     - SCHED_LOAD_SCALE*SCHED_LOAD_SCALE)
559     + busiest_load_per_task*SCHED_LOAD_SCALE)
560     tmp = max_load*busiest->cpu_power/this->cpu_power;
561     else
562     - tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/this->cpu_power;
563     - pwr_move += this->cpu_power*min(SCHED_LOAD_SCALE, this_load + tmp);
564     + tmp = busiest_load_per_task*SCHED_LOAD_SCALE/this->cpu_power;
565     + pwr_move += this->cpu_power*min(this_load_per_task, this_load + tmp);
566     pwr_move /= SCHED_LOAD_SCALE;
567    
568     /* Move if we gain throughput */
569     if (pwr_move <= pwr_now)
570     goto out_balanced;
571    
572     - *imbalance = 1;
573     - return busiest;
574     + *imbalance = busiest_load_per_task;
575     }
576    
577     - /* Get rid of the scaling factor, rounding down as we divide */
578     - *imbalance = *imbalance / SCHED_LOAD_SCALE;
579     return busiest;
580    
581     out_balanced:
582     @@ -2088,18 +2256,21 @@ out_balanced:
583     * find_busiest_queue - find the busiest runqueue among the cpus in group.
584     */
585     static runqueue_t *find_busiest_queue(struct sched_group *group,
586     - enum idle_type idle)
587     + enum idle_type idle, unsigned long imbalance)
588     {
589     - unsigned long load, max_load = 0;
590     - runqueue_t *busiest = NULL;
591     + unsigned long max_load = 0;
592     + runqueue_t *busiest = NULL, *rqi;
593     int i;
594    
595     for_each_cpu_mask(i, group->cpumask) {
596     - load = source_load(i, 0);
597     + rqi = cpu_rq(i);
598     +
599     + if (rqi->nr_running == 1 && rqi->raw_weighted_load > imbalance)
600     + continue;
601    
602     - if (load > max_load) {
603     - max_load = load;
604     - busiest = cpu_rq(i);
605     + if (rqi->raw_weighted_load > max_load) {
606     + max_load = rqi->raw_weighted_load;
607     + busiest = rqi;
608     }
609     }
610    
611     @@ -2112,6 +2283,7 @@ static runqueue_t *find_busiest_queue(st
612     */
613     #define MAX_PINNED_INTERVAL 512
614    
615     +#define minus_1_or_zero(n) ((n) > 0 ? (n) - 1 : 0)
616     /*
617     * Check this_cpu to ensure it is balanced within domain. Attempt to move
618     * tasks if there is an imbalance.
619     @@ -2139,7 +2311,7 @@ static int load_balance(int this_cpu, ru
620     goto out_balanced;
621     }
622    
623     - busiest = find_busiest_queue(group, idle);
624     + busiest = find_busiest_queue(group, idle, imbalance);
625     if (!busiest) {
626     schedstat_inc(sd, lb_nobusyq[idle]);
627     goto out_balanced;
628     @@ -2159,6 +2331,7 @@ static int load_balance(int this_cpu, ru
629     */
630     double_rq_lock(this_rq, busiest);
631     nr_moved = move_tasks(this_rq, this_cpu, busiest,
632     + minus_1_or_zero(busiest->nr_running),
633     imbalance, sd, idle, &all_pinned);
634     double_rq_unlock(this_rq, busiest);
635    
636     @@ -2262,7 +2435,7 @@ static int load_balance_newidle(int this
637     goto out_balanced;
638     }
639    
640     - busiest = find_busiest_queue(group, NEWLY_IDLE);
641     + busiest = find_busiest_queue(group, NEWLY_IDLE, imbalance);
642     if (!busiest) {
643     schedstat_inc(sd, lb_nobusyq[NEWLY_IDLE]);
644     goto out_balanced;
645     @@ -2277,6 +2450,7 @@ static int load_balance_newidle(int this
646     /* Attempt to move tasks */
647     double_lock_balance(this_rq, busiest);
648     nr_moved = move_tasks(this_rq, this_cpu, busiest,
649     + minus_1_or_zero(busiest->nr_running),
650     imbalance, sd, NEWLY_IDLE, NULL);
651     spin_unlock(&busiest->lock);
652     }
653     @@ -2357,7 +2531,8 @@ static void active_load_balance(runqueue
654    
655     schedstat_inc(sd, alb_cnt);
656    
657     - if (move_tasks(target_rq, target_cpu, busiest_rq, 1, sd, SCHED_IDLE, NULL))
658     + if (move_tasks(target_rq, target_cpu, busiest_rq, 1,
659     + RTPRIO_TO_LOAD_WEIGHT(100), sd, SCHED_IDLE, NULL))
660     schedstat_inc(sd, alb_pushed);
661     else
662     schedstat_inc(sd, alb_failed);
663     @@ -2385,7 +2560,7 @@ static void rebalance_tick(int this_cpu,
664     struct sched_domain *sd;
665     int i;
666    
667     - this_load = this_rq->nr_running * SCHED_LOAD_SCALE;
668     + this_load = this_rq->raw_weighted_load;
669     /* Update our load */
670     for (i = 0; i < 3; i++) {
671     unsigned long new_load = this_load;
672     @@ -3498,17 +3673,21 @@ void set_user_nice(task_t *p, long nice)
673     goto out_unlock;
674     }
675     array = p->array;
676     - if (array)
677     + if (array) {
678     dequeue_task(p, array);
679     + dec_raw_weighted_load(rq, p);
680     + }
681    
682     old_prio = p->prio;
683     new_prio = NICE_TO_PRIO(nice);
684     delta = new_prio - old_prio;
685     p->static_prio = NICE_TO_PRIO(nice);
686     + set_load_weight(p);
687     p->prio += delta;
688    
689     if (array) {
690     enqueue_task(p, array);
691     + inc_raw_weighted_load(rq, p);
692     /*
693     * If the task increased its priority or is running and
694     * lowered its priority, then reschedule its CPU:
695     @@ -3644,6 +3823,7 @@ static void __setscheduler(struct task_s
696     if (policy == SCHED_BATCH)
697     p->sleep_avg = 0;
698     }
699     + set_load_weight(p);
700     }
701    
702     /**
703     @@ -6141,6 +6321,7 @@ void __init sched_init(void)
704     }
705     }
706    
707     + set_load_weight(&init_task);
708     /*
709     * The boot idle thread does lazy MMU switching as well:
710     */
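
For reference, the following standalone C sketch (not part of the patch) reproduces the nice -> load_weight mapping that the LOAD_WEIGHT()/PRIO_TO_LOAD_WEIGHT() macros above define. The scheduler constants are restated here as assumptions matching the 2.6.17 defaults with HZ=1000 (DEF_TIMESLICE of 100 jiffies, MIN_TIMESLICE of 5, SCHED_LOAD_SCALE of 128); the program simply prints the weight each nice level would contribute to a run queue's raw_weighted_load.

/*
 * Standalone sketch of the nice -> load_weight mapping used by this patch.
 * The constants below are assumptions mirroring the 2.6.17 defaults
 * (HZ = 1000); they are restated for illustration only.
 */
#include <stdio.h>

#define HZ                 1000
#define MAX_RT_PRIO        100
#define MAX_PRIO           (MAX_RT_PRIO + 40)
#define MAX_USER_PRIO      40
#define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20)
#define MIN_TIMESLICE      (5 * HZ / 1000)     /* jiffies */
#define DEF_TIMESLICE      (100 * HZ / 1000)   /* jiffies */
#define SCHED_LOAD_SCALE   128UL

/* SCALE_PRIO() from kernel/sched.c, written as a function for clarity. */
static unsigned long scale_prio(unsigned long x, int prio)
{
	unsigned long slice = x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2);

	return slice > MIN_TIMESLICE ? slice : MIN_TIMESLICE;
}

static unsigned long static_prio_timeslice(int static_prio)
{
	if (static_prio < NICE_TO_PRIO(0))
		return scale_prio(DEF_TIMESLICE * 4, static_prio);
	else
		return scale_prio(DEF_TIMESLICE, static_prio);
}

/* Same shape as the patch's LOAD_WEIGHT()/PRIO_TO_LOAD_WEIGHT() macros. */
#define TIME_SLICE_NICE_ZERO      DEF_TIMESLICE
#define LOAD_WEIGHT(lp)           (((lp) * SCHED_LOAD_SCALE) / TIME_SLICE_NICE_ZERO)
#define PRIO_TO_LOAD_WEIGHT(prio) LOAD_WEIGHT(static_prio_timeslice(prio))

int main(void)
{
	static const int nices[] = { -20, -10, -5, 0, 5, 10, 19 };
	unsigned int i;

	/*
	 * A nice-0 task contributes exactly SCHED_LOAD_SCALE (128) to its
	 * run queue's raw_weighted_load; other nice levels scale with the
	 * time slice they would be allocated.
	 */
	for (i = 0; i < sizeof(nices) / sizeof(nices[0]); i++) {
		int prio = NICE_TO_PRIO(nices[i]);

		printf("nice %3d -> timeslice %3lu jiffies -> load_weight %4lu\n",
		       nices[i], static_prio_timeslice(prio),
		       PRIO_TO_LOAD_WEIGHT(prio));
	}
	return 0;
}

Under these assumed constants the weights run from 1024 for nice -20 down to 6 for nice 19, with 128 at nice 0; this is the spread the balancer now sees in raw_weighted_load instead of counting every runnable task as SCHED_LOAD_SCALE.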