Magellan Linux

Annotation of /alx-src/tags/kernel26-2.6.12-alx-r9/kernel/sched.c.orig

Parent Directory | Revision Log


Revision 630 - (hide annotations) (download)
Wed Mar 4 11:03:09 2009 UTC (15 years, 3 months ago) by niro
File size: 122911 byte(s)
Tag kernel26-2.6.12-alx-r9
1 niro 628 /*
2     * kernel/sched.c
3     *
4     * Kernel scheduler and related syscalls
5     *
6     * Copyright (C) 1991-2002 Linus Torvalds
7     *
8     * 1996-12-23 Modified by Dave Grothe to fix bugs in semaphores and
9     * make semaphores SMP safe
10     * 1998-11-19 Implemented schedule_timeout() and related stuff
11     * by Andrea Arcangeli
12     * 2002-01-04 New ultra-scalable O(1) scheduler by Ingo Molnar:
13     * hybrid priority-list and round-robin design with
14     * an array-switch method of distributing timeslices
15     * and per-CPU runqueues. Cleanups and useful suggestions
16     * by Davide Libenzi, preemptible kernel bits by Robert Love.
17     * 2003-09-03 Interactivity tuning by Con Kolivas.
18     * 2004-04-02 Scheduler domains code by Nick Piggin
19     * 2005-06-07 New staircase scheduling policy by Con Kolivas with help
20     * from William Lee Irwin III, Zwane Mwaikambo & Peter Williams.
21     * Staircase v11.3
22     */
23    
24     #include <linux/mm.h>
25     #include <linux/module.h>
26     #include <linux/nmi.h>
27     #include <linux/init.h>
28     #include <asm/uaccess.h>
29     #include <linux/highmem.h>
30     #include <linux/smp_lock.h>
31     #include <asm/mmu_context.h>
32     #include <linux/interrupt.h>
33     #include <linux/completion.h>
34     #include <linux/kernel_stat.h>
35     #include <linux/security.h>
36     #include <linux/notifier.h>
37     #include <linux/profile.h>
38     #include <linux/suspend.h>
39     #include <linux/blkdev.h>
40     #include <linux/delay.h>
41     #include <linux/smp.h>
42     #include <linux/threads.h>
43     #include <linux/timer.h>
44     #include <linux/rcupdate.h>
45     #include <linux/cpu.h>
46     #include <linux/cpuset.h>
47     #include <linux/percpu.h>
48     #include <linux/kthread.h>
49     #include <linux/seq_file.h>
50     #include <linux/syscalls.h>
51     #include <linux/times.h>
52     #include <linux/acct.h>
53     #include <asm/tlb.h>
54    
55     #include <asm/unistd.h>
56    
/*
 * Translate between user-visible nice levels [ -20 ... 0 ... 19 ]
 * and static priorities [ MAX_RT_PRIO..MAX_PRIO-1 ], in both directions.
 */
#define NICE_TO_PRIO(nice)	(MAX_RT_PRIO + (nice) + 20)
#define PRIO_TO_NICE(prio)	((prio) - MAX_RT_PRIO - 20)
/* Nice level of task @p, derived from its static priority. */
#define TASK_NICE(p)		PRIO_TO_NICE((p)->static_prio)
65    
/*
 * The 'user priority' is the static priority shifted down by
 * MAX_RT_PRIO, giving a [ 0 ... 39 ] range that is easier to use
 * when scaling scheduler parameters.
 */
#define USER_PRIO(p)		((p)-MAX_RT_PRIO)
/* User priority of the task @p. */
#define TASK_USER_PRIO(p)	USER_PRIO((p)->static_prio)
/* Number of distinct user priorities. */
#define MAX_USER_PRIO		(USER_PRIO(MAX_PRIO))
74    
/*
 * Helpers for converting nanosecond timing to jiffy resolution.
 */
#define NSJIFFY			(1000000000 / HZ)	/* One jiffy in ns */
#define NS_TO_JIFFIES(TIME)	((TIME) / NSJIFFY)
80    
/* Non-zero selects the dedicated computational ("compute") scheduling mode. */
int sched_compute = 0;

/*
 * _RR_INTERVAL is the time all tasks within the same priority round
 * robin: nominally 10ms worth of jiffies, and never less than one jiffy.
 * The compute setting is reserved for dedicated computational scheduling
 * and has ten times larger intervals.
 */
#define _RR_INTERVAL	((10 * HZ / 1000) ? : 1)
#define RR_INTERVAL()	(_RR_INTERVAL * (1 + 9 * sched_compute))
#define DEF_TIMESLICE	(RR_INTERVAL() * 19)
90    
/*
 * True when less than sd->cache_hot_time has elapsed between the task's
 * timestamp and @now (compared as signed 64-bit values).
 */
#define task_hot(p, now, sd) \
	((long long) ((now) - (p)->timestamp) < (long long) (sd)->cache_hot_time)
93    
94     /*
95     * These are the runqueue data structures:
96     */
97    
98     typedef struct runqueue runqueue_t;
99    
100     /*
101     * This is the main, per-CPU runqueue data structure.
102     *
103     * Locking rule: those places that want to lock multiple runqueues
104     * (such as the load balancing or the thread migration code), lock
105     * acquire operations must be ordered by ascending &runqueue.
106     */
107     struct runqueue {
108     spinlock_t lock;
109    
110     /*
111     * nr_running and cpu_load should be in the same cacheline because
112     * remote CPUs use both these fields when doing load calculation.
113     */
114     unsigned long nr_running;
115     #ifdef CONFIG_SMP
116     unsigned long prio_bias;
117     unsigned long cpu_load;
118     #endif
119     unsigned long long nr_switches;
120    
121     /*
122     * This is part of a global counter where only the total sum
123     * over all CPUs matters. A task can increase this counter on
124     * one CPU and if it got migrated afterwards it may decrease
125     * it on another CPU. Always updated under the runqueue lock:
126     */
127     unsigned long nr_uninterruptible;
128    
129     unsigned long long timestamp_last_tick;
130     unsigned int cache_ticks, preempted;
131     task_t *curr, *idle;
132     struct mm_struct *prev_mm;
133     unsigned long bitmap[BITS_TO_LONGS(MAX_PRIO + 1)];
134     struct list_head queue[MAX_PRIO];
135     atomic_t nr_iowait;
136    
137     #ifdef CONFIG_SMP
138     struct sched_domain *sd;
139    
140     /* For active balancing */
141     int active_balance;
142     int push_cpu;
143    
144     task_t *migration_thread;
145     struct list_head migration_queue;
146     #endif
147    
148     #ifdef CONFIG_SCHEDSTATS
149     /* latency stats */
150     struct sched_info rq_sched_info;
151    
152     /* sys_sched_yield() stats */
153     unsigned long yld_exp_empty;
154     unsigned long yld_act_empty;
155     unsigned long yld_both_empty;
156     unsigned long yld_cnt;
157    
158     /* schedule() stats */
159     unsigned long sched_switch;
160     unsigned long sched_cnt;
161     unsigned long sched_goidle;
162    
163     /* try_to_wake_up() stats */
164     unsigned long ttwu_cnt;
165     unsigned long ttwu_local;
166     #endif
167     };
168    
169     static DEFINE_PER_CPU(struct runqueue, runqueues);
170    
171     #define for_each_domain(cpu, domain) \
172     for (domain = cpu_rq(cpu)->sd; domain; domain = domain->parent)
173    
174     #define cpu_rq(cpu) (&per_cpu(runqueues, (cpu)))
175     #define this_rq() (&__get_cpu_var(runqueues))
176     #define task_rq(p) cpu_rq(task_cpu(p))
177     #define cpu_curr(cpu) (cpu_rq(cpu)->curr)
178    
/*
 * Default context-switch locking, used when the architecture does not
 * provide its own prepare_arch_switch(): nothing to prepare, the
 * runqueue lock is dropped (re-enabling interrupts) when the switch
 * finishes, and a task counts as running while it is rq->curr.
 */
#ifndef prepare_arch_switch
# define prepare_arch_switch(rq, next)	do { } while (0)
# define finish_arch_switch(rq, next)	spin_unlock_irq(&(rq)->lock)
# define task_running(rq, p)		((rq)->curr == (p))
#endif
187    
188     /*
189     * task_rq_lock - lock the runqueue a given task resides on and disable
190     * interrupts. Note the ordering: we can safely lookup the task_rq without
191     * explicitly disabling preemption.
192     */
193     static inline runqueue_t *task_rq_lock(task_t *p, unsigned long *flags)
194     __acquires(rq->lock)
195     {
196     struct runqueue *rq;
197    
198     repeat_lock_task:
199     local_irq_save(*flags);
200     rq = task_rq(p);
201     spin_lock(&rq->lock);
202     if (unlikely(rq != task_rq(p))) {
203     spin_unlock_irqrestore(&rq->lock, *flags);
204     goto repeat_lock_task;
205     }
206     return rq;
207     }
208    
209     static inline void task_rq_unlock(runqueue_t *rq, unsigned long *flags)
210     __releases(rq->lock)
211     {
212     spin_unlock_irqrestore(&rq->lock, *flags);
213     }
214    
215     #ifdef CONFIG_SCHEDSTATS
/*
 * Format version reported by show_schedstat(). Bump it whenever the
 * output format or the meaning of an existing field changes, so that
 * tools can adapt (or abort).
 */
#define SCHEDSTAT_VERSION	11
221    
222     static int show_schedstat(struct seq_file *seq, void *v)
223     {
224     int cpu;
225    
226     seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION);
227     seq_printf(seq, "timestamp %lu\n", jiffies);
228     for_each_online_cpu(cpu) {
229     runqueue_t *rq = cpu_rq(cpu);
230     #ifdef CONFIG_SMP
231     struct sched_domain *sd;
232     int dcnt = 0;
233     #endif
234    
235     /* runqueue-specific stats */
236     seq_printf(seq,
237     "cpu%d %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu",
238     cpu, rq->yld_both_empty,
239     rq->yld_act_empty, rq->yld_exp_empty, rq->yld_cnt,
240     rq->sched_switch, rq->sched_cnt, rq->sched_goidle,
241     rq->ttwu_cnt, rq->ttwu_local,
242     rq->rq_sched_info.cpu_time,
243     rq->rq_sched_info.run_delay, rq->rq_sched_info.pcnt);
244    
245     seq_printf(seq, "\n");
246    
247     #ifdef CONFIG_SMP
248     /* domain-specific stats */
249     for_each_domain(cpu, sd) {
250     enum idle_type itype;
251     char mask_str[NR_CPUS];
252    
253     cpumask_scnprintf(mask_str, NR_CPUS, sd->span);
254     seq_printf(seq, "domain%d %s", dcnt++, mask_str);
255     for (itype = SCHED_IDLE; itype < MAX_IDLE_TYPES;
256     itype++) {
257     seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu",
258     sd->lb_cnt[itype],
259     sd->lb_balanced[itype],
260     sd->lb_failed[itype],
261     sd->lb_imbalance[itype],
262     sd->lb_gained[itype],
263     sd->lb_hot_gained[itype],
264     sd->lb_nobusyq[itype],
265     sd->lb_nobusyg[itype]);
266     }
267     seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu\n",
268     sd->alb_cnt, sd->alb_failed, sd->alb_pushed,
269     sd->sbe_pushed, sd->sbe_attempts,
270     sd->ttwu_wake_remote, sd->ttwu_move_affine, sd->ttwu_move_balance);
271     }
272     #endif
273     }
274     return 0;
275     }
276    
277     static int schedstat_open(struct inode *inode, struct file *file)
278     {
279     unsigned int size = PAGE_SIZE * (1 + num_online_cpus() / 32);
280     char *buf = kmalloc(size, GFP_KERNEL);
281     struct seq_file *m;
282     int res;
283    
284     if (!buf)
285     return -ENOMEM;
286     res = single_open(file, show_schedstat, NULL);
287     if (!res) {
288     m = file->private_data;
289     m->buf = buf;
290     m->size = size;
291     } else
292     kfree(buf);
293     return res;
294     }
295    
296     struct file_operations proc_schedstat_operations = {
297     .open = schedstat_open,
298     .read = seq_read,
299     .llseek = seq_lseek,
300     .release = single_release,
301     };
302    
/* Statistics enabled: bump or add to a counter field of @rq. */
# define schedstat_inc(rq, field)	do { (rq)->field++; } while (0)
# define schedstat_add(rq, field, amt)	do { (rq)->field += (amt); } while (0)
305     #else /* !CONFIG_SCHEDSTATS */
/* Statistics disabled: the counter updates compile away to nothing. */
# define schedstat_inc(rq, field)	do { } while (0)
# define schedstat_add(rq, field, amt)	do { } while (0)
308     #endif
309    
310     /*
311     * rq_lock - lock a given runqueue and disable interrupts.
312     */
313     static inline runqueue_t *this_rq_lock(void)
314     __acquires(rq->lock)
315     {
316     runqueue_t *rq;
317    
318     local_irq_disable();
319     rq = this_rq();
320     spin_lock(&rq->lock);
321    
322     return rq;
323     }
324    
#ifdef CONFIG_SCHED_SMT
/*
 * Returns 1 when every cpu in cpu_sibling_map[cpu] is idle, 0 as soon as
 * one of them is found busy.
 */
static int cpu_and_siblings_are_idle(int cpu)
{
	int sib;

	for_each_cpu_mask(sib, cpu_sibling_map[cpu])
		if (!idle_cpu(sib))
			return 0;

	return 1;
}
#else
/* Without SMT there are no siblings to check: only the cpu itself matters. */
#define cpu_and_siblings_are_idle(A) idle_cpu(A)
#endif
340    
341     #ifdef CONFIG_SCHEDSTATS
342     /*
343     * Called when a process is dequeued from the active array and given
344     * the cpu. We should note that with the exception of interactive
345     * tasks, the expired queue will become the active queue after the active
346     * queue is empty, without explicitly dequeuing and requeuing tasks in the
347     * expired queue. (Interactive tasks may be requeued directly to the
348     * active queue, thus delaying tasks in the expired queue from running;
349     * see scheduler_tick()).
350     *
351     * This function is only called from sched_info_arrive(), rather than
352     * dequeue_task(). Even though a task may be queued and dequeued multiple
353     * times as it is shuffled about, we're really interested in knowing how
354     * long it was from the *first* time it was queued to the time that it
355     * finally hit a cpu.
356     */
357     static inline void sched_info_dequeued(task_t *t)
358     {
359     t->sched_info.last_queued = 0;
360     }
361    
362     /*
363     * Called when a task finally hits the cpu. We can now calculate how
364     * long it was waiting to run. We also note when it began so that we
365     * can keep stats on how long its timeslice is.
366     */
367     static inline void sched_info_arrive(task_t *t)
368     {
369     unsigned long now = jiffies, diff = 0;
370     struct runqueue *rq = task_rq(t);
371    
372     if (t->sched_info.last_queued)
373     diff = now - t->sched_info.last_queued;
374     sched_info_dequeued(t);
375     t->sched_info.run_delay += diff;
376     t->sched_info.last_arrival = now;
377     t->sched_info.pcnt++;
378    
379     if (!rq)
380     return;
381    
382     rq->rq_sched_info.run_delay += diff;
383     rq->rq_sched_info.pcnt++;
384     }
385    
386     /*
387     * Called when a process is queued into either the active or expired
388     * array. The time is noted and later used to determine how long we
389     * had to wait for us to reach the cpu. Since the expired queue will
390     * become the active queue after active queue is empty, without dequeuing
391     * and requeuing any tasks, we are interested in queuing to either. It
392     * is unusual but not impossible for tasks to be dequeued and immediately
393     * requeued in the same or another array: this can happen in sched_yield(),
394     * set_user_nice(), and even load_balance() as it moves tasks from runqueue
395     * to runqueue.
396     *
397     * This function is only called from enqueue_task(), but also only updates
398     * the timestamp if it is already not set. It's assumed that
399     * sched_info_dequeued() will clear that stamp when appropriate.
400     */
401     static inline void sched_info_queued(task_t *t)
402     {
403     if (!t->sched_info.last_queued)
404     t->sched_info.last_queued = jiffies;
405     }
406    
407     /*
408     * Called when a process ceases being the active-running process, either
409     * voluntarily or involuntarily. Now we can calculate how long we ran.
410     */
411     static inline void sched_info_depart(task_t *t)
412     {
413     struct runqueue *rq = task_rq(t);
414     unsigned long diff = jiffies - t->sched_info.last_arrival;
415    
416     t->sched_info.cpu_time += diff;
417    
418     if (rq)
419     rq->rq_sched_info.cpu_time += diff;
420     }
421    
422     /*
423     * Called when tasks are switched involuntarily due, typically, to expiring
424     * their time slice. (This may also be called when switching to or from
425     * the idle task.) We are only called when prev != next.
426     */
427     static inline void sched_info_switch(task_t *prev, task_t *next)
428     {
429     struct runqueue *rq = task_rq(prev);
430    
431     /*
432     * prev now departs the cpu. It's not interesting to record
433     * stats about how efficient we were at scheduling the idle
434     * process, however.
435     */
436     if (prev != rq->idle)
437     sched_info_depart(prev);
438    
439     if (next != rq->idle)
440     sched_info_arrive(next);
441     }
442     #else
/* Statistics disabled: the sched_info hooks compile away to nothing. */
#define sched_info_queued(t)		do { } while (0)
#define sched_info_switch(t, next)	do { } while (0)
445     #endif /* CONFIG_SCHEDSTATS */
446    
/*
 * ns_diff - nanosecond clock difference that fits in an unsigned long.
 * @v1: later timestamp, in ns
 * @v2: earlier timestamp, in ns
 *
 * Returns v1 - v2 clamped to at most 2^31 ns so that the result cannot
 * overflow a 32-bit unsigned long. Rarely the clock appears to go
 * backwards (v1 <= v2); there should always be a positive difference,
 * so 1 is returned in that case.
 */
static inline unsigned long ns_diff(unsigned long long v1, unsigned long long v2)
{
	/*
	 * The limit must be built as an unsigned 64-bit constant: the int
	 * expression "1 << 31" overflows, and the resulting negative value
	 * converts to a huge unsigned long long, so the clamp would never
	 * take effect.
	 */
	const unsigned long long max_diff = 1ULL << 31;
	unsigned long long vdiff;

	if (likely(v1 > v2)) {
		vdiff = v1 - v2;
		if (vdiff > max_diff)
			vdiff = max_diff;
	} else {
		vdiff = 1;
	}

	return (unsigned long)vdiff;
}
465    
466     static inline int task_queued(task_t *task)
467     {
468     return !list_empty(&task->run_list);
469     }
470    
471     /*
472     * Adding/removing a task to/from a runqueue:
473     */
474     static inline void dequeue_task(struct task_struct *p, runqueue_t *rq)
475     {
476     list_del_init(&p->run_list);
477     if (list_empty(rq->queue + p->prio))
478     __clear_bit(p->prio, rq->bitmap);
479     p->ns_debit = 0;
480     }
481    
482     static void enqueue_task(struct task_struct *p, runqueue_t *rq)
483     {
484     list_add_tail(&p->run_list, rq->queue + p->prio);
485     __set_bit(p->prio, rq->bitmap);
486     }
487    
488     /*
489     * Put task to the end of the run list without the overhead of dequeue
490     * followed by enqueue.
491     */
492     static inline void requeue_task(struct task_struct *p, runqueue_t *rq)
493     {
494     list_move_tail(&p->run_list, rq->queue + p->prio);
495     }
496    
497     static inline void enqueue_task_head(struct task_struct *p, runqueue_t *rq)
498     {
499     list_add(&p->run_list, rq->queue + p->prio);
500     __set_bit(p->prio, rq->bitmap);
501     }
502    
503     #ifdef CONFIG_SMP
504     static inline void inc_prio_bias(runqueue_t *rq, int prio)
505     {
506     rq->prio_bias += MAX_PRIO - prio;
507     }
508    
509     static inline void dec_prio_bias(runqueue_t *rq, int prio)
510     {
511     rq->prio_bias -= MAX_PRIO - prio;
512     }
513     #else
514     static inline void inc_prio_bias(runqueue_t *rq, int prio)
515     {
516     }
517    
518     static inline void dec_prio_bias(runqueue_t *rq, int prio)
519     {
520     }
521     #endif
522    
523     static inline void inc_nr_running(task_t *p, runqueue_t *rq)
524     {
525     rq->nr_running++;
526     if (rt_task(p)) {
527     if (p != rq->migration_thread)
528     inc_prio_bias(rq, p->prio);
529     } else
530     inc_prio_bias(rq, p->static_prio);
531     }
532    
533     static inline void dec_nr_running(task_t *p, runqueue_t *rq)
534     {
535     rq->nr_running--;
536     if (rt_task(p)) {
537     if (p != rq->migration_thread)
538     dec_prio_bias(rq, p->prio);
539     } else
540     dec_prio_bias(rq, p->static_prio);
541     }
542    
543     /*
544     * __activate_task - move a task to the runqueue.
545     */
546     static void __activate_task(task_t *p, runqueue_t *rq)
547     {
548     enqueue_task(p, rq);
549     inc_nr_running(p, rq);
550     }
551    
552     /*
553     * __activate_idle_task - move idle task to the _front_ of runqueue.
554     */
555     static inline void __activate_idle_task(task_t *p, runqueue_t *rq)
556     {
557     enqueue_task_head(p, rq);
558     inc_nr_running(p, rq);
559     }
560    
561     /*
562     * burst - extra intervals an interactive task can run for at best priority
563     * instead of descending priorities.
564     */
565     static inline unsigned int burst(task_t *p)
566     {
567     if (likely(!rt_task(p))) {
568     unsigned int task_user_prio = TASK_USER_PRIO(p);
569     return 39 - task_user_prio;
570     } else
571     return p->burst;
572     }
573    
574     static void inc_burst(task_t *p)
575     {
576     unsigned int best_burst;
577     best_burst = burst(p);
578     if (p->burst < best_burst)
579     p->burst++;
580     }
581    
582     static void dec_burst(task_t *p)
583     {
584     if (p->burst)
585     p->burst--;
586     }
587    
588     static inline unsigned int rr_interval(task_t * p)
589     {
590     unsigned int rr_interval = RR_INTERVAL();
591     int nice = TASK_NICE(p);
592    
593     if (nice < 0 && !rt_task(p))
594     rr_interval += -(nice);
595     return rr_interval;
596     }
597    
598     /*
599     * slice - the duration a task runs before getting requeued at its best
600     * priority and has its burst decremented.
601     */
602     static inline unsigned int slice(task_t *p)
603     {
604     unsigned int slice, rr;
605     slice = rr = rr_interval(p);
606     if (likely(!rt_task(p)))
607     slice += burst(p) * rr;
608     return slice;
609     }
610    
/*
 * sched_interactive - sysctl which allows interactive tasks to have
 * bursts. Enabled by default.
 */
int sched_interactive = 1;
615    
616     /*
617     * effective_prio - dynamic priority dependent on burst.
618     * The priority normally decreases by one each RR_INTERVAL.
619     * As the burst increases the priority stays at the top "stair" or
620     * priority for longer.
621     */
622     static int effective_prio(task_t *p)
623     {
624     int prio;
625     unsigned int full_slice, used_slice, first_slice;
626     unsigned int best_burst, rr;
627     if (rt_task(p))
628     return p->prio;
629     if (batch_task(p)) {
630     if (unlikely(p->flags & (PF_NONSLEEP | PF_FREEZE))) {
631     /*
632     * If batch is waking up from in kernel activity
633     * or being frozen, reschedule at a normal priority
634     * to begin with.
635     */
636     p->flags |= PF_YIELDED;
637     return MAX_PRIO - 2;
638     }
639     return MAX_PRIO - 1;
640     }
641    
642     best_burst = burst(p);
643     full_slice = slice(p);
644     rr = rr_interval(p);
645     used_slice = full_slice - p->slice;
646     if (p->burst > best_burst)
647     p->burst = best_burst;
648     first_slice = rr;
649     if (sched_interactive && !sched_compute && p->mm)
650     first_slice *= (p->burst + 1);
651     prio = MAX_PRIO - 2 - best_burst;
652    
653     if (used_slice < first_slice)
654     return prio;
655     prio += 1 + (used_slice - first_slice) / rr;
656     if (prio >= MAX_PRIO - 2)
657     prio = MAX_PRIO - 2;
658     return prio;
659     }
660    
661     static void continue_slice(task_t *p)
662     {
663     unsigned long total_run = NS_TO_JIFFIES(p->totalrun);
664    
665     if (total_run >= p->slice) {
666     p->totalrun = 0;
667     dec_burst(p);
668     } else {
669     unsigned int remainder;
670     p->slice -= total_run;
671     remainder = p->slice % rr_interval(p);
672     if (remainder)
673     p->time_slice = remainder;
674     }
675     }
676    
677     /*
678     * recalc_task_prio - this checks for tasks that run ultra short timeslices
679     * or have just forked a thread/process and make them continue their old
680     * slice instead of starting a new one at high priority.
681     */
682     static inline void recalc_task_prio(task_t *p, unsigned long long now,
683     unsigned long rq_running)
684     {
685     unsigned long sleep_time = ns_diff(now, p->timestamp);
686    
687     /*
688     * Priority is elevated back to best by amount of sleep_time.
689     * sleep_time is scaled down by number of tasks currently running.
690     */
691     if (rq_running > 1)
692     sleep_time /= rq_running;
693    
694     p->totalrun += p->runtime;
695     if (NS_TO_JIFFIES(p->totalrun) >= p->slice &&
696     NS_TO_JIFFIES(sleep_time) < p->slice) {
697     p->flags &= ~PF_NONSLEEP;
698     dec_burst(p);
699     goto new_slice;
700     }
701    
702     if (p->flags & PF_NONSLEEP) {
703     continue_slice(p);
704     p->flags &= ~PF_NONSLEEP;
705     goto out;
706     }
707    
708     if (sched_compute) {
709     continue_slice(p);
710     goto out;
711     }
712    
713     if (sleep_time >= p->totalrun) {
714     if (!(p->flags & PF_NONSLEEP))
715     inc_burst(p);
716     goto new_slice;
717     }
718    
719     p->totalrun -= sleep_time;
720     continue_slice(p);
721     goto out;
722     new_slice:
723     p->totalrun = 0;
724     out:
725     return;
726     }
727    
728     /*
729     * activate_task - move a task to the runqueue and do priority recalculation
730     *
731     * Update all the scheduling statistics stuff. (sleep average
732     * calculation, priority modifiers, etc.)
733     */
734     static void activate_task(task_t *p, runqueue_t *rq, int local)
735     {
736     unsigned long long now = sched_clock();
737     #ifdef CONFIG_SMP
738     if (!local) {
739     /* Compensate for drifting sched_clock */
740     runqueue_t *this_rq = this_rq();
741     now = (now - this_rq->timestamp_last_tick)
742     + rq->timestamp_last_tick;
743     }
744     #endif
745     p->slice = slice(p);
746     p->time_slice = rr_interval(p);
747     recalc_task_prio(p, now, rq->nr_running);
748     p->flags &= ~PF_NONSLEEP;
749     p->prio = effective_prio(p);
750     p->timestamp = now;
751     __activate_task(p, rq);
752     }
753    
754     /*
755     * deactivate_task - remove a task from the runqueue.
756     */
757     static inline void deactivate_task(struct task_struct *p, runqueue_t *rq)
758     {
759     dec_nr_running(p, rq);
760     dequeue_task(p, rq);
761     }
762    
763     /*
764     * resched_task - mark a task 'to be rescheduled now'.
765     *
766     * On UP this means the setting of the need_resched flag, on SMP it
767     * might also involve a cross-CPU call to trigger the scheduler on
768     * the target CPU.
769     */
770     #ifdef CONFIG_SMP
771     static void resched_task(task_t *p)
772     {
773     int need_resched, nrpolling;
774    
775     assert_spin_locked(&task_rq(p)->lock);
776    
777     /* minimise the chance of sending an interrupt to poll_idle() */
778     nrpolling = test_tsk_thread_flag(p,TIF_POLLING_NRFLAG);
779     need_resched = test_and_set_tsk_thread_flag(p,TIF_NEED_RESCHED);
780     nrpolling |= test_tsk_thread_flag(p,TIF_POLLING_NRFLAG);
781    
782     if (!need_resched && !nrpolling && (task_cpu(p) != smp_processor_id()))
783     smp_send_reschedule(task_cpu(p));
784     }
785     #else
786     static inline void resched_task(task_t *p)
787     {
788     set_tsk_need_resched(p);
789     }
790     #endif
791    
792     /**
793     * task_curr - is this task currently executing on a CPU?
794     * @p: the task in question.
795     */
796     inline int task_curr(const task_t *p)
797     {
798     return cpu_curr(task_cpu(p)) == p;
799     }
800    
801     #ifdef CONFIG_SMP
/* Kinds of request that can be placed on a runqueue's migration queue. */
enum request_type {
	REQ_MOVE_TASK,		/* move a task to another cpu */
	REQ_SET_DOMAIN,		/* carries a sched domain pointer */
};
806    
807     typedef struct {
808     struct list_head list;
809     enum request_type type;
810    
811     /* For REQ_MOVE_TASK */
812     task_t *task;
813     int dest_cpu;
814    
815     /* For REQ_SET_DOMAIN */
816     struct sched_domain *sd;
817    
818     struct completion done;
819     } migration_req_t;
820    
821     /*
822     * The task's runqueue lock must be held.
823     * Returns true if you have to wait for migration thread.
824     */
825     static int migrate_task(task_t *p, int dest_cpu, migration_req_t *req)
826     {
827     runqueue_t *rq = task_rq(p);
828    
829     /*
830     * If the task is not on a runqueue (and not running), then
831     * it is sufficient to simply update the task's cpu field.
832     */
833     if (!task_queued(p) && !task_running(rq, p)) {
834     set_task_cpu(p, dest_cpu);
835     return 0;
836     }
837    
838     init_completion(&req->done);
839     req->type = REQ_MOVE_TASK;
840     req->task = p;
841     req->dest_cpu = dest_cpu;
842     list_add(&req->list, &rq->migration_queue);
843     return 1;
844     }
845    
846     /*
847     * wait_task_inactive - wait for a thread to unschedule.
848     *
849     * The caller must ensure that the task *will* unschedule sometime soon,
850     * else this function might spin for a *long* time. This function can't
851     * be called with interrupts off, or it may introduce deadlock with
852     * smp_call_function() if an IPI is sent by the same process we are
853     * waiting to become inactive.
854     */
855     void wait_task_inactive(task_t * p)
856     {
857     unsigned long flags;
858     runqueue_t *rq;
859     int preempted;
860    
861     repeat:
862     rq = task_rq_lock(p, &flags);
863     /* Must be off runqueue entirely, not preempted. */
864     if (unlikely(task_queued(p) || task_running(rq, p))) {
865     /* If it's preempted, we yield. It could be a while. */
866     preempted = !task_running(rq, p);
867     task_rq_unlock(rq, &flags);
868     cpu_relax();
869     if (preempted)
870     yield();
871     goto repeat;
872     }
873     task_rq_unlock(rq, &flags);
874     }
875    
876     /***
877     * kick_process - kick a running thread to enter/exit the kernel
878     * @p: the to-be-kicked thread
879     *
880     * Cause a process which is running on another CPU to enter
881     * kernel-mode, without any delay. (to get signals handled.)
882     *
883     * NOTE: this function doesnt have to take the runqueue lock,
884     * because all it wants to ensure is that the remote task enters
885     * the kernel. If the IPI races and the task has been migrated
886     * to another CPU then no harm is done and the purpose has been
887     * achieved as well.
888     */
889     void kick_process(task_t *p)
890     {
891     int cpu;
892    
893     preempt_disable();
894     cpu = task_cpu(p);
895     if ((cpu != smp_processor_id()) && task_curr(p))
896     smp_send_reschedule(cpu);
897     preempt_enable();
898     }
899    
900     /*
901     * Return a low guess at the load of a migration-source cpu.
902     *
903     * We want to under-estimate the load of migration sources, to
904     * balance conservatively.
905     */
906     static inline unsigned long __source_load(int cpu, enum idle_type idle)
907     {
908     runqueue_t *rq = cpu_rq(cpu);
909     unsigned long source_load, cpu_load = rq->cpu_load,
910     load_now = rq->nr_running * SCHED_LOAD_SCALE;
911    
912     source_load = min(cpu_load, load_now);
913    
914     if (rq->nr_running > 1 || (idle == NOT_IDLE && rq->nr_running))
915     /*
916     * If we are busy rebalancing the load is biased by
917     * priority to create 'nice' support across cpus. When
918     * idle rebalancing we should only bias the source_load if
919     * there is more than one task running on that queue to
920     * prevent idle rebalance from trying to pull tasks from a
921     * queue with only one running task.
922     */
923     source_load = source_load * rq->prio_bias / rq->nr_running;
924    
925     return source_load;
926     }
927    
928     static inline unsigned long source_load(int cpu)
929     {
930     return __source_load(cpu, NOT_IDLE);
931     }
932    
933     /*
934     * Return a high guess at the load of a migration-target cpu
935     */
936     static inline unsigned long __target_load(int cpu, enum idle_type idle)
937     {
938     runqueue_t *rq = cpu_rq(cpu);
939     unsigned long target_load, cpu_load = rq->cpu_load,
940     load_now = rq->nr_running * SCHED_LOAD_SCALE;
941    
942     target_load = max(cpu_load, load_now);
943    
944     if (rq->nr_running > 1 || (idle == NOT_IDLE && rq->nr_running))
945     target_load = target_load * rq->prio_bias / rq->nr_running;
946    
947     return target_load;
948     }
949    
950     static inline unsigned long target_load(int cpu)
951     {
952     return __target_load(cpu, NOT_IDLE);
953     }
954     #endif
955    
956     /*
957     * wake_idle() will wake a task on an idle cpu if task->cpu is
958     * not idle and an idle cpu is available. The span of cpus to
959     * search starts with cpus closest then further out as needed,
960     * so we always favor a closer, idle cpu.
961     *
962     * Returns the CPU we should wake onto.
963     */
964     #if defined(ARCH_HAS_SCHED_WAKE_IDLE)
965     static inline int wake_idle(int cpu, task_t *p)
966     {
967     cpumask_t tmp;
968     struct sched_domain *sd;
969     int i;
970    
971     if (idle_cpu(cpu))
972     return cpu;
973    
974     for_each_domain(cpu, sd) {
975     if (sd->flags & SD_WAKE_IDLE) {
976     cpus_and(tmp, sd->span, cpu_online_map);
977     cpus_and(tmp, tmp, p->cpus_allowed);
978     for_each_cpu_mask(i, tmp) {
979     if (idle_cpu(i))
980     return i;
981     }
982     }
983     else break;
984     }
985     return cpu;
986     }
987     #else
988     static inline int wake_idle(int cpu, task_t *p)
989     {
990     return cpu;
991     }
992     #endif
993    
994     /*
995     * cache_delay is the time preemption is delayed in sched_compute mode
996     * and is set to a nominal 10ms.
997     */
998     static int cache_delay = 10 * HZ / 1000;
999    
1000     /*
1001     * Check to see if p preempts rq->curr and resched if it does. In compute
1002     * mode we do not preempt for at least cache_delay and set rq->preempted.
1003     */
1004     static void preempt(task_t *p, runqueue_t *rq)
1005     {
1006     if (p->prio >= rq->curr->prio)
1007     return;
1008     if (!sched_compute || rq->cache_ticks >= cache_delay ||
1009     !p->mm || rt_task(p))
1010     resched_task(rq->curr);
1011     rq->preempted = 1;
1012     }
1013    
1014     /***
1015     * try_to_wake_up - wake up a thread
1016     * @p: the to-be-woken-up thread
1017     * @state: the mask of task states that can be woken
1018     * @sync: do a synchronous wakeup?
1019     *
1020     * Put it on the run-queue if it's not already there. The "current"
1021     * thread is always on the run-queue (except when the actual
1022     * re-schedule is in progress), and as such you're allowed to do
1023     * the simpler "current->state = TASK_RUNNING" to mark yourself
1024     * runnable without the overhead of this.
1025     *
1026     * returns failure only if the task is already active.
1027     */
1028     static int try_to_wake_up(task_t * p, unsigned int state, int sync)
1029     {
1030     int cpu, this_cpu, success = 0;
1031     unsigned long flags;
1032     long old_state;
1033     runqueue_t *rq;
1034     #ifdef CONFIG_SMP
1035     unsigned long load, this_load;
1036     struct sched_domain *sd;
1037     int new_cpu;
1038     #endif
1039    
1040     rq = task_rq_lock(p, &flags);
1041     old_state = p->state;
1042     if (!(old_state & state))
1043     goto out;
1044    
1045     if (task_queued(p))
1046     goto out_running;
1047    
1048     cpu = task_cpu(p);
1049     this_cpu = smp_processor_id();
1050    
1051     #ifdef CONFIG_SMP
1052     if (unlikely(task_running(rq, p)))
1053     goto out_activate;
1054    
1055     #ifdef CONFIG_SCHEDSTATS
1056     schedstat_inc(rq, ttwu_cnt);
1057     if (cpu == this_cpu) {
1058     schedstat_inc(rq, ttwu_local);
1059     } else {
1060     for_each_domain(this_cpu, sd) {
1061     if (cpu_isset(cpu, sd->span)) {
1062     schedstat_inc(sd, ttwu_wake_remote);
1063     break;
1064     }
1065     }
1066     }
1067     #endif
1068    
1069     new_cpu = cpu;
1070     if (cpu == this_cpu || unlikely(!cpu_isset(this_cpu, p->cpus_allowed)))
1071     goto out_set_cpu;
1072    
1073     load = source_load(cpu);
1074     this_load = target_load(this_cpu);
1075    
1076     /*
1077     * If sync wakeup then subtract the (maximum possible) effect of
1078     * the currently running task from the load of the current CPU:
1079     */
1080     if (sync)
1081     this_load -= SCHED_LOAD_SCALE;
1082    
1083     /* Don't pull the task off an idle CPU to a busy one */
1084     if (load < SCHED_LOAD_SCALE/2 && this_load > SCHED_LOAD_SCALE/2)
1085     goto out_set_cpu;
1086    
1087     new_cpu = this_cpu; /* Wake to this CPU if we can */
1088    
1089     /*
1090     * Scan domains for affine wakeup and passive balancing
1091     * possibilities.
1092     */
1093     for_each_domain(this_cpu, sd) {
1094     unsigned int imbalance;
1095     /*
1096     * Start passive balancing when half the imbalance_pct
1097     * limit is reached.
1098     */
1099     imbalance = sd->imbalance_pct + (sd->imbalance_pct - 100) / 2;
1100    
1101     if ((sd->flags & SD_WAKE_AFFINE) &&
1102     !task_hot(p, rq->timestamp_last_tick, sd)) {
1103     /*
1104     * This domain has SD_WAKE_AFFINE and p is cache cold
1105     * in this domain.
1106     */
1107     if (cpu_isset(cpu, sd->span)) {
1108     schedstat_inc(sd, ttwu_move_affine);
1109     goto out_set_cpu;
1110     }
1111     } else if ((sd->flags & SD_WAKE_BALANCE) &&
1112     imbalance*this_load <= 100*load) {
1113     /*
1114     * This domain has SD_WAKE_BALANCE and there is
1115     * an imbalance.
1116     */
1117     if (cpu_isset(cpu, sd->span)) {
1118     schedstat_inc(sd, ttwu_move_balance);
1119     goto out_set_cpu;
1120     }
1121     }
1122     }
1123    
1124     new_cpu = cpu; /* Could not wake to this_cpu. Wake to cpu instead */
1125     out_set_cpu:
1126     new_cpu = wake_idle(new_cpu, p);
1127     if (new_cpu != cpu) {
1128     set_task_cpu(p, new_cpu);
1129     task_rq_unlock(rq, &flags);
1130     /* might preempt at this point */
1131     rq = task_rq_lock(p, &flags);
1132     old_state = p->state;
1133     if (!(old_state & state))
1134     goto out;
1135     if (task_queued(p))
1136     goto out_running;
1137    
1138     this_cpu = smp_processor_id();
1139     cpu = task_cpu(p);
1140     }
1141    
1142     out_activate:
1143     #endif /* CONFIG_SMP */
1144     if (old_state == TASK_UNINTERRUPTIBLE)
1145     rq->nr_uninterruptible--;
1146    
1147     /*
1148     * Tasks that have marked their sleep as noninteractive get
1149     * woken up without their sleep counting.
1150     */
1151     if (old_state & TASK_NONINTERACTIVE)
1152     p->flags |= PF_NONSLEEP;
1153    
1154     /*
1155     * Sync wakeups (i.e. those types of wakeups where the waker
1156     * has indicated that it will leave the CPU in short order)
1157     * don't trigger a preemption, if the woken up task will run on
1158     * this cpu. (in this case the 'I will reschedule' promise of
1159     * the waker guarantees that the freshly woken up task is going
1160     * to be considered on this CPU.)
1161     */
1162     activate_task(p, rq, cpu == this_cpu);
1163     if (!sync || cpu != this_cpu) {
1164     preempt(p, rq);
1165     }
1166     success = 1;
1167    
1168     out_running:
1169     p->state = TASK_RUNNING;
1170     out:
1171     task_rq_unlock(rq, &flags);
1172    
1173     return success;
1174     }
1175    
1176     int fastcall wake_up_process(task_t * p)
1177     {
1178     return try_to_wake_up(p, TASK_STOPPED | TASK_TRACED |
1179     TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE, 0);
1180     }
1181    
1182     EXPORT_SYMBOL(wake_up_process);
1183    
1184     int fastcall wake_up_state(task_t *p, unsigned int state)
1185     {
1186     return try_to_wake_up(p, state, 0);
1187     }
1188    
#ifdef CONFIG_SMP
/* Defined further down; only exists on SMP builds. */
static int
find_idlest_cpu(struct task_struct *p, int this_cpu, struct sched_domain *sd);
#endif
1193    
1194     /*
1195     * Perform scheduler related setup for a newly forked process p.
1196     * p is forked by current.
1197     */
1198     void fastcall sched_fork(task_t *p)
1199     {
1200     /*
1201     * We mark the process as running here, but have not actually
1202     * inserted it onto the runqueue yet. This guarantees that
1203     * nobody will actually run it, and a signal or other external
1204     * event cannot wake it up and insert it on the runqueue either.
1205     */
1206     p->state = TASK_RUNNING;
1207     INIT_LIST_HEAD(&p->run_list);
1208     spin_lock_init(&p->switch_lock);
1209     #ifdef CONFIG_SCHEDSTATS
1210     memset(&p->sched_info, 0, sizeof(p->sched_info));
1211     #endif
1212     #ifdef CONFIG_PREEMPT
1213     /*
1214     * During context-switch we hold precisely one spinlock, which
1215     * schedule_tail drops. (in the common case it's this_rq()->lock,
1216     * but it also can be p->switch_lock.) So we compensate with a count
1217     * of 1. Also, we want to start with kernel preemption disabled.
1218     */
1219     p->thread_info->preempt_count = 1;
1220     #endif
1221     }
1222    
1223     /*
1224     * wake_up_new_task - wake up a newly created task for the first time.
1225     *
1226     * This function will do some initial scheduler statistics housekeeping
1227     * that must be done for every newly created context, then puts the task
1228     * on the runqueue and wakes it.
1229     */
1230     void fastcall wake_up_new_task(task_t * p, unsigned long clone_flags)
1231     {
1232     unsigned long flags;
1233     int this_cpu, cpu;
1234     runqueue_t *rq, *this_rq;
1235    
1236     rq = task_rq_lock(p, &flags);
1237     cpu = task_cpu(p);
1238     this_cpu = smp_processor_id();
1239    
1240     BUG_ON(p->state != TASK_RUNNING);
1241    
1242     /*
1243     * Forked process gets no burst to prevent fork bombs.
1244     */
1245     p->burst = 0;
1246    
1247     if (likely(cpu == this_cpu)) {
1248     current->flags |= PF_NONSLEEP;
1249     activate_task(p, rq, 1);
1250     if (!(clone_flags & CLONE_VM))
1251     /*
1252     * The VM isn't cloned, so we're in a good position to
1253     * do child-runs-first in anticipation of an exec. This
1254     * usually avoids a lot of COW overhead.
1255     */
1256     set_need_resched();
1257     /*
1258     * We skip the following code due to cpu == this_cpu
1259     *
1260     * task_rq_unlock(rq, &flags);
1261     * this_rq = task_rq_lock(current, &flags);
1262     */
1263     this_rq = rq;
1264     } else {
1265     this_rq = cpu_rq(this_cpu);
1266    
1267     /*
1268     * Not the local CPU - must adjust timestamp. This should
1269     * get optimised away in the !CONFIG_SMP case.
1270     */
1271     p->timestamp = (p->timestamp - this_rq->timestamp_last_tick)
1272     + rq->timestamp_last_tick;
1273     activate_task(p, rq, 0);
1274     preempt(p, rq);
1275    
1276     /*
1277     * Parent and child are on different CPUs, now get the
1278     * parent runqueue to update the parent's ->flags:
1279     */
1280     task_rq_unlock(rq, &flags);
1281     this_rq = task_rq_lock(current, &flags);
1282     current->flags |= PF_NONSLEEP;
1283     }
1284     task_rq_unlock(this_rq, &flags);
1285     }
1286    
1287     /**
1288     * finish_task_switch - clean up after a task-switch
1289     * @prev: the thread we just switched away from.
1290     *
1291     * We enter this with the runqueue still locked, and finish_arch_switch()
1292     * will unlock it along with doing any other architecture-specific cleanup
1293     * actions.
1294     *
1295     * Note that we may have delayed dropping an mm in context_switch(). If
1296     * so, we finish that here outside of the runqueue lock. (Doing it
1297     * with the lock held can cause deadlocks; see schedule() for
1298     * details.)
1299     */
1300     static inline void finish_task_switch(task_t *prev)
1301     __releases(rq->lock)
1302     {
1303     runqueue_t *rq = this_rq();
1304     struct mm_struct *mm = rq->prev_mm;
1305     unsigned long prev_task_flags;
1306    
1307     rq->prev_mm = NULL;
1308    
1309     /*
1310     * A task struct has one reference for the use as "current".
1311     * If a task dies, then it sets EXIT_ZOMBIE in tsk->exit_state and
1312     * calls schedule one last time. The schedule call will never return,
1313     * and the scheduled task must drop that reference.
1314     * The test for EXIT_ZOMBIE must occur while the runqueue locks are
1315     * still held, otherwise prev could be scheduled on another cpu, die
1316     * there before we look at prev->state, and then the reference would
1317     * be dropped twice.
1318     * Manfred Spraul <manfred@colorfullife.com>
1319     */
1320     prev_task_flags = prev->flags;
1321     finish_arch_switch(rq, prev);
1322     if (mm)
1323     mmdrop(mm);
1324     if (unlikely(prev_task_flags & PF_DEAD))
1325     put_task_struct(prev);
1326     }
1327    
1328     /**
1329     * schedule_tail - first thing a freshly forked thread must call.
1330     * @prev: the thread we just switched away from.
1331     */
1332     asmlinkage void schedule_tail(task_t *prev)
1333     __releases(rq->lock)
1334     {
1335     finish_task_switch(prev);
1336    
1337     if (current->set_child_tid)
1338     put_user(current->pid, current->set_child_tid);
1339     }
1340    
1341     /*
1342     * context_switch - switch to the new MM and the new
1343     * thread's register state.
1344     */
1345     static inline
1346     task_t * context_switch(runqueue_t *rq, task_t *prev, task_t *next)
1347     {
1348     struct mm_struct *mm = next->mm;
1349     struct mm_struct *oldmm = prev->active_mm;
1350    
1351     if (unlikely(!mm)) {
1352     next->active_mm = oldmm;
1353     atomic_inc(&oldmm->mm_count);
1354     enter_lazy_tlb(oldmm, next);
1355     } else
1356     switch_mm(oldmm, mm, next);
1357    
1358     if (unlikely(!prev->mm)) {
1359     prev->active_mm = NULL;
1360     WARN_ON(rq->prev_mm);
1361     rq->prev_mm = oldmm;
1362     }
1363    
1364     /* Here we just switch the register state and the stack. */
1365     switch_to(prev, next, prev);
1366    
1367     return prev;
1368     }
1369    
1370     /*
1371     * nr_running, nr_uninterruptible and nr_context_switches:
1372     *
1373     * externally visible scheduler statistics: current number of runnable
1374     * threads, current number of uninterruptible-sleeping threads, total
1375     * number of context switches performed since bootup.
1376     */
1377     unsigned long nr_running(void)
1378     {
1379     unsigned long i, sum = 0;
1380    
1381     for_each_online_cpu(i)
1382     sum += cpu_rq(i)->nr_running;
1383    
1384     return sum;
1385     }
1386    
1387     unsigned long nr_uninterruptible(void)
1388     {
1389     unsigned long i, sum = 0;
1390    
1391     for_each_cpu(i)
1392     sum += cpu_rq(i)->nr_uninterruptible;
1393    
1394     /*
1395     * Since we read the counters lockless, it might be slightly
1396     * inaccurate. Do not allow it to go below zero though:
1397     */
1398     if (unlikely((long)sum < 0))
1399     sum = 0;
1400    
1401     return sum;
1402     }
1403    
1404     unsigned long long nr_context_switches(void)
1405     {
1406     unsigned long long i, sum = 0;
1407    
1408     for_each_cpu(i)
1409     sum += cpu_rq(i)->nr_switches;
1410    
1411     return sum;
1412     }
1413    
1414     unsigned long nr_iowait(void)
1415     {
1416     unsigned long i, sum = 0;
1417    
1418     for_each_cpu(i)
1419     sum += atomic_read(&cpu_rq(i)->nr_iowait);
1420    
1421     return sum;
1422     }
1423    
1424     #ifdef CONFIG_SMP
1425    
1426     /*
1427     * double_rq_lock - safely lock two runqueues
1428     *
1429     * Note this does not disable interrupts like task_rq_lock,
1430     * you need to do so manually before calling.
1431     */
1432     static void double_rq_lock(runqueue_t *rq1, runqueue_t *rq2)
1433     __acquires(rq1->lock)
1434     __acquires(rq2->lock)
1435     {
1436     if (rq1 == rq2) {
1437     spin_lock(&rq1->lock);
1438     __acquire(rq2->lock); /* Fake it out ;) */
1439     } else {
1440     if (rq1 < rq2) {
1441     spin_lock(&rq1->lock);
1442     spin_lock(&rq2->lock);
1443     } else {
1444     spin_lock(&rq2->lock);
1445     spin_lock(&rq1->lock);
1446     }
1447     }
1448     }
1449    
1450     /*
1451     * double_rq_unlock - safely unlock two runqueues
1452     *
1453     * Note this does not restore interrupts like task_rq_unlock,
1454     * you need to do so manually after calling.
1455     */
1456     static void double_rq_unlock(runqueue_t *rq1, runqueue_t *rq2)
1457     __releases(rq1->lock)
1458     __releases(rq2->lock)
1459     {
1460     spin_unlock(&rq1->lock);
1461     if (rq1 != rq2)
1462     spin_unlock(&rq2->lock);
1463     else
1464     __release(rq2->lock);
1465     }
1466    
1467     /*
1468     * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
1469     */
1470     static void double_lock_balance(runqueue_t *this_rq, runqueue_t *busiest)
1471     __releases(this_rq->lock)
1472     __acquires(busiest->lock)
1473     __acquires(this_rq->lock)
1474     {
1475     if (unlikely(!spin_trylock(&busiest->lock))) {
1476     if (busiest < this_rq) {
1477     spin_unlock(&this_rq->lock);
1478     spin_lock(&busiest->lock);
1479     spin_lock(&this_rq->lock);
1480     } else
1481     spin_lock(&busiest->lock);
1482     }
1483     }
1484    
1485     /*
1486     * find_idlest_cpu - find the least busy runqueue.
1487     */
1488     static int find_idlest_cpu(struct task_struct *p, int this_cpu,
1489     struct sched_domain *sd)
1490     {
1491     unsigned long load, min_load, this_load;
1492     int i, min_cpu;
1493     cpumask_t mask;
1494    
1495     min_cpu = UINT_MAX;
1496     min_load = ULONG_MAX;
1497    
1498     cpus_and(mask, sd->span, p->cpus_allowed);
1499    
1500     for_each_cpu_mask(i, mask) {
1501     load = target_load(i);
1502    
1503     if (load < min_load) {
1504     min_cpu = i;
1505     min_load = load;
1506    
1507     /* break out early on an idle CPU: */
1508     if (!min_load)
1509     break;
1510     }
1511     }
1512    
1513     /* add +1 to account for the new task */
1514     this_load = source_load(this_cpu) + SCHED_LOAD_SCALE;
1515    
1516     /*
1517     * Would with the addition of the new task to the
1518     * current CPU there be an imbalance between this
1519     * CPU and the idlest CPU?
1520     *
1521     * Use half of the balancing threshold - new-context is
1522     * a good opportunity to balance.
1523     */
1524     if (min_load*(100 + (sd->imbalance_pct-100)/2) < this_load*100)
1525     return min_cpu;
1526    
1527     return this_cpu;
1528     }
1529    
1530     /*
1531     * If dest_cpu is allowed for this process, migrate the task to it.
1532     * This is accomplished by forcing the cpu_allowed mask to only
1533     * allow dest_cpu, which will force the cpu onto dest_cpu. Then
1534     * the cpu_allowed mask is restored.
1535     */
1536     static inline void sched_migrate_task(task_t *p, int dest_cpu)
1537     {
1538     migration_req_t req;
1539     runqueue_t *rq;
1540     unsigned long flags;
1541    
1542     rq = task_rq_lock(p, &flags);
1543     if (!cpu_isset(dest_cpu, p->cpus_allowed)
1544     || unlikely(cpu_is_offline(dest_cpu)))
1545     goto out;
1546    
1547     /* force the process onto the specified CPU */
1548     if (migrate_task(p, dest_cpu, &req)) {
1549     /* Need to wait for migration thread (might exit: take ref). */
1550     struct task_struct *mt = rq->migration_thread;
1551     get_task_struct(mt);
1552     task_rq_unlock(rq, &flags);
1553     wake_up_process(mt);
1554     put_task_struct(mt);
1555     wait_for_completion(&req.done);
1556     return;
1557     }
1558     out:
1559     task_rq_unlock(rq, &flags);
1560     }
1561    
1562     /*
1563     * sched_exec(): find the highest-level, exec-balance-capable
1564     * domain and try to migrate the task to the least loaded CPU.
1565     *
1566     * execve() is a valuable balancing opportunity, because at this point
1567     * the task has the smallest effective memory and cache footprint.
1568     */
1569     void sched_exec(void)
1570     {
1571     struct sched_domain *tmp, *sd = NULL;
1572     int new_cpu, this_cpu = get_cpu();
1573    
1574     /* Prefer the current CPU if there's only this task running */
1575     if (this_rq()->nr_running <= 1)
1576     goto out;
1577    
1578     for_each_domain(this_cpu, tmp)
1579     if (tmp->flags & SD_BALANCE_EXEC)
1580     sd = tmp;
1581    
1582     if (sd) {
1583     schedstat_inc(sd, sbe_attempts);
1584     new_cpu = find_idlest_cpu(current, this_cpu, sd);
1585     if (new_cpu != this_cpu) {
1586     schedstat_inc(sd, sbe_pushed);
1587     put_cpu();
1588     sched_migrate_task(current, new_cpu);
1589     return;
1590     }
1591     }
1592     out:
1593     put_cpu();
1594     }
1595    
1596     /*
1597     * pull_task - move a task from a remote runqueue to the local runqueue.
1598     * Both runqueues must be locked.
1599     */
1600     static inline void pull_task(runqueue_t *src_rq, task_t *p,
1601     runqueue_t *this_rq, int this_cpu)
1602     {
1603     dequeue_task(p, src_rq);
1604     dec_nr_running(p, src_rq);
1605     set_task_cpu(p, this_cpu);
1606     inc_nr_running(p, this_rq);
1607     enqueue_task(p, this_rq);
1608     p->timestamp = (p->timestamp - src_rq->timestamp_last_tick)
1609     + this_rq->timestamp_last_tick;
1610     /*
1611     * Note that idle threads have a prio of MAX_PRIO, for this test
1612     * to be always true for them.
1613     */
1614     preempt(p, this_rq);
1615     }
1616    
1617     /*
1618     * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
1619     */
1620     static inline int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu,
1621     struct sched_domain *sd, enum idle_type idle)
1622     {
1623     /*
1624     * We do not migrate tasks that are:
1625     * 1) running (obviously), or
1626     * 2) cannot be migrated to this CPU due to cpus_allowed, or
1627     * 3) are cache-hot on their current CPU.
1628     */
1629     if (task_running(rq, p))
1630     return 0;
1631     if (!cpu_isset(this_cpu, p->cpus_allowed))
1632     return 0;
1633    
1634     /*
1635     * Aggressive migration if:
1636     * 1) the [whole] cpu is idle, or
1637     * 2) too many balance attempts have failed.
1638     */
1639    
1640     if (cpu_and_siblings_are_idle(this_cpu) || \
1641     sd->nr_balance_failed > sd->cache_nice_tries)
1642     return 1;
1643    
1644     if (task_hot(p, rq->timestamp_last_tick, sd))
1645     return 0;
1646     return 1;
1647     }
1648    
1649     /*
1650     * move_tasks tries to move up to max_nr_move tasks from busiest to this_rq,
1651     * as part of a balancing operation within "domain". Returns the number of
1652     * tasks moved.
1653     *
1654     * Called with both runqueues locked.
1655     */
1656     static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest,
1657     unsigned long max_nr_move, struct sched_domain *sd,
1658     enum idle_type idle)
1659     {
1660     struct list_head *head, *curr;
1661     int idx, pulled = 0;
1662     task_t *tmp;
1663    
1664     if (max_nr_move <= 0 || busiest->nr_running <= 1)
1665     goto out;
1666    
1667     /* Start searching at priority 0: */
1668     idx = 0;
1669     skip_bitmap:
1670     if (!idx)
1671     idx = sched_find_first_bit(busiest->bitmap);
1672     else
1673     idx = find_next_bit(busiest->bitmap, MAX_PRIO, idx);
1674     if (idx >= MAX_PRIO)
1675     goto out;
1676    
1677     head = busiest->queue + idx;
1678     curr = head->prev;
1679     skip_queue:
1680     tmp = list_entry(curr, task_t, run_list);
1681    
1682     curr = curr->prev;
1683    
1684     if (!can_migrate_task(tmp, busiest, this_cpu, sd, idle)) {
1685     if (curr != head)
1686     goto skip_queue;
1687     idx++;
1688     goto skip_bitmap;
1689     }
1690    
1691     #ifdef CONFIG_SCHEDSTATS
1692     if (task_hot(tmp, busiest->timestamp_last_tick, sd))
1693     schedstat_inc(sd, lb_hot_gained[idle]);
1694     #endif
1695    
1696     pull_task(busiest, tmp, this_rq, this_cpu);
1697     pulled++;
1698    
1699     /* We only want to steal up to the prescribed number of tasks. */
1700     if (pulled < max_nr_move) {
1701     if (curr != head)
1702     goto skip_queue;
1703     idx++;
1704     goto skip_bitmap;
1705     }
1706     out:
1707     /*
1708     * Right now, this is the only place pull_task() is called,
1709     * so we can safely collect pull_task() stats here rather than
1710     * inside pull_task().
1711     */
1712     schedstat_add(sd, lb_gained[idle], pulled);
1713     return pulled;
1714     }
1715    
1716     /*
1717     * find_busiest_group finds and returns the busiest CPU group within the
1718     * domain. It calculates and returns the number of tasks which should be
1719     * moved to restore balance via the imbalance parameter.
1720     */
1721     static inline struct sched_group *
1722     find_busiest_group(struct sched_domain *sd, int this_cpu,
1723     unsigned long *imbalance, enum idle_type idle)
1724     {
1725     struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
1726     unsigned long max_load, avg_load, total_load, this_load, total_pwr;
1727    
1728     max_load = this_load = total_load = total_pwr = 0;
1729    
1730     do {
1731     unsigned long load;
1732     int local_group;
1733     int i;
1734    
1735     local_group = cpu_isset(this_cpu, group->cpumask);
1736    
1737     /* Tally up the load of all CPUs in the group */
1738     avg_load = 0;
1739    
1740     for_each_cpu_mask(i, group->cpumask) {
1741     /* Bias balancing toward cpus of our domain */
1742     if (local_group)
1743     load = __target_load(i, idle);
1744     else
1745     load = __source_load(i, idle);
1746    
1747     avg_load += load;
1748     }
1749    
1750     total_load += avg_load;
1751     total_pwr += group->cpu_power;
1752    
1753     /* Adjust by relative CPU power of the group */
1754     avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power;
1755    
1756     if (local_group) {
1757     this_load = avg_load;
1758     this = group;
1759     goto nextgroup;
1760     } else if (avg_load > max_load) {
1761     max_load = avg_load;
1762     busiest = group;
1763     }
1764     nextgroup:
1765     group = group->next;
1766     } while (group != sd->groups);
1767    
1768     if (!busiest || this_load >= max_load)
1769     goto out_balanced;
1770    
1771     avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr;
1772    
1773     if (this_load >= avg_load ||
1774     100*max_load <= sd->imbalance_pct*this_load)
1775     goto out_balanced;
1776    
1777     /*
1778     * We're trying to get all the cpus to the average_load, so we don't
1779     * want to push ourselves above the average load, nor do we wish to
1780     * reduce the max loaded cpu below the average load, as either of these
1781     * actions would just result in more rebalancing later, and ping-pong
1782     * tasks around. Thus we look for the minimum possible imbalance.
1783     * Negative imbalances (*we* are more loaded than anyone else) will
1784     * be counted as no imbalance for these purposes -- we can't fix that
1785     * by pulling tasks to us. Be careful of negative numbers as they'll
1786     * appear as very large values with unsigned longs.
1787     */
1788     /* How much load to actually move to equalise the imbalance */
1789     *imbalance = min((max_load - avg_load) * busiest->cpu_power,
1790     (avg_load - this_load) * this->cpu_power)
1791     / SCHED_LOAD_SCALE;
1792    
1793     if (*imbalance < SCHED_LOAD_SCALE) {
1794     unsigned long pwr_now = 0, pwr_move = 0;
1795     unsigned long tmp;
1796    
1797     if (max_load - this_load >= SCHED_LOAD_SCALE*2) {
1798     *imbalance = 1;
1799     return busiest;
1800     }
1801    
1802     /*
1803     * OK, we don't have enough imbalance to justify moving tasks,
1804     * however we may be able to increase total CPU power used by
1805     * moving them.
1806     */
1807    
1808     pwr_now += busiest->cpu_power*min(SCHED_LOAD_SCALE, max_load);
1809     pwr_now += this->cpu_power*min(SCHED_LOAD_SCALE, this_load);
1810     pwr_now /= SCHED_LOAD_SCALE;
1811    
1812     /* Amount of load we'd subtract */
1813     tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/busiest->cpu_power;
1814     if (max_load > tmp)
1815     pwr_move += busiest->cpu_power*min(SCHED_LOAD_SCALE,
1816     max_load - tmp);
1817    
1818     /* Amount of load we'd add */
1819     if (max_load*busiest->cpu_power <
1820     SCHED_LOAD_SCALE*SCHED_LOAD_SCALE)
1821     tmp = max_load*busiest->cpu_power/this->cpu_power;
1822     else
1823     tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/this->cpu_power;
1824     pwr_move += this->cpu_power*min(SCHED_LOAD_SCALE, this_load + tmp);
1825     pwr_move /= SCHED_LOAD_SCALE;
1826    
1827     /* Move if we gain throughput */
1828     if (pwr_move <= pwr_now)
1829     goto out_balanced;
1830    
1831     *imbalance = 1;
1832     return busiest;
1833     }
1834    
1835     /* Get rid of the scaling factor, rounding down as we divide */
1836     *imbalance = *imbalance / SCHED_LOAD_SCALE;
1837    
1838     return busiest;
1839    
1840     out_balanced:
1841     if (busiest && (idle == NEWLY_IDLE ||
1842     (idle == SCHED_IDLE && max_load > SCHED_LOAD_SCALE)) ) {
1843     *imbalance = 1;
1844     return busiest;
1845     }
1846    
1847     *imbalance = 0;
1848     return NULL;
1849     }
1850    
1851     /*
1852     * find_busiest_queue - find the busiest runqueue among the cpus in group.
1853     */
1854     static runqueue_t *find_busiest_queue(struct sched_group *group,
1855     enum idle_type idle)
1856     {
1857     unsigned long load, max_load = 0;
1858     runqueue_t *busiest = NULL;
1859     int i;
1860    
1861     for_each_cpu_mask(i, group->cpumask) {
1862     load = __source_load(i, idle);
1863    
1864     if (load > max_load) {
1865     max_load = load;
1866     busiest = cpu_rq(i);
1867     }
1868     }
1869    
1870     return busiest;
1871     }
1872    
1873     /*
1874     * Check this_cpu to ensure it is balanced within domain. Attempt to move
1875     * tasks if there is an imbalance.
1876     *
1877     * Called with this_rq unlocked.
1878     */
1879     static inline int load_balance(int this_cpu, runqueue_t *this_rq,
1880     struct sched_domain *sd, enum idle_type idle)
1881     {
1882     struct sched_group *group;
1883     runqueue_t *busiest;
1884     unsigned long imbalance;
1885     int nr_moved;
1886    
1887     spin_lock(&this_rq->lock);
1888     schedstat_inc(sd, lb_cnt[idle]);
1889    
1890     group = find_busiest_group(sd, this_cpu, &imbalance, idle);
1891     if (!group) {
1892     schedstat_inc(sd, lb_nobusyg[idle]);
1893     goto out_balanced;
1894     }
1895    
1896     busiest = find_busiest_queue(group, idle);
1897     if (!busiest) {
1898     schedstat_inc(sd, lb_nobusyq[idle]);
1899     goto out_balanced;
1900     }
1901    
1902     /*
1903     * This should be "impossible", but since load
1904     * balancing is inherently racy and statistical,
1905     * it could happen in theory.
1906     */
1907     if (unlikely(busiest == this_rq)) {
1908     WARN_ON(1);
1909     goto out_balanced;
1910     }
1911    
1912     schedstat_add(sd, lb_imbalance[idle], imbalance);
1913    
1914     nr_moved = 0;
1915     if (busiest->nr_running > 1) {
1916     /*
1917     * Attempt to move tasks. If find_busiest_group has found
1918     * an imbalance but busiest->nr_running <= 1, the group is
1919     * still unbalanced. nr_moved simply stays zero, so it is
1920     * correctly treated as an imbalance.
1921     */
1922     double_lock_balance(this_rq, busiest);
1923     nr_moved = move_tasks(this_rq, this_cpu, busiest,
1924     imbalance, sd, idle);
1925     spin_unlock(&busiest->lock);
1926     }
1927     spin_unlock(&this_rq->lock);
1928    
1929     if (!nr_moved) {
1930     schedstat_inc(sd, lb_failed[idle]);
1931     sd->nr_balance_failed++;
1932    
1933     if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) {
1934     int wake = 0;
1935    
1936     spin_lock(&busiest->lock);
1937     if (!busiest->active_balance) {
1938     busiest->active_balance = 1;
1939     busiest->push_cpu = this_cpu;
1940     wake = 1;
1941     }
1942     spin_unlock(&busiest->lock);
1943     if (wake)
1944     wake_up_process(busiest->migration_thread);
1945    
1946     /*
1947     * We've kicked active balancing, reset the failure
1948     * counter.
1949     */
1950     sd->nr_balance_failed = sd->cache_nice_tries;
1951     }
1952    
1953     /*
1954     * We were unbalanced, but unsuccessful in move_tasks(),
1955     * so bump the balance_interval to lessen the lock contention.
1956     */
1957     if (sd->balance_interval < sd->max_interval)
1958     sd->balance_interval++;
1959     } else {
1960     sd->nr_balance_failed = 0;
1961    
1962     /* We were unbalanced, so reset the balancing interval */
1963     sd->balance_interval = sd->min_interval;
1964     }
1965    
1966     return nr_moved;
1967    
1968     out_balanced:
1969     spin_unlock(&this_rq->lock);
1970    
1971     schedstat_inc(sd, lb_balanced[idle]);
1972    
1973     /* tune up the balancing interval */
1974     if (sd->balance_interval < sd->max_interval)
1975     sd->balance_interval *= 2;
1976    
1977     return 0;
1978     }
1979    
1980     /*
1981     * Check this_cpu to ensure it is balanced within domain. Attempt to move
1982     * tasks if there is an imbalance.
1983     *
1984     * Called from schedule when this_rq is about to become idle (NEWLY_IDLE).
1985     * this_rq is locked.
1986     */
1987     static inline int load_balance_newidle(int this_cpu, runqueue_t *this_rq,
1988     struct sched_domain *sd)
1989     {
1990     struct sched_group *group;
1991     runqueue_t *busiest = NULL;
1992     unsigned long imbalance;
1993     int nr_moved = 0;
1994    
1995     schedstat_inc(sd, lb_cnt[NEWLY_IDLE]);
1996     group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE);
1997     if (!group) {
1998     schedstat_inc(sd, lb_balanced[NEWLY_IDLE]);
1999     schedstat_inc(sd, lb_nobusyg[NEWLY_IDLE]);
2000     goto out;
2001     }
2002    
2003     busiest = find_busiest_queue(group, NEWLY_IDLE);
2004     if (!busiest || busiest == this_rq) {
2005     schedstat_inc(sd, lb_balanced[NEWLY_IDLE]);
2006     schedstat_inc(sd, lb_nobusyq[NEWLY_IDLE]);
2007     goto out;
2008     }
2009    
2010     /* Attempt to move tasks */
2011     double_lock_balance(this_rq, busiest);
2012    
2013     schedstat_add(sd, lb_imbalance[NEWLY_IDLE], imbalance);
2014     nr_moved = move_tasks(this_rq, this_cpu, busiest,
2015     imbalance, sd, NEWLY_IDLE);
2016     if (!nr_moved)
2017     schedstat_inc(sd, lb_failed[NEWLY_IDLE]);
2018    
2019     spin_unlock(&busiest->lock);
2020    
2021     out:
2022     return nr_moved;
2023     }
2024    
2025     /*
2026     * idle_balance is called by schedule() if this_cpu is about to become
2027     * idle. Attempts to pull tasks from other CPUs.
2028     */
2029     static inline void idle_balance(int this_cpu, runqueue_t *this_rq)
2030     {
2031     struct sched_domain *sd;
2032    
2033     for_each_domain(this_cpu, sd) {
2034     if (sd->flags & SD_BALANCE_NEWIDLE) {
2035     if (load_balance_newidle(this_cpu, this_rq, sd)) {
2036     /* We've pulled tasks over so stop searching */
2037     break;
2038     }
2039     }
2040     }
2041     }
2042    
2043     /*
2044     * active_load_balance is run by migration threads. It pushes running tasks
2045     * off the busiest CPU onto idle CPUs. It requires at least 1 task to be
2046     * running on each physical CPU where possible, and avoids physical /
2047     * logical imbalances.
2048     *
2049     * Called with busiest_rq locked.
         *
         * @busiest_rq: runqueue that tasks are pushed away from
         * @busiest_cpu: CPU whose domains are walked; it is never used as a target
         *
         * The function returns as soon as busiest_rq has at most one runnable
         * task. Within this function only the target runqueue's lock is
         * explicitly dropped after each push attempt.
2050     */
2051     static inline void active_load_balance(runqueue_t *busiest_rq, int busiest_cpu)
2052     {
2053     struct sched_domain *sd;
2054     struct sched_group *cpu_group;
2055     runqueue_t *target_rq;
2056     cpumask_t visited_cpus;
2057     int cpu;
2058    
2059     /*
2060     * Search for suitable CPUs to push tasks to in successively higher
2061     * domains with SD_LOAD_BALANCE set.
2062     */
         /* visited_cpus persists across domains so each CPU is tried at most once. */
2063     visited_cpus = CPU_MASK_NONE;
2064     for_each_domain(busiest_cpu, sd) {
2065     if (!(sd->flags & SD_LOAD_BALANCE))
2066     /* no more domains to search */
2067     break;
2068    
2069     schedstat_inc(sd, alb_cnt);
2070    
         /* Walk the domain's circular list of groups exactly once. */
2071     cpu_group = sd->groups;
2072     do {
2073     for_each_cpu_mask(cpu, cpu_group->cpumask) {
2074     if (busiest_rq->nr_running <= 1)
2075     /* no more tasks left to move */
2076     return;
2077     if (cpu_isset(cpu, visited_cpus))
2078     continue;
2079     cpu_set(cpu, visited_cpus);
         /* Only fully idle CPUs (other than the source itself) are targets. */
2080     if (!cpu_and_siblings_are_idle(cpu) || cpu == busiest_cpu)
2081     continue;
2082    
2083     target_rq = cpu_rq(cpu);
2084     /*
2085     * This condition is "impossible", if it occurs
2086     * we need to fix it. Originally reported by
2087     * Bjorn Helgaas on a 128-cpu setup.
2088     */
2089     BUG_ON(busiest_rq == target_rq);
2090    
2091     /* move a task from busiest_rq to target_rq */
2092     double_lock_balance(busiest_rq, target_rq);
         /* Ask move_tasks() for a single task and record the outcome. */
2093     if (move_tasks(target_rq, cpu, busiest_rq,
2094     1, sd, SCHED_IDLE)) {
2095     schedstat_inc(sd, alb_pushed);
2096     } else {
2097     schedstat_inc(sd, alb_failed);
2098     }
2099     spin_unlock(&target_rq->lock);
2100     }
2101     cpu_group = cpu_group->next;
2102     } while (cpu_group != sd->groups);
2103     }
2104     }
2105    
2106     /*
2107     * rebalance_tick will get called every timer tick, on every CPU.
2108     *
2109     * It checks each scheduling domain to see if it is due to be balanced,
2110     * and initiates a balancing operation if so.
2111     *
2112     * Balancing parameters are set up in arch_init_sched_domains.
2113     */
2114    
/*
 * Don't have all balancing operations going off at once: each CPU's
 * balancing phase is shifted by a fraction of HZ proportional to its CPU
 * number. The argument is parenthesized so that an arbitrary expression
 * (e.g. "a + b") is scaled as a whole.
 */
#define CPU_OFFSET(cpu) (HZ * (cpu) / NR_CPUS)
2117    
2118     static void rebalance_tick(int this_cpu, runqueue_t *this_rq,
2119     enum idle_type idle)
2120     {
2121     unsigned long old_load, this_load;
2122     unsigned long j = jiffies + CPU_OFFSET(this_cpu);
2123     struct sched_domain *sd;
2124    
2125     /* Update our load */
2126     old_load = this_rq->cpu_load;
2127     this_load = this_rq->nr_running * SCHED_LOAD_SCALE;
2128     /*
2129     * Round up the averaging division if load is increasing. This
2130     * prevents us from getting stuck on 9 if the load is 10, for
2131     * example.
2132     */
2133     if (this_load > old_load)
2134     old_load++;
2135     this_rq->cpu_load = (old_load + this_load) / 2;
2136    
2137     for_each_domain(this_cpu, sd) {
2138     unsigned long interval;
2139    
2140     if (!(sd->flags & SD_LOAD_BALANCE))
2141     continue;
2142    
2143     interval = sd->balance_interval;
2144    
2145     /* scale ms to jiffies */
2146     interval = msecs_to_jiffies(interval);
2147     if (unlikely(!interval))
2148     interval = 1;
2149    
2150     if (idle != SCHED_IDLE || j - sd->last_balance >= interval) {
2151     if (load_balance(this_cpu, this_rq, sd, idle)) {
2152     /* We've pulled tasks over so no longer idle */
2153     idle = NOT_IDLE;
2154     }
2155     sd->last_balance += interval;
2156     }
2157     }
2158     }
2159     #else
2160     /*
2161     * on UP we do not need to balance between CPUs:
2162     */
2163     static inline void rebalance_tick(int cpu, runqueue_t *rq, enum idle_type idle)
2164     {
2165     }
2166     static inline void idle_balance(int cpu, runqueue_t *rq)
2167     {
2168     }
2169     #endif
2170    
2171     static inline int wake_priority_sleeper(runqueue_t *rq)
2172     {
2173     int ret = 0;
2174     #ifdef CONFIG_SCHED_SMT
2175     spin_lock(&rq->lock);
2176     /*
2177     * If an SMT sibling task has been put to sleep for priority
2178     * reasons reschedule the idle task to see if it can now run.
2179     */
2180     if (rq->nr_running) {
2181     resched_task(rq->idle);
2182     ret = 1;
2183     }
2184     spin_unlock(&rq->lock);
2185     #endif
2186     return ret;
2187     }
2188    
         /*
          * Per-CPU kernel statistics, exported for use outside this file.
          * NOTE(review): the accounting helpers below go through
          * kstat_this_cpu, presumably an accessor for this variable - confirm.
          */
2189     DEFINE_PER_CPU(struct kernel_stat, kstat);
2190    
2191     EXPORT_PER_CPU_SYMBOL(kstat);
2192    
2193     /*
2194     * This is called on clock ticks and on context switches.
2195     * Bank in p->sched_time the ns elapsed since the last tick or switch.
2196     */
2197     static inline void update_cpu_clock(task_t *p, runqueue_t *rq,
2198     unsigned long long now)
2199     {
2200     unsigned long long last = max(p->timestamp, rq->timestamp_last_tick);
2201     p->sched_time += now - last;
2202     }
2203    
2204     /*
2205     * Return current->sched_time plus any more ns on the sched_clock
2206     * that have not yet been banked.
2207     */
2208     unsigned long long current_sched_time(const task_t *tsk)
2209     {
2210     unsigned long long ns;
2211     unsigned long flags;
2212     local_irq_save(flags);
2213     ns = max(tsk->timestamp, task_rq(tsk)->timestamp_last_tick);
2214     ns = tsk->sched_time + (sched_clock() - ns);
2215     local_irq_restore(flags);
2216     return ns;
2217     }
2218    
2219     /*
2220     * Account user cpu time to a process.
2221     * @p: the process that the cpu time gets accounted to
2222     * @hardirq_offset: the offset to subtract from hardirq_count()
2223     * @cputime: the cpu time spent in user space since the last update
2224     */
2225     void account_user_time(struct task_struct *p, cputime_t cputime)
2226     {
2227     struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
2228     cputime64_t tmp;
2229    
2230     p->utime = cputime_add(p->utime, cputime);
2231    
2232     /* Add user time to cpustat. */
2233     tmp = cputime_to_cputime64(cputime);
2234     if (TASK_NICE(p) > 0 || batch_task(p))
2235     cpustat->nice = cputime64_add(cpustat->nice, tmp);
2236     else
2237     cpustat->user = cputime64_add(cpustat->user, tmp);
2238     }
2239    
2240     /*
2241     * Account system cpu time to a process.
2242     * @p: the process that the cpu time gets accounted to
2243     * @hardirq_offset: the offset to subtract from hardirq_count()
2244     * @cputime: the cpu time spent in kernel space since the last update
2245     */
2246     void account_system_time(struct task_struct *p, int hardirq_offset,
2247     cputime_t cputime)
2248     {
2249     struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
2250     runqueue_t *rq = this_rq();
2251     cputime64_t tmp;
2252    
2253     p->stime = cputime_add(p->stime, cputime);
2254    
2255     /* Add system time to cpustat. */
2256     tmp = cputime_to_cputime64(cputime);
2257     if (hardirq_count() - hardirq_offset)
2258     cpustat->irq = cputime64_add(cpustat->irq, tmp);
2259     else if (softirq_count())
2260     cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
2261     else if (p != rq->idle)
2262     cpustat->system = cputime64_add(cpustat->system, tmp);
2263     else if (atomic_read(&rq->nr_iowait) > 0)
2264     cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
2265     else
2266     cpustat->idle = cputime64_add(cpustat->idle, tmp);
2267    
2268     /* Account for system time used */
2269     acct_update_integrals(p);
2270     /* Update rss highwater mark */
2271     update_mem_hiwater(p);
2272     }
2273    
2274     /*
2275     * Account for involuntary wait time.
2276     * @p: the process from which the cpu time has been stolen
2277     * @steal: the cpu time spent in involuntary wait
2278     */
2279     void account_steal_time(struct task_struct *p, cputime_t steal)
2280     {
2281     struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
2282     cputime64_t tmp = cputime_to_cputime64(steal);
2283     runqueue_t *rq = this_rq();
2284    
2285     if (p == rq->idle) {
2286     p->stime = cputime_add(p->stime, steal);
2287     if (atomic_read(&rq->nr_iowait) > 0)
2288     cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
2289     else
2290     cpustat->idle = cputime64_add(cpustat->idle, tmp);
2291     } else
2292     cpustat->steal = cputime64_add(cpustat->steal, tmp);
2293     }
2294    
2295     static void time_slice_expired(task_t *p, runqueue_t *rq)
2296     {
2297     set_tsk_need_resched(p);
2298     dequeue_task(p, rq);
2299     p->prio = effective_prio(p);
2300     p->time_slice = rr_interval(p);
2301     enqueue_task(p, rq);
2302     }
2303    
2304     /*
2305     * This function gets called by the timer code, with HZ frequency.
2306     * We call it with interrupts disabled.
         *
         * It banks the elapsed CPU time for the current task, charges the task's
         * nanosecond debit against its slice/time_slice and, when something
         * expired or a preemption is due, may finish with a NOT_IDLE
         * rebalance_tick(). When the idle task is running it performs the
         * SCHED_IDLE rebalance instead.
2307     */
2308     void scheduler_tick(void)
2309     {
2310     int cpu = smp_processor_id();
2311     runqueue_t *rq = this_rq();
2312     task_t *p = current;
         /*
          * expired_balance starts out as the number of runnable tasks and is
          * zeroed on the paths that must not rebalance at 'out' (SCHED_FIFO
          * tasks and ticks on which nothing expired).
          */
2313     unsigned long debit, expired_balance = rq->nr_running;
2314     unsigned long long now = sched_clock();
2315    
2316     update_cpu_clock(p, rq, now);
2317    
2318     rq->timestamp_last_tick = now;
2319    
         /*
          * Idle task running: either the idle task was rescheduled because
          * tasks are runnable (then fall through to 'out'), or do the
          * idle-time balancing and leave.
          */
2320     if (p == rq->idle) {
2321     if (wake_priority_sleeper(rq))
2322     goto out;
2323     rebalance_tick(cpu, rq, SCHED_IDLE);
2324     return;
2325     }
2326    
2327     /* Task might have expired already, but not scheduled off yet */
2328     if (unlikely(!task_queued(p))) {
2329     set_tsk_need_resched(p);
2330     goto out;
2331     }
2332     /*
2333     * SCHED_FIFO tasks never run out of timeslice.
2334     */
2335     if (unlikely(p->policy == SCHED_FIFO)) {
2336     expired_balance = 0;
2337     goto out;
2338     }
2339    
2340     spin_lock(&rq->lock);
         /*
          * Accumulate the ns run since p->timestamp; nothing further is
          * charged until at least NSJIFFY has built up, and the remainder
          * is carried over.
          */
2341     debit = ns_diff(rq->timestamp_last_tick, p->timestamp);
2342     p->ns_debit += debit;
2343     if (p->ns_debit < NSJIFFY)
2344     goto out_unlock;
2345     p->ns_debit %= NSJIFFY;
2346     /*
2347     * Tasks lose burst each time they use up a full slice().
2348     */
2349     if (!--p->slice) {
2350     dec_burst(p);
2351     p->slice = slice(p);
2352     time_slice_expired(p, rq);
2353     p->totalrun = 0;
2354     goto out_unlock;
2355     }
2356     /*
2357     * Tasks that run out of time_slice but still have slice left get
2358     * requeued with a lower priority && RR_INTERVAL time_slice.
2359     */
2360     if (!--p->time_slice) {
2361     time_slice_expired(p, rq);
2362     goto out_unlock;
2363     }
         /*
          * Nothing expired: count the tick and, if this runqueue is marked
          * preempted, force a reschedule once cache_delay ticks have passed.
          */
2364     rq->cache_ticks++;
2365     if (rq->preempted && rq->cache_ticks >= cache_delay) {
2366     set_tsk_need_resched(p);
2367     goto out_unlock;
2368     }
2369     expired_balance = 0;
2370     out_unlock:
2371     spin_unlock(&rq->lock);
2372     out:
         /* Rebalance only if the flag survived and more than one task is runnable. */
2373     if (expired_balance > 1)
2374     rebalance_tick(cpu, rq, NOT_IDLE);
2375     }
2376    
2377     #ifdef CONFIG_SCHED_SMT
2378     static inline void wakeup_busy_runqueue(runqueue_t *rq)
2379     {
2380     /* If an SMT runqueue is sleeping due to priority reasons wake it up */
2381     if (rq->curr == rq->idle && rq->nr_running)
2382     resched_task(rq->idle);
2383     }
2384    
         /*
          * Give every SMT sibling runqueue of this_cpu a chance to wake up if
          * it is idling despite having runnable tasks. Does nothing unless
          * this_rq's domain has SD_SHARE_CPUPOWER set.
          *
          * this_rq->lock is dropped and re-taken inside, so the caller must
          * re-check the runqueue state afterwards.
          */
2385     static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq)
2386     {
2387     struct sched_domain *sd = this_rq->sd;
2388     cpumask_t sibling_map;
2389     int i;
2390    
2391     if (!(sd->flags & SD_SHARE_CPUPOWER))
2392     return;
2393    
2394     /*
2395     * Unlock the current runqueue because we have to lock in
2396     * CPU order to avoid deadlocks. Caller knows that we might
2397     * unlock. We keep IRQs disabled.
2398     */
2399     spin_unlock(&this_rq->lock);
2400    
2401     sibling_map = sd->span;
2402    
         /* Take the lock of every runqueue in the span, this CPU's included. */
2403     for_each_cpu_mask(i, sibling_map)
2404     spin_lock(&cpu_rq(i)->lock);
2405     /*
2406     * We clear this CPU from the mask. This both simplifies the
2407     * inner loop and keeps this_rq locked when we exit:
2408     */
2409     cpu_clear(this_cpu, sibling_map);
2410    
2411     for_each_cpu_mask(i, sibling_map) {
2412     runqueue_t *smt_rq = cpu_rq(i);
2413    
2414     wakeup_busy_runqueue(smt_rq);
2415     }
2416    
         /* Release the siblings' locks only; this_cpu is no longer in the mask. */
2417     for_each_cpu_mask(i, sibling_map)
2418     spin_unlock(&cpu_rq(i)->lock);
2419     /*
2420     * We exit with this_cpu's rq still held and IRQs
2421     * still disabled:
2422     */
2423     }
2424    
         /*
          * Decide whether the task this runqueue would run next should be held
          * back in favour of what is running on an SMT sibling, and nudge the
          * siblings where the opposite applies.
          *
          * Returns 1 if the candidate should be delayed (schedule() then runs
          * the idle task instead), 0 otherwise. Always returns 0 when this_rq's
          * domain does not have SD_SHARE_CPUPOWER set.
          *
          * this_rq->lock is dropped and re-taken inside; on return it is held
          * again and the siblings' locks have been released.
          */
2425     static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq)
2426     {
2427     struct sched_domain *sd = this_rq->sd;
2428     cpumask_t sibling_map;
2429     int ret = 0, i;
2430     task_t *p;
2431    
2432     if (!(sd->flags & SD_SHARE_CPUPOWER))
2433     return 0;
2434    
2435     /*
2436     * The same locking rules and details apply as for
2437     * wake_sleeping_dependent():
2438     */
2439     spin_unlock(&this_rq->lock);
2440     sibling_map = sd->span;
2441     for_each_cpu_mask(i, sibling_map)
2442     spin_lock(&cpu_rq(i)->lock);
2443     cpu_clear(this_cpu, sibling_map);
2444    
2445     /*
2446     * Establish next task to be run - it might have gone away because
2447     * we released the runqueue lock above:
2448     */
2449     if (!this_rq->nr_running)
2450     goto out_unlock;
2451    
         /* Candidate: first task on the list indexed by the first set bit of the bitmap. */
2452     p = list_entry(this_rq->queue[sched_find_first_bit(this_rq->bitmap)].next,
2453     task_t, run_list);
2454    
2455     for_each_cpu_mask(i, sibling_map) {
2456     runqueue_t *smt_rq = cpu_rq(i);
2457     task_t *smt_curr = smt_rq->curr;
2458    
2459     /* Kernel threads do not participate in dependent sleeping */
2460     if (!p->mm || !smt_curr->mm || rt_task(p))
2461     goto check_smt_task;
2462    
2463     /*
2464     * If a user task with lower static priority than the
2465     * running task on the SMT sibling is trying to schedule,
2466     * delay it till there is proportionately less timeslice
2467     * left of the sibling task to prevent a lower priority
2468     * task from using an unfair proportion of the
2469     * physical cpu's resources. -ck
2470     */
2471     if (rt_task(smt_curr)) {
2472     /*
2473     * With real time tasks we run non-rt tasks only
2474     * per_cpu_gain% of the time.
2475     */
2476     if ((jiffies % DEF_TIMESLICE) >
2477     (sd->per_cpu_gain * DEF_TIMESLICE / 100))
2478     ret = 1;
2479     else if (batch_task(p))
2480     ret = 1;
2481     } else {
2482     if (((smt_curr->slice * (100 - sd->per_cpu_gain) /
2483     100) > slice(p)))
2484     ret = 1;
2485     else if (batch_task(p) && !batch_task(smt_curr) &&
2486     smt_curr->slice * sd->per_cpu_gain >
2487     slice(smt_curr))
2488     /*
2489     * With batch tasks they run just the last
2490     * per_cpu_gain percent of the smt task's slice.
2491     */
2492     ret = 1;
2493     }
2494    
2495     check_smt_task:
         /*
          * Second half, mirror image of the above: should the sibling's
          * current task be disturbed because of our candidate? Siblings
          * running a non-idle kernel thread or a real time task are left
          * alone.
          */
2496     if ((!smt_curr->mm && smt_curr != smt_rq->idle) ||
2497     rt_task(smt_curr))
2498     continue;
         /* Our candidate is a kernel thread: just wake a sleeping sibling. */
2499     if (!p->mm) {
2500     wakeup_busy_runqueue(smt_rq);
2501     continue;
2502     }
2503    
2504     /*
2505     * Reschedule a lower priority task on the SMT sibling,
2506     * or wake it up if it has been put to sleep for priority
2507     * reasons to see if it should run now.
2508     */
2509     if (rt_task(p)) {
2510     if ((jiffies % DEF_TIMESLICE) >
2511     (sd->per_cpu_gain * DEF_TIMESLICE / 100))
2512     resched_task(smt_curr);
2513     else if (batch_task(smt_curr))
2514     resched_task(smt_curr);
2515     } else {
2516     if ((p->slice * (100 - sd->per_cpu_gain) / 100) >
2517     slice(smt_curr))
2518     resched_task(smt_curr);
2519     else if (batch_task(smt_curr) && !batch_task(p) &&
2520     p->slice * sd->per_cpu_gain > slice(p))
2521     resched_task(smt_curr);
2522     else
2523     wakeup_busy_runqueue(smt_rq);
2524     }
2525     }
2526     out_unlock:
         /* this_cpu was cleared from the mask, so this_rq stays locked. */
2527     for_each_cpu_mask(i, sibling_map)
2528     spin_unlock(&cpu_rq(i)->lock);
2529     return ret;
2530     }
2531     #else
2532     static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq)
2533     {
2534     }
2535    
2536     static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq)
2537     {
2538     return 0;
2539     }
2540     #endif
2541    
2542     #if defined(CONFIG_PREEMPT) && defined(CONFIG_DEBUG_PREEMPT)
2543    
2544     void fastcall add_preempt_count(int val)
2545     {
2546     /*
2547     * Underflow?
2548     */
2549     BUG_ON(((int)preempt_count() < 0));
2550     preempt_count() += val;
2551     /*
2552     * Spinlock count overflowing soon?
2553     */
2554     BUG_ON((preempt_count() & PREEMPT_MASK) >= PREEMPT_MASK-10);
2555     }
2556     EXPORT_SYMBOL(add_preempt_count);
2557    
2558     void fastcall sub_preempt_count(int val)
2559     {
2560     /*
2561     * Underflow?
2562     */
2563     BUG_ON(val > preempt_count());
2564     /*
2565     * Is the spinlock portion underflowing?
2566     */
2567     BUG_ON((val < PREEMPT_MASK) && !(preempt_count() & PREEMPT_MASK));
2568     preempt_count() -= val;
2569     }
2570     EXPORT_SYMBOL(sub_preempt_count);
2571    
2572     #endif
2573    
2574     /*
2575     * schedule() is the main scheduler function.
         *
         * Outline of one pass:
         *  - complain if called from atomic context (unless the task is exiting);
         *  - under rq->lock, account prev's runtime and, if prev is going to
         *    sleep, take it off the runqueue;
         *  - if the runqueue is empty try idle_balance(), otherwise let
         *    dependent_sleeper() veto the next task on SMT;
         *  - pick the first task of the highest-priority non-empty queue (or the
         *    idle task) and context switch to it;
         *  - loop again if the kernel lock could not be re-taken or
         *    TIF_NEED_RESCHED got set in the meantime.
2576     */
2577     asmlinkage void __sched schedule(void)
2578     {
2579     long *switch_count;
2580     task_t *prev, *next;
2581     runqueue_t *rq;
2582     struct list_head *queue;
2583     unsigned long long now;
2584     unsigned long debit;
2585     int cpu, idx;
2586    
2587     /*
2588     * Test if we are atomic. Since do_exit() needs to call into
2589     * schedule() atomically, we ignore that path for now.
2590     * Otherwise, whine if we are scheduling when we should not be.
2591     */
2592     if (likely(!current->exit_state)) {
2593     if (unlikely(in_atomic())) {
2594     printk(KERN_ERR "scheduling while atomic: "
2595     "%s/0x%08x/%d\n",
2596     current->comm, preempt_count(), current->pid);
2597     dump_stack();
2598     }
2599     }
2600     profile_hit(SCHED_PROFILING, __builtin_return_address(0));
2601    
2602     need_resched:
2603     preempt_disable();
2604     prev = current;
2605     release_kernel_lock(prev);
2606     need_resched_nonpreemptible:
2607     rq = this_rq();
2608    
2609     /*
2610     * The idle thread is not allowed to schedule!
2611     * Remove this check after it has been exercised a bit.
2612     */
2613     if (unlikely(prev == rq->idle) && prev->state != TASK_RUNNING) {
2614     printk(KERN_ERR "bad: scheduling from the idle thread!\n");
2615     dump_stack();
2616     }
2617    
2618     schedstat_inc(rq, sched_cnt);
2619     now = sched_clock();
2620    
2621     spin_lock_irq(&rq->lock);
         /*
          * Record how long prev has run since its timestamp and add the
          * sub-NSJIFFY time since the last tick to its ns_debit.
          */
2622     prev->runtime = ns_diff(now, prev->timestamp);
2623     debit = ns_diff(now, rq->timestamp_last_tick) % NSJIFFY;
2624     prev->ns_debit += debit;
2625    
2626     if (unlikely(prev->flags & PF_DEAD))
2627     prev->state = EXIT_DEAD;
2628    
         /*
          * Count the switch in ->nivcsw by default, and in ->nvcsw when prev
          * is leaving in a non-running state without PREEMPT_ACTIVE set. In
          * the latter case prev is deactivated, unless an interruptible
          * sleeper has a signal pending, which makes it runnable again.
          */
2629     switch_count = &prev->nivcsw;
2630     if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
2631     switch_count = &prev->nvcsw;
2632     if (unlikely((prev->state & TASK_INTERRUPTIBLE) &&
2633     unlikely(signal_pending(prev))))
2634     prev->state = TASK_RUNNING;
2635     else {
2636     if (prev->state == TASK_UNINTERRUPTIBLE) {
2637     prev->flags |= PF_NONSLEEP;
2638     rq->nr_uninterruptible++;
2639     }
2640     deactivate_task(prev, rq);
2641     }
2642     }
2643    
2644     cpu = smp_processor_id();
2645     if (unlikely(!rq->nr_running)) {
2646     go_idle:
         /* Nothing runnable here: try to pull work over before going idle. */
2647     idle_balance(cpu, rq);
2648     if (!rq->nr_running) {
2649     next = rq->idle;
2650     wake_sleeping_dependent(cpu, rq);
2651     /*
2652     * wake_sleeping_dependent() might have released
2653     * the runqueue, so break out if we got new
2654     * tasks meanwhile:
2655     */
2656     if (!rq->nr_running)
2657     goto switch_tasks;
2658     }
2659     } else {
2660     if (dependent_sleeper(cpu, rq)) {
2661     next = rq->idle;
2662     goto switch_tasks;
2663     }
2664     /*
2665     * dependent_sleeper() releases and reacquires the runqueue
2666     * lock, hence go into the idle loop if the rq went
2667     * empty meanwhile:
2668     */
2669     if (unlikely(!rq->nr_running))
2670     goto go_idle;
2671     }
2672    
         /*
          * Pick the next task: the first set bit in rq->bitmap selects the
          * queue, and the task at the head of that list runs next.
          */
2673     idx = sched_find_first_bit(rq->bitmap);
2674     queue = rq->queue + idx;
2675     next = list_entry(queue->next, task_t, run_list);
2676    
2677     switch_tasks:
2678     if (next == rq->idle)
2679     schedstat_inc(rq, sched_goidle);
2680     prev->timestamp = now;
2681     if (unlikely(next->flags & PF_YIELDED)) {
2682     /*
2683     * Tasks that have yield()ed get requeued at normal priority
2684     */
2685     int newprio = effective_prio(next);
2686     next->flags &= ~PF_YIELDED;
2687     if (newprio != next->prio) {
2688     dequeue_task(next, rq);
2689     next->prio = newprio;
2690     enqueue_task(next, rq);
2691     }
2692     }
2693    
2694     prefetch(next);
2695     clear_tsk_need_resched(prev);
2696     rcu_qsctr_inc(task_cpu(prev));
2697    
2698     update_cpu_clock(prev, rq, now);
2699    
2700     sched_info_switch(prev, next);
2701     if (likely(prev != next)) {
         /* Real switch: reset the per-runqueue preemption bookkeeping. */
2702     rq->preempted = 0;
2703     rq->cache_ticks = 0;
2704     next->timestamp = now;
2705     rq->nr_switches++;
2706     rq->curr = next;
2707     ++*switch_count;
2708    
2709     prepare_arch_switch(rq, next);
2710     prev = context_switch(rq, prev, next);
2711     barrier();
2712    
         /*
          * NOTE(review): rq->lock is not unlocked on this path here;
          * presumably finish_task_switch() releases it - confirm.
          */
2713     finish_task_switch(prev);
2714     } else
2715     spin_unlock_irq(&rq->lock);
2716    
         /* Re-take the kernel lock if needed; a negative result means go around again. */
2717     prev = current;
2718     if (unlikely(reacquire_kernel_lock(prev) < 0))
2719     goto need_resched_nonpreemptible;
2720     preempt_enable_no_resched();
2721     if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
2722     goto need_resched;
2723     }
2724    
2725     EXPORT_SYMBOL(schedule);
2726    
2727     #ifdef CONFIG_PREEMPT
2728     /*
2729     * this is is the entry point to schedule() from in-kernel preemption
2730     * off of preempt_enable. Kernel preemptions off return from interrupt
2731     * occur there and call schedule directly.
2732     */
2733     asmlinkage void __sched preempt_schedule(void)
2734     {
2735     struct thread_info *ti = current_thread_info();
2736     #ifdef CONFIG_PREEMPT_BKL
2737     struct task_struct *task = current;
2738     int saved_lock_depth;
2739     #endif
2740     /*
2741     * If there is a non-zero preempt_count or interrupts are disabled,
2742     * we do not want to preempt the current task. Just return..
2743     */
2744     if (unlikely(ti->preempt_count || irqs_disabled()))
2745     return;
2746    
2747     need_resched:
2748     add_preempt_count(PREEMPT_ACTIVE);
2749     /*
2750     * We keep the big kernel semaphore locked, but we
2751     * clear ->lock_depth so that schedule() doesnt
2752     * auto-release the semaphore:
2753     */
2754     #ifdef CONFIG_PREEMPT_BKL
2755     saved_lock_depth = task->lock_depth;
2756     task->lock_depth = -1;
2757     #endif
2758     schedule();
2759     #ifdef CONFIG_PREEMPT_BKL
2760     task->lock_depth = saved_lock_depth;
2761     #endif
2762     sub_preempt_count(PREEMPT_ACTIVE);
2763    
2764     /* we could miss a preemption opportunity between schedule and now */
2765     barrier();
2766     if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
2767     goto need_resched;
2768     }
2769    
2770     EXPORT_SYMBOL(preempt_schedule);
2771    
2772     /*
2773     * this is is the entry point to schedule() from kernel preemption
2774     * off of irq context.
2775     * Note, that this is called and return with irqs disabled. This will
2776     * protect us against recursive calling from irq.
2777     */
2778     asmlinkage void __sched preempt_schedule_irq(void)
2779     {
2780     struct thread_info *ti = current_thread_info();
2781     #ifdef CONFIG_PREEMPT_BKL
2782     struct task_struct *task = current;
2783     int saved_lock_depth;
2784     #endif
2785     /* Catch callers which need to be fixed*/
2786     BUG_ON(ti->preempt_count || !irqs_disabled());
2787    
2788     need_resched:
2789     add_preempt_count(PREEMPT_ACTIVE);
2790     /*
2791     * We keep the big kernel semaphore locked, but we
2792     * clear ->lock_depth so that schedule() doesnt
2793     * auto-release the semaphore:
2794     */
2795     #ifdef CONFIG_PREEMPT_BKL
2796     saved_lock_depth = task->lock_depth;
2797     task->lock_depth = -1;
2798     #endif
2799     local_irq_enable();
2800     schedule();
2801     local_irq_disable();
2802     #ifdef CONFIG_PREEMPT_BKL
2803     task->lock_depth = saved_lock_depth;
2804     #endif
2805     sub_preempt_count(PREEMPT_ACTIVE);
2806    
2807     /* we could miss a preemption opportunity between schedule and now */
2808     barrier();
2809     if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
2810     goto need_resched;
2811     }
2812    
2813     #endif /* CONFIG_PREEMPT */
2814    
2815     int default_wake_function(wait_queue_t *curr, unsigned mode, int sync, void *key)
2816     {
2817     task_t *p = curr->task;
2818     return try_to_wake_up(p, mode, sync);
2819     }
2820    
2821     EXPORT_SYMBOL(default_wake_function);
2822    
2823     /*
2824     * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just
2825     * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve
2826     * number) then we wake all the non-exclusive tasks and one exclusive task.
2827     *
2828     * There are circumstances in which we can try to wake a task which has already
2829     * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
2830     * zero in this (rare) case, and we handle it by continuing to scan the queue.
2831     */
2832     static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
2833     int nr_exclusive, int sync, void *key)
2834     {
2835     struct list_head *tmp, *next;
2836    
2837     list_for_each_safe(tmp, next, &q->task_list) {
2838     wait_queue_t *curr;
2839     unsigned flags;
2840     curr = list_entry(tmp, wait_queue_t, task_list);
2841     flags = curr->flags;
2842     if (curr->func(curr, mode, sync, key) &&
2843     (flags & WQ_FLAG_EXCLUSIVE) &&
2844     !--nr_exclusive)
2845     break;
2846     }
2847     }
2848    
2849     /**
2850     * __wake_up - wake up threads blocked on a waitqueue.
2851     * @q: the waitqueue
2852     * @mode: which threads
2853     * @nr_exclusive: how many wake-one or wake-many threads to wake up
2854     * @key: is directly passed to the wakeup function
2855     */
2856     void fastcall __wake_up(wait_queue_head_t *q, unsigned int mode,
2857     int nr_exclusive, void *key)
2858     {
2859     unsigned long flags;
2860    
2861     spin_lock_irqsave(&q->lock, flags);
2862     __wake_up_common(q, mode, nr_exclusive, 0, key);
2863     spin_unlock_irqrestore(&q->lock, flags);
2864     }
2865    
2866     EXPORT_SYMBOL(__wake_up);
2867    
2868     /*
2869     * Same as __wake_up but called with the spinlock in wait_queue_head_t held.
2870     */
2871     void fastcall __wake_up_locked(wait_queue_head_t *q, unsigned int mode)
2872     {
2873     __wake_up_common(q, mode, 1, 0, NULL);
2874     }
2875    
2876     /**
2877     * __wake_up_sync - wake up threads blocked on a waitqueue.
2878     * @q: the waitqueue
2879     * @mode: which threads
2880     * @nr_exclusive: how many wake-one or wake-many threads to wake up
2881     *
2882     * The sync wakeup differs that the waker knows that it will schedule
2883     * away soon, so while the target thread will be woken up, it will not
2884     * be migrated to another CPU - ie. the two threads are 'synchronized'
2885     * with each other. This can prevent needless bouncing between CPUs.
2886     *
2887     * On UP it can prevent extra preemption.
2888     */
2889     void fastcall __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
2890     {
2891     unsigned long flags;
2892     int sync = 1;
2893    
2894     if (unlikely(!q))
2895     return;
2896    
2897     if (unlikely(!nr_exclusive))
2898     sync = 0;
2899    
2900     spin_lock_irqsave(&q->lock, flags);
2901     __wake_up_common(q, mode, nr_exclusive, sync, NULL);
2902     spin_unlock_irqrestore(&q->lock, flags);
2903     }
2904     EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */
2905    
2906     void fastcall complete(struct completion *x)
2907     {
2908     unsigned long flags;
2909    
2910     spin_lock_irqsave(&x->wait.lock, flags);
2911     x->done++;
2912     __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE,
2913     1, 0, NULL);
2914     spin_unlock_irqrestore(&x->wait.lock, flags);
2915     }
2916     EXPORT_SYMBOL(complete);
2917    
2918     void fastcall complete_all(struct completion *x)
2919     {
2920     unsigned long flags;
2921    
2922     spin_lock_irqsave(&x->wait.lock, flags);
2923     x->done += UINT_MAX/2;
2924     __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE,
2925     0, 0, NULL);
2926     spin_unlock_irqrestore(&x->wait.lock, flags);
2927     }
2928     EXPORT_SYMBOL(complete_all);
2929    
2930     void fastcall __sched wait_for_completion(struct completion *x)
2931     {
2932     might_sleep();
2933     spin_lock_irq(&x->wait.lock);
2934     if (!x->done) {
2935     DECLARE_WAITQUEUE(wait, current);
2936    
2937     wait.flags |= WQ_FLAG_EXCLUSIVE;
2938     __add_wait_queue_tail(&x->wait, &wait);
2939     do {
2940     __set_current_state(TASK_UNINTERRUPTIBLE);
2941     spin_unlock_irq(&x->wait.lock);
2942     schedule();
2943     spin_lock_irq(&x->wait.lock);
2944     } while (!x->done);
2945     __remove_wait_queue(&x->wait, &wait);
2946     }
2947     x->done--;
2948     spin_unlock_irq(&x->wait.lock);
2949     }
2950     EXPORT_SYMBOL(wait_for_completion);
2951    
2952     unsigned long fastcall __sched
2953     wait_for_completion_timeout(struct completion *x, unsigned long timeout)
2954     {
2955     might_sleep();
2956    
2957     spin_lock_irq(&x->wait.lock);
2958     if (!x->done) {
2959     DECLARE_WAITQUEUE(wait, current);
2960    
2961     wait.flags |= WQ_FLAG_EXCLUSIVE;
2962     __add_wait_queue_tail(&x->wait, &wait);
2963     do {
2964     __set_current_state(TASK_UNINTERRUPTIBLE);
2965     spin_unlock_irq(&x->wait.lock);
2966     timeout = schedule_timeout(timeout);
2967     spin_lock_irq(&x->wait.lock);
2968     if (!timeout) {
2969     __remove_wait_queue(&x->wait, &wait);
2970     goto out;
2971     }
2972     } while (!x->done);
2973     __remove_wait_queue(&x->wait, &wait);
2974     }
2975     x->done--;
2976     out:
2977     spin_unlock_irq(&x->wait.lock);
2978     return timeout;
2979     }
2980     EXPORT_SYMBOL(wait_for_completion_timeout);
2981    
/*
 * wait_for_completion_interruptible - wait for @x, but give up on a signal.
 *
 * Returns 0 once a completion has been consumed (x->done decremented), or
 * -ERESTARTSYS if a signal is pending for the current task when it is
 * about to go (back) to sleep; in that case x->done is left untouched.
 * If x->done is already non-zero on entry, no signal check is made.
 */
int fastcall __sched wait_for_completion_interruptible(struct completion *x)
{
	int ret = 0;

	might_sleep();

	spin_lock_irq(&x->wait.lock);
	if (!x->done) {
		DECLARE_WAITQUEUE(wait, current);

		wait.flags |= WQ_FLAG_EXCLUSIVE;
		__add_wait_queue_tail(&x->wait, &wait);
		do {
			/* checked before every sleep, with the lock held */
			if (signal_pending(current)) {
				ret = -ERESTARTSYS;
				__remove_wait_queue(&x->wait, &wait);
				goto out;
			}
			__set_current_state(TASK_INTERRUPTIBLE);
			spin_unlock_irq(&x->wait.lock);
			schedule();
			spin_lock_irq(&x->wait.lock);
		} while (!x->done);
		__remove_wait_queue(&x->wait, &wait);
	}
	x->done--;
out:
	spin_unlock_irq(&x->wait.lock);

	return ret;
}
EXPORT_SYMBOL(wait_for_completion_interruptible);
3014    
/*
 * wait_for_completion_interruptible_timeout - wait for @x with both a
 * timeout and signal interruption.
 * @x: the completion to wait for
 * @timeout: value handed to schedule_timeout()
 *
 * Returns 0 if schedule_timeout() reported that the timeout ran out, the
 * remaining timeout value if a completion was consumed, or -ERESTARTSYS
 * if a signal was pending before a sleep.  x->done is only decremented in
 * the "completion consumed" case.
 *
 * NOTE(review): the return type is unsigned long, so -ERESTARTSYS reaches
 * the caller as a very large positive value unless the caller converts
 * the result to a signed type before testing it.
 */
unsigned long fastcall __sched
wait_for_completion_interruptible_timeout(struct completion *x,
					  unsigned long timeout)
{
	might_sleep();

	spin_lock_irq(&x->wait.lock);
	if (!x->done) {
		DECLARE_WAITQUEUE(wait, current);

		wait.flags |= WQ_FLAG_EXCLUSIVE;
		__add_wait_queue_tail(&x->wait, &wait);
		do {
			/* checked before every sleep, with the lock held */
			if (signal_pending(current)) {
				timeout = -ERESTARTSYS;
				__remove_wait_queue(&x->wait, &wait);
				goto out;
			}
			__set_current_state(TASK_INTERRUPTIBLE);
			spin_unlock_irq(&x->wait.lock);
			timeout = schedule_timeout(timeout);
			spin_lock_irq(&x->wait.lock);
			/* timed out: bail out without consuming x->done */
			if (!timeout) {
				__remove_wait_queue(&x->wait, &wait);
				goto out;
			}
		} while (!x->done);
		__remove_wait_queue(&x->wait, &wait);
	}
	x->done--;
out:
	spin_unlock_irq(&x->wait.lock);
	return timeout;
}
EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
3050    
3051    
/*
 * Helper macros for the sleep_on() family.  They expand to bare
 * declarations/statements (not do { } while (0) blocks) and rely on the
 * enclosing function having a wait_queue_head_t pointer named 'q'.
 *
 * SLEEP_ON_VAR declares 'flags' and an on-stack wait entry 'wait'
 * initialised for the current task.
 */
#define SLEEP_ON_VAR \
	unsigned long flags; \
	wait_queue_t wait; \
	init_waitqueue_entry(&wait, current);

/*
 * Queues 'wait' on q.  The lock is taken with spin_lock_irqsave() but
 * released with plain spin_unlock(), so the interrupt state saved in
 * 'flags' is not restored by this macro.
 */
#define SLEEP_ON_HEAD \
	spin_lock_irqsave(&q->lock,flags); \
	__add_wait_queue(q, &wait); \
	spin_unlock(&q->lock);

/*
 * Removes 'wait' from q again and restores the interrupt state that
 * SLEEP_ON_HEAD saved in 'flags'.
 */
#define SLEEP_ON_TAIL \
	spin_lock_irq(&q->lock); \
	__remove_wait_queue(q, &wait); \
	spin_unlock_irqrestore(&q->lock, flags);
3066    
/*
 * interruptible_sleep_on - sleep on @q in TASK_INTERRUPTIBLE state.
 *
 * The task state is set first, then the task is queued on @q and
 * schedule() is called.  The wait entry is removed from @q once
 * schedule() returns.
 */
void fastcall __sched interruptible_sleep_on(wait_queue_head_t *q)
{
	SLEEP_ON_VAR

	current->state = TASK_INTERRUPTIBLE;

	SLEEP_ON_HEAD
	schedule();
	SLEEP_ON_TAIL
}

EXPORT_SYMBOL(interruptible_sleep_on);
3079    
/*
 * interruptible_sleep_on_timeout - sleep on @q in TASK_INTERRUPTIBLE
 * state, bounded by @timeout.
 *
 * Same sequence as interruptible_sleep_on(), except that
 * schedule_timeout(@timeout) is used for the sleep.  Returns whatever
 * schedule_timeout() returned.
 */
long fastcall __sched interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
{
	SLEEP_ON_VAR

	current->state = TASK_INTERRUPTIBLE;

	SLEEP_ON_HEAD
	timeout = schedule_timeout(timeout);
	SLEEP_ON_TAIL

	return timeout;
}

EXPORT_SYMBOL(interruptible_sleep_on_timeout);
3094    
/*
 * sleep_on - sleep on @q in TASK_UNINTERRUPTIBLE state.
 *
 * The task state is set first, then the task is queued on @q and
 * schedule() is called.  The wait entry is removed from @q once
 * schedule() returns.
 */
void fastcall __sched sleep_on(wait_queue_head_t *q)
{
	SLEEP_ON_VAR

	current->state = TASK_UNINTERRUPTIBLE;

	SLEEP_ON_HEAD
	schedule();
	SLEEP_ON_TAIL
}

EXPORT_SYMBOL(sleep_on);
3107    
/*
 * sleep_on_timeout - sleep on @q in TASK_UNINTERRUPTIBLE state, bounded
 * by @timeout.
 *
 * Same sequence as sleep_on(), except that schedule_timeout(@timeout) is
 * used for the sleep.  Returns whatever schedule_timeout() returned.
 */
long fastcall __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
{
	SLEEP_ON_VAR

	current->state = TASK_UNINTERRUPTIBLE;

	SLEEP_ON_HEAD
	timeout = schedule_timeout(timeout);
	SLEEP_ON_TAIL

	return timeout;
}

EXPORT_SYMBOL(sleep_on_timeout);
3122    
/*
 * set_user_nice - change the nice value of task @p to @nice.
 *
 * Returns without doing anything when @p already has that nice value or
 * when @nice lies outside [-20, 19].
 */
void set_user_nice(task_t *p, long nice)
{
	unsigned long flags;
	runqueue_t *rq;
	int queued, old_prio, new_prio, delta;

	if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
		return;
	/*
	 * We have to be careful, if called from sys_setpriority(),
	 * the task might be in the middle of scheduling on another CPU.
	 */
	rq = task_rq_lock(p, &flags);
	/*
	 * The RT priorities are set via sched_setscheduler(), but we still
	 * allow the 'normal' nice value to be set. For an RT task only
	 * static_prio is recorded here and p->prio is left untouched, so
	 * the new nice value has no effect on scheduling while the task
	 * is an RT task:
	 */
	if (rt_task(p)) {
		p->static_prio = NICE_TO_PRIO(nice);
		goto out_unlock;
	}
	/* Take a queued task off the runqueue while its priority changes. */
	if ((queued = task_queued(p))) {
		dequeue_task(p, rq);
		dec_prio_bias(rq, p->static_prio);
	}

	/* delta is measured against the current p->prio, not static_prio */
	old_prio = p->prio;
	new_prio = NICE_TO_PRIO(nice);
	delta = new_prio - old_prio;
	p->static_prio = NICE_TO_PRIO(nice);
	p->prio += delta;

	if (queued) {
		enqueue_task(p, rq);
		inc_prio_bias(rq, p->static_prio);
		/*
		 * If the task increased its priority (delta < 0), or it is
		 * currently running and either lowered its priority or is
		 * a batch task, then reschedule its CPU:
		 */
		if (delta < 0 || ((delta > 0 || batch_task(p)) &&
			task_running(rq, p)))
			resched_task(rq->curr);
	}
out_unlock:
	task_rq_unlock(rq, &flags);
}

EXPORT_SYMBOL(set_user_nice);
3173    
3174     /*
3175     * can_nice - check if a task can reduce its nice value
3176     * @p: task
3177     * @nice: nice value
3178     */
3179     int can_nice(const task_t *p, const int nice)
3180     {
3181     /* convert nice value [19,-20] to rlimit style value [0,39] */
3182     int nice_rlim = 19 - nice;
3183     return (nice_rlim <= p->signal->rlim[RLIMIT_NICE].rlim_cur ||
3184     capable(CAP_SYS_NICE));
3185     }
3186    
#ifdef __ARCH_WANT_SYS_NICE

/*
 * sys_nice - change the priority of the current process.
 * @increment: priority increment
 *
 * sys_setpriority is a more generic, but much slower function that
 * does similar things.
 *
 * Returns 0 on success, -EPERM when a negative increment is not
 * permitted by can_nice(), or the error from security_task_setnice().
 */
asmlinkage long sys_nice(int increment)
{
	long nice;
	int err;

	/*
	 * Setpriority might change our priority at the same moment.
	 * We don't have to worry. Conceptually one call occurs first
	 * and we have a single winner.
	 */
	if (increment < -40)
		increment = -40;
	else if (increment > 40)
		increment = 40;

	/* Clamp the resulting nice value to the valid range. */
	nice = PRIO_TO_NICE(current->static_prio) + increment;
	if (nice < -20)
		nice = -20;
	else if (nice > 19)
		nice = 19;

	/* Only a negative increment needs the permission check. */
	if (increment < 0 && !can_nice(current, nice))
		return -EPERM;

	err = security_task_setnice(current, nice);
	if (!err)
		set_user_nice(current, nice);

	return err;
}

#endif
3229    
/**
 * task_prio - return the priority value of a given task.
 * @p: the task in question.
 *
 * This is the priority value as seen by users in /proc.
 * RT tasks are offset by -200. Normal tasks are centered
 * around 0, value goes from -16 to +15.
 *
 * The value is computed as p->prio - MAX_RT_PRIO.
 * NOTE(review): the numeric ranges quoted above cannot be confirmed
 * from this function alone - verify them against the priority macros.
 */
int task_prio(const task_t *p)
{
	return p->prio - MAX_RT_PRIO;
}
3242    
/**
 * task_nice - return the nice value of a given task.
 * @p: the task in question.
 *
 * Function wrapper around the TASK_NICE() macro.
 */
int task_nice(const task_t *p)
{
	return TASK_NICE(p);
}

/*
 * The only users of task_nice are binfmt_elf and binfmt_elf32.
 * binfmt_elf is no longer modular, but binfmt_elf32 still is.
 * Therefore, task_nice only needs to be exported if there is a
 * compat mode (CONFIG_COMPAT).
 */
#ifdef CONFIG_COMPAT
EXPORT_SYMBOL_GPL(task_nice);
#endif
3260    
/**
 * idle_cpu - is a given cpu idle currently?
 * @cpu: the processor in question.
 *
 * Returns non-zero when the task currently running on @cpu is the idle
 * task of that cpu's runqueue. No lock is taken here, so the answer is
 * only a snapshot.
 */
int idle_cpu(int cpu)
{
	return cpu_curr(cpu) == cpu_rq(cpu)->idle;
}

EXPORT_SYMBOL_GPL(idle_cpu);
3271    
/**
 * idle_task - return the idle task for a given cpu.
 * @cpu: the processor in question.
 *
 * Returns the ->idle pointer stored in @cpu's runqueue.
 */
task_t *idle_task(int cpu)
{
	return cpu_rq(cpu)->idle;
}
3280    
3281     /**
3282     * find_process_by_pid - find a process with a matching PID value.
3283     * @pid: the pid in question.
3284     */
3285     static inline task_t *find_process_by_pid(pid_t pid)
3286     {
3287     return pid ? find_task_by_pid(pid) : current;
3288     }
3289    
3290     /* Actually do priority change: must hold rq lock. */
3291     static void __setscheduler(struct task_struct *p, int policy, int prio)
3292     {
3293     BUG_ON(task_queued(p));
3294     p->policy = policy;
3295     p->rt_priority = prio;
3296     if (SCHED_RT(policy))
3297     p->prio = MAX_USER_RT_PRIO-1 - p->rt_priority;
3298     else
3299     p->prio = p->static_prio;
3300     }
3301    
/**
 * sched_setscheduler - change the scheduling policy and/or RT priority of
 * a thread.
 * @p: the task in question.
 * @policy: new policy; a negative value keeps the task's current policy.
 * @param: structure containing the new RT priority.
 *
 * Returns 0 on success, -EINVAL for an invalid policy/priority
 * combination (or SCHED_BATCH on a task without an mm), -EPERM when the
 * caller lacks permission, or the error returned by
 * security_task_setscheduler().
 */
int sched_setscheduler(struct task_struct *p, int policy, struct sched_param *param)
{
	int retval;
	int queued, oldprio, oldpolicy = -1;
	unsigned long flags;
	runqueue_t *rq;

recheck:
	/* double check policy once rq lock held */
	if (policy < 0)
		policy = oldpolicy = p->policy;
	else if (!SCHED_RANGE(policy))
		return -EINVAL;
	/*
	 * Valid priorities for SCHED_FIFO and SCHED_RR are
	 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL is 0.
	 */
	if (param->sched_priority < 0 ||
	    param->sched_priority > MAX_USER_RT_PRIO-1)
		return -EINVAL;
	/* a non-RT policy requires priority 0, an RT policy a non-zero one */
	if ((!SCHED_RT(policy)) != (param->sched_priority == 0))
		return -EINVAL;

	/* RT priorities above RLIMIT_RTPRIO need CAP_SYS_NICE */
	if (SCHED_RT(policy) &&
	    param->sched_priority > p->signal->rlim[RLIMIT_RTPRIO].rlim_cur &&
	    !capable(CAP_SYS_NICE))
		return -EPERM;
	/* changing another user's task needs CAP_SYS_NICE as well */
	if ((current->euid != p->euid) && (current->euid != p->uid) &&
	    !capable(CAP_SYS_NICE))
		return -EPERM;

	if (!(p->mm) && policy == SCHED_BATCH)
		/*
		 * Don't allow kernel threads to be SCHED_BATCH.
		 */
		return -EINVAL;

	retval = security_task_setscheduler(p, policy, param);
	if (retval)
		return retval;
	/*
	 * To be able to change p->policy safely, the appropriate
	 * runqueue lock must be held.
	 */
	rq = task_rq_lock(p, &flags);
	/*
	 * recheck policy now with rq lock held: if the policy was taken
	 * from the task and has changed in the meantime, redo all checks.
	 */
	if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
		policy = oldpolicy = -1;
		task_rq_unlock(rq, &flags);
		goto recheck;
	}
	/* __setscheduler() requires the task to be off the runqueue */
	if ((queued = task_queued(p)))
		deactivate_task(p, rq);
	oldprio = p->prio;
	__setscheduler(p, policy, param->sched_priority);
	if (queued) {
		__activate_task(p, rq);
		/*
		 * Reschedule if we are currently running on this runqueue and
		 * our priority decreased, or if we are not currently running on
		 * this runqueue and our priority is higher than the current's
		 */
		if (task_running(rq, p)) {
			if (p->prio > oldprio)
				resched_task(rq->curr);
		} else
			preempt(p, rq);
	}
	task_rq_unlock(rq, &flags);
	return 0;
}
EXPORT_SYMBOL_GPL(sched_setscheduler);
3381    
3382     static int do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
3383     {
3384     int retval;
3385     struct sched_param lparam;
3386     struct task_struct *p;
3387    
3388     if (!param || pid < 0)
3389     return -EINVAL;
3390     if (copy_from_user(&lparam, param, sizeof(struct sched_param)))
3391     return -EFAULT;
3392     read_lock_irq(&tasklist_lock);
3393     p = find_process_by_pid(pid);
3394     if (!p) {
3395     read_unlock_irq(&tasklist_lock);
3396     return -ESRCH;
3397     }
3398     retval = sched_setscheduler(p, policy, &lparam);
3399     read_unlock_irq(&tasklist_lock);
3400     return retval;
3401     }
3402    
/**
 * sys_sched_setscheduler - set/change the scheduler policy and RT priority
 * @pid: the pid in question.
 * @policy: new policy.
 * @param: structure containing the new RT priority.
 *
 * Thin syscall wrapper: forwards all arguments to
 * do_sched_setscheduler() and returns its result.
 */
asmlinkage long sys_sched_setscheduler(pid_t pid, int policy,
				       struct sched_param __user *param)
{
	return do_sched_setscheduler(pid, policy, param);
}
3414    
/**
 * sys_sched_setparam - set/change the RT priority of a thread
 * @pid: the pid in question.
 * @param: structure containing the new RT priority.
 *
 * Calls do_sched_setscheduler() with a policy of -1; sched_setscheduler()
 * treats a negative policy as "keep the task's current policy".
 */
asmlinkage long sys_sched_setparam(pid_t pid, struct sched_param __user *param)
{
	return do_sched_setscheduler(pid, -1, param);
}
3424    
3425     /**
3426     * sys_sched_getscheduler - get the policy (scheduling class) of a thread
3427     * @pid: the pid in question.
3428     */
3429     asmlinkage long sys_sched_getscheduler(pid_t pid)
3430     {
3431     int retval = -EINVAL;
3432     task_t *p;
3433    
3434     if (pid < 0)
3435     goto out_nounlock;
3436    
3437     retval = -ESRCH;
3438     read_lock(&tasklist_lock);
3439     p = find_process_by_pid(pid);
3440     if (p) {
3441     retval = security_task_getscheduler(p);
3442     if (!retval)
3443     retval = p->policy;
3444     }
3445     read_unlock(&tasklist_lock);
3446    
3447     out_nounlock:
3448     return retval;
3449     }
3450    
3451     /**
3452     * sys_sched_getscheduler - get the RT priority of a thread
3453     * @pid: the pid in question.
3454     * @param: structure containing the RT priority.
3455     */
3456     asmlinkage long sys_sched_getparam(pid_t pid, struct sched_param __user *param)
3457     {
3458     struct sched_param lp;
3459     int retval = -EINVAL;
3460     task_t *p;
3461    
3462     if (!param || pid < 0)
3463     goto out_nounlock;
3464    
3465     read_lock(&tasklist_lock);
3466     p = find_process_by_pid(pid);
3467     retval = -ESRCH;
3468     if (!p)
3469     goto out_unlock;
3470    
3471     retval = security_task_getscheduler(p);
3472     if (retval)
3473     goto out_unlock;
3474    
3475     lp.sched_priority = p->rt_priority;
3476     read_unlock(&tasklist_lock);
3477    
3478     /*
3479     * This one might sleep, we cannot do it with a spinlock held ...
3480     */
3481     retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
3482    
3483     out_nounlock:
3484     return retval;
3485    
3486     out_unlock:
3487     read_unlock(&tasklist_lock);
3488     return retval;
3489     }
3490    
/*
 * sched_setaffinity - restrict task @pid to the CPUs in @new_mask.
 *
 * The requested mask is first intersected with the CPUs the task's
 * cpuset allows, then applied with set_cpus_allowed().
 *
 * Returns -ESRCH if no task matches @pid, -EPERM if the caller is
 * neither the owner of the task nor has CAP_SYS_NICE, otherwise the
 * result of set_cpus_allowed().
 */
long sched_setaffinity(pid_t pid, cpumask_t new_mask)
{
	task_t *p;
	int retval;
	cpumask_t cpus_allowed;

	lock_cpu_hotplug();
	read_lock(&tasklist_lock);

	p = find_process_by_pid(pid);
	if (!p) {
		read_unlock(&tasklist_lock);
		unlock_cpu_hotplug();
		return -ESRCH;
	}

	/*
	 * It is not safe to call set_cpus_allowed with the
	 * tasklist_lock held.  We will bump the task_struct's
	 * usage count and then drop tasklist_lock.
	 */
	get_task_struct(p);
	read_unlock(&tasklist_lock);

	retval = -EPERM;
	if ((current->euid != p->euid) && (current->euid != p->uid) &&
			!capable(CAP_SYS_NICE))
		goto out_unlock;

	cpus_allowed = cpuset_cpus_allowed(p);
	cpus_and(new_mask, new_mask, cpus_allowed);
	retval = set_cpus_allowed(p, new_mask);

out_unlock:
	/* drop the reference taken above, then the hotplug lock */
	put_task_struct(p);
	unlock_cpu_hotplug();
	return retval;
}
3529    
3530     static inline int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
3531     cpumask_t *new_mask)
3532     {
3533     if (len < sizeof(cpumask_t)) {
3534     memset(new_mask, 0, sizeof(cpumask_t));
3535     } else if (len > sizeof(cpumask_t)) {
3536     len = sizeof(cpumask_t);
3537     }
3538     return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0;
3539     }
3540    
3541     /**
3542     * sys_sched_setaffinity - set the cpu affinity of a process
3543     * @pid: pid of the process
3544     * @len: length in bytes of the bitmask pointed to by user_mask_ptr
3545     * @user_mask_ptr: user-space pointer to the new cpu mask
3546     */
3547     asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len,
3548     unsigned long __user *user_mask_ptr)
3549     {
3550     cpumask_t new_mask;
3551     int retval;
3552    
3553     retval = get_user_cpu_mask(user_mask_ptr, len, &new_mask);
3554     if (retval)
3555     return retval;
3556    
3557     return sched_setaffinity(pid, new_mask);
3558     }
3559    
/*
 * Represents all CPUs present in the system.
 * In systems capable of hotplug, this map could dynamically grow
 * as new CPUs are detected in the system via any platform specific
 * method, such as ACPI.
 */

cpumask_t cpu_present_map;
EXPORT_SYMBOL(cpu_present_map);

/*
 * Without CONFIG_SMP the online and possible maps are defined here and
 * initialised to CPU_MASK_ALL.
 */
#ifndef CONFIG_SMP
cpumask_t cpu_online_map = CPU_MASK_ALL;
cpumask_t cpu_possible_map = CPU_MASK_ALL;
#endif
3574    
3575     long sched_getaffinity(pid_t pid, cpumask_t *mask)
3576     {
3577     int retval;
3578     task_t *p;
3579    
3580     lock_cpu_hotplug();
3581     read_lock(&tasklist_lock);
3582    
3583     retval = -ESRCH;
3584     p = find_process_by_pid(pid);
3585     if (!p)
3586     goto out_unlock;
3587    
3588     retval = 0;
3589     cpus_and(*mask, p->cpus_allowed, cpu_possible_map);
3590    
3591     out_unlock:
3592     read_unlock(&tasklist_lock);
3593     unlock_cpu_hotplug();
3594     if (retval)
3595     return retval;
3596    
3597     return 0;
3598     }
3599    
3600     /**
3601     * sys_sched_getaffinity - get the cpu affinity of a process
3602     * @pid: pid of the process
3603     * @len: length in bytes of the bitmask pointed to by user_mask_ptr
3604     * @user_mask_ptr: user-space pointer to hold the current cpu mask
3605     */
3606     asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len,
3607     unsigned long __user *user_mask_ptr)
3608     {
3609     int ret;
3610     cpumask_t mask;
3611    
3612     if (len < sizeof(cpumask_t))
3613     return -EINVAL;
3614    
3615     ret = sched_getaffinity(pid, &mask);
3616     if (ret < 0)
3617     return ret;
3618    
3619     if (copy_to_user(user_mask_ptr, &mask, sizeof(cpumask_t)))
3620     return -EFAULT;
3621    
3622     return sizeof(cpumask_t);
3623     }
3624    
/**
 * sys_sched_yield - yield the current processor to other threads.
 *
 * For a task that is neither RT nor batch, this function yields the
 * current CPU by dropping the priority of current to MAX_PRIO - 2 and
 * setting the PF_YIELDED flag. RT and batch tasks keep their priority
 * and are only requeued. In all cases the task's slice and time_slice
 * are refreshed before schedule() is called.
 *
 * Always returns 0.
 */
asmlinkage long sys_sched_yield(void)
{
	int newprio;
	runqueue_t *rq = this_rq_lock();

	newprio = current->prio;
	schedstat_inc(rq, yld_cnt);
	current->slice = slice(current);
	current->time_slice = rr_interval(current);
	if (likely(!rt_task(current) && !batch_task(current))) {
		current->flags |= PF_YIELDED;
		newprio = MAX_PRIO - 2;
	}

	/* a priority change needs a full dequeue/enqueue, else just requeue */
	if (newprio != current->prio) {
		dequeue_task(current, rq);
		current->prio = newprio;
		enqueue_task(current, rq);
	} else
		requeue_task(current, rq);

	/*
	 * Since we are going to call schedule() anyway, there's
	 * no need to preempt or enable interrupts:
	 */
	__release(rq->lock);
	_raw_spin_unlock(&rq->lock);
	preempt_enable_no_resched();

	schedule();

	return 0;
}
3663    
/*
 * Calls schedule() with PREEMPT_ACTIVE added to the preempt count, and
 * repeats for as long as need_resched() is still true afterwards.
 */
static inline void __cond_resched(void)
{
	do {
		add_preempt_count(PREEMPT_ACTIVE);
		schedule();
		sub_preempt_count(PREEMPT_ACTIVE);
	} while (need_resched());
}
3672    
3673     int __sched cond_resched(void)
3674     {
3675     if (need_resched()) {
3676     __cond_resched();
3677     return 1;
3678     }
3679     return 0;
3680     }
3681    
3682     EXPORT_SYMBOL(cond_resched);
3683    
/*
 * cond_resched_lock() - if a reschedule is pending, drop the given lock,
 * call schedule, and on return reacquire the lock.
 *
 * This works OK both with and without CONFIG_PREEMPT.  We do strange low-level
 * operations here to prevent schedule() from being called twice (once via
 * spin_unlock(), once by hand).
 *
 * Returns 1 if the lock was dropped and retaken (either because
 * need_lockbreak() asked for it or because a reschedule was pending),
 * 0 if the lock was held throughout.
 */
int cond_resched_lock(spinlock_t * lock)
{
	int ret = 0;

	/* briefly release the lock if need_lockbreak() requests it */
	if (need_lockbreak(lock)) {
		spin_unlock(lock);
		cpu_relax();
		ret = 1;
		spin_lock(lock);
	}
	if (need_resched()) {
		/* open-coded unlock that does not itself reschedule */
		_raw_spin_unlock(lock);
		preempt_enable_no_resched();
		__cond_resched();
		ret = 1;
		spin_lock(lock);
	}
	return ret;
}

EXPORT_SYMBOL(cond_resched_lock);
3713    
3714     int __sched cond_resched_softirq(void)
3715     {
3716     BUG_ON(!in_softirq());
3717    
3718     if (need_resched()) {
3719     __local_bh_enable();
3720     __cond_resched();
3721     local_bh_disable();
3722     return 1;
3723     }
3724     return 0;
3725     }
3726    
3727     EXPORT_SYMBOL(cond_resched_softirq);
3728    
3729    
/**
 * yield - yield the current processor to other threads.
 *
 * This is a shortcut for kernel-space yielding - it marks the
 * thread runnable (TASK_RUNNING) and calls sys_sched_yield().
 */
void __sched yield(void)
{
	set_current_state(TASK_RUNNING);
	sys_sched_yield();
}

EXPORT_SYMBOL(yield);
3743    
/*
 * This task is about to go to sleep on IO.  Increment rq->nr_iowait so
 * that process accounting knows that this is a task in IO wait state.
 * The runqueue pointer is sampled once, before sleeping, and the same
 * counter is decremented again after schedule() returns.
 *
 * NOTE(review): this comment used to add "But don't do that if it is a
 * deliberate, throttling IO wait (this task has set its backing_dev_info:
 * the queue against which it should throttle)".  No such check is visible
 * in this function - nr_iowait is incremented unconditionally.  Confirm
 * whether that exemption is handled elsewhere or the remark is stale.
 */
void __sched io_schedule(void)
{
	struct runqueue *rq = &per_cpu(runqueues, _smp_processor_id());

	atomic_inc(&rq->nr_iowait);
	schedule();
	atomic_dec(&rq->nr_iowait);
}

EXPORT_SYMBOL(io_schedule);
3761    
/*
 * io_schedule_timeout - like io_schedule(), but sleeps with
 * schedule_timeout(@timeout).
 *
 * rq->nr_iowait of the runqueue sampled on entry is incremented around
 * the sleep.  Returns whatever schedule_timeout() returned.
 */
long __sched io_schedule_timeout(long timeout)
{
	struct runqueue *rq = &per_cpu(runqueues, _smp_processor_id());
	long ret;

	atomic_inc(&rq->nr_iowait);
	ret = schedule_timeout(timeout);
	atomic_dec(&rq->nr_iowait);
	return ret;
}
3772    
3773     /**
3774     * sys_sched_get_priority_max - return maximum RT priority.
3775     * @policy: scheduling class.
3776     *
3777     * this syscall returns the maximum rt_priority that can be used
3778     * by a given scheduling class.
3779     */
3780     asmlinkage long sys_sched_get_priority_max(int policy)
3781     {
3782     int ret = -EINVAL;
3783    
3784     switch (policy) {
3785     case SCHED_FIFO:
3786     case SCHED_RR:
3787     ret = MAX_USER_RT_PRIO-1;
3788     break;
3789     case SCHED_NORMAL:
3790     case SCHED_BATCH:
3791     ret = 0;
3792     break;
3793     }
3794     return ret;
3795     }
3796    
3797     /**
3798     * sys_sched_get_priority_min - return minimum RT priority.
3799     * @policy: scheduling class.
3800     *
3801     * this syscall returns the minimum rt_priority that can be used
3802     * by a given scheduling class.
3803     */
3804     asmlinkage long sys_sched_get_priority_min(int policy)
3805     {
3806     int ret = -EINVAL;
3807    
3808     switch (policy) {
3809     case SCHED_FIFO:
3810     case SCHED_RR:
3811     ret = 1;
3812     break;
3813     case SCHED_NORMAL:
3814     case SCHED_BATCH:
3815     ret = 0;
3816     }
3817     return ret;
3818     }
3819    
3820     /**
3821     * sys_sched_rr_get_interval - return the default timeslice of a process.
3822     * @pid: pid of the process.
3823     * @interval: userspace pointer to the timeslice value.
3824     *
3825     * this syscall writes the default timeslice value of a given process
3826     * into the user-space timespec buffer. A value of '0' means infinity.
3827     */
3828     asmlinkage
3829     long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval)
3830     {
3831     int retval = -EINVAL;
3832     struct timespec t;
3833     task_t *p;
3834    
3835     if (pid < 0)
3836     goto out_nounlock;
3837    
3838     retval = -ESRCH;
3839     read_lock(&tasklist_lock);
3840     p = find_process_by_pid(pid);
3841     if (!p)
3842     goto out_unlock;
3843    
3844     retval = security_task_getscheduler(p);
3845     if (retval)
3846     goto out_unlock;
3847    
3848     jiffies_to_timespec(p->policy & SCHED_FIFO ?
3849     0 : slice(p), &t);
3850     read_unlock(&tasklist_lock);
3851     retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
3852     out_nounlock:
3853     return retval;
3854     out_unlock:
3855     read_unlock(&tasklist_lock);
3856     return retval;
3857     }
3858    
3859     static inline struct task_struct *eldest_child(struct task_struct *p)
3860     {
3861     if (list_empty(&p->children)) return NULL;
3862     return list_entry(p->children.next,struct task_struct,sibling);
3863     }
3864    
3865     static inline struct task_struct *older_sibling(struct task_struct *p)
3866     {
3867     if (p->sibling.prev==&p->parent->children) return NULL;
3868     return list_entry(p->sibling.prev,struct task_struct,sibling);
3869     }
3870    
3871     static inline struct task_struct *younger_sibling(struct task_struct *p)
3872     {
3873     if (p->sibling.next==&p->parent->children) return NULL;
3874     return list_entry(p->sibling.next,struct task_struct,sibling);
3875     }
3876    
3877     static inline void show_task(task_t * p)
3878     {
3879     task_t *relative;
3880     unsigned state;
3881     unsigned long free = 0;
3882     static const char *stat_nam[] = { "R", "S", "D", "T", "t", "Z", "X" };
3883    
3884     printk("%-13.13s ", p->comm);
3885     state = p->state ? __ffs(p->state) + 1 : 0;
3886     if (state < ARRAY_SIZE(stat_nam))
3887     printk(stat_nam[state]);
3888     else
3889     printk("?");
3890     #if (BITS_PER_LONG == 32)
3891     if (state == TASK_RUNNING)
3892     printk(" running ");
3893     else
3894     printk(" %08lX ", thread_saved_pc(p));
3895     #else
3896     if (state == TASK_RUNNING)
3897     printk(" running task ");
3898     else
3899     printk(" %016lx ", thread_saved_pc(p));
3900     #endif
3901     #ifdef CONFIG_DEBUG_STACK_USAGE
3902     {
3903     unsigned long * n = (unsigned long *) (p->thread_info+1);
3904     while (!*n)
3905     n++;
3906     free = (unsigned long) n - (unsigned long)(p->thread_info+1);
3907     }
3908     #endif
3909     printk("%5lu %5d %6d ", free, p->pid, p->parent->pid);
3910     if ((relative = eldest_child(p)))
3911     printk("%5d ", relative->pid);
3912     else
3913     printk(" ");
3914     if ((relative = younger_sibling(p)))
3915     printk("%7d", relative->pid);
3916     else
3917     printk(" ");
3918     if ((relative = older_sibling(p)))
3919     printk(" %5d", relative->pid);
3920     else
3921     printk(" ");
3922     if (!p->mm)
3923     printk(" (L-TLB)\n");
3924     else
3925     printk(" (NOTLB)\n");
3926    
3927     if (state != TASK_RUNNING)
3928     show_stack(p, NULL);
3929     }
3930    
/*
 * show_state - print a table header followed by one show_task() entry
 * for every thread in the system.
 *
 * The thread list is walked with tasklist_lock held for reading.
 */
void show_state(void)
{
	task_t *g, *p;

	/* the header layout is selected by word size, like show_task()'s PC column */
#if (BITS_PER_LONG == 32)
	printk("\n"
	       " sibling\n");
	printk(" task PC pid father child younger older\n");
#else
	printk("\n"
	       " sibling\n");
	printk(" task PC pid father child younger older\n");
#endif
	read_lock(&tasklist_lock);
	do_each_thread(g, p) {
		/*
		 * reset the NMI-timeout, listing all tasks on a slow
		 * console might take a lot of time:
		 */
		touch_nmi_watchdog();
		show_task(p);
	} while_each_thread(g, p);

	read_unlock(&tasklist_lock);
}
3956    
/*
 * init_idle - set up @idle as the idle task of @cpu.
 *
 * Gives the task priority MAX_PRIO, marks it TASK_RUNNING, restricts it
 * to @cpu, installs it as both ->curr and ->idle of that cpu's runqueue
 * (under rq->lock) with need_resched set, and finally initialises its
 * preempt count.
 */
void __devinit init_idle(task_t *idle, int cpu)
{
	runqueue_t *rq = cpu_rq(cpu);
	unsigned long flags;

	idle->prio = MAX_PRIO;
	idle->state = TASK_RUNNING;
	idle->cpus_allowed = cpumask_of_cpu(cpu);
	set_task_cpu(idle, cpu);

	spin_lock_irqsave(&rq->lock, flags);
	rq->curr = rq->idle = idle;
	set_tsk_need_resched(idle);
	spin_unlock_irqrestore(&rq->lock, flags);

	/* Set the preempt count _outside_ the spinlocks! */
#if defined(CONFIG_PREEMPT) && !defined(CONFIG_PREEMPT_BKL)
	/* 1 if the task has a non-negative lock_depth, else 0 */
	idle->thread_info->preempt_count = (idle->lock_depth >= 0);
#else
	idle->thread_info->preempt_count = 0;
#endif
}
3979    
/*
 * In a system that switches off the HZ timer, nohz_cpu_mask
 * indicates which cpus have entered this state. This is used
 * in the rcu update to wait only for active cpus. For systems
 * which do not switch off the HZ timer, nohz_cpu_mask should
 * always be CPU_MASK_NONE.
 */
cpumask_t nohz_cpu_mask = CPU_MASK_NONE;
3988    
3989     #ifdef CONFIG_SMP
3990     /*
3991     * This is how migration works:
3992     *
3993     * 1) we queue a migration_req_t structure in the source CPU's
3994     * runqueue and wake up that CPU's migration thread.
3995     * 2) we down() the locked semaphore => thread blocks.
3996     * 3) migration thread wakes up (implicitly it forces the migrated
3997     * thread off the CPU)
3998     * 4) it gets the migration request and checks whether the migrated
3999     * task is still in the wrong runqueue.
4000     * 5) if it's in the wrong runqueue then the migration thread removes
4001     * it and puts it into the right queue.
4002     * 6) migration thread up()s the semaphore.
4003     * 7) we wake up and the migration is done.
4004     */
4005    
4006     /*
4007     * Change a given task's CPU affinity. Migrate the thread to a
4008     * proper CPU and schedule it away if the CPU it's executing on
4009     * is removed from the allowed bitmask.
4010     *
4011     * NOTE: the caller must have a valid reference to the task, the
4012     * task must not exit() & deallocate itself prematurely. The
4013     * call is not atomic; no spinlocks may be held.
4014     */
4015     int set_cpus_allowed(task_t *p, cpumask_t new_mask)
4016     {
4017     unsigned long flags;
4018     int ret = 0;
4019     migration_req_t req;
4020     runqueue_t *rq;
4021    
4022     rq = task_rq_lock(p, &flags);
4023     if (!cpus_intersects(new_mask, cpu_online_map)) {
4024     ret = -EINVAL;
4025     goto out;
4026     }
4027    
4028     p->cpus_allowed = new_mask;
4029     /* Can the task run on the task's current CPU? If so, we're done */
4030     if (cpu_isset(task_cpu(p), new_mask))
4031     goto out;
4032    
4033     if (migrate_task(p, any_online_cpu(new_mask), &req)) {
4034     /* Need help from migration thread: drop lock and wait. */
4035     task_rq_unlock(rq, &flags);
4036     wake_up_process(rq->migration_thread);
4037     wait_for_completion(&req.done);
4038     tlb_migrate_finish(p->mm);
4039     return 0;
4040     }
4041     out:
4042     task_rq_unlock(rq, &flags);
4043     return ret;
4044     }
4045    
4046     EXPORT_SYMBOL_GPL(set_cpus_allowed);
4047    
4048     /*
4049     * Move (not current) task off this cpu, onto dest cpu. We're doing
4050     * this because either it can't run here any more (set_cpus_allowed()
4051     * away from this CPU, or CPU going down), or because we're
4052     * attempting to rebalance this task on exec (sched_exec).
4053     *
4054     * So we race with normal scheduler movements, but that's OK, as long
4055     * as the task is no longer on this CPU.
4056     */
4057     static void __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
4058     {
4059     runqueue_t *rq_dest, *rq_src;
4060    
4061     if (unlikely(cpu_is_offline(dest_cpu)))
4062     return;
4063    
4064     rq_src = cpu_rq(src_cpu);
4065     rq_dest = cpu_rq(dest_cpu);
4066    
4067     double_rq_lock(rq_src, rq_dest);
4068     /* Already moved. */
4069     if (task_cpu(p) != src_cpu)
4070     goto out;
4071     /* Affinity changed (again). */
4072     if (!cpu_isset(dest_cpu, p->cpus_allowed))
4073     goto out;
4074    
4075     set_task_cpu(p, dest_cpu);
4076     if (task_queued(p)) {
4077     /*
4078     * Sync timestamp with rq_dest's before activating.
4079     * The same thing could be achieved by doing this step
4080     * afterwards, and pretending it was a local activate.
4081     * This way is cleaner and logically correct.
4082     */
4083     p->timestamp = p->timestamp - rq_src->timestamp_last_tick
4084     + rq_dest->timestamp_last_tick;
4085     deactivate_task(p, rq_src);
4086     activate_task(p, rq_dest, 0);
4087     preempt(p, rq_dest);
4088     }
4089    
4090     out:
4091     double_rq_unlock(rq_src, rq_dest);
4092     }
4093    
4094     /*
4095     * migration_thread - this is a highprio system thread that performs
4096     * thread migration by bumping thread off CPU then 'pushing' onto
4097     * another runqueue.
4098     */
4099     static int migration_thread(void * data)
4100     {
4101     runqueue_t *rq;
4102     int cpu = (long)data;
4103    
4104     rq = cpu_rq(cpu);
4105     BUG_ON(rq->migration_thread != current);
4106    
4107     set_current_state(TASK_INTERRUPTIBLE);
4108     while (!kthread_should_stop()) {
4109     struct list_head *head;
4110     migration_req_t *req;
4111    
4112     if (current->flags & PF_FREEZE)
4113     refrigerator(PF_FREEZE);
4114    
4115     spin_lock_irq(&rq->lock);
4116    
4117     if (cpu_is_offline(cpu)) {
4118     spin_unlock_irq(&rq->lock);
4119     goto wait_to_die;
4120     }
4121    
4122     if (rq->active_balance) {
4123     active_load_balance(rq, cpu);
4124     rq->active_balance = 0;
4125     }
4126    
4127     head = &rq->migration_queue;
4128    
4129     if (list_empty(head)) {
4130     spin_unlock_irq(&rq->lock);
4131     schedule();
4132     set_current_state(TASK_INTERRUPTIBLE);
4133     continue;
4134     }
4135     req = list_entry(head->next, migration_req_t, list);
4136     list_del_init(head->next);
4137    
4138     if (req->type == REQ_MOVE_TASK) {
4139     spin_unlock(&rq->lock);
4140     __migrate_task(req->task, cpu, req->dest_cpu);
4141     local_irq_enable();
4142     } else if (req->type == REQ_SET_DOMAIN) {
4143     rq->sd = req->sd;
4144     spin_unlock_irq(&rq->lock);
4145     } else {
4146     spin_unlock_irq(&rq->lock);
4147     WARN_ON(1);
4148     }
4149    
4150     complete(&req->done);
4151     }
4152     __set_current_state(TASK_RUNNING);
4153     return 0;
4154    
4155     wait_to_die:
4156     /* Wait for kthread_stop */
4157     set_current_state(TASK_INTERRUPTIBLE);
4158     while (!kthread_should_stop()) {
4159     schedule();
4160     set_current_state(TASK_INTERRUPTIBLE);
4161     }
4162     __set_current_state(TASK_RUNNING);
4163     return 0;
4164     }
4165    
4166     #ifdef CONFIG_HOTPLUG_CPU
4167     /* Figure out where task on dead CPU should go, use force if neccessary. */
4168     static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *tsk)
4169     {
4170     int dest_cpu;
4171     cpumask_t mask;
4172    
4173     /* On same node? */
4174     mask = node_to_cpumask(cpu_to_node(dead_cpu));
4175     cpus_and(mask, mask, tsk->cpus_allowed);
4176     dest_cpu = any_online_cpu(mask);
4177    
4178     /* On any allowed CPU? */
4179     if (dest_cpu == NR_CPUS)
4180     dest_cpu = any_online_cpu(tsk->cpus_allowed);
4181    
4182     /* No more Mr. Nice Guy. */
4183     if (dest_cpu == NR_CPUS) {
4184     cpus_setall(tsk->cpus_allowed);
4185     dest_cpu = any_online_cpu(tsk->cpus_allowed);
4186    
4187     /*
4188     * Don't tell them about moving exiting tasks or
4189     * kernel threads (both mm NULL), since they never
4190     * leave kernel.
4191     */
4192     if (tsk->mm && printk_ratelimit())
4193     printk(KERN_INFO "process %d (%s) no "
4194     "longer affine to cpu%d\n",
4195     tsk->pid, tsk->comm, dead_cpu);
4196     }
4197     __migrate_task(tsk, dead_cpu, dest_cpu);
4198     }
4199    
4200     /*
4201     * While a dead CPU has no uninterruptible tasks queued at this point,
4202     * it might still have a nonzero ->nr_uninterruptible counter, because
4203     * for performance reasons the counter is not stricly tracking tasks to
4204     * their home CPUs. So we just add the counter to another CPU's counter,
4205     * to keep the global sum constant after CPU-down:
4206     */
4207     static void migrate_nr_uninterruptible(runqueue_t *rq_src)
4208     {
4209     runqueue_t *rq_dest = cpu_rq(any_online_cpu(CPU_MASK_ALL));
4210     unsigned long flags;
4211    
4212     local_irq_save(flags);
4213     double_rq_lock(rq_src, rq_dest);
4214     rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible;
4215     rq_src->nr_uninterruptible = 0;
4216     double_rq_unlock(rq_src, rq_dest);
4217     local_irq_restore(flags);
4218     }
4219    
4220     /* Run through task list and migrate tasks from the dead cpu. */
4221     static void migrate_live_tasks(int src_cpu)
4222     {
4223     struct task_struct *tsk, *t;
4224    
4225     write_lock_irq(&tasklist_lock);
4226    
4227     do_each_thread(t, tsk) {
4228     if (tsk == current)
4229     continue;
4230    
4231     if (task_cpu(tsk) == src_cpu)
4232     move_task_off_dead_cpu(src_cpu, tsk);
4233     } while_each_thread(t, tsk);
4234    
4235     write_unlock_irq(&tasklist_lock);
4236     }
4237    
4238     /* Schedules idle task to be the next runnable task on current CPU.
4239     * It does so by boosting its priority to highest possible and adding it to
4240     * the _front_ of runqueue. Used by CPU offline code.
4241     */
4242     void sched_idle_next(void)
4243     {
4244     int cpu = smp_processor_id();
4245     runqueue_t *rq = this_rq();
4246     struct task_struct *p = rq->idle;
4247     unsigned long flags;
4248    
4249     /* cpu has to be offline */
4250     BUG_ON(cpu_online(cpu));
4251    
4252     /* Strictly not necessary since rest of the CPUs are stopped by now
4253     * and interrupts disabled on current cpu.
4254     */
4255     spin_lock_irqsave(&rq->lock, flags);
4256    
4257     __setscheduler(p, SCHED_FIFO, MAX_RT_PRIO-1);
4258     /* Add idle task to _front_ of it's priority queue */
4259     __activate_idle_task(p, rq);
4260    
4261     spin_unlock_irqrestore(&rq->lock, flags);
4262     }
4263    
4264     /* Ensures that the idle task is using init_mm right before its cpu goes
4265     * offline.
4266     */
4267     void idle_task_exit(void)
4268     {
4269     struct mm_struct *mm = current->active_mm;
4270    
4271     BUG_ON(cpu_online(smp_processor_id()));
4272    
4273     if (mm != &init_mm)
4274     switch_mm(mm, &init_mm, current);
4275     mmdrop(mm);
4276     }
4277    
4278     static void migrate_dead(unsigned int dead_cpu, task_t *tsk)
4279     {
4280     struct runqueue *rq = cpu_rq(dead_cpu);
4281    
4282     /* Must be exiting, otherwise would be on tasklist. */
4283     BUG_ON(tsk->exit_state != EXIT_ZOMBIE && tsk->exit_state != EXIT_DEAD);
4284    
4285     /* Cannot have done final schedule yet: would have vanished. */
4286     BUG_ON(tsk->flags & PF_DEAD);
4287    
4288     get_task_struct(tsk);
4289    
4290     /*
4291     * Drop lock around migration; if someone else moves it,
4292     * that's OK. No task can be added to this CPU, so iteration is
4293     * fine.
4294     */
4295     spin_unlock_irq(&rq->lock);
4296     move_task_off_dead_cpu(dead_cpu, tsk);
4297     spin_lock_irq(&rq->lock);
4298    
4299     put_task_struct(tsk);
4300     }
4301    
4302     /* release_task() removes task from tasklist, so we won't find dead tasks. */
4303     static void migrate_dead_tasks(unsigned int dead_cpu)
4304     {
4305     unsigned arr, i;
4306     struct runqueue *rq = cpu_rq(dead_cpu);
4307    
4308     for (arr = 0; arr < 2; arr++) {
4309     for (i = 0; i < MAX_PRIO; i++) {
4310     struct list_head *list = &rq->queue[i];
4311     while (!list_empty(list))
4312     migrate_dead(dead_cpu,
4313     list_entry(list->next, task_t,
4314     run_list));
4315     }
4316     }
4317     }
4318     #endif /* CONFIG_HOTPLUG_CPU */
4319    
4320     /*
4321     * migration_call - callback that gets triggered when a CPU is added.
4322     * Here we can start up the necessary migration thread for the new CPU.
4323     */
4324     static int migration_call(struct notifier_block *nfb, unsigned long action,
4325     void *hcpu)
4326     {
4327     int cpu = (long)hcpu;
4328     struct task_struct *p;
4329     struct runqueue *rq;
4330     unsigned long flags;
4331    
4332     switch (action) {
4333     case CPU_UP_PREPARE:
4334     p = kthread_create(migration_thread, hcpu, "migration/%d",cpu);
4335     if (IS_ERR(p))
4336     return NOTIFY_BAD;
4337     p->flags |= PF_NOFREEZE;
4338     kthread_bind(p, cpu);
4339     /* Must be high prio: stop_machine expects to yield to it. */
4340     rq = task_rq_lock(p, &flags);
4341     __setscheduler(p, SCHED_FIFO, MAX_RT_PRIO-1);
4342     task_rq_unlock(rq, &flags);
4343     cpu_rq(cpu)->migration_thread = p;
4344     break;
4345     case CPU_ONLINE:
4346     /* Strictly unneccessary, as first user will wake it. */
4347     wake_up_process(cpu_rq(cpu)->migration_thread);
4348     break;
4349     #ifdef CONFIG_HOTPLUG_CPU
4350     case CPU_UP_CANCELED:
4351     /* Unbind it from offline cpu so it can run. Fall thru. */
4352     kthread_bind(cpu_rq(cpu)->migration_thread,smp_processor_id());
4353     kthread_stop(cpu_rq(cpu)->migration_thread);
4354     cpu_rq(cpu)->migration_thread = NULL;
4355     break;
4356     case CPU_DEAD:
4357     migrate_live_tasks(cpu);
4358     rq = cpu_rq(cpu);
4359     kthread_stop(rq->migration_thread);
4360     rq->migration_thread = NULL;
4361     /* Idle task back to normal (off runqueue, low prio) */
4362     rq = task_rq_lock(rq->idle, &flags);
4363     deactivate_task(rq->idle, rq);
4364     rq->idle->static_prio = MAX_PRIO;
4365     __setscheduler(rq->idle, SCHED_NORMAL, 0);
4366     migrate_dead_tasks(cpu);
4367     task_rq_unlock(rq, &flags);
4368     migrate_nr_uninterruptible(rq);
4369     BUG_ON(rq->nr_running != 0);
4370    
4371     /* No need to migrate the tasks: it was best-effort if
4372     * they didn't do lock_cpu_hotplug(). Just wake up
4373     * the requestors. */
4374     spin_lock_irq(&rq->lock);
4375     while (!list_empty(&rq->migration_queue)) {
4376     migration_req_t *req;
4377     req = list_entry(rq->migration_queue.next,
4378     migration_req_t, list);
4379     BUG_ON(req->type != REQ_MOVE_TASK);
4380     list_del_init(&req->list);
4381     complete(&req->done);
4382     }
4383     spin_unlock_irq(&rq->lock);
4384     break;
4385     #endif
4386     }
4387     return NOTIFY_OK;
4388     }
4389    
4390     /* Register at highest priority so that task migration (migrate_all_tasks)
4391     * happens before everything else.
4392     */
4393     static struct notifier_block __devinitdata migration_notifier = {
4394     .notifier_call = migration_call,
4395     .priority = 10
4396     };
4397    
4398     int __init migration_init(void)
4399     {
4400     void *cpu = (void *)(long)smp_processor_id();
4401     /* Start one for boot CPU. */
4402     migration_call(&migration_notifier, CPU_UP_PREPARE, cpu);
4403     migration_call(&migration_notifier, CPU_ONLINE, cpu);
4404     register_cpu_notifier(&migration_notifier);
4405     return 0;
4406     }
4407     #endif
4408    
4409     #ifdef CONFIG_SMP
4410     #define SCHED_DOMAIN_DEBUG
4411     #ifdef SCHED_DOMAIN_DEBUG
4412     static void sched_domain_debug(struct sched_domain *sd, int cpu)
4413     {
4414     int level = 0;
4415    
4416     printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
4417    
4418     do {
4419     int i;
4420     char str[NR_CPUS];
4421     struct sched_group *group = sd->groups;
4422     cpumask_t groupmask;
4423    
4424     cpumask_scnprintf(str, NR_CPUS, sd->span);
4425     cpus_clear(groupmask);
4426    
4427     printk(KERN_DEBUG);
4428     for (i = 0; i < level + 1; i++)
4429     printk(" ");
4430     printk("domain %d: ", level);
4431    
4432     if (!(sd->flags & SD_LOAD_BALANCE)) {
4433     printk("does not load-balance\n");
4434     if (sd->parent)
4435     printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain has parent");
4436     break;
4437     }
4438    
4439     printk("span %s\n", str);
4440    
4441     if (!cpu_isset(cpu, sd->span))
4442     printk(KERN_ERR "ERROR: domain->span does not contain CPU%d\n", cpu);
4443     if (!cpu_isset(cpu, group->cpumask))
4444     printk(KERN_ERR "ERROR: domain->groups does not contain CPU%d\n", cpu);
4445    
4446     printk(KERN_DEBUG);
4447     for (i = 0; i < level + 2; i++)
4448     printk(" ");
4449     printk("groups:");
4450     do {
4451     if (!group) {
4452     printk("\n");
4453     printk(KERN_ERR "ERROR: group is NULL\n");
4454     break;
4455     }
4456    
4457     if (!group->cpu_power) {
4458     printk("\n");
4459     printk(KERN_ERR "ERROR: domain->cpu_power not set\n");
4460     }
4461    
4462     if (!cpus_weight(group->cpumask)) {
4463     printk("\n");
4464     printk(KERN_ERR "ERROR: empty group\n");
4465     }
4466    
4467     if (cpus_intersects(groupmask, group->cpumask)) {
4468     printk("\n");
4469     printk(KERN_ERR "ERROR: repeated CPUs\n");
4470     }
4471    
4472     cpus_or(groupmask, groupmask, group->cpumask);
4473    
4474     cpumask_scnprintf(str, NR_CPUS, group->cpumask);
4475     printk(" %s", str);
4476    
4477     group = group->next;
4478     } while (group != sd->groups);
4479     printk("\n");
4480    
4481     if (!cpus_equal(sd->span, groupmask))
4482     printk(KERN_ERR "ERROR: groups don't span domain->span\n");
4483    
4484     level++;
4485     sd = sd->parent;
4486    
4487     if (sd) {
4488     if (!cpus_subset(groupmask, sd->span))
4489     printk(KERN_ERR "ERROR: parent span is not a superset of domain->span\n");
4490     }
4491    
4492     } while (sd);
4493     }
4494     #else
4495     #define sched_domain_debug(sd, cpu) {}
4496     #endif
4497    
4498     /*
4499     * Attach the domain 'sd' to 'cpu' as its base domain. Callers must
4500     * hold the hotplug lock.
4501     */
4502     void __devinit cpu_attach_domain(struct sched_domain *sd, int cpu)
4503     {
4504     migration_req_t req;
4505     unsigned long flags;
4506     runqueue_t *rq = cpu_rq(cpu);
4507     int local = 1;
4508    
4509     sched_domain_debug(sd, cpu);
4510    
4511     spin_lock_irqsave(&rq->lock, flags);
4512    
4513     if (cpu == smp_processor_id() || !cpu_online(cpu)) {
4514     rq->sd = sd;
4515     } else {
4516     init_completion(&req.done);
4517     req.type = REQ_SET_DOMAIN;
4518     req.sd = sd;
4519     list_add(&req.list, &rq->migration_queue);
4520     local = 0;
4521     }
4522    
4523     spin_unlock_irqrestore(&rq->lock, flags);
4524    
4525     if (!local) {
4526     wake_up_process(rq->migration_thread);
4527     wait_for_completion(&req.done);
4528     }
4529     }
4530    
4531     /* cpus with isolated domains */
4532     cpumask_t __devinitdata cpu_isolated_map = CPU_MASK_NONE;
4533    
4534     /* Setup the mask of cpus configured for isolated domains */
4535     static int __init isolated_cpu_setup(char *str)
4536     {
4537     int ints[NR_CPUS], i;
4538    
4539     str = get_options(str, ARRAY_SIZE(ints), ints);
4540     cpus_clear(cpu_isolated_map);
4541     for (i = 1; i <= ints[0]; i++)
4542     if (ints[i] < NR_CPUS)
4543     cpu_set(ints[i], cpu_isolated_map);
4544     return 1;
4545     }
4546    
4547     __setup ("isolcpus=", isolated_cpu_setup);
4548    
4549     /*
4550     * init_sched_build_groups takes an array of groups, the cpumask we wish
4551     * to span, and a pointer to a function which identifies what group a CPU
4552     * belongs to. The return value of group_fn must be a valid index into the
4553     * groups[] array, and must be >= 0 and < NR_CPUS (due to the fact that we
4554     * keep track of groups covered with a cpumask_t).
4555     *
4556     * init_sched_build_groups will build a circular linked list of the groups
4557     * covered by the given span, and will set each group's ->cpumask correctly,
4558     * and ->cpu_power to 0.
4559     */
4560     void __devinit init_sched_build_groups(struct sched_group groups[],
4561     cpumask_t span, int (*group_fn)(int cpu))
4562     {
4563     struct sched_group *first = NULL, *last = NULL;
4564     cpumask_t covered = CPU_MASK_NONE;
4565     int i;
4566    
4567     for_each_cpu_mask(i, span) {
4568     int group = group_fn(i);
4569     struct sched_group *sg = &groups[group];
4570     int j;
4571    
4572     if (cpu_isset(i, covered))
4573     continue;
4574    
4575     sg->cpumask = CPU_MASK_NONE;
4576     sg->cpu_power = 0;
4577    
4578     for_each_cpu_mask(j, span) {
4579     if (group_fn(j) != group)
4580     continue;
4581    
4582     cpu_set(j, covered);
4583     cpu_set(j, sg->cpumask);
4584     }
4585     if (!first)
4586     first = sg;
4587     if (last)
4588     last->next = sg;
4589     last = sg;
4590     }
4591     last->next = first;
4592     }
4593    
4594    
4595     #ifdef ARCH_HAS_SCHED_DOMAIN
4596     extern void __devinit arch_init_sched_domains(void);
4597     extern void __devinit arch_destroy_sched_domains(void);
4598     #else
4599     #ifdef CONFIG_SCHED_SMT
/* Per-CPU sibling-level domains and the group array backing them. */
static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
static struct sched_group sched_group_cpus[NR_CPUS];
/* Group index of @cpu at the sibling level: every CPU is its own group. */
static int __devinit cpu_to_cpu_group(int cpu)
{
	return cpu;
}
4606     #endif
4607    
/* Per-CPU physical-level domains and the group array backing them. */
static DEFINE_PER_CPU(struct sched_domain, phys_domains);
static struct sched_group sched_group_phys[NR_CPUS];
/*
 * Group index of @cpu at the physical level: with CONFIG_SCHED_SMT the
 * first CPU in its sibling map, otherwise the CPU itself.
 */
static int __devinit cpu_to_phys_group(int cpu)
{
#ifdef CONFIG_SCHED_SMT
	return first_cpu(cpu_sibling_map[cpu]);
#else
	return cpu;
#endif
}
4618    
4619     #ifdef CONFIG_NUMA
4620    
/* Per-CPU node-level domains and the group array backing them. */
static DEFINE_PER_CPU(struct sched_domain, node_domains);
static struct sched_group sched_group_nodes[MAX_NUMNODES];
/* Group index of @cpu at the node level: the node the CPU belongs to. */
static int __devinit cpu_to_node_group(int cpu)
{
	return cpu_to_node(cpu);
}
4627     #endif
4628    
4629     #if defined(CONFIG_SCHED_SMT) && defined(CONFIG_NUMA)
4630     /*
4631     * The domains setup code relies on siblings not spanning
4632     * multiple nodes. Make sure the architecture has a proper
4633     * siblings map:
4634     */
4635     static void check_sibling_maps(void)
4636     {
4637     int i, j;
4638    
4639     for_each_online_cpu(i) {
4640     for_each_cpu_mask(j, cpu_sibling_map[i]) {
4641     if (cpu_to_node(i) != cpu_to_node(j)) {
4642     printk(KERN_INFO "warning: CPU %d siblings map "
4643     "to different node - isolating "
4644     "them.\n", i);
4645     cpu_sibling_map[i] = cpumask_of_cpu(i);
4646     break;
4647     }
4648     }
4649     }
4650     }
4651     #endif
4652    
4653     /*
4654     * Set up scheduler domains and groups. Callers must hold the hotplug lock.
4655     */
4656     static void __devinit arch_init_sched_domains(void)
4657     {
4658     int i;
4659     cpumask_t cpu_default_map;
4660    
4661     #if defined(CONFIG_SCHED_SMT) && defined(CONFIG_NUMA)
4662     check_sibling_maps();
4663     #endif
4664     /*
4665     * Setup mask for cpus without special case scheduling requirements.
4666     * For now this just excludes isolated cpus, but could be used to
4667     * exclude other special cases in the future.
4668     */
4669     cpus_complement(cpu_default_map, cpu_isolated_map);
4670     cpus_and(cpu_default_map, cpu_default_map, cpu_online_map);
4671    
4672     /*
4673     * Set up domains. Isolated domains just stay on the dummy domain.
4674     */
4675     for_each_cpu_mask(i, cpu_default_map) {
4676     int group;
4677     struct sched_domain *sd = NULL, *p;
4678     cpumask_t nodemask = node_to_cpumask(cpu_to_node(i));
4679    
4680     cpus_and(nodemask, nodemask, cpu_default_map);
4681    
4682     #ifdef CONFIG_NUMA
4683     sd = &per_cpu(node_domains, i);
4684     group = cpu_to_node_group(i);
4685     *sd = SD_NODE_INIT;
4686     sd->span = cpu_default_map;
4687     sd->groups = &sched_group_nodes[group];
4688     #endif
4689    
4690     p = sd;
4691     sd = &per_cpu(phys_domains, i);
4692     group = cpu_to_phys_group(i);
4693     *sd = SD_CPU_INIT;
4694     sd->span = nodemask;
4695     sd->parent = p;
4696     sd->groups = &sched_group_phys[group];
4697    
4698     #ifdef CONFIG_SCHED_SMT
4699     p = sd;
4700     sd = &per_cpu(cpu_domains, i);
4701     group = cpu_to_cpu_group(i);
4702     *sd = SD_SIBLING_INIT;
4703     sd->span = cpu_sibling_map[i];
4704     cpus_and(sd->span, sd->span, cpu_default_map);
4705     sd->parent = p;
4706     sd->groups = &sched_group_cpus[group];
4707     #endif
4708     }
4709    
4710     #ifdef CONFIG_SCHED_SMT
4711     /* Set up CPU (sibling) groups */
4712     for_each_online_cpu(i) {
4713     cpumask_t this_sibling_map = cpu_sibling_map[i];
4714     cpus_and(this_sibling_map, this_sibling_map, cpu_default_map);
4715     if (i != first_cpu(this_sibling_map))
4716     continue;
4717    
4718     init_sched_build_groups(sched_group_cpus, this_sibling_map,
4719     &cpu_to_cpu_group);
4720     }
4721     #endif
4722    
4723     /* Set up physical groups */
4724     for (i = 0; i < MAX_NUMNODES; i++) {
4725     cpumask_t nodemask = node_to_cpumask(i);
4726    
4727     cpus_and(nodemask, nodemask, cpu_default_map);
4728     if (cpus_empty(nodemask))
4729     continue;
4730    
4731     init_sched_build_groups(sched_group_phys, nodemask,
4732     &cpu_to_phys_group);
4733     }
4734    
4735     #ifdef CONFIG_NUMA
4736     /* Set up node groups */
4737     init_sched_build_groups(sched_group_nodes, cpu_default_map,
4738     &cpu_to_node_group);
4739     #endif
4740    
4741     /* Calculate CPU power for physical packages and nodes */
4742     for_each_cpu_mask(i, cpu_default_map) {
4743     int power;
4744     struct sched_domain *sd;
4745     #ifdef CONFIG_SCHED_SMT
4746     sd = &per_cpu(cpu_domains, i);
4747     power = SCHED_LOAD_SCALE;
4748     sd->groups->cpu_power = power;
4749     #endif
4750    
4751     sd = &per_cpu(phys_domains, i);
4752     power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE *
4753     (cpus_weight(sd->groups->cpumask)-1) / 10;
4754     sd->groups->cpu_power = power;
4755    
4756     #ifdef CONFIG_NUMA
4757     if (i == first_cpu(sd->groups->cpumask)) {
4758     /* Only add "power" once for each physical package. */
4759     sd = &per_cpu(node_domains, i);
4760     sd->groups->cpu_power += power;
4761     }
4762     #endif
4763     }
4764    
4765     /* Attach the domains */
4766     for_each_online_cpu(i) {
4767     struct sched_domain *sd;
4768     #ifdef CONFIG_SCHED_SMT
4769     sd = &per_cpu(cpu_domains, i);
4770     #else
4771     sd = &per_cpu(phys_domains, i);
4772     #endif
4773     cpu_attach_domain(sd, i);
4774     }
4775     }
4776    
4777     #ifdef CONFIG_HOTPLUG_CPU
/*
 * Counterpart of arch_init_sched_domains(); called from
 * update_sched_domains() after all CPUs were moved to the dummy domain.
 */
static void __devinit arch_destroy_sched_domains(void)
{
	/* Do nothing: everything is statically allocated. */
}
4782     #endif
4783    
4784     #endif /* ARCH_HAS_SCHED_DOMAIN */
4785    
/*
 * Initial dummy domain for early boot and for hotplug cpu.  Being static,
 * it is initialized to zero, so all balancing flags are cleared, which is
 * what we want.  sched_init() points every runqueue's ->sd at it.
 */
static struct sched_domain sched_domain_dummy;
4792    
4793     #ifdef CONFIG_HOTPLUG_CPU
4794     /*
4795     * Force a reinitialization of the sched domains hierarchy. The domains
4796     * and groups cannot be updated in place without racing with the balancing
4797     * code, so we temporarily attach all running cpus to a "dummy" domain
4798     * which will prevent rebalancing while the sched domains are recalculated.
4799     */
4800     static int update_sched_domains(struct notifier_block *nfb,
4801     unsigned long action, void *hcpu)
4802     {
4803     int i;
4804    
4805     switch (action) {
4806     case CPU_UP_PREPARE:
4807     case CPU_DOWN_PREPARE:
4808     for_each_online_cpu(i)
4809     cpu_attach_domain(&sched_domain_dummy, i);
4810     arch_destroy_sched_domains();
4811     return NOTIFY_OK;
4812    
4813     case CPU_UP_CANCELED:
4814     case CPU_DOWN_FAILED:
4815     case CPU_ONLINE:
4816     case CPU_DEAD:
4817     /*
4818     * Fall through and re-initialise the domains.
4819     */
4820     break;
4821     default:
4822     return NOTIFY_DONE;
4823     }
4824    
4825     /* The hotplug lock is already held by cpu_up/cpu_down */
4826     arch_init_sched_domains();
4827    
4828     return NOTIFY_OK;
4829     }
4830     #endif
4831    
/*
 * sched_init_smp - build the scheduler domains under the hotplug lock,
 * then register update_sched_domains() as a hotplug notifier.
 */
void __init sched_init_smp(void)
{
	lock_cpu_hotplug();
	arch_init_sched_domains();
	unlock_cpu_hotplug();
	/* XXX: Theoretical race here - CPU may be hotplugged now */
	hotcpu_notifier(update_sched_domains, 0);
}
4840     #else
/* Without CONFIG_SMP there are no scheduler domains to set up. */
void __init sched_init_smp(void)
{
}
4844     #endif /* CONFIG_SMP */
4845    
/*
 * in_sched_functions - is @addr inside scheduler or lock code?
 *
 * Returns nonzero if in_lock_functions() accepts @addr or if it lies in
 * the [__sched_text_start, __sched_text_end) range, 0 otherwise.
 */
int in_sched_functions(unsigned long addr)
{
	/* Linker adds these: start and end of __sched functions */
	extern char __sched_text_start[], __sched_text_end[];

	if (in_lock_functions(addr))
		return 1;

	return addr >= (unsigned long)__sched_text_start &&
	       addr < (unsigned long)__sched_text_end;
}
4854    
4855     void __init sched_init(void)
4856     {
4857     runqueue_t *rq;
4858     int i, j;
4859    
4860     for (i = 0; i < NR_CPUS; i++) {
4861    
4862     rq = cpu_rq(i);
4863     spin_lock_init(&rq->lock);
4864     rq->cache_ticks = 0;
4865     rq->preempted = 0;
4866    
4867     #ifdef CONFIG_SMP
4868     rq->sd = &sched_domain_dummy;
4869     rq->cpu_load = 0;
4870     rq->active_balance = 0;
4871     rq->push_cpu = 0;
4872     rq->migration_thread = NULL;
4873     INIT_LIST_HEAD(&rq->migration_queue);
4874     #endif
4875     atomic_set(&rq->nr_iowait, 0);
4876     for (j = 0; j < MAX_PRIO; j++)
4877     INIT_LIST_HEAD(&rq->queue[j]);
4878     memset(rq->bitmap, 0, BITS_TO_LONGS(MAX_PRIO)*sizeof(long));
4879     /*
4880     * delimiter for bitsearch
4881     */
4882     __set_bit(MAX_PRIO, rq->bitmap);
4883     }
4884    
4885     /*
4886     * The boot idle thread does lazy MMU switching as well:
4887     */
4888     atomic_inc(&init_mm.mm_count);
4889     enter_lazy_tlb(&init_mm, current);
4890    
4891     /*
4892     * Make us the idle thread. Technically, schedule() should not be
4893     * called from this thread, however somewhere below it might be,
4894     * but because we are the idle thread, we just pick up running again
4895     * when this runqueue becomes "idle".
4896     */
4897     init_idle(current, smp_processor_id());
4898     }
4899    
4900     #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
/*
 * __might_sleep - complain when a sleeping function is called from
 * atomic context or with interrupts disabled.
 * @file: source file of the call site
 * @line: source line of the call site
 *
 * Only active once the system is running and no oops is in progress.
 * Reports are rate limited to one per HZ jiffies.  The body compiles to
 * nothing when in_atomic is not defined.
 */
void __might_sleep(char *file, int line)
{
#if defined(in_atomic)
	static unsigned long prev_jiffy;	/* ratelimiting */

	if (!in_atomic() && !irqs_disabled())
		return;
	if (system_state != SYSTEM_RUNNING || oops_in_progress)
		return;

	/* prev_jiffy == 0 means nothing has been reported yet. */
	if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
		return;
	prev_jiffy = jiffies;

	printk(KERN_ERR "Debug: sleeping function called from invalid"
			" context at %s:%d\n", file, line);
	printk("in_atomic():%d, irqs_disabled():%d\n",
		in_atomic(), irqs_disabled());
	dump_stack();
#endif
}
EXPORT_SYMBOL(__might_sleep);
4920     #endif
4921    
4922     #ifdef CONFIG_MAGIC_SYSRQ
4923     void normalize_rt_tasks(void)
4924     {
4925     struct task_struct *p;
4926     unsigned long flags;
4927     runqueue_t *rq;
4928     int queued;
4929    
4930     read_lock_irq(&tasklist_lock);
4931     for_each_process (p) {
4932     if (!rt_task(p))
4933     continue;
4934    
4935     rq = task_rq_lock(p, &flags);
4936    
4937     if ((queued = task_queued(p)))
4938     deactivate_task(p, task_rq(p));
4939     __setscheduler(p, SCHED_NORMAL, 0);
4940     if (queued) {
4941     __activate_task(p, task_rq(p));
4942     resched_task(rq->curr);
4943     }
4944    
4945     task_rq_unlock(rq, &flags);
4946     }
4947     read_unlock_irq(&tasklist_lock);
4948     }
4949    
4950     #endif /* CONFIG_MAGIC_SYSRQ */