Magellan Linux

Annotation of /trunk/kernel26-alx/patches-2.6.17-r7/0002-2.6.17-sched-revise_smt_nice_locking.patch

Revision 199 - Fri May 18 11:04:36 2007 UTC by niro
File size: 9226 byte(s)
Log message: -import

Initial report and lock contention fix from Chris Mason:

Recent benchmarks showed some performance regressions between 2.6.16 and
2.6.5. We tracked down one of the regressions to lock contention in
schedule-heavy workloads (~70,000 context switches per second).

kernel/sched.c:dependent_sleeper() was responsible for most of the lock
contention, hammering on the run queue locks. The patch below is more of
a discussion point than a suggested fix (although it does reduce lock
contention significantly). The dependent_sleeper code looks very expensive
to me, especially its use of a spinlock to bounce control between two
different siblings of the same CPU.
13    
It is further optimized:

* perform dependent_sleeper check after next task is determined
* convert wake_sleeping_dependent to use trylock
* skip smt runqueue check if trylock fails
* optimize double_rq_lock now that smt nice is converted to trylock
* early exit in searching first SD_SHARE_CPUPOWER domain
* speedup fast path of dependent_sleeper
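
For illustration only, not part of the patch: the trylock items above boil
down to probing each sibling runqueue with a trylock and simply skipping that
sibling on contention, so the caller never drops its own runqueue lock and
never has to take the sibling locks in a fixed order. The user-space sketch
below shows the same pattern with pthreads; the names toy_rq, peek_siblings
and NR_SIBLINGS are invented for this example and are not kernel identifiers.

#include <pthread.h>
#include <stdio.h>

#define NR_SIBLINGS 2

struct toy_rq {
        pthread_mutex_t lock;
        int nr_running;
};

static struct toy_rq sibling[NR_SIBLINGS];

/*
 * In the kernel this pattern runs with the caller's own runqueue lock
 * already held; a sibling whose lock is contended is bypassed instead
 * of waited for, so no global lock order is needed.
 */
static int peek_siblings(int this_cpu)
{
        int i, busy = 0;

        for (i = 0; i < NR_SIBLINGS; i++) {
                if (i == this_cpu)
                        continue;
                if (pthread_mutex_trylock(&sibling[i].lock) != 0)
                        continue;       /* contended: skip this sibling */
                if (sibling[i].nr_running)
                        busy++;
                pthread_mutex_unlock(&sibling[i].lock);
        }
        return busy;
}

int main(void)
{
        int i;

        for (i = 0; i < NR_SIBLINGS; i++) {
                pthread_mutex_init(&sibling[i].lock, NULL);
                sibling[i].nr_running = i;      /* arbitrary demo values */
        }
        printf("busy siblings seen from cpu 0: %d\n", peek_siblings(0));
        return 0;
}

The cost of the trylock is that a contended sibling is simply not examined on
that pass ("skip smt runqueue check if trylock fails" above), which the patch
trades for eliminating the unlock/relock of this_rq on every pass through the
scheduler.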

Signed-off-by: Ken Chen <kenneth.w.chen@intel.com>
Signed-off-by: Con Kolivas <kernel@kolivas.org>
Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Chris Mason <mason@suse.com>

---
 kernel/sched.c | 175 ++++++++++++++++++---------------------------------------
 1 files changed, 57 insertions(+), 118 deletions(-)

Index: linux-ck-dev/kernel/sched.c
===================================================================
--- linux-ck-dev.orig/kernel/sched.c 2006-06-18 15:21:31.000000000 +1000
+++ linux-ck-dev/kernel/sched.c 2006-06-18 15:21:45.000000000 +1000
@@ -1157,9 +1157,10 @@ static int sched_balance_self(int cpu, i
         struct task_struct *t = current;
         struct sched_domain *tmp, *sd = NULL;

-        for_each_domain(cpu, tmp)
+        for_each_domain(cpu, tmp) {
                 if (tmp->flags & flag)
                         sd = tmp;
+        }

         while (sd) {
                 cpumask_t span;
@@ -1790,7 +1791,7 @@ static void double_rq_lock(runqueue_t *r
                 spin_lock(&rq1->lock);
                 __acquire(rq2->lock); /* Fake it out ;) */
         } else {
-                if (rq1->cpu < rq2->cpu) {
+                if (rq1 < rq2) {
                         spin_lock(&rq1->lock);
                         spin_lock(&rq2->lock);
                 } else {
@@ -1826,7 +1827,7 @@ static void double_lock_balance(runqueue
         __acquires(this_rq->lock)
 {
         if (unlikely(!spin_trylock(&busiest->lock))) {
-                if (busiest->cpu < this_rq->cpu) {
+                if (busiest < this_rq) {
                         spin_unlock(&this_rq->lock);
                         spin_lock(&busiest->lock);
                         spin_lock(&this_rq->lock);
@@ -2521,10 +2522,11 @@ static void active_load_balance(runqueue
         double_lock_balance(busiest_rq, target_rq);

         /* Search for an sd spanning us and the target CPU. */
-        for_each_domain(target_cpu, sd)
+        for_each_domain(target_cpu, sd) {
                 if ((sd->flags & SD_LOAD_BALANCE) &&
                         cpu_isset(busiest_cpu, sd->span))
                         break;
+        }

         if (unlikely(sd == NULL))
                 goto out;
@@ -2861,48 +2863,35 @@ static inline void wakeup_busy_runqueue(
                 resched_task(rq->idle);
 }

-static void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq)
+/*
+ * Called with interrupt disabled and this_rq's runqueue locked.
+ */
+static void wake_sleeping_dependent(int this_cpu)
 {
         struct sched_domain *tmp, *sd = NULL;
-        cpumask_t sibling_map;
         int i;

-        for_each_domain(this_cpu, tmp)
-                if (tmp->flags & SD_SHARE_CPUPOWER)
+        for_each_domain(this_cpu, tmp) {
+                if (tmp->flags & SD_SHARE_CPUPOWER) {
                         sd = tmp;
+                        break;
+                }
+        }

         if (!sd)
                 return;

-        /*
-         * Unlock the current runqueue because we have to lock in
-         * CPU order to avoid deadlocks. Caller knows that we might
-         * unlock. We keep IRQs disabled.
-         */
-        spin_unlock(&this_rq->lock);
-
-        sibling_map = sd->span;
-
-        for_each_cpu_mask(i, sibling_map)
-                spin_lock(&cpu_rq(i)->lock);
-        /*
-         * We clear this CPU from the mask. This both simplifies the
-         * inner loop and keps this_rq locked when we exit:
-         */
-        cpu_clear(this_cpu, sibling_map);
-
-        for_each_cpu_mask(i, sibling_map) {
+        for_each_cpu_mask(i, sd->span) {
                 runqueue_t *smt_rq = cpu_rq(i);

+                if (i == this_cpu)
+                        continue;
+                if (unlikely(!spin_trylock(&smt_rq->lock)))
+                        continue;
+
                 wakeup_busy_runqueue(smt_rq);
+                spin_unlock(&smt_rq->lock);
         }
-
-        for_each_cpu_mask(i, sibling_map)
-                spin_unlock(&cpu_rq(i)->lock);
-        /*
-         * We exit with this_cpu's rq still held and IRQs
-         * still disabled:
-         */
 }

 /*
@@ -2915,52 +2904,46 @@ static inline unsigned long smt_slice(ta
         return p->time_slice * (100 - sd->per_cpu_gain) / 100;
 }

-static int dependent_sleeper(int this_cpu, runqueue_t *this_rq)
+/*
+ * To minimise lock contention and not have to drop this_rq's runlock we only
+ * trylock the sibling runqueues and bypass those runqueues if we fail to
+ * acquire their lock. As we only trylock the normal locking order does not
+ * need to be obeyed.
+ */
+static int dependent_sleeper(int this_cpu, runqueue_t *this_rq, task_t *p)
 {
         struct sched_domain *tmp, *sd = NULL;
-        cpumask_t sibling_map;
-        prio_array_t *array;
         int ret = 0, i;
-        task_t *p;

-        for_each_domain(this_cpu, tmp)
-                if (tmp->flags & SD_SHARE_CPUPOWER)
+        /* kernel/rt threads do not participate in dependent sleeping */
+        if (!p->mm || rt_task(p))
+                return 0;
+
+        for_each_domain(this_cpu, tmp) {
+                if (tmp->flags & SD_SHARE_CPUPOWER) {
                         sd = tmp;
+                        break;
+                }
+        }

         if (!sd)
                 return 0;

-        /*
-         * The same locking rules and details apply as for
-         * wake_sleeping_dependent():
-         */
-        spin_unlock(&this_rq->lock);
-        sibling_map = sd->span;
-        for_each_cpu_mask(i, sibling_map)
-                spin_lock(&cpu_rq(i)->lock);
-        cpu_clear(this_cpu, sibling_map);
+        for_each_cpu_mask(i, sd->span) {
+                runqueue_t *smt_rq;
+                task_t *smt_curr;

-        /*
-         * Establish next task to be run - it might have gone away because
-         * we released the runqueue lock above:
-         */
-        if (!this_rq->nr_running)
-                goto out_unlock;
-        array = this_rq->active;
-        if (!array->nr_active)
-                array = this_rq->expired;
-        BUG_ON(!array->nr_active);
+                if (i == this_cpu)
+                        continue;

-        p = list_entry(array->queue[sched_find_first_bit(array->bitmap)].next,
-                task_t, run_list);
+                smt_rq = cpu_rq(i);
+                if (unlikely(!spin_trylock(&smt_rq->lock)))
+                        continue;

-        for_each_cpu_mask(i, sibling_map) {
-                runqueue_t *smt_rq = cpu_rq(i);
-                task_t *smt_curr = smt_rq->curr;
+                smt_curr = smt_rq->curr;

-                /* Kernel threads do not participate in dependent sleeping */
-                if (!p->mm || !smt_curr->mm || rt_task(p))
-                        goto check_smt_task;
+                if (!smt_curr->mm)
+                        goto unlock;

                 /*
                  * If a user task with lower static priority than the
@@ -2984,43 +2967,17 @@ static int dependent_sleeper(int this_cp
                                 smt_slice(smt_curr, sd) > task_timeslice(p))
                                         ret = 1;

-check_smt_task:
-                if ((!smt_curr->mm && smt_curr != smt_rq->idle) ||
-                        rt_task(smt_curr))
-                        continue;
-                if (!p->mm) {
-                        wakeup_busy_runqueue(smt_rq);
-                        continue;
-                }
-
-                /*
-                 * Reschedule a lower priority task on the SMT sibling for
-                 * it to be put to sleep, or wake it up if it has been put to
-                 * sleep for priority reasons to see if it should run now.
-                 */
-                if (rt_task(p)) {
-                        if ((jiffies % DEF_TIMESLICE) >
-                                (sd->per_cpu_gain * DEF_TIMESLICE / 100))
-                                        resched_task(smt_curr);
-                } else {
-                        if (TASK_PREEMPTS_CURR(p, smt_rq) &&
-                                smt_slice(p, sd) > task_timeslice(smt_curr))
-                                        resched_task(smt_curr);
-                        else
-                                wakeup_busy_runqueue(smt_rq);
-                }
+unlock:
+                spin_unlock(&smt_rq->lock);
         }
-out_unlock:
-        for_each_cpu_mask(i, sibling_map)
-                spin_unlock(&cpu_rq(i)->lock);
         return ret;
 }
 #else
-static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq)
+static inline void wake_sleeping_dependent(int this_cpu)
 {
 }

-static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq)
+static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq, task_t *p)
 {
         return 0;
 }
@@ -3142,32 +3099,13 @@ need_resched_nonpreemptible:

         cpu = smp_processor_id();
         if (unlikely(!rq->nr_running)) {
-go_idle:
                 idle_balance(cpu, rq);
                 if (!rq->nr_running) {
                         next = rq->idle;
                         rq->expired_timestamp = 0;
-                        wake_sleeping_dependent(cpu, rq);
-                        /*
-                         * wake_sleeping_dependent() might have released
-                         * the runqueue, so break out if we got new
-                         * tasks meanwhile:
-                         */
-                        if (!rq->nr_running)
-                                goto switch_tasks;
-                }
-        } else {
-                if (dependent_sleeper(cpu, rq)) {
-                        next = rq->idle;
+                        wake_sleeping_dependent(cpu);
                         goto switch_tasks;
                 }
-                /*
-                 * dependent_sleeper() releases and reacquires the runqueue
-                 * lock, hence go into the idle loop if the rq went
-                 * empty meanwhile:
-                 */
-                if (unlikely(!rq->nr_running))
-                        goto go_idle;
         }

         array = rq->active;
@@ -3205,6 +3143,8 @@ go_idle:
                 }
         }
         next->sleep_type = SLEEP_NORMAL;
+        if (dependent_sleeper(cpu, rq, next))
+                next = rq->idle;
 switch_tasks:
         if (next == rq->idle)
                 schedstat_inc(rq, sched_goidle);
@@ -6306,7 +6246,6 @@ void __init sched_init(void)
                 rq->push_cpu = 0;
                 rq->migration_thread = NULL;
                 INIT_LIST_HEAD(&rq->migration_queue);
-                rq->cpu = i;
 #endif
                 atomic_set(&rq->nr_iowait, 0);

@@ -6368,7 +6307,7 @@ void normalize_rt_tasks(void)
         runqueue_t *rq;

         read_lock_irq(&tasklist_lock);
-        for_each_process (p) {
+        for_each_process(p) {
                 if (!rt_task(p))
                         continue;
