Magellan Linux

Contents of /trunk/kernel26-alx/patches-2.6.17-r6/0002-2.6.17-sched-revise_smt_nice_locking.patch



Revision 199
Fri May 18 11:04:36 2007 UTC by niro
File size: 9226 bytes
-import

Initial report and lock contention fix from Chris Mason:

Recent benchmarks showed some performance regressions between 2.6.16 and
2.6.5. We tracked down one of the regressions to lock contention in
schedule-heavy workloads (~70,000 context switches per second).

kernel/sched.c:dependent_sleeper() was responsible for most of the lock
contention, hammering on the run queue locks. The patch below is more of
a discussion point than a suggested fix (although it does reduce lock
contention significantly). The dependent_sleeper code looks very expensive
to me, especially for using a spinlock to bounce control between two
different siblings of the same CPU.

It is further optimized:

* perform dependent_sleeper check after next task is determined
* convert wake_sleeping_dependent to use trylock
* skip smt runqueue check if trylock fails
* optimize double_rq_lock now that smt nice is converted to trylock
* early exit in searching first SD_SHARE_CPUPOWER domain
* speedup fast path of dependent_sleeper


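The heart of the change is the trylock-and-skip pattern in the second and third bullets: instead of dropping this_rq's lock and taking every sibling runqueue lock in CPU order, the reworked code opportunistically trylocks each sibling and simply skips it when the lock is contended. Below is a minimal user-space sketch of that pattern, with pthread mutexes standing in for runqueue spinlocks; the struct and function names are invented for illustration and are not the kernel's.

```c
#include <pthread.h>
#include <stdio.h>

#define NR_SIBLINGS 2

/* Stand-in for a per-CPU runqueue: just a lock and a flag. */
struct fake_rq {
	pthread_mutex_t lock;
	int has_sleeping_task;
};

static struct fake_rq rq[NR_SIBLINGS] = {
	{ PTHREAD_MUTEX_INITIALIZER, 1 },
	{ PTHREAD_MUTEX_INITIALIZER, 0 },
};

/*
 * Analogue of the reworked wake_sleeping_dependent(): the caller already
 * holds its own runqueue lock, so we never block on a sibling's lock.
 * We trylock it and skip the sibling entirely if someone else holds it.
 */
static void wake_sleeping_siblings(int this_cpu)
{
	for (int i = 0; i < NR_SIBLINGS; i++) {
		if (i == this_cpu)
			continue;
		if (pthread_mutex_trylock(&rq[i].lock) != 0)
			continue;	/* contended: skip rather than wait */

		if (rq[i].has_sleeping_task) {
			rq[i].has_sleeping_task = 0;
			printf("woke sibling %d\n", i);
		}
		pthread_mutex_unlock(&rq[i].lock);
	}
}

int main(void)
{
	/* Pretend we are CPU 1, already holding our own runqueue lock. */
	pthread_mutex_lock(&rq[1].lock);
	wake_sleeping_siblings(1);
	pthread_mutex_unlock(&rq[1].lock);
	return 0;
}
```

Because the sibling locks are only ever trylocked, the strict CPU-order acquisition the old code needed, and the unlock/relock dance in the scheduler fast path that came with it, can be removed, which is what the wake_sleeping_dependent(), dependent_sleeper() and __sched hunks below do.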
Signed-off-by: Ken Chen <kenneth.w.chen@intel.com>
Signed-off-by: Con Kolivas <kernel@kolivas.org>
Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Chris Mason <mason@suse.com>

---
 kernel/sched.c | 175 ++++++++++++++++++---------------------------------------
 1 files changed, 57 insertions(+), 118 deletions(-)

Index: linux-ck-dev/kernel/sched.c
===================================================================
--- linux-ck-dev.orig/kernel/sched.c 2006-06-18 15:21:31.000000000 +1000
+++ linux-ck-dev/kernel/sched.c 2006-06-18 15:21:45.000000000 +1000
@@ -1157,9 +1157,10 @@ static int sched_balance_self(int cpu, i
 struct task_struct *t = current;
 struct sched_domain *tmp, *sd = NULL;

- for_each_domain(cpu, tmp)
+ for_each_domain(cpu, tmp) {
 if (tmp->flags & flag)
 sd = tmp;
+ }

 while (sd) {
 cpumask_t span;
@@ -1790,7 +1791,7 @@ static void double_rq_lock(runqueue_t *r
 spin_lock(&rq1->lock);
 __acquire(rq2->lock); /* Fake it out ;) */
 } else {
- if (rq1->cpu < rq2->cpu) {
+ if (rq1 < rq2) {
 spin_lock(&rq1->lock);
 spin_lock(&rq2->lock);
 } else {
@@ -1826,7 +1827,7 @@ static void double_lock_balance(runqueue
 __acquires(this_rq->lock)
 {
 if (unlikely(!spin_trylock(&busiest->lock))) {
- if (busiest->cpu < this_rq->cpu) {
+ if (busiest < this_rq) {
 spin_unlock(&this_rq->lock);
 spin_lock(&busiest->lock);
 spin_lock(&this_rq->lock);
@@ -2521,10 +2522,11 @@ static void active_load_balance(runqueue
 double_lock_balance(busiest_rq, target_rq);

 /* Search for an sd spanning us and the target CPU. */
- for_each_domain(target_cpu, sd)
+ for_each_domain(target_cpu, sd) {
 if ((sd->flags & SD_LOAD_BALANCE) &&
 cpu_isset(busiest_cpu, sd->span))
 break;
+ }

 if (unlikely(sd == NULL))
 goto out;
@@ -2861,48 +2863,35 @@ static inline void wakeup_busy_runqueue(
 resched_task(rq->idle);
 }

-static void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq)
+/*
+ * Called with interrupt disabled and this_rq's runqueue locked.
+ */
+static void wake_sleeping_dependent(int this_cpu)
 {
 struct sched_domain *tmp, *sd = NULL;
- cpumask_t sibling_map;
 int i;

- for_each_domain(this_cpu, tmp)
- if (tmp->flags & SD_SHARE_CPUPOWER)
+ for_each_domain(this_cpu, tmp) {
+ if (tmp->flags & SD_SHARE_CPUPOWER) {
 sd = tmp;
+ break;
+ }
+ }

 if (!sd)
 return;

- /*
- * Unlock the current runqueue because we have to lock in
- * CPU order to avoid deadlocks. Caller knows that we might
- * unlock. We keep IRQs disabled.
- */
- spin_unlock(&this_rq->lock);
-
- sibling_map = sd->span;
-
- for_each_cpu_mask(i, sibling_map)
- spin_lock(&cpu_rq(i)->lock);
- /*
- * We clear this CPU from the mask. This both simplifies the
- * inner loop and keps this_rq locked when we exit:
- */
- cpu_clear(this_cpu, sibling_map);
-
- for_each_cpu_mask(i, sibling_map) {
+ for_each_cpu_mask(i, sd->span) {
 runqueue_t *smt_rq = cpu_rq(i);

+ if (i == this_cpu)
+ continue;
+ if (unlikely(!spin_trylock(&smt_rq->lock)))
+ continue;
+
 wakeup_busy_runqueue(smt_rq);
+ spin_unlock(&smt_rq->lock);
 }
-
- for_each_cpu_mask(i, sibling_map)
- spin_unlock(&cpu_rq(i)->lock);
- /*
- * We exit with this_cpu's rq still held and IRQs
- * still disabled:
- */
 }

 /*
@@ -2915,52 +2904,46 @@ static inline unsigned long smt_slice(ta
 return p->time_slice * (100 - sd->per_cpu_gain) / 100;
 }

-static int dependent_sleeper(int this_cpu, runqueue_t *this_rq)
+/*
+ * To minimise lock contention and not have to drop this_rq's runlock we only
+ * trylock the sibling runqueues and bypass those runqueues if we fail to
+ * acquire their lock. As we only trylock the normal locking order does not
+ * need to be obeyed.
+ */
+static int dependent_sleeper(int this_cpu, runqueue_t *this_rq, task_t *p)
 {
 struct sched_domain *tmp, *sd = NULL;
- cpumask_t sibling_map;
- prio_array_t *array;
 int ret = 0, i;
- task_t *p;

- for_each_domain(this_cpu, tmp)
- if (tmp->flags & SD_SHARE_CPUPOWER)
+ /* kernel/rt threads do not participate in dependent sleeping */
+ if (!p->mm || rt_task(p))
+ return 0;
+
+ for_each_domain(this_cpu, tmp) {
+ if (tmp->flags & SD_SHARE_CPUPOWER) {
 sd = tmp;
+ break;
+ }
+ }

 if (!sd)
 return 0;

- /*
- * The same locking rules and details apply as for
- * wake_sleeping_dependent():
- */
- spin_unlock(&this_rq->lock);
- sibling_map = sd->span;
- for_each_cpu_mask(i, sibling_map)
- spin_lock(&cpu_rq(i)->lock);
- cpu_clear(this_cpu, sibling_map);
+ for_each_cpu_mask(i, sd->span) {
+ runqueue_t *smt_rq;
+ task_t *smt_curr;

- /*
- * Establish next task to be run - it might have gone away because
- * we released the runqueue lock above:
- */
- if (!this_rq->nr_running)
- goto out_unlock;
- array = this_rq->active;
- if (!array->nr_active)
- array = this_rq->expired;
- BUG_ON(!array->nr_active);
+ if (i == this_cpu)
+ continue;

- p = list_entry(array->queue[sched_find_first_bit(array->bitmap)].next,
- task_t, run_list);
+ smt_rq = cpu_rq(i);
+ if (unlikely(!spin_trylock(&smt_rq->lock)))
+ continue;

- for_each_cpu_mask(i, sibling_map) {
- runqueue_t *smt_rq = cpu_rq(i);
- task_t *smt_curr = smt_rq->curr;
+ smt_curr = smt_rq->curr;

- /* Kernel threads do not participate in dependent sleeping */
- if (!p->mm || !smt_curr->mm || rt_task(p))
- goto check_smt_task;
+ if (!smt_curr->mm)
+ goto unlock;

 /*
 * If a user task with lower static priority than the
@@ -2984,43 +2967,17 @@ static int dependent_sleeper(int this_cp
 smt_slice(smt_curr, sd) > task_timeslice(p))
 ret = 1;

-check_smt_task:
- if ((!smt_curr->mm && smt_curr != smt_rq->idle) ||
- rt_task(smt_curr))
- continue;
- if (!p->mm) {
- wakeup_busy_runqueue(smt_rq);
- continue;
- }
-
- /*
- * Reschedule a lower priority task on the SMT sibling for
- * it to be put to sleep, or wake it up if it has been put to
- * sleep for priority reasons to see if it should run now.
- */
- if (rt_task(p)) {
- if ((jiffies % DEF_TIMESLICE) >
- (sd->per_cpu_gain * DEF_TIMESLICE / 100))
- resched_task(smt_curr);
- } else {
- if (TASK_PREEMPTS_CURR(p, smt_rq) &&
- smt_slice(p, sd) > task_timeslice(smt_curr))
- resched_task(smt_curr);
- else
- wakeup_busy_runqueue(smt_rq);
- }
+unlock:
+ spin_unlock(&smt_rq->lock);
 }
-out_unlock:
- for_each_cpu_mask(i, sibling_map)
- spin_unlock(&cpu_rq(i)->lock);
 return ret;
 }
 #else
-static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq)
+static inline void wake_sleeping_dependent(int this_cpu)
 {
 }

-static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq)
+static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq, task_t *p)
 {
 return 0;
 }
@@ -3142,32 +3099,13 @@ need_resched_nonpreemptible:

 cpu = smp_processor_id();
 if (unlikely(!rq->nr_running)) {
-go_idle:
 idle_balance(cpu, rq);
 if (!rq->nr_running) {
 next = rq->idle;
 rq->expired_timestamp = 0;
- wake_sleeping_dependent(cpu, rq);
- /*
- * wake_sleeping_dependent() might have released
- * the runqueue, so break out if we got new
- * tasks meanwhile:
- */
- if (!rq->nr_running)
- goto switch_tasks;
- }
- } else {
- if (dependent_sleeper(cpu, rq)) {
- next = rq->idle;
+ wake_sleeping_dependent(cpu);
 goto switch_tasks;
 }
- /*
- * dependent_sleeper() releases and reacquires the runqueue
- * lock, hence go into the idle loop if the rq went
- * empty meanwhile:
- */
- if (unlikely(!rq->nr_running))
- goto go_idle;
 }

 array = rq->active;
@@ -3205,6 +3143,8 @@ go_idle:
 }
 }
 next->sleep_type = SLEEP_NORMAL;
+ if (dependent_sleeper(cpu, rq, next))
+ next = rq->idle;
 switch_tasks:
 if (next == rq->idle)
 schedstat_inc(rq, sched_goidle);
@@ -6306,7 +6246,6 @@ void __init sched_init(void)
 rq->push_cpu = 0;
 rq->migration_thread = NULL;
 INIT_LIST_HEAD(&rq->migration_queue);
- rq->cpu = i;
 #endif
 atomic_set(&rq->nr_iowait, 0);

@@ -6368,7 +6307,7 @@ void normalize_rt_tasks(void)
 runqueue_t *rq;

 read_lock_irq(&tasklist_lock);
- for_each_process (p) {
+ for_each_process(p) {
 if (!rt_task(p))
 continue;

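For reference, the double_rq_lock()/double_lock_balance() hunks above replace the rq->cpu comparison with a comparison of the runqueue pointers themselves, which is why the sched_init() hunk can drop the rq->cpu field. A minimal user-space sketch of that address-ordered locking discipline follows; pthread mutexes and invented names stand in for the kernel's runqueue and spinlock types.

```c
#include <pthread.h>

struct fake_rq {
	pthread_mutex_t lock;
};

/*
 * Take both runqueue locks without deadlocking: every caller acquires
 * them in the same global order, here simply ascending address, so two
 * CPUs locking the same pair can never cross each other.
 */
static void double_rq_lock_sketch(struct fake_rq *rq1, struct fake_rq *rq2)
{
	if (rq1 == rq2) {
		pthread_mutex_lock(&rq1->lock);	/* same queue: one lock suffices */
	} else if (rq1 < rq2) {
		pthread_mutex_lock(&rq1->lock);
		pthread_mutex_lock(&rq2->lock);
	} else {
		pthread_mutex_lock(&rq2->lock);
		pthread_mutex_lock(&rq1->lock);
	}
}

static void double_rq_unlock_sketch(struct fake_rq *rq1, struct fake_rq *rq2)
{
	pthread_mutex_unlock(&rq1->lock);
	if (rq1 != rq2)
		pthread_mutex_unlock(&rq2->lock);	/* unlock order does not matter */
}

int main(void)
{
	static struct fake_rq a = { PTHREAD_MUTEX_INITIALIZER };
	static struct fake_rq b = { PTHREAD_MUTEX_INITIALIZER };

	double_rq_lock_sketch(&a, &b);
	double_rq_unlock_sketch(&a, &b);
	return 0;
}
```

Any consistent total order avoids the deadlock; ordering by address lets the scheduler drop the per-runqueue cpu number that existed only to establish that order.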