Contents of /trunk/kernel26-alx/patches-2.6.17-r7/0002-2.6.17-sched-revise_smt_nice_locking.patch
Revision 199
Fri May 18 11:04:36 2007 UTC (17 years, 4 months ago) by niro
File size: 9226 byte(s)
-import
Initial report and lock contention fix from Chris Mason:

Recent benchmarks showed some performance regressions between 2.6.16 and
2.6.5. We tracked down one of the regressions to lock contention in schedule
heavy workloads (~70,000 context switches per second)

kernel/sched.c:dependent_sleeper() was responsible for most of the lock
contention, hammering on the run queue locks. The patch below is more of
a discussion point than a suggested fix (although it does reduce lock
contention significantly). The dependent_sleeper code looks very expensive
to me, especially for using a spinlock to bounce control between two different
siblings in the same cpu.

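For reference, the locking scheme being replaced looked roughly like the condensed fragment below (paraphrased from the hunks removed further down, not a verbatim copy): the local runqueue lock was dropped and every sibling runqueue lock was then taken in CPU order before the siblings were even inspected.

	/* old scheme, condensed and illustrative only */
	spin_unlock(&this_rq->lock);		/* drop our own runqueue lock... */
	sibling_map = sd->span;
	for_each_cpu_mask(i, sibling_map)
		spin_lock(&cpu_rq(i)->lock);	/* ...then take every sibling lock in CPU order */
	cpu_clear(this_cpu, sibling_map);	/* keep this_rq locked on exit */
	/* ... inspect and resched the sibling runqueues ... */
	for_each_cpu_mask(i, sibling_map)
		spin_unlock(&cpu_rq(i)->lock);
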
It is further optimized:

* perform dependent_sleeper check after next task is determined
* convert wake_sleeping_dependent to use trylock
* skip smt runqueue check if trylock fails
* optimize double_rq_lock now that smt nice is converted to trylock
* early exit in searching first SD_SHARE_CPUPOWER domain
* speedup fast path of dependent_sleeper

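The trylock conversion listed above boils down to the following pattern, condensed here from the new wake_sleeping_dependent() in the diff below (an illustrative sketch, not a verbatim copy of the patch): each sibling runqueue is only trylocked and is simply skipped when its lock is contended, so this_rq's lock never has to be dropped and no global lock ordering has to be observed.

	for_each_cpu_mask(i, sd->span) {
		runqueue_t *smt_rq = cpu_rq(i);

		if (i == this_cpu)
			continue;
		/* contended? skip this sibling instead of blocking on it */
		if (unlikely(!spin_trylock(&smt_rq->lock)))
			continue;

		wakeup_busy_runqueue(smt_rq);
		spin_unlock(&smt_rq->lock);
	}

dependent_sleeper() uses the same pattern and is additionally called only after the next task has been selected, with an early return for kernel and RT threads, so those bail out before any sibling runqueue is touched. The trade-off is that a contended sibling check is occasionally skipped rather than waited for, which is what removes the lock contention.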

Signed-off-by: Ken Chen <kenneth.w.chen@intel.com>
Signed-off-by: Con Kolivas <kernel@kolivas.org>
Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Chris Mason <mason@suse.com>

---
kernel/sched.c | 175 ++++++++++++++++++---------------------------------------
1 files changed, 57 insertions(+), 118 deletions(-)

Index: linux-ck-dev/kernel/sched.c
===================================================================
--- linux-ck-dev.orig/kernel/sched.c 2006-06-18 15:21:31.000000000 +1000
+++ linux-ck-dev/kernel/sched.c 2006-06-18 15:21:45.000000000 +1000
@@ -1157,9 +1157,10 @@ static int sched_balance_self(int cpu, i
struct task_struct *t = current;
struct sched_domain *tmp, *sd = NULL;

- for_each_domain(cpu, tmp)
+ for_each_domain(cpu, tmp) {
if (tmp->flags & flag)
sd = tmp;
+ }

while (sd) {
cpumask_t span;
@@ -1790,7 +1791,7 @@ static void double_rq_lock(runqueue_t *r
spin_lock(&rq1->lock);
__acquire(rq2->lock); /* Fake it out ;) */
} else {
- if (rq1->cpu < rq2->cpu) {
+ if (rq1 < rq2) {
spin_lock(&rq1->lock);
spin_lock(&rq2->lock);
} else {
@@ -1826,7 +1827,7 @@ static void double_lock_balance(runqueue
__acquires(this_rq->lock)
{
if (unlikely(!spin_trylock(&busiest->lock))) {
- if (busiest->cpu < this_rq->cpu) {
+ if (busiest < this_rq) {
spin_unlock(&this_rq->lock);
spin_lock(&busiest->lock);
spin_lock(&this_rq->lock);
@@ -2521,10 +2522,11 @@ static void active_load_balance(runqueue
double_lock_balance(busiest_rq, target_rq);

/* Search for an sd spanning us and the target CPU. */
- for_each_domain(target_cpu, sd)
+ for_each_domain(target_cpu, sd) {
if ((sd->flags & SD_LOAD_BALANCE) &&
cpu_isset(busiest_cpu, sd->span))
break;
+ }

if (unlikely(sd == NULL))
goto out;
@@ -2861,48 +2863,35 @@ static inline void wakeup_busy_runqueue(
resched_task(rq->idle);
}

-static void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq)
+/*
+ * Called with interrupt disabled and this_rq's runqueue locked.
+ */
+static void wake_sleeping_dependent(int this_cpu)
{
struct sched_domain *tmp, *sd = NULL;
- cpumask_t sibling_map;
int i;

- for_each_domain(this_cpu, tmp)
- if (tmp->flags & SD_SHARE_CPUPOWER)
+ for_each_domain(this_cpu, tmp) {
+ if (tmp->flags & SD_SHARE_CPUPOWER) {
sd = tmp;
+ break;
+ }
+ }

if (!sd)
return;

- /*
- * Unlock the current runqueue because we have to lock in
- * CPU order to avoid deadlocks. Caller knows that we might
- * unlock. We keep IRQs disabled.
- */
- spin_unlock(&this_rq->lock);
-
- sibling_map = sd->span;
-
- for_each_cpu_mask(i, sibling_map)
- spin_lock(&cpu_rq(i)->lock);
- /*
- * We clear this CPU from the mask. This both simplifies the
- * inner loop and keps this_rq locked when we exit:
- */
- cpu_clear(this_cpu, sibling_map);
-
- for_each_cpu_mask(i, sibling_map) {
+ for_each_cpu_mask(i, sd->span) {
runqueue_t *smt_rq = cpu_rq(i);

+ if (i == this_cpu)
+ continue;
+ if (unlikely(!spin_trylock(&smt_rq->lock)))
+ continue;
+
wakeup_busy_runqueue(smt_rq);
+ spin_unlock(&smt_rq->lock);
}
-
- for_each_cpu_mask(i, sibling_map)
- spin_unlock(&cpu_rq(i)->lock);
- /*
- * We exit with this_cpu's rq still held and IRQs
- * still disabled:
- */
}

/*
@@ -2915,52 +2904,46 @@ static inline unsigned long smt_slice(ta
return p->time_slice * (100 - sd->per_cpu_gain) / 100;
}

-static int dependent_sleeper(int this_cpu, runqueue_t *this_rq)
+/*
+ * To minimise lock contention and not have to drop this_rq's runlock we only
+ * trylock the sibling runqueues and bypass those runqueues if we fail to
+ * acquire their lock. As we only trylock the normal locking order does not
+ * need to be obeyed.
+ */
+static int dependent_sleeper(int this_cpu, runqueue_t *this_rq, task_t *p)
{
struct sched_domain *tmp, *sd = NULL;
- cpumask_t sibling_map;
- prio_array_t *array;
int ret = 0, i;
- task_t *p;

- for_each_domain(this_cpu, tmp)
- if (tmp->flags & SD_SHARE_CPUPOWER)
+ /* kernel/rt threads do not participate in dependent sleeping */
+ if (!p->mm || rt_task(p))
+ return 0;
+
+ for_each_domain(this_cpu, tmp) {
+ if (tmp->flags & SD_SHARE_CPUPOWER) {
sd = tmp;
+ break;
+ }
+ }

if (!sd)
return 0;

- /*
- * The same locking rules and details apply as for
- * wake_sleeping_dependent():
- */
- spin_unlock(&this_rq->lock);
- sibling_map = sd->span;
- for_each_cpu_mask(i, sibling_map)
- spin_lock(&cpu_rq(i)->lock);
- cpu_clear(this_cpu, sibling_map);
+ for_each_cpu_mask(i, sd->span) {
+ runqueue_t *smt_rq;
+ task_t *smt_curr;

- /*
- * Establish next task to be run - it might have gone away because
- * we released the runqueue lock above:
- */
- if (!this_rq->nr_running)
- goto out_unlock;
- array = this_rq->active;
- if (!array->nr_active)
- array = this_rq->expired;
- BUG_ON(!array->nr_active);
+ if (i == this_cpu)
+ continue;

- p = list_entry(array->queue[sched_find_first_bit(array->bitmap)].next,
- task_t, run_list);
+ smt_rq = cpu_rq(i);
+ if (unlikely(!spin_trylock(&smt_rq->lock)))
+ continue;

- for_each_cpu_mask(i, sibling_map) {
- runqueue_t *smt_rq = cpu_rq(i);
- task_t *smt_curr = smt_rq->curr;
+ smt_curr = smt_rq->curr;

- /* Kernel threads do not participate in dependent sleeping */
- if (!p->mm || !smt_curr->mm || rt_task(p))
- goto check_smt_task;
+ if (!smt_curr->mm)
+ goto unlock;

/*
* If a user task with lower static priority than the
@@ -2984,43 +2967,17 @@ static int dependent_sleeper(int this_cp
smt_slice(smt_curr, sd) > task_timeslice(p))
ret = 1;

-check_smt_task:
- if ((!smt_curr->mm && smt_curr != smt_rq->idle) ||
- rt_task(smt_curr))
- continue;
- if (!p->mm) {
- wakeup_busy_runqueue(smt_rq);
- continue;
- }
-
- /*
- * Reschedule a lower priority task on the SMT sibling for
- * it to be put to sleep, or wake it up if it has been put to
- * sleep for priority reasons to see if it should run now.
- */
- if (rt_task(p)) {
- if ((jiffies % DEF_TIMESLICE) >
- (sd->per_cpu_gain * DEF_TIMESLICE / 100))
- resched_task(smt_curr);
- } else {
- if (TASK_PREEMPTS_CURR(p, smt_rq) &&
- smt_slice(p, sd) > task_timeslice(smt_curr))
- resched_task(smt_curr);
- else
- wakeup_busy_runqueue(smt_rq);
- }
+unlock:
+ spin_unlock(&smt_rq->lock);
}
-out_unlock:
- for_each_cpu_mask(i, sibling_map)
- spin_unlock(&cpu_rq(i)->lock);
return ret;
}
#else
-static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq)
+static inline void wake_sleeping_dependent(int this_cpu)
{
}

-static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq)
+static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq, task_t *p)
{
return 0;
}
@@ -3142,32 +3099,13 @@ need_resched_nonpreemptible:

cpu = smp_processor_id();
if (unlikely(!rq->nr_running)) {
-go_idle:
idle_balance(cpu, rq);
if (!rq->nr_running) {
next = rq->idle;
rq->expired_timestamp = 0;
- wake_sleeping_dependent(cpu, rq);
- /*
- * wake_sleeping_dependent() might have released
- * the runqueue, so break out if we got new
- * tasks meanwhile:
- */
- if (!rq->nr_running)
- goto switch_tasks;
- }
- } else {
- if (dependent_sleeper(cpu, rq)) {
- next = rq->idle;
+ wake_sleeping_dependent(cpu);
goto switch_tasks;
}
- /*
- * dependent_sleeper() releases and reacquires the runqueue
- * lock, hence go into the idle loop if the rq went
- * empty meanwhile:
- */
- if (unlikely(!rq->nr_running))
- goto go_idle;
}

array = rq->active;
@@ -3205,6 +3143,8 @@ go_idle:
}
}
next->sleep_type = SLEEP_NORMAL;
+ if (dependent_sleeper(cpu, rq, next))
+ next = rq->idle;
switch_tasks:
if (next == rq->idle)
schedstat_inc(rq, sched_goidle);
@@ -6306,7 +6246,6 @@ void __init sched_init(void)
rq->push_cpu = 0;
rq->migration_thread = NULL;
INIT_LIST_HEAD(&rq->migration_queue);
- rq->cpu = i;
#endif
atomic_set(&rq->nr_iowait, 0);

@@ -6368,7 +6307,7 @@ void normalize_rt_tasks(void)
runqueue_t *rq;

read_lock_irq(&tasklist_lock);
- for_each_process (p) {
+ for_each_process(p) {
if (!rt_task(p))
continue;