Annotation of /trunk/kernel26-alx/patches-2.6.17-r6/0002-2.6.17-sched-revise_smt_nice_locking.patch
Revision 199
Fri May 18 11:04:36 2007 UTC (17 years, 4 months ago) by niro
File size: 9226 byte(s)
-import
1 | niro | 199 | Initial report and lock contention fix from Chris Mason: |
2 | |||
3 | Recent benchmarks showed some performance regressions between 2.6.16 and | ||
4 | 2.6.5. We tracked down one of the regressions to lock contention in schedule | ||
5 | heavy workloads (~70,000 context switches per second) | ||
6 | |||
7 | kernel/sched.c:dependent_sleeper() was responsible for most of the lock | ||
8 | contention, hammering on the run queue locks. The patch below is more of | ||
9 | a discussion point than a suggested fix (although it does reduce lock | ||
10 | contention significantly). The dependent_sleeper code looks very expensive | ||
11 | to me, especially for using a spinlock to bounce control between two different | ||
12 | siblings in the same cpu. | ||
13 | |||
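The pattern being criticised here can be modelled in userspace roughly as follows (pthread mutexes standing in for the per-runqueue spinlocks; a simplified sketch of the pre-patch behaviour, not the kernel code itself):

#include <pthread.h>

#define NR_SIBLINGS 2	/* two hardware threads sharing one core */

static pthread_mutex_t rq_lock[NR_SIBLINGS] = {
	PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER
};

/* Called with rq_lock[this_cpu] held; models the old dependent_sleeper(). */
static void old_sibling_check(int this_cpu)
{
	int i;

	/* Drop our own lock so all the locks can be retaken in CPU order. */
	pthread_mutex_unlock(&rq_lock[this_cpu]);

	/* Every schedule() on either sibling funnels through these locks. */
	for (i = 0; i < NR_SIBLINGS; i++)
		pthread_mutex_lock(&rq_lock[i]);

	/* ... inspect sibling state, decide whether to run or idle ... */

	for (i = 0; i < NR_SIBLINGS; i++)
		if (i != this_cpu)
			pthread_mutex_unlock(&rq_lock[i]);
	/* Return with only our own runqueue lock still held. */
}

int main(void)
{
	pthread_mutex_lock(&rq_lock[0]);
	old_sibling_check(0);
	pthread_mutex_unlock(&rq_lock[0]);
	return 0;
}
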
14 | It is further optimized: | ||
15 | |||
16 | * perform dependent_sleeper check after next task is determined | ||
17 | * convert wake_sleeping_dependent to use trylock | ||
18 | * skip smt runqueue check if trylock fails | ||
19 | * optimize double_rq_lock now that smt nice is converted to trylock | ||
20 | * early exit in searching first SD_SHARE_CPUPOWER domain | ||
21 | * speedup fast path of dependent_sleeper | ||
22 | |||
23 | |||
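As a rough userspace illustration of the trylock idea from the list above (a sketch only; the function name and the nr_running stand-in are invented for the example, this is not the kernel code):

#include <pthread.h>
#include <stdio.h>

#define NR_SIBLINGS 2

struct runqueue {
	pthread_mutex_t lock;
	int nr_running;		/* stand-in for the real per-CPU state */
};

static struct runqueue rq[NR_SIBLINGS] = {
	{ PTHREAD_MUTEX_INITIALIZER, 0 },
	{ PTHREAD_MUTEX_INITIALIZER, 0 },
};

/* Called with rq[this_cpu].lock held; unlike before, it is never dropped. */
static int sibling_check_trylock(int this_cpu)
{
	int i, ret = 0;

	for (i = 0; i < NR_SIBLINGS; i++) {
		if (i == this_cpu)
			continue;
		/* Busy sibling lock?  Skip it instead of spinning. */
		if (pthread_mutex_trylock(&rq[i].lock) != 0)
			continue;

		if (rq[i].nr_running)	/* placeholder for the nice policy */
			ret = 1;

		pthread_mutex_unlock(&rq[i].lock);
	}
	return ret;
}

int main(void)
{
	pthread_mutex_lock(&rq[0].lock);
	printf("defer to sibling: %d\n", sibling_check_trylock(0));
	pthread_mutex_unlock(&rq[0].lock);
	return 0;
}

The point of the conversion is that a contended sibling runqueue is simply skipped for this schedule() pass rather than waited for, so the siblings no longer serialise on each other's locks.
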
24 | Signed-off-by: Ken Chen <kenneth.w.chen@intel.com> | ||
25 | Signed-off-by: Con Kolivas <kernel@kolivas.org> | ||
26 | Signed-off-by: Nick Piggin <npiggin@suse.de> | ||
27 | Signed-off-by: Chris Mason <mason@suse.com> | ||
28 | |||
29 | --- | ||
30 | kernel/sched.c | 175 ++++++++++++++++++--------------------------------------- | ||
31 | 1 files changed, 57 insertions(+), 118 deletions(-) | ||
32 | |||
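Background for the double_rq_lock()/double_lock_balance() hunks below: with the rq->cpu field removed, a deadlock-free order for taking two runqueue locks can still be derived from the runqueue addresses themselves. A minimal generic sketch of that idea (illustration only, not the kernel code):

#include <pthread.h>
#include <stdint.h>

/*
 * Always acquire the lower-addressed lock first, so any two threads
 * locking the same pair agree on the order and cannot deadlock.
 */
static void lock_pair(pthread_mutex_t *a, pthread_mutex_t *b)
{
	if (a == b) {
		pthread_mutex_lock(a);		/* same queue: one lock */
	} else if ((uintptr_t)a < (uintptr_t)b) {
		pthread_mutex_lock(a);
		pthread_mutex_lock(b);
	} else {
		pthread_mutex_lock(b);
		pthread_mutex_lock(a);
	}
}

int main(void)
{
	pthread_mutex_t x = PTHREAD_MUTEX_INITIALIZER;
	pthread_mutex_t y = PTHREAD_MUTEX_INITIALIZER;

	lock_pair(&x, &y);
	pthread_mutex_unlock(&x);
	pthread_mutex_unlock(&y);
	return 0;
}
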
33 | Index: linux-ck-dev/kernel/sched.c | ||
34 | =================================================================== | ||
35 | --- linux-ck-dev.orig/kernel/sched.c 2006-06-18 15:21:31.000000000 +1000 | ||
36 | +++ linux-ck-dev/kernel/sched.c 2006-06-18 15:21:45.000000000 +1000 | ||
37 | @@ -1157,9 +1157,10 @@ static int sched_balance_self(int cpu, i | ||
38 | struct task_struct *t = current; | ||
39 | struct sched_domain *tmp, *sd = NULL; | ||
40 | |||
41 | - for_each_domain(cpu, tmp) | ||
42 | + for_each_domain(cpu, tmp) { | ||
43 | if (tmp->flags & flag) | ||
44 | sd = tmp; | ||
45 | + } | ||
46 | |||
47 | while (sd) { | ||
48 | cpumask_t span; | ||
49 | @@ -1790,7 +1791,7 @@ static void double_rq_lock(runqueue_t *r | ||
50 | spin_lock(&rq1->lock); | ||
51 | __acquire(rq2->lock); /* Fake it out ;) */ | ||
52 | } else { | ||
53 | - if (rq1->cpu < rq2->cpu) { | ||
54 | + if (rq1 < rq2) { | ||
55 | spin_lock(&rq1->lock); | ||
56 | spin_lock(&rq2->lock); | ||
57 | } else { | ||
58 | @@ -1826,7 +1827,7 @@ static void double_lock_balance(runqueue | ||
59 | __acquires(this_rq->lock) | ||
60 | { | ||
61 | if (unlikely(!spin_trylock(&busiest->lock))) { | ||
62 | - if (busiest->cpu < this_rq->cpu) { | ||
63 | + if (busiest < this_rq) { | ||
64 | spin_unlock(&this_rq->lock); | ||
65 | spin_lock(&busiest->lock); | ||
66 | spin_lock(&this_rq->lock); | ||
67 | @@ -2521,10 +2522,11 @@ static void active_load_balance(runqueue | ||
68 | double_lock_balance(busiest_rq, target_rq); | ||
69 | |||
70 | /* Search for an sd spanning us and the target CPU. */ | ||
71 | - for_each_domain(target_cpu, sd) | ||
72 | + for_each_domain(target_cpu, sd) { | ||
73 | if ((sd->flags & SD_LOAD_BALANCE) && | ||
74 | cpu_isset(busiest_cpu, sd->span)) | ||
75 | break; | ||
76 | + } | ||
77 | |||
78 | if (unlikely(sd == NULL)) | ||
79 | goto out; | ||
80 | @@ -2861,48 +2863,35 @@ static inline void wakeup_busy_runqueue( | ||
81 | resched_task(rq->idle); | ||
82 | } | ||
83 | |||
84 | -static void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq) | ||
85 | +/* | ||
86 | + * Called with interrupt disabled and this_rq's runqueue locked. | ||
87 | + */ | ||
88 | +static void wake_sleeping_dependent(int this_cpu) | ||
89 | { | ||
90 | struct sched_domain *tmp, *sd = NULL; | ||
91 | - cpumask_t sibling_map; | ||
92 | int i; | ||
93 | |||
94 | - for_each_domain(this_cpu, tmp) | ||
95 | - if (tmp->flags & SD_SHARE_CPUPOWER) | ||
96 | + for_each_domain(this_cpu, tmp) { | ||
97 | + if (tmp->flags & SD_SHARE_CPUPOWER) { | ||
98 | sd = tmp; | ||
99 | + break; | ||
100 | + } | ||
101 | + } | ||
102 | |||
103 | if (!sd) | ||
104 | return; | ||
105 | |||
106 | - /* | ||
107 | - * Unlock the current runqueue because we have to lock in | ||
108 | - * CPU order to avoid deadlocks. Caller knows that we might | ||
109 | - * unlock. We keep IRQs disabled. | ||
110 | - */ | ||
111 | - spin_unlock(&this_rq->lock); | ||
112 | - | ||
113 | - sibling_map = sd->span; | ||
114 | - | ||
115 | - for_each_cpu_mask(i, sibling_map) | ||
116 | - spin_lock(&cpu_rq(i)->lock); | ||
117 | - /* | ||
118 | - * We clear this CPU from the mask. This both simplifies the | ||
119 | - * inner loop and keps this_rq locked when we exit: | ||
120 | - */ | ||
121 | - cpu_clear(this_cpu, sibling_map); | ||
122 | - | ||
123 | - for_each_cpu_mask(i, sibling_map) { | ||
124 | + for_each_cpu_mask(i, sd->span) { | ||
125 | runqueue_t *smt_rq = cpu_rq(i); | ||
126 | |||
127 | + if (i == this_cpu) | ||
128 | + continue; | ||
129 | + if (unlikely(!spin_trylock(&smt_rq->lock))) | ||
130 | + continue; | ||
131 | + | ||
132 | wakeup_busy_runqueue(smt_rq); | ||
133 | + spin_unlock(&smt_rq->lock); | ||
134 | } | ||
135 | - | ||
136 | - for_each_cpu_mask(i, sibling_map) | ||
137 | - spin_unlock(&cpu_rq(i)->lock); | ||
138 | - /* | ||
139 | - * We exit with this_cpu's rq still held and IRQs | ||
140 | - * still disabled: | ||
141 | - */ | ||
142 | } | ||
143 | |||
144 | /* | ||
145 | @@ -2915,52 +2904,46 @@ static inline unsigned long smt_slice(ta | ||
146 | return p->time_slice * (100 - sd->per_cpu_gain) / 100; | ||
147 | } | ||
148 | |||
149 | -static int dependent_sleeper(int this_cpu, runqueue_t *this_rq) | ||
150 | +/* | ||
151 | + * To minimise lock contention and not have to drop this_rq's runlock we only | ||
152 | + * trylock the sibling runqueues and bypass those runqueues if we fail to | ||
153 | + * acquire their lock. As we only trylock the normal locking order does not | ||
154 | + * need to be obeyed. | ||
155 | + */ | ||
156 | +static int dependent_sleeper(int this_cpu, runqueue_t *this_rq, task_t *p) | ||
157 | { | ||
158 | struct sched_domain *tmp, *sd = NULL; | ||
159 | - cpumask_t sibling_map; | ||
160 | - prio_array_t *array; | ||
161 | int ret = 0, i; | ||
162 | - task_t *p; | ||
163 | |||
164 | - for_each_domain(this_cpu, tmp) | ||
165 | - if (tmp->flags & SD_SHARE_CPUPOWER) | ||
166 | + /* kernel/rt threads do not participate in dependent sleeping */ | ||
167 | + if (!p->mm || rt_task(p)) | ||
168 | + return 0; | ||
169 | + | ||
170 | + for_each_domain(this_cpu, tmp) { | ||
171 | + if (tmp->flags & SD_SHARE_CPUPOWER) { | ||
172 | sd = tmp; | ||
173 | + break; | ||
174 | + } | ||
175 | + } | ||
176 | |||
177 | if (!sd) | ||
178 | return 0; | ||
179 | |||
180 | - /* | ||
181 | - * The same locking rules and details apply as for | ||
182 | - * wake_sleeping_dependent(): | ||
183 | - */ | ||
184 | - spin_unlock(&this_rq->lock); | ||
185 | - sibling_map = sd->span; | ||
186 | - for_each_cpu_mask(i, sibling_map) | ||
187 | - spin_lock(&cpu_rq(i)->lock); | ||
188 | - cpu_clear(this_cpu, sibling_map); | ||
189 | + for_each_cpu_mask(i, sd->span) { | ||
190 | + runqueue_t *smt_rq; | ||
191 | + task_t *smt_curr; | ||
192 | |||
193 | - /* | ||
194 | - * Establish next task to be run - it might have gone away because | ||
195 | - * we released the runqueue lock above: | ||
196 | - */ | ||
197 | - if (!this_rq->nr_running) | ||
198 | - goto out_unlock; | ||
199 | - array = this_rq->active; | ||
200 | - if (!array->nr_active) | ||
201 | - array = this_rq->expired; | ||
202 | - BUG_ON(!array->nr_active); | ||
203 | + if (i == this_cpu) | ||
204 | + continue; | ||
205 | |||
206 | - p = list_entry(array->queue[sched_find_first_bit(array->bitmap)].next, | ||
207 | - task_t, run_list); | ||
208 | + smt_rq = cpu_rq(i); | ||
209 | + if (unlikely(!spin_trylock(&smt_rq->lock))) | ||
210 | + continue; | ||
211 | |||
212 | - for_each_cpu_mask(i, sibling_map) { | ||
213 | - runqueue_t *smt_rq = cpu_rq(i); | ||
214 | - task_t *smt_curr = smt_rq->curr; | ||
215 | + smt_curr = smt_rq->curr; | ||
216 | |||
217 | - /* Kernel threads do not participate in dependent sleeping */ | ||
218 | - if (!p->mm || !smt_curr->mm || rt_task(p)) | ||
219 | - goto check_smt_task; | ||
220 | + if (!smt_curr->mm) | ||
221 | + goto unlock; | ||
222 | |||
223 | /* | ||
224 | * If a user task with lower static priority than the | ||
225 | @@ -2984,43 +2967,17 @@ static int dependent_sleeper(int this_cp | ||
226 | smt_slice(smt_curr, sd) > task_timeslice(p)) | ||
227 | ret = 1; | ||
228 | |||
229 | -check_smt_task: | ||
230 | - if ((!smt_curr->mm && smt_curr != smt_rq->idle) || | ||
231 | - rt_task(smt_curr)) | ||
232 | - continue; | ||
233 | - if (!p->mm) { | ||
234 | - wakeup_busy_runqueue(smt_rq); | ||
235 | - continue; | ||
236 | - } | ||
237 | - | ||
238 | - /* | ||
239 | - * Reschedule a lower priority task on the SMT sibling for | ||
240 | - * it to be put to sleep, or wake it up if it has been put to | ||
241 | - * sleep for priority reasons to see if it should run now. | ||
242 | - */ | ||
243 | - if (rt_task(p)) { | ||
244 | - if ((jiffies % DEF_TIMESLICE) > | ||
245 | - (sd->per_cpu_gain * DEF_TIMESLICE / 100)) | ||
246 | - resched_task(smt_curr); | ||
247 | - } else { | ||
248 | - if (TASK_PREEMPTS_CURR(p, smt_rq) && | ||
249 | - smt_slice(p, sd) > task_timeslice(smt_curr)) | ||
250 | - resched_task(smt_curr); | ||
251 | - else | ||
252 | - wakeup_busy_runqueue(smt_rq); | ||
253 | - } | ||
254 | +unlock: | ||
255 | + spin_unlock(&smt_rq->lock); | ||
256 | } | ||
257 | -out_unlock: | ||
258 | - for_each_cpu_mask(i, sibling_map) | ||
259 | - spin_unlock(&cpu_rq(i)->lock); | ||
260 | return ret; | ||
261 | } | ||
262 | #else | ||
263 | -static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq) | ||
264 | +static inline void wake_sleeping_dependent(int this_cpu) | ||
265 | { | ||
266 | } | ||
267 | |||
268 | -static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq) | ||
269 | +static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq, task_t *p) | ||
270 | { | ||
271 | return 0; | ||
272 | } | ||
273 | @@ -3142,32 +3099,13 @@ need_resched_nonpreemptible: | ||
274 | |||
275 | cpu = smp_processor_id(); | ||
276 | if (unlikely(!rq->nr_running)) { | ||
277 | -go_idle: | ||
278 | idle_balance(cpu, rq); | ||
279 | if (!rq->nr_running) { | ||
280 | next = rq->idle; | ||
281 | rq->expired_timestamp = 0; | ||
282 | - wake_sleeping_dependent(cpu, rq); | ||
283 | - /* | ||
284 | - * wake_sleeping_dependent() might have released | ||
285 | - * the runqueue, so break out if we got new | ||
286 | - * tasks meanwhile: | ||
287 | - */ | ||
288 | - if (!rq->nr_running) | ||
289 | - goto switch_tasks; | ||
290 | - } | ||
291 | - } else { | ||
292 | - if (dependent_sleeper(cpu, rq)) { | ||
293 | - next = rq->idle; | ||
294 | + wake_sleeping_dependent(cpu); | ||
295 | goto switch_tasks; | ||
296 | } | ||
297 | - /* | ||
298 | - * dependent_sleeper() releases and reacquires the runqueue | ||
299 | - * lock, hence go into the idle loop if the rq went | ||
300 | - * empty meanwhile: | ||
301 | - */ | ||
302 | - if (unlikely(!rq->nr_running)) | ||
303 | - goto go_idle; | ||
304 | } | ||
305 | |||
306 | array = rq->active; | ||
307 | @@ -3205,6 +3143,8 @@ go_idle: | ||
308 | } | ||
309 | } | ||
310 | next->sleep_type = SLEEP_NORMAL; | ||
311 | + if (dependent_sleeper(cpu, rq, next)) | ||
312 | + next = rq->idle; | ||
313 | switch_tasks: | ||
314 | if (next == rq->idle) | ||
315 | schedstat_inc(rq, sched_goidle); | ||
316 | @@ -6306,7 +6246,6 @@ void __init sched_init(void) | ||
317 | rq->push_cpu = 0; | ||
318 | rq->migration_thread = NULL; | ||
319 | INIT_LIST_HEAD(&rq->migration_queue); | ||
320 | - rq->cpu = i; | ||
321 | #endif | ||
322 | atomic_set(&rq->nr_iowait, 0); | ||
323 | |||
324 | @@ -6368,7 +6307,7 @@ void normalize_rt_tasks(void) | ||
325 | runqueue_t *rq; | ||
326 | |||
327 | read_lock_irq(&tasklist_lock); | ||
328 | - for_each_process (p) { | ||
329 | + for_each_process(p) { | ||
330 | if (!rt_task(p)) | ||
331 | continue; | ||
332 |