Magellan Linux

Contents of /trunk/kernel26-alx/patches-2.6.17-r6/0002-2.6.17-sched-revise_smt_nice_locking.patch



Revision 199
Fri May 18 11:04:36 2007 UTC by niro
File size: 9226 bytes
-import

Initial report and lock contention fix from Chris Mason:

Recent benchmarks showed some performance regressions between 2.6.16 and
2.6.5. We tracked down one of the regressions to lock contention in
schedule-heavy workloads (~70,000 context switches per second).

kernel/sched.c:dependent_sleeper() was responsible for most of the lock
contention, hammering on the run queue locks. The patch below is more of
a discussion point than a suggested fix (although it does reduce lock
contention significantly). The dependent_sleeper code looks very expensive
to me, especially for using a spinlock to bounce control between two
different siblings of the same CPU.

It is further optimized:

* perform dependent_sleeper check after next task is determined
* convert wake_sleeping_dependent to use trylock
* skip smt runqueue check if trylock fails
* optimize double_rq_lock now that smt nice is converted to trylock
* early exit in searching first SD_SHARE_CPUPOWER domain
* speedup fast path of dependent_sleeper


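The heart of the change is the trylock-and-skip pattern in the second and third bullets: instead of dropping this_rq's lock and taking every sibling runqueue lock in CPU order, the reworked code opportunistically trylocks each sibling and simply skips it when the lock is contended. Below is a minimal user-space sketch of that pattern, with pthread mutexes standing in for runqueue spinlocks; the struct and function names are invented for illustration and are not the kernel's.

```c
#include <pthread.h>
#include <stdio.h>

#define NR_SIBLINGS 2

/* Stand-in for a per-CPU runqueue: just a lock and a flag. */
struct fake_rq {
	pthread_mutex_t lock;
	int has_sleeping_task;
};

static struct fake_rq rq[NR_SIBLINGS] = {
	{ PTHREAD_MUTEX_INITIALIZER, 1 },
	{ PTHREAD_MUTEX_INITIALIZER, 0 },
};

/*
 * Analogue of the reworked wake_sleeping_dependent(): the caller already
 * holds its own runqueue lock, so we never block on a sibling's lock.
 * We trylock it and skip the sibling entirely if someone else holds it.
 */
static void wake_sleeping_siblings(int this_cpu)
{
	for (int i = 0; i < NR_SIBLINGS; i++) {
		if (i == this_cpu)
			continue;
		if (pthread_mutex_trylock(&rq[i].lock) != 0)
			continue;	/* contended: skip rather than wait */

		if (rq[i].has_sleeping_task) {
			rq[i].has_sleeping_task = 0;
			printf("woke sibling %d\n", i);
		}
		pthread_mutex_unlock(&rq[i].lock);
	}
}

int main(void)
{
	/* Pretend we are CPU 1, already holding our own runqueue lock. */
	pthread_mutex_lock(&rq[1].lock);
	wake_sleeping_siblings(1);
	pthread_mutex_unlock(&rq[1].lock);
	return 0;
}
```

Because the sibling locks are only ever trylocked, the strict CPU-order acquisition the old code needed, and the unlock/relock dance in the scheduler fast path that came with it, can be removed, which is what the wake_sleeping_dependent(), dependent_sleeper() and __sched hunks below do.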
Signed-off-by: Ken Chen <kenneth.w.chen@intel.com>
Signed-off-by: Con Kolivas <kernel@kolivas.org>
Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Chris Mason <mason@suse.com>

---
 kernel/sched.c | 175 ++++++++++++++++++---------------------------------------
 1 files changed, 57 insertions(+), 118 deletions(-)

Index: linux-ck-dev/kernel/sched.c
===================================================================
--- linux-ck-dev.orig/kernel/sched.c 2006-06-18 15:21:31.000000000 +1000
+++ linux-ck-dev/kernel/sched.c 2006-06-18 15:21:45.000000000 +1000
@@ -1157,9 +1157,10 @@ static int sched_balance_self(int cpu, i
 struct task_struct *t = current;
 struct sched_domain *tmp, *sd = NULL;

- for_each_domain(cpu, tmp)
+ for_each_domain(cpu, tmp) {
 if (tmp->flags & flag)
 sd = tmp;
+ }

 while (sd) {
 cpumask_t span;
@@ -1790,7 +1791,7 @@ static void double_rq_lock(runqueue_t *r
 spin_lock(&rq1->lock);
 __acquire(rq2->lock); /* Fake it out ;) */
 } else {
- if (rq1->cpu < rq2->cpu) {
+ if (rq1 < rq2) {
 spin_lock(&rq1->lock);
 spin_lock(&rq2->lock);
 } else {
@@ -1826,7 +1827,7 @@ static void double_lock_balance(runqueue
 __acquires(this_rq->lock)
 {
 if (unlikely(!spin_trylock(&busiest->lock))) {
- if (busiest->cpu < this_rq->cpu) {
+ if (busiest < this_rq) {
 spin_unlock(&this_rq->lock);
 spin_lock(&busiest->lock);
 spin_lock(&this_rq->lock);
@@ -2521,10 +2522,11 @@ static void active_load_balance(runqueue
 double_lock_balance(busiest_rq, target_rq);

 /* Search for an sd spanning us and the target CPU. */
- for_each_domain(target_cpu, sd)
+ for_each_domain(target_cpu, sd) {
 if ((sd->flags & SD_LOAD_BALANCE) &&
 cpu_isset(busiest_cpu, sd->span))
 break;
+ }

 if (unlikely(sd == NULL))
 goto out;
@@ -2861,48 +2863,35 @@ static inline void wakeup_busy_runqueue(
 resched_task(rq->idle);
 }

-static void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq)
+/*
+ * Called with interrupt disabled and this_rq's runqueue locked.
+ */
+static void wake_sleeping_dependent(int this_cpu)
 {
 struct sched_domain *tmp, *sd = NULL;
- cpumask_t sibling_map;
 int i;

- for_each_domain(this_cpu, tmp)
- if (tmp->flags & SD_SHARE_CPUPOWER)
+ for_each_domain(this_cpu, tmp) {
+ if (tmp->flags & SD_SHARE_CPUPOWER) {
 sd = tmp;
+ break;
+ }
+ }

 if (!sd)
 return;

- /*
- * Unlock the current runqueue because we have to lock in
- * CPU order to avoid deadlocks. Caller knows that we might
- * unlock. We keep IRQs disabled.
- */
- spin_unlock(&this_rq->lock);
-
- sibling_map = sd->span;
-
- for_each_cpu_mask(i, sibling_map)
- spin_lock(&cpu_rq(i)->lock);
- /*
- * We clear this CPU from the mask. This both simplifies the
- * inner loop and keps this_rq locked when we exit:
- */
- cpu_clear(this_cpu, sibling_map);
-
- for_each_cpu_mask(i, sibling_map) {
+ for_each_cpu_mask(i, sd->span) {
 runqueue_t *smt_rq = cpu_rq(i);

+ if (i == this_cpu)
+ continue;
+ if (unlikely(!spin_trylock(&smt_rq->lock)))
+ continue;
+
 wakeup_busy_runqueue(smt_rq);
+ spin_unlock(&smt_rq->lock);
 }
-
- for_each_cpu_mask(i, sibling_map)
- spin_unlock(&cpu_rq(i)->lock);
- /*
- * We exit with this_cpu's rq still held and IRQs
- * still disabled:
- */
 }

 /*
@@ -2915,52 +2904,46 @@ static inline unsigned long smt_slice(ta
 return p->time_slice * (100 - sd->per_cpu_gain) / 100;
 }

-static int dependent_sleeper(int this_cpu, runqueue_t *this_rq)
+/*
+ * To minimise lock contention and not have to drop this_rq's runlock we only
+ * trylock the sibling runqueues and bypass those runqueues if we fail to
+ * acquire their lock. As we only trylock the normal locking order does not
+ * need to be obeyed.
+ */
+static int dependent_sleeper(int this_cpu, runqueue_t *this_rq, task_t *p)
 {
 struct sched_domain *tmp, *sd = NULL;
- cpumask_t sibling_map;
- prio_array_t *array;
 int ret = 0, i;
- task_t *p;

- for_each_domain(this_cpu, tmp)
- if (tmp->flags & SD_SHARE_CPUPOWER)
+ /* kernel/rt threads do not participate in dependent sleeping */
+ if (!p->mm || rt_task(p))
+ return 0;
+
+ for_each_domain(this_cpu, tmp) {
+ if (tmp->flags & SD_SHARE_CPUPOWER) {
 sd = tmp;
+ break;
+ }
+ }

 if (!sd)
 return 0;

- /*
- * The same locking rules and details apply as for
- * wake_sleeping_dependent():
- */
- spin_unlock(&this_rq->lock);
- sibling_map = sd->span;
- for_each_cpu_mask(i, sibling_map)
- spin_lock(&cpu_rq(i)->lock);
- cpu_clear(this_cpu, sibling_map);
+ for_each_cpu_mask(i, sd->span) {
+ runqueue_t *smt_rq;
+ task_t *smt_curr;

- /*
- * Establish next task to be run - it might have gone away because
- * we released the runqueue lock above:
- */
- if (!this_rq->nr_running)
- goto out_unlock;
- array = this_rq->active;
- if (!array->nr_active)
- array = this_rq->expired;
- BUG_ON(!array->nr_active);
+ if (i == this_cpu)
+ continue;

- p = list_entry(array->queue[sched_find_first_bit(array->bitmap)].next,
- task_t, run_list);
+ smt_rq = cpu_rq(i);
+ if (unlikely(!spin_trylock(&smt_rq->lock)))
+ continue;

- for_each_cpu_mask(i, sibling_map) {
- runqueue_t *smt_rq = cpu_rq(i);
- task_t *smt_curr = smt_rq->curr;
+ smt_curr = smt_rq->curr;

- /* Kernel threads do not participate in dependent sleeping */
- if (!p->mm || !smt_curr->mm || rt_task(p))
- goto check_smt_task;
+ if (!smt_curr->mm)
+ goto unlock;

 /*
 * If a user task with lower static priority than the
@@ -2984,43 +2967,17 @@ static int dependent_sleeper(int this_cp
 smt_slice(smt_curr, sd) > task_timeslice(p))
 ret = 1;

-check_smt_task:
- if ((!smt_curr->mm && smt_curr != smt_rq->idle) ||
- rt_task(smt_curr))
- continue;
- if (!p->mm) {
- wakeup_busy_runqueue(smt_rq);
- continue;
- }
-
- /*
- * Reschedule a lower priority task on the SMT sibling for
- * it to be put to sleep, or wake it up if it has been put to
- * sleep for priority reasons to see if it should run now.
- */
- if (rt_task(p)) {
- if ((jiffies % DEF_TIMESLICE) >
- (sd->per_cpu_gain * DEF_TIMESLICE / 100))
- resched_task(smt_curr);
- } else {
- if (TASK_PREEMPTS_CURR(p, smt_rq) &&
- smt_slice(p, sd) > task_timeslice(smt_curr))
- resched_task(smt_curr);
- else
- wakeup_busy_runqueue(smt_rq);
- }
+unlock:
+ spin_unlock(&smt_rq->lock);
 }
-out_unlock:
- for_each_cpu_mask(i, sibling_map)
- spin_unlock(&cpu_rq(i)->lock);
 return ret;
 }
 #else
-static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq)
+static inline void wake_sleeping_dependent(int this_cpu)
 {
 }

-static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq)
+static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq, task_t *p)
 {
 return 0;
 }
@@ -3142,32 +3099,13 @@ need_resched_nonpreemptible:

 cpu = smp_processor_id();
 if (unlikely(!rq->nr_running)) {
-go_idle:
 idle_balance(cpu, rq);
 if (!rq->nr_running) {
 next = rq->idle;
 rq->expired_timestamp = 0;
- wake_sleeping_dependent(cpu, rq);
- /*
- * wake_sleeping_dependent() might have released
- * the runqueue, so break out if we got new
- * tasks meanwhile:
- */
- if (!rq->nr_running)
- goto switch_tasks;
- }
- } else {
- if (dependent_sleeper(cpu, rq)) {
- next = rq->idle;
+ wake_sleeping_dependent(cpu);
 goto switch_tasks;
 }
- /*
- * dependent_sleeper() releases and reacquires the runqueue
- * lock, hence go into the idle loop if the rq went
- * empty meanwhile:
- */
- if (unlikely(!rq->nr_running))
- goto go_idle;
 }

 array = rq->active;
@@ -3205,6 +3143,8 @@ go_idle:
 }
 }
 next->sleep_type = SLEEP_NORMAL;
+ if (dependent_sleeper(cpu, rq, next))
+ next = rq->idle;
 switch_tasks:
 if (next == rq->idle)
 schedstat_inc(rq, sched_goidle);
@@ -6306,7 +6246,6 @@ void __init sched_init(void)
 rq->push_cpu = 0;
 rq->migration_thread = NULL;
 INIT_LIST_HEAD(&rq->migration_queue);
- rq->cpu = i;
 #endif
 atomic_set(&rq->nr_iowait, 0);

@@ -6368,7 +6307,7 @@ void normalize_rt_tasks(void)
 runqueue_t *rq;

 read_lock_irq(&tasklist_lock);
- for_each_process (p) {
+ for_each_process(p) {
 if (!rt_task(p))
 continue;

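For reference, the double_rq_lock()/double_lock_balance() hunks above replace the rq->cpu comparison with a comparison of the runqueue pointers themselves, which is why the sched_init() hunk can drop the rq->cpu field. A minimal user-space sketch of that address-ordered locking discipline follows; pthread mutexes and invented names stand in for the kernel's runqueue and spinlock types.

```c
#include <pthread.h>

struct fake_rq {
	pthread_mutex_t lock;
};

/*
 * Take both runqueue locks without deadlocking: every caller acquires
 * them in the same global order, here simply ascending address, so two
 * CPUs locking the same pair can never cross each other.
 */
static void double_rq_lock_sketch(struct fake_rq *rq1, struct fake_rq *rq2)
{
	if (rq1 == rq2) {
		pthread_mutex_lock(&rq1->lock);	/* same queue: one lock suffices */
	} else if (rq1 < rq2) {
		pthread_mutex_lock(&rq1->lock);
		pthread_mutex_lock(&rq2->lock);
	} else {
		pthread_mutex_lock(&rq2->lock);
		pthread_mutex_lock(&rq1->lock);
	}
}

static void double_rq_unlock_sketch(struct fake_rq *rq1, struct fake_rq *rq2)
{
	pthread_mutex_unlock(&rq1->lock);
	if (rq1 != rq2)
		pthread_mutex_unlock(&rq2->lock);	/* unlock order does not matter */
}

int main(void)
{
	static struct fake_rq a = { PTHREAD_MUTEX_INITIALIZER };
	static struct fake_rq b = { PTHREAD_MUTEX_INITIALIZER };

	double_rq_lock_sketch(&a, &b);
	double_rq_unlock_sketch(&a, &b);
	return 0;
}
```

Any consistent total order avoids the deadlock; ordering by address lets the scheduler drop the per-runqueue cpu number that existed only to establish that order.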