Magellan Linux

Contents of /trunk/kernel26-magellan/patches-2.6.21-r13/0004-2.6.21-sched-iso-5.4.patch

Parent Directory | Revision Log


Revision 319 - (show annotations) (download)
Sun Aug 19 18:14:21 2007 UTC (17 years, 2 months ago) by niro
File size: 11954 byte(s)
-2.6.21-magellan-r13

1 Add the SCHED_ISO policy (isochronous) which is a starvation free soft
2 realtime policy available to unprivileged users. The amount of cpu that
3 SCHED_ISO tasks will run as realtime is configurable by the tunable in
4
5 /proc/sys/kernel/iso_cpu
6
7 and is set to 80% by default.
8
9 The duration over which its cpu usage is averaged is controlled by the
10 tunable
11
12 /proc/sys/kernel/iso_period
13
14 and is set to 5 (seconds) by default.
15
16 Signed-off-by: Con Kolivas <kernel@kolivas.org>
17
18 Documentation/sysctl/kernel.txt | 21 +++++++
19 include/linux/sched.h | 8 ++
20 kernel/sched.c | 115 +++++++++++++++++++++++++++++++++++++---
21 kernel/sysctl.c | 24 ++++++++
22 4 files changed, 160 insertions(+), 8 deletions(-)
23
24 Index: linux-2.6.21-ck2/include/linux/sched.h
25 ===================================================================
26 --- linux-2.6.21-ck2.orig/include/linux/sched.h 2007-05-14 19:30:30.000000000 +1000
27 +++ linux-2.6.21-ck2/include/linux/sched.h 2007-05-14 19:30:31.000000000 +1000
28 @@ -34,10 +34,11 @@
29 #define SCHED_FIFO 1
30 #define SCHED_RR 2
31 #define SCHED_BATCH 3
32 +#define SCHED_ISO 4
33
34 #ifdef __KERNEL__
35
36 -#define SCHED_MAX SCHED_BATCH
37 +#define SCHED_MAX SCHED_ISO
38 #define SCHED_RANGE(policy) ((policy) <= SCHED_MAX)
39
40 struct sched_param {
41 @@ -525,15 +526,17 @@ struct signal_struct {
42 #define MAX_USER_RT_PRIO 100
43 #define MAX_RT_PRIO MAX_USER_RT_PRIO
44 #define PRIO_RANGE (40)
45 +#define ISO_PRIO (MAX_RT_PRIO - 1)
46
47 #define MAX_PRIO (MAX_RT_PRIO + PRIO_RANGE)
48
49 -#define rt_prio(prio) unlikely((prio) < MAX_RT_PRIO)
50 +#define rt_prio(prio) unlikely((prio) < ISO_PRIO)
51 #define rt_task(p) rt_prio((p)->prio)
52 #define batch_task(p) (unlikely((p)->policy == SCHED_BATCH))
53 #define is_rt_policy(policy) ((policy) == SCHED_FIFO || \
54 (policy) == SCHED_RR)
55 #define has_rt_policy(p) unlikely(is_rt_policy((p)->policy))
56 +#define iso_task(p) unlikely((p)->policy == SCHED_ISO)
57
58 /*
59 * Some day this will be a full-fledged user tracking system..
60 @@ -1166,6 +1169,7 @@ static inline void put_task_struct(struc
61 #define PF_SWAPWRITE 0x00800000 /* Allowed to write to swap */
62 #define PF_SPREAD_PAGE 0x01000000 /* Spread page cache over cpuset */
63 #define PF_SPREAD_SLAB 0x02000000 /* Spread some slab caches over cpuset */
64 +#define PF_ISOREF 0x04000000 /* SCHED_ISO task has used up quota */
65 #define PF_MEMPOLICY 0x10000000 /* Non-default NUMA mempolicy */
66 #define PF_MUTEX_TESTER 0x20000000 /* Thread belongs to the rt mutex tester */
67
68 Index: linux-2.6.21-ck2/kernel/sched.c
69 ===================================================================
70 --- linux-2.6.21-ck2.orig/kernel/sched.c 2007-05-14 19:30:30.000000000 +1000
71 +++ linux-2.6.21-ck2/kernel/sched.c 2007-05-14 19:30:31.000000000 +1000
72 @@ -104,6 +104,18 @@ int rr_interval __read_mostly = 8;
73 int sched_interactive __read_mostly = 1;
74
75 /*
76 + * sched_iso_cpu - sysctl which determines the cpu percentage SCHED_ISO tasks
77 + * are allowed to run (over sched_iso_period seconds) as real time tasks.
78 + * sched_iso_period - sysctl which determines the number of seconds over
79 + * which cpu usage of SCHED_ISO tasks is averaged to determine if they are
80 + * exceeding their allowable bandwidth.
81 +*/
82 +int sched_iso_cpu __read_mostly = 80;
83 +int sched_iso_period __read_mostly = 5;
84 +
85 +#define ISO_PERIOD ((sched_iso_period * HZ) + 1)
86 +
87 +/*
88 * This contains a bitmap for each dynamic priority level with empty slots
89 * for the valid priorities each different nice level can have. It allows
90 * us to stagger the slots where differing priorities run in a way that
91 @@ -200,6 +212,8 @@ struct rq {
92
93 /* How many times we have rotated the priority queue */
94 unsigned long prio_rotation;
95 + unsigned long iso_ticks;
96 + unsigned short iso_refractory;
97
98 atomic_t nr_iowait;
99
100 @@ -790,6 +804,11 @@ static inline void update_if_moved(struc
101 }
102 #endif
103
104 +static inline int isoprio_suitable(struct task_struct *p)
105 +{
106 + return !(p->flags & PF_ISOREF);
107 +}
108 +
109 /*
110 * recalc_task_prio determines what priority a non rt_task will be
111 * queued at. If the task has already been running during this runqueue's
112 @@ -806,6 +825,25 @@ static void recalc_task_prio(struct task
113 struct prio_array *array = rq->active;
114 int queue_prio;
115
116 + if (iso_task(p)) {
117 + if (isoprio_suitable(p)) {
118 + /*
119 + * If SCHED_ISO tasks have not used up their real time
120 + * quota they run at a priority just above the highest
121 + * SCHED_NORMAL priority. Otherwise they run as
122 + * SCHED_NORMAL.
123 + */
124 + p->prio = p->normal_prio = ISO_PRIO;
125 + p->array = rq->active;
126 + if (p->time_slice <= 0)
127 + p->time_slice = p->quota;
128 + return;
129 + } else if (p->prio == ISO_PRIO) {
130 + /* Just about to be demoted to SCHED_NORMAL */
131 + p->time_slice = 0;
132 + }
133 + }
134 +
135 update_if_moved(p, rq);
136 if (p->rotation == rq->prio_rotation) {
137 if (p->array == array) {
138 @@ -3180,18 +3218,65 @@ static void task_expired_entitlement(str
139 p->time_slice += overrun;
140 }
141
142 +/*
143 + * Test if SCHED_ISO tasks have run longer than their allotted period as RT
144 + * tasks and set the refractory flag if necessary. There is 10% hysteresis
145 + * for unsetting the flag.
146 + */
147 +static unsigned int test_ret_isorefractory(struct rq *rq)
148 +{
149 + if (likely(!rq->iso_refractory)) {
150 + if (rq->iso_ticks / ISO_PERIOD > sched_iso_cpu)
151 + rq->iso_refractory = 1;
152 + } else {
153 + if (rq->iso_ticks / ISO_PERIOD < (sched_iso_cpu * 90 / 100))
154 + rq->iso_refractory = 0;
155 + }
156 + return rq->iso_refractory;
157 +}
158 +
159 +/* No SCHED_ISO task was running so decrease rq->iso_ticks */
160 +static inline void no_iso_tick(struct rq *rq)
161 +{
162 + rq->iso_ticks = rq->iso_ticks * (ISO_PERIOD - 1) / ISO_PERIOD;
163 +}
164 +
165 /* This manages tasks that have run out of timeslice during a scheduler_tick */
166 static void task_running_tick(struct rq *rq, struct task_struct *p)
167 {
168 + /*
169 + * If a SCHED_ISO task is running we increment the iso_ticks. In
170 + * order to prevent SCHED_ISO tasks from causing starvation in the
171 + * presence of true RT tasks we account those as iso_ticks as well.
172 + */
173 + if ((rt_task(p) || (iso_task(p) && !rq->iso_refractory))) {
174 + if (rq->iso_ticks <= (ISO_PERIOD * 100) - 100)
175 + rq->iso_ticks += 100;
176 + } else
177 + no_iso_tick(rq);
178 +
179 + if (iso_task(p)) {
180 + if (unlikely(test_ret_isorefractory(rq))) {
181 + if (isoprio_suitable(p)) {
182 + /*
183 + * SCHED_ISO task is running as RT and limit
184 + * has been hit. Set the PF_ISOREF flag and
185 + * force it to reschedule as SCHED_NORMAL
186 + * by zeroing its time_slice
187 + */
188 + p->flags |= PF_ISOREF;
189 + p->time_slice = 0;
190 + }
191 + } else
192 + p->flags &= ~PF_ISOREF;
193 + }
194 /* SCHED_FIFO tasks never run out of timeslice. */
195 if (p->time_slice > 0 || p->policy == SCHED_FIFO)
196 return;
197 /* p->time_slice <= 0 */
198 - spin_lock(&rq->lock);
199 + set_tsk_need_resched(p);
200 if (likely(task_queued(p)))
201 task_expired_entitlement(rq, p);
202 - set_tsk_need_resched(p);
203 - spin_unlock(&rq->lock);
204 }
205
206 /*
207 @@ -3207,8 +3292,12 @@ void scheduler_tick(void)
208
209 update_cpu_clock(p, rq, now, 1);
210
211 + spin_lock(&rq->lock);
212 if (p != rq->idle)
213 task_running_tick(rq, p);
214 + else
215 + no_iso_tick(rq);
216 + spin_unlock(&rq->lock);
217 #ifdef CONFIG_SMP
218 update_load(rq);
219 if (time_after_eq(jiffies, rq->next_balance))
220 @@ -3285,7 +3374,8 @@ retry:
221 }
222 queue = array->queue + idx;
223 next = list_entry(queue->next, struct task_struct, run_list);
224 - if (unlikely(next->time_slice <= 0)) {
225 + if (unlikely(next->time_slice <= 0 && !(iso_task(next) &&
226 + isoprio_suitable(next)))) {
227 /*
228 * Unlucky enough that this task ran out of time_slice
229 * before it hit a scheduler_tick so it should have its
230 @@ -3377,7 +3467,7 @@ need_resched_nonpreemptible:
231 }
232
233 idx = sched_find_first_bit(rq->dyn_bitmap);
234 - if (!rt_prio(idx))
235 + if (likely(idx > ISO_PRIO))
236 next = next_dynamic_task(rq, idx);
237 else {
238 queue = rq->active->queue + idx;
239 @@ -4042,12 +4132,22 @@ static void __setscheduler(struct task_s
240 int sched_setscheduler(struct task_struct *p, int policy,
241 struct sched_param *param)
242 {
243 + struct sched_param zero_param = { .sched_priority = 0 };
244 int queued, retval, oldprio, oldpolicy = -1;
245 unsigned long flags;
246 struct rq *rq;
247
248 /* may grab non-irq protected spin_locks */
249 BUG_ON(in_interrupt());
250 + if (is_rt_policy(policy) && !capable(CAP_SYS_NICE)) {
251 + /*
252 + * If the caller requested an RT policy without having the
253 + * necessary rights, we downgrade the policy to SCHED_ISO.
254 + * We also set the parameter to zero to pass the checks.
255 + */
256 + policy = SCHED_ISO;
257 + param = &zero_param;
258 + }
259 recheck:
260 /* double check policy once rq lock held */
261 if (policy < 0)
262 @@ -4577,6 +4677,7 @@ asmlinkage long sys_sched_get_priority_m
263 break;
264 case SCHED_NORMAL:
265 case SCHED_BATCH:
266 + case SCHED_ISO:
267 ret = 0;
268 break;
269 }
270 @@ -4601,6 +4702,7 @@ asmlinkage long sys_sched_get_priority_m
271 break;
272 case SCHED_NORMAL:
273 case SCHED_BATCH:
274 + case SCHED_ISO:
275 ret = 0;
276 }
277 return ret;
278 @@ -6708,6 +6810,7 @@ void __init sched_init(void)
279 rq = cpu_rq(i);
280 spin_lock_init(&rq->lock);
281 lockdep_set_class(&rq->lock, &rq->rq_lock_key);
282 + rq->iso_ticks = 0;
283 rq->nr_running = 0;
284 rq->prio_rotation = 0;
285 rq->active = rq->arrays;
286 @@ -6801,7 +6904,7 @@ void normalize_rt_tasks(void)
287
288 read_lock_irq(&tasklist_lock);
289 for_each_process(p) {
290 - if (!rt_task(p))
291 + if (!rt_task(p) && !iso_task(p))
292 continue;
293
294 spin_lock_irqsave(&p->pi_lock, flags);
295 Index: linux-2.6.21-ck2/Documentation/sysctl/kernel.txt
296 ===================================================================
297 --- linux-2.6.21-ck2.orig/Documentation/sysctl/kernel.txt 2007-05-14 19:30:30.000000000 +1000
298 +++ linux-2.6.21-ck2/Documentation/sysctl/kernel.txt 2007-05-14 19:30:31.000000000 +1000
299 @@ -26,6 +26,8 @@ show up in /proc/sys/kernel:
300 - hostname
301 - hotplug
302 - interactive
303 +- iso_cpu
304 +- iso_period
305 - java-appletviewer [ binfmt_java, obsolete ]
306 - java-interpreter [ binfmt_java, obsolete ]
307 - kstack_depth_to_print [ X86 only ]
308 @@ -181,6 +183,25 @@ Default value is 1 (enabled).
309
310 ==============================================================
311
312 +iso_cpu:
313 +
314 +This sets the percentage of cpu time that unprivileged SCHED_ISO tasks can
315 +run at effectively realtime priority, averaged over a rolling window of
316 +iso_period seconds.
317 +
318 +Set to 80 (percent) by default.
319 +
320 +==============================================================
321 +
322 +iso_period:
323 +
324 +This sets the number of seconds over which SCHED_ISO cpu usage is averaged
325 +to see if it exceeds its allocated cpu bandwidth.
326 +
327 +Set to 5 (seconds) by default.
328 +
329 +==============================================================
330 +
331 l2cr: (PPC only)
332
333 This flag controls the L2 cache of G3 processor boards. If
334 Index: linux-2.6.21-ck2/kernel/sysctl.c
335 ===================================================================
336 --- linux-2.6.21-ck2.orig/kernel/sysctl.c 2007-05-14 19:30:30.000000000 +1000
337 +++ linux-2.6.21-ck2/kernel/sysctl.c 2007-05-14 19:30:31.000000000 +1000
338 @@ -78,6 +78,8 @@ extern int percpu_pagelist_fraction;
339 extern int compat_log;
340 extern int rr_interval;
341 extern int sched_interactive;
342 +extern int sched_iso_cpu;
343 +extern int sched_iso_period;
344
345 /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */
346 static int maxolduid = 65535;
347 @@ -528,6 +530,28 @@ static ctl_table kern_table[] = {
348 .mode = 0644,
349 .proc_handler = &proc_dointvec,
350 },
351 + {
352 + .ctl_name = CTL_UNNUMBERED,
353 + .procname = "iso_cpu",
354 + .data = &sched_iso_cpu,
355 + .maxlen = sizeof (int),
356 + .mode = 0644,
357 + .proc_handler = &proc_dointvec_minmax,
358 + .strategy = &sysctl_intvec,
359 + .extra1 = &zero,
360 + .extra2 = &one_hundred,
361 + },
362 + {
363 + .ctl_name = CTL_UNNUMBERED,
364 + .procname = "iso_period",
365 + .data = &sched_iso_period,
366 + .maxlen = sizeof (int),
367 + .mode = 0644,
368 + .proc_handler = &proc_dointvec_minmax,
369 + .strategy = &sysctl_intvec,
370 + .extra1 = &one,
371 + .extra2 = &one_hundred,
372 + },
373 #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86)
374 {
375 .ctl_name = KERN_UNKNOWN_NMI_PANIC,