Contents of /trunk/kernel26-magellan/patches-2.6.21-r13/0004-2.6.21-sched-iso-5.4.patch
Parent Directory | Revision Log
Revision 319 -
(show annotations)
(download)
Sun Aug 19 18:14:21 2007 UTC (17 years, 2 months ago) by niro
File size: 11954 byte(s)
-2.6.21-magellan-r13
1 | Add the SCHED_ISO policy (isochronous) which is a starvation free soft |
2 | realtime policy available to unprivileged users. The amount of cpu that |
3 | SCHED_ISO tasks will run as realtime is configurable by the tunable in |
4 | |
5 | /proc/sys/kernel/iso_cpu |
6 | |
7 | and is set to 80% by default. |
8 | |
9 | The duration over which its cpu usage is averaged is controlled by the |
10 | tunable |
11 | |
12 | /proc/sys/kernel/iso_period |
13 | |
14 | and is set to 5 (seconds) by default. |
15 | |
16 | Signed-off-by: Con Kolivas <kernel@kolivas.org> |
17 | |
18 | Documentation/sysctl/kernel.txt | 21 +++++++ |
19 | include/linux/sched.h | 8 ++ |
20 | kernel/sched.c | 115 +++++++++++++++++++++++++++++++++++++--- |
21 | kernel/sysctl.c | 24 ++++++++ |
22 | 4 files changed, 160 insertions(+), 8 deletions(-) |
23 | |
24 | Index: linux-2.6.21-ck2/include/linux/sched.h |
25 | =================================================================== |
26 | --- linux-2.6.21-ck2.orig/include/linux/sched.h 2007-05-14 19:30:30.000000000 +1000 |
27 | +++ linux-2.6.21-ck2/include/linux/sched.h 2007-05-14 19:30:31.000000000 +1000 |
28 | @@ -34,10 +34,11 @@ |
29 | #define SCHED_FIFO 1 |
30 | #define SCHED_RR 2 |
31 | #define SCHED_BATCH 3 |
32 | +#define SCHED_ISO 4 |
33 | |
34 | #ifdef __KERNEL__ |
35 | |
36 | -#define SCHED_MAX SCHED_BATCH |
37 | +#define SCHED_MAX SCHED_ISO |
38 | #define SCHED_RANGE(policy) ((policy) <= SCHED_MAX) |
39 | |
40 | struct sched_param { |
41 | @@ -525,15 +526,17 @@ struct signal_struct { |
42 | #define MAX_USER_RT_PRIO 100 |
43 | #define MAX_RT_PRIO MAX_USER_RT_PRIO |
44 | #define PRIO_RANGE (40) |
45 | +#define ISO_PRIO (MAX_RT_PRIO - 1) |
46 | |
47 | #define MAX_PRIO (MAX_RT_PRIO + PRIO_RANGE) |
48 | |
49 | -#define rt_prio(prio) unlikely((prio) < MAX_RT_PRIO) |
50 | +#define rt_prio(prio) unlikely((prio) < ISO_PRIO) |
51 | #define rt_task(p) rt_prio((p)->prio) |
52 | #define batch_task(p) (unlikely((p)->policy == SCHED_BATCH)) |
53 | #define is_rt_policy(policy) ((policy) == SCHED_FIFO || \ |
54 | (policy) == SCHED_RR) |
55 | #define has_rt_policy(p) unlikely(is_rt_policy((p)->policy)) |
56 | +#define iso_task(p) unlikely((p)->policy == SCHED_ISO) |
57 | |
58 | /* |
59 | * Some day this will be a full-fledged user tracking system.. |
60 | @@ -1166,6 +1169,7 @@ static inline void put_task_struct(struc |
61 | #define PF_SWAPWRITE 0x00800000 /* Allowed to write to swap */ |
62 | #define PF_SPREAD_PAGE 0x01000000 /* Spread page cache over cpuset */ |
63 | #define PF_SPREAD_SLAB 0x02000000 /* Spread some slab caches over cpuset */ |
64 | +#define PF_ISOREF 0x04000000 /* SCHED_ISO task has used up quota */ |
65 | #define PF_MEMPOLICY 0x10000000 /* Non-default NUMA mempolicy */ |
66 | #define PF_MUTEX_TESTER 0x20000000 /* Thread belongs to the rt mutex tester */ |
67 | |
68 | Index: linux-2.6.21-ck2/kernel/sched.c |
69 | =================================================================== |
70 | --- linux-2.6.21-ck2.orig/kernel/sched.c 2007-05-14 19:30:30.000000000 +1000 |
71 | +++ linux-2.6.21-ck2/kernel/sched.c 2007-05-14 19:30:31.000000000 +1000 |
72 | @@ -104,6 +104,18 @@ int rr_interval __read_mostly = 8; |
73 | int sched_interactive __read_mostly = 1; |
74 | |
75 | /* |
76 | + * sched_iso_cpu - sysctl which determines the cpu percentage SCHED_ISO tasks |
77 | + * are allowed to run (over ISO_PERIOD seconds) as real time tasks. |
78 | + * sched_iso_period - sysctl which determines the number of seconds over |
79 | + * which cpu usage of SCHED_ISO tasks is averaged to determine if they are |
80 | + * exceeding their allowable bandwidth. |
81 | +*/ |
82 | +int sched_iso_cpu __read_mostly = 80; |
83 | +int sched_iso_period __read_mostly = 5; |
84 | + |
85 | +#define ISO_PERIOD ((sched_iso_period * HZ) + 1) |
86 | + |
87 | +/* |
88 | * This contains a bitmap for each dynamic priority level with empty slots |
89 | * for the valid priorities each different nice level can have. It allows |
90 | * us to stagger the slots where differing priorities run in a way that |
91 | @@ -200,6 +212,8 @@ struct rq { |
92 | |
93 | /* How many times we have rotated the priority queue */ |
94 | unsigned long prio_rotation; |
95 | + unsigned long iso_ticks; |
96 | + unsigned short iso_refractory; |
97 | |
98 | atomic_t nr_iowait; |
99 | |
100 | @@ -790,6 +804,11 @@ static inline void update_if_moved(struc |
101 | } |
102 | #endif |
103 | |
104 | +static inline int isoprio_suitable(struct task_struct *p) |
105 | +{ |
106 | + return !(p->flags & PF_ISOREF); |
107 | +} |
108 | + |
109 | /* |
110 | * recalc_task_prio determines what priority a non rt_task will be |
111 | * queued at. If the task has already been running during this runqueue's |
112 | @@ -806,6 +825,25 @@ static void recalc_task_prio(struct task |
113 | struct prio_array *array = rq->active; |
114 | int queue_prio; |
115 | |
116 | + if (iso_task(p)) { |
117 | + if (isoprio_suitable(p)) { |
118 | + /* |
119 | + * If SCHED_ISO tasks have not used up their real time |
120 | + * quota they have run just better than highest |
121 | + * SCHED_NORMAL priority. Otherwise they run as |
122 | + * SCHED_NORMAL. |
123 | + */ |
124 | + p->prio = p->normal_prio = ISO_PRIO; |
125 | + p->array = rq->active; |
126 | + if (p->time_slice <= 0) |
127 | + p->time_slice = p->quota; |
128 | + return; |
129 | + } else if (p->prio == ISO_PRIO) { |
130 | + /* Just about to be demoted to SCHED_NORMAL */ |
131 | + p->time_slice = 0; |
132 | + } |
133 | + } |
134 | + |
135 | update_if_moved(p, rq); |
136 | if (p->rotation == rq->prio_rotation) { |
137 | if (p->array == array) { |
138 | @@ -3180,18 +3218,65 @@ static void task_expired_entitlement(str |
139 | p->time_slice += overrun; |
140 | } |
141 | |
142 | +/* |
143 | + * Test if SCHED_ISO tasks have run longer than their allotted period as RT |
144 | + * tasks and set the refractory flag if necessary. There is 10% hysteresis |
145 | + * for unsetting the flag. |
146 | + */ |
147 | +static unsigned int test_ret_isorefractory(struct rq *rq) |
148 | +{ |
149 | + if (likely(!rq->iso_refractory)) { |
150 | + if (rq->iso_ticks / ISO_PERIOD > sched_iso_cpu) |
151 | + rq->iso_refractory = 1; |
152 | + } else { |
153 | + if (rq->iso_ticks / ISO_PERIOD < (sched_iso_cpu * 90 / 100)) |
154 | + rq->iso_refractory = 0; |
155 | + } |
156 | + return rq->iso_refractory; |
157 | +} |
158 | + |
159 | +/* No SCHED_ISO task was running so decrease rq->iso_ticks */ |
160 | +static inline void no_iso_tick(struct rq *rq) |
161 | +{ |
162 | + rq->iso_ticks = rq->iso_ticks * (ISO_PERIOD - 1) / ISO_PERIOD; |
163 | +} |
164 | + |
165 | /* This manages tasks that have run out of timeslice during a scheduler_tick */ |
166 | static void task_running_tick(struct rq *rq, struct task_struct *p) |
167 | { |
168 | + /* |
169 | + * If a SCHED_ISO task is running we increment the iso_ticks. In |
170 | + * order to prevent SCHED_ISO tasks from causing starvation in the |
171 | + * presence of true RT tasks we account those as iso_ticks as well. |
172 | + */ |
173 | + if ((rt_task(p) || (iso_task(p) && !rq->iso_refractory))) { |
174 | + if (rq->iso_ticks <= (ISO_PERIOD * 100) - 100) |
175 | + rq->iso_ticks += 100; |
176 | + } else |
177 | + no_iso_tick(rq); |
178 | + |
179 | + if (iso_task(p)) { |
180 | + if (unlikely(test_ret_isorefractory(rq))) { |
181 | + if (isoprio_suitable(p)) { |
182 | + /* |
183 | + * SCHED_ISO task is running as RT and limit |
184 | + * has been hit. Set the PF_ISOREF flag and |
185 | + * force it to reschedule as SCHED_NORMAL |
186 | + * by zeroing its time_slice |
187 | + */ |
188 | + p->flags |= PF_ISOREF; |
189 | + p->time_slice = 0; |
190 | + } |
191 | + } else |
192 | + p->flags &= ~PF_ISOREF; |
193 | + } |
194 | /* SCHED_FIFO tasks never run out of timeslice. */ |
195 | if (p->time_slice > 0 || p->policy == SCHED_FIFO) |
196 | return; |
197 | /* p->time_slice <= 0 */ |
198 | - spin_lock(&rq->lock); |
199 | + set_tsk_need_resched(p); |
200 | if (likely(task_queued(p))) |
201 | task_expired_entitlement(rq, p); |
202 | - set_tsk_need_resched(p); |
203 | - spin_unlock(&rq->lock); |
204 | } |
205 | |
206 | /* |
207 | @@ -3207,8 +3292,12 @@ void scheduler_tick(void) |
208 | |
209 | update_cpu_clock(p, rq, now, 1); |
210 | |
211 | + spin_lock(&rq->lock); |
212 | if (p != rq->idle) |
213 | task_running_tick(rq, p); |
214 | + else |
215 | + no_iso_tick(rq); |
216 | + spin_unlock(&rq->lock); |
217 | #ifdef CONFIG_SMP |
218 | update_load(rq); |
219 | if (time_after_eq(jiffies, rq->next_balance)) |
220 | @@ -3285,7 +3374,8 @@ retry: |
221 | } |
222 | queue = array->queue + idx; |
223 | next = list_entry(queue->next, struct task_struct, run_list); |
224 | - if (unlikely(next->time_slice <= 0)) { |
225 | + if (unlikely(next->time_slice <= 0 && !(iso_task(next) && |
226 | + isoprio_suitable(next)))) { |
227 | /* |
228 | * Unlucky enough that this task ran out of time_slice |
229 | * before it hit a scheduler_tick so it should have its |
230 | @@ -3377,7 +3467,7 @@ need_resched_nonpreemptible: |
231 | } |
232 | |
233 | idx = sched_find_first_bit(rq->dyn_bitmap); |
234 | - if (!rt_prio(idx)) |
235 | + if (likely(idx > ISO_PRIO)) |
236 | next = next_dynamic_task(rq, idx); |
237 | else { |
238 | queue = rq->active->queue + idx; |
239 | @@ -4042,12 +4132,22 @@ static void __setscheduler(struct task_s |
240 | int sched_setscheduler(struct task_struct *p, int policy, |
241 | struct sched_param *param) |
242 | { |
243 | + struct sched_param zero_param = { .sched_priority = 0 }; |
244 | int queued, retval, oldprio, oldpolicy = -1; |
245 | unsigned long flags; |
246 | struct rq *rq; |
247 | |
248 | /* may grab non-irq protected spin_locks */ |
249 | BUG_ON(in_interrupt()); |
250 | + if (is_rt_policy(policy) && !capable(CAP_SYS_NICE)) { |
251 | + /* |
252 | + * If the caller requested an RT policy without having the |
253 | + * necessary rights, we downgrade the policy to SCHED_ISO. |
254 | + * We also set the parameter to zero to pass the checks. |
255 | + */ |
256 | + policy = SCHED_ISO; |
257 | + param = &zero_param; |
258 | + } |
259 | recheck: |
260 | /* double check policy once rq lock held */ |
261 | if (policy < 0) |
262 | @@ -4577,6 +4677,7 @@ asmlinkage long sys_sched_get_priority_m |
263 | break; |
264 | case SCHED_NORMAL: |
265 | case SCHED_BATCH: |
266 | + case SCHED_ISO: |
267 | ret = 0; |
268 | break; |
269 | } |
270 | @@ -4601,6 +4702,7 @@ asmlinkage long sys_sched_get_priority_m |
271 | break; |
272 | case SCHED_NORMAL: |
273 | case SCHED_BATCH: |
274 | + case SCHED_ISO: |
275 | ret = 0; |
276 | } |
277 | return ret; |
278 | @@ -6708,6 +6810,7 @@ void __init sched_init(void) |
279 | rq = cpu_rq(i); |
280 | spin_lock_init(&rq->lock); |
281 | lockdep_set_class(&rq->lock, &rq->rq_lock_key); |
282 | + rq->iso_ticks = 0; |
283 | rq->nr_running = 0; |
284 | rq->prio_rotation = 0; |
285 | rq->active = rq->arrays; |
286 | @@ -6801,7 +6904,7 @@ void normalize_rt_tasks(void) |
287 | |
288 | read_lock_irq(&tasklist_lock); |
289 | for_each_process(p) { |
290 | - if (!rt_task(p)) |
291 | + if (!rt_task(p) && !iso_task(p)) |
292 | continue; |
293 | |
294 | spin_lock_irqsave(&p->pi_lock, flags); |
295 | Index: linux-2.6.21-ck2/Documentation/sysctl/kernel.txt |
296 | =================================================================== |
297 | --- linux-2.6.21-ck2.orig/Documentation/sysctl/kernel.txt 2007-05-14 19:30:30.000000000 +1000 |
298 | +++ linux-2.6.21-ck2/Documentation/sysctl/kernel.txt 2007-05-14 19:30:31.000000000 +1000 |
299 | @@ -26,6 +26,8 @@ show up in /proc/sys/kernel: |
300 | - hostname |
301 | - hotplug |
302 | - interactive |
303 | +- iso_cpu |
304 | +- iso_period |
305 | - java-appletviewer [ binfmt_java, obsolete ] |
306 | - java-interpreter [ binfmt_java, obsolete ] |
307 | - kstack_depth_to_print [ X86 only ] |
308 | @@ -181,6 +183,25 @@ Default value is 1 (enabled). |
309 | |
310 | ============================================================== |
311 | |
312 | +iso_cpu: |
313 | + |
314 | +This sets the percentage cpu that the unprivileged SCHED_ISO tasks can |
315 | +run effectively at realtime priority, averaged over a rolling iso_period |
316 | +seconds. |
317 | + |
318 | +Set to 80 (percent) by default. |
319 | + |
320 | +============================================================== |
321 | + |
322 | +iso_period: |
323 | + |
324 | +This sets the number of seconds over which SCHED_ISO cpu usage is averaged |
325 | +to see if it exceeds its allocated cpu bandwidth. |
326 | + |
327 | +Set to 5 (seconds) by default. |
328 | + |
329 | +============================================================== |
330 | + |
331 | l2cr: (PPC only) |
332 | |
333 | This flag controls the L2 cache of G3 processor boards. If |
334 | Index: linux-2.6.21-ck2/kernel/sysctl.c |
335 | =================================================================== |
336 | --- linux-2.6.21-ck2.orig/kernel/sysctl.c 2007-05-14 19:30:30.000000000 +1000 |
337 | +++ linux-2.6.21-ck2/kernel/sysctl.c 2007-05-14 19:30:31.000000000 +1000 |
338 | @@ -78,6 +78,8 @@ extern int percpu_pagelist_fraction; |
339 | extern int compat_log; |
340 | extern int rr_interval; |
341 | extern int sched_interactive; |
342 | +extern int sched_iso_cpu; |
343 | +extern int sched_iso_period; |
344 | |
345 | /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */ |
346 | static int maxolduid = 65535; |
347 | @@ -528,6 +530,28 @@ static ctl_table kern_table[] = { |
348 | .mode = 0644, |
349 | .proc_handler = &proc_dointvec, |
350 | }, |
351 | + { |
352 | + .ctl_name = CTL_UNNUMBERED, |
353 | + .procname = "iso_cpu", |
354 | + .data = &sched_iso_cpu, |
355 | + .maxlen = sizeof (int), |
356 | + .mode = 0644, |
357 | + .proc_handler = &proc_dointvec_minmax, |
358 | + .strategy = &sysctl_intvec, |
359 | + .extra1 = &zero, |
360 | + .extra2 = &one_hundred, |
361 | + }, |
362 | + { |
363 | + .ctl_name = CTL_UNNUMBERED, |
364 | + .procname = "iso_period", |
365 | + .data = &sched_iso_period, |
366 | + .maxlen = sizeof (int), |
367 | + .mode = 0644, |
368 | + .proc_handler = &proc_dointvec_minmax, |
369 | + .strategy = &sysctl_intvec, |
370 | + .extra1 = &one, |
371 | + .extra2 = &one_hundred, |
372 | + }, |
373 | #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) |
374 | { |
375 | .ctl_name = KERN_UNKNOWN_NMI_PANIC, |