Annotation of /trunk/kernel26-tinyalx/patches-2.6.21-r14/0004-2.6.21-sched-iso-5.4.patch
Parent Directory | Revision Log
Revision 453 -
(hide annotations)
(download)
Fri Jan 25 23:34:48 2008 UTC (16 years, 8 months ago) by niro
File size: 11954 byte(s)
-tiny-alx 2.6.21-tinyalx-r14
1 | niro | 453 | Add the SCHED_ISO policy (isochronous) which is a starvation free soft |
2 | realtime policy available to unprivileged users. The amount of cpu that | ||
3 | SCHED_ISO tasks will run as realtime is configurable by the tunable in | ||
4 | |||
5 | /proc/sys/kernel/iso_cpu | ||
6 | |||
7 | and is set to 80% by default. | ||
8 | |||
9 | The duration over which its cpu usage is averaged is controlled by the | ||
10 | tunable | ||
11 | |||
12 | /proc/sys/kernel/iso_period | ||
13 | |||
14 | and is set to 5 (seconds) by default. | ||
15 | |||
16 | Signed-off-by: Con Kolivas <kernel@kolivas.org> | ||
17 | |||
18 | Documentation/sysctl/kernel.txt | 21 +++++++ | ||
19 | include/linux/sched.h | 8 ++ | ||
20 | kernel/sched.c | 115 +++++++++++++++++++++++++++++++++++++--- | ||
21 | kernel/sysctl.c | 24 ++++++++ | ||
22 | 4 files changed, 160 insertions(+), 8 deletions(-) | ||
23 | |||
24 | Index: linux-2.6.21-ck2/include/linux/sched.h | ||
25 | =================================================================== | ||
26 | --- linux-2.6.21-ck2.orig/include/linux/sched.h 2007-05-14 19:30:30.000000000 +1000 | ||
27 | +++ linux-2.6.21-ck2/include/linux/sched.h 2007-05-14 19:30:31.000000000 +1000 | ||
28 | @@ -34,10 +34,11 @@ | ||
29 | #define SCHED_FIFO 1 | ||
30 | #define SCHED_RR 2 | ||
31 | #define SCHED_BATCH 3 | ||
32 | +#define SCHED_ISO 4 | ||
33 | |||
34 | #ifdef __KERNEL__ | ||
35 | |||
36 | -#define SCHED_MAX SCHED_BATCH | ||
37 | +#define SCHED_MAX SCHED_ISO | ||
38 | #define SCHED_RANGE(policy) ((policy) <= SCHED_MAX) | ||
39 | |||
40 | struct sched_param { | ||
41 | @@ -525,15 +526,17 @@ struct signal_struct { | ||
42 | #define MAX_USER_RT_PRIO 100 | ||
43 | #define MAX_RT_PRIO MAX_USER_RT_PRIO | ||
44 | #define PRIO_RANGE (40) | ||
45 | +#define ISO_PRIO (MAX_RT_PRIO - 1) | ||
46 | |||
47 | #define MAX_PRIO (MAX_RT_PRIO + PRIO_RANGE) | ||
48 | |||
49 | -#define rt_prio(prio) unlikely((prio) < MAX_RT_PRIO) | ||
50 | +#define rt_prio(prio) unlikely((prio) < ISO_PRIO) | ||
51 | #define rt_task(p) rt_prio((p)->prio) | ||
52 | #define batch_task(p) (unlikely((p)->policy == SCHED_BATCH)) | ||
53 | #define is_rt_policy(policy) ((policy) == SCHED_FIFO || \ | ||
54 | (policy) == SCHED_RR) | ||
55 | #define has_rt_policy(p) unlikely(is_rt_policy((p)->policy)) | ||
56 | +#define iso_task(p) unlikely((p)->policy == SCHED_ISO) | ||
57 | |||
58 | /* | ||
59 | * Some day this will be a full-fledged user tracking system.. | ||
60 | @@ -1166,6 +1169,7 @@ static inline void put_task_struct(struc | ||
61 | #define PF_SWAPWRITE 0x00800000 /* Allowed to write to swap */ | ||
62 | #define PF_SPREAD_PAGE 0x01000000 /* Spread page cache over cpuset */ | ||
63 | #define PF_SPREAD_SLAB 0x02000000 /* Spread some slab caches over cpuset */ | ||
64 | +#define PF_ISOREF 0x04000000 /* SCHED_ISO task has used up quota */ | ||
65 | #define PF_MEMPOLICY 0x10000000 /* Non-default NUMA mempolicy */ | ||
66 | #define PF_MUTEX_TESTER 0x20000000 /* Thread belongs to the rt mutex tester */ | ||
67 | |||
68 | Index: linux-2.6.21-ck2/kernel/sched.c | ||
69 | =================================================================== | ||
70 | --- linux-2.6.21-ck2.orig/kernel/sched.c 2007-05-14 19:30:30.000000000 +1000 | ||
71 | +++ linux-2.6.21-ck2/kernel/sched.c 2007-05-14 19:30:31.000000000 +1000 | ||
72 | @@ -104,6 +104,18 @@ int rr_interval __read_mostly = 8; | ||
73 | int sched_interactive __read_mostly = 1; | ||
74 | |||
75 | /* | ||
76 | + * sched_iso_cpu - sysctl which determines the cpu percentage SCHED_ISO tasks | ||
77 | + * are allowed to run (over ISO_PERIOD seconds) as real time tasks. | ||
78 | + * sched_iso_period - sysctl which determines the number of seconds over | ||
79 | + * which cpu usage of SCHED_ISO tasks is averaged to determine if they are | ||
80 | + * exceeding their allowable bandwidth. | ||
81 | + */ | ||
82 | +int sched_iso_cpu __read_mostly = 80; | ||
83 | +int sched_iso_period __read_mostly = 5; | ||
84 | + | ||
85 | +#define ISO_PERIOD ((sched_iso_period * HZ) + 1) | ||
86 | + | ||
87 | +/* | ||
88 | * This contains a bitmap for each dynamic priority level with empty slots | ||
89 | * for the valid priorities each different nice level can have. It allows | ||
90 | * us to stagger the slots where differing priorities run in a way that | ||
91 | @@ -200,6 +212,8 @@ struct rq { | ||
92 | |||
93 | /* How many times we have rotated the priority queue */ | ||
94 | unsigned long prio_rotation; | ||
95 | + unsigned long iso_ticks; | ||
96 | + unsigned short iso_refractory; | ||
97 | |||
98 | atomic_t nr_iowait; | ||
99 | |||
100 | @@ -790,6 +804,11 @@ static inline void update_if_moved(struc | ||
101 | } | ||
102 | #endif | ||
103 | |||
104 | +static inline int isoprio_suitable(struct task_struct *p) | ||
105 | +{ | ||
106 | + return !(p->flags & PF_ISOREF); | ||
107 | +} | ||
108 | + | ||
109 | /* | ||
110 | * recalc_task_prio determines what priority a non rt_task will be | ||
111 | * queued at. If the task has already been running during this runqueue's | ||
112 | @@ -806,6 +825,25 @@ static void recalc_task_prio(struct task | ||
113 | struct prio_array *array = rq->active; | ||
114 | int queue_prio; | ||
115 | |||
116 | + if (iso_task(p)) { | ||
117 | + if (isoprio_suitable(p)) { | ||
118 | + /* | ||
119 | + * If SCHED_ISO tasks have not used up their real time | ||
120 | + * quota they have run just better than highest | ||
121 | + * SCHED_NORMAL priority. Otherwise they run as | ||
122 | + * SCHED_NORMAL. | ||
123 | + */ | ||
124 | + p->prio = p->normal_prio = ISO_PRIO; | ||
125 | + p->array = rq->active; | ||
126 | + if (p->time_slice <= 0) | ||
127 | + p->time_slice = p->quota; | ||
128 | + return; | ||
129 | + } else if (p->prio == ISO_PRIO) { | ||
130 | + /* Just about to be demoted to SCHED_NORMAL */ | ||
131 | + p->time_slice = 0; | ||
132 | + } | ||
133 | + } | ||
134 | + | ||
135 | update_if_moved(p, rq); | ||
136 | if (p->rotation == rq->prio_rotation) { | ||
137 | if (p->array == array) { | ||
138 | @@ -3180,18 +3218,65 @@ static void task_expired_entitlement(str | ||
139 | p->time_slice += overrun; | ||
140 | } | ||
141 | |||
142 | +/* | ||
143 | + * Test if SCHED_ISO tasks have run longer than their allotted period as RT | ||
144 | + * tasks and set the refractory flag if necessary. There is 10% hysteresis | ||
145 | + * for unsetting the flag. | ||
146 | + */ | ||
147 | +static unsigned int test_ret_isorefractory(struct rq *rq) | ||
148 | +{ | ||
149 | + if (likely(!rq->iso_refractory)) { | ||
150 | + if (rq->iso_ticks / ISO_PERIOD > sched_iso_cpu) | ||
151 | + rq->iso_refractory = 1; | ||
152 | + } else { | ||
153 | + if (rq->iso_ticks / ISO_PERIOD < (sched_iso_cpu * 90 / 100)) | ||
154 | + rq->iso_refractory = 0; | ||
155 | + } | ||
156 | + return rq->iso_refractory; | ||
157 | +} | ||
158 | + | ||
159 | +/* No SCHED_ISO task was running so decrease rq->iso_ticks */ | ||
160 | +static inline void no_iso_tick(struct rq *rq) | ||
161 | +{ | ||
162 | + rq->iso_ticks = rq->iso_ticks * (ISO_PERIOD - 1) / ISO_PERIOD; | ||
163 | +} | ||
164 | + | ||
165 | /* This manages tasks that have run out of timeslice during a scheduler_tick */ | ||
166 | static void task_running_tick(struct rq *rq, struct task_struct *p) | ||
167 | { | ||
168 | + /* | ||
169 | + * If a SCHED_ISO task is running we increment the iso_ticks. In | ||
170 | + * order to prevent SCHED_ISO tasks from causing starvation in the | ||
171 | + * presence of true RT tasks we account those as iso_ticks as well. | ||
172 | + */ | ||
173 | + if ((rt_task(p) || (iso_task(p) && !rq->iso_refractory))) { | ||
174 | + if (rq->iso_ticks <= (ISO_PERIOD * 100) - 100) | ||
175 | + rq->iso_ticks += 100; | ||
176 | + } else | ||
177 | + no_iso_tick(rq); | ||
178 | + | ||
179 | + if (iso_task(p)) { | ||
180 | + if (unlikely(test_ret_isorefractory(rq))) { | ||
181 | + if (isoprio_suitable(p)) { | ||
182 | + /* | ||
183 | + * SCHED_ISO task is running as RT and limit | ||
184 | + * has been hit. Set the PF_ISOREF flag and | ||
185 | + * force it to reschedule as SCHED_NORMAL | ||
186 | + * by zeroing its time_slice | ||
187 | + */ | ||
188 | + p->flags |= PF_ISOREF; | ||
189 | + p->time_slice = 0; | ||
190 | + } | ||
191 | + } else | ||
192 | + p->flags &= ~PF_ISOREF; | ||
193 | + } | ||
194 | /* SCHED_FIFO tasks never run out of timeslice. */ | ||
195 | if (p->time_slice > 0 || p->policy == SCHED_FIFO) | ||
196 | return; | ||
197 | /* p->time_slice <= 0 */ | ||
198 | - spin_lock(&rq->lock); | ||
199 | + set_tsk_need_resched(p); | ||
200 | if (likely(task_queued(p))) | ||
201 | task_expired_entitlement(rq, p); | ||
202 | - set_tsk_need_resched(p); | ||
203 | - spin_unlock(&rq->lock); | ||
204 | } | ||
205 | |||
206 | /* | ||
207 | @@ -3207,8 +3292,12 @@ void scheduler_tick(void) | ||
208 | |||
209 | update_cpu_clock(p, rq, now, 1); | ||
210 | |||
211 | + spin_lock(&rq->lock); | ||
212 | if (p != rq->idle) | ||
213 | task_running_tick(rq, p); | ||
214 | + else | ||
215 | + no_iso_tick(rq); | ||
216 | + spin_unlock(&rq->lock); | ||
217 | #ifdef CONFIG_SMP | ||
218 | update_load(rq); | ||
219 | if (time_after_eq(jiffies, rq->next_balance)) | ||
220 | @@ -3285,7 +3374,8 @@ retry: | ||
221 | } | ||
222 | queue = array->queue + idx; | ||
223 | next = list_entry(queue->next, struct task_struct, run_list); | ||
224 | - if (unlikely(next->time_slice <= 0)) { | ||
225 | + if (unlikely(next->time_slice <= 0 && !(iso_task(next) && | ||
226 | + isoprio_suitable(next)))) { | ||
227 | /* | ||
228 | * Unlucky enough that this task ran out of time_slice | ||
229 | * before it hit a scheduler_tick so it should have its | ||
230 | @@ -3377,7 +3467,7 @@ need_resched_nonpreemptible: | ||
231 | } | ||
232 | |||
233 | idx = sched_find_first_bit(rq->dyn_bitmap); | ||
234 | - if (!rt_prio(idx)) | ||
235 | + if (likely(idx > ISO_PRIO)) | ||
236 | next = next_dynamic_task(rq, idx); | ||
237 | else { | ||
238 | queue = rq->active->queue + idx; | ||
239 | @@ -4042,12 +4132,22 @@ static void __setscheduler(struct task_s | ||
240 | int sched_setscheduler(struct task_struct *p, int policy, | ||
241 | struct sched_param *param) | ||
242 | { | ||
243 | + struct sched_param zero_param = { .sched_priority = 0 }; | ||
244 | int queued, retval, oldprio, oldpolicy = -1; | ||
245 | unsigned long flags; | ||
246 | struct rq *rq; | ||
247 | |||
248 | /* may grab non-irq protected spin_locks */ | ||
249 | BUG_ON(in_interrupt()); | ||
250 | + if (is_rt_policy(policy) && !capable(CAP_SYS_NICE)) { | ||
251 | + /* | ||
252 | + * If the caller requested an RT policy without having the | ||
253 | + * necessary rights, we downgrade the policy to SCHED_ISO. | ||
254 | + * We also set the parameter to zero to pass the checks. | ||
255 | + */ | ||
256 | + policy = SCHED_ISO; | ||
257 | + param = &zero_param; | ||
258 | + } | ||
259 | recheck: | ||
260 | /* double check policy once rq lock held */ | ||
261 | if (policy < 0) | ||
262 | @@ -4577,6 +4677,7 @@ asmlinkage long sys_sched_get_priority_m | ||
263 | break; | ||
264 | case SCHED_NORMAL: | ||
265 | case SCHED_BATCH: | ||
266 | + case SCHED_ISO: | ||
267 | ret = 0; | ||
268 | break; | ||
269 | } | ||
270 | @@ -4601,6 +4702,7 @@ asmlinkage long sys_sched_get_priority_m | ||
271 | break; | ||
272 | case SCHED_NORMAL: | ||
273 | case SCHED_BATCH: | ||
274 | + case SCHED_ISO: | ||
275 | ret = 0; | ||
276 | } | ||
277 | return ret; | ||
278 | @@ -6708,6 +6810,7 @@ void __init sched_init(void) | ||
279 | rq = cpu_rq(i); | ||
280 | spin_lock_init(&rq->lock); | ||
281 | lockdep_set_class(&rq->lock, &rq->rq_lock_key); | ||
282 | + rq->iso_ticks = 0; | ||
283 | rq->nr_running = 0; | ||
284 | rq->prio_rotation = 0; | ||
285 | rq->active = rq->arrays; | ||
286 | @@ -6801,7 +6904,7 @@ void normalize_rt_tasks(void) | ||
287 | |||
288 | read_lock_irq(&tasklist_lock); | ||
289 | for_each_process(p) { | ||
290 | - if (!rt_task(p)) | ||
291 | + if (!rt_task(p) && !iso_task(p)) | ||
292 | continue; | ||
293 | |||
294 | spin_lock_irqsave(&p->pi_lock, flags); | ||
295 | Index: linux-2.6.21-ck2/Documentation/sysctl/kernel.txt | ||
296 | =================================================================== | ||
297 | --- linux-2.6.21-ck2.orig/Documentation/sysctl/kernel.txt 2007-05-14 19:30:30.000000000 +1000 | ||
298 | +++ linux-2.6.21-ck2/Documentation/sysctl/kernel.txt 2007-05-14 19:30:31.000000000 +1000 | ||
299 | @@ -26,6 +26,8 @@ show up in /proc/sys/kernel: | ||
300 | - hostname | ||
301 | - hotplug | ||
302 | - interactive | ||
303 | +- iso_cpu | ||
304 | +- iso_period | ||
305 | - java-appletviewer [ binfmt_java, obsolete ] | ||
306 | - java-interpreter [ binfmt_java, obsolete ] | ||
307 | - kstack_depth_to_print [ X86 only ] | ||
308 | @@ -181,6 +183,25 @@ Default value is 1 (enabled). | ||
309 | |||
310 | ============================================================== | ||
311 | |||
312 | +iso_cpu: | ||
313 | + | ||
314 | +This sets the percentage cpu that the unprivileged SCHED_ISO tasks can | ||
315 | +run effectively at realtime priority, averaged over a rolling iso_period | ||
316 | +seconds. | ||
317 | + | ||
318 | +Set to 80 (percent) by default. | ||
319 | + | ||
320 | +============================================================== | ||
321 | + | ||
322 | +iso_period: | ||
323 | + | ||
324 | +This sets the number of seconds over which SCHED_ISO cpu usage is averaged | ||
325 | +to see if it exceeds its allocated cpu bandwidth. | ||
326 | + | ||
327 | +Set to 5 (seconds) by default. | ||
328 | + | ||
329 | +============================================================== | ||
330 | + | ||
331 | l2cr: (PPC only) | ||
332 | |||
333 | This flag controls the L2 cache of G3 processor boards. If | ||
334 | Index: linux-2.6.21-ck2/kernel/sysctl.c | ||
335 | =================================================================== | ||
336 | --- linux-2.6.21-ck2.orig/kernel/sysctl.c 2007-05-14 19:30:30.000000000 +1000 | ||
337 | +++ linux-2.6.21-ck2/kernel/sysctl.c 2007-05-14 19:30:31.000000000 +1000 | ||
338 | @@ -78,6 +78,8 @@ extern int percpu_pagelist_fraction; | ||
339 | extern int compat_log; | ||
340 | extern int rr_interval; | ||
341 | extern int sched_interactive; | ||
342 | +extern int sched_iso_cpu; | ||
343 | +extern int sched_iso_period; | ||
344 | |||
345 | /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */ | ||
346 | static int maxolduid = 65535; | ||
347 | @@ -528,6 +530,28 @@ static ctl_table kern_table[] = { | ||
348 | .mode = 0644, | ||
349 | .proc_handler = &proc_dointvec, | ||
350 | }, | ||
351 | + { | ||
352 | + .ctl_name = CTL_UNNUMBERED, | ||
353 | + .procname = "iso_cpu", | ||
354 | + .data = &sched_iso_cpu, | ||
355 | + .maxlen = sizeof (int), | ||
356 | + .mode = 0644, | ||
357 | + .proc_handler = &proc_dointvec_minmax, | ||
358 | + .strategy = &sysctl_intvec, | ||
359 | + .extra1 = &zero, | ||
360 | + .extra2 = &one_hundred, | ||
361 | + }, | ||
362 | + { | ||
363 | + .ctl_name = CTL_UNNUMBERED, | ||
364 | + .procname = "iso_period", | ||
365 | + .data = &sched_iso_period, | ||
366 | + .maxlen = sizeof (int), | ||
367 | + .mode = 0644, | ||
368 | + .proc_handler = &proc_dointvec_minmax, | ||
369 | + .strategy = &sysctl_intvec, | ||
370 | + .extra1 = &one, | ||
371 | + .extra2 = &one_hundred, | ||
372 | + }, | ||
373 | #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) | ||
374 | { | ||
375 | .ctl_name = KERN_UNKNOWN_NMI_PANIC, |