Magellan Linux

/trunk/kernel26-magellan/patches-2.6.21-r7/0004-2.6.21-sched-iso-5.4.patch

Revision 269 - Sat Jul 21 00:37:57 2007 UTC - committed by niro
File size: 11954 byte(s)
2.6.21-magellan-r7

Add the SCHED_ISO policy (isochronous), a starvation-free soft realtime
policy available to unprivileged users. The percentage of cpu that
SCHED_ISO tasks may run as realtime is configurable via the tunable

/proc/sys/kernel/iso_cpu

and is set to 80% by default.

The duration over which their cpu usage is averaged is controlled by the
tunable

/proc/sys/kernel/iso_period

and is set to 5 (seconds) by default.
Signed-off-by: Con Kolivas <kernel@kolivas.org>

 Documentation/sysctl/kernel.txt |   21 +++++++
 include/linux/sched.h           |    8 ++
 kernel/sched.c                  |  115 +++++++++++++++++++++++++++++++++++++---
 kernel/sysctl.c                 |   24 ++++++++
 4 files changed, 160 insertions(+), 8 deletions(-)

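For illustration, a task opts into the new policy through the standard
sched_setscheduler() interface. This is a minimal sketch, not part of the
patch; it assumes libc headers of this era do not define SCHED_ISO, so the
value 4 from the sched.h hunk below is supplied locally:

#include <sched.h>
#include <stdio.h>

#ifndef SCHED_ISO
#define SCHED_ISO 4	/* value added to include/linux/sched.h by this patch */
#endif

int main(void)
{
	/* Like SCHED_NORMAL and SCHED_BATCH, SCHED_ISO takes no rt priority. */
	struct sched_param param = { .sched_priority = 0 };

	if (sched_setscheduler(0, SCHED_ISO, &param) == -1) {
		perror("sched_setscheduler");
		return 1;
	}
	/* ... latency-sensitive work runs here ... */
	return 0;
}

Note that the sched_setscheduler() hunk below also downgrades unprivileged
SCHED_FIFO/SCHED_RR requests to SCHED_ISO instead of failing with EPERM.
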
Index: linux-2.6.21-ck2/include/linux/sched.h
===================================================================
--- linux-2.6.21-ck2.orig/include/linux/sched.h 2007-05-14 19:30:30.000000000 +1000
+++ linux-2.6.21-ck2/include/linux/sched.h 2007-05-14 19:30:31.000000000 +1000
@@ -34,10 +34,11 @@
 #define SCHED_FIFO 1
 #define SCHED_RR 2
 #define SCHED_BATCH 3
+#define SCHED_ISO 4

 #ifdef __KERNEL__

-#define SCHED_MAX SCHED_BATCH
+#define SCHED_MAX SCHED_ISO
 #define SCHED_RANGE(policy) ((policy) <= SCHED_MAX)

 struct sched_param {
@@ -525,15 +526,17 @@ struct signal_struct {
 #define MAX_USER_RT_PRIO 100
 #define MAX_RT_PRIO MAX_USER_RT_PRIO
 #define PRIO_RANGE (40)
+#define ISO_PRIO (MAX_RT_PRIO - 1)

 #define MAX_PRIO (MAX_RT_PRIO + PRIO_RANGE)

-#define rt_prio(prio) unlikely((prio) < MAX_RT_PRIO)
+#define rt_prio(prio) unlikely((prio) < ISO_PRIO)
 #define rt_task(p) rt_prio((p)->prio)
 #define batch_task(p) (unlikely((p)->policy == SCHED_BATCH))
 #define is_rt_policy(policy) ((policy) == SCHED_FIFO || \
                              (policy) == SCHED_RR)
 #define has_rt_policy(p) unlikely(is_rt_policy((p)->policy))
+#define iso_task(p) unlikely((p)->policy == SCHED_ISO)

 /*
  * Some day this will be a full-fledged user tracking system..
@@ -1166,6 +1169,7 @@ static inline void put_task_struct(struc
 #define PF_SWAPWRITE 0x00800000 /* Allowed to write to swap */
 #define PF_SPREAD_PAGE 0x01000000 /* Spread page cache over cpuset */
 #define PF_SPREAD_SLAB 0x02000000 /* Spread some slab caches over cpuset */
+#define PF_ISOREF 0x04000000 /* SCHED_ISO task has used up quota */
 #define PF_MEMPOLICY 0x10000000 /* Non-default NUMA mempolicy */
 #define PF_MUTEX_TESTER 0x20000000 /* Thread belongs to the rt mutex tester */

Index: linux-2.6.21-ck2/kernel/sched.c
===================================================================
--- linux-2.6.21-ck2.orig/kernel/sched.c 2007-05-14 19:30:30.000000000 +1000
+++ linux-2.6.21-ck2/kernel/sched.c 2007-05-14 19:30:31.000000000 +1000
@@ -104,6 +104,18 @@ int rr_interval __read_mostly = 8;
 int sched_interactive __read_mostly = 1;

 /*
+ * sched_iso_cpu - sysctl which determines the cpu percentage SCHED_ISO tasks
+ * are allowed to run (over ISO_PERIOD seconds) as real time tasks.
+ * sched_iso_period - sysctl which determines the number of seconds over
+ * which cpu usage of SCHED_ISO tasks is averaged to determine if they are
+ * exceeding their allowable bandwidth.
+*/
+int sched_iso_cpu __read_mostly = 80;
+int sched_iso_period __read_mostly = 5;
+
+#define ISO_PERIOD ((sched_iso_period * HZ) + 1)
+
+/*
  * This contains a bitmap for each dynamic priority level with empty slots
  * for the valid priorities each different nice level can have. It allows
  * us to stagger the slots where differing priorities run in a way that
@@ -200,6 +212,8 @@ struct rq {

        /* How many times we have rotated the priority queue */
        unsigned long prio_rotation;
+       unsigned long iso_ticks;
+       unsigned short iso_refractory;

        atomic_t nr_iowait;

@@ -790,6 +804,11 @@ static inline void update_if_moved(struc
 }
 #endif

+static inline int isoprio_suitable(struct task_struct *p)
+{
+       return !(p->flags & PF_ISOREF);
+}
+
 /*
  * recalc_task_prio determines what priority a non rt_task will be
  * queued at. If the task has already been running during this runqueue's
@@ -806,6 +825,25 @@ static void recalc_task_prio(struct task
        struct prio_array *array = rq->active;
        int queue_prio;

+       if (iso_task(p)) {
+               if (isoprio_suitable(p)) {
+                       /*
+                        * If SCHED_ISO tasks have not used up their real time
+                        * quota they have run just better than highest
+                        * SCHED_NORMAL priority. Otherwise they run as
+                        * SCHED_NORMAL.
+                        */
+                       p->prio = p->normal_prio = ISO_PRIO;
+                       p->array = rq->active;
+                       if (p->time_slice <= 0)
+                               p->time_slice = p->quota;
+                       return;
+               } else if (p->prio == ISO_PRIO) {
+                       /* Just about to be demoted to SCHED_NORMAL */
+                       p->time_slice = 0;
+               }
+       }
+
        update_if_moved(p, rq);
        if (p->rotation == rq->prio_rotation) {
                if (p->array == array) {
@@ -3180,18 +3218,65 @@ static void task_expired_entitlement(str
        p->time_slice += overrun;
 }

+/*
+ * Test if SCHED_ISO tasks have run longer than their allotted period as RT
+ * tasks and set the refractory flag if necessary. There is 10% hysteresis
+ * for unsetting the flag.
+ */
+static unsigned int test_ret_isorefractory(struct rq *rq)
+{
+       if (likely(!rq->iso_refractory)) {
+               if (rq->iso_ticks / ISO_PERIOD > sched_iso_cpu)
+                       rq->iso_refractory = 1;
+       } else {
+               if (rq->iso_ticks / ISO_PERIOD < (sched_iso_cpu * 90 / 100))
+                       rq->iso_refractory = 0;
+       }
+       return rq->iso_refractory;
+}
+
+/* No SCHED_ISO task was running so decrease rq->iso_ticks */
+static inline void no_iso_tick(struct rq *rq)
+{
+       rq->iso_ticks = rq->iso_ticks * (ISO_PERIOD - 1) / ISO_PERIOD;
+}
+
 /* This manages tasks that have run out of timeslice during a scheduler_tick */
 static void task_running_tick(struct rq *rq, struct task_struct *p)
 {
+       /*
+        * If a SCHED_ISO task is running we increment the iso_ticks. In
+        * order to prevent SCHED_ISO tasks from causing starvation in the
+        * presence of true RT tasks we account those as iso_ticks as well.
+        */
+       if ((rt_task(p) || (iso_task(p) && !rq->iso_refractory))) {
+               if (rq->iso_ticks <= (ISO_PERIOD * 100) - 100)
+                       rq->iso_ticks += 100;
+       } else
+               no_iso_tick(rq);
+
+       if (iso_task(p)) {
+               if (unlikely(test_ret_isorefractory(rq))) {
+                       if (isoprio_suitable(p)) {
+                               /*
+                                * SCHED_ISO task is running as RT and limit
+                                * has been hit. Set the PF_ISOREF flag and
+                                * force it to reschedule as SCHED_NORMAL
+                                * by zeroing its time_slice
+                                */
+                               p->flags |= PF_ISOREF;
+                               p->time_slice = 0;
+                       }
+               } else
+                       p->flags &= ~PF_ISOREF;
+       }
        /* SCHED_FIFO tasks never run out of timeslice. */
        if (p->time_slice > 0 || p->policy == SCHED_FIFO)
                return;
        /* p->time_slice <= 0 */
-       spin_lock(&rq->lock);
+       set_tsk_need_resched(p);
        if (likely(task_queued(p)))
                task_expired_entitlement(rq, p);
-       set_tsk_need_resched(p);
-       spin_unlock(&rq->lock);
 }

 /*
@@ -3207,8 +3292,12 @@ void scheduler_tick(void)

        update_cpu_clock(p, rq, now, 1);

+       spin_lock(&rq->lock);
        if (p != rq->idle)
                task_running_tick(rq, p);
+       else
+               no_iso_tick(rq);
+       spin_unlock(&rq->lock);
 #ifdef CONFIG_SMP
        update_load(rq);
        if (time_after_eq(jiffies, rq->next_balance))
@@ -3285,7 +3374,8 @@ retry:
        }
        queue = array->queue + idx;
        next = list_entry(queue->next, struct task_struct, run_list);
-       if (unlikely(next->time_slice <= 0)) {
+       if (unlikely(next->time_slice <= 0 && !(iso_task(next) &&
+           isoprio_suitable(next)))) {
                /*
                 * Unlucky enough that this task ran out of time_slice
                 * before it hit a scheduler_tick so it should have its
@@ -3377,7 +3467,7 @@ need_resched_nonpreemptible:
        }

        idx = sched_find_first_bit(rq->dyn_bitmap);
-       if (!rt_prio(idx))
+       if (likely(idx > ISO_PRIO))
                next = next_dynamic_task(rq, idx);
        else {
                queue = rq->active->queue + idx;
@@ -4042,12 +4132,22 @@ static void __setscheduler(struct task_s
 int sched_setscheduler(struct task_struct *p, int policy,
                       struct sched_param *param)
 {
+       struct sched_param zero_param = { .sched_priority = 0 };
        int queued, retval, oldprio, oldpolicy = -1;
        unsigned long flags;
        struct rq *rq;

        /* may grab non-irq protected spin_locks */
        BUG_ON(in_interrupt());
+       if (is_rt_policy(policy) && !capable(CAP_SYS_NICE)) {
+               /*
+                * If the caller requested an RT policy without having the
+                * necessary rights, we downgrade the policy to SCHED_ISO.
+                * We also set the parameter to zero to pass the checks.
+                */
+               policy = SCHED_ISO;
+               param = &zero_param;
+       }
 recheck:
        /* double check policy once rq lock held */
        if (policy < 0)
@@ -4577,6 +4677,7 @@ asmlinkage long sys_sched_get_priority_m
                break;
        case SCHED_NORMAL:
        case SCHED_BATCH:
+       case SCHED_ISO:
                ret = 0;
                break;
        }
@@ -4601,6 +4702,7 @@ asmlinkage long sys_sched_get_priority_m
                break;
        case SCHED_NORMAL:
        case SCHED_BATCH:
+       case SCHED_ISO:
                ret = 0;
        }
        return ret;
@@ -6708,6 +6810,7 @@ void __init sched_init(void)
                rq = cpu_rq(i);
                spin_lock_init(&rq->lock);
                lockdep_set_class(&rq->lock, &rq->rq_lock_key);
+               rq->iso_ticks = 0;
                rq->nr_running = 0;
                rq->prio_rotation = 0;
                rq->active = rq->arrays;
@@ -6801,7 +6904,7 @@ void normalize_rt_tasks(void)

        read_lock_irq(&tasklist_lock);
        for_each_process(p) {
-               if (!rt_task(p))
+               if (!rt_task(p) && !iso_task(p))
                        continue;

                spin_lock_irqsave(&p->pi_lock, flags);
Index: linux-2.6.21-ck2/Documentation/sysctl/kernel.txt
===================================================================
--- linux-2.6.21-ck2.orig/Documentation/sysctl/kernel.txt 2007-05-14 19:30:30.000000000 +1000
+++ linux-2.6.21-ck2/Documentation/sysctl/kernel.txt 2007-05-14 19:30:31.000000000 +1000
@@ -26,6 +26,8 @@ show up in /proc/sys/kernel:
 - hostname
 - hotplug
 - interactive
+- iso_cpu
+- iso_period
 - java-appletviewer [ binfmt_java, obsolete ]
 - java-interpreter [ binfmt_java, obsolete ]
 - kstack_depth_to_print [ X86 only ]
@@ -181,6 +183,25 @@ Default value is 1 (enabled).

 ==============================================================

+iso_cpu:
+
+This sets the percentage cpu that the unprivileged SCHED_ISO tasks can
+run effectively at realtime priority, averaged over a rolling iso_period
+seconds.
+
+Set to 80 (percent) by default.
+
+==============================================================
+
+iso_period:
+
+This sets the number of seconds over which SCHED_ISO cpu usage is averaged
+to see if it exceeds its allocated cpu bandwidth.
+
+Set to 5 (seconds) by default.
+
+==============================================================
+
 l2cr: (PPC only)

 This flag controls the L2 cache of G3 processor boards. If
Index: linux-2.6.21-ck2/kernel/sysctl.c
===================================================================
--- linux-2.6.21-ck2.orig/kernel/sysctl.c 2007-05-14 19:30:30.000000000 +1000
+++ linux-2.6.21-ck2/kernel/sysctl.c 2007-05-14 19:30:31.000000000 +1000
@@ -78,6 +78,8 @@ extern int percpu_pagelist_fraction;
 extern int compat_log;
 extern int rr_interval;
 extern int sched_interactive;
+extern int sched_iso_cpu;
+extern int sched_iso_period;

 /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */
 static int maxolduid = 65535;
@@ -528,6 +530,28 @@ static ctl_table kern_table[] = {
                .mode = 0644,
                .proc_handler = &proc_dointvec,
        },
+       {
+               .ctl_name = CTL_UNNUMBERED,
+               .procname = "iso_cpu",
+               .data = &sched_iso_cpu,
+               .maxlen = sizeof (int),
+               .mode = 0644,
+               .proc_handler = &proc_dointvec_minmax,
+               .strategy = &sysctl_intvec,
+               .extra1 = &zero,
+               .extra2 = &one_hundred,
+       },
+       {
+               .ctl_name = CTL_UNNUMBERED,
+               .procname = "iso_period",
+               .data = &sched_iso_period,
+               .maxlen = sizeof (int),
+               .mode = 0644,
+               .proc_handler = &proc_dointvec_minmax,
+               .strategy = &sysctl_intvec,
+               .extra1 = &one,
+               .extra2 = &one_hundred,
+       },
 #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86)
        {
                .ctl_name = KERN_UNKNOWN_NMI_PANIC,