Magellan Linux

Contents of /trunk/kernel26-alx/patches-2.6.17-r5/0007-2.6.17-sched-iso-4.5.patch

Parent Directory Parent Directory | Revision Log Revision Log


Revision 199 - (show annotations) (download)
Fri May 18 11:04:36 2007 UTC (16 years, 11 months ago) by niro
File size: 10402 byte(s)
-import

1 Add the SCHED_ISO policy (isochronous) which is a starvation free soft
2 realtime policy available to unprivileged users. The amount of cpu that
3 SCHED_ISO tasks will run as realtime is configurable by the tunable in
4
5 /proc/sys/kernel/iso_cpu
6
7 and is set to 80% (over 5 seconds) by default.
8
9 Signed-off-by: Con Kolivas <kernel@kolivas.org>
10
11 Documentation/sysctl/kernel.txt | 9 ++++
12 include/linux/sched.h | 10 +++--
13 include/linux/sysctl.h | 1
14 kernel/sched.c | 77 ++++++++++++++++++++++++++++++++++++----
15 kernel/sysctl.c | 22 ++++++++---
16 5 files changed, 104 insertions(+), 15 deletions(-)
17
18 Index: linux-ck-dev/include/linux/sched.h
19 ===================================================================
20 --- linux-ck-dev.orig/include/linux/sched.h 2006-06-18 15:23:35.000000000 +1000
21 +++ linux-ck-dev/include/linux/sched.h 2006-06-18 15:23:38.000000000 +1000
22 @@ -164,9 +164,10 @@ extern unsigned long weighted_cpuload(co
23 #define SCHED_FIFO 1
24 #define SCHED_RR 2
25 #define SCHED_BATCH 3
26 +#define SCHED_ISO 4
27
28 #define SCHED_MIN 0
29 -#define SCHED_MAX 3
30 +#define SCHED_MAX 4
31
32 #define SCHED_RANGE(policy) ((policy) <= SCHED_MAX)
33 #define SCHED_RT(policy) ((policy) == SCHED_FIFO || \
34 @@ -209,7 +210,7 @@ extern void show_stack(struct task_struc
35
36 void io_schedule(void);
37 long io_schedule_timeout(long timeout);
38 -extern int sched_interactive, sched_compute;
39 +extern int sched_interactive, sched_compute, sched_iso_cpu;
40
41 extern void cpu_init (void);
42 extern void trap_init(void);
43 @@ -489,12 +490,14 @@ struct signal_struct {
44
45 #define MAX_USER_RT_PRIO 100
46 #define MAX_RT_PRIO MAX_USER_RT_PRIO
47 +#define ISO_PRIO (MAX_RT_PRIO - 1)
48
49 #define MAX_PRIO (MAX_RT_PRIO + 40)
50 #define MIN_USER_PRIO (MAX_PRIO - 1)
51
52 -#define rt_task(p) (unlikely((p)->prio < MAX_RT_PRIO))
53 +#define rt_task(p) (unlikely(SCHED_RT((p)->policy)))
54 #define batch_task(p) (unlikely((p)->policy == SCHED_BATCH))
55 +#define iso_task(p) (unlikely((p)->policy == SCHED_ISO))
56
57 /*
58 * Some day this will be a full-fledged user tracking system..
59 @@ -954,6 +957,7 @@ static inline void put_task_struct(struc
60 #define PF_MEMPOLICY 0x10000000 /* Non-default NUMA mempolicy */
61 #define PF_NONSLEEP 0x20000000 /* Waiting on in kernel activity */
62 #define PF_FORKED 0x40000000 /* Task just forked another process */
63 +#define PF_ISOREF 0x80000000 /* SCHED_ISO task has used up quota */
64
65 /*
66 * Only the _current_ task can read/write to tsk->flags, but other
67 Index: linux-ck-dev/include/linux/sysctl.h
68 ===================================================================
69 --- linux-ck-dev.orig/include/linux/sysctl.h 2006-06-18 15:23:21.000000000 +1000
70 +++ linux-ck-dev/include/linux/sysctl.h 2006-06-18 15:23:38.000000000 +1000
71 @@ -150,6 +150,7 @@ enum
72 KERN_IA64_UNALIGNED=72, /* int: ia64 unaligned userland trap enable */
73 KERN_INTERACTIVE=73, /* interactive tasks can have cpu bursts */
74 KERN_COMPUTE=74, /* adjust timeslices for a compute server */
75 + KERN_ISO_CPU=75, /* percent cpu SCHED_ISO tasks run SCHED_RR */
76 };
77
78
79 Index: linux-ck-dev/kernel/sched.c
80 ===================================================================
81 --- linux-ck-dev.orig/kernel/sched.c 2006-06-18 15:23:35.000000000 +1000
82 +++ linux-ck-dev/kernel/sched.c 2006-06-18 15:23:38.000000000 +1000
83 @@ -62,10 +62,14 @@
84 * raise its priority.
85 * sched_compute - sysctl which enables long timeslices and delayed preemption
86 * for compute server usage.
87 + * sched_iso_cpu - sysctl which determines the cpu percentage SCHED_ISO tasks
88 + * are allowed to run (over ISO_PERIOD ticks) as real time tasks.
89 */
90 int sched_interactive __read_mostly = 1;
91 int sched_compute __read_mostly;
92 +int sched_iso_cpu __read_mostly = 80;
93
94 +#define ISO_PERIOD (5 * HZ)
95 /*
96 * CACHE_DELAY is the time preemption is delayed in sched_compute mode
97 * and is set to a nominal 10ms.
98 @@ -146,6 +150,9 @@ struct runqueue {
99
100 unsigned long long timestamp_last_tick;
101 unsigned short cache_ticks, preempted;
102 + unsigned long iso_ticks;
103 + unsigned short iso_refractory;
104 +
105 task_t *curr, *idle;
106 struct mm_struct *prev_mm;
107 unsigned long bitmap[BITS_TO_LONGS(MAX_PRIO + 1)];
108 @@ -742,6 +749,17 @@ static int effective_prio(const task_t *
109 if (rt_task(p))
110 return p->prio;
111
112 + if (iso_task(p)) {
113 + if (likely(!(p->flags & PF_ISOREF)))
114 + /*
115 + * If SCHED_ISO tasks have not used up their real time
116 + * quota they run just better than the highest
117 + * SCHED_NORMAL priority. Otherwise they run as
118 + * SCHED_NORMAL.
119 + */
120 + return ISO_PRIO;
121 + }
122 +
123 full_slice = slice(p);
124 if (full_slice > p->slice)
125 used_slice = full_slice - p->slice;
126 @@ -2632,6 +2650,22 @@ static void time_slice_expired(task_t *p
127 }
128
129 /*
130 + * Test if SCHED_ISO tasks have run longer than their allotted period as RT
131 + * tasks and set the refractory flag if necessary. There is 10% hysteresis
132 + * for unsetting the flag.
133 + */
134 +static inline unsigned int test_ret_isorefractory(runqueue_t *rq)
135 +{
136 + if (likely(!rq->iso_refractory)) {
137 + if (rq->iso_ticks / ISO_PERIOD > sched_iso_cpu)
138 + rq->iso_refractory = 1;
139 + } else
140 + if (rq->iso_ticks / ISO_PERIOD < (sched_iso_cpu * 90 / 100))
141 + rq->iso_refractory = 0;
142 + return rq->iso_refractory;
143 +}
144 +
145 +/*
146 * This function gets called by the timer code, with HZ frequency.
147 * We call it with interrupts disabled.
148 */
149 @@ -2659,11 +2693,29 @@ void scheduler_tick(void)
150 set_tsk_need_resched(p);
151 goto out;
152 }
153 - /* SCHED_FIFO tasks never run out of timeslice. */
154 - if (unlikely(p->policy == SCHED_FIFO))
155 - goto out;
156
157 spin_lock(&rq->lock);
158 + if (unlikely((rt_task(p) || (iso_task(p) && !rq->iso_refractory)) &&
159 + p->mm)) {
160 + if (rq->iso_ticks <= (ISO_PERIOD * 100) - 100)
161 + rq->iso_ticks += 100;
162 + } else
163 + rq->iso_ticks = rq->iso_ticks * (ISO_PERIOD - 1) / ISO_PERIOD;
164 +
165 + if (iso_task(p)) {
166 + if (unlikely(test_ret_isorefractory(rq))) {
167 + if (!(p->flags & PF_ISOREF)) {
168 + set_tsk_need_resched(p);
169 + p->flags |= PF_ISOREF;
170 + }
171 + } else
172 + p->flags &= ~PF_ISOREF;
173 + } else
174 + /* SCHED_FIFO tasks never run out of timeslice. */
175 + if (unlikely(p->policy == SCHED_FIFO))
176 + goto out_unlock;
177 +
178 +
179 debit = ns_diff(rq->timestamp_last_tick, p->timestamp);
180 p->ns_debit += debit;
181 if (p->ns_debit < NSJIFFY)
182 @@ -2758,7 +2810,7 @@ static int dependent_sleeper(int this_cp
183 int ret = 0, i;
184
185 /* kernel/rt threads do not participate in dependent sleeping */
186 - if (!p->mm || rt_task(p))
187 + if (!p->mm || rt_task(p) || iso_task(p))
188 return 0;
189
190 for_each_domain(this_cpu, tmp) {
191 @@ -2795,7 +2847,7 @@ static int dependent_sleeper(int this_cp
192 * task from using an unfair proportion of the
193 * physical cpu's resources. -ck
194 */
195 - if (rt_task(smt_curr)) {
196 + if (rt_task(smt_curr) || iso_task(smt_curr)) {
197 /*
198 * With real time tasks we run non-rt tasks only
199 * per_cpu_gain% of the time.
200 @@ -3567,9 +3619,19 @@ int sched_setscheduler(struct task_struc
201 {
202 int retval;
203 int queued, oldprio, oldpolicy = -1;
204 + struct sched_param zero_param = { .sched_priority = 0 };
205 unsigned long flags;
206 runqueue_t *rq;
207
208 + if (SCHED_RT(policy) && !capable(CAP_SYS_NICE)) {
209 + /*
210 + * If the caller requested an RT policy without having the
211 + * necessary rights, we downgrade the policy to SCHED_ISO.
212 + * We also set the parameter to zero to pass the checks.
213 + */
214 + policy = SCHED_ISO;
215 + param = &zero_param;
216 + }
217 recheck:
218 /* double check policy once rq lock held */
219 if (policy < 0)
220 @@ -4063,6 +4125,7 @@ asmlinkage long sys_sched_get_priority_m
221 break;
222 case SCHED_NORMAL:
223 case SCHED_BATCH:
224 + case SCHED_ISO:
225 ret = 0;
226 break;
227 }
228 @@ -4087,6 +4150,7 @@ asmlinkage long sys_sched_get_priority_m
229 break;
230 case SCHED_NORMAL:
231 case SCHED_BATCH:
232 + case SCHED_ISO:
233 ret = 0;
234 }
235 return ret;
236 @@ -5992,7 +6056,8 @@ void __init sched_init(void)
237
238 rq = cpu_rq(i);
239 spin_lock_init(&rq->lock);
240 - rq->nr_running = rq->cache_ticks = rq->preempted = 0;
241 + rq->nr_running = rq->cache_ticks = rq->preempted =
242 + rq->iso_ticks = 0;
243
244 #ifdef CONFIG_SMP
245 rq->sd = NULL;
246 Index: linux-ck-dev/kernel/sysctl.c
247 ===================================================================
248 --- linux-ck-dev.orig/kernel/sysctl.c 2006-06-18 15:23:21.000000000 +1000
249 +++ linux-ck-dev/kernel/sysctl.c 2006-06-18 15:23:38.000000000 +1000
250 @@ -229,6 +229,11 @@ static ctl_table root_table[] = {
251 { .ctl_name = 0 }
252 };
253
254 +/* Constants for minimum and maximum testing.
255 + We use these as one-element integer vectors. */
256 +static int zero;
257 +static int one_hundred = 100;
258 +
259 static ctl_table kern_table[] = {
260 {
261 .ctl_name = KERN_OSTYPE,
262 @@ -639,6 +644,17 @@ static ctl_table kern_table[] = {
263 .mode = 0644,
264 .proc_handler = &proc_dointvec,
265 },
266 + {
267 + .ctl_name = KERN_ISO_CPU,
268 + .procname = "iso_cpu",
269 + .data = &sched_iso_cpu,
270 + .maxlen = sizeof (int),
271 + .mode = 0644,
272 + .proc_handler = &proc_dointvec_minmax,
273 + .strategy = &sysctl_intvec,
274 + .extra1 = &zero,
275 + .extra2 = &one_hundred,
276 + },
277 #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86)
278 {
279 .ctl_name = KERN_UNKNOWN_NMI_PANIC,
280 @@ -702,12 +718,6 @@ static ctl_table kern_table[] = {
281 { .ctl_name = 0 }
282 };
283
284 -/* Constants for minimum and maximum testing in vm_table.
285 - We use these as one-element integer vectors. */
286 -static int zero;
287 -static int one_hundred = 100;
288 -
289 -
290 static ctl_table vm_table[] = {
291 {
292 .ctl_name = VM_OVERCOMMIT_MEMORY,
293 Index: linux-ck-dev/Documentation/sysctl/kernel.txt
294 ===================================================================
295 --- linux-ck-dev.orig/Documentation/sysctl/kernel.txt 2006-06-18 15:23:21.000000000 +1000
296 +++ linux-ck-dev/Documentation/sysctl/kernel.txt 2006-06-18 15:23:38.000000000 +1000
297 @@ -27,6 +27,7 @@ show up in /proc/sys/kernel:
298 - hostname
299 - hotplug
300 - interactive
301 +- iso_cpu
302 - java-appletviewer [ binfmt_java, obsolete ]
303 - java-interpreter [ binfmt_java, obsolete ]
304 - l2cr [ PPC only ]
305 @@ -182,6 +183,14 @@ are obeyed if this tunable is disabled.
306
307 ==============================================================
308
309 +iso_cpu:
310 +
311 +This sets the percentage cpu that the unprivileged SCHED_ISO tasks can
312 +run effectively at realtime priority, averaged over a rolling 5 seconds.
313 +Set to 80% by default.
314 +
315 +==============================================================
316 +
317 l2cr: (PPC only)
318
319 This flag controls the L2 cache of G3 processor boards. If