Contents of /trunk/kernel26-alx/patches-2.6.20-r6/0005-2.6.20-sched-iso-4.7.patch
Parent Directory | Revision Log
Revision 1175 -
(show annotations)
(download)
Thu Oct 14 12:15:46 2010 UTC (13 years, 11 months ago) by niro
File size: 10451 byte(s)
-2.6.20-alx-r6 new magellan 0.5.2 kernel
1 | Add the SCHED_ISO policy (isochronous) which is a starvation free soft |
2 | realtime policy available to unprivileged users. The amount of cpu that |
3 | SCHED_ISO tasks will run as realtime is configurable by the tunable in |
4 | |
5 | /proc/sys/kernel/iso_cpu |
6 | |
 7 | and is set to 80% (over 5 seconds) by default. |
8 | |
9 | Signed-off-by: Con Kolivas <kernel@kolivas.org> |
10 | |
11 | Documentation/sysctl/kernel.txt | 9 ++++ |
12 | include/linux/sched.h | 10 +++-- |
13 | kernel/sched.c | 77 ++++++++++++++++++++++++++++++++++++---- |
14 | kernel/sysctl.c | 25 +++++++++--- |
15 | 4 files changed, 106 insertions(+), 15 deletions(-) |
16 | |
17 | Index: linux-2.6.20-ck1/include/linux/sched.h |
18 | =================================================================== |
19 | --- linux-2.6.20-ck1.orig/include/linux/sched.h 2007-02-16 19:01:30.000000000 +1100 |
20 | +++ linux-2.6.20-ck1/include/linux/sched.h 2007-02-16 19:01:31.000000000 +1100 |
21 | @@ -34,10 +34,11 @@ |
22 | #define SCHED_FIFO 1 |
23 | #define SCHED_RR 2 |
24 | #define SCHED_BATCH 3 |
25 | +#define SCHED_ISO 4 |
26 | |
27 | #ifdef __KERNEL__ |
28 | |
29 | -#define SCHED_MAX SCHED_BATCH |
30 | +#define SCHED_MAX SCHED_ISO |
31 | #define SCHED_RANGE(policy) ((policy) <= SCHED_MAX) |
32 | |
33 | struct sched_param { |
34 | @@ -219,7 +220,7 @@ extern void show_stack(struct task_struc |
35 | |
36 | void io_schedule(void); |
37 | long io_schedule_timeout(long timeout); |
38 | -extern int sched_interactive, sched_compute; |
39 | +extern int sched_interactive, sched_compute, sched_iso_cpu; |
40 | |
41 | extern void cpu_init (void); |
42 | extern void trap_init(void); |
43 | @@ -526,16 +527,18 @@ struct signal_struct { |
44 | |
45 | #define MAX_USER_RT_PRIO 100 |
46 | #define MAX_RT_PRIO MAX_USER_RT_PRIO |
47 | +#define ISO_PRIO (MAX_RT_PRIO - 1) |
48 | |
49 | #define MAX_PRIO (MAX_RT_PRIO + 40) |
50 | #define MIN_USER_PRIO (MAX_PRIO - 1) |
51 | |
52 | -#define rt_prio(prio) unlikely((prio) < MAX_RT_PRIO) |
53 | +#define rt_prio(prio) unlikely((prio) < ISO_PRIO) |
54 | #define rt_task(p) rt_prio((p)->prio) |
55 | #define batch_task(p) (unlikely((p)->policy == SCHED_BATCH)) |
56 | #define is_rt_policy(policy) ((policy) == SCHED_FIFO || \ |
57 | (policy) == SCHED_RR) |
58 | #define has_rt_policy(p) unlikely(is_rt_policy((p)->policy)) |
59 | +#define iso_task(p) (unlikely((p)->policy == SCHED_ISO)) |
60 | |
61 | /* |
62 | * Some day this will be a full-fledged user tracking system.. |
63 | @@ -1151,6 +1154,7 @@ static inline void put_task_struct(struc |
64 | #define PF_SWAPWRITE 0x00800000 /* Allowed to write to swap */ |
65 | #define PF_SPREAD_PAGE 0x01000000 /* Spread page cache over cpuset */ |
66 | #define PF_SPREAD_SLAB 0x02000000 /* Spread some slab caches over cpuset */ |
67 | +#define PF_ISOREF 0x04000000 /* SCHED_ISO task has used up quota */ |
68 | #define PF_MEMPOLICY 0x10000000 /* Non-default NUMA mempolicy */ |
69 | #define PF_MUTEX_TESTER 0x20000000 /* Thread belongs to the rt mutex tester */ |
70 | #define PF_NONSLEEP 0x40000000 /* Waiting on in kernel activity */ |
71 | Index: linux-2.6.20-ck1/kernel/sched.c |
72 | =================================================================== |
73 | --- linux-2.6.20-ck1.orig/kernel/sched.c 2007-02-16 19:01:30.000000000 +1100 |
74 | +++ linux-2.6.20-ck1/kernel/sched.c 2007-02-16 19:01:31.000000000 +1100 |
75 | @@ -65,10 +65,14 @@ |
76 | * raise its priority. |
77 | * sched_compute - sysctl which enables long timeslices and delayed preemption |
78 | * for compute server usage. |
79 | + * sched_iso_cpu - sysctl which determines the cpu percentage SCHED_ISO tasks |
80 | + * are allowed to run (over ISO_PERIOD seconds) as real time tasks. |
81 | */ |
82 | int sched_interactive __read_mostly = 1; |
83 | int sched_compute __read_mostly; |
84 | +int sched_iso_cpu __read_mostly = 80; |
85 | |
86 | +#define ISO_PERIOD (5 * HZ) |
87 | /* |
88 | * CACHE_DELAY is the time preemption is delayed in sched_compute mode |
89 | * and is set to a nominal 10ms. |
90 | @@ -143,6 +147,8 @@ struct rq { |
91 | /* Cached timestamp set by update_cpu_clock() */ |
92 | unsigned long long most_recent_timestamp; |
93 | unsigned short cache_ticks, preempted; |
94 | + unsigned long iso_ticks; |
95 | + unsigned short iso_refractory; |
96 | struct task_struct *curr, *idle; |
97 | unsigned long next_balance; |
98 | struct mm_struct *prev_mm; |
99 | @@ -878,6 +884,17 @@ static inline int __normal_prio(struct t |
100 | unsigned int full_slice, used_slice = 0; |
101 | unsigned int best_bonus, rr; |
102 | |
103 | + if (iso_task(p)) { |
104 | + if (likely(!(p->flags & PF_ISOREF))) |
105 | + /* |
106 | + * If SCHED_ISO tasks have not used up their real time |
107 | + * quota they have run just better than highest |
108 | + * SCHED_NORMAL priority. Otherwise they run as |
109 | + * SCHED_NORMAL. |
110 | + */ |
111 | + return ISO_PRIO; |
112 | + } |
113 | + |
114 | full_slice = slice(p); |
115 | if (full_slice > p->slice) |
116 | used_slice = full_slice - p->slice; |
117 | @@ -2990,6 +3007,23 @@ static void time_slice_expired(struct ta |
118 | requeue_task(p, rq, effective_prio(p)); |
119 | } |
120 | |
121 | +/* |
122 | + * Test if SCHED_ISO tasks have run longer than their allotted period as RT |
123 | + * tasks and set the refractory flag if necessary. There is 10% hysteresis |
124 | + * for unsetting the flag. |
125 | + */ |
126 | +static inline unsigned int test_ret_isorefractory(struct rq *rq) |
127 | +{ |
128 | + if (likely(!rq->iso_refractory)) { |
129 | + if (rq->iso_ticks / ISO_PERIOD > sched_iso_cpu) |
130 | + rq->iso_refractory = 1; |
131 | + } else { |
132 | + if (rq->iso_ticks / ISO_PERIOD < (sched_iso_cpu * 90 / 100)) |
133 | + rq->iso_refractory = 0; |
134 | + } |
135 | + return rq->iso_refractory; |
136 | +} |
137 | + |
138 | static void task_running_tick(struct rq *rq, struct task_struct *p) |
139 | { |
140 | unsigned long debit; |
141 | @@ -2999,11 +3033,29 @@ static void task_running_tick(struct rq |
142 | set_tsk_need_resched(p); |
143 | return; |
144 | } |
145 | - /* SCHED_FIFO tasks never run out of timeslice. */ |
146 | - if (unlikely(p->policy == SCHED_FIFO)) |
147 | - return; |
148 | |
149 | spin_lock(&rq->lock); |
150 | + if (unlikely((rt_task(p) || (iso_task(p) && !rq->iso_refractory)) && |
151 | + p->mm)) { |
152 | + if (rq->iso_ticks <= (ISO_PERIOD * 100) - 100) |
153 | + rq->iso_ticks += 100; |
154 | + } else |
155 | + rq->iso_ticks = rq->iso_ticks * (ISO_PERIOD - 1) / ISO_PERIOD; |
156 | + |
157 | + if (iso_task(p)) { |
158 | + if (unlikely(test_ret_isorefractory(rq))) { |
159 | + if (!(p->flags & PF_ISOREF)) { |
160 | + set_tsk_need_resched(p); |
161 | + p->flags |= PF_ISOREF; |
162 | + } |
163 | + } else |
164 | + p->flags &= ~PF_ISOREF; |
165 | + } else { |
166 | + /* SCHED_FIFO tasks never run out of timeslice. */ |
167 | + if (unlikely(p->policy == SCHED_FIFO)) |
168 | + goto out_unlock; |
169 | + } |
170 | + |
171 | debit = ns_diff(rq->most_recent_timestamp, p->timestamp); |
172 | p->ns_debit += debit; |
173 | if (p->ns_debit < NSJIFFY) |
174 | @@ -3122,7 +3174,7 @@ dependent_sleeper(int this_cpu, struct r |
175 | int ret = 0, i; |
176 | |
177 | /* kernel/rt threads do not participate in dependent sleeping */ |
178 | - if (!p->mm || rt_task(p)) |
179 | + if (!p->mm || rt_task(p) || iso_task(p)) |
180 | return 0; |
181 | |
182 | for_each_domain(this_cpu, tmp) { |
183 | @@ -3159,7 +3211,7 @@ dependent_sleeper(int this_cpu, struct r |
184 | * task from using an unfair proportion of the |
185 | * physical cpu's resources. -ck |
186 | */ |
187 | - if (rt_task(smt_curr)) { |
188 | + if (rt_task(smt_curr) || iso_task(smt_curr)) { |
189 | /* |
190 | * With real time tasks we run non-rt tasks only |
191 | * per_cpu_gain% of the time. |
192 | @@ -3971,12 +4023,22 @@ static void __setscheduler(struct task_s |
193 | int sched_setscheduler(struct task_struct *p, int policy, |
194 | struct sched_param *param) |
195 | { |
196 | + struct sched_param zero_param = { .sched_priority = 0 }; |
197 | int queued, retval, oldprio, oldpolicy = -1; |
198 | unsigned long flags; |
199 | struct rq *rq; |
200 | |
201 | /* may grab non-irq protected spin_locks */ |
202 | BUG_ON(in_interrupt()); |
203 | + if (is_rt_policy(policy) && !capable(CAP_SYS_NICE)) { |
204 | + /* |
205 | + * If the caller requested an RT policy without having the |
206 | + * necessary rights, we downgrade the policy to SCHED_ISO. |
207 | + * We also set the parameter to zero to pass the checks. |
208 | + */ |
209 | + policy = SCHED_ISO; |
210 | + param = &zero_param; |
211 | + } |
212 | recheck: |
213 | /* double check policy once rq lock held */ |
214 | if (policy < 0) |
215 | @@ -4501,6 +4563,7 @@ asmlinkage long sys_sched_get_priority_m |
216 | break; |
217 | case SCHED_NORMAL: |
218 | case SCHED_BATCH: |
219 | + case SCHED_ISO: |
220 | ret = 0; |
221 | break; |
222 | } |
223 | @@ -4525,6 +4588,7 @@ asmlinkage long sys_sched_get_priority_m |
224 | break; |
225 | case SCHED_NORMAL: |
226 | case SCHED_BATCH: |
227 | + case SCHED_ISO: |
228 | ret = 0; |
229 | } |
230 | return ret; |
231 | @@ -6647,7 +6711,8 @@ void __init sched_init(void) |
232 | rq = cpu_rq(i); |
233 | spin_lock_init(&rq->lock); |
234 | lockdep_set_class(&rq->lock, &rq->rq_lock_key); |
235 | - rq->nr_running = rq->cache_ticks = rq->preempted = 0; |
236 | + rq->nr_running = rq->cache_ticks = rq->preempted = |
237 | + rq->iso_ticks = 0; |
238 | |
239 | #ifdef CONFIG_SMP |
240 | rq->sd = NULL; |
241 | Index: linux-2.6.20-ck1/kernel/sysctl.c |
242 | =================================================================== |
243 | --- linux-2.6.20-ck1.orig/kernel/sysctl.c 2007-02-16 19:01:30.000000000 +1100 |
244 | +++ linux-2.6.20-ck1/kernel/sysctl.c 2007-02-16 19:01:31.000000000 +1100 |
245 | @@ -273,6 +273,14 @@ static ctl_table root_table[] = { |
246 | { .ctl_name = 0 } |
247 | }; |
248 | |
249 | + |
250 | +/* |
251 | + * Constants for minimum and maximum testing. |
252 | + * We use these as one-element integer vectors. |
253 | + */ |
254 | +static int zero; |
255 | +static int one_hundred = 100; |
256 | + |
257 | static ctl_table kern_table[] = { |
258 | { |
259 | .ctl_name = KERN_OSTYPE, |
260 | @@ -692,6 +700,17 @@ static ctl_table kern_table[] = { |
261 | .mode = 0644, |
262 | .proc_handler = &proc_dointvec, |
263 | }, |
264 | + { |
265 | + .ctl_name = CTL_UNNUMBERED, |
266 | + .procname = "iso_cpu", |
267 | + .data = &sched_iso_cpu, |
268 | + .maxlen = sizeof (int), |
269 | + .mode = 0644, |
270 | + .proc_handler = &proc_dointvec_minmax, |
271 | + .strategy = &sysctl_intvec, |
272 | + .extra1 = &zero, |
273 | + .extra2 = &one_hundred, |
274 | + }, |
275 | #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) |
276 | { |
277 | .ctl_name = KERN_UNKNOWN_NMI_PANIC, |
278 | @@ -800,12 +819,6 @@ static ctl_table kern_table[] = { |
279 | { .ctl_name = 0 } |
280 | }; |
281 | |
282 | -/* Constants for minimum and maximum testing in vm_table. |
283 | - We use these as one-element integer vectors. */ |
284 | -static int zero; |
285 | -static int one_hundred = 100; |
286 | - |
287 | - |
288 | static ctl_table vm_table[] = { |
289 | { |
290 | .ctl_name = VM_OVERCOMMIT_MEMORY, |
291 | Index: linux-2.6.20-ck1/Documentation/sysctl/kernel.txt |
292 | =================================================================== |
293 | --- linux-2.6.20-ck1.orig/Documentation/sysctl/kernel.txt 2007-02-16 19:01:30.000000000 +1100 |
294 | +++ linux-2.6.20-ck1/Documentation/sysctl/kernel.txt 2007-02-16 19:01:31.000000000 +1100 |
295 | @@ -27,6 +27,7 @@ show up in /proc/sys/kernel: |
296 | - hostname |
297 | - hotplug |
298 | - interactive |
299 | +- iso_cpu |
300 | - java-appletviewer [ binfmt_java, obsolete ] |
301 | - java-interpreter [ binfmt_java, obsolete ] |
302 | - kstack_depth_to_print [ X86 only ] |
303 | @@ -185,6 +186,14 @@ are obeyed if this tunable is disabled. |
304 | |
305 | ============================================================== |
306 | |
307 | +iso_cpu: |
308 | + |
309 | +This sets the percentage cpu that the unprivileged SCHED_ISO tasks can |
310 | +run effectively at realtime priority, averaged over a rolling 5 seconds. |
311 | +Set to 80% by default. |
312 | + |
313 | +============================================================== |
314 | + |
315 | l2cr: (PPC only) |
316 | |
317 | This flag controls the L2 cache of G3 processor boards. If |