Annotation of /trunk/kernel26-alx/patches-2.6.20-r5/0005-2.6.20-sched-iso-4.7.patch
Parent Directory | Revision Log
Revision 199 -
(hide annotations)
(download)
Fri May 18 11:04:36 2007 UTC (17 years, 4 months ago) by niro
File size: 10451 byte(s)
-import
1 | niro | 199 | Add the SCHED_ISO policy (isochronous) which is a starvation free soft |
2 | realtime policy available to unprivileged users. The amount of cpu that | ||
3 | SCHED_ISO tasks will run as realtime is configurable by the tunable in | ||
4 | |||
5 | /proc/sys/kernel/iso_cpu | ||
6 | |||
7 | and is set to 80% (over 5 seconds) by default. | ||
8 | |||
9 | Signed-off-by: Con Kolivas <kernel@kolivas.org> | ||
10 | |||
11 | Documentation/sysctl/kernel.txt | 9 ++++ | ||
12 | include/linux/sched.h | 10 +++-- | ||
13 | kernel/sched.c | 77 ++++++++++++++++++++++++++++++++++++---- | ||
14 | kernel/sysctl.c | 25 +++++++++--- | ||
15 | 4 files changed, 106 insertions(+), 15 deletions(-) | ||
16 | |||
17 | Index: linux-2.6.20-ck1/include/linux/sched.h | ||
18 | =================================================================== | ||
19 | --- linux-2.6.20-ck1.orig/include/linux/sched.h 2007-02-16 19:01:30.000000000 +1100 | ||
20 | +++ linux-2.6.20-ck1/include/linux/sched.h 2007-02-16 19:01:31.000000000 +1100 | ||
21 | @@ -34,10 +34,11 @@ | ||
22 | #define SCHED_FIFO 1 | ||
23 | #define SCHED_RR 2 | ||
24 | #define SCHED_BATCH 3 | ||
25 | +#define SCHED_ISO 4 | ||
26 | |||
27 | #ifdef __KERNEL__ | ||
28 | |||
29 | -#define SCHED_MAX SCHED_BATCH | ||
30 | +#define SCHED_MAX SCHED_ISO | ||
31 | #define SCHED_RANGE(policy) ((policy) <= SCHED_MAX) | ||
32 | |||
33 | struct sched_param { | ||
34 | @@ -219,7 +220,7 @@ extern void show_stack(struct task_struc | ||
35 | |||
36 | void io_schedule(void); | ||
37 | long io_schedule_timeout(long timeout); | ||
38 | -extern int sched_interactive, sched_compute; | ||
39 | +extern int sched_interactive, sched_compute, sched_iso_cpu; | ||
40 | |||
41 | extern void cpu_init (void); | ||
42 | extern void trap_init(void); | ||
43 | @@ -526,16 +527,18 @@ struct signal_struct { | ||
44 | |||
45 | #define MAX_USER_RT_PRIO 100 | ||
46 | #define MAX_RT_PRIO MAX_USER_RT_PRIO | ||
47 | +#define ISO_PRIO (MAX_RT_PRIO - 1) | ||
48 | |||
49 | #define MAX_PRIO (MAX_RT_PRIO + 40) | ||
50 | #define MIN_USER_PRIO (MAX_PRIO - 1) | ||
51 | |||
52 | -#define rt_prio(prio) unlikely((prio) < MAX_RT_PRIO) | ||
53 | +#define rt_prio(prio) unlikely((prio) < ISO_PRIO) | ||
54 | #define rt_task(p) rt_prio((p)->prio) | ||
55 | #define batch_task(p) (unlikely((p)->policy == SCHED_BATCH)) | ||
56 | #define is_rt_policy(policy) ((policy) == SCHED_FIFO || \ | ||
57 | (policy) == SCHED_RR) | ||
58 | #define has_rt_policy(p) unlikely(is_rt_policy((p)->policy)) | ||
59 | +#define iso_task(p) (unlikely((p)->policy == SCHED_ISO)) | ||
60 | |||
61 | /* | ||
62 | * Some day this will be a full-fledged user tracking system.. | ||
63 | @@ -1151,6 +1154,7 @@ static inline void put_task_struct(struc | ||
64 | #define PF_SWAPWRITE 0x00800000 /* Allowed to write to swap */ | ||
65 | #define PF_SPREAD_PAGE 0x01000000 /* Spread page cache over cpuset */ | ||
66 | #define PF_SPREAD_SLAB 0x02000000 /* Spread some slab caches over cpuset */ | ||
67 | +#define PF_ISOREF 0x04000000 /* SCHED_ISO task has used up quota */ | ||
68 | #define PF_MEMPOLICY 0x10000000 /* Non-default NUMA mempolicy */ | ||
69 | #define PF_MUTEX_TESTER 0x20000000 /* Thread belongs to the rt mutex tester */ | ||
70 | #define PF_NONSLEEP 0x40000000 /* Waiting on in kernel activity */ | ||
71 | Index: linux-2.6.20-ck1/kernel/sched.c | ||
72 | =================================================================== | ||
73 | --- linux-2.6.20-ck1.orig/kernel/sched.c 2007-02-16 19:01:30.000000000 +1100 | ||
74 | +++ linux-2.6.20-ck1/kernel/sched.c 2007-02-16 19:01:31.000000000 +1100 | ||
75 | @@ -65,10 +65,14 @@ | ||
76 | * raise its priority. | ||
77 | * sched_compute - sysctl which enables long timeslices and delayed preemption | ||
78 | * for compute server usage. | ||
79 | + * sched_iso_cpu - sysctl which determines the cpu percentage SCHED_ISO tasks | ||
80 | + * are allowed to run (over ISO_PERIOD seconds) as real time tasks. | ||
81 | */ | ||
82 | int sched_interactive __read_mostly = 1; | ||
83 | int sched_compute __read_mostly; | ||
84 | +int sched_iso_cpu __read_mostly = 80; | ||
85 | |||
86 | +#define ISO_PERIOD (5 * HZ) | ||
87 | /* | ||
88 | * CACHE_DELAY is the time preemption is delayed in sched_compute mode | ||
89 | * and is set to a nominal 10ms. | ||
90 | @@ -143,6 +147,8 @@ struct rq { | ||
91 | /* Cached timestamp set by update_cpu_clock() */ | ||
92 | unsigned long long most_recent_timestamp; | ||
93 | unsigned short cache_ticks, preempted; | ||
94 | + unsigned long iso_ticks; | ||
95 | + unsigned short iso_refractory; | ||
96 | struct task_struct *curr, *idle; | ||
97 | unsigned long next_balance; | ||
98 | struct mm_struct *prev_mm; | ||
99 | @@ -878,6 +884,17 @@ static inline int __normal_prio(struct t | ||
100 | unsigned int full_slice, used_slice = 0; | ||
101 | unsigned int best_bonus, rr; | ||
102 | |||
103 | + if (iso_task(p)) { | ||
104 | + if (likely(!(p->flags & PF_ISOREF))) | ||
105 | + /* | ||
106 | + * If SCHED_ISO tasks have not used up their real time | ||
107 | + * quota they have run just better than highest | ||
108 | + * SCHED_NORMAL priority. Otherwise they run as | ||
109 | + * SCHED_NORMAL. | ||
110 | + */ | ||
111 | + return ISO_PRIO; | ||
112 | + } | ||
113 | + | ||
114 | full_slice = slice(p); | ||
115 | if (full_slice > p->slice) | ||
116 | used_slice = full_slice - p->slice; | ||
117 | @@ -2990,6 +3007,23 @@ static void time_slice_expired(struct ta | ||
118 | requeue_task(p, rq, effective_prio(p)); | ||
119 | } | ||
120 | |||
121 | +/* | ||
122 | + * Test if SCHED_ISO tasks have run longer than their allotted period as RT | ||
123 | + * tasks and set the refractory flag if necessary. There is 10% hysteresis | ||
124 | + * for unsetting the flag. | ||
125 | + */ | ||
126 | +static inline unsigned int test_ret_isorefractory(struct rq *rq) | ||
127 | +{ | ||
128 | + if (likely(!rq->iso_refractory)) { | ||
129 | + if (rq->iso_ticks / ISO_PERIOD > sched_iso_cpu) | ||
130 | + rq->iso_refractory = 1; | ||
131 | + } else { | ||
132 | + if (rq->iso_ticks / ISO_PERIOD < (sched_iso_cpu * 90 / 100)) | ||
133 | + rq->iso_refractory = 0; | ||
134 | + } | ||
135 | + return rq->iso_refractory; | ||
136 | +} | ||
137 | + | ||
138 | static void task_running_tick(struct rq *rq, struct task_struct *p) | ||
139 | { | ||
140 | unsigned long debit; | ||
141 | @@ -2999,11 +3033,29 @@ static void task_running_tick(struct rq | ||
142 | set_tsk_need_resched(p); | ||
143 | return; | ||
144 | } | ||
145 | - /* SCHED_FIFO tasks never run out of timeslice. */ | ||
146 | - if (unlikely(p->policy == SCHED_FIFO)) | ||
147 | - return; | ||
148 | |||
149 | spin_lock(&rq->lock); | ||
150 | + if (unlikely((rt_task(p) || (iso_task(p) && !rq->iso_refractory)) && | ||
151 | + p->mm)) { | ||
152 | + if (rq->iso_ticks <= (ISO_PERIOD * 100) - 100) | ||
153 | + rq->iso_ticks += 100; | ||
154 | + } else | ||
155 | + rq->iso_ticks = rq->iso_ticks * (ISO_PERIOD - 1) / ISO_PERIOD; | ||
156 | + | ||
157 | + if (iso_task(p)) { | ||
158 | + if (unlikely(test_ret_isorefractory(rq))) { | ||
159 | + if (!(p->flags & PF_ISOREF)) { | ||
160 | + set_tsk_need_resched(p); | ||
161 | + p->flags |= PF_ISOREF; | ||
162 | + } | ||
163 | + } else | ||
164 | + p->flags &= ~PF_ISOREF; | ||
165 | + } else { | ||
166 | + /* SCHED_FIFO tasks never run out of timeslice. */ | ||
167 | + if (unlikely(p->policy == SCHED_FIFO)) | ||
168 | + goto out_unlock; | ||
169 | + } | ||
170 | + | ||
171 | debit = ns_diff(rq->most_recent_timestamp, p->timestamp); | ||
172 | p->ns_debit += debit; | ||
173 | if (p->ns_debit < NSJIFFY) | ||
174 | @@ -3122,7 +3174,7 @@ dependent_sleeper(int this_cpu, struct r | ||
175 | int ret = 0, i; | ||
176 | |||
177 | /* kernel/rt threads do not participate in dependent sleeping */ | ||
178 | - if (!p->mm || rt_task(p)) | ||
179 | + if (!p->mm || rt_task(p) || iso_task(p)) | ||
180 | return 0; | ||
181 | |||
182 | for_each_domain(this_cpu, tmp) { | ||
183 | @@ -3159,7 +3211,7 @@ dependent_sleeper(int this_cpu, struct r | ||
184 | * task from using an unfair proportion of the | ||
185 | * physical cpu's resources. -ck | ||
186 | */ | ||
187 | - if (rt_task(smt_curr)) { | ||
188 | + if (rt_task(smt_curr) || iso_task(smt_curr)) { | ||
189 | /* | ||
190 | * With real time tasks we run non-rt tasks only | ||
191 | * per_cpu_gain% of the time. | ||
192 | @@ -3971,12 +4023,22 @@ static void __setscheduler(struct task_s | ||
193 | int sched_setscheduler(struct task_struct *p, int policy, | ||
194 | struct sched_param *param) | ||
195 | { | ||
196 | + struct sched_param zero_param = { .sched_priority = 0 }; | ||
197 | int queued, retval, oldprio, oldpolicy = -1; | ||
198 | unsigned long flags; | ||
199 | struct rq *rq; | ||
200 | |||
201 | /* may grab non-irq protected spin_locks */ | ||
202 | BUG_ON(in_interrupt()); | ||
203 | + if (is_rt_policy(policy) && !capable(CAP_SYS_NICE)) { | ||
204 | + /* | ||
205 | + * If the caller requested an RT policy without having the | ||
206 | + * necessary rights, we downgrade the policy to SCHED_ISO. | ||
207 | + * We also set the parameter to zero to pass the checks. | ||
208 | + */ | ||
209 | + policy = SCHED_ISO; | ||
210 | + param = &zero_param; | ||
211 | + } | ||
212 | recheck: | ||
213 | /* double check policy once rq lock held */ | ||
214 | if (policy < 0) | ||
215 | @@ -4501,6 +4563,7 @@ asmlinkage long sys_sched_get_priority_m | ||
216 | break; | ||
217 | case SCHED_NORMAL: | ||
218 | case SCHED_BATCH: | ||
219 | + case SCHED_ISO: | ||
220 | ret = 0; | ||
221 | break; | ||
222 | } | ||
223 | @@ -4525,6 +4588,7 @@ asmlinkage long sys_sched_get_priority_m | ||
224 | break; | ||
225 | case SCHED_NORMAL: | ||
226 | case SCHED_BATCH: | ||
227 | + case SCHED_ISO: | ||
228 | ret = 0; | ||
229 | } | ||
230 | return ret; | ||
231 | @@ -6647,7 +6711,8 @@ void __init sched_init(void) | ||
232 | rq = cpu_rq(i); | ||
233 | spin_lock_init(&rq->lock); | ||
234 | lockdep_set_class(&rq->lock, &rq->rq_lock_key); | ||
235 | - rq->nr_running = rq->cache_ticks = rq->preempted = 0; | ||
236 | + rq->nr_running = rq->cache_ticks = rq->preempted = | ||
237 | + rq->iso_ticks = 0; | ||
238 | |||
239 | #ifdef CONFIG_SMP | ||
240 | rq->sd = NULL; | ||
241 | Index: linux-2.6.20-ck1/kernel/sysctl.c | ||
242 | =================================================================== | ||
243 | --- linux-2.6.20-ck1.orig/kernel/sysctl.c 2007-02-16 19:01:30.000000000 +1100 | ||
244 | +++ linux-2.6.20-ck1/kernel/sysctl.c 2007-02-16 19:01:31.000000000 +1100 | ||
245 | @@ -273,6 +273,14 @@ static ctl_table root_table[] = { | ||
246 | { .ctl_name = 0 } | ||
247 | }; | ||
248 | |||
249 | + | ||
250 | +/* | ||
251 | + * Constants for minimum and maximum testing. | ||
252 | + * We use these as one-element integer vectors. | ||
253 | + */ | ||
254 | +static int zero; | ||
255 | +static int one_hundred = 100; | ||
256 | + | ||
257 | static ctl_table kern_table[] = { | ||
258 | { | ||
259 | .ctl_name = KERN_OSTYPE, | ||
260 | @@ -692,6 +700,17 @@ static ctl_table kern_table[] = { | ||
261 | .mode = 0644, | ||
262 | .proc_handler = &proc_dointvec, | ||
263 | }, | ||
264 | + { | ||
265 | + .ctl_name = CTL_UNNUMBERED, | ||
266 | + .procname = "iso_cpu", | ||
267 | + .data = &sched_iso_cpu, | ||
268 | + .maxlen = sizeof (int), | ||
269 | + .mode = 0644, | ||
270 | + .proc_handler = &proc_dointvec_minmax, | ||
271 | + .strategy = &sysctl_intvec, | ||
272 | + .extra1 = &zero, | ||
273 | + .extra2 = &one_hundred, | ||
274 | + }, | ||
275 | #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) | ||
276 | { | ||
277 | .ctl_name = KERN_UNKNOWN_NMI_PANIC, | ||
278 | @@ -800,12 +819,6 @@ static ctl_table kern_table[] = { | ||
279 | { .ctl_name = 0 } | ||
280 | }; | ||
281 | |||
282 | -/* Constants for minimum and maximum testing in vm_table. | ||
283 | - We use these as one-element integer vectors. */ | ||
284 | -static int zero; | ||
285 | -static int one_hundred = 100; | ||
286 | - | ||
287 | - | ||
288 | static ctl_table vm_table[] = { | ||
289 | { | ||
290 | .ctl_name = VM_OVERCOMMIT_MEMORY, | ||
291 | Index: linux-2.6.20-ck1/Documentation/sysctl/kernel.txt | ||
292 | =================================================================== | ||
293 | --- linux-2.6.20-ck1.orig/Documentation/sysctl/kernel.txt 2007-02-16 19:01:30.000000000 +1100 | ||
294 | +++ linux-2.6.20-ck1/Documentation/sysctl/kernel.txt 2007-02-16 19:01:31.000000000 +1100 | ||
295 | @@ -27,6 +27,7 @@ show up in /proc/sys/kernel: | ||
296 | - hostname | ||
297 | - hotplug | ||
298 | - interactive | ||
299 | +- iso_cpu | ||
300 | - java-appletviewer [ binfmt_java, obsolete ] | ||
301 | - java-interpreter [ binfmt_java, obsolete ] | ||
302 | - kstack_depth_to_print [ X86 only ] | ||
303 | @@ -185,6 +186,14 @@ are obeyed if this tunable is disabled. | ||
304 | |||
305 | ============================================================== | ||
306 | |||
307 | +iso_cpu: | ||
308 | + | ||
309 | +This sets the percentage cpu that the unprivileged SCHED_ISO tasks can | ||
310 | +run effectively at realtime priority, averaged over a rolling 5 seconds. | ||
311 | +Set to 80% by default. | ||
312 | + | ||
313 | +============================================================== | ||
314 | + | ||
315 | l2cr: (PPC only) | ||
316 | |||
317 | This flag controls the L2 cache of G3 processor boards. If |