Contents of /alx-src/tags/kernel26-2.6.12-alx-r9/fs/select.c
Parent Directory | Revision Log
Revision 630 -
(show annotations)
(download)
Wed Mar 4 11:03:09 2009 UTC (15 years, 6 months ago) by niro
File MIME type: text/plain
File size: 12486 byte(s)
Wed Mar 4 11:03:09 2009 UTC (15 years, 6 months ago) by niro
File MIME type: text/plain
File size: 12486 byte(s)
Tag kernel26-2.6.12-alx-r9
1 | /* |
2 | * This file contains the procedures for the handling of select and poll |
3 | * |
4 | * Created for Linux based loosely upon Mathius Lattner's minix |
5 | * patches by Peter MacDonald. Heavily edited by Linus. |
6 | * |
7 | * 4 February 1994 |
8 | * COFF/ELF binary emulation. If the process has the STICKY_TIMEOUTS |
9 | * flag set in its personality we do *not* modify the given timeout |
10 | * parameter to reflect time remaining. |
11 | * |
12 | * 24 January 2000 |
13 | * Changed sys_poll()/do_poll() to use PAGE_SIZE chunk-based allocation |
14 | * of fds to overcome nfds < 16390 descriptors limit (Tigran Aivazian). |
15 | */ |
16 | |
17 | #include <linux/syscalls.h> |
18 | #include <linux/module.h> |
19 | #include <linux/slab.h> |
20 | #include <linux/smp_lock.h> |
21 | #include <linux/poll.h> |
22 | #include <linux/personality.h> /* for STICKY_TIMEOUTS */ |
23 | #include <linux/file.h> |
24 | #include <linux/fs.h> |
25 | |
26 | #include <asm/uaccess.h> |
27 | |
/* Divide x by y, rounding the quotient up. */
#define ROUND_UP(x,y) (((x)+(y)-1)/(y))
/* Mask used when a file has no ->poll() method: treated as always ready. */
#define DEFAULT_POLLMASK (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM)
30 | |
/*
 * One waitqueue registration made on behalf of a poll/select caller:
 * the polled file, the wait-queue entry linked onto that file's
 * waitqueue, and the queue head it was added to (needed so
 * poll_freewait() can remove it again).
 */
struct poll_table_entry {
	struct file * filp;			/* reference held; dropped in poll_freewait() */
	wait_queue_t wait;			/* entry queued on wait_address */
	wait_queue_head_t * wait_address;	/* head the entry was added to */
};
36 | |
/*
 * A page-sized container of poll_table_entry slots.  Pages are chained
 * through ->next; ->entry points at the first free slot in entries[]
 * (i.e. one past the last used slot — see POLL_TABLE_FULL and
 * __pollwait()).
 */
struct poll_table_page {
	struct poll_table_page * next;		/* next page in the chain */
	struct poll_table_entry * entry;	/* first unused slot in entries[] */
	struct poll_table_entry entries[0];	/* slots filling the rest of the page */
};
42 | |
/* True when one more entry would run past the end of this page. */
#define POLL_TABLE_FULL(table) \
	((unsigned long)((table)->entry+1) > PAGE_SIZE + (unsigned long)(table))
45 | |
46 | /* |
47 | * Ok, Peter made a complicated, but straightforward multiple_wait() function. |
48 | * I have rewritten this, taking some shortcuts: This code may not be easy to |
49 | * follow, but it should be free of race-conditions, and it's practical. If you |
50 | * understand what I'm doing here, then you understand how the linux |
51 | * sleep/wakeup mechanism works. |
52 | * |
53 | * Two very simple procedures, poll_wait() and poll_freewait() make all the |
54 | * work. poll_wait() is an inline-function defined in <linux/poll.h>, |
55 | * as all select/poll functions have to call it to add an entry to the |
56 | * poll table. |
57 | */ |
58 | static void __pollwait(struct file *filp, wait_queue_head_t *wait_address, |
59 | poll_table *p); |
60 | |
61 | void poll_initwait(struct poll_wqueues *pwq) |
62 | { |
63 | init_poll_funcptr(&pwq->pt, __pollwait); |
64 | pwq->error = 0; |
65 | pwq->table = NULL; |
66 | } |
67 | |
68 | EXPORT_SYMBOL(poll_initwait); |
69 | |
70 | void poll_freewait(struct poll_wqueues *pwq) |
71 | { |
72 | struct poll_table_page * p = pwq->table; |
73 | while (p) { |
74 | struct poll_table_entry * entry; |
75 | struct poll_table_page *old; |
76 | |
77 | entry = p->entry; |
78 | do { |
79 | entry--; |
80 | remove_wait_queue(entry->wait_address,&entry->wait); |
81 | fput(entry->filp); |
82 | } while (entry > p->entries); |
83 | old = p; |
84 | p = p->next; |
85 | free_page((unsigned long) old); |
86 | } |
87 | } |
88 | |
89 | EXPORT_SYMBOL(poll_freewait); |
90 | |
/*
 * Callback reached via poll_wait() from each driver's ->poll() method.
 * Records (filp, wait_address) in the current poll_table_page —
 * allocating a fresh page when the current one is full — and queues
 * the current task on the file's waitqueue.  On allocation failure
 * the error is stashed in ->error and the task is left runnable so
 * the caller notices instead of sleeping.
 */
static void __pollwait(struct file *filp, wait_queue_head_t *wait_address,
		       poll_table *_p)
{
	struct poll_wqueues *p = container_of(_p, struct poll_wqueues, pt);
	struct poll_table_page *table = p->table;

	if (!table || POLL_TABLE_FULL(table)) {
		struct poll_table_page *new_table;

		new_table = (struct poll_table_page *) __get_free_page(GFP_KERNEL);
		if (!new_table) {
			p->error = -ENOMEM;
			/* undo set_current_state() done by the select/poll loop */
			__set_current_state(TASK_RUNNING);
			return;
		}
		new_table->entry = new_table->entries;
		/* push the new page onto the front of the chain */
		new_table->next = table;
		p->table = new_table;
		table = new_table;
	}

	/* Add a new entry */
	{
		struct poll_table_entry * entry = table->entry;
		table->entry = entry+1;
		/* pin the file; reference dropped in poll_freewait() */
		get_file(filp);
		entry->filp = filp;
		entry->wait_address = wait_address;
		init_waitqueue_entry(&entry->wait, current);
		add_wait_queue(wait_address,&entry->wait);
	}
}
123 | |
/* Address of the n'th long-word of each input bitmap in an fd_set_bits. */
#define FDS_IN(fds, n)		(fds->in + n)
#define FDS_OUT(fds, n)	(fds->out + n)
#define FDS_EX(fds, n)		(fds->ex + n)

/* Union of the n'th words of all three input bitmaps: every fd asked about. */
#define BITS(fds, n)	(*FDS_IN(fds, n)|*FDS_OUT(fds, n)|*FDS_EX(fds, n))
/*
 * Scan the in/out/ex bitmaps and return (highest requested fd) + 1,
 * or -EBADF if any requested fd is not currently open.  Works a
 * long-word at a time, starting with the possibly-partial last word
 * and walking downwards; the get_max loop runs at most once, on the
 * highest non-empty word.  Caller holds current->files->file_lock
 * (see do_select()).
 */
static int max_select_fd(unsigned long n, fd_set_bits *fds)
{
	unsigned long *open_fds;
	unsigned long set;
	int max;

	/* handle last in-complete long-word first */
	set = ~(~0UL << (n & (__NFDBITS-1)));
	n /= __NFDBITS;
	open_fds = current->files->open_fds->fds_bits+n;
	max = 0;
	if (set) {
		set &= BITS(fds, n);
		if (set) {
			/* a requested bit that is not open is an error */
			if (!(set & ~*open_fds))
				goto get_max;
			return -EBADF;
		}
	}
	while (n) {
		open_fds--;
		n--;
		set = BITS(fds, n);
		if (!set)
			continue;
		if (set & ~*open_fds)
			return -EBADF;
		if (max)
			continue;	/* top word already found; just validating now */
	get_max:
		do {
			max++;
			set >>= 1;
		} while (set);
		max += n * __NFDBITS;	/* bit offset -> fd number + 1 */
	}

	return max;
}
169 | |
/* Single-bit helpers over fd bitmaps (word index, mask, test, set). */
#define BIT(i)		(1UL << ((i)&(__NFDBITS-1)))
#define MEM(i,m)	((m)+(unsigned)(i)/__NFDBITS)
#define ISSET(i,m)	(((i)&*(m)) != 0)
#define SET(i,m)	(*(m) |= (i))

/* Poll events that count as readable / writable / exceptional for select(). */
#define POLLIN_SET (POLLRDNORM | POLLRDBAND | POLLIN | POLLHUP | POLLERR)
#define POLLOUT_SET (POLLWRBAND | POLLWRNORM | POLLOUT | POLLERR)
#define POLLEX_SET (POLLPRI)
178 | |
/*
 * Core of select(2): repeatedly scan every fd present in the input
 * bitmaps, filling the result bitmaps, until something is ready, the
 * timeout expires, or a signal is pending.  Returns the number of set
 * result bits, 0 on timeout, or a negative errno.  *timeout is in
 * jiffies and is updated with the time remaining.
 */
int do_select(int n, fd_set_bits *fds, long *timeout)
{
	struct poll_wqueues table;
	poll_table *wait;
	int retval, i;
	long __timeout = *timeout;

	/* file_lock protects ->open_fds while the bitmaps are validated */
	spin_lock(&current->files->file_lock);
	retval = max_select_fd(n, fds);
	spin_unlock(&current->files->file_lock);

	if (retval < 0)
		return retval;
	n = retval;

	poll_initwait(&table);
	wait = &table.pt;
	/* zero timeout: single non-blocking scan, register no waits */
	if (!__timeout)
		wait = NULL;
	retval = 0;
	for (;;) {
		unsigned long *rinp, *routp, *rexp, *inp, *outp, *exp;

		set_current_state(TASK_INTERRUPTIBLE);

		inp = fds->in; outp = fds->out; exp = fds->ex;
		rinp = fds->res_in; routp = fds->res_out; rexp = fds->res_ex;

		/* walk the bitmaps one long-word at a time */
		for (i = 0; i < n; ++rinp, ++routp, ++rexp) {
			unsigned long in, out, ex, all_bits, bit = 1, mask, j;
			unsigned long res_in = 0, res_out = 0, res_ex = 0;
			struct file_operations *f_op = NULL;
			struct file *file = NULL;

			in = *inp++; out = *outp++; ex = *exp++;
			all_bits = in | out | ex;
			if (all_bits == 0) {
				/* nothing of interest in this word: skip it */
				i += __NFDBITS;
				continue;
			}

			for (j = 0; j < __NFDBITS; ++j, ++i, bit <<= 1) {
				if (i >= n)
					break;
				if (!(bit & all_bits))
					continue;
				file = fget(i);
				if (file) {
					f_op = file->f_op;
					mask = DEFAULT_POLLMASK;
					/* stop queueing waits once something is ready */
					if (f_op && f_op->poll)
						mask = (*f_op->poll)(file, retval ? NULL : wait);
					fput(file);
					if ((mask & POLLIN_SET) && (in & bit)) {
						res_in |= bit;
						retval++;
					}
					if ((mask & POLLOUT_SET) && (out & bit)) {
						res_out |= bit;
						retval++;
					}
					if ((mask & POLLEX_SET) && (ex & bit)) {
						res_ex |= bit;
						retval++;
					}
				}
				cond_resched();
			}
			if (res_in)
				*rinp = res_in;
			if (res_out)
				*routp = res_out;
			if (res_ex)
				*rexp = res_ex;
		}
		/* waits are registered during the first pass only */
		wait = NULL;
		if (retval || !__timeout || signal_pending(current))
			break;
		if(table.error) {
			/* __pollwait hit an allocation failure */
			retval = table.error;
			break;
		}
		__timeout = schedule_timeout(__timeout);
	}
	__set_current_state(TASK_RUNNING);

	poll_freewait(&table);

	/*
	 * Up-to-date the caller timeout.
	 */
	*timeout = __timeout;
	return retval;
}
273 | |
274 | static void *select_bits_alloc(int size) |
275 | { |
276 | return kmalloc(6 * size, GFP_KERNEL); |
277 | } |
278 | |
/* Release the buffer from select_bits_alloc(); 'size' is unused. */
static void select_bits_free(void *bits, int size)
{
	kfree(bits);
}
283 | |
/*
 * We can actually return ERESTARTSYS instead of EINTR, but I'd
 * like to be certain this leads to no problems. So I return
 * EINTR just for safety.
 *
 * Update: ERESTARTSYS breaks at least the xview clock binary, so
 * I'm trying ERESTARTNOHAND which restart only when you want to.
 */
/* Longest select() timeout, in whole seconds, representable in jiffies. */
#define MAX_SELECT_SECONDS \
	((unsigned long) (MAX_SCHEDULE_TIMEOUT / HZ)-1)
294 | |
/*
 * select(2) system call: convert the user timeval to jiffies, copy the
 * three fd_sets into one kmalloc'd buffer, run do_select(), then write
 * back the result sets and — unless the personality has STICKY_TIMEOUTS
 * — the remaining timeout.  Returns the count of ready descriptors,
 * 0 on timeout, or a negative errno.
 */
asmlinkage long
sys_select(int n, fd_set __user *inp, fd_set __user *outp, fd_set __user *exp, struct timeval __user *tvp)
{
	fd_set_bits fds;
	char *bits;
	long timeout;
	int ret, size, max_fdset;

	timeout = MAX_SCHEDULE_TIMEOUT;
	if (tvp) {
		time_t sec, usec;

		if (!access_ok(VERIFY_READ, tvp, sizeof(*tvp))
		    || __get_user(sec, &tvp->tv_sec)
		    || __get_user(usec, &tvp->tv_usec)) {
			ret = -EFAULT;
			goto out_nofds;
		}

		ret = -EINVAL;
		if (sec < 0 || usec < 0)
			goto out_nofds;

		/* convert to jiffies, rounding the microseconds up;
		 * larger values keep the "infinite" default */
		if ((unsigned long) sec < MAX_SELECT_SECONDS) {
			timeout = ROUND_UP(usec, 1000000/HZ);
			timeout += sec * (unsigned long) HZ;
		}
	}

	ret = -EINVAL;
	if (n < 0)
		goto out_nofds;

	/* max_fdset can increase, so grab it once to avoid race */
	max_fdset = current->files->max_fdset;
	if (n > max_fdset)
		n = max_fdset;

	/*
	 * We need 6 bitmaps (in/out/ex for both incoming and outgoing),
	 * since we used fdset we need to allocate memory in units of
	 * long-words.
	 */
	ret = -ENOMEM;
	size = FDS_BYTES(n);
	bits = select_bits_alloc(size);
	if (!bits)
		goto out_nofds;
	/* carve the single buffer into the six bitmaps */
	fds.in = (unsigned long *) bits;
	fds.out = (unsigned long *) (bits + size);
	fds.ex = (unsigned long *) (bits + 2*size);
	fds.res_in = (unsigned long *) (bits + 3*size);
	fds.res_out = (unsigned long *) (bits + 4*size);
	fds.res_ex = (unsigned long *) (bits + 5*size);

	if ((ret = get_fd_set(n, inp, fds.in)) ||
	    (ret = get_fd_set(n, outp, fds.out)) ||
	    (ret = get_fd_set(n, exp, fds.ex)))
		goto out;
	zero_fd_set(n, fds.res_in);
	zero_fd_set(n, fds.res_out);
	zero_fd_set(n, fds.res_ex);

	ret = do_select(n, &fds, &timeout);

	/* report time remaining unless the personality forbids it */
	if (tvp && !(current->personality & STICKY_TIMEOUTS)) {
		time_t sec = 0, usec = 0;
		if (timeout) {
			sec = timeout / HZ;
			usec = timeout % HZ;
			usec *= (1000000/HZ);
		}
		/* put_user failures deliberately ignored here */
		put_user(sec, &tvp->tv_sec);
		put_user(usec, &tvp->tv_usec);
	}

	if (ret < 0)
		goto out;
	if (!ret) {
		/* nothing ready: restart transparently after a signal handler */
		ret = -ERESTARTNOHAND;
		if (signal_pending(current))
			goto out;
		ret = 0;
	}

	if (set_fd_set(n, inp, fds.res_in) ||
	    set_fd_set(n, outp, fds.res_out) ||
	    set_fd_set(n, exp, fds.res_ex))
		ret = -EFAULT;

out:
	select_bits_free(bits, size);
out_nofds:
	return ret;
}
390 | |
/*
 * One chunk of the caller's pollfd array.  Chunks are chained via
 * ->next so that any nfds can be handled with page-sized kmalloc
 * allocations (see the 24 January 2000 note in the file header).
 */
struct poll_list {
	struct poll_list *next;		/* next chunk, or NULL */
	int len;			/* number of entries[] used in this chunk */
	struct pollfd entries[0];	/* the pollfds themselves */
};

/* How many pollfds fit in one page alongside the poll_list header. */
#define POLLFD_PER_PAGE ((PAGE_SIZE-sizeof(struct poll_list)) / sizeof(struct pollfd))
398 | |
399 | static void do_pollfd(unsigned int num, struct pollfd * fdpage, |
400 | poll_table ** pwait, int *count) |
401 | { |
402 | int i; |
403 | |
404 | for (i = 0; i < num; i++) { |
405 | int fd; |
406 | unsigned int mask; |
407 | struct pollfd *fdp; |
408 | |
409 | mask = 0; |
410 | fdp = fdpage+i; |
411 | fd = fdp->fd; |
412 | if (fd >= 0) { |
413 | struct file * file = fget(fd); |
414 | mask = POLLNVAL; |
415 | if (file != NULL) { |
416 | mask = DEFAULT_POLLMASK; |
417 | if (file->f_op && file->f_op->poll) |
418 | mask = file->f_op->poll(file, *pwait); |
419 | mask &= fdp->events | POLLERR | POLLHUP; |
420 | fput(file); |
421 | } |
422 | if (mask) { |
423 | *pwait = NULL; |
424 | (*count)++; |
425 | } |
426 | } |
427 | fdp->revents = mask; |
428 | } |
429 | } |
430 | |
/*
 * Repeatedly scan the chained pollfd chunks until some fd is ready,
 * the timeout (in jiffies) expires, or a signal is pending.  Returns
 * the number of ready fds, 0 on timeout, or a negative error stored
 * by __pollwait() (e.g. -ENOMEM).
 */
static int do_poll(unsigned int nfds, struct poll_list *list,
		   struct poll_wqueues *wait, long timeout)
{
	int count = 0;
	poll_table* pt = &wait->pt;

	/* zero timeout: single non-blocking scan, register no waits */
	if (!timeout)
		pt = NULL;

	for (;;) {
		struct poll_list *walk;
		set_current_state(TASK_INTERRUPTIBLE);
		walk = list;
		while(walk != NULL) {
			do_pollfd( walk->len, walk->entries, &pt, &count);
			walk = walk->next;
		}
		/* waits are registered during the first pass only */
		pt = NULL;
		if (count || !timeout || signal_pending(current))
			break;
		count = wait->error;	/* allocation failure from __pollwait */
		if (count)
			break;
		timeout = schedule_timeout(timeout);
	}
	__set_current_state(TASK_RUNNING);
	return count;
}
459 | |
/*
 * poll(2) system call: copy the user pollfd array into a chain of
 * page-sized chunks, run do_poll(), then copy each ->revents back.
 * 'timeout' is in milliseconds (negative means infinite); returns the
 * number of ready fds, 0 on timeout, -EINTR on signal, or another
 * negative errno.
 */
asmlinkage long sys_poll(struct pollfd __user * ufds, unsigned int nfds, long timeout)
{
	struct poll_wqueues table;
	int fdcount, err;
	unsigned int i;
	struct poll_list *head;
	struct poll_list *walk;

	/* Do a sanity check on nfds ... */
	if (nfds > current->files->max_fdset && nfds > OPEN_MAX)
		return -EINVAL;

	if (timeout) {
		/* Careful about overflow in the intermediate values */
		if ((unsigned long) timeout < MAX_SCHEDULE_TIMEOUT / HZ)
			timeout = (unsigned long)(timeout*HZ+999)/1000+1;
		else /* Negative or overflow */
			timeout = MAX_SCHEDULE_TIMEOUT;
	}

	poll_initwait(&table);

	/* build the chunk chain and copy the user's pollfds into it */
	head = NULL;
	walk = NULL;
	i = nfds;
	err = -ENOMEM;
	while(i!=0) {
		struct poll_list *pp;
		pp = kmalloc(sizeof(struct poll_list)+
				sizeof(struct pollfd)*
				(i>POLLFD_PER_PAGE?POLLFD_PER_PAGE:i),
					GFP_KERNEL);
		if(pp==NULL)
			goto out_fds;
		pp->next=NULL;
		pp->len = (i>POLLFD_PER_PAGE?POLLFD_PER_PAGE:i);
		if (head == NULL)
			head = pp;
		else
			walk->next = pp;

		walk = pp;
		if (copy_from_user(pp->entries, ufds + nfds-i,
				sizeof(struct pollfd)*pp->len)) {
			err = -EFAULT;
			goto out_fds;
		}
		i -= pp->len;
	}
	fdcount = do_poll(nfds, head, &table, timeout);

	/* OK, now copy the revents fields back to user space. */
	walk = head;
	err = -EFAULT;
	while(walk != NULL) {
		struct pollfd *fds = walk->entries;
		int j;

		for (j=0; j < walk->len; j++, ufds++) {
			if(__put_user(fds[j].revents, &ufds->revents))
				goto out_fds;
		}
		walk = walk->next;
	}
	err = fdcount;
	if (!fdcount && signal_pending(current))
		err = -EINTR;
out_fds:
	/* free the chunk chain and unregister any queued waits */
	walk = head;
	while(walk!=NULL) {
		struct poll_list *pp = walk->next;
		kfree(walk);
		walk = pp;
	}
	poll_freewait(&table);
	return err;
}