diff --git a/arch/i386/kernel/entry.S b/arch/i386/kernel/entry.S
index 5e47683..9bf056e 100644
--- a/arch/i386/kernel/entry.S
+++ b/arch/i386/kernel/entry.S
@@ -367,10 +367,6 @@ ENTRY(system_call)
 	CFI_ADJUST_CFA_OFFSET 4
 	SAVE_ALL
 	GET_THREAD_INFO(%ebp)
-	testl $TF_MASK,PT_EFLAGS(%esp)
-	jz no_singlestep
-	orl $_TIF_SINGLESTEP,TI_flags(%ebp)
-no_singlestep:
 					# system call tracing in operation / emulation
 	/* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */
 	testw $(_TIF_SYSCALL_EMU|_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT),TI_flags(%ebp)
@@ -385,6 +381,10 @@ syscall_exit:
 					# setting need_resched or sigpending
 					# between sampling and the iret
 	TRACE_IRQS_OFF
+	testl $TF_MASK,PT_EFLAGS(%esp)	# If tracing set singlestep flag on exit
+	jz no_singlestep
+	orl $_TIF_SINGLESTEP,TI_flags(%ebp)
+no_singlestep:
 	movl TI_flags(%ebp), %ecx
 	testw $_TIF_ALLWORK_MASK, %cx	# current->work
 	jne syscall_exit_work
diff --git a/arch/i386/oprofile/nmi_int.c b/arch/i386/oprofile/nmi_int.c
index 3700eef..be4a9a8 100644
--- a/arch/i386/oprofile/nmi_int.c
+++ b/arch/i386/oprofile/nmi_int.c
@@ -131,7 +131,6 @@ static void nmi_save_registers(void * dummy)
 {
 	int cpu = smp_processor_id();
 	struct op_msrs * msrs = &cpu_msrs[cpu];
-	model->fill_in_addresses(msrs);
 	nmi_cpu_save_registers(msrs);
 }
 
@@ -195,6 +194,7 @@ static struct notifier_block profile_exceptions_nb = {
 static int nmi_setup(void)
 {
 	int err=0;
+	int cpu;
 
 	if (!allocate_msrs())
 		return -ENOMEM;
@@ -207,6 +207,13 @@ static int nmi_setup(void)
 	/* We need to serialize save and setup for HT because the subset
 	 * of msrs are distinct for save and setup operations
 	 */
+
+	/* Assume saved/restored counters are the same on all CPUs */
+	model->fill_in_addresses(&cpu_msrs[0]);
+	for_each_possible_cpu (cpu) {
+		if (cpu != 0)
+			cpu_msrs[cpu] = cpu_msrs[0];
+	}
 	on_each_cpu(nmi_save_registers, NULL, 0, 1);
 	on_each_cpu(nmi_cpu_setup, NULL, 0, 1);
 	nmi_enabled = 1;
diff --git a/arch/powerpc/kernel/signal_64.c b/arch/powerpc/kernel/signal_64.c
index f72e8e8..a84304e 100644
--- a/arch/powerpc/kernel/signal_64.c
+++ b/arch/powerpc/kernel/signal_64.c
@@ -177,6 +177,13 @@ static long restore_sigcontext(struct pt_regs *regs, sigset_t *set, int sig,
 	 */
 	discard_lazy_cpu_state();
 
+	/*
+	 * Force reload of FP/VEC.
+	 * This has to be done before copying stuff into current->thread.fpr/vr
+	 * for the reasons explained in the previous comment.
+	 */
+	regs->msr &= ~(MSR_FP | MSR_FE0 | MSR_FE1 | MSR_VEC);
+
 	err |= __copy_from_user(&current->thread.fpr, &sc->fp_regs, FP_REGS_SIZE);
 
 #ifdef CONFIG_ALTIVEC
@@ -198,9 +205,6 @@ static long restore_sigcontext(struct pt_regs *regs, sigset_t *set, int sig,
 		current->thread.vrsave = 0;
 #endif /* CONFIG_ALTIVEC */
 
-	/* Force reload of FP/VEC */
-	regs->msr &= ~(MSR_FP | MSR_FE0 | MSR_FE1 | MSR_VEC);
-
 	return err;
 }
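The two powerpc hunks above are an ordering fix: the MSR bits marking the FP/VEC state as live are now cleared *before* the saved registers are copied into current->thread, so a preemption in that window can no longer flush stale hardware state over the freshly restored values. A stand-alone user-space model of the window (illustrative C with hypothetical names, not kernel code):

#include <stdio.h>
#include <string.h>

struct thread { int live; double saved_fpr[2]; };
static double live_fpr[2] = { 1.0, 2.0 };	/* stale "hardware" state */

/* What a context switch would do while the state is marked live. */
static void maybe_save(struct thread *t)
{
	if (t->live)
		memcpy(t->saved_fpr, live_fpr, sizeof(live_fpr));
}

static void restore_wrong(struct thread *t, const double *sig)
{
	memcpy(t->saved_fpr, sig, sizeof(t->saved_fpr)); /* copy first...     */
	maybe_save(t);		/* "preemption" here clobbers the fresh copy */
	t->live = 0;		/* ...flag cleared too late                  */
}

static void restore_right(struct thread *t, const double *sig)
{
	t->live = 0;		/* force reload: clear the flag first        */
	maybe_save(t);		/* "preemption" is now harmless              */
	memcpy(t->saved_fpr, sig, sizeof(t->saved_fpr));
}

int main(void)
{
	const double sig[2] = { 41.0, 42.0 };
	struct thread t = { .live = 1 };

	restore_wrong(&t, sig);
	printf("copy-then-clear: %g (stale!)\n", t.saved_fpr[0]);

	t.live = 1;
	restore_right(&t, sig);
	printf("clear-then-copy: %g\n", t.saved_fpr[0]);
	return 0;
}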
diff --git a/arch/x86_64/mm/init.c b/arch/x86_64/mm/init.c
index 2968b90..e67cc4f 100644
--- a/arch/x86_64/mm/init.c
+++ b/arch/x86_64/mm/init.c
@@ -72,6 +72,8 @@ void show_mem(void)
 
 	for_each_online_pgdat(pgdat) {
 		for (i = 0; i < pgdat->node_spanned_pages; ++i) {
+			if (!pfn_valid(pgdat->node_start_pfn + i))
+				continue;
 			page = pfn_to_page(pgdat->node_start_pfn + i);
 			total++;
 			if (PageReserved(page))
@@ -766,3 +768,9 @@ int in_gate_area_no_task(unsigned long addr)
 {
 	return (addr >= VSYSCALL_START) && (addr < VSYSCALL_END);
 }
+
+void *alloc_bootmem_high_node(pg_data_t *pgdat, unsigned long size)
+{
+	return __alloc_bootmem_core(pgdat->bdata, size,
+			SMP_CACHE_BYTES, (4UL*1024*1024*1024), 0);
+}
diff --git a/drivers/char/cyclades.c b/drivers/char/cyclades.c
index 3ffa080..e4e0ccb 100644
--- a/drivers/char/cyclades.c
+++ b/drivers/char/cyclades.c
@@ -1102,6 +1102,7 @@ static void cyy_intr_chip(struct cyclades_card *cinfo, int chip,
 
 			if (data & info->ignore_status_mask) {
 				info->icount.rx++;
+				spin_unlock(&cinfo->card_lock);
 				return;
 			}
 			if (tty_buffer_request_room(tty, 1)) {
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index cef1287..550ac72 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -255,19 +255,25 @@ static struct page *read_sb_page(mddev_t *mddev, long offset, unsigned long inde
 
 }
 
-static int write_sb_page(mddev_t *mddev, long offset, struct page *page, int wait)
+static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait)
 {
 	mdk_rdev_t *rdev;
 	struct list_head *tmp;
+	mddev_t *mddev = bitmap->mddev;
 
 	ITERATE_RDEV(mddev, rdev, tmp)
 		if (test_bit(In_sync, &rdev->flags)
-		    && !test_bit(Faulty, &rdev->flags))
+		    && !test_bit(Faulty, &rdev->flags)) {
+			int size = PAGE_SIZE;
+			if (page->index == bitmap->file_pages-1)
+				size = roundup(bitmap->last_page_size,
+					       bdev_hardsect_size(rdev->bdev));
 			md_super_write(mddev, rdev,
-				       (rdev->sb_offset<<1) + offset
+				       (rdev->sb_offset<<1) + bitmap->offset
 				       + page->index * (PAGE_SIZE/512),
-				       PAGE_SIZE,
+				       size,
 				       page);
+		}
 
 	if (wait)
 		md_super_wait(mddev);
@@ -282,7 +288,7 @@ static int write_page(struct bitmap *bitmap, struct page *page, int wait)
 	struct buffer_head *bh;
 
 	if (bitmap->file == NULL)
-		return write_sb_page(bitmap->mddev, bitmap->offset, page, wait);
+		return write_sb_page(bitmap, page, wait);
 
 	bh = page_buffers(page);
 
@@ -923,6 +929,7 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)
 			}
 
 			bitmap->filemap[bitmap->file_pages++] = page;
+			bitmap->last_page_size = count;
 		}
 		paddr = kmap_atomic(page, KM_USER0);
 		if (bitmap->flags & BITMAP_HOSTENDIAN)
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 4c2471e..b9ff4e3 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -33,7 +33,6 @@ struct crypt_io {
 	struct dm_target *target;
 	struct bio *base_bio;
-	struct bio *first_clone;
 	struct work_struct work;
 	atomic_t pending;
 	int error;
@@ -107,6 +106,8 @@ struct crypt_config {
 
 static struct kmem_cache *_crypt_io_pool;
 
+static void clone_init(struct crypt_io *, struct bio *);
+
 /*
  * Different IV generation algorithms:
  *
@@ -378,25 +379,20 @@ static int crypt_convert(struct crypt_config *cc,
  * This should never violate the device limitations
  * May return a smaller bio when running out of pages
  */
-static struct bio *
-crypt_alloc_buffer(struct crypt_config *cc, unsigned int size,
-		   struct bio *base_bio, unsigned int *bio_vec_idx)
+static struct bio *crypt_alloc_buffer(struct crypt_io *io, unsigned int size,
+				      unsigned int *bio_vec_idx)
 {
+	struct crypt_config *cc = io->target->private;
 	struct bio *clone;
 	unsigned int nr_iovecs = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
 	gfp_t gfp_mask = GFP_NOIO | __GFP_HIGHMEM;
 	unsigned int i;
 
-	if (base_bio) {
-		clone = bio_alloc_bioset(GFP_NOIO, base_bio->bi_max_vecs, cc->bs);
-		__bio_clone(clone, base_bio);
-	} else
-		clone = bio_alloc_bioset(GFP_NOIO, nr_iovecs, cc->bs);
-
+	clone = bio_alloc_bioset(GFP_NOIO, nr_iovecs, cc->bs);
 	if (!clone)
 		return NULL;
 
-	clone->bi_destructor = dm_crypt_bio_destructor;
+	clone_init(io, clone);
 
 	/* if the last bio was not complete, continue where that one ended */
 	clone->bi_idx = *bio_vec_idx;
@@ -495,9 +491,6 @@ static void dec_pending(struct crypt_io *io, int error)
 	if (!atomic_dec_and_test(&io->pending))
 		return;
 
-	if (io->first_clone)
-		bio_put(io->first_clone);
-
 	bio_endio(io->base_bio, io->base_bio->bi_size, io->error);
 
 	mempool_free(io, cc->io_pool);
@@ -562,6 +555,7 @@ static void clone_init(struct crypt_io *io, struct bio *clone)
 	clone->bi_end_io = crypt_endio;
 	clone->bi_bdev = cc->dev->bdev;
 	clone->bi_rw = io->base_bio->bi_rw;
+	clone->bi_destructor = dm_crypt_bio_destructor;
 }
 
 static void process_read(struct crypt_io *io)
@@ -585,7 +579,6 @@ static void process_read(struct crypt_io *io)
 	}
 
 	clone_init(io, clone);
-	clone->bi_destructor = dm_crypt_bio_destructor;
 	clone->bi_idx = 0;
 	clone->bi_vcnt = bio_segments(base_bio);
 	clone->bi_size = base_bio->bi_size;
@@ -615,8 +608,7 @@ static void process_write(struct crypt_io *io)
 	 * so repeat the whole process until all the data can be handled.
 	 */
 	while (remaining) {
-		clone = crypt_alloc_buffer(cc, base_bio->bi_size,
-					   io->first_clone, &bvec_idx);
+		clone = crypt_alloc_buffer(io, base_bio->bi_size, &bvec_idx);
 		if (unlikely(!clone)) {
 			dec_pending(io, -ENOMEM);
 			return;
@@ -631,31 +623,23 @@ static void process_write(struct crypt_io *io)
 			return;
 		}
 
-		clone_init(io, clone);
 		clone->bi_sector = cc->start + sector;
-
-		if (!io->first_clone) {
-			/*
-			 * hold a reference to the first clone, because it
-			 * holds the bio_vec array and that can't be freed
-			 * before all other clones are released
-			 */
-			bio_get(clone);
-			io->first_clone = clone;
-		}
-
 		remaining -= clone->bi_size;
 		sector += bio_sectors(clone);
 
-		/* prevent bio_put of first_clone */
+		/* Grab another reference to the io struct
+		 * before we kick off the request */
 		if (remaining)
 			atomic_inc(&io->pending);
 
 		generic_make_request(clone);
 
+		/* Do not reference clone after this - it
+		 * may be gone already. */
+
 		/* out of memory -> run queues */
 		if (remaining)
-			congestion_wait(bio_data_dir(clone), HZ/100);
+			congestion_wait(WRITE, HZ/100);
 	}
 }
@@ -954,10 +938,12 @@ static int crypt_map(struct dm_target *ti, struct bio *bio,
 	struct crypt_config *cc = ti->private;
 	struct crypt_io *io;
 
+	if (bio_barrier(bio))
+		return -EOPNOTSUPP;
+
 	io = mempool_alloc(cc->io_pool, GFP_NOIO);
 	io->target = ti;
 	io->base_bio = bio;
-	io->first_clone = NULL;
 	io->error = io->post_process = 0;
 	atomic_set(&io->pending, 0);
 	kcryptd_queue_io(io);
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index dfe3214..2c404f7 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -415,7 +415,7 @@ static int raid0_make_request (request_queue_t *q, struct bio *bio)
 	raid0_conf_t *conf = mddev_to_conf(mddev);
 	struct strip_zone *zone;
 	mdk_rdev_t *tmp_dev;
-	unsigned long chunk;
+	sector_t chunk;
 	sector_t block, rsect;
 	const int rw = bio_data_dir(bio);
 
@@ -470,7 +470,6 @@ static int raid0_make_request (request_queue_t *q, struct bio *bio)
 
 		sector_div(x, zone->nb_dev);
 		chunk = x;
-		BUG_ON(x != (sector_t)chunk);
 
 		x = block >> chunksize_bits;
 		tmp_dev = zone->dev[sector_div(x, zone->nb_dev)];
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 97ee870..b20c6e9 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1235,17 +1235,24 @@ static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio)
 		}
 	r1_bio->read_disk = primary;
 	for (i=0; i<mddev->raid_disks; i++)
-		if (r1_bio->bios[i]->bi_end_io == end_sync_read &&
-		    test_bit(BIO_UPTODATE, &r1_bio->bios[i]->bi_flags)) {
+		if (r1_bio->bios[i]->bi_end_io == end_sync_read) {
 			int j;
 			int vcnt = r1_bio->sectors >> (PAGE_SHIFT-9);
 			struct bio *pbio = r1_bio->bios[primary];
 			struct bio *sbio = r1_bio->bios[i];
-			for (j = vcnt; j-- ; )
-				if (memcmp(page_address(pbio->bi_io_vec[j].bv_page),
-					   page_address(sbio->bi_io_vec[j].bv_page),
-					   PAGE_SIZE))
-					break;
+
+			if (test_bit(BIO_UPTODATE, &sbio->bi_flags)) {
+				for (j = vcnt; j-- ; ) {
+					struct page *p, *s;
+					p = pbio->bi_io_vec[j].bv_page;
+					s = sbio->bi_io_vec[j].bv_page;
+					if (memcmp(page_address(p),
+						   page_address(s),
+						   PAGE_SIZE))
+						break;
+				}
+			} else
+				j = 0;
 			if (j >= 0)
 				mddev->resync_mismatches += r1_bio->sectors;
 			if (j < 0 || test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) {
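On the new raid1 comparison: during a check/repair pass the primary copy is compared page by page against every other in-sync copy, and a copy whose read failed is now treated as mismatching at page 0 so it gets rewritten instead of silently skipped. A stand-alone sketch of that loop (illustrative C, fixed sizes assumed, not the kernel code):

#include <stdio.h>
#include <string.h>

#define PAGE_SIZE 4096
#define VCNT 4

static int compare_copy(char primary[VCNT][PAGE_SIZE],
			char secondary[VCNT][PAGE_SIZE], int uptodate)
{
	int j;

	if (uptodate) {
		/* walk the pages backwards, as sync_request_write() does */
		for (j = VCNT; j--; )
			if (memcmp(primary[j], secondary[j], PAGE_SIZE))
				break;
	} else
		j = 0;	/* read failed: force a rewrite of this copy */

	return j >= 0;	/* non-negative j == mismatch found */
}

int main(void)
{
	static char p[VCNT][PAGE_SIZE], s[VCNT][PAGE_SIZE];

	printf("identical:  mismatch=%d\n", compare_copy(p, s, 1));
	s[2][17] = 1;
	printf("corrupted:  mismatch=%d\n", compare_copy(p, s, 1));
	printf("read error: mismatch=%d\n", compare_copy(p, s, 0));
	return 0;
}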
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 82249a6..9eb66c1 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1867,6 +1867,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
 			int d = r10_bio->devs[i].devnum;
 			bio = r10_bio->devs[i].bio;
 			bio->bi_end_io = NULL;
+			clear_bit(BIO_UPTODATE, &bio->bi_flags);
 			if (conf->mirrors[d].rdev == NULL ||
 			    test_bit(Faulty, &conf->mirrors[d].rdev->flags))
 				continue;
@@ -2037,6 +2038,11 @@ static int run(mddev_t *mddev)
 	/* 'size' is now the number of chunks in the array */
 	/* calculate "used chunks per device" in 'stride' */
 	stride = size * conf->copies;
+
+	/* We need to round up when dividing by raid_disks to
+	 * get the stride size.
+	 */
+	stride += conf->raid_disks - 1;
 	sector_div(stride, conf->raid_disks);
 	mddev->size = stride << (conf->chunk_shift-1);
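The raid10 hunk replaces a truncating division with a round-up: sector_div() rounds down, so without the `raid_disks - 1` bias the per-device stride can come out one chunk short and the tail of the array falls outside it. A quick stand-alone check of the arithmetic (illustrative values):

#include <stdio.h>

int main(void)
{
	unsigned long long size = 1003, copies = 2, raid_disks = 4;
	unsigned long long chunks = size * copies;

	unsigned long long down = chunks / raid_disks;
	unsigned long long up   = (chunks + raid_disks - 1) / raid_disks;

	printf("truncated:  %llu per disk covers %llu chunks (need %llu)\n",
	       down, down * raid_disks, chunks);
	printf("rounded up: %llu per disk covers %llu chunks\n",
	       up, up * raid_disks);
	return 0;
}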
diff --git a/drivers/media/video/saa7134/saa7134-tvaudio.c b/drivers/media/video/saa7134/saa7134-tvaudio.c
index dd759d6..36b3fa3 100644
--- a/drivers/media/video/saa7134/saa7134-tvaudio.c
+++ b/drivers/media/video/saa7134/saa7134-tvaudio.c
@@ -1006,7 +1006,7 @@ int saa7134_tvaudio_init2(struct saa7134_dev *dev)
 int saa7134_tvaudio_fini(struct saa7134_dev *dev)
 {
 	/* shutdown tvaudio thread */
-	if (dev->thread.pid >= 0) {
+	if (dev->thread.pid > 0) {
 		dev->thread.shutdown = 1;
 		wake_up_interruptible(&dev->thread.wq);
 		wait_for_completion(&dev->thread.exit);
diff --git a/drivers/net/e1000/e1000_main.c b/drivers/net/e1000/e1000_main.c
index c6259c7..40bdcf9 100644
--- a/drivers/net/e1000/e1000_main.c
+++ b/drivers/net/e1000/e1000_main.c
@@ -1157,13 +1157,16 @@ e1000_probe(struct pci_dev *pdev,
 	    !e1000_check_mng_mode(&adapter->hw))
 		e1000_get_hw_control(adapter);
 
-	strcpy(netdev->name, "eth%d");
-	if ((err = register_netdev(netdev)))
-		goto err_register;
-
 	/* tell the stack to leave us alone until e1000_open() is called */
 	netif_carrier_off(netdev);
 	netif_stop_queue(netdev);
+#ifdef CONFIG_E1000_NAPI
+	netif_poll_disable(netdev);
+#endif
+
+	strcpy(netdev->name, "eth%d");
+	if ((err = register_netdev(netdev)))
+		goto err_register;
 
 	DPRINTK(PROBE, INFO, "Intel(R) PRO/1000 Network Connection\n");
diff --git a/drivers/net/sky2.c b/drivers/net/sky2.c
index 38e75cf..aec8c59 100644
--- a/drivers/net/sky2.c
+++ b/drivers/net/sky2.c
@@ -95,7 +95,7 @@ static int disable_msi = 0;
 module_param(disable_msi, int, 0);
 MODULE_PARM_DESC(disable_msi, "Disable Message Signaled Interrupt (MSI)");
 
-static int idle_timeout = 0;
+static int idle_timeout = 100;
 module_param(idle_timeout, int, 0);
 MODULE_PARM_DESC(idle_timeout, "Watchdog timer for lost interrupts (ms)");
 
@@ -2341,6 +2341,13 @@ static int sky2_poll(struct net_device *dev0, int *budget)
 
 	work_done = sky2_status_intr(hw, work_limit);
 	if (work_done < work_limit) {
+		/* Bug/Errata workaround?
+		 * Need to kick the TX irq moderation timer.
+		 */
+		if (sky2_read8(hw, STAT_TX_TIMER_CTRL) == TIM_START) {
+			sky2_write8(hw, STAT_TX_TIMER_CTRL, TIM_STOP);
+			sky2_write8(hw, STAT_TX_TIMER_CTRL, TIM_START);
+		}
 		netif_rx_complete(dev0);
 
 		sky2_read32(hw, B0_Y2_SP_LISR);
diff --git a/drivers/serial/mpsc.c b/drivers/serial/mpsc.c
index 3d2fcc5..64ed5ef 100644
--- a/drivers/serial/mpsc.c
+++ b/drivers/serial/mpsc.c
@@ -502,7 +502,8 @@ mpsc_sdma_intr_ack(struct mpsc_port_info *pi)
 	if (pi->mirror_regs)
 		pi->shared_regs->SDMA_INTR_CAUSE_m = 0;
-	writel(0, pi->shared_regs->sdma_intr_base + SDMA_INTR_CAUSE);
+	writeb(0x00, pi->shared_regs->sdma_intr_base + SDMA_INTR_CAUSE +
+	       pi->port.line);
 	return;
 }
diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h
index 2275f27..8f820e4 100644
--- a/include/linux/bootmem.h
+++ b/include/linux/bootmem.h
@@ -59,6 +59,7 @@ extern void *__alloc_bootmem_core(struct bootmem_data *bdata,
 				  unsigned long align,
 				  unsigned long goal,
 				  unsigned long limit);
+extern void *alloc_bootmem_high_node(pg_data_t *pgdat, unsigned long size);
 
 #ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE
 extern void reserve_bootmem(unsigned long addr, unsigned long size);
diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index d37f46a..f768e10 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -2235,11 +2235,11 @@
 #define PCI_DEVICE_ID_INTEL_ICH8_5	0x283e
 #define PCI_DEVICE_ID_INTEL_ICH8_6	0x2850
 #define PCI_DEVICE_ID_INTEL_ICH9_0	0x2910
-#define PCI_DEVICE_ID_INTEL_ICH9_1	0x2911
+#define PCI_DEVICE_ID_INTEL_ICH9_1	0x2917
 #define PCI_DEVICE_ID_INTEL_ICH9_2	0x2912
 #define PCI_DEVICE_ID_INTEL_ICH9_3	0x2913
 #define PCI_DEVICE_ID_INTEL_ICH9_4	0x2914
-#define PCI_DEVICE_ID_INTEL_ICH9_5	0x2915
+#define PCI_DEVICE_ID_INTEL_ICH9_5	0x2919
 #define PCI_DEVICE_ID_INTEL_ICH9_6	0x2930
 #define PCI_DEVICE_ID_INTEL_82855PM_HB	0x3340
 #define PCI_DEVICE_ID_INTEL_82830_HB	0x3575
diff --git a/include/linux/raid/bitmap.h b/include/linux/raid/bitmap.h
index 6db9a4c..dd5a05d 100644
--- a/include/linux/raid/bitmap.h
+++ b/include/linux/raid/bitmap.h
@@ -232,6 +232,7 @@ struct bitmap {
 	struct page **filemap; /* list of cache pages for the file */
 	unsigned long *filemap_attr; /* attributes associated w/ filemap pages */
 	unsigned long file_pages; /* number of pages in the file */
+	int last_page_size; /* bytes in the last page */
 
 	unsigned long flags;
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 4463735..7a0cc67 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1137,6 +1137,7 @@ static inline void put_task_struct(struct task_struct *t)
 					/* Not implemented yet, only for 486*/
 #define PF_STARTING	0x00000002	/* being created */
 #define PF_EXITING	0x00000004	/* getting shut down */
+#define PF_EXITPIDONE	0x00000008	/* pi exit done on shut down */
 #define PF_FORKNOEXEC	0x00000040	/* forked but didn't exec */
 #define PF_SUPERPRIV	0x00000100	/* used super-user privileges */
 #define PF_DUMPCORE	0x00000200	/* dumped core */
diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
index 2a7b38d..1a76bda 100644
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -162,7 +162,7 @@ extern struct workqueue_struct *__create_workqueue(const char *name,
 						    int singlethread,
 						    int freezeable);
 #define create_workqueue(name) __create_workqueue((name), 0, 0)
-#define create_freezeable_workqueue(name) __create_workqueue((name), 0, 1)
+#define create_freezeable_workqueue(name) __create_workqueue((name), 1, 1)
 #define create_singlethread_workqueue(name) __create_workqueue((name), 1, 0)
 
 extern void destroy_workqueue(struct workqueue_struct *wq);
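The workqueue.h one-liner is easy to misread: create_freezeable_workqueue() was expanding with singlethread=0, freezeable=1, i.e. a multi-threaded queue, and the fix flips the first flag. A tiny stand-alone model of the positional-flag pitfall (hypothetical names, not the kernel API):

#include <stdio.h>

static void make_queue(const char *name, int singlethread, int freezeable)
{
	printf("%s: singlethread=%d freezeable=%d\n",
	       name, singlethread, freezeable);
}

/* before the fix: freezeable queues were silently multi-threaded */
#define create_freezeable_queue_old(name) make_queue((name), 0, 1)
/* after the fix: freezeable implies single-threaded here */
#define create_freezeable_queue_new(name) make_queue((name), 1, 1)

int main(void)
{
	create_freezeable_queue_old("old");
	create_freezeable_queue_new("new");
	return 0;
}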
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 9c8c232..5a75657 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -905,7 +905,7 @@ static void audit_update_watch(struct audit_parent *parent,
 
 		/* If the update involves invalidating rules, do the inode-based
 		 * filtering now, so we don't omit records. */
-		if (invalidating &&
+		if (invalidating && current->audit_context &&
 		    audit_filter_inodes(current, current->audit_context) == AUDIT_RECORD_CONTEXT)
 			audit_set_auditable(current->audit_context);
diff --git a/kernel/exit.c b/kernel/exit.c
index fec12eb..d306845 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -883,13 +883,29 @@ fastcall NORET_TYPE void do_exit(long code)
 	if (unlikely(tsk->flags & PF_EXITING)) {
 		printk(KERN_ALERT "Fixing recursive fault but reboot is needed!\n");
+		/*
+		 * We can do this unlocked here. The futex code uses
+		 * this flag just to verify whether the pi state
+		 * cleanup has been done or not. In the worst case it
+		 * loops once more. We pretend that the cleanup was
+		 * done as there is no way to return. Either the
+		 * OWNER_DIED bit is set by now or we push the blocked
+		 * task into the wait-forever nirvana as well.
+		 */
+		tsk->flags |= PF_EXITPIDONE;
 		if (tsk->io_context)
 			exit_io_context();
 		set_current_state(TASK_UNINTERRUPTIBLE);
 		schedule();
 	}
 
+	/*
+	 * tsk->flags are checked in the futex code to protect against
+	 * an exiting task cleaning up the robust pi futexes.
+	 */
+	spin_lock_irq(&tsk->pi_lock);
 	tsk->flags |= PF_EXITING;
+	spin_unlock_irq(&tsk->pi_lock);
 
 	if (unlikely(in_atomic()))
 		printk(KERN_INFO "note: %s[%d] exited with preempt_count %d\n",
@@ -956,6 +972,12 @@ fastcall NORET_TYPE void do_exit(long code)
 	 * Make sure we are holding no locks:
 	 */
 	debug_check_no_locks_held(tsk);
+	/*
+	 * We can do this unlocked here. The futex code uses this flag
+	 * just to verify whether the pi state cleanup has been done
+	 * or not. In the worst case it loops once more.
+	 */
+	tsk->flags |= PF_EXITPIDONE;
 
 	if (tsk->io_context)
 		exit_io_context();
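The exit.c changes introduce a two-flag handshake: PF_EXITING (set under pi_lock) says cleanup has started, PF_EXITPIDONE says the pi-state cleanup has finished, and the futex code retries in the window between the two. A minimal pthreads model of that protocol (hypothetical names and return codes, not the kernel implementation):

#include <pthread.h>
#include <stdio.h>

#define PF_EXITING    0x04
#define PF_EXITPIDONE 0x08

static pthread_mutex_t pi_lock = PTHREAD_MUTEX_INITIALIZER;
static unsigned int flags;

static void *exiter(void *arg)
{
	pthread_mutex_lock(&pi_lock);
	flags |= PF_EXITING;		/* cleanup begins, under the lock */
	pthread_mutex_unlock(&pi_lock);

	/* ... robust/pi futex cleanup would run here ... */

	flags |= PF_EXITPIDONE;		/* finished; may be set unlocked */
	return NULL;
}

/* 0 = attached, -1 = task gone (-ESRCH-like), -2 = retry (-EAGAIN-like) */
static int try_attach(void)
{
	int ret = 0;

	pthread_mutex_lock(&pi_lock);
	if (flags & PF_EXITING)
		ret = (flags & PF_EXITPIDONE) ? -1 : -2;
	pthread_mutex_unlock(&pi_lock);
	return ret;
}

int main(void)
{
	pthread_t t;

	printf("before exit: %d\n", try_attach());
	pthread_create(&t, NULL, exiter, NULL);
	pthread_join(t, NULL);
	printf("after exit:  %d\n", try_attach());
	return 0;
}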
diff --git a/kernel/futex.c b/kernel/futex.c
index 1df411e..99dad33 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -390,18 +390,12 @@ static struct task_struct * futex_find_get_task(pid_t pid)
 
 	rcu_read_lock();
 	p = find_task_by_pid(pid);
-	if (!p)
-		goto out_unlock;
-	if ((current->euid != p->euid) && (current->euid != p->uid)) {
-		p = NULL;
-		goto out_unlock;
-	}
-	if (p->exit_state != 0) {
-		p = NULL;
-		goto out_unlock;
-	}
-	get_task_struct(p);
-out_unlock:
+
+	if (!p || ((current->euid != p->euid) && (current->euid != p->uid)))
+		p = ERR_PTR(-ESRCH);
+	else
+		get_task_struct(p);
+
 	rcu_read_unlock();
 
 	return p;
@@ -467,7 +461,7 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me)
 	struct futex_q *this, *next;
 	struct list_head *head;
 	struct task_struct *p;
-	pid_t pid;
+	pid_t pid = uval & FUTEX_TID_MASK;
 
 	head = &hb->chain;
 
@@ -485,6 +479,8 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me)
 				return -EINVAL;
 
 			WARN_ON(!atomic_read(&pi_state->refcount));
+			WARN_ON(pid && pi_state->owner &&
+				pi_state->owner->pid != pid);
 
 			atomic_inc(&pi_state->refcount);
 			me->pi_state = pi_state;
@@ -495,15 +491,33 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me)
 
 	/*
 	 * We are the first waiter - try to look up the real owner and attach
-	 * the new pi_state to it, but bail out when the owner died bit is set
-	 * and TID = 0:
+	 * the new pi_state to it, but bail out when TID = 0
 	 */
-	pid = uval & FUTEX_TID_MASK;
-	if (!pid && (uval & FUTEX_OWNER_DIED))
+	if (!pid)
 		return -ESRCH;
 	p = futex_find_get_task(pid);
-	if (!p)
-		return -ESRCH;
+	if (IS_ERR(p))
+		return PTR_ERR(p);
+
+	/*
+	 * We need to look at the task state flags to figure out,
+	 * whether the task is exiting. To protect against the do_exit
+	 * change of the task flags, we do this protected by
+	 * p->pi_lock:
+	 */
+	spin_lock_irq(&p->pi_lock);
+	if (unlikely(p->flags & PF_EXITING)) {
+		/*
+		 * The task is on the way out. When PF_EXITPIDONE is
+		 * set, we know that the task has finished the
+		 * cleanup:
+		 */
+		int ret = (p->flags & PF_EXITPIDONE) ? -ESRCH : -EAGAIN;
+
+		spin_unlock_irq(&p->pi_lock);
+		put_task_struct(p);
+		return ret;
+	}
 
 	pi_state = alloc_pi_state();
 
@@ -516,7 +530,6 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me)
 	/* Store the key for possible exit cleanups: */
 	pi_state->key = me->key;
 
-	spin_lock_irq(&p->pi_lock);
 	WARN_ON(!list_empty(&pi_state->list));
 	list_add(&pi_state->list, &p->pi_state_list);
 	pi_state->owner = p;
@@ -583,15 +596,22 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
 	 * preserve the owner died bit.)
 	 */
 	if (!(uval & FUTEX_OWNER_DIED)) {
+		int ret = 0;
+
 		newval = FUTEX_WAITERS | new_owner->pid;
 
 		pagefault_disable();
 		curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval);
 		pagefault_enable();
+
 		if (curval == -EFAULT)
-			return -EFAULT;
+			ret = -EFAULT;
 		if (curval != uval)
-			return -EINVAL;
+			ret = -EINVAL;
+		if (ret) {
+			spin_unlock(&pi_state->pi_mutex.wait_lock);
+			return ret;
+		}
 	}
 
 	spin_lock_irq(&pi_state->owner->pi_lock);
@@ -1149,6 +1169,7 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec,
 	if (unlikely(ret != 0))
 		goto out_release_sem;
 
+ retry_unlocked:
 	hb = queue_lock(&q, -1, NULL);
 
  retry_locked:
@@ -1200,34 +1221,58 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec,
 	ret = lookup_pi_state(uval, hb, &q);
 
 	if (unlikely(ret)) {
-		/*
-		 * There were no waiters and the owner task lookup
-		 * failed. When the OWNER_DIED bit is set, then we
-		 * know that this is a robust futex and we actually
-		 * take the lock. This is safe as we are protected by
-		 * the hash bucket lock. We also set the waiters bit
-		 * unconditionally here, to simplify glibc handling of
-		 * multiple tasks racing to acquire the lock and
-		 * cleanup the problems which were left by the dead
-		 * owner.
-		 */
-		if (curval & FUTEX_OWNER_DIED) {
-			uval = newval;
-			newval = current->pid |
-				FUTEX_OWNER_DIED | FUTEX_WAITERS;
+		switch (ret) {
 
-			pagefault_disable();
-			curval = futex_atomic_cmpxchg_inatomic(uaddr,
-							       uval, newval);
-			pagefault_enable();
+		case -EAGAIN:
+			/*
+			 * Task is exiting and we just wait for the
+			 * exit to complete.
+			 */
+			queue_unlock(&q, hb);
+			up_read(&curr->mm->mmap_sem);
+			cond_resched();
+			goto retry;
 
-			if (unlikely(curval == -EFAULT))
+		case -ESRCH:
+			/*
+			 * No owner found for this futex. Check if the
+			 * OWNER_DIED bit is set to figure out whether
+			 * this is a robust futex or not.
+			 */
+			if (get_futex_value_locked(&curval, uaddr))
 				goto uaddr_faulted;
-			if (unlikely(curval != uval))
-				goto retry_locked;
-			ret = 0;
+
+			/*
+			 * There were no waiters and the owner task lookup
+			 * failed. When the OWNER_DIED bit is set, then we
+			 * know that this is a robust futex and we actually
+			 * take the lock. This is safe as we are protected by
+			 * the hash bucket lock. We also set the waiters bit
+			 * unconditionally here, to simplify glibc handling of
+			 * multiple tasks racing to acquire the lock and
+			 * cleanup the problems which were left by the dead
+			 * owner.
+			 */
+			if (curval & FUTEX_OWNER_DIED) {
+				uval = newval;
+				newval = current->pid |
+					FUTEX_OWNER_DIED | FUTEX_WAITERS;
+
+				pagefault_disable();
+				curval = futex_atomic_cmpxchg_inatomic(uaddr,
+								       uval,
+								       newval);
+				pagefault_enable();
+
+				if (unlikely(curval == -EFAULT))
+					goto uaddr_faulted;
+				if (unlikely(curval != uval))
+					goto retry_locked;
+				ret = 0;
+			}
+		default:
+			goto out_unlock_release_sem;
 		}
-		goto out_unlock_release_sem;
 	}
 
 /*
@@ -1279,39 +1324,52 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec,
 			list_add(&q.pi_state->list, &current->pi_state_list);
 			spin_unlock_irq(&current->pi_lock);
 
-			/* Unqueue and drop the lock */
-			unqueue_me_pi(&q, hb);
-			up_read(&curr->mm->mmap_sem);
 			/*
 			 * We own it, so we have to replace the pending owner
-			 * TID. This must be atomic as we have preserve the
+			 * TID. This must be atomic as we have to preserve the
 			 * owner died bit here.
 			 */
-			ret = get_user(uval, uaddr);
+			ret = get_futex_value_locked(&uval, uaddr);
 			while (!ret) {
 				newval = (uval & FUTEX_OWNER_DIED) | newtid;
+
+				pagefault_disable();
 				curval = futex_atomic_cmpxchg_inatomic(uaddr,
 								       uval, newval);
+				pagefault_enable();
+
 				if (curval == -EFAULT)
 					ret = -EFAULT;
 				if (curval == uval)
 					break;
 				uval = curval;
 			}
-		} else {
+		} else if (ret) {
 			/*
 			 * Catch the rare case, where the lock was released
 			 * when we were on the way back before we locked
 			 * the hash bucket.
 			 */
-			if (ret && q.pi_state->owner == curr) {
-				if (rt_mutex_trylock(&q.pi_state->pi_mutex))
-					ret = 0;
+			if (q.pi_state->owner == curr &&
+			    rt_mutex_trylock(&q.pi_state->pi_mutex)) {
+				ret = 0;
+			} else {
+				/*
+				 * Paranoia check. If we did not take the lock
+				 * in the trylock above, then we should not be
+				 * the owner of the rtmutex, neither the real
+				 * nor the pending one:
+				 */
+				if (rt_mutex_owner(&q.pi_state->pi_mutex) == curr)
+					printk(KERN_ERR "futex_lock_pi: ret = %d "
+					       "pi-mutex: %p pi-state %p\n", ret,
+					       q.pi_state->pi_mutex.owner,
+					       q.pi_state->owner);
 			}
-			/* Unqueue and drop the lock */
-			unqueue_me_pi(&q, hb);
-			up_read(&curr->mm->mmap_sem);
 		}
+
+	/* Unqueue and drop the lock */
+	unqueue_me_pi(&q, hb);
+	up_read(&curr->mm->mmap_sem);
 
 	if (!detect && ret == -EDEADLK && 0)
 		force_sig(SIGKILL, current);
@@ -1331,16 +1389,18 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec,
 	 * non-atomically. Therefore, if get_user below is not
 	 * enough, we need to handle the fault ourselves, while
 	 * still holding the mmap_sem.
+	 *
+	 * ... and hb->lock. :-) --ANK
 	 */
+	queue_unlock(&q, hb);
+
 	if (attempt++) {
-		if (futex_handle_fault((unsigned long)uaddr, attempt)) {
-			ret = -EFAULT;
-			goto out_unlock_release_sem;
-		}
-		goto retry_locked;
+		ret = futex_handle_fault((unsigned long)uaddr, attempt);
+		if (ret)
+			goto out_release_sem;
+		goto retry_unlocked;
 	}
 
-	queue_unlock(&q, hb);
 	up_read(&curr->mm->mmap_sem);
 
 	ret = get_user(uval, uaddr);
@@ -1382,9 +1442,9 @@ retry:
 		goto out;
 
 	hb = hash_futex(&key);
+retry_unlocked:
 	spin_lock(&hb->lock);
 
-retry_locked:
 	/*
 	 * To avoid races, try to do the TID -> 0 atomic transition
 	 * again. If it succeeds then we can return without waking
@@ -1446,16 +1506,17 @@ pi_faulted:
 	 * non-atomically. Therefore, if get_user below is not
 	 * enough, we need to handle the fault ourselves, while
 	 * still holding the mmap_sem.
+	 *
+	 * ... and hb->lock. :-) --ANK
 	 */
+	spin_unlock(&hb->lock);
+
 	if (attempt++) {
-		if (futex_handle_fault((unsigned long)uaddr, attempt)) {
-			ret = -EFAULT;
-			goto out_unlock;
-		}
-		goto retry_locked;
+		ret = futex_handle_fault((unsigned long)uaddr, attempt);
+		if (ret)
+			goto out;
+		goto retry_unlocked;
 	}
-
-	spin_unlock(&hb->lock);
 	up_read(&current->mm->mmap_sem);
 
 	ret = get_user(uval, uaddr);
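futex_find_get_task() now reports failure with ERR_PTR(-ESRCH) instead of a bare NULL, so lookup_pi_state() can propagate a real error code. A stand-alone sketch of the error-pointer convention (simplified from the kernel's include/linux/err.h; hypothetical task type):

#include <stdio.h>

#define MAX_ERRNO 4095

static inline void *ERR_PTR(long error)
{
	return (void *)error;
}

static inline long PTR_ERR(const void *ptr)
{
	return (long)ptr;
}

static inline int IS_ERR(const void *ptr)
{
	/* error codes live in the top 4095 values of the address space */
	return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
}

struct task { char comm[16]; };

static struct task *find_get_task(int pid)
{
	static struct task init_task = { .comm = "init" };

	if (pid == 1)
		return &init_task;
	return ERR_PTR(-3);	/* -ESRCH: no such process */
}

int main(void)
{
	struct task *p = find_get_task(1);

	if (!IS_ERR(p))
		printf("found: %s\n", p->comm);

	p = find_get_task(4242);
	if (IS_ERR(p))
		printf("lookup failed: %ld\n", PTR_ERR(p));
	return 0;
}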
diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c
index 4ab17da..dd5feae 100644
--- a/kernel/rtmutex.c
+++ b/kernel/rtmutex.c
@@ -212,6 +212,19 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
 	if (!waiter || !waiter->task)
 		goto out_unlock_pi;
 
+	/*
+	 * Check the orig_waiter state. After we dropped the locks,
+	 * the previous owner of the lock might have released the lock
+	 * and made us the pending owner:
+	 */
+	if (orig_waiter && !orig_waiter->task)
+		goto out_unlock_pi;
+
+	/*
+	 * Drop out, when the task has no waiters. Note,
+	 * top_waiter can be NULL, when we are in the deboosting
+	 * mode!
+	 */
 	if (top_waiter && (!task_has_pi_waiters(task) ||
 			   top_waiter != task_top_pi_waiter(task)))
 		goto out_unlock_pi;
@@ -659,9 +672,16 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
 			 * all over without going into schedule to try
 			 * to get the lock now:
 			 */
-			if (unlikely(!waiter.task))
+			if (unlikely(!waiter.task)) {
+				/*
+				 * Reset the return value. We might
+				 * have returned with -EDEADLK and the
+				 * owner released the lock while we
+				 * were walking the pi chain.
+				 */
+				ret = 0;
 				continue;
-
+			}
 			if (unlikely(ret))
 				break;
 		}
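The rt_mutex_slowlock() hunk fixes a retry-loop leak: an -EDEADLK left over from the pi-chain walk survived the `continue` and could be returned even though the lock was eventually taken, so ret is now reset before retrying. A small stand-alone model of the pattern (illustrative names, not the rtmutex code):

#include <stdio.h>

#define EDEADLK 35

static int takes;
static int try_take(void)		/* fails first, succeeds on retry */
{
	return takes++ > 0;
}

static int slowlock(int reset_on_retry)
{
	int ret = 0, waiter_queued = 0;

	takes = 0;
	for (;;) {
		if (try_take())
			break;		/* got the lock */

		ret = -EDEADLK;		/* chain walk reported a cycle */

		if (!waiter_queued) {
			/* the lock was handed to us while walking the
			 * chain: start over, optionally clearing the
			 * stale error code (the fix) */
			if (reset_on_retry)
				ret = 0;
			waiter_queued = 1;
			continue;
		}
		if (ret)
			break;
	}
	return ret;
}

int main(void)
{
	printf("without reset: %d (lock held, error leaked)\n", slowlock(0));
	printf("with reset:    %d\n", slowlock(1));
	return 0;
}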
diff --git a/kernel/sched.c b/kernel/sched.c
index 62db30c..907ab05 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2814,17 +2814,21 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
 	unsigned long next_balance = jiffies + 60 * HZ;
 
 	for_each_domain(this_cpu, sd) {
-		if (sd->flags & SD_BALANCE_NEWIDLE) {
+		unsigned long interval;
+
+		if (!(sd->flags & SD_LOAD_BALANCE))
+			continue;
+
+		if (sd->flags & SD_BALANCE_NEWIDLE)
 			/* If we've pulled tasks over stop searching: */
 			pulled_task = load_balance_newidle(this_cpu,
-							this_rq, sd);
-			if (time_after(next_balance,
-				  sd->last_balance + sd->balance_interval))
-				next_balance = sd->last_balance
-						+ sd->balance_interval;
-			if (pulled_task)
-				break;
-		}
+								this_rq, sd);
+
+		interval = msecs_to_jiffies(sd->balance_interval);
+		if (time_after(next_balance, sd->last_balance + interval))
+			next_balance = sd->last_balance + interval;
+		if (pulled_task)
+			break;
 	}
 	if (!pulled_task)
 		/*
diff --git a/mm/rmap.c b/mm/rmap.c
index 7ce69c1..c30781c 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -53,24 +53,6 @@
 
 struct kmem_cache *anon_vma_cachep;
 
-static inline void validate_anon_vma(struct vm_area_struct *find_vma)
-{
-#ifdef CONFIG_DEBUG_VM
-	struct anon_vma *anon_vma = find_vma->anon_vma;
-	struct vm_area_struct *vma;
-	unsigned int mapcount = 0;
-	int found = 0;
-
-	list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
-		mapcount++;
-		BUG_ON(mapcount > 100000);
-		if (vma == find_vma)
-			found = 1;
-	}
-	BUG_ON(!found);
-#endif
-}
-
 /* This must be called under the mmap_sem. */
 int anon_vma_prepare(struct vm_area_struct *vma)
 {
@@ -121,10 +103,8 @@ void __anon_vma_link(struct vm_area_struct *vma)
 {
 	struct anon_vma *anon_vma = vma->anon_vma;
 
-	if (anon_vma) {
+	if (anon_vma)
 		list_add_tail(&vma->anon_vma_node, &anon_vma->head);
-		validate_anon_vma(vma);
-	}
 }
 
 void anon_vma_link(struct vm_area_struct *vma)
@@ -134,7 +114,6 @@ void anon_vma_link(struct vm_area_struct *vma)
 	if (anon_vma) {
 		spin_lock(&anon_vma->lock);
 		list_add_tail(&vma->anon_vma_node, &anon_vma->head);
-		validate_anon_vma(vma);
 		spin_unlock(&anon_vma->lock);
 	}
 }
@@ -148,7 +127,6 @@ void anon_vma_unlink(struct vm_area_struct *vma)
 		return;
 
 	spin_lock(&anon_vma->lock);
-	validate_anon_vma(vma);
 	list_del(&vma->anon_vma_node);
 
 	/* We must garbage collect the anon_vma if it's empty */
diff --git a/mm/sparse.c b/mm/sparse.c
index ac26eb0..faa08e2 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -209,6 +209,12 @@ static int sparse_init_one_section(struct mem_section *ms,
 	return 1;
 }
 
+__attribute__((weak))
+void *alloc_bootmem_high_node(pg_data_t *pgdat, unsigned long size)
+{
+	return NULL;
+}
+
 static struct page *sparse_early_mem_map_alloc(unsigned long pnum)
 {
 	struct page *map;
@@ -219,6 +225,11 @@ static struct page *sparse_early_mem_map_alloc(unsigned long pnum)
 	if (map)
 		return map;
 
+	map = alloc_bootmem_high_node(NODE_DATA(nid),
+			sizeof(struct page) * PAGES_PER_SECTION);
+	if (map)
+		return map;
+
 	map = alloc_bootmem_node(NODE_DATA(nid),
 			sizeof(struct page) * PAGES_PER_SECTION);
 	if (map)
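The sparse.c hunk relies on weak linkage: the __attribute__((weak)) stub returning NULL serves every architecture that does not provide the strong x86_64 definition added in arch/x86_64/mm/init.c above. A single-file demo of the fallback mechanism (GCC/Clang; hypothetical names):

#include <stdio.h>

__attribute__((weak))
const char *backend_name(void)
{
	return "generic fallback";
}

/* Uncommenting this strong definition (or linking another object that
 * provides one) silently replaces the weak default:
 *
 * const char *backend_name(void) { return "arch-specific"; }
 */

int main(void)
{
	printf("using: %s\n", backend_name());
	return 0;
}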