Subject: xen3 common From: http://xenbits.xensource.com/linux-2.6.18-xen.hg (tip 517:d71965a78c20) Patch-mainline: obsolete Acked-by: jbeulich@novell.com List of files that don't require modification anymore (and hence removed from this patch), for reference and in case upstream wants to take the forward-porting patches: 2.6.25/mm/highmem.c --- drivers/Makefile | 1 drivers/acpi/hardware/hwsleep.c | 15 drivers/acpi/sleep/main.c | 11 drivers/char/agp/intel-agp.c | 10 drivers/char/mem.c | 6 drivers/char/tpm/Makefile | 2 drivers/char/tpm/tpm.h | 15 drivers/char/tpm/tpm_vtpm.c | 542 +++++++++++++++++++++++++ drivers/char/tpm/tpm_vtpm.h | 55 ++ drivers/char/tpm/tpm_xen.c | 722 ++++++++++++++++++++++++++++++++++ drivers/ide/ide-lib.c | 8 drivers/oprofile/buffer_sync.c | 87 +++- drivers/oprofile/cpu_buffer.c | 51 +- drivers/oprofile/cpu_buffer.h | 9 drivers/oprofile/event_buffer.h | 3 drivers/oprofile/oprof.c | 30 + drivers/oprofile/oprof.h | 3 drivers/oprofile/oprofile_files.c | 201 +++++++++ drivers/pci/bus.c | 7 drivers/pci/quirks.c | 34 + fs/aio.c | 120 +++++ fs/compat_ioctl.c | 19 fs/splice.c | 3 include/asm-generic/pci.h | 2 include/asm-generic/pgtable.h | 4 include/linux/aio.h | 5 include/linux/interrupt.h | 6 include/linux/kexec.h | 17 include/linux/mm.h | 7 include/linux/oprofile.h | 12 include/linux/page-flags.h | 15 include/linux/sched.h | 5 include/linux/skbuff.h | 8 include/linux/vermagic.h | 8 kernel/irq/spurious.c | 2 kernel/kexec.c | 71 ++- kernel/softlockup.c | 13 kernel/sysctl.c | 2 kernel/timer.c | 8 mm/memory.c | 38 + mm/mprotect.c | 2 mm/page_alloc.c | 30 + net/core/dev.c | 62 ++ net/core/skbuff.c | 4 net/ipv4/netfilter/nf_nat_proto_tcp.c | 3 net/ipv4/netfilter/nf_nat_proto_udp.c | 4 net/ipv4/xfrm4_output.c | 2 scripts/Makefile.build | 14 scripts/Makefile.lib | 6 49 files changed, 2226 insertions(+), 78 deletions(-) --- a/drivers/Makefile +++ b/drivers/Makefile @@ -34,6 +34,7 @@ obj-$(CONFIG_NUBUS) += nubus/ obj-$(CONFIG_ATM) += atm/ obj-y += macintosh/ +obj-$(CONFIG_XEN) += xen/ obj-$(CONFIG_IDE) += ide/ obj-$(CONFIG_SCSI) += scsi/ obj-$(CONFIG_ATA) += ata/ --- a/drivers/acpi/hardware/hwsleep.c +++ b/drivers/acpi/hardware/hwsleep.c @@ -252,7 +252,11 @@ u32 PM1Bcontrol; struct acpi_bit_register_info *sleep_type_reg_info; struct acpi_bit_register_info *sleep_enable_reg_info; +#if !(defined(CONFIG_XEN) && defined(CONFIG_X86)) u32 in_value; +#else + int err; +#endif struct acpi_object_list arg_list; union acpi_object arg; acpi_status status; @@ -362,6 +366,7 @@ ACPI_FLUSH_CPU_CACHE(); +#if !(defined(CONFIG_XEN) && defined(CONFIG_X86)) status = acpi_hw_register_write(ACPI_REGISTER_PM1A_CONTROL, PM1Acontrol); if (ACPI_FAILURE(status)) { @@ -408,6 +413,16 @@ /* Spin until we wake */ } while (!in_value); +#else + /* PV ACPI just needs to check the hypercall return value */ + err = acpi_notify_hypervisor_state(sleep_state, + PM1Acontrol, PM1Bcontrol); + if (err) { + ACPI_DEBUG_PRINT((ACPI_DB_ERROR, + "Hypervisor failure [%d]\n", err)); + return_ACPI_STATUS(AE_ERROR); + } +#endif return_ACPI_STATUS(AE_OK); } --- a/drivers/acpi/sleep/main.c +++ b/drivers/acpi/sleep/main.c @@ -31,6 +31,7 @@ static int acpi_sleep_prepare(u32 acpi_state) { #ifdef CONFIG_ACPI_SLEEP +#ifndef CONFIG_ACPI_PV_SLEEP /* do we have a wakeup address for S2 and S3?
*/ if (acpi_state == ACPI_STATE_S3) { if (!acpi_wakeup_address) { @@ -41,6 +42,7 @@ acpi_wakeup_address)); } +#endif ACPI_FLUSH_CPU_CACHE(); acpi_enable_wakeup_device_prep(acpi_state); #endif @@ -137,7 +139,14 @@ break; case ACPI_STATE_S3: +#ifdef CONFIG_ACPI_PV_SLEEP + /* Hypervisor will save and restore CPU context + * and then we can skip low-level housekeeping here. + */ + acpi_enter_sleep_state(acpi_state); +#else do_suspend_lowlevel(); +#endif break; } @@ -187,7 +196,7 @@ acpi_target_sleep_state = ACPI_STATE_S0; -#ifdef CONFIG_X86 +#if defined(CONFIG_X86) && !defined(CONFIG_ACPI_PV_SLEEP) if (init_8259A_after_S1) { printk("Broken toshiba laptop -> kicking interrupts\n"); init_8259A(0); --- a/drivers/char/agp/intel-agp.c +++ b/drivers/char/agp/intel-agp.c @@ -230,6 +230,13 @@ if (page == NULL) return NULL; +#ifdef CONFIG_XEN + if (xen_create_contiguous_region((unsigned long)page_address(page), 2, 32)) { + __free_pages(page, 2); + return NULL; + } +#endif + if (set_pages_uc(page, 4) < 0) { set_pages_wb(page, 4); __free_pages(page, 2); @@ -249,6 +256,9 @@ page = virt_to_page(addr); set_pages_wb(page, 4); +#ifdef CONFIG_XEN + xen_destroy_contiguous_region((unsigned long)page_address(page), 2); +#endif put_page(page); __free_pages(page, 2); atomic_dec(&agp_bridge->current_memory_agp); --- a/drivers/char/mem.c +++ b/drivers/char/mem.c @@ -108,6 +108,7 @@ } #endif +#ifndef ARCH_HAS_DEV_MEM /* * This funcion reads the *physical* memory. The f_pos points directly to the * memory location. @@ -230,6 +231,7 @@ *ppos += written; return written; } +#endif #ifndef __HAVE_PHYS_MEM_ACCESS_PROT static pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, @@ -725,6 +727,7 @@ #define open_kmem open_mem #define open_oldmem open_mem +#ifndef ARCH_HAS_DEV_MEM static const struct file_operations mem_fops = { .llseek = memory_lseek, .read = read_mem, @@ -733,6 +736,9 @@ .open = open_mem, .get_unmapped_area = get_unmapped_area_mem, }; +#else +extern const struct file_operations mem_fops; +#endif static const struct file_operations kmem_fops = { .llseek = memory_lseek, --- a/drivers/char/tpm/Makefile +++ b/drivers/char/tpm/Makefile @@ -9,3 +9,5 @@ obj-$(CONFIG_TCG_NSC) += tpm_nsc.o obj-$(CONFIG_TCG_ATMEL) += tpm_atmel.o obj-$(CONFIG_TCG_INFINEON) += tpm_infineon.o +obj-$(CONFIG_TCG_XEN) += tpm_xenu.o +tpm_xenu-y = tpm_xen.o tpm_vtpm.o --- a/drivers/char/tpm/tpm.h +++ b/drivers/char/tpm/tpm.h @@ -107,6 +107,9 @@ struct dentry **bios_dir; struct list_head list; +#ifdef CONFIG_XEN + void *priv; +#endif void (*release) (struct device *); }; @@ -124,6 +127,18 @@ outb(value & 0xFF, base+1); } +#ifdef CONFIG_XEN +static inline void *chip_get_private(const struct tpm_chip *chip) +{ + return chip->priv; +} + +static inline void chip_set_private(struct tpm_chip *chip, void *priv) +{ + chip->priv = priv; +} +#endif + extern void tpm_get_timeouts(struct tpm_chip *); extern void tpm_gen_interrupt(struct tpm_chip *); extern void tpm_continue_selftest(struct tpm_chip *); --- /dev/null +++ b/drivers/char/tpm/tpm_vtpm.c @@ -0,0 +1,542 @@ +/* + * Copyright (C) 2006 IBM Corporation + * + * Authors: + * Stefan Berger + * + * Generic device driver part for device drivers in a virtualized + * environment. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation, version 2 of the + * License.
+ * + */ + +#include +#include +#include +#include +#include +#include "tpm.h" +#include "tpm_vtpm.h" + +/* read status bits */ +enum { + STATUS_BUSY = 0x01, + STATUS_DATA_AVAIL = 0x02, + STATUS_READY = 0x04 +}; + +struct transmission { + struct list_head next; + + unsigned char *request; + size_t request_len; + size_t request_buflen; + + unsigned char *response; + size_t response_len; + size_t response_buflen; + + unsigned int flags; +}; + +enum { + TRANSMISSION_FLAG_WAS_QUEUED = 0x1 +}; + + +enum { + DATAEX_FLAG_QUEUED_ONLY = 0x1 +}; + + +/* local variables */ + +/* local function prototypes */ +static int _vtpm_send_queued(struct tpm_chip *chip); + + +/* ============================================================= + * Some utility functions + * ============================================================= + */ +static void vtpm_state_init(struct vtpm_state *vtpms) +{ + vtpms->current_request = NULL; + spin_lock_init(&vtpms->req_list_lock); + init_waitqueue_head(&vtpms->req_wait_queue); + INIT_LIST_HEAD(&vtpms->queued_requests); + + vtpms->current_response = NULL; + spin_lock_init(&vtpms->resp_list_lock); + init_waitqueue_head(&vtpms->resp_wait_queue); + + vtpms->disconnect_time = jiffies; +} + + +static inline struct transmission *transmission_alloc(void) +{ + return kzalloc(sizeof(struct transmission), GFP_ATOMIC); +} + +static unsigned char * +transmission_set_req_buffer(struct transmission *t, + unsigned char *buffer, size_t len) +{ + if (t->request_buflen < len) { + kfree(t->request); + t->request = kmalloc(len, GFP_KERNEL); + if (!t->request) { + t->request_buflen = 0; + return NULL; + } + t->request_buflen = len; + } + + memcpy(t->request, buffer, len); + t->request_len = len; + + return t->request; +} + +static unsigned char * +transmission_set_res_buffer(struct transmission *t, + const unsigned char *buffer, size_t len) +{ + if (t->response_buflen < len) { + kfree(t->response); + t->response = kmalloc(len, GFP_ATOMIC); + if (!t->response) { + t->response_buflen = 0; + return NULL; + } + t->response_buflen = len; + } + + memcpy(t->response, buffer, len); + t->response_len = len; + + return t->response; +} + +static inline void transmission_free(struct transmission *t) +{ + kfree(t->request); + kfree(t->response); + kfree(t); +} + +/* ============================================================= + * Interface with the lower layer driver + * ============================================================= + */ +/* + * Lower layer uses this function to make a response available. + */ +int vtpm_vd_recv(const struct tpm_chip *chip, + const unsigned char *buffer, size_t count, + void *ptr) +{ + unsigned long flags; + int ret_size = 0; + struct transmission *t; + struct vtpm_state *vtpms; + + vtpms = (struct vtpm_state *)chip_get_private(chip); + + /* + * The list with requests must contain one request + * only and the element there must be the one that + * was passed to me from the front-end. 
+ */ + spin_lock_irqsave(&vtpms->resp_list_lock, flags); + if (vtpms->current_request != ptr) { + spin_unlock_irqrestore(&vtpms->resp_list_lock, flags); + return 0; + } + + if ((t = vtpms->current_request)) { + transmission_free(t); + vtpms->current_request = NULL; + } + + t = transmission_alloc(); + if (t) { + if (!transmission_set_res_buffer(t, buffer, count)) { + transmission_free(t); + spin_unlock_irqrestore(&vtpms->resp_list_lock, flags); + return -ENOMEM; + } + ret_size = count; + vtpms->current_response = t; + wake_up_interruptible(&vtpms->resp_wait_queue); + } + spin_unlock_irqrestore(&vtpms->resp_list_lock, flags); + + return ret_size; +} + + +/* + * Lower layer indicates its status (connected/disconnected) + */ +void vtpm_vd_status(const struct tpm_chip *chip, u8 vd_status) +{ + struct vtpm_state *vtpms; + + vtpms = (struct vtpm_state *)chip_get_private(chip); + + vtpms->vd_status = vd_status; + if ((vtpms->vd_status & TPM_VD_STATUS_CONNECTED) == 0) { + vtpms->disconnect_time = jiffies; + } +} + +/* ============================================================= + * Interface with the generic TPM driver + * ============================================================= + */ +static int vtpm_recv(struct tpm_chip *chip, u8 *buf, size_t count) +{ + int rc = 0; + unsigned long flags; + struct vtpm_state *vtpms; + + vtpms = (struct vtpm_state *)chip_get_private(chip); + + /* + * Check if the previous operation only queued the command + * In this case there won't be a response, so I just + * return from here and reset that flag. In any other + * case I should receive a response from the back-end. + */ + spin_lock_irqsave(&vtpms->resp_list_lock, flags); + if ((vtpms->flags & DATAEX_FLAG_QUEUED_ONLY) != 0) { + vtpms->flags &= ~DATAEX_FLAG_QUEUED_ONLY; + spin_unlock_irqrestore(&vtpms->resp_list_lock, flags); + /* + * The first few commands (measurements) must be + * queued since it might not be possible to talk to the + * TPM, yet. + * Return a response of up to 30 '0's. + */ + + count = min_t(size_t, count, 30); + memset(buf, 0x0, count); + return count; + } + /* + * Check whether something is in the responselist and if + * there's nothing in the list wait for something to appear. + */ + + if (!vtpms->current_response) { + spin_unlock_irqrestore(&vtpms->resp_list_lock, flags); + interruptible_sleep_on_timeout(&vtpms->resp_wait_queue, + 1000); + spin_lock_irqsave(&vtpms->resp_list_lock ,flags); + } + + if (vtpms->current_response) { + struct transmission *t = vtpms->current_response; + vtpms->current_response = NULL; + rc = min(count, t->response_len); + memcpy(buf, t->response, rc); + transmission_free(t); + } + + spin_unlock_irqrestore(&vtpms->resp_list_lock, flags); + return rc; +} + +static int vtpm_send(struct tpm_chip *chip, u8 *buf, size_t count) +{ + int rc = 0; + unsigned long flags; + struct transmission *t = transmission_alloc(); + struct vtpm_state *vtpms; + + vtpms = (struct vtpm_state *)chip_get_private(chip); + + if (!t) + return -ENOMEM; + /* + * If there's a current request, it must be the + * previous request that has timed out. 
*/ + spin_lock_irqsave(&vtpms->req_list_lock, flags); + if (vtpms->current_request != NULL) { + printk("WARNING: Sending although there is a request outstanding.\n" + " Previous request must have timed out.\n"); + transmission_free(vtpms->current_request); + vtpms->current_request = NULL; + } + spin_unlock_irqrestore(&vtpms->req_list_lock, flags); + + /* + * Queue the packet if the driver below is not + * ready yet, or there is any packet already + * in the queue. + * If the driver below is ready, unqueue all + * packets first before sending our current + * packet. + * For each unqueued packet, except for the + * last (=current) packet, call the function + * tpm_xen_recv to wait for the response to come + * back. + */ + if ((vtpms->vd_status & TPM_VD_STATUS_CONNECTED) == 0) { + if (time_after(jiffies, + vtpms->disconnect_time + HZ * 10)) { + rc = -ENOENT; + } else { + goto queue_it; + } + } else { + /* + * Send all queued packets. + */ + if (_vtpm_send_queued(chip) == 0) { + + vtpms->current_request = t; + + rc = vtpm_vd_send(vtpms->tpm_private, + buf, + count, + t); + /* + * The generic TPM driver will call + * the function to receive the response. + */ + if (rc < 0) { + vtpms->current_request = NULL; + goto queue_it; + } + } else { +queue_it: + if (!transmission_set_req_buffer(t, buf, count)) { + transmission_free(t); + rc = -ENOMEM; + goto exit; + } + /* + * An error occurred. Don't even try + * to send the current request. Just + * queue it. + */ + spin_lock_irqsave(&vtpms->req_list_lock, flags); + vtpms->flags |= DATAEX_FLAG_QUEUED_ONLY; + list_add_tail(&t->next, &vtpms->queued_requests); + spin_unlock_irqrestore(&vtpms->req_list_lock, flags); + } + } + +exit: + return rc; +} + + +/* + * Send all queued requests. + */ +static int _vtpm_send_queued(struct tpm_chip *chip) +{ + int rc; + int error = 0; + unsigned long flags; + unsigned char buffer[1]; + struct vtpm_state *vtpms; + vtpms = (struct vtpm_state *)chip_get_private(chip); + + spin_lock_irqsave(&vtpms->req_list_lock, flags); + + while (!list_empty(&vtpms->queued_requests)) { + /* + * Need to dequeue them. + * Read the result into a dummy buffer. + */ + struct transmission *qt = (struct transmission *) + vtpms->queued_requests.next; + list_del(&qt->next); + vtpms->current_request = qt; + spin_unlock_irqrestore(&vtpms->req_list_lock, flags); + + rc = vtpm_vd_send(vtpms->tpm_private, + qt->request, + qt->request_len, + qt); + + if (rc < 0) { + spin_lock_irqsave(&vtpms->req_list_lock, flags); + if ((qt = vtpms->current_request) != NULL) { + /* + * requeue it at the beginning + * of the list + */ + list_add(&qt->next, + &vtpms->queued_requests); + } + vtpms->current_request = NULL; + error = 1; + break; + } + /* + * After this point qt is not valid anymore!
+ * It is freed when the front-end is delivering + * the data by calling tpm_recv + */ + /* + * Receive response into provided dummy buffer + */ + rc = vtpm_recv(chip, buffer, sizeof(buffer)); + spin_lock_irqsave(&vtpms->req_list_lock, flags); + } + + spin_unlock_irqrestore(&vtpms->req_list_lock, flags); + + return error; +} + +static void vtpm_cancel(struct tpm_chip *chip) +{ + unsigned long flags; + struct vtpm_state *vtpms = (struct vtpm_state *)chip_get_private(chip); + + spin_lock_irqsave(&vtpms->resp_list_lock,flags); + + if (!vtpms->current_response && vtpms->current_request) { + spin_unlock_irqrestore(&vtpms->resp_list_lock, flags); + interruptible_sleep_on(&vtpms->resp_wait_queue); + spin_lock_irqsave(&vtpms->resp_list_lock,flags); + } + + if (vtpms->current_response) { + struct transmission *t = vtpms->current_response; + vtpms->current_response = NULL; + transmission_free(t); + } + + spin_unlock_irqrestore(&vtpms->resp_list_lock,flags); +} + +static u8 vtpm_status(struct tpm_chip *chip) +{ + u8 rc = 0; + unsigned long flags; + struct vtpm_state *vtpms; + + vtpms = (struct vtpm_state *)chip_get_private(chip); + + spin_lock_irqsave(&vtpms->resp_list_lock, flags); + /* + * Data are available if: + * - there's a current response + * - the last packet was queued only (this is fake, but necessary to + * get the generic TPM layer to call the receive function.) + */ + if (vtpms->current_response || + 0 != (vtpms->flags & DATAEX_FLAG_QUEUED_ONLY)) { + rc = STATUS_DATA_AVAIL; + } else if (!vtpms->current_response && !vtpms->current_request) { + rc = STATUS_READY; + } + + spin_unlock_irqrestore(&vtpms->resp_list_lock, flags); + return rc; +} + +static struct file_operations vtpm_ops = { + .owner = THIS_MODULE, + .llseek = no_llseek, + .open = tpm_open, + .read = tpm_read, + .write = tpm_write, + .release = tpm_release, +}; + +static DEVICE_ATTR(pubek, S_IRUGO, tpm_show_pubek, NULL); +static DEVICE_ATTR(pcrs, S_IRUGO, tpm_show_pcrs, NULL); +static DEVICE_ATTR(enabled, S_IRUGO, tpm_show_enabled, NULL); +static DEVICE_ATTR(active, S_IRUGO, tpm_show_active, NULL); +static DEVICE_ATTR(owned, S_IRUGO, tpm_show_owned, NULL); +static DEVICE_ATTR(temp_deactivated, S_IRUGO, tpm_show_temp_deactivated, + NULL); +static DEVICE_ATTR(caps, S_IRUGO, tpm_show_caps, NULL); +static DEVICE_ATTR(cancel, S_IWUSR |S_IWGRP, NULL, tpm_store_cancel); + +static struct attribute *vtpm_attrs[] = { + &dev_attr_pubek.attr, + &dev_attr_pcrs.attr, + &dev_attr_enabled.attr, + &dev_attr_active.attr, + &dev_attr_owned.attr, + &dev_attr_temp_deactivated.attr, + &dev_attr_caps.attr, + &dev_attr_cancel.attr, + NULL, +}; + +static struct attribute_group vtpm_attr_grp = { .attrs = vtpm_attrs }; + +#define TPM_LONG_TIMEOUT (10 * 60 * HZ) + +static struct tpm_vendor_specific tpm_vtpm = { + .recv = vtpm_recv, + .send = vtpm_send, + .cancel = vtpm_cancel, + .status = vtpm_status, + .req_complete_mask = STATUS_BUSY | STATUS_DATA_AVAIL, + .req_complete_val = STATUS_DATA_AVAIL, + .req_canceled = STATUS_READY, + .attr_group = &vtpm_attr_grp, + .miscdev = { + .fops = &vtpm_ops, + }, + .duration = { + TPM_LONG_TIMEOUT, + TPM_LONG_TIMEOUT, + TPM_LONG_TIMEOUT, + }, +}; + +struct tpm_chip *init_vtpm(struct device *dev, + struct tpm_private *tp) +{ + long rc; + struct tpm_chip *chip; + struct vtpm_state *vtpms; + + vtpms = kzalloc(sizeof(struct vtpm_state), GFP_KERNEL); + if (!vtpms) + return ERR_PTR(-ENOMEM); + + vtpm_state_init(vtpms); + vtpms->tpm_private = tp; + + chip = tpm_register_hardware(dev, &tpm_vtpm); + if (!chip) { + rc = -ENODEV; 
+ goto err_free_mem; + } + + chip_set_private(chip, vtpms); + + return chip; + +err_free_mem: + kfree(vtpms); + + return ERR_PTR(rc); +} + +void cleanup_vtpm(struct device *dev) +{ + struct tpm_chip *chip = dev_get_drvdata(dev); + struct vtpm_state *vtpms = (struct vtpm_state*)chip_get_private(chip); + tpm_remove_hardware(dev); + kfree(vtpms); +} --- /dev/null +++ b/drivers/char/tpm/tpm_vtpm.h @@ -0,0 +1,55 @@ +#ifndef TPM_VTPM_H +#define TPM_VTPM_H + +struct tpm_chip; +struct tpm_private; + +struct vtpm_state { + struct transmission *current_request; + spinlock_t req_list_lock; + wait_queue_head_t req_wait_queue; + + struct list_head queued_requests; + + struct transmission *current_response; + spinlock_t resp_list_lock; + wait_queue_head_t resp_wait_queue; // processes waiting for responses + + u8 vd_status; + u8 flags; + + unsigned long disconnect_time; + + /* + * The following is a private structure of the underlying + * driver. It is passed as parameter in the send function. + */ + struct tpm_private *tpm_private; +}; + + +enum vdev_status { + TPM_VD_STATUS_DISCONNECTED = 0x0, + TPM_VD_STATUS_CONNECTED = 0x1 +}; + +/* this function is called from tpm_vtpm.c */ +int vtpm_vd_send(struct tpm_private * tp, + const u8 * buf, size_t count, void *ptr); + +/* these functions are offered by tpm_vtpm.c */ +struct tpm_chip *init_vtpm(struct device *, + struct tpm_private *); +void cleanup_vtpm(struct device *); +int vtpm_vd_recv(const struct tpm_chip* chip, + const unsigned char *buffer, size_t count, void *ptr); +void vtpm_vd_status(const struct tpm_chip *, u8 status); + +static inline struct tpm_private *tpm_private_from_dev(struct device *dev) +{ + struct tpm_chip *chip = dev_get_drvdata(dev); + struct vtpm_state *vtpms = chip_get_private(chip); + return vtpms->tpm_private; +} + +#endif --- /dev/null +++ b/drivers/char/tpm/tpm_xen.c @@ -0,0 +1,722 @@ +/* + * Copyright (c) 2005, IBM Corporation + * + * Author: Stefan Berger, stefanb@us.ibm.com + * Grant table support: Mahadevan Gomathisankaran + * + * This code has been derived from drivers/xen/netfront/netfront.c + * + * Copyright (c) 2002-2004, K A Fraser + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "tpm.h" +#include "tpm_vtpm.h" + +#undef DEBUG + +/* local structures */ +struct tpm_private { + struct tpm_chip *chip; + + tpmif_tx_interface_t *tx; + atomic_t refcnt; + unsigned int irq; + u8 is_connected; + u8 is_suspended; + + spinlock_t tx_lock; + + struct tx_buffer *tx_buffers[TPMIF_TX_RING_SIZE]; + + atomic_t tx_busy; + void *tx_remember; + + domid_t backend_id; + wait_queue_head_t wait_q; + + struct xenbus_device *dev; + int ring_ref; +}; + +struct tx_buffer { + unsigned int size; // available space in data + unsigned int len; // used space in data + unsigned char *data; // pointer to a page +}; + + +/* locally visible variables */ +static grant_ref_t gref_head; +static struct tpm_private *my_priv; + +/* local function prototypes */ +static irqreturn_t tpmif_int(int irq, + void *tpm_priv, + struct pt_regs *ptregs); +static void tpmif_rx_action(unsigned long unused); +static int tpmif_connect(struct xenbus_device *dev, + struct tpm_private *tp, + domid_t domid); +static DECLARE_TASKLET(tpmif_rx_tasklet, tpmif_rx_action, 0); +static int tpmif_allocate_tx_buffers(struct tpm_private *tp); +static void tpmif_free_tx_buffers(struct tpm_private *tp); +static void tpmif_set_connected_state(struct tpm_private *tp, + u8 newstate); +static int tpm_xmit(struct tpm_private *tp, + const u8 * buf, size_t count, int userbuffer, + void *remember); +static void destroy_tpmring(struct tpm_private *tp); +void __exit tpmif_exit(void); + +#define DPRINTK(fmt, args...) \ + pr_debug("xen_tpm_fr (%s:%d) " fmt, __FUNCTION__, __LINE__, ##args) +#define IPRINTK(fmt, args...) \ + printk(KERN_INFO "xen_tpm_fr: " fmt, ##args) +#define WPRINTK(fmt, args...) \ + printk(KERN_WARNING "xen_tpm_fr: " fmt, ##args) + +#define GRANT_INVALID_REF 0 + + +static inline int +tx_buffer_copy(struct tx_buffer *txb, const u8 *src, int len, + int isuserbuffer) +{ + int copied = len; + + if (len > txb->size) + copied = txb->size; + if (isuserbuffer) { + if (copy_from_user(txb->data, src, copied)) + return -EFAULT; + } else { + memcpy(txb->data, src, copied); + } + txb->len = len; + return copied; +} + +static inline struct tx_buffer *tx_buffer_alloc(void) +{ + struct tx_buffer *txb; + + txb = kzalloc(sizeof(struct tx_buffer), GFP_KERNEL); + if (!txb) + return NULL; + + txb->len = 0; + txb->size = PAGE_SIZE; + txb->data = (unsigned char *)__get_free_page(GFP_KERNEL); + if (txb->data == NULL) { + kfree(txb); + txb = NULL; + } + + return txb; +} + + +static inline void tx_buffer_free(struct tx_buffer *txb) +{ + if (txb) { + free_page((long)txb->data); + kfree(txb); + } +} + +/************************************************************** + Utility function for the tpm_private structure +**************************************************************/ +static void tpm_private_init(struct tpm_private *tp) +{ + spin_lock_init(&tp->tx_lock); + init_waitqueue_head(&tp->wait_q); + atomic_set(&tp->refcnt, 1); +} + +static void tpm_private_put(void) +{ + if (!atomic_dec_and_test(&my_priv->refcnt)) + return; + + tpmif_free_tx_buffers(my_priv); + kfree(my_priv); + my_priv = NULL; +} + +static struct tpm_private *tpm_private_get(void) +{ + int err; + + if (my_priv) { + atomic_inc(&my_priv->refcnt); + return my_priv; + } + + my_priv = kzalloc(sizeof(struct tpm_private), GFP_KERNEL); + if (!my_priv) + return NULL; + + tpm_private_init(my_priv); + err = tpmif_allocate_tx_buffers(my_priv); + if (err < 0) + tpm_private_put(); + + return 
my_priv; +} + +/************************************************************** + + The interface to let the tpm plugin register its callback + function and send data to another partition using this module + +**************************************************************/ + +static DEFINE_MUTEX(suspend_lock); +/* + * Send data via this module by calling this function + */ +int vtpm_vd_send(struct tpm_private *tp, + const u8 * buf, size_t count, void *ptr) +{ + int sent; + + mutex_lock(&suspend_lock); + sent = tpm_xmit(tp, buf, count, 0, ptr); + mutex_unlock(&suspend_lock); + + return sent; +} + +/************************************************************** + XENBUS support code +**************************************************************/ + +static int setup_tpmring(struct xenbus_device *dev, + struct tpm_private *tp) +{ + tpmif_tx_interface_t *sring; + int err; + + tp->ring_ref = GRANT_INVALID_REF; + + sring = (void *)__get_free_page(GFP_KERNEL); + if (!sring) { + xenbus_dev_fatal(dev, -ENOMEM, "allocating shared ring"); + return -ENOMEM; + } + tp->tx = sring; + + err = xenbus_grant_ring(dev, virt_to_mfn(tp->tx)); + if (err < 0) { + free_page((unsigned long)sring); + tp->tx = NULL; + xenbus_dev_fatal(dev, err, "allocating grant reference"); + goto fail; + } + tp->ring_ref = err; + + err = tpmif_connect(dev, tp, dev->otherend_id); + if (err) + goto fail; + + return 0; +fail: + destroy_tpmring(tp); + return err; +} + + +static void destroy_tpmring(struct tpm_private *tp) +{ + tpmif_set_connected_state(tp, 0); + + if (tp->ring_ref != GRANT_INVALID_REF) { + gnttab_end_foreign_access(tp->ring_ref, (unsigned long)tp->tx); + tp->ring_ref = GRANT_INVALID_REF; + tp->tx = NULL; + } + + if (tp->irq) + unbind_from_irqhandler(tp->irq, tp); + + tp->irq = 0; +} + + +static int talk_to_backend(struct xenbus_device *dev, + struct tpm_private *tp) +{ + const char *message = NULL; + int err; + struct xenbus_transaction xbt; + + err = setup_tpmring(dev, tp); + if (err) { + xenbus_dev_fatal(dev, err, "setting up ring"); + goto out; + } + +again: + err = xenbus_transaction_start(&xbt); + if (err) { + xenbus_dev_fatal(dev, err, "starting transaction"); + goto destroy_tpmring; + } + + err = xenbus_printf(xbt, dev->nodename, + "ring-ref","%u", tp->ring_ref); + if (err) { + message = "writing ring-ref"; + goto abort_transaction; + } + + err = xenbus_printf(xbt, dev->nodename, "event-channel", "%u", + irq_to_evtchn_port(tp->irq)); + if (err) { + message = "writing event-channel"; + goto abort_transaction; + } + + err = xenbus_transaction_end(xbt, 0); + if (err == -EAGAIN) + goto again; + if (err) { + xenbus_dev_fatal(dev, err, "completing transaction"); + goto destroy_tpmring; + } + + xenbus_switch_state(dev, XenbusStateConnected); + + return 0; + +abort_transaction: + xenbus_transaction_end(xbt, 1); + if (message) + xenbus_dev_error(dev, err, "%s", message); +destroy_tpmring: + destroy_tpmring(tp); +out: + return err; +} + +/** + * Callback received when the backend's state changes. 
+ */ +static void backend_changed(struct xenbus_device *dev, + enum xenbus_state backend_state) +{ + struct tpm_private *tp = tpm_private_from_dev(&dev->dev); + DPRINTK("\n"); + + switch (backend_state) { + case XenbusStateInitialising: + case XenbusStateInitWait: + case XenbusStateInitialised: + case XenbusStateReconfiguring: + case XenbusStateReconfigured: + case XenbusStateUnknown: + break; + + case XenbusStateConnected: + tpmif_set_connected_state(tp, 1); + break; + + case XenbusStateClosing: + tpmif_set_connected_state(tp, 0); + xenbus_frontend_closed(dev); + break; + + case XenbusStateClosed: + tpmif_set_connected_state(tp, 0); + if (tp->is_suspended == 0) + device_unregister(&dev->dev); + xenbus_frontend_closed(dev); + break; + } +} + +static int tpmfront_probe(struct xenbus_device *dev, + const struct xenbus_device_id *id) +{ + int err; + int handle; + struct tpm_private *tp = tpm_private_get(); + + if (!tp) + return -ENOMEM; + + tp->chip = init_vtpm(&dev->dev, tp); + if (IS_ERR(tp->chip)) + return PTR_ERR(tp->chip); + + err = xenbus_scanf(XBT_NIL, dev->nodename, + "handle", "%i", &handle); + if (XENBUS_EXIST_ERR(err)) + return err; + + if (err < 0) { + xenbus_dev_fatal(dev,err,"reading virtual-device"); + return err; + } + + tp->dev = dev; + + err = talk_to_backend(dev, tp); + if (err) { + tpm_private_put(); + return err; + } + + return 0; +} + + +static int tpmfront_remove(struct xenbus_device *dev) +{ + struct tpm_private *tp = tpm_private_from_dev(&dev->dev); + destroy_tpmring(tp); + cleanup_vtpm(&dev->dev); + return 0; +} + +static int tpmfront_suspend(struct xenbus_device *dev) +{ + struct tpm_private *tp = tpm_private_from_dev(&dev->dev); + u32 ctr; + + /* Take the lock, preventing any application from sending. */ + mutex_lock(&suspend_lock); + tp->is_suspended = 1; + + for (ctr = 0; atomic_read(&tp->tx_busy); ctr++) { + if ((ctr % 10) == 0) + printk("TPM-FE [INFO]: Waiting for outstanding " + "request.\n"); + /* Wait for a request to be responded to. */ + interruptible_sleep_on_timeout(&tp->wait_q, 100); + } + + return 0; +} + +static int tpmfront_suspend_finish(struct tpm_private *tp) +{ + tp->is_suspended = 0; + /* Allow applications to send again. 
*/ + mutex_unlock(&suspend_lock); + return 0; +} + +static int tpmfront_suspend_cancel(struct xenbus_device *dev) +{ + struct tpm_private *tp = tpm_private_from_dev(&dev->dev); + return tpmfront_suspend_finish(tp); +} + +static int tpmfront_resume(struct xenbus_device *dev) +{ + struct tpm_private *tp = tpm_private_from_dev(&dev->dev); + destroy_tpmring(tp); + return talk_to_backend(dev, tp); +} + +static int tpmif_connect(struct xenbus_device *dev, + struct tpm_private *tp, + domid_t domid) +{ + int err; + + tp->backend_id = domid; + + err = bind_listening_port_to_irqhandler( + domid, tpmif_int, SA_SAMPLE_RANDOM, "tpmif", tp); + if (err <= 0) { + WPRINTK("bind_listening_port_to_irqhandler failed " + "(err=%d)\n", err); + return err; + } + tp->irq = err; + + return 0; +} + +static struct xenbus_device_id tpmfront_ids[] = { + { "vtpm" }, + { "" } +}; + +static struct xenbus_driver tpmfront = { + .name = "vtpm", + .owner = THIS_MODULE, + .ids = tpmfront_ids, + .probe = tpmfront_probe, + .remove = tpmfront_remove, + .resume = tpmfront_resume, + .otherend_changed = backend_changed, + .suspend = tpmfront_suspend, + .suspend_cancel = tpmfront_suspend_cancel, +}; + +static void __init init_tpm_xenbus(void) +{ + xenbus_register_frontend(&tpmfront); +} + +static int tpmif_allocate_tx_buffers(struct tpm_private *tp) +{ + unsigned int i; + + for (i = 0; i < TPMIF_TX_RING_SIZE; i++) { + tp->tx_buffers[i] = tx_buffer_alloc(); + if (!tp->tx_buffers[i]) { + tpmif_free_tx_buffers(tp); + return -ENOMEM; + } + } + return 0; +} + +static void tpmif_free_tx_buffers(struct tpm_private *tp) +{ + unsigned int i; + + for (i = 0; i < TPMIF_TX_RING_SIZE; i++) + tx_buffer_free(tp->tx_buffers[i]); +} + +static void tpmif_rx_action(unsigned long priv) +{ + struct tpm_private *tp = (struct tpm_private *)priv; + int i = 0; + unsigned int received; + unsigned int offset = 0; + u8 *buffer; + tpmif_tx_request_t *tx = &tp->tx->ring[i].req; + + atomic_set(&tp->tx_busy, 0); + wake_up_interruptible(&tp->wait_q); + + received = tx->size; + + buffer = kmalloc(received, GFP_ATOMIC); + if (!buffer) + return; + + for (i = 0; i < TPMIF_TX_RING_SIZE && offset < received; i++) { + struct tx_buffer *txb = tp->tx_buffers[i]; + tpmif_tx_request_t *tx; + unsigned int tocopy; + + tx = &tp->tx->ring[i].req; + tocopy = tx->size; + if (tocopy > PAGE_SIZE) + tocopy = PAGE_SIZE; + + memcpy(&buffer[offset], txb->data, tocopy); + + gnttab_release_grant_reference(&gref_head, tx->ref); + + offset += tocopy; + } + + vtpm_vd_recv(tp->chip, buffer, received, tp->tx_remember); + kfree(buffer); +} + + +static irqreturn_t tpmif_int(int irq, void *tpm_priv, struct pt_regs *ptregs) +{ + struct tpm_private *tp = tpm_priv; + unsigned long flags; + + spin_lock_irqsave(&tp->tx_lock, flags); + tpmif_rx_tasklet.data = (unsigned long)tp; + tasklet_schedule(&tpmif_rx_tasklet); + spin_unlock_irqrestore(&tp->tx_lock, flags); + + return IRQ_HANDLED; +} + + +static int tpm_xmit(struct tpm_private *tp, + const u8 * buf, size_t count, int isuserbuffer, + void *remember) +{ + tpmif_tx_request_t *tx; + TPMIF_RING_IDX i; + unsigned int offset = 0; + + spin_lock_irq(&tp->tx_lock); + + if (unlikely(atomic_read(&tp->tx_busy))) { + printk("tpm_xmit: There's an outstanding request/response " + "on the way!\n"); + spin_unlock_irq(&tp->tx_lock); + return -EBUSY; + } + + if (tp->is_connected != 1) { + spin_unlock_irq(&tp->tx_lock); + return -EIO; + } + + for (i = 0; count > 0 && i < TPMIF_TX_RING_SIZE; i++) { + struct tx_buffer *txb = tp->tx_buffers[i]; + int copied; + + if (!txb) 
{ + DPRINTK("txb (i=%d) is NULL. buffers initilized?\n" + "Not transmitting anything!\n", i); + spin_unlock_irq(&tp->tx_lock); + return -EFAULT; + } + + copied = tx_buffer_copy(txb, &buf[offset], count, + isuserbuffer); + if (copied < 0) { + /* An error occurred */ + spin_unlock_irq(&tp->tx_lock); + return copied; + } + count -= copied; + offset += copied; + + tx = &tp->tx->ring[i].req; + tx->addr = virt_to_machine(txb->data); + tx->size = txb->len; + tx->unused = 0; + + DPRINTK("First 4 characters sent by TPM-FE are " + "0x%02x 0x%02x 0x%02x 0x%02x\n", + txb->data[0],txb->data[1],txb->data[2],txb->data[3]); + + /* Get the granttable reference for this page. */ + tx->ref = gnttab_claim_grant_reference(&gref_head); + if (tx->ref == -ENOSPC) { + spin_unlock_irq(&tp->tx_lock); + DPRINTK("Grant table claim reference failed in " + "func:%s line:%d file:%s\n", + __FUNCTION__, __LINE__, __FILE__); + return -ENOSPC; + } + gnttab_grant_foreign_access_ref(tx->ref, + tp->backend_id, + virt_to_mfn(txb->data), + 0 /*RW*/); + wmb(); + } + + atomic_set(&tp->tx_busy, 1); + tp->tx_remember = remember; + + mb(); + + notify_remote_via_irq(tp->irq); + + spin_unlock_irq(&tp->tx_lock); + return offset; +} + + +static void tpmif_notify_upperlayer(struct tpm_private *tp) +{ + /* Notify upper layer about the state of the connection to the BE. */ + vtpm_vd_status(tp->chip, (tp->is_connected + ? TPM_VD_STATUS_CONNECTED + : TPM_VD_STATUS_DISCONNECTED)); +} + + +static void tpmif_set_connected_state(struct tpm_private *tp, u8 is_connected) +{ + /* + * Don't notify upper layer if we are in suspend mode and + * should disconnect - assumption is that we will resume + * The mutex keeps apps from sending. + */ + if (is_connected == 0 && tp->is_suspended == 1) + return; + + /* + * Unlock the mutex if we are connected again + * after being suspended - now resuming. + * This also removes the suspend state. + */ + if (is_connected == 1 && tp->is_suspended == 1) + tpmfront_suspend_finish(tp); + + if (is_connected != tp->is_connected) { + tp->is_connected = is_connected; + tpmif_notify_upperlayer(tp); + } +} + + + +/* ================================================================= + * Initialization function. + * ================================================================= + */ + + +static int __init tpmif_init(void) +{ + struct tpm_private *tp; + + if (is_initial_xendomain()) + return -EPERM; + + tp = tpm_private_get(); + if (!tp) + return -ENOMEM; + + IPRINTK("Initialising the vTPM driver.\n"); + if (gnttab_alloc_grant_references(TPMIF_TX_RING_SIZE, + &gref_head) < 0) { + tpm_private_put(); + return -EFAULT; + } + + init_tpm_xenbus(); + return 0; +} + + +module_init(tpmif_init); + +MODULE_LICENSE("Dual BSD/GPL"); --- a/drivers/ide/ide-lib.c +++ b/drivers/ide/ide-lib.c @@ -336,12 +336,12 @@ { u64 addr = BLK_BOUNCE_HIGH; /* dma64_addr_t */ - if (!PCI_DMA_BUS_IS_PHYS) { - addr = BLK_BOUNCE_ANY; - } else if (on && drive->media == ide_disk) { + if (on && drive->media == ide_disk) { struct device *dev = drive->hwif->dev; - if (dev && dev->dma_mask) + if (!PCI_DMA_BUS_IS_PHYS) + addr = BLK_BOUNCE_ANY; + else if (dev && dev->dma_mask) addr = *dev->dma_mask; } --- a/drivers/oprofile/buffer_sync.c +++ b/drivers/oprofile/buffer_sync.c @@ -6,6 +6,10 @@ * * @author John Levon * + * Modified by Aravind Menon for Xen + * These modifications are: + * Copyright (C) 2005 Hewlett-Packard Co. + * * This is the core of the buffer management. Each * CPU buffer is processed and entered into the * global event buffer. 
Such processing is necessary @@ -40,6 +44,7 @@ static DEFINE_SPINLOCK(task_mortuary); static void process_task_mortuary(void); +static int cpu_current_domain[NR_CPUS]; /* Take ownership of the task struct and place it on the * list for processing. Only after two full buffer syncs @@ -148,6 +153,11 @@ int sync_start(void) { int err; + int i; + + for (i = 0; i < NR_CPUS; i++) { + cpu_current_domain[i] = COORDINATOR_DOMAIN; + } start_cpu_work(); @@ -274,15 +284,31 @@ last_cookie = INVALID_COOKIE; } -static void add_kernel_ctx_switch(unsigned int in_kernel) +static void add_cpu_mode_switch(unsigned int cpu_mode) { add_event_entry(ESCAPE_CODE); - if (in_kernel) - add_event_entry(KERNEL_ENTER_SWITCH_CODE); - else - add_event_entry(KERNEL_EXIT_SWITCH_CODE); + switch (cpu_mode) { + case CPU_MODE_USER: + add_event_entry(USER_ENTER_SWITCH_CODE); + break; + case CPU_MODE_KERNEL: + add_event_entry(KERNEL_ENTER_SWITCH_CODE); + break; + case CPU_MODE_XEN: + add_event_entry(XEN_ENTER_SWITCH_CODE); + break; + default: + break; + } } - + +static void add_domain_switch(unsigned long domain_id) +{ + add_event_entry(ESCAPE_CODE); + add_event_entry(DOMAIN_SWITCH_CODE); + add_event_entry(domain_id); +} + static void add_user_ctx_switch(struct task_struct const * task, unsigned long cookie) { @@ -347,9 +373,9 @@ * for later lookup from userspace. */ static int -add_sample(struct mm_struct * mm, struct op_sample * s, int in_kernel) +add_sample(struct mm_struct * mm, struct op_sample * s, int cpu_mode) { - if (in_kernel) { + if (cpu_mode >= CPU_MODE_KERNEL) { add_sample_entry(s->eip, s->event); return 1; } else if (mm) { @@ -495,15 +521,21 @@ struct mm_struct *mm = NULL; struct task_struct * new; unsigned long cookie = 0; - int in_kernel = 1; + int cpu_mode = 1; unsigned int i; sync_buffer_state state = sb_buffer_start; unsigned long available; + int domain_switch = 0; mutex_lock(&buffer_mutex); add_cpu_switch(cpu); + /* We need to assign the first samples in this CPU buffer to the + same domain that we were processing at the last sync_buffer */ + if (cpu_current_domain[cpu] != COORDINATOR_DOMAIN) { + add_domain_switch(cpu_current_domain[cpu]); + } /* Remember, only we can modify tail_pos */ available = get_slots(cpu_buf); @@ -511,16 +543,18 @@ for (i = 0; i < available; ++i) { struct op_sample * s = &cpu_buf->buffer[cpu_buf->tail_pos]; - if (is_code(s->eip)) { - if (s->event <= CPU_IS_KERNEL) { - /* kernel/userspace switch */ - in_kernel = s->event; + if (is_code(s->eip) && !domain_switch) { + if (s->event <= CPU_MODE_XEN) { + /* xen/kernel/userspace switch */ + cpu_mode = s->event; if (state == sb_buffer_start) state = sb_sample_start; - add_kernel_ctx_switch(s->event); + add_cpu_mode_switch(s->event); } else if (s->event == CPU_TRACE_BEGIN) { state = sb_bt_start; add_trace_begin(); + } else if (s->event == CPU_DOMAIN_SWITCH) { + domain_switch = 1; } else { struct mm_struct * oldmm = mm; @@ -534,11 +568,21 @@ add_user_ctx_switch(new, cookie); } } else { - if (state >= sb_bt_start && - !add_sample(mm, s, in_kernel)) { - if (state == sb_bt_start) { - state = sb_bt_ignore; - atomic_inc(&oprofile_stats.bt_lost_no_mapping); + if (domain_switch) { + cpu_current_domain[cpu] = s->eip; + add_domain_switch(s->eip); + domain_switch = 0; + } else { + if (cpu_current_domain[cpu] != + COORDINATOR_DOMAIN) { + add_sample_entry(s->eip, s->event); + } + else if (state >= sb_bt_start && + !add_sample(mm, s, cpu_mode)) { + if (state == sb_bt_start) { + state = sb_bt_ignore; + atomic_inc(&oprofile_stats.bt_lost_no_mapping); + } } } 
} @@ -547,6 +591,11 @@ } release_mm(mm); + /* We reset domain to COORDINATOR at each CPU switch */ + if (cpu_current_domain[cpu] != COORDINATOR_DOMAIN) { + add_domain_switch(COORDINATOR_DOMAIN); + } + mark_done(cpu); mutex_unlock(&buffer_mutex); --- a/drivers/oprofile/cpu_buffer.c +++ b/drivers/oprofile/cpu_buffer.c @@ -6,6 +6,10 @@ * * @author John Levon * + * Modified by Aravind Menon for Xen + * These modifications are: + * Copyright (C) 2005 Hewlett-Packard Co. + * * Each CPU has a local buffer that stores PC value/event * pairs. We also log context switches when we notice them. * Eventually each CPU's buffer is processed into the global @@ -34,6 +38,8 @@ #define DEFAULT_TIMER_EXPIRE (HZ / 10) static int work_enabled; +static int32_t current_domain = COORDINATOR_DOMAIN; + void free_cpu_buffers(void) { int i; @@ -57,7 +63,7 @@ goto fail; b->last_task = NULL; - b->last_is_kernel = -1; + b->last_cpu_mode = -1; b->tracing = 0; b->buffer_size = buffer_size; b->tail_pos = 0; @@ -115,7 +121,7 @@ * collected will populate the buffer with proper * values to initialize the buffer */ - cpu_buf->last_is_kernel = -1; + cpu_buf->last_cpu_mode = -1; cpu_buf->last_task = NULL; } @@ -165,13 +171,13 @@ * because of the head/tail separation of the writer and reader * of the CPU buffer. * - * is_kernel is needed because on some architectures you cannot + * cpu_mode is needed because on some architectures you cannot * tell if you are in kernel or user space simply by looking at - * pc. We tag this in the buffer by generating kernel enter/exit - * events whenever is_kernel changes + * pc. We tag this in the buffer by generating kernel/user (and xen) + * enter events whenever cpu_mode changes */ static int log_sample(struct oprofile_cpu_buffer * cpu_buf, unsigned long pc, - int is_kernel, unsigned long event) + int cpu_mode, unsigned long event) { struct task_struct * task; @@ -187,18 +193,18 @@ return 0; } - is_kernel = !!is_kernel; - task = current; /* notice a switch from user->kernel or vice versa */ - if (cpu_buf->last_is_kernel != is_kernel) { - cpu_buf->last_is_kernel = is_kernel; - add_code(cpu_buf, is_kernel); + if (cpu_buf->last_cpu_mode != cpu_mode) { + cpu_buf->last_cpu_mode = cpu_mode; + add_code(cpu_buf, cpu_mode); } - + /* notice a task switch */ - if (cpu_buf->last_task != task) { + /* if not processing other domain samples */ + if ((cpu_buf->last_task != task) && + (current_domain == COORDINATOR_DOMAIN)) { cpu_buf->last_task = task; add_code(cpu_buf, (unsigned long)task); } @@ -282,6 +288,25 @@ add_sample(cpu_buf, pc, 0); } +int oprofile_add_domain_switch(int32_t domain_id) +{ + struct oprofile_cpu_buffer * cpu_buf = &cpu_buffer[smp_processor_id()]; + + /* should have space for switching into and out of domain + (2 slots each) plus one sample and one cpu mode switch */ + if (((nr_available_slots(cpu_buf) < 6) && + (domain_id != COORDINATOR_DOMAIN)) || + (nr_available_slots(cpu_buf) < 2)) + return 0; + + add_code(cpu_buf, CPU_DOMAIN_SWITCH); + add_sample(cpu_buf, domain_id, 0); + + current_domain = domain_id; + + return 1; +} + /* * This serves to avoid cpu buffer overflow, and makes sure * the task mortuary progresses --- a/drivers/oprofile/cpu_buffer.h +++ b/drivers/oprofile/cpu_buffer.h @@ -36,7 +36,7 @@ volatile unsigned long tail_pos; unsigned long buffer_size; struct task_struct * last_task; - int last_is_kernel; + int last_cpu_mode; int tracing; struct op_sample * buffer; unsigned long sample_received; @@ -52,7 +52,10 @@ void cpu_buffer_reset(struct oprofile_cpu_buffer * cpu_buf); /* 
transient events for the CPU buffer -> event buffer */ -#define CPU_IS_KERNEL 1 -#define CPU_TRACE_BEGIN 2 +#define CPU_MODE_USER 0 +#define CPU_MODE_KERNEL 1 +#define CPU_MODE_XEN 2 +#define CPU_TRACE_BEGIN 3 +#define CPU_DOMAIN_SWITCH 4 #endif /* OPROFILE_CPU_BUFFER_H */ --- a/drivers/oprofile/event_buffer.h +++ b/drivers/oprofile/event_buffer.h @@ -23,6 +23,9 @@ #define INVALID_COOKIE ~0UL #define NO_COOKIE 0UL +/* Constant used to refer to coordinator domain (Xen) */ +#define COORDINATOR_DOMAIN -1 + extern const struct file_operations event_buffer_fops; /* mutex between sync_cpu_buffers() and the --- a/drivers/oprofile/oprof.c +++ b/drivers/oprofile/oprof.c @@ -5,6 +5,10 @@ * @remark Read the file COPYING * * @author John Levon + * + * Modified by Aravind Menon for Xen + * These modifications are: + * Copyright (C) 2005 Hewlett-Packard Co. */ #include @@ -33,6 +37,32 @@ */ static int timer = 0; +int oprofile_set_active(int active_domains[], unsigned int adomains) +{ + int err; + + if (!oprofile_ops.set_active) + return -EINVAL; + + mutex_lock(&start_mutex); + err = oprofile_ops.set_active(active_domains, adomains); + mutex_unlock(&start_mutex); + return err; +} + +int oprofile_set_passive(int passive_domains[], unsigned int pdomains) +{ + int err; + + if (!oprofile_ops.set_passive) + return -EINVAL; + + mutex_lock(&start_mutex); + err = oprofile_ops.set_passive(passive_domains, pdomains); + mutex_unlock(&start_mutex); + return err; +} + int oprofile_setup(void) { int err; --- a/drivers/oprofile/oprof.h +++ b/drivers/oprofile/oprof.h @@ -35,5 +35,8 @@ void oprofile_timer_init(struct oprofile_operations * ops); int oprofile_set_backtrace(unsigned long depth); + +int oprofile_set_active(int active_domains[], unsigned int adomains); +int oprofile_set_passive(int passive_domains[], unsigned int pdomains); #endif /* OPROF_H */ --- a/drivers/oprofile/oprofile_files.c +++ b/drivers/oprofile/oprofile_files.c @@ -5,15 +5,21 @@ * @remark Read the file COPYING * * @author John Levon + * + * Modified by Aravind Menon for Xen + * These modifications are: + * Copyright (C) 2005 Hewlett-Packard Co. 
*/ #include #include +#include +#include #include "event_buffer.h" #include "oprofile_stats.h" #include "oprof.h" - + unsigned long fs_buffer_size = 131072; unsigned long fs_cpu_buffer_size = 8192; unsigned long fs_buffer_watershed = 32768; /* FIXME: tune */ @@ -117,11 +123,202 @@ static const struct file_operations dump_fops = { .write = dump_write, }; - + +#define TMPBUFSIZE 512 + +static unsigned int adomains = 0; +static int active_domains[MAX_OPROF_DOMAINS + 1]; +static DEFINE_MUTEX(adom_mutex); + +static ssize_t adomain_write(struct file * file, char const __user * buf, + size_t count, loff_t * offset) +{ + char *tmpbuf; + char *startp, *endp; + int i; + unsigned long val; + ssize_t retval = count; + + if (*offset) + return -EINVAL; + if (count > TMPBUFSIZE - 1) + return -EINVAL; + + if (!(tmpbuf = kmalloc(TMPBUFSIZE, GFP_KERNEL))) + return -ENOMEM; + + if (copy_from_user(tmpbuf, buf, count)) { + kfree(tmpbuf); + return -EFAULT; + } + tmpbuf[count] = 0; + + mutex_lock(&adom_mutex); + + startp = tmpbuf; + /* Parse one more than MAX_OPROF_DOMAINS, for easy error checking */ + for (i = 0; i <= MAX_OPROF_DOMAINS; i++) { + val = simple_strtoul(startp, &endp, 0); + if (endp == startp) + break; + while (ispunct(*endp) || isspace(*endp)) + endp++; + active_domains[i] = val; + if (active_domains[i] != val) + /* Overflow, force error below */ + i = MAX_OPROF_DOMAINS + 1; + startp = endp; + } + /* Force error on trailing junk */ + adomains = *startp ? MAX_OPROF_DOMAINS + 1 : i; + + kfree(tmpbuf); + + if (adomains > MAX_OPROF_DOMAINS + || oprofile_set_active(active_domains, adomains)) { + adomains = 0; + retval = -EINVAL; + } + + mutex_unlock(&adom_mutex); + return retval; +} + +static ssize_t adomain_read(struct file * file, char __user * buf, + size_t count, loff_t * offset) +{ + char * tmpbuf; + size_t len; + int i; + ssize_t retval; + + if (!(tmpbuf = kmalloc(TMPBUFSIZE, GFP_KERNEL))) + return -ENOMEM; + + mutex_lock(&adom_mutex); + + len = 0; + for (i = 0; i < adomains; i++) + len += snprintf(tmpbuf + len, + len < TMPBUFSIZE ? TMPBUFSIZE - len : 0, + "%u ", active_domains[i]); + WARN_ON(len > TMPBUFSIZE); + if (len != 0 && len <= TMPBUFSIZE) + tmpbuf[len-1] = '\n'; + + mutex_unlock(&adom_mutex); + + retval = simple_read_from_buffer(buf, count, offset, tmpbuf, len); + + kfree(tmpbuf); + return retval; +} + + +static struct file_operations active_domain_ops = { + .read = adomain_read, + .write = adomain_write, +}; + +static unsigned int pdomains = 0; +static int passive_domains[MAX_OPROF_DOMAINS]; +static DEFINE_MUTEX(pdom_mutex); + +static ssize_t pdomain_write(struct file * file, char const __user * buf, + size_t count, loff_t * offset) +{ + char *tmpbuf; + char *startp, *endp; + int i; + unsigned long val; + ssize_t retval = count; + + if (*offset) + return -EINVAL; + if (count > TMPBUFSIZE - 1) + return -EINVAL; + + if (!(tmpbuf = kmalloc(TMPBUFSIZE, GFP_KERNEL))) + return -ENOMEM; + + if (copy_from_user(tmpbuf, buf, count)) { + kfree(tmpbuf); + return -EFAULT; + } + tmpbuf[count] = 0; + + mutex_lock(&pdom_mutex); + + startp = tmpbuf; + /* Parse one more than MAX_OPROF_DOMAINS, for easy error checking */ + for (i = 0; i <= MAX_OPROF_DOMAINS; i++) { + val = simple_strtoul(startp, &endp, 0); + if (endp == startp) + break; + while (ispunct(*endp) || isspace(*endp)) + endp++; + passive_domains[i] = val; + if (passive_domains[i] != val) + /* Overflow, force error below */ + i = MAX_OPROF_DOMAINS + 1; + startp = endp; + } + /* Force error on trailing junk */ + pdomains = *startp ? 
MAX_OPROF_DOMAINS + 1 : i; + + kfree(tmpbuf); + + if (pdomains > MAX_OPROF_DOMAINS + || oprofile_set_passive(passive_domains, pdomains)) { + pdomains = 0; + retval = -EINVAL; + } + + mutex_unlock(&pdom_mutex); + return retval; +} + +static ssize_t pdomain_read(struct file * file, char __user * buf, + size_t count, loff_t * offset) +{ + char * tmpbuf; + size_t len; + int i; + ssize_t retval; + + if (!(tmpbuf = kmalloc(TMPBUFSIZE, GFP_KERNEL))) + return -ENOMEM; + + mutex_lock(&pdom_mutex); + + len = 0; + for (i = 0; i < pdomains; i++) + len += snprintf(tmpbuf + len, + len < TMPBUFSIZE ? TMPBUFSIZE - len : 0, + "%u ", passive_domains[i]); + WARN_ON(len > TMPBUFSIZE); + if (len != 0 && len <= TMPBUFSIZE) + tmpbuf[len-1] = '\n'; + + mutex_unlock(&pdom_mutex); + + retval = simple_read_from_buffer(buf, count, offset, tmpbuf, len); + + kfree(tmpbuf); + return retval; +} + +static struct file_operations passive_domain_ops = { + .read = pdomain_read, + .write = pdomain_write, +}; + void oprofile_create_files(struct super_block * sb, struct dentry * root) { oprofilefs_create_file(sb, root, "enable", &enable_fops); oprofilefs_create_file_perm(sb, root, "dump", &dump_fops, 0666); + oprofilefs_create_file(sb, root, "active_domains", &active_domain_ops); + oprofilefs_create_file(sb, root, "passive_domains", &passive_domain_ops); oprofilefs_create_file(sb, root, "buffer", &event_buffer_fops); oprofilefs_create_ulong(sb, root, "buffer_size", &fs_buffer_size); oprofilefs_create_ulong(sb, root, "buffer_watershed", &fs_buffer_watershed); --- a/drivers/pci/bus.c +++ b/drivers/pci/bus.c @@ -17,6 +17,8 @@ #include "pci.h" +extern int pci_mem_align; + /** * pci_bus_alloc_resource - allocate a resource from a parent bus * @bus: PCI bus @@ -44,6 +46,11 @@ type_mask |= IORESOURCE_IO | IORESOURCE_MEM; + /* If the boot parameter 'pci-mem-align' was specified then we need to + align the memory addresses to page boundaries. */ + if (pci_mem_align && (align < (PAGE_SIZE-1))) + align = PAGE_SIZE - 1; + for (i = 0; i < PCI_BUS_NUM_RESOURCES; i++) { struct resource *r = bus->resource[i]; if (!r) --- a/drivers/pci/quirks.c +++ b/drivers/pci/quirks.c @@ -24,6 +24,40 @@ #include #include "pci.h" +/* A global flag which signals if we should page-align PCI mem windows. */ +int pci_mem_align = 0; + +static int __init set_pci_mem_align(char *str) +{ + pci_mem_align = 1; + return 1; +} +__setup("pci-mem-align", set_pci_mem_align); + +/* This quirk function enables us to force all memory resources which are + * assigned to PCI devices to be page-aligned. + */ +static void __devinit quirk_align_mem_resources(struct pci_dev *dev) +{ + int i; + struct resource *r; + resource_size_t old_start; + + if (!pci_mem_align) + return; + + for (i=0; i < DEVICE_COUNT_RESOURCE; i++) { + r = &dev->resource[i]; + if ((r == NULL) || !(r->flags & IORESOURCE_MEM)) + continue; + + old_start = r->start; + r->start = (r->start + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1); + r->end = r->end - (old_start - r->start); + } +} +DECLARE_PCI_FIXUP_HEADER(PCI_ANY_ID, PCI_ANY_ID, quirk_align_mem_resources); + /* The Mellanox Tavor device gives false positive parity errors * Mark this device with a broken_parity_status, to allow * PCI scanning code to "skip" this now blacklisted device.
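For reference, the window rounding done by quirk_align_mem_resources() above is easy to sanity-check outside the kernel. The userspace sketch below is not part of the patch; the BAR values are hypothetical and PAGE_SIZE is assumed to be 4096. It shows that rounding the start up to a page boundary and shifting the end by the same amount leaves the window size unchanged.

/* build: cc -o align align.c */
#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>

#define PAGE_SIZE 4096ULL

int main(void)
{
	uint64_t start = 0xfebff400ULL, end = 0xfebff7ffULL; /* hypothetical BAR */
	uint64_t old_start = start;

	/* round the start up to the next page boundary */
	start = (start + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1);
	/* equivalent to the patch's r->end - (old_start - r->start) under
	   unsigned wrap-around; the window keeps its length */
	end += start - old_start;

	printf("aligned window: 0x%" PRIx64 "-0x%" PRIx64 " (%" PRIu64 " bytes)\n",
	       start, end, end - start + 1);
	return 0;
}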
--- a/fs/aio.c +++ b/fs/aio.c @@ -36,6 +36,11 @@ #include #include +#ifdef CONFIG_EPOLL +#include +#include +#endif + #if DEBUG > 1 #define dprintk printk #else @@ -1008,6 +1013,11 @@ if (waitqueue_active(&ctx->wait)) wake_up(&ctx->wait); +#ifdef CONFIG_EPOLL + if (ctx->file && waitqueue_active(&ctx->poll_wait)) + wake_up(&ctx->poll_wait); +#endif + spin_unlock_irqrestore(&ctx->ctx_lock, flags); return ret; } @@ -1015,6 +1025,8 @@ /* aio_read_evt * Pull an event off of the ioctx's event ring. Returns the number of * events fetched (0 or 1 ;-) + * If the ent parameter is 0, just returns the number of events that would + * be fetched. * FIXME: make this use cmpxchg. * TODO: make the ringbuffer user mmap()able (requires FIXME). */ @@ -1037,13 +1049,18 @@ head = ring->head % info->nr; if (head != ring->tail) { - struct io_event *evp = aio_ring_event(info, head, KM_USER1); - *ent = *evp; - head = (head + 1) % info->nr; - smp_mb(); /* finish reading the event before updatng the head */ - ring->head = head; - ret = 1; - put_aio_ring_event(evp, KM_USER1); + if (ent) { /* event requested */ + struct io_event *evp = + aio_ring_event(info, head, KM_USER1); + *ent = *evp; + head = (head + 1) % info->nr; + /* finish reading the event before updating the head */ + smp_mb(); + ring->head = head; + ret = 1; + put_aio_ring_event(evp, KM_USER1); + } else /* only need to know availability */ + ret = 1; } spin_unlock(&info->ring_lock); @@ -1234,6 +1251,13 @@ aio_cancel_all(ioctx); wait_for_all_aios(ioctx); +#ifdef CONFIG_EPOLL + /* forget the poll file, but it's up to the user to close it */ + if (ioctx->file) { + ioctx->file->private_data = 0; + ioctx->file = 0; + } +#endif /* * Wake up any waiters. The setting of ctx->dead must be seen @@ -1244,6 +1268,68 @@ put_ioctx(ioctx); /* once for the lookup */ } +#ifdef CONFIG_EPOLL + +static int aio_queue_fd_close(struct inode *inode, struct file *file) +{ + struct kioctx *ioctx = file->private_data; + if (ioctx) { + file->private_data = 0; + spin_lock_irq(&ioctx->ctx_lock); + ioctx->file = 0; + spin_unlock_irq(&ioctx->ctx_lock); + } + return 0; +} + +static unsigned int aio_queue_fd_poll(struct file *file, poll_table *wait) +{ unsigned int pollflags = 0; + struct kioctx *ioctx = file->private_data; + + if (ioctx) { + + spin_lock_irq(&ioctx->ctx_lock); + /* Insert inside our poll wait queue */ + poll_wait(file, &ioctx->poll_wait, wait); + + /* Check our condition */ + if (aio_read_evt(ioctx, 0)) + pollflags = POLLIN | POLLRDNORM; + spin_unlock_irq(&ioctx->ctx_lock); + } + + return pollflags; +} + +static const struct file_operations aioq_fops = { + .release = aio_queue_fd_close, + .poll = aio_queue_fd_poll +}; + +/* make_aio_fd: + * Create a file descriptor that can be used to poll the event queue. + * Based and piggybacked on the excellent epoll code. + */ + +static int make_aio_fd(struct kioctx *ioctx) +{ + int error, fd; + struct inode *inode; + struct file *file; + + error = ep_getfd(&fd, &inode, &file, NULL, &aioq_fops); + if (error) + return error; + + /* associate the file with the IO context */ + file->private_data = ioctx; + ioctx->file = file; + init_waitqueue_head(&ioctx->poll_wait); + return fd; +} +#endif + + /* sys_io_setup: * Create an aio_context capable of receiving at least nr_events. * ctxp must not point to an aio_context that already exists, and @@ -1256,18 +1342,30 @@ * resources are available. May fail with -EFAULT if an invalid * pointer is passed for ctxp. Will fail with -ENOSYS if not * implemented.
+ * + * To request a selectable fd, the user context has to be initialized + * to 1, instead of 0, and the return value is the fd. + * This keeps the system call compatible, since a non-zero value + * was not allowed so far. */ asmlinkage long sys_io_setup(unsigned nr_events, aio_context_t __user *ctxp) { struct kioctx *ioctx = NULL; unsigned long ctx; long ret; + int make_fd = 0; ret = get_user(ctx, ctxp); if (unlikely(ret)) goto out; ret = -EINVAL; +#ifdef CONFIG_EPOLL + if (ctx == 1) { + make_fd = 1; + ctx = 0; + } +#endif if (unlikely(ctx || nr_events == 0)) { pr_debug("EINVAL: io_setup: ctx %lu nr_events %u\n", ctx, nr_events); @@ -1278,8 +1376,12 @@ ret = PTR_ERR(ioctx); if (!IS_ERR(ioctx)) { ret = put_user(ioctx->user_id, ctxp); - if (!ret) - return 0; +#ifdef CONFIG_EPOLL + if (make_fd && ret >= 0) + ret = make_aio_fd(ioctx); +#endif + if (ret >= 0) + return ret; get_ioctx(ioctx); /* io_destroy() expects us to hold a ref */ io_destroy(ioctx); --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c @@ -114,6 +114,13 @@ #include #endif +#ifdef CONFIG_XEN +#include +#include +#include +#include +#endif + static int do_ioctl32_pointer(unsigned int fd, unsigned int cmd, unsigned long arg, struct file *f) { @@ -2834,6 +2841,18 @@ IGNORE_IOCTL(FBIOSCURSOR32) IGNORE_IOCTL(FBIOGCURSOR32) #endif + +#ifdef CONFIG_XEN +HANDLE_IOCTL(IOCTL_PRIVCMD_MMAP_32, privcmd_ioctl_32) +HANDLE_IOCTL(IOCTL_PRIVCMD_MMAPBATCH_32, privcmd_ioctl_32) +COMPATIBLE_IOCTL(IOCTL_PRIVCMD_HYPERCALL) +COMPATIBLE_IOCTL(IOCTL_EVTCHN_BIND_VIRQ) +COMPATIBLE_IOCTL(IOCTL_EVTCHN_BIND_INTERDOMAIN) +COMPATIBLE_IOCTL(IOCTL_EVTCHN_BIND_UNBOUND_PORT) +COMPATIBLE_IOCTL(IOCTL_EVTCHN_UNBIND) +COMPATIBLE_IOCTL(IOCTL_EVTCHN_NOTIFY) +COMPATIBLE_IOCTL(IOCTL_EVTCHN_RESET) +#endif }; #define IOCTL_HASHSIZE 256 --- a/fs/splice.c +++ b/fs/splice.c @@ -1218,6 +1218,9 @@ if (!access_ok(VERIFY_READ, base, len)) break; + if (unlikely(!access_ok(VERIFY_READ, base, len))) + break; + /* * Get this base offset and number of pages, then map * in the user pages. 
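The sys_io_setup() comment above defines the extended user contract: initialize the context word to 1 and the call returns a pollable fd. A minimal user-space sketch, assuming a kernel carrying this patch with CONFIG_EPOLL enabled (on an unpatched kernel the same call fails with EINVAL because *ctxp is non-zero):

#include <poll.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/aio_abi.h>

int main(void)
{
	aio_context_t ctx = 1;	/* 1 = "give me a pollable fd" (patched kernels only) */
	long fd = syscall(__NR_io_setup, 128, &ctx);

	if (fd < 0) {
		perror("io_setup");	/* EINVAL on unpatched kernels */
		return 1;
	}
	/* ctx now holds the real context id; submit iocbs with
	 * syscall(__NR_io_submit, ctx, ...) as usual. */
	struct pollfd pfd = { .fd = (int)fd, .events = POLLIN };
	poll(&pfd, 1, -1);	/* wakes up once completions are pending */
	/* ... then reap them with syscall(__NR_io_getevents, ctx, ...)
	 * without blocking, and close(fd) when done. */
	return 0;
}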
--- a/include/asm-generic/pci.h +++ b/include/asm-generic/pci.h @@ -43,7 +43,9 @@ return root; } +#ifndef pcibios_scan_all_fns #define pcibios_scan_all_fns(a, b) 0 +#endif #ifndef HAVE_ARCH_PCI_GET_LEGACY_IDE_IRQ static inline int pci_get_legacy_ide_irq(struct pci_dev *dev, int channel) --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h @@ -99,6 +99,10 @@ } #endif +#ifndef arch_change_pte_range +#define arch_change_pte_range(mm, pmd, addr, end, newprot) 0 +#endif + #ifndef __HAVE_ARCH_PTE_SAME #define pte_same(A,B) (pte_val(A) == pte_val(B)) #endif --- a/include/linux/aio.h +++ b/include/linux/aio.h @@ -200,6 +200,11 @@ struct aio_ring_info ring_info; struct delayed_work wq; +#ifdef CONFIG_EPOLL + /* poll integration */ + wait_queue_head_t poll_wait; + struct file *file; +#endif }; /* prototypes */ --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -194,6 +194,12 @@ } #endif /* CONFIG_GENERIC_HARDIRQS */ +#ifdef CONFIG_HAVE_IRQ_IGNORE_UNHANDLED +int irq_ignore_unhandled(unsigned int irq); +#else +#define irq_ignore_unhandled(irq) 0 +#endif + #ifndef __ARCH_SET_SOFTIRQ_PENDING #define set_softirq_pending(x) (local_softirq_pending() = (x)) #define or_softirq_pending(x) (local_softirq_pending() |= (x)) --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -46,6 +46,13 @@ KEXEC_CORE_NOTE_NAME_BYTES + \ KEXEC_CORE_NOTE_DESC_BYTES ) +#ifndef KEXEC_ARCH_HAS_PAGE_MACROS +#define kexec_page_to_pfn(page) page_to_pfn(page) +#define kexec_pfn_to_page(pfn) pfn_to_page(pfn) +#define kexec_virt_to_phys(addr) virt_to_phys(addr) +#define kexec_phys_to_virt(addr) phys_to_virt(addr) +#endif + /* * This structure is used to hold the arguments that are used when loading * kernel binaries. @@ -106,6 +113,12 @@ extern NORET_TYPE void machine_kexec(struct kimage *image) ATTRIB_NORET; extern int machine_kexec_prepare(struct kimage *image); extern void machine_kexec_cleanup(struct kimage *image); +#ifdef CONFIG_XEN +extern int xen_machine_kexec_load(struct kimage *image); +extern void xen_machine_kexec_unload(struct kimage *image); +extern void xen_machine_kexec_setup_resources(void); +extern void xen_machine_kexec_register_resources(struct resource *res); +#endif extern asmlinkage long sys_kexec_load(unsigned long entry, unsigned long nr_segments, struct kexec_segment __user *segments, --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -100,6 +100,9 @@ #define VM_DONTEXPAND 0x00040000 /* Cannot expand with mremap() */ #define VM_RESERVED 0x00080000 /* Count as reserved_vm like IO */ #define VM_ACCOUNT 0x00100000 /* Is a VM accounted object */ +#ifdef CONFIG_XEN +#define VM_FOREIGN 0x00200000 /* Has pages belonging to another VM */ +#endif #define VM_HUGETLB 0x00400000 /* Huge TLB Page VM */ #define VM_NONLINEAR 0x00800000 /* Is non-linear (remap_file_pages) */ #define VM_MAPPED_COPY 0x01000000 /* T if mapped copy of data (nommu mmap) */ @@ -172,6 +175,10 @@ /* notification that a previously read-only page is about to become * writable, if an error is returned it will cause a SIGBUS */ int (*page_mkwrite)(struct vm_area_struct *vma, struct page *page); + /* Area-specific function for clearing the PTE at @ptep. Returns the + * original value of @ptep.
*/ + pte_t (*zap_pte)(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep, int is_fullmm); #ifdef CONFIG_NUMA int (*set_policy)(struct vm_area_struct *vma, struct mempolicy *new); struct mempolicy *(*get_policy)(struct vm_area_struct *vma, --- a/include/linux/oprofile.h +++ b/include/linux/oprofile.h @@ -16,6 +16,8 @@ #include #include #include + +#include /* Each escaped entry is prefixed by ESCAPE_CODE * then one of the following codes, then the @@ -28,7 +30,7 @@ #define CPU_SWITCH_CODE 2 #define COOKIE_SWITCH_CODE 3 #define KERNEL_ENTER_SWITCH_CODE 4 -#define KERNEL_EXIT_SWITCH_CODE 5 +#define USER_ENTER_SWITCH_CODE 5 #define MODULE_LOADED_CODE 6 #define CTX_TGID_CODE 7 #define TRACE_BEGIN_CODE 8 @@ -36,6 +38,7 @@ #define XEN_ENTER_SWITCH_CODE 10 #define SPU_PROFILING_CODE 11 #define SPU_CTX_SWITCH_CODE 12 +#define DOMAIN_SWITCH_CODE 13 struct super_block; struct dentry; @@ -47,6 +50,11 @@ /* create any necessary configuration files in the oprofile fs. * Optional. */ int (*create_files)(struct super_block * sb, struct dentry * root); + /* setup active domains with Xen */ + int (*set_active)(int *active_domains, unsigned int adomains); + /* setup passive domains with Xen */ + int (*set_passive)(int *passive_domains, unsigned int pdomains); + /* Do any necessary interrupt setup. Optional. */ int (*setup)(void); /* Do any necessary interrupt shutdown. Optional. */ @@ -113,6 +121,8 @@ /* add a backtrace entry, to be called from the ->backtrace callback */ void oprofile_add_trace(unsigned long eip); +/* add a domain switch entry */ +int oprofile_add_domain_switch(int32_t domain_id); /** * Create a file of the given name as a child of the given root, with --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -97,6 +97,8 @@ #define PG_checked PG_owner_priv_1 /* Used by some filesystems */ #define PG_pinned PG_owner_priv_1 /* Xen pinned pagetable */ +#define PG_foreign 20 /* Page is owned by foreign allocator. 
*/ + #if (BITS_PER_LONG > 32) /* * 64-bit-only flags build down from bit 31 @@ -296,6 +298,19 @@ #define SetPageUncached(page) set_bit(PG_uncached, &(page)->flags) #define ClearPageUncached(page) clear_bit(PG_uncached, &(page)->flags) +#define PageForeign(page) test_bit(PG_foreign, &(page)->flags) +#define SetPageForeign(_page, dtor) do { \ + set_bit(PG_foreign, &(_page)->flags); \ + BUG_ON((dtor) == (void (*)(struct page *))0); \ + (_page)->index = (long)(dtor); \ +} while (0) +#define ClearPageForeign(page) do { \ + clear_bit(PG_foreign, &(page)->flags); \ + (page)->index = 0; \ +} while (0) +#define PageForeignDestructor(_page) \ + ((void (*)(struct page *))(_page)->index)(_page) + struct page; /* forward declaration */ extern void cancel_dirty_page(struct page *page, unsigned int account_size); --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -290,6 +290,7 @@ extern void sched_show_task(struct task_struct *p); #ifdef CONFIG_DETECT_SOFTLOCKUP +extern unsigned long softlockup_get_next_event(void); extern void softlockup_tick(void); extern void spawn_softlockup_task(void); extern void touch_softlockup_watchdog(void); @@ -299,6 +300,10 @@ extern unsigned long sysctl_hung_task_timeout_secs; extern unsigned long sysctl_hung_task_warnings; #else +static inline unsigned long softlockup_get_next_event(void) +{ + return MAX_JIFFY_OFFSET; +} static inline void softlockup_tick(void) { } --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -217,6 +217,8 @@ * @local_df: allow local fragmentation * @cloned: Head may be cloned (check refcnt to be sure) * @nohdr: Payload reference only, must not modify header + * @proto_data_valid: Protocol data validated since arriving at localhost + * @proto_csum_blank: Protocol csum must be added before leaving localhost * @pkt_type: Packet class * @fclone: skbuff clone status * @ip_summed: Driver fed us an IP checksum @@ -310,7 +312,13 @@ __u16 tc_verd; /* traffic control verdict */ #endif #endif +#ifndef CONFIG_XEN /* 2 byte hole */ +#else + __u8 proto_data_valid:1, + proto_csum_blank:1; + /* 1 byte hole */ +#endif #ifdef CONFIG_NET_DMA dma_cookie_t dma_cookie; --- a/include/linux/vermagic.h +++ b/include/linux/vermagic.h @@ -17,6 +17,11 @@ #else #define MODULE_VERMAGIC_MODULE_UNLOAD "" #endif +#ifdef CONFIG_XEN +#define MODULE_VERMAGIC_XEN "Xen " +#else +#define MODULE_VERMAGIC_XEN +#endif #ifndef MODULE_ARCH_VERMAGIC #define MODULE_ARCH_VERMAGIC "" #endif @@ -24,5 +29,6 @@ #define VERMAGIC_STRING \ UTS_RELEASE " " \ MODULE_VERMAGIC_SMP MODULE_VERMAGIC_PREEMPT \ - MODULE_VERMAGIC_MODULE_UNLOAD MODULE_ARCH_VERMAGIC + MODULE_VERMAGIC_MODULE_UNLOAD MODULE_VERMAGIC_XEN \ + MODULE_ARCH_VERMAGIC --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -182,7 +182,7 @@ */ if (time_after(jiffies, desc->last_unhandled + HZ/10)) desc->irqs_unhandled = 1; - else + else if (!irq_ignore_unhandled(irq)) desc->irqs_unhandled++; desc->last_unhandled = jiffies; if (unlikely(action_ret != IRQ_NONE)) --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -340,13 +340,26 @@ return 0; } -static struct page *kimage_alloc_pages(gfp_t gfp_mask, unsigned int order) +static struct page *kimage_alloc_pages(gfp_t gfp_mask, unsigned int order, unsigned long limit) { struct page *pages; pages = alloc_pages(gfp_mask, order); if (pages) { unsigned int count, i; +#ifdef CONFIG_XEN + int address_bits; + + if (limit == ~0UL) + address_bits = BITS_PER_LONG; + else + address_bits = long_log2(limit); + + if (xen_limit_pages_to_max_mfn(pages, order, address_bits) < 0) { + __free_pages(pages, 
order); + return NULL; + } +#endif pages->mapping = NULL; set_page_private(pages, order); count = 1 << order; @@ -365,6 +378,9 @@ count = 1 << order; for (i = 0; i < count; i++) ClearPageReserved(page + i); +#ifdef CONFIG_XEN + xen_destroy_contiguous_region((unsigned long)page_address(page), order); +#endif __free_pages(page, order); } @@ -410,10 +426,10 @@ do { unsigned long pfn, epfn, addr, eaddr; - pages = kimage_alloc_pages(GFP_KERNEL, order); + pages = kimage_alloc_pages(GFP_KERNEL, order, KEXEC_CONTROL_MEMORY_LIMIT); if (!pages) break; - pfn = page_to_pfn(pages); + pfn = kexec_page_to_pfn(pages); epfn = pfn + count; addr = pfn << PAGE_SHIFT; eaddr = epfn << PAGE_SHIFT; @@ -447,6 +463,7 @@ return pages; } +#ifndef CONFIG_XEN static struct page *kimage_alloc_crash_control_pages(struct kimage *image, unsigned int order) { @@ -500,7 +517,7 @@ } /* If I don't overlap any segments I have found my hole! */ if (i == image->nr_segments) { - pages = pfn_to_page(hole_start >> PAGE_SHIFT); + pages = kexec_pfn_to_page(hole_start >> PAGE_SHIFT); break; } } @@ -527,6 +544,13 @@ return pages; } +#else /* !CONFIG_XEN */ +struct page *kimage_alloc_control_pages(struct kimage *image, + unsigned int order) +{ + return kimage_alloc_normal_control_pages(image, order); +} +#endif static int kimage_add_entry(struct kimage *image, kimage_entry_t entry) { @@ -542,7 +566,7 @@ return -ENOMEM; ind_page = page_address(page); - *image->entry = virt_to_phys(ind_page) | IND_INDIRECTION; + *image->entry = kexec_virt_to_phys(ind_page) | IND_INDIRECTION; image->entry = ind_page; image->last_entry = ind_page + ((PAGE_SIZE/sizeof(kimage_entry_t)) - 1); @@ -603,13 +627,13 @@ #define for_each_kimage_entry(image, ptr, entry) \ for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE); \ ptr = (entry & IND_INDIRECTION)? \ - phys_to_virt((entry & PAGE_MASK)): ptr +1) + kexec_phys_to_virt((entry & PAGE_MASK)): ptr +1) static void kimage_free_entry(kimage_entry_t entry) { struct page *page; - page = pfn_to_page(entry >> PAGE_SHIFT); + page = kexec_pfn_to_page(entry >> PAGE_SHIFT); kimage_free_pages(page); } @@ -621,6 +645,10 @@ if (!image) return; +#ifdef CONFIG_XEN + xen_machine_kexec_unload(image); +#endif + kimage_free_extra_pages(image); for_each_kimage_entry(image, ptr, entry) { if (entry & IND_INDIRECTION) { @@ -696,7 +724,7 @@ * have a match. 
*/ list_for_each_entry(page, &image->dest_pages, lru) { - addr = page_to_pfn(page) << PAGE_SHIFT; + addr = kexec_page_to_pfn(page) << PAGE_SHIFT; if (addr == destination) { list_del(&page->lru); return page; @@ -707,16 +735,16 @@ kimage_entry_t *old; /* Allocate a page, if we run out of memory give up */ - page = kimage_alloc_pages(gfp_mask, 0); + page = kimage_alloc_pages(gfp_mask, 0, KEXEC_SOURCE_MEMORY_LIMIT); if (!page) return NULL; /* If the page cannot be used file it away */ - if (page_to_pfn(page) > + if (kexec_page_to_pfn(page) > (KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) { list_add(&page->lru, &image->unuseable_pages); continue; } - addr = page_to_pfn(page) << PAGE_SHIFT; + addr = kexec_page_to_pfn(page) << PAGE_SHIFT; /* If it is the destination page we want use it */ if (addr == destination) @@ -739,7 +767,7 @@ struct page *old_page; old_addr = *old & PAGE_MASK; - old_page = pfn_to_page(old_addr >> PAGE_SHIFT); + old_page = kexec_pfn_to_page(old_addr >> PAGE_SHIFT); copy_highpage(page, old_page); *old = addr | (*old & ~PAGE_MASK); @@ -789,7 +817,7 @@ result = -ENOMEM; goto out; } - result = kimage_add_page(image, page_to_pfn(page) + result = kimage_add_page(image, kexec_page_to_pfn(page) << PAGE_SHIFT); if (result < 0) goto out; @@ -821,6 +849,7 @@ return result; } +#ifndef CONFIG_XEN static int kimage_load_crash_segment(struct kimage *image, struct kexec_segment *segment) { @@ -843,7 +872,7 @@ char *ptr; size_t uchunk, mchunk; - page = pfn_to_page(maddr >> PAGE_SHIFT); + page = kexec_pfn_to_page(maddr >> PAGE_SHIFT); if (!page) { result = -ENOMEM; goto out; @@ -892,6 +921,13 @@ return result; } +#else /* CONFIG_XEN */ +static int kimage_load_segment(struct kimage *image, + struct kexec_segment *segment) +{ + return kimage_load_normal_segment(image, segment); +} +#endif /* * Exec Kernel system call: for obvious reasons only root may call it. @@ -1002,6 +1038,13 @@ if (result) goto out; } +#ifdef CONFIG_XEN + if (image) { + result = xen_machine_kexec_load(image); + if (result) + goto out; + } +#endif /* Install the new kernel, and Uninstall the old */ image = xchg(dest_image, image); --- a/kernel/softlockup.c +++ b/kernel/softlockup.c @@ -39,6 +39,19 @@ .notifier_call = softlock_panic, }; +unsigned long softlockup_get_next_event(void) +{ + int this_cpu = smp_processor_id(); + unsigned long touch_timestamp = per_cpu(touch_timestamp, this_cpu); + + if (per_cpu(print_timestamp, this_cpu) == touch_timestamp || + did_panic || + !per_cpu(watchdog_task, this_cpu)) + return MAX_JIFFY_OFFSET; + + return max_t(long, 0, touch_timestamp + HZ - jiffies); +} + /* * Returns seconds, approximately. 
We don't need nanosecond * resolution, and we don't need to waste time with a big divide when --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -742,7 +742,7 @@ .proc_handler = &proc_dointvec, }, #endif -#if defined(CONFIG_ACPI_SLEEP) && defined(CONFIG_X86) +#if defined(CONFIG_ACPI_SLEEP) && defined(CONFIG_X86) && !defined(CONFIG_ACPI_PV_SLEEP) { .procname = "acpi_video_flags", .data = &acpi_realmode_flags, --- a/kernel/timer.c +++ b/kernel/timer.c @@ -802,7 +802,7 @@ unsigned long get_next_timer_interrupt(unsigned long now) { struct tvec_base *base = __get_cpu_var(tvec_bases); - unsigned long expires; + unsigned long expires, sl_next; spin_lock(&base->lock); expires = __next_timer_interrupt(base); @@ -811,7 +811,11 @@ if (time_before_eq(expires, now)) return now; - return cmp_next_hrtimer_event(now, expires); + expires = cmp_next_hrtimer_event(now, expires); + sl_next = softlockup_get_next_event(); + + return expires <= now || expires - now < sl_next + ? expires : now + sl_next; } #ifdef CONFIG_NO_IDLE_HZ --- a/mm/memory.c +++ b/mm/memory.c @@ -402,6 +402,12 @@ return NULL; } +#if defined(CONFIG_XEN) && defined(CONFIG_X86) + /* XEN: Covers user-space grant mappings (even of local pages). */ + if (unlikely(vma->vm_flags & VM_FOREIGN)) + return NULL; +#endif + #ifdef CONFIG_DEBUG_VM /* * Add some anal sanity checks for now. Eventually, @@ -410,7 +416,8 @@ * and that the resulting page looks ok. */ if (unlikely(!pfn_valid(pfn))) { - print_bad_pte(vma, pte, addr); + if (!(vma->vm_flags & VM_RESERVED)) + print_bad_pte(vma, pte, addr); return NULL; } #endif @@ -668,8 +675,12 @@ page->index > details->last_index)) continue; } - ptent = ptep_get_and_clear_full(mm, addr, pte, - tlb->fullmm); + if (unlikely(vma->vm_ops && vma->vm_ops->zap_pte)) + ptent = vma->vm_ops->zap_pte(vma, addr, pte, + tlb->fullmm); + else + ptent = ptep_get_and_clear_full(mm, addr, pte, + tlb->fullmm); tlb_remove_tlb_entry(tlb, pte, addr); if (unlikely(!page)) continue; @@ -902,6 +913,7 @@ tlb_finish_mmu(tlb, address, end); return end; } +EXPORT_SYMBOL(zap_page_range); /* * Do a quick page-table lookup for a single page. @@ -1043,6 +1055,26 @@ continue; } +#ifdef CONFIG_XEN + if (vma && (vma->vm_flags & VM_FOREIGN)) { + struct page **map = vma->vm_private_data; + int offset = (start - vma->vm_start) >> PAGE_SHIFT; + if (map[offset] != NULL) { + if (pages) { + struct page *page = map[offset]; + + pages[i] = page; + get_page(page); + } + if (vmas) + vmas[i] = vma; + i++; + start += PAGE_SIZE; + len--; + continue; + } + } +#endif if (!vma || (vma->vm_flags & (VM_IO | VM_PFNMAP)) || !(vm_flags & vma->vm_flags)) return i ? 
: -EFAULT; --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -86,6 +86,8 @@ next = pmd_addr_end(addr, end); if (pmd_none_or_clear_bad(pmd)) continue; + if (arch_change_pte_range(mm, pmd, addr, next, newprot)) + continue; change_pte_range(mm, pmd, addr, next, newprot, dirty_accountable); } while (pmd++, addr = next, addr != end); } --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -245,7 +245,11 @@ 1 << PG_slab | 1 << PG_swapcache | 1 << PG_writeback | - 1 << PG_buddy ); + 1 << PG_buddy | +#ifdef CONFIG_X86_XEN + 1 << PG_pinned | +#endif + 1 << PG_foreign ); set_page_count(page, 0); reset_page_mapcount(page); page->mapping = NULL; @@ -471,7 +475,11 @@ 1 << PG_swapcache | 1 << PG_writeback | 1 << PG_reserved | - 1 << PG_buddy )))) + 1 << PG_buddy | +#ifdef CONFIG_X86_XEN + 1 << PG_pinned | +#endif + 1 << PG_foreign )))) bad_page(page); if (PageDirty(page)) __ClearPageDirty(page); @@ -527,6 +535,12 @@ int i; int reserved = 0; +#ifdef CONFIG_XEN + if (PageForeign(page)) { + PageForeignDestructor(page); + return; + } +#endif for (i = 0 ; i < (1 << order) ; ++i) reserved += free_pages_check(page + i); if (reserved) @@ -622,7 +636,11 @@ 1 << PG_swapcache | 1 << PG_writeback | 1 << PG_reserved | - 1 << PG_buddy )))) + 1 << PG_buddy | +#ifdef CONFIG_X86_XEN + 1 << PG_pinned | +#endif + 1 << PG_foreign )))) bad_page(page); /* @@ -990,6 +1008,12 @@ struct per_cpu_pages *pcp; unsigned long flags; +#ifdef CONFIG_XEN + if (PageForeign(page)) { + PageForeignDestructor(page); + return; + } +#endif if (PageAnon(page)) page->mapping = NULL; if (free_pages_check(page)) --- a/net/core/dev.c +++ b/net/core/dev.c @@ -122,6 +122,12 @@ #include "net-sysfs.h" +#ifdef CONFIG_XEN +#include <net/ip.h> +#include <linux/tcp.h> +#include <linux/udp.h> +#endif + /* * The list of packet types we will receive (as opposed to discard) * and the routines to invoke. @@ -1580,6 +1586,42 @@ return 0; } +#ifdef CONFIG_XEN +inline int skb_checksum_setup(struct sk_buff *skb) +{ + if (skb->proto_csum_blank) { + if (skb->protocol != htons(ETH_P_IP)) + goto out; + skb->h.raw = (unsigned char *)skb->nh.iph + 4 * skb->nh.iph->ihl; + if (skb->h.raw >= skb->tail) + goto out; + switch (skb->nh.iph->protocol) { + case IPPROTO_TCP: + skb->csum = offsetof(struct tcphdr, check); + break; + case IPPROTO_UDP: + skb->csum = offsetof(struct udphdr, check); + break; + default: + if (net_ratelimit()) + printk(KERN_ERR "Attempting to checksum a non-" + "TCP/UDP packet, dropping a protocol" + " %d packet\n", skb->nh.iph->protocol); + goto out; + } + if ((skb->h.raw + skb->csum + 2) > skb->tail) + goto out; + skb->ip_summed = CHECKSUM_HW; + skb->proto_csum_blank = 0; + } + return 0; +out: + return -EPROTO; +} +#else +inline int skb_checksum_setup(struct sk_buff *skb) { return 0; } +#endif + /** * dev_queue_xmit - transmit a buffer * @skb: buffer to transmit @@ -1612,6 +1654,12 @@ struct Qdisc *q; int rc = -ENOMEM; + /* If a checksum-deferred packet is forwarded to a device that needs a + * checksum, correct the pointers and force checksumming. + */ + if (skb_checksum_setup(skb)) + goto out_kfree_skb; + /* GSO will handle the following emulations directly. */ if (netif_needs_gso(dev, skb)) goto gso; @@ -2062,6 +2110,19 @@ } #endif +#ifdef CONFIG_XEN + switch (skb->ip_summed) { + case CHECKSUM_UNNECESSARY: + skb->proto_data_valid = 1; + break; + case CHECKSUM_HW: + /* XXX Implement me.
*/ + default: + skb->proto_data_valid = 0; + break; + } +#endif + list_for_each_entry_rcu(ptype, &ptype_all, list) { if (!ptype->dev || ptype->dev == skb->dev) { if (pt_prev) @@ -4587,6 +4648,7 @@ EXPORT_SYMBOL(net_enable_timestamp); EXPORT_SYMBOL(net_disable_timestamp); EXPORT_SYMBOL(dev_get_flags); +EXPORT_SYMBOL(skb_checksum_setup); #if defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE) EXPORT_SYMBOL(br_handle_frame_hook); --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -454,6 +454,10 @@ n->hdr_len = skb->nohdr ? skb_headroom(skb) : skb->hdr_len; n->cloned = 1; n->nohdr = 0; +#ifdef CONFIG_XEN + C(proto_data_valid); + C(proto_csum_blank); +#endif n->destructor = NULL; C(iif); C(tail); --- a/net/ipv4/netfilter/nf_nat_proto_tcp.c +++ b/net/ipv4/netfilter/nf_nat_proto_tcp.c @@ -132,6 +132,9 @@ if (hdrsize < sizeof(*hdr)) return 1; + if (skb_checksum_setup(skb)) + return 0; + inet_proto_csum_replace4(&hdr->check, skb, oldip, newip, 1); inet_proto_csum_replace2(&hdr->check, skb, oldport, newport, 0); return 1; --- a/net/ipv4/netfilter/nf_nat_proto_udp.c +++ b/net/ipv4/netfilter/nf_nat_proto_udp.c @@ -116,6 +116,10 @@ newport = tuple->dst.u.udp.port; portptr = &hdr->dest; } + + if (skb_checksum_setup(skb)) + return 0; + if (hdr->check || skb->ip_summed == CHECKSUM_PARTIAL) { inet_proto_csum_replace4(&hdr->check, skb, oldip, newip, 1); inet_proto_csum_replace2(&hdr->check, skb, *portptr, newport, --- a/net/ipv4/xfrm4_output.c +++ b/net/ipv4/xfrm4_output.c @@ -81,7 +81,7 @@ #endif skb->protocol = htons(ETH_P_IP); - return xfrm_output(skb); + return skb_checksum_setup(skb) ?: xfrm_output(skb); } int xfrm4_output(struct sk_buff *skb) --- a/scripts/Makefile.build +++ b/scripts/Makefile.build @@ -73,6 +73,20 @@ $(warning kbuild: Makefile.build is included improperly) endif +ifeq ($(CONFIG_XEN),y) +$(objtree)/scripts/Makefile.xen: $(srctree)/scripts/Makefile.xen.awk $(srctree)/scripts/Makefile.build + @echo ' Updating $@' + $(if $(shell echo a | $(AWK) '{ print gensub(/a/, "AA", "g"); }'),\ + ,$(error 'Your awk program does not define gensub. Use gawk or another awk with gensub')) + @$(AWK) -f $< $(filter-out $<,$^) >$@ + +xen-src-single-used-m := $(patsubst $(srctree)/%,%,$(wildcard $(addprefix $(srctree)/,$(single-used-m:.o=-xen.c)))) +xen-single-used-m := $(xen-src-single-used-m:-xen.c=.o) +single-used-m := $(filter-out $(xen-single-used-m),$(single-used-m)) + +-include $(objtree)/scripts/Makefile.xen +endif + # =========================================================================== ifneq ($(strip $(lib-y) $(lib-m) $(lib-n) $(lib-)),) --- a/scripts/Makefile.lib +++ b/scripts/Makefile.lib @@ -17,6 +17,12 @@ lib-y := $(filter-out $(obj-y), $(sort $(lib-y) $(lib-m))) +# Remove objects forcibly disabled + +obj-y := $(filter-out $(disabled-obj-y),$(obj-y)) +obj-m := $(filter-out $(disabled-obj-y),$(obj-m)) +lib-y := $(filter-out $(disabled-obj-y),$(lib-y)) + # Handle objects in subdirs # ---------------------------------------------------------------------------
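Taken together, the Makefile.build hunk pulls in awk-generated Makefile.xen rules that redirect single-used modules to their foo-xen.c sources where those exist, and the Makefile.lib hunk lets any kbuild Makefile drop natively built objects via disabled-obj-y. A sketch of how a subdirectory Makefile might use the latter; the object names are hypothetical:

# sketch of a subdirectory Makefile using the hooks above
obj-y += process.o time.o

ifeq ($(CONFIG_XEN),y)
# time.o has a Xen-aware replacement; knock out the native object
disabled-obj-y := time.o
obj-y += time-xen.o
endif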