summaryrefslogtreecommitdiff
path: root/drivers/xen
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/xen')
-rw-r--r--drivers/xen/Kconfig19
-rw-r--r--drivers/xen/Makefile4
-rw-r--r--drivers/xen/balloon.c712
-rw-r--r--drivers/xen/events.c674
-rw-r--r--drivers/xen/features.c29
-rw-r--r--drivers/xen/grant-table.c37
-rw-r--r--drivers/xen/xenbus/xenbus_client.c6
-rw-r--r--drivers/xen/xenbus/xenbus_probe.c32
-rw-r--r--drivers/xen/xencomm.c232
9 files changed, 1705 insertions, 40 deletions
diff --git a/drivers/xen/Kconfig b/drivers/xen/Kconfig
new file mode 100644
index 000000000000..4b75a16de009
--- /dev/null
+++ b/drivers/xen/Kconfig
@@ -0,0 +1,19 @@
+config XEN_BALLOON
+ bool "Xen memory balloon driver"
+ depends on XEN
+ default y
+ help
+ The balloon driver allows the Xen domain to request more memory from
+ the system to expand the domain's memory allocation, or alternatively
+ return unneeded memory to the system.
+
+config XEN_SCRUB_PAGES
+ bool "Scrub pages before returning them to system"
+ depends on XEN_BALLOON
+ default y
+ help
+ Scrub pages before returning them to the system for reuse by
+ other domains. This makes sure that any confidential data
+ is not accidentally visible to other domains. Is it more
+ secure, but slightly less efficient.
+ If in doubt, say yes.
diff --git a/drivers/xen/Makefile b/drivers/xen/Makefile
index 56592f0d6cef..37af04f1ffd9 100644
--- a/drivers/xen/Makefile
+++ b/drivers/xen/Makefile
@@ -1,2 +1,4 @@
-obj-y += grant-table.o
+obj-y += grant-table.o features.o events.o
obj-y += xenbus/
+obj-$(CONFIG_XEN_XENCOMM) += xencomm.o
+obj-$(CONFIG_XEN_BALLOON) += balloon.o
diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c
new file mode 100644
index 000000000000..ab25ba6cbbb9
--- /dev/null
+++ b/drivers/xen/balloon.c
@@ -0,0 +1,712 @@
+/******************************************************************************
+ * balloon.c
+ *
+ * Xen balloon driver - enables returning/claiming memory to/from Xen.
+ *
+ * Copyright (c) 2003, B Dragovic
+ * Copyright (c) 2003-2004, M Williamson, K Fraser
+ * Copyright (c) 2005 Dan M. Smith, IBM Corporation
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/errno.h>
+#include <linux/mm.h>
+#include <linux/bootmem.h>
+#include <linux/pagemap.h>
+#include <linux/highmem.h>
+#include <linux/mutex.h>
+#include <linux/highmem.h>
+#include <linux/list.h>
+#include <linux/sysdev.h>
+
+#include <asm/xen/hypervisor.h>
+#include <asm/page.h>
+#include <asm/pgalloc.h>
+#include <asm/pgtable.h>
+#include <asm/uaccess.h>
+#include <asm/tlb.h>
+
+#include <xen/interface/memory.h>
+#include <xen/balloon.h>
+#include <xen/xenbus.h>
+#include <xen/features.h>
+#include <xen/page.h>
+
+#define PAGES2KB(_p) ((_p)<<(PAGE_SHIFT-10))
+
+#define BALLOON_CLASS_NAME "memory"
+
+struct balloon_stats {
+ /* We aim for 'current allocation' == 'target allocation'. */
+ unsigned long current_pages;
+ unsigned long target_pages;
+ /* We may hit the hard limit in Xen. If we do then we remember it. */
+ unsigned long hard_limit;
+ /*
+ * Drivers may alter the memory reservation independently, but they
+ * must inform the balloon driver so we avoid hitting the hard limit.
+ */
+ unsigned long driver_pages;
+ /* Number of pages in high- and low-memory balloons. */
+ unsigned long balloon_low;
+ unsigned long balloon_high;
+};
+
+static DEFINE_MUTEX(balloon_mutex);
+
+static struct sys_device balloon_sysdev;
+
+static int register_balloon(struct sys_device *sysdev);
+
+/*
+ * Protects atomic reservation decrease/increase against concurrent increases.
+ * Also protects non-atomic updates of current_pages and driver_pages, and
+ * balloon lists.
+ */
+static DEFINE_SPINLOCK(balloon_lock);
+
+static struct balloon_stats balloon_stats;
+
+/* We increase/decrease in batches which fit in a page */
+static unsigned long frame_list[PAGE_SIZE / sizeof(unsigned long)];
+
+/* VM /proc information for memory */
+extern unsigned long totalram_pages;
+
+#ifdef CONFIG_HIGHMEM
+extern unsigned long totalhigh_pages;
+#define inc_totalhigh_pages() (totalhigh_pages++)
+#define dec_totalhigh_pages() (totalhigh_pages--)
+#else
+#define inc_totalhigh_pages() do {} while(0)
+#define dec_totalhigh_pages() do {} while(0)
+#endif
+
+/* List of ballooned pages, threaded through the mem_map array. */
+static LIST_HEAD(ballooned_pages);
+
+/* Main work function, always executed in process context. */
+static void balloon_process(struct work_struct *work);
+static DECLARE_WORK(balloon_worker, balloon_process);
+static struct timer_list balloon_timer;
+
+/* When ballooning out (allocating memory to return to Xen) we don't really
+ want the kernel to try too hard since that can trigger the oom killer. */
+#define GFP_BALLOON \
+ (GFP_HIGHUSER | __GFP_NOWARN | __GFP_NORETRY | __GFP_NOMEMALLOC)
+
+static void scrub_page(struct page *page)
+{
+#ifdef CONFIG_XEN_SCRUB_PAGES
+ if (PageHighMem(page)) {
+ void *v = kmap(page);
+ clear_page(v);
+ kunmap(v);
+ } else {
+ void *v = page_address(page);
+ clear_page(v);
+ }
+#endif
+}
+
+/* balloon_append: add the given page to the balloon. */
+static void balloon_append(struct page *page)
+{
+ /* Lowmem is re-populated first, so highmem pages go at list tail. */
+ if (PageHighMem(page)) {
+ list_add_tail(&page->lru, &ballooned_pages);
+ balloon_stats.balloon_high++;
+ dec_totalhigh_pages();
+ } else {
+ list_add(&page->lru, &ballooned_pages);
+ balloon_stats.balloon_low++;
+ }
+}
+
+/* balloon_retrieve: rescue a page from the balloon, if it is not empty. */
+static struct page *balloon_retrieve(void)
+{
+ struct page *page;
+
+ if (list_empty(&ballooned_pages))
+ return NULL;
+
+ page = list_entry(ballooned_pages.next, struct page, lru);
+ list_del(&page->lru);
+
+ if (PageHighMem(page)) {
+ balloon_stats.balloon_high--;
+ inc_totalhigh_pages();
+ }
+ else
+ balloon_stats.balloon_low--;
+
+ return page;
+}
+
+static struct page *balloon_first_page(void)
+{
+ if (list_empty(&ballooned_pages))
+ return NULL;
+ return list_entry(ballooned_pages.next, struct page, lru);
+}
+
+static struct page *balloon_next_page(struct page *page)
+{
+ struct list_head *next = page->lru.next;
+ if (next == &ballooned_pages)
+ return NULL;
+ return list_entry(next, struct page, lru);
+}
+
+static void balloon_alarm(unsigned long unused)
+{
+ schedule_work(&balloon_worker);
+}
+
+static unsigned long current_target(void)
+{
+ unsigned long target = min(balloon_stats.target_pages, balloon_stats.hard_limit);
+
+ target = min(target,
+ balloon_stats.current_pages +
+ balloon_stats.balloon_low +
+ balloon_stats.balloon_high);
+
+ return target;
+}
+
+static int increase_reservation(unsigned long nr_pages)
+{
+ unsigned long pfn, i, flags;
+ struct page *page;
+ long rc;
+ struct xen_memory_reservation reservation = {
+ .address_bits = 0,
+ .extent_order = 0,
+ .domid = DOMID_SELF
+ };
+
+ if (nr_pages > ARRAY_SIZE(frame_list))
+ nr_pages = ARRAY_SIZE(frame_list);
+
+ spin_lock_irqsave(&balloon_lock, flags);
+
+ page = balloon_first_page();
+ for (i = 0; i < nr_pages; i++) {
+ BUG_ON(page == NULL);
+ frame_list[i] = page_to_pfn(page);;
+ page = balloon_next_page(page);
+ }
+
+ reservation.extent_start = (unsigned long)frame_list;
+ reservation.nr_extents = nr_pages;
+ rc = HYPERVISOR_memory_op(
+ XENMEM_populate_physmap, &reservation);
+ if (rc < nr_pages) {
+ if (rc > 0) {
+ int ret;
+
+ /* We hit the Xen hard limit: reprobe. */
+ reservation.nr_extents = rc;
+ ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation,
+ &reservation);
+ BUG_ON(ret != rc);
+ }
+ if (rc >= 0)
+ balloon_stats.hard_limit = (balloon_stats.current_pages + rc -
+ balloon_stats.driver_pages);
+ goto out;
+ }
+
+ for (i = 0; i < nr_pages; i++) {
+ page = balloon_retrieve();
+ BUG_ON(page == NULL);
+
+ pfn = page_to_pfn(page);
+ BUG_ON(!xen_feature(XENFEAT_auto_translated_physmap) &&
+ phys_to_machine_mapping_valid(pfn));
+
+ set_phys_to_machine(pfn, frame_list[i]);
+
+ /* Link back into the page tables if not highmem. */
+ if (pfn < max_low_pfn) {
+ int ret;
+ ret = HYPERVISOR_update_va_mapping(
+ (unsigned long)__va(pfn << PAGE_SHIFT),
+ mfn_pte(frame_list[i], PAGE_KERNEL),
+ 0);
+ BUG_ON(ret);
+ }
+
+ /* Relinquish the page back to the allocator. */
+ ClearPageReserved(page);
+ init_page_count(page);
+ __free_page(page);
+ }
+
+ balloon_stats.current_pages += nr_pages;
+ totalram_pages = balloon_stats.current_pages;
+
+ out:
+ spin_unlock_irqrestore(&balloon_lock, flags);
+
+ return 0;
+}
+
+static int decrease_reservation(unsigned long nr_pages)
+{
+ unsigned long pfn, i, flags;
+ struct page *page;
+ int need_sleep = 0;
+ int ret;
+ struct xen_memory_reservation reservation = {
+ .address_bits = 0,
+ .extent_order = 0,
+ .domid = DOMID_SELF
+ };
+
+ if (nr_pages > ARRAY_SIZE(frame_list))
+ nr_pages = ARRAY_SIZE(frame_list);
+
+ for (i = 0; i < nr_pages; i++) {
+ if ((page = alloc_page(GFP_BALLOON)) == NULL) {
+ nr_pages = i;
+ need_sleep = 1;
+ break;
+ }
+
+ pfn = page_to_pfn(page);
+ frame_list[i] = pfn_to_mfn(pfn);
+
+ scrub_page(page);
+ }
+
+ /* Ensure that ballooned highmem pages don't have kmaps. */
+ kmap_flush_unused();
+ flush_tlb_all();
+
+ spin_lock_irqsave(&balloon_lock, flags);
+
+ /* No more mappings: invalidate P2M and add to balloon. */
+ for (i = 0; i < nr_pages; i++) {
+ pfn = mfn_to_pfn(frame_list[i]);
+ set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
+ balloon_append(pfn_to_page(pfn));
+ }
+
+ reservation.extent_start = (unsigned long)frame_list;
+ reservation.nr_extents = nr_pages;
+ ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation, &reservation);
+ BUG_ON(ret != nr_pages);
+
+ balloon_stats.current_pages -= nr_pages;
+ totalram_pages = balloon_stats.current_pages;
+
+ spin_unlock_irqrestore(&balloon_lock, flags);
+
+ return need_sleep;
+}
+
+/*
+ * We avoid multiple worker processes conflicting via the balloon mutex.
+ * We may of course race updates of the target counts (which are protected
+ * by the balloon lock), or with changes to the Xen hard limit, but we will
+ * recover from these in time.
+ */
+static void balloon_process(struct work_struct *work)
+{
+ int need_sleep = 0;
+ long credit;
+
+ mutex_lock(&balloon_mutex);
+
+ do {
+ credit = current_target() - balloon_stats.current_pages;
+ if (credit > 0)
+ need_sleep = (increase_reservation(credit) != 0);
+ if (credit < 0)
+ need_sleep = (decrease_reservation(-credit) != 0);
+
+#ifndef CONFIG_PREEMPT
+ if (need_resched())
+ schedule();
+#endif
+ } while ((credit != 0) && !need_sleep);
+
+ /* Schedule more work if there is some still to be done. */
+ if (current_target() != balloon_stats.current_pages)
+ mod_timer(&balloon_timer, jiffies + HZ);
+
+ mutex_unlock(&balloon_mutex);
+}
+
+/* Resets the Xen limit, sets new target, and kicks off processing. */
+void balloon_set_new_target(unsigned long target)
+{
+ /* No need for lock. Not read-modify-write updates. */
+ balloon_stats.hard_limit = ~0UL;
+ balloon_stats.target_pages = target;
+ schedule_work(&balloon_worker);
+}
+
+static struct xenbus_watch target_watch =
+{
+ .node = "memory/target"
+};
+
+/* React to a change in the target key */
+static void watch_target(struct xenbus_watch *watch,
+ const char **vec, unsigned int len)
+{
+ unsigned long long new_target;
+ int err;
+
+ err = xenbus_scanf(XBT_NIL, "memory", "target", "%llu", &new_target);
+ if (err != 1) {
+ /* This is ok (for domain0 at least) - so just return */
+ return;
+ }
+
+ /* The given memory/target value is in KiB, so it needs converting to
+ * pages. PAGE_SHIFT converts bytes to pages, hence PAGE_SHIFT - 10.
+ */
+ balloon_set_new_target(new_target >> (PAGE_SHIFT - 10));
+}
+
+static int balloon_init_watcher(struct notifier_block *notifier,
+ unsigned long event,
+ void *data)
+{
+ int err;
+
+ err = register_xenbus_watch(&target_watch);
+ if (err)
+ printk(KERN_ERR "Failed to set balloon watcher\n");
+
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block xenstore_notifier;
+
+static int __init balloon_init(void)
+{
+ unsigned long pfn;
+ struct page *page;
+
+ if (!is_running_on_xen())
+ return -ENODEV;
+
+ pr_info("xen_balloon: Initialising balloon driver.\n");
+
+ balloon_stats.current_pages = min(xen_start_info->nr_pages, max_pfn);
+ totalram_pages = balloon_stats.current_pages;
+ balloon_stats.target_pages = balloon_stats.current_pages;
+ balloon_stats.balloon_low = 0;
+ balloon_stats.balloon_high = 0;
+ balloon_stats.driver_pages = 0UL;
+ balloon_stats.hard_limit = ~0UL;
+
+ init_timer(&balloon_timer);
+ balloon_timer.data = 0;
+ balloon_timer.function = balloon_alarm;
+
+ register_balloon(&balloon_sysdev);
+
+ /* Initialise the balloon with excess memory space. */
+ for (pfn = xen_start_info->nr_pages; pfn < max_pfn; pfn++) {
+ page = pfn_to_page(pfn);
+ if (!PageReserved(page))
+ balloon_append(page);
+ }
+
+ target_watch.callback = watch_target;
+ xenstore_notifier.notifier_call = balloon_init_watcher;
+
+ register_xenstore_notifier(&xenstore_notifier);
+
+ return 0;
+}
+
+subsys_initcall(balloon_init);
+
+static void balloon_exit(void)
+{
+ /* XXX - release balloon here */
+ return;
+}
+
+module_exit(balloon_exit);
+
+static void balloon_update_driver_allowance(long delta)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&balloon_lock, flags);
+ balloon_stats.driver_pages += delta;
+ spin_unlock_irqrestore(&balloon_lock, flags);
+}
+
+static int dealloc_pte_fn(
+ pte_t *pte, struct page *pmd_page, unsigned long addr, void *data)
+{
+ unsigned long mfn = pte_mfn(*pte);
+ int ret;
+ struct xen_memory_reservation reservation = {
+ .nr_extents = 1,
+ .extent_order = 0,
+ .domid = DOMID_SELF
+ };
+ reservation.extent_start = (unsigned long)&mfn;
+ set_pte_at(&init_mm, addr, pte, __pte_ma(0ull));
+ set_phys_to_machine(__pa(addr) >> PAGE_SHIFT, INVALID_P2M_ENTRY);
+ ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation, &reservation);
+ BUG_ON(ret != 1);
+ return 0;
+}
+
+static struct page **alloc_empty_pages_and_pagevec(int nr_pages)
+{
+ unsigned long vaddr, flags;
+ struct page *page, **pagevec;
+ int i, ret;
+
+ pagevec = kmalloc(sizeof(page) * nr_pages, GFP_KERNEL);
+ if (pagevec == NULL)
+ return NULL;
+
+ for (i = 0; i < nr_pages; i++) {
+ page = pagevec[i] = alloc_page(GFP_KERNEL);
+ if (page == NULL)
+ goto err;
+
+ vaddr = (unsigned long)page_address(page);
+
+ scrub_page(page);
+
+ spin_lock_irqsave(&balloon_lock, flags);
+
+ if (xen_feature(XENFEAT_auto_translated_physmap)) {
+ unsigned long gmfn = page_to_pfn(page);
+ struct xen_memory_reservation reservation = {
+ .nr_extents = 1,
+ .extent_order = 0,
+ .domid = DOMID_SELF
+ };
+ reservation.extent_start = (unsigned long)&gmfn;
+ ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation,
+ &reservation);
+ if (ret == 1)
+ ret = 0; /* success */
+ } else {
+ ret = apply_to_page_range(&init_mm, vaddr, PAGE_SIZE,
+ dealloc_pte_fn, NULL);
+ }
+
+ if (ret != 0) {
+ spin_unlock_irqrestore(&balloon_lock, flags);
+ __free_page(page);
+ goto err;
+ }
+
+ totalram_pages = --balloon_stats.current_pages;
+
+ spin_unlock_irqrestore(&balloon_lock, flags);
+ }
+
+ out:
+ schedule_work(&balloon_worker);
+ flush_tlb_all();
+ return pagevec;
+
+ err:
+ spin_lock_irqsave(&balloon_lock, flags);
+ while (--i >= 0)
+ balloon_append(pagevec[i]);
+ spin_unlock_irqrestore(&balloon_lock, flags);
+ kfree(pagevec);
+ pagevec = NULL;
+ goto out;
+}
+
+static void free_empty_pages_and_pagevec(struct page **pagevec, int nr_pages)
+{
+ unsigned long flags;
+ int i;
+
+ if (pagevec == NULL)
+ return;
+
+ spin_lock_irqsave(&balloon_lock, flags);
+ for (i = 0; i < nr_pages; i++) {
+ BUG_ON(page_count(pagevec[i]) != 1);
+ balloon_append(pagevec[i]);
+ }
+ spin_unlock_irqrestore(&balloon_lock, flags);
+
+ kfree(pagevec);
+
+ schedule_work(&balloon_worker);
+}
+
+static void balloon_release_driver_page(struct page *page)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&balloon_lock, flags);
+ balloon_append(page);
+ balloon_stats.driver_pages--;
+ spin_unlock_irqrestore(&balloon_lock, flags);
+
+ schedule_work(&balloon_worker);
+}
+
+
+#define BALLOON_SHOW(name, format, args...) \
+ static ssize_t show_##name(struct sys_device *dev, \
+ char *buf) \
+ { \
+ return sprintf(buf, format, ##args); \
+ } \
+ static SYSDEV_ATTR(name, S_IRUGO, show_##name, NULL)
+
+BALLOON_SHOW(current_kb, "%lu\n", PAGES2KB(balloon_stats.current_pages));
+BALLOON_SHOW(low_kb, "%lu\n", PAGES2KB(balloon_stats.balloon_low));
+BALLOON_SHOW(high_kb, "%lu\n", PAGES2KB(balloon_stats.balloon_high));
+BALLOON_SHOW(hard_limit_kb,
+ (balloon_stats.hard_limit!=~0UL) ? "%lu\n" : "???\n",
+ (balloon_stats.hard_limit!=~0UL) ? PAGES2KB(balloon_stats.hard_limit) : 0);
+BALLOON_SHOW(driver_kb, "%lu\n", PAGES2KB(balloon_stats.driver_pages));
+
+static ssize_t show_target_kb(struct sys_device *dev, char *buf)
+{
+ return sprintf(buf, "%lu\n", PAGES2KB(balloon_stats.target_pages));
+}
+
+static ssize_t store_target_kb(struct sys_device *dev,
+ const char *buf,
+ size_t count)
+{
+ char memstring[64], *endchar;
+ unsigned long long target_bytes;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (count <= 1)
+ return -EBADMSG; /* runt */
+ if (count > sizeof(memstring))
+ return -EFBIG; /* too long */
+ strcpy(memstring, buf);
+
+ target_bytes = memparse(memstring, &endchar);
+ balloon_set_new_target(target_bytes >> PAGE_SHIFT);
+
+ return count;
+}
+
+static SYSDEV_ATTR(target_kb, S_IRUGO | S_IWUSR,
+ show_target_kb, store_target_kb);
+
+static struct sysdev_attribute *balloon_attrs[] = {
+ &attr_target_kb,
+};
+
+static struct attribute *balloon_info_attrs[] = {
+ &attr_current_kb.attr,
+ &attr_low_kb.attr,
+ &attr_high_kb.attr,
+ &attr_hard_limit_kb.attr,
+ &attr_driver_kb.attr,
+ NULL
+};
+
+static struct attribute_group balloon_info_group = {
+ .name = "info",
+ .attrs = balloon_info_attrs,
+};
+
+static struct sysdev_class balloon_sysdev_class = {
+ .name = BALLOON_CLASS_NAME,
+};
+
+static int register_balloon(struct sys_device *sysdev)
+{
+ int i, error;
+
+ error = sysdev_class_register(&balloon_sysdev_class);
+ if (error)
+ return error;
+
+ sysdev->id = 0;
+ sysdev->cls = &balloon_sysdev_class;
+
+ error = sysdev_register(sysdev);
+ if (error) {
+ sysdev_class_unregister(&balloon_sysdev_class);
+ return error;
+ }
+
+ for (i = 0; i < ARRAY_SIZE(balloon_attrs); i++) {
+ error = sysdev_create_file(sysdev, balloon_attrs[i]);
+ if (error)
+ goto fail;
+ }
+
+ error = sysfs_create_group(&sysdev->kobj, &balloon_info_group);
+ if (error)
+ goto fail;
+
+ return 0;
+
+ fail:
+ while (--i >= 0)
+ sysdev_remove_file(sysdev, balloon_attrs[i]);
+ sysdev_unregister(sysdev);
+ sysdev_class_unregister(&balloon_sysdev_class);
+ return error;
+}
+
+static void unregister_balloon(struct sys_device *sysdev)
+{
+ int i;
+
+ sysfs_remove_group(&sysdev->kobj, &balloon_info_group);
+ for (i = 0; i < ARRAY_SIZE(balloon_attrs); i++)
+ sysdev_remove_file(sysdev, balloon_attrs[i]);
+ sysdev_unregister(sysdev);
+ sysdev_class_unregister(&balloon_sysdev_class);
+}
+
+static void balloon_sysfs_exit(void)
+{
+ unregister_balloon(&balloon_sysdev);
+}
+
+MODULE_LICENSE("GPL");
diff --git a/drivers/xen/events.c b/drivers/xen/events.c
new file mode 100644
index 000000000000..4f0f22b020ea
--- /dev/null
+++ b/drivers/xen/events.c
@@ -0,0 +1,674 @@
+/*
+ * Xen event channels
+ *
+ * Xen models interrupts with abstract event channels. Because each
+ * domain gets 1024 event channels, but NR_IRQ is not that large, we
+ * must dynamically map irqs<->event channels. The event channels
+ * interface with the rest of the kernel by defining a xen interrupt
+ * chip. When an event is recieved, it is mapped to an irq and sent
+ * through the normal interrupt processing path.
+ *
+ * There are four kinds of events which can be mapped to an event
+ * channel:
+ *
+ * 1. Inter-domain notifications. This includes all the virtual
+ * device events, since they're driven by front-ends in another domain
+ * (typically dom0).
+ * 2. VIRQs, typically used for timers. These are per-cpu events.
+ * 3. IPIs.
+ * 4. Hardware interrupts. Not supported at present.
+ *
+ * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
+ */
+
+#include <linux/linkage.h>
+#include <linux/interrupt.h>
+#include <linux/irq.h>
+#include <linux/module.h>
+#include <linux/string.h>
+
+#include <asm/ptrace.h>
+#include <asm/irq.h>
+#include <asm/sync_bitops.h>
+#include <asm/xen/hypercall.h>
+#include <asm/xen/hypervisor.h>
+
+#include <xen/xen-ops.h>
+#include <xen/events.h>
+#include <xen/interface/xen.h>
+#include <xen/interface/event_channel.h>
+
+/*
+ * This lock protects updates to the following mapping and reference-count
+ * arrays. The lock does not need to be acquired to read the mapping tables.
+ */
+static DEFINE_SPINLOCK(irq_mapping_update_lock);
+
+/* IRQ <-> VIRQ mapping. */
+static DEFINE_PER_CPU(int, virq_to_irq[NR_VIRQS]) = {[0 ... NR_VIRQS-1] = -1};
+
+/* IRQ <-> IPI mapping */
+static DEFINE_PER_CPU(int, ipi_to_irq[XEN_NR_IPIS]) = {[0 ... XEN_NR_IPIS-1] = -1};
+
+/* Packed IRQ information: binding type, sub-type index, and event channel. */
+struct packed_irq
+{
+ unsigned short evtchn;
+ unsigned char index;
+ unsigned char type;
+};
+
+static struct packed_irq irq_info[NR_IRQS];
+
+/* Binding types. */
+enum {
+ IRQT_UNBOUND,
+ IRQT_PIRQ,
+ IRQT_VIRQ,
+ IRQT_IPI,
+ IRQT_EVTCHN
+};
+
+/* Convenient shorthand for packed representation of an unbound IRQ. */
+#define IRQ_UNBOUND mk_irq_info(IRQT_UNBOUND, 0, 0)
+
+static int evtchn_to_irq[NR_EVENT_CHANNELS] = {
+ [0 ... NR_EVENT_CHANNELS-1] = -1
+};
+static unsigned long cpu_evtchn_mask[NR_CPUS][NR_EVENT_CHANNELS/BITS_PER_LONG];
+static u8 cpu_evtchn[NR_EVENT_CHANNELS];
+
+/* Reference counts for bindings to IRQs. */
+static int irq_bindcount[NR_IRQS];
+
+/* Xen will never allocate port zero for any purpose. */
+#define VALID_EVTCHN(chn) ((chn) != 0)
+
+/*
+ * Force a proper event-channel callback from Xen after clearing the
+ * callback mask. We do this in a very simple manner, by making a call
+ * down into Xen. The pending flag will be checked by Xen on return.
+ */
+void force_evtchn_callback(void)
+{
+ (void)HYPERVISOR_xen_version(0, NULL);
+}
+EXPORT_SYMBOL_GPL(force_evtchn_callback);
+
+static struct irq_chip xen_dynamic_chip;
+
+/* Constructor for packed IRQ information. */
+static inline struct packed_irq mk_irq_info(u32 type, u32 index, u32 evtchn)
+{
+ return (struct packed_irq) { evtchn, index, type };
+}
+
+/*
+ * Accessors for packed IRQ information.
+ */
+static inline unsigned int evtchn_from_irq(int irq)
+{
+ return irq_info[irq].evtchn;
+}
+
+static inline unsigned int index_from_irq(int irq)
+{
+ return irq_info[irq].index;
+}
+
+static inline unsigned int type_from_irq(int irq)
+{
+ return irq_info[irq].type;
+}
+
+static inline unsigned long active_evtchns(unsigned int cpu,
+ struct shared_info *sh,
+ unsigned int idx)
+{
+ return (sh->evtchn_pending[idx] &
+ cpu_evtchn_mask[cpu][idx] &
+ ~sh->evtchn_mask[idx]);
+}
+
+static void bind_evtchn_to_cpu(unsigned int chn, unsigned int cpu)
+{
+ int irq = evtchn_to_irq[chn];
+
+ BUG_ON(irq == -1);
+#ifdef CONFIG_SMP
+ irq_desc[irq].affinity = cpumask_of_cpu(cpu);
+#endif
+
+ __clear_bit(chn, cpu_evtchn_mask[cpu_evtchn[chn]]);
+ __set_bit(chn, cpu_evtchn_mask[cpu]);
+
+ cpu_evtchn[chn] = cpu;
+}
+
+static void init_evtchn_cpu_bindings(void)
+{
+#ifdef CONFIG_SMP
+ int i;
+ /* By default all event channels notify CPU#0. */
+ for (i = 0; i < NR_IRQS; i++)
+ irq_desc[i].affinity = cpumask_of_cpu(0);
+#endif
+
+ memset(cpu_evtchn, 0, sizeof(cpu_evtchn));
+ memset(cpu_evtchn_mask[0], ~0, sizeof(cpu_evtchn_mask[0]));
+}
+
+static inline unsigned int cpu_from_evtchn(unsigned int evtchn)
+{
+ return cpu_evtchn[evtchn];
+}
+
+static inline void clear_evtchn(int port)
+{
+ struct shared_info *s = HYPERVISOR_shared_info;
+ sync_clear_bit(port, &s->evtchn_pending[0]);
+}
+
+static inline void set_evtchn(int port)
+{
+ struct shared_info *s = HYPERVISOR_shared_info;
+ sync_set_bit(port, &s->evtchn_pending[0]);
+}
+
+
+/**
+ * notify_remote_via_irq - send event to remote end of event channel via irq
+ * @irq: irq of event channel to send event to
+ *
+ * Unlike notify_remote_via_evtchn(), this is safe to use across
+ * save/restore. Notifications on a broken connection are silently
+ * dropped.
+ */
+void notify_remote_via_irq(int irq)
+{
+ int evtchn = evtchn_from_irq(irq);
+
+ if (VALID_EVTCHN(evtchn))
+ notify_remote_via_evtchn(evtchn);
+}
+EXPORT_SYMBOL_GPL(notify_remote_via_irq);
+
+static void mask_evtchn(int port)
+{
+ struct shared_info *s = HYPERVISOR_shared_info;
+ sync_set_bit(port, &s->evtchn_mask[0]);
+}
+
+static void unmask_evtchn(int port)
+{
+ struct shared_info *s = HYPERVISOR_shared_info;
+ unsigned int cpu = get_cpu();
+
+ BUG_ON(!irqs_disabled());
+
+ /* Slow path (hypercall) if this is a non-local port. */
+ if (unlikely(cpu != cpu_from_evtchn(port))) {
+ struct evtchn_unmask unmask = { .port = port };
+ (void)HYPERVISOR_event_channel_op(EVTCHNOP_unmask, &unmask);
+ } else {
+ struct vcpu_info *vcpu_info = __get_cpu_var(xen_vcpu);
+
+ sync_clear_bit(port, &s->evtchn_mask[0]);
+
+ /*
+ * The following is basically the equivalent of
+ * 'hw_resend_irq'. Just like a real IO-APIC we 'lose
+ * the interrupt edge' if the channel is masked.
+ */
+ if (sync_test_bit(port, &s->evtchn_pending[0]) &&
+ !sync_test_and_set_bit(port / BITS_PER_LONG,
+ &vcpu_info->evtchn_pending_sel))
+ vcpu_info->evtchn_upcall_pending = 1;
+ }
+
+ put_cpu();
+}
+
+static int find_unbound_irq(void)
+{
+ int irq;
+
+ /* Only allocate from dynirq range */
+ for (irq = 0; irq < NR_IRQS; irq++)
+ if (irq_bindcount[irq] == 0)
+ break;
+
+ if (irq == NR_IRQS)
+ panic("No available IRQ to bind to: increase NR_IRQS!\n");
+
+ return irq;
+}
+
+int bind_evtchn_to_irq(unsigned int evtchn)
+{
+ int irq;
+
+ spin_lock(&irq_mapping_update_lock);
+
+ irq = evtchn_to_irq[evtchn];
+
+ if (irq == -1) {
+ irq = find_unbound_irq();
+
+ dynamic_irq_init(irq);
+ set_irq_chip_and_handler_name(irq, &xen_dynamic_chip,
+ handle_level_irq, "event");
+
+ evtchn_to_irq[evtchn] = irq;
+ irq_info[irq] = mk_irq_info(IRQT_EVTCHN, 0, evtchn);
+ }
+
+ irq_bindcount[irq]++;
+
+ spin_unlock(&irq_mapping_update_lock);
+
+ return irq;
+}
+EXPORT_SYMBOL_GPL(bind_evtchn_to_irq);
+
+static int bind_ipi_to_irq(unsigned int ipi, unsigned int cpu)
+{
+ struct evtchn_bind_ipi bind_ipi;
+ int evtchn, irq;
+
+ spin_lock(&irq_mapping_update_lock);
+
+ irq = per_cpu(ipi_to_irq, cpu)[ipi];
+ if (irq == -1) {
+ irq = find_unbound_irq();
+ if (irq < 0)
+ goto out;
+
+ dynamic_irq_init(irq);
+ set_irq_chip_and_handler_name(irq, &xen_dynamic_chip,
+ handle_level_irq, "ipi");
+
+ bind_ipi.vcpu = cpu;
+ if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_ipi,
+ &bind_ipi) != 0)
+ BUG();
+ evtchn = bind_ipi.port;
+
+ evtchn_to_irq[evtchn] = irq;
+ irq_info[irq] = mk_irq_info(IRQT_IPI, ipi, evtchn);
+
+ per_cpu(ipi_to_irq, cpu)[ipi] = irq;
+
+ bind_evtchn_to_cpu(evtchn, cpu);
+ }
+
+ irq_bindcount[irq]++;
+
+ out:
+ spin_unlock(&irq_mapping_update_lock);
+ return irq;
+}
+
+
+static int bind_virq_to_irq(unsigned int virq, unsigned int cpu)
+{
+ struct evtchn_bind_virq bind_virq;
+ int evtchn, irq;
+
+ spin_lock(&irq_mapping_update_lock);
+
+ irq = per_cpu(virq_to_irq, cpu)[virq];
+
+ if (irq == -1) {
+ bind_virq.virq = virq;
+ bind_virq.vcpu = cpu;
+ if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_virq,
+ &bind_virq) != 0)
+ BUG();
+ evtchn = bind_virq.port;
+
+ irq = find_unbound_irq();
+
+ dynamic_irq_init(irq);
+ set_irq_chip_and_handler_name(irq, &xen_dynamic_chip,
+ handle_level_irq, "virq");
+
+ evtchn_to_irq[evtchn] = irq;
+ irq_info[irq] = mk_irq_info(IRQT_VIRQ, virq, evtchn);
+
+ per_cpu(virq_to_irq, cpu)[virq] = irq;
+
+ bind_evtchn_to_cpu(evtchn, cpu);
+ }
+
+ irq_bindcount[irq]++;
+
+ spin_unlock(&irq_mapping_update_lock);
+
+ return irq;
+}
+
+static void unbind_from_irq(unsigned int irq)
+{
+ struct evtchn_close close;
+ int evtchn = evtchn_from_irq(irq);
+
+ spin_lock(&irq_mapping_update_lock);
+
+ if (VALID_EVTCHN(evtchn) && (--irq_bindcount[irq] == 0)) {
+ close.port = evtchn;
+ if (HYPERVISOR_event_channel_op(EVTCHNOP_close, &close) != 0)
+ BUG();
+
+ switch (type_from_irq(irq)) {
+ case IRQT_VIRQ:
+ per_cpu(virq_to_irq, cpu_from_evtchn(evtchn))
+ [index_from_irq(irq)] = -1;
+ break;
+ default:
+ break;
+ }
+
+ /* Closed ports are implicitly re-bound to VCPU0. */
+ bind_evtchn_to_cpu(evtchn, 0);
+
+ evtchn_to_irq[evtchn] = -1;
+ irq_info[irq] = IRQ_UNBOUND;
+
+ dynamic_irq_init(irq);
+ }
+
+ spin_unlock(&irq_mapping_update_lock);
+}
+
+int bind_evtchn_to_irqhandler(unsigned int evtchn,
+ irq_handler_t handler,
+ unsigned long irqflags,
+ const char *devname, void *dev_id)
+{
+ unsigned int irq;
+ int retval;
+
+ irq = bind_evtchn_to_irq(evtchn);
+ retval = request_irq(irq, handler, irqflags, devname, dev_id);
+ if (retval != 0) {
+ unbind_from_irq(irq);
+ return retval;
+ }
+
+ return irq;
+}
+EXPORT_SYMBOL_GPL(bind_evtchn_to_irqhandler);
+
+int bind_virq_to_irqhandler(unsigned int virq, unsigned int cpu,
+ irq_handler_t handler,
+ unsigned long irqflags, const char *devname, void *dev_id)
+{
+ unsigned int irq;
+ int retval;
+
+ irq = bind_virq_to_irq(virq, cpu);
+ retval = request_irq(irq, handler, irqflags, devname, dev_id);
+ if (retval != 0) {
+ unbind_from_irq(irq);
+ return retval;
+ }
+
+ return irq;
+}
+EXPORT_SYMBOL_GPL(bind_virq_to_irqhandler);
+
+int bind_ipi_to_irqhandler(enum ipi_vector ipi,
+ unsigned int cpu,
+ irq_handler_t handler,
+ unsigned long irqflags,
+ const char *devname,
+ void *dev_id)
+{
+ int irq, retval;
+
+ irq = bind_ipi_to_irq(ipi, cpu);
+ if (irq < 0)
+ return irq;
+
+ retval = request_irq(irq, handler, irqflags, devname, dev_id);
+ if (retval != 0) {
+ unbind_from_irq(irq);
+ return retval;
+ }
+
+ return irq;
+}
+
+void unbind_from_irqhandler(unsigned int irq, void *dev_id)
+{
+ free_irq(irq, dev_id);
+ unbind_from_irq(irq);
+}
+EXPORT_SYMBOL_GPL(unbind_from_irqhandler);
+
+void xen_send_IPI_one(unsigned int cpu, enum ipi_vector vector)
+{
+ int irq = per_cpu(ipi_to_irq, cpu)[vector];
+ BUG_ON(irq < 0);
+ notify_remote_via_irq(irq);
+}
+
+irqreturn_t xen_debug_interrupt(int irq, void *dev_id)
+{
+ struct shared_info *sh = HYPERVISOR_shared_info;
+ int cpu = smp_processor_id();
+ int i;
+ unsigned long flags;
+ static DEFINE_SPINLOCK(debug_lock);
+
+ spin_lock_irqsave(&debug_lock, flags);
+
+ printk("vcpu %d\n ", cpu);
+
+ for_each_online_cpu(i) {
+ struct vcpu_info *v = per_cpu(xen_vcpu, i);
+ printk("%d: masked=%d pending=%d event_sel %08lx\n ", i,
+ (get_irq_regs() && i == cpu) ? xen_irqs_disabled(get_irq_regs()) : v->evtchn_upcall_mask,
+ v->evtchn_upcall_pending,
+ v->evtchn_pending_sel);
+ }
+ printk("pending:\n ");
+ for(i = ARRAY_SIZE(sh->evtchn_pending)-1; i >= 0; i--)
+ printk("%08lx%s", sh->evtchn_pending[i],
+ i % 8 == 0 ? "\n " : " ");
+ printk("\nmasks:\n ");
+ for(i = ARRAY_SIZE(sh->evtchn_mask)-1; i >= 0; i--)
+ printk("%08lx%s", sh->evtchn_mask[i],
+ i % 8 == 0 ? "\n " : " ");
+
+ printk("\nunmasked:\n ");
+ for(i = ARRAY_SIZE(sh->evtchn_mask)-1; i >= 0; i--)
+ printk("%08lx%s", sh->evtchn_pending[i] & ~sh->evtchn_mask[i],
+ i % 8 == 0 ? "\n " : " ");
+
+ printk("\npending list:\n");
+ for(i = 0; i < NR_EVENT_CHANNELS; i++) {
+ if (sync_test_bit(i, sh->evtchn_pending)) {
+ printk(" %d: event %d -> irq %d\n",
+ cpu_evtchn[i], i,
+ evtchn_to_irq[i]);
+ }
+ }
+
+ spin_unlock_irqrestore(&debug_lock, flags);
+
+ return IRQ_HANDLED;
+}
+
+
+/*
+ * Search the CPUs pending events bitmasks. For each one found, map
+ * the event number to an irq, and feed it into do_IRQ() for
+ * handling.
+ *
+ * Xen uses a two-level bitmap to speed searching. The first level is
+ * a bitset of words which contain pending event bits. The second
+ * level is a bitset of pending events themselves.
+ */
+void xen_evtchn_do_upcall(struct pt_regs *regs)
+{
+ int cpu = get_cpu();
+ struct shared_info *s = HYPERVISOR_shared_info;
+ struct vcpu_info *vcpu_info = __get_cpu_var(xen_vcpu);
+ static DEFINE_PER_CPU(unsigned, nesting_count);
+ unsigned count;
+
+ do {
+ unsigned long pending_words;
+
+ vcpu_info->evtchn_upcall_pending = 0;
+
+ if (__get_cpu_var(nesting_count)++)
+ goto out;
+
+#ifndef CONFIG_X86 /* No need for a barrier -- XCHG is a barrier on x86. */
+ /* Clear master flag /before/ clearing selector flag. */
+ rmb();
+#endif
+ pending_words = xchg(&vcpu_info->evtchn_pending_sel, 0);
+ while (pending_words != 0) {
+ unsigned long pending_bits;
+ int word_idx = __ffs(pending_words);
+ pending_words &= ~(1UL << word_idx);
+
+ while ((pending_bits = active_evtchns(cpu, s, word_idx)) != 0) {
+ int bit_idx = __ffs(pending_bits);
+ int port = (word_idx * BITS_PER_LONG) + bit_idx;
+ int irq = evtchn_to_irq[port];
+
+ if (irq != -1)
+ xen_do_IRQ(irq, regs);
+ }
+ }
+
+ BUG_ON(!irqs_disabled());
+
+ count = __get_cpu_var(nesting_count);
+ __get_cpu_var(nesting_count) = 0;
+ } while(count != 1);
+
+out:
+ put_cpu();
+}
+
+/* Rebind an evtchn so that it gets delivered to a specific cpu */
+static void rebind_irq_to_cpu(unsigned irq, unsigned tcpu)
+{
+ struct evtchn_bind_vcpu bind_vcpu;
+ int evtchn = evtchn_from_irq(irq);
+
+ if (!VALID_EVTCHN(evtchn))
+ return;
+
+ /* Send future instances of this interrupt to other vcpu. */
+ bind_vcpu.port = evtchn;
+ bind_vcpu.vcpu = tcpu;
+
+ /*
+ * If this fails, it usually just indicates that we're dealing with a
+ * virq or IPI channel, which don't actually need to be rebound. Ignore
+ * it, but don't do the xenlinux-level rebind in that case.
+ */
+ if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_vcpu, &bind_vcpu) >= 0)
+ bind_evtchn_to_cpu(evtchn, tcpu);
+}
+
+
+static void set_affinity_irq(unsigned irq, cpumask_t dest)
+{
+ unsigned tcpu = first_cpu(dest);
+ rebind_irq_to_cpu(irq, tcpu);
+}
+
+int resend_irq_on_evtchn(unsigned int irq)
+{
+ int masked, evtchn = evtchn_from_irq(irq);
+ struct shared_info *s = HYPERVISOR_shared_info;
+
+ if (!VALID_EVTCHN(evtchn))
+ return 1;
+
+ masked = sync_test_and_set_bit(evtchn, s->evtchn_mask);
+ sync_set_bit(evtchn, s->evtchn_pending);
+ if (!masked)
+ unmask_evtchn(evtchn);
+
+ return 1;
+}
+
+static void enable_dynirq(unsigned int irq)
+{
+ int evtchn = evtchn_from_irq(irq);
+
+ if (VALID_EVTCHN(evtchn))
+ unmask_evtchn(evtchn);
+}
+
+static void disable_dynirq(unsigned int irq)
+{
+ int evtchn = evtchn_from_irq(irq);
+
+ if (VALID_EVTCHN(evtchn))
+ mask_evtchn(evtchn);
+}
+
+static void ack_dynirq(unsigned int irq)
+{
+ int evtchn = evtchn_from_irq(irq);
+
+ move_native_irq(irq);
+
+ if (VALID_EVTCHN(evtchn))
+ clear_evtchn(evtchn);
+}
+
+static int retrigger_dynirq(unsigned int irq)
+{
+ int evtchn = evtchn_from_irq(irq);
+ struct shared_info *sh = HYPERVISOR_shared_info;
+ int ret = 0;
+
+ if (VALID_EVTCHN(evtchn)) {
+ int masked;
+
+ masked = sync_test_and_set_bit(evtchn, sh->evtchn_mask);
+ sync_set_bit(evtchn, sh->evtchn_pending);
+ if (!masked)
+ unmask_evtchn(evtchn);
+ ret = 1;
+ }
+
+ return ret;
+}
+
+static struct irq_chip xen_dynamic_chip __read_mostly = {
+ .name = "xen-dyn",
+ .mask = disable_dynirq,
+ .unmask = enable_dynirq,
+ .ack = ack_dynirq,
+ .set_affinity = set_affinity_irq,
+ .retrigger = retrigger_dynirq,
+};
+
+void __init xen_init_IRQ(void)
+{
+ int i;
+
+ init_evtchn_cpu_bindings();
+
+ /* No event channels are 'live' right now. */
+ for (i = 0; i < NR_EVENT_CHANNELS; i++)
+ mask_evtchn(i);
+
+ /* Dynamic IRQ space is currently unbound. Zero the refcnts. */
+ for (i = 0; i < NR_IRQS; i++)
+ irq_bindcount[i] = 0;
+
+ irq_ctx_init(smp_processor_id());
+}
diff --git a/drivers/xen/features.c b/drivers/xen/features.c
new file mode 100644
index 000000000000..0707714e40d6
--- /dev/null
+++ b/drivers/xen/features.c
@@ -0,0 +1,29 @@
+/******************************************************************************
+ * features.c
+ *
+ * Xen feature flags.
+ *
+ * Copyright (c) 2006, Ian Campbell, XenSource Inc.
+ */
+#include <linux/types.h>
+#include <linux/cache.h>
+#include <linux/module.h>
+#include <asm/xen/hypervisor.h>
+#include <xen/features.h>
+
+u8 xen_features[XENFEAT_NR_SUBMAPS * 32] __read_mostly;
+EXPORT_SYMBOL_GPL(xen_features);
+
+void xen_setup_features(void)
+{
+ struct xen_feature_info fi;
+ int i, j;
+
+ for (i = 0; i < XENFEAT_NR_SUBMAPS; i++) {
+ fi.submap_idx = i;
+ if (HYPERVISOR_xen_version(XENVER_get_features, &fi) < 0)
+ break;
+ for (j = 0; j < 32; j++)
+ xen_features[i * 32 + j] = !!(fi.submap & 1<<j);
+ }
+}
diff --git a/drivers/xen/grant-table.c b/drivers/xen/grant-table.c
index d85dc6d41c2a..52b6b41b909d 100644
--- a/drivers/xen/grant-table.c
+++ b/drivers/xen/grant-table.c
@@ -439,24 +439,6 @@ static inline unsigned int max_nr_grant_frames(void)
return xen_max;
}
-static int map_pte_fn(pte_t *pte, struct page *pmd_page,
- unsigned long addr, void *data)
-{
- unsigned long **frames = (unsigned long **)data;
-
- set_pte_at(&init_mm, addr, pte, mfn_pte((*frames)[0], PAGE_KERNEL));
- (*frames)++;
- return 0;
-}
-
-static int unmap_pte_fn(pte_t *pte, struct page *pmd_page,
- unsigned long addr, void *data)
-{
-
- set_pte_at(&init_mm, addr, pte, __pte(0));
- return 0;
-}
-
static int gnttab_map(unsigned int start_idx, unsigned int end_idx)
{
struct gnttab_setup_table setup;
@@ -470,7 +452,7 @@ static int gnttab_map(unsigned int start_idx, unsigned int end_idx)
setup.dom = DOMID_SELF;
setup.nr_frames = nr_gframes;
- setup.frame_list = frames;
+ set_xen_guest_handle(setup.frame_list, frames);
rc = HYPERVISOR_grant_table_op(GNTTABOP_setup_table, &setup, 1);
if (rc == -ENOSYS) {
@@ -480,17 +462,9 @@ static int gnttab_map(unsigned int start_idx, unsigned int end_idx)
BUG_ON(rc || setup.status);
- if (shared == NULL) {
- struct vm_struct *area;
- area = alloc_vm_area(PAGE_SIZE * max_nr_grant_frames());
- BUG_ON(area == NULL);
- shared = area->addr;
- }
- rc = apply_to_page_range(&init_mm, (unsigned long)shared,
- PAGE_SIZE * nr_gframes,
- map_pte_fn, &frames);
+ rc = arch_gnttab_map_shared(frames, nr_gframes, max_nr_grant_frames(),
+ &shared);
BUG_ON(rc);
- frames -= nr_gframes; /* adjust after map_pte_fn() */
kfree(frames);
@@ -506,10 +480,7 @@ static int gnttab_resume(void)
static int gnttab_suspend(void)
{
- apply_to_page_range(&init_mm, (unsigned long)shared,
- PAGE_SIZE * nr_grant_frames,
- unmap_pte_fn, NULL);
-
+ arch_gnttab_unmap_shared(shared, nr_grant_frames);
return 0;
}
diff --git a/drivers/xen/xenbus/xenbus_client.c b/drivers/xen/xenbus/xenbus_client.c
index 9fd2f70ab46d..0f86b0ff7879 100644
--- a/drivers/xen/xenbus/xenbus_client.c
+++ b/drivers/xen/xenbus/xenbus_client.c
@@ -399,7 +399,7 @@ int xenbus_map_ring_valloc(struct xenbus_device *dev, int gnt_ref, void **vaddr)
*vaddr = NULL;
- area = alloc_vm_area(PAGE_SIZE);
+ area = xen_alloc_vm_area(PAGE_SIZE);
if (!area)
return -ENOMEM;
@@ -409,7 +409,7 @@ int xenbus_map_ring_valloc(struct xenbus_device *dev, int gnt_ref, void **vaddr)
BUG();
if (op.status != GNTST_okay) {
- free_vm_area(area);
+ xen_free_vm_area(area);
xenbus_dev_fatal(dev, op.status,
"mapping in shared page %d from domain %d",
gnt_ref, dev->otherend_id);
@@ -508,7 +508,7 @@ int xenbus_unmap_ring_vfree(struct xenbus_device *dev, void *vaddr)
BUG();
if (op.status == GNTST_okay)
- free_vm_area(area);
+ xen_free_vm_area(area);
else
xenbus_dev_error(dev, op.status,
"unmapping page at handle %d error %d",
diff --git a/drivers/xen/xenbus/xenbus_probe.c b/drivers/xen/xenbus/xenbus_probe.c
index 4750de316ad3..57ceb5346b74 100644
--- a/drivers/xen/xenbus/xenbus_probe.c
+++ b/drivers/xen/xenbus/xenbus_probe.c
@@ -88,6 +88,16 @@ int xenbus_match(struct device *_dev, struct device_driver *_drv)
return match_device(drv->ids, to_xenbus_device(_dev)) != NULL;
}
+static int xenbus_uevent(struct device *_dev, struct kobj_uevent_env *env)
+{
+ struct xenbus_device *dev = to_xenbus_device(_dev);
+
+ if (add_uevent_var(env, "MODALIAS=xen:%s", dev->devicetype))
+ return -ENOMEM;
+
+ return 0;
+}
+
/* device/<type>/<id> => <type>-<id> */
static int frontend_bus_id(char bus_id[BUS_ID_SIZE], const char *nodename)
{
@@ -166,6 +176,7 @@ static struct xen_bus_type xenbus_frontend = {
.bus = {
.name = "xen",
.match = xenbus_match,
+ .uevent = xenbus_uevent,
.probe = xenbus_dev_probe,
.remove = xenbus_dev_remove,
.shutdown = xenbus_dev_shutdown,
@@ -438,6 +449,12 @@ static ssize_t xendev_show_devtype(struct device *dev,
}
DEVICE_ATTR(devtype, S_IRUSR | S_IRGRP | S_IROTH, xendev_show_devtype, NULL);
+static ssize_t xendev_show_modalias(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ return sprintf(buf, "xen:%s\n", to_xenbus_device(dev)->devicetype);
+}
+DEVICE_ATTR(modalias, S_IRUSR | S_IRGRP | S_IROTH, xendev_show_modalias, NULL);
int xenbus_probe_node(struct xen_bus_type *bus,
const char *type,
@@ -492,10 +509,16 @@ int xenbus_probe_node(struct xen_bus_type *bus,
err = device_create_file(&xendev->dev, &dev_attr_devtype);
if (err)
- goto fail_remove_file;
+ goto fail_remove_nodename;
+
+ err = device_create_file(&xendev->dev, &dev_attr_modalias);
+ if (err)
+ goto fail_remove_devtype;
return 0;
-fail_remove_file:
+fail_remove_devtype:
+ device_remove_file(&xendev->dev, &dev_attr_devtype);
+fail_remove_nodename:
device_remove_file(&xendev->dev, &dev_attr_nodename);
fail_unregister:
device_unregister(&xendev->dev);
@@ -846,6 +869,7 @@ static int is_disconnected_device(struct device *dev, void *data)
{
struct xenbus_device *xendev = to_xenbus_device(dev);
struct device_driver *drv = data;
+ struct xenbus_driver *xendrv;
/*
* A device with no driver will never connect. We care only about
@@ -858,7 +882,9 @@ static int is_disconnected_device(struct device *dev, void *data)
if (drv && (dev->driver != drv))
return 0;
- return (xendev->state != XenbusStateConnected);
+ xendrv = to_xenbus_driver(dev->driver);
+ return (xendev->state != XenbusStateConnected ||
+ (xendrv->is_ready && !xendrv->is_ready(xendev)));
}
static int exists_disconnected_device(struct device_driver *drv)
diff --git a/drivers/xen/xencomm.c b/drivers/xen/xencomm.c
new file mode 100644
index 000000000000..797cb4e31f07
--- /dev/null
+++ b/drivers/xen/xencomm.c
@@ -0,0 +1,232 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * Copyright (C) IBM Corp. 2006
+ *
+ * Authors: Hollis Blanchard <hollisb@us.ibm.com>
+ */
+
+#include <linux/gfp.h>
+#include <linux/mm.h>
+#include <asm/page.h>
+#include <xen/xencomm.h>
+#include <xen/interface/xen.h>
+#ifdef __ia64__
+#include <asm/xen/xencomm.h> /* for is_kern_addr() */
+#endif
+
+#ifdef HAVE_XEN_PLATFORM_COMPAT_H
+#include <xen/platform-compat.h>
+#endif
+
+static int xencomm_init(struct xencomm_desc *desc,
+ void *buffer, unsigned long bytes)
+{
+ unsigned long recorded = 0;
+ int i = 0;
+
+ while ((recorded < bytes) && (i < desc->nr_addrs)) {
+ unsigned long vaddr = (unsigned long)buffer + recorded;
+ unsigned long paddr;
+ int offset;
+ int chunksz;
+
+ offset = vaddr % PAGE_SIZE; /* handle partial pages */
+ chunksz = min(PAGE_SIZE - offset, bytes - recorded);
+
+ paddr = xencomm_vtop(vaddr);
+ if (paddr == ~0UL) {
+ printk(KERN_DEBUG "%s: couldn't translate vaddr %lx\n",
+ __func__, vaddr);
+ return -EINVAL;
+ }
+
+ desc->address[i++] = paddr;
+ recorded += chunksz;
+ }
+
+ if (recorded < bytes) {
+ printk(KERN_DEBUG
+ "%s: could only translate %ld of %ld bytes\n",
+ __func__, recorded, bytes);
+ return -ENOSPC;
+ }
+
+ /* mark remaining addresses invalid (just for safety) */
+ while (i < desc->nr_addrs)
+ desc->address[i++] = XENCOMM_INVALID;
+
+ desc->magic = XENCOMM_MAGIC;
+
+ return 0;
+}
+
+static struct xencomm_desc *xencomm_alloc(gfp_t gfp_mask,
+ void *buffer, unsigned long bytes)
+{
+ struct xencomm_desc *desc;
+ unsigned long buffer_ulong = (unsigned long)buffer;
+ unsigned long start = buffer_ulong & PAGE_MASK;
+ unsigned long end = (buffer_ulong + bytes) | ~PAGE_MASK;
+ unsigned long nr_addrs = (end - start + 1) >> PAGE_SHIFT;
+ unsigned long size = sizeof(*desc) +
+ sizeof(desc->address[0]) * nr_addrs;
+
+ /*
+ * slab allocator returns at least sizeof(void*) aligned pointer.
+ * When sizeof(*desc) > sizeof(void*), struct xencomm_desc might
+ * cross page boundary.
+ */
+ if (sizeof(*desc) > sizeof(void *)) {
+ unsigned long order = get_order(size);
+ desc = (struct xencomm_desc *)__get_free_pages(gfp_mask,
+ order);
+ if (desc == NULL)
+ return NULL;
+
+ desc->nr_addrs =
+ ((PAGE_SIZE << order) - sizeof(struct xencomm_desc)) /
+ sizeof(*desc->address);
+ } else {
+ desc = kmalloc(size, gfp_mask);
+ if (desc == NULL)
+ return NULL;
+
+ desc->nr_addrs = nr_addrs;
+ }
+ return desc;
+}
+
+void xencomm_free(struct xencomm_handle *desc)
+{
+ if (desc && !((ulong)desc & XENCOMM_INLINE_FLAG)) {
+ struct xencomm_desc *desc__ = (struct xencomm_desc *)desc;
+ if (sizeof(*desc__) > sizeof(void *)) {
+ unsigned long size = sizeof(*desc__) +
+ sizeof(desc__->address[0]) * desc__->nr_addrs;
+ unsigned long order = get_order(size);
+ free_pages((unsigned long)__va(desc), order);
+ } else
+ kfree(__va(desc));
+ }
+}
+
+static int xencomm_create(void *buffer, unsigned long bytes,
+ struct xencomm_desc **ret, gfp_t gfp_mask)
+{
+ struct xencomm_desc *desc;
+ int rc;
+
+ pr_debug("%s: %p[%ld]\n", __func__, buffer, bytes);
+
+ if (bytes == 0) {
+ /* don't create a descriptor; Xen recognizes NULL. */
+ BUG_ON(buffer != NULL);
+ *ret = NULL;
+ return 0;
+ }
+
+ BUG_ON(buffer == NULL); /* 'bytes' is non-zero */
+
+ desc = xencomm_alloc(gfp_mask, buffer, bytes);
+ if (!desc) {
+ printk(KERN_DEBUG "%s failure\n", "xencomm_alloc");
+ return -ENOMEM;
+ }
+
+ rc = xencomm_init(desc, buffer, bytes);
+ if (rc) {
+ printk(KERN_DEBUG "%s failure: %d\n", "xencomm_init", rc);
+ xencomm_free((struct xencomm_handle *)__pa(desc));
+ return rc;
+ }
+
+ *ret = desc;
+ return 0;
+}
+
+/* check if memory address is within VMALLOC region */
+static int is_phys_contiguous(unsigned long addr)
+{
+ if (!is_kernel_addr(addr))
+ return 0;
+
+ return (addr < VMALLOC_START) || (addr >= VMALLOC_END);
+}
+
+static struct xencomm_handle *xencomm_create_inline(void *ptr)
+{
+ unsigned long paddr;
+
+ BUG_ON(!is_phys_contiguous((unsigned long)ptr));
+
+ paddr = (unsigned long)xencomm_pa(ptr);
+ BUG_ON(paddr & XENCOMM_INLINE_FLAG);
+ return (struct xencomm_handle *)(paddr | XENCOMM_INLINE_FLAG);
+}
+
+/* "mini" routine, for stack-based communications: */
+static int xencomm_create_mini(void *buffer,
+ unsigned long bytes, struct xencomm_mini *xc_desc,
+ struct xencomm_desc **ret)
+{
+ int rc = 0;
+ struct xencomm_desc *desc;
+ BUG_ON(((unsigned long)xc_desc) % sizeof(*xc_desc) != 0);
+
+ desc = (void *)xc_desc;
+
+ desc->nr_addrs = XENCOMM_MINI_ADDRS;
+
+ rc = xencomm_init(desc, buffer, bytes);
+ if (!rc)
+ *ret = desc;
+
+ return rc;
+}
+
+struct xencomm_handle *xencomm_map(void *ptr, unsigned long bytes)
+{
+ int rc;
+ struct xencomm_desc *desc;
+
+ if (is_phys_contiguous((unsigned long)ptr))
+ return xencomm_create_inline(ptr);
+
+ rc = xencomm_create(ptr, bytes, &desc, GFP_KERNEL);
+
+ if (rc || desc == NULL)
+ return NULL;
+
+ return xencomm_pa(desc);
+}
+
+struct xencomm_handle *__xencomm_map_no_alloc(void *ptr, unsigned long bytes,
+ struct xencomm_mini *xc_desc)
+{
+ int rc;
+ struct xencomm_desc *desc = NULL;
+
+ if (is_phys_contiguous((unsigned long)ptr))
+ return xencomm_create_inline(ptr);
+
+ rc = xencomm_create_mini(ptr, bytes, xc_desc,
+ &desc);
+
+ if (rc)
+ return NULL;
+
+ return xencomm_pa(desc);
+}