21 files changed, 306 insertions, 92 deletions
diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index 930126698a0f..310fe508d9cd 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -1930,6 +1930,42 @@ The "pte_enc" field provides a value that can OR'ed into the hash
 PTE's RPN field (ie, it needs to be shifted left by 12 to OR it
 into the hash PTE second double word).
 
+
+4.75 KVM_PPC_ALLOCATE_HTAB
+
+Capability: KVM_CAP_PPC_ALLOC_HTAB
+Architectures: powerpc
+Type: vm ioctl
+Parameters: Pointer to u32 containing hash table order (in/out)
+Returns: 0 on success, -1 on error
+
+This requests the host kernel to allocate an MMU hash table for a
+guest using the PAPR paravirtualization interface.  This only does
+anything if the kernel is configured to use the Book 3S HV style of
+virtualization.  Otherwise the capability doesn't exist and the ioctl
+returns an ENOTTY error.  The rest of this description assumes Book 3S
+HV.
+
+There must be no vcpus running when this ioctl is called; if there
+are, it will do nothing and return an EBUSY error.
+
+The parameter is a pointer to a 32-bit unsigned integer variable
+containing the order (log base 2) of the desired size of the hash
+table, which must be between 18 and 46.  On successful return from the
+ioctl, it will have been updated with the order of the hash table that
+was allocated.
+
+If no hash table has been allocated when any vcpu is asked to run
+(with the KVM_RUN ioctl), the host kernel will allocate a
+default-sized hash table (16 MB).
+
+If this ioctl is called when a hash table has already been allocated,
+the kernel will clear out the existing hash table (zero all HPTEs) and
+return the hash table order in the parameter.  (If the guest is using
+the virtualized real-mode area (VRMA) facility, the kernel will
+re-create the VMRA HPTEs on the next KVM_RUN of any vcpu.)
+
+
 5. The kvm_run structure
 ------------------------
 
diff --git a/Documentation/virtual/kvm/ppc-pv.txt b/Documentation/virtual/kvm/ppc-pv.txt
index 6e7c37050930..4911cf95c67e 100644
--- a/Documentation/virtual/kvm/ppc-pv.txt
+++ b/Documentation/virtual/kvm/ppc-pv.txt
@@ -109,8 +109,6 @@ The following bits are safe to be set inside the guest:
 
   MSR_EE
   MSR_RI
-  MSR_CR
-  MSR_ME
 
 If any other bit changes in the MSR, please still use mtmsr(d).
 
diff --git a/arch/powerpc/include/asm/epapr_hcalls.h b/arch/powerpc/include/asm/epapr_hcalls.h
index 976835d8f22e..bf2c06c33871 100644
--- a/arch/powerpc/include/asm/epapr_hcalls.h
+++ b/arch/powerpc/include/asm/epapr_hcalls.h
@@ -153,6 +153,8 @@
 #define EV_HCALL_CLOBBERS2 EV_HCALL_CLOBBERS3, "r5"
 #define EV_HCALL_CLOBBERS1 EV_HCALL_CLOBBERS2, "r4"
 
+extern bool epapr_paravirt_enabled;
+extern u32 epapr_hypercall_start[];
 
 /*
  * We use "uintptr_t" to define a register because it's guaranteed to be a
diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h
index b0c08b142770..0dd1d86d3e31 100644
--- a/arch/powerpc/include/asm/kvm_book3s_64.h
+++ b/arch/powerpc/include/asm/kvm_book3s_64.h
@@ -36,11 +36,8 @@ static inline void svcpu_put(struct kvmppc_book3s_shadow_vcpu *svcpu)
 #define SPAPR_TCE_SHIFT		12
 
 #ifdef CONFIG_KVM_BOOK3S_64_HV
-/* For now use fixed-size 16MB page table */
-#define HPT_ORDER	24
-#define HPT_NPTEG	(1ul << (HPT_ORDER - 7))	/* 128B per pteg */
-#define HPT_NPTE	(HPT_NPTEG << 3)		/* 8 PTEs per PTEG */
-#define HPT_HASH_MASK	(HPT_NPTEG - 1)
+#define KVM_DEFAULT_HPT_ORDER	24	/* 16MB HPT by default */
+extern int kvm_hpt_order;		/* order of preallocated HPTs */
 #endif
 
 #define VRMA_VSID	0x1ffffffUL	/* 1TB VSID reserved for VRMA */
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index d848cdc49715..50ea12fd7bf5 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -237,6 +237,10 @@ struct kvm_arch {
 	unsigned long vrma_slb_v;
 	int rma_setup_done;
 	int using_mmu_notifiers;
+	u32 hpt_order;
+	atomic_t vcpus_running;
+	unsigned long hpt_npte;
+	unsigned long hpt_mask;
 	spinlock_t slot_phys_lock;
 	unsigned long *slot_phys[KVM_MEM_SLOTS_NUM];
 	int slot_npages[KVM_MEM_SLOTS_NUM];
@@ -414,7 +418,9 @@ struct kvm_vcpu_arch {
 	ulong mcsrr1;
 	ulong mcsr;
 	u32 dec;
+#ifdef CONFIG_BOOKE
 	u32 decar;
+#endif
 	u32 tbl;
 	u32 tbu;
 	u32 tcr;
diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
index f68c22fa2fce..0124937a23b9 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -119,7 +119,8 @@ extern void kvmppc_core_destroy_mmu(struct kvm_vcpu *vcpu);
 extern int kvmppc_kvm_pv(struct kvm_vcpu *vcpu);
 extern void kvmppc_map_magic(struct kvm_vcpu *vcpu);
 
-extern long kvmppc_alloc_hpt(struct kvm *kvm);
+extern long kvmppc_alloc_hpt(struct kvm *kvm, u32 *htab_orderp);
+extern long kvmppc_alloc_reset_hpt(struct kvm *kvm, u32 *htab_orderp);
 extern void kvmppc_free_hpt(struct kvm *kvm);
 extern long kvmppc_prepare_vrma(struct kvm *kvm,
 				struct kvm_userspace_memory_region *mem);
diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile
index 83afacd3ba7b..bb282dd81612 100644
--- a/arch/powerpc/kernel/Makefile
+++ b/arch/powerpc/kernel/Makefile
@@ -128,6 +128,7 @@ ifneq ($(CONFIG_XMON)$(CONFIG_KEXEC),)
 obj-y				+= ppc_save_regs.o
 endif
 
+obj-$(CONFIG_EPAPR_PARAVIRT)	+= epapr_paravirt.o epapr_hcalls.o
 obj-$(CONFIG_KVM_GUEST)		+= kvm.o kvm_emul.o
 
 # Disable GCOV in odd or sensitive code
diff --git a/arch/powerpc/kernel/epapr_hcalls.S b/arch/powerpc/kernel/epapr_hcalls.S
new file mode 100644
index 000000000000..697b390ebfd8
--- /dev/null
+++ b/arch/powerpc/kernel/epapr_hcalls.S
@@ -0,0 +1,25 @@
+/*
+ * Copyright (C) 2012 Freescale Semiconductor, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/threads.h>
+#include <asm/reg.h>
+#include <asm/page.h>
+#include <asm/cputable.h>
+#include <asm/thread_info.h>
+#include <asm/ppc_asm.h>
+#include <asm/asm-offsets.h>
+
+/* Hypercall entry point. Will be patched with device tree instructions. */
+.global epapr_hypercall_start
+epapr_hypercall_start:
+	li	r3, -1
+	nop
+	nop
+	nop
+	blr
diff --git a/arch/powerpc/kernel/epapr_paravirt.c b/arch/powerpc/kernel/epapr_paravirt.c
new file mode 100644
index 000000000000..028aeae370b6
--- /dev/null
+++ b/arch/powerpc/kernel/epapr_paravirt.c
@@ -0,0 +1,52 @@
+/*
+ * ePAPR para-virtualization support.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ *
+ * Copyright (C) 2012 Freescale Semiconductor, Inc.
+ */
+
+#include <linux/of.h>
+#include <asm/epapr_hcalls.h>
+#include <asm/cacheflush.h>
+#include <asm/code-patching.h>
+
+bool epapr_paravirt_enabled;
+
+static int __init epapr_paravirt_init(void)
+{
+	struct device_node *hyper_node;
+	const u32 *insts;
+	int len, i;
+
+	hyper_node = of_find_node_by_path("/hypervisor");
+	if (!hyper_node)
+		return -ENODEV;
+
+	insts = of_get_property(hyper_node, "hcall-instructions", &len);
+	if (!insts)
+		return -ENODEV;
+
+	if (len % 4 || len > (4 * 4))
+		return -ENODEV;
+
+	for (i = 0; i < (len / 4); i++)
+		patch_instruction(epapr_hypercall_start + i, insts[i]);
+
+	epapr_paravirt_enabled = true;
+
+	return 0;
+}
+
+early_initcall(epapr_paravirt_init);
diff --git a/arch/powerpc/kernel/kvm.c b/arch/powerpc/kernel/kvm.c
index 62bdf2389669..1c13307e0308 100644
--- a/arch/powerpc/kernel/kvm.c
+++ b/arch/powerpc/kernel/kvm.c
@@ -31,6 +31,7 @@
 #include <asm/cacheflush.h>
 #include <asm/disassemble.h>
 #include <asm/ppc-opcode.h>
+#include <asm/epapr_hcalls.h>
 
 #define KVM_MAGIC_PAGE		(-4096L)
 #define magic_var(x) KVM_MAGIC_PAGE + offsetof(struct kvm_vcpu_arch_shared, x)
@@ -726,7 +727,7 @@ unsigned long kvm_hypercall(unsigned long *in,
 	unsigned long register r11 asm("r11") = nr;
 	unsigned long register r12 asm("r12");
 
-	asm volatile("bl	kvm_hypercall_start"
+	asm volatile("bl	epapr_hypercall_start"
 		     : "=r"(r0), "=r"(r3), "=r"(r4), "=r"(r5), "=r"(r6),
 		       "=r"(r7), "=r"(r8), "=r"(r9), "=r"(r10), "=r"(r11),
 		       "=r"(r12)
@@ -747,29 +748,6 @@ unsigned long kvm_hypercall(unsigned long *in,
 }
 EXPORT_SYMBOL_GPL(kvm_hypercall);
 
-static int kvm_para_setup(void)
-{
-	extern u32 kvm_hypercall_start;
-	struct device_node *hyper_node;
-	u32 *insts;
-	int len, i;
-
-	hyper_node = of_find_node_by_path("/hypervisor");
-	if (!hyper_node)
-		return -1;
-
-	insts = (u32*)of_get_property(hyper_node, "hcall-instructions", &len);
-	if (len % 4)
-		return -1;
-	if (len > (4 * 4))
-		return -1;
-
-	for (i = 0; i < (len / 4); i++)
-		kvm_patch_ins(&(&kvm_hypercall_start)[i], insts[i]);
-
-	return 0;
-}
-
 static __init void kvm_free_tmp(void)
 {
 	unsigned long start, end;
@@ -791,7 +769,7 @@ static int __init kvm_guest_init(void)
 	if (!kvm_para_available())
 		goto free_tmp;
 
-	if (kvm_para_setup())
+	if (!epapr_paravirt_enabled)
 		goto free_tmp;
 
 	if (kvm_para_has_feature(KVM_FEATURE_MAGIC_PAGE))
diff --git a/arch/powerpc/kernel/kvm_emul.S b/arch/powerpc/kernel/kvm_emul.S
index e291cf3cf954..e100ff324a85 100644
--- a/arch/powerpc/kernel/kvm_emul.S
+++ b/arch/powerpc/kernel/kvm_emul.S
@@ -24,16 +24,6 @@
 #include <asm/page.h>
 #include <asm/asm-offsets.h>
 
-/* Hypercall entry point. Will be patched with device tree instructions. */
-
-.global kvm_hypercall_start
-kvm_hypercall_start:
-	li	r3, -1
-	nop
-	nop
-	nop
-	blr
-
 #define KVM_MAGIC_PAGE		(-4096)
 
 #ifdef CONFIG_64BIT
@@ -132,7 +122,7 @@ kvm_emulate_mtmsrd_len:
 	.long (kvm_emulate_mtmsrd_end - kvm_emulate_mtmsrd) / 4
 
 
-#define MSR_SAFE_BITS (MSR_EE | MSR_CE | MSR_ME | MSR_RI)
+#define MSR_SAFE_BITS (MSR_EE | MSR_RI)
 #define MSR_CRITICAL_BITS ~MSR_SAFE_BITS
 
 .global kvm_emulate_mtmsr
diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c
index 80a577517584..d03eb6f7b058 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
@@ -37,56 +37,121 @@
 /* POWER7 has 10-bit LPIDs, PPC970 has 6-bit LPIDs */
 #define MAX_LPID_970	63
 
-long kvmppc_alloc_hpt(struct kvm *kvm)
+/* Power architecture requires HPT is at least 256kB */
+#define PPC_MIN_HPT_ORDER	18
+
+long kvmppc_alloc_hpt(struct kvm *kvm, u32 *htab_orderp)
 {
 	unsigned long hpt;
-	long lpid;
 	struct revmap_entry *rev;
 	struct kvmppc_linear_info *li;
+	long order = kvm_hpt_order;
 
-	/* Allocate guest's hashed page table */
-	li = kvm_alloc_hpt();
-	if (li) {
-		/* using preallocated memory */
-		hpt = (ulong)li->base_virt;
-		kvm->arch.hpt_li = li;
-	} else {
-		/* using dynamic memory */
+	if (htab_orderp) {
+		order = *htab_orderp;
+		if (order < PPC_MIN_HPT_ORDER)
+			order = PPC_MIN_HPT_ORDER;
+	}
+
+	/*
+	 * If the user wants a different size from default,
+	 * try first to allocate it from the kernel page allocator.
+	 */
+	hpt = 0;
+	if (order != kvm_hpt_order) {
 		hpt = __get_free_pages(GFP_KERNEL|__GFP_ZERO|__GFP_REPEAT|
-				       __GFP_NOWARN, HPT_ORDER - PAGE_SHIFT);
+				       __GFP_NOWARN, order - PAGE_SHIFT);
+		if (!hpt)
+			--order;
 	}
 
+	/* Next try to allocate from the preallocated pool */
 	if (!hpt) {
-		pr_err("kvm_alloc_hpt: Couldn't alloc HPT\n");
-		return -ENOMEM;
+		li = kvm_alloc_hpt();
+		if (li) {
+			hpt = (ulong)li->base_virt;
+			kvm->arch.hpt_li = li;
+			order = kvm_hpt_order;
+		}
 	}
+
+	/* Lastly try successively smaller sizes from the page allocator */
+	while (!hpt && order > PPC_MIN_HPT_ORDER) {
+		hpt = __get_free_pages(GFP_KERNEL|__GFP_ZERO|__GFP_REPEAT|
+				       __GFP_NOWARN, order - PAGE_SHIFT);
+		if (!hpt)
+			--order;
+	}
+
+	if (!hpt)
+		return -ENOMEM;
+
 	kvm->arch.hpt_virt = hpt;
+	kvm->arch.hpt_order = order;
+	/* HPTEs are 2**4 bytes long */
+	kvm->arch.hpt_npte = 1ul << (order - 4);
+	/* 128 (2**7) bytes in each HPTEG */
+	kvm->arch.hpt_mask = (1ul << (order - 7)) - 1;
 
 	/* Allocate reverse map array */
-	rev = vmalloc(sizeof(struct revmap_entry) * HPT_NPTE);
+	rev = vmalloc(sizeof(struct revmap_entry) * kvm->arch.hpt_npte);
 	if (!rev) {
 		pr_err("kvmppc_alloc_hpt: Couldn't alloc reverse map array\n");
 		goto out_freehpt;
 	}
 	kvm->arch.revmap = rev;
+	kvm->arch.sdr1 = __pa(hpt) | (order - 18);
 
-	lpid = kvmppc_alloc_lpid();
-	if (lpid < 0)
-		goto out_freeboth;
+	pr_info("KVM guest htab at %lx (order %ld), LPID %x\n",
+		hpt, order, kvm->arch.lpid);
 
-	kvm->arch.sdr1 = __pa(hpt) | (HPT_ORDER - 18);
-	kvm->arch.lpid = lpid;
-
-	pr_info("KVM guest htab at %lx, LPID %lx\n", hpt, lpid);
+	if (htab_orderp)
+		*htab_orderp = order;
 	return 0;
 
- out_freeboth:
-	vfree(rev);
  out_freehpt:
-	free_pages(hpt, HPT_ORDER - PAGE_SHIFT);
+	if (kvm->arch.hpt_li)
+		kvm_release_hpt(kvm->arch.hpt_li);
+	else
+		free_pages(hpt, order - PAGE_SHIFT);
 	return -ENOMEM;
 }
 
+long kvmppc_alloc_reset_hpt(struct kvm *kvm, u32 *htab_orderp)
+{
+	long err = -EBUSY;
+	long order;
+
+	mutex_lock(&kvm->lock);
+	if (kvm->arch.rma_setup_done) {
+		kvm->arch.rma_setup_done = 0;
+		/* order rma_setup_done vs. vcpus_running */
+		smp_mb();
+		if (atomic_read(&kvm->arch.vcpus_running)) {
+			kvm->arch.rma_setup_done = 1;
+			goto out;
+		}
+	}
+	if (kvm->arch.hpt_virt) {
+		order = kvm->arch.hpt_order;
+		/* Set the entire HPT to 0, i.e. invalid HPTEs */
+		memset((void *)kvm->arch.hpt_virt, 0, 1ul << order);
+		/*
+		 * Set the whole last_vcpu array to an invalid vcpu number.
+		 * This ensures that each vcpu will flush its TLB on next entry.
+		 */
+		memset(kvm->arch.last_vcpu, 0xff, sizeof(kvm->arch.last_vcpu));
+		*htab_orderp = order;
+		err = 0;
+	} else {
+		err = kvmppc_alloc_hpt(kvm, htab_orderp);
+		order = *htab_orderp;
+	}
+ out:
+	mutex_unlock(&kvm->lock);
+	return err;
+}
+
 void kvmppc_free_hpt(struct kvm *kvm)
 {
 	kvmppc_free_lpid(kvm->arch.lpid);
@@ -94,7 +159,8 @@ void kvmppc_free_hpt(struct kvm *kvm)
 	if (kvm->arch.hpt_li)
 		kvm_release_hpt(kvm->arch.hpt_li);
 	else
-		free_pages(kvm->arch.hpt_virt, HPT_ORDER - PAGE_SHIFT);
+		free_pages(kvm->arch.hpt_virt,
+			   kvm->arch.hpt_order - PAGE_SHIFT);
 }
 
 /* Bits in first HPTE dword for pagesize 4k, 64k or 16M */
@@ -119,6 +185,7 @@ void kvmppc_map_vrma(struct kvm_vcpu *vcpu, struct kvm_memory_slot *memslot,
 	unsigned long psize;
 	unsigned long hp0, hp1;
 	long ret;
+	struct kvm *kvm = vcpu->kvm;
 
 	psize = 1ul << porder;
 	npages = memslot->npages >> (porder - PAGE_SHIFT);
@@ -127,8 +194,8 @@ void kvmppc_map_vrma(struct kvm_vcpu *vcpu, struct kvm_memory_slot *memslot,
 	if (npages > 1ul << (40 - porder))
 		npages = 1ul << (40 - porder);
 	/* Can't use more than 1 HPTE per HPTEG */
-	if (npages > HPT_NPTEG)
-		npages = HPT_NPTEG;
+	if (npages > kvm->arch.hpt_mask + 1)
+		npages = kvm->arch.hpt_mask + 1;
 
 	hp0 = HPTE_V_1TB_SEG | (VRMA_VSID << (40 - 16)) |
 		HPTE_V_BOLTED | hpte0_pgsize_encoding(psize);
@@ -138,7 +205,7 @@ void kvmppc_map_vrma(struct kvm_vcpu *vcpu, struct kvm_memory_slot *memslot,
 	for (i = 0; i < npages; ++i) {
 		addr = i << porder;
 		/* can't use hpt_hash since va > 64 bits */
-		hash = (i ^ (VRMA_VSID ^ (VRMA_VSID << 25))) & HPT_HASH_MASK;
+		hash = (i ^ (VRMA_VSID ^ (VRMA_VSID << 25))) & kvm->arch.hpt_mask;
 		/*
 		 * We assume that the hash table is empty and no
 		 * vcpus are using it at this stage.  Since we create
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index c6af1d623839..d084e412b3c5 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -56,7 +56,7 @@
 /* #define EXIT_DEBUG_INT */
 
 static void kvmppc_end_cede(struct kvm_vcpu *vcpu);
-static int kvmppc_hv_setup_rma(struct kvm_vcpu *vcpu);
+static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu);
 
 void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 {
@@ -1068,11 +1068,15 @@ int kvmppc_vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu)
 		return -EINTR;
 	}
 
-	/* On the first time here, set up VRMA or RMA */
+	atomic_inc(&vcpu->kvm->arch.vcpus_running);
+	/* Order vcpus_running vs. rma_setup_done, see kvmppc_alloc_reset_hpt */
+	smp_mb();
+
+	/* On the first time here, set up HTAB and VRMA or RMA */
 	if (!vcpu->kvm->arch.rma_setup_done) {
-		r = kvmppc_hv_setup_rma(vcpu);
+		r = kvmppc_hv_setup_htab_rma(vcpu);
 		if (r)
-			return r;
+			goto out;
 	}
 
 	flush_fp_to_thread(current);
@@ -1090,6 +1094,9 @@ int kvmppc_vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu)
 			kvmppc_core_prepare_to_enter(vcpu);
 		}
 	} while (r == RESUME_GUEST);
+
+ out:
+	atomic_dec(&vcpu->kvm->arch.vcpus_running);
 	return r;
 }
 
@@ -1305,7 +1312,7 @@ void kvmppc_core_commit_memory_region(struct kvm *kvm,
 {
 }
 
-static int kvmppc_hv_setup_rma(struct kvm_vcpu *vcpu)
+static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu)
 {
 	int err = 0;
 	struct kvm *kvm = vcpu->kvm;
@@ -1324,6 +1331,15 @@ static int kvmppc_hv_setup_rma(struct kvm_vcpu *vcpu)
 	if (kvm->arch.rma_setup_done)
 		goto out;	/* another vcpu beat us to it */
 
+	/* Allocate hashed page table (if not done already) and reset it */
+	if (!kvm->arch.hpt_virt) {
+		err = kvmppc_alloc_hpt(kvm, NULL);
+		if (err) {
+			pr_err("KVM: Couldn't alloc HPT\n");
+			goto out;
+		}
+	}
+
 	/* Look up the memslot for guest physical address 0 */
 	memslot = gfn_to_memslot(kvm, 0);
 
@@ -1435,13 +1451,14 @@ static int kvmppc_hv_setup_rma(struct kvm_vcpu *vcpu)
 
 int kvmppc_core_init_vm(struct kvm *kvm)
 {
-	long r;
-	unsigned long lpcr;
+	unsigned long lpcr, lpid;
 
-	/* Allocate hashed page table */
-	r = kvmppc_alloc_hpt(kvm);
-	if (r)
-		return r;
+	/* Allocate the guest's logical partition ID */
+
+	lpid = kvmppc_alloc_lpid();
+	if (lpid < 0)
+		return -ENOMEM;
+	kvm->arch.lpid = lpid;
 
 	INIT_LIST_HEAD(&kvm->arch.spapr_tce_tables);
 
@@ -1451,7 +1468,6 @@ int kvmppc_core_init_vm(struct kvm *kvm)
 
 	if (cpu_has_feature(CPU_FTR_ARCH_201)) {
 		/* PPC970; HID4 is effectively the LPCR */
-		unsigned long lpid = kvm->arch.lpid;
 		kvm->arch.host_lpid = 0;
 		kvm->arch.host_lpcr = lpcr = mfspr(SPRN_HID4);
 		lpcr &= ~((3 << HID4_LPID1_SH) | (0xful << HID4_LPID5_SH));
diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c b/arch/powerpc/kvm/book3s_hv_builtin.c
index e1b60f56f2a1..fb4eac290fef 100644
--- a/arch/powerpc/kvm/book3s_hv_builtin.c
+++ b/arch/powerpc/kvm/book3s_hv_builtin.c
@@ -25,6 +25,9 @@ static void __init kvm_linear_init_one(ulong size, int count, int type);
 static struct kvmppc_linear_info *kvm_alloc_linear(int type);
 static void kvm_release_linear(struct kvmppc_linear_info *ri);
 
+int kvm_hpt_order = KVM_DEFAULT_HPT_ORDER;
+EXPORT_SYMBOL_GPL(kvm_hpt_order);
+
 /*************** RMA *************/
 
 /*
@@ -209,7 +212,7 @@ static void kvm_release_linear(struct kvmppc_linear_info *ri)
 void __init kvm_linear_init(void)
 {
 	/* HPT */
-	kvm_linear_init_one(1 << HPT_ORDER, kvm_hpt_count, KVM_LINEAR_HPT);
+	kvm_linear_init_one(1 << kvm_hpt_order, kvm_hpt_count, KVM_LINEAR_HPT);
 
 	/* RMA */
 	/* Only do this on PPC970 in HV mode */
diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
index cec4daddbf31..5c70d19494f9 100644
--- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
@@ -237,7 +237,7 @@ long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
 
 	/* Find and lock the HPTEG slot to use */
  do_insert:
-	if (pte_index >= HPT_NPTE)
+	if (pte_index >= kvm->arch.hpt_npte)
 		return H_PARAMETER;
 	if (likely((flags & H_EXACT) == 0)) {
 		pte_index &= ~7UL;
@@ -352,7 +352,7 @@ long kvmppc_h_remove(struct kvm_vcpu *vcpu, unsigned long flags,
 	unsigned long v, r, rb;
 	struct revmap_entry *rev;
 
-	if (pte_index >= HPT_NPTE)
+	if (pte_index >= kvm->arch.hpt_npte)
 		return H_PARAMETER;
 	hpte = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4));
 	while (!try_lock_hpte(hpte, HPTE_V_HVLOCK))
@@ -419,7 +419,8 @@ long kvmppc_h_bulk_remove(struct kvm_vcpu *vcpu)
 				i = 4;
 				break;
 			}
-			if (req != 1 || flags == 3 || pte_index >= HPT_NPTE) {
+			if (req != 1 || flags == 3 ||
+			    pte_index >= kvm->arch.hpt_npte) {
 				/* parameter error */
 				args[j] = ((0xa0 | flags) << 56) + pte_index;
 				ret = H_PARAMETER;
@@ -521,7 +522,7 @@ long kvmppc_h_protect(struct kvm_vcpu *vcpu, unsigned long flags,
 	struct revmap_entry *rev;
 	unsigned long v, r, rb, mask, bits;
 
-	if (pte_index >= HPT_NPTE)
+	if (pte_index >= kvm->arch.hpt_npte)
 		return H_PARAMETER;
 
 	hpte = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4));
@@ -583,7 +584,7 @@ long kvmppc_h_read(struct kvm_vcpu *vcpu, unsigned long flags,
 	int i, n = 1;
 	struct revmap_entry *rev = NULL;
 
-	if (pte_index >= HPT_NPTE)
+	if (pte_index >= kvm->arch.hpt_npte)
 		return H_PARAMETER;
 	if (flags & H_READ_4) {
 		pte_index &= ~3;
@@ -678,7 +679,7 @@ long kvmppc_hv_find_lock_hpte(struct kvm *kvm, gva_t eaddr, unsigned long slb_v,
 		somask = (1UL << 28) - 1;
 		vsid = (slb_v & ~SLB_VSID_B) >> SLB_VSID_SHIFT;
 	}
-	hash = (vsid ^ ((eaddr & somask) >> pshift)) & HPT_HASH_MASK;
+	hash = (vsid ^ ((eaddr & somask) >> pshift)) & kvm->arch.hpt_mask;
 	avpn = slb_v & ~(somask >> 16);	/* also includes B */
 	avpn |= (eaddr & somask) >> 16;
 
@@ -723,7 +724,7 @@ long kvmppc_hv_find_lock_hpte(struct kvm *kvm, gva_t eaddr, unsigned long slb_v,
 		if (val & HPTE_V_SECONDARY)
 			break;
 		val |= HPTE_V_SECONDARY;
-		hash = hash ^ HPT_HASH_MASK;
+		hash = hash ^ kvm->arch.hpt_mask;
 	}
 	return -1;
 }
diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c
index 72f13f4a06e0..86681eec60b1 100644
--- a/arch/powerpc/kvm/booke.c
+++ b/arch/powerpc/kvm/booke.c
@@ -1267,6 +1267,11 @@ void kvmppc_decrementer_func(unsigned long data)
 {
 	struct kvm_vcpu *vcpu = (struct kvm_vcpu *)data;
 
+	if (vcpu->arch.tcr & TCR_ARE) {
+		vcpu->arch.dec = vcpu->arch.decar;
+		kvmppc_emulate_dec(vcpu);
+	}
+
 	kvmppc_set_tsr_bits(vcpu, TSR_DIS);
 }
 
diff --git a/arch/powerpc/kvm/booke_emulate.c b/arch/powerpc/kvm/booke_emulate.c
index 6c76397f2af4..9eb9809eb13e 100644
--- a/arch/powerpc/kvm/booke_emulate.c
+++ b/arch/powerpc/kvm/booke_emulate.c
@@ -129,6 +129,9 @@ int kvmppc_booke_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, ulong spr_val)
 		kvmppc_set_tcr(vcpu, spr_val);
 		break;
 
+	case SPRN_DECAR:
+		vcpu->arch.decar = spr_val;
+		break;
 	/*
 	 * Note: SPRG4-7 are user-readable.
 	 * These values are loaded into the real SPRGs when resuming the
diff --git a/arch/powerpc/kvm/e500_emulate.c b/arch/powerpc/kvm/e500_emulate.c
index 8b99e076dc81..e04b0ef55ce0 100644
--- a/arch/powerpc/kvm/e500_emulate.c
+++ b/arch/powerpc/kvm/e500_emulate.c
@@ -269,6 +269,9 @@ int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, ulong *spr_val)
 		*spr_val = vcpu->arch.shared->mas7_3 >> 32;
 		break;
 #endif
+	case SPRN_DECAR:
+		*spr_val = vcpu->arch.decar;
+		break;
 	case SPRN_TLB0CFG:
 		*spr_val = vcpu->arch.tlbcfg[0];
 		break;
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 1493c8de947b..87f4dc886076 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -246,6 +246,7 @@ int kvm_dev_ioctl_check_extension(long ext)
 #endif
 #ifdef CONFIG_PPC_BOOK3S_64
 	case KVM_CAP_SPAPR_TCE:
+	case KVM_CAP_PPC_ALLOC_HTAB:
 		r = 1;
 		break;
 #endif /* CONFIG_PPC_BOOK3S_64 */
@@ -802,6 +803,23 @@ long kvm_arch_vm_ioctl(struct file *filp,
 			r = -EFAULT;
 		break;
 	}
+
+	case KVM_PPC_ALLOCATE_HTAB: {
+		struct kvm *kvm = filp->private_data;
+		u32 htab_order;
+
+		r = -EFAULT;
+		if (get_user(htab_order, (u32 __user *)argp))
+			break;
+		r = kvmppc_alloc_reset_hpt(kvm, &htab_order);
+		if (r)
+			break;
+		r = -EFAULT;
+		if (put_user(htab_order, (u32 __user *)argp))
+			break;
+		r = 0;
+		break;
+	}
 #endif /* CONFIG_KVM_BOOK3S_64_HV */
 
 #ifdef CONFIG_PPC_BOOK3S_64
diff --git a/arch/powerpc/platforms/Kconfig b/arch/powerpc/platforms/Kconfig
index a35ca44ade66..e7a896acd982 100644
--- a/arch/powerpc/platforms/Kconfig
+++ b/arch/powerpc/platforms/Kconfig
@@ -25,6 +25,7 @@ source "arch/powerpc/platforms/wsp/Kconfig"
 config KVM_GUEST
 	bool "KVM Guest support"
 	default n
+	select EPAPR_PARAVIRT
 	---help---
 	  This option enables various optimizations for running under the KVM
 	  hypervisor. Overhead for the kernel when not running inside KVM should
@@ -32,6 +33,14 @@ config KVM_GUEST
 
 	  In case of doubt, say Y
 
+config EPAPR_PARAVIRT
+	bool "ePAPR para-virtualization support"
+	default n
+	help
+	  Enables ePAPR para-virtualization support for guests.
+
+	  In case of doubt, say Y
+
 config PPC_NATIVE
 	bool
 	depends on 6xx || PPC64
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index 09f2b3aa2da7..2ce09aa7d3b3 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -617,6 +617,7 @@ struct kvm_ppc_smmu_info {
 #define KVM_CAP_SIGNAL_MSI 77
 #define KVM_CAP_PPC_GET_SMMU_INFO 78
 #define KVM_CAP_S390_COW 79
+#define KVM_CAP_PPC_ALLOC_HTAB 80
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
@@ -828,6 +829,8 @@ struct kvm_s390_ucas_mapping {
 #define KVM_SIGNAL_MSI            _IOW(KVMIO,  0xa5, struct kvm_msi)
 /* Available with KVM_CAP_PPC_GET_SMMU_INFO */
 #define KVM_PPC_GET_SMMU_INFO	  _IOR(KVMIO,  0xa6, struct kvm_ppc_smmu_info)
+/* Available with KVM_CAP_PPC_ALLOC_HTAB */
+#define KVM_PPC_ALLOCATE_HTAB	  _IOWR(KVMIO, 0xa7, __u32)
 
 /*
  * ioctls for vcpu fds