Diffstat (limited to 'arch/x86/kvm')
-rw-r--r-- | arch/x86/kvm/Kconfig | 106
-rw-r--r-- | arch/x86/kvm/Makefile | 25
-rw-r--r-- | arch/x86/kvm/assigned-dev.c | 1058
-rw-r--r-- | arch/x86/kvm/assigned-dev.h | 32
-rwxr-xr-x[-rw-r--r--] | arch/x86/kvm/cpuid.c | 225
-rwxr-xr-x[-rw-r--r--] | arch/x86/kvm/cpuid.h | 64
-rw-r--r-- | arch/x86/kvm/debugfs.c | 69
-rwxr-xr-x[-rw-r--r--] | arch/x86/kvm/emulate.c | 732
-rw-r--r-- | arch/x86/kvm/hyperv.c | 1266
-rw-r--r-- | arch/x86/kvm/hyperv.h | 90
-rw-r--r-- | arch/x86/kvm/i8254.c | 738
-rw-r--r-- | arch/x86/kvm/i8254.h | 66
-rwxr-xr-x[-rw-r--r--] | arch/x86/kvm/i8259.c | 33
-rwxr-xr-x[-rw-r--r--] | arch/x86/kvm/ioapic.c | 90
-rwxr-xr-x[-rw-r--r--] | arch/x86/kvm/ioapic.h | 34
-rw-r--r-- | arch/x86/kvm/iommu.c | 356
-rwxr-xr-x[-rw-r--r--] | arch/x86/kvm/irq.c | 26
-rwxr-xr-x[-rw-r--r--] | arch/x86/kvm/irq.h | 17
-rwxr-xr-x[-rw-r--r--] | arch/x86/kvm/irq_comm.c | 126
-rwxr-xr-x[-rw-r--r--] | arch/x86/kvm/kvm_cache_regs.h | 51
-rwxr-xr-x[-rw-r--r--] | arch/x86/kvm/lapic.c | 613
-rwxr-xr-x[-rw-r--r--] | arch/x86/kvm/lapic.h | 81
-rwxr-xr-x[-rw-r--r--] | arch/x86/kvm/mmu.c | 868
-rwxr-xr-x[-rw-r--r--] | arch/x86/kvm/mmu.h | 39
-rwxr-xr-x[-rw-r--r--] | arch/x86/kvm/mmu_audit.c | 5
-rw-r--r-- | arch/x86/kvm/mmutrace.h | 333
-rwxr-xr-x[-rw-r--r--] | arch/x86/kvm/mtrr.c | 52
-rwxr-xr-x[-rw-r--r--] | arch/x86/kvm/page_track.c | 33
-rwxr-xr-x[-rw-r--r--] | arch/x86/kvm/paging_tmpl.h | 87
-rwxr-xr-x[-rw-r--r--] | arch/x86/kvm/pmu.c | 26
-rwxr-xr-x[-rw-r--r--] | arch/x86/kvm/pmu.h | 4
-rwxr-xr-x[-rw-r--r--] | arch/x86/kvm/pmu_amd.c | 2
-rwxr-xr-x[-rw-r--r--] | arch/x86/kvm/pmu_intel.c | 7
-rwxr-xr-x[-rw-r--r--] | arch/x86/kvm/svm.c | 1213
-rwxr-xr-x | arch/x86/kvm/svm_def.h | 176
-rw-r--r-- | arch/x86/kvm/trace.h | 1374
-rwxr-xr-x[-rw-r--r--] | arch/x86/kvm/tss.h | 0
-rwxr-xr-x[-rw-r--r--] | arch/x86/kvm/vmx.c | 4438
-rwxr-xr-x | arch/x86/kvm/vmx_def.h | 425
-rwxr-xr-x[-rw-r--r--] | arch/x86/kvm/x86.c | 4183
-rwxr-xr-x[-rw-r--r--] | arch/x86/kvm/x86.h | 37
41 files changed, 4009 insertions, 15191 deletions
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig deleted file mode 100644 index ab8e32f..0000000 --- a/arch/x86/kvm/Kconfig +++ /dev/null @@ -1,106 +0,0 @@ -# -# KVM configuration -# - -source "virt/kvm/Kconfig" - -menuconfig VIRTUALIZATION - bool "Virtualization" - depends on HAVE_KVM || X86 - default y - ---help--- - Say Y here to get to see options for using your Linux host to run other - operating systems inside virtual machines (guests). - This option alone does not add any kernel code. - - If you say N, all options in this submenu will be skipped and disabled. - -if VIRTUALIZATION - -config KVM - tristate "Kernel-based Virtual Machine (KVM) support" - depends on HAVE_KVM - depends on HIGH_RES_TIMERS - # for TASKSTATS/TASK_DELAY_ACCT: - depends on NET - select PREEMPT_NOTIFIERS - select MMU_NOTIFIER - select ANON_INODES - select HAVE_KVM_IRQCHIP - select HAVE_KVM_IRQFD - select IRQ_BYPASS_MANAGER - select HAVE_KVM_IRQ_BYPASS - select HAVE_KVM_IRQ_ROUTING - select HAVE_KVM_EVENTFD - select KVM_ASYNC_PF - select USER_RETURN_NOTIFIER - select KVM_MMIO - select TASKSTATS - select TASK_DELAY_ACCT - select PERF_EVENTS - select HAVE_KVM_MSI - select HAVE_KVM_CPU_RELAX_INTERCEPT - select KVM_GENERIC_DIRTYLOG_READ_PROTECT - select KVM_VFIO - select SRCU - ---help--- - Support hosting fully virtualized guest machines using hardware - virtualization extensions. You will need a fairly recent - processor equipped with virtualization extensions. You will also - need to select one or more of the processor modules below. - - This module provides access to the hardware capabilities through - a character device node named /dev/kvm. - - To compile this as a module, choose M here: the module - will be called kvm. - - If unsure, say N. - -config KVM_INTEL - tristate "KVM for Intel processors support" - depends on KVM - # for perf_guest_get_msrs(): - depends on CPU_SUP_INTEL - ---help--- - Provides support for KVM on Intel processors equipped with the VT - extensions. - - To compile this as a module, choose M here: the module - will be called kvm-intel. - -config KVM_AMD - tristate "KVM for AMD processors support" - depends on KVM - ---help--- - Provides support for KVM on AMD processors equipped with the AMD-V - (SVM) extensions. - - To compile this as a module, choose M here: the module - will be called kvm-amd. - -config KVM_MMU_AUDIT - bool "Audit KVM MMU" - depends on KVM && TRACEPOINTS - ---help--- - This option adds a R/W kVM module parameter 'mmu_audit', which allows - auditing of KVM MMU events at runtime. - -config KVM_DEVICE_ASSIGNMENT - bool "KVM legacy PCI device assignment support (DEPRECATED)" - depends on KVM && PCI && IOMMU_API - default n - ---help--- - Provide support for legacy PCI device assignment through KVM. The - kernel now also supports a full featured userspace device driver - framework through VFIO, which supersedes this support and provides - better security. - - If unsure, say N. - -# OK, it's a little counter-intuitive to do this, but it puts it neatly under -# the virtualization menu. -source drivers/vhost/Kconfig -source drivers/lguest/Kconfig - -endif # VIRTUALIZATION diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile deleted file mode 100644 index 3bff207..0000000 --- a/arch/x86/kvm/Makefile +++ /dev/null @@ -1,25 +0,0 @@ - -ccflags-y += -Iarch/x86/kvm - -CFLAGS_x86.o := -I. -CFLAGS_svm.o := -I. -CFLAGS_vmx.o := -I. 
- -KVM := ../../../virt/kvm - -kvm-y += $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o \ - $(KVM)/eventfd.o $(KVM)/irqchip.o $(KVM)/vfio.o -kvm-$(CONFIG_KVM_ASYNC_PF) += $(KVM)/async_pf.o - -kvm-y += x86.o mmu.o emulate.o i8259.o irq.o lapic.o \ - i8254.o ioapic.o irq_comm.o cpuid.o pmu.o mtrr.o \ - hyperv.o page_track.o debugfs.o - -kvm-$(CONFIG_KVM_DEVICE_ASSIGNMENT) += assigned-dev.o iommu.o - -kvm-intel-y += vmx.o pmu_intel.o -kvm-amd-y += svm.o pmu_amd.o - -obj-$(CONFIG_KVM) += kvm.o -obj-$(CONFIG_KVM_INTEL) += kvm-intel.o -obj-$(CONFIG_KVM_AMD) += kvm-amd.o diff --git a/arch/x86/kvm/assigned-dev.c b/arch/x86/kvm/assigned-dev.c deleted file mode 100644 index 308b859..0000000 --- a/arch/x86/kvm/assigned-dev.c +++ /dev/null @@ -1,1058 +0,0 @@ -/* - * Kernel-based Virtual Machine - device assignment support - * - * Copyright (C) 2010 Red Hat, Inc. and/or its affiliates. - * - * This work is licensed under the terms of the GNU GPL, version 2. See - * the COPYING file in the top-level directory. - * - */ - -#include <linux/kvm_host.h> -#include <linux/kvm.h> -#include <linux/uaccess.h> -#include <linux/vmalloc.h> -#include <linux/errno.h> -#include <linux/spinlock.h> -#include <linux/pci.h> -#include <linux/interrupt.h> -#include <linux/slab.h> -#include <linux/namei.h> -#include <linux/fs.h> -#include "irq.h" -#include "assigned-dev.h" -#include "trace/events/kvm.h" - -struct kvm_assigned_dev_kernel { - struct kvm_irq_ack_notifier ack_notifier; - struct list_head list; - int assigned_dev_id; - int host_segnr; - int host_busnr; - int host_devfn; - unsigned int entries_nr; - int host_irq; - bool host_irq_disabled; - bool pci_2_3; - struct msix_entry *host_msix_entries; - int guest_irq; - struct msix_entry *guest_msix_entries; - unsigned long irq_requested_type; - int irq_source_id; - int flags; - struct pci_dev *dev; - struct kvm *kvm; - spinlock_t intx_lock; - spinlock_t intx_mask_lock; - char irq_name[32]; - struct pci_saved_state *pci_saved_state; -}; - -static struct kvm_assigned_dev_kernel *kvm_find_assigned_dev(struct list_head *head, - int assigned_dev_id) -{ - struct kvm_assigned_dev_kernel *match; - - list_for_each_entry(match, head, list) { - if (match->assigned_dev_id == assigned_dev_id) - return match; - } - return NULL; -} - -static int find_index_from_host_irq(struct kvm_assigned_dev_kernel - *assigned_dev, int irq) -{ - int i, index; - struct msix_entry *host_msix_entries; - - host_msix_entries = assigned_dev->host_msix_entries; - - index = -1; - for (i = 0; i < assigned_dev->entries_nr; i++) - if (irq == host_msix_entries[i].vector) { - index = i; - break; - } - if (index < 0) - printk(KERN_WARNING "Fail to find correlated MSI-X entry!\n"); - - return index; -} - -static irqreturn_t kvm_assigned_dev_intx(int irq, void *dev_id) -{ - struct kvm_assigned_dev_kernel *assigned_dev = dev_id; - int ret; - - spin_lock(&assigned_dev->intx_lock); - if (pci_check_and_mask_intx(assigned_dev->dev)) { - assigned_dev->host_irq_disabled = true; - ret = IRQ_WAKE_THREAD; - } else - ret = IRQ_NONE; - spin_unlock(&assigned_dev->intx_lock); - - return ret; -} - -static void -kvm_assigned_dev_raise_guest_irq(struct kvm_assigned_dev_kernel *assigned_dev, - int vector) -{ - if (unlikely(assigned_dev->irq_requested_type & - KVM_DEV_IRQ_GUEST_INTX)) { - spin_lock(&assigned_dev->intx_mask_lock); - if (!(assigned_dev->flags & KVM_DEV_ASSIGN_MASK_INTX)) - kvm_set_irq(assigned_dev->kvm, - assigned_dev->irq_source_id, vector, 1, - false); - spin_unlock(&assigned_dev->intx_mask_lock); - } else - 
kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id, - vector, 1, false); -} - -static irqreturn_t kvm_assigned_dev_thread_intx(int irq, void *dev_id) -{ - struct kvm_assigned_dev_kernel *assigned_dev = dev_id; - - if (!(assigned_dev->flags & KVM_DEV_ASSIGN_PCI_2_3)) { - spin_lock_irq(&assigned_dev->intx_lock); - disable_irq_nosync(irq); - assigned_dev->host_irq_disabled = true; - spin_unlock_irq(&assigned_dev->intx_lock); - } - - kvm_assigned_dev_raise_guest_irq(assigned_dev, - assigned_dev->guest_irq); - - return IRQ_HANDLED; -} - -/* - * Deliver an IRQ in an atomic context if we can, or return a failure, - * user can retry in a process context. - * Return value: - * -EWOULDBLOCK - Can't deliver in atomic context: retry in a process context. - * Other values - No need to retry. - */ -static int kvm_set_irq_inatomic(struct kvm *kvm, int irq_source_id, u32 irq, - int level) -{ - struct kvm_kernel_irq_routing_entry entries[KVM_NR_IRQCHIPS]; - struct kvm_kernel_irq_routing_entry *e; - int ret = -EINVAL; - int idx; - - trace_kvm_set_irq(irq, level, irq_source_id); - - /* - * Injection into either PIC or IOAPIC might need to scan all CPUs, - * which would need to be retried from thread context; when same GSI - * is connected to both PIC and IOAPIC, we'd have to report a - * partial failure here. - * Since there's no easy way to do this, we only support injecting MSI - * which is limited to 1:1 GSI mapping. - */ - idx = srcu_read_lock(&kvm->irq_srcu); - if (kvm_irq_map_gsi(kvm, entries, irq) > 0) { - e = &entries[0]; - ret = kvm_arch_set_irq_inatomic(e, kvm, irq_source_id, - irq, level); - } - srcu_read_unlock(&kvm->irq_srcu, idx); - return ret; -} - - -static irqreturn_t kvm_assigned_dev_msi(int irq, void *dev_id) -{ - struct kvm_assigned_dev_kernel *assigned_dev = dev_id; - int ret = kvm_set_irq_inatomic(assigned_dev->kvm, - assigned_dev->irq_source_id, - assigned_dev->guest_irq, 1); - return unlikely(ret == -EWOULDBLOCK) ? IRQ_WAKE_THREAD : IRQ_HANDLED; -} - -static irqreturn_t kvm_assigned_dev_thread_msi(int irq, void *dev_id) -{ - struct kvm_assigned_dev_kernel *assigned_dev = dev_id; - - kvm_assigned_dev_raise_guest_irq(assigned_dev, - assigned_dev->guest_irq); - - return IRQ_HANDLED; -} - -static irqreturn_t kvm_assigned_dev_msix(int irq, void *dev_id) -{ - struct kvm_assigned_dev_kernel *assigned_dev = dev_id; - int index = find_index_from_host_irq(assigned_dev, irq); - u32 vector; - int ret = 0; - - if (index >= 0) { - vector = assigned_dev->guest_msix_entries[index].vector; - ret = kvm_set_irq_inatomic(assigned_dev->kvm, - assigned_dev->irq_source_id, - vector, 1); - } - - return unlikely(ret == -EWOULDBLOCK) ? 
IRQ_WAKE_THREAD : IRQ_HANDLED; -} - -static irqreturn_t kvm_assigned_dev_thread_msix(int irq, void *dev_id) -{ - struct kvm_assigned_dev_kernel *assigned_dev = dev_id; - int index = find_index_from_host_irq(assigned_dev, irq); - u32 vector; - - if (index >= 0) { - vector = assigned_dev->guest_msix_entries[index].vector; - kvm_assigned_dev_raise_guest_irq(assigned_dev, vector); - } - - return IRQ_HANDLED; -} - -/* Ack the irq line for an assigned device */ -static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian) -{ - struct kvm_assigned_dev_kernel *dev = - container_of(kian, struct kvm_assigned_dev_kernel, - ack_notifier); - - kvm_set_irq(dev->kvm, dev->irq_source_id, dev->guest_irq, 0, false); - - spin_lock(&dev->intx_mask_lock); - - if (!(dev->flags & KVM_DEV_ASSIGN_MASK_INTX)) { - bool reassert = false; - - spin_lock_irq(&dev->intx_lock); - /* - * The guest IRQ may be shared so this ack can come from an - * IRQ for another guest device. - */ - if (dev->host_irq_disabled) { - if (!(dev->flags & KVM_DEV_ASSIGN_PCI_2_3)) - enable_irq(dev->host_irq); - else if (!pci_check_and_unmask_intx(dev->dev)) - reassert = true; - dev->host_irq_disabled = reassert; - } - spin_unlock_irq(&dev->intx_lock); - - if (reassert) - kvm_set_irq(dev->kvm, dev->irq_source_id, - dev->guest_irq, 1, false); - } - - spin_unlock(&dev->intx_mask_lock); -} - -static void deassign_guest_irq(struct kvm *kvm, - struct kvm_assigned_dev_kernel *assigned_dev) -{ - if (assigned_dev->ack_notifier.gsi != -1) - kvm_unregister_irq_ack_notifier(kvm, - &assigned_dev->ack_notifier); - - kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id, - assigned_dev->guest_irq, 0, false); - - if (assigned_dev->irq_source_id != -1) - kvm_free_irq_source_id(kvm, assigned_dev->irq_source_id); - assigned_dev->irq_source_id = -1; - assigned_dev->irq_requested_type &= ~(KVM_DEV_IRQ_GUEST_MASK); -} - -/* The function implicit hold kvm->lock mutex due to cancel_work_sync() */ -static void deassign_host_irq(struct kvm *kvm, - struct kvm_assigned_dev_kernel *assigned_dev) -{ - /* - * We disable irq here to prevent further events. - * - * Notice this maybe result in nested disable if the interrupt type is - * INTx, but it's OK for we are going to free it. - * - * If this function is a part of VM destroy, please ensure that till - * now, the kvm state is still legal for probably we also have to wait - * on a currently running IRQ handler. 
- */ - if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) { - int i; - for (i = 0; i < assigned_dev->entries_nr; i++) - disable_irq(assigned_dev->host_msix_entries[i].vector); - - for (i = 0; i < assigned_dev->entries_nr; i++) - free_irq(assigned_dev->host_msix_entries[i].vector, - assigned_dev); - - assigned_dev->entries_nr = 0; - kfree(assigned_dev->host_msix_entries); - kfree(assigned_dev->guest_msix_entries); - pci_disable_msix(assigned_dev->dev); - } else { - /* Deal with MSI and INTx */ - if ((assigned_dev->irq_requested_type & - KVM_DEV_IRQ_HOST_INTX) && - (assigned_dev->flags & KVM_DEV_ASSIGN_PCI_2_3)) { - spin_lock_irq(&assigned_dev->intx_lock); - pci_intx(assigned_dev->dev, false); - spin_unlock_irq(&assigned_dev->intx_lock); - synchronize_irq(assigned_dev->host_irq); - } else - disable_irq(assigned_dev->host_irq); - - free_irq(assigned_dev->host_irq, assigned_dev); - - if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSI) - pci_disable_msi(assigned_dev->dev); - } - - assigned_dev->irq_requested_type &= ~(KVM_DEV_IRQ_HOST_MASK); -} - -static int kvm_deassign_irq(struct kvm *kvm, - struct kvm_assigned_dev_kernel *assigned_dev, - unsigned long irq_requested_type) -{ - unsigned long guest_irq_type, host_irq_type; - - if (!irqchip_in_kernel(kvm)) - return -EINVAL; - /* no irq assignment to deassign */ - if (!assigned_dev->irq_requested_type) - return -ENXIO; - - host_irq_type = irq_requested_type & KVM_DEV_IRQ_HOST_MASK; - guest_irq_type = irq_requested_type & KVM_DEV_IRQ_GUEST_MASK; - - if (host_irq_type) - deassign_host_irq(kvm, assigned_dev); - if (guest_irq_type) - deassign_guest_irq(kvm, assigned_dev); - - return 0; -} - -static void kvm_free_assigned_irq(struct kvm *kvm, - struct kvm_assigned_dev_kernel *assigned_dev) -{ - kvm_deassign_irq(kvm, assigned_dev, assigned_dev->irq_requested_type); -} - -static void kvm_free_assigned_device(struct kvm *kvm, - struct kvm_assigned_dev_kernel - *assigned_dev) -{ - kvm_free_assigned_irq(kvm, assigned_dev); - - pci_reset_function(assigned_dev->dev); - if (pci_load_and_free_saved_state(assigned_dev->dev, - &assigned_dev->pci_saved_state)) - printk(KERN_INFO "%s: Couldn't reload %s saved state\n", - __func__, dev_name(&assigned_dev->dev->dev)); - else - pci_restore_state(assigned_dev->dev); - - pci_clear_dev_assigned(assigned_dev->dev); - - pci_release_regions(assigned_dev->dev); - pci_disable_device(assigned_dev->dev); - pci_dev_put(assigned_dev->dev); - - list_del(&assigned_dev->list); - kfree(assigned_dev); -} - -void kvm_free_all_assigned_devices(struct kvm *kvm) -{ - struct kvm_assigned_dev_kernel *assigned_dev, *tmp; - - list_for_each_entry_safe(assigned_dev, tmp, - &kvm->arch.assigned_dev_head, list) { - kvm_free_assigned_device(kvm, assigned_dev); - } -} - -static int assigned_device_enable_host_intx(struct kvm *kvm, - struct kvm_assigned_dev_kernel *dev) -{ - irq_handler_t irq_handler; - unsigned long flags; - - dev->host_irq = dev->dev->irq; - - /* - * We can only share the IRQ line with other host devices if we are - * able to disable the IRQ source at device-level - independently of - * the guest driver. Otherwise host devices may suffer from unbounded - * IRQ latencies when the guest keeps the line asserted. 
- */ - if (dev->flags & KVM_DEV_ASSIGN_PCI_2_3) { - irq_handler = kvm_assigned_dev_intx; - flags = IRQF_SHARED; - } else { - irq_handler = NULL; - flags = IRQF_ONESHOT; - } - if (request_threaded_irq(dev->host_irq, irq_handler, - kvm_assigned_dev_thread_intx, flags, - dev->irq_name, dev)) - return -EIO; - - if (dev->flags & KVM_DEV_ASSIGN_PCI_2_3) { - spin_lock_irq(&dev->intx_lock); - pci_intx(dev->dev, true); - spin_unlock_irq(&dev->intx_lock); - } - return 0; -} - -static int assigned_device_enable_host_msi(struct kvm *kvm, - struct kvm_assigned_dev_kernel *dev) -{ - int r; - - if (!dev->dev->msi_enabled) { - r = pci_enable_msi(dev->dev); - if (r) - return r; - } - - dev->host_irq = dev->dev->irq; - if (request_threaded_irq(dev->host_irq, kvm_assigned_dev_msi, - kvm_assigned_dev_thread_msi, 0, - dev->irq_name, dev)) { - pci_disable_msi(dev->dev); - return -EIO; - } - - return 0; -} - -static int assigned_device_enable_host_msix(struct kvm *kvm, - struct kvm_assigned_dev_kernel *dev) -{ - int i, r = -EINVAL; - - /* host_msix_entries and guest_msix_entries should have been - * initialized */ - if (dev->entries_nr == 0) - return r; - - r = pci_enable_msix_exact(dev->dev, - dev->host_msix_entries, dev->entries_nr); - if (r) - return r; - - for (i = 0; i < dev->entries_nr; i++) { - r = request_threaded_irq(dev->host_msix_entries[i].vector, - kvm_assigned_dev_msix, - kvm_assigned_dev_thread_msix, - 0, dev->irq_name, dev); - if (r) - goto err; - } - - return 0; -err: - for (i -= 1; i >= 0; i--) - free_irq(dev->host_msix_entries[i].vector, dev); - pci_disable_msix(dev->dev); - return r; -} - -static int assigned_device_enable_guest_intx(struct kvm *kvm, - struct kvm_assigned_dev_kernel *dev, - struct kvm_assigned_irq *irq) -{ - dev->guest_irq = irq->guest_irq; - dev->ack_notifier.gsi = irq->guest_irq; - return 0; -} - -static int assigned_device_enable_guest_msi(struct kvm *kvm, - struct kvm_assigned_dev_kernel *dev, - struct kvm_assigned_irq *irq) -{ - dev->guest_irq = irq->guest_irq; - dev->ack_notifier.gsi = -1; - return 0; -} - -static int assigned_device_enable_guest_msix(struct kvm *kvm, - struct kvm_assigned_dev_kernel *dev, - struct kvm_assigned_irq *irq) -{ - dev->guest_irq = irq->guest_irq; - dev->ack_notifier.gsi = -1; - return 0; -} - -static int assign_host_irq(struct kvm *kvm, - struct kvm_assigned_dev_kernel *dev, - __u32 host_irq_type) -{ - int r = -EEXIST; - - if (dev->irq_requested_type & KVM_DEV_IRQ_HOST_MASK) - return r; - - snprintf(dev->irq_name, sizeof(dev->irq_name), "kvm:%s", - pci_name(dev->dev)); - - switch (host_irq_type) { - case KVM_DEV_IRQ_HOST_INTX: - r = assigned_device_enable_host_intx(kvm, dev); - break; - case KVM_DEV_IRQ_HOST_MSI: - r = assigned_device_enable_host_msi(kvm, dev); - break; - case KVM_DEV_IRQ_HOST_MSIX: - r = assigned_device_enable_host_msix(kvm, dev); - break; - default: - r = -EINVAL; - } - dev->host_irq_disabled = false; - - if (!r) - dev->irq_requested_type |= host_irq_type; - - return r; -} - -static int assign_guest_irq(struct kvm *kvm, - struct kvm_assigned_dev_kernel *dev, - struct kvm_assigned_irq *irq, - unsigned long guest_irq_type) -{ - int id; - int r = -EEXIST; - - if (dev->irq_requested_type & KVM_DEV_IRQ_GUEST_MASK) - return r; - - id = kvm_request_irq_source_id(kvm); - if (id < 0) - return id; - - dev->irq_source_id = id; - - switch (guest_irq_type) { - case KVM_DEV_IRQ_GUEST_INTX: - r = assigned_device_enable_guest_intx(kvm, dev, irq); - break; - case KVM_DEV_IRQ_GUEST_MSI: - r = assigned_device_enable_guest_msi(kvm, dev, irq); 
- break; - case KVM_DEV_IRQ_GUEST_MSIX: - r = assigned_device_enable_guest_msix(kvm, dev, irq); - break; - default: - r = -EINVAL; - } - - if (!r) { - dev->irq_requested_type |= guest_irq_type; - if (dev->ack_notifier.gsi != -1) - kvm_register_irq_ack_notifier(kvm, &dev->ack_notifier); - } else { - kvm_free_irq_source_id(kvm, dev->irq_source_id); - dev->irq_source_id = -1; - } - - return r; -} - -/* TODO Deal with KVM_DEV_IRQ_ASSIGNED_MASK_MSIX */ -static int kvm_vm_ioctl_assign_irq(struct kvm *kvm, - struct kvm_assigned_irq *assigned_irq) -{ - int r = -EINVAL; - struct kvm_assigned_dev_kernel *match; - unsigned long host_irq_type, guest_irq_type; - - if (!irqchip_in_kernel(kvm)) - return r; - - mutex_lock(&kvm->lock); - r = -ENODEV; - match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, - assigned_irq->assigned_dev_id); - if (!match) - goto out; - - host_irq_type = (assigned_irq->flags & KVM_DEV_IRQ_HOST_MASK); - guest_irq_type = (assigned_irq->flags & KVM_DEV_IRQ_GUEST_MASK); - - r = -EINVAL; - /* can only assign one type at a time */ - if (hweight_long(host_irq_type) > 1) - goto out; - if (hweight_long(guest_irq_type) > 1) - goto out; - if (host_irq_type == 0 && guest_irq_type == 0) - goto out; - - r = 0; - if (host_irq_type) - r = assign_host_irq(kvm, match, host_irq_type); - if (r) - goto out; - - if (guest_irq_type) - r = assign_guest_irq(kvm, match, assigned_irq, guest_irq_type); -out: - mutex_unlock(&kvm->lock); - return r; -} - -static int kvm_vm_ioctl_deassign_dev_irq(struct kvm *kvm, - struct kvm_assigned_irq - *assigned_irq) -{ - int r = -ENODEV; - struct kvm_assigned_dev_kernel *match; - unsigned long irq_type; - - mutex_lock(&kvm->lock); - - match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, - assigned_irq->assigned_dev_id); - if (!match) - goto out; - - irq_type = assigned_irq->flags & (KVM_DEV_IRQ_HOST_MASK | - KVM_DEV_IRQ_GUEST_MASK); - r = kvm_deassign_irq(kvm, match, irq_type); -out: - mutex_unlock(&kvm->lock); - return r; -} - -/* - * We want to test whether the caller has been granted permissions to - * use this device. To be able to configure and control the device, - * the user needs access to PCI configuration space and BAR resources. - * These are accessed through PCI sysfs. PCI config space is often - * passed to the process calling this ioctl via file descriptor, so we - * can't rely on access to that file. We can check for permissions - * on each of the BAR resource files, which is a pretty clear - * indicator that the user has been granted access to the device. 
- */ -static int probe_sysfs_permissions(struct pci_dev *dev) -{ -#ifdef CONFIG_SYSFS - int i; - bool bar_found = false; - - for (i = PCI_STD_RESOURCES; i <= PCI_STD_RESOURCE_END; i++) { - char *kpath, *syspath; - struct path path; - struct inode *inode; - int r; - - if (!pci_resource_len(dev, i)) - continue; - - kpath = kobject_get_path(&dev->dev.kobj, GFP_KERNEL); - if (!kpath) - return -ENOMEM; - - /* Per sysfs-rules, sysfs is always at /sys */ - syspath = kasprintf(GFP_KERNEL, "/sys%s/resource%d", kpath, i); - kfree(kpath); - if (!syspath) - return -ENOMEM; - - r = kern_path(syspath, LOOKUP_FOLLOW, &path); - kfree(syspath); - if (r) - return r; - - inode = d_backing_inode(path.dentry); - - r = inode_permission(inode, MAY_READ | MAY_WRITE | MAY_ACCESS); - path_put(&path); - if (r) - return r; - - bar_found = true; - } - - /* If no resources, probably something special */ - if (!bar_found) - return -EPERM; - - return 0; -#else - return -EINVAL; /* No way to control the device without sysfs */ -#endif -} - -static int kvm_vm_ioctl_assign_device(struct kvm *kvm, - struct kvm_assigned_pci_dev *assigned_dev) -{ - int r = 0, idx; - struct kvm_assigned_dev_kernel *match; - struct pci_dev *dev; - - if (!(assigned_dev->flags & KVM_DEV_ASSIGN_ENABLE_IOMMU)) - return -EINVAL; - - mutex_lock(&kvm->lock); - idx = srcu_read_lock(&kvm->srcu); - - match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, - assigned_dev->assigned_dev_id); - if (match) { - /* device already assigned */ - r = -EEXIST; - goto out; - } - - match = kzalloc(sizeof(struct kvm_assigned_dev_kernel), GFP_KERNEL); - if (match == NULL) { - printk(KERN_INFO "%s: Couldn't allocate memory\n", - __func__); - r = -ENOMEM; - goto out; - } - dev = pci_get_domain_bus_and_slot(assigned_dev->segnr, - assigned_dev->busnr, - assigned_dev->devfn); - if (!dev) { - printk(KERN_INFO "%s: host device not found\n", __func__); - r = -EINVAL; - goto out_free; - } - - /* Don't allow bridges to be assigned */ - if (dev->hdr_type != PCI_HEADER_TYPE_NORMAL) { - r = -EPERM; - goto out_put; - } - - r = probe_sysfs_permissions(dev); - if (r) - goto out_put; - - if (pci_enable_device(dev)) { - printk(KERN_INFO "%s: Could not enable PCI device\n", __func__); - r = -EBUSY; - goto out_put; - } - r = pci_request_regions(dev, "kvm_assigned_device"); - if (r) { - printk(KERN_INFO "%s: Could not get access to device regions\n", - __func__); - goto out_disable; - } - - pci_reset_function(dev); - pci_save_state(dev); - match->pci_saved_state = pci_store_saved_state(dev); - if (!match->pci_saved_state) - printk(KERN_DEBUG "%s: Couldn't store %s saved state\n", - __func__, dev_name(&dev->dev)); - - if (!pci_intx_mask_supported(dev)) - assigned_dev->flags &= ~KVM_DEV_ASSIGN_PCI_2_3; - - match->assigned_dev_id = assigned_dev->assigned_dev_id; - match->host_segnr = assigned_dev->segnr; - match->host_busnr = assigned_dev->busnr; - match->host_devfn = assigned_dev->devfn; - match->flags = assigned_dev->flags; - match->dev = dev; - spin_lock_init(&match->intx_lock); - spin_lock_init(&match->intx_mask_lock); - match->irq_source_id = -1; - match->kvm = kvm; - match->ack_notifier.irq_acked = kvm_assigned_dev_ack_irq; - - list_add(&match->list, &kvm->arch.assigned_dev_head); - - if (!kvm->arch.iommu_domain) { - r = kvm_iommu_map_guest(kvm); - if (r) - goto out_list_del; - } - r = kvm_assign_device(kvm, match->dev); - if (r) - goto out_list_del; - -out: - srcu_read_unlock(&kvm->srcu, idx); - mutex_unlock(&kvm->lock); - return r; -out_list_del: - if 
(pci_load_and_free_saved_state(dev, &match->pci_saved_state)) - printk(KERN_INFO "%s: Couldn't reload %s saved state\n", - __func__, dev_name(&dev->dev)); - list_del(&match->list); - pci_release_regions(dev); -out_disable: - pci_disable_device(dev); -out_put: - pci_dev_put(dev); -out_free: - kfree(match); - srcu_read_unlock(&kvm->srcu, idx); - mutex_unlock(&kvm->lock); - return r; -} - -static int kvm_vm_ioctl_deassign_device(struct kvm *kvm, - struct kvm_assigned_pci_dev *assigned_dev) -{ - int r = 0; - struct kvm_assigned_dev_kernel *match; - - mutex_lock(&kvm->lock); - - match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, - assigned_dev->assigned_dev_id); - if (!match) { - printk(KERN_INFO "%s: device hasn't been assigned before, " - "so cannot be deassigned\n", __func__); - r = -EINVAL; - goto out; - } - - kvm_deassign_device(kvm, match->dev); - - kvm_free_assigned_device(kvm, match); - -out: - mutex_unlock(&kvm->lock); - return r; -} - - -static int kvm_vm_ioctl_set_msix_nr(struct kvm *kvm, - struct kvm_assigned_msix_nr *entry_nr) -{ - int r = 0; - struct kvm_assigned_dev_kernel *adev; - - mutex_lock(&kvm->lock); - - adev = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, - entry_nr->assigned_dev_id); - if (!adev) { - r = -EINVAL; - goto msix_nr_out; - } - - if (adev->entries_nr == 0) { - adev->entries_nr = entry_nr->entry_nr; - if (adev->entries_nr == 0 || - adev->entries_nr > KVM_MAX_MSIX_PER_DEV) { - r = -EINVAL; - goto msix_nr_out; - } - - adev->host_msix_entries = kzalloc(sizeof(struct msix_entry) * - entry_nr->entry_nr, - GFP_KERNEL); - if (!adev->host_msix_entries) { - r = -ENOMEM; - goto msix_nr_out; - } - adev->guest_msix_entries = - kzalloc(sizeof(struct msix_entry) * entry_nr->entry_nr, - GFP_KERNEL); - if (!adev->guest_msix_entries) { - kfree(adev->host_msix_entries); - r = -ENOMEM; - goto msix_nr_out; - } - } else /* Not allowed set MSI-X number twice */ - r = -EINVAL; -msix_nr_out: - mutex_unlock(&kvm->lock); - return r; -} - -static int kvm_vm_ioctl_set_msix_entry(struct kvm *kvm, - struct kvm_assigned_msix_entry *entry) -{ - int r = 0, i; - struct kvm_assigned_dev_kernel *adev; - - mutex_lock(&kvm->lock); - - adev = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, - entry->assigned_dev_id); - - if (!adev) { - r = -EINVAL; - goto msix_entry_out; - } - - for (i = 0; i < adev->entries_nr; i++) - if (adev->guest_msix_entries[i].vector == 0 || - adev->guest_msix_entries[i].entry == entry->entry) { - adev->guest_msix_entries[i].entry = entry->entry; - adev->guest_msix_entries[i].vector = entry->gsi; - adev->host_msix_entries[i].entry = entry->entry; - break; - } - if (i == adev->entries_nr) { - r = -ENOSPC; - goto msix_entry_out; - } - -msix_entry_out: - mutex_unlock(&kvm->lock); - - return r; -} - -static int kvm_vm_ioctl_set_pci_irq_mask(struct kvm *kvm, - struct kvm_assigned_pci_dev *assigned_dev) -{ - int r = 0; - struct kvm_assigned_dev_kernel *match; - - mutex_lock(&kvm->lock); - - match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, - assigned_dev->assigned_dev_id); - if (!match) { - r = -ENODEV; - goto out; - } - - spin_lock(&match->intx_mask_lock); - - match->flags &= ~KVM_DEV_ASSIGN_MASK_INTX; - match->flags |= assigned_dev->flags & KVM_DEV_ASSIGN_MASK_INTX; - - if (match->irq_requested_type & KVM_DEV_IRQ_GUEST_INTX) { - if (assigned_dev->flags & KVM_DEV_ASSIGN_MASK_INTX) { - kvm_set_irq(match->kvm, match->irq_source_id, - match->guest_irq, 0, false); - /* - * Masking at hardware-level is performed on demand, - * i.e. 
when an IRQ actually arrives at the host. - */ - } else if (!(assigned_dev->flags & KVM_DEV_ASSIGN_PCI_2_3)) { - /* - * Unmask the IRQ line if required. Unmasking at - * device level will be performed by user space. - */ - spin_lock_irq(&match->intx_lock); - if (match->host_irq_disabled) { - enable_irq(match->host_irq); - match->host_irq_disabled = false; - } - spin_unlock_irq(&match->intx_lock); - } - } - - spin_unlock(&match->intx_mask_lock); - -out: - mutex_unlock(&kvm->lock); - return r; -} - -long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl, - unsigned long arg) -{ - void __user *argp = (void __user *)arg; - int r; - - switch (ioctl) { - case KVM_ASSIGN_PCI_DEVICE: { - struct kvm_assigned_pci_dev assigned_dev; - - r = -EFAULT; - if (copy_from_user(&assigned_dev, argp, sizeof assigned_dev)) - goto out; - r = kvm_vm_ioctl_assign_device(kvm, &assigned_dev); - if (r) - goto out; - break; - } - case KVM_ASSIGN_IRQ: { - r = -EOPNOTSUPP; - break; - } - case KVM_ASSIGN_DEV_IRQ: { - struct kvm_assigned_irq assigned_irq; - - r = -EFAULT; - if (copy_from_user(&assigned_irq, argp, sizeof assigned_irq)) - goto out; - r = kvm_vm_ioctl_assign_irq(kvm, &assigned_irq); - if (r) - goto out; - break; - } - case KVM_DEASSIGN_DEV_IRQ: { - struct kvm_assigned_irq assigned_irq; - - r = -EFAULT; - if (copy_from_user(&assigned_irq, argp, sizeof assigned_irq)) - goto out; - r = kvm_vm_ioctl_deassign_dev_irq(kvm, &assigned_irq); - if (r) - goto out; - break; - } - case KVM_DEASSIGN_PCI_DEVICE: { - struct kvm_assigned_pci_dev assigned_dev; - - r = -EFAULT; - if (copy_from_user(&assigned_dev, argp, sizeof assigned_dev)) - goto out; - r = kvm_vm_ioctl_deassign_device(kvm, &assigned_dev); - if (r) - goto out; - break; - } - case KVM_ASSIGN_SET_MSIX_NR: { - struct kvm_assigned_msix_nr entry_nr; - r = -EFAULT; - if (copy_from_user(&entry_nr, argp, sizeof entry_nr)) - goto out; - r = kvm_vm_ioctl_set_msix_nr(kvm, &entry_nr); - if (r) - goto out; - break; - } - case KVM_ASSIGN_SET_MSIX_ENTRY: { - struct kvm_assigned_msix_entry entry; - r = -EFAULT; - if (copy_from_user(&entry, argp, sizeof entry)) - goto out; - r = kvm_vm_ioctl_set_msix_entry(kvm, &entry); - if (r) - goto out; - break; - } - case KVM_ASSIGN_SET_INTX_MASK: { - struct kvm_assigned_pci_dev assigned_dev; - - r = -EFAULT; - if (copy_from_user(&assigned_dev, argp, sizeof assigned_dev)) - goto out; - r = kvm_vm_ioctl_set_pci_irq_mask(kvm, &assigned_dev); - break; - } - default: - r = -ENOTTY; - break; - } -out: - return r; -} diff --git a/arch/x86/kvm/assigned-dev.h b/arch/x86/kvm/assigned-dev.h deleted file mode 100644 index a428c1a..0000000 --- a/arch/x86/kvm/assigned-dev.h +++ /dev/null @@ -1,32 +0,0 @@ -#ifndef ARCH_X86_KVM_ASSIGNED_DEV_H -#define ARCH_X86_KVM_ASSIGNED_DEV_H - -#include <linux/kvm_host.h> - -#ifdef CONFIG_KVM_DEVICE_ASSIGNMENT -int kvm_assign_device(struct kvm *kvm, struct pci_dev *pdev); -int kvm_deassign_device(struct kvm *kvm, struct pci_dev *pdev); - -int kvm_iommu_map_guest(struct kvm *kvm); -int kvm_iommu_unmap_guest(struct kvm *kvm); - -long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl, - unsigned long arg); - -void kvm_free_all_assigned_devices(struct kvm *kvm); -#else -static inline int kvm_iommu_unmap_guest(struct kvm *kvm) -{ - return 0; -} - -static inline long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl, - unsigned long arg) -{ - return -ENOTTY; -} - -static inline void kvm_free_all_assigned_devices(struct kvm *kvm) {} -#endif /* CONFIG_KVM_DEVICE_ASSIGNMENT */ - -#endif /* 
ARCH_X86_KVM_ASSIGNED_DEV_H */ diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index afa7bbb..ad85822 100644..100755 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -6,6 +6,7 @@ * * Copyright 2011 Red Hat, Inc. and/or its affiliates. * Copyright IBM Corporation, 2008 + * Copyright 2019 Google LLC * * This work is licensed under the terms of the GNU GPL, version 2. See * the COPYING file in the top-level directory. @@ -13,16 +14,9 @@ */ #include <linux/kvm_host.h> -#include <linux/export.h> -#include <linux/vmalloc.h> -#include <linux/uaccess.h> -#include <asm/fpu/internal.h> /* For use_eager_fpu. Ugh! */ -#include <asm/user.h> -#include <asm/fpu/xstate.h> #include "cpuid.h" #include "lapic.h" #include "mmu.h" -#include "trace.h" #include "pmu.h" static u32 xstate_required_size(u64 xstate_bv, bool compacted) @@ -33,7 +27,7 @@ static u32 xstate_required_size(u64 xstate_bv, bool compacted) xstate_bv &= XFEATURE_MASK_EXTEND; while (xstate_bv) { if (xstate_bv & 0x1) { - u32 eax, ebx, ecx, edx, offset; + u32 eax = 0, ebx = 0, ecx, edx, offset; cpuid_count(0xD, feature_bit, &eax, &ebx, &ecx, &edx); offset = compacted ? ret : ebx; ret = max(ret, offset + eax); @@ -51,11 +45,10 @@ bool kvm_mpx_supported(void) return ((host_xcr0 & (XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR)) && kvm_x86_ops->mpx_supported()); } -EXPORT_SYMBOL_GPL(kvm_mpx_supported); u64 kvm_supported_xcr0(void) { - u64 xcr0 = KVM_SUPPORTED_XCR0 & host_xcr0; + u64 xcr0 = GVM_SUPPORTED_XCR0 & host_xcr0; if (!kvm_mpx_supported()) xcr0 &= ~(XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR); @@ -67,7 +60,7 @@ u64 kvm_supported_xcr0(void) int kvm_update_cpuid(struct kvm_vcpu *vcpu) { - struct kvm_cpuid_entry2 *best; + struct kvm_cpuid_entry *best; struct kvm_lapic *apic = vcpu->arch.apic; best = kvm_find_cpuid_entry(vcpu, 1, 0); @@ -114,9 +107,6 @@ int kvm_update_cpuid(struct kvm_vcpu *vcpu) if (best && (best->eax & (F(XSAVES) | F(XSAVEC)))) best->ebx = xstate_required_size(vcpu->arch.xcr0, true); - if (use_eager_fpu()) - kvm_x86_ops->fpu_activate(vcpu); - /* * The existing code assumes virtual address is 48-bit in the canonical * address checks; exit if it is ever changed. 
@@ -144,7 +134,7 @@ static int is_efer_nx(void) static void cpuid_fix_nx_cap(struct kvm_vcpu *vcpu) { int i; - struct kvm_cpuid_entry2 *e, *entry; + struct kvm_cpuid_entry *e, *entry; entry = NULL; for (i = 0; i < vcpu->arch.cpuid_nent; ++i) { @@ -162,7 +152,7 @@ static void cpuid_fix_nx_cap(struct kvm_vcpu *vcpu) int cpuid_query_maxphyaddr(struct kvm_vcpu *vcpu) { - struct kvm_cpuid_entry2 *best; + struct kvm_cpuid_entry *best; best = kvm_find_cpuid_entry(vcpu, 0x80000000, 0); if (!best || best->eax < 0x80000008) @@ -173,65 +163,19 @@ int cpuid_query_maxphyaddr(struct kvm_vcpu *vcpu) not_found: return 36; } -EXPORT_SYMBOL_GPL(cpuid_query_maxphyaddr); -/* when an old userspace process fills a new kernel module */ int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu, - struct kvm_cpuid *cpuid, - struct kvm_cpuid_entry __user *entries) -{ - int r, i; - struct kvm_cpuid_entry *cpuid_entries = NULL; - - r = -E2BIG; - if (cpuid->nent > KVM_MAX_CPUID_ENTRIES) - goto out; - r = -ENOMEM; - if (cpuid->nent) { - cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry) * - cpuid->nent); - if (!cpuid_entries) - goto out; - r = -EFAULT; - if (copy_from_user(cpuid_entries, entries, - cpuid->nent * sizeof(struct kvm_cpuid_entry))) - goto out; - } - for (i = 0; i < cpuid->nent; i++) { - vcpu->arch.cpuid_entries[i].function = cpuid_entries[i].function; - vcpu->arch.cpuid_entries[i].eax = cpuid_entries[i].eax; - vcpu->arch.cpuid_entries[i].ebx = cpuid_entries[i].ebx; - vcpu->arch.cpuid_entries[i].ecx = cpuid_entries[i].ecx; - vcpu->arch.cpuid_entries[i].edx = cpuid_entries[i].edx; - vcpu->arch.cpuid_entries[i].index = 0; - vcpu->arch.cpuid_entries[i].flags = 0; - vcpu->arch.cpuid_entries[i].padding[0] = 0; - vcpu->arch.cpuid_entries[i].padding[1] = 0; - vcpu->arch.cpuid_entries[i].padding[2] = 0; - } - vcpu->arch.cpuid_nent = cpuid->nent; - cpuid_fix_nx_cap(vcpu); - kvm_apic_set_version(vcpu); - kvm_x86_ops->cpuid_update(vcpu); - r = kvm_update_cpuid(vcpu); - -out: - vfree(cpuid_entries); - return r; -} - -int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu, - struct kvm_cpuid2 *cpuid, - struct kvm_cpuid_entry2 __user *entries) + struct kvm_cpuid *cpuid, + struct kvm_cpuid_entry __user *entries) { int r; r = -E2BIG; - if (cpuid->nent > KVM_MAX_CPUID_ENTRIES) + if (cpuid->nent > GVM_MAX_CPUID_ENTRIES) goto out; r = -EFAULT; if (copy_from_user(&vcpu->arch.cpuid_entries, entries, - cpuid->nent * sizeof(struct kvm_cpuid_entry2))) + cpuid->nent * sizeof(struct kvm_cpuid_entry))) goto out; vcpu->arch.cpuid_nent = cpuid->nent; kvm_apic_set_version(vcpu); @@ -241,19 +185,16 @@ out: return r; } -int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu, - struct kvm_cpuid2 *cpuid, - struct kvm_cpuid_entry2 __user *entries) +int kvm_vcpu_ioctl_get_cpuid(struct kvm_vcpu *vcpu, + struct kvm_cpuid *cpuid, + struct kvm_cpuid_entry __user *entries) { int r; r = -E2BIG; if (cpuid->nent < vcpu->arch.cpuid_nent) goto out; - r = -EFAULT; - if (copy_to_user(entries, &vcpu->arch.cpuid_entries, - vcpu->arch.cpuid_nent * sizeof(struct kvm_cpuid_entry2))) - goto out; + return 0; out: @@ -263,10 +204,12 @@ out: static void cpuid_mask(u32 *word, int wordnum) { +#if 0 *word &= boot_cpu_data.x86_capability[wordnum]; +#endif } -static void do_cpuid_1_ent(struct kvm_cpuid_entry2 *entry, u32 function, +static void do_cpuid_1_ent(struct kvm_cpuid_entry *entry, u32 function, u32 index) { entry->function = function; @@ -276,7 +219,7 @@ static void do_cpuid_1_ent(struct kvm_cpuid_entry2 *entry, u32 function, entry->flags = 0; } -static int 
__do_cpuid_ent_emulated(struct kvm_cpuid_entry2 *entry, +static int __do_cpuid_ent_emulated(struct kvm_cpuid_entry *entry, u32 func, u32 index, int *nent, int maxnent) { switch (func) { @@ -298,7 +241,7 @@ static int __do_cpuid_ent_emulated(struct kvm_cpuid_entry2 *entry, return 0; } -static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, +static inline int __do_cpuid_ent(struct kvm_cpuid_entry *entry, u32 function, u32 index, int *nent, int maxnent) { int r; @@ -346,7 +289,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, F(FMA) | F(CX16) | 0 /* xTPR Update, PDCM */ | F(PCID) | 0 /* Reserved, DCA */ | F(XMM4_1) | F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) | - 0 /* Reserved*/ | F(AES) | F(XSAVE) | 0 /* OSXSAVE */ | F(AVX) | + 0 /* Reserved*/ | F(AES) | 0 /*F(XSAVE)*/ | 0 /* OSXSAVE */ | 0 /*F(AVX)*/ | F(F16C) | F(RDRAND); /* cpuid 0x80000001.ecx */ const u32 kvm_cpuid_8000_0001_ecx_x86_features = @@ -403,18 +346,18 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, /* function 2 entries are STATEFUL. That is, repeated cpuid commands * may return different values. This forces us to get_cpu() before * issuing the first command, and also to emulate this annoying behavior - * in kvm_emulate_cpuid() using KVM_CPUID_FLAG_STATE_READ_NEXT */ + * in kvm_emulate_cpuid() using GVM_CPUID_FLAG_STATE_READ_NEXT */ case 2: { int t, times = entry->eax & 0xff; - entry->flags |= KVM_CPUID_FLAG_STATEFUL_FUNC; - entry->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT; + entry->flags |= GVM_CPUID_FLAG_STATEFUL_FUNC; + entry->flags |= GVM_CPUID_FLAG_STATE_READ_NEXT; for (t = 1; t < times; ++t) { if (*nent >= maxnent) goto out; do_cpuid_1_ent(&entry[t], function, 0); - entry[t].flags |= KVM_CPUID_FLAG_STATEFUL_FUNC; + entry[t].flags |= GVM_CPUID_FLAG_STATEFUL_FUNC; ++*nent; } break; @@ -423,7 +366,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, case 4: { int i, cache_type; - entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; + entry->flags |= GVM_CPUID_FLAG_SIGNIFCANT_INDEX; /* read more entries until cache_type is zero */ for (i = 1; ; ++i) { if (*nent >= maxnent) @@ -434,7 +377,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, break; do_cpuid_1_ent(&entry[i], function, i); entry[i].flags |= - KVM_CPUID_FLAG_SIGNIFCANT_INDEX; + GVM_CPUID_FLAG_SIGNIFCANT_INDEX; ++*nent; } break; @@ -446,7 +389,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, entry->edx = 0; break; case 7: { - entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; + entry->flags |= GVM_CPUID_FLAG_SIGNIFCANT_INDEX; /* Mask ebx against host capability word 9 */ if (index == 0) { entry->ebx &= kvm_cpuid_7_0_ebx_x86_features; @@ -469,6 +412,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, case 9: break; case 0xa: { /* Architectural Performance Monitoring */ +#if 0 struct x86_pmu_capability cap; union cpuid10_eax eax; union cpuid10_edx edx; @@ -495,13 +439,14 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, entry->ebx = cap.events_mask; entry->ecx = 0; entry->edx = edx.full; +#endif break; } /* function 0xb has additional index. 
*/ case 0xb: { int i, level_type; - entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; + entry->flags |= GVM_CPUID_FLAG_SIGNIFCANT_INDEX; /* read more entries until level_type is zero */ for (i = 1; ; ++i) { if (*nent >= maxnent) @@ -512,7 +457,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, break; do_cpuid_1_ent(&entry[i], function, i); entry[i].flags |= - KVM_CPUID_FLAG_SIGNIFCANT_INDEX; + GVM_CPUID_FLAG_SIGNIFCANT_INDEX; ++*nent; } break; @@ -525,7 +470,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, entry->ebx = xstate_required_size(supported, false); entry->ecx = entry->ebx; entry->edx &= supported >> 32; - entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; + entry->flags |= GVM_CPUID_FLAG_SIGNIFCANT_INDEX; if (!supported) break; @@ -552,37 +497,12 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, entry[i].ecx = 0; entry[i].edx = 0; entry[i].flags |= - KVM_CPUID_FLAG_SIGNIFCANT_INDEX; + GVM_CPUID_FLAG_SIGNIFCANT_INDEX; ++*nent; ++i; } break; } - case KVM_CPUID_SIGNATURE: { - static const char signature[12] = "KVMKVMKVM\0\0"; - const u32 *sigptr = (const u32 *)signature; - entry->eax = KVM_CPUID_FEATURES; - entry->ebx = sigptr[0]; - entry->ecx = sigptr[1]; - entry->edx = sigptr[2]; - break; - } - case KVM_CPUID_FEATURES: - entry->eax = (1 << KVM_FEATURE_CLOCKSOURCE) | - (1 << KVM_FEATURE_NOP_IO_DELAY) | - (1 << KVM_FEATURE_CLOCKSOURCE2) | - (1 << KVM_FEATURE_ASYNC_PF) | - (1 << KVM_FEATURE_PV_EOI) | - (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT) | - (1 << KVM_FEATURE_PV_UNHALT); - - if (sched_info_on()) - entry->eax |= (1 << KVM_FEATURE_STEAL_TIME); - - entry->ebx = 0; - entry->ecx = 0; - entry->edx = 0; - break; case 0x80000000: entry->eax = min(entry->eax, 0x8000001a); break; @@ -593,11 +513,13 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, cpuid_mask(&entry->ecx, CPUID_8000_0001_ECX); break; case 0x80000007: /* Advanced power management */ +#if 0 /* invariant TSC is CPUID.80000007H:EDX[8] */ entry->edx &= (1 << 8); /* mask against host */ entry->edx &= boot_cpu_data.x86_power; entry->eax = entry->ebx = entry->ecx = 0; +#endif break; case 0x80000008: { unsigned g_phys_as = (entry->eax >> 16) & 0xff; @@ -646,10 +568,10 @@ out: return r; } -static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 func, +static int do_cpuid_ent(struct kvm_cpuid_entry *entry, u32 func, u32 idx, int *nent, int maxnent, unsigned int type) { - if (type == KVM_GET_EMULATED_CPUID) + if (type == GVM_GET_EMULATED_CPUID) return __do_cpuid_ent_emulated(entry, func, idx, nent, maxnent); return __do_cpuid_ent(entry, func, idx, nent, maxnent); @@ -666,23 +588,23 @@ struct kvm_cpuid_param { static bool is_centaur_cpu(const struct kvm_cpuid_param *param) { - return boot_cpu_data.x86_vendor == X86_VENDOR_CENTAUR; + return 0; } -static bool sanity_check_entries(struct kvm_cpuid_entry2 __user *entries, +static bool sanity_check_entries(struct kvm_cpuid_entry __user *entries, __u32 num_entries, unsigned int ioctl_type) { int i; __u32 pad[3]; - if (ioctl_type != KVM_GET_EMULATED_CPUID) + if (ioctl_type != GVM_GET_EMULATED_CPUID) return false; /* * We want to make sure that ->padding is being passed clean from * userspace in case we want to use it for something in the future. * - * Sadly, this wasn't enforced for KVM_GET_SUPPORTED_CPUID and so we + * Sadly, this wasn't enforced for GVM_GET_SUPPORTED_CPUID and so we * have to give ourselves satisfied only with the emulated side. 
/me * sheds a tear. */ @@ -696,31 +618,29 @@ static bool sanity_check_entries(struct kvm_cpuid_entry2 __user *entries, return false; } -int kvm_dev_ioctl_get_cpuid(struct kvm_cpuid2 *cpuid, - struct kvm_cpuid_entry2 __user *entries, +int kvm_dev_ioctl_get_cpuid(PIRP pIrp, struct kvm_cpuid *cpuid, + struct kvm_cpuid_entry __user *entries, unsigned int type) { - struct kvm_cpuid_entry2 *cpuid_entries; + struct kvm_cpuid_entry *cpuid_entries; int limit, nent = 0, r = -E2BIG, i; u32 func; static const struct kvm_cpuid_param param[] = { { .func = 0, .has_leaf_count = true }, { .func = 0x80000000, .has_leaf_count = true }, { .func = 0xC0000000, .qualifier = is_centaur_cpu, .has_leaf_count = true }, - { .func = KVM_CPUID_SIGNATURE }, - { .func = KVM_CPUID_FEATURES }, }; if (cpuid->nent < 1) goto out; - if (cpuid->nent > KVM_MAX_CPUID_ENTRIES) - cpuid->nent = KVM_MAX_CPUID_ENTRIES; + if (cpuid->nent > GVM_MAX_CPUID_ENTRIES) + cpuid->nent = GVM_MAX_CPUID_ENTRIES; if (sanity_check_entries(entries, cpuid->nent, type)) return -EINVAL; r = -ENOMEM; - cpuid_entries = vzalloc(sizeof(struct kvm_cpuid_entry2) * cpuid->nent); + cpuid_entries = vzalloc(sizeof(struct kvm_cpuid_entry) * cpuid->nent); if (!cpuid_entries) goto out; @@ -749,11 +669,19 @@ int kvm_dev_ioctl_get_cpuid(struct kvm_cpuid2 *cpuid, goto out_free; } - r = -EFAULT; - if (copy_to_user(entries, cpuid_entries, - nent * sizeof(struct kvm_cpuid_entry2))) - goto out_free; cpuid->nent = nent; + + r = gvmUpdateReturnBuffer(pIrp, 0, cpuid, sizeof(cpuid)); + if (!NT_SUCCESS(r)) { + r = -EFAULT; + goto out_free; + } + r = gvmUpdateReturnBuffer(pIrp, sizeof(cpuid), cpuid_entries, + nent * sizeof(struct kvm_cpuid_entry)); + if (!NT_SUCCESS(r)) { + r = -EFAULT; + goto out_free; + } r = 0; out_free: @@ -764,48 +692,47 @@ out: static int move_to_next_stateful_cpuid_entry(struct kvm_vcpu *vcpu, int i) { - struct kvm_cpuid_entry2 *e = &vcpu->arch.cpuid_entries[i]; + struct kvm_cpuid_entry *e = &vcpu->arch.cpuid_entries[i]; int j, nent = vcpu->arch.cpuid_nent; - e->flags &= ~KVM_CPUID_FLAG_STATE_READ_NEXT; + e->flags &= ~GVM_CPUID_FLAG_STATE_READ_NEXT; /* when no next entry is found, the current entry[i] is reselected */ for (j = i + 1; ; j = (j + 1) % nent) { - struct kvm_cpuid_entry2 *ej = &vcpu->arch.cpuid_entries[j]; + struct kvm_cpuid_entry *ej = &vcpu->arch.cpuid_entries[j]; if (ej->function == e->function) { - ej->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT; + ej->flags |= GVM_CPUID_FLAG_STATE_READ_NEXT; return j; } } - return 0; /* silence gcc, even though control never reaches here */ } /* find an entry with matching function, matching index (if needed), and that * should be read next (if it's stateful) */ -static int is_matching_cpuid_entry(struct kvm_cpuid_entry2 *e, +static int is_matching_cpuid_entry(struct kvm_cpuid_entry *e, u32 function, u32 index) { if (e->function != function) return 0; - if ((e->flags & KVM_CPUID_FLAG_SIGNIFCANT_INDEX) && e->index != index) + if ((e->flags & GVM_CPUID_FLAG_SIGNIFCANT_INDEX) && e->index != index) return 0; - if ((e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC) && - !(e->flags & KVM_CPUID_FLAG_STATE_READ_NEXT)) + if ((e->flags & GVM_CPUID_FLAG_STATEFUL_FUNC) && + !(e->flags & GVM_CPUID_FLAG_STATE_READ_NEXT)) return 0; return 1; } -struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu, +struct kvm_cpuid_entry *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu, u32 function, u32 index) { int i; - struct kvm_cpuid_entry2 *best = NULL; + struct kvm_cpuid_entry *best = NULL; for (i = 0; i < vcpu->arch.cpuid_nent; 
++i) {
-		struct kvm_cpuid_entry2 *e;
+		struct kvm_cpuid_entry *e;
 
 		e = &vcpu->arch.cpuid_entries[i];
 		if (is_matching_cpuid_entry(e, function, index)) {
-			if (e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC)
+			if (e->flags & GVM_CPUID_FLAG_STATEFUL_FUNC)
 				move_to_next_stateful_cpuid_entry(vcpu, i);
 			best = e;
 			break;
@@ -813,17 +740,16 @@ struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
 	}
 	return best;
 }
-EXPORT_SYMBOL_GPL(kvm_find_cpuid_entry);
 
 /*
  * If no match is found, check whether we exceed the vCPU's limit
  * and return the content of the highest valid _standard_ leaf instead.
  * This is to satisfy the CPUID specification.
  */
-static struct kvm_cpuid_entry2* check_cpuid_limit(struct kvm_vcpu *vcpu,
+static struct kvm_cpuid_entry* check_cpuid_limit(struct kvm_vcpu *vcpu,
 					u32 function, u32 index)
 {
-	struct kvm_cpuid_entry2 *maxlevel;
+	struct kvm_cpuid_entry *maxlevel;
 
 	maxlevel = kvm_find_cpuid_entry(vcpu, function & 0x80000000, 0);
 	if (!maxlevel || maxlevel->eax >= function)
@@ -839,7 +765,7 @@ static struct kvm_cpuid_entry2* check_cpuid_limit(struct kvm_vcpu *vcpu,
 void kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, u32 *ecx, u32 *edx)
 {
 	u32 function = *eax, index = *ecx;
-	struct kvm_cpuid_entry2 *best;
+	struct kvm_cpuid_entry *best;
 
 	best = kvm_find_cpuid_entry(vcpu, function, index);
 
@@ -859,9 +785,7 @@ void kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, u32 *ecx, u32 *edx)
 		*edx = best->edx;
 	} else
 		*eax = *ebx = *ecx = *edx = 0;
-	trace_kvm_cpuid(function, *eax, *ebx, *ecx, *edx);
 }
-EXPORT_SYMBOL_GPL(kvm_cpuid);
 
 void kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
 {
@@ -876,4 +800,3 @@ void kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
 	kvm_register_write(vcpu, VCPU_REGS_RDX, edx);
 	kvm_x86_ops->skip_emulated_instruction(vcpu);
 }
-EXPORT_SYMBOL_GPL(kvm_emulate_cpuid);
diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
index 35058c2..4bfa008 100644..100755
--- a/arch/x86/kvm/cpuid.h
+++ b/arch/x86/kvm/cpuid.h
@@ -1,25 +1,31 @@
+/*
+ * Copyright 2019 Google LLC
+ */
+
 #ifndef ARCH_X86_KVM_CPUID_H
 #define ARCH_X86_KVM_CPUID_H
 
 #include "x86.h"
-#include <asm/cpu.h>
+#include <gvm_types.h>
+#include <asm/cpufeatures.h>
+#include <ntkrutils.h>
 
 int kvm_update_cpuid(struct kvm_vcpu *vcpu);
 bool kvm_mpx_supported(void);
-struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
+struct kvm_cpuid_entry *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
 					      u32 function, u32 index);
-int kvm_dev_ioctl_get_cpuid(struct kvm_cpuid2 *cpuid,
-			    struct kvm_cpuid_entry2 __user *entries,
+int kvm_dev_ioctl_get_cpuid(PIRP pIrp, struct kvm_cpuid *cpuid,
+			    struct kvm_cpuid_entry __user *entries,
 			    unsigned int type);
 int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
 			     struct kvm_cpuid *cpuid,
 			     struct kvm_cpuid_entry __user *entries);
-int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu,
-			      struct kvm_cpuid2 *cpuid,
-			      struct kvm_cpuid_entry2 __user *entries);
-int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu,
-			      struct kvm_cpuid2 *cpuid,
-			      struct kvm_cpuid_entry2 __user *entries);
+int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
+			     struct kvm_cpuid *cpuid,
+			     struct kvm_cpuid_entry __user *entries);
+int kvm_vcpu_ioctl_get_cpuid(struct kvm_vcpu *vcpu,
+			     struct kvm_cpuid *cpuid,
+			     struct kvm_cpuid_entry __user *entries);
 void kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, u32 *ecx, u32 *edx);
 
 int cpuid_query_maxphyaddr(struct kvm_vcpu *vcpu);
@@ -31,7 +37,7 @@ static inline int cpuid_maxphyaddr(struct kvm_vcpu *vcpu)
 
 static inline bool guest_cpuid_has_xsave(struct kvm_vcpu *vcpu)
 {
-	struct kvm_cpuid_entry2 *best;
+	struct kvm_cpuid_entry *best;
 
 	if (!static_cpu_has(X86_FEATURE_XSAVE))
 		return false;
@@ -42,7 +48,7 @@ static inline bool guest_cpuid_has_xsave(struct kvm_vcpu *vcpu)
 
 static inline bool guest_cpuid_has_mtrr(struct kvm_vcpu *vcpu)
 {
-	struct kvm_cpuid_entry2 *best;
+	struct kvm_cpuid_entry *best;
 
 	best = kvm_find_cpuid_entry(vcpu, 1, 0);
 	return best && (best->edx & bit(X86_FEATURE_MTRR));
@@ -50,7 +56,7 @@ static inline bool guest_cpuid_has_mtrr(struct kvm_vcpu *vcpu)
 
 static inline bool guest_cpuid_has_tsc_adjust(struct kvm_vcpu *vcpu)
 {
-	struct kvm_cpuid_entry2 *best;
+	struct kvm_cpuid_entry *best;
 
 	best = kvm_find_cpuid_entry(vcpu, 7, 0);
 	return best && (best->ebx & bit(X86_FEATURE_TSC_ADJUST));
@@ -58,7 +64,7 @@ static inline bool guest_cpuid_has_tsc_adjust(struct kvm_vcpu *vcpu)
 
 static inline bool guest_cpuid_has_smep(struct kvm_vcpu *vcpu)
 {
-	struct kvm_cpuid_entry2 *best;
+	struct kvm_cpuid_entry *best;
 
 	best = kvm_find_cpuid_entry(vcpu, 7, 0);
 	return best && (best->ebx & bit(X86_FEATURE_SMEP));
@@ -66,7 +72,7 @@ static inline bool guest_cpuid_has_smep(struct kvm_vcpu *vcpu)
 
 static inline bool guest_cpuid_has_smap(struct kvm_vcpu *vcpu)
 {
-	struct kvm_cpuid_entry2 *best;
+	struct kvm_cpuid_entry *best;
 
 	best = kvm_find_cpuid_entry(vcpu, 7, 0);
 	return best && (best->ebx & bit(X86_FEATURE_SMAP));
@@ -74,7 +80,7 @@ static inline bool guest_cpuid_has_smap(struct kvm_vcpu *vcpu)
 
 static inline bool guest_cpuid_has_fsgsbase(struct kvm_vcpu *vcpu)
 {
-	struct kvm_cpuid_entry2 *best;
+	struct kvm_cpuid_entry *best;
 
 	best = kvm_find_cpuid_entry(vcpu, 7, 0);
 	return best && (best->ebx & bit(X86_FEATURE_FSGSBASE));
@@ -82,7 +88,7 @@ static inline bool guest_cpuid_has_fsgsbase(struct kvm_vcpu *vcpu)
 
 static inline bool guest_cpuid_has_pku(struct kvm_vcpu *vcpu)
 {
-	struct kvm_cpuid_entry2 *best;
+	struct kvm_cpuid_entry *best;
 
 	best = kvm_find_cpuid_entry(vcpu, 7, 0);
 	return best && (best->ecx & bit(X86_FEATURE_PKU));
@@ -90,7 +96,7 @@ static inline bool guest_cpuid_has_pku(struct kvm_vcpu *vcpu)
 
 static inline bool guest_cpuid_has_longmode(struct kvm_vcpu *vcpu)
 {
-	struct kvm_cpuid_entry2 *best;
+	struct kvm_cpuid_entry *best;
 
 	best = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
 	return best && (best->edx & bit(X86_FEATURE_LM));
@@ -98,7 +104,7 @@ static inline bool guest_cpuid_has_longmode(struct kvm_vcpu *vcpu)
 
 static inline bool guest_cpuid_has_osvw(struct kvm_vcpu *vcpu)
 {
-	struct kvm_cpuid_entry2 *best;
+	struct kvm_cpuid_entry *best;
 
 	best = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
 	return best && (best->ecx & bit(X86_FEATURE_OSVW));
@@ -106,7 +112,7 @@ static inline bool guest_cpuid_has_osvw(struct kvm_vcpu *vcpu)
 
 static inline bool guest_cpuid_has_pcid(struct kvm_vcpu *vcpu)
 {
-	struct kvm_cpuid_entry2 *best;
+	struct kvm_cpuid_entry *best;
 
 	best = kvm_find_cpuid_entry(vcpu, 1, 0);
 	return best && (best->ecx & bit(X86_FEATURE_PCID));
@@ -114,7 +120,7 @@ static inline bool guest_cpuid_has_pcid(struct kvm_vcpu *vcpu)
 
 static inline bool guest_cpuid_has_x2apic(struct kvm_vcpu *vcpu)
 {
-	struct kvm_cpuid_entry2 *best;
+	struct kvm_cpuid_entry *best;
 
 	best = kvm_find_cpuid_entry(vcpu, 1, 0);
 	return best && (best->ecx & bit(X86_FEATURE_X2APIC));
@@ -122,7 +128,7 @@ static inline bool guest_cpuid_has_x2apic(struct kvm_vcpu *vcpu)
 
 static inline bool guest_cpuid_is_amd(struct kvm_vcpu *vcpu)
 {
-	struct kvm_cpuid_entry2 *best;
+	struct kvm_cpuid_entry *best;
 
 	best = kvm_find_cpuid_entry(vcpu, 0, 0);
 	return best && best->ebx == X86EMUL_CPUID_VENDOR_AuthenticAMD_ebx;
@@ -130,7 +136,7 @@ static inline bool guest_cpuid_is_amd(struct kvm_vcpu *vcpu)
 
 static inline bool guest_cpuid_has_gbpages(struct kvm_vcpu *vcpu)
 {
-	struct kvm_cpuid_entry2 *best;
+	struct kvm_cpuid_entry *best;
 
 	best = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
 	return best && (best->edx & bit(X86_FEATURE_GBPAGES));
@@ -138,7 +144,7 @@ static inline bool guest_cpuid_has_gbpages(struct kvm_vcpu *vcpu)
 
 static inline bool guest_cpuid_has_rtm(struct kvm_vcpu *vcpu)
 {
-	struct kvm_cpuid_entry2 *best;
+	struct kvm_cpuid_entry *best;
 
 	best = kvm_find_cpuid_entry(vcpu, 7, 0);
 	return best && (best->ebx & bit(X86_FEATURE_RTM));
@@ -146,7 +152,7 @@ static inline bool guest_cpuid_has_rtm(struct kvm_vcpu *vcpu)
 
 static inline bool guest_cpuid_has_rdtscp(struct kvm_vcpu *vcpu)
 {
-	struct kvm_cpuid_entry2 *best;
+	struct kvm_cpuid_entry *best;
 
 	best = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
 	return best && (best->edx & bit(X86_FEATURE_RDTSCP));
@@ -159,7 +165,7 @@ static inline bool guest_cpuid_has_rdtscp(struct kvm_vcpu *vcpu)
 
 static inline bool guest_cpuid_has_nrips(struct kvm_vcpu *vcpu)
 {
-	struct kvm_cpuid_entry2 *best;
+	struct kvm_cpuid_entry *best;
 
 	best = kvm_find_cpuid_entry(vcpu, 0x8000000a, 0);
 
@@ -174,7 +180,7 @@ static inline bool guest_cpuid_has_nrips(struct kvm_vcpu *vcpu)
 
 static inline int guest_cpuid_family(struct kvm_vcpu *vcpu)
 {
-	struct kvm_cpuid_entry2 *best;
+	struct kvm_cpuid_entry *best;
 
 	best = kvm_find_cpuid_entry(vcpu, 0x1, 0);
 	if (!best)
@@ -185,7 +191,7 @@ static inline int guest_cpuid_family(struct kvm_vcpu *vcpu)
 
 static inline int guest_cpuid_model(struct kvm_vcpu *vcpu)
 {
-	struct kvm_cpuid_entry2 *best;
+	struct kvm_cpuid_entry *best;
 
 	best = kvm_find_cpuid_entry(vcpu, 0x1, 0);
 	if (!best)
@@ -196,7 +202,7 @@ static inline int guest_cpuid_model(struct kvm_vcpu *vcpu)
 
 static inline int guest_cpuid_stepping(struct kvm_vcpu *vcpu)
 {
-	struct kvm_cpuid_entry2 *best;
+	struct kvm_cpuid_entry *best;
 
 	best = kvm_find_cpuid_entry(vcpu, 0x1, 0);
 	if (!best)
diff --git a/arch/x86/kvm/debugfs.c b/arch/x86/kvm/debugfs.c
deleted file mode 100644
index c19c7ed..0000000
--- a/arch/x86/kvm/debugfs.c
+++ /dev/null
@@ -1,69 +0,0 @@
-/*
- * Kernel-based Virtual Machine driver for Linux
- *
- * Copyright 2016 Red Hat, Inc. and/or its affiliates.
- *
- * This work is licensed under the terms of the GNU GPL, version 2.  See
- * the COPYING file in the top-level directory.
- *
- */
-#include <linux/kvm_host.h>
-#include <linux/debugfs.h>
-
-bool kvm_arch_has_vcpu_debugfs(void)
-{
-	return true;
-}
-
-static int vcpu_get_tsc_offset(void *data, u64 *val)
-{
-	struct kvm_vcpu *vcpu = (struct kvm_vcpu *) data;
-	*val = vcpu->arch.tsc_offset;
-	return 0;
-}
-
-DEFINE_SIMPLE_ATTRIBUTE(vcpu_tsc_offset_fops, vcpu_get_tsc_offset, NULL, "%lld\n");
-
-static int vcpu_get_tsc_scaling_ratio(void *data, u64 *val)
-{
-	struct kvm_vcpu *vcpu = (struct kvm_vcpu *) data;
-	*val = vcpu->arch.tsc_scaling_ratio;
-	return 0;
-}
-
-DEFINE_SIMPLE_ATTRIBUTE(vcpu_tsc_scaling_fops, vcpu_get_tsc_scaling_ratio, NULL, "%llu\n");
-
-static int vcpu_get_tsc_scaling_frac_bits(void *data, u64 *val)
-{
-	*val = kvm_tsc_scaling_ratio_frac_bits;
-	return 0;
-}
-
-DEFINE_SIMPLE_ATTRIBUTE(vcpu_tsc_scaling_frac_fops, vcpu_get_tsc_scaling_frac_bits, NULL, "%llu\n");
-
-int kvm_arch_create_vcpu_debugfs(struct kvm_vcpu *vcpu)
-{
-	struct dentry *ret;
-
-	ret = debugfs_create_file("tsc-offset", 0444,
-							vcpu->debugfs_dentry,
-							vcpu, &vcpu_tsc_offset_fops);
-	if (!ret)
-		return -ENOMEM;
-
-	if (kvm_has_tsc_control) {
-		ret = debugfs_create_file("tsc-scaling-ratio", 0444,
-							vcpu->debugfs_dentry,
-							vcpu, &vcpu_tsc_scaling_fops);
-		if (!ret)
-			return -ENOMEM;
-		ret = debugfs_create_file("tsc-scaling-ratio-frac-bits", 0444,
-							vcpu->debugfs_dentry,
-							vcpu, &vcpu_tsc_scaling_frac_fops);
-		if (!ret)
-			return -ENOMEM;
-
-	}
-
-	return 0;
-}
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index a3ce9d2..6ae4ce5 100644..100755
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -10,6 +10,7 @@
  *
  * Copyright (C) 2006 Qumranet
  * Copyright 2010 Red Hat, Inc. and/or its affiliates.
+ * Copyright 2019 Google LLC
  *
  * Avi Kivity <avi@qumranet.com>
  * Yaniv Kamay <yaniv@qumranet.com>
@@ -23,12 +24,15 @@
 #include <linux/kvm_host.h>
 #include "kvm_cache_regs.h"
 #include <asm/kvm_emulate.h>
-#include <linux/stringify.h>
-#include <asm/debugreg.h>
+#include <uapi/asm/debugreg.h>
 
 #include "x86.h"
 #include "tss.h"
 
+#include <gvm_types.h>
+#include <uapi/asm/processor-flags.h>
+#include <asm/cpufeatures.h>
+
 /*
  * Operand types
  */
@@ -174,14 +178,14 @@
 
 #define DstXacc     (DstAccLo | SrcAccHi | SrcWrite)
 
-#define X2(x...) x, x
-#define X3(x...) X2(x), x
-#define X4(x...) X2(x), X2(x)
-#define X5(x...) X4(x), x
-#define X6(x...) X4(x), X2(x)
-#define X7(x...) X4(x), X3(x)
-#define X8(x...) X4(x), X4(x)
-#define X16(x...) X8(x), X8(x)
+#define X2(x,...) x, x
+#define X3(x,...) X2(x), x
+#define X4(x,...) X2(x), X2(x)
+#define X5(x,...) X4(x), x
+#define X6(x,...) X4(x), X2(x)
+#define X7(x,...) X4(x), X3(x)
+#define X8(x,...) X4(x), X4(x)
+#define X16(x,...)
X8(x), X8(x) #define NR_FASTOP (ilog2(sizeof(ulong)) + 1) #define FASTOP_SIZE 8 @@ -281,7 +285,7 @@ static ulong *reg_rmw(struct x86_emulate_ctxt *ctxt, unsigned nr) static void writeback_registers(struct x86_emulate_ctxt *ctxt) { - unsigned reg; + unsigned reg = 0; for_each_set_bit(reg, (ulong *)&ctxt->regs_dirty, 16) ctxt->ops->write_gpr(ctxt, reg, ctxt->_regs[reg]); @@ -308,32 +312,23 @@ static void invalidate_registers(struct x86_emulate_ctxt *ctxt) static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *)); -#define FOP_FUNC(name) \ - ".align " __stringify(FASTOP_SIZE) " \n\t" \ - ".type " name ", @function \n\t" \ - name ":\n\t" +#define FOP_FUNC(name) -#define FOP_RET "ret \n\t" +#define FOP_RET #define FOP_START(op) \ - extern void em_##op(struct fastop *fake); \ - asm(".pushsection .text, \"ax\" \n\t" \ - ".global em_" #op " \n\t" \ - FOP_FUNC("em_" #op) + extern void em_##op(struct fastop *fake); -#define FOP_END \ - ".popsection") +#define FOP_END #define FOPNOP() \ - FOP_FUNC(__stringify(__UNIQUE_ID(nop))) \ - FOP_RET + FOP_FUNC(__stringify(__UNIQUE_ID(nop))) #define FOP1E(op, dst) \ - FOP_FUNC(#op "_" #dst) \ - "10: " #op " %" #dst " \n\t" FOP_RET + FOP_FUNC(#op "_" #dst) #define FOP1EEX(op, dst) \ - FOP1E(op, dst) _ASM_EXTABLE(10b, kvm_fastop_exception) + FOP1E(op, dst) #define FASTOP1(op) \ FOP_START(op) \ @@ -362,8 +357,7 @@ static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *)); FOP_END #define FOP2E(op, dst, src) \ - FOP_FUNC(#op "_" #dst "_" #src) \ - #op " %" #src ", %" #dst " \n\t" FOP_RET + FOP_FUNC(#op "_" #dst "_" #src) #define FASTOP2(op) \ FOP_START(op) \ @@ -401,8 +395,7 @@ static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *)); FOP_END #define FOP3E(op, dst, src, src2) \ - FOP_FUNC(#op "_" #dst "_" #src "_" #src2) \ - #op " %" #src2 ", %" #src ", %" #dst " \n\t" FOP_RET + FOP_FUNC(#op "_" #dst "_" #src "_" #src2) /* 3-operand, word-only, src2=cl */ #define FASTOP3WCL(op) \ @@ -414,15 +407,7 @@ static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *)); FOP_END /* Special case for SETcc - 1 instruction per cc */ -#define FOP_SETCC(op) \ - ".align 4 \n\t" \ - ".type " #op ", @function \n\t" \ - #op ": \n\t" \ - #op " %al \n\t" \ - FOP_RET - -asm(".global kvm_fastop_exception \n" - "kvm_fastop_exception: xor %esi, %esi; ret"); +#define FOP_SETCC(op) FOP_START(setcc) FOP_SETCC(seto) @@ -443,7 +428,7 @@ FOP_SETCC(setle) FOP_SETCC(setnle) FOP_END; -FOP_START(salc) "pushf; sbb %al, %al; popf \n\t" FOP_RET +FOP_START(salc) FOP_END; static int emulator_check_intercept(struct x86_emulate_ctxt *ctxt, @@ -472,7 +457,7 @@ static void assign_masked(ulong *dest, ulong src, ulong mask) *dest = (*dest & ~mask) | (src & mask); } -static void assign_register(unsigned long *reg, u64 val, int bytes) +static void assign_register(size_t *reg, u64 val, int bytes) { /* The 4-byte case *is* correct: in 64-bit mode we zero-extend. */ switch (bytes) { @@ -491,9 +476,9 @@ static void assign_register(unsigned long *reg, u64 val, int bytes) } } -static inline unsigned long ad_mask(struct x86_emulate_ctxt *ctxt) +static inline size_t ad_mask(struct x86_emulate_ctxt *ctxt) { - return (1UL << (ctxt->ad_bytes << 3)) - 1; + return (1ULL << (ctxt->ad_bytes << 3)) - 1; } static ulong stack_mask(struct x86_emulate_ctxt *ctxt) @@ -513,16 +498,16 @@ static int stack_size(struct x86_emulate_ctxt *ctxt) } /* Access/update address held in a register, based on addressing mode. 
*/ -static inline unsigned long -address_mask(struct x86_emulate_ctxt *ctxt, unsigned long reg) +static inline size_t +address_mask(struct x86_emulate_ctxt *ctxt, size_t reg) { - if (ctxt->ad_bytes == sizeof(unsigned long)) + if (ctxt->ad_bytes == sizeof(size_t)) return reg; else return reg & ad_mask(ctxt); } -static inline unsigned long +static inline size_t register_address(struct x86_emulate_ctxt *ctxt, int reg) { return address_mask(ctxt, reg_read(ctxt, reg)); @@ -553,7 +538,7 @@ static u32 desc_limit_scaled(struct desc_struct *desc) return desc->g ? (limit << 12) | 0xfff : limit; } -static unsigned long seg_base(struct x86_emulate_ctxt *ctxt, int seg) +static size_t seg_base(struct x86_emulate_ctxt *ctxt, int seg) { if (ctxt->mode == X86EMUL_MODE_PROT64 && seg < VCPU_SREG_FS) return 0; @@ -733,8 +718,8 @@ static inline int assign_eip(struct x86_emulate_ctxt *ctxt, ulong dst, struct segmented_address addr = { .seg = VCPU_SREG_CS, .ea = dst }; - if (ctxt->op_bytes != sizeof(unsigned long)) - addr.ea = dst & ((1UL << (ctxt->op_bytes << 3)) - 1); + if (ctxt->op_bytes != sizeof(size_t)) + addr.ea = dst & ((1ULL << (ctxt->op_bytes << 3)) - 1); rc = __linearize(ctxt, addr, &max_size, 1, false, true, mode, &linear); if (rc == X86EMUL_CONTINUE) ctxt->_eip = addr.ea; @@ -799,7 +784,7 @@ static int __do_insn_fetch_bytes(struct x86_emulate_ctxt *ctxt, int op_size) { int rc; unsigned size, max_size; - unsigned long linear; + size_t linear; int cur_size = ctxt->fetch.end - ctxt->fetch.data; struct segmented_address addr = { .seg = VCPU_SREG_CS, .ea = ctxt->eip + cur_size }; @@ -851,27 +836,53 @@ static __always_inline int do_insn_fetch_bytes(struct x86_emulate_ctxt *ctxt, } /* Fetch next part of the instruction being emulated. */ -#define insn_fetch(_type, _ctxt) \ -({ _type _x; \ - \ - rc = do_insn_fetch_bytes(_ctxt, sizeof(_type)); \ - if (rc != X86EMUL_CONTINUE) \ - goto done; \ - ctxt->_eip += sizeof(_type); \ - _x = *(_type __aligned(1) *) ctxt->fetch.ptr; \ - ctxt->fetch.ptr += sizeof(_type); \ - _x; \ -}) - -#define insn_fetch_arr(_arr, _size, _ctxt) \ -({ \ - rc = do_insn_fetch_bytes(_ctxt, _size); \ - if (rc != X86EMUL_CONTINUE) \ - goto done; \ - ctxt->_eip += (_size); \ - memcpy(_arr, ctxt->fetch.ptr, _size); \ - ctxt->fetch.ptr += (_size); \ -}) +#define __insn_fetch_type(_type) \ +static __always_inline int \ + __insn_fetch_##_type(struct x86_emulate_ctxt *ctxt, _type *_x) \ +{ \ + int rc; \ + rc = do_insn_fetch_bytes(ctxt, sizeof(_type)); \ + if (rc == X86EMUL_CONTINUE) { \ + ctxt->_eip += sizeof(_type); \ + *_x = *(_type *) ctxt->fetch.ptr; \ + ctxt->fetch.ptr += sizeof(_type); \ + } \ + return rc; \ +} + +__insn_fetch_type(u8) +__insn_fetch_type(s8) +__insn_fetch_type(u16) +__insn_fetch_type(s16) +__insn_fetch_type(u32) +__insn_fetch_type(s32) +__insn_fetch_type(u64) +__insn_fetch_type(s64) + +#define insn_fetch(_type, _ctxt, _data) __insn_fetch_##_type(_ctxt, &(_type)_data) + +#define insn_fetch_modrmea(_type, _ctxt) \ + do { \ + _type __temp; \ + rc = insn_fetch(_type, _ctxt, __temp); \ + if (rc != X86EMUL_CONTINUE) \ + goto done; \ + modrm_ea += __temp; \ + } while (0) + + +static __always_inline int insn_fetch_arr(char *_arr, + unsigned int _size, struct x86_emulate_ctxt *_ctxt) +{ + int rc; + rc = do_insn_fetch_bytes(_ctxt, _size); + if (rc == X86EMUL_CONTINUE) { + _ctxt->_eip += (_size); + memcpy(_arr, _ctxt->fetch.ptr, _size); + _ctxt->fetch.ptr += (_size); + } + return rc; +} /* * Given the 'reg' portion of a ModRM byte, and a register block, return a @@ -893,7 +904,7 @@ 
static void *decode_register(struct x86_emulate_ctxt *ctxt, u8 modrm_reg, static int read_descriptor(struct x86_emulate_ctxt *ctxt, struct segmented_address addr, - u16 *size, unsigned long *address, int op_bytes) + u16 *size, size_t *address, int op_bytes) { int rc; @@ -968,14 +979,14 @@ static int em_bsr_c(struct x86_emulate_ctxt *ctxt) return fastop(ctxt, em_bsr); } -static __always_inline u8 test_cc(unsigned int condition, unsigned long flags) +extern u8 __asm_test_cc(void *fop, size_t flags); +static __always_inline u8 test_cc(unsigned int condition, size_t flags) { - u8 rc; - void (*fop)(void) = (void *)em_setcc + 4 * (condition & 0xf); + u8 rc = 0; + void(*fop)(void) = (void(*)(void))((char *)em_setcc + 4 * (condition & 0xf)); flags = (flags & EFLAGS_MASK) | X86_EFLAGS_IF; - asm("push %[flags]; popf; call *%[fastop]" - : "=a"(rc) : [fastop]"r"(fop), [flags]"r"(flags)); + rc = __asm_test_cc(fop, flags); return rc; } @@ -997,55 +1008,80 @@ static void fetch_register_operand(struct operand *op) } } +#define DECLARE_XMM(n) \ +extern __asm_save_xmm##n(sse128_t *data); \ +extern __asm_store_xmm##n(sse128_t *data); + +DECLARE_XMM(0) +DECLARE_XMM(1) +DECLARE_XMM(2) +DECLARE_XMM(3) +DECLARE_XMM(4) +DECLARE_XMM(5) +DECLARE_XMM(6) +DECLARE_XMM(7) +DECLARE_XMM(8) +DECLARE_XMM(9) +DECLARE_XMM(10) +DECLARE_XMM(11) +DECLARE_XMM(12) +DECLARE_XMM(13) +DECLARE_XMM(14) +DECLARE_XMM(15) + +#define SAVE_XMM(n) \ +case n: __asm_save_xmm##n(data); break; static void read_sse_reg(struct x86_emulate_ctxt *ctxt, sse128_t *data, int reg) { ctxt->ops->get_fpu(ctxt); switch (reg) { - case 0: asm("movdqa %%xmm0, %0" : "=m"(*data)); break; - case 1: asm("movdqa %%xmm1, %0" : "=m"(*data)); break; - case 2: asm("movdqa %%xmm2, %0" : "=m"(*data)); break; - case 3: asm("movdqa %%xmm3, %0" : "=m"(*data)); break; - case 4: asm("movdqa %%xmm4, %0" : "=m"(*data)); break; - case 5: asm("movdqa %%xmm5, %0" : "=m"(*data)); break; - case 6: asm("movdqa %%xmm6, %0" : "=m"(*data)); break; - case 7: asm("movdqa %%xmm7, %0" : "=m"(*data)); break; + SAVE_XMM(0) + SAVE_XMM(1) + SAVE_XMM(2) + SAVE_XMM(3) + SAVE_XMM(4) + SAVE_XMM(5) + SAVE_XMM(6) + SAVE_XMM(7) #ifdef CONFIG_X86_64 - case 8: asm("movdqa %%xmm8, %0" : "=m"(*data)); break; - case 9: asm("movdqa %%xmm9, %0" : "=m"(*data)); break; - case 10: asm("movdqa %%xmm10, %0" : "=m"(*data)); break; - case 11: asm("movdqa %%xmm11, %0" : "=m"(*data)); break; - case 12: asm("movdqa %%xmm12, %0" : "=m"(*data)); break; - case 13: asm("movdqa %%xmm13, %0" : "=m"(*data)); break; - case 14: asm("movdqa %%xmm14, %0" : "=m"(*data)); break; - case 15: asm("movdqa %%xmm15, %0" : "=m"(*data)); break; + SAVE_XMM(8) + SAVE_XMM(9) + SAVE_XMM(10) + SAVE_XMM(11) + SAVE_XMM(12) + SAVE_XMM(13) + SAVE_XMM(14) + SAVE_XMM(15) #endif default: BUG(); } ctxt->ops->put_fpu(ctxt); } +#define STORE_XMM(n) \ +case n: __asm_store_xmm##n(data); break; static void write_sse_reg(struct x86_emulate_ctxt *ctxt, sse128_t *data, int reg) { ctxt->ops->get_fpu(ctxt); switch (reg) { - case 0: asm("movdqa %0, %%xmm0" : : "m"(*data)); break; - case 1: asm("movdqa %0, %%xmm1" : : "m"(*data)); break; - case 2: asm("movdqa %0, %%xmm2" : : "m"(*data)); break; - case 3: asm("movdqa %0, %%xmm3" : : "m"(*data)); break; - case 4: asm("movdqa %0, %%xmm4" : : "m"(*data)); break; - case 5: asm("movdqa %0, %%xmm5" : : "m"(*data)); break; - case 6: asm("movdqa %0, %%xmm6" : : "m"(*data)); break; - case 7: asm("movdqa %0, %%xmm7" : : "m"(*data)); break; + STORE_XMM(0) + STORE_XMM(1) + STORE_XMM(2) + STORE_XMM(3) + STORE_XMM(4) + 
STORE_XMM(5) + STORE_XMM(6) + STORE_XMM(7) #ifdef CONFIG_X86_64 - case 8: asm("movdqa %0, %%xmm8" : : "m"(*data)); break; - case 9: asm("movdqa %0, %%xmm9" : : "m"(*data)); break; - case 10: asm("movdqa %0, %%xmm10" : : "m"(*data)); break; - case 11: asm("movdqa %0, %%xmm11" : : "m"(*data)); break; - case 12: asm("movdqa %0, %%xmm12" : : "m"(*data)); break; - case 13: asm("movdqa %0, %%xmm13" : : "m"(*data)); break; - case 14: asm("movdqa %0, %%xmm14" : : "m"(*data)); break; - case 15: asm("movdqa %0, %%xmm15" : : "m"(*data)); break; + STORE_XMM(8) + STORE_XMM(9) + STORE_XMM(10) + STORE_XMM(11) + STORE_XMM(12) + STORE_XMM(13) + STORE_XMM(14) + STORE_XMM(15) #endif default: BUG(); } @@ -1056,14 +1092,14 @@ static void read_mmx_reg(struct x86_emulate_ctxt *ctxt, u64 *data, int reg) { ctxt->ops->get_fpu(ctxt); switch (reg) { - case 0: asm("movq %%mm0, %0" : "=m"(*data)); break; - case 1: asm("movq %%mm1, %0" : "=m"(*data)); break; - case 2: asm("movq %%mm2, %0" : "=m"(*data)); break; - case 3: asm("movq %%mm3, %0" : "=m"(*data)); break; - case 4: asm("movq %%mm4, %0" : "=m"(*data)); break; - case 5: asm("movq %%mm5, %0" : "=m"(*data)); break; - case 6: asm("movq %%mm6, %0" : "=m"(*data)); break; - case 7: asm("movq %%mm7, %0" : "=m"(*data)); break; + case 0: __asm_save_mm0(data); break; + case 1: __asm_save_mm1(data); break; + case 2: __asm_save_mm2(data); break; + case 3: __asm_save_mm3(data); break; + case 4: __asm_save_mm4(data); break; + case 5: __asm_save_mm5(data); break; + case 6: __asm_save_mm6(data); break; + case 7: __asm_save_mm7(data); break; default: BUG(); } ctxt->ops->put_fpu(ctxt); @@ -1073,14 +1109,14 @@ static void write_mmx_reg(struct x86_emulate_ctxt *ctxt, u64 *data, int reg) { ctxt->ops->get_fpu(ctxt); switch (reg) { - case 0: asm("movq %0, %%mm0" : : "m"(*data)); break; - case 1: asm("movq %0, %%mm1" : : "m"(*data)); break; - case 2: asm("movq %0, %%mm2" : : "m"(*data)); break; - case 3: asm("movq %0, %%mm3" : : "m"(*data)); break; - case 4: asm("movq %0, %%mm4" : : "m"(*data)); break; - case 5: asm("movq %0, %%mm5" : : "m"(*data)); break; - case 6: asm("movq %0, %%mm6" : : "m"(*data)); break; - case 7: asm("movq %0, %%mm7" : : "m"(*data)); break; + case 0: __asm_store_mm0(data); break; + case 1: __asm_store_mm1(data); break; + case 2: __asm_store_mm2(data); break; + case 3: __asm_store_mm3(data); break; + case 4: __asm_store_mm4(data); break; + case 5: __asm_store_mm5(data); break; + case 6: __asm_store_mm6(data); break; + case 7: __asm_store_mm7(data); break; default: BUG(); } ctxt->ops->put_fpu(ctxt); @@ -1092,20 +1128,20 @@ static int em_fninit(struct x86_emulate_ctxt *ctxt) return emulate_nm(ctxt); ctxt->ops->get_fpu(ctxt); - asm volatile("fninit"); + __fninit(); ctxt->ops->put_fpu(ctxt); return X86EMUL_CONTINUE; } static int em_fnstcw(struct x86_emulate_ctxt *ctxt) { - u16 fcw; + u16 fcw = 0; if (ctxt->ops->get_cr(ctxt, 0) & (X86_CR0_TS | X86_CR0_EM)) return emulate_nm(ctxt); ctxt->ops->get_fpu(ctxt); - asm volatile("fnstcw %0": "+m"(fcw)); + __fnstcw(&fcw); ctxt->ops->put_fpu(ctxt); ctxt->dst.val = fcw; @@ -1115,13 +1151,13 @@ static int em_fnstcw(struct x86_emulate_ctxt *ctxt) static int em_fnstsw(struct x86_emulate_ctxt *ctxt) { - u16 fsw; + u16 fsw = 0; if (ctxt->ops->get_cr(ctxt, 0) & (X86_CR0_TS | X86_CR0_EM)) return emulate_nm(ctxt); ctxt->ops->get_fpu(ctxt); - asm volatile("fnstsw %0": "+m"(fsw)); + __fnstsw(&fsw); ctxt->ops->put_fpu(ctxt); ctxt->dst.val = fsw; @@ -1217,13 +1253,13 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt, switch 
(ctxt->modrm_mod) { case 0: if (ctxt->modrm_rm == 6) - modrm_ea += insn_fetch(u16, ctxt); + insn_fetch_modrmea(u16, ctxt); break; case 1: - modrm_ea += insn_fetch(s8, ctxt); + insn_fetch_modrmea(s8, ctxt); break; case 2: - modrm_ea += insn_fetch(u16, ctxt); + insn_fetch_modrmea(u16, ctxt); break; } switch (ctxt->modrm_rm) { @@ -1260,13 +1296,15 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt, } else { /* 32/64-bit ModR/M decode. */ if ((ctxt->modrm_rm & 7) == 4) { - sib = insn_fetch(u8, ctxt); + rc = insn_fetch(u8, ctxt, sib); + if (rc != X86EMUL_CONTINUE) + goto done; index_reg |= (sib >> 3) & 7; base_reg |= sib & 7; scale = sib >> 6; if ((base_reg & 7) == 5 && ctxt->modrm_mod == 0) - modrm_ea += insn_fetch(s32, ctxt); + insn_fetch_modrmea(s32, ctxt); else { modrm_ea += reg_read(ctxt, base_reg); adjust_modrm_seg(ctxt, base_reg); @@ -1278,7 +1316,7 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt, if (index_reg != 4) modrm_ea += reg_read(ctxt, index_reg) << scale; } else if ((ctxt->modrm_rm & 7) == 5 && ctxt->modrm_mod == 0) { - modrm_ea += insn_fetch(s32, ctxt); + insn_fetch_modrmea(s32, ctxt); if (ctxt->mode == X86EMUL_MODE_PROT64) ctxt->rip_relative = 1; } else { @@ -1288,10 +1326,10 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt, } switch (ctxt->modrm_mod) { case 1: - modrm_ea += insn_fetch(s8, ctxt); + insn_fetch_modrmea(s8, ctxt); break; case 2: - modrm_ea += insn_fetch(s32, ctxt); + insn_fetch_modrmea(s32, ctxt); break; } } @@ -1311,13 +1349,19 @@ static int decode_abs(struct x86_emulate_ctxt *ctxt, op->type = OP_MEM; switch (ctxt->ad_bytes) { case 2: - op->addr.mem.ea = insn_fetch(u16, ctxt); + rc = insn_fetch(u16, ctxt, op->addr.mem.ea); + if (rc != X86EMUL_CONTINUE) + goto done; break; case 4: - op->addr.mem.ea = insn_fetch(u32, ctxt); + rc = insn_fetch(u32, ctxt, op->addr.mem.ea); + if (rc != X86EMUL_CONTINUE) + goto done; break; case 8: - op->addr.mem.ea = insn_fetch(u64, ctxt); + rc = insn_fetch(u64, ctxt, op->addr.mem.ea); + if (rc != X86EMUL_CONTINUE) + goto done; break; } done: @@ -1347,7 +1391,7 @@ static void fetch_bit_operand(struct x86_emulate_ctxt *ctxt) } static int read_emulated(struct x86_emulate_ctxt *ctxt, - unsigned long addr, void *dest, unsigned size) + size_t addr, void *dest, unsigned size) { int rc; struct read_cache *mc = &ctxt->mem_read; @@ -1716,7 +1760,7 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt, static void write_register_operand(struct operand *op) { - return assign_register(op->addr.reg, op->val, op->bytes); + assign_register(op->addr.reg, op->val, op->bytes); } static int writeback(struct x86_emulate_ctxt *ctxt, struct operand *op) @@ -1802,7 +1846,7 @@ static int emulate_popf(struct x86_emulate_ctxt *ctxt, void *dest, int len) { int rc; - unsigned long val, change_mask; + size_t val, change_mask; int iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> X86_EFLAGS_IOPL_BIT; int cpl = ctxt->ops->cpl(ctxt); @@ -1834,7 +1878,7 @@ static int emulate_popf(struct x86_emulate_ctxt *ctxt, break; } - *(unsigned long *)dest = + *(size_t *)dest = (ctxt->eflags & ~change_mask) | (val & change_mask); return rc; @@ -1893,7 +1937,7 @@ static int em_push_sreg(struct x86_emulate_ctxt *ctxt) static int em_pop_sreg(struct x86_emulate_ctxt *ctxt) { int seg = ctxt->src2.val; - unsigned long selector; + size_t selector; int rc; rc = emulate_pop(ctxt, &selector, 2); @@ -1901,7 +1945,7 @@ static int em_pop_sreg(struct x86_emulate_ctxt *ctxt) return rc; if (ctxt->modrm_reg == VCPU_SREG_SS) - ctxt->interruptibility = 
KVM_X86_SHADOW_INT_MOV_SS; + ctxt->interruptibility = GVM_X86_SHADOW_INT_MOV_SS; if (ctxt->op_bytes > 2) rsp_increment(ctxt, ctxt->op_bytes - 2); @@ -1911,7 +1955,7 @@ static int em_pop_sreg(struct x86_emulate_ctxt *ctxt) static int em_pusha(struct x86_emulate_ctxt *ctxt) { - unsigned long old_esp = reg_read(ctxt, VCPU_REGS_RSP); + size_t old_esp = reg_read(ctxt, VCPU_REGS_RSP); int rc = X86EMUL_CONTINUE; int reg = VCPU_REGS_RAX; @@ -1931,7 +1975,7 @@ static int em_pusha(struct x86_emulate_ctxt *ctxt) static int em_pushf(struct x86_emulate_ctxt *ctxt) { - ctxt->src.val = (unsigned long)ctxt->eflags & ~X86_EFLAGS_VM; + ctxt->src.val = (size_t)ctxt->eflags & ~X86_EFLAGS_VM; return em_push(ctxt); } @@ -2034,16 +2078,16 @@ static int emulate_int(struct x86_emulate_ctxt *ctxt, int irq) static int emulate_iret_real(struct x86_emulate_ctxt *ctxt) { int rc = X86EMUL_CONTINUE; - unsigned long temp_eip = 0; - unsigned long temp_eflags = 0; - unsigned long cs = 0; - unsigned long mask = X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF | + size_t temp_eip = 0; + size_t temp_eflags = 0; + size_t cs = 0; + size_t mask = X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF | X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_TF | X86_EFLAGS_IF | X86_EFLAGS_DF | X86_EFLAGS_OF | X86_EFLAGS_IOPL | X86_EFLAGS_NT | X86_EFLAGS_RF | X86_EFLAGS_AC | X86_EFLAGS_ID | X86_EFLAGS_FIXED; - unsigned long vm86_mask = X86_EFLAGS_VM | X86_EFLAGS_VIF | + size_t vm86_mask = X86_EFLAGS_VM | X86_EFLAGS_VIF | X86_EFLAGS_VIP; /* TODO: Add stack limit check */ @@ -2168,7 +2212,7 @@ static int em_cmpxchg8b(struct x86_emulate_ctxt *ctxt) static int em_ret(struct x86_emulate_ctxt *ctxt) { int rc; - unsigned long eip; + size_t eip; rc = emulate_pop(ctxt, &eip, ctxt->op_bytes); if (rc != X86EMUL_CONTINUE) @@ -2180,7 +2224,7 @@ static int em_ret(struct x86_emulate_ctxt *ctxt) static int em_ret_far(struct x86_emulate_ctxt *ctxt) { int rc; - unsigned long eip, cs; + size_t eip, cs; int cpl = ctxt->ops->cpl(ctxt); struct desc_struct new_desc; @@ -2267,15 +2311,26 @@ static int emulator_has_longmode(struct x86_emulate_ctxt *ctxt) return edx & bit(X86_FEATURE_LM); } -#define GET_SMSTATE(type, smbase, offset) \ - ({ \ - type __val; \ - int r = ctxt->ops->read_phys(ctxt, smbase + offset, &__val, \ - sizeof(__val)); \ - if (r != X86EMUL_CONTINUE) \ - return X86EMUL_UNHANDLEABLE; \ - __val; \ - }) +#define GET_SMSTATE(type, smbase, offset, val) \ +do { \ + type __val; \ + int __r = ctxt->ops->read_phys(ctxt, smbase + offset, &__val,\ + sizeof(__val)); \ + if (__r != X86EMUL_CONTINUE) \ + return X86EMUL_UNHANDLEABLE; \ + val = __val; \ +} while(0) + +#define __GET_SMSTATE_TYPE(type, smbase, offset) \ +static __always_inline int __get_smstate_##type(size_t smbase, size_t offset, type *val) \ +{ \ + type __val; \ + int __r = ctxt->ops->read_phys(ctxt, smbase + offset, &__val, \ + sizeof(__val)); \ + if (__r == X86EMUL_CONTINUE) \ + *val = __val; \ + return r; \ +} static void rsm_set_desc_flags(struct desc_struct *desc, u32 flags) { @@ -2294,17 +2349,21 @@ static int rsm_load_seg_32(struct x86_emulate_ctxt *ctxt, u64 smbase, int n) struct desc_struct desc; int offset; u16 selector; + u32 temp; - selector = GET_SMSTATE(u32, smbase, 0x7fa8 + n * 4); + GET_SMSTATE(int, smbase, 0x7fa8 + n * 4, selector); if (n < 3) offset = 0x7f84 + n * 12; else offset = 0x7f2c + (n - 3) * 12; - set_desc_base(&desc, GET_SMSTATE(u32, smbase, offset + 8)); - set_desc_limit(&desc, GET_SMSTATE(u32, smbase, offset + 4)); - rsm_set_desc_flags(&desc, GET_SMSTATE(u32, smbase, offset)); + 
GET_SMSTATE(u32, smbase, offset + 8, temp); + set_desc_base(&desc, temp); + GET_SMSTATE(u32, smbase, offset + 4, temp); + set_desc_limit(&desc, temp); + GET_SMSTATE(u32, smbase, offset, temp); + rsm_set_desc_flags(&desc, temp); ctxt->ops->set_segment(ctxt, selector, &desc, 0, n); return X86EMUL_CONTINUE; } @@ -2313,16 +2372,19 @@ static int rsm_load_seg_64(struct x86_emulate_ctxt *ctxt, u64 smbase, int n) { struct desc_struct desc; int offset; - u16 selector; - u32 base3; + u16 selector, temp16; + u32 base3, temp; offset = 0x7e00 + n * 16; - selector = GET_SMSTATE(u16, smbase, offset); - rsm_set_desc_flags(&desc, GET_SMSTATE(u16, smbase, offset + 2) << 8); - set_desc_limit(&desc, GET_SMSTATE(u32, smbase, offset + 4)); - set_desc_base(&desc, GET_SMSTATE(u32, smbase, offset + 8)); - base3 = GET_SMSTATE(u32, smbase, offset + 12); + GET_SMSTATE(u16, smbase, offset, selector); + GET_SMSTATE(u16, smbase, offset + 2, temp16); + rsm_set_desc_flags(&desc, temp16 << 8); + GET_SMSTATE(u32, smbase, offset + 4, temp); + set_desc_limit(&desc, temp); + GET_SMSTATE(u32, smbase, offset + 8, temp); + set_desc_base(&desc, temp); + GET_SMSTATE(u32, smbase, offset + 12, base3); ctxt->ops->set_segment(ctxt, selector, &desc, base3, n); return X86EMUL_CONTINUE; @@ -2362,38 +2424,47 @@ static int rsm_load_state_32(struct x86_emulate_ctxt *ctxt, u64 smbase) u16 selector; u32 val, cr0, cr4; int i; + u32 temp; - cr0 = GET_SMSTATE(u32, smbase, 0x7ffc); - ctxt->ops->set_cr(ctxt, 3, GET_SMSTATE(u32, smbase, 0x7ff8)); - ctxt->eflags = GET_SMSTATE(u32, smbase, 0x7ff4) | X86_EFLAGS_FIXED; - ctxt->_eip = GET_SMSTATE(u32, smbase, 0x7ff0); + GET_SMSTATE(u32, smbase, 0x7ffc, cr0); + GET_SMSTATE(u32, smbase, 0x7ff8, temp); + ctxt->ops->set_cr(ctxt, 3, temp); + GET_SMSTATE(u32, smbase, 0x7ff4, ctxt->eflags); + ctxt->eflags |= X86_EFLAGS_FIXED; + GET_SMSTATE(u32, smbase, 0x7ff0, ctxt->_eip); for (i = 0; i < 8; i++) - *reg_write(ctxt, i) = GET_SMSTATE(u32, smbase, 0x7fd0 + i * 4); + GET_SMSTATE(u32, smbase, 0x7fd0 + i * 4, *reg_write(ctxt, i)); - val = GET_SMSTATE(u32, smbase, 0x7fcc); + GET_SMSTATE(u32, smbase, 0x7fcc, val); ctxt->ops->set_dr(ctxt, 6, (val & DR6_VOLATILE) | DR6_FIXED_1); - val = GET_SMSTATE(u32, smbase, 0x7fc8); + GET_SMSTATE(u32, smbase, 0x7fc8, val); ctxt->ops->set_dr(ctxt, 7, (val & DR7_VOLATILE) | DR7_FIXED_1); - selector = GET_SMSTATE(u32, smbase, 0x7fc4); - set_desc_base(&desc, GET_SMSTATE(u32, smbase, 0x7f64)); - set_desc_limit(&desc, GET_SMSTATE(u32, smbase, 0x7f60)); - rsm_set_desc_flags(&desc, GET_SMSTATE(u32, smbase, 0x7f5c)); + GET_SMSTATE(u32, smbase, 0x7fc4, selector); + GET_SMSTATE(u32, smbase, 0x7f64, temp); + set_desc_base(&desc, temp); + GET_SMSTATE(u32, smbase, 0x7f60, temp); + set_desc_limit(&desc, temp); + GET_SMSTATE(u32, smbase, 0x7f5c, temp); + rsm_set_desc_flags(&desc, temp); ctxt->ops->set_segment(ctxt, selector, &desc, 0, VCPU_SREG_TR); - selector = GET_SMSTATE(u32, smbase, 0x7fc0); - set_desc_base(&desc, GET_SMSTATE(u32, smbase, 0x7f80)); - set_desc_limit(&desc, GET_SMSTATE(u32, smbase, 0x7f7c)); - rsm_set_desc_flags(&desc, GET_SMSTATE(u32, smbase, 0x7f78)); + GET_SMSTATE(u32, smbase, 0x7fc0, selector); + GET_SMSTATE(u32, smbase, 0x7f80, temp); + set_desc_base(&desc, temp); + GET_SMSTATE(u32, smbase, 0x7f7c, temp); + set_desc_limit(&desc, temp); + GET_SMSTATE(u32, smbase, 0x7f78, temp); + rsm_set_desc_flags(&desc, temp); ctxt->ops->set_segment(ctxt, selector, &desc, 0, VCPU_SREG_LDTR); - dt.address = GET_SMSTATE(u32, smbase, 0x7f74); - dt.size = GET_SMSTATE(u32, smbase, 0x7f70); + 
GET_SMSTATE(u32, smbase, 0x7f74, dt.address); + GET_SMSTATE(u32, smbase, 0x7f70, dt.size); ctxt->ops->set_gdt(ctxt, &dt); - dt.address = GET_SMSTATE(u32, smbase, 0x7f58); - dt.size = GET_SMSTATE(u32, smbase, 0x7f54); + GET_SMSTATE(u32, smbase, 0x7f58, dt.address); + GET_SMSTATE(u32, smbase, 0x7f54, dt.size); ctxt->ops->set_idt(ctxt, &dt); for (i = 0; i < 6; i++) { @@ -2402,9 +2473,10 @@ static int rsm_load_state_32(struct x86_emulate_ctxt *ctxt, u64 smbase) return r; } - cr4 = GET_SMSTATE(u32, smbase, 0x7f14); + GET_SMSTATE(u32, smbase, 0x7f14, cr4); - ctxt->ops->set_smbase(ctxt, GET_SMSTATE(u32, smbase, 0x7ef8)); + GET_SMSTATE(u32, smbase, 0x7ef8, temp); + ctxt->ops->set_smbase(ctxt, temp); return rsm_enter_protected_mode(ctxt, cr0, cr4); } @@ -2417,45 +2489,56 @@ static int rsm_load_state_64(struct x86_emulate_ctxt *ctxt, u64 smbase) u32 base3; u16 selector; int i, r; + u64 temp64; + u32 temp = 0; for (i = 0; i < 16; i++) - *reg_write(ctxt, i) = GET_SMSTATE(u64, smbase, 0x7ff8 - i * 8); + GET_SMSTATE(u64, smbase, 0x7ff8 - i * 8, *reg_write(ctxt, i)); - ctxt->_eip = GET_SMSTATE(u64, smbase, 0x7f78); - ctxt->eflags = GET_SMSTATE(u32, smbase, 0x7f70) | X86_EFLAGS_FIXED; + GET_SMSTATE(u64, smbase, 0x7f78, ctxt->_eip); + GET_SMSTATE(u32, smbase, 0x7f70, ctxt->eflags); + ctxt->eflags |= X86_EFLAGS_FIXED; - val = GET_SMSTATE(u32, smbase, 0x7f68); + GET_SMSTATE(u32, smbase, 0x7f68, val); ctxt->ops->set_dr(ctxt, 6, (val & DR6_VOLATILE) | DR6_FIXED_1); - val = GET_SMSTATE(u32, smbase, 0x7f60); + GET_SMSTATE(u32, smbase, 0x7f60, val); ctxt->ops->set_dr(ctxt, 7, (val & DR7_VOLATILE) | DR7_FIXED_1); - cr0 = GET_SMSTATE(u64, smbase, 0x7f58); - ctxt->ops->set_cr(ctxt, 3, GET_SMSTATE(u64, smbase, 0x7f50)); - cr4 = GET_SMSTATE(u64, smbase, 0x7f48); - ctxt->ops->set_smbase(ctxt, GET_SMSTATE(u32, smbase, 0x7f00)); - val = GET_SMSTATE(u64, smbase, 0x7ed0); + GET_SMSTATE(u64, smbase, 0x7f58, cr0); + GET_SMSTATE(u64, smbase, 0x7f50, temp64); + ctxt->ops->set_cr(ctxt, 3, temp); + GET_SMSTATE(u64, smbase, 0x7f48, cr4); + GET_SMSTATE(u32, smbase, 0x7f00, temp); + ctxt->ops->set_smbase(ctxt, temp); + GET_SMSTATE(u64, smbase, 0x7ed0, val); ctxt->ops->set_msr(ctxt, MSR_EFER, val & ~EFER_LMA); - selector = GET_SMSTATE(u32, smbase, 0x7e90); - rsm_set_desc_flags(&desc, GET_SMSTATE(u32, smbase, 0x7e92) << 8); - set_desc_limit(&desc, GET_SMSTATE(u32, smbase, 0x7e94)); - set_desc_base(&desc, GET_SMSTATE(u32, smbase, 0x7e98)); - base3 = GET_SMSTATE(u32, smbase, 0x7e9c); + GET_SMSTATE(u32, smbase, 0x7e90, selector); + GET_SMSTATE(u32, smbase, 0x7e92, temp); + rsm_set_desc_flags(&desc, temp << 8); + GET_SMSTATE(u32, smbase, 0x7e94, temp); + set_desc_limit(&desc, temp); + GET_SMSTATE(u32, smbase, 0x7e98, temp); + set_desc_base(&desc, temp); + GET_SMSTATE(u32, smbase, 0x7e9c, base3); ctxt->ops->set_segment(ctxt, selector, &desc, base3, VCPU_SREG_TR); - dt.size = GET_SMSTATE(u32, smbase, 0x7e84); - dt.address = GET_SMSTATE(u64, smbase, 0x7e88); + GET_SMSTATE(u32, smbase, 0x7e84, dt.size); + GET_SMSTATE(u64, smbase, 0x7e88, dt.address); ctxt->ops->set_idt(ctxt, &dt); - selector = GET_SMSTATE(u32, smbase, 0x7e70); - rsm_set_desc_flags(&desc, GET_SMSTATE(u32, smbase, 0x7e72) << 8); - set_desc_limit(&desc, GET_SMSTATE(u32, smbase, 0x7e74)); - set_desc_base(&desc, GET_SMSTATE(u32, smbase, 0x7e78)); - base3 = GET_SMSTATE(u32, smbase, 0x7e7c); + GET_SMSTATE(u32, smbase, 0x7e70, selector); + GET_SMSTATE(u32, smbase, 0x7e72, temp); + rsm_set_desc_flags(&desc, temp << 8); + GET_SMSTATE(u32, smbase, 0x7e74, temp); + set_desc_limit(&desc, 
temp); + GET_SMSTATE(u32, smbase, 0x7e78, temp); + set_desc_base(&desc, temp); + GET_SMSTATE(u32, smbase, 0x7e7c, base3); ctxt->ops->set_segment(ctxt, selector, &desc, base3, VCPU_SREG_LDTR); - dt.size = GET_SMSTATE(u32, smbase, 0x7e64); - dt.address = GET_SMSTATE(u64, smbase, 0x7e68); + GET_SMSTATE(u32, smbase, 0x7e64, dt.size); + GET_SMSTATE(u64, smbase, 0x7e68, dt.address); ctxt->ops->set_gdt(ctxt, &dt); r = rsm_enter_protected_mode(ctxt, cr0, cr4); @@ -2473,7 +2556,7 @@ static int rsm_load_state_64(struct x86_emulate_ctxt *ctxt, u64 smbase) static int em_rsm(struct x86_emulate_ctxt *ctxt) { - unsigned long cr0, cr4, efer; + size_t cr0, cr4, efer; u64 smbase; int ret; @@ -2806,7 +2889,7 @@ static bool emulator_io_port_access_allowed(struct x86_emulate_ctxt *ctxt, int r; u16 tr, io_bitmap_ptr, perm, bit_idx = port & 0x7; unsigned mask = (1 << len) - 1; - unsigned long base; + size_t base; ops->get_segment(ctxt, &tr, &tr_seg, &base3, VCPU_SREG_TR); if (!tr_seg.p) @@ -3226,7 +3309,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt, if (has_error_code) { ctxt->op_bytes = ctxt->ad_bytes = (next_tss_desc.type & 8) ? 4 : 2; ctxt->lock_prefix = 0; - ctxt->src.val = (unsigned long) error_code; + ctxt->src.val = (size_t) error_code; ret = em_push(ctxt); } @@ -3260,7 +3343,7 @@ int emulator_task_switch(struct x86_emulate_ctxt *ctxt, static void string_addr_inc(struct x86_emulate_ctxt *ctxt, int reg, struct operand *op) { - int df = (ctxt->eflags & X86_EFLAGS_DF) ? -op->count : op->count; + int df = (ctxt->eflags & X86_EFLAGS_DF) ? -(int)op->count : op->count; register_address_increment(ctxt, reg, df * op->bytes); op->addr.mem.ea = register_address(ctxt, reg); @@ -3349,7 +3432,7 @@ static int em_call(struct x86_emulate_ctxt *ctxt) int rc; long rel = ctxt->src.val; - ctxt->src.val = (unsigned long)ctxt->_eip; + ctxt->src.val = (size_t)ctxt->_eip; rc = jmp_rel(ctxt, rel); if (rc != X86EMUL_CONTINUE) return rc; @@ -3389,7 +3472,7 @@ static int em_call_far(struct x86_emulate_ctxt *ctxt) /* If we failed, we tainted the memory, but the very least we should restore cs */ if (rc != X86EMUL_CONTINUE) { - pr_warn_once("faulting far call emulation tainted memory\n"); + //pr_warn_once("faulting far call emulation tainted memory\n"); goto fail; } return rc; @@ -3403,7 +3486,7 @@ fail: static int em_ret_near_imm(struct x86_emulate_ctxt *ctxt) { int rc; - unsigned long eip; + size_t eip; rc = emulate_pop(ctxt, &eip, ctxt->op_bytes); if (rc != X86EMUL_CONTINUE) @@ -3496,7 +3579,7 @@ static int em_movbe(struct x86_emulate_ctxt *ctxt) */ tmp = (u16)ctxt->src.val; ctxt->dst.val &= ~0xffffUL; - ctxt->dst.val |= (unsigned long)swab16(tmp); + ctxt->dst.val |= (size_t)swab16(tmp); break; case 4: ctxt->dst.val = swab32((u32)ctxt->src.val); @@ -3522,7 +3605,7 @@ static int em_cr_write(struct x86_emulate_ctxt *ctxt) static int em_dr_write(struct x86_emulate_ctxt *ctxt) { - unsigned long val; + size_t val; if (ctxt->mode == X86EMUL_MODE_PROT64) val = ctxt->src.val & ~0ULL; @@ -3581,7 +3664,7 @@ static int em_mov_sreg_rm(struct x86_emulate_ctxt *ctxt) return emulate_ud(ctxt); if (ctxt->modrm_reg == VCPU_SREG_SS) - ctxt->interruptibility = KVM_X86_SHADOW_INT_MOV_SS; + ctxt->interruptibility = GVM_X86_SHADOW_INT_MOV_SS; /* Disable writeback. 
*/ ctxt->dst.type = OP_NONE; @@ -3672,6 +3755,8 @@ static int em_sidt(struct x86_emulate_ctxt *ctxt) return emulate_store_desc_ptr(ctxt, ctxt->ops->get_idt); } +// Disable VC warning for unaligned access in desc_ptr +#pragma warning(disable : 4366) static int em_lgdt_lidt(struct x86_emulate_ctxt *ctxt, bool lgdt) { struct desc_ptr desc_ptr; @@ -3695,6 +3780,7 @@ static int em_lgdt_lidt(struct x86_emulate_ctxt *ctxt, bool lgdt) ctxt->dst.type = OP_NONE; return X86EMUL_CONTINUE; } +#pragma warning(default : 4366) static int em_lgdt(struct x86_emulate_ctxt *ctxt) { @@ -3776,7 +3862,7 @@ static int em_sti(struct x86_emulate_ctxt *ctxt) if (emulator_bad_iopl(ctxt)) return emulate_gp(ctxt, 0); - ctxt->interruptibility = KVM_X86_SHADOW_INT_STI; + ctxt->interruptibility = GVM_X86_SHADOW_INT_STI; ctxt->eflags |= X86_EFLAGS_IF; return X86EMUL_CONTINUE; } @@ -3820,11 +3906,11 @@ static int em_bswap(struct x86_emulate_ctxt *ctxt) switch (ctxt->op_bytes) { #ifdef CONFIG_X86_64 case 8: - asm("bswap %0" : "+r"(ctxt->dst.val)); + __bswap64(&ctxt->dst.val); break; #endif default: - asm("bswap %0" : "+r"(*(u32 *)&ctxt->dst.val)); + __bswap32((u32 *)&ctxt->dst.val); break; } return X86EMUL_CONTINUE; @@ -3846,7 +3932,9 @@ static bool valid_cr(int nr) { switch (nr) { case 0: - case 2 ... 4: + case 2: + case 3: + case 4: case 8: return true; default: @@ -3925,7 +4013,7 @@ static int check_cr_write(struct x86_emulate_ctxt *ctxt) static int check_dr7_gd(struct x86_emulate_ctxt *ctxt) { - unsigned long dr7; + size_t dr7; ctxt->ops->get_dr(ctxt, 7, &dr7); @@ -4575,16 +4663,24 @@ static int decode_imm(struct x86_emulate_ctxt *ctxt, struct operand *op, /* NB. Immediates are sign-extended as necessary. */ switch (op->bytes) { case 1: - op->val = insn_fetch(s8, ctxt); + rc = insn_fetch(s8, ctxt, op->val); + if (rc != X86EMUL_CONTINUE) + goto done; break; case 2: - op->val = insn_fetch(s16, ctxt); + rc = insn_fetch(s16, ctxt, op->val); + if (rc != X86EMUL_CONTINUE) + goto done; break; case 4: - op->val = insn_fetch(s32, ctxt); + rc = insn_fetch(s32, ctxt, op->val); + if (rc != X86EMUL_CONTINUE) + goto done; break; case 8: - op->val = insn_fetch(s64, ctxt); + rc = insn_fetch(s64, ctxt, (s64)op->val); + if (rc != X86EMUL_CONTINUE) + goto done; break; } if (!sign_extension) { @@ -4766,7 +4862,6 @@ static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op, break; } -done: return rc; } @@ -4817,7 +4912,10 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len) /* Legacy prefixes. */ for (;;) { - switch (ctxt->b = insn_fetch(u8, ctxt)) { + rc = insn_fetch(u8, ctxt, ctxt->b); + if (rc != X86EMUL_CONTINUE) + goto done; + switch (ctxt->b) { case 0x66: /* operand-size override */ op_prefix = true; /* switch between 2/4 bytes */ @@ -4843,7 +4941,22 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len) has_seg_override = true; ctxt->seg_override = ctxt->b & 7; break; - case 0x40 ... 0x4f: /* REX */ + case 0x40: + case 0x41: + case 0x42: + case 0x43: + case 0x44: + case 0x45: + case 0x46: + case 0x47: + case 0x48: + case 0x49: + case 0x4a: + case 0x4b: + case 0x4c: + case 0x4d: + case 0x4e: + case 0x4f: /* REX */ if (mode != X86EMUL_MODE_PROT64) goto done_prefixes; ctxt->rex_prefix = ctxt->b; @@ -4875,20 +4988,27 @@ done_prefixes: /* Two-byte opcode? 
*/ if (ctxt->b == 0x0f) { ctxt->opcode_len = 2; - ctxt->b = insn_fetch(u8, ctxt); + rc = insn_fetch(u8, ctxt, ctxt->b); + if (rc != X86EMUL_CONTINUE) + goto done; opcode = twobyte_table[ctxt->b]; /* 0F_38 opcode map */ if (ctxt->b == 0x38) { ctxt->opcode_len = 3; - ctxt->b = insn_fetch(u8, ctxt); + rc = insn_fetch(u8, ctxt, ctxt->b); + if (rc != X86EMUL_CONTINUE) + goto done; opcode = opcode_map_0f_38[ctxt->b]; } } ctxt->d = opcode.flags; - if (ctxt->d & ModRM) - ctxt->modrm = insn_fetch(u8, ctxt); + if (ctxt->d & ModRM) { + rc = insn_fetch(u8, ctxt, ctxt->modrm); + if (rc != X86EMUL_CONTINUE) + goto done; + } /* vex-prefix instructions are not implemented */ if (ctxt->opcode_len == 1 && (ctxt->b == 0xc5 || ctxt->b == 0xc4) && @@ -5069,15 +5189,11 @@ static int flush_pending_x87_faults(struct x86_emulate_ctxt *ctxt) bool fault = false; ctxt->ops->get_fpu(ctxt); - asm volatile("1: fwait \n\t" - "2: \n\t" - ".pushsection .fixup,\"ax\" \n\t" - "3: \n\t" - "movb $1, %[fault] \n\t" - "jmp 2b \n\t" - ".popsection \n\t" - _ASM_EXTABLE(1b, 3b) - : [fault]"+qm"(fault)); + __try { + __fwait(); + } __except(EXCEPTION_EXECUTE_HANDLER) { + fault = true; + } ctxt->ops->put_fpu(ctxt); if (unlikely(fault)) @@ -5093,18 +5209,17 @@ static void fetch_possible_mmx_operand(struct x86_emulate_ctxt *ctxt, read_mmx_reg(ctxt, &op->mm_val, op->addr.mm); } +extern void __asm_fastop(size_t *flags,void *fop, + struct x86_emulate_ctxt *ctxt); static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *)) { - register void *__sp asm(_ASM_SP); - ulong flags = (ctxt->eflags & EFLAGS_MASK) | X86_EFLAGS_IF; + size_t flags = (ctxt->eflags & EFLAGS_MASK) | X86_EFLAGS_IF; + char *__fop = (char *)fop; if (!(ctxt->d & ByteOp)) - fop += __ffs(ctxt->dst.bytes) * FASTOP_SIZE; + __fop += __ffs(ctxt->dst.bytes) * FASTOP_SIZE; - asm("push %[flags]; popf; call *%[fastop]; pushf; pop %[flags]\n" - : "+a"(ctxt->dst.val), "+d"(ctxt->src.val), [flags]"+D"(flags), - [fastop]"+S"(fop), "+r"(__sp) - : "c"(ctxt->src2.val)); + __asm_fastop(&flags, __fop, ctxt); ctxt->eflags = (ctxt->eflags & ~EFLAGS_MASK) | (flags & EFLAGS_MASK); if (!fop) /* exception is returned in fop variable */ @@ -5115,7 +5230,7 @@ static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *)) void init_decode_cache(struct x86_emulate_ctxt *ctxt) { memset(&ctxt->rip_relative, 0, - (void *)&ctxt->modrm - (void *)&ctxt->rip_relative); + (char *)&ctxt->modrm - (char *)&ctxt->rip_relative); ctxt->io_read.pos = 0; ctxt->io_read.end = 0; @@ -5289,14 +5404,36 @@ special_insn: goto threebyte_insn; switch (ctxt->b) { - case 0x70 ... 0x7f: /* jcc (short) */ + case 0x70: /* jcc (short) */ + case 0x71: + case 0x72: + case 0x73: + case 0x74: + case 0x75: + case 0x76: + case 0x77: + case 0x78: + case 0x79: + case 0x7a: + case 0x7b: + case 0x7c: + case 0x7d: + case 0x7e: + case 0x7f: if (test_cc(ctxt->b, ctxt->eflags)) rc = jmp_rel(ctxt, ctxt->src.val); break; case 0x8d: /* lea r16/r32, m */ ctxt->dst.val = ctxt->src.addr.mem.ea; break; - case 0x90 ... 
0x97: /* nop / xchg reg, rax */ + case 0x90: /* nop / xchg reg, rax */ + case 0x91: + case 0x92: + case 0x93: + case 0x94: + case 0x95: + case 0x96: + case 0x97: if (ctxt->dst.addr.reg == reg_rmw(ctxt, VCPU_REGS_RAX)) ctxt->dst.type = OP_NONE; else @@ -5382,7 +5519,7 @@ writeback: count = ctxt->src.count; else count = ctxt->dst.count; - register_address_increment(ctxt, VCPU_REGS_RCX, -count); + register_address_increment(ctxt, VCPU_REGS_RCX, -(int)count); if (!string_insn_completed(ctxt)) { /* @@ -5436,25 +5573,72 @@ twobyte_insn: case 0x21: /* mov from dr to reg */ ops->get_dr(ctxt, ctxt->modrm_reg, &ctxt->dst.val); break; - case 0x40 ... 0x4f: /* cmov */ + case 0x40: /* cmov */ + case 0x41: + case 0x42: + case 0x43: + case 0x44: + case 0x45: + case 0x46: + case 0x47: + case 0x48: + case 0x49: + case 0x4a: + case 0x4b: + case 0x4c: + case 0x4d: + case 0x4e: + case 0x4f: if (test_cc(ctxt->b, ctxt->eflags)) ctxt->dst.val = ctxt->src.val; else if (ctxt->op_bytes != 4) ctxt->dst.type = OP_NONE; /* no writeback */ break; - case 0x80 ... 0x8f: /* jnz rel, etc*/ + case 0x80: /* jnz rel, etc*/ + case 0x81: + case 0x82: + case 0x83: + case 0x84: + case 0x85: + case 0x86: + case 0x87: + case 0x88: + case 0x89: + case 0x8a: + case 0x8b: + case 0x8c: + case 0x8d: + case 0x8e: + case 0x8f: if (test_cc(ctxt->b, ctxt->eflags)) rc = jmp_rel(ctxt, ctxt->src.val); break; - case 0x90 ... 0x9f: /* setcc r/m8 */ + case 0x90: /* setcc r/m8 */ + case 0x91: + case 0x92: + case 0x93: + case 0x94: + case 0x95: + case 0x96: + case 0x97: + case 0x98: + case 0x99: + case 0x9a: + case 0x9b: + case 0x9c: + case 0x9d: + case 0x9e: + case 0x9f: ctxt->dst.val = test_cc(ctxt->b, ctxt->eflags); break; - case 0xb6 ... 0xb7: /* movzx */ + case 0xb6: /* movzx */ + case 0xb7: ctxt->dst.bytes = ctxt->op_bytes; ctxt->dst.val = (ctxt->src.bytes == 1) ? (u8) ctxt->src.val : (u16) ctxt->src.val; break; - case 0xbe ... 0xbf: /* movsx */ + case 0xbe: /* movsx */ + case 0xbf: ctxt->dst.bytes = ctxt->op_bytes; ctxt->dst.val = (ctxt->src.bytes == 1) ? (s8) ctxt->src.val : (s16) ctxt->src.val; diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c deleted file mode 100644 index 42b1c83..0000000 --- a/arch/x86/kvm/hyperv.c +++ /dev/null @@ -1,1266 +0,0 @@ -/* - * KVM Microsoft Hyper-V emulation - * - * derived from arch/x86/kvm/x86.c - * - * Copyright (C) 2006 Qumranet, Inc. - * Copyright (C) 2008 Qumranet, Inc. - * Copyright IBM Corporation, 2008 - * Copyright 2010 Red Hat, Inc. and/or its affiliates. - * Copyright (C) 2015 Andrey Smetanin <asmetanin@virtuozzo.com> - * - * Authors: - * Avi Kivity <avi@qumranet.com> - * Yaniv Kamay <yaniv@qumranet.com> - * Amit Shah <amit.shah@qumranet.com> - * Ben-Ami Yassour <benami@il.ibm.com> - * Andrey Smetanin <asmetanin@virtuozzo.com> - * - * This work is licensed under the terms of the GNU GPL, version 2. See - * the COPYING file in the top-level directory. 
- * - */ - -#include "x86.h" -#include "lapic.h" -#include "ioapic.h" -#include "hyperv.h" - -#include <linux/kvm_host.h> -#include <linux/highmem.h> -#include <asm/apicdef.h> -#include <trace/events/kvm.h> - -#include "trace.h" - -static inline u64 synic_read_sint(struct kvm_vcpu_hv_synic *synic, int sint) -{ - return atomic64_read(&synic->sint[sint]); -} - -static inline int synic_get_sint_vector(u64 sint_value) -{ - if (sint_value & HV_SYNIC_SINT_MASKED) - return -1; - return sint_value & HV_SYNIC_SINT_VECTOR_MASK; -} - -static bool synic_has_vector_connected(struct kvm_vcpu_hv_synic *synic, - int vector) -{ - int i; - - for (i = 0; i < ARRAY_SIZE(synic->sint); i++) { - if (synic_get_sint_vector(synic_read_sint(synic, i)) == vector) - return true; - } - return false; -} - -static bool synic_has_vector_auto_eoi(struct kvm_vcpu_hv_synic *synic, - int vector) -{ - int i; - u64 sint_value; - - for (i = 0; i < ARRAY_SIZE(synic->sint); i++) { - sint_value = synic_read_sint(synic, i); - if (synic_get_sint_vector(sint_value) == vector && - sint_value & HV_SYNIC_SINT_AUTO_EOI) - return true; - } - return false; -} - -static int synic_set_sint(struct kvm_vcpu_hv_synic *synic, int sint, - u64 data, bool host) -{ - int vector; - - vector = data & HV_SYNIC_SINT_VECTOR_MASK; - if (vector < 16 && !host) - return 1; - /* - * Guest may configure multiple SINTs to use the same vector, so - * we maintain a bitmap of vectors handled by synic, and a - * bitmap of vectors with auto-eoi behavior. The bitmaps are - * updated here, and atomically queried on fast paths. - */ - - atomic64_set(&synic->sint[sint], data); - - if (synic_has_vector_connected(synic, vector)) - __set_bit(vector, synic->vec_bitmap); - else - __clear_bit(vector, synic->vec_bitmap); - - if (synic_has_vector_auto_eoi(synic, vector)) - __set_bit(vector, synic->auto_eoi_bitmap); - else - __clear_bit(vector, synic->auto_eoi_bitmap); - - /* Load SynIC vectors into EOI exit bitmap */ - kvm_make_request(KVM_REQ_SCAN_IOAPIC, synic_to_vcpu(synic)); - return 0; -} - -static struct kvm_vcpu_hv_synic *synic_get(struct kvm *kvm, u32 vcpu_id) -{ - struct kvm_vcpu *vcpu; - struct kvm_vcpu_hv_synic *synic; - - if (vcpu_id >= atomic_read(&kvm->online_vcpus)) - return NULL; - vcpu = kvm_get_vcpu(kvm, vcpu_id); - if (!vcpu) - return NULL; - synic = vcpu_to_synic(vcpu); - return (synic->active) ? 
synic : NULL; -} - -static void synic_clear_sint_msg_pending(struct kvm_vcpu_hv_synic *synic, - u32 sint) -{ - struct kvm_vcpu *vcpu = synic_to_vcpu(synic); - struct page *page; - gpa_t gpa; - struct hv_message *msg; - struct hv_message_page *msg_page; - - gpa = synic->msg_page & PAGE_MASK; - page = kvm_vcpu_gfn_to_page(vcpu, gpa >> PAGE_SHIFT); - if (is_error_page(page)) { - vcpu_err(vcpu, "Hyper-V SynIC can't get msg page, gpa 0x%llx\n", - gpa); - return; - } - msg_page = kmap_atomic(page); - - msg = &msg_page->sint_message[sint]; - msg->header.message_flags.msg_pending = 0; - - kunmap_atomic(msg_page); - kvm_release_page_dirty(page); - kvm_vcpu_mark_page_dirty(vcpu, gpa >> PAGE_SHIFT); -} - -static void kvm_hv_notify_acked_sint(struct kvm_vcpu *vcpu, u32 sint) -{ - struct kvm *kvm = vcpu->kvm; - struct kvm_vcpu_hv_synic *synic = vcpu_to_synic(vcpu); - struct kvm_vcpu_hv *hv_vcpu = vcpu_to_hv_vcpu(vcpu); - struct kvm_vcpu_hv_stimer *stimer; - int gsi, idx, stimers_pending; - - trace_kvm_hv_notify_acked_sint(vcpu->vcpu_id, sint); - - if (synic->msg_page & HV_SYNIC_SIMP_ENABLE) - synic_clear_sint_msg_pending(synic, sint); - - /* Try to deliver pending Hyper-V SynIC timers messages */ - stimers_pending = 0; - for (idx = 0; idx < ARRAY_SIZE(hv_vcpu->stimer); idx++) { - stimer = &hv_vcpu->stimer[idx]; - if (stimer->msg_pending && - (stimer->config & HV_STIMER_ENABLE) && - HV_STIMER_SINT(stimer->config) == sint) { - set_bit(stimer->index, - hv_vcpu->stimer_pending_bitmap); - stimers_pending++; - } - } - if (stimers_pending) - kvm_make_request(KVM_REQ_HV_STIMER, vcpu); - - idx = srcu_read_lock(&kvm->irq_srcu); - gsi = atomic_read(&synic->sint_to_gsi[sint]); - if (gsi != -1) - kvm_notify_acked_gsi(kvm, gsi); - srcu_read_unlock(&kvm->irq_srcu, idx); -} - -static void synic_exit(struct kvm_vcpu_hv_synic *synic, u32 msr) -{ - struct kvm_vcpu *vcpu = synic_to_vcpu(synic); - struct kvm_vcpu_hv *hv_vcpu = &vcpu->arch.hyperv; - - hv_vcpu->exit.type = KVM_EXIT_HYPERV_SYNIC; - hv_vcpu->exit.u.synic.msr = msr; - hv_vcpu->exit.u.synic.control = synic->control; - hv_vcpu->exit.u.synic.evt_page = synic->evt_page; - hv_vcpu->exit.u.synic.msg_page = synic->msg_page; - - kvm_make_request(KVM_REQ_HV_EXIT, vcpu); -} - -static int synic_set_msr(struct kvm_vcpu_hv_synic *synic, - u32 msr, u64 data, bool host) -{ - struct kvm_vcpu *vcpu = synic_to_vcpu(synic); - int ret; - - if (!synic->active) - return 1; - - trace_kvm_hv_synic_set_msr(vcpu->vcpu_id, msr, data, host); - - ret = 0; - switch (msr) { - case HV_X64_MSR_SCONTROL: - synic->control = data; - if (!host) - synic_exit(synic, msr); - break; - case HV_X64_MSR_SVERSION: - if (!host) { - ret = 1; - break; - } - synic->version = data; - break; - case HV_X64_MSR_SIEFP: - if (data & HV_SYNIC_SIEFP_ENABLE) - if (kvm_clear_guest(vcpu->kvm, - data & PAGE_MASK, PAGE_SIZE)) { - ret = 1; - break; - } - synic->evt_page = data; - if (!host) - synic_exit(synic, msr); - break; - case HV_X64_MSR_SIMP: - if (data & HV_SYNIC_SIMP_ENABLE) - if (kvm_clear_guest(vcpu->kvm, - data & PAGE_MASK, PAGE_SIZE)) { - ret = 1; - break; - } - synic->msg_page = data; - if (!host) - synic_exit(synic, msr); - break; - case HV_X64_MSR_EOM: { - int i; - - for (i = 0; i < ARRAY_SIZE(synic->sint); i++) - kvm_hv_notify_acked_sint(vcpu, i); - break; - } - case HV_X64_MSR_SINT0 ... 
HV_X64_MSR_SINT15: - ret = synic_set_sint(synic, msr - HV_X64_MSR_SINT0, data, host); - break; - default: - ret = 1; - break; - } - return ret; -} - -static int synic_get_msr(struct kvm_vcpu_hv_synic *synic, u32 msr, u64 *pdata) -{ - int ret; - - if (!synic->active) - return 1; - - ret = 0; - switch (msr) { - case HV_X64_MSR_SCONTROL: - *pdata = synic->control; - break; - case HV_X64_MSR_SVERSION: - *pdata = synic->version; - break; - case HV_X64_MSR_SIEFP: - *pdata = synic->evt_page; - break; - case HV_X64_MSR_SIMP: - *pdata = synic->msg_page; - break; - case HV_X64_MSR_EOM: - *pdata = 0; - break; - case HV_X64_MSR_SINT0 ... HV_X64_MSR_SINT15: - *pdata = atomic64_read(&synic->sint[msr - HV_X64_MSR_SINT0]); - break; - default: - ret = 1; - break; - } - return ret; -} - -int synic_set_irq(struct kvm_vcpu_hv_synic *synic, u32 sint) -{ - struct kvm_vcpu *vcpu = synic_to_vcpu(synic); - struct kvm_lapic_irq irq; - int ret, vector; - - if (sint >= ARRAY_SIZE(synic->sint)) - return -EINVAL; - - vector = synic_get_sint_vector(synic_read_sint(synic, sint)); - if (vector < 0) - return -ENOENT; - - memset(&irq, 0, sizeof(irq)); - irq.dest_id = kvm_apic_id(vcpu->arch.apic); - irq.dest_mode = APIC_DEST_PHYSICAL; - irq.delivery_mode = APIC_DM_FIXED; - irq.vector = vector; - irq.level = 1; - - ret = kvm_irq_delivery_to_apic(vcpu->kvm, NULL, &irq, NULL); - trace_kvm_hv_synic_set_irq(vcpu->vcpu_id, sint, irq.vector, ret); - return ret; -} - -int kvm_hv_synic_set_irq(struct kvm *kvm, u32 vcpu_id, u32 sint) -{ - struct kvm_vcpu_hv_synic *synic; - - synic = synic_get(kvm, vcpu_id); - if (!synic) - return -EINVAL; - - return synic_set_irq(synic, sint); -} - -void kvm_hv_synic_send_eoi(struct kvm_vcpu *vcpu, int vector) -{ - struct kvm_vcpu_hv_synic *synic = vcpu_to_synic(vcpu); - int i; - - trace_kvm_hv_synic_send_eoi(vcpu->vcpu_id, vector); - - for (i = 0; i < ARRAY_SIZE(synic->sint); i++) - if (synic_get_sint_vector(synic_read_sint(synic, i)) == vector) - kvm_hv_notify_acked_sint(vcpu, i); -} - -static int kvm_hv_set_sint_gsi(struct kvm *kvm, u32 vcpu_id, u32 sint, int gsi) -{ - struct kvm_vcpu_hv_synic *synic; - - synic = synic_get(kvm, vcpu_id); - if (!synic) - return -EINVAL; - - if (sint >= ARRAY_SIZE(synic->sint_to_gsi)) - return -EINVAL; - - atomic_set(&synic->sint_to_gsi[sint], gsi); - return 0; -} - -void kvm_hv_irq_routing_update(struct kvm *kvm) -{ - struct kvm_irq_routing_table *irq_rt; - struct kvm_kernel_irq_routing_entry *e; - u32 gsi; - - irq_rt = srcu_dereference_check(kvm->irq_routing, &kvm->irq_srcu, - lockdep_is_held(&kvm->irq_lock)); - - for (gsi = 0; gsi < irq_rt->nr_rt_entries; gsi++) { - hlist_for_each_entry(e, &irq_rt->map[gsi], link) { - if (e->type == KVM_IRQ_ROUTING_HV_SINT) - kvm_hv_set_sint_gsi(kvm, e->hv_sint.vcpu, - e->hv_sint.sint, gsi); - } - } -} - -static void synic_init(struct kvm_vcpu_hv_synic *synic) -{ - int i; - - memset(synic, 0, sizeof(*synic)); - synic->version = HV_SYNIC_VERSION_1; - for (i = 0; i < ARRAY_SIZE(synic->sint); i++) { - atomic64_set(&synic->sint[i], HV_SYNIC_SINT_MASKED); - atomic_set(&synic->sint_to_gsi[i], -1); - } -} - -static u64 get_time_ref_counter(struct kvm *kvm) -{ - struct kvm_hv *hv = &kvm->arch.hyperv; - struct kvm_vcpu *vcpu; - u64 tsc; - - /* - * The guest has not set up the TSC page or the clock isn't - * stable, fall back to get_kvmclock_ns. 
- */ - if (!hv->tsc_ref.tsc_sequence) - return div_u64(get_kvmclock_ns(kvm), 100); - - vcpu = kvm_get_vcpu(kvm, 0); - tsc = kvm_read_l1_tsc(vcpu, rdtsc()); - return mul_u64_u64_shr(tsc, hv->tsc_ref.tsc_scale, 64) - + hv->tsc_ref.tsc_offset; -} - -static void stimer_mark_pending(struct kvm_vcpu_hv_stimer *stimer, - bool vcpu_kick) -{ - struct kvm_vcpu *vcpu = stimer_to_vcpu(stimer); - - set_bit(stimer->index, - vcpu_to_hv_vcpu(vcpu)->stimer_pending_bitmap); - kvm_make_request(KVM_REQ_HV_STIMER, vcpu); - if (vcpu_kick) - kvm_vcpu_kick(vcpu); -} - -static void stimer_cleanup(struct kvm_vcpu_hv_stimer *stimer) -{ - struct kvm_vcpu *vcpu = stimer_to_vcpu(stimer); - - trace_kvm_hv_stimer_cleanup(stimer_to_vcpu(stimer)->vcpu_id, - stimer->index); - - hrtimer_cancel(&stimer->timer); - clear_bit(stimer->index, - vcpu_to_hv_vcpu(vcpu)->stimer_pending_bitmap); - stimer->msg_pending = false; - stimer->exp_time = 0; -} - -static enum hrtimer_restart stimer_timer_callback(struct hrtimer *timer) -{ - struct kvm_vcpu_hv_stimer *stimer; - - stimer = container_of(timer, struct kvm_vcpu_hv_stimer, timer); - trace_kvm_hv_stimer_callback(stimer_to_vcpu(stimer)->vcpu_id, - stimer->index); - stimer_mark_pending(stimer, true); - - return HRTIMER_NORESTART; -} - -/* - * stimer_start() assumptions: - * a) stimer->count is not equal to 0 - * b) stimer->config has HV_STIMER_ENABLE flag - */ -static int stimer_start(struct kvm_vcpu_hv_stimer *stimer) -{ - u64 time_now; - ktime_t ktime_now; - - time_now = get_time_ref_counter(stimer_to_vcpu(stimer)->kvm); - ktime_now = ktime_get(); - - if (stimer->config & HV_STIMER_PERIODIC) { - if (stimer->exp_time) { - if (time_now >= stimer->exp_time) { - u64 remainder; - - div64_u64_rem(time_now - stimer->exp_time, - stimer->count, &remainder); - stimer->exp_time = - time_now + (stimer->count - remainder); - } - } else - stimer->exp_time = time_now + stimer->count; - - trace_kvm_hv_stimer_start_periodic( - stimer_to_vcpu(stimer)->vcpu_id, - stimer->index, - time_now, stimer->exp_time); - - hrtimer_start(&stimer->timer, - ktime_add_ns(ktime_now, - 100 * (stimer->exp_time - time_now)), - HRTIMER_MODE_ABS); - return 0; - } - stimer->exp_time = stimer->count; - if (time_now >= stimer->count) { - /* - * Expire timer according to Hypervisor Top-Level Functional - * specification v4(15.3.1): - * "If a one shot is enabled and the specified count is in - * the past, it will expire immediately." 
- */ - stimer_mark_pending(stimer, false); - return 0; - } - - trace_kvm_hv_stimer_start_one_shot(stimer_to_vcpu(stimer)->vcpu_id, - stimer->index, - time_now, stimer->count); - - hrtimer_start(&stimer->timer, - ktime_add_ns(ktime_now, 100 * (stimer->count - time_now)), - HRTIMER_MODE_ABS); - return 0; -} - -static int stimer_set_config(struct kvm_vcpu_hv_stimer *stimer, u64 config, - bool host) -{ - trace_kvm_hv_stimer_set_config(stimer_to_vcpu(stimer)->vcpu_id, - stimer->index, config, host); - - stimer_cleanup(stimer); - if ((stimer->config & HV_STIMER_ENABLE) && HV_STIMER_SINT(config) == 0) - config &= ~HV_STIMER_ENABLE; - stimer->config = config; - stimer_mark_pending(stimer, false); - return 0; -} - -static int stimer_set_count(struct kvm_vcpu_hv_stimer *stimer, u64 count, - bool host) -{ - trace_kvm_hv_stimer_set_count(stimer_to_vcpu(stimer)->vcpu_id, - stimer->index, count, host); - - stimer_cleanup(stimer); - stimer->count = count; - if (stimer->count == 0) - stimer->config &= ~HV_STIMER_ENABLE; - else if (stimer->config & HV_STIMER_AUTOENABLE) - stimer->config |= HV_STIMER_ENABLE; - stimer_mark_pending(stimer, false); - return 0; -} - -static int stimer_get_config(struct kvm_vcpu_hv_stimer *stimer, u64 *pconfig) -{ - *pconfig = stimer->config; - return 0; -} - -static int stimer_get_count(struct kvm_vcpu_hv_stimer *stimer, u64 *pcount) -{ - *pcount = stimer->count; - return 0; -} - -static int synic_deliver_msg(struct kvm_vcpu_hv_synic *synic, u32 sint, - struct hv_message *src_msg) -{ - struct kvm_vcpu *vcpu = synic_to_vcpu(synic); - struct page *page; - gpa_t gpa; - struct hv_message *dst_msg; - int r; - struct hv_message_page *msg_page; - - if (!(synic->msg_page & HV_SYNIC_SIMP_ENABLE)) - return -ENOENT; - - gpa = synic->msg_page & PAGE_MASK; - page = kvm_vcpu_gfn_to_page(vcpu, gpa >> PAGE_SHIFT); - if (is_error_page(page)) - return -EFAULT; - - msg_page = kmap_atomic(page); - dst_msg = &msg_page->sint_message[sint]; - if (sync_cmpxchg(&dst_msg->header.message_type, HVMSG_NONE, - src_msg->header.message_type) != HVMSG_NONE) { - dst_msg->header.message_flags.msg_pending = 1; - r = -EAGAIN; - } else { - memcpy(&dst_msg->u.payload, &src_msg->u.payload, - src_msg->header.payload_size); - dst_msg->header.message_type = src_msg->header.message_type; - dst_msg->header.payload_size = src_msg->header.payload_size; - r = synic_set_irq(synic, sint); - if (r >= 1) - r = 0; - else if (r == 0) - r = -EFAULT; - } - kunmap_atomic(msg_page); - kvm_release_page_dirty(page); - kvm_vcpu_mark_page_dirty(vcpu, gpa >> PAGE_SHIFT); - return r; -} - -static int stimer_send_msg(struct kvm_vcpu_hv_stimer *stimer) -{ - struct kvm_vcpu *vcpu = stimer_to_vcpu(stimer); - struct hv_message *msg = &stimer->msg; - struct hv_timer_message_payload *payload = - (struct hv_timer_message_payload *)&msg->u.payload; - - payload->expiration_time = stimer->exp_time; - payload->delivery_time = get_time_ref_counter(vcpu->kvm); - return synic_deliver_msg(vcpu_to_synic(vcpu), - HV_STIMER_SINT(stimer->config), msg); -} - -static void stimer_expiration(struct kvm_vcpu_hv_stimer *stimer) -{ - int r; - - stimer->msg_pending = true; - r = stimer_send_msg(stimer); - trace_kvm_hv_stimer_expiration(stimer_to_vcpu(stimer)->vcpu_id, - stimer->index, r); - if (!r) { - stimer->msg_pending = false; - if (!(stimer->config & HV_STIMER_PERIODIC)) - stimer->config &= ~HV_STIMER_ENABLE; - } -} - -void kvm_hv_process_stimers(struct kvm_vcpu *vcpu) -{ - struct kvm_vcpu_hv *hv_vcpu = vcpu_to_hv_vcpu(vcpu); - struct kvm_vcpu_hv_stimer *stimer; 
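synic_deliver_msg() above claims a slot in the guest's message page by atomically replacing HVMSG_NONE with the new message type; if the slot is still occupied it only sets the message-pending flag and reports -EAGAIN so the timer retries after the guest's EOI. Below is a userspace sketch of that claim-or-retry protocol; the struct layout, field names and payload size are simplified stand-ins, not the real hv_message definition.

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdint.h>
    #include <string.h>

    #define HVMSG_NONE 0u

    struct msg_slot {
        _Atomic uint32_t type;        /* HVMSG_NONE while the slot is free */
        bool             pending;     /* tells the guest another message waits */
        uint8_t          payload[240];
    };

    /*
     * Returns true when the message was stored.  False means the guest has
     * not consumed the previous message yet; the caller marks it pending
     * and retries later, exactly like the -EAGAIN path above.
     */
    static bool deliver(struct msg_slot *slot, uint32_t type,
                        const void *payload, size_t len)
    {
        uint32_t expected = HVMSG_NONE;

        if (!atomic_compare_exchange_strong(&slot->type, &expected, type)) {
            slot->pending = true;
            return false;
        }
        memcpy(slot->payload, payload, len);
        return true;
    }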
- u64 time_now, exp_time; - int i; - - for (i = 0; i < ARRAY_SIZE(hv_vcpu->stimer); i++) - if (test_and_clear_bit(i, hv_vcpu->stimer_pending_bitmap)) { - stimer = &hv_vcpu->stimer[i]; - if (stimer->config & HV_STIMER_ENABLE) { - exp_time = stimer->exp_time; - - if (exp_time) { - time_now = - get_time_ref_counter(vcpu->kvm); - if (time_now >= exp_time) - stimer_expiration(stimer); - } - - if ((stimer->config & HV_STIMER_ENABLE) && - stimer->count) - stimer_start(stimer); - else - stimer_cleanup(stimer); - } - } -} - -void kvm_hv_vcpu_uninit(struct kvm_vcpu *vcpu) -{ - struct kvm_vcpu_hv *hv_vcpu = vcpu_to_hv_vcpu(vcpu); - int i; - - for (i = 0; i < ARRAY_SIZE(hv_vcpu->stimer); i++) - stimer_cleanup(&hv_vcpu->stimer[i]); -} - -static void stimer_prepare_msg(struct kvm_vcpu_hv_stimer *stimer) -{ - struct hv_message *msg = &stimer->msg; - struct hv_timer_message_payload *payload = - (struct hv_timer_message_payload *)&msg->u.payload; - - memset(&msg->header, 0, sizeof(msg->header)); - msg->header.message_type = HVMSG_TIMER_EXPIRED; - msg->header.payload_size = sizeof(*payload); - - payload->timer_index = stimer->index; - payload->expiration_time = 0; - payload->delivery_time = 0; -} - -static void stimer_init(struct kvm_vcpu_hv_stimer *stimer, int timer_index) -{ - memset(stimer, 0, sizeof(*stimer)); - stimer->index = timer_index; - hrtimer_init(&stimer->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); - stimer->timer.function = stimer_timer_callback; - stimer_prepare_msg(stimer); -} - -void kvm_hv_vcpu_init(struct kvm_vcpu *vcpu) -{ - struct kvm_vcpu_hv *hv_vcpu = vcpu_to_hv_vcpu(vcpu); - int i; - - synic_init(&hv_vcpu->synic); - - bitmap_zero(hv_vcpu->stimer_pending_bitmap, HV_SYNIC_STIMER_COUNT); - for (i = 0; i < ARRAY_SIZE(hv_vcpu->stimer); i++) - stimer_init(&hv_vcpu->stimer[i], i); -} - -int kvm_hv_activate_synic(struct kvm_vcpu *vcpu) -{ - /* - * Hyper-V SynIC auto EOI SINT's are - * not compatible with APICV, so deactivate APICV - */ - kvm_vcpu_deactivate_apicv(vcpu); - vcpu_to_synic(vcpu)->active = true; - return 0; -} - -static bool kvm_hv_msr_partition_wide(u32 msr) -{ - bool r = false; - - switch (msr) { - case HV_X64_MSR_GUEST_OS_ID: - case HV_X64_MSR_HYPERCALL: - case HV_X64_MSR_REFERENCE_TSC: - case HV_X64_MSR_TIME_REF_COUNT: - case HV_X64_MSR_CRASH_CTL: - case HV_X64_MSR_CRASH_P0 ... 
HV_X64_MSR_CRASH_P4: - case HV_X64_MSR_RESET: - r = true; - break; - } - - return r; -} - -static int kvm_hv_msr_get_crash_data(struct kvm_vcpu *vcpu, - u32 index, u64 *pdata) -{ - struct kvm_hv *hv = &vcpu->kvm->arch.hyperv; - - if (WARN_ON_ONCE(index >= ARRAY_SIZE(hv->hv_crash_param))) - return -EINVAL; - - *pdata = hv->hv_crash_param[index]; - return 0; -} - -static int kvm_hv_msr_get_crash_ctl(struct kvm_vcpu *vcpu, u64 *pdata) -{ - struct kvm_hv *hv = &vcpu->kvm->arch.hyperv; - - *pdata = hv->hv_crash_ctl; - return 0; -} - -static int kvm_hv_msr_set_crash_ctl(struct kvm_vcpu *vcpu, u64 data, bool host) -{ - struct kvm_hv *hv = &vcpu->kvm->arch.hyperv; - - if (host) - hv->hv_crash_ctl = data & HV_X64_MSR_CRASH_CTL_NOTIFY; - - if (!host && (data & HV_X64_MSR_CRASH_CTL_NOTIFY)) { - - vcpu_debug(vcpu, "hv crash (0x%llx 0x%llx 0x%llx 0x%llx 0x%llx)\n", - hv->hv_crash_param[0], - hv->hv_crash_param[1], - hv->hv_crash_param[2], - hv->hv_crash_param[3], - hv->hv_crash_param[4]); - - /* Send notification about crash to user space */ - kvm_make_request(KVM_REQ_HV_CRASH, vcpu); - } - - return 0; -} - -static int kvm_hv_msr_set_crash_data(struct kvm_vcpu *vcpu, - u32 index, u64 data) -{ - struct kvm_hv *hv = &vcpu->kvm->arch.hyperv; - - if (WARN_ON_ONCE(index >= ARRAY_SIZE(hv->hv_crash_param))) - return -EINVAL; - - hv->hv_crash_param[index] = data; - return 0; -} - -/* - * The kvmclock and Hyper-V TSC page use similar formulas, and converting - * between them is possible: - * - * kvmclock formula: - * nsec = (ticks - tsc_timestamp) * tsc_to_system_mul * 2^(tsc_shift-32) - * + system_time - * - * Hyper-V formula: - * nsec/100 = ticks * scale / 2^64 + offset - * - * When tsc_timestamp = system_time = 0, offset is zero in the Hyper-V formula. - * By dividing the kvmclock formula by 100 and equating what's left we get: - * ticks * scale / 2^64 = ticks * tsc_to_system_mul * 2^(tsc_shift-32) / 100 - * scale / 2^64 = tsc_to_system_mul * 2^(tsc_shift-32) / 100 - * scale = tsc_to_system_mul * 2^(32+tsc_shift) / 100 - * - * Now expand the kvmclock formula and divide by 100: - * nsec = ticks * tsc_to_system_mul * 2^(tsc_shift-32) - * - tsc_timestamp * tsc_to_system_mul * 2^(tsc_shift-32) - * + system_time - * nsec/100 = ticks * tsc_to_system_mul * 2^(tsc_shift-32) / 100 - * - tsc_timestamp * tsc_to_system_mul * 2^(tsc_shift-32) / 100 - * + system_time / 100 - * - * Replace tsc_to_system_mul * 2^(tsc_shift-32) / 100 by scale / 2^64: - * nsec/100 = ticks * scale / 2^64 - * - tsc_timestamp * scale / 2^64 - * + system_time / 100 - * - * Equate with the Hyper-V formula so that ticks * scale / 2^64 cancels out: - * offset = system_time / 100 - tsc_timestamp * scale / 2^64 - * - * These two equivalencies are implemented in this function. - */ -static bool compute_tsc_page_parameters(struct pvclock_vcpu_time_info *hv_clock, - HV_REFERENCE_TSC_PAGE *tsc_ref) -{ - u64 max_mul; - - if (!(hv_clock->flags & PVCLOCK_TSC_STABLE_BIT)) - return false; - - /* - * check if scale would overflow, if so we use the time ref counter - * tsc_to_system_mul * 2^(tsc_shift+32) / 100 >= 2^64 - * tsc_to_system_mul / 100 >= 2^(32-tsc_shift) - * tsc_to_system_mul >= 100 * 2^(32-tsc_shift) - */ - max_mul = 100ull << (32 - hv_clock->tsc_shift); - if (hv_clock->tsc_to_system_mul >= max_mul) - return false; - - /* - * Otherwise compute the scale and offset according to the formulas - * derived above. 
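The deleted compute_tsc_page_parameters() that follows implements exactly the two equivalences derived above. For reference, here is a standalone version that uses unsigned __int128 in place of the kernel's mul_u64_u32_div()/mul_u64_u64_shr() helpers; the parameter names mirror the pvclock fields, and it assumes tsc_shift stays in a small range (roughly -31..31), which pvclock guarantees in practice.

    #include <stdbool.h>
    #include <stdint.h>

    struct tsc_page_params {
        uint64_t scale;    /* ticks * scale / 2^64 yields 100 ns units */
        int64_t  offset;   /* added after the scaled multiply */
    };

    static bool compute_params(uint32_t tsc_to_system_mul, int tsc_shift,
                               uint64_t tsc_timestamp, uint64_t system_time,
                               struct tsc_page_params *p)
    {
        /* scale = tsc_to_system_mul * 2^(32+tsc_shift) / 100 must fit in 64 bits */
        if (tsc_to_system_mul >= (100ull << (32 - tsc_shift)))
            return false;

        p->scale = (uint64_t)(((unsigned __int128)1 << (32 + tsc_shift)) *
                              tsc_to_system_mul / 100);

        /* offset = system_time / 100 - tsc_timestamp * scale / 2^64 */
        p->offset = (int64_t)(system_time / 100) -
                    (int64_t)(((unsigned __int128)tsc_timestamp * p->scale) >> 64);
        return true;
    }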
- */ - tsc_ref->tsc_scale = - mul_u64_u32_div(1ULL << (32 + hv_clock->tsc_shift), - hv_clock->tsc_to_system_mul, - 100); - - tsc_ref->tsc_offset = hv_clock->system_time; - do_div(tsc_ref->tsc_offset, 100); - tsc_ref->tsc_offset -= - mul_u64_u64_shr(hv_clock->tsc_timestamp, tsc_ref->tsc_scale, 64); - return true; -} - -void kvm_hv_setup_tsc_page(struct kvm *kvm, - struct pvclock_vcpu_time_info *hv_clock) -{ - struct kvm_hv *hv = &kvm->arch.hyperv; - u32 tsc_seq; - u64 gfn; - - BUILD_BUG_ON(sizeof(tsc_seq) != sizeof(hv->tsc_ref.tsc_sequence)); - BUILD_BUG_ON(offsetof(HV_REFERENCE_TSC_PAGE, tsc_sequence) != 0); - - if (!(hv->hv_tsc_page & HV_X64_MSR_TSC_REFERENCE_ENABLE)) - return; - - gfn = hv->hv_tsc_page >> HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT; - /* - * Because the TSC parameters only vary when there is a - * change in the master clock, do not bother with caching. - */ - if (unlikely(kvm_read_guest(kvm, gfn_to_gpa(gfn), - &tsc_seq, sizeof(tsc_seq)))) - return; - - /* - * While we're computing and writing the parameters, force the - * guest to use the time reference count MSR. - */ - hv->tsc_ref.tsc_sequence = 0; - if (kvm_write_guest(kvm, gfn_to_gpa(gfn), - &hv->tsc_ref, sizeof(hv->tsc_ref.tsc_sequence))) - return; - - if (!compute_tsc_page_parameters(hv_clock, &hv->tsc_ref)) - return; - - /* Ensure sequence is zero before writing the rest of the struct. */ - smp_wmb(); - if (kvm_write_guest(kvm, gfn_to_gpa(gfn), &hv->tsc_ref, sizeof(hv->tsc_ref))) - return; - - /* - * Now switch to the TSC page mechanism by writing the sequence. - */ - tsc_seq++; - if (tsc_seq == 0xFFFFFFFF || tsc_seq == 0) - tsc_seq = 1; - - /* Write the struct entirely before the non-zero sequence. */ - smp_wmb(); - - hv->tsc_ref.tsc_sequence = tsc_seq; - kvm_write_guest(kvm, gfn_to_gpa(gfn), - &hv->tsc_ref, sizeof(hv->tsc_ref.tsc_sequence)); -} - -static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data, - bool host) -{ - struct kvm *kvm = vcpu->kvm; - struct kvm_hv *hv = &kvm->arch.hyperv; - - switch (msr) { - case HV_X64_MSR_GUEST_OS_ID: - hv->hv_guest_os_id = data; - /* setting guest os id to zero disables hypercall page */ - if (!hv->hv_guest_os_id) - hv->hv_hypercall &= ~HV_X64_MSR_HYPERCALL_ENABLE; - break; - case HV_X64_MSR_HYPERCALL: { - u64 gfn; - unsigned long addr; - u8 instructions[4]; - - /* if guest os id is not set hypercall should remain disabled */ - if (!hv->hv_guest_os_id) - break; - if (!(data & HV_X64_MSR_HYPERCALL_ENABLE)) { - hv->hv_hypercall = data; - break; - } - gfn = data >> HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_SHIFT; - addr = gfn_to_hva(kvm, gfn); - if (kvm_is_error_hva(addr)) - return 1; - kvm_x86_ops->patch_hypercall(vcpu, instructions); - ((unsigned char *)instructions)[3] = 0xc3; /* ret */ - if (__copy_to_user((void __user *)addr, instructions, 4)) - return 1; - hv->hv_hypercall = data; - mark_page_dirty(kvm, gfn); - break; - } - case HV_X64_MSR_REFERENCE_TSC: - hv->hv_tsc_page = data; - if (hv->hv_tsc_page & HV_X64_MSR_TSC_REFERENCE_ENABLE) - kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu); - break; - case HV_X64_MSR_CRASH_P0 ... 
HV_X64_MSR_CRASH_P4: - return kvm_hv_msr_set_crash_data(vcpu, - msr - HV_X64_MSR_CRASH_P0, - data); - case HV_X64_MSR_CRASH_CTL: - return kvm_hv_msr_set_crash_ctl(vcpu, data, host); - case HV_X64_MSR_RESET: - if (data == 1) { - vcpu_debug(vcpu, "hyper-v reset requested\n"); - kvm_make_request(KVM_REQ_HV_RESET, vcpu); - } - break; - default: - vcpu_unimpl(vcpu, "Hyper-V uhandled wrmsr: 0x%x data 0x%llx\n", - msr, data); - return 1; - } - return 0; -} - -/* Calculate cpu time spent by current task in 100ns units */ -static u64 current_task_runtime_100ns(void) -{ - cputime_t utime, stime; - - task_cputime_adjusted(current, &utime, &stime); - return div_u64(cputime_to_nsecs(utime + stime), 100); -} - -static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host) -{ - struct kvm_vcpu_hv *hv = &vcpu->arch.hyperv; - - switch (msr) { - case HV_X64_MSR_APIC_ASSIST_PAGE: { - u64 gfn; - unsigned long addr; - - if (!(data & HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE)) { - hv->hv_vapic = data; - if (kvm_lapic_enable_pv_eoi(vcpu, 0)) - return 1; - break; - } - gfn = data >> HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_SHIFT; - addr = kvm_vcpu_gfn_to_hva(vcpu, gfn); - if (kvm_is_error_hva(addr)) - return 1; - if (__clear_user((void __user *)addr, PAGE_SIZE)) - return 1; - hv->hv_vapic = data; - kvm_vcpu_mark_page_dirty(vcpu, gfn); - if (kvm_lapic_enable_pv_eoi(vcpu, - gfn_to_gpa(gfn) | KVM_MSR_ENABLED)) - return 1; - break; - } - case HV_X64_MSR_EOI: - return kvm_hv_vapic_msr_write(vcpu, APIC_EOI, data); - case HV_X64_MSR_ICR: - return kvm_hv_vapic_msr_write(vcpu, APIC_ICR, data); - case HV_X64_MSR_TPR: - return kvm_hv_vapic_msr_write(vcpu, APIC_TASKPRI, data); - case HV_X64_MSR_VP_RUNTIME: - if (!host) - return 1; - hv->runtime_offset = data - current_task_runtime_100ns(); - break; - case HV_X64_MSR_SCONTROL: - case HV_X64_MSR_SVERSION: - case HV_X64_MSR_SIEFP: - case HV_X64_MSR_SIMP: - case HV_X64_MSR_EOM: - case HV_X64_MSR_SINT0 ... HV_X64_MSR_SINT15: - return synic_set_msr(vcpu_to_synic(vcpu), msr, data, host); - case HV_X64_MSR_STIMER0_CONFIG: - case HV_X64_MSR_STIMER1_CONFIG: - case HV_X64_MSR_STIMER2_CONFIG: - case HV_X64_MSR_STIMER3_CONFIG: { - int timer_index = (msr - HV_X64_MSR_STIMER0_CONFIG)/2; - - return stimer_set_config(vcpu_to_stimer(vcpu, timer_index), - data, host); - } - case HV_X64_MSR_STIMER0_COUNT: - case HV_X64_MSR_STIMER1_COUNT: - case HV_X64_MSR_STIMER2_COUNT: - case HV_X64_MSR_STIMER3_COUNT: { - int timer_index = (msr - HV_X64_MSR_STIMER0_COUNT)/2; - - return stimer_set_count(vcpu_to_stimer(vcpu, timer_index), - data, host); - } - default: - vcpu_unimpl(vcpu, "Hyper-V uhandled wrmsr: 0x%x data 0x%llx\n", - msr, data); - return 1; - } - - return 0; -} - -static int kvm_hv_get_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) -{ - u64 data = 0; - struct kvm *kvm = vcpu->kvm; - struct kvm_hv *hv = &kvm->arch.hyperv; - - switch (msr) { - case HV_X64_MSR_GUEST_OS_ID: - data = hv->hv_guest_os_id; - break; - case HV_X64_MSR_HYPERCALL: - data = hv->hv_hypercall; - break; - case HV_X64_MSR_TIME_REF_COUNT: - data = get_time_ref_counter(kvm); - break; - case HV_X64_MSR_REFERENCE_TSC: - data = hv->hv_tsc_page; - break; - case HV_X64_MSR_CRASH_P0 ... 
HV_X64_MSR_CRASH_P4: - return kvm_hv_msr_get_crash_data(vcpu, - msr - HV_X64_MSR_CRASH_P0, - pdata); - case HV_X64_MSR_CRASH_CTL: - return kvm_hv_msr_get_crash_ctl(vcpu, pdata); - case HV_X64_MSR_RESET: - data = 0; - break; - default: - vcpu_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr); - return 1; - } - - *pdata = data; - return 0; -} - -static int kvm_hv_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) -{ - u64 data = 0; - struct kvm_vcpu_hv *hv = &vcpu->arch.hyperv; - - switch (msr) { - case HV_X64_MSR_VP_INDEX: { - int r; - struct kvm_vcpu *v; - - kvm_for_each_vcpu(r, v, vcpu->kvm) { - if (v == vcpu) { - data = r; - break; - } - } - break; - } - case HV_X64_MSR_EOI: - return kvm_hv_vapic_msr_read(vcpu, APIC_EOI, pdata); - case HV_X64_MSR_ICR: - return kvm_hv_vapic_msr_read(vcpu, APIC_ICR, pdata); - case HV_X64_MSR_TPR: - return kvm_hv_vapic_msr_read(vcpu, APIC_TASKPRI, pdata); - case HV_X64_MSR_APIC_ASSIST_PAGE: - data = hv->hv_vapic; - break; - case HV_X64_MSR_VP_RUNTIME: - data = current_task_runtime_100ns() + hv->runtime_offset; - break; - case HV_X64_MSR_SCONTROL: - case HV_X64_MSR_SVERSION: - case HV_X64_MSR_SIEFP: - case HV_X64_MSR_SIMP: - case HV_X64_MSR_EOM: - case HV_X64_MSR_SINT0 ... HV_X64_MSR_SINT15: - return synic_get_msr(vcpu_to_synic(vcpu), msr, pdata); - case HV_X64_MSR_STIMER0_CONFIG: - case HV_X64_MSR_STIMER1_CONFIG: - case HV_X64_MSR_STIMER2_CONFIG: - case HV_X64_MSR_STIMER3_CONFIG: { - int timer_index = (msr - HV_X64_MSR_STIMER0_CONFIG)/2; - - return stimer_get_config(vcpu_to_stimer(vcpu, timer_index), - pdata); - } - case HV_X64_MSR_STIMER0_COUNT: - case HV_X64_MSR_STIMER1_COUNT: - case HV_X64_MSR_STIMER2_COUNT: - case HV_X64_MSR_STIMER3_COUNT: { - int timer_index = (msr - HV_X64_MSR_STIMER0_COUNT)/2; - - return stimer_get_count(vcpu_to_stimer(vcpu, timer_index), - pdata); - } - default: - vcpu_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr); - return 1; - } - *pdata = data; - return 0; -} - -int kvm_hv_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host) -{ - if (kvm_hv_msr_partition_wide(msr)) { - int r; - - mutex_lock(&vcpu->kvm->lock); - r = kvm_hv_set_msr_pw(vcpu, msr, data, host); - mutex_unlock(&vcpu->kvm->lock); - return r; - } else - return kvm_hv_set_msr(vcpu, msr, data, host); -} - -int kvm_hv_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) -{ - if (kvm_hv_msr_partition_wide(msr)) { - int r; - - mutex_lock(&vcpu->kvm->lock); - r = kvm_hv_get_msr_pw(vcpu, msr, pdata); - mutex_unlock(&vcpu->kvm->lock); - return r; - } else - return kvm_hv_get_msr(vcpu, msr, pdata); -} - -bool kvm_hv_hypercall_enabled(struct kvm *kvm) -{ - return kvm->arch.hyperv.hv_hypercall & HV_X64_MSR_HYPERCALL_ENABLE; -} - -static void kvm_hv_hypercall_set_result(struct kvm_vcpu *vcpu, u64 result) -{ - bool longmode; - - longmode = is_64_bit_mode(vcpu); - if (longmode) - kvm_register_write(vcpu, VCPU_REGS_RAX, result); - else { - kvm_register_write(vcpu, VCPU_REGS_RDX, result >> 32); - kvm_register_write(vcpu, VCPU_REGS_RAX, result & 0xffffffff); - } -} - -static int kvm_hv_hypercall_complete_userspace(struct kvm_vcpu *vcpu) -{ - struct kvm_run *run = vcpu->run; - - kvm_hv_hypercall_set_result(vcpu, run->hyperv.u.hcall.result); - return 1; -} - -int kvm_hv_hypercall(struct kvm_vcpu *vcpu) -{ - u64 param, ingpa, outgpa, ret; - uint16_t code, rep_idx, rep_cnt, res = HV_STATUS_SUCCESS, rep_done = 0; - bool fast, longmode; - - /* - * hypercall generates UD from non zero cpl and real mode - * per HYPER-V spec - */ - if (kvm_x86_ops->get_cpl(vcpu) != 
0 || !is_protmode(vcpu)) { - kvm_queue_exception(vcpu, UD_VECTOR); - return 1; - } - - longmode = is_64_bit_mode(vcpu); - - if (!longmode) { - param = ((u64)kvm_register_read(vcpu, VCPU_REGS_RDX) << 32) | - (kvm_register_read(vcpu, VCPU_REGS_RAX) & 0xffffffff); - ingpa = ((u64)kvm_register_read(vcpu, VCPU_REGS_RBX) << 32) | - (kvm_register_read(vcpu, VCPU_REGS_RCX) & 0xffffffff); - outgpa = ((u64)kvm_register_read(vcpu, VCPU_REGS_RDI) << 32) | - (kvm_register_read(vcpu, VCPU_REGS_RSI) & 0xffffffff); - } -#ifdef CONFIG_X86_64 - else { - param = kvm_register_read(vcpu, VCPU_REGS_RCX); - ingpa = kvm_register_read(vcpu, VCPU_REGS_RDX); - outgpa = kvm_register_read(vcpu, VCPU_REGS_R8); - } -#endif - - code = param & 0xffff; - fast = (param >> 16) & 0x1; - rep_cnt = (param >> 32) & 0xfff; - rep_idx = (param >> 48) & 0xfff; - - trace_kvm_hv_hypercall(code, fast, rep_cnt, rep_idx, ingpa, outgpa); - - /* Hypercall continuation is not supported yet */ - if (rep_cnt || rep_idx) { - res = HV_STATUS_INVALID_HYPERCALL_CODE; - goto set_result; - } - - switch (code) { - case HVCALL_NOTIFY_LONG_SPIN_WAIT: - kvm_vcpu_on_spin(vcpu); - break; - case HVCALL_POST_MESSAGE: - case HVCALL_SIGNAL_EVENT: - /* don't bother userspace if it has no way to handle it */ - if (!vcpu_to_synic(vcpu)->active) { - res = HV_STATUS_INVALID_HYPERCALL_CODE; - break; - } - vcpu->run->exit_reason = KVM_EXIT_HYPERV; - vcpu->run->hyperv.type = KVM_EXIT_HYPERV_HCALL; - vcpu->run->hyperv.u.hcall.input = param; - vcpu->run->hyperv.u.hcall.params[0] = ingpa; - vcpu->run->hyperv.u.hcall.params[1] = outgpa; - vcpu->arch.complete_userspace_io = - kvm_hv_hypercall_complete_userspace; - return 0; - default: - res = HV_STATUS_INVALID_HYPERCALL_CODE; - break; - } - -set_result: - ret = res | (((u64)rep_done & 0xfff) << 32); - kvm_hv_hypercall_set_result(vcpu, ret); - return 1; -} diff --git a/arch/x86/kvm/hyperv.h b/arch/x86/kvm/hyperv.h deleted file mode 100644 index cd11195..0000000 --- a/arch/x86/kvm/hyperv.h +++ /dev/null @@ -1,90 +0,0 @@ -/* - * KVM Microsoft Hyper-V emulation - * - * derived from arch/x86/kvm/x86.c - * - * Copyright (C) 2006 Qumranet, Inc. - * Copyright (C) 2008 Qumranet, Inc. - * Copyright IBM Corporation, 2008 - * Copyright 2010 Red Hat, Inc. and/or its affiliates. - * Copyright (C) 2015 Andrey Smetanin <asmetanin@virtuozzo.com> - * - * Authors: - * Avi Kivity <avi@qumranet.com> - * Yaniv Kamay <yaniv@qumranet.com> - * Amit Shah <amit.shah@qumranet.com> - * Ben-Ami Yassour <benami@il.ibm.com> - * Andrey Smetanin <asmetanin@virtuozzo.com> - * - * This work is licensed under the terms of the GNU GPL, version 2. See - * the COPYING file in the top-level directory. 
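kvm_hv_hypercall() above packs everything it needs out of a single 64-bit input register (RCX in long mode, RDX:RAX otherwise): the call code in the low 16 bits, the fast-call flag in bit 16, and two 12-bit rep fields. A small decode sketch using the same field positions:

    #include <stdbool.h>
    #include <stdint.h>

    struct hv_hcall_input {
        uint16_t code;      /* bits 0-15:  hypercall code */
        bool     fast;      /* bit 16:     register-based (fast) convention */
        uint16_t rep_cnt;   /* bits 32-43: total rep count */
        uint16_t rep_idx;   /* bits 48-59: starting rep index (continuation) */
    };

    static struct hv_hcall_input decode_hcall(uint64_t param)
    {
        struct hv_hcall_input in = {
            .code    = (uint16_t)(param & 0xffff),
            .fast    = (param >> 16) & 0x1,
            .rep_cnt = (uint16_t)((param >> 32) & 0xfff),
            .rep_idx = (uint16_t)((param >> 48) & 0xfff),
        };
        return in;
    }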
- * - */ - -#ifndef __ARCH_X86_KVM_HYPERV_H__ -#define __ARCH_X86_KVM_HYPERV_H__ - -static inline struct kvm_vcpu_hv *vcpu_to_hv_vcpu(struct kvm_vcpu *vcpu) -{ - return &vcpu->arch.hyperv; -} - -static inline struct kvm_vcpu *hv_vcpu_to_vcpu(struct kvm_vcpu_hv *hv_vcpu) -{ - struct kvm_vcpu_arch *arch; - - arch = container_of(hv_vcpu, struct kvm_vcpu_arch, hyperv); - return container_of(arch, struct kvm_vcpu, arch); -} - -static inline struct kvm_vcpu_hv_synic *vcpu_to_synic(struct kvm_vcpu *vcpu) -{ - return &vcpu->arch.hyperv.synic; -} - -static inline struct kvm_vcpu *synic_to_vcpu(struct kvm_vcpu_hv_synic *synic) -{ - return hv_vcpu_to_vcpu(container_of(synic, struct kvm_vcpu_hv, synic)); -} - -int kvm_hv_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host); -int kvm_hv_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata); - -bool kvm_hv_hypercall_enabled(struct kvm *kvm); -int kvm_hv_hypercall(struct kvm_vcpu *vcpu); - -void kvm_hv_irq_routing_update(struct kvm *kvm); -int kvm_hv_synic_set_irq(struct kvm *kvm, u32 vcpu_id, u32 sint); -void kvm_hv_synic_send_eoi(struct kvm_vcpu *vcpu, int vector); -int kvm_hv_activate_synic(struct kvm_vcpu *vcpu); - -void kvm_hv_vcpu_init(struct kvm_vcpu *vcpu); -void kvm_hv_vcpu_uninit(struct kvm_vcpu *vcpu); - -static inline struct kvm_vcpu_hv_stimer *vcpu_to_stimer(struct kvm_vcpu *vcpu, - int timer_index) -{ - return &vcpu_to_hv_vcpu(vcpu)->stimer[timer_index]; -} - -static inline struct kvm_vcpu *stimer_to_vcpu(struct kvm_vcpu_hv_stimer *stimer) -{ - struct kvm_vcpu_hv *hv_vcpu; - - hv_vcpu = container_of(stimer - stimer->index, struct kvm_vcpu_hv, - stimer[0]); - return hv_vcpu_to_vcpu(hv_vcpu); -} - -static inline bool kvm_hv_has_stimer_pending(struct kvm_vcpu *vcpu) -{ - return !bitmap_empty(vcpu->arch.hyperv.stimer_pending_bitmap, - HV_SYNIC_STIMER_COUNT); -} - -void kvm_hv_process_stimers(struct kvm_vcpu *vcpu); - -void kvm_hv_setup_tsc_page(struct kvm *kvm, - struct pvclock_vcpu_time_info *hv_clock); - -#endif diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c deleted file mode 100644 index 16a7134..0000000 --- a/arch/x86/kvm/i8254.c +++ /dev/null @@ -1,738 +0,0 @@ -/* - * 8253/8254 interval timer emulation - * - * Copyright (c) 2003-2004 Fabrice Bellard - * Copyright (c) 2006 Intel Corporation - * Copyright (c) 2007 Keir Fraser, XenSource Inc - * Copyright (c) 2008 Intel Corporation - * Copyright 2009 Red Hat, Inc. and/or its affiliates. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. 
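None of the hyperv.h helpers above store back-pointers: stimer_to_vcpu() steps from whichever array element it is handed back to stimer[0] by subtracting the element's own index, then uses container_of() twice to climb out to the enclosing vcpu. A compact userspace demonstration of the same pointer arithmetic, with trimmed-down structs standing in for kvm_vcpu and kvm_vcpu_hv:

    #include <stddef.h>
    #include <stdio.h>

    #define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

    struct stimer  { int index; };
    struct hv_vcpu { struct stimer stimer[4]; };
    struct vcpu    { int id; struct hv_vcpu hyperv; };

    static struct vcpu *stimer_to_vcpu(struct stimer *s)
    {
        /* s - s->index always lands on stimer[0], whichever element we hold */
        struct hv_vcpu *hv = container_of(s - s->index,
                                          struct hv_vcpu, stimer[0]);
        return container_of(hv, struct vcpu, hyperv);
    }

    int main(void)
    {
        struct vcpu v = { .id = 7 };

        for (int i = 0; i < 4; i++)
            v.hyperv.stimer[i].index = i;

        printf("%d\n", stimer_to_vcpu(&v.hyperv.stimer[2])->id);  /* prints 7 */
        return 0;
    }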
- * - * Authors: - * Sheng Yang <sheng.yang@intel.com> - * Based on QEMU and Xen. - */ - -#define pr_fmt(fmt) "pit: " fmt - -#include <linux/kvm_host.h> -#include <linux/slab.h> - -#include "ioapic.h" -#include "irq.h" -#include "i8254.h" -#include "x86.h" - -#ifndef CONFIG_X86_64 -#define mod_64(x, y) ((x) - (y) * div64_u64(x, y)) -#else -#define mod_64(x, y) ((x) % (y)) -#endif - -#define RW_STATE_LSB 1 -#define RW_STATE_MSB 2 -#define RW_STATE_WORD0 3 -#define RW_STATE_WORD1 4 - -static void pit_set_gate(struct kvm_pit *pit, int channel, u32 val) -{ - struct kvm_kpit_channel_state *c = &pit->pit_state.channels[channel]; - - switch (c->mode) { - default: - case 0: - case 4: - /* XXX: just disable/enable counting */ - break; - case 1: - case 2: - case 3: - case 5: - /* Restart counting on rising edge. */ - if (c->gate < val) - c->count_load_time = ktime_get(); - break; - } - - c->gate = val; -} - -static int pit_get_gate(struct kvm_pit *pit, int channel) -{ - return pit->pit_state.channels[channel].gate; -} - -static s64 __kpit_elapsed(struct kvm_pit *pit) -{ - s64 elapsed; - ktime_t remaining; - struct kvm_kpit_state *ps = &pit->pit_state; - - if (!ps->period) - return 0; - - /* - * The Counter does not stop when it reaches zero. In - * Modes 0, 1, 4, and 5 the Counter ``wraps around'' to - * the highest count, either FFFF hex for binary counting - * or 9999 for BCD counting, and continues counting. - * Modes 2 and 3 are periodic; the Counter reloads - * itself with the initial count and continues counting - * from there. - */ - remaining = hrtimer_get_remaining(&ps->timer); - elapsed = ps->period - ktime_to_ns(remaining); - - return elapsed; -} - -static s64 kpit_elapsed(struct kvm_pit *pit, struct kvm_kpit_channel_state *c, - int channel) -{ - if (channel == 0) - return __kpit_elapsed(pit); - - return ktime_to_ns(ktime_sub(ktime_get(), c->count_load_time)); -} - -static int pit_get_count(struct kvm_pit *pit, int channel) -{ - struct kvm_kpit_channel_state *c = &pit->pit_state.channels[channel]; - s64 d, t; - int counter; - - t = kpit_elapsed(pit, c, channel); - d = mul_u64_u32_div(t, KVM_PIT_FREQ, NSEC_PER_SEC); - - switch (c->mode) { - case 0: - case 1: - case 4: - case 5: - counter = (c->count - d) & 0xffff; - break; - case 3: - /* XXX: may be incorrect for odd counts */ - counter = c->count - (mod_64((2 * d), c->count)); - break; - default: - counter = c->count - mod_64(d, c->count); - break; - } - return counter; -} - -static int pit_get_out(struct kvm_pit *pit, int channel) -{ - struct kvm_kpit_channel_state *c = &pit->pit_state.channels[channel]; - s64 d, t; - int out; - - t = kpit_elapsed(pit, c, channel); - d = mul_u64_u32_div(t, KVM_PIT_FREQ, NSEC_PER_SEC); - - switch (c->mode) { - default: - case 0: - out = (d >= c->count); - break; - case 1: - out = (d < c->count); - break; - case 2: - out = ((mod_64(d, c->count) == 0) && (d != 0)); - break; - case 3: - out = (mod_64(d, c->count) < ((c->count + 1) >> 1)); - break; - case 4: - case 5: - out = (d == c->count); - break; - } - - return out; -} - -static void pit_latch_count(struct kvm_pit *pit, int channel) -{ - struct kvm_kpit_channel_state *c = &pit->pit_state.channels[channel]; - - if (!c->count_latched) { - c->latched_count = pit_get_count(pit, channel); - c->count_latched = c->rw_mode; - } -} - -static void pit_latch_status(struct kvm_pit *pit, int channel) -{ - struct kvm_kpit_channel_state *c = &pit->pit_state.channels[channel]; - - if (!c->status_latched) { - /* TODO: Return NULL COUNT (bit 6). 
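pit_get_count() above first converts elapsed wall-clock time into elapsed PIT ticks at roughly 1.193 MHz and then derives the current counter value per mode: modes 0, 1, 4 and 5 count down once and wrap, mode 2 reloads each period, mode 3 decrements in steps of two. A standalone sketch of that conversion; it uses the KVM_PIT_FREQ value from i8254.h further down, a 128-bit intermediate instead of mul_u64_u32_div(), and assumes count has already been normalised so it is never zero (pit_load_count() turns 0 into 0x10000).

    #include <stdint.h>

    #define PIT_FREQ     1193181u          /* KVM_PIT_FREQ from i8254.h below */
    #define NSEC_PER_SEC 1000000000ull

    static int pit_current_count(uint64_t elapsed_ns, uint32_t count, int mode)
    {
        /* elapsed PIT ticks since the count was loaded */
        uint64_t d = (uint64_t)((unsigned __int128)elapsed_ns *
                                PIT_FREQ / NSEC_PER_SEC);

        switch (mode) {
        case 0: case 1: case 4: case 5:
            return (int)((count - d) & 0xffff);     /* single count-down, wraps */
        case 3:
            return (int)(count - (2 * d) % count);  /* square wave: steps of two */
        default:
            return (int)(count - d % count);        /* mode 2: periodic reload */
        }
    }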
*/ - c->status = ((pit_get_out(pit, channel) << 7) | - (c->rw_mode << 4) | - (c->mode << 1) | - c->bcd); - c->status_latched = 1; - } -} - -static inline struct kvm_pit *pit_state_to_pit(struct kvm_kpit_state *ps) -{ - return container_of(ps, struct kvm_pit, pit_state); -} - -static void kvm_pit_ack_irq(struct kvm_irq_ack_notifier *kian) -{ - struct kvm_kpit_state *ps = container_of(kian, struct kvm_kpit_state, - irq_ack_notifier); - struct kvm_pit *pit = pit_state_to_pit(ps); - - atomic_set(&ps->irq_ack, 1); - /* irq_ack should be set before pending is read. Order accesses with - * inc(pending) in pit_timer_fn and xchg(irq_ack, 0) in pit_do_work. - */ - smp_mb(); - if (atomic_dec_if_positive(&ps->pending) > 0) - kthread_queue_work(&pit->worker, &pit->expired); -} - -void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu) -{ - struct kvm_pit *pit = vcpu->kvm->arch.vpit; - struct hrtimer *timer; - - if (!kvm_vcpu_is_bsp(vcpu) || !pit) - return; - - timer = &pit->pit_state.timer; - mutex_lock(&pit->pit_state.lock); - if (hrtimer_cancel(timer)) - hrtimer_start_expires(timer, HRTIMER_MODE_ABS); - mutex_unlock(&pit->pit_state.lock); -} - -static void destroy_pit_timer(struct kvm_pit *pit) -{ - hrtimer_cancel(&pit->pit_state.timer); - kthread_flush_work(&pit->expired); -} - -static void pit_do_work(struct kthread_work *work) -{ - struct kvm_pit *pit = container_of(work, struct kvm_pit, expired); - struct kvm *kvm = pit->kvm; - struct kvm_vcpu *vcpu; - int i; - struct kvm_kpit_state *ps = &pit->pit_state; - - if (atomic_read(&ps->reinject) && !atomic_xchg(&ps->irq_ack, 0)) - return; - - kvm_set_irq(kvm, pit->irq_source_id, 0, 1, false); - kvm_set_irq(kvm, pit->irq_source_id, 0, 0, false); - - /* - * Provides NMI watchdog support via Virtual Wire mode. - * The route is: PIT -> LVT0 in NMI mode. - * - * Note: Our Virtual Wire implementation does not follow - * the MP specification. We propagate a PIT interrupt to all - * VCPUs and only when LVT0 is in NMI mode. The interrupt can - * also be simultaneously delivered through PIC and IOAPIC. - */ - if (atomic_read(&kvm->arch.vapics_in_nmi_mode) > 0) - kvm_for_each_vcpu(i, vcpu, kvm) - kvm_apic_nmi_wd_deliver(vcpu); -} - -static enum hrtimer_restart pit_timer_fn(struct hrtimer *data) -{ - struct kvm_kpit_state *ps = container_of(data, struct kvm_kpit_state, timer); - struct kvm_pit *pt = pit_state_to_pit(ps); - - if (atomic_read(&ps->reinject)) - atomic_inc(&ps->pending); - - kthread_queue_work(&pt->worker, &pt->expired); - - if (ps->is_periodic) { - hrtimer_add_expires_ns(&ps->timer, ps->period); - return HRTIMER_RESTART; - } else - return HRTIMER_NORESTART; -} - -static inline void kvm_pit_reset_reinject(struct kvm_pit *pit) -{ - atomic_set(&pit->pit_state.pending, 0); - atomic_set(&pit->pit_state.irq_ack, 1); -} - -void kvm_pit_set_reinject(struct kvm_pit *pit, bool reinject) -{ - struct kvm_kpit_state *ps = &pit->pit_state; - struct kvm *kvm = pit->kvm; - - if (atomic_read(&ps->reinject) == reinject) - return; - - if (reinject) { - /* The initial state is preserved while ps->reinject == 0. 
*/ - kvm_pit_reset_reinject(pit); - kvm_register_irq_ack_notifier(kvm, &ps->irq_ack_notifier); - kvm_register_irq_mask_notifier(kvm, 0, &pit->mask_notifier); - } else { - kvm_unregister_irq_ack_notifier(kvm, &ps->irq_ack_notifier); - kvm_unregister_irq_mask_notifier(kvm, 0, &pit->mask_notifier); - } - - atomic_set(&ps->reinject, reinject); -} - -static void create_pit_timer(struct kvm_pit *pit, u32 val, int is_period) -{ - struct kvm_kpit_state *ps = &pit->pit_state; - struct kvm *kvm = pit->kvm; - s64 interval; - - if (!ioapic_in_kernel(kvm) || - ps->flags & KVM_PIT_FLAGS_HPET_LEGACY) - return; - - interval = mul_u64_u32_div(val, NSEC_PER_SEC, KVM_PIT_FREQ); - - pr_debug("create pit timer, interval is %llu nsec\n", interval); - - /* TODO The new value only affected after the retriggered */ - hrtimer_cancel(&ps->timer); - kthread_flush_work(&pit->expired); - ps->period = interval; - ps->is_periodic = is_period; - - kvm_pit_reset_reinject(pit); - - /* - * Do not allow the guest to program periodic timers with small - * interval, since the hrtimers are not throttled by the host - * scheduler. - */ - if (ps->is_periodic) { - s64 min_period = min_timer_period_us * 1000LL; - - if (ps->period < min_period) { - pr_info_ratelimited( - "kvm: requested %lld ns " - "i8254 timer period limited to %lld ns\n", - ps->period, min_period); - ps->period = min_period; - } - } - - hrtimer_start(&ps->timer, ktime_add_ns(ktime_get(), interval), - HRTIMER_MODE_ABS); -} - -static void pit_load_count(struct kvm_pit *pit, int channel, u32 val) -{ - struct kvm_kpit_state *ps = &pit->pit_state; - - pr_debug("load_count val is %d, channel is %d\n", val, channel); - - /* - * The largest possible initial count is 0; this is equivalent - * to 216 for binary counting and 104 for BCD counting. 
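create_pit_timer() above converts the programmed count into a nanosecond interval and, for the periodic modes, clamps it so a guest cannot flood the host with arbitrarily fast hrtimer expirations. A standalone sketch of the same computation; min_timer_period_us is a KVM module parameter defined elsewhere, and the 500 µs default used here is only an illustrative value.

    #include <stdint.h>
    #include <stdio.h>

    #define PIT_FREQ     1193181u          /* KVM_PIT_FREQ from i8254.h below */
    #define NSEC_PER_SEC 1000000000ull

    /* Illustrative stand-in for the min_timer_period_us module parameter. */
    static const uint64_t min_timer_period_us = 500;

    static uint64_t pit_interval_ns(uint32_t count, int periodic)
    {
        uint64_t ns = (uint64_t)((unsigned __int128)count *
                                 NSEC_PER_SEC / PIT_FREQ);
        uint64_t min_ns = min_timer_period_us * 1000;

        if (periodic && ns < min_ns)
            ns = min_ns;                    /* throttle over-eager guests */
        return ns;
    }

    int main(void)
    {
        /* a zero count means 0x10000 ticks, roughly 54.9 ms */
        printf("%llu ns\n", (unsigned long long)pit_interval_ns(0x10000, 1));
        return 0;
    }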
- */ - if (val == 0) - val = 0x10000; - - ps->channels[channel].count = val; - - if (channel != 0) { - ps->channels[channel].count_load_time = ktime_get(); - return; - } - - /* Two types of timer - * mode 1 is one shot, mode 2 is period, otherwise del timer */ - switch (ps->channels[0].mode) { - case 0: - case 1: - /* FIXME: enhance mode 4 precision */ - case 4: - create_pit_timer(pit, val, 0); - break; - case 2: - case 3: - create_pit_timer(pit, val, 1); - break; - default: - destroy_pit_timer(pit); - } -} - -void kvm_pit_load_count(struct kvm_pit *pit, int channel, u32 val, - int hpet_legacy_start) -{ - u8 saved_mode; - - WARN_ON_ONCE(!mutex_is_locked(&pit->pit_state.lock)); - - if (hpet_legacy_start) { - /* save existing mode for later reenablement */ - WARN_ON(channel != 0); - saved_mode = pit->pit_state.channels[0].mode; - pit->pit_state.channels[0].mode = 0xff; /* disable timer */ - pit_load_count(pit, channel, val); - pit->pit_state.channels[0].mode = saved_mode; - } else { - pit_load_count(pit, channel, val); - } -} - -static inline struct kvm_pit *dev_to_pit(struct kvm_io_device *dev) -{ - return container_of(dev, struct kvm_pit, dev); -} - -static inline struct kvm_pit *speaker_to_pit(struct kvm_io_device *dev) -{ - return container_of(dev, struct kvm_pit, speaker_dev); -} - -static inline int pit_in_range(gpa_t addr) -{ - return ((addr >= KVM_PIT_BASE_ADDRESS) && - (addr < KVM_PIT_BASE_ADDRESS + KVM_PIT_MEM_LENGTH)); -} - -static int pit_ioport_write(struct kvm_vcpu *vcpu, - struct kvm_io_device *this, - gpa_t addr, int len, const void *data) -{ - struct kvm_pit *pit = dev_to_pit(this); - struct kvm_kpit_state *pit_state = &pit->pit_state; - int channel, access; - struct kvm_kpit_channel_state *s; - u32 val = *(u32 *) data; - if (!pit_in_range(addr)) - return -EOPNOTSUPP; - - val &= 0xff; - addr &= KVM_PIT_CHANNEL_MASK; - - mutex_lock(&pit_state->lock); - - if (val != 0) - pr_debug("write addr is 0x%x, len is %d, val is 0x%x\n", - (unsigned int)addr, len, val); - - if (addr == 3) { - channel = val >> 6; - if (channel == 3) { - /* Read-Back Command. */ - for (channel = 0; channel < 3; channel++) { - s = &pit_state->channels[channel]; - if (val & (2 << channel)) { - if (!(val & 0x20)) - pit_latch_count(pit, channel); - if (!(val & 0x10)) - pit_latch_status(pit, channel); - } - } - } else { - /* Select Counter <channel>. */ - s = &pit_state->channels[channel]; - access = (val >> 4) & KVM_PIT_CHANNEL_MASK; - if (access == 0) { - pit_latch_count(pit, channel); - } else { - s->rw_mode = access; - s->read_state = access; - s->write_state = access; - s->mode = (val >> 1) & 7; - if (s->mode > 5) - s->mode -= 4; - s->bcd = val & 1; - } - } - } else { - /* Write Count. 
*/ - s = &pit_state->channels[addr]; - switch (s->write_state) { - default: - case RW_STATE_LSB: - pit_load_count(pit, addr, val); - break; - case RW_STATE_MSB: - pit_load_count(pit, addr, val << 8); - break; - case RW_STATE_WORD0: - s->write_latch = val; - s->write_state = RW_STATE_WORD1; - break; - case RW_STATE_WORD1: - pit_load_count(pit, addr, s->write_latch | (val << 8)); - s->write_state = RW_STATE_WORD0; - break; - } - } - - mutex_unlock(&pit_state->lock); - return 0; -} - -static int pit_ioport_read(struct kvm_vcpu *vcpu, - struct kvm_io_device *this, - gpa_t addr, int len, void *data) -{ - struct kvm_pit *pit = dev_to_pit(this); - struct kvm_kpit_state *pit_state = &pit->pit_state; - int ret, count; - struct kvm_kpit_channel_state *s; - if (!pit_in_range(addr)) - return -EOPNOTSUPP; - - addr &= KVM_PIT_CHANNEL_MASK; - if (addr == 3) - return 0; - - s = &pit_state->channels[addr]; - - mutex_lock(&pit_state->lock); - - if (s->status_latched) { - s->status_latched = 0; - ret = s->status; - } else if (s->count_latched) { - switch (s->count_latched) { - default: - case RW_STATE_LSB: - ret = s->latched_count & 0xff; - s->count_latched = 0; - break; - case RW_STATE_MSB: - ret = s->latched_count >> 8; - s->count_latched = 0; - break; - case RW_STATE_WORD0: - ret = s->latched_count & 0xff; - s->count_latched = RW_STATE_MSB; - break; - } - } else { - switch (s->read_state) { - default: - case RW_STATE_LSB: - count = pit_get_count(pit, addr); - ret = count & 0xff; - break; - case RW_STATE_MSB: - count = pit_get_count(pit, addr); - ret = (count >> 8) & 0xff; - break; - case RW_STATE_WORD0: - count = pit_get_count(pit, addr); - ret = count & 0xff; - s->read_state = RW_STATE_WORD1; - break; - case RW_STATE_WORD1: - count = pit_get_count(pit, addr); - ret = (count >> 8) & 0xff; - s->read_state = RW_STATE_WORD0; - break; - } - } - - if (len > sizeof(ret)) - len = sizeof(ret); - memcpy(data, (char *)&ret, len); - - mutex_unlock(&pit_state->lock); - return 0; -} - -static int speaker_ioport_write(struct kvm_vcpu *vcpu, - struct kvm_io_device *this, - gpa_t addr, int len, const void *data) -{ - struct kvm_pit *pit = speaker_to_pit(this); - struct kvm_kpit_state *pit_state = &pit->pit_state; - u32 val = *(u32 *) data; - if (addr != KVM_SPEAKER_BASE_ADDRESS) - return -EOPNOTSUPP; - - mutex_lock(&pit_state->lock); - pit_state->speaker_data_on = (val >> 1) & 1; - pit_set_gate(pit, 2, val & 1); - mutex_unlock(&pit_state->lock); - return 0; -} - -static int speaker_ioport_read(struct kvm_vcpu *vcpu, - struct kvm_io_device *this, - gpa_t addr, int len, void *data) -{ - struct kvm_pit *pit = speaker_to_pit(this); - struct kvm_kpit_state *pit_state = &pit->pit_state; - unsigned int refresh_clock; - int ret; - if (addr != KVM_SPEAKER_BASE_ADDRESS) - return -EOPNOTSUPP; - - /* Refresh clock toggles at about 15us. We approximate as 2^14ns. 
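Both the write path at the end of pit_ioport_write() and the read path above drive a small per-channel state machine so that a 16-bit count can cross the 8-bit data port one byte at a time. A minimal sketch of the write side, with states named after the RW_STATE_* constants defined at the top of i8254.c:

    #include <stdint.h>

    enum rw_state { RW_LSB = 1, RW_MSB, RW_WORD0, RW_WORD1 };

    struct channel {
        enum rw_state write_state;
        uint8_t       write_latch;   /* LSB parked while waiting for the MSB */
        uint32_t      count;         /* last fully programmed count */
    };

    static void write_count_byte(struct channel *c, uint8_t val)
    {
        switch (c->write_state) {
        case RW_LSB:
            c->count = val;                          /* LSB-only access mode */
            break;
        case RW_MSB:
            c->count = (uint32_t)val << 8;           /* MSB-only access mode */
            break;
        case RW_WORD0:
            c->write_latch = val;                    /* first byte: park the LSB */
            c->write_state = RW_WORD1;
            break;
        case RW_WORD1:
            c->count = c->write_latch | ((uint32_t)val << 8);
            c->write_state = RW_WORD0;               /* ready for the next pair */
            break;
        }
    }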
*/ - refresh_clock = ((unsigned int)ktime_to_ns(ktime_get()) >> 14) & 1; - - mutex_lock(&pit_state->lock); - ret = ((pit_state->speaker_data_on << 1) | pit_get_gate(pit, 2) | - (pit_get_out(pit, 2) << 5) | (refresh_clock << 4)); - if (len > sizeof(ret)) - len = sizeof(ret); - memcpy(data, (char *)&ret, len); - mutex_unlock(&pit_state->lock); - return 0; -} - -static void kvm_pit_reset(struct kvm_pit *pit) -{ - int i; - struct kvm_kpit_channel_state *c; - - pit->pit_state.flags = 0; - for (i = 0; i < 3; i++) { - c = &pit->pit_state.channels[i]; - c->mode = 0xff; - c->gate = (i != 2); - pit_load_count(pit, i, 0); - } - - kvm_pit_reset_reinject(pit); -} - -static void pit_mask_notifer(struct kvm_irq_mask_notifier *kimn, bool mask) -{ - struct kvm_pit *pit = container_of(kimn, struct kvm_pit, mask_notifier); - - if (!mask) - kvm_pit_reset_reinject(pit); -} - -static const struct kvm_io_device_ops pit_dev_ops = { - .read = pit_ioport_read, - .write = pit_ioport_write, -}; - -static const struct kvm_io_device_ops speaker_dev_ops = { - .read = speaker_ioport_read, - .write = speaker_ioport_write, -}; - -struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags) -{ - struct kvm_pit *pit; - struct kvm_kpit_state *pit_state; - struct pid *pid; - pid_t pid_nr; - int ret; - - pit = kzalloc(sizeof(struct kvm_pit), GFP_KERNEL); - if (!pit) - return NULL; - - pit->irq_source_id = kvm_request_irq_source_id(kvm); - if (pit->irq_source_id < 0) - goto fail_request; - - mutex_init(&pit->pit_state.lock); - - pid = get_pid(task_tgid(current)); - pid_nr = pid_vnr(pid); - put_pid(pid); - - kthread_init_worker(&pit->worker); - pit->worker_task = kthread_run(kthread_worker_fn, &pit->worker, - "kvm-pit/%d", pid_nr); - if (IS_ERR(pit->worker_task)) - goto fail_kthread; - - kthread_init_work(&pit->expired, pit_do_work); - - pit->kvm = kvm; - - pit_state = &pit->pit_state; - hrtimer_init(&pit_state->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); - pit_state->timer.function = pit_timer_fn; - - pit_state->irq_ack_notifier.gsi = 0; - pit_state->irq_ack_notifier.irq_acked = kvm_pit_ack_irq; - pit->mask_notifier.func = pit_mask_notifer; - - kvm_pit_reset(pit); - - kvm_pit_set_reinject(pit, true); - - mutex_lock(&kvm->slots_lock); - kvm_iodevice_init(&pit->dev, &pit_dev_ops); - ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, KVM_PIT_BASE_ADDRESS, - KVM_PIT_MEM_LENGTH, &pit->dev); - if (ret < 0) - goto fail_register_pit; - - if (flags & KVM_PIT_SPEAKER_DUMMY) { - kvm_iodevice_init(&pit->speaker_dev, &speaker_dev_ops); - ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, - KVM_SPEAKER_BASE_ADDRESS, 4, - &pit->speaker_dev); - if (ret < 0) - goto fail_register_speaker; - } - mutex_unlock(&kvm->slots_lock); - - return pit; - -fail_register_speaker: - kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &pit->dev); -fail_register_pit: - mutex_unlock(&kvm->slots_lock); - kvm_pit_set_reinject(pit, false); - kthread_stop(pit->worker_task); -fail_kthread: - kvm_free_irq_source_id(kvm, pit->irq_source_id); -fail_request: - kfree(pit); - return NULL; -} - -void kvm_free_pit(struct kvm *kvm) -{ - struct kvm_pit *pit = kvm->arch.vpit; - - if (pit) { - kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &pit->dev); - kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &pit->speaker_dev); - kvm_pit_set_reinject(pit, false); - hrtimer_cancel(&pit->pit_state.timer); - kthread_flush_work(&pit->expired); - kthread_stop(pit->worker_task); - kvm_free_irq_source_id(kvm, pit->irq_source_id); - kfree(pit); - } -} diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h deleted 
file mode 100644 index 2f5af07..0000000 --- a/arch/x86/kvm/i8254.h +++ /dev/null @@ -1,66 +0,0 @@ -#ifndef __I8254_H -#define __I8254_H - -#include <linux/kthread.h> - -#include <kvm/iodev.h> - -struct kvm_kpit_channel_state { - u32 count; /* can be 65536 */ - u16 latched_count; - u8 count_latched; - u8 status_latched; - u8 status; - u8 read_state; - u8 write_state; - u8 write_latch; - u8 rw_mode; - u8 mode; - u8 bcd; /* not supported */ - u8 gate; /* timer start */ - ktime_t count_load_time; -}; - -struct kvm_kpit_state { - /* All members before "struct mutex lock" are protected by the lock. */ - struct kvm_kpit_channel_state channels[3]; - u32 flags; - bool is_periodic; - s64 period; /* unit: ns */ - struct hrtimer timer; - u32 speaker_data_on; - - struct mutex lock; - atomic_t reinject; - atomic_t pending; /* accumulated triggered timers */ - atomic_t irq_ack; - struct kvm_irq_ack_notifier irq_ack_notifier; -}; - -struct kvm_pit { - struct kvm_io_device dev; - struct kvm_io_device speaker_dev; - struct kvm *kvm; - struct kvm_kpit_state pit_state; - int irq_source_id; - struct kvm_irq_mask_notifier mask_notifier; - struct kthread_worker worker; - struct task_struct *worker_task; - struct kthread_work expired; -}; - -#define KVM_PIT_BASE_ADDRESS 0x40 -#define KVM_SPEAKER_BASE_ADDRESS 0x61 -#define KVM_PIT_MEM_LENGTH 4 -#define KVM_PIT_FREQ 1193181 -#define KVM_MAX_PIT_INTR_INTERVAL HZ / 100 -#define KVM_PIT_CHANNEL_MASK 0x3 - -struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags); -void kvm_free_pit(struct kvm *kvm); - -void kvm_pit_load_count(struct kvm_pit *pit, int channel, u32 val, - int hpet_legacy_start); -void kvm_pit_set_reinject(struct kvm_pit *pit, bool reinject); - -#endif diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index 7cc2360..c178239 100644..100755 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -4,6 +4,7 @@ * Copyright (c) 2003-2004 Fabrice Bellard * Copyright (c) 2007 Intel Corporation * Copyright 2009 Red Hat, Inc. and/or its affiliates. + * Copyright 2019 Google LLC * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -26,13 +27,11 @@ * Yaozu (Eddie) Dong <Eddie.dong@intel.com> * Port from Qemu. */ -#include <linux/mm.h> -#include <linux/slab.h> -#include <linux/bitops.h> #include "irq.h" - #include <linux/kvm_host.h> -#include "trace.h" + +#include <ntddk.h> +#include <gvm_types.h> #define pr_pic_unimpl(fmt, ...) \ pr_err_ratelimited("kvm: pic: " fmt, ## __VA_ARGS__) @@ -40,13 +39,11 @@ static void pic_irq_request(struct kvm *kvm, int level); static void pic_lock(struct kvm_pic *s) - __acquires(&s->lock) { spin_lock(&s->lock); } static void pic_unlock(struct kvm_pic *s) - __releases(&s->lock) { bool wakeup = s->wakeup_needed; struct kvm_vcpu *vcpu, *found = NULL; @@ -67,7 +64,7 @@ static void pic_unlock(struct kvm_pic *s) if (!found) return; - kvm_make_request(KVM_REQ_EVENT, found); + kvm_make_request(GVM_REQ_EVENT, found); kvm_vcpu_kick(found); } } @@ -84,7 +81,7 @@ static void pic_clear_isr(struct kvm_kpic_state *s, int irq) * it should be safe since PIC state is already updated at this stage. 
*/ pic_unlock(s->pics_state); - kvm_notify_acked_irq(s->pics_state->kvm, SELECT_PIC(irq), irq); + //kvm_notify_acked_irq(s->pics_state->kvm, SELECT_PIC(irq), irq); pic_lock(s->pics_state); } @@ -199,8 +196,6 @@ int kvm_pic_set_irq(struct kvm_pic *s, int irq, int irq_source_id, int level) irq_source_id, level); ret = pic_set_irq1(&s->pics[irq >> 3], irq & 7, irq_level); pic_update_irq(s); - trace_kvm_pic_set_irq(irq >> 3, irq & 7, s->pics[irq >> 3].elcr, - s->pics[irq >> 3].imr, ret == 0); pic_unlock(s); return ret; @@ -620,16 +615,16 @@ struct kvm_pic *kvm_create_pic(struct kvm *kvm) kvm_iodevice_init(&s->dev_slave, &picdev_slave_ops); kvm_iodevice_init(&s->dev_eclr, &picdev_eclr_ops); mutex_lock(&kvm->slots_lock); - ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, 0x20, 2, + ret = kvm_io_bus_register_dev(kvm, GVM_PIO_BUS, 0x20, 2, &s->dev_master); if (ret < 0) goto fail_unlock; - ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, 0xa0, 2, &s->dev_slave); + ret = kvm_io_bus_register_dev(kvm, GVM_PIO_BUS, 0xa0, 2, &s->dev_slave); if (ret < 0) goto fail_unreg_2; - ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, 0x4d0, 2, &s->dev_eclr); + ret = kvm_io_bus_register_dev(kvm, GVM_PIO_BUS, 0x4d0, 2, &s->dev_eclr); if (ret < 0) goto fail_unreg_1; @@ -638,10 +633,10 @@ struct kvm_pic *kvm_create_pic(struct kvm *kvm) return s; fail_unreg_1: - kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &s->dev_slave); + kvm_io_bus_unregister_dev(kvm, GVM_PIO_BUS, &s->dev_slave); fail_unreg_2: - kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &s->dev_master); + kvm_io_bus_unregister_dev(kvm, GVM_PIO_BUS, &s->dev_master); fail_unlock: mutex_unlock(&kvm->slots_lock); @@ -653,8 +648,8 @@ fail_unlock: void kvm_destroy_pic(struct kvm_pic *vpic) { - kvm_io_bus_unregister_dev(vpic->kvm, KVM_PIO_BUS, &vpic->dev_master); - kvm_io_bus_unregister_dev(vpic->kvm, KVM_PIO_BUS, &vpic->dev_slave); - kvm_io_bus_unregister_dev(vpic->kvm, KVM_PIO_BUS, &vpic->dev_eclr); + kvm_io_bus_unregister_dev(vpic->kvm, GVM_PIO_BUS, &vpic->dev_master); + kvm_io_bus_unregister_dev(vpic->kvm, GVM_PIO_BUS, &vpic->dev_slave); + kvm_io_bus_unregister_dev(vpic->kvm, GVM_PIO_BUS, &vpic->dev_eclr); kfree(vpic); } diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c index 6e219e5..4e2c62b 100644..100755 --- a/arch/x86/kvm/ioapic.c +++ b/arch/x86/kvm/ioapic.c @@ -1,6 +1,7 @@ /* * Copyright (C) 2001 MandrakeSoft S.A. * Copyright 2010 Red Hat, Inc. and/or its affiliates. + * Copyright 2019 Google LLC * * MandrakeSoft S.A. * 43, rue d'Aboukir @@ -28,36 +29,26 @@ */ #include <linux/kvm_host.h> -#include <linux/kvm.h> -#include <linux/mm.h> -#include <linux/highmem.h> -#include <linux/smp.h> -#include <linux/hrtimer.h> -#include <linux/io.h> -#include <linux/slab.h> -#include <linux/export.h> -#include <asm/processor.h> -#include <asm/page.h> -#include <asm/current.h> -#include <trace/events/kvm.h> +#include <uapi/linux/kvm.h> #include "ioapic.h" #include "lapic.h" #include "irq.h" -#if 0 -#define ioapic_debug(fmt,arg...) printk(KERN_WARNING fmt,##arg) +#ifdef KVM_IOAPIC_DEBUG +#define ioapic_debug DbgPrint #else -#define ioapic_debug(fmt, arg...) +#define ioapic_debug(fmt,...) 
#endif + static int ioapic_service(struct kvm_ioapic *vioapic, int irq, bool line_status); -static unsigned long ioapic_read_indirect(struct kvm_ioapic *ioapic, - unsigned long addr, - unsigned long length) +static size_t ioapic_read_indirect(struct kvm_ioapic *ioapic, + size_t addr, + size_t length) { - unsigned long result = 0; + size_t result = 0; switch (ioapic->ioregsel) { case IOAPIC_REG_VERSION: @@ -94,7 +85,7 @@ static unsigned long ioapic_read_indirect(struct kvm_ioapic *ioapic, static void rtc_irq_eoi_tracking_reset(struct kvm_ioapic *ioapic) { ioapic->rtc_status.pending_eoi = 0; - bitmap_zero(ioapic->rtc_status.dest_map.map, KVM_MAX_VCPU_ID); + bitmap_zero(ioapic->rtc_status.dest_map.map, GVM_MAX_VCPU_ID); } static void kvm_rtc_eoi_tracking_restore_all(struct kvm_ioapic *ioapic); @@ -148,9 +139,6 @@ static void kvm_rtc_eoi_tracking_restore_all(struct kvm_ioapic *ioapic) struct kvm_vcpu *vcpu; int i; - if (RTC_GSI >= IOAPIC_NUM_PINS) - return; - rtc_irq_eoi_tracking_reset(ioapic); kvm_for_each_vcpu(i, vcpu, ioapic->kvm) __rtc_irq_eoi_tracking_restore_one(vcpu); @@ -220,11 +208,10 @@ static int ioapic_set_irq(struct kvm_ioapic *ioapic, unsigned int irq, ret = ioapic_service(ioapic, irq, line_status); out: - trace_kvm_ioapic_set_irq(entry.bits, irq, ret == 0); return ret; } -static void kvm_ioapic_inject_all(struct kvm_ioapic *ioapic, unsigned long irr) +static void kvm_ioapic_inject_all(struct kvm_ioapic *ioapic, size_t irr) { u32 idx; @@ -253,7 +240,7 @@ void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, ulong *ioapic_handled_vectors) for (index = 0; index < IOAPIC_NUM_PINS; index++) { e = &ioapic->redirtbl[index]; if (e->fields.trig_mode == IOAPIC_LEVEL_TRIG || - kvm_irq_has_notifier(ioapic->kvm, KVM_IRQCHIP_IOAPIC, index) || + //kvm_irq_has_notifier(ioapic->kvm, GVM_IRQCHIP_IOAPIC, index) || index == RTC_GSI) { if (kvm_apic_match_dest(vcpu, NULL, 0, e->fields.dest_id, e->fields.dest_mode) || @@ -311,7 +298,7 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val) } mask_after = e->fields.mask; if (mask_before != mask_after) - kvm_fire_mask_notifiers(ioapic->kvm, KVM_IRQCHIP_IOAPIC, index, mask_after); + kvm_fire_mask_notifiers(ioapic->kvm, GVM_IRQCHIP_IOAPIC, index, mask_after); if (e->fields.trig_mode == IOAPIC_LEVEL_TRIG && ioapic->irr & (1 << index)) ioapic_service(ioapic, index, false); @@ -389,31 +376,11 @@ void kvm_ioapic_clear_all(struct kvm_ioapic *ioapic, int irq_source_id) int i; spin_lock(&ioapic->lock); - for (i = 0; i < KVM_IOAPIC_NUM_PINS; i++) + for (i = 0; i < GVM_IOAPIC_NUM_PINS; i++) __clear_bit(irq_source_id, &ioapic->irq_states[i]); spin_unlock(&ioapic->lock); } -static void kvm_ioapic_eoi_inject_work(struct work_struct *work) -{ - int i; - struct kvm_ioapic *ioapic = container_of(work, struct kvm_ioapic, - eoi_inject.work); - spin_lock(&ioapic->lock); - for (i = 0; i < IOAPIC_NUM_PINS; i++) { - union kvm_ioapic_redirect_entry *ent = &ioapic->redirtbl[i]; - - if (ent->fields.trig_mode != IOAPIC_LEVEL_TRIG) - continue; - - if (ioapic->irr & (1 << i) && !ent->fields.remote_irr) - ioapic_service(ioapic, i, false); - } - spin_unlock(&ioapic->lock); -} - -#define IOAPIC_SUCCESSIVE_IRQ_MAX_COUNT 10000 - static void __kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu, struct kvm_ioapic *ioapic, int vector, int trigger_mode) { @@ -441,7 +408,7 @@ static void __kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu, * after ack notifier returns. 
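ioapic_read_indirect() and ioapic_write_indirect() above implement the standard IOAPIC programming model: software writes a register index into IOREGSEL and then moves 32 bits at a time through IOWIN, with redirection-table entry i split across register 0x10 + 2*i (low half) and the following register (high half). A sketch of that index arithmetic; the register numbering comes from the 82093AA datasheet that this device emulates, not from the hunk above.

    #include <stdint.h>

    #define IOAPIC_REG_REDIR_BASE 0x10   /* first redirection-table register */

    /* Map an IOREGSEL value onto (entry index, low/high half). */
    static void decode_redir_reg(uint32_t ioregsel,
                                 uint32_t *entry, int *high_half)
    {
        uint32_t off = ioregsel - IOAPIC_REG_REDIR_BASE;

        *entry     = off >> 1;    /* two 32-bit registers per 64-bit entry */
        *high_half = off & 1;     /* odd register index selects bits 63:32 */
    }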
*/ spin_unlock(&ioapic->lock); - kvm_notify_acked_irq(ioapic->kvm, KVM_IRQCHIP_IOAPIC, i); + //kvm_notify_acked_irq(ioapic->kvm, GVM_IRQCHIP_IOAPIC, i); spin_lock(&ioapic->lock); if (trigger_mode != IOAPIC_LEVEL_TRIG || @@ -452,21 +419,7 @@ static void __kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu, ent->fields.remote_irr = 0; if (!ent->fields.mask && (ioapic->irr & (1 << i))) { ++ioapic->irq_eoi[i]; - if (ioapic->irq_eoi[i] == IOAPIC_SUCCESSIVE_IRQ_MAX_COUNT) { - /* - * Real hardware does not deliver the interrupt - * immediately during eoi broadcast, and this - * lets a buggy guest make slow progress - * even if it does not correctly handle a - * level-triggered interrupt. Emulate this - * behavior if we detect an interrupt storm. - */ - schedule_delayed_work(&ioapic->eoi_inject, HZ / 100); - ioapic->irq_eoi[i] = 0; - trace_kvm_ioapic_delayed_eoi_inj(ent->bits); - } else { - ioapic_service(ioapic, i, false); - } + ioapic_service(ioapic, i, false); } else { ioapic->irq_eoi[i] = 0; } @@ -501,7 +454,7 @@ static int ioapic_mmio_read(struct kvm_vcpu *vcpu, struct kvm_io_device *this, if (!ioapic_in_range(ioapic, addr)) return -EOPNOTSUPP; - ioapic_debug("addr %lx\n", (unsigned long)addr); + ioapic_debug("addr %lx\n", (size_t)addr); ASSERT(!(addr & 0xf)); /* check alignment */ addr &= 0xff; @@ -586,7 +539,6 @@ static void kvm_ioapic_reset(struct kvm_ioapic *ioapic) { int i; - cancel_delayed_work_sync(&ioapic->eoi_inject); for (i = 0; i < IOAPIC_NUM_PINS; i++) ioapic->redirtbl[i].fields.mask = 1; ioapic->base_address = IOAPIC_DEFAULT_BASE_ADDRESS; @@ -612,13 +564,12 @@ int kvm_ioapic_init(struct kvm *kvm) if (!ioapic) return -ENOMEM; spin_lock_init(&ioapic->lock); - INIT_DELAYED_WORK(&ioapic->eoi_inject, kvm_ioapic_eoi_inject_work); kvm->arch.vioapic = ioapic; kvm_ioapic_reset(ioapic); kvm_iodevice_init(&ioapic->dev, &ioapic_mmio_ops); ioapic->kvm = kvm; mutex_lock(&kvm->slots_lock); - ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, ioapic->base_address, + ret = kvm_io_bus_register_dev(kvm, GVM_MMIO_BUS, ioapic->base_address, IOAPIC_MEM_LENGTH, &ioapic->dev); mutex_unlock(&kvm->slots_lock); if (ret < 0) { @@ -635,8 +586,7 @@ void kvm_ioapic_destroy(struct kvm *kvm) { struct kvm_ioapic *ioapic = kvm->arch.vioapic; - cancel_delayed_work_sync(&ioapic->eoi_inject); - kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS, &ioapic->dev); + kvm_io_bus_unregister_dev(kvm, GVM_MMIO_BUS, &ioapic->dev); kvm->arch.vioapic = NULL; kfree(ioapic); } diff --git a/arch/x86/kvm/ioapic.h b/arch/x86/kvm/ioapic.h index 1cc6e54..854f770 100644..100755 --- a/arch/x86/kvm/ioapic.h +++ b/arch/x86/kvm/ioapic.h @@ -1,3 +1,7 @@ +/* + * Copyright 2019 Google LLC + */ + #ifndef __KVM_IO_APIC_H #define __KVM_IO_APIC_H @@ -5,11 +9,13 @@ #include <kvm/iodev.h> +#include <gvm_types.h> + struct kvm; struct kvm_vcpu; -#define IOAPIC_NUM_PINS KVM_IOAPIC_NUM_PINS -#define MAX_NR_RESERVED_IOAPIC_PINS KVM_MAX_IRQ_ROUTES +#define IOAPIC_NUM_PINS GVM_IOAPIC_NUM_PINS +#define MAX_NR_RESERVED_IOAPIC_PINS GVM_MAX_IRQ_ROUTES #define IOAPIC_VERSION_ID 0x11 /* IOAPIC version */ #define IOAPIC_EDGE_TRIG 0 #define IOAPIC_LEVEL_TRIG 1 @@ -34,21 +40,17 @@ struct kvm_vcpu; #define IOAPIC_INIT 0x5 #define IOAPIC_EXTINT 0x7 -#ifdef CONFIG_X86 #define RTC_GSI 8 -#else -#define RTC_GSI -1U -#endif struct dest_map { /* vcpu bitmap where IRQ has been sent */ - DECLARE_BITMAP(map, KVM_MAX_VCPU_ID); + DECLARE_BITMAP(map, GVM_MAX_VCPU_ID); /* * Vector sent to a given vcpu, only valid when * the vcpu's bit in map is set */ - u8 vectors[KVM_MAX_VCPU_ID]; + u8 
vectors[GVM_MAX_VCPU_ID]; }; @@ -81,30 +83,16 @@ struct kvm_ioapic { u32 irr; u32 pad; union kvm_ioapic_redirect_entry redirtbl[IOAPIC_NUM_PINS]; - unsigned long irq_states[IOAPIC_NUM_PINS]; + size_t irq_states[IOAPIC_NUM_PINS]; struct kvm_io_device dev; struct kvm *kvm; void (*ack_notifier)(void *opaque, int irq); spinlock_t lock; struct rtc_status rtc_status; - struct delayed_work eoi_inject; u32 irq_eoi[IOAPIC_NUM_PINS]; u32 irr_delivered; }; -#ifdef DEBUG -#define ASSERT(x) \ -do { \ - if (!(x)) { \ - printk(KERN_EMERG "assertion failed %s: %d: %s\n", \ - __FILE__, __LINE__, #x); \ - BUG(); \ - } \ -} while (0) -#else -#define ASSERT(x) do { } while (0) -#endif - static inline struct kvm_ioapic *ioapic_irqchip(struct kvm *kvm) { return kvm->arch.vioapic; diff --git a/arch/x86/kvm/iommu.c b/arch/x86/kvm/iommu.c deleted file mode 100644 index b181426..0000000 --- a/arch/x86/kvm/iommu.c +++ /dev/null @@ -1,356 +0,0 @@ -/* - * Copyright (c) 2006, Intel Corporation. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. - * - * You should have received a copy of the GNU General Public License along with - * this program; if not, write to the Free Software Foundation, Inc., 59 Temple - * Place - Suite 330, Boston, MA 02111-1307 USA. - * - * Copyright (C) 2006-2008 Intel Corporation - * Copyright IBM Corporation, 2008 - * Copyright 2010 Red Hat, Inc. and/or its affiliates. - * - * Author: Allen M. 
Kay <allen.m.kay@intel.com> - * Author: Weidong Han <weidong.han@intel.com> - * Author: Ben-Ami Yassour <benami@il.ibm.com> - */ - -#include <linux/list.h> -#include <linux/kvm_host.h> -#include <linux/moduleparam.h> -#include <linux/pci.h> -#include <linux/stat.h> -#include <linux/iommu.h> -#include "assigned-dev.h" - -static bool allow_unsafe_assigned_interrupts; -module_param_named(allow_unsafe_assigned_interrupts, - allow_unsafe_assigned_interrupts, bool, S_IRUGO | S_IWUSR); -MODULE_PARM_DESC(allow_unsafe_assigned_interrupts, - "Enable device assignment on platforms without interrupt remapping support."); - -static int kvm_iommu_unmap_memslots(struct kvm *kvm); -static void kvm_iommu_put_pages(struct kvm *kvm, - gfn_t base_gfn, unsigned long npages); - -static kvm_pfn_t kvm_pin_pages(struct kvm_memory_slot *slot, gfn_t gfn, - unsigned long npages) -{ - gfn_t end_gfn; - kvm_pfn_t pfn; - - pfn = gfn_to_pfn_memslot(slot, gfn); - end_gfn = gfn + npages; - gfn += 1; - - if (is_error_noslot_pfn(pfn)) - return pfn; - - while (gfn < end_gfn) - gfn_to_pfn_memslot(slot, gfn++); - - return pfn; -} - -static void kvm_unpin_pages(struct kvm *kvm, kvm_pfn_t pfn, - unsigned long npages) -{ - unsigned long i; - - for (i = 0; i < npages; ++i) - kvm_release_pfn_clean(pfn + i); -} - -int kvm_iommu_map_pages(struct kvm *kvm, struct kvm_memory_slot *slot) -{ - gfn_t gfn, end_gfn; - kvm_pfn_t pfn; - int r = 0; - struct iommu_domain *domain = kvm->arch.iommu_domain; - int flags; - - /* check if iommu exists and in use */ - if (!domain) - return 0; - - gfn = slot->base_gfn; - end_gfn = gfn + slot->npages; - - flags = IOMMU_READ; - if (!(slot->flags & KVM_MEM_READONLY)) - flags |= IOMMU_WRITE; - if (!kvm->arch.iommu_noncoherent) - flags |= IOMMU_CACHE; - - - while (gfn < end_gfn) { - unsigned long page_size; - - /* Check if already mapped */ - if (iommu_iova_to_phys(domain, gfn_to_gpa(gfn))) { - gfn += 1; - continue; - } - - /* Get the page size we could use to map */ - page_size = kvm_host_page_size(kvm, gfn); - - /* Make sure the page_size does not exceed the memslot */ - while ((gfn + (page_size >> PAGE_SHIFT)) > end_gfn) - page_size >>= 1; - - /* Make sure gfn is aligned to the page size we want to map */ - while ((gfn << PAGE_SHIFT) & (page_size - 1)) - page_size >>= 1; - - /* Make sure hva is aligned to the page size we want to map */ - while (__gfn_to_hva_memslot(slot, gfn) & (page_size - 1)) - page_size >>= 1; - - /* - * Pin all pages we are about to map in memory. This is - * important because we unmap and unpin in 4kb steps later. 
- */ - pfn = kvm_pin_pages(slot, gfn, page_size >> PAGE_SHIFT); - if (is_error_noslot_pfn(pfn)) { - gfn += 1; - continue; - } - - /* Map into IO address space */ - r = iommu_map(domain, gfn_to_gpa(gfn), pfn_to_hpa(pfn), - page_size, flags); - if (r) { - printk(KERN_ERR "kvm_iommu_map_address:" - "iommu failed to map pfn=%llx\n", pfn); - kvm_unpin_pages(kvm, pfn, page_size >> PAGE_SHIFT); - goto unmap_pages; - } - - gfn += page_size >> PAGE_SHIFT; - - cond_resched(); - } - - return 0; - -unmap_pages: - kvm_iommu_put_pages(kvm, slot->base_gfn, gfn - slot->base_gfn); - return r; -} - -static int kvm_iommu_map_memslots(struct kvm *kvm) -{ - int idx, r = 0; - struct kvm_memslots *slots; - struct kvm_memory_slot *memslot; - - if (kvm->arch.iommu_noncoherent) - kvm_arch_register_noncoherent_dma(kvm); - - idx = srcu_read_lock(&kvm->srcu); - slots = kvm_memslots(kvm); - - kvm_for_each_memslot(memslot, slots) { - r = kvm_iommu_map_pages(kvm, memslot); - if (r) - break; - } - srcu_read_unlock(&kvm->srcu, idx); - - return r; -} - -int kvm_assign_device(struct kvm *kvm, struct pci_dev *pdev) -{ - struct iommu_domain *domain = kvm->arch.iommu_domain; - int r; - bool noncoherent; - - /* check if iommu exists and in use */ - if (!domain) - return 0; - - if (pdev == NULL) - return -ENODEV; - - r = iommu_attach_device(domain, &pdev->dev); - if (r) { - dev_err(&pdev->dev, "kvm assign device failed ret %d", r); - return r; - } - - noncoherent = !iommu_capable(&pci_bus_type, IOMMU_CAP_CACHE_COHERENCY); - - /* Check if need to update IOMMU page table for guest memory */ - if (noncoherent != kvm->arch.iommu_noncoherent) { - kvm_iommu_unmap_memslots(kvm); - kvm->arch.iommu_noncoherent = noncoherent; - r = kvm_iommu_map_memslots(kvm); - if (r) - goto out_unmap; - } - - kvm_arch_start_assignment(kvm); - pci_set_dev_assigned(pdev); - - dev_info(&pdev->dev, "kvm assign device\n"); - - return 0; -out_unmap: - kvm_iommu_unmap_memslots(kvm); - return r; -} - -int kvm_deassign_device(struct kvm *kvm, struct pci_dev *pdev) -{ - struct iommu_domain *domain = kvm->arch.iommu_domain; - - /* check if iommu exists and in use */ - if (!domain) - return 0; - - if (pdev == NULL) - return -ENODEV; - - iommu_detach_device(domain, &pdev->dev); - - pci_clear_dev_assigned(pdev); - kvm_arch_end_assignment(kvm); - - dev_info(&pdev->dev, "kvm deassign device\n"); - - return 0; -} - -int kvm_iommu_map_guest(struct kvm *kvm) -{ - int r; - - if (!iommu_present(&pci_bus_type)) { - printk(KERN_ERR "%s: iommu not found\n", __func__); - return -ENODEV; - } - - mutex_lock(&kvm->slots_lock); - - kvm->arch.iommu_domain = iommu_domain_alloc(&pci_bus_type); - if (!kvm->arch.iommu_domain) { - r = -ENOMEM; - goto out_unlock; - } - - if (!allow_unsafe_assigned_interrupts && - !iommu_capable(&pci_bus_type, IOMMU_CAP_INTR_REMAP)) { - printk(KERN_WARNING "%s: No interrupt remapping support," - " disallowing device assignment." 
- " Re-enable with \"allow_unsafe_assigned_interrupts=1\"" - " module option.\n", __func__); - iommu_domain_free(kvm->arch.iommu_domain); - kvm->arch.iommu_domain = NULL; - r = -EPERM; - goto out_unlock; - } - - r = kvm_iommu_map_memslots(kvm); - if (r) - kvm_iommu_unmap_memslots(kvm); - -out_unlock: - mutex_unlock(&kvm->slots_lock); - return r; -} - -static void kvm_iommu_put_pages(struct kvm *kvm, - gfn_t base_gfn, unsigned long npages) -{ - struct iommu_domain *domain; - gfn_t end_gfn, gfn; - kvm_pfn_t pfn; - u64 phys; - - domain = kvm->arch.iommu_domain; - end_gfn = base_gfn + npages; - gfn = base_gfn; - - /* check if iommu exists and in use */ - if (!domain) - return; - - while (gfn < end_gfn) { - unsigned long unmap_pages; - size_t size; - - /* Get physical address */ - phys = iommu_iova_to_phys(domain, gfn_to_gpa(gfn)); - - if (!phys) { - gfn++; - continue; - } - - pfn = phys >> PAGE_SHIFT; - - /* Unmap address from IO address space */ - size = iommu_unmap(domain, gfn_to_gpa(gfn), PAGE_SIZE); - unmap_pages = 1ULL << get_order(size); - - /* Unpin all pages we just unmapped to not leak any memory */ - kvm_unpin_pages(kvm, pfn, unmap_pages); - - gfn += unmap_pages; - - cond_resched(); - } -} - -void kvm_iommu_unmap_pages(struct kvm *kvm, struct kvm_memory_slot *slot) -{ - kvm_iommu_put_pages(kvm, slot->base_gfn, slot->npages); -} - -static int kvm_iommu_unmap_memslots(struct kvm *kvm) -{ - int idx; - struct kvm_memslots *slots; - struct kvm_memory_slot *memslot; - - idx = srcu_read_lock(&kvm->srcu); - slots = kvm_memslots(kvm); - - kvm_for_each_memslot(memslot, slots) - kvm_iommu_unmap_pages(kvm, memslot); - - srcu_read_unlock(&kvm->srcu, idx); - - if (kvm->arch.iommu_noncoherent) - kvm_arch_unregister_noncoherent_dma(kvm); - - return 0; -} - -int kvm_iommu_unmap_guest(struct kvm *kvm) -{ - struct iommu_domain *domain = kvm->arch.iommu_domain; - - /* check if iommu exists and in use */ - if (!domain) - return 0; - - mutex_lock(&kvm->slots_lock); - kvm_iommu_unmap_memslots(kvm); - kvm->arch.iommu_domain = NULL; - kvm->arch.iommu_noncoherent = false; - mutex_unlock(&kvm->slots_lock); - - iommu_domain_free(domain); - return 0; -} diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c index 60d91c9..ba0db8f 100644..100755 --- a/arch/x86/kvm/irq.c +++ b/arch/x86/kvm/irq.c @@ -2,6 +2,7 @@ * irq.c: API for in kernel interrupt controller * Copyright (c) 2007, Intel Corporation. * Copyright 2009 Red Hat, Inc. and/or its affiliates. 
+ * Copyright 2019 Google LLC * * This program is free software; you can redistribute it and/or modify it * under the terms and conditions of the GNU General Public License, @@ -20,11 +21,9 @@ * */ -#include <linux/export.h> #include <linux/kvm_host.h> #include "irq.h" -#include "i8254.h" #include "x86.h" /* @@ -38,7 +37,6 @@ int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu) return 0; } -EXPORT_SYMBOL(kvm_cpu_has_pending_timer); /* * check if there is a pending userspace external interrupt @@ -57,10 +55,7 @@ static int kvm_cpu_has_extint(struct kvm_vcpu *v) u8 accept = kvm_apic_accept_pic_intr(v); if (accept) { - if (irqchip_split(v->kvm)) - return pending_userspace_extint(v); - else - return pic_irqchip(v->kvm)->output; + return pic_irqchip(v->kvm)->output; } else return 0; } @@ -99,7 +94,6 @@ int kvm_cpu_has_interrupt(struct kvm_vcpu *v) return kvm_apic_has_interrupt(v) != -1; /* LAPIC */ } -EXPORT_SYMBOL_GPL(kvm_cpu_has_interrupt); /* * Read pending interrupt(from non-APIC source) @@ -108,13 +102,7 @@ EXPORT_SYMBOL_GPL(kvm_cpu_has_interrupt); static int kvm_cpu_get_extint(struct kvm_vcpu *v) { if (kvm_cpu_has_extint(v)) { - if (irqchip_split(v->kvm)) { - int vector = v->arch.pending_external_vector; - - v->arch.pending_external_vector = -1; - return vector; - } else - return kvm_pic_read_irq(v->kvm); /* PIC */ + return kvm_pic_read_irq(v->kvm); /* PIC */ } else return -1; } @@ -136,17 +124,9 @@ int kvm_cpu_get_interrupt(struct kvm_vcpu *v) return kvm_get_apic_interrupt(v); /* APIC */ } -EXPORT_SYMBOL_GPL(kvm_cpu_get_interrupt); void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu) { if (lapic_in_kernel(vcpu)) kvm_inject_apic_timer_irqs(vcpu); } -EXPORT_SYMBOL_GPL(kvm_inject_pending_timer_irqs); - -void __kvm_migrate_timers(struct kvm_vcpu *vcpu) -{ - __kvm_migrate_apic_timer(vcpu); - __kvm_migrate_pit_timer(vcpu); -} diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h index 035731e..b51da4d 100644..100755 --- a/arch/x86/kvm/irq.h +++ b/arch/x86/kvm/irq.h @@ -1,6 +1,7 @@ /* * irq.h: in kernel interrupt controller related definitions * Copyright (c) 2007, Intel Corporation. + * Copyright 2019 Google LLC * * This program is free software; you can redistribute it and/or modify it * under the terms and conditions of the GNU General Public License, @@ -22,10 +23,7 @@ #ifndef __IRQ_H #define __IRQ_H -#include <linux/mm_types.h> -#include <linux/hrtimer.h> #include <linux/kvm_host.h> -#include <linux/spinlock.h> #include <kvm/iodev.h> #include "ioapic.h" @@ -33,7 +31,7 @@ #define PIC_NUM_PINS 16 #define SELECT_PIC(irq) \ - ((irq) < 8 ? KVM_IRQCHIP_PIC_MASTER : KVM_IRQCHIP_PIC_SLAVE) + ((irq) < 8 ? GVM_IRQCHIP_PIC_MASTER : GVM_IRQCHIP_PIC_SLAVE) struct kvm; struct kvm_vcpu; @@ -70,7 +68,7 @@ struct kvm_pic { struct kvm_io_device dev_slave; struct kvm_io_device dev_eclr; void (*ack_notifier)(void *opaque, int irq); - unsigned long irq_states[PIC_NUM_PINS]; + size_t irq_states[PIC_NUM_PINS]; }; struct kvm_pic *kvm_create_pic(struct kvm *kvm); @@ -91,18 +89,12 @@ static inline int pic_in_kernel(struct kvm *kvm) return ret; } -static inline int irqchip_split(struct kvm *kvm) -{ - return kvm->arch.irqchip_split; -} - static inline int irqchip_in_kernel(struct kvm *kvm) { struct kvm_pic *vpic = pic_irqchip(kvm); bool ret; ret = (vpic != NULL); - ret |= irqchip_split(kvm); /* Read vpic before kvm->irq_routing. 
*/ smp_rmb(); @@ -114,9 +106,6 @@ void kvm_pic_reset(struct kvm_kpic_state *s); void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu); void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu); void kvm_apic_nmi_wd_deliver(struct kvm_vcpu *vcpu); -void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu); -void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu); -void __kvm_migrate_timers(struct kvm_vcpu *vcpu); int apic_has_pending_timer(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c index 6c01916..1fd7c73 100644..100755 --- a/arch/x86/kvm/irq_comm.c +++ b/arch/x86/kvm/irq_comm.c @@ -1,6 +1,7 @@ /* * irq_comm.c: Common API for in kernel interrupt controller * Copyright (c) 2007, Intel Corporation. + * Copyright 2019 Google LLC * * This program is free software; you can redistribute it and/or modify it * under the terms and conditions of the GNU General Public License, @@ -21,21 +22,15 @@ */ #include <linux/kvm_host.h> -#include <linux/slab.h> -#include <linux/export.h> -#include <trace/events/kvm.h> - #include <asm/msidef.h> - #include "irq.h" #include "ioapic.h" - #include "lapic.h" - -#include "hyperv.h" #include "x86.h" +#include <gvm_types.h> + static int kvm_set_pic_irq(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm, int irq_source_id, int level, bool line_status) @@ -45,7 +40,7 @@ static int kvm_set_pic_irq(struct kvm_kernel_irq_routing_entry *e, /* * XXX: rejecting pic routes when pic isn't in use would be better, * but the default routing table is installed while kvm->arch.vpic is - * NULL and KVM_CREATE_IRQCHIP can race with KVM_IRQ_LINE. + * NULL and GVM_CREATE_IRQCHIP can race with GVM_IRQ_LINE. */ if (!pic) return -1; @@ -71,7 +66,7 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, { int i, r = -1; struct kvm_vcpu *vcpu, *lowest = NULL; - unsigned long dest_vcpu_bitmap[BITS_TO_LONGS(KVM_MAX_VCPUS)]; + size_t dest_vcpu_bitmap[BITS_TO_LONGS(GVM_MAX_VCPUS)]; unsigned int dest_vcpus = 0; if (irq->dest_mode == 0 && irq->dest_id == 0xff && @@ -112,7 +107,7 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, if (dest_vcpus != 0) { int idx = kvm_vector_to_index(irq->vector, dest_vcpus, - dest_vcpu_bitmap, KVM_MAX_VCPUS); + dest_vcpu_bitmap, GVM_MAX_VCPUS); lowest = kvm_get_vcpu(kvm, idx); } @@ -126,10 +121,6 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, void kvm_set_msi_irq(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e, struct kvm_lapic_irq *irq) { - trace_kvm_msi_set_irq(e->msi.address_lo | (kvm->arch.x2apic_format ? 
- (u64)e->msi.address_hi << 32 : 0), - e->msi.data); - irq->dest_id = (e->msi.address_lo & MSI_ADDR_DEST_ID_MASK) >> MSI_ADDR_DEST_ID_SHIFT; if (kvm->arch.x2apic_format) @@ -144,7 +135,6 @@ void kvm_set_msi_irq(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e, irq->level = 1; irq->shorthand = 0; } -EXPORT_SYMBOL_GPL(kvm_set_msi_irq); static inline bool kvm_msi_route_invalid(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e) @@ -169,16 +159,6 @@ int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e, } -static int kvm_hv_set_sint(struct kvm_kernel_irq_routing_entry *e, - struct kvm *kvm, int irq_source_id, int level, - bool line_status) -{ - if (!level) - return -1; - - return kvm_hv_synic_set_irq(kvm, e->hv_sint.vcpu, e->hv_sint.sint); -} - int kvm_arch_set_irq_inatomic(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm, int irq_source_id, int level, bool line_status) @@ -187,11 +167,7 @@ int kvm_arch_set_irq_inatomic(struct kvm_kernel_irq_routing_entry *e, int r; switch (e->type) { - case KVM_IRQ_ROUTING_HV_SINT: - return kvm_hv_set_sint(e, kvm, irq_source_id, level, - line_status); - - case KVM_IRQ_ROUTING_MSI: + case GVM_IRQ_ROUTING_MSI: if (kvm_msi_route_invalid(kvm, e)) return -EINVAL; @@ -210,7 +186,7 @@ int kvm_arch_set_irq_inatomic(struct kvm_kernel_irq_routing_entry *e, int kvm_request_irq_source_id(struct kvm *kvm) { - unsigned long *bitmap = &kvm->arch.irq_sources_bitmap; + size_t *bitmap = &kvm->arch.irq_sources_bitmap; int irq_source_id; mutex_lock(&kvm->irq_lock); @@ -222,8 +198,7 @@ int kvm_request_irq_source_id(struct kvm *kvm) goto unlock; } - ASSERT(irq_source_id != KVM_USERSPACE_IRQ_SOURCE_ID); - ASSERT(irq_source_id != KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID); + ASSERT(irq_source_id != GVM_USERSPACE_IRQ_SOURCE_ID); set_bit(irq_source_id, bitmap); unlock: mutex_unlock(&kvm->irq_lock); @@ -233,8 +208,7 @@ unlock: void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id) { - ASSERT(irq_source_id != KVM_USERSPACE_IRQ_SOURCE_ID); - ASSERT(irq_source_id != KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID); + ASSERT(irq_source_id != GVM_USERSPACE_IRQ_SOURCE_ID); mutex_lock(&kvm->irq_lock); if (irq_source_id < 0 || @@ -257,7 +231,7 @@ void kvm_register_irq_mask_notifier(struct kvm *kvm, int irq, { mutex_lock(&kvm->irq_lock); kimn->irq = irq; - hlist_add_head_rcu(&kimn->link, &kvm->arch.mask_notifier_list); + hlist_add_head(&kimn->link, &kvm->arch.mask_notifier_list); mutex_unlock(&kvm->irq_lock); } @@ -265,24 +239,25 @@ void kvm_unregister_irq_mask_notifier(struct kvm *kvm, int irq, struct kvm_irq_mask_notifier *kimn) { mutex_lock(&kvm->irq_lock); - hlist_del_rcu(&kimn->link); + hlist_del(&kimn->link); mutex_unlock(&kvm->irq_lock); - synchronize_srcu(&kvm->irq_srcu); } void kvm_fire_mask_notifiers(struct kvm *kvm, unsigned irqchip, unsigned pin, bool mask) { struct kvm_irq_mask_notifier *kimn; - int idx, gsi; + int gsi; - idx = srcu_read_lock(&kvm->irq_srcu); + mutex_lock(&kvm->irq_lock); gsi = kvm_irq_map_chip_pin(kvm, irqchip, pin); if (gsi != -1) - hlist_for_each_entry_rcu(kimn, &kvm->arch.mask_notifier_list, link) +#define LIST_ENTRY_TYPE_INFO struct kvm_irq_mask_notifier + hlist_for_each_entry(kimn, &kvm->arch.mask_notifier_list, link) if (kimn->irq == gsi) kimn->func(kimn, mask); - srcu_read_unlock(&kvm->irq_srcu, idx); +#undef LIST_ENTRY_TYPE_INFO + mutex_unlock(&kvm->irq_lock); } int kvm_set_routing_entry(struct kvm *kvm, @@ -294,20 +269,20 @@ int kvm_set_routing_entry(struct kvm *kvm, unsigned max_pin; switch (ue->type) { - case KVM_IRQ_ROUTING_IRQCHIP: + case 
GVM_IRQ_ROUTING_IRQCHIP: delta = 0; switch (ue->u.irqchip.irqchip) { - case KVM_IRQCHIP_PIC_MASTER: + case GVM_IRQCHIP_PIC_MASTER: e->set = kvm_set_pic_irq; max_pin = PIC_NUM_PINS; break; - case KVM_IRQCHIP_PIC_SLAVE: + case GVM_IRQCHIP_PIC_SLAVE: e->set = kvm_set_pic_irq; max_pin = PIC_NUM_PINS; delta = 8; break; - case KVM_IRQCHIP_IOAPIC: - max_pin = KVM_IOAPIC_NUM_PINS; + case GVM_IRQCHIP_IOAPIC: + max_pin = GVM_IOAPIC_NUM_PINS; e->set = kvm_set_ioapic_irq; break; default: @@ -318,7 +293,7 @@ int kvm_set_routing_entry(struct kvm *kvm, if (e->irqchip.pin >= max_pin) goto out; break; - case KVM_IRQ_ROUTING_MSI: + case GVM_IRQ_ROUTING_MSI: e->set = kvm_set_msi; e->msi.address_lo = ue->u.msi.address_lo; e->msi.address_hi = ue->u.msi.address_hi; @@ -327,11 +302,6 @@ int kvm_set_routing_entry(struct kvm *kvm, if (kvm_msi_route_invalid(kvm, e)) goto out; break; - case KVM_IRQ_ROUTING_HV_SINT: - e->set = kvm_hv_set_sint; - e->hv_sint.vcpu = ue->u.hv_sint.vcpu; - e->hv_sint.sint = ue->u.hv_sint.sint; - break; default: goto out; } @@ -366,15 +336,14 @@ bool kvm_intr_is_single_vcpu(struct kvm *kvm, struct kvm_lapic_irq *irq, return r == 1; } -EXPORT_SYMBOL_GPL(kvm_intr_is_single_vcpu); #define IOAPIC_ROUTING_ENTRY(irq) \ - { .gsi = irq, .type = KVM_IRQ_ROUTING_IRQCHIP, \ - .u.irqchip = { .irqchip = KVM_IRQCHIP_IOAPIC, .pin = (irq) } } + { .gsi = irq, .type = GVM_IRQ_ROUTING_IRQCHIP, \ + .u.irqchip = { .irqchip = GVM_IRQCHIP_IOAPIC, .pin = (irq) } } #define ROUTING_ENTRY1(irq) IOAPIC_ROUTING_ENTRY(irq) #define PIC_ROUTING_ENTRY(irq) \ - { .gsi = irq, .type = KVM_IRQ_ROUTING_IRQCHIP, \ + { .gsi = irq, .type = GVM_IRQ_ROUTING_IRQCHIP, \ .u.irqchip = { .irqchip = SELECT_PIC(irq), .pin = (irq) % 8 } } #define ROUTING_ENTRY2(irq) \ IOAPIC_ROUTING_ENTRY(irq), PIC_ROUTING_ENTRY(irq) @@ -400,13 +369,6 @@ int kvm_setup_default_irq_routing(struct kvm *kvm) ARRAY_SIZE(default_routing), 0); } -static const struct kvm_irq_routing_entry empty_routing[] = {}; - -int kvm_setup_empty_irq_routing(struct kvm *kvm) -{ - return kvm_set_irq_routing(kvm, empty_routing, 0, 0); -} - void kvm_arch_post_irq_routing_update(struct kvm *kvm) { if (ioapic_in_kernel(kvm) || !irqchip_in_kernel(kvm)) @@ -414,37 +376,3 @@ void kvm_arch_post_irq_routing_update(struct kvm *kvm) kvm_make_scan_ioapic_request(kvm); } -void kvm_scan_ioapic_routes(struct kvm_vcpu *vcpu, - ulong *ioapic_handled_vectors) -{ - struct kvm *kvm = vcpu->kvm; - struct kvm_kernel_irq_routing_entry *entry; - struct kvm_irq_routing_table *table; - u32 i, nr_ioapic_pins; - int idx; - - idx = srcu_read_lock(&kvm->irq_srcu); - table = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu); - nr_ioapic_pins = min_t(u32, table->nr_rt_entries, - kvm->arch.nr_reserved_ioapic_pins); - for (i = 0; i < nr_ioapic_pins; ++i) { - hlist_for_each_entry(entry, &table->map[i], link) { - struct kvm_lapic_irq irq; - - if (entry->type != KVM_IRQ_ROUTING_MSI) - continue; - - kvm_set_msi_irq(vcpu->kvm, entry, &irq); - - if (irq.level && kvm_apic_match_dest(vcpu, NULL, 0, - irq.dest_id, irq.dest_mode)) - __set_bit(irq.vector, ioapic_handled_vectors); - } - } - srcu_read_unlock(&kvm->irq_srcu, idx); -} - -void kvm_arch_irq_routing_update(struct kvm *kvm) -{ - kvm_hv_irq_routing_update(kvm); -} diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h index 762cdf2..2ca26a9 100644..100755 --- a/arch/x86/kvm/kvm_cache_regs.h +++ b/arch/x86/kvm/kvm_cache_regs.h @@ -1,15 +1,21 @@ +/* + * Copyright 2019 Google LLC + */ + #ifndef ASM_KVM_CACHE_REGS_H #define ASM_KVM_CACHE_REGS_H 
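
Note on the unsigned long -> size_t conversions in the hunks above and below: the port targets the Windows LLP64 data model, where unsigned long stays 32 bits while size_t matches the 64-bit pointer width, so bitmap words have to be declared as size_t for the bit helpers to cover a full machine word. The following is a minimal, self-contained sketch of that assumption only; BITS_TO_WORDS and set_bit_word are illustrative stand-ins, not helpers from this patch.

/* Illustrative sketch, not part of the patch: why bitmap words are
 * declared as size_t rather than unsigned long when building for LLP64. */
#include <stdio.h>
#include <stddef.h>

#define BITS_PER_WORD (8 * sizeof(size_t))
#define BITS_TO_WORDS(nr) (((nr) + BITS_PER_WORD - 1) / BITS_PER_WORD)

/* Hypothetical helper mirroring set_bit(): walks pointer-sized words. */
static void set_bit_word(unsigned int nr, size_t *addr)
{
	addr[nr / BITS_PER_WORD] |= (size_t)1 << (nr % BITS_PER_WORD);
}

int main(void)
{
	size_t map[BITS_TO_WORDS(288)] = { 0 };	/* e.g. one bit per vCPU id */

	set_bit_word(200, map);
	printf("sizeof(unsigned long)=%zu sizeof(size_t)=%zu word[3]=%#zx\n",
	       sizeof(unsigned long), sizeof(size_t), map[200 / BITS_PER_WORD]);
	/* On 64-bit MSVC this prints 4 and 8: an unsigned long word is only
	 * half the width that pointer-based bit helpers expect to walk. */
	return 0;
}
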
+#include <uapi/asm/processor-flags.h> + #define KVM_POSSIBLE_CR0_GUEST_BITS X86_CR0_TS #define KVM_POSSIBLE_CR4_GUEST_BITS \ (X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR \ | X86_CR4_OSXMMEXCPT | X86_CR4_PGE) -static inline unsigned long kvm_register_read(struct kvm_vcpu *vcpu, +static inline size_t kvm_register_read(struct kvm_vcpu *vcpu, enum kvm_reg reg) { - if (!test_bit(reg, (unsigned long *)&vcpu->arch.regs_avail)) + if (!test_bit(reg, (size_t *)&vcpu->arch.regs_avail)) kvm_x86_ops->cache_reg(vcpu, reg); return vcpu->arch.regs[reg]; @@ -17,19 +23,19 @@ static inline unsigned long kvm_register_read(struct kvm_vcpu *vcpu, static inline void kvm_register_write(struct kvm_vcpu *vcpu, enum kvm_reg reg, - unsigned long val) + size_t val) { vcpu->arch.regs[reg] = val; - __set_bit(reg, (unsigned long *)&vcpu->arch.regs_dirty); - __set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail); + __set_bit(reg, (size_t *)&vcpu->arch.regs_dirty); + __set_bit(reg, (size_t *)&vcpu->arch.regs_avail); } -static inline unsigned long kvm_rip_read(struct kvm_vcpu *vcpu) +static inline size_t kvm_rip_read(struct kvm_vcpu *vcpu) { return kvm_register_read(vcpu, VCPU_REGS_RIP); } -static inline void kvm_rip_write(struct kvm_vcpu *vcpu, unsigned long val) +static inline void kvm_rip_write(struct kvm_vcpu *vcpu, size_t val) { kvm_register_write(vcpu, VCPU_REGS_RIP, val); } @@ -39,54 +45,49 @@ static inline u64 kvm_pdptr_read(struct kvm_vcpu *vcpu, int index) might_sleep(); /* on svm */ if (!test_bit(VCPU_EXREG_PDPTR, - (unsigned long *)&vcpu->arch.regs_avail)) + (size_t *)&vcpu->arch.regs_avail)) kvm_x86_ops->cache_reg(vcpu, VCPU_EXREG_PDPTR); return vcpu->arch.walk_mmu->pdptrs[index]; } -static inline ulong kvm_read_cr0_bits(struct kvm_vcpu *vcpu, ulong mask) +static inline size_t kvm_read_cr0_bits(struct kvm_vcpu *vcpu, size_t mask) { - ulong tmask = mask & KVM_POSSIBLE_CR0_GUEST_BITS; + size_t tmask = mask & KVM_POSSIBLE_CR0_GUEST_BITS; if (tmask & vcpu->arch.cr0_guest_owned_bits) kvm_x86_ops->decache_cr0_guest_bits(vcpu); return vcpu->arch.cr0 & mask; } -static inline ulong kvm_read_cr0(struct kvm_vcpu *vcpu) +static inline size_t kvm_read_cr0(struct kvm_vcpu *vcpu) { - return kvm_read_cr0_bits(vcpu, ~0UL); + return kvm_read_cr0_bits(vcpu, ~(size_t)0); } -static inline ulong kvm_read_cr4_bits(struct kvm_vcpu *vcpu, ulong mask) +static inline size_t kvm_read_cr4_bits(struct kvm_vcpu *vcpu, size_t mask) { - ulong tmask = mask & KVM_POSSIBLE_CR4_GUEST_BITS; + size_t tmask = mask & KVM_POSSIBLE_CR4_GUEST_BITS; if (tmask & vcpu->arch.cr4_guest_owned_bits) kvm_x86_ops->decache_cr4_guest_bits(vcpu); return vcpu->arch.cr4 & mask; } -static inline ulong kvm_read_cr3(struct kvm_vcpu *vcpu) +static inline size_t kvm_read_cr3(struct kvm_vcpu *vcpu) { - if (!test_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail)) + if (!test_bit(VCPU_EXREG_CR3, (size_t *)&vcpu->arch.regs_avail)) kvm_x86_ops->decache_cr3(vcpu); return vcpu->arch.cr3; } -static inline ulong kvm_read_cr4(struct kvm_vcpu *vcpu) +static inline size_t kvm_read_cr4(struct kvm_vcpu *vcpu) { - return kvm_read_cr4_bits(vcpu, ~0UL); + return kvm_read_cr4_bits(vcpu, ~(size_t)0); } static inline u64 kvm_read_edx_eax(struct kvm_vcpu *vcpu) { - return (kvm_register_read(vcpu, VCPU_REGS_RAX) & -1u) - | ((u64)(kvm_register_read(vcpu, VCPU_REGS_RDX) & -1u) << 32); -} - -static inline u32 kvm_read_pkru(struct kvm_vcpu *vcpu) -{ - return kvm_x86_ops->get_pkru(vcpu); + return (kvm_register_read(vcpu, VCPU_REGS_RAX) & (unsigned)-1) + | 
((u64)(kvm_register_read(vcpu, VCPU_REGS_RDX) & (unsigned)-1) << 32); } static inline void enter_guest_mode(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 6f69340..7a156d4 100644..100755 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -6,6 +6,7 @@ * Copyright (C) 2007 Novell * Copyright (C) 2007 Intel * Copyright 2009 Red Hat, Inc. and/or its affiliates. + * Copyright 2019 Google LLC * * Authors: * Dor Laor <dor.laor@qumranet.com> @@ -19,29 +20,15 @@ */ #include <linux/kvm_host.h> -#include <linux/kvm.h> -#include <linux/mm.h> -#include <linux/highmem.h> -#include <linux/smp.h> -#include <linux/hrtimer.h> -#include <linux/io.h> -#include <linux/export.h> -#include <linux/math64.h> -#include <linux/slab.h> -#include <asm/processor.h> -#include <asm/msr.h> -#include <asm/page.h> -#include <asm/current.h> +#include <uapi/linux/kvm.h> #include <asm/apicdef.h> -#include <asm/delay.h> -#include <linux/atomic.h> -#include <linux/jump_label.h> #include "kvm_cache_regs.h" #include "irq.h" -#include "trace.h" #include "x86.h" #include "cpuid.h" -#include "hyperv.h" + +#include <gvm_types.h> + #ifndef CONFIG_X86_64 #define mod_64(x, y) ((x) - (y) * div64_u64(x, y)) @@ -57,10 +44,10 @@ #define APIC_BUS_CYCLE_NS 1 /* #define apic_debug(fmt,arg...) printk(KERN_WARNING fmt,##arg) */ -#define apic_debug(fmt, arg...) +#define apic_debug(fmt, arg,...) /* 14 is the version for Xeon and Pentium 8.4.8*/ -#define APIC_VERSION (0x14UL | ((KVM_APIC_LVT_NUM - 1) << 16)) +#define APIC_VERSION (0x14UL | ((GVM_APIC_LVT_NUM - 1) << 16)) #define LAPIC_MMIO_LENGTH (1 << 12) /* followed define is not in apicdef.h */ #define APIC_SHORT_MASK 0xc0000 @@ -72,9 +59,33 @@ #define APIC_BROADCAST 0xFF #define X2APIC_BROADCAST 0xFFFFFFFFul + +/** + * hweightN - returns the hamming weight of a N-bit word + * @x: the word to weigh + * + * The Hamming Weight of a number is the total number of bits set in it. 
+ */ + +static unsigned int hweight32(unsigned int w) +{ + w -= (w >> 1) & 0x55555555; + w = (w & 0x33333333) + ((w >> 2) & 0x33333333); + w = (w + (w >> 4)) & 0x0f0f0f0f; + return (w * 0x01010101) >> 24; +} + +static unsigned int hweight16(unsigned int w) +{ + unsigned int res = w - ((w >> 1) & 0x5555); + res = (res & 0x3333) + ((res >> 2) & 0x3333); + res = (res + (res >> 4)) & 0x0F0F; + return (res + (res >> 8)) & 0x00FF; +} + static inline int apic_test_vector(int vec, void *bitmap) { - return test_bit(VEC_POS(vec), (bitmap) + REG_POS(vec)); + return test_bit(VEC_POS(vec), (size_t *)((char *)(bitmap)+REG_POS(vec))); } bool kvm_apic_pending_eoi(struct kvm_vcpu *vcpu, int vector) @@ -87,22 +98,19 @@ bool kvm_apic_pending_eoi(struct kvm_vcpu *vcpu, int vector) static inline void apic_clear_vector(int vec, void *bitmap) { - clear_bit(VEC_POS(vec), (bitmap) + REG_POS(vec)); + clear_bit(VEC_POS(vec), (size_t *)((u8 *)(bitmap) + REG_POS(vec))); } static inline int __apic_test_and_set_vector(int vec, void *bitmap) { - return __test_and_set_bit(VEC_POS(vec), (bitmap) + REG_POS(vec)); + return __test_and_set_bit(VEC_POS(vec), (size_t *)((u8 *)(bitmap) + REG_POS(vec))); } static inline int __apic_test_and_clear_vector(int vec, void *bitmap) { - return __test_and_clear_bit(VEC_POS(vec), (bitmap) + REG_POS(vec)); + return __test_and_clear_bit(VEC_POS(vec), (size_t *)((u8 *)(bitmap) + REG_POS(vec))); } -struct static_key_deferred apic_hw_disabled __read_mostly; -struct static_key_deferred apic_sw_disabled __read_mostly; - static inline int apic_enabled(struct kvm_lapic *apic) { return kvm_apic_sw_enabled(apic) && kvm_apic_hw_enabled(apic); @@ -118,7 +126,7 @@ static inline int apic_enabled(struct kvm_lapic *apic) static inline bool kvm_apic_map_get_logical_dest(struct kvm_apic_map *map, u32 dest_id, struct kvm_lapic ***cluster, u16 *mask) { switch (map->mode) { - case KVM_APIC_MODE_X2APIC: { + case GVM_APIC_MODE_X2APIC: { u32 offset = (dest_id >> 16) * 16; u32 max_apic_id = map->max_apic_id; @@ -133,11 +141,11 @@ static inline bool kvm_apic_map_get_logical_dest(struct kvm_apic_map *map, return true; } - case KVM_APIC_MODE_XAPIC_FLAT: + case GVM_APIC_MODE_XAPIC_FLAT: *cluster = map->xapic_flat_map; *mask = dest_id & 0xff; return true; - case KVM_APIC_MODE_XAPIC_CLUSTER: + case GVM_APIC_MODE_XAPIC_CLUSTER: *cluster = map->xapic_cluster_map[(dest_id >> 4) & 0xf]; *mask = dest_id & 0xf; return true; @@ -147,13 +155,6 @@ static inline bool kvm_apic_map_get_logical_dest(struct kvm_apic_map *map, } } -static void kvm_apic_map_free(struct rcu_head *rcu) -{ - struct kvm_apic_map *map = container_of(rcu, struct kvm_apic_map, rcu); - - kvfree(map); -} - static void recalculate_apic_map(struct kvm *kvm) { struct kvm_apic_map *new, *old = NULL; @@ -191,13 +192,13 @@ static void recalculate_apic_map(struct kvm *kvm) new->phys_map[aid] = apic; if (apic_x2apic_mode(apic)) { - new->mode |= KVM_APIC_MODE_X2APIC; + new->mode |= GVM_APIC_MODE_X2APIC; } else if (ldr) { ldr = GET_APIC_LOGICAL_ID(ldr); if (kvm_lapic_get_reg(apic, APIC_DFR) == APIC_DFR_FLAT) - new->mode |= KVM_APIC_MODE_XAPIC_FLAT; + new->mode |= GVM_APIC_MODE_XAPIC_FLAT; else - new->mode |= KVM_APIC_MODE_XAPIC_CLUSTER; + new->mode |= GVM_APIC_MODE_XAPIC_CLUSTER; } if (!kvm_apic_map_get_logical_dest(new, ldr, &cluster, &mask)) @@ -207,13 +208,12 @@ static void recalculate_apic_map(struct kvm *kvm) cluster[ffs(mask) - 1] = apic; } out: - old = rcu_dereference_protected(kvm->arch.apic_map, - lockdep_is_held(&kvm->arch.apic_map_lock)); - 
rcu_assign_pointer(kvm->arch.apic_map, new); + old = kvm->arch.apic_map; + kvm->arch.apic_map = new; mutex_unlock(&kvm->arch.apic_map_lock); if (old) - call_rcu(&old->rcu, kvm_apic_map_free); + kvfree(old); kvm_make_scan_ioapic_request(kvm); } @@ -227,10 +227,8 @@ static inline void apic_set_spiv(struct kvm_lapic *apic, u32 val) if (enabled != apic->sw_enabled) { apic->sw_enabled = enabled; if (enabled) { - static_key_slow_dec_deferred(&apic_sw_disabled); recalculate_apic_map(apic->vcpu->kvm); - } else - static_key_slow_inc(&apic_sw_disabled.key); + } //else } } @@ -275,11 +273,6 @@ static inline int apic_lvtt_period(struct kvm_lapic *apic) return apic->lapic_timer.timer_mode == APIC_LVT_TIMER_PERIODIC; } -static inline int apic_lvtt_tscdeadline(struct kvm_lapic *apic) -{ - return apic->lapic_timer.timer_mode == APIC_LVT_TIMER_TSCDEADLINE; -} - static inline int apic_lvt_nmi_mode(u32 lvt_val) { return (lvt_val & (APIC_MODE_MASK | APIC_LVT_MASKED)) == APIC_DM_NMI; @@ -288,7 +281,7 @@ static inline int apic_lvt_nmi_mode(u32 lvt_val) void kvm_apic_set_version(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; - struct kvm_cpuid_entry2 *feat; + struct kvm_cpuid_entry *feat; u32 v = APIC_VERSION; if (!lapic_in_kernel(vcpu)) @@ -300,7 +293,7 @@ void kvm_apic_set_version(struct kvm_vcpu *vcpu) kvm_lapic_set_reg(apic, APIC_LVR, v); } -static const unsigned int apic_lvt_mask[KVM_APIC_LVT_NUM] = { +static const unsigned int apic_lvt_mask[GVM_APIC_LVT_NUM] = { LVT_MASK , /* part LVTT mask, timer mode mask added at runtime */ LVT_MASK | APIC_MODE_MASK, /* LVTTHMR */ LVT_MASK | APIC_MODE_MASK, /* LVTPC */ @@ -315,7 +308,7 @@ static int find_highest_vector(void *bitmap) for (vec = MAX_APIC_VECTOR - APIC_VECTORS_PER_REG; vec >= 0; vec -= APIC_VECTORS_PER_REG) { - reg = bitmap + REG_POS(vec); + reg = (u32 *)((u8 *)bitmap + REG_POS(vec)); if (*reg) return fls(*reg) - 1 + vec; } @@ -330,7 +323,7 @@ static u8 count_vectors(void *bitmap) u8 count = 0; for (vec = 0; vec < MAX_APIC_VECTOR; vec += APIC_VECTORS_PER_REG) { - reg = bitmap + REG_POS(vec); + reg = (u32 *)((u8 *)bitmap + REG_POS(vec)); count += hweight32(*reg); } @@ -344,10 +337,9 @@ void __kvm_apic_update_irr(u32 *pir, void *regs) for (i = 0; i <= 7; i++) { pir_val = xchg(&pir[i], 0); if (pir_val) - *((u32 *)(regs + APIC_IRR + i * 0x10)) |= pir_val; + *((u32 *)((u8 *)regs + APIC_IRR + i * 0x10)) |= pir_val; } } -EXPORT_SYMBOL_GPL(__kvm_apic_update_irr); void kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir) { @@ -355,9 +347,8 @@ void kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir) __kvm_apic_update_irr(pir, apic->regs); - kvm_make_request(KVM_REQ_EVENT, vcpu); + kvm_make_request(GVM_REQ_EVENT, vcpu); } -EXPORT_SYMBOL_GPL(kvm_apic_update_irr); static inline int apic_search_irr(struct kvm_lapic *apic) { @@ -375,8 +366,6 @@ static inline int apic_find_highest_irr(struct kvm_lapic *apic) if (!apic->irr_pending) return -1; - if (apic->vcpu->arch.apicv_active) - kvm_x86_ops->sync_pir_to_irr(apic->vcpu); result = apic_search_irr(apic); ASSERT(result == -1 || result >= 16); @@ -392,7 +381,7 @@ static inline void apic_clear_irr(int vec, struct kvm_lapic *apic) if (unlikely(vcpu->arch.apicv_active)) { /* try to update RVI */ apic_clear_vector(vec, apic->regs + APIC_IRR); - kvm_make_request(KVM_REQ_EVENT, vcpu); + kvm_make_request(GVM_REQ_EVENT, vcpu); } else { apic->irr_pending = false; apic_clear_vector(vec, apic->regs + APIC_IRR); @@ -496,54 +485,6 @@ int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq, irq->level, 
irq->trig_mode, dest_map); } -static int pv_eoi_put_user(struct kvm_vcpu *vcpu, u8 val) -{ - - return kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.pv_eoi.data, &val, - sizeof(val)); -} - -static int pv_eoi_get_user(struct kvm_vcpu *vcpu, u8 *val) -{ - - return kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.pv_eoi.data, val, - sizeof(*val)); -} - -static inline bool pv_eoi_enabled(struct kvm_vcpu *vcpu) -{ - return vcpu->arch.pv_eoi.msr_val & KVM_MSR_ENABLED; -} - -static bool pv_eoi_get_pending(struct kvm_vcpu *vcpu) -{ - u8 val; - if (pv_eoi_get_user(vcpu, &val) < 0) - apic_debug("Can't read EOI MSR value: 0x%llx\n", - (unsigned long long)vcpu->arch.pv_eoi.msr_val); - return val & 0x1; -} - -static void pv_eoi_set_pending(struct kvm_vcpu *vcpu) -{ - if (pv_eoi_put_user(vcpu, KVM_PV_EOI_ENABLED) < 0) { - apic_debug("Can't set EOI MSR value: 0x%llx\n", - (unsigned long long)vcpu->arch.pv_eoi.msr_val); - return; - } - __set_bit(KVM_APIC_PV_EOI_PENDING, &vcpu->arch.apic_attention); -} - -static void pv_eoi_clr_pending(struct kvm_vcpu *vcpu) -{ - if (pv_eoi_put_user(vcpu, KVM_PV_EOI_DISABLED) < 0) { - apic_debug("Can't clear EOI MSR value: 0x%llx\n", - (unsigned long long)vcpu->arch.pv_eoi.msr_val); - return; - } - __clear_bit(KVM_APIC_PV_EOI_PENDING, &vcpu->arch.apic_attention); -} - static void apic_update_ppr(struct kvm_lapic *apic) { u32 tpr, isrv, ppr, old_ppr; @@ -565,7 +506,7 @@ static void apic_update_ppr(struct kvm_lapic *apic) if (old_ppr != ppr) { kvm_lapic_set_reg(apic, APIC_PROCPRI, ppr); if (ppr < old_ppr) - kvm_make_request(KVM_REQ_EVENT, apic->vcpu); + kvm_make_request(GVM_REQ_EVENT, apic->vcpu); } } @@ -623,7 +564,7 @@ static bool kvm_apic_match_logical_addr(struct kvm_lapic *apic, u32 mda) } } -/* The KVM local APIC implementation has two quirks: +/* The kvm local APIC implementation has two quirks: * * - the xAPIC MDA stores the destination at bits 24-31, while this * is not true of struct kvm_lapic_irq's dest_id field. This is @@ -635,7 +576,7 @@ static bool kvm_apic_match_logical_addr(struct kvm_lapic *apic, u32 mda) * rewrites the destination of non-IPI messages from APIC_BROADCAST * to X2APIC_BROADCAST. * - * The broadcast quirk can be disabled with KVM_CAP_X2APIC_API. This is + * The broadcast quirk can be disabled with GVM_CAP_X2APIC_API. This is * important when userspace wants to use x2APIC-format MSIs, because * APIC_BROADCAST (0xff) is a legal route for "cluster 0, CPUs 0-7". 
*/ @@ -681,10 +622,9 @@ bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, return false; } } -EXPORT_SYMBOL_GPL(kvm_apic_match_dest); int kvm_vector_to_index(u32 vector, u32 dest_vcpus, - const unsigned long *bitmap, u32 bitmap_size) + const size_t *bitmap, u32 bitmap_size) { u32 mod; int i, idx = -1; @@ -713,7 +653,7 @@ static bool kvm_apic_is_broadcast_dest(struct kvm *kvm, struct kvm_lapic **src, { if (kvm->arch.x2apic_broadcast_quirk_disabled) { if ((irq->dest_id == APIC_BROADCAST && - map->mode != KVM_APIC_MODE_X2APIC)) + map->mode != GVM_APIC_MODE_X2APIC)) return true; if (irq->dest_id == X2APIC_BROADCAST) return true; @@ -737,7 +677,7 @@ static bool kvm_apic_is_broadcast_dest(struct kvm *kvm, struct kvm_lapic **src, static inline bool kvm_apic_map_get_dest_lapic(struct kvm *kvm, struct kvm_lapic **src, struct kvm_lapic_irq *irq, struct kvm_apic_map *map, struct kvm_lapic ***dst, - unsigned long *bitmap) + size_t *bitmap) { int i, lowest; @@ -803,7 +743,7 @@ bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src, struct kvm_lapic_irq *irq, int *r, struct dest_map *dest_map) { struct kvm_apic_map *map; - unsigned long bitmap; + size_t bitmap; struct kvm_lapic **dst = NULL; int i; bool ret; @@ -850,7 +790,7 @@ bool kvm_intr_is_single_vcpu_fast(struct kvm *kvm, struct kvm_lapic_irq *irq, struct kvm_vcpu **dest_vcpu) { struct kvm_apic_map *map; - unsigned long bitmap; + size_t bitmap; struct kvm_lapic **dst = NULL; bool ret = false; @@ -862,7 +802,7 @@ bool kvm_intr_is_single_vcpu_fast(struct kvm *kvm, struct kvm_lapic_irq *irq, if (kvm_apic_map_get_dest_lapic(kvm, NULL, irq, map, &dst, &bitmap) && hweight16(bitmap) == 1) { - unsigned long i = find_first_bit(&bitmap, 16); + size_t i = find_first_bit(&bitmap, 16); if (dst[i]) { *dest_vcpu = dst[i]->vcpu; @@ -885,8 +825,6 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, int result = 0; struct kvm_vcpu *vcpu = apic->vcpu; - trace_kvm_apic_accept_irq(vcpu->vcpu_id, delivery_mode, - trig_mode, vector); switch (delivery_mode) { case APIC_DM_LOWEST: vcpu->arch.apic_arb_prio++; @@ -912,26 +850,20 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, apic_clear_vector(vector, apic->regs + APIC_TMR); } - if (vcpu->arch.apicv_active) + if (vcpu->arch.apicv_active && + kvm_x86_ops->deliver_posted_interrupt) kvm_x86_ops->deliver_posted_interrupt(vcpu, vector); else { kvm_lapic_set_irr(vector, apic); - kvm_make_request(KVM_REQ_EVENT, vcpu); + kvm_make_request(GVM_REQ_EVENT, vcpu); kvm_vcpu_kick(vcpu); } break; - case APIC_DM_REMRD: - result = 1; - vcpu->arch.pv.pv_unhalted = 1; - kvm_make_request(KVM_REQ_EVENT, vcpu); - kvm_vcpu_kick(vcpu); - break; - case APIC_DM_SMI: result = 1; - kvm_make_request(KVM_REQ_SMI, vcpu); + kvm_make_request(GVM_REQ_SMI, vcpu); kvm_vcpu_kick(vcpu); break; @@ -944,12 +876,12 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, case APIC_DM_INIT: if (!trig_mode || level) { result = 1; - /* assumes that there are only KVM_APIC_INIT/SIPI */ - apic->pending_events = (1UL << KVM_APIC_INIT); + /* assumes that there are only GVM_APIC_INIT/SIPI */ + apic->pending_events = (1ULL << GVM_APIC_INIT); /* make sure pending_events is visible before sending * the request */ smp_wmb(); - kvm_make_request(KVM_REQ_EVENT, vcpu); + kvm_make_request(GVM_REQ_EVENT, vcpu); kvm_vcpu_kick(vcpu); } else { apic_debug("Ignoring de-assert INIT to vcpu %d\n", @@ -964,8 +896,8 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int 
delivery_mode, apic->sipi_vector = vector; /* make sure sipi_vector is visible for the receiver */ smp_wmb(); - set_bit(KVM_APIC_SIPI, &apic->pending_events); - kvm_make_request(KVM_REQ_EVENT, vcpu); + set_bit(GVM_APIC_SIPI, &apic->pending_events); + kvm_make_request(GVM_REQ_EVENT, vcpu); kvm_vcpu_kick(vcpu); break; @@ -1003,13 +935,6 @@ static void kvm_ioapic_send_eoi(struct kvm_lapic *apic, int vector) if (!kvm_ioapic_handles_vector(apic, vector)) return; - /* Request a KVM exit to inform the userspace IOAPIC. */ - if (irqchip_split(apic->vcpu->kvm)) { - apic->vcpu->arch.pending_ioapic_eoi = vector; - kvm_make_request(KVM_REQ_IOAPIC_EOI_EXIT, apic->vcpu); - return; - } - if (apic_test_vector(vector, apic->regs + APIC_TMR)) trigger_mode = IOAPIC_LEVEL_TRIG; else @@ -1022,8 +947,6 @@ static int apic_set_eoi(struct kvm_lapic *apic) { int vector = apic_find_highest_isr(apic); - trace_kvm_eoi(apic, vector); - /* * Not every write EOI will has corresponding ISR, * one example is when Kernel check timer on setup_IO_APIC @@ -1034,11 +957,8 @@ static int apic_set_eoi(struct kvm_lapic *apic) apic_clear_isr(vector, apic); apic_update_ppr(apic); - if (test_bit(vector, vcpu_to_synic(apic->vcpu)->vec_bitmap)) - kvm_hv_synic_send_eoi(apic->vcpu, vector); - kvm_ioapic_send_eoi(apic, vector); - kvm_make_request(KVM_REQ_EVENT, apic->vcpu); + kvm_make_request(GVM_REQ_EVENT, apic->vcpu); return vector; } @@ -1050,12 +970,9 @@ void kvm_apic_set_eoi_accelerated(struct kvm_vcpu *vcpu, int vector) { struct kvm_lapic *apic = vcpu->arch.apic; - trace_kvm_eoi(apic, vector); - kvm_ioapic_send_eoi(apic, vector); - kvm_make_request(KVM_REQ_EVENT, apic->vcpu); + kvm_make_request(GVM_REQ_EVENT, apic->vcpu); } -EXPORT_SYMBOL_GPL(kvm_apic_set_eoi_accelerated); static void apic_send_ipi(struct kvm_lapic *apic) { @@ -1075,8 +992,6 @@ static void apic_send_ipi(struct kvm_lapic *apic) else irq.dest_id = GET_APIC_DEST_FIELD(icr_high); - trace_kvm_apic_ipi(icr_low, irq.dest_id); - apic_debug("icr_high 0x%x, icr_low 0x%x, " "short_hand 0x%x, dest 0x%x, trig_mode 0x%x, level 0x%x, " "dest_mode 0x%x, delivery_mode 0x%x, vector 0x%x, " @@ -1117,7 +1032,7 @@ static void __report_tpr_access(struct kvm_lapic *apic, bool write) struct kvm_vcpu *vcpu = apic->vcpu; struct kvm_run *run = vcpu->run; - kvm_make_request(KVM_REQ_REPORT_TPR_ACCESS, vcpu); + kvm_make_request(GVM_REQ_REPORT_TPR_ACCESS, vcpu); run->tpr_access.rip = kvm_rip_read(vcpu); run->tpr_access.is_write = write; } @@ -1137,13 +1052,9 @@ static u32 __apic_read(struct kvm_lapic *apic, unsigned int offset) switch (offset) { case APIC_ARBPRI: - apic_debug("Access APIC ARBPRI register which is for P6\n"); + //apic_debug("Access APIC ARBPRI register which is for P6\n"); break; - case APIC_TMCCT: /* Timer CCR */ - if (apic_lvtt_tscdeadline(apic)) - return 0; - val = apic_get_tmcct(apic); break; case APIC_PROCPRI: @@ -1175,21 +1086,19 @@ int kvm_lapic_reg_read(struct kvm_lapic *apic, u32 offset, int len, static const u64 rmask = 0x43ff01ffffffe70cULL; if ((alignment + len) > 4) { - apic_debug("KVM_APIC_READ: alignment error %x %d\n", + apic_debug("GVM_APIC_READ: alignment error %x %d\n", offset, len); return 1; } if (offset > 0x3f0 || !(rmask & (1ULL << (offset >> 4)))) { - apic_debug("KVM_APIC_READ: read reserved register %x\n", + apic_debug("GVM_APIC_READ: read reserved register %x\n", offset); return 1; } result = __apic_read(apic, offset & ~0xf); - trace_kvm_apic_read(offset, result); - switch (len) { case 1: case 2: @@ -1203,7 +1112,6 @@ int kvm_lapic_reg_read(struct 
kvm_lapic *apic, u32 offset, int len, } return 0; } -EXPORT_SYMBOL_GPL(kvm_lapic_reg_read); static int apic_mmio_in_range(struct kvm_lapic *apic, gpa_t addr) { @@ -1253,8 +1161,7 @@ static void apic_update_lvtt(struct kvm_lapic *apic) static void apic_timer_expired(struct kvm_lapic *apic) { struct kvm_vcpu *vcpu = apic->vcpu; - struct swait_queue_head *q = &vcpu->wq; - struct kvm_timer *ktimer = &apic->lapic_timer; + //struct swait_queue_head *q = &vcpu->wq; if (atomic_read(&apic->lapic_timer.pending)) return; @@ -1262,11 +1169,12 @@ static void apic_timer_expired(struct kvm_lapic *apic) atomic_inc(&apic->lapic_timer.pending); kvm_set_pending_timer(vcpu); + kvm_vcpu_kick(vcpu); + +#if 0 if (swait_active(q)) swake_up(q); - - if (apic_lvtt_tscdeadline(apic)) - ktimer->expired_tscdeadline = ktimer->tscdeadline; +#endif } /* @@ -1292,136 +1200,6 @@ static bool lapic_timer_int_injected(struct kvm_vcpu *vcpu) return false; } -void wait_lapic_expire(struct kvm_vcpu *vcpu) -{ - struct kvm_lapic *apic = vcpu->arch.apic; - u64 guest_tsc, tsc_deadline; - - if (!lapic_in_kernel(vcpu)) - return; - - if (apic->lapic_timer.expired_tscdeadline == 0) - return; - - if (!lapic_timer_int_injected(vcpu)) - return; - - tsc_deadline = apic->lapic_timer.expired_tscdeadline; - apic->lapic_timer.expired_tscdeadline = 0; - guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc()); - trace_kvm_wait_lapic_expire(vcpu->vcpu_id, guest_tsc - tsc_deadline); - - /* __delay is delay_tsc whenever the hardware has TSC, thus always. */ - if (guest_tsc < tsc_deadline) - __delay(min(tsc_deadline - guest_tsc, - nsec_to_cycles(vcpu, lapic_timer_advance_ns))); -} - -static void start_sw_tscdeadline(struct kvm_lapic *apic) -{ - u64 guest_tsc, tscdeadline = apic->lapic_timer.tscdeadline; - u64 ns = 0; - ktime_t expire; - struct kvm_vcpu *vcpu = apic->vcpu; - unsigned long this_tsc_khz = vcpu->arch.virtual_tsc_khz; - unsigned long flags; - ktime_t now; - - if (unlikely(!tscdeadline || !this_tsc_khz)) - return; - - local_irq_save(flags); - - now = apic->lapic_timer.timer.base->get_time(); - guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc()); - if (likely(tscdeadline > guest_tsc)) { - ns = (tscdeadline - guest_tsc) * 1000000ULL; - do_div(ns, this_tsc_khz); - expire = ktime_add_ns(now, ns); - expire = ktime_sub_ns(expire, lapic_timer_advance_ns); - hrtimer_start(&apic->lapic_timer.timer, - expire, HRTIMER_MODE_ABS_PINNED); - } else - apic_timer_expired(apic); - - local_irq_restore(flags); -} - -bool kvm_lapic_hv_timer_in_use(struct kvm_vcpu *vcpu) -{ - if (!lapic_in_kernel(vcpu)) - return false; - - return vcpu->arch.apic->lapic_timer.hv_timer_in_use; -} -EXPORT_SYMBOL_GPL(kvm_lapic_hv_timer_in_use); - -static void cancel_hv_tscdeadline(struct kvm_lapic *apic) -{ - kvm_x86_ops->cancel_hv_timer(apic->vcpu); - apic->lapic_timer.hv_timer_in_use = false; -} - -void kvm_lapic_expired_hv_timer(struct kvm_vcpu *vcpu) -{ - struct kvm_lapic *apic = vcpu->arch.apic; - - WARN_ON(!apic->lapic_timer.hv_timer_in_use); - WARN_ON(swait_active(&vcpu->wq)); - cancel_hv_tscdeadline(apic); - apic_timer_expired(apic); -} -EXPORT_SYMBOL_GPL(kvm_lapic_expired_hv_timer); - -static bool start_hv_tscdeadline(struct kvm_lapic *apic) -{ - u64 tscdeadline = apic->lapic_timer.tscdeadline; - - if (atomic_read(&apic->lapic_timer.pending) || - kvm_x86_ops->set_hv_timer(apic->vcpu, tscdeadline)) { - if (apic->lapic_timer.hv_timer_in_use) - cancel_hv_tscdeadline(apic); - } else { - apic->lapic_timer.hv_timer_in_use = true; - hrtimer_cancel(&apic->lapic_timer.timer); - - /* In case the sw timer 
triggered in the window */ - if (atomic_read(&apic->lapic_timer.pending)) - cancel_hv_tscdeadline(apic); - } - trace_kvm_hv_timer_state(apic->vcpu->vcpu_id, - apic->lapic_timer.hv_timer_in_use); - return apic->lapic_timer.hv_timer_in_use; -} - -void kvm_lapic_switch_to_hv_timer(struct kvm_vcpu *vcpu) -{ - struct kvm_lapic *apic = vcpu->arch.apic; - - WARN_ON(apic->lapic_timer.hv_timer_in_use); - - if (apic_lvtt_tscdeadline(apic)) - start_hv_tscdeadline(apic); -} -EXPORT_SYMBOL_GPL(kvm_lapic_switch_to_hv_timer); - -void kvm_lapic_switch_to_sw_timer(struct kvm_vcpu *vcpu) -{ - struct kvm_lapic *apic = vcpu->arch.apic; - - /* Possibly the TSC deadline timer is not enabled yet */ - if (!apic->lapic_timer.hv_timer_in_use) - return; - - cancel_hv_tscdeadline(apic); - - if (atomic_read(&apic->lapic_timer.pending)) - return; - - start_sw_tscdeadline(apic); -} -EXPORT_SYMBOL_GPL(kvm_lapic_switch_to_sw_timer); - static void start_apic_timer(struct kvm_lapic *apic) { ktime_t now; @@ -1467,9 +1245,6 @@ static void start_apic_timer(struct kvm_lapic *apic) apic->lapic_timer.period, ktime_to_ns(ktime_add_ns(now, apic->lapic_timer.period))); - } else if (apic_lvtt_tscdeadline(apic)) { - if (!(kvm_x86_ops->set_hv_timer && start_hv_tscdeadline(apic))) - start_sw_tscdeadline(apic); } } @@ -1492,8 +1267,6 @@ int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val) { int ret = 0; - trace_kvm_apic_write(reg, val); - switch (reg) { case APIC_ID: /* Local APIC ID */ if (!apic_x2apic_mode(apic)) @@ -1535,7 +1308,7 @@ int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val) int i; u32 lvt_val; - for (i = 0; i < KVM_APIC_LVT_NUM; i++) { + for (i = 0; i < GVM_APIC_LVT_NUM; i++) { lvt_val = kvm_lapic_get_reg(apic, APIC_LVTT + 0x10 * i); kvm_lapic_set_reg(apic, APIC_LVTT + 0x10 * i, @@ -1583,9 +1356,6 @@ int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val) break; case APIC_TMICT: - if (apic_lvtt_tscdeadline(apic)) - break; - hrtimer_cancel(&apic->lapic_timer.timer); kvm_lapic_set_reg(apic, APIC_TMICT, val); start_apic_timer(apic); @@ -1593,14 +1363,14 @@ int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val) case APIC_TDCR: if (val & 4) - apic_debug("KVM_WRITE:TDCR %x\n", val); + apic_debug("GVM_WRITE:TDCR %x\n", val); kvm_lapic_set_reg(apic, APIC_TDCR, val); update_divide_count(apic); break; case APIC_ESR: if (apic_x2apic_mode(apic) && val != 0) { - apic_debug("KVM_WRITE:ESR not zero %x\n", val); + apic_debug("GVM_WRITE:ESR not zero %x\n", val); ret = 1; } break; @@ -1619,7 +1389,6 @@ int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val) apic_debug("Local APIC Write to read-only register %x\n", reg); return ret; } -EXPORT_SYMBOL_GPL(kvm_lapic_reg_write); static int apic_mmio_write(struct kvm_vcpu *vcpu, struct kvm_io_device *this, gpa_t address, int len, const void *data) @@ -1658,7 +1427,6 @@ void kvm_lapic_set_eoi(struct kvm_vcpu *vcpu) { kvm_lapic_reg_write(vcpu->arch.apic, APIC_EOI, 0); } -EXPORT_SYMBOL_GPL(kvm_lapic_set_eoi); /* emulate APIC access in a trap manner */ void kvm_apic_write_nodecode(struct kvm_vcpu *vcpu, u32 offset) @@ -1673,7 +1441,6 @@ void kvm_apic_write_nodecode(struct kvm_vcpu *vcpu, u32 offset) /* TODO: optimize to just emulate side effect w/o one more write */ kvm_lapic_reg_write(vcpu->arch.apic, offset, val); } -EXPORT_SYMBOL_GPL(kvm_apic_write_nodecode); void kvm_free_lapic(struct kvm_vcpu *vcpu) { @@ -1684,14 +1451,8 @@ void kvm_free_lapic(struct kvm_vcpu *vcpu) hrtimer_cancel(&apic->lapic_timer.timer); - if (!(vcpu->arch.apic_base & 
MSR_IA32_APICBASE_ENABLE)) - static_key_slow_dec_deferred(&apic_hw_disabled); - - if (!apic->sw_enabled) - static_key_slow_dec_deferred(&apic_sw_disabled); - if (apic->regs) - free_page((unsigned long)apic->regs); + free_page((size_t)apic->regs); kfree(apic); } @@ -1702,31 +1463,7 @@ void kvm_free_lapic(struct kvm_vcpu *vcpu) *---------------------------------------------------------------------- */ -u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu) -{ - struct kvm_lapic *apic = vcpu->arch.apic; - - if (!lapic_in_kernel(vcpu) || apic_lvtt_oneshot(apic) || - apic_lvtt_period(apic)) - return 0; - - return apic->lapic_timer.tscdeadline; -} - -void kvm_set_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu, u64 data) -{ - struct kvm_lapic *apic = vcpu->arch.apic; - - if (!lapic_in_kernel(vcpu) || apic_lvtt_oneshot(apic) || - apic_lvtt_period(apic)) - return; - - hrtimer_cancel(&apic->lapic_timer.timer); - apic->lapic_timer.tscdeadline = data; - start_apic_timer(apic); -} - -void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8) +void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, size_t cr8) { struct kvm_lapic *apic = vcpu->arch.apic; @@ -1760,9 +1497,7 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value) if ((old_value ^ value) & MSR_IA32_APICBASE_ENABLE) { if (value & MSR_IA32_APICBASE_ENABLE) { kvm_apic_set_xapic_id(apic, vcpu->vcpu_id); - static_key_slow_dec_deferred(&apic_hw_disabled); } else { - static_key_slow_inc(&apic_hw_disabled.key); recalculate_apic_map(vcpu->kvm); } } @@ -1780,7 +1515,7 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value) if ((value & MSR_IA32_APICBASE_ENABLE) && apic->base_address != APIC_DEFAULT_PHYS_BASE) - pr_warn_once("APIC base relocation is unsupported by KVM"); + pr_warn_once("APIC base relocation is unsupported by kvm"); /* with FSB delivery interrupt, we can restart APIC functionality */ apic_debug("apic base msr is 0x%016" PRIx64 ", and base address is " @@ -1809,10 +1544,10 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event) } kvm_apic_set_version(apic->vcpu); - for (i = 0; i < KVM_APIC_LVT_NUM; i++) + for (i = 0; i < GVM_APIC_LVT_NUM; i++) kvm_lapic_set_reg(apic, APIC_LVTT + 0x10 * i, APIC_LVT_MASKED); apic_update_lvtt(apic); - if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_LINT0_REENABLED)) + if (kvm_check_has_quirk(vcpu->kvm, GVM_X86_QUIRK_LINT0_REENABLED)) kvm_lapic_set_reg(apic, APIC_LVT0, SET_APIC_DELIVERY_MODE(0, APIC_MODE_EXTINT)); apic_manage_nmi_watchdog(apic, kvm_lapic_get_reg(apic, APIC_LVT0)); @@ -1840,7 +1575,6 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event) if (kvm_vcpu_is_bsp(vcpu)) kvm_lapic_set_base(vcpu, vcpu->arch.apic_base | MSR_IA32_APICBASE_BSP); - vcpu->arch.pv_eoi.msr_val = 0; apic_update_ppr(apic); vcpu->arch.apic_arb_prio = 0; @@ -1945,7 +1679,6 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu) * thinking that APIC satet has changed. 
*/ vcpu->arch.apic_base = MSR_IA32_APICBASE_ENABLE; - static_key_slow_inc(&apic_sw_disabled.key); /* sw disabled at reset */ kvm_lapic_reset(vcpu, false); kvm_iodevice_init(&apic->dev, &apic_mmio_ops); @@ -1991,8 +1724,6 @@ void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu) if (atomic_read(&apic->lapic_timer.pending) > 0) { kvm_apic_local_deliver(apic, APIC_LVTT); - if (apic_lvtt_tscdeadline(apic)) - apic->lapic_timer.tscdeadline = 0; atomic_set(&apic->lapic_timer.pending, 0); } } @@ -2016,11 +1747,6 @@ int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu) apic_update_ppr(apic); apic_clear_irr(vector, apic); - if (test_bit(vector, vcpu_to_synic(vcpu)->auto_eoi_bitmap)) { - apic_clear_isr(vector, apic); - apic_update_ppr(apic); - } - return vector; } @@ -2086,7 +1812,7 @@ int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s) kvm_x86_ops->hwapic_isr_update(vcpu, apic_find_highest_isr(apic)); } - kvm_make_request(KVM_REQ_EVENT, vcpu); + kvm_make_request(GVM_REQ_EVENT, vcpu); if (ioapic_in_kernel(vcpu->kvm)) kvm_rtc_eoi_tracking_restore_one(vcpu); @@ -2095,63 +1821,11 @@ int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s) return 0; } -void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu) -{ - struct hrtimer *timer; - - if (!lapic_in_kernel(vcpu)) - return; - - timer = &vcpu->arch.apic->lapic_timer.timer; - if (hrtimer_cancel(timer)) - hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED); -} - -/* - * apic_sync_pv_eoi_from_guest - called on vmexit or cancel interrupt - * - * Detect whether guest triggered PV EOI since the - * last entry. If yes, set EOI on guests's behalf. - * Clear PV EOI in guest memory in any case. - */ -static void apic_sync_pv_eoi_from_guest(struct kvm_vcpu *vcpu, - struct kvm_lapic *apic) -{ - bool pending; - int vector; - /* - * PV EOI state is derived from KVM_APIC_PV_EOI_PENDING in host - * and KVM_PV_EOI_ENABLED in guest memory as follows: - * - * KVM_APIC_PV_EOI_PENDING is unset: - * -> host disabled PV EOI. - * KVM_APIC_PV_EOI_PENDING is set, KVM_PV_EOI_ENABLED is set: - * -> host enabled PV EOI, guest did not execute EOI yet. - * KVM_APIC_PV_EOI_PENDING is set, KVM_PV_EOI_ENABLED is unset: - * -> host enabled PV EOI, guest executed EOI. - */ - BUG_ON(!pv_eoi_enabled(vcpu)); - pending = pv_eoi_get_pending(vcpu); - /* - * Clear pending bit in any case: it will be set again on vmentry. - * While this might not be ideal from performance point of view, - * this makes sure pv eoi is only enabled when we know it's safe. - */ - pv_eoi_clr_pending(vcpu); - if (pending) - return; - vector = apic_set_eoi(apic); - trace_kvm_pv_eoi(apic, vector); -} - void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu) { u32 data; - if (test_bit(KVM_APIC_PV_EOI_PENDING, &vcpu->arch.apic_attention)) - apic_sync_pv_eoi_from_guest(vcpu, vcpu->arch.apic); - - if (!test_bit(KVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention)) + if (!test_bit(GVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention)) return; if (kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.apic->vapic_cache, &data, @@ -2161,41 +1835,13 @@ void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu) apic_set_tpr(vcpu->arch.apic, data & 0xff); } -/* - * apic_sync_pv_eoi_to_guest - called before vmentry - * - * Detect whether it's safe to enable PV EOI and - * if yes do so. - */ -static void apic_sync_pv_eoi_to_guest(struct kvm_vcpu *vcpu, - struct kvm_lapic *apic) -{ - if (!pv_eoi_enabled(vcpu) || - /* IRR set or many bits in ISR: could be nested. 
*/ - apic->irr_pending || - /* Cache not set: could be safe but we don't bother. */ - apic->highest_isr_cache == -1 || - /* Need EOI to update ioapic. */ - kvm_ioapic_handles_vector(apic, apic->highest_isr_cache)) { - /* - * PV EOI was disabled by apic_sync_pv_eoi_from_guest - * so we need not do anything here. - */ - return; - } - - pv_eoi_set_pending(apic->vcpu); -} - void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu) { u32 data, tpr; int max_irr, max_isr; struct kvm_lapic *apic = vcpu->arch.apic; - apic_sync_pv_eoi_to_guest(vcpu, apic); - - if (!test_bit(KVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention)) + if (!test_bit(GVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention)) return; tpr = kvm_lapic_get_reg(apic, APIC_TASKPRI) & 0xff; @@ -2218,9 +1864,9 @@ int kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr) &vcpu->arch.apic->vapic_cache, vapic_addr, sizeof(u32))) return -EINVAL; - __set_bit(KVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention); + __set_bit(GVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention); } else { - __clear_bit(KVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention); + __clear_bit(GVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention); } vcpu->arch.apic->vapic_addr = vapic_addr; @@ -2253,7 +1899,7 @@ int kvm_x2apic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data) return 1; if (reg == APIC_DFR || reg == APIC_ICR2) { - apic_debug("KVM_APIC_READ: read x2apic reserved register %x\n", + apic_debug("GVM_APIC_READ: read x2apic reserved register %x\n", reg); return 1; } @@ -2268,95 +1914,48 @@ int kvm_x2apic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data) return 0; } -int kvm_hv_vapic_msr_write(struct kvm_vcpu *vcpu, u32 reg, u64 data) -{ - struct kvm_lapic *apic = vcpu->arch.apic; - - if (!lapic_in_kernel(vcpu)) - return 1; - - /* if this is ICR write vector before command */ - if (reg == APIC_ICR) - kvm_lapic_reg_write(apic, APIC_ICR2, (u32)(data >> 32)); - return kvm_lapic_reg_write(apic, reg, (u32)data); -} - -int kvm_hv_vapic_msr_read(struct kvm_vcpu *vcpu, u32 reg, u64 *data) -{ - struct kvm_lapic *apic = vcpu->arch.apic; - u32 low, high = 0; - - if (!lapic_in_kernel(vcpu)) - return 1; - - if (kvm_lapic_reg_read(apic, reg, 4, &low)) - return 1; - if (reg == APIC_ICR) - kvm_lapic_reg_read(apic, APIC_ICR2, 4, &high); - - *data = (((u64)high) << 32) | low; - - return 0; -} - -int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data) -{ - u64 addr = data & ~KVM_MSR_ENABLED; - if (!IS_ALIGNED(addr, 4)) - return 1; - - vcpu->arch.pv_eoi.msr_val = data; - if (!pv_eoi_enabled(vcpu)) - return 0; - return kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.pv_eoi.data, - addr, sizeof(u8)); -} - void kvm_apic_accept_events(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; u8 sipi_vector; - unsigned long pe; + size_t pe; if (!lapic_in_kernel(vcpu) || !apic->pending_events) return; /* * INITs are latched while in SMM. Because an SMM CPU cannot - * be in KVM_MP_STATE_INIT_RECEIVED state, just eat SIPIs + * be in GVM_MP_STATE_INIT_RECEIVED state, just eat SIPIs * and delay processing of INIT until the next RSM. 
*/ if (is_smm(vcpu)) { - WARN_ON_ONCE(vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED); - if (test_bit(KVM_APIC_SIPI, &apic->pending_events)) - clear_bit(KVM_APIC_SIPI, &apic->pending_events); + WARN_ON_ONCE(vcpu->arch.mp_state == GVM_MP_STATE_INIT_RECEIVED); + if (test_bit(GVM_APIC_SIPI, &apic->pending_events)) + clear_bit(GVM_APIC_SIPI, &apic->pending_events); return; } pe = xchg(&apic->pending_events, 0); - if (test_bit(KVM_APIC_INIT, &pe)) { + if (test_bit(GVM_APIC_INIT, &pe)) { kvm_lapic_reset(vcpu, true); kvm_vcpu_reset(vcpu, true); if (kvm_vcpu_is_bsp(apic->vcpu)) - vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; + vcpu->arch.mp_state = GVM_MP_STATE_RUNNABLE; else - vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED; + vcpu->arch.mp_state = GVM_MP_STATE_INIT_RECEIVED; } - if (test_bit(KVM_APIC_SIPI, &pe) && - vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) { + if (test_bit(GVM_APIC_SIPI, &pe) && + vcpu->arch.mp_state == GVM_MP_STATE_INIT_RECEIVED) { /* evaluate pending_events before reading the vector */ smp_rmb(); sipi_vector = apic->sipi_vector; apic_debug("vcpu %d received sipi with vector # %x\n", vcpu->vcpu_id, sipi_vector); kvm_vcpu_deliver_sipi_vector(vcpu, sipi_vector); - vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; + vcpu->arch.mp_state = GVM_MP_STATE_RUNNABLE; } } void kvm_lapic_init(void) { - /* do not patch jump label more than once per second */ - jump_label_rate_limit(&apic_hw_disabled, HZ); - jump_label_rate_limit(&apic_sw_disabled, HZ); } diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index f60d01c..ffbed39 100644..100755 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -1,3 +1,7 @@ +/* + * Copyright 2019 Google LLC + */ + #ifndef __KVM_X86_LAPIC_H #define __KVM_X86_LAPIC_H @@ -5,26 +9,31 @@ #include <linux/kvm_host.h> -#define KVM_APIC_INIT 0 -#define KVM_APIC_SIPI 1 -#define KVM_APIC_LVT_NUM 6 +#include <ntkrutils.h> +#include <asm/apicdef.h> +#include <asm/msr-index.h> +#include <gvm_types.h> +#include <ntkrutils.h> + +#define GVM_APIC_INIT 0 +#define GVM_APIC_SIPI 1 +#define GVM_APIC_LVT_NUM 6 + +#define GVM_APIC_SHORT_MASK 0xc0000 +#define GVM_APIC_DEST_MASK 0x800 -#define KVM_APIC_SHORT_MASK 0xc0000 -#define KVM_APIC_DEST_MASK 0x800 +#define u32 unsigned int struct kvm_timer { struct hrtimer timer; s64 period; /* unit: ns */ u32 timer_mode; u32 timer_mode_mask; - u64 tscdeadline; - u64 expired_tscdeadline; atomic_t pending; /* accumulated triggered timers */ - bool hv_timer_in_use; }; struct kvm_lapic { - unsigned long base_address; + size_t base_address; struct kvm_io_device dev; struct kvm_timer lapic_timer; u32 divide_count; @@ -41,10 +50,10 @@ struct kvm_lapic { * the guest 1:1, because it is accessed by the vmx microcode. * Note: Only one register, the TPR, is used by the microcode. 
*/ - void *regs; + u8 *regs; gpa_t vapic_addr; struct gfn_to_hva_cache vapic_cache; - unsigned long pending_events; + size_t pending_events; unsigned int sipi_vector; }; @@ -59,7 +68,7 @@ int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu); void kvm_apic_accept_events(struct kvm_vcpu *vcpu); void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event); u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu); -void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8); +void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, size_t cr8); void kvm_lapic_set_eoi(struct kvm_vcpu *vcpu); void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value); u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu); @@ -85,9 +94,6 @@ int kvm_apic_get_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s); int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s); int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu); -u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu); -void kvm_set_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu, u64 data); - void kvm_apic_write_nodecode(struct kvm_vcpu *vcpu, u32 offset); void kvm_apic_set_eoi_accelerated(struct kvm_vcpu *vcpu, int vector); @@ -98,15 +104,6 @@ void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu); int kvm_x2apic_msr_write(struct kvm_vcpu *vcpu, u32 msr, u64 data); int kvm_x2apic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data); -int kvm_hv_vapic_msr_write(struct kvm_vcpu *vcpu, u32 msr, u64 data); -int kvm_hv_vapic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data); - -static inline bool kvm_hv_vapic_assist_page_enabled(struct kvm_vcpu *vcpu) -{ - return vcpu->arch.hyperv.hv_vapic & HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE; -} - -int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data); void kvm_lapic_init(void); #define VEC_POS(v) ((v) & (32 - 1)) @@ -114,12 +111,12 @@ void kvm_lapic_init(void); static inline void kvm_lapic_set_vector(int vec, void *bitmap) { - set_bit(VEC_POS(vec), (bitmap) + REG_POS(vec)); + set_bit(VEC_POS(vec), (size_t *)((u8 *)(bitmap) + REG_POS(vec))); } static inline void kvm_lapic_set_irr(int vec, struct kvm_lapic *apic) { - kvm_lapic_set_vector(vec, apic->regs + APIC_IRR); + kvm_lapic_set_vector(vec, (unsigned char *)apic->regs + APIC_IRR); /* * irr_pending must be true if any interrupt is pending; set it after * APIC_IRR to avoid race with apic_clear_irr @@ -129,39 +126,27 @@ static inline void kvm_lapic_set_irr(int vec, struct kvm_lapic *apic) static inline u32 kvm_lapic_get_reg(struct kvm_lapic *apic, int reg_off) { - return *((u32 *) (apic->regs + reg_off)); + return *((u32 *) ((unsigned char *)apic->regs + reg_off)); } static inline void kvm_lapic_set_reg(struct kvm_lapic *apic, int reg_off, u32 val) { - *((u32 *) (apic->regs + reg_off)) = val; + *((u32 *) ((unsigned char *)apic->regs + reg_off)) = val; } -extern struct static_key kvm_no_apic_vcpu; - static inline bool lapic_in_kernel(struct kvm_vcpu *vcpu) { - if (static_key_false(&kvm_no_apic_vcpu)) - return vcpu->arch.apic; - return true; + return vcpu->arch.apic; } -extern struct static_key_deferred apic_hw_disabled; - static inline int kvm_apic_hw_enabled(struct kvm_lapic *apic) { - if (static_key_false(&apic_hw_disabled.key)) - return apic->vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE; - return MSR_IA32_APICBASE_ENABLE; + return apic->vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE; } -extern struct static_key_deferred apic_sw_disabled; - static inline bool kvm_apic_sw_enabled(struct kvm_lapic *apic) { - if (static_key_false(&apic_sw_disabled.key)) - return apic->sw_enabled; 
- return true; + return apic->sw_enabled; } static inline bool kvm_apic_present(struct kvm_vcpu *vcpu) @@ -197,7 +182,7 @@ static inline bool kvm_lowest_prio_delivery(struct kvm_lapic_irq *irq) static inline int kvm_lapic_latched_init(struct kvm_vcpu *vcpu) { - return lapic_in_kernel(vcpu) && test_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events); + return lapic_in_kernel(vcpu) && test_bit(GVM_APIC_INIT, &vcpu->arch.apic->pending_events); } static inline u32 kvm_apic_id(struct kvm_lapic *apic) @@ -213,14 +198,8 @@ static inline u32 kvm_apic_id(struct kvm_lapic *apic) bool kvm_apic_pending_eoi(struct kvm_vcpu *vcpu, int vector); -void wait_lapic_expire(struct kvm_vcpu *vcpu); - bool kvm_intr_is_single_vcpu_fast(struct kvm *kvm, struct kvm_lapic_irq *irq, struct kvm_vcpu **dest_vcpu); int kvm_vector_to_index(u32 vector, u32 dest_vcpus, - const unsigned long *bitmap, u32 bitmap_size); -void kvm_lapic_switch_to_sw_timer(struct kvm_vcpu *vcpu); -void kvm_lapic_switch_to_hv_timer(struct kvm_vcpu *vcpu); -void kvm_lapic_expired_hv_timer(struct kvm_vcpu *vcpu); -bool kvm_lapic_hv_timer_in_use(struct kvm_vcpu *vcpu); + const size_t *bitmap, u32 bitmap_size); #endif diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index d9c7e98..e183d24 100644..100755 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -8,6 +8,7 @@ * * Copyright (C) 2006 Qumranet, Inc. * Copyright 2010 Red Hat, Inc. and/or its affiliates. + * Copyright 2019 Google LLC * * Authors: * Yaniv Kamay <yaniv@qumranet.com> @@ -23,27 +24,12 @@ #include "x86.h" #include "kvm_cache_regs.h" #include "cpuid.h" +#include <linux/list.h> #include <linux/kvm_host.h> -#include <linux/types.h> -#include <linux/string.h> -#include <linux/mm.h> -#include <linux/highmem.h> -#include <linux/moduleparam.h> -#include <linux/export.h> -#include <linux/swap.h> -#include <linux/hugetlb.h> -#include <linux/compiler.h> -#include <linux/srcu.h> -#include <linux/slab.h> -#include <linux/uaccess.h> - -#include <asm/page.h> -#include <asm/cmpxchg.h> -#include <asm/io.h> -#include <asm/vmx.h> #include <asm/kvm_page_track.h> +#pragma warning(disable : 4221) /* * When setting this variable to true it enables Two-Dimensional-Paging * where the hardware walks 2 page tables: @@ -51,7 +37,7 @@ * 2. while doing 1. it walks guest-physical to host-physical * If the hardware supports that we don't need to do shadow paging. */ -bool tdp_enabled = false; +bool tdp_enabled = true; enum { AUDIT_PRE_PAGE_FAULT, @@ -72,8 +58,8 @@ module_param(dbg, bool, 0644); #define rmap_printk(x...) do { if (dbg) printk(x); } while (0) #define MMU_WARN_ON(x) WARN_ON(x) #else -#define pgprintk(x...) do { } while (0) -#define rmap_printk(x...) do { } while (0) +#define pgprintk(x,...) do { } while (0) +#define rmap_printk(x,...) 
do { } while (0) #define MMU_WARN_ON(x) do { } while (0) #endif @@ -129,11 +115,6 @@ module_param(dbg, bool, 0644); #define ACC_USER_MASK PT_USER_MASK #define ACC_ALL (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK) -#include <trace/events/kvm.h> - -#define CREATE_TRACE_POINTS -#include "mmutrace.h" - #define SPTE_HOST_WRITEABLE (1ULL << PT_FIRST_AVAIL_BITS_SHIFT) #define SPTE_MMU_WRITEABLE (1ULL << (PT_FIRST_AVAIL_BITS_SHIFT + 1)) @@ -162,13 +143,13 @@ struct kvm_shadow_walk_iterator { #define for_each_shadow_entry_lockless(_vcpu, _addr, _walker, spte) \ for (shadow_walk_init(&(_walker), _vcpu, _addr); \ - shadow_walk_okay(&(_walker)) && \ - ({ spte = mmu_spte_get_lockless(_walker.sptep); 1; }); \ + shadow_walk_okay(&(_walker)); \ __shadow_walk_next(&(_walker), spte)) -static struct kmem_cache *pte_list_desc_cache; -static struct kmem_cache *mmu_page_header_cache; -static struct percpu_counter kvm_total_used_mmu_pages; +// todo-001 +//static struct kmem_cache *pte_list_desc_cache; +//static struct kmem_cache *mmu_page_header_cache; +//static struct percpu_counter kvm_total_used_mmu_pages; static u64 __read_mostly shadow_nx_mask; static u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */ @@ -178,6 +159,60 @@ static u64 __read_mostly shadow_dirty_mask; static u64 __read_mostly shadow_mmio_mask; static u64 __read_mostly shadow_present_mask; +#ifdef CONFIG_X86_64 +typedef u64 phys_addr_t; +#define __PHYSICAL_MASK_SHIFT 46 +#endif +/* PAGE_SHIFT determines the page size */ +#ifndef PAGE_SIZE +#define PAGE_SHIFT 12 +#define PAGE_SIZE (_AC(1,UL) << PAGE_SHIFT) +#define PAGE_MASK (~(PAGE_SIZE-1)) +#endif + +#define PMD_PAGE_SIZE (_AC(1, UL) << PMD_SHIFT) +#define PMD_PAGE_MASK (~(PMD_PAGE_SIZE-1)) + +#define PUD_PAGE_SIZE (_AC(1, UL) << PUD_SHIFT) +#define PUD_PAGE_MASK (~(PUD_PAGE_SIZE-1)) + +#define __PHYSICAL_MASK ((phys_addr_t)((1ULL << __PHYSICAL_MASK_SHIFT) - 1)) +#define __VIRTUAL_MASK ((1ULL << __VIRTUAL_MASK_SHIFT) - 1) + +/* Cast *PAGE_MASK to a signed type so that it is sign-extended if +virtual addresses are 32-bits but physical addresses are larger +(ie, 32-bit PAE). */ +#define PHYSICAL_PAGE_MASK (((ssize_t)PAGE_MASK) & __PHYSICAL_MASK) +#define PHYSICAL_PMD_PAGE_MASK (((ssize_t)PMD_PAGE_MASK) & __PHYSICAL_MASK) +#define PHYSICAL_PUD_PAGE_MASK (((ssize_t)PUD_PAGE_MASK) & __PHYSICAL_MASK) + +/* Extracts the PFN from a (pte|pmd|pud|pgd)val_t of a 4KB page */ +#define PTE_PFN_MASK ((pteval_t)PHYSICAL_PAGE_MASK) + +/* +* Extracts the flags from a (pte|pmd|pud|pgd)val_t +* This includes the protection key value. +*/ +#define PTE_FLAGS_MASK (~PTE_PFN_MASK) + +#define pte_val(pte) (pte.pte) + +static pteval_t pte_flags(pte_t pte) +{ + return pte_val(pte) & PTE_FLAGS_MASK; +} + +static size_t pte_pfn(pte_t pte) +{ + return (pte_val(pte)& PTE_PFN_MASK) >> PAGE_SHIFT; +} + +static int pte_write(pte_t pte) +{ + return pte_flags(pte) & _PAGE_RW; +} + + static void mmu_spte_set(u64 *sptep, u64 spte); static void mmu_free_roots(struct kvm_vcpu *vcpu); @@ -185,7 +220,6 @@ void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask) { shadow_mmio_mask = mmio_mask; } -EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask); /* * the low bit of the generation number is always presumed to be zero. 
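The hunk above open-codes the pte accessors and physical-address masks that previously came from the dropped <asm/page.h> and <asm/pgtable_types.h> includes. As a rough standalone sketch of what those masks select, assuming the same 46-bit physical address width, 4K pages and the architectural x86 R/W bit (all SKETCH_* names are illustrative stand-ins, not part of the patch):

    typedef unsigned long long pteval_t;

    #define SKETCH_PAGE_SHIFT       12
    #define SKETCH_PHYS_MASK_SHIFT  46
    /* bits [45:12] carry the page frame number; the rest is flag space */
    #define SKETCH_PHYS_MASK  ((((pteval_t)1) << SKETCH_PHYS_MASK_SHIFT) - 1)
    #define SKETCH_PFN_MASK   (SKETCH_PHYS_MASK & ~((((pteval_t)1) << SKETCH_PAGE_SHIFT) - 1))
    #define SKETCH_FLAGS_MASK (~SKETCH_PFN_MASK)
    #define SKETCH_PAGE_RW    (((pteval_t)1) << 1)  /* x86 R/W bit */

    /* pfn of the mapping: strip the flags, shift out the page offset */
    static unsigned long long sketch_pte_pfn(pteval_t pte)
    {
        return (pte & SKETCH_PFN_MASK) >> SKETCH_PAGE_SHIFT;
    }

    /* writable if the R/W flag survives the flags mask */
    static int sketch_pte_write(pteval_t pte)
    {
        return (pte & SKETCH_FLAGS_MASK & SKETCH_PAGE_RW) != 0;
    }
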
@@ -240,7 +274,6 @@ static void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 gfn, access &= ACC_WRITE_MASK | ACC_USER_MASK; mask |= shadow_mmio_mask | access | gfn << PAGE_SHIFT; - trace_mark_mmio_spte(sptep, gfn, access, gen); mmu_spte_set(sptep, mask); } @@ -279,7 +312,6 @@ static bool check_mmio_spte(struct kvm_vcpu *vcpu, u64 spte) kvm_gen = kvm_current_mmio_generation(vcpu); spte_gen = get_mmio_spte_generation(spte); - trace_check_mmio_spte(spte, kvm_gen, spte_gen); return likely(kvm_gen == spte_gen); } @@ -293,7 +325,6 @@ void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, shadow_x_mask = x_mask; shadow_present_mask = p_mask; } -EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes); static int is_cpuid_PSE36(void) { @@ -354,7 +385,9 @@ static u64 __update_clear_spte_slow(u64 *sptep, u64 spte) static u64 __get_spte_lockless(u64 *sptep) { - return ACCESS_ONCE(*sptep); + u64 temp; + ACCESS_ONCE(*sptep, temp); + return temp; } #else union split_spte { @@ -561,12 +594,6 @@ static bool mmu_spte_update(u64 *sptep, u64 new_spte) ret = true; if (!shadow_accessed_mask) { - /* - * We don't set page dirty when dropping non-writable spte. - * So do it now if the new spte is becoming non-writable. - */ - if (ret) - kvm_set_pfn_dirty(spte_to_pfn(old_spte)); return ret; } @@ -578,11 +605,6 @@ static bool mmu_spte_update(u64 *sptep, u64 new_spte) shadow_accessed_mask | shadow_dirty_mask)) ret = true; - if (spte_is_bit_cleared(old_spte, new_spte, shadow_accessed_mask)) - kvm_set_pfn_accessed(spte_to_pfn(old_spte)); - if (spte_is_bit_cleared(old_spte, new_spte, shadow_dirty_mask)) - kvm_set_pfn_dirty(spte_to_pfn(old_spte)); - return ret; } @@ -607,17 +629,12 @@ static int mmu_spte_clear_track_bits(u64 *sptep) pfn = spte_to_pfn(old_spte); /* - * KVM does not hold the refcount of the page used by + * kvm does not hold the refcount of the page used by * kvm mmu, before reclaiming the page, we should * unmap it from mmu first. */ WARN_ON(!kvm_is_reserved_pfn(pfn) && !page_count(pfn_to_page(pfn))); - if (!shadow_accessed_mask || old_spte & shadow_accessed_mask) - kvm_set_pfn_accessed(pfn); - if (old_spte & (shadow_dirty_mask ? 
shadow_dirty_mask : - PT_WRITABLE_MASK)) - kvm_set_pfn_dirty(pfn); return 1; } @@ -663,14 +680,14 @@ static void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu) } static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache, - struct kmem_cache *base_cache, int min) + size_t cache_size, int min) { void *obj; if (cache->nobjs >= min) return 0; while (cache->nobjs < ARRAY_SIZE(cache->objects)) { - obj = kmem_cache_zalloc(base_cache, GFP_KERNEL); + obj = kzalloc_fast(cache_size, GFP_KERNEL); if (!obj) return -ENOMEM; cache->objects[cache->nobjs++] = obj; @@ -683,11 +700,10 @@ static int mmu_memory_cache_free_objects(struct kvm_mmu_memory_cache *cache) return cache->nobjs; } -static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc, - struct kmem_cache *cache) +static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc) { while (mc->nobjs) - kmem_cache_free(cache, mc->objects[--mc->nobjs]); + kfree_fast(mc->objects[--mc->nobjs]); } static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache, @@ -709,7 +725,7 @@ static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache, static void mmu_free_memory_cache_page(struct kvm_mmu_memory_cache *mc) { while (mc->nobjs) - free_page((unsigned long)mc->objects[--mc->nobjs]); + free_page((size_t)mc->objects[--mc->nobjs]); } static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu) @@ -717,25 +733,23 @@ static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu) int r; r = mmu_topup_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache, - pte_list_desc_cache, 8 + PTE_PREFETCH_NUM); + sizeof(struct pte_list_desc), 8 + PTE_PREFETCH_NUM); if (r) goto out; r = mmu_topup_memory_cache_page(&vcpu->arch.mmu_page_cache, 8); if (r) goto out; r = mmu_topup_memory_cache(&vcpu->arch.mmu_page_header_cache, - mmu_page_header_cache, 4); + sizeof(struct kvm_mmu_page), 4); out: return r; } static void mmu_free_memory_caches(struct kvm_vcpu *vcpu) { - mmu_free_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache, - pte_list_desc_cache); + mmu_free_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache); mmu_free_memory_cache_page(&vcpu->arch.mmu_page_cache); - mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache, - mmu_page_header_cache); + mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache); } static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc) @@ -754,7 +768,7 @@ static struct pte_list_desc *mmu_alloc_pte_list_desc(struct kvm_vcpu *vcpu) static void mmu_free_pte_list_desc(struct pte_list_desc *pte_list_desc) { - kmem_cache_free(pte_list_desc_cache, pte_list_desc); + kfree_fast(pte_list_desc); } static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index) @@ -773,43 +787,6 @@ static void kvm_mmu_page_set_gfn(struct kvm_mmu_page *sp, int index, gfn_t gfn) sp->gfns[index] = gfn; } -/* - * Return the pointer to the large page information for a given gfn, - * handling slots that are not large page aligned. 
- */ -static struct kvm_lpage_info *lpage_info_slot(gfn_t gfn, - struct kvm_memory_slot *slot, - int level) -{ - unsigned long idx; - - idx = gfn_to_index(gfn, slot->base_gfn, level); - return &slot->arch.lpage_info[level - 2][idx]; -} - -static void update_gfn_disallow_lpage_count(struct kvm_memory_slot *slot, - gfn_t gfn, int count) -{ - struct kvm_lpage_info *linfo; - int i; - - for (i = PT_DIRECTORY_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) { - linfo = lpage_info_slot(gfn, slot, i); - linfo->disallow_lpage += count; - WARN_ON(linfo->disallow_lpage < 0); - } -} - -void kvm_mmu_gfn_disallow_lpage(struct kvm_memory_slot *slot, gfn_t gfn) -{ - update_gfn_disallow_lpage_count(slot, gfn, 1); -} - -void kvm_mmu_gfn_allow_lpage(struct kvm_memory_slot *slot, gfn_t gfn) -{ - update_gfn_disallow_lpage_count(slot, gfn, -1); -} - static void account_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp) { struct kvm_memslots *slots; @@ -823,10 +800,8 @@ static void account_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp) /* the non-leaf shadow pages are keeping readonly. */ if (sp->role.level > PT_PAGE_TABLE_LEVEL) - return kvm_slot_page_track_add_page(kvm, slot, gfn, + kvm_slot_page_track_add_page(kvm, slot, gfn, KVM_PAGE_TRACK_WRITE); - - kvm_mmu_gfn_disallow_lpage(slot, gfn); } static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp) @@ -840,55 +815,20 @@ static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp) slots = kvm_memslots_for_spte_role(kvm, sp->role); slot = __gfn_to_memslot(slots, gfn); if (sp->role.level > PT_PAGE_TABLE_LEVEL) - return kvm_slot_page_track_remove_page(kvm, slot, gfn, + kvm_slot_page_track_remove_page(kvm, slot, gfn, KVM_PAGE_TRACK_WRITE); - - kvm_mmu_gfn_allow_lpage(slot, gfn); -} - -static bool __mmu_gfn_lpage_is_disallowed(gfn_t gfn, int level, - struct kvm_memory_slot *slot) -{ - struct kvm_lpage_info *linfo; - - if (slot) { - linfo = lpage_info_slot(gfn, slot, level); - return !!linfo->disallow_lpage; - } - - return true; } static bool mmu_gfn_lpage_is_disallowed(struct kvm_vcpu *vcpu, gfn_t gfn, int level) { - struct kvm_memory_slot *slot; - - slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); - return __mmu_gfn_lpage_is_disallowed(gfn, level, slot); -} - -static int host_mapping_level(struct kvm *kvm, gfn_t gfn) -{ - unsigned long page_size; - int i, ret = 0; - - page_size = kvm_host_page_size(kvm, gfn); - - for (i = PT_PAGE_TABLE_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) { - if (page_size >= KVM_HPAGE_SIZE(i)) - ret = i; - else - break; - } - - return ret; + return true; } static inline bool memslot_valid_for_gpte(struct kvm_memory_slot *slot, bool no_dirty_log) { - if (!slot || slot->flags & KVM_MEMSLOT_INVALID) + if (!slot || slot->flags & GVM_MEMSLOT_INVALID) return false; if (no_dirty_log && slot->dirty_bitmap) return false; @@ -912,29 +852,7 @@ gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t gfn, static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn, bool *force_pt_level) { - int host_level, level, max_level; - struct kvm_memory_slot *slot; - - if (unlikely(*force_pt_level)) - return PT_PAGE_TABLE_LEVEL; - - slot = kvm_vcpu_gfn_to_memslot(vcpu, large_gfn); - *force_pt_level = !memslot_valid_for_gpte(slot, true); - if (unlikely(*force_pt_level)) - return PT_PAGE_TABLE_LEVEL; - - host_level = host_mapping_level(vcpu->kvm, large_gfn); - - if (host_level == PT_PAGE_TABLE_LEVEL) - return host_level; - - max_level = min(kvm_x86_ops->get_lpage_level(), host_level); - - for (level = PT_DIRECTORY_LEVEL; level <= max_level; 
++level) - if (__mmu_gfn_lpage_is_disallowed(large_gfn, level, slot)) - break; - - return level - 1; + return PT_PAGE_TABLE_LEVEL; } /* @@ -956,17 +874,17 @@ static int pte_list_add(struct kvm_vcpu *vcpu, u64 *spte, if (!rmap_head->val) { rmap_printk("pte_list_add: %p %llx 0->1\n", spte, *spte); - rmap_head->val = (unsigned long)spte; + rmap_head->val = (size_t)spte; } else if (!(rmap_head->val & 1)) { rmap_printk("pte_list_add: %p %llx 1->many\n", spte, *spte); desc = mmu_alloc_pte_list_desc(vcpu); desc->sptes[0] = (u64 *)rmap_head->val; desc->sptes[1] = spte; - rmap_head->val = (unsigned long)desc | 1; + rmap_head->val = (size_t)desc | 1; ++count; } else { rmap_printk("pte_list_add: %p %llx many->many\n", spte, *spte); - desc = (struct pte_list_desc *)(rmap_head->val & ~1ul); + desc = (struct pte_list_desc *)(rmap_head->val & ~1ull); while (desc->sptes[PTE_LIST_EXT-1] && desc->more) { desc = desc->more; count += PTE_LIST_EXT; @@ -996,12 +914,12 @@ pte_list_desc_remove_entry(struct kvm_rmap_head *rmap_head, if (j != 0) return; if (!prev_desc && !desc->more) - rmap_head->val = (unsigned long)desc->sptes[0]; + rmap_head->val = (size_t)desc->sptes[0]; else if (prev_desc) prev_desc->more = desc->more; else - rmap_head->val = (unsigned long)desc->more | 1; + rmap_head->val = (size_t)desc->more | 1; mmu_free_pte_list_desc(desc); } @@ -1023,7 +941,7 @@ static void pte_list_remove(u64 *spte, struct kvm_rmap_head *rmap_head) rmap_head->val = 0; } else { rmap_printk("pte_list_remove: %p many->many\n", spte); - desc = (struct pte_list_desc *)(rmap_head->val & ~1ul); + desc = (struct pte_list_desc *)(rmap_head->val & ~1ull); prev_desc = NULL; while (desc) { for (i = 0; i < PTE_LIST_EXT && desc->sptes[i]; ++i) { @@ -1041,13 +959,13 @@ static void pte_list_remove(u64 *spte, struct kvm_rmap_head *rmap_head) } } -static struct kvm_rmap_head *__gfn_to_rmap(gfn_t gfn, int level, +static struct kvm_rmap_head *__gfn_to_rmap(gfn_t gfn, struct kvm_memory_slot *slot) { - unsigned long idx; + size_t idx; - idx = gfn_to_index(gfn, slot->base_gfn, level); - return &slot->arch.rmap[level - PT_PAGE_TABLE_LEVEL][idx]; + idx = gfn - slot->base_gfn; + return &slot->arch.rmap[idx]; } static struct kvm_rmap_head *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, @@ -1058,7 +976,7 @@ static struct kvm_rmap_head *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, slots = kvm_memslots_for_spte_role(kvm, sp->role); slot = __gfn_to_memslot(slots, gfn); - return __gfn_to_rmap(gfn, sp->role.level, slot); + return __gfn_to_rmap(gfn, slot); } static bool rmap_can_add(struct kvm_vcpu *vcpu) @@ -1123,7 +1041,7 @@ static u64 *rmap_get_first(struct kvm_rmap_head *rmap_head, goto out; } - iter->desc = (struct pte_list_desc *)(rmap_head->val & ~1ul); + iter->desc = (struct pte_list_desc *)(rmap_head->val & ~1ull); iter->pos = 0; sptep = iter->desc->sptes[iter->pos]; out: @@ -1296,13 +1214,13 @@ static bool __rmap_set_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head) */ static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, struct kvm_memory_slot *slot, - gfn_t gfn_offset, unsigned long mask) + gfn_t gfn_offset, size_t mask) { struct kvm_rmap_head *rmap_head; while (mask) { rmap_head = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask), - PT_PAGE_TABLE_LEVEL, slot); + slot); __rmap_write_protect(kvm, rmap_head, false); /* clear the first set bit */ @@ -1321,20 +1239,19 @@ static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, */ void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm, struct kvm_memory_slot *slot, - gfn_t gfn_offset, 
unsigned long mask) + gfn_t gfn_offset, size_t mask) { struct kvm_rmap_head *rmap_head; while (mask) { rmap_head = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask), - PT_PAGE_TABLE_LEVEL, slot); + slot); __rmap_clear_dirty(kvm, rmap_head); /* clear the first set bit */ mask &= mask - 1; } } -EXPORT_SYMBOL_GPL(kvm_mmu_clear_dirty_pt_masked); /** * kvm_arch_mmu_enable_log_dirty_pt_masked - enable dirty logging for selected @@ -1348,7 +1265,7 @@ EXPORT_SYMBOL_GPL(kvm_mmu_clear_dirty_pt_masked); */ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm, struct kvm_memory_slot *slot, - gfn_t gfn_offset, unsigned long mask) + gfn_t gfn_offset, size_t mask) { if (kvm_x86_ops->enable_log_dirty_pt_masked) kvm_x86_ops->enable_log_dirty_pt_masked(kvm, slot, gfn_offset, @@ -1361,13 +1278,10 @@ bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm, struct kvm_memory_slot *slot, u64 gfn) { struct kvm_rmap_head *rmap_head; - int i; bool write_protected = false; - for (i = PT_PAGE_TABLE_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) { - rmap_head = __gfn_to_rmap(gfn, i, slot); - write_protected |= __rmap_write_protect(kvm, rmap_head, true); - } + rmap_head = __gfn_to_rmap(gfn, slot); + write_protected |= __rmap_write_protect(kvm, rmap_head, true); return write_protected; } @@ -1386,11 +1300,13 @@ static bool kvm_zap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head) struct rmap_iterator iter; bool flush = false; - while ((sptep = rmap_get_first(rmap_head, &iter))) { + sptep = rmap_get_first(rmap_head, &iter); + while (sptep) { rmap_printk("%s: spte %p %llx.\n", __func__, sptep, *sptep); drop_spte(kvm, sptep); flush = true; + sptep = rmap_get_first(rmap_head, &iter); } return flush; @@ -1398,14 +1314,14 @@ static bool kvm_zap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head) static int kvm_unmap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, struct kvm_memory_slot *slot, gfn_t gfn, int level, - unsigned long data) + size_t data) { return kvm_zap_rmapp(kvm, rmap_head); } static int kvm_set_pte_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, struct kvm_memory_slot *slot, gfn_t gfn, int level, - unsigned long data) + size_t data) { u64 *sptep; struct rmap_iterator iter; @@ -1468,8 +1384,8 @@ rmap_walk_init_level(struct slot_rmap_walk_iterator *iterator, int level) { iterator->level = level; iterator->gfn = iterator->start_gfn; - iterator->rmap = __gfn_to_rmap(iterator->gfn, level, iterator->slot); - iterator->end_rmap = __gfn_to_rmap(iterator->end_gfn, level, + iterator->rmap = __gfn_to_rmap(iterator->gfn, iterator->slot); + iterator->end_rmap = __gfn_to_rmap(iterator->end_gfn, iterator->slot); } @@ -1495,7 +1411,7 @@ static bool slot_rmap_walk_okay(struct slot_rmap_walk_iterator *iterator) static void slot_rmap_walk_next(struct slot_rmap_walk_iterator *iterator) { if (++iterator->rmap <= iterator->end_rmap) { - iterator->gfn += (1UL << KVM_HPAGE_GFN_SHIFT(iterator->level)); + iterator->gfn += 1ULL; return; } @@ -1515,15 +1431,15 @@ static void slot_rmap_walk_next(struct slot_rmap_walk_iterator *iterator) slot_rmap_walk_next(_iter_)) static int kvm_handle_hva_range(struct kvm *kvm, - unsigned long start, - unsigned long end, - unsigned long data, + size_t start, + size_t end, + size_t data, int (*handler)(struct kvm *kvm, struct kvm_rmap_head *rmap_head, struct kvm_memory_slot *slot, gfn_t gfn, int level, - unsigned long data)) + size_t data)) { struct kvm_memslots *slots; struct kvm_memory_slot *memslot; @@ -1531,10 +1447,10 @@ static int kvm_handle_hva_range(struct kvm *kvm, 
int ret = 0; int i; - for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { + for (i = 0; i < GVM_ADDRESS_SPACE_NUM; i++) { slots = __kvm_memslots(kvm, i); kvm_for_each_memslot(memslot, slots) { - unsigned long hva_start, hva_end; + size_t hva_start, hva_end; gfn_t gfn_start, gfn_end; hva_start = max(start, memslot->userspace_addr); @@ -1550,7 +1466,7 @@ static int kvm_handle_hva_range(struct kvm *kvm, gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot); for_each_slot_rmap_range(memslot, PT_PAGE_TABLE_LEVEL, - PT_MAX_HUGEPAGE_LEVEL, + PT_PAGE_TABLE_LEVEL, gfn_start, gfn_end - 1, &iterator) ret |= handler(kvm, iterator.rmap, memslot, @@ -1561,38 +1477,38 @@ static int kvm_handle_hva_range(struct kvm *kvm, return ret; } -static int kvm_handle_hva(struct kvm *kvm, unsigned long hva, - unsigned long data, +static int kvm_handle_hva(struct kvm *kvm, size_t hva, + size_t data, int (*handler)(struct kvm *kvm, struct kvm_rmap_head *rmap_head, struct kvm_memory_slot *slot, gfn_t gfn, int level, - unsigned long data)) + size_t data)) { return kvm_handle_hva_range(kvm, hva, hva + 1, data, handler); } -int kvm_unmap_hva(struct kvm *kvm, unsigned long hva) +int kvm_unmap_hva(struct kvm *kvm, size_t hva) { return kvm_handle_hva(kvm, hva, 0, kvm_unmap_rmapp); } -int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end) +int kvm_unmap_hva_range(struct kvm *kvm, size_t start, size_t end) { return kvm_handle_hva_range(kvm, start, end, 0, kvm_unmap_rmapp); } -void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte) +void kvm_set_spte_hva(struct kvm *kvm, size_t hva, pte_t pte) { - kvm_handle_hva(kvm, hva, (unsigned long)&pte, kvm_set_pte_rmapp); + kvm_handle_hva(kvm, hva, (size_t)&pte, kvm_set_pte_rmapp); } static int kvm_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, struct kvm_memory_slot *slot, gfn_t gfn, int level, - unsigned long data) + size_t data) { u64 *sptep; - struct rmap_iterator uninitialized_var(iter); + struct rmap_iterator iter; int young = 0; BUG_ON(!shadow_accessed_mask); @@ -1601,17 +1517,16 @@ static int kvm_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, if (*sptep & shadow_accessed_mask) { young = 1; clear_bit((ffs(shadow_accessed_mask) - 1), - (unsigned long *)sptep); + (size_t *)sptep); } } - trace_kvm_age_page(gfn, level, slot, young); return young; } static int kvm_test_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, struct kvm_memory_slot *slot, gfn_t gfn, - int level, unsigned long data) + int level, size_t data) { u64 *sptep; struct rmap_iterator iter; @@ -1649,8 +1564,9 @@ static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) kvm_unmap_rmapp(vcpu->kvm, rmap_head, NULL, gfn, sp->role.level, 0); kvm_flush_remote_tlbs(vcpu->kvm); } - -int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end) +//todo-003 +#if 0 +int kvm_age_hva(struct kvm *kvm, size_t start, size_t end) { /* * In case of absence of EPT Access and Dirty Bits supports, @@ -1674,8 +1590,9 @@ int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end) return kvm_handle_hva_range(kvm, start, end, 0, kvm_age_rmapp); } +#endif -int kvm_test_age_hva(struct kvm *kvm, unsigned long hva) +int kvm_test_age_hva(struct kvm *kvm, size_t hva) { return kvm_handle_hva(kvm, hva, 0, kvm_test_age_rmapp); } @@ -1705,7 +1622,7 @@ static int is_empty_shadow_page(u64 *spt) static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, int nr) { kvm->arch.n_used_mmu_pages += nr; - percpu_counter_add(&kvm_total_used_mmu_pages, 
nr); + //percpu_counter_add(&kvm_total_used_mmu_pages, nr); } static void kvm_mmu_free_page(struct kvm_mmu_page *sp) @@ -1713,15 +1630,15 @@ static void kvm_mmu_free_page(struct kvm_mmu_page *sp) MMU_WARN_ON(!is_empty_shadow_page(sp->spt)); hlist_del(&sp->hash_link); list_del(&sp->link); - free_page((unsigned long)sp->spt); + free_page((size_t)sp->spt); if (!sp->role.direct) - free_page((unsigned long)sp->gfns); - kmem_cache_free(mmu_page_header_cache, sp); + free_page((size_t)sp->gfns); + kfree_fast(sp); } static unsigned kvm_page_table_hashfn(gfn_t gfn) { - return gfn & ((1 << KVM_MMU_HASH_SHIFT) - 1); + return gfn & ((1 << GVM_MMU_HASH_SHIFT) - 1); } static void mmu_page_add_parent_pte(struct kvm_vcpu *vcpu, @@ -1754,7 +1671,7 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, int direct sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache); if (!direct) sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache); - set_page_private(virt_to_page(sp->spt), (unsigned long)sp); + set_page_private(virt_to_page(sp->spt), (size_t)sp); /* * The active_mmu_pages list is the FIFO list, do not move the @@ -1808,13 +1725,13 @@ static void nonpaging_update_pte(struct kvm_vcpu *vcpu, WARN_ON(1); } -#define KVM_PAGE_ARRAY_NR 16 +#define GVM_PAGE_ARRAY_NR 16 struct kvm_mmu_pages { struct mmu_page_and_offset { struct kvm_mmu_page *sp; unsigned int idx; - } page[KVM_PAGE_ARRAY_NR]; + } page[GVM_PAGE_ARRAY_NR]; unsigned int nr; }; @@ -1831,7 +1748,7 @@ static int mmu_pages_add(struct kvm_mmu_pages *pvec, struct kvm_mmu_page *sp, pvec->page[pvec->nr].sp = sp; pvec->page[pvec->nr].idx = idx; pvec->nr++; - return (pvec->nr == KVM_PAGE_ARRAY_NR); + return (pvec->nr == GVM_PAGE_ARRAY_NR); } static inline void clear_unsync_child_bit(struct kvm_mmu_page *sp, int idx) @@ -1896,7 +1813,6 @@ static int mmu_unsync_walk(struct kvm_mmu_page *sp, static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp) { WARN_ON(!sp->unsync); - trace_kvm_mmu_sync_page(sp); sp->unsync = 0; --kvm->stat.mmu_unsync; } @@ -1953,10 +1869,10 @@ static void kvm_mmu_flush_or_zap(struct kvm_vcpu *vcpu, if (remote_flush) kvm_flush_remote_tlbs(vcpu->kvm); else if (local_flush) - kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); + kvm_make_request(GVM_REQ_TLB_FLUSH, vcpu); } -#ifdef CONFIG_KVM_MMU_AUDIT +#ifdef CONFIG_GVM_MMU_AUDIT #include "mmu_audit.c" #else static void kvm_mmu_audit(struct kvm_vcpu *vcpu, int point) { } @@ -1982,6 +1898,7 @@ static bool kvm_sync_pages(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_mmu_page *s; bool ret = false; +#define LIST_ENTRY_TYPE_INFO struct kvm_mmu_page for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn) { if (!s->unsync) continue; @@ -1989,6 +1906,7 @@ static bool kvm_sync_pages(struct kvm_vcpu *vcpu, gfn_t gfn, WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL); ret |= kvm_sync_page(vcpu, s, invalid_list); } +#undef LIST_ENTRY_TYPE_INFO return ret; } @@ -1998,9 +1916,16 @@ struct mmu_page_path { unsigned int idx[PT64_ROOT_LEVEL]; }; +static int __for_each_sp_end(struct kvm_mmu_page **sp, struct kvm_mmu_pages *pvec, int nr) +{ + *sp = pvec->page[nr].sp; + + return 1; +} + #define for_each_sp(pvec, sp, parents, i) \ for (i = mmu_pages_first(&pvec, &parents); \ - i < pvec.nr && ({ sp = pvec.page[i].sp; 1;}); \ + i < pvec.nr && __for_each_sp_end(&sp, &pvec, i); \ i = mmu_pages_next(&pvec, &parents, i)) static int mmu_pages_next(struct kvm_mmu_pages *pvec, @@ -2090,9 +2015,10 @@ static void mmu_sync_children(struct kvm_vcpu *vcpu, flush |= kvm_sync_page(vcpu, sp, 
&invalid_list); mmu_pages_clear_parents(&parents); } - if (need_resched() || spin_needbreak(&vcpu->kvm->mmu_lock)) { + //if (need_resched() || spin_needbreak(&vcpu->kvm->mmu_lock)) + { kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush); - cond_resched_lock(&vcpu->kvm->mmu_lock); + //cond_resched_lock(&vcpu->kvm->mmu_lock); flush = false; } } @@ -2138,6 +2064,8 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1; role.quadrant = quadrant; } + +#define LIST_ENTRY_TYPE_INFO struct kvm_mmu_page for_each_gfn_valid_sp(vcpu->kvm, sp, gfn) { if (!need_sync && sp->unsync) need_sync = true; @@ -2153,16 +2081,16 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, break; WARN_ON(!list_empty(&invalid_list)); - kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); + kvm_make_request(GVM_REQ_TLB_FLUSH, vcpu); } if (sp->unsync_children) - kvm_make_request(KVM_REQ_MMU_SYNC, vcpu); + kvm_make_request(GVM_REQ_MMU_SYNC, vcpu); __clear_sp_write_flooding_count(sp); - trace_kvm_mmu_get_page(sp, false); return sp; } +#undef LIST_ENTRY_TYPE_INFO ++vcpu->kvm->stat.mmu_cache_miss; @@ -2188,7 +2116,6 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, } sp->mmu_valid_gen = vcpu->kvm->arch.mmu_valid_gen; clear_page(sp->spt); - trace_kvm_mmu_get_page(sp, true); kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush); return sp; @@ -2240,7 +2167,7 @@ static void __shadow_walk_next(struct kvm_shadow_walk_iterator *iterator, static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator) { - return __shadow_walk_next(iterator, *iterator->sptep); + __shadow_walk_next(iterator, *iterator->sptep); } static void link_shadow_page(struct kvm_vcpu *vcpu, u64 *sptep, @@ -2248,8 +2175,6 @@ static void link_shadow_page(struct kvm_vcpu *vcpu, u64 *sptep, { u64 spte; - BUILD_BUG_ON(VMX_EPT_WRITABLE_MASK != PT_WRITABLE_MASK); - spte = __pa(sp->spt) | shadow_present_mask | PT_WRITABLE_MASK | shadow_user_mask | shadow_x_mask | shadow_accessed_mask; @@ -2322,8 +2247,11 @@ static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp) u64 *sptep; struct rmap_iterator iter; - while ((sptep = rmap_get_first(&sp->parent_ptes, &iter))) + sptep = rmap_get_first(&sp->parent_ptes, &iter); + while (sptep) { drop_parent_pte(sp, sptep); + sptep = rmap_get_first(&sp->parent_ptes, &iter); + } } static int mmu_zap_unsync_children(struct kvm *kvm, @@ -2355,7 +2283,6 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp, { int ret; - trace_kvm_mmu_prepare_zap_page(sp); ++kvm->stat.mmu_shadow_zapped; ret = mmu_zap_unsync_children(kvm, sp, invalid_list); kvm_mmu_page_unlink_children(kvm, sp); @@ -2405,10 +2332,12 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm, */ kvm_flush_remote_tlbs(kvm); +#define LIST_ENTRY_TYPE_INFO struct kvm_mmu_page list_for_each_entry_safe(sp, nsp, invalid_list, link) { WARN_ON(!sp->role.invalid || sp->root_count); kvm_mmu_free_page(sp); } +#undef LIST_ENTRY_TYPE_INFO } static bool prepare_zap_oldest_mmu_page(struct kvm *kvm, @@ -2460,22 +2389,22 @@ int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn) pgprintk("%s: looking for gfn %llx\n", __func__, gfn); r = 0; spin_lock(&kvm->mmu_lock); +#define LIST_ENTRY_TYPE_INFO struct kvm_mmu_page for_each_gfn_indirect_valid_sp(kvm, sp, gfn) { pgprintk("%s: gfn %llx role %x\n", __func__, gfn, sp->role.word); r = 1; kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); } +#undef LIST_ENTRY_TYPE_INFO 
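The LIST_ENTRY_TYPE_INFO define/undef pairs bracketing the for_each_gfn_*_sp() loops in these hunks are how this port tells its list-iteration macros which element type to recover, presumably because the Windows toolchain lacks the typeof() extension that the upstream list_for_each_entry() depends on. A hypothetical sketch of that pattern (only the bracketing convention is taken from the hunks; the iterator body is illustrative):

    /* typeof-free variant: the caller names the element type up front */
    #define list_for_each_entry(pos, head, member)                          \
        for (pos = list_entry((head)->next, LIST_ENTRY_TYPE_INFO, member);  \
             &pos->member != (head);                                        \
             pos = list_entry(pos->member.next, LIST_ENTRY_TYPE_INFO, member))

    #define LIST_ENTRY_TYPE_INFO struct kvm_mmu_page
        /* for_each_gfn_indirect_valid_sp(kvm, sp, gfn) { ... } */
    #undef LIST_ENTRY_TYPE_INFO
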
kvm_mmu_commit_zap_page(kvm, &invalid_list); spin_unlock(&kvm->mmu_lock); return r; } -EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page); static void kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) { - trace_kvm_mmu_unsync_page(sp); ++vcpu->kvm->stat.mmu_unsync; sp->unsync = 1; @@ -2487,9 +2416,12 @@ static bool mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn, { struct kvm_mmu_page *sp; - if (kvm_page_track_is_active(vcpu, gfn, KVM_PAGE_TRACK_WRITE)) +#if 0 + if (kvm_page_track_is_active(vcpu, gfn, GVM_PAGE_TRACK_WRITE)) return true; +#endif +#define LIST_ENTRY_TYPE_INFO struct kvm_mmu_page for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn) { if (!can_unsync) return true; @@ -2500,16 +2432,15 @@ static bool mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn, WARN_ON(sp->role.level != PT_PAGE_TABLE_LEVEL); kvm_unsync_page(vcpu, sp); } +#undef LIST_ENTRY_TYPE_INFO return false; } static bool kvm_is_mmio_pfn(kvm_pfn_t pfn) { - if (pfn_valid(pfn)) - return !is_zero_pfn(pfn) && PageReserved(pfn_to_page(pfn)); - - return true; + /* Without IOMMU, we won't assign real MMIO resource */ + return false; } static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, @@ -2635,7 +2566,7 @@ static bool mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access, true, host_writable)) { if (write_fault) emulate = true; - kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); + kvm_make_request(GVM_REQ_TLB_FLUSH, vcpu); } if (unlikely(is_mmio_spte(*sptep))) @@ -2657,8 +2588,6 @@ static bool mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access, } } - kvm_release_pfn_clean(pfn); - return emulate; } @@ -2669,7 +2598,7 @@ static kvm_pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, no_dirty_log); if (!slot) - return KVM_PFN_ERR_FAULT; + return GVM_PFN_ERR_FAULT; return gfn_to_pfn_memslot_atomic(slot, gfn); } @@ -2678,7 +2607,7 @@ static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, u64 *start, u64 *end) { - struct page *pages[PTE_PREFETCH_NUM]; + pfn_t pfn[PTE_PREFETCH_NUM]; struct kvm_memory_slot *slot; unsigned access = sp->role.access; int i, ret; @@ -2689,13 +2618,13 @@ static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu, if (!slot) return -1; - ret = gfn_to_page_many_atomic(slot, gfn, pages, end - start); + ret = gfn_to_pfn_many_atomic(slot, gfn, pfn, end - start); if (ret <= 0) return -1; for (i = 0; i < ret; i++, gfn++, start++) mmu_set_spte(vcpu, start, access, 0, sp->role.level, gfn, - page_to_pfn(pages[i]), true, true); + pfn[i], true, true); return 0; } @@ -2744,7 +2673,7 @@ static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep) } static int __direct_map(struct kvm_vcpu *vcpu, int write, int map_writable, - int level, gfn_t gfn, kvm_pfn_t pfn, bool prefault) + int level, gfn_t gfn, kvm_pfn_t pfn) { struct kvm_shadow_walk_iterator iterator; struct kvm_mmu_page *sp; @@ -2757,7 +2686,7 @@ static int __direct_map(struct kvm_vcpu *vcpu, int write, int map_writable, for_each_shadow_entry(vcpu, (u64)gfn << PAGE_SHIFT, iterator) { if (iterator.level == level) { emulate = mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, - write, level, gfn, pfn, prefault, + write, level, gfn, pfn, false, map_writable); direct_pte_prefetch(vcpu, iterator.sptep); ++vcpu->stat.pf_fixed; @@ -2779,19 +2708,6 @@ static int __direct_map(struct kvm_vcpu *vcpu, int write, int map_writable, return emulate; } -static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct *tsk) -{ - 
siginfo_t info; - - info.si_signo = SIGBUS; - info.si_errno = 0; - info.si_code = BUS_MCEERR_AR; - info.si_addr = (void __user *)address; - info.si_addr_lsb = PAGE_SHIFT; - - send_sig_info(SIGBUS, &info, tsk); -} - static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn) { /* @@ -2800,59 +2716,12 @@ static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn) * caused mmio page fault and treat it as mmio access. * Return 1 to tell kvm to emulate it. */ - if (pfn == KVM_PFN_ERR_RO_FAULT) + if (pfn == GVM_PFN_ERR_RO_FAULT) return 1; - if (pfn == KVM_PFN_ERR_HWPOISON) { - kvm_send_hwpoison_signal(kvm_vcpu_gfn_to_hva(vcpu, gfn), current); - return 0; - } - return -EFAULT; } -static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu, - gfn_t *gfnp, kvm_pfn_t *pfnp, - int *levelp) -{ - kvm_pfn_t pfn = *pfnp; - gfn_t gfn = *gfnp; - int level = *levelp; - - /* - * Check if it's a transparent hugepage. If this would be an - * hugetlbfs page, level wouldn't be set to - * PT_PAGE_TABLE_LEVEL and there would be no adjustment done - * here. - */ - if (!is_error_noslot_pfn(pfn) && !kvm_is_reserved_pfn(pfn) && - level == PT_PAGE_TABLE_LEVEL && - PageTransCompoundMap(pfn_to_page(pfn)) && - !mmu_gfn_lpage_is_disallowed(vcpu, gfn, PT_DIRECTORY_LEVEL)) { - unsigned long mask; - /* - * mmu_notifier_retry was successful and we hold the - * mmu_lock here, so the pmd can't become splitting - * from under us, and in turn - * __split_huge_page_refcount() can't run from under - * us and we can safely transfer the refcount from - * PG_tail to PG_head as we switch the pfn to tail to - * head. - */ - *levelp = level = PT_DIRECTORY_LEVEL; - mask = KVM_PAGES_PER_HPAGE(level) - 1; - VM_BUG_ON((gfn & mask) != (pfn & mask)); - if (pfn & mask) { - gfn &= ~mask; - *gfnp = gfn; - kvm_release_pfn_clean(pfn); - pfn &= ~mask; - kvm_get_pfn(pfn); - *pfnp = pfn; - } - } -} - static bool handle_abnormal_pfn(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn, kvm_pfn_t pfn, unsigned access, int *ret_val) { @@ -2941,9 +2810,11 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level, return false; walk_shadow_page_lockless_begin(vcpu); - for_each_shadow_entry_lockless(vcpu, gva, iterator, spte) + for_each_shadow_entry_lockless(vcpu, gva, iterator, spte) { + spte = mmu_spte_get_lockless(iterator.sptep); if (!is_shadow_present_pte(spte) || iterator.level < level) break; + } /* * If the mapping has been changed, let the vcpu fault on the @@ -2996,67 +2867,42 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level, */ ret = fast_pf_fix_direct_spte(vcpu, sp, iterator.sptep, spte); exit: - trace_fast_page_fault(vcpu, gva, error_code, iterator.sptep, - spte, ret); walk_shadow_page_lockless_end(vcpu); return ret; } -static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, +static void get_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, gva_t gva, kvm_pfn_t *pfn, bool write, bool *writable); static void make_mmu_pages_available(struct kvm_vcpu *vcpu); static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code, - gfn_t gfn, bool prefault) + gfn_t gfn) { int r; int level; bool force_pt_level = false; kvm_pfn_t pfn; - unsigned long mmu_seq; bool map_writable, write = error_code & PFERR_WRITE_MASK; level = mapping_level(vcpu, gfn, &force_pt_level); - if (likely(!force_pt_level)) { - /* - * This path builds a PAE pagetable - so we can map - * 2mb pages at maximum. Therefore check if the level - * is larger than that. 
- */ - if (level > PT_DIRECTORY_LEVEL) - level = PT_DIRECTORY_LEVEL; - - gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1); - } if (fast_page_fault(vcpu, v, level, error_code)) return 0; - mmu_seq = vcpu->kvm->mmu_notifier_seq; smp_rmb(); - if (try_async_pf(vcpu, prefault, gfn, v, &pfn, write, &map_writable)) - return 0; + get_pfn(vcpu, gfn, v, &pfn, write, &map_writable); if (handle_abnormal_pfn(vcpu, v, gfn, pfn, ACC_ALL, &r)) return r; spin_lock(&vcpu->kvm->mmu_lock); - if (mmu_notifier_retry(vcpu->kvm, mmu_seq)) - goto out_unlock; make_mmu_pages_available(vcpu); - if (likely(!force_pt_level)) - transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level); - r = __direct_map(vcpu, write, map_writable, level, gfn, pfn, prefault); + r = __direct_map(vcpu, write, map_writable, level, gfn, pfn); spin_unlock(&vcpu->kvm->mmu_lock); return r; - -out_unlock: - spin_unlock(&vcpu->kvm->mmu_lock); - kvm_release_pfn_clean(pfn); - return 0; } @@ -3110,7 +2956,7 @@ static int mmu_check_root(struct kvm_vcpu *vcpu, gfn_t root_gfn) int ret = 0; if (!kvm_is_visible_gfn(vcpu->kvm, root_gfn)) { - kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); + kvm_make_request(GVM_REQ_TRIPLE_FAULT, vcpu); ret = 1; } @@ -3291,7 +3137,6 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) mmu_sync_roots(vcpu); spin_unlock(&vcpu->kvm->mmu_lock); } -EXPORT_SYMBOL_GPL(kvm_mmu_sync_roots); static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access, struct x86_exception *exception) @@ -3405,7 +3250,6 @@ int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct) if (direct) addr = 0; - trace_handle_mmio_page_fault(addr, gfn, access); vcpu_cache_mmio_info(vcpu, addr, gfn, access); return RET_MMIO_PF_EMULATE; } @@ -3416,7 +3260,6 @@ int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct) */ return RET_MMIO_PF_RETRY; } -EXPORT_SYMBOL_GPL(handle_mmio_page_fault); static bool page_fault_handle_page_track(struct kvm_vcpu *vcpu, u32 error_code, gfn_t gfn) @@ -3428,12 +3271,14 @@ static bool page_fault_handle_page_track(struct kvm_vcpu *vcpu, !(error_code & PFERR_WRITE_MASK)) return false; +#if 0 /* * guest is writing the page which is write tracked which can * not be fixed by page fault handler. 
*/ - if (kvm_page_track_is_active(vcpu, gfn, KVM_PAGE_TRACK_WRITE)) + if (kvm_page_track_is_active(vcpu, gfn, GVM_PAGE_TRACK_WRITE)) return true; +#endif return false; } @@ -3448,6 +3293,7 @@ static void shadow_page_table_clear_flood(struct kvm_vcpu *vcpu, gva_t addr) walk_shadow_page_lockless_begin(vcpu); for_each_shadow_entry_lockless(vcpu, addr, iterator, spte) { + spte = mmu_spte_get_lockless(iterator.sptep); clear_sp_write_flooding_count(iterator.sptep); if (!is_shadow_present_pte(spte)) break; @@ -3456,7 +3302,7 @@ static void shadow_page_table_clear_flood(struct kvm_vcpu *vcpu, gva_t addr) } static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva, - u32 error_code, bool prefault) + u32 error_code) { gfn_t gfn = gva >> PAGE_SHIFT; int r; @@ -3473,76 +3319,24 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva, MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); - return nonpaging_map(vcpu, gva & PAGE_MASK, - error_code, gfn, prefault); + return nonpaging_map(vcpu, gva & PAGE_MASK, error_code, gfn); } -static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn) -{ - struct kvm_arch_async_pf arch; - - arch.token = (vcpu->arch.apf.id++ << 12) | vcpu->vcpu_id; - arch.gfn = gfn; - arch.direct_map = vcpu->arch.mmu.direct_map; - arch.cr3 = vcpu->arch.mmu.get_cr3(vcpu); - - return kvm_setup_async_pf(vcpu, gva, kvm_vcpu_gfn_to_hva(vcpu, gfn), &arch); -} - -static bool can_do_async_pf(struct kvm_vcpu *vcpu) -{ - if (unlikely(!lapic_in_kernel(vcpu) || - kvm_event_needs_reinjection(vcpu))) - return false; - - return kvm_x86_ops->interrupt_allowed(vcpu); -} - -static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, +static void get_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, gva_t gva, kvm_pfn_t *pfn, bool write, bool *writable) { struct kvm_memory_slot *slot; - bool async; slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); - async = false; - *pfn = __gfn_to_pfn_memslot(slot, gfn, false, &async, write, writable); - if (!async) - return false; /* *pfn has correct page already */ - - if (!prefault && can_do_async_pf(vcpu)) { - trace_kvm_try_async_get_page(gva, gfn); - if (kvm_find_async_pf_gfn(vcpu, gfn)) { - trace_kvm_async_pf_doublefault(gva, gfn); - kvm_make_request(KVM_REQ_APF_HALT, vcpu); - return true; - } else if (kvm_arch_setup_async_pf(vcpu, gva, gfn)) - return true; - } - *pfn = __gfn_to_pfn_memslot(slot, gfn, false, NULL, write, writable); - return false; -} - -static bool -check_hugepage_cache_consistency(struct kvm_vcpu *vcpu, gfn_t gfn, int level) -{ - int page_num = KVM_PAGES_PER_HPAGE(level); - - gfn &= ~(page_num - 1); - - return kvm_mtrr_check_gfn_range_consistency(vcpu, gfn, page_num); } -static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, - bool prefault) +static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code) { kvm_pfn_t pfn; int r; int level; - bool force_pt_level; gfn_t gfn = gpa >> PAGE_SHIFT; - unsigned long mmu_seq; int write = error_code & PFERR_WRITE_MASK; bool map_writable; @@ -3555,43 +3349,24 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, if (r) return r; - force_pt_level = !check_hugepage_cache_consistency(vcpu, gfn, - PT_DIRECTORY_LEVEL); - level = mapping_level(vcpu, gfn, &force_pt_level); - if (likely(!force_pt_level)) { - if (level > PT_DIRECTORY_LEVEL && - !check_hugepage_cache_consistency(vcpu, gfn, level)) - level = PT_DIRECTORY_LEVEL; - gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1); - } + level = mapping_level(vcpu, gfn, NULL); if 
(fast_page_fault(vcpu, gpa, level, error_code)) return 0; - mmu_seq = vcpu->kvm->mmu_notifier_seq; smp_rmb(); - if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, write, &map_writable)) - return 0; + get_pfn(vcpu, gfn, gpa, &pfn, write, &map_writable); if (handle_abnormal_pfn(vcpu, 0, gfn, pfn, ACC_ALL, &r)) return r; spin_lock(&vcpu->kvm->mmu_lock); - if (mmu_notifier_retry(vcpu->kvm, mmu_seq)) - goto out_unlock; make_mmu_pages_available(vcpu); - if (likely(!force_pt_level)) - transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level); - r = __direct_map(vcpu, write, map_writable, level, gfn, pfn, prefault); + r = __direct_map(vcpu, write, map_writable, level, gfn, pfn); spin_unlock(&vcpu->kvm->mmu_lock); return r; - -out_unlock: - spin_unlock(&vcpu->kvm->mmu_lock); - kvm_release_pfn_clean(pfn); - return 0; } static void nonpaging_init_context(struct kvm_vcpu *vcpu, @@ -3614,7 +3389,7 @@ void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu) mmu_free_roots(vcpu); } -static unsigned long get_cr3(struct kvm_vcpu *vcpu) +static size_t get_cr3(struct kvm_vcpu *vcpu) { return kvm_read_cr3(vcpu); } @@ -3662,10 +3437,12 @@ static inline bool is_last_gpte(struct kvm_mmu *mmu, return gpte & PT_PAGE_SIZE_MASK; } +#if 0 #define PTTYPE_EPT 18 /* arbitrary */ #define PTTYPE PTTYPE_EPT #include "paging_tmpl.h" #undef PTTYPE +#endif #define PTTYPE 64 #include "paging_tmpl.h" @@ -3820,7 +3597,7 @@ reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context) /* * Passing "true" to the last argument is okay; it adds a check - * on bit 8 of the SPTEs which KVM doesn't use anyway. + * on bit 8 of the SPTEs which kvm doesn't use anyway. */ __reset_rsvds_bits_mask(vcpu, &context->shadow_zero_check, boot_cpu_data.x86_phys_bits, @@ -3828,7 +3605,6 @@ reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context) guest_cpuid_has_gbpages(vcpu), is_pse(vcpu), true); } -EXPORT_SYMBOL_GPL(reset_shadow_zero_bits_mask); static inline bool boot_cpu_is_amd(void) { @@ -3932,81 +3708,6 @@ static void update_permission_bitmask(struct kvm_vcpu *vcpu, } } -/* -* PKU is an additional mechanism by which the paging controls access to -* user-mode addresses based on the value in the PKRU register. Protection -* key violations are reported through a bit in the page fault error code. -* Unlike other bits of the error code, the PK bit is not known at the -* call site of e.g. gva_to_gpa; it must be computed directly in -* permission_fault based on two bits of PKRU, on some machine state (CR4, -* CR0, EFER, CPL), and on other bits of the error code and the page tables. -* -* In particular the following conditions come from the error code, the -* page tables and the machine state: -* - PK is always zero unless CR4.PKE=1 and EFER.LMA=1 -* - PK is always zero if RSVD=1 (reserved bit set) or F=1 (instruction fetch) -* - PK is always zero if U=0 in the page tables -* - PKRU.WD is ignored if CR0.WP=0 and the access is a supervisor access. -* -* The PKRU bitmask caches the result of these four conditions. The error -* code (minus the P bit) and the page table's U bit form an index into the -* PKRU bitmask. Two bits of the PKRU bitmask are then extracted and ANDed -* with the two bits of the PKRU register corresponding to the protection key. -* For the first three conditions above the bits will be 00, thus masking -* away both AD and WD. For all reads or if the last condition holds, WD -* only will be masked away. 
-*/ -static void update_pkru_bitmask(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, - bool ept) -{ - unsigned bit; - bool wp; - - if (ept) { - mmu->pkru_mask = 0; - return; - } - - /* PKEY is enabled only if CR4.PKE and EFER.LMA are both set. */ - if (!kvm_read_cr4_bits(vcpu, X86_CR4_PKE) || !is_long_mode(vcpu)) { - mmu->pkru_mask = 0; - return; - } - - wp = is_write_protection(vcpu); - - for (bit = 0; bit < ARRAY_SIZE(mmu->permissions); ++bit) { - unsigned pfec, pkey_bits; - bool check_pkey, check_write, ff, uf, wf, pte_user; - - pfec = bit << 1; - ff = pfec & PFERR_FETCH_MASK; - uf = pfec & PFERR_USER_MASK; - wf = pfec & PFERR_WRITE_MASK; - - /* PFEC.RSVD is replaced by ACC_USER_MASK. */ - pte_user = pfec & PFERR_RSVD_MASK; - - /* - * Only need to check the access which is not an - * instruction fetch and is to a user page. - */ - check_pkey = (!ff && pte_user); - /* - * write access is controlled by PKRU if it is a - * user access or CR0.WP = 1. - */ - check_write = check_pkey && wf && (uf || wp); - - /* PKRU.AD stops both read and write access. */ - pkey_bits = !!check_pkey; - /* PKRU.WD stops write access. */ - pkey_bits |= (!!check_write) << 1; - - mmu->pkru_mask |= (pkey_bits & 3) << pfec; - } -} - static void update_last_nonleaf_level(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu) { unsigned root_level = mmu->root_level; @@ -4025,7 +3726,6 @@ static void paging64_init_context_common(struct kvm_vcpu *vcpu, reset_rsvds_bits_mask(vcpu, context); update_permission_bitmask(vcpu, context, false); - update_pkru_bitmask(vcpu, context, false); update_last_nonleaf_level(vcpu, context); MMU_WARN_ON(!is_pae(vcpu)); @@ -4053,7 +3753,6 @@ static void paging32_init_context(struct kvm_vcpu *vcpu, reset_rsvds_bits_mask(vcpu, context); update_permission_bitmask(vcpu, context, false); - update_pkru_bitmask(vcpu, context, false); update_last_nonleaf_level(vcpu, context); context->page_fault = paging32_page_fault; @@ -4112,7 +3811,6 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) } update_permission_bitmask(vcpu, context, false); - update_pkru_bitmask(vcpu, context, false); update_last_nonleaf_level(vcpu, context); reset_tdp_shadow_zero_bits_mask(vcpu, context); } @@ -4144,10 +3842,10 @@ void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu) context->base_role.smm = is_smm(vcpu); reset_shadow_zero_bits_mask(vcpu, context); } -EXPORT_SYMBOL_GPL(kvm_init_shadow_mmu); void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly) { +#if 0 struct kvm_mmu *context = &vcpu->arch.mmu; MMU_WARN_ON(VALID_PAGE(context->root_hpa)); @@ -4165,11 +3863,10 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly) context->direct_map = false; update_permission_bitmask(vcpu, context, true); - update_pkru_bitmask(vcpu, context, true); reset_rsvds_bits_mask_ept(vcpu, context, execonly); reset_ept_shadow_zero_bits_mask(vcpu, context, execonly); +#endif } -EXPORT_SYMBOL_GPL(kvm_init_shadow_ept_mmu); static void init_kvm_softmmu(struct kvm_vcpu *vcpu) { @@ -4220,7 +3917,6 @@ static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu) } update_permission_bitmask(vcpu, g_context, false); - update_pkru_bitmask(vcpu, g_context, false); update_last_nonleaf_level(vcpu, g_context); } @@ -4239,7 +3935,6 @@ void kvm_mmu_reset_context(struct kvm_vcpu *vcpu) kvm_mmu_unload(vcpu); init_kvm_mmu(vcpu); } -EXPORT_SYMBOL_GPL(kvm_mmu_reset_context); int kvm_mmu_load(struct kvm_vcpu *vcpu) { @@ -4257,14 +3952,12 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu) out: return r; } -EXPORT_SYMBOL_GPL(kvm_mmu_load); void kvm_mmu_unload(struct 
kvm_vcpu *vcpu) { mmu_free_roots(vcpu); WARN_ON(VALID_PAGE(vcpu->arch.mmu.root_hpa)); } -EXPORT_SYMBOL_GPL(kvm_mmu_unload); static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, u64 *spte, @@ -4413,7 +4106,7 @@ static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, u64 entry, gentry, *spte; int npte; bool remote_flush, local_flush; - union kvm_mmu_page_role mask = { }; + union kvm_mmu_page_role mask = { 0 }; mask.cr0_wp = 1; mask.cr4_pae = 1; @@ -4426,7 +4119,9 @@ static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, * If we don't have indirect shadow pages, it means no page is * write-protected, so we can exit simply. */ - if (!ACCESS_ONCE(vcpu->kvm->arch.indirect_shadow_pages)) + unsigned int temp; + ACCESS_ONCE(vcpu->kvm->arch.indirect_shadow_pages, temp); + if (!temp) return; remote_flush = local_flush = false; @@ -4446,6 +4141,7 @@ static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, ++vcpu->kvm->stat.mmu_pte_write; kvm_mmu_audit(vcpu, AUDIT_PRE_PTE_WRITE); +#define LIST_ENTRY_TYPE_INFO struct kvm_mmu_page for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn) { if (detect_write_misaligned(sp, gpa, bytes) || detect_write_flooding(sp)) { @@ -4471,6 +4167,7 @@ static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, ++spte; } } +#undef LIST_ENTRY_TYPE_INFO kvm_mmu_flush_or_zap(vcpu, &invalid_list, remote_flush, local_flush); kvm_mmu_audit(vcpu, AUDIT_POST_PTE_WRITE); spin_unlock(&vcpu->kvm->mmu_lock); @@ -4490,16 +4187,15 @@ int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva) return r; } -EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page_virt); static void make_mmu_pages_available(struct kvm_vcpu *vcpu) { LIST_HEAD(invalid_list); - if (likely(kvm_mmu_available_pages(vcpu->kvm) >= KVM_MIN_FREE_MMU_PAGES)) + if (likely(kvm_mmu_available_pages(vcpu->kvm) >= GVM_MIN_FREE_MMU_PAGES)) return; - while (kvm_mmu_available_pages(vcpu->kvm) < KVM_REFILL_PAGES) { + while (kvm_mmu_available_pages(vcpu->kvm) < GVM_REFILL_PAGES) { if (!prepare_zap_oldest_mmu_page(vcpu->kvm, &invalid_list)) break; @@ -4527,7 +4223,7 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code, return r; } - r = vcpu->arch.mmu.page_fault(vcpu, cr2, error_code, false); + r = vcpu->arch.mmu.page_fault(vcpu, cr2, error_code); if (r < 0) return r; if (!r) @@ -4550,38 +4246,33 @@ emulate: BUG(); } } -EXPORT_SYMBOL_GPL(kvm_mmu_page_fault); void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva) { vcpu->arch.mmu.invlpg(vcpu, gva); - kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); + kvm_make_request(GVM_REQ_TLB_FLUSH, vcpu); ++vcpu->stat.invlpg; } -EXPORT_SYMBOL_GPL(kvm_mmu_invlpg); void kvm_enable_tdp(void) { tdp_enabled = true; } -EXPORT_SYMBOL_GPL(kvm_enable_tdp); void kvm_disable_tdp(void) { tdp_enabled = false; } -EXPORT_SYMBOL_GPL(kvm_disable_tdp); static void free_mmu_pages(struct kvm_vcpu *vcpu) { - free_page((unsigned long)vcpu->arch.mmu.pae_root); + MmFreeContiguousMemory(vcpu->arch.mmu.pae_root); if (vcpu->arch.mmu.lm_root != NULL) - free_page((unsigned long)vcpu->arch.mmu.lm_root); + free_page((size_t)vcpu->arch.mmu.lm_root); } static int alloc_mmu_pages(struct kvm_vcpu *vcpu) { - struct page *page; int i; /* @@ -4589,11 +4280,14 @@ static int alloc_mmu_pages(struct kvm_vcpu *vcpu) * Therefore we need to allocate shadow page tables in the first * 4GB of memory, which happens to fit the DMA32 zone. 
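A 32-bit PAE guest loads this table through a 32-bit CR3, so its physical address must fit in 32 bits; the hunk below meets that constraint on the Windows side by capping the highest acceptable physical address at 4GB. A minimal sketch of the pattern, using only documented NT kernel calls (the wrapper name is illustrative):

/* Sketch: allocate one page whose physical address is guaranteed below 4GB. */
static int alloc_pae_root_below_4g(u64 **pae_root)
{
	PHYSICAL_ADDRESS limit_4g;

	limit_4g.QuadPart = 0xFFFFFFFFULL;	/* highest acceptable physical address */
	*pae_root = MmAllocateContiguousMemory(PAGE_SIZE, limit_4g);
	if (!*pae_root)
		return -ENOMEM;
	/*
	 * The physical address needed when this page is installed as a paging
	 * root comes from MmGetPhysicalAddress(*pae_root); the same pairing is
	 * used for the SVM IOPM bitmap further down in this patch.
	 */
	return 0;
}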
*/ - page = alloc_page(GFP_KERNEL | __GFP_DMA32); - if (!page) + PHYSICAL_ADDRESS addr_4g; + addr_4g.QuadPart = 0xFFFFFFFF; + + vcpu->arch.mmu.pae_root = + MmAllocateContiguousMemory(PAGE_SIZE, addr_4g); + if (!vcpu->arch.mmu.pae_root) return -ENOMEM; - vcpu->arch.mmu.pae_root = page_address(page); for (i = 0; i < 4; ++i) vcpu->arch.mmu.pae_root[i] = INVALID_PAGE; @@ -4649,6 +4343,7 @@ slot_handle_level_range(struct kvm *kvm, struct kvm_memory_slot *memslot, if (iterator.rmap) flush |= fn(kvm, iterator.rmap); +#if 0 if (need_resched() || spin_needbreak(&kvm->mmu_lock)) { if (flush && lock_flush_tlb) { kvm_flush_remote_tlbs(kvm); @@ -4656,6 +4351,7 @@ slot_handle_level_range(struct kvm *kvm, struct kvm_memory_slot *memslot, } cond_resched_lock(&kvm->mmu_lock); } +#endif } if (flush && lock_flush_tlb) { @@ -4682,15 +4378,7 @@ slot_handle_all_level(struct kvm *kvm, struct kvm_memory_slot *memslot, slot_level_handler fn, bool lock_flush_tlb) { return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL, - PT_MAX_HUGEPAGE_LEVEL, lock_flush_tlb); -} - -static bool -slot_handle_large_level(struct kvm *kvm, struct kvm_memory_slot *memslot, - slot_level_handler fn, bool lock_flush_tlb) -{ - return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL + 1, - PT_MAX_HUGEPAGE_LEVEL, lock_flush_tlb); + PT_PAGE_TABLE_LEVEL, lock_flush_tlb); } static bool @@ -4708,7 +4396,7 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) int i; spin_lock(&kvm->mmu_lock); - for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { + for (i = 0; i < GVM_ADDRESS_SPACE_NUM; i++) { slots = __kvm_memslots(kvm, i); kvm_for_each_memslot(memslot, slots) { gfn_t start, end; @@ -4719,7 +4407,7 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) continue; slot_handle_level_range(kvm, memslot, kvm_zap_rmapp, - PT_PAGE_TABLE_LEVEL, PT_MAX_HUGEPAGE_LEVEL, + PT_PAGE_TABLE_LEVEL, PT_PAGE_TABLE_LEVEL, start, end - 1, true); } } @@ -4748,7 +4436,7 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, * which do tlb flush out of mmu-lock should be serialized by * kvm->slots_lock otherwise tlb flush would be missed. */ - lockdep_assert_held(&kvm->slots_lock); + //lockdep_assert_held(&kvm->slots_lock); /* * We can flush all the TLBs out of the mmu lock without TLB @@ -4786,9 +4474,8 @@ restart: * the guest, and the guest page table is using 4K page size * mapping if the indirect sp has level = 1. 
*/ - if (sp->role.direct && - !kvm_is_reserved_pfn(pfn) && - PageTransCompoundMap(pfn_to_page(pfn))) { + if (sp->role.direct //&& + /*PageTransCompoundMap(pfn_to_page(pfn))*/) { drop_spte(kvm, sptep); need_tlb_flush = 1; goto restart; @@ -4817,7 +4504,7 @@ void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm, flush = slot_handle_leaf(kvm, memslot, __rmap_clear_dirty, false); spin_unlock(&kvm->mmu_lock); - lockdep_assert_held(&kvm->slots_lock); + //lockdep_assert_held(&kvm->slots_lock); /* * It's also safe to flush TLBs out of mmu lock here as currently this @@ -4828,25 +4515,6 @@ void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm, if (flush) kvm_flush_remote_tlbs(kvm); } -EXPORT_SYMBOL_GPL(kvm_mmu_slot_leaf_clear_dirty); - -void kvm_mmu_slot_largepage_remove_write_access(struct kvm *kvm, - struct kvm_memory_slot *memslot) -{ - bool flush; - - spin_lock(&kvm->mmu_lock); - flush = slot_handle_large_level(kvm, memslot, slot_rmap_write_protect, - false); - spin_unlock(&kvm->mmu_lock); - - /* see kvm_mmu_slot_remove_write_access */ - lockdep_assert_held(&kvm->slots_lock); - - if (flush) - kvm_flush_remote_tlbs(kvm); -} -EXPORT_SYMBOL_GPL(kvm_mmu_slot_largepage_remove_write_access); void kvm_mmu_slot_set_dirty(struct kvm *kvm, struct kvm_memory_slot *memslot) @@ -4857,13 +4525,12 @@ void kvm_mmu_slot_set_dirty(struct kvm *kvm, flush = slot_handle_all_level(kvm, memslot, __rmap_set_dirty, false); spin_unlock(&kvm->mmu_lock); - lockdep_assert_held(&kvm->slots_lock); + //lockdep_assert_held(&kvm->slots_lock); /* see kvm_mmu_slot_leaf_clear_dirty */ if (flush) kvm_flush_remote_tlbs(kvm); } -EXPORT_SYMBOL_GPL(kvm_mmu_slot_set_dirty); #define BATCH_ZAP_PAGES 10 static void kvm_zap_obsolete_pages(struct kvm *kvm) @@ -4872,6 +4539,7 @@ static void kvm_zap_obsolete_pages(struct kvm *kvm) int batch = 0; restart: +#define LIST_ENTRY_TYPE_INFO struct kvm_mmu_page list_for_each_entry_safe_reverse(sp, node, &kvm->arch.active_mmu_pages, link) { int ret; @@ -4895,8 +4563,8 @@ restart: * Need not flush tlb since we only zap the sp with invalid * generation number. */ - if (batch >= BATCH_ZAP_PAGES && - cond_resched_lock(&kvm->mmu_lock)) { + if (batch >= BATCH_ZAP_PAGES) {// && + //cond_resched_lock(&kvm->mmu_lock)) { batch = 0; goto restart; } @@ -4908,6 +4576,7 @@ restart: if (ret) goto restart; } +#undef LIST_ENTRY_TYPE_INFO /* * Should flush tlb before free page tables since lockless-walking @@ -4921,14 +4590,13 @@ restart: * to zap obsolete pages. * * It's required when memslot is being deleted or VM is being - * destroyed, in these cases, we should ensure that KVM MMU does + * destroyed, in these cases, we should ensure that kvm MMU does * not use any resource of the being-deleted slot or all slots * after calling the function. 
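The fast invalidation described here does not walk every SPTE up front: the function that follows bumps kvm->arch.mmu_valid_gen, which instantly marks every existing shadow page stale, and kvm_zap_obsolete_pages(), shown just above, then frees them in batches of BATCH_ZAP_PAGES. The staleness test is equivalent to this small sketch (helper name is illustrative):

/* Sketch: a shadow page created before the generation bump no longer matches. */
static bool sp_is_obsolete(struct kvm *kvm, struct kvm_mmu_page *sp)
{
	return sp->mmu_valid_gen != kvm->arch.mmu_valid_gen;
}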
*/ void kvm_mmu_invalidate_zap_all_pages(struct kvm *kvm) { spin_lock(&kvm->mmu_lock); - trace_kvm_mmu_invalidate_zap_all_pages(kvm); kvm->arch.mmu_valid_gen++; /* @@ -4963,12 +4631,14 @@ void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, struct kvm_memslots *slots) } } -static unsigned long +// todo-002 +#if 0 +static size_t mmu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) { struct kvm *kvm; int nr_to_scan = sc->nr_to_scan; - unsigned long freed = 0; + size_t freed = 0; spin_lock(&kvm_lock); @@ -5024,7 +4694,7 @@ unlock: return freed; } -static unsigned long +static size_t mmu_shrink_count(struct shrinker *shrink, struct shrink_control *sc) { return percpu_counter_read_positive(&kvm_total_used_mmu_pages); @@ -5035,39 +4705,16 @@ static struct shrinker mmu_shrinker = { .scan_objects = mmu_shrink_scan, .seeks = DEFAULT_SEEKS * 10, }; +#endif static void mmu_destroy_caches(void) { +#if 0 if (pte_list_desc_cache) kmem_cache_destroy(pte_list_desc_cache); if (mmu_page_header_cache) kmem_cache_destroy(mmu_page_header_cache); -} - -int kvm_mmu_module_init(void) -{ - pte_list_desc_cache = kmem_cache_create("pte_list_desc", - sizeof(struct pte_list_desc), - 0, 0, NULL); - if (!pte_list_desc_cache) - goto nomem; - - mmu_page_header_cache = kmem_cache_create("kvm_mmu_page_header", - sizeof(struct kvm_mmu_page), - 0, 0, NULL); - if (!mmu_page_header_cache) - goto nomem; - - if (percpu_counter_init(&kvm_total_used_mmu_pages, 0, GFP_KERNEL)) - goto nomem; - - register_shrinker(&mmu_shrinker); - - return 0; - -nomem: - mmu_destroy_caches(); - return -ENOMEM; +#endif } /* @@ -5081,16 +4728,16 @@ unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm) struct kvm_memory_slot *memslot; int i; - for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { + for (i = 0; i < GVM_ADDRESS_SPACE_NUM; i++) { slots = __kvm_memslots(kvm, i); kvm_for_each_memslot(memslot, slots) nr_pages += memslot->npages; } - nr_mmu_pages = nr_pages * KVM_PERMILLE_MMU_PAGES / 1000; + nr_mmu_pages = nr_pages * GVM_PERMILLE_MMU_PAGES / 1000; nr_mmu_pages = max(nr_mmu_pages, - (unsigned int) KVM_MIN_ALLOC_MMU_PAGES); + (unsigned int) GVM_MIN_ALLOC_MMU_PAGES); return nr_mmu_pages; } @@ -5104,8 +4751,11 @@ void kvm_mmu_destroy(struct kvm_vcpu *vcpu) void kvm_mmu_module_exit(void) { + // todo-001 +#if 0 mmu_destroy_caches(); percpu_counter_destroy(&kvm_total_used_mmu_pages); unregister_shrinker(&mmu_shrinker); mmu_audit_disable(); +#endif } diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index ddc56e9..cf39e5a 100644..100755 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -1,3 +1,7 @@ +/* + * Copyright 2019 Google LLC + */ + #ifndef __KVM_X86_MMU_H #define __KVM_X86_MMU_H @@ -44,7 +48,7 @@ #define PT_PDPE_LEVEL 3 #define PT_DIRECTORY_LEVEL 2 #define PT_PAGE_TABLE_LEVEL 1 -#define PT_MAX_HUGEPAGE_LEVEL (PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES - 1) +#define PT_MAX_HUGEPAGE_LEVEL (PT_PAGE_TABLE_LEVEL + GVM_NR_PAGE_SIZES - 1) static inline u64 rsvd_bits(int s, int e) { @@ -96,7 +100,7 @@ static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu) /* * Currently, we have two sorts of write-protection, a) the first one * write-protects guest page to sync the guest modification, b) another one is - * used to sync dirty bitmap when we do KVM_GET_DIRTY_LOG. The differences + * used to sync dirty bitmap when we do GVM_GET_DIRTY_LOG. The differences * between these two sorts are: * 1) the first case clears SPTE_MMU_WRITEABLE bit. 
* 2) the first case requires flushing tlb immediately avoiding corrupting @@ -126,7 +130,7 @@ static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu) * * TODO: introduce APIs to split these two cases. */ -static inline int is_writable_pte(unsigned long pte) +static inline int is_writable_pte(size_t pte) { return pte & PT_WRITABLE_MASK; } @@ -149,7 +153,7 @@ static inline u8 permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned pfec) { int cpl = kvm_x86_ops->get_cpl(vcpu); - unsigned long rflags = kvm_x86_ops->get_rflags(vcpu); + size_t rflags = kvm_x86_ops->get_rflags(vcpu); /* * If CPL < 3, SMAP prevention are disabled if EFLAGS.AC = 1. @@ -164,41 +168,20 @@ static inline u8 permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, * but it will be one in index if SMAP checks are being overridden. * It is important to keep this branchless. */ - unsigned long smap = (cpl - 3) & (rflags & X86_EFLAGS_AC); + size_t smap = (cpl - 3) & (rflags & X86_EFLAGS_AC); int index = (pfec >> 1) + (smap >> (X86_EFLAGS_AC_BIT - PFERR_RSVD_BIT + 1)); bool fault = (mmu->permissions[index] >> pte_access) & 1; u32 errcode = PFERR_PRESENT_MASK; WARN_ON(pfec & (PFERR_PK_MASK | PFERR_RSVD_MASK)); - if (unlikely(mmu->pkru_mask)) { - u32 pkru_bits, offset; - - /* - * PKRU defines 32 bits, there are 16 domains and 2 - * attribute bits per domain in pkru. pte_pkey is the - * index of the protection domain, so pte_pkey * 2 is - * is the index of the first bit for the domain. - */ - pkru_bits = (kvm_read_pkru(vcpu) >> (pte_pkey * 2)) & 3; - - /* clear present bit, replace PFEC.RSVD with ACC_USER_MASK. */ - offset = (pfec & ~1) + - ((pte_access & PT_USER_MASK) << (PFERR_RSVD_BIT - PT_USER_SHIFT)); - - pkru_bits &= mmu->pkru_mask >> offset; - errcode |= -pkru_bits & PFERR_PK_MASK; - fault |= (pkru_bits != 0); - } - - return -(u32)fault & errcode; + + return -(s32)fault & errcode; } void kvm_mmu_invalidate_zap_all_pages(struct kvm *kvm); void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end); -void kvm_mmu_gfn_disallow_lpage(struct kvm_memory_slot *slot, gfn_t gfn); -void kvm_mmu_gfn_allow_lpage(struct kvm_memory_slot *slot, gfn_t gfn); bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm, struct kvm_memory_slot *slot, u64 gfn); #endif diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c index dcce533..76050b1 100644..100755 --- a/arch/x86/kvm/mmu_audit.c +++ b/arch/x86/kvm/mmu_audit.c @@ -5,6 +5,7 @@ * * Copyright (C) 2006 Qumranet, Inc. * Copyright 2010 Red Hat, Inc. and/or its affiliates. 
+ * Copyright 2019 Google LLC * * Authors: * Yaniv Kamay <yaniv@qumranet.com> @@ -17,6 +18,7 @@ * */ +#if 0 #include <linux/ratelimit.h> char const *audit_point_name[] = { @@ -278,7 +280,7 @@ static void mmu_audit_disable(void) static int mmu_audit_set(const char *val, const struct kernel_param *kp) { int ret; - unsigned long enable; + size_t enable; ret = kstrtoul(val, 10, &enable); if (ret < 0) @@ -304,3 +306,4 @@ static const struct kernel_param_ops audit_param_ops = { }; arch_param_cb(mmu_audit, &audit_param_ops, &mmu_audit, 0644); +#endif diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h deleted file mode 100644 index 5a24b84..0000000 --- a/arch/x86/kvm/mmutrace.h +++ /dev/null @@ -1,333 +0,0 @@ -#if !defined(_TRACE_KVMMMU_H) || defined(TRACE_HEADER_MULTI_READ) -#define _TRACE_KVMMMU_H - -#include <linux/tracepoint.h> -#include <linux/trace_events.h> - -#undef TRACE_SYSTEM -#define TRACE_SYSTEM kvmmmu - -#define KVM_MMU_PAGE_FIELDS \ - __field(unsigned long, mmu_valid_gen) \ - __field(__u64, gfn) \ - __field(__u32, role) \ - __field(__u32, root_count) \ - __field(bool, unsync) - -#define KVM_MMU_PAGE_ASSIGN(sp) \ - __entry->mmu_valid_gen = sp->mmu_valid_gen; \ - __entry->gfn = sp->gfn; \ - __entry->role = sp->role.word; \ - __entry->root_count = sp->root_count; \ - __entry->unsync = sp->unsync; - -#define KVM_MMU_PAGE_PRINTK() ({ \ - const char *saved_ptr = trace_seq_buffer_ptr(p); \ - static const char *access_str[] = { \ - "---", "--x", "w--", "w-x", "-u-", "-ux", "wu-", "wux" \ - }; \ - union kvm_mmu_page_role role; \ - \ - role.word = __entry->role; \ - \ - trace_seq_printf(p, "sp gen %lx gfn %llx %u%s q%u%s %s%s" \ - " %snxe root %u %s%c", __entry->mmu_valid_gen, \ - __entry->gfn, role.level, \ - role.cr4_pae ? " pae" : "", \ - role.quadrant, \ - role.direct ? " direct" : "", \ - access_str[role.access], \ - role.invalid ? " invalid" : "", \ - role.nxe ? "" : "!", \ - __entry->root_count, \ - __entry->unsync ? 
"unsync" : "sync", 0); \ - saved_ptr; \ - }) - -#define kvm_mmu_trace_pferr_flags \ - { PFERR_PRESENT_MASK, "P" }, \ - { PFERR_WRITE_MASK, "W" }, \ - { PFERR_USER_MASK, "U" }, \ - { PFERR_RSVD_MASK, "RSVD" }, \ - { PFERR_FETCH_MASK, "F" } - -/* - * A pagetable walk has started - */ -TRACE_EVENT( - kvm_mmu_pagetable_walk, - TP_PROTO(u64 addr, u32 pferr), - TP_ARGS(addr, pferr), - - TP_STRUCT__entry( - __field(__u64, addr) - __field(__u32, pferr) - ), - - TP_fast_assign( - __entry->addr = addr; - __entry->pferr = pferr; - ), - - TP_printk("addr %llx pferr %x %s", __entry->addr, __entry->pferr, - __print_flags(__entry->pferr, "|", kvm_mmu_trace_pferr_flags)) -); - - -/* We just walked a paging element */ -TRACE_EVENT( - kvm_mmu_paging_element, - TP_PROTO(u64 pte, int level), - TP_ARGS(pte, level), - - TP_STRUCT__entry( - __field(__u64, pte) - __field(__u32, level) - ), - - TP_fast_assign( - __entry->pte = pte; - __entry->level = level; - ), - - TP_printk("pte %llx level %u", __entry->pte, __entry->level) -); - -DECLARE_EVENT_CLASS(kvm_mmu_set_bit_class, - - TP_PROTO(unsigned long table_gfn, unsigned index, unsigned size), - - TP_ARGS(table_gfn, index, size), - - TP_STRUCT__entry( - __field(__u64, gpa) - ), - - TP_fast_assign( - __entry->gpa = ((u64)table_gfn << PAGE_SHIFT) - + index * size; - ), - - TP_printk("gpa %llx", __entry->gpa) -); - -/* We set a pte accessed bit */ -DEFINE_EVENT(kvm_mmu_set_bit_class, kvm_mmu_set_accessed_bit, - - TP_PROTO(unsigned long table_gfn, unsigned index, unsigned size), - - TP_ARGS(table_gfn, index, size) -); - -/* We set a pte dirty bit */ -DEFINE_EVENT(kvm_mmu_set_bit_class, kvm_mmu_set_dirty_bit, - - TP_PROTO(unsigned long table_gfn, unsigned index, unsigned size), - - TP_ARGS(table_gfn, index, size) -); - -TRACE_EVENT( - kvm_mmu_walker_error, - TP_PROTO(u32 pferr), - TP_ARGS(pferr), - - TP_STRUCT__entry( - __field(__u32, pferr) - ), - - TP_fast_assign( - __entry->pferr = pferr; - ), - - TP_printk("pferr %x %s", __entry->pferr, - __print_flags(__entry->pferr, "|", kvm_mmu_trace_pferr_flags)) -); - -TRACE_EVENT( - kvm_mmu_get_page, - TP_PROTO(struct kvm_mmu_page *sp, bool created), - TP_ARGS(sp, created), - - TP_STRUCT__entry( - KVM_MMU_PAGE_FIELDS - __field(bool, created) - ), - - TP_fast_assign( - KVM_MMU_PAGE_ASSIGN(sp) - __entry->created = created; - ), - - TP_printk("%s %s", KVM_MMU_PAGE_PRINTK(), - __entry->created ? 
"new" : "existing") -); - -DECLARE_EVENT_CLASS(kvm_mmu_page_class, - - TP_PROTO(struct kvm_mmu_page *sp), - TP_ARGS(sp), - - TP_STRUCT__entry( - KVM_MMU_PAGE_FIELDS - ), - - TP_fast_assign( - KVM_MMU_PAGE_ASSIGN(sp) - ), - - TP_printk("%s", KVM_MMU_PAGE_PRINTK()) -); - -DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_sync_page, - TP_PROTO(struct kvm_mmu_page *sp), - - TP_ARGS(sp) -); - -DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_unsync_page, - TP_PROTO(struct kvm_mmu_page *sp), - - TP_ARGS(sp) -); - -DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_prepare_zap_page, - TP_PROTO(struct kvm_mmu_page *sp), - - TP_ARGS(sp) -); - -TRACE_EVENT( - mark_mmio_spte, - TP_PROTO(u64 *sptep, gfn_t gfn, unsigned access, unsigned int gen), - TP_ARGS(sptep, gfn, access, gen), - - TP_STRUCT__entry( - __field(void *, sptep) - __field(gfn_t, gfn) - __field(unsigned, access) - __field(unsigned int, gen) - ), - - TP_fast_assign( - __entry->sptep = sptep; - __entry->gfn = gfn; - __entry->access = access; - __entry->gen = gen; - ), - - TP_printk("sptep:%p gfn %llx access %x gen %x", __entry->sptep, - __entry->gfn, __entry->access, __entry->gen) -); - -TRACE_EVENT( - handle_mmio_page_fault, - TP_PROTO(u64 addr, gfn_t gfn, unsigned access), - TP_ARGS(addr, gfn, access), - - TP_STRUCT__entry( - __field(u64, addr) - __field(gfn_t, gfn) - __field(unsigned, access) - ), - - TP_fast_assign( - __entry->addr = addr; - __entry->gfn = gfn; - __entry->access = access; - ), - - TP_printk("addr:%llx gfn %llx access %x", __entry->addr, __entry->gfn, - __entry->access) -); - -#define __spte_satisfied(__spte) \ - (__entry->retry && is_writable_pte(__entry->__spte)) - -TRACE_EVENT( - fast_page_fault, - TP_PROTO(struct kvm_vcpu *vcpu, gva_t gva, u32 error_code, - u64 *sptep, u64 old_spte, bool retry), - TP_ARGS(vcpu, gva, error_code, sptep, old_spte, retry), - - TP_STRUCT__entry( - __field(int, vcpu_id) - __field(gva_t, gva) - __field(u32, error_code) - __field(u64 *, sptep) - __field(u64, old_spte) - __field(u64, new_spte) - __field(bool, retry) - ), - - TP_fast_assign( - __entry->vcpu_id = vcpu->vcpu_id; - __entry->gva = gva; - __entry->error_code = error_code; - __entry->sptep = sptep; - __entry->old_spte = old_spte; - __entry->new_spte = *sptep; - __entry->retry = retry; - ), - - TP_printk("vcpu %d gva %lx error_code %s sptep %p old %#llx" - " new %llx spurious %d fixed %d", __entry->vcpu_id, - __entry->gva, __print_flags(__entry->error_code, "|", - kvm_mmu_trace_pferr_flags), __entry->sptep, - __entry->old_spte, __entry->new_spte, - __spte_satisfied(old_spte), __spte_satisfied(new_spte) - ) -); - -TRACE_EVENT( - kvm_mmu_invalidate_zap_all_pages, - TP_PROTO(struct kvm *kvm), - TP_ARGS(kvm), - - TP_STRUCT__entry( - __field(unsigned long, mmu_valid_gen) - __field(unsigned int, mmu_used_pages) - ), - - TP_fast_assign( - __entry->mmu_valid_gen = kvm->arch.mmu_valid_gen; - __entry->mmu_used_pages = kvm->arch.n_used_mmu_pages; - ), - - TP_printk("kvm-mmu-valid-gen %lx used_pages %x", - __entry->mmu_valid_gen, __entry->mmu_used_pages - ) -); - - -TRACE_EVENT( - check_mmio_spte, - TP_PROTO(u64 spte, unsigned int kvm_gen, unsigned int spte_gen), - TP_ARGS(spte, kvm_gen, spte_gen), - - TP_STRUCT__entry( - __field(unsigned int, kvm_gen) - __field(unsigned int, spte_gen) - __field(u64, spte) - ), - - TP_fast_assign( - __entry->kvm_gen = kvm_gen; - __entry->spte_gen = spte_gen; - __entry->spte = spte; - ), - - TP_printk("spte %llx kvm_gen %x spte-gen %x valid %d", __entry->spte, - __entry->kvm_gen, __entry->spte_gen, - __entry->kvm_gen == __entry->spte_gen 
- ) -); -#endif /* _TRACE_KVMMMU_H */ - -#undef TRACE_INCLUDE_PATH -#define TRACE_INCLUDE_PATH . -#undef TRACE_INCLUDE_FILE -#define TRACE_INCLUDE_FILE mmutrace - -/* This part must be outside protection */ -#include <trace/define_trace.h> diff --git a/arch/x86/kvm/mtrr.c b/arch/x86/kvm/mtrr.c index 0149ac5..6f3c042 100644..100755 --- a/arch/x86/kvm/mtrr.c +++ b/arch/x86/kvm/mtrr.c @@ -4,6 +4,7 @@ * Copyright (C) 2006 Qumranet, Inc. * Copyright 2010 Red Hat, Inc. and/or its affiliates. * Copyright(C) 2015 Intel Corporation. + * Copyright 2019 Google LLC * * Authors: * Yaniv Kamay <yaniv@qumranet.com> @@ -17,7 +18,6 @@ */ #include <linux/kvm_host.h> -#include <asm/mtrr.h> #include "cpuid.h" #include "mmu.h" @@ -26,10 +26,19 @@ #define IA32_MTRR_DEF_TYPE_FE (1ULL << 10) #define IA32_MTRR_DEF_TYPE_TYPE_MASK (0xff) +/* MTRR memory types, which are defined in SDM */ +#define MTRR_TYPE_UNCACHABLE 0 +#define MTRR_TYPE_WRCOMB 1 +/*#define MTRR_TYPE_ 2*/ +/*#define MTRR_TYPE_ 3*/ +#define MTRR_TYPE_WRTHROUGH 4 +#define MTRR_TYPE_WRPROT 5 +#define MTRR_TYPE_WRBACK 6 +#define MTRR_NUM_TYPES 7 + static bool msr_mtrr_valid(unsigned msr) { switch (msr) { - case 0x200 ... 0x200 + 2 * KVM_NR_VAR_MTRR - 1: case MSR_MTRRfix64K_00000: case MSR_MTRRfix16K_80000: case MSR_MTRRfix16K_A0000: @@ -44,6 +53,9 @@ static bool msr_mtrr_valid(unsigned msr) case MSR_MTRRdefType: case MSR_IA32_CR_PAT: return true; + default: + if (msr >= 0x200 && msr < 0x210) + return true; } return false; } @@ -83,7 +95,7 @@ bool kvm_mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data) } /* variable MTRRs */ - WARN_ON(!(msr >= 0x200 && msr < 0x200 + 2 * KVM_NR_VAR_MTRR)); + WARN_ON(!(msr >= 0x200 && msr < 0x200 + 2 * kvm_NR_VAR_MTRR)); mask = (~0ULL) << cpuid_maxphyaddr(vcpu); if ((msr & 1) == 0) { @@ -101,7 +113,6 @@ bool kvm_mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data) return true; } -EXPORT_SYMBOL_GPL(kvm_mtrr_valid); static bool mtrr_is_enabled(struct kvm_mtrr *mtrr_state) { @@ -200,11 +211,19 @@ static bool fixed_msr_to_seg_unit(u32 msr, int *seg, int *unit) *seg = 0; *unit = 0; break; - case MSR_MTRRfix16K_80000 ... MSR_MTRRfix16K_A0000: + case MSR_MTRRfix16K_80000: + case MSR_MTRRfix16K_A0000: *seg = 1; *unit = msr - MSR_MTRRfix16K_80000; break; - case MSR_MTRRfix4K_C0000 ... MSR_MTRRfix4K_F8000: + case MSR_MTRRfix4K_C0000: + case MSR_MTRRfix4K_C8000: + case MSR_MTRRfix4K_D0000: + case MSR_MTRRfix4K_D8000: + case MSR_MTRRfix4K_E0000: + case MSR_MTRRfix4K_E8000: + case MSR_MTRRfix4K_F0000: + case MSR_MTRRfix4K_F8000: *seg = 2; *unit = msr - MSR_MTRRfix4K_C0000; break; @@ -319,8 +338,7 @@ static void update_mtrr(struct kvm_vcpu *vcpu, u32 msr) gfn_t start, end; int index; - if (msr == MSR_IA32_CR_PAT || !tdp_enabled || - !kvm_arch_has_noncoherent_dma(vcpu->kvm)) + if (msr == MSR_IA32_CR_PAT || !tdp_enabled) return; if (!mtrr_is_enabled(mtrr_state) && msr != MSR_MTRRdefType) @@ -372,10 +390,12 @@ static void set_var_mtrr_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data) /* add it to the list if it's enabled. 
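The case-range labels dropped from msr_mtrr_valid() and fixed_msr_to_seg_unit() above ("case lo ... hi:") are a GNU C extension that MSVC does not accept, which is why they become explicit labels or an explicit comparison in these mtrr.c hunks. A hypothetical helper showing the portable form of the variable-MTRR check (0x200..0x20F covers the eight base/mask MSR pairs):

/* Hypothetical helper: portable replacement for a GNU case-range label. */
static bool msr_is_var_mtrr(unsigned msr)
{
	return msr >= 0x200 && msr < 0x200 + 2 * 8;	/* 8 variable MTRRs */
}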
*/ if (var_mtrr_range_is_valid(cur)) { +#define LIST_ENTRY_TYPE_INFO struct kvm_mtrr_range list_for_each_entry(tmp, &mtrr_state->head, node) if (cur->base >= tmp->base) break; list_add_tail(&cur->node, &tmp->node); +#undef LIST_ENTRY_TYPE_INFO } } @@ -410,9 +430,9 @@ int kvm_mtrr_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) * SMRR = 0 * WC = 1 * FIX = 1 - * VCNT = KVM_NR_VAR_MTRR + * VCNT = kvm_NR_VAR_MTRR */ - *pdata = 0x500 | KVM_NR_VAR_MTRR; + *pdata = 0x500 | 8; return 0; } @@ -525,9 +545,11 @@ static void __mtrr_lookup_var_next(struct mtrr_iter *iter) { struct kvm_mtrr *mtrr_state = iter->mtrr_state; +#define LIST_ENTRY_TYPE_INFO struct kvm_mtrr_range list_for_each_entry_continue(iter->range, &mtrr_state->head, node) if (match_var_range(iter, iter->range)) return; +#undef LIST_ENTRY_TYPE_INFO iter->range = NULL; iter->partial_map |= iter->start_max < iter->end; @@ -540,7 +562,9 @@ static void mtrr_lookup_var_start(struct mtrr_iter *iter) iter->fixed = false; iter->start_max = iter->start; iter->range = NULL; +#define LIST_ENTRY_TYPE_INFO struct kvm_mtrr_range iter->range = list_prepare_entry(iter->range, &mtrr_state->head, node); +#undef LIST_ENTRY_TYPE_INFO __mtrr_lookup_var_next(iter); } @@ -557,9 +581,10 @@ static void mtrr_lookup_fixed_next(struct mtrr_iter *iter) iter->index++; /* have looked up for all fixed MTRRs. */ - if (iter->index >= ARRAY_SIZE(iter->mtrr_state->fixed_ranges)) - return mtrr_lookup_var_start(iter); - + if (iter->index >= ARRAY_SIZE(iter->mtrr_state->fixed_ranges)) { + mtrr_lookup_var_start(iter); + return; + } /* switch to next segment. */ if (iter->index > fixed_mtrr_seg_end_range_index(iter->seg)) iter->seg++; @@ -696,7 +721,6 @@ u8 kvm_mtrr_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn) return type; } -EXPORT_SYMBOL_GPL(kvm_mtrr_get_guest_memory_type); bool kvm_mtrr_check_gfn_range_consistency(struct kvm_vcpu *vcpu, gfn_t gfn, int page_num) diff --git a/arch/x86/kvm/page_track.c b/arch/x86/kvm/page_track.c index b431539..2d6d87d 100644..100755 --- a/arch/x86/kvm/page_track.c +++ b/arch/x86/kvm/page_track.c @@ -5,6 +5,7 @@ * write access is tracked. * * Copyright(C) 2015 Intel Corporation. + * Copyright 2019 Google LLC * * Author: * Xiao Guangrong <guangrong.xiao@linux.intel.com> @@ -33,7 +34,7 @@ void kvm_page_track_free_memslot(struct kvm_memory_slot *free, } int kvm_page_track_create_memslot(struct kvm_memory_slot *slot, - unsigned long npages) + size_t npages) { int i; @@ -64,7 +65,7 @@ static void update_gfn_track(struct kvm_memory_slot *slot, gfn_t gfn, { int index, val; - index = gfn_to_index(gfn, slot->base_gfn, PT_PAGE_TABLE_LEVEL); + index = gfn - slot->base_gfn; val = slot->arch.gfn_track[mode][index]; @@ -96,12 +97,6 @@ void kvm_slot_page_track_add_page(struct kvm *kvm, update_gfn_track(slot, gfn, mode, 1); - /* - * new track stops large page mapping for the - * tracked page. - */ - kvm_mmu_gfn_disallow_lpage(slot, gfn); - if (mode == KVM_PAGE_TRACK_WRITE) if (kvm_mmu_slot_gfn_write_protect(kvm, slot, gfn)) kvm_flush_remote_tlbs(kvm); @@ -128,12 +123,6 @@ void kvm_slot_page_track_remove_page(struct kvm *kvm, return; update_gfn_track(slot, gfn, mode, -1); - - /* - * allow large page mapping for the tracked page - * after the tracker is gone. 
- */ - kvm_mmu_gfn_allow_lpage(slot, gfn); } /* @@ -144,6 +133,7 @@ bool kvm_page_track_is_active(struct kvm_vcpu *vcpu, gfn_t gfn, { struct kvm_memory_slot *slot; int index; + unsigned short temp; if (WARN_ON(!page_track_mode_is_valid(mode))) return false; @@ -152,8 +142,9 @@ bool kvm_page_track_is_active(struct kvm_vcpu *vcpu, gfn_t gfn, if (!slot) return false; - index = gfn_to_index(gfn, slot->base_gfn, PT_PAGE_TABLE_LEVEL); - return !!ACCESS_ONCE(slot->arch.gfn_track[mode][index]); + index = gfn - slot->base_gfn; + ACCESS_ONCE(slot->arch.gfn_track[mode][index], temp); + return !!temp; } void kvm_page_track_init(struct kvm *kvm) @@ -165,6 +156,14 @@ void kvm_page_track_init(struct kvm *kvm) INIT_HLIST_HEAD(&head->track_notifier_list); } +void kvm_page_track_destroy(struct kvm *kvm) +{ + struct kvm_page_track_notifier_head *head; + + head = &kvm->arch.track_notifier_head; + cleanup_srcu_struct(&head->track_srcu); +} + /* * register the notifier so that event interception for the tracked guest * pages can be received. @@ -220,8 +219,10 @@ void kvm_page_track_write(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *new, return; idx = srcu_read_lock(&head->track_srcu); +#define LIST_ENTRY_TYPE_INFO struct kvm_page_track_notifier_node hlist_for_each_entry_rcu(n, &head->track_notifier_list, node) if (n->track_write) n->track_write(vcpu, gpa, new, bytes); +#undef LIST_ENTRY_TYPE_INFO srcu_read_unlock(&head->track_srcu, idx); } diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index a011054..0d5fd47 100644..100755 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -8,6 +8,7 @@ * * Copyright (C) 2006 Qumranet, Inc. * Copyright 2010 Red Hat, Inc. and/or its affiliates. + * Copyright 2019 Google LLC * * Authors: * Yaniv Kamay <yaniv@qumranet.com> @@ -27,9 +28,13 @@ * This is used to catch non optimized PT_GUEST_(DIRTY|ACCESS)_SHIFT macro * uses for EPT without A/D paging type. 
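Two idioms recur throughout these hunks because MSVC provides no typeof: ACCESS_ONCE()/READ_ONCE() take the destination as a second argument instead of returning a value, and the list-walking macros pick up their element type from a LIST_ENTRY_TYPE_INFO define that callers set and #undef around each loop, as in the page_track.c and mtrr.c hunks above. A small usage sketch of the first idiom, mirroring kvm_page_track_is_active() (the helper name is made up for illustration):

/* Sketch: port-style ACCESS_ONCE writes the loaded value to its 2nd argument. */
static bool gfn_is_write_tracked(struct kvm_memory_slot *slot, gfn_t gfn)
{
	unsigned short cur;

	ACCESS_ONCE(slot->arch.gfn_track[KVM_PAGE_TRACK_WRITE][gfn - slot->base_gfn], cur);
	return cur != 0;
}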
*/ +#if 0 extern u64 __pure __using_nonexistent_pte_bit(void) __compiletime_error("wrong use of PT_GUEST_(DIRTY|ACCESS)_SHIFT"); +#endif +#pragma warning(disable : 4127) +#pragma warning(disable : 4310) #if PTTYPE == 64 #define pt_element_t u64 #define guest_walker guest_walker64 @@ -65,6 +70,7 @@ extern u64 __pure __using_nonexistent_pte_bit(void) #define PT_GUEST_DIRTY_SHIFT PT_DIRTY_SHIFT #define PT_GUEST_ACCESSED_SHIFT PT_ACCESSED_SHIFT #define CMPXCHG cmpxchg +#if 0 #elif PTTYPE == PTTYPE_EPT #define pt_element_t u64 #define guest_walker guest_walkerEPT @@ -80,6 +86,7 @@ extern u64 __pure __using_nonexistent_pte_bit(void) #define PT_GUEST_ACCESSED_SHIFT __using_nonexistent_pte_bit() #define CMPXCHG cmpxchg64 #define PT_MAX_FULL_LEVELS 4 +#endif #else #error Invalid PTTYPE value #endif @@ -119,8 +126,6 @@ static inline void FNAME(protect_clean_gpte)(unsigned *access, unsigned gpte) if (!PT_GUEST_DIRTY_MASK) return; - BUILD_BUG_ON(PT_WRITABLE_MASK != ACC_WRITE_MASK); - mask = (unsigned)~ACC_WRITE_MASK; /* Allow write access to dirty gptes */ mask |= (gpte >> (PT_GUEST_DIRTY_SHIFT - PT_WRITABLE_SHIFT)) & @@ -128,7 +133,7 @@ static inline void FNAME(protect_clean_gpte)(unsigned *access, unsigned gpte) *access &= mask; } -static inline int FNAME(is_present_gpte)(unsigned long pte) +static inline int FNAME(is_present_gpte)(size_t pte) { #if PTTYPE != PTTYPE_EPT return pte & PT_PRESENT_MASK; @@ -144,18 +149,20 @@ static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, int npages; pt_element_t ret; pt_element_t *table; - struct page *page; + PMDL kmap_mdl; - npages = get_user_pages_fast((unsigned long)ptep_user, 1, 1, &page); + npages = get_user_pages_fast((size_t)ptep_user, 1, 1, &kmap_mdl); /* Check if the user is doing something meaningless. */ if (unlikely(npages != 1)) return -EFAULT; - table = kmap_atomic(page); + table = kmap_atomic(kmap_mdl); + if (!table) + return -EFAULT; ret = CMPXCHG(&table[index], orig_pte, new_pte); - kunmap_atomic(table); + kunmap_atomic(kmap_mdl); - kvm_release_page_dirty(page); + kvm_release_page(kmap_mdl); return (ret != orig_pte); } @@ -195,8 +202,6 @@ static inline unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, u64 gpte) ((gpte & VMX_EPT_EXECUTABLE_MASK) ? ACC_EXEC_MASK : 0) | ((gpte & VMX_EPT_READABLE_MASK) ? ACC_USER_MASK : 0); #else - BUILD_BUG_ON(ACC_EXEC_MASK != PT_PRESENT_MASK); - BUILD_BUG_ON(ACC_EXEC_MASK != 1); access = gpte & (PT_WRITABLE_MASK | PT_USER_MASK | PT_PRESENT_MASK); /* Combine NX with P (which is set here) to get ACC_EXEC_MASK. 
*/ access ^= (gpte >> PT64_NX_SHIFT); @@ -226,12 +231,10 @@ static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu, ptep_user = walker->ptep_user[level - 1]; index = offset_in_page(ptep_user) / sizeof(pt_element_t); if (!(pte & PT_GUEST_ACCESSED_MASK)) { - trace_kvm_mmu_set_accessed_bit(table_gfn, index, sizeof(pte)); pte |= PT_GUEST_ACCESSED_MASK; } if (level == walker->level && write_fault && !(pte & PT_GUEST_DIRTY_MASK)) { - trace_kvm_mmu_set_dirty_bit(table_gfn, index, sizeof(pte)); pte |= PT_GUEST_DIRTY_MASK; } if (pte == orig_pte) @@ -266,11 +269,13 @@ static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu, static inline unsigned FNAME(gpte_pkeys)(struct kvm_vcpu *vcpu, u64 gpte) { unsigned pkeys = 0; +#if 0 #if PTTYPE == 64 pte_t pte = {.pte = gpte}; pkeys = pte_flags_pkey(pte_flags(pte)); #endif +#endif return pkeys; } @@ -283,7 +288,7 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker, { int ret; pt_element_t pte; - pt_element_t __user *uninitialized_var(ptep_user); + pt_element_t __user *ptep_user; gfn_t table_gfn; unsigned index, pt_access, pte_access, accessed_dirty, pte_pkey; gpa_t pte_gpa; @@ -295,7 +300,6 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker, gpa_t real_gpa; gfn_t gfn; - trace_kvm_mmu_pagetable_walk(addr, access); retry_walk: walker->level = mmu->root_level; pte = mmu->get_cr3(vcpu); @@ -303,7 +307,6 @@ retry_walk: #if PTTYPE == 64 if (walker->level == PT32E_ROOT_LEVEL) { pte = mmu->get_pdptr(vcpu, (addr >> 30) & 3); - trace_kvm_mmu_paging_element(pte, walker->level); if (!FNAME(is_present_gpte)(pte)) goto error; --walker->level; @@ -318,7 +321,7 @@ retry_walk: do { gfn_t real_gfn; - unsigned long host_addr; + size_t host_addr; pt_access &= pte_access; --walker->level; @@ -355,13 +358,11 @@ retry_walk: if (unlikely(kvm_is_error_hva(host_addr))) goto error; - ptep_user = (pt_element_t __user *)((void *)host_addr + offset); + ptep_user = (pt_element_t __user *)((char *)host_addr + offset); if (unlikely(__copy_from_user(&pte, ptep_user, sizeof(pte)))) goto error; walker->ptep_user[walker->level - 1] = ptep_user; - trace_kvm_mmu_paging_element(pte, walker->level); - if (unlikely(!FNAME(is_present_gpte)(pte))) goto error; @@ -449,7 +450,6 @@ error: walker->fault.address = addr; walker->fault.nested_page_fault = mmu != vcpu->arch.walk_mmu; - trace_kvm_mmu_walker_error(walker->fault.error_code); return 0; } @@ -546,7 +546,7 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw, return; if (sp->role.direct) - return __direct_pte_prefetch(vcpu, sp, sptep); + __direct_pte_prefetch(vcpu, sp, sptep); i = (sptep - sp->spt) & ~(PTE_PREFETCH_NUM - 1); spte = sp->spt + i; @@ -571,7 +571,7 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw, static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, struct guest_walker *gw, int write_fault, int hlevel, - kvm_pfn_t pfn, bool map_writable, bool prefault) + kvm_pfn_t pfn, bool map_writable) { struct kvm_mmu_page *sp = NULL; struct kvm_shadow_walk_iterator it; @@ -634,7 +634,7 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, if (is_shadow_present_pte(*it.sptep)) continue; - direct_gfn = gw->gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1); + direct_gfn = gw->gfn; sp = kvm_mmu_get_page(vcpu, direct_gfn, addr, it.level-1, true, direct_access); @@ -643,13 +643,12 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, clear_sp_write_flooding_count(it.sptep); emulate = mmu_set_spte(vcpu, it.sptep, 
gw->pte_access, write_fault, - it.level, gw->gfn, pfn, prefault, map_writable); + it.level, gw->gfn, pfn, false, map_writable); FNAME(pte_prefetch)(vcpu, gw, it.sptep); return emulate; out_gpte_changed: - kvm_release_pfn_clean(pfn); return 0; } @@ -676,7 +675,6 @@ FNAME(is_self_change_mapping)(struct kvm_vcpu *vcpu, bool *write_fault_to_shadow_pgtable) { int level; - gfn_t mask = ~(KVM_PAGES_PER_HPAGE(walker->level) - 1); bool self_changed = false; if (!(walker->pte_access & ACC_WRITE_MASK || @@ -686,7 +684,7 @@ FNAME(is_self_change_mapping)(struct kvm_vcpu *vcpu, for (level = walker->level; level <= walker->max_level; level++) { gfn_t gfn = walker->gfn ^ walker->table_gfn[level - 1]; - self_changed |= !(gfn & mask); + self_changed |= !gfn; *write_fault_to_shadow_pgtable |= !gfn; } @@ -707,8 +705,7 @@ FNAME(is_self_change_mapping)(struct kvm_vcpu *vcpu, * Returns: 1 if we need to emulate the instruction, 0 otherwise, or * a negative value on error. */ -static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, - bool prefault) +static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code) { int write_fault = error_code & PFERR_WRITE_MASK; int user_fault = error_code & PFERR_USER_MASK; @@ -716,8 +713,6 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, int r; kvm_pfn_t pfn; int level = PT_PAGE_TABLE_LEVEL; - bool force_pt_level = false; - unsigned long mmu_seq; bool map_writable, is_self_change_mapping; pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code); @@ -742,8 +737,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, */ if (!r) { pgprintk("%s: guest page fault\n", __func__); - if (!prefault) - inject_page_fault(vcpu, &walker.fault); + inject_page_fault(vcpu, &walker.fault); return 0; } @@ -759,20 +753,13 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, &walker, user_fault, &vcpu->arch.write_fault_to_shadow_pgtable); if (walker.level >= PT_DIRECTORY_LEVEL && !is_self_change_mapping) { - level = mapping_level(vcpu, walker.gfn, &force_pt_level); - if (likely(!force_pt_level)) { - level = min(walker.level, level); - walker.gfn = walker.gfn & ~(KVM_PAGES_PER_HPAGE(level) - 1); - } - } else - force_pt_level = true; + level = mapping_level(vcpu, walker.gfn, NULL); + } - mmu_seq = vcpu->kvm->mmu_notifier_seq; + //mmu_seq = vcpu->kvm->mmu_notifier_seq; smp_rmb(); - if (try_async_pf(vcpu, prefault, walker.gfn, addr, &pfn, write_fault, - &map_writable)) - return 0; + get_pfn(vcpu, walker.gfn, addr, &pfn, write_fault, &map_writable); if (handle_abnormal_pfn(vcpu, mmu_is_nested(vcpu) ? 
0 : addr, walker.gfn, pfn, walker.pte_access, &r)) @@ -799,25 +786,15 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, } spin_lock(&vcpu->kvm->mmu_lock); - if (mmu_notifier_retry(vcpu->kvm, mmu_seq)) - goto out_unlock; - kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT); make_mmu_pages_available(vcpu); - if (!force_pt_level) - transparent_hugepage_adjust(vcpu, &walker.gfn, &pfn, &level); r = FNAME(fetch)(vcpu, addr, &walker, write_fault, - level, pfn, map_writable, prefault); + level, pfn, map_writable); ++vcpu->stat.pf_fixed; kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT); spin_unlock(&vcpu->kvm->mmu_lock); return r; - -out_unlock: - spin_unlock(&vcpu->kvm->mmu_lock); - kvm_release_pfn_clean(pfn); - return 0; } static gpa_t FNAME(get_level1_sp_gpa)(struct kvm_mmu_page *sp) diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index 06ce377..d3937d2 100644..100755 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -2,6 +2,7 @@ * Kernel-based Virtual Machine -- Performance Monitoring Unit support * * Copyright 2015 Red Hat, Inc. and/or its affiliates. + * Copyright 2019 Google LLC * * Authors: * Avi Kivity <avi@redhat.com> @@ -13,6 +14,7 @@ * */ +#if 0 #include <linux/types.h> #include <linux/kvm_host.h> #include <linux/perf_event.h> @@ -63,9 +65,9 @@ static void kvm_perf_overflow(struct perf_event *perf_event, struct kvm_pmu *pmu = pmc_to_pmu(pmc); if (!test_and_set_bit(pmc->idx, - (unsigned long *)&pmu->reprogram_pmi)) { - __set_bit(pmc->idx, (unsigned long *)&pmu->global_status); - kvm_make_request(KVM_REQ_PMU, pmc->vcpu); + (size_t *)&pmu->reprogram_pmi)) { + __set_bit(pmc->idx, (size_t *)&pmu->global_status); + kvm_make_request(GVM_REQ_PMU, pmc->vcpu); } } @@ -77,9 +79,9 @@ static void kvm_perf_overflow_intr(struct perf_event *perf_event, struct kvm_pmu *pmu = pmc_to_pmu(pmc); if (!test_and_set_bit(pmc->idx, - (unsigned long *)&pmu->reprogram_pmi)) { - __set_bit(pmc->idx, (unsigned long *)&pmu->global_status); - kvm_make_request(KVM_REQ_PMU, pmc->vcpu); + (size_t *)&pmu->reprogram_pmi)) { + __set_bit(pmc->idx, (size_t *)&pmu->global_status); + kvm_make_request(GVM_REQ_PMU, pmc->vcpu); /* * Inject PMI. 
If vcpu was in a guest mode during NMI PMI @@ -92,7 +94,7 @@ static void kvm_perf_overflow_intr(struct perf_event *perf_event, if (!kvm_is_in_guest()) irq_work_queue(&pmc_to_pmu(pmc)->irq_work); else - kvm_make_request(KVM_REQ_PMI, pmc->vcpu); + kvm_make_request(GVM_REQ_PMI, pmc->vcpu); } } @@ -130,7 +132,7 @@ static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type, } pmc->perf_event = event; - clear_bit(pmc->idx, (unsigned long*)&pmc_to_pmu(pmc)->reprogram_pmi); + clear_bit(pmc->idx, (size_t*)&pmc_to_pmu(pmc)->reprogram_pmi); } void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel) @@ -173,7 +175,6 @@ void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel) (eventsel & HSW_IN_TX), (eventsel & HSW_IN_TX_CHECKPOINTED)); } -EXPORT_SYMBOL_GPL(reprogram_gp_counter); void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 ctrl, int idx) { @@ -191,7 +192,6 @@ void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 ctrl, int idx) !(en_field & 0x1), /* exclude kernel */ pmi, false, false); } -EXPORT_SYMBOL_GPL(reprogram_fixed_counter); void reprogram_counter(struct kvm_pmu *pmu, int pmc_idx) { @@ -209,7 +209,6 @@ void reprogram_counter(struct kvm_pmu *pmu, int pmc_idx) reprogram_fixed_counter(pmc, ctrl, idx); } } -EXPORT_SYMBOL_GPL(reprogram_counter); void kvm_pmu_handle_event(struct kvm_vcpu *vcpu) { @@ -219,11 +218,11 @@ void kvm_pmu_handle_event(struct kvm_vcpu *vcpu) bitmask = pmu->reprogram_pmi; - for_each_set_bit(bit, (unsigned long *)&bitmask, X86_PMC_IDX_MAX) { + for_each_set_bit(bit, (size_t *)&bitmask, X86_PMC_IDX_MAX) { struct kvm_pmc *pmc = kvm_x86_ops->pmu_ops->pmc_idx_to_pmc(pmu, bit); if (unlikely(!pmc || !pmc->perf_event)) { - clear_bit(bit, (unsigned long *)&pmu->reprogram_pmi); + clear_bit(bit, (size_t *)&pmu->reprogram_pmi); continue; } @@ -307,3 +306,4 @@ void kvm_pmu_destroy(struct kvm_vcpu *vcpu) { kvm_pmu_reset(vcpu); } +#endif diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h index f96e1f9..1025403 100644..100755 --- a/arch/x86/kvm/pmu.h +++ b/arch/x86/kvm/pmu.h @@ -1,6 +1,8 @@ #ifndef __KVM_X86_PMU_H #define __KVM_X86_PMU_H +#if 0 + #define vcpu_to_pmu(vcpu) (&(vcpu)->arch.pmu) #define pmu_to_vcpu(pmu) (container_of((pmu), struct kvm_vcpu, arch.pmu)) #define pmc_to_pmu(pmc) (&(pmc)->vcpu->arch.pmu) @@ -115,4 +117,6 @@ void kvm_pmu_destroy(struct kvm_vcpu *vcpu); extern struct kvm_pmu_ops intel_pmu_ops; extern struct kvm_pmu_ops amd_pmu_ops; +#endif + #endif /* __KVM_X86_PMU_H */ diff --git a/arch/x86/kvm/pmu_amd.c b/arch/x86/kvm/pmu_amd.c index cd94443..5db57c6 100644..100755 --- a/arch/x86/kvm/pmu_amd.c +++ b/arch/x86/kvm/pmu_amd.c @@ -11,6 +11,7 @@ * * Implementation is based on pmu_intel.c file */ +#if 0 #include <linux/types.h> #include <linux/kvm_host.h> #include <linux/perf_event.h> @@ -203,3 +204,4 @@ struct kvm_pmu_ops amd_pmu_ops = { .init = amd_pmu_init, .reset = amd_pmu_reset, }; +#endif diff --git a/arch/x86/kvm/pmu_intel.c b/arch/x86/kvm/pmu_intel.c index 9d4a850..4fb5c5f 100644..100755 --- a/arch/x86/kvm/pmu_intel.c +++ b/arch/x86/kvm/pmu_intel.c @@ -2,6 +2,7 @@ * KVM PMU support for Intel CPUs * * Copyright 2011 Red Hat, Inc. and/or its affiliates. + * Copyright 2019 Google LLC * * Authors: * Avi Kivity <avi@redhat.com> @@ -11,6 +12,7 @@ * the COPYING file in the top-level directory. 
* */ +#if 0 #include <linux/types.h> #include <linux/kvm_host.h> #include <linux/perf_event.h> @@ -63,7 +65,7 @@ static void global_ctrl_changed(struct kvm_pmu *pmu, u64 data) pmu->global_ctrl = data; - for_each_set_bit(bit, (unsigned long *)&diff, X86_PMC_IDX_MAX) + for_each_set_bit(bit, (size_t *)&diff, X86_PMC_IDX_MAX) reprogram_counter(pmu, bit); } @@ -98,7 +100,7 @@ static bool intel_pmc_is_enabled(struct kvm_pmc *pmc) { struct kvm_pmu *pmu = pmc_to_pmu(pmc); - return test_bit(pmc->idx, (unsigned long *)&pmu->global_ctrl); + return test_bit(pmc->idx, (size_t *)&pmu->global_ctrl); } static struct kvm_pmc *intel_pmc_idx_to_pmc(struct kvm_pmu *pmu, int pmc_idx) @@ -356,3 +358,4 @@ struct kvm_pmu_ops intel_pmu_ops = { .init = intel_pmu_init, .reset = intel_pmu_reset, }; +#endif diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 8ca1eca..31fc896 100644..100755 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -5,6 +5,7 @@ * * Copyright (C) 2006 Qumranet, Inc. * Copyright 2010 Red Hat, Inc. and/or its affiliates. + * Copyright 2019 Google LLC * * Authors: * Yaniv Kamay <yaniv@qumranet.com> @@ -26,199 +27,12 @@ #include "cpuid.h" #include "pmu.h" -#include <linux/module.h> -#include <linux/mod_devicetable.h> -#include <linux/kernel.h> -#include <linux/vmalloc.h> -#include <linux/highmem.h> -#include <linux/sched.h> -#include <linux/trace_events.h> -#include <linux/slab.h> -#include <linux/amd-iommu.h> -#include <linux/hashtable.h> - -#include <asm/apic.h> -#include <asm/perf_event.h> -#include <asm/tlbflush.h> -#include <asm/desc.h> -#include <asm/debugreg.h> -#include <asm/kvm_para.h> -#include <asm/irq_remapping.h> - -#include <asm/virtext.h> -#include "trace.h" - -#define __ex(x) __kvm_handle_fault_on_reboot(x) - -MODULE_AUTHOR("Qumranet"); -MODULE_LICENSE("GPL"); - -static const struct x86_cpu_id svm_cpu_id[] = { - X86_FEATURE_MATCH(X86_FEATURE_SVM), - {} -}; -MODULE_DEVICE_TABLE(x86cpu, svm_cpu_id); - -#define IOPM_ALLOC_ORDER 2 -#define MSRPM_ALLOC_ORDER 1 - -#define SEG_TYPE_LDT 2 -#define SEG_TYPE_BUSY_TSS16 3 - -#define SVM_FEATURE_NPT (1 << 0) -#define SVM_FEATURE_LBRV (1 << 1) -#define SVM_FEATURE_SVML (1 << 2) -#define SVM_FEATURE_NRIP (1 << 3) -#define SVM_FEATURE_TSC_RATE (1 << 4) -#define SVM_FEATURE_VMCB_CLEAN (1 << 5) -#define SVM_FEATURE_FLUSH_ASID (1 << 6) -#define SVM_FEATURE_DECODE_ASSIST (1 << 7) -#define SVM_FEATURE_PAUSE_FILTER (1 << 10) - -#define SVM_AVIC_DOORBELL 0xc001011b - -#define NESTED_EXIT_HOST 0 /* Exit handled on host level */ -#define NESTED_EXIT_DONE 1 /* Exit caused nested vmexit */ -#define NESTED_EXIT_CONTINUE 2 /* Further checks needed */ - -#define DEBUGCTL_RESERVED_BITS (~(0x3fULL)) - -#define TSC_RATIO_RSVD 0xffffff0000000000ULL -#define TSC_RATIO_MIN 0x0000000000000001ULL -#define TSC_RATIO_MAX 0x000000ffffffffffULL - -#define AVIC_HPA_MASK ~((0xFFFULL << 52) | 0xFFF) - -/* - * 0xff is broadcast, so the max index allowed for physical APIC ID - * table is 0xfe. APIC IDs above 0xff are reserved. 
- */ -#define AVIC_MAX_PHYSICAL_ID_COUNT 255 - -#define AVIC_UNACCEL_ACCESS_WRITE_MASK 1 -#define AVIC_UNACCEL_ACCESS_OFFSET_MASK 0xFF0 -#define AVIC_UNACCEL_ACCESS_VECTOR_MASK 0xFFFFFFFF - -/* AVIC GATAG is encoded using VM and VCPU IDs */ -#define AVIC_VCPU_ID_BITS 8 -#define AVIC_VCPU_ID_MASK ((1 << AVIC_VCPU_ID_BITS) - 1) - -#define AVIC_VM_ID_BITS 24 -#define AVIC_VM_ID_NR (1 << AVIC_VM_ID_BITS) -#define AVIC_VM_ID_MASK ((1 << AVIC_VM_ID_BITS) - 1) - -#define AVIC_GATAG(x, y) (((x & AVIC_VM_ID_MASK) << AVIC_VCPU_ID_BITS) | \ - (y & AVIC_VCPU_ID_MASK)) -#define AVIC_GATAG_TO_VMID(x) ((x >> AVIC_VCPU_ID_BITS) & AVIC_VM_ID_MASK) -#define AVIC_GATAG_TO_VCPUID(x) (x & AVIC_VCPU_ID_MASK) - -static bool erratum_383_found __read_mostly; - -static const u32 host_save_user_msrs[] = { -#ifdef CONFIG_X86_64 - MSR_STAR, MSR_LSTAR, MSR_CSTAR, MSR_SYSCALL_MASK, MSR_KERNEL_GS_BASE, - MSR_FS_BASE, -#endif - MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, - MSR_TSC_AUX, -}; - -#define NR_HOST_SAVE_USER_MSRS ARRAY_SIZE(host_save_user_msrs) - -struct kvm_vcpu; - -struct nested_state { - struct vmcb *hsave; - u64 hsave_msr; - u64 vm_cr_msr; - u64 vmcb; - - /* These are the merged vectors */ - u32 *msrpm; - - /* gpa pointers to the real vectors */ - u64 vmcb_msrpm; - u64 vmcb_iopm; - - /* A VMEXIT is required but not yet emulated */ - bool exit_required; - - /* cache for intercepts of the guest */ - u32 intercept_cr; - u32 intercept_dr; - u32 intercept_exceptions; - u64 intercept; - - /* Nested Paging related state */ - u64 nested_cr3; -}; - -#define MSRPM_OFFSETS 16 -static u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly; - -/* - * Set osvw_len to higher value when updated Revision Guides - * are published and we know what the new status bits are - */ -static uint64_t osvw_len = 4, osvw_status; - -struct vcpu_svm { - struct kvm_vcpu vcpu; - struct vmcb *vmcb; - unsigned long vmcb_pa; - struct svm_cpu_data *svm_data; - uint64_t asid_generation; - uint64_t sysenter_esp; - uint64_t sysenter_eip; - uint64_t tsc_aux; - - u64 next_rip; - - u64 host_user_msrs[NR_HOST_SAVE_USER_MSRS]; - struct { - u16 fs; - u16 gs; - u16 ldt; - u64 gs_base; - } host; +#include <asm/svm.h> +#include <asm/vmx.h> - u32 *msrpm; - - ulong nmi_iret_rip; - - struct nested_state nested; - - bool nmi_singlestep; - - unsigned int3_injected; - unsigned long int3_rip; - u32 apf_reason; - - /* cached guest cpuid flags for faster access */ - bool nrips_enabled : 1; - - u32 ldr_reg; - struct page *avic_backing_page; - u64 *avic_physical_id_cache; - bool avic_is_running; - - /* - * Per-vcpu list of struct amd_svm_iommu_ir: - * This is used mainly to store interrupt remapping information used - * when update the vcpu affinity. This avoids the need to scan for - * IRTE and try to match ga_tag in the IOMMU driver. - */ - struct list_head ir_list; - spinlock_t ir_list_lock; -}; - -/* - * This is a wrapper of struct amd_iommu_ir_data. 
- */ -struct amd_svm_iommu_ir { - struct list_head node; /* Used by SVM for per-vcpu ir_list */ - void *data; /* Storing pointer to struct amd_ir_data */ -}; +#include <__asm.h> +//seperate definitions to svm_def.h for asmgen +#include "svm_def.h" #define AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK (0xFF) #define AVIC_LOGICAL_ID_ENTRY_VALID_MASK (1 << 31) @@ -228,9 +42,6 @@ struct amd_svm_iommu_ir { #define AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK (1ULL << 62) #define AVIC_PHYSICAL_ID_ENTRY_VALID_MASK (1ULL << 63) -static DEFINE_PER_CPU(u64, current_tsc_ratio); -#define TSC_RATIO_DEFAULT 0x0100000000ULL - #define MSR_INVALID 0xffffffffU static const struct svm_direct_access_msrs { @@ -263,23 +74,18 @@ static bool npt_enabled; /* allow nested paging (virtualized MMU) for all guests */ static int npt = true; -module_param(npt, int, S_IRUGO); -/* allow nested virtualization in KVM/SVM */ -static int nested = true; -module_param(nested, int, S_IRUGO); +/* allow nested virtualization in kvm/SVM */ +static int nested = false; /* enable / disable AVIC */ static int avic; -#ifdef CONFIG_X86_LOCAL_APIC -module_param(avic, int, S_IRUGO); -#endif /* AVIC VM ID bit masks and lock */ static DECLARE_BITMAP(avic_vm_id_bitmap, AVIC_VM_ID_NR); static DEFINE_SPINLOCK(avic_vm_id_lock); -static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0); +static void svm_set_cr0(struct kvm_vcpu *vcpu, size_t cr0); static void svm_flush_tlb(struct kvm_vcpu *vcpu); static void svm_complete_interrupts(struct vcpu_svm *svm); @@ -345,11 +151,13 @@ static inline bool avic_vcpu_is_running(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); u64 *entry = svm->avic_physical_id_cache; + u64 temp; if (!entry) return false; - return (READ_ONCE(*entry) & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK); + READ_ONCE(*entry, temp); + return temp & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK; } static void recalc_intercepts(struct vcpu_svm *svm) @@ -489,8 +297,10 @@ static inline bool gif_set(struct vcpu_svm *svm) return !!(svm->vcpu.arch.hflags & HF_GIF_MASK); } -static unsigned long iopm_base; +static size_t iopm_base; +static void *iopm_va; +#pragma pack(push, 1) struct kvm_ldttss_desc { u16 limit0; u16 base0; @@ -498,7 +308,8 @@ struct kvm_ldttss_desc { unsigned limit1:4, zero0:3, g:1, base2:8; u32 base3; u32 zero1; -} __attribute__((packed)); +}; +#pragma pack(pop) struct svm_cpu_data { int cpu; @@ -549,17 +360,17 @@ static u32 svm_msrpm_offset(u32 msr) static inline void clgi(void) { - asm volatile (__ex(SVM_CLGI)); + __svm_clgi(); } static inline void stgi(void) { - asm volatile (__ex(SVM_STGI)); + __svm_stgi(); } -static inline void invlpga(unsigned long addr, u32 asid) +static inline void invlpga(size_t addr, u32 asid) { - asm volatile (__ex(SVM_INVLPGA) : : "a"(addr), "c"(asid)); + __svm_invlpga((void *)addr, asid); } static int get_npt_level(void) @@ -593,7 +404,7 @@ static u32 svm_get_interrupt_shadow(struct kvm_vcpu *vcpu) u32 ret = 0; if (svm->vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) - ret = KVM_X86_SHADOW_INT_STI | KVM_X86_SHADOW_INT_MOV_SS; + ret = GVM_X86_SHADOW_INT_STI | GVM_X86_SHADOW_INT_MOV_SS; return ret; } @@ -646,7 +457,7 @@ static void svm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, return; if (nr == BP_VECTOR && !static_cpu_has(X86_FEATURE_NRIPS)) { - unsigned long rip, old_rip = kvm_rip_read(&svm->vcpu); + size_t rip, old_rip = kvm_rip_read(&svm->vcpu); /* * For guest debugging where we have to reinject #BP if some @@ -668,15 +479,28 @@ static void svm_queue_exception(struct kvm_vcpu 
*vcpu, unsigned nr, svm->vmcb->control.event_inj_err = error_code; } +/** +* upper_32_bits - return bits 32-63 of a number +* @n: the number we're accessing +* +* A basic shift-right of a 64- or 32-bit quantity. Use this to suppress +* the "right shift count >= width of type" warning when that quantity is +* 32-bits. +*/ +#define upper_32_bits(n) ((u32)(((n) >> 16) >> 16)) + +/** +* lower_32_bits - return bits 0-31 of a number
+* @n: the number we're accessing
+*/ +#define lower_32_bits(n) ((u32)(n)) + static void svm_init_erratum_383(void) { u32 low, high; int err; u64 val; - if (!static_cpu_has_bug(X86_BUG_AMD_TLB_MMATCH)) - return; - /* Use _safe variants to not break nested virtualization */ val = native_read_msr_safe(MSR_AMD64_DC_CFG, &err); if (err) @@ -715,25 +539,23 @@ static void svm_init_osvw(struct kvm_vcpu *vcpu) static int has_svm(void) { - const char *msg; + return static_cpu_has(X86_FEATURE_SVM); +} - if (!cpu_has_svm(&msg)) { - printk(KERN_INFO "has_svm: %s\n", msg); - return 0; - } +static inline void cpu_svm_disable(void) +{ + uint64_t efer; - return 1; + wrmsrl(MSR_VM_HSAVE_PA, 0); + rdmsrl(MSR_EFER, efer); + wrmsrl(MSR_EFER, efer & ~EFER_SVME); } static void svm_hardware_disable(void) { - /* Make sure we clean up behind us */ - if (static_cpu_has(X86_FEATURE_TSCRATEMSR)) - wrmsrl(MSR_AMD64_TSC_RATIO, TSC_RATIO_DEFAULT); - cpu_svm_disable(); - amd_pmu_disable_virt(); + //amd_pmu_disable_virt(); } static int svm_hardware_enable(void) @@ -771,11 +593,6 @@ static int svm_hardware_enable(void) wrmsrl(MSR_VM_HSAVE_PA, page_to_pfn(sd->save_area) << PAGE_SHIFT); - if (static_cpu_has(X86_FEATURE_TSCRATEMSR)) { - wrmsrl(MSR_AMD64_TSC_RATIO, TSC_RATIO_DEFAULT); - __this_cpu_write(current_tsc_ratio, TSC_RATIO_DEFAULT); - } - /* * Get OSVW bits. @@ -808,7 +625,7 @@ static int svm_hardware_enable(void) svm_init_erratum_383(); - amd_pmu_enable_virt(); + //amd_pmu_enable_virt(); return 0; } @@ -864,7 +681,7 @@ static void set_msr_interception(u32 *msrpm, unsigned msr, int read, int write) { u8 bit_read, bit_write; - unsigned long tmp; + size_t tmp; u32 offset; /* @@ -965,70 +782,17 @@ static void svm_disable_lbrv(struct vcpu_svm *svm) set_msr_interception(msrpm, MSR_IA32_LASTINTTOIP, 0, 0); } -/* Note: - * This hash table is used to map VM_ID to a struct kvm_arch, - * when handling AMD IOMMU GALOG notification to schedule in - * a particular vCPU. - */ -#define SVM_VM_DATA_HASH_BITS 8 -DECLARE_HASHTABLE(svm_vm_data_hash, SVM_VM_DATA_HASH_BITS); -static spinlock_t svm_vm_data_hash_lock; - -/* Note: - * This function is called from IOMMU driver to notify - * SVM to schedule in a particular vCPU of a particular VM. - */ -static int avic_ga_log_notifier(u32 ga_tag) -{ - unsigned long flags; - struct kvm_arch *ka = NULL; - struct kvm_vcpu *vcpu = NULL; - u32 vm_id = AVIC_GATAG_TO_VMID(ga_tag); - u32 vcpu_id = AVIC_GATAG_TO_VCPUID(ga_tag); - - pr_debug("SVM: %s: vm_id=%#x, vcpu_id=%#x\n", __func__, vm_id, vcpu_id); - - spin_lock_irqsave(&svm_vm_data_hash_lock, flags); - hash_for_each_possible(svm_vm_data_hash, ka, hnode, vm_id) { - struct kvm *kvm = container_of(ka, struct kvm, arch); - struct kvm_arch *vm_data = &kvm->arch; - - if (vm_data->avic_vm_id != vm_id) - continue; - vcpu = kvm_get_vcpu_by_id(kvm, vcpu_id); - break; - } - spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags); - - if (!vcpu) - return 0; - - /* Note: - * At this point, the IOMMU should have already set the pending - * bit in the vAPIC backing page. So, we just need to schedule - * in the vcpu. 
- */ - if (vcpu->mode == OUTSIDE_GUEST_MODE) - kvm_vcpu_wake_up(vcpu); - - return 0; -} - -static __init int svm_hardware_setup(void) +static int svm_hardware_setup(void) { int cpu; - struct page *iopm_pages; - void *iopm_va; int r; + PHYSICAL_ADDRESS max_phys = { .QuadPart = MAXULONG64 }; - iopm_pages = alloc_pages(GFP_KERNEL, IOPM_ALLOC_ORDER); - - if (!iopm_pages) - return -ENOMEM; - - iopm_va = page_address(iopm_pages); + iopm_va = MmAllocateContiguousMemory(PAGE_SIZE * (1 << IOPM_ALLOC_ORDER), max_phys); + if (!iopm_va) + return ENOMEM; memset(iopm_va, 0xff, PAGE_SIZE * (1 << IOPM_ALLOC_ORDER)); - iopm_base = page_to_pfn(iopm_pages) << PAGE_SHIFT; + iopm_base = MmGetPhysicalAddress(iopm_va).QuadPart; init_msrpm_offsets(); @@ -1038,12 +802,6 @@ static __init int svm_hardware_setup(void) if (boot_cpu_has(X86_FEATURE_FXSR_OPT)) kvm_enable_efer_bits(EFER_FFXSR); - if (boot_cpu_has(X86_FEATURE_TSCRATEMSR)) { - kvm_has_tsc_control = true; - kvm_max_tsc_scaling_ratio = TSC_RATIO_MAX; - kvm_tsc_scaling_ratio_frac_bits = 32; - } - if (nested) { printk(KERN_INFO "kvm: Nested Virtualization enabled\n"); kvm_enable_efer_bits(EFER_SVME | EFER_LMSLE); @@ -1071,34 +829,31 @@ static __init int svm_hardware_setup(void) if (avic) { if (!npt_enabled || - !boot_cpu_has(X86_FEATURE_AVIC) || - !IS_ENABLED(CONFIG_X86_LOCAL_APIC)) { + !boot_cpu_has(X86_FEATURE_AVIC)) { avic = false; } else { pr_info("AVIC enabled\n"); - - hash_init(svm_vm_data_hash); - spin_lock_init(&svm_vm_data_hash_lock); - amd_iommu_register_ga_log_notifier(&avic_ga_log_notifier); } } return 0; err: - __free_pages(iopm_pages, IOPM_ALLOC_ORDER); + MmFreeContiguousMemory(iopm_va); + iopm_va = NULL; iopm_base = 0; return r; } -static __exit void svm_hardware_unsetup(void) +static void svm_hardware_unsetup(void) { int cpu; for_each_possible_cpu(cpu) svm_cpu_uninit(cpu); - __free_pages(pfn_to_page(iopm_base >> PAGE_SHIFT), IOPM_ALLOC_ORDER); + MmFreeContiguousMemory(iopm_va); + iopm_va = NULL; iopm_base = 0; } @@ -1128,10 +883,7 @@ static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) g_tsc_offset = svm->vmcb->control.tsc_offset - svm->nested.hsave->control.tsc_offset; svm->nested.hsave->control.tsc_offset = offset; - } else - trace_kvm_write_tsc_offset(vcpu->vcpu_id, - svm->vmcb->control.tsc_offset, - offset); + } svm->vmcb->control.tsc_offset = offset + g_tsc_offset; @@ -1159,7 +911,6 @@ static void init_vmcb(struct vcpu_svm *svm) struct vmcb_control_area *control = &svm->vmcb->control; struct vmcb_save_area *save = &svm->vmcb->save; - svm->vcpu.fpu_active = 1; svm->vcpu.arch.hflags = 0; set_cr_intercept(svm, INTERCEPT_CR0_READ); @@ -1260,11 +1011,6 @@ static void init_vmcb(struct vcpu_svm *svm) svm->nested.vmcb = 0; svm->vcpu.arch.hflags = 0; - if (boot_cpu_has(X86_FEATURE_PAUSEFILTER)) { - control->pause_filter_count = 3000; - set_intercept(svm, INTERCEPT_PAUSE); - } - if (avic) avic_init_vmcb(svm); @@ -1338,7 +1084,7 @@ static int avic_init_backing_page(struct kvm_vcpu *vcpu) if (!entry) return -EINVAL; - new_entry = READ_ONCE(*entry); + READ_ONCE(*entry, new_entry); new_entry = (page_to_phys(svm->avic_backing_page) & AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK) | AVIC_PHYSICAL_ID_ENTRY_VALID_MASK; @@ -1379,7 +1125,6 @@ static inline int avic_free_vm_id(int id) static void avic_vm_destroy(struct kvm *kvm) { - unsigned long flags; struct kvm_arch *vm_data = &kvm->arch; avic_free_vm_id(vm_data->avic_vm_id); @@ -1388,15 +1133,10 @@ static void avic_vm_destroy(struct kvm *kvm) __free_page(vm_data->avic_logical_id_table_page); if 
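For context, the MmAllocateContiguousMemory()/MmGetPhysicalAddress() pair in svm_hardware_setup() above is the WDK replacement for alloc_pages()/page_to_pfn(): the I/O permission bitmap has to be physically contiguous, and its physical base address is what later gets programmed into the VMCB. A minimal sketch of that allocation pattern; the function names are illustrative and error handling is trimmed:

static void *alloc_perm_bitmap_sketch(unsigned int order, u64 *phys_base)
{
        PHYSICAL_ADDRESS max_phys = { .QuadPart = MAXULONG64 };
        void *va;

        /* Physically contiguous, page-aligned, anywhere in physical memory. */
        va = MmAllocateContiguousMemory(PAGE_SIZE * (1ULL << order), max_phys);
        if (!va)
                return NULL;

        /* All bits set = intercept everything, as the hunk above does. */
        memset(va, 0xff, PAGE_SIZE * (1ULL << order));

        *phys_base = MmGetPhysicalAddress(va).QuadPart;
        return va;
}

static void free_perm_bitmap_sketch(void *va)
{
        if (va)
                MmFreeContiguousMemory(va);
}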
(vm_data->avic_physical_id_table_page) __free_page(vm_data->avic_physical_id_table_page); - - spin_lock_irqsave(&svm_vm_data_hash_lock, flags); - hash_del(&vm_data->hnode); - spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags); } static int avic_vm_init(struct kvm *kvm) { - unsigned long flags; int vm_id, err = -ENOMEM; struct kvm_arch *vm_data = &kvm->arch; struct page *p_page; @@ -1426,10 +1166,6 @@ static int avic_vm_init(struct kvm *kvm) vm_data->avic_logical_id_table_page = l_page; clear_page(page_address(l_page)); - spin_lock_irqsave(&svm_vm_data_hash_lock, flags); - hash_add(svm_vm_data_hash, &vm_data->hnode, vm_data->avic_vm_id); - spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags); - return 0; free_avic: @@ -1437,36 +1173,6 @@ free_avic: return err; } -static inline int -avic_update_iommu_vcpu_affinity(struct kvm_vcpu *vcpu, int cpu, bool r) -{ - int ret = 0; - unsigned long flags; - struct amd_svm_iommu_ir *ir; - struct vcpu_svm *svm = to_svm(vcpu); - - if (!kvm_arch_has_assigned_device(vcpu->kvm)) - return 0; - - /* - * Here, we go through the per-vcpu ir_list to update all existing - * interrupt remapping table entry targeting this vcpu. - */ - spin_lock_irqsave(&svm->ir_list_lock, flags); - - if (list_empty(&svm->ir_list)) - goto out; - - list_for_each_entry(ir, &svm->ir_list, node) { - ret = amd_iommu_update_ga(cpu, r, ir->data); - if (ret) - break; - } -out: - spin_unlock_irqrestore(&svm->ir_list_lock, flags); - return ret; -} - static void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { u64 entry; @@ -1480,7 +1186,7 @@ static void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu) if (WARN_ON(h_physical_id >= AVIC_MAX_PHYSICAL_ID_COUNT)) return; - entry = READ_ONCE(*(svm->avic_physical_id_cache)); + READ_ONCE(*(svm->avic_physical_id_cache), entry); WARN_ON(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK); entry &= ~AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK; @@ -1491,8 +1197,6 @@ static void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu) entry |= AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK; WRITE_ONCE(*(svm->avic_physical_id_cache), entry); - avic_update_iommu_vcpu_affinity(vcpu, h_physical_id, - svm->avic_is_running); } static void avic_vcpu_put(struct kvm_vcpu *vcpu) @@ -1503,10 +1207,7 @@ static void avic_vcpu_put(struct kvm_vcpu *vcpu) if (!kvm_vcpu_apicv_active(vcpu)) return; - entry = READ_ONCE(*(svm->avic_physical_id_cache)); - if (entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK) - avic_update_iommu_vcpu_affinity(vcpu, -1, 0); - + READ_ONCE(*(svm->avic_physical_id_cache), entry); entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK; WRITE_ONCE(*(svm->avic_physical_id_cache), entry); } @@ -1550,12 +1251,13 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) { struct vcpu_svm *svm; struct page *page; - struct page *msrpm_pages; struct page *hsave_page; - struct page *nested_msrpm_pages; + void *msrpm_va; + void *nested_msrpm_va; int err; + PHYSICAL_ADDRESS max_phys = { .QuadPart = MAXULONG64 }; - svm = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL); + svm = kzalloc_fast(sizeof(struct vcpu_svm), GFP_KERNEL); if (!svm) { err = -ENOMEM; goto out; @@ -1570,12 +1272,12 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) if (!page) goto uninit; - msrpm_pages = alloc_pages(GFP_KERNEL, MSRPM_ALLOC_ORDER); - if (!msrpm_pages) + msrpm_va = MmAllocateContiguousMemory(PAGE_SIZE * (1 << MSRPM_ALLOC_ORDER), max_phys); + if (!msrpm_va) goto free_page1; - nested_msrpm_pages = alloc_pages(GFP_KERNEL, MSRPM_ALLOC_ORDER); - if 
(!nested_msrpm_pages) + nested_msrpm_va = MmAllocateContiguousMemory(PAGE_SIZE * (1 << MSRPM_ALLOC_ORDER), max_phys); + if (!nested_msrpm_va) goto free_page2; hsave_page = alloc_page(GFP_KERNEL); @@ -1586,9 +1288,6 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) err = avic_init_backing_page(&svm->vcpu); if (err) goto free_page4; - - INIT_LIST_HEAD(&svm->ir_list); - spin_lock_init(&svm->ir_list_lock); } /* We initialize this flag to true to make sure that the is_running @@ -1598,10 +1297,10 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) svm->nested.hsave = page_address(hsave_page); - svm->msrpm = page_address(msrpm_pages); + svm->msrpm = msrpm_va; svm_vcpu_init_msrpm(svm->msrpm); - svm->nested.msrpm = page_address(nested_msrpm_pages); + svm->nested.msrpm = nested_msrpm_va; svm_vcpu_init_msrpm(svm->nested.msrpm); svm->vmcb = page_address(page); @@ -1617,15 +1316,15 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) free_page4: __free_page(hsave_page); free_page3: - __free_pages(nested_msrpm_pages, MSRPM_ALLOC_ORDER); + MmFreeContiguousMemory(nested_msrpm_va); free_page2: - __free_pages(msrpm_pages, MSRPM_ALLOC_ORDER); + MmFreeContiguousMemory(msrpm_va); free_page1: __free_page(page); uninit: kvm_vcpu_uninit(&svm->vcpu); free_svm: - kmem_cache_free(kvm_vcpu_cache, svm); + kfree(svm); out: return ERR_PTR(err); } @@ -1635,16 +1334,27 @@ static void svm_free_vcpu(struct kvm_vcpu *vcpu) struct vcpu_svm *svm = to_svm(vcpu); __free_page(pfn_to_page(svm->vmcb_pa >> PAGE_SHIFT)); - __free_pages(virt_to_page(svm->msrpm), MSRPM_ALLOC_ORDER); + MmFreeContiguousMemory(svm->msrpm); __free_page(virt_to_page(svm->nested.hsave)); - __free_pages(virt_to_page(svm->nested.msrpm), MSRPM_ALLOC_ORDER); + MmFreeContiguousMemory(svm->nested.msrpm); kvm_vcpu_uninit(vcpu); - kmem_cache_free(kvm_vcpu_cache, svm); + kfree(svm); } static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { + avic_vcpu_load(vcpu, cpu); +} + +static void svm_vcpu_put(struct kvm_vcpu *vcpu) +{ + avic_vcpu_put(vcpu); +} + +static void svm_save_host_state(struct kvm_vcpu *vcpu) +{ struct vcpu_svm *svm = to_svm(vcpu); + int cpu = smp_processor_id(); int i; if (unlikely(cpu != vcpu->cpu)) { @@ -1661,38 +1371,17 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu) for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++) rdmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]); - - if (static_cpu_has(X86_FEATURE_TSCRATEMSR)) { - u64 tsc_ratio = vcpu->arch.tsc_scaling_ratio; - if (tsc_ratio != __this_cpu_read(current_tsc_ratio)) { - __this_cpu_write(current_tsc_ratio, tsc_ratio); - wrmsrl(MSR_AMD64_TSC_RATIO, tsc_ratio); - } - } - /* This assumes that the kernel never uses MSR_TSC_AUX */ - if (static_cpu_has(X86_FEATURE_RDTSCP)) - wrmsrl(MSR_TSC_AUX, svm->tsc_aux); - - avic_vcpu_load(vcpu, cpu); } -static void svm_vcpu_put(struct kvm_vcpu *vcpu) +static void svm_load_host_state(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); int i; - avic_vcpu_put(vcpu); - - ++vcpu->stat.host_state_reload; kvm_load_ldt(svm->host.ldt); #ifdef CONFIG_X86_64 loadsegment(fs, svm->host.fs); - wrmsrl(MSR_KERNEL_GS_BASE, current->thread.gsbase); load_gs_index(svm->host.gs); -#else -#ifdef CONFIG_X86_32_LAZY_GS - loadsegment(gs, svm->host.gs); -#endif #endif for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++) wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]); @@ -1708,12 +1397,12 @@ static void svm_vcpu_unblocking(struct kvm_vcpu *vcpu) avic_set_running(vcpu, true); } -static 
unsigned long svm_get_rflags(struct kvm_vcpu *vcpu) +static size_t svm_get_rflags(struct kvm_vcpu *vcpu) { return to_svm(vcpu)->vmcb->save.rflags; } -static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) +static void svm_set_rflags(struct kvm_vcpu *vcpu, size_t rflags) { /* * Any change of EFLAGS.VM is accompanied by a reload of SS @@ -1723,11 +1412,6 @@ static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) to_svm(vcpu)->vmcb->save.rflags = rflags; } -static u32 svm_get_pkru(struct kvm_vcpu *vcpu) -{ - return 0; -} - static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg) { switch (reg) { @@ -1765,7 +1449,6 @@ static struct vmcb_seg *svm_seg(struct kvm_vcpu *vcpu, int seg) case VCPU_SREG_LDTR: return &save->ldtr; } BUG(); - return NULL; } static u64 svm_get_segment_base(struct kvm_vcpu *vcpu, int seg) @@ -1796,7 +1479,7 @@ static void svm_get_segment(struct kvm_vcpu *vcpu, * However, the SVM spec states that the G bit is not observed by the * CPU, and some VMware virtual CPUs drop the G bit for all segments. * So let's synthesize a legal G bit for all segments, this helps - * running KVM nested. It also helps cross-vendor migration, because + * running kvm nested. It also helps cross-vendor migration, because * Intel's vmentry has a check on the 'G' bit. */ var->g = s->limit > 0xfffff; @@ -1901,15 +1584,12 @@ static void update_cr0_intercept(struct vcpu_svm *svm) ulong gcr0 = svm->vcpu.arch.cr0; u64 *hcr0 = &svm->vmcb->save.cr0; - if (!svm->vcpu.fpu_active) - *hcr0 |= SVM_CR0_SELECTIVE_MASK; - else - *hcr0 = (*hcr0 & ~SVM_CR0_SELECTIVE_MASK) - | (gcr0 & SVM_CR0_SELECTIVE_MASK); + *hcr0 = (*hcr0 & ~SVM_CR0_SELECTIVE_MASK) + | (gcr0 & SVM_CR0_SELECTIVE_MASK); mark_dirty(svm->vmcb, VMCB_CR); - if (gcr0 == *hcr0 && svm->vcpu.fpu_active) { + if (gcr0 == *hcr0) { clr_cr_intercept(svm, INTERCEPT_CR0_READ); clr_cr_intercept(svm, INTERCEPT_CR0_WRITE); } else { @@ -1918,7 +1598,7 @@ static void update_cr0_intercept(struct vcpu_svm *svm) } } -static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) +static void svm_set_cr0(struct kvm_vcpu *vcpu, size_t cr0) { struct vcpu_svm *svm = to_svm(vcpu); @@ -1940,24 +1620,22 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) if (!npt_enabled) cr0 |= X86_CR0_PG | X86_CR0_WP; - if (!vcpu->fpu_active) - cr0 |= X86_CR0_TS; /* * re-enable caching here because the QEMU bios * does not do it - this results in some delay at * reboot */ - if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED)) + if (kvm_check_has_quirk(vcpu->kvm, GVM_X86_QUIRK_CD_NW_CLEARED)) cr0 &= ~(X86_CR0_CD | X86_CR0_NW); svm->vmcb->save.cr0 = cr0; mark_dirty(svm->vmcb, VMCB_CR); update_cr0_intercept(svm); } -static int svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) +static int svm_set_cr4(struct kvm_vcpu *vcpu, size_t cr4) { - unsigned long host_cr4_mce = cr4_read_shadow() & X86_CR4_MCE; - unsigned long old_cr4 = to_svm(vcpu)->vmcb->save.cr4; + size_t host_cr4_mce = cr4_read_shadow() & X86_CR4_MCE; + size_t old_cr4 = to_svm(vcpu)->vmcb->save.cr4; if (cr4 & X86_CR4_VMXE) return 1; @@ -2014,8 +1692,8 @@ static void update_bp_intercept(struct kvm_vcpu *vcpu) clr_exception_intercept(svm, BP_VECTOR); - if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) { - if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) + if (vcpu->guest_debug & GVM_GUESTDBG_ENABLE) { + if (vcpu->guest_debug & GVM_GUESTDBG_USE_SW_BP) set_exception_intercept(svm, BP_VECTOR); } else vcpu->guest_debug = 0; @@ -2040,7 +1718,7 @@ static u64 
svm_get_dr6(struct kvm_vcpu *vcpu) return to_svm(vcpu)->vmcb->save.dr6; } -static void svm_set_dr6(struct kvm_vcpu *vcpu, unsigned long value) +static void svm_set_dr6(struct kvm_vcpu *vcpu, size_t value) { struct vcpu_svm *svm = to_svm(vcpu); @@ -2059,11 +1737,11 @@ static void svm_sync_dirty_debug_regs(struct kvm_vcpu *vcpu) vcpu->arch.dr6 = svm_get_dr6(vcpu); vcpu->arch.dr7 = svm->vmcb->save.dr7; - vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT; + vcpu->arch.switch_db_regs &= ~GVM_DEBUGREG_WONT_EXIT; set_dr_intercepts(svm); } -static void svm_set_dr7(struct kvm_vcpu *vcpu, unsigned long value) +static void svm_set_dr7(struct kvm_vcpu *vcpu, size_t value) { struct vcpu_svm *svm = to_svm(vcpu); @@ -2077,30 +1755,14 @@ static int pf_interception(struct vcpu_svm *svm) u32 error_code; int r = 1; - switch (svm->apf_reason) { - default: - error_code = svm->vmcb->control.exit_info_1; - - trace_kvm_page_fault(fault_address, error_code); - if (!npt_enabled && kvm_event_needs_reinjection(&svm->vcpu)) - kvm_mmu_unprotect_page_virt(&svm->vcpu, fault_address); - r = kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code, - svm->vmcb->control.insn_bytes, - svm->vmcb->control.insn_len); - break; - case KVM_PV_REASON_PAGE_NOT_PRESENT: - svm->apf_reason = 0; - local_irq_disable(); - kvm_async_pf_task_wait(fault_address); - local_irq_enable(); - break; - case KVM_PV_REASON_PAGE_READY: - svm->apf_reason = 0; - local_irq_disable(); - kvm_async_pf_task_wake(fault_address); - local_irq_enable(); - break; - } + error_code = svm->vmcb->control.exit_info_1; + + if (!npt_enabled && kvm_event_needs_reinjection(&svm->vcpu)) + kvm_mmu_unprotect_page_virt(&svm->vcpu, fault_address); + r = kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code, + svm->vmcb->control.insn_bytes, + svm->vmcb->control.insn_len); + return r; } @@ -2109,7 +1771,7 @@ static int db_interception(struct vcpu_svm *svm) struct kvm_run *kvm_run = svm->vcpu.run; if (!(svm->vcpu.guest_debug & - (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) && + (GVM_GUESTDBG_SINGLESTEP | GVM_GUESTDBG_USE_HW_BP)) && !svm->nmi_singlestep) { kvm_queue_exception(&svm->vcpu, DB_VECTOR); return 1; @@ -2117,14 +1779,14 @@ static int db_interception(struct vcpu_svm *svm) if (svm->nmi_singlestep) { svm->nmi_singlestep = false; - if (!(svm->vcpu.guest_debug & KVM_GUESTDBG_SINGLESTEP)) + if (!(svm->vcpu.guest_debug & GVM_GUESTDBG_SINGLESTEP)) svm->vmcb->save.rflags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF); } if (svm->vcpu.guest_debug & - (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) { - kvm_run->exit_reason = KVM_EXIT_DEBUG; + (GVM_GUESTDBG_SINGLESTEP | GVM_GUESTDBG_USE_HW_BP)) { + kvm_run->exit_reason = GVM_EXIT_DEBUG; kvm_run->debug.arch.pc = svm->vmcb->save.cs.base + svm->vmcb->save.rip; kvm_run->debug.arch.exception = DB_VECTOR; @@ -2138,7 +1800,7 @@ static int bp_interception(struct vcpu_svm *svm) { struct kvm_run *kvm_run = svm->vcpu.run; - kvm_run->exit_reason = KVM_EXIT_DEBUG; + kvm_run->exit_reason = GVM_EXIT_DEBUG; kvm_run->debug.arch.pc = svm->vmcb->save.cs.base + svm->vmcb->save.rip; kvm_run->debug.arch.exception = BP_VECTOR; return 0; @@ -2160,22 +1822,6 @@ static int ac_interception(struct vcpu_svm *svm) return 1; } -static void svm_fpu_activate(struct kvm_vcpu *vcpu) -{ - struct vcpu_svm *svm = to_svm(vcpu); - - clr_exception_intercept(svm, NM_VECTOR); - - svm->vcpu.fpu_active = 1; - update_cr0_intercept(svm); -} - -static int nm_interception(struct vcpu_svm *svm) -{ - svm_fpu_activate(&svm->vcpu); - return 1; -} - static bool 
is_erratum_383(void) { int err, i; @@ -2210,7 +1856,7 @@ static bool is_erratum_383(void) } /* Flush tlb to evict multi-match entries */ - __flush_tlb_all(); + //__flush_tlb_all(); return true; } @@ -2222,9 +1868,9 @@ static void svm_handle_mce(struct vcpu_svm *svm) * Erratum 383 triggered. Guest state is corrupt so kill the * guest. */ - pr_err("KVM: Guest triggered AMD Erratum 383\n"); + pr_err("kvm: Guest triggered AMD Erratum 383\n"); - kvm_make_request(KVM_REQ_TRIPLE_FAULT, &svm->vcpu); + kvm_make_request(GVM_REQ_TRIPLE_FAULT, &svm->vcpu); return; } @@ -2233,8 +1879,7 @@ static void svm_handle_mce(struct vcpu_svm *svm) * On an #MC intercept the MCE handler is not called automatically in * the host. So do it by hand here. */ - asm volatile ( - "int $0x12\n"); + __int12(); /* not sure if we ever come back to this point */ return; @@ -2256,7 +1901,7 @@ static int shutdown_interception(struct vcpu_svm *svm) clear_page(svm->vmcb); init_vmcb(svm); - kvm_run->exit_reason = KVM_EXIT_SHUTDOWN; + kvm_run->exit_reason = GVM_EXIT_SHUTDOWN; return 0; } @@ -2303,13 +1948,7 @@ static int halt_interception(struct vcpu_svm *svm) return kvm_emulate_halt(&svm->vcpu); } -static int vmmcall_interception(struct vcpu_svm *svm) -{ - svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; - return kvm_emulate_hypercall(&svm->vcpu); -} - -static unsigned long nested_svm_get_tdp_cr3(struct kvm_vcpu *vcpu) +static size_t nested_svm_get_tdp_cr3(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -2331,7 +1970,7 @@ static u64 nested_svm_get_tdp_pdptr(struct kvm_vcpu *vcpu, int index) } static void nested_svm_set_tdp_cr3(struct kvm_vcpu *vcpu, - unsigned long root) + size_t root) { struct vcpu_svm *svm = to_svm(vcpu); @@ -2455,7 +2094,6 @@ static inline bool nested_svm_intr(struct vcpu_svm *svm) * the #vmexit here. 
*/ svm->nested.exit_required = true; - trace_kvm_nested_intr_vmexit(svm->vmcb->save.rip); return false; } @@ -2477,30 +2115,44 @@ static inline bool nested_svm_nmi(struct vcpu_svm *svm) return false; } -static void *nested_svm_map(struct vcpu_svm *svm, u64 gpa, struct page **_page) +static void *nested_svm_map(struct vcpu_svm *svm, u64 gpa, PMDL *_mdl) { - struct page *page; + size_t hva; + PMDL mdl; + void *ret; might_sleep(); - page = kvm_vcpu_gfn_to_page(&svm->vcpu, gpa >> PAGE_SHIFT); - if (is_error_page(page)) + hva = gfn_to_hva(svm->vcpu.kvm, gpa >> PAGE_SHIFT); + if (kvm_is_error_hva(hva)) goto error; - *_page = page; + mdl = IoAllocateMdl((void *)hva, PAGE_SIZE, FALSE, FALSE, NULL); + if (!mdl) + goto error; + + MmProbeAndLockPages(mdl, KernelMode, IoWriteAccess); - return kmap(page); + ret = kmap(mdl); + if (!ret) + goto error1; + *_mdl = mdl; + return ret; + +error1: + MmUnlockPages(mdl); + IoFreeMdl(mdl); error: kvm_inject_gp(&svm->vcpu, 0); return NULL; } -static void nested_svm_unmap(struct page *page) +static void nested_svm_unmap(PMDL mdl) { - kunmap(page); - kvm_release_page_dirty(page); + kunmap(mdl); + kvm_release_page(mdl); } static int nested_svm_intercept_ioio(struct vcpu_svm *svm) @@ -2569,12 +2221,9 @@ static int nested_svm_exit_special(struct vcpu_svm *svm) break; case SVM_EXIT_EXCP_BASE + PF_VECTOR: /* When we're shadowing, trap PFs, but not async PF */ - if (!npt_enabled && svm->apf_reason == 0) + if (!npt_enabled) return NESTED_EXIT_HOST; break; - case SVM_EXIT_EXCP_BASE + NM_VECTOR: - nm_interception(svm); - break; default: break; } @@ -2597,26 +2246,77 @@ static int nested_svm_intercept(struct vcpu_svm *svm) case SVM_EXIT_IOIO: vmexit = nested_svm_intercept_ioio(svm); break; - case SVM_EXIT_READ_CR0 ... SVM_EXIT_WRITE_CR8: { + case SVM_EXIT_READ_CR0: + case SVM_EXIT_READ_CR2: + case SVM_EXIT_READ_CR3: + case SVM_EXIT_READ_CR4: + case SVM_EXIT_READ_CR8: + case SVM_EXIT_WRITE_CR0: + case SVM_EXIT_WRITE_CR2: + case SVM_EXIT_WRITE_CR3: + case SVM_EXIT_WRITE_CR4: + case SVM_EXIT_WRITE_CR8: { u32 bit = 1U << (exit_code - SVM_EXIT_READ_CR0); if (svm->nested.intercept_cr & bit) vmexit = NESTED_EXIT_DONE; break; } - case SVM_EXIT_READ_DR0 ... SVM_EXIT_WRITE_DR7: { + case SVM_EXIT_READ_DR0: + case SVM_EXIT_READ_DR1: + case SVM_EXIT_READ_DR2: + case SVM_EXIT_READ_DR3: + case SVM_EXIT_READ_DR4: + case SVM_EXIT_READ_DR5: + case SVM_EXIT_READ_DR6: + case SVM_EXIT_READ_DR7: + case SVM_EXIT_WRITE_DR0: + case SVM_EXIT_WRITE_DR1: + case SVM_EXIT_WRITE_DR2: + case SVM_EXIT_WRITE_DR3: + case SVM_EXIT_WRITE_DR4: + case SVM_EXIT_WRITE_DR5: + case SVM_EXIT_WRITE_DR6: + case SVM_EXIT_WRITE_DR7: { u32 bit = 1U << (exit_code - SVM_EXIT_READ_DR0); if (svm->nested.intercept_dr & bit) vmexit = NESTED_EXIT_DONE; break; } - case SVM_EXIT_EXCP_BASE ... 
SVM_EXIT_EXCP_BASE + 0x1f: { + case SVM_EXIT_EXCP_BASE: + case SVM_EXIT_EXCP_BASE + 0x1: + case SVM_EXIT_EXCP_BASE + 0x2: + case SVM_EXIT_EXCP_BASE + 0x3: + case SVM_EXIT_EXCP_BASE + 0x4: + case SVM_EXIT_EXCP_BASE + 0x5: + case SVM_EXIT_EXCP_BASE + 0x6: + case SVM_EXIT_EXCP_BASE + 0x7: + case SVM_EXIT_EXCP_BASE + 0x8: + case SVM_EXIT_EXCP_BASE + 0x9: + case SVM_EXIT_EXCP_BASE + 0xa: + case SVM_EXIT_EXCP_BASE + 0xb: + case SVM_EXIT_EXCP_BASE + 0xc: + case SVM_EXIT_EXCP_BASE + 0xd: + case SVM_EXIT_EXCP_BASE + 0xe: + case SVM_EXIT_EXCP_BASE + 0xf: + case SVM_EXIT_EXCP_BASE + 0x10: + case SVM_EXIT_EXCP_BASE + 0x11: + case SVM_EXIT_EXCP_BASE + 0x12: + case SVM_EXIT_EXCP_BASE + 0x13: + case SVM_EXIT_EXCP_BASE + 0x14: + case SVM_EXIT_EXCP_BASE + 0x15: + case SVM_EXIT_EXCP_BASE + 0x16: + case SVM_EXIT_EXCP_BASE + 0x17: + case SVM_EXIT_EXCP_BASE + 0x18: + case SVM_EXIT_EXCP_BASE + 0x19: + case SVM_EXIT_EXCP_BASE + 0x1a: + case SVM_EXIT_EXCP_BASE + 0x1b: + case SVM_EXIT_EXCP_BASE + 0x1c: + case SVM_EXIT_EXCP_BASE + 0x1d: + case SVM_EXIT_EXCP_BASE + 0x1e: + case SVM_EXIT_EXCP_BASE + 0x1f: { u32 excp_bits = 1 << (exit_code - SVM_EXIT_EXCP_BASE); if (svm->nested.intercept_exceptions & excp_bits) vmexit = NESTED_EXIT_DONE; - /* async page fault always cause vmexit */ - else if ((exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR) && - svm->apf_reason != 0) - vmexit = NESTED_EXIT_DONE; break; } case SVM_EXIT_ERR: { @@ -2680,16 +2380,9 @@ static int nested_svm_vmexit(struct vcpu_svm *svm) struct vmcb *nested_vmcb; struct vmcb *hsave = svm->nested.hsave; struct vmcb *vmcb = svm->vmcb; - struct page *page; + PMDL kmap_mdl; - trace_kvm_nested_vmexit_inject(vmcb->control.exit_code, - vmcb->control.exit_info_1, - vmcb->control.exit_info_2, - vmcb->control.exit_int_info, - vmcb->control.exit_int_info_err, - KVM_ISA_SVM); - - nested_vmcb = nested_svm_map(svm, svm->nested.vmcb, &page); + nested_vmcb = nested_svm_map(svm, svm->nested.vmcb, &kmap_mdl); if (!nested_vmcb) return 1; @@ -2789,7 +2482,7 @@ static int nested_svm_vmexit(struct vcpu_svm *svm) mark_all_dirty(svm->vmcb); - nested_svm_unmap(page); + nested_svm_unmap(kmap_mdl); nested_svm_uninit_mmu_context(&svm->vcpu); kvm_mmu_reset_context(&svm->vcpu); @@ -2850,12 +2543,12 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm) struct vmcb *nested_vmcb; struct vmcb *hsave = svm->nested.hsave; struct vmcb *vmcb = svm->vmcb; - struct page *page; + PMDL kmap_mdl; u64 vmcb_gpa; vmcb_gpa = svm->vmcb->save.rax; - nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page); + nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &kmap_mdl); if (!nested_vmcb) return false; @@ -2865,22 +2558,11 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm) nested_vmcb->control.exit_info_1 = 0; nested_vmcb->control.exit_info_2 = 0; - nested_svm_unmap(page); + nested_svm_unmap(kmap_mdl); return false; } - trace_kvm_nested_vmrun(svm->vmcb->save.rip, vmcb_gpa, - nested_vmcb->save.rip, - nested_vmcb->control.int_ctl, - nested_vmcb->control.event_inj, - nested_vmcb->control.nested_ctl); - - trace_kvm_nested_intercepts(nested_vmcb->control.intercept_cr & 0xffff, - nested_vmcb->control.intercept_cr >> 16, - nested_vmcb->control.intercept_exceptions, - nested_vmcb->control.intercept); - /* Clear internal status */ kvm_clear_exception_queue(&svm->vcpu); kvm_clear_interrupt_queue(&svm->vcpu); @@ -2985,7 +2667,7 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm) svm->vmcb->control.event_inj = nested_vmcb->control.event_inj; svm->vmcb->control.event_inj_err = 
nested_vmcb->control.event_inj_err; - nested_svm_unmap(page); + nested_svm_unmap(kmap_mdl); /* Enter Guest-Mode */ enter_guest_mode(&svm->vcpu); @@ -3024,12 +2706,12 @@ static void nested_svm_vmloadsave(struct vmcb *from_vmcb, struct vmcb *to_vmcb) static int vmload_interception(struct vcpu_svm *svm) { struct vmcb *nested_vmcb; - struct page *page; + PMDL kmap_mdl; if (nested_svm_check_permissions(svm)) return 1; - nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page); + nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &kmap_mdl); if (!nested_vmcb) return 1; @@ -3037,7 +2719,7 @@ static int vmload_interception(struct vcpu_svm *svm) skip_emulated_instruction(&svm->vcpu); nested_svm_vmloadsave(nested_vmcb, svm->vmcb); - nested_svm_unmap(page); + nested_svm_unmap(kmap_mdl); return 1; } @@ -3045,12 +2727,12 @@ static int vmload_interception(struct vcpu_svm *svm) static int vmsave_interception(struct vcpu_svm *svm) { struct vmcb *nested_vmcb; - struct page *page; + PMDL kmap_mdl; if (nested_svm_check_permissions(svm)) return 1; - nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page); + nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &kmap_mdl); if (!nested_vmcb) return 1; @@ -3058,7 +2740,7 @@ static int vmsave_interception(struct vcpu_svm *svm) skip_emulated_instruction(&svm->vcpu); nested_svm_vmloadsave(svm->vmcb, nested_vmcb); - nested_svm_unmap(page); + nested_svm_unmap(kmap_mdl); return 1; } @@ -3098,7 +2780,7 @@ static int stgi_interception(struct vcpu_svm *svm) svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; skip_emulated_instruction(&svm->vcpu); - kvm_make_request(KVM_REQ_EVENT, &svm->vcpu); + kvm_make_request(GVM_REQ_EVENT, &svm->vcpu); enable_gif(svm); @@ -3129,9 +2811,6 @@ static int invlpga_interception(struct vcpu_svm *svm) { struct kvm_vcpu *vcpu = &svm->vcpu; - trace_kvm_invlpga(svm->vmcb->save.rip, kvm_register_read(&svm->vcpu, VCPU_REGS_RCX), - kvm_register_read(&svm->vcpu, VCPU_REGS_RAX)); - /* Let's treat INVLPGA the same as INVLPG (can be optimized!) 
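The nested_svm_map()/nested_svm_unmap() pair above (also used by vmload_interception() and vmsave_interception()) follows the usual Windows kernel sequence for temporarily mapping guest memory: describe the host virtual range with an MDL, pin the backing pages, map them, then tear everything down in reverse order. A stripped-down sketch with the raw WDK calls; the port's kmap()/kvm_release_page() wrappers are assumed to do roughly this, and the function names below are illustrative:

static void *map_guest_page_sketch(void *hva, PMDL *out_mdl)
{
        PMDL mdl;
        void *va;

        mdl = IoAllocateMdl(hva, PAGE_SIZE, FALSE, FALSE, NULL);
        if (!mdl)
                return NULL;

        /*
         * Pin the pages so they cannot be reclaimed while mapped. In
         * production code MmProbeAndLockPages() must be wrapped in
         * __try/__except, since it raises an exception on failure.
         */
        MmProbeAndLockPages(mdl, KernelMode, IoWriteAccess);

        /* Map the locked pages into system address space. */
        va = MmMapLockedPagesSpecifyCache(mdl, KernelMode, MmCached,
                                          NULL, FALSE, NormalPagePriority);
        if (!va) {
                MmUnlockPages(mdl);
                IoFreeMdl(mdl);
                return NULL;
        }

        *out_mdl = mdl;
        return va;
}

static void unmap_guest_page_sketch(void *va, PMDL mdl)
{
        MmUnmapLockedPages(va, mdl);
        MmUnlockPages(mdl);
        IoFreeMdl(mdl);
}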
*/ kvm_mmu_invlpg(vcpu, kvm_register_read(&svm->vcpu, VCPU_REGS_RAX)); @@ -3142,8 +2821,6 @@ static int invlpga_interception(struct vcpu_svm *svm) static int skinit_interception(struct vcpu_svm *svm) { - trace_kvm_skinit(svm->vmcb->save.rip, kvm_register_read(&svm->vcpu, VCPU_REGS_RAX)); - kvm_queue_exception(&svm->vcpu, UD_VECTOR); return 1; } @@ -3227,8 +2904,8 @@ static int task_switch_interception(struct vcpu_svm *svm) if (kvm_task_switch(&svm->vcpu, tss_selector, int_vec, reason, has_error_code, error_code) == EMULATE_FAIL) { - svm->vcpu.run->exit_reason = KVM_EXIT_INTERNAL_ERROR; - svm->vcpu.run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; + svm->vcpu.run->exit_reason = GVM_EXIT_INTERNAL_ERROR; + svm->vcpu.run->internal.suberror = GVM_INTERNAL_ERROR_EMULATION; svm->vcpu.run->internal.ndata = 0; return 0; } @@ -3248,7 +2925,7 @@ static int iret_interception(struct vcpu_svm *svm) clr_intercept(svm, INTERCEPT_IRET); svm->vcpu.arch.hflags |= HF_IRET_MASK; svm->nmi_iret_rip = kvm_rip_read(&svm->vcpu); - kvm_make_request(KVM_REQ_EVENT, &svm->vcpu); + kvm_make_request(GVM_REQ_EVENT, &svm->vcpu); return 1; } @@ -3269,6 +2946,7 @@ static int emulate_on_interception(struct vcpu_svm *svm) static int rdpmc_interception(struct vcpu_svm *svm) { +#if 0 int err; if (!static_cpu_has(X86_FEATURE_NRIPS)) @@ -3276,14 +2954,15 @@ static int rdpmc_interception(struct vcpu_svm *svm) err = kvm_rdpmc(&svm->vcpu); kvm_complete_insn_gp(&svm->vcpu, err); +#endif return 1; } static bool check_selective_cr0_intercepted(struct vcpu_svm *svm, - unsigned long val) + size_t val) { - unsigned long cr0 = svm->vcpu.arch.cr0; + size_t cr0 = svm->vcpu.arch.cr0; bool ret = false; u64 intercept; @@ -3309,7 +2988,7 @@ static bool check_selective_cr0_intercepted(struct vcpu_svm *svm, static int cr_interception(struct vcpu_svm *svm) { int reg, cr; - unsigned long val; + size_t val; int err; if (!static_cpu_has(X86_FEATURE_DECODEASSISTS)) @@ -3346,7 +3025,7 @@ static int cr_interception(struct vcpu_svm *svm) err = kvm_set_cr8(&svm->vcpu, val); break; default: - WARN(1, "unhandled write to CR%d", cr); + //WARN(1, "unhandled write to CR%d", cr); kvm_queue_exception(&svm->vcpu, UD_VECTOR); return 1; } @@ -3368,7 +3047,7 @@ static int cr_interception(struct vcpu_svm *svm) val = kvm_get_cr8(&svm->vcpu); break; default: - WARN(1, "unhandled read from CR%d", cr); + //WARN(1, "unhandled read from CR%d", cr); kvm_queue_exception(&svm->vcpu, UD_VECTOR); return 1; } @@ -3382,7 +3061,7 @@ static int cr_interception(struct vcpu_svm *svm) static int dr_interception(struct vcpu_svm *svm) { int reg, dr; - unsigned long val; + size_t val; if (svm->vcpu.guest_debug == 0) { /* @@ -3391,7 +3070,7 @@ static int dr_interception(struct vcpu_svm *svm) * retrieve the full state of the debug registers. 
*/ clr_dr_intercepts(svm); - svm->vcpu.arch.switch_db_regs |= KVM_DEBUGREG_WONT_EXIT; + svm->vcpu.arch.switch_db_regs |= GVM_DEBUGREG_WONT_EXIT; return 1; } @@ -3430,7 +3109,7 @@ static int cr8_write_interception(struct vcpu_svm *svm) return r; if (cr8_prev <= kvm_get_cr8(&svm->vcpu)) return r; - kvm_run->exit_reason = KVM_EXIT_SET_TPR; + kvm_run->exit_reason = GVM_EXIT_SET_TPR; return 0; } @@ -3440,9 +3119,7 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) switch (msr_info->index) { case MSR_IA32_TSC: { - msr_info->data = svm->vmcb->control.tsc_offset + - kvm_scale_tsc(vcpu, rdtsc()); - + msr_info->data = svm->vmcb->control.tsc_offset + rdtsc(); break; } case MSR_STAR: @@ -3536,11 +3213,8 @@ static int rdmsr_interception(struct vcpu_svm *svm) msr_info.index = ecx; msr_info.host_initiated = false; if (svm_get_msr(&svm->vcpu, &msr_info)) { - trace_kvm_msr_read_ex(ecx); kvm_inject_gp(&svm->vcpu, 0); } else { - trace_kvm_msr_read(ecx, msr_info.data); - kvm_register_write(&svm->vcpu, VCPU_REGS_RAX, msr_info.data & 0xffffffff); kvm_register_write(&svm->vcpu, VCPU_REGS_RDX, @@ -3624,7 +3298,6 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) * svm_vcpu_put. */ svm->tsc_aux = data; - wrmsrl(MSR_TSC_AUX, svm->tsc_aux); break; case MSR_IA32_DEBUGCTLMSR: if (!boot_cpu_has(X86_FEATURE_LBRV)) { @@ -3672,10 +3345,8 @@ static int wrmsr_interception(struct vcpu_svm *svm) svm->next_rip = kvm_rip_read(&svm->vcpu) + 2; if (kvm_set_msr(&svm->vcpu, &msr)) { - trace_kvm_msr_write_ex(ecx, data); kvm_inject_gp(&svm->vcpu, 0); } else { - trace_kvm_msr_write(ecx, data); skip_emulated_instruction(&svm->vcpu); } return 1; @@ -3691,7 +3362,7 @@ static int msr_interception(struct vcpu_svm *svm) static int interrupt_window_interception(struct vcpu_svm *svm) { - kvm_make_request(KVM_REQ_EVENT, &svm->vcpu); + kvm_make_request(GVM_REQ_EVENT, &svm->vcpu); svm_clear_vintr(svm); svm->vmcb->control.int_ctl &= ~V_IRQ_MASK; mark_dirty(svm->vmcb, VMCB_INTR); @@ -3699,12 +3370,6 @@ static int interrupt_window_interception(struct vcpu_svm *svm) return 1; } -static int pause_interception(struct vcpu_svm *svm) -{ - kvm_vcpu_on_spin(&(svm->vcpu)); - return 1; -} - static int nop_interception(struct vcpu_svm *svm) { skip_emulated_instruction(&(svm->vcpu)); @@ -3735,11 +3400,8 @@ static int avic_incomplete_ipi_interception(struct vcpu_svm *svm) u32 icrh = svm->vmcb->control.exit_info_1 >> 32; u32 icrl = svm->vmcb->control.exit_info_1; u32 id = svm->vmcb->control.exit_info_2 >> 32; - u32 index = svm->vmcb->control.exit_info_2 & 0xFF; struct kvm_lapic *apic = svm->vcpu.arch.apic; - trace_kvm_avic_incomplete_ipi(svm->vcpu.vcpu_id, icrh, icrl, id, index); - switch (id) { case AVIC_IPI_FAILURE_INVALID_INT_TYPE: /* @@ -3760,7 +3422,6 @@ static int avic_incomplete_ipi_interception(struct vcpu_svm *svm) int i; struct kvm_vcpu *vcpu; struct kvm *kvm = svm->vcpu.kvm; - struct kvm_lapic *apic = svm->vcpu.arch.apic; /* * At this point, we expect that the AVIC HW has already @@ -3769,9 +3430,9 @@ static int avic_incomplete_ipi_interception(struct vcpu_svm *svm) */ kvm_for_each_vcpu(i, vcpu, kvm) { bool m = kvm_apic_match_dest(vcpu, apic, - icrl & KVM_APIC_SHORT_MASK, + icrl & GVM_APIC_SHORT_MASK, GET_APIC_DEST_FIELD(icrh), - icrl & KVM_APIC_DEST_MASK); + icrl & GVM_APIC_DEST_MASK); if (m && !avic_vcpu_is_running(vcpu)) kvm_vcpu_wake_up(vcpu); @@ -3781,7 +3442,7 @@ static int avic_incomplete_ipi_interception(struct vcpu_svm *svm) case AVIC_IPI_FAILURE_INVALID_TARGET: break; case 
AVIC_IPI_FAILURE_INVALID_BACKING_PAGE: - WARN_ONCE(1, "Invalid backing page\n"); + //WARN_ONCE(1, "Invalid backing page\n"); break; default: pr_err("Unknown IPI interception\n"); @@ -3830,7 +3491,7 @@ static int avic_ldr_write(struct kvm_vcpu *vcpu, u8 g_physical_id, u32 ldr, if (!entry) return -EINVAL; - new_entry = READ_ONCE(*entry); + READ_ONCE(*entry, new_entry); new_entry &= ~AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK; new_entry |= (g_physical_id & AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK); if (valid) @@ -3975,17 +3636,10 @@ static int avic_unaccelerated_access_interception(struct vcpu_svm *svm) int ret = 0; u32 offset = svm->vmcb->control.exit_info_1 & AVIC_UNACCEL_ACCESS_OFFSET_MASK; - u32 vector = svm->vmcb->control.exit_info_2 & - AVIC_UNACCEL_ACCESS_VECTOR_MASK; - bool write = (svm->vmcb->control.exit_info_1 >> 32) & - AVIC_UNACCEL_ACCESS_WRITE_MASK; bool trap = is_avic_unaccelerated_access_trap(offset); - trace_kvm_avic_unaccelerated_access(svm->vcpu.vcpu_id, offset, - trap, write, vector); if (trap) { /* Handling Trap */ - WARN_ONCE(!write, "svm: Handling trap read.\n"); ret = avic_unaccel_trap_write(svm); } else { /* Handling Fault */ @@ -4025,7 +3679,6 @@ static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = { [SVM_EXIT_EXCP_BASE + BP_VECTOR] = bp_interception, [SVM_EXIT_EXCP_BASE + UD_VECTOR] = ud_interception, [SVM_EXIT_EXCP_BASE + PF_VECTOR] = pf_interception, - [SVM_EXIT_EXCP_BASE + NM_VECTOR] = nm_interception, [SVM_EXIT_EXCP_BASE + MC_VECTOR] = mc_interception, [SVM_EXIT_EXCP_BASE + AC_VECTOR] = ac_interception, [SVM_EXIT_INTR] = intr_interception, @@ -4037,7 +3690,6 @@ static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = { [SVM_EXIT_CPUID] = cpuid_interception, [SVM_EXIT_IRET] = iret_interception, [SVM_EXIT_INVD] = emulate_on_interception, - [SVM_EXIT_PAUSE] = pause_interception, [SVM_EXIT_HLT] = halt_interception, [SVM_EXIT_INVLPG] = invlpg_interception, [SVM_EXIT_INVLPGA] = invlpga_interception, @@ -4046,7 +3698,6 @@ static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = { [SVM_EXIT_TASK_SWITCH] = task_switch_interception, [SVM_EXIT_SHUTDOWN] = shutdown_interception, [SVM_EXIT_VMRUN] = vmrun_interception, - [SVM_EXIT_VMMCALL] = vmmcall_interception, [SVM_EXIT_VMLOAD] = vmload_interception, [SVM_EXIT_VMSAVE] = vmsave_interception, [SVM_EXIT_STGI] = stgi_interception, @@ -4185,8 +3836,6 @@ static int handle_exit(struct kvm_vcpu *vcpu) struct kvm_run *kvm_run = vcpu->run; u32 exit_code = svm->vmcb->control.exit_code; - trace_kvm_exit(exit_code, vcpu, KVM_ISA_SVM); - if (!is_cr_intercept(svm, INTERCEPT_CR0_WRITE)) vcpu->arch.cr0 = svm->vmcb->save.cr0; if (npt_enabled) @@ -4202,13 +3851,6 @@ static int handle_exit(struct kvm_vcpu *vcpu) if (is_guest_mode(vcpu)) { int vmexit; - trace_kvm_nested_vmexit(svm->vmcb->save.rip, exit_code, - svm->vmcb->control.exit_info_1, - svm->vmcb->control.exit_info_2, - svm->vmcb->control.exit_int_info, - svm->vmcb->control.exit_int_info_err, - KVM_ISA_SVM); - vmexit = nested_svm_exit_special(svm); if (vmexit == NESTED_EXIT_CONTINUE) @@ -4221,10 +3863,10 @@ static int handle_exit(struct kvm_vcpu *vcpu) svm_complete_interrupts(svm); if (svm->vmcb->control.exit_code == SVM_EXIT_ERR) { - kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY; + kvm_run->exit_reason = GVM_EXIT_FAIL_ENTRY; kvm_run->fail_entry.hardware_entry_failure_reason = svm->vmcb->control.exit_code; - pr_err("KVM: FAILED VMRUN WITH VMCB:\n"); + pr_err("kvm: FAILED VMRUN WITH VMCB:\n"); dump_vmcb(vcpu); return 0; } @@ -4240,7 +3882,7 @@ static int 
handle_exit(struct kvm_vcpu *vcpu) if (exit_code >= ARRAY_SIZE(svm_exit_handlers) || !svm_exit_handlers[exit_code]) { - WARN_ONCE(1, "svm: unexpected exit reason 0x%x\n", exit_code); + //WARN_ONCE(1, "svm: unexpected exit reason 0x%x\n", exit_code); kvm_queue_exception(vcpu, UD_VECTOR); return 1; } @@ -4297,7 +3939,6 @@ static void svm_set_irq(struct kvm_vcpu *vcpu) BUG_ON(!(gif_set(svm))); - trace_kvm_inj_virq(vcpu->arch.interrupt.nr); ++vcpu->stat.irq_injections; svm->vmcb->control.event_inj = vcpu->arch.interrupt.nr | @@ -4362,11 +4003,6 @@ static void svm_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) return; } -static void svm_sync_pir_to_irr(struct kvm_vcpu *vcpu) -{ - return; -} - static void svm_deliver_avic_intr(struct kvm_vcpu *vcpu, int vec) { kvm_lapic_set_irr(vec, vcpu->arch.apic); @@ -4379,209 +4015,6 @@ static void svm_deliver_avic_intr(struct kvm_vcpu *vcpu, int vec) kvm_vcpu_wake_up(vcpu); } -static void svm_ir_list_del(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi) -{ - unsigned long flags; - struct amd_svm_iommu_ir *cur; - - spin_lock_irqsave(&svm->ir_list_lock, flags); - list_for_each_entry(cur, &svm->ir_list, node) { - if (cur->data != pi->ir_data) - continue; - list_del(&cur->node); - kfree(cur); - break; - } - spin_unlock_irqrestore(&svm->ir_list_lock, flags); -} - -static int svm_ir_list_add(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi) -{ - int ret = 0; - unsigned long flags; - struct amd_svm_iommu_ir *ir; - - /** - * In some cases, the existing irte is updaed and re-set, - * so we need to check here if it's already been * added - * to the ir_list. - */ - if (pi->ir_data && (pi->prev_ga_tag != 0)) { - struct kvm *kvm = svm->vcpu.kvm; - u32 vcpu_id = AVIC_GATAG_TO_VCPUID(pi->prev_ga_tag); - struct kvm_vcpu *prev_vcpu = kvm_get_vcpu_by_id(kvm, vcpu_id); - struct vcpu_svm *prev_svm; - - if (!prev_vcpu) { - ret = -EINVAL; - goto out; - } - - prev_svm = to_svm(prev_vcpu); - svm_ir_list_del(prev_svm, pi); - } - - /** - * Allocating new amd_iommu_pi_data, which will get - * add to the per-vcpu ir_list. - */ - ir = kzalloc(sizeof(struct amd_svm_iommu_ir), GFP_KERNEL); - if (!ir) { - ret = -ENOMEM; - goto out; - } - ir->data = pi->ir_data; - - spin_lock_irqsave(&svm->ir_list_lock, flags); - list_add(&ir->node, &svm->ir_list); - spin_unlock_irqrestore(&svm->ir_list_lock, flags); -out: - return ret; -} - -/** - * Note: - * The HW cannot support posting multicast/broadcast - * interrupts to a vCPU. So, we still use legacy interrupt - * remapping for these kind of interrupts. - * - * For lowest-priority interrupts, we only support - * those with single CPU as the destination, e.g. user - * configures the interrupts via /proc/irq or uses - * irqbalance to make the interrupts single-CPU. 
- */ -static int -get_pi_vcpu_info(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e, - struct vcpu_data *vcpu_info, struct vcpu_svm **svm) -{ - struct kvm_lapic_irq irq; - struct kvm_vcpu *vcpu = NULL; - - kvm_set_msi_irq(kvm, e, &irq); - - if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu)) { - pr_debug("SVM: %s: use legacy intr remap mode for irq %u\n", - __func__, irq.vector); - return -1; - } - - pr_debug("SVM: %s: use GA mode for irq %u\n", __func__, - irq.vector); - *svm = to_svm(vcpu); - vcpu_info->pi_desc_addr = page_to_phys((*svm)->avic_backing_page); - vcpu_info->vector = irq.vector; - - return 0; -} - -/* - * svm_update_pi_irte - set IRTE for Posted-Interrupts - * - * @kvm: kvm - * @host_irq: host irq of the interrupt - * @guest_irq: gsi of the interrupt - * @set: set or unset PI - * returns 0 on success, < 0 on failure - */ -static int svm_update_pi_irte(struct kvm *kvm, unsigned int host_irq, - uint32_t guest_irq, bool set) -{ - struct kvm_kernel_irq_routing_entry *e; - struct kvm_irq_routing_table *irq_rt; - int idx, ret = -EINVAL; - - if (!kvm_arch_has_assigned_device(kvm) || - !irq_remapping_cap(IRQ_POSTING_CAP)) - return 0; - - pr_debug("SVM: %s: host_irq=%#x, guest_irq=%#x, set=%#x\n", - __func__, host_irq, guest_irq, set); - - idx = srcu_read_lock(&kvm->irq_srcu); - irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu); - WARN_ON(guest_irq >= irq_rt->nr_rt_entries); - - hlist_for_each_entry(e, &irq_rt->map[guest_irq], link) { - struct vcpu_data vcpu_info; - struct vcpu_svm *svm = NULL; - - if (e->type != KVM_IRQ_ROUTING_MSI) - continue; - - /** - * Here, we setup with legacy mode in the following cases: - * 1. When cannot target interrupt to a specific vcpu. - * 2. Unsetting posted interrupt. - * 3. APIC virtialization is disabled for the vcpu. - */ - if (!get_pi_vcpu_info(kvm, e, &vcpu_info, &svm) && set && - kvm_vcpu_apicv_active(&svm->vcpu)) { - struct amd_iommu_pi_data pi; - - /* Try to enable guest_mode in IRTE */ - pi.base = page_to_phys(svm->avic_backing_page) & AVIC_HPA_MASK; - pi.ga_tag = AVIC_GATAG(kvm->arch.avic_vm_id, - svm->vcpu.vcpu_id); - pi.is_guest_mode = true; - pi.vcpu_data = &vcpu_info; - ret = irq_set_vcpu_affinity(host_irq, &pi); - - /** - * Here, we successfully setting up vcpu affinity in - * IOMMU guest mode. Now, we need to store the posted - * interrupt information in a per-vcpu ir_list so that - * we can reference to them directly when we update vcpu - * scheduling information in IOMMU irte. - */ - if (!ret && pi.is_guest_mode) - svm_ir_list_add(svm, &pi); - } else { - /* Use legacy mode in IRTE */ - struct amd_iommu_pi_data pi; - - /** - * Here, pi is used to: - * - Tell IOMMU to use legacy mode for this interrupt. - * - Retrieve ga_tag of prior interrupt remapping data. - */ - pi.is_guest_mode = false; - ret = irq_set_vcpu_affinity(host_irq, &pi); - - /** - * Check if the posted interrupt was previously - * setup with the guest_mode by checking if the ga_tag - * was cached. If so, we need to clean up the per-vcpu - * ir_list. 
- */ - if (!ret && pi.prev_ga_tag) { - int id = AVIC_GATAG_TO_VCPUID(pi.prev_ga_tag); - struct kvm_vcpu *vcpu; - - vcpu = kvm_get_vcpu_by_id(kvm, id); - if (vcpu) - svm_ir_list_del(to_svm(vcpu), &pi); - } - } - - if (!ret && svm) { - trace_kvm_pi_irte_update(svm->vcpu.vcpu_id, - host_irq, e->gsi, - vcpu_info.vector, - vcpu_info.pi_desc_addr, set); - } - - if (ret < 0) { - pr_err("%s: failed to update PI IRTE\n", __func__); - goto out; - } - } - - ret = 0; -out: - srcu_read_unlock(&kvm->irq_srcu, idx); - return ret; -} - static int svm_nmi_allowed(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -4682,10 +4115,6 @@ static void svm_flush_tlb(struct kvm_vcpu *vcpu) svm->asid_generation--; } -static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu) -{ -} - static inline void sync_cr8_to_lapic(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -4729,7 +4158,7 @@ static void svm_complete_interrupts(struct vcpu_svm *svm) if ((svm->vcpu.arch.hflags & HF_IRET_MASK) && kvm_rip_read(&svm->vcpu) != svm->nmi_iret_rip) { svm->vcpu.arch.hflags &= ~(HF_NMI_MASK | HF_IRET_MASK); - kvm_make_request(KVM_REQ_EVENT, &svm->vcpu); + kvm_make_request(GVM_REQ_EVENT, &svm->vcpu); } svm->vcpu.arch.nmi_injected = false; @@ -4739,7 +4168,7 @@ static void svm_complete_interrupts(struct vcpu_svm *svm) if (!(exitintinfo & SVM_EXITINTINFO_VALID)) return; - kvm_make_request(KVM_REQ_EVENT, &svm->vcpu); + kvm_make_request(GVM_REQ_EVENT, &svm->vcpu); vector = exitintinfo & SVM_EXITINTINFO_VEC_MASK; type = exitintinfo & SVM_EXITINTINFO_TYPE_MASK; @@ -4813,87 +4242,14 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) local_irq_enable(); - asm volatile ( - "push %%" _ASM_BP "; \n\t" - "mov %c[rbx](%[svm]), %%" _ASM_BX " \n\t" - "mov %c[rcx](%[svm]), %%" _ASM_CX " \n\t" - "mov %c[rdx](%[svm]), %%" _ASM_DX " \n\t" - "mov %c[rsi](%[svm]), %%" _ASM_SI " \n\t" - "mov %c[rdi](%[svm]), %%" _ASM_DI " \n\t" - "mov %c[rbp](%[svm]), %%" _ASM_BP " \n\t" -#ifdef CONFIG_X86_64 - "mov %c[r8](%[svm]), %%r8 \n\t" - "mov %c[r9](%[svm]), %%r9 \n\t" - "mov %c[r10](%[svm]), %%r10 \n\t" - "mov %c[r11](%[svm]), %%r11 \n\t" - "mov %c[r12](%[svm]), %%r12 \n\t" - "mov %c[r13](%[svm]), %%r13 \n\t" - "mov %c[r14](%[svm]), %%r14 \n\t" - "mov %c[r15](%[svm]), %%r15 \n\t" -#endif - - /* Enter guest mode */ - "push %%" _ASM_AX " \n\t" - "mov %c[vmcb](%[svm]), %%" _ASM_AX " \n\t" - __ex(SVM_VMLOAD) "\n\t" - __ex(SVM_VMRUN) "\n\t" - __ex(SVM_VMSAVE) "\n\t" - "pop %%" _ASM_AX " \n\t" - - /* Save guest registers, load host registers */ - "mov %%" _ASM_BX ", %c[rbx](%[svm]) \n\t" - "mov %%" _ASM_CX ", %c[rcx](%[svm]) \n\t" - "mov %%" _ASM_DX ", %c[rdx](%[svm]) \n\t" - "mov %%" _ASM_SI ", %c[rsi](%[svm]) \n\t" - "mov %%" _ASM_DI ", %c[rdi](%[svm]) \n\t" - "mov %%" _ASM_BP ", %c[rbp](%[svm]) \n\t" -#ifdef CONFIG_X86_64 - "mov %%r8, %c[r8](%[svm]) \n\t" - "mov %%r9, %c[r9](%[svm]) \n\t" - "mov %%r10, %c[r10](%[svm]) \n\t" - "mov %%r11, %c[r11](%[svm]) \n\t" - "mov %%r12, %c[r12](%[svm]) \n\t" - "mov %%r13, %c[r13](%[svm]) \n\t" - "mov %%r14, %c[r14](%[svm]) \n\t" - "mov %%r15, %c[r15](%[svm]) \n\t" -#endif - "pop %%" _ASM_BP - : - : [svm]"a"(svm), - [vmcb]"i"(offsetof(struct vcpu_svm, vmcb_pa)), - [rbx]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RBX])), - [rcx]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RCX])), - [rdx]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RDX])), - [rsi]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RSI])), - [rdi]"i"(offsetof(struct vcpu_svm, 
vcpu.arch.regs[VCPU_REGS_RDI])), - [rbp]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RBP])) -#ifdef CONFIG_X86_64 - , [r8]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R8])), - [r9]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R9])), - [r10]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R10])), - [r11]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R11])), - [r12]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R12])), - [r13]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R13])), - [r14]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R14])), - [r15]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R15])) -#endif - : "cc", "memory" -#ifdef CONFIG_X86_64 - , "rbx", "rcx", "rdx", "rsi", "rdi" - , "r8", "r9", "r10", "r11" , "r12", "r13", "r14", "r15" -#else - , "ebx", "ecx", "edx", "esi", "edi" -#endif - ); + __asm_svm_vcpu_run(svm); #ifdef CONFIG_X86_64 wrmsrl(MSR_GS_BASE, svm->host.gs_base); #else loadsegment(fs, svm->host.fs); -#ifndef CONFIG_X86_32_LAZY_GS loadsegment(gs, svm->host.gs); #endif -#endif reload_tss(vcpu); @@ -4920,10 +4276,6 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING; - /* if exit due to PF check for async PF */ - if (svm->vmcb->control.exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR) - svm->apf_reason = kvm_read_and_reset_pf_reason(); - if (npt_enabled) { vcpu->arch.regs_avail &= ~(1 << VCPU_EXREG_PDPTR); vcpu->arch.regs_dirty &= ~(1 << VCPU_EXREG_PDPTR); @@ -4940,7 +4292,7 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) mark_all_clean(svm->vmcb); } -static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root) +static void svm_set_cr3(struct kvm_vcpu *vcpu, size_t root) { struct vcpu_svm *svm = to_svm(vcpu); @@ -4949,7 +4301,7 @@ static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root) svm_flush_tlb(vcpu); } -static void set_tdp_cr3(struct kvm_vcpu *vcpu, unsigned long root) +static void set_tdp_cr3(struct kvm_vcpu *vcpu, size_t root) { struct vcpu_svm *svm = to_svm(vcpu); @@ -4974,17 +4326,6 @@ static int is_disabled(void) return 0; } -static void -svm_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall) -{ - /* - * Patch in the VMMCALL instruction: - */ - hypercall[0] = 0x0f; - hypercall[1] = 0x01; - hypercall[2] = 0xd9; -} - static void svm_check_processor_compat(void *rtn) { *(int *)rtn = 0; @@ -5008,7 +4349,7 @@ static u64 svm_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) static void svm_cpuid_update(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); - struct kvm_cpuid_entry2 *entry; + struct kvm_cpuid_entry *entry; /* Update nrips enabled cache */ svm->nrips_enabled = !!guest_cpuid_has_nrips(&svm->vcpu); @@ -5021,7 +4362,7 @@ static void svm_cpuid_update(struct kvm_vcpu *vcpu) entry->ecx &= ~bit(X86_FEATURE_X2APIC); } -static void svm_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry) +static void svm_set_supported_cpuid(u32 func, struct kvm_cpuid_entry *entry) { switch (func) { case 0x1: @@ -5059,7 +4400,7 @@ static int svm_get_lpage_level(void) static bool svm_rdtscp_supported(void) { - return boot_cpu_has(X86_FEATURE_RDTSCP); + return false; } static bool svm_invpcid_supported(void) @@ -5082,14 +4423,6 @@ static bool svm_has_wbinvd_exit(void) return true; } -static void svm_fpu_deactivate(struct kvm_vcpu *vcpu) -{ - struct vcpu_svm *svm = to_svm(vcpu); - - set_exception_intercept(svm, NM_VECTOR); - update_cr0_intercept(svm); -} - #define PRE_EX(exit) { .exit_code = (exit), \ 
.stage = X86_ICPT_PRE_EXCEPT, } #define POST_EX(exit) { .exit_code = (exit), \ @@ -5176,7 +4509,7 @@ static int svm_check_intercept(struct kvm_vcpu *vcpu, icpt_info.exit_code += info->modrm_reg; break; case SVM_EXIT_WRITE_CR0: { - unsigned long cr0, val; + size_t cr0, val; u64 intercept; if (info->intercept == x86_intercept_cr_write) @@ -5280,14 +4613,10 @@ static void svm_handle_external_intr(struct kvm_vcpu *vcpu) * We must have an instruction with interrupts enabled, so * the timer interrupt isn't delayed by the interrupt shadow. */ - asm("nop"); + __nop(); local_irq_disable(); } -static void svm_sched_in(struct kvm_vcpu *vcpu, int cpu) -{ -} - static inline void avic_post_state_restore(struct kvm_vcpu *vcpu) { if (avic_handle_apic_id_update(vcpu) != 0) @@ -5297,7 +4626,7 @@ static inline void avic_post_state_restore(struct kvm_vcpu *vcpu) avic_handle_ldr_update(vcpu); } -static struct kvm_x86_ops svm_x86_ops __ro_after_init = { +static struct kvm_x86_ops svm_x86_ops = { .cpu_has_kvm_support = has_svm, .disabled_by_bios = is_disabled, .hardware_setup = svm_hardware_setup, @@ -5315,7 +4644,8 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = { .vm_init = avic_vm_init, .vm_destroy = avic_vm_destroy, - .prepare_guest_switch = svm_prepare_guest_switch, + .save_host_state = svm_save_host_state, + .load_host_state = svm_load_host_state, .vcpu_load = svm_vcpu_load, .vcpu_put = svm_vcpu_put, .vcpu_blocking = svm_vcpu_blocking, @@ -5348,11 +4678,6 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = { .get_rflags = svm_get_rflags, .set_rflags = svm_set_rflags, - .get_pkru = svm_get_pkru, - - .fpu_activate = svm_fpu_activate, - .fpu_deactivate = svm_fpu_deactivate, - .tlb_flush = svm_flush_tlb, .run = svm_vcpu_run, @@ -5360,7 +4685,6 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = { .skip_emulated_instruction = skip_emulated_instruction, .set_interrupt_shadow = svm_set_interrupt_shadow, .get_interrupt_shadow = svm_get_interrupt_shadow, - .patch_hypercall = svm_patch_hypercall, .set_irq = svm_set_irq, .set_nmi = svm_inject_nmi, .queue_exception = svm_queue_exception, @@ -5376,7 +4700,6 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = { .get_enable_apicv = svm_get_enable_apicv, .refresh_apicv_exec_ctrl = svm_refresh_apicv_exec_ctrl, .load_eoi_exitmap = svm_load_eoi_exitmap, - .sync_pir_to_irr = svm_sync_pir_to_irr, .hwapic_irr_update = svm_hwapic_irr_update, .hwapic_isr_update = svm_hwapic_isr_update, .apicv_post_state_restore = avic_post_state_restore, @@ -5407,23 +4730,15 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = { .check_intercept = svm_check_intercept, .handle_external_intr = svm_handle_external_intr, - .sched_in = svm_sched_in, - - .pmu_ops = &amd_pmu_ops, .deliver_posted_interrupt = svm_deliver_avic_intr, - .update_pi_irte = svm_update_pi_irte, }; -static int __init svm_init(void) +int svm_init(void) { - return kvm_init(&svm_x86_ops, sizeof(struct vcpu_svm), - __alignof__(struct vcpu_svm), THIS_MODULE); + return kvm_init(&svm_x86_ops, sizeof(struct vcpu_svm), 0); } -static void __exit svm_exit(void) +void svm_exit(void) { kvm_exit(); } - -module_init(svm_init) -module_exit(svm_exit) diff --git a/arch/x86/kvm/svm_def.h b/arch/x86/kvm/svm_def.h new file mode 100755 index 0000000..2b5ce8e --- /dev/null +++ b/arch/x86/kvm/svm_def.h @@ -0,0 +1,176 @@ +/* + * Kernel-based Virtual Machine driver for Linux + * + * AMD SVM support + * + * Copyright (C) 2006 Qumranet, Inc. + * Copyright 2010 Red Hat, Inc. and/or its affiliates. 
+ * Copyright 2019 Google LLC + * + * Authors: + * Yaniv Kamay <yaniv@qumranet.com> + * Avi Kivity <avi@qumranet.com> + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + */ + +#define pr_fmt(fmt) "SVM: " fmt + +#include <linux/kvm_host.h> + +#include "irq.h" +#include "mmu.h" +#include "kvm_cache_regs.h" +#include "x86.h" +#include "cpuid.h" +#include "pmu.h" + +#include <asm/svm.h> +#include <asm/vmx.h> + +#include <__asm.h> + +#define IOPM_ALLOC_ORDER 2 +#define MSRPM_ALLOC_ORDER 1 + +#define SEG_TYPE_LDT 2 +#define SEG_TYPE_BUSY_TSS16 3 + +#define SVM_FEATURE_NPT (1 << 0) +#define SVM_FEATURE_LBRV (1 << 1) +#define SVM_FEATURE_SVML (1 << 2) +#define SVM_FEATURE_NRIP (1 << 3) +#define SVM_FEATURE_TSC_RATE (1 << 4) +#define SVM_FEATURE_VMCB_CLEAN (1 << 5) +#define SVM_FEATURE_FLUSH_ASID (1 << 6) +#define SVM_FEATURE_DECODE_ASSIST (1 << 7) +#define SVM_FEATURE_PAUSE_FILTER (1 << 10) + +#define SVM_AVIC_DOORBELL 0xc001011b + +#define NESTED_EXIT_HOST 0 /* Exit handled on host level */ +#define NESTED_EXIT_DONE 1 /* Exit caused nested vmexit */ +#define NESTED_EXIT_CONTINUE 2 /* Further checks needed */ + +#define DEBUGCTL_RESERVED_BITS (~(0x3fULL)) + +#define TSC_RATIO_RSVD 0xffffff0000000000ULL +#define TSC_RATIO_MIN 0x0000000000000001ULL +#define TSC_RATIO_MAX 0x000000ffffffffffULL + +#define AVIC_HPA_MASK ~((0xFFFULL << 52) | 0xFFF) + +/* + * 0xff is broadcast, so the max index allowed for physical APIC ID + * table is 0xfe. APIC IDs above 0xff are reserved. + */ +#define AVIC_MAX_PHYSICAL_ID_COUNT 255 + +#define AVIC_UNACCEL_ACCESS_WRITE_MASK 1 +#define AVIC_UNACCEL_ACCESS_OFFSET_MASK 0xFF0 +#define AVIC_UNACCEL_ACCESS_VECTOR_MASK 0xFFFFFFFF + +/* AVIC GATAG is encoded using VM and VCPU IDs */ +#define AVIC_VCPU_ID_BITS 8 +#define AVIC_VCPU_ID_MASK ((1 << AVIC_VCPU_ID_BITS) - 1) + +#define AVIC_VM_ID_BITS 24 +#define AVIC_VM_ID_NR (1 << AVIC_VM_ID_BITS) +#define AVIC_VM_ID_MASK ((1 << AVIC_VM_ID_BITS) - 1) + +#define AVIC_GATAG(x, y) (((x & AVIC_VM_ID_MASK) << AVIC_VCPU_ID_BITS) | \ + (y & AVIC_VCPU_ID_MASK)) +#define AVIC_GATAG_TO_VMID(x) ((x >> AVIC_VCPU_ID_BITS) & AVIC_VM_ID_MASK) +#define AVIC_GATAG_TO_VCPUID(x) (x & AVIC_VCPU_ID_MASK) + +static bool erratum_383_found __read_mostly; + +static const u32 host_save_user_msrs[] = { +#ifdef CONFIG_X86_64 + MSR_STAR, MSR_LSTAR, MSR_CSTAR, MSR_SYSCALL_MASK, MSR_KERNEL_GS_BASE, + MSR_FS_BASE, +#endif + MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, + MSR_TSC_AUX, +}; + +#define NR_HOST_SAVE_USER_MSRS ARRAY_SIZE(host_save_user_msrs) + +struct kvm_vcpu; + +struct nested_state { + struct vmcb *hsave; + u64 hsave_msr; + u64 vm_cr_msr; + u64 vmcb; + + /* These are the merged vectors */ + u32 *msrpm; + + /* gpa pointers to the real vectors */ + u64 vmcb_msrpm; + u64 vmcb_iopm; + + /* A VMEXIT is required but not yet emulated */ + bool exit_required; + + /* cache for intercepts of the guest */ + u32 intercept_cr; + u32 intercept_dr; + u32 intercept_exceptions; + u64 intercept; + + /* Nested Paging related state */ + u64 nested_cr3; +}; + +#define MSRPM_OFFSETS 16 +static u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly; + +/* + * Set osvw_len to higher value when updated Revision Guides + * are published and we know what the new status bits are + */ +static uint64_t osvw_len = 4, osvw_status; + +struct vcpu_svm { + struct kvm_vcpu vcpu; + struct vmcb *vmcb; + size_t vmcb_pa; + struct svm_cpu_data *svm_data; + uint64_t asid_generation; + uint64_t 
sysenter_esp; + uint64_t sysenter_eip; + uint64_t tsc_aux; + + u64 next_rip; + + u64 host_user_msrs[NR_HOST_SAVE_USER_MSRS]; + struct { + u16 fs; + u16 gs; + u16 ldt; + u64 gs_base; + } host; + + u32 *msrpm; + + ulong nmi_iret_rip; + + struct nested_state nested; + + bool nmi_singlestep; + + unsigned int3_injected; + size_t int3_rip; + + /* cached guest cpuid flags for faster access */ + bool nrips_enabled : 1; + + u32 ldr_reg; + struct page *avic_backing_page; + u64 *avic_physical_id_cache; + bool avic_is_running; +}; diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h deleted file mode 100644 index 0a6cc67..0000000 --- a/arch/x86/kvm/trace.h +++ /dev/null @@ -1,1374 +0,0 @@ -#if !defined(_TRACE_KVM_H) || defined(TRACE_HEADER_MULTI_READ) -#define _TRACE_KVM_H - -#include <linux/tracepoint.h> -#include <asm/vmx.h> -#include <asm/svm.h> -#include <asm/clocksource.h> -#include <asm/pvclock-abi.h> - -#undef TRACE_SYSTEM -#define TRACE_SYSTEM kvm - -/* - * Tracepoint for guest mode entry. - */ -TRACE_EVENT(kvm_entry, - TP_PROTO(unsigned int vcpu_id), - TP_ARGS(vcpu_id), - - TP_STRUCT__entry( - __field( unsigned int, vcpu_id ) - ), - - TP_fast_assign( - __entry->vcpu_id = vcpu_id; - ), - - TP_printk("vcpu %u", __entry->vcpu_id) -); - -/* - * Tracepoint for hypercall. - */ -TRACE_EVENT(kvm_hypercall, - TP_PROTO(unsigned long nr, unsigned long a0, unsigned long a1, - unsigned long a2, unsigned long a3), - TP_ARGS(nr, a0, a1, a2, a3), - - TP_STRUCT__entry( - __field( unsigned long, nr ) - __field( unsigned long, a0 ) - __field( unsigned long, a1 ) - __field( unsigned long, a2 ) - __field( unsigned long, a3 ) - ), - - TP_fast_assign( - __entry->nr = nr; - __entry->a0 = a0; - __entry->a1 = a1; - __entry->a2 = a2; - __entry->a3 = a3; - ), - - TP_printk("nr 0x%lx a0 0x%lx a1 0x%lx a2 0x%lx a3 0x%lx", - __entry->nr, __entry->a0, __entry->a1, __entry->a2, - __entry->a3) -); - -/* - * Tracepoint for hypercall. - */ -TRACE_EVENT(kvm_hv_hypercall, - TP_PROTO(__u16 code, bool fast, __u16 rep_cnt, __u16 rep_idx, - __u64 ingpa, __u64 outgpa), - TP_ARGS(code, fast, rep_cnt, rep_idx, ingpa, outgpa), - - TP_STRUCT__entry( - __field( __u16, rep_cnt ) - __field( __u16, rep_idx ) - __field( __u64, ingpa ) - __field( __u64, outgpa ) - __field( __u16, code ) - __field( bool, fast ) - ), - - TP_fast_assign( - __entry->rep_cnt = rep_cnt; - __entry->rep_idx = rep_idx; - __entry->ingpa = ingpa; - __entry->outgpa = outgpa; - __entry->code = code; - __entry->fast = fast; - ), - - TP_printk("code 0x%x %s cnt 0x%x idx 0x%x in 0x%llx out 0x%llx", - __entry->code, __entry->fast ? "fast" : "slow", - __entry->rep_cnt, __entry->rep_idx, __entry->ingpa, - __entry->outgpa) -); - -/* - * Tracepoint for PIO. - */ - -#define KVM_PIO_IN 0 -#define KVM_PIO_OUT 1 - -TRACE_EVENT(kvm_pio, - TP_PROTO(unsigned int rw, unsigned int port, unsigned int size, - unsigned int count, void *data), - TP_ARGS(rw, port, size, count, data), - - TP_STRUCT__entry( - __field( unsigned int, rw ) - __field( unsigned int, port ) - __field( unsigned int, size ) - __field( unsigned int, count ) - __field( unsigned int, val ) - ), - - TP_fast_assign( - __entry->rw = rw; - __entry->port = port; - __entry->size = size; - __entry->count = count; - if (size == 1) - __entry->val = *(unsigned char *)data; - else if (size == 2) - __entry->val = *(unsigned short *)data; - else - __entry->val = *(unsigned int *)data; - ), - - TP_printk("pio_%s at 0x%x size %d count %d val 0x%x %s", - __entry->rw ? 
"write" : "read", - __entry->port, __entry->size, __entry->count, __entry->val, - __entry->count > 1 ? "(...)" : "") -); - -/* - * Tracepoint for fast mmio. - */ -TRACE_EVENT(kvm_fast_mmio, - TP_PROTO(u64 gpa), - TP_ARGS(gpa), - - TP_STRUCT__entry( - __field(u64, gpa) - ), - - TP_fast_assign( - __entry->gpa = gpa; - ), - - TP_printk("fast mmio at gpa 0x%llx", __entry->gpa) -); - -/* - * Tracepoint for cpuid. - */ -TRACE_EVENT(kvm_cpuid, - TP_PROTO(unsigned int function, unsigned long rax, unsigned long rbx, - unsigned long rcx, unsigned long rdx), - TP_ARGS(function, rax, rbx, rcx, rdx), - - TP_STRUCT__entry( - __field( unsigned int, function ) - __field( unsigned long, rax ) - __field( unsigned long, rbx ) - __field( unsigned long, rcx ) - __field( unsigned long, rdx ) - ), - - TP_fast_assign( - __entry->function = function; - __entry->rax = rax; - __entry->rbx = rbx; - __entry->rcx = rcx; - __entry->rdx = rdx; - ), - - TP_printk("func %x rax %lx rbx %lx rcx %lx rdx %lx", - __entry->function, __entry->rax, - __entry->rbx, __entry->rcx, __entry->rdx) -); - -#define AREG(x) { APIC_##x, "APIC_" #x } - -#define kvm_trace_symbol_apic \ - AREG(ID), AREG(LVR), AREG(TASKPRI), AREG(ARBPRI), AREG(PROCPRI), \ - AREG(EOI), AREG(RRR), AREG(LDR), AREG(DFR), AREG(SPIV), AREG(ISR), \ - AREG(TMR), AREG(IRR), AREG(ESR), AREG(ICR), AREG(ICR2), AREG(LVTT), \ - AREG(LVTTHMR), AREG(LVTPC), AREG(LVT0), AREG(LVT1), AREG(LVTERR), \ - AREG(TMICT), AREG(TMCCT), AREG(TDCR), AREG(SELF_IPI), AREG(EFEAT), \ - AREG(ECTRL) -/* - * Tracepoint for apic access. - */ -TRACE_EVENT(kvm_apic, - TP_PROTO(unsigned int rw, unsigned int reg, unsigned int val), - TP_ARGS(rw, reg, val), - - TP_STRUCT__entry( - __field( unsigned int, rw ) - __field( unsigned int, reg ) - __field( unsigned int, val ) - ), - - TP_fast_assign( - __entry->rw = rw; - __entry->reg = reg; - __entry->val = val; - ), - - TP_printk("apic_%s %s = 0x%x", - __entry->rw ? "write" : "read", - __print_symbolic(__entry->reg, kvm_trace_symbol_apic), - __entry->val) -); - -#define trace_kvm_apic_read(reg, val) trace_kvm_apic(0, reg, val) -#define trace_kvm_apic_write(reg, val) trace_kvm_apic(1, reg, val) - -#define KVM_ISA_VMX 1 -#define KVM_ISA_SVM 2 - -/* - * Tracepoint for kvm guest exit: - */ -TRACE_EVENT(kvm_exit, - TP_PROTO(unsigned int exit_reason, struct kvm_vcpu *vcpu, u32 isa), - TP_ARGS(exit_reason, vcpu, isa), - - TP_STRUCT__entry( - __field( unsigned int, exit_reason ) - __field( unsigned long, guest_rip ) - __field( u32, isa ) - __field( u64, info1 ) - __field( u64, info2 ) - ), - - TP_fast_assign( - __entry->exit_reason = exit_reason; - __entry->guest_rip = kvm_rip_read(vcpu); - __entry->isa = isa; - kvm_x86_ops->get_exit_info(vcpu, &__entry->info1, - &__entry->info2); - ), - - TP_printk("reason %s rip 0x%lx info %llx %llx", - (__entry->isa == KVM_ISA_VMX) ? 
- __print_symbolic(__entry->exit_reason, VMX_EXIT_REASONS) : - __print_symbolic(__entry->exit_reason, SVM_EXIT_REASONS), - __entry->guest_rip, __entry->info1, __entry->info2) -); - -/* - * Tracepoint for kvm interrupt injection: - */ -TRACE_EVENT(kvm_inj_virq, - TP_PROTO(unsigned int irq), - TP_ARGS(irq), - - TP_STRUCT__entry( - __field( unsigned int, irq ) - ), - - TP_fast_assign( - __entry->irq = irq; - ), - - TP_printk("irq %u", __entry->irq) -); - -#define EXS(x) { x##_VECTOR, "#" #x } - -#define kvm_trace_sym_exc \ - EXS(DE), EXS(DB), EXS(BP), EXS(OF), EXS(BR), EXS(UD), EXS(NM), \ - EXS(DF), EXS(TS), EXS(NP), EXS(SS), EXS(GP), EXS(PF), \ - EXS(MF), EXS(AC), EXS(MC) - -/* - * Tracepoint for kvm interrupt injection: - */ -TRACE_EVENT(kvm_inj_exception, - TP_PROTO(unsigned exception, bool has_error, unsigned error_code), - TP_ARGS(exception, has_error, error_code), - - TP_STRUCT__entry( - __field( u8, exception ) - __field( u8, has_error ) - __field( u32, error_code ) - ), - - TP_fast_assign( - __entry->exception = exception; - __entry->has_error = has_error; - __entry->error_code = error_code; - ), - - TP_printk("%s (0x%x)", - __print_symbolic(__entry->exception, kvm_trace_sym_exc), - /* FIXME: don't print error_code if not present */ - __entry->has_error ? __entry->error_code : 0) -); - -/* - * Tracepoint for page fault. - */ -TRACE_EVENT(kvm_page_fault, - TP_PROTO(unsigned long fault_address, unsigned int error_code), - TP_ARGS(fault_address, error_code), - - TP_STRUCT__entry( - __field( unsigned long, fault_address ) - __field( unsigned int, error_code ) - ), - - TP_fast_assign( - __entry->fault_address = fault_address; - __entry->error_code = error_code; - ), - - TP_printk("address %lx error_code %x", - __entry->fault_address, __entry->error_code) -); - -/* - * Tracepoint for guest MSR access. - */ -TRACE_EVENT(kvm_msr, - TP_PROTO(unsigned write, u32 ecx, u64 data, bool exception), - TP_ARGS(write, ecx, data, exception), - - TP_STRUCT__entry( - __field( unsigned, write ) - __field( u32, ecx ) - __field( u64, data ) - __field( u8, exception ) - ), - - TP_fast_assign( - __entry->write = write; - __entry->ecx = ecx; - __entry->data = data; - __entry->exception = exception; - ), - - TP_printk("msr_%s %x = 0x%llx%s", - __entry->write ? "write" : "read", - __entry->ecx, __entry->data, - __entry->exception ? " (#GP)" : "") -); - -#define trace_kvm_msr_read(ecx, data) trace_kvm_msr(0, ecx, data, false) -#define trace_kvm_msr_write(ecx, data) trace_kvm_msr(1, ecx, data, false) -#define trace_kvm_msr_read_ex(ecx) trace_kvm_msr(0, ecx, 0, true) -#define trace_kvm_msr_write_ex(ecx, data) trace_kvm_msr(1, ecx, data, true) - -/* - * Tracepoint for guest CR access. - */ -TRACE_EVENT(kvm_cr, - TP_PROTO(unsigned int rw, unsigned int cr, unsigned long val), - TP_ARGS(rw, cr, val), - - TP_STRUCT__entry( - __field( unsigned int, rw ) - __field( unsigned int, cr ) - __field( unsigned long, val ) - ), - - TP_fast_assign( - __entry->rw = rw; - __entry->cr = cr; - __entry->val = val; - ), - - TP_printk("cr_%s %x = 0x%lx", - __entry->rw ? 
"write" : "read", - __entry->cr, __entry->val) -); - -#define trace_kvm_cr_read(cr, val) trace_kvm_cr(0, cr, val) -#define trace_kvm_cr_write(cr, val) trace_kvm_cr(1, cr, val) - -TRACE_EVENT(kvm_pic_set_irq, - TP_PROTO(__u8 chip, __u8 pin, __u8 elcr, __u8 imr, bool coalesced), - TP_ARGS(chip, pin, elcr, imr, coalesced), - - TP_STRUCT__entry( - __field( __u8, chip ) - __field( __u8, pin ) - __field( __u8, elcr ) - __field( __u8, imr ) - __field( bool, coalesced ) - ), - - TP_fast_assign( - __entry->chip = chip; - __entry->pin = pin; - __entry->elcr = elcr; - __entry->imr = imr; - __entry->coalesced = coalesced; - ), - - TP_printk("chip %u pin %u (%s%s)%s", - __entry->chip, __entry->pin, - (__entry->elcr & (1 << __entry->pin)) ? "level":"edge", - (__entry->imr & (1 << __entry->pin)) ? "|masked":"", - __entry->coalesced ? " (coalesced)" : "") -); - -#define kvm_apic_dst_shorthand \ - {0x0, "dst"}, \ - {0x1, "self"}, \ - {0x2, "all"}, \ - {0x3, "all-but-self"} - -TRACE_EVENT(kvm_apic_ipi, - TP_PROTO(__u32 icr_low, __u32 dest_id), - TP_ARGS(icr_low, dest_id), - - TP_STRUCT__entry( - __field( __u32, icr_low ) - __field( __u32, dest_id ) - ), - - TP_fast_assign( - __entry->icr_low = icr_low; - __entry->dest_id = dest_id; - ), - - TP_printk("dst %x vec %u (%s|%s|%s|%s|%s)", - __entry->dest_id, (u8)__entry->icr_low, - __print_symbolic((__entry->icr_low >> 8 & 0x7), - kvm_deliver_mode), - (__entry->icr_low & (1<<11)) ? "logical" : "physical", - (__entry->icr_low & (1<<14)) ? "assert" : "de-assert", - (__entry->icr_low & (1<<15)) ? "level" : "edge", - __print_symbolic((__entry->icr_low >> 18 & 0x3), - kvm_apic_dst_shorthand)) -); - -TRACE_EVENT(kvm_apic_accept_irq, - TP_PROTO(__u32 apicid, __u16 dm, __u8 tm, __u8 vec), - TP_ARGS(apicid, dm, tm, vec), - - TP_STRUCT__entry( - __field( __u32, apicid ) - __field( __u16, dm ) - __field( __u8, tm ) - __field( __u8, vec ) - ), - - TP_fast_assign( - __entry->apicid = apicid; - __entry->dm = dm; - __entry->tm = tm; - __entry->vec = vec; - ), - - TP_printk("apicid %x vec %u (%s|%s)", - __entry->apicid, __entry->vec, - __print_symbolic((__entry->dm >> 8 & 0x7), kvm_deliver_mode), - __entry->tm ? 
"level" : "edge") -); - -TRACE_EVENT(kvm_eoi, - TP_PROTO(struct kvm_lapic *apic, int vector), - TP_ARGS(apic, vector), - - TP_STRUCT__entry( - __field( __u32, apicid ) - __field( int, vector ) - ), - - TP_fast_assign( - __entry->apicid = apic->vcpu->vcpu_id; - __entry->vector = vector; - ), - - TP_printk("apicid %x vector %d", __entry->apicid, __entry->vector) -); - -TRACE_EVENT(kvm_pv_eoi, - TP_PROTO(struct kvm_lapic *apic, int vector), - TP_ARGS(apic, vector), - - TP_STRUCT__entry( - __field( __u32, apicid ) - __field( int, vector ) - ), - - TP_fast_assign( - __entry->apicid = apic->vcpu->vcpu_id; - __entry->vector = vector; - ), - - TP_printk("apicid %x vector %d", __entry->apicid, __entry->vector) -); - -/* - * Tracepoint for nested VMRUN - */ -TRACE_EVENT(kvm_nested_vmrun, - TP_PROTO(__u64 rip, __u64 vmcb, __u64 nested_rip, __u32 int_ctl, - __u32 event_inj, bool npt), - TP_ARGS(rip, vmcb, nested_rip, int_ctl, event_inj, npt), - - TP_STRUCT__entry( - __field( __u64, rip ) - __field( __u64, vmcb ) - __field( __u64, nested_rip ) - __field( __u32, int_ctl ) - __field( __u32, event_inj ) - __field( bool, npt ) - ), - - TP_fast_assign( - __entry->rip = rip; - __entry->vmcb = vmcb; - __entry->nested_rip = nested_rip; - __entry->int_ctl = int_ctl; - __entry->event_inj = event_inj; - __entry->npt = npt; - ), - - TP_printk("rip: 0x%016llx vmcb: 0x%016llx nrip: 0x%016llx int_ctl: 0x%08x " - "event_inj: 0x%08x npt: %s", - __entry->rip, __entry->vmcb, __entry->nested_rip, - __entry->int_ctl, __entry->event_inj, - __entry->npt ? "on" : "off") -); - -TRACE_EVENT(kvm_nested_intercepts, - TP_PROTO(__u16 cr_read, __u16 cr_write, __u32 exceptions, __u64 intercept), - TP_ARGS(cr_read, cr_write, exceptions, intercept), - - TP_STRUCT__entry( - __field( __u16, cr_read ) - __field( __u16, cr_write ) - __field( __u32, exceptions ) - __field( __u64, intercept ) - ), - - TP_fast_assign( - __entry->cr_read = cr_read; - __entry->cr_write = cr_write; - __entry->exceptions = exceptions; - __entry->intercept = intercept; - ), - - TP_printk("cr_read: %04x cr_write: %04x excp: %08x intercept: %016llx", - __entry->cr_read, __entry->cr_write, __entry->exceptions, - __entry->intercept) -); -/* - * Tracepoint for #VMEXIT while nested - */ -TRACE_EVENT(kvm_nested_vmexit, - TP_PROTO(__u64 rip, __u32 exit_code, - __u64 exit_info1, __u64 exit_info2, - __u32 exit_int_info, __u32 exit_int_info_err, __u32 isa), - TP_ARGS(rip, exit_code, exit_info1, exit_info2, - exit_int_info, exit_int_info_err, isa), - - TP_STRUCT__entry( - __field( __u64, rip ) - __field( __u32, exit_code ) - __field( __u64, exit_info1 ) - __field( __u64, exit_info2 ) - __field( __u32, exit_int_info ) - __field( __u32, exit_int_info_err ) - __field( __u32, isa ) - ), - - TP_fast_assign( - __entry->rip = rip; - __entry->exit_code = exit_code; - __entry->exit_info1 = exit_info1; - __entry->exit_info2 = exit_info2; - __entry->exit_int_info = exit_int_info; - __entry->exit_int_info_err = exit_int_info_err; - __entry->isa = isa; - ), - TP_printk("rip: 0x%016llx reason: %s ext_inf1: 0x%016llx " - "ext_inf2: 0x%016llx ext_int: 0x%08x ext_int_err: 0x%08x", - __entry->rip, - (__entry->isa == KVM_ISA_VMX) ? 
- __print_symbolic(__entry->exit_code, VMX_EXIT_REASONS) : - __print_symbolic(__entry->exit_code, SVM_EXIT_REASONS), - __entry->exit_info1, __entry->exit_info2, - __entry->exit_int_info, __entry->exit_int_info_err) -); - -/* - * Tracepoint for #VMEXIT reinjected to the guest - */ -TRACE_EVENT(kvm_nested_vmexit_inject, - TP_PROTO(__u32 exit_code, - __u64 exit_info1, __u64 exit_info2, - __u32 exit_int_info, __u32 exit_int_info_err, __u32 isa), - TP_ARGS(exit_code, exit_info1, exit_info2, - exit_int_info, exit_int_info_err, isa), - - TP_STRUCT__entry( - __field( __u32, exit_code ) - __field( __u64, exit_info1 ) - __field( __u64, exit_info2 ) - __field( __u32, exit_int_info ) - __field( __u32, exit_int_info_err ) - __field( __u32, isa ) - ), - - TP_fast_assign( - __entry->exit_code = exit_code; - __entry->exit_info1 = exit_info1; - __entry->exit_info2 = exit_info2; - __entry->exit_int_info = exit_int_info; - __entry->exit_int_info_err = exit_int_info_err; - __entry->isa = isa; - ), - - TP_printk("reason: %s ext_inf1: 0x%016llx " - "ext_inf2: 0x%016llx ext_int: 0x%08x ext_int_err: 0x%08x", - (__entry->isa == KVM_ISA_VMX) ? - __print_symbolic(__entry->exit_code, VMX_EXIT_REASONS) : - __print_symbolic(__entry->exit_code, SVM_EXIT_REASONS), - __entry->exit_info1, __entry->exit_info2, - __entry->exit_int_info, __entry->exit_int_info_err) -); - -/* - * Tracepoint for nested #vmexit because of interrupt pending - */ -TRACE_EVENT(kvm_nested_intr_vmexit, - TP_PROTO(__u64 rip), - TP_ARGS(rip), - - TP_STRUCT__entry( - __field( __u64, rip ) - ), - - TP_fast_assign( - __entry->rip = rip - ), - - TP_printk("rip: 0x%016llx", __entry->rip) -); - -/* - * Tracepoint for nested #vmexit because of interrupt pending - */ -TRACE_EVENT(kvm_invlpga, - TP_PROTO(__u64 rip, int asid, u64 address), - TP_ARGS(rip, asid, address), - - TP_STRUCT__entry( - __field( __u64, rip ) - __field( int, asid ) - __field( __u64, address ) - ), - - TP_fast_assign( - __entry->rip = rip; - __entry->asid = asid; - __entry->address = address; - ), - - TP_printk("rip: 0x%016llx asid: %d address: 0x%016llx", - __entry->rip, __entry->asid, __entry->address) -); - -/* - * Tracepoint for nested #vmexit because of interrupt pending - */ -TRACE_EVENT(kvm_skinit, - TP_PROTO(__u64 rip, __u32 slb), - TP_ARGS(rip, slb), - - TP_STRUCT__entry( - __field( __u64, rip ) - __field( __u32, slb ) - ), - - TP_fast_assign( - __entry->rip = rip; - __entry->slb = slb; - ), - - TP_printk("rip: 0x%016llx slb: 0x%08x", - __entry->rip, __entry->slb) -); - -#define KVM_EMUL_INSN_F_CR0_PE (1 << 0) -#define KVM_EMUL_INSN_F_EFL_VM (1 << 1) -#define KVM_EMUL_INSN_F_CS_D (1 << 2) -#define KVM_EMUL_INSN_F_CS_L (1 << 3) - -#define kvm_trace_symbol_emul_flags \ - { 0, "real" }, \ - { KVM_EMUL_INSN_F_CR0_PE \ - | KVM_EMUL_INSN_F_EFL_VM, "vm16" }, \ - { KVM_EMUL_INSN_F_CR0_PE, "prot16" }, \ - { KVM_EMUL_INSN_F_CR0_PE \ - | KVM_EMUL_INSN_F_CS_D, "prot32" }, \ - { KVM_EMUL_INSN_F_CR0_PE \ - | KVM_EMUL_INSN_F_CS_L, "prot64" } - -#define kei_decode_mode(mode) ({ \ - u8 flags = 0xff; \ - switch (mode) { \ - case X86EMUL_MODE_REAL: \ - flags = 0; \ - break; \ - case X86EMUL_MODE_VM86: \ - flags = KVM_EMUL_INSN_F_EFL_VM; \ - break; \ - case X86EMUL_MODE_PROT16: \ - flags = KVM_EMUL_INSN_F_CR0_PE; \ - break; \ - case X86EMUL_MODE_PROT32: \ - flags = KVM_EMUL_INSN_F_CR0_PE \ - | KVM_EMUL_INSN_F_CS_D; \ - break; \ - case X86EMUL_MODE_PROT64: \ - flags = KVM_EMUL_INSN_F_CR0_PE \ - | KVM_EMUL_INSN_F_CS_L; \ - break; \ - } \ - flags; \ - }) - -TRACE_EVENT(kvm_emulate_insn, - TP_PROTO(struct 
kvm_vcpu *vcpu, __u8 failed), - TP_ARGS(vcpu, failed), - - TP_STRUCT__entry( - __field( __u64, rip ) - __field( __u32, csbase ) - __field( __u8, len ) - __array( __u8, insn, 15 ) - __field( __u8, flags ) - __field( __u8, failed ) - ), - - TP_fast_assign( - __entry->csbase = kvm_x86_ops->get_segment_base(vcpu, VCPU_SREG_CS); - __entry->len = vcpu->arch.emulate_ctxt.fetch.ptr - - vcpu->arch.emulate_ctxt.fetch.data; - __entry->rip = vcpu->arch.emulate_ctxt._eip - __entry->len; - memcpy(__entry->insn, - vcpu->arch.emulate_ctxt.fetch.data, - 15); - __entry->flags = kei_decode_mode(vcpu->arch.emulate_ctxt.mode); - __entry->failed = failed; - ), - - TP_printk("%x:%llx:%s (%s)%s", - __entry->csbase, __entry->rip, - __print_hex(__entry->insn, __entry->len), - __print_symbolic(__entry->flags, - kvm_trace_symbol_emul_flags), - __entry->failed ? " failed" : "" - ) - ); - -#define trace_kvm_emulate_insn_start(vcpu) trace_kvm_emulate_insn(vcpu, 0) -#define trace_kvm_emulate_insn_failed(vcpu) trace_kvm_emulate_insn(vcpu, 1) - -TRACE_EVENT( - vcpu_match_mmio, - TP_PROTO(gva_t gva, gpa_t gpa, bool write, bool gpa_match), - TP_ARGS(gva, gpa, write, gpa_match), - - TP_STRUCT__entry( - __field(gva_t, gva) - __field(gpa_t, gpa) - __field(bool, write) - __field(bool, gpa_match) - ), - - TP_fast_assign( - __entry->gva = gva; - __entry->gpa = gpa; - __entry->write = write; - __entry->gpa_match = gpa_match - ), - - TP_printk("gva %#lx gpa %#llx %s %s", __entry->gva, __entry->gpa, - __entry->write ? "Write" : "Read", - __entry->gpa_match ? "GPA" : "GVA") -); - -TRACE_EVENT(kvm_write_tsc_offset, - TP_PROTO(unsigned int vcpu_id, __u64 previous_tsc_offset, - __u64 next_tsc_offset), - TP_ARGS(vcpu_id, previous_tsc_offset, next_tsc_offset), - - TP_STRUCT__entry( - __field( unsigned int, vcpu_id ) - __field( __u64, previous_tsc_offset ) - __field( __u64, next_tsc_offset ) - ), - - TP_fast_assign( - __entry->vcpu_id = vcpu_id; - __entry->previous_tsc_offset = previous_tsc_offset; - __entry->next_tsc_offset = next_tsc_offset; - ), - - TP_printk("vcpu=%u prev=%llu next=%llu", __entry->vcpu_id, - __entry->previous_tsc_offset, __entry->next_tsc_offset) -); - -#ifdef CONFIG_X86_64 - -#define host_clocks \ - {VCLOCK_NONE, "none"}, \ - {VCLOCK_TSC, "tsc"} \ - -TRACE_EVENT(kvm_update_master_clock, - TP_PROTO(bool use_master_clock, unsigned int host_clock, bool offset_matched), - TP_ARGS(use_master_clock, host_clock, offset_matched), - - TP_STRUCT__entry( - __field( bool, use_master_clock ) - __field( unsigned int, host_clock ) - __field( bool, offset_matched ) - ), - - TP_fast_assign( - __entry->use_master_clock = use_master_clock; - __entry->host_clock = host_clock; - __entry->offset_matched = offset_matched; - ), - - TP_printk("masterclock %d hostclock %s offsetmatched %u", - __entry->use_master_clock, - __print_symbolic(__entry->host_clock, host_clocks), - __entry->offset_matched) -); - -TRACE_EVENT(kvm_track_tsc, - TP_PROTO(unsigned int vcpu_id, unsigned int nr_matched, - unsigned int online_vcpus, bool use_master_clock, - unsigned int host_clock), - TP_ARGS(vcpu_id, nr_matched, online_vcpus, use_master_clock, - host_clock), - - TP_STRUCT__entry( - __field( unsigned int, vcpu_id ) - __field( unsigned int, nr_vcpus_matched_tsc ) - __field( unsigned int, online_vcpus ) - __field( bool, use_master_clock ) - __field( unsigned int, host_clock ) - ), - - TP_fast_assign( - __entry->vcpu_id = vcpu_id; - __entry->nr_vcpus_matched_tsc = nr_matched; - __entry->online_vcpus = online_vcpus; - __entry->use_master_clock = use_master_clock; 
- __entry->host_clock = host_clock; - ), - - TP_printk("vcpu_id %u masterclock %u offsetmatched %u nr_online %u" - " hostclock %s", - __entry->vcpu_id, __entry->use_master_clock, - __entry->nr_vcpus_matched_tsc, __entry->online_vcpus, - __print_symbolic(__entry->host_clock, host_clocks)) -); - -#endif /* CONFIG_X86_64 */ - -/* - * Tracepoint for PML full VMEXIT. - */ -TRACE_EVENT(kvm_pml_full, - TP_PROTO(unsigned int vcpu_id), - TP_ARGS(vcpu_id), - - TP_STRUCT__entry( - __field( unsigned int, vcpu_id ) - ), - - TP_fast_assign( - __entry->vcpu_id = vcpu_id; - ), - - TP_printk("vcpu %d: PML full", __entry->vcpu_id) -); - -TRACE_EVENT(kvm_ple_window, - TP_PROTO(bool grow, unsigned int vcpu_id, int new, int old), - TP_ARGS(grow, vcpu_id, new, old), - - TP_STRUCT__entry( - __field( bool, grow ) - __field( unsigned int, vcpu_id ) - __field( int, new ) - __field( int, old ) - ), - - TP_fast_assign( - __entry->grow = grow; - __entry->vcpu_id = vcpu_id; - __entry->new = new; - __entry->old = old; - ), - - TP_printk("vcpu %u: ple_window %d (%s %d)", - __entry->vcpu_id, - __entry->new, - __entry->grow ? "grow" : "shrink", - __entry->old) -); - -#define trace_kvm_ple_window_grow(vcpu_id, new, old) \ - trace_kvm_ple_window(true, vcpu_id, new, old) -#define trace_kvm_ple_window_shrink(vcpu_id, new, old) \ - trace_kvm_ple_window(false, vcpu_id, new, old) - -TRACE_EVENT(kvm_pvclock_update, - TP_PROTO(unsigned int vcpu_id, struct pvclock_vcpu_time_info *pvclock), - TP_ARGS(vcpu_id, pvclock), - - TP_STRUCT__entry( - __field( unsigned int, vcpu_id ) - __field( __u32, version ) - __field( __u64, tsc_timestamp ) - __field( __u64, system_time ) - __field( __u32, tsc_to_system_mul ) - __field( __s8, tsc_shift ) - __field( __u8, flags ) - ), - - TP_fast_assign( - __entry->vcpu_id = vcpu_id; - __entry->version = pvclock->version; - __entry->tsc_timestamp = pvclock->tsc_timestamp; - __entry->system_time = pvclock->system_time; - __entry->tsc_to_system_mul = pvclock->tsc_to_system_mul; - __entry->tsc_shift = pvclock->tsc_shift; - __entry->flags = pvclock->flags; - ), - - TP_printk("vcpu_id %u, pvclock { version %u, tsc_timestamp 0x%llx, " - "system_time 0x%llx, tsc_to_system_mul 0x%x, tsc_shift %d, " - "flags 0x%x }", - __entry->vcpu_id, - __entry->version, - __entry->tsc_timestamp, - __entry->system_time, - __entry->tsc_to_system_mul, - __entry->tsc_shift, - __entry->flags) -); - -TRACE_EVENT(kvm_wait_lapic_expire, - TP_PROTO(unsigned int vcpu_id, s64 delta), - TP_ARGS(vcpu_id, delta), - - TP_STRUCT__entry( - __field( unsigned int, vcpu_id ) - __field( s64, delta ) - ), - - TP_fast_assign( - __entry->vcpu_id = vcpu_id; - __entry->delta = delta; - ), - - TP_printk("vcpu %u: delta %lld (%s)", - __entry->vcpu_id, - __entry->delta, - __entry->delta < 0 ? "early" : "late") -); - -TRACE_EVENT(kvm_enter_smm, - TP_PROTO(unsigned int vcpu_id, u64 smbase, bool entering), - TP_ARGS(vcpu_id, smbase, entering), - - TP_STRUCT__entry( - __field( unsigned int, vcpu_id ) - __field( u64, smbase ) - __field( bool, entering ) - ), - - TP_fast_assign( - __entry->vcpu_id = vcpu_id; - __entry->smbase = smbase; - __entry->entering = entering; - ), - - TP_printk("vcpu %u: %s SMM, smbase 0x%llx", - __entry->vcpu_id, - __entry->entering ? "entering" : "leaving", - __entry->smbase) -); - -/* - * Tracepoint for VT-d posted-interrupts. 
- */ -TRACE_EVENT(kvm_pi_irte_update, - TP_PROTO(unsigned int host_irq, unsigned int vcpu_id, - unsigned int gsi, unsigned int gvec, - u64 pi_desc_addr, bool set), - TP_ARGS(host_irq, vcpu_id, gsi, gvec, pi_desc_addr, set), - - TP_STRUCT__entry( - __field( unsigned int, host_irq ) - __field( unsigned int, vcpu_id ) - __field( unsigned int, gsi ) - __field( unsigned int, gvec ) - __field( u64, pi_desc_addr ) - __field( bool, set ) - ), - - TP_fast_assign( - __entry->host_irq = host_irq; - __entry->vcpu_id = vcpu_id; - __entry->gsi = gsi; - __entry->gvec = gvec; - __entry->pi_desc_addr = pi_desc_addr; - __entry->set = set; - ), - - TP_printk("VT-d PI is %s for irq %u, vcpu %u, gsi: 0x%x, " - "gvec: 0x%x, pi_desc_addr: 0x%llx", - __entry->set ? "enabled and being updated" : "disabled", - __entry->host_irq, - __entry->vcpu_id, - __entry->gsi, - __entry->gvec, - __entry->pi_desc_addr) -); - -/* - * Tracepoint for kvm_hv_notify_acked_sint. - */ -TRACE_EVENT(kvm_hv_notify_acked_sint, - TP_PROTO(int vcpu_id, u32 sint), - TP_ARGS(vcpu_id, sint), - - TP_STRUCT__entry( - __field(int, vcpu_id) - __field(u32, sint) - ), - - TP_fast_assign( - __entry->vcpu_id = vcpu_id; - __entry->sint = sint; - ), - - TP_printk("vcpu_id %d sint %u", __entry->vcpu_id, __entry->sint) -); - -/* - * Tracepoint for synic_set_irq. - */ -TRACE_EVENT(kvm_hv_synic_set_irq, - TP_PROTO(int vcpu_id, u32 sint, int vector, int ret), - TP_ARGS(vcpu_id, sint, vector, ret), - - TP_STRUCT__entry( - __field(int, vcpu_id) - __field(u32, sint) - __field(int, vector) - __field(int, ret) - ), - - TP_fast_assign( - __entry->vcpu_id = vcpu_id; - __entry->sint = sint; - __entry->vector = vector; - __entry->ret = ret; - ), - - TP_printk("vcpu_id %d sint %u vector %d ret %d", - __entry->vcpu_id, __entry->sint, __entry->vector, - __entry->ret) -); - -/* - * Tracepoint for kvm_hv_synic_send_eoi. - */ -TRACE_EVENT(kvm_hv_synic_send_eoi, - TP_PROTO(int vcpu_id, int vector), - TP_ARGS(vcpu_id, vector), - - TP_STRUCT__entry( - __field(int, vcpu_id) - __field(u32, sint) - __field(int, vector) - __field(int, ret) - ), - - TP_fast_assign( - __entry->vcpu_id = vcpu_id; - __entry->vector = vector; - ), - - TP_printk("vcpu_id %d vector %d", __entry->vcpu_id, __entry->vector) -); - -/* - * Tracepoint for synic_set_msr. - */ -TRACE_EVENT(kvm_hv_synic_set_msr, - TP_PROTO(int vcpu_id, u32 msr, u64 data, bool host), - TP_ARGS(vcpu_id, msr, data, host), - - TP_STRUCT__entry( - __field(int, vcpu_id) - __field(u32, msr) - __field(u64, data) - __field(bool, host) - ), - - TP_fast_assign( - __entry->vcpu_id = vcpu_id; - __entry->msr = msr; - __entry->data = data; - __entry->host = host - ), - - TP_printk("vcpu_id %d msr 0x%x data 0x%llx host %d", - __entry->vcpu_id, __entry->msr, __entry->data, __entry->host) -); - -/* - * Tracepoint for stimer_set_config. - */ -TRACE_EVENT(kvm_hv_stimer_set_config, - TP_PROTO(int vcpu_id, int timer_index, u64 config, bool host), - TP_ARGS(vcpu_id, timer_index, config, host), - - TP_STRUCT__entry( - __field(int, vcpu_id) - __field(int, timer_index) - __field(u64, config) - __field(bool, host) - ), - - TP_fast_assign( - __entry->vcpu_id = vcpu_id; - __entry->timer_index = timer_index; - __entry->config = config; - __entry->host = host; - ), - - TP_printk("vcpu_id %d timer %d config 0x%llx host %d", - __entry->vcpu_id, __entry->timer_index, __entry->config, - __entry->host) -); - -/* - * Tracepoint for stimer_set_count. 
- */ -TRACE_EVENT(kvm_hv_stimer_set_count, - TP_PROTO(int vcpu_id, int timer_index, u64 count, bool host), - TP_ARGS(vcpu_id, timer_index, count, host), - - TP_STRUCT__entry( - __field(int, vcpu_id) - __field(int, timer_index) - __field(u64, count) - __field(bool, host) - ), - - TP_fast_assign( - __entry->vcpu_id = vcpu_id; - __entry->timer_index = timer_index; - __entry->count = count; - __entry->host = host; - ), - - TP_printk("vcpu_id %d timer %d count %llu host %d", - __entry->vcpu_id, __entry->timer_index, __entry->count, - __entry->host) -); - -/* - * Tracepoint for stimer_start(periodic timer case). - */ -TRACE_EVENT(kvm_hv_stimer_start_periodic, - TP_PROTO(int vcpu_id, int timer_index, u64 time_now, u64 exp_time), - TP_ARGS(vcpu_id, timer_index, time_now, exp_time), - - TP_STRUCT__entry( - __field(int, vcpu_id) - __field(int, timer_index) - __field(u64, time_now) - __field(u64, exp_time) - ), - - TP_fast_assign( - __entry->vcpu_id = vcpu_id; - __entry->timer_index = timer_index; - __entry->time_now = time_now; - __entry->exp_time = exp_time; - ), - - TP_printk("vcpu_id %d timer %d time_now %llu exp_time %llu", - __entry->vcpu_id, __entry->timer_index, __entry->time_now, - __entry->exp_time) -); - -/* - * Tracepoint for stimer_start(one-shot timer case). - */ -TRACE_EVENT(kvm_hv_stimer_start_one_shot, - TP_PROTO(int vcpu_id, int timer_index, u64 time_now, u64 count), - TP_ARGS(vcpu_id, timer_index, time_now, count), - - TP_STRUCT__entry( - __field(int, vcpu_id) - __field(int, timer_index) - __field(u64, time_now) - __field(u64, count) - ), - - TP_fast_assign( - __entry->vcpu_id = vcpu_id; - __entry->timer_index = timer_index; - __entry->time_now = time_now; - __entry->count = count; - ), - - TP_printk("vcpu_id %d timer %d time_now %llu count %llu", - __entry->vcpu_id, __entry->timer_index, __entry->time_now, - __entry->count) -); - -/* - * Tracepoint for stimer_timer_callback. - */ -TRACE_EVENT(kvm_hv_stimer_callback, - TP_PROTO(int vcpu_id, int timer_index), - TP_ARGS(vcpu_id, timer_index), - - TP_STRUCT__entry( - __field(int, vcpu_id) - __field(int, timer_index) - ), - - TP_fast_assign( - __entry->vcpu_id = vcpu_id; - __entry->timer_index = timer_index; - ), - - TP_printk("vcpu_id %d timer %d", - __entry->vcpu_id, __entry->timer_index) -); - -/* - * Tracepoint for stimer_expiration. - */ -TRACE_EVENT(kvm_hv_stimer_expiration, - TP_PROTO(int vcpu_id, int timer_index, int msg_send_result), - TP_ARGS(vcpu_id, timer_index, msg_send_result), - - TP_STRUCT__entry( - __field(int, vcpu_id) - __field(int, timer_index) - __field(int, msg_send_result) - ), - - TP_fast_assign( - __entry->vcpu_id = vcpu_id; - __entry->timer_index = timer_index; - __entry->msg_send_result = msg_send_result; - ), - - TP_printk("vcpu_id %d timer %d msg send result %d", - __entry->vcpu_id, __entry->timer_index, - __entry->msg_send_result) -); - -/* - * Tracepoint for stimer_cleanup. 
- */ -TRACE_EVENT(kvm_hv_stimer_cleanup, - TP_PROTO(int vcpu_id, int timer_index), - TP_ARGS(vcpu_id, timer_index), - - TP_STRUCT__entry( - __field(int, vcpu_id) - __field(int, timer_index) - ), - - TP_fast_assign( - __entry->vcpu_id = vcpu_id; - __entry->timer_index = timer_index; - ), - - TP_printk("vcpu_id %d timer %d", - __entry->vcpu_id, __entry->timer_index) -); - -/* - * Tracepoint for AMD AVIC - */ -TRACE_EVENT(kvm_avic_incomplete_ipi, - TP_PROTO(u32 vcpu, u32 icrh, u32 icrl, u32 id, u32 index), - TP_ARGS(vcpu, icrh, icrl, id, index), - - TP_STRUCT__entry( - __field(u32, vcpu) - __field(u32, icrh) - __field(u32, icrl) - __field(u32, id) - __field(u32, index) - ), - - TP_fast_assign( - __entry->vcpu = vcpu; - __entry->icrh = icrh; - __entry->icrl = icrl; - __entry->id = id; - __entry->index = index; - ), - - TP_printk("vcpu=%u, icrh:icrl=%#010x:%08x, id=%u, index=%u\n", - __entry->vcpu, __entry->icrh, __entry->icrl, - __entry->id, __entry->index) -); - -TRACE_EVENT(kvm_avic_unaccelerated_access, - TP_PROTO(u32 vcpu, u32 offset, bool ft, bool rw, u32 vec), - TP_ARGS(vcpu, offset, ft, rw, vec), - - TP_STRUCT__entry( - __field(u32, vcpu) - __field(u32, offset) - __field(bool, ft) - __field(bool, rw) - __field(u32, vec) - ), - - TP_fast_assign( - __entry->vcpu = vcpu; - __entry->offset = offset; - __entry->ft = ft; - __entry->rw = rw; - __entry->vec = vec; - ), - - TP_printk("vcpu=%u, offset=%#x(%s), %s, %s, vec=%#x\n", - __entry->vcpu, - __entry->offset, - __print_symbolic(__entry->offset, kvm_trace_symbol_apic), - __entry->ft ? "trap" : "fault", - __entry->rw ? "write" : "read", - __entry->vec) -); - -TRACE_EVENT(kvm_hv_timer_state, - TP_PROTO(unsigned int vcpu_id, unsigned int hv_timer_in_use), - TP_ARGS(vcpu_id, hv_timer_in_use), - TP_STRUCT__entry( - __field(unsigned int, vcpu_id) - __field(unsigned int, hv_timer_in_use) - ), - TP_fast_assign( - __entry->vcpu_id = vcpu_id; - __entry->hv_timer_in_use = hv_timer_in_use; - ), - TP_printk("vcpu_id %x hv_timer %x\n", - __entry->vcpu_id, - __entry->hv_timer_in_use) -); -#endif /* _TRACE_KVM_H */ - -#undef TRACE_INCLUDE_PATH -#define TRACE_INCLUDE_PATH arch/x86/kvm -#undef TRACE_INCLUDE_FILE -#define TRACE_INCLUDE_FILE trace - -/* This part must be outside protection */ -#include <trace/define_trace.h> diff --git a/arch/x86/kvm/tss.h b/arch/x86/kvm/tss.h index 622aa10..622aa10 100644..100755 --- a/arch/x86/kvm/tss.h +++ b/arch/x86/kvm/tss.h diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 5382b82..4de8486 100644..100755 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -6,6 +6,7 @@ * * Copyright (C) 2006 Qumranet, Inc. * Copyright 2010 Red Hat, Inc. and/or its affiliates. 
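With trace.h removed above and no Linux tracepoint infrastructure in the target environment, the trace_kvm_*() call sites in the remaining sources have to be handled somehow. This diff does not show how the port does that, so the following is only a guess at one conventional approach — compiling the calls away with no-op macros whose names and arguments mirror the deleted tracepoints:

    /* Hypothetical stand-ins for the removed tracepoints -- not from this diff. */
    #define trace_kvm_entry(vcpu_id)          do { (void)(vcpu_id); } while (0)
    #define trace_kvm_exit(reason, vcpu, isa) \
            do { (void)(reason); (void)(vcpu); (void)(isa); } while (0)
    #define trace_kvm_page_fault(addr, ec)    do { (void)(addr); (void)(ec); } while (0)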
+ * Copyright 2019 Google LLC * * Authors: * Avi Kivity <avi@qumranet.com> @@ -22,639 +23,67 @@ #include "lapic.h" #include <linux/kvm_host.h> -#include <linux/module.h> -#include <linux/kernel.h> -#include <linux/mm.h> -#include <linux/highmem.h> -#include <linux/sched.h> -#include <linux/moduleparam.h> -#include <linux/mod_devicetable.h> -#include <linux/trace_events.h> -#include <linux/slab.h> -#include <linux/tboot.h> -#include <linux/hrtimer.h> +#include <linux/list.h> +#include <ntkrutils.h> +#include <__asm.h> #include "kvm_cache_regs.h" #include "x86.h" - -#include <asm/cpu.h> -#include <asm/io.h> -#include <asm/desc.h> #include <asm/vmx.h> -#include <asm/virtext.h> -#include <asm/mce.h> -#include <asm/fpu/internal.h> -#include <asm/perf_event.h> -#include <asm/debugreg.h> -#include <asm/kexec.h> -#include <asm/apic.h> -#include <asm/irq_remapping.h> - -#include "trace.h" -#include "pmu.h" -#define __ex(x) __kvm_handle_fault_on_reboot(x) -#define __ex_clear(x, reg) \ - ____kvm_handle_fault_on_reboot(x, "xor " reg " , " reg) +#include "pmu.h" +// seperate struct definitions to vmx_def.h so that asmgen can include +#include "vmx_def.h" -MODULE_AUTHOR("Qumranet"); -MODULE_LICENSE("GPL"); +#pragma warning(disable : 4146) +#pragma warning(disable : 4127) +#pragma warning(disable : 4334) -static const struct x86_cpu_id vmx_cpu_id[] = { - X86_FEATURE_MATCH(X86_FEATURE_VMX), - {} -}; -MODULE_DEVICE_TABLE(x86cpu, vmx_cpu_id); +#define DR6_RESERVED (0xFFFF0FF0) -static bool __read_mostly enable_vpid = 1; -module_param_named(vpid, enable_vpid, bool, 0444); +static bool enable_vpid = 0; -static bool __read_mostly flexpriority_enabled = 1; -module_param_named(flexpriority, flexpriority_enabled, bool, S_IRUGO); +static bool flexpriority_enabled = 1; -static bool __read_mostly enable_ept = 1; -module_param_named(ept, enable_ept, bool, S_IRUGO); +static bool enable_ept = 1; -static bool __read_mostly enable_unrestricted_guest = 1; -module_param_named(unrestricted_guest, - enable_unrestricted_guest, bool, S_IRUGO); +static bool enable_unrestricted_guest = 1; -static bool __read_mostly enable_ept_ad_bits = 1; -module_param_named(eptad, enable_ept_ad_bits, bool, S_IRUGO); +static bool enable_ept_ad_bits = 1; -static bool __read_mostly emulate_invalid_guest_state = true; -module_param(emulate_invalid_guest_state, bool, S_IRUGO); +static bool emulate_invalid_guest_state = true; -static bool __read_mostly vmm_exclusive = 1; -module_param(vmm_exclusive, bool, S_IRUGO); +static bool vmm_exclusive = 1; -static bool __read_mostly fasteoi = 1; -module_param(fasteoi, bool, S_IRUGO); +static bool fasteoi = 1; -static bool __read_mostly enable_apicv = 1; -module_param(enable_apicv, bool, S_IRUGO); +static bool enable_apicv = 1; -static bool __read_mostly enable_shadow_vmcs = 1; -module_param_named(enable_shadow_vmcs, enable_shadow_vmcs, bool, S_IRUGO); +static bool enable_shadow_vmcs = 0; /* * If nested=1, nested virtualization is supported, i.e., guests may use * VMX and be a hypervisor for its own guests. If nested=0, guests may not * use VMX instructions. */ -static bool __read_mostly nested = 0; -module_param(nested, bool, S_IRUGO); - -static u64 __read_mostly host_xss; - -static bool __read_mostly enable_pml = 1; -module_param_named(pml, enable_pml, bool, S_IRUGO); - -#define KVM_VMX_TSC_MULTIPLIER_MAX 0xffffffffffffffffULL - -/* Guest_tsc -> host_tsc conversion requires 64-bit division. 
*/ -static int __read_mostly cpu_preemption_timer_multi; -static bool __read_mostly enable_preemption_timer = 1; -#ifdef CONFIG_X86_64 -module_param_named(preemption_timer, enable_preemption_timer, bool, S_IRUGO); -#endif - -#define KVM_GUEST_CR0_MASK (X86_CR0_NW | X86_CR0_CD) -#define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST (X86_CR0_WP | X86_CR0_NE) -#define KVM_VM_CR0_ALWAYS_ON \ - (KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE) -#define KVM_CR4_GUEST_OWNED_BITS \ - (X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR \ - | X86_CR4_OSXMMEXCPT | X86_CR4_TSD) - -#define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE) -#define KVM_RMODE_VM_CR4_ALWAYS_ON (X86_CR4_VME | X86_CR4_PAE | X86_CR4_VMXE) - -#define RMODE_GUEST_OWNED_EFLAGS_BITS (~(X86_EFLAGS_IOPL | X86_EFLAGS_VM)) - -#define VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE 5 - -/* - * These 2 parameters are used to config the controls for Pause-Loop Exiting: - * ple_gap: upper bound on the amount of time between two successive - * executions of PAUSE in a loop. Also indicate if ple enabled. - * According to test, this time is usually smaller than 128 cycles. - * ple_window: upper bound on the amount of time a guest is allowed to execute - * in a PAUSE loop. Tests indicate that most spinlocks are held for - * less than 2^12 cycles - * Time is measured based on a counter that runs at the same rate as the TSC, - * refer SDM volume 3b section 21.6.13 & 22.1.3. - */ -#define KVM_VMX_DEFAULT_PLE_GAP 128 -#define KVM_VMX_DEFAULT_PLE_WINDOW 4096 -#define KVM_VMX_DEFAULT_PLE_WINDOW_GROW 2 -#define KVM_VMX_DEFAULT_PLE_WINDOW_SHRINK 0 -#define KVM_VMX_DEFAULT_PLE_WINDOW_MAX \ - INT_MAX / KVM_VMX_DEFAULT_PLE_WINDOW_GROW - -static int ple_gap = KVM_VMX_DEFAULT_PLE_GAP; -module_param(ple_gap, int, S_IRUGO); - -static int ple_window = KVM_VMX_DEFAULT_PLE_WINDOW; -module_param(ple_window, int, S_IRUGO); - -/* Default doubles per-vcpu window every exit. */ -static int ple_window_grow = KVM_VMX_DEFAULT_PLE_WINDOW_GROW; -module_param(ple_window_grow, int, S_IRUGO); - -/* Default resets per-vcpu window every exit to ple_window. */ -static int ple_window_shrink = KVM_VMX_DEFAULT_PLE_WINDOW_SHRINK; -module_param(ple_window_shrink, int, S_IRUGO); - -/* Default is to compute the maximum so we can never overflow. */ -static int ple_window_actual_max = KVM_VMX_DEFAULT_PLE_WINDOW_MAX; -static int ple_window_max = KVM_VMX_DEFAULT_PLE_WINDOW_MAX; -module_param(ple_window_max, int, S_IRUGO); - -extern const ulong vmx_return; - -#define NR_AUTOLOAD_MSRS 8 -#define VMCS02_POOL_SIZE 1 - -struct vmcs { - u32 revision_id; - u32 abort; - char data[0]; -}; - -/* - * Track a VMCS that may be loaded on a certain CPU. If it is (cpu!=-1), also - * remember whether it was VMLAUNCHed, and maintain a linked list of all VMCSs - * loaded on this CPU (so we can clear them if the CPU goes down). - */ -struct loaded_vmcs { - struct vmcs *vmcs; - struct vmcs *shadow_vmcs; - int cpu; - int launched; - struct list_head loaded_vmcss_on_cpu_link; -}; - -struct shared_msr_entry { - unsigned index; - u64 data; - u64 mask; -}; - -/* - * struct vmcs12 describes the state that our guest hypervisor (L1) keeps for a - * single nested guest (L2), hence the name vmcs12. Any VMX implementation has - * a VMCS structure, and vmcs12 is our emulated VMX's VMCS. This structure is - * stored in guest memory specified by VMPTRLD, but is opaque to the guest, - * which must access it using VMREAD/VMWRITE/VMCLEAR instructions. 
- * More than one of these structures may exist, if L1 runs multiple L2 guests. - * nested_vmx_run() will use the data here to build a vmcs02: a VMCS for the - * underlying hardware which will be used to run L2. - * This structure is packed to ensure that its layout is identical across - * machines (necessary for live migration). - * If there are changes in this struct, VMCS12_REVISION must be changed. - */ -typedef u64 natural_width; -struct __packed vmcs12 { - /* According to the Intel spec, a VMCS region must start with the - * following two fields. Then follow implementation-specific data. - */ - u32 revision_id; - u32 abort; - - u32 launch_state; /* set to 0 by VMCLEAR, to 1 by VMLAUNCH */ - u32 padding[7]; /* room for future expansion */ - - u64 io_bitmap_a; - u64 io_bitmap_b; - u64 msr_bitmap; - u64 vm_exit_msr_store_addr; - u64 vm_exit_msr_load_addr; - u64 vm_entry_msr_load_addr; - u64 tsc_offset; - u64 virtual_apic_page_addr; - u64 apic_access_addr; - u64 posted_intr_desc_addr; - u64 ept_pointer; - u64 eoi_exit_bitmap0; - u64 eoi_exit_bitmap1; - u64 eoi_exit_bitmap2; - u64 eoi_exit_bitmap3; - u64 xss_exit_bitmap; - u64 guest_physical_address; - u64 vmcs_link_pointer; - u64 guest_ia32_debugctl; - u64 guest_ia32_pat; - u64 guest_ia32_efer; - u64 guest_ia32_perf_global_ctrl; - u64 guest_pdptr0; - u64 guest_pdptr1; - u64 guest_pdptr2; - u64 guest_pdptr3; - u64 guest_bndcfgs; - u64 host_ia32_pat; - u64 host_ia32_efer; - u64 host_ia32_perf_global_ctrl; - u64 padding64[8]; /* room for future expansion */ - /* - * To allow migration of L1 (complete with its L2 guests) between - * machines of different natural widths (32 or 64 bit), we cannot have - * unsigned long fields with no explict size. We use u64 (aliased - * natural_width) instead. Luckily, x86 is little-endian. 
- */ - natural_width cr0_guest_host_mask; - natural_width cr4_guest_host_mask; - natural_width cr0_read_shadow; - natural_width cr4_read_shadow; - natural_width cr3_target_value0; - natural_width cr3_target_value1; - natural_width cr3_target_value2; - natural_width cr3_target_value3; - natural_width exit_qualification; - natural_width guest_linear_address; - natural_width guest_cr0; - natural_width guest_cr3; - natural_width guest_cr4; - natural_width guest_es_base; - natural_width guest_cs_base; - natural_width guest_ss_base; - natural_width guest_ds_base; - natural_width guest_fs_base; - natural_width guest_gs_base; - natural_width guest_ldtr_base; - natural_width guest_tr_base; - natural_width guest_gdtr_base; - natural_width guest_idtr_base; - natural_width guest_dr7; - natural_width guest_rsp; - natural_width guest_rip; - natural_width guest_rflags; - natural_width guest_pending_dbg_exceptions; - natural_width guest_sysenter_esp; - natural_width guest_sysenter_eip; - natural_width host_cr0; - natural_width host_cr3; - natural_width host_cr4; - natural_width host_fs_base; - natural_width host_gs_base; - natural_width host_tr_base; - natural_width host_gdtr_base; - natural_width host_idtr_base; - natural_width host_ia32_sysenter_esp; - natural_width host_ia32_sysenter_eip; - natural_width host_rsp; - natural_width host_rip; - natural_width paddingl[8]; /* room for future expansion */ - u32 pin_based_vm_exec_control; - u32 cpu_based_vm_exec_control; - u32 exception_bitmap; - u32 page_fault_error_code_mask; - u32 page_fault_error_code_match; - u32 cr3_target_count; - u32 vm_exit_controls; - u32 vm_exit_msr_store_count; - u32 vm_exit_msr_load_count; - u32 vm_entry_controls; - u32 vm_entry_msr_load_count; - u32 vm_entry_intr_info_field; - u32 vm_entry_exception_error_code; - u32 vm_entry_instruction_len; - u32 tpr_threshold; - u32 secondary_vm_exec_control; - u32 vm_instruction_error; - u32 vm_exit_reason; - u32 vm_exit_intr_info; - u32 vm_exit_intr_error_code; - u32 idt_vectoring_info_field; - u32 idt_vectoring_error_code; - u32 vm_exit_instruction_len; - u32 vmx_instruction_info; - u32 guest_es_limit; - u32 guest_cs_limit; - u32 guest_ss_limit; - u32 guest_ds_limit; - u32 guest_fs_limit; - u32 guest_gs_limit; - u32 guest_ldtr_limit; - u32 guest_tr_limit; - u32 guest_gdtr_limit; - u32 guest_idtr_limit; - u32 guest_es_ar_bytes; - u32 guest_cs_ar_bytes; - u32 guest_ss_ar_bytes; - u32 guest_ds_ar_bytes; - u32 guest_fs_ar_bytes; - u32 guest_gs_ar_bytes; - u32 guest_ldtr_ar_bytes; - u32 guest_tr_ar_bytes; - u32 guest_interruptibility_info; - u32 guest_activity_state; - u32 guest_sysenter_cs; - u32 host_ia32_sysenter_cs; - u32 vmx_preemption_timer_value; - u32 padding32[7]; /* room for future expansion */ - u16 virtual_processor_id; - u16 posted_intr_nv; - u16 guest_es_selector; - u16 guest_cs_selector; - u16 guest_ss_selector; - u16 guest_ds_selector; - u16 guest_fs_selector; - u16 guest_gs_selector; - u16 guest_ldtr_selector; - u16 guest_tr_selector; - u16 guest_intr_status; - u16 host_es_selector; - u16 host_cs_selector; - u16 host_ss_selector; - u16 host_ds_selector; - u16 host_fs_selector; - u16 host_gs_selector; - u16 host_tr_selector; -}; - -/* - * VMCS12_REVISION is an arbitrary id that should be changed if the content or - * layout of struct vmcs12 is changed. MSR_IA32_VMX_BASIC returns this id, and - * VMPTRLD verifies that the VMCS region that L1 is loading contains this id. 
- */ -#define VMCS12_REVISION 0x11e57ed0 +static bool nested = 0; -/* - * VMCS12_SIZE is the number of bytes L1 should allocate for the VMXON region - * and any VMCS region. Although only sizeof(struct vmcs12) are used by the - * current implementation, 4K are reserved to avoid future complications. - */ -#define VMCS12_SIZE 0x1000 +static u64 host_xss; -/* Used to remember the last vmcs02 used for some recently used vmcs12s */ -struct vmcs02_list { - struct list_head list; - gpa_t vmptr; - struct loaded_vmcs vmcs02; -}; - -/* - * The nested_vmx structure is part of vcpu_vmx, and holds information we need - * for correct emulation of VMX (i.e., nested VMX) on this vcpu. - */ -struct nested_vmx { - /* Has the level1 guest done vmxon? */ - bool vmxon; - gpa_t vmxon_ptr; - - /* The guest-physical address of the current VMCS L1 keeps for L2 */ - gpa_t current_vmptr; - /* The host-usable pointer to the above */ - struct page *current_vmcs12_page; - struct vmcs12 *current_vmcs12; - /* - * Cache of the guest's VMCS, existing outside of guest memory. - * Loaded from guest memory during VMPTRLD. Flushed to guest - * memory during VMXOFF, VMCLEAR, VMPTRLD. - */ - struct vmcs12 *cached_vmcs12; - /* - * Indicates if the shadow vmcs must be updated with the - * data hold by vmcs12 - */ - bool sync_shadow_vmcs; - - /* vmcs02_list cache of VMCSs recently used to run L2 guests */ - struct list_head vmcs02_pool; - int vmcs02_num; - bool change_vmcs01_virtual_x2apic_mode; - /* L2 must run next, and mustn't decide to exit to L1. */ - bool nested_run_pending; - /* - * Guest pages referred to in vmcs02 with host-physical pointers, so - * we must keep them pinned while L2 runs. - */ - struct page *apic_access_page; - struct page *virtual_apic_page; - struct page *pi_desc_page; - struct pi_desc *pi_desc; - bool pi_pending; - u16 posted_intr_nv; - - unsigned long *msr_bitmap; - - struct hrtimer preemption_timer; - bool preemption_timer_expired; - - /* to migrate it to L2 if VM_ENTRY_LOAD_DEBUG_CONTROLS is off */ - u64 vmcs01_debugctl; - - u16 vpid02; - u16 last_vpid; - - u32 nested_vmx_procbased_ctls_low; - u32 nested_vmx_procbased_ctls_high; - u32 nested_vmx_true_procbased_ctls_low; - u32 nested_vmx_secondary_ctls_low; - u32 nested_vmx_secondary_ctls_high; - u32 nested_vmx_pinbased_ctls_low; - u32 nested_vmx_pinbased_ctls_high; - u32 nested_vmx_exit_ctls_low; - u32 nested_vmx_exit_ctls_high; - u32 nested_vmx_true_exit_ctls_low; - u32 nested_vmx_entry_ctls_low; - u32 nested_vmx_entry_ctls_high; - u32 nested_vmx_true_entry_ctls_low; - u32 nested_vmx_misc_low; - u32 nested_vmx_misc_high; - u32 nested_vmx_ept_caps; - u32 nested_vmx_vpid_caps; -}; - -#define POSTED_INTR_ON 0 -#define POSTED_INTR_SN 1 - -/* Posted-Interrupt Descriptor */ -struct pi_desc { - u32 pir[8]; /* Posted interrupt requested */ - union { - struct { - /* bit 256 - Outstanding Notification */ - u16 on : 1, - /* bit 257 - Suppress Notification */ - sn : 1, - /* bit 271:258 - Reserved */ - rsvd_1 : 14; - /* bit 279:272 - Notification Vector */ - u8 nv; - /* bit 287:280 - Reserved */ - u8 rsvd_2; - /* bit 319:288 - Notification Destination */ - u32 ndst; - }; - u64 control; - }; - u32 rsvd[6]; -} __aligned(64); - -static bool pi_test_and_set_on(struct pi_desc *pi_desc) -{ - return test_and_set_bit(POSTED_INTR_ON, - (unsigned long *)&pi_desc->control); -} - -static bool pi_test_and_clear_on(struct pi_desc *pi_desc) -{ - return test_and_clear_bit(POSTED_INTR_ON, - (unsigned long *)&pi_desc->control); -} - -static int pi_test_and_set_pir(int vector, 
struct pi_desc *pi_desc) -{ - return test_and_set_bit(vector, (unsigned long *)pi_desc->pir); -} +static bool enable_pml = 0; -static inline void pi_clear_sn(struct pi_desc *pi_desc) -{ - return clear_bit(POSTED_INTR_SN, - (unsigned long *)&pi_desc->control); -} - -static inline void pi_set_sn(struct pi_desc *pi_desc) -{ - return set_bit(POSTED_INTR_SN, - (unsigned long *)&pi_desc->control); -} - -static inline int pi_test_on(struct pi_desc *pi_desc) -{ - return test_bit(POSTED_INTR_ON, - (unsigned long *)&pi_desc->control); -} - -static inline int pi_test_sn(struct pi_desc *pi_desc) -{ - return test_bit(POSTED_INTR_SN, - (unsigned long *)&pi_desc->control); -} - -struct vcpu_vmx { - struct kvm_vcpu vcpu; - unsigned long host_rsp; - u8 fail; - bool nmi_known_unmasked; - u32 exit_intr_info; - u32 idt_vectoring_info; - ulong rflags; - struct shared_msr_entry *guest_msrs; - int nmsrs; - int save_nmsrs; - unsigned long host_idt_base; -#ifdef CONFIG_X86_64 - u64 msr_host_kernel_gs_base; - u64 msr_guest_kernel_gs_base; -#endif - u32 vm_entry_controls_shadow; - u32 vm_exit_controls_shadow; - /* - * loaded_vmcs points to the VMCS currently used in this vcpu. For a - * non-nested (L1) guest, it always points to vmcs01. For a nested - * guest (L2), it points to a different VMCS. - */ - struct loaded_vmcs vmcs01; - struct loaded_vmcs *loaded_vmcs; - bool __launched; /* temporary, used in vmx_vcpu_run */ - struct msr_autoload { - unsigned nr; - struct vmx_msr_entry guest[NR_AUTOLOAD_MSRS]; - struct vmx_msr_entry host[NR_AUTOLOAD_MSRS]; - } msr_autoload; - struct { - int loaded; - u16 fs_sel, gs_sel, ldt_sel; -#ifdef CONFIG_X86_64 - u16 ds_sel, es_sel; -#endif - int gs_ldt_reload_needed; - int fs_reload_needed; - u64 msr_host_bndcfgs; - unsigned long vmcs_host_cr4; /* May not match real cr4 */ - } host_state; - struct { - int vm86_active; - ulong save_rflags; - struct kvm_segment segs[8]; - } rmode; - struct { - u32 bitmask; /* 4 bits per segment (1 bit per field) */ - struct kvm_save_segment { - u16 selector; - unsigned long base; - u32 limit; - u32 ar; - } seg[8]; - } segment_cache; - int vpid; - bool emulation_required; - - /* Support for vnmi-less CPUs */ - int soft_vnmi_blocked; - ktime_t entry_time; - s64 vnmi_blocked_time; - u32 exit_reason; - - /* Posted interrupt descriptor */ - struct pi_desc pi_desc; - - /* Support for a guest hypervisor (nested VMX) */ - struct nested_vmx nested; - - /* Dynamic PLE window. */ - int ple_window; - bool ple_window_dirty; - - /* Support for PML */ -#define PML_ENTITY_NUM 512 - struct page *pml_pg; - - /* apic deadline value in host tsc */ - u64 hv_deadline_tsc; - - u64 current_tsc_ratio; - - bool guest_pkru_valid; - u32 guest_pkru; - u32 host_pkru; - - /* - * Only bits masked by msr_ia32_feature_control_valid_bits can be set in - * msr_ia32_feature_control. FEATURE_CONTROL_LOCKED is always included - * in msr_ia32_feature_control_valid_bits. 
- */ - u64 msr_ia32_feature_control; - u64 msr_ia32_feature_control_valid_bits; -}; - -enum segment_cache_field { - SEG_FIELD_SEL = 0, - SEG_FIELD_BASE = 1, - SEG_FIELD_LIMIT = 2, - SEG_FIELD_AR = 3, - - SEG_FIELD_NR = 4 -}; +extern const size_t vmx_return; static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu) { return container_of(vcpu, struct vcpu_vmx, vcpu); } -static struct pi_desc *vcpu_to_pi_desc(struct kvm_vcpu *vcpu) -{ - return &(to_vmx(vcpu)->pi_desc); -} - #define VMCS12_OFFSET(x) offsetof(struct vmcs12, x) #define FIELD(number, name) [number] = VMCS12_OFFSET(name) #define FIELD64(number, name) [number] = VMCS12_OFFSET(name), \ [number##_HIGH] = VMCS12_OFFSET(name)+4 -static unsigned long shadow_read_only_fields[] = { +static size_t shadow_read_only_fields[] = { /* * We do NOT shadow fields that are modified when L0 * traps and emulates any vmx instruction (e.g. VMPTRLD, @@ -680,7 +109,7 @@ static unsigned long shadow_read_only_fields[] = { static int max_shadow_read_only_fields = ARRAY_SIZE(shadow_read_only_fields); -static unsigned long shadow_read_write_fields[] = { +static size_t shadow_read_write_fields[] = { TPR_THRESHOLD, GUEST_RIP, GUEST_RSP, @@ -853,10 +282,8 @@ static const unsigned short vmcs_field_to_offset_table[] = { FIELD(HOST_RIP, host_rip), }; -static inline short vmcs_field_to_offset(unsigned long field) +static inline short vmcs_field_to_offset(size_t field) { - BUILD_BUG_ON(ARRAY_SIZE(vmcs_field_to_offset_table) > SHRT_MAX); - if (field >= ARRAY_SIZE(vmcs_field_to_offset_table) || vmcs_field_to_offset_table[field] == 0) return -ENOENT; @@ -869,27 +296,31 @@ static inline struct vmcs12 *get_vmcs12(struct kvm_vcpu *vcpu) return to_vmx(vcpu)->nested.cached_vmcs12; } -static struct page *nested_get_page(struct kvm_vcpu *vcpu, gpa_t addr) +static PMDL nested_get_page(struct kvm_vcpu *vcpu, gpa_t addr) { - struct page *page = kvm_vcpu_gfn_to_page(vcpu, addr >> PAGE_SHIFT); - if (is_error_page(page)) + PMDL mdl; + size_t hva; + + hva = kvm_vcpu_gfn_to_hva(vcpu, addr >> PAGE_SHIFT); + if (kvm_is_error_hva(hva)) return NULL; - return page; -} + mdl = IoAllocateMdl((void *)hva, PAGE_SIZE, FALSE, FALSE, NULL); + if (!mdl) + return NULL; -static void nested_release_page(struct page *page) -{ - kvm_release_page_dirty(page); + MmProbeAndLockPages(mdl, KernelMode, IoWriteAccess); + + return mdl; } -static void nested_release_page_clean(struct page *page) +static void nested_release_page(PMDL mdl) { - kvm_release_page_clean(page); + kvm_release_page(mdl); } -static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu); -static u64 construct_eptp(unsigned long root_hpa); +static size_t nested_ept_get_cr3(struct kvm_vcpu *vcpu); +static u64 construct_eptp(size_t root_hpa); static void kvm_cpu_vmxon(u64 addr); static void kvm_cpu_vmxoff(void); static bool vmx_xsaves_supported(void); @@ -904,14 +335,10 @@ static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx); static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx); static int alloc_identity_pagetable(struct kvm *kvm); + static DEFINE_PER_CPU(struct vmcs *, vmxarea); -static DEFINE_PER_CPU(struct vmcs *, current_vmcs); -/* - * We maintain a per-CPU linked-list of VMCS loaded on that CPU. This is needed - * when a CPU is brought down, and we need to VMCLEAR all VMCSs loaded on it. 
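The reworked nested_get_page() above pins the guest page by building an MDL with IoAllocateMdl() and locking it with MmProbeAndLockPages(); its release path goes through kvm_release_page(), which is defined elsewhere in the port and not visible in this diff. A plausible shape for that counterpart, inferred from the lock sequence above (an assumption, not the port's actual code):

    /* Sketch of an MDL release helper matching the IoAllocateMdl/
     * MmProbeAndLockPages pairing above; the real kvm_release_page()
     * is not shown in this hunk. */
    static void kvm_release_page_sketch(PMDL mdl)
    {
        if (!mdl)
            return;
        MmUnlockPages(mdl);   /* undo MmProbeAndLockPages() */
        IoFreeMdl(mdl);       /* undo IoAllocateMdl() */
    }

Callers that need a kernel virtual address for the pinned page would typically obtain one via MmGetSystemAddressForMdlSafe() before accessing it.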
- */ -static DEFINE_PER_CPU(struct list_head, loaded_vmcss_on_cpu); static DEFINE_PER_CPU(struct desc_ptr, host_gdt); +static DEFINE_PER_CPU(struct desc_ptr, host_idt); /* * We maintian a per-CPU linked-list of vCPU, so in wakeup_handler() we @@ -920,16 +347,16 @@ static DEFINE_PER_CPU(struct desc_ptr, host_gdt); static DEFINE_PER_CPU(struct list_head, blocked_vcpu_on_cpu); static DEFINE_PER_CPU(spinlock_t, blocked_vcpu_on_cpu_lock); -static unsigned long *vmx_io_bitmap_a; -static unsigned long *vmx_io_bitmap_b; -static unsigned long *vmx_msr_bitmap_legacy; -static unsigned long *vmx_msr_bitmap_longmode; -static unsigned long *vmx_msr_bitmap_legacy_x2apic; -static unsigned long *vmx_msr_bitmap_longmode_x2apic; -static unsigned long *vmx_msr_bitmap_legacy_x2apic_apicv_inactive; -static unsigned long *vmx_msr_bitmap_longmode_x2apic_apicv_inactive; -static unsigned long *vmx_vmread_bitmap; -static unsigned long *vmx_vmwrite_bitmap; +static size_t *vmx_io_bitmap_a; +static size_t *vmx_io_bitmap_b; +static size_t *vmx_msr_bitmap_legacy; +static size_t *vmx_msr_bitmap_longmode; +static size_t *vmx_msr_bitmap_legacy_x2apic; +static size_t *vmx_msr_bitmap_longmode_x2apic; +static size_t *vmx_msr_bitmap_legacy_x2apic_apicv_inactive; +static size_t *vmx_msr_bitmap_longmode_x2apic_apicv_inactive; +static size_t *vmx_vmread_bitmap; +static size_t *vmx_vmwrite_bitmap; static bool cpu_has_load_ia32_efer; static bool cpu_has_load_perf_global_ctrl; @@ -982,17 +409,6 @@ static u64 host_efer; static void ept_save_pdptrs(struct kvm_vcpu *vcpu); -/* - * Keep MSR_STAR at the end, as setup_msrs() will try to optimize it - * away by decrementing the array size. - */ -static const u32 vmx_msr_index[] = { -#ifdef CONFIG_X86_64 - MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR, -#endif - MSR_EFER, MSR_TSC_AUX, MSR_STAR, -}; - static inline bool is_exception_n(u32 intr_info, u8 vector) { return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK | @@ -1015,11 +431,6 @@ static inline bool is_page_fault(u32 intr_info) return is_exception_n(intr_info, PF_VECTOR); } -static inline bool is_no_device(u32 intr_info) -{ - return is_exception_n(intr_info, NM_VECTOR); -} - static inline bool is_invalid_opcode(u32 intr_info) { return is_exception_n(intr_info, UD_VECTOR); @@ -1083,69 +494,10 @@ static inline bool cpu_has_vmx_virtual_intr_delivery(void) SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY; } -/* - * Comment's format: document - errata name - stepping - processor name. 
- * Refer from - * https://www.virtualbox.org/svn/vbox/trunk/src/VBox/VMM/VMMR0/HMR0.cpp - */ -static u32 vmx_preemption_cpu_tfms[] = { -/* 323344.pdf - BA86 - D0 - Xeon 7500 Series */ -0x000206E6, -/* 323056.pdf - AAX65 - C2 - Xeon L3406 */ -/* 322814.pdf - AAT59 - C2 - i7-600, i5-500, i5-400 and i3-300 Mobile */ -/* 322911.pdf - AAU65 - C2 - i5-600, i3-500 Desktop and Pentium G6950 */ -0x00020652, -/* 322911.pdf - AAU65 - K0 - i5-600, i3-500 Desktop and Pentium G6950 */ -0x00020655, -/* 322373.pdf - AAO95 - B1 - Xeon 3400 Series */ -/* 322166.pdf - AAN92 - B1 - i7-800 and i5-700 Desktop */ -/* - * 320767.pdf - AAP86 - B1 - - * i7-900 Mobile Extreme, i7-800 and i7-700 Mobile - */ -0x000106E5, -/* 321333.pdf - AAM126 - C0 - Xeon 3500 */ -0x000106A0, -/* 321333.pdf - AAM126 - C1 - Xeon 3500 */ -0x000106A1, -/* 320836.pdf - AAJ124 - C0 - i7-900 Desktop Extreme and i7-900 Desktop */ -0x000106A4, - /* 321333.pdf - AAM126 - D0 - Xeon 3500 */ - /* 321324.pdf - AAK139 - D0 - Xeon 5500 */ - /* 320836.pdf - AAJ124 - D0 - i7-900 Extreme and i7-900 Desktop */ -0x000106A5, -}; - -static inline bool cpu_has_broken_vmx_preemption_timer(void) -{ - u32 eax = cpuid_eax(0x00000001), i; - - /* Clear the reserved bits */ - eax &= ~(0x3U << 14 | 0xfU << 28); - for (i = 0; i < ARRAY_SIZE(vmx_preemption_cpu_tfms); i++) - if (eax == vmx_preemption_cpu_tfms[i]) - return true; - - return false; -} - -static inline bool cpu_has_vmx_preemption_timer(void) -{ - return vmcs_config.pin_based_exec_ctrl & - PIN_BASED_VMX_PREEMPTION_TIMER; -} - -static inline bool cpu_has_vmx_posted_intr(void) -{ - return IS_ENABLED(CONFIG_X86_LOCAL_APIC) && - vmcs_config.pin_based_exec_ctrl & PIN_BASED_POSTED_INTR; -} - static inline bool cpu_has_vmx_apicv(void) { return cpu_has_vmx_apic_register_virt() && - cpu_has_vmx_virtual_intr_delivery() && - cpu_has_vmx_posted_intr(); + cpu_has_vmx_virtual_intr_delivery(); } static inline bool cpu_has_vmx_flexpriority(void) @@ -1211,12 +563,6 @@ static inline bool cpu_has_vmx_unrestricted_guest(void) SECONDARY_EXEC_UNRESTRICTED_GUEST; } -static inline bool cpu_has_vmx_ple(void) -{ - return vmcs_config.cpu_based_2nd_exec_ctrl & - SECONDARY_EXEC_PAUSE_LOOP_EXITING; -} - static inline bool cpu_has_vmx_basic_inout(void) { return (((u64)vmcs_config.basic_cap << 32) & VMX_BASIC_INOUT); @@ -1273,12 +619,6 @@ static inline bool cpu_has_vmx_pml(void) return vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_ENABLE_PML; } -static inline bool cpu_has_vmx_tsc_scaling(void) -{ - return vmcs_config.cpu_based_2nd_exec_ctrl & - SECONDARY_EXEC_TSC_SCALING; -} - static inline bool report_flexpriority(void) { return flexpriority_enabled; @@ -1301,12 +641,6 @@ static inline bool nested_cpu_has_virtual_nmis(struct vmcs12 *vmcs12) return vmcs12->pin_based_vm_exec_control & PIN_BASED_VIRTUAL_NMIS; } -static inline bool nested_cpu_has_preemption_timer(struct vmcs12 *vmcs12) -{ - return vmcs12->pin_based_vm_exec_control & - PIN_BASED_VMX_PREEMPTION_TIMER; -} - static inline int nested_cpu_has_ept(struct vmcs12 *vmcs12) { return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_EPT); @@ -1338,11 +672,6 @@ static inline bool nested_cpu_has_vid(struct vmcs12 *vmcs12) return nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); } -static inline bool nested_cpu_has_posted_intr(struct vmcs12 *vmcs12) -{ - return vmcs12->pin_based_vm_exec_control & PIN_BASED_POSTED_INTR; -} - static inline bool is_exception(u32 intr_info) { return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK)) @@ -1351,65 +680,20 @@ static 
inline bool is_exception(u32 intr_info) static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, u32 exit_intr_info, - unsigned long exit_qualification); + size_t exit_qualification); static void nested_vmx_entry_failure(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, - u32 reason, unsigned long qualification); - -static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr) -{ - int i; - - for (i = 0; i < vmx->nmsrs; ++i) - if (vmx_msr_index[vmx->guest_msrs[i].index] == msr) - return i; - return -1; -} + u32 reason, size_t qualification); -static inline void __invvpid(int ext, u16 vpid, gva_t gva) -{ - struct { - u64 vpid : 16; - u64 rsvd : 48; - u64 gva; - } operand = { vpid, 0, gva }; - - asm volatile (__ex(ASM_VMX_INVVPID) - /* CF==1 or ZF==1 --> rc = -1 */ - "; ja 1f ; ud2 ; 1:" - : : "a"(&operand), "c"(ext) : "cc", "memory"); -} - -static inline void __invept(int ext, u64 eptp, gpa_t gpa) -{ - struct { - u64 eptp, gpa; - } operand = {eptp, gpa}; - - asm volatile (__ex(ASM_VMX_INVEPT) - /* CF==1 or ZF==1 --> rc = -1 */ - "; ja 1f ; ud2 ; 1:\n" - : : "a" (&operand), "c" (ext) : "cc", "memory"); -} - -static struct shared_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr) -{ - int i; - - i = __find_msr_index(vmx, msr); - if (i >= 0) - return &vmx->guest_msrs[i]; - return NULL; -} +#define __invvpid(a, b, c) +#define __invept(a, b, c) static void vmcs_clear(struct vmcs *vmcs) { u64 phys_addr = __pa(vmcs); u8 error; - asm volatile (__ex(ASM_VMX_VMCLEAR_RAX) "; setna %0" - : "=qm"(error) : "a"(&phys_addr), "m"(phys_addr) - : "cc", "memory"); + error = __vmx_vmclear(&phys_addr); if (error) printk(KERN_ERR "kvm: vmclear fail: %p/%llx\n", vmcs, phys_addr); @@ -1429,85 +713,12 @@ static void vmcs_load(struct vmcs *vmcs) u64 phys_addr = __pa(vmcs); u8 error; - asm volatile (__ex(ASM_VMX_VMPTRLD_RAX) "; setna %0" - : "=qm"(error) : "a"(&phys_addr), "m"(phys_addr) - : "cc", "memory"); - if (error) + error = __vmx_vmptrld(&phys_addr); + if (error) { + DbgBreakPoint(); printk(KERN_ERR "kvm: vmptrld %p/%llx failed\n", - vmcs, phys_addr); -} - -#ifdef CONFIG_KEXEC_CORE -/* - * This bitmap is used to indicate whether the vmclear - * operation is enabled on all cpus. All disabled by - * default. 
- */ -static cpumask_t crash_vmclear_enabled_bitmap = CPU_MASK_NONE; - -static inline void crash_enable_local_vmclear(int cpu) -{ - cpumask_set_cpu(cpu, &crash_vmclear_enabled_bitmap); -} - -static inline void crash_disable_local_vmclear(int cpu) -{ - cpumask_clear_cpu(cpu, &crash_vmclear_enabled_bitmap); -} - -static inline int crash_local_vmclear_enabled(int cpu) -{ - return cpumask_test_cpu(cpu, &crash_vmclear_enabled_bitmap); -} - -static void crash_vmclear_local_loaded_vmcss(void) -{ - int cpu = raw_smp_processor_id(); - struct loaded_vmcs *v; - - if (!crash_local_vmclear_enabled(cpu)) - return; - - list_for_each_entry(v, &per_cpu(loaded_vmcss_on_cpu, cpu), - loaded_vmcss_on_cpu_link) - vmcs_clear(v->vmcs); -} -#else -static inline void crash_enable_local_vmclear(int cpu) { } -static inline void crash_disable_local_vmclear(int cpu) { } -#endif /* CONFIG_KEXEC_CORE */ - -static void __loaded_vmcs_clear(void *arg) -{ - struct loaded_vmcs *loaded_vmcs = arg; - int cpu = raw_smp_processor_id(); - - if (loaded_vmcs->cpu != cpu) - return; /* vcpu migration can race with cpu offline */ - if (per_cpu(current_vmcs, cpu) == loaded_vmcs->vmcs) - per_cpu(current_vmcs, cpu) = NULL; - crash_disable_local_vmclear(cpu); - list_del(&loaded_vmcs->loaded_vmcss_on_cpu_link); - - /* - * we should ensure updating loaded_vmcs->loaded_vmcss_on_cpu_link - * is before setting loaded_vmcs->vcpu to -1 which is done in - * loaded_vmcs_init. Otherwise, other cpu can see vcpu = -1 fist - * then adds the vmcs into percpu list before it is deleted. - */ - smp_wmb(); - - loaded_vmcs_init(loaded_vmcs); - crash_enable_local_vmclear(cpu); -} - -static void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs) -{ - int cpu = loaded_vmcs->cpu; - - if (cpu != -1) - smp_call_function_single(cpu, - __loaded_vmcs_clear, loaded_vmcs, 1); + vmcs, phys_addr); + } } static inline void vpid_sync_vcpu_single(int vpid) @@ -1549,154 +760,118 @@ static inline void ept_sync_context(u64 eptp) } } -static __always_inline void vmcs_check16(unsigned long field) -{ - BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2000, - "16-bit accessor invalid for 64-bit field"); - BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2001, - "16-bit accessor invalid for 64-bit high field"); - BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x4000, - "16-bit accessor invalid for 32-bit high field"); - BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x6000, - "16-bit accessor invalid for natural width field"); -} +#define VMCS_RW_DEBUG -static __always_inline void vmcs_check32(unsigned long field) -{ - BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0, - "32-bit accessor invalid for 16-bit field"); - BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x6000, - "32-bit accessor invalid for natural width field"); -} +static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu); +static void vmx_vcpu_put(struct kvm_vcpu *vcpu); -static __always_inline void vmcs_check64(unsigned long field) +static __forceinline size_t __vmcs_readl(struct kvm_vcpu* vcpu, size_t field) { - BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0, - "64-bit accessor invalid for 16-bit field"); - BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2001, - "64-bit accessor invalid for 64-bit high field"); - BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x4000, - "64-bit accessor invalid for 32-bit 
field"); - BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x6000, - "64-bit accessor invalid for natural width field"); -} + size_t value; -static __always_inline void vmcs_checkl(unsigned long field) -{ - BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0, - "Natural width accessor invalid for 16-bit field"); - BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2000, - "Natural width accessor invalid for 64-bit field"); - BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2001, - "Natural width accessor invalid for 64-bit high field"); - BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x4000, - "Natural width accessor invalid for 32-bit field"); -} + preempt_disable(); + vmcs_load(to_vmx(vcpu)->loaded_vmcs->vmcs); -static __always_inline unsigned long __vmcs_readl(unsigned long field) -{ - unsigned long value; + __vmx_vmread(field, &value); + + vmcs_clear(to_vmx(vcpu)->loaded_vmcs->vmcs); + preempt_enable(); - asm volatile (__ex_clear(ASM_VMX_VMREAD_RDX_RAX, "%0") - : "=a"(value) : "d"(field) : "cc"); return value; } -static __always_inline u16 vmcs_read16(unsigned long field) +static __forceinline u16 vmcs_read16(struct kvm_vcpu* vcpu, size_t field) { - vmcs_check16(field); - return __vmcs_readl(field); + return __vmcs_readl(vcpu, field); } -static __always_inline u32 vmcs_read32(unsigned long field) +static __forceinline u32 vmcs_read32(struct kvm_vcpu* vcpu, size_t field) { - vmcs_check32(field); - return __vmcs_readl(field); + return __vmcs_readl(vcpu, field); } -static __always_inline u64 vmcs_read64(unsigned long field) +static __forceinline u64 vmcs_read64(struct kvm_vcpu* vcpu, size_t field) { - vmcs_check64(field); #ifdef CONFIG_X86_64 - return __vmcs_readl(field); + return __vmcs_readl(vcpu, field); #else return __vmcs_readl(field) | ((u64)__vmcs_readl(field+1) << 32); #endif } -static __always_inline unsigned long vmcs_readl(unsigned long field) +static __forceinline size_t vmcs_readl(struct kvm_vcpu* vcpu, size_t field) { - vmcs_checkl(field); - return __vmcs_readl(field); + return __vmcs_readl(vcpu, field); } -static noinline void vmwrite_error(unsigned long field, unsigned long value) +static __declspec(noinline) void vmwrite_error(struct kvm_vcpu* vcpu, size_t field, size_t value) { printk(KERN_ERR "vmwrite error: reg %lx value %lx (err %d)\n", - field, value, vmcs_read32(VM_INSTRUCTION_ERROR)); + field, value, vmcs_read32(vcpu, VM_INSTRUCTION_ERROR)); +#if 0 dump_stack(); +#endif } -static __always_inline void __vmcs_writel(unsigned long field, unsigned long value) +static __always_inline void __vmcs_writel(struct kvm_vcpu* vcpu, size_t field, size_t value) { u8 error; - asm volatile (__ex(ASM_VMX_VMWRITE_RAX_RDX) "; setna %0" - : "=q"(error) : "a"(value), "d"(field) : "cc"); - if (unlikely(error)) - vmwrite_error(field, value); + preempt_disable(); + vmcs_load(to_vmx(vcpu)->loaded_vmcs->vmcs); + + error = __vmx_vmwrite(field, value); + if (unlikely(error)) { + DbgBreakPoint(); + vmwrite_error(vcpu, field, value); + } + + vmcs_clear(to_vmx(vcpu)->loaded_vmcs->vmcs); + preempt_enable(); } -static __always_inline void vmcs_write16(unsigned long field, u16 value) +static __always_inline void vmcs_write16(struct kvm_vcpu* vcpu, size_t field, u16 value) { - vmcs_check16(field); - __vmcs_writel(field, value); + __vmcs_writel(vcpu, field, value); } -static __always_inline void vmcs_write32(unsigned long field, u32 value) +static __always_inline void vmcs_write32(struct 
kvm_vcpu* vcpu, size_t field, u32 value) { - vmcs_check32(field); - __vmcs_writel(field, value); + __vmcs_writel(vcpu, field, value); } -static __always_inline void vmcs_write64(unsigned long field, u64 value) +static __always_inline void vmcs_write64(struct kvm_vcpu* vcpu, size_t field, u64 value) { - vmcs_check64(field); - __vmcs_writel(field, value); + __vmcs_writel(vcpu, field, value); #ifndef CONFIG_X86_64 asm volatile (""); __vmcs_writel(field+1, value >> 32); #endif } -static __always_inline void vmcs_writel(unsigned long field, unsigned long value) +static __always_inline void vmcs_writel(struct kvm_vcpu* vcpu, size_t field, size_t value) { - vmcs_checkl(field); - __vmcs_writel(field, value); + __vmcs_writel(vcpu, field, value); } -static __always_inline void vmcs_clear_bits(unsigned long field, u32 mask) +static __always_inline void vmcs_clear_bits(struct kvm_vcpu* vcpu, size_t field, u32 mask) { - BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x2000, - "vmcs_clear_bits does not support 64-bit fields"); - __vmcs_writel(field, __vmcs_readl(field) & ~mask); + __vmcs_writel(vcpu, field, __vmcs_readl(vcpu, field) & ~mask); } -static __always_inline void vmcs_set_bits(unsigned long field, u32 mask) +static __always_inline void vmcs_set_bits(struct kvm_vcpu* vcpu, size_t field, u32 mask) { - BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x2000, - "vmcs_set_bits does not support 64-bit fields"); - __vmcs_writel(field, __vmcs_readl(field) | mask); + __vmcs_writel(vcpu, field, __vmcs_readl(vcpu, field) | mask); } static inline void vm_entry_controls_reset_shadow(struct vcpu_vmx *vmx) { - vmx->vm_entry_controls_shadow = vmcs_read32(VM_ENTRY_CONTROLS); + vmx->vm_entry_controls_shadow = vmcs_read32(&vmx->vcpu, VM_ENTRY_CONTROLS); } static inline void vm_entry_controls_init(struct vcpu_vmx *vmx, u32 val) { - vmcs_write32(VM_ENTRY_CONTROLS, val); + vmcs_write32(&vmx->vcpu, VM_ENTRY_CONTROLS, val); vmx->vm_entry_controls_shadow = val; } @@ -1724,12 +899,12 @@ static inline void vm_entry_controls_clearbit(struct vcpu_vmx *vmx, u32 val) static inline void vm_exit_controls_reset_shadow(struct vcpu_vmx *vmx) { - vmx->vm_exit_controls_shadow = vmcs_read32(VM_EXIT_CONTROLS); + vmx->vm_exit_controls_shadow = vmcs_read32(&vmx->vcpu, VM_EXIT_CONTROLS); } static inline void vm_exit_controls_init(struct vcpu_vmx *vmx, u32 val) { - vmcs_write32(VM_EXIT_CONTROLS, val); + vmcs_write32(&vmx->vcpu, VM_EXIT_CONTROLS, val); vmx->vm_exit_controls_shadow = val; } @@ -1780,7 +955,7 @@ static u16 vmx_read_guest_seg_selector(struct vcpu_vmx *vmx, unsigned seg) u16 *p = &vmx->segment_cache.seg[seg].selector; if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_SEL)) - *p = vmcs_read16(kvm_vmx_segment_fields[seg].selector); + *p = vmcs_read16(&vmx->vcpu, kvm_vmx_segment_fields[seg].selector); return *p; } @@ -1789,7 +964,7 @@ static ulong vmx_read_guest_seg_base(struct vcpu_vmx *vmx, unsigned seg) ulong *p = &vmx->segment_cache.seg[seg].base; if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_BASE)) - *p = vmcs_readl(kvm_vmx_segment_fields[seg].base); + *p = vmcs_readl(&vmx->vcpu, kvm_vmx_segment_fields[seg].base); return *p; } @@ -1798,7 +973,7 @@ static u32 vmx_read_guest_seg_limit(struct vcpu_vmx *vmx, unsigned seg) u32 *p = &vmx->segment_cache.seg[seg].limit; if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_LIMIT)) - *p = vmcs_read32(kvm_vmx_segment_fields[seg].limit); + *p = vmcs_read32(&vmx->vcpu, kvm_vmx_segment_fields[seg].limit); return *p; } @@ -1807,7 +982,7 
@@ static u32 vmx_read_guest_seg_ar(struct vcpu_vmx *vmx, unsigned seg) u32 *p = &vmx->segment_cache.seg[seg].ar; if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_AR)) - *p = vmcs_read32(kvm_vmx_segment_fields[seg].ar_bytes); + *p = vmcs_read32(&vmx->vcpu, kvm_vmx_segment_fields[seg].ar_bytes); return *p; } @@ -1816,17 +991,15 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu) u32 eb; eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR) | - (1u << NM_VECTOR) | (1u << DB_VECTOR) | (1u << AC_VECTOR); + (1u << DB_VECTOR) | (1u << AC_VECTOR); if ((vcpu->guest_debug & - (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) == - (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) + (GVM_GUESTDBG_ENABLE | GVM_GUESTDBG_USE_SW_BP)) == + (GVM_GUESTDBG_ENABLE | GVM_GUESTDBG_USE_SW_BP)) eb |= 1u << BP_VECTOR; if (to_vmx(vcpu)->rmode.vm86_active) eb = ~0; if (enable_ept) eb &= ~(1u << PF_VECTOR); /* bypass_guest_pf = 0 */ - if (vcpu->fpu_active) - eb &= ~(1u << NM_VECTOR); /* When we are running a nested L2 guest and L1 specified for it a * certain exception bitmap, we must trap the same exceptions and pass @@ -1836,11 +1009,11 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu) if (is_guest_mode(vcpu)) eb |= get_vmcs12(vcpu)->exception_bitmap; - vmcs_write32(EXCEPTION_BITMAP, eb); + vmcs_write32(vcpu, EXCEPTION_BITMAP, eb); } static void clear_atomic_switch_msr_special(struct vcpu_vmx *vmx, - unsigned long entry, unsigned long exit) + size_t entry, size_t exit) { vm_entry_controls_clearbit(vmx, entry); vm_exit_controls_clearbit(vmx, exit); @@ -1879,17 +1052,17 @@ static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr) --m->nr; m->guest[i] = m->guest[m->nr]; m->host[i] = m->host[m->nr]; - vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->nr); - vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->nr); + vmcs_write32(&vmx->vcpu, VM_ENTRY_MSR_LOAD_COUNT, m->nr); + vmcs_write32(&vmx->vcpu, VM_EXIT_MSR_LOAD_COUNT, m->nr); } static void add_atomic_switch_msr_special(struct vcpu_vmx *vmx, - unsigned long entry, unsigned long exit, - unsigned long guest_val_vmcs, unsigned long host_val_vmcs, + size_t entry, size_t exit, + size_t guest_val_vmcs, size_t host_val_vmcs, u64 guest_val, u64 host_val) { - vmcs_write64(guest_val_vmcs, guest_val); - vmcs_write64(host_val_vmcs, host_val); + vmcs_write64(&vmx->vcpu, guest_val_vmcs, guest_val); + vmcs_write64(&vmx->vcpu, host_val_vmcs, host_val); vm_entry_controls_setbit(vmx, entry); vm_exit_controls_setbit(vmx, exit); } @@ -1942,8 +1115,8 @@ static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr, return; } else if (i == m->nr) { ++m->nr; - vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->nr); - vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->nr); + //vmcs_write32(&vmx->vcpu, VM_ENTRY_MSR_LOAD_COUNT, m->nr); + //vmcs_write32(&vmx->vcpu, VM_EXIT_MSR_LOAD_COUNT, m->nr); } m->guest[i].index = msr; @@ -1965,7 +1138,7 @@ static void reload_tss(void) load_TR_desc(); } -static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset) +static bool update_transition_efer(struct vcpu_vmx *vmx) { u64 guest_efer = vmx->vcpu.arch.efer; u64 ignore_bits = 0; @@ -1995,36 +1168,20 @@ static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset) clear_atomic_switch_msr(vmx, MSR_EFER); - /* - * On EPT, we can't emulate NX, so we must switch EFER atomically. - * On CPUs that support "load IA32_EFER", always switch EFER - * atomically, since it's faster than switching it manually. 
- */ - if (cpu_has_load_ia32_efer || - (enable_ept && ((vmx->vcpu.arch.efer ^ host_efer) & EFER_NX))) { - if (!(guest_efer & EFER_LMA)) - guest_efer &= ~EFER_LME; - if (guest_efer != host_efer) - add_atomic_switch_msr(vmx, MSR_EFER, - guest_efer, host_efer); - return false; - } else { - guest_efer &= ~ignore_bits; - guest_efer |= host_efer & ignore_bits; - - vmx->guest_msrs[efer_offset].data = guest_efer; - vmx->guest_msrs[efer_offset].mask = ~ignore_bits; - - return true; - } + if (!(guest_efer & EFER_LMA)) + guest_efer &= ~EFER_LME; + if (guest_efer != host_efer) + add_atomic_switch_msr(vmx, MSR_EFER, + guest_efer, host_efer); + return false; } -static unsigned long segment_base(u16 selector) +static size_t segment_base(u16 selector) { struct desc_ptr *gdt = this_cpu_ptr(&host_gdt); struct desc_struct *d; - unsigned long table_base; - unsigned long v; + size_t table_base; + size_t v; if (!(selector & ~3)) return 0; @@ -2043,47 +1200,40 @@ static unsigned long segment_base(u16 selector) v = get_desc_base(d); #ifdef CONFIG_X86_64 if (d->s == 0 && (d->type == 2 || d->type == 9 || d->type == 11)) - v |= ((unsigned long)((struct ldttss_desc64 *)d)->base3) << 32; + v |= ((size_t)((struct ldttss_desc64 *)d)->base3) << 32; #endif return v; } -static inline unsigned long kvm_read_tr_base(void) +static inline size_t kvm_read_tr_base(void) { - u16 tr; - asm("str %0" : "=g"(tr)); + u16 tr = 0; + tr = gvm_read_tr(); return segment_base(tr); } static void vmx_save_host_state(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); - int i; - if (vmx->host_state.loaded) - return; - - vmx->host_state.loaded = 1; /* - * Set host fs and gs selectors. Unfortunately, 22.2.3 does not + * Set host fs and gs selectors. Unfortunately, 26.2.3 does not * allow segment selectors with cpl > 0 or ti == 1. 
*/ - vmx->host_state.ldt_sel = kvm_read_ldt(); - vmx->host_state.gs_ldt_reload_needed = vmx->host_state.ldt_sel; savesegment(fs, vmx->host_state.fs_sel); if (!(vmx->host_state.fs_sel & 7)) { - vmcs_write16(HOST_FS_SELECTOR, vmx->host_state.fs_sel); + vmcs_write16(vcpu, HOST_FS_SELECTOR, vmx->host_state.fs_sel); vmx->host_state.fs_reload_needed = 0; } else { - vmcs_write16(HOST_FS_SELECTOR, 0); + vmcs_write16(vcpu, HOST_FS_SELECTOR, 0); vmx->host_state.fs_reload_needed = 1; } savesegment(gs, vmx->host_state.gs_sel); if (!(vmx->host_state.gs_sel & 7)) - vmcs_write16(HOST_GS_SELECTOR, vmx->host_state.gs_sel); + vmcs_write16(vcpu, HOST_GS_SELECTOR, vmx->host_state.gs_sel); else { - vmcs_write16(HOST_GS_SELECTOR, 0); - vmx->host_state.gs_ldt_reload_needed = 1; + vmcs_write16(vcpu, HOST_GS_SELECTOR, 0); + vmx->host_state.gs_reload_needed = 1; } #ifdef CONFIG_X86_64 @@ -2092,8 +1242,8 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu) #endif #ifdef CONFIG_X86_64 - vmcs_writel(HOST_FS_BASE, read_msr(MSR_FS_BASE)); - vmcs_writel(HOST_GS_BASE, read_msr(MSR_GS_BASE)); + vmcs_writel(vcpu, HOST_FS_BASE, read_msr(MSR_FS_BASE)); + vmcs_writel(vcpu, HOST_GS_BASE, read_msr(MSR_GS_BASE)); #else vmcs_writel(HOST_FS_BASE, segment_base(vmx->host_state.fs_sel)); vmcs_writel(HOST_GS_BASE, segment_base(vmx->host_state.gs_sel)); @@ -2106,25 +1256,16 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu) #endif if (boot_cpu_has(X86_FEATURE_MPX)) rdmsrl(MSR_IA32_BNDCFGS, vmx->host_state.msr_host_bndcfgs); - for (i = 0; i < vmx->save_nmsrs; ++i) - kvm_set_shared_msr(vmx->guest_msrs[i].index, - vmx->guest_msrs[i].data, - vmx->guest_msrs[i].mask); } static void __vmx_load_host_state(struct vcpu_vmx *vmx) { - if (!vmx->host_state.loaded) - return; - ++vmx->vcpu.stat.host_state_reload; - vmx->host_state.loaded = 0; #ifdef CONFIG_X86_64 if (is_long_mode(&vmx->vcpu)) rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); #endif - if (vmx->host_state.gs_ldt_reload_needed) { - kvm_load_ldt(vmx->host_state.ldt_sel); + if (vmx->host_state.gs_reload_needed) { #ifdef CONFIG_X86_64 load_gs_index(vmx->host_state.gs_sel); #else @@ -2145,74 +1286,14 @@ static void __vmx_load_host_state(struct vcpu_vmx *vmx) #endif if (vmx->host_state.msr_host_bndcfgs) wrmsrl(MSR_IA32_BNDCFGS, vmx->host_state.msr_host_bndcfgs); - /* - * If the FPU is not active (through the host task or - * the guest vcpu), then restore the cr0.TS bit. - */ - if (!fpregs_active() && !vmx->vcpu.guest_fpu_loaded) - stts(); load_gdt(this_cpu_ptr(&host_gdt)); + load_idt(this_cpu_ptr(&host_idt)); } -static void vmx_load_host_state(struct vcpu_vmx *vmx) +static void vmx_load_host_state(struct kvm_vcpu *vcpu) { - preempt_disable(); + struct vcpu_vmx *vmx = to_vmx(vcpu); __vmx_load_host_state(vmx); - preempt_enable(); -} - -static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu) -{ - struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); - struct pi_desc old, new; - unsigned int dest; - - if (!kvm_arch_has_assigned_device(vcpu->kvm) || - !irq_remapping_cap(IRQ_POSTING_CAP) || - !kvm_vcpu_apicv_active(vcpu)) - return; - - do { - old.control = new.control = pi_desc->control; - - /* - * If 'nv' field is POSTED_INTR_WAKEUP_VECTOR, there - * are two possible cases: - * 1. After running 'pre_block', context switch - * happened. For this case, 'sn' was set in - * vmx_vcpu_put(), so we need to clear it here. - * 2. After running 'pre_block', we were blocked, - * and woken up by some other guy. 
For this case, - * we don't need to do anything, 'pi_post_block' - * will do everything for us. However, we cannot - * check whether it is case #1 or case #2 here - * (maybe, not needed), so we also clear sn here, - * I think it is not a big deal. - */ - if (pi_desc->nv != POSTED_INTR_WAKEUP_VECTOR) { - if (vcpu->cpu != cpu) { - dest = cpu_physical_id(cpu); - - if (x2apic_enabled()) - new.ndst = dest; - else - new.ndst = (dest << 8) & 0xFF00; - } - - /* set 'NV' to 'notification vector' */ - new.nv = POSTED_INTR_VECTOR; - } - - /* Allow posting non-urgent interrupts */ - new.sn = 0; - } while (cmpxchg(&pi_desc->control, old.control, - new.control) != old.control); -} - -static void decache_tsc_multiplier(struct vcpu_vmx *vmx) -{ - vmx->current_tsc_ratio = vmx->vcpu.arch.tsc_scaling_ratio; - vmcs_write64(TSC_MULTIPLIER, vmx->current_tsc_ratio); } /* @@ -2227,104 +1308,26 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) if (!vmm_exclusive) kvm_cpu_vmxon(phys_addr); - else if (!already_loaded) - loaded_vmcs_clear(vmx->loaded_vmcs); - - if (!already_loaded) { - local_irq_disable(); - crash_disable_local_vmclear(cpu); - - /* - * Read loaded_vmcs->cpu should be before fetching - * loaded_vmcs->loaded_vmcss_on_cpu_link. - * See the comments in __loaded_vmcs_clear(). - */ - smp_rmb(); - - list_add(&vmx->loaded_vmcs->loaded_vmcss_on_cpu_link, - &per_cpu(loaded_vmcss_on_cpu, cpu)); - crash_enable_local_vmclear(cpu); - local_irq_enable(); - } - - if (per_cpu(current_vmcs, cpu) != vmx->loaded_vmcs->vmcs) { - per_cpu(current_vmcs, cpu) = vmx->loaded_vmcs->vmcs; - vmcs_load(vmx->loaded_vmcs->vmcs); - } if (!already_loaded) { - struct desc_ptr *gdt = this_cpu_ptr(&host_gdt); - unsigned long sysenter_esp; - - kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); + kvm_make_request(GVM_REQ_TLB_FLUSH, vcpu); /* * Linux uses per-cpu TSS and GDT, so set these when switching * processors. 
*/ - vmcs_writel(HOST_TR_BASE, kvm_read_tr_base()); /* 22.2.4 */ - vmcs_writel(HOST_GDTR_BASE, gdt->address); /* 22.2.4 */ - - rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp); - vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */ vmx->loaded_vmcs->cpu = cpu; } - - /* Setup TSC multiplier */ - if (kvm_has_tsc_control && - vmx->current_tsc_ratio != vcpu->arch.tsc_scaling_ratio) - decache_tsc_multiplier(vmx); - - vmx_vcpu_pi_load(vcpu, cpu); - vmx->host_pkru = read_pkru(); -} - -static void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu) -{ - struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); - - if (!kvm_arch_has_assigned_device(vcpu->kvm) || - !irq_remapping_cap(IRQ_POSTING_CAP) || - !kvm_vcpu_apicv_active(vcpu)) - return; - - /* Set SN when the vCPU is preempted */ - if (vcpu->preempted) - pi_set_sn(pi_desc); } static void vmx_vcpu_put(struct kvm_vcpu *vcpu) { - vmx_vcpu_pi_put(vcpu); - - __vmx_load_host_state(to_vmx(vcpu)); if (!vmm_exclusive) { - __loaded_vmcs_clear(to_vmx(vcpu)->loaded_vmcs); - vcpu->cpu = -1; kvm_cpu_vmxoff(); } } -static void vmx_fpu_activate(struct kvm_vcpu *vcpu) -{ - ulong cr0; - - if (vcpu->fpu_active) - return; - vcpu->fpu_active = 1; - cr0 = vmcs_readl(GUEST_CR0); - cr0 &= ~(X86_CR0_TS | X86_CR0_MP); - cr0 |= kvm_read_cr0_bits(vcpu, X86_CR0_TS | X86_CR0_MP); - vmcs_writel(GUEST_CR0, cr0); - update_exception_bitmap(vcpu); - vcpu->arch.cr0_guest_owned_bits = X86_CR0_TS; - if (is_guest_mode(vcpu)) - vcpu->arch.cr0_guest_owned_bits &= - ~get_vmcs12(vcpu)->cr0_guest_host_mask; - vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits); -} - static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu); /* @@ -2332,51 +1335,24 @@ static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu); * of the real cr0 used to run the guest (guest_cr0), and the bits shadowed by * its hypervisor (cr0_read_shadow). */ -static inline unsigned long nested_read_cr0(struct vmcs12 *fields) +static inline size_t nested_read_cr0(struct vmcs12 *fields) { return (fields->guest_cr0 & ~fields->cr0_guest_host_mask) | (fields->cr0_read_shadow & fields->cr0_guest_host_mask); } -static inline unsigned long nested_read_cr4(struct vmcs12 *fields) +static inline size_t nested_read_cr4(struct vmcs12 *fields) { return (fields->guest_cr4 & ~fields->cr4_guest_host_mask) | (fields->cr4_read_shadow & fields->cr4_guest_host_mask); } -static void vmx_fpu_deactivate(struct kvm_vcpu *vcpu) +static size_t vmx_get_rflags(struct kvm_vcpu *vcpu) { - /* Note that there is no vcpu->fpu_active = 0 here. The caller must - * set this *before* calling this function. - */ - vmx_decache_cr0_guest_bits(vcpu); - vmcs_set_bits(GUEST_CR0, X86_CR0_TS | X86_CR0_MP); - update_exception_bitmap(vcpu); - vcpu->arch.cr0_guest_owned_bits = 0; - vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits); - if (is_guest_mode(vcpu)) { - /* - * L1's specified read shadow might not contain the TS bit, - * so now that we turned on shadowing of this bit, we need to - * set this bit of the shadow. Like in nested_vmx_run we need - * nested_read_cr0(vmcs12), but vmcs12->guest_cr0 is not yet - * up-to-date here because we just decached cr0.TS (and we'll - * only update vmcs12->guest_cr0 on nested exit). 
- */ - struct vmcs12 *vmcs12 = get_vmcs12(vcpu); - vmcs12->guest_cr0 = (vmcs12->guest_cr0 & ~X86_CR0_TS) | - (vcpu->arch.cr0 & X86_CR0_TS); - vmcs_writel(CR0_READ_SHADOW, nested_read_cr0(vmcs12)); - } else - vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0); -} - -static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu) -{ - unsigned long rflags, save_rflags; + size_t rflags, save_rflags; if (!test_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail)) { __set_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail); - rflags = vmcs_readl(GUEST_RFLAGS); + rflags = vmcs_readl(vcpu, GUEST_RFLAGS); if (to_vmx(vcpu)->rmode.vm86_active) { rflags &= RMODE_GUEST_OWNED_EFLAGS_BITS; save_rflags = to_vmx(vcpu)->rmode.save_rflags; @@ -2387,7 +1363,7 @@ static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu) return to_vmx(vcpu)->rflags; } -static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) +static void vmx_set_rflags(struct kvm_vcpu *vcpu, size_t rflags) { __set_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail); to_vmx(vcpu)->rflags = rflags; @@ -2395,49 +1371,44 @@ static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) to_vmx(vcpu)->rmode.save_rflags = rflags; rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM; } - vmcs_writel(GUEST_RFLAGS, rflags); -} - -static u32 vmx_get_pkru(struct kvm_vcpu *vcpu) -{ - return to_vmx(vcpu)->guest_pkru; + vmcs_writel(vcpu, GUEST_RFLAGS, rflags); } static u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu) { - u32 interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); + u32 interruptibility = vmcs_read32(vcpu, GUEST_INTERRUPTIBILITY_INFO); int ret = 0; if (interruptibility & GUEST_INTR_STATE_STI) - ret |= KVM_X86_SHADOW_INT_STI; + ret |= GVM_X86_SHADOW_INT_STI; if (interruptibility & GUEST_INTR_STATE_MOV_SS) - ret |= KVM_X86_SHADOW_INT_MOV_SS; + ret |= GVM_X86_SHADOW_INT_MOV_SS; return ret; } static void vmx_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask) { - u32 interruptibility_old = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); + u32 interruptibility_old = vmcs_read32(vcpu, GUEST_INTERRUPTIBILITY_INFO); u32 interruptibility = interruptibility_old; interruptibility &= ~(GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS); - if (mask & KVM_X86_SHADOW_INT_MOV_SS) + if (mask & GVM_X86_SHADOW_INT_MOV_SS) interruptibility |= GUEST_INTR_STATE_MOV_SS; - else if (mask & KVM_X86_SHADOW_INT_STI) + else if (mask & GVM_X86_SHADOW_INT_STI) interruptibility |= GUEST_INTR_STATE_STI; if ((interruptibility != interruptibility_old)) - vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, interruptibility); + vmcs_write32(vcpu, GUEST_INTERRUPTIBILITY_INFO, interruptibility); } static void skip_emulated_instruction(struct kvm_vcpu *vcpu) { - unsigned long rip; + size_t rip; rip = kvm_rip_read(vcpu); - rip += vmcs_read32(VM_EXIT_INSTRUCTION_LEN); + rip += vmcs_read32(vcpu, VM_EXIT_INSTRUCTION_LEN); kvm_rip_write(vcpu, rip); /* skipping an emulated instruction also counts */ @@ -2445,7 +1416,7 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu) } /* - * KVM wants to inject page-faults which it got to the guest. This function + * kvm wants to inject page-faults which it got to the guest. This function * checks whether in a nested guest, we need to inject them to L1 or L2. 
*/ static int nested_vmx_check_exception(struct kvm_vcpu *vcpu, unsigned nr) @@ -2456,8 +1427,8 @@ static int nested_vmx_check_exception(struct kvm_vcpu *vcpu, unsigned nr) return 0; nested_vmx_vmexit(vcpu, to_vmx(vcpu)->exit_reason, - vmcs_read32(VM_EXIT_INTR_INFO), - vmcs_readl(EXIT_QUALIFICATION)); + vmcs_read32(vcpu, VM_EXIT_INTR_INFO), + vmcs_readl(vcpu, EXIT_QUALIFICATION)); return 1; } @@ -2473,7 +1444,7 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, return; if (has_error_code) { - vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code); + vmcs_write32(vcpu, VM_ENTRY_EXCEPTION_ERROR_CODE, error_code); intr_info |= INTR_INFO_DELIVER_CODE_MASK; } @@ -2482,18 +1453,18 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, if (kvm_exception_is_soft(nr)) inc_eip = vcpu->arch.event_exit_inst_len; if (kvm_inject_realmode_interrupt(vcpu, nr, inc_eip) != EMULATE_DONE) - kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); + kvm_make_request(GVM_REQ_TRIPLE_FAULT, vcpu); return; } if (kvm_exception_is_soft(nr)) { - vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, + vmcs_write32(vcpu, VM_ENTRY_INSTRUCTION_LEN, vmx->vcpu.arch.event_exit_inst_len); intr_info |= INTR_TYPE_SOFT_EXCEPTION; } else intr_info |= INTR_TYPE_HARD_EXCEPTION; - vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info); + vmcs_write32(vcpu, VM_ENTRY_INTR_INFO_FIELD, intr_info); } static bool vmx_rdtscp_supported(void) @@ -2506,26 +1477,15 @@ static bool vmx_invpcid_supported(void) return cpu_has_vmx_invpcid() && enable_ept; } -/* - * Swap MSR entry in host/guest MSR entry array. - */ -static void move_msr_up(struct vcpu_vmx *vmx, int from, int to) -{ - struct shared_msr_entry tmp; - - tmp = vmx->guest_msrs[to]; - vmx->guest_msrs[to] = vmx->guest_msrs[from]; - vmx->guest_msrs[from] = tmp; -} - static void vmx_set_msr_bitmap(struct kvm_vcpu *vcpu) { - unsigned long *msr_bitmap; + size_t *msr_bitmap; if (is_guest_mode(vcpu)) msr_bitmap = to_vmx(vcpu)->nested.msr_bitmap; - else if (cpu_has_secondary_exec_ctrls() && - (vmcs_read32(SECONDARY_VM_EXEC_CONTROL) & + else + if (cpu_has_secondary_exec_ctrls() && + (vmcs_read32(vcpu, SECONDARY_VM_EXEC_CONTROL) & SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE)) { if (enable_apicv && kvm_vcpu_apicv_active(vcpu)) { if (is_long_mode(vcpu)) @@ -2545,7 +1505,7 @@ static void vmx_set_msr_bitmap(struct kvm_vcpu *vcpu) msr_bitmap = vmx_msr_bitmap_legacy; } - vmcs_write64(MSR_BITMAP, __pa(msr_bitmap)); + vmcs_write64(vcpu, MSR_BITMAP, __pa(msr_bitmap)); } /* @@ -2555,37 +1515,33 @@ static void vmx_set_msr_bitmap(struct kvm_vcpu *vcpu) */ static void setup_msrs(struct vcpu_vmx *vmx) { - int save_nmsrs, index; + u64 value; - save_nmsrs = 0; #ifdef CONFIG_X86_64 if (is_long_mode(&vmx->vcpu)) { - index = __find_msr_index(vmx, MSR_SYSCALL_MASK); - if (index >= 0) - move_msr_up(vmx, index, save_nmsrs++); - index = __find_msr_index(vmx, MSR_LSTAR); - if (index >= 0) - move_msr_up(vmx, index, save_nmsrs++); - index = __find_msr_index(vmx, MSR_CSTAR); - if (index >= 0) - move_msr_up(vmx, index, save_nmsrs++); - index = __find_msr_index(vmx, MSR_TSC_AUX); - if (index >= 0 && guest_cpuid_has_rdtscp(&vmx->vcpu)) - move_msr_up(vmx, index, save_nmsrs++); + if (!rdmsrl_safe(MSR_SYSCALL_MASK, &value) && + !wrmsrl_safe(MSR_SYSCALL_MASK, value)) + add_atomic_switch_msr(vmx, MSR_SYSCALL_MASK, 0, value); + if (!rdmsrl_safe(MSR_LSTAR, &value) && + !wrmsrl_safe(MSR_LSTAR, value)) + add_atomic_switch_msr(vmx, MSR_LSTAR, 0, value); + if (!rdmsrl_safe(MSR_CSTAR, &value) && + !wrmsrl_safe(MSR_CSTAR, value)) + 
add_atomic_switch_msr(vmx, MSR_CSTAR, 0, value); + if (!rdmsrl_safe(MSR_GS_BASE, &value) && + !wrmsrl_safe(MSR_GS_BASE, value)) + add_atomic_switch_msr(vmx, MSR_GS_BASE, 0, value); /* * MSR_STAR is only needed on long mode guests, and only * if efer.sce is enabled. */ - index = __find_msr_index(vmx, MSR_STAR); - if ((index >= 0) && (vmx->vcpu.arch.efer & EFER_SCE)) - move_msr_up(vmx, index, save_nmsrs++); + if (vmx->vcpu.arch.efer & EFER_SCE) + if (!rdmsrl_safe(MSR_STAR, &value) && + !wrmsrl_safe(MSR_STAR, value)) + add_atomic_switch_msr(vmx, MSR_STAR, 0, value); } #endif - index = __find_msr_index(vmx, MSR_EFER); - if (index >= 0 && update_transition_efer(vmx, index)) - move_msr_up(vmx, index, save_nmsrs++); - - vmx->save_nmsrs = save_nmsrs; + update_transition_efer(vmx); if (cpu_has_vmx_msr_bitmap()) vmx_set_msr_bitmap(&vmx->vcpu); @@ -2600,9 +1556,10 @@ static u64 guest_read_tsc(struct kvm_vcpu *vcpu) { u64 host_tsc, tsc_offset; - host_tsc = rdtsc(); - tsc_offset = vmcs_read64(TSC_OFFSET); - return kvm_scale_tsc(vcpu, host_tsc) + tsc_offset; + host_tsc = __rdtsc(); + tsc_offset = vmcs_read64(vcpu, TSC_OFFSET); + //return kvm_scale_tsc(vcpu, host_tsc) + tsc_offset; + return host_tsc + tsc_offset; } /* @@ -2610,6 +1567,7 @@ static u64 guest_read_tsc(struct kvm_vcpu *vcpu) */ static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) { + vmcs_write64(vcpu, TSC_OFFSET, offset); if (is_guest_mode(vcpu)) { /* * We're here if L1 chose not to trap WRMSR to TSC. According @@ -2620,19 +1578,17 @@ static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) struct vmcs12 *vmcs12; /* recalculate vmcs02.TSC_OFFSET: */ vmcs12 = get_vmcs12(vcpu); - vmcs_write64(TSC_OFFSET, offset + + vmcs_write64(vcpu, TSC_OFFSET, offset + (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETING) ? vmcs12->tsc_offset : 0)); } else { - trace_kvm_write_tsc_offset(vcpu->vcpu_id, - vmcs_read64(TSC_OFFSET), offset); - vmcs_write64(TSC_OFFSET, offset); + vmcs_write64(vcpu, TSC_OFFSET, offset); } } static bool guest_cpuid_has_vmx(struct kvm_vcpu *vcpu) { - struct kvm_cpuid_entry2 *best = kvm_find_cpuid_entry(vcpu, 1, 0); + struct kvm_cpuid_entry *best = kvm_find_cpuid_entry(vcpu, 1, 0); return best && (best->ecx & (1 << (X86_FEATURE_VMX & 31))); } @@ -2685,11 +1641,7 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx) PIN_BASED_NMI_EXITING | PIN_BASED_VIRTUAL_NMIS; vmx->nested.nested_vmx_pinbased_ctls_high |= - PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR | - PIN_BASED_VMX_PREEMPTION_TIMER; - if (kvm_vcpu_apicv_active(&vmx->vcpu)) - vmx->nested.nested_vmx_pinbased_ctls_high |= - PIN_BASED_POSTED_INTR; + PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR; /* exit controls */ rdmsr(MSR_IA32_VMX_EXIT_CTLS, @@ -2804,7 +1756,7 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx) vmx->nested.nested_vmx_ept_caps = 0; /* - * Old versions of KVM use the single-context version without + * Old versions of kvm use the single-context version without * checking for support, so declare that it is supported even * though it is treated as global context. The alternative is * not failing the single-context invvpid, and it is worse. 
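The setup_msrs()/add_atomic_switch_msr() hunks above drop the old shared-MSR bookkeeping (__find_msr_index, move_msr_up, guest_msrs[]) and instead push the syscall MSRs into the VMCS atomic MSR-switch lists, reading the current host value with rdmsrl_safe() for the exit side. A minimal sketch of how such an autoload area is laid out and programmed follows; the 128-bit entry layout (32-bit MSR index, 32 reserved bits, 64-bit value) is per the Intel SDM, while NR_AUTOLOAD_MSRS and autoload_add() are illustrative names and not identifiers taken from this patch.

/*
 * Sketch only, not part of the patch: shape of a VM-entry/VM-exit MSR
 * autoload area and how it is wired into the VMCS.  The per-vcpu
 * vmcs_write32()/vmcs_write64() signatures match the patched accessors
 * above; NR_AUTOLOAD_MSRS and autoload_add() are hypothetical.
 */
#define NR_AUTOLOAD_MSRS 8

struct autoload_entry {
        u32 index;      /* MSR number, e.g. MSR_LSTAR */
        u32 reserved;   /* must be zero */
        u64 value;      /* value the CPU loads at the transition */
};

struct autoload_area {
        unsigned nr;
        struct autoload_entry guest[NR_AUTOLOAD_MSRS]; /* loaded on VM entry */
        struct autoload_entry host[NR_AUTOLOAD_MSRS];  /* loaded on VM exit */
};

/*
 * Append one MSR to both lists; the counts and the physical addresses
 * are written to the VMCS so the CPU swaps the MSR on every VM
 * entry/exit without any software save/restore on the hot path.
 */
static int autoload_add(struct kvm_vcpu *vcpu, struct autoload_area *m,
                        u32 msr, u64 guest_val, u64 host_val)
{
        if (m->nr >= NR_AUTOLOAD_MSRS)
                return -ENOSPC;

        m->guest[m->nr].index = msr;
        m->guest[m->nr].value = guest_val;
        m->host[m->nr].index  = msr;
        m->host[m->nr].value  = host_val;
        m->nr++;

        vmcs_write32(vcpu, VM_ENTRY_MSR_LOAD_COUNT, m->nr);
        vmcs_write32(vcpu, VM_EXIT_MSR_LOAD_COUNT,  m->nr);
        vmcs_write64(vcpu, VM_ENTRY_MSR_LOAD_ADDR,  __pa(m->guest));
        vmcs_write64(vcpu, VM_EXIT_MSR_LOAD_ADDR,   __pa(m->host));
        return 0;
}

In this scheme a call such as autoload_add(vcpu, m, MSR_LSTAR, 0, host_lstar) corresponds to the add_atomic_switch_msr(vmx, MSR_LSTAR, 0, value) calls in the hunk above: the guest value starts at 0 and the saved host value is restored automatically on every VM exit.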
@@ -2957,18 +1909,15 @@ static inline bool vmx_feature_control_msr_valid(struct kvm_vcpu *vcpu, */ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { - struct shared_msr_entry *msr; - switch (msr_info->index) { #ifdef CONFIG_X86_64 case MSR_FS_BASE: - msr_info->data = vmcs_readl(GUEST_FS_BASE); + msr_info->data = vmcs_readl(vcpu, GUEST_FS_BASE); break; case MSR_GS_BASE: - msr_info->data = vmcs_readl(GUEST_GS_BASE); + msr_info->data = vmcs_readl(vcpu, GUEST_GS_BASE); break; case MSR_KERNEL_GS_BASE: - vmx_load_host_state(to_vmx(vcpu)); msr_info->data = to_vmx(vcpu)->msr_guest_kernel_gs_base; break; #endif @@ -2978,33 +1927,22 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) msr_info->data = guest_read_tsc(vcpu); break; case MSR_IA32_SYSENTER_CS: - msr_info->data = vmcs_read32(GUEST_SYSENTER_CS); + msr_info->data = vmcs_read32(vcpu, GUEST_SYSENTER_CS); break; case MSR_IA32_SYSENTER_EIP: - msr_info->data = vmcs_readl(GUEST_SYSENTER_EIP); + msr_info->data = vmcs_readl(vcpu, GUEST_SYSENTER_EIP); break; case MSR_IA32_SYSENTER_ESP: - msr_info->data = vmcs_readl(GUEST_SYSENTER_ESP); + msr_info->data = vmcs_readl(vcpu, GUEST_SYSENTER_ESP); break; case MSR_IA32_BNDCFGS: if (!kvm_mpx_supported()) return 1; - msr_info->data = vmcs_read64(GUEST_BNDCFGS); - break; - case MSR_IA32_MCG_EXT_CTL: - if (!msr_info->host_initiated && - !(to_vmx(vcpu)->msr_ia32_feature_control & - FEATURE_CONTROL_LMCE)) - return 1; - msr_info->data = vcpu->arch.mcg_ext_ctl; + msr_info->data = vmcs_read64(vcpu, GUEST_BNDCFGS); break; case MSR_IA32_FEATURE_CONTROL: msr_info->data = to_vmx(vcpu)->msr_ia32_feature_control; break; - case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC: - if (!nested_vmx_allowed(vcpu)) - return 1; - return vmx_get_vmx_msr(vcpu, msr_info->index, &msr_info->data); case MSR_IA32_XSS: if (!vmx_xsaves_supported()) return 1; @@ -3013,12 +1951,24 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_TSC_AUX: if (!guest_cpuid_has_rdtscp(vcpu) && !msr_info->host_initiated) return 1; + case MSR_SYSCALL_MASK: + case MSR_LSTAR: + case MSR_CSTAR: + case MSR_STAR: + struct vcpu_vmx *vmx = to_vmx(vcpu); + int i = 0; + for (i = 0; i < vmx->msr_autoload.nr; i++) + if (vmx->msr_autoload.guest[i].index == msr_info->index) + msr_info->data = vmx->msr_autoload.guest[i].value; + break; /* Otherwise falls through */ default: - msr = find_msr_entry(to_vmx(vcpu), msr_info->index); - if (msr) { - msr_info->data = msr->data; - break; + if (msr_info->index >= MSR_IA32_VMX_BASIC + && msr_info->index <= MSR_IA32_VMX_VMFUNC) { + if (!nested_vmx_allowed(vcpu)) + return 1; + return vmx_get_vmx_msr(vcpu, msr_info->index, + &msr_info->data); } return kvm_get_msr_common(vcpu, msr_info); } @@ -3036,10 +1986,10 @@ static void vmx_leave_nested(struct kvm_vcpu *vcpu); static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { struct vcpu_vmx *vmx = to_vmx(vcpu); - struct shared_msr_entry *msr; int ret = 0; u32 msr_index = msr_info->index; u64 data = msr_info->data; + u64 host_value = 0; switch (msr_index) { case MSR_EFER: @@ -3048,30 +1998,29 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) #ifdef CONFIG_X86_64 case MSR_FS_BASE: vmx_segment_cache_clear(vmx); - vmcs_writel(GUEST_FS_BASE, data); + vmcs_writel(vcpu, GUEST_FS_BASE, data); break; case MSR_GS_BASE: vmx_segment_cache_clear(vmx); - vmcs_writel(GUEST_GS_BASE, data); + vmcs_writel(vcpu, GUEST_GS_BASE, data); break; case MSR_KERNEL_GS_BASE: - 
vmx_load_host_state(vmx); vmx->msr_guest_kernel_gs_base = data; break; #endif case MSR_IA32_SYSENTER_CS: - vmcs_write32(GUEST_SYSENTER_CS, data); + vmcs_write32(vcpu, GUEST_SYSENTER_CS, data); break; case MSR_IA32_SYSENTER_EIP: - vmcs_writel(GUEST_SYSENTER_EIP, data); + vmcs_writel(vcpu, GUEST_SYSENTER_EIP, data); break; case MSR_IA32_SYSENTER_ESP: - vmcs_writel(GUEST_SYSENTER_ESP, data); + vmcs_writel(vcpu, GUEST_SYSENTER_ESP, data); break; case MSR_IA32_BNDCFGS: if (!kvm_mpx_supported()) return 1; - vmcs_write64(GUEST_BNDCFGS, data); + vmcs_write64(vcpu, GUEST_BNDCFGS, data); break; case MSR_IA32_TSC: kvm_write_tsc(vcpu, msr_info); @@ -3080,7 +2029,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) { if (!kvm_mtrr_valid(vcpu, MSR_IA32_CR_PAT, data)) return 1; - vmcs_write64(GUEST_IA32_PAT, data); + vmcs_write64(vcpu, GUEST_IA32_PAT, data); vcpu->arch.pat = data; break; } @@ -3089,14 +2038,6 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_IA32_TSC_ADJUST: ret = kvm_set_msr_common(vcpu, msr_info); break; - case MSR_IA32_MCG_EXT_CTL: - if ((!msr_info->host_initiated && - !(to_vmx(vcpu)->msr_ia32_feature_control & - FEATURE_CONTROL_LMCE)) || - (data & ~MCG_EXT_CTL_LMCE_EN)) - return 1; - vcpu->arch.mcg_ext_ctl = data; - break; case MSR_IA32_FEATURE_CONTROL: if (!vmx_feature_control_msr_valid(vcpu, data) || (to_vmx(vcpu)->msr_ia32_feature_control & @@ -3106,14 +2047,12 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (msr_info->host_initiated && data == 0) vmx_leave_nested(vcpu); break; - case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC: - return 1; /* they are read-only */ case MSR_IA32_XSS: if (!vmx_xsaves_supported()) return 1; /* * The only supported bit as of Skylake is bit 8, but - * it is not supported on KVM. + * it is not supported on kvm. 
*/ if (data != 0) return 1; @@ -3130,22 +2069,18 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) /* Check reserved bit, higher 32 bits should be zero */ if ((data >> 32) != 0) return 1; + case MSR_SYSCALL_MASK: + case MSR_LSTAR: + case MSR_CSTAR: + case MSR_STAR: + if (!rdmsrl_safe(msr_index, &host_value)) + add_atomic_switch_msr(vmx, msr_index, data, host_value); + break; /* Otherwise falls through */ default: - msr = find_msr_entry(vmx, msr_index); - if (msr) { - u64 old_msr_data = msr->data; - msr->data = data; - if (msr - vmx->guest_msrs < vmx->save_nmsrs) { - preempt_disable(); - ret = kvm_set_shared_msr(msr->index, msr->data, - msr->mask); - preempt_enable(); - if (ret) - msr->data = old_msr_data; - } - break; - } + if (msr_index >= MSR_IA32_VMX_BASIC + && msr_index <= MSR_IA32_VMX_VMFUNC) + return 1; /* they are read-only */ ret = kvm_set_msr_common(vcpu, msr_info); } @@ -3154,13 +2089,13 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg) { - __set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail); + __set_bit(reg, (size_t *)&vcpu->arch.regs_avail); switch (reg) { case VCPU_REGS_RSP: - vcpu->arch.regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP); + vcpu->arch.regs[VCPU_REGS_RSP] = vmcs_readl(vcpu, GUEST_RSP); break; case VCPU_REGS_RIP: - vcpu->arch.regs[VCPU_REGS_RIP] = vmcs_readl(GUEST_RIP); + vcpu->arch.regs[VCPU_REGS_RIP] = vmcs_readl(vcpu, GUEST_RIP); break; case VCPU_EXREG_PDPTR: if (enable_ept) @@ -3171,32 +2106,28 @@ static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg) } } -static __init int cpu_has_kvm_support(void) +static int cpu_has_kvm_support(void) { return cpu_has_vmx(); } -static __init int vmx_disabled_by_bios(void) +static int vmx_disabled_by_bios(void) { u64 msr; - rdmsrl(MSR_IA32_FEATURE_CONTROL, msr); + if (rdmsrl_safe(MSR_IA32_FEATURE_CONTROL, &msr)) + return 0; + if (msr & FEATURE_CONTROL_LOCKED) { - /* launched w/ TXT and VMX disabled */ - if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX) - && tboot_enabled()) - return 1; /* launched w/o TXT and VMX only enabled w/ TXT */ if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX) - && (msr & FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX) - && !tboot_enabled()) { + && (msr & FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX)) { printk(KERN_WARNING "kvm: disable TXT in the BIOS or " - "activate TXT before enabling KVM\n"); + "activate TXT before enabling kvm\n"); return 1; } /* launched w/o TXT and VMX disabled */ - if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX) - && !tboot_enabled()) + if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX)) return 1; } @@ -3205,11 +2136,9 @@ static __init int vmx_disabled_by_bios(void) static void kvm_cpu_vmxon(u64 addr) { - intel_pt_handle_vmx(1); - - asm volatile (ASM_VMX_VMXON_RAX - : : "a"(&addr), "m"(addr) - : "memory", "cc"); + u8 rc = __vmx_on(&addr); + if (rc) + printk(KERN_CRIT "rc is %d\n", rc); } static int hardware_enable(void) @@ -3221,27 +2150,13 @@ static int hardware_enable(void) if (cr4_read_shadow() & X86_CR4_VMXE) return -EBUSY; - INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu)); INIT_LIST_HEAD(&per_cpu(blocked_vcpu_on_cpu, cpu)); spin_lock_init(&per_cpu(blocked_vcpu_on_cpu_lock, cpu)); - /* - * Now we can enable the vmclear operation in kdump - * since the loaded_vmcss_on_cpu list on this cpu - * has been initialized. 
- * - * Though the cpu is not in VMX operation now, there - * is no problem to enable the vmclear operation - * for the loaded_vmcss_on_cpu list is empty! - */ - crash_enable_local_vmclear(cpu); - rdmsrl(MSR_IA32_FEATURE_CONTROL, old); test_bits = FEATURE_CONTROL_LOCKED; test_bits |= FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX; - if (tboot_enabled()) - test_bits |= FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX; if ((old & test_bits) != test_bits) { /* enable and lock */ @@ -3255,44 +2170,31 @@ static int hardware_enable(void) } native_store_gdt(this_cpu_ptr(&host_gdt)); + native_store_idt(this_cpu_ptr(&host_idt)); return 0; } -static void vmclear_local_loaded_vmcss(void) -{ - int cpu = raw_smp_processor_id(); - struct loaded_vmcs *v, *n; - - list_for_each_entry_safe(v, n, &per_cpu(loaded_vmcss_on_cpu, cpu), - loaded_vmcss_on_cpu_link) - __loaded_vmcs_clear(v); -} - - /* Just like cpu_vmxoff(), but with the __kvm_handle_fault_on_reboot() * tricks. */ static void kvm_cpu_vmxoff(void) { - asm volatile (__ex(ASM_VMX_VMXOFF) : : : "cc"); - - intel_pt_handle_vmx(0); + __vmx_off(); } static void hardware_disable(void) { if (vmm_exclusive) { - vmclear_local_loaded_vmcss(); kvm_cpu_vmxoff(); } cr4_clear_bits(X86_CR4_VMXE); } -static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt, +static int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt, u32 msr, u32 *result) { - u32 vmx_msr_low, vmx_msr_high; + u32 vmx_msr_low = 0, vmx_msr_high = 0; u32 ctl = ctl_min | ctl_opt; rdmsr(msr, vmx_msr_low, vmx_msr_high); @@ -3308,17 +2210,17 @@ static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt, return 0; } -static __init bool allow_1_setting(u32 msr, u32 ctl) +static bool allow_1_setting(u32 msr, u32 ctl) { - u32 vmx_msr_low, vmx_msr_high; + u32 vmx_msr_low = 0, vmx_msr_high = 0; rdmsr(msr, vmx_msr_low, vmx_msr_high); return vmx_msr_high & ctl; } -static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) +static int setup_vmcs_config(struct vmcs_config *vmcs_conf) { - u32 vmx_msr_low, vmx_msr_high; + u32 vmx_msr_low = 0, vmx_msr_high = 0; u32 min, opt, min2, opt2; u32 _pin_based_exec_control = 0; u32 _cpu_based_exec_control = 0; @@ -3360,15 +2262,13 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) SECONDARY_EXEC_ENABLE_VPID | SECONDARY_EXEC_ENABLE_EPT | SECONDARY_EXEC_UNRESTRICTED_GUEST | - SECONDARY_EXEC_PAUSE_LOOP_EXITING | SECONDARY_EXEC_RDTSCP | SECONDARY_EXEC_ENABLE_INVPCID | SECONDARY_EXEC_APIC_REGISTER_VIRT | SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | SECONDARY_EXEC_SHADOW_VMCS | SECONDARY_EXEC_XSAVES | - SECONDARY_EXEC_ENABLE_PML | - SECONDARY_EXEC_TSC_SCALING; + SECONDARY_EXEC_ENABLE_PML; if (adjust_vmx_controls(min2, opt2, MSR_IA32_VMX_PROCBASED_CTLS2, &_cpu_based_2nd_exec_control) < 0) @@ -3407,18 +2307,11 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) return -EIO; min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING; - opt = PIN_BASED_VIRTUAL_NMIS | PIN_BASED_POSTED_INTR | - PIN_BASED_VMX_PREEMPTION_TIMER; + opt = PIN_BASED_VIRTUAL_NMIS; if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS, &_pin_based_exec_control) < 0) return -EIO; - if (cpu_has_broken_vmx_preemption_timer()) - _pin_based_exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER; - if (!(_cpu_based_2nd_exec_control & - SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY)) - _pin_based_exec_control &= ~PIN_BASED_POSTED_INTR; - min = VM_ENTRY_LOAD_DEBUG_CONTROLS; opt = VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_LOAD_BNDCFGS; if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_ENTRY_CTLS, @@ -3442,7 +2335,8 
@@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) return -EIO; vmcs_conf->size = vmx_msr_high & 0x1fff; - vmcs_conf->order = get_order(vmcs_conf->size); + /* should always 0 */ + vmcs_conf->order = 0; vmcs_conf->basic_cap = vmx_msr_high & ~0x1fff; vmcs_conf->revision_id = vmx_msr_low; @@ -3478,8 +2372,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) * BA97 (model 46) * */ - if (cpu_has_load_perf_global_ctrl && boot_cpu_data.x86 == 0x6) { - switch (boot_cpu_data.x86_model) { + if (cpu_has_load_perf_global_ctrl && x86_cpuid_family() == 0x6) { + switch (x86_cpuid_model()) { case 26: case 30: case 37: @@ -3500,16 +2394,13 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) return 0; } -static struct vmcs *alloc_vmcs_cpu(int cpu) +static struct vmcs *alloc_vmcs_cpu(void) { - int node = cpu_to_node(cpu); - struct page *pages; struct vmcs *vmcs; - pages = __alloc_pages_node(node, GFP_KERNEL, vmcs_config.order); - if (!pages) + vmcs = ExAllocatePoolWithTag(NonPagedPool, PAGE_SIZE, GVM_POOL_TAG); + if (!vmcs) return NULL; - vmcs = page_address(pages); memset(vmcs, 0, vmcs_config.size); vmcs->revision_id = vmcs_config.revision_id; /* vmcs revision id */ return vmcs; @@ -3517,12 +2408,12 @@ static struct vmcs *alloc_vmcs_cpu(int cpu) static struct vmcs *alloc_vmcs(void) { - return alloc_vmcs_cpu(raw_smp_processor_id()); + return alloc_vmcs_cpu(); } static void free_vmcs(struct vmcs *vmcs) { - free_pages((unsigned long)vmcs, vmcs_config.order); + ExFreePoolWithTag(vmcs, GVM_POOL_TAG); } /* @@ -3532,7 +2423,6 @@ static void free_loaded_vmcs(struct loaded_vmcs *loaded_vmcs) { if (!loaded_vmcs->vmcs) return; - loaded_vmcs_clear(loaded_vmcs); free_vmcs(loaded_vmcs->vmcs); loaded_vmcs->vmcs = NULL; WARN_ON(loaded_vmcs->shadow_vmcs != NULL); @@ -3583,14 +2473,14 @@ static void init_vmcs_shadow_fields(void) vmx_vmread_bitmap); } -static __init int alloc_kvm_area(void) +static int alloc_kvm_area(void) { int cpu; for_each_possible_cpu(cpu) { struct vmcs *vmcs; - vmcs = alloc_vmcs_cpu(cpu); + vmcs = alloc_vmcs_cpu(); if (!vmcs) { free_kvm_area(); return -ENOMEM; @@ -3627,7 +2517,7 @@ static void fix_pmode_seg(struct kvm_vcpu *vcpu, int seg, static void enter_pmode(struct kvm_vcpu *vcpu) { - unsigned long flags; + size_t flags; struct vcpu_vmx *vmx = to_vmx(vcpu); /* @@ -3647,13 +2537,13 @@ static void enter_pmode(struct kvm_vcpu *vcpu) vmx_set_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR); - flags = vmcs_readl(GUEST_RFLAGS); + flags = vmcs_readl(vcpu, GUEST_RFLAGS); flags &= RMODE_GUEST_OWNED_EFLAGS_BITS; flags |= vmx->rmode.save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS; - vmcs_writel(GUEST_RFLAGS, flags); + vmcs_writel(vcpu, GUEST_RFLAGS, flags); - vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~X86_CR4_VME) | - (vmcs_readl(CR4_READ_SHADOW) & X86_CR4_VME)); + vmcs_writel(vcpu, GUEST_CR4, (vmcs_readl(vcpu, GUEST_CR4) & ~X86_CR4_VME) | + (vmcs_readl(vcpu, CR4_READ_SHADOW) & X86_CR4_VME)); update_exception_bitmap(vcpu); @@ -3665,7 +2555,7 @@ static void enter_pmode(struct kvm_vcpu *vcpu) fix_pmode_seg(vcpu, VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]); } -static void fix_rmode_seg(int seg, struct kvm_segment *save) +static void fix_rmode_seg(struct kvm_vcpu* vcpu, int seg, struct kvm_segment *save) { const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; struct kvm_segment var = *save; @@ -3692,15 +2582,15 @@ static void fix_rmode_seg(int seg, struct kvm_segment *save) "protected mode (seg=%d)", seg); } - 
vmcs_write16(sf->selector, var.selector); - vmcs_write32(sf->base, var.base); - vmcs_write32(sf->limit, var.limit); - vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(&var)); + vmcs_write16(vcpu, sf->selector, var.selector); + vmcs_write32(vcpu, sf->base, var.base); + vmcs_write32(vcpu, sf->limit, var.limit); + vmcs_write32(vcpu, sf->ar_bytes, vmx_segment_access_rights(&var)); } static void enter_rmode(struct kvm_vcpu *vcpu) { - unsigned long flags; + size_t flags; struct vcpu_vmx *vmx = to_vmx(vcpu); vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR); @@ -3714,34 +2604,34 @@ static void enter_rmode(struct kvm_vcpu *vcpu) vmx->rmode.vm86_active = 1; /* - * Very old userspace does not call KVM_SET_TSS_ADDR before entering + * Very old userspace does not call GVM_SET_TSS_ADDR before entering * vcpu. Warn the user that an update is overdue. */ if (!vcpu->kvm->arch.tss_addr) - printk_once(KERN_WARNING "kvm: KVM_SET_TSS_ADDR need to be " + printk_once(KERN_WARNING "kvm: GVM_SET_TSS_ADDR need to be " "called before entering vcpu\n"); vmx_segment_cache_clear(vmx); - vmcs_writel(GUEST_TR_BASE, vcpu->kvm->arch.tss_addr); - vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1); - vmcs_write32(GUEST_TR_AR_BYTES, 0x008b); + vmcs_writel(vcpu, GUEST_TR_BASE, vcpu->kvm->arch.tss_addr); + vmcs_write32(vcpu, GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1); + vmcs_write32(vcpu, GUEST_TR_AR_BYTES, 0x008b); - flags = vmcs_readl(GUEST_RFLAGS); + flags = vmcs_readl(vcpu, GUEST_RFLAGS); vmx->rmode.save_rflags = flags; flags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM; - vmcs_writel(GUEST_RFLAGS, flags); - vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | X86_CR4_VME); + vmcs_writel(vcpu, GUEST_RFLAGS, flags); + vmcs_writel(vcpu, GUEST_CR4, vmcs_readl(vcpu, GUEST_CR4) | X86_CR4_VME); update_exception_bitmap(vcpu); - fix_rmode_seg(VCPU_SREG_SS, &vmx->rmode.segs[VCPU_SREG_SS]); - fix_rmode_seg(VCPU_SREG_CS, &vmx->rmode.segs[VCPU_SREG_CS]); - fix_rmode_seg(VCPU_SREG_ES, &vmx->rmode.segs[VCPU_SREG_ES]); - fix_rmode_seg(VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]); - fix_rmode_seg(VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]); - fix_rmode_seg(VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]); + fix_rmode_seg(vcpu, VCPU_SREG_SS, &vmx->rmode.segs[VCPU_SREG_SS]); + fix_rmode_seg(vcpu, VCPU_SREG_CS, &vmx->rmode.segs[VCPU_SREG_CS]); + fix_rmode_seg(vcpu, VCPU_SREG_ES, &vmx->rmode.segs[VCPU_SREG_ES]); + fix_rmode_seg(vcpu, VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]); + fix_rmode_seg(vcpu, VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]); + fix_rmode_seg(vcpu, VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]); kvm_mmu_reset_context(vcpu); } @@ -3749,41 +2639,31 @@ static void enter_rmode(struct kvm_vcpu *vcpu) static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer) { struct vcpu_vmx *vmx = to_vmx(vcpu); - struct shared_msr_entry *msr = find_msr_entry(vmx, MSR_EFER); - - if (!msr) - return; - /* - * Force kernel_gs_base reloading before EFER changes, as control - * of this msr depends on is_long_mode(). 
- */ - vmx_load_host_state(to_vmx(vcpu)); vcpu->arch.efer = efer; - if (efer & EFER_LMA) { + if (efer & EFER_LMA) vm_entry_controls_setbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE); - msr->data = efer; - } else { + else vm_entry_controls_clearbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE); - msr->data = efer & ~EFER_LME; - } setup_msrs(vmx); } #ifdef CONFIG_X86_64 +#define pr_debug_ratelimited DbgPrint + static void enter_lmode(struct kvm_vcpu *vcpu) { u32 guest_tr_ar; vmx_segment_cache_clear(to_vmx(vcpu)); - guest_tr_ar = vmcs_read32(GUEST_TR_AR_BYTES); + guest_tr_ar = vmcs_read32(vcpu, GUEST_TR_AR_BYTES); if ((guest_tr_ar & VMX_AR_TYPE_MASK) != VMX_AR_TYPE_BUSY_64_TSS) { pr_debug_ratelimited("%s: tss fixup for long mode. \n", __func__); - vmcs_write32(GUEST_TR_AR_BYTES, + vmcs_write32(vcpu, GUEST_TR_AR_BYTES, (guest_tr_ar & ~VMX_AR_TYPE_MASK) | VMX_AR_TYPE_BUSY_64_TSS); } @@ -3818,13 +2698,13 @@ static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu) ulong cr0_guest_owned_bits = vcpu->arch.cr0_guest_owned_bits; vcpu->arch.cr0 &= ~cr0_guest_owned_bits; - vcpu->arch.cr0 |= vmcs_readl(GUEST_CR0) & cr0_guest_owned_bits; + vcpu->arch.cr0 |= vmcs_readl(vcpu, GUEST_CR0) & cr0_guest_owned_bits; } static void vmx_decache_cr3(struct kvm_vcpu *vcpu) { if (enable_ept && is_paging(vcpu)) - vcpu->arch.cr3 = vmcs_readl(GUEST_CR3); + vcpu->arch.cr3 = vmcs_readl(vcpu, GUEST_CR3); __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail); } @@ -3833,7 +2713,7 @@ static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu) ulong cr4_guest_owned_bits = vcpu->arch.cr4_guest_owned_bits; vcpu->arch.cr4 &= ~cr4_guest_owned_bits; - vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & cr4_guest_owned_bits; + vcpu->arch.cr4 |= vmcs_readl(vcpu, GUEST_CR4) & cr4_guest_owned_bits; } static void ept_load_pdptrs(struct kvm_vcpu *vcpu) @@ -3841,14 +2721,14 @@ static void ept_load_pdptrs(struct kvm_vcpu *vcpu) struct kvm_mmu *mmu = vcpu->arch.walk_mmu; if (!test_bit(VCPU_EXREG_PDPTR, - (unsigned long *)&vcpu->arch.regs_dirty)) + (size_t *)&vcpu->arch.regs_dirty)) return; if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) { - vmcs_write64(GUEST_PDPTR0, mmu->pdptrs[0]); - vmcs_write64(GUEST_PDPTR1, mmu->pdptrs[1]); - vmcs_write64(GUEST_PDPTR2, mmu->pdptrs[2]); - vmcs_write64(GUEST_PDPTR3, mmu->pdptrs[3]); + vmcs_write64(vcpu, GUEST_PDPTR0, mmu->pdptrs[0]); + vmcs_write64(vcpu, GUEST_PDPTR1, mmu->pdptrs[1]); + vmcs_write64(vcpu, GUEST_PDPTR2, mmu->pdptrs[2]); + vmcs_write64(vcpu, GUEST_PDPTR3, mmu->pdptrs[3]); } } @@ -3857,38 +2737,38 @@ static void ept_save_pdptrs(struct kvm_vcpu *vcpu) struct kvm_mmu *mmu = vcpu->arch.walk_mmu; if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) { - mmu->pdptrs[0] = vmcs_read64(GUEST_PDPTR0); - mmu->pdptrs[1] = vmcs_read64(GUEST_PDPTR1); - mmu->pdptrs[2] = vmcs_read64(GUEST_PDPTR2); - mmu->pdptrs[3] = vmcs_read64(GUEST_PDPTR3); + mmu->pdptrs[0] = vmcs_read64(vcpu, GUEST_PDPTR0); + mmu->pdptrs[1] = vmcs_read64(vcpu, GUEST_PDPTR1); + mmu->pdptrs[2] = vmcs_read64(vcpu, GUEST_PDPTR2); + mmu->pdptrs[3] = vmcs_read64(vcpu, GUEST_PDPTR3); } __set_bit(VCPU_EXREG_PDPTR, - (unsigned long *)&vcpu->arch.regs_avail); + (size_t *)&vcpu->arch.regs_avail); __set_bit(VCPU_EXREG_PDPTR, - (unsigned long *)&vcpu->arch.regs_dirty); + (size_t *)&vcpu->arch.regs_dirty); } -static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4); +static int vmx_set_cr4(struct kvm_vcpu *vcpu, size_t cr4); -static void ept_update_paging_mode_cr0(unsigned long *hw_cr0, - unsigned long cr0, +static void 
ept_update_paging_mode_cr0(size_t *hw_cr0, + size_t cr0, struct kvm_vcpu *vcpu) { if (!test_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail)) vmx_decache_cr3(vcpu); if (!(cr0 & X86_CR0_PG)) { /* From paging/starting to nonpaging */ - vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, - vmcs_read32(CPU_BASED_VM_EXEC_CONTROL) | + vmcs_write32(vcpu, CPU_BASED_VM_EXEC_CONTROL, + vmcs_read32(vcpu, CPU_BASED_VM_EXEC_CONTROL) | (CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING)); vcpu->arch.cr0 = cr0; vmx_set_cr4(vcpu, kvm_read_cr4(vcpu)); } else if (!is_paging(vcpu)) { /* From nonpaging to paging */ - vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, - vmcs_read32(CPU_BASED_VM_EXEC_CONTROL) & + vmcs_write32(vcpu, CPU_BASED_VM_EXEC_CONTROL, + vmcs_read32(vcpu, CPU_BASED_VM_EXEC_CONTROL) & ~(CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING)); vcpu->arch.cr0 = cr0; @@ -3899,16 +2779,16 @@ static void ept_update_paging_mode_cr0(unsigned long *hw_cr0, *hw_cr0 &= ~X86_CR0_WP; } -static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) +static void vmx_set_cr0(struct kvm_vcpu *vcpu, size_t cr0) { struct vcpu_vmx *vmx = to_vmx(vcpu); - unsigned long hw_cr0; + size_t hw_cr0; - hw_cr0 = (cr0 & ~KVM_GUEST_CR0_MASK); + hw_cr0 = (cr0 & ~GVM_GUEST_CR0_MASK); if (enable_unrestricted_guest) - hw_cr0 |= KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST; + hw_cr0 |= GVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST; else { - hw_cr0 |= KVM_VM_CR0_ALWAYS_ON; + hw_cr0 |= GVM_VM_CR0_ALWAYS_ON; if (vmx->rmode.vm86_active && (cr0 & X86_CR0_PE)) enter_pmode(vcpu); @@ -3929,18 +2809,15 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) if (enable_ept) ept_update_paging_mode_cr0(&hw_cr0, cr0, vcpu); - if (!vcpu->fpu_active) - hw_cr0 |= X86_CR0_TS | X86_CR0_MP; - - vmcs_writel(CR0_READ_SHADOW, cr0); - vmcs_writel(GUEST_CR0, hw_cr0); + vmcs_writel(vcpu, CR0_READ_SHADOW, cr0); + vmcs_writel(vcpu, GUEST_CR0, hw_cr0); vcpu->arch.cr0 = cr0; /* depends on vcpu->arch.cr0 to be set to a new value */ vmx->emulation_required = emulation_required(vcpu); } -static u64 construct_eptp(unsigned long root_hpa) +static u64 construct_eptp(size_t root_hpa) { u64 eptp; @@ -3954,15 +2831,15 @@ static u64 construct_eptp(unsigned long root_hpa) return eptp; } -static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) +static void vmx_set_cr3(struct kvm_vcpu *vcpu, size_t cr3) { - unsigned long guest_cr3; + size_t guest_cr3; u64 eptp; guest_cr3 = cr3; if (enable_ept) { eptp = construct_eptp(cr3); - vmcs_write64(EPT_POINTER, eptp); + vmcs_write64(vcpu, EPT_POINTER, eptp); if (is_paging(vcpu) || is_guest_mode(vcpu)) guest_cr3 = kvm_read_cr3(vcpu); else @@ -3971,21 +2848,21 @@ static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) } vmx_flush_tlb(vcpu); - vmcs_writel(GUEST_CR3, guest_cr3); + vmcs_writel(vcpu, GUEST_CR3, guest_cr3); } -static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) +static int vmx_set_cr4(struct kvm_vcpu *vcpu, size_t cr4) { /* * Pass through host's Machine Check Enable value to hw_cr4, which * is in force while we are in guest mode. Do not let guests control * this bit, even if host CR4.MCE == 0. */ - unsigned long hw_cr4 = + size_t hw_cr4 = (cr4_read_shadow() & X86_CR4_MCE) | (cr4 & ~X86_CR4_MCE) | (to_vmx(vcpu)->rmode.vm86_active ? 
- KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON); + GVM_RMODE_VM_CR4_ALWAYS_ON : GVM_PMODE_VM_CR4_ALWAYS_ON); if (cr4 & X86_CR4_VMXE) { /* @@ -3994,7 +2871,7 @@ static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) * So basically the check on whether to allow nested VMX * is here. */ - if (!nested_vmx_allowed(vcpu)) + //if (!nested_vmx_allowed(vcpu)) return 1; } if (to_vmx(vcpu)->nested.vmxon && @@ -4025,8 +2902,8 @@ static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) */ hw_cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE); - vmcs_writel(CR4_READ_SHADOW, cr4); - vmcs_writel(GUEST_CR4, hw_cr4); + vmcs_writel(vcpu, CR4_READ_SHADOW, cr4); + vmcs_writel(vcpu, GUEST_CR4, hw_cr4); return 0; } @@ -4121,15 +2998,15 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu, if (vmx->rmode.vm86_active && seg != VCPU_SREG_LDTR) { vmx->rmode.segs[seg] = *var; if (seg == VCPU_SREG_TR) - vmcs_write16(sf->selector, var->selector); + vmcs_write16(vcpu, sf->selector, var->selector); else if (var->s) - fix_rmode_seg(seg, &vmx->rmode.segs[seg]); + fix_rmode_seg(vcpu, seg, &vmx->rmode.segs[seg]); goto out; } - vmcs_writel(sf->base, var->base); - vmcs_write32(sf->limit, var->limit); - vmcs_write16(sf->selector, var->selector); + vmcs_writel(vcpu, sf->base, var->base); + vmcs_write32(vcpu, sf->limit, var->limit); + vmcs_write16(vcpu, sf->selector, var->selector); /* * Fix the "Accessed" bit in AR field of segment registers for older @@ -4145,7 +3022,7 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu, if (enable_unrestricted_guest && (seg != VCPU_SREG_LDTR)) var->type |= 0x1; /* Accessed */ - vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(var)); + vmcs_write32(vcpu, sf->ar_bytes, vmx_segment_access_rights(var)); out: vmx->emulation_required = emulation_required(vcpu); @@ -4161,26 +3038,26 @@ static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) static void vmx_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) { - dt->size = vmcs_read32(GUEST_IDTR_LIMIT); - dt->address = vmcs_readl(GUEST_IDTR_BASE); + dt->size = vmcs_read32(vcpu, GUEST_IDTR_LIMIT); + dt->address = vmcs_readl(vcpu, GUEST_IDTR_BASE); } static void vmx_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) { - vmcs_write32(GUEST_IDTR_LIMIT, dt->size); - vmcs_writel(GUEST_IDTR_BASE, dt->address); + vmcs_write32(vcpu, GUEST_IDTR_LIMIT, dt->size); + vmcs_writel(vcpu, GUEST_IDTR_BASE, dt->address); } static void vmx_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) { - dt->size = vmcs_read32(GUEST_GDTR_LIMIT); - dt->address = vmcs_readl(GUEST_GDTR_BASE); + dt->size = vmcs_read32(vcpu, GUEST_GDTR_LIMIT); + dt->address = vmcs_readl(vcpu, GUEST_GDTR_BASE); } static void vmx_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) { - vmcs_write32(GUEST_GDTR_LIMIT, dt->size); - vmcs_writel(GUEST_GDTR_BASE, dt->address); + vmcs_write32(vcpu, GUEST_GDTR_LIMIT, dt->size); + vmcs_writel(vcpu, GUEST_GDTR_BASE, dt->address); } static bool rmode_segment_valid(struct kvm_vcpu *vcpu, int seg) @@ -4454,24 +3331,24 @@ out2: return r; } -static void seg_setup(int seg) +static void seg_setup(struct kvm_vcpu *vcpu, int seg) { const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; unsigned int ar; - vmcs_write16(sf->selector, 0); - vmcs_writel(sf->base, 0); - vmcs_write32(sf->limit, 0xffff); + vmcs_write16(vcpu, sf->selector, 0); + vmcs_writel(vcpu, sf->base, 0); + vmcs_write32(vcpu, sf->limit, 0xffff); ar = 0x93; if (seg == VCPU_SREG_CS) ar |= 0x08; /* code segment */ - vmcs_write32(sf->ar_bytes, 
ar); + vmcs_write32(vcpu, sf->ar_bytes, ar); } static int alloc_apic_access_page(struct kvm *kvm) { - struct page *page; + pfn_t pfn; int r = 0; mutex_lock(&kvm->slots_lock); @@ -4482,17 +3359,12 @@ static int alloc_apic_access_page(struct kvm *kvm) if (r) goto out; - page = gfn_to_page(kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT); - if (is_error_page(page)) { + pfn = gfn_to_pfn(kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT); + if (is_error_noslot_pfn(pfn)) { r = -EFAULT; goto out; } - /* - * Do not pin the page in memory, so that memory hot-unplug - * is able to migrate it. - */ - put_page(page); kvm->arch.apic_access_page_done = true; out: mutex_unlock(&kvm->slots_lock); @@ -4540,10 +3412,10 @@ static void free_vpid(int vpid) #define MSR_TYPE_R 1 #define MSR_TYPE_W 2 -static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, +static void __vmx_disable_intercept_for_msr(size_t *msr_bitmap, u32 msr, int type) { - int f = sizeof(unsigned long); + int f = sizeof(size_t); if (!cpu_has_vmx_msr_bitmap()) return; @@ -4575,10 +3447,10 @@ static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, } } -static void __vmx_enable_intercept_for_msr(unsigned long *msr_bitmap, +static void __vmx_enable_intercept_for_msr(size_t *msr_bitmap, u32 msr, int type) { - int f = sizeof(unsigned long); + int f = sizeof(size_t); if (!cpu_has_vmx_msr_bitmap()) return; @@ -4614,11 +3486,11 @@ static void __vmx_enable_intercept_for_msr(unsigned long *msr_bitmap, * If a msr is allowed by L0, we should check whether it is allowed by L1. * The corresponding bit will be cleared unless both of L0 and L1 allow it. */ -static void nested_vmx_disable_intercept_for_msr(unsigned long *msr_bitmap_l1, - unsigned long *msr_bitmap_nested, +static void nested_vmx_disable_intercept_for_msr(size_t *msr_bitmap_l1, + size_t *msr_bitmap_nested, u32 msr, int type) { - int f = sizeof(unsigned long); + int f = sizeof(size_t); if (!cpu_has_vmx_msr_bitmap()) { WARN_ON(1); @@ -4715,125 +3587,6 @@ static bool vmx_get_enable_apicv(void) return enable_apicv; } -static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - int max_irr; - void *vapic_page; - u16 status; - - if (vmx->nested.pi_desc && - vmx->nested.pi_pending) { - vmx->nested.pi_pending = false; - if (!pi_test_and_clear_on(vmx->nested.pi_desc)) - return 0; - - max_irr = find_last_bit( - (unsigned long *)vmx->nested.pi_desc->pir, 256); - - if (max_irr == 256) - return 0; - - vapic_page = kmap(vmx->nested.virtual_apic_page); - if (!vapic_page) { - WARN_ON(1); - return -ENOMEM; - } - __kvm_apic_update_irr(vmx->nested.pi_desc->pir, vapic_page); - kunmap(vmx->nested.virtual_apic_page); - - status = vmcs_read16(GUEST_INTR_STATUS); - if ((u8)max_irr > ((u8)status & 0xff)) { - status &= ~0xff; - status |= (u8)max_irr; - vmcs_write16(GUEST_INTR_STATUS, status); - } - } - return 0; -} - -static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu) -{ -#ifdef CONFIG_SMP - if (vcpu->mode == IN_GUEST_MODE) { - struct vcpu_vmx *vmx = to_vmx(vcpu); - - /* - * Currently, we don't support urgent interrupt, - * all interrupts are recognized as non-urgent - * interrupt, so we cannot post interrupts when - * 'SN' is set. - * - * If the vcpu is in guest mode, it means it is - * running instead of being scheduled out and - * waiting in the run queue, and that's the only - * case when 'SN' is set currently, warning if - * 'SN' is set. 
- */ - WARN_ON_ONCE(pi_test_sn(&vmx->pi_desc)); - - apic->send_IPI_mask(get_cpu_mask(vcpu->cpu), - POSTED_INTR_VECTOR); - return true; - } -#endif - return false; -} - -static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu, - int vector) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - - if (is_guest_mode(vcpu) && - vector == vmx->nested.posted_intr_nv) { - /* the PIR and ON have been set by L1. */ - kvm_vcpu_trigger_posted_interrupt(vcpu); - /* - * If a posted intr is not recognized by hardware, - * we will accomplish it in the next vmentry. - */ - vmx->nested.pi_pending = true; - kvm_make_request(KVM_REQ_EVENT, vcpu); - return 0; - } - return -1; -} -/* - * Send interrupt to vcpu via posted interrupt way. - * 1. If target vcpu is running(non-root mode), send posted interrupt - * notification to vcpu and hardware will sync PIR to vIRR atomically. - * 2. If target vcpu isn't running(root mode), kick it to pick up the - * interrupt from PIR in next vmentry. - */ -static void vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - int r; - - r = vmx_deliver_nested_posted_interrupt(vcpu, vector); - if (!r) - return; - - if (pi_test_and_set_pir(vector, &vmx->pi_desc)) - return; - - r = pi_test_and_set_on(&vmx->pi_desc); - kvm_make_request(KVM_REQ_EVENT, vcpu); - if (r || !kvm_vcpu_trigger_posted_interrupt(vcpu)) - kvm_vcpu_kick(vcpu); -} - -static void vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - - if (!pi_test_and_clear_on(&vmx->pi_desc)) - return; - - kvm_apic_update_irr(vcpu, vmx->pi_desc.pir); -} - /* * Set up the vmcs's constant host-state fields, i.e., host-state fields that * will not change in the lifetime of the guest. @@ -4842,71 +3595,63 @@ static void vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu) */ static void vmx_set_constant_host_state(struct vcpu_vmx *vmx) { - u32 low32, high32; - unsigned long tmpl; - struct desc_ptr dt; - unsigned long cr4; + u32 low32 = 0, high32 = 0; + size_t tmpl; + size_t cr4; + struct kvm_vcpu *vcpu = &vmx->vcpu; - vmcs_writel(HOST_CR0, read_cr0() & ~X86_CR0_TS); /* 22.2.3 */ - vmcs_writel(HOST_CR3, read_cr3()); /* 22.2.3 FIXME: shadow tables */ + vmcs_writel(vcpu, HOST_CR0, read_cr0() & ~X86_CR0_TS); /* 22.2.3 */ + vmcs_writel(vcpu, HOST_CR3, read_cr3()); /* 22.2.3 FIXME: shadow tables */ /* Save the most likely value for this task's CR4 in the VMCS. */ cr4 = cr4_read_shadow(); - vmcs_writel(HOST_CR4, cr4); /* 22.2.3, 22.2.5 */ + vmcs_writel(vcpu, HOST_CR4, cr4); /* 22.2.3, 22.2.5 */ vmx->host_state.vmcs_host_cr4 = cr4; - vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS); /* 22.2.4 */ + vmcs_write16(vcpu, HOST_CS_SELECTOR, __KERNEL_CS); /* 22.2.4 */ #ifdef CONFIG_X86_64 /* * Load null selectors, so we can avoid reloading them in * __vmx_load_host_state(), in case userspace uses the null selectors * too (the expected case). 
*/ - vmcs_write16(HOST_DS_SELECTOR, 0); - vmcs_write16(HOST_ES_SELECTOR, 0); + vmcs_write16(vcpu, HOST_DS_SELECTOR, 0); + vmcs_write16(vcpu, HOST_ES_SELECTOR, 0); #else - vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS); /* 22.2.4 */ - vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS); /* 22.2.4 */ + vmcs_write16(vcpu, HOST_DS_SELECTOR, __KERNEL_DS); /* 22.2.4 */ + vmcs_write16(vcpu, HOST_ES_SELECTOR, __KERNEL_DS); /* 22.2.4 */ #endif - vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS); /* 22.2.4 */ - vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8); /* 22.2.4 */ - - native_store_idt(&dt); - vmcs_writel(HOST_IDTR_BASE, dt.address); /* 22.2.4 */ - vmx->host_idt_base = dt.address; + vmcs_write16(vcpu, HOST_SS_SELECTOR, __KERNEL_SS); /* 22.2.4 */ + vmcs_write16(vcpu, HOST_TR_SELECTOR, GDT_ENTRY_TSS*8); /* 22.2.4 */ - vmcs_writel(HOST_RIP, vmx_return); /* 22.2.5 */ + vmcs_writel(vcpu, HOST_RIP, vmx_return); /* 22.2.5 */ rdmsr(MSR_IA32_SYSENTER_CS, low32, high32); - vmcs_write32(HOST_IA32_SYSENTER_CS, low32); + vmcs_write32(vcpu, HOST_IA32_SYSENTER_CS, low32); rdmsrl(MSR_IA32_SYSENTER_EIP, tmpl); - vmcs_writel(HOST_IA32_SYSENTER_EIP, tmpl); /* 22.2.3 */ + vmcs_writel(vcpu, HOST_IA32_SYSENTER_EIP, tmpl); /* 22.2.3 */ if (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PAT) { rdmsr(MSR_IA32_CR_PAT, low32, high32); - vmcs_write64(HOST_IA32_PAT, low32 | ((u64) high32 << 32)); + vmcs_write64(vcpu, HOST_IA32_PAT, low32 | ((u64) high32 << 32)); } } static void set_cr4_guest_host_mask(struct vcpu_vmx *vmx) { - vmx->vcpu.arch.cr4_guest_owned_bits = KVM_CR4_GUEST_OWNED_BITS; + vmx->vcpu.arch.cr4_guest_owned_bits = GVM_CR4_GUEST_OWNED_BITS; if (enable_ept) vmx->vcpu.arch.cr4_guest_owned_bits |= X86_CR4_PGE; if (is_guest_mode(&vmx->vcpu)) vmx->vcpu.arch.cr4_guest_owned_bits &= ~get_vmcs12(&vmx->vcpu)->cr4_guest_host_mask; - vmcs_writel(CR4_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr4_guest_owned_bits); + vmcs_writel(&vmx->vcpu, CR4_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr4_guest_owned_bits); } static u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx) { u32 pin_based_exec_ctrl = vmcs_config.pin_based_exec_ctrl; - if (!kvm_vcpu_apicv_active(&vmx->vcpu)) - pin_based_exec_ctrl &= ~PIN_BASED_POSTED_INTR; - /* Enable the preemption timer dynamically */ - pin_based_exec_ctrl &= ~PIN_BASED_VMX_PREEMPTION_TIMER; return pin_based_exec_ctrl; } @@ -4914,14 +3659,14 @@ static void vmx_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); - vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, vmx_pin_based_exec_ctrl(vmx)); + vmcs_write32(vcpu, PIN_BASED_VM_EXEC_CONTROL, vmx_pin_based_exec_ctrl(vmx)); if (cpu_has_secondary_exec_ctrls()) { if (kvm_vcpu_apicv_active(vcpu)) - vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL, + vmcs_set_bits(vcpu, SECONDARY_VM_EXEC_CONTROL, SECONDARY_EXEC_APIC_REGISTER_VIRT | SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); else - vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL, + vmcs_clear_bits(vcpu, SECONDARY_VM_EXEC_CONTROL, SECONDARY_EXEC_APIC_REGISTER_VIRT | SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); } @@ -4934,7 +3679,7 @@ static u32 vmx_exec_control(struct vcpu_vmx *vmx) { u32 exec_control = vmcs_config.cpu_based_exec_ctrl; - if (vmx->vcpu.arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT) + if (vmx->vcpu.arch.switch_db_regs & GVM_DEBUGREG_WONT_EXIT) exec_control &= ~CPU_BASED_MOV_DR_EXITING; if (!cpu_need_tpr_shadow(&vmx->vcpu)) { @@ -4966,8 +3711,6 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx) } if (!enable_unrestricted_guest) exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST; - if (!ple_gap) - exec_control &= 
~SECONDARY_EXEC_PAUSE_LOOP_EXITING; if (!kvm_vcpu_apicv_active(&vmx->vcpu)) exec_control &= ~(SECONDARY_EXEC_APIC_REGISTER_VIRT | SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); @@ -5003,114 +3746,90 @@ static void ept_set_mmio_spte_mask(void) static int vmx_vcpu_setup(struct vcpu_vmx *vmx) { #ifdef CONFIG_X86_64 - unsigned long a; + size_t a; #endif - int i; + struct kvm_vcpu *vcpu = &vmx->vcpu; /* I/O */ - vmcs_write64(IO_BITMAP_A, __pa(vmx_io_bitmap_a)); - vmcs_write64(IO_BITMAP_B, __pa(vmx_io_bitmap_b)); + vmcs_write64(vcpu, IO_BITMAP_A, __pa(vmx_io_bitmap_a)); + vmcs_write64(vcpu, IO_BITMAP_B, __pa(vmx_io_bitmap_b)); if (enable_shadow_vmcs) { - vmcs_write64(VMREAD_BITMAP, __pa(vmx_vmread_bitmap)); - vmcs_write64(VMWRITE_BITMAP, __pa(vmx_vmwrite_bitmap)); + vmcs_write64(vcpu, VMREAD_BITMAP, __pa(vmx_vmread_bitmap)); + vmcs_write64(vcpu, VMWRITE_BITMAP, __pa(vmx_vmwrite_bitmap)); } if (cpu_has_vmx_msr_bitmap()) - vmcs_write64(MSR_BITMAP, __pa(vmx_msr_bitmap_legacy)); + vmcs_write64(vcpu, MSR_BITMAP, __pa(vmx_msr_bitmap_legacy)); - vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */ + vmcs_write64(vcpu, VMCS_LINK_POINTER, (u64)-1); /* 22.3.1.5 */ /* Control */ - vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, vmx_pin_based_exec_ctrl(vmx)); - vmx->hv_deadline_tsc = -1; + vmcs_write32(vcpu, PIN_BASED_VM_EXEC_CONTROL, vmx_pin_based_exec_ctrl(vmx)); - vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, vmx_exec_control(vmx)); + vmcs_write32(vcpu, CPU_BASED_VM_EXEC_CONTROL, vmx_exec_control(vmx)); if (cpu_has_secondary_exec_ctrls()) { - vmcs_write32(SECONDARY_VM_EXEC_CONTROL, + vmcs_write32(vcpu, SECONDARY_VM_EXEC_CONTROL, vmx_secondary_exec_control(vmx)); } if (kvm_vcpu_apicv_active(&vmx->vcpu)) { - vmcs_write64(EOI_EXIT_BITMAP0, 0); - vmcs_write64(EOI_EXIT_BITMAP1, 0); - vmcs_write64(EOI_EXIT_BITMAP2, 0); - vmcs_write64(EOI_EXIT_BITMAP3, 0); - - vmcs_write16(GUEST_INTR_STATUS, 0); + vmcs_write64(vcpu, EOI_EXIT_BITMAP0, 0); + vmcs_write64(vcpu, EOI_EXIT_BITMAP1, 0); + vmcs_write64(vcpu, EOI_EXIT_BITMAP2, 0); + vmcs_write64(vcpu, EOI_EXIT_BITMAP3, 0); - vmcs_write16(POSTED_INTR_NV, POSTED_INTR_VECTOR); - vmcs_write64(POSTED_INTR_DESC_ADDR, __pa((&vmx->pi_desc))); - } + vmcs_write16(vcpu, GUEST_INTR_STATUS, 0); - if (ple_gap) { - vmcs_write32(PLE_GAP, ple_gap); - vmx->ple_window = ple_window; - vmx->ple_window_dirty = true; + //vmcs_write16(vcpu, POSTED_INTR_NV, POSTED_INTR_VECTOR); + //vmcs_write64(vcpu, POSTED_INTR_DESC_ADDR, __pa((&vmx->pi_desc))); } - vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0); - vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0); - vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */ + vmcs_write32(vcpu, PAGE_FAULT_ERROR_CODE_MASK, 0); + vmcs_write32(vcpu, PAGE_FAULT_ERROR_CODE_MATCH, 0); + vmcs_write32(vcpu, CR3_TARGET_COUNT, 0); /* 22.2.1 */ - vmcs_write16(HOST_FS_SELECTOR, 0); /* 22.2.4 */ - vmcs_write16(HOST_GS_SELECTOR, 0); /* 22.2.4 */ + vmcs_write16(vcpu, HOST_FS_SELECTOR, 0); /* 22.2.4 */ + vmcs_write16(vcpu, HOST_GS_SELECTOR, 0); /* 22.2.4 */ vmx_set_constant_host_state(vmx); #ifdef CONFIG_X86_64 rdmsrl(MSR_FS_BASE, a); - vmcs_writel(HOST_FS_BASE, a); /* 22.2.4 */ + vmcs_writel(vcpu, HOST_FS_BASE, a); /* 22.2.4 */ rdmsrl(MSR_GS_BASE, a); - vmcs_writel(HOST_GS_BASE, a); /* 22.2.4 */ + vmcs_writel(vcpu, HOST_GS_BASE, a); /* 22.2.4 */ #else - vmcs_writel(HOST_FS_BASE, 0); /* 22.2.4 */ - vmcs_writel(HOST_GS_BASE, 0); /* 22.2.4 */ + vmcs_writel(vcpu, HOST_FS_BASE, 0); /* 22.2.4 */ + vmcs_writel(vcpu, HOST_GS_BASE, 0); /* 22.2.4 */ #endif - vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0); - 
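The msr_autoload.host/guest areas wired to VM_EXIT_MSR_LOAD_ADDR and VM_ENTRY_MSR_LOAD_ADDR in this hunk use the architectural MSR load/store list format; the sketch below only restates that layout (16-byte entries, 16-byte-aligned list, entry count programmed into the matching *_MSR_*_COUNT field, zero here because the lists start empty) and is not the port's own code:

#include <stdint.h>

struct vmx_msr_entry {
	uint32_t index;		/* MSR number */
	uint32_t reserved;	/* must be zero */
	uint64_t value;		/* value loaded (or stored) on VM entry/exit */
};

/* Append an MSR to a load list and return the new count to program into the VMCS. */
static unsigned int msr_list_add(struct vmx_msr_entry *list, unsigned int count,
				 uint32_t msr, uint64_t value)
{
	list[count].index = msr;
	list[count].reserved = 0;
	list[count].value = value;
	return count + 1;
}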
vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0); - vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host)); - vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0); - vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest)); + vmcs_write32(vcpu, VM_EXIT_MSR_STORE_COUNT, 0); + vmcs_write32(vcpu, VM_EXIT_MSR_LOAD_COUNT, 0); + vmcs_write64(vcpu, VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host)); + vmcs_write32(vcpu, VM_ENTRY_MSR_LOAD_COUNT, 0); + vmcs_write64(vcpu, VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest)); if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) - vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat); - - for (i = 0; i < ARRAY_SIZE(vmx_msr_index); ++i) { - u32 index = vmx_msr_index[i]; - u32 data_low, data_high; - int j = vmx->nmsrs; - - if (rdmsr_safe(index, &data_low, &data_high) < 0) - continue; - if (wrmsr_safe(index, data_low, data_high) < 0) - continue; - vmx->guest_msrs[j].index = i; - vmx->guest_msrs[j].data = 0; - vmx->guest_msrs[j].mask = -1ull; - ++vmx->nmsrs; - } - + vmcs_write64(vcpu, GUEST_IA32_PAT, vmx->vcpu.arch.pat); vm_exit_controls_init(vmx, vmcs_config.vmexit_ctrl); /* 22.2.1, 20.8.1 */ vm_entry_controls_init(vmx, vmcs_config.vmentry_ctrl); - vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL); + vmcs_writel(vcpu, CR0_GUEST_HOST_MASK, ~0UL); set_cr4_guest_host_mask(vmx); if (vmx_xsaves_supported()) - vmcs_write64(XSS_EXIT_BITMAP, VMX_XSS_EXIT_BITMAP); + vmcs_write64(vcpu, XSS_EXIT_BITMAP, VMX_XSS_EXIT_BITMAP); if (enable_pml) { ASSERT(vmx->pml_pg); - vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg)); - vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1); + vmcs_write64(vcpu, PML_ADDRESS, page_to_phys(vmx->pml_pg)); + vmcs_write16(vcpu, GUEST_PML_INDEX, PML_ENTITY_NUM - 1); } return 0; } - static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -5135,72 +3854,68 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) vmx_segment_cache_clear(vmx); - seg_setup(VCPU_SREG_CS); - vmcs_write16(GUEST_CS_SELECTOR, 0xf000); - vmcs_writel(GUEST_CS_BASE, 0xffff0000ul); + seg_setup(vcpu, VCPU_SREG_CS); + vmcs_write16(vcpu, GUEST_CS_SELECTOR, 0xf000); + vmcs_writel(vcpu, GUEST_CS_BASE, 0xffff0000ul); - seg_setup(VCPU_SREG_DS); - seg_setup(VCPU_SREG_ES); - seg_setup(VCPU_SREG_FS); - seg_setup(VCPU_SREG_GS); - seg_setup(VCPU_SREG_SS); + seg_setup(vcpu, VCPU_SREG_DS); + seg_setup(vcpu, VCPU_SREG_ES); + seg_setup(vcpu, VCPU_SREG_FS); + seg_setup(vcpu, VCPU_SREG_GS); + seg_setup(vcpu, VCPU_SREG_SS); - vmcs_write16(GUEST_TR_SELECTOR, 0); - vmcs_writel(GUEST_TR_BASE, 0); - vmcs_write32(GUEST_TR_LIMIT, 0xffff); - vmcs_write32(GUEST_TR_AR_BYTES, 0x008b); + vmcs_write16(vcpu, GUEST_TR_SELECTOR, 0); + vmcs_writel(vcpu, GUEST_TR_BASE, 0); + vmcs_write32(vcpu, GUEST_TR_LIMIT, 0xffff); + vmcs_write32(vcpu, GUEST_TR_AR_BYTES, 0x008b); - vmcs_write16(GUEST_LDTR_SELECTOR, 0); - vmcs_writel(GUEST_LDTR_BASE, 0); - vmcs_write32(GUEST_LDTR_LIMIT, 0xffff); - vmcs_write32(GUEST_LDTR_AR_BYTES, 0x00082); + vmcs_write16(vcpu, GUEST_LDTR_SELECTOR, 0); + vmcs_writel(vcpu, GUEST_LDTR_BASE, 0); + vmcs_write32(vcpu, GUEST_LDTR_LIMIT, 0xffff); + vmcs_write32(vcpu, GUEST_LDTR_AR_BYTES, 0x00082); if (!init_event) { - vmcs_write32(GUEST_SYSENTER_CS, 0); - vmcs_writel(GUEST_SYSENTER_ESP, 0); - vmcs_writel(GUEST_SYSENTER_EIP, 0); - vmcs_write64(GUEST_IA32_DEBUGCTL, 0); + vmcs_write32(vcpu, GUEST_SYSENTER_CS, 0); + vmcs_writel(vcpu, GUEST_SYSENTER_ESP, 0); + vmcs_writel(vcpu, GUEST_SYSENTER_EIP, 0); + vmcs_write64(vcpu, GUEST_IA32_DEBUGCTL, 
0); } - vmcs_writel(GUEST_RFLAGS, 0x02); + vmcs_writel(vcpu, GUEST_RFLAGS, 0x02); kvm_rip_write(vcpu, 0xfff0); - vmcs_writel(GUEST_GDTR_BASE, 0); - vmcs_write32(GUEST_GDTR_LIMIT, 0xffff); + vmcs_writel(vcpu, GUEST_GDTR_BASE, 0); + vmcs_write32(vcpu, GUEST_GDTR_LIMIT, 0xffff); - vmcs_writel(GUEST_IDTR_BASE, 0); - vmcs_write32(GUEST_IDTR_LIMIT, 0xffff); + vmcs_writel(vcpu, GUEST_IDTR_BASE, 0); + vmcs_write32(vcpu, GUEST_IDTR_LIMIT, 0xffff); - vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE); - vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0); - vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, 0); + vmcs_write32(vcpu, GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE); + vmcs_write32(vcpu, GUEST_INTERRUPTIBILITY_INFO, 0); + vmcs_writel(vcpu, GUEST_PENDING_DBG_EXCEPTIONS, 0); setup_msrs(vmx); - vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); /* 22.2.1 */ + vmcs_write32(vcpu, VM_ENTRY_INTR_INFO_FIELD, 0); /* 22.2.1 */ if (cpu_has_vmx_tpr_shadow() && !init_event) { - vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0); + vmcs_write64(vcpu, VIRTUAL_APIC_PAGE_ADDR, 0); if (cpu_need_tpr_shadow(vcpu)) - vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, + vmcs_write64(vcpu, VIRTUAL_APIC_PAGE_ADDR, __pa(vcpu->arch.apic->regs)); - vmcs_write32(TPR_THRESHOLD, 0); + vmcs_write32(vcpu, TPR_THRESHOLD, 0); } - kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu); - - if (kvm_vcpu_apicv_active(vcpu)) - memset(&vmx->pi_desc, 0, sizeof(struct pi_desc)); + kvm_make_request(GVM_REQ_APIC_PAGE_RELOAD, vcpu); if (vmx->vpid != 0) - vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid); + vmcs_write16(vcpu, VIRTUAL_PROCESSOR_ID, vmx->vpid); cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET; vmx->vcpu.arch.cr0 = cr0; vmx_set_cr0(vcpu, cr0); /* enter rmode */ vmx_set_cr4(vcpu, 0); vmx_set_efer(vcpu, 0); - vmx_fpu_activate(vcpu); update_exception_bitmap(vcpu); vpid_sync_context(vmx->vpid); @@ -5236,9 +3951,9 @@ static void enable_irq_window(struct kvm_vcpu *vcpu) { u32 cpu_based_vm_exec_control; - cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); + cpu_based_vm_exec_control = vmcs_read32(vcpu, CPU_BASED_VM_EXEC_CONTROL); cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING; - vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); + vmcs_write32(vcpu, CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); } static void enable_nmi_window(struct kvm_vcpu *vcpu) @@ -5246,14 +3961,14 @@ static void enable_nmi_window(struct kvm_vcpu *vcpu) u32 cpu_based_vm_exec_control; if (!cpu_has_virtual_nmis() || - vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) { + vmcs_read32(vcpu, GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) { enable_irq_window(vcpu); return; } - cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); + cpu_based_vm_exec_control = vmcs_read32(vcpu, CPU_BASED_VM_EXEC_CONTROL); cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_NMI_PENDING; - vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); + vmcs_write32(vcpu, CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); } static void vmx_inject_irq(struct kvm_vcpu *vcpu) @@ -5262,25 +3977,23 @@ static void vmx_inject_irq(struct kvm_vcpu *vcpu) uint32_t intr; int irq = vcpu->arch.interrupt.nr; - trace_kvm_inj_virq(irq); - ++vcpu->stat.irq_injections; if (vmx->rmode.vm86_active) { int inc_eip = 0; if (vcpu->arch.interrupt.soft) inc_eip = vcpu->arch.event_exit_inst_len; if (kvm_inject_realmode_interrupt(vcpu, irq, inc_eip) != EMULATE_DONE) - kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); + kvm_make_request(GVM_REQ_TRIPLE_FAULT, vcpu); return; } 
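The injection paths on either side of this point (vmx_inject_irq(), vmx_inject_nmi()) all funnel into VM_ENTRY_INTR_INFO_FIELD; its encoding is architectural (bits 7:0 vector, bits 10:8 event type, bit 11 deliver-error-code, bit 31 valid), and the constants in the sketch below mirror the ones already used in the diff rather than anything new:

#include <stdint.h>

#define INTR_INFO_VALID_MASK		(1u << 31)
#define INTR_INFO_DELIVER_CODE_MASK	(1u << 11)
#define INTR_TYPE_EXT_INTR		(0u << 8)	/* external interrupt */
#define INTR_TYPE_NMI_INTR		(2u << 8)	/* NMI */
#define INTR_TYPE_HARD_EXCEPTION	(3u << 8)	/* #PF, #GP, ... */
#define INTR_TYPE_SOFT_INTR		(4u << 8)	/* INT n */

static uint32_t encode_intr_info(uint8_t vector, uint32_t type, int has_error_code)
{
	uint32_t info = vector | type | INTR_INFO_VALID_MASK;

	if (has_error_code)
		info |= INTR_INFO_DELIVER_CODE_MASK;
	return info;
}

/* Example: a hardware #GP with error code is encode_intr_info(13, INTR_TYPE_HARD_EXCEPTION, 1). */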
intr = irq | INTR_INFO_VALID_MASK; if (vcpu->arch.interrupt.soft) { intr |= INTR_TYPE_SOFT_INTR; - vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, + vmcs_write32(vcpu, VM_ENTRY_INSTRUCTION_LEN, vmx->vcpu.arch.event_exit_inst_len); } else intr |= INTR_TYPE_EXT_INTR; - vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr); + vmcs_write32(vcpu, VM_ENTRY_INTR_INFO_FIELD, intr); } static void vmx_inject_nmi(struct kvm_vcpu *vcpu) @@ -5307,11 +4020,11 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu) if (vmx->rmode.vm86_active) { if (kvm_inject_realmode_interrupt(vcpu, NMI_VECTOR, 0) != EMULATE_DONE) - kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); + kvm_make_request(GVM_REQ_TRIPLE_FAULT, vcpu); return; } - vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, + vmcs_write32(vcpu, VM_ENTRY_INTR_INFO_FIELD, INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR); } @@ -5321,7 +4034,7 @@ static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu) return to_vmx(vcpu)->soft_vnmi_blocked; if (to_vmx(vcpu)->nmi_known_unmasked) return false; - return vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_NMI; + return vmcs_read32(vcpu, GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_NMI; } static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked) @@ -5336,10 +4049,10 @@ static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked) } else { vmx->nmi_known_unmasked = !masked; if (masked) - vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, + vmcs_set_bits(vcpu, GUEST_INTERRUPTIBILITY_INFO, GUEST_INTR_STATE_NMI); else - vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO, + vmcs_clear_bits(vcpu, GUEST_INTERRUPTIBILITY_INFO, GUEST_INTR_STATE_NMI); } } @@ -5352,7 +4065,7 @@ static int vmx_nmi_allowed(struct kvm_vcpu *vcpu) if (!cpu_has_virtual_nmis() && to_vmx(vcpu)->soft_vnmi_blocked) return 0; - return !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & + return !(vmcs_read32(vcpu, GUEST_INTERRUPTIBILITY_INFO) & (GUEST_INTR_STATE_MOV_SS | GUEST_INTR_STATE_STI | GUEST_INTR_STATE_NMI)); } @@ -5360,8 +4073,8 @@ static int vmx_nmi_allowed(struct kvm_vcpu *vcpu) static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu) { return (!to_vmx(vcpu)->nested.nested_run_pending && - vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) && - !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & + vmcs_readl(vcpu, GUEST_RFLAGS) & X86_EFLAGS_IF) && + !(vmcs_read32(vcpu, GUEST_INTERRUPTIBILITY_INFO) & (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS)); } @@ -5386,13 +4099,13 @@ static bool rmode_exception(struct kvm_vcpu *vcpu, int vec) * from user space while in guest debugging mode. */ to_vmx(vcpu)->vcpu.arch.event_exit_inst_len = - vmcs_read32(VM_EXIT_INSTRUCTION_LEN); - if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) + vmcs_read32(vcpu, VM_EXIT_INSTRUCTION_LEN); + if (vcpu->guest_debug & GVM_GUESTDBG_USE_SW_BP) return false; /* fall through */ case DB_VECTOR: if (vcpu->guest_debug & - (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) + (GVM_GUESTDBG_SINGLESTEP | GVM_GUESTDBG_USE_HW_BP)) return false; /* fall through */ case DE_VECTOR: @@ -5445,14 +4158,14 @@ static int handle_rmode_exception(struct kvm_vcpu *vcpu, */ static void kvm_machine_check(void) { -#if defined(CONFIG_X86_MCE) && defined(CONFIG_X86_64) - struct pt_regs regs = { - .cs = 3, /* Fake ring 3 no matter what the guest ran on */ - .flags = X86_EFLAGS_IF, - }; + /* + * On an #MC intercept the MCE handler is not called automatically in + * the host. So do it by hand here. 
+ */ + __int12(); + /* not sure if we ever come back to this point */ - do_machine_check(®s, 0); -#endif + return; } static int handle_machine_check(struct kvm_vcpu *vcpu) @@ -5466,7 +4179,7 @@ static int handle_exception(struct kvm_vcpu *vcpu) struct vcpu_vmx *vmx = to_vmx(vcpu); struct kvm_run *kvm_run = vcpu->run; u32 intr_info, ex_no, error_code; - unsigned long cr2, rip, dr6; + size_t cr2, rip, dr6; u32 vect_info; enum emulation_result er; @@ -5479,11 +4192,6 @@ static int handle_exception(struct kvm_vcpu *vcpu) if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR) return 1; /* already handled by vmx_vcpu_run() */ - if (is_no_device(intr_info)) { - vmx_fpu_activate(vcpu); - return 1; - } - if (is_invalid_opcode(intr_info)) { if (is_guest_mode(vcpu)) { kvm_queue_exception(vcpu, UD_VECTOR); @@ -5497,7 +4205,7 @@ static int handle_exception(struct kvm_vcpu *vcpu) error_code = 0; if (intr_info & INTR_INFO_DELIVER_CODE_MASK) - error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE); + error_code = vmcs_read32(vcpu, VM_EXIT_INTR_ERROR_CODE); /* * The #PF with PFEC.RSVD = 1 indicates the guest is accessing @@ -5506,8 +4214,8 @@ static int handle_exception(struct kvm_vcpu *vcpu) */ if ((vect_info & VECTORING_INFO_VALID_MASK) && !(is_page_fault(intr_info) && !(error_code & PFERR_RSVD_MASK))) { - vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; - vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_SIMUL_EX; + vcpu->run->exit_reason = GVM_EXIT_INTERNAL_ERROR; + vcpu->run->internal.suberror = GVM_INTERNAL_ERROR_SIMUL_EX; vcpu->run->internal.ndata = 3; vcpu->run->internal.data[0] = vect_info; vcpu->run->internal.data[1] = intr_info; @@ -5518,8 +4226,7 @@ static int handle_exception(struct kvm_vcpu *vcpu) if (is_page_fault(intr_info)) { /* EPT won't cause page fault directly */ BUG_ON(enable_ept); - cr2 = vmcs_readl(EXIT_QUALIFICATION); - trace_kvm_page_fault(cr2, error_code); + cr2 = vmcs_readl(vcpu, EXIT_QUALIFICATION); if (kvm_event_needs_reinjection(vcpu)) kvm_mmu_unprotect_page_virt(vcpu, cr2); @@ -5536,9 +4243,9 @@ static int handle_exception(struct kvm_vcpu *vcpu) kvm_queue_exception_e(vcpu, AC_VECTOR, error_code); return 1; case DB_VECTOR: - dr6 = vmcs_readl(EXIT_QUALIFICATION); + dr6 = vmcs_readl(vcpu, EXIT_QUALIFICATION); if (!(vcpu->guest_debug & - (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))) { + (GVM_GUESTDBG_SINGLESTEP | GVM_GUESTDBG_USE_HW_BP))) { vcpu->arch.dr6 &= ~15; vcpu->arch.dr6 |= dr6 | DR6_RTM; if (!(dr6 & ~DR6_RESERVED)) /* icebp */ @@ -5548,7 +4255,7 @@ static int handle_exception(struct kvm_vcpu *vcpu) return 1; } kvm_run->debug.arch.dr6 = dr6 | DR6_FIXED_1; - kvm_run->debug.arch.dr7 = vmcs_readl(GUEST_DR7); + kvm_run->debug.arch.dr7 = vmcs_readl(vcpu, GUEST_DR7); /* fall through */ case BP_VECTOR: /* @@ -5557,14 +4264,14 @@ static int handle_exception(struct kvm_vcpu *vcpu) * #DB as well causes no harm, it is not used in that case. 
*/ vmx->vcpu.arch.event_exit_inst_len = - vmcs_read32(VM_EXIT_INSTRUCTION_LEN); - kvm_run->exit_reason = KVM_EXIT_DEBUG; + vmcs_read32(vcpu, VM_EXIT_INSTRUCTION_LEN); + kvm_run->exit_reason = GVM_EXIT_DEBUG; rip = kvm_rip_read(vcpu); - kvm_run->debug.arch.pc = vmcs_readl(GUEST_CS_BASE) + rip; + kvm_run->debug.arch.pc = vmcs_readl(vcpu, GUEST_CS_BASE) + rip; kvm_run->debug.arch.exception = ex_no; break; default: - kvm_run->exit_reason = KVM_EXIT_EXCEPTION; + kvm_run->exit_reason = GVM_EXIT_EXCEPTION; kvm_run->ex.exception = ex_no; kvm_run->ex.error_code = error_code; break; @@ -5580,17 +4287,17 @@ static int handle_external_interrupt(struct kvm_vcpu *vcpu) static int handle_triple_fault(struct kvm_vcpu *vcpu) { - vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN; + vcpu->run->exit_reason = GVM_EXIT_SHUTDOWN; return 0; } static int handle_io(struct kvm_vcpu *vcpu) { - unsigned long exit_qualification; + size_t exit_qualification; int size, in, string; unsigned port; - exit_qualification = vmcs_readl(EXIT_QUALIFICATION); + exit_qualification = vmcs_readl(vcpu, EXIT_QUALIFICATION); string = (exit_qualification & 16) != 0; in = (exit_qualification & 8) != 0; @@ -5606,20 +4313,9 @@ static int handle_io(struct kvm_vcpu *vcpu) return kvm_fast_pio_out(vcpu, size, port); } -static void -vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall) +static bool nested_cr0_valid(struct kvm_vcpu *vcpu, size_t val) { - /* - * Patch in the VMCALL instruction: - */ - hypercall[0] = 0x0f; - hypercall[1] = 0x01; - hypercall[2] = 0xc1; -} - -static bool nested_cr0_valid(struct kvm_vcpu *vcpu, unsigned long val) -{ - unsigned long always_on = VMXON_CR0_ALWAYSON; + size_t always_on = VMXON_CR0_ALWAYSON; struct vmcs12 *vmcs12 = get_vmcs12(vcpu); if (to_vmx(vcpu)->nested.nested_vmx_secondary_ctls_high & @@ -5630,11 +4326,11 @@ static bool nested_cr0_valid(struct kvm_vcpu *vcpu, unsigned long val) } /* called to set cr0 as appropriate for a mov-to-cr0 exit. */ -static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val) +static int handle_set_cr0(struct kvm_vcpu *vcpu, size_t val) { if (is_guest_mode(vcpu)) { struct vmcs12 *vmcs12 = get_vmcs12(vcpu); - unsigned long orig_val = val; + size_t orig_val = val; /* * We get here when L2 changed cr0 in a way that did not change @@ -5652,7 +4348,7 @@ static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val) if (kvm_set_cr0(vcpu, val)) return 1; - vmcs_writel(CR0_READ_SHADOW, orig_val); + vmcs_writel(vcpu, CR0_READ_SHADOW, orig_val); return 0; } else { if (to_vmx(vcpu)->nested.vmxon && @@ -5662,53 +4358,36 @@ static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val) } } -static int handle_set_cr4(struct kvm_vcpu *vcpu, unsigned long val) +static int handle_set_cr4(struct kvm_vcpu *vcpu, size_t val) { if (is_guest_mode(vcpu)) { struct vmcs12 *vmcs12 = get_vmcs12(vcpu); - unsigned long orig_val = val; + size_t orig_val = val; /* analogously to handle_set_cr0 */ val = (val & ~vmcs12->cr4_guest_host_mask) | (vmcs12->guest_cr4 & vmcs12->cr4_guest_host_mask); if (kvm_set_cr4(vcpu, val)) return 1; - vmcs_writel(CR4_READ_SHADOW, orig_val); + vmcs_writel(vcpu, CR4_READ_SHADOW, orig_val); return 0; } else return kvm_set_cr4(vcpu, val); } -/* called to set cr0 as appropriate for clts instruction exit. */ -static void handle_clts(struct kvm_vcpu *vcpu) -{ - if (is_guest_mode(vcpu)) { - /* - * We get here when L2 did CLTS, and L1 didn't shadow CR0.TS - * but we did (!fpu_active). 
We need to keep GUEST_CR0.TS on, - * just pretend it's off (also in arch.cr0 for fpu_activate). - */ - vmcs_writel(CR0_READ_SHADOW, - vmcs_readl(CR0_READ_SHADOW) & ~X86_CR0_TS); - vcpu->arch.cr0 &= ~X86_CR0_TS; - } else - vmx_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS)); -} - static int handle_cr(struct kvm_vcpu *vcpu) { - unsigned long exit_qualification, val; + size_t exit_qualification, val; int cr; int reg; int err; - exit_qualification = vmcs_readl(EXIT_QUALIFICATION); + exit_qualification = vmcs_readl(vcpu, EXIT_QUALIFICATION); cr = exit_qualification & 15; reg = (exit_qualification >> 8) & 15; switch ((exit_qualification >> 4) & 3) { case 0: /* mov to cr */ val = kvm_register_readl(vcpu, reg); - trace_kvm_cr_write(cr, val); switch (cr) { case 0: err = handle_set_cr0(vcpu, val); @@ -5731,36 +4410,31 @@ static int handle_cr(struct kvm_vcpu *vcpu) return 1; if (cr8_prev <= cr8) return 1; - vcpu->run->exit_reason = KVM_EXIT_SET_TPR; + vcpu->run->exit_reason = GVM_EXIT_SET_TPR; return 0; } } break; case 2: /* clts */ - handle_clts(vcpu); - trace_kvm_cr_write(0, kvm_read_cr0(vcpu)); + vmx_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS)); skip_emulated_instruction(vcpu); - vmx_fpu_activate(vcpu); return 1; case 1: /*mov from cr*/ switch (cr) { case 3: val = kvm_read_cr3(vcpu); kvm_register_write(vcpu, reg, val); - trace_kvm_cr_read(cr, val); skip_emulated_instruction(vcpu); return 1; case 8: val = kvm_get_cr8(vcpu); kvm_register_write(vcpu, reg, val); - trace_kvm_cr_read(cr, val); skip_emulated_instruction(vcpu); return 1; } break; case 3: /* lmsw */ val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f; - trace_kvm_cr_write(0, (kvm_read_cr0(vcpu) & ~0xful) | val); kvm_lmsw(vcpu, val); skip_emulated_instruction(vcpu); @@ -5776,10 +4450,10 @@ static int handle_cr(struct kvm_vcpu *vcpu) static int handle_dr(struct kvm_vcpu *vcpu) { - unsigned long exit_qualification; + size_t exit_qualification; int dr, dr7, reg; - exit_qualification = vmcs_readl(EXIT_QUALIFICATION); + exit_qualification = vmcs_readl(vcpu, EXIT_QUALIFICATION); dr = exit_qualification & DEBUG_REG_ACCESS_NUM; /* First, if DR does not exist, trigger UD */ @@ -5789,19 +4463,19 @@ static int handle_dr(struct kvm_vcpu *vcpu) /* Do not handle if the CPL > 0, will trigger GP on re-entry */ if (!kvm_require_cpl(vcpu, 0)) return 1; - dr7 = vmcs_readl(GUEST_DR7); + dr7 = vmcs_readl(vcpu, GUEST_DR7); if (dr7 & DR7_GD) { /* * As the vm-exit takes precedence over the debug trap, we * need to emulate the latter, either for the host or the * guest debugging itself. */ - if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) { + if (vcpu->guest_debug & GVM_GUESTDBG_USE_HW_BP) { vcpu->run->debug.arch.dr6 = vcpu->arch.dr6; vcpu->run->debug.arch.dr7 = dr7; vcpu->run->debug.arch.pc = kvm_get_linear_rip(vcpu); vcpu->run->debug.arch.exception = DB_VECTOR; - vcpu->run->exit_reason = KVM_EXIT_DEBUG; + vcpu->run->exit_reason = GVM_EXIT_DEBUG; return 0; } else { vcpu->arch.dr6 &= ~15; @@ -5812,7 +4486,7 @@ static int handle_dr(struct kvm_vcpu *vcpu) } if (vcpu->guest_debug == 0) { - vmcs_clear_bits(CPU_BASED_VM_EXEC_CONTROL, + vmcs_clear_bits(vcpu, CPU_BASED_VM_EXEC_CONTROL, CPU_BASED_MOV_DR_EXITING); /* @@ -5820,13 +4494,13 @@ static int handle_dr(struct kvm_vcpu *vcpu) * and reenter on this instruction. The next vmexit will * retrieve the full state of the debug registers. 
*/ - vcpu->arch.switch_db_regs |= KVM_DEBUGREG_WONT_EXIT; + vcpu->arch.switch_db_regs |= GVM_DEBUGREG_WONT_EXIT; return 1; } reg = DEBUG_REG_ACCESS_REG(exit_qualification); if (exit_qualification & TYPE_MOV_FROM_DR) { - unsigned long val; + size_t val; if (kvm_get_dr(vcpu, dr, &val)) return 1; @@ -5844,7 +4518,7 @@ static u64 vmx_get_dr6(struct kvm_vcpu *vcpu) return vcpu->arch.dr6; } -static void vmx_set_dr6(struct kvm_vcpu *vcpu, unsigned long val) +static void vmx_set_dr6(struct kvm_vcpu *vcpu, size_t val) { } @@ -5855,15 +4529,15 @@ static void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu) get_debugreg(vcpu->arch.db[2], 2); get_debugreg(vcpu->arch.db[3], 3); get_debugreg(vcpu->arch.dr6, 6); - vcpu->arch.dr7 = vmcs_readl(GUEST_DR7); + vcpu->arch.dr7 = vmcs_readl(vcpu, GUEST_DR7); - vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT; - vmcs_set_bits(CPU_BASED_VM_EXEC_CONTROL, CPU_BASED_MOV_DR_EXITING); + vcpu->arch.switch_db_regs &= ~GVM_DEBUGREG_WONT_EXIT; + vmcs_set_bits(vcpu, CPU_BASED_VM_EXEC_CONTROL, CPU_BASED_MOV_DR_EXITING); } -static void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val) +static void vmx_set_dr7(struct kvm_vcpu *vcpu, size_t val) { - vmcs_writel(GUEST_DR7, val); + vmcs_writel(vcpu, GUEST_DR7, val); } static int handle_cpuid(struct kvm_vcpu *vcpu) @@ -5880,16 +4554,13 @@ static int handle_rdmsr(struct kvm_vcpu *vcpu) msr_info.index = ecx; msr_info.host_initiated = false; if (vmx_get_msr(vcpu, &msr_info)) { - trace_kvm_msr_read_ex(ecx); kvm_inject_gp(vcpu, 0); return 1; } - trace_kvm_msr_read(ecx, msr_info.data); - /* FIXME: handling of bits 32:63 of rax, rdx */ - vcpu->arch.regs[VCPU_REGS_RAX] = msr_info.data & -1u; - vcpu->arch.regs[VCPU_REGS_RDX] = (msr_info.data >> 32) & -1u; + vcpu->arch.regs[VCPU_REGS_RAX] = msr_info.data & (unsigned)-1; + vcpu->arch.regs[VCPU_REGS_RDX] = (msr_info.data >> 32) & (unsigned)-1; skip_emulated_instruction(vcpu); return 1; } @@ -5898,26 +4569,24 @@ static int handle_wrmsr(struct kvm_vcpu *vcpu) { struct msr_data msr; u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX]; - u64 data = (vcpu->arch.regs[VCPU_REGS_RAX] & -1u) - | ((u64)(vcpu->arch.regs[VCPU_REGS_RDX] & -1u) << 32); + u64 data = (vcpu->arch.regs[VCPU_REGS_RAX] & (unsigned)-1) + | ((u64)(vcpu->arch.regs[VCPU_REGS_RDX] & (unsigned)-1) << 32); msr.data = data; msr.index = ecx; msr.host_initiated = false; if (kvm_set_msr(vcpu, &msr) != 0) { - trace_kvm_msr_write_ex(ecx, data); kvm_inject_gp(vcpu, 0); return 1; } - trace_kvm_msr_write(ecx, data); skip_emulated_instruction(vcpu); return 1; } static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu) { - kvm_make_request(KVM_REQ_EVENT, vcpu); + kvm_make_request(GVM_REQ_EVENT, vcpu); return 1; } @@ -5926,11 +4595,11 @@ static int handle_interrupt_window(struct kvm_vcpu *vcpu) u32 cpu_based_vm_exec_control; /* clear pending irq */ - cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); + cpu_based_vm_exec_control = vmcs_read32(vcpu, CPU_BASED_VM_EXEC_CONTROL); cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING; - vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); + vmcs_write32(vcpu, CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); - kvm_make_request(KVM_REQ_EVENT, vcpu); + kvm_make_request(GVM_REQ_EVENT, vcpu); ++vcpu->stat.irq_window_exits; return 1; @@ -5941,11 +4610,6 @@ static int handle_halt(struct kvm_vcpu *vcpu) return kvm_emulate_halt(vcpu); } -static int handle_vmcall(struct kvm_vcpu *vcpu) -{ - return kvm_emulate_hypercall(vcpu); -} - static int handle_invd(struct kvm_vcpu 
*vcpu) { return emulate_instruction(vcpu, 0) == EMULATE_DONE; @@ -5953,7 +4617,7 @@ static int handle_invd(struct kvm_vcpu *vcpu) static int handle_invlpg(struct kvm_vcpu *vcpu) { - unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); + size_t exit_qualification = vmcs_readl(vcpu, EXIT_QUALIFICATION); kvm_mmu_invlpg(vcpu, exit_qualification); skip_emulated_instruction(vcpu); @@ -5962,10 +4626,12 @@ static int handle_invlpg(struct kvm_vcpu *vcpu) static int handle_rdpmc(struct kvm_vcpu *vcpu) { +#if 0 int err; err = kvm_rdpmc(vcpu); kvm_complete_insn_gp(vcpu, err); +#endif return 1; } @@ -5989,21 +4655,21 @@ static int handle_xsetbv(struct kvm_vcpu *vcpu) static int handle_xsaves(struct kvm_vcpu *vcpu) { skip_emulated_instruction(vcpu); - WARN(1, "this should never happen\n"); + //WARN(1, "this should never happen\n"); return 1; } static int handle_xrstors(struct kvm_vcpu *vcpu) { skip_emulated_instruction(vcpu); - WARN(1, "this should never happen\n"); + //WARN(1, "this should never happen\n"); return 1; } static int handle_apic_access(struct kvm_vcpu *vcpu) { if (likely(fasteoi)) { - unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); + size_t exit_qualification = vmcs_readl(vcpu, EXIT_QUALIFICATION); int access_type, offset; access_type = exit_qualification & APIC_ACCESS_TYPE; @@ -6025,7 +4691,7 @@ static int handle_apic_access(struct kvm_vcpu *vcpu) static int handle_apic_eoi_induced(struct kvm_vcpu *vcpu) { - unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); + size_t exit_qualification = vmcs_readl(vcpu, EXIT_QUALIFICATION); int vector = exit_qualification & 0xff; /* EOI-induced VM exit is trap-like and thus no need to adjust IP */ @@ -6035,7 +4701,7 @@ static int handle_apic_eoi_induced(struct kvm_vcpu *vcpu) static int handle_apic_write(struct kvm_vcpu *vcpu) { - unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); + size_t exit_qualification = vmcs_readl(vcpu, EXIT_QUALIFICATION); u32 offset = exit_qualification & 0xfff; /* APIC-write VM exit is trap-like and thus no need to adjust IP */ @@ -6046,7 +4712,7 @@ static int handle_apic_write(struct kvm_vcpu *vcpu) static int handle_task_switch(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); - unsigned long exit_qualification; + size_t exit_qualification; bool has_error_code = false; u32 error_code = 0; u16 tss_selector; @@ -6056,7 +4722,7 @@ static int handle_task_switch(struct kvm_vcpu *vcpu) idt_index = (vmx->idt_vectoring_info & VECTORING_INFO_VECTOR_MASK); type = (vmx->idt_vectoring_info & VECTORING_INFO_TYPE_MASK); - exit_qualification = vmcs_readl(EXIT_QUALIFICATION); + exit_qualification = vmcs_readl(vcpu, EXIT_QUALIFICATION); reason = (u32)exit_qualification >> 30; if (reason == TASK_SWITCH_GATE && idt_v) { @@ -6074,7 +4740,7 @@ static int handle_task_switch(struct kvm_vcpu *vcpu) VECTORING_INFO_DELIVER_CODE_MASK) { has_error_code = true; error_code = - vmcs_read32(IDT_VECTORING_ERROR_CODE); + vmcs_read32(vcpu, IDT_VECTORING_ERROR_CODE); } /* fall through */ case INTR_TYPE_SOFT_EXCEPTION: @@ -6094,8 +4760,8 @@ static int handle_task_switch(struct kvm_vcpu *vcpu) if (kvm_task_switch(vcpu, tss_selector, type == INTR_TYPE_SOFT_INTR ? 
idt_index : -1, reason, has_error_code, error_code) == EMULATE_FAIL) { - vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; - vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; + vcpu->run->exit_reason = GVM_EXIT_INTERNAL_ERROR; + vcpu->run->internal.suberror = GVM_INTERNAL_ERROR_EMULATION; vcpu->run->internal.ndata = 0; return 0; } @@ -6110,22 +4776,22 @@ static int handle_task_switch(struct kvm_vcpu *vcpu) static int handle_ept_violation(struct kvm_vcpu *vcpu) { - unsigned long exit_qualification; + size_t exit_qualification; gpa_t gpa; u32 error_code; int gla_validity; - exit_qualification = vmcs_readl(EXIT_QUALIFICATION); + exit_qualification = vmcs_readl(vcpu, EXIT_QUALIFICATION); gla_validity = (exit_qualification >> 7) & 0x3; if (gla_validity == 0x2) { printk(KERN_ERR "EPT: Handling EPT violation failed!\n"); printk(KERN_ERR "EPT: GPA: 0x%lx, GVA: 0x%lx\n", - (long unsigned int)vmcs_read64(GUEST_PHYSICAL_ADDRESS), - vmcs_readl(GUEST_LINEAR_ADDRESS)); + (long unsigned int)vmcs_read64(vcpu, GUEST_PHYSICAL_ADDRESS), + vmcs_readl(vcpu, GUEST_LINEAR_ADDRESS)); printk(KERN_ERR "EPT: Exit qualification is 0x%lx\n", (long unsigned int)exit_qualification); - vcpu->run->exit_reason = KVM_EXIT_UNKNOWN; + vcpu->run->exit_reason = GVM_EXIT_UNKNOWN; vcpu->run->hw.hardware_exit_reason = EXIT_REASON_EPT_VIOLATION; return 0; } @@ -6139,10 +4805,9 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu) if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) && cpu_has_virtual_nmis() && (exit_qualification & INTR_INFO_UNBLOCK_NMI)) - vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, GUEST_INTR_STATE_NMI); + vmcs_set_bits(vcpu, GUEST_INTERRUPTIBILITY_INFO, GUEST_INTR_STATE_NMI); - gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); - trace_kvm_page_fault(gpa, exit_qualification); + gpa = vmcs_read64(vcpu, GUEST_PHYSICAL_ADDRESS); /* it is a read fault? 
*/ error_code = (exit_qualification << 2) & PFERR_USER_MASK; @@ -6163,10 +4828,9 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu) int ret; gpa_t gpa; - gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); - if (!kvm_io_bus_write(vcpu, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) { + gpa = vmcs_read64(vcpu, GUEST_PHYSICAL_ADDRESS); + if (!kvm_io_bus_write(vcpu, GVM_FAST_MMIO_BUS, gpa, 0, NULL)) { skip_emulated_instruction(vcpu); - trace_kvm_fast_mmio(gpa); return 1; } @@ -6184,7 +4848,7 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu) /* It is the real ept misconfig */ WARN_ON(1); - vcpu->run->exit_reason = KVM_EXIT_UNKNOWN; + vcpu->run->exit_reason = GVM_EXIT_UNKNOWN; vcpu->run->hw.hardware_exit_reason = EXIT_REASON_EPT_MISCONFIG; return 0; @@ -6195,11 +4859,11 @@ static int handle_nmi_window(struct kvm_vcpu *vcpu) u32 cpu_based_vm_exec_control; /* clear pending NMI */ - cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); + cpu_based_vm_exec_control = vmcs_read32(vcpu, CPU_BASED_VM_EXEC_CONTROL); cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING; - vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); + vmcs_write32(vcpu, CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); ++vcpu->stat.nmi_window_exits; - kvm_make_request(KVM_REQ_EVENT, vcpu); + kvm_make_request(GVM_REQ_EVENT, vcpu); return 1; } @@ -6213,14 +4877,14 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) bool intr_window_requested; unsigned count = 130; - cpu_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); + cpu_exec_ctrl = vmcs_read32(vcpu, CPU_BASED_VM_EXEC_CONTROL); intr_window_requested = cpu_exec_ctrl & CPU_BASED_VIRTUAL_INTR_PENDING; while (vmx->emulation_required && count-- != 0) { if (intr_window_requested && vmx_interrupt_allowed(vcpu)) return handle_interrupt_window(&vmx->vcpu); - if (test_bit(KVM_REQ_EVENT, &vcpu->requests)) + if (test_bit(GVM_REQ_EVENT, &vcpu->requests)) return 1; err = emulate_instruction(vcpu, EMULTYPE_NO_REEXECUTE); @@ -6232,8 +4896,8 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) } if (err != EMULATE_DONE) { - vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; - vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; + vcpu->run->exit_reason = GVM_EXIT_INTERNAL_ERROR; + vcpu->run->internal.suberror = GVM_INTERNAL_ERROR_EMULATION; vcpu->run->internal.ndata = 0; return 0; } @@ -6244,155 +4908,65 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) goto out; } +#if 0 if (signal_pending(current)) goto out; if (need_resched()) schedule(); +#endif } out: return ret; } -static int __grow_ple_window(int val) -{ - if (ple_window_grow < 1) - return ple_window; - - val = min(val, ple_window_actual_max); - - if (ple_window_grow < ple_window) - val *= ple_window_grow; - else - val += ple_window_grow; - - return val; -} - -static int __shrink_ple_window(int val, int modifier, int minimum) -{ - if (modifier < 1) - return ple_window; - - if (modifier < ple_window) - val /= modifier; - else - val -= modifier; - - return max(val, minimum); -} - -static void grow_ple_window(struct kvm_vcpu *vcpu) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - int old = vmx->ple_window; - - vmx->ple_window = __grow_ple_window(old); - - if (vmx->ple_window != old) - vmx->ple_window_dirty = true; - - trace_kvm_ple_window_grow(vcpu->vcpu_id, vmx->ple_window, old); -} - -static void shrink_ple_window(struct kvm_vcpu *vcpu) +static int hardware_setup(void) { - struct vcpu_vmx *vmx = to_vmx(vcpu); - int old = vmx->ple_window; - - vmx->ple_window = 
__shrink_ple_window(old, - ple_window_shrink, ple_window); - - if (vmx->ple_window != old) - vmx->ple_window_dirty = true; - - trace_kvm_ple_window_shrink(vcpu->vcpu_id, vmx->ple_window, old); -} - -/* - * ple_window_actual_max is computed to be one grow_ple_window() below - * ple_window_max. (See __grow_ple_window for the reason.) - * This prevents overflows, because ple_window_max is int. - * ple_window_max effectively rounded down to a multiple of ple_window_grow in - * this process. - * ple_window_max is also prevented from setting vmx->ple_window < ple_window. - */ -static void update_ple_window_actual_max(void) -{ - ple_window_actual_max = - __shrink_ple_window(max(ple_window_max, ple_window), - ple_window_grow, INT_MIN); -} - -/* - * Handler for POSTED_INTERRUPT_WAKEUP_VECTOR. - */ -static void wakeup_handler(void) -{ - struct kvm_vcpu *vcpu; - int cpu = smp_processor_id(); - - spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu)); - list_for_each_entry(vcpu, &per_cpu(blocked_vcpu_on_cpu, cpu), - blocked_vcpu_list) { - struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); - - if (pi_test_on(pi_desc) == 1) - kvm_vcpu_kick(vcpu); - } - spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu)); -} - -static __init int hardware_setup(void) -{ - int r = -ENOMEM, i, msr; + int r = -ENOMEM, msr; rdmsrl_safe(MSR_EFER, &host_efer); - for (i = 0; i < ARRAY_SIZE(vmx_msr_index); ++i) - kvm_define_shared_msr(i, vmx_msr_index[i]); - - vmx_io_bitmap_a = (unsigned long *)__get_free_page(GFP_KERNEL); + vmx_io_bitmap_a = (size_t *)__get_free_page(GFP_KERNEL); if (!vmx_io_bitmap_a) return r; - vmx_io_bitmap_b = (unsigned long *)__get_free_page(GFP_KERNEL); + vmx_io_bitmap_b = (size_t *)__get_free_page(GFP_KERNEL); if (!vmx_io_bitmap_b) goto out; - vmx_msr_bitmap_legacy = (unsigned long *)__get_free_page(GFP_KERNEL); + vmx_msr_bitmap_legacy = (size_t *)__get_free_page(GFP_KERNEL); if (!vmx_msr_bitmap_legacy) goto out1; vmx_msr_bitmap_legacy_x2apic = - (unsigned long *)__get_free_page(GFP_KERNEL); + (size_t *)__get_free_page(GFP_KERNEL); if (!vmx_msr_bitmap_legacy_x2apic) goto out2; vmx_msr_bitmap_legacy_x2apic_apicv_inactive = - (unsigned long *)__get_free_page(GFP_KERNEL); + (size_t *)__get_free_page(GFP_KERNEL); if (!vmx_msr_bitmap_legacy_x2apic_apicv_inactive) goto out3; - vmx_msr_bitmap_longmode = (unsigned long *)__get_free_page(GFP_KERNEL); + vmx_msr_bitmap_longmode = (size_t *)__get_free_page(GFP_KERNEL); if (!vmx_msr_bitmap_longmode) goto out4; vmx_msr_bitmap_longmode_x2apic = - (unsigned long *)__get_free_page(GFP_KERNEL); + (size_t *)__get_free_page(GFP_KERNEL); if (!vmx_msr_bitmap_longmode_x2apic) goto out5; vmx_msr_bitmap_longmode_x2apic_apicv_inactive = - (unsigned long *)__get_free_page(GFP_KERNEL); + (size_t *)__get_free_page(GFP_KERNEL); if (!vmx_msr_bitmap_longmode_x2apic_apicv_inactive) goto out6; - vmx_vmread_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL); + vmx_vmread_bitmap = (size_t *)__get_free_page(GFP_KERNEL); if (!vmx_vmread_bitmap) goto out7; - vmx_vmwrite_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL); + vmx_vmwrite_bitmap = (size_t *)__get_free_page(GFP_KERNEL); if (!vmx_vmwrite_bitmap) goto out8; @@ -6421,10 +4995,6 @@ static __init int hardware_setup(void) if (!cpu_has_vmx_vpid()) enable_vpid = 0; - if (!cpu_has_vmx_shadow_vmcs()) - enable_shadow_vmcs = 0; - if (enable_shadow_vmcs) - init_vmcs_shadow_fields(); if (!cpu_has_vmx_ept() || !cpu_has_vmx_ept_4levels()) { @@ -6453,28 +5023,15 @@ static __init int hardware_setup(void) if (!cpu_has_vmx_tpr_shadow()) 
kvm_x86_ops->update_cr8_intercept = NULL; - if (enable_ept && !cpu_has_vmx_ept_2m_page()) - kvm_disable_largepages(); - - if (!cpu_has_vmx_ple()) - ple_gap = 0; - if (!cpu_has_vmx_apicv()) enable_apicv = 0; - if (cpu_has_vmx_tsc_scaling()) { - kvm_has_tsc_control = true; - kvm_max_tsc_scaling_ratio = KVM_VMX_TSC_MULTIPLIER_MAX; - kvm_tsc_scaling_ratio_frac_bits = 48; - } - vmx_disable_intercept_for_msr(MSR_FS_BASE, false); vmx_disable_intercept_for_msr(MSR_GS_BASE, false); vmx_disable_intercept_for_msr(MSR_KERNEL_GS_BASE, true); vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_CS, false); vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_ESP, false); vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false); - vmx_disable_intercept_for_msr(MSR_IA32_BNDCFGS, true); memcpy(vmx_msr_bitmap_legacy_x2apic, vmx_msr_bitmap_legacy, PAGE_SIZE); @@ -6522,8 +5079,6 @@ static __init int hardware_setup(void) } else kvm_disable_tdp(); - update_ple_window_actual_max(); - /* * Only enable PML when hardware supports PML feature, and both EPT * and EPT A/D bit features are enabled -- PML depends on them to work. @@ -6538,78 +5093,50 @@ static __init int hardware_setup(void) kvm_x86_ops->enable_log_dirty_pt_masked = NULL; } - if (cpu_has_vmx_preemption_timer() && enable_preemption_timer) { - u64 vmx_msr; - - rdmsrl(MSR_IA32_VMX_MISC, vmx_msr); - cpu_preemption_timer_multi = - vmx_msr & VMX_MISC_PREEMPTION_TIMER_RATE_MASK; - } else { - kvm_x86_ops->set_hv_timer = NULL; - kvm_x86_ops->cancel_hv_timer = NULL; - } - - kvm_set_posted_intr_wakeup_handler(wakeup_handler); - - kvm_mce_cap_supported |= MCG_LMCE_P; + //kvm_set_posted_intr_wakeup_handler(wakeup_handler); return alloc_kvm_area(); out9: - free_page((unsigned long)vmx_vmwrite_bitmap); + free_page((size_t)vmx_vmwrite_bitmap); out8: - free_page((unsigned long)vmx_vmread_bitmap); + free_page((size_t)vmx_vmread_bitmap); out7: - free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic_apicv_inactive); + free_page((size_t)vmx_msr_bitmap_longmode_x2apic_apicv_inactive); out6: - free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic); + free_page((size_t)vmx_msr_bitmap_longmode_x2apic); out5: - free_page((unsigned long)vmx_msr_bitmap_longmode); + free_page((size_t)vmx_msr_bitmap_longmode); out4: - free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic_apicv_inactive); + free_page((size_t)vmx_msr_bitmap_legacy_x2apic_apicv_inactive); out3: - free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic); + free_page((size_t)vmx_msr_bitmap_legacy_x2apic); out2: - free_page((unsigned long)vmx_msr_bitmap_legacy); + free_page((size_t)vmx_msr_bitmap_legacy); out1: - free_page((unsigned long)vmx_io_bitmap_b); + free_page((size_t)vmx_io_bitmap_b); out: - free_page((unsigned long)vmx_io_bitmap_a); + free_page((size_t)vmx_io_bitmap_a); return r; } -static __exit void hardware_unsetup(void) +static void hardware_unsetup(void) { - free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic); - free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic_apicv_inactive); - free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic); - free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic_apicv_inactive); - free_page((unsigned long)vmx_msr_bitmap_legacy); - free_page((unsigned long)vmx_msr_bitmap_longmode); - free_page((unsigned long)vmx_io_bitmap_b); - free_page((unsigned long)vmx_io_bitmap_a); - free_page((unsigned long)vmx_vmwrite_bitmap); - free_page((unsigned long)vmx_vmread_bitmap); + free_page((size_t)vmx_msr_bitmap_legacy_x2apic); + 
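/*
 * [Editor's sketch, not part of the original patch] The wholesale
 * "unsigned long" -> "size_t" conversion seen in these free_page() casts
 * (and throughout the port) only matters on an LLP64 target such as
 * Windows x64, where unsigned long is 32 bits while size_t matches the
 * pointer width. On Linux/LP64 both are 64 bits, so the upstream casts were
 * safe; on LLP64 they would truncate a kernel virtual address. A minimal,
 * self-contained illustration of that assumption:
 */
#include <stddef.h>
#include <stdint.h>

_Static_assert(sizeof(size_t) == sizeof(void *),
               "size_t is pointer-sized on both LP64 and LLP64");
#ifdef _WIN64
_Static_assert(sizeof(unsigned long) == 4,
               "LLP64: unsigned long stays 32 bits, hence the size_t casts");
#endif

static inline size_t page_cookie(void *page)
{
        /* Mirrors the free_page((size_t)ptr) pattern used above. */
        return (size_t)(uintptr_t)page;
}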
free_page((size_t)vmx_msr_bitmap_legacy_x2apic_apicv_inactive); + free_page((size_t)vmx_msr_bitmap_longmode_x2apic); + free_page((size_t)vmx_msr_bitmap_longmode_x2apic_apicv_inactive); + free_page((size_t)vmx_msr_bitmap_legacy); + free_page((size_t)vmx_msr_bitmap_longmode); + free_page((size_t)vmx_io_bitmap_b); + free_page((size_t)vmx_io_bitmap_a); + free_page((size_t)vmx_vmwrite_bitmap); + free_page((size_t)vmx_vmread_bitmap); free_kvm_area(); } -/* - * Indicate a busy-waiting vcpu in spinlock. We do not enable the PAUSE - * exiting, so only get here on cpu with PAUSE-Loop-Exiting. - */ -static int handle_pause(struct kvm_vcpu *vcpu) -{ - if (ple_gap) - grow_ple_window(vcpu); - - skip_emulated_instruction(vcpu); - kvm_vcpu_on_spin(vcpu); - - return 1; -} - static int handle_nop(struct kvm_vcpu *vcpu) { skip_emulated_instruction(vcpu); @@ -6650,11 +5177,13 @@ static int handle_monitor(struct kvm_vcpu *vcpu) static struct loaded_vmcs *nested_get_current_vmcs02(struct vcpu_vmx *vmx) { struct vmcs02_list *item; +#define LIST_ENTRY_TYPE_INFO struct vmcs02_list list_for_each_entry(item, &vmx->nested.vmcs02_pool, list) if (item->vmptr == vmx->nested.current_vmptr) { list_move(&item->list, &vmx->nested.vmcs02_pool); return &item->vmcs02; } +#undef LIST_ENTRY_TYPE_INFO if (vmx->nested.vmcs02_num >= max(VMCS02_POOL_SIZE, 1)) { /* Recycle the least recently used VMCS. */ @@ -6686,6 +5215,7 @@ static struct loaded_vmcs *nested_get_current_vmcs02(struct vcpu_vmx *vmx) static void nested_free_vmcs02(struct vcpu_vmx *vmx, gpa_t vmptr) { struct vmcs02_list *item; +#define LIST_ENTRY_TYPE_INFO struct vmcs02_list list_for_each_entry(item, &vmx->nested.vmcs02_pool, list) if (item->vmptr == vmptr) { free_loaded_vmcs(&item->vmcs02); @@ -6694,6 +5224,7 @@ static void nested_free_vmcs02(struct vcpu_vmx *vmx, gpa_t vmptr) vmx->nested.vmcs02_num--; return; } +#undef LIST_ENTRY_TYPE_INFO } /* @@ -6706,6 +5237,7 @@ static void nested_free_all_saved_vmcss(struct vcpu_vmx *vmx) struct vmcs02_list *item, *n; WARN_ON(vmx->loaded_vmcs != &vmx->vmcs01); +#define LIST_ENTRY_TYPE_INFO struct vmcs02_list list_for_each_entry_safe(item, n, &vmx->nested.vmcs02_pool, list) { /* * Something will leak if the above WARN triggers. Better than @@ -6719,6 +5251,7 @@ static void nested_free_all_saved_vmcss(struct vcpu_vmx *vmx) kfree(item); vmx->nested.vmcs02_num--; } +#undef LIST_ENTRY_TYPE_INFO } /* @@ -6766,22 +5299,10 @@ static void nested_vmx_failValid(struct kvm_vcpu *vcpu, static void nested_vmx_abort(struct kvm_vcpu *vcpu, u32 indicator) { /* TODO: not to reset guest simply here. */ - kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); + kvm_make_request(GVM_REQ_TRIPLE_FAULT, vcpu); pr_debug_ratelimited("kvm: nested vmx abort, indicator %d\n", indicator); } -static enum hrtimer_restart vmx_preemption_timer_fn(struct hrtimer *timer) -{ - struct vcpu_vmx *vmx = - container_of(timer, struct vcpu_vmx, nested.preemption_timer); - - vmx->nested.preemption_timer_expired = true; - kvm_make_request(KVM_REQ_EVENT, &vmx->vcpu); - kvm_vcpu_kick(&vmx->vcpu); - - return HRTIMER_NORESTART; -} - /* * Decode the memory-address operand of a vmx instruction, as recorded on an * exit caused by such an instruction (run by a guest hypervisor). @@ -6789,7 +5310,7 @@ static enum hrtimer_restart vmx_preemption_timer_fn(struct hrtimer *timer) * #UD or #GP. 
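/*
 * [Editor's sketch, not part of the original patch] For readers following
 * get_vmx_mem_address() below: the VMX-instruction-information field packs
 * the memory operand encoding into bitfields. The layout assumed here
 * follows the upstream decoder and the Intel SDM and is only illustrative.
 */
struct vmx_insn_info_decoded {
        int scaling;        /* bits 1:0   - scale factor (1/2/4/8)          */
        int addr_size;      /* bits 9:7   - 0=16-bit, 1=32-bit, 2=64-bit    */
        int is_reg;         /* bit 10     - operand is a register, not mem  */
        int seg_reg;        /* bits 17:15 - segment register                */
        int index_reg;      /* bits 21:18 - valid only if bit 22 is clear   */
        int index_valid;
        int base_reg;       /* bits 26:23 - valid only if bit 27 is clear   */
        int base_valid;
};

static inline struct vmx_insn_info_decoded decode_vmx_insn_info(u32 info)
{
        struct vmx_insn_info_decoded d;

        d.scaling     = info & 3;
        d.addr_size   = (info >> 7) & 7;
        d.is_reg      = (info >> 10) & 1;
        d.seg_reg     = (info >> 15) & 7;
        d.index_reg   = (info >> 18) & 0xf;
        d.index_valid = !((info >> 22) & 1);
        d.base_reg    = (info >> 23) & 0xf;
        d.base_valid  = !((info >> 27) & 1);
        return d;
}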
*/ static int get_vmx_mem_address(struct kvm_vcpu *vcpu, - unsigned long exit_qualification, + size_t exit_qualification, u32 vmx_instruction_info, bool wr, gva_t *ret) { gva_t off; @@ -6892,12 +5413,12 @@ static int nested_vmx_check_vmptr(struct kvm_vcpu *vcpu, int exit_reason, gva_t gva; gpa_t vmptr; struct x86_exception e; - struct page *page; + PMDL kmap_mdl; struct vcpu_vmx *vmx = to_vmx(vcpu); int maxphyaddr = cpuid_maxphyaddr(vcpu); - if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION), - vmcs_read32(VMX_INSTRUCTION_INFO), false, &gva)) + if (get_vmx_mem_address(vcpu, vmcs_readl(vcpu, EXIT_QUALIFICATION), + vmcs_read32(vcpu, VMX_INSTRUCTION_INFO), false, &gva)) return 1; if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &vmptr, @@ -6924,15 +5445,15 @@ static int nested_vmx_check_vmptr(struct kvm_vcpu *vcpu, int exit_reason, return 1; } - page = nested_get_page(vcpu, vmptr); - if (page == NULL || - *(u32 *)kmap(page) != VMCS12_REVISION) { + kmap_mdl = nested_get_page(vcpu, vmptr); + if (kmap_mdl == NULL || + *(u32 *)kmap(kmap_mdl) != VMCS12_REVISION) { nested_vmx_failInvalid(vcpu); - kunmap(page); + kunmap(kmap_mdl); skip_emulated_instruction(vcpu); return 1; } - kunmap(page); + kunmap(kmap_mdl); vmx->nested.vmxon_ptr = vmptr; break; case EXIT_REASON_VMCLEAR: @@ -7030,7 +5551,7 @@ static int handle_vmon(struct kvm_vcpu *vcpu) if (cpu_has_vmx_msr_bitmap()) { vmx->nested.msr_bitmap = - (unsigned long *)__get_free_page(GFP_KERNEL); + (size_t *)__get_free_page(GFP_KERNEL); if (!vmx->nested.msr_bitmap) goto out_msr_bitmap; } @@ -7053,10 +5574,6 @@ static int handle_vmon(struct kvm_vcpu *vcpu) INIT_LIST_HEAD(&(vmx->nested.vmcs02_pool)); vmx->nested.vmcs02_num = 0; - hrtimer_init(&vmx->nested.preemption_timer, CLOCK_MONOTONIC, - HRTIMER_MODE_REL_PINNED); - vmx->nested.preemption_timer.function = vmx_preemption_timer_fn; - vmx->nested.vmxon = true; skip_emulated_instruction(vcpu); @@ -7067,7 +5584,7 @@ out_shadow_vmcs: kfree(vmx->nested.cached_vmcs12); out_cached_vmcs12: - free_page((unsigned long)vmx->nested.msr_bitmap); + free_page((size_t)vmx->nested.msr_bitmap); out_msr_bitmap: return -ENOMEM; @@ -7105,6 +5622,8 @@ static int nested_vmx_check_permission(struct kvm_vcpu *vcpu) static inline void nested_release_vmcs12(struct vcpu_vmx *vmx) { + struct kvm_vcpu* vcpu = &vmx->vcpu; + if (vmx->nested.current_vmptr == -1ull) return; @@ -7117,18 +5636,17 @@ static inline void nested_release_vmcs12(struct vcpu_vmx *vmx) they were modified */ copy_shadow_to_vmcs12(vmx); vmx->nested.sync_shadow_vmcs = false; - vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL, + vmcs_clear_bits(vcpu, SECONDARY_VM_EXEC_CONTROL, SECONDARY_EXEC_SHADOW_VMCS); - vmcs_write64(VMCS_LINK_POINTER, -1ull); + vmcs_write64(vcpu, VMCS_LINK_POINTER, -1ull); } - vmx->nested.posted_intr_nv = -1; /* Flush VMCS12 to guest memory */ memcpy(vmx->nested.current_vmcs12, vmx->nested.cached_vmcs12, VMCS12_SIZE); - kunmap(vmx->nested.current_vmcs12_page); - nested_release_page(vmx->nested.current_vmcs12_page); + kunmap(vmx->nested.current_vmcs12_mdl); + nested_release_page(vmx->nested.current_vmcs12_mdl); vmx->nested.current_vmptr = -1ull; vmx->nested.current_vmcs12 = NULL; } @@ -7146,7 +5664,7 @@ static void free_nested(struct vcpu_vmx *vmx) free_vpid(vmx->nested.vpid02); nested_release_vmcs12(vmx); if (vmx->nested.msr_bitmap) { - free_page((unsigned long)vmx->nested.msr_bitmap); + free_page((size_t)vmx->nested.msr_bitmap); vmx->nested.msr_bitmap = NULL; } if (enable_shadow_vmcs) { @@ -7156,19 +5674,13 @@ static void 
free_nested(struct vcpu_vmx *vmx) } kfree(vmx->nested.cached_vmcs12); /* Unpin physical memory we referred to in current vmcs02 */ - if (vmx->nested.apic_access_page) { - nested_release_page(vmx->nested.apic_access_page); - vmx->nested.apic_access_page = NULL; - } - if (vmx->nested.virtual_apic_page) { - nested_release_page(vmx->nested.virtual_apic_page); - vmx->nested.virtual_apic_page = NULL; + if (vmx->nested.apic_access_mdl) { + nested_release_page(vmx->nested.apic_access_mdl); + vmx->nested.apic_access_mdl = NULL; } - if (vmx->nested.pi_desc_page) { - kunmap(vmx->nested.pi_desc_page); - nested_release_page(vmx->nested.pi_desc_page); - vmx->nested.pi_desc_page = NULL; - vmx->nested.pi_desc = NULL; + if (vmx->nested.virtual_apic_mdl) { + nested_release_page(vmx->nested.virtual_apic_mdl); + vmx->nested.virtual_apic_mdl = NULL; } nested_free_all_saved_vmcss(vmx); @@ -7191,7 +5703,7 @@ static int handle_vmclear(struct kvm_vcpu *vcpu) struct vcpu_vmx *vmx = to_vmx(vcpu); gpa_t vmptr; struct vmcs12 *vmcs12; - struct page *page; + PMDL kmap_mdl; if (!nested_vmx_check_permission(vcpu)) return 1; @@ -7202,8 +5714,8 @@ static int handle_vmclear(struct kvm_vcpu *vcpu) if (vmptr == vmx->nested.current_vmptr) nested_release_vmcs12(vmx); - page = nested_get_page(vcpu, vmptr); - if (page == NULL) { + kmap_mdl = nested_get_page(vcpu, vmptr); + if (kmap_mdl == NULL) { /* * For accurate processor emulation, VMCLEAR beyond available * physical memory should do nothing at all. However, it is @@ -7211,13 +5723,13 @@ static int handle_vmclear(struct kvm_vcpu *vcpu) * resulted in this case, so let's shut down before doing any * more damage: */ - kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); + kvm_make_request(GVM_REQ_TRIPLE_FAULT, vcpu); return 1; } - vmcs12 = kmap(page); + vmcs12 = kmap(kmap_mdl); vmcs12->launch_state = 0; - kunmap(page); - nested_release_page(page); + kunmap(kmap_mdl); + nested_release_page(kmap_mdl); nested_free_vmcs02(vmx, vmptr); @@ -7248,14 +5760,14 @@ enum vmcs_field_type { VMCS_FIELD_TYPE_NATURAL_WIDTH = 3 }; -static inline int vmcs_field_type(unsigned long field) +static inline int vmcs_field_type(size_t field) { if (0x1 & field) /* the *_HIGH fields are all 32 bit */ return VMCS_FIELD_TYPE_U32; return (field >> 13) & 0x3 ; } -static inline int vmcs_field_readonly(unsigned long field) +static inline int vmcs_field_readonly(size_t field) { return (((field >> 10) & 0x3) == 1); } @@ -7268,7 +5780,7 @@ static inline int vmcs_field_readonly(unsigned long field) * 64-bit fields are to be returned). 
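/*
 * [Editor's sketch, not part of the original patch] The field width used by
 * vmcs12_read_any()/vmcs12_write_any() below comes entirely from the VMCS
 * field encoding: bits 14:13 select the type and bit 0 marks the *_HIGH
 * alias of a 64-bit field (see vmcs_field_type() above). Two worked
 * examples: CPU_BASED_VM_EXEC_CONTROL (0x4002) gives (0x4002 >> 13) & 3 == 2,
 * a 32-bit field, while GUEST_CR0 (0x6800) gives 3, a natural-width field.
 * A small helper expressing the same rule in bytes:
 */
static inline int vmcs_field_width_bytes(size_t field)
{
        if (field & 1)          /* *_HIGH alias: upper half of a u64 field */
                return 4;
        switch ((field >> 13) & 0x3) {
        case VMCS_FIELD_TYPE_U16:
                return 2;
        case VMCS_FIELD_TYPE_U64:
                return 8;
        case VMCS_FIELD_TYPE_U32:
                return 4;
        default:                /* natural width: 64-bit in long mode */
                return 8;
        }
}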
*/ static inline int vmcs12_read_any(struct kvm_vcpu *vcpu, - unsigned long field, u64 *ret) + size_t field, u64 *ret) { short offset = vmcs_field_to_offset(field); char *p; @@ -7299,7 +5811,7 @@ static inline int vmcs12_read_any(struct kvm_vcpu *vcpu, static inline int vmcs12_write_any(struct kvm_vcpu *vcpu, - unsigned long field, u64 field_value){ + size_t field, u64 field_value){ short offset = vmcs_field_to_offset(field); char *p = ((char *) get_vmcs12(vcpu)) + offset; if (offset < 0) @@ -7328,11 +5840,12 @@ static inline int vmcs12_write_any(struct kvm_vcpu *vcpu, static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx) { int i; - unsigned long field; + size_t field; u64 field_value; struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs; - const unsigned long *fields = shadow_read_write_fields; + const size_t *fields = shadow_read_write_fields; const int num_fields = max_shadow_read_write_fields; + struct kvm_vcpu* vcpu = &vmx->vcpu; preempt_disable(); @@ -7342,16 +5855,16 @@ static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx) field = fields[i]; switch (vmcs_field_type(field)) { case VMCS_FIELD_TYPE_U16: - field_value = vmcs_read16(field); + field_value = vmcs_read16(vcpu, field); break; case VMCS_FIELD_TYPE_U32: - field_value = vmcs_read32(field); + field_value = vmcs_read32(vcpu, field); break; case VMCS_FIELD_TYPE_U64: - field_value = vmcs_read64(field); + field_value = vmcs_read64(vcpu, field); break; case VMCS_FIELD_TYPE_NATURAL_WIDTH: - field_value = vmcs_readl(field); + field_value = vmcs_readl(vcpu, field); break; default: WARN_ON(1); @@ -7368,7 +5881,7 @@ static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx) static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx) { - const unsigned long *fields[] = { + const size_t *fields[] = { shadow_read_write_fields, shadow_read_only_fields }; @@ -7377,9 +5890,10 @@ static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx) max_shadow_read_only_fields }; int i, q; - unsigned long field; + size_t field; u64 field_value = 0; struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs; + struct kvm_vcpu* vcpu = &vmx->vcpu; vmcs_load(shadow_vmcs); @@ -7390,16 +5904,16 @@ static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx) switch (vmcs_field_type(field)) { case VMCS_FIELD_TYPE_U16: - vmcs_write16(field, (u16)field_value); + vmcs_write16(vcpu, field, (u16)field_value); break; case VMCS_FIELD_TYPE_U32: - vmcs_write32(field, (u32)field_value); + vmcs_write32(vcpu, field, (u32)field_value); break; case VMCS_FIELD_TYPE_U64: - vmcs_write64(field, (u64)field_value); + vmcs_write64(vcpu, field, (u64)field_value); break; case VMCS_FIELD_TYPE_NATURAL_WIDTH: - vmcs_writel(field, (long)field_value); + vmcs_writel(vcpu, field, (long)field_value); break; default: WARN_ON(1); @@ -7429,10 +5943,10 @@ static int nested_vmx_check_vmcs12(struct kvm_vcpu *vcpu) static int handle_vmread(struct kvm_vcpu *vcpu) { - unsigned long field; + size_t field; u64 field_value; - unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); - u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); + size_t exit_qualification = vmcs_readl(vcpu, EXIT_QUALIFICATION); + u32 vmx_instruction_info = vmcs_read32(vcpu, VMX_INSTRUCTION_INFO); gva_t gva = 0; if (!nested_vmx_check_permission(vcpu) || @@ -7472,10 +5986,10 @@ static int handle_vmread(struct kvm_vcpu *vcpu) static int handle_vmwrite(struct kvm_vcpu *vcpu) { - unsigned long field; + size_t field; gva_t gva; - unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); - u32 vmx_instruction_info = 
vmcs_read32(VMX_INSTRUCTION_INFO); + size_t exit_qualification = vmcs_readl(vcpu, EXIT_QUALIFICATION); + u32 vmx_instruction_info = vmcs_read32(vcpu, VMX_INSTRUCTION_INFO); /* The value to write might be 32 or 64 bits, depending on L1's long * mode, and eventually we need to write that into a field of several * possible lengths. The code below first zero-extends the value to 64 @@ -7537,17 +6051,17 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu) if (vmx->nested.current_vmptr != vmptr) { struct vmcs12 *new_vmcs12; - struct page *page; - page = nested_get_page(vcpu, vmptr); - if (page == NULL) { + PMDL kmap_mdl; + kmap_mdl = nested_get_page(vcpu, vmptr); + if (kmap_mdl == NULL) { nested_vmx_failInvalid(vcpu); skip_emulated_instruction(vcpu); return 1; } - new_vmcs12 = kmap(page); + new_vmcs12 = kmap(kmap_mdl); if (new_vmcs12->revision_id != VMCS12_REVISION) { - kunmap(page); - nested_release_page_clean(page); + kunmap(kmap_mdl); + nested_release_page(kmap_mdl); nested_vmx_failValid(vcpu, VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID); skip_emulated_instruction(vcpu); @@ -7557,7 +6071,7 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu) nested_release_vmcs12(vmx); vmx->nested.current_vmptr = vmptr; vmx->nested.current_vmcs12 = new_vmcs12; - vmx->nested.current_vmcs12_page = page; + vmx->nested.current_vmcs12_mdl = kmap_mdl; /* * Load VMCS12 from guest memory since it is not already * cached. @@ -7566,9 +6080,9 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu) vmx->nested.current_vmcs12, VMCS12_SIZE); if (enable_shadow_vmcs) { - vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL, + vmcs_set_bits(vcpu, SECONDARY_VM_EXEC_CONTROL, SECONDARY_EXEC_SHADOW_VMCS); - vmcs_write64(VMCS_LINK_POINTER, + vmcs_write64(vcpu, VMCS_LINK_POINTER, __pa(vmx->vmcs01.shadow_vmcs)); vmx->nested.sync_shadow_vmcs = true; } @@ -7582,8 +6096,8 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu) /* Emulate the VMPTRST instruction */ static int handle_vmptrst(struct kvm_vcpu *vcpu) { - unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); - u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); + size_t exit_qualification = vmcs_readl(vcpu, EXIT_QUALIFICATION); + u32 vmx_instruction_info = vmcs_read32(vcpu, VMX_INSTRUCTION_INFO); gva_t vmcs_gva; struct x86_exception e; @@ -7610,7 +6124,7 @@ static int handle_invept(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); u32 vmx_instruction_info, types; - unsigned long type; + size_t type; gva_t gva; struct x86_exception e; struct { @@ -7632,7 +6146,7 @@ static int handle_invept(struct kvm_vcpu *vcpu) return 1; } - vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); + vmx_instruction_info = vmcs_read32(vcpu, VMX_INSTRUCTION_INFO); type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf); types = (vmx->nested.nested_vmx_ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6; @@ -7647,7 +6161,7 @@ static int handle_invept(struct kvm_vcpu *vcpu) /* According to the Intel VMX instruction reference, the memory * operand is read even if it isn't needed (e.g., for type==global) */ - if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION), + if (get_vmx_mem_address(vcpu, vmcs_readl(vcpu, EXIT_QUALIFICATION), vmx_instruction_info, false, &gva)) return 1; if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &operand, @@ -7664,7 +6178,7 @@ static int handle_invept(struct kvm_vcpu *vcpu) */ case VMX_EPT_EXTENT_CONTEXT: kvm_mmu_sync_roots(vcpu); - kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); + kvm_make_request(GVM_REQ_TLB_FLUSH, vcpu); nested_vmx_succeed(vcpu); 
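/*
 * [Editor's sketch, not part of the original patch] nested_vmx_succeed(),
 * nested_vmx_failInvalid() and nested_vmx_failValid() used by these
 * handlers implement the SDM convention for reporting VMX instruction
 * results in RFLAGS: success clears CF/PF/AF/ZF/SF/OF, "fail invalid" sets
 * only CF, and "fail valid" sets only ZF and stores an error number in the
 * VM-instruction error field. Roughly:
 */
#define VMX_RESULT_FLAGS \
        (X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF | \
         X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_OF)

static inline size_t vmx_apply_result_flags(size_t rflags, bool failed,
                                            bool have_error_number)
{
        rflags &= ~(size_t)VMX_RESULT_FLAGS;
        if (failed)
                rflags |= have_error_number ? X86_EFLAGS_ZF : X86_EFLAGS_CF;
        return rflags;
}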
break; default: @@ -7680,7 +6194,7 @@ static int handle_invvpid(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); u32 vmx_instruction_info; - unsigned long type, types; + size_t type, types; gva_t gva; struct x86_exception e; int vpid; @@ -7695,7 +6209,7 @@ static int handle_invvpid(struct kvm_vcpu *vcpu) if (!nested_vmx_check_permission(vcpu)) return 1; - vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); + vmx_instruction_info = vmcs_read32(vcpu, VMX_INSTRUCTION_INFO); type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf); types = (vmx->nested.nested_vmx_vpid_caps >> 8) & 0x7; @@ -7710,7 +6224,7 @@ static int handle_invvpid(struct kvm_vcpu *vcpu) /* according to the intel vmx instruction reference, the memory * operand is read even if it isn't needed (e.g., for type==global) */ - if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION), + if (get_vmx_mem_address(vcpu, vmcs_readl(vcpu, EXIT_QUALIFICATION), vmx_instruction_info, false, &gva)) return 1; if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &vpid, @@ -7722,7 +6236,7 @@ static int handle_invvpid(struct kvm_vcpu *vcpu) switch (type) { case VMX_VPID_EXTENT_SINGLE_CONTEXT: /* - * Old versions of KVM use the single-context version so we + * Old versions of kvm use the single-context version so we * have to support it; just treat it the same as all-context. */ case VMX_VPID_EXTENT_ALL_CONTEXT: @@ -7741,11 +6255,9 @@ static int handle_invvpid(struct kvm_vcpu *vcpu) static int handle_pml_full(struct kvm_vcpu *vcpu) { - unsigned long exit_qualification; - - trace_kvm_pml_full(vcpu->vcpu_id); + size_t exit_qualification; - exit_qualification = vmcs_readl(EXIT_QUALIFICATION); + exit_qualification = vmcs_readl(vcpu, EXIT_QUALIFICATION); /* * PML buffer FULL happened while executing iret from NMI, @@ -7754,7 +6266,7 @@ static int handle_pml_full(struct kvm_vcpu *vcpu) if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) && cpu_has_virtual_nmis() && (exit_qualification & INTR_INFO_UNBLOCK_NMI)) - vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, + vmcs_set_bits(vcpu, GUEST_INTERRUPTIBILITY_INFO, GUEST_INTR_STATE_NMI); /* @@ -7764,12 +6276,6 @@ static int handle_pml_full(struct kvm_vcpu *vcpu) return 1; } -static int handle_preemption_timer(struct kvm_vcpu *vcpu) -{ - kvm_lapic_expired_hv_timer(vcpu); - return 1; -} - /* * The exit handlers return 1 if the exit was handled fully and guest execution * may resume. 
Otherwise they set the kvm_run parameter to indicate what needs @@ -7791,7 +6297,6 @@ static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { [EXIT_REASON_INVD] = handle_invd, [EXIT_REASON_INVLPG] = handle_invlpg, [EXIT_REASON_RDPMC] = handle_rdpmc, - [EXIT_REASON_VMCALL] = handle_vmcall, [EXIT_REASON_VMCLEAR] = handle_vmclear, [EXIT_REASON_VMLAUNCH] = handle_vmlaunch, [EXIT_REASON_VMPTRLD] = handle_vmptrld, @@ -7811,7 +6316,6 @@ static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { [EXIT_REASON_MCE_DURING_VMENTRY] = handle_machine_check, [EXIT_REASON_EPT_VIOLATION] = handle_ept_violation, [EXIT_REASON_EPT_MISCONFIG] = handle_ept_misconfig, - [EXIT_REASON_PAUSE_INSTRUCTION] = handle_pause, [EXIT_REASON_MWAIT_INSTRUCTION] = handle_mwait, [EXIT_REASON_MONITOR_TRAP_FLAG] = handle_monitor_trap, [EXIT_REASON_MONITOR_INSTRUCTION] = handle_monitor, @@ -7820,7 +6324,6 @@ static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { [EXIT_REASON_XSAVES] = handle_xsaves, [EXIT_REASON_XRSTORS] = handle_xrstors, [EXIT_REASON_PML_FULL] = handle_pml_full, - [EXIT_REASON_PREEMPTION_TIMER] = handle_preemption_timer, }; static const int kvm_vmx_max_exit_handlers = @@ -7829,7 +6332,7 @@ static const int kvm_vmx_max_exit_handlers = static bool nested_vmx_exit_handled_io(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { - unsigned long exit_qualification; + size_t exit_qualification; gpa_t bitmap, last_bitmap; unsigned int port; int size; @@ -7838,7 +6341,7 @@ static bool nested_vmx_exit_handled_io(struct kvm_vcpu *vcpu, if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS)) return nested_cpu_has(vmcs12, CPU_BASED_UNCOND_IO_EXITING); - exit_qualification = vmcs_readl(EXIT_QUALIFICATION); + exit_qualification = vmcs_readl(vcpu, EXIT_QUALIFICATION); port = exit_qualification >> 16; size = (exit_qualification & 7) + 1; @@ -7915,10 +6418,10 @@ static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu, static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { - unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); + size_t exit_qualification = vmcs_readl(vcpu, EXIT_QUALIFICATION); int cr = exit_qualification & 15; int reg = (exit_qualification >> 8) & 15; - unsigned long val = kvm_register_readl(vcpu, reg); + size_t val = kvm_register_readl(vcpu, reg); switch ((exit_qualification >> 4) & 3) { case 0: /* mov to cr */ @@ -7995,24 +6498,17 @@ static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu, */ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu) { - u32 intr_info = vmcs_read32(VM_EXIT_INTR_INFO); + u32 intr_info = vmcs_read32(vcpu, VM_EXIT_INTR_INFO); struct vcpu_vmx *vmx = to_vmx(vcpu); struct vmcs12 *vmcs12 = get_vmcs12(vcpu); u32 exit_reason = vmx->exit_reason; - trace_kvm_nested_vmexit(kvm_rip_read(vcpu), exit_reason, - vmcs_readl(EXIT_QUALIFICATION), - vmx->idt_vectoring_info, - intr_info, - vmcs_read32(VM_EXIT_INTR_ERROR_CODE), - KVM_ISA_VMX); - if (vmx->nested.nested_run_pending) return false; if (unlikely(vmx->fail)) { pr_info_ratelimited("%s failed vm entry %x\n", __func__, - vmcs_read32(VM_INSTRUCTION_ERROR)); + vmcs_read32(vcpu, VM_INSTRUCTION_ERROR)); return true; } @@ -8022,15 +6518,12 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu) return false; else if (is_page_fault(intr_info)) return enable_ept; - else if (is_no_device(intr_info) && - !(vmcs12->guest_cr0 & X86_CR0_TS)) - return false; else if (is_debug(intr_info) && vcpu->guest_debug & - (KVM_GUESTDBG_SINGLESTEP | 
KVM_GUESTDBG_USE_HW_BP)) + (GVM_GUESTDBG_SINGLESTEP | GVM_GUESTDBG_USE_HW_BP)) return false; else if (is_breakpoint(intr_info) && - vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) + vcpu->guest_debug & GVM_GUESTDBG_USE_SW_BP) return false; return vmcs12->exception_bitmap & (1u << (intr_info & INTR_INFO_VECTOR_MASK)); @@ -8129,8 +6622,6 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu) * the XSS exit bitmap in vmcs12. */ return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES); - case EXIT_REASON_PREEMPTION_TIMER: - return false; default: return true; } @@ -8138,8 +6629,8 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu) static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2) { - *info1 = vmcs_readl(EXIT_QUALIFICATION); - *info2 = vmcs_read32(VM_EXIT_INTR_INFO); + *info1 = vmcs_readl(vcpu, EXIT_QUALIFICATION); + *info2 = vmcs_read32(vcpu, VM_EXIT_INTR_INFO); } static void vmx_destroy_pml_buffer(struct vcpu_vmx *vmx) @@ -8156,7 +6647,7 @@ static void vmx_flush_pml_buffer(struct kvm_vcpu *vcpu) u64 *pml_buf; u16 pml_idx; - pml_idx = vmcs_read16(GUEST_PML_INDEX); + pml_idx = vmcs_read16(vcpu, GUEST_PML_INDEX); /* Do nothing if PML buffer is empty */ if (pml_idx == (PML_ENTITY_NUM - 1)) @@ -8178,7 +6669,7 @@ static void vmx_flush_pml_buffer(struct kvm_vcpu *vcpu) } /* reset PML index */ - vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1); + vmcs_write16(vcpu, GUEST_PML_INDEX, PML_ENTITY_NUM - 1); } /* @@ -8199,160 +6690,156 @@ static void kvm_flush_pml_buffers(struct kvm *kvm) kvm_vcpu_kick(vcpu); } -static void vmx_dump_sel(char *name, uint32_t sel) +static void vmx_dump_sel(struct kvm_vcpu* vcpu, char *name, uint32_t sel) { - pr_err("%s sel=0x%04x, attr=0x%05x, limit=0x%08x, base=0x%016lx\n", - name, vmcs_read32(sel), - vmcs_read32(sel + GUEST_ES_AR_BYTES - GUEST_ES_SELECTOR), - vmcs_read32(sel + GUEST_ES_LIMIT - GUEST_ES_SELECTOR), - vmcs_readl(sel + GUEST_ES_BASE - GUEST_ES_SELECTOR)); + pr_err("%s sel=0x%04x, attr=0x%05x, limit=0x%08x, base=0x%016llx\n", + name, vmcs_read32(vcpu, sel), + vmcs_read32(vcpu, sel + GUEST_ES_AR_BYTES - GUEST_ES_SELECTOR), + vmcs_read32(vcpu, sel + GUEST_ES_LIMIT - GUEST_ES_SELECTOR), + vmcs_readl(vcpu, sel + GUEST_ES_BASE - GUEST_ES_SELECTOR)); } -static void vmx_dump_dtsel(char *name, uint32_t limit) +static void vmx_dump_dtsel(struct kvm_vcpu* vcpu, char *name, uint32_t limit) { - pr_err("%s limit=0x%08x, base=0x%016lx\n", - name, vmcs_read32(limit), - vmcs_readl(limit + GUEST_GDTR_BASE - GUEST_GDTR_LIMIT)); + pr_err("%s limit=0x%08x, base=0x%016llx\n", + name, vmcs_read32(vcpu, limit), + vmcs_readl(vcpu, limit + GUEST_GDTR_BASE - GUEST_GDTR_LIMIT)); } -static void dump_vmcs(void) +static void dump_vmcs(struct kvm_vcpu* vcpu) { - u32 vmentry_ctl = vmcs_read32(VM_ENTRY_CONTROLS); - u32 vmexit_ctl = vmcs_read32(VM_EXIT_CONTROLS); - u32 cpu_based_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); - u32 pin_based_exec_ctrl = vmcs_read32(PIN_BASED_VM_EXEC_CONTROL); + u32 vmentry_ctl = vmcs_read32(vcpu, VM_ENTRY_CONTROLS); + u32 vmexit_ctl = vmcs_read32(vcpu, VM_EXIT_CONTROLS); + u32 cpu_based_exec_ctrl = vmcs_read32(vcpu, CPU_BASED_VM_EXEC_CONTROL); + u32 pin_based_exec_ctrl = vmcs_read32(vcpu, PIN_BASED_VM_EXEC_CONTROL); u32 secondary_exec_control = 0; - unsigned long cr4 = vmcs_readl(GUEST_CR4); - u64 efer = vmcs_read64(GUEST_IA32_EFER); + size_t cr4 = vmcs_readl(vcpu, GUEST_CR4); + u64 efer = vmcs_read64(vcpu, GUEST_IA32_EFER); int i, n; if (cpu_has_secondary_exec_ctrls()) - secondary_exec_control = 
vmcs_read32(SECONDARY_VM_EXEC_CONTROL); + secondary_exec_control = vmcs_read32(vcpu, SECONDARY_VM_EXEC_CONTROL); pr_err("*** Guest State ***\n"); - pr_err("CR0: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n", - vmcs_readl(GUEST_CR0), vmcs_readl(CR0_READ_SHADOW), - vmcs_readl(CR0_GUEST_HOST_MASK)); - pr_err("CR4: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n", - cr4, vmcs_readl(CR4_READ_SHADOW), vmcs_readl(CR4_GUEST_HOST_MASK)); - pr_err("CR3 = 0x%016lx\n", vmcs_readl(GUEST_CR3)); + pr_err("CR0: actual=0x%016llx, shadow=0x%016llx, gh_mask=%016llx\n", + vmcs_readl(vcpu, GUEST_CR0), vmcs_readl(vcpu, CR0_READ_SHADOW), + vmcs_readl(vcpu, CR0_GUEST_HOST_MASK)); + pr_err("CR4: actual=0x%016llx, shadow=0x%016llx, gh_mask=%016llx\n", + cr4, vmcs_readl(vcpu, CR4_READ_SHADOW), vmcs_readl(vcpu, CR4_GUEST_HOST_MASK)); + pr_err("CR3 = 0x%016llx\n", vmcs_readl(vcpu, GUEST_CR3)); if ((secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT) && (cr4 & X86_CR4_PAE) && !(efer & EFER_LMA)) { pr_err("PDPTR0 = 0x%016llx PDPTR1 = 0x%016llx\n", - vmcs_read64(GUEST_PDPTR0), vmcs_read64(GUEST_PDPTR1)); + vmcs_read64(vcpu, GUEST_PDPTR0), vmcs_read64(vcpu, GUEST_PDPTR1)); pr_err("PDPTR2 = 0x%016llx PDPTR3 = 0x%016llx\n", - vmcs_read64(GUEST_PDPTR2), vmcs_read64(GUEST_PDPTR3)); - } - pr_err("RSP = 0x%016lx RIP = 0x%016lx\n", - vmcs_readl(GUEST_RSP), vmcs_readl(GUEST_RIP)); - pr_err("RFLAGS=0x%08lx DR7 = 0x%016lx\n", - vmcs_readl(GUEST_RFLAGS), vmcs_readl(GUEST_DR7)); - pr_err("Sysenter RSP=%016lx CS:RIP=%04x:%016lx\n", - vmcs_readl(GUEST_SYSENTER_ESP), - vmcs_read32(GUEST_SYSENTER_CS), vmcs_readl(GUEST_SYSENTER_EIP)); - vmx_dump_sel("CS: ", GUEST_CS_SELECTOR); - vmx_dump_sel("DS: ", GUEST_DS_SELECTOR); - vmx_dump_sel("SS: ", GUEST_SS_SELECTOR); - vmx_dump_sel("ES: ", GUEST_ES_SELECTOR); - vmx_dump_sel("FS: ", GUEST_FS_SELECTOR); - vmx_dump_sel("GS: ", GUEST_GS_SELECTOR); - vmx_dump_dtsel("GDTR:", GUEST_GDTR_LIMIT); - vmx_dump_sel("LDTR:", GUEST_LDTR_SELECTOR); - vmx_dump_dtsel("IDTR:", GUEST_IDTR_LIMIT); - vmx_dump_sel("TR: ", GUEST_TR_SELECTOR); + vmcs_read64(vcpu, GUEST_PDPTR2), vmcs_read64(vcpu, GUEST_PDPTR3)); + } + pr_err("RSP = 0x%016llx RIP = 0x%016llx\n", + vmcs_readl(vcpu, GUEST_RSP), vmcs_readl(vcpu, GUEST_RIP)); + pr_err("RFLAGS=0x%08lx DR7 = 0x%016llx\n", + vmcs_readl(vcpu, GUEST_RFLAGS), vmcs_readl(vcpu, GUEST_DR7)); + pr_err("Sysenter RSP=%016llx CS:RIP=%04x:%016llx\n", + vmcs_readl(vcpu, GUEST_SYSENTER_ESP), + vmcs_read32(vcpu, GUEST_SYSENTER_CS), vmcs_readl(vcpu, GUEST_SYSENTER_EIP)); + vmx_dump_sel(vcpu, "CS: ", GUEST_CS_SELECTOR); + vmx_dump_sel(vcpu, "DS: ", GUEST_DS_SELECTOR); + vmx_dump_sel(vcpu, "SS: ", GUEST_SS_SELECTOR); + vmx_dump_sel(vcpu, "ES: ", GUEST_ES_SELECTOR); + vmx_dump_sel(vcpu, "FS: ", GUEST_FS_SELECTOR); + vmx_dump_sel(vcpu, "GS: ", GUEST_GS_SELECTOR); + vmx_dump_dtsel(vcpu, "GDTR:", GUEST_GDTR_LIMIT); + vmx_dump_sel(vcpu, "LDTR:", GUEST_LDTR_SELECTOR); + vmx_dump_dtsel(vcpu, "IDTR:", GUEST_IDTR_LIMIT); + vmx_dump_sel(vcpu, "TR: ", GUEST_TR_SELECTOR); if ((vmexit_ctl & (VM_EXIT_SAVE_IA32_PAT | VM_EXIT_SAVE_IA32_EFER)) || (vmentry_ctl & (VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_LOAD_IA32_EFER))) pr_err("EFER = 0x%016llx PAT = 0x%016llx\n", - efer, vmcs_read64(GUEST_IA32_PAT)); - pr_err("DebugCtl = 0x%016llx DebugExceptions = 0x%016lx\n", - vmcs_read64(GUEST_IA32_DEBUGCTL), - vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS)); + efer, vmcs_read64(vcpu, GUEST_IA32_PAT)); + pr_err("DebugCtl = 0x%016llx DebugExceptions = 0x%016llx\n", + vmcs_read64(vcpu, GUEST_IA32_DEBUGCTL), + vmcs_readl(vcpu, 
GUEST_PENDING_DBG_EXCEPTIONS)); if (vmentry_ctl & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) pr_err("PerfGlobCtl = 0x%016llx\n", - vmcs_read64(GUEST_IA32_PERF_GLOBAL_CTRL)); + vmcs_read64(vcpu, GUEST_IA32_PERF_GLOBAL_CTRL)); if (vmentry_ctl & VM_ENTRY_LOAD_BNDCFGS) - pr_err("BndCfgS = 0x%016llx\n", vmcs_read64(GUEST_BNDCFGS)); + pr_err("BndCfgS = 0x%016llx\n", vmcs_read64(vcpu, GUEST_BNDCFGS)); pr_err("Interruptibility = %08x ActivityState = %08x\n", - vmcs_read32(GUEST_INTERRUPTIBILITY_INFO), - vmcs_read32(GUEST_ACTIVITY_STATE)); + vmcs_read32(vcpu, GUEST_INTERRUPTIBILITY_INFO), + vmcs_read32(vcpu, GUEST_ACTIVITY_STATE)); if (secondary_exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) pr_err("InterruptStatus = %04x\n", - vmcs_read16(GUEST_INTR_STATUS)); + vmcs_read16(vcpu, GUEST_INTR_STATUS)); pr_err("*** Host State ***\n"); - pr_err("RIP = 0x%016lx RSP = 0x%016lx\n", - vmcs_readl(HOST_RIP), vmcs_readl(HOST_RSP)); + pr_err("RIP = 0x%016llx RSP = 0x%016llx\n", + vmcs_readl(vcpu, HOST_RIP), vmcs_readl(vcpu, HOST_RSP)); pr_err("CS=%04x SS=%04x DS=%04x ES=%04x FS=%04x GS=%04x TR=%04x\n", - vmcs_read16(HOST_CS_SELECTOR), vmcs_read16(HOST_SS_SELECTOR), - vmcs_read16(HOST_DS_SELECTOR), vmcs_read16(HOST_ES_SELECTOR), - vmcs_read16(HOST_FS_SELECTOR), vmcs_read16(HOST_GS_SELECTOR), - vmcs_read16(HOST_TR_SELECTOR)); - pr_err("FSBase=%016lx GSBase=%016lx TRBase=%016lx\n", - vmcs_readl(HOST_FS_BASE), vmcs_readl(HOST_GS_BASE), - vmcs_readl(HOST_TR_BASE)); - pr_err("GDTBase=%016lx IDTBase=%016lx\n", - vmcs_readl(HOST_GDTR_BASE), vmcs_readl(HOST_IDTR_BASE)); - pr_err("CR0=%016lx CR3=%016lx CR4=%016lx\n", - vmcs_readl(HOST_CR0), vmcs_readl(HOST_CR3), - vmcs_readl(HOST_CR4)); - pr_err("Sysenter RSP=%016lx CS:RIP=%04x:%016lx\n", - vmcs_readl(HOST_IA32_SYSENTER_ESP), - vmcs_read32(HOST_IA32_SYSENTER_CS), - vmcs_readl(HOST_IA32_SYSENTER_EIP)); + vmcs_read16(vcpu, HOST_CS_SELECTOR), vmcs_read16(vcpu, HOST_SS_SELECTOR), + vmcs_read16(vcpu, HOST_DS_SELECTOR), vmcs_read16(vcpu, HOST_ES_SELECTOR), + vmcs_read16(vcpu, HOST_FS_SELECTOR), vmcs_read16(vcpu, HOST_GS_SELECTOR), + vmcs_read16(vcpu, HOST_TR_SELECTOR)); + pr_err("FSBase=%016llx GSBase=%016llx TRBase=%016llx\n", + vmcs_readl(vcpu, HOST_FS_BASE), vmcs_readl(vcpu, HOST_GS_BASE), + vmcs_readl(vcpu, HOST_TR_BASE)); + pr_err("GDTBase=%016llx IDTBase=%016llx\n", + vmcs_readl(vcpu, HOST_GDTR_BASE), vmcs_readl(vcpu, HOST_IDTR_BASE)); + pr_err("CR0=%016llx CR3=%016llx CR4=%016llx\n", + vmcs_readl(vcpu, HOST_CR0), vmcs_readl(vcpu, HOST_CR3), + vmcs_readl(vcpu, HOST_CR4)); + pr_err("Sysenter RSP=%016llx CS:RIP=%04x:%016llx\n", + vmcs_readl(vcpu, HOST_IA32_SYSENTER_ESP), + vmcs_read32(vcpu, HOST_IA32_SYSENTER_CS), + vmcs_readl(vcpu, HOST_IA32_SYSENTER_EIP)); if (vmexit_ctl & (VM_EXIT_LOAD_IA32_PAT | VM_EXIT_LOAD_IA32_EFER)) pr_err("EFER = 0x%016llx PAT = 0x%016llx\n", - vmcs_read64(HOST_IA32_EFER), - vmcs_read64(HOST_IA32_PAT)); + vmcs_read64(vcpu, HOST_IA32_EFER), + vmcs_read64(vcpu, HOST_IA32_PAT)); if (vmexit_ctl & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) pr_err("PerfGlobCtl = 0x%016llx\n", - vmcs_read64(HOST_IA32_PERF_GLOBAL_CTRL)); + vmcs_read64(vcpu, HOST_IA32_PERF_GLOBAL_CTRL)); pr_err("*** Control State ***\n"); pr_err("PinBased=%08x CPUBased=%08x SecondaryExec=%08x\n", pin_based_exec_ctrl, cpu_based_exec_ctrl, secondary_exec_control); pr_err("EntryControls=%08x ExitControls=%08x\n", vmentry_ctl, vmexit_ctl); pr_err("ExceptionBitmap=%08x PFECmask=%08x PFECmatch=%08x\n", - vmcs_read32(EXCEPTION_BITMAP), - vmcs_read32(PAGE_FAULT_ERROR_CODE_MASK), - 
vmcs_read32(PAGE_FAULT_ERROR_CODE_MATCH)); + vmcs_read32(vcpu, EXCEPTION_BITMAP), + vmcs_read32(vcpu, PAGE_FAULT_ERROR_CODE_MASK), + vmcs_read32(vcpu, PAGE_FAULT_ERROR_CODE_MATCH)); pr_err("VMEntry: intr_info=%08x errcode=%08x ilen=%08x\n", - vmcs_read32(VM_ENTRY_INTR_INFO_FIELD), - vmcs_read32(VM_ENTRY_EXCEPTION_ERROR_CODE), - vmcs_read32(VM_ENTRY_INSTRUCTION_LEN)); + vmcs_read32(vcpu, VM_ENTRY_INTR_INFO_FIELD), + vmcs_read32(vcpu, VM_ENTRY_EXCEPTION_ERROR_CODE), + vmcs_read32(vcpu, VM_ENTRY_INSTRUCTION_LEN)); pr_err("VMExit: intr_info=%08x errcode=%08x ilen=%08x\n", - vmcs_read32(VM_EXIT_INTR_INFO), - vmcs_read32(VM_EXIT_INTR_ERROR_CODE), - vmcs_read32(VM_EXIT_INSTRUCTION_LEN)); - pr_err(" reason=%08x qualification=%016lx\n", - vmcs_read32(VM_EXIT_REASON), vmcs_readl(EXIT_QUALIFICATION)); + vmcs_read32(vcpu, VM_EXIT_INTR_INFO), + vmcs_read32(vcpu, VM_EXIT_INTR_ERROR_CODE), + vmcs_read32(vcpu, VM_EXIT_INSTRUCTION_LEN)); + pr_err(" reason=%08x qualification=%016llx\n", + vmcs_read32(vcpu, VM_EXIT_REASON), vmcs_readl(vcpu, EXIT_QUALIFICATION)); + pr_err(" gpa=%016llx\n", vmcs_read64(vcpu, GUEST_PHYSICAL_ADDRESS)); pr_err("IDTVectoring: info=%08x errcode=%08x\n", - vmcs_read32(IDT_VECTORING_INFO_FIELD), - vmcs_read32(IDT_VECTORING_ERROR_CODE)); - pr_err("TSC Offset = 0x%016llx\n", vmcs_read64(TSC_OFFSET)); - if (secondary_exec_control & SECONDARY_EXEC_TSC_SCALING) - pr_err("TSC Multiplier = 0x%016llx\n", - vmcs_read64(TSC_MULTIPLIER)); + vmcs_read32(vcpu, IDT_VECTORING_INFO_FIELD), + vmcs_read32(vcpu, IDT_VECTORING_ERROR_CODE)); + pr_err("TSC Offset = 0x%016llx\n", vmcs_read64(vcpu, TSC_OFFSET)); if (cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW) - pr_err("TPR Threshold = 0x%02x\n", vmcs_read32(TPR_THRESHOLD)); - if (pin_based_exec_ctrl & PIN_BASED_POSTED_INTR) - pr_err("PostedIntrVec = 0x%02x\n", vmcs_read16(POSTED_INTR_NV)); + pr_err("TPR Threshold = 0x%02x\n", vmcs_read32(vcpu, TPR_THRESHOLD)); if ((secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT)) - pr_err("EPT pointer = 0x%016llx\n", vmcs_read64(EPT_POINTER)); - n = vmcs_read32(CR3_TARGET_COUNT); + pr_err("EPT pointer = 0x%016llx\n", vmcs_read64(vcpu, EPT_POINTER)); + n = vmcs_read32(vcpu, CR3_TARGET_COUNT); for (i = 0; i + 1 < n; i += 4) - pr_err("CR3 target%u=%016lx target%u=%016lx\n", - i, vmcs_readl(CR3_TARGET_VALUE0 + i * 2), - i + 1, vmcs_readl(CR3_TARGET_VALUE0 + i * 2 + 2)); + pr_err("CR3 target%u=%016llx target%u=%016llx\n", + i, vmcs_readl(vcpu, CR3_TARGET_VALUE0 + i * 2), + i + 1, vmcs_readl(vcpu, CR3_TARGET_VALUE0 + i * 2 + 2)); if (i < n) - pr_err("CR3 target%u=%016lx\n", - i, vmcs_readl(CR3_TARGET_VALUE0 + i * 2)); + pr_err("CR3 target%u=%016llx\n", + i, vmcs_readl(vcpu, CR3_TARGET_VALUE0 + i * 2)); if (secondary_exec_control & SECONDARY_EXEC_PAUSE_LOOP_EXITING) pr_err("PLE Gap=%08x Window=%08x\n", - vmcs_read32(PLE_GAP), vmcs_read32(PLE_WINDOW)); + vmcs_read32(vcpu, PLE_GAP), vmcs_read32(vcpu, PLE_WINDOW)); if (secondary_exec_control & SECONDARY_EXEC_ENABLE_VPID) pr_err("Virtual processor ID = 0x%04x\n", - vmcs_read16(VIRTUAL_PROCESSOR_ID)); + vmcs_read16(vcpu, VIRTUAL_PROCESSOR_ID)); } /* @@ -8365,8 +6852,6 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu) u32 exit_reason = vmx->exit_reason; u32 vectoring_info = vmx->idt_vectoring_info; - trace_kvm_exit(exit_reason, vcpu, KVM_ISA_VMX); - /* * Flush logged GPAs PML buffer, this will make dirty_bitmap more * updated. 
Another good is, in kvm_vm_ioctl_get_dirty_log, before @@ -8383,23 +6868,23 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu) if (is_guest_mode(vcpu) && nested_vmx_exit_handled(vcpu)) { nested_vmx_vmexit(vcpu, exit_reason, - vmcs_read32(VM_EXIT_INTR_INFO), - vmcs_readl(EXIT_QUALIFICATION)); + vmcs_read32(vcpu, VM_EXIT_INTR_INFO), + vmcs_readl(vcpu, EXIT_QUALIFICATION)); return 1; } if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) { - dump_vmcs(); - vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY; + dump_vmcs(vcpu); + vcpu->run->exit_reason = GVM_EXIT_FAIL_ENTRY; vcpu->run->fail_entry.hardware_entry_failure_reason = exit_reason; return 0; } if (unlikely(vmx->fail)) { - vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY; + vcpu->run->exit_reason = GVM_EXIT_FAIL_ENTRY; vcpu->run->fail_entry.hardware_entry_failure_reason - = vmcs_read32(VM_INSTRUCTION_ERROR); + = vmcs_read32(vcpu, VM_INSTRUCTION_ERROR); return 0; } @@ -8415,8 +6900,8 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu) exit_reason != EXIT_REASON_EPT_VIOLATION && exit_reason != EXIT_REASON_PML_FULL && exit_reason != EXIT_REASON_TASK_SWITCH)) { - vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; - vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_DELIVERY_EV; + vcpu->run->exit_reason = GVM_EXIT_INTERNAL_ERROR; + vcpu->run->internal.suberror = GVM_INTERNAL_ERROR_DELIVERY_EV; vcpu->run->internal.ndata = 2; vcpu->run->internal.data[0] = vectoring_info; vcpu->run->internal.data[1] = exit_reason; @@ -8447,7 +6932,7 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu) && kvm_vmx_exit_handlers[exit_reason]) return kvm_vmx_exit_handlers[exit_reason](vcpu); else { - WARN_ONCE(1, "vmx: unexpected exit reason 0x%x\n", exit_reason); + //WARN_ONCE(1, "vmx: unexpected exit reason 0x%x\n", exit_reason); kvm_queue_exception(vcpu, UD_VECTOR); return 1; } @@ -8462,11 +6947,11 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr) return; if (irr == -1 || tpr < irr) { - vmcs_write32(TPR_THRESHOLD, 0); + vmcs_write32(vcpu, TPR_THRESHOLD, 0); return; } - vmcs_write32(TPR_THRESHOLD, irr); + vmcs_write32(vcpu, TPR_THRESHOLD, irr); } static void vmx_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set) @@ -8485,7 +6970,7 @@ static void vmx_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set) if (!cpu_need_tpr_shadow(vcpu)) return; - sec_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); + sec_exec_control = vmcs_read32(vcpu, SECONDARY_VM_EXEC_CONTROL); if (set) { sec_exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; @@ -8494,14 +6979,14 @@ static void vmx_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set) sec_exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE; sec_exec_control |= SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; } - vmcs_write32(SECONDARY_VM_EXEC_CONTROL, sec_exec_control); + vmcs_write32(vcpu, SECONDARY_VM_EXEC_CONTROL, sec_exec_control); vmx_set_msr_bitmap(vcpu); } static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu, hpa_t hpa) { - struct vcpu_vmx *vmx = to_vmx(vcpu); + //struct vcpu_vmx *vmx = to_vmx(vcpu); /* * Currently we do not handle the nested case where L2 has an @@ -8516,10 +7001,10 @@ static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu, hpa_t hpa) * prepare_vmcs02. If the latter, the vmcs01 will be updated in * the next L2->L1 exit. 
*/ - if (!is_guest_mode(vcpu) || - !nested_cpu_has2(get_vmcs12(&vmx->vcpu), - SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) - vmcs_write64(APIC_ACCESS_ADDR, hpa); + //if (!is_guest_mode(vcpu) || + //!nested_cpu_has2(get_vmcs12(&vmx->vcpu), + //SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) + vmcs_write64(vcpu, APIC_ACCESS_ADDR, hpa); } static void vmx_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr) @@ -8530,16 +7015,16 @@ static void vmx_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr) if (max_isr == -1) max_isr = 0; - status = vmcs_read16(GUEST_INTR_STATUS); + status = vmcs_read16(vcpu, GUEST_INTR_STATUS); old = status >> 8; if (max_isr != old) { status &= 0xff; status |= max_isr << 8; - vmcs_write16(GUEST_INTR_STATUS, status); + vmcs_write16(vcpu, GUEST_INTR_STATUS, status); } } -static void vmx_set_rvi(int vector) +static void vmx_set_rvi(struct kvm_vcpu *vcpu, int vector) { u16 status; u8 old; @@ -8547,19 +7032,19 @@ static void vmx_set_rvi(int vector) if (vector == -1) vector = 0; - status = vmcs_read16(GUEST_INTR_STATUS); + status = vmcs_read16(vcpu, GUEST_INTR_STATUS); old = (u8)status & 0xff; if ((u8)vector != old) { status &= ~0xff; status |= (u8)vector; - vmcs_write16(GUEST_INTR_STATUS, status); + vmcs_write16(vcpu, GUEST_INTR_STATUS, status); } } static void vmx_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr) { if (!is_guest_mode(vcpu)) { - vmx_set_rvi(max_irr); + vmx_set_rvi(vcpu, max_irr); return; } @@ -8589,12 +7074,13 @@ static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) if (!kvm_vcpu_apicv_active(vcpu)) return; - vmcs_write64(EOI_EXIT_BITMAP0, eoi_exit_bitmap[0]); - vmcs_write64(EOI_EXIT_BITMAP1, eoi_exit_bitmap[1]); - vmcs_write64(EOI_EXIT_BITMAP2, eoi_exit_bitmap[2]); - vmcs_write64(EOI_EXIT_BITMAP3, eoi_exit_bitmap[3]); + vmcs_write64(vcpu, EOI_EXIT_BITMAP0, eoi_exit_bitmap[0]); + vmcs_write64(vcpu, EOI_EXIT_BITMAP1, eoi_exit_bitmap[1]); + vmcs_write64(vcpu, EOI_EXIT_BITMAP2, eoi_exit_bitmap[2]); + vmcs_write64(vcpu, EOI_EXIT_BITMAP3, eoi_exit_bitmap[3]); } +static u64 nmi_count = 0; static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx) { u32 exit_intr_info; @@ -8603,7 +7089,7 @@ static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx) || vmx->exit_reason == EXIT_REASON_EXCEPTION_NMI)) return; - vmx->exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); + vmx->exit_intr_info = vmcs_read32(&vmx->vcpu, VM_EXIT_INTR_INFO); exit_intr_info = vmx->exit_intr_info; /* Handle machine checks before interrupts are enabled */ @@ -8614,15 +7100,15 @@ static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx) if ((exit_intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR && (exit_intr_info & INTR_INFO_VALID_MASK)) { kvm_before_handle_nmi(&vmx->vcpu); - asm("int $2"); + __int2(); + nmi_count++; kvm_after_handle_nmi(&vmx->vcpu); } } static void vmx_handle_external_intr(struct kvm_vcpu *vcpu) { - u32 exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); - register void *__sp asm(_ASM_SP); + u32 exit_intr_info = vmcs_read32(vcpu, VM_EXIT_INTR_INFO); /* * If external interrupt exists, IF bit is set in rflags/eflags on the @@ -8632,36 +7118,13 @@ static void vmx_handle_external_intr(struct kvm_vcpu *vcpu) if ((exit_intr_info & (INTR_INFO_VALID_MASK | INTR_INFO_INTR_TYPE_MASK)) == (INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR)) { unsigned int vector; - unsigned long entry; + size_t entry; gate_desc *desc; - struct vcpu_vmx *vmx = to_vmx(vcpu); -#ifdef CONFIG_X86_64 - unsigned long tmp; -#endif vector = exit_intr_info & INTR_INFO_VECTOR_MASK; - desc = 
(gate_desc *)vmx->host_idt_base + vector; + desc = (gate_desc *)(this_cpu_ptr(&host_idt))->address + vector; entry = gate_offset(*desc); - asm volatile( -#ifdef CONFIG_X86_64 - "mov %%" _ASM_SP ", %[sp]\n\t" - "and $0xfffffffffffffff0, %%" _ASM_SP "\n\t" - "push $%c[ss]\n\t" - "push %[sp]\n\t" -#endif - "pushf\n\t" - __ASM_SIZE(push) " $%c[cs]\n\t" - "call *%[entry]\n\t" - : -#ifdef CONFIG_X86_64 - [sp]"=&r"(tmp), -#endif - "+r"(__sp) - : - [entry]"r"(entry), - [ss]"i"(__KERNEL_DS), - [cs]"i"(__KERNEL_CS) - ); + __asm_vmx_handle_external_intr(entry); } } @@ -8698,7 +7161,7 @@ static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx) * Can't use vmx->exit_intr_info since we're not sure what * the exit reason is. */ - exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); + exit_intr_info = vmcs_read32(&vmx->vcpu, VM_EXIT_INTR_INFO); unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0; vector = exit_intr_info & INTR_INFO_VECTOR_MASK; /* @@ -8713,11 +7176,11 @@ static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx) */ if ((exit_intr_info & INTR_INFO_VALID_MASK) && unblock_nmi && vector != DF_VECTOR && !idtv_info_valid) - vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, + vmcs_set_bits(&vmx->vcpu, GUEST_INTERRUPTIBILITY_INFO, GUEST_INTR_STATE_NMI); else vmx->nmi_known_unmasked = - !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) + !(vmcs_read32(&vmx->vcpu, GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_NMI); } else if (unlikely(vmx->soft_vnmi_blocked)) vmx->vnmi_blocked_time += @@ -8742,7 +7205,7 @@ static void __vmx_complete_interrupts(struct kvm_vcpu *vcpu, if (!idtv_info_valid) return; - kvm_make_request(KVM_REQ_EVENT, vcpu); + kvm_make_request(GVM_REQ_EVENT, vcpu); vector = idt_vectoring_info & VECTORING_INFO_VECTOR_MASK; type = idt_vectoring_info & VECTORING_INFO_TYPE_MASK; @@ -8758,17 +7221,17 @@ static void __vmx_complete_interrupts(struct kvm_vcpu *vcpu, vmx_set_nmi_mask(vcpu, false); break; case INTR_TYPE_SOFT_EXCEPTION: - vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field); + vcpu->arch.event_exit_inst_len = vmcs_read32(vcpu, instr_len_field); /* fall through */ case INTR_TYPE_HARD_EXCEPTION: if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) { - u32 err = vmcs_read32(error_code_field); + u32 err = vmcs_read32(vcpu, error_code_field); kvm_requeue_exception_e(vcpu, vector, err); } else kvm_requeue_exception(vcpu, vector); break; case INTR_TYPE_SOFT_INTR: - vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field); + vcpu->arch.event_exit_inst_len = vmcs_read32(vcpu, instr_len_field); /* fall through */ case INTR_TYPE_EXT_INTR: kvm_queue_interrupt(vcpu, vector, type == INTR_TYPE_SOFT_INTR); @@ -8788,15 +7251,16 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx) static void vmx_cancel_injection(struct kvm_vcpu *vcpu) { __vmx_complete_interrupts(vcpu, - vmcs_read32(VM_ENTRY_INTR_INFO_FIELD), + vmcs_read32(vcpu, VM_ENTRY_INTR_INFO_FIELD), VM_ENTRY_INSTRUCTION_LEN, VM_ENTRY_EXCEPTION_ERROR_CODE); - vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); + vmcs_write32(vcpu, VM_ENTRY_INTR_INFO_FIELD, 0); } static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx) { +#if 0 int i, nr_msrs; struct perf_guest_switch_msr *msrs; @@ -8811,32 +7275,26 @@ static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx) else add_atomic_switch_msr(vmx, msrs[i].msr, msrs[i].guest, msrs[i].host); +#endif } -void vmx_arm_hv_timer(struct kvm_vcpu *vcpu) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - u64 tscl; - u32 delta_tsc; - - if (vmx->hv_deadline_tsc == -1) - return; - - tscl = rdtsc(); - if 
(vmx->hv_deadline_tsc > tscl) - /* sure to be 32 bit only because checked on set_hv_timer */ - delta_tsc = (u32)((vmx->hv_deadline_tsc - tscl) >> - cpu_preemption_timer_multi); - else - delta_tsc = 0; - - vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, delta_tsc); -} +u64 last_vmexit_rip = 0; +u64 last_vmexit_rsp = 0; +u64 rip = 0xffffffffffffffff; +u8 do_print = 1; +u8 do_print1 = 1; -static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) +static void __declspec(noinline) vmx_vcpu_run(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); - unsigned long debugctlmsr, cr4; + //size_t debugctlmsr, cr4; + size_t cr4; + struct desc_ptr *gdt = this_cpu_ptr(&host_gdt); + struct desc_ptr *idt = this_cpu_ptr(&host_idt); + size_t sysenter_esp; + unsigned int i; + struct msr_autoload *m = &vmx->msr_autoload; + /* Record the guest's net vcpu time for enforced NMI injections. */ if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked)) @@ -8847,24 +7305,19 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) if (vmx->emulation_required) return; - if (vmx->ple_window_dirty) { - vmx->ple_window_dirty = false; - vmcs_write32(PLE_WINDOW, vmx->ple_window); - } - if (vmx->nested.sync_shadow_vmcs) { copy_vmcs12_to_shadow(vmx); vmx->nested.sync_shadow_vmcs = false; } - if (test_bit(VCPU_REGS_RSP, (unsigned long *)&vcpu->arch.regs_dirty)) - vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]); - if (test_bit(VCPU_REGS_RIP, (unsigned long *)&vcpu->arch.regs_dirty)) - vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]); + if (test_bit(VCPU_REGS_RSP, (size_t *)&vcpu->arch.regs_dirty)) + vmcs_writel(vcpu, GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]); + if (test_bit(VCPU_REGS_RIP, (size_t *)&vcpu->arch.regs_dirty)) + vmcs_writel(vcpu, GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]); cr4 = cr4_read_shadow(); if (unlikely(cr4 != vmx->host_state.vmcs_host_cr4)) { - vmcs_writel(HOST_CR4, cr4); + vmcs_writel(vcpu, HOST_CR4, cr4); vmx->host_state.vmcs_host_cr4 = cr4; } @@ -8873,126 +7326,50 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) * vmentry fails as it then expects bit 14 (BS) in pending debug * exceptions being set, but that's not correct for the guest debugging * case. */ - if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) + if (vcpu->guest_debug & GVM_GUESTDBG_SINGLESTEP) vmx_set_interrupt_shadow(vcpu, 0); - if (vmx->guest_pkru_valid) - __write_pkru(vmx->guest_pkru); + vmcs_writel(vcpu, HOST_TR_BASE, kvm_read_tr_base()); /* 22.2.4 */ + vmcs_writel(vcpu, HOST_GDTR_BASE, gdt->address); /* 22.2.4 */ + vmcs_writel(vcpu, HOST_IDTR_BASE, idt->address); /* 22.2.4 */ + rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp); + vmcs_writel(vcpu, HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */ +#if 0 atomic_switch_perf_msrs(vmx); debugctlmsr = get_debugctlmsr(); - - vmx_arm_hv_timer(vcpu); - - vmx->__launched = vmx->loaded_vmcs->launched; - asm( - /* Store host registers */ - "push %%" _ASM_DX "; push %%" _ASM_BP ";" - "push %%" _ASM_CX " \n\t" /* placeholder for guest rcx */ - "push %%" _ASM_CX " \n\t" - "cmp %%" _ASM_SP ", %c[host_rsp](%0) \n\t" - "je 1f \n\t" - "mov %%" _ASM_SP ", %c[host_rsp](%0) \n\t" - __ex(ASM_VMX_VMWRITE_RSP_RDX) "\n\t" - "1: \n\t" - /* Reload cr2 if changed */ - "mov %c[cr2](%0), %%" _ASM_AX " \n\t" - "mov %%cr2, %%" _ASM_DX " \n\t" - "cmp %%" _ASM_AX ", %%" _ASM_DX " \n\t" - "je 2f \n\t" - "mov %%" _ASM_AX", %%cr2 \n\t" - "2: \n\t" - /* Check if vmlaunch of vmresume is needed */ - "cmpl $0, %c[launched](%0) \n\t" - /* Load guest registers. Don't clobber flags. 
*/ - "mov %c[rax](%0), %%" _ASM_AX " \n\t" - "mov %c[rbx](%0), %%" _ASM_BX " \n\t" - "mov %c[rdx](%0), %%" _ASM_DX " \n\t" - "mov %c[rsi](%0), %%" _ASM_SI " \n\t" - "mov %c[rdi](%0), %%" _ASM_DI " \n\t" - "mov %c[rbp](%0), %%" _ASM_BP " \n\t" -#ifdef CONFIG_X86_64 - "mov %c[r8](%0), %%r8 \n\t" - "mov %c[r9](%0), %%r9 \n\t" - "mov %c[r10](%0), %%r10 \n\t" - "mov %c[r11](%0), %%r11 \n\t" - "mov %c[r12](%0), %%r12 \n\t" - "mov %c[r13](%0), %%r13 \n\t" - "mov %c[r14](%0), %%r14 \n\t" - "mov %c[r15](%0), %%r15 \n\t" -#endif - "mov %c[rcx](%0), %%" _ASM_CX " \n\t" /* kills %0 (ecx) */ - - /* Enter guest mode */ - "jne 1f \n\t" - __ex(ASM_VMX_VMLAUNCH) "\n\t" - "jmp 2f \n\t" - "1: " __ex(ASM_VMX_VMRESUME) "\n\t" - "2: " - /* Save guest registers, load host registers, keep flags */ - "mov %0, %c[wordsize](%%" _ASM_SP ") \n\t" - "pop %0 \n\t" - "mov %%" _ASM_AX ", %c[rax](%0) \n\t" - "mov %%" _ASM_BX ", %c[rbx](%0) \n\t" - __ASM_SIZE(pop) " %c[rcx](%0) \n\t" - "mov %%" _ASM_DX ", %c[rdx](%0) \n\t" - "mov %%" _ASM_SI ", %c[rsi](%0) \n\t" - "mov %%" _ASM_DI ", %c[rdi](%0) \n\t" - "mov %%" _ASM_BP ", %c[rbp](%0) \n\t" -#ifdef CONFIG_X86_64 - "mov %%r8, %c[r8](%0) \n\t" - "mov %%r9, %c[r9](%0) \n\t" - "mov %%r10, %c[r10](%0) \n\t" - "mov %%r11, %c[r11](%0) \n\t" - "mov %%r12, %c[r12](%0) \n\t" - "mov %%r13, %c[r13](%0) \n\t" - "mov %%r14, %c[r14](%0) \n\t" - "mov %%r15, %c[r15](%0) \n\t" -#endif - "mov %%cr2, %%" _ASM_AX " \n\t" - "mov %%" _ASM_AX ", %c[cr2](%0) \n\t" - - "pop %%" _ASM_BP "; pop %%" _ASM_DX " \n\t" - "setbe %c[fail](%0) \n\t" - ".pushsection .rodata \n\t" - ".global vmx_return \n\t" - "vmx_return: " _ASM_PTR " 2b \n\t" - ".popsection" - : : "c"(vmx), "d"((unsigned long)HOST_RSP), - [launched]"i"(offsetof(struct vcpu_vmx, __launched)), - [fail]"i"(offsetof(struct vcpu_vmx, fail)), - [host_rsp]"i"(offsetof(struct vcpu_vmx, host_rsp)), - [rax]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RAX])), - [rbx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBX])), - [rcx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RCX])), - [rdx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RDX])), - [rsi]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RSI])), - [rdi]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RDI])), - [rbp]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBP])), -#ifdef CONFIG_X86_64 - [r8]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R8])), - [r9]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R9])), - [r10]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R10])), - [r11]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R11])), - [r12]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R12])), - [r13]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R13])), - [r14]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R14])), - [r15]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R15])), #endif - [cr2]"i"(offsetof(struct vcpu_vmx, vcpu.arch.cr2)), - [wordsize]"i"(sizeof(ulong)) - : "cc", "memory" -#ifdef CONFIG_X86_64 - , "rax", "rbx", "rdi", "rsi" - , "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15" -#else - , "eax", "ebx", "edi", "esi" -#endif - ); + if (do_print1) { + dump_vmcs(vcpu); + do_print1 = 0; + } + vmcs_load(vmx->loaded_vmcs->vmcs); + + for (i = 0; i < m->nr; i++) + wrmsrl(m->guest[i].index, m->guest[i].value); + /* Calls to low-level assembly functions*/ + __asm_vmx_vcpu_run(vmx); + for (i = 0; i < m->nr; i++) + wrmsrl(m->host[i].index, m->host[i].value); + 
vmcs_clear(vmx->loaded_vmcs->vmcs); + + if (vcpu->vcpu_id == 0) { + last_vmexit_rip = vmcs_read64(vcpu, GUEST_RIP); + last_vmexit_rsp = vmcs_read64(vcpu, GUEST_RSP); + } + if (do_print && (vcpu->vcpu_id == 0)) { + DbgPrint("-------------------vcpu 0-----------------------------------------------------------\n"); + dump_vmcs(vcpu); + do_print = 0; + } + if (last_vmexit_rip == rip) + DbgBreakPoint(); +#if 0 /* MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. Restore it if needed */ if (debugctlmsr) update_debugctlmsr(debugctlmsr); +#endif #ifndef CONFIG_X86_64 /* @@ -9014,33 +7391,17 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) | (1 << VCPU_EXREG_CR3)); vcpu->arch.regs_dirty = 0; - vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); - - vmx->loaded_vmcs->launched = 1; - - vmx->exit_reason = vmcs_read32(VM_EXIT_REASON); + vmx->idt_vectoring_info = vmcs_read32(vcpu, IDT_VECTORING_INFO_FIELD); - /* - * eager fpu is enabled if PKEY is supported and CR4 is switched - * back on host, so it is safe to read guest PKRU from current - * XSAVE. - */ - if (boot_cpu_has(X86_FEATURE_OSPKE)) { - vmx->guest_pkru = __read_pkru(); - if (vmx->guest_pkru != vmx->host_pkru) { - vmx->guest_pkru_valid = true; - __write_pkru(vmx->host_pkru); - } else - vmx->guest_pkru_valid = false; - } + vmx->exit_reason = vmcs_read32(vcpu, VM_EXIT_REASON); /* - * the KVM_REQ_EVENT optimization bit is only on for one entry, and if + * the GVM_REQ_EVENT optimization bit is only on for one entry, and if * we did not inject a still-pending event to L1 now because of * nested_run_pending, we need to re-enable this bit. */ if (vmx->nested.nested_run_pending) - kvm_make_request(KVM_REQ_EVENT, vcpu); + kvm_make_request(GVM_REQ_EVENT, vcpu); vmx->nested.nested_run_pending = 0; @@ -9072,13 +7433,9 @@ static void vmx_load_vmcs01(struct kvm_vcpu *vcpu) static void vmx_free_vcpu_nested(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); - int r; - r = vcpu_load(vcpu); - BUG_ON(r); vmx_load_vmcs01(vcpu); free_nested(vmx); - vcpu_put(vcpu); } static void vmx_free_vcpu(struct kvm_vcpu *vcpu) @@ -9091,16 +7448,14 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu) leave_guest_mode(vcpu); vmx_free_vcpu_nested(vcpu); free_loaded_vmcs(vmx->loaded_vmcs); - kfree(vmx->guest_msrs); kvm_vcpu_uninit(vcpu); - kmem_cache_free(kvm_vcpu_cache, vmx); + kfree(vmx); } static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) { int err; - struct vcpu_vmx *vmx = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL); - int cpu; + struct vcpu_vmx *vmx = kzalloc_fast(sizeof(struct vcpu_vmx), GFP_KERNEL); if (!vmx) return ERR_PTR(-ENOMEM); @@ -9125,30 +7480,19 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) goto uninit_vcpu; } - vmx->guest_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL); - BUILD_BUG_ON(ARRAY_SIZE(vmx_msr_index) * sizeof(vmx->guest_msrs[0]) - > PAGE_SIZE); - - if (!vmx->guest_msrs) - goto free_pml; - vmx->loaded_vmcs = &vmx->vmcs01; vmx->loaded_vmcs->vmcs = alloc_vmcs(); + DbgPrint("vmcs allocated with phys %llx on cpu %d\n", __pa(vmx->loaded_vmcs->vmcs), smp_processor_id()); vmx->loaded_vmcs->shadow_vmcs = NULL; if (!vmx->loaded_vmcs->vmcs) - goto free_msrs; + goto free_pml; if (!vmm_exclusive) kvm_cpu_vmxon(__pa(per_cpu(vmxarea, raw_smp_processor_id()))); loaded_vmcs_init(vmx->loaded_vmcs); if (!vmm_exclusive) kvm_cpu_vmxoff(); - cpu = get_cpu(); - vmx_vcpu_load(&vmx->vcpu, cpu); - vmx->vcpu.cpu = cpu; err = vmx_vcpu_setup(vmx); - vmx_vcpu_put(&vmx->vcpu); - put_cpu(); if (err) goto 
free_vmcs; if (cpu_need_virtualize_apic_accesses(&vmx->vcpu)) { @@ -9171,7 +7515,6 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) vmx->nested.vpid02 = allocate_vpid(); } - vmx->nested.posted_intr_nv = -1; vmx->nested.current_vmptr = -1ull; vmx->nested.current_vmcs12 = NULL; @@ -9182,19 +7525,17 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) free_vmcs: free_vpid(vmx->nested.vpid02); free_loaded_vmcs(vmx->loaded_vmcs); -free_msrs: - kfree(vmx->guest_msrs); free_pml: vmx_destroy_pml_buffer(vmx); uninit_vcpu: kvm_vcpu_uninit(&vmx->vcpu); free_vcpu: free_vpid(vmx->vpid); - kmem_cache_free(kvm_vcpu_cache, vmx); + kfree(vmx); return ERR_PTR(err); } -static void __init vmx_check_processor_compat(void *rtn) +static void vmx_check_processor_compat(void *rtn) { struct vmcs_config vmcs_conf; @@ -9234,22 +7575,14 @@ static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) goto exit; } - if (!kvm_arch_has_noncoherent_dma(vcpu->kvm)) { + //if (!kvm_arch_has_noncoherent_dma(vcpu->kvm)) { + { ipat = VMX_EPT_IPAT_BIT; cache = MTRR_TYPE_WRBACK; goto exit; } - if (kvm_read_cr0(vcpu) & X86_CR0_CD) { - ipat = VMX_EPT_IPAT_BIT; - if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED)) - cache = MTRR_TYPE_WRBACK; - else - cache = MTRR_TYPE_UNCACHABLE; - goto exit; - } - - cache = kvm_mtrr_get_guest_memory_type(vcpu, gfn); + //cache = kvm_mtrr_get_guest_memory_type(vcpu, gfn); exit: return (cache << VMX_EPT_MT_EPTE_SHIFT) | ipat; @@ -9264,7 +7597,7 @@ static int vmx_get_lpage_level(void) return PT_PDPE_LEVEL; } -static void vmcs_set_secondary_exec_control(u32 new_ctl) +static void vmcs_set_secondary_exec_control(struct kvm_vcpu *vcpu, u32 new_ctl) { /* * These bits in the secondary execution controls field @@ -9277,15 +7610,15 @@ static void vmcs_set_secondary_exec_control(u32 new_ctl) SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; - u32 cur_ctl = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); + u32 cur_ctl = vmcs_read32(vcpu, SECONDARY_VM_EXEC_CONTROL); - vmcs_write32(SECONDARY_VM_EXEC_CONTROL, + vmcs_write32(vcpu, SECONDARY_VM_EXEC_CONTROL, (new_ctl & ~mask) | (cur_ctl & mask)); } static void vmx_cpuid_update(struct kvm_vcpu *vcpu) { - struct kvm_cpuid_entry2 *best; + struct kvm_cpuid_entry *best; struct vcpu_vmx *vmx = to_vmx(vcpu); u32 secondary_exec_ctl = vmx_secondary_exec_control(vmx); @@ -9316,7 +7649,7 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu) } if (cpu_has_secondary_exec_ctrls()) - vmcs_set_secondary_exec_control(secondary_exec_ctl); + vmcs_set_secondary_exec_control(vcpu, secondary_exec_ctl); if (nested_vmx_allowed(vcpu)) to_vmx(vcpu)->msr_ia32_feature_control_valid_bits |= @@ -9326,7 +7659,7 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu) ~FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX; } -static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry) +static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry *entry) { if (func == 1 && nested) entry->ecx |= bit(X86_FEATURE_VMX); @@ -9348,7 +7681,7 @@ static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu, /* Callbacks for nested_ept_init_mmu_context: */ -static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu) +static size_t nested_ept_get_cr3(struct kvm_vcpu *vcpu) { /* return the page table to be shadowed - in our case, EPT12 */ return get_vmcs12(vcpu)->ept_pointer; @@ -9393,8 +7726,8 @@ static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu, if 
(nested_vmx_is_page_fault_vmexit(vmcs12, fault->error_code)) nested_vmx_vmexit(vcpu, to_vmx(vcpu)->exit_reason, - vmcs_read32(VM_EXIT_INTR_INFO), - vmcs_readl(EXIT_QUALIFICATION)); + vmcs_read32(vcpu, VM_EXIT_INTR_INFO), + vmcs_readl(vcpu, EXIT_QUALIFICATION)); else kvm_inject_page_fault(vcpu, fault); } @@ -9416,9 +7749,9 @@ static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu, * physical address remains valid. We keep a reference * to it so we can release it later. */ - if (vmx->nested.apic_access_page) /* shouldn't happen */ - nested_release_page(vmx->nested.apic_access_page); - vmx->nested.apic_access_page = + if (vmx->nested.apic_access_mdl) /* shouldn't happen */ + nested_release_page(vmx->nested.apic_access_mdl); + vmx->nested.apic_access_mdl = nested_get_page(vcpu, vmcs12->apic_access_addr); } @@ -9427,9 +7760,9 @@ static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu, vmcs12->virtual_apic_page_addr >> maxphyaddr) return false; - if (vmx->nested.virtual_apic_page) /* shouldn't happen */ - nested_release_page(vmx->nested.virtual_apic_page); - vmx->nested.virtual_apic_page = + if (vmx->nested.virtual_apic_mdl) /* shouldn't happen */ + nested_release_page(vmx->nested.virtual_apic_mdl); + vmx->nested.virtual_apic_mdl = nested_get_page(vcpu, vmcs12->virtual_apic_page_addr); /* @@ -9442,61 +7775,13 @@ static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu, * the execution control. But such a configuration is useless, * so let's keep the code simple. */ - if (!vmx->nested.virtual_apic_page) + if (!vmx->nested.virtual_apic_mdl) return false; } - if (nested_cpu_has_posted_intr(vmcs12)) { - if (!IS_ALIGNED(vmcs12->posted_intr_desc_addr, 64) || - vmcs12->posted_intr_desc_addr >> maxphyaddr) - return false; - - if (vmx->nested.pi_desc_page) { /* shouldn't happen */ - kunmap(vmx->nested.pi_desc_page); - nested_release_page(vmx->nested.pi_desc_page); - } - vmx->nested.pi_desc_page = - nested_get_page(vcpu, vmcs12->posted_intr_desc_addr); - if (!vmx->nested.pi_desc_page) - return false; - - vmx->nested.pi_desc = - (struct pi_desc *)kmap(vmx->nested.pi_desc_page); - if (!vmx->nested.pi_desc) { - nested_release_page_clean(vmx->nested.pi_desc_page); - return false; - } - vmx->nested.pi_desc = - (struct pi_desc *)((void *)vmx->nested.pi_desc + - (unsigned long)(vmcs12->posted_intr_desc_addr & - (PAGE_SIZE - 1))); - } - return true; } -static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu) -{ - u64 preemption_timeout = get_vmcs12(vcpu)->vmx_preemption_timer_value; - struct vcpu_vmx *vmx = to_vmx(vcpu); - - if (vcpu->arch.virtual_tsc_khz == 0) - return; - - /* Make sure short timeouts reliably trigger an immediate vmexit. - * hrtimer_start does not guarantee this. 
*/ - if (preemption_timeout <= 1) { - vmx_preemption_timer_fn(&vmx->nested.preemption_timer); - return; - } - - preemption_timeout <<= VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE; - preemption_timeout *= 1000000; - do_div(preemption_timeout, vcpu->arch.virtual_tsc_khz); - hrtimer_start(&vmx->nested.preemption_timer, - ns_to_ktime(preemption_timeout), HRTIMER_MODE_REL); -} - static int nested_vmx_check_msr_bitmap_controls(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { @@ -9527,22 +7812,22 @@ static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { int msr; - struct page *page; - unsigned long *msr_bitmap_l1; - unsigned long *msr_bitmap_l0 = to_vmx(vcpu)->nested.msr_bitmap; + PMDL kmap_mdl; + size_t *msr_bitmap_l1; + size_t *msr_bitmap_l0 = to_vmx(vcpu)->nested.msr_bitmap; /* This shortcut is ok because we support only x2APIC MSRs so far. */ if (!nested_cpu_has_virt_x2apic_mode(vmcs12)) return false; - page = nested_get_page(vcpu, vmcs12->msr_bitmap); - if (!page) { + kmap_mdl = nested_get_page(vcpu, vmcs12->msr_bitmap); + if (!kmap_mdl) { WARN_ON(1); return false; } - msr_bitmap_l1 = (unsigned long *)kmap(page); + msr_bitmap_l1 = (size_t *)kmap(kmap_mdl); if (!msr_bitmap_l1) { - nested_release_page_clean(page); + nested_release_page(kmap_mdl); WARN_ON(1); return false; } @@ -9572,8 +7857,8 @@ static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu, MSR_TYPE_W); } } - kunmap(page); - nested_release_page_clean(page); + kunmap(kmap_mdl); + nested_release_page(kmap_mdl); return true; } @@ -9583,8 +7868,7 @@ static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu, { if (!nested_cpu_has_virt_x2apic_mode(vmcs12) && !nested_cpu_has_apic_reg_virt(vmcs12) && - !nested_cpu_has_vid(vmcs12) && - !nested_cpu_has_posted_intr(vmcs12)) + !nested_cpu_has_vid(vmcs12)) return 0; /* @@ -9603,17 +7887,6 @@ static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu, !nested_exit_on_intr(vcpu)) return -EINVAL; - /* - * bits 15:8 should be zero in posted_intr_nv, - * the descriptor address has been already checked - * in nested_get_vmcs12_pages. - */ - if (nested_cpu_has_posted_intr(vmcs12) && - (!nested_cpu_has_vid(vmcs12) || - !nested_exit_intr_ack_set(vcpu) || - vmcs12->posted_intr_nv & 0xff00)) - return -EINVAL; - /* tpr shadow is needed by all apicv features. 
*/ if (!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) return -EINVAL; @@ -9622,8 +7895,8 @@ static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu, } static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu, - unsigned long count_field, - unsigned long addr_field) + size_t count_field, + size_t addr_field) { int maxphyaddr; u64 count, addr; @@ -9792,98 +8065,74 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) struct vcpu_vmx *vmx = to_vmx(vcpu); u32 exec_control; - vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector); - vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector); - vmcs_write16(GUEST_SS_SELECTOR, vmcs12->guest_ss_selector); - vmcs_write16(GUEST_DS_SELECTOR, vmcs12->guest_ds_selector); - vmcs_write16(GUEST_FS_SELECTOR, vmcs12->guest_fs_selector); - vmcs_write16(GUEST_GS_SELECTOR, vmcs12->guest_gs_selector); - vmcs_write16(GUEST_LDTR_SELECTOR, vmcs12->guest_ldtr_selector); - vmcs_write16(GUEST_TR_SELECTOR, vmcs12->guest_tr_selector); - vmcs_write32(GUEST_ES_LIMIT, vmcs12->guest_es_limit); - vmcs_write32(GUEST_CS_LIMIT, vmcs12->guest_cs_limit); - vmcs_write32(GUEST_SS_LIMIT, vmcs12->guest_ss_limit); - vmcs_write32(GUEST_DS_LIMIT, vmcs12->guest_ds_limit); - vmcs_write32(GUEST_FS_LIMIT, vmcs12->guest_fs_limit); - vmcs_write32(GUEST_GS_LIMIT, vmcs12->guest_gs_limit); - vmcs_write32(GUEST_LDTR_LIMIT, vmcs12->guest_ldtr_limit); - vmcs_write32(GUEST_TR_LIMIT, vmcs12->guest_tr_limit); - vmcs_write32(GUEST_GDTR_LIMIT, vmcs12->guest_gdtr_limit); - vmcs_write32(GUEST_IDTR_LIMIT, vmcs12->guest_idtr_limit); - vmcs_write32(GUEST_ES_AR_BYTES, vmcs12->guest_es_ar_bytes); - vmcs_write32(GUEST_CS_AR_BYTES, vmcs12->guest_cs_ar_bytes); - vmcs_write32(GUEST_SS_AR_BYTES, vmcs12->guest_ss_ar_bytes); - vmcs_write32(GUEST_DS_AR_BYTES, vmcs12->guest_ds_ar_bytes); - vmcs_write32(GUEST_FS_AR_BYTES, vmcs12->guest_fs_ar_bytes); - vmcs_write32(GUEST_GS_AR_BYTES, vmcs12->guest_gs_ar_bytes); - vmcs_write32(GUEST_LDTR_AR_BYTES, vmcs12->guest_ldtr_ar_bytes); - vmcs_write32(GUEST_TR_AR_BYTES, vmcs12->guest_tr_ar_bytes); - vmcs_writel(GUEST_ES_BASE, vmcs12->guest_es_base); - vmcs_writel(GUEST_CS_BASE, vmcs12->guest_cs_base); - vmcs_writel(GUEST_SS_BASE, vmcs12->guest_ss_base); - vmcs_writel(GUEST_DS_BASE, vmcs12->guest_ds_base); - vmcs_writel(GUEST_FS_BASE, vmcs12->guest_fs_base); - vmcs_writel(GUEST_GS_BASE, vmcs12->guest_gs_base); - vmcs_writel(GUEST_LDTR_BASE, vmcs12->guest_ldtr_base); - vmcs_writel(GUEST_TR_BASE, vmcs12->guest_tr_base); - vmcs_writel(GUEST_GDTR_BASE, vmcs12->guest_gdtr_base); - vmcs_writel(GUEST_IDTR_BASE, vmcs12->guest_idtr_base); + vmcs_write16(vcpu, GUEST_ES_SELECTOR, vmcs12->guest_es_selector); + vmcs_write16(vcpu, GUEST_CS_SELECTOR, vmcs12->guest_cs_selector); + vmcs_write16(vcpu, GUEST_SS_SELECTOR, vmcs12->guest_ss_selector); + vmcs_write16(vcpu, GUEST_DS_SELECTOR, vmcs12->guest_ds_selector); + vmcs_write16(vcpu, GUEST_FS_SELECTOR, vmcs12->guest_fs_selector); + vmcs_write16(vcpu, GUEST_GS_SELECTOR, vmcs12->guest_gs_selector); + vmcs_write16(vcpu, GUEST_LDTR_SELECTOR, vmcs12->guest_ldtr_selector); + vmcs_write16(vcpu, GUEST_TR_SELECTOR, vmcs12->guest_tr_selector); + vmcs_write32(vcpu, GUEST_ES_LIMIT, vmcs12->guest_es_limit); + vmcs_write32(vcpu, GUEST_CS_LIMIT, vmcs12->guest_cs_limit); + vmcs_write32(vcpu, GUEST_SS_LIMIT, vmcs12->guest_ss_limit); + vmcs_write32(vcpu, GUEST_DS_LIMIT, vmcs12->guest_ds_limit); + vmcs_write32(vcpu, GUEST_FS_LIMIT, vmcs12->guest_fs_limit); + vmcs_write32(vcpu, GUEST_GS_LIMIT, vmcs12->guest_gs_limit); + 
vmcs_write32(vcpu, GUEST_LDTR_LIMIT, vmcs12->guest_ldtr_limit); + vmcs_write32(vcpu, GUEST_TR_LIMIT, vmcs12->guest_tr_limit); + vmcs_write32(vcpu, GUEST_GDTR_LIMIT, vmcs12->guest_gdtr_limit); + vmcs_write32(vcpu, GUEST_IDTR_LIMIT, vmcs12->guest_idtr_limit); + vmcs_write32(vcpu, GUEST_ES_AR_BYTES, vmcs12->guest_es_ar_bytes); + vmcs_write32(vcpu, GUEST_CS_AR_BYTES, vmcs12->guest_cs_ar_bytes); + vmcs_write32(vcpu, GUEST_SS_AR_BYTES, vmcs12->guest_ss_ar_bytes); + vmcs_write32(vcpu, GUEST_DS_AR_BYTES, vmcs12->guest_ds_ar_bytes); + vmcs_write32(vcpu, GUEST_FS_AR_BYTES, vmcs12->guest_fs_ar_bytes); + vmcs_write32(vcpu, GUEST_GS_AR_BYTES, vmcs12->guest_gs_ar_bytes); + vmcs_write32(vcpu, GUEST_LDTR_AR_BYTES, vmcs12->guest_ldtr_ar_bytes); + vmcs_write32(vcpu, GUEST_TR_AR_BYTES, vmcs12->guest_tr_ar_bytes); + vmcs_writel(vcpu, GUEST_ES_BASE, vmcs12->guest_es_base); + vmcs_writel(vcpu, GUEST_CS_BASE, vmcs12->guest_cs_base); + vmcs_writel(vcpu, GUEST_SS_BASE, vmcs12->guest_ss_base); + vmcs_writel(vcpu, GUEST_DS_BASE, vmcs12->guest_ds_base); + vmcs_writel(vcpu, GUEST_FS_BASE, vmcs12->guest_fs_base); + vmcs_writel(vcpu, GUEST_GS_BASE, vmcs12->guest_gs_base); + vmcs_writel(vcpu, GUEST_LDTR_BASE, vmcs12->guest_ldtr_base); + vmcs_writel(vcpu, GUEST_TR_BASE, vmcs12->guest_tr_base); + vmcs_writel(vcpu, GUEST_GDTR_BASE, vmcs12->guest_gdtr_base); + vmcs_writel(vcpu, GUEST_IDTR_BASE, vmcs12->guest_idtr_base); if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) { kvm_set_dr(vcpu, 7, vmcs12->guest_dr7); - vmcs_write64(GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl); + vmcs_write64(vcpu, GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl); } else { kvm_set_dr(vcpu, 7, vcpu->arch.dr7); - vmcs_write64(GUEST_IA32_DEBUGCTL, vmx->nested.vmcs01_debugctl); + vmcs_write64(vcpu, GUEST_IA32_DEBUGCTL, vmx->nested.vmcs01_debugctl); } - vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, + vmcs_write32(vcpu, VM_ENTRY_INTR_INFO_FIELD, vmcs12->vm_entry_intr_info_field); - vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, + vmcs_write32(vcpu, VM_ENTRY_EXCEPTION_ERROR_CODE, vmcs12->vm_entry_exception_error_code); - vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, + vmcs_write32(vcpu, VM_ENTRY_INSTRUCTION_LEN, vmcs12->vm_entry_instruction_len); - vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, + vmcs_write32(vcpu, GUEST_INTERRUPTIBILITY_INFO, vmcs12->guest_interruptibility_info); - vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs); + vmcs_write32(vcpu, GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs); vmx_set_rflags(vcpu, vmcs12->guest_rflags); - vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, + vmcs_writel(vcpu, GUEST_PENDING_DBG_EXCEPTIONS, vmcs12->guest_pending_dbg_exceptions); - vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->guest_sysenter_esp); - vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->guest_sysenter_eip); + vmcs_writel(vcpu, GUEST_SYSENTER_ESP, vmcs12->guest_sysenter_esp); + vmcs_writel(vcpu, GUEST_SYSENTER_EIP, vmcs12->guest_sysenter_eip); if (nested_cpu_has_xsaves(vmcs12)) - vmcs_write64(XSS_EXIT_BITMAP, vmcs12->xss_exit_bitmap); - vmcs_write64(VMCS_LINK_POINTER, -1ull); + vmcs_write64(vcpu, XSS_EXIT_BITMAP, vmcs12->xss_exit_bitmap); + vmcs_write64(vcpu, VMCS_LINK_POINTER, -1ull); exec_control = vmcs12->pin_based_vm_exec_control; - /* Preemption timer setting is only taken from vmcs01. */ - exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER; exec_control |= vmcs_config.pin_based_exec_ctrl; - if (vmx->hv_deadline_tsc == -1) - exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER; - - /* Posted interrupts setting is only taken from vmcs12. 
*/ - if (nested_cpu_has_posted_intr(vmcs12)) { - /* - * Note that we use L0's vector here and in - * vmx_deliver_nested_posted_interrupt. - */ - vmx->nested.posted_intr_nv = vmcs12->posted_intr_nv; - vmx->nested.pi_pending = false; - vmcs_write16(POSTED_INTR_NV, POSTED_INTR_VECTOR); - vmcs_write64(POSTED_INTR_DESC_ADDR, - page_to_phys(vmx->nested.pi_desc_page) + - (unsigned long)(vmcs12->posted_intr_desc_addr & - (PAGE_SIZE - 1))); - } else - exec_control &= ~PIN_BASED_POSTED_INTR; - - vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, exec_control); - vmx->nested.preemption_timer_expired = false; - if (nested_cpu_has_preemption_timer(vmcs12)) - vmx_start_preemption_timer(vcpu); + vmcs_write32(vcpu, PIN_BASED_VM_EXEC_CONTROL, exec_control); /* * Whether page-faults are trapped is determined by a combination of @@ -9905,9 +8154,9 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) * To fix this, we will need to emulate the PFEC checking (on the L1 * page tables), using walk_addr(), when injecting PFs to L1. */ - vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, + vmcs_write32(vcpu, PAGE_FAULT_ERROR_CODE_MASK, enable_ept ? vmcs12->page_fault_error_code_mask : 0); - vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, + vmcs_write32(vcpu, PAGE_FAULT_ERROR_CODE_MATCH, enable_ept ? vmcs12->page_fault_error_code_match : 0); if (cpu_has_secondary_exec_ctrls()) { @@ -9929,12 +8178,12 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) * can never be accessed, this feature won't do * anything anyway. */ - if (!vmx->nested.apic_access_page) + if (!vmx->nested.apic_access_mdl) exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; else - vmcs_write64(APIC_ACCESS_ADDR, - page_to_phys(vmx->nested.apic_access_page)); + vmcs_write64(vcpu, APIC_ACCESS_ADDR, + mdl_to_phys(vmx->nested.apic_access_mdl)); } else if (!(nested_cpu_has_virt_x2apic_mode(vmcs12)) && cpu_need_virtualize_apic_accesses(&vmx->vcpu)) { exec_control |= @@ -9943,19 +8192,19 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) } if (exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) { - vmcs_write64(EOI_EXIT_BITMAP0, + vmcs_write64(vcpu, EOI_EXIT_BITMAP0, vmcs12->eoi_exit_bitmap0); - vmcs_write64(EOI_EXIT_BITMAP1, + vmcs_write64(vcpu, EOI_EXIT_BITMAP1, vmcs12->eoi_exit_bitmap1); - vmcs_write64(EOI_EXIT_BITMAP2, + vmcs_write64(vcpu, EOI_EXIT_BITMAP2, vmcs12->eoi_exit_bitmap2); - vmcs_write64(EOI_EXIT_BITMAP3, + vmcs_write64(vcpu, EOI_EXIT_BITMAP3, vmcs12->eoi_exit_bitmap3); - vmcs_write16(GUEST_INTR_STATUS, + vmcs_write16(vcpu, GUEST_INTR_STATUS, vmcs12->guest_intr_status); } - vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); + vmcs_write32(vcpu, SECONDARY_VM_EXEC_CONTROL, exec_control); } @@ -9983,9 +8232,9 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) exec_control |= vmcs12->cpu_based_vm_exec_control; if (exec_control & CPU_BASED_TPR_SHADOW) { - vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, - page_to_phys(vmx->nested.virtual_apic_page)); - vmcs_write32(TPR_THRESHOLD, vmcs12->tpr_threshold); + vmcs_write64(vcpu, VIRTUAL_APIC_PAGE_ADDR, + mdl_to_phys(vmx->nested.virtual_apic_mdl)); + vmcs_write32(vcpu, TPR_THRESHOLD, vmcs12->tpr_threshold); } if (cpu_has_vmx_msr_bitmap() && @@ -10002,7 +8251,7 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) exec_control &= ~CPU_BASED_USE_IO_BITMAPS; exec_control |= CPU_BASED_UNCOND_IO_EXITING; - vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, exec_control); + vmcs_write32(vcpu, CPU_BASED_VM_EXEC_CONTROL, 
exec_control); /* EXCEPTION_BITMAP and CR0_GUEST_HOST_MASK should basically be the * bitwise-or of what L1 wants to trap for L2, and what we want to @@ -10010,13 +8259,13 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) */ update_exception_bitmap(vcpu); vcpu->arch.cr0_guest_owned_bits &= ~vmcs12->cr0_guest_host_mask; - vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits); + vmcs_writel(vcpu, CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits); /* L2->L1 exit controls are emulated - the hardware exit is to L0 so * we should use its exit controls. Note that VM_EXIT_LOAD_IA32_EFER * bits are further modified by vmx_set_efer() below. */ - vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl); + vmcs_write32(vcpu, VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl); /* vmcs12's VM_ENTRY_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE are * emulated by vmx_set_efer(), below. @@ -10027,24 +8276,22 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) (vmcs_config.vmentry_ctrl & ~VM_ENTRY_IA32E_MODE)); if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT) { - vmcs_write64(GUEST_IA32_PAT, vmcs12->guest_ia32_pat); + vmcs_write64(vcpu, GUEST_IA32_PAT, vmcs12->guest_ia32_pat); vcpu->arch.pat = vmcs12->guest_ia32_pat; } else if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) - vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat); + vmcs_write64(vcpu, GUEST_IA32_PAT, vmx->vcpu.arch.pat); set_cr4_guest_host_mask(vmx); if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS) - vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs); + vmcs_write64(vcpu, GUEST_BNDCFGS, vmcs12->guest_bndcfgs); if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING) - vmcs_write64(TSC_OFFSET, + vmcs_write64(vcpu, TSC_OFFSET, vcpu->arch.tsc_offset + vmcs12->tsc_offset); else - vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset); - if (kvm_has_tsc_control) - decache_tsc_multiplier(vmx); + vmcs_write64(vcpu, TSC_OFFSET, vcpu->arch.tsc_offset); if (enable_vpid) { /* @@ -10056,13 +8303,13 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) * even if spawn a lot of nested vCPUs. */ if (nested_cpu_has_vpid(vmcs12) && vmx->nested.vpid02) { - vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->nested.vpid02); + vmcs_write16(vcpu, VIRTUAL_PROCESSOR_ID, vmx->nested.vpid02); if (vmcs12->virtual_processor_id != vmx->nested.last_vpid) { vmx->nested.last_vpid = vmcs12->virtual_processor_id; __vmx_flush_tlb(vcpu, to_vmx(vcpu)->nested.vpid02); } } else { - vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid); + vmcs_write16(vcpu, VIRTUAL_PROCESSOR_ID, vmx->vpid); vmx_flush_tlb(vcpu); } @@ -10091,10 +8338,10 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) * have more bits than L1 expected. 
*/ vmx_set_cr0(vcpu, vmcs12->guest_cr0); - vmcs_writel(CR0_READ_SHADOW, nested_read_cr0(vmcs12)); + vmcs_writel(vcpu, CR0_READ_SHADOW, nested_read_cr0(vmcs12)); vmx_set_cr4(vcpu, vmcs12->guest_cr4); - vmcs_writel(CR4_READ_SHADOW, nested_read_cr4(vmcs12)); + vmcs_writel(vcpu, CR4_READ_SHADOW, nested_read_cr4(vmcs12)); /* shadow page tables on either EPT or shadow page tables */ kvm_set_cr3(vcpu, vmcs12->guest_cr3); @@ -10107,10 +8354,10 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) * L1 may access the L2's PDPTR, so save them to construct vmcs12 */ if (enable_ept) { - vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0); - vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1); - vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2); - vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3); + vmcs_write64(vcpu, GUEST_PDPTR0, vmcs12->guest_pdptr0); + vmcs_write64(vcpu, GUEST_PDPTR1, vmcs12->guest_pdptr1); + vmcs_write64(vcpu, GUEST_PDPTR2, vmcs12->guest_pdptr2); + vmcs_write64(vcpu, GUEST_PDPTR3, vmcs12->guest_pdptr3); } kvm_register_write(vcpu, VCPU_REGS_RSP, vmcs12->guest_rsp); @@ -10273,7 +8520,7 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) enter_guest_mode(vcpu); if (!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) - vmx->nested.vmcs01_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL); + vmx->nested.vmcs01_debugctl = vmcs_read64(vcpu, GUEST_IA32_DEBUGCTL); cpu = get_cpu(); vmx->loaded_vmcs = vmcs02; @@ -10330,23 +8577,23 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) * didn't necessarily allow them to be changed in GUEST_CR0 - and rather * put them in vmcs02 CR0_READ_SHADOW. So take these bits from there. */ -static inline unsigned long +static inline size_t vmcs12_guest_cr0(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { return - /*1*/ (vmcs_readl(GUEST_CR0) & vcpu->arch.cr0_guest_owned_bits) | + /*1*/ (vmcs_readl(vcpu, GUEST_CR0) & vcpu->arch.cr0_guest_owned_bits) | /*2*/ (vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask) | - /*3*/ (vmcs_readl(CR0_READ_SHADOW) & ~(vmcs12->cr0_guest_host_mask | + /*3*/ (vmcs_readl(vcpu, CR0_READ_SHADOW) & ~(vmcs12->cr0_guest_host_mask | vcpu->arch.cr0_guest_owned_bits)); } -static inline unsigned long +static inline size_t vmcs12_guest_cr4(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { return - /*1*/ (vmcs_readl(GUEST_CR4) & vcpu->arch.cr4_guest_owned_bits) | + /*1*/ (vmcs_readl(vcpu, GUEST_CR4) & vcpu->arch.cr4_guest_owned_bits) | /*2*/ (vmcs12->guest_cr4 & vmcs12->cr4_guest_host_mask) | - /*3*/ (vmcs_readl(CR4_READ_SHADOW) & ~(vmcs12->cr4_guest_host_mask | + /*3*/ (vmcs_readl(vcpu, CR4_READ_SHADOW) & ~(vmcs12->cr4_guest_host_mask | vcpu->arch.cr4_guest_owned_bits)); } @@ -10396,14 +8643,6 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu, bool external_intr) { struct vcpu_vmx *vmx = to_vmx(vcpu); - if (nested_cpu_has_preemption_timer(get_vmcs12(vcpu)) && - vmx->nested.preemption_timer_expired) { - if (vmx->nested.nested_run_pending) - return -EBUSY; - nested_vmx_vmexit(vcpu, EXIT_REASON_PREEMPTION_TIMER, 0, 0); - return 0; - } - if (vcpu->arch.nmi_pending && nested_exit_on_nmi(vcpu)) { if (vmx->nested.nested_run_pending || vcpu->arch.interrupt.pending) @@ -10428,21 +8667,7 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu, bool external_intr) return 0; } - return vmx_complete_nested_posted_interrupt(vcpu); -} - -static u32 vmx_get_preemption_timer_value(struct kvm_vcpu *vcpu) -{ - ktime_t remaining = - hrtimer_get_remaining(&to_vmx(vcpu)->nested.preemption_timer); - u64 value; - 
- if (ktime_to_ns(remaining) <= 0) - return 0; - - value = ktime_to_ns(remaining) * vcpu->arch.virtual_tsc_khz; - do_div(value, 1000000); - return value >> VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE; + return 0; } /* @@ -10458,7 +8683,7 @@ static u32 vmx_get_preemption_timer_value(struct kvm_vcpu *vcpu) */ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, u32 exit_reason, u32 exit_intr_info, - unsigned long exit_qualification) + size_t exit_qualification) { /* update guest state fields: */ vmcs12->guest_cr0 = vmcs12_guest_cr0(vcpu, vmcs12); @@ -10466,62 +8691,54 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, vmcs12->guest_rsp = kvm_register_read(vcpu, VCPU_REGS_RSP); vmcs12->guest_rip = kvm_register_read(vcpu, VCPU_REGS_RIP); - vmcs12->guest_rflags = vmcs_readl(GUEST_RFLAGS); - - vmcs12->guest_es_selector = vmcs_read16(GUEST_ES_SELECTOR); - vmcs12->guest_cs_selector = vmcs_read16(GUEST_CS_SELECTOR); - vmcs12->guest_ss_selector = vmcs_read16(GUEST_SS_SELECTOR); - vmcs12->guest_ds_selector = vmcs_read16(GUEST_DS_SELECTOR); - vmcs12->guest_fs_selector = vmcs_read16(GUEST_FS_SELECTOR); - vmcs12->guest_gs_selector = vmcs_read16(GUEST_GS_SELECTOR); - vmcs12->guest_ldtr_selector = vmcs_read16(GUEST_LDTR_SELECTOR); - vmcs12->guest_tr_selector = vmcs_read16(GUEST_TR_SELECTOR); - vmcs12->guest_es_limit = vmcs_read32(GUEST_ES_LIMIT); - vmcs12->guest_cs_limit = vmcs_read32(GUEST_CS_LIMIT); - vmcs12->guest_ss_limit = vmcs_read32(GUEST_SS_LIMIT); - vmcs12->guest_ds_limit = vmcs_read32(GUEST_DS_LIMIT); - vmcs12->guest_fs_limit = vmcs_read32(GUEST_FS_LIMIT); - vmcs12->guest_gs_limit = vmcs_read32(GUEST_GS_LIMIT); - vmcs12->guest_ldtr_limit = vmcs_read32(GUEST_LDTR_LIMIT); - vmcs12->guest_tr_limit = vmcs_read32(GUEST_TR_LIMIT); - vmcs12->guest_gdtr_limit = vmcs_read32(GUEST_GDTR_LIMIT); - vmcs12->guest_idtr_limit = vmcs_read32(GUEST_IDTR_LIMIT); - vmcs12->guest_es_ar_bytes = vmcs_read32(GUEST_ES_AR_BYTES); - vmcs12->guest_cs_ar_bytes = vmcs_read32(GUEST_CS_AR_BYTES); - vmcs12->guest_ss_ar_bytes = vmcs_read32(GUEST_SS_AR_BYTES); - vmcs12->guest_ds_ar_bytes = vmcs_read32(GUEST_DS_AR_BYTES); - vmcs12->guest_fs_ar_bytes = vmcs_read32(GUEST_FS_AR_BYTES); - vmcs12->guest_gs_ar_bytes = vmcs_read32(GUEST_GS_AR_BYTES); - vmcs12->guest_ldtr_ar_bytes = vmcs_read32(GUEST_LDTR_AR_BYTES); - vmcs12->guest_tr_ar_bytes = vmcs_read32(GUEST_TR_AR_BYTES); - vmcs12->guest_es_base = vmcs_readl(GUEST_ES_BASE); - vmcs12->guest_cs_base = vmcs_readl(GUEST_CS_BASE); - vmcs12->guest_ss_base = vmcs_readl(GUEST_SS_BASE); - vmcs12->guest_ds_base = vmcs_readl(GUEST_DS_BASE); - vmcs12->guest_fs_base = vmcs_readl(GUEST_FS_BASE); - vmcs12->guest_gs_base = vmcs_readl(GUEST_GS_BASE); - vmcs12->guest_ldtr_base = vmcs_readl(GUEST_LDTR_BASE); - vmcs12->guest_tr_base = vmcs_readl(GUEST_TR_BASE); - vmcs12->guest_gdtr_base = vmcs_readl(GUEST_GDTR_BASE); - vmcs12->guest_idtr_base = vmcs_readl(GUEST_IDTR_BASE); + vmcs12->guest_rflags = vmcs_readl(vcpu, GUEST_RFLAGS); + + vmcs12->guest_es_selector = vmcs_read16(vcpu, GUEST_ES_SELECTOR); + vmcs12->guest_cs_selector = vmcs_read16(vcpu, GUEST_CS_SELECTOR); + vmcs12->guest_ss_selector = vmcs_read16(vcpu, GUEST_SS_SELECTOR); + vmcs12->guest_ds_selector = vmcs_read16(vcpu, GUEST_DS_SELECTOR); + vmcs12->guest_fs_selector = vmcs_read16(vcpu, GUEST_FS_SELECTOR); + vmcs12->guest_gs_selector = vmcs_read16(vcpu, GUEST_GS_SELECTOR); + vmcs12->guest_ldtr_selector = vmcs_read16(vcpu, GUEST_LDTR_SELECTOR); + vmcs12->guest_tr_selector = vmcs_read16(vcpu, 
GUEST_TR_SELECTOR); + vmcs12->guest_es_limit = vmcs_read32(vcpu, GUEST_ES_LIMIT); + vmcs12->guest_cs_limit = vmcs_read32(vcpu, GUEST_CS_LIMIT); + vmcs12->guest_ss_limit = vmcs_read32(vcpu, GUEST_SS_LIMIT); + vmcs12->guest_ds_limit = vmcs_read32(vcpu, GUEST_DS_LIMIT); + vmcs12->guest_fs_limit = vmcs_read32(vcpu, GUEST_FS_LIMIT); + vmcs12->guest_gs_limit = vmcs_read32(vcpu, GUEST_GS_LIMIT); + vmcs12->guest_ldtr_limit = vmcs_read32(vcpu, GUEST_LDTR_LIMIT); + vmcs12->guest_tr_limit = vmcs_read32(vcpu, GUEST_TR_LIMIT); + vmcs12->guest_gdtr_limit = vmcs_read32(vcpu, GUEST_GDTR_LIMIT); + vmcs12->guest_idtr_limit = vmcs_read32(vcpu, GUEST_IDTR_LIMIT); + vmcs12->guest_es_ar_bytes = vmcs_read32(vcpu, GUEST_ES_AR_BYTES); + vmcs12->guest_cs_ar_bytes = vmcs_read32(vcpu, GUEST_CS_AR_BYTES); + vmcs12->guest_ss_ar_bytes = vmcs_read32(vcpu, GUEST_SS_AR_BYTES); + vmcs12->guest_ds_ar_bytes = vmcs_read32(vcpu, GUEST_DS_AR_BYTES); + vmcs12->guest_fs_ar_bytes = vmcs_read32(vcpu, GUEST_FS_AR_BYTES); + vmcs12->guest_gs_ar_bytes = vmcs_read32(vcpu, GUEST_GS_AR_BYTES); + vmcs12->guest_ldtr_ar_bytes = vmcs_read32(vcpu, GUEST_LDTR_AR_BYTES); + vmcs12->guest_tr_ar_bytes = vmcs_read32(vcpu, GUEST_TR_AR_BYTES); + vmcs12->guest_es_base = vmcs_readl(vcpu, GUEST_ES_BASE); + vmcs12->guest_cs_base = vmcs_readl(vcpu, GUEST_CS_BASE); + vmcs12->guest_ss_base = vmcs_readl(vcpu, GUEST_SS_BASE); + vmcs12->guest_ds_base = vmcs_readl(vcpu, GUEST_DS_BASE); + vmcs12->guest_fs_base = vmcs_readl(vcpu, GUEST_FS_BASE); + vmcs12->guest_gs_base = vmcs_readl(vcpu, GUEST_GS_BASE); + vmcs12->guest_ldtr_base = vmcs_readl(vcpu, GUEST_LDTR_BASE); + vmcs12->guest_tr_base = vmcs_readl(vcpu, GUEST_TR_BASE); + vmcs12->guest_gdtr_base = vmcs_readl(vcpu, GUEST_GDTR_BASE); + vmcs12->guest_idtr_base = vmcs_readl(vcpu, GUEST_IDTR_BASE); vmcs12->guest_interruptibility_info = - vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); + vmcs_read32(vcpu, GUEST_INTERRUPTIBILITY_INFO); vmcs12->guest_pending_dbg_exceptions = - vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS); - if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED) + vmcs_readl(vcpu, GUEST_PENDING_DBG_EXCEPTIONS); + if (vcpu->arch.mp_state == GVM_MP_STATE_HALTED) vmcs12->guest_activity_state = GUEST_ACTIVITY_HLT; else vmcs12->guest_activity_state = GUEST_ACTIVITY_ACTIVE; - if (nested_cpu_has_preemption_timer(vmcs12)) { - if (vmcs12->vm_exit_controls & - VM_EXIT_SAVE_VMX_PREEMPTION_TIMER) - vmcs12->vmx_preemption_timer_value = - vmx_get_preemption_timer_value(vcpu); - hrtimer_cancel(&to_vmx(vcpu)->nested.preemption_timer); - } - /* * In some cases (usually, nested EPT), L2 is allowed to change its * own CR3 without exiting. If it has changed it, we must keep it. @@ -10531,41 +8748,41 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, * Additionally, restore L2's PDPTR to vmcs12. 
*/ if (enable_ept) { - vmcs12->guest_cr3 = vmcs_readl(GUEST_CR3); - vmcs12->guest_pdptr0 = vmcs_read64(GUEST_PDPTR0); - vmcs12->guest_pdptr1 = vmcs_read64(GUEST_PDPTR1); - vmcs12->guest_pdptr2 = vmcs_read64(GUEST_PDPTR2); - vmcs12->guest_pdptr3 = vmcs_read64(GUEST_PDPTR3); + vmcs12->guest_cr3 = vmcs_readl(vcpu, GUEST_CR3); + vmcs12->guest_pdptr0 = vmcs_read64(vcpu, GUEST_PDPTR0); + vmcs12->guest_pdptr1 = vmcs_read64(vcpu, GUEST_PDPTR1); + vmcs12->guest_pdptr2 = vmcs_read64(vcpu, GUEST_PDPTR2); + vmcs12->guest_pdptr3 = vmcs_read64(vcpu, GUEST_PDPTR3); } if (nested_cpu_has_ept(vmcs12)) - vmcs12->guest_linear_address = vmcs_readl(GUEST_LINEAR_ADDRESS); + vmcs12->guest_linear_address = vmcs_readl(vcpu, GUEST_LINEAR_ADDRESS); if (nested_cpu_has_vid(vmcs12)) - vmcs12->guest_intr_status = vmcs_read16(GUEST_INTR_STATUS); + vmcs12->guest_intr_status = vmcs_read16(vcpu, GUEST_INTR_STATUS); vmcs12->vm_entry_controls = (vmcs12->vm_entry_controls & ~VM_ENTRY_IA32E_MODE) | (vm_entry_controls_get(to_vmx(vcpu)) & VM_ENTRY_IA32E_MODE); if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_DEBUG_CONTROLS) { - kvm_get_dr(vcpu, 7, (unsigned long *)&vmcs12->guest_dr7); - vmcs12->guest_ia32_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL); + kvm_get_dr(vcpu, 7, (size_t *)&vmcs12->guest_dr7); + vmcs12->guest_ia32_debugctl = vmcs_read64(vcpu, GUEST_IA32_DEBUGCTL); } /* TODO: These cannot have changed unless we have MSR bitmaps and * the relevant bit asks not to trap the change */ if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_PAT) - vmcs12->guest_ia32_pat = vmcs_read64(GUEST_IA32_PAT); + vmcs12->guest_ia32_pat = vmcs_read64(vcpu, GUEST_IA32_PAT); if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_EFER) vmcs12->guest_ia32_efer = vcpu->arch.efer; - vmcs12->guest_sysenter_cs = vmcs_read32(GUEST_SYSENTER_CS); - vmcs12->guest_sysenter_esp = vmcs_readl(GUEST_SYSENTER_ESP); - vmcs12->guest_sysenter_eip = vmcs_readl(GUEST_SYSENTER_EIP); + vmcs12->guest_sysenter_cs = vmcs_read32(vcpu, GUEST_SYSENTER_CS); + vmcs12->guest_sysenter_esp = vmcs_readl(vcpu, GUEST_SYSENTER_ESP); + vmcs12->guest_sysenter_eip = vmcs_readl(vcpu, GUEST_SYSENTER_EIP); if (kvm_mpx_supported()) - vmcs12->guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS); + vmcs12->guest_bndcfgs = vmcs_read64(vcpu, GUEST_BNDCFGS); if (nested_cpu_has_xsaves(vmcs12)) - vmcs12->xss_exit_bitmap = vmcs_read64(XSS_EXIT_BITMAP); + vmcs12->xss_exit_bitmap = vmcs_read64(vcpu, XSS_EXIT_BITMAP); /* update exit information fields: */ @@ -10577,10 +8794,10 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, (INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK)) == (INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK)) vmcs12->vm_exit_intr_error_code = - vmcs_read32(VM_EXIT_INTR_ERROR_CODE); + vmcs_read32(vcpu, VM_EXIT_INTR_ERROR_CODE); vmcs12->idt_vectoring_info_field = 0; - vmcs12->vm_exit_instruction_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN); - vmcs12->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); + vmcs12->vm_exit_instruction_len = vmcs_read32(vcpu, VM_EXIT_INSTRUCTION_LEN); + vmcs12->vmx_instruction_info = vmcs_read32(vcpu, VMX_INSTRUCTION_INFO); if (!(vmcs12->vm_exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) { /* vm_entry_intr_info_field is cleared on exit. Emulate this @@ -10641,14 +8858,14 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, * but we also need to update cr0_guest_host_mask and exception_bitmap. */ update_exception_bitmap(vcpu); - vcpu->arch.cr0_guest_owned_bits = (vcpu->fpu_active ? 
X86_CR0_TS : 0); - vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits); + vcpu->arch.cr0_guest_owned_bits = X86_CR0_TS; + vmcs_writel(vcpu, CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits); /* * Note that CR4_GUEST_HOST_MASK is already set in the original vmcs01 - * (KVM doesn't change it)- no reason to call set_cr4_guest_host_mask(); + * (kvm doesn't change it)- no reason to call set_cr4_guest_host_mask(); */ - vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK); + vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(vcpu, CR4_GUEST_HOST_MASK); kvm_set_cr4(vcpu, vmcs12->host_cr4); nested_ept_uninit_mmu_context(vcpu); @@ -10669,22 +8886,22 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, } - vmcs_write32(GUEST_SYSENTER_CS, vmcs12->host_ia32_sysenter_cs); - vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->host_ia32_sysenter_esp); - vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->host_ia32_sysenter_eip); - vmcs_writel(GUEST_IDTR_BASE, vmcs12->host_idtr_base); - vmcs_writel(GUEST_GDTR_BASE, vmcs12->host_gdtr_base); + vmcs_write32(vcpu, GUEST_SYSENTER_CS, vmcs12->host_ia32_sysenter_cs); + vmcs_writel(vcpu, GUEST_SYSENTER_ESP, vmcs12->host_ia32_sysenter_esp); + vmcs_writel(vcpu, GUEST_SYSENTER_EIP, vmcs12->host_ia32_sysenter_eip); + vmcs_writel(vcpu, GUEST_IDTR_BASE, vmcs12->host_idtr_base); + vmcs_writel(vcpu, GUEST_GDTR_BASE, vmcs12->host_gdtr_base); /* If not VM_EXIT_CLEAR_BNDCFGS, the L2 value propagates to L1. */ if (vmcs12->vm_exit_controls & VM_EXIT_CLEAR_BNDCFGS) - vmcs_write64(GUEST_BNDCFGS, 0); + vmcs_write64(vcpu, GUEST_BNDCFGS, 0); if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) { - vmcs_write64(GUEST_IA32_PAT, vmcs12->host_ia32_pat); + vmcs_write64(vcpu, GUEST_IA32_PAT, vmcs12->host_ia32_pat); vcpu->arch.pat = vmcs12->host_ia32_pat; } if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) - vmcs_write64(GUEST_IA32_PERF_GLOBAL_CTRL, + vmcs_write64(vcpu, GUEST_IA32_PERF_GLOBAL_CTRL, vmcs12->host_ia32_perf_global_ctrl); /* Set L1 segment info according to Intel SDM @@ -10734,7 +8951,7 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, vmx_set_segment(vcpu, &seg, VCPU_SREG_TR); kvm_set_dr(vcpu, 7, 0x400); - vmcs_write64(GUEST_IA32_DEBUGCTL, 0); + vmcs_write64(vcpu, GUEST_IA32_DEBUGCTL, 0); if (cpu_has_vmx_msr_bitmap()) vmx_set_msr_bitmap(vcpu); @@ -10751,7 +8968,7 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, */ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, u32 exit_intr_info, - unsigned long exit_qualification) + size_t exit_qualification) { struct vcpu_vmx *vmx = to_vmx(vcpu); struct vmcs12 *vmcs12 = get_vmcs12(vcpu); @@ -10777,13 +8994,6 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR; } - trace_kvm_nested_vmexit_inject(vmcs12->vm_exit_reason, - vmcs12->exit_qualification, - vmcs12->idt_vectoring_info_field, - vmcs12->vm_exit_intr_info, - vmcs12->vm_exit_intr_error_code, - KVM_ISA_VMX); - vm_entry_controls_reset_shadow(vmx); vm_exit_controls_reset_shadow(vmx); vmx_segment_cache_clear(vmx); @@ -10795,15 +9005,7 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, load_vmcs12_host_state(vcpu, vmcs12); /* Update any VMCS fields that might have changed while L2 ran */ - vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset); - if (vmx->hv_deadline_tsc == -1) - vmcs_clear_bits(PIN_BASED_VM_EXEC_CONTROL, - PIN_BASED_VMX_PREEMPTION_TIMER); - else - vmcs_set_bits(PIN_BASED_VM_EXEC_CONTROL, - 
PIN_BASED_VMX_PREEMPTION_TIMER); - if (kvm_has_tsc_control) - decache_tsc_multiplier(vmx); + vmcs_write64(vcpu, TSC_OFFSET, vcpu->arch.tsc_offset); if (vmx->nested.change_vmcs01_virtual_x2apic_mode) { vmx->nested.change_vmcs01_virtual_x2apic_mode = false; @@ -10815,26 +9017,20 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, vmx->host_rsp = 0; /* Unpin physical memory we referred to in vmcs02 */ - if (vmx->nested.apic_access_page) { - nested_release_page(vmx->nested.apic_access_page); - vmx->nested.apic_access_page = NULL; - } - if (vmx->nested.virtual_apic_page) { - nested_release_page(vmx->nested.virtual_apic_page); - vmx->nested.virtual_apic_page = NULL; + if (vmx->nested.apic_access_mdl) { + nested_release_page(vmx->nested.apic_access_mdl); + vmx->nested.apic_access_mdl = NULL; } - if (vmx->nested.pi_desc_page) { - kunmap(vmx->nested.pi_desc_page); - nested_release_page(vmx->nested.pi_desc_page); - vmx->nested.pi_desc_page = NULL; - vmx->nested.pi_desc = NULL; + if (vmx->nested.virtual_apic_mdl) { + nested_release_page(vmx->nested.virtual_apic_mdl); + vmx->nested.virtual_apic_mdl = NULL; } /* * We are now running in L2, mmu_notifier will force to reload the * page's hpa for L2 vmcs. Need to reload it for L1 before entering L1. */ - kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu); + kvm_make_request(GVM_REQ_APIC_PAGE_RELOAD, vcpu); /* * Exiting from L2 to L1, we're now back to L1 which thinks it just @@ -10843,14 +9039,14 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, */ if (unlikely(vmx->fail)) { vmx->fail = 0; - nested_vmx_failValid(vcpu, vmcs_read32(VM_INSTRUCTION_ERROR)); + nested_vmx_failValid(vcpu, vmcs_read32(vcpu, VM_INSTRUCTION_ERROR)); } else nested_vmx_succeed(vcpu); if (enable_shadow_vmcs) vmx->nested.sync_shadow_vmcs = true; /* in case we halted in L2 */ - vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; + vcpu->arch.mp_state = GVM_MP_STATE_RUNNABLE; } /* @@ -10872,7 +9068,7 @@ static void vmx_leave_nested(struct kvm_vcpu *vcpu) */ static void nested_vmx_entry_failure(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, - u32 reason, unsigned long qualification) + u32 reason, size_t qualification) { load_vmcs12_host_state(vcpu, vmcs12); vmcs12->vm_exit_reason = reason | VMX_EXIT_REASONS_FAILED_VMENTRY; @@ -10889,75 +9085,10 @@ static int vmx_check_intercept(struct kvm_vcpu *vcpu, return X86EMUL_CONTINUE; } -#ifdef CONFIG_X86_64 -/* (a << shift) / divisor, return 1 if overflow otherwise 0 */ -static inline int u64_shl_div_u64(u64 a, unsigned int shift, - u64 divisor, u64 *result) -{ - u64 low = a << shift, high = a >> (64 - shift); - - /* To avoid the overflow on divq */ - if (high >= divisor) - return 1; - - /* Low hold the result, high hold rem which is discarded */ - asm("divq %2\n\t" : "=a" (low), "=d" (high) : - "rm" (divisor), "0" (low), "1" (high)); - *result = low; - - return 0; -} - -static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - u64 tscl = rdtsc(); - u64 guest_tscl = kvm_read_l1_tsc(vcpu, tscl); - u64 delta_tsc = max(guest_deadline_tsc, guest_tscl) - guest_tscl; - - /* Convert to host delta tsc if tsc scaling is enabled */ - if (vcpu->arch.tsc_scaling_ratio != kvm_default_tsc_scaling_ratio && - u64_shl_div_u64(delta_tsc, - kvm_tsc_scaling_ratio_frac_bits, - vcpu->arch.tsc_scaling_ratio, - &delta_tsc)) - return -ERANGE; - - /* - * If the delta tsc can't fit in the 32 bit after the multi shift, - * we can't use the preemption timer. 
- * It's possible that it fits on later vmentries, but checking - * on every vmentry is costly so we just use an hrtimer. - */ - if (delta_tsc >> (cpu_preemption_timer_multi + 32)) - return -ERANGE; - - vmx->hv_deadline_tsc = tscl + delta_tsc; - vmcs_set_bits(PIN_BASED_VM_EXEC_CONTROL, - PIN_BASED_VMX_PREEMPTION_TIMER); - return 0; -} - -static void vmx_cancel_hv_timer(struct kvm_vcpu *vcpu) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - vmx->hv_deadline_tsc = -1; - vmcs_clear_bits(PIN_BASED_VM_EXEC_CONTROL, - PIN_BASED_VMX_PREEMPTION_TIMER); -} -#endif - -static void vmx_sched_in(struct kvm_vcpu *vcpu, int cpu) -{ - if (ple_gap) - shrink_ple_window(vcpu); -} - static void vmx_slot_enable_log_dirty(struct kvm *kvm, struct kvm_memory_slot *slot) { kvm_mmu_slot_leaf_clear_dirty(kvm, slot); - kvm_mmu_slot_largepage_remove_write_access(kvm, slot); } static void vmx_slot_disable_log_dirty(struct kvm *kvm, @@ -10968,257 +9099,17 @@ static void vmx_slot_disable_log_dirty(struct kvm *kvm, static void vmx_flush_log_dirty(struct kvm *kvm) { - kvm_flush_pml_buffers(kvm); + //kvm_flush_pml_buffers(kvm); } static void vmx_enable_log_dirty_pt_masked(struct kvm *kvm, struct kvm_memory_slot *memslot, - gfn_t offset, unsigned long mask) + gfn_t offset, size_t mask) { kvm_mmu_clear_dirty_pt_masked(kvm, memslot, offset, mask); } -/* - * This routine does the following things for vCPU which is going - * to be blocked if VT-d PI is enabled. - * - Store the vCPU to the wakeup list, so when interrupts happen - * we can find the right vCPU to wake up. - * - Change the Posted-interrupt descriptor as below: - * 'NDST' <-- vcpu->pre_pcpu - * 'NV' <-- POSTED_INTR_WAKEUP_VECTOR - * - If 'ON' is set during this process, which means at least one - * interrupt is posted for this vCPU, we cannot block it, in - * this case, return 1, otherwise, return 0. - * - */ -static int pi_pre_block(struct kvm_vcpu *vcpu) -{ - unsigned long flags; - unsigned int dest; - struct pi_desc old, new; - struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); - - if (!kvm_arch_has_assigned_device(vcpu->kvm) || - !irq_remapping_cap(IRQ_POSTING_CAP) || - !kvm_vcpu_apicv_active(vcpu)) - return 0; - - vcpu->pre_pcpu = vcpu->cpu; - spin_lock_irqsave(&per_cpu(blocked_vcpu_on_cpu_lock, - vcpu->pre_pcpu), flags); - list_add_tail(&vcpu->blocked_vcpu_list, - &per_cpu(blocked_vcpu_on_cpu, - vcpu->pre_pcpu)); - spin_unlock_irqrestore(&per_cpu(blocked_vcpu_on_cpu_lock, - vcpu->pre_pcpu), flags); - - do { - old.control = new.control = pi_desc->control; - - /* - * We should not block the vCPU if - * an interrupt is posted for it. - */ - if (pi_test_on(pi_desc) == 1) { - spin_lock_irqsave(&per_cpu(blocked_vcpu_on_cpu_lock, - vcpu->pre_pcpu), flags); - list_del(&vcpu->blocked_vcpu_list); - spin_unlock_irqrestore( - &per_cpu(blocked_vcpu_on_cpu_lock, - vcpu->pre_pcpu), flags); - vcpu->pre_pcpu = -1; - - return 1; - } - - WARN((pi_desc->sn == 1), - "Warning: SN field of posted-interrupts " - "is set before blocking\n"); - - /* - * Since vCPU can be preempted during this process, - * vcpu->cpu could be different with pre_pcpu, we - * need to set pre_pcpu as the destination of wakeup - * notification event, then we can find the right vCPU - * to wakeup in wakeup handler if interrupts happen - * when the vCPU is in blocked state. 
- */ - dest = cpu_physical_id(vcpu->pre_pcpu); - - if (x2apic_enabled()) - new.ndst = dest; - else - new.ndst = (dest << 8) & 0xFF00; - - /* set 'NV' to 'wakeup vector' */ - new.nv = POSTED_INTR_WAKEUP_VECTOR; - } while (cmpxchg(&pi_desc->control, old.control, - new.control) != old.control); - - return 0; -} - -static int vmx_pre_block(struct kvm_vcpu *vcpu) -{ - if (pi_pre_block(vcpu)) - return 1; - - if (kvm_lapic_hv_timer_in_use(vcpu)) - kvm_lapic_switch_to_sw_timer(vcpu); - - return 0; -} - -static void pi_post_block(struct kvm_vcpu *vcpu) -{ - struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); - struct pi_desc old, new; - unsigned int dest; - unsigned long flags; - - if (!kvm_arch_has_assigned_device(vcpu->kvm) || - !irq_remapping_cap(IRQ_POSTING_CAP) || - !kvm_vcpu_apicv_active(vcpu)) - return; - - do { - old.control = new.control = pi_desc->control; - - dest = cpu_physical_id(vcpu->cpu); - - if (x2apic_enabled()) - new.ndst = dest; - else - new.ndst = (dest << 8) & 0xFF00; - - /* Allow posting non-urgent interrupts */ - new.sn = 0; - - /* set 'NV' to 'notification vector' */ - new.nv = POSTED_INTR_VECTOR; - } while (cmpxchg(&pi_desc->control, old.control, - new.control) != old.control); - - if(vcpu->pre_pcpu != -1) { - spin_lock_irqsave( - &per_cpu(blocked_vcpu_on_cpu_lock, - vcpu->pre_pcpu), flags); - list_del(&vcpu->blocked_vcpu_list); - spin_unlock_irqrestore( - &per_cpu(blocked_vcpu_on_cpu_lock, - vcpu->pre_pcpu), flags); - vcpu->pre_pcpu = -1; - } -} - -static void vmx_post_block(struct kvm_vcpu *vcpu) -{ - if (kvm_x86_ops->set_hv_timer) - kvm_lapic_switch_to_hv_timer(vcpu); - - pi_post_block(vcpu); -} - -/* - * vmx_update_pi_irte - set IRTE for Posted-Interrupts - * - * @kvm: kvm - * @host_irq: host irq of the interrupt - * @guest_irq: gsi of the interrupt - * @set: set or unset PI - * returns 0 on success, < 0 on failure - */ -static int vmx_update_pi_irte(struct kvm *kvm, unsigned int host_irq, - uint32_t guest_irq, bool set) -{ - struct kvm_kernel_irq_routing_entry *e; - struct kvm_irq_routing_table *irq_rt; - struct kvm_lapic_irq irq; - struct kvm_vcpu *vcpu; - struct vcpu_data vcpu_info; - int idx, ret = -EINVAL; - - if (!kvm_arch_has_assigned_device(kvm) || - !irq_remapping_cap(IRQ_POSTING_CAP) || - !kvm_vcpu_apicv_active(kvm->vcpus[0])) - return 0; - - idx = srcu_read_lock(&kvm->irq_srcu); - irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu); - BUG_ON(guest_irq >= irq_rt->nr_rt_entries); - - hlist_for_each_entry(e, &irq_rt->map[guest_irq], link) { - if (e->type != KVM_IRQ_ROUTING_MSI) - continue; - /* - * VT-d PI cannot support posting multicast/broadcast - * interrupts to a vCPU, we still use interrupt remapping - * for these kind of interrupts. - * - * For lowest-priority interrupts, we only support - * those with single CPU as the destination, e.g. user - * configures the interrupts via /proc/irq or uses - * irqbalance to make the interrupts single-CPU. - * - * We will support full lowest-priority interrupt later. - */ - - kvm_set_msi_irq(kvm, e, &irq); - if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu)) { - /* - * Make sure the IRTE is in remapped mode if - * we don't handle it in posted mode. 
- */ - ret = irq_set_vcpu_affinity(host_irq, NULL); - if (ret < 0) { - printk(KERN_INFO - "failed to back to remapped mode, irq: %u\n", - host_irq); - goto out; - } - - continue; - } - - vcpu_info.pi_desc_addr = __pa(vcpu_to_pi_desc(vcpu)); - vcpu_info.vector = irq.vector; - - trace_kvm_pi_irte_update(vcpu->vcpu_id, host_irq, e->gsi, - vcpu_info.vector, vcpu_info.pi_desc_addr, set); - - if (set) - ret = irq_set_vcpu_affinity(host_irq, &vcpu_info); - else { - /* suppress notification event before unposting */ - pi_set_sn(vcpu_to_pi_desc(vcpu)); - ret = irq_set_vcpu_affinity(host_irq, NULL); - pi_clear_sn(vcpu_to_pi_desc(vcpu)); - } - - if (ret < 0) { - printk(KERN_INFO "%s: failed to update PI IRTE\n", - __func__); - goto out; - } - } - - ret = 0; -out: - srcu_read_unlock(&kvm->irq_srcu, idx); - return ret; -} - -static void vmx_setup_mce(struct kvm_vcpu *vcpu) -{ - if (vcpu->arch.mcg_cap & MCG_LMCE_P) - to_vmx(vcpu)->msr_ia32_feature_control_valid_bits |= - FEATURE_CONTROL_LMCE; - else - to_vmx(vcpu)->msr_ia32_feature_control_valid_bits &= - ~FEATURE_CONTROL_LMCE; -} - -static struct kvm_x86_ops vmx_x86_ops __ro_after_init = { +static struct kvm_x86_ops vmx_x86_ops = { .cpu_has_kvm_support = cpu_has_kvm_support, .disabled_by_bios = vmx_disabled_by_bios, .hardware_setup = hardware_setup, @@ -11233,7 +9124,8 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = { .vcpu_free = vmx_free_vcpu, .vcpu_reset = vmx_vcpu_reset, - .prepare_guest_switch = vmx_save_host_state, + .save_host_state = vmx_save_host_state, + .load_host_state = vmx_load_host_state, .vcpu_load = vmx_vcpu_load, .vcpu_put = vmx_vcpu_put, @@ -11264,11 +9156,6 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = { .get_rflags = vmx_get_rflags, .set_rflags = vmx_set_rflags, - .get_pkru = vmx_get_pkru, - - .fpu_activate = vmx_fpu_activate, - .fpu_deactivate = vmx_fpu_deactivate, - .tlb_flush = vmx_flush_tlb, .run = vmx_vcpu_run, @@ -11276,7 +9163,6 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = { .skip_emulated_instruction = skip_emulated_instruction, .set_interrupt_shadow = vmx_set_interrupt_shadow, .get_interrupt_shadow = vmx_get_interrupt_shadow, - .patch_hypercall = vmx_patch_hypercall, .set_irq = vmx_inject_irq, .set_nmi = vmx_inject_nmi, .queue_exception = vmx_queue_exception, @@ -11295,8 +9181,6 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = { .load_eoi_exitmap = vmx_load_eoi_exitmap, .hwapic_irr_update = vmx_hwapic_irr_update, .hwapic_isr_update = vmx_hwapic_isr_update, - .sync_pir_to_irr = vmx_sync_pir_to_irr, - .deliver_posted_interrupt = vmx_deliver_posted_interrupt, .set_tss_addr = vmx_set_tss_addr, .get_tdp_level = get_ept_level, @@ -11326,52 +9210,22 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = { .check_nested_events = vmx_check_nested_events, - .sched_in = vmx_sched_in, - .slot_enable_log_dirty = vmx_slot_enable_log_dirty, .slot_disable_log_dirty = vmx_slot_disable_log_dirty, .flush_log_dirty = vmx_flush_log_dirty, .enable_log_dirty_pt_masked = vmx_enable_log_dirty_pt_masked, - .pre_block = vmx_pre_block, - .post_block = vmx_post_block, - - .pmu_ops = &intel_pmu_ops, - .update_pi_irte = vmx_update_pi_irte, - -#ifdef CONFIG_X86_64 - .set_hv_timer = vmx_set_hv_timer, - .cancel_hv_timer = vmx_cancel_hv_timer, -#endif - - .setup_mce = vmx_setup_mce, + //.pmu_ops = &intel_pmu_ops, }; -static int __init vmx_init(void) +int vmx_init(void) { - int r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx), - __alignof__(struct vcpu_vmx), THIS_MODULE); - if (r) - return r; - -#ifdef 
CONFIG_KEXEC_CORE - rcu_assign_pointer(crash_vmclear_loaded_vmcss, - crash_vmclear_local_loaded_vmcss); -#endif - - return 0; + return kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx), 0); } -static void __exit vmx_exit(void) +void vmx_exit(void) { -#ifdef CONFIG_KEXEC_CORE - RCU_INIT_POINTER(crash_vmclear_loaded_vmcss, NULL); - synchronize_rcu(); -#endif - kvm_exit(); } -module_init(vmx_init) -module_exit(vmx_exit) diff --git a/arch/x86/kvm/vmx_def.h b/arch/x86/kvm/vmx_def.h new file mode 100755 index 0000000..89ff76a --- /dev/null +++ b/arch/x86/kvm/vmx_def.h @@ -0,0 +1,425 @@ +/* + * Kernel-based Virtual Machine driver for Linux + * + * This module enables machines with Intel VT-x extensions to run virtual + * machines without emulation or binary translation. + * + * Copyright (C) 2006 Qumranet, Inc. + * Copyright 2010 Red Hat, Inc. and/or its affiliates. + * Copyright 2019 Google LLC + * + * Authors: + * Avi Kivity <avi@qumranet.com> + * Yaniv Kamay <yaniv@qumranet.com> + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + */ + +#include "irq.h" +#include "mmu.h" +#include "cpuid.h" +#include "lapic.h" + +#include <linux/kvm_host.h> +#include <linux/list.h> +#include <ntkrutils.h> +#include <__asm.h> +#include "kvm_cache_regs.h" +#include "x86.h" +#include <asm/vmx.h> + +#include "pmu.h" + +/* MTRR memory types, which are defined in SDM */ +#define MTRR_TYPE_UNCACHABLE 0 +#define MTRR_TYPE_WRCOMB 1 +/*#define MTRR_TYPE_ 2*/ +/*#define MTRR_TYPE_ 3*/ +#define MTRR_TYPE_WRTHROUGH 4 +#define MTRR_TYPE_WRPROT 5 +#define MTRR_TYPE_WRBACK 6 +#define MTRR_NUM_TYPES 7 + + +#define GVM_GUEST_CR0_MASK (X86_CR0_NW | X86_CR0_CD) +#define GVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST (X86_CR0_WP | X86_CR0_NE) +#define GVM_VM_CR0_ALWAYS_ON \ + (GVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE) +#define GVM_CR4_GUEST_OWNED_BITS \ + (X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR \ + | X86_CR4_OSXMMEXCPT | X86_CR4_TSD) + +#define GVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE) +#define GVM_RMODE_VM_CR4_ALWAYS_ON (X86_CR4_VME | X86_CR4_PAE | X86_CR4_VMXE) + +#define RMODE_GUEST_OWNED_EFLAGS_BITS (~(X86_EFLAGS_IOPL | X86_EFLAGS_VM)) + +#define VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE 5 + +#define NR_AUTOLOAD_MSRS 8 +#define VMCS02_POOL_SIZE 1 + +struct vmcs { + u32 revision_id; + u32 abort; + char data[1016]; +}; + +/* + * Track a VMCS that may be loaded on a certain CPU. If it is (cpu!=-1), also + * remember whether it was VMLAUNCHed, and maintain a linked list of all VMCSs + * loaded on this CPU (so we can clear them if the CPU goes down). + */ +struct loaded_vmcs { + struct vmcs *vmcs; + struct vmcs *shadow_vmcs; + int cpu; + int launched; +}; + +/* + * struct vmcs12 describes the state that our guest hypervisor (L1) keeps for a + * single nested guest (L2), hence the name vmcs12. Any VMX implementation has + * a VMCS structure, and vmcs12 is our emulated VMX's VMCS. This structure is + * stored in guest memory specified by VMPTRLD, but is opaque to the guest, + * which must access it using VMREAD/VMWRITE/VMCLEAR instructions. + * More than one of these structures may exist, if L1 runs multiple L2 guests. + * nested_vmx_run() will use the data here to build a vmcs02: a VMCS for the + * underlying hardware which will be used to run L2. + * This structure is packed to ensure that its layout is identical across + * machines (necessary for live migration). 
+ * If there are changes in this struct, VMCS12_REVISION must be changed. + */ +typedef u64 natural_width; +struct __packed vmcs12 { + /* According to the Intel spec, a VMCS region must start with the + * following two fields. Then follow implementation-specific data. + */ + u32 revision_id; + u32 abort; + + u32 launch_state; /* set to 0 by VMCLEAR, to 1 by VMLAUNCH */ + u32 padding[7]; /* room for future expansion */ + + u64 io_bitmap_a; + u64 io_bitmap_b; + u64 msr_bitmap; + u64 vm_exit_msr_store_addr; + u64 vm_exit_msr_load_addr; + u64 vm_entry_msr_load_addr; + u64 tsc_offset; + u64 virtual_apic_page_addr; + u64 apic_access_addr; + u64 posted_intr_desc_addr; + u64 ept_pointer; + u64 eoi_exit_bitmap0; + u64 eoi_exit_bitmap1; + u64 eoi_exit_bitmap2; + u64 eoi_exit_bitmap3; + u64 xss_exit_bitmap; + u64 guest_physical_address; + u64 vmcs_link_pointer; + u64 guest_ia32_debugctl; + u64 guest_ia32_pat; + u64 guest_ia32_efer; + u64 guest_ia32_perf_global_ctrl; + u64 guest_pdptr0; + u64 guest_pdptr1; + u64 guest_pdptr2; + u64 guest_pdptr3; + u64 guest_bndcfgs; + u64 host_ia32_pat; + u64 host_ia32_efer; + u64 host_ia32_perf_global_ctrl; + u64 padding64[8]; /* room for future expansion */ + /* + * To allow migration of L1 (complete with its L2 guests) between + * machines of different natural widths (32 or 64 bit), we cannot have + * size_t fields with no explict size. We use u64 (aliased + * natural_width) instead. Luckily, x86 is little-endian. + */ + natural_width cr0_guest_host_mask; + natural_width cr4_guest_host_mask; + natural_width cr0_read_shadow; + natural_width cr4_read_shadow; + natural_width cr3_target_value0; + natural_width cr3_target_value1; + natural_width cr3_target_value2; + natural_width cr3_target_value3; + natural_width exit_qualification; + natural_width guest_linear_address; + natural_width guest_cr0; + natural_width guest_cr3; + natural_width guest_cr4; + natural_width guest_es_base; + natural_width guest_cs_base; + natural_width guest_ss_base; + natural_width guest_ds_base; + natural_width guest_fs_base; + natural_width guest_gs_base; + natural_width guest_ldtr_base; + natural_width guest_tr_base; + natural_width guest_gdtr_base; + natural_width guest_idtr_base; + natural_width guest_dr7; + natural_width guest_rsp; + natural_width guest_rip; + natural_width guest_rflags; + natural_width guest_pending_dbg_exceptions; + natural_width guest_sysenter_esp; + natural_width guest_sysenter_eip; + natural_width host_cr0; + natural_width host_cr3; + natural_width host_cr4; + natural_width host_fs_base; + natural_width host_gs_base; + natural_width host_tr_base; + natural_width host_gdtr_base; + natural_width host_idtr_base; + natural_width host_ia32_sysenter_esp; + natural_width host_ia32_sysenter_eip; + natural_width host_rsp; + natural_width host_rip; + natural_width paddingl[8]; /* room for future expansion */ + u32 pin_based_vm_exec_control; + u32 cpu_based_vm_exec_control; + u32 exception_bitmap; + u32 page_fault_error_code_mask; + u32 page_fault_error_code_match; + u32 cr3_target_count; + u32 vm_exit_controls; + u32 vm_exit_msr_store_count; + u32 vm_exit_msr_load_count; + u32 vm_entry_controls; + u32 vm_entry_msr_load_count; + u32 vm_entry_intr_info_field; + u32 vm_entry_exception_error_code; + u32 vm_entry_instruction_len; + u32 tpr_threshold; + u32 secondary_vm_exec_control; + u32 vm_instruction_error; + u32 vm_exit_reason; + u32 vm_exit_intr_info; + u32 vm_exit_intr_error_code; + u32 idt_vectoring_info_field; + u32 idt_vectoring_error_code; + u32 vm_exit_instruction_len; + 
u32 vmx_instruction_info; + u32 guest_es_limit; + u32 guest_cs_limit; + u32 guest_ss_limit; + u32 guest_ds_limit; + u32 guest_fs_limit; + u32 guest_gs_limit; + u32 guest_ldtr_limit; + u32 guest_tr_limit; + u32 guest_gdtr_limit; + u32 guest_idtr_limit; + u32 guest_es_ar_bytes; + u32 guest_cs_ar_bytes; + u32 guest_ss_ar_bytes; + u32 guest_ds_ar_bytes; + u32 guest_fs_ar_bytes; + u32 guest_gs_ar_bytes; + u32 guest_ldtr_ar_bytes; + u32 guest_tr_ar_bytes; + u32 guest_interruptibility_info; + u32 guest_activity_state; + u32 guest_sysenter_cs; + u32 host_ia32_sysenter_cs; + u32 vmx_preemption_timer_value; + u32 padding32[7]; /* room for future expansion */ + u16 virtual_processor_id; + u16 posted_intr_nv; + u16 guest_es_selector; + u16 guest_cs_selector; + u16 guest_ss_selector; + u16 guest_ds_selector; + u16 guest_fs_selector; + u16 guest_gs_selector; + u16 guest_ldtr_selector; + u16 guest_tr_selector; + u16 guest_intr_status; + u16 host_es_selector; + u16 host_cs_selector; + u16 host_ss_selector; + u16 host_ds_selector; + u16 host_fs_selector; + u16 host_gs_selector; + u16 host_tr_selector; +}; + +/* + * VMCS12_REVISION is an arbitrary id that should be changed if the content or + * layout of struct vmcs12 is changed. MSR_IA32_VMX_BASIC returns this id, and + * VMPTRLD verifies that the VMCS region that L1 is loading contains this id. + */ +#define VMCS12_REVISION 0x11e57ed0 + +/* + * VMCS12_SIZE is the number of bytes L1 should allocate for the VMXON region + * and any VMCS region. Although only sizeof(struct vmcs12) are used by the + * current implementation, 4K are reserved to avoid future complications. + */ +#define VMCS12_SIZE 0x1000 + +/* Used to remember the last vmcs02 used for some recently used vmcs12s */ +struct vmcs02_list { + struct list_head list; + gpa_t vmptr; + struct loaded_vmcs vmcs02; +}; + +/* + * The nested_vmx structure is part of vcpu_vmx, and holds information we need + * for correct emulation of VMX (i.e., nested VMX) on this vcpu. + */ +struct nested_vmx { + /* Has the level1 guest done vmxon? */ + bool vmxon; + gpa_t vmxon_ptr; + + /* The guest-physical address of the current VMCS L1 keeps for L2 */ + gpa_t current_vmptr; + /* The host-usable pointer to the above */ + PMDL current_vmcs12_mdl; + struct vmcs12 *current_vmcs12; + /* + * Cache of the guest's VMCS, existing outside of guest memory. + * Loaded from guest memory during VMPTRLD. Flushed to guest + * memory during VMXOFF, VMCLEAR, VMPTRLD. + */ + struct vmcs12 *cached_vmcs12; + /* + * Indicates if the shadow vmcs must be updated with the + * data hold by vmcs12 + */ + bool sync_shadow_vmcs; + + /* vmcs02_list cache of VMCSs recently used to run L2 guests */ + struct list_head vmcs02_pool; + int vmcs02_num; + bool change_vmcs01_virtual_x2apic_mode; + /* L2 must run next, and mustn't decide to exit to L1. */ + bool nested_run_pending; + /* + * Guest pages referred to in vmcs02 with host-physical pointers, so + * we must keep them pinned while L2 runs. 
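For illustration only: the vmcs02_pool declared above caches recently used vmcs02 structures keyed by the guest-physical VMCS pointer (vmptr), so repeated VMPTRLDs of the same vmcs12 can reuse a hardware VMCS instead of allocating one. A rough sketch of the lookup side, with a hypothetical helper name:

static struct loaded_vmcs *nested_find_vmcs02(struct nested_vmx *nested,
					      gpa_t vmptr)
{
	struct vmcs02_list *item;

	list_for_each_entry(item, &nested->vmcs02_pool, list)
		if (item->vmptr == vmptr)
			return &item->vmcs02;	/* hit: reuse cached vmcs02 */

	return NULL;	/* miss: caller allocates a fresh pool entry */
}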
+ */ + PMDL apic_access_mdl; + PMDL virtual_apic_mdl; + + size_t *msr_bitmap; + + /* to migrate it to L2 if VM_ENTRY_LOAD_DEBUG_CONTROLS is off */ + u64 vmcs01_debugctl; + + u16 vpid02; + u16 last_vpid; + + u32 nested_vmx_procbased_ctls_low; + u32 nested_vmx_procbased_ctls_high; + u32 nested_vmx_true_procbased_ctls_low; + u32 nested_vmx_secondary_ctls_low; + u32 nested_vmx_secondary_ctls_high; + u32 nested_vmx_pinbased_ctls_low; + u32 nested_vmx_pinbased_ctls_high; + u32 nested_vmx_exit_ctls_low; + u32 nested_vmx_exit_ctls_high; + u32 nested_vmx_true_exit_ctls_low; + u32 nested_vmx_entry_ctls_low; + u32 nested_vmx_entry_ctls_high; + u32 nested_vmx_true_entry_ctls_low; + u32 nested_vmx_misc_low; + u32 nested_vmx_misc_high; + u32 nested_vmx_ept_caps; + u32 nested_vmx_vpid_caps; +}; + +struct vcpu_vmx { + struct kvm_vcpu vcpu; + size_t host_rsp; + u8 fail; + bool nmi_known_unmasked; + u32 exit_intr_info; + u32 idt_vectoring_info; + ulong rflags; +#ifdef CONFIG_X86_64 + u64 msr_host_kernel_gs_base; + u64 msr_guest_kernel_gs_base; +#endif + u32 vm_entry_controls_shadow; + u32 vm_exit_controls_shadow; + /* + * loaded_vmcs points to the VMCS currently used in this vcpu. For a + * non-nested (L1) guest, it always points to vmcs01. For a nested + * guest (L2), it points to a different VMCS. + */ + struct loaded_vmcs vmcs01; + struct loaded_vmcs *loaded_vmcs; + bool __launched; /* temporary, used in vmx_vcpu_run */ + struct msr_autoload { + unsigned nr; + struct vmx_msr_entry guest[NR_AUTOLOAD_MSRS]; + struct vmx_msr_entry host[NR_AUTOLOAD_MSRS]; + } msr_autoload; + struct { + u16 fs_sel, gs_sel; +#ifdef CONFIG_X86_64 + u16 ds_sel, es_sel; +#endif + int gs_reload_needed; + int fs_reload_needed; + u64 msr_host_bndcfgs; + size_t vmcs_host_cr4; /* May not match real cr4 */ + } host_state; + struct { + int vm86_active; + ulong save_rflags; + struct kvm_segment segs[8]; + } rmode; + struct { + u32 bitmask; /* 4 bits per segment (1 bit per field) */ + struct kvm_save_segment { + u16 selector; + size_t base; + u32 limit; + u32 ar; + } seg[8]; + } segment_cache; + int vpid; + bool emulation_required; + + /* Support for vnmi-less CPUs */ + int soft_vnmi_blocked; + ktime_t entry_time; + s64 vnmi_blocked_time; + u32 exit_reason; + + /* Support for a guest hypervisor (nested VMX) */ + struct nested_vmx nested; + + /* Support for PML */ +#define PML_ENTITY_NUM 512 + struct page *pml_pg; + + /* + * Only bits masked by msr_ia32_feature_control_valid_bits can be set in + * msr_ia32_feature_control. FEATURE_CONTROL_LOCKED is always included + * in msr_ia32_feature_control_valid_bits. + */ + u64 msr_ia32_feature_control; + u64 msr_ia32_feature_control_valid_bits; +}; + +enum segment_cache_field { + SEG_FIELD_SEL = 0, + SEG_FIELD_BASE = 1, + SEG_FIELD_LIMIT = 2, + SEG_FIELD_AR = 3, + + SEG_FIELD_NR = 4 +}; + diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 04c5d96..44637f3 100644..100755 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7,6 +7,7 @@ * Copyright (C) 2008 Qumranet, Inc. * Copyright IBM Corporation, 2008 * Copyright 2010 Red Hat, Inc. and/or its affiliates. 
+ * Copyright 2019 Google LLC * * Authors: * Avi Kivity <avi@qumranet.com> @@ -19,67 +20,28 @@ * */ +#include <gvm_types.h> +#include <ntkrutils.h> +#include <gvm-main.h> #include <linux/kvm_host.h> #include "irq.h" #include "mmu.h" -#include "i8254.h" #include "tss.h" #include "kvm_cache_regs.h" #include "x86.h" #include "cpuid.h" -#include "assigned-dev.h" #include "pmu.h" -#include "hyperv.h" - -#include <linux/clocksource.h> -#include <linux/interrupt.h> -#include <linux/kvm.h> -#include <linux/fs.h> -#include <linux/vmalloc.h> -#include <linux/export.h> -#include <linux/moduleparam.h> -#include <linux/mman.h> -#include <linux/highmem.h> -#include <linux/iommu.h> -#include <linux/intel-iommu.h> -#include <linux/cpufreq.h> -#include <linux/user-return-notifier.h> -#include <linux/srcu.h> -#include <linux/slab.h> -#include <linux/perf_event.h> -#include <linux/uaccess.h> -#include <linux/hash.h> -#include <linux/pci.h> -#include <linux/timekeeper_internal.h> -#include <linux/pvclock_gtod.h> -#include <linux/kvm_irqfd.h> -#include <linux/irqbypass.h> -#include <trace/events/kvm.h> - -#include <asm/debugreg.h> -#include <asm/msr.h> -#include <asm/desc.h> -#include <asm/mce.h> -#include <linux/kernel_stat.h> -#include <asm/fpu/internal.h> /* Ugh! */ -#include <asm/pvclock.h> -#include <asm/div64.h> -#include <asm/irq_remapping.h> - -#define CREATE_TRACE_POINTS -#include "trace.h" +#include <asm/vmx.h> + #define MAX_IO_MSRS 256 -#define KVM_MAX_MCE_BANKS 32 -u64 __read_mostly kvm_mce_cap_supported = MCG_CTL_P | MCG_SER_P; -EXPORT_SYMBOL_GPL(kvm_mce_cap_supported); #define emul_to_vcpu(ctxt) \ container_of(ctxt, struct kvm_vcpu, arch.emulate_ctxt) /* EFER defaults: - * - enable syscall per default because its emulated by KVM - * - enable LME and LMA per default on 64 bit KVM + * - enable syscall per default because its emulated by kvm + * - enable LME and LMA per default on 64 bit kvm */ #ifdef CONFIG_X86_64 static @@ -88,219 +50,39 @@ u64 __read_mostly efer_reserved_bits = ~((u64)(EFER_SCE | EFER_LME | EFER_LMA)); static u64 __read_mostly efer_reserved_bits = ~((u64)EFER_SCE); #endif -#define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM -#define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU +#define VM_STAT(x) offsetof(struct kvm, stat.x), GVM_STAT_VM +#define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), GVM_STAT_VCPU -#define KVM_X2APIC_API_VALID_FLAGS (KVM_X2APIC_API_USE_32BIT_IDS | \ - KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK) +#define GVM_X2APIC_API_VALID_FLAGS (GVM_X2APIC_API_USE_32BIT_IDS | \ + GVM_X2APIC_API_DISABLE_BROADCAST_QUIRK) static void update_cr8_intercept(struct kvm_vcpu *vcpu); static void process_nmi(struct kvm_vcpu *vcpu); static void enter_smm(struct kvm_vcpu *vcpu); -static void __kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags); +static void __kvm_set_rflags(struct kvm_vcpu *vcpu, size_t rflags); struct kvm_x86_ops *kvm_x86_ops __read_mostly; -EXPORT_SYMBOL_GPL(kvm_x86_ops); static bool __read_mostly ignore_msrs = 0; -module_param(ignore_msrs, bool, S_IRUGO | S_IWUSR); unsigned int min_timer_period_us = 500; -module_param(min_timer_period_us, uint, S_IRUGO | S_IWUSR); - -static bool __read_mostly kvmclock_periodic_sync = true; -module_param(kvmclock_periodic_sync, bool, S_IRUGO); - -bool __read_mostly kvm_has_tsc_control; -EXPORT_SYMBOL_GPL(kvm_has_tsc_control); -u32 __read_mostly kvm_max_guest_tsc_khz; -EXPORT_SYMBOL_GPL(kvm_max_guest_tsc_khz); -u8 __read_mostly kvm_tsc_scaling_ratio_frac_bits; 
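The VM_STAT()/VCPU_STAT() macros retained above expand to an (offset, kind) pair, which lets a generic stats reader pull any counter out of struct kvm or struct kvm_vcpu by offset alone. A rough sketch of that consumer side; the struct, function name and the u64 counter width are assumptions of the sketch, not taken from the port:

struct stat_entry {
	const char *name;
	unsigned int offset;	/* filled in by VM_STAT()/VCPU_STAT() */
	int kind;		/* GVM_STAT_VM or GVM_STAT_VCPU */
};

static u64 read_stat(struct kvm *kvm, struct kvm_vcpu *vcpu,
		     const struct stat_entry *e)
{
	void *base = (e->kind == GVM_STAT_VM) ? (void *)kvm : (void *)vcpu;

	/* offset came from offsetof(struct kvm, stat.x) or
	 * offsetof(struct kvm_vcpu, stat.x), so this lands on the counter */
	return *(u64 *)((char *)base + e->offset);
}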
-EXPORT_SYMBOL_GPL(kvm_tsc_scaling_ratio_frac_bits); -u64 __read_mostly kvm_max_tsc_scaling_ratio; -EXPORT_SYMBOL_GPL(kvm_max_tsc_scaling_ratio); -u64 __read_mostly kvm_default_tsc_scaling_ratio; -EXPORT_SYMBOL_GPL(kvm_default_tsc_scaling_ratio); /* tsc tolerance in parts per million - default to 1/2 of the NTP threshold */ static u32 __read_mostly tsc_tolerance_ppm = 250; -module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR); /* lapic timer advance (tscdeadline mode only) in nanoseconds */ unsigned int __read_mostly lapic_timer_advance_ns = 0; -module_param(lapic_timer_advance_ns, uint, S_IRUGO | S_IWUSR); static bool __read_mostly vector_hashing = true; -module_param(vector_hashing, bool, S_IRUGO); static bool __read_mostly backwards_tsc_observed = false; -#define KVM_NR_SHARED_MSRS 16 - -struct kvm_shared_msrs_global { - int nr; - u32 msrs[KVM_NR_SHARED_MSRS]; -}; - -struct kvm_shared_msrs { - struct user_return_notifier urn; - bool registered; - struct kvm_shared_msr_values { - u64 host; - u64 curr; - } values[KVM_NR_SHARED_MSRS]; -}; - -static struct kvm_shared_msrs_global __read_mostly shared_msrs_global; -static struct kvm_shared_msrs __percpu *shared_msrs; - -struct kvm_stats_debugfs_item debugfs_entries[] = { - { "pf_fixed", VCPU_STAT(pf_fixed) }, - { "pf_guest", VCPU_STAT(pf_guest) }, - { "tlb_flush", VCPU_STAT(tlb_flush) }, - { "invlpg", VCPU_STAT(invlpg) }, - { "exits", VCPU_STAT(exits) }, - { "io_exits", VCPU_STAT(io_exits) }, - { "mmio_exits", VCPU_STAT(mmio_exits) }, - { "signal_exits", VCPU_STAT(signal_exits) }, - { "irq_window", VCPU_STAT(irq_window_exits) }, - { "nmi_window", VCPU_STAT(nmi_window_exits) }, - { "halt_exits", VCPU_STAT(halt_exits) }, - { "halt_successful_poll", VCPU_STAT(halt_successful_poll) }, - { "halt_attempted_poll", VCPU_STAT(halt_attempted_poll) }, - { "halt_poll_invalid", VCPU_STAT(halt_poll_invalid) }, - { "halt_wakeup", VCPU_STAT(halt_wakeup) }, - { "hypercalls", VCPU_STAT(hypercalls) }, - { "request_irq", VCPU_STAT(request_irq_exits) }, - { "irq_exits", VCPU_STAT(irq_exits) }, - { "host_state_reload", VCPU_STAT(host_state_reload) }, - { "efer_reload", VCPU_STAT(efer_reload) }, - { "fpu_reload", VCPU_STAT(fpu_reload) }, - { "insn_emulation", VCPU_STAT(insn_emulation) }, - { "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) }, - { "irq_injections", VCPU_STAT(irq_injections) }, - { "nmi_injections", VCPU_STAT(nmi_injections) }, - { "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) }, - { "mmu_pte_write", VM_STAT(mmu_pte_write) }, - { "mmu_pte_updated", VM_STAT(mmu_pte_updated) }, - { "mmu_pde_zapped", VM_STAT(mmu_pde_zapped) }, - { "mmu_flooded", VM_STAT(mmu_flooded) }, - { "mmu_recycled", VM_STAT(mmu_recycled) }, - { "mmu_cache_miss", VM_STAT(mmu_cache_miss) }, - { "mmu_unsync", VM_STAT(mmu_unsync) }, - { "remote_tlb_flush", VM_STAT(remote_tlb_flush) }, - { "largepages", VM_STAT(lpages) }, - { NULL } -}; - u64 __read_mostly host_xcr0; -static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt); - -static inline void kvm_async_pf_hash_reset(struct kvm_vcpu *vcpu) -{ - int i; - for (i = 0; i < roundup_pow_of_two(ASYNC_PF_PER_VCPU); i++) - vcpu->arch.apf.gfns[i] = ~0; -} - -static void kvm_on_user_return(struct user_return_notifier *urn) -{ - unsigned slot; - struct kvm_shared_msrs *locals - = container_of(urn, struct kvm_shared_msrs, urn); - struct kvm_shared_msr_values *values; - unsigned long flags; - - /* - * Disabling irqs at this point since the following code could be - * interrupted and executed through kvm_arch_hardware_disable() 
- */ - local_irq_save(flags); - if (locals->registered) { - locals->registered = false; - user_return_notifier_unregister(urn); - } - local_irq_restore(flags); - for (slot = 0; slot < shared_msrs_global.nr; ++slot) { - values = &locals->values[slot]; - if (values->host != values->curr) { - wrmsrl(shared_msrs_global.msrs[slot], values->host); - values->curr = values->host; - } - } -} - -static void shared_msr_update(unsigned slot, u32 msr) -{ - u64 value; - unsigned int cpu = smp_processor_id(); - struct kvm_shared_msrs *smsr = per_cpu_ptr(shared_msrs, cpu); - - /* only read, and nobody should modify it at this time, - * so don't need lock */ - if (slot >= shared_msrs_global.nr) { - printk(KERN_ERR "kvm: invalid MSR slot!"); - return; - } - rdmsrl_safe(msr, &value); - smsr->values[slot].host = value; - smsr->values[slot].curr = value; -} - -void kvm_define_shared_msr(unsigned slot, u32 msr) -{ - BUG_ON(slot >= KVM_NR_SHARED_MSRS); - shared_msrs_global.msrs[slot] = msr; - if (slot >= shared_msrs_global.nr) - shared_msrs_global.nr = slot + 1; -} -EXPORT_SYMBOL_GPL(kvm_define_shared_msr); - -static void kvm_shared_msr_cpu_online(void) -{ - unsigned i; - - for (i = 0; i < shared_msrs_global.nr; ++i) - shared_msr_update(i, shared_msrs_global.msrs[i]); -} - -int kvm_set_shared_msr(unsigned slot, u64 value, u64 mask) -{ - unsigned int cpu = smp_processor_id(); - struct kvm_shared_msrs *smsr = per_cpu_ptr(shared_msrs, cpu); - int err; - - if (((value ^ smsr->values[slot].curr) & mask) == 0) - return 0; - smsr->values[slot].curr = value; - err = wrmsrl_safe(shared_msrs_global.msrs[slot], value); - if (err) - return 1; - - if (!smsr->registered) { - smsr->urn.on_user_return = kvm_on_user_return; - user_return_notifier_register(&smsr->urn); - smsr->registered = true; - } - return 0; -} -EXPORT_SYMBOL_GPL(kvm_set_shared_msr); - -static void drop_user_return_notifiers(void) -{ - unsigned int cpu = smp_processor_id(); - struct kvm_shared_msrs *smsr = per_cpu_ptr(shared_msrs, cpu); - - if (smsr->registered) - kvm_on_user_return(&smsr->urn); -} - u64 kvm_get_apic_base(struct kvm_vcpu *vcpu) { return vcpu->arch.apic_base; } -EXPORT_SYMBOL_GPL(kvm_get_apic_base); int kvm_set_apic_base(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { @@ -323,14 +105,6 @@ int kvm_set_apic_base(struct kvm_vcpu *vcpu, struct msr_data *msr_info) kvm_lapic_set_base(vcpu, msr_info->data); return 0; } -EXPORT_SYMBOL_GPL(kvm_set_apic_base); - -asmlinkage __visible void kvm_spurious_fault(void) -{ - /* Fault while not rebooting. We want the trace. 
*/ - BUG(); -} -EXPORT_SYMBOL_GPL(kvm_spurious_fault); #define EXCPT_BENIGN 0 #define EXCPT_CONTRIBUTORY 1 @@ -385,7 +159,7 @@ static void kvm_multiple_exception(struct kvm_vcpu *vcpu, u32 prev_nr; int class1, class2; - kvm_make_request(KVM_REQ_EVENT, vcpu); + kvm_make_request(GVM_REQ_EVENT, vcpu); if (!vcpu->arch.exception.pending) { queue: @@ -403,7 +177,7 @@ static void kvm_multiple_exception(struct kvm_vcpu *vcpu, prev_nr = vcpu->arch.exception.nr; if (prev_nr == DF_VECTOR) { /* triple fault -> shutdown */ - kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); + kvm_make_request(GVM_REQ_TRIPLE_FAULT, vcpu); return; } class1 = exception_class(prev_nr); @@ -426,13 +200,11 @@ void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr) { kvm_multiple_exception(vcpu, nr, false, 0, false); } -EXPORT_SYMBOL_GPL(kvm_queue_exception); void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr) { kvm_multiple_exception(vcpu, nr, false, 0, true); } -EXPORT_SYMBOL_GPL(kvm_requeue_exception); void kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err) { @@ -441,7 +213,6 @@ void kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err) else kvm_x86_ops->skip_emulated_instruction(vcpu); } -EXPORT_SYMBOL_GPL(kvm_complete_insn_gp); void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault) { @@ -449,7 +220,6 @@ void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault) vcpu->arch.cr2 = fault->address; kvm_queue_exception_e(vcpu, PF_VECTOR, fault->error_code); } -EXPORT_SYMBOL_GPL(kvm_inject_page_fault); static bool kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault) { @@ -464,21 +234,18 @@ static bool kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fau void kvm_inject_nmi(struct kvm_vcpu *vcpu) { atomic_inc(&vcpu->arch.nmi_queued); - kvm_make_request(KVM_REQ_NMI, vcpu); + kvm_make_request(GVM_REQ_NMI, vcpu); } -EXPORT_SYMBOL_GPL(kvm_inject_nmi); void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code) { kvm_multiple_exception(vcpu, nr, true, error_code, false); } -EXPORT_SYMBOL_GPL(kvm_queue_exception_e); void kvm_requeue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code) { kvm_multiple_exception(vcpu, nr, true, error_code, true); } -EXPORT_SYMBOL_GPL(kvm_requeue_exception_e); /* * Checks if cpl <= required_cpl; if true, return true. Otherwise queue @@ -491,7 +258,6 @@ bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl) kvm_queue_exception_e(vcpu, GP_VECTOR, 0); return false; } -EXPORT_SYMBOL_GPL(kvm_require_cpl); bool kvm_require_dr(struct kvm_vcpu *vcpu, int dr) { @@ -501,7 +267,6 @@ bool kvm_require_dr(struct kvm_vcpu *vcpu, int dr) kvm_queue_exception(vcpu, UD_VECTOR); return false; } -EXPORT_SYMBOL_GPL(kvm_require_dr); /* * This function will be used to read from the physical memory of the currently @@ -525,7 +290,6 @@ int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, return kvm_vcpu_read_guest_page(vcpu, real_gfn, data, offset, len); } -EXPORT_SYMBOL_GPL(kvm_read_guest_page_mmu); static int kvm_read_nested_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, void *data, int offset, int len, u32 access) @@ -537,7 +301,7 @@ static int kvm_read_nested_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, /* * Load the pae pdptrs. Return true is they are all valid. 
*/ -int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3) +int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, size_t cr3) { gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT; unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2; @@ -564,14 +328,13 @@ int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3) memcpy(mmu->pdptrs, pdpte, sizeof(mmu->pdptrs)); __set_bit(VCPU_EXREG_PDPTR, - (unsigned long *)&vcpu->arch.regs_avail); + (size_t *)&vcpu->arch.regs_avail); __set_bit(VCPU_EXREG_PDPTR, - (unsigned long *)&vcpu->arch.regs_dirty); + (size_t *)&vcpu->arch.regs_dirty); out: return ret; } -EXPORT_SYMBOL_GPL(load_pdptrs); static bool pdptrs_changed(struct kvm_vcpu *vcpu) { @@ -585,7 +348,7 @@ static bool pdptrs_changed(struct kvm_vcpu *vcpu) return false; if (!test_bit(VCPU_EXREG_PDPTR, - (unsigned long *)&vcpu->arch.regs_avail)) + (size_t *)&vcpu->arch.regs_avail)) return true; gfn = (kvm_read_cr3(vcpu) & ~31u) >> PAGE_SHIFT; @@ -600,10 +363,10 @@ out: return changed; } -int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) +int kvm_set_cr0(struct kvm_vcpu *vcpu, size_t cr0) { - unsigned long old_cr0 = kvm_read_cr0(vcpu); - unsigned long update_bits = X86_CR0_PG | X86_CR0_WP; + size_t old_cr0 = kvm_read_cr0(vcpu); + size_t update_bits = X86_CR0_PG | X86_CR0_WP; cr0 |= X86_CR0_ET; @@ -642,28 +405,21 @@ int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) kvm_x86_ops->set_cr0(vcpu, cr0); - if ((cr0 ^ old_cr0) & X86_CR0_PG) { - kvm_clear_async_pf_completion_queue(vcpu); - kvm_async_pf_hash_reset(vcpu); - } - if ((cr0 ^ old_cr0) & update_bits) kvm_mmu_reset_context(vcpu); if (((cr0 ^ old_cr0) & X86_CR0_CD) && - kvm_arch_has_noncoherent_dma(vcpu->kvm) && - !kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED)) + //kvm_arch_has_noncoherent_dma(vcpu->kvm) && + !kvm_check_has_quirk(vcpu->kvm, GVM_X86_QUIRK_CD_NW_CLEARED)) kvm_zap_gfn_range(vcpu->kvm, 0, ~0ULL); return 0; } -EXPORT_SYMBOL_GPL(kvm_set_cr0); -void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw) +void kvm_lmsw(struct kvm_vcpu *vcpu, size_t msw) { (void)kvm_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~0x0eul) | (msw & 0x0f)); } -EXPORT_SYMBOL_GPL(kvm_lmsw); static void kvm_load_guest_xcr0(struct kvm_vcpu *vcpu) { @@ -733,12 +489,11 @@ int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) } return 0; } -EXPORT_SYMBOL_GPL(kvm_set_xcr); -int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) +int kvm_set_cr4(struct kvm_vcpu *vcpu, size_t cr4) { - unsigned long old_cr4 = kvm_read_cr4(vcpu); - unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE | + size_t old_cr4 = kvm_read_cr4(vcpu); + size_t pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE; if (cr4 & CR4_RESERVED_BITS) @@ -789,9 +544,8 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) return 0; } -EXPORT_SYMBOL_GPL(kvm_set_cr4); -int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) +int kvm_set_cr3(struct kvm_vcpu *vcpu, size_t cr3) { #ifdef CONFIG_X86_64 cr3 &= ~CR3_PCID_INVD; @@ -799,7 +553,7 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) if (cr3 == kvm_read_cr3(vcpu) && !pdptrs_changed(vcpu)) { kvm_mmu_sync_roots(vcpu); - kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); + kvm_make_request(GVM_REQ_TLB_FLUSH, vcpu); return 0; } @@ -815,9 +569,8 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) kvm_mmu_new_cr3(vcpu); return 0; } -EXPORT_SYMBOL_GPL(kvm_set_cr3); -int kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8) +int 
kvm_set_cr8(struct kvm_vcpu *vcpu, size_t cr8) { if (cr8 & CR8_RESERVED_BITS) return 1; @@ -827,46 +580,44 @@ int kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8) vcpu->arch.cr8 = cr8; return 0; } -EXPORT_SYMBOL_GPL(kvm_set_cr8); -unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu) +size_t kvm_get_cr8(struct kvm_vcpu *vcpu) { if (lapic_in_kernel(vcpu)) return kvm_lapic_get_cr8(vcpu); else return vcpu->arch.cr8; } -EXPORT_SYMBOL_GPL(kvm_get_cr8); static void kvm_update_dr0123(struct kvm_vcpu *vcpu) { int i; - if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) { - for (i = 0; i < KVM_NR_DB_REGS; i++) + if (!(vcpu->guest_debug & GVM_GUESTDBG_USE_HW_BP)) { + for (i = 0; i < GVM_NR_DB_REGS; i++) vcpu->arch.eff_db[i] = vcpu->arch.db[i]; - vcpu->arch.switch_db_regs |= KVM_DEBUGREG_RELOAD; + vcpu->arch.switch_db_regs |= GVM_DEBUGREG_RELOAD; } } static void kvm_update_dr6(struct kvm_vcpu *vcpu) { - if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) + if (!(vcpu->guest_debug & GVM_GUESTDBG_USE_HW_BP)) kvm_x86_ops->set_dr6(vcpu, vcpu->arch.dr6); } static void kvm_update_dr7(struct kvm_vcpu *vcpu) { - unsigned long dr7; + size_t dr7; - if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) + if (vcpu->guest_debug & GVM_GUESTDBG_USE_HW_BP) dr7 = vcpu->arch.guest_debug_dr7; else dr7 = vcpu->arch.dr7; kvm_x86_ops->set_dr7(vcpu, dr7); - vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_BP_ENABLED; + vcpu->arch.switch_db_regs &= ~GVM_DEBUGREG_BP_ENABLED; if (dr7 & DR7_BP_EN_MASK) - vcpu->arch.switch_db_regs |= KVM_DEBUGREG_BP_ENABLED; + vcpu->arch.switch_db_regs |= GVM_DEBUGREG_BP_ENABLED; } static u64 kvm_dr6_fixed(struct kvm_vcpu *vcpu) @@ -878,12 +629,15 @@ static u64 kvm_dr6_fixed(struct kvm_vcpu *vcpu) return fixed; } -static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val) +static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, size_t val) { switch (dr) { - case 0 ... 3: + case 0: + case 1: + case 2: + case 3: vcpu->arch.db[dr] = val; - if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) + if (!(vcpu->guest_debug & GVM_GUESTDBG_USE_HW_BP)) vcpu->arch.eff_db[dr] = val; break; case 4: @@ -907,7 +661,7 @@ static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val) return 0; } -int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val) +int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, size_t val) { if (__kvm_set_dr(vcpu, dr, val)) { kvm_inject_gp(vcpu, 0); @@ -915,18 +669,20 @@ int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val) } return 0; } -EXPORT_SYMBOL_GPL(kvm_set_dr); -int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val) +int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, size_t *val) { switch (dr) { - case 0 ... 3: + case 0: + case 1: + case 2: + case 3: *val = vcpu->arch.db[dr]; break; case 4: /* fall through */ case 6: - if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) + if (vcpu->guest_debug & GVM_GUESTDBG_USE_HW_BP) *val = vcpu->arch.dr6; else *val = kvm_x86_ops->get_dr6(vcpu); @@ -939,8 +695,8 @@ int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val) } return 0; } -EXPORT_SYMBOL_GPL(kvm_get_dr); +#if 0 bool kvm_rdpmc(struct kvm_vcpu *vcpu) { u32 ecx = kvm_register_read(vcpu, VCPU_REGS_RCX); @@ -954,11 +710,11 @@ bool kvm_rdpmc(struct kvm_vcpu *vcpu) kvm_register_write(vcpu, VCPU_REGS_RDX, data >> 32); return err; } -EXPORT_SYMBOL_GPL(kvm_rdpmc); +#endif /* - * List of msr numbers which we expose to userspace through KVM_GET_MSRS - * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST. 
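One recurring transformation in this file, visible in __kvm_set_dr()/kvm_get_dr() just above and again in the MSR dispatch further down, is that the GNU "case low ... high" extension is replaced with explicit case labels or plain range tests, presumably because the port's toolchain does not accept the GCC-only syntax. Side by side, purely for illustration:

	/* original KVM (GNU C case-range extension): */
	switch (dr) {
	case 0 ... 3:
		vcpu->arch.db[dr] = val;
		break;
	}

	/* portable form used by the port: */
	switch (dr) {
	case 0:
	case 1:
	case 2:
	case 3:
		vcpu->arch.db[dr] = val;
		break;
	}

	/* and for wide ranges such as the MTRR MSRs, a range test up front: */
	if (msr >= 0x200 && msr <= 0x2ff)
		return kvm_mtrr_set_msr(vcpu, msr, data);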
+ * List of msr numbers which we expose to userspace through GVM_GET_MSRS + * and GVM_SET_MSRS, and GVM_GET_MSR_INDEX_LIST. * * This list is modified at module load time to reflect the * capabilities of the host cpu. This capabilities test skips MSRs that are @@ -972,45 +728,19 @@ static u32 msrs_to_save[] = { #ifdef CONFIG_X86_64 MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR, #endif - MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA, - MSR_IA32_FEATURE_CONTROL, MSR_IA32_BNDCFGS, MSR_TSC_AUX, + MSR_IA32_TSC, MSR_IA32_CR_PAT, //MSR_VM_HSAVE_PA, + MSR_IA32_FEATURE_CONTROL, //MSR_IA32_BNDCFGS, MSR_TSC_AUX, }; static unsigned num_msrs_to_save; -static u32 emulated_msrs[] = { - MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK, - MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW, - HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL, - HV_X64_MSR_TIME_REF_COUNT, HV_X64_MSR_REFERENCE_TSC, - HV_X64_MSR_CRASH_P0, HV_X64_MSR_CRASH_P1, HV_X64_MSR_CRASH_P2, - HV_X64_MSR_CRASH_P3, HV_X64_MSR_CRASH_P4, HV_X64_MSR_CRASH_CTL, - HV_X64_MSR_RESET, - HV_X64_MSR_VP_INDEX, - HV_X64_MSR_VP_RUNTIME, - HV_X64_MSR_SCONTROL, - HV_X64_MSR_STIMER0_CONFIG, - HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME, - MSR_KVM_PV_EOI_EN, - - MSR_IA32_TSC_ADJUST, - MSR_IA32_TSCDEADLINE, - MSR_IA32_MISC_ENABLE, - MSR_IA32_MCG_STATUS, - MSR_IA32_MCG_CTL, - MSR_IA32_MCG_EXT_CTL, - MSR_IA32_SMBASE, -}; - -static unsigned num_emulated_msrs; - bool kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer) { if (efer & efer_reserved_bits) return false; if (efer & EFER_FFXSR) { - struct kvm_cpuid_entry2 *feat; + struct kvm_cpuid_entry *feat; feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0); if (!feat || !(feat->edx & bit(X86_FEATURE_FXSR_OPT))) @@ -1018,7 +748,7 @@ bool kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer) } if (efer & EFER_SVME) { - struct kvm_cpuid_entry2 *feat; + struct kvm_cpuid_entry *feat; feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0); if (!feat || !(feat->ecx & bit(X86_FEATURE_SVM))) @@ -1027,7 +757,6 @@ bool kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer) return true; } -EXPORT_SYMBOL_GPL(kvm_valid_efer); static int set_efer(struct kvm_vcpu *vcpu, u64 efer) { @@ -1056,7 +785,6 @@ void kvm_enable_efer_bits(u64 mask) { efer_reserved_bits &= ~mask; } -EXPORT_SYMBOL_GPL(kvm_enable_efer_bits); /* * Writes msr value into into the appropriate "register". 
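The EFER_FFXSR and EFER_SVME checks in kvm_valid_efer() above share one shape: accept the EFER bit only if the corresponding guest CPUID feature bit was exposed. Factored out as a standalone predicate it would look roughly like this; the helper name is hypothetical, as the port keeps the checks inline:

static bool guest_cpuid_has_svm(struct kvm_vcpu *vcpu)
{
	struct kvm_cpuid_entry *feat;

	feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
	return feat && (feat->ecx & bit(X86_FEATURE_SVM));
}

	/* usage inside kvm_valid_efer(): */
	if ((efer & EFER_SVME) && !guest_cpuid_has_svm(vcpu))
		return false;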
@@ -1092,7 +820,6 @@ int kvm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) } return kvm_x86_ops->set_msr(vcpu, msr); } -EXPORT_SYMBOL_GPL(kvm_set_msr); /* * Adapt set_msr() to msr_io()'s calling convention @@ -1122,257 +849,22 @@ static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data) return kvm_set_msr(vcpu, &msr); } -#ifdef CONFIG_X86_64 -struct pvclock_gtod_data { - seqcount_t seq; - - struct { /* extract of a clocksource struct */ - int vclock_mode; - cycle_t cycle_last; - cycle_t mask; - u32 mult; - u32 shift; - } clock; - - u64 boot_ns; - u64 nsec_base; -}; - -static struct pvclock_gtod_data pvclock_gtod_data; - -static void update_pvclock_gtod(struct timekeeper *tk) -{ - struct pvclock_gtod_data *vdata = &pvclock_gtod_data; - u64 boot_ns; - - boot_ns = ktime_to_ns(ktime_add(tk->tkr_mono.base, tk->offs_boot)); - - write_seqcount_begin(&vdata->seq); - - /* copy pvclock gtod data */ - vdata->clock.vclock_mode = tk->tkr_mono.clock->archdata.vclock_mode; - vdata->clock.cycle_last = tk->tkr_mono.cycle_last; - vdata->clock.mask = tk->tkr_mono.mask; - vdata->clock.mult = tk->tkr_mono.mult; - vdata->clock.shift = tk->tkr_mono.shift; - - vdata->boot_ns = boot_ns; - vdata->nsec_base = tk->tkr_mono.xtime_nsec; - - write_seqcount_end(&vdata->seq); -} -#endif - void kvm_set_pending_timer(struct kvm_vcpu *vcpu) { /* - * Note: KVM_REQ_PENDING_TIMER is implicitly checked in + * Note: GVM_REQ_PENDING_TIMER is implicitly checked in * vcpu_enter_guest. This function is only called from * the physical CPU that is running vcpu. */ - kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu); -} - -static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock) -{ - int version; - int r; - struct pvclock_wall_clock wc; - struct timespec64 boot; - - if (!wall_clock) - return; - - r = kvm_read_guest(kvm, wall_clock, &version, sizeof(version)); - if (r) - return; - - if (version & 1) - ++version; /* first time write, random junk */ - - ++version; - - if (kvm_write_guest(kvm, wall_clock, &version, sizeof(version))) - return; - - /* - * The guest calculates current wall clock time by adding - * system time (updated by kvm_guest_time_update below) to the - * wall clock specified here. guest system time equals host - * system time for us, thus we must fill in host boot time here. 
- */ - getboottime64(&boot); - - if (kvm->arch.kvmclock_offset) { - struct timespec64 ts = ns_to_timespec64(kvm->arch.kvmclock_offset); - boot = timespec64_sub(boot, ts); - } - wc.sec = (u32)boot.tv_sec; /* overflow in 2106 guest time */ - wc.nsec = boot.tv_nsec; - wc.version = version; - - kvm_write_guest(kvm, wall_clock, &wc, sizeof(wc)); - - version++; - kvm_write_guest(kvm, wall_clock, &version, sizeof(version)); -} - -static uint32_t div_frac(uint32_t dividend, uint32_t divisor) -{ - do_shl32_div32(dividend, divisor); - return dividend; -} - -static void kvm_get_time_scale(uint64_t scaled_hz, uint64_t base_hz, - s8 *pshift, u32 *pmultiplier) -{ - uint64_t scaled64; - int32_t shift = 0; - uint64_t tps64; - uint32_t tps32; - - tps64 = base_hz; - scaled64 = scaled_hz; - while (tps64 > scaled64*2 || tps64 & 0xffffffff00000000ULL) { - tps64 >>= 1; - shift--; - } - - tps32 = (uint32_t)tps64; - while (tps32 <= scaled64 || scaled64 & 0xffffffff00000000ULL) { - if (scaled64 & 0xffffffff00000000ULL || tps32 & 0x80000000) - scaled64 >>= 1; - else - tps32 <<= 1; - shift++; - } - - *pshift = shift; - *pmultiplier = div_frac(scaled64, tps32); - - pr_debug("%s: base_hz %llu => %llu, shift %d, mul %u\n", - __func__, base_hz, scaled_hz, shift, *pmultiplier); + kvm_make_request(GVM_REQ_PENDING_TIMER, vcpu); } #ifdef CONFIG_X86_64 static atomic_t kvm_guest_has_master_clock = ATOMIC_INIT(0); #endif -static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz); -static unsigned long max_tsc_khz; - -static u32 adjust_tsc_khz(u32 khz, s32 ppm) -{ - u64 v = (u64)khz * (1000000 + ppm); - do_div(v, 1000000); - return v; -} - -static int set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale) -{ - u64 ratio; - - /* Guest TSC same frequency as host TSC? */ - if (!scale) { - vcpu->arch.tsc_scaling_ratio = kvm_default_tsc_scaling_ratio; - return 0; - } - - /* TSC scaling supported? */ - if (!kvm_has_tsc_control) { - if (user_tsc_khz > tsc_khz) { - vcpu->arch.tsc_catchup = 1; - vcpu->arch.tsc_always_catchup = 1; - return 0; - } else { - WARN(1, "user requested TSC rate below hardware speed\n"); - return -1; - } - } - - /* TSC scaling required - calculate ratio */ - ratio = mul_u64_u32_div(1ULL << kvm_tsc_scaling_ratio_frac_bits, - user_tsc_khz, tsc_khz); - - if (ratio == 0 || ratio >= kvm_max_tsc_scaling_ratio) { - WARN_ONCE(1, "Invalid TSC scaling ratio - virtual-tsc-khz=%u\n", - user_tsc_khz); - return -1; - } - - vcpu->arch.tsc_scaling_ratio = ratio; - return 0; -} - -static int kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz) -{ - u32 thresh_lo, thresh_hi; - int use_scaling = 0; - - /* tsc_khz can be zero if TSC calibration fails */ - if (user_tsc_khz == 0) { - /* set tsc_scaling_ratio to a safe value */ - vcpu->arch.tsc_scaling_ratio = kvm_default_tsc_scaling_ratio; - return -1; - } - - /* Compute a scale to convert nanoseconds in TSC cycles */ - kvm_get_time_scale(user_tsc_khz * 1000LL, NSEC_PER_SEC, - &vcpu->arch.virtual_tsc_shift, - &vcpu->arch.virtual_tsc_mult); - vcpu->arch.virtual_tsc_khz = user_tsc_khz; - - /* - * Compute the variation in TSC rate which is acceptable - * within the range of tolerance and decide if the - * rate being applied is within that bounds of the hardware - * rate. If so, no scaling or compensation need be done. 
- */ - thresh_lo = adjust_tsc_khz(tsc_khz, -tsc_tolerance_ppm); - thresh_hi = adjust_tsc_khz(tsc_khz, tsc_tolerance_ppm); - if (user_tsc_khz < thresh_lo || user_tsc_khz > thresh_hi) { - pr_debug("kvm: requested TSC rate %u falls outside tolerance [%u,%u]\n", user_tsc_khz, thresh_lo, thresh_hi); - use_scaling = 1; - } - return set_tsc_khz(vcpu, user_tsc_khz, use_scaling); -} - -static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns) -{ - u64 tsc = pvclock_scale_delta(kernel_ns-vcpu->arch.this_tsc_nsec, - vcpu->arch.virtual_tsc_mult, - vcpu->arch.virtual_tsc_shift); - tsc += vcpu->arch.this_tsc_write; - return tsc; -} - -static void kvm_track_tsc_matching(struct kvm_vcpu *vcpu) -{ -#ifdef CONFIG_X86_64 - bool vcpus_matched; - struct kvm_arch *ka = &vcpu->kvm->arch; - struct pvclock_gtod_data *gtod = &pvclock_gtod_data; - - vcpus_matched = (ka->nr_vcpus_matched_tsc + 1 == - atomic_read(&vcpu->kvm->online_vcpus)); - - /* - * Once the masterclock is enabled, always perform request in - * order to update it. - * - * In order to enable masterclock, the host clocksource must be TSC - * and the vcpus need to have matched TSCs. When that happens, - * perform request to enable masterclock. - */ - if (ka->use_master_clock || - (gtod->clock.vclock_mode == VCLOCK_TSC && vcpus_matched)) - kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu); - - trace_kvm_track_tsc(vcpu->vcpu_id, ka->nr_vcpus_matched_tsc, - atomic_read(&vcpu->kvm->online_vcpus), - ka->use_master_clock, gtod->clock.vclock_mode); -#endif -} +static DEFINE_PER_CPU(size_t, cpu_tsc_khz); +static size_t max_tsc_khz; static void update_ia32_tsc_adjust_msr(struct kvm_vcpu *vcpu, s64 offset) { @@ -1380,47 +872,19 @@ static void update_ia32_tsc_adjust_msr(struct kvm_vcpu *vcpu, s64 offset) vcpu->arch.ia32_tsc_adjust_msr += offset - curr_offset; } -/* - * Multiply tsc by a fixed point number represented by ratio. - * - * The most significant 64-N bits (mult) of ratio represent the - * integral part of the fixed point number; the remaining N bits - * (frac) represent the fractional part, ie. ratio represents a fixed - * point number (mult + frac * 2^(-N)). - * - * N equals to kvm_tsc_scaling_ratio_frac_bits. 
- */ -static inline u64 __scale_tsc(u64 ratio, u64 tsc) -{ - return mul_u64_u64_shr(tsc, ratio, kvm_tsc_scaling_ratio_frac_bits); -} - -u64 kvm_scale_tsc(struct kvm_vcpu *vcpu, u64 tsc) -{ - u64 _tsc = tsc; - u64 ratio = vcpu->arch.tsc_scaling_ratio; - - if (ratio != kvm_default_tsc_scaling_ratio) - _tsc = __scale_tsc(ratio, tsc); - - return _tsc; -} -EXPORT_SYMBOL_GPL(kvm_scale_tsc); - static u64 kvm_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc) { u64 tsc; - tsc = kvm_scale_tsc(vcpu, rdtsc()); + tsc = __rdtsc(); return target_tsc - tsc; } u64 kvm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc) { - return vcpu->arch.tsc_offset + kvm_scale_tsc(vcpu, host_tsc); + return vcpu->arch.tsc_offset + host_tsc; } -EXPORT_SYMBOL_GPL(kvm_read_l1_tsc); static void kvm_vcpu_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) { @@ -1430,128 +894,19 @@ static void kvm_vcpu_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr) { - struct kvm *kvm = vcpu->kvm; - u64 offset, ns, elapsed; - unsigned long flags; - s64 usdiff; - bool matched; - bool already_matched; + u64 offset; + //size_t flags; u64 data = msr->data; - raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags); + //spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags); offset = kvm_compute_tsc_offset(vcpu, data); - ns = ktime_get_boot_ns(); - elapsed = ns - kvm->arch.last_tsc_nsec; - - if (vcpu->arch.virtual_tsc_khz) { - int faulted = 0; - - /* n.b - signed multiplication and division required */ - usdiff = data - kvm->arch.last_tsc_write; -#ifdef CONFIG_X86_64 - usdiff = (usdiff * 1000) / vcpu->arch.virtual_tsc_khz; -#else - /* do_div() only does unsigned */ - asm("1: idivl %[divisor]\n" - "2: xor %%edx, %%edx\n" - " movl $0, %[faulted]\n" - "3:\n" - ".section .fixup,\"ax\"\n" - "4: movl $1, %[faulted]\n" - " jmp 3b\n" - ".previous\n" - - _ASM_EXTABLE(1b, 4b) - - : "=A"(usdiff), [faulted] "=r" (faulted) - : "A"(usdiff * 1000), [divisor] "rm"(vcpu->arch.virtual_tsc_khz)); - -#endif - do_div(elapsed, 1000); - usdiff -= elapsed; - if (usdiff < 0) - usdiff = -usdiff; - - /* idivl overflow => difference is larger than USEC_PER_SEC */ - if (faulted) - usdiff = USEC_PER_SEC; - } else - usdiff = USEC_PER_SEC; /* disable TSC match window below */ - - /* - * Special case: TSC write with a small delta (1 second) of virtual - * cycle time against real time is interpreted as an attempt to - * synchronize the CPU. - * - * For a reliable TSC, we can match TSC offsets, and for an unstable - * TSC, we add elapsed time in this computation. We could let the - * compensation code attempt to catch up if we fall behind, but - * it's better to try to match offsets from the beginning. - */ - if (usdiff < USEC_PER_SEC && - vcpu->arch.virtual_tsc_khz == kvm->arch.last_tsc_khz) { - if (!check_tsc_unstable()) { - offset = kvm->arch.cur_tsc_offset; - pr_debug("kvm: matched tsc offset for %llu\n", data); - } else { - u64 delta = nsec_to_cycles(vcpu, elapsed); - data += delta; - offset = kvm_compute_tsc_offset(vcpu, data); - pr_debug("kvm: adjusted tsc offset by %llu\n", delta); - } - matched = true; - already_matched = (vcpu->arch.this_tsc_generation == kvm->arch.cur_tsc_generation); - } else { - /* - * We split periods of matched TSC writes into generations. 
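With the TSC scaling and generation-tracking machinery stripped out here, the guest TSC becomes simply the host TSC plus a fixed per-vCPU offset: a guest (or userspace) write of value D stores offset = D - rdtsc(), and every later read returns host_tsc + offset. A small worked example with invented numbers, plus the read side restated as a hypothetical helper:

/*
 *   guest writes TSC      D      = 1,000,000
 *   host TSC at that time rdtsc  =   900,000
 *   stored offset                =   100,000   (D - rdtsc)
 *
 *   later, host TSC = 950,000  ->  guest reads 950,000 + 100,000 = 1,050,000
 */
static u64 guest_tsc_now(struct kvm_vcpu *vcpu)
{
	/* same arithmetic as kvm_read_l1_tsc() above */
	return vcpu->arch.tsc_offset + __rdtsc();
}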
- * For each generation, we track the original measured - * nanosecond time, offset, and write, so if TSCs are in - * sync, we can match exact offset, and if not, we can match - * exact software computation in compute_guest_tsc() - * - * These values are tracked in kvm->arch.cur_xxx variables. - */ - kvm->arch.cur_tsc_generation++; - kvm->arch.cur_tsc_nsec = ns; - kvm->arch.cur_tsc_write = data; - kvm->arch.cur_tsc_offset = offset; - matched = false; - pr_debug("kvm: new tsc generation %llu, clock %llu\n", - kvm->arch.cur_tsc_generation, data); - } - - /* - * We also track th most recent recorded KHZ, write and time to - * allow the matching interval to be extended at each write. - */ - kvm->arch.last_tsc_nsec = ns; - kvm->arch.last_tsc_write = data; - kvm->arch.last_tsc_khz = vcpu->arch.virtual_tsc_khz; - - vcpu->arch.last_guest_tsc = data; - - /* Keep track of which generation this VCPU has synchronized to */ - vcpu->arch.this_tsc_generation = kvm->arch.cur_tsc_generation; - vcpu->arch.this_tsc_nsec = kvm->arch.cur_tsc_nsec; - vcpu->arch.this_tsc_write = kvm->arch.cur_tsc_write; - if (guest_cpuid_has_tsc_adjust(vcpu) && !msr->host_initiated) update_ia32_tsc_adjust_msr(vcpu, offset); kvm_vcpu_write_tsc_offset(vcpu, offset); - raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags); - - spin_lock(&kvm->arch.pvclock_gtod_sync_lock); - if (!matched) { - kvm->arch.nr_vcpus_matched_tsc = 0; - } else if (!already_matched) { - kvm->arch.nr_vcpus_matched_tsc++; - } + //spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags); - kvm_track_tsc_matching(vcpu); - spin_unlock(&kvm->arch.pvclock_gtod_sync_lock); } -EXPORT_SYMBOL_GPL(kvm_write_tsc); static inline void adjust_tsc_offset_guest(struct kvm_vcpu *vcpu, s64 adjustment) @@ -1559,549 +914,16 @@ static inline void adjust_tsc_offset_guest(struct kvm_vcpu *vcpu, kvm_vcpu_write_tsc_offset(vcpu, vcpu->arch.tsc_offset + adjustment); } -static inline void adjust_tsc_offset_host(struct kvm_vcpu *vcpu, s64 adjustment) -{ - if (vcpu->arch.tsc_scaling_ratio != kvm_default_tsc_scaling_ratio) - WARN_ON(adjustment < 0); - adjustment = kvm_scale_tsc(vcpu, (u64) adjustment); - adjust_tsc_offset_guest(vcpu, adjustment); -} - -#ifdef CONFIG_X86_64 - -static cycle_t read_tsc(void) -{ - cycle_t ret = (cycle_t)rdtsc_ordered(); - u64 last = pvclock_gtod_data.clock.cycle_last; - - if (likely(ret >= last)) - return ret; - - /* - * GCC likes to generate cmov here, but this branch is extremely - * predictable (it's just a function of time and the likely is - * very likely) and there's a data dependence, so force GCC - * to generate a branch instead. I don't barrier() because - * we don't actually need a barrier, and if this function - * ever gets inlined it will generate worse code. 
- */ - asm volatile (""); - return last; -} - -static inline u64 vgettsc(cycle_t *cycle_now) -{ - long v; - struct pvclock_gtod_data *gtod = &pvclock_gtod_data; - - *cycle_now = read_tsc(); - - v = (*cycle_now - gtod->clock.cycle_last) & gtod->clock.mask; - return v * gtod->clock.mult; -} - -static int do_monotonic_boot(s64 *t, cycle_t *cycle_now) -{ - struct pvclock_gtod_data *gtod = &pvclock_gtod_data; - unsigned long seq; - int mode; - u64 ns; - - do { - seq = read_seqcount_begin(>od->seq); - mode = gtod->clock.vclock_mode; - ns = gtod->nsec_base; - ns += vgettsc(cycle_now); - ns >>= gtod->clock.shift; - ns += gtod->boot_ns; - } while (unlikely(read_seqcount_retry(>od->seq, seq))); - *t = ns; - - return mode; -} - -/* returns true if host is using tsc clocksource */ -static bool kvm_get_time_and_clockread(s64 *kernel_ns, cycle_t *cycle_now) -{ - /* checked again under seqlock below */ - if (pvclock_gtod_data.clock.vclock_mode != VCLOCK_TSC) - return false; - - return do_monotonic_boot(kernel_ns, cycle_now) == VCLOCK_TSC; -} -#endif - -/* - * - * Assuming a stable TSC across physical CPUS, and a stable TSC - * across virtual CPUs, the following condition is possible. - * Each numbered line represents an event visible to both - * CPUs at the next numbered event. - * - * "timespecX" represents host monotonic time. "tscX" represents - * RDTSC value. - * - * VCPU0 on CPU0 | VCPU1 on CPU1 - * - * 1. read timespec0,tsc0 - * 2. | timespec1 = timespec0 + N - * | tsc1 = tsc0 + M - * 3. transition to guest | transition to guest - * 4. ret0 = timespec0 + (rdtsc - tsc0) | - * 5. | ret1 = timespec1 + (rdtsc - tsc1) - * | ret1 = timespec0 + N + (rdtsc - (tsc0 + M)) - * - * Since ret0 update is visible to VCPU1 at time 5, to obey monotonicity: - * - * - ret0 < ret1 - * - timespec0 + (rdtsc - tsc0) < timespec0 + N + (rdtsc - (tsc0 + M)) - * ... - * - 0 < N - M => M < N - * - * That is, when timespec0 != timespec1, M < N. Unfortunately that is not - * always the case (the difference between two distinct xtime instances - * might be smaller then the difference between corresponding TSC reads, - * when updating guest vcpus pvclock areas). - * - * To avoid that problem, do not allow visibility of distinct - * system_timestamp/tsc_timestamp values simultaneously: use a master - * copy of host monotonic time values. Update that master copy - * in lockstep. - * - * Rely on synchronization of host TSCs and guest TSCs for monotonicity. - * - */ - -static void pvclock_update_vm_gtod_copy(struct kvm *kvm) -{ -#ifdef CONFIG_X86_64 - struct kvm_arch *ka = &kvm->arch; - int vclock_mode; - bool host_tsc_clocksource, vcpus_matched; - - vcpus_matched = (ka->nr_vcpus_matched_tsc + 1 == - atomic_read(&kvm->online_vcpus)); - - /* - * If the host uses TSC clock, then passthrough TSC as stable - * to the guest. 
- */ - host_tsc_clocksource = kvm_get_time_and_clockread( - &ka->master_kernel_ns, - &ka->master_cycle_now); - - ka->use_master_clock = host_tsc_clocksource && vcpus_matched - && !backwards_tsc_observed - && !ka->boot_vcpu_runs_old_kvmclock; - - if (ka->use_master_clock) - atomic_set(&kvm_guest_has_master_clock, 1); - - vclock_mode = pvclock_gtod_data.clock.vclock_mode; - trace_kvm_update_master_clock(ka->use_master_clock, vclock_mode, - vcpus_matched); -#endif -} - -void kvm_make_mclock_inprogress_request(struct kvm *kvm) -{ - kvm_make_all_cpus_request(kvm, KVM_REQ_MCLOCK_INPROGRESS); -} - -static void kvm_gen_update_masterclock(struct kvm *kvm) -{ -#ifdef CONFIG_X86_64 - int i; - struct kvm_vcpu *vcpu; - struct kvm_arch *ka = &kvm->arch; - - spin_lock(&ka->pvclock_gtod_sync_lock); - kvm_make_mclock_inprogress_request(kvm); - /* no guest entries from this point */ - pvclock_update_vm_gtod_copy(kvm); - - kvm_for_each_vcpu(i, vcpu, kvm) - kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); - - /* guest entries allowed */ - kvm_for_each_vcpu(i, vcpu, kvm) - clear_bit(KVM_REQ_MCLOCK_INPROGRESS, &vcpu->requests); - - spin_unlock(&ka->pvclock_gtod_sync_lock); -#endif -} - -static u64 __get_kvmclock_ns(struct kvm *kvm) -{ - struct kvm_arch *ka = &kvm->arch; - struct pvclock_vcpu_time_info hv_clock; - - spin_lock(&ka->pvclock_gtod_sync_lock); - if (!ka->use_master_clock) { - spin_unlock(&ka->pvclock_gtod_sync_lock); - return ktime_get_boot_ns() + ka->kvmclock_offset; - } - - hv_clock.tsc_timestamp = ka->master_cycle_now; - hv_clock.system_time = ka->master_kernel_ns + ka->kvmclock_offset; - spin_unlock(&ka->pvclock_gtod_sync_lock); - - kvm_get_time_scale(NSEC_PER_SEC, __this_cpu_read(cpu_tsc_khz) * 1000LL, - &hv_clock.tsc_shift, - &hv_clock.tsc_to_system_mul); - return __pvclock_read_cycles(&hv_clock, rdtsc()); -} - -u64 get_kvmclock_ns(struct kvm *kvm) -{ - unsigned long flags; - s64 ns; - - local_irq_save(flags); - ns = __get_kvmclock_ns(kvm); - local_irq_restore(flags); - - return ns; -} - -static void kvm_setup_pvclock_page(struct kvm_vcpu *v) -{ - struct kvm_vcpu_arch *vcpu = &v->arch; - struct pvclock_vcpu_time_info guest_hv_clock; - - if (unlikely(kvm_read_guest_cached(v->kvm, &vcpu->pv_time, - &guest_hv_clock, sizeof(guest_hv_clock)))) - return; - - /* This VCPU is paused, but it's legal for a guest to read another - * VCPU's kvmclock, so we really have to follow the specification where - * it says that version is odd if data is being modified, and even after - * it is consistent. - * - * Version field updates must be kept separate. This is because - * kvm_write_guest_cached might use a "rep movs" instruction, and - * writes within a string instruction are weakly ordered. So there - * are three writes overall. - * - * As a small optimization, only write the version field in the first - * and third write. The vcpu->pv_time cache is still valid, because the - * version field is the first in the struct. 
- */ - BUILD_BUG_ON(offsetof(struct pvclock_vcpu_time_info, version) != 0); - - vcpu->hv_clock.version = guest_hv_clock.version + 1; - kvm_write_guest_cached(v->kvm, &vcpu->pv_time, - &vcpu->hv_clock, - sizeof(vcpu->hv_clock.version)); - - smp_wmb(); - - /* retain PVCLOCK_GUEST_STOPPED if set in guest copy */ - vcpu->hv_clock.flags |= (guest_hv_clock.flags & PVCLOCK_GUEST_STOPPED); - - if (vcpu->pvclock_set_guest_stopped_request) { - vcpu->hv_clock.flags |= PVCLOCK_GUEST_STOPPED; - vcpu->pvclock_set_guest_stopped_request = false; - } - - trace_kvm_pvclock_update(v->vcpu_id, &vcpu->hv_clock); - - kvm_write_guest_cached(v->kvm, &vcpu->pv_time, - &vcpu->hv_clock, - sizeof(vcpu->hv_clock)); - - smp_wmb(); - - vcpu->hv_clock.version++; - kvm_write_guest_cached(v->kvm, &vcpu->pv_time, - &vcpu->hv_clock, - sizeof(vcpu->hv_clock.version)); -} - -static int kvm_guest_time_update(struct kvm_vcpu *v) -{ - unsigned long flags, tgt_tsc_khz; - struct kvm_vcpu_arch *vcpu = &v->arch; - struct kvm_arch *ka = &v->kvm->arch; - s64 kernel_ns; - u64 tsc_timestamp, host_tsc; - u8 pvclock_flags; - bool use_master_clock; - - kernel_ns = 0; - host_tsc = 0; - - /* - * If the host uses TSC clock, then passthrough TSC as stable - * to the guest. - */ - spin_lock(&ka->pvclock_gtod_sync_lock); - use_master_clock = ka->use_master_clock; - if (use_master_clock) { - host_tsc = ka->master_cycle_now; - kernel_ns = ka->master_kernel_ns; - } - spin_unlock(&ka->pvclock_gtod_sync_lock); - - /* Keep irq disabled to prevent changes to the clock */ - local_irq_save(flags); - tgt_tsc_khz = __this_cpu_read(cpu_tsc_khz); - if (unlikely(tgt_tsc_khz == 0)) { - local_irq_restore(flags); - kvm_make_request(KVM_REQ_CLOCK_UPDATE, v); - return 1; - } - if (!use_master_clock) { - host_tsc = rdtsc(); - kernel_ns = ktime_get_boot_ns(); - } - - tsc_timestamp = kvm_read_l1_tsc(v, host_tsc); - - /* - * We may have to catch up the TSC to match elapsed wall clock - * time for two reasons, even if kvmclock is used. - * 1) CPU could have been running below the maximum TSC rate - * 2) Broken TSC compensation resets the base at each VCPU - * entry to avoid unknown leaps of TSC even when running - * again on the same CPU. This may cause apparent elapsed - * time to disappear, and the guest to stand still or run - * very slowly. 
- */ - if (vcpu->tsc_catchup) { - u64 tsc = compute_guest_tsc(v, kernel_ns); - if (tsc > tsc_timestamp) { - adjust_tsc_offset_guest(v, tsc - tsc_timestamp); - tsc_timestamp = tsc; - } - } - - local_irq_restore(flags); - - /* With all the info we got, fill in the values */ - - if (kvm_has_tsc_control) - tgt_tsc_khz = kvm_scale_tsc(v, tgt_tsc_khz); - - if (unlikely(vcpu->hw_tsc_khz != tgt_tsc_khz)) { - kvm_get_time_scale(NSEC_PER_SEC, tgt_tsc_khz * 1000LL, - &vcpu->hv_clock.tsc_shift, - &vcpu->hv_clock.tsc_to_system_mul); - vcpu->hw_tsc_khz = tgt_tsc_khz; - } - - vcpu->hv_clock.tsc_timestamp = tsc_timestamp; - vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset; - vcpu->last_guest_tsc = tsc_timestamp; - - /* If the host uses TSC clocksource, then it is stable */ - pvclock_flags = 0; - if (use_master_clock) - pvclock_flags |= PVCLOCK_TSC_STABLE_BIT; - - vcpu->hv_clock.flags = pvclock_flags; - - if (vcpu->pv_time_enabled) - kvm_setup_pvclock_page(v); - if (v == kvm_get_vcpu(v->kvm, 0)) - kvm_hv_setup_tsc_page(v->kvm, &vcpu->hv_clock); - return 0; -} - -/* - * kvmclock updates which are isolated to a given vcpu, such as - * vcpu->cpu migration, should not allow system_timestamp from - * the rest of the vcpus to remain static. Otherwise ntp frequency - * correction applies to one vcpu's system_timestamp but not - * the others. - * - * So in those cases, request a kvmclock update for all vcpus. - * We need to rate-limit these requests though, as they can - * considerably slow guests that have a large number of vcpus. - * The time for a remote vcpu to update its kvmclock is bound - * by the delay we use to rate-limit the updates. - */ - -#define KVMCLOCK_UPDATE_DELAY msecs_to_jiffies(100) - -static void kvmclock_update_fn(struct work_struct *work) -{ - int i; - struct delayed_work *dwork = to_delayed_work(work); - struct kvm_arch *ka = container_of(dwork, struct kvm_arch, - kvmclock_update_work); - struct kvm *kvm = container_of(ka, struct kvm, arch); - struct kvm_vcpu *vcpu; - - kvm_for_each_vcpu(i, vcpu, kvm) { - kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); - kvm_vcpu_kick(vcpu); - } -} - -static void kvm_gen_kvmclock_update(struct kvm_vcpu *v) -{ - struct kvm *kvm = v->kvm; - - kvm_make_request(KVM_REQ_CLOCK_UPDATE, v); - schedule_delayed_work(&kvm->arch.kvmclock_update_work, - KVMCLOCK_UPDATE_DELAY); -} - -#define KVMCLOCK_SYNC_PERIOD (300 * HZ) - -static void kvmclock_sync_fn(struct work_struct *work) -{ - struct delayed_work *dwork = to_delayed_work(work); - struct kvm_arch *ka = container_of(dwork, struct kvm_arch, - kvmclock_sync_work); - struct kvm *kvm = container_of(ka, struct kvm, arch); - - if (!kvmclock_periodic_sync) - return; - - schedule_delayed_work(&kvm->arch.kvmclock_update_work, 0); - schedule_delayed_work(&kvm->arch.kvmclock_sync_work, - KVMCLOCK_SYNC_PERIOD); -} - -static int set_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 data) -{ - u64 mcg_cap = vcpu->arch.mcg_cap; - unsigned bank_num = mcg_cap & 0xff; - - switch (msr) { - case MSR_IA32_MCG_STATUS: - vcpu->arch.mcg_status = data; - break; - case MSR_IA32_MCG_CTL: - if (!(mcg_cap & MCG_CTL_P)) - return 1; - if (data != 0 && data != ~(u64)0) - return -1; - vcpu->arch.mcg_ctl = data; - break; - default: - if (msr >= MSR_IA32_MC0_CTL && - msr < MSR_IA32_MCx_CTL(bank_num)) { - u32 offset = msr - MSR_IA32_MC0_CTL; - /* only 0 or all 1s can be written to IA32_MCi_CTL - * some Linux kernels though clear bit 10 in bank 4 to - * workaround a BIOS/GART TBL issue on AMD K8s, ignore - * this to avoid an uncatched #GP 
in the guest - */ - if ((offset & 0x3) == 0 && - data != 0 && (data | (1 << 10)) != ~(u64)0) - return -1; - vcpu->arch.mce_banks[offset] = data; - break; - } - return 1; - } - return 0; -} - -static int xen_hvm_config(struct kvm_vcpu *vcpu, u64 data) -{ - struct kvm *kvm = vcpu->kvm; - int lm = is_long_mode(vcpu); - u8 *blob_addr = lm ? (u8 *)(long)kvm->arch.xen_hvm_config.blob_addr_64 - : (u8 *)(long)kvm->arch.xen_hvm_config.blob_addr_32; - u8 blob_size = lm ? kvm->arch.xen_hvm_config.blob_size_64 - : kvm->arch.xen_hvm_config.blob_size_32; - u32 page_num = data & ~PAGE_MASK; - u64 page_addr = data & PAGE_MASK; - u8 *page; - int r; - - r = -E2BIG; - if (page_num >= blob_size) - goto out; - r = -ENOMEM; - page = memdup_user(blob_addr + (page_num * PAGE_SIZE), PAGE_SIZE); - if (IS_ERR(page)) { - r = PTR_ERR(page); - goto out; - } - if (kvm_vcpu_write_guest(vcpu, page_addr, page, PAGE_SIZE)) - goto out_free; - r = 0; -out_free: - kfree(page); -out: - return r; -} - -static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data) -{ - gpa_t gpa = data & ~0x3f; - - /* Bits 2:5 are reserved, Should be zero */ - if (data & 0x3c) - return 1; - - vcpu->arch.apf.msr_val = data; - - if (!(data & KVM_ASYNC_PF_ENABLED)) { - kvm_clear_async_pf_completion_queue(vcpu); - kvm_async_pf_hash_reset(vcpu); - return 0; - } - - if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.apf.data, gpa, - sizeof(u32))) - return 1; - - vcpu->arch.apf.send_user_only = !(data & KVM_ASYNC_PF_SEND_ALWAYS); - kvm_async_pf_wakeup_all(vcpu); - return 0; -} - -static void kvmclock_reset(struct kvm_vcpu *vcpu) -{ - vcpu->arch.pv_time_enabled = false; -} - -static void record_steal_time(struct kvm_vcpu *vcpu) -{ - if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED)) - return; - - if (unlikely(kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.st.stime, - &vcpu->arch.st.steal, sizeof(struct kvm_steal_time)))) - return; - - if (vcpu->arch.st.steal.version & 1) - vcpu->arch.st.steal.version += 1; /* first time write, random junk */ - - vcpu->arch.st.steal.version += 1; - - kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.st.stime, - &vcpu->arch.st.steal, sizeof(struct kvm_steal_time)); - - smp_wmb(); - - vcpu->arch.st.steal.steal += current->sched_info.run_delay - - vcpu->arch.st.last_steal; - vcpu->arch.st.last_steal = current->sched_info.run_delay; - - kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.st.stime, - &vcpu->arch.st.steal, sizeof(struct kvm_steal_time)); - - smp_wmb(); - - vcpu->arch.st.steal.version += 1; - - kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.st.stime, - &vcpu->arch.st.steal, sizeof(struct kvm_steal_time)); -} - int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { - bool pr = false; + //bool pr = false; u32 msr = msr_info->index; u64 data = msr_info->data; + if (msr >= 0x200 && msr <= 0x2ff) + return kvm_mtrr_set_msr(vcpu, msr, data); + if (msr >= APIC_BASE_MSR && msr <= (APIC_BASE_MSR + 0x3ff)) + return kvm_x2apic_msr_write(vcpu, msr, data); switch (msr) { case MSR_AMD64_NB_CFG: case MSR_IA32_UCODE_REV: @@ -2143,15 +965,8 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) vcpu_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTLMSR 0x%llx, nop\n", __func__, data); break; - case 0x200 ... 0x2ff: - return kvm_mtrr_set_msr(vcpu, msr, data); case MSR_IA32_APICBASE: return kvm_set_apic_base(vcpu, msr_info); - case APIC_BASE_MSR ... 
APIC_BASE_MSR + 0x3ff: - return kvm_x2apic_msr_write(vcpu, msr, data); - case MSR_IA32_TSCDEADLINE: - kvm_set_lapic_tscdeadline_msr(vcpu, data); - break; case MSR_IA32_TSC_ADJUST: if (guest_cpuid_has_tsc_adjust(vcpu)) { if (!msr_info->host_initiated) { @@ -2169,81 +984,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return 1; vcpu->arch.smbase = data; break; - case MSR_KVM_WALL_CLOCK_NEW: - case MSR_KVM_WALL_CLOCK: - vcpu->kvm->arch.wall_clock = data; - kvm_write_wall_clock(vcpu->kvm, data); - break; - case MSR_KVM_SYSTEM_TIME_NEW: - case MSR_KVM_SYSTEM_TIME: { - u64 gpa_offset; - struct kvm_arch *ka = &vcpu->kvm->arch; - - kvmclock_reset(vcpu); - - if (vcpu->vcpu_id == 0 && !msr_info->host_initiated) { - bool tmp = (msr == MSR_KVM_SYSTEM_TIME); - - if (ka->boot_vcpu_runs_old_kvmclock != tmp) - set_bit(KVM_REQ_MASTERCLOCK_UPDATE, - &vcpu->requests); - - ka->boot_vcpu_runs_old_kvmclock = tmp; - } - - vcpu->arch.time = data; - kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu); - - /* we verify if the enable bit is set... */ - if (!(data & 1)) - break; - - gpa_offset = data & ~(PAGE_MASK | 1); - - if (kvm_gfn_to_hva_cache_init(vcpu->kvm, - &vcpu->arch.pv_time, data & ~1ULL, - sizeof(struct pvclock_vcpu_time_info))) - vcpu->arch.pv_time_enabled = false; - else - vcpu->arch.pv_time_enabled = true; - - break; - } - case MSR_KVM_ASYNC_PF_EN: - if (kvm_pv_enable_async_pf(vcpu, data)) - return 1; - break; - case MSR_KVM_STEAL_TIME: - - if (unlikely(!sched_info_on())) - return 1; - - if (data & KVM_STEAL_RESERVED_MASK) - return 1; - - if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.st.stime, - data & KVM_STEAL_VALID_BITS, - sizeof(struct kvm_steal_time))) - return 1; - - vcpu->arch.st.msr_val = data; - - if (!(data & KVM_MSR_ENABLED)) - break; - - kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu); - - break; - case MSR_KVM_PV_EOI_EN: - if (kvm_lapic_enable_pv_eoi(vcpu, data)) - return 1; - break; - - case MSR_IA32_MCG_CTL: - case MSR_IA32_MCG_STATUS: - case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1: - return set_msr_mce(vcpu, msr, data); - +#if 0 case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3: case MSR_P6_PERFCTR0 ... MSR_P6_PERFCTR1: pr = true; /* fall through */ @@ -2256,6 +997,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) vcpu_unimpl(vcpu, "disabled perfctr wrmsr: " "0x%x data 0x%llx\n", msr, data); break; +#endif case MSR_K7_CLK_CTL: /* * Ignore all writes to this no longer documented MSR. @@ -2266,18 +1008,14 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) * the need to ignore the workaround. */ break; - case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15: - case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4: - case HV_X64_MSR_CRASH_CTL: - case HV_X64_MSR_STIMER0_CONFIG ... HV_X64_MSR_STIMER3_COUNT: - return kvm_hv_set_msr_common(vcpu, msr, data, - msr_info->host_initiated); +#if 0 case MSR_IA32_BBL_CR_CTL3: /* Drop writes to this legacy MSR -- see rdmsr * counterpart for further detail. 
*/ vcpu_unimpl(vcpu, "ignored wrmsr: 0x%x data 0x%llx\n", msr, data); break; +#endif case MSR_AMD64_OSVW_ID_LENGTH: if (!guest_cpuid_has_osvw(vcpu)) return 1; @@ -2289,8 +1027,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) vcpu->arch.osvw.status = data; break; default: - if (msr && (msr == vcpu->kvm->arch.xen_hvm_config.msr)) - return xen_hvm_config(vcpu, data); +#if 0 if (kvm_pmu_is_valid_msr(vcpu, msr)) return kvm_pmu_set_msr(vcpu, msr_info); if (!ignore_msrs) { @@ -2302,10 +1039,11 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) msr, data); break; } +#endif + break; } return 0; } -EXPORT_SYMBOL_GPL(kvm_set_msr_common); /* @@ -2317,45 +1055,13 @@ int kvm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) { return kvm_x86_ops->get_msr(vcpu, msr); } -EXPORT_SYMBOL_GPL(kvm_get_msr); - -static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) -{ - u64 data; - u64 mcg_cap = vcpu->arch.mcg_cap; - unsigned bank_num = mcg_cap & 0xff; - - switch (msr) { - case MSR_IA32_P5_MC_ADDR: - case MSR_IA32_P5_MC_TYPE: - data = 0; - break; - case MSR_IA32_MCG_CAP: - data = vcpu->arch.mcg_cap; - break; - case MSR_IA32_MCG_CTL: - if (!(mcg_cap & MCG_CTL_P)) - return 1; - data = vcpu->arch.mcg_ctl; - break; - case MSR_IA32_MCG_STATUS: - data = vcpu->arch.mcg_status; - break; - default: - if (msr >= MSR_IA32_MC0_CTL && - msr < MSR_IA32_MCx_CTL(bank_num)) { - u32 offset = msr - MSR_IA32_MC0_CTL; - data = vcpu->arch.mce_banks[offset]; - break; - } - return 1; - } - *pdata = data; - return 0; -} int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { + if (msr_info->index >= 0x200 && msr_info->index <= 0x2ff) + return kvm_mtrr_get_msr(vcpu, msr_info->index, &msr_info->data); + if (msr_info->index >= APIC_BASE_MSR && msr_info->index <= (APIC_BASE_MSR + 0x3ff)) + return kvm_x2apic_msr_read(vcpu, msr_info->index, &msr_info->data); switch (msr_info->index) { case MSR_IA32_PLATFORM_ID: case MSR_IA32_EBL_CR_POWERON: @@ -2376,6 +1082,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_IA32_PERF_CTL: msr_info->data = 0; break; +#if 0 case MSR_K7_EVNTSEL0 ... MSR_K7_EVNTSEL3: case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3: case MSR_P6_PERFCTR0 ... MSR_P6_PERFCTR1: @@ -2384,11 +1091,11 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return kvm_pmu_get_msr(vcpu, msr_info->index, &msr_info->data); msr_info->data = 0; break; +#endif case MSR_IA32_UCODE_REV: msr_info->data = 0x100000000ULL; break; case MSR_MTRRcap: - case 0x200 ... 0x2ff: return kvm_mtrr_get_msr(vcpu, msr_info->index, &msr_info->data); case 0xcd: /* fsb frequency */ msr_info->data = 3; @@ -2410,12 +1117,6 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_IA32_APICBASE: msr_info->data = kvm_get_apic_base(vcpu); break; - case APIC_BASE_MSR ... 
APIC_BASE_MSR + 0x3ff: - return kvm_x2apic_msr_read(vcpu, msr_info->index, &msr_info->data); - break; - case MSR_IA32_TSCDEADLINE: - msr_info->data = kvm_get_lapic_tscdeadline_msr(vcpu); - break; case MSR_IA32_TSC_ADJUST: msr_info->data = (u64)vcpu->arch.ia32_tsc_adjust_msr; break; @@ -2436,30 +1137,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_EFER: msr_info->data = vcpu->arch.efer; break; - case MSR_KVM_WALL_CLOCK: - case MSR_KVM_WALL_CLOCK_NEW: - msr_info->data = vcpu->kvm->arch.wall_clock; - break; - case MSR_KVM_SYSTEM_TIME: - case MSR_KVM_SYSTEM_TIME_NEW: - msr_info->data = vcpu->arch.time; - break; - case MSR_KVM_ASYNC_PF_EN: - msr_info->data = vcpu->arch.apf.msr_val; - break; - case MSR_KVM_STEAL_TIME: - msr_info->data = vcpu->arch.st.msr_val; - break; - case MSR_KVM_PV_EOI_EN: - msr_info->data = vcpu->arch.pv_eoi.msr_val; - break; - case MSR_IA32_P5_MC_ADDR: - case MSR_IA32_P5_MC_TYPE: - case MSR_IA32_MCG_CAP: - case MSR_IA32_MCG_CTL: - case MSR_IA32_MCG_STATUS: - case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1: - return get_msr_mce(vcpu, msr_info->index, &msr_info->data); +#if 0 case MSR_K7_CLK_CTL: /* * Provide expected ramp-up count for K7. All other @@ -2472,13 +1150,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) */ msr_info->data = 0x20000000; break; - case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15: - case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4: - case HV_X64_MSR_CRASH_CTL: - case HV_X64_MSR_STIMER0_CONFIG ... HV_X64_MSR_STIMER3_COUNT: - return kvm_hv_get_msr_common(vcpu, - msr_info->index, &msr_info->data); - break; +#endif case MSR_IA32_BBL_CR_CTL3: /* This legacy MSR exists but isn't fully documented in current * silicon. It is however accessed by winxp in very narrow @@ -2503,6 +1175,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) msr_info->data = vcpu->arch.osvw.status; break; default: +#if 0 if (kvm_pmu_is_valid_msr(vcpu, msr_info->index)) return kvm_pmu_get_msr(vcpu, msr_info->index, &msr_info->data); if (!ignore_msrs) { @@ -2513,10 +1186,11 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) msr_info->data = 0; } break; +#endif + break; } return 0; } -EXPORT_SYMBOL_GPL(kvm_get_msr_common); /* * Read or write a bunch of msrs. All parameters are kernel addresses. @@ -2544,7 +1218,8 @@ static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs, * * @return number of msrs set successfully. 
*/ -static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs __user *user_msrs, +static int msr_io(PIRP pIrp, struct kvm_vcpu *vcpu, + struct kvm_msrs __user *user_msrs, int (*do_msr)(struct kvm_vcpu *vcpu, unsigned index, u64 *data), int writeback) @@ -2573,10 +1248,17 @@ static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs __user *user_msrs, if (r < 0) goto out_free; - r = -EFAULT; - if (writeback && copy_to_user(user_msrs->entries, entries, size)) + /* write back n of msrs handled here*/ + r = gvmUpdateReturnBuffer(pIrp, 0, &n, sizeof(n)); + if (r) goto out_free; + if (writeback) { + r = gvmUpdateReturnBuffer(pIrp, sizeof(msrs), entries, size); + if (r) + goto out_free; + } + r = n; out_free: @@ -2590,56 +1272,30 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) int r; switch (ext) { - case KVM_CAP_IRQCHIP: - case KVM_CAP_HLT: - case KVM_CAP_MMU_SHADOW_CACHE_CONTROL: - case KVM_CAP_SET_TSS_ADDR: - case KVM_CAP_EXT_CPUID: - case KVM_CAP_EXT_EMUL_CPUID: - case KVM_CAP_CLOCKSOURCE: - case KVM_CAP_PIT: - case KVM_CAP_NOP_IO_DELAY: - case KVM_CAP_MP_STATE: - case KVM_CAP_SYNC_MMU: - case KVM_CAP_USER_NMI: - case KVM_CAP_REINJECT_CONTROL: - case KVM_CAP_IRQ_INJECT_STATUS: - case KVM_CAP_IOEVENTFD: - case KVM_CAP_IOEVENTFD_NO_LENGTH: - case KVM_CAP_PIT2: - case KVM_CAP_PIT_STATE2: - case KVM_CAP_SET_IDENTITY_MAP_ADDR: - case KVM_CAP_XEN_HVM: - case KVM_CAP_VCPU_EVENTS: - case KVM_CAP_HYPERV: - case KVM_CAP_HYPERV_VAPIC: - case KVM_CAP_HYPERV_SPIN: - case KVM_CAP_HYPERV_SYNIC: - case KVM_CAP_PCI_SEGMENT: - case KVM_CAP_DEBUGREGS: - case KVM_CAP_X86_ROBUST_SINGLESTEP: - case KVM_CAP_XSAVE: - case KVM_CAP_ASYNC_PF: - case KVM_CAP_GET_TSC_KHZ: - case KVM_CAP_KVMCLOCK_CTRL: - case KVM_CAP_READONLY_MEM: - case KVM_CAP_HYPERV_TIME: - case KVM_CAP_IOAPIC_POLARITY_IGNORED: - case KVM_CAP_TSC_DEADLINE_TIMER: - case KVM_CAP_ENABLE_CAP_VM: - case KVM_CAP_DISABLE_QUIRKS: - case KVM_CAP_SET_BOOT_CPU_ID: - case KVM_CAP_SPLIT_IRQCHIP: -#ifdef CONFIG_KVM_DEVICE_ASSIGNMENT - case KVM_CAP_ASSIGN_DEV_IRQ: - case KVM_CAP_PCI_2_3: -#endif + case GVM_CAP_IRQCHIP: + case GVM_CAP_HLT: + case GVM_CAP_MMU_SHADOW_CACHE_CONTROL: + case GVM_CAP_EXT_EMUL_CPUID: + case GVM_CAP_NOP_IO_DELAY: + case GVM_CAP_SYNC_MMU: + case GVM_CAP_USER_NMI: + case GVM_CAP_REINJECT_CONTROL: + case GVM_CAP_SET_IDENTITY_MAP_ADDR: + case GVM_CAP_VCPU_EVENTS: r = 1; break; - case KVM_CAP_ADJUST_CLOCK: - r = KVM_CLOCK_TSC_STABLE; + case GVM_CAP_PCI_SEGMENT: + case GVM_CAP_DEBUGREGS: + case GVM_CAP_X86_ROBUST_SINGLESTEP: + case GVM_CAP_XSAVE: + case GVM_CAP_READONLY_MEM: + case GVM_CAP_IOAPIC_POLARITY_IGNORED: + case GVM_CAP_ENABLE_CAP_VM: + case GVM_CAP_DISABLE_QUIRKS: + case GVM_CAP_SET_BOOT_CPU_ID: + r = 0; break; - case KVM_CAP_X86_SMM: + case GVM_CAP_X86_SMM: /* SMBASE is usually relocated above 1M on modern chipsets, * and SMM handlers might indeed rely on 4G segment limits, * so do not report SMM to be available if real mode is @@ -2650,41 +1306,21 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) */ r = kvm_x86_ops->cpu_has_high_real_mode_segbase(); break; - case KVM_CAP_COALESCED_MMIO: - r = KVM_COALESCED_MMIO_PAGE_OFFSET; - break; - case KVM_CAP_VAPIC: + case GVM_CAP_VAPIC: r = !kvm_x86_ops->cpu_has_accelerated_tpr(); break; - case KVM_CAP_NR_VCPUS: - r = KVM_SOFT_MAX_VCPUS; - break; - case KVM_CAP_MAX_VCPUS: - r = KVM_MAX_VCPUS; - break; - case KVM_CAP_NR_MEMSLOTS: - r = KVM_USER_MEM_SLOTS; - break; - case KVM_CAP_PV_MMU: /* obsolete */ - r = 0; + case GVM_CAP_NR_VCPUS: + r = GVM_SOFT_MAX_VCPUS; break; 
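The hunks in this area replace Linux's copy_from_user()/copy_to_user() with the Windows buffered-I/O convention: ioctl input is read from the IRP's SystemBuffer, its length is taken from the current I/O stack location, and results are staged back with gvmUpdateReturnBuffer() at explicit byte offsets. The sketch below only illustrates that convention under stated assumptions; example_ioctl_handler(), its reply layout, and the 8-entry limit are hypothetical, while SystemBuffer, InputBufferLength, and gvmUpdateReturnBuffer() are the interfaces the patch itself uses.

static long example_ioctl_handler(struct gvm_device_extension *devext, PIRP pIrp)
{
	/* METHOD_BUFFERED: caller input arrives in the shared system buffer */
	__u32 *requested = pIrp->AssociatedIrp.SystemBuffer;
	size_t in_len = IoGetCurrentIrpStackLocation(pIrp)->
			Parameters.DeviceIoControl.InputBufferLength;
	__u32 values[8] = { 0 };
	__u32 n;
	long r;

	/* validate the caller-supplied input size before touching the buffer */
	if (in_len < sizeof(*requested))
		return -EINVAL;

	n = *requested < 8 ? *requested : 8;

	/* stage the reply: fixed-size count first, payload at its byte offset */
	r = gvmUpdateReturnBuffer(pIrp, 0, &n, sizeof(n));
	if (r)
		return r;
	return gvmUpdateReturnBuffer(pIrp, sizeof(n), values, n * sizeof(values[0]));
}

Writing results through the IRP rather than to a user-space pointer is what allows the same handlers to be reached from the NT device-ioctl dispatch path that kvm_arch_dev_ioctl() and kvm_arch_vcpu_ioctl() now take.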
-#ifdef CONFIG_KVM_DEVICE_ASSIGNMENT - case KVM_CAP_IOMMU: - r = iommu_present(&pci_bus_type); + case GVM_CAP_MAX_VCPUS: + r = GVM_MAX_VCPUS; break; -#endif - case KVM_CAP_MCE: - r = KVM_MAX_MCE_BANKS; + case GVM_CAP_NR_MEMSLOTS: + r = GVM_USER_MEM_SLOTS; break; - case KVM_CAP_XCRS: + case GVM_CAP_XCRS: r = boot_cpu_has(X86_FEATURE_XSAVE); break; - case KVM_CAP_TSC_CONTROL: - r = kvm_has_tsc_control; - break; - case KVM_CAP_X2APIC_API: - r = KVM_X2APIC_API_VALID_FLAGS; - break; default: r = 0; break; @@ -2693,64 +1329,53 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) } -long kvm_arch_dev_ioctl(struct file *filp, - unsigned int ioctl, unsigned long arg) +long kvm_arch_dev_ioctl(struct gvm_device_extension *devext, + PIRP pIrp, unsigned int ioctl) { - void __user *argp = (void __user *)arg; + void __user *argp = (void __user *)pIrp->AssociatedIrp.SystemBuffer; + size_t args = IoGetCurrentIrpStackLocation(pIrp)->Parameters.DeviceIoControl.InputBufferLength; long r; switch (ioctl) { - case KVM_GET_MSR_INDEX_LIST: { - struct kvm_msr_list __user *user_msr_list = argp; - struct kvm_msr_list msr_list; + case GVM_GET_MSR_INDEX_LIST: { + struct kvm_msr_list *msr_list = argp; unsigned n; - r = -EFAULT; - if (copy_from_user(&msr_list, user_msr_list, sizeof msr_list)) - goto out; - n = msr_list.nmsrs; - msr_list.nmsrs = num_msrs_to_save + num_emulated_msrs; - if (copy_to_user(user_msr_list, &msr_list, sizeof msr_list)) - goto out; - r = -E2BIG; - if (n < msr_list.nmsrs) + if (args < sizeof(struct kvm_msr_list)) { + r = -EINVAL; goto out; - r = -EFAULT; - if (copy_to_user(user_msr_list->indices, &msrs_to_save, - num_msrs_to_save * sizeof(u32))) + } + + r = STATUS_SUCCESS; + n = msr_list->nmsrs; + __u32 nmsrs = num_msrs_to_save; + r = gvmUpdateReturnBuffer(pIrp, 0, &nmsrs, sizeof(nmsrs)); + if (r) goto out; - if (copy_to_user(user_msr_list->indices + num_msrs_to_save, - &emulated_msrs, - num_emulated_msrs * sizeof(u32))) + + if (n < nmsrs) { + r = -E2BIG; goto out; - r = 0; + } + + r = gvmUpdateReturnBuffer(pIrp, sizeof(nmsrs), &msrs_to_save, + num_msrs_to_save * sizeof(u32)); break; } - case KVM_GET_SUPPORTED_CPUID: - case KVM_GET_EMULATED_CPUID: { - struct kvm_cpuid2 __user *cpuid_arg = argp; - struct kvm_cpuid2 cpuid; + case GVM_GET_SUPPORTED_CPUID: + case GVM_GET_EMULATED_CPUID: { + struct kvm_cpuid __user *cpuid_arg = argp; + struct kvm_cpuid cpuid; r = -EFAULT; if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid)) goto out; - r = kvm_dev_ioctl_get_cpuid(&cpuid, cpuid_arg->entries, + r = kvm_dev_ioctl_get_cpuid(pIrp, &cpuid, cpuid_arg->entries, ioctl); if (r) goto out; - r = -EFAULT; - if (copy_to_user(cpuid_arg, &cpuid, sizeof cpuid)) - goto out; - r = 0; - break; - } - case KVM_X86_GET_MCE_CAP_SUPPORTED: { - r = -EFAULT; - if (copy_to_user(argp, &kvm_mce_cap_supported, - sizeof(kvm_mce_cap_supported))) - goto out; r = 0; break; } @@ -2761,84 +1386,20 @@ out: return r; } -static void wbinvd_ipi(void *garbage) -{ - wbinvd(); -} - -static bool need_emulate_wbinvd(struct kvm_vcpu *vcpu) -{ - return kvm_arch_has_noncoherent_dma(vcpu->kvm); -} - -static inline void kvm_migrate_timers(struct kvm_vcpu *vcpu) -{ - set_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests); -} - void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { - /* Address WBINVD may be executed by guest */ - if (need_emulate_wbinvd(vcpu)) { - if (kvm_x86_ops->has_wbinvd_exit()) - cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask); - else if (vcpu->cpu != -1 && vcpu->cpu != cpu) - smp_call_function_single(vcpu->cpu, - 
wbinvd_ipi, NULL, 1); - } - kvm_x86_ops->vcpu_load(vcpu, cpu); - - /* Apply any externally detected TSC adjustments (due to suspend) */ - if (unlikely(vcpu->arch.tsc_offset_adjustment)) { - adjust_tsc_offset_host(vcpu, vcpu->arch.tsc_offset_adjustment); - vcpu->arch.tsc_offset_adjustment = 0; - kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); - } - - if (unlikely(vcpu->cpu != cpu) || check_tsc_unstable()) { - s64 tsc_delta = !vcpu->arch.last_host_tsc ? 0 : - rdtsc() - vcpu->arch.last_host_tsc; - if (tsc_delta < 0) - mark_tsc_unstable("KVM discovered backwards TSC"); - - if (check_tsc_unstable()) { - u64 offset = kvm_compute_tsc_offset(vcpu, - vcpu->arch.last_guest_tsc); - kvm_vcpu_write_tsc_offset(vcpu, offset); - vcpu->arch.tsc_catchup = 1; - } - if (kvm_lapic_hv_timer_in_use(vcpu) && - kvm_x86_ops->set_hv_timer(vcpu, - kvm_get_lapic_tscdeadline_msr(vcpu))) - kvm_lapic_switch_to_sw_timer(vcpu); - /* - * On a host with synchronized TSC, there is no need to update - * kvmclock on vcpu->cpu migration - */ - if (!vcpu->kvm->arch.use_master_clock || vcpu->cpu == -1) - kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu); - if (vcpu->cpu != cpu) - kvm_migrate_timers(vcpu); - vcpu->cpu = cpu; - } - - kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu); + vcpu->cpu = cpu; } void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) { kvm_x86_ops->vcpu_put(vcpu); - kvm_put_guest_fpu(vcpu); - vcpu->arch.last_host_tsc = rdtsc(); } static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s) { - if (vcpu->arch.apicv_active) - kvm_x86_ops->sync_pir_to_irr(vcpu); - return kvm_apic_get_state(vcpu, s); } @@ -2878,12 +1439,12 @@ static int kvm_vcpu_ready_for_interrupt_injection(struct kvm_vcpu *vcpu) static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, struct kvm_interrupt *irq) { - if (irq->irq >= KVM_NR_INTERRUPTS) + if (irq->irq >= GVM_NR_INTERRUPTS) return -EINVAL; if (!irqchip_in_kernel(vcpu->kvm)) { kvm_queue_interrupt(vcpu, irq->irq, false); - kvm_make_request(KVM_REQ_EVENT, vcpu); + kvm_make_request(GVM_REQ_EVENT, vcpu); return 0; } @@ -2898,7 +1459,7 @@ static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, return -EEXIST; vcpu->arch.pending_external_vector = irq->irq; - kvm_make_request(KVM_REQ_EVENT, vcpu); + kvm_make_request(GVM_REQ_EVENT, vcpu); return 0; } @@ -2911,7 +1472,7 @@ static int kvm_vcpu_ioctl_nmi(struct kvm_vcpu *vcpu) static int kvm_vcpu_ioctl_smi(struct kvm_vcpu *vcpu) { - kvm_make_request(KVM_REQ_SMI, vcpu); + kvm_make_request(GVM_REQ_SMI, vcpu); return 0; } @@ -2925,80 +1486,6 @@ static int vcpu_ioctl_tpr_access_reporting(struct kvm_vcpu *vcpu, return 0; } -static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu, - u64 mcg_cap) -{ - int r; - unsigned bank_num = mcg_cap & 0xff, bank; - - r = -EINVAL; - if (!bank_num || bank_num >= KVM_MAX_MCE_BANKS) - goto out; - if (mcg_cap & ~(kvm_mce_cap_supported | 0xff | 0xff0000)) - goto out; - r = 0; - vcpu->arch.mcg_cap = mcg_cap; - /* Init IA32_MCG_CTL to all 1s */ - if (mcg_cap & MCG_CTL_P) - vcpu->arch.mcg_ctl = ~(u64)0; - /* Init IA32_MCi_CTL to all 1s */ - for (bank = 0; bank < bank_num; bank++) - vcpu->arch.mce_banks[bank*4] = ~(u64)0; - - if (kvm_x86_ops->setup_mce) - kvm_x86_ops->setup_mce(vcpu); -out: - return r; -} - -static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu, - struct kvm_x86_mce *mce) -{ - u64 mcg_cap = vcpu->arch.mcg_cap; - unsigned bank_num = mcg_cap & 0xff; - u64 *banks = vcpu->arch.mce_banks; - - if (mce->bank >= bank_num || !(mce->status & MCI_STATUS_VAL)) - return -EINVAL; - /* - * if 
IA32_MCG_CTL is not all 1s, the uncorrected error - * reporting is disabled - */ - if ((mce->status & MCI_STATUS_UC) && (mcg_cap & MCG_CTL_P) && - vcpu->arch.mcg_ctl != ~(u64)0) - return 0; - banks += 4 * mce->bank; - /* - * if IA32_MCi_CTL is not all 1s, the uncorrected error - * reporting is disabled for the bank - */ - if ((mce->status & MCI_STATUS_UC) && banks[0] != ~(u64)0) - return 0; - if (mce->status & MCI_STATUS_UC) { - if ((vcpu->arch.mcg_status & MCG_STATUS_MCIP) || - !kvm_read_cr4_bits(vcpu, X86_CR4_MCE)) { - kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); - return 0; - } - if (banks[1] & MCI_STATUS_VAL) - mce->status |= MCI_STATUS_OVER; - banks[2] = mce->addr; - banks[3] = mce->misc; - vcpu->arch.mcg_status = mce->mcg_status; - banks[1] = mce->status; - kvm_queue_exception(vcpu, MC_VECTOR); - } else if (!(banks[1] & MCI_STATUS_VAL) - || !(banks[1] & MCI_STATUS_UC)) { - if (banks[1] & MCI_STATUS_VAL) - mce->status |= MCI_STATUS_OVER; - banks[2] = mce->addr; - banks[3] = mce->misc; - banks[1] = mce->status; - } else - banks[1] |= MCI_STATUS_OVER; - return 0; -} - static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu, struct kvm_vcpu_events *events) { @@ -3030,19 +1517,19 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu, !!(vcpu->arch.hflags & HF_SMM_INSIDE_NMI_MASK); events->smi.latched_init = kvm_lapic_latched_init(vcpu); - events->flags = (KVM_VCPUEVENT_VALID_NMI_PENDING - | KVM_VCPUEVENT_VALID_SHADOW - | KVM_VCPUEVENT_VALID_SMM); + events->flags = (GVM_VCPUEVENT_VALID_NMI_PENDING + | GVM_VCPUEVENT_VALID_SHADOW + | GVM_VCPUEVENT_VALID_SMM); memset(&events->reserved, 0, sizeof(events->reserved)); } static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, struct kvm_vcpu_events *events) { - if (events->flags & ~(KVM_VCPUEVENT_VALID_NMI_PENDING - | KVM_VCPUEVENT_VALID_SIPI_VECTOR - | KVM_VCPUEVENT_VALID_SHADOW - | KVM_VCPUEVENT_VALID_SMM)) + if (events->flags & ~(GVM_VCPUEVENT_VALID_NMI_PENDING + | GVM_VCPUEVENT_VALID_SIPI_VECTOR + | GVM_VCPUEVENT_VALID_SHADOW + | GVM_VCPUEVENT_VALID_SMM)) return -EINVAL; if (events->exception.injected && @@ -3058,20 +1545,20 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, vcpu->arch.interrupt.pending = events->interrupt.injected; vcpu->arch.interrupt.nr = events->interrupt.nr; vcpu->arch.interrupt.soft = events->interrupt.soft; - if (events->flags & KVM_VCPUEVENT_VALID_SHADOW) + if (events->flags & GVM_VCPUEVENT_VALID_SHADOW) kvm_x86_ops->set_interrupt_shadow(vcpu, events->interrupt.shadow); vcpu->arch.nmi_injected = events->nmi.injected; - if (events->flags & KVM_VCPUEVENT_VALID_NMI_PENDING) + if (events->flags & GVM_VCPUEVENT_VALID_NMI_PENDING) vcpu->arch.nmi_pending = events->nmi.pending; kvm_x86_ops->set_nmi_mask(vcpu, events->nmi.masked); - if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR && + if (events->flags & GVM_VCPUEVENT_VALID_SIPI_VECTOR && lapic_in_kernel(vcpu)) vcpu->arch.apic->sipi_vector = events->sipi_vector; - if (events->flags & KVM_VCPUEVENT_VALID_SMM) { + if (events->flags & GVM_VCPUEVENT_VALID_SMM) { if (events->smi.smm) vcpu->arch.hflags |= HF_SMM_MASK; else @@ -3083,13 +1570,13 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, vcpu->arch.hflags &= ~HF_SMM_INSIDE_NMI_MASK; if (lapic_in_kernel(vcpu)) { if (events->smi.latched_init) - set_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events); + set_bit(GVM_APIC_INIT, &vcpu->arch.apic->pending_events); else - clear_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events); + 
clear_bit(GVM_APIC_INIT, &vcpu->arch.apic->pending_events); } } - kvm_make_request(KVM_REQ_EVENT, vcpu); + kvm_make_request(GVM_REQ_EVENT, vcpu); return 0; } @@ -3097,7 +1584,7 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, static void kvm_vcpu_ioctl_x86_get_debugregs(struct kvm_vcpu *vcpu, struct kvm_debugregs *dbgregs) { - unsigned long val; + size_t val; memcpy(dbgregs->db, vcpu->arch.db, sizeof(vcpu->arch.db)); kvm_get_dr(vcpu, 6, &val); @@ -3128,11 +1615,87 @@ static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu, return 0; } +u64 xfeatures_mask; +static unsigned int xstate_offsets[XFEATURE_MAX] = { 0 }; +static unsigned int xstate_sizes[XFEATURE_MAX] = { 0 }; +static unsigned int xstate_comp_offsets[sizeof(xfeatures_mask)*8]; + +/* + * Note that in the future we will likely need a pair of + * functions here: one for user xstates and the other for + * system xstates. For now, they are the same. + */ +static int xfeature_enabled(enum xfeature xfeature) +{ + return !!(xfeatures_mask & ((u64)1 << xfeature)); +} + +/* + * Given an xstate feature mask, calculate where in the xsave + * buffer the state is. Callers should ensure that the buffer + * is valid. + * + * Note: does not work for compacted buffers. + */ +static void *__raw_xsave_addr(struct xregs_state *xsave, int xstate_feature_mask) +{ + int feature_nr = fls64(xstate_feature_mask) - 1; + + if (!xfeature_enabled(feature_nr)) { + return NULL; + } + + return (u8 *)xsave + xstate_comp_offsets[feature_nr]; +} + +/* + * Given the xsave area and a state inside, this function returns the + * address of the state. + * + * This is the API that is called to get xstate address in either + * standard format or compacted format of xsave area. + * + * Note that if there is no data for the field in the xsave buffer + * this will return NULL. + * + * Inputs: + * xstate: the thread's storage area for all FPU data + * xstate_feature: state which is defined in xsave.h (e.g. + * XFEATURE_MASK_FP, XFEATURE_MASK_SSE, etc...) + * Output: + * address of the state in the xsave area, or NULL if the + * field is not present in the xsave buffer. + */ +void *get_xsave_addr(struct xregs_state *xsave, int xstate_feature) +{ + /* + * Do we even *have* xsave state? + */ + if (!boot_cpu_has(X86_FEATURE_XSAVE)) + return NULL; + + /* + * This assumes the last 'xsave*' instruction to + * have requested that 'xstate_feature' be saved. + * If it did not, we might be seeing and old value + * of the field in the buffer. + * + * This can happen because the last 'xsave' did not + * request that this feature be saved (unlikely) + * or because the "init optimization" caused it + * to not be saved. 
+ */ + if (!(xsave->header.xfeatures & xstate_feature)) + return NULL; + + return __raw_xsave_addr(xsave, xstate_feature); +} + #define XSTATE_COMPACTION_ENABLED (1ULL << 63) static void fill_xsave(u8 *dest, struct kvm_vcpu *vcpu) { - struct xregs_state *xsave = &vcpu->arch.guest_fpu.state.xsave; + struct xregs_state *xsave = &vcpu->arch.guest_fpu.xsave; u64 xstate_bv = xsave->header.xfeatures; u64 valid; @@ -3151,7 +1714,7 @@ static void fill_xsave(u8 *dest, struct kvm_vcpu *vcpu) */ valid = xstate_bv & ~XFEATURE_MASK_FPSSE; while (valid) { - u64 feature = valid & -valid; + u64 feature = valid & -(s64)valid; int index = fls64(feature) - 1; void *src = get_xsave_addr(xsave, feature); @@ -3168,7 +1731,7 @@ static void fill_xsave(u8 *dest, struct kvm_vcpu *vcpu) static void load_xsave(struct kvm_vcpu *vcpu, u8 *src) { - struct xregs_state *xsave = &vcpu->arch.guest_fpu.state.xsave; + struct xregs_state *xsave = &vcpu->arch.guest_fpu.xsave; u64 xstate_bv = *(u64 *)(src + XSAVE_HDR_OFFSET); u64 valid; @@ -3189,7 +1752,7 @@ static void load_xsave(struct kvm_vcpu *vcpu, u8 *src) */ valid = xstate_bv & ~XFEATURE_MASK_FPSSE; while (valid) { - u64 feature = valid & -valid; + u64 feature = valid & -(s64)valid; int index = fls64(feature) - 1; void *dest = get_xsave_addr(xsave, feature); @@ -3212,7 +1775,7 @@ static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu, fill_xsave((u8 *) guest_xsave->region, vcpu); } else { memcpy(guest_xsave->region, - &vcpu->arch.guest_fpu.state.fxsave, + &vcpu->arch.guest_fpu.fxsave, sizeof(struct fxregs_state)); *(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)] = XFEATURE_MASK_FPSSE; @@ -3237,7 +1800,7 @@ static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu, } else { if (xstate_bv & ~XFEATURE_MASK_FPSSE) return -EINVAL; - memcpy(&vcpu->arch.guest_fpu.state.fxsave, + memcpy(&vcpu->arch.guest_fpu.fxsave, guest_xsave->region, sizeof(struct fxregs_state)); } return 0; @@ -3265,7 +1828,7 @@ static int kvm_vcpu_ioctl_x86_set_xcrs(struct kvm_vcpu *vcpu, if (!boot_cpu_has(X86_FEATURE_XSAVE)) return -EINVAL; - if (guest_xcrs->nr_xcrs > KVM_MAX_XCRS || guest_xcrs->flags) + if (guest_xcrs->nr_xcrs > GVM_MAX_XCRS || guest_xcrs->flags) return -EINVAL; for (i = 0; i < guest_xcrs->nr_xcrs; i++) @@ -3280,40 +1843,11 @@ static int kvm_vcpu_ioctl_x86_set_xcrs(struct kvm_vcpu *vcpu, return r; } -/* - * kvm_set_guest_paused() indicates to the guest kernel that it has been - * stopped by the hypervisor. This function will be called from the host only. - * EINVAL is returned when the host attempts to set the flag for a guest that - * does not support pv clocks. 
- */ -static int kvm_set_guest_paused(struct kvm_vcpu *vcpu) -{ - if (!vcpu->arch.pv_time_enabled) - return -EINVAL; - vcpu->arch.pvclock_set_guest_stopped_request = true; - kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); - return 0; -} - -static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu, - struct kvm_enable_cap *cap) +long kvm_arch_vcpu_ioctl(struct gvm_device_extension *devext, + PIRP pIrp, unsigned int ioctl) { - if (cap->flags) - return -EINVAL; - - switch (cap->cap) { - case KVM_CAP_HYPERV_SYNIC: - return kvm_hv_activate_synic(vcpu); - default: - return -EINVAL; - } -} - -long kvm_arch_vcpu_ioctl(struct file *filp, - unsigned int ioctl, unsigned long arg) -{ - struct kvm_vcpu *vcpu = filp->private_data; - void __user *argp = (void __user *)arg; + struct kvm_vcpu *vcpu = devext->PrivData; + void __user *argp = (void __user *)pIrp->AssociatedIrp.SystemBuffer; int r; union { struct kvm_lapic_state *lapic; @@ -3324,7 +1858,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp, u.buffer = NULL; switch (ioctl) { - case KVM_GET_LAPIC: { + case GVM_GET_LAPIC: { r = -EINVAL; if (!lapic_in_kernel(vcpu)) goto out; @@ -3336,13 +1870,11 @@ long kvm_arch_vcpu_ioctl(struct file *filp, r = kvm_vcpu_ioctl_get_lapic(vcpu, u.lapic); if (r) goto out; - r = -EFAULT; - if (copy_to_user(argp, u.lapic, sizeof(struct kvm_lapic_state))) - goto out; - r = 0; + r = gvmUpdateReturnBuffer(pIrp, 0, u.lapic, + sizeof(struct kvm_lapic_state)); break; } - case KVM_SET_LAPIC: { + case GVM_SET_LAPIC: { r = -EINVAL; if (!lapic_in_kernel(vcpu)) goto out; @@ -3353,7 +1885,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp, r = kvm_vcpu_ioctl_set_lapic(vcpu, u.lapic); break; } - case KVM_INTERRUPT: { + case GVM_INTERRUPT: { struct kvm_interrupt irq; r = -EFAULT; @@ -3362,59 +1894,50 @@ long kvm_arch_vcpu_ioctl(struct file *filp, r = kvm_vcpu_ioctl_interrupt(vcpu, &irq); break; } - case KVM_NMI: { + case GVM_NMI: { r = kvm_vcpu_ioctl_nmi(vcpu); break; } - case KVM_SMI: { + case GVM_SMI: { r = kvm_vcpu_ioctl_smi(vcpu); break; } - case KVM_SET_CPUID: { + case GVM_SET_CPUID: { struct kvm_cpuid __user *cpuid_arg = argp; struct kvm_cpuid cpuid; r = -EFAULT; if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid)) goto out; - r = kvm_vcpu_ioctl_set_cpuid(vcpu, &cpuid, cpuid_arg->entries); - break; - } - case KVM_SET_CPUID2: { - struct kvm_cpuid2 __user *cpuid_arg = argp; - struct kvm_cpuid2 cpuid; - - r = -EFAULT; - if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid)) - goto out; - r = kvm_vcpu_ioctl_set_cpuid2(vcpu, &cpuid, + r = kvm_vcpu_ioctl_set_cpuid(vcpu, &cpuid, cpuid_arg->entries); break; } - case KVM_GET_CPUID2: { - struct kvm_cpuid2 __user *cpuid_arg = argp; - struct kvm_cpuid2 cpuid; + case GVM_GET_CPUID: { + struct kvm_cpuid __user *cpuid_arg = argp; + struct kvm_cpuid cpuid; r = -EFAULT; if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid)) goto out; - r = kvm_vcpu_ioctl_get_cpuid2(vcpu, &cpuid, + r = kvm_vcpu_ioctl_get_cpuid(vcpu, &cpuid, cpuid_arg->entries); if (r) goto out; - r = -EFAULT; - if (copy_to_user(cpuid_arg, &cpuid, sizeof cpuid)) + r = gvmUpdateReturnBuffer(pIrp, 0, &cpuid, sizeof(cpuid)); + if (r) goto out; - r = 0; + r = gvmUpdateReturnBuffer(pIrp, sizeof(cpuid), &vcpu->arch.cpuid_entries, + vcpu->arch.cpuid_nent * sizeof(struct kvm_cpuid_entry)); break; } - case KVM_GET_MSRS: - r = msr_io(vcpu, argp, do_get_msr, 1); + case GVM_GET_MSRS: + r = msr_io(pIrp, vcpu, argp, do_get_msr, 1); break; - case KVM_SET_MSRS: - r = msr_io(vcpu, argp, do_set_msr, 0); + case GVM_SET_MSRS: + r = msr_io(pIrp, vcpu, argp, 
do_set_msr, 0); break; - case KVM_TPR_ACCESS_REPORTING: { + case GVM_TPR_ACCESS_REPORTING: { struct kvm_tpr_access_ctl tac; r = -EFAULT; @@ -3423,13 +1946,10 @@ long kvm_arch_vcpu_ioctl(struct file *filp, r = vcpu_ioctl_tpr_access_reporting(vcpu, &tac); if (r) goto out; - r = -EFAULT; - if (copy_to_user(argp, &tac, sizeof tac)) - goto out; - r = 0; + r = gvmUpdateReturnBuffer(pIrp, 0, &tac, sizeof(tac)); break; }; - case KVM_SET_VAPIC_ADDR: { + case GVM_SET_VAPIC_ADDR: { struct kvm_vapic_addr va; int idx; @@ -3444,36 +1964,16 @@ long kvm_arch_vcpu_ioctl(struct file *filp, srcu_read_unlock(&vcpu->kvm->srcu, idx); break; } - case KVM_X86_SETUP_MCE: { - u64 mcg_cap; - - r = -EFAULT; - if (copy_from_user(&mcg_cap, argp, sizeof mcg_cap)) - goto out; - r = kvm_vcpu_ioctl_x86_setup_mce(vcpu, mcg_cap); - break; - } - case KVM_X86_SET_MCE: { - struct kvm_x86_mce mce; - - r = -EFAULT; - if (copy_from_user(&mce, argp, sizeof mce)) - goto out; - r = kvm_vcpu_ioctl_x86_set_mce(vcpu, &mce); - break; - } - case KVM_GET_VCPU_EVENTS: { + case GVM_GET_VCPU_EVENTS: { struct kvm_vcpu_events events; kvm_vcpu_ioctl_x86_get_vcpu_events(vcpu, &events); - r = -EFAULT; - if (copy_to_user(argp, &events, sizeof(struct kvm_vcpu_events))) - break; - r = 0; + r = gvmUpdateReturnBuffer(pIrp, 0, &events, + sizeof(struct kvm_vcpu_events)); break; } - case KVM_SET_VCPU_EVENTS: { + case GVM_SET_VCPU_EVENTS: { struct kvm_vcpu_events events; r = -EFAULT; @@ -3483,19 +1983,16 @@ long kvm_arch_vcpu_ioctl(struct file *filp, r = kvm_vcpu_ioctl_x86_set_vcpu_events(vcpu, &events); break; } - case KVM_GET_DEBUGREGS: { + case GVM_GET_DEBUGREGS: { struct kvm_debugregs dbgregs; kvm_vcpu_ioctl_x86_get_debugregs(vcpu, &dbgregs); - r = -EFAULT; - if (copy_to_user(argp, &dbgregs, - sizeof(struct kvm_debugregs))) - break; - r = 0; + r = gvmUpdateReturnBuffer(pIrp, 0, &dbgregs, + sizeof(struct kvm_debugregs)); break; } - case KVM_SET_DEBUGREGS: { + case GVM_SET_DEBUGREGS: { struct kvm_debugregs dbgregs; r = -EFAULT; @@ -3506,7 +2003,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp, r = kvm_vcpu_ioctl_x86_set_debugregs(vcpu, &dbgregs); break; } - case KVM_GET_XSAVE: { + case GVM_GET_XSAVE: { u.xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL); r = -ENOMEM; if (!u.xsave) @@ -3514,13 +2011,11 @@ long kvm_arch_vcpu_ioctl(struct file *filp, kvm_vcpu_ioctl_x86_get_xsave(vcpu, u.xsave); - r = -EFAULT; - if (copy_to_user(argp, u.xsave, sizeof(struct kvm_xsave))) - break; - r = 0; + r = gvmUpdateReturnBuffer(pIrp, 0, u.xsave, + sizeof(struct kvm_xsave)); break; } - case KVM_SET_XSAVE: { + case GVM_SET_XSAVE: { u.xsave = memdup_user(argp, sizeof(*u.xsave)); if (IS_ERR(u.xsave)) return PTR_ERR(u.xsave); @@ -3528,7 +2023,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp, r = kvm_vcpu_ioctl_x86_set_xsave(vcpu, u.xsave); break; } - case KVM_GET_XCRS: { + case GVM_GET_XCRS: { u.xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL); r = -ENOMEM; if (!u.xcrs) @@ -3536,14 +2031,11 @@ long kvm_arch_vcpu_ioctl(struct file *filp, kvm_vcpu_ioctl_x86_get_xcrs(vcpu, u.xcrs); - r = -EFAULT; - if (copy_to_user(argp, u.xcrs, - sizeof(struct kvm_xcrs))) - break; - r = 0; + r = gvmUpdateReturnBuffer(pIrp, 0, u.xcrs, + sizeof(struct kvm_xcrs)); break; } - case KVM_SET_XCRS: { + case GVM_SET_XCRS: { u.xcrs = memdup_user(argp, sizeof(*u.xcrs)); if (IS_ERR(u.xcrs)) return PTR_ERR(u.xcrs); @@ -3551,40 +2043,6 @@ long kvm_arch_vcpu_ioctl(struct file *filp, r = kvm_vcpu_ioctl_x86_set_xcrs(vcpu, u.xcrs); break; } - case KVM_SET_TSC_KHZ: { - u32 user_tsc_khz; - - r = -EINVAL; - 
user_tsc_khz = (u32)arg; - - if (user_tsc_khz >= kvm_max_guest_tsc_khz) - goto out; - - if (user_tsc_khz == 0) - user_tsc_khz = tsc_khz; - - if (!kvm_set_tsc_khz(vcpu, user_tsc_khz)) - r = 0; - - goto out; - } - case KVM_GET_TSC_KHZ: { - r = vcpu->arch.virtual_tsc_khz; - goto out; - } - case KVM_KVMCLOCK_CTRL: { - r = kvm_set_guest_paused(vcpu); - goto out; - } - case KVM_ENABLE_CAP: { - struct kvm_enable_cap cap; - - r = -EFAULT; - if (copy_from_user(&cap, argp, sizeof(cap))) - goto out; - r = kvm_vcpu_ioctl_enable_cap(vcpu, &cap); - break; - } default: r = -EINVAL; } @@ -3593,12 +2051,7 @@ out: return r; } -int kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf) -{ - return VM_FAULT_SIGBUS; -} - -static int kvm_vm_ioctl_set_tss_addr(struct kvm *kvm, unsigned long addr) +static int kvm_vm_ioctl_set_tss_addr(struct kvm *kvm, size_t addr) { int ret; @@ -3618,7 +2071,7 @@ static int kvm_vm_ioctl_set_identity_map_addr(struct kvm *kvm, static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm, u32 kvm_nr_mmu_pages) { - if (kvm_nr_mmu_pages < KVM_MIN_ALLOC_MMU_PAGES) + if (kvm_nr_mmu_pages < GVM_MIN_ALLOC_MMU_PAGES) return -EINVAL; mutex_lock(&kvm->slots_lock); @@ -3641,17 +2094,17 @@ static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) r = 0; switch (chip->chip_id) { - case KVM_IRQCHIP_PIC_MASTER: + case GVM_IRQCHIP_PIC_MASTER: memcpy(&chip->chip.pic, &pic_irqchip(kvm)->pics[0], sizeof(struct kvm_pic_state)); break; - case KVM_IRQCHIP_PIC_SLAVE: + case GVM_IRQCHIP_PIC_SLAVE: memcpy(&chip->chip.pic, &pic_irqchip(kvm)->pics[1], sizeof(struct kvm_pic_state)); break; - case KVM_IRQCHIP_IOAPIC: + case GVM_IRQCHIP_IOAPIC: r = kvm_get_ioapic(kvm, &chip->chip.ioapic); break; default: @@ -3667,21 +2120,21 @@ static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) r = 0; switch (chip->chip_id) { - case KVM_IRQCHIP_PIC_MASTER: + case GVM_IRQCHIP_PIC_MASTER: spin_lock(&pic_irqchip(kvm)->lock); memcpy(&pic_irqchip(kvm)->pics[0], &chip->chip.pic, sizeof(struct kvm_pic_state)); spin_unlock(&pic_irqchip(kvm)->lock); break; - case KVM_IRQCHIP_PIC_SLAVE: + case GVM_IRQCHIP_PIC_SLAVE: spin_lock(&pic_irqchip(kvm)->lock); memcpy(&pic_irqchip(kvm)->pics[1], &chip->chip.pic, sizeof(struct kvm_pic_state)); spin_unlock(&pic_irqchip(kvm)->lock); break; - case KVM_IRQCHIP_IOAPIC: + case GVM_IRQCHIP_IOAPIC: r = kvm_set_ioapic(kvm, &chip->chip.ioapic); break; default: @@ -3692,83 +2145,6 @@ static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) return r; } -static int kvm_vm_ioctl_get_pit(struct kvm *kvm, struct kvm_pit_state *ps) -{ - struct kvm_kpit_state *kps = &kvm->arch.vpit->pit_state; - - BUILD_BUG_ON(sizeof(*ps) != sizeof(kps->channels)); - - mutex_lock(&kps->lock); - memcpy(ps, &kps->channels, sizeof(*ps)); - mutex_unlock(&kps->lock); - return 0; -} - -static int kvm_vm_ioctl_set_pit(struct kvm *kvm, struct kvm_pit_state *ps) -{ - int i; - struct kvm_pit *pit = kvm->arch.vpit; - - mutex_lock(&pit->pit_state.lock); - memcpy(&pit->pit_state.channels, ps, sizeof(*ps)); - for (i = 0; i < 3; i++) - kvm_pit_load_count(pit, i, ps->channels[i].count, 0); - mutex_unlock(&pit->pit_state.lock); - return 0; -} - -static int kvm_vm_ioctl_get_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps) -{ - mutex_lock(&kvm->arch.vpit->pit_state.lock); - memcpy(ps->channels, &kvm->arch.vpit->pit_state.channels, - sizeof(ps->channels)); - ps->flags = kvm->arch.vpit->pit_state.flags; - mutex_unlock(&kvm->arch.vpit->pit_state.lock); - memset(&ps->reserved, 0, 
sizeof(ps->reserved)); - return 0; -} - -static int kvm_vm_ioctl_set_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps) -{ - int start = 0; - int i; - u32 prev_legacy, cur_legacy; - struct kvm_pit *pit = kvm->arch.vpit; - - mutex_lock(&pit->pit_state.lock); - prev_legacy = pit->pit_state.flags & KVM_PIT_FLAGS_HPET_LEGACY; - cur_legacy = ps->flags & KVM_PIT_FLAGS_HPET_LEGACY; - if (!prev_legacy && cur_legacy) - start = 1; - memcpy(&pit->pit_state.channels, &ps->channels, - sizeof(pit->pit_state.channels)); - pit->pit_state.flags = ps->flags; - for (i = 0; i < 3; i++) - kvm_pit_load_count(pit, i, pit->pit_state.channels[i].count, - start && i == 0); - mutex_unlock(&pit->pit_state.lock); - return 0; -} - -static int kvm_vm_ioctl_reinject(struct kvm *kvm, - struct kvm_reinject_control *control) -{ - struct kvm_pit *pit = kvm->arch.vpit; - - if (!pit) - return -ENXIO; - - /* pit->pit_state.lock was overloaded to prevent userspace from getting - * an inconsistent state after running multiple KVM_REINJECT_CONTROL - * ioctls in parallel. Use a separate lock if that ioctl isn't rare. - */ - mutex_lock(&pit->pit_state.lock); - kvm_pit_set_reinject(pit, control->pit_reinject); - mutex_unlock(&pit->pit_state.lock); - - return 0; -} - /** * kvm_vm_ioctl_get_dirty_log - get and clear the log of dirty pages in a slot * @kvm: kvm instance @@ -3779,7 +2155,7 @@ static int kvm_vm_ioctl_reinject(struct kvm *kvm, * * We call kvm_get_dirty_log_protect() to handle steps 1-3, upon return we * always flush the TLB (step 4) even if previous step failed and the dirty - * bitmap may be corrupt. Regardless of previous outcome the KVM logging API + * bitmap may be corrupt. Regardless of previous outcome the kvm logging API * does not preclude user space subsequent dirty log read. Flushing TLB ensures * writes will be marked dirty for next log read. * @@ -3791,7 +2167,7 @@ static int kvm_vm_ioctl_reinject(struct kvm *kvm, int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log) { bool is_dirty = false; - int r; + int r = 0; mutex_lock(&kvm->slots_lock); @@ -3807,7 +2183,6 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log) * All the TLBs can be flushed out of mmu lock, see the comments in * kvm_mmu_slot_remove_write_access(). */ - lockdep_assert_held(&kvm->slots_lock); if (is_dirty) kvm_flush_remote_tlbs(kvm); @@ -3821,7 +2196,7 @@ int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event, if (!irqchip_in_kernel(kvm)) return -ENXIO; - irq_event->status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, + irq_event->status = kvm_set_irq(kvm, GVM_USERSPACE_IRQ_SOURCE_ID, irq_event->irq, irq_event->level, line_status); return 0; @@ -3836,44 +2211,10 @@ static int kvm_vm_ioctl_enable_cap(struct kvm *kvm, return -EINVAL; switch (cap->cap) { - case KVM_CAP_DISABLE_QUIRKS: + case GVM_CAP_DISABLE_QUIRKS: kvm->arch.disabled_quirks = cap->args[0]; r = 0; break; - case KVM_CAP_SPLIT_IRQCHIP: { - mutex_lock(&kvm->lock); - r = -EINVAL; - if (cap->args[0] > MAX_NR_RESERVED_IOAPIC_PINS) - goto split_irqchip_unlock; - r = -EEXIST; - if (irqchip_in_kernel(kvm)) - goto split_irqchip_unlock; - if (kvm->created_vcpus) - goto split_irqchip_unlock; - r = kvm_setup_empty_irq_routing(kvm); - if (r) - goto split_irqchip_unlock; - /* Pairs with irqchip_in_kernel. 
*/ - smp_wmb(); - kvm->arch.irqchip_split = true; - kvm->arch.nr_reserved_ioapic_pins = cap->args[0]; - r = 0; -split_irqchip_unlock: - mutex_unlock(&kvm->lock); - break; - } - case KVM_CAP_X2APIC_API: - r = -EINVAL; - if (cap->args[0] & ~KVM_X2APIC_API_VALID_FLAGS) - break; - - if (cap->args[0] & KVM_X2APIC_API_USE_32BIT_IDS) - kvm->arch.x2apic_format = true; - if (cap->args[0] & KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK) - kvm->arch.x2apic_broadcast_quirk_disabled = true; - - r = 0; - break; default: r = -EINVAL; break; @@ -3881,43 +2222,39 @@ split_irqchip_unlock: return r; } -long kvm_arch_vm_ioctl(struct file *filp, - unsigned int ioctl, unsigned long arg) +long kvm_arch_vm_ioctl(struct gvm_device_extension *devext, + PIRP pIrp, unsigned int ioctl) { - struct kvm *kvm = filp->private_data; - void __user *argp = (void __user *)arg; + struct kvm *kvm = devext->PrivData; + void __user *argp = (void __user *)pIrp->AssociatedIrp.SystemBuffer; int r = -ENOTTY; - /* - * This union makes it completely explicit to gcc-3.x - * that these two variables' stack usage should be - * combined, not added together. - */ - union { - struct kvm_pit_state ps; - struct kvm_pit_state2 ps2; - struct kvm_pit_config pit_config; - } u; switch (ioctl) { - case KVM_SET_TSS_ADDR: - r = kvm_vm_ioctl_set_tss_addr(kvm, arg); + case GVM_SET_TSS_ADDR: + r = -EFAULT; + if (IoGetCurrentIrpStackLocation(pIrp)->Parameters.DeviceIoControl.InputBufferLength + < sizeof(size_t)) + goto out; + r = kvm_vm_ioctl_set_tss_addr(kvm, *(size_t *)argp); break; - case KVM_SET_IDENTITY_MAP_ADDR: { + case GVM_SET_IDENTITY_MAP_ADDR: { u64 ident_addr; r = -EFAULT; - if (copy_from_user(&ident_addr, argp, sizeof ident_addr)) + if (IoGetCurrentIrpStackLocation(pIrp)->Parameters.DeviceIoControl.InputBufferLength + < sizeof(ident_addr)) goto out; + ident_addr = *(u64 *)argp; r = kvm_vm_ioctl_set_identity_map_addr(kvm, ident_addr); break; } - case KVM_SET_NR_MMU_PAGES: - r = kvm_vm_ioctl_set_nr_mmu_pages(kvm, arg); + case GVM_SET_NR_MMU_PAGES: + r = kvm_vm_ioctl_set_nr_mmu_pages(kvm, *(unsigned int*)argp); break; - case KVM_GET_NR_MMU_PAGES: + case GVM_GET_NR_MMU_PAGES: r = kvm_vm_ioctl_get_nr_mmu_pages(kvm); break; - case KVM_CREATE_IRQCHIP: { + case GVM_CREATE_IRQCHIP: { struct kvm_pic *vpic; mutex_lock(&kvm->lock); @@ -3956,27 +2293,7 @@ long kvm_arch_vm_ioctl(struct file *filp, mutex_unlock(&kvm->lock); break; } - case KVM_CREATE_PIT: - u.pit_config.flags = KVM_PIT_SPEAKER_DUMMY; - goto create_pit; - case KVM_CREATE_PIT2: - r = -EFAULT; - if (copy_from_user(&u.pit_config, argp, - sizeof(struct kvm_pit_config))) - goto out; - create_pit: - mutex_lock(&kvm->lock); - r = -EEXIST; - if (kvm->arch.vpit) - goto create_pit_unlock; - r = -ENOMEM; - kvm->arch.vpit = kvm_create_pit(kvm, u.pit_config.flags); - if (kvm->arch.vpit) - r = 0; - create_pit_unlock: - mutex_unlock(&kvm->lock); - break; - case KVM_GET_IRQCHIP: { + case GVM_GET_IRQCHIP: { /* 0: PIC master, 1: PIC slave, 2: IOAPIC */ struct kvm_irqchip *chip; @@ -3987,20 +2304,17 @@ long kvm_arch_vm_ioctl(struct file *filp, } r = -ENXIO; - if (!irqchip_in_kernel(kvm) || irqchip_split(kvm)) + if (!irqchip_in_kernel(kvm)) goto get_irqchip_out; r = kvm_vm_ioctl_get_irqchip(kvm, chip); if (r) goto get_irqchip_out; - r = -EFAULT; - if (copy_to_user(argp, chip, sizeof *chip)) - goto get_irqchip_out; - r = 0; + r = gvmUpdateReturnBuffer(pIrp, 0, chip, sizeof(*chip)); get_irqchip_out: kfree(chip); break; } - case KVM_SET_IRQCHIP: { + case GVM_SET_IRQCHIP: { /* 0: PIC master, 1: PIC slave, 2: IOAPIC */ 
struct kvm_irqchip *chip; @@ -4011,7 +2325,7 @@ long kvm_arch_vm_ioctl(struct file *filp, } r = -ENXIO; - if (!irqchip_in_kernel(kvm) || irqchip_split(kvm)) + if (!irqchip_in_kernel(kvm)) goto set_irqchip_out; r = kvm_vm_ioctl_set_irqchip(kvm, chip); if (r) @@ -4021,121 +2335,16 @@ long kvm_arch_vm_ioctl(struct file *filp, kfree(chip); break; } - case KVM_GET_PIT: { - r = -EFAULT; - if (copy_from_user(&u.ps, argp, sizeof(struct kvm_pit_state))) - goto out; - r = -ENXIO; - if (!kvm->arch.vpit) - goto out; - r = kvm_vm_ioctl_get_pit(kvm, &u.ps); - if (r) - goto out; - r = -EFAULT; - if (copy_to_user(argp, &u.ps, sizeof(struct kvm_pit_state))) - goto out; - r = 0; - break; - } - case KVM_SET_PIT: { - r = -EFAULT; - if (copy_from_user(&u.ps, argp, sizeof u.ps)) - goto out; - r = -ENXIO; - if (!kvm->arch.vpit) - goto out; - r = kvm_vm_ioctl_set_pit(kvm, &u.ps); - break; - } - case KVM_GET_PIT2: { - r = -ENXIO; - if (!kvm->arch.vpit) - goto out; - r = kvm_vm_ioctl_get_pit2(kvm, &u.ps2); - if (r) - goto out; - r = -EFAULT; - if (copy_to_user(argp, &u.ps2, sizeof(u.ps2))) - goto out; - r = 0; - break; - } - case KVM_SET_PIT2: { - r = -EFAULT; - if (copy_from_user(&u.ps2, argp, sizeof(u.ps2))) - goto out; - r = -ENXIO; - if (!kvm->arch.vpit) - goto out; - r = kvm_vm_ioctl_set_pit2(kvm, &u.ps2); - break; - } - case KVM_REINJECT_CONTROL: { - struct kvm_reinject_control control; - r = -EFAULT; - if (copy_from_user(&control, argp, sizeof(control))) - goto out; - r = kvm_vm_ioctl_reinject(kvm, &control); - break; - } - case KVM_SET_BOOT_CPU_ID: + case GVM_SET_BOOT_CPU_ID: r = 0; mutex_lock(&kvm->lock); if (kvm->created_vcpus) r = -EBUSY; else - kvm->arch.bsp_vcpu_id = arg; + kvm->arch.bsp_vcpu_id = *(u32 *)argp; mutex_unlock(&kvm->lock); break; - case KVM_XEN_HVM_CONFIG: { - r = -EFAULT; - if (copy_from_user(&kvm->arch.xen_hvm_config, argp, - sizeof(struct kvm_xen_hvm_config))) - goto out; - r = -EINVAL; - if (kvm->arch.xen_hvm_config.flags) - goto out; - r = 0; - break; - } - case KVM_SET_CLOCK: { - struct kvm_clock_data user_ns; - u64 now_ns; - - r = -EFAULT; - if (copy_from_user(&user_ns, argp, sizeof(user_ns))) - goto out; - - r = -EINVAL; - if (user_ns.flags) - goto out; - - r = 0; - local_irq_disable(); - now_ns = __get_kvmclock_ns(kvm); - kvm->arch.kvmclock_offset += user_ns.clock - now_ns; - local_irq_enable(); - kvm_gen_update_masterclock(kvm); - break; - } - case KVM_GET_CLOCK: { - struct kvm_clock_data user_ns; - u64 now_ns; - - local_irq_disable(); - now_ns = __get_kvmclock_ns(kvm); - user_ns.clock = now_ns; - user_ns.flags = kvm->arch.use_master_clock ? 
KVM_CLOCK_TSC_STABLE : 0; - local_irq_enable(); - memset(&user_ns.pad, 0, sizeof(user_ns.pad)); - - r = -EFAULT; - if (copy_to_user(argp, &user_ns, sizeof(user_ns))) - goto out; - r = 0; - break; - } - case KVM_ENABLE_CAP: { + case GVM_ENABLE_CAP: { struct kvm_enable_cap cap; r = -EFAULT; @@ -4145,7 +2354,7 @@ long kvm_arch_vm_ioctl(struct file *filp, break; } default: - r = kvm_vm_ioctl_assigned_device(kvm, ioctl, arg); + break; } out: return r; @@ -4183,6 +2392,7 @@ static void kvm_init_msr_list(void) } num_msrs_to_save = j; +#if 0 for (i = j = 0; i < ARRAY_SIZE(emulated_msrs); i++) { switch (emulated_msrs[i]) { case MSR_IA32_SMBASE: @@ -4198,6 +2408,7 @@ static void kvm_init_msr_list(void) j++; } num_emulated_msrs = j; +#endif } static int vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len, @@ -4205,17 +2416,20 @@ static int vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len, { int handled = 0; int n; + const char *__v = v; do { n = min(len, 8); if (!(lapic_in_kernel(vcpu) && !kvm_iodevice_write(vcpu, &vcpu->arch.apic->dev, addr, n, v)) - && kvm_io_bus_write(vcpu, KVM_MMIO_BUS, addr, n, v)) + && kvm_io_bus_write(vcpu, GVM_MMIO_BUS, addr, n, v)) break; handled += n; addr += n; len -= n; - v += n; + __v = (char *)v; + __v += n; + v = (void *)__v; } while (len); return handled; @@ -4225,19 +2439,21 @@ static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v) { int handled = 0; int n; + char *__v; do { n = min(len, 8); if (!(lapic_in_kernel(vcpu) && !kvm_iodevice_read(vcpu, &vcpu->arch.apic->dev, addr, n, v)) - && kvm_io_bus_read(vcpu, KVM_MMIO_BUS, addr, n, v)) + && kvm_io_bus_read(vcpu, GVM_MMIO_BUS, addr, n, v)) break; - trace_kvm_mmio(KVM_TRACE_MMIO_READ, n, addr, *(u64 *)v); handled += n; addr += n; len -= n; - v += n; + __v = (char *)v; + __v += n; + v = (void *)__v; } while (len); return handled; @@ -4304,6 +2520,7 @@ static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes, struct x86_exception *exception) { void *data = val; + char *__data; int r = X86EMUL_CONTINUE; while (bytes) { @@ -4323,7 +2540,9 @@ static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes, } bytes -= toread; - data += toread; + __data = (char *)data; + __data += toread; + data = (void *)__data; addr += toread; } out: @@ -4367,7 +2586,6 @@ int kvm_read_guest_virt(struct x86_emulate_ctxt *ctxt, return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access, exception); } -EXPORT_SYMBOL_GPL(kvm_read_guest_virt); static int kvm_read_guest_virt_system(struct x86_emulate_ctxt *ctxt, gva_t addr, void *val, unsigned int bytes, @@ -4378,7 +2596,7 @@ static int kvm_read_guest_virt_system(struct x86_emulate_ctxt *ctxt, } static int kvm_read_guest_phys_system(struct x86_emulate_ctxt *ctxt, - unsigned long addr, void *val, unsigned int bytes) + size_t addr, void *val, unsigned int bytes) { struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); int r = kvm_vcpu_read_guest(vcpu, addr, val, bytes); @@ -4393,6 +2611,7 @@ int kvm_write_guest_virt_system(struct x86_emulate_ctxt *ctxt, { struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); void *data = val; + char *__data; int r = X86EMUL_CONTINUE; while (bytes) { @@ -4412,15 +2631,16 @@ int kvm_write_guest_virt_system(struct x86_emulate_ctxt *ctxt, } bytes -= towrite; - data += towrite; + __data = (char *)data; + __data += towrite; + data = (void *)__data; addr += towrite; } out: return r; } -EXPORT_SYMBOL_GPL(kvm_write_guest_virt_system); -static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva, 
+static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, size_t gva, gpa_t *gpa, struct x86_exception *exception, bool write) { @@ -4437,7 +2657,6 @@ static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva, vcpu->arch.access, 0, access)) { *gpa = vcpu->arch.mmio_gfn << PAGE_SHIFT | (gva & (PAGE_SIZE - 1)); - trace_vcpu_match_mmio(gva, *gpa, write, false); return 1; } @@ -4451,7 +2670,6 @@ static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva, return 1; if (vcpu_match_mmio_gpa(vcpu, *gpa)) { - trace_vcpu_match_mmio(gva, *gpa, write, true); return 1; } @@ -4485,8 +2703,6 @@ struct read_write_emulator_ops { static int read_prepare(struct kvm_vcpu *vcpu, void *val, int bytes) { if (vcpu->mmio_read_completed) { - trace_kvm_mmio(KVM_TRACE_MMIO_READ, bytes, - vcpu->mmio_fragments[0].gpa, *(u64 *)val); vcpu->mmio_read_completed = 0; return 1; } @@ -4508,14 +2724,12 @@ static int write_emulate(struct kvm_vcpu *vcpu, gpa_t gpa, static int write_mmio(struct kvm_vcpu *vcpu, gpa_t gpa, int bytes, void *val) { - trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, *(u64 *)val); return vcpu_mmio_write(vcpu, gpa, bytes, val); } static int read_exit_mmio(struct kvm_vcpu *vcpu, gpa_t gpa, void *val, int bytes) { - trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, bytes, gpa, 0); return X86EMUL_IO_NEEDED; } @@ -4542,7 +2756,7 @@ static const struct read_write_emulator_ops write_emultor = { .write = true, }; -static int emulator_read_write_onepage(unsigned long addr, void *val, +static int emulator_read_write_onepage(size_t addr, void *val, unsigned int bytes, struct x86_exception *exception, struct kvm_vcpu *vcpu, @@ -4552,6 +2766,7 @@ static int emulator_read_write_onepage(unsigned long addr, void *val, int handled, ret; bool write = ops->write; struct kvm_mmio_fragment *frag; + char *__val; ret = vcpu_mmio_gva_to_gpa(vcpu, addr, &gpa, exception, write); @@ -4575,9 +2790,11 @@ mmio: gpa += handled; bytes -= handled; - val += handled; + __val = val; + __val += handled; + val = __val; - WARN_ON(vcpu->mmio_nr_fragments >= KVM_MAX_MMIO_FRAGMENTS); + WARN_ON(vcpu->mmio_nr_fragments >= GVM_MAX_MMIO_FRAGMENTS); frag = &vcpu->mmio_fragments[vcpu->mmio_nr_fragments++]; frag->gpa = gpa; frag->data = val; @@ -4586,7 +2803,7 @@ mmio: } static int emulator_read_write(struct x86_emulate_ctxt *ctxt, - unsigned long addr, + size_t addr, void *val, unsigned int bytes, struct x86_exception *exception, const struct read_write_emulator_ops *ops) @@ -4594,6 +2811,7 @@ static int emulator_read_write(struct x86_emulate_ctxt *ctxt, struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); gpa_t gpa; int rc; + char *__val; if (ops->read_write_prepare && ops->read_write_prepare(vcpu, val, bytes)) @@ -4605,7 +2823,7 @@ static int emulator_read_write(struct x86_emulate_ctxt *ctxt, if (((addr + bytes - 1) ^ addr) & PAGE_MASK) { int now; - now = -addr & ~PAGE_MASK; + now = -(ssize_t)addr & ~PAGE_MASK; rc = emulator_read_write_onepage(addr, val, now, exception, vcpu, ops); @@ -4614,7 +2832,9 @@ static int emulator_read_write(struct x86_emulate_ctxt *ctxt, addr += now; if (ctxt->mode != X86EMUL_MODE_PROT64) addr = (u32)addr; - val += now; + __val = val; + __val += now; + val = __val; bytes -= now; } @@ -4633,14 +2853,14 @@ static int emulator_read_write(struct x86_emulate_ctxt *ctxt, vcpu->run->mmio.len = min(8u, vcpu->mmio_fragments[0].len); vcpu->run->mmio.is_write = vcpu->mmio_is_write = ops->write; - vcpu->run->exit_reason = KVM_EXIT_MMIO; + vcpu->run->exit_reason = GVM_EXIT_MMIO; vcpu->run->mmio.phys_addr = gpa; return 
ops->read_write_exit_mmio(vcpu, gpa, val, bytes); } static int emulator_read_emulated(struct x86_emulate_ctxt *ctxt, - unsigned long addr, + size_t addr, void *val, unsigned int bytes, struct x86_exception *exception) @@ -4650,7 +2870,7 @@ static int emulator_read_emulated(struct x86_emulate_ctxt *ctxt, } static int emulator_write_emulated(struct x86_emulate_ctxt *ctxt, - unsigned long addr, + size_t addr, const void *val, unsigned int bytes, struct x86_exception *exception) @@ -4670,7 +2890,7 @@ static int emulator_write_emulated(struct x86_emulate_ctxt *ctxt, #endif static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt, - unsigned long addr, + size_t addr, const void *old, const void *new, unsigned int bytes, @@ -4678,9 +2898,10 @@ static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt, { struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); gpa_t gpa; - struct page *page; char *kaddr; bool exchanged; + size_t hva; + PMDL kmap_mdl; /* guests cmpxchg8b have to be emulated atomically */ if (bytes > 8 || (bytes & (bytes - 1))) @@ -4695,11 +2916,16 @@ static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt, if (((gpa + bytes - 1) & PAGE_MASK) != (gpa & PAGE_MASK)) goto emul_write; - page = kvm_vcpu_gfn_to_page(vcpu, gpa >> PAGE_SHIFT); - if (is_error_page(page)) + hva = gfn_to_hva(vcpu->kvm, gpa >> PAGE_SHIFT); + if (kvm_is_error_hva(hva)) goto emul_write; - kaddr = kmap_atomic(page); + if (get_user_pages_fast(hva, 1, 1, &kmap_mdl) != 1) + goto emul_write; + + kaddr = kmap_atomic(kmap_mdl); + if (!kaddr) + goto emul_write; kaddr += offset_in_page(gpa); switch (bytes) { case 1: @@ -4717,8 +2943,7 @@ static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt, default: BUG(); } - kunmap_atomic(kaddr); - kvm_release_page_dirty(page); + kunmap_atomic(kmap_mdl); if (!exchanged) return X86EMUL_CMPXCHG_FAILED; @@ -4740,10 +2965,10 @@ static int kernel_pio(struct kvm_vcpu *vcpu, void *pd) int r; if (vcpu->arch.pio.in) - r = kvm_io_bus_read(vcpu, KVM_PIO_BUS, vcpu->arch.pio.port, + r = kvm_io_bus_read(vcpu, GVM_PIO_BUS, vcpu->arch.pio.port, vcpu->arch.pio.size, pd); else - r = kvm_io_bus_write(vcpu, KVM_PIO_BUS, + r = kvm_io_bus_write(vcpu, GVM_PIO_BUS, vcpu->arch.pio.port, vcpu->arch.pio.size, pd); return r; @@ -4763,10 +2988,10 @@ static int emulator_pio_in_out(struct kvm_vcpu *vcpu, int size, return 1; } - vcpu->run->exit_reason = KVM_EXIT_IO; - vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT; + vcpu->run->exit_reason = GVM_EXIT_IO; + vcpu->run->io.direction = in ? 
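For the cmpxchg path above, the rewritten code pins the guest page through its host virtual address (gfn_to_hva() plus get_user_pages_fast(), which in this port hands back an MDL) and maps it with kmap_atomic() before performing the compare-and-swap at the page offset of the guest physical address. The atomic step itself, reduced to a standalone user-space sketch (the __sync builtin stands in for the port's locked cmpxchg helpers; names are illustrative):

    #include <stdint.h>
    #include <stdbool.h>
    #include <string.h>
    #include <stdio.h>

    /* Emulate an 8-byte guest CMPXCHG against an already-mapped address:
     * install *new_val only if the current contents equal *old. */
    static bool emulated_cmpxchg8(void *kaddr, const void *old, const void *new_val)
    {
        uint64_t o, n;

        memcpy(&o, old, sizeof(o));
        memcpy(&n, new_val, sizeof(n));
        return __sync_bool_compare_and_swap((uint64_t *)kaddr, o, n);
    }

    int main(void)
    {
        uint64_t guest_word = 0x1122334455667788ull;
        uint64_t expect = guest_word, replace = 0xdeadbeefull;
        int first = emulated_cmpxchg8(&guest_word, &expect, &replace);
        int second = emulated_cmpxchg8(&guest_word, &expect, &replace);

        /* First attempt matches and swaps; the second no longer matches. */
        printf("%d %d, value now %#llx\n", first, second,
               (unsigned long long)guest_word);
        return 0;
    }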
GVM_EXIT_IO_IN : GVM_EXIT_IO_OUT; vcpu->run->io.size = size; - vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE; + vcpu->run->io.data_offset = GVM_PIO_PAGE_OFFSET * PAGE_SIZE; vcpu->run->io.count = count; vcpu->run->io.port = port; @@ -4787,7 +3012,6 @@ static int emulator_pio_in_emulated(struct x86_emulate_ctxt *ctxt, if (ret) { data_avail: memcpy(val, vcpu->arch.pio_data, size * count); - trace_kvm_pio(KVM_PIO_IN, port, size, count, vcpu->arch.pio_data); vcpu->arch.pio.count = 0; return 1; } @@ -4802,11 +3026,10 @@ static int emulator_pio_out_emulated(struct x86_emulate_ctxt *ctxt, struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); memcpy(vcpu->arch.pio_data, val, size * count); - trace_kvm_pio(KVM_PIO_OUT, port, size, count, vcpu->arch.pio_data); return emulator_pio_in_out(vcpu, size, port, (void *)val, count, false); } -static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg) +static size_t get_segment_base(struct kvm_vcpu *vcpu, int seg) { return kvm_x86_ops->get_segment_base(vcpu, seg); } @@ -4818,19 +3041,6 @@ static void emulator_invlpg(struct x86_emulate_ctxt *ctxt, ulong address) int kvm_emulate_wbinvd_noskip(struct kvm_vcpu *vcpu) { - if (!need_emulate_wbinvd(vcpu)) - return X86EMUL_CONTINUE; - - if (kvm_x86_ops->has_wbinvd_exit()) { - int cpu = get_cpu(); - - cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask); - smp_call_function_many(vcpu->arch.wbinvd_dirty_mask, - wbinvd_ipi, NULL, 1); - put_cpu(); - cpumask_clear(vcpu->arch.wbinvd_dirty_mask); - } else - wbinvd(); return X86EMUL_CONTINUE; } @@ -4839,7 +3049,6 @@ int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu) kvm_x86_ops->skip_emulated_instruction(vcpu); return kvm_emulate_wbinvd_noskip(vcpu); } -EXPORT_SYMBOL_GPL(kvm_emulate_wbinvd); @@ -4849,16 +3058,17 @@ static void emulator_wbinvd(struct x86_emulate_ctxt *ctxt) } static int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, - unsigned long *dest) + size_t *dest) { - return kvm_get_dr(emul_to_vcpu(ctxt), dr, dest); + //return kvm_get_dr(emul_to_vcpu(ctxt), dr, dest); + return 0; } static int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, - unsigned long value) + size_t value) { - - return __kvm_set_dr(emul_to_vcpu(ctxt), dr, value); + return 0; + //return __kvm_set_dr(emul_to_vcpu(ctxt), dr, value); } static u64 mk_cr_64(u64 curr_cr, u32 new_val) @@ -4866,10 +3076,10 @@ static u64 mk_cr_64(u64 curr_cr, u32 new_val) return (curr_cr & ~((1ULL << 32) - 1)) | new_val; } -static unsigned long emulator_get_cr(struct x86_emulate_ctxt *ctxt, int cr) +static size_t emulator_get_cr(struct x86_emulate_ctxt *ctxt, int cr) { struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); - unsigned long value; + size_t value; switch (cr) { case 0: @@ -4888,7 +3098,7 @@ static unsigned long emulator_get_cr(struct x86_emulate_ctxt *ctxt, int cr) value = kvm_get_cr8(vcpu); break; default: - kvm_err("%s: unexpected cr %u\n", __func__, cr); + //kvm_err("%s: unexpected cr %u\n", __func__, cr); return 0; } @@ -4917,7 +3127,7 @@ static int emulator_set_cr(struct x86_emulate_ctxt *ctxt, int cr, ulong val) res = kvm_set_cr8(vcpu, val); break; default: - kvm_err("%s: unexpected cr %u\n", __func__, cr); + //kvm_err("%s: unexpected cr %u\n", __func__, cr); res = -1; } @@ -4949,7 +3159,7 @@ static void emulator_set_idt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt) kvm_x86_ops->set_idt(emul_to_vcpu(ctxt), dt); } -static unsigned long emulator_get_cached_segment_base( +static size_t emulator_get_cached_segment_base( struct x86_emulate_ctxt *ctxt, int seg) { return 
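The io.data_offset written above is what lets userspace locate the PIO bytes: they sit GVM_PIO_PAGE_OFFSET pages into the same mapping that holds the run structure. A self-contained mock of that layout (real code uses the gvm uapi structures, not this simplified stand-in):

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    #define MOCK_PAGE_SIZE 4096

    /* Only the fields exercised here; the real layout comes from kvm_run. */
    struct mock_io { uint8_t size; uint16_t port; uint32_t count; uint64_t data_offset; };

    int main(void)
    {
        uint8_t area[2 * MOCK_PAGE_SIZE] = {0};       /* [run page][pio page] */
        struct mock_io io = { .size = 2, .port = 0x60,
                              .count = 1, .data_offset = MOCK_PAGE_SIZE };

        /* Userspace finds the bytes at (mapping base) + data_offset and, for
         * an IN, fills them before re-entering the vcpu. */
        uint8_t *data = area + io.data_offset;
        memcpy(data, "\x34\x12", 2);

        printf("port %#x data: %02x %02x\n", io.port, data[0], data[1]);
        return 0;
    }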
get_segment_base(emul_to_vcpu(ctxt), seg); @@ -4972,7 +3182,7 @@ static bool emulator_get_segment(struct x86_emulate_ctxt *ctxt, u16 *selector, if (var.g) var.limit >>= 12; set_desc_limit(desc, var.limit); - set_desc_base(desc, (unsigned long)var.base); + set_desc_base(desc, (size_t)var.base); #ifdef CONFIG_X86_64 if (base3) *base3 = var.base >> 32; @@ -5063,13 +3273,15 @@ static void emulator_set_smbase(struct x86_emulate_ctxt *ctxt, u64 smbase) static int emulator_check_pmc(struct x86_emulate_ctxt *ctxt, u32 pmc) { - return kvm_pmu_is_valid_msr_idx(emul_to_vcpu(ctxt), pmc); + //return kvm_pmu_is_valid_msr_idx(emul_to_vcpu(ctxt), pmc); + return 0; } static int emulator_read_pmc(struct x86_emulate_ctxt *ctxt, u32 pmc, u64 *pdata) { - return kvm_pmu_rdpmc(emul_to_vcpu(ctxt), pmc, pdata); + //return kvm_pmu_rdpmc(emul_to_vcpu(ctxt), pmc, pdata); + return 0; } static void emulator_halt(struct x86_emulate_ctxt *ctxt) @@ -5085,11 +3297,12 @@ static void emulator_get_fpu(struct x86_emulate_ctxt *ctxt) * CR0.TS may reference the host fpu state, not the guest fpu state, * so it may be clear at this point. */ - clts(); + __clts(); } static void emulator_put_fpu(struct x86_emulate_ctxt *ctxt) { + kvm_save_guest_fpu(emul_to_vcpu(ctxt)); preempt_enable(); } @@ -5154,7 +3367,6 @@ static const struct x86_emulate_ops emulate_ops = { .read_pmc = emulator_read_pmc, .halt = emulator_halt, .wbinvd = emulator_wbinvd, - .fix_hypercall = emulator_fix_hypercall, .get_fpu = emulator_get_fpu, .put_fpu = emulator_put_fpu, .intercept = emulator_intercept, @@ -5177,7 +3389,7 @@ static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask) if (unlikely(int_shadow || mask)) { kvm_x86_ops->set_interrupt_shadow(vcpu, mask); if (!mask) - kvm_make_request(KVM_REQ_EVENT, vcpu); + kvm_make_request(GVM_REQ_EVENT, vcpu); } } @@ -5209,9 +3421,6 @@ static void init_emulate_ctxt(struct kvm_vcpu *vcpu) (cs_l && is_long_mode(vcpu)) ? X86EMUL_MODE_PROT64 : cs_db ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16; - BUILD_BUG_ON(HF_GUEST_MASK != X86EMUL_GUEST_MASK); - BUILD_BUG_ON(HF_SMM_MASK != X86EMUL_SMM_MASK); - BUILD_BUG_ON(HF_SMM_INSIDE_NMI_MASK != X86EMUL_SMM_INSIDE_NMI_MASK); ctxt->emul_flags = vcpu->arch.hflags; init_decode_cache(ctxt); @@ -5244,17 +3453,15 @@ int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip) return EMULATE_DONE; } -EXPORT_SYMBOL_GPL(kvm_inject_realmode_interrupt); static int handle_emulation_failure(struct kvm_vcpu *vcpu) { int r = EMULATE_DONE; ++vcpu->stat.insn_emulation_fail; - trace_kvm_emulate_insn_failed(vcpu); if (!is_guest_mode(vcpu) && kvm_x86_ops->get_cpl(vcpu) == 0) { - vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; - vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; + vcpu->run->exit_reason = GVM_EXIT_INTERNAL_ERROR; + vcpu->run->internal.suberror = GVM_INTERNAL_ERROR_EMULATION; vcpu->run->internal.ndata = 0; r = EMULATE_FAIL; } @@ -5303,8 +3510,6 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t cr2, if (is_error_noslot_pfn(pfn)) return false; - kvm_release_pfn_clean(pfn); - /* The instructions are well-emulated on direct mmu. 
*/ if (vcpu->arch.mmu.direct_map) { unsigned int indirect_shadow_pages; @@ -5335,10 +3540,10 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t cr2, } static bool retry_instruction(struct x86_emulate_ctxt *ctxt, - unsigned long cr2, int emulation_type) + size_t cr2, int emulation_type) { struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); - unsigned long last_retry_eip, last_retry_addr, gpa = cr2; + size_t last_retry_eip, last_retry_addr, gpa = cr2; last_retry_eip = vcpu->arch.last_retry_eip; last_retry_addr = vcpu->arch.last_retry_addr; @@ -5384,11 +3589,8 @@ static int complete_emulated_pio(struct kvm_vcpu *vcpu); static void kvm_smm_changed(struct kvm_vcpu *vcpu) { if (!(vcpu->arch.hflags & HF_SMM_MASK)) { - /* This is a good place to trace that we are exiting SMM. */ - trace_kvm_enter_smm(vcpu->vcpu_id, vcpu->arch.smbase, false); - /* Process a latched INIT or SMI, if any. */ - kvm_make_request(KVM_REQ_EVENT, vcpu); + kvm_make_request(GVM_REQ_EVENT, vcpu); } kvm_mmu_reset_context(vcpu); @@ -5404,8 +3606,8 @@ static void kvm_set_hflags(struct kvm_vcpu *vcpu, unsigned emul_flags) kvm_smm_changed(vcpu); } -static int kvm_vcpu_check_hw_bp(unsigned long addr, u32 type, u32 dr7, - unsigned long *db) +static int kvm_vcpu_check_hw_bp(size_t addr, u32 type, u32 dr7, + size_t *db) { u32 dr6 = 0; int i; @@ -5419,7 +3621,7 @@ static int kvm_vcpu_check_hw_bp(unsigned long addr, u32 type, u32 dr7, return dr6; } -static void kvm_vcpu_check_singlestep(struct kvm_vcpu *vcpu, unsigned long rflags, int *r) +static void kvm_vcpu_check_singlestep(struct kvm_vcpu *vcpu, size_t rflags, int *r) { struct kvm_run *kvm_run = vcpu->run; @@ -5432,12 +3634,12 @@ static void kvm_vcpu_check_singlestep(struct kvm_vcpu *vcpu, unsigned long rflag * that sets the TF flag". 
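Only the prototype of kvm_vcpu_check_hw_bp() is touched by the patch, so its job is easy to miss: it walks the four debug-address registers and builds the DR6 hit mask from the per-breakpoint enable and R/W-length fields in DR7. A self-contained illustration of that DR7 decoding (not the kernel code itself, just the same bit layout):

    #include <stdint.h>
    #include <stdio.h>

    /* DR7: bits 0-7 hold the L/G enable pairs, bits 16-31 hold a 4-bit
     * R/W+LEN field per breakpoint; DR6 reports hits in its low four bits. */
    static uint32_t hw_bp_hits(uint64_t addr, uint32_t type, uint32_t dr7,
                               const uint64_t db[4])
    {
        uint32_t dr6 = 0, enable = dr7, rwlen = dr7 >> 16;
        int i;

        for (i = 0; i < 4; i++, enable >>= 2, rwlen >>= 4)
            if ((enable & 3) &&              /* L or G set for breakpoint i   */
                (rwlen & 15) == type &&      /* access kind/length matches    */
                db[i] == addr)               /* DRi holds this linear address */
                dr6 |= 1u << i;
        return dr6;
    }

    int main(void)
    {
        uint64_t db[4] = { 0x401000, 0, 0, 0 };

        /* L0 enabled, R/W0 = LEN0 = 0: execution breakpoint at db[0]. */
        printf("dr6 hit bits: %#x\n", hw_bp_hits(0x401000, 0, 0x1, db));
        return 0;
    }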
*/ if (unlikely(rflags & X86_EFLAGS_TF)) { - if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) { + if (vcpu->guest_debug & GVM_GUESTDBG_SINGLESTEP) { kvm_run->debug.arch.dr6 = DR6_BS | DR6_FIXED_1 | DR6_RTM; kvm_run->debug.arch.pc = vcpu->arch.singlestep_rip; kvm_run->debug.arch.exception = DB_VECTOR; - kvm_run->exit_reason = KVM_EXIT_DEBUG; + kvm_run->exit_reason = GVM_EXIT_DEBUG; *r = EMULATE_USER_EXIT; } else { vcpu->arch.emulate_ctxt.eflags &= ~X86_EFLAGS_TF; @@ -5455,10 +3657,10 @@ static void kvm_vcpu_check_singlestep(struct kvm_vcpu *vcpu, unsigned long rflag static bool kvm_vcpu_check_breakpoint(struct kvm_vcpu *vcpu, int *r) { - if (unlikely(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) && + if (unlikely(vcpu->guest_debug & GVM_GUESTDBG_USE_HW_BP) && (vcpu->arch.guest_debug_dr7 & DR7_BP_EN_MASK)) { struct kvm_run *kvm_run = vcpu->run; - unsigned long eip = kvm_get_linear_rip(vcpu); + size_t eip = kvm_get_linear_rip(vcpu); u32 dr6 = kvm_vcpu_check_hw_bp(eip, 0, vcpu->arch.guest_debug_dr7, vcpu->arch.eff_db); @@ -5467,7 +3669,7 @@ static bool kvm_vcpu_check_breakpoint(struct kvm_vcpu *vcpu, int *r) kvm_run->debug.arch.dr6 = dr6 | DR6_FIXED_1 | DR6_RTM; kvm_run->debug.arch.pc = eip; kvm_run->debug.arch.exception = DB_VECTOR; - kvm_run->exit_reason = KVM_EXIT_DEBUG; + kvm_run->exit_reason = GVM_EXIT_DEBUG; *r = EMULATE_USER_EXIT; return true; } @@ -5475,7 +3677,7 @@ static bool kvm_vcpu_check_breakpoint(struct kvm_vcpu *vcpu, int *r) if (unlikely(vcpu->arch.dr7 & DR7_BP_EN_MASK) && !(kvm_get_rflags(vcpu) & X86_EFLAGS_RF)) { - unsigned long eip = kvm_get_linear_rip(vcpu); + size_t eip = kvm_get_linear_rip(vcpu); u32 dr6 = kvm_vcpu_check_hw_bp(eip, 0, vcpu->arch.dr7, vcpu->arch.db); @@ -5493,7 +3695,7 @@ static bool kvm_vcpu_check_breakpoint(struct kvm_vcpu *vcpu, int *r) } int x86_emulate_instruction(struct kvm_vcpu *vcpu, - unsigned long cr2, + size_t cr2, int emulation_type, void *insn, int insn_len) @@ -5531,7 +3733,6 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, r = x86_decode_insn(ctxt, insn, insn_len); - trace_kvm_emulate_insn_start(vcpu); ++vcpu->stat.insn_emulation; if (r != EMULATION_OK) { if (emulation_type & EMULTYPE_TRAP_UD) @@ -5600,7 +3801,7 @@ restart: r = EMULATE_DONE; if (writeback) { - unsigned long rflags = kvm_x86_ops->get_rflags(vcpu); + size_t rflags = kvm_x86_ops->get_rflags(vcpu); toggle_interruptibility(vcpu, ctxt->interruptibility); vcpu->arch.emulate_regs_need_sync_to_vcpu = false; if (vcpu->arch.hflags != ctxt->emul_flags) @@ -5613,214 +3814,40 @@ restart: __kvm_set_rflags(vcpu, ctxt->eflags); /* - * For STI, interrupts are shadowed; so KVM_REQ_EVENT will + * For STI, interrupts are shadowed; so GVM_REQ_EVENT will * do nothing, and it will be requested again as soon as * the shadow expires. But we still need to check here, * because POPF has no interrupt shadow. 
*/ if (unlikely((ctxt->eflags & ~rflags) & X86_EFLAGS_IF)) - kvm_make_request(KVM_REQ_EVENT, vcpu); + kvm_make_request(GVM_REQ_EVENT, vcpu); } else vcpu->arch.emulate_regs_need_sync_to_vcpu = true; return r; } -EXPORT_SYMBOL_GPL(x86_emulate_instruction); int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port) { - unsigned long val = kvm_register_read(vcpu, VCPU_REGS_RAX); + size_t val = kvm_register_read(vcpu, VCPU_REGS_RAX); int ret = emulator_pio_out_emulated(&vcpu->arch.emulate_ctxt, size, port, &val, 1); /* do not return to emulator after return from userspace */ vcpu->arch.pio.count = 0; return ret; } -EXPORT_SYMBOL_GPL(kvm_fast_pio_out); - -static int kvmclock_cpu_down_prep(unsigned int cpu) -{ - __this_cpu_write(cpu_tsc_khz, 0); - return 0; -} - -static void tsc_khz_changed(void *data) -{ - struct cpufreq_freqs *freq = data; - unsigned long khz = 0; - - if (data) - khz = freq->new; - else if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) - khz = cpufreq_quick_get(raw_smp_processor_id()); - if (!khz) - khz = tsc_khz; - __this_cpu_write(cpu_tsc_khz, khz); -} - -static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long val, - void *data) -{ - struct cpufreq_freqs *freq = data; - struct kvm *kvm; - struct kvm_vcpu *vcpu; - int i, send_ipi = 0; - - /* - * We allow guests to temporarily run on slowing clocks, - * provided we notify them after, or to run on accelerating - * clocks, provided we notify them before. Thus time never - * goes backwards. - * - * However, we have a problem. We can't atomically update - * the frequency of a given CPU from this function; it is - * merely a notifier, which can be called from any CPU. - * Changing the TSC frequency at arbitrary points in time - * requires a recomputation of local variables related to - * the TSC for each VCPU. We must flag these local variables - * to be updated and be sure the update takes place with the - * new frequency before any guests proceed. - * - * Unfortunately, the combination of hotplug CPU and frequency - * change creates an intractable locking scenario; the order - * of when these callouts happen is undefined with respect to - * CPU hotplug, and they can race with each other. As such, - * merely setting per_cpu(cpu_tsc_khz) = X during a hotadd is - * undefined; you can actually have a CPU frequency change take - * place in between the computation of X and the setting of the - * variable. To protect against this problem, all updates of - * the per_cpu tsc_khz variable are done in an interrupt - * protected IPI, and all callers wishing to update the value - * must wait for a synchronous IPI to complete (which is trivial - * if the caller is on the CPU already). This establishes the - * necessary total order on variable updates. - * - * Note that because a guest time update may take place - * anytime after the setting of the VCPU's request bit, the - * correct TSC value must be set before the request. However, - * to ensure the update actually makes it to any guest which - * starts running in hardware virtualization between the set - * and the acquisition of the spinlock, we must also ping the - * CPU after setting the request bit. 
- * - */ - - if (val == CPUFREQ_PRECHANGE && freq->old > freq->new) - return 0; - if (val == CPUFREQ_POSTCHANGE && freq->old < freq->new) - return 0; - - smp_call_function_single(freq->cpu, tsc_khz_changed, freq, 1); - - spin_lock(&kvm_lock); - list_for_each_entry(kvm, &vm_list, vm_list) { - kvm_for_each_vcpu(i, vcpu, kvm) { - if (vcpu->cpu != freq->cpu) - continue; - kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); - if (vcpu->cpu != smp_processor_id()) - send_ipi = 1; - } - } - spin_unlock(&kvm_lock); - - if (freq->old < freq->new && send_ipi) { - /* - * We upscale the frequency. Must make the guest - * doesn't see old kvmclock values while running with - * the new frequency, otherwise we risk the guest sees - * time go backwards. - * - * In case we update the frequency for another cpu - * (which might be in guest context) send an interrupt - * to kick the cpu out of guest context. Next time - * guest context is entered kvmclock will be updated, - * so the guest will not see stale values. - */ - smp_call_function_single(freq->cpu, tsc_khz_changed, freq, 1); - } - return 0; -} - -static struct notifier_block kvmclock_cpufreq_notifier_block = { - .notifier_call = kvmclock_cpufreq_notifier -}; - -static int kvmclock_cpu_online(unsigned int cpu) -{ - tsc_khz_changed(NULL); - return 0; -} - -static void kvm_timer_init(void) -{ - max_tsc_khz = tsc_khz; - - if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) { -#ifdef CONFIG_CPU_FREQ - struct cpufreq_policy policy; - int cpu; - - memset(&policy, 0, sizeof(policy)); - cpu = get_cpu(); - cpufreq_get_policy(&policy, cpu); - if (policy.cpuinfo.max_freq) - max_tsc_khz = policy.cpuinfo.max_freq; - put_cpu(); -#endif - cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block, - CPUFREQ_TRANSITION_NOTIFIER); - } - pr_debug("kvm: max_tsc_khz = %ld\n", max_tsc_khz); - - cpuhp_setup_state(CPUHP_AP_X86_KVM_CLK_ONLINE, "AP_X86_KVM_CLK_ONLINE", - kvmclock_cpu_online, kvmclock_cpu_down_prep); -} static DEFINE_PER_CPU(struct kvm_vcpu *, current_vcpu); -int kvm_is_in_guest(void) -{ - return __this_cpu_read(current_vcpu) != NULL; -} - -static int kvm_is_user_mode(void) -{ - int user_mode = 3; - - if (__this_cpu_read(current_vcpu)) - user_mode = kvm_x86_ops->get_cpl(__this_cpu_read(current_vcpu)); - - return user_mode != 0; -} - -static unsigned long kvm_get_guest_ip(void) -{ - unsigned long ip = 0; - - if (__this_cpu_read(current_vcpu)) - ip = kvm_rip_read(__this_cpu_read(current_vcpu)); - - return ip; -} - -static struct perf_guest_info_callbacks kvm_guest_cbs = { - .is_in_guest = kvm_is_in_guest, - .is_user_mode = kvm_is_user_mode, - .get_guest_ip = kvm_get_guest_ip, -}; - void kvm_before_handle_nmi(struct kvm_vcpu *vcpu) { __this_cpu_write(current_vcpu, vcpu); } -EXPORT_SYMBOL_GPL(kvm_before_handle_nmi); void kvm_after_handle_nmi(struct kvm_vcpu *vcpu) { __this_cpu_write(current_vcpu, NULL); } -EXPORT_SYMBOL_GPL(kvm_after_handle_nmi); static void kvm_set_mmio_spte_mask(void) { @@ -5852,53 +3879,9 @@ static void kvm_set_mmio_spte_mask(void) kvm_mmu_set_mmio_spte_mask(mask); } -#ifdef CONFIG_X86_64 -static void pvclock_gtod_update_fn(struct work_struct *work) -{ - struct kvm *kvm; - - struct kvm_vcpu *vcpu; - int i; - - spin_lock(&kvm_lock); - list_for_each_entry(kvm, &vm_list, vm_list) - kvm_for_each_vcpu(i, vcpu, kvm) - kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu); - atomic_set(&kvm_guest_has_master_clock, 0); - spin_unlock(&kvm_lock); -} - -static DECLARE_WORK(pvclock_gtod_work, pvclock_gtod_update_fn); - -/* - * Notification about pvclock gtod data update. 
- */ -static int pvclock_gtod_notify(struct notifier_block *nb, unsigned long unused, - void *priv) -{ - struct pvclock_gtod_data *gtod = &pvclock_gtod_data; - struct timekeeper *tk = priv; - - update_pvclock_gtod(tk); - - /* disable master clock if host does not trust, or does not - * use, TSC clocksource - */ - if (gtod->clock.vclock_mode != VCLOCK_TSC && - atomic_read(&kvm_guest_has_master_clock) != 0) - queue_work(system_long_wq, &pvclock_gtod_work); - - return 0; -} - -static struct notifier_block pvclock_gtod_notifier = { - .notifier_call = pvclock_gtod_notify, -}; -#endif - int kvm_arch_init(void *opaque) { - int r; + int r = -EFAULT, i; struct kvm_x86_ops *ops = opaque; if (kvm_x86_ops) { @@ -5918,17 +3901,6 @@ int kvm_arch_init(void *opaque) goto out; } - r = -ENOMEM; - shared_msrs = alloc_percpu(struct kvm_shared_msrs); - if (!shared_msrs) { - printk(KERN_ERR "kvm: failed to allocate percpu kvm_shared_msrs\n"); - goto out; - } - - r = kvm_mmu_module_init(); - if (r) - goto out_free_percpu; - kvm_set_mmio_spte_mask(); kvm_x86_ops = ops; @@ -5936,79 +3908,46 @@ int kvm_arch_init(void *opaque) kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK, PT_DIRTY_MASK, PT64_NX_MASK, 0, PT_PRESENT_MASK); - kvm_timer_init(); - - perf_register_guest_info_callbacks(&kvm_guest_cbs); if (boot_cpu_has(X86_FEATURE_XSAVE)) host_xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); + /* We have to move array initialization here since gcc's extension + * of array initialization is not supported here. + */ + for (i = 0; i < XFEATURE_MAX; i++) + xstate_offsets[i] = xstate_sizes[i] = -1; kvm_lapic_init(); -#ifdef CONFIG_X86_64 - pvclock_gtod_register_notifier(&pvclock_gtod_notifier); -#endif return 0; -out_free_percpu: - free_percpu(shared_msrs); out: return r; } void kvm_arch_exit(void) { - perf_unregister_guest_info_callbacks(&kvm_guest_cbs); - - if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) - cpufreq_unregister_notifier(&kvmclock_cpufreq_notifier_block, - CPUFREQ_TRANSITION_NOTIFIER); - cpuhp_remove_state_nocalls(CPUHP_AP_X86_KVM_CLK_ONLINE); -#ifdef CONFIG_X86_64 - pvclock_gtod_unregister_notifier(&pvclock_gtod_notifier); -#endif kvm_x86_ops = NULL; kvm_mmu_module_exit(); - free_percpu(shared_msrs); } int kvm_vcpu_halt(struct kvm_vcpu *vcpu) { ++vcpu->stat.halt_exits; if (lapic_in_kernel(vcpu)) { - vcpu->arch.mp_state = KVM_MP_STATE_HALTED; + vcpu->arch.mp_state = GVM_MP_STATE_HALTED; return 1; } else { - vcpu->run->exit_reason = KVM_EXIT_HLT; + vcpu->run->exit_reason = GVM_EXIT_HLT; return 0; } } -EXPORT_SYMBOL_GPL(kvm_vcpu_halt); int kvm_emulate_halt(struct kvm_vcpu *vcpu) { kvm_x86_ops->skip_emulated_instruction(vcpu); return kvm_vcpu_halt(vcpu); } -EXPORT_SYMBOL_GPL(kvm_emulate_halt); - -/* - * kvm_pv_kick_cpu_op: Kick a vcpu. - * - * @apicid - apicid of vcpu to be kicked. 
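The comment added in kvm_arch_init() about moving the array initialization refers to GCC's range-designator extension: the original arrays are filled at definition time with a [0 ... N] = -1 designator, which the MSVC toolchain used for this port does not accept, so the port fills them in a loop instead. Side by side (the array length is a placeholder, not the real XFEATURE_MAX value):

    #define NFEATURES 16                     /* placeholder for XFEATURE_MAX */

    /* GCC/Clang only: every element set to -1 right in the initializer. */
    #if defined(__GNUC__)
    static int offsets_gcc[NFEATURES] = { [0 ... NFEATURES - 1] = -1 };
    #endif

    /* Portable equivalent, as done in kvm_arch_init() above. */
    static int offsets[NFEATURES];

    static void offsets_init(void)
    {
        int i;

        for (i = 0; i < NFEATURES; i++)
            offsets[i] = -1;
    }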
- */ -static void kvm_pv_kick_cpu_op(struct kvm *kvm, unsigned long flags, int apicid) -{ - struct kvm_lapic_irq lapic_irq; - - lapic_irq.shorthand = 0; - lapic_irq.dest_mode = 0; - lapic_irq.dest_id = apicid; - lapic_irq.msi_redir_hint = false; - - lapic_irq.delivery_mode = APIC_DM_REMRD; - kvm_irq_delivery_to_apic(kvm, NULL, &lapic_irq, NULL); -} void kvm_vcpu_deactivate_apicv(struct kvm_vcpu *vcpu) { @@ -6016,70 +3955,6 @@ void kvm_vcpu_deactivate_apicv(struct kvm_vcpu *vcpu) kvm_x86_ops->refresh_apicv_exec_ctrl(vcpu); } -int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) -{ - unsigned long nr, a0, a1, a2, a3, ret; - int op_64_bit, r = 1; - - kvm_x86_ops->skip_emulated_instruction(vcpu); - - if (kvm_hv_hypercall_enabled(vcpu->kvm)) - return kvm_hv_hypercall(vcpu); - - nr = kvm_register_read(vcpu, VCPU_REGS_RAX); - a0 = kvm_register_read(vcpu, VCPU_REGS_RBX); - a1 = kvm_register_read(vcpu, VCPU_REGS_RCX); - a2 = kvm_register_read(vcpu, VCPU_REGS_RDX); - a3 = kvm_register_read(vcpu, VCPU_REGS_RSI); - - trace_kvm_hypercall(nr, a0, a1, a2, a3); - - op_64_bit = is_64_bit_mode(vcpu); - if (!op_64_bit) { - nr &= 0xFFFFFFFF; - a0 &= 0xFFFFFFFF; - a1 &= 0xFFFFFFFF; - a2 &= 0xFFFFFFFF; - a3 &= 0xFFFFFFFF; - } - - if (kvm_x86_ops->get_cpl(vcpu) != 0) { - ret = -KVM_EPERM; - goto out; - } - - switch (nr) { - case KVM_HC_VAPIC_POLL_IRQ: - ret = 0; - break; - case KVM_HC_KICK_CPU: - kvm_pv_kick_cpu_op(vcpu->kvm, a0, a1); - ret = 0; - break; - default: - ret = -KVM_ENOSYS; - break; - } -out: - if (!op_64_bit) - ret = (u32)ret; - kvm_register_write(vcpu, VCPU_REGS_RAX, ret); - ++vcpu->stat.hypercalls; - return r; -} -EXPORT_SYMBOL_GPL(kvm_emulate_hypercall); - -static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt) -{ - struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); - char instruction[3]; - unsigned long rip = kvm_rip_read(vcpu); - - kvm_x86_ops->patch_hypercall(vcpu, instruction); - - return emulator_write_emulated(ctxt, rip, instruction, 3, NULL); -} - static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu) { return vcpu->run->request_interrupt_window && @@ -6091,7 +3966,7 @@ static void post_kvm_run_save(struct kvm_vcpu *vcpu) struct kvm_run *kvm_run = vcpu->run; kvm_run->if_flag = (kvm_get_rflags(vcpu) & X86_EFLAGS_IF) != 0; - kvm_run->flags = is_smm(vcpu) ? KVM_RUN_X86_SMM : 0; + kvm_run->flags = is_smm(vcpu) ? GVM_RUN_X86_SMM : 0; kvm_run->cr8 = kvm_get_cr8(vcpu); kvm_run->apic_base = kvm_get_apic_base(vcpu); kvm_run->ready_for_interrupt_injection = @@ -6131,10 +4006,6 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool req_int_win) /* try to reinject previous events if any */ if (vcpu->arch.exception.pending) { - trace_kvm_inj_exception(vcpu->arch.exception.nr, - vcpu->arch.exception.has_error_code, - vcpu->arch.exception.error_code); - if (exception_type(vcpu->arch.exception.nr) == EXCPT_FAULT) __kvm_set_rflags(vcpu, kvm_get_rflags(vcpu) | X86_EFLAGS_RF); @@ -6182,7 +4053,7 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool req_int_win) * calling check_nested_events again here to avoid a race condition. * See https://lkml.org/lkml/2014/7/2/60 for discussion about this * proposal and current concerns. Perhaps we should be setting - * KVM_REQ_EVENT only on certain events and not unconditionally? + * GVM_REQ_EVENT only on certain events and not unconditionally? 
*/ if (is_guest_mode(vcpu) && kvm_x86_ops->check_nested_events) { r = kvm_x86_ops->check_nested_events(vcpu, req_int_win); @@ -6213,7 +4084,7 @@ static void process_nmi(struct kvm_vcpu *vcpu) vcpu->arch.nmi_pending += atomic_xchg(&vcpu->arch.nmi_queued, 0); vcpu->arch.nmi_pending = min(vcpu->arch.nmi_pending, limit); - kvm_make_request(KVM_REQ_EVENT, vcpu); + kvm_make_request(GVM_REQ_EVENT, vcpu); } #define put_smstate(type, buf, offset, val) \ @@ -6273,7 +4144,7 @@ static void enter_smm_save_state_32(struct kvm_vcpu *vcpu, char *buf) { struct desc_ptr dt; struct kvm_segment seg; - unsigned long val; + size_t val; int i; put_smstate(u32, buf, 0x7ffc, kvm_read_cr0(vcpu)); @@ -6324,7 +4195,7 @@ static void enter_smm_save_state_64(struct kvm_vcpu *vcpu, char *buf) #ifdef CONFIG_X86_64 struct desc_ptr dt; struct kvm_segment seg; - unsigned long val; + size_t val; int i; for (i = 0; i < 16; i++) @@ -6383,7 +4254,6 @@ static void enter_smm(struct kvm_vcpu *vcpu) char buf[512]; u32 cr0; - trace_kvm_enter_smm(vcpu->vcpu_id, vcpu->arch.smbase, true); vcpu->arch.hflags |= HF_SMM_MASK; memset(buf, 0, 512); if (guest_cpuid_has_longmode(vcpu)) @@ -6448,12 +4318,12 @@ static void enter_smm(struct kvm_vcpu *vcpu) static void process_smi(struct kvm_vcpu *vcpu) { vcpu->arch.smi_pending = true; - kvm_make_request(KVM_REQ_EVENT, vcpu); + kvm_make_request(GVM_REQ_EVENT, vcpu); } void kvm_make_scan_ioapic_request(struct kvm *kvm) { - kvm_make_all_cpus_request(kvm, KVM_REQ_SCAN_IOAPIC); + kvm_make_all_cpus_request(kvm, GVM_REQ_SCAN_IOAPIC); } static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu) @@ -6465,15 +4335,8 @@ static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu) bitmap_zero(vcpu->arch.ioapic_handled_vectors, 256); - if (irqchip_split(vcpu->kvm)) - kvm_scan_ioapic_routes(vcpu, vcpu->arch.ioapic_handled_vectors); - else { - if (vcpu->arch.apicv_active) - kvm_x86_ops->sync_pir_to_irr(vcpu); - kvm_ioapic_scan_entry(vcpu, vcpu->arch.ioapic_handled_vectors); - } - bitmap_or((ulong *)eoi_exit_bitmap, vcpu->arch.ioapic_handled_vectors, - vcpu_to_synic(vcpu)->vec_bitmap, 256); + kvm_ioapic_scan_entry(vcpu, vcpu->arch.ioapic_handled_vectors); + bitmap_copy((ulong *)eoi_exit_bitmap, vcpu->arch.ioapic_handled_vectors, 256); kvm_x86_ops->load_eoi_exitmap(vcpu, eoi_exit_bitmap); } @@ -6485,7 +4348,7 @@ static void kvm_vcpu_flush_tlb(struct kvm_vcpu *vcpu) void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu) { - struct page *page = NULL; + pfn_t pfn = 0; if (!lapic_in_kernel(vcpu)) return; @@ -6493,29 +4356,128 @@ void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu) if (!kvm_x86_ops->set_apic_access_page_addr) return; - page = gfn_to_page(vcpu->kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT); - if (is_error_page(page)) + pfn = gfn_to_pfn(vcpu->kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT); + if (is_error_noslot_pfn(pfn)) return; - kvm_x86_ops->set_apic_access_page_addr(vcpu, page_to_phys(page)); + kvm_x86_ops->set_apic_access_page_addr(vcpu, pfn << PAGE_SHIFT); /* * Do not pin apic access page in memory, the MMU notifier * will call us again if it is migrated or swapped out. */ - put_page(page); + //put_page(page); } -EXPORT_SYMBOL_GPL(kvm_vcpu_reload_apic_access_page); void kvm_arch_mmu_notifier_invalidate_page(struct kvm *kvm, - unsigned long address) + size_t address) { /* * The physical address of apic access page is stored in the VMCS. * Update it when it becomes invalid. 
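put_smstate(), used throughout enter_smm_save_state_32/64() above, stores a value of the given type at an architectural state-save offset inside the 512-byte staging buffer. Assuming the upstream definition (which rebases the offset by -0x7e00, since the buffer corresponds to SMRAM offsets 0x7e00-0x7fff of the state-save area), a tiny standalone illustration:

    #include <stdint.h>
    #include <string.h>
    #include <stdio.h>

    /* Assumed upstream definition of the macro. */
    #define put_smstate(type, buf, offset, val) \
        (*(type *)((buf) + (offset) - 0x7e00) = (val))

    int main(void)
    {
        uint32_t buf32[128];                  /* 512 bytes, 4-byte aligned */
        char *buf = (char *)buf32;

        memset(buf, 0, 512);
        /* CR0 is written at 0x7ffc in the 32-bit layout, as above
         * (the value here is an arbitrary example). */
        put_smstate(uint32_t, buf, 0x7ffc, 0x80050033u);
        printf("stored at buffer offset %#x: %#x\n",
               0x7ffc - 0x7e00, *(uint32_t *)(buf + 0x7ffc - 0x7e00));
        return 0;
    }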
*/ if (address == gfn_to_hva(kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT)) - kvm_make_all_cpus_request(kvm, KVM_REQ_APIC_PAGE_RELOAD); + kvm_make_all_cpus_request(kvm, GVM_REQ_APIC_PAGE_RELOAD); +} + +//#define HOST_STAT_DEBUG +/* + * A useful tool to check whether host state remains the same across + * host->guest->host switches. In theory, host state should be saved/restored + * only when it is subject to change. However, without souce code and + * document, you never know. When something goes terribly wrong, this tool + * can help check whether it is caused by incomplete host stat restore. + */ +#ifdef HOST_STAT_DEBUG +#include <intrin.h> +struct host_stat { + struct desc_ptr gdt; + struct desc_ptr idt; + u16 cs_sel; + u16 ss_sel; + u16 ds_sel; + u16 es_sel; + u16 fs_sel; + u16 gs_sel; + u16 ldt_sel; + u16 tr_sel; + struct desc_struct cs; + struct desc_struct ss; + struct desc_struct ds; + struct desc_struct es; + struct desc_struct fs; + struct desc_struct gs; + struct desc_struct ldt; + struct desc_struct tr; + u64 fs_base; + u64 gs_base; + u64 kernel_gs_base; + u64 cr0; + u64 cr2; + u64 cr3; + u64 cr4; + u64 cr8; + u64 efer; + u64 star; + u64 lstar; + u64 cstar; + u64 sf_mask; + u64 sysenter_cs; + u64 sysenter_eip; + u64 sysenter_esp; +}; + +static void save_host_stat_full(struct host_stat *hs) +{ + struct desc_struct *gdt; + + _sgdt(&hs->gdt); + __sidt(&hs->idt); + + savesegment(cs, hs->ds_sel); + savesegment(ss, hs->ds_sel); + savesegment(ds, hs->ds_sel); + savesegment(es, hs->es_sel); + savesegment(fs, hs->fs_sel); + savesegment(gs, hs->gs_sel); + hs->ldt_sel = gvm_read_ldt(); + hs->tr_sel = gvm_read_tr(); + + gdt = (struct desc_struct *)hs->gdt.address; + hs->cs = gdt[hs->cs_sel >> 3]; + hs->ss = gdt[hs->ss_sel >> 3]; + hs->ds = gdt[hs->ds_sel >> 3]; + hs->es = gdt[hs->es_sel >> 3]; + hs->fs = gdt[hs->fs_sel >> 3]; + hs->gs = gdt[hs->gs_sel >> 3]; + hs->ldt = gdt[hs->ldt_sel >> 3]; + hs->tr = gdt[hs->tr_sel >> 3]; + + hs->fs_base = __readmsr(MSR_FS_BASE); + hs->gs_base = __readmsr(MSR_GS_BASE); + hs->kernel_gs_base = __readmsr(MSR_KERNEL_GS_BASE); + + hs->cr0 = __readcr0(); + hs->cr2 = __readcr2(); + hs->cr3 = __readcr3(); + hs->cr4 = __readcr4(); + hs->cr8 = __readcr8(); + + hs->efer = __readmsr(MSR_EFER); + hs->star = __readmsr(MSR_STAR); + hs->lstar = __readmsr(MSR_LSTAR); + hs->cstar = __readmsr(MSR_CSTAR); + hs->sf_mask = __readmsr(MSR_SYSCALL_MASK); + + hs->sysenter_cs = __readmsr(MSR_IA32_SYSENTER_CS); + hs->sysenter_eip = __readmsr(MSR_IA32_SYSENTER_EIP); + hs->sysenter_esp = __readmsr(MSR_IA32_SYSENTER_ESP); +} + +static int check_host_stat(struct host_stat *a, struct host_stat *b) +{ + return 0; } +#endif /* * Returns 1 to let vcpu_run() continue the guest execution loop without @@ -6530,100 +4492,46 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) kvm_cpu_accept_dm_intr(vcpu); bool req_immediate_exit = false; +#ifdef HOST_STAT_DEBUG + struct host_stat *enter = kzalloc(sizeof(struct host_stat), GFP_KERNEL); + struct host_stat *exit = kzalloc(sizeof(struct host_stat), GFP_KERNEL); +#endif if (vcpu->requests) { - if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu)) + if (kvm_check_request(GVM_REQ_MMU_RELOAD, vcpu)) kvm_mmu_unload(vcpu); - if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu)) - __kvm_migrate_timers(vcpu); - if (kvm_check_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu)) - kvm_gen_update_masterclock(vcpu->kvm); - if (kvm_check_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu)) - kvm_gen_kvmclock_update(vcpu); - if (kvm_check_request(KVM_REQ_CLOCK_UPDATE, vcpu)) { - r = 
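check_host_stat() is committed as a stub that always returns 0, so the BUG_ON() in vcpu_enter_guest() can never fire as-is. When the HOST_STAT_DEBUG tool is switched on, a field-by-field comparison along these lines is the missing piece (an illustrative sketch against the struct host_stat defined above, not part of the patch):

    /* Return a non-zero group number on the first mismatch so the failing
     * BUG_ON(check_host_stat(enter, exit)) also hints at what was clobbered. */
    static int check_host_stat(struct host_stat *a, struct host_stat *b)
    {
        if (memcmp(&a->gdt, &b->gdt, sizeof(a->gdt)) ||
            memcmp(&a->idt, &b->idt, sizeof(a->idt)))
            return 1;                               /* descriptor table regs */
        if (a->cs_sel != b->cs_sel || a->ss_sel != b->ss_sel ||
            a->ds_sel != b->ds_sel || a->es_sel != b->es_sel ||
            a->fs_sel != b->fs_sel || a->gs_sel != b->gs_sel ||
            a->ldt_sel != b->ldt_sel || a->tr_sel != b->tr_sel)
            return 2;                               /* segment selectors     */
        if (a->fs_base != b->fs_base || a->gs_base != b->gs_base ||
            a->kernel_gs_base != b->kernel_gs_base)
            return 3;                               /* segment base MSRs     */
        if (a->cr0 != b->cr0 || a->cr2 != b->cr2 || a->cr3 != b->cr3 ||
            a->cr4 != b->cr4 || a->cr8 != b->cr8)
            return 4;                               /* control registers     */
        if (a->efer != b->efer || a->star != b->star || a->lstar != b->lstar ||
            a->cstar != b->cstar || a->sf_mask != b->sf_mask ||
            a->sysenter_cs != b->sysenter_cs ||
            a->sysenter_eip != b->sysenter_eip ||
            a->sysenter_esp != b->sysenter_esp)
            return 5;                               /* syscall/sysenter MSRs */
        return 0;
    }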
kvm_guest_time_update(vcpu); - if (unlikely(r)) - goto out; - } - if (kvm_check_request(KVM_REQ_MMU_SYNC, vcpu)) + if (kvm_check_request(GVM_REQ_MMU_SYNC, vcpu)) kvm_mmu_sync_roots(vcpu); - if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu)) + if (kvm_check_request(GVM_REQ_TLB_FLUSH, vcpu)) kvm_vcpu_flush_tlb(vcpu); - if (kvm_check_request(KVM_REQ_REPORT_TPR_ACCESS, vcpu)) { - vcpu->run->exit_reason = KVM_EXIT_TPR_ACCESS; + if (kvm_check_request(GVM_REQ_REPORT_TPR_ACCESS, vcpu)) { + vcpu->run->exit_reason = GVM_EXIT_TPR_ACCESS; r = 0; goto out; } - if (kvm_check_request(KVM_REQ_TRIPLE_FAULT, vcpu)) { - vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN; + if (kvm_check_request(GVM_REQ_TRIPLE_FAULT, vcpu)) { + vcpu->run->exit_reason = GVM_EXIT_SHUTDOWN; r = 0; goto out; } - if (kvm_check_request(KVM_REQ_DEACTIVATE_FPU, vcpu)) { - vcpu->fpu_active = 0; - kvm_x86_ops->fpu_deactivate(vcpu); - } - if (kvm_check_request(KVM_REQ_APF_HALT, vcpu)) { - /* Page is swapped out. Do synthetic halt */ - vcpu->arch.apf.halted = true; - r = 1; - goto out; - } - if (kvm_check_request(KVM_REQ_STEAL_UPDATE, vcpu)) - record_steal_time(vcpu); - if (kvm_check_request(KVM_REQ_SMI, vcpu)) + if (kvm_check_request(GVM_REQ_SMI, vcpu)) process_smi(vcpu); - if (kvm_check_request(KVM_REQ_NMI, vcpu)) + if (kvm_check_request(GVM_REQ_NMI, vcpu)) process_nmi(vcpu); - if (kvm_check_request(KVM_REQ_PMU, vcpu)) +#if 0 + if (kvm_check_request(GVM_REQ_PMU, vcpu)) kvm_pmu_handle_event(vcpu); - if (kvm_check_request(KVM_REQ_PMI, vcpu)) + if (kvm_check_request(GVM_REQ_PMI, vcpu)) kvm_pmu_deliver_pmi(vcpu); - if (kvm_check_request(KVM_REQ_IOAPIC_EOI_EXIT, vcpu)) { - BUG_ON(vcpu->arch.pending_ioapic_eoi > 255); - if (test_bit(vcpu->arch.pending_ioapic_eoi, - vcpu->arch.ioapic_handled_vectors)) { - vcpu->run->exit_reason = KVM_EXIT_IOAPIC_EOI; - vcpu->run->eoi.vector = - vcpu->arch.pending_ioapic_eoi; - r = 0; - goto out; - } - } - if (kvm_check_request(KVM_REQ_SCAN_IOAPIC, vcpu)) +#endif + if (kvm_check_request(GVM_REQ_SCAN_IOAPIC, vcpu)) vcpu_scan_ioapic(vcpu); - if (kvm_check_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu)) + if (kvm_check_request(GVM_REQ_APIC_PAGE_RELOAD, vcpu)) kvm_vcpu_reload_apic_access_page(vcpu); - if (kvm_check_request(KVM_REQ_HV_CRASH, vcpu)) { - vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT; - vcpu->run->system_event.type = KVM_SYSTEM_EVENT_CRASH; - r = 0; - goto out; - } - if (kvm_check_request(KVM_REQ_HV_RESET, vcpu)) { - vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT; - vcpu->run->system_event.type = KVM_SYSTEM_EVENT_RESET; - r = 0; - goto out; - } - if (kvm_check_request(KVM_REQ_HV_EXIT, vcpu)) { - vcpu->run->exit_reason = KVM_EXIT_HYPERV; - vcpu->run->hyperv = vcpu->arch.hyperv.exit; - r = 0; - goto out; - } - - /* - * KVM_REQ_HV_STIMER has to be processed after - * KVM_REQ_CLOCK_UPDATE, because Hyper-V SynIC timers - * depend on the guest clock being up-to-date - */ - if (kvm_check_request(KVM_REQ_HV_STIMER, vcpu)) - kvm_hv_process_stimers(vcpu); } /* - * KVM_REQ_EVENT is not set when posted interrupts are set by + * GVM_REQ_EVENT is not set when posted interrupts are set by * VT-d hardware, so we have to update RVI unconditionally. 
*/ if (kvm_lapic_enabled(vcpu)) { @@ -6636,9 +4544,9 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) kvm_lapic_find_highest_irr(vcpu)); } - if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) { + if (kvm_check_request(GVM_REQ_EVENT, vcpu) || req_int_win) { kvm_apic_accept_events(vcpu); - if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) { + if (vcpu->arch.mp_state == GVM_MP_STATE_INIT_RECEIVED) { r = 1; goto out; } @@ -6674,14 +4582,15 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) goto cancel_injection; } - preempt_disable(); + srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); - kvm_x86_ops->prepare_guest_switch(vcpu); - if (vcpu->fpu_active) - kvm_load_guest_fpu(vcpu); + local_irq_disable(); +#ifdef HOST_STAT_DEBUG + save_host_stat_full(enter); +#endif + kvm_x86_ops->save_host_state(vcpu); vcpu->mode = IN_GUEST_MODE; - - srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); + vcpu->cpu = smp_processor_id(); /* * We should set ->mode before check ->requests, @@ -6690,16 +4599,13 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) * to the page tables done while the VCPU is running. * Please see the comment in kvm_flush_remote_tlbs. */ - smp_mb__after_srcu_read_unlock(); + smp_mb(); - local_irq_disable(); - - if (vcpu->mode == EXITING_GUEST_MODE || vcpu->requests - || need_resched() || signal_pending(current)) { + if (vcpu->mode == EXITING_GUEST_MODE || vcpu->requests) { vcpu->mode = OUTSIDE_GUEST_MODE; smp_wmb(); + kvm_x86_ops->load_host_state(vcpu); local_irq_enable(); - preempt_enable(); vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); r = 1; goto cancel_injection; @@ -6708,14 +4614,10 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) kvm_load_guest_xcr0(vcpu); if (req_immediate_exit) { - kvm_make_request(KVM_REQ_EVENT, vcpu); + kvm_make_request(GVM_REQ_EVENT, vcpu); smp_send_reschedule(vcpu->cpu); } - trace_kvm_entry(vcpu->vcpu_id); - wait_lapic_expire(vcpu); - guest_enter_irqoff(); - if (unlikely(vcpu->arch.switch_db_regs)) { set_debugreg(0, 7); set_debugreg(vcpu->arch.eff_db[0], 0); @@ -6723,26 +4625,29 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) set_debugreg(vcpu->arch.eff_db[2], 2); set_debugreg(vcpu->arch.eff_db[3], 3); set_debugreg(vcpu->arch.dr6, 6); - vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_RELOAD; + vcpu->arch.switch_db_regs &= ~GVM_DEBUGREG_RELOAD; } + kvm_load_guest_fpu(vcpu); + kvm_x86_ops->run(vcpu); /* * Do this here before restoring debug registers on the host. And * since we do this before handling the vmexit, a DR access vmexit * can (a) read the correct value of the debug registers, (b) set - * KVM_DEBUGREG_WONT_EXIT again. + * GVM_DEBUGREG_WONT_EXIT again. */ - if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)) { - WARN_ON(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP); + if (unlikely(vcpu->arch.switch_db_regs & GVM_DEBUGREG_WONT_EXIT)) { + WARN_ON(vcpu->guest_debug & GVM_GUESTDBG_USE_HW_BP); kvm_x86_ops->sync_dirty_debug_regs(vcpu); kvm_update_dr0123(vcpu); kvm_update_dr6(vcpu); kvm_update_dr7(vcpu); - vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_RELOAD; + vcpu->arch.switch_db_regs &= ~GVM_DEBUGREG_RELOAD; } +#if 0 /* * If the guest has used debug registers, at least dr7 * will be disabled while returning to the host. 
@@ -6752,36 +4657,34 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) */ if (hw_breakpoint_active()) hw_breakpoint_restore(); +#endif vcpu->arch.last_guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc()); + kvm_save_guest_fpu(vcpu); + + //Set CPU to -1 since we don't know when we got scheduled to another + //cpu by Windows scheduler. + vcpu->cpu = -1; vcpu->mode = OUTSIDE_GUEST_MODE; smp_wmb(); kvm_put_guest_xcr0(vcpu); + kvm_x86_ops->load_host_state(vcpu); + kvm_x86_ops->vcpu_put(vcpu); +#ifdef HOST_STAT_DEBUG + save_host_stat_full(exit); + BUG_ON(check_host_stat(enter, exit)); +#endif kvm_x86_ops->handle_external_intr(vcpu); ++vcpu->stat.exits; - guest_exit_irqoff(); - local_irq_enable(); - preempt_enable(); vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); - /* - * Profile KVM exit RIPs: - */ - if (unlikely(prof_on == KVM_PROFILING)) { - unsigned long rip = kvm_rip_read(vcpu); - profile_hit(KVM_PROFILING, (void *)rip); - } - - if (unlikely(vcpu->arch.tsc_always_catchup)) - kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); - if (vcpu->arch.apic_attention) kvm_lapic_sync_from_vapic(vcpu); @@ -6790,7 +4693,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) cancel_injection: kvm_x86_ops->cancel_injection(vcpu); - if (unlikely(vcpu->arch.apic_attention)) + if ((vcpu->arch.apic_attention)) kvm_lapic_sync_from_vapic(vcpu); out: return r; @@ -6798,29 +4701,23 @@ out: static inline int vcpu_block(struct kvm *kvm, struct kvm_vcpu *vcpu) { - if (!kvm_arch_vcpu_runnable(vcpu) && - (!kvm_x86_ops->pre_block || kvm_x86_ops->pre_block(vcpu) == 0)) { + if (!kvm_arch_vcpu_runnable(vcpu)) { srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); kvm_vcpu_block(vcpu); vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); - if (kvm_x86_ops->post_block) - kvm_x86_ops->post_block(vcpu); - - if (!kvm_check_request(KVM_REQ_UNHALT, vcpu)) + if (!kvm_check_request(GVM_REQ_UNHALT, vcpu)) return 1; } kvm_apic_accept_events(vcpu); switch(vcpu->arch.mp_state) { - case KVM_MP_STATE_HALTED: - vcpu->arch.pv.pv_unhalted = false; + case GVM_MP_STATE_HALTED: vcpu->arch.mp_state = - KVM_MP_STATE_RUNNABLE; - case KVM_MP_STATE_RUNNABLE: - vcpu->arch.apf.halted = false; + GVM_MP_STATE_RUNNABLE; + case GVM_MP_STATE_RUNNABLE: break; - case KVM_MP_STATE_INIT_RECEIVED: + case GVM_MP_STATE_INIT_RECEIVED: break; default: return -EINTR; @@ -6831,8 +4728,7 @@ static inline int vcpu_block(struct kvm *kvm, struct kvm_vcpu *vcpu) static inline bool kvm_vcpu_running(struct kvm_vcpu *vcpu) { - return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE && - !vcpu->arch.apf.halted); + return (vcpu->arch.mp_state == GVM_MP_STATE_RUNNABLE); } static int vcpu_run(struct kvm_vcpu *vcpu) @@ -6852,31 +4748,21 @@ static int vcpu_run(struct kvm_vcpu *vcpu) if (r <= 0) break; - clear_bit(KVM_REQ_PENDING_TIMER, &vcpu->requests); + clear_bit(GVM_REQ_PENDING_TIMER, &vcpu->requests); if (kvm_cpu_has_pending_timer(vcpu)) kvm_inject_pending_timer_irqs(vcpu); if (dm_request_for_irq_injection(vcpu) && kvm_vcpu_ready_for_interrupt_injection(vcpu)) { r = 0; - vcpu->run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN; + vcpu->run->exit_reason = GVM_EXIT_IRQ_WINDOW_OPEN; ++vcpu->stat.request_irq_exits; break; } - - kvm_check_async_pf_completion(vcpu); - - if (signal_pending(current)) { - r = -EINTR; - vcpu->run->exit_reason = KVM_EXIT_INTR; - ++vcpu->stat.signal_exits; + if (test_and_clear_bit(0, (size_t *)&vcpu->run->user_event_pending)) { + vcpu->run->exit_reason = GVM_EXIT_INTR; break; } - if (need_resched()) { - srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); - cond_resched(); - vcpu->srcu_idx = 
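The vcpu_run() loop above drops the signal_pending() check in favour of a flag in the shared run structure: with no POSIX signals available to the Windows VMM, userspace requests an exit by setting run->user_event_pending, which the loop clears with test_and_clear_bit() before leaving with GVM_EXIT_INTR. The userspace side is essentially one atomic store; a sketch (the exact field type and the VMM's atomic helper are assumptions):

    /* VMM-side kick: make the in-kernel run loop bail out at its next check. */
    static void kick_vcpu(volatile long *user_event_pending)
    {
    #ifdef _MSC_VER
        _InterlockedOr(user_event_pending, 1);
    #else
        __atomic_fetch_or(user_event_pending, 1L, __ATOMIC_SEQ_CST);
    #endif
    }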
srcu_read_lock(&kvm->srcu); - } } srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); @@ -6925,6 +4811,7 @@ static int complete_emulated_mmio(struct kvm_vcpu *vcpu) struct kvm_run *run = vcpu->run; struct kvm_mmio_fragment *frag; unsigned len; + char *__data; BUG_ON(!vcpu->mmio_needed); @@ -6940,7 +4827,9 @@ static int complete_emulated_mmio(struct kvm_vcpu *vcpu) vcpu->mmio_cur_fragment++; } else { /* Go forward to the next mmio piece. */ - frag->data += len; + __data = frag->data; + __data += len; + frag->data = __data; frag->gpa += len; frag->len -= len; } @@ -6955,7 +4844,7 @@ static int complete_emulated_mmio(struct kvm_vcpu *vcpu) return complete_emulated_io(vcpu); } - run->exit_reason = KVM_EXIT_MMIO; + run->exit_reason = GVM_EXIT_MMIO; run->mmio.phys_addr = frag->gpa; if (vcpu->mmio_is_write) memcpy(run->mmio.data, frag->data, min(8u, frag->len)); @@ -6968,19 +4857,12 @@ static int complete_emulated_mmio(struct kvm_vcpu *vcpu) int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { - struct fpu *fpu = ¤t->thread.fpu; int r; - sigset_t sigsaved; - - fpu__activate_curr(fpu); - if (vcpu->sigset_active) - sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved); - - if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) { + if (unlikely(vcpu->arch.mp_state == GVM_MP_STATE_UNINITIALIZED)) { kvm_vcpu_block(vcpu); kvm_apic_accept_events(vcpu); - clear_bit(KVM_REQ_UNHALT, &vcpu->requests); + clear_bit(GVM_REQ_UNHALT, &vcpu->requests); r = -EAGAIN; goto out; } @@ -7006,9 +4888,6 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) out: post_kvm_run_save(vcpu); - if (vcpu->sigset_active) - sigprocmask(SIG_SETMASK, &sigsaved, NULL); - return r; } @@ -7079,7 +4958,7 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) vcpu->arch.exception.pending = false; - kvm_make_request(KVM_REQ_EVENT, vcpu); + kvm_make_request(GVM_REQ_EVENT, vcpu); return 0; } @@ -7092,7 +4971,6 @@ void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) *db = cs.db; *l = cs.l; } -EXPORT_SYMBOL_GPL(kvm_get_cs_db_l_bits); int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) @@ -7128,7 +5006,7 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, if (vcpu->arch.interrupt.pending && !vcpu->arch.interrupt.soft) set_bit(vcpu->arch.interrupt.nr, - (unsigned long *)sregs->interrupt_bitmap); + (size_t *)sregs->interrupt_bitmap); return 0; } @@ -7137,11 +5015,7 @@ int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, struct kvm_mp_state *mp_state) { kvm_apic_accept_events(vcpu); - if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED && - vcpu->arch.pv.pv_unhalted) - mp_state->mp_state = KVM_MP_STATE_RUNNABLE; - else - mp_state->mp_state = vcpu->arch.mp_state; + mp_state->mp_state = vcpu->arch.mp_state; return 0; } @@ -7150,15 +5024,15 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, struct kvm_mp_state *mp_state) { if (!lapic_in_kernel(vcpu) && - mp_state->mp_state != KVM_MP_STATE_RUNNABLE) + mp_state->mp_state != GVM_MP_STATE_RUNNABLE) return -EINVAL; - if (mp_state->mp_state == KVM_MP_STATE_SIPI_RECEIVED) { - vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED; - set_bit(KVM_APIC_SIPI, &vcpu->arch.apic->pending_events); + if (mp_state->mp_state == GVM_MP_STATE_SIPI_RECEIVED) { + vcpu->arch.mp_state = GVM_MP_STATE_INIT_RECEIVED; + set_bit(GVM_APIC_SIPI, &vcpu->arch.apic->pending_events); } else vcpu->arch.mp_state = mp_state->mp_state; - kvm_make_request(KVM_REQ_EVENT, vcpu); + 
kvm_make_request(GVM_REQ_EVENT, vcpu); return 0; } @@ -7178,10 +5052,9 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index, kvm_rip_write(vcpu, ctxt->eip); kvm_set_rflags(vcpu, ctxt->eflags); - kvm_make_request(KVM_REQ_EVENT, vcpu); + kvm_make_request(GVM_REQ_EVENT, vcpu); return EMULATE_DONE; } -EXPORT_SYMBOL_GPL(kvm_task_switch); int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) @@ -7233,9 +5106,9 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, if (mmu_reset_needed) kvm_mmu_reset_context(vcpu); - max_bits = KVM_NR_INTERRUPTS; + max_bits = GVM_NR_INTERRUPTS; pending_vec = find_first_bit( - (const unsigned long *)sregs->interrupt_bitmap, max_bits); + (const size_t *)sregs->interrupt_bitmap, max_bits); if (pending_vec < max_bits) { kvm_queue_interrupt(vcpu, pending_vec, false); pr_debug("Set back pending irq %d\n", pending_vec); @@ -7257,9 +5130,9 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, if (kvm_vcpu_is_bsp(vcpu) && kvm_rip_read(vcpu) == 0xfff0 && sregs->cs.selector == 0xf000 && sregs->cs.base == 0xffff0000 && !is_protmode(vcpu)) - vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; + vcpu->arch.mp_state = GVM_MP_STATE_RUNNABLE; - kvm_make_request(KVM_REQ_EVENT, vcpu); + kvm_make_request(GVM_REQ_EVENT, vcpu); return 0; } @@ -7267,14 +5140,14 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg) { - unsigned long rflags; + size_t rflags; int i, r; - if (dbg->control & (KVM_GUESTDBG_INJECT_DB | KVM_GUESTDBG_INJECT_BP)) { + if (dbg->control & (GVM_GUESTDBG_INJECT_DB | GVM_GUESTDBG_INJECT_BP)) { r = -EBUSY; if (vcpu->arch.exception.pending) goto out; - if (dbg->control & KVM_GUESTDBG_INJECT_DB) + if (dbg->control & GVM_GUESTDBG_INJECT_DB) kvm_queue_exception(vcpu, DB_VECTOR); else kvm_queue_exception(vcpu, BP_VECTOR); @@ -7287,20 +5160,20 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, rflags = kvm_get_rflags(vcpu); vcpu->guest_debug = dbg->control; - if (!(vcpu->guest_debug & KVM_GUESTDBG_ENABLE)) + if (!(vcpu->guest_debug & GVM_GUESTDBG_ENABLE)) vcpu->guest_debug = 0; - if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) { - for (i = 0; i < KVM_NR_DB_REGS; ++i) + if (vcpu->guest_debug & GVM_GUESTDBG_USE_HW_BP) { + for (i = 0; i < GVM_NR_DB_REGS; ++i) vcpu->arch.eff_db[i] = dbg->arch.debugreg[i]; vcpu->arch.guest_debug_dr7 = dbg->arch.debugreg[7]; } else { - for (i = 0; i < KVM_NR_DB_REGS; i++) + for (i = 0; i < GVM_NR_DB_REGS; i++) vcpu->arch.eff_db[i] = vcpu->arch.db[i]; } kvm_update_dr7(vcpu); - if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) + if (vcpu->guest_debug & GVM_GUESTDBG_SINGLESTEP) vcpu->arch.singlestep_rip = kvm_rip_read(vcpu) + get_segment_base(vcpu, VCPU_SREG_CS); @@ -7325,7 +5198,7 @@ out: int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, struct kvm_translation *tr) { - unsigned long vaddr = tr->linear_address; + size_t vaddr = tr->linear_address; gpa_t gpa; int idx; @@ -7343,7 +5216,7 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) { struct fxregs_state *fxsave = - &vcpu->arch.guest_fpu.state.fxsave; + &vcpu->arch.guest_fpu.fxsave; memcpy(fpu->fpr, fxsave->st_space, 128); fpu->fcw = fxsave->cwd; @@ -7360,7 +5233,7 @@ int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) { struct fxregs_state 
*fxsave = - &vcpu->arch.guest_fpu.state.fxsave; + &vcpu->arch.guest_fpu.fxsave; memcpy(fxsave->st_space, fpu->fpr, 128); fxsave->cwd = fpu->fcw; @@ -7374,11 +5247,28 @@ int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) return 0; } +static inline void fpstate_init_fxstate(struct fxregs_state *fx) +{ + fx->cwd = 0x37f; + fx->mxcsr = 0x1f80; +} + +static void fpstate_init(union fpu_state *state) +{ + memset(state, 0, PAGE_SIZE); + +#if 0 + if (static_cpu_has(X86_FEATURE_XSAVES)) + fpstate_init_xstate(&state->xsave); +#endif + fpstate_init_fxstate(&state->fxsave); +} + static void fx_init(struct kvm_vcpu *vcpu) { - fpstate_init(&vcpu->arch.guest_fpu.state); + fpstate_init(&vcpu->arch.guest_fpu); if (boot_cpu_has(X86_FEATURE_XSAVES)) - vcpu->arch.guest_fpu.state.xsave.header.xcomp_bv = + vcpu->arch.guest_fpu.xsave.header.xcomp_bv = host_xcr0 | XSTATE_COMPACTION_ENABLED; /* @@ -7389,54 +5279,78 @@ static void fx_init(struct kvm_vcpu *vcpu) vcpu->arch.cr0 |= X86_CR0_ET; } -void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) +/* + * These must be called with preempt disabled. Returns + * 'true' if the FPU state is still intact and we can + * keep registers active. + * + * The legacy FNSAVE instruction cleared all FPU state + * unconditionally, so registers are essentially destroyed. + * Modern FPU state can be kept in registers, if there are + * no pending FP exceptions. + */ +static inline void fpu_fxsave(union fpu_state *fpu) { - if (vcpu->guest_fpu_loaded) - return; +#if 0 + if (likely(use_xsave())) { + copy_xregs_to_kernel(&fpu->state.xsave); + } +#endif - /* - * Restore all possible states in the guest, - * and assume host would use all available bits. - * Guest xcr0 would be loaded later. - */ - vcpu->guest_fpu_loaded = 1; - __kernel_fpu_begin(); - __copy_kernel_to_fpregs(&vcpu->arch.guest_fpu.state); - trace_kvm_fpu(1); +#ifdef _WIN64 + _fxsave64(&fpu->fxsave); +#else + _fxsave(&fpu->fxsave); +#endif } -void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) +static inline void fpu_fxstore(union fpu_state *fpu) { - if (!vcpu->guest_fpu_loaded) { - vcpu->fpu_counter = 0; +#if 0 + if (use_xsave()) { + copy_kernel_to_xregs(&fpstate->xsave, mask); return; } +#endif +#ifdef _WIN64 + _fxrstor64(&fpu->fxsave); +#else + _fxrstor(&fpu->fxsave); +#endif +} - vcpu->guest_fpu_loaded = 0; - copy_fpregs_to_fpstate(&vcpu->arch.guest_fpu); - __kernel_fpu_end(); - ++vcpu->stat.fpu_reload; - /* - * If using eager FPU mode, or if the guest is a frequent user - * of the FPU, just leave the FPU active for next time. - * Every 255 times fpu_counter rolls over to 0; a guest that uses - * the FPU in bursts will revert to loading it on demand. 
- */ - if (!use_eager_fpu()) { - if (++vcpu->fpu_counter < 5) - kvm_make_request(KVM_REQ_DEACTIVATE_FPU, vcpu); - } - trace_kvm_fpu(0); +void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) +{ + uint64_t efer; + + rdmsrl(MSR_EFER, efer); + wrmsrl(MSR_EFER, efer & ~EFER_FFXSR); + + fpu_fxsave(&vcpu->arch.host_fpu); + fpu_fxstore(&vcpu->arch.guest_fpu); + + if (efer & EFER_FFXSR) + wrmsrl(MSR_EFER, efer); } -void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu) +void kvm_save_guest_fpu(struct kvm_vcpu *vcpu) { - void *wbinvd_dirty_mask = vcpu->arch.wbinvd_dirty_mask; + uint64_t efer; - kvmclock_reset(vcpu); + rdmsrl(MSR_EFER, efer); + if (efer & EFER_FFXSR) + wrmsrl(MSR_EFER, efer & ~EFER_FFXSR); + fpu_fxsave(&vcpu->arch.guest_fpu); + fpu_fxstore(&vcpu->arch.host_fpu); + + if (efer & EFER_FFXSR) + wrmsrl(MSR_EFER, efer); +} + +void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu) +{ kvm_x86_ops->vcpu_free(vcpu); - free_cpumask_var(wbinvd_dirty_mask); } struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, @@ -7456,47 +5370,25 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) { - int r; - kvm_vcpu_mtrr_init(vcpu); - r = vcpu_load(vcpu); - if (r) - return r; kvm_vcpu_reset(vcpu, false); kvm_mmu_setup(vcpu); - vcpu_put(vcpu); - return r; + return 0; } void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) { struct msr_data msr; - struct kvm *kvm = vcpu->kvm; - if (vcpu_load(vcpu)) - return; msr.data = 0x0; msr.index = MSR_IA32_TSC; msr.host_initiated = true; kvm_write_tsc(vcpu, &msr); - vcpu_put(vcpu); - - if (!kvmclock_periodic_sync) - return; - - schedule_delayed_work(&kvm->arch.kvmclock_sync_work, - KVMCLOCK_SYNC_PERIOD); } void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) { - int r; - vcpu->arch.apf.msr_val = 0; - - r = vcpu_load(vcpu); - BUG_ON(r); kvm_mmu_unload(vcpu); - vcpu_put(vcpu); kvm_x86_ops->vcpu_free(vcpu); } @@ -7521,18 +5413,10 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) vcpu->arch.cr2 = 0; - kvm_make_request(KVM_REQ_EVENT, vcpu); - vcpu->arch.apf.msr_val = 0; - vcpu->arch.st.msr_val = 0; - - kvmclock_reset(vcpu); - - kvm_clear_async_pf_completion_queue(vcpu); - kvm_async_pf_hash_reset(vcpu); - vcpu->arch.apf.halted = false; + kvm_make_request(GVM_REQ_EVENT, vcpu); if (!init_event) { - kvm_pmu_reset(vcpu); + //kvm_pmu_reset(vcpu); vcpu->arch.smbase = 0x30000; } @@ -7556,99 +5440,12 @@ void kvm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector) int kvm_arch_hardware_enable(void) { - struct kvm *kvm; - struct kvm_vcpu *vcpu; - int i; - int ret; - u64 local_tsc; - u64 max_tsc = 0; - bool stable, backwards_tsc = false; - - kvm_shared_msr_cpu_online(); - ret = kvm_x86_ops->hardware_enable(); - if (ret != 0) - return ret; - - local_tsc = rdtsc(); - stable = !check_tsc_unstable(); - list_for_each_entry(kvm, &vm_list, vm_list) { - kvm_for_each_vcpu(i, vcpu, kvm) { - if (!stable && vcpu->cpu == smp_processor_id()) - kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); - if (stable && vcpu->arch.last_host_tsc > local_tsc) { - backwards_tsc = true; - if (vcpu->arch.last_host_tsc > max_tsc) - max_tsc = vcpu->arch.last_host_tsc; - } - } - } - - /* - * Sometimes, even reliable TSCs go backwards. This happens on - * platforms that reset TSC during suspend or hibernate actions, but - * maintain synchronization. We must compensate. Fortunately, we can - * detect that condition here, which happens early in CPU bringup, - * before any KVM threads can be running. 
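The EFER dance in kvm_load_guest_fpu()/kvm_save_guest_fpu() above exists because of Fast FXSAVE/FXRSTOR: with EFER.FFXSR set, FXSAVE/FXRSTOR executed at CPL 0 in 64-bit mode skip the XMM registers, which would leave part of the guest/host FPU image unswapped. The bit is therefore dropped for the duration of the swap and put back afterwards. The same guard written once as a pattern (a sketch only; the helper name is made up):

    /* Run the fxsave/fxrstor swap with a full 512-byte image even when the
     * host normally runs with EFER.FFXSR enabled. */
    static void with_full_fxsave(void (*swap)(struct kvm_vcpu *), struct kvm_vcpu *vcpu)
    {
        uint64_t efer;

        rdmsrl(MSR_EFER, efer);
        if (efer & EFER_FFXSR)
            wrmsrl(MSR_EFER, efer & ~EFER_FFXSR);

        swap(vcpu);            /* e.g. fxsave one image, fxrstor the other */

        if (efer & EFER_FFXSR)
            wrmsrl(MSR_EFER, efer);
    }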
Unfortunately, we can't - * bring the TSCs fully up to date with real time, as we aren't yet far - * enough into CPU bringup that we know how much real time has actually - * elapsed; our helper function, ktime_get_boot_ns() will be using boot - * variables that haven't been updated yet. - * - * So we simply find the maximum observed TSC above, then record the - * adjustment to TSC in each VCPU. When the VCPU later gets loaded, - * the adjustment will be applied. Note that we accumulate - * adjustments, in case multiple suspend cycles happen before some VCPU - * gets a chance to run again. In the event that no KVM threads get a - * chance to run, we will miss the entire elapsed period, as we'll have - * reset last_host_tsc, so VCPUs will not have the TSC adjusted and may - * loose cycle time. This isn't too big a deal, since the loss will be - * uniform across all VCPUs (not to mention the scenario is extremely - * unlikely). It is possible that a second hibernate recovery happens - * much faster than a first, causing the observed TSC here to be - * smaller; this would require additional padding adjustment, which is - * why we set last_host_tsc to the local tsc observed here. - * - * N.B. - this code below runs only on platforms with reliable TSC, - * as that is the only way backwards_tsc is set above. Also note - * that this runs for ALL vcpus, which is not a bug; all VCPUs should - * have the same delta_cyc adjustment applied if backwards_tsc - * is detected. Note further, this adjustment is only done once, - * as we reset last_host_tsc on all VCPUs to stop this from being - * called multiple times (one for each physical CPU bringup). - * - * Platforms with unreliable TSCs don't have to deal with this, they - * will be compensated by the logic in vcpu_load, which sets the TSC to - * catchup mode. This will catchup all VCPUs to real time, but cannot - * guarantee that they stay in perfect synchronization. - */ - if (backwards_tsc) { - u64 delta_cyc = max_tsc - local_tsc; - backwards_tsc_observed = true; - list_for_each_entry(kvm, &vm_list, vm_list) { - kvm_for_each_vcpu(i, vcpu, kvm) { - vcpu->arch.tsc_offset_adjustment += delta_cyc; - vcpu->arch.last_host_tsc = local_tsc; - kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu); - } - - /* - * We have to disable TSC offset matching.. if you were - * booting a VM while issuing an S4 host suspend.... - * you may have some problem. Solving this issue is - * left as an exercise to the reader. - */ - kvm->arch.last_tsc_nsec = 0; - kvm->arch.last_tsc_write = 0; - } - - } - return 0; + return kvm_x86_ops->hardware_enable(); } void kvm_arch_hardware_disable(void) { kvm_x86_ops->hardware_disable(); - drop_user_return_notifiers(); } int kvm_arch_hardware_setup(void) @@ -7659,20 +5456,6 @@ int kvm_arch_hardware_setup(void) if (r != 0) return r; - if (kvm_has_tsc_control) { - /* - * Make sure the user can only configure tsc_khz values that - * fit into a signed integer. - * A min value is not calculated needed because it will always - * be 1 on all machines. 
- */ - u64 max = min(0x7fffffffULL, - __scale_tsc(kvm_max_tsc_scaling_ratio, tsc_khz)); - kvm_max_guest_tsc_khz = max; - - kvm_default_tsc_scaling_ratio = 1ULL << kvm_tsc_scaling_ratio_frac_bits; - } - kvm_init_msr_list(); return 0; } @@ -7691,19 +5474,16 @@ bool kvm_vcpu_is_reset_bsp(struct kvm_vcpu *vcpu) { return vcpu->kvm->arch.bsp_vcpu_id == vcpu->vcpu_id; } -EXPORT_SYMBOL_GPL(kvm_vcpu_is_reset_bsp); bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu) { return (vcpu->arch.apic_base & MSR_IA32_APICBASE_BSP) != 0; } -struct static_key kvm_no_apic_vcpu __read_mostly; -EXPORT_SYMBOL_GPL(kvm_no_apic_vcpu); +int kvm_no_apic_vcpu = 1; int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) { - struct page *page; struct kvm *kvm; int r; @@ -7711,50 +5491,27 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) kvm = vcpu->kvm; vcpu->arch.apicv_active = kvm_x86_ops->get_enable_apicv(); - vcpu->arch.pv.pv_unhalted = false; vcpu->arch.emulate_ctxt.ops = &emulate_ops; if (!irqchip_in_kernel(kvm) || kvm_vcpu_is_reset_bsp(vcpu)) - vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; + vcpu->arch.mp_state = GVM_MP_STATE_RUNNABLE; else - vcpu->arch.mp_state = KVM_MP_STATE_UNINITIALIZED; - - page = alloc_page(GFP_KERNEL | __GFP_ZERO); - if (!page) { - r = -ENOMEM; - goto fail; - } - vcpu->arch.pio_data = page_address(page); + vcpu->arch.mp_state = GVM_MP_STATE_UNINITIALIZED; - kvm_set_tsc_khz(vcpu, max_tsc_khz); + vcpu->arch.pio_data = (void *)((size_t)vcpu->run + PAGE_SIZE); r = kvm_mmu_create(vcpu); if (r < 0) - goto fail_free_pio_data; + goto fail; if (irqchip_in_kernel(kvm)) { r = kvm_create_lapic(vcpu); if (r < 0) goto fail_mmu_destroy; - } else - static_key_slow_inc(&kvm_no_apic_vcpu); - - vcpu->arch.mce_banks = kzalloc(KVM_MAX_MCE_BANKS * sizeof(u64) * 4, - GFP_KERNEL); - if (!vcpu->arch.mce_banks) { - r = -ENOMEM; - goto fail_free_lapic; - } - vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS; - - if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask, GFP_KERNEL)) { - r = -ENOMEM; - goto fail_free_mce_banks; - } + } fx_init(vcpu); vcpu->arch.ia32_tsc_adjust_msr = 0x0; - vcpu->arch.pv_time_enabled = false; vcpu->arch.guest_supported_xcr0 = 0; vcpu->arch.guest_xstate_size = XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET; @@ -7763,23 +5520,14 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) vcpu->arch.pat = MSR_IA32_CR_PAT_DEFAULT; - kvm_async_pf_hash_reset(vcpu); - kvm_pmu_init(vcpu); + //kvm_pmu_init(vcpu); vcpu->arch.pending_external_vector = -1; - kvm_hv_vcpu_init(vcpu); - return 0; -fail_free_mce_banks: - kfree(vcpu->arch.mce_banks); -fail_free_lapic: - kvm_free_lapic(vcpu); fail_mmu_destroy: kvm_mmu_destroy(vcpu); -fail_free_pio_data: - free_page((unsigned long)vcpu->arch.pio_data); fail: return r; } @@ -7788,24 +5536,14 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) { int idx; - kvm_hv_vcpu_uninit(vcpu); - kvm_pmu_destroy(vcpu); - kfree(vcpu->arch.mce_banks); + //kvm_pmu_destroy(vcpu); kvm_free_lapic(vcpu); idx = srcu_read_lock(&vcpu->kvm->srcu); kvm_mmu_destroy(vcpu); srcu_read_unlock(&vcpu->kvm->srcu, idx); - free_page((unsigned long)vcpu->arch.pio_data); - if (!lapic_in_kernel(vcpu)) - static_key_slow_dec(&kvm_no_apic_vcpu); -} - -void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) -{ - kvm_x86_ops->sched_in(vcpu, cpu); } -int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) +int kvm_arch_init_vm(struct kvm *kvm, size_t type) { if (type) return -EINVAL; @@ -7813,24 +5551,12 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) INIT_HLIST_HEAD(&kvm->arch.mask_notifier_list); INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); 
INIT_LIST_HEAD(&kvm->arch.zapped_obsolete_pages); - INIT_LIST_HEAD(&kvm->arch.assigned_dev_head); - atomic_set(&kvm->arch.noncoherent_dma_count, 0); /* Reserve bit 0 of irq_sources_bitmap for userspace irq source */ - set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap); - /* Reserve bit 1 of irq_sources_bitmap for irqfd-resampler */ - set_bit(KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID, - &kvm->arch.irq_sources_bitmap); + set_bit(GVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap); raw_spin_lock_init(&kvm->arch.tsc_write_lock); mutex_init(&kvm->arch.apic_map_lock); - spin_lock_init(&kvm->arch.pvclock_gtod_sync_lock); - - kvm->arch.kvmclock_offset = -ktime_get_boot_ns(); - pvclock_update_vm_gtod_copy(kvm); - - INIT_DELAYED_WORK(&kvm->arch.kvmclock_update_work, kvmclock_update_fn); - INIT_DELAYED_WORK(&kvm->arch.kvmclock_sync_work, kvmclock_sync_fn); kvm_page_track_init(kvm); kvm_mmu_init_vm(kvm); @@ -7843,11 +5569,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu) { - int r; - r = vcpu_load(vcpu); - BUG_ON(r); kvm_mmu_unload(vcpu); - vcpu_put(vcpu); } static void kvm_free_vcpus(struct kvm *kvm) @@ -7859,7 +5581,6 @@ static void kvm_free_vcpus(struct kvm *kvm) * Unpin any mmu pages first. */ kvm_for_each_vcpu(i, vcpu, kvm) { - kvm_clear_async_pf_completion_queue(vcpu); kvm_unload_vcpu_mmu(vcpu); } kvm_for_each_vcpu(i, vcpu, kvm) @@ -7873,23 +5594,15 @@ static void kvm_free_vcpus(struct kvm *kvm) mutex_unlock(&kvm->lock); } -void kvm_arch_sync_events(struct kvm *kvm) -{ - cancel_delayed_work_sync(&kvm->arch.kvmclock_sync_work); - cancel_delayed_work_sync(&kvm->arch.kvmclock_update_work); - kvm_free_all_assigned_devices(kvm); - kvm_free_pit(kvm); -} - int __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size) { int i, r; - unsigned long hva; + size_t hva; struct kvm_memslots *slots = kvm_memslots(kvm); struct kvm_memory_slot *slot, old; /* Called with kvm->slots_lock held. 
*/ - if (WARN_ON(id >= KVM_MEM_SLOTS_NUM)) + if (WARN_ON(id >= GVM_MEM_SLOTS_NUM)) return -EINVAL; slot = id_to_memslot(slots, id); @@ -7913,7 +5626,7 @@ int __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size) } old = *slot; - for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { + for (i = 0; i < GVM_ADDRESS_SPACE_NUM; i++) { struct kvm_userspace_memory_region m; m.slot = id | (i << 16); @@ -7933,7 +5646,6 @@ int __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size) return 0; } -EXPORT_SYMBOL_GPL(__x86_set_memory_region); int x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size) { @@ -7945,11 +5657,10 @@ int x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size) return r; } -EXPORT_SYMBOL_GPL(x86_set_memory_region); void kvm_arch_destroy_vm(struct kvm *kvm) { - if (current->mm == kvm->mm) { + if (IoGetCurrentProcess() == kvm->process) { /* * Free memory regions allocated on behalf of userspace, * unless the the memory map has changed due to process exit @@ -7961,82 +5672,31 @@ void kvm_arch_destroy_vm(struct kvm *kvm) } if (kvm_x86_ops->vm_destroy) kvm_x86_ops->vm_destroy(kvm); - kvm_iommu_unmap_guest(kvm); kfree(kvm->arch.vpic); kfree(kvm->arch.vioapic); kvm_free_vcpus(kvm); - kvfree(rcu_dereference_check(kvm->arch.apic_map, 1)); + kvfree(rcu_dereference(kvm->arch.apic_map)); kvm_mmu_uninit_vm(kvm); + kvm_page_track_destroy(kvm); } void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free, struct kvm_memory_slot *dont) { - int i; - - for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) { - if (!dont || free->arch.rmap[i] != dont->arch.rmap[i]) { - kvfree(free->arch.rmap[i]); - free->arch.rmap[i] = NULL; - } - if (i == 0) - continue; - - if (!dont || free->arch.lpage_info[i - 1] != - dont->arch.lpage_info[i - 1]) { - kvfree(free->arch.lpage_info[i - 1]); - free->arch.lpage_info[i - 1] = NULL; - } + if (!dont || free->arch.rmap != dont->arch.rmap) { + kvfree(free->arch.rmap); + free->arch.rmap = NULL; } - kvm_page_track_free_memslot(free, dont); } int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot, - unsigned long npages) + size_t npages) { - int i; - - for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) { - struct kvm_lpage_info *linfo; - unsigned long ugfn; - int lpages; - int level = i + 1; - - lpages = gfn_to_index(slot->base_gfn + npages - 1, - slot->base_gfn, level) + 1; - - slot->arch.rmap[i] = - kvm_kvzalloc(lpages * sizeof(*slot->arch.rmap[i])); - if (!slot->arch.rmap[i]) - goto out_free; - if (i == 0) - continue; - - linfo = kvm_kvzalloc(lpages * sizeof(*linfo)); - if (!linfo) - goto out_free; - - slot->arch.lpage_info[i - 1] = linfo; - - if (slot->base_gfn & (KVM_PAGES_PER_HPAGE(level) - 1)) - linfo[0].disallow_lpage = 1; - if ((slot->base_gfn + npages) & (KVM_PAGES_PER_HPAGE(level) - 1)) - linfo[lpages - 1].disallow_lpage = 1; - ugfn = slot->userspace_addr >> PAGE_SHIFT; - /* - * If the gfn and userspace address are not aligned wrt each - * other, or if explicitly asked to, disable large page - * support for this slot - */ - if ((slot->base_gfn ^ ugfn) & (KVM_PAGES_PER_HPAGE(level) - 1) || - !kvm_largepages_enabled()) { - unsigned long j; - - for (j = 0; j < lpages; ++j) - linfo[j].disallow_lpage = 1; - } - } + slot->arch.rmap = + kvm_kvzalloc(npages * sizeof(*slot->arch.rmap)); + if (!slot->arch.rmap) + goto out_free; if (kvm_page_track_create_memslot(slot, npages)) goto out_free; @@ -8044,15 +5704,7 @@ int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot, return 0; out_free: - for (i = 0; i 
< KVM_NR_PAGE_SIZES; ++i) { - kvfree(slot->arch.rmap[i]); - slot->arch.rmap[i] = NULL; - if (i == 0) - continue; - - kvfree(slot->arch.lpage_info[i - 1]); - slot->arch.lpage_info[i - 1] = NULL; - } + kvfree(slot->arch.rmap); return -ENOMEM; } @@ -8077,7 +5729,7 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm, struct kvm_memory_slot *new) { /* Still write protect RO slot */ - if (new->flags & KVM_MEM_READONLY) { + if (new->flags & GVM_MEM_READONLY) { kvm_mmu_slot_remove_write_access(kvm, new); return; } @@ -8087,8 +5739,8 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm, * * kvm_x86_ops->slot_disable_log_dirty is called when: * - * - KVM_MR_CREATE with dirty logging is disabled - * - KVM_MR_FLAGS_ONLY with dirty logging is disabled in new flag + * - GVM_MR_CREATE with dirty logging is disabled + * - GVM_MR_FLAGS_ONLY with dirty logging is disabled in new flag * * The reason is, in case of PML, we need to set D-bit for any slots * with dirty logging disabled in order to eliminate unnecessary GPA @@ -8112,7 +5764,7 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm, * * See the comments in fast_page_fault(). */ - if (new->flags & KVM_MEM_LOG_DIRTY_PAGES) { + if (new->flags & GVM_MEM_LOG_DIRTY_PAGES) { if (kvm_x86_ops->slot_enable_log_dirty) kvm_x86_ops->slot_enable_log_dirty(kvm, new); else @@ -8149,22 +5801,22 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, * which can be collapsed into a single large-page spte. Later * page faults will create the large-page sptes. */ - if ((change != KVM_MR_DELETE) && - (old->flags & KVM_MEM_LOG_DIRTY_PAGES) && - !(new->flags & KVM_MEM_LOG_DIRTY_PAGES)) + if ((change != GVM_MR_DELETE) && + (old->flags & GVM_MEM_LOG_DIRTY_PAGES) && + !(new->flags & GVM_MEM_LOG_DIRTY_PAGES)) kvm_mmu_zap_collapsible_sptes(kvm, new); /* * Set up write protection and/or dirty logging for the new slot. * - * For KVM_MR_DELETE and KVM_MR_MOVE, the shadow pages of old slot have + * For GVM_MR_DELETE and GVM_MR_MOVE, the shadow pages of old slot have * been zapped so no dirty logging staff is needed for old slot. For - * KVM_MR_FLAGS_ONLY, the old slot is essentially the same one as the + * GVM_MR_FLAGS_ONLY, the old slot is essentially the same one as the * new and it's also covered when dealing with the new slot. * * FIXME: const-ify all uses of struct kvm_memory_slot. 
*/ - if (change != KVM_MR_DELETE) + if (change != GVM_MR_DELETE) kvm_mmu_slot_apply_flags(kvm, (struct kvm_memory_slot *) new); } @@ -8181,28 +5833,19 @@ void kvm_arch_flush_shadow_memslot(struct kvm *kvm, static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu) { - if (!list_empty_careful(&vcpu->async_pf.done)) - return true; - if (kvm_apic_has_events(vcpu)) return true; - if (vcpu->arch.pv.pv_unhalted) - return true; - if (atomic_read(&vcpu->arch.nmi_queued)) return true; - if (test_bit(KVM_REQ_SMI, &vcpu->requests)) + if (test_bit(GVM_REQ_SMI, &vcpu->requests)) return true; if (kvm_arch_interrupt_allowed(vcpu) && kvm_cpu_has_interrupt(vcpu)) return true; - if (kvm_hv_has_stimer_pending(vcpu)) - return true; - return false; } @@ -8224,295 +5867,45 @@ int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu) return kvm_x86_ops->interrupt_allowed(vcpu); } -unsigned long kvm_get_linear_rip(struct kvm_vcpu *vcpu) +size_t kvm_get_linear_rip(struct kvm_vcpu *vcpu) { if (is_64_bit_mode(vcpu)) return kvm_rip_read(vcpu); return (u32)(get_segment_base(vcpu, VCPU_SREG_CS) + kvm_rip_read(vcpu)); } -EXPORT_SYMBOL_GPL(kvm_get_linear_rip); -bool kvm_is_linear_rip(struct kvm_vcpu *vcpu, unsigned long linear_rip) +bool kvm_is_linear_rip(struct kvm_vcpu *vcpu, size_t linear_rip) { return kvm_get_linear_rip(vcpu) == linear_rip; } -EXPORT_SYMBOL_GPL(kvm_is_linear_rip); -unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu) +size_t kvm_get_rflags(struct kvm_vcpu *vcpu) { - unsigned long rflags; + size_t rflags; rflags = kvm_x86_ops->get_rflags(vcpu); - if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) + if (vcpu->guest_debug & GVM_GUESTDBG_SINGLESTEP) rflags &= ~X86_EFLAGS_TF; return rflags; } -EXPORT_SYMBOL_GPL(kvm_get_rflags); -static void __kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) +static void __kvm_set_rflags(struct kvm_vcpu *vcpu, size_t rflags) { - if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP && + if (vcpu->guest_debug & GVM_GUESTDBG_SINGLESTEP && kvm_is_linear_rip(vcpu, vcpu->arch.singlestep_rip)) rflags |= X86_EFLAGS_TF; kvm_x86_ops->set_rflags(vcpu, rflags); } -void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) +void kvm_set_rflags(struct kvm_vcpu *vcpu, size_t rflags) { __kvm_set_rflags(vcpu, rflags); - kvm_make_request(KVM_REQ_EVENT, vcpu); -} -EXPORT_SYMBOL_GPL(kvm_set_rflags); - -void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work) -{ - int r; - - if ((vcpu->arch.mmu.direct_map != work->arch.direct_map) || - work->wakeup_all) - return; - - r = kvm_mmu_reload(vcpu); - if (unlikely(r)) - return; - - if (!vcpu->arch.mmu.direct_map && - work->arch.cr3 != vcpu->arch.mmu.get_cr3(vcpu)) - return; - - vcpu->arch.mmu.page_fault(vcpu, work->gva, 0, true); -} - -static inline u32 kvm_async_pf_hash_fn(gfn_t gfn) -{ - return hash_32(gfn & 0xffffffff, order_base_2(ASYNC_PF_PER_VCPU)); -} - -static inline u32 kvm_async_pf_next_probe(u32 key) -{ - return (key + 1) & (roundup_pow_of_two(ASYNC_PF_PER_VCPU) - 1); -} - -static void kvm_add_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn) -{ - u32 key = kvm_async_pf_hash_fn(gfn); - - while (vcpu->arch.apf.gfns[key] != ~0) - key = kvm_async_pf_next_probe(key); - - vcpu->arch.apf.gfns[key] = gfn; -} - -static u32 kvm_async_pf_gfn_slot(struct kvm_vcpu *vcpu, gfn_t gfn) -{ - int i; - u32 key = kvm_async_pf_hash_fn(gfn); - - for (i = 0; i < roundup_pow_of_two(ASYNC_PF_PER_VCPU) && - (vcpu->arch.apf.gfns[key] != gfn && - vcpu->arch.apf.gfns[key] != ~0); i++) - key = kvm_async_pf_next_probe(key); - - return key; 
-} - -bool kvm_find_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn) -{ - return vcpu->arch.apf.gfns[kvm_async_pf_gfn_slot(vcpu, gfn)] == gfn; -} - -static void kvm_del_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn) -{ - u32 i, j, k; - - i = j = kvm_async_pf_gfn_slot(vcpu, gfn); - while (true) { - vcpu->arch.apf.gfns[i] = ~0; - do { - j = kvm_async_pf_next_probe(j); - if (vcpu->arch.apf.gfns[j] == ~0) - return; - k = kvm_async_pf_hash_fn(vcpu->arch.apf.gfns[j]); - /* - * k lies cyclically in ]i,j] - * | i.k.j | - * |....j i.k.| or |.k..j i...| - */ - } while ((i <= j) ? (i < k && k <= j) : (i < k || k <= j)); - vcpu->arch.apf.gfns[i] = vcpu->arch.apf.gfns[j]; - i = j; - } -} - -static int apf_put_user(struct kvm_vcpu *vcpu, u32 val) -{ - - return kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.apf.data, &val, - sizeof(val)); -} - -void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu, - struct kvm_async_pf *work) -{ - struct x86_exception fault; - - trace_kvm_async_pf_not_present(work->arch.token, work->gva); - kvm_add_async_pf_gfn(vcpu, work->arch.gfn); - - if (!(vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED) || - (vcpu->arch.apf.send_user_only && - kvm_x86_ops->get_cpl(vcpu) == 0)) - kvm_make_request(KVM_REQ_APF_HALT, vcpu); - else if (!apf_put_user(vcpu, KVM_PV_REASON_PAGE_NOT_PRESENT)) { - fault.vector = PF_VECTOR; - fault.error_code_valid = true; - fault.error_code = 0; - fault.nested_page_fault = false; - fault.address = work->arch.token; - kvm_inject_page_fault(vcpu, &fault); - } -} - -void kvm_arch_async_page_present(struct kvm_vcpu *vcpu, - struct kvm_async_pf *work) -{ - struct x86_exception fault; - - trace_kvm_async_pf_ready(work->arch.token, work->gva); - if (work->wakeup_all) - work->arch.token = ~0; /* broadcast wakeup */ - else - kvm_del_async_pf_gfn(vcpu, work->arch.gfn); - - if ((vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED) && - !apf_put_user(vcpu, KVM_PV_REASON_PAGE_READY)) { - fault.vector = PF_VECTOR; - fault.error_code_valid = true; - fault.error_code = 0; - fault.nested_page_fault = false; - fault.address = work->arch.token; - kvm_inject_page_fault(vcpu, &fault); - } - vcpu->arch.apf.halted = false; - vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; -} - -bool kvm_arch_can_inject_async_page_present(struct kvm_vcpu *vcpu) -{ - if (!(vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED)) - return true; - else - return !kvm_event_needs_reinjection(vcpu) && - kvm_x86_ops->interrupt_allowed(vcpu); -} - -void kvm_arch_start_assignment(struct kvm *kvm) -{ - atomic_inc(&kvm->arch.assigned_device_count); -} -EXPORT_SYMBOL_GPL(kvm_arch_start_assignment); - -void kvm_arch_end_assignment(struct kvm *kvm) -{ - atomic_dec(&kvm->arch.assigned_device_count); -} -EXPORT_SYMBOL_GPL(kvm_arch_end_assignment); - -bool kvm_arch_has_assigned_device(struct kvm *kvm) -{ - return atomic_read(&kvm->arch.assigned_device_count); -} -EXPORT_SYMBOL_GPL(kvm_arch_has_assigned_device); - -void kvm_arch_register_noncoherent_dma(struct kvm *kvm) -{ - atomic_inc(&kvm->arch.noncoherent_dma_count); -} -EXPORT_SYMBOL_GPL(kvm_arch_register_noncoherent_dma); - -void kvm_arch_unregister_noncoherent_dma(struct kvm *kvm) -{ - atomic_dec(&kvm->arch.noncoherent_dma_count); -} -EXPORT_SYMBOL_GPL(kvm_arch_unregister_noncoherent_dma); - -bool kvm_arch_has_noncoherent_dma(struct kvm *kvm) -{ - return atomic_read(&kvm->arch.noncoherent_dma_count); -} -EXPORT_SYMBOL_GPL(kvm_arch_has_noncoherent_dma); - -bool kvm_arch_has_irq_bypass(void) -{ - return kvm_x86_ops->update_pi_irte != NULL; -} - -int 
kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons, - struct irq_bypass_producer *prod) -{ - struct kvm_kernel_irqfd *irqfd = - container_of(cons, struct kvm_kernel_irqfd, consumer); - - irqfd->producer = prod; - - return kvm_x86_ops->update_pi_irte(irqfd->kvm, - prod->irq, irqfd->gsi, 1); -} - -void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons, - struct irq_bypass_producer *prod) -{ - int ret; - struct kvm_kernel_irqfd *irqfd = - container_of(cons, struct kvm_kernel_irqfd, consumer); - - WARN_ON(irqfd->producer != prod); - irqfd->producer = NULL; - - /* - * When producer of consumer is unregistered, we change back to - * remapped mode, so we can re-use the current implementation - * when the irq is masked/disabled or the consumer side (KVM - * int this case doesn't want to receive the interrupts. - */ - ret = kvm_x86_ops->update_pi_irte(irqfd->kvm, prod->irq, irqfd->gsi, 0); - if (ret) - printk(KERN_INFO "irq bypass consumer (token %p) unregistration" - " fails: %d\n", irqfd->consumer.token, ret); -} - -int kvm_arch_update_irqfd_routing(struct kvm *kvm, unsigned int host_irq, - uint32_t guest_irq, bool set) -{ - if (!kvm_x86_ops->update_pi_irte) - return -EINVAL; - - return kvm_x86_ops->update_pi_irte(kvm, host_irq, guest_irq, set); + kvm_make_request(GVM_REQ_EVENT, vcpu); } bool kvm_vector_hashing_enabled(void) { return vector_hashing; } -EXPORT_SYMBOL_GPL(kvm_vector_hashing_enabled); - -EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit); -EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_fast_mmio); -EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq); -EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_page_fault); -EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_msr); -EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_cr); -EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmrun); -EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmexit); -EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmexit_inject); -EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intr_vmexit); -EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_invlpga); -EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_skinit); -EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intercepts); -EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_write_tsc_offset); -EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_ple_window); -EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pml_full); -EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pi_irte_update); -EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_unaccelerated_access); -EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_incomplete_ipi); + diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index e8ff3e4..0b6b308 100644..100755 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -1,9 +1,14 @@ -#ifndef ARCH_X86_KVM_X86_H -#define ARCH_X86_KVM_X86_H +/* + * Copyright 2019 Google LLC + */ + +#ifndef ARCH_X86_GVM_X86_H +#define ARCH_X86_GVM_X86_H #include <linux/kvm_host.h> -#include <asm/pvclock.h> +#include <gvm_types.h> #include "kvm_cache_regs.h" +#include <asm/msr-index.h> #define MSR_IA32_CR_PAT_DEFAULT 0x0007040600070406ULL @@ -67,17 +72,17 @@ static inline bool mmu_is_nested(struct kvm_vcpu *vcpu) static inline int is_pae(struct kvm_vcpu *vcpu) { - return kvm_read_cr4_bits(vcpu, X86_CR4_PAE); + return (int)kvm_read_cr4_bits(vcpu, X86_CR4_PAE); } static inline int is_pse(struct kvm_vcpu *vcpu) { - return kvm_read_cr4_bits(vcpu, X86_CR4_PSE); + return (int)kvm_read_cr4_bits(vcpu, X86_CR4_PSE); } static inline int is_paging(struct kvm_vcpu *vcpu) { - return likely(kvm_read_cr0_bits(vcpu, X86_CR0_PG)); + return likely((int)kvm_read_cr0_bits(vcpu, X86_CR0_PG)); } static inline u32 bit(int bitno) @@ -113,7 +118,7 @@ static inline void vcpu_clear_mmio_info(struct kvm_vcpu *vcpu, gva_t gva) 
vcpu->arch.mmio_gva = 0; } -static inline bool vcpu_match_mmio_gva(struct kvm_vcpu *vcpu, unsigned long gva) +static inline bool vcpu_match_mmio_gva(struct kvm_vcpu *vcpu, size_t gva) { if (vcpu_match_mmio_gen(vcpu) && vcpu->arch.mmio_gva && vcpu->arch.mmio_gva == (gva & PAGE_MASK)) @@ -131,21 +136,21 @@ static inline bool vcpu_match_mmio_gpa(struct kvm_vcpu *vcpu, gpa_t gpa) return false; } -static inline unsigned long kvm_register_readl(struct kvm_vcpu *vcpu, +static inline size_t kvm_register_readl(struct kvm_vcpu *vcpu, enum kvm_reg reg) { - unsigned long val = kvm_register_read(vcpu, reg); + size_t val = kvm_register_read(vcpu, reg); return is_64_bit_mode(vcpu) ? val : (u32)val; } static inline void kvm_register_writel(struct kvm_vcpu *vcpu, enum kvm_reg reg, - unsigned long val) + size_t val) { if (!is_64_bit_mode(vcpu)) val = (u32)val; - return kvm_register_write(vcpu, reg, val); + kvm_register_write(vcpu, reg, val); } static inline bool kvm_check_has_quirk(struct kvm *kvm, u64 quirk) @@ -178,7 +183,7 @@ bool kvm_mtrr_check_gfn_range_consistency(struct kvm_vcpu *vcpu, gfn_t gfn, int page_num); bool kvm_vector_hashing_enabled(void); -#define KVM_SUPPORTED_XCR0 (XFEATURE_MASK_FP | XFEATURE_MASK_SSE \ +#define GVM_SUPPORTED_XCR0 (XFEATURE_MASK_FP | XFEATURE_MASK_SSE \ | XFEATURE_MASK_YMM | XFEATURE_MASK_BNDREGS \ | XFEATURE_MASK_BNDCSR | XFEATURE_MASK_AVX512 \ | XFEATURE_MASK_PKRU) @@ -190,13 +195,7 @@ extern unsigned int min_timer_period_us; extern unsigned int lapic_timer_advance_ns; -extern struct static_key kvm_no_apic_vcpu; - -static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec) -{ - return pvclock_scale_delta(nsec, vcpu->arch.virtual_tsc_mult, - vcpu->arch.virtual_tsc_shift); -} +extern int kvm_no_apic_vcpu; /* Same "calling convention" as do_div: * - divide (n << 32) by base |
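
Editor's note: the FPU handling introduced earlier in this patch (kvm_load_guest_fpu / kvm_save_guest_fpu) reduces to a plain FXSAVE/FXRSTOR swap of host and guest state, with EFER.FFXSR temporarily cleared so the "fast FXSAVE" optimization cannot skip the XMM registers. The sketch below is illustrative only, not the driver's actual code; it assumes the documented MSVC kernel intrinsics __readmsr/__writemsr and _fxsave64/_fxrstor64, and the fx_area type and constants are simplified stand-ins for the driver's own definitions.

    /* Illustrative sketch only -- not the driver's actual code. */
    #include <intrin.h>

    #define MSR_EFER   0xC0000080u          /* architectural EFER MSR number   */
    #define EFER_FFXSR (1ULL << 14)         /* fast FXSAVE/FXRSTOR enable bit  */

    /* 512-byte FXSAVE image; FXSAVE/FXRSTOR require 16-byte alignment. */
    typedef __declspec(align(16)) struct { unsigned char bytes[512]; } fx_area;

    /*
     * Swap the full FPU/SSE state: save the current state into 'out'
     * (e.g. the host image) and restore 'in' (e.g. the guest image),
     * masking EFER.FFXSR so the XMM registers are always included.
     */
    static void fx_swap(fx_area *out, fx_area *in)
    {
        unsigned __int64 efer = __readmsr(MSR_EFER);

        if (efer & EFER_FFXSR)
            __writemsr(MSR_EFER, efer & ~EFER_FFXSR);

        _fxsave64(out);     /* save the currently loaded FPU state  */
        _fxrstor64(in);     /* load the other context's FPU state   */

        if (efer & EFER_FFXSR)
            __writemsr(MSR_EFER, efer);     /* restore the original EFER */
    }

On this reading, kvm_load_guest_fpu is fx_swap(&host_fpu, &guest_fpu) and kvm_save_guest_fpu is the reverse; the patch performs the two halves separately only so the EFER read-modify-write brackets each direction.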