author | Haitao Shan <hshan@google.com> | 2019-11-06 13:39:17 -0800
---|---|---
committer | Haitao Shan <hshan@google.com> | 2019-11-06 13:39:17 -0800
commit | 5e1e4edf91bbd320dfd914b210a2f47d5533ac50 | (patch)
tree | 54207657ffc86c9fc508046324aeded789325b34 |
parent | d9c0b9070fc0aca672f828bc7c34c093fca059a0 | (diff)
download | aehd-5e1e4edf91bbd320dfd914b210a2f47d5533ac50.tar.gz |
Android Emulator Hypervisor Driver for AMD Processors 1.0
122 files changed, 17220 insertions, 29512 deletions
diff --git a/.gitignore b/.gitignore new file mode 100755 index 0000000..e877789 --- /dev/null +++ b/.gitignore @@ -0,0 +1,5 @@ +__asm.inc +gvm/.vs/* +gvm/DriverTest/* +**/x64/* +cscope* diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 0000000..654a071 --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,28 @@ +# How to Contribute + +We'd love to accept your patches and contributions to this project. There are +just a few small guidelines you need to follow. + +## Contributor License Agreement + +Contributions to this project must be accompanied by a Contributor License +Agreement. You (or your employer) retain the copyright to your contribution; +this simply gives us permission to use and redistribute your contributions as +part of the project. Head over to <https://cla.developers.google.com/> to see +your current agreements on file or to sign a new one. + +You generally only need to submit a CLA once, so if you've already submitted one +(even if it was for a different project), you probably don't need to do it +again. + +## Code reviews + +All submissions, including submissions by project members, require review. We +use GitHub pull requests for this purpose. Consult +[GitHub Help](https://help.github.com/articles/about-pull-requests/) for more +information on using pull requests. + +## Community Guidelines + +This project follows [Google's Open Source Community +Guidelines](https://opensource.google/conduct/). @@ -0,0 +1,339 @@ + GNU GENERAL PUBLIC LICENSE + Version 2, June 1991 + + Copyright (C) 1989, 1991 Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The licenses for most software are designed to take away your +freedom to share and change it. By contrast, the GNU General Public +License is intended to guarantee your freedom to share and change free +software--to make sure the software is free for all its users. This +General Public License applies to most of the Free Software +Foundation's software and to any other program whose authors commit to +using it. (Some other Free Software Foundation software is covered by +the GNU Lesser General Public License instead.) You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +this service if you wish), that you receive source code or can get it +if you want it, that you can change the software or use pieces of it +in new free programs; and that you know you can do these things. + + To protect your rights, we need to make restrictions that forbid +anyone to deny you these rights or to ask you to surrender the rights. +These restrictions translate to certain responsibilities for you if you +distribute copies of the software, or if you modify it. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must give the recipients all the rights that +you have. You must make sure that they, too, receive or can get the +source code. And you must show them these terms so they know their +rights. + + We protect your rights with two steps: (1) copyright the software, and +(2) offer you this license which gives you legal permission to copy, +distribute and/or modify the software. 
+ + Also, for each author's protection and ours, we want to make certain +that everyone understands that there is no warranty for this free +software. If the software is modified by someone else and passed on, we +want its recipients to know that what they have is not the original, so +that any problems introduced by others will not reflect on the original +authors' reputations. + + Finally, any free program is threatened constantly by software +patents. We wish to avoid the danger that redistributors of a free +program will individually obtain patent licenses, in effect making the +program proprietary. To prevent this, we have made it clear that any +patent must be licensed for everyone's free use or not licensed at all. + + The precise terms and conditions for copying, distribution and +modification follow. + + GNU GENERAL PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. This License applies to any program or other work which contains +a notice placed by the copyright holder saying it may be distributed +under the terms of this General Public License. The "Program", below, +refers to any such program or work, and a "work based on the Program" +means either the Program or any derivative work under copyright law: +that is to say, a work containing the Program or a portion of it, +either verbatim or with modifications and/or translated into another +language. (Hereinafter, translation is included without limitation in +the term "modification".) Each licensee is addressed as "you". + +Activities other than copying, distribution and modification are not +covered by this License; they are outside its scope. The act of +running the Program is not restricted, and the output from the Program +is covered only if its contents constitute a work based on the +Program (independent of having been made by running the Program). +Whether that is true depends on what the Program does. + + 1. You may copy and distribute verbatim copies of the Program's +source code as you receive it, in any medium, provided that you +conspicuously and appropriately publish on each copy an appropriate +copyright notice and disclaimer of warranty; keep intact all the +notices that refer to this License and to the absence of any warranty; +and give any other recipients of the Program a copy of this License +along with the Program. + +You may charge a fee for the physical act of transferring a copy, and +you may at your option offer warranty protection in exchange for a fee. + + 2. You may modify your copy or copies of the Program or any portion +of it, thus forming a work based on the Program, and copy and +distribute such modifications or work under the terms of Section 1 +above, provided that you also meet all of these conditions: + + a) You must cause the modified files to carry prominent notices + stating that you changed the files and the date of any change. + + b) You must cause any work that you distribute or publish, that in + whole or in part contains or is derived from the Program or any + part thereof, to be licensed as a whole at no charge to all third + parties under the terms of this License. 
+ + c) If the modified program normally reads commands interactively + when run, you must cause it, when started running for such + interactive use in the most ordinary way, to print or display an + announcement including an appropriate copyright notice and a + notice that there is no warranty (or else, saying that you provide + a warranty) and that users may redistribute the program under + these conditions, and telling the user how to view a copy of this + License. (Exception: if the Program itself is interactive but + does not normally print such an announcement, your work based on + the Program is not required to print an announcement.) + +These requirements apply to the modified work as a whole. If +identifiable sections of that work are not derived from the Program, +and can be reasonably considered independent and separate works in +themselves, then this License, and its terms, do not apply to those +sections when you distribute them as separate works. But when you +distribute the same sections as part of a whole which is a work based +on the Program, the distribution of the whole must be on the terms of +this License, whose permissions for other licensees extend to the +entire whole, and thus to each and every part regardless of who wrote it. + +Thus, it is not the intent of this section to claim rights or contest +your rights to work written entirely by you; rather, the intent is to +exercise the right to control the distribution of derivative or +collective works based on the Program. + +In addition, mere aggregation of another work not based on the Program +with the Program (or with a work based on the Program) on a volume of +a storage or distribution medium does not bring the other work under +the scope of this License. + + 3. You may copy and distribute the Program (or a work based on it, +under Section 2) in object code or executable form under the terms of +Sections 1 and 2 above provided that you also do one of the following: + + a) Accompany it with the complete corresponding machine-readable + source code, which must be distributed under the terms of Sections + 1 and 2 above on a medium customarily used for software interchange; or, + + b) Accompany it with a written offer, valid for at least three + years, to give any third party, for a charge no more than your + cost of physically performing source distribution, a complete + machine-readable copy of the corresponding source code, to be + distributed under the terms of Sections 1 and 2 above on a medium + customarily used for software interchange; or, + + c) Accompany it with the information you received as to the offer + to distribute corresponding source code. (This alternative is + allowed only for noncommercial distribution and only if you + received the program in object code or executable form with such + an offer, in accord with Subsection b above.) + +The source code for a work means the preferred form of the work for +making modifications to it. For an executable work, complete source +code means all the source code for all modules it contains, plus any +associated interface definition files, plus the scripts used to +control compilation and installation of the executable. However, as a +special exception, the source code distributed need not include +anything that is normally distributed (in either source or binary +form) with the major components (compiler, kernel, and so on) of the +operating system on which the executable runs, unless that component +itself accompanies the executable. 
+ +If distribution of executable or object code is made by offering +access to copy from a designated place, then offering equivalent +access to copy the source code from the same place counts as +distribution of the source code, even though third parties are not +compelled to copy the source along with the object code. + + 4. You may not copy, modify, sublicense, or distribute the Program +except as expressly provided under this License. Any attempt +otherwise to copy, modify, sublicense or distribute the Program is +void, and will automatically terminate your rights under this License. +However, parties who have received copies, or rights, from you under +this License will not have their licenses terminated so long as such +parties remain in full compliance. + + 5. You are not required to accept this License, since you have not +signed it. However, nothing else grants you permission to modify or +distribute the Program or its derivative works. These actions are +prohibited by law if you do not accept this License. Therefore, by +modifying or distributing the Program (or any work based on the +Program), you indicate your acceptance of this License to do so, and +all its terms and conditions for copying, distributing or modifying +the Program or works based on it. + + 6. Each time you redistribute the Program (or any work based on the +Program), the recipient automatically receives a license from the +original licensor to copy, distribute or modify the Program subject to +these terms and conditions. You may not impose any further +restrictions on the recipients' exercise of the rights granted herein. +You are not responsible for enforcing compliance by third parties to +this License. + + 7. If, as a consequence of a court judgment or allegation of patent +infringement or for any other reason (not limited to patent issues), +conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot +distribute so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you +may not distribute the Program at all. For example, if a patent +license would not permit royalty-free redistribution of the Program by +all those who receive copies directly or indirectly through you, then +the only way you could satisfy both it and this License would be to +refrain entirely from distribution of the Program. + +If any portion of this section is held invalid or unenforceable under +any particular circumstance, the balance of the section is intended to +apply and the section as a whole is intended to apply in other +circumstances. + +It is not the purpose of this section to induce you to infringe any +patents or other property right claims or to contest validity of any +such claims; this section has the sole purpose of protecting the +integrity of the free software distribution system, which is +implemented by public license practices. Many people have made +generous contributions to the wide range of software distributed +through that system in reliance on consistent application of that +system; it is up to the author/donor to decide if he or she is willing +to distribute software through any other system and a licensee cannot +impose that choice. + +This section is intended to make thoroughly clear what is believed to +be a consequence of the rest of this License. + + 8. 
If the distribution and/or use of the Program is restricted in +certain countries either by patents or by copyrighted interfaces, the +original copyright holder who places the Program under this License +may add an explicit geographical distribution limitation excluding +those countries, so that distribution is permitted only in or among +countries not thus excluded. In such case, this License incorporates +the limitation as if written in the body of this License. + + 9. The Free Software Foundation may publish revised and/or new versions +of the General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + +Each version is given a distinguishing version number. If the Program +specifies a version number of this License which applies to it and "any +later version", you have the option of following the terms and conditions +either of that version or of any later version published by the Free +Software Foundation. If the Program does not specify a version number of +this License, you may choose any version ever published by the Free Software +Foundation. + + 10. If you wish to incorporate parts of the Program into other free +programs whose distribution conditions are different, write to the author +to ask for permission. For software which is copyrighted by the Free +Software Foundation, write to the Free Software Foundation; we sometimes +make exceptions for this. Our decision will be guided by the two goals +of preserving the free status of all derivatives of our free software and +of promoting the sharing and reuse of software generally. + + NO WARRANTY + + 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY +FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN +OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES +PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED +OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS +TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE +PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, +REPAIR OR CORRECTION. + + 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR +REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, +INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING +OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED +TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY +YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER +PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE +POSSIBILITY OF SUCH DAMAGES. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +convey the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. 
+ + <one line to give the program's name and a brief idea of what it does.> + Copyright (C) <year> <name of author> + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + +Also add information on how to contact you by electronic and paper mail. + +If the program is interactive, make it output a short notice like this +when it starts in an interactive mode: + + Gnomovision version 69, Copyright (C) year name of author + Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License. Of course, the commands you use may +be called something other than `show w' and `show c'; they could even be +mouse-clicks or menu items--whatever suits your program. + +You should also get your employer (if you work as a programmer) or your +school, if any, to sign a "copyright disclaimer" for the program, if +necessary. Here is a sample; alter the names: + + Yoyodyne, Inc., hereby disclaims all copyright interest in the program + `Gnomovision' (which makes passes at compilers) written by James Hacker. + + <signature of Ty Coon>, 1 April 1989 + Ty Coon, President of Vice + +This General Public License does not permit incorporating your program into +proprietary programs. If your program is a subroutine library, you may +consider it more useful to permit linking proprietary applications with the +library. If this is what you want to do, use the GNU Lesser General +Public License instead of this License. diff --git a/README.md b/README.md new file mode 100644 index 0000000..4f4b3ea --- /dev/null +++ b/README.md @@ -0,0 +1,33 @@ +# Android Emulator Hypervisor Driver for AMD Processors +Android Emulator Hypervisor Driver for AMD Processors is a hypervisor to +accelerate [Android Emulator][android-studio]. It is made by porting KVM to +Windows (Windows 7 or later, 64bit). + +Android Emulator Hypervisor Driver for AMD Processors runs as a Windows driver. +User space support for Android Emulator Hypervisor Driver for AMD Processors is +available from Android Emulator. + +## Downloads +Android Emulator Hypervisor Driver for AMD Processors is released through +[android-studio]. + +## Contributing +If you would like to contribute a patch to the code base, please read +[these guidelines](CONTRIBUTING.md). + +## Reporting an Issue +You are welcome to file an issue at [Issuetracker]. Please remember to supply +your OS information, CPU model in addition to details on the issue. + +## Notes +At the time of open source releasing, user space support from [qemu] is NOT +offered. But it should be straight forward if you need to port from Android +Emulator to qemu. 
+ +As its name suggests, Android Emulator Hypervisor Driver for AMD Processors is +developed and tested on AMD platform. We only make our best effort in keeping +Intel Processor support. + +[android-studio]: https://developer.android.com/studio/index.html +[qemu]: https://www.qemu.org/ +[Issuetracker]: https://issuetracker.google.com/issues?q=componentid:192727 @@ -0,0 +1,170 @@ +/* + * Copyright 2019 Google LLC + + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 2 as published by the Free Software Foundation. + + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#pragma once +// assembly function declaration +#include <gvm_types.h> + +extern u16 gvm_read_ldt(void); +extern void gvm_load_ldt(u16 sel); +extern void load_TR_desc(void); +extern u16 gvm_read_tr(void); +extern void gvm_load_tr(u16 sel); + +#pragma warning(disable : 4210) +#define savesegment(seg, value) \ +extern u16 save_##seg ##_segment(void); \ +value = save_##seg ##_segment() + +#define loadsegment(seg, value) \ +extern u16 load_##seg ##_segment(u16 sel); \ +load_##seg ##_segment(value) + +extern void load_gs_index(u16 value); +extern void __asm_vmx_vcpu_run(void *vmx); +extern void __asm_vmx_handle_external_intr(size_t entry); + +extern void __asm_svm_vcpu_run(void *svm); + +extern void __int2(void); +extern void __int12(void); + +//debug register +extern u64 __read_dr0(); +extern u64 __read_dr1(); +extern u64 __read_dr2(); +extern u64 __read_dr3(); +extern u64 __read_dr6(); +extern u64 __read_dr7(); +extern void __write_dr0(u64 val); +extern void __write_dr1(u64 val); +extern void __write_dr2(u64 val); +extern void __write_dr3(u64 val); +extern void __write_dr6(u64 val); +extern void __write_dr7(u64 val); + +#define dr_read_case(regno) \ +case regno: \ + val = __read_dr##regno(); \ + break + +static __forceinline u64 __get_debugreg(int regno) +{ + u64 val = 0; + + switch (regno) { + dr_read_case(0); + dr_read_case(1); + dr_read_case(2); + dr_read_case(3); + dr_read_case(6); + dr_read_case(7); + default: + BUG(); + } + return val; +} +#define get_debugreg(a, b) a = __get_debugreg(b) + +#define dr_write_case(regno) \ +case regno: \ + __write_dr##regno(val); \ + break + +static __forceinline void set_debugreg(u64 val, int regno) +{ + switch (regno) { + dr_write_case(0); + dr_write_case(1); + dr_write_case(2); + dr_write_case(3); + dr_write_case(6); + dr_write_case(7); + default: + BUG(); + } +} + +//mmx +extern void __asm_save_mm0(u64 *data); +extern void __asm_save_mm1(u64 *data); +extern void __asm_save_mm2(u64 *data); +extern void __asm_save_mm3(u64 *data); +extern void __asm_save_mm4(u64 *data); +extern void __asm_save_mm5(u64 *data); +extern void __asm_save_mm6(u64 *data); +extern void __asm_save_mm7(u64 *data); +extern void __asm_store_mm0(u64 *data); +extern void __asm_store_mm1(u64 *data); +extern void __asm_store_mm2(u64 *data); +extern void __asm_store_mm3(u64 *data); +extern void __asm_store_mm4(u64 *data); +extern void __asm_store_mm5(u64 *data); +extern void __asm_store_mm6(u64 *data); +extern void __asm_store_mm7(u64 *data); + +//fpu +extern void __fninit(void); +extern void __fnstcw(u16 *fcw); +extern void __fnstsw(u16 *fcw); +extern void __fwait(void); +extern void __clts(void); + +//bswap +extern void __bswap64(u64 
*val); +extern void __bswap32(u32 *val); + +#define read_cr0 __readcr0 +#define read_cr3 __readcr3 + +#define stts() __writecr0(__readcr0() | X86_CR0_TS) + +#define load_gdt(pdesc) _lgdt((void *)pdesc) +#define load_idt(pdesc) __lidt((void *)pdesc) + +static __forceinline size_t cr4_read_shadow(void) +{ + return __readcr4(); +} + +static __forceinline void cr4_set_bits(size_t mask) +{ + size_t cr4 = __readcr4(); + + if ((cr4 | mask) != cr4) + { + cr4 |= mask; + __writecr4(cr4); + } +} + +static __forceinline void cr4_clear_bits(size_t mask) +{ + size_t cr4 = __readcr4(); + + if ((cr4 & ~mask) != cr4) + { + cr4 &= ~mask; + __writecr4(cr4); + } +} + +static __forceinline void native_store_gdt(void *gdt) +{ + _sgdt(gdt); +} + +static __forceinline void native_store_idt(void *idt) +{ + __sidt(idt); +} diff --git a/arch/x86/include/asm/apicdef.h b/arch/x86/include/asm/apicdef.h new file mode 100755 index 0000000..439aca7 --- /dev/null +++ b/arch/x86/include/asm/apicdef.h @@ -0,0 +1,454 @@ +/* + * Copyright 2019 Google LLC + */ + +#ifndef _ASM_X86_APICDEF_H +#define _ASM_X86_APICDEF_H + +#include <gvm_types.h> + +/* + * Constants for various Intel APICs. (local APIC, IOAPIC, etc.) + * + * Alan Cox <Alan.Cox@linux.org>, 1995. + * Ingo Molnar <mingo@redhat.com>, 1999, 2000 + */ + +#define IO_APIC_DEFAULT_PHYS_BASE 0xfec00000 +#define APIC_DEFAULT_PHYS_BASE 0xfee00000 + +/* + * This is the IO-APIC register space as specified + * by Intel docs: + */ +#define IO_APIC_SLOT_SIZE 1024 + +#define APIC_ID 0x20 + +#define APIC_LVR 0x30 +#define APIC_LVR_MASK 0xFF00FF +#define APIC_LVR_DIRECTED_EOI (1 << 24) +#define GET_APIC_VERSION(x) ((x) & 0xFFu) +#define GET_APIC_MAXLVT(x) (((x) >> 16) & 0xFFu) +#ifdef CONFIG_X86_32 +# define APIC_INTEGRATED(x) ((x) & 0xF0u) +#else +# define APIC_INTEGRATED(x) (1) +#endif +#define APIC_XAPIC(x) ((x) >= 0x14) +#define APIC_EXT_SPACE(x) ((x) & 0x80000000) +#define APIC_TASKPRI 0x80 +#define APIC_TPRI_MASK 0xFFu +#define APIC_ARBPRI 0x90 +#define APIC_ARBPRI_MASK 0xFFu +#define APIC_PROCPRI 0xA0 +#define APIC_EOI 0xB0 +#define APIC_EOI_ACK 0x0 /* Docs say 0 for future compat. */ +#define APIC_RRR 0xC0 +#define APIC_LDR 0xD0 +#define APIC_LDR_MASK (0xFFu << 24) +#define GET_APIC_LOGICAL_ID(x) (((x) >> 24) & 0xFFu) +#define SET_APIC_LOGICAL_ID(x) (((x) << 24)) +#define APIC_ALL_CPUS 0xFFu +#define APIC_DFR 0xE0 +#define APIC_DFR_CLUSTER 0x0FFFFFFFul +#define APIC_DFR_FLAT 0xFFFFFFFFul +#define APIC_SPIV 0xF0 +#define APIC_SPIV_DIRECTED_EOI (1 << 12) +#define APIC_SPIV_FOCUS_DISABLED (1 << 9) +#define APIC_SPIV_APIC_ENABLED (1 << 8) +#define APIC_ISR 0x100 +#define APIC_ISR_NR 0x8 /* Number of 32 bit ISR registers. 
*/ +#define APIC_TMR 0x180 +#define APIC_IRR 0x200 +#define APIC_ESR 0x280 +#define APIC_ESR_SEND_CS 0x00001 +#define APIC_ESR_RECV_CS 0x00002 +#define APIC_ESR_SEND_ACC 0x00004 +#define APIC_ESR_RECV_ACC 0x00008 +#define APIC_ESR_SENDILL 0x00020 +#define APIC_ESR_RECVILL 0x00040 +#define APIC_ESR_ILLREGA 0x00080 +#define APIC_LVTCMCI 0x2f0 +#define APIC_ICR 0x300 +#define APIC_DEST_SELF 0x40000 +#define APIC_DEST_ALLINC 0x80000 +#define APIC_DEST_ALLBUT 0xC0000 +#define APIC_ICR_RR_MASK 0x30000 +#define APIC_ICR_RR_INVALID 0x00000 +#define APIC_ICR_RR_INPROG 0x10000 +#define APIC_ICR_RR_VALID 0x20000 +#define APIC_INT_LEVELTRIG 0x08000 +#define APIC_INT_ASSERT 0x04000 +#define APIC_ICR_BUSY 0x01000 +#define APIC_DEST_LOGICAL 0x00800 +#define APIC_DEST_PHYSICAL 0x00000 +#define APIC_DM_FIXED 0x00000 +#define APIC_DM_FIXED_MASK 0x00700 +#define APIC_DM_LOWEST 0x00100 +#define APIC_DM_SMI 0x00200 +#define APIC_DM_NMI 0x00400 +#define APIC_DM_INIT 0x00500 +#define APIC_DM_STARTUP 0x00600 +#define APIC_DM_EXTINT 0x00700 +#define APIC_VECTOR_MASK 0x000FF +#define APIC_ICR2 0x310 +#define GET_APIC_DEST_FIELD(x) (((x) >> 24) & 0xFF) +#define SET_APIC_DEST_FIELD(x) ((x) << 24) +#define APIC_LVTT 0x320 +#define APIC_LVTTHMR 0x330 +#define APIC_LVTPC 0x340 +#define APIC_LVT0 0x350 +#define APIC_LVT_TIMER_BASE_MASK (0x3 << 18) +#define GET_APIC_TIMER_BASE(x) (((x) >> 18) & 0x3) +#define SET_APIC_TIMER_BASE(x) (((x) << 18)) +#define APIC_TIMER_BASE_CLKIN 0x0 +#define APIC_TIMER_BASE_TMBASE 0x1 +#define APIC_TIMER_BASE_DIV 0x2 +#define APIC_LVT_TIMER_ONESHOT (0 << 17) +#define APIC_LVT_TIMER_PERIODIC (1 << 17) +#define APIC_LVT_TIMER_TSCDEADLINE (2 << 17) +#define APIC_LVT_MASKED (1 << 16) +#define APIC_LVT_LEVEL_TRIGGER (1 << 15) +#define APIC_LVT_REMOTE_IRR (1 << 14) +#define APIC_INPUT_POLARITY (1 << 13) +#define APIC_SEND_PENDING (1 << 12) +#define APIC_MODE_MASK 0x700 +#define GET_APIC_DELIVERY_MODE(x) (((x) >> 8) & 0x7) +#define SET_APIC_DELIVERY_MODE(x, y) (((x) & ~0x700) | ((y) << 8)) +#define APIC_MODE_FIXED 0x0 +#define APIC_MODE_NMI 0x4 +#define APIC_MODE_EXTINT 0x7 +#define APIC_LVT1 0x360 +#define APIC_LVTERR 0x370 +#define APIC_TMICT 0x380 +#define APIC_TMCCT 0x390 +#define APIC_TDCR 0x3E0 +#define APIC_SELF_IPI 0x3F0 +#define APIC_TDR_DIV_TMBASE (1 << 2) +#define APIC_TDR_DIV_1 0xB +#define APIC_TDR_DIV_2 0x0 +#define APIC_TDR_DIV_4 0x1 +#define APIC_TDR_DIV_8 0x2 +#define APIC_TDR_DIV_16 0x3 +#define APIC_TDR_DIV_32 0x8 +#define APIC_TDR_DIV_64 0x9 +#define APIC_TDR_DIV_128 0xA +#define APIC_EFEAT 0x400 +#define APIC_ECTRL 0x410 +#define APIC_EILVTn(n) (0x500 + 0x10 * n) +#define APIC_EILVT_NR_AMD_K8 1 /* # of extended interrupts */ +#define APIC_EILVT_NR_AMD_10H 4 +#define APIC_EILVT_NR_MAX APIC_EILVT_NR_AMD_10H +#define APIC_EILVT_LVTOFF(x) (((x) >> 4) & 0xF) +#define APIC_EILVT_MSG_FIX 0x0 +#define APIC_EILVT_MSG_SMI 0x2 +#define APIC_EILVT_MSG_NMI 0x4 +#define APIC_EILVT_MSG_EXT 0x7 +#define APIC_EILVT_MASKED (1 << 16) + +#define APIC_BASE (fix_to_virt(FIX_APIC_BASE)) +#define APIC_BASE_MSR 0x800 +#define XAPIC_ENABLE (1ULL << 11) +#define X2APIC_ENABLE (1ULL << 10) + +#ifdef CONFIG_X86_32 +# define MAX_IO_APICS 64 +# define MAX_LOCAL_APIC 256 +#else +# define MAX_IO_APICS 128 +# define MAX_LOCAL_APIC 32768 +#endif + +/* + * All x86-64 systems are xAPIC compatible. + * In the following, "apicid" is a physical APIC ID. 
+ */ +#define XAPIC_DEST_CPUS_SHIFT 4 +#define XAPIC_DEST_CPUS_MASK ((1u << XAPIC_DEST_CPUS_SHIFT) - 1) +#define XAPIC_DEST_CLUSTER_MASK (XAPIC_DEST_CPUS_MASK << XAPIC_DEST_CPUS_SHIFT) +#define APIC_CLUSTER(apicid) ((apicid) & XAPIC_DEST_CLUSTER_MASK) +#define APIC_CLUSTERID(apicid) (APIC_CLUSTER(apicid) >> XAPIC_DEST_CPUS_SHIFT) +#define APIC_CPUID(apicid) ((apicid) & XAPIC_DEST_CPUS_MASK) +#define NUM_APIC_CLUSTERS ((BAD_APICID + 1) >> XAPIC_DEST_CPUS_SHIFT) + +/* + * the local APIC register structure, memory mapped. Not terribly well + * tested, but we might eventually use this one in the future - the + * problem why we cannot use it right now is the P5 APIC, it has an + * errata which cannot take 8-bit reads and writes, only 32-bit ones ... + */ +#define u32 unsigned int + +#pragma pack(push, 1) +// It seems Windows SDK/WDK defines __reserved which causes conflict here +#undef __reserved +struct local_apic { + + /*000*/ struct { u32 __reserved[4]; } __reserved_01; + + /*010*/ struct { u32 __reserved[4]; } __reserved_02; + + /*020*/ struct { /* APIC ID Register */ + u32 __reserved_1 : 24, + phys_apic_id : 4, + __reserved_2 : 4; + u32 __reserved[3]; + } id; + + /*030*/ const + struct { /* APIC Version Register */ + u32 version : 8, + __reserved_1 : 8, + max_lvt : 8, + __reserved_2 : 8; + u32 __reserved[3]; + } version; + + /*040*/ struct { u32 __reserved[4]; } __reserved_03; + + /*050*/ struct { u32 __reserved[4]; } __reserved_04; + + /*060*/ struct { u32 __reserved[4]; } __reserved_05; + + /*070*/ struct { u32 __reserved[4]; } __reserved_06; + + /*080*/ struct { /* Task Priority Register */ + u32 priority : 8, + __reserved_1 : 24; + u32 __reserved_2[3]; + } tpr; + + /*090*/ const + struct { /* Arbitration Priority Register */ + u32 priority : 8, + __reserved_1 : 24; + u32 __reserved_2[3]; + } apr; + + /*0A0*/ const + struct { /* Processor Priority Register */ + u32 priority : 8, + __reserved_1 : 24; + u32 __reserved_2[3]; + } ppr; + + /*0B0*/ struct { /* End Of Interrupt Register */ + u32 eoi; + u32 __reserved[3]; + } eoi; + + /*0C0*/ struct { u32 __reserved[4]; } __reserved_07; + + /*0D0*/ struct { /* Logical Destination Register */ + u32 __reserved_1 : 24, + logical_dest : 8; + u32 __reserved_2[3]; + } ldr; + + /*0E0*/ struct { /* Destination Format Register */ + u32 __reserved_1 : 28, + model : 4; + u32 __reserved_2[3]; + } dfr; + + /*0F0*/ struct { /* Spurious Interrupt Vector Register */ + u32 spurious_vector : 8, + apic_enabled : 1, + focus_cpu : 1, + __reserved_2 : 22; + u32 __reserved_3[3]; + } svr; + + /*100*/ struct { /* In Service Register */ + /*170*/ u32 bitfield; + u32 __reserved[3]; + } isr[8]; + + /*180*/ struct { /* Trigger Mode Register */ + /*1F0*/ u32 bitfield; + u32 __reserved[3]; + } tmr[8]; + + /*200*/ struct { /* Interrupt Request Register */ + /*270*/ u32 bitfield; + u32 __reserved[3]; + } irr[8]; + + /*280*/ union { /* Error Status Register */ + struct { + u32 send_cs_error : 1, + receive_cs_error : 1, + send_accept_error : 1, + receive_accept_error : 1, + __reserved_1 : 1, + send_illegal_vector : 1, + receive_illegal_vector : 1, + illegal_register_address : 1, + __reserved_2 : 24; + u32 __reserved_3[3]; + } error_bits; + struct { + u32 errors; + u32 __reserved_3[3]; + } all_errors; + } esr; + + /*290*/ struct { u32 __reserved[4]; } __reserved_08; + + /*2A0*/ struct { u32 __reserved[4]; } __reserved_09; + + /*2B0*/ struct { u32 __reserved[4]; } __reserved_10; + + /*2C0*/ struct { u32 __reserved[4]; } __reserved_11; + + /*2D0*/ struct { u32 __reserved[4]; } 
__reserved_12; + + /*2E0*/ struct { u32 __reserved[4]; } __reserved_13; + + /*2F0*/ struct { u32 __reserved[4]; } __reserved_14; + + /*300*/ struct { /* Interrupt Command Register 1 */ + u32 vector : 8, + delivery_mode : 3, + destination_mode : 1, + delivery_status : 1, + __reserved_1 : 1, + level : 1, + trigger : 1, + __reserved_2 : 2, + shorthand : 2, + __reserved_3 : 12; + u32 __reserved_4[3]; + } icr1; + + /*310*/ struct { /* Interrupt Command Register 2 */ + union { + u32 __reserved_1 : 24, + phys_dest : 4, + __reserved_2 : 4; + u32 __reserved_3 : 24, + logical_dest : 8; + } dest; + u32 __reserved_4[3]; + } icr2; + + /*320*/ struct { /* LVT - Timer */ + u32 vector : 8, + __reserved_1 : 4, + delivery_status : 1, + __reserved_2 : 3, + mask : 1, + timer_mode : 1, + __reserved_3 : 14; + u32 __reserved_4[3]; + } lvt_timer; + + /*330*/ struct { /* LVT - Thermal Sensor */ + u32 vector : 8, + delivery_mode : 3, + __reserved_1 : 1, + delivery_status : 1, + __reserved_2 : 3, + mask : 1, + __reserved_3 : 15; + u32 __reserved_4[3]; + } lvt_thermal; + + /*340*/ struct { /* LVT - Performance Counter */ + u32 vector : 8, + delivery_mode : 3, + __reserved_1 : 1, + delivery_status : 1, + __reserved_2 : 3, + mask : 1, + __reserved_3 : 15; + u32 __reserved_4[3]; + } lvt_pc; + + /*350*/ struct { /* LVT - LINT0 */ + u32 vector : 8, + delivery_mode : 3, + __reserved_1 : 1, + delivery_status : 1, + polarity : 1, + remote_irr : 1, + trigger : 1, + mask : 1, + __reserved_2 : 15; + u32 __reserved_3[3]; + } lvt_lint0; + + /*360*/ struct { /* LVT - LINT1 */ + u32 vector : 8, + delivery_mode : 3, + __reserved_1 : 1, + delivery_status : 1, + polarity : 1, + remote_irr : 1, + trigger : 1, + mask : 1, + __reserved_2 : 15; + u32 __reserved_3[3]; + } lvt_lint1; + + /*370*/ struct { /* LVT - Error */ + u32 vector : 8, + __reserved_1 : 4, + delivery_status : 1, + __reserved_2 : 3, + mask : 1, + __reserved_3 : 15; + u32 __reserved_4[3]; + } lvt_error; + + /*380*/ struct { /* Timer Initial Count Register */ + u32 initial_count; + u32 __reserved_2[3]; + } timer_icr; + + /*390*/ const + struct { /* Timer Current Count Register */ + u32 curr_count; + u32 __reserved_2[3]; + } timer_ccr; + + /*3A0*/ struct { u32 __reserved[4]; } __reserved_16; + + /*3B0*/ struct { u32 __reserved[4]; } __reserved_17; + + /*3C0*/ struct { u32 __reserved[4]; } __reserved_18; + + /*3D0*/ struct { u32 __reserved[4]; } __reserved_19; + + /*3E0*/ struct { /* Timer Divide Configuration Register */ + u32 divisor : 4, + __reserved_1 : 28; + u32 __reserved_2[3]; + } timer_dcr; + + /*3F0*/ struct { u32 __reserved[4]; } __reserved_20; + +}; +#pragma pack(pop) + +#undef u32 + +#ifdef CONFIG_X86_32 + #define BAD_APICID 0xFFu +#else + #define BAD_APICID 0xFFFFu +#endif + +enum ioapic_irq_destination_types { + dest_Fixed = 0, + dest_LowestPrio = 1, + dest_SMI = 2, + dest__reserved_1 = 3, + dest_NMI = 4, + dest_INIT = 5, + dest__reserved_2 = 6, + dest_ExtINT = 7 +}; + +#endif /* _ASM_X86_APICDEF_H */ diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h new file mode 100755 index 0000000..7cc6ac6 --- /dev/null +++ b/arch/x86/include/asm/cpufeatures.h @@ -0,0 +1,310 @@ +/* + * Copyright 2019 Google LLC + */ + +#ifndef _ASM_X86_CPUFEATURES_H +#define _ASM_X86_CPUFEATURES_H + +/* + * Defines x86 CPU feature bits + */ +#define NCAPINTS 18 /* N 32-bit words worth of info */ +#define NBUGINTS 1 /* N 32-bit bug flags */ + +/* + * Note: If the comment begins with a quoted string, that string is used + * in /proc/cpuinfo instead of the 
macro name. If the string is "", + * this feature bit is not displayed in /proc/cpuinfo at all. + */ + +/* Intel-defined CPU features, CPUID level 0x00000001 (edx), word 0 */ +#define X86_FEATURE_FPU ( 0*32+ 0) /* Onboard FPU */ +#define X86_FEATURE_VME ( 0*32+ 1) /* Virtual Mode Extensions */ +#define X86_FEATURE_DE ( 0*32+ 2) /* Debugging Extensions */ +#define X86_FEATURE_PSE ( 0*32+ 3) /* Page Size Extensions */ +#define X86_FEATURE_TSC ( 0*32+ 4) /* Time Stamp Counter */ +#define X86_FEATURE_MSR ( 0*32+ 5) /* Model-Specific Registers */ +#define X86_FEATURE_PAE ( 0*32+ 6) /* Physical Address Extensions */ +#define X86_FEATURE_MCE ( 0*32+ 7) /* Machine Check Exception */ +#define X86_FEATURE_CX8 ( 0*32+ 8) /* CMPXCHG8 instruction */ +#define X86_FEATURE_APIC ( 0*32+ 9) /* Onboard APIC */ +#define X86_FEATURE_SEP ( 0*32+11) /* SYSENTER/SYSEXIT */ +#define X86_FEATURE_MTRR ( 0*32+12) /* Memory Type Range Registers */ +#define X86_FEATURE_PGE ( 0*32+13) /* Page Global Enable */ +#define X86_FEATURE_MCA ( 0*32+14) /* Machine Check Architecture */ +#define X86_FEATURE_CMOV ( 0*32+15) /* CMOV instructions */ + /* (plus FCMOVcc, FCOMI with FPU) */ +#define X86_FEATURE_PAT ( 0*32+16) /* Page Attribute Table */ +#define X86_FEATURE_PSE36 ( 0*32+17) /* 36-bit PSEs */ +#define X86_FEATURE_PN ( 0*32+18) /* Processor serial number */ +#define X86_FEATURE_CLFLUSH ( 0*32+19) /* CLFLUSH instruction */ +#define X86_FEATURE_DS ( 0*32+21) /* "dts" Debug Store */ +#define X86_FEATURE_ACPI ( 0*32+22) /* ACPI via MSR */ +#define X86_FEATURE_MMX ( 0*32+23) /* Multimedia Extensions */ +#define X86_FEATURE_FXSR ( 0*32+24) /* FXSAVE/FXRSTOR, CR4.OSFXSR */ +#define X86_FEATURE_XMM ( 0*32+25) /* "sse" */ +#define X86_FEATURE_XMM2 ( 0*32+26) /* "sse2" */ +#define X86_FEATURE_SELFSNOOP ( 0*32+27) /* "ss" CPU self snoop */ +#define X86_FEATURE_HT ( 0*32+28) /* Hyper-Threading */ +#define X86_FEATURE_ACC ( 0*32+29) /* "tm" Automatic clock control */ +#define X86_FEATURE_IA64 ( 0*32+30) /* IA-64 processor */ +#define X86_FEATURE_PBE ( 0*32+31) /* Pending Break Enable */ + +/* AMD-defined CPU features, CPUID level 0x80000001, word 1 */ +/* Don't duplicate feature flags which are redundant with Intel! */ +#define X86_FEATURE_SYSCALL ( 1*32+11) /* SYSCALL/SYSRET */ +#define X86_FEATURE_MP ( 1*32+19) /* MP Capable. */ +#define X86_FEATURE_NX ( 1*32+20) /* Execute Disable */ +#define X86_FEATURE_MMXEXT ( 1*32+22) /* AMD MMX extensions */ +#define X86_FEATURE_FXSR_OPT ( 1*32+25) /* FXSAVE/FXRSTOR optimizations */ +#define X86_FEATURE_GBPAGES ( 1*32+26) /* "pdpe1gb" GB pages */ +#define X86_FEATURE_RDTSCP ( 1*32+27) /* RDTSCP */ +#define X86_FEATURE_LM ( 1*32+29) /* Long Mode (x86-64) */ +#define X86_FEATURE_3DNOWEXT ( 1*32+30) /* AMD 3DNow! extensions */ +#define X86_FEATURE_3DNOW ( 1*32+31) /* 3DNow! 
*/ + +/* Transmeta-defined CPU features, CPUID level 0x80860001, word 2 */ +#define X86_FEATURE_RECOVERY ( 2*32+ 0) /* CPU in recovery mode */ +#define X86_FEATURE_LONGRUN ( 2*32+ 1) /* Longrun power control */ +#define X86_FEATURE_LRTI ( 2*32+ 3) /* LongRun table interface */ + +/* Other features, Linux-defined mapping, word 3 */ +/* This range is used for feature bits which conflict or are synthesized */ +#define X86_FEATURE_CXMMX ( 3*32+ 0) /* Cyrix MMX extensions */ +#define X86_FEATURE_K6_MTRR ( 3*32+ 1) /* AMD K6 nonstandard MTRRs */ +#define X86_FEATURE_CYRIX_ARR ( 3*32+ 2) /* Cyrix ARRs (= MTRRs) */ +#define X86_FEATURE_CENTAUR_MCR ( 3*32+ 3) /* Centaur MCRs (= MTRRs) */ +/* cpu types for specific tunings: */ +#define X86_FEATURE_K8 ( 3*32+ 4) /* "" Opteron, Athlon64 */ +#define X86_FEATURE_K7 ( 3*32+ 5) /* "" Athlon */ +#define X86_FEATURE_P3 ( 3*32+ 6) /* "" P3 */ +#define X86_FEATURE_P4 ( 3*32+ 7) /* "" P4 */ +#define X86_FEATURE_CONSTANT_TSC ( 3*32+ 8) /* TSC ticks at a constant rate */ +#define X86_FEATURE_UP ( 3*32+ 9) /* smp kernel running on up */ +#define X86_FEATURE_ART ( 3*32+10) /* Platform has always running timer (ART) */ +#define X86_FEATURE_ARCH_PERFMON ( 3*32+11) /* Intel Architectural PerfMon */ +#define X86_FEATURE_PEBS ( 3*32+12) /* Precise-Event Based Sampling */ +#define X86_FEATURE_BTS ( 3*32+13) /* Branch Trace Store */ +#define X86_FEATURE_SYSCALL32 ( 3*32+14) /* "" syscall in ia32 userspace */ +#define X86_FEATURE_SYSENTER32 ( 3*32+15) /* "" sysenter in ia32 userspace */ +#define X86_FEATURE_REP_GOOD ( 3*32+16) /* rep microcode works well */ +#define X86_FEATURE_MFENCE_RDTSC ( 3*32+17) /* "" Mfence synchronizes RDTSC */ +#define X86_FEATURE_LFENCE_RDTSC ( 3*32+18) /* "" Lfence synchronizes RDTSC */ +#define X86_FEATURE_ACC_POWER ( 3*32+19) /* AMD Accumulated Power Mechanism */ +#define X86_FEATURE_NOPL ( 3*32+20) /* The NOPL (0F 1F) instructions */ +#define X86_FEATURE_ALWAYS ( 3*32+21) /* "" Always-present feature */ +#define X86_FEATURE_XTOPOLOGY ( 3*32+22) /* cpu topology enum extensions */ +#define X86_FEATURE_TSC_RELIABLE ( 3*32+23) /* TSC is known to be reliable */ +#define X86_FEATURE_NONSTOP_TSC ( 3*32+24) /* TSC does not stop in C states */ +/* free, was #define X86_FEATURE_CLFLUSH_MONITOR ( 3*32+25) * "" clflush reqd with monitor */ +#define X86_FEATURE_EXTD_APICID ( 3*32+26) /* has extended APICID (8 bits) */ +#define X86_FEATURE_AMD_DCM ( 3*32+27) /* multi-node processor */ +#define X86_FEATURE_APERFMPERF ( 3*32+28) /* APERFMPERF */ +#define X86_FEATURE_EAGER_FPU ( 3*32+29) /* "eagerfpu" Non lazy FPU restore */ +#define X86_FEATURE_NONSTOP_TSC_S3 ( 3*32+30) /* TSC doesn't stop in S3 state */ + +/* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */ +#define X86_FEATURE_XMM3 ( 4*32+ 0) /* "pni" SSE-3 */ +#define X86_FEATURE_PCLMULQDQ ( 4*32+ 1) /* PCLMULQDQ instruction */ +#define X86_FEATURE_DTES64 ( 4*32+ 2) /* 64-bit Debug Store */ +#define X86_FEATURE_MWAIT ( 4*32+ 3) /* "monitor" Monitor/Mwait support */ +#define X86_FEATURE_DSCPL ( 4*32+ 4) /* "ds_cpl" CPL Qual. 
Debug Store */ +#define X86_FEATURE_VMX ( 4*32+ 5) /* Hardware virtualization */ +#define X86_FEATURE_SMX ( 4*32+ 6) /* Safer mode */ +#define X86_FEATURE_EST ( 4*32+ 7) /* Enhanced SpeedStep */ +#define X86_FEATURE_TM2 ( 4*32+ 8) /* Thermal Monitor 2 */ +#define X86_FEATURE_SSSE3 ( 4*32+ 9) /* Supplemental SSE-3 */ +#define X86_FEATURE_CID ( 4*32+10) /* Context ID */ +#define X86_FEATURE_SDBG ( 4*32+11) /* Silicon Debug */ +#define X86_FEATURE_FMA ( 4*32+12) /* Fused multiply-add */ +#define X86_FEATURE_CX16 ( 4*32+13) /* CMPXCHG16B */ +#define X86_FEATURE_XTPR ( 4*32+14) /* Send Task Priority Messages */ +#define X86_FEATURE_PDCM ( 4*32+15) /* Performance Capabilities */ +#define X86_FEATURE_PCID ( 4*32+17) /* Process Context Identifiers */ +#define X86_FEATURE_DCA ( 4*32+18) /* Direct Cache Access */ +#define X86_FEATURE_XMM4_1 ( 4*32+19) /* "sse4_1" SSE-4.1 */ +#define X86_FEATURE_XMM4_2 ( 4*32+20) /* "sse4_2" SSE-4.2 */ +#define X86_FEATURE_X2APIC ( 4*32+21) /* x2APIC */ +#define X86_FEATURE_MOVBE ( 4*32+22) /* MOVBE instruction */ +#define X86_FEATURE_POPCNT ( 4*32+23) /* POPCNT instruction */ +#define X86_FEATURE_TSC_DEADLINE_TIMER ( 4*32+24) /* Tsc deadline timer */ +#define X86_FEATURE_AES ( 4*32+25) /* AES instructions */ +#define X86_FEATURE_XSAVE ( 4*32+26) /* XSAVE/XRSTOR/XSETBV/XGETBV */ +#define X86_FEATURE_OSXSAVE ( 4*32+27) /* "" XSAVE enabled in the OS */ +#define X86_FEATURE_AVX ( 4*32+28) /* Advanced Vector Extensions */ +#define X86_FEATURE_F16C ( 4*32+29) /* 16-bit fp conversions */ +#define X86_FEATURE_RDRAND ( 4*32+30) /* The RDRAND instruction */ +#define X86_FEATURE_HYPERVISOR ( 4*32+31) /* Running on a hypervisor */ + +/* VIA/Cyrix/Centaur-defined CPU features, CPUID level 0xC0000001, word 5 */ +#define X86_FEATURE_XSTORE ( 5*32+ 2) /* "rng" RNG present (xstore) */ +#define X86_FEATURE_XSTORE_EN ( 5*32+ 3) /* "rng_en" RNG enabled */ +#define X86_FEATURE_XCRYPT ( 5*32+ 6) /* "ace" on-CPU crypto (xcrypt) */ +#define X86_FEATURE_XCRYPT_EN ( 5*32+ 7) /* "ace_en" on-CPU crypto enabled */ +#define X86_FEATURE_ACE2 ( 5*32+ 8) /* Advanced Cryptography Engine v2 */ +#define X86_FEATURE_ACE2_EN ( 5*32+ 9) /* ACE v2 enabled */ +#define X86_FEATURE_PHE ( 5*32+10) /* PadLock Hash Engine */ +#define X86_FEATURE_PHE_EN ( 5*32+11) /* PHE enabled */ +#define X86_FEATURE_PMM ( 5*32+12) /* PadLock Montgomery Multiplier */ +#define X86_FEATURE_PMM_EN ( 5*32+13) /* PMM enabled */ + +/* More extended AMD flags: CPUID level 0x80000001, ecx, word 6 */ +#define X86_FEATURE_LAHF_LM ( 6*32+ 0) /* LAHF/SAHF in long mode */ +#define X86_FEATURE_CMP_LEGACY ( 6*32+ 1) /* If yes HyperThreading not valid */ +#define X86_FEATURE_SVM ( 6*32+ 2) /* Secure virtual machine */ +#define X86_FEATURE_EXTAPIC ( 6*32+ 3) /* Extended APIC space */ +#define X86_FEATURE_CR8_LEGACY ( 6*32+ 4) /* CR8 in 32-bit mode */ +#define X86_FEATURE_ABM ( 6*32+ 5) /* Advanced bit manipulation */ +#define X86_FEATURE_SSE4A ( 6*32+ 6) /* SSE-4A */ +#define X86_FEATURE_MISALIGNSSE ( 6*32+ 7) /* Misaligned SSE mode */ +#define X86_FEATURE_3DNOWPREFETCH ( 6*32+ 8) /* 3DNow prefetch instructions */ +#define X86_FEATURE_OSVW ( 6*32+ 9) /* OS Visible Workaround */ +#define X86_FEATURE_IBS ( 6*32+10) /* Instruction Based Sampling */ +#define X86_FEATURE_XOP ( 6*32+11) /* extended AVX instructions */ +#define X86_FEATURE_SKINIT ( 6*32+12) /* SKINIT/STGI instructions */ +#define X86_FEATURE_WDT ( 6*32+13) /* Watchdog timer */ +#define X86_FEATURE_LWP ( 6*32+15) /* Light Weight Profiling */ +#define X86_FEATURE_FMA4 ( 6*32+16) /* 4 
operands MAC instructions */ +#define X86_FEATURE_TCE ( 6*32+17) /* translation cache extension */ +#define X86_FEATURE_NODEID_MSR ( 6*32+19) /* NodeId MSR */ +#define X86_FEATURE_TBM ( 6*32+21) /* trailing bit manipulations */ +#define X86_FEATURE_TOPOEXT ( 6*32+22) /* topology extensions CPUID leafs */ +#define X86_FEATURE_PERFCTR_CORE ( 6*32+23) /* core performance counter extensions */ +#define X86_FEATURE_PERFCTR_NB ( 6*32+24) /* NB performance counter extensions */ +#define X86_FEATURE_BPEXT (6*32+26) /* data breakpoint extension */ +#define X86_FEATURE_PTSC ( 6*32+27) /* performance time-stamp counter */ +#define X86_FEATURE_PERFCTR_L2 ( 6*32+28) /* L2 performance counter extensions */ +#define X86_FEATURE_MWAITX ( 6*32+29) /* MWAIT extension (MONITORX/MWAITX) */ + +/* + * Auxiliary flags: Linux defined - For features scattered in various + * CPUID levels like 0x6, 0xA etc, word 7. + * + * Reuse free bits when adding new feature flags! + */ + +#define X86_FEATURE_CPB ( 7*32+ 2) /* AMD Core Performance Boost */ +#define X86_FEATURE_EPB ( 7*32+ 3) /* IA32_ENERGY_PERF_BIAS support */ + +#define X86_FEATURE_HW_PSTATE ( 7*32+ 8) /* AMD HW-PState */ +#define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */ + +#define X86_FEATURE_INTEL_PT ( 7*32+15) /* Intel Processor Trace */ +#define X86_FEATURE_AVX512_4VNNIW (7*32+16) /* AVX-512 Neural Network Instructions */ +#define X86_FEATURE_AVX512_4FMAPS (7*32+17) /* AVX-512 Multiply Accumulation Single precision */ + +/* Virtualization flags: Linux defined, word 8 */ +#define X86_FEATURE_TPR_SHADOW ( 8*32+ 0) /* Intel TPR Shadow */ +#define X86_FEATURE_VNMI ( 8*32+ 1) /* Intel Virtual NMI */ +#define X86_FEATURE_FLEXPRIORITY ( 8*32+ 2) /* Intel FlexPriority */ +#define X86_FEATURE_EPT ( 8*32+ 3) /* Intel Extended Page Table */ +#define X86_FEATURE_VPID ( 8*32+ 4) /* Intel Virtual Processor ID */ + +#define X86_FEATURE_VMMCALL ( 8*32+15) /* Prefer vmmcall to vmcall */ +#define X86_FEATURE_XENPV ( 8*32+16) /* "" Xen paravirtual guest */ + + +/* Intel-defined CPU features, CPUID level 0x00000007:0 (ebx), word 9 */ +#define X86_FEATURE_FSGSBASE ( 9*32+ 0) /* {RD/WR}{FS/GS}BASE instructions*/ +#define X86_FEATURE_TSC_ADJUST ( 9*32+ 1) /* TSC adjustment MSR 0x3b */ +#define X86_FEATURE_BMI1 ( 9*32+ 3) /* 1st group bit manipulation extensions */ +#define X86_FEATURE_HLE ( 9*32+ 4) /* Hardware Lock Elision */ +#define X86_FEATURE_AVX2 ( 9*32+ 5) /* AVX2 instructions */ +#define X86_FEATURE_SMEP ( 9*32+ 7) /* Supervisor Mode Execution Protection */ +#define X86_FEATURE_BMI2 ( 9*32+ 8) /* 2nd group bit manipulation extensions */ +#define X86_FEATURE_ERMS ( 9*32+ 9) /* Enhanced REP MOVSB/STOSB */ +#define X86_FEATURE_INVPCID ( 9*32+10) /* Invalidate Processor Context ID */ +#define X86_FEATURE_RTM ( 9*32+11) /* Restricted Transactional Memory */ +#define X86_FEATURE_CQM ( 9*32+12) /* Cache QoS Monitoring */ +#define X86_FEATURE_MPX ( 9*32+14) /* Memory Protection Extension */ +#define X86_FEATURE_AVX512F ( 9*32+16) /* AVX-512 Foundation */ +#define X86_FEATURE_AVX512DQ ( 9*32+17) /* AVX-512 DQ (Double/Quad granular) Instructions */ +#define X86_FEATURE_RDSEED ( 9*32+18) /* The RDSEED instruction */ +#define X86_FEATURE_ADX ( 9*32+19) /* The ADCX and ADOX instructions */ +#define X86_FEATURE_SMAP ( 9*32+20) /* Supervisor Mode Access Prevention */ +#define X86_FEATURE_CLFLUSHOPT ( 9*32+23) /* CLFLUSHOPT instruction */ +#define X86_FEATURE_CLWB ( 9*32+24) /* CLWB instruction */ +#define X86_FEATURE_AVX512PF ( 9*32+26) /* AVX-512 Prefetch */ 
+#define X86_FEATURE_AVX512ER ( 9*32+27) /* AVX-512 Exponential and Reciprocal */ +#define X86_FEATURE_AVX512CD ( 9*32+28) /* AVX-512 Conflict Detection */ +#define X86_FEATURE_SHA_NI ( 9*32+29) /* SHA1/SHA256 Instruction Extensions */ +#define X86_FEATURE_AVX512BW ( 9*32+30) /* AVX-512 BW (Byte/Word granular) Instructions */ +#define X86_FEATURE_AVX512VL ( 9*32+31) /* AVX-512 VL (128/256 Vector Length) Extensions */ + +/* Extended state features, CPUID level 0x0000000d:1 (eax), word 10 */ +#define X86_FEATURE_XSAVEOPT (10*32+ 0) /* XSAVEOPT */ +#define X86_FEATURE_XSAVEC (10*32+ 1) /* XSAVEC */ +#define X86_FEATURE_XGETBV1 (10*32+ 2) /* XGETBV with ECX = 1 */ +#define X86_FEATURE_XSAVES (10*32+ 3) /* XSAVES/XRSTORS */ + +/* Intel-defined CPU QoS Sub-leaf, CPUID level 0x0000000F:0 (edx), word 11 */ +#define X86_FEATURE_CQM_LLC (11*32+ 1) /* LLC QoS if 1 */ + +/* Intel-defined CPU QoS Sub-leaf, CPUID level 0x0000000F:1 (edx), word 12 */ +#define X86_FEATURE_CQM_OCCUP_LLC (12*32+ 0) /* LLC occupancy monitoring if 1 */ +#define X86_FEATURE_CQM_MBM_TOTAL (12*32+ 1) /* LLC Total MBM monitoring */ +#define X86_FEATURE_CQM_MBM_LOCAL (12*32+ 2) /* LLC Local MBM monitoring */ + +/* AMD-defined CPU features, CPUID level 0x80000008 (ebx), word 13 */ +#define X86_FEATURE_CLZERO (13*32+0) /* CLZERO instruction */ +#define X86_FEATURE_IRPERF (13*32+1) /* Instructions Retired Count */ + +/* Thermal and Power Management Leaf, CPUID level 0x00000006 (eax), word 14 */ +#define X86_FEATURE_DTHERM (14*32+ 0) /* Digital Thermal Sensor */ +#define X86_FEATURE_IDA (14*32+ 1) /* Intel Dynamic Acceleration */ +#define X86_FEATURE_ARAT (14*32+ 2) /* Always Running APIC Timer */ +#define X86_FEATURE_PLN (14*32+ 4) /* Intel Power Limit Notification */ +#define X86_FEATURE_PTS (14*32+ 6) /* Intel Package Thermal Status */ +#define X86_FEATURE_HWP (14*32+ 7) /* Intel Hardware P-states */ +#define X86_FEATURE_HWP_NOTIFY (14*32+ 8) /* HWP Notification */ +#define X86_FEATURE_HWP_ACT_WINDOW (14*32+ 9) /* HWP Activity Window */ +#define X86_FEATURE_HWP_EPP (14*32+10) /* HWP Energy Perf. 
Preference */ +#define X86_FEATURE_HWP_PKG_REQ (14*32+11) /* HWP Package Level Request */ + +/* AMD SVM Feature Identification, CPUID level 0x8000000a (edx), word 15 */ +#define X86_FEATURE_NPT (15*32+ 0) /* Nested Page Table support */ +#define X86_FEATURE_LBRV (15*32+ 1) /* LBR Virtualization support */ +#define X86_FEATURE_SVML (15*32+ 2) /* "svm_lock" SVM locking MSR */ +#define X86_FEATURE_NRIPS (15*32+ 3) /* "nrip_save" SVM next_rip save */ +#define X86_FEATURE_TSCRATEMSR (15*32+ 4) /* "tsc_scale" TSC scaling support */ +#define X86_FEATURE_VMCBCLEAN (15*32+ 5) /* "vmcb_clean" VMCB clean bits support */ +#define X86_FEATURE_FLUSHBYASID (15*32+ 6) /* flush-by-ASID support */ +#define X86_FEATURE_DECODEASSISTS (15*32+ 7) /* Decode Assists support */ +#define X86_FEATURE_PAUSEFILTER (15*32+10) /* filtered pause intercept */ +#define X86_FEATURE_PFTHRESHOLD (15*32+12) /* pause filter threshold */ +#define X86_FEATURE_AVIC (15*32+13) /* Virtual Interrupt Controller */ + +/* Intel-defined CPU features, CPUID level 0x00000007:0 (ecx), word 16 */ +#define X86_FEATURE_PKU (16*32+ 3) /* Protection Keys for Userspace */ +#define X86_FEATURE_OSPKE (16*32+ 4) /* OS Protection Keys Enable */ + +/* AMD-defined CPU features, CPUID level 0x80000007 (ebx), word 17 */ +#define X86_FEATURE_OVERFLOW_RECOV (17*32+0) /* MCA overflow recovery support */ +#define X86_FEATURE_SUCCOR (17*32+1) /* Uncorrectable error containment and recovery */ +#define X86_FEATURE_SMCA (17*32+3) /* Scalable MCA */ + +/* + * BUG word(s) + */ +#define X86_BUG(x) (NCAPINTS*32 + (x)) + +#define X86_BUG_F00F X86_BUG(0) /* Intel F00F */ +#define X86_BUG_FDIV X86_BUG(1) /* FPU FDIV */ +#define X86_BUG_COMA X86_BUG(2) /* Cyrix 6x86 coma */ +#define X86_BUG_AMD_TLB_MMATCH X86_BUG(3) /* "tlb_mmatch" AMD Erratum 383 */ +#define X86_BUG_AMD_APIC_C1E X86_BUG(4) /* "apic_c1e" AMD Erratum 400 */ +#define X86_BUG_11AP X86_BUG(5) /* Bad local APIC aka 11AP */ +#define X86_BUG_FXSAVE_LEAK X86_BUG(6) /* FXSAVE leaks FOP/FIP/FOP */ +#define X86_BUG_CLFLUSH_MONITOR X86_BUG(7) /* AAI65, CLFLUSH required before MONITOR */ +#define X86_BUG_SYSRET_SS_ATTRS X86_BUG(8) /* SYSRET doesn't fix up SS attrs */ +#ifdef CONFIG_X86_32 +/* + * 64-bit kernels don't use X86_BUG_ESPFIX. Make the define conditional + * to avoid confusion. 
+ */ +#define X86_BUG_ESPFIX X86_BUG(9) /* "" IRET to 16-bit SS corrupts ESP/RSP high bits */ +#endif +#define X86_BUG_NULL_SEG X86_BUG(10) /* Nulling a selector preserves the base */ +#define X86_BUG_SWAPGS_FENCE X86_BUG(11) /* SWAPGS without input dep on GS */ +#define X86_BUG_MONITOR X86_BUG(12) /* IPI required to wake up remote CPU */ +#endif /* _ASM_X86_CPUFEATURES_H */ diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h new file mode 100755 index 0000000..55397e4 --- /dev/null +++ b/arch/x86/include/asm/fpu/types.h @@ -0,0 +1,283 @@ +/* + * Copyright 2019 Google LLC + */ + +/* + * FPU data structures: + */ +#ifndef _ASM_X86_FPU_H +#define _ASM_X86_FPU_H + +#include <gvm_types.h> + +/* + * The legacy x87 FPU state format, as saved by FSAVE and + * restored by the FRSTOR instructions: + */ +struct fregs_state { + u32 cwd; /* FPU Control Word */ + u32 swd; /* FPU Status Word */ + u32 twd; /* FPU Tag Word */ + u32 fip; /* FPU IP Offset */ + u32 fcs; /* FPU IP Selector */ + u32 foo; /* FPU Operand Pointer Offset */ + u32 fos; /* FPU Operand Pointer Selector */ + + /* 8*10 bytes for each FP-reg = 80 bytes: */ + u32 st_space[20]; + + /* Software status information [not touched by FSAVE]: */ + u32 status; +}; + +/* + * The legacy fx SSE/MMX FPU state format, as saved by FXSAVE and + * restored by the FXRSTOR instructions. It's similar to the FSAVE + * format, but differs in some areas, plus has extensions at + * the end for the XMM registers. + */ +__align(16) +struct fxregs_state { + u16 cwd; /* Control Word */ + u16 swd; /* Status Word */ + u16 twd; /* Tag Word */ + u16 fop; /* Last Instruction Opcode */ + union { + struct { + u64 rip; /* Instruction Pointer */ + u64 rdp; /* Data Pointer */ + }; + struct { + u32 fip; /* FPU IP Offset */ + u32 fcs; /* FPU IP Selector */ + u32 foo; /* FPU Operand Offset */ + u32 fos; /* FPU Operand Selector */ + }; + }; + u32 mxcsr; /* MXCSR Register State */ + u32 mxcsr_mask; /* MXCSR Mask */ + + /* 8*16 bytes for each FP-reg = 128 bytes: */ + u32 st_space[32]; + + /* 16*16 bytes for each XMM-reg = 256 bytes: */ + u32 xmm_space[64]; + + u32 padding[12]; + + union { + u32 padding1[12]; + u32 sw_reserved[12]; + }; + +}; + +/* Default value for fxregs_state.mxcsr: */ +#define MXCSR_DEFAULT 0x1f80 + +/* + * Software based FPU emulation state. This is arbitrary really, + * it matches the x87 format to make it easier to understand: + */ +struct swregs_state { + u32 cwd; + u32 swd; + u32 twd; + u32 fip; + u32 fcs; + u32 foo; + u32 fos; + /* 8*10 bytes for each FP-reg = 80 bytes: */ + u32 st_space[20]; + u8 ftop; + u8 changed; + u8 lookahead; + u8 no_update; + u8 rm; + u8 alimit; + struct math_emu_info *info; + u32 entry_eip; +}; + +/* + * List of XSAVE features Linux knows about: + */ +enum xfeature { + XFEATURE_FP, + XFEATURE_SSE, + /* + * Values above here are "legacy states". + * Those below are "extended states". 
+ */ + XFEATURE_YMM, + XFEATURE_BNDREGS, + XFEATURE_BNDCSR, + XFEATURE_OPMASK, + XFEATURE_ZMM_Hi256, + XFEATURE_Hi16_ZMM, + XFEATURE_PT_UNIMPLEMENTED_SO_FAR, + XFEATURE_PKRU, + + XFEATURE_MAX, +}; + +#define XFEATURE_MASK_FP (1 << XFEATURE_FP) +#define XFEATURE_MASK_SSE (1 << XFEATURE_SSE) +#define XFEATURE_MASK_YMM (1 << XFEATURE_YMM) +#define XFEATURE_MASK_BNDREGS (1 << XFEATURE_BNDREGS) +#define XFEATURE_MASK_BNDCSR (1 << XFEATURE_BNDCSR) +#define XFEATURE_MASK_OPMASK (1 << XFEATURE_OPMASK) +#define XFEATURE_MASK_ZMM_Hi256 (1 << XFEATURE_ZMM_Hi256) +#define XFEATURE_MASK_Hi16_ZMM (1 << XFEATURE_Hi16_ZMM) +#define XFEATURE_MASK_PT (1 << XFEATURE_PT_UNIMPLEMENTED_SO_FAR) +#define XFEATURE_MASK_PKRU (1 << XFEATURE_PKRU) + +#define XFEATURE_MASK_FPSSE (XFEATURE_MASK_FP | XFEATURE_MASK_SSE) +#define XFEATURE_MASK_AVX512 (XFEATURE_MASK_OPMASK \ + | XFEATURE_MASK_ZMM_Hi256 \ + | XFEATURE_MASK_Hi16_ZMM) + +#define FIRST_EXTENDED_XFEATURE XFEATURE_YMM + +struct reg_128_bit { + u8 regbytes[128/8]; +}; +struct reg_256_bit { + u8 regbytes[256/8]; +}; +struct reg_512_bit { + u8 regbytes[512/8]; +}; + +#undef __packed +#define __packed +#pragma pack(push, 1) +/* + * State component 2: + * + * There are 16x 256-bit AVX registers named YMM0-YMM15. + * The low 128 bits are aliased to the 16 SSE registers (XMM0-XMM15) + * and are stored in 'struct fxregs_state::xmm_space[]' in the + * "legacy" area. + * + * The high 128 bits are stored here. + */ +struct ymmh_struct { + struct reg_128_bit hi_ymm[16]; +} __packed; + +/* Intel MPX support: */ + +struct mpx_bndreg { + u64 lower_bound; + u64 upper_bound; +} __packed; +/* + * State component 3 is used for the 4 128-bit bounds registers + */ +struct mpx_bndreg_state { + struct mpx_bndreg bndreg[4]; +} __packed; + +/* + * State component 4 is used for the 64-bit user-mode MPX + * configuration register BNDCFGU and the 64-bit MPX status + * register BNDSTATUS. We call the pair "BNDCSR". + */ +struct mpx_bndcsr { + u64 bndcfgu; + u64 bndstatus; +} __packed; + +/* + * The BNDCSR state is padded out to be 64-bytes in size. + */ +struct mpx_bndcsr_state { + union { + struct mpx_bndcsr bndcsr; + u8 pad_to_64_bytes[64]; + }; +} __packed; + +/* AVX-512 Components: */ + +/* + * State component 5 is used for the 8 64-bit opmask registers + * k0-k7 (opmask state). + */ +struct avx_512_opmask_state { + u64 opmask_reg[8]; +} __packed; + +/* + * State component 6 is used for the upper 256 bits of the + * registers ZMM0-ZMM15. These 16 256-bit values are denoted + * ZMM0_H-ZMM15_H (ZMM_Hi256 state). + */ +struct avx_512_zmm_uppers_state { + struct reg_256_bit zmm_upper[16]; +} __packed; + +/* + * State component 7 is used for the 16 512-bit registers + * ZMM16-ZMM31 (Hi16_ZMM state). + */ +struct avx_512_hi16_state { + struct reg_512_bit hi16_zmm[16]; +} __packed; + +/* + * State component 9: 32-bit PKRU register. The state is + * 8 bytes long but only 4 bytes is used currently. + */ +struct pkru_state { + u32 pkru; + u32 pad; +} __packed; + +struct xstate_header { + u64 xfeatures; + u64 xcomp_bv; + u64 reserved[6]; +}; +#pragma pack(pop) + +/* + * xstate_header.xcomp_bv[63] indicates that the extended_state_area + * is in compacted format. + */ +#define XCOMP_BV_COMPACTED_FORMAT ((u64)1 << 63) + +/* + * This is our most modern FPU state format, as saved by the XSAVE + * and restored by the XRSTOR instructions. + * + * It consists of a legacy fxregs portion, an xstate header and + * subsequent areas as defined by the xstate header. 
Not all CPUs + * support all the extensions, so the size of the extended area + * can vary quite a bit between CPUs. + */ +#pragma pack(push, 16) +struct xregs_state { + struct fxregs_state i387; + struct xstate_header header; + u8 extended_state_area[0]; +}; +#pragma pack(pop) + +/* + * This is a union of all the possible FPU state formats + * put together, so that we can pick the right one runtime. + * + * The size of the structure is determined by the largest + * member - which is the xsave area. The padding is there + * to ensure that statically-allocated task_structs (just + * the init_task today) have enough space. + */ +union fpu_state { + struct fxregs_state fxsave; + struct xregs_state xsave; + u8 __padding[PAGE_SIZE]; +}; + +#endif /* _ASM_X86_FPU_H */ diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index e9cd7be..c195abb 100644..100755 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -1,3 +1,7 @@ +/* + * Copyright 2019 Google LLC + */ + /****************************************************************************** * x86_emulate.h * @@ -11,7 +15,7 @@ #ifndef _ASM_X86_KVM_X86_EMULATE_H #define _ASM_X86_KVM_X86_EMULATE_H -#include <asm/desc_defs.h> +#include <gvm_types.h> struct x86_emulate_ctxt; enum x86_intercept; @@ -107,7 +111,7 @@ struct x86_emulate_ops { * @bytes: [IN ] Number of bytes to read from memory. */ int (*read_std)(struct x86_emulate_ctxt *ctxt, - unsigned long addr, void *val, + size_t addr, void *val, unsigned int bytes, struct x86_exception *fault); @@ -118,7 +122,7 @@ struct x86_emulate_ops { * @val: [OUT] Value read from memory. * @bytes: [IN ] Number of bytes to read from memory. */ - int (*read_phys)(struct x86_emulate_ctxt *ctxt, unsigned long addr, + int (*read_phys)(struct x86_emulate_ctxt *ctxt, size_t addr, void *val, unsigned int bytes); /* @@ -129,7 +133,7 @@ struct x86_emulate_ops { * @bytes: [IN ] Number of bytes to write to memory. */ int (*write_std)(struct x86_emulate_ctxt *ctxt, - unsigned long addr, void *val, unsigned int bytes, + size_t addr, void *val, unsigned int bytes, struct x86_exception *fault); /* * fetch: Read bytes of standard (non-emulated/special) memory. @@ -139,7 +143,7 @@ struct x86_emulate_ops { * @bytes: [IN ] Number of bytes to read from memory. */ int (*fetch)(struct x86_emulate_ctxt *ctxt, - unsigned long addr, void *val, unsigned int bytes, + size_t addr, void *val, unsigned int bytes, struct x86_exception *fault); /* @@ -149,7 +153,7 @@ struct x86_emulate_ops { * @bytes: [IN ] Number of bytes to read from memory. */ int (*read_emulated)(struct x86_emulate_ctxt *ctxt, - unsigned long addr, void *val, unsigned int bytes, + size_t addr, void *val, unsigned int bytes, struct x86_exception *fault); /* @@ -160,7 +164,7 @@ struct x86_emulate_ops { * @bytes: [IN ] Number of bytes to write to memory. */ int (*write_emulated)(struct x86_emulate_ctxt *ctxt, - unsigned long addr, const void *val, + size_t addr, const void *val, unsigned int bytes, struct x86_exception *fault); @@ -173,9 +177,9 @@ struct x86_emulate_ops { * @bytes: [IN ] Number of bytes to access using CMPXCHG. 
*/ int (*cmpxchg_emulated)(struct x86_emulate_ctxt *ctxt, - unsigned long addr, + size_t addr, const void *old, - const void *new, + const void *_new, unsigned int bytes, struct x86_exception *fault); void (*invlpg)(struct x86_emulate_ctxt *ctxt, ulong addr); @@ -192,7 +196,7 @@ struct x86_emulate_ops { struct desc_struct *desc, u32 *base3, int seg); void (*set_segment)(struct x86_emulate_ctxt *ctxt, u16 selector, struct desc_struct *desc, u32 base3, int seg); - unsigned long (*get_cached_segment_base)(struct x86_emulate_ctxt *ctxt, + size_t (*get_cached_segment_base)(struct x86_emulate_ctxt *ctxt, int seg); void (*get_gdt)(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt); void (*get_idt)(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt); @@ -223,7 +227,7 @@ struct x86_emulate_ops { void (*set_nmi_mask)(struct x86_emulate_ctxt *ctxt, bool masked); }; -typedef u32 __attribute__((vector_size(16))) sse128_t; +typedef u32 sse128_t[4]; /* Type, address-of, and value of an instruction's operand. */ struct operand { @@ -231,11 +235,11 @@ struct operand { unsigned int bytes; unsigned int count; union { - unsigned long orig_val; + size_t orig_val; u64 orig_val64; }; union { - unsigned long *reg; + size_t *reg; struct segmented_address { ulong ea; unsigned seg; @@ -244,7 +248,7 @@ struct operand { unsigned mm; } addr; union { - unsigned long val; + size_t val; u64 val64; char valptr[sizeof(sse128_t)]; sse128_t vec_val; @@ -261,8 +265,8 @@ struct fetch_cache { struct read_cache { u8 data[1024]; - unsigned long pos; - unsigned long end; + size_t pos; + size_t end; }; /* Execution mode, passed to the emulator. */ @@ -283,8 +287,8 @@ struct x86_emulate_ctxt { const struct x86_emulate_ops *ops; /* Register state before/after emulation. */ - unsigned long eflags; - unsigned long eip; /* eip before instruction emulation */ + size_t eflags; + size_t eip; /* eip before instruction emulation */ /* Emulated execution mode, represented by an X86EMUL_MODE value. */ enum x86emul_mode mode; @@ -334,10 +338,10 @@ struct x86_emulate_ctxt { u8 modrm_seg; u8 seg_override; u64 d; - unsigned long _eip; + size_t _eip; struct operand memop; /* Fields above regs are cleared together. 
*/ - unsigned long _regs[NR_VCPU_REGS]; + size_t _regs[NR_VCPU_REGS]; struct operand *memopp; struct fetch_cache fetch; struct read_cache io_read; diff --git a/arch/x86/include/asm/kvm_guest.h b/arch/x86/include/asm/kvm_guest.h deleted file mode 100644 index a92b176..0000000 --- a/arch/x86/include/asm/kvm_guest.h +++ /dev/null @@ -1,6 +0,0 @@ -#ifndef _ASM_X86_KVM_GUEST_H -#define _ASM_X86_KVM_GUEST_H - -int kvm_setup_vsyscall_timeinfo(void); - -#endif /* _ASM_X86_KVM_GUEST_H */ diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index bdde807..4ea863c 100644..100755 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1,4 +1,6 @@ /* + * Copyright 2019 Google LLC + * * Kernel-based Virtual Machine driver for Linux * * This header defines architecture specific interfaces, x86 version @@ -11,119 +13,74 @@ #ifndef _ASM_X86_KVM_HOST_H #define _ASM_X86_KVM_HOST_H -#include <linux/types.h> -#include <linux/mm.h> -#include <linux/mmu_notifier.h> -#include <linux/tracepoint.h> -#include <linux/cpumask.h> -#include <linux/irq_work.h> - -#include <linux/kvm.h> -#include <linux/kvm_para.h> #include <linux/kvm_types.h> -#include <linux/perf_event.h> -#include <linux/pvclock_gtod.h> -#include <linux/clocksource.h> -#include <linux/irqbypass.h> -#include <linux/hyperv.h> - -#include <asm/apic.h> -#include <asm/pvclock-abi.h> -#include <asm/desc.h> -#include <asm/mtrr.h> -#include <asm/msr-index.h> -#include <asm/asm.h> #include <asm/kvm_page_track.h> -#define KVM_MAX_VCPUS 288 -#define KVM_SOFT_MAX_VCPUS 240 -#define KVM_MAX_VCPU_ID 1023 -#define KVM_USER_MEM_SLOTS 509 +#include <asm/fpu/types.h> +#include <uapi/asm/kvm.h> +#include <ntkrutils.h> +#include <__asm.h> + +#define GVM_MAX_VCPUS 288 +#define GVM_SOFT_MAX_VCPUS 240 +#define GVM_MAX_VCPU_ID 1023 +#define GVM_USER_MEM_SLOTS 125 /* memory slots that are not exposed to userspace */ -#define KVM_PRIVATE_MEM_SLOTS 3 -#define KVM_MEM_SLOTS_NUM (KVM_USER_MEM_SLOTS + KVM_PRIVATE_MEM_SLOTS) +#define GVM_PRIVATE_MEM_SLOTS 3 +#define GVM_MEM_SLOTS_NUM (GVM_USER_MEM_SLOTS + GVM_PRIVATE_MEM_SLOTS) -#define KVM_PIO_PAGE_OFFSET 1 -#define KVM_COALESCED_MMIO_PAGE_OFFSET 2 -#define KVM_HALT_POLL_NS_DEFAULT 400000 +#define GVM_PIO_PAGE_OFFSET 1 -#define KVM_IRQCHIP_NUM_PINS KVM_IOAPIC_NUM_PINS +#define GVM_IRQCHIP_NUM_PINS GVM_IOAPIC_NUM_PINS /* x86-specific vcpu->requests bit members */ -#define KVM_REQ_MIGRATE_TIMER 8 -#define KVM_REQ_REPORT_TPR_ACCESS 9 -#define KVM_REQ_TRIPLE_FAULT 10 -#define KVM_REQ_MMU_SYNC 11 -#define KVM_REQ_CLOCK_UPDATE 12 -#define KVM_REQ_DEACTIVATE_FPU 13 -#define KVM_REQ_EVENT 14 -#define KVM_REQ_APF_HALT 15 -#define KVM_REQ_STEAL_UPDATE 16 -#define KVM_REQ_NMI 17 -#define KVM_REQ_PMU 18 -#define KVM_REQ_PMI 19 -#define KVM_REQ_SMI 20 -#define KVM_REQ_MASTERCLOCK_UPDATE 21 -#define KVM_REQ_MCLOCK_INPROGRESS 22 -#define KVM_REQ_SCAN_IOAPIC 23 -#define KVM_REQ_GLOBAL_CLOCK_UPDATE 24 -#define KVM_REQ_APIC_PAGE_RELOAD 25 -#define KVM_REQ_HV_CRASH 26 -#define KVM_REQ_IOAPIC_EOI_EXIT 27 -#define KVM_REQ_HV_RESET 28 -#define KVM_REQ_HV_EXIT 29 -#define KVM_REQ_HV_STIMER 30 +#define GVM_REQ_REPORT_TPR_ACCESS 9 +#define GVM_REQ_TRIPLE_FAULT 10 +#define GVM_REQ_MMU_SYNC 11 +#define GVM_REQ_EVENT 14 +#define GVM_REQ_NMI 17 +#define GVM_REQ_PMU 18 +#define GVM_REQ_PMI 19 +#define GVM_REQ_SMI 20 +#define GVM_REQ_SCAN_IOAPIC 23 +#define GVM_REQ_GLOBAL_CLOCK_UPDATE 24 +#define GVM_REQ_APIC_PAGE_RELOAD 25 #define CR0_RESERVED_BITS \ - (~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | 
X86_CR0_TS \ + (~(size_t)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \ | X86_CR0_ET | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM \ | X86_CR0_NW | X86_CR0_CD | X86_CR0_PG)) #define CR3_L_MODE_RESERVED_BITS 0xFFFFFF0000000000ULL +#define BIT_64(a) (unsigned long long)(a) #define CR3_PCID_INVD BIT_64(63) #define CR4_RESERVED_BITS \ - (~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\ + (~(size_t)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\ | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE \ | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR | X86_CR4_PCIDE \ | X86_CR4_OSXSAVE | X86_CR4_SMEP | X86_CR4_FSGSBASE \ | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE | X86_CR4_SMAP \ | X86_CR4_PKE)) -#define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR) +#define CR8_RESERVED_BITS (~(size_t)X86_CR8_TPR) -#define INVALID_PAGE (~(hpa_t)0) +/* Let we assume Windows won't give us a page at BIOS range */ +#define INVALID_PAGE (~(hpa_t)0xFFFF) #define VALID_PAGE(x) ((x) != INVALID_PAGE) #define UNMAPPED_GVA (~(gpa_t)0) -/* KVM Hugepage definitions for x86 */ -#define KVM_NR_PAGE_SIZES 3 -#define KVM_HPAGE_GFN_SHIFT(x) (((x) - 1) * 9) -#define KVM_HPAGE_SHIFT(x) (PAGE_SHIFT + KVM_HPAGE_GFN_SHIFT(x)) -#define KVM_HPAGE_SIZE(x) (1UL << KVM_HPAGE_SHIFT(x)) -#define KVM_HPAGE_MASK(x) (~(KVM_HPAGE_SIZE(x) - 1)) -#define KVM_PAGES_PER_HPAGE(x) (KVM_HPAGE_SIZE(x) / PAGE_SIZE) - -static inline gfn_t gfn_to_index(gfn_t gfn, gfn_t base_gfn, int level) -{ - /* KVM_HPAGE_GFN_SHIFT(PT_PAGE_TABLE_LEVEL) must be 0. */ - return (gfn >> KVM_HPAGE_GFN_SHIFT(level)) - - (base_gfn >> KVM_HPAGE_GFN_SHIFT(level)); -} - -#define KVM_PERMILLE_MMU_PAGES 20 -#define KVM_MIN_ALLOC_MMU_PAGES 64 -#define KVM_MMU_HASH_SHIFT 10 -#define KVM_NUM_MMU_PAGES (1 << KVM_MMU_HASH_SHIFT) -#define KVM_MIN_FREE_MMU_PAGES 5 -#define KVM_REFILL_PAGES 25 -#define KVM_MAX_CPUID_ENTRIES 80 -#define KVM_NR_FIXED_MTRR_REGION 88 -#define KVM_NR_VAR_MTRR 8 - -#define ASYNC_PF_PER_VCPU 64 +#define GVM_PERMILLE_MMU_PAGES 20 +#define GVM_MIN_ALLOC_MMU_PAGES 64 +#define GVM_MMU_HASH_SHIFT 10 +#define GVM_NUM_MMU_PAGES (1 << GVM_MMU_HASH_SHIFT) +#define GVM_MIN_FREE_MMU_PAGES 5 +#define GVM_REFILL_PAGES 25 +#define GVM_MAX_CPUID_ENTRIES 80 +#define GVM_NR_FIXED_MTRR_REGION 88 +#define GVM_NR_VAR_MTRR 8 enum kvm_reg { VCPU_REGS_RAX = 0, @@ -168,9 +125,9 @@ enum { #include <asm/kvm_emulate.h> -#define KVM_NR_MEM_OBJS 40 +#define GVM_NR_MEM_OBJS 40 -#define KVM_NR_DB_REGS 4 +#define GVM_NR_DB_REGS 4 #define DR6_BD (1 << 13) #define DR6_BS (1 << 14) @@ -200,14 +157,7 @@ enum { #define PFERR_PK_MASK (1U << PFERR_PK_BIT) /* apic attention bits */ -#define KVM_APIC_CHECK_VAPIC 0 -/* - * The following bit is set with PV-EOI, unset on EOI. - * We detect PV-EOI changes by guest by comparing - * this bit with PV-EOI in guest memory. - * See the implementation in apic_update_pv_eoi. - */ -#define KVM_APIC_PV_EOI_PENDING 1 +#define GVM_APIC_CHECK_VAPIC 0 struct kvm_kernel_irq_routing_entry; @@ -217,7 +167,7 @@ struct kvm_kernel_irq_routing_entry; */ struct kvm_mmu_memory_cache { int nobjs; - void *objects[KVM_NR_MEM_OBJS]; + void *objects[GVM_NR_MEM_OBJS]; }; /* @@ -254,7 +204,7 @@ union kvm_mmu_page_role { }; struct kvm_rmap_head { - unsigned long val; + size_t val; }; struct kvm_mmu_page { @@ -277,7 +227,7 @@ struct kvm_mmu_page { struct kvm_rmap_head parent_ptes; /* rmap pointers to parent sptes */ /* The page is obsolete if mmu_valid_gen != kvm->arch.mmu_valid_gen. 
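The generation check mentioned in the comment above can be illustrated with a short sketch (the real MMU helper in this tree may use a different name): a shadow page counts as obsolete once its recorded generation no longer matches the per-VM counter in kvm->arch.mmu_valid_gen.

/*
 * Illustrative sketch of the obsolescence test implied by the comment above.
 */
static inline bool example_sp_is_obsolete(struct kvm *kvm, struct kvm_mmu_page *sp)
{
	return sp->mmu_valid_gen != kvm->arch.mmu_valid_gen;
}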
*/ - unsigned long mmu_valid_gen; + size_t mmu_valid_gen; DECLARE_BITMAP(unsync_child_bitmap, 512); @@ -294,7 +244,7 @@ struct kvm_mmu_page { }; struct kvm_pio_request { - unsigned long count; + size_t count; int in; int port; int size; @@ -311,11 +261,10 @@ struct rsvd_bits_validate { * mode. */ struct kvm_mmu { - void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long root); - unsigned long (*get_cr3)(struct kvm_vcpu *vcpu); + void (*set_cr3)(struct kvm_vcpu *vcpu, size_t root); + size_t (*get_cr3)(struct kvm_vcpu *vcpu); u64 (*get_pdptr)(struct kvm_vcpu *vcpu, int index); - int (*page_fault)(struct kvm_vcpu *vcpu, gva_t gva, u32 err, - bool prefault); + int (*page_fault)(struct kvm_vcpu *vcpu, gva_t gva, u32 err); void (*inject_page_fault)(struct kvm_vcpu *vcpu, struct x86_exception *fault); gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva, u32 access, @@ -340,14 +289,6 @@ struct kvm_mmu { */ u8 permissions[16]; - /* - * The pkru_mask indicates if protection key checks are needed. It - * consists of 16 domains indexed by page fault error code bits [4:1], - * with PFEC.RSVD replaced by ACC_USER_MASK from the page tables. - * Each domain has 2 bits which are ANDed with AD and WD from PKRU. - */ - u32 pkru_mask; - u64 *pae_root; u64 *lm_root; @@ -369,8 +310,8 @@ struct kvm_mmu { }; enum pmc_type { - KVM_PMC_GP = 0, - KVM_PMC_FIXED, + GVM_PMC_GP = 0, + GVM_PMC_FIXED, }; struct kvm_pmc { @@ -403,9 +344,9 @@ struct kvm_pmu { struct kvm_pmu_ops; enum { - KVM_DEBUGREG_BP_ENABLED = 1, - KVM_DEBUGREG_WONT_EXIT = 2, - KVM_DEBUGREG_RELOAD = 4, + GVM_DEBUGREG_BP_ENABLED = 1, + GVM_DEBUGREG_WONT_EXIT = 2, + GVM_DEBUGREG_RELOAD = 4, }; struct kvm_mtrr_range { @@ -415,70 +356,36 @@ struct kvm_mtrr_range { }; struct kvm_mtrr { - struct kvm_mtrr_range var_ranges[KVM_NR_VAR_MTRR]; - mtrr_type fixed_ranges[KVM_NR_FIXED_MTRR_REGION]; + struct kvm_mtrr_range var_ranges[GVM_NR_VAR_MTRR]; + mtrr_type fixed_ranges[GVM_NR_FIXED_MTRR_REGION]; u64 deftype; struct list_head head; }; -/* Hyper-V SynIC timer */ -struct kvm_vcpu_hv_stimer { - struct hrtimer timer; - int index; - u64 config; - u64 count; - u64 exp_time; - struct hv_message msg; - bool msg_pending; -}; - -/* Hyper-V synthetic interrupt controller (SynIC)*/ -struct kvm_vcpu_hv_synic { - u64 version; - u64 control; - u64 msg_page; - u64 evt_page; - atomic64_t sint[HV_SYNIC_SINT_COUNT]; - atomic_t sint_to_gsi[HV_SYNIC_SINT_COUNT]; - DECLARE_BITMAP(auto_eoi_bitmap, 256); - DECLARE_BITMAP(vec_bitmap, 256); - bool active; -}; - -/* Hyper-V per vcpu emulation context */ -struct kvm_vcpu_hv { - u64 hv_vapic; - s64 runtime_offset; - struct kvm_vcpu_hv_synic synic; - struct kvm_hyperv_exit exit; - struct kvm_vcpu_hv_stimer stimer[HV_SYNIC_STIMER_COUNT]; - DECLARE_BITMAP(stimer_pending_bitmap, HV_SYNIC_STIMER_COUNT); -}; - struct kvm_vcpu_arch { /* * rip and regs accesses must go through * kvm_{register,rip}_{read,write} functions. 
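As a short illustration of that rule: regs_avail and regs_dirty below cache which registers have actually been read from or flushed back to hardware state, which is why raw regs[] access is discouraged. A hedged sketch of the read path (the real kvm_register_read() may differ in detail):

/*
 * Illustrative only: read a GPR through the caching layer. cache_reg() is
 * the kvm_x86_ops hook that loads the register from the VMCB/VMCS on demand.
 */
static inline size_t example_register_read(struct kvm_vcpu *vcpu, enum kvm_reg reg)
{
	if (!(vcpu->arch.regs_avail & (1u << reg)))
		kvm_x86_ops->cache_reg(vcpu, reg);
	return vcpu->arch.regs[reg];
}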
*/ - unsigned long regs[NR_VCPU_REGS]; + size_t regs[NR_VCPU_REGS]; u32 regs_avail; u32 regs_dirty; - unsigned long cr0; - unsigned long cr0_guest_owned_bits; - unsigned long cr2; - unsigned long cr3; - unsigned long cr4; - unsigned long cr4_guest_owned_bits; - unsigned long cr8; + size_t cr0; + size_t cr0_guest_owned_bits; + size_t cr2; + size_t cr3; + size_t cr4; + size_t cr4_guest_owned_bits; + size_t cr8; u32 hflags; u64 efer; u64 apic_base; struct kvm_lapic *apic; /* kernel irqchip context */ bool apicv_active; DECLARE_BITMAP(ioapic_handled_vectors, 256); - unsigned long apic_attention; + size_t apic_attention; int32_t apic_arb_prio; int mp_state; u64 ia32_misc_enable_msr; @@ -515,7 +422,8 @@ struct kvm_vcpu_arch { struct kvm_mmu_memory_cache mmu_page_cache; struct kvm_mmu_memory_cache mmu_page_header_cache; - struct fpu guest_fpu; + union fpu_state host_fpu; + union fpu_state guest_fpu; u64 xcr0; u64 guest_supported_xcr0; u32 guest_xstate_size; @@ -542,7 +450,7 @@ struct kvm_vcpu_arch { int halt_request; /* real mode on Intel only */ int cpuid_nent; - struct kvm_cpuid_entry2 cpuid_entries[KVM_MAX_CPUID_ENTRIES]; + struct kvm_cpuid_entry cpuid_entries[GVM_MAX_CPUID_ENTRIES]; int maxphyaddr; @@ -554,34 +462,13 @@ struct kvm_vcpu_arch { int (*complete_userspace_io)(struct kvm_vcpu *vcpu); gpa_t time; - struct pvclock_vcpu_time_info hv_clock; unsigned int hw_tsc_khz; struct gfn_to_hva_cache pv_time; - bool pv_time_enabled; - /* set guest stopped flag in pvclock flags field */ - bool pvclock_set_guest_stopped_request; - - struct { - u64 msr_val; - u64 last_steal; - struct gfn_to_hva_cache stime; - struct kvm_steal_time steal; - } st; u64 tsc_offset; u64 last_guest_tsc; - u64 last_host_tsc; u64 tsc_offset_adjustment; - u64 this_tsc_nsec; - u64 this_tsc_write; - u64 this_tsc_generation; - bool tsc_catchup; - bool tsc_always_catchup; - s8 virtual_tsc_shift; - u32 virtual_tsc_mult; - u32 virtual_tsc_khz; s64 ia32_tsc_adjust_msr; - u64 tsc_scaling_ratio; atomic_t nmi_queued; /* unprocessed asynchronous NMIs */ unsigned nmi_pending; /* NMI queued after currently running handler */ @@ -592,17 +479,11 @@ struct kvm_vcpu_arch { u64 pat; unsigned switch_db_regs; - unsigned long db[KVM_NR_DB_REGS]; - unsigned long dr6; - unsigned long dr7; - unsigned long eff_db[KVM_NR_DB_REGS]; - unsigned long guest_debug_dr7; - - u64 mcg_cap; - u64 mcg_status; - u64 mcg_ctl; - u64 mcg_ext_ctl; - u64 *mce_banks; + size_t db[GVM_NR_DB_REGS]; + size_t dr6; + size_t dr7; + size_t eff_db[GVM_NR_DB_REGS]; + size_t guest_debug_dr7; /* Cache MMIO info */ u64 mmio_gva; @@ -613,23 +494,10 @@ struct kvm_vcpu_arch { struct kvm_pmu pmu; /* used for guest single stepping over the given code position */ - unsigned long singlestep_rip; - - struct kvm_vcpu_hv hyperv; + size_t singlestep_rip; - cpumask_var_t wbinvd_dirty_mask; - - unsigned long last_retry_eip; - unsigned long last_retry_addr; - - struct { - bool halted; - gfn_t gfns[roundup_pow_of_two(ASYNC_PF_PER_VCPU)]; - struct gfn_to_hva_cache data; - u64 msr_val; - u32 id; - bool send_user_only; - } apf; + size_t last_retry_eip; + size_t last_retry_addr; /* OSVW MSRs (AMD only) */ struct { @@ -637,11 +505,6 @@ struct kvm_vcpu_arch { u64 status; } osvw; - struct { - u64 msr_val; - struct gfn_to_hva_cache data; - } pv_eoi; - /* * Indicate whether the access faults on its page table in guest * which is set when fix page fault and used to detect unhandeable @@ -650,24 +513,14 @@ struct kvm_vcpu_arch { bool write_fault_to_shadow_pgtable; /* set at EPT violation at this point */ - 
unsigned long exit_qualification; - - /* pv related host specific info */ - struct { - bool pv_unhalted; - } pv; + size_t exit_qualification; int pending_ioapic_eoi; int pending_external_vector; }; -struct kvm_lpage_info { - int disallow_lpage; -}; - struct kvm_arch_memory_slot { - struct kvm_rmap_head *rmap[KVM_NR_PAGE_SIZES]; - struct kvm_lpage_info *lpage_info[KVM_NR_PAGE_SIZES - 1]; + struct kvm_rmap_head *rmap; unsigned short *gfn_track[KVM_PAGE_TRACK_MAX]; }; @@ -678,12 +531,11 @@ struct kvm_arch_memory_slot { * configured for multiple modes; in that case, we cannot use the map and * hence cannot use kvm_irq_delivery_to_apic_fast either. */ -#define KVM_APIC_MODE_XAPIC_CLUSTER 4 -#define KVM_APIC_MODE_XAPIC_FLAT 8 -#define KVM_APIC_MODE_X2APIC 16 +#define GVM_APIC_MODE_XAPIC_CLUSTER 4 +#define GVM_APIC_MODE_XAPIC_FLAT 8 +#define GVM_APIC_MODE_X2APIC 16 struct kvm_apic_map { - struct rcu_head rcu; u8 mode; u32 max_apic_id; union { @@ -693,41 +545,21 @@ struct kvm_apic_map { struct kvm_lapic *phys_map[]; }; -/* Hyper-V emulation context */ -struct kvm_hv { - u64 hv_guest_os_id; - u64 hv_hypercall; - u64 hv_tsc_page; - - /* Hyper-v based guest crash (NT kernel bugcheck) parameters */ - u64 hv_crash_param[HV_X64_MSR_CRASH_PARAMS]; - u64 hv_crash_ctl; - - HV_REFERENCE_TSC_PAGE tsc_ref; -}; - struct kvm_arch { unsigned int n_used_mmu_pages; unsigned int n_requested_mmu_pages; unsigned int n_max_mmu_pages; unsigned int indirect_shadow_pages; - unsigned long mmu_valid_gen; - struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES]; + size_t mmu_valid_gen; + struct hlist_head mmu_page_hash[GVM_NUM_MMU_PAGES]; /* * Hash table of struct kvm_mmu_page. */ struct list_head active_mmu_pages; struct list_head zapped_obsolete_pages; - struct kvm_page_track_notifier_node mmu_sp_tracker; - struct kvm_page_track_notifier_head track_notifier_head; - - struct list_head assigned_dev_head; - struct iommu_domain *iommu_domain; - bool iommu_noncoherent; -#define __KVM_HAVE_ARCH_NONCOHERENT_DMA - atomic_t noncoherent_dma_count; -#define __KVM_HAVE_ARCH_ASSIGNED_DEVICE - atomic_t assigned_device_count; + struct kvm_page_track_notifier_node mmu_sp_tracker; + struct kvm_page_track_notifier_head track_notifier_head; + struct kvm_pic *vpic; struct kvm_ioapic *vioapic; struct kvm_pit *vpit; @@ -743,33 +575,15 @@ struct kvm_arch { bool ept_identity_pagetable_done; gpa_t ept_identity_map_addr; - unsigned long irq_sources_bitmap; - s64 kvmclock_offset; + size_t irq_sources_bitmap; raw_spinlock_t tsc_write_lock; - u64 last_tsc_nsec; - u64 last_tsc_write; - u32 last_tsc_khz; - u64 cur_tsc_nsec; - u64 cur_tsc_write; - u64 cur_tsc_offset; - u64 cur_tsc_generation; - int nr_vcpus_matched_tsc; - - spinlock_t pvclock_gtod_sync_lock; - bool use_master_clock; - u64 master_kernel_ns; - cycle_t master_cycle_now; - struct delayed_work kvmclock_update_work; - struct delayed_work kvmclock_sync_work; - struct kvm_xen_hvm_config xen_hvm_config; + u64 master_kernel_ns; /* reads protected by irq_srcu, writes by irq_lock */ struct hlist_head mask_notifier_list; - struct kvm_hv hyperv; - - #ifdef CONFIG_KVM_MMU_AUDIT + #ifdef CONFIG_GVM_MMU_AUDIT int audit_point; #endif @@ -778,7 +592,6 @@ struct kvm_arch { u64 disabled_quirks; - bool irqchip_split; u8 nr_reserved_ioapic_pins; bool disabled_lapic_found; @@ -828,7 +641,6 @@ struct kvm_vcpu_stat { u64 irq_exits; u64 host_state_reload; u64 efer_reload; - u64 fpu_reload; u64 insn_emulation; u64 insn_emulation_fail; u64 hypercalls; @@ -875,7 +687,8 @@ struct kvm_x86_ops { void (*vcpu_free)(struct 
kvm_vcpu *vcpu); void (*vcpu_reset)(struct kvm_vcpu *vcpu, bool init_event); - void (*prepare_guest_switch)(struct kvm_vcpu *vcpu); + void (*save_host_state)(struct kvm_vcpu *vcpu); + void (*load_host_state)(struct kvm_vcpu *vcpu); void (*vcpu_load)(struct kvm_vcpu *vcpu, int cpu); void (*vcpu_put)(struct kvm_vcpu *vcpu); @@ -892,24 +705,21 @@ struct kvm_x86_ops { void (*decache_cr0_guest_bits)(struct kvm_vcpu *vcpu); void (*decache_cr3)(struct kvm_vcpu *vcpu); void (*decache_cr4_guest_bits)(struct kvm_vcpu *vcpu); - void (*set_cr0)(struct kvm_vcpu *vcpu, unsigned long cr0); - void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3); - int (*set_cr4)(struct kvm_vcpu *vcpu, unsigned long cr4); + void (*set_cr0)(struct kvm_vcpu *vcpu, size_t cr0); + void (*set_cr3)(struct kvm_vcpu *vcpu, size_t cr3); + int (*set_cr4)(struct kvm_vcpu *vcpu, size_t cr4); void (*set_efer)(struct kvm_vcpu *vcpu, u64 efer); void (*get_idt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt); void (*set_idt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt); void (*get_gdt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt); void (*set_gdt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt); u64 (*get_dr6)(struct kvm_vcpu *vcpu); - void (*set_dr6)(struct kvm_vcpu *vcpu, unsigned long value); + void (*set_dr6)(struct kvm_vcpu *vcpu, size_t value); void (*sync_dirty_debug_regs)(struct kvm_vcpu *vcpu); - void (*set_dr7)(struct kvm_vcpu *vcpu, unsigned long value); + void (*set_dr7)(struct kvm_vcpu *vcpu, size_t value); void (*cache_reg)(struct kvm_vcpu *vcpu, enum kvm_reg reg); - unsigned long (*get_rflags)(struct kvm_vcpu *vcpu); - void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags); - u32 (*get_pkru)(struct kvm_vcpu *vcpu); - void (*fpu_activate)(struct kvm_vcpu *vcpu); - void (*fpu_deactivate)(struct kvm_vcpu *vcpu); + size_t (*get_rflags)(struct kvm_vcpu *vcpu); + void (*set_rflags)(struct kvm_vcpu *vcpu, size_t rflags); void (*tlb_flush)(struct kvm_vcpu *vcpu); @@ -941,7 +751,6 @@ struct kvm_x86_ops { void (*set_virtual_x2apic_mode)(struct kvm_vcpu *vcpu, bool set); void (*set_apic_access_page_addr)(struct kvm_vcpu *vcpu, hpa_t hpa); void (*deliver_posted_interrupt)(struct kvm_vcpu *vcpu, int vector); - void (*sync_pir_to_irr)(struct kvm_vcpu *vcpu); int (*set_tss_addr)(struct kvm *kvm, unsigned int addr); int (*get_tdp_level)(void); u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio); @@ -949,9 +758,9 @@ struct kvm_x86_ops { bool (*rdtscp_supported)(void); bool (*invpcid_supported)(void); - void (*set_tdp_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3); + void (*set_tdp_cr3)(struct kvm_vcpu *vcpu, size_t cr3); - void (*set_supported_cpuid)(u32 func, struct kvm_cpuid_entry2 *entry); + void (*set_supported_cpuid)(u32 func, struct kvm_cpuid_entry *entry); bool (*has_wbinvd_exit)(void); @@ -968,8 +777,6 @@ struct kvm_x86_ops { int (*check_nested_events)(struct kvm_vcpu *vcpu, bool external_intr); - void (*sched_in)(struct kvm_vcpu *kvm, int cpu); - /* * Arch-specific dirty logging hooks. These hooks are only supposed to * be valid if the specific arch has hardware-accelerated dirty logging @@ -993,40 +800,14 @@ struct kvm_x86_ops { void (*flush_log_dirty)(struct kvm *kvm); void (*enable_log_dirty_pt_masked)(struct kvm *kvm, struct kvm_memory_slot *slot, - gfn_t offset, unsigned long mask); + gfn_t offset, size_t mask); /* pmu operations of sub-arch */ const struct kvm_pmu_ops *pmu_ops; - /* - * Architecture specific hooks for vCPU blocking due to - * HLT instruction. 
- * Returns for .pre_block(): - * - 0 means continue to block the vCPU. - * - 1 means we cannot block the vCPU since some event - * happens during this period, such as, 'ON' bit in - * posted-interrupts descriptor is set. - */ - int (*pre_block)(struct kvm_vcpu *vcpu); - void (*post_block)(struct kvm_vcpu *vcpu); - void (*vcpu_blocking)(struct kvm_vcpu *vcpu); void (*vcpu_unblocking)(struct kvm_vcpu *vcpu); - int (*update_pi_irte)(struct kvm *kvm, unsigned int host_irq, - uint32_t guest_irq, bool set); void (*apicv_post_state_restore)(struct kvm_vcpu *vcpu); - - int (*set_hv_timer)(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc); - void (*cancel_hv_timer)(struct kvm_vcpu *vcpu); - - void (*setup_mce)(struct kvm_vcpu *vcpu); -}; - -struct kvm_arch_async_pf { - u32 token; - gfn_t gfn; - unsigned long cr3; - bool direct_map; }; extern struct kvm_x86_ops *kvm_x86_ops; @@ -1049,19 +830,17 @@ void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm, const struct kvm_memory_slot *memslot); void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm, struct kvm_memory_slot *memslot); -void kvm_mmu_slot_largepage_remove_write_access(struct kvm *kvm, - struct kvm_memory_slot *memslot); void kvm_mmu_slot_set_dirty(struct kvm *kvm, struct kvm_memory_slot *memslot); void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm, struct kvm_memory_slot *slot, - gfn_t gfn_offset, unsigned long mask); + gfn_t gfn_offset, size_t mask); void kvm_mmu_zap_all(struct kvm *kvm); void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, struct kvm_memslots *slots); unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm); void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages); -int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3); +int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, size_t cr3); int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, const void *val, int bytes); @@ -1081,21 +860,6 @@ void kvm_fire_mask_notifiers(struct kvm *kvm, unsigned irqchip, unsigned pin, extern bool tdp_enabled; -u64 vcpu_tsc_khz(struct kvm_vcpu *vcpu); - -/* control of guest tsc rate supported? 
*/ -extern bool kvm_has_tsc_control; -/* maximum supported tsc_khz for guests */ -extern u32 kvm_max_guest_tsc_khz; -/* number of bits of the fractional part of the TSC scaling ratio */ -extern u8 kvm_tsc_scaling_ratio_frac_bits; -/* maximum allowed value of TSC scaling ratio */ -extern u64 kvm_max_tsc_scaling_ratio; -/* 1ull << kvm_tsc_scaling_ratio_frac_bits */ -extern u64 kvm_default_tsc_scaling_ratio; - -extern u64 kvm_mce_cap_supported; - enum emulation_result { EMULATE_DONE, /* no further processing */ EMULATE_USER_EXIT, /* kvm_run ready for userspace exit */ @@ -1107,7 +871,7 @@ enum emulation_result { #define EMULTYPE_SKIP (1 << 2) #define EMULTYPE_RETRY (1 << 3) #define EMULTYPE_NO_REEXECUTE (1 << 4) -int x86_emulate_instruction(struct kvm_vcpu *vcpu, unsigned long cr2, +int x86_emulate_instruction(struct kvm_vcpu *vcpu, size_t cr2, int emulation_type, void *insn, int insn_len); static inline int emulate_instruction(struct kvm_vcpu *vcpu, @@ -1136,22 +900,22 @@ void kvm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector); int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index, int reason, bool has_error_code, u32 error_code); -int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0); -int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3); -int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4); -int kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8); -int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val); -int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val); -unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu); -void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw); +int kvm_set_cr0(struct kvm_vcpu *vcpu, size_t cr0); +int kvm_set_cr3(struct kvm_vcpu *vcpu, size_t cr3); +int kvm_set_cr4(struct kvm_vcpu *vcpu, size_t cr4); +int kvm_set_cr8(struct kvm_vcpu *vcpu, size_t cr8); +int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, size_t val); +int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, size_t *val); +size_t kvm_get_cr8(struct kvm_vcpu *vcpu); +void kvm_lmsw(struct kvm_vcpu *vcpu, size_t msw); void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l); int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr); int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr); int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr); -unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu); -void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags); +size_t kvm_get_rflags(struct kvm_vcpu *vcpu); +void kvm_set_rflags(struct kvm_vcpu *vcpu, size_t rflags); bool kvm_rdpmc(struct kvm_vcpu *vcpu); void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr); @@ -1165,7 +929,7 @@ int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl); bool kvm_require_dr(struct kvm_vcpu *vcpu, int dr); -static inline int __kvm_irq_line_state(unsigned long *irq_state, +static inline int __kvm_irq_line_state(size_t *irq_state, int irq_source_id, int level) { /* Logical OR for level trig interrupt */ @@ -1214,32 +978,33 @@ void kvm_disable_tdp(void); static inline gpa_t translate_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access, struct x86_exception *exception) { + _CRT_UNUSED(vcpu); + _CRT_UNUSED(access); + _CRT_UNUSED(exception); return gpa; } static inline struct kvm_mmu_page *page_header(hpa_t shadow_page) { - struct page *page = pfn_to_page(shadow_page >> PAGE_SHIFT); + struct page *page = (struct page *)pfn_to_page(shadow_page >> PAGE_SHIFT); 
return (struct kvm_mmu_page *)page_private(page); } static inline u16 kvm_read_ldt(void) { - u16 ldt; - asm("sldt %0" : "=g"(ldt)); - return ldt; + return gvm_read_ldt(); } static inline void kvm_load_ldt(u16 sel) { - asm("lldt %0" : : "rm"(sel)); + gvm_load_ldt(sel); } #ifdef CONFIG_X86_64 -static inline unsigned long read_msr(unsigned long msr) +static inline size_t read_msr(unsigned int msr) { - u64 value; + u64 value = 0; rdmsrl(msr, value); return value; @@ -1293,41 +1058,18 @@ enum { #define HF_SMM_MASK (1 << 6) #define HF_SMM_INSIDE_NMI_MASK (1 << 7) -#define __KVM_VCPU_MULTIPLE_ADDRESS_SPACE -#define KVM_ADDRESS_SPACE_NUM 2 +#define __GVM_VCPU_MULTIPLE_ADDRESS_SPACE +#define GVM_ADDRESS_SPACE_NUM 2 #define kvm_arch_vcpu_memslots_id(vcpu) ((vcpu)->arch.hflags & HF_SMM_MASK ? 1 : 0) #define kvm_memslots_for_spte_role(kvm, role) __kvm_memslots(kvm, (role).smm) -/* - * Hardware virtualization extension instructions may fault if a - * reboot turns off virtualization while processes are running. - * Trap the fault and ignore the instruction if that happens. - */ -asmlinkage void kvm_spurious_fault(void); - -#define ____kvm_handle_fault_on_reboot(insn, cleanup_insn) \ - "666: " insn "\n\t" \ - "668: \n\t" \ - ".pushsection .fixup, \"ax\" \n" \ - "667: \n\t" \ - cleanup_insn "\n\t" \ - "cmpb $0, kvm_rebooting \n\t" \ - "jne 668b \n\t" \ - __ASM_SIZE(push) " $666b \n\t" \ - "call kvm_spurious_fault \n\t" \ - ".popsection \n\t" \ - _ASM_EXTABLE(666b, 667b) - -#define __kvm_handle_fault_on_reboot(insn) \ - ____kvm_handle_fault_on_reboot(insn, "") - -#define KVM_ARCH_WANT_MMU_NOTIFIER -int kvm_unmap_hva(struct kvm *kvm, unsigned long hva); -int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end); -int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end); -int kvm_test_age_hva(struct kvm *kvm, unsigned long hva); -void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte); +#define GVM_ARCH_WANT_MMU_NOTIFIER +int kvm_unmap_hva(struct kvm *kvm, size_t hva); +int kvm_unmap_hva_range(struct kvm *kvm, size_t start, size_t end); +int kvm_age_hva(struct kvm *kvm, size_t start, size_t end); +int kvm_test_age_hva(struct kvm *kvm, size_t hva); +void kvm_set_spte_hva(struct kvm *kvm, size_t hva, pte_t pte); int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v); int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu); int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu); @@ -1335,7 +1077,7 @@ int kvm_cpu_get_interrupt(struct kvm_vcpu *v); void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event); void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu); void kvm_arch_mmu_notifier_invalidate_page(struct kvm *kvm, - unsigned long address); + size_t address); void kvm_define_shared_msr(unsigned index, u32 msr); int kvm_set_shared_msr(unsigned index, u64 val, u64 mask); @@ -1343,21 +1085,11 @@ int kvm_set_shared_msr(unsigned index, u64 val, u64 mask); u64 kvm_scale_tsc(struct kvm_vcpu *vcpu, u64 tsc); u64 kvm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc); -unsigned long kvm_get_linear_rip(struct kvm_vcpu *vcpu); -bool kvm_is_linear_rip(struct kvm_vcpu *vcpu, unsigned long linear_rip); +size_t kvm_get_linear_rip(struct kvm_vcpu *vcpu); +bool kvm_is_linear_rip(struct kvm_vcpu *vcpu, size_t linear_rip); -void kvm_make_mclock_inprogress_request(struct kvm *kvm); void kvm_make_scan_ioapic_request(struct kvm *kvm); -void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu, - struct kvm_async_pf *work); -void kvm_arch_async_page_present(struct kvm_vcpu *vcpu, - 
struct kvm_async_pf *work); -void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, - struct kvm_async_pf *work); -bool kvm_arch_can_inject_async_page_present(struct kvm_vcpu *vcpu); -extern bool kvm_find_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn); - void kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err); int kvm_is_in_guest(void); @@ -1385,13 +1117,17 @@ static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) kvm_x86_ops->vcpu_unblocking(vcpu); } -static inline void kvm_arch_vcpu_block_finish(struct kvm_vcpu *vcpu) {} +static inline void kvm_arch_vcpu_block_finish(struct kvm_vcpu *vcpu) +{ + _CRT_UNUSED(vcpu); +} static inline int kvm_cpu_get_apicid(int mps_cpu) { #ifdef CONFIG_X86_LOCAL_APIC return __default_cpu_present_to_apicid(mps_cpu); #else + _CRT_UNUSED(mps_cpu); WARN_ON_ONCE(1); return BAD_APICID; #endif diff --git a/arch/x86/include/asm/kvm_page_track.h b/arch/x86/include/asm/kvm_page_track.h index c2b8d24..117ef6a 100644..100755 --- a/arch/x86/include/asm/kvm_page_track.h +++ b/arch/x86/include/asm/kvm_page_track.h @@ -1,6 +1,12 @@ +/* + * Copyright 2019 Google LLC + */ + #ifndef _ASM_X86_KVM_PAGE_TRACK_H #define _ASM_X86_KVM_PAGE_TRACK_H +#include <ntkrutils.h> + enum kvm_page_track_mode { KVM_PAGE_TRACK_WRITE, KVM_PAGE_TRACK_MAX, @@ -35,11 +41,12 @@ struct kvm_page_track_notifier_node { }; void kvm_page_track_init(struct kvm *kvm); +void kvm_page_track_destroy(struct kvm *kvm); void kvm_page_track_free_memslot(struct kvm_memory_slot *free, struct kvm_memory_slot *dont); int kvm_page_track_create_memslot(struct kvm_memory_slot *slot, - unsigned long npages); + size_t npages); void kvm_slot_page_track_add_page(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn, diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h deleted file mode 100644 index bc62e7c..0000000 --- a/arch/x86/include/asm/kvm_para.h +++ /dev/null @@ -1,130 +0,0 @@ -#ifndef _ASM_X86_KVM_PARA_H -#define _ASM_X86_KVM_PARA_H - -#include <asm/processor.h> -#include <asm/alternative.h> -#include <uapi/asm/kvm_para.h> - -extern void kvmclock_init(void); -extern int kvm_register_clock(char *txt); - -#ifdef CONFIG_KVM_GUEST -bool kvm_check_and_clear_guest_paused(void); -#else -static inline bool kvm_check_and_clear_guest_paused(void) -{ - return false; -} -#endif /* CONFIG_KVM_GUEST */ - -#define KVM_HYPERCALL \ - ALTERNATIVE(".byte 0x0f,0x01,0xc1", ".byte 0x0f,0x01,0xd9", X86_FEATURE_VMMCALL) - -/* For KVM hypercalls, a three-byte sequence of either the vmcall or the vmmcall - * instruction. The hypervisor may replace it with something else but only the - * instructions are guaranteed to be supported. - * - * Up to four arguments may be passed in rbx, rcx, rdx, and rsi respectively. - * The hypercall number should be placed in rax and the return value will be - * placed in rax. No other registers will be clobbered unless explicitly - * noted by the particular hypercall. 
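The convention described above is what the kvm_hypercall0() through kvm_hypercall4() helpers below implement (this change removes them along with the rest of the guest-side paravirt header). Illustrative guest-side usage, with a made-up hypercall number:

/*
 * Illustrative only: nr goes in rax, the two arguments in rbx and rcx,
 * and the result comes back in rax. The hypercall number 42 is hypothetical.
 */
static inline long example_probe_hypercall(void)
{
	return kvm_hypercall2(42, 0, 0);
}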
- */ - -static inline long kvm_hypercall0(unsigned int nr) -{ - long ret; - asm volatile(KVM_HYPERCALL - : "=a"(ret) - : "a"(nr) - : "memory"); - return ret; -} - -static inline long kvm_hypercall1(unsigned int nr, unsigned long p1) -{ - long ret; - asm volatile(KVM_HYPERCALL - : "=a"(ret) - : "a"(nr), "b"(p1) - : "memory"); - return ret; -} - -static inline long kvm_hypercall2(unsigned int nr, unsigned long p1, - unsigned long p2) -{ - long ret; - asm volatile(KVM_HYPERCALL - : "=a"(ret) - : "a"(nr), "b"(p1), "c"(p2) - : "memory"); - return ret; -} - -static inline long kvm_hypercall3(unsigned int nr, unsigned long p1, - unsigned long p2, unsigned long p3) -{ - long ret; - asm volatile(KVM_HYPERCALL - : "=a"(ret) - : "a"(nr), "b"(p1), "c"(p2), "d"(p3) - : "memory"); - return ret; -} - -static inline long kvm_hypercall4(unsigned int nr, unsigned long p1, - unsigned long p2, unsigned long p3, - unsigned long p4) -{ - long ret; - asm volatile(KVM_HYPERCALL - : "=a"(ret) - : "a"(nr), "b"(p1), "c"(p2), "d"(p3), "S"(p4) - : "memory"); - return ret; -} - -#ifdef CONFIG_KVM_GUEST -bool kvm_para_available(void); -unsigned int kvm_arch_para_features(void); -void __init kvm_guest_init(void); -void kvm_async_pf_task_wait(u32 token); -void kvm_async_pf_task_wake(u32 token); -u32 kvm_read_and_reset_pf_reason(void); -extern void kvm_disable_steal_time(void); - -#ifdef CONFIG_PARAVIRT_SPINLOCKS -void __init kvm_spinlock_init(void); -#else /* !CONFIG_PARAVIRT_SPINLOCKS */ -static inline void kvm_spinlock_init(void) -{ -} -#endif /* CONFIG_PARAVIRT_SPINLOCKS */ - -#else /* CONFIG_KVM_GUEST */ -#define kvm_guest_init() do {} while (0) -#define kvm_async_pf_task_wait(T) do {} while(0) -#define kvm_async_pf_task_wake(T) do {} while(0) - -static inline bool kvm_para_available(void) -{ - return false; -} - -static inline unsigned int kvm_arch_para_features(void) -{ - return 0; -} - -static inline u32 kvm_read_and_reset_pf_reason(void) -{ - return 0; -} - -static inline void kvm_disable_steal_time(void) -{ - return; -} -#endif - -#endif /* _ASM_X86_KVM_PARA_H */ diff --git a/arch/x86/include/asm/msidef.h b/arch/x86/include/asm/msidef.h new file mode 100755 index 0000000..4cc48af --- /dev/null +++ b/arch/x86/include/asm/msidef.h @@ -0,0 +1,56 @@ +#ifndef _ASM_X86_MSIDEF_H +#define _ASM_X86_MSIDEF_H + +/* + * Constants for Intel APIC based MSI messages. 
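A brief sketch of how the macros defined below combine into an MSI message: the address word selects the destination APIC and addressing mode, while the data word selects the vector, delivery mode and trigger. The helper below is illustrative, not part of this header.

/*
 * Illustrative only: compose an edge-triggered, fixed-delivery MSI aimed at
 * a physical APIC ID, using the MSI_ADDR_* and MSI_DATA_* macros below.
 */
static inline void example_compose_msi(u8 dest_apic_id, u8 vector,
				       u32 *addr_lo, u32 *data)
{
	*addr_lo = MSI_ADDR_BASE_LO |
		   MSI_ADDR_DEST_MODE_PHYSICAL |
		   MSI_ADDR_REDIRECTION_CPU |
		   MSI_ADDR_DEST_ID(dest_apic_id);
	*data = MSI_DATA_TRIGGER_EDGE |
		MSI_DATA_LEVEL_ASSERT |
		MSI_DATA_DELIVERY_FIXED |
		MSI_DATA_VECTOR(vector);
}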
+ */ + +/* + * Shifts for MSI data + */ + +#define MSI_DATA_VECTOR_SHIFT 0 +#define MSI_DATA_VECTOR_MASK 0x000000ff +#define MSI_DATA_VECTOR(v) (((v) << MSI_DATA_VECTOR_SHIFT) & \ + MSI_DATA_VECTOR_MASK) + +#define MSI_DATA_DELIVERY_MODE_SHIFT 8 +#define MSI_DATA_DELIVERY_FIXED (0 << MSI_DATA_DELIVERY_MODE_SHIFT) +#define MSI_DATA_DELIVERY_LOWPRI (1 << MSI_DATA_DELIVERY_MODE_SHIFT) + +#define MSI_DATA_LEVEL_SHIFT 14 +#define MSI_DATA_LEVEL_DEASSERT (0 << MSI_DATA_LEVEL_SHIFT) +#define MSI_DATA_LEVEL_ASSERT (1 << MSI_DATA_LEVEL_SHIFT) + +#define MSI_DATA_TRIGGER_SHIFT 15 +#define MSI_DATA_TRIGGER_EDGE (0 << MSI_DATA_TRIGGER_SHIFT) +#define MSI_DATA_TRIGGER_LEVEL (1 << MSI_DATA_TRIGGER_SHIFT) + +/* + * Shift/mask fields for msi address + */ + +#define MSI_ADDR_BASE_HI 0 +#define MSI_ADDR_BASE_LO 0xfee00000 + +#define MSI_ADDR_DEST_MODE_SHIFT 2 +#define MSI_ADDR_DEST_MODE_PHYSICAL (0 << MSI_ADDR_DEST_MODE_SHIFT) +#define MSI_ADDR_DEST_MODE_LOGICAL (1 << MSI_ADDR_DEST_MODE_SHIFT) + +#define MSI_ADDR_REDIRECTION_SHIFT 3 +#define MSI_ADDR_REDIRECTION_CPU (0 << MSI_ADDR_REDIRECTION_SHIFT) + /* dedicated cpu */ +#define MSI_ADDR_REDIRECTION_LOWPRI (1 << MSI_ADDR_REDIRECTION_SHIFT) + /* lowest priority */ + +#define MSI_ADDR_DEST_ID_SHIFT 12 +#define MSI_ADDR_DEST_ID_MASK 0x00ffff0 +#define MSI_ADDR_DEST_ID(dest) (((dest) << MSI_ADDR_DEST_ID_SHIFT) & \ + MSI_ADDR_DEST_ID_MASK) +#define MSI_ADDR_EXT_DEST_ID(dest) ((dest) & 0xffffff00) + +#define MSI_ADDR_IR_EXT_INT (1 << 4) +#define MSI_ADDR_IR_SHV (1 << 3) +#define MSI_ADDR_IR_INDEX1(index) ((index & 0x8000) >> 13) +#define MSI_ADDR_IR_INDEX2(index) ((index & 0x7fff) << 5) +#endif /* _ASM_X86_MSIDEF_H */ diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h new file mode 100755 index 0000000..38fd286 --- /dev/null +++ b/arch/x86/include/asm/msr-index.h @@ -0,0 +1,698 @@ +/* + * Copyright 2019 Google LLC + */ + +#ifndef _ASM_X86_MSR_INDEX_H +#define _ASM_X86_MSR_INDEX_H + +/* + * CPU model specific register (MSR) numbers. + * + * Do not add new entries to this file unless the definitions are shared + * between multiple compilation units. + */ + +/* x86-64 specific MSRs */ +#define MSR_EFER 0xc0000080 /* extended feature register */ +#define MSR_STAR 0xc0000081 /* legacy mode SYSCALL target */ +#define MSR_LSTAR 0xc0000082 /* long mode SYSCALL target */ +#define MSR_CSTAR 0xc0000083 /* compat mode SYSCALL target */ +#define MSR_SYSCALL_MASK 0xc0000084 /* EFLAGS mask for syscall */ +#define MSR_FS_BASE 0xc0000100 /* 64bit FS base */ +#define MSR_GS_BASE 0xc0000101 /* 64bit GS base */ +#define MSR_KERNEL_GS_BASE 0xc0000102 /* SwapGS GS shadow */ +#define MSR_TSC_AUX 0xc0000103 /* Auxiliary TSC */ + +/* EFER bits: */ +#define _EFER_SCE 0 /* SYSCALL/SYSRET */ +#define _EFER_LME 8 /* Long mode enable */ +#define _EFER_LMA 10 /* Long mode active (read-only) */ +#define _EFER_NX 11 /* No execute enable */ +#define _EFER_SVME 12 /* Enable virtualization */ +#define _EFER_LMSLE 13 /* Long Mode Segment Limit Enable */ +#define _EFER_FFXSR 14 /* Enable Fast FXSAVE/FXRSTOR */ + +#define EFER_SCE (1<<_EFER_SCE) +#define EFER_LME (1<<_EFER_LME) +#define EFER_LMA (1<<_EFER_LMA) +#define EFER_NX (1<<_EFER_NX) +#define EFER_SVME (1<<_EFER_SVME) +#define EFER_LMSLE (1<<_EFER_LMSLE) +#define EFER_FFXSR (1<<_EFER_FFXSR) + +/* Intel MSRs. 
Some also available on other CPUs */ +#define MSR_IA32_PERFCTR0 0x000000c1 +#define MSR_IA32_PERFCTR1 0x000000c2 +#define MSR_FSB_FREQ 0x000000cd +#define MSR_PLATFORM_INFO 0x000000ce + +#define MSR_NHM_SNB_PKG_CST_CFG_CTL 0x000000e2 +#define NHM_C3_AUTO_DEMOTE (1ULL << 25) +#define NHM_C1_AUTO_DEMOTE (1ULL << 26) +#define ATM_LNC_C6_AUTO_DEMOTE (1ULL << 25) +#define SNB_C1_AUTO_UNDEMOTE (1ULL << 27) +#define SNB_C3_AUTO_UNDEMOTE (1ULL << 28) + +#define MSR_MTRRcap 0x000000fe +#define MSR_IA32_BBL_CR_CTL 0x00000119 +#define MSR_IA32_BBL_CR_CTL3 0x0000011e + +#define MSR_IA32_SYSENTER_CS 0x00000174 +#define MSR_IA32_SYSENTER_ESP 0x00000175 +#define MSR_IA32_SYSENTER_EIP 0x00000176 + +#define MSR_IA32_MCG_CAP 0x00000179 +#define MSR_IA32_MCG_STATUS 0x0000017a +#define MSR_IA32_MCG_CTL 0x0000017b +#define MSR_IA32_MCG_EXT_CTL 0x000004d0 + +#define MSR_OFFCORE_RSP_0 0x000001a6 +#define MSR_OFFCORE_RSP_1 0x000001a7 +#define MSR_TURBO_RATIO_LIMIT 0x000001ad +#define MSR_TURBO_RATIO_LIMIT1 0x000001ae +#define MSR_TURBO_RATIO_LIMIT2 0x000001af + +#define MSR_LBR_SELECT 0x000001c8 +#define MSR_LBR_TOS 0x000001c9 +#define MSR_LBR_NHM_FROM 0x00000680 +#define MSR_LBR_NHM_TO 0x000006c0 +#define MSR_LBR_CORE_FROM 0x00000040 +#define MSR_LBR_CORE_TO 0x00000060 + +#define MSR_LBR_INFO_0 0x00000dc0 /* ... 0xddf for _31 */ +#define LBR_INFO_MISPRED BIT_ULL(63) +#define LBR_INFO_IN_TX BIT_ULL(62) +#define LBR_INFO_ABORT BIT_ULL(61) +#define LBR_INFO_CYCLES 0xffff + +#define MSR_IA32_PEBS_ENABLE 0x000003f1 +#define MSR_IA32_DS_AREA 0x00000600 +#define MSR_IA32_PERF_CAPABILITIES 0x00000345 +#define MSR_PEBS_LD_LAT_THRESHOLD 0x000003f6 + +#define MSR_IA32_RTIT_CTL 0x00000570 +#define MSR_IA32_RTIT_STATUS 0x00000571 +#define MSR_IA32_RTIT_ADDR0_A 0x00000580 +#define MSR_IA32_RTIT_ADDR0_B 0x00000581 +#define MSR_IA32_RTIT_ADDR1_A 0x00000582 +#define MSR_IA32_RTIT_ADDR1_B 0x00000583 +#define MSR_IA32_RTIT_ADDR2_A 0x00000584 +#define MSR_IA32_RTIT_ADDR2_B 0x00000585 +#define MSR_IA32_RTIT_ADDR3_A 0x00000586 +#define MSR_IA32_RTIT_ADDR3_B 0x00000587 +#define MSR_IA32_RTIT_CR3_MATCH 0x00000572 +#define MSR_IA32_RTIT_OUTPUT_BASE 0x00000560 +#define MSR_IA32_RTIT_OUTPUT_MASK 0x00000561 + +#define MSR_MTRRfix64K_00000 0x00000250 +#define MSR_MTRRfix16K_80000 0x00000258 +#define MSR_MTRRfix16K_A0000 0x00000259 +#define MSR_MTRRfix4K_C0000 0x00000268 +#define MSR_MTRRfix4K_C8000 0x00000269 +#define MSR_MTRRfix4K_D0000 0x0000026a +#define MSR_MTRRfix4K_D8000 0x0000026b +#define MSR_MTRRfix4K_E0000 0x0000026c +#define MSR_MTRRfix4K_E8000 0x0000026d +#define MSR_MTRRfix4K_F0000 0x0000026e +#define MSR_MTRRfix4K_F8000 0x0000026f +#define MSR_MTRRdefType 0x000002ff + +#define MSR_IA32_CR_PAT 0x00000277 + +#define MSR_IA32_DEBUGCTLMSR 0x000001d9 +#define MSR_IA32_LASTBRANCHFROMIP 0x000001db +#define MSR_IA32_LASTBRANCHTOIP 0x000001dc +#define MSR_IA32_LASTINTFROMIP 0x000001dd +#define MSR_IA32_LASTINTTOIP 0x000001de + +/* DEBUGCTLMSR bits (others vary by model): */ +#define DEBUGCTLMSR_LBR (1ULL << 0) /* last branch recording */ +#define DEBUGCTLMSR_BTF (1ULL << 1) /* single-step on branches */ +#define DEBUGCTLMSR_TR (1ULL << 6) +#define DEBUGCTLMSR_BTS (1ULL << 7) +#define DEBUGCTLMSR_BTINT (1ULL << 8) +#define DEBUGCTLMSR_BTS_OFF_OS (1ULL << 9) +#define DEBUGCTLMSR_BTS_OFF_USR (1ULL << 10) +#define DEBUGCTLMSR_FREEZE_LBRS_ON_PMI (1ULL << 11) + +#define MSR_PEBS_FRONTEND 0x000003f7 + +#define MSR_IA32_POWER_CTL 0x000001fc + +#define MSR_IA32_MC0_CTL 0x00000400 +#define MSR_IA32_MC0_STATUS 0x00000401 +#define 
MSR_IA32_MC0_ADDR 0x00000402 +#define MSR_IA32_MC0_MISC 0x00000403 + +/* C-state Residency Counters */ +#define MSR_PKG_C3_RESIDENCY 0x000003f8 +#define MSR_PKG_C6_RESIDENCY 0x000003f9 +#define MSR_PKG_C7_RESIDENCY 0x000003fa +#define MSR_CORE_C3_RESIDENCY 0x000003fc +#define MSR_CORE_C6_RESIDENCY 0x000003fd +#define MSR_CORE_C7_RESIDENCY 0x000003fe +#define MSR_KNL_CORE_C6_RESIDENCY 0x000003ff +#define MSR_PKG_C2_RESIDENCY 0x0000060d +#define MSR_PKG_C8_RESIDENCY 0x00000630 +#define MSR_PKG_C9_RESIDENCY 0x00000631 +#define MSR_PKG_C10_RESIDENCY 0x00000632 + +/* Interrupt Response Limit */ +#define MSR_PKGC3_IRTL 0x0000060a +#define MSR_PKGC6_IRTL 0x0000060b +#define MSR_PKGC7_IRTL 0x0000060c +#define MSR_PKGC8_IRTL 0x00000633 +#define MSR_PKGC9_IRTL 0x00000634 +#define MSR_PKGC10_IRTL 0x00000635 + +/* Run Time Average Power Limiting (RAPL) Interface */ + +#define MSR_RAPL_POWER_UNIT 0x00000606 + +#define MSR_PKG_POWER_LIMIT 0x00000610 +#define MSR_PKG_ENERGY_STATUS 0x00000611 +#define MSR_PKG_PERF_STATUS 0x00000613 +#define MSR_PKG_POWER_INFO 0x00000614 + +#define MSR_DRAM_POWER_LIMIT 0x00000618 +#define MSR_DRAM_ENERGY_STATUS 0x00000619 +#define MSR_DRAM_PERF_STATUS 0x0000061b +#define MSR_DRAM_POWER_INFO 0x0000061c + +#define MSR_PP0_POWER_LIMIT 0x00000638 +#define MSR_PP0_ENERGY_STATUS 0x00000639 +#define MSR_PP0_POLICY 0x0000063a +#define MSR_PP0_PERF_STATUS 0x0000063b + +#define MSR_PP1_POWER_LIMIT 0x00000640 +#define MSR_PP1_ENERGY_STATUS 0x00000641 +#define MSR_PP1_POLICY 0x00000642 + +/* Config TDP MSRs */ +#define MSR_CONFIG_TDP_NOMINAL 0x00000648 +#define MSR_CONFIG_TDP_LEVEL_1 0x00000649 +#define MSR_CONFIG_TDP_LEVEL_2 0x0000064A +#define MSR_CONFIG_TDP_CONTROL 0x0000064B +#define MSR_TURBO_ACTIVATION_RATIO 0x0000064C + +#define MSR_PLATFORM_ENERGY_STATUS 0x0000064D + +#define MSR_PKG_WEIGHTED_CORE_C0_RES 0x00000658 +#define MSR_PKG_ANY_CORE_C0_RES 0x00000659 +#define MSR_PKG_ANY_GFXE_C0_RES 0x0000065A +#define MSR_PKG_BOTH_CORE_GFXE_C0_RES 0x0000065B + +#define MSR_CORE_C1_RES 0x00000660 + +#define MSR_CC6_DEMOTION_POLICY_CONFIG 0x00000668 +#define MSR_MC6_DEMOTION_POLICY_CONFIG 0x00000669 + +#define MSR_CORE_PERF_LIMIT_REASONS 0x00000690 +#define MSR_GFX_PERF_LIMIT_REASONS 0x000006B0 +#define MSR_RING_PERF_LIMIT_REASONS 0x000006B1 + +/* Hardware P state interface */ +#define MSR_PPERF 0x0000064e +#define MSR_PERF_LIMIT_REASONS 0x0000064f +#define MSR_PM_ENABLE 0x00000770 +#define MSR_HWP_CAPABILITIES 0x00000771 +#define MSR_HWP_REQUEST_PKG 0x00000772 +#define MSR_HWP_INTERRUPT 0x00000773 +#define MSR_HWP_REQUEST 0x00000774 +#define MSR_HWP_STATUS 0x00000777 + +/* CPUID.6.EAX */ +#define HWP_BASE_BIT (1<<7) +#define HWP_NOTIFICATIONS_BIT (1<<8) +#define HWP_ACTIVITY_WINDOW_BIT (1<<9) +#define HWP_ENERGY_PERF_PREFERENCE_BIT (1<<10) +#define HWP_PACKAGE_LEVEL_REQUEST_BIT (1<<11) + +/* IA32_HWP_CAPABILITIES */ +#define HWP_HIGHEST_PERF(x) (((x) >> 0) & 0xff) +#define HWP_GUARANTEED_PERF(x) (((x) >> 8) & 0xff) +#define HWP_MOSTEFFICIENT_PERF(x) (((x) >> 16) & 0xff) +#define HWP_LOWEST_PERF(x) (((x) >> 24) & 0xff) + +/* IA32_HWP_REQUEST */ +#define HWP_MIN_PERF(x) (x & 0xff) +#define HWP_MAX_PERF(x) ((x & 0xff) << 8) +#define HWP_DESIRED_PERF(x) ((x & 0xff) << 16) +#define HWP_ENERGY_PERF_PREFERENCE(x) ((x & 0xff) << 24) +#define HWP_ACTIVITY_WINDOW(x) ((x & 0xff3) << 32) +#define HWP_PACKAGE_CONTROL(x) ((x & 0x1) << 42) + +/* IA32_HWP_STATUS */ +#define HWP_GUARANTEED_CHANGE(x) (x & 0x1) +#define HWP_EXCURSION_TO_MINIMUM(x) (x & 0x4) + +/* IA32_HWP_INTERRUPT */ +#define 
HWP_CHANGE_TO_GUARANTEED_INT(x) (x & 0x1) +#define HWP_EXCURSION_TO_MINIMUM_INT(x) (x & 0x2) + +#define MSR_AMD64_MC0_MASK 0xc0010044 + +#define MSR_IA32_MCx_CTL(x) (MSR_IA32_MC0_CTL + 4*(x)) +#define MSR_IA32_MCx_STATUS(x) (MSR_IA32_MC0_STATUS + 4*(x)) +#define MSR_IA32_MCx_ADDR(x) (MSR_IA32_MC0_ADDR + 4*(x)) +#define MSR_IA32_MCx_MISC(x) (MSR_IA32_MC0_MISC + 4*(x)) + +#define MSR_AMD64_MCx_MASK(x) (MSR_AMD64_MC0_MASK + (x)) + +/* These are consecutive and not in the normal 4er MCE bank block */ +#define MSR_IA32_MC0_CTL2 0x00000280 +#define MSR_IA32_MCx_CTL2(x) (MSR_IA32_MC0_CTL2 + (x)) + +#define MSR_P6_PERFCTR0 0x000000c1 +#define MSR_P6_PERFCTR1 0x000000c2 +#define MSR_P6_EVNTSEL0 0x00000186 +#define MSR_P6_EVNTSEL1 0x00000187 + +#define MSR_KNC_PERFCTR0 0x00000020 +#define MSR_KNC_PERFCTR1 0x00000021 +#define MSR_KNC_EVNTSEL0 0x00000028 +#define MSR_KNC_EVNTSEL1 0x00000029 + +/* Alternative perfctr range with full access. */ +#define MSR_IA32_PMC0 0x000004c1 + +/* AMD64 MSRs. Not complete. See the architecture manual for a more + complete list. */ + +#define MSR_AMD64_PATCH_LEVEL 0x0000008b +#define MSR_AMD64_TSC_RATIO 0xc0000104 +#define MSR_AMD64_NB_CFG 0xc001001f +#define MSR_AMD64_PATCH_LOADER 0xc0010020 +#define MSR_AMD64_OSVW_ID_LENGTH 0xc0010140 +#define MSR_AMD64_OSVW_STATUS 0xc0010141 +#define MSR_AMD64_LS_CFG 0xc0011020 +#define MSR_AMD64_DC_CFG 0xc0011022 +#define MSR_AMD64_BU_CFG2 0xc001102a +#define MSR_AMD64_IBSFETCHCTL 0xc0011030 +#define MSR_AMD64_IBSFETCHLINAD 0xc0011031 +#define MSR_AMD64_IBSFETCHPHYSAD 0xc0011032 +#define MSR_AMD64_IBSFETCH_REG_COUNT 3 +#define MSR_AMD64_IBSFETCH_REG_MASK ((1ULL<<MSR_AMD64_IBSFETCH_REG_COUNT)-1) +#define MSR_AMD64_IBSOPCTL 0xc0011033 +#define MSR_AMD64_IBSOPRIP 0xc0011034 +#define MSR_AMD64_IBSOPDATA 0xc0011035 +#define MSR_AMD64_IBSOPDATA2 0xc0011036 +#define MSR_AMD64_IBSOPDATA3 0xc0011037 +#define MSR_AMD64_IBSDCLINAD 0xc0011038 +#define MSR_AMD64_IBSDCPHYSAD 0xc0011039 +#define MSR_AMD64_IBSOP_REG_COUNT 7 +#define MSR_AMD64_IBSOP_REG_MASK ((1ULL<<MSR_AMD64_IBSOP_REG_COUNT)-1) +#define MSR_AMD64_IBSCTL 0xc001103a +#define MSR_AMD64_IBSBRTARGET 0xc001103b +#define MSR_AMD64_IBSOPDATA4 0xc001103d +#define MSR_AMD64_IBS_REG_COUNT_MAX 8 /* includes MSR_AMD64_IBSBRTARGET */ + +/* Fam 17h MSRs */ +#define MSR_F17H_IRPERF 0xc00000e9 + +/* Fam 16h MSRs */ +#define MSR_F16H_L2I_PERF_CTL 0xc0010230 +#define MSR_F16H_L2I_PERF_CTR 0xc0010231 +#define MSR_F16H_DR1_ADDR_MASK 0xc0011019 +#define MSR_F16H_DR2_ADDR_MASK 0xc001101a +#define MSR_F16H_DR3_ADDR_MASK 0xc001101b +#define MSR_F16H_DR0_ADDR_MASK 0xc0011027 + +/* Fam 15h MSRs */ +#define MSR_F15H_PERF_CTL 0xc0010200 +#define MSR_F15H_PERF_CTR 0xc0010201 +#define MSR_F15H_NB_PERF_CTL 0xc0010240 +#define MSR_F15H_NB_PERF_CTR 0xc0010241 +#define MSR_F15H_PTSC 0xc0010280 +#define MSR_F15H_IC_CFG 0xc0011021 + +/* Fam 10h MSRs */ +#define MSR_FAM10H_MMIO_CONF_BASE 0xc0010058 +#define FAM10H_MMIO_CONF_ENABLE (1<<0) +#define FAM10H_MMIO_CONF_BUSRANGE_MASK 0xf +#define FAM10H_MMIO_CONF_BUSRANGE_SHIFT 2 +#define FAM10H_MMIO_CONF_BASE_MASK 0xfffffffULL +#define FAM10H_MMIO_CONF_BASE_SHIFT 20 +#define MSR_FAM10H_NODE_ID 0xc001100c + +/* K8 MSRs */ +#define MSR_K8_TOP_MEM1 0xc001001a +#define MSR_K8_TOP_MEM2 0xc001001d +#define MSR_K8_SYSCFG 0xc0010010 +#define MSR_K8_INT_PENDING_MSG 0xc0010055 +/* C1E active bits in int pending message */ +#define K8_INTP_C1E_ACTIVE_MASK 0x18000000 +#define MSR_K8_TSEG_ADDR 0xc0010112 +#define MSR_K8_TSEG_MASK 0xc0010113 +#define K8_MTRRFIXRANGE_DRAM_ENABLE 
0x00040000 /* MtrrFixDramEn bit */ +#define K8_MTRRFIXRANGE_DRAM_MODIFY 0x00080000 /* MtrrFixDramModEn bit */ +#define K8_MTRR_RDMEM_WRMEM_MASK 0x18181818 /* Mask: RdMem|WrMem */ + +/* K7 MSRs */ +#define MSR_K7_EVNTSEL0 0xc0010000 +#define MSR_K7_PERFCTR0 0xc0010004 +#define MSR_K7_EVNTSEL1 0xc0010001 +#define MSR_K7_PERFCTR1 0xc0010005 +#define MSR_K7_EVNTSEL2 0xc0010002 +#define MSR_K7_PERFCTR2 0xc0010006 +#define MSR_K7_EVNTSEL3 0xc0010003 +#define MSR_K7_PERFCTR3 0xc0010007 +#define MSR_K7_CLK_CTL 0xc001001b +#define MSR_K7_HWCR 0xc0010015 +#define MSR_K7_FID_VID_CTL 0xc0010041 +#define MSR_K7_FID_VID_STATUS 0xc0010042 + +/* K6 MSRs */ +#define MSR_K6_WHCR 0xc0000082 +#define MSR_K6_UWCCR 0xc0000085 +#define MSR_K6_EPMR 0xc0000086 +#define MSR_K6_PSOR 0xc0000087 +#define MSR_K6_PFIR 0xc0000088 + +/* Centaur-Hauls/IDT defined MSRs. */ +#define MSR_IDT_FCR1 0x00000107 +#define MSR_IDT_FCR2 0x00000108 +#define MSR_IDT_FCR3 0x00000109 +#define MSR_IDT_FCR4 0x0000010a + +#define MSR_IDT_MCR0 0x00000110 +#define MSR_IDT_MCR1 0x00000111 +#define MSR_IDT_MCR2 0x00000112 +#define MSR_IDT_MCR3 0x00000113 +#define MSR_IDT_MCR4 0x00000114 +#define MSR_IDT_MCR5 0x00000115 +#define MSR_IDT_MCR6 0x00000116 +#define MSR_IDT_MCR7 0x00000117 +#define MSR_IDT_MCR_CTRL 0x00000120 + +/* VIA Cyrix defined MSRs*/ +#define MSR_VIA_FCR 0x00001107 +#define MSR_VIA_LONGHAUL 0x0000110a +#define MSR_VIA_RNG 0x0000110b +#define MSR_VIA_BCR2 0x00001147 + +/* Transmeta defined MSRs */ +#define MSR_TMTA_LONGRUN_CTRL 0x80868010 +#define MSR_TMTA_LONGRUN_FLAGS 0x80868011 +#define MSR_TMTA_LRTI_READOUT 0x80868018 +#define MSR_TMTA_LRTI_VOLT_MHZ 0x8086801a + +/* Intel defined MSRs. */ +#define MSR_IA32_P5_MC_ADDR 0x00000000 +#define MSR_IA32_P5_MC_TYPE 0x00000001 +#define MSR_IA32_TSC 0x00000010 +#define MSR_IA32_PLATFORM_ID 0x00000017 +#define MSR_IA32_EBL_CR_POWERON 0x0000002a +#define MSR_EBC_FREQUENCY_ID 0x0000002c +#define MSR_SMI_COUNT 0x00000034 +#define MSR_IA32_FEATURE_CONTROL 0x0000003a +#define MSR_IA32_TSC_ADJUST 0x0000003b +#define MSR_IA32_BNDCFGS 0x00000d90 + +#define MSR_IA32_XSS 0x00000da0 + +#define FEATURE_CONTROL_LOCKED (1<<0) +#define FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX (1<<1) +#define FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX (1<<2) +#define FEATURE_CONTROL_LMCE (1<<20) + +#define MSR_IA32_APICBASE 0x0000001b +#define MSR_IA32_APICBASE_BSP (1<<8) +#define MSR_IA32_APICBASE_ENABLE (1<<11) +#define MSR_IA32_APICBASE_BASE (0xfffff<<12) + +#define MSR_IA32_TSCDEADLINE 0x000006e0 + +#define MSR_IA32_UCODE_WRITE 0x00000079 +#define MSR_IA32_UCODE_REV 0x0000008b + +#define MSR_IA32_SMM_MONITOR_CTL 0x0000009b +#define MSR_IA32_SMBASE 0x0000009e + +#define MSR_IA32_PERF_STATUS 0x00000198 +#define MSR_IA32_PERF_CTL 0x00000199 +#define INTEL_PERF_CTL_MASK 0xffff +#define MSR_AMD_PSTATE_DEF_BASE 0xc0010064 +#define MSR_AMD_PERF_STATUS 0xc0010063 +#define MSR_AMD_PERF_CTL 0xc0010062 + +#define MSR_IA32_MPERF 0x000000e7 +#define MSR_IA32_APERF 0x000000e8 + +#define MSR_IA32_THERM_CONTROL 0x0000019a +#define MSR_IA32_THERM_INTERRUPT 0x0000019b + +#define THERM_INT_HIGH_ENABLE (1 << 0) +#define THERM_INT_LOW_ENABLE (1 << 1) +#define THERM_INT_PLN_ENABLE (1 << 24) + +#define MSR_IA32_THERM_STATUS 0x0000019c + +#define THERM_STATUS_PROCHOT (1 << 0) +#define THERM_STATUS_POWER_LIMIT (1 << 10) + +#define MSR_THERM2_CTL 0x0000019d + +#define MSR_THERM2_CTL_TM_SELECT (1ULL << 16) + +#define MSR_IA32_MISC_ENABLE 0x000001a0 + +#define MSR_IA32_TEMPERATURE_TARGET 0x000001a2 + +#define MSR_MISC_PWR_MGMT 0x000001aa + 
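The FEATURE_CONTROL_* masks defined above are normally consumed as a pre-VMXON gate: if firmware has locked IA32_FEATURE_CONTROL with the outside-SMX enable bit clear, VMXON will fault, otherwise the host driver can proceed (setting and locking the bits itself if the MSR is still unlocked). A minimal sketch of that check, assuming a rdmsr-style helper exists elsewhere in the tree; the function name below is illustrative and not part of this commit:

/*
 * Illustrative sketch only: given a raw IA32_FEATURE_CONTROL value
 * (read via an assumed rdmsr-style helper), report whether VMXON is
 * blocked, using the masks defined above.
 */
static inline int vmxon_blocked_by_feature_control(u64 fc)
{
	/* Locked by firmware with the outside-SMX enable bit clear: VMXON would #GP. */
	return (fc & FEATURE_CONTROL_LOCKED) &&
	       !(fc & FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX);
}

If the MSR is not yet locked, host-side code would typically write FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX together with FEATURE_CONTROL_LOCKED before attempting VMXON.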
+#define MSR_IA32_ENERGY_PERF_BIAS 0x000001b0 +#define ENERGY_PERF_BIAS_PERFORMANCE 0 +#define ENERGY_PERF_BIAS_NORMAL 6 +#define ENERGY_PERF_BIAS_POWERSAVE 15 + +#define MSR_IA32_PACKAGE_THERM_STATUS 0x000001b1 + +#define PACKAGE_THERM_STATUS_PROCHOT (1 << 0) +#define PACKAGE_THERM_STATUS_POWER_LIMIT (1 << 10) + +#define MSR_IA32_PACKAGE_THERM_INTERRUPT 0x000001b2 + +#define PACKAGE_THERM_INT_HIGH_ENABLE (1 << 0) +#define PACKAGE_THERM_INT_LOW_ENABLE (1 << 1) +#define PACKAGE_THERM_INT_PLN_ENABLE (1 << 24) + +/* Thermal Thresholds Support */ +#define THERM_INT_THRESHOLD0_ENABLE (1 << 15) +#define THERM_SHIFT_THRESHOLD0 8 +#define THERM_MASK_THRESHOLD0 (0x7f << THERM_SHIFT_THRESHOLD0) +#define THERM_INT_THRESHOLD1_ENABLE (1 << 23) +#define THERM_SHIFT_THRESHOLD1 16 +#define THERM_MASK_THRESHOLD1 (0x7f << THERM_SHIFT_THRESHOLD1) +#define THERM_STATUS_THRESHOLD0 (1 << 6) +#define THERM_LOG_THRESHOLD0 (1 << 7) +#define THERM_STATUS_THRESHOLD1 (1 << 8) +#define THERM_LOG_THRESHOLD1 (1 << 9) + +/* MISC_ENABLE bits: architectural */ +#define MSR_IA32_MISC_ENABLE_FAST_STRING_BIT 0 +#define MSR_IA32_MISC_ENABLE_FAST_STRING (1ULL << MSR_IA32_MISC_ENABLE_FAST_STRING_BIT) +#define MSR_IA32_MISC_ENABLE_TCC_BIT 1 +#define MSR_IA32_MISC_ENABLE_TCC (1ULL << MSR_IA32_MISC_ENABLE_TCC_BIT) +#define MSR_IA32_MISC_ENABLE_EMON_BIT 7 +#define MSR_IA32_MISC_ENABLE_EMON (1ULL << MSR_IA32_MISC_ENABLE_EMON_BIT) +#define MSR_IA32_MISC_ENABLE_BTS_UNAVAIL_BIT 11 +#define MSR_IA32_MISC_ENABLE_BTS_UNAVAIL (1ULL << MSR_IA32_MISC_ENABLE_BTS_UNAVAIL_BIT) +#define MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL_BIT 12 +#define MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL (1ULL << MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL_BIT) +#define MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP_BIT 16 +#define MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP (1ULL << MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP_BIT) +#define MSR_IA32_MISC_ENABLE_MWAIT_BIT 18 +#define MSR_IA32_MISC_ENABLE_MWAIT (1ULL << MSR_IA32_MISC_ENABLE_MWAIT_BIT) +#define MSR_IA32_MISC_ENABLE_LIMIT_CPUID_BIT 22 +#define MSR_IA32_MISC_ENABLE_LIMIT_CPUID (1ULL << MSR_IA32_MISC_ENABLE_LIMIT_CPUID_BIT) +#define MSR_IA32_MISC_ENABLE_XTPR_DISABLE_BIT 23 +#define MSR_IA32_MISC_ENABLE_XTPR_DISABLE (1ULL << MSR_IA32_MISC_ENABLE_XTPR_DISABLE_BIT) +#define MSR_IA32_MISC_ENABLE_XD_DISABLE_BIT 34 +#define MSR_IA32_MISC_ENABLE_XD_DISABLE (1ULL << MSR_IA32_MISC_ENABLE_XD_DISABLE_BIT) + +/* MISC_ENABLE bits: model-specific, meaning may vary from core to core */ +#define MSR_IA32_MISC_ENABLE_X87_COMPAT_BIT 2 +#define MSR_IA32_MISC_ENABLE_X87_COMPAT (1ULL << MSR_IA32_MISC_ENABLE_X87_COMPAT_BIT) +#define MSR_IA32_MISC_ENABLE_TM1_BIT 3 +#define MSR_IA32_MISC_ENABLE_TM1 (1ULL << MSR_IA32_MISC_ENABLE_TM1_BIT) +#define MSR_IA32_MISC_ENABLE_SPLIT_LOCK_DISABLE_BIT 4 +#define MSR_IA32_MISC_ENABLE_SPLIT_LOCK_DISABLE (1ULL << MSR_IA32_MISC_ENABLE_SPLIT_LOCK_DISABLE_BIT) +#define MSR_IA32_MISC_ENABLE_L3CACHE_DISABLE_BIT 6 +#define MSR_IA32_MISC_ENABLE_L3CACHE_DISABLE (1ULL << MSR_IA32_MISC_ENABLE_L3CACHE_DISABLE_BIT) +#define MSR_IA32_MISC_ENABLE_SUPPRESS_LOCK_BIT 8 +#define MSR_IA32_MISC_ENABLE_SUPPRESS_LOCK (1ULL << MSR_IA32_MISC_ENABLE_SUPPRESS_LOCK_BIT) +#define MSR_IA32_MISC_ENABLE_PREFETCH_DISABLE_BIT 9 +#define MSR_IA32_MISC_ENABLE_PREFETCH_DISABLE (1ULL << MSR_IA32_MISC_ENABLE_PREFETCH_DISABLE_BIT) +#define MSR_IA32_MISC_ENABLE_FERR_BIT 10 +#define MSR_IA32_MISC_ENABLE_FERR (1ULL << MSR_IA32_MISC_ENABLE_FERR_BIT) +#define MSR_IA32_MISC_ENABLE_FERR_MULTIPLEX_BIT 10 +#define MSR_IA32_MISC_ENABLE_FERR_MULTIPLEX (1ULL << 
MSR_IA32_MISC_ENABLE_FERR_MULTIPLEX_BIT) +#define MSR_IA32_MISC_ENABLE_TM2_BIT 13 +#define MSR_IA32_MISC_ENABLE_TM2 (1ULL << MSR_IA32_MISC_ENABLE_TM2_BIT) +#define MSR_IA32_MISC_ENABLE_ADJ_PREF_DISABLE_BIT 19 +#define MSR_IA32_MISC_ENABLE_ADJ_PREF_DISABLE (1ULL << MSR_IA32_MISC_ENABLE_ADJ_PREF_DISABLE_BIT) +#define MSR_IA32_MISC_ENABLE_SPEEDSTEP_LOCK_BIT 20 +#define MSR_IA32_MISC_ENABLE_SPEEDSTEP_LOCK (1ULL << MSR_IA32_MISC_ENABLE_SPEEDSTEP_LOCK_BIT) +#define MSR_IA32_MISC_ENABLE_L1D_CONTEXT_BIT 24 +#define MSR_IA32_MISC_ENABLE_L1D_CONTEXT (1ULL << MSR_IA32_MISC_ENABLE_L1D_CONTEXT_BIT) +#define MSR_IA32_MISC_ENABLE_DCU_PREF_DISABLE_BIT 37 +#define MSR_IA32_MISC_ENABLE_DCU_PREF_DISABLE (1ULL << MSR_IA32_MISC_ENABLE_DCU_PREF_DISABLE_BIT) +#define MSR_IA32_MISC_ENABLE_TURBO_DISABLE_BIT 38 +#define MSR_IA32_MISC_ENABLE_TURBO_DISABLE (1ULL << MSR_IA32_MISC_ENABLE_TURBO_DISABLE_BIT) +#define MSR_IA32_MISC_ENABLE_IP_PREF_DISABLE_BIT 39 +#define MSR_IA32_MISC_ENABLE_IP_PREF_DISABLE (1ULL << MSR_IA32_MISC_ENABLE_IP_PREF_DISABLE_BIT) + +#define MSR_IA32_TSC_DEADLINE 0x000006E0 + +/* P4/Xeon+ specific */ +#define MSR_IA32_MCG_EAX 0x00000180 +#define MSR_IA32_MCG_EBX 0x00000181 +#define MSR_IA32_MCG_ECX 0x00000182 +#define MSR_IA32_MCG_EDX 0x00000183 +#define MSR_IA32_MCG_ESI 0x00000184 +#define MSR_IA32_MCG_EDI 0x00000185 +#define MSR_IA32_MCG_EBP 0x00000186 +#define MSR_IA32_MCG_ESP 0x00000187 +#define MSR_IA32_MCG_EFLAGS 0x00000188 +#define MSR_IA32_MCG_EIP 0x00000189 +#define MSR_IA32_MCG_RESERVED 0x0000018a + +/* Pentium IV performance counter MSRs */ +#define MSR_P4_BPU_PERFCTR0 0x00000300 +#define MSR_P4_BPU_PERFCTR1 0x00000301 +#define MSR_P4_BPU_PERFCTR2 0x00000302 +#define MSR_P4_BPU_PERFCTR3 0x00000303 +#define MSR_P4_MS_PERFCTR0 0x00000304 +#define MSR_P4_MS_PERFCTR1 0x00000305 +#define MSR_P4_MS_PERFCTR2 0x00000306 +#define MSR_P4_MS_PERFCTR3 0x00000307 +#define MSR_P4_FLAME_PERFCTR0 0x00000308 +#define MSR_P4_FLAME_PERFCTR1 0x00000309 +#define MSR_P4_FLAME_PERFCTR2 0x0000030a +#define MSR_P4_FLAME_PERFCTR3 0x0000030b +#define MSR_P4_IQ_PERFCTR0 0x0000030c +#define MSR_P4_IQ_PERFCTR1 0x0000030d +#define MSR_P4_IQ_PERFCTR2 0x0000030e +#define MSR_P4_IQ_PERFCTR3 0x0000030f +#define MSR_P4_IQ_PERFCTR4 0x00000310 +#define MSR_P4_IQ_PERFCTR5 0x00000311 +#define MSR_P4_BPU_CCCR0 0x00000360 +#define MSR_P4_BPU_CCCR1 0x00000361 +#define MSR_P4_BPU_CCCR2 0x00000362 +#define MSR_P4_BPU_CCCR3 0x00000363 +#define MSR_P4_MS_CCCR0 0x00000364 +#define MSR_P4_MS_CCCR1 0x00000365 +#define MSR_P4_MS_CCCR2 0x00000366 +#define MSR_P4_MS_CCCR3 0x00000367 +#define MSR_P4_FLAME_CCCR0 0x00000368 +#define MSR_P4_FLAME_CCCR1 0x00000369 +#define MSR_P4_FLAME_CCCR2 0x0000036a +#define MSR_P4_FLAME_CCCR3 0x0000036b +#define MSR_P4_IQ_CCCR0 0x0000036c +#define MSR_P4_IQ_CCCR1 0x0000036d +#define MSR_P4_IQ_CCCR2 0x0000036e +#define MSR_P4_IQ_CCCR3 0x0000036f +#define MSR_P4_IQ_CCCR4 0x00000370 +#define MSR_P4_IQ_CCCR5 0x00000371 +#define MSR_P4_ALF_ESCR0 0x000003ca +#define MSR_P4_ALF_ESCR1 0x000003cb +#define MSR_P4_BPU_ESCR0 0x000003b2 +#define MSR_P4_BPU_ESCR1 0x000003b3 +#define MSR_P4_BSU_ESCR0 0x000003a0 +#define MSR_P4_BSU_ESCR1 0x000003a1 +#define MSR_P4_CRU_ESCR0 0x000003b8 +#define MSR_P4_CRU_ESCR1 0x000003b9 +#define MSR_P4_CRU_ESCR2 0x000003cc +#define MSR_P4_CRU_ESCR3 0x000003cd +#define MSR_P4_CRU_ESCR4 0x000003e0 +#define MSR_P4_CRU_ESCR5 0x000003e1 +#define MSR_P4_DAC_ESCR0 0x000003a8 +#define MSR_P4_DAC_ESCR1 0x000003a9 +#define MSR_P4_FIRM_ESCR0 0x000003a4 +#define MSR_P4_FIRM_ESCR1 0x000003a5 
+#define MSR_P4_FLAME_ESCR0 0x000003a6 +#define MSR_P4_FLAME_ESCR1 0x000003a7 +#define MSR_P4_FSB_ESCR0 0x000003a2 +#define MSR_P4_FSB_ESCR1 0x000003a3 +#define MSR_P4_IQ_ESCR0 0x000003ba +#define MSR_P4_IQ_ESCR1 0x000003bb +#define MSR_P4_IS_ESCR0 0x000003b4 +#define MSR_P4_IS_ESCR1 0x000003b5 +#define MSR_P4_ITLB_ESCR0 0x000003b6 +#define MSR_P4_ITLB_ESCR1 0x000003b7 +#define MSR_P4_IX_ESCR0 0x000003c8 +#define MSR_P4_IX_ESCR1 0x000003c9 +#define MSR_P4_MOB_ESCR0 0x000003aa +#define MSR_P4_MOB_ESCR1 0x000003ab +#define MSR_P4_MS_ESCR0 0x000003c0 +#define MSR_P4_MS_ESCR1 0x000003c1 +#define MSR_P4_PMH_ESCR0 0x000003ac +#define MSR_P4_PMH_ESCR1 0x000003ad +#define MSR_P4_RAT_ESCR0 0x000003bc +#define MSR_P4_RAT_ESCR1 0x000003bd +#define MSR_P4_SAAT_ESCR0 0x000003ae +#define MSR_P4_SAAT_ESCR1 0x000003af +#define MSR_P4_SSU_ESCR0 0x000003be +#define MSR_P4_SSU_ESCR1 0x000003bf /* guess: not in manual */ + +#define MSR_P4_TBPU_ESCR0 0x000003c2 +#define MSR_P4_TBPU_ESCR1 0x000003c3 +#define MSR_P4_TC_ESCR0 0x000003c4 +#define MSR_P4_TC_ESCR1 0x000003c5 +#define MSR_P4_U2L_ESCR0 0x000003b0 +#define MSR_P4_U2L_ESCR1 0x000003b1 + +#define MSR_P4_PEBS_MATRIX_VERT 0x000003f2 + +/* Intel Core-based CPU performance counters */ +#define MSR_CORE_PERF_FIXED_CTR0 0x00000309 +#define MSR_CORE_PERF_FIXED_CTR1 0x0000030a +#define MSR_CORE_PERF_FIXED_CTR2 0x0000030b +#define MSR_CORE_PERF_FIXED_CTR_CTRL 0x0000038d +#define MSR_CORE_PERF_GLOBAL_STATUS 0x0000038e +#define MSR_CORE_PERF_GLOBAL_CTRL 0x0000038f +#define MSR_CORE_PERF_GLOBAL_OVF_CTRL 0x00000390 + +/* Geode defined MSRs */ +#define MSR_GEODE_BUSCONT_CONF0 0x00001900 + +/* Intel VT MSRs */ +#define MSR_IA32_VMX_BASIC 0x00000480 +#define MSR_IA32_VMX_PINBASED_CTLS 0x00000481 +#define MSR_IA32_VMX_PROCBASED_CTLS 0x00000482 +#define MSR_IA32_VMX_EXIT_CTLS 0x00000483 +#define MSR_IA32_VMX_ENTRY_CTLS 0x00000484 +#define MSR_IA32_VMX_MISC 0x00000485 +#define MSR_IA32_VMX_CR0_FIXED0 0x00000486 +#define MSR_IA32_VMX_CR0_FIXED1 0x00000487 +#define MSR_IA32_VMX_CR4_FIXED0 0x00000488 +#define MSR_IA32_VMX_CR4_FIXED1 0x00000489 +#define MSR_IA32_VMX_VMCS_ENUM 0x0000048a +#define MSR_IA32_VMX_PROCBASED_CTLS2 0x0000048b +#define MSR_IA32_VMX_EPT_VPID_CAP 0x0000048c +#define MSR_IA32_VMX_TRUE_PINBASED_CTLS 0x0000048d +#define MSR_IA32_VMX_TRUE_PROCBASED_CTLS 0x0000048e +#define MSR_IA32_VMX_TRUE_EXIT_CTLS 0x0000048f +#define MSR_IA32_VMX_TRUE_ENTRY_CTLS 0x00000490 +#define MSR_IA32_VMX_VMFUNC 0x00000491 + +/* VMX_BASIC bits and bitmasks */ +#define VMX_BASIC_VMCS_SIZE_SHIFT 32 +#define VMX_BASIC_TRUE_CTLS (1ULL << 55) +#define VMX_BASIC_64 0x0001000000000000LLU +#define VMX_BASIC_MEM_TYPE_SHIFT 50 +#define VMX_BASIC_MEM_TYPE_MASK 0x003c000000000000LLU +#define VMX_BASIC_MEM_TYPE_WB 6LLU +#define VMX_BASIC_INOUT 0x0040000000000000LLU + +/* MSR_IA32_VMX_MISC bits */ +#define MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS (1ULL << 29) +#define MSR_IA32_VMX_MISC_PREEMPTION_TIMER_SCALE 0x1F +/* AMD-V MSRs */ + +#define MSR_VM_CR 0xc0010114 +#define MSR_VM_IGNNE 0xc0010115 +#define MSR_VM_HSAVE_PA 0xc0010117 + +#endif /* _ASM_X86_MSR_INDEX_H */ diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h new file mode 100755 index 0000000..29241ee --- /dev/null +++ b/arch/x86/include/asm/svm.h @@ -0,0 +1,306 @@ +/* + * Copyright 2019 Google LLC + */ + +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __SVM_H +#define __SVM_H + +#include <uapi/asm/svm.h> + + +enum { + INTERCEPT_INTR, + INTERCEPT_NMI, + INTERCEPT_SMI, + INTERCEPT_INIT, + INTERCEPT_VINTR, + 
INTERCEPT_SELECTIVE_CR0, + INTERCEPT_STORE_IDTR, + INTERCEPT_STORE_GDTR, + INTERCEPT_STORE_LDTR, + INTERCEPT_STORE_TR, + INTERCEPT_LOAD_IDTR, + INTERCEPT_LOAD_GDTR, + INTERCEPT_LOAD_LDTR, + INTERCEPT_LOAD_TR, + INTERCEPT_RDTSC, + INTERCEPT_RDPMC, + INTERCEPT_PUSHF, + INTERCEPT_POPF, + INTERCEPT_CPUID, + INTERCEPT_RSM, + INTERCEPT_IRET, + INTERCEPT_INTn, + INTERCEPT_INVD, + INTERCEPT_PAUSE, + INTERCEPT_HLT, + INTERCEPT_INVLPG, + INTERCEPT_INVLPGA, + INTERCEPT_IOIO_PROT, + INTERCEPT_MSR_PROT, + INTERCEPT_TASK_SWITCH, + INTERCEPT_FERR_FREEZE, + INTERCEPT_SHUTDOWN, + INTERCEPT_VMRUN, + INTERCEPT_VMMCALL, + INTERCEPT_VMLOAD, + INTERCEPT_VMSAVE, + INTERCEPT_STGI, + INTERCEPT_CLGI, + INTERCEPT_SKINIT, + INTERCEPT_RDTSCP, + INTERCEPT_ICEBP, + INTERCEPT_WBINVD, + INTERCEPT_MONITOR, + INTERCEPT_MWAIT, + INTERCEPT_MWAIT_COND, + INTERCEPT_XSETBV, +}; + +#pragma pack(push, 1) +struct vmcb_control_area { + u32 intercept_cr; + u32 intercept_dr; + u32 intercept_exceptions; + u64 intercept; + u8 reserved_1[40]; + u16 pause_filter_thresh; + u16 pause_filter_count; + u64 iopm_base_pa; + u64 msrpm_base_pa; + u64 tsc_offset; + u32 asid; + u8 tlb_ctl; + u8 reserved_2[3]; + u32 int_ctl; + u32 int_vector; + u32 int_state; + u8 reserved_3[4]; + u32 exit_code; + u32 exit_code_hi; + u64 exit_info_1; + u64 exit_info_2; + u32 exit_int_info; + u32 exit_int_info_err; + u64 nested_ctl; + u64 avic_vapic_bar; + u8 reserved_4[8]; + u32 event_inj; + u32 event_inj_err; + u64 nested_cr3; + u64 lbr_ctl; + u32 clean; + u32 reserved_5; + u64 next_rip; + u8 insn_len; + u8 insn_bytes[15]; + u64 avic_backing_page; /* Offset 0xe0 */ + u8 reserved_6[8]; /* Offset 0xe8 */ + u64 avic_logical_id; /* Offset 0xf0 */ + u64 avic_physical_id; /* Offset 0xf8 */ + u8 reserved_7[768]; +}; +#pragma pack(pop) + +#define TLB_CONTROL_DO_NOTHING 0 +#define TLB_CONTROL_FLUSH_ALL_ASID 1 +#define TLB_CONTROL_FLUSH_ASID 3 +#define TLB_CONTROL_FLUSH_ASID_LOCAL 7 + +#define V_TPR_MASK 0x0f + +#define V_IRQ_SHIFT 8 +#define V_IRQ_MASK (1 << V_IRQ_SHIFT) + +#define V_GIF_SHIFT 9 +#define V_GIF_MASK (1 << V_GIF_SHIFT) + +#define V_INTR_PRIO_SHIFT 16 +#define V_INTR_PRIO_MASK (0x0f << V_INTR_PRIO_SHIFT) + +#define V_IGN_TPR_SHIFT 20 +#define V_IGN_TPR_MASK (1 << V_IGN_TPR_SHIFT) + +#define V_INTR_MASKING_SHIFT 24 +#define V_INTR_MASKING_MASK (1 << V_INTR_MASKING_SHIFT) + +#define V_GIF_ENABLE_SHIFT 25 +#define V_GIF_ENABLE_MASK (1 << V_GIF_ENABLE_SHIFT) + +#define AVIC_ENABLE_SHIFT 31 +#define AVIC_ENABLE_MASK (1 << AVIC_ENABLE_SHIFT) + +#define LBR_CTL_ENABLE_MASK BIT_ULL(0) +#define VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK BIT_ULL(1) + +#define SVM_INTERRUPT_SHADOW_MASK 1 + +#define SVM_IOIO_STR_SHIFT 2 +#define SVM_IOIO_REP_SHIFT 3 +#define SVM_IOIO_SIZE_SHIFT 4 +#define SVM_IOIO_ASIZE_SHIFT 7 + +#define SVM_IOIO_TYPE_MASK 1 +#define SVM_IOIO_STR_MASK (1 << SVM_IOIO_STR_SHIFT) +#define SVM_IOIO_REP_MASK (1 << SVM_IOIO_REP_SHIFT) +#define SVM_IOIO_SIZE_MASK (7 << SVM_IOIO_SIZE_SHIFT) +#define SVM_IOIO_ASIZE_MASK (7 << SVM_IOIO_ASIZE_SHIFT) + +#define SVM_VM_CR_VALID_MASK 0x001fULL +#define SVM_VM_CR_SVM_LOCK_MASK 0x0008ULL +#define SVM_VM_CR_SVM_DIS_MASK 0x0010ULL + +#define SVM_NESTED_CTL_NP_ENABLE BIT(0) +#define SVM_NESTED_CTL_SEV_ENABLE BIT(1) + +#pragma pack(push, 1) +struct vmcb_seg { + u16 selector; + u16 attrib; + u32 limit; + u64 base; +}; + +struct vmcb_save_area { + struct vmcb_seg es; + struct vmcb_seg cs; + struct vmcb_seg ss; + struct vmcb_seg ds; + struct vmcb_seg fs; + struct vmcb_seg gs; + struct vmcb_seg gdtr; + struct vmcb_seg ldtr; + struct 
vmcb_seg idtr; + struct vmcb_seg tr; + u8 reserved_1[43]; + u8 cpl; + u8 reserved_2[4]; + u64 efer; + u8 reserved_3[112]; + u64 cr4; + u64 cr3; + u64 cr0; + u64 dr7; + u64 dr6; + u64 rflags; + u64 rip; + u8 reserved_4[88]; + u64 rsp; + u8 reserved_5[24]; + u64 rax; + u64 star; + u64 lstar; + u64 cstar; + u64 sfmask; + u64 kernel_gs_base; + u64 sysenter_cs; + u64 sysenter_esp; + u64 sysenter_eip; + u64 cr2; + u8 reserved_6[32]; + u64 g_pat; + u64 dbgctl; + u64 br_from; + u64 br_to; + u64 last_excp_from; + u64 last_excp_to; +}; + +struct vmcb { + struct vmcb_control_area control; + struct vmcb_save_area save; +}; +#pragma pack(pop) + +#define SVM_CPUID_FUNC 0x8000000a + +#define SVM_VM_CR_SVM_DISABLE 4 + +#define SVM_SELECTOR_S_SHIFT 4 +#define SVM_SELECTOR_DPL_SHIFT 5 +#define SVM_SELECTOR_P_SHIFT 7 +#define SVM_SELECTOR_AVL_SHIFT 8 +#define SVM_SELECTOR_L_SHIFT 9 +#define SVM_SELECTOR_DB_SHIFT 10 +#define SVM_SELECTOR_G_SHIFT 11 + +#define SVM_SELECTOR_TYPE_MASK (0xf) +#define SVM_SELECTOR_S_MASK (1 << SVM_SELECTOR_S_SHIFT) +#define SVM_SELECTOR_DPL_MASK (3 << SVM_SELECTOR_DPL_SHIFT) +#define SVM_SELECTOR_P_MASK (1 << SVM_SELECTOR_P_SHIFT) +#define SVM_SELECTOR_AVL_MASK (1 << SVM_SELECTOR_AVL_SHIFT) +#define SVM_SELECTOR_L_MASK (1 << SVM_SELECTOR_L_SHIFT) +#define SVM_SELECTOR_DB_MASK (1 << SVM_SELECTOR_DB_SHIFT) +#define SVM_SELECTOR_G_MASK (1 << SVM_SELECTOR_G_SHIFT) + +#define SVM_SELECTOR_WRITE_MASK (1 << 1) +#define SVM_SELECTOR_READ_MASK SVM_SELECTOR_WRITE_MASK +#define SVM_SELECTOR_CODE_MASK (1 << 3) + +#define INTERCEPT_CR0_READ 0 +#define INTERCEPT_CR3_READ 3 +#define INTERCEPT_CR4_READ 4 +#define INTERCEPT_CR8_READ 8 +#define INTERCEPT_CR0_WRITE (16 + 0) +#define INTERCEPT_CR3_WRITE (16 + 3) +#define INTERCEPT_CR4_WRITE (16 + 4) +#define INTERCEPT_CR8_WRITE (16 + 8) + +#define INTERCEPT_DR0_READ 0 +#define INTERCEPT_DR1_READ 1 +#define INTERCEPT_DR2_READ 2 +#define INTERCEPT_DR3_READ 3 +#define INTERCEPT_DR4_READ 4 +#define INTERCEPT_DR5_READ 5 +#define INTERCEPT_DR6_READ 6 +#define INTERCEPT_DR7_READ 7 +#define INTERCEPT_DR0_WRITE (16 + 0) +#define INTERCEPT_DR1_WRITE (16 + 1) +#define INTERCEPT_DR2_WRITE (16 + 2) +#define INTERCEPT_DR3_WRITE (16 + 3) +#define INTERCEPT_DR4_WRITE (16 + 4) +#define INTERCEPT_DR5_WRITE (16 + 5) +#define INTERCEPT_DR6_WRITE (16 + 6) +#define INTERCEPT_DR7_WRITE (16 + 7) + +#define SVM_EVTINJ_VEC_MASK 0xff + +#define SVM_EVTINJ_TYPE_SHIFT 8 +#define SVM_EVTINJ_TYPE_MASK (7 << SVM_EVTINJ_TYPE_SHIFT) + +#define SVM_EVTINJ_TYPE_INTR (0 << SVM_EVTINJ_TYPE_SHIFT) +#define SVM_EVTINJ_TYPE_NMI (2 << SVM_EVTINJ_TYPE_SHIFT) +#define SVM_EVTINJ_TYPE_EXEPT (3 << SVM_EVTINJ_TYPE_SHIFT) +#define SVM_EVTINJ_TYPE_SOFT (4 << SVM_EVTINJ_TYPE_SHIFT) + +#define SVM_EVTINJ_VALID (1 << 31) +#define SVM_EVTINJ_VALID_ERR (1 << 11) + +#define SVM_EXITINTINFO_VEC_MASK SVM_EVTINJ_VEC_MASK +#define SVM_EXITINTINFO_TYPE_MASK SVM_EVTINJ_TYPE_MASK + +#define SVM_EXITINTINFO_TYPE_INTR SVM_EVTINJ_TYPE_INTR +#define SVM_EXITINTINFO_TYPE_NMI SVM_EVTINJ_TYPE_NMI +#define SVM_EXITINTINFO_TYPE_EXEPT SVM_EVTINJ_TYPE_EXEPT +#define SVM_EXITINTINFO_TYPE_SOFT SVM_EVTINJ_TYPE_SOFT + +#define SVM_EXITINTINFO_VALID SVM_EVTINJ_VALID +#define SVM_EXITINTINFO_VALID_ERR SVM_EVTINJ_VALID_ERR + +#define SVM_EXITINFOSHIFT_TS_REASON_IRET 36 +#define SVM_EXITINFOSHIFT_TS_REASON_JMP 38 +#define SVM_EXITINFOSHIFT_TS_HAS_ERROR_CODE 44 + +#define SVM_EXITINFO_REG_MASK 0x0F + +#define SVM_CR0_SELECTIVE_MASK (X86_CR0_TS | X86_CR0_MP) + +#define SVM_VMLOAD ".byte 0x0f, 0x01, 0xda" +#define SVM_VMRUN ".byte 
0x0f, 0x01, 0xd8" +#define SVM_VMSAVE ".byte 0x0f, 0x01, 0xdb" +#define SVM_CLGI ".byte 0x0f, 0x01, 0xdd" +#define SVM_STGI ".byte 0x0f, 0x01, 0xdc" +#define SVM_INVLPGA ".byte 0x0f, 0x01, 0xdf" + +#endif diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h new file mode 100755 index 0000000..dec5bf4 --- /dev/null +++ b/arch/x86/include/asm/vmx.h @@ -0,0 +1,495 @@ +/* + * vmx.h: VMX Architecture related definitions + * Copyright (c) 2004, Intel Corporation. + * Copyright 2019 Google LLC + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 Temple + * Place - Suite 330, Boston, MA 02111-1307 USA. + * + * A few random additions are: + * Copyright (C) 2006 Qumranet + * Avi Kivity <avi@qumranet.com> + * Yaniv Kamay <yaniv@qumranet.com> + * + */ +#ifndef VMX_H +#define VMX_H + + +#include <gvm_types.h> +#include <uapi/asm/vmx.h> + +/* + * Definitions of Primary Processor-Based VM-Execution Controls. + */ +#define CPU_BASED_VIRTUAL_INTR_PENDING 0x00000004 +#define CPU_BASED_USE_TSC_OFFSETING 0x00000008 +#define CPU_BASED_HLT_EXITING 0x00000080 +#define CPU_BASED_INVLPG_EXITING 0x00000200 +#define CPU_BASED_MWAIT_EXITING 0x00000400 +#define CPU_BASED_RDPMC_EXITING 0x00000800 +#define CPU_BASED_RDTSC_EXITING 0x00001000 +#define CPU_BASED_CR3_LOAD_EXITING 0x00008000 +#define CPU_BASED_CR3_STORE_EXITING 0x00010000 +#define CPU_BASED_CR8_LOAD_EXITING 0x00080000 +#define CPU_BASED_CR8_STORE_EXITING 0x00100000 +#define CPU_BASED_TPR_SHADOW 0x00200000 +#define CPU_BASED_VIRTUAL_NMI_PENDING 0x00400000 +#define CPU_BASED_MOV_DR_EXITING 0x00800000 +#define CPU_BASED_UNCOND_IO_EXITING 0x01000000 +#define CPU_BASED_USE_IO_BITMAPS 0x02000000 +#define CPU_BASED_MONITOR_TRAP_FLAG 0x08000000 +#define CPU_BASED_USE_MSR_BITMAPS 0x10000000 +#define CPU_BASED_MONITOR_EXITING 0x20000000 +#define CPU_BASED_PAUSE_EXITING 0x40000000 +#define CPU_BASED_ACTIVATE_SECONDARY_CONTROLS 0x80000000 + +#define CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR 0x0401e172 + +/* + * Definitions of Secondary Processor-Based VM-Execution Controls. 
+ */ +#define SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES 0x00000001 +#define SECONDARY_EXEC_ENABLE_EPT 0x00000002 +#define SECONDARY_EXEC_RDTSCP 0x00000008 +#define SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE 0x00000010 +#define SECONDARY_EXEC_ENABLE_VPID 0x00000020 +#define SECONDARY_EXEC_WBINVD_EXITING 0x00000040 +#define SECONDARY_EXEC_UNRESTRICTED_GUEST 0x00000080 +#define SECONDARY_EXEC_APIC_REGISTER_VIRT 0x00000100 +#define SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY 0x00000200 +#define SECONDARY_EXEC_PAUSE_LOOP_EXITING 0x00000400 +#define SECONDARY_EXEC_ENABLE_INVPCID 0x00001000 +#define SECONDARY_EXEC_SHADOW_VMCS 0x00004000 +#define SECONDARY_EXEC_ENABLE_PML 0x00020000 +#define SECONDARY_EXEC_XSAVES 0x00100000 + +#define PIN_BASED_EXT_INTR_MASK 0x00000001 +#define PIN_BASED_NMI_EXITING 0x00000008 +#define PIN_BASED_VIRTUAL_NMIS 0x00000020 + +#define PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR 0x00000016 + +#define VM_EXIT_SAVE_DEBUG_CONTROLS 0x00000004 +#define VM_EXIT_HOST_ADDR_SPACE_SIZE 0x00000200 +#define VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL 0x00001000 +#define VM_EXIT_ACK_INTR_ON_EXIT 0x00008000 +#define VM_EXIT_SAVE_IA32_PAT 0x00040000 +#define VM_EXIT_LOAD_IA32_PAT 0x00080000 +#define VM_EXIT_SAVE_IA32_EFER 0x00100000 +#define VM_EXIT_LOAD_IA32_EFER 0x00200000 +#define VM_EXIT_SAVE_VMX_PREEMPTION_TIMER 0x00400000 +#define VM_EXIT_CLEAR_BNDCFGS 0x00800000 + +#define VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR 0x00036dff + +#define VM_ENTRY_LOAD_DEBUG_CONTROLS 0x00000004 +#define VM_ENTRY_IA32E_MODE 0x00000200 +#define VM_ENTRY_SMM 0x00000400 +#define VM_ENTRY_DEACT_DUAL_MONITOR 0x00000800 +#define VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL 0x00002000 +#define VM_ENTRY_LOAD_IA32_PAT 0x00004000 +#define VM_ENTRY_LOAD_IA32_EFER 0x00008000 +#define VM_ENTRY_LOAD_BNDCFGS 0x00010000 + +#define VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR 0x000011ff + +#define VMX_MISC_PREEMPTION_TIMER_RATE_MASK 0x0000001f +#define VMX_MISC_SAVE_EFER_LMA 0x00000020 +#define VMX_MISC_ACTIVITY_HLT 0x00000040 + +/* VMCS Encodings */ +enum vmcs_field { + VIRTUAL_PROCESSOR_ID = 0x00000000, + POSTED_INTR_NV = 0x00000002, + GUEST_ES_SELECTOR = 0x00000800, + GUEST_CS_SELECTOR = 0x00000802, + GUEST_SS_SELECTOR = 0x00000804, + GUEST_DS_SELECTOR = 0x00000806, + GUEST_FS_SELECTOR = 0x00000808, + GUEST_GS_SELECTOR = 0x0000080a, + GUEST_LDTR_SELECTOR = 0x0000080c, + GUEST_TR_SELECTOR = 0x0000080e, + GUEST_INTR_STATUS = 0x00000810, + GUEST_PML_INDEX = 0x00000812, + HOST_ES_SELECTOR = 0x00000c00, + HOST_CS_SELECTOR = 0x00000c02, + HOST_SS_SELECTOR = 0x00000c04, + HOST_DS_SELECTOR = 0x00000c06, + HOST_FS_SELECTOR = 0x00000c08, + HOST_GS_SELECTOR = 0x00000c0a, + HOST_TR_SELECTOR = 0x00000c0c, + IO_BITMAP_A = 0x00002000, + IO_BITMAP_A_HIGH = 0x00002001, + IO_BITMAP_B = 0x00002002, + IO_BITMAP_B_HIGH = 0x00002003, + MSR_BITMAP = 0x00002004, + MSR_BITMAP_HIGH = 0x00002005, + VM_EXIT_MSR_STORE_ADDR = 0x00002006, + VM_EXIT_MSR_STORE_ADDR_HIGH = 0x00002007, + VM_EXIT_MSR_LOAD_ADDR = 0x00002008, + VM_EXIT_MSR_LOAD_ADDR_HIGH = 0x00002009, + VM_ENTRY_MSR_LOAD_ADDR = 0x0000200a, + VM_ENTRY_MSR_LOAD_ADDR_HIGH = 0x0000200b, + PML_ADDRESS = 0x0000200e, + PML_ADDRESS_HIGH = 0x0000200f, + TSC_OFFSET = 0x00002010, + TSC_OFFSET_HIGH = 0x00002011, + VIRTUAL_APIC_PAGE_ADDR = 0x00002012, + VIRTUAL_APIC_PAGE_ADDR_HIGH = 0x00002013, + APIC_ACCESS_ADDR = 0x00002014, + APIC_ACCESS_ADDR_HIGH = 0x00002015, + POSTED_INTR_DESC_ADDR = 0x00002016, + POSTED_INTR_DESC_ADDR_HIGH = 0x00002017, + EPT_POINTER = 0x0000201a, + EPT_POINTER_HIGH = 0x0000201b, + EOI_EXIT_BITMAP0 = 0x0000201c, + 
EOI_EXIT_BITMAP0_HIGH = 0x0000201d, + EOI_EXIT_BITMAP1 = 0x0000201e, + EOI_EXIT_BITMAP1_HIGH = 0x0000201f, + EOI_EXIT_BITMAP2 = 0x00002020, + EOI_EXIT_BITMAP2_HIGH = 0x00002021, + EOI_EXIT_BITMAP3 = 0x00002022, + EOI_EXIT_BITMAP3_HIGH = 0x00002023, + VMREAD_BITMAP = 0x00002026, + VMWRITE_BITMAP = 0x00002028, + XSS_EXIT_BITMAP = 0x0000202C, + XSS_EXIT_BITMAP_HIGH = 0x0000202D, + GUEST_PHYSICAL_ADDRESS = 0x00002400, + GUEST_PHYSICAL_ADDRESS_HIGH = 0x00002401, + VMCS_LINK_POINTER = 0x00002800, + VMCS_LINK_POINTER_HIGH = 0x00002801, + GUEST_IA32_DEBUGCTL = 0x00002802, + GUEST_IA32_DEBUGCTL_HIGH = 0x00002803, + GUEST_IA32_PAT = 0x00002804, + GUEST_IA32_PAT_HIGH = 0x00002805, + GUEST_IA32_EFER = 0x00002806, + GUEST_IA32_EFER_HIGH = 0x00002807, + GUEST_IA32_PERF_GLOBAL_CTRL = 0x00002808, + GUEST_IA32_PERF_GLOBAL_CTRL_HIGH= 0x00002809, + GUEST_PDPTR0 = 0x0000280a, + GUEST_PDPTR0_HIGH = 0x0000280b, + GUEST_PDPTR1 = 0x0000280c, + GUEST_PDPTR1_HIGH = 0x0000280d, + GUEST_PDPTR2 = 0x0000280e, + GUEST_PDPTR2_HIGH = 0x0000280f, + GUEST_PDPTR3 = 0x00002810, + GUEST_PDPTR3_HIGH = 0x00002811, + GUEST_BNDCFGS = 0x00002812, + GUEST_BNDCFGS_HIGH = 0x00002813, + HOST_IA32_PAT = 0x00002c00, + HOST_IA32_PAT_HIGH = 0x00002c01, + HOST_IA32_EFER = 0x00002c02, + HOST_IA32_EFER_HIGH = 0x00002c03, + HOST_IA32_PERF_GLOBAL_CTRL = 0x00002c04, + HOST_IA32_PERF_GLOBAL_CTRL_HIGH = 0x00002c05, + PIN_BASED_VM_EXEC_CONTROL = 0x00004000, + CPU_BASED_VM_EXEC_CONTROL = 0x00004002, + EXCEPTION_BITMAP = 0x00004004, + PAGE_FAULT_ERROR_CODE_MASK = 0x00004006, + PAGE_FAULT_ERROR_CODE_MATCH = 0x00004008, + CR3_TARGET_COUNT = 0x0000400a, + VM_EXIT_CONTROLS = 0x0000400c, + VM_EXIT_MSR_STORE_COUNT = 0x0000400e, + VM_EXIT_MSR_LOAD_COUNT = 0x00004010, + VM_ENTRY_CONTROLS = 0x00004012, + VM_ENTRY_MSR_LOAD_COUNT = 0x00004014, + VM_ENTRY_INTR_INFO_FIELD = 0x00004016, + VM_ENTRY_EXCEPTION_ERROR_CODE = 0x00004018, + VM_ENTRY_INSTRUCTION_LEN = 0x0000401a, + TPR_THRESHOLD = 0x0000401c, + SECONDARY_VM_EXEC_CONTROL = 0x0000401e, + PLE_GAP = 0x00004020, + PLE_WINDOW = 0x00004022, + VM_INSTRUCTION_ERROR = 0x00004400, + VM_EXIT_REASON = 0x00004402, + VM_EXIT_INTR_INFO = 0x00004404, + VM_EXIT_INTR_ERROR_CODE = 0x00004406, + IDT_VECTORING_INFO_FIELD = 0x00004408, + IDT_VECTORING_ERROR_CODE = 0x0000440a, + VM_EXIT_INSTRUCTION_LEN = 0x0000440c, + VMX_INSTRUCTION_INFO = 0x0000440e, + GUEST_ES_LIMIT = 0x00004800, + GUEST_CS_LIMIT = 0x00004802, + GUEST_SS_LIMIT = 0x00004804, + GUEST_DS_LIMIT = 0x00004806, + GUEST_FS_LIMIT = 0x00004808, + GUEST_GS_LIMIT = 0x0000480a, + GUEST_LDTR_LIMIT = 0x0000480c, + GUEST_TR_LIMIT = 0x0000480e, + GUEST_GDTR_LIMIT = 0x00004810, + GUEST_IDTR_LIMIT = 0x00004812, + GUEST_ES_AR_BYTES = 0x00004814, + GUEST_CS_AR_BYTES = 0x00004816, + GUEST_SS_AR_BYTES = 0x00004818, + GUEST_DS_AR_BYTES = 0x0000481a, + GUEST_FS_AR_BYTES = 0x0000481c, + GUEST_GS_AR_BYTES = 0x0000481e, + GUEST_LDTR_AR_BYTES = 0x00004820, + GUEST_TR_AR_BYTES = 0x00004822, + GUEST_INTERRUPTIBILITY_INFO = 0x00004824, + GUEST_ACTIVITY_STATE = 0X00004826, + GUEST_SYSENTER_CS = 0x0000482A, + VMX_PREEMPTION_TIMER_VALUE = 0x0000482E, + HOST_IA32_SYSENTER_CS = 0x00004c00, + CR0_GUEST_HOST_MASK = 0x00006000, + CR4_GUEST_HOST_MASK = 0x00006002, + CR0_READ_SHADOW = 0x00006004, + CR4_READ_SHADOW = 0x00006006, + CR3_TARGET_VALUE0 = 0x00006008, + CR3_TARGET_VALUE1 = 0x0000600a, + CR3_TARGET_VALUE2 = 0x0000600c, + CR3_TARGET_VALUE3 = 0x0000600e, + EXIT_QUALIFICATION = 0x00006400, + GUEST_LINEAR_ADDRESS = 0x0000640a, + GUEST_CR0 = 0x00006800, + GUEST_CR3 = 0x00006802, + GUEST_CR4 = 
0x00006804, + GUEST_ES_BASE = 0x00006806, + GUEST_CS_BASE = 0x00006808, + GUEST_SS_BASE = 0x0000680a, + GUEST_DS_BASE = 0x0000680c, + GUEST_FS_BASE = 0x0000680e, + GUEST_GS_BASE = 0x00006810, + GUEST_LDTR_BASE = 0x00006812, + GUEST_TR_BASE = 0x00006814, + GUEST_GDTR_BASE = 0x00006816, + GUEST_IDTR_BASE = 0x00006818, + GUEST_DR7 = 0x0000681a, + GUEST_RSP = 0x0000681c, + GUEST_RIP = 0x0000681e, + GUEST_RFLAGS = 0x00006820, + GUEST_PENDING_DBG_EXCEPTIONS = 0x00006822, + GUEST_SYSENTER_ESP = 0x00006824, + GUEST_SYSENTER_EIP = 0x00006826, + HOST_CR0 = 0x00006c00, + HOST_CR3 = 0x00006c02, + HOST_CR4 = 0x00006c04, + HOST_FS_BASE = 0x00006c06, + HOST_GS_BASE = 0x00006c08, + HOST_TR_BASE = 0x00006c0a, + HOST_GDTR_BASE = 0x00006c0c, + HOST_IDTR_BASE = 0x00006c0e, + HOST_IA32_SYSENTER_ESP = 0x00006c10, + HOST_IA32_SYSENTER_EIP = 0x00006c12, + HOST_RSP = 0x00006c14, + HOST_RIP = 0x00006c16, +}; + +/* + * Interruption-information format + */ +#define INTR_INFO_VECTOR_MASK 0xff /* 7:0 */ +#define INTR_INFO_INTR_TYPE_MASK 0x700 /* 10:8 */ +#define INTR_INFO_DELIVER_CODE_MASK 0x800 /* 11 */ +#define INTR_INFO_UNBLOCK_NMI 0x1000 /* 12 */ +#define INTR_INFO_VALID_MASK 0x80000000 /* 31 */ +#define INTR_INFO_RESVD_BITS_MASK 0x7ffff000 + +#define VECTORING_INFO_VECTOR_MASK INTR_INFO_VECTOR_MASK +#define VECTORING_INFO_TYPE_MASK INTR_INFO_INTR_TYPE_MASK +#define VECTORING_INFO_DELIVER_CODE_MASK INTR_INFO_DELIVER_CODE_MASK +#define VECTORING_INFO_VALID_MASK INTR_INFO_VALID_MASK + +#define INTR_TYPE_EXT_INTR (0 << 8) /* external interrupt */ +#define INTR_TYPE_NMI_INTR (2 << 8) /* NMI */ +#define INTR_TYPE_HARD_EXCEPTION (3 << 8) /* processor exception */ +#define INTR_TYPE_SOFT_INTR (4 << 8) /* software interrupt */ +#define INTR_TYPE_SOFT_EXCEPTION (6 << 8) /* software exception */ + +/* GUEST_INTERRUPTIBILITY_INFO flags. */ +#define GUEST_INTR_STATE_STI 0x00000001 +#define GUEST_INTR_STATE_MOV_SS 0x00000002 +#define GUEST_INTR_STATE_SMI 0x00000004 +#define GUEST_INTR_STATE_NMI 0x00000008 + +/* GUEST_ACTIVITY_STATE flags */ +#define GUEST_ACTIVITY_ACTIVE 0 +#define GUEST_ACTIVITY_HLT 1 +#define GUEST_ACTIVITY_SHUTDOWN 2 +#define GUEST_ACTIVITY_WAIT_SIPI 3 + +/* + * Exit Qualifications for MOV for Control Register Access + */ +#define CONTROL_REG_ACCESS_NUM 0x7 /* 2:0, number of control reg.*/ +#define CONTROL_REG_ACCESS_TYPE 0x30 /* 5:4, access type */ +#define CONTROL_REG_ACCESS_REG 0xf00 /* 10:8, general purpose reg. */ +#define LMSW_SOURCE_DATA_SHIFT 16 +#define LMSW_SOURCE_DATA (0xFFFF << LMSW_SOURCE_DATA_SHIFT) /* 16:31 lmsw source */ +#define REG_EAX (0 << 8) +#define REG_ECX (1 << 8) +#define REG_EDX (2 << 8) +#define REG_EBX (3 << 8) +#define REG_ESP (4 << 8) +#define REG_EBP (5 << 8) +#define REG_ESI (6 << 8) +#define REG_EDI (7 << 8) +#define REG_R8 (8 << 8) +#define REG_R9 (9 << 8) +#define REG_R10 (10 << 8) +#define REG_R11 (11 << 8) +#define REG_R12 (12 << 8) +#define REG_R13 (13 << 8) +#define REG_R14 (14 << 8) +#define REG_R15 (15 << 8) + +/* + * Exit Qualifications for MOV for Debug Register Access + */ +#define DEBUG_REG_ACCESS_NUM 0x7 /* 2:0, number of debug reg. */ +#define DEBUG_REG_ACCESS_TYPE 0x10 /* 4, direction of access */ +#define TYPE_MOV_TO_DR (0 << 4) +#define TYPE_MOV_FROM_DR (1 << 4) +#define DEBUG_REG_ACCESS_REG(eq) (((eq) >> 8) & 0xf) /* 11:8, general purpose reg. 
*/ + + +/* + * Exit Qualifications for APIC-Access + */ +#define APIC_ACCESS_OFFSET 0xfff /* 11:0, offset within the APIC page */ +#define APIC_ACCESS_TYPE 0xf000 /* 15:12, access type */ +#define TYPE_LINEAR_APIC_INST_READ (0 << 12) +#define TYPE_LINEAR_APIC_INST_WRITE (1 << 12) +#define TYPE_LINEAR_APIC_INST_FETCH (2 << 12) +#define TYPE_LINEAR_APIC_EVENT (3 << 12) +#define TYPE_PHYSICAL_APIC_EVENT (10 << 12) +#define TYPE_PHYSICAL_APIC_INST (15 << 12) + +/* segment AR in VMCS -- these are different from what LAR reports */ +#define VMX_SEGMENT_AR_L_MASK (1 << 13) + +#define VMX_AR_TYPE_ACCESSES_MASK 1 +#define VMX_AR_TYPE_READABLE_MASK (1 << 1) +#define VMX_AR_TYPE_WRITEABLE_MASK (1 << 2) +#define VMX_AR_TYPE_CODE_MASK (1 << 3) +#define VMX_AR_TYPE_MASK 0x0f +#define VMX_AR_TYPE_BUSY_64_TSS 11 +#define VMX_AR_TYPE_BUSY_32_TSS 11 +#define VMX_AR_TYPE_BUSY_16_TSS 3 +#define VMX_AR_TYPE_LDT 2 + +#define VMX_AR_UNUSABLE_MASK (1 << 16) +#define VMX_AR_S_MASK (1 << 4) +#define VMX_AR_P_MASK (1 << 7) +#define VMX_AR_L_MASK (1 << 13) +#define VMX_AR_DB_MASK (1 << 14) +#define VMX_AR_G_MASK (1 << 15) +#define VMX_AR_DPL_SHIFT 5 +#define VMX_AR_DPL(ar) (((ar) >> VMX_AR_DPL_SHIFT) & 3) + +#define VMX_AR_RESERVD_MASK 0xfffe0f00 + +#define TSS_PRIVATE_MEMSLOT (GVM_USER_MEM_SLOTS + 0) +#define APIC_ACCESS_PAGE_PRIVATE_MEMSLOT (GVM_USER_MEM_SLOTS + 1) +#define IDENTITY_PAGETABLE_PRIVATE_MEMSLOT (GVM_USER_MEM_SLOTS + 2) + +#define VMX_NR_VPIDS (1 << 16) +#define VMX_VPID_EXTENT_SINGLE_CONTEXT 1 +#define VMX_VPID_EXTENT_ALL_CONTEXT 2 + +#define VMX_EPT_EXTENT_INDIVIDUAL_ADDR 0 +#define VMX_EPT_EXTENT_CONTEXT 1 +#define VMX_EPT_EXTENT_GLOBAL 2 +#define VMX_EPT_EXTENT_SHIFT 24 + +#define VMX_EPT_EXECUTE_ONLY_BIT (1ull) +#define VMX_EPT_PAGE_WALK_4_BIT (1ull << 6) +#define VMX_EPTP_UC_BIT (1ull << 8) +#define VMX_EPTP_WB_BIT (1ull << 14) +#define VMX_EPT_2MB_PAGE_BIT (1ull << 16) +#define VMX_EPT_1GB_PAGE_BIT (1ull << 17) +#define VMX_EPT_INVEPT_BIT (1ull << 20) +#define VMX_EPT_AD_BIT (1ull << 21) +#define VMX_EPT_EXTENT_CONTEXT_BIT (1ull << 25) +#define VMX_EPT_EXTENT_GLOBAL_BIT (1ull << 26) + +#define VMX_VPID_INVVPID_BIT (1ull << 0) /* (32 - 32) */ +#define VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT (1ull << 9) /* (41 - 32) */ +#define VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT (1ull << 10) /* (42 - 32) */ + +#define VMX_EPT_DEFAULT_GAW 3 +#define VMX_EPT_MAX_GAW 0x4 +#define VMX_EPT_MT_EPTE_SHIFT 3 +#define VMX_EPT_GAW_EPTP_SHIFT 3 +#define VMX_EPT_AD_ENABLE_BIT (1ull << 6) +#define VMX_EPT_DEFAULT_MT 0x6ull +#define VMX_EPT_READABLE_MASK 0x1ull +#define VMX_EPT_WRITABLE_MASK 0x2ull +#define VMX_EPT_EXECUTABLE_MASK 0x4ull +#define VMX_EPT_IPAT_BIT (1ull << 6) +#define VMX_EPT_ACCESS_BIT (1ull << 8) +#define VMX_EPT_DIRTY_BIT (1ull << 9) + +#define VMX_EPT_IDENTITY_PAGETABLE_ADDR 0xfffbc000ul + + +#define ASM_VMX_VMCLEAR_RAX ".byte 0x66, 0x0f, 0xc7, 0x30" +#define ASM_VMX_VMLAUNCH ".byte 0x0f, 0x01, 0xc2" +#define ASM_VMX_VMRESUME ".byte 0x0f, 0x01, 0xc3" +#define ASM_VMX_VMPTRLD_RAX ".byte 0x0f, 0xc7, 0x30" +#define ASM_VMX_VMREAD_RDX_RAX ".byte 0x0f, 0x78, 0xd0" +#define ASM_VMX_VMWRITE_RAX_RDX ".byte 0x0f, 0x79, 0xd0" +#define ASM_VMX_VMWRITE_RSP_RDX ".byte 0x0f, 0x79, 0xd4" +#define ASM_VMX_VMXOFF ".byte 0x0f, 0x01, 0xc4" +#define ASM_VMX_VMXON_RAX ".byte 0xf3, 0x0f, 0xc7, 0x30" +#define ASM_VMX_INVEPT ".byte 0x66, 0x0f, 0x38, 0x80, 0x08" +#define ASM_VMX_INVVPID ".byte 0x66, 0x0f, 0x38, 0x81, 0x08" + +__align(16) +struct vmx_msr_entry { + u32 index; + u32 reserved; + u64 value; +}; + +/* + * Exit Qualifications 
for entry failure during or after loading guest state + */ +#define ENTRY_FAIL_DEFAULT 0 +#define ENTRY_FAIL_PDPTE 2 +#define ENTRY_FAIL_NMI 3 +#define ENTRY_FAIL_VMCS_LINK_PTR 4 + +/* + * VM-instruction error numbers + */ +enum vm_instruction_error_number { + VMXERR_VMCALL_IN_VMX_ROOT_OPERATION = 1, + VMXERR_VMCLEAR_INVALID_ADDRESS = 2, + VMXERR_VMCLEAR_VMXON_POINTER = 3, + VMXERR_VMLAUNCH_NONCLEAR_VMCS = 4, + VMXERR_VMRESUME_NONLAUNCHED_VMCS = 5, + VMXERR_VMRESUME_AFTER_VMXOFF = 6, + VMXERR_ENTRY_INVALID_CONTROL_FIELD = 7, + VMXERR_ENTRY_INVALID_HOST_STATE_FIELD = 8, + VMXERR_VMPTRLD_INVALID_ADDRESS = 9, + VMXERR_VMPTRLD_VMXON_POINTER = 10, + VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID = 11, + VMXERR_UNSUPPORTED_VMCS_COMPONENT = 12, + VMXERR_VMWRITE_READ_ONLY_VMCS_COMPONENT = 13, + VMXERR_VMXON_IN_VMX_ROOT_OPERATION = 15, + VMXERR_ENTRY_INVALID_EXECUTIVE_VMCS_POINTER = 16, + VMXERR_ENTRY_NONLAUNCHED_EXECUTIVE_VMCS = 17, + VMXERR_ENTRY_EXECUTIVE_VMCS_POINTER_NOT_VMXON_POINTER = 18, + VMXERR_VMCALL_NONCLEAR_VMCS = 19, + VMXERR_VMCALL_INVALID_VM_EXIT_CONTROL_FIELDS = 20, + VMXERR_VMCALL_INCORRECT_MSEG_REVISION_ID = 22, + VMXERR_VMXOFF_UNDER_DUAL_MONITOR_TREATMENT_OF_SMIS_AND_SMM = 23, + VMXERR_VMCALL_INVALID_SMM_MONITOR_FEATURES = 24, + VMXERR_ENTRY_INVALID_VM_EXECUTION_CONTROL_FIELDS_IN_EXECUTIVE_VMCS = 25, + VMXERR_ENTRY_EVENTS_BLOCKED_BY_MOV_SS = 26, + VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID = 28, +}; + +#endif diff --git a/arch/x86/include/uapi/asm/debugreg.h b/arch/x86/include/uapi/asm/debugreg.h new file mode 100755 index 0000000..3c0874d --- /dev/null +++ b/arch/x86/include/uapi/asm/debugreg.h @@ -0,0 +1,80 @@ +#ifndef _UAPI_ASM_X86_DEBUGREG_H +#define _UAPI_ASM_X86_DEBUGREG_H + + +/* Indicate the register numbers for a number of the specific + debug registers. Registers 0-3 contain the addresses we wish to trap on */ +#define DR_FIRSTADDR 0 /* u_debugreg[DR_FIRSTADDR] */ +#define DR_LASTADDR 3 /* u_debugreg[DR_LASTADDR] */ + +#define DR_STATUS 6 /* u_debugreg[DR_STATUS] */ +#define DR_CONTROL 7 /* u_debugreg[DR_CONTROL] */ + +/* Define a few things for the status register. We can use this to determine + which debugging register was responsible for the trap. The other bits + are either reserved or not of interest to us. */ + +/* Define reserved bits in DR6 which are always set to 1 */ +#define DR6_RESERVED (0xFFFF0FF0) + +#define DR_TRAP0 (0x1) /* db0 */ +#define DR_TRAP1 (0x2) /* db1 */ +#define DR_TRAP2 (0x4) /* db2 */ +#define DR_TRAP3 (0x8) /* db3 */ +#define DR_TRAP_BITS (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3) + +#define DR_STEP (0x4000) /* single-step */ +#define DR_SWITCH (0x8000) /* task switch */ + +/* Now define a bunch of things for manipulating the control register. + The top two bytes of the control register consist of 4 fields of 4 + bits - each field corresponds to one of the four debug registers, + and indicates what types of access we trap on, and how large the data + field is that we are looking at */ + +#define DR_CONTROL_SHIFT 16 /* Skip this many bits in ctl register */ +#define DR_CONTROL_SIZE 4 /* 4 control bits per register */ + +#define DR_RW_EXECUTE (0x0) /* Settings for the access types to trap on */ +#define DR_RW_WRITE (0x1) +#define DR_RW_READ (0x3) + +#define DR_LEN_1 (0x0) /* Settings for data length to trap on */ +#define DR_LEN_2 (0x4) +#define DR_LEN_4 (0xC) +#define DR_LEN_8 (0x8) + +/* The low byte to the control register determine which registers are + enabled. There are 4 fields of two bits. 
One bit is "local", meaning + that the processor will reset the bit after a task switch and the other + is global meaning that we have to explicitly reset the bit. With linux, + you can use either one, since we explicitly zero the register when we enter + kernel mode. */ + +#define DR_LOCAL_ENABLE_SHIFT 0 /* Extra shift to the local enable bit */ +#define DR_GLOBAL_ENABLE_SHIFT 1 /* Extra shift to the global enable bit */ +#define DR_LOCAL_ENABLE (0x1) /* Local enable for reg 0 */ +#define DR_GLOBAL_ENABLE (0x2) /* Global enable for reg 0 */ +#define DR_ENABLE_SIZE 2 /* 2 enable bits per register */ + +#define DR_LOCAL_ENABLE_MASK (0x55) /* Set local bits for all 4 regs */ +#define DR_GLOBAL_ENABLE_MASK (0xAA) /* Set global bits for all 4 regs */ + +/* The second byte to the control register has a few special things. + We can slow the instruction pipeline for instructions coming via the + gdt or the ldt if we want to. I am not sure why this is an advantage */ + +#ifdef __i386__ +#define DR_CONTROL_RESERVED (0xFC00) /* Reserved by Intel */ +#else +#define DR_CONTROL_RESERVED (0xFFFFFFFF0000FC00UL) /* Reserved */ +#endif + +#define DR_LOCAL_SLOWDOWN (0x100) /* Local slow the pipeline */ +#define DR_GLOBAL_SLOWDOWN (0x200) /* Global slow the pipeline */ + +/* + * HW breakpoint additions + */ + +#endif /* _UAPI_ASM_X86_DEBUGREG_H */ diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h new file mode 100755 index 0000000..1d1dbb2 --- /dev/null +++ b/arch/x86/include/uapi/asm/kvm.h @@ -0,0 +1,330 @@ +/* + * Copyright 2019 Google LLC + */ + +#ifndef _ASM_X86_KVM_H +#define _ASM_X86_KVM_H + +/* + * kvm x86 specific structures and definitions + * + */ + +#include <gvm_types.h> + +#define DE_VECTOR 0 +#define DB_VECTOR 1 +#define BP_VECTOR 3 +#define OF_VECTOR 4 +#define BR_VECTOR 5 +#define UD_VECTOR 6 +#define NM_VECTOR 7 +#define DF_VECTOR 8 +#define TS_VECTOR 10 +#define NP_VECTOR 11 +#define SS_VECTOR 12 +#define GP_VECTOR 13 +#define PF_VECTOR 14 +#define MF_VECTOR 16 +#define AC_VECTOR 17 +#define MC_VECTOR 18 +#define XM_VECTOR 19 +#define VE_VECTOR 20 + +/* Select x86 specific features in <linux/kvm.h> */ +#define __GVM_HAVE_MSI +#define __GVM_HAVE_USER_NMI +#define __GVM_HAVE_GUEST_DEBUG +#define __GVM_HAVE_MSIX +#define __GVM_HAVE_VCPU_EVENTS +#define __GVM_HAVE_DEBUGREGS +#define __GVM_HAVE_XSAVE +#define __GVM_HAVE_XCRS +#define __GVM_HAVE_READONLY_MEM + +/* Architectural interrupt line count. 
*/ +#define GVM_NR_INTERRUPTS 256 + +struct kvm_memory_alias { + __u32 slot; /* this has a different namespace than memory slots */ + __u32 flags; + __u64 guest_phys_addr; + __u64 memory_size; + __u64 target_phys_addr; +}; + +/* for GVM_GET_IRQCHIP and GVM_SET_IRQCHIP */ +struct kvm_pic_state { + __u8 last_irr; /* edge detection */ + __u8 irr; /* interrupt request register */ + __u8 imr; /* interrupt mask register */ + __u8 isr; /* interrupt service register */ + __u8 priority_add; /* highest irq priority */ + __u8 irq_base; + __u8 read_reg_select; + __u8 poll; + __u8 special_mask; + __u8 init_state; + __u8 auto_eoi; + __u8 rotate_on_auto_eoi; + __u8 special_fully_nested_mode; + __u8 init4; /* true if 4 byte init */ + __u8 elcr; /* PIIX edge/trigger selection */ + __u8 elcr_mask; +}; + +#define GVM_IOAPIC_NUM_PINS 24 +struct kvm_ioapic_state { + __u64 base_address; + __u32 ioregsel; + __u32 id; + __u32 irr; + __u32 pad; + union { + __u64 bits; + struct { + __u8 vector; + __u8 delivery_mode:3; + __u8 dest_mode:1; + __u8 delivery_status:1; + __u8 polarity:1; + __u8 remote_irr:1; + __u8 trig_mode:1; + __u8 mask:1; + __u8 reserve:7; + __u8 reserved[4]; + __u8 dest_id; + } fields; + } redirtbl[GVM_IOAPIC_NUM_PINS]; +}; + +#define GVM_IRQCHIP_PIC_MASTER 0 +#define GVM_IRQCHIP_PIC_SLAVE 1 +#define GVM_IRQCHIP_IOAPIC 2 +#define GVM_NR_IRQCHIPS 3 + +#define GVM_RUN_X86_SMM (1 << 0) + +/* for GVM_GET_REGS and GVM_SET_REGS */ +struct kvm_regs { + /* out (GVM_GET_REGS) / in (GVM_SET_REGS) */ + __u64 rax, rbx, rcx, rdx; + __u64 rsi, rdi, rsp, rbp; + __u64 r8, r9, r10, r11; + __u64 r12, r13, r14, r15; + __u64 rip, rflags; +}; + +/* for GVM_GET_LAPIC and GVM_SET_LAPIC */ +#define GVM_APIC_REG_SIZE 0x400 +struct kvm_lapic_state { + char regs[GVM_APIC_REG_SIZE]; +}; + +struct kvm_segment { + __u64 base; + __u32 limit; + __u16 selector; + __u8 type; + __u8 present, dpl, db, s, l, g, avl; + __u8 unusable; + __u8 padding; +}; + +struct kvm_dtable { + __u64 base; + __u16 limit; + __u16 padding[3]; +}; + + +/* for GVM_GET_SREGS and GVM_SET_SREGS */ +struct kvm_sregs { + /* out (GVM_GET_SREGS) / in (GVM_SET_SREGS) */ + struct kvm_segment cs, ds, es, fs, gs, ss; + struct kvm_segment tr, ldt; + struct kvm_dtable gdt, idt; + __u64 cr0, cr2, cr3, cr4, cr8; + __u64 efer; + __u64 apic_base; + __u64 interrupt_bitmap[(GVM_NR_INTERRUPTS + 63) / 64]; +}; + +/* for GVM_GET_FPU and GVM_SET_FPU */ +struct kvm_fpu { + __u8 fpr[8][16]; + __u16 fcw; + __u16 fsw; + __u8 ftwx; /* in fxsave format */ + __u8 pad1; + __u16 last_opcode; + __u64 last_ip; + __u64 last_dp; + __u8 xmm[16][16]; + __u32 mxcsr; + __u32 pad2; +}; + +struct kvm_msr_entry { + __u32 index; + __u32 reserved; + __u64 data; +}; + +#pragma warning(disable : 4200) +/* for GVM_GET_MSRS and GVM_SET_MSRS */ +struct kvm_msrs { + __u32 nmsrs; /* number of msrs in entries */ + __u32 pad; + + struct kvm_msr_entry entries[0]; +}; + +/* for GVM_GET_MSR_INDEX_LIST */ +struct kvm_msr_list { + __u32 nmsrs; /* number of msrs in entries */ + __u32 indices[0]; +}; + +struct kvm_cpuid_entry { + __u32 function; + __u32 index; + __u32 flags; + __u32 eax; + __u32 ebx; + __u32 ecx; + __u32 edx; + __u32 padding[3]; +}; + +#define GVM_CPUID_FLAG_SIGNIFCANT_INDEX (1 << 0) +#define GVM_CPUID_FLAG_STATEFUL_FUNC (1 << 1) +#define GVM_CPUID_FLAG_STATE_READ_NEXT (1 << 2) + +/* for GVM_SET_CPUID */ +struct kvm_cpuid { + __u32 nent; + __u32 padding; + struct kvm_cpuid_entry entries[0]; +}; + +/* for GVM_GET_PIT and GVM_SET_PIT */ +struct kvm_pit_channel_state { + __u32 count; /* can be 65536 */ 
+ __u16 latched_count; + __u8 count_latched; + __u8 status_latched; + __u8 status; + __u8 read_state; + __u8 write_state; + __u8 write_latch; + __u8 rw_mode; + __u8 mode; + __u8 bcd; + __u8 gate; + __s64 count_load_time; +}; + +struct kvm_debug_exit_arch { + __u32 exception; + __u32 pad; + __u64 pc; + __u64 dr6; + __u64 dr7; +}; + +#define GVM_GUESTDBG_USE_SW_BP 0x00010000 +#define GVM_GUESTDBG_USE_HW_BP 0x00020000 +#define GVM_GUESTDBG_INJECT_DB 0x00040000 +#define GVM_GUESTDBG_INJECT_BP 0x00080000 + +/* for GVM_SET_GUEST_DEBUG */ +struct kvm_guest_debug_arch { + __u64 debugreg[8]; +}; + +struct kvm_reinject_control { + __u8 pit_reinject; + __u8 reserved[31]; +}; + +/* When set in flags, include corresponding fields on GVM_SET_VCPU_EVENTS */ +#define GVM_VCPUEVENT_VALID_NMI_PENDING 0x00000001 +#define GVM_VCPUEVENT_VALID_SIPI_VECTOR 0x00000002 +#define GVM_VCPUEVENT_VALID_SHADOW 0x00000004 +#define GVM_VCPUEVENT_VALID_SMM 0x00000008 + +/* Interrupt shadow states */ +#define GVM_X86_SHADOW_INT_MOV_SS 0x01 +#define GVM_X86_SHADOW_INT_STI 0x02 + +/* for GVM_GET/SET_VCPU_EVENTS */ +struct kvm_vcpu_events { + struct { + __u8 injected; + __u8 nr; + __u8 has_error_code; + __u8 pad; + __u32 error_code; + } exception; + struct { + __u8 injected; + __u8 nr; + __u8 soft; + __u8 shadow; + } interrupt; + struct { + __u8 injected; + __u8 pending; + __u8 masked; + __u8 pad; + } nmi; + __u32 sipi_vector; + __u32 flags; + struct { + __u8 smm; + __u8 pending; + __u8 smm_inside_nmi; + __u8 latched_init; + } smi; + __u32 reserved[9]; +}; + +/* for GVM_GET/SET_DEBUGREGS */ +struct kvm_debugregs { + __u64 db[4]; + __u64 dr6; + __u64 dr7; + __u64 flags; + __u64 reserved[9]; +}; + +/* for GVM_CAP_XSAVE */ +struct kvm_xsave { + __u32 region[1024]; +}; + +#define GVM_MAX_XCRS 16 + +struct kvm_xcr { + __u32 xcr; + __u32 reserved; + __u64 value; +}; + +struct kvm_xcrs { + __u32 nr_xcrs; + __u32 flags; + struct kvm_xcr xcrs[GVM_MAX_XCRS]; + __u64 padding[16]; +}; + +/* definition of registers in kvm_run */ +struct kvm_sync_regs { + u64 reg; +}; + +#define GVM_X86_QUIRK_LINT0_REENABLED (1 << 0) +#define GVM_X86_QUIRK_CD_NW_CLEARED (1 << 1) + +#endif /* _ASM_X86_KVM_H */ diff --git a/arch/x86/include/uapi/asm/processor-flags.h b/arch/x86/include/uapi/asm/processor-flags.h new file mode 100755 index 0000000..4a65cf0 --- /dev/null +++ b/arch/x86/include/uapi/asm/processor-flags.h @@ -0,0 +1,162 @@ +/* + * Copyright 2019 Google LLC + */ + +#ifndef _UAPI_ASM_X86_PROCESSOR_FLAGS_H +#define _UAPI_ASM_X86_PROCESSOR_FLAGS_H +/* Various flags defined: can be included from assembler. 
*/ + +#ifdef CONFIG_X86_64 +#define _BITUL(a) ((1ULL) << a) +#else +#define _BITUL(a) ((1UL) << a) +#endif +#define _AC(X, Y) X##Y + +/* + * EFLAGS bits + */ +#define X86_EFLAGS_CF_BIT 0 /* Carry Flag */ +#define X86_EFLAGS_CF _BITUL(X86_EFLAGS_CF_BIT) +#define X86_EFLAGS_FIXED_BIT 1 /* Bit 1 - always on */ +#define X86_EFLAGS_FIXED _BITUL(X86_EFLAGS_FIXED_BIT) +#define X86_EFLAGS_PF_BIT 2 /* Parity Flag */ +#define X86_EFLAGS_PF _BITUL(X86_EFLAGS_PF_BIT) +#define X86_EFLAGS_AF_BIT 4 /* Auxiliary carry Flag */ +#define X86_EFLAGS_AF _BITUL(X86_EFLAGS_AF_BIT) +#define X86_EFLAGS_ZF_BIT 6 /* Zero Flag */ +#define X86_EFLAGS_ZF _BITUL(X86_EFLAGS_ZF_BIT) +#define X86_EFLAGS_SF_BIT 7 /* Sign Flag */ +#define X86_EFLAGS_SF _BITUL(X86_EFLAGS_SF_BIT) +#define X86_EFLAGS_TF_BIT 8 /* Trap Flag */ +#define X86_EFLAGS_TF _BITUL(X86_EFLAGS_TF_BIT) +#define X86_EFLAGS_IF_BIT 9 /* Interrupt Flag */ +#define X86_EFLAGS_IF _BITUL(X86_EFLAGS_IF_BIT) +#define X86_EFLAGS_DF_BIT 10 /* Direction Flag */ +#define X86_EFLAGS_DF _BITUL(X86_EFLAGS_DF_BIT) +#define X86_EFLAGS_OF_BIT 11 /* Overflow Flag */ +#define X86_EFLAGS_OF _BITUL(X86_EFLAGS_OF_BIT) +#define X86_EFLAGS_IOPL_BIT 12 /* I/O Privilege Level (2 bits) */ +#define X86_EFLAGS_IOPL (_AC(3,UL) << X86_EFLAGS_IOPL_BIT) +#define X86_EFLAGS_NT_BIT 14 /* Nested Task */ +#define X86_EFLAGS_NT _BITUL(X86_EFLAGS_NT_BIT) +#define X86_EFLAGS_RF_BIT 16 /* Resume Flag */ +#define X86_EFLAGS_RF _BITUL(X86_EFLAGS_RF_BIT) +#define X86_EFLAGS_VM_BIT 17 /* Virtual Mode */ +#define X86_EFLAGS_VM _BITUL(X86_EFLAGS_VM_BIT) +#define X86_EFLAGS_AC_BIT 18 /* Alignment Check/Access Control */ +#define X86_EFLAGS_AC _BITUL(X86_EFLAGS_AC_BIT) +#define X86_EFLAGS_VIF_BIT 19 /* Virtual Interrupt Flag */ +#define X86_EFLAGS_VIF _BITUL(X86_EFLAGS_VIF_BIT) +#define X86_EFLAGS_VIP_BIT 20 /* Virtual Interrupt Pending */ +#define X86_EFLAGS_VIP _BITUL(X86_EFLAGS_VIP_BIT) +#define X86_EFLAGS_ID_BIT 21 /* CPUID detection */ +#define X86_EFLAGS_ID _BITUL(X86_EFLAGS_ID_BIT) + +/* + * Basic CPU control in CR0 + */ +#define X86_CR0_PE_BIT 0 /* Protection Enable */ +#define X86_CR0_PE _BITUL(X86_CR0_PE_BIT) +#define X86_CR0_MP_BIT 1 /* Monitor Coprocessor */ +#define X86_CR0_MP _BITUL(X86_CR0_MP_BIT) +#define X86_CR0_EM_BIT 2 /* Emulation */ +#define X86_CR0_EM _BITUL(X86_CR0_EM_BIT) +#define X86_CR0_TS_BIT 3 /* Task Switched */ +#define X86_CR0_TS _BITUL(X86_CR0_TS_BIT) +#define X86_CR0_ET_BIT 4 /* Extension Type */ +#define X86_CR0_ET _BITUL(X86_CR0_ET_BIT) +#define X86_CR0_NE_BIT 5 /* Numeric Error */ +#define X86_CR0_NE _BITUL(X86_CR0_NE_BIT) +#define X86_CR0_WP_BIT 16 /* Write Protect */ +#define X86_CR0_WP _BITUL(X86_CR0_WP_BIT) +#define X86_CR0_AM_BIT 18 /* Alignment Mask */ +#define X86_CR0_AM _BITUL(X86_CR0_AM_BIT) +#define X86_CR0_NW_BIT 29 /* Not Write-through */ +#define X86_CR0_NW _BITUL(X86_CR0_NW_BIT) +#define X86_CR0_CD_BIT 30 /* Cache Disable */ +#define X86_CR0_CD _BITUL(X86_CR0_CD_BIT) +#define X86_CR0_PG_BIT 31 /* Paging */ +#define X86_CR0_PG _BITUL(X86_CR0_PG_BIT) + +/* + * Paging options in CR3 + */ +#define X86_CR3_PWT_BIT 3 /* Page Write Through */ +#define X86_CR3_PWT _BITUL(X86_CR3_PWT_BIT) +#define X86_CR3_PCD_BIT 4 /* Page Cache Disable */ +#define X86_CR3_PCD _BITUL(X86_CR3_PCD_BIT) +#define X86_CR3_PCID_MASK _AC(0x00000fff,UL) /* PCID Mask */ + +/* + * Intel CPU features in CR4 + */ +#define X86_CR4_VME_BIT 0 /* enable vm86 extensions */ +#define X86_CR4_VME _BITUL(X86_CR4_VME_BIT) +#define X86_CR4_PVI_BIT 1 /* virtual interrupts flag enable */ +#define X86_CR4_PVI 
_BITUL(X86_CR4_PVI_BIT) +#define X86_CR4_TSD_BIT 2 /* disable time stamp at ipl 3 */ +#define X86_CR4_TSD _BITUL(X86_CR4_TSD_BIT) +#define X86_CR4_DE_BIT 3 /* enable debugging extensions */ +#define X86_CR4_DE _BITUL(X86_CR4_DE_BIT) +#define X86_CR4_PSE_BIT 4 /* enable page size extensions */ +#define X86_CR4_PSE _BITUL(X86_CR4_PSE_BIT) +#define X86_CR4_PAE_BIT 5 /* enable physical address extensions */ +#define X86_CR4_PAE _BITUL(X86_CR4_PAE_BIT) +#define X86_CR4_MCE_BIT 6 /* Machine check enable */ +#define X86_CR4_MCE _BITUL(X86_CR4_MCE_BIT) +#define X86_CR4_PGE_BIT 7 /* enable global pages */ +#define X86_CR4_PGE _BITUL(X86_CR4_PGE_BIT) +#define X86_CR4_PCE_BIT 8 /* enable performance counters at ipl 3 */ +#define X86_CR4_PCE _BITUL(X86_CR4_PCE_BIT) +#define X86_CR4_OSFXSR_BIT 9 /* enable fast FPU save and restore */ +#define X86_CR4_OSFXSR _BITUL(X86_CR4_OSFXSR_BIT) +#define X86_CR4_OSXMMEXCPT_BIT 10 /* enable unmasked SSE exceptions */ +#define X86_CR4_OSXMMEXCPT _BITUL(X86_CR4_OSXMMEXCPT_BIT) +#define X86_CR4_VMXE_BIT 13 /* enable VMX virtualization */ +#define X86_CR4_VMXE _BITUL(X86_CR4_VMXE_BIT) +#define X86_CR4_SMXE_BIT 14 /* enable safer mode (TXT) */ +#define X86_CR4_SMXE _BITUL(X86_CR4_SMXE_BIT) +#define X86_CR4_FSGSBASE_BIT 16 /* enable RDWRFSGS support */ +#define X86_CR4_FSGSBASE _BITUL(X86_CR4_FSGSBASE_BIT) +#define X86_CR4_PCIDE_BIT 17 /* enable PCID support */ +#define X86_CR4_PCIDE _BITUL(X86_CR4_PCIDE_BIT) +#define X86_CR4_OSXSAVE_BIT 18 /* enable xsave and xrestore */ +#define X86_CR4_OSXSAVE _BITUL(X86_CR4_OSXSAVE_BIT) +#define X86_CR4_SMEP_BIT 20 /* enable SMEP support */ +#define X86_CR4_SMEP _BITUL(X86_CR4_SMEP_BIT) +#define X86_CR4_SMAP_BIT 21 /* enable SMAP support */ +#define X86_CR4_SMAP _BITUL(X86_CR4_SMAP_BIT) +#define X86_CR4_PKE_BIT 22 /* enable Protection Keys support */ +#define X86_CR4_PKE _BITUL(X86_CR4_PKE_BIT) + +/* + * x86-64 Task Priority Register, CR8 + */ +#define X86_CR8_TPR _AC(0x0000000f,UL) /* task priority register */ + +/* + * AMD and Transmeta use MSRs for configuration; see <asm/msr-index.h> + */ + +/* + * NSC/Cyrix CPU configuration register indexes + */ +#define CX86_PCR0 0x20 +#define CX86_GCR 0xb8 +#define CX86_CCR0 0xc0 +#define CX86_CCR1 0xc1 +#define CX86_CCR2 0xc2 +#define CX86_CCR3 0xc3 +#define CX86_CCR4 0xe8 +#define CX86_CCR5 0xe9 +#define CX86_CCR6 0xea +#define CX86_CCR7 0xeb +#define CX86_PCR1 0xf0 +#define CX86_DIR0 0xfe +#define CX86_DIR1 0xff +#define CX86_ARR_BASE 0xc4 +#define CX86_RCR_BASE 0xdc + + +#endif /* _UAPI_ASM_X86_PROCESSOR_FLAGS_H */ diff --git a/arch/x86/include/uapi/asm/svm.h b/arch/x86/include/uapi/asm/svm.h new file mode 100755 index 0000000..a9731f8 --- /dev/null +++ b/arch/x86/include/uapi/asm/svm.h @@ -0,0 +1,179 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef _UAPI__SVM_H +#define _UAPI__SVM_H + +#define SVM_EXIT_READ_CR0 0x000 +#define SVM_EXIT_READ_CR2 0x002 +#define SVM_EXIT_READ_CR3 0x003 +#define SVM_EXIT_READ_CR4 0x004 +#define SVM_EXIT_READ_CR8 0x008 +#define SVM_EXIT_WRITE_CR0 0x010 +#define SVM_EXIT_WRITE_CR2 0x012 +#define SVM_EXIT_WRITE_CR3 0x013 +#define SVM_EXIT_WRITE_CR4 0x014 +#define SVM_EXIT_WRITE_CR8 0x018 +#define SVM_EXIT_READ_DR0 0x020 +#define SVM_EXIT_READ_DR1 0x021 +#define SVM_EXIT_READ_DR2 0x022 +#define SVM_EXIT_READ_DR3 0x023 +#define SVM_EXIT_READ_DR4 0x024 +#define SVM_EXIT_READ_DR5 0x025 +#define SVM_EXIT_READ_DR6 0x026 +#define SVM_EXIT_READ_DR7 0x027 +#define SVM_EXIT_WRITE_DR0 0x030 +#define SVM_EXIT_WRITE_DR1 0x031 +#define 
SVM_EXIT_WRITE_DR2 0x032 +#define SVM_EXIT_WRITE_DR3 0x033 +#define SVM_EXIT_WRITE_DR4 0x034 +#define SVM_EXIT_WRITE_DR5 0x035 +#define SVM_EXIT_WRITE_DR6 0x036 +#define SVM_EXIT_WRITE_DR7 0x037 +#define SVM_EXIT_EXCP_BASE 0x040 +#define SVM_EXIT_INTR 0x060 +#define SVM_EXIT_NMI 0x061 +#define SVM_EXIT_SMI 0x062 +#define SVM_EXIT_INIT 0x063 +#define SVM_EXIT_VINTR 0x064 +#define SVM_EXIT_CR0_SEL_WRITE 0x065 +#define SVM_EXIT_IDTR_READ 0x066 +#define SVM_EXIT_GDTR_READ 0x067 +#define SVM_EXIT_LDTR_READ 0x068 +#define SVM_EXIT_TR_READ 0x069 +#define SVM_EXIT_IDTR_WRITE 0x06a +#define SVM_EXIT_GDTR_WRITE 0x06b +#define SVM_EXIT_LDTR_WRITE 0x06c +#define SVM_EXIT_TR_WRITE 0x06d +#define SVM_EXIT_RDTSC 0x06e +#define SVM_EXIT_RDPMC 0x06f +#define SVM_EXIT_PUSHF 0x070 +#define SVM_EXIT_POPF 0x071 +#define SVM_EXIT_CPUID 0x072 +#define SVM_EXIT_RSM 0x073 +#define SVM_EXIT_IRET 0x074 +#define SVM_EXIT_SWINT 0x075 +#define SVM_EXIT_INVD 0x076 +#define SVM_EXIT_PAUSE 0x077 +#define SVM_EXIT_HLT 0x078 +#define SVM_EXIT_INVLPG 0x079 +#define SVM_EXIT_INVLPGA 0x07a +#define SVM_EXIT_IOIO 0x07b +#define SVM_EXIT_MSR 0x07c +#define SVM_EXIT_TASK_SWITCH 0x07d +#define SVM_EXIT_FERR_FREEZE 0x07e +#define SVM_EXIT_SHUTDOWN 0x07f +#define SVM_EXIT_VMRUN 0x080 +#define SVM_EXIT_VMMCALL 0x081 +#define SVM_EXIT_VMLOAD 0x082 +#define SVM_EXIT_VMSAVE 0x083 +#define SVM_EXIT_STGI 0x084 +#define SVM_EXIT_CLGI 0x085 +#define SVM_EXIT_SKINIT 0x086 +#define SVM_EXIT_RDTSCP 0x087 +#define SVM_EXIT_ICEBP 0x088 +#define SVM_EXIT_WBINVD 0x089 +#define SVM_EXIT_MONITOR 0x08a +#define SVM_EXIT_MWAIT 0x08b +#define SVM_EXIT_MWAIT_COND 0x08c +#define SVM_EXIT_XSETBV 0x08d +#define SVM_EXIT_NPF 0x400 +#define SVM_EXIT_AVIC_INCOMPLETE_IPI 0x401 +#define SVM_EXIT_AVIC_UNACCELERATED_ACCESS 0x402 + +#define SVM_EXIT_ERR -1 + +#define SVM_EXIT_REASONS \ + { SVM_EXIT_READ_CR0, "read_cr0" }, \ + { SVM_EXIT_READ_CR2, "read_cr2" }, \ + { SVM_EXIT_READ_CR3, "read_cr3" }, \ + { SVM_EXIT_READ_CR4, "read_cr4" }, \ + { SVM_EXIT_READ_CR8, "read_cr8" }, \ + { SVM_EXIT_WRITE_CR0, "write_cr0" }, \ + { SVM_EXIT_WRITE_CR2, "write_cr2" }, \ + { SVM_EXIT_WRITE_CR3, "write_cr3" }, \ + { SVM_EXIT_WRITE_CR4, "write_cr4" }, \ + { SVM_EXIT_WRITE_CR8, "write_cr8" }, \ + { SVM_EXIT_READ_DR0, "read_dr0" }, \ + { SVM_EXIT_READ_DR1, "read_dr1" }, \ + { SVM_EXIT_READ_DR2, "read_dr2" }, \ + { SVM_EXIT_READ_DR3, "read_dr3" }, \ + { SVM_EXIT_READ_DR4, "read_dr4" }, \ + { SVM_EXIT_READ_DR5, "read_dr5" }, \ + { SVM_EXIT_READ_DR6, "read_dr6" }, \ + { SVM_EXIT_READ_DR7, "read_dr7" }, \ + { SVM_EXIT_WRITE_DR0, "write_dr0" }, \ + { SVM_EXIT_WRITE_DR1, "write_dr1" }, \ + { SVM_EXIT_WRITE_DR2, "write_dr2" }, \ + { SVM_EXIT_WRITE_DR3, "write_dr3" }, \ + { SVM_EXIT_WRITE_DR4, "write_dr4" }, \ + { SVM_EXIT_WRITE_DR5, "write_dr5" }, \ + { SVM_EXIT_WRITE_DR6, "write_dr6" }, \ + { SVM_EXIT_WRITE_DR7, "write_dr7" }, \ + { SVM_EXIT_EXCP_BASE + DE_VECTOR, "DE excp" }, \ + { SVM_EXIT_EXCP_BASE + DB_VECTOR, "DB excp" }, \ + { SVM_EXIT_EXCP_BASE + BP_VECTOR, "BP excp" }, \ + { SVM_EXIT_EXCP_BASE + OF_VECTOR, "OF excp" }, \ + { SVM_EXIT_EXCP_BASE + BR_VECTOR, "BR excp" }, \ + { SVM_EXIT_EXCP_BASE + UD_VECTOR, "UD excp" }, \ + { SVM_EXIT_EXCP_BASE + NM_VECTOR, "NM excp" }, \ + { SVM_EXIT_EXCP_BASE + DF_VECTOR, "DF excp" }, \ + { SVM_EXIT_EXCP_BASE + TS_VECTOR, "TS excp" }, \ + { SVM_EXIT_EXCP_BASE + NP_VECTOR, "NP excp" }, \ + { SVM_EXIT_EXCP_BASE + SS_VECTOR, "SS excp" }, \ + { SVM_EXIT_EXCP_BASE + GP_VECTOR, "GP excp" }, \ + { SVM_EXIT_EXCP_BASE + PF_VECTOR, "PF excp" }, \ + { 
SVM_EXIT_EXCP_BASE + MF_VECTOR, "MF excp" }, \ + { SVM_EXIT_EXCP_BASE + AC_VECTOR, "AC excp" }, \ + { SVM_EXIT_EXCP_BASE + MC_VECTOR, "MC excp" }, \ + { SVM_EXIT_EXCP_BASE + XM_VECTOR, "XF excp" }, \ + { SVM_EXIT_INTR, "interrupt" }, \ + { SVM_EXIT_NMI, "nmi" }, \ + { SVM_EXIT_SMI, "smi" }, \ + { SVM_EXIT_INIT, "init" }, \ + { SVM_EXIT_VINTR, "vintr" }, \ + { SVM_EXIT_CR0_SEL_WRITE, "cr0_sel_write" }, \ + { SVM_EXIT_IDTR_READ, "read_idtr" }, \ + { SVM_EXIT_GDTR_READ, "read_gdtr" }, \ + { SVM_EXIT_LDTR_READ, "read_ldtr" }, \ + { SVM_EXIT_TR_READ, "read_rt" }, \ + { SVM_EXIT_IDTR_WRITE, "write_idtr" }, \ + { SVM_EXIT_GDTR_WRITE, "write_gdtr" }, \ + { SVM_EXIT_LDTR_WRITE, "write_ldtr" }, \ + { SVM_EXIT_TR_WRITE, "write_rt" }, \ + { SVM_EXIT_RDTSC, "rdtsc" }, \ + { SVM_EXIT_RDPMC, "rdpmc" }, \ + { SVM_EXIT_PUSHF, "pushf" }, \ + { SVM_EXIT_POPF, "popf" }, \ + { SVM_EXIT_CPUID, "cpuid" }, \ + { SVM_EXIT_RSM, "rsm" }, \ + { SVM_EXIT_IRET, "iret" }, \ + { SVM_EXIT_SWINT, "swint" }, \ + { SVM_EXIT_INVD, "invd" }, \ + { SVM_EXIT_PAUSE, "pause" }, \ + { SVM_EXIT_HLT, "hlt" }, \ + { SVM_EXIT_INVLPG, "invlpg" }, \ + { SVM_EXIT_INVLPGA, "invlpga" }, \ + { SVM_EXIT_IOIO, "io" }, \ + { SVM_EXIT_MSR, "msr" }, \ + { SVM_EXIT_TASK_SWITCH, "task_switch" }, \ + { SVM_EXIT_FERR_FREEZE, "ferr_freeze" }, \ + { SVM_EXIT_SHUTDOWN, "shutdown" }, \ + { SVM_EXIT_VMRUN, "vmrun" }, \ + { SVM_EXIT_VMMCALL, "hypercall" }, \ + { SVM_EXIT_VMLOAD, "vmload" }, \ + { SVM_EXIT_VMSAVE, "vmsave" }, \ + { SVM_EXIT_STGI, "stgi" }, \ + { SVM_EXIT_CLGI, "clgi" }, \ + { SVM_EXIT_SKINIT, "skinit" }, \ + { SVM_EXIT_RDTSCP, "rdtscp" }, \ + { SVM_EXIT_ICEBP, "icebp" }, \ + { SVM_EXIT_WBINVD, "wbinvd" }, \ + { SVM_EXIT_MONITOR, "monitor" }, \ + { SVM_EXIT_MWAIT, "mwait" }, \ + { SVM_EXIT_XSETBV, "xsetbv" }, \ + { SVM_EXIT_NPF, "npf" }, \ + { SVM_EXIT_AVIC_INCOMPLETE_IPI, "avic_incomplete_ipi" }, \ + { SVM_EXIT_AVIC_UNACCELERATED_ACCESS, "avic_unaccelerated_access" }, \ + { SVM_EXIT_ERR, "invalid_guest_state" } + + +#endif /* _UAPI__SVM_H */ diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h new file mode 100755 index 0000000..09c0f96 --- /dev/null +++ b/arch/x86/include/uapi/asm/vmx.h @@ -0,0 +1,133 @@ +/* + * vmx.h: VMX Architecture related definitions + * Copyright (c) 2004, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 Temple + * Place - Suite 330, Boston, MA 02111-1307 USA. 
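Editor's note: the SVM_EXIT_* values above are the AMD #VMEXIT codes a hypervisor reads back from the VMCB after a world switch, and SVM_EXIT_REASONS is the matching name table (typically fed to tracepoint pretty-printers such as __print_symbolic(); the VMX_EXIT_REASONS table in vmx.h below plays the same role for Intel). A minimal sketch of how an exit handler keys off these codes follows — the vcpu/vmcb types and handle_*() helpers are hypothetical, used only to show the dispatch shape, not symbols from this driver:

static int handle_exit(struct vcpu_svm *svm)
{
	u32 exit_code = svm->vmcb->control.exit_code;   /* written by hardware on #VMEXIT */

	switch (exit_code) {
	case SVM_EXIT_CPUID:
		return handle_cpuid(svm);               /* emulate CPUID for the guest */
	case SVM_EXIT_MSR:
		return handle_msr(svm);                 /* RDMSR/WRMSR intercept */
	case SVM_EXIT_NPF:
		return handle_nested_page_fault(svm);   /* nested paging fault */
	case SVM_EXIT_ERR:
		return -EINVAL;                         /* invalid guest state, cannot enter */
	default:
		pr_err("unhandled #VMEXIT 0x%x\n", exit_code);
		return 0;
	}
}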
+ * + * A few random additions are: + * Copyright (C) 2006 Qumranet + * Avi Kivity <avi@qumranet.com> + * Yaniv Kamay <yaniv@qumranet.com> + * + */ +#ifndef _UAPIVMX_H +#define _UAPIVMX_H + + +#define VMX_EXIT_REASONS_FAILED_VMENTRY 0x80000000 + +#define EXIT_REASON_EXCEPTION_NMI 0 +#define EXIT_REASON_EXTERNAL_INTERRUPT 1 +#define EXIT_REASON_TRIPLE_FAULT 2 + +#define EXIT_REASON_PENDING_INTERRUPT 7 +#define EXIT_REASON_NMI_WINDOW 8 +#define EXIT_REASON_TASK_SWITCH 9 +#define EXIT_REASON_CPUID 10 +#define EXIT_REASON_HLT 12 +#define EXIT_REASON_INVD 13 +#define EXIT_REASON_INVLPG 14 +#define EXIT_REASON_RDPMC 15 +#define EXIT_REASON_RDTSC 16 +#define EXIT_REASON_VMCALL 18 +#define EXIT_REASON_VMCLEAR 19 +#define EXIT_REASON_VMLAUNCH 20 +#define EXIT_REASON_VMPTRLD 21 +#define EXIT_REASON_VMPTRST 22 +#define EXIT_REASON_VMREAD 23 +#define EXIT_REASON_VMRESUME 24 +#define EXIT_REASON_VMWRITE 25 +#define EXIT_REASON_VMOFF 26 +#define EXIT_REASON_VMON 27 +#define EXIT_REASON_CR_ACCESS 28 +#define EXIT_REASON_DR_ACCESS 29 +#define EXIT_REASON_IO_INSTRUCTION 30 +#define EXIT_REASON_MSR_READ 31 +#define EXIT_REASON_MSR_WRITE 32 +#define EXIT_REASON_INVALID_STATE 33 +#define EXIT_REASON_MSR_LOAD_FAIL 34 +#define EXIT_REASON_MWAIT_INSTRUCTION 36 +#define EXIT_REASON_MONITOR_TRAP_FLAG 37 +#define EXIT_REASON_MONITOR_INSTRUCTION 39 +#define EXIT_REASON_PAUSE_INSTRUCTION 40 +#define EXIT_REASON_MCE_DURING_VMENTRY 41 +#define EXIT_REASON_TPR_BELOW_THRESHOLD 43 +#define EXIT_REASON_APIC_ACCESS 44 +#define EXIT_REASON_EOI_INDUCED 45 +#define EXIT_REASON_EPT_VIOLATION 48 +#define EXIT_REASON_EPT_MISCONFIG 49 +#define EXIT_REASON_INVEPT 50 +#define EXIT_REASON_RDTSCP 51 +#define EXIT_REASON_INVVPID 53 +#define EXIT_REASON_WBINVD 54 +#define EXIT_REASON_XSETBV 55 +#define EXIT_REASON_APIC_WRITE 56 +#define EXIT_REASON_INVPCID 58 +#define EXIT_REASON_PML_FULL 62 +#define EXIT_REASON_XSAVES 63 +#define EXIT_REASON_XRSTORS 64 + +#define VMX_EXIT_REASONS \ + { EXIT_REASON_EXCEPTION_NMI, "EXCEPTION_NMI" }, \ + { EXIT_REASON_EXTERNAL_INTERRUPT, "EXTERNAL_INTERRUPT" }, \ + { EXIT_REASON_TRIPLE_FAULT, "TRIPLE_FAULT" }, \ + { EXIT_REASON_PENDING_INTERRUPT, "PENDING_INTERRUPT" }, \ + { EXIT_REASON_NMI_WINDOW, "NMI_WINDOW" }, \ + { EXIT_REASON_TASK_SWITCH, "TASK_SWITCH" }, \ + { EXIT_REASON_CPUID, "CPUID" }, \ + { EXIT_REASON_HLT, "HLT" }, \ + { EXIT_REASON_INVLPG, "INVLPG" }, \ + { EXIT_REASON_RDPMC, "RDPMC" }, \ + { EXIT_REASON_RDTSC, "RDTSC" }, \ + { EXIT_REASON_VMCALL, "VMCALL" }, \ + { EXIT_REASON_VMCLEAR, "VMCLEAR" }, \ + { EXIT_REASON_VMLAUNCH, "VMLAUNCH" }, \ + { EXIT_REASON_VMPTRLD, "VMPTRLD" }, \ + { EXIT_REASON_VMPTRST, "VMPTRST" }, \ + { EXIT_REASON_VMREAD, "VMREAD" }, \ + { EXIT_REASON_VMRESUME, "VMRESUME" }, \ + { EXIT_REASON_VMWRITE, "VMWRITE" }, \ + { EXIT_REASON_VMOFF, "VMOFF" }, \ + { EXIT_REASON_VMON, "VMON" }, \ + { EXIT_REASON_CR_ACCESS, "CR_ACCESS" }, \ + { EXIT_REASON_DR_ACCESS, "DR_ACCESS" }, \ + { EXIT_REASON_IO_INSTRUCTION, "IO_INSTRUCTION" }, \ + { EXIT_REASON_MSR_READ, "MSR_READ" }, \ + { EXIT_REASON_MSR_WRITE, "MSR_WRITE" }, \ + { EXIT_REASON_MWAIT_INSTRUCTION, "MWAIT_INSTRUCTION" }, \ + { EXIT_REASON_MONITOR_TRAP_FLAG, "MONITOR_TRAP_FLAG" }, \ + { EXIT_REASON_MONITOR_INSTRUCTION, "MONITOR_INSTRUCTION" }, \ + { EXIT_REASON_PAUSE_INSTRUCTION, "PAUSE_INSTRUCTION" }, \ + { EXIT_REASON_MCE_DURING_VMENTRY, "MCE_DURING_VMENTRY" }, \ + { EXIT_REASON_TPR_BELOW_THRESHOLD, "TPR_BELOW_THRESHOLD" }, \ + { EXIT_REASON_APIC_ACCESS, "APIC_ACCESS" }, \ + { EXIT_REASON_EPT_VIOLATION, "EPT_VIOLATION" }, \ 
+ { EXIT_REASON_EPT_MISCONFIG, "EPT_MISCONFIG" }, \ + { EXIT_REASON_INVEPT, "INVEPT" }, \ + { EXIT_REASON_PREEMPTION_TIMER, "PREEMPTION_TIMER" }, \ + { EXIT_REASON_WBINVD, "WBINVD" }, \ + { EXIT_REASON_APIC_WRITE, "APIC_WRITE" }, \ + { EXIT_REASON_EOI_INDUCED, "EOI_INDUCED" }, \ + { EXIT_REASON_INVALID_STATE, "INVALID_STATE" }, \ + { EXIT_REASON_MSR_LOAD_FAIL, "MSR_LOAD_FAIL" }, \ + { EXIT_REASON_INVD, "INVD" }, \ + { EXIT_REASON_INVVPID, "INVVPID" }, \ + { EXIT_REASON_INVPCID, "INVPCID" }, \ + { EXIT_REASON_XSAVES, "XSAVES" }, \ + { EXIT_REASON_XRSTORS, "XRSTORS" } + +#define VMX_ABORT_SAVE_GUEST_MSR_FAIL 1 +#define VMX_ABORT_LOAD_HOST_MSR_FAIL 4 + +#endif /* _UAPIVMX_H */ diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c deleted file mode 100644 index edbbfc8..0000000 --- a/arch/x86/kernel/kvm.c +++ /dev/null @@ -1,627 +0,0 @@ -/* - * KVM paravirt_ops implementation - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. - * - * Copyright (C) 2007, Red Hat, Inc., Ingo Molnar <mingo@redhat.com> - * Copyright IBM Corporation, 2007 - * Authors: Anthony Liguori <aliguori@us.ibm.com> - */ - -#include <linux/context_tracking.h> -#include <linux/init.h> -#include <linux/kernel.h> -#include <linux/kvm_para.h> -#include <linux/cpu.h> -#include <linux/mm.h> -#include <linux/highmem.h> -#include <linux/hardirq.h> -#include <linux/notifier.h> -#include <linux/reboot.h> -#include <linux/hash.h> -#include <linux/sched.h> -#include <linux/slab.h> -#include <linux/kprobes.h> -#include <linux/debugfs.h> -#include <linux/nmi.h> -#include <linux/swait.h> -#include <asm/timer.h> -#include <asm/cpu.h> -#include <asm/traps.h> -#include <asm/desc.h> -#include <asm/tlbflush.h> -#include <asm/idle.h> -#include <asm/apic.h> -#include <asm/apicdef.h> -#include <asm/hypervisor.h> -#include <asm/kvm_guest.h> - -static int kvmapf = 1; - -static int parse_no_kvmapf(char *arg) -{ - kvmapf = 0; - return 0; -} - -early_param("no-kvmapf", parse_no_kvmapf); - -static int steal_acc = 1; -static int parse_no_stealacc(char *arg) -{ - steal_acc = 0; - return 0; -} - -early_param("no-steal-acc", parse_no_stealacc); - -static int kvmclock_vsyscall = 1; -static int parse_no_kvmclock_vsyscall(char *arg) -{ - kvmclock_vsyscall = 0; - return 0; -} - -early_param("no-kvmclock-vsyscall", parse_no_kvmclock_vsyscall); - -static DEFINE_PER_CPU(struct kvm_vcpu_pv_apf_data, apf_reason) __aligned(64); -static DEFINE_PER_CPU(struct kvm_steal_time, steal_time) __aligned(64); -static int has_steal_clock = 0; - -/* - * No need for any "IO delay" on KVM - */ -static void kvm_io_delay(void) -{ -} - -#define KVM_TASK_SLEEP_HASHBITS 8 -#define KVM_TASK_SLEEP_HASHSIZE (1<<KVM_TASK_SLEEP_HASHBITS) - -struct kvm_task_sleep_node { - struct hlist_node link; - struct swait_queue_head wq; - u32 token; - int cpu; - bool halted; -}; - -static struct kvm_task_sleep_head { - raw_spinlock_t 
lock; - struct hlist_head list; -} async_pf_sleepers[KVM_TASK_SLEEP_HASHSIZE]; - -static struct kvm_task_sleep_node *_find_apf_task(struct kvm_task_sleep_head *b, - u32 token) -{ - struct hlist_node *p; - - hlist_for_each(p, &b->list) { - struct kvm_task_sleep_node *n = - hlist_entry(p, typeof(*n), link); - if (n->token == token) - return n; - } - - return NULL; -} - -void kvm_async_pf_task_wait(u32 token) -{ - u32 key = hash_32(token, KVM_TASK_SLEEP_HASHBITS); - struct kvm_task_sleep_head *b = &async_pf_sleepers[key]; - struct kvm_task_sleep_node n, *e; - DECLARE_SWAITQUEUE(wait); - - rcu_irq_enter(); - - raw_spin_lock(&b->lock); - e = _find_apf_task(b, token); - if (e) { - /* dummy entry exist -> wake up was delivered ahead of PF */ - hlist_del(&e->link); - kfree(e); - raw_spin_unlock(&b->lock); - - rcu_irq_exit(); - return; - } - - n.token = token; - n.cpu = smp_processor_id(); - n.halted = is_idle_task(current) || preempt_count() > 1; - init_swait_queue_head(&n.wq); - hlist_add_head(&n.link, &b->list); - raw_spin_unlock(&b->lock); - - for (;;) { - if (!n.halted) - prepare_to_swait(&n.wq, &wait, TASK_UNINTERRUPTIBLE); - if (hlist_unhashed(&n.link)) - break; - - if (!n.halted) { - local_irq_enable(); - schedule(); - local_irq_disable(); - } else { - /* - * We cannot reschedule. So halt. - */ - rcu_irq_exit(); - native_safe_halt(); - rcu_irq_enter(); - local_irq_disable(); - } - } - if (!n.halted) - finish_swait(&n.wq, &wait); - - rcu_irq_exit(); - return; -} -EXPORT_SYMBOL_GPL(kvm_async_pf_task_wait); - -static void apf_task_wake_one(struct kvm_task_sleep_node *n) -{ - hlist_del_init(&n->link); - if (n->halted) - smp_send_reschedule(n->cpu); - else if (swait_active(&n->wq)) - swake_up(&n->wq); -} - -static void apf_task_wake_all(void) -{ - int i; - - for (i = 0; i < KVM_TASK_SLEEP_HASHSIZE; i++) { - struct hlist_node *p, *next; - struct kvm_task_sleep_head *b = &async_pf_sleepers[i]; - raw_spin_lock(&b->lock); - hlist_for_each_safe(p, next, &b->list) { - struct kvm_task_sleep_node *n = - hlist_entry(p, typeof(*n), link); - if (n->cpu == smp_processor_id()) - apf_task_wake_one(n); - } - raw_spin_unlock(&b->lock); - } -} - -void kvm_async_pf_task_wake(u32 token) -{ - u32 key = hash_32(token, KVM_TASK_SLEEP_HASHBITS); - struct kvm_task_sleep_head *b = &async_pf_sleepers[key]; - struct kvm_task_sleep_node *n; - - if (token == ~0) { - apf_task_wake_all(); - return; - } - -again: - raw_spin_lock(&b->lock); - n = _find_apf_task(b, token); - if (!n) { - /* - * async PF was not yet handled. - * Add dummy entry for the token. - */ - n = kzalloc(sizeof(*n), GFP_ATOMIC); - if (!n) { - /* - * Allocation failed! Busy wait while other cpu - * handles async PF. 
- */ - raw_spin_unlock(&b->lock); - cpu_relax(); - goto again; - } - n->token = token; - n->cpu = smp_processor_id(); - init_swait_queue_head(&n->wq); - hlist_add_head(&n->link, &b->list); - } else - apf_task_wake_one(n); - raw_spin_unlock(&b->lock); - return; -} -EXPORT_SYMBOL_GPL(kvm_async_pf_task_wake); - -u32 kvm_read_and_reset_pf_reason(void) -{ - u32 reason = 0; - - if (__this_cpu_read(apf_reason.enabled)) { - reason = __this_cpu_read(apf_reason.reason); - __this_cpu_write(apf_reason.reason, 0); - } - - return reason; -} -EXPORT_SYMBOL_GPL(kvm_read_and_reset_pf_reason); -NOKPROBE_SYMBOL(kvm_read_and_reset_pf_reason); - -dotraplinkage void -do_async_page_fault(struct pt_regs *regs, unsigned long error_code) -{ - enum ctx_state prev_state; - - switch (kvm_read_and_reset_pf_reason()) { - default: - trace_do_page_fault(regs, error_code); - break; - case KVM_PV_REASON_PAGE_NOT_PRESENT: - /* page is swapped out by the host. */ - prev_state = exception_enter(); - exit_idle(); - kvm_async_pf_task_wait((u32)read_cr2()); - exception_exit(prev_state); - break; - case KVM_PV_REASON_PAGE_READY: - rcu_irq_enter(); - exit_idle(); - kvm_async_pf_task_wake((u32)read_cr2()); - rcu_irq_exit(); - break; - } -} -NOKPROBE_SYMBOL(do_async_page_fault); - -static void __init paravirt_ops_setup(void) -{ - pv_info.name = "KVM"; - - if (kvm_para_has_feature(KVM_FEATURE_NOP_IO_DELAY)) - pv_cpu_ops.io_delay = kvm_io_delay; - -#ifdef CONFIG_X86_IO_APIC - no_timer_check = 1; -#endif -} - -static void kvm_register_steal_time(void) -{ - int cpu = smp_processor_id(); - struct kvm_steal_time *st = &per_cpu(steal_time, cpu); - - if (!has_steal_clock) - return; - - wrmsrl(MSR_KVM_STEAL_TIME, (slow_virt_to_phys(st) | KVM_MSR_ENABLED)); - pr_info("kvm-stealtime: cpu %d, msr %llx\n", - cpu, (unsigned long long) slow_virt_to_phys(st)); -} - -static DEFINE_PER_CPU(unsigned long, kvm_apic_eoi) = KVM_PV_EOI_DISABLED; - -static void kvm_guest_apic_eoi_write(u32 reg, u32 val) -{ - /** - * This relies on __test_and_clear_bit to modify the memory - * in a way that is atomic with respect to the local CPU. - * The hypervisor only accesses this memory from the local CPU so - * there's no need for lock or memory barriers. - * An optimization barrier is implied in apic write. - */ - if (__test_and_clear_bit(KVM_PV_EOI_BIT, this_cpu_ptr(&kvm_apic_eoi))) - return; - apic_write(APIC_EOI, APIC_EOI_ACK); -} - -static void kvm_guest_cpu_init(void) -{ - if (!kvm_para_available()) - return; - - if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF) && kvmapf) { - u64 pa = slow_virt_to_phys(this_cpu_ptr(&apf_reason)); - -#ifdef CONFIG_PREEMPT - pa |= KVM_ASYNC_PF_SEND_ALWAYS; -#endif - wrmsrl(MSR_KVM_ASYNC_PF_EN, pa | KVM_ASYNC_PF_ENABLED); - __this_cpu_write(apf_reason.enabled, 1); - printk(KERN_INFO"KVM setup async PF for cpu %d\n", - smp_processor_id()); - } - - if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) { - unsigned long pa; - /* Size alignment is implied but just to make it explicit. 
*/ - BUILD_BUG_ON(__alignof__(kvm_apic_eoi) < 4); - __this_cpu_write(kvm_apic_eoi, 0); - pa = slow_virt_to_phys(this_cpu_ptr(&kvm_apic_eoi)) - | KVM_MSR_ENABLED; - wrmsrl(MSR_KVM_PV_EOI_EN, pa); - } - - if (has_steal_clock) - kvm_register_steal_time(); -} - -static void kvm_pv_disable_apf(void) -{ - if (!__this_cpu_read(apf_reason.enabled)) - return; - - wrmsrl(MSR_KVM_ASYNC_PF_EN, 0); - __this_cpu_write(apf_reason.enabled, 0); - - printk(KERN_INFO"Unregister pv shared memory for cpu %d\n", - smp_processor_id()); -} - -static void kvm_pv_guest_cpu_reboot(void *unused) -{ - /* - * We disable PV EOI before we load a new kernel by kexec, - * since MSR_KVM_PV_EOI_EN stores a pointer into old kernel's memory. - * New kernel can re-enable when it boots. - */ - if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) - wrmsrl(MSR_KVM_PV_EOI_EN, 0); - kvm_pv_disable_apf(); - kvm_disable_steal_time(); -} - -static int kvm_pv_reboot_notify(struct notifier_block *nb, - unsigned long code, void *unused) -{ - if (code == SYS_RESTART) - on_each_cpu(kvm_pv_guest_cpu_reboot, NULL, 1); - return NOTIFY_DONE; -} - -static struct notifier_block kvm_pv_reboot_nb = { - .notifier_call = kvm_pv_reboot_notify, -}; - -static u64 kvm_steal_clock(int cpu) -{ - u64 steal; - struct kvm_steal_time *src; - int version; - - src = &per_cpu(steal_time, cpu); - do { - version = src->version; - rmb(); - steal = src->steal; - rmb(); - } while ((version & 1) || (version != src->version)); - - return steal; -} - -void kvm_disable_steal_time(void) -{ - if (!has_steal_clock) - return; - - wrmsr(MSR_KVM_STEAL_TIME, 0, 0); -} - -#ifdef CONFIG_SMP -static void __init kvm_smp_prepare_boot_cpu(void) -{ - kvm_guest_cpu_init(); - native_smp_prepare_boot_cpu(); - kvm_spinlock_init(); -} - -static void kvm_guest_cpu_offline(void) -{ - kvm_disable_steal_time(); - if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) - wrmsrl(MSR_KVM_PV_EOI_EN, 0); - kvm_pv_disable_apf(); - apf_task_wake_all(); -} - -static int kvm_cpu_online(unsigned int cpu) -{ - local_irq_disable(); - kvm_guest_cpu_init(); - local_irq_enable(); - return 0; -} - -static int kvm_cpu_down_prepare(unsigned int cpu) -{ - local_irq_disable(); - kvm_guest_cpu_offline(); - local_irq_enable(); - return 0; -} -#endif - -static void __init kvm_apf_trap_init(void) -{ - set_intr_gate(14, async_page_fault); -} - -void __init kvm_guest_init(void) -{ - int i; - - if (!kvm_para_available()) - return; - - paravirt_ops_setup(); - register_reboot_notifier(&kvm_pv_reboot_nb); - for (i = 0; i < KVM_TASK_SLEEP_HASHSIZE; i++) - raw_spin_lock_init(&async_pf_sleepers[i].lock); - if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF)) - x86_init.irqs.trap_init = kvm_apf_trap_init; - - if (kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) { - has_steal_clock = 1; - pv_time_ops.steal_clock = kvm_steal_clock; - } - - if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) - apic_set_eoi_write(kvm_guest_apic_eoi_write); - - if (kvmclock_vsyscall) - kvm_setup_vsyscall_timeinfo(); - -#ifdef CONFIG_SMP - smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu; - if (cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "x86/kvm:online", - kvm_cpu_online, kvm_cpu_down_prepare) < 0) - pr_err("kvm_guest: Failed to install cpu hotplug callbacks\n"); -#else - kvm_guest_cpu_init(); -#endif - - /* - * Hard lockup detection is enabled by default. Disable it, as guests - * can get false positives too easily, for example if the host is - * overcommitted. 
- */ - hardlockup_detector_disable(); -} - -static noinline uint32_t __kvm_cpuid_base(void) -{ - if (boot_cpu_data.cpuid_level < 0) - return 0; /* So we don't blow up on old processors */ - - if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) - return hypervisor_cpuid_base("KVMKVMKVM\0\0\0", 0); - - return 0; -} - -static inline uint32_t kvm_cpuid_base(void) -{ - static int kvm_cpuid_base = -1; - - if (kvm_cpuid_base == -1) - kvm_cpuid_base = __kvm_cpuid_base(); - - return kvm_cpuid_base; -} - -bool kvm_para_available(void) -{ - return kvm_cpuid_base() != 0; -} -EXPORT_SYMBOL_GPL(kvm_para_available); - -unsigned int kvm_arch_para_features(void) -{ - return cpuid_eax(kvm_cpuid_base() | KVM_CPUID_FEATURES); -} - -static uint32_t __init kvm_detect(void) -{ - return kvm_cpuid_base(); -} - -const struct hypervisor_x86 x86_hyper_kvm __refconst = { - .name = "KVM", - .detect = kvm_detect, - .x2apic_available = kvm_para_available, -}; -EXPORT_SYMBOL_GPL(x86_hyper_kvm); - -static __init int activate_jump_labels(void) -{ - if (has_steal_clock) { - static_key_slow_inc(¶virt_steal_enabled); - if (steal_acc) - static_key_slow_inc(¶virt_steal_rq_enabled); - } - - return 0; -} -arch_initcall(activate_jump_labels); - -#ifdef CONFIG_PARAVIRT_SPINLOCKS - -/* Kick a cpu by its apicid. Used to wake up a halted vcpu */ -static void kvm_kick_cpu(int cpu) -{ - int apicid; - unsigned long flags = 0; - - apicid = per_cpu(x86_cpu_to_apicid, cpu); - kvm_hypercall2(KVM_HC_KICK_CPU, flags, apicid); -} - -#include <asm/qspinlock.h> - -static void kvm_wait(u8 *ptr, u8 val) -{ - unsigned long flags; - - if (in_nmi()) - return; - - local_irq_save(flags); - - if (READ_ONCE(*ptr) != val) - goto out; - - /* - * halt until it's our turn and kicked. Note that we do safe halt - * for irq enabled case to avoid hang when lock info is overwritten - * in irq spinlock slowpath and no spurious interrupt occur to save us. - */ - if (arch_irqs_disabled_flags(flags)) - halt(); - else - safe_halt(); - -out: - local_irq_restore(flags); -} - -/* - * Setup pv_lock_ops to exploit KVM_FEATURE_PV_UNHALT if present. - */ -void __init kvm_spinlock_init(void) -{ - if (!kvm_para_available()) - return; - /* Does host kernel support KVM_FEATURE_PV_UNHALT? */ - if (!kvm_para_has_feature(KVM_FEATURE_PV_UNHALT)) - return; - - __pv_init_lock_hash(); - pv_lock_ops.queued_spin_lock_slowpath = __pv_queued_spin_lock_slowpath; - pv_lock_ops.queued_spin_unlock = PV_CALLEE_SAVE(__pv_queued_spin_unlock); - pv_lock_ops.wait = kvm_wait; - pv_lock_ops.kick = kvm_kick_cpu; -} - -static __init int kvm_spinlock_init_jump(void) -{ - if (!kvm_para_available()) - return 0; - if (!kvm_para_has_feature(KVM_FEATURE_PV_UNHALT)) - return 0; - - static_key_slow_inc(¶virt_ticketlocks_enabled); - printk(KERN_INFO "KVM setup paravirtual spinlock\n"); - - return 0; -} -early_initcall(kvm_spinlock_init_jump); - -#endif /* CONFIG_PARAVIRT_SPINLOCKS */ diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c deleted file mode 100644 index 60b9949..0000000 --- a/arch/x86/kernel/kvmclock.c +++ /dev/null @@ -1,338 +0,0 @@ -/* KVM paravirtual clock driver. A clocksource implementation - Copyright (C) 2008 Glauber de Oliveira Costa, Red Hat Inc. - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2 of the License, or - (at your option) any later version. 
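Editor's note: the deleted kvm.c above detects a KVM host by scanning the hypervisor CPUID leaf range for the "KVMKVMKVM\0\0\0" signature (hypervisor_cpuid_base() with base 0x40000000). A rough, self-contained sketch of that probe, written as a userspace test using GCC's <cpuid.h> — the helper name and the exact scan range here are assumptions for illustration, not the kernel's implementation, and it only reports a hit when run inside a KVM guest:

#include <cpuid.h>      /* __get_cpuid(): GCC/clang CPUID wrapper */
#include <string.h>

static unsigned int find_kvm_cpuid_base(void)
{
	unsigned int base, eax, sig[3];

	/* Hypervisors advertise a signature in the 0x40000000-0x4000ffff leaf
	 * range, stepping by 0x100; KVM answers "KVMKVMKVM\0\0\0" in ebx/ecx/edx. */
	for (base = 0x40000000; base < 0x40010000; base += 0x100) {
		__get_cpuid(base, &eax, &sig[0], &sig[1], &sig[2]);
		if (!memcmp("KVMKVMKVM\0\0\0", sig, 12))
			return base;    /* KVM_CPUID_FEATURES is then base + 0x01 */
	}
	return 0;
}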
- - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program; if not, write to the Free Software - Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA -*/ - -#include <linux/clocksource.h> -#include <linux/kvm_para.h> -#include <asm/pvclock.h> -#include <asm/msr.h> -#include <asm/apic.h> -#include <linux/percpu.h> -#include <linux/hardirq.h> -#include <linux/memblock.h> -#include <linux/sched.h> - -#include <asm/x86_init.h> -#include <asm/reboot.h> - -static int kvmclock __ro_after_init = 1; -static int msr_kvm_system_time = MSR_KVM_SYSTEM_TIME; -static int msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK; -static cycle_t kvm_sched_clock_offset; - -static int parse_no_kvmclock(char *arg) -{ - kvmclock = 0; - return 0; -} -early_param("no-kvmclock", parse_no_kvmclock); - -/* The hypervisor will put information about time periodically here */ -static struct pvclock_vsyscall_time_info *hv_clock; -static struct pvclock_wall_clock wall_clock; - -struct pvclock_vsyscall_time_info *pvclock_pvti_cpu0_va(void) -{ - return hv_clock; -} - -/* - * The wallclock is the time of day when we booted. Since then, some time may - * have elapsed since the hypervisor wrote the data. So we try to account for - * that with system time - */ -static void kvm_get_wallclock(struct timespec *now) -{ - struct pvclock_vcpu_time_info *vcpu_time; - int low, high; - int cpu; - - low = (int)__pa_symbol(&wall_clock); - high = ((u64)__pa_symbol(&wall_clock) >> 32); - - native_write_msr(msr_kvm_wall_clock, low, high); - - cpu = get_cpu(); - - vcpu_time = &hv_clock[cpu].pvti; - pvclock_read_wallclock(&wall_clock, vcpu_time, now); - - put_cpu(); -} - -static int kvm_set_wallclock(const struct timespec *now) -{ - return -1; -} - -static cycle_t kvm_clock_read(void) -{ - struct pvclock_vcpu_time_info *src; - cycle_t ret; - int cpu; - - preempt_disable_notrace(); - cpu = smp_processor_id(); - src = &hv_clock[cpu].pvti; - ret = pvclock_clocksource_read(src); - preempt_enable_notrace(); - return ret; -} - -static cycle_t kvm_clock_get_cycles(struct clocksource *cs) -{ - return kvm_clock_read(); -} - -static cycle_t kvm_sched_clock_read(void) -{ - return kvm_clock_read() - kvm_sched_clock_offset; -} - -static inline void kvm_sched_clock_init(bool stable) -{ - if (!stable) { - pv_time_ops.sched_clock = kvm_clock_read; - return; - } - - kvm_sched_clock_offset = kvm_clock_read(); - pv_time_ops.sched_clock = kvm_sched_clock_read; - set_sched_clock_stable(); - - printk(KERN_INFO "kvm-clock: using sched offset of %llu cycles\n", - kvm_sched_clock_offset); - - BUILD_BUG_ON(sizeof(kvm_sched_clock_offset) > - sizeof(((struct pvclock_vcpu_time_info *)NULL)->system_time)); -} - -/* - * If we don't do that, there is the possibility that the guest - * will calibrate under heavy load - thus, getting a lower lpj - - * and execute the delays themselves without load. This is wrong, - * because no delay loop can finish beforehand. - * Any heuristics is subject to fail, because ultimately, a large - * poll of guests can be running and trouble each other. 
So we preset - * lpj here - */ -static unsigned long kvm_get_tsc_khz(void) -{ - struct pvclock_vcpu_time_info *src; - int cpu; - unsigned long tsc_khz; - - cpu = get_cpu(); - src = &hv_clock[cpu].pvti; - tsc_khz = pvclock_tsc_khz(src); - put_cpu(); - return tsc_khz; -} - -static void kvm_get_preset_lpj(void) -{ - unsigned long khz; - u64 lpj; - - khz = kvm_get_tsc_khz(); - - lpj = ((u64)khz * 1000); - do_div(lpj, HZ); - preset_lpj = lpj; -} - -bool kvm_check_and_clear_guest_paused(void) -{ - bool ret = false; - struct pvclock_vcpu_time_info *src; - int cpu = smp_processor_id(); - - if (!hv_clock) - return ret; - - src = &hv_clock[cpu].pvti; - if ((src->flags & PVCLOCK_GUEST_STOPPED) != 0) { - src->flags &= ~PVCLOCK_GUEST_STOPPED; - pvclock_touch_watchdogs(); - ret = true; - } - - return ret; -} - -static struct clocksource kvm_clock = { - .name = "kvm-clock", - .read = kvm_clock_get_cycles, - .rating = 400, - .mask = CLOCKSOURCE_MASK(64), - .flags = CLOCK_SOURCE_IS_CONTINUOUS, -}; - -int kvm_register_clock(char *txt) -{ - int cpu = smp_processor_id(); - int low, high, ret; - struct pvclock_vcpu_time_info *src; - - if (!hv_clock) - return 0; - - src = &hv_clock[cpu].pvti; - low = (int)slow_virt_to_phys(src) | 1; - high = ((u64)slow_virt_to_phys(src) >> 32); - ret = native_write_msr_safe(msr_kvm_system_time, low, high); - printk(KERN_INFO "kvm-clock: cpu %d, msr %x:%x, %s\n", - cpu, high, low, txt); - - return ret; -} - -static void kvm_save_sched_clock_state(void) -{ -} - -static void kvm_restore_sched_clock_state(void) -{ - kvm_register_clock("primary cpu clock, resume"); -} - -#ifdef CONFIG_X86_LOCAL_APIC -static void kvm_setup_secondary_clock(void) -{ - /* - * Now that the first cpu already had this clocksource initialized, - * we shouldn't fail. - */ - WARN_ON(kvm_register_clock("secondary cpu clock")); -} -#endif - -/* - * After the clock is registered, the host will keep writing to the - * registered memory location. If the guest happens to shutdown, this memory - * won't be valid. In cases like kexec, in which you install a new kernel, this - * means a random memory location will be kept being written. 
So before any - * kind of shutdown from our side, we unregister the clock by writing anything - * that does not have the 'enable' bit set in the msr - */ -#ifdef CONFIG_KEXEC_CORE -static void kvm_crash_shutdown(struct pt_regs *regs) -{ - native_write_msr(msr_kvm_system_time, 0, 0); - kvm_disable_steal_time(); - native_machine_crash_shutdown(regs); -} -#endif - -static void kvm_shutdown(void) -{ - native_write_msr(msr_kvm_system_time, 0, 0); - kvm_disable_steal_time(); - native_machine_shutdown(); -} - -void __init kvmclock_init(void) -{ - struct pvclock_vcpu_time_info *vcpu_time; - unsigned long mem; - int size, cpu; - u8 flags; - - size = PAGE_ALIGN(sizeof(struct pvclock_vsyscall_time_info)*NR_CPUS); - - if (!kvm_para_available()) - return; - - if (kvmclock && kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE2)) { - msr_kvm_system_time = MSR_KVM_SYSTEM_TIME_NEW; - msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK_NEW; - } else if (!(kvmclock && kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE))) - return; - - printk(KERN_INFO "kvm-clock: Using msrs %x and %x", - msr_kvm_system_time, msr_kvm_wall_clock); - - mem = memblock_alloc(size, PAGE_SIZE); - if (!mem) - return; - hv_clock = __va(mem); - memset(hv_clock, 0, size); - - if (kvm_register_clock("primary cpu clock")) { - hv_clock = NULL; - memblock_free(mem, size); - return; - } - - if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE_STABLE_BIT)) - pvclock_set_flags(PVCLOCK_TSC_STABLE_BIT); - - cpu = get_cpu(); - vcpu_time = &hv_clock[cpu].pvti; - flags = pvclock_read_flags(vcpu_time); - - kvm_sched_clock_init(flags & PVCLOCK_TSC_STABLE_BIT); - put_cpu(); - - x86_platform.calibrate_tsc = kvm_get_tsc_khz; - x86_platform.calibrate_cpu = kvm_get_tsc_khz; - x86_platform.get_wallclock = kvm_get_wallclock; - x86_platform.set_wallclock = kvm_set_wallclock; -#ifdef CONFIG_X86_LOCAL_APIC - x86_cpuinit.early_percpu_clock_init = - kvm_setup_secondary_clock; -#endif - x86_platform.save_sched_clock_state = kvm_save_sched_clock_state; - x86_platform.restore_sched_clock_state = kvm_restore_sched_clock_state; - machine_ops.shutdown = kvm_shutdown; -#ifdef CONFIG_KEXEC_CORE - machine_ops.crash_shutdown = kvm_crash_shutdown; -#endif - kvm_get_preset_lpj(); - clocksource_register_hz(&kvm_clock, NSEC_PER_SEC); - pv_info.name = "KVM"; -} - -int __init kvm_setup_vsyscall_timeinfo(void) -{ -#ifdef CONFIG_X86_64 - int cpu; - u8 flags; - struct pvclock_vcpu_time_info *vcpu_time; - unsigned int size; - - if (!hv_clock) - return 0; - - size = PAGE_ALIGN(sizeof(struct pvclock_vsyscall_time_info)*NR_CPUS); - - cpu = get_cpu(); - - vcpu_time = &hv_clock[cpu].pvti; - flags = pvclock_read_flags(vcpu_time); - - if (!(flags & PVCLOCK_TSC_STABLE_BIT)) { - put_cpu(); - return 1; - } - - put_cpu(); - - kvm_clock.archdata.vclock_mode = VCLOCK_PVCLOCK; -#endif - return 0; -} diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig deleted file mode 100644 index ab8e32f..0000000 --- a/arch/x86/kvm/Kconfig +++ /dev/null @@ -1,106 +0,0 @@ -# -# KVM configuration -# - -source "virt/kvm/Kconfig" - -menuconfig VIRTUALIZATION - bool "Virtualization" - depends on HAVE_KVM || X86 - default y - ---help--- - Say Y here to get to see options for using your Linux host to run other - operating systems inside virtual machines (guests). - This option alone does not add any kernel code. - - If you say N, all options in this submenu will be skipped and disabled. 
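Editor's note on the kvmclock code deleted above: the guest/host contract is an MSR handshake. The guest writes the guest-physical address of a pvclock_vcpu_time_info area to MSR_KVM_SYSTEM_TIME, with bit 0 doubling as the enable flag; the host then keeps updating that memory, so the guest must write 0 before kexec or shutdown to stop the host writing into memory about to be reused. The sketch below condenses the kvm_register_clock()/kvm_shutdown() paths above, assuming the same msr_kvm_system_time and hv_clock variables (the function names are ours, for illustration only):

static int register_this_cpu_clock(void)
{
	struct pvclock_vcpu_time_info *src = &hv_clock[smp_processor_id()].pvti;
	u64 pa = slow_virt_to_phys(src) | 1;            /* bit 0 = enable */

	return native_write_msr_safe(msr_kvm_system_time,
				     (u32)pa, (u32)(pa >> 32));
}

static void unregister_this_cpu_clock(void)
{
	/* Clear the enable bit so the host stops writing to the old area. */
	native_write_msr(msr_kvm_system_time, 0, 0);
}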
- -if VIRTUALIZATION - -config KVM - tristate "Kernel-based Virtual Machine (KVM) support" - depends on HAVE_KVM - depends on HIGH_RES_TIMERS - # for TASKSTATS/TASK_DELAY_ACCT: - depends on NET - select PREEMPT_NOTIFIERS - select MMU_NOTIFIER - select ANON_INODES - select HAVE_KVM_IRQCHIP - select HAVE_KVM_IRQFD - select IRQ_BYPASS_MANAGER - select HAVE_KVM_IRQ_BYPASS - select HAVE_KVM_IRQ_ROUTING - select HAVE_KVM_EVENTFD - select KVM_ASYNC_PF - select USER_RETURN_NOTIFIER - select KVM_MMIO - select TASKSTATS - select TASK_DELAY_ACCT - select PERF_EVENTS - select HAVE_KVM_MSI - select HAVE_KVM_CPU_RELAX_INTERCEPT - select KVM_GENERIC_DIRTYLOG_READ_PROTECT - select KVM_VFIO - select SRCU - ---help--- - Support hosting fully virtualized guest machines using hardware - virtualization extensions. You will need a fairly recent - processor equipped with virtualization extensions. You will also - need to select one or more of the processor modules below. - - This module provides access to the hardware capabilities through - a character device node named /dev/kvm. - - To compile this as a module, choose M here: the module - will be called kvm. - - If unsure, say N. - -config KVM_INTEL - tristate "KVM for Intel processors support" - depends on KVM - # for perf_guest_get_msrs(): - depends on CPU_SUP_INTEL - ---help--- - Provides support for KVM on Intel processors equipped with the VT - extensions. - - To compile this as a module, choose M here: the module - will be called kvm-intel. - -config KVM_AMD - tristate "KVM for AMD processors support" - depends on KVM - ---help--- - Provides support for KVM on AMD processors equipped with the AMD-V - (SVM) extensions. - - To compile this as a module, choose M here: the module - will be called kvm-amd. - -config KVM_MMU_AUDIT - bool "Audit KVM MMU" - depends on KVM && TRACEPOINTS - ---help--- - This option adds a R/W kVM module parameter 'mmu_audit', which allows - auditing of KVM MMU events at runtime. - -config KVM_DEVICE_ASSIGNMENT - bool "KVM legacy PCI device assignment support (DEPRECATED)" - depends on KVM && PCI && IOMMU_API - default n - ---help--- - Provide support for legacy PCI device assignment through KVM. The - kernel now also supports a full featured userspace device driver - framework through VFIO, which supersedes this support and provides - better security. - - If unsure, say N. - -# OK, it's a little counter-intuitive to do this, but it puts it neatly under -# the virtualization menu. -source drivers/vhost/Kconfig -source drivers/lguest/Kconfig - -endif # VIRTUALIZATION diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile deleted file mode 100644 index 3bff207..0000000 --- a/arch/x86/kvm/Makefile +++ /dev/null @@ -1,25 +0,0 @@ - -ccflags-y += -Iarch/x86/kvm - -CFLAGS_x86.o := -I. -CFLAGS_svm.o := -I. -CFLAGS_vmx.o := -I. 
- -KVM := ../../../virt/kvm - -kvm-y += $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o \ - $(KVM)/eventfd.o $(KVM)/irqchip.o $(KVM)/vfio.o -kvm-$(CONFIG_KVM_ASYNC_PF) += $(KVM)/async_pf.o - -kvm-y += x86.o mmu.o emulate.o i8259.o irq.o lapic.o \ - i8254.o ioapic.o irq_comm.o cpuid.o pmu.o mtrr.o \ - hyperv.o page_track.o debugfs.o - -kvm-$(CONFIG_KVM_DEVICE_ASSIGNMENT) += assigned-dev.o iommu.o - -kvm-intel-y += vmx.o pmu_intel.o -kvm-amd-y += svm.o pmu_amd.o - -obj-$(CONFIG_KVM) += kvm.o -obj-$(CONFIG_KVM_INTEL) += kvm-intel.o -obj-$(CONFIG_KVM_AMD) += kvm-amd.o diff --git a/arch/x86/kvm/assigned-dev.c b/arch/x86/kvm/assigned-dev.c deleted file mode 100644 index 308b859..0000000 --- a/arch/x86/kvm/assigned-dev.c +++ /dev/null @@ -1,1058 +0,0 @@ -/* - * Kernel-based Virtual Machine - device assignment support - * - * Copyright (C) 2010 Red Hat, Inc. and/or its affiliates. - * - * This work is licensed under the terms of the GNU GPL, version 2. See - * the COPYING file in the top-level directory. - * - */ - -#include <linux/kvm_host.h> -#include <linux/kvm.h> -#include <linux/uaccess.h> -#include <linux/vmalloc.h> -#include <linux/errno.h> -#include <linux/spinlock.h> -#include <linux/pci.h> -#include <linux/interrupt.h> -#include <linux/slab.h> -#include <linux/namei.h> -#include <linux/fs.h> -#include "irq.h" -#include "assigned-dev.h" -#include "trace/events/kvm.h" - -struct kvm_assigned_dev_kernel { - struct kvm_irq_ack_notifier ack_notifier; - struct list_head list; - int assigned_dev_id; - int host_segnr; - int host_busnr; - int host_devfn; - unsigned int entries_nr; - int host_irq; - bool host_irq_disabled; - bool pci_2_3; - struct msix_entry *host_msix_entries; - int guest_irq; - struct msix_entry *guest_msix_entries; - unsigned long irq_requested_type; - int irq_source_id; - int flags; - struct pci_dev *dev; - struct kvm *kvm; - spinlock_t intx_lock; - spinlock_t intx_mask_lock; - char irq_name[32]; - struct pci_saved_state *pci_saved_state; -}; - -static struct kvm_assigned_dev_kernel *kvm_find_assigned_dev(struct list_head *head, - int assigned_dev_id) -{ - struct kvm_assigned_dev_kernel *match; - - list_for_each_entry(match, head, list) { - if (match->assigned_dev_id == assigned_dev_id) - return match; - } - return NULL; -} - -static int find_index_from_host_irq(struct kvm_assigned_dev_kernel - *assigned_dev, int irq) -{ - int i, index; - struct msix_entry *host_msix_entries; - - host_msix_entries = assigned_dev->host_msix_entries; - - index = -1; - for (i = 0; i < assigned_dev->entries_nr; i++) - if (irq == host_msix_entries[i].vector) { - index = i; - break; - } - if (index < 0) - printk(KERN_WARNING "Fail to find correlated MSI-X entry!\n"); - - return index; -} - -static irqreturn_t kvm_assigned_dev_intx(int irq, void *dev_id) -{ - struct kvm_assigned_dev_kernel *assigned_dev = dev_id; - int ret; - - spin_lock(&assigned_dev->intx_lock); - if (pci_check_and_mask_intx(assigned_dev->dev)) { - assigned_dev->host_irq_disabled = true; - ret = IRQ_WAKE_THREAD; - } else - ret = IRQ_NONE; - spin_unlock(&assigned_dev->intx_lock); - - return ret; -} - -static void -kvm_assigned_dev_raise_guest_irq(struct kvm_assigned_dev_kernel *assigned_dev, - int vector) -{ - if (unlikely(assigned_dev->irq_requested_type & - KVM_DEV_IRQ_GUEST_INTX)) { - spin_lock(&assigned_dev->intx_mask_lock); - if (!(assigned_dev->flags & KVM_DEV_ASSIGN_MASK_INTX)) - kvm_set_irq(assigned_dev->kvm, - assigned_dev->irq_source_id, vector, 1, - false); - spin_unlock(&assigned_dev->intx_mask_lock); - } else - 
kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id, - vector, 1, false); -} - -static irqreturn_t kvm_assigned_dev_thread_intx(int irq, void *dev_id) -{ - struct kvm_assigned_dev_kernel *assigned_dev = dev_id; - - if (!(assigned_dev->flags & KVM_DEV_ASSIGN_PCI_2_3)) { - spin_lock_irq(&assigned_dev->intx_lock); - disable_irq_nosync(irq); - assigned_dev->host_irq_disabled = true; - spin_unlock_irq(&assigned_dev->intx_lock); - } - - kvm_assigned_dev_raise_guest_irq(assigned_dev, - assigned_dev->guest_irq); - - return IRQ_HANDLED; -} - -/* - * Deliver an IRQ in an atomic context if we can, or return a failure, - * user can retry in a process context. - * Return value: - * -EWOULDBLOCK - Can't deliver in atomic context: retry in a process context. - * Other values - No need to retry. - */ -static int kvm_set_irq_inatomic(struct kvm *kvm, int irq_source_id, u32 irq, - int level) -{ - struct kvm_kernel_irq_routing_entry entries[KVM_NR_IRQCHIPS]; - struct kvm_kernel_irq_routing_entry *e; - int ret = -EINVAL; - int idx; - - trace_kvm_set_irq(irq, level, irq_source_id); - - /* - * Injection into either PIC or IOAPIC might need to scan all CPUs, - * which would need to be retried from thread context; when same GSI - * is connected to both PIC and IOAPIC, we'd have to report a - * partial failure here. - * Since there's no easy way to do this, we only support injecting MSI - * which is limited to 1:1 GSI mapping. - */ - idx = srcu_read_lock(&kvm->irq_srcu); - if (kvm_irq_map_gsi(kvm, entries, irq) > 0) { - e = &entries[0]; - ret = kvm_arch_set_irq_inatomic(e, kvm, irq_source_id, - irq, level); - } - srcu_read_unlock(&kvm->irq_srcu, idx); - return ret; -} - - -static irqreturn_t kvm_assigned_dev_msi(int irq, void *dev_id) -{ - struct kvm_assigned_dev_kernel *assigned_dev = dev_id; - int ret = kvm_set_irq_inatomic(assigned_dev->kvm, - assigned_dev->irq_source_id, - assigned_dev->guest_irq, 1); - return unlikely(ret == -EWOULDBLOCK) ? IRQ_WAKE_THREAD : IRQ_HANDLED; -} - -static irqreturn_t kvm_assigned_dev_thread_msi(int irq, void *dev_id) -{ - struct kvm_assigned_dev_kernel *assigned_dev = dev_id; - - kvm_assigned_dev_raise_guest_irq(assigned_dev, - assigned_dev->guest_irq); - - return IRQ_HANDLED; -} - -static irqreturn_t kvm_assigned_dev_msix(int irq, void *dev_id) -{ - struct kvm_assigned_dev_kernel *assigned_dev = dev_id; - int index = find_index_from_host_irq(assigned_dev, irq); - u32 vector; - int ret = 0; - - if (index >= 0) { - vector = assigned_dev->guest_msix_entries[index].vector; - ret = kvm_set_irq_inatomic(assigned_dev->kvm, - assigned_dev->irq_source_id, - vector, 1); - } - - return unlikely(ret == -EWOULDBLOCK) ? 
IRQ_WAKE_THREAD : IRQ_HANDLED; -} - -static irqreturn_t kvm_assigned_dev_thread_msix(int irq, void *dev_id) -{ - struct kvm_assigned_dev_kernel *assigned_dev = dev_id; - int index = find_index_from_host_irq(assigned_dev, irq); - u32 vector; - - if (index >= 0) { - vector = assigned_dev->guest_msix_entries[index].vector; - kvm_assigned_dev_raise_guest_irq(assigned_dev, vector); - } - - return IRQ_HANDLED; -} - -/* Ack the irq line for an assigned device */ -static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian) -{ - struct kvm_assigned_dev_kernel *dev = - container_of(kian, struct kvm_assigned_dev_kernel, - ack_notifier); - - kvm_set_irq(dev->kvm, dev->irq_source_id, dev->guest_irq, 0, false); - - spin_lock(&dev->intx_mask_lock); - - if (!(dev->flags & KVM_DEV_ASSIGN_MASK_INTX)) { - bool reassert = false; - - spin_lock_irq(&dev->intx_lock); - /* - * The guest IRQ may be shared so this ack can come from an - * IRQ for another guest device. - */ - if (dev->host_irq_disabled) { - if (!(dev->flags & KVM_DEV_ASSIGN_PCI_2_3)) - enable_irq(dev->host_irq); - else if (!pci_check_and_unmask_intx(dev->dev)) - reassert = true; - dev->host_irq_disabled = reassert; - } - spin_unlock_irq(&dev->intx_lock); - - if (reassert) - kvm_set_irq(dev->kvm, dev->irq_source_id, - dev->guest_irq, 1, false); - } - - spin_unlock(&dev->intx_mask_lock); -} - -static void deassign_guest_irq(struct kvm *kvm, - struct kvm_assigned_dev_kernel *assigned_dev) -{ - if (assigned_dev->ack_notifier.gsi != -1) - kvm_unregister_irq_ack_notifier(kvm, - &assigned_dev->ack_notifier); - - kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id, - assigned_dev->guest_irq, 0, false); - - if (assigned_dev->irq_source_id != -1) - kvm_free_irq_source_id(kvm, assigned_dev->irq_source_id); - assigned_dev->irq_source_id = -1; - assigned_dev->irq_requested_type &= ~(KVM_DEV_IRQ_GUEST_MASK); -} - -/* The function implicit hold kvm->lock mutex due to cancel_work_sync() */ -static void deassign_host_irq(struct kvm *kvm, - struct kvm_assigned_dev_kernel *assigned_dev) -{ - /* - * We disable irq here to prevent further events. - * - * Notice this maybe result in nested disable if the interrupt type is - * INTx, but it's OK for we are going to free it. - * - * If this function is a part of VM destroy, please ensure that till - * now, the kvm state is still legal for probably we also have to wait - * on a currently running IRQ handler. 
- */ - if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) { - int i; - for (i = 0; i < assigned_dev->entries_nr; i++) - disable_irq(assigned_dev->host_msix_entries[i].vector); - - for (i = 0; i < assigned_dev->entries_nr; i++) - free_irq(assigned_dev->host_msix_entries[i].vector, - assigned_dev); - - assigned_dev->entries_nr = 0; - kfree(assigned_dev->host_msix_entries); - kfree(assigned_dev->guest_msix_entries); - pci_disable_msix(assigned_dev->dev); - } else { - /* Deal with MSI and INTx */ - if ((assigned_dev->irq_requested_type & - KVM_DEV_IRQ_HOST_INTX) && - (assigned_dev->flags & KVM_DEV_ASSIGN_PCI_2_3)) { - spin_lock_irq(&assigned_dev->intx_lock); - pci_intx(assigned_dev->dev, false); - spin_unlock_irq(&assigned_dev->intx_lock); - synchronize_irq(assigned_dev->host_irq); - } else - disable_irq(assigned_dev->host_irq); - - free_irq(assigned_dev->host_irq, assigned_dev); - - if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSI) - pci_disable_msi(assigned_dev->dev); - } - - assigned_dev->irq_requested_type &= ~(KVM_DEV_IRQ_HOST_MASK); -} - -static int kvm_deassign_irq(struct kvm *kvm, - struct kvm_assigned_dev_kernel *assigned_dev, - unsigned long irq_requested_type) -{ - unsigned long guest_irq_type, host_irq_type; - - if (!irqchip_in_kernel(kvm)) - return -EINVAL; - /* no irq assignment to deassign */ - if (!assigned_dev->irq_requested_type) - return -ENXIO; - - host_irq_type = irq_requested_type & KVM_DEV_IRQ_HOST_MASK; - guest_irq_type = irq_requested_type & KVM_DEV_IRQ_GUEST_MASK; - - if (host_irq_type) - deassign_host_irq(kvm, assigned_dev); - if (guest_irq_type) - deassign_guest_irq(kvm, assigned_dev); - - return 0; -} - -static void kvm_free_assigned_irq(struct kvm *kvm, - struct kvm_assigned_dev_kernel *assigned_dev) -{ - kvm_deassign_irq(kvm, assigned_dev, assigned_dev->irq_requested_type); -} - -static void kvm_free_assigned_device(struct kvm *kvm, - struct kvm_assigned_dev_kernel - *assigned_dev) -{ - kvm_free_assigned_irq(kvm, assigned_dev); - - pci_reset_function(assigned_dev->dev); - if (pci_load_and_free_saved_state(assigned_dev->dev, - &assigned_dev->pci_saved_state)) - printk(KERN_INFO "%s: Couldn't reload %s saved state\n", - __func__, dev_name(&assigned_dev->dev->dev)); - else - pci_restore_state(assigned_dev->dev); - - pci_clear_dev_assigned(assigned_dev->dev); - - pci_release_regions(assigned_dev->dev); - pci_disable_device(assigned_dev->dev); - pci_dev_put(assigned_dev->dev); - - list_del(&assigned_dev->list); - kfree(assigned_dev); -} - -void kvm_free_all_assigned_devices(struct kvm *kvm) -{ - struct kvm_assigned_dev_kernel *assigned_dev, *tmp; - - list_for_each_entry_safe(assigned_dev, tmp, - &kvm->arch.assigned_dev_head, list) { - kvm_free_assigned_device(kvm, assigned_dev); - } -} - -static int assigned_device_enable_host_intx(struct kvm *kvm, - struct kvm_assigned_dev_kernel *dev) -{ - irq_handler_t irq_handler; - unsigned long flags; - - dev->host_irq = dev->dev->irq; - - /* - * We can only share the IRQ line with other host devices if we are - * able to disable the IRQ source at device-level - independently of - * the guest driver. Otherwise host devices may suffer from unbounded - * IRQ latencies when the guest keeps the line asserted. 
- */ - if (dev->flags & KVM_DEV_ASSIGN_PCI_2_3) { - irq_handler = kvm_assigned_dev_intx; - flags = IRQF_SHARED; - } else { - irq_handler = NULL; - flags = IRQF_ONESHOT; - } - if (request_threaded_irq(dev->host_irq, irq_handler, - kvm_assigned_dev_thread_intx, flags, - dev->irq_name, dev)) - return -EIO; - - if (dev->flags & KVM_DEV_ASSIGN_PCI_2_3) { - spin_lock_irq(&dev->intx_lock); - pci_intx(dev->dev, true); - spin_unlock_irq(&dev->intx_lock); - } - return 0; -} - -static int assigned_device_enable_host_msi(struct kvm *kvm, - struct kvm_assigned_dev_kernel *dev) -{ - int r; - - if (!dev->dev->msi_enabled) { - r = pci_enable_msi(dev->dev); - if (r) - return r; - } - - dev->host_irq = dev->dev->irq; - if (request_threaded_irq(dev->host_irq, kvm_assigned_dev_msi, - kvm_assigned_dev_thread_msi, 0, - dev->irq_name, dev)) { - pci_disable_msi(dev->dev); - return -EIO; - } - - return 0; -} - -static int assigned_device_enable_host_msix(struct kvm *kvm, - struct kvm_assigned_dev_kernel *dev) -{ - int i, r = -EINVAL; - - /* host_msix_entries and guest_msix_entries should have been - * initialized */ - if (dev->entries_nr == 0) - return r; - - r = pci_enable_msix_exact(dev->dev, - dev->host_msix_entries, dev->entries_nr); - if (r) - return r; - - for (i = 0; i < dev->entries_nr; i++) { - r = request_threaded_irq(dev->host_msix_entries[i].vector, - kvm_assigned_dev_msix, - kvm_assigned_dev_thread_msix, - 0, dev->irq_name, dev); - if (r) - goto err; - } - - return 0; -err: - for (i -= 1; i >= 0; i--) - free_irq(dev->host_msix_entries[i].vector, dev); - pci_disable_msix(dev->dev); - return r; -} - -static int assigned_device_enable_guest_intx(struct kvm *kvm, - struct kvm_assigned_dev_kernel *dev, - struct kvm_assigned_irq *irq) -{ - dev->guest_irq = irq->guest_irq; - dev->ack_notifier.gsi = irq->guest_irq; - return 0; -} - -static int assigned_device_enable_guest_msi(struct kvm *kvm, - struct kvm_assigned_dev_kernel *dev, - struct kvm_assigned_irq *irq) -{ - dev->guest_irq = irq->guest_irq; - dev->ack_notifier.gsi = -1; - return 0; -} - -static int assigned_device_enable_guest_msix(struct kvm *kvm, - struct kvm_assigned_dev_kernel *dev, - struct kvm_assigned_irq *irq) -{ - dev->guest_irq = irq->guest_irq; - dev->ack_notifier.gsi = -1; - return 0; -} - -static int assign_host_irq(struct kvm *kvm, - struct kvm_assigned_dev_kernel *dev, - __u32 host_irq_type) -{ - int r = -EEXIST; - - if (dev->irq_requested_type & KVM_DEV_IRQ_HOST_MASK) - return r; - - snprintf(dev->irq_name, sizeof(dev->irq_name), "kvm:%s", - pci_name(dev->dev)); - - switch (host_irq_type) { - case KVM_DEV_IRQ_HOST_INTX: - r = assigned_device_enable_host_intx(kvm, dev); - break; - case KVM_DEV_IRQ_HOST_MSI: - r = assigned_device_enable_host_msi(kvm, dev); - break; - case KVM_DEV_IRQ_HOST_MSIX: - r = assigned_device_enable_host_msix(kvm, dev); - break; - default: - r = -EINVAL; - } - dev->host_irq_disabled = false; - - if (!r) - dev->irq_requested_type |= host_irq_type; - - return r; -} - -static int assign_guest_irq(struct kvm *kvm, - struct kvm_assigned_dev_kernel *dev, - struct kvm_assigned_irq *irq, - unsigned long guest_irq_type) -{ - int id; - int r = -EEXIST; - - if (dev->irq_requested_type & KVM_DEV_IRQ_GUEST_MASK) - return r; - - id = kvm_request_irq_source_id(kvm); - if (id < 0) - return id; - - dev->irq_source_id = id; - - switch (guest_irq_type) { - case KVM_DEV_IRQ_GUEST_INTX: - r = assigned_device_enable_guest_intx(kvm, dev, irq); - break; - case KVM_DEV_IRQ_GUEST_MSI: - r = assigned_device_enable_guest_msi(kvm, dev, irq); 
- break; - case KVM_DEV_IRQ_GUEST_MSIX: - r = assigned_device_enable_guest_msix(kvm, dev, irq); - break; - default: - r = -EINVAL; - } - - if (!r) { - dev->irq_requested_type |= guest_irq_type; - if (dev->ack_notifier.gsi != -1) - kvm_register_irq_ack_notifier(kvm, &dev->ack_notifier); - } else { - kvm_free_irq_source_id(kvm, dev->irq_source_id); - dev->irq_source_id = -1; - } - - return r; -} - -/* TODO Deal with KVM_DEV_IRQ_ASSIGNED_MASK_MSIX */ -static int kvm_vm_ioctl_assign_irq(struct kvm *kvm, - struct kvm_assigned_irq *assigned_irq) -{ - int r = -EINVAL; - struct kvm_assigned_dev_kernel *match; - unsigned long host_irq_type, guest_irq_type; - - if (!irqchip_in_kernel(kvm)) - return r; - - mutex_lock(&kvm->lock); - r = -ENODEV; - match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, - assigned_irq->assigned_dev_id); - if (!match) - goto out; - - host_irq_type = (assigned_irq->flags & KVM_DEV_IRQ_HOST_MASK); - guest_irq_type = (assigned_irq->flags & KVM_DEV_IRQ_GUEST_MASK); - - r = -EINVAL; - /* can only assign one type at a time */ - if (hweight_long(host_irq_type) > 1) - goto out; - if (hweight_long(guest_irq_type) > 1) - goto out; - if (host_irq_type == 0 && guest_irq_type == 0) - goto out; - - r = 0; - if (host_irq_type) - r = assign_host_irq(kvm, match, host_irq_type); - if (r) - goto out; - - if (guest_irq_type) - r = assign_guest_irq(kvm, match, assigned_irq, guest_irq_type); -out: - mutex_unlock(&kvm->lock); - return r; -} - -static int kvm_vm_ioctl_deassign_dev_irq(struct kvm *kvm, - struct kvm_assigned_irq - *assigned_irq) -{ - int r = -ENODEV; - struct kvm_assigned_dev_kernel *match; - unsigned long irq_type; - - mutex_lock(&kvm->lock); - - match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, - assigned_irq->assigned_dev_id); - if (!match) - goto out; - - irq_type = assigned_irq->flags & (KVM_DEV_IRQ_HOST_MASK | - KVM_DEV_IRQ_GUEST_MASK); - r = kvm_deassign_irq(kvm, match, irq_type); -out: - mutex_unlock(&kvm->lock); - return r; -} - -/* - * We want to test whether the caller has been granted permissions to - * use this device. To be able to configure and control the device, - * the user needs access to PCI configuration space and BAR resources. - * These are accessed through PCI sysfs. PCI config space is often - * passed to the process calling this ioctl via file descriptor, so we - * can't rely on access to that file. We can check for permissions - * on each of the BAR resource files, which is a pretty clear - * indicator that the user has been granted access to the device. 
- */ -static int probe_sysfs_permissions(struct pci_dev *dev) -{ -#ifdef CONFIG_SYSFS - int i; - bool bar_found = false; - - for (i = PCI_STD_RESOURCES; i <= PCI_STD_RESOURCE_END; i++) { - char *kpath, *syspath; - struct path path; - struct inode *inode; - int r; - - if (!pci_resource_len(dev, i)) - continue; - - kpath = kobject_get_path(&dev->dev.kobj, GFP_KERNEL); - if (!kpath) - return -ENOMEM; - - /* Per sysfs-rules, sysfs is always at /sys */ - syspath = kasprintf(GFP_KERNEL, "/sys%s/resource%d", kpath, i); - kfree(kpath); - if (!syspath) - return -ENOMEM; - - r = kern_path(syspath, LOOKUP_FOLLOW, &path); - kfree(syspath); - if (r) - return r; - - inode = d_backing_inode(path.dentry); - - r = inode_permission(inode, MAY_READ | MAY_WRITE | MAY_ACCESS); - path_put(&path); - if (r) - return r; - - bar_found = true; - } - - /* If no resources, probably something special */ - if (!bar_found) - return -EPERM; - - return 0; -#else - return -EINVAL; /* No way to control the device without sysfs */ -#endif -} - -static int kvm_vm_ioctl_assign_device(struct kvm *kvm, - struct kvm_assigned_pci_dev *assigned_dev) -{ - int r = 0, idx; - struct kvm_assigned_dev_kernel *match; - struct pci_dev *dev; - - if (!(assigned_dev->flags & KVM_DEV_ASSIGN_ENABLE_IOMMU)) - return -EINVAL; - - mutex_lock(&kvm->lock); - idx = srcu_read_lock(&kvm->srcu); - - match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, - assigned_dev->assigned_dev_id); - if (match) { - /* device already assigned */ - r = -EEXIST; - goto out; - } - - match = kzalloc(sizeof(struct kvm_assigned_dev_kernel), GFP_KERNEL); - if (match == NULL) { - printk(KERN_INFO "%s: Couldn't allocate memory\n", - __func__); - r = -ENOMEM; - goto out; - } - dev = pci_get_domain_bus_and_slot(assigned_dev->segnr, - assigned_dev->busnr, - assigned_dev->devfn); - if (!dev) { - printk(KERN_INFO "%s: host device not found\n", __func__); - r = -EINVAL; - goto out_free; - } - - /* Don't allow bridges to be assigned */ - if (dev->hdr_type != PCI_HEADER_TYPE_NORMAL) { - r = -EPERM; - goto out_put; - } - - r = probe_sysfs_permissions(dev); - if (r) - goto out_put; - - if (pci_enable_device(dev)) { - printk(KERN_INFO "%s: Could not enable PCI device\n", __func__); - r = -EBUSY; - goto out_put; - } - r = pci_request_regions(dev, "kvm_assigned_device"); - if (r) { - printk(KERN_INFO "%s: Could not get access to device regions\n", - __func__); - goto out_disable; - } - - pci_reset_function(dev); - pci_save_state(dev); - match->pci_saved_state = pci_store_saved_state(dev); - if (!match->pci_saved_state) - printk(KERN_DEBUG "%s: Couldn't store %s saved state\n", - __func__, dev_name(&dev->dev)); - - if (!pci_intx_mask_supported(dev)) - assigned_dev->flags &= ~KVM_DEV_ASSIGN_PCI_2_3; - - match->assigned_dev_id = assigned_dev->assigned_dev_id; - match->host_segnr = assigned_dev->segnr; - match->host_busnr = assigned_dev->busnr; - match->host_devfn = assigned_dev->devfn; - match->flags = assigned_dev->flags; - match->dev = dev; - spin_lock_init(&match->intx_lock); - spin_lock_init(&match->intx_mask_lock); - match->irq_source_id = -1; - match->kvm = kvm; - match->ack_notifier.irq_acked = kvm_assigned_dev_ack_irq; - - list_add(&match->list, &kvm->arch.assigned_dev_head); - - if (!kvm->arch.iommu_domain) { - r = kvm_iommu_map_guest(kvm); - if (r) - goto out_list_del; - } - r = kvm_assign_device(kvm, match->dev); - if (r) - goto out_list_del; - -out: - srcu_read_unlock(&kvm->srcu, idx); - mutex_unlock(&kvm->lock); - return r; -out_list_del: - if 
(pci_load_and_free_saved_state(dev, &match->pci_saved_state)) - printk(KERN_INFO "%s: Couldn't reload %s saved state\n", - __func__, dev_name(&dev->dev)); - list_del(&match->list); - pci_release_regions(dev); -out_disable: - pci_disable_device(dev); -out_put: - pci_dev_put(dev); -out_free: - kfree(match); - srcu_read_unlock(&kvm->srcu, idx); - mutex_unlock(&kvm->lock); - return r; -} - -static int kvm_vm_ioctl_deassign_device(struct kvm *kvm, - struct kvm_assigned_pci_dev *assigned_dev) -{ - int r = 0; - struct kvm_assigned_dev_kernel *match; - - mutex_lock(&kvm->lock); - - match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, - assigned_dev->assigned_dev_id); - if (!match) { - printk(KERN_INFO "%s: device hasn't been assigned before, " - "so cannot be deassigned\n", __func__); - r = -EINVAL; - goto out; - } - - kvm_deassign_device(kvm, match->dev); - - kvm_free_assigned_device(kvm, match); - -out: - mutex_unlock(&kvm->lock); - return r; -} - - -static int kvm_vm_ioctl_set_msix_nr(struct kvm *kvm, - struct kvm_assigned_msix_nr *entry_nr) -{ - int r = 0; - struct kvm_assigned_dev_kernel *adev; - - mutex_lock(&kvm->lock); - - adev = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, - entry_nr->assigned_dev_id); - if (!adev) { - r = -EINVAL; - goto msix_nr_out; - } - - if (adev->entries_nr == 0) { - adev->entries_nr = entry_nr->entry_nr; - if (adev->entries_nr == 0 || - adev->entries_nr > KVM_MAX_MSIX_PER_DEV) { - r = -EINVAL; - goto msix_nr_out; - } - - adev->host_msix_entries = kzalloc(sizeof(struct msix_entry) * - entry_nr->entry_nr, - GFP_KERNEL); - if (!adev->host_msix_entries) { - r = -ENOMEM; - goto msix_nr_out; - } - adev->guest_msix_entries = - kzalloc(sizeof(struct msix_entry) * entry_nr->entry_nr, - GFP_KERNEL); - if (!adev->guest_msix_entries) { - kfree(adev->host_msix_entries); - r = -ENOMEM; - goto msix_nr_out; - } - } else /* Not allowed set MSI-X number twice */ - r = -EINVAL; -msix_nr_out: - mutex_unlock(&kvm->lock); - return r; -} - -static int kvm_vm_ioctl_set_msix_entry(struct kvm *kvm, - struct kvm_assigned_msix_entry *entry) -{ - int r = 0, i; - struct kvm_assigned_dev_kernel *adev; - - mutex_lock(&kvm->lock); - - adev = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, - entry->assigned_dev_id); - - if (!adev) { - r = -EINVAL; - goto msix_entry_out; - } - - for (i = 0; i < adev->entries_nr; i++) - if (adev->guest_msix_entries[i].vector == 0 || - adev->guest_msix_entries[i].entry == entry->entry) { - adev->guest_msix_entries[i].entry = entry->entry; - adev->guest_msix_entries[i].vector = entry->gsi; - adev->host_msix_entries[i].entry = entry->entry; - break; - } - if (i == adev->entries_nr) { - r = -ENOSPC; - goto msix_entry_out; - } - -msix_entry_out: - mutex_unlock(&kvm->lock); - - return r; -} - -static int kvm_vm_ioctl_set_pci_irq_mask(struct kvm *kvm, - struct kvm_assigned_pci_dev *assigned_dev) -{ - int r = 0; - struct kvm_assigned_dev_kernel *match; - - mutex_lock(&kvm->lock); - - match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, - assigned_dev->assigned_dev_id); - if (!match) { - r = -ENODEV; - goto out; - } - - spin_lock(&match->intx_mask_lock); - - match->flags &= ~KVM_DEV_ASSIGN_MASK_INTX; - match->flags |= assigned_dev->flags & KVM_DEV_ASSIGN_MASK_INTX; - - if (match->irq_requested_type & KVM_DEV_IRQ_GUEST_INTX) { - if (assigned_dev->flags & KVM_DEV_ASSIGN_MASK_INTX) { - kvm_set_irq(match->kvm, match->irq_source_id, - match->guest_irq, 0, false); - /* - * Masking at hardware-level is performed on demand, - * i.e. 
when an IRQ actually arrives at the host. - */ - } else if (!(assigned_dev->flags & KVM_DEV_ASSIGN_PCI_2_3)) { - /* - * Unmask the IRQ line if required. Unmasking at - * device level will be performed by user space. - */ - spin_lock_irq(&match->intx_lock); - if (match->host_irq_disabled) { - enable_irq(match->host_irq); - match->host_irq_disabled = false; - } - spin_unlock_irq(&match->intx_lock); - } - } - - spin_unlock(&match->intx_mask_lock); - -out: - mutex_unlock(&kvm->lock); - return r; -} - -long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl, - unsigned long arg) -{ - void __user *argp = (void __user *)arg; - int r; - - switch (ioctl) { - case KVM_ASSIGN_PCI_DEVICE: { - struct kvm_assigned_pci_dev assigned_dev; - - r = -EFAULT; - if (copy_from_user(&assigned_dev, argp, sizeof assigned_dev)) - goto out; - r = kvm_vm_ioctl_assign_device(kvm, &assigned_dev); - if (r) - goto out; - break; - } - case KVM_ASSIGN_IRQ: { - r = -EOPNOTSUPP; - break; - } - case KVM_ASSIGN_DEV_IRQ: { - struct kvm_assigned_irq assigned_irq; - - r = -EFAULT; - if (copy_from_user(&assigned_irq, argp, sizeof assigned_irq)) - goto out; - r = kvm_vm_ioctl_assign_irq(kvm, &assigned_irq); - if (r) - goto out; - break; - } - case KVM_DEASSIGN_DEV_IRQ: { - struct kvm_assigned_irq assigned_irq; - - r = -EFAULT; - if (copy_from_user(&assigned_irq, argp, sizeof assigned_irq)) - goto out; - r = kvm_vm_ioctl_deassign_dev_irq(kvm, &assigned_irq); - if (r) - goto out; - break; - } - case KVM_DEASSIGN_PCI_DEVICE: { - struct kvm_assigned_pci_dev assigned_dev; - - r = -EFAULT; - if (copy_from_user(&assigned_dev, argp, sizeof assigned_dev)) - goto out; - r = kvm_vm_ioctl_deassign_device(kvm, &assigned_dev); - if (r) - goto out; - break; - } - case KVM_ASSIGN_SET_MSIX_NR: { - struct kvm_assigned_msix_nr entry_nr; - r = -EFAULT; - if (copy_from_user(&entry_nr, argp, sizeof entry_nr)) - goto out; - r = kvm_vm_ioctl_set_msix_nr(kvm, &entry_nr); - if (r) - goto out; - break; - } - case KVM_ASSIGN_SET_MSIX_ENTRY: { - struct kvm_assigned_msix_entry entry; - r = -EFAULT; - if (copy_from_user(&entry, argp, sizeof entry)) - goto out; - r = kvm_vm_ioctl_set_msix_entry(kvm, &entry); - if (r) - goto out; - break; - } - case KVM_ASSIGN_SET_INTX_MASK: { - struct kvm_assigned_pci_dev assigned_dev; - - r = -EFAULT; - if (copy_from_user(&assigned_dev, argp, sizeof assigned_dev)) - goto out; - r = kvm_vm_ioctl_set_pci_irq_mask(kvm, &assigned_dev); - break; - } - default: - r = -ENOTTY; - break; - } -out: - return r; -} diff --git a/arch/x86/kvm/assigned-dev.h b/arch/x86/kvm/assigned-dev.h deleted file mode 100644 index a428c1a..0000000 --- a/arch/x86/kvm/assigned-dev.h +++ /dev/null @@ -1,32 +0,0 @@ -#ifndef ARCH_X86_KVM_ASSIGNED_DEV_H -#define ARCH_X86_KVM_ASSIGNED_DEV_H - -#include <linux/kvm_host.h> - -#ifdef CONFIG_KVM_DEVICE_ASSIGNMENT -int kvm_assign_device(struct kvm *kvm, struct pci_dev *pdev); -int kvm_deassign_device(struct kvm *kvm, struct pci_dev *pdev); - -int kvm_iommu_map_guest(struct kvm *kvm); -int kvm_iommu_unmap_guest(struct kvm *kvm); - -long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl, - unsigned long arg); - -void kvm_free_all_assigned_devices(struct kvm *kvm); -#else -static inline int kvm_iommu_unmap_guest(struct kvm *kvm) -{ - return 0; -} - -static inline long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl, - unsigned long arg) -{ - return -ENOTTY; -} - -static inline void kvm_free_all_assigned_devices(struct kvm *kvm) {} -#endif /* CONFIG_KVM_DEVICE_ASSIGNMENT */ - -#endif /* 
ARCH_X86_KVM_ASSIGNED_DEV_H */ diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index afa7bbb..ad85822 100644..100755 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -6,6 +6,7 @@ * * Copyright 2011 Red Hat, Inc. and/or its affiliates. * Copyright IBM Corporation, 2008 + * Copyright 2019 Google LLC * * This work is licensed under the terms of the GNU GPL, version 2. See * the COPYING file in the top-level directory. @@ -13,16 +14,9 @@ */ #include <linux/kvm_host.h> -#include <linux/export.h> -#include <linux/vmalloc.h> -#include <linux/uaccess.h> -#include <asm/fpu/internal.h> /* For use_eager_fpu. Ugh! */ -#include <asm/user.h> -#include <asm/fpu/xstate.h> #include "cpuid.h" #include "lapic.h" #include "mmu.h" -#include "trace.h" #include "pmu.h" static u32 xstate_required_size(u64 xstate_bv, bool compacted) @@ -33,7 +27,7 @@ static u32 xstate_required_size(u64 xstate_bv, bool compacted) xstate_bv &= XFEATURE_MASK_EXTEND; while (xstate_bv) { if (xstate_bv & 0x1) { - u32 eax, ebx, ecx, edx, offset; + u32 eax = 0, ebx = 0, ecx, edx, offset; cpuid_count(0xD, feature_bit, &eax, &ebx, &ecx, &edx); offset = compacted ? ret : ebx; ret = max(ret, offset + eax); @@ -51,11 +45,10 @@ bool kvm_mpx_supported(void) return ((host_xcr0 & (XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR)) && kvm_x86_ops->mpx_supported()); } -EXPORT_SYMBOL_GPL(kvm_mpx_supported); u64 kvm_supported_xcr0(void) { - u64 xcr0 = KVM_SUPPORTED_XCR0 & host_xcr0; + u64 xcr0 = GVM_SUPPORTED_XCR0 & host_xcr0; if (!kvm_mpx_supported()) xcr0 &= ~(XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR); @@ -67,7 +60,7 @@ u64 kvm_supported_xcr0(void) int kvm_update_cpuid(struct kvm_vcpu *vcpu) { - struct kvm_cpuid_entry2 *best; + struct kvm_cpuid_entry *best; struct kvm_lapic *apic = vcpu->arch.apic; best = kvm_find_cpuid_entry(vcpu, 1, 0); @@ -114,9 +107,6 @@ int kvm_update_cpuid(struct kvm_vcpu *vcpu) if (best && (best->eax & (F(XSAVES) | F(XSAVEC)))) best->ebx = xstate_required_size(vcpu->arch.xcr0, true); - if (use_eager_fpu()) - kvm_x86_ops->fpu_activate(vcpu); - /* * The existing code assumes virtual address is 48-bit in the canonical * address checks; exit if it is ever changed. 
@@ -144,7 +134,7 @@ static int is_efer_nx(void) static void cpuid_fix_nx_cap(struct kvm_vcpu *vcpu) { int i; - struct kvm_cpuid_entry2 *e, *entry; + struct kvm_cpuid_entry *e, *entry; entry = NULL; for (i = 0; i < vcpu->arch.cpuid_nent; ++i) { @@ -162,7 +152,7 @@ static void cpuid_fix_nx_cap(struct kvm_vcpu *vcpu) int cpuid_query_maxphyaddr(struct kvm_vcpu *vcpu) { - struct kvm_cpuid_entry2 *best; + struct kvm_cpuid_entry *best; best = kvm_find_cpuid_entry(vcpu, 0x80000000, 0); if (!best || best->eax < 0x80000008) @@ -173,65 +163,19 @@ int cpuid_query_maxphyaddr(struct kvm_vcpu *vcpu) not_found: return 36; } -EXPORT_SYMBOL_GPL(cpuid_query_maxphyaddr); -/* when an old userspace process fills a new kernel module */ int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu, - struct kvm_cpuid *cpuid, - struct kvm_cpuid_entry __user *entries) -{ - int r, i; - struct kvm_cpuid_entry *cpuid_entries = NULL; - - r = -E2BIG; - if (cpuid->nent > KVM_MAX_CPUID_ENTRIES) - goto out; - r = -ENOMEM; - if (cpuid->nent) { - cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry) * - cpuid->nent); - if (!cpuid_entries) - goto out; - r = -EFAULT; - if (copy_from_user(cpuid_entries, entries, - cpuid->nent * sizeof(struct kvm_cpuid_entry))) - goto out; - } - for (i = 0; i < cpuid->nent; i++) { - vcpu->arch.cpuid_entries[i].function = cpuid_entries[i].function; - vcpu->arch.cpuid_entries[i].eax = cpuid_entries[i].eax; - vcpu->arch.cpuid_entries[i].ebx = cpuid_entries[i].ebx; - vcpu->arch.cpuid_entries[i].ecx = cpuid_entries[i].ecx; - vcpu->arch.cpuid_entries[i].edx = cpuid_entries[i].edx; - vcpu->arch.cpuid_entries[i].index = 0; - vcpu->arch.cpuid_entries[i].flags = 0; - vcpu->arch.cpuid_entries[i].padding[0] = 0; - vcpu->arch.cpuid_entries[i].padding[1] = 0; - vcpu->arch.cpuid_entries[i].padding[2] = 0; - } - vcpu->arch.cpuid_nent = cpuid->nent; - cpuid_fix_nx_cap(vcpu); - kvm_apic_set_version(vcpu); - kvm_x86_ops->cpuid_update(vcpu); - r = kvm_update_cpuid(vcpu); - -out: - vfree(cpuid_entries); - return r; -} - -int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu, - struct kvm_cpuid2 *cpuid, - struct kvm_cpuid_entry2 __user *entries) + struct kvm_cpuid *cpuid, + struct kvm_cpuid_entry __user *entries) { int r; r = -E2BIG; - if (cpuid->nent > KVM_MAX_CPUID_ENTRIES) + if (cpuid->nent > GVM_MAX_CPUID_ENTRIES) goto out; r = -EFAULT; if (copy_from_user(&vcpu->arch.cpuid_entries, entries, - cpuid->nent * sizeof(struct kvm_cpuid_entry2))) + cpuid->nent * sizeof(struct kvm_cpuid_entry))) goto out; vcpu->arch.cpuid_nent = cpuid->nent; kvm_apic_set_version(vcpu); @@ -241,19 +185,16 @@ out: return r; } -int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu, - struct kvm_cpuid2 *cpuid, - struct kvm_cpuid_entry2 __user *entries) +int kvm_vcpu_ioctl_get_cpuid(struct kvm_vcpu *vcpu, + struct kvm_cpuid *cpuid, + struct kvm_cpuid_entry __user *entries) { int r; r = -E2BIG; if (cpuid->nent < vcpu->arch.cpuid_nent) goto out; - r = -EFAULT; - if (copy_to_user(entries, &vcpu->arch.cpuid_entries, - vcpu->arch.cpuid_nent * sizeof(struct kvm_cpuid_entry2))) - goto out; + return 0; out: @@ -263,10 +204,12 @@ out: static void cpuid_mask(u32 *word, int wordnum) { +#if 0 *word &= boot_cpu_data.x86_capability[wordnum]; +#endif } -static void do_cpuid_1_ent(struct kvm_cpuid_entry2 *entry, u32 function, +static void do_cpuid_1_ent(struct kvm_cpuid_entry *entry, u32 function, u32 index) { entry->function = function; @@ -276,7 +219,7 @@ static void do_cpuid_1_ent(struct kvm_cpuid_entry2 *entry, u32 function, entry->flags = 0; } -static int 
__do_cpuid_ent_emulated(struct kvm_cpuid_entry2 *entry, +static int __do_cpuid_ent_emulated(struct kvm_cpuid_entry *entry, u32 func, u32 index, int *nent, int maxnent) { switch (func) { @@ -298,7 +241,7 @@ static int __do_cpuid_ent_emulated(struct kvm_cpuid_entry2 *entry, return 0; } -static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, +static inline int __do_cpuid_ent(struct kvm_cpuid_entry *entry, u32 function, u32 index, int *nent, int maxnent) { int r; @@ -346,7 +289,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, F(FMA) | F(CX16) | 0 /* xTPR Update, PDCM */ | F(PCID) | 0 /* Reserved, DCA */ | F(XMM4_1) | F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) | - 0 /* Reserved*/ | F(AES) | F(XSAVE) | 0 /* OSXSAVE */ | F(AVX) | + 0 /* Reserved*/ | F(AES) | 0 /*F(XSAVE)*/ | 0 /* OSXSAVE */ | 0 /*F(AVX)*/ | F(F16C) | F(RDRAND); /* cpuid 0x80000001.ecx */ const u32 kvm_cpuid_8000_0001_ecx_x86_features = @@ -403,18 +346,18 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, /* function 2 entries are STATEFUL. That is, repeated cpuid commands * may return different values. This forces us to get_cpu() before * issuing the first command, and also to emulate this annoying behavior - * in kvm_emulate_cpuid() using KVM_CPUID_FLAG_STATE_READ_NEXT */ + * in kvm_emulate_cpuid() using GVM_CPUID_FLAG_STATE_READ_NEXT */ case 2: { int t, times = entry->eax & 0xff; - entry->flags |= KVM_CPUID_FLAG_STATEFUL_FUNC; - entry->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT; + entry->flags |= GVM_CPUID_FLAG_STATEFUL_FUNC; + entry->flags |= GVM_CPUID_FLAG_STATE_READ_NEXT; for (t = 1; t < times; ++t) { if (*nent >= maxnent) goto out; do_cpuid_1_ent(&entry[t], function, 0); - entry[t].flags |= KVM_CPUID_FLAG_STATEFUL_FUNC; + entry[t].flags |= GVM_CPUID_FLAG_STATEFUL_FUNC; ++*nent; } break; @@ -423,7 +366,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, case 4: { int i, cache_type; - entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; + entry->flags |= GVM_CPUID_FLAG_SIGNIFCANT_INDEX; /* read more entries until cache_type is zero */ for (i = 1; ; ++i) { if (*nent >= maxnent) @@ -434,7 +377,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, break; do_cpuid_1_ent(&entry[i], function, i); entry[i].flags |= - KVM_CPUID_FLAG_SIGNIFCANT_INDEX; + GVM_CPUID_FLAG_SIGNIFCANT_INDEX; ++*nent; } break; @@ -446,7 +389,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, entry->edx = 0; break; case 7: { - entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; + entry->flags |= GVM_CPUID_FLAG_SIGNIFCANT_INDEX; /* Mask ebx against host capability word 9 */ if (index == 0) { entry->ebx &= kvm_cpuid_7_0_ebx_x86_features; @@ -469,6 +412,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, case 9: break; case 0xa: { /* Architectural Performance Monitoring */ +#if 0 struct x86_pmu_capability cap; union cpuid10_eax eax; union cpuid10_edx edx; @@ -495,13 +439,14 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, entry->ebx = cap.events_mask; entry->ecx = 0; entry->edx = edx.full; +#endif break; } /* function 0xb has additional index. 
*/ case 0xb: { int i, level_type; - entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; + entry->flags |= GVM_CPUID_FLAG_SIGNIFCANT_INDEX; /* read more entries until level_type is zero */ for (i = 1; ; ++i) { if (*nent >= maxnent) @@ -512,7 +457,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, break; do_cpuid_1_ent(&entry[i], function, i); entry[i].flags |= - KVM_CPUID_FLAG_SIGNIFCANT_INDEX; + GVM_CPUID_FLAG_SIGNIFCANT_INDEX; ++*nent; } break; @@ -525,7 +470,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, entry->ebx = xstate_required_size(supported, false); entry->ecx = entry->ebx; entry->edx &= supported >> 32; - entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; + entry->flags |= GVM_CPUID_FLAG_SIGNIFCANT_INDEX; if (!supported) break; @@ -552,37 +497,12 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, entry[i].ecx = 0; entry[i].edx = 0; entry[i].flags |= - KVM_CPUID_FLAG_SIGNIFCANT_INDEX; + GVM_CPUID_FLAG_SIGNIFCANT_INDEX; ++*nent; ++i; } break; } - case KVM_CPUID_SIGNATURE: { - static const char signature[12] = "KVMKVMKVM\0\0"; - const u32 *sigptr = (const u32 *)signature; - entry->eax = KVM_CPUID_FEATURES; - entry->ebx = sigptr[0]; - entry->ecx = sigptr[1]; - entry->edx = sigptr[2]; - break; - } - case KVM_CPUID_FEATURES: - entry->eax = (1 << KVM_FEATURE_CLOCKSOURCE) | - (1 << KVM_FEATURE_NOP_IO_DELAY) | - (1 << KVM_FEATURE_CLOCKSOURCE2) | - (1 << KVM_FEATURE_ASYNC_PF) | - (1 << KVM_FEATURE_PV_EOI) | - (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT) | - (1 << KVM_FEATURE_PV_UNHALT); - - if (sched_info_on()) - entry->eax |= (1 << KVM_FEATURE_STEAL_TIME); - - entry->ebx = 0; - entry->ecx = 0; - entry->edx = 0; - break; case 0x80000000: entry->eax = min(entry->eax, 0x8000001a); break; @@ -593,11 +513,13 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, cpuid_mask(&entry->ecx, CPUID_8000_0001_ECX); break; case 0x80000007: /* Advanced power management */ +#if 0 /* invariant TSC is CPUID.80000007H:EDX[8] */ entry->edx &= (1 << 8); /* mask against host */ entry->edx &= boot_cpu_data.x86_power; entry->eax = entry->ebx = entry->ecx = 0; +#endif break; case 0x80000008: { unsigned g_phys_as = (entry->eax >> 16) & 0xff; @@ -646,10 +568,10 @@ out: return r; } -static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 func, +static int do_cpuid_ent(struct kvm_cpuid_entry *entry, u32 func, u32 idx, int *nent, int maxnent, unsigned int type) { - if (type == KVM_GET_EMULATED_CPUID) + if (type == GVM_GET_EMULATED_CPUID) return __do_cpuid_ent_emulated(entry, func, idx, nent, maxnent); return __do_cpuid_ent(entry, func, idx, nent, maxnent); @@ -666,23 +588,23 @@ struct kvm_cpuid_param { static bool is_centaur_cpu(const struct kvm_cpuid_param *param) { - return boot_cpu_data.x86_vendor == X86_VENDOR_CENTAUR; + return 0; } -static bool sanity_check_entries(struct kvm_cpuid_entry2 __user *entries, +static bool sanity_check_entries(struct kvm_cpuid_entry __user *entries, __u32 num_entries, unsigned int ioctl_type) { int i; __u32 pad[3]; - if (ioctl_type != KVM_GET_EMULATED_CPUID) + if (ioctl_type != GVM_GET_EMULATED_CPUID) return false; /* * We want to make sure that ->padding is being passed clean from * userspace in case we want to use it for something in the future. * - * Sadly, this wasn't enforced for KVM_GET_SUPPORTED_CPUID and so we + * Sadly, this wasn't enforced for GVM_GET_SUPPORTED_CPUID and so we * have to give ourselves satisfied only with the emulated side. 
/me * sheds a tear. */ @@ -696,31 +618,29 @@ static bool sanity_check_entries(struct kvm_cpuid_entry2 __user *entries, return false; } -int kvm_dev_ioctl_get_cpuid(struct kvm_cpuid2 *cpuid, - struct kvm_cpuid_entry2 __user *entries, +int kvm_dev_ioctl_get_cpuid(PIRP pIrp, struct kvm_cpuid *cpuid, + struct kvm_cpuid_entry __user *entries, unsigned int type) { - struct kvm_cpuid_entry2 *cpuid_entries; + struct kvm_cpuid_entry *cpuid_entries; int limit, nent = 0, r = -E2BIG, i; u32 func; static const struct kvm_cpuid_param param[] = { { .func = 0, .has_leaf_count = true }, { .func = 0x80000000, .has_leaf_count = true }, { .func = 0xC0000000, .qualifier = is_centaur_cpu, .has_leaf_count = true }, - { .func = KVM_CPUID_SIGNATURE }, - { .func = KVM_CPUID_FEATURES }, }; if (cpuid->nent < 1) goto out; - if (cpuid->nent > KVM_MAX_CPUID_ENTRIES) - cpuid->nent = KVM_MAX_CPUID_ENTRIES; + if (cpuid->nent > GVM_MAX_CPUID_ENTRIES) + cpuid->nent = GVM_MAX_CPUID_ENTRIES; if (sanity_check_entries(entries, cpuid->nent, type)) return -EINVAL; r = -ENOMEM; - cpuid_entries = vzalloc(sizeof(struct kvm_cpuid_entry2) * cpuid->nent); + cpuid_entries = vzalloc(sizeof(struct kvm_cpuid_entry) * cpuid->nent); if (!cpuid_entries) goto out; @@ -749,11 +669,19 @@ int kvm_dev_ioctl_get_cpuid(struct kvm_cpuid2 *cpuid, goto out_free; } - r = -EFAULT; - if (copy_to_user(entries, cpuid_entries, - nent * sizeof(struct kvm_cpuid_entry2))) - goto out_free; cpuid->nent = nent; + + r = gvmUpdateReturnBuffer(pIrp, 0, cpuid, sizeof(cpuid)); + if (!NT_SUCCESS(r)) { + r = -EFAULT; + goto out_free; + } + r = gvmUpdateReturnBuffer(pIrp, sizeof(cpuid), cpuid_entries, + nent * sizeof(struct kvm_cpuid_entry)); + if (!NT_SUCCESS(r)) { + r = -EFAULT; + goto out_free; + } r = 0; out_free: @@ -764,48 +692,47 @@ out: static int move_to_next_stateful_cpuid_entry(struct kvm_vcpu *vcpu, int i) { - struct kvm_cpuid_entry2 *e = &vcpu->arch.cpuid_entries[i]; + struct kvm_cpuid_entry *e = &vcpu->arch.cpuid_entries[i]; int j, nent = vcpu->arch.cpuid_nent; - e->flags &= ~KVM_CPUID_FLAG_STATE_READ_NEXT; + e->flags &= ~GVM_CPUID_FLAG_STATE_READ_NEXT; /* when no next entry is found, the current entry[i] is reselected */ for (j = i + 1; ; j = (j + 1) % nent) { - struct kvm_cpuid_entry2 *ej = &vcpu->arch.cpuid_entries[j]; + struct kvm_cpuid_entry *ej = &vcpu->arch.cpuid_entries[j]; if (ej->function == e->function) { - ej->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT; + ej->flags |= GVM_CPUID_FLAG_STATE_READ_NEXT; return j; } } - return 0; /* silence gcc, even though control never reaches here */ } /* find an entry with matching function, matching index (if needed), and that * should be read next (if it's stateful) */ -static int is_matching_cpuid_entry(struct kvm_cpuid_entry2 *e, +static int is_matching_cpuid_entry(struct kvm_cpuid_entry *e, u32 function, u32 index) { if (e->function != function) return 0; - if ((e->flags & KVM_CPUID_FLAG_SIGNIFCANT_INDEX) && e->index != index) + if ((e->flags & GVM_CPUID_FLAG_SIGNIFCANT_INDEX) && e->index != index) return 0; - if ((e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC) && - !(e->flags & KVM_CPUID_FLAG_STATE_READ_NEXT)) + if ((e->flags & GVM_CPUID_FLAG_STATEFUL_FUNC) && + !(e->flags & GVM_CPUID_FLAG_STATE_READ_NEXT)) return 0; return 1; } -struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu, +struct kvm_cpuid_entry *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu, u32 function, u32 index) { int i; - struct kvm_cpuid_entry2 *best = NULL; + struct kvm_cpuid_entry *best = NULL; for (i = 0; i < vcpu->arch.cpuid_nent; 
++i) { - struct kvm_cpuid_entry2 *e; + struct kvm_cpuid_entry *e; e = &vcpu->arch.cpuid_entries[i]; if (is_matching_cpuid_entry(e, function, index)) { - if (e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC) + if (e->flags & GVM_CPUID_FLAG_STATEFUL_FUNC) move_to_next_stateful_cpuid_entry(vcpu, i); best = e; break; @@ -813,17 +740,16 @@ struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu, } return best; } -EXPORT_SYMBOL_GPL(kvm_find_cpuid_entry); /* * If no match is found, check whether we exceed the vCPU's limit * and return the content of the highest valid _standard_ leaf instead. * This is to satisfy the CPUID specification. */ -static struct kvm_cpuid_entry2* check_cpuid_limit(struct kvm_vcpu *vcpu, +static struct kvm_cpuid_entry* check_cpuid_limit(struct kvm_vcpu *vcpu, u32 function, u32 index) { - struct kvm_cpuid_entry2 *maxlevel; + struct kvm_cpuid_entry *maxlevel; maxlevel = kvm_find_cpuid_entry(vcpu, function & 0x80000000, 0); if (!maxlevel || maxlevel->eax >= function) @@ -839,7 +765,7 @@ static struct kvm_cpuid_entry2* check_cpuid_limit(struct kvm_vcpu *vcpu, void kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, u32 *ecx, u32 *edx) { u32 function = *eax, index = *ecx; - struct kvm_cpuid_entry2 *best; + struct kvm_cpuid_entry *best; best = kvm_find_cpuid_entry(vcpu, function, index); @@ -859,9 +785,7 @@ void kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, u32 *ecx, u32 *edx) *edx = best->edx; } else *eax = *ebx = *ecx = *edx = 0; - trace_kvm_cpuid(function, *eax, *ebx, *ecx, *edx); } -EXPORT_SYMBOL_GPL(kvm_cpuid); void kvm_emulate_cpuid(struct kvm_vcpu *vcpu) { @@ -876,4 +800,3 @@ void kvm_emulate_cpuid(struct kvm_vcpu *vcpu) kvm_register_write(vcpu, VCPU_REGS_RDX, edx); kvm_x86_ops->skip_emulated_instruction(vcpu); } -EXPORT_SYMBOL_GPL(kvm_emulate_cpuid); diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index 35058c2..4bfa008 100644..100755 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -1,25 +1,31 @@ +/* + * Copyright 2019 Google LLC + */ + #ifndef ARCH_X86_KVM_CPUID_H #define ARCH_X86_KVM_CPUID_H #include "x86.h" -#include <asm/cpu.h> +#include <gvm_types.h> +#include <asm/cpufeatures.h> +#include <ntkrutils.h> int kvm_update_cpuid(struct kvm_vcpu *vcpu); bool kvm_mpx_supported(void); -struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu, +struct kvm_cpuid_entry *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu, u32 function, u32 index); -int kvm_dev_ioctl_get_cpuid(struct kvm_cpuid2 *cpuid, - struct kvm_cpuid_entry2 __user *entries, +int kvm_dev_ioctl_get_cpuid(PIRP pIrp, struct kvm_cpuid *cpuid, + struct kvm_cpuid_entry __user *entries, unsigned int type); int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid *cpuid, struct kvm_cpuid_entry __user *entries); -int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu, - struct kvm_cpuid2 *cpuid, - struct kvm_cpuid_entry2 __user *entries); -int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu, - struct kvm_cpuid2 *cpuid, - struct kvm_cpuid_entry2 __user *entries); +int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu, + struct kvm_cpuid *cpuid, + struct kvm_cpuid_entry __user *entries); +int kvm_vcpu_ioctl_get_cpuid(struct kvm_vcpu *vcpu, + struct kvm_cpuid *cpuid, + struct kvm_cpuid_entry __user *entries); void kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, u32 *ecx, u32 *edx); int cpuid_query_maxphyaddr(struct kvm_vcpu *vcpu); @@ -31,7 +37,7 @@ static inline int cpuid_maxphyaddr(struct kvm_vcpu *vcpu) static inline bool guest_cpuid_has_xsave(struct kvm_vcpu *vcpu) { - 
struct kvm_cpuid_entry2 *best; + struct kvm_cpuid_entry *best; if (!static_cpu_has(X86_FEATURE_XSAVE)) return false; @@ -42,7 +48,7 @@ static inline bool guest_cpuid_has_xsave(struct kvm_vcpu *vcpu) static inline bool guest_cpuid_has_mtrr(struct kvm_vcpu *vcpu) { - struct kvm_cpuid_entry2 *best; + struct kvm_cpuid_entry *best; best = kvm_find_cpuid_entry(vcpu, 1, 0); return best && (best->edx & bit(X86_FEATURE_MTRR)); @@ -50,7 +56,7 @@ static inline bool guest_cpuid_has_mtrr(struct kvm_vcpu *vcpu) static inline bool guest_cpuid_has_tsc_adjust(struct kvm_vcpu *vcpu) { - struct kvm_cpuid_entry2 *best; + struct kvm_cpuid_entry *best; best = kvm_find_cpuid_entry(vcpu, 7, 0); return best && (best->ebx & bit(X86_FEATURE_TSC_ADJUST)); @@ -58,7 +64,7 @@ static inline bool guest_cpuid_has_tsc_adjust(struct kvm_vcpu *vcpu) static inline bool guest_cpuid_has_smep(struct kvm_vcpu *vcpu) { - struct kvm_cpuid_entry2 *best; + struct kvm_cpuid_entry *best; best = kvm_find_cpuid_entry(vcpu, 7, 0); return best && (best->ebx & bit(X86_FEATURE_SMEP)); @@ -66,7 +72,7 @@ static inline bool guest_cpuid_has_smep(struct kvm_vcpu *vcpu) static inline bool guest_cpuid_has_smap(struct kvm_vcpu *vcpu) { - struct kvm_cpuid_entry2 *best; + struct kvm_cpuid_entry *best; best = kvm_find_cpuid_entry(vcpu, 7, 0); return best && (best->ebx & bit(X86_FEATURE_SMAP)); @@ -74,7 +80,7 @@ static inline bool guest_cpuid_has_smap(struct kvm_vcpu *vcpu) static inline bool guest_cpuid_has_fsgsbase(struct kvm_vcpu *vcpu) { - struct kvm_cpuid_entry2 *best; + struct kvm_cpuid_entry *best; best = kvm_find_cpuid_entry(vcpu, 7, 0); return best && (best->ebx & bit(X86_FEATURE_FSGSBASE)); @@ -82,7 +88,7 @@ static inline bool guest_cpuid_has_fsgsbase(struct kvm_vcpu *vcpu) static inline bool guest_cpuid_has_pku(struct kvm_vcpu *vcpu) { - struct kvm_cpuid_entry2 *best; + struct kvm_cpuid_entry *best; best = kvm_find_cpuid_entry(vcpu, 7, 0); return best && (best->ecx & bit(X86_FEATURE_PKU)); @@ -90,7 +96,7 @@ static inline bool guest_cpuid_has_pku(struct kvm_vcpu *vcpu) static inline bool guest_cpuid_has_longmode(struct kvm_vcpu *vcpu) { - struct kvm_cpuid_entry2 *best; + struct kvm_cpuid_entry *best; best = kvm_find_cpuid_entry(vcpu, 0x80000001, 0); return best && (best->edx & bit(X86_FEATURE_LM)); @@ -98,7 +104,7 @@ static inline bool guest_cpuid_has_longmode(struct kvm_vcpu *vcpu) static inline bool guest_cpuid_has_osvw(struct kvm_vcpu *vcpu) { - struct kvm_cpuid_entry2 *best; + struct kvm_cpuid_entry *best; best = kvm_find_cpuid_entry(vcpu, 0x80000001, 0); return best && (best->ecx & bit(X86_FEATURE_OSVW)); @@ -106,7 +112,7 @@ static inline bool guest_cpuid_has_osvw(struct kvm_vcpu *vcpu) static inline bool guest_cpuid_has_pcid(struct kvm_vcpu *vcpu) { - struct kvm_cpuid_entry2 *best; + struct kvm_cpuid_entry *best; best = kvm_find_cpuid_entry(vcpu, 1, 0); return best && (best->ecx & bit(X86_FEATURE_PCID)); @@ -114,7 +120,7 @@ static inline bool guest_cpuid_has_pcid(struct kvm_vcpu *vcpu) static inline bool guest_cpuid_has_x2apic(struct kvm_vcpu *vcpu) { - struct kvm_cpuid_entry2 *best; + struct kvm_cpuid_entry *best; best = kvm_find_cpuid_entry(vcpu, 1, 0); return best && (best->ecx & bit(X86_FEATURE_X2APIC)); @@ -122,7 +128,7 @@ static inline bool guest_cpuid_has_x2apic(struct kvm_vcpu *vcpu) static inline bool guest_cpuid_is_amd(struct kvm_vcpu *vcpu) { - struct kvm_cpuid_entry2 *best; + struct kvm_cpuid_entry *best; best = kvm_find_cpuid_entry(vcpu, 0, 0); return best && best->ebx == X86EMUL_CPUID_VENDOR_AuthenticAMD_ebx; @@ -130,7 
+136,7 @@ static inline bool guest_cpuid_is_amd(struct kvm_vcpu *vcpu) static inline bool guest_cpuid_has_gbpages(struct kvm_vcpu *vcpu) { - struct kvm_cpuid_entry2 *best; + struct kvm_cpuid_entry *best; best = kvm_find_cpuid_entry(vcpu, 0x80000001, 0); return best && (best->edx & bit(X86_FEATURE_GBPAGES)); @@ -138,7 +144,7 @@ static inline bool guest_cpuid_has_gbpages(struct kvm_vcpu *vcpu) static inline bool guest_cpuid_has_rtm(struct kvm_vcpu *vcpu) { - struct kvm_cpuid_entry2 *best; + struct kvm_cpuid_entry *best; best = kvm_find_cpuid_entry(vcpu, 7, 0); return best && (best->ebx & bit(X86_FEATURE_RTM)); @@ -146,7 +152,7 @@ static inline bool guest_cpuid_has_rtm(struct kvm_vcpu *vcpu) static inline bool guest_cpuid_has_rdtscp(struct kvm_vcpu *vcpu) { - struct kvm_cpuid_entry2 *best; + struct kvm_cpuid_entry *best; best = kvm_find_cpuid_entry(vcpu, 0x80000001, 0); return best && (best->edx & bit(X86_FEATURE_RDTSCP)); @@ -159,7 +165,7 @@ static inline bool guest_cpuid_has_rdtscp(struct kvm_vcpu *vcpu) static inline bool guest_cpuid_has_nrips(struct kvm_vcpu *vcpu) { - struct kvm_cpuid_entry2 *best; + struct kvm_cpuid_entry *best; best = kvm_find_cpuid_entry(vcpu, 0x8000000a, 0); @@ -174,7 +180,7 @@ static inline bool guest_cpuid_has_nrips(struct kvm_vcpu *vcpu) static inline int guest_cpuid_family(struct kvm_vcpu *vcpu) { - struct kvm_cpuid_entry2 *best; + struct kvm_cpuid_entry *best; best = kvm_find_cpuid_entry(vcpu, 0x1, 0); if (!best) @@ -185,7 +191,7 @@ static inline int guest_cpuid_family(struct kvm_vcpu *vcpu) static inline int guest_cpuid_model(struct kvm_vcpu *vcpu) { - struct kvm_cpuid_entry2 *best; + struct kvm_cpuid_entry *best; best = kvm_find_cpuid_entry(vcpu, 0x1, 0); if (!best) @@ -196,7 +202,7 @@ static inline int guest_cpuid_model(struct kvm_vcpu *vcpu) static inline int guest_cpuid_stepping(struct kvm_vcpu *vcpu) { - struct kvm_cpuid_entry2 *best; + struct kvm_cpuid_entry *best; best = kvm_find_cpuid_entry(vcpu, 0x1, 0); if (!best) diff --git a/arch/x86/kvm/debugfs.c b/arch/x86/kvm/debugfs.c deleted file mode 100644 index c19c7ed..0000000 --- a/arch/x86/kvm/debugfs.c +++ /dev/null @@ -1,69 +0,0 @@ -/* - * Kernel-based Virtual Machine driver for Linux - * - * Copyright 2016 Red Hat, Inc. and/or its affiliates. - * - * This work is licensed under the terms of the GNU GPL, version 2. See - * the COPYING file in the top-level directory. 
- * - */ -#include <linux/kvm_host.h> -#include <linux/debugfs.h> - -bool kvm_arch_has_vcpu_debugfs(void) -{ - return true; -} - -static int vcpu_get_tsc_offset(void *data, u64 *val) -{ - struct kvm_vcpu *vcpu = (struct kvm_vcpu *) data; - *val = vcpu->arch.tsc_offset; - return 0; -} - -DEFINE_SIMPLE_ATTRIBUTE(vcpu_tsc_offset_fops, vcpu_get_tsc_offset, NULL, "%lld\n"); - -static int vcpu_get_tsc_scaling_ratio(void *data, u64 *val) -{ - struct kvm_vcpu *vcpu = (struct kvm_vcpu *) data; - *val = vcpu->arch.tsc_scaling_ratio; - return 0; -} - -DEFINE_SIMPLE_ATTRIBUTE(vcpu_tsc_scaling_fops, vcpu_get_tsc_scaling_ratio, NULL, "%llu\n"); - -static int vcpu_get_tsc_scaling_frac_bits(void *data, u64 *val) -{ - *val = kvm_tsc_scaling_ratio_frac_bits; - return 0; -} - -DEFINE_SIMPLE_ATTRIBUTE(vcpu_tsc_scaling_frac_fops, vcpu_get_tsc_scaling_frac_bits, NULL, "%llu\n"); - -int kvm_arch_create_vcpu_debugfs(struct kvm_vcpu *vcpu) -{ - struct dentry *ret; - - ret = debugfs_create_file("tsc-offset", 0444, - vcpu->debugfs_dentry, - vcpu, &vcpu_tsc_offset_fops); - if (!ret) - return -ENOMEM; - - if (kvm_has_tsc_control) { - ret = debugfs_create_file("tsc-scaling-ratio", 0444, - vcpu->debugfs_dentry, - vcpu, &vcpu_tsc_scaling_fops); - if (!ret) - return -ENOMEM; - ret = debugfs_create_file("tsc-scaling-ratio-frac-bits", 0444, - vcpu->debugfs_dentry, - vcpu, &vcpu_tsc_scaling_frac_fops); - if (!ret) - return -ENOMEM; - - } - - return 0; -} diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index a3ce9d2..6ae4ce5 100644..100755 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -10,6 +10,7 @@ * * Copyright (C) 2006 Qumranet * Copyright 2010 Red Hat, Inc. and/or its affiliates. + * Copyright 2019 Google LLC * * Avi Kivity <avi@qumranet.com> * Yaniv Kamay <yaniv@qumranet.com> @@ -23,12 +24,15 @@ #include <linux/kvm_host.h> #include "kvm_cache_regs.h" #include <asm/kvm_emulate.h> -#include <linux/stringify.h> -#include <asm/debugreg.h> +#include <uapi/asm/debugreg.h> #include "x86.h" #include "tss.h" +#include <gvm_types.h> +#include <uapi/asm/processor-flags.h> +#include <asm/cpufeatures.h> + /* * Operand types */ @@ -174,14 +178,14 @@ #define DstXacc (DstAccLo | SrcAccHi | SrcWrite) -#define X2(x...) x, x -#define X3(x...) X2(x), x -#define X4(x...) X2(x), X2(x) -#define X5(x...) X4(x), x -#define X6(x...) X4(x), X2(x) -#define X7(x...) X4(x), X3(x) -#define X8(x...) X4(x), X4(x) -#define X16(x...) X8(x), X8(x) +#define X2(x,...) x, x +#define X3(x,...) X2(x), x +#define X4(x,...) X2(x), X2(x) +#define X5(x,...) X4(x), x +#define X6(x,...) X4(x), X2(x) +#define X7(x,...) X4(x), X3(x) +#define X8(x,...) X4(x), X4(x) +#define X16(x,...) 
X8(x), X8(x) #define NR_FASTOP (ilog2(sizeof(ulong)) + 1) #define FASTOP_SIZE 8 @@ -281,7 +285,7 @@ static ulong *reg_rmw(struct x86_emulate_ctxt *ctxt, unsigned nr) static void writeback_registers(struct x86_emulate_ctxt *ctxt) { - unsigned reg; + unsigned reg = 0; for_each_set_bit(reg, (ulong *)&ctxt->regs_dirty, 16) ctxt->ops->write_gpr(ctxt, reg, ctxt->_regs[reg]); @@ -308,32 +312,23 @@ static void invalidate_registers(struct x86_emulate_ctxt *ctxt) static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *)); -#define FOP_FUNC(name) \ - ".align " __stringify(FASTOP_SIZE) " \n\t" \ - ".type " name ", @function \n\t" \ - name ":\n\t" +#define FOP_FUNC(name) -#define FOP_RET "ret \n\t" +#define FOP_RET #define FOP_START(op) \ - extern void em_##op(struct fastop *fake); \ - asm(".pushsection .text, \"ax\" \n\t" \ - ".global em_" #op " \n\t" \ - FOP_FUNC("em_" #op) + extern void em_##op(struct fastop *fake); -#define FOP_END \ - ".popsection") +#define FOP_END #define FOPNOP() \ - FOP_FUNC(__stringify(__UNIQUE_ID(nop))) \ - FOP_RET + FOP_FUNC(__stringify(__UNIQUE_ID(nop))) #define FOP1E(op, dst) \ - FOP_FUNC(#op "_" #dst) \ - "10: " #op " %" #dst " \n\t" FOP_RET + FOP_FUNC(#op "_" #dst) #define FOP1EEX(op, dst) \ - FOP1E(op, dst) _ASM_EXTABLE(10b, kvm_fastop_exception) + FOP1E(op, dst) #define FASTOP1(op) \ FOP_START(op) \ @@ -362,8 +357,7 @@ static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *)); FOP_END #define FOP2E(op, dst, src) \ - FOP_FUNC(#op "_" #dst "_" #src) \ - #op " %" #src ", %" #dst " \n\t" FOP_RET + FOP_FUNC(#op "_" #dst "_" #src) #define FASTOP2(op) \ FOP_START(op) \ @@ -401,8 +395,7 @@ static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *)); FOP_END #define FOP3E(op, dst, src, src2) \ - FOP_FUNC(#op "_" #dst "_" #src "_" #src2) \ - #op " %" #src2 ", %" #src ", %" #dst " \n\t" FOP_RET + FOP_FUNC(#op "_" #dst "_" #src "_" #src2) /* 3-operand, word-only, src2=cl */ #define FASTOP3WCL(op) \ @@ -414,15 +407,7 @@ static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *)); FOP_END /* Special case for SETcc - 1 instruction per cc */ -#define FOP_SETCC(op) \ - ".align 4 \n\t" \ - ".type " #op ", @function \n\t" \ - #op ": \n\t" \ - #op " %al \n\t" \ - FOP_RET - -asm(".global kvm_fastop_exception \n" - "kvm_fastop_exception: xor %esi, %esi; ret"); +#define FOP_SETCC(op) FOP_START(setcc) FOP_SETCC(seto) @@ -443,7 +428,7 @@ FOP_SETCC(setle) FOP_SETCC(setnle) FOP_END; -FOP_START(salc) "pushf; sbb %al, %al; popf \n\t" FOP_RET +FOP_START(salc) FOP_END; static int emulator_check_intercept(struct x86_emulate_ctxt *ctxt, @@ -472,7 +457,7 @@ static void assign_masked(ulong *dest, ulong src, ulong mask) *dest = (*dest & ~mask) | (src & mask); } -static void assign_register(unsigned long *reg, u64 val, int bytes) +static void assign_register(size_t *reg, u64 val, int bytes) { /* The 4-byte case *is* correct: in 64-bit mode we zero-extend. */ switch (bytes) { @@ -491,9 +476,9 @@ static void assign_register(unsigned long *reg, u64 val, int bytes) } } -static inline unsigned long ad_mask(struct x86_emulate_ctxt *ctxt) +static inline size_t ad_mask(struct x86_emulate_ctxt *ctxt) { - return (1UL << (ctxt->ad_bytes << 3)) - 1; + return (1ULL << (ctxt->ad_bytes << 3)) - 1; } static ulong stack_mask(struct x86_emulate_ctxt *ctxt) @@ -513,16 +498,16 @@ static int stack_size(struct x86_emulate_ctxt *ctxt) } /* Access/update address held in a register, based on addressing mode. 
*/ -static inline unsigned long -address_mask(struct x86_emulate_ctxt *ctxt, unsigned long reg) +static inline size_t +address_mask(struct x86_emulate_ctxt *ctxt, size_t reg) { - if (ctxt->ad_bytes == sizeof(unsigned long)) + if (ctxt->ad_bytes == sizeof(size_t)) return reg; else return reg & ad_mask(ctxt); } -static inline unsigned long +static inline size_t register_address(struct x86_emulate_ctxt *ctxt, int reg) { return address_mask(ctxt, reg_read(ctxt, reg)); @@ -553,7 +538,7 @@ static u32 desc_limit_scaled(struct desc_struct *desc) return desc->g ? (limit << 12) | 0xfff : limit; } -static unsigned long seg_base(struct x86_emulate_ctxt *ctxt, int seg) +static size_t seg_base(struct x86_emulate_ctxt *ctxt, int seg) { if (ctxt->mode == X86EMUL_MODE_PROT64 && seg < VCPU_SREG_FS) return 0; @@ -733,8 +718,8 @@ static inline int assign_eip(struct x86_emulate_ctxt *ctxt, ulong dst, struct segmented_address addr = { .seg = VCPU_SREG_CS, .ea = dst }; - if (ctxt->op_bytes != sizeof(unsigned long)) - addr.ea = dst & ((1UL << (ctxt->op_bytes << 3)) - 1); + if (ctxt->op_bytes != sizeof(size_t)) + addr.ea = dst & ((1ULL << (ctxt->op_bytes << 3)) - 1); rc = __linearize(ctxt, addr, &max_size, 1, false, true, mode, &linear); if (rc == X86EMUL_CONTINUE) ctxt->_eip = addr.ea; @@ -799,7 +784,7 @@ static int __do_insn_fetch_bytes(struct x86_emulate_ctxt *ctxt, int op_size) { int rc; unsigned size, max_size; - unsigned long linear; + size_t linear; int cur_size = ctxt->fetch.end - ctxt->fetch.data; struct segmented_address addr = { .seg = VCPU_SREG_CS, .ea = ctxt->eip + cur_size }; @@ -851,27 +836,53 @@ static __always_inline int do_insn_fetch_bytes(struct x86_emulate_ctxt *ctxt, } /* Fetch next part of the instruction being emulated. */ -#define insn_fetch(_type, _ctxt) \ -({ _type _x; \ - \ - rc = do_insn_fetch_bytes(_ctxt, sizeof(_type)); \ - if (rc != X86EMUL_CONTINUE) \ - goto done; \ - ctxt->_eip += sizeof(_type); \ - _x = *(_type __aligned(1) *) ctxt->fetch.ptr; \ - ctxt->fetch.ptr += sizeof(_type); \ - _x; \ -}) - -#define insn_fetch_arr(_arr, _size, _ctxt) \ -({ \ - rc = do_insn_fetch_bytes(_ctxt, _size); \ - if (rc != X86EMUL_CONTINUE) \ - goto done; \ - ctxt->_eip += (_size); \ - memcpy(_arr, ctxt->fetch.ptr, _size); \ - ctxt->fetch.ptr += (_size); \ -}) +#define __insn_fetch_type(_type) \ +static __always_inline int \ + __insn_fetch_##_type(struct x86_emulate_ctxt *ctxt, _type *_x) \ +{ \ + int rc; \ + rc = do_insn_fetch_bytes(ctxt, sizeof(_type)); \ + if (rc == X86EMUL_CONTINUE) { \ + ctxt->_eip += sizeof(_type); \ + *_x = *(_type *) ctxt->fetch.ptr; \ + ctxt->fetch.ptr += sizeof(_type); \ + } \ + return rc; \ +} + +__insn_fetch_type(u8) +__insn_fetch_type(s8) +__insn_fetch_type(u16) +__insn_fetch_type(s16) +__insn_fetch_type(u32) +__insn_fetch_type(s32) +__insn_fetch_type(u64) +__insn_fetch_type(s64) + +#define insn_fetch(_type, _ctxt, _data) __insn_fetch_##_type(_ctxt, &(_type)_data) + +#define insn_fetch_modrmea(_type, _ctxt) \ + do { \ + _type __temp; \ + rc = insn_fetch(_type, _ctxt, __temp); \ + if (rc != X86EMUL_CONTINUE) \ + goto done; \ + modrm_ea += __temp; \ + } while (0) + + +static __always_inline int insn_fetch_arr(char *_arr, + unsigned int _size, struct x86_emulate_ctxt *_ctxt) +{ + int rc; + rc = do_insn_fetch_bytes(_ctxt, _size); + if (rc == X86EMUL_CONTINUE) { + _ctxt->_eip += (_size); + memcpy(_arr, _ctxt->fetch.ptr, _size); + _ctxt->fetch.ptr += (_size); + } + return rc; +} /* * Given the 'reg' portion of a ModRM byte, and a register block, return a @@ -893,7 +904,7 @@ 
static void *decode_register(struct x86_emulate_ctxt *ctxt, u8 modrm_reg, static int read_descriptor(struct x86_emulate_ctxt *ctxt, struct segmented_address addr, - u16 *size, unsigned long *address, int op_bytes) + u16 *size, size_t *address, int op_bytes) { int rc; @@ -968,14 +979,14 @@ static int em_bsr_c(struct x86_emulate_ctxt *ctxt) return fastop(ctxt, em_bsr); } -static __always_inline u8 test_cc(unsigned int condition, unsigned long flags) +extern u8 __asm_test_cc(void *fop, size_t flags); +static __always_inline u8 test_cc(unsigned int condition, size_t flags) { - u8 rc; - void (*fop)(void) = (void *)em_setcc + 4 * (condition & 0xf); + u8 rc = 0; + void(*fop)(void) = (void(*)(void))((char *)em_setcc + 4 * (condition & 0xf)); flags = (flags & EFLAGS_MASK) | X86_EFLAGS_IF; - asm("push %[flags]; popf; call *%[fastop]" - : "=a"(rc) : [fastop]"r"(fop), [flags]"r"(flags)); + rc = __asm_test_cc(fop, flags); return rc; } @@ -997,55 +1008,80 @@ static void fetch_register_operand(struct operand *op) } } +#define DECLARE_XMM(n) \ +extern __asm_save_xmm##n(sse128_t *data); \ +extern __asm_store_xmm##n(sse128_t *data); + +DECLARE_XMM(0) +DECLARE_XMM(1) +DECLARE_XMM(2) +DECLARE_XMM(3) +DECLARE_XMM(4) +DECLARE_XMM(5) +DECLARE_XMM(6) +DECLARE_XMM(7) +DECLARE_XMM(8) +DECLARE_XMM(9) +DECLARE_XMM(10) +DECLARE_XMM(11) +DECLARE_XMM(12) +DECLARE_XMM(13) +DECLARE_XMM(14) +DECLARE_XMM(15) + +#define SAVE_XMM(n) \ +case n: __asm_save_xmm##n(data); break; static void read_sse_reg(struct x86_emulate_ctxt *ctxt, sse128_t *data, int reg) { ctxt->ops->get_fpu(ctxt); switch (reg) { - case 0: asm("movdqa %%xmm0, %0" : "=m"(*data)); break; - case 1: asm("movdqa %%xmm1, %0" : "=m"(*data)); break; - case 2: asm("movdqa %%xmm2, %0" : "=m"(*data)); break; - case 3: asm("movdqa %%xmm3, %0" : "=m"(*data)); break; - case 4: asm("movdqa %%xmm4, %0" : "=m"(*data)); break; - case 5: asm("movdqa %%xmm5, %0" : "=m"(*data)); break; - case 6: asm("movdqa %%xmm6, %0" : "=m"(*data)); break; - case 7: asm("movdqa %%xmm7, %0" : "=m"(*data)); break; + SAVE_XMM(0) + SAVE_XMM(1) + SAVE_XMM(2) + SAVE_XMM(3) + SAVE_XMM(4) + SAVE_XMM(5) + SAVE_XMM(6) + SAVE_XMM(7) #ifdef CONFIG_X86_64 - case 8: asm("movdqa %%xmm8, %0" : "=m"(*data)); break; - case 9: asm("movdqa %%xmm9, %0" : "=m"(*data)); break; - case 10: asm("movdqa %%xmm10, %0" : "=m"(*data)); break; - case 11: asm("movdqa %%xmm11, %0" : "=m"(*data)); break; - case 12: asm("movdqa %%xmm12, %0" : "=m"(*data)); break; - case 13: asm("movdqa %%xmm13, %0" : "=m"(*data)); break; - case 14: asm("movdqa %%xmm14, %0" : "=m"(*data)); break; - case 15: asm("movdqa %%xmm15, %0" : "=m"(*data)); break; + SAVE_XMM(8) + SAVE_XMM(9) + SAVE_XMM(10) + SAVE_XMM(11) + SAVE_XMM(12) + SAVE_XMM(13) + SAVE_XMM(14) + SAVE_XMM(15) #endif default: BUG(); } ctxt->ops->put_fpu(ctxt); } +#define STORE_XMM(n) \ +case n: __asm_store_xmm##n(data); break; static void write_sse_reg(struct x86_emulate_ctxt *ctxt, sse128_t *data, int reg) { ctxt->ops->get_fpu(ctxt); switch (reg) { - case 0: asm("movdqa %0, %%xmm0" : : "m"(*data)); break; - case 1: asm("movdqa %0, %%xmm1" : : "m"(*data)); break; - case 2: asm("movdqa %0, %%xmm2" : : "m"(*data)); break; - case 3: asm("movdqa %0, %%xmm3" : : "m"(*data)); break; - case 4: asm("movdqa %0, %%xmm4" : : "m"(*data)); break; - case 5: asm("movdqa %0, %%xmm5" : : "m"(*data)); break; - case 6: asm("movdqa %0, %%xmm6" : : "m"(*data)); break; - case 7: asm("movdqa %0, %%xmm7" : : "m"(*data)); break; + STORE_XMM(0) + STORE_XMM(1) + STORE_XMM(2) + STORE_XMM(3) + STORE_XMM(4) + 
STORE_XMM(5) + STORE_XMM(6) + STORE_XMM(7) #ifdef CONFIG_X86_64 - case 8: asm("movdqa %0, %%xmm8" : : "m"(*data)); break; - case 9: asm("movdqa %0, %%xmm9" : : "m"(*data)); break; - case 10: asm("movdqa %0, %%xmm10" : : "m"(*data)); break; - case 11: asm("movdqa %0, %%xmm11" : : "m"(*data)); break; - case 12: asm("movdqa %0, %%xmm12" : : "m"(*data)); break; - case 13: asm("movdqa %0, %%xmm13" : : "m"(*data)); break; - case 14: asm("movdqa %0, %%xmm14" : : "m"(*data)); break; - case 15: asm("movdqa %0, %%xmm15" : : "m"(*data)); break; + STORE_XMM(8) + STORE_XMM(9) + STORE_XMM(10) + STORE_XMM(11) + STORE_XMM(12) + STORE_XMM(13) + STORE_XMM(14) + STORE_XMM(15) #endif default: BUG(); } @@ -1056,14 +1092,14 @@ static void read_mmx_reg(struct x86_emulate_ctxt *ctxt, u64 *data, int reg) { ctxt->ops->get_fpu(ctxt); switch (reg) { - case 0: asm("movq %%mm0, %0" : "=m"(*data)); break; - case 1: asm("movq %%mm1, %0" : "=m"(*data)); break; - case 2: asm("movq %%mm2, %0" : "=m"(*data)); break; - case 3: asm("movq %%mm3, %0" : "=m"(*data)); break; - case 4: asm("movq %%mm4, %0" : "=m"(*data)); break; - case 5: asm("movq %%mm5, %0" : "=m"(*data)); break; - case 6: asm("movq %%mm6, %0" : "=m"(*data)); break; - case 7: asm("movq %%mm7, %0" : "=m"(*data)); break; + case 0: __asm_save_mm0(data); break; + case 1: __asm_save_mm1(data); break; + case 2: __asm_save_mm2(data); break; + case 3: __asm_save_mm3(data); break; + case 4: __asm_save_mm4(data); break; + case 5: __asm_save_mm5(data); break; + case 6: __asm_save_mm6(data); break; + case 7: __asm_save_mm7(data); break; default: BUG(); } ctxt->ops->put_fpu(ctxt); @@ -1073,14 +1109,14 @@ static void write_mmx_reg(struct x86_emulate_ctxt *ctxt, u64 *data, int reg) { ctxt->ops->get_fpu(ctxt); switch (reg) { - case 0: asm("movq %0, %%mm0" : : "m"(*data)); break; - case 1: asm("movq %0, %%mm1" : : "m"(*data)); break; - case 2: asm("movq %0, %%mm2" : : "m"(*data)); break; - case 3: asm("movq %0, %%mm3" : : "m"(*data)); break; - case 4: asm("movq %0, %%mm4" : : "m"(*data)); break; - case 5: asm("movq %0, %%mm5" : : "m"(*data)); break; - case 6: asm("movq %0, %%mm6" : : "m"(*data)); break; - case 7: asm("movq %0, %%mm7" : : "m"(*data)); break; + case 0: __asm_store_mm0(data); break; + case 1: __asm_store_mm1(data); break; + case 2: __asm_store_mm2(data); break; + case 3: __asm_store_mm3(data); break; + case 4: __asm_store_mm4(data); break; + case 5: __asm_store_mm5(data); break; + case 6: __asm_store_mm6(data); break; + case 7: __asm_store_mm7(data); break; default: BUG(); } ctxt->ops->put_fpu(ctxt); @@ -1092,20 +1128,20 @@ static int em_fninit(struct x86_emulate_ctxt *ctxt) return emulate_nm(ctxt); ctxt->ops->get_fpu(ctxt); - asm volatile("fninit"); + __fninit(); ctxt->ops->put_fpu(ctxt); return X86EMUL_CONTINUE; } static int em_fnstcw(struct x86_emulate_ctxt *ctxt) { - u16 fcw; + u16 fcw = 0; if (ctxt->ops->get_cr(ctxt, 0) & (X86_CR0_TS | X86_CR0_EM)) return emulate_nm(ctxt); ctxt->ops->get_fpu(ctxt); - asm volatile("fnstcw %0": "+m"(fcw)); + __fnstcw(&fcw); ctxt->ops->put_fpu(ctxt); ctxt->dst.val = fcw; @@ -1115,13 +1151,13 @@ static int em_fnstcw(struct x86_emulate_ctxt *ctxt) static int em_fnstsw(struct x86_emulate_ctxt *ctxt) { - u16 fsw; + u16 fsw = 0; if (ctxt->ops->get_cr(ctxt, 0) & (X86_CR0_TS | X86_CR0_EM)) return emulate_nm(ctxt); ctxt->ops->get_fpu(ctxt); - asm volatile("fnstsw %0": "+m"(fsw)); + __fnstsw(&fsw); ctxt->ops->put_fpu(ctxt); ctxt->dst.val = fsw; @@ -1217,13 +1253,13 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt, switch 
(ctxt->modrm_mod) { case 0: if (ctxt->modrm_rm == 6) - modrm_ea += insn_fetch(u16, ctxt); + insn_fetch_modrmea(u16, ctxt); break; case 1: - modrm_ea += insn_fetch(s8, ctxt); + insn_fetch_modrmea(s8, ctxt); break; case 2: - modrm_ea += insn_fetch(u16, ctxt); + insn_fetch_modrmea(u16, ctxt); break; } switch (ctxt->modrm_rm) { @@ -1260,13 +1296,15 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt, } else { /* 32/64-bit ModR/M decode. */ if ((ctxt->modrm_rm & 7) == 4) { - sib = insn_fetch(u8, ctxt); + rc = insn_fetch(u8, ctxt, sib); + if (rc != X86EMUL_CONTINUE) + goto done; index_reg |= (sib >> 3) & 7; base_reg |= sib & 7; scale = sib >> 6; if ((base_reg & 7) == 5 && ctxt->modrm_mod == 0) - modrm_ea += insn_fetch(s32, ctxt); + insn_fetch_modrmea(s32, ctxt); else { modrm_ea += reg_read(ctxt, base_reg); adjust_modrm_seg(ctxt, base_reg); @@ -1278,7 +1316,7 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt, if (index_reg != 4) modrm_ea += reg_read(ctxt, index_reg) << scale; } else if ((ctxt->modrm_rm & 7) == 5 && ctxt->modrm_mod == 0) { - modrm_ea += insn_fetch(s32, ctxt); + insn_fetch_modrmea(s32, ctxt); if (ctxt->mode == X86EMUL_MODE_PROT64) ctxt->rip_relative = 1; } else { @@ -1288,10 +1326,10 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt, } switch (ctxt->modrm_mod) { case 1: - modrm_ea += insn_fetch(s8, ctxt); + insn_fetch_modrmea(s8, ctxt); break; case 2: - modrm_ea += insn_fetch(s32, ctxt); + insn_fetch_modrmea(s32, ctxt); break; } } @@ -1311,13 +1349,19 @@ static int decode_abs(struct x86_emulate_ctxt *ctxt, op->type = OP_MEM; switch (ctxt->ad_bytes) { case 2: - op->addr.mem.ea = insn_fetch(u16, ctxt); + rc = insn_fetch(u16, ctxt, op->addr.mem.ea); + if (rc != X86EMUL_CONTINUE) + goto done; break; case 4: - op->addr.mem.ea = insn_fetch(u32, ctxt); + rc = insn_fetch(u32, ctxt, op->addr.mem.ea); + if (rc != X86EMUL_CONTINUE) + goto done; break; case 8: - op->addr.mem.ea = insn_fetch(u64, ctxt); + rc = insn_fetch(u64, ctxt, op->addr.mem.ea); + if (rc != X86EMUL_CONTINUE) + goto done; break; } done: @@ -1347,7 +1391,7 @@ static void fetch_bit_operand(struct x86_emulate_ctxt *ctxt) } static int read_emulated(struct x86_emulate_ctxt *ctxt, - unsigned long addr, void *dest, unsigned size) + size_t addr, void *dest, unsigned size) { int rc; struct read_cache *mc = &ctxt->mem_read; @@ -1716,7 +1760,7 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt, static void write_register_operand(struct operand *op) { - return assign_register(op->addr.reg, op->val, op->bytes); + assign_register(op->addr.reg, op->val, op->bytes); } static int writeback(struct x86_emulate_ctxt *ctxt, struct operand *op) @@ -1802,7 +1846,7 @@ static int emulate_popf(struct x86_emulate_ctxt *ctxt, void *dest, int len) { int rc; - unsigned long val, change_mask; + size_t val, change_mask; int iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> X86_EFLAGS_IOPL_BIT; int cpl = ctxt->ops->cpl(ctxt); @@ -1834,7 +1878,7 @@ static int emulate_popf(struct x86_emulate_ctxt *ctxt, break; } - *(unsigned long *)dest = + *(size_t *)dest = (ctxt->eflags & ~change_mask) | (val & change_mask); return rc; @@ -1893,7 +1937,7 @@ static int em_push_sreg(struct x86_emulate_ctxt *ctxt) static int em_pop_sreg(struct x86_emulate_ctxt *ctxt) { int seg = ctxt->src2.val; - unsigned long selector; + size_t selector; int rc; rc = emulate_pop(ctxt, &selector, 2); @@ -1901,7 +1945,7 @@ static int em_pop_sreg(struct x86_emulate_ctxt *ctxt) return rc; if (ctxt->modrm_reg == VCPU_SREG_SS) - ctxt->interruptibility = 
KVM_X86_SHADOW_INT_MOV_SS; + ctxt->interruptibility = GVM_X86_SHADOW_INT_MOV_SS; if (ctxt->op_bytes > 2) rsp_increment(ctxt, ctxt->op_bytes - 2); @@ -1911,7 +1955,7 @@ static int em_pop_sreg(struct x86_emulate_ctxt *ctxt) static int em_pusha(struct x86_emulate_ctxt *ctxt) { - unsigned long old_esp = reg_read(ctxt, VCPU_REGS_RSP); + size_t old_esp = reg_read(ctxt, VCPU_REGS_RSP); int rc = X86EMUL_CONTINUE; int reg = VCPU_REGS_RAX; @@ -1931,7 +1975,7 @@ static int em_pusha(struct x86_emulate_ctxt *ctxt) static int em_pushf(struct x86_emulate_ctxt *ctxt) { - ctxt->src.val = (unsigned long)ctxt->eflags & ~X86_EFLAGS_VM; + ctxt->src.val = (size_t)ctxt->eflags & ~X86_EFLAGS_VM; return em_push(ctxt); } @@ -2034,16 +2078,16 @@ static int emulate_int(struct x86_emulate_ctxt *ctxt, int irq) static int emulate_iret_real(struct x86_emulate_ctxt *ctxt) { int rc = X86EMUL_CONTINUE; - unsigned long temp_eip = 0; - unsigned long temp_eflags = 0; - unsigned long cs = 0; - unsigned long mask = X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF | + size_t temp_eip = 0; + size_t temp_eflags = 0; + size_t cs = 0; + size_t mask = X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF | X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_TF | X86_EFLAGS_IF | X86_EFLAGS_DF | X86_EFLAGS_OF | X86_EFLAGS_IOPL | X86_EFLAGS_NT | X86_EFLAGS_RF | X86_EFLAGS_AC | X86_EFLAGS_ID | X86_EFLAGS_FIXED; - unsigned long vm86_mask = X86_EFLAGS_VM | X86_EFLAGS_VIF | + size_t vm86_mask = X86_EFLAGS_VM | X86_EFLAGS_VIF | X86_EFLAGS_VIP; /* TODO: Add stack limit check */ @@ -2168,7 +2212,7 @@ static int em_cmpxchg8b(struct x86_emulate_ctxt *ctxt) static int em_ret(struct x86_emulate_ctxt *ctxt) { int rc; - unsigned long eip; + size_t eip; rc = emulate_pop(ctxt, &eip, ctxt->op_bytes); if (rc != X86EMUL_CONTINUE) @@ -2180,7 +2224,7 @@ static int em_ret(struct x86_emulate_ctxt *ctxt) static int em_ret_far(struct x86_emulate_ctxt *ctxt) { int rc; - unsigned long eip, cs; + size_t eip, cs; int cpl = ctxt->ops->cpl(ctxt); struct desc_struct new_desc; @@ -2267,15 +2311,26 @@ static int emulator_has_longmode(struct x86_emulate_ctxt *ctxt) return edx & bit(X86_FEATURE_LM); } -#define GET_SMSTATE(type, smbase, offset) \ - ({ \ - type __val; \ - int r = ctxt->ops->read_phys(ctxt, smbase + offset, &__val, \ - sizeof(__val)); \ - if (r != X86EMUL_CONTINUE) \ - return X86EMUL_UNHANDLEABLE; \ - __val; \ - }) +#define GET_SMSTATE(type, smbase, offset, val) \ +do { \ + type __val; \ + int __r = ctxt->ops->read_phys(ctxt, smbase + offset, &__val,\ + sizeof(__val)); \ + if (__r != X86EMUL_CONTINUE) \ + return X86EMUL_UNHANDLEABLE; \ + val = __val; \ +} while(0) + +#define __GET_SMSTATE_TYPE(type, smbase, offset) \ +static __always_inline int __get_smstate_##type(size_t smbase, size_t offset, type *val) \ +{ \ + type __val; \ + int __r = ctxt->ops->read_phys(ctxt, smbase + offset, &__val, \ + sizeof(__val)); \ + if (__r == X86EMUL_CONTINUE) \ + *val = __val; \ + return r; \ +} static void rsm_set_desc_flags(struct desc_struct *desc, u32 flags) { @@ -2294,17 +2349,21 @@ static int rsm_load_seg_32(struct x86_emulate_ctxt *ctxt, u64 smbase, int n) struct desc_struct desc; int offset; u16 selector; + u32 temp; - selector = GET_SMSTATE(u32, smbase, 0x7fa8 + n * 4); + GET_SMSTATE(int, smbase, 0x7fa8 + n * 4, selector); if (n < 3) offset = 0x7f84 + n * 12; else offset = 0x7f2c + (n - 3) * 12; - set_desc_base(&desc, GET_SMSTATE(u32, smbase, offset + 8)); - set_desc_limit(&desc, GET_SMSTATE(u32, smbase, offset + 4)); - rsm_set_desc_flags(&desc, GET_SMSTATE(u32, smbase, offset)); + 
GET_SMSTATE(u32, smbase, offset + 8, temp); + set_desc_base(&desc, temp); + GET_SMSTATE(u32, smbase, offset + 4, temp); + set_desc_limit(&desc, temp); + GET_SMSTATE(u32, smbase, offset, temp); + rsm_set_desc_flags(&desc, temp); ctxt->ops->set_segment(ctxt, selector, &desc, 0, n); return X86EMUL_CONTINUE; } @@ -2313,16 +2372,19 @@ static int rsm_load_seg_64(struct x86_emulate_ctxt *ctxt, u64 smbase, int n) { struct desc_struct desc; int offset; - u16 selector; - u32 base3; + u16 selector, temp16; + u32 base3, temp; offset = 0x7e00 + n * 16; - selector = GET_SMSTATE(u16, smbase, offset); - rsm_set_desc_flags(&desc, GET_SMSTATE(u16, smbase, offset + 2) << 8); - set_desc_limit(&desc, GET_SMSTATE(u32, smbase, offset + 4)); - set_desc_base(&desc, GET_SMSTATE(u32, smbase, offset + 8)); - base3 = GET_SMSTATE(u32, smbase, offset + 12); + GET_SMSTATE(u16, smbase, offset, selector); + GET_SMSTATE(u16, smbase, offset + 2, temp16); + rsm_set_desc_flags(&desc, temp16 << 8); + GET_SMSTATE(u32, smbase, offset + 4, temp); + set_desc_limit(&desc, temp); + GET_SMSTATE(u32, smbase, offset + 8, temp); + set_desc_base(&desc, temp); + GET_SMSTATE(u32, smbase, offset + 12, base3); ctxt->ops->set_segment(ctxt, selector, &desc, base3, n); return X86EMUL_CONTINUE; @@ -2362,38 +2424,47 @@ static int rsm_load_state_32(struct x86_emulate_ctxt *ctxt, u64 smbase) u16 selector; u32 val, cr0, cr4; int i; + u32 temp; - cr0 = GET_SMSTATE(u32, smbase, 0x7ffc); - ctxt->ops->set_cr(ctxt, 3, GET_SMSTATE(u32, smbase, 0x7ff8)); - ctxt->eflags = GET_SMSTATE(u32, smbase, 0x7ff4) | X86_EFLAGS_FIXED; - ctxt->_eip = GET_SMSTATE(u32, smbase, 0x7ff0); + GET_SMSTATE(u32, smbase, 0x7ffc, cr0); + GET_SMSTATE(u32, smbase, 0x7ff8, temp); + ctxt->ops->set_cr(ctxt, 3, temp); + GET_SMSTATE(u32, smbase, 0x7ff4, ctxt->eflags); + ctxt->eflags |= X86_EFLAGS_FIXED; + GET_SMSTATE(u32, smbase, 0x7ff0, ctxt->_eip); for (i = 0; i < 8; i++) - *reg_write(ctxt, i) = GET_SMSTATE(u32, smbase, 0x7fd0 + i * 4); + GET_SMSTATE(u32, smbase, 0x7fd0 + i * 4, *reg_write(ctxt, i)); - val = GET_SMSTATE(u32, smbase, 0x7fcc); + GET_SMSTATE(u32, smbase, 0x7fcc, val); ctxt->ops->set_dr(ctxt, 6, (val & DR6_VOLATILE) | DR6_FIXED_1); - val = GET_SMSTATE(u32, smbase, 0x7fc8); + GET_SMSTATE(u32, smbase, 0x7fc8, val); ctxt->ops->set_dr(ctxt, 7, (val & DR7_VOLATILE) | DR7_FIXED_1); - selector = GET_SMSTATE(u32, smbase, 0x7fc4); - set_desc_base(&desc, GET_SMSTATE(u32, smbase, 0x7f64)); - set_desc_limit(&desc, GET_SMSTATE(u32, smbase, 0x7f60)); - rsm_set_desc_flags(&desc, GET_SMSTATE(u32, smbase, 0x7f5c)); + GET_SMSTATE(u32, smbase, 0x7fc4, selector); + GET_SMSTATE(u32, smbase, 0x7f64, temp); + set_desc_base(&desc, temp); + GET_SMSTATE(u32, smbase, 0x7f60, temp); + set_desc_limit(&desc, temp); + GET_SMSTATE(u32, smbase, 0x7f5c, temp); + rsm_set_desc_flags(&desc, temp); ctxt->ops->set_segment(ctxt, selector, &desc, 0, VCPU_SREG_TR); - selector = GET_SMSTATE(u32, smbase, 0x7fc0); - set_desc_base(&desc, GET_SMSTATE(u32, smbase, 0x7f80)); - set_desc_limit(&desc, GET_SMSTATE(u32, smbase, 0x7f7c)); - rsm_set_desc_flags(&desc, GET_SMSTATE(u32, smbase, 0x7f78)); + GET_SMSTATE(u32, smbase, 0x7fc0, selector); + GET_SMSTATE(u32, smbase, 0x7f80, temp); + set_desc_base(&desc, temp); + GET_SMSTATE(u32, smbase, 0x7f7c, temp); + set_desc_limit(&desc, temp); + GET_SMSTATE(u32, smbase, 0x7f78, temp); + rsm_set_desc_flags(&desc, temp); ctxt->ops->set_segment(ctxt, selector, &desc, 0, VCPU_SREG_LDTR); - dt.address = GET_SMSTATE(u32, smbase, 0x7f74); - dt.size = GET_SMSTATE(u32, smbase, 0x7f70); + 
GET_SMSTATE(u32, smbase, 0x7f74, dt.address); + GET_SMSTATE(u32, smbase, 0x7f70, dt.size); ctxt->ops->set_gdt(ctxt, &dt); - dt.address = GET_SMSTATE(u32, smbase, 0x7f58); - dt.size = GET_SMSTATE(u32, smbase, 0x7f54); + GET_SMSTATE(u32, smbase, 0x7f58, dt.address); + GET_SMSTATE(u32, smbase, 0x7f54, dt.size); ctxt->ops->set_idt(ctxt, &dt); for (i = 0; i < 6; i++) { @@ -2402,9 +2473,10 @@ static int rsm_load_state_32(struct x86_emulate_ctxt *ctxt, u64 smbase) return r; } - cr4 = GET_SMSTATE(u32, smbase, 0x7f14); + GET_SMSTATE(u32, smbase, 0x7f14, cr4); - ctxt->ops->set_smbase(ctxt, GET_SMSTATE(u32, smbase, 0x7ef8)); + GET_SMSTATE(u32, smbase, 0x7ef8, temp); + ctxt->ops->set_smbase(ctxt, temp); return rsm_enter_protected_mode(ctxt, cr0, cr4); } @@ -2417,45 +2489,56 @@ static int rsm_load_state_64(struct x86_emulate_ctxt *ctxt, u64 smbase) u32 base3; u16 selector; int i, r; + u64 temp64; + u32 temp = 0; for (i = 0; i < 16; i++) - *reg_write(ctxt, i) = GET_SMSTATE(u64, smbase, 0x7ff8 - i * 8); + GET_SMSTATE(u64, smbase, 0x7ff8 - i * 8, *reg_write(ctxt, i)); - ctxt->_eip = GET_SMSTATE(u64, smbase, 0x7f78); - ctxt->eflags = GET_SMSTATE(u32, smbase, 0x7f70) | X86_EFLAGS_FIXED; + GET_SMSTATE(u64, smbase, 0x7f78, ctxt->_eip); + GET_SMSTATE(u32, smbase, 0x7f70, ctxt->eflags); + ctxt->eflags |= X86_EFLAGS_FIXED; - val = GET_SMSTATE(u32, smbase, 0x7f68); + GET_SMSTATE(u32, smbase, 0x7f68, val); ctxt->ops->set_dr(ctxt, 6, (val & DR6_VOLATILE) | DR6_FIXED_1); - val = GET_SMSTATE(u32, smbase, 0x7f60); + GET_SMSTATE(u32, smbase, 0x7f60, val); ctxt->ops->set_dr(ctxt, 7, (val & DR7_VOLATILE) | DR7_FIXED_1); - cr0 = GET_SMSTATE(u64, smbase, 0x7f58); - ctxt->ops->set_cr(ctxt, 3, GET_SMSTATE(u64, smbase, 0x7f50)); - cr4 = GET_SMSTATE(u64, smbase, 0x7f48); - ctxt->ops->set_smbase(ctxt, GET_SMSTATE(u32, smbase, 0x7f00)); - val = GET_SMSTATE(u64, smbase, 0x7ed0); + GET_SMSTATE(u64, smbase, 0x7f58, cr0); + GET_SMSTATE(u64, smbase, 0x7f50, temp64); + ctxt->ops->set_cr(ctxt, 3, temp); + GET_SMSTATE(u64, smbase, 0x7f48, cr4); + GET_SMSTATE(u32, smbase, 0x7f00, temp); + ctxt->ops->set_smbase(ctxt, temp); + GET_SMSTATE(u64, smbase, 0x7ed0, val); ctxt->ops->set_msr(ctxt, MSR_EFER, val & ~EFER_LMA); - selector = GET_SMSTATE(u32, smbase, 0x7e90); - rsm_set_desc_flags(&desc, GET_SMSTATE(u32, smbase, 0x7e92) << 8); - set_desc_limit(&desc, GET_SMSTATE(u32, smbase, 0x7e94)); - set_desc_base(&desc, GET_SMSTATE(u32, smbase, 0x7e98)); - base3 = GET_SMSTATE(u32, smbase, 0x7e9c); + GET_SMSTATE(u32, smbase, 0x7e90, selector); + GET_SMSTATE(u32, smbase, 0x7e92, temp); + rsm_set_desc_flags(&desc, temp << 8); + GET_SMSTATE(u32, smbase, 0x7e94, temp); + set_desc_limit(&desc, temp); + GET_SMSTATE(u32, smbase, 0x7e98, temp); + set_desc_base(&desc, temp); + GET_SMSTATE(u32, smbase, 0x7e9c, base3); ctxt->ops->set_segment(ctxt, selector, &desc, base3, VCPU_SREG_TR); - dt.size = GET_SMSTATE(u32, smbase, 0x7e84); - dt.address = GET_SMSTATE(u64, smbase, 0x7e88); + GET_SMSTATE(u32, smbase, 0x7e84, dt.size); + GET_SMSTATE(u64, smbase, 0x7e88, dt.address); ctxt->ops->set_idt(ctxt, &dt); - selector = GET_SMSTATE(u32, smbase, 0x7e70); - rsm_set_desc_flags(&desc, GET_SMSTATE(u32, smbase, 0x7e72) << 8); - set_desc_limit(&desc, GET_SMSTATE(u32, smbase, 0x7e74)); - set_desc_base(&desc, GET_SMSTATE(u32, smbase, 0x7e78)); - base3 = GET_SMSTATE(u32, smbase, 0x7e7c); + GET_SMSTATE(u32, smbase, 0x7e70, selector); + GET_SMSTATE(u32, smbase, 0x7e72, temp); + rsm_set_desc_flags(&desc, temp << 8); + GET_SMSTATE(u32, smbase, 0x7e74, temp); + set_desc_limit(&desc, 
temp); + GET_SMSTATE(u32, smbase, 0x7e78, temp); + set_desc_base(&desc, temp); + GET_SMSTATE(u32, smbase, 0x7e7c, base3); ctxt->ops->set_segment(ctxt, selector, &desc, base3, VCPU_SREG_LDTR); - dt.size = GET_SMSTATE(u32, smbase, 0x7e64); - dt.address = GET_SMSTATE(u64, smbase, 0x7e68); + GET_SMSTATE(u32, smbase, 0x7e64, dt.size); + GET_SMSTATE(u64, smbase, 0x7e68, dt.address); ctxt->ops->set_gdt(ctxt, &dt); r = rsm_enter_protected_mode(ctxt, cr0, cr4); @@ -2473,7 +2556,7 @@ static int rsm_load_state_64(struct x86_emulate_ctxt *ctxt, u64 smbase) static int em_rsm(struct x86_emulate_ctxt *ctxt) { - unsigned long cr0, cr4, efer; + size_t cr0, cr4, efer; u64 smbase; int ret; @@ -2806,7 +2889,7 @@ static bool emulator_io_port_access_allowed(struct x86_emulate_ctxt *ctxt, int r; u16 tr, io_bitmap_ptr, perm, bit_idx = port & 0x7; unsigned mask = (1 << len) - 1; - unsigned long base; + size_t base; ops->get_segment(ctxt, &tr, &tr_seg, &base3, VCPU_SREG_TR); if (!tr_seg.p) @@ -3226,7 +3309,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt, if (has_error_code) { ctxt->op_bytes = ctxt->ad_bytes = (next_tss_desc.type & 8) ? 4 : 2; ctxt->lock_prefix = 0; - ctxt->src.val = (unsigned long) error_code; + ctxt->src.val = (size_t) error_code; ret = em_push(ctxt); } @@ -3260,7 +3343,7 @@ int emulator_task_switch(struct x86_emulate_ctxt *ctxt, static void string_addr_inc(struct x86_emulate_ctxt *ctxt, int reg, struct operand *op) { - int df = (ctxt->eflags & X86_EFLAGS_DF) ? -op->count : op->count; + int df = (ctxt->eflags & X86_EFLAGS_DF) ? -(int)op->count : op->count; register_address_increment(ctxt, reg, df * op->bytes); op->addr.mem.ea = register_address(ctxt, reg); @@ -3349,7 +3432,7 @@ static int em_call(struct x86_emulate_ctxt *ctxt) int rc; long rel = ctxt->src.val; - ctxt->src.val = (unsigned long)ctxt->_eip; + ctxt->src.val = (size_t)ctxt->_eip; rc = jmp_rel(ctxt, rel); if (rc != X86EMUL_CONTINUE) return rc; @@ -3389,7 +3472,7 @@ static int em_call_far(struct x86_emulate_ctxt *ctxt) /* If we failed, we tainted the memory, but the very least we should restore cs */ if (rc != X86EMUL_CONTINUE) { - pr_warn_once("faulting far call emulation tainted memory\n"); + //pr_warn_once("faulting far call emulation tainted memory\n"); goto fail; } return rc; @@ -3403,7 +3486,7 @@ fail: static int em_ret_near_imm(struct x86_emulate_ctxt *ctxt) { int rc; - unsigned long eip; + size_t eip; rc = emulate_pop(ctxt, &eip, ctxt->op_bytes); if (rc != X86EMUL_CONTINUE) @@ -3496,7 +3579,7 @@ static int em_movbe(struct x86_emulate_ctxt *ctxt) */ tmp = (u16)ctxt->src.val; ctxt->dst.val &= ~0xffffUL; - ctxt->dst.val |= (unsigned long)swab16(tmp); + ctxt->dst.val |= (size_t)swab16(tmp); break; case 4: ctxt->dst.val = swab32((u32)ctxt->src.val); @@ -3522,7 +3605,7 @@ static int em_cr_write(struct x86_emulate_ctxt *ctxt) static int em_dr_write(struct x86_emulate_ctxt *ctxt) { - unsigned long val; + size_t val; if (ctxt->mode == X86EMUL_MODE_PROT64) val = ctxt->src.val & ~0ULL; @@ -3581,7 +3664,7 @@ static int em_mov_sreg_rm(struct x86_emulate_ctxt *ctxt) return emulate_ud(ctxt); if (ctxt->modrm_reg == VCPU_SREG_SS) - ctxt->interruptibility = KVM_X86_SHADOW_INT_MOV_SS; + ctxt->interruptibility = GVM_X86_SHADOW_INT_MOV_SS; /* Disable writeback. 
*/ ctxt->dst.type = OP_NONE; @@ -3672,6 +3755,8 @@ static int em_sidt(struct x86_emulate_ctxt *ctxt) return emulate_store_desc_ptr(ctxt, ctxt->ops->get_idt); } +// Disable VC warning for unaligned access in desc_ptr +#pragma warning(disable : 4366) static int em_lgdt_lidt(struct x86_emulate_ctxt *ctxt, bool lgdt) { struct desc_ptr desc_ptr; @@ -3695,6 +3780,7 @@ static int em_lgdt_lidt(struct x86_emulate_ctxt *ctxt, bool lgdt) ctxt->dst.type = OP_NONE; return X86EMUL_CONTINUE; } +#pragma warning(default : 4366) static int em_lgdt(struct x86_emulate_ctxt *ctxt) { @@ -3776,7 +3862,7 @@ static int em_sti(struct x86_emulate_ctxt *ctxt) if (emulator_bad_iopl(ctxt)) return emulate_gp(ctxt, 0); - ctxt->interruptibility = KVM_X86_SHADOW_INT_STI; + ctxt->interruptibility = GVM_X86_SHADOW_INT_STI; ctxt->eflags |= X86_EFLAGS_IF; return X86EMUL_CONTINUE; } @@ -3820,11 +3906,11 @@ static int em_bswap(struct x86_emulate_ctxt *ctxt) switch (ctxt->op_bytes) { #ifdef CONFIG_X86_64 case 8: - asm("bswap %0" : "+r"(ctxt->dst.val)); + __bswap64(&ctxt->dst.val); break; #endif default: - asm("bswap %0" : "+r"(*(u32 *)&ctxt->dst.val)); + __bswap32((u32 *)&ctxt->dst.val); break; } return X86EMUL_CONTINUE; @@ -3846,7 +3932,9 @@ static bool valid_cr(int nr) { switch (nr) { case 0: - case 2 ... 4: + case 2: + case 3: + case 4: case 8: return true; default: @@ -3925,7 +4013,7 @@ static int check_cr_write(struct x86_emulate_ctxt *ctxt) static int check_dr7_gd(struct x86_emulate_ctxt *ctxt) { - unsigned long dr7; + size_t dr7; ctxt->ops->get_dr(ctxt, 7, &dr7); @@ -4575,16 +4663,24 @@ static int decode_imm(struct x86_emulate_ctxt *ctxt, struct operand *op, /* NB. Immediates are sign-extended as necessary. */ switch (op->bytes) { case 1: - op->val = insn_fetch(s8, ctxt); + rc = insn_fetch(s8, ctxt, op->val); + if (rc != X86EMUL_CONTINUE) + goto done; break; case 2: - op->val = insn_fetch(s16, ctxt); + rc = insn_fetch(s16, ctxt, op->val); + if (rc != X86EMUL_CONTINUE) + goto done; break; case 4: - op->val = insn_fetch(s32, ctxt); + rc = insn_fetch(s32, ctxt, op->val); + if (rc != X86EMUL_CONTINUE) + goto done; break; case 8: - op->val = insn_fetch(s64, ctxt); + rc = insn_fetch(s64, ctxt, (s64)op->val); + if (rc != X86EMUL_CONTINUE) + goto done; break; } if (!sign_extension) { @@ -4766,7 +4862,6 @@ static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op, break; } -done: return rc; } @@ -4817,7 +4912,10 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len) /* Legacy prefixes. */ for (;;) { - switch (ctxt->b = insn_fetch(u8, ctxt)) { + rc = insn_fetch(u8, ctxt, ctxt->b); + if (rc != X86EMUL_CONTINUE) + goto done; + switch (ctxt->b) { case 0x66: /* operand-size override */ op_prefix = true; /* switch between 2/4 bytes */ @@ -4843,7 +4941,22 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len) has_seg_override = true; ctxt->seg_override = ctxt->b & 7; break; - case 0x40 ... 0x4f: /* REX */ + case 0x40: + case 0x41: + case 0x42: + case 0x43: + case 0x44: + case 0x45: + case 0x46: + case 0x47: + case 0x48: + case 0x49: + case 0x4a: + case 0x4b: + case 0x4c: + case 0x4d: + case 0x4e: + case 0x4f: /* REX */ if (mode != X86EMUL_MODE_PROT64) goto done_prefixes; ctxt->rex_prefix = ctxt->b; @@ -4875,20 +4988,27 @@ done_prefixes: /* Two-byte opcode? 
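/*
 * Illustrative sketch only: the decode changes in this area follow one
 * recurring pattern -- GNU C extensions are rewritten for MSVC.  Case
 * ranges such as "case 0x40 ... 0x4f:" become explicit labels, and the
 * expression-style insn_fetch() becomes a statement that reports failure
 * through a return code checked at each call site.  A minimal model of the
 * call-site shape, with a hypothetical fetch_byte() standing in for the
 * real fetcher:
 */
#include <stdint.h>

#define X86EMUL_CONTINUE      0
#define X86EMUL_UNHANDLEABLE  1

static int fetch_byte(const uint8_t *stream, unsigned *pos, unsigned len,
                      uint8_t *out)
{
    if (*pos >= len)
        return X86EMUL_UNHANDLEABLE;
    *out = stream[(*pos)++];
    return X86EMUL_CONTINUE;
}

static int decode_opcode(const uint8_t *stream, unsigned len, uint8_t *opcode)
{
    unsigned pos = 0;
    int rc;

    /* old style: opcode = insn_fetch(u8, ctxt);  (statement expression) */
    rc = fetch_byte(stream, &pos, len, opcode);   /* new style: check rc */
    if (rc != X86EMUL_CONTINUE)
        return rc;

    switch (*opcode) {
    case 0x40: case 0x41: case 0x42: case 0x43:   /* was "case 0x40 ... 0x47" */
    case 0x44: case 0x45: case 0x46: case 0x47:
        /* REX-prefix handling would go here */
        break;
    default:
        break;
    }
    return X86EMUL_CONTINUE;
}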
*/ if (ctxt->b == 0x0f) { ctxt->opcode_len = 2; - ctxt->b = insn_fetch(u8, ctxt); + rc = insn_fetch(u8, ctxt, ctxt->b); + if (rc != X86EMUL_CONTINUE) + goto done; opcode = twobyte_table[ctxt->b]; /* 0F_38 opcode map */ if (ctxt->b == 0x38) { ctxt->opcode_len = 3; - ctxt->b = insn_fetch(u8, ctxt); + rc = insn_fetch(u8, ctxt, ctxt->b); + if (rc != X86EMUL_CONTINUE) + goto done; opcode = opcode_map_0f_38[ctxt->b]; } } ctxt->d = opcode.flags; - if (ctxt->d & ModRM) - ctxt->modrm = insn_fetch(u8, ctxt); + if (ctxt->d & ModRM) { + rc = insn_fetch(u8, ctxt, ctxt->modrm); + if (rc != X86EMUL_CONTINUE) + goto done; + } /* vex-prefix instructions are not implemented */ if (ctxt->opcode_len == 1 && (ctxt->b == 0xc5 || ctxt->b == 0xc4) && @@ -5069,15 +5189,11 @@ static int flush_pending_x87_faults(struct x86_emulate_ctxt *ctxt) bool fault = false; ctxt->ops->get_fpu(ctxt); - asm volatile("1: fwait \n\t" - "2: \n\t" - ".pushsection .fixup,\"ax\" \n\t" - "3: \n\t" - "movb $1, %[fault] \n\t" - "jmp 2b \n\t" - ".popsection \n\t" - _ASM_EXTABLE(1b, 3b) - : [fault]"+qm"(fault)); + __try { + __fwait(); + } __except(EXCEPTION_EXECUTE_HANDLER) { + fault = true; + } ctxt->ops->put_fpu(ctxt); if (unlikely(fault)) @@ -5093,18 +5209,17 @@ static void fetch_possible_mmx_operand(struct x86_emulate_ctxt *ctxt, read_mmx_reg(ctxt, &op->mm_val, op->addr.mm); } +extern void __asm_fastop(size_t *flags,void *fop, + struct x86_emulate_ctxt *ctxt); static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *)) { - register void *__sp asm(_ASM_SP); - ulong flags = (ctxt->eflags & EFLAGS_MASK) | X86_EFLAGS_IF; + size_t flags = (ctxt->eflags & EFLAGS_MASK) | X86_EFLAGS_IF; + char *__fop = (char *)fop; if (!(ctxt->d & ByteOp)) - fop += __ffs(ctxt->dst.bytes) * FASTOP_SIZE; + __fop += __ffs(ctxt->dst.bytes) * FASTOP_SIZE; - asm("push %[flags]; popf; call *%[fastop]; pushf; pop %[flags]\n" - : "+a"(ctxt->dst.val), "+d"(ctxt->src.val), [flags]"+D"(flags), - [fastop]"+S"(fop), "+r"(__sp) - : "c"(ctxt->src2.val)); + __asm_fastop(&flags, __fop, ctxt); ctxt->eflags = (ctxt->eflags & ~EFLAGS_MASK) | (flags & EFLAGS_MASK); if (!fop) /* exception is returned in fop variable */ @@ -5115,7 +5230,7 @@ static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *)) void init_decode_cache(struct x86_emulate_ctxt *ctxt) { memset(&ctxt->rip_relative, 0, - (void *)&ctxt->modrm - (void *)&ctxt->rip_relative); + (char *)&ctxt->modrm - (char *)&ctxt->rip_relative); ctxt->io_read.pos = 0; ctxt->io_read.end = 0; @@ -5289,14 +5404,36 @@ special_insn: goto threebyte_insn; switch (ctxt->b) { - case 0x70 ... 0x7f: /* jcc (short) */ + case 0x70: /* jcc (short) */ + case 0x71: + case 0x72: + case 0x73: + case 0x74: + case 0x75: + case 0x76: + case 0x77: + case 0x78: + case 0x79: + case 0x7a: + case 0x7b: + case 0x7c: + case 0x7d: + case 0x7e: + case 0x7f: if (test_cc(ctxt->b, ctxt->eflags)) rc = jmp_rel(ctxt, ctxt->src.val); break; case 0x8d: /* lea r16/r32, m */ ctxt->dst.val = ctxt->src.addr.mem.ea; break; - case 0x90 ... 
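/*
 * Illustrative sketch only: the two hunks above remove GCC inline assembly.
 * The fastop dispatch is delegated to an external assembly helper
 * (__asm_fastop) because 64-bit MSVC has no inline asm, and the
 * flush_pending_x87_faults() probe is wrapped in structured exception
 * handling instead of a fixup/extable pair.  The SEH shape, shown here with
 * RaiseException() standing in for a faulting fwait-style probe:
 */
#include <windows.h>
#include <stdbool.h>

static bool probe_faults(bool simulate_fault)
{
    bool fault = false;

    __try {
        if (simulate_fault)
            RaiseException(EXCEPTION_FLT_INVALID_OPERATION, 0, 0, NULL);
        /* in the driver this is where the fwait-style probe would run */
    } __except (EXCEPTION_EXECUTE_HANDLER) {
        fault = true;        /* any raised exception lands here */
    }
    return fault;
}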
0x97: /* nop / xchg reg, rax */ + case 0x90: /* nop / xchg reg, rax */ + case 0x91: + case 0x92: + case 0x93: + case 0x94: + case 0x95: + case 0x96: + case 0x97: if (ctxt->dst.addr.reg == reg_rmw(ctxt, VCPU_REGS_RAX)) ctxt->dst.type = OP_NONE; else @@ -5382,7 +5519,7 @@ writeback: count = ctxt->src.count; else count = ctxt->dst.count; - register_address_increment(ctxt, VCPU_REGS_RCX, -count); + register_address_increment(ctxt, VCPU_REGS_RCX, -(int)count); if (!string_insn_completed(ctxt)) { /* @@ -5436,25 +5573,72 @@ twobyte_insn: case 0x21: /* mov from dr to reg */ ops->get_dr(ctxt, ctxt->modrm_reg, &ctxt->dst.val); break; - case 0x40 ... 0x4f: /* cmov */ + case 0x40: /* cmov */ + case 0x41: + case 0x42: + case 0x43: + case 0x44: + case 0x45: + case 0x46: + case 0x47: + case 0x48: + case 0x49: + case 0x4a: + case 0x4b: + case 0x4c: + case 0x4d: + case 0x4e: + case 0x4f: if (test_cc(ctxt->b, ctxt->eflags)) ctxt->dst.val = ctxt->src.val; else if (ctxt->op_bytes != 4) ctxt->dst.type = OP_NONE; /* no writeback */ break; - case 0x80 ... 0x8f: /* jnz rel, etc*/ + case 0x80: /* jnz rel, etc*/ + case 0x81: + case 0x82: + case 0x83: + case 0x84: + case 0x85: + case 0x86: + case 0x87: + case 0x88: + case 0x89: + case 0x8a: + case 0x8b: + case 0x8c: + case 0x8d: + case 0x8e: + case 0x8f: if (test_cc(ctxt->b, ctxt->eflags)) rc = jmp_rel(ctxt, ctxt->src.val); break; - case 0x90 ... 0x9f: /* setcc r/m8 */ + case 0x90: /* setcc r/m8 */ + case 0x91: + case 0x92: + case 0x93: + case 0x94: + case 0x95: + case 0x96: + case 0x97: + case 0x98: + case 0x99: + case 0x9a: + case 0x9b: + case 0x9c: + case 0x9d: + case 0x9e: + case 0x9f: ctxt->dst.val = test_cc(ctxt->b, ctxt->eflags); break; - case 0xb6 ... 0xb7: /* movzx */ + case 0xb6: /* movzx */ + case 0xb7: ctxt->dst.bytes = ctxt->op_bytes; ctxt->dst.val = (ctxt->src.bytes == 1) ? (u8) ctxt->src.val : (u16) ctxt->src.val; break; - case 0xbe ... 0xbf: /* movsx */ + case 0xbe: /* movsx */ + case 0xbf: ctxt->dst.bytes = ctxt->op_bytes; ctxt->dst.val = (ctxt->src.bytes == 1) ? (s8) ctxt->src.val : (s16) ctxt->src.val; diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c deleted file mode 100644 index 42b1c83..0000000 --- a/arch/x86/kvm/hyperv.c +++ /dev/null @@ -1,1266 +0,0 @@ -/* - * KVM Microsoft Hyper-V emulation - * - * derived from arch/x86/kvm/x86.c - * - * Copyright (C) 2006 Qumranet, Inc. - * Copyright (C) 2008 Qumranet, Inc. - * Copyright IBM Corporation, 2008 - * Copyright 2010 Red Hat, Inc. and/or its affiliates. - * Copyright (C) 2015 Andrey Smetanin <asmetanin@virtuozzo.com> - * - * Authors: - * Avi Kivity <avi@qumranet.com> - * Yaniv Kamay <yaniv@qumranet.com> - * Amit Shah <amit.shah@qumranet.com> - * Ben-Ami Yassour <benami@il.ibm.com> - * Andrey Smetanin <asmetanin@virtuozzo.com> - * - * This work is licensed under the terms of the GNU GPL, version 2. See - * the COPYING file in the top-level directory. 
- * - */ - -#include "x86.h" -#include "lapic.h" -#include "ioapic.h" -#include "hyperv.h" - -#include <linux/kvm_host.h> -#include <linux/highmem.h> -#include <asm/apicdef.h> -#include <trace/events/kvm.h> - -#include "trace.h" - -static inline u64 synic_read_sint(struct kvm_vcpu_hv_synic *synic, int sint) -{ - return atomic64_read(&synic->sint[sint]); -} - -static inline int synic_get_sint_vector(u64 sint_value) -{ - if (sint_value & HV_SYNIC_SINT_MASKED) - return -1; - return sint_value & HV_SYNIC_SINT_VECTOR_MASK; -} - -static bool synic_has_vector_connected(struct kvm_vcpu_hv_synic *synic, - int vector) -{ - int i; - - for (i = 0; i < ARRAY_SIZE(synic->sint); i++) { - if (synic_get_sint_vector(synic_read_sint(synic, i)) == vector) - return true; - } - return false; -} - -static bool synic_has_vector_auto_eoi(struct kvm_vcpu_hv_synic *synic, - int vector) -{ - int i; - u64 sint_value; - - for (i = 0; i < ARRAY_SIZE(synic->sint); i++) { - sint_value = synic_read_sint(synic, i); - if (synic_get_sint_vector(sint_value) == vector && - sint_value & HV_SYNIC_SINT_AUTO_EOI) - return true; - } - return false; -} - -static int synic_set_sint(struct kvm_vcpu_hv_synic *synic, int sint, - u64 data, bool host) -{ - int vector; - - vector = data & HV_SYNIC_SINT_VECTOR_MASK; - if (vector < 16 && !host) - return 1; - /* - * Guest may configure multiple SINTs to use the same vector, so - * we maintain a bitmap of vectors handled by synic, and a - * bitmap of vectors with auto-eoi behavior. The bitmaps are - * updated here, and atomically queried on fast paths. - */ - - atomic64_set(&synic->sint[sint], data); - - if (synic_has_vector_connected(synic, vector)) - __set_bit(vector, synic->vec_bitmap); - else - __clear_bit(vector, synic->vec_bitmap); - - if (synic_has_vector_auto_eoi(synic, vector)) - __set_bit(vector, synic->auto_eoi_bitmap); - else - __clear_bit(vector, synic->auto_eoi_bitmap); - - /* Load SynIC vectors into EOI exit bitmap */ - kvm_make_request(KVM_REQ_SCAN_IOAPIC, synic_to_vcpu(synic)); - return 0; -} - -static struct kvm_vcpu_hv_synic *synic_get(struct kvm *kvm, u32 vcpu_id) -{ - struct kvm_vcpu *vcpu; - struct kvm_vcpu_hv_synic *synic; - - if (vcpu_id >= atomic_read(&kvm->online_vcpus)) - return NULL; - vcpu = kvm_get_vcpu(kvm, vcpu_id); - if (!vcpu) - return NULL; - synic = vcpu_to_synic(vcpu); - return (synic->active) ? 
synic : NULL; -} - -static void synic_clear_sint_msg_pending(struct kvm_vcpu_hv_synic *synic, - u32 sint) -{ - struct kvm_vcpu *vcpu = synic_to_vcpu(synic); - struct page *page; - gpa_t gpa; - struct hv_message *msg; - struct hv_message_page *msg_page; - - gpa = synic->msg_page & PAGE_MASK; - page = kvm_vcpu_gfn_to_page(vcpu, gpa >> PAGE_SHIFT); - if (is_error_page(page)) { - vcpu_err(vcpu, "Hyper-V SynIC can't get msg page, gpa 0x%llx\n", - gpa); - return; - } - msg_page = kmap_atomic(page); - - msg = &msg_page->sint_message[sint]; - msg->header.message_flags.msg_pending = 0; - - kunmap_atomic(msg_page); - kvm_release_page_dirty(page); - kvm_vcpu_mark_page_dirty(vcpu, gpa >> PAGE_SHIFT); -} - -static void kvm_hv_notify_acked_sint(struct kvm_vcpu *vcpu, u32 sint) -{ - struct kvm *kvm = vcpu->kvm; - struct kvm_vcpu_hv_synic *synic = vcpu_to_synic(vcpu); - struct kvm_vcpu_hv *hv_vcpu = vcpu_to_hv_vcpu(vcpu); - struct kvm_vcpu_hv_stimer *stimer; - int gsi, idx, stimers_pending; - - trace_kvm_hv_notify_acked_sint(vcpu->vcpu_id, sint); - - if (synic->msg_page & HV_SYNIC_SIMP_ENABLE) - synic_clear_sint_msg_pending(synic, sint); - - /* Try to deliver pending Hyper-V SynIC timers messages */ - stimers_pending = 0; - for (idx = 0; idx < ARRAY_SIZE(hv_vcpu->stimer); idx++) { - stimer = &hv_vcpu->stimer[idx]; - if (stimer->msg_pending && - (stimer->config & HV_STIMER_ENABLE) && - HV_STIMER_SINT(stimer->config) == sint) { - set_bit(stimer->index, - hv_vcpu->stimer_pending_bitmap); - stimers_pending++; - } - } - if (stimers_pending) - kvm_make_request(KVM_REQ_HV_STIMER, vcpu); - - idx = srcu_read_lock(&kvm->irq_srcu); - gsi = atomic_read(&synic->sint_to_gsi[sint]); - if (gsi != -1) - kvm_notify_acked_gsi(kvm, gsi); - srcu_read_unlock(&kvm->irq_srcu, idx); -} - -static void synic_exit(struct kvm_vcpu_hv_synic *synic, u32 msr) -{ - struct kvm_vcpu *vcpu = synic_to_vcpu(synic); - struct kvm_vcpu_hv *hv_vcpu = &vcpu->arch.hyperv; - - hv_vcpu->exit.type = KVM_EXIT_HYPERV_SYNIC; - hv_vcpu->exit.u.synic.msr = msr; - hv_vcpu->exit.u.synic.control = synic->control; - hv_vcpu->exit.u.synic.evt_page = synic->evt_page; - hv_vcpu->exit.u.synic.msg_page = synic->msg_page; - - kvm_make_request(KVM_REQ_HV_EXIT, vcpu); -} - -static int synic_set_msr(struct kvm_vcpu_hv_synic *synic, - u32 msr, u64 data, bool host) -{ - struct kvm_vcpu *vcpu = synic_to_vcpu(synic); - int ret; - - if (!synic->active) - return 1; - - trace_kvm_hv_synic_set_msr(vcpu->vcpu_id, msr, data, host); - - ret = 0; - switch (msr) { - case HV_X64_MSR_SCONTROL: - synic->control = data; - if (!host) - synic_exit(synic, msr); - break; - case HV_X64_MSR_SVERSION: - if (!host) { - ret = 1; - break; - } - synic->version = data; - break; - case HV_X64_MSR_SIEFP: - if (data & HV_SYNIC_SIEFP_ENABLE) - if (kvm_clear_guest(vcpu->kvm, - data & PAGE_MASK, PAGE_SIZE)) { - ret = 1; - break; - } - synic->evt_page = data; - if (!host) - synic_exit(synic, msr); - break; - case HV_X64_MSR_SIMP: - if (data & HV_SYNIC_SIMP_ENABLE) - if (kvm_clear_guest(vcpu->kvm, - data & PAGE_MASK, PAGE_SIZE)) { - ret = 1; - break; - } - synic->msg_page = data; - if (!host) - synic_exit(synic, msr); - break; - case HV_X64_MSR_EOM: { - int i; - - for (i = 0; i < ARRAY_SIZE(synic->sint); i++) - kvm_hv_notify_acked_sint(vcpu, i); - break; - } - case HV_X64_MSR_SINT0 ... 
HV_X64_MSR_SINT15: - ret = synic_set_sint(synic, msr - HV_X64_MSR_SINT0, data, host); - break; - default: - ret = 1; - break; - } - return ret; -} - -static int synic_get_msr(struct kvm_vcpu_hv_synic *synic, u32 msr, u64 *pdata) -{ - int ret; - - if (!synic->active) - return 1; - - ret = 0; - switch (msr) { - case HV_X64_MSR_SCONTROL: - *pdata = synic->control; - break; - case HV_X64_MSR_SVERSION: - *pdata = synic->version; - break; - case HV_X64_MSR_SIEFP: - *pdata = synic->evt_page; - break; - case HV_X64_MSR_SIMP: - *pdata = synic->msg_page; - break; - case HV_X64_MSR_EOM: - *pdata = 0; - break; - case HV_X64_MSR_SINT0 ... HV_X64_MSR_SINT15: - *pdata = atomic64_read(&synic->sint[msr - HV_X64_MSR_SINT0]); - break; - default: - ret = 1; - break; - } - return ret; -} - -int synic_set_irq(struct kvm_vcpu_hv_synic *synic, u32 sint) -{ - struct kvm_vcpu *vcpu = synic_to_vcpu(synic); - struct kvm_lapic_irq irq; - int ret, vector; - - if (sint >= ARRAY_SIZE(synic->sint)) - return -EINVAL; - - vector = synic_get_sint_vector(synic_read_sint(synic, sint)); - if (vector < 0) - return -ENOENT; - - memset(&irq, 0, sizeof(irq)); - irq.dest_id = kvm_apic_id(vcpu->arch.apic); - irq.dest_mode = APIC_DEST_PHYSICAL; - irq.delivery_mode = APIC_DM_FIXED; - irq.vector = vector; - irq.level = 1; - - ret = kvm_irq_delivery_to_apic(vcpu->kvm, NULL, &irq, NULL); - trace_kvm_hv_synic_set_irq(vcpu->vcpu_id, sint, irq.vector, ret); - return ret; -} - -int kvm_hv_synic_set_irq(struct kvm *kvm, u32 vcpu_id, u32 sint) -{ - struct kvm_vcpu_hv_synic *synic; - - synic = synic_get(kvm, vcpu_id); - if (!synic) - return -EINVAL; - - return synic_set_irq(synic, sint); -} - -void kvm_hv_synic_send_eoi(struct kvm_vcpu *vcpu, int vector) -{ - struct kvm_vcpu_hv_synic *synic = vcpu_to_synic(vcpu); - int i; - - trace_kvm_hv_synic_send_eoi(vcpu->vcpu_id, vector); - - for (i = 0; i < ARRAY_SIZE(synic->sint); i++) - if (synic_get_sint_vector(synic_read_sint(synic, i)) == vector) - kvm_hv_notify_acked_sint(vcpu, i); -} - -static int kvm_hv_set_sint_gsi(struct kvm *kvm, u32 vcpu_id, u32 sint, int gsi) -{ - struct kvm_vcpu_hv_synic *synic; - - synic = synic_get(kvm, vcpu_id); - if (!synic) - return -EINVAL; - - if (sint >= ARRAY_SIZE(synic->sint_to_gsi)) - return -EINVAL; - - atomic_set(&synic->sint_to_gsi[sint], gsi); - return 0; -} - -void kvm_hv_irq_routing_update(struct kvm *kvm) -{ - struct kvm_irq_routing_table *irq_rt; - struct kvm_kernel_irq_routing_entry *e; - u32 gsi; - - irq_rt = srcu_dereference_check(kvm->irq_routing, &kvm->irq_srcu, - lockdep_is_held(&kvm->irq_lock)); - - for (gsi = 0; gsi < irq_rt->nr_rt_entries; gsi++) { - hlist_for_each_entry(e, &irq_rt->map[gsi], link) { - if (e->type == KVM_IRQ_ROUTING_HV_SINT) - kvm_hv_set_sint_gsi(kvm, e->hv_sint.vcpu, - e->hv_sint.sint, gsi); - } - } -} - -static void synic_init(struct kvm_vcpu_hv_synic *synic) -{ - int i; - - memset(synic, 0, sizeof(*synic)); - synic->version = HV_SYNIC_VERSION_1; - for (i = 0; i < ARRAY_SIZE(synic->sint); i++) { - atomic64_set(&synic->sint[i], HV_SYNIC_SINT_MASKED); - atomic_set(&synic->sint_to_gsi[i], -1); - } -} - -static u64 get_time_ref_counter(struct kvm *kvm) -{ - struct kvm_hv *hv = &kvm->arch.hyperv; - struct kvm_vcpu *vcpu; - u64 tsc; - - /* - * The guest has not set up the TSC page or the clock isn't - * stable, fall back to get_kvmclock_ns. 
- */ - if (!hv->tsc_ref.tsc_sequence) - return div_u64(get_kvmclock_ns(kvm), 100); - - vcpu = kvm_get_vcpu(kvm, 0); - tsc = kvm_read_l1_tsc(vcpu, rdtsc()); - return mul_u64_u64_shr(tsc, hv->tsc_ref.tsc_scale, 64) - + hv->tsc_ref.tsc_offset; -} - -static void stimer_mark_pending(struct kvm_vcpu_hv_stimer *stimer, - bool vcpu_kick) -{ - struct kvm_vcpu *vcpu = stimer_to_vcpu(stimer); - - set_bit(stimer->index, - vcpu_to_hv_vcpu(vcpu)->stimer_pending_bitmap); - kvm_make_request(KVM_REQ_HV_STIMER, vcpu); - if (vcpu_kick) - kvm_vcpu_kick(vcpu); -} - -static void stimer_cleanup(struct kvm_vcpu_hv_stimer *stimer) -{ - struct kvm_vcpu *vcpu = stimer_to_vcpu(stimer); - - trace_kvm_hv_stimer_cleanup(stimer_to_vcpu(stimer)->vcpu_id, - stimer->index); - - hrtimer_cancel(&stimer->timer); - clear_bit(stimer->index, - vcpu_to_hv_vcpu(vcpu)->stimer_pending_bitmap); - stimer->msg_pending = false; - stimer->exp_time = 0; -} - -static enum hrtimer_restart stimer_timer_callback(struct hrtimer *timer) -{ - struct kvm_vcpu_hv_stimer *stimer; - - stimer = container_of(timer, struct kvm_vcpu_hv_stimer, timer); - trace_kvm_hv_stimer_callback(stimer_to_vcpu(stimer)->vcpu_id, - stimer->index); - stimer_mark_pending(stimer, true); - - return HRTIMER_NORESTART; -} - -/* - * stimer_start() assumptions: - * a) stimer->count is not equal to 0 - * b) stimer->config has HV_STIMER_ENABLE flag - */ -static int stimer_start(struct kvm_vcpu_hv_stimer *stimer) -{ - u64 time_now; - ktime_t ktime_now; - - time_now = get_time_ref_counter(stimer_to_vcpu(stimer)->kvm); - ktime_now = ktime_get(); - - if (stimer->config & HV_STIMER_PERIODIC) { - if (stimer->exp_time) { - if (time_now >= stimer->exp_time) { - u64 remainder; - - div64_u64_rem(time_now - stimer->exp_time, - stimer->count, &remainder); - stimer->exp_time = - time_now + (stimer->count - remainder); - } - } else - stimer->exp_time = time_now + stimer->count; - - trace_kvm_hv_stimer_start_periodic( - stimer_to_vcpu(stimer)->vcpu_id, - stimer->index, - time_now, stimer->exp_time); - - hrtimer_start(&stimer->timer, - ktime_add_ns(ktime_now, - 100 * (stimer->exp_time - time_now)), - HRTIMER_MODE_ABS); - return 0; - } - stimer->exp_time = stimer->count; - if (time_now >= stimer->count) { - /* - * Expire timer according to Hypervisor Top-Level Functional - * specification v4(15.3.1): - * "If a one shot is enabled and the specified count is in - * the past, it will expire immediately." 
- */ - stimer_mark_pending(stimer, false); - return 0; - } - - trace_kvm_hv_stimer_start_one_shot(stimer_to_vcpu(stimer)->vcpu_id, - stimer->index, - time_now, stimer->count); - - hrtimer_start(&stimer->timer, - ktime_add_ns(ktime_now, 100 * (stimer->count - time_now)), - HRTIMER_MODE_ABS); - return 0; -} - -static int stimer_set_config(struct kvm_vcpu_hv_stimer *stimer, u64 config, - bool host) -{ - trace_kvm_hv_stimer_set_config(stimer_to_vcpu(stimer)->vcpu_id, - stimer->index, config, host); - - stimer_cleanup(stimer); - if ((stimer->config & HV_STIMER_ENABLE) && HV_STIMER_SINT(config) == 0) - config &= ~HV_STIMER_ENABLE; - stimer->config = config; - stimer_mark_pending(stimer, false); - return 0; -} - -static int stimer_set_count(struct kvm_vcpu_hv_stimer *stimer, u64 count, - bool host) -{ - trace_kvm_hv_stimer_set_count(stimer_to_vcpu(stimer)->vcpu_id, - stimer->index, count, host); - - stimer_cleanup(stimer); - stimer->count = count; - if (stimer->count == 0) - stimer->config &= ~HV_STIMER_ENABLE; - else if (stimer->config & HV_STIMER_AUTOENABLE) - stimer->config |= HV_STIMER_ENABLE; - stimer_mark_pending(stimer, false); - return 0; -} - -static int stimer_get_config(struct kvm_vcpu_hv_stimer *stimer, u64 *pconfig) -{ - *pconfig = stimer->config; - return 0; -} - -static int stimer_get_count(struct kvm_vcpu_hv_stimer *stimer, u64 *pcount) -{ - *pcount = stimer->count; - return 0; -} - -static int synic_deliver_msg(struct kvm_vcpu_hv_synic *synic, u32 sint, - struct hv_message *src_msg) -{ - struct kvm_vcpu *vcpu = synic_to_vcpu(synic); - struct page *page; - gpa_t gpa; - struct hv_message *dst_msg; - int r; - struct hv_message_page *msg_page; - - if (!(synic->msg_page & HV_SYNIC_SIMP_ENABLE)) - return -ENOENT; - - gpa = synic->msg_page & PAGE_MASK; - page = kvm_vcpu_gfn_to_page(vcpu, gpa >> PAGE_SHIFT); - if (is_error_page(page)) - return -EFAULT; - - msg_page = kmap_atomic(page); - dst_msg = &msg_page->sint_message[sint]; - if (sync_cmpxchg(&dst_msg->header.message_type, HVMSG_NONE, - src_msg->header.message_type) != HVMSG_NONE) { - dst_msg->header.message_flags.msg_pending = 1; - r = -EAGAIN; - } else { - memcpy(&dst_msg->u.payload, &src_msg->u.payload, - src_msg->header.payload_size); - dst_msg->header.message_type = src_msg->header.message_type; - dst_msg->header.payload_size = src_msg->header.payload_size; - r = synic_set_irq(synic, sint); - if (r >= 1) - r = 0; - else if (r == 0) - r = -EFAULT; - } - kunmap_atomic(msg_page); - kvm_release_page_dirty(page); - kvm_vcpu_mark_page_dirty(vcpu, gpa >> PAGE_SHIFT); - return r; -} - -static int stimer_send_msg(struct kvm_vcpu_hv_stimer *stimer) -{ - struct kvm_vcpu *vcpu = stimer_to_vcpu(stimer); - struct hv_message *msg = &stimer->msg; - struct hv_timer_message_payload *payload = - (struct hv_timer_message_payload *)&msg->u.payload; - - payload->expiration_time = stimer->exp_time; - payload->delivery_time = get_time_ref_counter(vcpu->kvm); - return synic_deliver_msg(vcpu_to_synic(vcpu), - HV_STIMER_SINT(stimer->config), msg); -} - -static void stimer_expiration(struct kvm_vcpu_hv_stimer *stimer) -{ - int r; - - stimer->msg_pending = true; - r = stimer_send_msg(stimer); - trace_kvm_hv_stimer_expiration(stimer_to_vcpu(stimer)->vcpu_id, - stimer->index, r); - if (!r) { - stimer->msg_pending = false; - if (!(stimer->config & HV_STIMER_PERIODIC)) - stimer->config &= ~HV_STIMER_ENABLE; - } -} - -void kvm_hv_process_stimers(struct kvm_vcpu *vcpu) -{ - struct kvm_vcpu_hv *hv_vcpu = vcpu_to_hv_vcpu(vcpu); - struct kvm_vcpu_hv_stimer *stimer; 
- u64 time_now, exp_time; - int i; - - for (i = 0; i < ARRAY_SIZE(hv_vcpu->stimer); i++) - if (test_and_clear_bit(i, hv_vcpu->stimer_pending_bitmap)) { - stimer = &hv_vcpu->stimer[i]; - if (stimer->config & HV_STIMER_ENABLE) { - exp_time = stimer->exp_time; - - if (exp_time) { - time_now = - get_time_ref_counter(vcpu->kvm); - if (time_now >= exp_time) - stimer_expiration(stimer); - } - - if ((stimer->config & HV_STIMER_ENABLE) && - stimer->count) - stimer_start(stimer); - else - stimer_cleanup(stimer); - } - } -} - -void kvm_hv_vcpu_uninit(struct kvm_vcpu *vcpu) -{ - struct kvm_vcpu_hv *hv_vcpu = vcpu_to_hv_vcpu(vcpu); - int i; - - for (i = 0; i < ARRAY_SIZE(hv_vcpu->stimer); i++) - stimer_cleanup(&hv_vcpu->stimer[i]); -} - -static void stimer_prepare_msg(struct kvm_vcpu_hv_stimer *stimer) -{ - struct hv_message *msg = &stimer->msg; - struct hv_timer_message_payload *payload = - (struct hv_timer_message_payload *)&msg->u.payload; - - memset(&msg->header, 0, sizeof(msg->header)); - msg->header.message_type = HVMSG_TIMER_EXPIRED; - msg->header.payload_size = sizeof(*payload); - - payload->timer_index = stimer->index; - payload->expiration_time = 0; - payload->delivery_time = 0; -} - -static void stimer_init(struct kvm_vcpu_hv_stimer *stimer, int timer_index) -{ - memset(stimer, 0, sizeof(*stimer)); - stimer->index = timer_index; - hrtimer_init(&stimer->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); - stimer->timer.function = stimer_timer_callback; - stimer_prepare_msg(stimer); -} - -void kvm_hv_vcpu_init(struct kvm_vcpu *vcpu) -{ - struct kvm_vcpu_hv *hv_vcpu = vcpu_to_hv_vcpu(vcpu); - int i; - - synic_init(&hv_vcpu->synic); - - bitmap_zero(hv_vcpu->stimer_pending_bitmap, HV_SYNIC_STIMER_COUNT); - for (i = 0; i < ARRAY_SIZE(hv_vcpu->stimer); i++) - stimer_init(&hv_vcpu->stimer[i], i); -} - -int kvm_hv_activate_synic(struct kvm_vcpu *vcpu) -{ - /* - * Hyper-V SynIC auto EOI SINT's are - * not compatible with APICV, so deactivate APICV - */ - kvm_vcpu_deactivate_apicv(vcpu); - vcpu_to_synic(vcpu)->active = true; - return 0; -} - -static bool kvm_hv_msr_partition_wide(u32 msr) -{ - bool r = false; - - switch (msr) { - case HV_X64_MSR_GUEST_OS_ID: - case HV_X64_MSR_HYPERCALL: - case HV_X64_MSR_REFERENCE_TSC: - case HV_X64_MSR_TIME_REF_COUNT: - case HV_X64_MSR_CRASH_CTL: - case HV_X64_MSR_CRASH_P0 ... 
HV_X64_MSR_CRASH_P4: - case HV_X64_MSR_RESET: - r = true; - break; - } - - return r; -} - -static int kvm_hv_msr_get_crash_data(struct kvm_vcpu *vcpu, - u32 index, u64 *pdata) -{ - struct kvm_hv *hv = &vcpu->kvm->arch.hyperv; - - if (WARN_ON_ONCE(index >= ARRAY_SIZE(hv->hv_crash_param))) - return -EINVAL; - - *pdata = hv->hv_crash_param[index]; - return 0; -} - -static int kvm_hv_msr_get_crash_ctl(struct kvm_vcpu *vcpu, u64 *pdata) -{ - struct kvm_hv *hv = &vcpu->kvm->arch.hyperv; - - *pdata = hv->hv_crash_ctl; - return 0; -} - -static int kvm_hv_msr_set_crash_ctl(struct kvm_vcpu *vcpu, u64 data, bool host) -{ - struct kvm_hv *hv = &vcpu->kvm->arch.hyperv; - - if (host) - hv->hv_crash_ctl = data & HV_X64_MSR_CRASH_CTL_NOTIFY; - - if (!host && (data & HV_X64_MSR_CRASH_CTL_NOTIFY)) { - - vcpu_debug(vcpu, "hv crash (0x%llx 0x%llx 0x%llx 0x%llx 0x%llx)\n", - hv->hv_crash_param[0], - hv->hv_crash_param[1], - hv->hv_crash_param[2], - hv->hv_crash_param[3], - hv->hv_crash_param[4]); - - /* Send notification about crash to user space */ - kvm_make_request(KVM_REQ_HV_CRASH, vcpu); - } - - return 0; -} - -static int kvm_hv_msr_set_crash_data(struct kvm_vcpu *vcpu, - u32 index, u64 data) -{ - struct kvm_hv *hv = &vcpu->kvm->arch.hyperv; - - if (WARN_ON_ONCE(index >= ARRAY_SIZE(hv->hv_crash_param))) - return -EINVAL; - - hv->hv_crash_param[index] = data; - return 0; -} - -/* - * The kvmclock and Hyper-V TSC page use similar formulas, and converting - * between them is possible: - * - * kvmclock formula: - * nsec = (ticks - tsc_timestamp) * tsc_to_system_mul * 2^(tsc_shift-32) - * + system_time - * - * Hyper-V formula: - * nsec/100 = ticks * scale / 2^64 + offset - * - * When tsc_timestamp = system_time = 0, offset is zero in the Hyper-V formula. - * By dividing the kvmclock formula by 100 and equating what's left we get: - * ticks * scale / 2^64 = ticks * tsc_to_system_mul * 2^(tsc_shift-32) / 100 - * scale / 2^64 = tsc_to_system_mul * 2^(tsc_shift-32) / 100 - * scale = tsc_to_system_mul * 2^(32+tsc_shift) / 100 - * - * Now expand the kvmclock formula and divide by 100: - * nsec = ticks * tsc_to_system_mul * 2^(tsc_shift-32) - * - tsc_timestamp * tsc_to_system_mul * 2^(tsc_shift-32) - * + system_time - * nsec/100 = ticks * tsc_to_system_mul * 2^(tsc_shift-32) / 100 - * - tsc_timestamp * tsc_to_system_mul * 2^(tsc_shift-32) / 100 - * + system_time / 100 - * - * Replace tsc_to_system_mul * 2^(tsc_shift-32) / 100 by scale / 2^64: - * nsec/100 = ticks * scale / 2^64 - * - tsc_timestamp * scale / 2^64 - * + system_time / 100 - * - * Equate with the Hyper-V formula so that ticks * scale / 2^64 cancels out: - * offset = system_time / 100 - tsc_timestamp * scale / 2^64 - * - * These two equivalencies are implemented in this function. - */ -static bool compute_tsc_page_parameters(struct pvclock_vcpu_time_info *hv_clock, - HV_REFERENCE_TSC_PAGE *tsc_ref) -{ - u64 max_mul; - - if (!(hv_clock->flags & PVCLOCK_TSC_STABLE_BIT)) - return false; - - /* - * check if scale would overflow, if so we use the time ref counter - * tsc_to_system_mul * 2^(tsc_shift+32) / 100 >= 2^64 - * tsc_to_system_mul / 100 >= 2^(32-tsc_shift) - * tsc_to_system_mul >= 100 * 2^(32-tsc_shift) - */ - max_mul = 100ull << (32 - hv_clock->tsc_shift); - if (hv_clock->tsc_to_system_mul >= max_mul) - return false; - - /* - * Otherwise compute the scale and offset according to the formulas - * derived above. 
- */ - tsc_ref->tsc_scale = - mul_u64_u32_div(1ULL << (32 + hv_clock->tsc_shift), - hv_clock->tsc_to_system_mul, - 100); - - tsc_ref->tsc_offset = hv_clock->system_time; - do_div(tsc_ref->tsc_offset, 100); - tsc_ref->tsc_offset -= - mul_u64_u64_shr(hv_clock->tsc_timestamp, tsc_ref->tsc_scale, 64); - return true; -} - -void kvm_hv_setup_tsc_page(struct kvm *kvm, - struct pvclock_vcpu_time_info *hv_clock) -{ - struct kvm_hv *hv = &kvm->arch.hyperv; - u32 tsc_seq; - u64 gfn; - - BUILD_BUG_ON(sizeof(tsc_seq) != sizeof(hv->tsc_ref.tsc_sequence)); - BUILD_BUG_ON(offsetof(HV_REFERENCE_TSC_PAGE, tsc_sequence) != 0); - - if (!(hv->hv_tsc_page & HV_X64_MSR_TSC_REFERENCE_ENABLE)) - return; - - gfn = hv->hv_tsc_page >> HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT; - /* - * Because the TSC parameters only vary when there is a - * change in the master clock, do not bother with caching. - */ - if (unlikely(kvm_read_guest(kvm, gfn_to_gpa(gfn), - &tsc_seq, sizeof(tsc_seq)))) - return; - - /* - * While we're computing and writing the parameters, force the - * guest to use the time reference count MSR. - */ - hv->tsc_ref.tsc_sequence = 0; - if (kvm_write_guest(kvm, gfn_to_gpa(gfn), - &hv->tsc_ref, sizeof(hv->tsc_ref.tsc_sequence))) - return; - - if (!compute_tsc_page_parameters(hv_clock, &hv->tsc_ref)) - return; - - /* Ensure sequence is zero before writing the rest of the struct. */ - smp_wmb(); - if (kvm_write_guest(kvm, gfn_to_gpa(gfn), &hv->tsc_ref, sizeof(hv->tsc_ref))) - return; - - /* - * Now switch to the TSC page mechanism by writing the sequence. - */ - tsc_seq++; - if (tsc_seq == 0xFFFFFFFF || tsc_seq == 0) - tsc_seq = 1; - - /* Write the struct entirely before the non-zero sequence. */ - smp_wmb(); - - hv->tsc_ref.tsc_sequence = tsc_seq; - kvm_write_guest(kvm, gfn_to_gpa(gfn), - &hv->tsc_ref, sizeof(hv->tsc_ref.tsc_sequence)); -} - -static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data, - bool host) -{ - struct kvm *kvm = vcpu->kvm; - struct kvm_hv *hv = &kvm->arch.hyperv; - - switch (msr) { - case HV_X64_MSR_GUEST_OS_ID: - hv->hv_guest_os_id = data; - /* setting guest os id to zero disables hypercall page */ - if (!hv->hv_guest_os_id) - hv->hv_hypercall &= ~HV_X64_MSR_HYPERCALL_ENABLE; - break; - case HV_X64_MSR_HYPERCALL: { - u64 gfn; - unsigned long addr; - u8 instructions[4]; - - /* if guest os id is not set hypercall should remain disabled */ - if (!hv->hv_guest_os_id) - break; - if (!(data & HV_X64_MSR_HYPERCALL_ENABLE)) { - hv->hv_hypercall = data; - break; - } - gfn = data >> HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_SHIFT; - addr = gfn_to_hva(kvm, gfn); - if (kvm_is_error_hva(addr)) - return 1; - kvm_x86_ops->patch_hypercall(vcpu, instructions); - ((unsigned char *)instructions)[3] = 0xc3; /* ret */ - if (__copy_to_user((void __user *)addr, instructions, 4)) - return 1; - hv->hv_hypercall = data; - mark_page_dirty(kvm, gfn); - break; - } - case HV_X64_MSR_REFERENCE_TSC: - hv->hv_tsc_page = data; - if (hv->hv_tsc_page & HV_X64_MSR_TSC_REFERENCE_ENABLE) - kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu); - break; - case HV_X64_MSR_CRASH_P0 ... 
HV_X64_MSR_CRASH_P4: - return kvm_hv_msr_set_crash_data(vcpu, - msr - HV_X64_MSR_CRASH_P0, - data); - case HV_X64_MSR_CRASH_CTL: - return kvm_hv_msr_set_crash_ctl(vcpu, data, host); - case HV_X64_MSR_RESET: - if (data == 1) { - vcpu_debug(vcpu, "hyper-v reset requested\n"); - kvm_make_request(KVM_REQ_HV_RESET, vcpu); - } - break; - default: - vcpu_unimpl(vcpu, "Hyper-V uhandled wrmsr: 0x%x data 0x%llx\n", - msr, data); - return 1; - } - return 0; -} - -/* Calculate cpu time spent by current task in 100ns units */ -static u64 current_task_runtime_100ns(void) -{ - cputime_t utime, stime; - - task_cputime_adjusted(current, &utime, &stime); - return div_u64(cputime_to_nsecs(utime + stime), 100); -} - -static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host) -{ - struct kvm_vcpu_hv *hv = &vcpu->arch.hyperv; - - switch (msr) { - case HV_X64_MSR_APIC_ASSIST_PAGE: { - u64 gfn; - unsigned long addr; - - if (!(data & HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE)) { - hv->hv_vapic = data; - if (kvm_lapic_enable_pv_eoi(vcpu, 0)) - return 1; - break; - } - gfn = data >> HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_SHIFT; - addr = kvm_vcpu_gfn_to_hva(vcpu, gfn); - if (kvm_is_error_hva(addr)) - return 1; - if (__clear_user((void __user *)addr, PAGE_SIZE)) - return 1; - hv->hv_vapic = data; - kvm_vcpu_mark_page_dirty(vcpu, gfn); - if (kvm_lapic_enable_pv_eoi(vcpu, - gfn_to_gpa(gfn) | KVM_MSR_ENABLED)) - return 1; - break; - } - case HV_X64_MSR_EOI: - return kvm_hv_vapic_msr_write(vcpu, APIC_EOI, data); - case HV_X64_MSR_ICR: - return kvm_hv_vapic_msr_write(vcpu, APIC_ICR, data); - case HV_X64_MSR_TPR: - return kvm_hv_vapic_msr_write(vcpu, APIC_TASKPRI, data); - case HV_X64_MSR_VP_RUNTIME: - if (!host) - return 1; - hv->runtime_offset = data - current_task_runtime_100ns(); - break; - case HV_X64_MSR_SCONTROL: - case HV_X64_MSR_SVERSION: - case HV_X64_MSR_SIEFP: - case HV_X64_MSR_SIMP: - case HV_X64_MSR_EOM: - case HV_X64_MSR_SINT0 ... HV_X64_MSR_SINT15: - return synic_set_msr(vcpu_to_synic(vcpu), msr, data, host); - case HV_X64_MSR_STIMER0_CONFIG: - case HV_X64_MSR_STIMER1_CONFIG: - case HV_X64_MSR_STIMER2_CONFIG: - case HV_X64_MSR_STIMER3_CONFIG: { - int timer_index = (msr - HV_X64_MSR_STIMER0_CONFIG)/2; - - return stimer_set_config(vcpu_to_stimer(vcpu, timer_index), - data, host); - } - case HV_X64_MSR_STIMER0_COUNT: - case HV_X64_MSR_STIMER1_COUNT: - case HV_X64_MSR_STIMER2_COUNT: - case HV_X64_MSR_STIMER3_COUNT: { - int timer_index = (msr - HV_X64_MSR_STIMER0_COUNT)/2; - - return stimer_set_count(vcpu_to_stimer(vcpu, timer_index), - data, host); - } - default: - vcpu_unimpl(vcpu, "Hyper-V uhandled wrmsr: 0x%x data 0x%llx\n", - msr, data); - return 1; - } - - return 0; -} - -static int kvm_hv_get_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) -{ - u64 data = 0; - struct kvm *kvm = vcpu->kvm; - struct kvm_hv *hv = &kvm->arch.hyperv; - - switch (msr) { - case HV_X64_MSR_GUEST_OS_ID: - data = hv->hv_guest_os_id; - break; - case HV_X64_MSR_HYPERCALL: - data = hv->hv_hypercall; - break; - case HV_X64_MSR_TIME_REF_COUNT: - data = get_time_ref_counter(kvm); - break; - case HV_X64_MSR_REFERENCE_TSC: - data = hv->hv_tsc_page; - break; - case HV_X64_MSR_CRASH_P0 ... 
HV_X64_MSR_CRASH_P4: - return kvm_hv_msr_get_crash_data(vcpu, - msr - HV_X64_MSR_CRASH_P0, - pdata); - case HV_X64_MSR_CRASH_CTL: - return kvm_hv_msr_get_crash_ctl(vcpu, pdata); - case HV_X64_MSR_RESET: - data = 0; - break; - default: - vcpu_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr); - return 1; - } - - *pdata = data; - return 0; -} - -static int kvm_hv_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) -{ - u64 data = 0; - struct kvm_vcpu_hv *hv = &vcpu->arch.hyperv; - - switch (msr) { - case HV_X64_MSR_VP_INDEX: { - int r; - struct kvm_vcpu *v; - - kvm_for_each_vcpu(r, v, vcpu->kvm) { - if (v == vcpu) { - data = r; - break; - } - } - break; - } - case HV_X64_MSR_EOI: - return kvm_hv_vapic_msr_read(vcpu, APIC_EOI, pdata); - case HV_X64_MSR_ICR: - return kvm_hv_vapic_msr_read(vcpu, APIC_ICR, pdata); - case HV_X64_MSR_TPR: - return kvm_hv_vapic_msr_read(vcpu, APIC_TASKPRI, pdata); - case HV_X64_MSR_APIC_ASSIST_PAGE: - data = hv->hv_vapic; - break; - case HV_X64_MSR_VP_RUNTIME: - data = current_task_runtime_100ns() + hv->runtime_offset; - break; - case HV_X64_MSR_SCONTROL: - case HV_X64_MSR_SVERSION: - case HV_X64_MSR_SIEFP: - case HV_X64_MSR_SIMP: - case HV_X64_MSR_EOM: - case HV_X64_MSR_SINT0 ... HV_X64_MSR_SINT15: - return synic_get_msr(vcpu_to_synic(vcpu), msr, pdata); - case HV_X64_MSR_STIMER0_CONFIG: - case HV_X64_MSR_STIMER1_CONFIG: - case HV_X64_MSR_STIMER2_CONFIG: - case HV_X64_MSR_STIMER3_CONFIG: { - int timer_index = (msr - HV_X64_MSR_STIMER0_CONFIG)/2; - - return stimer_get_config(vcpu_to_stimer(vcpu, timer_index), - pdata); - } - case HV_X64_MSR_STIMER0_COUNT: - case HV_X64_MSR_STIMER1_COUNT: - case HV_X64_MSR_STIMER2_COUNT: - case HV_X64_MSR_STIMER3_COUNT: { - int timer_index = (msr - HV_X64_MSR_STIMER0_COUNT)/2; - - return stimer_get_count(vcpu_to_stimer(vcpu, timer_index), - pdata); - } - default: - vcpu_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr); - return 1; - } - *pdata = data; - return 0; -} - -int kvm_hv_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host) -{ - if (kvm_hv_msr_partition_wide(msr)) { - int r; - - mutex_lock(&vcpu->kvm->lock); - r = kvm_hv_set_msr_pw(vcpu, msr, data, host); - mutex_unlock(&vcpu->kvm->lock); - return r; - } else - return kvm_hv_set_msr(vcpu, msr, data, host); -} - -int kvm_hv_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) -{ - if (kvm_hv_msr_partition_wide(msr)) { - int r; - - mutex_lock(&vcpu->kvm->lock); - r = kvm_hv_get_msr_pw(vcpu, msr, pdata); - mutex_unlock(&vcpu->kvm->lock); - return r; - } else - return kvm_hv_get_msr(vcpu, msr, pdata); -} - -bool kvm_hv_hypercall_enabled(struct kvm *kvm) -{ - return kvm->arch.hyperv.hv_hypercall & HV_X64_MSR_HYPERCALL_ENABLE; -} - -static void kvm_hv_hypercall_set_result(struct kvm_vcpu *vcpu, u64 result) -{ - bool longmode; - - longmode = is_64_bit_mode(vcpu); - if (longmode) - kvm_register_write(vcpu, VCPU_REGS_RAX, result); - else { - kvm_register_write(vcpu, VCPU_REGS_RDX, result >> 32); - kvm_register_write(vcpu, VCPU_REGS_RAX, result & 0xffffffff); - } -} - -static int kvm_hv_hypercall_complete_userspace(struct kvm_vcpu *vcpu) -{ - struct kvm_run *run = vcpu->run; - - kvm_hv_hypercall_set_result(vcpu, run->hyperv.u.hcall.result); - return 1; -} - -int kvm_hv_hypercall(struct kvm_vcpu *vcpu) -{ - u64 param, ingpa, outgpa, ret; - uint16_t code, rep_idx, rep_cnt, res = HV_STATUS_SUCCESS, rep_done = 0; - bool fast, longmode; - - /* - * hypercall generates UD from non zero cpl and real mode - * per HYPER-V spec - */ - if (kvm_x86_ops->get_cpl(vcpu) != 
0 || !is_protmode(vcpu)) { - kvm_queue_exception(vcpu, UD_VECTOR); - return 1; - } - - longmode = is_64_bit_mode(vcpu); - - if (!longmode) { - param = ((u64)kvm_register_read(vcpu, VCPU_REGS_RDX) << 32) | - (kvm_register_read(vcpu, VCPU_REGS_RAX) & 0xffffffff); - ingpa = ((u64)kvm_register_read(vcpu, VCPU_REGS_RBX) << 32) | - (kvm_register_read(vcpu, VCPU_REGS_RCX) & 0xffffffff); - outgpa = ((u64)kvm_register_read(vcpu, VCPU_REGS_RDI) << 32) | - (kvm_register_read(vcpu, VCPU_REGS_RSI) & 0xffffffff); - } -#ifdef CONFIG_X86_64 - else { - param = kvm_register_read(vcpu, VCPU_REGS_RCX); - ingpa = kvm_register_read(vcpu, VCPU_REGS_RDX); - outgpa = kvm_register_read(vcpu, VCPU_REGS_R8); - } -#endif - - code = param & 0xffff; - fast = (param >> 16) & 0x1; - rep_cnt = (param >> 32) & 0xfff; - rep_idx = (param >> 48) & 0xfff; - - trace_kvm_hv_hypercall(code, fast, rep_cnt, rep_idx, ingpa, outgpa); - - /* Hypercall continuation is not supported yet */ - if (rep_cnt || rep_idx) { - res = HV_STATUS_INVALID_HYPERCALL_CODE; - goto set_result; - } - - switch (code) { - case HVCALL_NOTIFY_LONG_SPIN_WAIT: - kvm_vcpu_on_spin(vcpu); - break; - case HVCALL_POST_MESSAGE: - case HVCALL_SIGNAL_EVENT: - /* don't bother userspace if it has no way to handle it */ - if (!vcpu_to_synic(vcpu)->active) { - res = HV_STATUS_INVALID_HYPERCALL_CODE; - break; - } - vcpu->run->exit_reason = KVM_EXIT_HYPERV; - vcpu->run->hyperv.type = KVM_EXIT_HYPERV_HCALL; - vcpu->run->hyperv.u.hcall.input = param; - vcpu->run->hyperv.u.hcall.params[0] = ingpa; - vcpu->run->hyperv.u.hcall.params[1] = outgpa; - vcpu->arch.complete_userspace_io = - kvm_hv_hypercall_complete_userspace; - return 0; - default: - res = HV_STATUS_INVALID_HYPERCALL_CODE; - break; - } - -set_result: - ret = res | (((u64)rep_done & 0xfff) << 32); - kvm_hv_hypercall_set_result(vcpu, ret); - return 1; -} diff --git a/arch/x86/kvm/hyperv.h b/arch/x86/kvm/hyperv.h deleted file mode 100644 index cd11195..0000000 --- a/arch/x86/kvm/hyperv.h +++ /dev/null @@ -1,90 +0,0 @@ -/* - * KVM Microsoft Hyper-V emulation - * - * derived from arch/x86/kvm/x86.c - * - * Copyright (C) 2006 Qumranet, Inc. - * Copyright (C) 2008 Qumranet, Inc. - * Copyright IBM Corporation, 2008 - * Copyright 2010 Red Hat, Inc. and/or its affiliates. - * Copyright (C) 2015 Andrey Smetanin <asmetanin@virtuozzo.com> - * - * Authors: - * Avi Kivity <avi@qumranet.com> - * Yaniv Kamay <yaniv@qumranet.com> - * Amit Shah <amit.shah@qumranet.com> - * Ben-Ami Yassour <benami@il.ibm.com> - * Andrey Smetanin <asmetanin@virtuozzo.com> - * - * This work is licensed under the terms of the GNU GPL, version 2. See - * the COPYING file in the top-level directory. 
- * - */ - -#ifndef __ARCH_X86_KVM_HYPERV_H__ -#define __ARCH_X86_KVM_HYPERV_H__ - -static inline struct kvm_vcpu_hv *vcpu_to_hv_vcpu(struct kvm_vcpu *vcpu) -{ - return &vcpu->arch.hyperv; -} - -static inline struct kvm_vcpu *hv_vcpu_to_vcpu(struct kvm_vcpu_hv *hv_vcpu) -{ - struct kvm_vcpu_arch *arch; - - arch = container_of(hv_vcpu, struct kvm_vcpu_arch, hyperv); - return container_of(arch, struct kvm_vcpu, arch); -} - -static inline struct kvm_vcpu_hv_synic *vcpu_to_synic(struct kvm_vcpu *vcpu) -{ - return &vcpu->arch.hyperv.synic; -} - -static inline struct kvm_vcpu *synic_to_vcpu(struct kvm_vcpu_hv_synic *synic) -{ - return hv_vcpu_to_vcpu(container_of(synic, struct kvm_vcpu_hv, synic)); -} - -int kvm_hv_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host); -int kvm_hv_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata); - -bool kvm_hv_hypercall_enabled(struct kvm *kvm); -int kvm_hv_hypercall(struct kvm_vcpu *vcpu); - -void kvm_hv_irq_routing_update(struct kvm *kvm); -int kvm_hv_synic_set_irq(struct kvm *kvm, u32 vcpu_id, u32 sint); -void kvm_hv_synic_send_eoi(struct kvm_vcpu *vcpu, int vector); -int kvm_hv_activate_synic(struct kvm_vcpu *vcpu); - -void kvm_hv_vcpu_init(struct kvm_vcpu *vcpu); -void kvm_hv_vcpu_uninit(struct kvm_vcpu *vcpu); - -static inline struct kvm_vcpu_hv_stimer *vcpu_to_stimer(struct kvm_vcpu *vcpu, - int timer_index) -{ - return &vcpu_to_hv_vcpu(vcpu)->stimer[timer_index]; -} - -static inline struct kvm_vcpu *stimer_to_vcpu(struct kvm_vcpu_hv_stimer *stimer) -{ - struct kvm_vcpu_hv *hv_vcpu; - - hv_vcpu = container_of(stimer - stimer->index, struct kvm_vcpu_hv, - stimer[0]); - return hv_vcpu_to_vcpu(hv_vcpu); -} - -static inline bool kvm_hv_has_stimer_pending(struct kvm_vcpu *vcpu) -{ - return !bitmap_empty(vcpu->arch.hyperv.stimer_pending_bitmap, - HV_SYNIC_STIMER_COUNT); -} - -void kvm_hv_process_stimers(struct kvm_vcpu *vcpu); - -void kvm_hv_setup_tsc_page(struct kvm *kvm, - struct pvclock_vcpu_time_info *hv_clock); - -#endif diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c deleted file mode 100644 index 16a7134..0000000 --- a/arch/x86/kvm/i8254.c +++ /dev/null @@ -1,738 +0,0 @@ -/* - * 8253/8254 interval timer emulation - * - * Copyright (c) 2003-2004 Fabrice Bellard - * Copyright (c) 2006 Intel Corporation - * Copyright (c) 2007 Keir Fraser, XenSource Inc - * Copyright (c) 2008 Intel Corporation - * Copyright 2009 Red Hat, Inc. and/or its affiliates. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. 
- * - * Authors: - * Sheng Yang <sheng.yang@intel.com> - * Based on QEMU and Xen. - */ - -#define pr_fmt(fmt) "pit: " fmt - -#include <linux/kvm_host.h> -#include <linux/slab.h> - -#include "ioapic.h" -#include "irq.h" -#include "i8254.h" -#include "x86.h" - -#ifndef CONFIG_X86_64 -#define mod_64(x, y) ((x) - (y) * div64_u64(x, y)) -#else -#define mod_64(x, y) ((x) % (y)) -#endif - -#define RW_STATE_LSB 1 -#define RW_STATE_MSB 2 -#define RW_STATE_WORD0 3 -#define RW_STATE_WORD1 4 - -static void pit_set_gate(struct kvm_pit *pit, int channel, u32 val) -{ - struct kvm_kpit_channel_state *c = &pit->pit_state.channels[channel]; - - switch (c->mode) { - default: - case 0: - case 4: - /* XXX: just disable/enable counting */ - break; - case 1: - case 2: - case 3: - case 5: - /* Restart counting on rising edge. */ - if (c->gate < val) - c->count_load_time = ktime_get(); - break; - } - - c->gate = val; -} - -static int pit_get_gate(struct kvm_pit *pit, int channel) -{ - return pit->pit_state.channels[channel].gate; -} - -static s64 __kpit_elapsed(struct kvm_pit *pit) -{ - s64 elapsed; - ktime_t remaining; - struct kvm_kpit_state *ps = &pit->pit_state; - - if (!ps->period) - return 0; - - /* - * The Counter does not stop when it reaches zero. In - * Modes 0, 1, 4, and 5 the Counter ``wraps around'' to - * the highest count, either FFFF hex for binary counting - * or 9999 for BCD counting, and continues counting. - * Modes 2 and 3 are periodic; the Counter reloads - * itself with the initial count and continues counting - * from there. - */ - remaining = hrtimer_get_remaining(&ps->timer); - elapsed = ps->period - ktime_to_ns(remaining); - - return elapsed; -} - -static s64 kpit_elapsed(struct kvm_pit *pit, struct kvm_kpit_channel_state *c, - int channel) -{ - if (channel == 0) - return __kpit_elapsed(pit); - - return ktime_to_ns(ktime_sub(ktime_get(), c->count_load_time)); -} - -static int pit_get_count(struct kvm_pit *pit, int channel) -{ - struct kvm_kpit_channel_state *c = &pit->pit_state.channels[channel]; - s64 d, t; - int counter; - - t = kpit_elapsed(pit, c, channel); - d = mul_u64_u32_div(t, KVM_PIT_FREQ, NSEC_PER_SEC); - - switch (c->mode) { - case 0: - case 1: - case 4: - case 5: - counter = (c->count - d) & 0xffff; - break; - case 3: - /* XXX: may be incorrect for odd counts */ - counter = c->count - (mod_64((2 * d), c->count)); - break; - default: - counter = c->count - mod_64(d, c->count); - break; - } - return counter; -} - -static int pit_get_out(struct kvm_pit *pit, int channel) -{ - struct kvm_kpit_channel_state *c = &pit->pit_state.channels[channel]; - s64 d, t; - int out; - - t = kpit_elapsed(pit, c, channel); - d = mul_u64_u32_div(t, KVM_PIT_FREQ, NSEC_PER_SEC); - - switch (c->mode) { - default: - case 0: - out = (d >= c->count); - break; - case 1: - out = (d < c->count); - break; - case 2: - out = ((mod_64(d, c->count) == 0) && (d != 0)); - break; - case 3: - out = (mod_64(d, c->count) < ((c->count + 1) >> 1)); - break; - case 4: - case 5: - out = (d == c->count); - break; - } - - return out; -} - -static void pit_latch_count(struct kvm_pit *pit, int channel) -{ - struct kvm_kpit_channel_state *c = &pit->pit_state.channels[channel]; - - if (!c->count_latched) { - c->latched_count = pit_get_count(pit, channel); - c->count_latched = c->rw_mode; - } -} - -static void pit_latch_status(struct kvm_pit *pit, int channel) -{ - struct kvm_kpit_channel_state *c = &pit->pit_state.channels[channel]; - - if (!c->status_latched) { - /* TODO: Return NULL COUNT (bit 6). 
*/ - c->status = ((pit_get_out(pit, channel) << 7) | - (c->rw_mode << 4) | - (c->mode << 1) | - c->bcd); - c->status_latched = 1; - } -} - -static inline struct kvm_pit *pit_state_to_pit(struct kvm_kpit_state *ps) -{ - return container_of(ps, struct kvm_pit, pit_state); -} - -static void kvm_pit_ack_irq(struct kvm_irq_ack_notifier *kian) -{ - struct kvm_kpit_state *ps = container_of(kian, struct kvm_kpit_state, - irq_ack_notifier); - struct kvm_pit *pit = pit_state_to_pit(ps); - - atomic_set(&ps->irq_ack, 1); - /* irq_ack should be set before pending is read. Order accesses with - * inc(pending) in pit_timer_fn and xchg(irq_ack, 0) in pit_do_work. - */ - smp_mb(); - if (atomic_dec_if_positive(&ps->pending) > 0) - kthread_queue_work(&pit->worker, &pit->expired); -} - -void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu) -{ - struct kvm_pit *pit = vcpu->kvm->arch.vpit; - struct hrtimer *timer; - - if (!kvm_vcpu_is_bsp(vcpu) || !pit) - return; - - timer = &pit->pit_state.timer; - mutex_lock(&pit->pit_state.lock); - if (hrtimer_cancel(timer)) - hrtimer_start_expires(timer, HRTIMER_MODE_ABS); - mutex_unlock(&pit->pit_state.lock); -} - -static void destroy_pit_timer(struct kvm_pit *pit) -{ - hrtimer_cancel(&pit->pit_state.timer); - kthread_flush_work(&pit->expired); -} - -static void pit_do_work(struct kthread_work *work) -{ - struct kvm_pit *pit = container_of(work, struct kvm_pit, expired); - struct kvm *kvm = pit->kvm; - struct kvm_vcpu *vcpu; - int i; - struct kvm_kpit_state *ps = &pit->pit_state; - - if (atomic_read(&ps->reinject) && !atomic_xchg(&ps->irq_ack, 0)) - return; - - kvm_set_irq(kvm, pit->irq_source_id, 0, 1, false); - kvm_set_irq(kvm, pit->irq_source_id, 0, 0, false); - - /* - * Provides NMI watchdog support via Virtual Wire mode. - * The route is: PIT -> LVT0 in NMI mode. - * - * Note: Our Virtual Wire implementation does not follow - * the MP specification. We propagate a PIT interrupt to all - * VCPUs and only when LVT0 is in NMI mode. The interrupt can - * also be simultaneously delivered through PIC and IOAPIC. - */ - if (atomic_read(&kvm->arch.vapics_in_nmi_mode) > 0) - kvm_for_each_vcpu(i, vcpu, kvm) - kvm_apic_nmi_wd_deliver(vcpu); -} - -static enum hrtimer_restart pit_timer_fn(struct hrtimer *data) -{ - struct kvm_kpit_state *ps = container_of(data, struct kvm_kpit_state, timer); - struct kvm_pit *pt = pit_state_to_pit(ps); - - if (atomic_read(&ps->reinject)) - atomic_inc(&ps->pending); - - kthread_queue_work(&pt->worker, &pt->expired); - - if (ps->is_periodic) { - hrtimer_add_expires_ns(&ps->timer, ps->period); - return HRTIMER_RESTART; - } else - return HRTIMER_NORESTART; -} - -static inline void kvm_pit_reset_reinject(struct kvm_pit *pit) -{ - atomic_set(&pit->pit_state.pending, 0); - atomic_set(&pit->pit_state.irq_ack, 1); -} - -void kvm_pit_set_reinject(struct kvm_pit *pit, bool reinject) -{ - struct kvm_kpit_state *ps = &pit->pit_state; - struct kvm *kvm = pit->kvm; - - if (atomic_read(&ps->reinject) == reinject) - return; - - if (reinject) { - /* The initial state is preserved while ps->reinject == 0. 
*/ - kvm_pit_reset_reinject(pit); - kvm_register_irq_ack_notifier(kvm, &ps->irq_ack_notifier); - kvm_register_irq_mask_notifier(kvm, 0, &pit->mask_notifier); - } else { - kvm_unregister_irq_ack_notifier(kvm, &ps->irq_ack_notifier); - kvm_unregister_irq_mask_notifier(kvm, 0, &pit->mask_notifier); - } - - atomic_set(&ps->reinject, reinject); -} - -static void create_pit_timer(struct kvm_pit *pit, u32 val, int is_period) -{ - struct kvm_kpit_state *ps = &pit->pit_state; - struct kvm *kvm = pit->kvm; - s64 interval; - - if (!ioapic_in_kernel(kvm) || - ps->flags & KVM_PIT_FLAGS_HPET_LEGACY) - return; - - interval = mul_u64_u32_div(val, NSEC_PER_SEC, KVM_PIT_FREQ); - - pr_debug("create pit timer, interval is %llu nsec\n", interval); - - /* TODO The new value only affected after the retriggered */ - hrtimer_cancel(&ps->timer); - kthread_flush_work(&pit->expired); - ps->period = interval; - ps->is_periodic = is_period; - - kvm_pit_reset_reinject(pit); - - /* - * Do not allow the guest to program periodic timers with small - * interval, since the hrtimers are not throttled by the host - * scheduler. - */ - if (ps->is_periodic) { - s64 min_period = min_timer_period_us * 1000LL; - - if (ps->period < min_period) { - pr_info_ratelimited( - "kvm: requested %lld ns " - "i8254 timer period limited to %lld ns\n", - ps->period, min_period); - ps->period = min_period; - } - } - - hrtimer_start(&ps->timer, ktime_add_ns(ktime_get(), interval), - HRTIMER_MODE_ABS); -} - -static void pit_load_count(struct kvm_pit *pit, int channel, u32 val) -{ - struct kvm_kpit_state *ps = &pit->pit_state; - - pr_debug("load_count val is %d, channel is %d\n", val, channel); - - /* - * The largest possible initial count is 0; this is equivalent - * to 216 for binary counting and 104 for BCD counting. 
- */ - if (val == 0) - val = 0x10000; - - ps->channels[channel].count = val; - - if (channel != 0) { - ps->channels[channel].count_load_time = ktime_get(); - return; - } - - /* Two types of timer - * mode 1 is one shot, mode 2 is period, otherwise del timer */ - switch (ps->channels[0].mode) { - case 0: - case 1: - /* FIXME: enhance mode 4 precision */ - case 4: - create_pit_timer(pit, val, 0); - break; - case 2: - case 3: - create_pit_timer(pit, val, 1); - break; - default: - destroy_pit_timer(pit); - } -} - -void kvm_pit_load_count(struct kvm_pit *pit, int channel, u32 val, - int hpet_legacy_start) -{ - u8 saved_mode; - - WARN_ON_ONCE(!mutex_is_locked(&pit->pit_state.lock)); - - if (hpet_legacy_start) { - /* save existing mode for later reenablement */ - WARN_ON(channel != 0); - saved_mode = pit->pit_state.channels[0].mode; - pit->pit_state.channels[0].mode = 0xff; /* disable timer */ - pit_load_count(pit, channel, val); - pit->pit_state.channels[0].mode = saved_mode; - } else { - pit_load_count(pit, channel, val); - } -} - -static inline struct kvm_pit *dev_to_pit(struct kvm_io_device *dev) -{ - return container_of(dev, struct kvm_pit, dev); -} - -static inline struct kvm_pit *speaker_to_pit(struct kvm_io_device *dev) -{ - return container_of(dev, struct kvm_pit, speaker_dev); -} - -static inline int pit_in_range(gpa_t addr) -{ - return ((addr >= KVM_PIT_BASE_ADDRESS) && - (addr < KVM_PIT_BASE_ADDRESS + KVM_PIT_MEM_LENGTH)); -} - -static int pit_ioport_write(struct kvm_vcpu *vcpu, - struct kvm_io_device *this, - gpa_t addr, int len, const void *data) -{ - struct kvm_pit *pit = dev_to_pit(this); - struct kvm_kpit_state *pit_state = &pit->pit_state; - int channel, access; - struct kvm_kpit_channel_state *s; - u32 val = *(u32 *) data; - if (!pit_in_range(addr)) - return -EOPNOTSUPP; - - val &= 0xff; - addr &= KVM_PIT_CHANNEL_MASK; - - mutex_lock(&pit_state->lock); - - if (val != 0) - pr_debug("write addr is 0x%x, len is %d, val is 0x%x\n", - (unsigned int)addr, len, val); - - if (addr == 3) { - channel = val >> 6; - if (channel == 3) { - /* Read-Back Command. */ - for (channel = 0; channel < 3; channel++) { - s = &pit_state->channels[channel]; - if (val & (2 << channel)) { - if (!(val & 0x20)) - pit_latch_count(pit, channel); - if (!(val & 0x10)) - pit_latch_status(pit, channel); - } - } - } else { - /* Select Counter <channel>. */ - s = &pit_state->channels[channel]; - access = (val >> 4) & KVM_PIT_CHANNEL_MASK; - if (access == 0) { - pit_latch_count(pit, channel); - } else { - s->rw_mode = access; - s->read_state = access; - s->write_state = access; - s->mode = (val >> 1) & 7; - if (s->mode > 5) - s->mode -= 4; - s->bcd = val & 1; - } - } - } else { - /* Write Count. 
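The addr == 3 branch of pit_ioport_write above follows the classic 8254 control-word layout on port 0x43: bits 7:6 select the channel (3 means read-back), bits 5:4 the access mode (0 means latch), bits 3:1 the counting mode and bit 0 BCD. A small decoding sketch with illustrative field names:

    #include <stdint.h>

    struct pit_ctrl {
        uint8_t channel;   /* 0-2, or 3 for the read-back command     */
        uint8_t access;    /* 0 latch, 1 LSB, 2 MSB, 3 LSB then MSB   */
        uint8_t mode;      /* 0-5; 6 and 7 alias to 2 and 3           */
        uint8_t bcd;       /* 1 = BCD counting                        */
    };

    static struct pit_ctrl decode_pit_control(uint8_t val)
    {
        struct pit_ctrl c;
        c.channel = val >> 6;
        c.access  = (val >> 4) & 3;
        c.mode    = (val >> 1) & 7;
        if (c.mode > 5)
            c.mode -= 4;           /* same aliasing as the driver code */
        c.bcd     = val & 1;
        return c;
    }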
*/ - s = &pit_state->channels[addr]; - switch (s->write_state) { - default: - case RW_STATE_LSB: - pit_load_count(pit, addr, val); - break; - case RW_STATE_MSB: - pit_load_count(pit, addr, val << 8); - break; - case RW_STATE_WORD0: - s->write_latch = val; - s->write_state = RW_STATE_WORD1; - break; - case RW_STATE_WORD1: - pit_load_count(pit, addr, s->write_latch | (val << 8)); - s->write_state = RW_STATE_WORD0; - break; - } - } - - mutex_unlock(&pit_state->lock); - return 0; -} - -static int pit_ioport_read(struct kvm_vcpu *vcpu, - struct kvm_io_device *this, - gpa_t addr, int len, void *data) -{ - struct kvm_pit *pit = dev_to_pit(this); - struct kvm_kpit_state *pit_state = &pit->pit_state; - int ret, count; - struct kvm_kpit_channel_state *s; - if (!pit_in_range(addr)) - return -EOPNOTSUPP; - - addr &= KVM_PIT_CHANNEL_MASK; - if (addr == 3) - return 0; - - s = &pit_state->channels[addr]; - - mutex_lock(&pit_state->lock); - - if (s->status_latched) { - s->status_latched = 0; - ret = s->status; - } else if (s->count_latched) { - switch (s->count_latched) { - default: - case RW_STATE_LSB: - ret = s->latched_count & 0xff; - s->count_latched = 0; - break; - case RW_STATE_MSB: - ret = s->latched_count >> 8; - s->count_latched = 0; - break; - case RW_STATE_WORD0: - ret = s->latched_count & 0xff; - s->count_latched = RW_STATE_MSB; - break; - } - } else { - switch (s->read_state) { - default: - case RW_STATE_LSB: - count = pit_get_count(pit, addr); - ret = count & 0xff; - break; - case RW_STATE_MSB: - count = pit_get_count(pit, addr); - ret = (count >> 8) & 0xff; - break; - case RW_STATE_WORD0: - count = pit_get_count(pit, addr); - ret = count & 0xff; - s->read_state = RW_STATE_WORD1; - break; - case RW_STATE_WORD1: - count = pit_get_count(pit, addr); - ret = (count >> 8) & 0xff; - s->read_state = RW_STATE_WORD0; - break; - } - } - - if (len > sizeof(ret)) - len = sizeof(ret); - memcpy(data, (char *)&ret, len); - - mutex_unlock(&pit_state->lock); - return 0; -} - -static int speaker_ioport_write(struct kvm_vcpu *vcpu, - struct kvm_io_device *this, - gpa_t addr, int len, const void *data) -{ - struct kvm_pit *pit = speaker_to_pit(this); - struct kvm_kpit_state *pit_state = &pit->pit_state; - u32 val = *(u32 *) data; - if (addr != KVM_SPEAKER_BASE_ADDRESS) - return -EOPNOTSUPP; - - mutex_lock(&pit_state->lock); - pit_state->speaker_data_on = (val >> 1) & 1; - pit_set_gate(pit, 2, val & 1); - mutex_unlock(&pit_state->lock); - return 0; -} - -static int speaker_ioport_read(struct kvm_vcpu *vcpu, - struct kvm_io_device *this, - gpa_t addr, int len, void *data) -{ - struct kvm_pit *pit = speaker_to_pit(this); - struct kvm_kpit_state *pit_state = &pit->pit_state; - unsigned int refresh_clock; - int ret; - if (addr != KVM_SPEAKER_BASE_ADDRESS) - return -EOPNOTSUPP; - - /* Refresh clock toggles at about 15us. We approximate as 2^14ns. 
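speaker_ioport_read above assembles the legacy port 0x61 status byte: bit 0 is the channel-2 gate, bit 1 the speaker-data enable, bit 4 the toggling refresh clock and bit 5 the channel-2 output. The same composition in isolation (hypothetical helper):

    #include <stdint.h>

    static uint8_t port61_status(int gate2, int speaker_on,
                                 int refresh_toggle, int timer2_out)
    {
        return (uint8_t)(( gate2          & 1)       |
                         ((speaker_on     & 1) << 1) |
                         ((refresh_toggle & 1) << 4) |
                         ((timer2_out     & 1) << 5));
    }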
*/ - refresh_clock = ((unsigned int)ktime_to_ns(ktime_get()) >> 14) & 1; - - mutex_lock(&pit_state->lock); - ret = ((pit_state->speaker_data_on << 1) | pit_get_gate(pit, 2) | - (pit_get_out(pit, 2) << 5) | (refresh_clock << 4)); - if (len > sizeof(ret)) - len = sizeof(ret); - memcpy(data, (char *)&ret, len); - mutex_unlock(&pit_state->lock); - return 0; -} - -static void kvm_pit_reset(struct kvm_pit *pit) -{ - int i; - struct kvm_kpit_channel_state *c; - - pit->pit_state.flags = 0; - for (i = 0; i < 3; i++) { - c = &pit->pit_state.channels[i]; - c->mode = 0xff; - c->gate = (i != 2); - pit_load_count(pit, i, 0); - } - - kvm_pit_reset_reinject(pit); -} - -static void pit_mask_notifer(struct kvm_irq_mask_notifier *kimn, bool mask) -{ - struct kvm_pit *pit = container_of(kimn, struct kvm_pit, mask_notifier); - - if (!mask) - kvm_pit_reset_reinject(pit); -} - -static const struct kvm_io_device_ops pit_dev_ops = { - .read = pit_ioport_read, - .write = pit_ioport_write, -}; - -static const struct kvm_io_device_ops speaker_dev_ops = { - .read = speaker_ioport_read, - .write = speaker_ioport_write, -}; - -struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags) -{ - struct kvm_pit *pit; - struct kvm_kpit_state *pit_state; - struct pid *pid; - pid_t pid_nr; - int ret; - - pit = kzalloc(sizeof(struct kvm_pit), GFP_KERNEL); - if (!pit) - return NULL; - - pit->irq_source_id = kvm_request_irq_source_id(kvm); - if (pit->irq_source_id < 0) - goto fail_request; - - mutex_init(&pit->pit_state.lock); - - pid = get_pid(task_tgid(current)); - pid_nr = pid_vnr(pid); - put_pid(pid); - - kthread_init_worker(&pit->worker); - pit->worker_task = kthread_run(kthread_worker_fn, &pit->worker, - "kvm-pit/%d", pid_nr); - if (IS_ERR(pit->worker_task)) - goto fail_kthread; - - kthread_init_work(&pit->expired, pit_do_work); - - pit->kvm = kvm; - - pit_state = &pit->pit_state; - hrtimer_init(&pit_state->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); - pit_state->timer.function = pit_timer_fn; - - pit_state->irq_ack_notifier.gsi = 0; - pit_state->irq_ack_notifier.irq_acked = kvm_pit_ack_irq; - pit->mask_notifier.func = pit_mask_notifer; - - kvm_pit_reset(pit); - - kvm_pit_set_reinject(pit, true); - - mutex_lock(&kvm->slots_lock); - kvm_iodevice_init(&pit->dev, &pit_dev_ops); - ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, KVM_PIT_BASE_ADDRESS, - KVM_PIT_MEM_LENGTH, &pit->dev); - if (ret < 0) - goto fail_register_pit; - - if (flags & KVM_PIT_SPEAKER_DUMMY) { - kvm_iodevice_init(&pit->speaker_dev, &speaker_dev_ops); - ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, - KVM_SPEAKER_BASE_ADDRESS, 4, - &pit->speaker_dev); - if (ret < 0) - goto fail_register_speaker; - } - mutex_unlock(&kvm->slots_lock); - - return pit; - -fail_register_speaker: - kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &pit->dev); -fail_register_pit: - mutex_unlock(&kvm->slots_lock); - kvm_pit_set_reinject(pit, false); - kthread_stop(pit->worker_task); -fail_kthread: - kvm_free_irq_source_id(kvm, pit->irq_source_id); -fail_request: - kfree(pit); - return NULL; -} - -void kvm_free_pit(struct kvm *kvm) -{ - struct kvm_pit *pit = kvm->arch.vpit; - - if (pit) { - kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &pit->dev); - kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &pit->speaker_dev); - kvm_pit_set_reinject(pit, false); - hrtimer_cancel(&pit->pit_state.timer); - kthread_flush_work(&pit->expired); - kthread_stop(pit->worker_task); - kvm_free_irq_source_id(kvm, pit->irq_source_id); - kfree(pit); - } -} diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h deleted 
file mode 100644 index 2f5af07..0000000 --- a/arch/x86/kvm/i8254.h +++ /dev/null @@ -1,66 +0,0 @@ -#ifndef __I8254_H -#define __I8254_H - -#include <linux/kthread.h> - -#include <kvm/iodev.h> - -struct kvm_kpit_channel_state { - u32 count; /* can be 65536 */ - u16 latched_count; - u8 count_latched; - u8 status_latched; - u8 status; - u8 read_state; - u8 write_state; - u8 write_latch; - u8 rw_mode; - u8 mode; - u8 bcd; /* not supported */ - u8 gate; /* timer start */ - ktime_t count_load_time; -}; - -struct kvm_kpit_state { - /* All members before "struct mutex lock" are protected by the lock. */ - struct kvm_kpit_channel_state channels[3]; - u32 flags; - bool is_periodic; - s64 period; /* unit: ns */ - struct hrtimer timer; - u32 speaker_data_on; - - struct mutex lock; - atomic_t reinject; - atomic_t pending; /* accumulated triggered timers */ - atomic_t irq_ack; - struct kvm_irq_ack_notifier irq_ack_notifier; -}; - -struct kvm_pit { - struct kvm_io_device dev; - struct kvm_io_device speaker_dev; - struct kvm *kvm; - struct kvm_kpit_state pit_state; - int irq_source_id; - struct kvm_irq_mask_notifier mask_notifier; - struct kthread_worker worker; - struct task_struct *worker_task; - struct kthread_work expired; -}; - -#define KVM_PIT_BASE_ADDRESS 0x40 -#define KVM_SPEAKER_BASE_ADDRESS 0x61 -#define KVM_PIT_MEM_LENGTH 4 -#define KVM_PIT_FREQ 1193181 -#define KVM_MAX_PIT_INTR_INTERVAL HZ / 100 -#define KVM_PIT_CHANNEL_MASK 0x3 - -struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags); -void kvm_free_pit(struct kvm *kvm); - -void kvm_pit_load_count(struct kvm_pit *pit, int channel, u32 val, - int hpet_legacy_start); -void kvm_pit_set_reinject(struct kvm_pit *pit, bool reinject); - -#endif diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index 7cc2360..c178239 100644..100755 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -4,6 +4,7 @@ * Copyright (c) 2003-2004 Fabrice Bellard * Copyright (c) 2007 Intel Corporation * Copyright 2009 Red Hat, Inc. and/or its affiliates. + * Copyright 2019 Google LLC * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -26,13 +27,11 @@ * Yaozu (Eddie) Dong <Eddie.dong@intel.com> * Port from Qemu. */ -#include <linux/mm.h> -#include <linux/slab.h> -#include <linux/bitops.h> #include "irq.h" - #include <linux/kvm_host.h> -#include "trace.h" + +#include <ntddk.h> +#include <gvm_types.h> #define pr_pic_unimpl(fmt, ...) \ pr_err_ratelimited("kvm: pic: " fmt, ## __VA_ARGS__) @@ -40,13 +39,11 @@ static void pic_irq_request(struct kvm *kvm, int level); static void pic_lock(struct kvm_pic *s) - __acquires(&s->lock) { spin_lock(&s->lock); } static void pic_unlock(struct kvm_pic *s) - __releases(&s->lock) { bool wakeup = s->wakeup_needed; struct kvm_vcpu *vcpu, *found = NULL; @@ -67,7 +64,7 @@ static void pic_unlock(struct kvm_pic *s) if (!found) return; - kvm_make_request(KVM_REQ_EVENT, found); + kvm_make_request(GVM_REQ_EVENT, found); kvm_vcpu_kick(found); } } @@ -84,7 +81,7 @@ static void pic_clear_isr(struct kvm_kpic_state *s, int irq) * it should be safe since PIC state is already updated at this stage. 
*/ pic_unlock(s->pics_state); - kvm_notify_acked_irq(s->pics_state->kvm, SELECT_PIC(irq), irq); + //kvm_notify_acked_irq(s->pics_state->kvm, SELECT_PIC(irq), irq); pic_lock(s->pics_state); } @@ -199,8 +196,6 @@ int kvm_pic_set_irq(struct kvm_pic *s, int irq, int irq_source_id, int level) irq_source_id, level); ret = pic_set_irq1(&s->pics[irq >> 3], irq & 7, irq_level); pic_update_irq(s); - trace_kvm_pic_set_irq(irq >> 3, irq & 7, s->pics[irq >> 3].elcr, - s->pics[irq >> 3].imr, ret == 0); pic_unlock(s); return ret; @@ -620,16 +615,16 @@ struct kvm_pic *kvm_create_pic(struct kvm *kvm) kvm_iodevice_init(&s->dev_slave, &picdev_slave_ops); kvm_iodevice_init(&s->dev_eclr, &picdev_eclr_ops); mutex_lock(&kvm->slots_lock); - ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, 0x20, 2, + ret = kvm_io_bus_register_dev(kvm, GVM_PIO_BUS, 0x20, 2, &s->dev_master); if (ret < 0) goto fail_unlock; - ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, 0xa0, 2, &s->dev_slave); + ret = kvm_io_bus_register_dev(kvm, GVM_PIO_BUS, 0xa0, 2, &s->dev_slave); if (ret < 0) goto fail_unreg_2; - ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, 0x4d0, 2, &s->dev_eclr); + ret = kvm_io_bus_register_dev(kvm, GVM_PIO_BUS, 0x4d0, 2, &s->dev_eclr); if (ret < 0) goto fail_unreg_1; @@ -638,10 +633,10 @@ struct kvm_pic *kvm_create_pic(struct kvm *kvm) return s; fail_unreg_1: - kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &s->dev_slave); + kvm_io_bus_unregister_dev(kvm, GVM_PIO_BUS, &s->dev_slave); fail_unreg_2: - kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &s->dev_master); + kvm_io_bus_unregister_dev(kvm, GVM_PIO_BUS, &s->dev_master); fail_unlock: mutex_unlock(&kvm->slots_lock); @@ -653,8 +648,8 @@ fail_unlock: void kvm_destroy_pic(struct kvm_pic *vpic) { - kvm_io_bus_unregister_dev(vpic->kvm, KVM_PIO_BUS, &vpic->dev_master); - kvm_io_bus_unregister_dev(vpic->kvm, KVM_PIO_BUS, &vpic->dev_slave); - kvm_io_bus_unregister_dev(vpic->kvm, KVM_PIO_BUS, &vpic->dev_eclr); + kvm_io_bus_unregister_dev(vpic->kvm, GVM_PIO_BUS, &vpic->dev_master); + kvm_io_bus_unregister_dev(vpic->kvm, GVM_PIO_BUS, &vpic->dev_slave); + kvm_io_bus_unregister_dev(vpic->kvm, GVM_PIO_BUS, &vpic->dev_eclr); kfree(vpic); } diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c index 6e219e5..4e2c62b 100644..100755 --- a/arch/x86/kvm/ioapic.c +++ b/arch/x86/kvm/ioapic.c @@ -1,6 +1,7 @@ /* * Copyright (C) 2001 MandrakeSoft S.A. * Copyright 2010 Red Hat, Inc. and/or its affiliates. + * Copyright 2019 Google LLC * * MandrakeSoft S.A. * 43, rue d'Aboukir @@ -28,36 +29,26 @@ */ #include <linux/kvm_host.h> -#include <linux/kvm.h> -#include <linux/mm.h> -#include <linux/highmem.h> -#include <linux/smp.h> -#include <linux/hrtimer.h> -#include <linux/io.h> -#include <linux/slab.h> -#include <linux/export.h> -#include <asm/processor.h> -#include <asm/page.h> -#include <asm/current.h> -#include <trace/events/kvm.h> +#include <uapi/linux/kvm.h> #include "ioapic.h" #include "lapic.h" #include "irq.h" -#if 0 -#define ioapic_debug(fmt,arg...) printk(KERN_WARNING fmt,##arg) +#ifdef KVM_IOAPIC_DEBUG +#define ioapic_debug DbgPrint #else -#define ioapic_debug(fmt, arg...) +#define ioapic_debug(fmt,...) 
#endif + static int ioapic_service(struct kvm_ioapic *vioapic, int irq, bool line_status); -static unsigned long ioapic_read_indirect(struct kvm_ioapic *ioapic, - unsigned long addr, - unsigned long length) +static size_t ioapic_read_indirect(struct kvm_ioapic *ioapic, + size_t addr, + size_t length) { - unsigned long result = 0; + size_t result = 0; switch (ioapic->ioregsel) { case IOAPIC_REG_VERSION: @@ -94,7 +85,7 @@ static unsigned long ioapic_read_indirect(struct kvm_ioapic *ioapic, static void rtc_irq_eoi_tracking_reset(struct kvm_ioapic *ioapic) { ioapic->rtc_status.pending_eoi = 0; - bitmap_zero(ioapic->rtc_status.dest_map.map, KVM_MAX_VCPU_ID); + bitmap_zero(ioapic->rtc_status.dest_map.map, GVM_MAX_VCPU_ID); } static void kvm_rtc_eoi_tracking_restore_all(struct kvm_ioapic *ioapic); @@ -148,9 +139,6 @@ static void kvm_rtc_eoi_tracking_restore_all(struct kvm_ioapic *ioapic) struct kvm_vcpu *vcpu; int i; - if (RTC_GSI >= IOAPIC_NUM_PINS) - return; - rtc_irq_eoi_tracking_reset(ioapic); kvm_for_each_vcpu(i, vcpu, ioapic->kvm) __rtc_irq_eoi_tracking_restore_one(vcpu); @@ -220,11 +208,10 @@ static int ioapic_set_irq(struct kvm_ioapic *ioapic, unsigned int irq, ret = ioapic_service(ioapic, irq, line_status); out: - trace_kvm_ioapic_set_irq(entry.bits, irq, ret == 0); return ret; } -static void kvm_ioapic_inject_all(struct kvm_ioapic *ioapic, unsigned long irr) +static void kvm_ioapic_inject_all(struct kvm_ioapic *ioapic, size_t irr) { u32 idx; @@ -253,7 +240,7 @@ void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, ulong *ioapic_handled_vectors) for (index = 0; index < IOAPIC_NUM_PINS; index++) { e = &ioapic->redirtbl[index]; if (e->fields.trig_mode == IOAPIC_LEVEL_TRIG || - kvm_irq_has_notifier(ioapic->kvm, KVM_IRQCHIP_IOAPIC, index) || + //kvm_irq_has_notifier(ioapic->kvm, GVM_IRQCHIP_IOAPIC, index) || index == RTC_GSI) { if (kvm_apic_match_dest(vcpu, NULL, 0, e->fields.dest_id, e->fields.dest_mode) || @@ -311,7 +298,7 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val) } mask_after = e->fields.mask; if (mask_before != mask_after) - kvm_fire_mask_notifiers(ioapic->kvm, KVM_IRQCHIP_IOAPIC, index, mask_after); + kvm_fire_mask_notifiers(ioapic->kvm, GVM_IRQCHIP_IOAPIC, index, mask_after); if (e->fields.trig_mode == IOAPIC_LEVEL_TRIG && ioapic->irr & (1 << index)) ioapic_service(ioapic, index, false); @@ -389,31 +376,11 @@ void kvm_ioapic_clear_all(struct kvm_ioapic *ioapic, int irq_source_id) int i; spin_lock(&ioapic->lock); - for (i = 0; i < KVM_IOAPIC_NUM_PINS; i++) + for (i = 0; i < GVM_IOAPIC_NUM_PINS; i++) __clear_bit(irq_source_id, &ioapic->irq_states[i]); spin_unlock(&ioapic->lock); } -static void kvm_ioapic_eoi_inject_work(struct work_struct *work) -{ - int i; - struct kvm_ioapic *ioapic = container_of(work, struct kvm_ioapic, - eoi_inject.work); - spin_lock(&ioapic->lock); - for (i = 0; i < IOAPIC_NUM_PINS; i++) { - union kvm_ioapic_redirect_entry *ent = &ioapic->redirtbl[i]; - - if (ent->fields.trig_mode != IOAPIC_LEVEL_TRIG) - continue; - - if (ioapic->irr & (1 << i) && !ent->fields.remote_irr) - ioapic_service(ioapic, i, false); - } - spin_unlock(&ioapic->lock); -} - -#define IOAPIC_SUCCESSIVE_IRQ_MAX_COUNT 10000 - static void __kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu, struct kvm_ioapic *ioapic, int vector, int trigger_mode) { @@ -441,7 +408,7 @@ static void __kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu, * after ack notifier returns. 
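ioapic_read_indirect and ioapic_write_indirect above implement the IOAPIC's indirect register window: software writes a register index to IOREGSEL (MMIO offset 0x00) and then reads or writes IOWIN (offset 0x10); indices 0x00-0x02 are the ID, version and arbitration registers, and redirection-table entry n occupies the two 32-bit registers at 0x10 + 2*n. A sketch of that index arithmetic (invented helper name):

    #include <stdint.h>

    #define IOAPIC_REG_REDIR_BASE 0x10

    /* Map an IOREGSEL value to (pin, low-or-high dword) for the 64-bit
     * redirection entry it addresses; returns -1 for indices below the
     * redirection table (ID, version, arbitration). */
    static int ioregsel_to_pin(uint8_t ioregsel, int *is_high_dword)
    {
        if (ioregsel < IOAPIC_REG_REDIR_BASE)
            return -1;
        *is_high_dword = (ioregsel - IOAPIC_REG_REDIR_BASE) & 1;
        return (ioregsel - IOAPIC_REG_REDIR_BASE) >> 1;
    }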
*/ spin_unlock(&ioapic->lock); - kvm_notify_acked_irq(ioapic->kvm, KVM_IRQCHIP_IOAPIC, i); + //kvm_notify_acked_irq(ioapic->kvm, GVM_IRQCHIP_IOAPIC, i); spin_lock(&ioapic->lock); if (trigger_mode != IOAPIC_LEVEL_TRIG || @@ -452,21 +419,7 @@ static void __kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu, ent->fields.remote_irr = 0; if (!ent->fields.mask && (ioapic->irr & (1 << i))) { ++ioapic->irq_eoi[i]; - if (ioapic->irq_eoi[i] == IOAPIC_SUCCESSIVE_IRQ_MAX_COUNT) { - /* - * Real hardware does not deliver the interrupt - * immediately during eoi broadcast, and this - * lets a buggy guest make slow progress - * even if it does not correctly handle a - * level-triggered interrupt. Emulate this - * behavior if we detect an interrupt storm. - */ - schedule_delayed_work(&ioapic->eoi_inject, HZ / 100); - ioapic->irq_eoi[i] = 0; - trace_kvm_ioapic_delayed_eoi_inj(ent->bits); - } else { - ioapic_service(ioapic, i, false); - } + ioapic_service(ioapic, i, false); } else { ioapic->irq_eoi[i] = 0; } @@ -501,7 +454,7 @@ static int ioapic_mmio_read(struct kvm_vcpu *vcpu, struct kvm_io_device *this, if (!ioapic_in_range(ioapic, addr)) return -EOPNOTSUPP; - ioapic_debug("addr %lx\n", (unsigned long)addr); + ioapic_debug("addr %lx\n", (size_t)addr); ASSERT(!(addr & 0xf)); /* check alignment */ addr &= 0xff; @@ -586,7 +539,6 @@ static void kvm_ioapic_reset(struct kvm_ioapic *ioapic) { int i; - cancel_delayed_work_sync(&ioapic->eoi_inject); for (i = 0; i < IOAPIC_NUM_PINS; i++) ioapic->redirtbl[i].fields.mask = 1; ioapic->base_address = IOAPIC_DEFAULT_BASE_ADDRESS; @@ -612,13 +564,12 @@ int kvm_ioapic_init(struct kvm *kvm) if (!ioapic) return -ENOMEM; spin_lock_init(&ioapic->lock); - INIT_DELAYED_WORK(&ioapic->eoi_inject, kvm_ioapic_eoi_inject_work); kvm->arch.vioapic = ioapic; kvm_ioapic_reset(ioapic); kvm_iodevice_init(&ioapic->dev, &ioapic_mmio_ops); ioapic->kvm = kvm; mutex_lock(&kvm->slots_lock); - ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, ioapic->base_address, + ret = kvm_io_bus_register_dev(kvm, GVM_MMIO_BUS, ioapic->base_address, IOAPIC_MEM_LENGTH, &ioapic->dev); mutex_unlock(&kvm->slots_lock); if (ret < 0) { @@ -635,8 +586,7 @@ void kvm_ioapic_destroy(struct kvm *kvm) { struct kvm_ioapic *ioapic = kvm->arch.vioapic; - cancel_delayed_work_sync(&ioapic->eoi_inject); - kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS, &ioapic->dev); + kvm_io_bus_unregister_dev(kvm, GVM_MMIO_BUS, &ioapic->dev); kvm->arch.vioapic = NULL; kfree(ioapic); } diff --git a/arch/x86/kvm/ioapic.h b/arch/x86/kvm/ioapic.h index 1cc6e54..854f770 100644..100755 --- a/arch/x86/kvm/ioapic.h +++ b/arch/x86/kvm/ioapic.h @@ -1,3 +1,7 @@ +/* + * Copyright 2019 Google LLC + */ + #ifndef __KVM_IO_APIC_H #define __KVM_IO_APIC_H @@ -5,11 +9,13 @@ #include <kvm/iodev.h> +#include <gvm_types.h> + struct kvm; struct kvm_vcpu; -#define IOAPIC_NUM_PINS KVM_IOAPIC_NUM_PINS -#define MAX_NR_RESERVED_IOAPIC_PINS KVM_MAX_IRQ_ROUTES +#define IOAPIC_NUM_PINS GVM_IOAPIC_NUM_PINS +#define MAX_NR_RESERVED_IOAPIC_PINS GVM_MAX_IRQ_ROUTES #define IOAPIC_VERSION_ID 0x11 /* IOAPIC version */ #define IOAPIC_EDGE_TRIG 0 #define IOAPIC_LEVEL_TRIG 1 @@ -34,21 +40,17 @@ struct kvm_vcpu; #define IOAPIC_INIT 0x5 #define IOAPIC_EXTINT 0x7 -#ifdef CONFIG_X86 #define RTC_GSI 8 -#else -#define RTC_GSI -1U -#endif struct dest_map { /* vcpu bitmap where IRQ has been sent */ - DECLARE_BITMAP(map, KVM_MAX_VCPU_ID); + DECLARE_BITMAP(map, GVM_MAX_VCPU_ID); /* * Vector sent to a given vcpu, only valid when * the vcpu's bit in map is set */ - u8 vectors[KVM_MAX_VCPU_ID]; + u8 
vectors[GVM_MAX_VCPU_ID]; }; @@ -81,30 +83,16 @@ struct kvm_ioapic { u32 irr; u32 pad; union kvm_ioapic_redirect_entry redirtbl[IOAPIC_NUM_PINS]; - unsigned long irq_states[IOAPIC_NUM_PINS]; + size_t irq_states[IOAPIC_NUM_PINS]; struct kvm_io_device dev; struct kvm *kvm; void (*ack_notifier)(void *opaque, int irq); spinlock_t lock; struct rtc_status rtc_status; - struct delayed_work eoi_inject; u32 irq_eoi[IOAPIC_NUM_PINS]; u32 irr_delivered; }; -#ifdef DEBUG -#define ASSERT(x) \ -do { \ - if (!(x)) { \ - printk(KERN_EMERG "assertion failed %s: %d: %s\n", \ - __FILE__, __LINE__, #x); \ - BUG(); \ - } \ -} while (0) -#else -#define ASSERT(x) do { } while (0) -#endif - static inline struct kvm_ioapic *ioapic_irqchip(struct kvm *kvm) { return kvm->arch.vioapic; diff --git a/arch/x86/kvm/iommu.c b/arch/x86/kvm/iommu.c deleted file mode 100644 index b181426..0000000 --- a/arch/x86/kvm/iommu.c +++ /dev/null @@ -1,356 +0,0 @@ -/* - * Copyright (c) 2006, Intel Corporation. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. - * - * You should have received a copy of the GNU General Public License along with - * this program; if not, write to the Free Software Foundation, Inc., 59 Temple - * Place - Suite 330, Boston, MA 02111-1307 USA. - * - * Copyright (C) 2006-2008 Intel Corporation - * Copyright IBM Corporation, 2008 - * Copyright 2010 Red Hat, Inc. and/or its affiliates. - * - * Author: Allen M. 
Kay <allen.m.kay@intel.com> - * Author: Weidong Han <weidong.han@intel.com> - * Author: Ben-Ami Yassour <benami@il.ibm.com> - */ - -#include <linux/list.h> -#include <linux/kvm_host.h> -#include <linux/moduleparam.h> -#include <linux/pci.h> -#include <linux/stat.h> -#include <linux/iommu.h> -#include "assigned-dev.h" - -static bool allow_unsafe_assigned_interrupts; -module_param_named(allow_unsafe_assigned_interrupts, - allow_unsafe_assigned_interrupts, bool, S_IRUGO | S_IWUSR); -MODULE_PARM_DESC(allow_unsafe_assigned_interrupts, - "Enable device assignment on platforms without interrupt remapping support."); - -static int kvm_iommu_unmap_memslots(struct kvm *kvm); -static void kvm_iommu_put_pages(struct kvm *kvm, - gfn_t base_gfn, unsigned long npages); - -static kvm_pfn_t kvm_pin_pages(struct kvm_memory_slot *slot, gfn_t gfn, - unsigned long npages) -{ - gfn_t end_gfn; - kvm_pfn_t pfn; - - pfn = gfn_to_pfn_memslot(slot, gfn); - end_gfn = gfn + npages; - gfn += 1; - - if (is_error_noslot_pfn(pfn)) - return pfn; - - while (gfn < end_gfn) - gfn_to_pfn_memslot(slot, gfn++); - - return pfn; -} - -static void kvm_unpin_pages(struct kvm *kvm, kvm_pfn_t pfn, - unsigned long npages) -{ - unsigned long i; - - for (i = 0; i < npages; ++i) - kvm_release_pfn_clean(pfn + i); -} - -int kvm_iommu_map_pages(struct kvm *kvm, struct kvm_memory_slot *slot) -{ - gfn_t gfn, end_gfn; - kvm_pfn_t pfn; - int r = 0; - struct iommu_domain *domain = kvm->arch.iommu_domain; - int flags; - - /* check if iommu exists and in use */ - if (!domain) - return 0; - - gfn = slot->base_gfn; - end_gfn = gfn + slot->npages; - - flags = IOMMU_READ; - if (!(slot->flags & KVM_MEM_READONLY)) - flags |= IOMMU_WRITE; - if (!kvm->arch.iommu_noncoherent) - flags |= IOMMU_CACHE; - - - while (gfn < end_gfn) { - unsigned long page_size; - - /* Check if already mapped */ - if (iommu_iova_to_phys(domain, gfn_to_gpa(gfn))) { - gfn += 1; - continue; - } - - /* Get the page size we could use to map */ - page_size = kvm_host_page_size(kvm, gfn); - - /* Make sure the page_size does not exceed the memslot */ - while ((gfn + (page_size >> PAGE_SHIFT)) > end_gfn) - page_size >>= 1; - - /* Make sure gfn is aligned to the page size we want to map */ - while ((gfn << PAGE_SHIFT) & (page_size - 1)) - page_size >>= 1; - - /* Make sure hva is aligned to the page size we want to map */ - while (__gfn_to_hva_memslot(slot, gfn) & (page_size - 1)) - page_size >>= 1; - - /* - * Pin all pages we are about to map in memory. This is - * important because we unmap and unpin in 4kb steps later. 
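The mapping loop in kvm_iommu_map_pages above keeps halving the candidate mapping size until the remaining memslot length, the guest-physical address and the host-virtual address are all compatible with it, so a large-page mapping is only used when every constraint allows it. The same logic in isolation (hypothetical helper, 4 KiB base pages assumed):

    #include <stdint.h>

    #define PAGE_SHIFT 12

    /* Shrink page_size until [gfn, gfn + pages) stays inside the slot and
     * both the guest-physical and host-virtual addresses are aligned to it. */
    static uint64_t fit_mapping_size(uint64_t page_size, uint64_t gfn,
                                     uint64_t hva, uint64_t end_gfn)
    {
        while (gfn + (page_size >> PAGE_SHIFT) > end_gfn)
            page_size >>= 1;
        while ((gfn << PAGE_SHIFT) & (page_size - 1))
            page_size >>= 1;
        while (hva & (page_size - 1))
            page_size >>= 1;
        return page_size;        /* never below 4 KiB in practice */
    }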
- */ - pfn = kvm_pin_pages(slot, gfn, page_size >> PAGE_SHIFT); - if (is_error_noslot_pfn(pfn)) { - gfn += 1; - continue; - } - - /* Map into IO address space */ - r = iommu_map(domain, gfn_to_gpa(gfn), pfn_to_hpa(pfn), - page_size, flags); - if (r) { - printk(KERN_ERR "kvm_iommu_map_address:" - "iommu failed to map pfn=%llx\n", pfn); - kvm_unpin_pages(kvm, pfn, page_size >> PAGE_SHIFT); - goto unmap_pages; - } - - gfn += page_size >> PAGE_SHIFT; - - cond_resched(); - } - - return 0; - -unmap_pages: - kvm_iommu_put_pages(kvm, slot->base_gfn, gfn - slot->base_gfn); - return r; -} - -static int kvm_iommu_map_memslots(struct kvm *kvm) -{ - int idx, r = 0; - struct kvm_memslots *slots; - struct kvm_memory_slot *memslot; - - if (kvm->arch.iommu_noncoherent) - kvm_arch_register_noncoherent_dma(kvm); - - idx = srcu_read_lock(&kvm->srcu); - slots = kvm_memslots(kvm); - - kvm_for_each_memslot(memslot, slots) { - r = kvm_iommu_map_pages(kvm, memslot); - if (r) - break; - } - srcu_read_unlock(&kvm->srcu, idx); - - return r; -} - -int kvm_assign_device(struct kvm *kvm, struct pci_dev *pdev) -{ - struct iommu_domain *domain = kvm->arch.iommu_domain; - int r; - bool noncoherent; - - /* check if iommu exists and in use */ - if (!domain) - return 0; - - if (pdev == NULL) - return -ENODEV; - - r = iommu_attach_device(domain, &pdev->dev); - if (r) { - dev_err(&pdev->dev, "kvm assign device failed ret %d", r); - return r; - } - - noncoherent = !iommu_capable(&pci_bus_type, IOMMU_CAP_CACHE_COHERENCY); - - /* Check if need to update IOMMU page table for guest memory */ - if (noncoherent != kvm->arch.iommu_noncoherent) { - kvm_iommu_unmap_memslots(kvm); - kvm->arch.iommu_noncoherent = noncoherent; - r = kvm_iommu_map_memslots(kvm); - if (r) - goto out_unmap; - } - - kvm_arch_start_assignment(kvm); - pci_set_dev_assigned(pdev); - - dev_info(&pdev->dev, "kvm assign device\n"); - - return 0; -out_unmap: - kvm_iommu_unmap_memslots(kvm); - return r; -} - -int kvm_deassign_device(struct kvm *kvm, struct pci_dev *pdev) -{ - struct iommu_domain *domain = kvm->arch.iommu_domain; - - /* check if iommu exists and in use */ - if (!domain) - return 0; - - if (pdev == NULL) - return -ENODEV; - - iommu_detach_device(domain, &pdev->dev); - - pci_clear_dev_assigned(pdev); - kvm_arch_end_assignment(kvm); - - dev_info(&pdev->dev, "kvm deassign device\n"); - - return 0; -} - -int kvm_iommu_map_guest(struct kvm *kvm) -{ - int r; - - if (!iommu_present(&pci_bus_type)) { - printk(KERN_ERR "%s: iommu not found\n", __func__); - return -ENODEV; - } - - mutex_lock(&kvm->slots_lock); - - kvm->arch.iommu_domain = iommu_domain_alloc(&pci_bus_type); - if (!kvm->arch.iommu_domain) { - r = -ENOMEM; - goto out_unlock; - } - - if (!allow_unsafe_assigned_interrupts && - !iommu_capable(&pci_bus_type, IOMMU_CAP_INTR_REMAP)) { - printk(KERN_WARNING "%s: No interrupt remapping support," - " disallowing device assignment." 
- " Re-enable with \"allow_unsafe_assigned_interrupts=1\"" - " module option.\n", __func__); - iommu_domain_free(kvm->arch.iommu_domain); - kvm->arch.iommu_domain = NULL; - r = -EPERM; - goto out_unlock; - } - - r = kvm_iommu_map_memslots(kvm); - if (r) - kvm_iommu_unmap_memslots(kvm); - -out_unlock: - mutex_unlock(&kvm->slots_lock); - return r; -} - -static void kvm_iommu_put_pages(struct kvm *kvm, - gfn_t base_gfn, unsigned long npages) -{ - struct iommu_domain *domain; - gfn_t end_gfn, gfn; - kvm_pfn_t pfn; - u64 phys; - - domain = kvm->arch.iommu_domain; - end_gfn = base_gfn + npages; - gfn = base_gfn; - - /* check if iommu exists and in use */ - if (!domain) - return; - - while (gfn < end_gfn) { - unsigned long unmap_pages; - size_t size; - - /* Get physical address */ - phys = iommu_iova_to_phys(domain, gfn_to_gpa(gfn)); - - if (!phys) { - gfn++; - continue; - } - - pfn = phys >> PAGE_SHIFT; - - /* Unmap address from IO address space */ - size = iommu_unmap(domain, gfn_to_gpa(gfn), PAGE_SIZE); - unmap_pages = 1ULL << get_order(size); - - /* Unpin all pages we just unmapped to not leak any memory */ - kvm_unpin_pages(kvm, pfn, unmap_pages); - - gfn += unmap_pages; - - cond_resched(); - } -} - -void kvm_iommu_unmap_pages(struct kvm *kvm, struct kvm_memory_slot *slot) -{ - kvm_iommu_put_pages(kvm, slot->base_gfn, slot->npages); -} - -static int kvm_iommu_unmap_memslots(struct kvm *kvm) -{ - int idx; - struct kvm_memslots *slots; - struct kvm_memory_slot *memslot; - - idx = srcu_read_lock(&kvm->srcu); - slots = kvm_memslots(kvm); - - kvm_for_each_memslot(memslot, slots) - kvm_iommu_unmap_pages(kvm, memslot); - - srcu_read_unlock(&kvm->srcu, idx); - - if (kvm->arch.iommu_noncoherent) - kvm_arch_unregister_noncoherent_dma(kvm); - - return 0; -} - -int kvm_iommu_unmap_guest(struct kvm *kvm) -{ - struct iommu_domain *domain = kvm->arch.iommu_domain; - - /* check if iommu exists and in use */ - if (!domain) - return 0; - - mutex_lock(&kvm->slots_lock); - kvm_iommu_unmap_memslots(kvm); - kvm->arch.iommu_domain = NULL; - kvm->arch.iommu_noncoherent = false; - mutex_unlock(&kvm->slots_lock); - - iommu_domain_free(domain); - return 0; -} diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c index 60d91c9..ba0db8f 100644..100755 --- a/arch/x86/kvm/irq.c +++ b/arch/x86/kvm/irq.c @@ -2,6 +2,7 @@ * irq.c: API for in kernel interrupt controller * Copyright (c) 2007, Intel Corporation. * Copyright 2009 Red Hat, Inc. and/or its affiliates. 
+ * Copyright 2019 Google LLC * * This program is free software; you can redistribute it and/or modify it * under the terms and conditions of the GNU General Public License, @@ -20,11 +21,9 @@ * */ -#include <linux/export.h> #include <linux/kvm_host.h> #include "irq.h" -#include "i8254.h" #include "x86.h" /* @@ -38,7 +37,6 @@ int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu) return 0; } -EXPORT_SYMBOL(kvm_cpu_has_pending_timer); /* * check if there is a pending userspace external interrupt @@ -57,10 +55,7 @@ static int kvm_cpu_has_extint(struct kvm_vcpu *v) u8 accept = kvm_apic_accept_pic_intr(v); if (accept) { - if (irqchip_split(v->kvm)) - return pending_userspace_extint(v); - else - return pic_irqchip(v->kvm)->output; + return pic_irqchip(v->kvm)->output; } else return 0; } @@ -99,7 +94,6 @@ int kvm_cpu_has_interrupt(struct kvm_vcpu *v) return kvm_apic_has_interrupt(v) != -1; /* LAPIC */ } -EXPORT_SYMBOL_GPL(kvm_cpu_has_interrupt); /* * Read pending interrupt(from non-APIC source) @@ -108,13 +102,7 @@ EXPORT_SYMBOL_GPL(kvm_cpu_has_interrupt); static int kvm_cpu_get_extint(struct kvm_vcpu *v) { if (kvm_cpu_has_extint(v)) { - if (irqchip_split(v->kvm)) { - int vector = v->arch.pending_external_vector; - - v->arch.pending_external_vector = -1; - return vector; - } else - return kvm_pic_read_irq(v->kvm); /* PIC */ + return kvm_pic_read_irq(v->kvm); /* PIC */ } else return -1; } @@ -136,17 +124,9 @@ int kvm_cpu_get_interrupt(struct kvm_vcpu *v) return kvm_get_apic_interrupt(v); /* APIC */ } -EXPORT_SYMBOL_GPL(kvm_cpu_get_interrupt); void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu) { if (lapic_in_kernel(vcpu)) kvm_inject_apic_timer_irqs(vcpu); } -EXPORT_SYMBOL_GPL(kvm_inject_pending_timer_irqs); - -void __kvm_migrate_timers(struct kvm_vcpu *vcpu) -{ - __kvm_migrate_apic_timer(vcpu); - __kvm_migrate_pit_timer(vcpu); -} diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h index 035731e..b51da4d 100644..100755 --- a/arch/x86/kvm/irq.h +++ b/arch/x86/kvm/irq.h @@ -1,6 +1,7 @@ /* * irq.h: in kernel interrupt controller related definitions * Copyright (c) 2007, Intel Corporation. + * Copyright 2019 Google LLC * * This program is free software; you can redistribute it and/or modify it * under the terms and conditions of the GNU General Public License, @@ -22,10 +23,7 @@ #ifndef __IRQ_H #define __IRQ_H -#include <linux/mm_types.h> -#include <linux/hrtimer.h> #include <linux/kvm_host.h> -#include <linux/spinlock.h> #include <kvm/iodev.h> #include "ioapic.h" @@ -33,7 +31,7 @@ #define PIC_NUM_PINS 16 #define SELECT_PIC(irq) \ - ((irq) < 8 ? KVM_IRQCHIP_PIC_MASTER : KVM_IRQCHIP_PIC_SLAVE) + ((irq) < 8 ? GVM_IRQCHIP_PIC_MASTER : GVM_IRQCHIP_PIC_SLAVE) struct kvm; struct kvm_vcpu; @@ -70,7 +68,7 @@ struct kvm_pic { struct kvm_io_device dev_slave; struct kvm_io_device dev_eclr; void (*ack_notifier)(void *opaque, int irq); - unsigned long irq_states[PIC_NUM_PINS]; + size_t irq_states[PIC_NUM_PINS]; }; struct kvm_pic *kvm_create_pic(struct kvm *kvm); @@ -91,18 +89,12 @@ static inline int pic_in_kernel(struct kvm *kvm) return ret; } -static inline int irqchip_split(struct kvm *kvm) -{ - return kvm->arch.irqchip_split; -} - static inline int irqchip_in_kernel(struct kvm *kvm) { struct kvm_pic *vpic = pic_irqchip(kvm); bool ret; ret = (vpic != NULL); - ret |= irqchip_split(kvm); /* Read vpic before kvm->irq_routing. 
*/ smp_rmb(); @@ -114,9 +106,6 @@ void kvm_pic_reset(struct kvm_kpic_state *s); void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu); void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu); void kvm_apic_nmi_wd_deliver(struct kvm_vcpu *vcpu); -void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu); -void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu); -void __kvm_migrate_timers(struct kvm_vcpu *vcpu); int apic_has_pending_timer(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c index 6c01916..1fd7c73 100644..100755 --- a/arch/x86/kvm/irq_comm.c +++ b/arch/x86/kvm/irq_comm.c @@ -1,6 +1,7 @@ /* * irq_comm.c: Common API for in kernel interrupt controller * Copyright (c) 2007, Intel Corporation. + * Copyright 2019 Google LLC * * This program is free software; you can redistribute it and/or modify it * under the terms and conditions of the GNU General Public License, @@ -21,21 +22,15 @@ */ #include <linux/kvm_host.h> -#include <linux/slab.h> -#include <linux/export.h> -#include <trace/events/kvm.h> - #include <asm/msidef.h> - #include "irq.h" #include "ioapic.h" - #include "lapic.h" - -#include "hyperv.h" #include "x86.h" +#include <gvm_types.h> + static int kvm_set_pic_irq(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm, int irq_source_id, int level, bool line_status) @@ -45,7 +40,7 @@ static int kvm_set_pic_irq(struct kvm_kernel_irq_routing_entry *e, /* * XXX: rejecting pic routes when pic isn't in use would be better, * but the default routing table is installed while kvm->arch.vpic is - * NULL and KVM_CREATE_IRQCHIP can race with KVM_IRQ_LINE. + * NULL and GVM_CREATE_IRQCHIP can race with GVM_IRQ_LINE. */ if (!pic) return -1; @@ -71,7 +66,7 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, { int i, r = -1; struct kvm_vcpu *vcpu, *lowest = NULL; - unsigned long dest_vcpu_bitmap[BITS_TO_LONGS(KVM_MAX_VCPUS)]; + size_t dest_vcpu_bitmap[BITS_TO_LONGS(GVM_MAX_VCPUS)]; unsigned int dest_vcpus = 0; if (irq->dest_mode == 0 && irq->dest_id == 0xff && @@ -112,7 +107,7 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, if (dest_vcpus != 0) { int idx = kvm_vector_to_index(irq->vector, dest_vcpus, - dest_vcpu_bitmap, KVM_MAX_VCPUS); + dest_vcpu_bitmap, GVM_MAX_VCPUS); lowest = kvm_get_vcpu(kvm, idx); } @@ -126,10 +121,6 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, void kvm_set_msi_irq(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e, struct kvm_lapic_irq *irq) { - trace_kvm_msi_set_irq(e->msi.address_lo | (kvm->arch.x2apic_format ? 
- (u64)e->msi.address_hi << 32 : 0), - e->msi.data); - irq->dest_id = (e->msi.address_lo & MSI_ADDR_DEST_ID_MASK) >> MSI_ADDR_DEST_ID_SHIFT; if (kvm->arch.x2apic_format) @@ -144,7 +135,6 @@ void kvm_set_msi_irq(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e, irq->level = 1; irq->shorthand = 0; } -EXPORT_SYMBOL_GPL(kvm_set_msi_irq); static inline bool kvm_msi_route_invalid(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e) @@ -169,16 +159,6 @@ int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e, } -static int kvm_hv_set_sint(struct kvm_kernel_irq_routing_entry *e, - struct kvm *kvm, int irq_source_id, int level, - bool line_status) -{ - if (!level) - return -1; - - return kvm_hv_synic_set_irq(kvm, e->hv_sint.vcpu, e->hv_sint.sint); -} - int kvm_arch_set_irq_inatomic(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm, int irq_source_id, int level, bool line_status) @@ -187,11 +167,7 @@ int kvm_arch_set_irq_inatomic(struct kvm_kernel_irq_routing_entry *e, int r; switch (e->type) { - case KVM_IRQ_ROUTING_HV_SINT: - return kvm_hv_set_sint(e, kvm, irq_source_id, level, - line_status); - - case KVM_IRQ_ROUTING_MSI: + case GVM_IRQ_ROUTING_MSI: if (kvm_msi_route_invalid(kvm, e)) return -EINVAL; @@ -210,7 +186,7 @@ int kvm_arch_set_irq_inatomic(struct kvm_kernel_irq_routing_entry *e, int kvm_request_irq_source_id(struct kvm *kvm) { - unsigned long *bitmap = &kvm->arch.irq_sources_bitmap; + size_t *bitmap = &kvm->arch.irq_sources_bitmap; int irq_source_id; mutex_lock(&kvm->irq_lock); @@ -222,8 +198,7 @@ int kvm_request_irq_source_id(struct kvm *kvm) goto unlock; } - ASSERT(irq_source_id != KVM_USERSPACE_IRQ_SOURCE_ID); - ASSERT(irq_source_id != KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID); + ASSERT(irq_source_id != GVM_USERSPACE_IRQ_SOURCE_ID); set_bit(irq_source_id, bitmap); unlock: mutex_unlock(&kvm->irq_lock); @@ -233,8 +208,7 @@ unlock: void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id) { - ASSERT(irq_source_id != KVM_USERSPACE_IRQ_SOURCE_ID); - ASSERT(irq_source_id != KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID); + ASSERT(irq_source_id != GVM_USERSPACE_IRQ_SOURCE_ID); mutex_lock(&kvm->irq_lock); if (irq_source_id < 0 || @@ -257,7 +231,7 @@ void kvm_register_irq_mask_notifier(struct kvm *kvm, int irq, { mutex_lock(&kvm->irq_lock); kimn->irq = irq; - hlist_add_head_rcu(&kimn->link, &kvm->arch.mask_notifier_list); + hlist_add_head(&kimn->link, &kvm->arch.mask_notifier_list); mutex_unlock(&kvm->irq_lock); } @@ -265,24 +239,25 @@ void kvm_unregister_irq_mask_notifier(struct kvm *kvm, int irq, struct kvm_irq_mask_notifier *kimn) { mutex_lock(&kvm->irq_lock); - hlist_del_rcu(&kimn->link); + hlist_del(&kimn->link); mutex_unlock(&kvm->irq_lock); - synchronize_srcu(&kvm->irq_srcu); } void kvm_fire_mask_notifiers(struct kvm *kvm, unsigned irqchip, unsigned pin, bool mask) { struct kvm_irq_mask_notifier *kimn; - int idx, gsi; + int gsi; - idx = srcu_read_lock(&kvm->irq_srcu); + mutex_lock(&kvm->irq_lock); gsi = kvm_irq_map_chip_pin(kvm, irqchip, pin); if (gsi != -1) - hlist_for_each_entry_rcu(kimn, &kvm->arch.mask_notifier_list, link) +#define LIST_ENTRY_TYPE_INFO struct kvm_irq_mask_notifier + hlist_for_each_entry(kimn, &kvm->arch.mask_notifier_list, link) if (kimn->irq == gsi) kimn->func(kimn, mask); - srcu_read_unlock(&kvm->irq_srcu, idx); +#undef LIST_ENTRY_TYPE_INFO + mutex_unlock(&kvm->irq_lock); } int kvm_set_routing_entry(struct kvm *kvm, @@ -294,20 +269,20 @@ int kvm_set_routing_entry(struct kvm *kvm, unsigned max_pin; switch (ue->type) { - case KVM_IRQ_ROUTING_IRQCHIP: + case 
GVM_IRQ_ROUTING_IRQCHIP: delta = 0; switch (ue->u.irqchip.irqchip) { - case KVM_IRQCHIP_PIC_MASTER: + case GVM_IRQCHIP_PIC_MASTER: e->set = kvm_set_pic_irq; max_pin = PIC_NUM_PINS; break; - case KVM_IRQCHIP_PIC_SLAVE: + case GVM_IRQCHIP_PIC_SLAVE: e->set = kvm_set_pic_irq; max_pin = PIC_NUM_PINS; delta = 8; break; - case KVM_IRQCHIP_IOAPIC: - max_pin = KVM_IOAPIC_NUM_PINS; + case GVM_IRQCHIP_IOAPIC: + max_pin = GVM_IOAPIC_NUM_PINS; e->set = kvm_set_ioapic_irq; break; default: @@ -318,7 +293,7 @@ int kvm_set_routing_entry(struct kvm *kvm, if (e->irqchip.pin >= max_pin) goto out; break; - case KVM_IRQ_ROUTING_MSI: + case GVM_IRQ_ROUTING_MSI: e->set = kvm_set_msi; e->msi.address_lo = ue->u.msi.address_lo; e->msi.address_hi = ue->u.msi.address_hi; @@ -327,11 +302,6 @@ int kvm_set_routing_entry(struct kvm *kvm, if (kvm_msi_route_invalid(kvm, e)) goto out; break; - case KVM_IRQ_ROUTING_HV_SINT: - e->set = kvm_hv_set_sint; - e->hv_sint.vcpu = ue->u.hv_sint.vcpu; - e->hv_sint.sint = ue->u.hv_sint.sint; - break; default: goto out; } @@ -366,15 +336,14 @@ bool kvm_intr_is_single_vcpu(struct kvm *kvm, struct kvm_lapic_irq *irq, return r == 1; } -EXPORT_SYMBOL_GPL(kvm_intr_is_single_vcpu); #define IOAPIC_ROUTING_ENTRY(irq) \ - { .gsi = irq, .type = KVM_IRQ_ROUTING_IRQCHIP, \ - .u.irqchip = { .irqchip = KVM_IRQCHIP_IOAPIC, .pin = (irq) } } + { .gsi = irq, .type = GVM_IRQ_ROUTING_IRQCHIP, \ + .u.irqchip = { .irqchip = GVM_IRQCHIP_IOAPIC, .pin = (irq) } } #define ROUTING_ENTRY1(irq) IOAPIC_ROUTING_ENTRY(irq) #define PIC_ROUTING_ENTRY(irq) \ - { .gsi = irq, .type = KVM_IRQ_ROUTING_IRQCHIP, \ + { .gsi = irq, .type = GVM_IRQ_ROUTING_IRQCHIP, \ .u.irqchip = { .irqchip = SELECT_PIC(irq), .pin = (irq) % 8 } } #define ROUTING_ENTRY2(irq) \ IOAPIC_ROUTING_ENTRY(irq), PIC_ROUTING_ENTRY(irq) @@ -400,13 +369,6 @@ int kvm_setup_default_irq_routing(struct kvm *kvm) ARRAY_SIZE(default_routing), 0); } -static const struct kvm_irq_routing_entry empty_routing[] = {}; - -int kvm_setup_empty_irq_routing(struct kvm *kvm) -{ - return kvm_set_irq_routing(kvm, empty_routing, 0, 0); -} - void kvm_arch_post_irq_routing_update(struct kvm *kvm) { if (ioapic_in_kernel(kvm) || !irqchip_in_kernel(kvm)) @@ -414,37 +376,3 @@ void kvm_arch_post_irq_routing_update(struct kvm *kvm) kvm_make_scan_ioapic_request(kvm); } -void kvm_scan_ioapic_routes(struct kvm_vcpu *vcpu, - ulong *ioapic_handled_vectors) -{ - struct kvm *kvm = vcpu->kvm; - struct kvm_kernel_irq_routing_entry *entry; - struct kvm_irq_routing_table *table; - u32 i, nr_ioapic_pins; - int idx; - - idx = srcu_read_lock(&kvm->irq_srcu); - table = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu); - nr_ioapic_pins = min_t(u32, table->nr_rt_entries, - kvm->arch.nr_reserved_ioapic_pins); - for (i = 0; i < nr_ioapic_pins; ++i) { - hlist_for_each_entry(entry, &table->map[i], link) { - struct kvm_lapic_irq irq; - - if (entry->type != KVM_IRQ_ROUTING_MSI) - continue; - - kvm_set_msi_irq(vcpu->kvm, entry, &irq); - - if (irq.level && kvm_apic_match_dest(vcpu, NULL, 0, - irq.dest_id, irq.dest_mode)) - __set_bit(irq.vector, ioapic_handled_vectors); - } - } - srcu_read_unlock(&kvm->irq_srcu, idx); -} - -void kvm_arch_irq_routing_update(struct kvm *kvm) -{ - kvm_hv_irq_routing_update(kvm); -} diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h index 762cdf2..2ca26a9 100644..100755 --- a/arch/x86/kvm/kvm_cache_regs.h +++ b/arch/x86/kvm/kvm_cache_regs.h @@ -1,15 +1,21 @@ +/* + * Copyright 2019 Google LLC + */ + #ifndef ASM_KVM_CACHE_REGS_H #define ASM_KVM_CACHE_REGS_H 
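kvm_set_msi_irq in irq_comm.c above unpacks the architectural MSI address/data format: the destination ID sits in address bits 19:12, the destination mode in address bit 2, the vector in data bits 7:0, the delivery mode in data bits 10:8 and the trigger mode in data bit 15. A standalone sketch of the same unpacking (struct and names invented):

    #include <stdint.h>

    struct msi_fields {
        uint8_t dest_id;
        uint8_t vector;
        uint8_t delivery_mode;   /* 0 fixed, 1 lowest priority, 4 NMI, ... */
        uint8_t dest_mode;       /* 0 physical, 1 logical                  */
        uint8_t trig_mode;       /* 0 edge, 1 level                        */
    };

    static struct msi_fields decode_msi(uint32_t addr_lo, uint32_t data)
    {
        struct msi_fields m;
        m.dest_id       = (addr_lo >> 12) & 0xff;
        m.dest_mode     = (addr_lo >> 2) & 1;
        m.vector        = data & 0xff;
        m.delivery_mode = (data >> 8) & 0x7;
        m.trig_mode     = (data >> 15) & 1;
        return m;
    }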
+#include <uapi/asm/processor-flags.h> + #define KVM_POSSIBLE_CR0_GUEST_BITS X86_CR0_TS #define KVM_POSSIBLE_CR4_GUEST_BITS \ (X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR \ | X86_CR4_OSXMMEXCPT | X86_CR4_PGE) -static inline unsigned long kvm_register_read(struct kvm_vcpu *vcpu, +static inline size_t kvm_register_read(struct kvm_vcpu *vcpu, enum kvm_reg reg) { - if (!test_bit(reg, (unsigned long *)&vcpu->arch.regs_avail)) + if (!test_bit(reg, (size_t *)&vcpu->arch.regs_avail)) kvm_x86_ops->cache_reg(vcpu, reg); return vcpu->arch.regs[reg]; @@ -17,19 +23,19 @@ static inline unsigned long kvm_register_read(struct kvm_vcpu *vcpu, static inline void kvm_register_write(struct kvm_vcpu *vcpu, enum kvm_reg reg, - unsigned long val) + size_t val) { vcpu->arch.regs[reg] = val; - __set_bit(reg, (unsigned long *)&vcpu->arch.regs_dirty); - __set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail); + __set_bit(reg, (size_t *)&vcpu->arch.regs_dirty); + __set_bit(reg, (size_t *)&vcpu->arch.regs_avail); } -static inline unsigned long kvm_rip_read(struct kvm_vcpu *vcpu) +static inline size_t kvm_rip_read(struct kvm_vcpu *vcpu) { return kvm_register_read(vcpu, VCPU_REGS_RIP); } -static inline void kvm_rip_write(struct kvm_vcpu *vcpu, unsigned long val) +static inline void kvm_rip_write(struct kvm_vcpu *vcpu, size_t val) { kvm_register_write(vcpu, VCPU_REGS_RIP, val); } @@ -39,54 +45,49 @@ static inline u64 kvm_pdptr_read(struct kvm_vcpu *vcpu, int index) might_sleep(); /* on svm */ if (!test_bit(VCPU_EXREG_PDPTR, - (unsigned long *)&vcpu->arch.regs_avail)) + (size_t *)&vcpu->arch.regs_avail)) kvm_x86_ops->cache_reg(vcpu, VCPU_EXREG_PDPTR); return vcpu->arch.walk_mmu->pdptrs[index]; } -static inline ulong kvm_read_cr0_bits(struct kvm_vcpu *vcpu, ulong mask) +static inline size_t kvm_read_cr0_bits(struct kvm_vcpu *vcpu, size_t mask) { - ulong tmask = mask & KVM_POSSIBLE_CR0_GUEST_BITS; + size_t tmask = mask & KVM_POSSIBLE_CR0_GUEST_BITS; if (tmask & vcpu->arch.cr0_guest_owned_bits) kvm_x86_ops->decache_cr0_guest_bits(vcpu); return vcpu->arch.cr0 & mask; } -static inline ulong kvm_read_cr0(struct kvm_vcpu *vcpu) +static inline size_t kvm_read_cr0(struct kvm_vcpu *vcpu) { - return kvm_read_cr0_bits(vcpu, ~0UL); + return kvm_read_cr0_bits(vcpu, ~(size_t)0); } -static inline ulong kvm_read_cr4_bits(struct kvm_vcpu *vcpu, ulong mask) +static inline size_t kvm_read_cr4_bits(struct kvm_vcpu *vcpu, size_t mask) { - ulong tmask = mask & KVM_POSSIBLE_CR4_GUEST_BITS; + size_t tmask = mask & KVM_POSSIBLE_CR4_GUEST_BITS; if (tmask & vcpu->arch.cr4_guest_owned_bits) kvm_x86_ops->decache_cr4_guest_bits(vcpu); return vcpu->arch.cr4 & mask; } -static inline ulong kvm_read_cr3(struct kvm_vcpu *vcpu) +static inline size_t kvm_read_cr3(struct kvm_vcpu *vcpu) { - if (!test_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail)) + if (!test_bit(VCPU_EXREG_CR3, (size_t *)&vcpu->arch.regs_avail)) kvm_x86_ops->decache_cr3(vcpu); return vcpu->arch.cr3; } -static inline ulong kvm_read_cr4(struct kvm_vcpu *vcpu) +static inline size_t kvm_read_cr4(struct kvm_vcpu *vcpu) { - return kvm_read_cr4_bits(vcpu, ~0UL); + return kvm_read_cr4_bits(vcpu, ~(size_t)0); } static inline u64 kvm_read_edx_eax(struct kvm_vcpu *vcpu) { - return (kvm_register_read(vcpu, VCPU_REGS_RAX) & -1u) - | ((u64)(kvm_register_read(vcpu, VCPU_REGS_RDX) & -1u) << 32); -} - -static inline u32 kvm_read_pkru(struct kvm_vcpu *vcpu) -{ - return kvm_x86_ops->get_pkru(vcpu); + return (kvm_register_read(vcpu, VCPU_REGS_RAX) & (unsigned)-1) + | 
((u64)(kvm_register_read(vcpu, VCPU_REGS_RDX) & (unsigned)-1) << 32); } static inline void enter_guest_mode(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 6f69340..7a156d4 100644..100755 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -6,6 +6,7 @@ * Copyright (C) 2007 Novell * Copyright (C) 2007 Intel * Copyright 2009 Red Hat, Inc. and/or its affiliates. + * Copyright 2019 Google LLC * * Authors: * Dor Laor <dor.laor@qumranet.com> @@ -19,29 +20,15 @@ */ #include <linux/kvm_host.h> -#include <linux/kvm.h> -#include <linux/mm.h> -#include <linux/highmem.h> -#include <linux/smp.h> -#include <linux/hrtimer.h> -#include <linux/io.h> -#include <linux/export.h> -#include <linux/math64.h> -#include <linux/slab.h> -#include <asm/processor.h> -#include <asm/msr.h> -#include <asm/page.h> -#include <asm/current.h> +#include <uapi/linux/kvm.h> #include <asm/apicdef.h> -#include <asm/delay.h> -#include <linux/atomic.h> -#include <linux/jump_label.h> #include "kvm_cache_regs.h" #include "irq.h" -#include "trace.h" #include "x86.h" #include "cpuid.h" -#include "hyperv.h" + +#include <gvm_types.h> + #ifndef CONFIG_X86_64 #define mod_64(x, y) ((x) - (y) * div64_u64(x, y)) @@ -57,10 +44,10 @@ #define APIC_BUS_CYCLE_NS 1 /* #define apic_debug(fmt,arg...) printk(KERN_WARNING fmt,##arg) */ -#define apic_debug(fmt, arg...) +#define apic_debug(fmt, arg,...) /* 14 is the version for Xeon and Pentium 8.4.8*/ -#define APIC_VERSION (0x14UL | ((KVM_APIC_LVT_NUM - 1) << 16)) +#define APIC_VERSION (0x14UL | ((GVM_APIC_LVT_NUM - 1) << 16)) #define LAPIC_MMIO_LENGTH (1 << 12) /* followed define is not in apicdef.h */ #define APIC_SHORT_MASK 0xc0000 @@ -72,9 +59,33 @@ #define APIC_BROADCAST 0xFF #define X2APIC_BROADCAST 0xFFFFFFFFul + +/** + * hweightN - returns the hamming weight of a N-bit word + * @x: the word to weigh + * + * The Hamming Weight of a number is the total number of bits set in it. 
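+ * For example, hweight32(0x0000F00F) = 8 and hweight16(0x00FF) = 8.
+ * The implementations below use the usual SWAR trick: sum adjacent bit
+ * pairs, then nibbles, then bytes, so the count finishes in a handful
+ * of shifts and masks instead of a 32-iteration loop.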
+ */ + +static unsigned int hweight32(unsigned int w) +{ + w -= (w >> 1) & 0x55555555; + w = (w & 0x33333333) + ((w >> 2) & 0x33333333); + w = (w + (w >> 4)) & 0x0f0f0f0f; + return (w * 0x01010101) >> 24; +} + +static unsigned int hweight16(unsigned int w) +{ + unsigned int res = w - ((w >> 1) & 0x5555); + res = (res & 0x3333) + ((res >> 2) & 0x3333); + res = (res + (res >> 4)) & 0x0F0F; + return (res + (res >> 8)) & 0x00FF; +} + static inline int apic_test_vector(int vec, void *bitmap) { - return test_bit(VEC_POS(vec), (bitmap) + REG_POS(vec)); + return test_bit(VEC_POS(vec), (size_t *)((char *)(bitmap)+REG_POS(vec))); } bool kvm_apic_pending_eoi(struct kvm_vcpu *vcpu, int vector) @@ -87,22 +98,19 @@ bool kvm_apic_pending_eoi(struct kvm_vcpu *vcpu, int vector) static inline void apic_clear_vector(int vec, void *bitmap) { - clear_bit(VEC_POS(vec), (bitmap) + REG_POS(vec)); + clear_bit(VEC_POS(vec), (size_t *)((u8 *)(bitmap) + REG_POS(vec))); } static inline int __apic_test_and_set_vector(int vec, void *bitmap) { - return __test_and_set_bit(VEC_POS(vec), (bitmap) + REG_POS(vec)); + return __test_and_set_bit(VEC_POS(vec), (size_t *)((u8 *)(bitmap) + REG_POS(vec))); } static inline int __apic_test_and_clear_vector(int vec, void *bitmap) { - return __test_and_clear_bit(VEC_POS(vec), (bitmap) + REG_POS(vec)); + return __test_and_clear_bit(VEC_POS(vec), (size_t *)((u8 *)(bitmap) + REG_POS(vec))); } -struct static_key_deferred apic_hw_disabled __read_mostly; -struct static_key_deferred apic_sw_disabled __read_mostly; - static inline int apic_enabled(struct kvm_lapic *apic) { return kvm_apic_sw_enabled(apic) && kvm_apic_hw_enabled(apic); @@ -118,7 +126,7 @@ static inline int apic_enabled(struct kvm_lapic *apic) static inline bool kvm_apic_map_get_logical_dest(struct kvm_apic_map *map, u32 dest_id, struct kvm_lapic ***cluster, u16 *mask) { switch (map->mode) { - case KVM_APIC_MODE_X2APIC: { + case GVM_APIC_MODE_X2APIC: { u32 offset = (dest_id >> 16) * 16; u32 max_apic_id = map->max_apic_id; @@ -133,11 +141,11 @@ static inline bool kvm_apic_map_get_logical_dest(struct kvm_apic_map *map, return true; } - case KVM_APIC_MODE_XAPIC_FLAT: + case GVM_APIC_MODE_XAPIC_FLAT: *cluster = map->xapic_flat_map; *mask = dest_id & 0xff; return true; - case KVM_APIC_MODE_XAPIC_CLUSTER: + case GVM_APIC_MODE_XAPIC_CLUSTER: *cluster = map->xapic_cluster_map[(dest_id >> 4) & 0xf]; *mask = dest_id & 0xf; return true; @@ -147,13 +155,6 @@ static inline bool kvm_apic_map_get_logical_dest(struct kvm_apic_map *map, } } -static void kvm_apic_map_free(struct rcu_head *rcu) -{ - struct kvm_apic_map *map = container_of(rcu, struct kvm_apic_map, rcu); - - kvfree(map); -} - static void recalculate_apic_map(struct kvm *kvm) { struct kvm_apic_map *new, *old = NULL; @@ -191,13 +192,13 @@ static void recalculate_apic_map(struct kvm *kvm) new->phys_map[aid] = apic; if (apic_x2apic_mode(apic)) { - new->mode |= KVM_APIC_MODE_X2APIC; + new->mode |= GVM_APIC_MODE_X2APIC; } else if (ldr) { ldr = GET_APIC_LOGICAL_ID(ldr); if (kvm_lapic_get_reg(apic, APIC_DFR) == APIC_DFR_FLAT) - new->mode |= KVM_APIC_MODE_XAPIC_FLAT; + new->mode |= GVM_APIC_MODE_XAPIC_FLAT; else - new->mode |= KVM_APIC_MODE_XAPIC_CLUSTER; + new->mode |= GVM_APIC_MODE_XAPIC_CLUSTER; } if (!kvm_apic_map_get_logical_dest(new, ldr, &cluster, &mask)) @@ -207,13 +208,12 @@ static void recalculate_apic_map(struct kvm *kvm) cluster[ffs(mask) - 1] = apic; } out: - old = rcu_dereference_protected(kvm->arch.apic_map, - lockdep_is_held(&kvm->arch.apic_map_lock)); - 
rcu_assign_pointer(kvm->arch.apic_map, new); + old = kvm->arch.apic_map; + kvm->arch.apic_map = new; mutex_unlock(&kvm->arch.apic_map_lock); if (old) - call_rcu(&old->rcu, kvm_apic_map_free); + kvfree(old); kvm_make_scan_ioapic_request(kvm); } @@ -227,10 +227,8 @@ static inline void apic_set_spiv(struct kvm_lapic *apic, u32 val) if (enabled != apic->sw_enabled) { apic->sw_enabled = enabled; if (enabled) { - static_key_slow_dec_deferred(&apic_sw_disabled); recalculate_apic_map(apic->vcpu->kvm); - } else - static_key_slow_inc(&apic_sw_disabled.key); + } //else } } @@ -275,11 +273,6 @@ static inline int apic_lvtt_period(struct kvm_lapic *apic) return apic->lapic_timer.timer_mode == APIC_LVT_TIMER_PERIODIC; } -static inline int apic_lvtt_tscdeadline(struct kvm_lapic *apic) -{ - return apic->lapic_timer.timer_mode == APIC_LVT_TIMER_TSCDEADLINE; -} - static inline int apic_lvt_nmi_mode(u32 lvt_val) { return (lvt_val & (APIC_MODE_MASK | APIC_LVT_MASKED)) == APIC_DM_NMI; @@ -288,7 +281,7 @@ static inline int apic_lvt_nmi_mode(u32 lvt_val) void kvm_apic_set_version(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; - struct kvm_cpuid_entry2 *feat; + struct kvm_cpuid_entry *feat; u32 v = APIC_VERSION; if (!lapic_in_kernel(vcpu)) @@ -300,7 +293,7 @@ void kvm_apic_set_version(struct kvm_vcpu *vcpu) kvm_lapic_set_reg(apic, APIC_LVR, v); } -static const unsigned int apic_lvt_mask[KVM_APIC_LVT_NUM] = { +static const unsigned int apic_lvt_mask[GVM_APIC_LVT_NUM] = { LVT_MASK , /* part LVTT mask, timer mode mask added at runtime */ LVT_MASK | APIC_MODE_MASK, /* LVTTHMR */ LVT_MASK | APIC_MODE_MASK, /* LVTPC */ @@ -315,7 +308,7 @@ static int find_highest_vector(void *bitmap) for (vec = MAX_APIC_VECTOR - APIC_VECTORS_PER_REG; vec >= 0; vec -= APIC_VECTORS_PER_REG) { - reg = bitmap + REG_POS(vec); + reg = (u32 *)((u8 *)bitmap + REG_POS(vec)); if (*reg) return fls(*reg) - 1 + vec; } @@ -330,7 +323,7 @@ static u8 count_vectors(void *bitmap) u8 count = 0; for (vec = 0; vec < MAX_APIC_VECTOR; vec += APIC_VECTORS_PER_REG) { - reg = bitmap + REG_POS(vec); + reg = (u32 *)((u8 *)bitmap + REG_POS(vec)); count += hweight32(*reg); } @@ -344,10 +337,9 @@ void __kvm_apic_update_irr(u32 *pir, void *regs) for (i = 0; i <= 7; i++) { pir_val = xchg(&pir[i], 0); if (pir_val) - *((u32 *)(regs + APIC_IRR + i * 0x10)) |= pir_val; + *((u32 *)((u8 *)regs + APIC_IRR + i * 0x10)) |= pir_val; } } -EXPORT_SYMBOL_GPL(__kvm_apic_update_irr); void kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir) { @@ -355,9 +347,8 @@ void kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir) __kvm_apic_update_irr(pir, apic->regs); - kvm_make_request(KVM_REQ_EVENT, vcpu); + kvm_make_request(GVM_REQ_EVENT, vcpu); } -EXPORT_SYMBOL_GPL(kvm_apic_update_irr); static inline int apic_search_irr(struct kvm_lapic *apic) { @@ -375,8 +366,6 @@ static inline int apic_find_highest_irr(struct kvm_lapic *apic) if (!apic->irr_pending) return -1; - if (apic->vcpu->arch.apicv_active) - kvm_x86_ops->sync_pir_to_irr(apic->vcpu); result = apic_search_irr(apic); ASSERT(result == -1 || result >= 16); @@ -392,7 +381,7 @@ static inline void apic_clear_irr(int vec, struct kvm_lapic *apic) if (unlikely(vcpu->arch.apicv_active)) { /* try to update RVI */ apic_clear_vector(vec, apic->regs + APIC_IRR); - kvm_make_request(KVM_REQ_EVENT, vcpu); + kvm_make_request(GVM_REQ_EVENT, vcpu); } else { apic->irr_pending = false; apic_clear_vector(vec, apic->regs + APIC_IRR); @@ -496,54 +485,6 @@ int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq, irq->level, 
irq->trig_mode, dest_map); } -static int pv_eoi_put_user(struct kvm_vcpu *vcpu, u8 val) -{ - - return kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.pv_eoi.data, &val, - sizeof(val)); -} - -static int pv_eoi_get_user(struct kvm_vcpu *vcpu, u8 *val) -{ - - return kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.pv_eoi.data, val, - sizeof(*val)); -} - -static inline bool pv_eoi_enabled(struct kvm_vcpu *vcpu) -{ - return vcpu->arch.pv_eoi.msr_val & KVM_MSR_ENABLED; -} - -static bool pv_eoi_get_pending(struct kvm_vcpu *vcpu) -{ - u8 val; - if (pv_eoi_get_user(vcpu, &val) < 0) - apic_debug("Can't read EOI MSR value: 0x%llx\n", - (unsigned long long)vcpu->arch.pv_eoi.msr_val); - return val & 0x1; -} - -static void pv_eoi_set_pending(struct kvm_vcpu *vcpu) -{ - if (pv_eoi_put_user(vcpu, KVM_PV_EOI_ENABLED) < 0) { - apic_debug("Can't set EOI MSR value: 0x%llx\n", - (unsigned long long)vcpu->arch.pv_eoi.msr_val); - return; - } - __set_bit(KVM_APIC_PV_EOI_PENDING, &vcpu->arch.apic_attention); -} - -static void pv_eoi_clr_pending(struct kvm_vcpu *vcpu) -{ - if (pv_eoi_put_user(vcpu, KVM_PV_EOI_DISABLED) < 0) { - apic_debug("Can't clear EOI MSR value: 0x%llx\n", - (unsigned long long)vcpu->arch.pv_eoi.msr_val); - return; - } - __clear_bit(KVM_APIC_PV_EOI_PENDING, &vcpu->arch.apic_attention); -} - static void apic_update_ppr(struct kvm_lapic *apic) { u32 tpr, isrv, ppr, old_ppr; @@ -565,7 +506,7 @@ static void apic_update_ppr(struct kvm_lapic *apic) if (old_ppr != ppr) { kvm_lapic_set_reg(apic, APIC_PROCPRI, ppr); if (ppr < old_ppr) - kvm_make_request(KVM_REQ_EVENT, apic->vcpu); + kvm_make_request(GVM_REQ_EVENT, apic->vcpu); } } @@ -623,7 +564,7 @@ static bool kvm_apic_match_logical_addr(struct kvm_lapic *apic, u32 mda) } } -/* The KVM local APIC implementation has two quirks: +/* The kvm local APIC implementation has two quirks: * * - the xAPIC MDA stores the destination at bits 24-31, while this * is not true of struct kvm_lapic_irq's dest_id field. This is @@ -635,7 +576,7 @@ static bool kvm_apic_match_logical_addr(struct kvm_lapic *apic, u32 mda) * rewrites the destination of non-IPI messages from APIC_BROADCAST * to X2APIC_BROADCAST. * - * The broadcast quirk can be disabled with KVM_CAP_X2APIC_API. This is + * The broadcast quirk can be disabled with GVM_CAP_X2APIC_API. This is * important when userspace wants to use x2APIC-format MSIs, because * APIC_BROADCAST (0xff) is a legal route for "cluster 0, CPUs 0-7". 
*/ @@ -681,10 +622,9 @@ bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, return false; } } -EXPORT_SYMBOL_GPL(kvm_apic_match_dest); int kvm_vector_to_index(u32 vector, u32 dest_vcpus, - const unsigned long *bitmap, u32 bitmap_size) + const size_t *bitmap, u32 bitmap_size) { u32 mod; int i, idx = -1; @@ -713,7 +653,7 @@ static bool kvm_apic_is_broadcast_dest(struct kvm *kvm, struct kvm_lapic **src, { if (kvm->arch.x2apic_broadcast_quirk_disabled) { if ((irq->dest_id == APIC_BROADCAST && - map->mode != KVM_APIC_MODE_X2APIC)) + map->mode != GVM_APIC_MODE_X2APIC)) return true; if (irq->dest_id == X2APIC_BROADCAST) return true; @@ -737,7 +677,7 @@ static bool kvm_apic_is_broadcast_dest(struct kvm *kvm, struct kvm_lapic **src, static inline bool kvm_apic_map_get_dest_lapic(struct kvm *kvm, struct kvm_lapic **src, struct kvm_lapic_irq *irq, struct kvm_apic_map *map, struct kvm_lapic ***dst, - unsigned long *bitmap) + size_t *bitmap) { int i, lowest; @@ -803,7 +743,7 @@ bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src, struct kvm_lapic_irq *irq, int *r, struct dest_map *dest_map) { struct kvm_apic_map *map; - unsigned long bitmap; + size_t bitmap; struct kvm_lapic **dst = NULL; int i; bool ret; @@ -850,7 +790,7 @@ bool kvm_intr_is_single_vcpu_fast(struct kvm *kvm, struct kvm_lapic_irq *irq, struct kvm_vcpu **dest_vcpu) { struct kvm_apic_map *map; - unsigned long bitmap; + size_t bitmap; struct kvm_lapic **dst = NULL; bool ret = false; @@ -862,7 +802,7 @@ bool kvm_intr_is_single_vcpu_fast(struct kvm *kvm, struct kvm_lapic_irq *irq, if (kvm_apic_map_get_dest_lapic(kvm, NULL, irq, map, &dst, &bitmap) && hweight16(bitmap) == 1) { - unsigned long i = find_first_bit(&bitmap, 16); + size_t i = find_first_bit(&bitmap, 16); if (dst[i]) { *dest_vcpu = dst[i]->vcpu; @@ -885,8 +825,6 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, int result = 0; struct kvm_vcpu *vcpu = apic->vcpu; - trace_kvm_apic_accept_irq(vcpu->vcpu_id, delivery_mode, - trig_mode, vector); switch (delivery_mode) { case APIC_DM_LOWEST: vcpu->arch.apic_arb_prio++; @@ -912,26 +850,20 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, apic_clear_vector(vector, apic->regs + APIC_TMR); } - if (vcpu->arch.apicv_active) + if (vcpu->arch.apicv_active && + kvm_x86_ops->deliver_posted_interrupt) kvm_x86_ops->deliver_posted_interrupt(vcpu, vector); else { kvm_lapic_set_irr(vector, apic); - kvm_make_request(KVM_REQ_EVENT, vcpu); + kvm_make_request(GVM_REQ_EVENT, vcpu); kvm_vcpu_kick(vcpu); } break; - case APIC_DM_REMRD: - result = 1; - vcpu->arch.pv.pv_unhalted = 1; - kvm_make_request(KVM_REQ_EVENT, vcpu); - kvm_vcpu_kick(vcpu); - break; - case APIC_DM_SMI: result = 1; - kvm_make_request(KVM_REQ_SMI, vcpu); + kvm_make_request(GVM_REQ_SMI, vcpu); kvm_vcpu_kick(vcpu); break; @@ -944,12 +876,12 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, case APIC_DM_INIT: if (!trig_mode || level) { result = 1; - /* assumes that there are only KVM_APIC_INIT/SIPI */ - apic->pending_events = (1UL << KVM_APIC_INIT); + /* assumes that there are only GVM_APIC_INIT/SIPI */ + apic->pending_events = (1ULL << GVM_APIC_INIT); /* make sure pending_events is visible before sending * the request */ smp_wmb(); - kvm_make_request(KVM_REQ_EVENT, vcpu); + kvm_make_request(GVM_REQ_EVENT, vcpu); kvm_vcpu_kick(vcpu); } else { apic_debug("Ignoring de-assert INIT to vcpu %d\n", @@ -964,8 +896,8 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int 
delivery_mode, apic->sipi_vector = vector; /* make sure sipi_vector is visible for the receiver */ smp_wmb(); - set_bit(KVM_APIC_SIPI, &apic->pending_events); - kvm_make_request(KVM_REQ_EVENT, vcpu); + set_bit(GVM_APIC_SIPI, &apic->pending_events); + kvm_make_request(GVM_REQ_EVENT, vcpu); kvm_vcpu_kick(vcpu); break; @@ -1003,13 +935,6 @@ static void kvm_ioapic_send_eoi(struct kvm_lapic *apic, int vector) if (!kvm_ioapic_handles_vector(apic, vector)) return; - /* Request a KVM exit to inform the userspace IOAPIC. */ - if (irqchip_split(apic->vcpu->kvm)) { - apic->vcpu->arch.pending_ioapic_eoi = vector; - kvm_make_request(KVM_REQ_IOAPIC_EOI_EXIT, apic->vcpu); - return; - } - if (apic_test_vector(vector, apic->regs + APIC_TMR)) trigger_mode = IOAPIC_LEVEL_TRIG; else @@ -1022,8 +947,6 @@ static int apic_set_eoi(struct kvm_lapic *apic) { int vector = apic_find_highest_isr(apic); - trace_kvm_eoi(apic, vector); - /* * Not every write EOI will has corresponding ISR, * one example is when Kernel check timer on setup_IO_APIC @@ -1034,11 +957,8 @@ static int apic_set_eoi(struct kvm_lapic *apic) apic_clear_isr(vector, apic); apic_update_ppr(apic); - if (test_bit(vector, vcpu_to_synic(apic->vcpu)->vec_bitmap)) - kvm_hv_synic_send_eoi(apic->vcpu, vector); - kvm_ioapic_send_eoi(apic, vector); - kvm_make_request(KVM_REQ_EVENT, apic->vcpu); + kvm_make_request(GVM_REQ_EVENT, apic->vcpu); return vector; } @@ -1050,12 +970,9 @@ void kvm_apic_set_eoi_accelerated(struct kvm_vcpu *vcpu, int vector) { struct kvm_lapic *apic = vcpu->arch.apic; - trace_kvm_eoi(apic, vector); - kvm_ioapic_send_eoi(apic, vector); - kvm_make_request(KVM_REQ_EVENT, apic->vcpu); + kvm_make_request(GVM_REQ_EVENT, apic->vcpu); } -EXPORT_SYMBOL_GPL(kvm_apic_set_eoi_accelerated); static void apic_send_ipi(struct kvm_lapic *apic) { @@ -1075,8 +992,6 @@ static void apic_send_ipi(struct kvm_lapic *apic) else irq.dest_id = GET_APIC_DEST_FIELD(icr_high); - trace_kvm_apic_ipi(icr_low, irq.dest_id); - apic_debug("icr_high 0x%x, icr_low 0x%x, " "short_hand 0x%x, dest 0x%x, trig_mode 0x%x, level 0x%x, " "dest_mode 0x%x, delivery_mode 0x%x, vector 0x%x, " @@ -1117,7 +1032,7 @@ static void __report_tpr_access(struct kvm_lapic *apic, bool write) struct kvm_vcpu *vcpu = apic->vcpu; struct kvm_run *run = vcpu->run; - kvm_make_request(KVM_REQ_REPORT_TPR_ACCESS, vcpu); + kvm_make_request(GVM_REQ_REPORT_TPR_ACCESS, vcpu); run->tpr_access.rip = kvm_rip_read(vcpu); run->tpr_access.is_write = write; } @@ -1137,13 +1052,9 @@ static u32 __apic_read(struct kvm_lapic *apic, unsigned int offset) switch (offset) { case APIC_ARBPRI: - apic_debug("Access APIC ARBPRI register which is for P6\n"); + //apic_debug("Access APIC ARBPRI register which is for P6\n"); break; - case APIC_TMCCT: /* Timer CCR */ - if (apic_lvtt_tscdeadline(apic)) - return 0; - val = apic_get_tmcct(apic); break; case APIC_PROCPRI: @@ -1175,21 +1086,19 @@ int kvm_lapic_reg_read(struct kvm_lapic *apic, u32 offset, int len, static const u64 rmask = 0x43ff01ffffffe70cULL; if ((alignment + len) > 4) { - apic_debug("KVM_APIC_READ: alignment error %x %d\n", + apic_debug("GVM_APIC_READ: alignment error %x %d\n", offset, len); return 1; } if (offset > 0x3f0 || !(rmask & (1ULL << (offset >> 4)))) { - apic_debug("KVM_APIC_READ: read reserved register %x\n", + apic_debug("GVM_APIC_READ: read reserved register %x\n", offset); return 1; } result = __apic_read(apic, offset & ~0xf); - trace_kvm_apic_read(offset, result); - switch (len) { case 1: case 2: @@ -1203,7 +1112,6 @@ int kvm_lapic_reg_read(struct 
kvm_lapic *apic, u32 offset, int len, } return 0; } -EXPORT_SYMBOL_GPL(kvm_lapic_reg_read); static int apic_mmio_in_range(struct kvm_lapic *apic, gpa_t addr) { @@ -1253,8 +1161,7 @@ static void apic_update_lvtt(struct kvm_lapic *apic) static void apic_timer_expired(struct kvm_lapic *apic) { struct kvm_vcpu *vcpu = apic->vcpu; - struct swait_queue_head *q = &vcpu->wq; - struct kvm_timer *ktimer = &apic->lapic_timer; + //struct swait_queue_head *q = &vcpu->wq; if (atomic_read(&apic->lapic_timer.pending)) return; @@ -1262,11 +1169,12 @@ static void apic_timer_expired(struct kvm_lapic *apic) atomic_inc(&apic->lapic_timer.pending); kvm_set_pending_timer(vcpu); + kvm_vcpu_kick(vcpu); + +#if 0 if (swait_active(q)) swake_up(q); - - if (apic_lvtt_tscdeadline(apic)) - ktimer->expired_tscdeadline = ktimer->tscdeadline; +#endif } /* @@ -1292,136 +1200,6 @@ static bool lapic_timer_int_injected(struct kvm_vcpu *vcpu) return false; } -void wait_lapic_expire(struct kvm_vcpu *vcpu) -{ - struct kvm_lapic *apic = vcpu->arch.apic; - u64 guest_tsc, tsc_deadline; - - if (!lapic_in_kernel(vcpu)) - return; - - if (apic->lapic_timer.expired_tscdeadline == 0) - return; - - if (!lapic_timer_int_injected(vcpu)) - return; - - tsc_deadline = apic->lapic_timer.expired_tscdeadline; - apic->lapic_timer.expired_tscdeadline = 0; - guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc()); - trace_kvm_wait_lapic_expire(vcpu->vcpu_id, guest_tsc - tsc_deadline); - - /* __delay is delay_tsc whenever the hardware has TSC, thus always. */ - if (guest_tsc < tsc_deadline) - __delay(min(tsc_deadline - guest_tsc, - nsec_to_cycles(vcpu, lapic_timer_advance_ns))); -} - -static void start_sw_tscdeadline(struct kvm_lapic *apic) -{ - u64 guest_tsc, tscdeadline = apic->lapic_timer.tscdeadline; - u64 ns = 0; - ktime_t expire; - struct kvm_vcpu *vcpu = apic->vcpu; - unsigned long this_tsc_khz = vcpu->arch.virtual_tsc_khz; - unsigned long flags; - ktime_t now; - - if (unlikely(!tscdeadline || !this_tsc_khz)) - return; - - local_irq_save(flags); - - now = apic->lapic_timer.timer.base->get_time(); - guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc()); - if (likely(tscdeadline > guest_tsc)) { - ns = (tscdeadline - guest_tsc) * 1000000ULL; - do_div(ns, this_tsc_khz); - expire = ktime_add_ns(now, ns); - expire = ktime_sub_ns(expire, lapic_timer_advance_ns); - hrtimer_start(&apic->lapic_timer.timer, - expire, HRTIMER_MODE_ABS_PINNED); - } else - apic_timer_expired(apic); - - local_irq_restore(flags); -} - -bool kvm_lapic_hv_timer_in_use(struct kvm_vcpu *vcpu) -{ - if (!lapic_in_kernel(vcpu)) - return false; - - return vcpu->arch.apic->lapic_timer.hv_timer_in_use; -} -EXPORT_SYMBOL_GPL(kvm_lapic_hv_timer_in_use); - -static void cancel_hv_tscdeadline(struct kvm_lapic *apic) -{ - kvm_x86_ops->cancel_hv_timer(apic->vcpu); - apic->lapic_timer.hv_timer_in_use = false; -} - -void kvm_lapic_expired_hv_timer(struct kvm_vcpu *vcpu) -{ - struct kvm_lapic *apic = vcpu->arch.apic; - - WARN_ON(!apic->lapic_timer.hv_timer_in_use); - WARN_ON(swait_active(&vcpu->wq)); - cancel_hv_tscdeadline(apic); - apic_timer_expired(apic); -} -EXPORT_SYMBOL_GPL(kvm_lapic_expired_hv_timer); - -static bool start_hv_tscdeadline(struct kvm_lapic *apic) -{ - u64 tscdeadline = apic->lapic_timer.tscdeadline; - - if (atomic_read(&apic->lapic_timer.pending) || - kvm_x86_ops->set_hv_timer(apic->vcpu, tscdeadline)) { - if (apic->lapic_timer.hv_timer_in_use) - cancel_hv_tscdeadline(apic); - } else { - apic->lapic_timer.hv_timer_in_use = true; - hrtimer_cancel(&apic->lapic_timer.timer); - - /* In case the sw timer 
triggered in the window */ - if (atomic_read(&apic->lapic_timer.pending)) - cancel_hv_tscdeadline(apic); - } - trace_kvm_hv_timer_state(apic->vcpu->vcpu_id, - apic->lapic_timer.hv_timer_in_use); - return apic->lapic_timer.hv_timer_in_use; -} - -void kvm_lapic_switch_to_hv_timer(struct kvm_vcpu *vcpu) -{ - struct kvm_lapic *apic = vcpu->arch.apic; - - WARN_ON(apic->lapic_timer.hv_timer_in_use); - - if (apic_lvtt_tscdeadline(apic)) - start_hv_tscdeadline(apic); -} -EXPORT_SYMBOL_GPL(kvm_lapic_switch_to_hv_timer); - -void kvm_lapic_switch_to_sw_timer(struct kvm_vcpu *vcpu) -{ - struct kvm_lapic *apic = vcpu->arch.apic; - - /* Possibly the TSC deadline timer is not enabled yet */ - if (!apic->lapic_timer.hv_timer_in_use) - return; - - cancel_hv_tscdeadline(apic); - - if (atomic_read(&apic->lapic_timer.pending)) - return; - - start_sw_tscdeadline(apic); -} -EXPORT_SYMBOL_GPL(kvm_lapic_switch_to_sw_timer); - static void start_apic_timer(struct kvm_lapic *apic) { ktime_t now; @@ -1467,9 +1245,6 @@ static void start_apic_timer(struct kvm_lapic *apic) apic->lapic_timer.period, ktime_to_ns(ktime_add_ns(now, apic->lapic_timer.period))); - } else if (apic_lvtt_tscdeadline(apic)) { - if (!(kvm_x86_ops->set_hv_timer && start_hv_tscdeadline(apic))) - start_sw_tscdeadline(apic); } } @@ -1492,8 +1267,6 @@ int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val) { int ret = 0; - trace_kvm_apic_write(reg, val); - switch (reg) { case APIC_ID: /* Local APIC ID */ if (!apic_x2apic_mode(apic)) @@ -1535,7 +1308,7 @@ int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val) int i; u32 lvt_val; - for (i = 0; i < KVM_APIC_LVT_NUM; i++) { + for (i = 0; i < GVM_APIC_LVT_NUM; i++) { lvt_val = kvm_lapic_get_reg(apic, APIC_LVTT + 0x10 * i); kvm_lapic_set_reg(apic, APIC_LVTT + 0x10 * i, @@ -1583,9 +1356,6 @@ int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val) break; case APIC_TMICT: - if (apic_lvtt_tscdeadline(apic)) - break; - hrtimer_cancel(&apic->lapic_timer.timer); kvm_lapic_set_reg(apic, APIC_TMICT, val); start_apic_timer(apic); @@ -1593,14 +1363,14 @@ int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val) case APIC_TDCR: if (val & 4) - apic_debug("KVM_WRITE:TDCR %x\n", val); + apic_debug("GVM_WRITE:TDCR %x\n", val); kvm_lapic_set_reg(apic, APIC_TDCR, val); update_divide_count(apic); break; case APIC_ESR: if (apic_x2apic_mode(apic) && val != 0) { - apic_debug("KVM_WRITE:ESR not zero %x\n", val); + apic_debug("GVM_WRITE:ESR not zero %x\n", val); ret = 1; } break; @@ -1619,7 +1389,6 @@ int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val) apic_debug("Local APIC Write to read-only register %x\n", reg); return ret; } -EXPORT_SYMBOL_GPL(kvm_lapic_reg_write); static int apic_mmio_write(struct kvm_vcpu *vcpu, struct kvm_io_device *this, gpa_t address, int len, const void *data) @@ -1658,7 +1427,6 @@ void kvm_lapic_set_eoi(struct kvm_vcpu *vcpu) { kvm_lapic_reg_write(vcpu->arch.apic, APIC_EOI, 0); } -EXPORT_SYMBOL_GPL(kvm_lapic_set_eoi); /* emulate APIC access in a trap manner */ void kvm_apic_write_nodecode(struct kvm_vcpu *vcpu, u32 offset) @@ -1673,7 +1441,6 @@ void kvm_apic_write_nodecode(struct kvm_vcpu *vcpu, u32 offset) /* TODO: optimize to just emulate side effect w/o one more write */ kvm_lapic_reg_write(vcpu->arch.apic, offset, val); } -EXPORT_SYMBOL_GPL(kvm_apic_write_nodecode); void kvm_free_lapic(struct kvm_vcpu *vcpu) { @@ -1684,14 +1451,8 @@ void kvm_free_lapic(struct kvm_vcpu *vcpu) hrtimer_cancel(&apic->lapic_timer.timer); - if (!(vcpu->arch.apic_base & 
MSR_IA32_APICBASE_ENABLE)) - static_key_slow_dec_deferred(&apic_hw_disabled); - - if (!apic->sw_enabled) - static_key_slow_dec_deferred(&apic_sw_disabled); - if (apic->regs) - free_page((unsigned long)apic->regs); + free_page((size_t)apic->regs); kfree(apic); } @@ -1702,31 +1463,7 @@ void kvm_free_lapic(struct kvm_vcpu *vcpu) *---------------------------------------------------------------------- */ -u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu) -{ - struct kvm_lapic *apic = vcpu->arch.apic; - - if (!lapic_in_kernel(vcpu) || apic_lvtt_oneshot(apic) || - apic_lvtt_period(apic)) - return 0; - - return apic->lapic_timer.tscdeadline; -} - -void kvm_set_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu, u64 data) -{ - struct kvm_lapic *apic = vcpu->arch.apic; - - if (!lapic_in_kernel(vcpu) || apic_lvtt_oneshot(apic) || - apic_lvtt_period(apic)) - return; - - hrtimer_cancel(&apic->lapic_timer.timer); - apic->lapic_timer.tscdeadline = data; - start_apic_timer(apic); -} - -void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8) +void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, size_t cr8) { struct kvm_lapic *apic = vcpu->arch.apic; @@ -1760,9 +1497,7 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value) if ((old_value ^ value) & MSR_IA32_APICBASE_ENABLE) { if (value & MSR_IA32_APICBASE_ENABLE) { kvm_apic_set_xapic_id(apic, vcpu->vcpu_id); - static_key_slow_dec_deferred(&apic_hw_disabled); } else { - static_key_slow_inc(&apic_hw_disabled.key); recalculate_apic_map(vcpu->kvm); } } @@ -1780,7 +1515,7 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value) if ((value & MSR_IA32_APICBASE_ENABLE) && apic->base_address != APIC_DEFAULT_PHYS_BASE) - pr_warn_once("APIC base relocation is unsupported by KVM"); + pr_warn_once("APIC base relocation is unsupported by kvm"); /* with FSB delivery interrupt, we can restart APIC functionality */ apic_debug("apic base msr is 0x%016" PRIx64 ", and base address is " @@ -1809,10 +1544,10 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event) } kvm_apic_set_version(apic->vcpu); - for (i = 0; i < KVM_APIC_LVT_NUM; i++) + for (i = 0; i < GVM_APIC_LVT_NUM; i++) kvm_lapic_set_reg(apic, APIC_LVTT + 0x10 * i, APIC_LVT_MASKED); apic_update_lvtt(apic); - if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_LINT0_REENABLED)) + if (kvm_check_has_quirk(vcpu->kvm, GVM_X86_QUIRK_LINT0_REENABLED)) kvm_lapic_set_reg(apic, APIC_LVT0, SET_APIC_DELIVERY_MODE(0, APIC_MODE_EXTINT)); apic_manage_nmi_watchdog(apic, kvm_lapic_get_reg(apic, APIC_LVT0)); @@ -1840,7 +1575,6 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event) if (kvm_vcpu_is_bsp(vcpu)) kvm_lapic_set_base(vcpu, vcpu->arch.apic_base | MSR_IA32_APICBASE_BSP); - vcpu->arch.pv_eoi.msr_val = 0; apic_update_ppr(apic); vcpu->arch.apic_arb_prio = 0; @@ -1945,7 +1679,6 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu) * thinking that APIC satet has changed. 
*/ vcpu->arch.apic_base = MSR_IA32_APICBASE_ENABLE; - static_key_slow_inc(&apic_sw_disabled.key); /* sw disabled at reset */ kvm_lapic_reset(vcpu, false); kvm_iodevice_init(&apic->dev, &apic_mmio_ops); @@ -1991,8 +1724,6 @@ void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu) if (atomic_read(&apic->lapic_timer.pending) > 0) { kvm_apic_local_deliver(apic, APIC_LVTT); - if (apic_lvtt_tscdeadline(apic)) - apic->lapic_timer.tscdeadline = 0; atomic_set(&apic->lapic_timer.pending, 0); } } @@ -2016,11 +1747,6 @@ int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu) apic_update_ppr(apic); apic_clear_irr(vector, apic); - if (test_bit(vector, vcpu_to_synic(vcpu)->auto_eoi_bitmap)) { - apic_clear_isr(vector, apic); - apic_update_ppr(apic); - } - return vector; } @@ -2086,7 +1812,7 @@ int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s) kvm_x86_ops->hwapic_isr_update(vcpu, apic_find_highest_isr(apic)); } - kvm_make_request(KVM_REQ_EVENT, vcpu); + kvm_make_request(GVM_REQ_EVENT, vcpu); if (ioapic_in_kernel(vcpu->kvm)) kvm_rtc_eoi_tracking_restore_one(vcpu); @@ -2095,63 +1821,11 @@ int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s) return 0; } -void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu) -{ - struct hrtimer *timer; - - if (!lapic_in_kernel(vcpu)) - return; - - timer = &vcpu->arch.apic->lapic_timer.timer; - if (hrtimer_cancel(timer)) - hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED); -} - -/* - * apic_sync_pv_eoi_from_guest - called on vmexit or cancel interrupt - * - * Detect whether guest triggered PV EOI since the - * last entry. If yes, set EOI on guests's behalf. - * Clear PV EOI in guest memory in any case. - */ -static void apic_sync_pv_eoi_from_guest(struct kvm_vcpu *vcpu, - struct kvm_lapic *apic) -{ - bool pending; - int vector; - /* - * PV EOI state is derived from KVM_APIC_PV_EOI_PENDING in host - * and KVM_PV_EOI_ENABLED in guest memory as follows: - * - * KVM_APIC_PV_EOI_PENDING is unset: - * -> host disabled PV EOI. - * KVM_APIC_PV_EOI_PENDING is set, KVM_PV_EOI_ENABLED is set: - * -> host enabled PV EOI, guest did not execute EOI yet. - * KVM_APIC_PV_EOI_PENDING is set, KVM_PV_EOI_ENABLED is unset: - * -> host enabled PV EOI, guest executed EOI. - */ - BUG_ON(!pv_eoi_enabled(vcpu)); - pending = pv_eoi_get_pending(vcpu); - /* - * Clear pending bit in any case: it will be set again on vmentry. - * While this might not be ideal from performance point of view, - * this makes sure pv eoi is only enabled when we know it's safe. - */ - pv_eoi_clr_pending(vcpu); - if (pending) - return; - vector = apic_set_eoi(apic); - trace_kvm_pv_eoi(apic, vector); -} - void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu) { u32 data; - if (test_bit(KVM_APIC_PV_EOI_PENDING, &vcpu->arch.apic_attention)) - apic_sync_pv_eoi_from_guest(vcpu, vcpu->arch.apic); - - if (!test_bit(KVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention)) + if (!test_bit(GVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention)) return; if (kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.apic->vapic_cache, &data, @@ -2161,41 +1835,13 @@ void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu) apic_set_tpr(vcpu->arch.apic, data & 0xff); } -/* - * apic_sync_pv_eoi_to_guest - called before vmentry - * - * Detect whether it's safe to enable PV EOI and - * if yes do so. - */ -static void apic_sync_pv_eoi_to_guest(struct kvm_vcpu *vcpu, - struct kvm_lapic *apic) -{ - if (!pv_eoi_enabled(vcpu) || - /* IRR set or many bits in ISR: could be nested. 
*/ - apic->irr_pending || - /* Cache not set: could be safe but we don't bother. */ - apic->highest_isr_cache == -1 || - /* Need EOI to update ioapic. */ - kvm_ioapic_handles_vector(apic, apic->highest_isr_cache)) { - /* - * PV EOI was disabled by apic_sync_pv_eoi_from_guest - * so we need not do anything here. - */ - return; - } - - pv_eoi_set_pending(apic->vcpu); -} - void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu) { u32 data, tpr; int max_irr, max_isr; struct kvm_lapic *apic = vcpu->arch.apic; - apic_sync_pv_eoi_to_guest(vcpu, apic); - - if (!test_bit(KVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention)) + if (!test_bit(GVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention)) return; tpr = kvm_lapic_get_reg(apic, APIC_TASKPRI) & 0xff; @@ -2218,9 +1864,9 @@ int kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr) &vcpu->arch.apic->vapic_cache, vapic_addr, sizeof(u32))) return -EINVAL; - __set_bit(KVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention); + __set_bit(GVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention); } else { - __clear_bit(KVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention); + __clear_bit(GVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention); } vcpu->arch.apic->vapic_addr = vapic_addr; @@ -2253,7 +1899,7 @@ int kvm_x2apic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data) return 1; if (reg == APIC_DFR || reg == APIC_ICR2) { - apic_debug("KVM_APIC_READ: read x2apic reserved register %x\n", + apic_debug("GVM_APIC_READ: read x2apic reserved register %x\n", reg); return 1; } @@ -2268,95 +1914,48 @@ int kvm_x2apic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data) return 0; } -int kvm_hv_vapic_msr_write(struct kvm_vcpu *vcpu, u32 reg, u64 data) -{ - struct kvm_lapic *apic = vcpu->arch.apic; - - if (!lapic_in_kernel(vcpu)) - return 1; - - /* if this is ICR write vector before command */ - if (reg == APIC_ICR) - kvm_lapic_reg_write(apic, APIC_ICR2, (u32)(data >> 32)); - return kvm_lapic_reg_write(apic, reg, (u32)data); -} - -int kvm_hv_vapic_msr_read(struct kvm_vcpu *vcpu, u32 reg, u64 *data) -{ - struct kvm_lapic *apic = vcpu->arch.apic; - u32 low, high = 0; - - if (!lapic_in_kernel(vcpu)) - return 1; - - if (kvm_lapic_reg_read(apic, reg, 4, &low)) - return 1; - if (reg == APIC_ICR) - kvm_lapic_reg_read(apic, APIC_ICR2, 4, &high); - - *data = (((u64)high) << 32) | low; - - return 0; -} - -int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data) -{ - u64 addr = data & ~KVM_MSR_ENABLED; - if (!IS_ALIGNED(addr, 4)) - return 1; - - vcpu->arch.pv_eoi.msr_val = data; - if (!pv_eoi_enabled(vcpu)) - return 0; - return kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.pv_eoi.data, - addr, sizeof(u8)); -} - void kvm_apic_accept_events(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; u8 sipi_vector; - unsigned long pe; + size_t pe; if (!lapic_in_kernel(vcpu) || !apic->pending_events) return; /* * INITs are latched while in SMM. Because an SMM CPU cannot - * be in KVM_MP_STATE_INIT_RECEIVED state, just eat SIPIs + * be in GVM_MP_STATE_INIT_RECEIVED state, just eat SIPIs * and delay processing of INIT until the next RSM. 
*/ if (is_smm(vcpu)) { - WARN_ON_ONCE(vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED); - if (test_bit(KVM_APIC_SIPI, &apic->pending_events)) - clear_bit(KVM_APIC_SIPI, &apic->pending_events); + WARN_ON_ONCE(vcpu->arch.mp_state == GVM_MP_STATE_INIT_RECEIVED); + if (test_bit(GVM_APIC_SIPI, &apic->pending_events)) + clear_bit(GVM_APIC_SIPI, &apic->pending_events); return; } pe = xchg(&apic->pending_events, 0); - if (test_bit(KVM_APIC_INIT, &pe)) { + if (test_bit(GVM_APIC_INIT, &pe)) { kvm_lapic_reset(vcpu, true); kvm_vcpu_reset(vcpu, true); if (kvm_vcpu_is_bsp(apic->vcpu)) - vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; + vcpu->arch.mp_state = GVM_MP_STATE_RUNNABLE; else - vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED; + vcpu->arch.mp_state = GVM_MP_STATE_INIT_RECEIVED; } - if (test_bit(KVM_APIC_SIPI, &pe) && - vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) { + if (test_bit(GVM_APIC_SIPI, &pe) && + vcpu->arch.mp_state == GVM_MP_STATE_INIT_RECEIVED) { /* evaluate pending_events before reading the vector */ smp_rmb(); sipi_vector = apic->sipi_vector; apic_debug("vcpu %d received sipi with vector # %x\n", vcpu->vcpu_id, sipi_vector); kvm_vcpu_deliver_sipi_vector(vcpu, sipi_vector); - vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; + vcpu->arch.mp_state = GVM_MP_STATE_RUNNABLE; } } void kvm_lapic_init(void) { - /* do not patch jump label more than once per second */ - jump_label_rate_limit(&apic_hw_disabled, HZ); - jump_label_rate_limit(&apic_sw_disabled, HZ); } diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index f60d01c..ffbed39 100644..100755 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -1,3 +1,7 @@ +/* + * Copyright 2019 Google LLC + */ + #ifndef __KVM_X86_LAPIC_H #define __KVM_X86_LAPIC_H @@ -5,26 +9,31 @@ #include <linux/kvm_host.h> -#define KVM_APIC_INIT 0 -#define KVM_APIC_SIPI 1 -#define KVM_APIC_LVT_NUM 6 +#include <ntkrutils.h> +#include <asm/apicdef.h> +#include <asm/msr-index.h> +#include <gvm_types.h> +#include <ntkrutils.h> + +#define GVM_APIC_INIT 0 +#define GVM_APIC_SIPI 1 +#define GVM_APIC_LVT_NUM 6 + +#define GVM_APIC_SHORT_MASK 0xc0000 +#define GVM_APIC_DEST_MASK 0x800 -#define KVM_APIC_SHORT_MASK 0xc0000 -#define KVM_APIC_DEST_MASK 0x800 +#define u32 unsigned int struct kvm_timer { struct hrtimer timer; s64 period; /* unit: ns */ u32 timer_mode; u32 timer_mode_mask; - u64 tscdeadline; - u64 expired_tscdeadline; atomic_t pending; /* accumulated triggered timers */ - bool hv_timer_in_use; }; struct kvm_lapic { - unsigned long base_address; + size_t base_address; struct kvm_io_device dev; struct kvm_timer lapic_timer; u32 divide_count; @@ -41,10 +50,10 @@ struct kvm_lapic { * the guest 1:1, because it is accessed by the vmx microcode. * Note: Only one register, the TPR, is used by the microcode. 
*/ - void *regs; + u8 *regs; gpa_t vapic_addr; struct gfn_to_hva_cache vapic_cache; - unsigned long pending_events; + size_t pending_events; unsigned int sipi_vector; }; @@ -59,7 +68,7 @@ int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu); void kvm_apic_accept_events(struct kvm_vcpu *vcpu); void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event); u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu); -void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8); +void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, size_t cr8); void kvm_lapic_set_eoi(struct kvm_vcpu *vcpu); void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value); u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu); @@ -85,9 +94,6 @@ int kvm_apic_get_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s); int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s); int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu); -u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu); -void kvm_set_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu, u64 data); - void kvm_apic_write_nodecode(struct kvm_vcpu *vcpu, u32 offset); void kvm_apic_set_eoi_accelerated(struct kvm_vcpu *vcpu, int vector); @@ -98,15 +104,6 @@ void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu); int kvm_x2apic_msr_write(struct kvm_vcpu *vcpu, u32 msr, u64 data); int kvm_x2apic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data); -int kvm_hv_vapic_msr_write(struct kvm_vcpu *vcpu, u32 msr, u64 data); -int kvm_hv_vapic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data); - -static inline bool kvm_hv_vapic_assist_page_enabled(struct kvm_vcpu *vcpu) -{ - return vcpu->arch.hyperv.hv_vapic & HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE; -} - -int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data); void kvm_lapic_init(void); #define VEC_POS(v) ((v) & (32 - 1)) @@ -114,12 +111,12 @@ void kvm_lapic_init(void); static inline void kvm_lapic_set_vector(int vec, void *bitmap) { - set_bit(VEC_POS(vec), (bitmap) + REG_POS(vec)); + set_bit(VEC_POS(vec), (size_t *)((u8 *)(bitmap) + REG_POS(vec))); } static inline void kvm_lapic_set_irr(int vec, struct kvm_lapic *apic) { - kvm_lapic_set_vector(vec, apic->regs + APIC_IRR); + kvm_lapic_set_vector(vec, (unsigned char *)apic->regs + APIC_IRR); /* * irr_pending must be true if any interrupt is pending; set it after * APIC_IRR to avoid race with apic_clear_irr @@ -129,39 +126,27 @@ static inline void kvm_lapic_set_irr(int vec, struct kvm_lapic *apic) static inline u32 kvm_lapic_get_reg(struct kvm_lapic *apic, int reg_off) { - return *((u32 *) (apic->regs + reg_off)); + return *((u32 *) ((unsigned char *)apic->regs + reg_off)); } static inline void kvm_lapic_set_reg(struct kvm_lapic *apic, int reg_off, u32 val) { - *((u32 *) (apic->regs + reg_off)) = val; + *((u32 *) ((unsigned char *)apic->regs + reg_off)) = val; } -extern struct static_key kvm_no_apic_vcpu; - static inline bool lapic_in_kernel(struct kvm_vcpu *vcpu) { - if (static_key_false(&kvm_no_apic_vcpu)) - return vcpu->arch.apic; - return true; + return vcpu->arch.apic; } -extern struct static_key_deferred apic_hw_disabled; - static inline int kvm_apic_hw_enabled(struct kvm_lapic *apic) { - if (static_key_false(&apic_hw_disabled.key)) - return apic->vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE; - return MSR_IA32_APICBASE_ENABLE; + return apic->vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE; } -extern struct static_key_deferred apic_sw_disabled; - static inline bool kvm_apic_sw_enabled(struct kvm_lapic *apic) { - if (static_key_false(&apic_sw_disabled.key)) - return apic->sw_enabled; 
- return true; + return apic->sw_enabled; } static inline bool kvm_apic_present(struct kvm_vcpu *vcpu) @@ -197,7 +182,7 @@ static inline bool kvm_lowest_prio_delivery(struct kvm_lapic_irq *irq) static inline int kvm_lapic_latched_init(struct kvm_vcpu *vcpu) { - return lapic_in_kernel(vcpu) && test_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events); + return lapic_in_kernel(vcpu) && test_bit(GVM_APIC_INIT, &vcpu->arch.apic->pending_events); } static inline u32 kvm_apic_id(struct kvm_lapic *apic) @@ -213,14 +198,8 @@ static inline u32 kvm_apic_id(struct kvm_lapic *apic) bool kvm_apic_pending_eoi(struct kvm_vcpu *vcpu, int vector); -void wait_lapic_expire(struct kvm_vcpu *vcpu); - bool kvm_intr_is_single_vcpu_fast(struct kvm *kvm, struct kvm_lapic_irq *irq, struct kvm_vcpu **dest_vcpu); int kvm_vector_to_index(u32 vector, u32 dest_vcpus, - const unsigned long *bitmap, u32 bitmap_size); -void kvm_lapic_switch_to_sw_timer(struct kvm_vcpu *vcpu); -void kvm_lapic_switch_to_hv_timer(struct kvm_vcpu *vcpu); -void kvm_lapic_expired_hv_timer(struct kvm_vcpu *vcpu); -bool kvm_lapic_hv_timer_in_use(struct kvm_vcpu *vcpu); + const size_t *bitmap, u32 bitmap_size); #endif diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index d9c7e98..e183d24 100644..100755 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -8,6 +8,7 @@ * * Copyright (C) 2006 Qumranet, Inc. * Copyright 2010 Red Hat, Inc. and/or its affiliates. + * Copyright 2019 Google LLC * * Authors: * Yaniv Kamay <yaniv@qumranet.com> @@ -23,27 +24,12 @@ #include "x86.h" #include "kvm_cache_regs.h" #include "cpuid.h" +#include <linux/list.h> #include <linux/kvm_host.h> -#include <linux/types.h> -#include <linux/string.h> -#include <linux/mm.h> -#include <linux/highmem.h> -#include <linux/moduleparam.h> -#include <linux/export.h> -#include <linux/swap.h> -#include <linux/hugetlb.h> -#include <linux/compiler.h> -#include <linux/srcu.h> -#include <linux/slab.h> -#include <linux/uaccess.h> - -#include <asm/page.h> -#include <asm/cmpxchg.h> -#include <asm/io.h> -#include <asm/vmx.h> #include <asm/kvm_page_track.h> +#pragma warning(disable : 4221) /* * When setting this variable to true it enables Two-Dimensional-Paging * where the hardware walks 2 page tables: @@ -51,7 +37,7 @@ * 2. while doing 1. it walks guest-physical to host-physical * If the hardware supports that we don't need to do shadow paging. */ -bool tdp_enabled = false; +bool tdp_enabled = true; enum { AUDIT_PRE_PAGE_FAULT, @@ -72,8 +58,8 @@ module_param(dbg, bool, 0644); #define rmap_printk(x...) do { if (dbg) printk(x); } while (0) #define MMU_WARN_ON(x) WARN_ON(x) #else -#define pgprintk(x...) do { } while (0) -#define rmap_printk(x...) do { } while (0) +#define pgprintk(x,...) do { } while (0) +#define rmap_printk(x,...) 
do { } while (0) #define MMU_WARN_ON(x) do { } while (0) #endif @@ -129,11 +115,6 @@ module_param(dbg, bool, 0644); #define ACC_USER_MASK PT_USER_MASK #define ACC_ALL (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK) -#include <trace/events/kvm.h> - -#define CREATE_TRACE_POINTS -#include "mmutrace.h" - #define SPTE_HOST_WRITEABLE (1ULL << PT_FIRST_AVAIL_BITS_SHIFT) #define SPTE_MMU_WRITEABLE (1ULL << (PT_FIRST_AVAIL_BITS_SHIFT + 1)) @@ -162,13 +143,13 @@ struct kvm_shadow_walk_iterator { #define for_each_shadow_entry_lockless(_vcpu, _addr, _walker, spte) \ for (shadow_walk_init(&(_walker), _vcpu, _addr); \ - shadow_walk_okay(&(_walker)) && \ - ({ spte = mmu_spte_get_lockless(_walker.sptep); 1; }); \ + shadow_walk_okay(&(_walker)); \ __shadow_walk_next(&(_walker), spte)) -static struct kmem_cache *pte_list_desc_cache; -static struct kmem_cache *mmu_page_header_cache; -static struct percpu_counter kvm_total_used_mmu_pages; +// todo-001 +//static struct kmem_cache *pte_list_desc_cache; +//static struct kmem_cache *mmu_page_header_cache; +//static struct percpu_counter kvm_total_used_mmu_pages; static u64 __read_mostly shadow_nx_mask; static u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */ @@ -178,6 +159,60 @@ static u64 __read_mostly shadow_dirty_mask; static u64 __read_mostly shadow_mmio_mask; static u64 __read_mostly shadow_present_mask; +#ifdef CONFIG_X86_64 +typedef u64 phys_addr_t; +#define __PHYSICAL_MASK_SHIFT 46 +#endif +/* PAGE_SHIFT determines the page size */ +#ifndef PAGE_SIZE +#define PAGE_SHIFT 12 +#define PAGE_SIZE (_AC(1,UL) << PAGE_SHIFT) +#define PAGE_MASK (~(PAGE_SIZE-1)) +#endif + +#define PMD_PAGE_SIZE (_AC(1, UL) << PMD_SHIFT) +#define PMD_PAGE_MASK (~(PMD_PAGE_SIZE-1)) + +#define PUD_PAGE_SIZE (_AC(1, UL) << PUD_SHIFT) +#define PUD_PAGE_MASK (~(PUD_PAGE_SIZE-1)) + +#define __PHYSICAL_MASK ((phys_addr_t)((1ULL << __PHYSICAL_MASK_SHIFT) - 1)) +#define __VIRTUAL_MASK ((1ULL << __VIRTUAL_MASK_SHIFT) - 1) + +/* Cast *PAGE_MASK to a signed type so that it is sign-extended if +virtual addresses are 32-bits but physical addresses are larger +(ie, 32-bit PAE). */ +#define PHYSICAL_PAGE_MASK (((ssize_t)PAGE_MASK) & __PHYSICAL_MASK) +#define PHYSICAL_PMD_PAGE_MASK (((ssize_t)PMD_PAGE_MASK) & __PHYSICAL_MASK) +#define PHYSICAL_PUD_PAGE_MASK (((ssize_t)PUD_PAGE_MASK) & __PHYSICAL_MASK) + +/* Extracts the PFN from a (pte|pmd|pud|pgd)val_t of a 4KB page */ +#define PTE_PFN_MASK ((pteval_t)PHYSICAL_PAGE_MASK) + +/* +* Extracts the flags from a (pte|pmd|pud|pgd)val_t +* This includes the protection key value. +*/ +#define PTE_FLAGS_MASK (~PTE_PFN_MASK) + +#define pte_val(pte) (pte.pte) + +static pteval_t pte_flags(pte_t pte) +{ + return pte_val(pte) & PTE_FLAGS_MASK; +} + +static size_t pte_pfn(pte_t pte) +{ + return (pte_val(pte)& PTE_PFN_MASK) >> PAGE_SHIFT; +} + +static int pte_write(pte_t pte) +{ + return pte_flags(pte) & _PAGE_RW; +} + + static void mmu_spte_set(u64 *sptep, u64 spte); static void mmu_free_roots(struct kvm_vcpu *vcpu); @@ -185,7 +220,6 @@ void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask) { shadow_mmio_mask = mmio_mask; } -EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask); /* * the low bit of the generation number is always presumed to be zero. 
@@ -240,7 +274,6 @@ static void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 gfn, access &= ACC_WRITE_MASK | ACC_USER_MASK; mask |= shadow_mmio_mask | access | gfn << PAGE_SHIFT; - trace_mark_mmio_spte(sptep, gfn, access, gen); mmu_spte_set(sptep, mask); } @@ -279,7 +312,6 @@ static bool check_mmio_spte(struct kvm_vcpu *vcpu, u64 spte) kvm_gen = kvm_current_mmio_generation(vcpu); spte_gen = get_mmio_spte_generation(spte); - trace_check_mmio_spte(spte, kvm_gen, spte_gen); return likely(kvm_gen == spte_gen); } @@ -293,7 +325,6 @@ void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, shadow_x_mask = x_mask; shadow_present_mask = p_mask; } -EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes); static int is_cpuid_PSE36(void) { @@ -354,7 +385,9 @@ static u64 __update_clear_spte_slow(u64 *sptep, u64 spte) static u64 __get_spte_lockless(u64 *sptep) { - return ACCESS_ONCE(*sptep); + u64 temp; + ACCESS_ONCE(*sptep, temp); + return temp; } #else union split_spte { @@ -561,12 +594,6 @@ static bool mmu_spte_update(u64 *sptep, u64 new_spte) ret = true; if (!shadow_accessed_mask) { - /* - * We don't set page dirty when dropping non-writable spte. - * So do it now if the new spte is becoming non-writable. - */ - if (ret) - kvm_set_pfn_dirty(spte_to_pfn(old_spte)); return ret; } @@ -578,11 +605,6 @@ static bool mmu_spte_update(u64 *sptep, u64 new_spte) shadow_accessed_mask | shadow_dirty_mask)) ret = true; - if (spte_is_bit_cleared(old_spte, new_spte, shadow_accessed_mask)) - kvm_set_pfn_accessed(spte_to_pfn(old_spte)); - if (spte_is_bit_cleared(old_spte, new_spte, shadow_dirty_mask)) - kvm_set_pfn_dirty(spte_to_pfn(old_spte)); - return ret; } @@ -607,17 +629,12 @@ static int mmu_spte_clear_track_bits(u64 *sptep) pfn = spte_to_pfn(old_spte); /* - * KVM does not hold the refcount of the page used by + * kvm does not hold the refcount of the page used by * kvm mmu, before reclaiming the page, we should * unmap it from mmu first. */ WARN_ON(!kvm_is_reserved_pfn(pfn) && !page_count(pfn_to_page(pfn))); - if (!shadow_accessed_mask || old_spte & shadow_accessed_mask) - kvm_set_pfn_accessed(pfn); - if (old_spte & (shadow_dirty_mask ? 
shadow_dirty_mask : - PT_WRITABLE_MASK)) - kvm_set_pfn_dirty(pfn); return 1; } @@ -663,14 +680,14 @@ static void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu) } static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache, - struct kmem_cache *base_cache, int min) + size_t cache_size, int min) { void *obj; if (cache->nobjs >= min) return 0; while (cache->nobjs < ARRAY_SIZE(cache->objects)) { - obj = kmem_cache_zalloc(base_cache, GFP_KERNEL); + obj = kzalloc_fast(cache_size, GFP_KERNEL); if (!obj) return -ENOMEM; cache->objects[cache->nobjs++] = obj; @@ -683,11 +700,10 @@ static int mmu_memory_cache_free_objects(struct kvm_mmu_memory_cache *cache) return cache->nobjs; } -static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc, - struct kmem_cache *cache) +static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc) { while (mc->nobjs) - kmem_cache_free(cache, mc->objects[--mc->nobjs]); + kfree_fast(mc->objects[--mc->nobjs]); } static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache, @@ -709,7 +725,7 @@ static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache, static void mmu_free_memory_cache_page(struct kvm_mmu_memory_cache *mc) { while (mc->nobjs) - free_page((unsigned long)mc->objects[--mc->nobjs]); + free_page((size_t)mc->objects[--mc->nobjs]); } static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu) @@ -717,25 +733,23 @@ static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu) int r; r = mmu_topup_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache, - pte_list_desc_cache, 8 + PTE_PREFETCH_NUM); + sizeof(struct pte_list_desc), 8 + PTE_PREFETCH_NUM); if (r) goto out; r = mmu_topup_memory_cache_page(&vcpu->arch.mmu_page_cache, 8); if (r) goto out; r = mmu_topup_memory_cache(&vcpu->arch.mmu_page_header_cache, - mmu_page_header_cache, 4); + sizeof(struct kvm_mmu_page), 4); out: return r; } static void mmu_free_memory_caches(struct kvm_vcpu *vcpu) { - mmu_free_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache, - pte_list_desc_cache); + mmu_free_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache); mmu_free_memory_cache_page(&vcpu->arch.mmu_page_cache); - mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache, - mmu_page_header_cache); + mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache); } static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc) @@ -754,7 +768,7 @@ static struct pte_list_desc *mmu_alloc_pte_list_desc(struct kvm_vcpu *vcpu) static void mmu_free_pte_list_desc(struct pte_list_desc *pte_list_desc) { - kmem_cache_free(pte_list_desc_cache, pte_list_desc); + kfree_fast(pte_list_desc); } static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index) @@ -773,43 +787,6 @@ static void kvm_mmu_page_set_gfn(struct kvm_mmu_page *sp, int index, gfn_t gfn) sp->gfns[index] = gfn; } -/* - * Return the pointer to the large page information for a given gfn, - * handling slots that are not large page aligned. 
- */ -static struct kvm_lpage_info *lpage_info_slot(gfn_t gfn, - struct kvm_memory_slot *slot, - int level) -{ - unsigned long idx; - - idx = gfn_to_index(gfn, slot->base_gfn, level); - return &slot->arch.lpage_info[level - 2][idx]; -} - -static void update_gfn_disallow_lpage_count(struct kvm_memory_slot *slot, - gfn_t gfn, int count) -{ - struct kvm_lpage_info *linfo; - int i; - - for (i = PT_DIRECTORY_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) { - linfo = lpage_info_slot(gfn, slot, i); - linfo->disallow_lpage += count; - WARN_ON(linfo->disallow_lpage < 0); - } -} - -void kvm_mmu_gfn_disallow_lpage(struct kvm_memory_slot *slot, gfn_t gfn) -{ - update_gfn_disallow_lpage_count(slot, gfn, 1); -} - -void kvm_mmu_gfn_allow_lpage(struct kvm_memory_slot *slot, gfn_t gfn) -{ - update_gfn_disallow_lpage_count(slot, gfn, -1); -} - static void account_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp) { struct kvm_memslots *slots; @@ -823,10 +800,8 @@ static void account_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp) /* the non-leaf shadow pages are keeping readonly. */ if (sp->role.level > PT_PAGE_TABLE_LEVEL) - return kvm_slot_page_track_add_page(kvm, slot, gfn, + kvm_slot_page_track_add_page(kvm, slot, gfn, KVM_PAGE_TRACK_WRITE); - - kvm_mmu_gfn_disallow_lpage(slot, gfn); } static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp) @@ -840,55 +815,20 @@ static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp) slots = kvm_memslots_for_spte_role(kvm, sp->role); slot = __gfn_to_memslot(slots, gfn); if (sp->role.level > PT_PAGE_TABLE_LEVEL) - return kvm_slot_page_track_remove_page(kvm, slot, gfn, + kvm_slot_page_track_remove_page(kvm, slot, gfn, KVM_PAGE_TRACK_WRITE); - - kvm_mmu_gfn_allow_lpage(slot, gfn); -} - -static bool __mmu_gfn_lpage_is_disallowed(gfn_t gfn, int level, - struct kvm_memory_slot *slot) -{ - struct kvm_lpage_info *linfo; - - if (slot) { - linfo = lpage_info_slot(gfn, slot, level); - return !!linfo->disallow_lpage; - } - - return true; } static bool mmu_gfn_lpage_is_disallowed(struct kvm_vcpu *vcpu, gfn_t gfn, int level) { - struct kvm_memory_slot *slot; - - slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); - return __mmu_gfn_lpage_is_disallowed(gfn, level, slot); -} - -static int host_mapping_level(struct kvm *kvm, gfn_t gfn) -{ - unsigned long page_size; - int i, ret = 0; - - page_size = kvm_host_page_size(kvm, gfn); - - for (i = PT_PAGE_TABLE_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) { - if (page_size >= KVM_HPAGE_SIZE(i)) - ret = i; - else - break; - } - - return ret; + return true; } static inline bool memslot_valid_for_gpte(struct kvm_memory_slot *slot, bool no_dirty_log) { - if (!slot || slot->flags & KVM_MEMSLOT_INVALID) + if (!slot || slot->flags & GVM_MEMSLOT_INVALID) return false; if (no_dirty_log && slot->dirty_bitmap) return false; @@ -912,29 +852,7 @@ gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t gfn, static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn, bool *force_pt_level) { - int host_level, level, max_level; - struct kvm_memory_slot *slot; - - if (unlikely(*force_pt_level)) - return PT_PAGE_TABLE_LEVEL; - - slot = kvm_vcpu_gfn_to_memslot(vcpu, large_gfn); - *force_pt_level = !memslot_valid_for_gpte(slot, true); - if (unlikely(*force_pt_level)) - return PT_PAGE_TABLE_LEVEL; - - host_level = host_mapping_level(vcpu->kvm, large_gfn); - - if (host_level == PT_PAGE_TABLE_LEVEL) - return host_level; - - max_level = min(kvm_x86_ops->get_lpage_level(), host_level); - - for (level = PT_DIRECTORY_LEVEL; level <= max_level; 
++level) - if (__mmu_gfn_lpage_is_disallowed(large_gfn, level, slot)) - break; - - return level - 1; + return PT_PAGE_TABLE_LEVEL; } /* @@ -956,17 +874,17 @@ static int pte_list_add(struct kvm_vcpu *vcpu, u64 *spte, if (!rmap_head->val) { rmap_printk("pte_list_add: %p %llx 0->1\n", spte, *spte); - rmap_head->val = (unsigned long)spte; + rmap_head->val = (size_t)spte; } else if (!(rmap_head->val & 1)) { rmap_printk("pte_list_add: %p %llx 1->many\n", spte, *spte); desc = mmu_alloc_pte_list_desc(vcpu); desc->sptes[0] = (u64 *)rmap_head->val; desc->sptes[1] = spte; - rmap_head->val = (unsigned long)desc | 1; + rmap_head->val = (size_t)desc | 1; ++count; } else { rmap_printk("pte_list_add: %p %llx many->many\n", spte, *spte); - desc = (struct pte_list_desc *)(rmap_head->val & ~1ul); + desc = (struct pte_list_desc *)(rmap_head->val & ~1ull); while (desc->sptes[PTE_LIST_EXT-1] && desc->more) { desc = desc->more; count += PTE_LIST_EXT; @@ -996,12 +914,12 @@ pte_list_desc_remove_entry(struct kvm_rmap_head *rmap_head, if (j != 0) return; if (!prev_desc && !desc->more) - rmap_head->val = (unsigned long)desc->sptes[0]; + rmap_head->val = (size_t)desc->sptes[0]; else if (prev_desc) prev_desc->more = desc->more; else - rmap_head->val = (unsigned long)desc->more | 1; + rmap_head->val = (size_t)desc->more | 1; mmu_free_pte_list_desc(desc); } @@ -1023,7 +941,7 @@ static void pte_list_remove(u64 *spte, struct kvm_rmap_head *rmap_head) rmap_head->val = 0; } else { rmap_printk("pte_list_remove: %p many->many\n", spte); - desc = (struct pte_list_desc *)(rmap_head->val & ~1ul); + desc = (struct pte_list_desc *)(rmap_head->val & ~1ull); prev_desc = NULL; while (desc) { for (i = 0; i < PTE_LIST_EXT && desc->sptes[i]; ++i) { @@ -1041,13 +959,13 @@ static void pte_list_remove(u64 *spte, struct kvm_rmap_head *rmap_head) } } -static struct kvm_rmap_head *__gfn_to_rmap(gfn_t gfn, int level, +static struct kvm_rmap_head *__gfn_to_rmap(gfn_t gfn, struct kvm_memory_slot *slot) { - unsigned long idx; + size_t idx; - idx = gfn_to_index(gfn, slot->base_gfn, level); - return &slot->arch.rmap[level - PT_PAGE_TABLE_LEVEL][idx]; + idx = gfn - slot->base_gfn; + return &slot->arch.rmap[idx]; } static struct kvm_rmap_head *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, @@ -1058,7 +976,7 @@ static struct kvm_rmap_head *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, slots = kvm_memslots_for_spte_role(kvm, sp->role); slot = __gfn_to_memslot(slots, gfn); - return __gfn_to_rmap(gfn, sp->role.level, slot); + return __gfn_to_rmap(gfn, slot); } static bool rmap_can_add(struct kvm_vcpu *vcpu) @@ -1123,7 +1041,7 @@ static u64 *rmap_get_first(struct kvm_rmap_head *rmap_head, goto out; } - iter->desc = (struct pte_list_desc *)(rmap_head->val & ~1ul); + iter->desc = (struct pte_list_desc *)(rmap_head->val & ~1ull); iter->pos = 0; sptep = iter->desc->sptes[iter->pos]; out: @@ -1296,13 +1214,13 @@ static bool __rmap_set_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head) */ static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, struct kvm_memory_slot *slot, - gfn_t gfn_offset, unsigned long mask) + gfn_t gfn_offset, size_t mask) { struct kvm_rmap_head *rmap_head; while (mask) { rmap_head = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask), - PT_PAGE_TABLE_LEVEL, slot); + slot); __rmap_write_protect(kvm, rmap_head, false); /* clear the first set bit */ @@ -1321,20 +1239,19 @@ static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, */ void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm, struct kvm_memory_slot *slot, - gfn_t gfn_offset, 
unsigned long mask) + gfn_t gfn_offset, size_t mask) { struct kvm_rmap_head *rmap_head; while (mask) { rmap_head = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask), - PT_PAGE_TABLE_LEVEL, slot); + slot); __rmap_clear_dirty(kvm, rmap_head); /* clear the first set bit */ mask &= mask - 1; } } -EXPORT_SYMBOL_GPL(kvm_mmu_clear_dirty_pt_masked); /** * kvm_arch_mmu_enable_log_dirty_pt_masked - enable dirty logging for selected @@ -1348,7 +1265,7 @@ EXPORT_SYMBOL_GPL(kvm_mmu_clear_dirty_pt_masked); */ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm, struct kvm_memory_slot *slot, - gfn_t gfn_offset, unsigned long mask) + gfn_t gfn_offset, size_t mask) { if (kvm_x86_ops->enable_log_dirty_pt_masked) kvm_x86_ops->enable_log_dirty_pt_masked(kvm, slot, gfn_offset, @@ -1361,13 +1278,10 @@ bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm, struct kvm_memory_slot *slot, u64 gfn) { struct kvm_rmap_head *rmap_head; - int i; bool write_protected = false; - for (i = PT_PAGE_TABLE_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) { - rmap_head = __gfn_to_rmap(gfn, i, slot); - write_protected |= __rmap_write_protect(kvm, rmap_head, true); - } + rmap_head = __gfn_to_rmap(gfn, slot); + write_protected |= __rmap_write_protect(kvm, rmap_head, true); return write_protected; } @@ -1386,11 +1300,13 @@ static bool kvm_zap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head) struct rmap_iterator iter; bool flush = false; - while ((sptep = rmap_get_first(rmap_head, &iter))) { + sptep = rmap_get_first(rmap_head, &iter); + while (sptep) { rmap_printk("%s: spte %p %llx.\n", __func__, sptep, *sptep); drop_spte(kvm, sptep); flush = true; + sptep = rmap_get_first(rmap_head, &iter); } return flush; @@ -1398,14 +1314,14 @@ static bool kvm_zap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head) static int kvm_unmap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, struct kvm_memory_slot *slot, gfn_t gfn, int level, - unsigned long data) + size_t data) { return kvm_zap_rmapp(kvm, rmap_head); } static int kvm_set_pte_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, struct kvm_memory_slot *slot, gfn_t gfn, int level, - unsigned long data) + size_t data) { u64 *sptep; struct rmap_iterator iter; @@ -1468,8 +1384,8 @@ rmap_walk_init_level(struct slot_rmap_walk_iterator *iterator, int level) { iterator->level = level; iterator->gfn = iterator->start_gfn; - iterator->rmap = __gfn_to_rmap(iterator->gfn, level, iterator->slot); - iterator->end_rmap = __gfn_to_rmap(iterator->end_gfn, level, + iterator->rmap = __gfn_to_rmap(iterator->gfn, iterator->slot); + iterator->end_rmap = __gfn_to_rmap(iterator->end_gfn, iterator->slot); } @@ -1495,7 +1411,7 @@ static bool slot_rmap_walk_okay(struct slot_rmap_walk_iterator *iterator) static void slot_rmap_walk_next(struct slot_rmap_walk_iterator *iterator) { if (++iterator->rmap <= iterator->end_rmap) { - iterator->gfn += (1UL << KVM_HPAGE_GFN_SHIFT(iterator->level)); + iterator->gfn += 1ULL; return; } @@ -1515,15 +1431,15 @@ static void slot_rmap_walk_next(struct slot_rmap_walk_iterator *iterator) slot_rmap_walk_next(_iter_)) static int kvm_handle_hva_range(struct kvm *kvm, - unsigned long start, - unsigned long end, - unsigned long data, + size_t start, + size_t end, + size_t data, int (*handler)(struct kvm *kvm, struct kvm_rmap_head *rmap_head, struct kvm_memory_slot *slot, gfn_t gfn, int level, - unsigned long data)) + size_t data)) { struct kvm_memslots *slots; struct kvm_memory_slot *memslot; @@ -1531,10 +1447,10 @@ static int kvm_handle_hva_range(struct kvm *kvm, 
int ret = 0; int i; - for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { + for (i = 0; i < GVM_ADDRESS_SPACE_NUM; i++) { slots = __kvm_memslots(kvm, i); kvm_for_each_memslot(memslot, slots) { - unsigned long hva_start, hva_end; + size_t hva_start, hva_end; gfn_t gfn_start, gfn_end; hva_start = max(start, memslot->userspace_addr); @@ -1550,7 +1466,7 @@ static int kvm_handle_hva_range(struct kvm *kvm, gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot); for_each_slot_rmap_range(memslot, PT_PAGE_TABLE_LEVEL, - PT_MAX_HUGEPAGE_LEVEL, + PT_PAGE_TABLE_LEVEL, gfn_start, gfn_end - 1, &iterator) ret |= handler(kvm, iterator.rmap, memslot, @@ -1561,38 +1477,38 @@ static int kvm_handle_hva_range(struct kvm *kvm, return ret; } -static int kvm_handle_hva(struct kvm *kvm, unsigned long hva, - unsigned long data, +static int kvm_handle_hva(struct kvm *kvm, size_t hva, + size_t data, int (*handler)(struct kvm *kvm, struct kvm_rmap_head *rmap_head, struct kvm_memory_slot *slot, gfn_t gfn, int level, - unsigned long data)) + size_t data)) { return kvm_handle_hva_range(kvm, hva, hva + 1, data, handler); } -int kvm_unmap_hva(struct kvm *kvm, unsigned long hva) +int kvm_unmap_hva(struct kvm *kvm, size_t hva) { return kvm_handle_hva(kvm, hva, 0, kvm_unmap_rmapp); } -int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end) +int kvm_unmap_hva_range(struct kvm *kvm, size_t start, size_t end) { return kvm_handle_hva_range(kvm, start, end, 0, kvm_unmap_rmapp); } -void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte) +void kvm_set_spte_hva(struct kvm *kvm, size_t hva, pte_t pte) { - kvm_handle_hva(kvm, hva, (unsigned long)&pte, kvm_set_pte_rmapp); + kvm_handle_hva(kvm, hva, (size_t)&pte, kvm_set_pte_rmapp); } static int kvm_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, struct kvm_memory_slot *slot, gfn_t gfn, int level, - unsigned long data) + size_t data) { u64 *sptep; - struct rmap_iterator uninitialized_var(iter); + struct rmap_iterator iter; int young = 0; BUG_ON(!shadow_accessed_mask); @@ -1601,17 +1517,16 @@ static int kvm_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, if (*sptep & shadow_accessed_mask) { young = 1; clear_bit((ffs(shadow_accessed_mask) - 1), - (unsigned long *)sptep); + (size_t *)sptep); } } - trace_kvm_age_page(gfn, level, slot, young); return young; } static int kvm_test_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, struct kvm_memory_slot *slot, gfn_t gfn, - int level, unsigned long data) + int level, size_t data) { u64 *sptep; struct rmap_iterator iter; @@ -1649,8 +1564,9 @@ static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) kvm_unmap_rmapp(vcpu->kvm, rmap_head, NULL, gfn, sp->role.level, 0); kvm_flush_remote_tlbs(vcpu->kvm); } - -int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end) +//todo-003 +#if 0 +int kvm_age_hva(struct kvm *kvm, size_t start, size_t end) { /* * In case of absence of EPT Access and Dirty Bits supports, @@ -1674,8 +1590,9 @@ int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end) return kvm_handle_hva_range(kvm, start, end, 0, kvm_age_rmapp); } +#endif -int kvm_test_age_hva(struct kvm *kvm, unsigned long hva) +int kvm_test_age_hva(struct kvm *kvm, size_t hva) { return kvm_handle_hva(kvm, hva, 0, kvm_test_age_rmapp); } @@ -1705,7 +1622,7 @@ static int is_empty_shadow_page(u64 *spt) static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, int nr) { kvm->arch.n_used_mmu_pages += nr; - percpu_counter_add(&kvm_total_used_mmu_pages, 
nr); + //percpu_counter_add(&kvm_total_used_mmu_pages, nr); } static void kvm_mmu_free_page(struct kvm_mmu_page *sp) @@ -1713,15 +1630,15 @@ static void kvm_mmu_free_page(struct kvm_mmu_page *sp) MMU_WARN_ON(!is_empty_shadow_page(sp->spt)); hlist_del(&sp->hash_link); list_del(&sp->link); - free_page((unsigned long)sp->spt); + free_page((size_t)sp->spt); if (!sp->role.direct) - free_page((unsigned long)sp->gfns); - kmem_cache_free(mmu_page_header_cache, sp); + free_page((size_t)sp->gfns); + kfree_fast(sp); } static unsigned kvm_page_table_hashfn(gfn_t gfn) { - return gfn & ((1 << KVM_MMU_HASH_SHIFT) - 1); + return gfn & ((1 << GVM_MMU_HASH_SHIFT) - 1); } static void mmu_page_add_parent_pte(struct kvm_vcpu *vcpu, @@ -1754,7 +1671,7 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, int direct sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache); if (!direct) sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache); - set_page_private(virt_to_page(sp->spt), (unsigned long)sp); + set_page_private(virt_to_page(sp->spt), (size_t)sp); /* * The active_mmu_pages list is the FIFO list, do not move the @@ -1808,13 +1725,13 @@ static void nonpaging_update_pte(struct kvm_vcpu *vcpu, WARN_ON(1); } -#define KVM_PAGE_ARRAY_NR 16 +#define GVM_PAGE_ARRAY_NR 16 struct kvm_mmu_pages { struct mmu_page_and_offset { struct kvm_mmu_page *sp; unsigned int idx; - } page[KVM_PAGE_ARRAY_NR]; + } page[GVM_PAGE_ARRAY_NR]; unsigned int nr; }; @@ -1831,7 +1748,7 @@ static int mmu_pages_add(struct kvm_mmu_pages *pvec, struct kvm_mmu_page *sp, pvec->page[pvec->nr].sp = sp; pvec->page[pvec->nr].idx = idx; pvec->nr++; - return (pvec->nr == KVM_PAGE_ARRAY_NR); + return (pvec->nr == GVM_PAGE_ARRAY_NR); } static inline void clear_unsync_child_bit(struct kvm_mmu_page *sp, int idx) @@ -1896,7 +1813,6 @@ static int mmu_unsync_walk(struct kvm_mmu_page *sp, static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp) { WARN_ON(!sp->unsync); - trace_kvm_mmu_sync_page(sp); sp->unsync = 0; --kvm->stat.mmu_unsync; } @@ -1953,10 +1869,10 @@ static void kvm_mmu_flush_or_zap(struct kvm_vcpu *vcpu, if (remote_flush) kvm_flush_remote_tlbs(vcpu->kvm); else if (local_flush) - kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); + kvm_make_request(GVM_REQ_TLB_FLUSH, vcpu); } -#ifdef CONFIG_KVM_MMU_AUDIT +#ifdef CONFIG_GVM_MMU_AUDIT #include "mmu_audit.c" #else static void kvm_mmu_audit(struct kvm_vcpu *vcpu, int point) { } @@ -1982,6 +1898,7 @@ static bool kvm_sync_pages(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_mmu_page *s; bool ret = false; +#define LIST_ENTRY_TYPE_INFO struct kvm_mmu_page for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn) { if (!s->unsync) continue; @@ -1989,6 +1906,7 @@ static bool kvm_sync_pages(struct kvm_vcpu *vcpu, gfn_t gfn, WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL); ret |= kvm_sync_page(vcpu, s, invalid_list); } +#undef LIST_ENTRY_TYPE_INFO return ret; } @@ -1998,9 +1916,16 @@ struct mmu_page_path { unsigned int idx[PT64_ROOT_LEVEL]; }; +static int __for_each_sp_end(struct kvm_mmu_page **sp, struct kvm_mmu_pages *pvec, int nr) +{ + *sp = pvec->page[nr].sp; + + return 1; +} + #define for_each_sp(pvec, sp, parents, i) \ for (i = mmu_pages_first(&pvec, &parents); \ - i < pvec.nr && ({ sp = pvec.page[i].sp; 1;}); \ + i < pvec.nr && __for_each_sp_end(&sp, &pvec, i); \ i = mmu_pages_next(&pvec, &parents, i)) static int mmu_pages_next(struct kvm_mmu_pages *pvec, @@ -2090,9 +2015,10 @@ static void mmu_sync_children(struct kvm_vcpu *vcpu, flush |= kvm_sync_page(vcpu, sp, 
&invalid_list); mmu_pages_clear_parents(&parents); } - if (need_resched() || spin_needbreak(&vcpu->kvm->mmu_lock)) { + //if (need_resched() || spin_needbreak(&vcpu->kvm->mmu_lock)) + { kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush); - cond_resched_lock(&vcpu->kvm->mmu_lock); + //cond_resched_lock(&vcpu->kvm->mmu_lock); flush = false; } } @@ -2138,6 +2064,8 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1; role.quadrant = quadrant; } + +#define LIST_ENTRY_TYPE_INFO struct kvm_mmu_page for_each_gfn_valid_sp(vcpu->kvm, sp, gfn) { if (!need_sync && sp->unsync) need_sync = true; @@ -2153,16 +2081,16 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, break; WARN_ON(!list_empty(&invalid_list)); - kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); + kvm_make_request(GVM_REQ_TLB_FLUSH, vcpu); } if (sp->unsync_children) - kvm_make_request(KVM_REQ_MMU_SYNC, vcpu); + kvm_make_request(GVM_REQ_MMU_SYNC, vcpu); __clear_sp_write_flooding_count(sp); - trace_kvm_mmu_get_page(sp, false); return sp; } +#undef LIST_ENTRY_TYPE_INFO ++vcpu->kvm->stat.mmu_cache_miss; @@ -2188,7 +2116,6 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, } sp->mmu_valid_gen = vcpu->kvm->arch.mmu_valid_gen; clear_page(sp->spt); - trace_kvm_mmu_get_page(sp, true); kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush); return sp; @@ -2240,7 +2167,7 @@ static void __shadow_walk_next(struct kvm_shadow_walk_iterator *iterator, static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator) { - return __shadow_walk_next(iterator, *iterator->sptep); + __shadow_walk_next(iterator, *iterator->sptep); } static void link_shadow_page(struct kvm_vcpu *vcpu, u64 *sptep, @@ -2248,8 +2175,6 @@ static void link_shadow_page(struct kvm_vcpu *vcpu, u64 *sptep, { u64 spte; - BUILD_BUG_ON(VMX_EPT_WRITABLE_MASK != PT_WRITABLE_MASK); - spte = __pa(sp->spt) | shadow_present_mask | PT_WRITABLE_MASK | shadow_user_mask | shadow_x_mask | shadow_accessed_mask; @@ -2322,8 +2247,11 @@ static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp) u64 *sptep; struct rmap_iterator iter; - while ((sptep = rmap_get_first(&sp->parent_ptes, &iter))) + sptep = rmap_get_first(&sp->parent_ptes, &iter); + while (sptep) { drop_parent_pte(sp, sptep); + sptep = rmap_get_first(&sp->parent_ptes, &iter); + } } static int mmu_zap_unsync_children(struct kvm *kvm, @@ -2355,7 +2283,6 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp, { int ret; - trace_kvm_mmu_prepare_zap_page(sp); ++kvm->stat.mmu_shadow_zapped; ret = mmu_zap_unsync_children(kvm, sp, invalid_list); kvm_mmu_page_unlink_children(kvm, sp); @@ -2405,10 +2332,12 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm, */ kvm_flush_remote_tlbs(kvm); +#define LIST_ENTRY_TYPE_INFO struct kvm_mmu_page list_for_each_entry_safe(sp, nsp, invalid_list, link) { WARN_ON(!sp->role.invalid || sp->root_count); kvm_mmu_free_page(sp); } +#undef LIST_ENTRY_TYPE_INFO } static bool prepare_zap_oldest_mmu_page(struct kvm *kvm, @@ -2460,22 +2389,22 @@ int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn) pgprintk("%s: looking for gfn %llx\n", __func__, gfn); r = 0; spin_lock(&kvm->mmu_lock); +#define LIST_ENTRY_TYPE_INFO struct kvm_mmu_page for_each_gfn_indirect_valid_sp(kvm, sp, gfn) { pgprintk("%s: gfn %llx role %x\n", __func__, gfn, sp->role.word); r = 1; kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); } +#undef LIST_ENTRY_TYPE_INFO 
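Illustrative sketch (not part of this commit): the hunks above and below repeatedly wrap the kernel's list-walking macros (for_each_gfn_indirect_valid_sp, list_for_each_entry_safe, hlist_for_each_entry_rcu) in #define/#undef LIST_ENTRY_TYPE_INFO. A plausible reason is that the upstream macros derive the element type with GCC's typeof(), which MSVC does not offer, so the Windows port's compatibility headers presumably need the caller to name the type explicitly. The port's actual headers are not shown in this diff; the names my_list_head, my_container_of and my_list_for_each_entry below are hypothetical and only illustrate how such a macro could be consumed.

    #include <stddef.h>   /* offsetof */

    /* Minimal circular doubly-linked list node, stand-in for struct list_head. */
    struct my_list_head {
            struct my_list_head *next, *prev;
    };

    /* typeof-free container_of: the element type must be spelled out. */
    #define my_container_of(ptr, type, member) \
            ((type *)((char *)(ptr) - offsetof(type, member)))

    /* The caller #defines LIST_ENTRY_TYPE_INFO to the element type before
     * iterating and #undefs it afterwards, exactly as the patch does. */
    #define my_list_for_each_entry(pos, head, member)                                  \
            for ((pos) = my_container_of((head)->next, LIST_ENTRY_TYPE_INFO, member);  \
                 &(pos)->member != (head);                                             \
                 (pos) = my_container_of((pos)->member.next, LIST_ENTRY_TYPE_INFO, member))

    /* Usage mirroring the pattern in the surrounding hunks:
     *   #define LIST_ENTRY_TYPE_INFO struct kvm_mmu_page
     *   my_list_for_each_entry(sp, &kvm->arch.active_mmu_pages, link) { ... }
     *   #undef LIST_ENTRY_TYPE_INFO
     */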
kvm_mmu_commit_zap_page(kvm, &invalid_list); spin_unlock(&kvm->mmu_lock); return r; } -EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page); static void kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) { - trace_kvm_mmu_unsync_page(sp); ++vcpu->kvm->stat.mmu_unsync; sp->unsync = 1; @@ -2487,9 +2416,12 @@ static bool mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn, { struct kvm_mmu_page *sp; - if (kvm_page_track_is_active(vcpu, gfn, KVM_PAGE_TRACK_WRITE)) +#if 0 + if (kvm_page_track_is_active(vcpu, gfn, GVM_PAGE_TRACK_WRITE)) return true; +#endif +#define LIST_ENTRY_TYPE_INFO struct kvm_mmu_page for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn) { if (!can_unsync) return true; @@ -2500,16 +2432,15 @@ static bool mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn, WARN_ON(sp->role.level != PT_PAGE_TABLE_LEVEL); kvm_unsync_page(vcpu, sp); } +#undef LIST_ENTRY_TYPE_INFO return false; } static bool kvm_is_mmio_pfn(kvm_pfn_t pfn) { - if (pfn_valid(pfn)) - return !is_zero_pfn(pfn) && PageReserved(pfn_to_page(pfn)); - - return true; + /* Without IOMMU, we won't assign real MMIO resource */ + return false; } static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, @@ -2635,7 +2566,7 @@ static bool mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access, true, host_writable)) { if (write_fault) emulate = true; - kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); + kvm_make_request(GVM_REQ_TLB_FLUSH, vcpu); } if (unlikely(is_mmio_spte(*sptep))) @@ -2657,8 +2588,6 @@ static bool mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access, } } - kvm_release_pfn_clean(pfn); - return emulate; } @@ -2669,7 +2598,7 @@ static kvm_pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, no_dirty_log); if (!slot) - return KVM_PFN_ERR_FAULT; + return GVM_PFN_ERR_FAULT; return gfn_to_pfn_memslot_atomic(slot, gfn); } @@ -2678,7 +2607,7 @@ static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, u64 *start, u64 *end) { - struct page *pages[PTE_PREFETCH_NUM]; + pfn_t pfn[PTE_PREFETCH_NUM]; struct kvm_memory_slot *slot; unsigned access = sp->role.access; int i, ret; @@ -2689,13 +2618,13 @@ static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu, if (!slot) return -1; - ret = gfn_to_page_many_atomic(slot, gfn, pages, end - start); + ret = gfn_to_pfn_many_atomic(slot, gfn, pfn, end - start); if (ret <= 0) return -1; for (i = 0; i < ret; i++, gfn++, start++) mmu_set_spte(vcpu, start, access, 0, sp->role.level, gfn, - page_to_pfn(pages[i]), true, true); + pfn[i], true, true); return 0; } @@ -2744,7 +2673,7 @@ static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep) } static int __direct_map(struct kvm_vcpu *vcpu, int write, int map_writable, - int level, gfn_t gfn, kvm_pfn_t pfn, bool prefault) + int level, gfn_t gfn, kvm_pfn_t pfn) { struct kvm_shadow_walk_iterator iterator; struct kvm_mmu_page *sp; @@ -2757,7 +2686,7 @@ static int __direct_map(struct kvm_vcpu *vcpu, int write, int map_writable, for_each_shadow_entry(vcpu, (u64)gfn << PAGE_SHIFT, iterator) { if (iterator.level == level) { emulate = mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, - write, level, gfn, pfn, prefault, + write, level, gfn, pfn, false, map_writable); direct_pte_prefetch(vcpu, iterator.sptep); ++vcpu->stat.pf_fixed; @@ -2779,19 +2708,6 @@ static int __direct_map(struct kvm_vcpu *vcpu, int write, int map_writable, return emulate; } -static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct *tsk) -{ - 
siginfo_t info; - - info.si_signo = SIGBUS; - info.si_errno = 0; - info.si_code = BUS_MCEERR_AR; - info.si_addr = (void __user *)address; - info.si_addr_lsb = PAGE_SHIFT; - - send_sig_info(SIGBUS, &info, tsk); -} - static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn) { /* @@ -2800,59 +2716,12 @@ static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn) * caused mmio page fault and treat it as mmio access. * Return 1 to tell kvm to emulate it. */ - if (pfn == KVM_PFN_ERR_RO_FAULT) + if (pfn == GVM_PFN_ERR_RO_FAULT) return 1; - if (pfn == KVM_PFN_ERR_HWPOISON) { - kvm_send_hwpoison_signal(kvm_vcpu_gfn_to_hva(vcpu, gfn), current); - return 0; - } - return -EFAULT; } -static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu, - gfn_t *gfnp, kvm_pfn_t *pfnp, - int *levelp) -{ - kvm_pfn_t pfn = *pfnp; - gfn_t gfn = *gfnp; - int level = *levelp; - - /* - * Check if it's a transparent hugepage. If this would be an - * hugetlbfs page, level wouldn't be set to - * PT_PAGE_TABLE_LEVEL and there would be no adjustment done - * here. - */ - if (!is_error_noslot_pfn(pfn) && !kvm_is_reserved_pfn(pfn) && - level == PT_PAGE_TABLE_LEVEL && - PageTransCompoundMap(pfn_to_page(pfn)) && - !mmu_gfn_lpage_is_disallowed(vcpu, gfn, PT_DIRECTORY_LEVEL)) { - unsigned long mask; - /* - * mmu_notifier_retry was successful and we hold the - * mmu_lock here, so the pmd can't become splitting - * from under us, and in turn - * __split_huge_page_refcount() can't run from under - * us and we can safely transfer the refcount from - * PG_tail to PG_head as we switch the pfn to tail to - * head. - */ - *levelp = level = PT_DIRECTORY_LEVEL; - mask = KVM_PAGES_PER_HPAGE(level) - 1; - VM_BUG_ON((gfn & mask) != (pfn & mask)); - if (pfn & mask) { - gfn &= ~mask; - *gfnp = gfn; - kvm_release_pfn_clean(pfn); - pfn &= ~mask; - kvm_get_pfn(pfn); - *pfnp = pfn; - } - } -} - static bool handle_abnormal_pfn(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn, kvm_pfn_t pfn, unsigned access, int *ret_val) { @@ -2941,9 +2810,11 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level, return false; walk_shadow_page_lockless_begin(vcpu); - for_each_shadow_entry_lockless(vcpu, gva, iterator, spte) + for_each_shadow_entry_lockless(vcpu, gva, iterator, spte) { + spte = mmu_spte_get_lockless(iterator.sptep); if (!is_shadow_present_pte(spte) || iterator.level < level) break; + } /* * If the mapping has been changed, let the vcpu fault on the @@ -2996,67 +2867,42 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level, */ ret = fast_pf_fix_direct_spte(vcpu, sp, iterator.sptep, spte); exit: - trace_fast_page_fault(vcpu, gva, error_code, iterator.sptep, - spte, ret); walk_shadow_page_lockless_end(vcpu); return ret; } -static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, +static void get_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, gva_t gva, kvm_pfn_t *pfn, bool write, bool *writable); static void make_mmu_pages_available(struct kvm_vcpu *vcpu); static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code, - gfn_t gfn, bool prefault) + gfn_t gfn) { int r; int level; bool force_pt_level = false; kvm_pfn_t pfn; - unsigned long mmu_seq; bool map_writable, write = error_code & PFERR_WRITE_MASK; level = mapping_level(vcpu, gfn, &force_pt_level); - if (likely(!force_pt_level)) { - /* - * This path builds a PAE pagetable - so we can map - * 2mb pages at maximum. Therefore check if the level - * is larger than that. 
- */ - if (level > PT_DIRECTORY_LEVEL) - level = PT_DIRECTORY_LEVEL; - - gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1); - } if (fast_page_fault(vcpu, v, level, error_code)) return 0; - mmu_seq = vcpu->kvm->mmu_notifier_seq; smp_rmb(); - if (try_async_pf(vcpu, prefault, gfn, v, &pfn, write, &map_writable)) - return 0; + get_pfn(vcpu, gfn, v, &pfn, write, &map_writable); if (handle_abnormal_pfn(vcpu, v, gfn, pfn, ACC_ALL, &r)) return r; spin_lock(&vcpu->kvm->mmu_lock); - if (mmu_notifier_retry(vcpu->kvm, mmu_seq)) - goto out_unlock; make_mmu_pages_available(vcpu); - if (likely(!force_pt_level)) - transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level); - r = __direct_map(vcpu, write, map_writable, level, gfn, pfn, prefault); + r = __direct_map(vcpu, write, map_writable, level, gfn, pfn); spin_unlock(&vcpu->kvm->mmu_lock); return r; - -out_unlock: - spin_unlock(&vcpu->kvm->mmu_lock); - kvm_release_pfn_clean(pfn); - return 0; } @@ -3110,7 +2956,7 @@ static int mmu_check_root(struct kvm_vcpu *vcpu, gfn_t root_gfn) int ret = 0; if (!kvm_is_visible_gfn(vcpu->kvm, root_gfn)) { - kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); + kvm_make_request(GVM_REQ_TRIPLE_FAULT, vcpu); ret = 1; } @@ -3291,7 +3137,6 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) mmu_sync_roots(vcpu); spin_unlock(&vcpu->kvm->mmu_lock); } -EXPORT_SYMBOL_GPL(kvm_mmu_sync_roots); static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access, struct x86_exception *exception) @@ -3405,7 +3250,6 @@ int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct) if (direct) addr = 0; - trace_handle_mmio_page_fault(addr, gfn, access); vcpu_cache_mmio_info(vcpu, addr, gfn, access); return RET_MMIO_PF_EMULATE; } @@ -3416,7 +3260,6 @@ int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct) */ return RET_MMIO_PF_RETRY; } -EXPORT_SYMBOL_GPL(handle_mmio_page_fault); static bool page_fault_handle_page_track(struct kvm_vcpu *vcpu, u32 error_code, gfn_t gfn) @@ -3428,12 +3271,14 @@ static bool page_fault_handle_page_track(struct kvm_vcpu *vcpu, !(error_code & PFERR_WRITE_MASK)) return false; +#if 0 /* * guest is writing the page which is write tracked which can * not be fixed by page fault handler. 
*/ - if (kvm_page_track_is_active(vcpu, gfn, KVM_PAGE_TRACK_WRITE)) + if (kvm_page_track_is_active(vcpu, gfn, GVM_PAGE_TRACK_WRITE)) return true; +#endif return false; } @@ -3448,6 +3293,7 @@ static void shadow_page_table_clear_flood(struct kvm_vcpu *vcpu, gva_t addr) walk_shadow_page_lockless_begin(vcpu); for_each_shadow_entry_lockless(vcpu, addr, iterator, spte) { + spte = mmu_spte_get_lockless(iterator.sptep); clear_sp_write_flooding_count(iterator.sptep); if (!is_shadow_present_pte(spte)) break; @@ -3456,7 +3302,7 @@ static void shadow_page_table_clear_flood(struct kvm_vcpu *vcpu, gva_t addr) } static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva, - u32 error_code, bool prefault) + u32 error_code) { gfn_t gfn = gva >> PAGE_SHIFT; int r; @@ -3473,76 +3319,24 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva, MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); - return nonpaging_map(vcpu, gva & PAGE_MASK, - error_code, gfn, prefault); + return nonpaging_map(vcpu, gva & PAGE_MASK, error_code, gfn); } -static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn) -{ - struct kvm_arch_async_pf arch; - - arch.token = (vcpu->arch.apf.id++ << 12) | vcpu->vcpu_id; - arch.gfn = gfn; - arch.direct_map = vcpu->arch.mmu.direct_map; - arch.cr3 = vcpu->arch.mmu.get_cr3(vcpu); - - return kvm_setup_async_pf(vcpu, gva, kvm_vcpu_gfn_to_hva(vcpu, gfn), &arch); -} - -static bool can_do_async_pf(struct kvm_vcpu *vcpu) -{ - if (unlikely(!lapic_in_kernel(vcpu) || - kvm_event_needs_reinjection(vcpu))) - return false; - - return kvm_x86_ops->interrupt_allowed(vcpu); -} - -static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, +static void get_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, gva_t gva, kvm_pfn_t *pfn, bool write, bool *writable) { struct kvm_memory_slot *slot; - bool async; slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); - async = false; - *pfn = __gfn_to_pfn_memslot(slot, gfn, false, &async, write, writable); - if (!async) - return false; /* *pfn has correct page already */ - - if (!prefault && can_do_async_pf(vcpu)) { - trace_kvm_try_async_get_page(gva, gfn); - if (kvm_find_async_pf_gfn(vcpu, gfn)) { - trace_kvm_async_pf_doublefault(gva, gfn); - kvm_make_request(KVM_REQ_APF_HALT, vcpu); - return true; - } else if (kvm_arch_setup_async_pf(vcpu, gva, gfn)) - return true; - } - *pfn = __gfn_to_pfn_memslot(slot, gfn, false, NULL, write, writable); - return false; -} - -static bool -check_hugepage_cache_consistency(struct kvm_vcpu *vcpu, gfn_t gfn, int level) -{ - int page_num = KVM_PAGES_PER_HPAGE(level); - - gfn &= ~(page_num - 1); - - return kvm_mtrr_check_gfn_range_consistency(vcpu, gfn, page_num); } -static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, - bool prefault) +static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code) { kvm_pfn_t pfn; int r; int level; - bool force_pt_level; gfn_t gfn = gpa >> PAGE_SHIFT; - unsigned long mmu_seq; int write = error_code & PFERR_WRITE_MASK; bool map_writable; @@ -3555,43 +3349,24 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, if (r) return r; - force_pt_level = !check_hugepage_cache_consistency(vcpu, gfn, - PT_DIRECTORY_LEVEL); - level = mapping_level(vcpu, gfn, &force_pt_level); - if (likely(!force_pt_level)) { - if (level > PT_DIRECTORY_LEVEL && - !check_hugepage_cache_consistency(vcpu, gfn, level)) - level = PT_DIRECTORY_LEVEL; - gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1); - } + level = mapping_level(vcpu, gfn, NULL); if 
(fast_page_fault(vcpu, gpa, level, error_code)) return 0; - mmu_seq = vcpu->kvm->mmu_notifier_seq; smp_rmb(); - if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, write, &map_writable)) - return 0; + get_pfn(vcpu, gfn, gpa, &pfn, write, &map_writable); if (handle_abnormal_pfn(vcpu, 0, gfn, pfn, ACC_ALL, &r)) return r; spin_lock(&vcpu->kvm->mmu_lock); - if (mmu_notifier_retry(vcpu->kvm, mmu_seq)) - goto out_unlock; make_mmu_pages_available(vcpu); - if (likely(!force_pt_level)) - transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level); - r = __direct_map(vcpu, write, map_writable, level, gfn, pfn, prefault); + r = __direct_map(vcpu, write, map_writable, level, gfn, pfn); spin_unlock(&vcpu->kvm->mmu_lock); return r; - -out_unlock: - spin_unlock(&vcpu->kvm->mmu_lock); - kvm_release_pfn_clean(pfn); - return 0; } static void nonpaging_init_context(struct kvm_vcpu *vcpu, @@ -3614,7 +3389,7 @@ void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu) mmu_free_roots(vcpu); } -static unsigned long get_cr3(struct kvm_vcpu *vcpu) +static size_t get_cr3(struct kvm_vcpu *vcpu) { return kvm_read_cr3(vcpu); } @@ -3662,10 +3437,12 @@ static inline bool is_last_gpte(struct kvm_mmu *mmu, return gpte & PT_PAGE_SIZE_MASK; } +#if 0 #define PTTYPE_EPT 18 /* arbitrary */ #define PTTYPE PTTYPE_EPT #include "paging_tmpl.h" #undef PTTYPE +#endif #define PTTYPE 64 #include "paging_tmpl.h" @@ -3820,7 +3597,7 @@ reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context) /* * Passing "true" to the last argument is okay; it adds a check - * on bit 8 of the SPTEs which KVM doesn't use anyway. + * on bit 8 of the SPTEs which kvm doesn't use anyway. */ __reset_rsvds_bits_mask(vcpu, &context->shadow_zero_check, boot_cpu_data.x86_phys_bits, @@ -3828,7 +3605,6 @@ reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context) guest_cpuid_has_gbpages(vcpu), is_pse(vcpu), true); } -EXPORT_SYMBOL_GPL(reset_shadow_zero_bits_mask); static inline bool boot_cpu_is_amd(void) { @@ -3932,81 +3708,6 @@ static void update_permission_bitmask(struct kvm_vcpu *vcpu, } } -/* -* PKU is an additional mechanism by which the paging controls access to -* user-mode addresses based on the value in the PKRU register. Protection -* key violations are reported through a bit in the page fault error code. -* Unlike other bits of the error code, the PK bit is not known at the -* call site of e.g. gva_to_gpa; it must be computed directly in -* permission_fault based on two bits of PKRU, on some machine state (CR4, -* CR0, EFER, CPL), and on other bits of the error code and the page tables. -* -* In particular the following conditions come from the error code, the -* page tables and the machine state: -* - PK is always zero unless CR4.PKE=1 and EFER.LMA=1 -* - PK is always zero if RSVD=1 (reserved bit set) or F=1 (instruction fetch) -* - PK is always zero if U=0 in the page tables -* - PKRU.WD is ignored if CR0.WP=0 and the access is a supervisor access. -* -* The PKRU bitmask caches the result of these four conditions. The error -* code (minus the P bit) and the page table's U bit form an index into the -* PKRU bitmask. Two bits of the PKRU bitmask are then extracted and ANDed -* with the two bits of the PKRU register corresponding to the protection key. -* For the first three conditions above the bits will be 00, thus masking -* away both AD and WD. For all reads or if the last condition holds, WD -* only will be masked away. 
-*/ -static void update_pkru_bitmask(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, - bool ept) -{ - unsigned bit; - bool wp; - - if (ept) { - mmu->pkru_mask = 0; - return; - } - - /* PKEY is enabled only if CR4.PKE and EFER.LMA are both set. */ - if (!kvm_read_cr4_bits(vcpu, X86_CR4_PKE) || !is_long_mode(vcpu)) { - mmu->pkru_mask = 0; - return; - } - - wp = is_write_protection(vcpu); - - for (bit = 0; bit < ARRAY_SIZE(mmu->permissions); ++bit) { - unsigned pfec, pkey_bits; - bool check_pkey, check_write, ff, uf, wf, pte_user; - - pfec = bit << 1; - ff = pfec & PFERR_FETCH_MASK; - uf = pfec & PFERR_USER_MASK; - wf = pfec & PFERR_WRITE_MASK; - - /* PFEC.RSVD is replaced by ACC_USER_MASK. */ - pte_user = pfec & PFERR_RSVD_MASK; - - /* - * Only need to check the access which is not an - * instruction fetch and is to a user page. - */ - check_pkey = (!ff && pte_user); - /* - * write access is controlled by PKRU if it is a - * user access or CR0.WP = 1. - */ - check_write = check_pkey && wf && (uf || wp); - - /* PKRU.AD stops both read and write access. */ - pkey_bits = !!check_pkey; - /* PKRU.WD stops write access. */ - pkey_bits |= (!!check_write) << 1; - - mmu->pkru_mask |= (pkey_bits & 3) << pfec; - } -} - static void update_last_nonleaf_level(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu) { unsigned root_level = mmu->root_level; @@ -4025,7 +3726,6 @@ static void paging64_init_context_common(struct kvm_vcpu *vcpu, reset_rsvds_bits_mask(vcpu, context); update_permission_bitmask(vcpu, context, false); - update_pkru_bitmask(vcpu, context, false); update_last_nonleaf_level(vcpu, context); MMU_WARN_ON(!is_pae(vcpu)); @@ -4053,7 +3753,6 @@ static void paging32_init_context(struct kvm_vcpu *vcpu, reset_rsvds_bits_mask(vcpu, context); update_permission_bitmask(vcpu, context, false); - update_pkru_bitmask(vcpu, context, false); update_last_nonleaf_level(vcpu, context); context->page_fault = paging32_page_fault; @@ -4112,7 +3811,6 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) } update_permission_bitmask(vcpu, context, false); - update_pkru_bitmask(vcpu, context, false); update_last_nonleaf_level(vcpu, context); reset_tdp_shadow_zero_bits_mask(vcpu, context); } @@ -4144,10 +3842,10 @@ void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu) context->base_role.smm = is_smm(vcpu); reset_shadow_zero_bits_mask(vcpu, context); } -EXPORT_SYMBOL_GPL(kvm_init_shadow_mmu); void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly) { +#if 0 struct kvm_mmu *context = &vcpu->arch.mmu; MMU_WARN_ON(VALID_PAGE(context->root_hpa)); @@ -4165,11 +3863,10 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly) context->direct_map = false; update_permission_bitmask(vcpu, context, true); - update_pkru_bitmask(vcpu, context, true); reset_rsvds_bits_mask_ept(vcpu, context, execonly); reset_ept_shadow_zero_bits_mask(vcpu, context, execonly); +#endif } -EXPORT_SYMBOL_GPL(kvm_init_shadow_ept_mmu); static void init_kvm_softmmu(struct kvm_vcpu *vcpu) { @@ -4220,7 +3917,6 @@ static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu) } update_permission_bitmask(vcpu, g_context, false); - update_pkru_bitmask(vcpu, g_context, false); update_last_nonleaf_level(vcpu, g_context); } @@ -4239,7 +3935,6 @@ void kvm_mmu_reset_context(struct kvm_vcpu *vcpu) kvm_mmu_unload(vcpu); init_kvm_mmu(vcpu); } -EXPORT_SYMBOL_GPL(kvm_mmu_reset_context); int kvm_mmu_load(struct kvm_vcpu *vcpu) { @@ -4257,14 +3952,12 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu) out: return r; } -EXPORT_SYMBOL_GPL(kvm_mmu_load); void kvm_mmu_unload(struct 
kvm_vcpu *vcpu) { mmu_free_roots(vcpu); WARN_ON(VALID_PAGE(vcpu->arch.mmu.root_hpa)); } -EXPORT_SYMBOL_GPL(kvm_mmu_unload); static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, u64 *spte, @@ -4413,7 +4106,7 @@ static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, u64 entry, gentry, *spte; int npte; bool remote_flush, local_flush; - union kvm_mmu_page_role mask = { }; + union kvm_mmu_page_role mask = { 0 }; mask.cr0_wp = 1; mask.cr4_pae = 1; @@ -4426,7 +4119,9 @@ static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, * If we don't have indirect shadow pages, it means no page is * write-protected, so we can exit simply. */ - if (!ACCESS_ONCE(vcpu->kvm->arch.indirect_shadow_pages)) + unsigned int temp; + ACCESS_ONCE(vcpu->kvm->arch.indirect_shadow_pages, temp); + if (!temp) return; remote_flush = local_flush = false; @@ -4446,6 +4141,7 @@ static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, ++vcpu->kvm->stat.mmu_pte_write; kvm_mmu_audit(vcpu, AUDIT_PRE_PTE_WRITE); +#define LIST_ENTRY_TYPE_INFO struct kvm_mmu_page for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn) { if (detect_write_misaligned(sp, gpa, bytes) || detect_write_flooding(sp)) { @@ -4471,6 +4167,7 @@ static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, ++spte; } } +#undef LIST_ENTRY_TYPE_INFO kvm_mmu_flush_or_zap(vcpu, &invalid_list, remote_flush, local_flush); kvm_mmu_audit(vcpu, AUDIT_POST_PTE_WRITE); spin_unlock(&vcpu->kvm->mmu_lock); @@ -4490,16 +4187,15 @@ int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva) return r; } -EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page_virt); static void make_mmu_pages_available(struct kvm_vcpu *vcpu) { LIST_HEAD(invalid_list); - if (likely(kvm_mmu_available_pages(vcpu->kvm) >= KVM_MIN_FREE_MMU_PAGES)) + if (likely(kvm_mmu_available_pages(vcpu->kvm) >= GVM_MIN_FREE_MMU_PAGES)) return; - while (kvm_mmu_available_pages(vcpu->kvm) < KVM_REFILL_PAGES) { + while (kvm_mmu_available_pages(vcpu->kvm) < GVM_REFILL_PAGES) { if (!prepare_zap_oldest_mmu_page(vcpu->kvm, &invalid_list)) break; @@ -4527,7 +4223,7 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code, return r; } - r = vcpu->arch.mmu.page_fault(vcpu, cr2, error_code, false); + r = vcpu->arch.mmu.page_fault(vcpu, cr2, error_code); if (r < 0) return r; if (!r) @@ -4550,38 +4246,33 @@ emulate: BUG(); } } -EXPORT_SYMBOL_GPL(kvm_mmu_page_fault); void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva) { vcpu->arch.mmu.invlpg(vcpu, gva); - kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); + kvm_make_request(GVM_REQ_TLB_FLUSH, vcpu); ++vcpu->stat.invlpg; } -EXPORT_SYMBOL_GPL(kvm_mmu_invlpg); void kvm_enable_tdp(void) { tdp_enabled = true; } -EXPORT_SYMBOL_GPL(kvm_enable_tdp); void kvm_disable_tdp(void) { tdp_enabled = false; } -EXPORT_SYMBOL_GPL(kvm_disable_tdp); static void free_mmu_pages(struct kvm_vcpu *vcpu) { - free_page((unsigned long)vcpu->arch.mmu.pae_root); + MmFreeContiguousMemory(vcpu->arch.mmu.pae_root); if (vcpu->arch.mmu.lm_root != NULL) - free_page((unsigned long)vcpu->arch.mmu.lm_root); + free_page((size_t)vcpu->arch.mmu.lm_root); } static int alloc_mmu_pages(struct kvm_vcpu *vcpu) { - struct page *page; int i; /* @@ -4589,11 +4280,14 @@ static int alloc_mmu_pages(struct kvm_vcpu *vcpu) * Therefore we need to allocate shadow page tables in the first * 4GB of memory, which happens to fit the DMA32 zone. 
*/ - page = alloc_page(GFP_KERNEL | __GFP_DMA32); - if (!page) + PHYSICAL_ADDRESS addr_4g; + addr_4g.QuadPart = 0xFFFFFFFF; + + vcpu->arch.mmu.pae_root = + MmAllocateContiguousMemory(PAGE_SIZE, addr_4g); + if (!vcpu->arch.mmu.pae_root) return -ENOMEM; - vcpu->arch.mmu.pae_root = page_address(page); for (i = 0; i < 4; ++i) vcpu->arch.mmu.pae_root[i] = INVALID_PAGE; @@ -4649,6 +4343,7 @@ slot_handle_level_range(struct kvm *kvm, struct kvm_memory_slot *memslot, if (iterator.rmap) flush |= fn(kvm, iterator.rmap); +#if 0 if (need_resched() || spin_needbreak(&kvm->mmu_lock)) { if (flush && lock_flush_tlb) { kvm_flush_remote_tlbs(kvm); @@ -4656,6 +4351,7 @@ slot_handle_level_range(struct kvm *kvm, struct kvm_memory_slot *memslot, } cond_resched_lock(&kvm->mmu_lock); } +#endif } if (flush && lock_flush_tlb) { @@ -4682,15 +4378,7 @@ slot_handle_all_level(struct kvm *kvm, struct kvm_memory_slot *memslot, slot_level_handler fn, bool lock_flush_tlb) { return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL, - PT_MAX_HUGEPAGE_LEVEL, lock_flush_tlb); -} - -static bool -slot_handle_large_level(struct kvm *kvm, struct kvm_memory_slot *memslot, - slot_level_handler fn, bool lock_flush_tlb) -{ - return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL + 1, - PT_MAX_HUGEPAGE_LEVEL, lock_flush_tlb); + PT_PAGE_TABLE_LEVEL, lock_flush_tlb); } static bool @@ -4708,7 +4396,7 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) int i; spin_lock(&kvm->mmu_lock); - for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { + for (i = 0; i < GVM_ADDRESS_SPACE_NUM; i++) { slots = __kvm_memslots(kvm, i); kvm_for_each_memslot(memslot, slots) { gfn_t start, end; @@ -4719,7 +4407,7 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) continue; slot_handle_level_range(kvm, memslot, kvm_zap_rmapp, - PT_PAGE_TABLE_LEVEL, PT_MAX_HUGEPAGE_LEVEL, + PT_PAGE_TABLE_LEVEL, PT_PAGE_TABLE_LEVEL, start, end - 1, true); } } @@ -4748,7 +4436,7 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, * which do tlb flush out of mmu-lock should be serialized by * kvm->slots_lock otherwise tlb flush would be missed. */ - lockdep_assert_held(&kvm->slots_lock); + //lockdep_assert_held(&kvm->slots_lock); /* * We can flush all the TLBs out of the mmu lock without TLB @@ -4786,9 +4474,8 @@ restart: * the guest, and the guest page table is using 4K page size * mapping if the indirect sp has level = 1. 
*/ - if (sp->role.direct && - !kvm_is_reserved_pfn(pfn) && - PageTransCompoundMap(pfn_to_page(pfn))) { + if (sp->role.direct //&& + /*PageTransCompoundMap(pfn_to_page(pfn))*/) { drop_spte(kvm, sptep); need_tlb_flush = 1; goto restart; @@ -4817,7 +4504,7 @@ void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm, flush = slot_handle_leaf(kvm, memslot, __rmap_clear_dirty, false); spin_unlock(&kvm->mmu_lock); - lockdep_assert_held(&kvm->slots_lock); + //lockdep_assert_held(&kvm->slots_lock); /* * It's also safe to flush TLBs out of mmu lock here as currently this @@ -4828,25 +4515,6 @@ void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm, if (flush) kvm_flush_remote_tlbs(kvm); } -EXPORT_SYMBOL_GPL(kvm_mmu_slot_leaf_clear_dirty); - -void kvm_mmu_slot_largepage_remove_write_access(struct kvm *kvm, - struct kvm_memory_slot *memslot) -{ - bool flush; - - spin_lock(&kvm->mmu_lock); - flush = slot_handle_large_level(kvm, memslot, slot_rmap_write_protect, - false); - spin_unlock(&kvm->mmu_lock); - - /* see kvm_mmu_slot_remove_write_access */ - lockdep_assert_held(&kvm->slots_lock); - - if (flush) - kvm_flush_remote_tlbs(kvm); -} -EXPORT_SYMBOL_GPL(kvm_mmu_slot_largepage_remove_write_access); void kvm_mmu_slot_set_dirty(struct kvm *kvm, struct kvm_memory_slot *memslot) @@ -4857,13 +4525,12 @@ void kvm_mmu_slot_set_dirty(struct kvm *kvm, flush = slot_handle_all_level(kvm, memslot, __rmap_set_dirty, false); spin_unlock(&kvm->mmu_lock); - lockdep_assert_held(&kvm->slots_lock); + //lockdep_assert_held(&kvm->slots_lock); /* see kvm_mmu_slot_leaf_clear_dirty */ if (flush) kvm_flush_remote_tlbs(kvm); } -EXPORT_SYMBOL_GPL(kvm_mmu_slot_set_dirty); #define BATCH_ZAP_PAGES 10 static void kvm_zap_obsolete_pages(struct kvm *kvm) @@ -4872,6 +4539,7 @@ static void kvm_zap_obsolete_pages(struct kvm *kvm) int batch = 0; restart: +#define LIST_ENTRY_TYPE_INFO struct kvm_mmu_page list_for_each_entry_safe_reverse(sp, node, &kvm->arch.active_mmu_pages, link) { int ret; @@ -4895,8 +4563,8 @@ restart: * Need not flush tlb since we only zap the sp with invalid * generation number. */ - if (batch >= BATCH_ZAP_PAGES && - cond_resched_lock(&kvm->mmu_lock)) { + if (batch >= BATCH_ZAP_PAGES) {// && + //cond_resched_lock(&kvm->mmu_lock)) { batch = 0; goto restart; } @@ -4908,6 +4576,7 @@ restart: if (ret) goto restart; } +#undef LIST_ENTRY_TYPE_INFO /* * Should flush tlb before free page tables since lockless-walking @@ -4921,14 +4590,13 @@ restart: * to zap obsolete pages. * * It's required when memslot is being deleted or VM is being - * destroyed, in these cases, we should ensure that KVM MMU does + * destroyed, in these cases, we should ensure that kvm MMU does * not use any resource of the being-deleted slot or all slots * after calling the function. 
*/ void kvm_mmu_invalidate_zap_all_pages(struct kvm *kvm) { spin_lock(&kvm->mmu_lock); - trace_kvm_mmu_invalidate_zap_all_pages(kvm); kvm->arch.mmu_valid_gen++; /* @@ -4963,12 +4631,14 @@ void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, struct kvm_memslots *slots) } } -static unsigned long +// todo-002 +#if 0 +static size_t mmu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) { struct kvm *kvm; int nr_to_scan = sc->nr_to_scan; - unsigned long freed = 0; + size_t freed = 0; spin_lock(&kvm_lock); @@ -5024,7 +4694,7 @@ unlock: return freed; } -static unsigned long +static size_t mmu_shrink_count(struct shrinker *shrink, struct shrink_control *sc) { return percpu_counter_read_positive(&kvm_total_used_mmu_pages); @@ -5035,39 +4705,16 @@ static struct shrinker mmu_shrinker = { .scan_objects = mmu_shrink_scan, .seeks = DEFAULT_SEEKS * 10, }; +#endif static void mmu_destroy_caches(void) { +#if 0 if (pte_list_desc_cache) kmem_cache_destroy(pte_list_desc_cache); if (mmu_page_header_cache) kmem_cache_destroy(mmu_page_header_cache); -} - -int kvm_mmu_module_init(void) -{ - pte_list_desc_cache = kmem_cache_create("pte_list_desc", - sizeof(struct pte_list_desc), - 0, 0, NULL); - if (!pte_list_desc_cache) - goto nomem; - - mmu_page_header_cache = kmem_cache_create("kvm_mmu_page_header", - sizeof(struct kvm_mmu_page), - 0, 0, NULL); - if (!mmu_page_header_cache) - goto nomem; - - if (percpu_counter_init(&kvm_total_used_mmu_pages, 0, GFP_KERNEL)) - goto nomem; - - register_shrinker(&mmu_shrinker); - - return 0; - -nomem: - mmu_destroy_caches(); - return -ENOMEM; +#endif } /* @@ -5081,16 +4728,16 @@ unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm) struct kvm_memory_slot *memslot; int i; - for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { + for (i = 0; i < GVM_ADDRESS_SPACE_NUM; i++) { slots = __kvm_memslots(kvm, i); kvm_for_each_memslot(memslot, slots) nr_pages += memslot->npages; } - nr_mmu_pages = nr_pages * KVM_PERMILLE_MMU_PAGES / 1000; + nr_mmu_pages = nr_pages * GVM_PERMILLE_MMU_PAGES / 1000; nr_mmu_pages = max(nr_mmu_pages, - (unsigned int) KVM_MIN_ALLOC_MMU_PAGES); + (unsigned int) GVM_MIN_ALLOC_MMU_PAGES); return nr_mmu_pages; } @@ -5104,8 +4751,11 @@ void kvm_mmu_destroy(struct kvm_vcpu *vcpu) void kvm_mmu_module_exit(void) { + // todo-001 +#if 0 mmu_destroy_caches(); percpu_counter_destroy(&kvm_total_used_mmu_pages); unregister_shrinker(&mmu_shrinker); mmu_audit_disable(); +#endif } diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index ddc56e9..cf39e5a 100644..100755 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -1,3 +1,7 @@ +/* + * Copyright 2019 Google LLC + */ + #ifndef __KVM_X86_MMU_H #define __KVM_X86_MMU_H @@ -44,7 +48,7 @@ #define PT_PDPE_LEVEL 3 #define PT_DIRECTORY_LEVEL 2 #define PT_PAGE_TABLE_LEVEL 1 -#define PT_MAX_HUGEPAGE_LEVEL (PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES - 1) +#define PT_MAX_HUGEPAGE_LEVEL (PT_PAGE_TABLE_LEVEL + GVM_NR_PAGE_SIZES - 1) static inline u64 rsvd_bits(int s, int e) { @@ -96,7 +100,7 @@ static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu) /* * Currently, we have two sorts of write-protection, a) the first one * write-protects guest page to sync the guest modification, b) another one is - * used to sync dirty bitmap when we do KVM_GET_DIRTY_LOG. The differences + * used to sync dirty bitmap when we do GVM_GET_DIRTY_LOG. The differences * between these two sorts are: * 1) the first case clears SPTE_MMU_WRITEABLE bit. 
* 2) the first case requires flushing tlb immediately avoiding corrupting @@ -126,7 +130,7 @@ static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu) * * TODO: introduce APIs to split these two cases. */ -static inline int is_writable_pte(unsigned long pte) +static inline int is_writable_pte(size_t pte) { return pte & PT_WRITABLE_MASK; } @@ -149,7 +153,7 @@ static inline u8 permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned pfec) { int cpl = kvm_x86_ops->get_cpl(vcpu); - unsigned long rflags = kvm_x86_ops->get_rflags(vcpu); + size_t rflags = kvm_x86_ops->get_rflags(vcpu); /* * If CPL < 3, SMAP prevention are disabled if EFLAGS.AC = 1. @@ -164,41 +168,20 @@ static inline u8 permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, * but it will be one in index if SMAP checks are being overridden. * It is important to keep this branchless. */ - unsigned long smap = (cpl - 3) & (rflags & X86_EFLAGS_AC); + size_t smap = (cpl - 3) & (rflags & X86_EFLAGS_AC); int index = (pfec >> 1) + (smap >> (X86_EFLAGS_AC_BIT - PFERR_RSVD_BIT + 1)); bool fault = (mmu->permissions[index] >> pte_access) & 1; u32 errcode = PFERR_PRESENT_MASK; WARN_ON(pfec & (PFERR_PK_MASK | PFERR_RSVD_MASK)); - if (unlikely(mmu->pkru_mask)) { - u32 pkru_bits, offset; - - /* - * PKRU defines 32 bits, there are 16 domains and 2 - * attribute bits per domain in pkru. pte_pkey is the - * index of the protection domain, so pte_pkey * 2 is - * is the index of the first bit for the domain. - */ - pkru_bits = (kvm_read_pkru(vcpu) >> (pte_pkey * 2)) & 3; - - /* clear present bit, replace PFEC.RSVD with ACC_USER_MASK. */ - offset = (pfec & ~1) + - ((pte_access & PT_USER_MASK) << (PFERR_RSVD_BIT - PT_USER_SHIFT)); - - pkru_bits &= mmu->pkru_mask >> offset; - errcode |= -pkru_bits & PFERR_PK_MASK; - fault |= (pkru_bits != 0); - } - - return -(u32)fault & errcode; + + return -(s32)fault & errcode; } void kvm_mmu_invalidate_zap_all_pages(struct kvm *kvm); void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end); -void kvm_mmu_gfn_disallow_lpage(struct kvm_memory_slot *slot, gfn_t gfn); -void kvm_mmu_gfn_allow_lpage(struct kvm_memory_slot *slot, gfn_t gfn); bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm, struct kvm_memory_slot *slot, u64 gfn); #endif diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c index dcce533..76050b1 100644..100755 --- a/arch/x86/kvm/mmu_audit.c +++ b/arch/x86/kvm/mmu_audit.c @@ -5,6 +5,7 @@ * * Copyright (C) 2006 Qumranet, Inc. * Copyright 2010 Red Hat, Inc. and/or its affiliates. 
+ * Copyright 2019 Google LLC * * Authors: * Yaniv Kamay <yaniv@qumranet.com> @@ -17,6 +18,7 @@ * */ +#if 0 #include <linux/ratelimit.h> char const *audit_point_name[] = { @@ -278,7 +280,7 @@ static void mmu_audit_disable(void) static int mmu_audit_set(const char *val, const struct kernel_param *kp) { int ret; - unsigned long enable; + size_t enable; ret = kstrtoul(val, 10, &enable); if (ret < 0) @@ -304,3 +306,4 @@ static const struct kernel_param_ops audit_param_ops = { }; arch_param_cb(mmu_audit, &audit_param_ops, &mmu_audit, 0644); +#endif diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h deleted file mode 100644 index 5a24b84..0000000 --- a/arch/x86/kvm/mmutrace.h +++ /dev/null @@ -1,333 +0,0 @@ -#if !defined(_TRACE_KVMMMU_H) || defined(TRACE_HEADER_MULTI_READ) -#define _TRACE_KVMMMU_H - -#include <linux/tracepoint.h> -#include <linux/trace_events.h> - -#undef TRACE_SYSTEM -#define TRACE_SYSTEM kvmmmu - -#define KVM_MMU_PAGE_FIELDS \ - __field(unsigned long, mmu_valid_gen) \ - __field(__u64, gfn) \ - __field(__u32, role) \ - __field(__u32, root_count) \ - __field(bool, unsync) - -#define KVM_MMU_PAGE_ASSIGN(sp) \ - __entry->mmu_valid_gen = sp->mmu_valid_gen; \ - __entry->gfn = sp->gfn; \ - __entry->role = sp->role.word; \ - __entry->root_count = sp->root_count; \ - __entry->unsync = sp->unsync; - -#define KVM_MMU_PAGE_PRINTK() ({ \ - const char *saved_ptr = trace_seq_buffer_ptr(p); \ - static const char *access_str[] = { \ - "---", "--x", "w--", "w-x", "-u-", "-ux", "wu-", "wux" \ - }; \ - union kvm_mmu_page_role role; \ - \ - role.word = __entry->role; \ - \ - trace_seq_printf(p, "sp gen %lx gfn %llx %u%s q%u%s %s%s" \ - " %snxe root %u %s%c", __entry->mmu_valid_gen, \ - __entry->gfn, role.level, \ - role.cr4_pae ? " pae" : "", \ - role.quadrant, \ - role.direct ? " direct" : "", \ - access_str[role.access], \ - role.invalid ? " invalid" : "", \ - role.nxe ? "" : "!", \ - __entry->root_count, \ - __entry->unsync ? 
"unsync" : "sync", 0); \ - saved_ptr; \ - }) - -#define kvm_mmu_trace_pferr_flags \ - { PFERR_PRESENT_MASK, "P" }, \ - { PFERR_WRITE_MASK, "W" }, \ - { PFERR_USER_MASK, "U" }, \ - { PFERR_RSVD_MASK, "RSVD" }, \ - { PFERR_FETCH_MASK, "F" } - -/* - * A pagetable walk has started - */ -TRACE_EVENT( - kvm_mmu_pagetable_walk, - TP_PROTO(u64 addr, u32 pferr), - TP_ARGS(addr, pferr), - - TP_STRUCT__entry( - __field(__u64, addr) - __field(__u32, pferr) - ), - - TP_fast_assign( - __entry->addr = addr; - __entry->pferr = pferr; - ), - - TP_printk("addr %llx pferr %x %s", __entry->addr, __entry->pferr, - __print_flags(__entry->pferr, "|", kvm_mmu_trace_pferr_flags)) -); - - -/* We just walked a paging element */ -TRACE_EVENT( - kvm_mmu_paging_element, - TP_PROTO(u64 pte, int level), - TP_ARGS(pte, level), - - TP_STRUCT__entry( - __field(__u64, pte) - __field(__u32, level) - ), - - TP_fast_assign( - __entry->pte = pte; - __entry->level = level; - ), - - TP_printk("pte %llx level %u", __entry->pte, __entry->level) -); - -DECLARE_EVENT_CLASS(kvm_mmu_set_bit_class, - - TP_PROTO(unsigned long table_gfn, unsigned index, unsigned size), - - TP_ARGS(table_gfn, index, size), - - TP_STRUCT__entry( - __field(__u64, gpa) - ), - - TP_fast_assign( - __entry->gpa = ((u64)table_gfn << PAGE_SHIFT) - + index * size; - ), - - TP_printk("gpa %llx", __entry->gpa) -); - -/* We set a pte accessed bit */ -DEFINE_EVENT(kvm_mmu_set_bit_class, kvm_mmu_set_accessed_bit, - - TP_PROTO(unsigned long table_gfn, unsigned index, unsigned size), - - TP_ARGS(table_gfn, index, size) -); - -/* We set a pte dirty bit */ -DEFINE_EVENT(kvm_mmu_set_bit_class, kvm_mmu_set_dirty_bit, - - TP_PROTO(unsigned long table_gfn, unsigned index, unsigned size), - - TP_ARGS(table_gfn, index, size) -); - -TRACE_EVENT( - kvm_mmu_walker_error, - TP_PROTO(u32 pferr), - TP_ARGS(pferr), - - TP_STRUCT__entry( - __field(__u32, pferr) - ), - - TP_fast_assign( - __entry->pferr = pferr; - ), - - TP_printk("pferr %x %s", __entry->pferr, - __print_flags(__entry->pferr, "|", kvm_mmu_trace_pferr_flags)) -); - -TRACE_EVENT( - kvm_mmu_get_page, - TP_PROTO(struct kvm_mmu_page *sp, bool created), - TP_ARGS(sp, created), - - TP_STRUCT__entry( - KVM_MMU_PAGE_FIELDS - __field(bool, created) - ), - - TP_fast_assign( - KVM_MMU_PAGE_ASSIGN(sp) - __entry->created = created; - ), - - TP_printk("%s %s", KVM_MMU_PAGE_PRINTK(), - __entry->created ? 
"new" : "existing") -); - -DECLARE_EVENT_CLASS(kvm_mmu_page_class, - - TP_PROTO(struct kvm_mmu_page *sp), - TP_ARGS(sp), - - TP_STRUCT__entry( - KVM_MMU_PAGE_FIELDS - ), - - TP_fast_assign( - KVM_MMU_PAGE_ASSIGN(sp) - ), - - TP_printk("%s", KVM_MMU_PAGE_PRINTK()) -); - -DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_sync_page, - TP_PROTO(struct kvm_mmu_page *sp), - - TP_ARGS(sp) -); - -DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_unsync_page, - TP_PROTO(struct kvm_mmu_page *sp), - - TP_ARGS(sp) -); - -DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_prepare_zap_page, - TP_PROTO(struct kvm_mmu_page *sp), - - TP_ARGS(sp) -); - -TRACE_EVENT( - mark_mmio_spte, - TP_PROTO(u64 *sptep, gfn_t gfn, unsigned access, unsigned int gen), - TP_ARGS(sptep, gfn, access, gen), - - TP_STRUCT__entry( - __field(void *, sptep) - __field(gfn_t, gfn) - __field(unsigned, access) - __field(unsigned int, gen) - ), - - TP_fast_assign( - __entry->sptep = sptep; - __entry->gfn = gfn; - __entry->access = access; - __entry->gen = gen; - ), - - TP_printk("sptep:%p gfn %llx access %x gen %x", __entry->sptep, - __entry->gfn, __entry->access, __entry->gen) -); - -TRACE_EVENT( - handle_mmio_page_fault, - TP_PROTO(u64 addr, gfn_t gfn, unsigned access), - TP_ARGS(addr, gfn, access), - - TP_STRUCT__entry( - __field(u64, addr) - __field(gfn_t, gfn) - __field(unsigned, access) - ), - - TP_fast_assign( - __entry->addr = addr; - __entry->gfn = gfn; - __entry->access = access; - ), - - TP_printk("addr:%llx gfn %llx access %x", __entry->addr, __entry->gfn, - __entry->access) -); - -#define __spte_satisfied(__spte) \ - (__entry->retry && is_writable_pte(__entry->__spte)) - -TRACE_EVENT( - fast_page_fault, - TP_PROTO(struct kvm_vcpu *vcpu, gva_t gva, u32 error_code, - u64 *sptep, u64 old_spte, bool retry), - TP_ARGS(vcpu, gva, error_code, sptep, old_spte, retry), - - TP_STRUCT__entry( - __field(int, vcpu_id) - __field(gva_t, gva) - __field(u32, error_code) - __field(u64 *, sptep) - __field(u64, old_spte) - __field(u64, new_spte) - __field(bool, retry) - ), - - TP_fast_assign( - __entry->vcpu_id = vcpu->vcpu_id; - __entry->gva = gva; - __entry->error_code = error_code; - __entry->sptep = sptep; - __entry->old_spte = old_spte; - __entry->new_spte = *sptep; - __entry->retry = retry; - ), - - TP_printk("vcpu %d gva %lx error_code %s sptep %p old %#llx" - " new %llx spurious %d fixed %d", __entry->vcpu_id, - __entry->gva, __print_flags(__entry->error_code, "|", - kvm_mmu_trace_pferr_flags), __entry->sptep, - __entry->old_spte, __entry->new_spte, - __spte_satisfied(old_spte), __spte_satisfied(new_spte) - ) -); - -TRACE_EVENT( - kvm_mmu_invalidate_zap_all_pages, - TP_PROTO(struct kvm *kvm), - TP_ARGS(kvm), - - TP_STRUCT__entry( - __field(unsigned long, mmu_valid_gen) - __field(unsigned int, mmu_used_pages) - ), - - TP_fast_assign( - __entry->mmu_valid_gen = kvm->arch.mmu_valid_gen; - __entry->mmu_used_pages = kvm->arch.n_used_mmu_pages; - ), - - TP_printk("kvm-mmu-valid-gen %lx used_pages %x", - __entry->mmu_valid_gen, __entry->mmu_used_pages - ) -); - - -TRACE_EVENT( - check_mmio_spte, - TP_PROTO(u64 spte, unsigned int kvm_gen, unsigned int spte_gen), - TP_ARGS(spte, kvm_gen, spte_gen), - - TP_STRUCT__entry( - __field(unsigned int, kvm_gen) - __field(unsigned int, spte_gen) - __field(u64, spte) - ), - - TP_fast_assign( - __entry->kvm_gen = kvm_gen; - __entry->spte_gen = spte_gen; - __entry->spte = spte; - ), - - TP_printk("spte %llx kvm_gen %x spte-gen %x valid %d", __entry->spte, - __entry->kvm_gen, __entry->spte_gen, - __entry->kvm_gen == __entry->spte_gen 
- ) -); -#endif /* _TRACE_KVMMMU_H */ - -#undef TRACE_INCLUDE_PATH -#define TRACE_INCLUDE_PATH . -#undef TRACE_INCLUDE_FILE -#define TRACE_INCLUDE_FILE mmutrace - -/* This part must be outside protection */ -#include <trace/define_trace.h> diff --git a/arch/x86/kvm/mtrr.c b/arch/x86/kvm/mtrr.c index 0149ac5..6f3c042 100644..100755 --- a/arch/x86/kvm/mtrr.c +++ b/arch/x86/kvm/mtrr.c @@ -4,6 +4,7 @@ * Copyright (C) 2006 Qumranet, Inc. * Copyright 2010 Red Hat, Inc. and/or its affiliates. * Copyright(C) 2015 Intel Corporation. + * Copyright 2019 Google LLC * * Authors: * Yaniv Kamay <yaniv@qumranet.com> @@ -17,7 +18,6 @@ */ #include <linux/kvm_host.h> -#include <asm/mtrr.h> #include "cpuid.h" #include "mmu.h" @@ -26,10 +26,19 @@ #define IA32_MTRR_DEF_TYPE_FE (1ULL << 10) #define IA32_MTRR_DEF_TYPE_TYPE_MASK (0xff) +/* MTRR memory types, which are defined in SDM */ +#define MTRR_TYPE_UNCACHABLE 0 +#define MTRR_TYPE_WRCOMB 1 +/*#define MTRR_TYPE_ 2*/ +/*#define MTRR_TYPE_ 3*/ +#define MTRR_TYPE_WRTHROUGH 4 +#define MTRR_TYPE_WRPROT 5 +#define MTRR_TYPE_WRBACK 6 +#define MTRR_NUM_TYPES 7 + static bool msr_mtrr_valid(unsigned msr) { switch (msr) { - case 0x200 ... 0x200 + 2 * KVM_NR_VAR_MTRR - 1: case MSR_MTRRfix64K_00000: case MSR_MTRRfix16K_80000: case MSR_MTRRfix16K_A0000: @@ -44,6 +53,9 @@ static bool msr_mtrr_valid(unsigned msr) case MSR_MTRRdefType: case MSR_IA32_CR_PAT: return true; + default: + if (msr >= 0x200 && msr < 0x210) + return true; } return false; } @@ -83,7 +95,7 @@ bool kvm_mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data) } /* variable MTRRs */ - WARN_ON(!(msr >= 0x200 && msr < 0x200 + 2 * KVM_NR_VAR_MTRR)); + WARN_ON(!(msr >= 0x200 && msr < 0x200 + 2 * kvm_NR_VAR_MTRR)); mask = (~0ULL) << cpuid_maxphyaddr(vcpu); if ((msr & 1) == 0) { @@ -101,7 +113,6 @@ bool kvm_mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data) return true; } -EXPORT_SYMBOL_GPL(kvm_mtrr_valid); static bool mtrr_is_enabled(struct kvm_mtrr *mtrr_state) { @@ -200,11 +211,19 @@ static bool fixed_msr_to_seg_unit(u32 msr, int *seg, int *unit) *seg = 0; *unit = 0; break; - case MSR_MTRRfix16K_80000 ... MSR_MTRRfix16K_A0000: + case MSR_MTRRfix16K_80000: + case MSR_MTRRfix16K_A0000: *seg = 1; *unit = msr - MSR_MTRRfix16K_80000; break; - case MSR_MTRRfix4K_C0000 ... MSR_MTRRfix4K_F8000: + case MSR_MTRRfix4K_C0000: + case MSR_MTRRfix4K_C8000: + case MSR_MTRRfix4K_D0000: + case MSR_MTRRfix4K_D8000: + case MSR_MTRRfix4K_E0000: + case MSR_MTRRfix4K_E8000: + case MSR_MTRRfix4K_F0000: + case MSR_MTRRfix4K_F8000: *seg = 2; *unit = msr - MSR_MTRRfix4K_C0000; break; @@ -319,8 +338,7 @@ static void update_mtrr(struct kvm_vcpu *vcpu, u32 msr) gfn_t start, end; int index; - if (msr == MSR_IA32_CR_PAT || !tdp_enabled || - !kvm_arch_has_noncoherent_dma(vcpu->kvm)) + if (msr == MSR_IA32_CR_PAT || !tdp_enabled) return; if (!mtrr_is_enabled(mtrr_state) && msr != MSR_MTRRdefType) @@ -372,10 +390,12 @@ static void set_var_mtrr_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data) /* add it to the list if it's enabled. 
*/ if (var_mtrr_range_is_valid(cur)) { +#define LIST_ENTRY_TYPE_INFO struct kvm_mtrr_range list_for_each_entry(tmp, &mtrr_state->head, node) if (cur->base >= tmp->base) break; list_add_tail(&cur->node, &tmp->node); +#undef LIST_ENTRY_TYPE_INFO } } @@ -410,9 +430,9 @@ int kvm_mtrr_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) * SMRR = 0 * WC = 1 * FIX = 1 - * VCNT = KVM_NR_VAR_MTRR + * VCNT = kvm_NR_VAR_MTRR */ - *pdata = 0x500 | KVM_NR_VAR_MTRR; + *pdata = 0x500 | 8; return 0; } @@ -525,9 +545,11 @@ static void __mtrr_lookup_var_next(struct mtrr_iter *iter) { struct kvm_mtrr *mtrr_state = iter->mtrr_state; +#define LIST_ENTRY_TYPE_INFO struct kvm_mtrr_range list_for_each_entry_continue(iter->range, &mtrr_state->head, node) if (match_var_range(iter, iter->range)) return; +#undef LIST_ENTRY_TYPE_INFO iter->range = NULL; iter->partial_map |= iter->start_max < iter->end; @@ -540,7 +562,9 @@ static void mtrr_lookup_var_start(struct mtrr_iter *iter) iter->fixed = false; iter->start_max = iter->start; iter->range = NULL; +#define LIST_ENTRY_TYPE_INFO struct kvm_mtrr_range iter->range = list_prepare_entry(iter->range, &mtrr_state->head, node); +#undef LIST_ENTRY_TYPE_INFO __mtrr_lookup_var_next(iter); } @@ -557,9 +581,10 @@ static void mtrr_lookup_fixed_next(struct mtrr_iter *iter) iter->index++; /* have looked up for all fixed MTRRs. */ - if (iter->index >= ARRAY_SIZE(iter->mtrr_state->fixed_ranges)) - return mtrr_lookup_var_start(iter); - + if (iter->index >= ARRAY_SIZE(iter->mtrr_state->fixed_ranges)) { + mtrr_lookup_var_start(iter); + return; + } /* switch to next segment. */ if (iter->index > fixed_mtrr_seg_end_range_index(iter->seg)) iter->seg++; @@ -696,7 +721,6 @@ u8 kvm_mtrr_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn) return type; } -EXPORT_SYMBOL_GPL(kvm_mtrr_get_guest_memory_type); bool kvm_mtrr_check_gfn_range_consistency(struct kvm_vcpu *vcpu, gfn_t gfn, int page_num) diff --git a/arch/x86/kvm/page_track.c b/arch/x86/kvm/page_track.c index b431539..2d6d87d 100644..100755 --- a/arch/x86/kvm/page_track.c +++ b/arch/x86/kvm/page_track.c @@ -5,6 +5,7 @@ * write access is tracked. * * Copyright(C) 2015 Intel Corporation. + * Copyright 2019 Google LLC * * Author: * Xiao Guangrong <guangrong.xiao@linux.intel.com> @@ -33,7 +34,7 @@ void kvm_page_track_free_memslot(struct kvm_memory_slot *free, } int kvm_page_track_create_memslot(struct kvm_memory_slot *slot, - unsigned long npages) + size_t npages) { int i; @@ -64,7 +65,7 @@ static void update_gfn_track(struct kvm_memory_slot *slot, gfn_t gfn, { int index, val; - index = gfn_to_index(gfn, slot->base_gfn, PT_PAGE_TABLE_LEVEL); + index = gfn - slot->base_gfn; val = slot->arch.gfn_track[mode][index]; @@ -96,12 +97,6 @@ void kvm_slot_page_track_add_page(struct kvm *kvm, update_gfn_track(slot, gfn, mode, 1); - /* - * new track stops large page mapping for the - * tracked page. - */ - kvm_mmu_gfn_disallow_lpage(slot, gfn); - if (mode == KVM_PAGE_TRACK_WRITE) if (kvm_mmu_slot_gfn_write_protect(kvm, slot, gfn)) kvm_flush_remote_tlbs(kvm); @@ -128,12 +123,6 @@ void kvm_slot_page_track_remove_page(struct kvm *kvm, return; update_gfn_track(slot, gfn, mode, -1); - - /* - * allow large page mapping for the tracked page - * after the tracker is gone. 
- */ - kvm_mmu_gfn_allow_lpage(slot, gfn); } /* @@ -144,6 +133,7 @@ bool kvm_page_track_is_active(struct kvm_vcpu *vcpu, gfn_t gfn, { struct kvm_memory_slot *slot; int index; + unsigned short temp; if (WARN_ON(!page_track_mode_is_valid(mode))) return false; @@ -152,8 +142,9 @@ bool kvm_page_track_is_active(struct kvm_vcpu *vcpu, gfn_t gfn, if (!slot) return false; - index = gfn_to_index(gfn, slot->base_gfn, PT_PAGE_TABLE_LEVEL); - return !!ACCESS_ONCE(slot->arch.gfn_track[mode][index]); + index = gfn - slot->base_gfn; + ACCESS_ONCE(slot->arch.gfn_track[mode][index], temp); + return !!temp; } void kvm_page_track_init(struct kvm *kvm) @@ -165,6 +156,14 @@ void kvm_page_track_init(struct kvm *kvm) INIT_HLIST_HEAD(&head->track_notifier_list); } +void kvm_page_track_destroy(struct kvm *kvm) +{ + struct kvm_page_track_notifier_head *head; + + head = &kvm->arch.track_notifier_head; + cleanup_srcu_struct(&head->track_srcu); +} + /* * register the notifier so that event interception for the tracked guest * pages can be received. @@ -220,8 +219,10 @@ void kvm_page_track_write(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *new, return; idx = srcu_read_lock(&head->track_srcu); +#define LIST_ENTRY_TYPE_INFO struct kvm_page_track_notifier_node hlist_for_each_entry_rcu(n, &head->track_notifier_list, node) if (n->track_write) n->track_write(vcpu, gpa, new, bytes); +#undef LIST_ENTRY_TYPE_INFO srcu_read_unlock(&head->track_srcu, idx); } diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index a011054..0d5fd47 100644..100755 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -8,6 +8,7 @@ * * Copyright (C) 2006 Qumranet, Inc. * Copyright 2010 Red Hat, Inc. and/or its affiliates. + * Copyright 2019 Google LLC * * Authors: * Yaniv Kamay <yaniv@qumranet.com> @@ -27,9 +28,13 @@ * This is used to catch non optimized PT_GUEST_(DIRTY|ACCESS)_SHIFT macro * uses for EPT without A/D paging type. 
*/ +#if 0 extern u64 __pure __using_nonexistent_pte_bit(void) __compiletime_error("wrong use of PT_GUEST_(DIRTY|ACCESS)_SHIFT"); +#endif +#pragma warning(disable : 4127) +#pragma warning(disable : 4310) #if PTTYPE == 64 #define pt_element_t u64 #define guest_walker guest_walker64 @@ -65,6 +70,7 @@ extern u64 __pure __using_nonexistent_pte_bit(void) #define PT_GUEST_DIRTY_SHIFT PT_DIRTY_SHIFT #define PT_GUEST_ACCESSED_SHIFT PT_ACCESSED_SHIFT #define CMPXCHG cmpxchg +#if 0 #elif PTTYPE == PTTYPE_EPT #define pt_element_t u64 #define guest_walker guest_walkerEPT @@ -80,6 +86,7 @@ extern u64 __pure __using_nonexistent_pte_bit(void) #define PT_GUEST_ACCESSED_SHIFT __using_nonexistent_pte_bit() #define CMPXCHG cmpxchg64 #define PT_MAX_FULL_LEVELS 4 +#endif #else #error Invalid PTTYPE value #endif @@ -119,8 +126,6 @@ static inline void FNAME(protect_clean_gpte)(unsigned *access, unsigned gpte) if (!PT_GUEST_DIRTY_MASK) return; - BUILD_BUG_ON(PT_WRITABLE_MASK != ACC_WRITE_MASK); - mask = (unsigned)~ACC_WRITE_MASK; /* Allow write access to dirty gptes */ mask |= (gpte >> (PT_GUEST_DIRTY_SHIFT - PT_WRITABLE_SHIFT)) & @@ -128,7 +133,7 @@ static inline void FNAME(protect_clean_gpte)(unsigned *access, unsigned gpte) *access &= mask; } -static inline int FNAME(is_present_gpte)(unsigned long pte) +static inline int FNAME(is_present_gpte)(size_t pte) { #if PTTYPE != PTTYPE_EPT return pte & PT_PRESENT_MASK; @@ -144,18 +149,20 @@ static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, int npages; pt_element_t ret; pt_element_t *table; - struct page *page; + PMDL kmap_mdl; - npages = get_user_pages_fast((unsigned long)ptep_user, 1, 1, &page); + npages = get_user_pages_fast((size_t)ptep_user, 1, 1, &kmap_mdl); /* Check if the user is doing something meaningless. */ if (unlikely(npages != 1)) return -EFAULT; - table = kmap_atomic(page); + table = kmap_atomic(kmap_mdl); + if (!table) + return -EFAULT; ret = CMPXCHG(&table[index], orig_pte, new_pte); - kunmap_atomic(table); + kunmap_atomic(kmap_mdl); - kvm_release_page_dirty(page); + kvm_release_page(kmap_mdl); return (ret != orig_pte); } @@ -195,8 +202,6 @@ static inline unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, u64 gpte) ((gpte & VMX_EPT_EXECUTABLE_MASK) ? ACC_EXEC_MASK : 0) | ((gpte & VMX_EPT_READABLE_MASK) ? ACC_USER_MASK : 0); #else - BUILD_BUG_ON(ACC_EXEC_MASK != PT_PRESENT_MASK); - BUILD_BUG_ON(ACC_EXEC_MASK != 1); access = gpte & (PT_WRITABLE_MASK | PT_USER_MASK | PT_PRESENT_MASK); /* Combine NX with P (which is set here) to get ACC_EXEC_MASK. 
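In cmpxchg_gpte() the guest PTE page is now described by an MDL (PMDL) rather than a struct page, and kmap_atomic()/kunmap_atomic()/kvm_release_page() operate on that MDL; the added NULL check exists because mapping an MDL into system space can fail. The helpers themselves are defined elsewhere in the patch; one plausible reading (an assumption, not the project's code) is a thin wrapper over the WDK mapping call:

/* Assumption: kmap_atomic() maps the MDL's locked pages into system
 * address space; it can return NULL under resource pressure, hence the
 * -EFAULT path above. */
#include <wdm.h>

static void *kmap_atomic_sketch(PMDL mdl)
{
        return MmGetSystemAddressForMdlSafe(mdl, NormalPagePriority);
}

The matching kunmap_atomic() can then be close to a no-op, since a mapping obtained this way is released when the MDL's pages are later unlocked.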
*/ access ^= (gpte >> PT64_NX_SHIFT); @@ -226,12 +231,10 @@ static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu, ptep_user = walker->ptep_user[level - 1]; index = offset_in_page(ptep_user) / sizeof(pt_element_t); if (!(pte & PT_GUEST_ACCESSED_MASK)) { - trace_kvm_mmu_set_accessed_bit(table_gfn, index, sizeof(pte)); pte |= PT_GUEST_ACCESSED_MASK; } if (level == walker->level && write_fault && !(pte & PT_GUEST_DIRTY_MASK)) { - trace_kvm_mmu_set_dirty_bit(table_gfn, index, sizeof(pte)); pte |= PT_GUEST_DIRTY_MASK; } if (pte == orig_pte) @@ -266,11 +269,13 @@ static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu, static inline unsigned FNAME(gpte_pkeys)(struct kvm_vcpu *vcpu, u64 gpte) { unsigned pkeys = 0; +#if 0 #if PTTYPE == 64 pte_t pte = {.pte = gpte}; pkeys = pte_flags_pkey(pte_flags(pte)); #endif +#endif return pkeys; } @@ -283,7 +288,7 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker, { int ret; pt_element_t pte; - pt_element_t __user *uninitialized_var(ptep_user); + pt_element_t __user *ptep_user; gfn_t table_gfn; unsigned index, pt_access, pte_access, accessed_dirty, pte_pkey; gpa_t pte_gpa; @@ -295,7 +300,6 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker, gpa_t real_gpa; gfn_t gfn; - trace_kvm_mmu_pagetable_walk(addr, access); retry_walk: walker->level = mmu->root_level; pte = mmu->get_cr3(vcpu); @@ -303,7 +307,6 @@ retry_walk: #if PTTYPE == 64 if (walker->level == PT32E_ROOT_LEVEL) { pte = mmu->get_pdptr(vcpu, (addr >> 30) & 3); - trace_kvm_mmu_paging_element(pte, walker->level); if (!FNAME(is_present_gpte)(pte)) goto error; --walker->level; @@ -318,7 +321,7 @@ retry_walk: do { gfn_t real_gfn; - unsigned long host_addr; + size_t host_addr; pt_access &= pte_access; --walker->level; @@ -355,13 +358,11 @@ retry_walk: if (unlikely(kvm_is_error_hva(host_addr))) goto error; - ptep_user = (pt_element_t __user *)((void *)host_addr + offset); + ptep_user = (pt_element_t __user *)((char *)host_addr + offset); if (unlikely(__copy_from_user(&pte, ptep_user, sizeof(pte)))) goto error; walker->ptep_user[walker->level - 1] = ptep_user; - trace_kvm_mmu_paging_element(pte, walker->level); - if (unlikely(!FNAME(is_present_gpte)(pte))) goto error; @@ -449,7 +450,6 @@ error: walker->fault.address = addr; walker->fault.nested_page_fault = mmu != vcpu->arch.walk_mmu; - trace_kvm_mmu_walker_error(walker->fault.error_code); return 0; } @@ -546,7 +546,7 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw, return; if (sp->role.direct) - return __direct_pte_prefetch(vcpu, sp, sptep); + __direct_pte_prefetch(vcpu, sp, sptep); i = (sptep - sp->spt) & ~(PTE_PREFETCH_NUM - 1); spte = sp->spt + i; @@ -571,7 +571,7 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw, static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, struct guest_walker *gw, int write_fault, int hlevel, - kvm_pfn_t pfn, bool map_writable, bool prefault) + kvm_pfn_t pfn, bool map_writable) { struct kvm_mmu_page *sp = NULL; struct kvm_shadow_walk_iterator it; @@ -634,7 +634,7 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, if (is_shadow_present_pte(*it.sptep)) continue; - direct_gfn = gw->gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1); + direct_gfn = gw->gfn; sp = kvm_mmu_get_page(vcpu, direct_gfn, addr, it.level-1, true, direct_access); @@ -643,13 +643,12 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, clear_sp_write_flooding_count(it.sptep); emulate = mmu_set_spte(vcpu, it.sptep, 
gw->pte_access, write_fault, - it.level, gw->gfn, pfn, prefault, map_writable); + it.level, gw->gfn, pfn, false, map_writable); FNAME(pte_prefetch)(vcpu, gw, it.sptep); return emulate; out_gpte_changed: - kvm_release_pfn_clean(pfn); return 0; } @@ -676,7 +675,6 @@ FNAME(is_self_change_mapping)(struct kvm_vcpu *vcpu, bool *write_fault_to_shadow_pgtable) { int level; - gfn_t mask = ~(KVM_PAGES_PER_HPAGE(walker->level) - 1); bool self_changed = false; if (!(walker->pte_access & ACC_WRITE_MASK || @@ -686,7 +684,7 @@ FNAME(is_self_change_mapping)(struct kvm_vcpu *vcpu, for (level = walker->level; level <= walker->max_level; level++) { gfn_t gfn = walker->gfn ^ walker->table_gfn[level - 1]; - self_changed |= !(gfn & mask); + self_changed |= !gfn; *write_fault_to_shadow_pgtable |= !gfn; } @@ -707,8 +705,7 @@ FNAME(is_self_change_mapping)(struct kvm_vcpu *vcpu, * Returns: 1 if we need to emulate the instruction, 0 otherwise, or * a negative value on error. */ -static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, - bool prefault) +static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code) { int write_fault = error_code & PFERR_WRITE_MASK; int user_fault = error_code & PFERR_USER_MASK; @@ -716,8 +713,6 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, int r; kvm_pfn_t pfn; int level = PT_PAGE_TABLE_LEVEL; - bool force_pt_level = false; - unsigned long mmu_seq; bool map_writable, is_self_change_mapping; pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code); @@ -742,8 +737,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, */ if (!r) { pgprintk("%s: guest page fault\n", __func__); - if (!prefault) - inject_page_fault(vcpu, &walker.fault); + inject_page_fault(vcpu, &walker.fault); return 0; } @@ -759,20 +753,13 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, &walker, user_fault, &vcpu->arch.write_fault_to_shadow_pgtable); if (walker.level >= PT_DIRECTORY_LEVEL && !is_self_change_mapping) { - level = mapping_level(vcpu, walker.gfn, &force_pt_level); - if (likely(!force_pt_level)) { - level = min(walker.level, level); - walker.gfn = walker.gfn & ~(KVM_PAGES_PER_HPAGE(level) - 1); - } - } else - force_pt_level = true; + level = mapping_level(vcpu, walker.gfn, NULL); + } - mmu_seq = vcpu->kvm->mmu_notifier_seq; + //mmu_seq = vcpu->kvm->mmu_notifier_seq; smp_rmb(); - if (try_async_pf(vcpu, prefault, walker.gfn, addr, &pfn, write_fault, - &map_writable)) - return 0; + get_pfn(vcpu, walker.gfn, addr, &pfn, write_fault, &map_writable); if (handle_abnormal_pfn(vcpu, mmu_is_nested(vcpu) ? 
0 : addr, walker.gfn, pfn, walker.pte_access, &r)) @@ -799,25 +786,15 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, } spin_lock(&vcpu->kvm->mmu_lock); - if (mmu_notifier_retry(vcpu->kvm, mmu_seq)) - goto out_unlock; - kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT); make_mmu_pages_available(vcpu); - if (!force_pt_level) - transparent_hugepage_adjust(vcpu, &walker.gfn, &pfn, &level); r = FNAME(fetch)(vcpu, addr, &walker, write_fault, - level, pfn, map_writable, prefault); + level, pfn, map_writable); ++vcpu->stat.pf_fixed; kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT); spin_unlock(&vcpu->kvm->mmu_lock); return r; - -out_unlock: - spin_unlock(&vcpu->kvm->mmu_lock); - kvm_release_pfn_clean(pfn); - return 0; } static gpa_t FNAME(get_level1_sp_gpa)(struct kvm_mmu_page *sp) diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index 06ce377..d3937d2 100644..100755 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -2,6 +2,7 @@ * Kernel-based Virtual Machine -- Performance Monitoring Unit support * * Copyright 2015 Red Hat, Inc. and/or its affiliates. + * Copyright 2019 Google LLC * * Authors: * Avi Kivity <avi@redhat.com> @@ -13,6 +14,7 @@ * */ +#if 0 #include <linux/types.h> #include <linux/kvm_host.h> #include <linux/perf_event.h> @@ -63,9 +65,9 @@ static void kvm_perf_overflow(struct perf_event *perf_event, struct kvm_pmu *pmu = pmc_to_pmu(pmc); if (!test_and_set_bit(pmc->idx, - (unsigned long *)&pmu->reprogram_pmi)) { - __set_bit(pmc->idx, (unsigned long *)&pmu->global_status); - kvm_make_request(KVM_REQ_PMU, pmc->vcpu); + (size_t *)&pmu->reprogram_pmi)) { + __set_bit(pmc->idx, (size_t *)&pmu->global_status); + kvm_make_request(GVM_REQ_PMU, pmc->vcpu); } } @@ -77,9 +79,9 @@ static void kvm_perf_overflow_intr(struct perf_event *perf_event, struct kvm_pmu *pmu = pmc_to_pmu(pmc); if (!test_and_set_bit(pmc->idx, - (unsigned long *)&pmu->reprogram_pmi)) { - __set_bit(pmc->idx, (unsigned long *)&pmu->global_status); - kvm_make_request(KVM_REQ_PMU, pmc->vcpu); + (size_t *)&pmu->reprogram_pmi)) { + __set_bit(pmc->idx, (size_t *)&pmu->global_status); + kvm_make_request(GVM_REQ_PMU, pmc->vcpu); /* * Inject PMI. 
If vcpu was in a guest mode during NMI PMI @@ -92,7 +94,7 @@ static void kvm_perf_overflow_intr(struct perf_event *perf_event, if (!kvm_is_in_guest()) irq_work_queue(&pmc_to_pmu(pmc)->irq_work); else - kvm_make_request(KVM_REQ_PMI, pmc->vcpu); + kvm_make_request(GVM_REQ_PMI, pmc->vcpu); } } @@ -130,7 +132,7 @@ static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type, } pmc->perf_event = event; - clear_bit(pmc->idx, (unsigned long*)&pmc_to_pmu(pmc)->reprogram_pmi); + clear_bit(pmc->idx, (size_t*)&pmc_to_pmu(pmc)->reprogram_pmi); } void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel) @@ -173,7 +175,6 @@ void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel) (eventsel & HSW_IN_TX), (eventsel & HSW_IN_TX_CHECKPOINTED)); } -EXPORT_SYMBOL_GPL(reprogram_gp_counter); void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 ctrl, int idx) { @@ -191,7 +192,6 @@ void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 ctrl, int idx) !(en_field & 0x1), /* exclude kernel */ pmi, false, false); } -EXPORT_SYMBOL_GPL(reprogram_fixed_counter); void reprogram_counter(struct kvm_pmu *pmu, int pmc_idx) { @@ -209,7 +209,6 @@ void reprogram_counter(struct kvm_pmu *pmu, int pmc_idx) reprogram_fixed_counter(pmc, ctrl, idx); } } -EXPORT_SYMBOL_GPL(reprogram_counter); void kvm_pmu_handle_event(struct kvm_vcpu *vcpu) { @@ -219,11 +218,11 @@ void kvm_pmu_handle_event(struct kvm_vcpu *vcpu) bitmask = pmu->reprogram_pmi; - for_each_set_bit(bit, (unsigned long *)&bitmask, X86_PMC_IDX_MAX) { + for_each_set_bit(bit, (size_t *)&bitmask, X86_PMC_IDX_MAX) { struct kvm_pmc *pmc = kvm_x86_ops->pmu_ops->pmc_idx_to_pmc(pmu, bit); if (unlikely(!pmc || !pmc->perf_event)) { - clear_bit(bit, (unsigned long *)&pmu->reprogram_pmi); + clear_bit(bit, (size_t *)&pmu->reprogram_pmi); continue; } @@ -307,3 +306,4 @@ void kvm_pmu_destroy(struct kvm_vcpu *vcpu) { kvm_pmu_reset(vcpu); } +#endif diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h index f96e1f9..1025403 100644..100755 --- a/arch/x86/kvm/pmu.h +++ b/arch/x86/kvm/pmu.h @@ -1,6 +1,8 @@ #ifndef __KVM_X86_PMU_H #define __KVM_X86_PMU_H +#if 0 + #define vcpu_to_pmu(vcpu) (&(vcpu)->arch.pmu) #define pmu_to_vcpu(pmu) (container_of((pmu), struct kvm_vcpu, arch.pmu)) #define pmc_to_pmu(pmc) (&(pmc)->vcpu->arch.pmu) @@ -115,4 +117,6 @@ void kvm_pmu_destroy(struct kvm_vcpu *vcpu); extern struct kvm_pmu_ops intel_pmu_ops; extern struct kvm_pmu_ops amd_pmu_ops; +#endif + #endif /* __KVM_X86_PMU_H */ diff --git a/arch/x86/kvm/pmu_amd.c b/arch/x86/kvm/pmu_amd.c index cd94443..5db57c6 100644..100755 --- a/arch/x86/kvm/pmu_amd.c +++ b/arch/x86/kvm/pmu_amd.c @@ -11,6 +11,7 @@ * * Implementation is based on pmu_intel.c file */ +#if 0 #include <linux/types.h> #include <linux/kvm_host.h> #include <linux/perf_event.h> @@ -203,3 +204,4 @@ struct kvm_pmu_ops amd_pmu_ops = { .init = amd_pmu_init, .reset = amd_pmu_reset, }; +#endif diff --git a/arch/x86/kvm/pmu_intel.c b/arch/x86/kvm/pmu_intel.c index 9d4a850..4fb5c5f 100644..100755 --- a/arch/x86/kvm/pmu_intel.c +++ b/arch/x86/kvm/pmu_intel.c @@ -2,6 +2,7 @@ * KVM PMU support for Intel CPUs * * Copyright 2011 Red Hat, Inc. and/or its affiliates. + * Copyright 2019 Google LLC * * Authors: * Avi Kivity <avi@redhat.com> @@ -11,6 +12,7 @@ * the COPYING file in the top-level directory. 
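The recurring (size_t *) casts in the bit helpers above, for_each_set_bit(), test_and_set_bit(), clear_bit() and friends, replace (unsigned long *) because of the LLP64 data model: on 64-bit Windows, long stays 32 bits wide, so walking a u64 bitmap in unsigned long units would only cover half of it, while size_t is pointer-sized. A quick check of that assumption on the target compiler:

/* LLP64 sanity check: on MSVC x64 this prints 4 and 8. */
#include <stdio.h>

int main(void)
{
        printf("sizeof(unsigned long) = %zu\n", sizeof(unsigned long));
        printf("sizeof(size_t)        = %zu\n", sizeof(size_t));
        return 0;
}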
* */ +#if 0 #include <linux/types.h> #include <linux/kvm_host.h> #include <linux/perf_event.h> @@ -63,7 +65,7 @@ static void global_ctrl_changed(struct kvm_pmu *pmu, u64 data) pmu->global_ctrl = data; - for_each_set_bit(bit, (unsigned long *)&diff, X86_PMC_IDX_MAX) + for_each_set_bit(bit, (size_t *)&diff, X86_PMC_IDX_MAX) reprogram_counter(pmu, bit); } @@ -98,7 +100,7 @@ static bool intel_pmc_is_enabled(struct kvm_pmc *pmc) { struct kvm_pmu *pmu = pmc_to_pmu(pmc); - return test_bit(pmc->idx, (unsigned long *)&pmu->global_ctrl); + return test_bit(pmc->idx, (size_t *)&pmu->global_ctrl); } static struct kvm_pmc *intel_pmc_idx_to_pmc(struct kvm_pmu *pmu, int pmc_idx) @@ -356,3 +358,4 @@ struct kvm_pmu_ops intel_pmu_ops = { .init = intel_pmu_init, .reset = intel_pmu_reset, }; +#endif diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 8ca1eca..31fc896 100644..100755 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -5,6 +5,7 @@ * * Copyright (C) 2006 Qumranet, Inc. * Copyright 2010 Red Hat, Inc. and/or its affiliates. + * Copyright 2019 Google LLC * * Authors: * Yaniv Kamay <yaniv@qumranet.com> @@ -26,199 +27,12 @@ #include "cpuid.h" #include "pmu.h" -#include <linux/module.h> -#include <linux/mod_devicetable.h> -#include <linux/kernel.h> -#include <linux/vmalloc.h> -#include <linux/highmem.h> -#include <linux/sched.h> -#include <linux/trace_events.h> -#include <linux/slab.h> -#include <linux/amd-iommu.h> -#include <linux/hashtable.h> - -#include <asm/apic.h> -#include <asm/perf_event.h> -#include <asm/tlbflush.h> -#include <asm/desc.h> -#include <asm/debugreg.h> -#include <asm/kvm_para.h> -#include <asm/irq_remapping.h> - -#include <asm/virtext.h> -#include "trace.h" - -#define __ex(x) __kvm_handle_fault_on_reboot(x) - -MODULE_AUTHOR("Qumranet"); -MODULE_LICENSE("GPL"); - -static const struct x86_cpu_id svm_cpu_id[] = { - X86_FEATURE_MATCH(X86_FEATURE_SVM), - {} -}; -MODULE_DEVICE_TABLE(x86cpu, svm_cpu_id); - -#define IOPM_ALLOC_ORDER 2 -#define MSRPM_ALLOC_ORDER 1 - -#define SEG_TYPE_LDT 2 -#define SEG_TYPE_BUSY_TSS16 3 - -#define SVM_FEATURE_NPT (1 << 0) -#define SVM_FEATURE_LBRV (1 << 1) -#define SVM_FEATURE_SVML (1 << 2) -#define SVM_FEATURE_NRIP (1 << 3) -#define SVM_FEATURE_TSC_RATE (1 << 4) -#define SVM_FEATURE_VMCB_CLEAN (1 << 5) -#define SVM_FEATURE_FLUSH_ASID (1 << 6) -#define SVM_FEATURE_DECODE_ASSIST (1 << 7) -#define SVM_FEATURE_PAUSE_FILTER (1 << 10) - -#define SVM_AVIC_DOORBELL 0xc001011b - -#define NESTED_EXIT_HOST 0 /* Exit handled on host level */ -#define NESTED_EXIT_DONE 1 /* Exit caused nested vmexit */ -#define NESTED_EXIT_CONTINUE 2 /* Further checks needed */ - -#define DEBUGCTL_RESERVED_BITS (~(0x3fULL)) - -#define TSC_RATIO_RSVD 0xffffff0000000000ULL -#define TSC_RATIO_MIN 0x0000000000000001ULL -#define TSC_RATIO_MAX 0x000000ffffffffffULL - -#define AVIC_HPA_MASK ~((0xFFFULL << 52) | 0xFFF) - -/* - * 0xff is broadcast, so the max index allowed for physical APIC ID - * table is 0xfe. APIC IDs above 0xff are reserved. 
- */ -#define AVIC_MAX_PHYSICAL_ID_COUNT 255 - -#define AVIC_UNACCEL_ACCESS_WRITE_MASK 1 -#define AVIC_UNACCEL_ACCESS_OFFSET_MASK 0xFF0 -#define AVIC_UNACCEL_ACCESS_VECTOR_MASK 0xFFFFFFFF - -/* AVIC GATAG is encoded using VM and VCPU IDs */ -#define AVIC_VCPU_ID_BITS 8 -#define AVIC_VCPU_ID_MASK ((1 << AVIC_VCPU_ID_BITS) - 1) - -#define AVIC_VM_ID_BITS 24 -#define AVIC_VM_ID_NR (1 << AVIC_VM_ID_BITS) -#define AVIC_VM_ID_MASK ((1 << AVIC_VM_ID_BITS) - 1) - -#define AVIC_GATAG(x, y) (((x & AVIC_VM_ID_MASK) << AVIC_VCPU_ID_BITS) | \ - (y & AVIC_VCPU_ID_MASK)) -#define AVIC_GATAG_TO_VMID(x) ((x >> AVIC_VCPU_ID_BITS) & AVIC_VM_ID_MASK) -#define AVIC_GATAG_TO_VCPUID(x) (x & AVIC_VCPU_ID_MASK) - -static bool erratum_383_found __read_mostly; - -static const u32 host_save_user_msrs[] = { -#ifdef CONFIG_X86_64 - MSR_STAR, MSR_LSTAR, MSR_CSTAR, MSR_SYSCALL_MASK, MSR_KERNEL_GS_BASE, - MSR_FS_BASE, -#endif - MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, - MSR_TSC_AUX, -}; - -#define NR_HOST_SAVE_USER_MSRS ARRAY_SIZE(host_save_user_msrs) - -struct kvm_vcpu; - -struct nested_state { - struct vmcb *hsave; - u64 hsave_msr; - u64 vm_cr_msr; - u64 vmcb; - - /* These are the merged vectors */ - u32 *msrpm; - - /* gpa pointers to the real vectors */ - u64 vmcb_msrpm; - u64 vmcb_iopm; - - /* A VMEXIT is required but not yet emulated */ - bool exit_required; - - /* cache for intercepts of the guest */ - u32 intercept_cr; - u32 intercept_dr; - u32 intercept_exceptions; - u64 intercept; - - /* Nested Paging related state */ - u64 nested_cr3; -}; - -#define MSRPM_OFFSETS 16 -static u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly; - -/* - * Set osvw_len to higher value when updated Revision Guides - * are published and we know what the new status bits are - */ -static uint64_t osvw_len = 4, osvw_status; - -struct vcpu_svm { - struct kvm_vcpu vcpu; - struct vmcb *vmcb; - unsigned long vmcb_pa; - struct svm_cpu_data *svm_data; - uint64_t asid_generation; - uint64_t sysenter_esp; - uint64_t sysenter_eip; - uint64_t tsc_aux; - - u64 next_rip; - - u64 host_user_msrs[NR_HOST_SAVE_USER_MSRS]; - struct { - u16 fs; - u16 gs; - u16 ldt; - u64 gs_base; - } host; +#include <asm/svm.h> +#include <asm/vmx.h> - u32 *msrpm; - - ulong nmi_iret_rip; - - struct nested_state nested; - - bool nmi_singlestep; - - unsigned int3_injected; - unsigned long int3_rip; - u32 apf_reason; - - /* cached guest cpuid flags for faster access */ - bool nrips_enabled : 1; - - u32 ldr_reg; - struct page *avic_backing_page; - u64 *avic_physical_id_cache; - bool avic_is_running; - - /* - * Per-vcpu list of struct amd_svm_iommu_ir: - * This is used mainly to store interrupt remapping information used - * when update the vcpu affinity. This avoids the need to scan for - * IRTE and try to match ga_tag in the IOMMU driver. - */ - struct list_head ir_list; - spinlock_t ir_list_lock; -}; - -/* - * This is a wrapper of struct amd_iommu_ir_data. 
- */ -struct amd_svm_iommu_ir { - struct list_head node; /* Used by SVM for per-vcpu ir_list */ - void *data; /* Storing pointer to struct amd_ir_data */ -}; +#include <__asm.h> +//seperate definitions to svm_def.h for asmgen +#include "svm_def.h" #define AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK (0xFF) #define AVIC_LOGICAL_ID_ENTRY_VALID_MASK (1 << 31) @@ -228,9 +42,6 @@ struct amd_svm_iommu_ir { #define AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK (1ULL << 62) #define AVIC_PHYSICAL_ID_ENTRY_VALID_MASK (1ULL << 63) -static DEFINE_PER_CPU(u64, current_tsc_ratio); -#define TSC_RATIO_DEFAULT 0x0100000000ULL - #define MSR_INVALID 0xffffffffU static const struct svm_direct_access_msrs { @@ -263,23 +74,18 @@ static bool npt_enabled; /* allow nested paging (virtualized MMU) for all guests */ static int npt = true; -module_param(npt, int, S_IRUGO); -/* allow nested virtualization in KVM/SVM */ -static int nested = true; -module_param(nested, int, S_IRUGO); +/* allow nested virtualization in kvm/SVM */ +static int nested = false; /* enable / disable AVIC */ static int avic; -#ifdef CONFIG_X86_LOCAL_APIC -module_param(avic, int, S_IRUGO); -#endif /* AVIC VM ID bit masks and lock */ static DECLARE_BITMAP(avic_vm_id_bitmap, AVIC_VM_ID_NR); static DEFINE_SPINLOCK(avic_vm_id_lock); -static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0); +static void svm_set_cr0(struct kvm_vcpu *vcpu, size_t cr0); static void svm_flush_tlb(struct kvm_vcpu *vcpu); static void svm_complete_interrupts(struct vcpu_svm *svm); @@ -345,11 +151,13 @@ static inline bool avic_vcpu_is_running(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); u64 *entry = svm->avic_physical_id_cache; + u64 temp; if (!entry) return false; - return (READ_ONCE(*entry) & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK); + READ_ONCE(*entry, temp); + return temp & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK; } static void recalc_intercepts(struct vcpu_svm *svm) @@ -489,8 +297,10 @@ static inline bool gif_set(struct vcpu_svm *svm) return !!(svm->vcpu.arch.hflags & HF_GIF_MASK); } -static unsigned long iopm_base; +static size_t iopm_base; +static void *iopm_va; +#pragma pack(push, 1) struct kvm_ldttss_desc { u16 limit0; u16 base0; @@ -498,7 +308,8 @@ struct kvm_ldttss_desc { unsigned limit1:4, zero0:3, g:1, base2:8; u32 base3; u32 zero1; -} __attribute__((packed)); +}; +#pragma pack(pop) struct svm_cpu_data { int cpu; @@ -549,17 +360,17 @@ static u32 svm_msrpm_offset(u32 msr) static inline void clgi(void) { - asm volatile (__ex(SVM_CLGI)); + __svm_clgi(); } static inline void stgi(void) { - asm volatile (__ex(SVM_STGI)); + __svm_stgi(); } -static inline void invlpga(unsigned long addr, u32 asid) +static inline void invlpga(size_t addr, u32 asid) { - asm volatile (__ex(SVM_INVLPGA) : : "a"(addr), "c"(asid)); + __svm_invlpga((void *)addr, asid); } static int get_npt_level(void) @@ -593,7 +404,7 @@ static u32 svm_get_interrupt_shadow(struct kvm_vcpu *vcpu) u32 ret = 0; if (svm->vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) - ret = KVM_X86_SHADOW_INT_STI | KVM_X86_SHADOW_INT_MOV_SS; + ret = GVM_X86_SHADOW_INT_STI | GVM_X86_SHADOW_INT_MOV_SS; return ret; } @@ -646,7 +457,7 @@ static void svm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, return; if (nr == BP_VECTOR && !static_cpu_has(X86_FEATURE_NRIPS)) { - unsigned long rip, old_rip = kvm_rip_read(&svm->vcpu); + size_t rip, old_rip = kvm_rip_read(&svm->vcpu); /* * For guest debugging where we have to reinject #BP if some @@ -668,15 +479,28 @@ static void svm_queue_exception(struct kvm_vcpu 
*vcpu, unsigned nr, svm->vmcb->control.event_inj_err = error_code; } +/** +* upper_32_bits - return bits 32-63 of a number +* @n: the number we're accessing +* +* A basic shift-right of a 64- or 32-bit quantity. Use this to suppress +* the "right shift count >= width of type" warning when that quantity is +* 32-bits. +*/ +#define upper_32_bits(n) ((u32)(((n) >> 16) >> 16)) + +/** +* lower_32_bits - return bits 0-31 of a number
+* @n: the number we're accessing
+*/ +#define lower_32_bits(n) ((u32)(n)) + static void svm_init_erratum_383(void) { u32 low, high; int err; u64 val; - if (!static_cpu_has_bug(X86_BUG_AMD_TLB_MMATCH)) - return; - /* Use _safe variants to not break nested virtualization */ val = native_read_msr_safe(MSR_AMD64_DC_CFG, &err); if (err) @@ -715,25 +539,23 @@ static void svm_init_osvw(struct kvm_vcpu *vcpu) static int has_svm(void) { - const char *msg; + return static_cpu_has(X86_FEATURE_SVM); +} - if (!cpu_has_svm(&msg)) { - printk(KERN_INFO "has_svm: %s\n", msg); - return 0; - } +static inline void cpu_svm_disable(void) +{ + uint64_t efer; - return 1; + wrmsrl(MSR_VM_HSAVE_PA, 0); + rdmsrl(MSR_EFER, efer); + wrmsrl(MSR_EFER, efer & ~EFER_SVME); } static void svm_hardware_disable(void) { - /* Make sure we clean up behind us */ - if (static_cpu_has(X86_FEATURE_TSCRATEMSR)) - wrmsrl(MSR_AMD64_TSC_RATIO, TSC_RATIO_DEFAULT); - cpu_svm_disable(); - amd_pmu_disable_virt(); + //amd_pmu_disable_virt(); } static int svm_hardware_enable(void) @@ -771,11 +593,6 @@ static int svm_hardware_enable(void) wrmsrl(MSR_VM_HSAVE_PA, page_to_pfn(sd->save_area) << PAGE_SHIFT); - if (static_cpu_has(X86_FEATURE_TSCRATEMSR)) { - wrmsrl(MSR_AMD64_TSC_RATIO, TSC_RATIO_DEFAULT); - __this_cpu_write(current_tsc_ratio, TSC_RATIO_DEFAULT); - } - /* * Get OSVW bits. @@ -808,7 +625,7 @@ static int svm_hardware_enable(void) svm_init_erratum_383(); - amd_pmu_enable_virt(); + //amd_pmu_enable_virt(); return 0; } @@ -864,7 +681,7 @@ static void set_msr_interception(u32 *msrpm, unsigned msr, int read, int write) { u8 bit_read, bit_write; - unsigned long tmp; + size_t tmp; u32 offset; /* @@ -965,70 +782,17 @@ static void svm_disable_lbrv(struct vcpu_svm *svm) set_msr_interception(msrpm, MSR_IA32_LASTINTTOIP, 0, 0); } -/* Note: - * This hash table is used to map VM_ID to a struct kvm_arch, - * when handling AMD IOMMU GALOG notification to schedule in - * a particular vCPU. - */ -#define SVM_VM_DATA_HASH_BITS 8 -DECLARE_HASHTABLE(svm_vm_data_hash, SVM_VM_DATA_HASH_BITS); -static spinlock_t svm_vm_data_hash_lock; - -/* Note: - * This function is called from IOMMU driver to notify - * SVM to schedule in a particular vCPU of a particular VM. - */ -static int avic_ga_log_notifier(u32 ga_tag) -{ - unsigned long flags; - struct kvm_arch *ka = NULL; - struct kvm_vcpu *vcpu = NULL; - u32 vm_id = AVIC_GATAG_TO_VMID(ga_tag); - u32 vcpu_id = AVIC_GATAG_TO_VCPUID(ga_tag); - - pr_debug("SVM: %s: vm_id=%#x, vcpu_id=%#x\n", __func__, vm_id, vcpu_id); - - spin_lock_irqsave(&svm_vm_data_hash_lock, flags); - hash_for_each_possible(svm_vm_data_hash, ka, hnode, vm_id) { - struct kvm *kvm = container_of(ka, struct kvm, arch); - struct kvm_arch *vm_data = &kvm->arch; - - if (vm_data->avic_vm_id != vm_id) - continue; - vcpu = kvm_get_vcpu_by_id(kvm, vcpu_id); - break; - } - spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags); - - if (!vcpu) - return 0; - - /* Note: - * At this point, the IOMMU should have already set the pending - * bit in the vAPIC backing page. So, we just need to schedule - * in the vcpu. 
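A few hunks back, clgi(), stgi() and invlpga() became calls to __svm_clgi()/__svm_stgi()/__svm_invlpga() in place of GCC inline assembly, which MSVC does not support on x64. Those names match the SVM compiler intrinsics MSVC declares in <intrin.h>; whether the driver uses the compiler intrinsics or equivalent stubs generated through __asm.h, the calling shape is the same. A sketch under the intrinsics assumption:

/* Assumes the <intrin.h> SVM intrinsics back the wrappers above. */
#include <intrin.h>

static void example_gif_window(void *va, int asid)
{
        __svm_clgi();            /* clear GIF: block physical interrupts   */
        __svm_invlpga(va, asid); /* flush one guest TLB entry for this ASID */
        __svm_stgi();            /* set GIF again                          */
}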
- */ - if (vcpu->mode == OUTSIDE_GUEST_MODE) - kvm_vcpu_wake_up(vcpu); - - return 0; -} - -static __init int svm_hardware_setup(void) +static int svm_hardware_setup(void) { int cpu; - struct page *iopm_pages; - void *iopm_va; int r; + PHYSICAL_ADDRESS max_phys = { .QuadPart = MAXULONG64 }; - iopm_pages = alloc_pages(GFP_KERNEL, IOPM_ALLOC_ORDER); - - if (!iopm_pages) - return -ENOMEM; - - iopm_va = page_address(iopm_pages); + iopm_va = MmAllocateContiguousMemory(PAGE_SIZE * (1 << IOPM_ALLOC_ORDER), max_phys); + if (!iopm_va) + return ENOMEM; memset(iopm_va, 0xff, PAGE_SIZE * (1 << IOPM_ALLOC_ORDER)); - iopm_base = page_to_pfn(iopm_pages) << PAGE_SHIFT; + iopm_base = MmGetPhysicalAddress(iopm_va).QuadPart; init_msrpm_offsets(); @@ -1038,12 +802,6 @@ static __init int svm_hardware_setup(void) if (boot_cpu_has(X86_FEATURE_FXSR_OPT)) kvm_enable_efer_bits(EFER_FFXSR); - if (boot_cpu_has(X86_FEATURE_TSCRATEMSR)) { - kvm_has_tsc_control = true; - kvm_max_tsc_scaling_ratio = TSC_RATIO_MAX; - kvm_tsc_scaling_ratio_frac_bits = 32; - } - if (nested) { printk(KERN_INFO "kvm: Nested Virtualization enabled\n"); kvm_enable_efer_bits(EFER_SVME | EFER_LMSLE); @@ -1071,34 +829,31 @@ static __init int svm_hardware_setup(void) if (avic) { if (!npt_enabled || - !boot_cpu_has(X86_FEATURE_AVIC) || - !IS_ENABLED(CONFIG_X86_LOCAL_APIC)) { + !boot_cpu_has(X86_FEATURE_AVIC)) { avic = false; } else { pr_info("AVIC enabled\n"); - - hash_init(svm_vm_data_hash); - spin_lock_init(&svm_vm_data_hash_lock); - amd_iommu_register_ga_log_notifier(&avic_ga_log_notifier); } } return 0; err: - __free_pages(iopm_pages, IOPM_ALLOC_ORDER); + MmFreeContiguousMemory(iopm_va); + iopm_va = NULL; iopm_base = 0; return r; } -static __exit void svm_hardware_unsetup(void) +static void svm_hardware_unsetup(void) { int cpu; for_each_possible_cpu(cpu) svm_cpu_uninit(cpu); - __free_pages(pfn_to_page(iopm_base >> PAGE_SHIFT), IOPM_ALLOC_ORDER); + MmFreeContiguousMemory(iopm_va); + iopm_va = NULL; iopm_base = 0; } @@ -1128,10 +883,7 @@ static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) g_tsc_offset = svm->vmcb->control.tsc_offset - svm->nested.hsave->control.tsc_offset; svm->nested.hsave->control.tsc_offset = offset; - } else - trace_kvm_write_tsc_offset(vcpu->vcpu_id, - svm->vmcb->control.tsc_offset, - offset); + } svm->vmcb->control.tsc_offset = offset + g_tsc_offset; @@ -1159,7 +911,6 @@ static void init_vmcb(struct vcpu_svm *svm) struct vmcb_control_area *control = &svm->vmcb->control; struct vmcb_save_area *save = &svm->vmcb->save; - svm->vcpu.fpu_active = 1; svm->vcpu.arch.hflags = 0; set_cr_intercept(svm, INTERCEPT_CR0_READ); @@ -1260,11 +1011,6 @@ static void init_vmcb(struct vcpu_svm *svm) svm->nested.vmcb = 0; svm->vcpu.arch.hflags = 0; - if (boot_cpu_has(X86_FEATURE_PAUSEFILTER)) { - control->pause_filter_count = 3000; - set_intercept(svm, INTERCEPT_PAUSE); - } - if (avic) avic_init_vmcb(svm); @@ -1338,7 +1084,7 @@ static int avic_init_backing_page(struct kvm_vcpu *vcpu) if (!entry) return -EINVAL; - new_entry = READ_ONCE(*entry); + READ_ONCE(*entry, new_entry); new_entry = (page_to_phys(svm->avic_backing_page) & AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK) | AVIC_PHYSICAL_ID_ENTRY_VALID_MASK; @@ -1379,7 +1125,6 @@ static inline int avic_free_vm_id(int id) static void avic_vm_destroy(struct kvm *kvm) { - unsigned long flags; struct kvm_arch *vm_data = &kvm->arch; avic_free_vm_id(vm_data->avic_vm_id); @@ -1388,15 +1133,10 @@ static void avic_vm_destroy(struct kvm *kvm) __free_page(vm_data->avic_logical_id_table_page); if 
(vm_data->avic_physical_id_table_page) __free_page(vm_data->avic_physical_id_table_page); - - spin_lock_irqsave(&svm_vm_data_hash_lock, flags); - hash_del(&vm_data->hnode); - spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags); } static int avic_vm_init(struct kvm *kvm) { - unsigned long flags; int vm_id, err = -ENOMEM; struct kvm_arch *vm_data = &kvm->arch; struct page *p_page; @@ -1426,10 +1166,6 @@ static int avic_vm_init(struct kvm *kvm) vm_data->avic_logical_id_table_page = l_page; clear_page(page_address(l_page)); - spin_lock_irqsave(&svm_vm_data_hash_lock, flags); - hash_add(svm_vm_data_hash, &vm_data->hnode, vm_data->avic_vm_id); - spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags); - return 0; free_avic: @@ -1437,36 +1173,6 @@ free_avic: return err; } -static inline int -avic_update_iommu_vcpu_affinity(struct kvm_vcpu *vcpu, int cpu, bool r) -{ - int ret = 0; - unsigned long flags; - struct amd_svm_iommu_ir *ir; - struct vcpu_svm *svm = to_svm(vcpu); - - if (!kvm_arch_has_assigned_device(vcpu->kvm)) - return 0; - - /* - * Here, we go through the per-vcpu ir_list to update all existing - * interrupt remapping table entry targeting this vcpu. - */ - spin_lock_irqsave(&svm->ir_list_lock, flags); - - if (list_empty(&svm->ir_list)) - goto out; - - list_for_each_entry(ir, &svm->ir_list, node) { - ret = amd_iommu_update_ga(cpu, r, ir->data); - if (ret) - break; - } -out: - spin_unlock_irqrestore(&svm->ir_list_lock, flags); - return ret; -} - static void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { u64 entry; @@ -1480,7 +1186,7 @@ static void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu) if (WARN_ON(h_physical_id >= AVIC_MAX_PHYSICAL_ID_COUNT)) return; - entry = READ_ONCE(*(svm->avic_physical_id_cache)); + READ_ONCE(*(svm->avic_physical_id_cache), entry); WARN_ON(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK); entry &= ~AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK; @@ -1491,8 +1197,6 @@ static void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu) entry |= AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK; WRITE_ONCE(*(svm->avic_physical_id_cache), entry); - avic_update_iommu_vcpu_affinity(vcpu, h_physical_id, - svm->avic_is_running); } static void avic_vcpu_put(struct kvm_vcpu *vcpu) @@ -1503,10 +1207,7 @@ static void avic_vcpu_put(struct kvm_vcpu *vcpu) if (!kvm_vcpu_apicv_active(vcpu)) return; - entry = READ_ONCE(*(svm->avic_physical_id_cache)); - if (entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK) - avic_update_iommu_vcpu_affinity(vcpu, -1, 0); - + READ_ONCE(*(svm->avic_physical_id_cache), entry); entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK; WRITE_ONCE(*(svm->avic_physical_id_cache), entry); } @@ -1550,12 +1251,13 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) { struct vcpu_svm *svm; struct page *page; - struct page *msrpm_pages; struct page *hsave_page; - struct page *nested_msrpm_pages; + void *msrpm_va; + void *nested_msrpm_va; int err; + PHYSICAL_ADDRESS max_phys = { .QuadPart = MAXULONG64 }; - svm = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL); + svm = kzalloc_fast(sizeof(struct vcpu_svm), GFP_KERNEL); if (!svm) { err = -ENOMEM; goto out; @@ -1570,12 +1272,12 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) if (!page) goto uninit; - msrpm_pages = alloc_pages(GFP_KERNEL, MSRPM_ALLOC_ORDER); - if (!msrpm_pages) + msrpm_va = MmAllocateContiguousMemory(PAGE_SIZE * (1 << MSRPM_ALLOC_ORDER), max_phys); + if (!msrpm_va) goto free_page1; - nested_msrpm_pages = alloc_pages(GFP_KERNEL, MSRPM_ALLOC_ORDER); - if 
(!nested_msrpm_pages) + nested_msrpm_va = MmAllocateContiguousMemory(PAGE_SIZE * (1 << MSRPM_ALLOC_ORDER), max_phys); + if (!nested_msrpm_va) goto free_page2; hsave_page = alloc_page(GFP_KERNEL); @@ -1586,9 +1288,6 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) err = avic_init_backing_page(&svm->vcpu); if (err) goto free_page4; - - INIT_LIST_HEAD(&svm->ir_list); - spin_lock_init(&svm->ir_list_lock); } /* We initialize this flag to true to make sure that the is_running @@ -1598,10 +1297,10 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) svm->nested.hsave = page_address(hsave_page); - svm->msrpm = page_address(msrpm_pages); + svm->msrpm = msrpm_va; svm_vcpu_init_msrpm(svm->msrpm); - svm->nested.msrpm = page_address(nested_msrpm_pages); + svm->nested.msrpm = nested_msrpm_va; svm_vcpu_init_msrpm(svm->nested.msrpm); svm->vmcb = page_address(page); @@ -1617,15 +1316,15 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) free_page4: __free_page(hsave_page); free_page3: - __free_pages(nested_msrpm_pages, MSRPM_ALLOC_ORDER); + MmFreeContiguousMemory(nested_msrpm_va); free_page2: - __free_pages(msrpm_pages, MSRPM_ALLOC_ORDER); + MmFreeContiguousMemory(msrpm_va); free_page1: __free_page(page); uninit: kvm_vcpu_uninit(&svm->vcpu); free_svm: - kmem_cache_free(kvm_vcpu_cache, svm); + kfree(svm); out: return ERR_PTR(err); } @@ -1635,16 +1334,27 @@ static void svm_free_vcpu(struct kvm_vcpu *vcpu) struct vcpu_svm *svm = to_svm(vcpu); __free_page(pfn_to_page(svm->vmcb_pa >> PAGE_SHIFT)); - __free_pages(virt_to_page(svm->msrpm), MSRPM_ALLOC_ORDER); + MmFreeContiguousMemory(svm->msrpm); __free_page(virt_to_page(svm->nested.hsave)); - __free_pages(virt_to_page(svm->nested.msrpm), MSRPM_ALLOC_ORDER); + MmFreeContiguousMemory(svm->nested.msrpm); kvm_vcpu_uninit(vcpu); - kmem_cache_free(kvm_vcpu_cache, svm); + kfree(svm); } static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { + avic_vcpu_load(vcpu, cpu); +} + +static void svm_vcpu_put(struct kvm_vcpu *vcpu) +{ + avic_vcpu_put(vcpu); +} + +static void svm_save_host_state(struct kvm_vcpu *vcpu) +{ struct vcpu_svm *svm = to_svm(vcpu); + int cpu = smp_processor_id(); int i; if (unlikely(cpu != vcpu->cpu)) { @@ -1661,38 +1371,17 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu) for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++) rdmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]); - - if (static_cpu_has(X86_FEATURE_TSCRATEMSR)) { - u64 tsc_ratio = vcpu->arch.tsc_scaling_ratio; - if (tsc_ratio != __this_cpu_read(current_tsc_ratio)) { - __this_cpu_write(current_tsc_ratio, tsc_ratio); - wrmsrl(MSR_AMD64_TSC_RATIO, tsc_ratio); - } - } - /* This assumes that the kernel never uses MSR_TSC_AUX */ - if (static_cpu_has(X86_FEATURE_RDTSCP)) - wrmsrl(MSR_TSC_AUX, svm->tsc_aux); - - avic_vcpu_load(vcpu, cpu); } -static void svm_vcpu_put(struct kvm_vcpu *vcpu) +static void svm_load_host_state(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); int i; - avic_vcpu_put(vcpu); - - ++vcpu->stat.host_state_reload; kvm_load_ldt(svm->host.ldt); #ifdef CONFIG_X86_64 loadsegment(fs, svm->host.fs); - wrmsrl(MSR_KERNEL_GS_BASE, current->thread.gsbase); load_gs_index(svm->host.gs); -#else -#ifdef CONFIG_X86_32_LAZY_GS - loadsegment(gs, svm->host.gs); -#endif #endif for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++) wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]); @@ -1708,12 +1397,12 @@ static void svm_vcpu_unblocking(struct kvm_vcpu *vcpu) avic_set_running(vcpu, true); } -static 
unsigned long svm_get_rflags(struct kvm_vcpu *vcpu) +static size_t svm_get_rflags(struct kvm_vcpu *vcpu) { return to_svm(vcpu)->vmcb->save.rflags; } -static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) +static void svm_set_rflags(struct kvm_vcpu *vcpu, size_t rflags) { /* * Any change of EFLAGS.VM is accompanied by a reload of SS @@ -1723,11 +1412,6 @@ static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) to_svm(vcpu)->vmcb->save.rflags = rflags; } -static u32 svm_get_pkru(struct kvm_vcpu *vcpu) -{ - return 0; -} - static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg) { switch (reg) { @@ -1765,7 +1449,6 @@ static struct vmcb_seg *svm_seg(struct kvm_vcpu *vcpu, int seg) case VCPU_SREG_LDTR: return &save->ldtr; } BUG(); - return NULL; } static u64 svm_get_segment_base(struct kvm_vcpu *vcpu, int seg) @@ -1796,7 +1479,7 @@ static void svm_get_segment(struct kvm_vcpu *vcpu, * However, the SVM spec states that the G bit is not observed by the * CPU, and some VMware virtual CPUs drop the G bit for all segments. * So let's synthesize a legal G bit for all segments, this helps - * running KVM nested. It also helps cross-vendor migration, because + * running kvm nested. It also helps cross-vendor migration, because * Intel's vmentry has a check on the 'G' bit. */ var->g = s->limit > 0xfffff; @@ -1901,15 +1584,12 @@ static void update_cr0_intercept(struct vcpu_svm *svm) ulong gcr0 = svm->vcpu.arch.cr0; u64 *hcr0 = &svm->vmcb->save.cr0; - if (!svm->vcpu.fpu_active) - *hcr0 |= SVM_CR0_SELECTIVE_MASK; - else - *hcr0 = (*hcr0 & ~SVM_CR0_SELECTIVE_MASK) - | (gcr0 & SVM_CR0_SELECTIVE_MASK); + *hcr0 = (*hcr0 & ~SVM_CR0_SELECTIVE_MASK) + | (gcr0 & SVM_CR0_SELECTIVE_MASK); mark_dirty(svm->vmcb, VMCB_CR); - if (gcr0 == *hcr0 && svm->vcpu.fpu_active) { + if (gcr0 == *hcr0) { clr_cr_intercept(svm, INTERCEPT_CR0_READ); clr_cr_intercept(svm, INTERCEPT_CR0_WRITE); } else { @@ -1918,7 +1598,7 @@ static void update_cr0_intercept(struct vcpu_svm *svm) } } -static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) +static void svm_set_cr0(struct kvm_vcpu *vcpu, size_t cr0) { struct vcpu_svm *svm = to_svm(vcpu); @@ -1940,24 +1620,22 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) if (!npt_enabled) cr0 |= X86_CR0_PG | X86_CR0_WP; - if (!vcpu->fpu_active) - cr0 |= X86_CR0_TS; /* * re-enable caching here because the QEMU bios * does not do it - this results in some delay at * reboot */ - if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED)) + if (kvm_check_has_quirk(vcpu->kvm, GVM_X86_QUIRK_CD_NW_CLEARED)) cr0 &= ~(X86_CR0_CD | X86_CR0_NW); svm->vmcb->save.cr0 = cr0; mark_dirty(svm->vmcb, VMCB_CR); update_cr0_intercept(svm); } -static int svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) +static int svm_set_cr4(struct kvm_vcpu *vcpu, size_t cr4) { - unsigned long host_cr4_mce = cr4_read_shadow() & X86_CR4_MCE; - unsigned long old_cr4 = to_svm(vcpu)->vmcb->save.cr4; + size_t host_cr4_mce = cr4_read_shadow() & X86_CR4_MCE; + size_t old_cr4 = to_svm(vcpu)->vmcb->save.cr4; if (cr4 & X86_CR4_VMXE) return 1; @@ -2014,8 +1692,8 @@ static void update_bp_intercept(struct kvm_vcpu *vcpu) clr_exception_intercept(svm, BP_VECTOR); - if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) { - if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) + if (vcpu->guest_debug & GVM_GUESTDBG_ENABLE) { + if (vcpu->guest_debug & GVM_GUESTDBG_USE_SW_BP) set_exception_intercept(svm, BP_VECTOR); } else vcpu->guest_debug = 0; @@ -2040,7 +1718,7 @@ static u64 
svm_get_dr6(struct kvm_vcpu *vcpu) return to_svm(vcpu)->vmcb->save.dr6; } -static void svm_set_dr6(struct kvm_vcpu *vcpu, unsigned long value) +static void svm_set_dr6(struct kvm_vcpu *vcpu, size_t value) { struct vcpu_svm *svm = to_svm(vcpu); @@ -2059,11 +1737,11 @@ static void svm_sync_dirty_debug_regs(struct kvm_vcpu *vcpu) vcpu->arch.dr6 = svm_get_dr6(vcpu); vcpu->arch.dr7 = svm->vmcb->save.dr7; - vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT; + vcpu->arch.switch_db_regs &= ~GVM_DEBUGREG_WONT_EXIT; set_dr_intercepts(svm); } -static void svm_set_dr7(struct kvm_vcpu *vcpu, unsigned long value) +static void svm_set_dr7(struct kvm_vcpu *vcpu, size_t value) { struct vcpu_svm *svm = to_svm(vcpu); @@ -2077,30 +1755,14 @@ static int pf_interception(struct vcpu_svm *svm) u32 error_code; int r = 1; - switch (svm->apf_reason) { - default: - error_code = svm->vmcb->control.exit_info_1; - - trace_kvm_page_fault(fault_address, error_code); - if (!npt_enabled && kvm_event_needs_reinjection(&svm->vcpu)) - kvm_mmu_unprotect_page_virt(&svm->vcpu, fault_address); - r = kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code, - svm->vmcb->control.insn_bytes, - svm->vmcb->control.insn_len); - break; - case KVM_PV_REASON_PAGE_NOT_PRESENT: - svm->apf_reason = 0; - local_irq_disable(); - kvm_async_pf_task_wait(fault_address); - local_irq_enable(); - break; - case KVM_PV_REASON_PAGE_READY: - svm->apf_reason = 0; - local_irq_disable(); - kvm_async_pf_task_wake(fault_address); - local_irq_enable(); - break; - } + error_code = svm->vmcb->control.exit_info_1; + + if (!npt_enabled && kvm_event_needs_reinjection(&svm->vcpu)) + kvm_mmu_unprotect_page_virt(&svm->vcpu, fault_address); + r = kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code, + svm->vmcb->control.insn_bytes, + svm->vmcb->control.insn_len); + return r; } @@ -2109,7 +1771,7 @@ static int db_interception(struct vcpu_svm *svm) struct kvm_run *kvm_run = svm->vcpu.run; if (!(svm->vcpu.guest_debug & - (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) && + (GVM_GUESTDBG_SINGLESTEP | GVM_GUESTDBG_USE_HW_BP)) && !svm->nmi_singlestep) { kvm_queue_exception(&svm->vcpu, DB_VECTOR); return 1; @@ -2117,14 +1779,14 @@ static int db_interception(struct vcpu_svm *svm) if (svm->nmi_singlestep) { svm->nmi_singlestep = false; - if (!(svm->vcpu.guest_debug & KVM_GUESTDBG_SINGLESTEP)) + if (!(svm->vcpu.guest_debug & GVM_GUESTDBG_SINGLESTEP)) svm->vmcb->save.rflags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF); } if (svm->vcpu.guest_debug & - (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) { - kvm_run->exit_reason = KVM_EXIT_DEBUG; + (GVM_GUESTDBG_SINGLESTEP | GVM_GUESTDBG_USE_HW_BP)) { + kvm_run->exit_reason = GVM_EXIT_DEBUG; kvm_run->debug.arch.pc = svm->vmcb->save.cs.base + svm->vmcb->save.rip; kvm_run->debug.arch.exception = DB_VECTOR; @@ -2138,7 +1800,7 @@ static int bp_interception(struct vcpu_svm *svm) { struct kvm_run *kvm_run = svm->vcpu.run; - kvm_run->exit_reason = KVM_EXIT_DEBUG; + kvm_run->exit_reason = GVM_EXIT_DEBUG; kvm_run->debug.arch.pc = svm->vmcb->save.cs.base + svm->vmcb->save.rip; kvm_run->debug.arch.exception = BP_VECTOR; return 0; @@ -2160,22 +1822,6 @@ static int ac_interception(struct vcpu_svm *svm) return 1; } -static void svm_fpu_activate(struct kvm_vcpu *vcpu) -{ - struct vcpu_svm *svm = to_svm(vcpu); - - clr_exception_intercept(svm, NM_VECTOR); - - svm->vcpu.fpu_active = 1; - update_cr0_intercept(svm); -} - -static int nm_interception(struct vcpu_svm *svm) -{ - svm_fpu_activate(&svm->vcpu); - return 1; -} - static bool 
is_erratum_383(void) { int err, i; @@ -2210,7 +1856,7 @@ static bool is_erratum_383(void) } /* Flush tlb to evict multi-match entries */ - __flush_tlb_all(); + //__flush_tlb_all(); return true; } @@ -2222,9 +1868,9 @@ static void svm_handle_mce(struct vcpu_svm *svm) * Erratum 383 triggered. Guest state is corrupt so kill the * guest. */ - pr_err("KVM: Guest triggered AMD Erratum 383\n"); + pr_err("kvm: Guest triggered AMD Erratum 383\n"); - kvm_make_request(KVM_REQ_TRIPLE_FAULT, &svm->vcpu); + kvm_make_request(GVM_REQ_TRIPLE_FAULT, &svm->vcpu); return; } @@ -2233,8 +1879,7 @@ static void svm_handle_mce(struct vcpu_svm *svm) * On an #MC intercept the MCE handler is not called automatically in * the host. So do it by hand here. */ - asm volatile ( - "int $0x12\n"); + __int12(); /* not sure if we ever come back to this point */ return; @@ -2256,7 +1901,7 @@ static int shutdown_interception(struct vcpu_svm *svm) clear_page(svm->vmcb); init_vmcb(svm); - kvm_run->exit_reason = KVM_EXIT_SHUTDOWN; + kvm_run->exit_reason = GVM_EXIT_SHUTDOWN; return 0; } @@ -2303,13 +1948,7 @@ static int halt_interception(struct vcpu_svm *svm) return kvm_emulate_halt(&svm->vcpu); } -static int vmmcall_interception(struct vcpu_svm *svm) -{ - svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; - return kvm_emulate_hypercall(&svm->vcpu); -} - -static unsigned long nested_svm_get_tdp_cr3(struct kvm_vcpu *vcpu) +static size_t nested_svm_get_tdp_cr3(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -2331,7 +1970,7 @@ static u64 nested_svm_get_tdp_pdptr(struct kvm_vcpu *vcpu, int index) } static void nested_svm_set_tdp_cr3(struct kvm_vcpu *vcpu, - unsigned long root) + size_t root) { struct vcpu_svm *svm = to_svm(vcpu); @@ -2455,7 +2094,6 @@ static inline bool nested_svm_intr(struct vcpu_svm *svm) * the #vmexit here. 
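svm_handle_mce() above forwards the machine check to the host with __int12() in place of asm volatile("int $0x12"). MSVC only exposes intrinsics for a handful of fixed software-interrupt vectors, so an int 0x12 presumably comes from an out-of-line stub produced by the same asm generation as the other __asm.h helpers. For contrast, the vectors that do have intrinsics:

/* The few software-interrupt intrinsics MSVC does provide; vector 0x12
 * (#MC) is not among them, hence the separate __int12() stub. */
#include <intrin.h>

static void example_fixed_vector_intrinsics(void)
{
        __debugbreak(); /* emits int 3  */
        __int2c();      /* emits int 2Ch */
}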
*/ svm->nested.exit_required = true; - trace_kvm_nested_intr_vmexit(svm->vmcb->save.rip); return false; } @@ -2477,30 +2115,44 @@ static inline bool nested_svm_nmi(struct vcpu_svm *svm) return false; } -static void *nested_svm_map(struct vcpu_svm *svm, u64 gpa, struct page **_page) +static void *nested_svm_map(struct vcpu_svm *svm, u64 gpa, PMDL *_mdl) { - struct page *page; + size_t hva; + PMDL mdl; + void *ret; might_sleep(); - page = kvm_vcpu_gfn_to_page(&svm->vcpu, gpa >> PAGE_SHIFT); - if (is_error_page(page)) + hva = gfn_to_hva(svm->vcpu.kvm, gpa >> PAGE_SHIFT); + if (kvm_is_error_hva(hva)) goto error; - *_page = page; + mdl = IoAllocateMdl((void *)hva, PAGE_SIZE, FALSE, FALSE, NULL); + if (!mdl) + goto error; + + MmProbeAndLockPages(mdl, KernelMode, IoWriteAccess); - return kmap(page); + ret = kmap(mdl); + if (!ret) + goto error1; + *_mdl = mdl; + return ret; + +error1: + MmUnlockPages(mdl); + IoFreeMdl(mdl); error: kvm_inject_gp(&svm->vcpu, 0); return NULL; } -static void nested_svm_unmap(struct page *page) +static void nested_svm_unmap(PMDL mdl) { - kunmap(page); - kvm_release_page_dirty(page); + kunmap(mdl); + kvm_release_page(mdl); } static int nested_svm_intercept_ioio(struct vcpu_svm *svm) @@ -2569,12 +2221,9 @@ static int nested_svm_exit_special(struct vcpu_svm *svm) break; case SVM_EXIT_EXCP_BASE + PF_VECTOR: /* When we're shadowing, trap PFs, but not async PF */ - if (!npt_enabled && svm->apf_reason == 0) + if (!npt_enabled) return NESTED_EXIT_HOST; break; - case SVM_EXIT_EXCP_BASE + NM_VECTOR: - nm_interception(svm); - break; default: break; } @@ -2597,26 +2246,77 @@ static int nested_svm_intercept(struct vcpu_svm *svm) case SVM_EXIT_IOIO: vmexit = nested_svm_intercept_ioio(svm); break; - case SVM_EXIT_READ_CR0 ... SVM_EXIT_WRITE_CR8: { + case SVM_EXIT_READ_CR0: + case SVM_EXIT_READ_CR2: + case SVM_EXIT_READ_CR3: + case SVM_EXIT_READ_CR4: + case SVM_EXIT_READ_CR8: + case SVM_EXIT_WRITE_CR0: + case SVM_EXIT_WRITE_CR2: + case SVM_EXIT_WRITE_CR3: + case SVM_EXIT_WRITE_CR4: + case SVM_EXIT_WRITE_CR8: { u32 bit = 1U << (exit_code - SVM_EXIT_READ_CR0); if (svm->nested.intercept_cr & bit) vmexit = NESTED_EXIT_DONE; break; } - case SVM_EXIT_READ_DR0 ... SVM_EXIT_WRITE_DR7: { + case SVM_EXIT_READ_DR0: + case SVM_EXIT_READ_DR1: + case SVM_EXIT_READ_DR2: + case SVM_EXIT_READ_DR3: + case SVM_EXIT_READ_DR4: + case SVM_EXIT_READ_DR5: + case SVM_EXIT_READ_DR6: + case SVM_EXIT_READ_DR7: + case SVM_EXIT_WRITE_DR0: + case SVM_EXIT_WRITE_DR1: + case SVM_EXIT_WRITE_DR2: + case SVM_EXIT_WRITE_DR3: + case SVM_EXIT_WRITE_DR4: + case SVM_EXIT_WRITE_DR5: + case SVM_EXIT_WRITE_DR6: + case SVM_EXIT_WRITE_DR7: { u32 bit = 1U << (exit_code - SVM_EXIT_READ_DR0); if (svm->nested.intercept_dr & bit) vmexit = NESTED_EXIT_DONE; break; } - case SVM_EXIT_EXCP_BASE ... 
SVM_EXIT_EXCP_BASE + 0x1f: { + case SVM_EXIT_EXCP_BASE: + case SVM_EXIT_EXCP_BASE + 0x1: + case SVM_EXIT_EXCP_BASE + 0x2: + case SVM_EXIT_EXCP_BASE + 0x3: + case SVM_EXIT_EXCP_BASE + 0x4: + case SVM_EXIT_EXCP_BASE + 0x5: + case SVM_EXIT_EXCP_BASE + 0x6: + case SVM_EXIT_EXCP_BASE + 0x7: + case SVM_EXIT_EXCP_BASE + 0x8: + case SVM_EXIT_EXCP_BASE + 0x9: + case SVM_EXIT_EXCP_BASE + 0xa: + case SVM_EXIT_EXCP_BASE + 0xb: + case SVM_EXIT_EXCP_BASE + 0xc: + case SVM_EXIT_EXCP_BASE + 0xd: + case SVM_EXIT_EXCP_BASE + 0xe: + case SVM_EXIT_EXCP_BASE + 0xf: + case SVM_EXIT_EXCP_BASE + 0x10: + case SVM_EXIT_EXCP_BASE + 0x11: + case SVM_EXIT_EXCP_BASE + 0x12: + case SVM_EXIT_EXCP_BASE + 0x13: + case SVM_EXIT_EXCP_BASE + 0x14: + case SVM_EXIT_EXCP_BASE + 0x15: + case SVM_EXIT_EXCP_BASE + 0x16: + case SVM_EXIT_EXCP_BASE + 0x17: + case SVM_EXIT_EXCP_BASE + 0x18: + case SVM_EXIT_EXCP_BASE + 0x19: + case SVM_EXIT_EXCP_BASE + 0x1a: + case SVM_EXIT_EXCP_BASE + 0x1b: + case SVM_EXIT_EXCP_BASE + 0x1c: + case SVM_EXIT_EXCP_BASE + 0x1d: + case SVM_EXIT_EXCP_BASE + 0x1e: + case SVM_EXIT_EXCP_BASE + 0x1f: { u32 excp_bits = 1 << (exit_code - SVM_EXIT_EXCP_BASE); if (svm->nested.intercept_exceptions & excp_bits) vmexit = NESTED_EXIT_DONE; - /* async page fault always cause vmexit */ - else if ((exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR) && - svm->apf_reason != 0) - vmexit = NESTED_EXIT_DONE; break; } case SVM_EXIT_ERR: { @@ -2680,16 +2380,9 @@ static int nested_svm_vmexit(struct vcpu_svm *svm) struct vmcb *nested_vmcb; struct vmcb *hsave = svm->nested.hsave; struct vmcb *vmcb = svm->vmcb; - struct page *page; + PMDL kmap_mdl; - trace_kvm_nested_vmexit_inject(vmcb->control.exit_code, - vmcb->control.exit_info_1, - vmcb->control.exit_info_2, - vmcb->control.exit_int_info, - vmcb->control.exit_int_info_err, - KVM_ISA_SVM); - - nested_vmcb = nested_svm_map(svm, svm->nested.vmcb, &page); + nested_vmcb = nested_svm_map(svm, svm->nested.vmcb, &kmap_mdl); if (!nested_vmcb) return 1; @@ -2789,7 +2482,7 @@ static int nested_svm_vmexit(struct vcpu_svm *svm) mark_all_dirty(svm->vmcb); - nested_svm_unmap(page); + nested_svm_unmap(kmap_mdl); nested_svm_uninit_mmu_context(&svm->vcpu); kvm_mmu_reset_context(&svm->vcpu); @@ -2850,12 +2543,12 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm) struct vmcb *nested_vmcb; struct vmcb *hsave = svm->nested.hsave; struct vmcb *vmcb = svm->vmcb; - struct page *page; + PMDL kmap_mdl; u64 vmcb_gpa; vmcb_gpa = svm->vmcb->save.rax; - nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page); + nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &kmap_mdl); if (!nested_vmcb) return false; @@ -2865,22 +2558,11 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm) nested_vmcb->control.exit_info_1 = 0; nested_vmcb->control.exit_info_2 = 0; - nested_svm_unmap(page); + nested_svm_unmap(kmap_mdl); return false; } - trace_kvm_nested_vmrun(svm->vmcb->save.rip, vmcb_gpa, - nested_vmcb->save.rip, - nested_vmcb->control.int_ctl, - nested_vmcb->control.event_inj, - nested_vmcb->control.nested_ctl); - - trace_kvm_nested_intercepts(nested_vmcb->control.intercept_cr & 0xffff, - nested_vmcb->control.intercept_cr >> 16, - nested_vmcb->control.intercept_exceptions, - nested_vmcb->control.intercept); - /* Clear internal status */ kvm_clear_exception_queue(&svm->vcpu); kvm_clear_interrupt_queue(&svm->vcpu); @@ -2985,7 +2667,7 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm) svm->vmcb->control.event_inj = nested_vmcb->control.event_inj; svm->vmcb->control.event_inj_err = 
nested_vmcb->control.event_inj_err; - nested_svm_unmap(page); + nested_svm_unmap(kmap_mdl); /* Enter Guest-Mode */ enter_guest_mode(&svm->vcpu); @@ -3024,12 +2706,12 @@ static void nested_svm_vmloadsave(struct vmcb *from_vmcb, struct vmcb *to_vmcb) static int vmload_interception(struct vcpu_svm *svm) { struct vmcb *nested_vmcb; - struct page *page; + PMDL kmap_mdl; if (nested_svm_check_permissions(svm)) return 1; - nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page); + nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &kmap_mdl); if (!nested_vmcb) return 1; @@ -3037,7 +2719,7 @@ static int vmload_interception(struct vcpu_svm *svm) skip_emulated_instruction(&svm->vcpu); nested_svm_vmloadsave(nested_vmcb, svm->vmcb); - nested_svm_unmap(page); + nested_svm_unmap(kmap_mdl); return 1; } @@ -3045,12 +2727,12 @@ static int vmload_interception(struct vcpu_svm *svm) static int vmsave_interception(struct vcpu_svm *svm) { struct vmcb *nested_vmcb; - struct page *page; + PMDL kmap_mdl; if (nested_svm_check_permissions(svm)) return 1; - nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page); + nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &kmap_mdl); if (!nested_vmcb) return 1; @@ -3058,7 +2740,7 @@ static int vmsave_interception(struct vcpu_svm *svm) skip_emulated_instruction(&svm->vcpu); nested_svm_vmloadsave(svm->vmcb, nested_vmcb); - nested_svm_unmap(page); + nested_svm_unmap(kmap_mdl); return 1; } @@ -3098,7 +2780,7 @@ static int stgi_interception(struct vcpu_svm *svm) svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; skip_emulated_instruction(&svm->vcpu); - kvm_make_request(KVM_REQ_EVENT, &svm->vcpu); + kvm_make_request(GVM_REQ_EVENT, &svm->vcpu); enable_gif(svm); @@ -3129,9 +2811,6 @@ static int invlpga_interception(struct vcpu_svm *svm) { struct kvm_vcpu *vcpu = &svm->vcpu; - trace_kvm_invlpga(svm->vmcb->save.rip, kvm_register_read(&svm->vcpu, VCPU_REGS_RCX), - kvm_register_read(&svm->vcpu, VCPU_REGS_RAX)); - /* Let's treat INVLPGA the same as INVLPG (can be optimized!) 
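nested_svm_map()/nested_svm_unmap(), used by the VMRUN, VMEXIT, VMLOAD and VMSAVE paths above, replace the struct-page based kmap of the guest VMCB: the guest page's host VA is wrapped in an MDL with IoAllocateMdl(), pinned with MmProbeAndLockPages(), and mapped with kmap(); the error path unlocks and frees the MDL by hand. kunmap() and kvm_release_page() are not shown in this hunk; a plausible reading (an assumption) of the release side is:

/* Assumption: kvm_release_page(mdl) unwinds what nested_svm_map() built.
 * Unlocking releases the pages (and any system mapping obtained for the
 * MDL); then the MDL itself is freed. */
#include <wdm.h>

static void example_release_locked_mdl(PMDL mdl)
{
        MmUnlockPages(mdl);
        IoFreeMdl(mdl);
}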
*/ kvm_mmu_invlpg(vcpu, kvm_register_read(&svm->vcpu, VCPU_REGS_RAX)); @@ -3142,8 +2821,6 @@ static int invlpga_interception(struct vcpu_svm *svm) static int skinit_interception(struct vcpu_svm *svm) { - trace_kvm_skinit(svm->vmcb->save.rip, kvm_register_read(&svm->vcpu, VCPU_REGS_RAX)); - kvm_queue_exception(&svm->vcpu, UD_VECTOR); return 1; } @@ -3227,8 +2904,8 @@ static int task_switch_interception(struct vcpu_svm *svm) if (kvm_task_switch(&svm->vcpu, tss_selector, int_vec, reason, has_error_code, error_code) == EMULATE_FAIL) { - svm->vcpu.run->exit_reason = KVM_EXIT_INTERNAL_ERROR; - svm->vcpu.run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; + svm->vcpu.run->exit_reason = GVM_EXIT_INTERNAL_ERROR; + svm->vcpu.run->internal.suberror = GVM_INTERNAL_ERROR_EMULATION; svm->vcpu.run->internal.ndata = 0; return 0; } @@ -3248,7 +2925,7 @@ static int iret_interception(struct vcpu_svm *svm) clr_intercept(svm, INTERCEPT_IRET); svm->vcpu.arch.hflags |= HF_IRET_MASK; svm->nmi_iret_rip = kvm_rip_read(&svm->vcpu); - kvm_make_request(KVM_REQ_EVENT, &svm->vcpu); + kvm_make_request(GVM_REQ_EVENT, &svm->vcpu); return 1; } @@ -3269,6 +2946,7 @@ static int emulate_on_interception(struct vcpu_svm *svm) static int rdpmc_interception(struct vcpu_svm *svm) { +#if 0 int err; if (!static_cpu_has(X86_FEATURE_NRIPS)) @@ -3276,14 +2954,15 @@ static int rdpmc_interception(struct vcpu_svm *svm) err = kvm_rdpmc(&svm->vcpu); kvm_complete_insn_gp(&svm->vcpu, err); +#endif return 1; } static bool check_selective_cr0_intercepted(struct vcpu_svm *svm, - unsigned long val) + size_t val) { - unsigned long cr0 = svm->vcpu.arch.cr0; + size_t cr0 = svm->vcpu.arch.cr0; bool ret = false; u64 intercept; @@ -3309,7 +2988,7 @@ static bool check_selective_cr0_intercepted(struct vcpu_svm *svm, static int cr_interception(struct vcpu_svm *svm) { int reg, cr; - unsigned long val; + size_t val; int err; if (!static_cpu_has(X86_FEATURE_DECODEASSISTS)) @@ -3346,7 +3025,7 @@ static int cr_interception(struct vcpu_svm *svm) err = kvm_set_cr8(&svm->vcpu, val); break; default: - WARN(1, "unhandled write to CR%d", cr); + //WARN(1, "unhandled write to CR%d", cr); kvm_queue_exception(&svm->vcpu, UD_VECTOR); return 1; } @@ -3368,7 +3047,7 @@ static int cr_interception(struct vcpu_svm *svm) val = kvm_get_cr8(&svm->vcpu); break; default: - WARN(1, "unhandled read from CR%d", cr); + //WARN(1, "unhandled read from CR%d", cr); kvm_queue_exception(&svm->vcpu, UD_VECTOR); return 1; } @@ -3382,7 +3061,7 @@ static int cr_interception(struct vcpu_svm *svm) static int dr_interception(struct vcpu_svm *svm) { int reg, dr; - unsigned long val; + size_t val; if (svm->vcpu.guest_debug == 0) { /* @@ -3391,7 +3070,7 @@ static int dr_interception(struct vcpu_svm *svm) * retrieve the full state of the debug registers. 
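Further up, nested_svm_intercept() spells out every CR, DR and exception exit code as its own case label; the original used GCC's "case low ... high:" range extension, which MSVC rejects. A minimal before/after illustration with placeholder values (not the real SVM exit codes):

/* Placeholder values; shows only the rewrite from a case range to
 * explicit labels. */
static int in_placeholder_range(unsigned int code)
{
#if defined(__GNUC__)
        switch (code) {
        case 0x40 ... 0x43:                          /* GCC/Clang extension */
                return 1;
        }
#else
        switch (code) {
        case 0x40: case 0x41: case 0x42: case 0x43:  /* standard C */
                return 1;
        }
#endif
        return 0;
}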
*/ clr_dr_intercepts(svm); - svm->vcpu.arch.switch_db_regs |= KVM_DEBUGREG_WONT_EXIT; + svm->vcpu.arch.switch_db_regs |= GVM_DEBUGREG_WONT_EXIT; return 1; } @@ -3430,7 +3109,7 @@ static int cr8_write_interception(struct vcpu_svm *svm) return r; if (cr8_prev <= kvm_get_cr8(&svm->vcpu)) return r; - kvm_run->exit_reason = KVM_EXIT_SET_TPR; + kvm_run->exit_reason = GVM_EXIT_SET_TPR; return 0; } @@ -3440,9 +3119,7 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) switch (msr_info->index) { case MSR_IA32_TSC: { - msr_info->data = svm->vmcb->control.tsc_offset + - kvm_scale_tsc(vcpu, rdtsc()); - + msr_info->data = svm->vmcb->control.tsc_offset + rdtsc(); break; } case MSR_STAR: @@ -3536,11 +3213,8 @@ static int rdmsr_interception(struct vcpu_svm *svm) msr_info.index = ecx; msr_info.host_initiated = false; if (svm_get_msr(&svm->vcpu, &msr_info)) { - trace_kvm_msr_read_ex(ecx); kvm_inject_gp(&svm->vcpu, 0); } else { - trace_kvm_msr_read(ecx, msr_info.data); - kvm_register_write(&svm->vcpu, VCPU_REGS_RAX, msr_info.data & 0xffffffff); kvm_register_write(&svm->vcpu, VCPU_REGS_RDX, @@ -3624,7 +3298,6 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) * svm_vcpu_put. */ svm->tsc_aux = data; - wrmsrl(MSR_TSC_AUX, svm->tsc_aux); break; case MSR_IA32_DEBUGCTLMSR: if (!boot_cpu_has(X86_FEATURE_LBRV)) { @@ -3672,10 +3345,8 @@ static int wrmsr_interception(struct vcpu_svm *svm) svm->next_rip = kvm_rip_read(&svm->vcpu) + 2; if (kvm_set_msr(&svm->vcpu, &msr)) { - trace_kvm_msr_write_ex(ecx, data); kvm_inject_gp(&svm->vcpu, 0); } else { - trace_kvm_msr_write(ecx, data); skip_emulated_instruction(&svm->vcpu); } return 1; @@ -3691,7 +3362,7 @@ static int msr_interception(struct vcpu_svm *svm) static int interrupt_window_interception(struct vcpu_svm *svm) { - kvm_make_request(KVM_REQ_EVENT, &svm->vcpu); + kvm_make_request(GVM_REQ_EVENT, &svm->vcpu); svm_clear_vintr(svm); svm->vmcb->control.int_ctl &= ~V_IRQ_MASK; mark_dirty(svm->vmcb, VMCB_INTR); @@ -3699,12 +3370,6 @@ static int interrupt_window_interception(struct vcpu_svm *svm) return 1; } -static int pause_interception(struct vcpu_svm *svm) -{ - kvm_vcpu_on_spin(&(svm->vcpu)); - return 1; -} - static int nop_interception(struct vcpu_svm *svm) { skip_emulated_instruction(&(svm->vcpu)); @@ -3735,11 +3400,8 @@ static int avic_incomplete_ipi_interception(struct vcpu_svm *svm) u32 icrh = svm->vmcb->control.exit_info_1 >> 32; u32 icrl = svm->vmcb->control.exit_info_1; u32 id = svm->vmcb->control.exit_info_2 >> 32; - u32 index = svm->vmcb->control.exit_info_2 & 0xFF; struct kvm_lapic *apic = svm->vcpu.arch.apic; - trace_kvm_avic_incomplete_ipi(svm->vcpu.vcpu_id, icrh, icrl, id, index); - switch (id) { case AVIC_IPI_FAILURE_INVALID_INT_TYPE: /* @@ -3760,7 +3422,6 @@ static int avic_incomplete_ipi_interception(struct vcpu_svm *svm) int i; struct kvm_vcpu *vcpu; struct kvm *kvm = svm->vcpu.kvm; - struct kvm_lapic *apic = svm->vcpu.arch.apic; /* * At this point, we expect that the AVIC HW has already @@ -3769,9 +3430,9 @@ static int avic_incomplete_ipi_interception(struct vcpu_svm *svm) */ kvm_for_each_vcpu(i, vcpu, kvm) { bool m = kvm_apic_match_dest(vcpu, apic, - icrl & KVM_APIC_SHORT_MASK, + icrl & GVM_APIC_SHORT_MASK, GET_APIC_DEST_FIELD(icrh), - icrl & KVM_APIC_DEST_MASK); + icrl & GVM_APIC_DEST_MASK); if (m && !avic_vcpu_is_running(vcpu)) kvm_vcpu_wake_up(vcpu); @@ -3781,7 +3442,7 @@ static int avic_incomplete_ipi_interception(struct vcpu_svm *svm) case AVIC_IPI_FAILURE_INVALID_TARGET: break; case 
AVIC_IPI_FAILURE_INVALID_BACKING_PAGE: - WARN_ONCE(1, "Invalid backing page\n"); + //WARN_ONCE(1, "Invalid backing page\n"); break; default: pr_err("Unknown IPI interception\n"); @@ -3830,7 +3491,7 @@ static int avic_ldr_write(struct kvm_vcpu *vcpu, u8 g_physical_id, u32 ldr, if (!entry) return -EINVAL; - new_entry = READ_ONCE(*entry); + READ_ONCE(*entry, new_entry); new_entry &= ~AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK; new_entry |= (g_physical_id & AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK); if (valid) @@ -3975,17 +3636,10 @@ static int avic_unaccelerated_access_interception(struct vcpu_svm *svm) int ret = 0; u32 offset = svm->vmcb->control.exit_info_1 & AVIC_UNACCEL_ACCESS_OFFSET_MASK; - u32 vector = svm->vmcb->control.exit_info_2 & - AVIC_UNACCEL_ACCESS_VECTOR_MASK; - bool write = (svm->vmcb->control.exit_info_1 >> 32) & - AVIC_UNACCEL_ACCESS_WRITE_MASK; bool trap = is_avic_unaccelerated_access_trap(offset); - trace_kvm_avic_unaccelerated_access(svm->vcpu.vcpu_id, offset, - trap, write, vector); if (trap) { /* Handling Trap */ - WARN_ONCE(!write, "svm: Handling trap read.\n"); ret = avic_unaccel_trap_write(svm); } else { /* Handling Fault */ @@ -4025,7 +3679,6 @@ static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = { [SVM_EXIT_EXCP_BASE + BP_VECTOR] = bp_interception, [SVM_EXIT_EXCP_BASE + UD_VECTOR] = ud_interception, [SVM_EXIT_EXCP_BASE + PF_VECTOR] = pf_interception, - [SVM_EXIT_EXCP_BASE + NM_VECTOR] = nm_interception, [SVM_EXIT_EXCP_BASE + MC_VECTOR] = mc_interception, [SVM_EXIT_EXCP_BASE + AC_VECTOR] = ac_interception, [SVM_EXIT_INTR] = intr_interception, @@ -4037,7 +3690,6 @@ static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = { [SVM_EXIT_CPUID] = cpuid_interception, [SVM_EXIT_IRET] = iret_interception, [SVM_EXIT_INVD] = emulate_on_interception, - [SVM_EXIT_PAUSE] = pause_interception, [SVM_EXIT_HLT] = halt_interception, [SVM_EXIT_INVLPG] = invlpg_interception, [SVM_EXIT_INVLPGA] = invlpga_interception, @@ -4046,7 +3698,6 @@ static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = { [SVM_EXIT_TASK_SWITCH] = task_switch_interception, [SVM_EXIT_SHUTDOWN] = shutdown_interception, [SVM_EXIT_VMRUN] = vmrun_interception, - [SVM_EXIT_VMMCALL] = vmmcall_interception, [SVM_EXIT_VMLOAD] = vmload_interception, [SVM_EXIT_VMSAVE] = vmsave_interception, [SVM_EXIT_STGI] = stgi_interception, @@ -4185,8 +3836,6 @@ static int handle_exit(struct kvm_vcpu *vcpu) struct kvm_run *kvm_run = vcpu->run; u32 exit_code = svm->vmcb->control.exit_code; - trace_kvm_exit(exit_code, vcpu, KVM_ISA_SVM); - if (!is_cr_intercept(svm, INTERCEPT_CR0_WRITE)) vcpu->arch.cr0 = svm->vmcb->save.cr0; if (npt_enabled) @@ -4202,13 +3851,6 @@ static int handle_exit(struct kvm_vcpu *vcpu) if (is_guest_mode(vcpu)) { int vmexit; - trace_kvm_nested_vmexit(svm->vmcb->save.rip, exit_code, - svm->vmcb->control.exit_info_1, - svm->vmcb->control.exit_info_2, - svm->vmcb->control.exit_int_info, - svm->vmcb->control.exit_int_info_err, - KVM_ISA_SVM); - vmexit = nested_svm_exit_special(svm); if (vmexit == NESTED_EXIT_CONTINUE) @@ -4221,10 +3863,10 @@ static int handle_exit(struct kvm_vcpu *vcpu) svm_complete_interrupts(svm); if (svm->vmcb->control.exit_code == SVM_EXIT_ERR) { - kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY; + kvm_run->exit_reason = GVM_EXIT_FAIL_ENTRY; kvm_run->fail_entry.hardware_entry_failure_reason = svm->vmcb->control.exit_code; - pr_err("KVM: FAILED VMRUN WITH VMCB:\n"); + pr_err("kvm: FAILED VMRUN WITH VMCB:\n"); dump_vmcb(vcpu); return 0; } @@ -4240,7 +3882,7 @@ static int 
handle_exit(struct kvm_vcpu *vcpu) if (exit_code >= ARRAY_SIZE(svm_exit_handlers) || !svm_exit_handlers[exit_code]) { - WARN_ONCE(1, "svm: unexpected exit reason 0x%x\n", exit_code); + //WARN_ONCE(1, "svm: unexpected exit reason 0x%x\n", exit_code); kvm_queue_exception(vcpu, UD_VECTOR); return 1; } @@ -4297,7 +3939,6 @@ static void svm_set_irq(struct kvm_vcpu *vcpu) BUG_ON(!(gif_set(svm))); - trace_kvm_inj_virq(vcpu->arch.interrupt.nr); ++vcpu->stat.irq_injections; svm->vmcb->control.event_inj = vcpu->arch.interrupt.nr | @@ -4362,11 +4003,6 @@ static void svm_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) return; } -static void svm_sync_pir_to_irr(struct kvm_vcpu *vcpu) -{ - return; -} - static void svm_deliver_avic_intr(struct kvm_vcpu *vcpu, int vec) { kvm_lapic_set_irr(vec, vcpu->arch.apic); @@ -4379,209 +4015,6 @@ static void svm_deliver_avic_intr(struct kvm_vcpu *vcpu, int vec) kvm_vcpu_wake_up(vcpu); } -static void svm_ir_list_del(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi) -{ - unsigned long flags; - struct amd_svm_iommu_ir *cur; - - spin_lock_irqsave(&svm->ir_list_lock, flags); - list_for_each_entry(cur, &svm->ir_list, node) { - if (cur->data != pi->ir_data) - continue; - list_del(&cur->node); - kfree(cur); - break; - } - spin_unlock_irqrestore(&svm->ir_list_lock, flags); -} - -static int svm_ir_list_add(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi) -{ - int ret = 0; - unsigned long flags; - struct amd_svm_iommu_ir *ir; - - /** - * In some cases, the existing irte is updaed and re-set, - * so we need to check here if it's already been * added - * to the ir_list. - */ - if (pi->ir_data && (pi->prev_ga_tag != 0)) { - struct kvm *kvm = svm->vcpu.kvm; - u32 vcpu_id = AVIC_GATAG_TO_VCPUID(pi->prev_ga_tag); - struct kvm_vcpu *prev_vcpu = kvm_get_vcpu_by_id(kvm, vcpu_id); - struct vcpu_svm *prev_svm; - - if (!prev_vcpu) { - ret = -EINVAL; - goto out; - } - - prev_svm = to_svm(prev_vcpu); - svm_ir_list_del(prev_svm, pi); - } - - /** - * Allocating new amd_iommu_pi_data, which will get - * add to the per-vcpu ir_list. - */ - ir = kzalloc(sizeof(struct amd_svm_iommu_ir), GFP_KERNEL); - if (!ir) { - ret = -ENOMEM; - goto out; - } - ir->data = pi->ir_data; - - spin_lock_irqsave(&svm->ir_list_lock, flags); - list_add(&ir->node, &svm->ir_list); - spin_unlock_irqrestore(&svm->ir_list_lock, flags); -out: - return ret; -} - -/** - * Note: - * The HW cannot support posting multicast/broadcast - * interrupts to a vCPU. So, we still use legacy interrupt - * remapping for these kind of interrupts. - * - * For lowest-priority interrupts, we only support - * those with single CPU as the destination, e.g. user - * configures the interrupts via /proc/irq or uses - * irqbalance to make the interrupts single-CPU. 
- */ -static int -get_pi_vcpu_info(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e, - struct vcpu_data *vcpu_info, struct vcpu_svm **svm) -{ - struct kvm_lapic_irq irq; - struct kvm_vcpu *vcpu = NULL; - - kvm_set_msi_irq(kvm, e, &irq); - - if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu)) { - pr_debug("SVM: %s: use legacy intr remap mode for irq %u\n", - __func__, irq.vector); - return -1; - } - - pr_debug("SVM: %s: use GA mode for irq %u\n", __func__, - irq.vector); - *svm = to_svm(vcpu); - vcpu_info->pi_desc_addr = page_to_phys((*svm)->avic_backing_page); - vcpu_info->vector = irq.vector; - - return 0; -} - -/* - * svm_update_pi_irte - set IRTE for Posted-Interrupts - * - * @kvm: kvm - * @host_irq: host irq of the interrupt - * @guest_irq: gsi of the interrupt - * @set: set or unset PI - * returns 0 on success, < 0 on failure - */ -static int svm_update_pi_irte(struct kvm *kvm, unsigned int host_irq, - uint32_t guest_irq, bool set) -{ - struct kvm_kernel_irq_routing_entry *e; - struct kvm_irq_routing_table *irq_rt; - int idx, ret = -EINVAL; - - if (!kvm_arch_has_assigned_device(kvm) || - !irq_remapping_cap(IRQ_POSTING_CAP)) - return 0; - - pr_debug("SVM: %s: host_irq=%#x, guest_irq=%#x, set=%#x\n", - __func__, host_irq, guest_irq, set); - - idx = srcu_read_lock(&kvm->irq_srcu); - irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu); - WARN_ON(guest_irq >= irq_rt->nr_rt_entries); - - hlist_for_each_entry(e, &irq_rt->map[guest_irq], link) { - struct vcpu_data vcpu_info; - struct vcpu_svm *svm = NULL; - - if (e->type != KVM_IRQ_ROUTING_MSI) - continue; - - /** - * Here, we setup with legacy mode in the following cases: - * 1. When cannot target interrupt to a specific vcpu. - * 2. Unsetting posted interrupt. - * 3. APIC virtialization is disabled for the vcpu. - */ - if (!get_pi_vcpu_info(kvm, e, &vcpu_info, &svm) && set && - kvm_vcpu_apicv_active(&svm->vcpu)) { - struct amd_iommu_pi_data pi; - - /* Try to enable guest_mode in IRTE */ - pi.base = page_to_phys(svm->avic_backing_page) & AVIC_HPA_MASK; - pi.ga_tag = AVIC_GATAG(kvm->arch.avic_vm_id, - svm->vcpu.vcpu_id); - pi.is_guest_mode = true; - pi.vcpu_data = &vcpu_info; - ret = irq_set_vcpu_affinity(host_irq, &pi); - - /** - * Here, we successfully setting up vcpu affinity in - * IOMMU guest mode. Now, we need to store the posted - * interrupt information in a per-vcpu ir_list so that - * we can reference to them directly when we update vcpu - * scheduling information in IOMMU irte. - */ - if (!ret && pi.is_guest_mode) - svm_ir_list_add(svm, &pi); - } else { - /* Use legacy mode in IRTE */ - struct amd_iommu_pi_data pi; - - /** - * Here, pi is used to: - * - Tell IOMMU to use legacy mode for this interrupt. - * - Retrieve ga_tag of prior interrupt remapping data. - */ - pi.is_guest_mode = false; - ret = irq_set_vcpu_affinity(host_irq, &pi); - - /** - * Check if the posted interrupt was previously - * setup with the guest_mode by checking if the ga_tag - * was cached. If so, we need to clean up the per-vcpu - * ir_list. 
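The ga_tag handled in the removed posted-interrupt plumbing above is the packed AVIC guest tag; its layout is given by the AVIC_GATAG* macros that reappear in the new svm_def.h below (24-bit VM ID in the upper bits, 8-bit vCPU ID in the low byte). A short worked example of the encoding:

    /* AVIC_VM_ID_BITS == 24, AVIC_VCPU_ID_BITS == 8 (see svm_def.h below). */
    u32 tag  = AVIC_GATAG(0x5, 3);          /* (0x5 << 8) | 0x3 == 0x503 */
    u32 vm   = AVIC_GATAG_TO_VMID(tag);     /* 0x5 */
    u32 vcpu = AVIC_GATAG_TO_VCPUID(tag);   /* 0x3 */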
- */ - if (!ret && pi.prev_ga_tag) { - int id = AVIC_GATAG_TO_VCPUID(pi.prev_ga_tag); - struct kvm_vcpu *vcpu; - - vcpu = kvm_get_vcpu_by_id(kvm, id); - if (vcpu) - svm_ir_list_del(to_svm(vcpu), &pi); - } - } - - if (!ret && svm) { - trace_kvm_pi_irte_update(svm->vcpu.vcpu_id, - host_irq, e->gsi, - vcpu_info.vector, - vcpu_info.pi_desc_addr, set); - } - - if (ret < 0) { - pr_err("%s: failed to update PI IRTE\n", __func__); - goto out; - } - } - - ret = 0; -out: - srcu_read_unlock(&kvm->irq_srcu, idx); - return ret; -} - static int svm_nmi_allowed(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -4682,10 +4115,6 @@ static void svm_flush_tlb(struct kvm_vcpu *vcpu) svm->asid_generation--; } -static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu) -{ -} - static inline void sync_cr8_to_lapic(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -4729,7 +4158,7 @@ static void svm_complete_interrupts(struct vcpu_svm *svm) if ((svm->vcpu.arch.hflags & HF_IRET_MASK) && kvm_rip_read(&svm->vcpu) != svm->nmi_iret_rip) { svm->vcpu.arch.hflags &= ~(HF_NMI_MASK | HF_IRET_MASK); - kvm_make_request(KVM_REQ_EVENT, &svm->vcpu); + kvm_make_request(GVM_REQ_EVENT, &svm->vcpu); } svm->vcpu.arch.nmi_injected = false; @@ -4739,7 +4168,7 @@ static void svm_complete_interrupts(struct vcpu_svm *svm) if (!(exitintinfo & SVM_EXITINTINFO_VALID)) return; - kvm_make_request(KVM_REQ_EVENT, &svm->vcpu); + kvm_make_request(GVM_REQ_EVENT, &svm->vcpu); vector = exitintinfo & SVM_EXITINTINFO_VEC_MASK; type = exitintinfo & SVM_EXITINTINFO_TYPE_MASK; @@ -4813,87 +4242,14 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) local_irq_enable(); - asm volatile ( - "push %%" _ASM_BP "; \n\t" - "mov %c[rbx](%[svm]), %%" _ASM_BX " \n\t" - "mov %c[rcx](%[svm]), %%" _ASM_CX " \n\t" - "mov %c[rdx](%[svm]), %%" _ASM_DX " \n\t" - "mov %c[rsi](%[svm]), %%" _ASM_SI " \n\t" - "mov %c[rdi](%[svm]), %%" _ASM_DI " \n\t" - "mov %c[rbp](%[svm]), %%" _ASM_BP " \n\t" -#ifdef CONFIG_X86_64 - "mov %c[r8](%[svm]), %%r8 \n\t" - "mov %c[r9](%[svm]), %%r9 \n\t" - "mov %c[r10](%[svm]), %%r10 \n\t" - "mov %c[r11](%[svm]), %%r11 \n\t" - "mov %c[r12](%[svm]), %%r12 \n\t" - "mov %c[r13](%[svm]), %%r13 \n\t" - "mov %c[r14](%[svm]), %%r14 \n\t" - "mov %c[r15](%[svm]), %%r15 \n\t" -#endif - - /* Enter guest mode */ - "push %%" _ASM_AX " \n\t" - "mov %c[vmcb](%[svm]), %%" _ASM_AX " \n\t" - __ex(SVM_VMLOAD) "\n\t" - __ex(SVM_VMRUN) "\n\t" - __ex(SVM_VMSAVE) "\n\t" - "pop %%" _ASM_AX " \n\t" - - /* Save guest registers, load host registers */ - "mov %%" _ASM_BX ", %c[rbx](%[svm]) \n\t" - "mov %%" _ASM_CX ", %c[rcx](%[svm]) \n\t" - "mov %%" _ASM_DX ", %c[rdx](%[svm]) \n\t" - "mov %%" _ASM_SI ", %c[rsi](%[svm]) \n\t" - "mov %%" _ASM_DI ", %c[rdi](%[svm]) \n\t" - "mov %%" _ASM_BP ", %c[rbp](%[svm]) \n\t" -#ifdef CONFIG_X86_64 - "mov %%r8, %c[r8](%[svm]) \n\t" - "mov %%r9, %c[r9](%[svm]) \n\t" - "mov %%r10, %c[r10](%[svm]) \n\t" - "mov %%r11, %c[r11](%[svm]) \n\t" - "mov %%r12, %c[r12](%[svm]) \n\t" - "mov %%r13, %c[r13](%[svm]) \n\t" - "mov %%r14, %c[r14](%[svm]) \n\t" - "mov %%r15, %c[r15](%[svm]) \n\t" -#endif - "pop %%" _ASM_BP - : - : [svm]"a"(svm), - [vmcb]"i"(offsetof(struct vcpu_svm, vmcb_pa)), - [rbx]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RBX])), - [rcx]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RCX])), - [rdx]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RDX])), - [rsi]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RSI])), - [rdi]"i"(offsetof(struct vcpu_svm, 
vcpu.arch.regs[VCPU_REGS_RDI])), - [rbp]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RBP])) -#ifdef CONFIG_X86_64 - , [r8]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R8])), - [r9]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R9])), - [r10]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R10])), - [r11]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R11])), - [r12]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R12])), - [r13]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R13])), - [r14]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R14])), - [r15]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R15])) -#endif - : "cc", "memory" -#ifdef CONFIG_X86_64 - , "rbx", "rcx", "rdx", "rsi", "rdi" - , "r8", "r9", "r10", "r11" , "r12", "r13", "r14", "r15" -#else - , "ebx", "ecx", "edx", "esi", "edi" -#endif - ); + __asm_svm_vcpu_run(svm); #ifdef CONFIG_X86_64 wrmsrl(MSR_GS_BASE, svm->host.gs_base); #else loadsegment(fs, svm->host.fs); -#ifndef CONFIG_X86_32_LAZY_GS loadsegment(gs, svm->host.gs); #endif -#endif reload_tss(vcpu); @@ -4920,10 +4276,6 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING; - /* if exit due to PF check for async PF */ - if (svm->vmcb->control.exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR) - svm->apf_reason = kvm_read_and_reset_pf_reason(); - if (npt_enabled) { vcpu->arch.regs_avail &= ~(1 << VCPU_EXREG_PDPTR); vcpu->arch.regs_dirty &= ~(1 << VCPU_EXREG_PDPTR); @@ -4940,7 +4292,7 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) mark_all_clean(svm->vmcb); } -static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root) +static void svm_set_cr3(struct kvm_vcpu *vcpu, size_t root) { struct vcpu_svm *svm = to_svm(vcpu); @@ -4949,7 +4301,7 @@ static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root) svm_flush_tlb(vcpu); } -static void set_tdp_cr3(struct kvm_vcpu *vcpu, unsigned long root) +static void set_tdp_cr3(struct kvm_vcpu *vcpu, size_t root) { struct vcpu_svm *svm = to_svm(vcpu); @@ -4974,17 +4326,6 @@ static int is_disabled(void) return 0; } -static void -svm_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall) -{ - /* - * Patch in the VMMCALL instruction: - */ - hypercall[0] = 0x0f; - hypercall[1] = 0x01; - hypercall[2] = 0xd9; -} - static void svm_check_processor_compat(void *rtn) { *(int *)rtn = 0; @@ -5008,7 +4349,7 @@ static u64 svm_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) static void svm_cpuid_update(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); - struct kvm_cpuid_entry2 *entry; + struct kvm_cpuid_entry *entry; /* Update nrips enabled cache */ svm->nrips_enabled = !!guest_cpuid_has_nrips(&svm->vcpu); @@ -5021,7 +4362,7 @@ static void svm_cpuid_update(struct kvm_vcpu *vcpu) entry->ecx &= ~bit(X86_FEATURE_X2APIC); } -static void svm_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry) +static void svm_set_supported_cpuid(u32 func, struct kvm_cpuid_entry *entry) { switch (func) { case 0x1: @@ -5059,7 +4400,7 @@ static int svm_get_lpage_level(void) static bool svm_rdtscp_supported(void) { - return boot_cpu_has(X86_FEATURE_RDTSCP); + return false; } static bool svm_invpcid_supported(void) @@ -5082,14 +4423,6 @@ static bool svm_has_wbinvd_exit(void) return true; } -static void svm_fpu_deactivate(struct kvm_vcpu *vcpu) -{ - struct vcpu_svm *svm = to_svm(vcpu); - - set_exception_intercept(svm, NM_VECTOR); - update_cr0_intercept(svm); -} - #define PRE_EX(exit) { .exit_code = (exit), \ 
.stage = X86_ICPT_PRE_EXCEPT, } #define POST_EX(exit) { .exit_code = (exit), \ @@ -5176,7 +4509,7 @@ static int svm_check_intercept(struct kvm_vcpu *vcpu, icpt_info.exit_code += info->modrm_reg; break; case SVM_EXIT_WRITE_CR0: { - unsigned long cr0, val; + size_t cr0, val; u64 intercept; if (info->intercept == x86_intercept_cr_write) @@ -5280,14 +4613,10 @@ static void svm_handle_external_intr(struct kvm_vcpu *vcpu) * We must have an instruction with interrupts enabled, so * the timer interrupt isn't delayed by the interrupt shadow. */ - asm("nop"); + __nop(); local_irq_disable(); } -static void svm_sched_in(struct kvm_vcpu *vcpu, int cpu) -{ -} - static inline void avic_post_state_restore(struct kvm_vcpu *vcpu) { if (avic_handle_apic_id_update(vcpu) != 0) @@ -5297,7 +4626,7 @@ static inline void avic_post_state_restore(struct kvm_vcpu *vcpu) avic_handle_ldr_update(vcpu); } -static struct kvm_x86_ops svm_x86_ops __ro_after_init = { +static struct kvm_x86_ops svm_x86_ops = { .cpu_has_kvm_support = has_svm, .disabled_by_bios = is_disabled, .hardware_setup = svm_hardware_setup, @@ -5315,7 +4644,8 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = { .vm_init = avic_vm_init, .vm_destroy = avic_vm_destroy, - .prepare_guest_switch = svm_prepare_guest_switch, + .save_host_state = svm_save_host_state, + .load_host_state = svm_load_host_state, .vcpu_load = svm_vcpu_load, .vcpu_put = svm_vcpu_put, .vcpu_blocking = svm_vcpu_blocking, @@ -5348,11 +4678,6 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = { .get_rflags = svm_get_rflags, .set_rflags = svm_set_rflags, - .get_pkru = svm_get_pkru, - - .fpu_activate = svm_fpu_activate, - .fpu_deactivate = svm_fpu_deactivate, - .tlb_flush = svm_flush_tlb, .run = svm_vcpu_run, @@ -5360,7 +4685,6 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = { .skip_emulated_instruction = skip_emulated_instruction, .set_interrupt_shadow = svm_set_interrupt_shadow, .get_interrupt_shadow = svm_get_interrupt_shadow, - .patch_hypercall = svm_patch_hypercall, .set_irq = svm_set_irq, .set_nmi = svm_inject_nmi, .queue_exception = svm_queue_exception, @@ -5376,7 +4700,6 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = { .get_enable_apicv = svm_get_enable_apicv, .refresh_apicv_exec_ctrl = svm_refresh_apicv_exec_ctrl, .load_eoi_exitmap = svm_load_eoi_exitmap, - .sync_pir_to_irr = svm_sync_pir_to_irr, .hwapic_irr_update = svm_hwapic_irr_update, .hwapic_isr_update = svm_hwapic_isr_update, .apicv_post_state_restore = avic_post_state_restore, @@ -5407,23 +4730,15 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = { .check_intercept = svm_check_intercept, .handle_external_intr = svm_handle_external_intr, - .sched_in = svm_sched_in, - - .pmu_ops = &amd_pmu_ops, .deliver_posted_interrupt = svm_deliver_avic_intr, - .update_pi_irte = svm_update_pi_irte, }; -static int __init svm_init(void) +int svm_init(void) { - return kvm_init(&svm_x86_ops, sizeof(struct vcpu_svm), - __alignof__(struct vcpu_svm), THIS_MODULE); + return kvm_init(&svm_x86_ops, sizeof(struct vcpu_svm), 0); } -static void __exit svm_exit(void) +void svm_exit(void) { kvm_exit(); } - -module_init(svm_init) -module_exit(svm_exit) diff --git a/arch/x86/kvm/svm_def.h b/arch/x86/kvm/svm_def.h new file mode 100755 index 0000000..2b5ce8e --- /dev/null +++ b/arch/x86/kvm/svm_def.h @@ -0,0 +1,176 @@ +/* + * Kernel-based Virtual Machine driver for Linux + * + * AMD SVM support + * + * Copyright (C) 2006 Qumranet, Inc. + * Copyright 2010 Red Hat, Inc. and/or its affiliates. 
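Two related build-level changes run through the hunks above. First, the GCC inline assembly that performed VMLOAD/VMRUN/VMSAVE is gone: MSVC offers no inline assembly on x64, so guest entry is delegated to an external assembly routine, __asm_svm_vcpu_run(). Second, svm_init()/svm_exit() become plain exported functions instead of module_init/module_exit hooks, since the Windows driver has its own entry point. Only the call site is visible in this diff; a sketch of the interface, with the sequence inferred from the deleted asm (the actual assembly lives outside this file):

    /*
     * Presumed prototype; the routine must do what the deleted inline asm
     * did: save host GPRs, load guest GPRs from svm->vcpu.arch.regs[],
     * run VMLOAD/VMRUN/VMSAVE with RAX = svm->vmcb_pa, then store the
     * guest GPRs back before returning.
     */
    void __asm_svm_vcpu_run(struct vcpu_svm *svm);

Splitting struct vcpu_svm and friends into svm_def.h (below) presumably lets an offset-generation step include them for that assembly, mirroring the vmx_def.h/asmgen note in vmx.c further down.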
+ * Copyright 2019 Google LLC + * + * Authors: + * Yaniv Kamay <yaniv@qumranet.com> + * Avi Kivity <avi@qumranet.com> + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + */ + +#define pr_fmt(fmt) "SVM: " fmt + +#include <linux/kvm_host.h> + +#include "irq.h" +#include "mmu.h" +#include "kvm_cache_regs.h" +#include "x86.h" +#include "cpuid.h" +#include "pmu.h" + +#include <asm/svm.h> +#include <asm/vmx.h> + +#include <__asm.h> + +#define IOPM_ALLOC_ORDER 2 +#define MSRPM_ALLOC_ORDER 1 + +#define SEG_TYPE_LDT 2 +#define SEG_TYPE_BUSY_TSS16 3 + +#define SVM_FEATURE_NPT (1 << 0) +#define SVM_FEATURE_LBRV (1 << 1) +#define SVM_FEATURE_SVML (1 << 2) +#define SVM_FEATURE_NRIP (1 << 3) +#define SVM_FEATURE_TSC_RATE (1 << 4) +#define SVM_FEATURE_VMCB_CLEAN (1 << 5) +#define SVM_FEATURE_FLUSH_ASID (1 << 6) +#define SVM_FEATURE_DECODE_ASSIST (1 << 7) +#define SVM_FEATURE_PAUSE_FILTER (1 << 10) + +#define SVM_AVIC_DOORBELL 0xc001011b + +#define NESTED_EXIT_HOST 0 /* Exit handled on host level */ +#define NESTED_EXIT_DONE 1 /* Exit caused nested vmexit */ +#define NESTED_EXIT_CONTINUE 2 /* Further checks needed */ + +#define DEBUGCTL_RESERVED_BITS (~(0x3fULL)) + +#define TSC_RATIO_RSVD 0xffffff0000000000ULL +#define TSC_RATIO_MIN 0x0000000000000001ULL +#define TSC_RATIO_MAX 0x000000ffffffffffULL + +#define AVIC_HPA_MASK ~((0xFFFULL << 52) | 0xFFF) + +/* + * 0xff is broadcast, so the max index allowed for physical APIC ID + * table is 0xfe. APIC IDs above 0xff are reserved. + */ +#define AVIC_MAX_PHYSICAL_ID_COUNT 255 + +#define AVIC_UNACCEL_ACCESS_WRITE_MASK 1 +#define AVIC_UNACCEL_ACCESS_OFFSET_MASK 0xFF0 +#define AVIC_UNACCEL_ACCESS_VECTOR_MASK 0xFFFFFFFF + +/* AVIC GATAG is encoded using VM and VCPU IDs */ +#define AVIC_VCPU_ID_BITS 8 +#define AVIC_VCPU_ID_MASK ((1 << AVIC_VCPU_ID_BITS) - 1) + +#define AVIC_VM_ID_BITS 24 +#define AVIC_VM_ID_NR (1 << AVIC_VM_ID_BITS) +#define AVIC_VM_ID_MASK ((1 << AVIC_VM_ID_BITS) - 1) + +#define AVIC_GATAG(x, y) (((x & AVIC_VM_ID_MASK) << AVIC_VCPU_ID_BITS) | \ + (y & AVIC_VCPU_ID_MASK)) +#define AVIC_GATAG_TO_VMID(x) ((x >> AVIC_VCPU_ID_BITS) & AVIC_VM_ID_MASK) +#define AVIC_GATAG_TO_VCPUID(x) (x & AVIC_VCPU_ID_MASK) + +static bool erratum_383_found __read_mostly; + +static const u32 host_save_user_msrs[] = { +#ifdef CONFIG_X86_64 + MSR_STAR, MSR_LSTAR, MSR_CSTAR, MSR_SYSCALL_MASK, MSR_KERNEL_GS_BASE, + MSR_FS_BASE, +#endif + MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, + MSR_TSC_AUX, +}; + +#define NR_HOST_SAVE_USER_MSRS ARRAY_SIZE(host_save_user_msrs) + +struct kvm_vcpu; + +struct nested_state { + struct vmcb *hsave; + u64 hsave_msr; + u64 vm_cr_msr; + u64 vmcb; + + /* These are the merged vectors */ + u32 *msrpm; + + /* gpa pointers to the real vectors */ + u64 vmcb_msrpm; + u64 vmcb_iopm; + + /* A VMEXIT is required but not yet emulated */ + bool exit_required; + + /* cache for intercepts of the guest */ + u32 intercept_cr; + u32 intercept_dr; + u32 intercept_exceptions; + u64 intercept; + + /* Nested Paging related state */ + u64 nested_cr3; +}; + +#define MSRPM_OFFSETS 16 +static u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly; + +/* + * Set osvw_len to higher value when updated Revision Guides + * are published and we know what the new status bits are + */ +static uint64_t osvw_len = 4, osvw_status; + +struct vcpu_svm { + struct kvm_vcpu vcpu; + struct vmcb *vmcb; + size_t vmcb_pa; + struct svm_cpu_data *svm_data; + uint64_t asid_generation; + uint64_t 
sysenter_esp; + uint64_t sysenter_eip; + uint64_t tsc_aux; + + u64 next_rip; + + u64 host_user_msrs[NR_HOST_SAVE_USER_MSRS]; + struct { + u16 fs; + u16 gs; + u16 ldt; + u64 gs_base; + } host; + + u32 *msrpm; + + ulong nmi_iret_rip; + + struct nested_state nested; + + bool nmi_singlestep; + + unsigned int3_injected; + size_t int3_rip; + + /* cached guest cpuid flags for faster access */ + bool nrips_enabled : 1; + + u32 ldr_reg; + struct page *avic_backing_page; + u64 *avic_physical_id_cache; + bool avic_is_running; +}; diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h deleted file mode 100644 index 0a6cc67..0000000 --- a/arch/x86/kvm/trace.h +++ /dev/null @@ -1,1374 +0,0 @@ -#if !defined(_TRACE_KVM_H) || defined(TRACE_HEADER_MULTI_READ) -#define _TRACE_KVM_H - -#include <linux/tracepoint.h> -#include <asm/vmx.h> -#include <asm/svm.h> -#include <asm/clocksource.h> -#include <asm/pvclock-abi.h> - -#undef TRACE_SYSTEM -#define TRACE_SYSTEM kvm - -/* - * Tracepoint for guest mode entry. - */ -TRACE_EVENT(kvm_entry, - TP_PROTO(unsigned int vcpu_id), - TP_ARGS(vcpu_id), - - TP_STRUCT__entry( - __field( unsigned int, vcpu_id ) - ), - - TP_fast_assign( - __entry->vcpu_id = vcpu_id; - ), - - TP_printk("vcpu %u", __entry->vcpu_id) -); - -/* - * Tracepoint for hypercall. - */ -TRACE_EVENT(kvm_hypercall, - TP_PROTO(unsigned long nr, unsigned long a0, unsigned long a1, - unsigned long a2, unsigned long a3), - TP_ARGS(nr, a0, a1, a2, a3), - - TP_STRUCT__entry( - __field( unsigned long, nr ) - __field( unsigned long, a0 ) - __field( unsigned long, a1 ) - __field( unsigned long, a2 ) - __field( unsigned long, a3 ) - ), - - TP_fast_assign( - __entry->nr = nr; - __entry->a0 = a0; - __entry->a1 = a1; - __entry->a2 = a2; - __entry->a3 = a3; - ), - - TP_printk("nr 0x%lx a0 0x%lx a1 0x%lx a2 0x%lx a3 0x%lx", - __entry->nr, __entry->a0, __entry->a1, __entry->a2, - __entry->a3) -); - -/* - * Tracepoint for hypercall. - */ -TRACE_EVENT(kvm_hv_hypercall, - TP_PROTO(__u16 code, bool fast, __u16 rep_cnt, __u16 rep_idx, - __u64 ingpa, __u64 outgpa), - TP_ARGS(code, fast, rep_cnt, rep_idx, ingpa, outgpa), - - TP_STRUCT__entry( - __field( __u16, rep_cnt ) - __field( __u16, rep_idx ) - __field( __u64, ingpa ) - __field( __u64, outgpa ) - __field( __u16, code ) - __field( bool, fast ) - ), - - TP_fast_assign( - __entry->rep_cnt = rep_cnt; - __entry->rep_idx = rep_idx; - __entry->ingpa = ingpa; - __entry->outgpa = outgpa; - __entry->code = code; - __entry->fast = fast; - ), - - TP_printk("code 0x%x %s cnt 0x%x idx 0x%x in 0x%llx out 0x%llx", - __entry->code, __entry->fast ? "fast" : "slow", - __entry->rep_cnt, __entry->rep_idx, __entry->ingpa, - __entry->outgpa) -); - -/* - * Tracepoint for PIO. - */ - -#define KVM_PIO_IN 0 -#define KVM_PIO_OUT 1 - -TRACE_EVENT(kvm_pio, - TP_PROTO(unsigned int rw, unsigned int port, unsigned int size, - unsigned int count, void *data), - TP_ARGS(rw, port, size, count, data), - - TP_STRUCT__entry( - __field( unsigned int, rw ) - __field( unsigned int, port ) - __field( unsigned int, size ) - __field( unsigned int, count ) - __field( unsigned int, val ) - ), - - TP_fast_assign( - __entry->rw = rw; - __entry->port = port; - __entry->size = size; - __entry->count = count; - if (size == 1) - __entry->val = *(unsigned char *)data; - else if (size == 2) - __entry->val = *(unsigned short *)data; - else - __entry->val = *(unsigned int *)data; - ), - - TP_printk("pio_%s at 0x%x size %d count %d val 0x%x %s", - __entry->rw ? 
"write" : "read", - __entry->port, __entry->size, __entry->count, __entry->val, - __entry->count > 1 ? "(...)" : "") -); - -/* - * Tracepoint for fast mmio. - */ -TRACE_EVENT(kvm_fast_mmio, - TP_PROTO(u64 gpa), - TP_ARGS(gpa), - - TP_STRUCT__entry( - __field(u64, gpa) - ), - - TP_fast_assign( - __entry->gpa = gpa; - ), - - TP_printk("fast mmio at gpa 0x%llx", __entry->gpa) -); - -/* - * Tracepoint for cpuid. - */ -TRACE_EVENT(kvm_cpuid, - TP_PROTO(unsigned int function, unsigned long rax, unsigned long rbx, - unsigned long rcx, unsigned long rdx), - TP_ARGS(function, rax, rbx, rcx, rdx), - - TP_STRUCT__entry( - __field( unsigned int, function ) - __field( unsigned long, rax ) - __field( unsigned long, rbx ) - __field( unsigned long, rcx ) - __field( unsigned long, rdx ) - ), - - TP_fast_assign( - __entry->function = function; - __entry->rax = rax; - __entry->rbx = rbx; - __entry->rcx = rcx; - __entry->rdx = rdx; - ), - - TP_printk("func %x rax %lx rbx %lx rcx %lx rdx %lx", - __entry->function, __entry->rax, - __entry->rbx, __entry->rcx, __entry->rdx) -); - -#define AREG(x) { APIC_##x, "APIC_" #x } - -#define kvm_trace_symbol_apic \ - AREG(ID), AREG(LVR), AREG(TASKPRI), AREG(ARBPRI), AREG(PROCPRI), \ - AREG(EOI), AREG(RRR), AREG(LDR), AREG(DFR), AREG(SPIV), AREG(ISR), \ - AREG(TMR), AREG(IRR), AREG(ESR), AREG(ICR), AREG(ICR2), AREG(LVTT), \ - AREG(LVTTHMR), AREG(LVTPC), AREG(LVT0), AREG(LVT1), AREG(LVTERR), \ - AREG(TMICT), AREG(TMCCT), AREG(TDCR), AREG(SELF_IPI), AREG(EFEAT), \ - AREG(ECTRL) -/* - * Tracepoint for apic access. - */ -TRACE_EVENT(kvm_apic, - TP_PROTO(unsigned int rw, unsigned int reg, unsigned int val), - TP_ARGS(rw, reg, val), - - TP_STRUCT__entry( - __field( unsigned int, rw ) - __field( unsigned int, reg ) - __field( unsigned int, val ) - ), - - TP_fast_assign( - __entry->rw = rw; - __entry->reg = reg; - __entry->val = val; - ), - - TP_printk("apic_%s %s = 0x%x", - __entry->rw ? "write" : "read", - __print_symbolic(__entry->reg, kvm_trace_symbol_apic), - __entry->val) -); - -#define trace_kvm_apic_read(reg, val) trace_kvm_apic(0, reg, val) -#define trace_kvm_apic_write(reg, val) trace_kvm_apic(1, reg, val) - -#define KVM_ISA_VMX 1 -#define KVM_ISA_SVM 2 - -/* - * Tracepoint for kvm guest exit: - */ -TRACE_EVENT(kvm_exit, - TP_PROTO(unsigned int exit_reason, struct kvm_vcpu *vcpu, u32 isa), - TP_ARGS(exit_reason, vcpu, isa), - - TP_STRUCT__entry( - __field( unsigned int, exit_reason ) - __field( unsigned long, guest_rip ) - __field( u32, isa ) - __field( u64, info1 ) - __field( u64, info2 ) - ), - - TP_fast_assign( - __entry->exit_reason = exit_reason; - __entry->guest_rip = kvm_rip_read(vcpu); - __entry->isa = isa; - kvm_x86_ops->get_exit_info(vcpu, &__entry->info1, - &__entry->info2); - ), - - TP_printk("reason %s rip 0x%lx info %llx %llx", - (__entry->isa == KVM_ISA_VMX) ? 
- __print_symbolic(__entry->exit_reason, VMX_EXIT_REASONS) : - __print_symbolic(__entry->exit_reason, SVM_EXIT_REASONS), - __entry->guest_rip, __entry->info1, __entry->info2) -); - -/* - * Tracepoint for kvm interrupt injection: - */ -TRACE_EVENT(kvm_inj_virq, - TP_PROTO(unsigned int irq), - TP_ARGS(irq), - - TP_STRUCT__entry( - __field( unsigned int, irq ) - ), - - TP_fast_assign( - __entry->irq = irq; - ), - - TP_printk("irq %u", __entry->irq) -); - -#define EXS(x) { x##_VECTOR, "#" #x } - -#define kvm_trace_sym_exc \ - EXS(DE), EXS(DB), EXS(BP), EXS(OF), EXS(BR), EXS(UD), EXS(NM), \ - EXS(DF), EXS(TS), EXS(NP), EXS(SS), EXS(GP), EXS(PF), \ - EXS(MF), EXS(AC), EXS(MC) - -/* - * Tracepoint for kvm interrupt injection: - */ -TRACE_EVENT(kvm_inj_exception, - TP_PROTO(unsigned exception, bool has_error, unsigned error_code), - TP_ARGS(exception, has_error, error_code), - - TP_STRUCT__entry( - __field( u8, exception ) - __field( u8, has_error ) - __field( u32, error_code ) - ), - - TP_fast_assign( - __entry->exception = exception; - __entry->has_error = has_error; - __entry->error_code = error_code; - ), - - TP_printk("%s (0x%x)", - __print_symbolic(__entry->exception, kvm_trace_sym_exc), - /* FIXME: don't print error_code if not present */ - __entry->has_error ? __entry->error_code : 0) -); - -/* - * Tracepoint for page fault. - */ -TRACE_EVENT(kvm_page_fault, - TP_PROTO(unsigned long fault_address, unsigned int error_code), - TP_ARGS(fault_address, error_code), - - TP_STRUCT__entry( - __field( unsigned long, fault_address ) - __field( unsigned int, error_code ) - ), - - TP_fast_assign( - __entry->fault_address = fault_address; - __entry->error_code = error_code; - ), - - TP_printk("address %lx error_code %x", - __entry->fault_address, __entry->error_code) -); - -/* - * Tracepoint for guest MSR access. - */ -TRACE_EVENT(kvm_msr, - TP_PROTO(unsigned write, u32 ecx, u64 data, bool exception), - TP_ARGS(write, ecx, data, exception), - - TP_STRUCT__entry( - __field( unsigned, write ) - __field( u32, ecx ) - __field( u64, data ) - __field( u8, exception ) - ), - - TP_fast_assign( - __entry->write = write; - __entry->ecx = ecx; - __entry->data = data; - __entry->exception = exception; - ), - - TP_printk("msr_%s %x = 0x%llx%s", - __entry->write ? "write" : "read", - __entry->ecx, __entry->data, - __entry->exception ? " (#GP)" : "") -); - -#define trace_kvm_msr_read(ecx, data) trace_kvm_msr(0, ecx, data, false) -#define trace_kvm_msr_write(ecx, data) trace_kvm_msr(1, ecx, data, false) -#define trace_kvm_msr_read_ex(ecx) trace_kvm_msr(0, ecx, 0, true) -#define trace_kvm_msr_write_ex(ecx, data) trace_kvm_msr(1, ecx, data, true) - -/* - * Tracepoint for guest CR access. - */ -TRACE_EVENT(kvm_cr, - TP_PROTO(unsigned int rw, unsigned int cr, unsigned long val), - TP_ARGS(rw, cr, val), - - TP_STRUCT__entry( - __field( unsigned int, rw ) - __field( unsigned int, cr ) - __field( unsigned long, val ) - ), - - TP_fast_assign( - __entry->rw = rw; - __entry->cr = cr; - __entry->val = val; - ), - - TP_printk("cr_%s %x = 0x%lx", - __entry->rw ? 
"write" : "read", - __entry->cr, __entry->val) -); - -#define trace_kvm_cr_read(cr, val) trace_kvm_cr(0, cr, val) -#define trace_kvm_cr_write(cr, val) trace_kvm_cr(1, cr, val) - -TRACE_EVENT(kvm_pic_set_irq, - TP_PROTO(__u8 chip, __u8 pin, __u8 elcr, __u8 imr, bool coalesced), - TP_ARGS(chip, pin, elcr, imr, coalesced), - - TP_STRUCT__entry( - __field( __u8, chip ) - __field( __u8, pin ) - __field( __u8, elcr ) - __field( __u8, imr ) - __field( bool, coalesced ) - ), - - TP_fast_assign( - __entry->chip = chip; - __entry->pin = pin; - __entry->elcr = elcr; - __entry->imr = imr; - __entry->coalesced = coalesced; - ), - - TP_printk("chip %u pin %u (%s%s)%s", - __entry->chip, __entry->pin, - (__entry->elcr & (1 << __entry->pin)) ? "level":"edge", - (__entry->imr & (1 << __entry->pin)) ? "|masked":"", - __entry->coalesced ? " (coalesced)" : "") -); - -#define kvm_apic_dst_shorthand \ - {0x0, "dst"}, \ - {0x1, "self"}, \ - {0x2, "all"}, \ - {0x3, "all-but-self"} - -TRACE_EVENT(kvm_apic_ipi, - TP_PROTO(__u32 icr_low, __u32 dest_id), - TP_ARGS(icr_low, dest_id), - - TP_STRUCT__entry( - __field( __u32, icr_low ) - __field( __u32, dest_id ) - ), - - TP_fast_assign( - __entry->icr_low = icr_low; - __entry->dest_id = dest_id; - ), - - TP_printk("dst %x vec %u (%s|%s|%s|%s|%s)", - __entry->dest_id, (u8)__entry->icr_low, - __print_symbolic((__entry->icr_low >> 8 & 0x7), - kvm_deliver_mode), - (__entry->icr_low & (1<<11)) ? "logical" : "physical", - (__entry->icr_low & (1<<14)) ? "assert" : "de-assert", - (__entry->icr_low & (1<<15)) ? "level" : "edge", - __print_symbolic((__entry->icr_low >> 18 & 0x3), - kvm_apic_dst_shorthand)) -); - -TRACE_EVENT(kvm_apic_accept_irq, - TP_PROTO(__u32 apicid, __u16 dm, __u8 tm, __u8 vec), - TP_ARGS(apicid, dm, tm, vec), - - TP_STRUCT__entry( - __field( __u32, apicid ) - __field( __u16, dm ) - __field( __u8, tm ) - __field( __u8, vec ) - ), - - TP_fast_assign( - __entry->apicid = apicid; - __entry->dm = dm; - __entry->tm = tm; - __entry->vec = vec; - ), - - TP_printk("apicid %x vec %u (%s|%s)", - __entry->apicid, __entry->vec, - __print_symbolic((__entry->dm >> 8 & 0x7), kvm_deliver_mode), - __entry->tm ? 
"level" : "edge") -); - -TRACE_EVENT(kvm_eoi, - TP_PROTO(struct kvm_lapic *apic, int vector), - TP_ARGS(apic, vector), - - TP_STRUCT__entry( - __field( __u32, apicid ) - __field( int, vector ) - ), - - TP_fast_assign( - __entry->apicid = apic->vcpu->vcpu_id; - __entry->vector = vector; - ), - - TP_printk("apicid %x vector %d", __entry->apicid, __entry->vector) -); - -TRACE_EVENT(kvm_pv_eoi, - TP_PROTO(struct kvm_lapic *apic, int vector), - TP_ARGS(apic, vector), - - TP_STRUCT__entry( - __field( __u32, apicid ) - __field( int, vector ) - ), - - TP_fast_assign( - __entry->apicid = apic->vcpu->vcpu_id; - __entry->vector = vector; - ), - - TP_printk("apicid %x vector %d", __entry->apicid, __entry->vector) -); - -/* - * Tracepoint for nested VMRUN - */ -TRACE_EVENT(kvm_nested_vmrun, - TP_PROTO(__u64 rip, __u64 vmcb, __u64 nested_rip, __u32 int_ctl, - __u32 event_inj, bool npt), - TP_ARGS(rip, vmcb, nested_rip, int_ctl, event_inj, npt), - - TP_STRUCT__entry( - __field( __u64, rip ) - __field( __u64, vmcb ) - __field( __u64, nested_rip ) - __field( __u32, int_ctl ) - __field( __u32, event_inj ) - __field( bool, npt ) - ), - - TP_fast_assign( - __entry->rip = rip; - __entry->vmcb = vmcb; - __entry->nested_rip = nested_rip; - __entry->int_ctl = int_ctl; - __entry->event_inj = event_inj; - __entry->npt = npt; - ), - - TP_printk("rip: 0x%016llx vmcb: 0x%016llx nrip: 0x%016llx int_ctl: 0x%08x " - "event_inj: 0x%08x npt: %s", - __entry->rip, __entry->vmcb, __entry->nested_rip, - __entry->int_ctl, __entry->event_inj, - __entry->npt ? "on" : "off") -); - -TRACE_EVENT(kvm_nested_intercepts, - TP_PROTO(__u16 cr_read, __u16 cr_write, __u32 exceptions, __u64 intercept), - TP_ARGS(cr_read, cr_write, exceptions, intercept), - - TP_STRUCT__entry( - __field( __u16, cr_read ) - __field( __u16, cr_write ) - __field( __u32, exceptions ) - __field( __u64, intercept ) - ), - - TP_fast_assign( - __entry->cr_read = cr_read; - __entry->cr_write = cr_write; - __entry->exceptions = exceptions; - __entry->intercept = intercept; - ), - - TP_printk("cr_read: %04x cr_write: %04x excp: %08x intercept: %016llx", - __entry->cr_read, __entry->cr_write, __entry->exceptions, - __entry->intercept) -); -/* - * Tracepoint for #VMEXIT while nested - */ -TRACE_EVENT(kvm_nested_vmexit, - TP_PROTO(__u64 rip, __u32 exit_code, - __u64 exit_info1, __u64 exit_info2, - __u32 exit_int_info, __u32 exit_int_info_err, __u32 isa), - TP_ARGS(rip, exit_code, exit_info1, exit_info2, - exit_int_info, exit_int_info_err, isa), - - TP_STRUCT__entry( - __field( __u64, rip ) - __field( __u32, exit_code ) - __field( __u64, exit_info1 ) - __field( __u64, exit_info2 ) - __field( __u32, exit_int_info ) - __field( __u32, exit_int_info_err ) - __field( __u32, isa ) - ), - - TP_fast_assign( - __entry->rip = rip; - __entry->exit_code = exit_code; - __entry->exit_info1 = exit_info1; - __entry->exit_info2 = exit_info2; - __entry->exit_int_info = exit_int_info; - __entry->exit_int_info_err = exit_int_info_err; - __entry->isa = isa; - ), - TP_printk("rip: 0x%016llx reason: %s ext_inf1: 0x%016llx " - "ext_inf2: 0x%016llx ext_int: 0x%08x ext_int_err: 0x%08x", - __entry->rip, - (__entry->isa == KVM_ISA_VMX) ? 
- __print_symbolic(__entry->exit_code, VMX_EXIT_REASONS) : - __print_symbolic(__entry->exit_code, SVM_EXIT_REASONS), - __entry->exit_info1, __entry->exit_info2, - __entry->exit_int_info, __entry->exit_int_info_err) -); - -/* - * Tracepoint for #VMEXIT reinjected to the guest - */ -TRACE_EVENT(kvm_nested_vmexit_inject, - TP_PROTO(__u32 exit_code, - __u64 exit_info1, __u64 exit_info2, - __u32 exit_int_info, __u32 exit_int_info_err, __u32 isa), - TP_ARGS(exit_code, exit_info1, exit_info2, - exit_int_info, exit_int_info_err, isa), - - TP_STRUCT__entry( - __field( __u32, exit_code ) - __field( __u64, exit_info1 ) - __field( __u64, exit_info2 ) - __field( __u32, exit_int_info ) - __field( __u32, exit_int_info_err ) - __field( __u32, isa ) - ), - - TP_fast_assign( - __entry->exit_code = exit_code; - __entry->exit_info1 = exit_info1; - __entry->exit_info2 = exit_info2; - __entry->exit_int_info = exit_int_info; - __entry->exit_int_info_err = exit_int_info_err; - __entry->isa = isa; - ), - - TP_printk("reason: %s ext_inf1: 0x%016llx " - "ext_inf2: 0x%016llx ext_int: 0x%08x ext_int_err: 0x%08x", - (__entry->isa == KVM_ISA_VMX) ? - __print_symbolic(__entry->exit_code, VMX_EXIT_REASONS) : - __print_symbolic(__entry->exit_code, SVM_EXIT_REASONS), - __entry->exit_info1, __entry->exit_info2, - __entry->exit_int_info, __entry->exit_int_info_err) -); - -/* - * Tracepoint for nested #vmexit because of interrupt pending - */ -TRACE_EVENT(kvm_nested_intr_vmexit, - TP_PROTO(__u64 rip), - TP_ARGS(rip), - - TP_STRUCT__entry( - __field( __u64, rip ) - ), - - TP_fast_assign( - __entry->rip = rip - ), - - TP_printk("rip: 0x%016llx", __entry->rip) -); - -/* - * Tracepoint for nested #vmexit because of interrupt pending - */ -TRACE_EVENT(kvm_invlpga, - TP_PROTO(__u64 rip, int asid, u64 address), - TP_ARGS(rip, asid, address), - - TP_STRUCT__entry( - __field( __u64, rip ) - __field( int, asid ) - __field( __u64, address ) - ), - - TP_fast_assign( - __entry->rip = rip; - __entry->asid = asid; - __entry->address = address; - ), - - TP_printk("rip: 0x%016llx asid: %d address: 0x%016llx", - __entry->rip, __entry->asid, __entry->address) -); - -/* - * Tracepoint for nested #vmexit because of interrupt pending - */ -TRACE_EVENT(kvm_skinit, - TP_PROTO(__u64 rip, __u32 slb), - TP_ARGS(rip, slb), - - TP_STRUCT__entry( - __field( __u64, rip ) - __field( __u32, slb ) - ), - - TP_fast_assign( - __entry->rip = rip; - __entry->slb = slb; - ), - - TP_printk("rip: 0x%016llx slb: 0x%08x", - __entry->rip, __entry->slb) -); - -#define KVM_EMUL_INSN_F_CR0_PE (1 << 0) -#define KVM_EMUL_INSN_F_EFL_VM (1 << 1) -#define KVM_EMUL_INSN_F_CS_D (1 << 2) -#define KVM_EMUL_INSN_F_CS_L (1 << 3) - -#define kvm_trace_symbol_emul_flags \ - { 0, "real" }, \ - { KVM_EMUL_INSN_F_CR0_PE \ - | KVM_EMUL_INSN_F_EFL_VM, "vm16" }, \ - { KVM_EMUL_INSN_F_CR0_PE, "prot16" }, \ - { KVM_EMUL_INSN_F_CR0_PE \ - | KVM_EMUL_INSN_F_CS_D, "prot32" }, \ - { KVM_EMUL_INSN_F_CR0_PE \ - | KVM_EMUL_INSN_F_CS_L, "prot64" } - -#define kei_decode_mode(mode) ({ \ - u8 flags = 0xff; \ - switch (mode) { \ - case X86EMUL_MODE_REAL: \ - flags = 0; \ - break; \ - case X86EMUL_MODE_VM86: \ - flags = KVM_EMUL_INSN_F_EFL_VM; \ - break; \ - case X86EMUL_MODE_PROT16: \ - flags = KVM_EMUL_INSN_F_CR0_PE; \ - break; \ - case X86EMUL_MODE_PROT32: \ - flags = KVM_EMUL_INSN_F_CR0_PE \ - | KVM_EMUL_INSN_F_CS_D; \ - break; \ - case X86EMUL_MODE_PROT64: \ - flags = KVM_EMUL_INSN_F_CR0_PE \ - | KVM_EMUL_INSN_F_CS_L; \ - break; \ - } \ - flags; \ - }) - -TRACE_EVENT(kvm_emulate_insn, - TP_PROTO(struct 
kvm_vcpu *vcpu, __u8 failed), - TP_ARGS(vcpu, failed), - - TP_STRUCT__entry( - __field( __u64, rip ) - __field( __u32, csbase ) - __field( __u8, len ) - __array( __u8, insn, 15 ) - __field( __u8, flags ) - __field( __u8, failed ) - ), - - TP_fast_assign( - __entry->csbase = kvm_x86_ops->get_segment_base(vcpu, VCPU_SREG_CS); - __entry->len = vcpu->arch.emulate_ctxt.fetch.ptr - - vcpu->arch.emulate_ctxt.fetch.data; - __entry->rip = vcpu->arch.emulate_ctxt._eip - __entry->len; - memcpy(__entry->insn, - vcpu->arch.emulate_ctxt.fetch.data, - 15); - __entry->flags = kei_decode_mode(vcpu->arch.emulate_ctxt.mode); - __entry->failed = failed; - ), - - TP_printk("%x:%llx:%s (%s)%s", - __entry->csbase, __entry->rip, - __print_hex(__entry->insn, __entry->len), - __print_symbolic(__entry->flags, - kvm_trace_symbol_emul_flags), - __entry->failed ? " failed" : "" - ) - ); - -#define trace_kvm_emulate_insn_start(vcpu) trace_kvm_emulate_insn(vcpu, 0) -#define trace_kvm_emulate_insn_failed(vcpu) trace_kvm_emulate_insn(vcpu, 1) - -TRACE_EVENT( - vcpu_match_mmio, - TP_PROTO(gva_t gva, gpa_t gpa, bool write, bool gpa_match), - TP_ARGS(gva, gpa, write, gpa_match), - - TP_STRUCT__entry( - __field(gva_t, gva) - __field(gpa_t, gpa) - __field(bool, write) - __field(bool, gpa_match) - ), - - TP_fast_assign( - __entry->gva = gva; - __entry->gpa = gpa; - __entry->write = write; - __entry->gpa_match = gpa_match - ), - - TP_printk("gva %#lx gpa %#llx %s %s", __entry->gva, __entry->gpa, - __entry->write ? "Write" : "Read", - __entry->gpa_match ? "GPA" : "GVA") -); - -TRACE_EVENT(kvm_write_tsc_offset, - TP_PROTO(unsigned int vcpu_id, __u64 previous_tsc_offset, - __u64 next_tsc_offset), - TP_ARGS(vcpu_id, previous_tsc_offset, next_tsc_offset), - - TP_STRUCT__entry( - __field( unsigned int, vcpu_id ) - __field( __u64, previous_tsc_offset ) - __field( __u64, next_tsc_offset ) - ), - - TP_fast_assign( - __entry->vcpu_id = vcpu_id; - __entry->previous_tsc_offset = previous_tsc_offset; - __entry->next_tsc_offset = next_tsc_offset; - ), - - TP_printk("vcpu=%u prev=%llu next=%llu", __entry->vcpu_id, - __entry->previous_tsc_offset, __entry->next_tsc_offset) -); - -#ifdef CONFIG_X86_64 - -#define host_clocks \ - {VCLOCK_NONE, "none"}, \ - {VCLOCK_TSC, "tsc"} \ - -TRACE_EVENT(kvm_update_master_clock, - TP_PROTO(bool use_master_clock, unsigned int host_clock, bool offset_matched), - TP_ARGS(use_master_clock, host_clock, offset_matched), - - TP_STRUCT__entry( - __field( bool, use_master_clock ) - __field( unsigned int, host_clock ) - __field( bool, offset_matched ) - ), - - TP_fast_assign( - __entry->use_master_clock = use_master_clock; - __entry->host_clock = host_clock; - __entry->offset_matched = offset_matched; - ), - - TP_printk("masterclock %d hostclock %s offsetmatched %u", - __entry->use_master_clock, - __print_symbolic(__entry->host_clock, host_clocks), - __entry->offset_matched) -); - -TRACE_EVENT(kvm_track_tsc, - TP_PROTO(unsigned int vcpu_id, unsigned int nr_matched, - unsigned int online_vcpus, bool use_master_clock, - unsigned int host_clock), - TP_ARGS(vcpu_id, nr_matched, online_vcpus, use_master_clock, - host_clock), - - TP_STRUCT__entry( - __field( unsigned int, vcpu_id ) - __field( unsigned int, nr_vcpus_matched_tsc ) - __field( unsigned int, online_vcpus ) - __field( bool, use_master_clock ) - __field( unsigned int, host_clock ) - ), - - TP_fast_assign( - __entry->vcpu_id = vcpu_id; - __entry->nr_vcpus_matched_tsc = nr_matched; - __entry->online_vcpus = online_vcpus; - __entry->use_master_clock = use_master_clock; 
- __entry->host_clock = host_clock; - ), - - TP_printk("vcpu_id %u masterclock %u offsetmatched %u nr_online %u" - " hostclock %s", - __entry->vcpu_id, __entry->use_master_clock, - __entry->nr_vcpus_matched_tsc, __entry->online_vcpus, - __print_symbolic(__entry->host_clock, host_clocks)) -); - -#endif /* CONFIG_X86_64 */ - -/* - * Tracepoint for PML full VMEXIT. - */ -TRACE_EVENT(kvm_pml_full, - TP_PROTO(unsigned int vcpu_id), - TP_ARGS(vcpu_id), - - TP_STRUCT__entry( - __field( unsigned int, vcpu_id ) - ), - - TP_fast_assign( - __entry->vcpu_id = vcpu_id; - ), - - TP_printk("vcpu %d: PML full", __entry->vcpu_id) -); - -TRACE_EVENT(kvm_ple_window, - TP_PROTO(bool grow, unsigned int vcpu_id, int new, int old), - TP_ARGS(grow, vcpu_id, new, old), - - TP_STRUCT__entry( - __field( bool, grow ) - __field( unsigned int, vcpu_id ) - __field( int, new ) - __field( int, old ) - ), - - TP_fast_assign( - __entry->grow = grow; - __entry->vcpu_id = vcpu_id; - __entry->new = new; - __entry->old = old; - ), - - TP_printk("vcpu %u: ple_window %d (%s %d)", - __entry->vcpu_id, - __entry->new, - __entry->grow ? "grow" : "shrink", - __entry->old) -); - -#define trace_kvm_ple_window_grow(vcpu_id, new, old) \ - trace_kvm_ple_window(true, vcpu_id, new, old) -#define trace_kvm_ple_window_shrink(vcpu_id, new, old) \ - trace_kvm_ple_window(false, vcpu_id, new, old) - -TRACE_EVENT(kvm_pvclock_update, - TP_PROTO(unsigned int vcpu_id, struct pvclock_vcpu_time_info *pvclock), - TP_ARGS(vcpu_id, pvclock), - - TP_STRUCT__entry( - __field( unsigned int, vcpu_id ) - __field( __u32, version ) - __field( __u64, tsc_timestamp ) - __field( __u64, system_time ) - __field( __u32, tsc_to_system_mul ) - __field( __s8, tsc_shift ) - __field( __u8, flags ) - ), - - TP_fast_assign( - __entry->vcpu_id = vcpu_id; - __entry->version = pvclock->version; - __entry->tsc_timestamp = pvclock->tsc_timestamp; - __entry->system_time = pvclock->system_time; - __entry->tsc_to_system_mul = pvclock->tsc_to_system_mul; - __entry->tsc_shift = pvclock->tsc_shift; - __entry->flags = pvclock->flags; - ), - - TP_printk("vcpu_id %u, pvclock { version %u, tsc_timestamp 0x%llx, " - "system_time 0x%llx, tsc_to_system_mul 0x%x, tsc_shift %d, " - "flags 0x%x }", - __entry->vcpu_id, - __entry->version, - __entry->tsc_timestamp, - __entry->system_time, - __entry->tsc_to_system_mul, - __entry->tsc_shift, - __entry->flags) -); - -TRACE_EVENT(kvm_wait_lapic_expire, - TP_PROTO(unsigned int vcpu_id, s64 delta), - TP_ARGS(vcpu_id, delta), - - TP_STRUCT__entry( - __field( unsigned int, vcpu_id ) - __field( s64, delta ) - ), - - TP_fast_assign( - __entry->vcpu_id = vcpu_id; - __entry->delta = delta; - ), - - TP_printk("vcpu %u: delta %lld (%s)", - __entry->vcpu_id, - __entry->delta, - __entry->delta < 0 ? "early" : "late") -); - -TRACE_EVENT(kvm_enter_smm, - TP_PROTO(unsigned int vcpu_id, u64 smbase, bool entering), - TP_ARGS(vcpu_id, smbase, entering), - - TP_STRUCT__entry( - __field( unsigned int, vcpu_id ) - __field( u64, smbase ) - __field( bool, entering ) - ), - - TP_fast_assign( - __entry->vcpu_id = vcpu_id; - __entry->smbase = smbase; - __entry->entering = entering; - ), - - TP_printk("vcpu %u: %s SMM, smbase 0x%llx", - __entry->vcpu_id, - __entry->entering ? "entering" : "leaving", - __entry->smbase) -); - -/* - * Tracepoint for VT-d posted-interrupts. 
- */ -TRACE_EVENT(kvm_pi_irte_update, - TP_PROTO(unsigned int host_irq, unsigned int vcpu_id, - unsigned int gsi, unsigned int gvec, - u64 pi_desc_addr, bool set), - TP_ARGS(host_irq, vcpu_id, gsi, gvec, pi_desc_addr, set), - - TP_STRUCT__entry( - __field( unsigned int, host_irq ) - __field( unsigned int, vcpu_id ) - __field( unsigned int, gsi ) - __field( unsigned int, gvec ) - __field( u64, pi_desc_addr ) - __field( bool, set ) - ), - - TP_fast_assign( - __entry->host_irq = host_irq; - __entry->vcpu_id = vcpu_id; - __entry->gsi = gsi; - __entry->gvec = gvec; - __entry->pi_desc_addr = pi_desc_addr; - __entry->set = set; - ), - - TP_printk("VT-d PI is %s for irq %u, vcpu %u, gsi: 0x%x, " - "gvec: 0x%x, pi_desc_addr: 0x%llx", - __entry->set ? "enabled and being updated" : "disabled", - __entry->host_irq, - __entry->vcpu_id, - __entry->gsi, - __entry->gvec, - __entry->pi_desc_addr) -); - -/* - * Tracepoint for kvm_hv_notify_acked_sint. - */ -TRACE_EVENT(kvm_hv_notify_acked_sint, - TP_PROTO(int vcpu_id, u32 sint), - TP_ARGS(vcpu_id, sint), - - TP_STRUCT__entry( - __field(int, vcpu_id) - __field(u32, sint) - ), - - TP_fast_assign( - __entry->vcpu_id = vcpu_id; - __entry->sint = sint; - ), - - TP_printk("vcpu_id %d sint %u", __entry->vcpu_id, __entry->sint) -); - -/* - * Tracepoint for synic_set_irq. - */ -TRACE_EVENT(kvm_hv_synic_set_irq, - TP_PROTO(int vcpu_id, u32 sint, int vector, int ret), - TP_ARGS(vcpu_id, sint, vector, ret), - - TP_STRUCT__entry( - __field(int, vcpu_id) - __field(u32, sint) - __field(int, vector) - __field(int, ret) - ), - - TP_fast_assign( - __entry->vcpu_id = vcpu_id; - __entry->sint = sint; - __entry->vector = vector; - __entry->ret = ret; - ), - - TP_printk("vcpu_id %d sint %u vector %d ret %d", - __entry->vcpu_id, __entry->sint, __entry->vector, - __entry->ret) -); - -/* - * Tracepoint for kvm_hv_synic_send_eoi. - */ -TRACE_EVENT(kvm_hv_synic_send_eoi, - TP_PROTO(int vcpu_id, int vector), - TP_ARGS(vcpu_id, vector), - - TP_STRUCT__entry( - __field(int, vcpu_id) - __field(u32, sint) - __field(int, vector) - __field(int, ret) - ), - - TP_fast_assign( - __entry->vcpu_id = vcpu_id; - __entry->vector = vector; - ), - - TP_printk("vcpu_id %d vector %d", __entry->vcpu_id, __entry->vector) -); - -/* - * Tracepoint for synic_set_msr. - */ -TRACE_EVENT(kvm_hv_synic_set_msr, - TP_PROTO(int vcpu_id, u32 msr, u64 data, bool host), - TP_ARGS(vcpu_id, msr, data, host), - - TP_STRUCT__entry( - __field(int, vcpu_id) - __field(u32, msr) - __field(u64, data) - __field(bool, host) - ), - - TP_fast_assign( - __entry->vcpu_id = vcpu_id; - __entry->msr = msr; - __entry->data = data; - __entry->host = host - ), - - TP_printk("vcpu_id %d msr 0x%x data 0x%llx host %d", - __entry->vcpu_id, __entry->msr, __entry->data, __entry->host) -); - -/* - * Tracepoint for stimer_set_config. - */ -TRACE_EVENT(kvm_hv_stimer_set_config, - TP_PROTO(int vcpu_id, int timer_index, u64 config, bool host), - TP_ARGS(vcpu_id, timer_index, config, host), - - TP_STRUCT__entry( - __field(int, vcpu_id) - __field(int, timer_index) - __field(u64, config) - __field(bool, host) - ), - - TP_fast_assign( - __entry->vcpu_id = vcpu_id; - __entry->timer_index = timer_index; - __entry->config = config; - __entry->host = host; - ), - - TP_printk("vcpu_id %d timer %d config 0x%llx host %d", - __entry->vcpu_id, __entry->timer_index, __entry->config, - __entry->host) -); - -/* - * Tracepoint for stimer_set_count. 
- */ -TRACE_EVENT(kvm_hv_stimer_set_count, - TP_PROTO(int vcpu_id, int timer_index, u64 count, bool host), - TP_ARGS(vcpu_id, timer_index, count, host), - - TP_STRUCT__entry( - __field(int, vcpu_id) - __field(int, timer_index) - __field(u64, count) - __field(bool, host) - ), - - TP_fast_assign( - __entry->vcpu_id = vcpu_id; - __entry->timer_index = timer_index; - __entry->count = count; - __entry->host = host; - ), - - TP_printk("vcpu_id %d timer %d count %llu host %d", - __entry->vcpu_id, __entry->timer_index, __entry->count, - __entry->host) -); - -/* - * Tracepoint for stimer_start(periodic timer case). - */ -TRACE_EVENT(kvm_hv_stimer_start_periodic, - TP_PROTO(int vcpu_id, int timer_index, u64 time_now, u64 exp_time), - TP_ARGS(vcpu_id, timer_index, time_now, exp_time), - - TP_STRUCT__entry( - __field(int, vcpu_id) - __field(int, timer_index) - __field(u64, time_now) - __field(u64, exp_time) - ), - - TP_fast_assign( - __entry->vcpu_id = vcpu_id; - __entry->timer_index = timer_index; - __entry->time_now = time_now; - __entry->exp_time = exp_time; - ), - - TP_printk("vcpu_id %d timer %d time_now %llu exp_time %llu", - __entry->vcpu_id, __entry->timer_index, __entry->time_now, - __entry->exp_time) -); - -/* - * Tracepoint for stimer_start(one-shot timer case). - */ -TRACE_EVENT(kvm_hv_stimer_start_one_shot, - TP_PROTO(int vcpu_id, int timer_index, u64 time_now, u64 count), - TP_ARGS(vcpu_id, timer_index, time_now, count), - - TP_STRUCT__entry( - __field(int, vcpu_id) - __field(int, timer_index) - __field(u64, time_now) - __field(u64, count) - ), - - TP_fast_assign( - __entry->vcpu_id = vcpu_id; - __entry->timer_index = timer_index; - __entry->time_now = time_now; - __entry->count = count; - ), - - TP_printk("vcpu_id %d timer %d time_now %llu count %llu", - __entry->vcpu_id, __entry->timer_index, __entry->time_now, - __entry->count) -); - -/* - * Tracepoint for stimer_timer_callback. - */ -TRACE_EVENT(kvm_hv_stimer_callback, - TP_PROTO(int vcpu_id, int timer_index), - TP_ARGS(vcpu_id, timer_index), - - TP_STRUCT__entry( - __field(int, vcpu_id) - __field(int, timer_index) - ), - - TP_fast_assign( - __entry->vcpu_id = vcpu_id; - __entry->timer_index = timer_index; - ), - - TP_printk("vcpu_id %d timer %d", - __entry->vcpu_id, __entry->timer_index) -); - -/* - * Tracepoint for stimer_expiration. - */ -TRACE_EVENT(kvm_hv_stimer_expiration, - TP_PROTO(int vcpu_id, int timer_index, int msg_send_result), - TP_ARGS(vcpu_id, timer_index, msg_send_result), - - TP_STRUCT__entry( - __field(int, vcpu_id) - __field(int, timer_index) - __field(int, msg_send_result) - ), - - TP_fast_assign( - __entry->vcpu_id = vcpu_id; - __entry->timer_index = timer_index; - __entry->msg_send_result = msg_send_result; - ), - - TP_printk("vcpu_id %d timer %d msg send result %d", - __entry->vcpu_id, __entry->timer_index, - __entry->msg_send_result) -); - -/* - * Tracepoint for stimer_cleanup. 
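All of arch/x86/kvm/trace.h is deleted because the Linux tracepoint infrastructure (linux/tracepoint.h, TRACE_EVENT) does not exist on Windows; the corresponding trace_kvm_*() call sites are simply removed in the .c hunks above rather than stubbed. For comparison only, and not what this port does, the usual alternative when a tracing backend is unavailable is to compile the call sites away with empty macros:

    /* Not used by this driver; shown only as the conventional no-op stub. */
    #define trace_kvm_exit(reason, vcpu, isa)   do { } while (0)
    #define trace_kvm_msr_read(ecx, data)       do { } while (0)
    #define trace_kvm_msr_write(ecx, data)      do { } while (0)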
- */ -TRACE_EVENT(kvm_hv_stimer_cleanup, - TP_PROTO(int vcpu_id, int timer_index), - TP_ARGS(vcpu_id, timer_index), - - TP_STRUCT__entry( - __field(int, vcpu_id) - __field(int, timer_index) - ), - - TP_fast_assign( - __entry->vcpu_id = vcpu_id; - __entry->timer_index = timer_index; - ), - - TP_printk("vcpu_id %d timer %d", - __entry->vcpu_id, __entry->timer_index) -); - -/* - * Tracepoint for AMD AVIC - */ -TRACE_EVENT(kvm_avic_incomplete_ipi, - TP_PROTO(u32 vcpu, u32 icrh, u32 icrl, u32 id, u32 index), - TP_ARGS(vcpu, icrh, icrl, id, index), - - TP_STRUCT__entry( - __field(u32, vcpu) - __field(u32, icrh) - __field(u32, icrl) - __field(u32, id) - __field(u32, index) - ), - - TP_fast_assign( - __entry->vcpu = vcpu; - __entry->icrh = icrh; - __entry->icrl = icrl; - __entry->id = id; - __entry->index = index; - ), - - TP_printk("vcpu=%u, icrh:icrl=%#010x:%08x, id=%u, index=%u\n", - __entry->vcpu, __entry->icrh, __entry->icrl, - __entry->id, __entry->index) -); - -TRACE_EVENT(kvm_avic_unaccelerated_access, - TP_PROTO(u32 vcpu, u32 offset, bool ft, bool rw, u32 vec), - TP_ARGS(vcpu, offset, ft, rw, vec), - - TP_STRUCT__entry( - __field(u32, vcpu) - __field(u32, offset) - __field(bool, ft) - __field(bool, rw) - __field(u32, vec) - ), - - TP_fast_assign( - __entry->vcpu = vcpu; - __entry->offset = offset; - __entry->ft = ft; - __entry->rw = rw; - __entry->vec = vec; - ), - - TP_printk("vcpu=%u, offset=%#x(%s), %s, %s, vec=%#x\n", - __entry->vcpu, - __entry->offset, - __print_symbolic(__entry->offset, kvm_trace_symbol_apic), - __entry->ft ? "trap" : "fault", - __entry->rw ? "write" : "read", - __entry->vec) -); - -TRACE_EVENT(kvm_hv_timer_state, - TP_PROTO(unsigned int vcpu_id, unsigned int hv_timer_in_use), - TP_ARGS(vcpu_id, hv_timer_in_use), - TP_STRUCT__entry( - __field(unsigned int, vcpu_id) - __field(unsigned int, hv_timer_in_use) - ), - TP_fast_assign( - __entry->vcpu_id = vcpu_id; - __entry->hv_timer_in_use = hv_timer_in_use; - ), - TP_printk("vcpu_id %x hv_timer %x\n", - __entry->vcpu_id, - __entry->hv_timer_in_use) -); -#endif /* _TRACE_KVM_H */ - -#undef TRACE_INCLUDE_PATH -#define TRACE_INCLUDE_PATH arch/x86/kvm -#undef TRACE_INCLUDE_FILE -#define TRACE_INCLUDE_FILE trace - -/* This part must be outside protection */ -#include <trace/define_trace.h> diff --git a/arch/x86/kvm/tss.h b/arch/x86/kvm/tss.h index 622aa10..622aa10 100644..100755 --- a/arch/x86/kvm/tss.h +++ b/arch/x86/kvm/tss.h diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 5382b82..4de8486 100644..100755 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -6,6 +6,7 @@ * * Copyright (C) 2006 Qumranet, Inc. * Copyright 2010 Red Hat, Inc. and/or its affiliates. 
+ * Copyright 2019 Google LLC * * Authors: * Avi Kivity <avi@qumranet.com> @@ -22,639 +23,67 @@ #include "lapic.h" #include <linux/kvm_host.h> -#include <linux/module.h> -#include <linux/kernel.h> -#include <linux/mm.h> -#include <linux/highmem.h> -#include <linux/sched.h> -#include <linux/moduleparam.h> -#include <linux/mod_devicetable.h> -#include <linux/trace_events.h> -#include <linux/slab.h> -#include <linux/tboot.h> -#include <linux/hrtimer.h> +#include <linux/list.h> +#include <ntkrutils.h> +#include <__asm.h> #include "kvm_cache_regs.h" #include "x86.h" - -#include <asm/cpu.h> -#include <asm/io.h> -#include <asm/desc.h> #include <asm/vmx.h> -#include <asm/virtext.h> -#include <asm/mce.h> -#include <asm/fpu/internal.h> -#include <asm/perf_event.h> -#include <asm/debugreg.h> -#include <asm/kexec.h> -#include <asm/apic.h> -#include <asm/irq_remapping.h> - -#include "trace.h" -#include "pmu.h" -#define __ex(x) __kvm_handle_fault_on_reboot(x) -#define __ex_clear(x, reg) \ - ____kvm_handle_fault_on_reboot(x, "xor " reg " , " reg) +#include "pmu.h" +// seperate struct definitions to vmx_def.h so that asmgen can include +#include "vmx_def.h" -MODULE_AUTHOR("Qumranet"); -MODULE_LICENSE("GPL"); +#pragma warning(disable : 4146) +#pragma warning(disable : 4127) +#pragma warning(disable : 4334) -static const struct x86_cpu_id vmx_cpu_id[] = { - X86_FEATURE_MATCH(X86_FEATURE_VMX), - {} -}; -MODULE_DEVICE_TABLE(x86cpu, vmx_cpu_id); +#define DR6_RESERVED (0xFFFF0FF0) -static bool __read_mostly enable_vpid = 1; -module_param_named(vpid, enable_vpid, bool, 0444); +static bool enable_vpid = 0; -static bool __read_mostly flexpriority_enabled = 1; -module_param_named(flexpriority, flexpriority_enabled, bool, S_IRUGO); +static bool flexpriority_enabled = 1; -static bool __read_mostly enable_ept = 1; -module_param_named(ept, enable_ept, bool, S_IRUGO); +static bool enable_ept = 1; -static bool __read_mostly enable_unrestricted_guest = 1; -module_param_named(unrestricted_guest, - enable_unrestricted_guest, bool, S_IRUGO); +static bool enable_unrestricted_guest = 1; -static bool __read_mostly enable_ept_ad_bits = 1; -module_param_named(eptad, enable_ept_ad_bits, bool, S_IRUGO); +static bool enable_ept_ad_bits = 1; -static bool __read_mostly emulate_invalid_guest_state = true; -module_param(emulate_invalid_guest_state, bool, S_IRUGO); +static bool emulate_invalid_guest_state = true; -static bool __read_mostly vmm_exclusive = 1; -module_param(vmm_exclusive, bool, S_IRUGO); +static bool vmm_exclusive = 1; -static bool __read_mostly fasteoi = 1; -module_param(fasteoi, bool, S_IRUGO); +static bool fasteoi = 1; -static bool __read_mostly enable_apicv = 1; -module_param(enable_apicv, bool, S_IRUGO); +static bool enable_apicv = 1; -static bool __read_mostly enable_shadow_vmcs = 1; -module_param_named(enable_shadow_vmcs, enable_shadow_vmcs, bool, S_IRUGO); +static bool enable_shadow_vmcs = 0; /* * If nested=1, nested virtualization is supported, i.e., guests may use * VMX and be a hypervisor for its own guests. If nested=0, guests may not * use VMX instructions. */ -static bool __read_mostly nested = 0; -module_param(nested, bool, S_IRUGO); - -static u64 __read_mostly host_xss; - -static bool __read_mostly enable_pml = 1; -module_param_named(pml, enable_pml, bool, S_IRUGO); - -#define KVM_VMX_TSC_MULTIPLIER_MAX 0xffffffffffffffffULL - -/* Guest_tsc -> host_tsc conversion requires 64-bit division. 
*/ -static int __read_mostly cpu_preemption_timer_multi; -static bool __read_mostly enable_preemption_timer = 1; -#ifdef CONFIG_X86_64 -module_param_named(preemption_timer, enable_preemption_timer, bool, S_IRUGO); -#endif - -#define KVM_GUEST_CR0_MASK (X86_CR0_NW | X86_CR0_CD) -#define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST (X86_CR0_WP | X86_CR0_NE) -#define KVM_VM_CR0_ALWAYS_ON \ - (KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE) -#define KVM_CR4_GUEST_OWNED_BITS \ - (X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR \ - | X86_CR4_OSXMMEXCPT | X86_CR4_TSD) - -#define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE) -#define KVM_RMODE_VM_CR4_ALWAYS_ON (X86_CR4_VME | X86_CR4_PAE | X86_CR4_VMXE) - -#define RMODE_GUEST_OWNED_EFLAGS_BITS (~(X86_EFLAGS_IOPL | X86_EFLAGS_VM)) - -#define VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE 5 - -/* - * These 2 parameters are used to config the controls for Pause-Loop Exiting: - * ple_gap: upper bound on the amount of time between two successive - * executions of PAUSE in a loop. Also indicate if ple enabled. - * According to test, this time is usually smaller than 128 cycles. - * ple_window: upper bound on the amount of time a guest is allowed to execute - * in a PAUSE loop. Tests indicate that most spinlocks are held for - * less than 2^12 cycles - * Time is measured based on a counter that runs at the same rate as the TSC, - * refer SDM volume 3b section 21.6.13 & 22.1.3. - */ -#define KVM_VMX_DEFAULT_PLE_GAP 128 -#define KVM_VMX_DEFAULT_PLE_WINDOW 4096 -#define KVM_VMX_DEFAULT_PLE_WINDOW_GROW 2 -#define KVM_VMX_DEFAULT_PLE_WINDOW_SHRINK 0 -#define KVM_VMX_DEFAULT_PLE_WINDOW_MAX \ - INT_MAX / KVM_VMX_DEFAULT_PLE_WINDOW_GROW - -static int ple_gap = KVM_VMX_DEFAULT_PLE_GAP; -module_param(ple_gap, int, S_IRUGO); - -static int ple_window = KVM_VMX_DEFAULT_PLE_WINDOW; -module_param(ple_window, int, S_IRUGO); - -/* Default doubles per-vcpu window every exit. */ -static int ple_window_grow = KVM_VMX_DEFAULT_PLE_WINDOW_GROW; -module_param(ple_window_grow, int, S_IRUGO); - -/* Default resets per-vcpu window every exit to ple_window. */ -static int ple_window_shrink = KVM_VMX_DEFAULT_PLE_WINDOW_SHRINK; -module_param(ple_window_shrink, int, S_IRUGO); - -/* Default is to compute the maximum so we can never overflow. */ -static int ple_window_actual_max = KVM_VMX_DEFAULT_PLE_WINDOW_MAX; -static int ple_window_max = KVM_VMX_DEFAULT_PLE_WINDOW_MAX; -module_param(ple_window_max, int, S_IRUGO); - -extern const ulong vmx_return; - -#define NR_AUTOLOAD_MSRS 8 -#define VMCS02_POOL_SIZE 1 - -struct vmcs { - u32 revision_id; - u32 abort; - char data[0]; -}; - -/* - * Track a VMCS that may be loaded on a certain CPU. If it is (cpu!=-1), also - * remember whether it was VMLAUNCHed, and maintain a linked list of all VMCSs - * loaded on this CPU (so we can clear them if the CPU goes down). - */ -struct loaded_vmcs { - struct vmcs *vmcs; - struct vmcs *shadow_vmcs; - int cpu; - int launched; - struct list_head loaded_vmcss_on_cpu_link; -}; - -struct shared_msr_entry { - unsigned index; - u64 data; - u64 mask; -}; - -/* - * struct vmcs12 describes the state that our guest hypervisor (L1) keeps for a - * single nested guest (L2), hence the name vmcs12. Any VMX implementation has - * a VMCS structure, and vmcs12 is our emulated VMX's VMCS. This structure is - * stored in guest memory specified by VMPTRLD, but is opaque to the guest, - * which must access it using VMREAD/VMWRITE/VMCLEAR instructions. 
- * More than one of these structures may exist, if L1 runs multiple L2 guests. - * nested_vmx_run() will use the data here to build a vmcs02: a VMCS for the - * underlying hardware which will be used to run L2. - * This structure is packed to ensure that its layout is identical across - * machines (necessary for live migration). - * If there are changes in this struct, VMCS12_REVISION must be changed. - */ -typedef u64 natural_width; -struct __packed vmcs12 { - /* According to the Intel spec, a VMCS region must start with the - * following two fields. Then follow implementation-specific data. - */ - u32 revision_id; - u32 abort; - - u32 launch_state; /* set to 0 by VMCLEAR, to 1 by VMLAUNCH */ - u32 padding[7]; /* room for future expansion */ - - u64 io_bitmap_a; - u64 io_bitmap_b; - u64 msr_bitmap; - u64 vm_exit_msr_store_addr; - u64 vm_exit_msr_load_addr; - u64 vm_entry_msr_load_addr; - u64 tsc_offset; - u64 virtual_apic_page_addr; - u64 apic_access_addr; - u64 posted_intr_desc_addr; - u64 ept_pointer; - u64 eoi_exit_bitmap0; - u64 eoi_exit_bitmap1; - u64 eoi_exit_bitmap2; - u64 eoi_exit_bitmap3; - u64 xss_exit_bitmap; - u64 guest_physical_address; - u64 vmcs_link_pointer; - u64 guest_ia32_debugctl; - u64 guest_ia32_pat; - u64 guest_ia32_efer; - u64 guest_ia32_perf_global_ctrl; - u64 guest_pdptr0; - u64 guest_pdptr1; - u64 guest_pdptr2; - u64 guest_pdptr3; - u64 guest_bndcfgs; - u64 host_ia32_pat; - u64 host_ia32_efer; - u64 host_ia32_perf_global_ctrl; - u64 padding64[8]; /* room for future expansion */ - /* - * To allow migration of L1 (complete with its L2 guests) between - * machines of different natural widths (32 or 64 bit), we cannot have - * unsigned long fields with no explict size. We use u64 (aliased - * natural_width) instead. Luckily, x86 is little-endian. 
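The little-endian remark above is what makes the u64 aliasing safe: a 32-bit hypervisor that writes a natural-width field only touches the low half of the u64, which is exactly where a 64-bit reader of the same field looks. A tiny stand-alone illustration with a hypothetical value:

    union { u64 wide; u32 narrow[2]; } f = { 0 };
    f.narrow[0] = 0x12345678;    /* what a 32-bit hypervisor stores */
    /* on x86, f.wide now reads back as 0x0000000012345678 */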
- */ - natural_width cr0_guest_host_mask; - natural_width cr4_guest_host_mask; - natural_width cr0_read_shadow; - natural_width cr4_read_shadow; - natural_width cr3_target_value0; - natural_width cr3_target_value1; - natural_width cr3_target_value2; - natural_width cr3_target_value3; - natural_width exit_qualification; - natural_width guest_linear_address; - natural_width guest_cr0; - natural_width guest_cr3; - natural_width guest_cr4; - natural_width guest_es_base; - natural_width guest_cs_base; - natural_width guest_ss_base; - natural_width guest_ds_base; - natural_width guest_fs_base; - natural_width guest_gs_base; - natural_width guest_ldtr_base; - natural_width guest_tr_base; - natural_width guest_gdtr_base; - natural_width guest_idtr_base; - natural_width guest_dr7; - natural_width guest_rsp; - natural_width guest_rip; - natural_width guest_rflags; - natural_width guest_pending_dbg_exceptions; - natural_width guest_sysenter_esp; - natural_width guest_sysenter_eip; - natural_width host_cr0; - natural_width host_cr3; - natural_width host_cr4; - natural_width host_fs_base; - natural_width host_gs_base; - natural_width host_tr_base; - natural_width host_gdtr_base; - natural_width host_idtr_base; - natural_width host_ia32_sysenter_esp; - natural_width host_ia32_sysenter_eip; - natural_width host_rsp; - natural_width host_rip; - natural_width paddingl[8]; /* room for future expansion */ - u32 pin_based_vm_exec_control; - u32 cpu_based_vm_exec_control; - u32 exception_bitmap; - u32 page_fault_error_code_mask; - u32 page_fault_error_code_match; - u32 cr3_target_count; - u32 vm_exit_controls; - u32 vm_exit_msr_store_count; - u32 vm_exit_msr_load_count; - u32 vm_entry_controls; - u32 vm_entry_msr_load_count; - u32 vm_entry_intr_info_field; - u32 vm_entry_exception_error_code; - u32 vm_entry_instruction_len; - u32 tpr_threshold; - u32 secondary_vm_exec_control; - u32 vm_instruction_error; - u32 vm_exit_reason; - u32 vm_exit_intr_info; - u32 vm_exit_intr_error_code; - u32 idt_vectoring_info_field; - u32 idt_vectoring_error_code; - u32 vm_exit_instruction_len; - u32 vmx_instruction_info; - u32 guest_es_limit; - u32 guest_cs_limit; - u32 guest_ss_limit; - u32 guest_ds_limit; - u32 guest_fs_limit; - u32 guest_gs_limit; - u32 guest_ldtr_limit; - u32 guest_tr_limit; - u32 guest_gdtr_limit; - u32 guest_idtr_limit; - u32 guest_es_ar_bytes; - u32 guest_cs_ar_bytes; - u32 guest_ss_ar_bytes; - u32 guest_ds_ar_bytes; - u32 guest_fs_ar_bytes; - u32 guest_gs_ar_bytes; - u32 guest_ldtr_ar_bytes; - u32 guest_tr_ar_bytes; - u32 guest_interruptibility_info; - u32 guest_activity_state; - u32 guest_sysenter_cs; - u32 host_ia32_sysenter_cs; - u32 vmx_preemption_timer_value; - u32 padding32[7]; /* room for future expansion */ - u16 virtual_processor_id; - u16 posted_intr_nv; - u16 guest_es_selector; - u16 guest_cs_selector; - u16 guest_ss_selector; - u16 guest_ds_selector; - u16 guest_fs_selector; - u16 guest_gs_selector; - u16 guest_ldtr_selector; - u16 guest_tr_selector; - u16 guest_intr_status; - u16 host_es_selector; - u16 host_cs_selector; - u16 host_ss_selector; - u16 host_ds_selector; - u16 host_fs_selector; - u16 host_gs_selector; - u16 host_tr_selector; -}; - -/* - * VMCS12_REVISION is an arbitrary id that should be changed if the content or - * layout of struct vmcs12 is changed. MSR_IA32_VMX_BASIC returns this id, and - * VMPTRLD verifies that the VMCS region that L1 is loading contains this id. 
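VMCS12_REVISION, defined just below, is what the nested VMPTRLD emulation compares against the revision_id at the start of the guest-supplied region; on a mismatch the instruction fails instead of loading the region. A simplified sketch of that check (the real handler lives later in this file):

    if (new_vmcs12->revision_id != VMCS12_REVISION) {
        /* reject the region and report VMfailValid to L1 */
        nested_vmx_failValid(vcpu,
            VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID);
        skip_emulated_instruction(vcpu);
        return 1;
    }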
- */ -#define VMCS12_REVISION 0x11e57ed0 +static bool nested = 0; -/* - * VMCS12_SIZE is the number of bytes L1 should allocate for the VMXON region - * and any VMCS region. Although only sizeof(struct vmcs12) are used by the - * current implementation, 4K are reserved to avoid future complications. - */ -#define VMCS12_SIZE 0x1000 +static u64 host_xss; -/* Used to remember the last vmcs02 used for some recently used vmcs12s */ -struct vmcs02_list { - struct list_head list; - gpa_t vmptr; - struct loaded_vmcs vmcs02; -}; - -/* - * The nested_vmx structure is part of vcpu_vmx, and holds information we need - * for correct emulation of VMX (i.e., nested VMX) on this vcpu. - */ -struct nested_vmx { - /* Has the level1 guest done vmxon? */ - bool vmxon; - gpa_t vmxon_ptr; - - /* The guest-physical address of the current VMCS L1 keeps for L2 */ - gpa_t current_vmptr; - /* The host-usable pointer to the above */ - struct page *current_vmcs12_page; - struct vmcs12 *current_vmcs12; - /* - * Cache of the guest's VMCS, existing outside of guest memory. - * Loaded from guest memory during VMPTRLD. Flushed to guest - * memory during VMXOFF, VMCLEAR, VMPTRLD. - */ - struct vmcs12 *cached_vmcs12; - /* - * Indicates if the shadow vmcs must be updated with the - * data hold by vmcs12 - */ - bool sync_shadow_vmcs; - - /* vmcs02_list cache of VMCSs recently used to run L2 guests */ - struct list_head vmcs02_pool; - int vmcs02_num; - bool change_vmcs01_virtual_x2apic_mode; - /* L2 must run next, and mustn't decide to exit to L1. */ - bool nested_run_pending; - /* - * Guest pages referred to in vmcs02 with host-physical pointers, so - * we must keep them pinned while L2 runs. - */ - struct page *apic_access_page; - struct page *virtual_apic_page; - struct page *pi_desc_page; - struct pi_desc *pi_desc; - bool pi_pending; - u16 posted_intr_nv; - - unsigned long *msr_bitmap; - - struct hrtimer preemption_timer; - bool preemption_timer_expired; - - /* to migrate it to L2 if VM_ENTRY_LOAD_DEBUG_CONTROLS is off */ - u64 vmcs01_debugctl; - - u16 vpid02; - u16 last_vpid; - - u32 nested_vmx_procbased_ctls_low; - u32 nested_vmx_procbased_ctls_high; - u32 nested_vmx_true_procbased_ctls_low; - u32 nested_vmx_secondary_ctls_low; - u32 nested_vmx_secondary_ctls_high; - u32 nested_vmx_pinbased_ctls_low; - u32 nested_vmx_pinbased_ctls_high; - u32 nested_vmx_exit_ctls_low; - u32 nested_vmx_exit_ctls_high; - u32 nested_vmx_true_exit_ctls_low; - u32 nested_vmx_entry_ctls_low; - u32 nested_vmx_entry_ctls_high; - u32 nested_vmx_true_entry_ctls_low; - u32 nested_vmx_misc_low; - u32 nested_vmx_misc_high; - u32 nested_vmx_ept_caps; - u32 nested_vmx_vpid_caps; -}; - -#define POSTED_INTR_ON 0 -#define POSTED_INTR_SN 1 - -/* Posted-Interrupt Descriptor */ -struct pi_desc { - u32 pir[8]; /* Posted interrupt requested */ - union { - struct { - /* bit 256 - Outstanding Notification */ - u16 on : 1, - /* bit 257 - Suppress Notification */ - sn : 1, - /* bit 271:258 - Reserved */ - rsvd_1 : 14; - /* bit 279:272 - Notification Vector */ - u8 nv; - /* bit 287:280 - Reserved */ - u8 rsvd_2; - /* bit 319:288 - Notification Destination */ - u32 ndst; - }; - u64 control; - }; - u32 rsvd[6]; -} __aligned(64); - -static bool pi_test_and_set_on(struct pi_desc *pi_desc) -{ - return test_and_set_bit(POSTED_INTR_ON, - (unsigned long *)&pi_desc->control); -} - -static bool pi_test_and_clear_on(struct pi_desc *pi_desc) -{ - return test_and_clear_bit(POSTED_INTR_ON, - (unsigned long *)&pi_desc->control); -} - -static int pi_test_and_set_pir(int vector, 
struct pi_desc *pi_desc) -{ - return test_and_set_bit(vector, (unsigned long *)pi_desc->pir); -} +static bool enable_pml = 0; -static inline void pi_clear_sn(struct pi_desc *pi_desc) -{ - return clear_bit(POSTED_INTR_SN, - (unsigned long *)&pi_desc->control); -} - -static inline void pi_set_sn(struct pi_desc *pi_desc) -{ - return set_bit(POSTED_INTR_SN, - (unsigned long *)&pi_desc->control); -} - -static inline int pi_test_on(struct pi_desc *pi_desc) -{ - return test_bit(POSTED_INTR_ON, - (unsigned long *)&pi_desc->control); -} - -static inline int pi_test_sn(struct pi_desc *pi_desc) -{ - return test_bit(POSTED_INTR_SN, - (unsigned long *)&pi_desc->control); -} - -struct vcpu_vmx { - struct kvm_vcpu vcpu; - unsigned long host_rsp; - u8 fail; - bool nmi_known_unmasked; - u32 exit_intr_info; - u32 idt_vectoring_info; - ulong rflags; - struct shared_msr_entry *guest_msrs; - int nmsrs; - int save_nmsrs; - unsigned long host_idt_base; -#ifdef CONFIG_X86_64 - u64 msr_host_kernel_gs_base; - u64 msr_guest_kernel_gs_base; -#endif - u32 vm_entry_controls_shadow; - u32 vm_exit_controls_shadow; - /* - * loaded_vmcs points to the VMCS currently used in this vcpu. For a - * non-nested (L1) guest, it always points to vmcs01. For a nested - * guest (L2), it points to a different VMCS. - */ - struct loaded_vmcs vmcs01; - struct loaded_vmcs *loaded_vmcs; - bool __launched; /* temporary, used in vmx_vcpu_run */ - struct msr_autoload { - unsigned nr; - struct vmx_msr_entry guest[NR_AUTOLOAD_MSRS]; - struct vmx_msr_entry host[NR_AUTOLOAD_MSRS]; - } msr_autoload; - struct { - int loaded; - u16 fs_sel, gs_sel, ldt_sel; -#ifdef CONFIG_X86_64 - u16 ds_sel, es_sel; -#endif - int gs_ldt_reload_needed; - int fs_reload_needed; - u64 msr_host_bndcfgs; - unsigned long vmcs_host_cr4; /* May not match real cr4 */ - } host_state; - struct { - int vm86_active; - ulong save_rflags; - struct kvm_segment segs[8]; - } rmode; - struct { - u32 bitmask; /* 4 bits per segment (1 bit per field) */ - struct kvm_save_segment { - u16 selector; - unsigned long base; - u32 limit; - u32 ar; - } seg[8]; - } segment_cache; - int vpid; - bool emulation_required; - - /* Support for vnmi-less CPUs */ - int soft_vnmi_blocked; - ktime_t entry_time; - s64 vnmi_blocked_time; - u32 exit_reason; - - /* Posted interrupt descriptor */ - struct pi_desc pi_desc; - - /* Support for a guest hypervisor (nested VMX) */ - struct nested_vmx nested; - - /* Dynamic PLE window. */ - int ple_window; - bool ple_window_dirty; - - /* Support for PML */ -#define PML_ENTITY_NUM 512 - struct page *pml_pg; - - /* apic deadline value in host tsc */ - u64 hv_deadline_tsc; - - u64 current_tsc_ratio; - - bool guest_pkru_valid; - u32 guest_pkru; - u32 host_pkru; - - /* - * Only bits masked by msr_ia32_feature_control_valid_bits can be set in - * msr_ia32_feature_control. FEATURE_CONTROL_LOCKED is always included - * in msr_ia32_feature_control_valid_bits. 
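The comment above states the invariant for the feature-control MSR: only bits present in msr_ia32_feature_control_valid_bits may ever be set, and the LOCKED bit is always part of that mask. A hypothetical guard showing how such a write would be validated (illustrative only; the actual check sits in the MSR write path):

    static int feature_control_store(struct vcpu_vmx *vmx, u64 data)
    {
        if (data & ~vmx->msr_ia32_feature_control_valid_bits)
            return 1;    /* reserved or unsupported bit set */
        vmx->msr_ia32_feature_control = data;
        return 0;
    }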
- */ - u64 msr_ia32_feature_control; - u64 msr_ia32_feature_control_valid_bits; -}; - -enum segment_cache_field { - SEG_FIELD_SEL = 0, - SEG_FIELD_BASE = 1, - SEG_FIELD_LIMIT = 2, - SEG_FIELD_AR = 3, - - SEG_FIELD_NR = 4 -}; +extern const size_t vmx_return; static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu) { return container_of(vcpu, struct vcpu_vmx, vcpu); } -static struct pi_desc *vcpu_to_pi_desc(struct kvm_vcpu *vcpu) -{ - return &(to_vmx(vcpu)->pi_desc); -} - #define VMCS12_OFFSET(x) offsetof(struct vmcs12, x) #define FIELD(number, name) [number] = VMCS12_OFFSET(name) #define FIELD64(number, name) [number] = VMCS12_OFFSET(name), \ [number##_HIGH] = VMCS12_OFFSET(name)+4 -static unsigned long shadow_read_only_fields[] = { +static size_t shadow_read_only_fields[] = { /* * We do NOT shadow fields that are modified when L0 * traps and emulates any vmx instruction (e.g. VMPTRLD, @@ -680,7 +109,7 @@ static unsigned long shadow_read_only_fields[] = { static int max_shadow_read_only_fields = ARRAY_SIZE(shadow_read_only_fields); -static unsigned long shadow_read_write_fields[] = { +static size_t shadow_read_write_fields[] = { TPR_THRESHOLD, GUEST_RIP, GUEST_RSP, @@ -853,10 +282,8 @@ static const unsigned short vmcs_field_to_offset_table[] = { FIELD(HOST_RIP, host_rip), }; -static inline short vmcs_field_to_offset(unsigned long field) +static inline short vmcs_field_to_offset(size_t field) { - BUILD_BUG_ON(ARRAY_SIZE(vmcs_field_to_offset_table) > SHRT_MAX); - if (field >= ARRAY_SIZE(vmcs_field_to_offset_table) || vmcs_field_to_offset_table[field] == 0) return -ENOENT; @@ -869,27 +296,31 @@ static inline struct vmcs12 *get_vmcs12(struct kvm_vcpu *vcpu) return to_vmx(vcpu)->nested.cached_vmcs12; } -static struct page *nested_get_page(struct kvm_vcpu *vcpu, gpa_t addr) +static PMDL nested_get_page(struct kvm_vcpu *vcpu, gpa_t addr) { - struct page *page = kvm_vcpu_gfn_to_page(vcpu, addr >> PAGE_SHIFT); - if (is_error_page(page)) + PMDL mdl; + size_t hva; + + hva = kvm_vcpu_gfn_to_hva(vcpu, addr >> PAGE_SHIFT); + if (kvm_is_error_hva(hva)) return NULL; - return page; -} + mdl = IoAllocateMdl((void *)hva, PAGE_SIZE, FALSE, FALSE, NULL); + if (!mdl) + return NULL; -static void nested_release_page(struct page *page) -{ - kvm_release_page_dirty(page); + MmProbeAndLockPages(mdl, KernelMode, IoWriteAccess); + + return mdl; } -static void nested_release_page_clean(struct page *page) +static void nested_release_page(PMDL mdl) { - kvm_release_page_clean(page); + kvm_release_page(mdl); } -static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu); -static u64 construct_eptp(unsigned long root_hpa); +static size_t nested_ept_get_cr3(struct kvm_vcpu *vcpu); +static u64 construct_eptp(size_t root_hpa); static void kvm_cpu_vmxon(u64 addr); static void kvm_cpu_vmxoff(void); static bool vmx_xsaves_supported(void); @@ -904,14 +335,10 @@ static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx); static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx); static int alloc_identity_pagetable(struct kvm *kvm); + static DEFINE_PER_CPU(struct vmcs *, vmxarea); -static DEFINE_PER_CPU(struct vmcs *, current_vmcs); -/* - * We maintain a per-CPU linked-list of VMCS loaded on that CPU. This is needed - * when a CPU is brought down, and we need to VMCLEAR all VMCSs loaded on it. 
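nested_get_page(), rewritten above to return a PMDL, pins the guest page with IoAllocateMdl() plus MmProbeAndLockPages(). The matching teardown for a page pinned this way would look like the sketch below (an assumption about what kvm_release_page() amounts to, not code taken from this diff); a kernel-virtual mapping, if one were needed, would come from MmGetSystemAddressForMdlSafe().

    static void nested_unpin_page(PMDL mdl)    /* hypothetical helper */
    {
        if (!mdl)
            return;
        MmUnlockPages(mdl);    /* undo MmProbeAndLockPages() */
        IoFreeMdl(mdl);        /* undo IoAllocateMdl() */
    }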
- */ -static DEFINE_PER_CPU(struct list_head, loaded_vmcss_on_cpu); static DEFINE_PER_CPU(struct desc_ptr, host_gdt); +static DEFINE_PER_CPU(struct desc_ptr, host_idt); /* * We maintian a per-CPU linked-list of vCPU, so in wakeup_handler() we @@ -920,16 +347,16 @@ static DEFINE_PER_CPU(struct desc_ptr, host_gdt); static DEFINE_PER_CPU(struct list_head, blocked_vcpu_on_cpu); static DEFINE_PER_CPU(spinlock_t, blocked_vcpu_on_cpu_lock); -static unsigned long *vmx_io_bitmap_a; -static unsigned long *vmx_io_bitmap_b; -static unsigned long *vmx_msr_bitmap_legacy; -static unsigned long *vmx_msr_bitmap_longmode; -static unsigned long *vmx_msr_bitmap_legacy_x2apic; -static unsigned long *vmx_msr_bitmap_longmode_x2apic; -static unsigned long *vmx_msr_bitmap_legacy_x2apic_apicv_inactive; -static unsigned long *vmx_msr_bitmap_longmode_x2apic_apicv_inactive; -static unsigned long *vmx_vmread_bitmap; -static unsigned long *vmx_vmwrite_bitmap; +static size_t *vmx_io_bitmap_a; +static size_t *vmx_io_bitmap_b; +static size_t *vmx_msr_bitmap_legacy; +static size_t *vmx_msr_bitmap_longmode; +static size_t *vmx_msr_bitmap_legacy_x2apic; +static size_t *vmx_msr_bitmap_longmode_x2apic; +static size_t *vmx_msr_bitmap_legacy_x2apic_apicv_inactive; +static size_t *vmx_msr_bitmap_longmode_x2apic_apicv_inactive; +static size_t *vmx_vmread_bitmap; +static size_t *vmx_vmwrite_bitmap; static bool cpu_has_load_ia32_efer; static bool cpu_has_load_perf_global_ctrl; @@ -982,17 +409,6 @@ static u64 host_efer; static void ept_save_pdptrs(struct kvm_vcpu *vcpu); -/* - * Keep MSR_STAR at the end, as setup_msrs() will try to optimize it - * away by decrementing the array size. - */ -static const u32 vmx_msr_index[] = { -#ifdef CONFIG_X86_64 - MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR, -#endif - MSR_EFER, MSR_TSC_AUX, MSR_STAR, -}; - static inline bool is_exception_n(u32 intr_info, u8 vector) { return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK | @@ -1015,11 +431,6 @@ static inline bool is_page_fault(u32 intr_info) return is_exception_n(intr_info, PF_VECTOR); } -static inline bool is_no_device(u32 intr_info) -{ - return is_exception_n(intr_info, NM_VECTOR); -} - static inline bool is_invalid_opcode(u32 intr_info) { return is_exception_n(intr_info, UD_VECTOR); @@ -1083,69 +494,10 @@ static inline bool cpu_has_vmx_virtual_intr_delivery(void) SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY; } -/* - * Comment's format: document - errata name - stepping - processor name. 
- * Refer from - * https://www.virtualbox.org/svn/vbox/trunk/src/VBox/VMM/VMMR0/HMR0.cpp - */ -static u32 vmx_preemption_cpu_tfms[] = { -/* 323344.pdf - BA86 - D0 - Xeon 7500 Series */ -0x000206E6, -/* 323056.pdf - AAX65 - C2 - Xeon L3406 */ -/* 322814.pdf - AAT59 - C2 - i7-600, i5-500, i5-400 and i3-300 Mobile */ -/* 322911.pdf - AAU65 - C2 - i5-600, i3-500 Desktop and Pentium G6950 */ -0x00020652, -/* 322911.pdf - AAU65 - K0 - i5-600, i3-500 Desktop and Pentium G6950 */ -0x00020655, -/* 322373.pdf - AAO95 - B1 - Xeon 3400 Series */ -/* 322166.pdf - AAN92 - B1 - i7-800 and i5-700 Desktop */ -/* - * 320767.pdf - AAP86 - B1 - - * i7-900 Mobile Extreme, i7-800 and i7-700 Mobile - */ -0x000106E5, -/* 321333.pdf - AAM126 - C0 - Xeon 3500 */ -0x000106A0, -/* 321333.pdf - AAM126 - C1 - Xeon 3500 */ -0x000106A1, -/* 320836.pdf - AAJ124 - C0 - i7-900 Desktop Extreme and i7-900 Desktop */ -0x000106A4, - /* 321333.pdf - AAM126 - D0 - Xeon 3500 */ - /* 321324.pdf - AAK139 - D0 - Xeon 5500 */ - /* 320836.pdf - AAJ124 - D0 - i7-900 Extreme and i7-900 Desktop */ -0x000106A5, -}; - -static inline bool cpu_has_broken_vmx_preemption_timer(void) -{ - u32 eax = cpuid_eax(0x00000001), i; - - /* Clear the reserved bits */ - eax &= ~(0x3U << 14 | 0xfU << 28); - for (i = 0; i < ARRAY_SIZE(vmx_preemption_cpu_tfms); i++) - if (eax == vmx_preemption_cpu_tfms[i]) - return true; - - return false; -} - -static inline bool cpu_has_vmx_preemption_timer(void) -{ - return vmcs_config.pin_based_exec_ctrl & - PIN_BASED_VMX_PREEMPTION_TIMER; -} - -static inline bool cpu_has_vmx_posted_intr(void) -{ - return IS_ENABLED(CONFIG_X86_LOCAL_APIC) && - vmcs_config.pin_based_exec_ctrl & PIN_BASED_POSTED_INTR; -} - static inline bool cpu_has_vmx_apicv(void) { return cpu_has_vmx_apic_register_virt() && - cpu_has_vmx_virtual_intr_delivery() && - cpu_has_vmx_posted_intr(); + cpu_has_vmx_virtual_intr_delivery(); } static inline bool cpu_has_vmx_flexpriority(void) @@ -1211,12 +563,6 @@ static inline bool cpu_has_vmx_unrestricted_guest(void) SECONDARY_EXEC_UNRESTRICTED_GUEST; } -static inline bool cpu_has_vmx_ple(void) -{ - return vmcs_config.cpu_based_2nd_exec_ctrl & - SECONDARY_EXEC_PAUSE_LOOP_EXITING; -} - static inline bool cpu_has_vmx_basic_inout(void) { return (((u64)vmcs_config.basic_cap << 32) & VMX_BASIC_INOUT); @@ -1273,12 +619,6 @@ static inline bool cpu_has_vmx_pml(void) return vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_ENABLE_PML; } -static inline bool cpu_has_vmx_tsc_scaling(void) -{ - return vmcs_config.cpu_based_2nd_exec_ctrl & - SECONDARY_EXEC_TSC_SCALING; -} - static inline bool report_flexpriority(void) { return flexpriority_enabled; @@ -1301,12 +641,6 @@ static inline bool nested_cpu_has_virtual_nmis(struct vmcs12 *vmcs12) return vmcs12->pin_based_vm_exec_control & PIN_BASED_VIRTUAL_NMIS; } -static inline bool nested_cpu_has_preemption_timer(struct vmcs12 *vmcs12) -{ - return vmcs12->pin_based_vm_exec_control & - PIN_BASED_VMX_PREEMPTION_TIMER; -} - static inline int nested_cpu_has_ept(struct vmcs12 *vmcs12) { return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_EPT); @@ -1338,11 +672,6 @@ static inline bool nested_cpu_has_vid(struct vmcs12 *vmcs12) return nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); } -static inline bool nested_cpu_has_posted_intr(struct vmcs12 *vmcs12) -{ - return vmcs12->pin_based_vm_exec_control & PIN_BASED_POSTED_INTR; -} - static inline bool is_exception(u32 intr_info) { return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK)) @@ -1351,65 +680,20 @@ static 
inline bool is_exception(u32 intr_info) static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, u32 exit_intr_info, - unsigned long exit_qualification); + size_t exit_qualification); static void nested_vmx_entry_failure(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, - u32 reason, unsigned long qualification); - -static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr) -{ - int i; - - for (i = 0; i < vmx->nmsrs; ++i) - if (vmx_msr_index[vmx->guest_msrs[i].index] == msr) - return i; - return -1; -} + u32 reason, size_t qualification); -static inline void __invvpid(int ext, u16 vpid, gva_t gva) -{ - struct { - u64 vpid : 16; - u64 rsvd : 48; - u64 gva; - } operand = { vpid, 0, gva }; - - asm volatile (__ex(ASM_VMX_INVVPID) - /* CF==1 or ZF==1 --> rc = -1 */ - "; ja 1f ; ud2 ; 1:" - : : "a"(&operand), "c"(ext) : "cc", "memory"); -} - -static inline void __invept(int ext, u64 eptp, gpa_t gpa) -{ - struct { - u64 eptp, gpa; - } operand = {eptp, gpa}; - - asm volatile (__ex(ASM_VMX_INVEPT) - /* CF==1 or ZF==1 --> rc = -1 */ - "; ja 1f ; ud2 ; 1:\n" - : : "a" (&operand), "c" (ext) : "cc", "memory"); -} - -static struct shared_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr) -{ - int i; - - i = __find_msr_index(vmx, msr); - if (i >= 0) - return &vmx->guest_msrs[i]; - return NULL; -} +#define __invvpid(a, b, c) +#define __invept(a, b, c) static void vmcs_clear(struct vmcs *vmcs) { u64 phys_addr = __pa(vmcs); u8 error; - asm volatile (__ex(ASM_VMX_VMCLEAR_RAX) "; setna %0" - : "=qm"(error) : "a"(&phys_addr), "m"(phys_addr) - : "cc", "memory"); + error = __vmx_vmclear(&phys_addr); if (error) printk(KERN_ERR "kvm: vmclear fail: %p/%llx\n", vmcs, phys_addr); @@ -1429,85 +713,12 @@ static void vmcs_load(struct vmcs *vmcs) u64 phys_addr = __pa(vmcs); u8 error; - asm volatile (__ex(ASM_VMX_VMPTRLD_RAX) "; setna %0" - : "=qm"(error) : "a"(&phys_addr), "m"(phys_addr) - : "cc", "memory"); - if (error) + error = __vmx_vmptrld(&phys_addr); + if (error) { + DbgBreakPoint(); printk(KERN_ERR "kvm: vmptrld %p/%llx failed\n", - vmcs, phys_addr); -} - -#ifdef CONFIG_KEXEC_CORE -/* - * This bitmap is used to indicate whether the vmclear - * operation is enabled on all cpus. All disabled by - * default. 
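vmcs_clear() and vmcs_load() above now sit on the compiler's VMX intrinsics instead of inline assembly. __vmx_vmclear() and __vmx_vmptrld() take a pointer to the 64-bit physical address and return 0 on success, 1 on VMfailValid (error code readable from VM_INSTRUCTION_ERROR) and 2 on VMfailInvalid; the wrappers here only distinguish zero from non-zero. The read side works the same way once a VMCS is current, for example:

    size_t reason;
    if (__vmx_vmread(VM_EXIT_REASON, &reason) == 0)
        printk("exit reason %u\n", (u32)reason);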
- */ -static cpumask_t crash_vmclear_enabled_bitmap = CPU_MASK_NONE; - -static inline void crash_enable_local_vmclear(int cpu) -{ - cpumask_set_cpu(cpu, &crash_vmclear_enabled_bitmap); -} - -static inline void crash_disable_local_vmclear(int cpu) -{ - cpumask_clear_cpu(cpu, &crash_vmclear_enabled_bitmap); -} - -static inline int crash_local_vmclear_enabled(int cpu) -{ - return cpumask_test_cpu(cpu, &crash_vmclear_enabled_bitmap); -} - -static void crash_vmclear_local_loaded_vmcss(void) -{ - int cpu = raw_smp_processor_id(); - struct loaded_vmcs *v; - - if (!crash_local_vmclear_enabled(cpu)) - return; - - list_for_each_entry(v, &per_cpu(loaded_vmcss_on_cpu, cpu), - loaded_vmcss_on_cpu_link) - vmcs_clear(v->vmcs); -} -#else -static inline void crash_enable_local_vmclear(int cpu) { } -static inline void crash_disable_local_vmclear(int cpu) { } -#endif /* CONFIG_KEXEC_CORE */ - -static void __loaded_vmcs_clear(void *arg) -{ - struct loaded_vmcs *loaded_vmcs = arg; - int cpu = raw_smp_processor_id(); - - if (loaded_vmcs->cpu != cpu) - return; /* vcpu migration can race with cpu offline */ - if (per_cpu(current_vmcs, cpu) == loaded_vmcs->vmcs) - per_cpu(current_vmcs, cpu) = NULL; - crash_disable_local_vmclear(cpu); - list_del(&loaded_vmcs->loaded_vmcss_on_cpu_link); - - /* - * we should ensure updating loaded_vmcs->loaded_vmcss_on_cpu_link - * is before setting loaded_vmcs->vcpu to -1 which is done in - * loaded_vmcs_init. Otherwise, other cpu can see vcpu = -1 fist - * then adds the vmcs into percpu list before it is deleted. - */ - smp_wmb(); - - loaded_vmcs_init(loaded_vmcs); - crash_enable_local_vmclear(cpu); -} - -static void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs) -{ - int cpu = loaded_vmcs->cpu; - - if (cpu != -1) - smp_call_function_single(cpu, - __loaded_vmcs_clear, loaded_vmcs, 1); + vmcs, phys_addr); + } } static inline void vpid_sync_vcpu_single(int vpid) @@ -1549,154 +760,118 @@ static inline void ept_sync_context(u64 eptp) } } -static __always_inline void vmcs_check16(unsigned long field) -{ - BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2000, - "16-bit accessor invalid for 64-bit field"); - BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2001, - "16-bit accessor invalid for 64-bit high field"); - BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x4000, - "16-bit accessor invalid for 32-bit high field"); - BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x6000, - "16-bit accessor invalid for natural width field"); -} +#define VMCS_RW_DEBUG -static __always_inline void vmcs_check32(unsigned long field) -{ - BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0, - "32-bit accessor invalid for 16-bit field"); - BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x6000, - "32-bit accessor invalid for natural width field"); -} +static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu); +static void vmx_vcpu_put(struct kvm_vcpu *vcpu); -static __always_inline void vmcs_check64(unsigned long field) +static __forceinline size_t __vmcs_readl(struct kvm_vcpu* vcpu, size_t field) { - BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0, - "64-bit accessor invalid for 16-bit field"); - BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2001, - "64-bit accessor invalid for 64-bit high field"); - BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x4000, - "64-bit accessor invalid for 32-bit 
field"); - BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x6000, - "64-bit accessor invalid for natural width field"); -} + size_t value; -static __always_inline void vmcs_checkl(unsigned long field) -{ - BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0, - "Natural width accessor invalid for 16-bit field"); - BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2000, - "Natural width accessor invalid for 64-bit field"); - BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2001, - "Natural width accessor invalid for 64-bit high field"); - BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x4000, - "Natural width accessor invalid for 32-bit field"); -} + preempt_disable(); + vmcs_load(to_vmx(vcpu)->loaded_vmcs->vmcs); -static __always_inline unsigned long __vmcs_readl(unsigned long field) -{ - unsigned long value; + __vmx_vmread(field, &value); + + vmcs_clear(to_vmx(vcpu)->loaded_vmcs->vmcs); + preempt_enable(); - asm volatile (__ex_clear(ASM_VMX_VMREAD_RDX_RAX, "%0") - : "=a"(value) : "d"(field) : "cc"); return value; } -static __always_inline u16 vmcs_read16(unsigned long field) +static __forceinline u16 vmcs_read16(struct kvm_vcpu* vcpu, size_t field) { - vmcs_check16(field); - return __vmcs_readl(field); + return __vmcs_readl(vcpu, field); } -static __always_inline u32 vmcs_read32(unsigned long field) +static __forceinline u32 vmcs_read32(struct kvm_vcpu* vcpu, size_t field) { - vmcs_check32(field); - return __vmcs_readl(field); + return __vmcs_readl(vcpu, field); } -static __always_inline u64 vmcs_read64(unsigned long field) +static __forceinline u64 vmcs_read64(struct kvm_vcpu* vcpu, size_t field) { - vmcs_check64(field); #ifdef CONFIG_X86_64 - return __vmcs_readl(field); + return __vmcs_readl(vcpu, field); #else return __vmcs_readl(field) | ((u64)__vmcs_readl(field+1) << 32); #endif } -static __always_inline unsigned long vmcs_readl(unsigned long field) +static __forceinline size_t vmcs_readl(struct kvm_vcpu* vcpu, size_t field) { - vmcs_checkl(field); - return __vmcs_readl(field); + return __vmcs_readl(vcpu, field); } -static noinline void vmwrite_error(unsigned long field, unsigned long value) +static __declspec(noinline) void vmwrite_error(struct kvm_vcpu* vcpu, size_t field, size_t value) { printk(KERN_ERR "vmwrite error: reg %lx value %lx (err %d)\n", - field, value, vmcs_read32(VM_INSTRUCTION_ERROR)); + field, value, vmcs_read32(vcpu, VM_INSTRUCTION_ERROR)); +#if 0 dump_stack(); +#endif } -static __always_inline void __vmcs_writel(unsigned long field, unsigned long value) +static __always_inline void __vmcs_writel(struct kvm_vcpu* vcpu, size_t field, size_t value) { u8 error; - asm volatile (__ex(ASM_VMX_VMWRITE_RAX_RDX) "; setna %0" - : "=q"(error) : "a"(value), "d"(field) : "cc"); - if (unlikely(error)) - vmwrite_error(field, value); + preempt_disable(); + vmcs_load(to_vmx(vcpu)->loaded_vmcs->vmcs); + + error = __vmx_vmwrite(field, value); + if (unlikely(error)) { + DbgBreakPoint(); + vmwrite_error(vcpu, field, value); + } + + vmcs_clear(to_vmx(vcpu)->loaded_vmcs->vmcs); + preempt_enable(); } -static __always_inline void vmcs_write16(unsigned long field, u16 value) +static __always_inline void vmcs_write16(struct kvm_vcpu* vcpu, size_t field, u16 value) { - vmcs_check16(field); - __vmcs_writel(field, value); + __vmcs_writel(vcpu, field, value); } -static __always_inline void vmcs_write32(unsigned long field, u32 value) +static __always_inline void vmcs_write32(struct 
kvm_vcpu* vcpu, size_t field, u32 value) { - vmcs_check32(field); - __vmcs_writel(field, value); + __vmcs_writel(vcpu, field, value); } -static __always_inline void vmcs_write64(unsigned long field, u64 value) +static __always_inline void vmcs_write64(struct kvm_vcpu* vcpu, size_t field, u64 value) { - vmcs_check64(field); - __vmcs_writel(field, value); + __vmcs_writel(vcpu, field, value); #ifndef CONFIG_X86_64 asm volatile (""); __vmcs_writel(field+1, value >> 32); #endif } -static __always_inline void vmcs_writel(unsigned long field, unsigned long value) +static __always_inline void vmcs_writel(struct kvm_vcpu* vcpu, size_t field, size_t value) { - vmcs_checkl(field); - __vmcs_writel(field, value); + __vmcs_writel(vcpu, field, value); } -static __always_inline void vmcs_clear_bits(unsigned long field, u32 mask) +static __always_inline void vmcs_clear_bits(struct kvm_vcpu* vcpu, size_t field, u32 mask) { - BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x2000, - "vmcs_clear_bits does not support 64-bit fields"); - __vmcs_writel(field, __vmcs_readl(field) & ~mask); + __vmcs_writel(vcpu, field, __vmcs_readl(vcpu, field) & ~mask); } -static __always_inline void vmcs_set_bits(unsigned long field, u32 mask) +static __always_inline void vmcs_set_bits(struct kvm_vcpu* vcpu, size_t field, u32 mask) { - BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x2000, - "vmcs_set_bits does not support 64-bit fields"); - __vmcs_writel(field, __vmcs_readl(field) | mask); + __vmcs_writel(vcpu, field, __vmcs_readl(vcpu, field) | mask); } static inline void vm_entry_controls_reset_shadow(struct vcpu_vmx *vmx) { - vmx->vm_entry_controls_shadow = vmcs_read32(VM_ENTRY_CONTROLS); + vmx->vm_entry_controls_shadow = vmcs_read32(&vmx->vcpu, VM_ENTRY_CONTROLS); } static inline void vm_entry_controls_init(struct vcpu_vmx *vmx, u32 val) { - vmcs_write32(VM_ENTRY_CONTROLS, val); + vmcs_write32(&vmx->vcpu, VM_ENTRY_CONTROLS, val); vmx->vm_entry_controls_shadow = val; } @@ -1724,12 +899,12 @@ static inline void vm_entry_controls_clearbit(struct vcpu_vmx *vmx, u32 val) static inline void vm_exit_controls_reset_shadow(struct vcpu_vmx *vmx) { - vmx->vm_exit_controls_shadow = vmcs_read32(VM_EXIT_CONTROLS); + vmx->vm_exit_controls_shadow = vmcs_read32(&vmx->vcpu, VM_EXIT_CONTROLS); } static inline void vm_exit_controls_init(struct vcpu_vmx *vmx, u32 val) { - vmcs_write32(VM_EXIT_CONTROLS, val); + vmcs_write32(&vmx->vcpu, VM_EXIT_CONTROLS, val); vmx->vm_exit_controls_shadow = val; } @@ -1780,7 +955,7 @@ static u16 vmx_read_guest_seg_selector(struct vcpu_vmx *vmx, unsigned seg) u16 *p = &vmx->segment_cache.seg[seg].selector; if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_SEL)) - *p = vmcs_read16(kvm_vmx_segment_fields[seg].selector); + *p = vmcs_read16(&vmx->vcpu, kvm_vmx_segment_fields[seg].selector); return *p; } @@ -1789,7 +964,7 @@ static ulong vmx_read_guest_seg_base(struct vcpu_vmx *vmx, unsigned seg) ulong *p = &vmx->segment_cache.seg[seg].base; if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_BASE)) - *p = vmcs_readl(kvm_vmx_segment_fields[seg].base); + *p = vmcs_readl(&vmx->vcpu, kvm_vmx_segment_fields[seg].base); return *p; } @@ -1798,7 +973,7 @@ static u32 vmx_read_guest_seg_limit(struct vcpu_vmx *vmx, unsigned seg) u32 *p = &vmx->segment_cache.seg[seg].limit; if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_LIMIT)) - *p = vmcs_read32(kvm_vmx_segment_fields[seg].limit); + *p = vmcs_read32(&vmx->vcpu, kvm_vmx_segment_fields[seg].limit); return *p; } @@ -1807,7 +982,7 
@@ static u32 vmx_read_guest_seg_ar(struct vcpu_vmx *vmx, unsigned seg) u32 *p = &vmx->segment_cache.seg[seg].ar; if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_AR)) - *p = vmcs_read32(kvm_vmx_segment_fields[seg].ar_bytes); + *p = vmcs_read32(&vmx->vcpu, kvm_vmx_segment_fields[seg].ar_bytes); return *p; } @@ -1816,17 +991,15 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu) u32 eb; eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR) | - (1u << NM_VECTOR) | (1u << DB_VECTOR) | (1u << AC_VECTOR); + (1u << DB_VECTOR) | (1u << AC_VECTOR); if ((vcpu->guest_debug & - (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) == - (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) + (GVM_GUESTDBG_ENABLE | GVM_GUESTDBG_USE_SW_BP)) == + (GVM_GUESTDBG_ENABLE | GVM_GUESTDBG_USE_SW_BP)) eb |= 1u << BP_VECTOR; if (to_vmx(vcpu)->rmode.vm86_active) eb = ~0; if (enable_ept) eb &= ~(1u << PF_VECTOR); /* bypass_guest_pf = 0 */ - if (vcpu->fpu_active) - eb &= ~(1u << NM_VECTOR); /* When we are running a nested L2 guest and L1 specified for it a * certain exception bitmap, we must trap the same exceptions and pass @@ -1836,11 +1009,11 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu) if (is_guest_mode(vcpu)) eb |= get_vmcs12(vcpu)->exception_bitmap; - vmcs_write32(EXCEPTION_BITMAP, eb); + vmcs_write32(vcpu, EXCEPTION_BITMAP, eb); } static void clear_atomic_switch_msr_special(struct vcpu_vmx *vmx, - unsigned long entry, unsigned long exit) + size_t entry, size_t exit) { vm_entry_controls_clearbit(vmx, entry); vm_exit_controls_clearbit(vmx, exit); @@ -1879,17 +1052,17 @@ static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr) --m->nr; m->guest[i] = m->guest[m->nr]; m->host[i] = m->host[m->nr]; - vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->nr); - vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->nr); + vmcs_write32(&vmx->vcpu, VM_ENTRY_MSR_LOAD_COUNT, m->nr); + vmcs_write32(&vmx->vcpu, VM_EXIT_MSR_LOAD_COUNT, m->nr); } static void add_atomic_switch_msr_special(struct vcpu_vmx *vmx, - unsigned long entry, unsigned long exit, - unsigned long guest_val_vmcs, unsigned long host_val_vmcs, + size_t entry, size_t exit, + size_t guest_val_vmcs, size_t host_val_vmcs, u64 guest_val, u64 host_val) { - vmcs_write64(guest_val_vmcs, guest_val); - vmcs_write64(host_val_vmcs, host_val); + vmcs_write64(&vmx->vcpu, guest_val_vmcs, guest_val); + vmcs_write64(&vmx->vcpu, host_val_vmcs, host_val); vm_entry_controls_setbit(vmx, entry); vm_exit_controls_setbit(vmx, exit); } @@ -1942,8 +1115,8 @@ static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr, return; } else if (i == m->nr) { ++m->nr; - vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->nr); - vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->nr); + //vmcs_write32(&vmx->vcpu, VM_ENTRY_MSR_LOAD_COUNT, m->nr); + //vmcs_write32(&vmx->vcpu, VM_EXIT_MSR_LOAD_COUNT, m->nr); } m->guest[i].index = msr; @@ -1965,7 +1138,7 @@ static void reload_tss(void) load_TR_desc(); } -static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset) +static bool update_transition_efer(struct vcpu_vmx *vmx) { u64 guest_efer = vmx->vcpu.arch.efer; u64 ignore_bits = 0; @@ -1995,36 +1168,20 @@ static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset) clear_atomic_switch_msr(vmx, MSR_EFER); - /* - * On EPT, we can't emulate NX, so we must switch EFER atomically. - * On CPUs that support "load IA32_EFER", always switch EFER - * atomically, since it's faster than switching it manually. 
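add_atomic_switch_msr()/clear_atomic_switch_msr() above manage the architectural MSR-load areas: each slot in msr_autoload.guest is applied by the CPU on VM-entry and the corresponding slot in msr_autoload.host is restored on VM-exit, so no software save/restore is needed on the exit path. Each slot is the hardware-defined record of MSR index and value (mirroring the declaration in asm/vmx.h, shown here for reference only):

    struct vmx_msr_entry {
        u32 index;       /* MSR number */
        u32 reserved;
        u64 value;       /* value loaded into / stored from the MSR */
    };

With that mechanism, the EFER handling below reduces to a single call such as add_atomic_switch_msr(vmx, MSR_EFER, guest_efer, host_efer).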
- */ - if (cpu_has_load_ia32_efer || - (enable_ept && ((vmx->vcpu.arch.efer ^ host_efer) & EFER_NX))) { - if (!(guest_efer & EFER_LMA)) - guest_efer &= ~EFER_LME; - if (guest_efer != host_efer) - add_atomic_switch_msr(vmx, MSR_EFER, - guest_efer, host_efer); - return false; - } else { - guest_efer &= ~ignore_bits; - guest_efer |= host_efer & ignore_bits; - - vmx->guest_msrs[efer_offset].data = guest_efer; - vmx->guest_msrs[efer_offset].mask = ~ignore_bits; - - return true; - } + if (!(guest_efer & EFER_LMA)) + guest_efer &= ~EFER_LME; + if (guest_efer != host_efer) + add_atomic_switch_msr(vmx, MSR_EFER, + guest_efer, host_efer); + return false; } -static unsigned long segment_base(u16 selector) +static size_t segment_base(u16 selector) { struct desc_ptr *gdt = this_cpu_ptr(&host_gdt); struct desc_struct *d; - unsigned long table_base; - unsigned long v; + size_t table_base; + size_t v; if (!(selector & ~3)) return 0; @@ -2043,47 +1200,40 @@ static unsigned long segment_base(u16 selector) v = get_desc_base(d); #ifdef CONFIG_X86_64 if (d->s == 0 && (d->type == 2 || d->type == 9 || d->type == 11)) - v |= ((unsigned long)((struct ldttss_desc64 *)d)->base3) << 32; + v |= ((size_t)((struct ldttss_desc64 *)d)->base3) << 32; #endif return v; } -static inline unsigned long kvm_read_tr_base(void) +static inline size_t kvm_read_tr_base(void) { - u16 tr; - asm("str %0" : "=g"(tr)); + u16 tr = 0; + tr = gvm_read_tr(); return segment_base(tr); } static void vmx_save_host_state(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); - int i; - if (vmx->host_state.loaded) - return; - - vmx->host_state.loaded = 1; /* - * Set host fs and gs selectors. Unfortunately, 22.2.3 does not + * Set host fs and gs selectors. Unfortunately, 26.2.3 does not * allow segment selectors with cpl > 0 or ti == 1. 
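The selector tests that follow check the low three bits, which hold the RPL (bits 0-1) and TI (bit 2); per the host-state checks the comment cites, HOST_FS_SELECTOR and HOST_GS_SELECTOR must have both fields clear, so a selector that fails the test is written as 0 and flagged for a manual reload after the exit. In isolation the pattern looks like:

    u16 sel = 0x2b;                                 /* hypothetical: RPL = 3, TI = 0 */
    if (sel & 7)                                    /* RPL != 0 or TI == 1 */
        vmcs_write16(vcpu, HOST_FS_SELECTOR, 0);    /* reload by hand later */
    else
        vmcs_write16(vcpu, HOST_FS_SELECTOR, sel);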
*/ - vmx->host_state.ldt_sel = kvm_read_ldt(); - vmx->host_state.gs_ldt_reload_needed = vmx->host_state.ldt_sel; savesegment(fs, vmx->host_state.fs_sel); if (!(vmx->host_state.fs_sel & 7)) { - vmcs_write16(HOST_FS_SELECTOR, vmx->host_state.fs_sel); + vmcs_write16(vcpu, HOST_FS_SELECTOR, vmx->host_state.fs_sel); vmx->host_state.fs_reload_needed = 0; } else { - vmcs_write16(HOST_FS_SELECTOR, 0); + vmcs_write16(vcpu, HOST_FS_SELECTOR, 0); vmx->host_state.fs_reload_needed = 1; } savesegment(gs, vmx->host_state.gs_sel); if (!(vmx->host_state.gs_sel & 7)) - vmcs_write16(HOST_GS_SELECTOR, vmx->host_state.gs_sel); + vmcs_write16(vcpu, HOST_GS_SELECTOR, vmx->host_state.gs_sel); else { - vmcs_write16(HOST_GS_SELECTOR, 0); - vmx->host_state.gs_ldt_reload_needed = 1; + vmcs_write16(vcpu, HOST_GS_SELECTOR, 0); + vmx->host_state.gs_reload_needed = 1; } #ifdef CONFIG_X86_64 @@ -2092,8 +1242,8 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu) #endif #ifdef CONFIG_X86_64 - vmcs_writel(HOST_FS_BASE, read_msr(MSR_FS_BASE)); - vmcs_writel(HOST_GS_BASE, read_msr(MSR_GS_BASE)); + vmcs_writel(vcpu, HOST_FS_BASE, read_msr(MSR_FS_BASE)); + vmcs_writel(vcpu, HOST_GS_BASE, read_msr(MSR_GS_BASE)); #else vmcs_writel(HOST_FS_BASE, segment_base(vmx->host_state.fs_sel)); vmcs_writel(HOST_GS_BASE, segment_base(vmx->host_state.gs_sel)); @@ -2106,25 +1256,16 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu) #endif if (boot_cpu_has(X86_FEATURE_MPX)) rdmsrl(MSR_IA32_BNDCFGS, vmx->host_state.msr_host_bndcfgs); - for (i = 0; i < vmx->save_nmsrs; ++i) - kvm_set_shared_msr(vmx->guest_msrs[i].index, - vmx->guest_msrs[i].data, - vmx->guest_msrs[i].mask); } static void __vmx_load_host_state(struct vcpu_vmx *vmx) { - if (!vmx->host_state.loaded) - return; - ++vmx->vcpu.stat.host_state_reload; - vmx->host_state.loaded = 0; #ifdef CONFIG_X86_64 if (is_long_mode(&vmx->vcpu)) rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); #endif - if (vmx->host_state.gs_ldt_reload_needed) { - kvm_load_ldt(vmx->host_state.ldt_sel); + if (vmx->host_state.gs_reload_needed) { #ifdef CONFIG_X86_64 load_gs_index(vmx->host_state.gs_sel); #else @@ -2145,74 +1286,14 @@ static void __vmx_load_host_state(struct vcpu_vmx *vmx) #endif if (vmx->host_state.msr_host_bndcfgs) wrmsrl(MSR_IA32_BNDCFGS, vmx->host_state.msr_host_bndcfgs); - /* - * If the FPU is not active (through the host task or - * the guest vcpu), then restore the cr0.TS bit. - */ - if (!fpregs_active() && !vmx->vcpu.guest_fpu_loaded) - stts(); load_gdt(this_cpu_ptr(&host_gdt)); + load_idt(this_cpu_ptr(&host_idt)); } -static void vmx_load_host_state(struct vcpu_vmx *vmx) +static void vmx_load_host_state(struct kvm_vcpu *vcpu) { - preempt_disable(); + struct vcpu_vmx *vmx = to_vmx(vcpu); __vmx_load_host_state(vmx); - preempt_enable(); -} - -static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu) -{ - struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); - struct pi_desc old, new; - unsigned int dest; - - if (!kvm_arch_has_assigned_device(vcpu->kvm) || - !irq_remapping_cap(IRQ_POSTING_CAP) || - !kvm_vcpu_apicv_active(vcpu)) - return; - - do { - old.control = new.control = pi_desc->control; - - /* - * If 'nv' field is POSTED_INTR_WAKEUP_VECTOR, there - * are two possible cases: - * 1. After running 'pre_block', context switch - * happened. For this case, 'sn' was set in - * vmx_vcpu_put(), so we need to clear it here. - * 2. After running 'pre_block', we were blocked, - * and woken up by some other guy. 
For this case, - * we don't need to do anything, 'pi_post_block' - * will do everything for us. However, we cannot - * check whether it is case #1 or case #2 here - * (maybe, not needed), so we also clear sn here, - * I think it is not a big deal. - */ - if (pi_desc->nv != POSTED_INTR_WAKEUP_VECTOR) { - if (vcpu->cpu != cpu) { - dest = cpu_physical_id(cpu); - - if (x2apic_enabled()) - new.ndst = dest; - else - new.ndst = (dest << 8) & 0xFF00; - } - - /* set 'NV' to 'notification vector' */ - new.nv = POSTED_INTR_VECTOR; - } - - /* Allow posting non-urgent interrupts */ - new.sn = 0; - } while (cmpxchg(&pi_desc->control, old.control, - new.control) != old.control); -} - -static void decache_tsc_multiplier(struct vcpu_vmx *vmx) -{ - vmx->current_tsc_ratio = vmx->vcpu.arch.tsc_scaling_ratio; - vmcs_write64(TSC_MULTIPLIER, vmx->current_tsc_ratio); } /* @@ -2227,104 +1308,26 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) if (!vmm_exclusive) kvm_cpu_vmxon(phys_addr); - else if (!already_loaded) - loaded_vmcs_clear(vmx->loaded_vmcs); - - if (!already_loaded) { - local_irq_disable(); - crash_disable_local_vmclear(cpu); - - /* - * Read loaded_vmcs->cpu should be before fetching - * loaded_vmcs->loaded_vmcss_on_cpu_link. - * See the comments in __loaded_vmcs_clear(). - */ - smp_rmb(); - - list_add(&vmx->loaded_vmcs->loaded_vmcss_on_cpu_link, - &per_cpu(loaded_vmcss_on_cpu, cpu)); - crash_enable_local_vmclear(cpu); - local_irq_enable(); - } - - if (per_cpu(current_vmcs, cpu) != vmx->loaded_vmcs->vmcs) { - per_cpu(current_vmcs, cpu) = vmx->loaded_vmcs->vmcs; - vmcs_load(vmx->loaded_vmcs->vmcs); - } if (!already_loaded) { - struct desc_ptr *gdt = this_cpu_ptr(&host_gdt); - unsigned long sysenter_esp; - - kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); + kvm_make_request(GVM_REQ_TLB_FLUSH, vcpu); /* * Linux uses per-cpu TSS and GDT, so set these when switching * processors. 
*/ - vmcs_writel(HOST_TR_BASE, kvm_read_tr_base()); /* 22.2.4 */ - vmcs_writel(HOST_GDTR_BASE, gdt->address); /* 22.2.4 */ - - rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp); - vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */ vmx->loaded_vmcs->cpu = cpu; } - - /* Setup TSC multiplier */ - if (kvm_has_tsc_control && - vmx->current_tsc_ratio != vcpu->arch.tsc_scaling_ratio) - decache_tsc_multiplier(vmx); - - vmx_vcpu_pi_load(vcpu, cpu); - vmx->host_pkru = read_pkru(); -} - -static void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu) -{ - struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); - - if (!kvm_arch_has_assigned_device(vcpu->kvm) || - !irq_remapping_cap(IRQ_POSTING_CAP) || - !kvm_vcpu_apicv_active(vcpu)) - return; - - /* Set SN when the vCPU is preempted */ - if (vcpu->preempted) - pi_set_sn(pi_desc); } static void vmx_vcpu_put(struct kvm_vcpu *vcpu) { - vmx_vcpu_pi_put(vcpu); - - __vmx_load_host_state(to_vmx(vcpu)); if (!vmm_exclusive) { - __loaded_vmcs_clear(to_vmx(vcpu)->loaded_vmcs); - vcpu->cpu = -1; kvm_cpu_vmxoff(); } } -static void vmx_fpu_activate(struct kvm_vcpu *vcpu) -{ - ulong cr0; - - if (vcpu->fpu_active) - return; - vcpu->fpu_active = 1; - cr0 = vmcs_readl(GUEST_CR0); - cr0 &= ~(X86_CR0_TS | X86_CR0_MP); - cr0 |= kvm_read_cr0_bits(vcpu, X86_CR0_TS | X86_CR0_MP); - vmcs_writel(GUEST_CR0, cr0); - update_exception_bitmap(vcpu); - vcpu->arch.cr0_guest_owned_bits = X86_CR0_TS; - if (is_guest_mode(vcpu)) - vcpu->arch.cr0_guest_owned_bits &= - ~get_vmcs12(vcpu)->cr0_guest_host_mask; - vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits); -} - static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu); /* @@ -2332,51 +1335,24 @@ static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu); * of the real cr0 used to run the guest (guest_cr0), and the bits shadowed by * its hypervisor (cr0_read_shadow). */ -static inline unsigned long nested_read_cr0(struct vmcs12 *fields) +static inline size_t nested_read_cr0(struct vmcs12 *fields) { return (fields->guest_cr0 & ~fields->cr0_guest_host_mask) | (fields->cr0_read_shadow & fields->cr0_guest_host_mask); } -static inline unsigned long nested_read_cr4(struct vmcs12 *fields) +static inline size_t nested_read_cr4(struct vmcs12 *fields) { return (fields->guest_cr4 & ~fields->cr4_guest_host_mask) | (fields->cr4_read_shadow & fields->cr4_guest_host_mask); } -static void vmx_fpu_deactivate(struct kvm_vcpu *vcpu) +static size_t vmx_get_rflags(struct kvm_vcpu *vcpu) { - /* Note that there is no vcpu->fpu_active = 0 here. The caller must - * set this *before* calling this function. - */ - vmx_decache_cr0_guest_bits(vcpu); - vmcs_set_bits(GUEST_CR0, X86_CR0_TS | X86_CR0_MP); - update_exception_bitmap(vcpu); - vcpu->arch.cr0_guest_owned_bits = 0; - vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits); - if (is_guest_mode(vcpu)) { - /* - * L1's specified read shadow might not contain the TS bit, - * so now that we turned on shadowing of this bit, we need to - * set this bit of the shadow. Like in nested_vmx_run we need - * nested_read_cr0(vmcs12), but vmcs12->guest_cr0 is not yet - * up-to-date here because we just decached cr0.TS (and we'll - * only update vmcs12->guest_cr0 on nested exit). 
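nested_read_cr0() above composes the CR0 value L1 believes it is running with: bits L1 chose to intercept (set in cr0_guest_host_mask) are taken from its read shadow, and all other bits from the live guest_cr0. A worked instance as a hypothetical helper, assuming L1 intercepts only CR0.TS (illustrative, not code from this file):

    static size_t l1_cr0_view_ts_only(struct vmcs12 *fields)
    {
        const size_t mask = X86_CR0_TS;    /* pretend L1 intercepts only TS */
        /* TS comes from the shadow, every other bit from guest_cr0 */
        return (fields->guest_cr0 & ~mask) |
               (fields->cr0_read_shadow & mask);
    }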
- */ - struct vmcs12 *vmcs12 = get_vmcs12(vcpu); - vmcs12->guest_cr0 = (vmcs12->guest_cr0 & ~X86_CR0_TS) | - (vcpu->arch.cr0 & X86_CR0_TS); - vmcs_writel(CR0_READ_SHADOW, nested_read_cr0(vmcs12)); - } else - vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0); -} - -static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu) -{ - unsigned long rflags, save_rflags; + size_t rflags, save_rflags; if (!test_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail)) { __set_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail); - rflags = vmcs_readl(GUEST_RFLAGS); + rflags = vmcs_readl(vcpu, GUEST_RFLAGS); if (to_vmx(vcpu)->rmode.vm86_active) { rflags &= RMODE_GUEST_OWNED_EFLAGS_BITS; save_rflags = to_vmx(vcpu)->rmode.save_rflags; @@ -2387,7 +1363,7 @@ static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu) return to_vmx(vcpu)->rflags; } -static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) +static void vmx_set_rflags(struct kvm_vcpu *vcpu, size_t rflags) { __set_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail); to_vmx(vcpu)->rflags = rflags; @@ -2395,49 +1371,44 @@ static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) to_vmx(vcpu)->rmode.save_rflags = rflags; rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM; } - vmcs_writel(GUEST_RFLAGS, rflags); -} - -static u32 vmx_get_pkru(struct kvm_vcpu *vcpu) -{ - return to_vmx(vcpu)->guest_pkru; + vmcs_writel(vcpu, GUEST_RFLAGS, rflags); } static u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu) { - u32 interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); + u32 interruptibility = vmcs_read32(vcpu, GUEST_INTERRUPTIBILITY_INFO); int ret = 0; if (interruptibility & GUEST_INTR_STATE_STI) - ret |= KVM_X86_SHADOW_INT_STI; + ret |= GVM_X86_SHADOW_INT_STI; if (interruptibility & GUEST_INTR_STATE_MOV_SS) - ret |= KVM_X86_SHADOW_INT_MOV_SS; + ret |= GVM_X86_SHADOW_INT_MOV_SS; return ret; } static void vmx_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask) { - u32 interruptibility_old = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); + u32 interruptibility_old = vmcs_read32(vcpu, GUEST_INTERRUPTIBILITY_INFO); u32 interruptibility = interruptibility_old; interruptibility &= ~(GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS); - if (mask & KVM_X86_SHADOW_INT_MOV_SS) + if (mask & GVM_X86_SHADOW_INT_MOV_SS) interruptibility |= GUEST_INTR_STATE_MOV_SS; - else if (mask & KVM_X86_SHADOW_INT_STI) + else if (mask & GVM_X86_SHADOW_INT_STI) interruptibility |= GUEST_INTR_STATE_STI; if ((interruptibility != interruptibility_old)) - vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, interruptibility); + vmcs_write32(vcpu, GUEST_INTERRUPTIBILITY_INFO, interruptibility); } static void skip_emulated_instruction(struct kvm_vcpu *vcpu) { - unsigned long rip; + size_t rip; rip = kvm_rip_read(vcpu); - rip += vmcs_read32(VM_EXIT_INSTRUCTION_LEN); + rip += vmcs_read32(vcpu, VM_EXIT_INSTRUCTION_LEN); kvm_rip_write(vcpu, rip); /* skipping an emulated instruction also counts */ @@ -2445,7 +1416,7 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu) } /* - * KVM wants to inject page-faults which it got to the guest. This function + * kvm wants to inject page-faults which it got to the guest. This function * checks whether in a nested guest, we need to inject them to L1 or L2. 
*/ static int nested_vmx_check_exception(struct kvm_vcpu *vcpu, unsigned nr) @@ -2456,8 +1427,8 @@ static int nested_vmx_check_exception(struct kvm_vcpu *vcpu, unsigned nr) return 0; nested_vmx_vmexit(vcpu, to_vmx(vcpu)->exit_reason, - vmcs_read32(VM_EXIT_INTR_INFO), - vmcs_readl(EXIT_QUALIFICATION)); + vmcs_read32(vcpu, VM_EXIT_INTR_INFO), + vmcs_readl(vcpu, EXIT_QUALIFICATION)); return 1; } @@ -2473,7 +1444,7 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, return; if (has_error_code) { - vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code); + vmcs_write32(vcpu, VM_ENTRY_EXCEPTION_ERROR_CODE, error_code); intr_info |= INTR_INFO_DELIVER_CODE_MASK; } @@ -2482,18 +1453,18 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, if (kvm_exception_is_soft(nr)) inc_eip = vcpu->arch.event_exit_inst_len; if (kvm_inject_realmode_interrupt(vcpu, nr, inc_eip) != EMULATE_DONE) - kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); + kvm_make_request(GVM_REQ_TRIPLE_FAULT, vcpu); return; } if (kvm_exception_is_soft(nr)) { - vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, + vmcs_write32(vcpu, VM_ENTRY_INSTRUCTION_LEN, vmx->vcpu.arch.event_exit_inst_len); intr_info |= INTR_TYPE_SOFT_EXCEPTION; } else intr_info |= INTR_TYPE_HARD_EXCEPTION; - vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info); + vmcs_write32(vcpu, VM_ENTRY_INTR_INFO_FIELD, intr_info); } static bool vmx_rdtscp_supported(void) @@ -2506,26 +1477,15 @@ static bool vmx_invpcid_supported(void) return cpu_has_vmx_invpcid() && enable_ept; } -/* - * Swap MSR entry in host/guest MSR entry array. - */ -static void move_msr_up(struct vcpu_vmx *vmx, int from, int to) -{ - struct shared_msr_entry tmp; - - tmp = vmx->guest_msrs[to]; - vmx->guest_msrs[to] = vmx->guest_msrs[from]; - vmx->guest_msrs[from] = tmp; -} - static void vmx_set_msr_bitmap(struct kvm_vcpu *vcpu) { - unsigned long *msr_bitmap; + size_t *msr_bitmap; if (is_guest_mode(vcpu)) msr_bitmap = to_vmx(vcpu)->nested.msr_bitmap; - else if (cpu_has_secondary_exec_ctrls() && - (vmcs_read32(SECONDARY_VM_EXEC_CONTROL) & + else + if (cpu_has_secondary_exec_ctrls() && + (vmcs_read32(vcpu, SECONDARY_VM_EXEC_CONTROL) & SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE)) { if (enable_apicv && kvm_vcpu_apicv_active(vcpu)) { if (is_long_mode(vcpu)) @@ -2545,7 +1505,7 @@ static void vmx_set_msr_bitmap(struct kvm_vcpu *vcpu) msr_bitmap = vmx_msr_bitmap_legacy; } - vmcs_write64(MSR_BITMAP, __pa(msr_bitmap)); + vmcs_write64(vcpu, MSR_BITMAP, __pa(msr_bitmap)); } /* @@ -2555,37 +1515,33 @@ static void vmx_set_msr_bitmap(struct kvm_vcpu *vcpu) */ static void setup_msrs(struct vcpu_vmx *vmx) { - int save_nmsrs, index; + u64 value; - save_nmsrs = 0; #ifdef CONFIG_X86_64 if (is_long_mode(&vmx->vcpu)) { - index = __find_msr_index(vmx, MSR_SYSCALL_MASK); - if (index >= 0) - move_msr_up(vmx, index, save_nmsrs++); - index = __find_msr_index(vmx, MSR_LSTAR); - if (index >= 0) - move_msr_up(vmx, index, save_nmsrs++); - index = __find_msr_index(vmx, MSR_CSTAR); - if (index >= 0) - move_msr_up(vmx, index, save_nmsrs++); - index = __find_msr_index(vmx, MSR_TSC_AUX); - if (index >= 0 && guest_cpuid_has_rdtscp(&vmx->vcpu)) - move_msr_up(vmx, index, save_nmsrs++); + if (!rdmsrl_safe(MSR_SYSCALL_MASK, &value) && + !wrmsrl_safe(MSR_SYSCALL_MASK, value)) + add_atomic_switch_msr(vmx, MSR_SYSCALL_MASK, 0, value); + if (!rdmsrl_safe(MSR_LSTAR, &value) && + !wrmsrl_safe(MSR_LSTAR, value)) + add_atomic_switch_msr(vmx, MSR_LSTAR, 0, value); + if (!rdmsrl_safe(MSR_CSTAR, &value) && + !wrmsrl_safe(MSR_CSTAR, value)) + 
add_atomic_switch_msr(vmx, MSR_CSTAR, 0, value); + if (!rdmsrl_safe(MSR_GS_BASE, &value) && + !wrmsrl_safe(MSR_GS_BASE, value)) + add_atomic_switch_msr(vmx, MSR_GS_BASE, 0, value); /* * MSR_STAR is only needed on long mode guests, and only * if efer.sce is enabled. */ - index = __find_msr_index(vmx, MSR_STAR); - if ((index >= 0) && (vmx->vcpu.arch.efer & EFER_SCE)) - move_msr_up(vmx, index, save_nmsrs++); + if (vmx->vcpu.arch.efer & EFER_SCE) + if (!rdmsrl_safe(MSR_STAR, &value) && + !wrmsrl_safe(MSR_STAR, value)) + add_atomic_switch_msr(vmx, MSR_STAR, 0, value); } #endif - index = __find_msr_index(vmx, MSR_EFER); - if (index >= 0 && update_transition_efer(vmx, index)) - move_msr_up(vmx, index, save_nmsrs++); - - vmx->save_nmsrs = save_nmsrs; + update_transition_efer(vmx); if (cpu_has_vmx_msr_bitmap()) vmx_set_msr_bitmap(&vmx->vcpu); @@ -2600,9 +1556,10 @@ static u64 guest_read_tsc(struct kvm_vcpu *vcpu) { u64 host_tsc, tsc_offset; - host_tsc = rdtsc(); - tsc_offset = vmcs_read64(TSC_OFFSET); - return kvm_scale_tsc(vcpu, host_tsc) + tsc_offset; + host_tsc = __rdtsc(); + tsc_offset = vmcs_read64(vcpu, TSC_OFFSET); + //return kvm_scale_tsc(vcpu, host_tsc) + tsc_offset; + return host_tsc + tsc_offset; } /* @@ -2610,6 +1567,7 @@ static u64 guest_read_tsc(struct kvm_vcpu *vcpu) */ static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) { + vmcs_write64(vcpu, TSC_OFFSET, offset); if (is_guest_mode(vcpu)) { /* * We're here if L1 chose not to trap WRMSR to TSC. According @@ -2620,19 +1578,17 @@ static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) struct vmcs12 *vmcs12; /* recalculate vmcs02.TSC_OFFSET: */ vmcs12 = get_vmcs12(vcpu); - vmcs_write64(TSC_OFFSET, offset + + vmcs_write64(vcpu, TSC_OFFSET, offset + (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETING) ? vmcs12->tsc_offset : 0)); } else { - trace_kvm_write_tsc_offset(vcpu->vcpu_id, - vmcs_read64(TSC_OFFSET), offset); - vmcs_write64(TSC_OFFSET, offset); + vmcs_write64(vcpu, TSC_OFFSET, offset); } } static bool guest_cpuid_has_vmx(struct kvm_vcpu *vcpu) { - struct kvm_cpuid_entry2 *best = kvm_find_cpuid_entry(vcpu, 1, 0); + struct kvm_cpuid_entry *best = kvm_find_cpuid_entry(vcpu, 1, 0); return best && (best->ecx & (1 << (X86_FEATURE_VMX & 31))); } @@ -2685,11 +1641,7 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx) PIN_BASED_NMI_EXITING | PIN_BASED_VIRTUAL_NMIS; vmx->nested.nested_vmx_pinbased_ctls_high |= - PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR | - PIN_BASED_VMX_PREEMPTION_TIMER; - if (kvm_vcpu_apicv_active(&vmx->vcpu)) - vmx->nested.nested_vmx_pinbased_ctls_high |= - PIN_BASED_POSTED_INTR; + PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR; /* exit controls */ rdmsr(MSR_IA32_VMX_EXIT_CTLS, @@ -2804,7 +1756,7 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx) vmx->nested.nested_vmx_ept_caps = 0; /* - * Old versions of KVM use the single-context version without + * Old versions of kvm use the single-context version without * checking for support, so declare that it is supported even * though it is treated as global context. The alternative is * not failing the single-context invvpid, and it is worse. 
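The setup_msrs() rework above stops sorting a private guest_msrs[] array and instead parks MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR, MSR_GS_BASE and (when EFER.SCE is set) MSR_STAR in the VMCS MSR-autoload lists through add_atomic_switch_msr(). A minimal sketch of how such a list can be maintained is below; the struct, the helper name and the fixed capacity are invented for illustration, and only the 128-bit entry layout (32-bit MSR index, 32 reserved bits, 64-bit value) follows the architectural VMX MSR-load/store area format.

/* Illustrative only: one entry of a VM-entry/VM-exit MSR-load area. */
struct autoload_msr_entry {
	unsigned int index;            /* MSR number                */
	unsigned int reserved;         /* must be zero              */
	unsigned long long value;      /* value loaded on the switch */
};

#define AUTOLOAD_MAX 8             /* invented capacity for the sketch */

struct autoload_area {
	unsigned int nr;
	struct autoload_msr_entry guest[AUTOLOAD_MAX];
	struct autoload_msr_entry host[AUTOLOAD_MAX];
};

/* Add or update one MSR in both lists; a real caller would then write the
 * arrays' physical addresses and 'nr' into VM_ENTRY_MSR_LOAD_ADDR,
 * VM_EXIT_MSR_LOAD_ADDR and the matching count fields of the VMCS. */
static int autoload_switch_msr(struct autoload_area *a, unsigned int msr,
			       unsigned long long guest_val,
			       unsigned long long host_val)
{
	unsigned int i;

	for (i = 0; i < a->nr; i++)
		if (a->guest[i].index == msr)
			break;
	if (i == a->nr) {
		if (a->nr == AUTOLOAD_MAX)
			return -1;          /* area full */
		a->nr++;
		a->guest[i].index = msr;
		a->host[i].index = msr;
	}
	a->guest[i].value = guest_val;
	a->host[i].value = host_val;
	return 0;
}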
@@ -2957,18 +1909,15 @@ static inline bool vmx_feature_control_msr_valid(struct kvm_vcpu *vcpu, */ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { - struct shared_msr_entry *msr; - switch (msr_info->index) { #ifdef CONFIG_X86_64 case MSR_FS_BASE: - msr_info->data = vmcs_readl(GUEST_FS_BASE); + msr_info->data = vmcs_readl(vcpu, GUEST_FS_BASE); break; case MSR_GS_BASE: - msr_info->data = vmcs_readl(GUEST_GS_BASE); + msr_info->data = vmcs_readl(vcpu, GUEST_GS_BASE); break; case MSR_KERNEL_GS_BASE: - vmx_load_host_state(to_vmx(vcpu)); msr_info->data = to_vmx(vcpu)->msr_guest_kernel_gs_base; break; #endif @@ -2978,33 +1927,22 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) msr_info->data = guest_read_tsc(vcpu); break; case MSR_IA32_SYSENTER_CS: - msr_info->data = vmcs_read32(GUEST_SYSENTER_CS); + msr_info->data = vmcs_read32(vcpu, GUEST_SYSENTER_CS); break; case MSR_IA32_SYSENTER_EIP: - msr_info->data = vmcs_readl(GUEST_SYSENTER_EIP); + msr_info->data = vmcs_readl(vcpu, GUEST_SYSENTER_EIP); break; case MSR_IA32_SYSENTER_ESP: - msr_info->data = vmcs_readl(GUEST_SYSENTER_ESP); + msr_info->data = vmcs_readl(vcpu, GUEST_SYSENTER_ESP); break; case MSR_IA32_BNDCFGS: if (!kvm_mpx_supported()) return 1; - msr_info->data = vmcs_read64(GUEST_BNDCFGS); - break; - case MSR_IA32_MCG_EXT_CTL: - if (!msr_info->host_initiated && - !(to_vmx(vcpu)->msr_ia32_feature_control & - FEATURE_CONTROL_LMCE)) - return 1; - msr_info->data = vcpu->arch.mcg_ext_ctl; + msr_info->data = vmcs_read64(vcpu, GUEST_BNDCFGS); break; case MSR_IA32_FEATURE_CONTROL: msr_info->data = to_vmx(vcpu)->msr_ia32_feature_control; break; - case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC: - if (!nested_vmx_allowed(vcpu)) - return 1; - return vmx_get_vmx_msr(vcpu, msr_info->index, &msr_info->data); case MSR_IA32_XSS: if (!vmx_xsaves_supported()) return 1; @@ -3013,12 +1951,24 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_TSC_AUX: if (!guest_cpuid_has_rdtscp(vcpu) && !msr_info->host_initiated) return 1; + case MSR_SYSCALL_MASK: + case MSR_LSTAR: + case MSR_CSTAR: + case MSR_STAR: + struct vcpu_vmx *vmx = to_vmx(vcpu); + int i = 0; + for (i = 0; i < vmx->msr_autoload.nr; i++) + if (vmx->msr_autoload.guest[i].index == msr_info->index) + msr_info->data = vmx->msr_autoload.guest[i].value; + break; /* Otherwise falls through */ default: - msr = find_msr_entry(to_vmx(vcpu), msr_info->index); - if (msr) { - msr_info->data = msr->data; - break; + if (msr_info->index >= MSR_IA32_VMX_BASIC + && msr_info->index <= MSR_IA32_VMX_VMFUNC) { + if (!nested_vmx_allowed(vcpu)) + return 1; + return vmx_get_vmx_msr(vcpu, msr_info->index, + &msr_info->data); } return kvm_get_msr_common(vcpu, msr_info); } @@ -3036,10 +1986,10 @@ static void vmx_leave_nested(struct kvm_vcpu *vcpu); static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { struct vcpu_vmx *vmx = to_vmx(vcpu); - struct shared_msr_entry *msr; int ret = 0; u32 msr_index = msr_info->index; u64 data = msr_info->data; + u64 host_value = 0; switch (msr_index) { case MSR_EFER: @@ -3048,30 +1998,29 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) #ifdef CONFIG_X86_64 case MSR_FS_BASE: vmx_segment_cache_clear(vmx); - vmcs_writel(GUEST_FS_BASE, data); + vmcs_writel(vcpu, GUEST_FS_BASE, data); break; case MSR_GS_BASE: vmx_segment_cache_clear(vmx); - vmcs_writel(GUEST_GS_BASE, data); + vmcs_writel(vcpu, GUEST_GS_BASE, data); break; case MSR_KERNEL_GS_BASE: - 
vmx_load_host_state(vmx); vmx->msr_guest_kernel_gs_base = data; break; #endif case MSR_IA32_SYSENTER_CS: - vmcs_write32(GUEST_SYSENTER_CS, data); + vmcs_write32(vcpu, GUEST_SYSENTER_CS, data); break; case MSR_IA32_SYSENTER_EIP: - vmcs_writel(GUEST_SYSENTER_EIP, data); + vmcs_writel(vcpu, GUEST_SYSENTER_EIP, data); break; case MSR_IA32_SYSENTER_ESP: - vmcs_writel(GUEST_SYSENTER_ESP, data); + vmcs_writel(vcpu, GUEST_SYSENTER_ESP, data); break; case MSR_IA32_BNDCFGS: if (!kvm_mpx_supported()) return 1; - vmcs_write64(GUEST_BNDCFGS, data); + vmcs_write64(vcpu, GUEST_BNDCFGS, data); break; case MSR_IA32_TSC: kvm_write_tsc(vcpu, msr_info); @@ -3080,7 +2029,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) { if (!kvm_mtrr_valid(vcpu, MSR_IA32_CR_PAT, data)) return 1; - vmcs_write64(GUEST_IA32_PAT, data); + vmcs_write64(vcpu, GUEST_IA32_PAT, data); vcpu->arch.pat = data; break; } @@ -3089,14 +2038,6 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_IA32_TSC_ADJUST: ret = kvm_set_msr_common(vcpu, msr_info); break; - case MSR_IA32_MCG_EXT_CTL: - if ((!msr_info->host_initiated && - !(to_vmx(vcpu)->msr_ia32_feature_control & - FEATURE_CONTROL_LMCE)) || - (data & ~MCG_EXT_CTL_LMCE_EN)) - return 1; - vcpu->arch.mcg_ext_ctl = data; - break; case MSR_IA32_FEATURE_CONTROL: if (!vmx_feature_control_msr_valid(vcpu, data) || (to_vmx(vcpu)->msr_ia32_feature_control & @@ -3106,14 +2047,12 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (msr_info->host_initiated && data == 0) vmx_leave_nested(vcpu); break; - case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC: - return 1; /* they are read-only */ case MSR_IA32_XSS: if (!vmx_xsaves_supported()) return 1; /* * The only supported bit as of Skylake is bit 8, but - * it is not supported on KVM. + * it is not supported on kvm. 
*/ if (data != 0) return 1; @@ -3130,22 +2069,18 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) /* Check reserved bit, higher 32 bits should be zero */ if ((data >> 32) != 0) return 1; + case MSR_SYSCALL_MASK: + case MSR_LSTAR: + case MSR_CSTAR: + case MSR_STAR: + if (!rdmsrl_safe(msr_index, &host_value)) + add_atomic_switch_msr(vmx, msr_index, data, host_value); + break; /* Otherwise falls through */ default: - msr = find_msr_entry(vmx, msr_index); - if (msr) { - u64 old_msr_data = msr->data; - msr->data = data; - if (msr - vmx->guest_msrs < vmx->save_nmsrs) { - preempt_disable(); - ret = kvm_set_shared_msr(msr->index, msr->data, - msr->mask); - preempt_enable(); - if (ret) - msr->data = old_msr_data; - } - break; - } + if (msr_index >= MSR_IA32_VMX_BASIC + && msr_index <= MSR_IA32_VMX_VMFUNC) + return 1; /* they are read-only */ ret = kvm_set_msr_common(vcpu, msr_info); } @@ -3154,13 +2089,13 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg) { - __set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail); + __set_bit(reg, (size_t *)&vcpu->arch.regs_avail); switch (reg) { case VCPU_REGS_RSP: - vcpu->arch.regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP); + vcpu->arch.regs[VCPU_REGS_RSP] = vmcs_readl(vcpu, GUEST_RSP); break; case VCPU_REGS_RIP: - vcpu->arch.regs[VCPU_REGS_RIP] = vmcs_readl(GUEST_RIP); + vcpu->arch.regs[VCPU_REGS_RIP] = vmcs_readl(vcpu, GUEST_RIP); break; case VCPU_EXREG_PDPTR: if (enable_ept) @@ -3171,32 +2106,28 @@ static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg) } } -static __init int cpu_has_kvm_support(void) +static int cpu_has_kvm_support(void) { return cpu_has_vmx(); } -static __init int vmx_disabled_by_bios(void) +static int vmx_disabled_by_bios(void) { u64 msr; - rdmsrl(MSR_IA32_FEATURE_CONTROL, msr); + if (rdmsrl_safe(MSR_IA32_FEATURE_CONTROL, &msr)) + return 0; + if (msr & FEATURE_CONTROL_LOCKED) { - /* launched w/ TXT and VMX disabled */ - if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX) - && tboot_enabled()) - return 1; /* launched w/o TXT and VMX only enabled w/ TXT */ if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX) - && (msr & FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX) - && !tboot_enabled()) { + && (msr & FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX)) { printk(KERN_WARNING "kvm: disable TXT in the BIOS or " - "activate TXT before enabling KVM\n"); + "activate TXT before enabling kvm\n"); return 1; } /* launched w/o TXT and VMX disabled */ - if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX) - && !tboot_enabled()) + if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX)) return 1; } @@ -3205,11 +2136,9 @@ static __init int vmx_disabled_by_bios(void) static void kvm_cpu_vmxon(u64 addr) { - intel_pt_handle_vmx(1); - - asm volatile (ASM_VMX_VMXON_RAX - : : "a"(&addr), "m"(addr) - : "memory", "cc"); + u8 rc = __vmx_on(&addr); + if (rc) + printk(KERN_CRIT "rc is %d\n", rc); } static int hardware_enable(void) @@ -3221,27 +2150,13 @@ static int hardware_enable(void) if (cr4_read_shadow() & X86_CR4_VMXE) return -EBUSY; - INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu)); INIT_LIST_HEAD(&per_cpu(blocked_vcpu_on_cpu, cpu)); spin_lock_init(&per_cpu(blocked_vcpu_on_cpu_lock, cpu)); - /* - * Now we can enable the vmclear operation in kdump - * since the loaded_vmcss_on_cpu list on this cpu - * has been initialized. 
- * - * Though the cpu is not in VMX operation now, there - * is no problem to enable the vmclear operation - * for the loaded_vmcss_on_cpu list is empty! - */ - crash_enable_local_vmclear(cpu); - rdmsrl(MSR_IA32_FEATURE_CONTROL, old); test_bits = FEATURE_CONTROL_LOCKED; test_bits |= FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX; - if (tboot_enabled()) - test_bits |= FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX; if ((old & test_bits) != test_bits) { /* enable and lock */ @@ -3255,44 +2170,31 @@ static int hardware_enable(void) } native_store_gdt(this_cpu_ptr(&host_gdt)); + native_store_idt(this_cpu_ptr(&host_idt)); return 0; } -static void vmclear_local_loaded_vmcss(void) -{ - int cpu = raw_smp_processor_id(); - struct loaded_vmcs *v, *n; - - list_for_each_entry_safe(v, n, &per_cpu(loaded_vmcss_on_cpu, cpu), - loaded_vmcss_on_cpu_link) - __loaded_vmcs_clear(v); -} - - /* Just like cpu_vmxoff(), but with the __kvm_handle_fault_on_reboot() * tricks. */ static void kvm_cpu_vmxoff(void) { - asm volatile (__ex(ASM_VMX_VMXOFF) : : : "cc"); - - intel_pt_handle_vmx(0); + __vmx_off(); } static void hardware_disable(void) { if (vmm_exclusive) { - vmclear_local_loaded_vmcss(); kvm_cpu_vmxoff(); } cr4_clear_bits(X86_CR4_VMXE); } -static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt, +static int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt, u32 msr, u32 *result) { - u32 vmx_msr_low, vmx_msr_high; + u32 vmx_msr_low = 0, vmx_msr_high = 0; u32 ctl = ctl_min | ctl_opt; rdmsr(msr, vmx_msr_low, vmx_msr_high); @@ -3308,17 +2210,17 @@ static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt, return 0; } -static __init bool allow_1_setting(u32 msr, u32 ctl) +static bool allow_1_setting(u32 msr, u32 ctl) { - u32 vmx_msr_low, vmx_msr_high; + u32 vmx_msr_low = 0, vmx_msr_high = 0; rdmsr(msr, vmx_msr_low, vmx_msr_high); return vmx_msr_high & ctl; } -static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) +static int setup_vmcs_config(struct vmcs_config *vmcs_conf) { - u32 vmx_msr_low, vmx_msr_high; + u32 vmx_msr_low = 0, vmx_msr_high = 0; u32 min, opt, min2, opt2; u32 _pin_based_exec_control = 0; u32 _cpu_based_exec_control = 0; @@ -3360,15 +2262,13 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) SECONDARY_EXEC_ENABLE_VPID | SECONDARY_EXEC_ENABLE_EPT | SECONDARY_EXEC_UNRESTRICTED_GUEST | - SECONDARY_EXEC_PAUSE_LOOP_EXITING | SECONDARY_EXEC_RDTSCP | SECONDARY_EXEC_ENABLE_INVPCID | SECONDARY_EXEC_APIC_REGISTER_VIRT | SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | SECONDARY_EXEC_SHADOW_VMCS | SECONDARY_EXEC_XSAVES | - SECONDARY_EXEC_ENABLE_PML | - SECONDARY_EXEC_TSC_SCALING; + SECONDARY_EXEC_ENABLE_PML; if (adjust_vmx_controls(min2, opt2, MSR_IA32_VMX_PROCBASED_CTLS2, &_cpu_based_2nd_exec_control) < 0) @@ -3407,18 +2307,11 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) return -EIO; min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING; - opt = PIN_BASED_VIRTUAL_NMIS | PIN_BASED_POSTED_INTR | - PIN_BASED_VMX_PREEMPTION_TIMER; + opt = PIN_BASED_VIRTUAL_NMIS; if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS, &_pin_based_exec_control) < 0) return -EIO; - if (cpu_has_broken_vmx_preemption_timer()) - _pin_based_exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER; - if (!(_cpu_based_2nd_exec_control & - SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY)) - _pin_based_exec_control &= ~PIN_BASED_POSTED_INTR; - min = VM_ENTRY_LOAD_DEBUG_CONTROLS; opt = VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_LOAD_BNDCFGS; if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_ENTRY_CTLS, @@ -3442,7 +2335,8 
@@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) return -EIO; vmcs_conf->size = vmx_msr_high & 0x1fff; - vmcs_conf->order = get_order(vmcs_conf->size); + /* should always 0 */ + vmcs_conf->order = 0; vmcs_conf->basic_cap = vmx_msr_high & ~0x1fff; vmcs_conf->revision_id = vmx_msr_low; @@ -3478,8 +2372,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) * BA97 (model 46) * */ - if (cpu_has_load_perf_global_ctrl && boot_cpu_data.x86 == 0x6) { - switch (boot_cpu_data.x86_model) { + if (cpu_has_load_perf_global_ctrl && x86_cpuid_family() == 0x6) { + switch (x86_cpuid_model()) { case 26: case 30: case 37: @@ -3500,16 +2394,13 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) return 0; } -static struct vmcs *alloc_vmcs_cpu(int cpu) +static struct vmcs *alloc_vmcs_cpu(void) { - int node = cpu_to_node(cpu); - struct page *pages; struct vmcs *vmcs; - pages = __alloc_pages_node(node, GFP_KERNEL, vmcs_config.order); - if (!pages) + vmcs = ExAllocatePoolWithTag(NonPagedPool, PAGE_SIZE, GVM_POOL_TAG); + if (!vmcs) return NULL; - vmcs = page_address(pages); memset(vmcs, 0, vmcs_config.size); vmcs->revision_id = vmcs_config.revision_id; /* vmcs revision id */ return vmcs; @@ -3517,12 +2408,12 @@ static struct vmcs *alloc_vmcs_cpu(int cpu) static struct vmcs *alloc_vmcs(void) { - return alloc_vmcs_cpu(raw_smp_processor_id()); + return alloc_vmcs_cpu(); } static void free_vmcs(struct vmcs *vmcs) { - free_pages((unsigned long)vmcs, vmcs_config.order); + ExFreePoolWithTag(vmcs, GVM_POOL_TAG); } /* @@ -3532,7 +2423,6 @@ static void free_loaded_vmcs(struct loaded_vmcs *loaded_vmcs) { if (!loaded_vmcs->vmcs) return; - loaded_vmcs_clear(loaded_vmcs); free_vmcs(loaded_vmcs->vmcs); loaded_vmcs->vmcs = NULL; WARN_ON(loaded_vmcs->shadow_vmcs != NULL); @@ -3583,14 +2473,14 @@ static void init_vmcs_shadow_fields(void) vmx_vmread_bitmap); } -static __init int alloc_kvm_area(void) +static int alloc_kvm_area(void) { int cpu; for_each_possible_cpu(cpu) { struct vmcs *vmcs; - vmcs = alloc_vmcs_cpu(cpu); + vmcs = alloc_vmcs_cpu(); if (!vmcs) { free_kvm_area(); return -ENOMEM; @@ -3627,7 +2517,7 @@ static void fix_pmode_seg(struct kvm_vcpu *vcpu, int seg, static void enter_pmode(struct kvm_vcpu *vcpu) { - unsigned long flags; + size_t flags; struct vcpu_vmx *vmx = to_vmx(vcpu); /* @@ -3647,13 +2537,13 @@ static void enter_pmode(struct kvm_vcpu *vcpu) vmx_set_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR); - flags = vmcs_readl(GUEST_RFLAGS); + flags = vmcs_readl(vcpu, GUEST_RFLAGS); flags &= RMODE_GUEST_OWNED_EFLAGS_BITS; flags |= vmx->rmode.save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS; - vmcs_writel(GUEST_RFLAGS, flags); + vmcs_writel(vcpu, GUEST_RFLAGS, flags); - vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~X86_CR4_VME) | - (vmcs_readl(CR4_READ_SHADOW) & X86_CR4_VME)); + vmcs_writel(vcpu, GUEST_CR4, (vmcs_readl(vcpu, GUEST_CR4) & ~X86_CR4_VME) | + (vmcs_readl(vcpu, CR4_READ_SHADOW) & X86_CR4_VME)); update_exception_bitmap(vcpu); @@ -3665,7 +2555,7 @@ static void enter_pmode(struct kvm_vcpu *vcpu) fix_pmode_seg(vcpu, VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]); } -static void fix_rmode_seg(int seg, struct kvm_segment *save) +static void fix_rmode_seg(struct kvm_vcpu* vcpu, int seg, struct kvm_segment *save) { const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; struct kvm_segment var = *save; @@ -3692,15 +2582,15 @@ static void fix_rmode_seg(int seg, struct kvm_segment *save) "protected mode (seg=%d)", seg); } - 
vmcs_write16(sf->selector, var.selector); - vmcs_write32(sf->base, var.base); - vmcs_write32(sf->limit, var.limit); - vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(&var)); + vmcs_write16(vcpu, sf->selector, var.selector); + vmcs_write32(vcpu, sf->base, var.base); + vmcs_write32(vcpu, sf->limit, var.limit); + vmcs_write32(vcpu, sf->ar_bytes, vmx_segment_access_rights(&var)); } static void enter_rmode(struct kvm_vcpu *vcpu) { - unsigned long flags; + size_t flags; struct vcpu_vmx *vmx = to_vmx(vcpu); vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR); @@ -3714,34 +2604,34 @@ static void enter_rmode(struct kvm_vcpu *vcpu) vmx->rmode.vm86_active = 1; /* - * Very old userspace does not call KVM_SET_TSS_ADDR before entering + * Very old userspace does not call GVM_SET_TSS_ADDR before entering * vcpu. Warn the user that an update is overdue. */ if (!vcpu->kvm->arch.tss_addr) - printk_once(KERN_WARNING "kvm: KVM_SET_TSS_ADDR need to be " + printk_once(KERN_WARNING "kvm: GVM_SET_TSS_ADDR need to be " "called before entering vcpu\n"); vmx_segment_cache_clear(vmx); - vmcs_writel(GUEST_TR_BASE, vcpu->kvm->arch.tss_addr); - vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1); - vmcs_write32(GUEST_TR_AR_BYTES, 0x008b); + vmcs_writel(vcpu, GUEST_TR_BASE, vcpu->kvm->arch.tss_addr); + vmcs_write32(vcpu, GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1); + vmcs_write32(vcpu, GUEST_TR_AR_BYTES, 0x008b); - flags = vmcs_readl(GUEST_RFLAGS); + flags = vmcs_readl(vcpu, GUEST_RFLAGS); vmx->rmode.save_rflags = flags; flags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM; - vmcs_writel(GUEST_RFLAGS, flags); - vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | X86_CR4_VME); + vmcs_writel(vcpu, GUEST_RFLAGS, flags); + vmcs_writel(vcpu, GUEST_CR4, vmcs_readl(vcpu, GUEST_CR4) | X86_CR4_VME); update_exception_bitmap(vcpu); - fix_rmode_seg(VCPU_SREG_SS, &vmx->rmode.segs[VCPU_SREG_SS]); - fix_rmode_seg(VCPU_SREG_CS, &vmx->rmode.segs[VCPU_SREG_CS]); - fix_rmode_seg(VCPU_SREG_ES, &vmx->rmode.segs[VCPU_SREG_ES]); - fix_rmode_seg(VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]); - fix_rmode_seg(VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]); - fix_rmode_seg(VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]); + fix_rmode_seg(vcpu, VCPU_SREG_SS, &vmx->rmode.segs[VCPU_SREG_SS]); + fix_rmode_seg(vcpu, VCPU_SREG_CS, &vmx->rmode.segs[VCPU_SREG_CS]); + fix_rmode_seg(vcpu, VCPU_SREG_ES, &vmx->rmode.segs[VCPU_SREG_ES]); + fix_rmode_seg(vcpu, VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]); + fix_rmode_seg(vcpu, VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]); + fix_rmode_seg(vcpu, VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]); kvm_mmu_reset_context(vcpu); } @@ -3749,41 +2639,31 @@ static void enter_rmode(struct kvm_vcpu *vcpu) static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer) { struct vcpu_vmx *vmx = to_vmx(vcpu); - struct shared_msr_entry *msr = find_msr_entry(vmx, MSR_EFER); - - if (!msr) - return; - /* - * Force kernel_gs_base reloading before EFER changes, as control - * of this msr depends on is_long_mode(). 
- */ - vmx_load_host_state(to_vmx(vcpu)); vcpu->arch.efer = efer; - if (efer & EFER_LMA) { + if (efer & EFER_LMA) vm_entry_controls_setbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE); - msr->data = efer; - } else { + else vm_entry_controls_clearbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE); - msr->data = efer & ~EFER_LME; - } setup_msrs(vmx); } #ifdef CONFIG_X86_64 +#define pr_debug_ratelimited DbgPrint + static void enter_lmode(struct kvm_vcpu *vcpu) { u32 guest_tr_ar; vmx_segment_cache_clear(to_vmx(vcpu)); - guest_tr_ar = vmcs_read32(GUEST_TR_AR_BYTES); + guest_tr_ar = vmcs_read32(vcpu, GUEST_TR_AR_BYTES); if ((guest_tr_ar & VMX_AR_TYPE_MASK) != VMX_AR_TYPE_BUSY_64_TSS) { pr_debug_ratelimited("%s: tss fixup for long mode. \n", __func__); - vmcs_write32(GUEST_TR_AR_BYTES, + vmcs_write32(vcpu, GUEST_TR_AR_BYTES, (guest_tr_ar & ~VMX_AR_TYPE_MASK) | VMX_AR_TYPE_BUSY_64_TSS); } @@ -3818,13 +2698,13 @@ static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu) ulong cr0_guest_owned_bits = vcpu->arch.cr0_guest_owned_bits; vcpu->arch.cr0 &= ~cr0_guest_owned_bits; - vcpu->arch.cr0 |= vmcs_readl(GUEST_CR0) & cr0_guest_owned_bits; + vcpu->arch.cr0 |= vmcs_readl(vcpu, GUEST_CR0) & cr0_guest_owned_bits; } static void vmx_decache_cr3(struct kvm_vcpu *vcpu) { if (enable_ept && is_paging(vcpu)) - vcpu->arch.cr3 = vmcs_readl(GUEST_CR3); + vcpu->arch.cr3 = vmcs_readl(vcpu, GUEST_CR3); __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail); } @@ -3833,7 +2713,7 @@ static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu) ulong cr4_guest_owned_bits = vcpu->arch.cr4_guest_owned_bits; vcpu->arch.cr4 &= ~cr4_guest_owned_bits; - vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & cr4_guest_owned_bits; + vcpu->arch.cr4 |= vmcs_readl(vcpu, GUEST_CR4) & cr4_guest_owned_bits; } static void ept_load_pdptrs(struct kvm_vcpu *vcpu) @@ -3841,14 +2721,14 @@ static void ept_load_pdptrs(struct kvm_vcpu *vcpu) struct kvm_mmu *mmu = vcpu->arch.walk_mmu; if (!test_bit(VCPU_EXREG_PDPTR, - (unsigned long *)&vcpu->arch.regs_dirty)) + (size_t *)&vcpu->arch.regs_dirty)) return; if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) { - vmcs_write64(GUEST_PDPTR0, mmu->pdptrs[0]); - vmcs_write64(GUEST_PDPTR1, mmu->pdptrs[1]); - vmcs_write64(GUEST_PDPTR2, mmu->pdptrs[2]); - vmcs_write64(GUEST_PDPTR3, mmu->pdptrs[3]); + vmcs_write64(vcpu, GUEST_PDPTR0, mmu->pdptrs[0]); + vmcs_write64(vcpu, GUEST_PDPTR1, mmu->pdptrs[1]); + vmcs_write64(vcpu, GUEST_PDPTR2, mmu->pdptrs[2]); + vmcs_write64(vcpu, GUEST_PDPTR3, mmu->pdptrs[3]); } } @@ -3857,38 +2737,38 @@ static void ept_save_pdptrs(struct kvm_vcpu *vcpu) struct kvm_mmu *mmu = vcpu->arch.walk_mmu; if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) { - mmu->pdptrs[0] = vmcs_read64(GUEST_PDPTR0); - mmu->pdptrs[1] = vmcs_read64(GUEST_PDPTR1); - mmu->pdptrs[2] = vmcs_read64(GUEST_PDPTR2); - mmu->pdptrs[3] = vmcs_read64(GUEST_PDPTR3); + mmu->pdptrs[0] = vmcs_read64(vcpu, GUEST_PDPTR0); + mmu->pdptrs[1] = vmcs_read64(vcpu, GUEST_PDPTR1); + mmu->pdptrs[2] = vmcs_read64(vcpu, GUEST_PDPTR2); + mmu->pdptrs[3] = vmcs_read64(vcpu, GUEST_PDPTR3); } __set_bit(VCPU_EXREG_PDPTR, - (unsigned long *)&vcpu->arch.regs_avail); + (size_t *)&vcpu->arch.regs_avail); __set_bit(VCPU_EXREG_PDPTR, - (unsigned long *)&vcpu->arch.regs_dirty); + (size_t *)&vcpu->arch.regs_dirty); } -static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4); +static int vmx_set_cr4(struct kvm_vcpu *vcpu, size_t cr4); -static void ept_update_paging_mode_cr0(unsigned long *hw_cr0, - unsigned long cr0, +static void 
ept_update_paging_mode_cr0(size_t *hw_cr0, + size_t cr0, struct kvm_vcpu *vcpu) { if (!test_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail)) vmx_decache_cr3(vcpu); if (!(cr0 & X86_CR0_PG)) { /* From paging/starting to nonpaging */ - vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, - vmcs_read32(CPU_BASED_VM_EXEC_CONTROL) | + vmcs_write32(vcpu, CPU_BASED_VM_EXEC_CONTROL, + vmcs_read32(vcpu, CPU_BASED_VM_EXEC_CONTROL) | (CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING)); vcpu->arch.cr0 = cr0; vmx_set_cr4(vcpu, kvm_read_cr4(vcpu)); } else if (!is_paging(vcpu)) { /* From nonpaging to paging */ - vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, - vmcs_read32(CPU_BASED_VM_EXEC_CONTROL) & + vmcs_write32(vcpu, CPU_BASED_VM_EXEC_CONTROL, + vmcs_read32(vcpu, CPU_BASED_VM_EXEC_CONTROL) & ~(CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING)); vcpu->arch.cr0 = cr0; @@ -3899,16 +2779,16 @@ static void ept_update_paging_mode_cr0(unsigned long *hw_cr0, *hw_cr0 &= ~X86_CR0_WP; } -static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) +static void vmx_set_cr0(struct kvm_vcpu *vcpu, size_t cr0) { struct vcpu_vmx *vmx = to_vmx(vcpu); - unsigned long hw_cr0; + size_t hw_cr0; - hw_cr0 = (cr0 & ~KVM_GUEST_CR0_MASK); + hw_cr0 = (cr0 & ~GVM_GUEST_CR0_MASK); if (enable_unrestricted_guest) - hw_cr0 |= KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST; + hw_cr0 |= GVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST; else { - hw_cr0 |= KVM_VM_CR0_ALWAYS_ON; + hw_cr0 |= GVM_VM_CR0_ALWAYS_ON; if (vmx->rmode.vm86_active && (cr0 & X86_CR0_PE)) enter_pmode(vcpu); @@ -3929,18 +2809,15 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) if (enable_ept) ept_update_paging_mode_cr0(&hw_cr0, cr0, vcpu); - if (!vcpu->fpu_active) - hw_cr0 |= X86_CR0_TS | X86_CR0_MP; - - vmcs_writel(CR0_READ_SHADOW, cr0); - vmcs_writel(GUEST_CR0, hw_cr0); + vmcs_writel(vcpu, CR0_READ_SHADOW, cr0); + vmcs_writel(vcpu, GUEST_CR0, hw_cr0); vcpu->arch.cr0 = cr0; /* depends on vcpu->arch.cr0 to be set to a new value */ vmx->emulation_required = emulation_required(vcpu); } -static u64 construct_eptp(unsigned long root_hpa) +static u64 construct_eptp(size_t root_hpa) { u64 eptp; @@ -3954,15 +2831,15 @@ static u64 construct_eptp(unsigned long root_hpa) return eptp; } -static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) +static void vmx_set_cr3(struct kvm_vcpu *vcpu, size_t cr3) { - unsigned long guest_cr3; + size_t guest_cr3; u64 eptp; guest_cr3 = cr3; if (enable_ept) { eptp = construct_eptp(cr3); - vmcs_write64(EPT_POINTER, eptp); + vmcs_write64(vcpu, EPT_POINTER, eptp); if (is_paging(vcpu) || is_guest_mode(vcpu)) guest_cr3 = kvm_read_cr3(vcpu); else @@ -3971,21 +2848,21 @@ static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) } vmx_flush_tlb(vcpu); - vmcs_writel(GUEST_CR3, guest_cr3); + vmcs_writel(vcpu, GUEST_CR3, guest_cr3); } -static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) +static int vmx_set_cr4(struct kvm_vcpu *vcpu, size_t cr4) { /* * Pass through host's Machine Check Enable value to hw_cr4, which * is in force while we are in guest mode. Do not let guests control * this bit, even if host CR4.MCE == 0. */ - unsigned long hw_cr4 = + size_t hw_cr4 = (cr4_read_shadow() & X86_CR4_MCE) | (cr4 & ~X86_CR4_MCE) | (to_vmx(vcpu)->rmode.vm86_active ? 
- KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON); + GVM_RMODE_VM_CR4_ALWAYS_ON : GVM_PMODE_VM_CR4_ALWAYS_ON); if (cr4 & X86_CR4_VMXE) { /* @@ -3994,7 +2871,7 @@ static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) * So basically the check on whether to allow nested VMX * is here. */ - if (!nested_vmx_allowed(vcpu)) + //if (!nested_vmx_allowed(vcpu)) return 1; } if (to_vmx(vcpu)->nested.vmxon && @@ -4025,8 +2902,8 @@ static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) */ hw_cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE); - vmcs_writel(CR4_READ_SHADOW, cr4); - vmcs_writel(GUEST_CR4, hw_cr4); + vmcs_writel(vcpu, CR4_READ_SHADOW, cr4); + vmcs_writel(vcpu, GUEST_CR4, hw_cr4); return 0; } @@ -4121,15 +2998,15 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu, if (vmx->rmode.vm86_active && seg != VCPU_SREG_LDTR) { vmx->rmode.segs[seg] = *var; if (seg == VCPU_SREG_TR) - vmcs_write16(sf->selector, var->selector); + vmcs_write16(vcpu, sf->selector, var->selector); else if (var->s) - fix_rmode_seg(seg, &vmx->rmode.segs[seg]); + fix_rmode_seg(vcpu, seg, &vmx->rmode.segs[seg]); goto out; } - vmcs_writel(sf->base, var->base); - vmcs_write32(sf->limit, var->limit); - vmcs_write16(sf->selector, var->selector); + vmcs_writel(vcpu, sf->base, var->base); + vmcs_write32(vcpu, sf->limit, var->limit); + vmcs_write16(vcpu, sf->selector, var->selector); /* * Fix the "Accessed" bit in AR field of segment registers for older @@ -4145,7 +3022,7 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu, if (enable_unrestricted_guest && (seg != VCPU_SREG_LDTR)) var->type |= 0x1; /* Accessed */ - vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(var)); + vmcs_write32(vcpu, sf->ar_bytes, vmx_segment_access_rights(var)); out: vmx->emulation_required = emulation_required(vcpu); @@ -4161,26 +3038,26 @@ static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) static void vmx_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) { - dt->size = vmcs_read32(GUEST_IDTR_LIMIT); - dt->address = vmcs_readl(GUEST_IDTR_BASE); + dt->size = vmcs_read32(vcpu, GUEST_IDTR_LIMIT); + dt->address = vmcs_readl(vcpu, GUEST_IDTR_BASE); } static void vmx_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) { - vmcs_write32(GUEST_IDTR_LIMIT, dt->size); - vmcs_writel(GUEST_IDTR_BASE, dt->address); + vmcs_write32(vcpu, GUEST_IDTR_LIMIT, dt->size); + vmcs_writel(vcpu, GUEST_IDTR_BASE, dt->address); } static void vmx_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) { - dt->size = vmcs_read32(GUEST_GDTR_LIMIT); - dt->address = vmcs_readl(GUEST_GDTR_BASE); + dt->size = vmcs_read32(vcpu, GUEST_GDTR_LIMIT); + dt->address = vmcs_readl(vcpu, GUEST_GDTR_BASE); } static void vmx_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) { - vmcs_write32(GUEST_GDTR_LIMIT, dt->size); - vmcs_writel(GUEST_GDTR_BASE, dt->address); + vmcs_write32(vcpu, GUEST_GDTR_LIMIT, dt->size); + vmcs_writel(vcpu, GUEST_GDTR_BASE, dt->address); } static bool rmode_segment_valid(struct kvm_vcpu *vcpu, int seg) @@ -4454,24 +3331,24 @@ out2: return r; } -static void seg_setup(int seg) +static void seg_setup(struct kvm_vcpu *vcpu, int seg) { const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; unsigned int ar; - vmcs_write16(sf->selector, 0); - vmcs_writel(sf->base, 0); - vmcs_write32(sf->limit, 0xffff); + vmcs_write16(vcpu, sf->selector, 0); + vmcs_writel(vcpu, sf->base, 0); + vmcs_write32(vcpu, sf->limit, 0xffff); ar = 0x93; if (seg == VCPU_SREG_CS) ar |= 0x08; /* code segment */ - vmcs_write32(sf->ar_bytes, 
ar); + vmcs_write32(vcpu, sf->ar_bytes, ar); } static int alloc_apic_access_page(struct kvm *kvm) { - struct page *page; + pfn_t pfn; int r = 0; mutex_lock(&kvm->slots_lock); @@ -4482,17 +3359,12 @@ static int alloc_apic_access_page(struct kvm *kvm) if (r) goto out; - page = gfn_to_page(kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT); - if (is_error_page(page)) { + pfn = gfn_to_pfn(kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT); + if (is_error_noslot_pfn(pfn)) { r = -EFAULT; goto out; } - /* - * Do not pin the page in memory, so that memory hot-unplug - * is able to migrate it. - */ - put_page(page); kvm->arch.apic_access_page_done = true; out: mutex_unlock(&kvm->slots_lock); @@ -4540,10 +3412,10 @@ static void free_vpid(int vpid) #define MSR_TYPE_R 1 #define MSR_TYPE_W 2 -static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, +static void __vmx_disable_intercept_for_msr(size_t *msr_bitmap, u32 msr, int type) { - int f = sizeof(unsigned long); + int f = sizeof(size_t); if (!cpu_has_vmx_msr_bitmap()) return; @@ -4575,10 +3447,10 @@ static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, } } -static void __vmx_enable_intercept_for_msr(unsigned long *msr_bitmap, +static void __vmx_enable_intercept_for_msr(size_t *msr_bitmap, u32 msr, int type) { - int f = sizeof(unsigned long); + int f = sizeof(size_t); if (!cpu_has_vmx_msr_bitmap()) return; @@ -4614,11 +3486,11 @@ static void __vmx_enable_intercept_for_msr(unsigned long *msr_bitmap, * If a msr is allowed by L0, we should check whether it is allowed by L1. * The corresponding bit will be cleared unless both of L0 and L1 allow it. */ -static void nested_vmx_disable_intercept_for_msr(unsigned long *msr_bitmap_l1, - unsigned long *msr_bitmap_nested, +static void nested_vmx_disable_intercept_for_msr(size_t *msr_bitmap_l1, + size_t *msr_bitmap_nested, u32 msr, int type) { - int f = sizeof(unsigned long); + int f = sizeof(size_t); if (!cpu_has_vmx_msr_bitmap()) { WARN_ON(1); @@ -4715,125 +3587,6 @@ static bool vmx_get_enable_apicv(void) return enable_apicv; } -static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - int max_irr; - void *vapic_page; - u16 status; - - if (vmx->nested.pi_desc && - vmx->nested.pi_pending) { - vmx->nested.pi_pending = false; - if (!pi_test_and_clear_on(vmx->nested.pi_desc)) - return 0; - - max_irr = find_last_bit( - (unsigned long *)vmx->nested.pi_desc->pir, 256); - - if (max_irr == 256) - return 0; - - vapic_page = kmap(vmx->nested.virtual_apic_page); - if (!vapic_page) { - WARN_ON(1); - return -ENOMEM; - } - __kvm_apic_update_irr(vmx->nested.pi_desc->pir, vapic_page); - kunmap(vmx->nested.virtual_apic_page); - - status = vmcs_read16(GUEST_INTR_STATUS); - if ((u8)max_irr > ((u8)status & 0xff)) { - status &= ~0xff; - status |= (u8)max_irr; - vmcs_write16(GUEST_INTR_STATUS, status); - } - } - return 0; -} - -static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu) -{ -#ifdef CONFIG_SMP - if (vcpu->mode == IN_GUEST_MODE) { - struct vcpu_vmx *vmx = to_vmx(vcpu); - - /* - * Currently, we don't support urgent interrupt, - * all interrupts are recognized as non-urgent - * interrupt, so we cannot post interrupts when - * 'SN' is set. - * - * If the vcpu is in guest mode, it means it is - * running instead of being scheduled out and - * waiting in the run queue, and that's the only - * case when 'SN' is set currently, warning if - * 'SN' is set. 
- */ - WARN_ON_ONCE(pi_test_sn(&vmx->pi_desc)); - - apic->send_IPI_mask(get_cpu_mask(vcpu->cpu), - POSTED_INTR_VECTOR); - return true; - } -#endif - return false; -} - -static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu, - int vector) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - - if (is_guest_mode(vcpu) && - vector == vmx->nested.posted_intr_nv) { - /* the PIR and ON have been set by L1. */ - kvm_vcpu_trigger_posted_interrupt(vcpu); - /* - * If a posted intr is not recognized by hardware, - * we will accomplish it in the next vmentry. - */ - vmx->nested.pi_pending = true; - kvm_make_request(KVM_REQ_EVENT, vcpu); - return 0; - } - return -1; -} -/* - * Send interrupt to vcpu via posted interrupt way. - * 1. If target vcpu is running(non-root mode), send posted interrupt - * notification to vcpu and hardware will sync PIR to vIRR atomically. - * 2. If target vcpu isn't running(root mode), kick it to pick up the - * interrupt from PIR in next vmentry. - */ -static void vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - int r; - - r = vmx_deliver_nested_posted_interrupt(vcpu, vector); - if (!r) - return; - - if (pi_test_and_set_pir(vector, &vmx->pi_desc)) - return; - - r = pi_test_and_set_on(&vmx->pi_desc); - kvm_make_request(KVM_REQ_EVENT, vcpu); - if (r || !kvm_vcpu_trigger_posted_interrupt(vcpu)) - kvm_vcpu_kick(vcpu); -} - -static void vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - - if (!pi_test_and_clear_on(&vmx->pi_desc)) - return; - - kvm_apic_update_irr(vcpu, vmx->pi_desc.pir); -} - /* * Set up the vmcs's constant host-state fields, i.e., host-state fields that * will not change in the lifetime of the guest. @@ -4842,71 +3595,63 @@ static void vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu) */ static void vmx_set_constant_host_state(struct vcpu_vmx *vmx) { - u32 low32, high32; - unsigned long tmpl; - struct desc_ptr dt; - unsigned long cr4; + u32 low32 = 0, high32 = 0; + size_t tmpl; + size_t cr4; + struct kvm_vcpu *vcpu = &vmx->vcpu; - vmcs_writel(HOST_CR0, read_cr0() & ~X86_CR0_TS); /* 22.2.3 */ - vmcs_writel(HOST_CR3, read_cr3()); /* 22.2.3 FIXME: shadow tables */ + vmcs_writel(vcpu, HOST_CR0, read_cr0() & ~X86_CR0_TS); /* 22.2.3 */ + vmcs_writel(vcpu, HOST_CR3, read_cr3()); /* 22.2.3 FIXME: shadow tables */ /* Save the most likely value for this task's CR4 in the VMCS. */ cr4 = cr4_read_shadow(); - vmcs_writel(HOST_CR4, cr4); /* 22.2.3, 22.2.5 */ + vmcs_writel(vcpu, HOST_CR4, cr4); /* 22.2.3, 22.2.5 */ vmx->host_state.vmcs_host_cr4 = cr4; - vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS); /* 22.2.4 */ + vmcs_write16(vcpu, HOST_CS_SELECTOR, __KERNEL_CS); /* 22.2.4 */ #ifdef CONFIG_X86_64 /* * Load null selectors, so we can avoid reloading them in * __vmx_load_host_state(), in case userspace uses the null selectors * too (the expected case). 
*/ - vmcs_write16(HOST_DS_SELECTOR, 0); - vmcs_write16(HOST_ES_SELECTOR, 0); + vmcs_write16(vcpu, HOST_DS_SELECTOR, 0); + vmcs_write16(vcpu, HOST_ES_SELECTOR, 0); #else - vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS); /* 22.2.4 */ - vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS); /* 22.2.4 */ + vmcs_write16(vcpu, HOST_DS_SELECTOR, __KERNEL_DS); /* 22.2.4 */ + vmcs_write16(vcpu, HOST_ES_SELECTOR, __KERNEL_DS); /* 22.2.4 */ #endif - vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS); /* 22.2.4 */ - vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8); /* 22.2.4 */ - - native_store_idt(&dt); - vmcs_writel(HOST_IDTR_BASE, dt.address); /* 22.2.4 */ - vmx->host_idt_base = dt.address; + vmcs_write16(vcpu, HOST_SS_SELECTOR, __KERNEL_SS); /* 22.2.4 */ + vmcs_write16(vcpu, HOST_TR_SELECTOR, GDT_ENTRY_TSS*8); /* 22.2.4 */ - vmcs_writel(HOST_RIP, vmx_return); /* 22.2.5 */ + vmcs_writel(vcpu, HOST_RIP, vmx_return); /* 22.2.5 */ rdmsr(MSR_IA32_SYSENTER_CS, low32, high32); - vmcs_write32(HOST_IA32_SYSENTER_CS, low32); + vmcs_write32(vcpu, HOST_IA32_SYSENTER_CS, low32); rdmsrl(MSR_IA32_SYSENTER_EIP, tmpl); - vmcs_writel(HOST_IA32_SYSENTER_EIP, tmpl); /* 22.2.3 */ + vmcs_writel(vcpu, HOST_IA32_SYSENTER_EIP, tmpl); /* 22.2.3 */ if (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PAT) { rdmsr(MSR_IA32_CR_PAT, low32, high32); - vmcs_write64(HOST_IA32_PAT, low32 | ((u64) high32 << 32)); + vmcs_write64(vcpu, HOST_IA32_PAT, low32 | ((u64) high32 << 32)); } } static void set_cr4_guest_host_mask(struct vcpu_vmx *vmx) { - vmx->vcpu.arch.cr4_guest_owned_bits = KVM_CR4_GUEST_OWNED_BITS; + vmx->vcpu.arch.cr4_guest_owned_bits = GVM_CR4_GUEST_OWNED_BITS; if (enable_ept) vmx->vcpu.arch.cr4_guest_owned_bits |= X86_CR4_PGE; if (is_guest_mode(&vmx->vcpu)) vmx->vcpu.arch.cr4_guest_owned_bits &= ~get_vmcs12(&vmx->vcpu)->cr4_guest_host_mask; - vmcs_writel(CR4_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr4_guest_owned_bits); + vmcs_writel(&vmx->vcpu, CR4_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr4_guest_owned_bits); } static u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx) { u32 pin_based_exec_ctrl = vmcs_config.pin_based_exec_ctrl; - if (!kvm_vcpu_apicv_active(&vmx->vcpu)) - pin_based_exec_ctrl &= ~PIN_BASED_POSTED_INTR; - /* Enable the preemption timer dynamically */ - pin_based_exec_ctrl &= ~PIN_BASED_VMX_PREEMPTION_TIMER; return pin_based_exec_ctrl; } @@ -4914,14 +3659,14 @@ static void vmx_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); - vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, vmx_pin_based_exec_ctrl(vmx)); + vmcs_write32(vcpu, PIN_BASED_VM_EXEC_CONTROL, vmx_pin_based_exec_ctrl(vmx)); if (cpu_has_secondary_exec_ctrls()) { if (kvm_vcpu_apicv_active(vcpu)) - vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL, + vmcs_set_bits(vcpu, SECONDARY_VM_EXEC_CONTROL, SECONDARY_EXEC_APIC_REGISTER_VIRT | SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); else - vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL, + vmcs_clear_bits(vcpu, SECONDARY_VM_EXEC_CONTROL, SECONDARY_EXEC_APIC_REGISTER_VIRT | SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); } @@ -4934,7 +3679,7 @@ static u32 vmx_exec_control(struct vcpu_vmx *vmx) { u32 exec_control = vmcs_config.cpu_based_exec_ctrl; - if (vmx->vcpu.arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT) + if (vmx->vcpu.arch.switch_db_regs & GVM_DEBUGREG_WONT_EXIT) exec_control &= ~CPU_BASED_MOV_DR_EXITING; if (!cpu_need_tpr_shadow(&vmx->vcpu)) { @@ -4966,8 +3711,6 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx) } if (!enable_unrestricted_guest) exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST; - if (!ple_gap) - exec_control &= 
~SECONDARY_EXEC_PAUSE_LOOP_EXITING; if (!kvm_vcpu_apicv_active(&vmx->vcpu)) exec_control &= ~(SECONDARY_EXEC_APIC_REGISTER_VIRT | SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); @@ -5003,114 +3746,90 @@ static void ept_set_mmio_spte_mask(void) static int vmx_vcpu_setup(struct vcpu_vmx *vmx) { #ifdef CONFIG_X86_64 - unsigned long a; + size_t a; #endif - int i; + struct kvm_vcpu *vcpu = &vmx->vcpu; /* I/O */ - vmcs_write64(IO_BITMAP_A, __pa(vmx_io_bitmap_a)); - vmcs_write64(IO_BITMAP_B, __pa(vmx_io_bitmap_b)); + vmcs_write64(vcpu, IO_BITMAP_A, __pa(vmx_io_bitmap_a)); + vmcs_write64(vcpu, IO_BITMAP_B, __pa(vmx_io_bitmap_b)); if (enable_shadow_vmcs) { - vmcs_write64(VMREAD_BITMAP, __pa(vmx_vmread_bitmap)); - vmcs_write64(VMWRITE_BITMAP, __pa(vmx_vmwrite_bitmap)); + vmcs_write64(vcpu, VMREAD_BITMAP, __pa(vmx_vmread_bitmap)); + vmcs_write64(vcpu, VMWRITE_BITMAP, __pa(vmx_vmwrite_bitmap)); } if (cpu_has_vmx_msr_bitmap()) - vmcs_write64(MSR_BITMAP, __pa(vmx_msr_bitmap_legacy)); + vmcs_write64(vcpu, MSR_BITMAP, __pa(vmx_msr_bitmap_legacy)); - vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */ + vmcs_write64(vcpu, VMCS_LINK_POINTER, (u64)-1); /* 22.3.1.5 */ /* Control */ - vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, vmx_pin_based_exec_ctrl(vmx)); - vmx->hv_deadline_tsc = -1; + vmcs_write32(vcpu, PIN_BASED_VM_EXEC_CONTROL, vmx_pin_based_exec_ctrl(vmx)); - vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, vmx_exec_control(vmx)); + vmcs_write32(vcpu, CPU_BASED_VM_EXEC_CONTROL, vmx_exec_control(vmx)); if (cpu_has_secondary_exec_ctrls()) { - vmcs_write32(SECONDARY_VM_EXEC_CONTROL, + vmcs_write32(vcpu, SECONDARY_VM_EXEC_CONTROL, vmx_secondary_exec_control(vmx)); } if (kvm_vcpu_apicv_active(&vmx->vcpu)) { - vmcs_write64(EOI_EXIT_BITMAP0, 0); - vmcs_write64(EOI_EXIT_BITMAP1, 0); - vmcs_write64(EOI_EXIT_BITMAP2, 0); - vmcs_write64(EOI_EXIT_BITMAP3, 0); - - vmcs_write16(GUEST_INTR_STATUS, 0); + vmcs_write64(vcpu, EOI_EXIT_BITMAP0, 0); + vmcs_write64(vcpu, EOI_EXIT_BITMAP1, 0); + vmcs_write64(vcpu, EOI_EXIT_BITMAP2, 0); + vmcs_write64(vcpu, EOI_EXIT_BITMAP3, 0); - vmcs_write16(POSTED_INTR_NV, POSTED_INTR_VECTOR); - vmcs_write64(POSTED_INTR_DESC_ADDR, __pa((&vmx->pi_desc))); - } + vmcs_write16(vcpu, GUEST_INTR_STATUS, 0); - if (ple_gap) { - vmcs_write32(PLE_GAP, ple_gap); - vmx->ple_window = ple_window; - vmx->ple_window_dirty = true; + //vmcs_write16(vcpu, POSTED_INTR_NV, POSTED_INTR_VECTOR); + //vmcs_write64(vcpu, POSTED_INTR_DESC_ADDR, __pa((&vmx->pi_desc))); } - vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0); - vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0); - vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */ + vmcs_write32(vcpu, PAGE_FAULT_ERROR_CODE_MASK, 0); + vmcs_write32(vcpu, PAGE_FAULT_ERROR_CODE_MATCH, 0); + vmcs_write32(vcpu, CR3_TARGET_COUNT, 0); /* 22.2.1 */ - vmcs_write16(HOST_FS_SELECTOR, 0); /* 22.2.4 */ - vmcs_write16(HOST_GS_SELECTOR, 0); /* 22.2.4 */ + vmcs_write16(vcpu, HOST_FS_SELECTOR, 0); /* 22.2.4 */ + vmcs_write16(vcpu, HOST_GS_SELECTOR, 0); /* 22.2.4 */ vmx_set_constant_host_state(vmx); #ifdef CONFIG_X86_64 rdmsrl(MSR_FS_BASE, a); - vmcs_writel(HOST_FS_BASE, a); /* 22.2.4 */ + vmcs_writel(vcpu, HOST_FS_BASE, a); /* 22.2.4 */ rdmsrl(MSR_GS_BASE, a); - vmcs_writel(HOST_GS_BASE, a); /* 22.2.4 */ + vmcs_writel(vcpu, HOST_GS_BASE, a); /* 22.2.4 */ #else - vmcs_writel(HOST_FS_BASE, 0); /* 22.2.4 */ - vmcs_writel(HOST_GS_BASE, 0); /* 22.2.4 */ + vmcs_writel(vcpu, HOST_FS_BASE, 0); /* 22.2.4 */ + vmcs_writel(vcpu, HOST_GS_BASE, 0); /* 22.2.4 */ #endif - vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0); - 
vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0); - vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host)); - vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0); - vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest)); + vmcs_write32(vcpu, VM_EXIT_MSR_STORE_COUNT, 0); + vmcs_write32(vcpu, VM_EXIT_MSR_LOAD_COUNT, 0); + vmcs_write64(vcpu, VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host)); + vmcs_write32(vcpu, VM_ENTRY_MSR_LOAD_COUNT, 0); + vmcs_write64(vcpu, VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest)); if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) - vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat); - - for (i = 0; i < ARRAY_SIZE(vmx_msr_index); ++i) { - u32 index = vmx_msr_index[i]; - u32 data_low, data_high; - int j = vmx->nmsrs; - - if (rdmsr_safe(index, &data_low, &data_high) < 0) - continue; - if (wrmsr_safe(index, data_low, data_high) < 0) - continue; - vmx->guest_msrs[j].index = i; - vmx->guest_msrs[j].data = 0; - vmx->guest_msrs[j].mask = -1ull; - ++vmx->nmsrs; - } - + vmcs_write64(vcpu, GUEST_IA32_PAT, vmx->vcpu.arch.pat); vm_exit_controls_init(vmx, vmcs_config.vmexit_ctrl); /* 22.2.1, 20.8.1 */ vm_entry_controls_init(vmx, vmcs_config.vmentry_ctrl); - vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL); + vmcs_writel(vcpu, CR0_GUEST_HOST_MASK, ~0UL); set_cr4_guest_host_mask(vmx); if (vmx_xsaves_supported()) - vmcs_write64(XSS_EXIT_BITMAP, VMX_XSS_EXIT_BITMAP); + vmcs_write64(vcpu, XSS_EXIT_BITMAP, VMX_XSS_EXIT_BITMAP); if (enable_pml) { ASSERT(vmx->pml_pg); - vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg)); - vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1); + vmcs_write64(vcpu, PML_ADDRESS, page_to_phys(vmx->pml_pg)); + vmcs_write16(vcpu, GUEST_PML_INDEX, PML_ENTITY_NUM - 1); } return 0; } - static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -5135,72 +3854,68 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) vmx_segment_cache_clear(vmx); - seg_setup(VCPU_SREG_CS); - vmcs_write16(GUEST_CS_SELECTOR, 0xf000); - vmcs_writel(GUEST_CS_BASE, 0xffff0000ul); + seg_setup(vcpu, VCPU_SREG_CS); + vmcs_write16(vcpu, GUEST_CS_SELECTOR, 0xf000); + vmcs_writel(vcpu, GUEST_CS_BASE, 0xffff0000ul); - seg_setup(VCPU_SREG_DS); - seg_setup(VCPU_SREG_ES); - seg_setup(VCPU_SREG_FS); - seg_setup(VCPU_SREG_GS); - seg_setup(VCPU_SREG_SS); + seg_setup(vcpu, VCPU_SREG_DS); + seg_setup(vcpu, VCPU_SREG_ES); + seg_setup(vcpu, VCPU_SREG_FS); + seg_setup(vcpu, VCPU_SREG_GS); + seg_setup(vcpu, VCPU_SREG_SS); - vmcs_write16(GUEST_TR_SELECTOR, 0); - vmcs_writel(GUEST_TR_BASE, 0); - vmcs_write32(GUEST_TR_LIMIT, 0xffff); - vmcs_write32(GUEST_TR_AR_BYTES, 0x008b); + vmcs_write16(vcpu, GUEST_TR_SELECTOR, 0); + vmcs_writel(vcpu, GUEST_TR_BASE, 0); + vmcs_write32(vcpu, GUEST_TR_LIMIT, 0xffff); + vmcs_write32(vcpu, GUEST_TR_AR_BYTES, 0x008b); - vmcs_write16(GUEST_LDTR_SELECTOR, 0); - vmcs_writel(GUEST_LDTR_BASE, 0); - vmcs_write32(GUEST_LDTR_LIMIT, 0xffff); - vmcs_write32(GUEST_LDTR_AR_BYTES, 0x00082); + vmcs_write16(vcpu, GUEST_LDTR_SELECTOR, 0); + vmcs_writel(vcpu, GUEST_LDTR_BASE, 0); + vmcs_write32(vcpu, GUEST_LDTR_LIMIT, 0xffff); + vmcs_write32(vcpu, GUEST_LDTR_AR_BYTES, 0x00082); if (!init_event) { - vmcs_write32(GUEST_SYSENTER_CS, 0); - vmcs_writel(GUEST_SYSENTER_ESP, 0); - vmcs_writel(GUEST_SYSENTER_EIP, 0); - vmcs_write64(GUEST_IA32_DEBUGCTL, 0); + vmcs_write32(vcpu, GUEST_SYSENTER_CS, 0); + vmcs_writel(vcpu, GUEST_SYSENTER_ESP, 0); + vmcs_writel(vcpu, GUEST_SYSENTER_EIP, 0); + vmcs_write64(vcpu, GUEST_IA32_DEBUGCTL, 
0); } - vmcs_writel(GUEST_RFLAGS, 0x02); + vmcs_writel(vcpu, GUEST_RFLAGS, 0x02); kvm_rip_write(vcpu, 0xfff0); - vmcs_writel(GUEST_GDTR_BASE, 0); - vmcs_write32(GUEST_GDTR_LIMIT, 0xffff); + vmcs_writel(vcpu, GUEST_GDTR_BASE, 0); + vmcs_write32(vcpu, GUEST_GDTR_LIMIT, 0xffff); - vmcs_writel(GUEST_IDTR_BASE, 0); - vmcs_write32(GUEST_IDTR_LIMIT, 0xffff); + vmcs_writel(vcpu, GUEST_IDTR_BASE, 0); + vmcs_write32(vcpu, GUEST_IDTR_LIMIT, 0xffff); - vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE); - vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0); - vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, 0); + vmcs_write32(vcpu, GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE); + vmcs_write32(vcpu, GUEST_INTERRUPTIBILITY_INFO, 0); + vmcs_writel(vcpu, GUEST_PENDING_DBG_EXCEPTIONS, 0); setup_msrs(vmx); - vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); /* 22.2.1 */ + vmcs_write32(vcpu, VM_ENTRY_INTR_INFO_FIELD, 0); /* 22.2.1 */ if (cpu_has_vmx_tpr_shadow() && !init_event) { - vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0); + vmcs_write64(vcpu, VIRTUAL_APIC_PAGE_ADDR, 0); if (cpu_need_tpr_shadow(vcpu)) - vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, + vmcs_write64(vcpu, VIRTUAL_APIC_PAGE_ADDR, __pa(vcpu->arch.apic->regs)); - vmcs_write32(TPR_THRESHOLD, 0); + vmcs_write32(vcpu, TPR_THRESHOLD, 0); } - kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu); - - if (kvm_vcpu_apicv_active(vcpu)) - memset(&vmx->pi_desc, 0, sizeof(struct pi_desc)); + kvm_make_request(GVM_REQ_APIC_PAGE_RELOAD, vcpu); if (vmx->vpid != 0) - vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid); + vmcs_write16(vcpu, VIRTUAL_PROCESSOR_ID, vmx->vpid); cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET; vmx->vcpu.arch.cr0 = cr0; vmx_set_cr0(vcpu, cr0); /* enter rmode */ vmx_set_cr4(vcpu, 0); vmx_set_efer(vcpu, 0); - vmx_fpu_activate(vcpu); update_exception_bitmap(vcpu); vpid_sync_context(vmx->vpid); @@ -5236,9 +3951,9 @@ static void enable_irq_window(struct kvm_vcpu *vcpu) { u32 cpu_based_vm_exec_control; - cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); + cpu_based_vm_exec_control = vmcs_read32(vcpu, CPU_BASED_VM_EXEC_CONTROL); cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING; - vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); + vmcs_write32(vcpu, CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); } static void enable_nmi_window(struct kvm_vcpu *vcpu) @@ -5246,14 +3961,14 @@ static void enable_nmi_window(struct kvm_vcpu *vcpu) u32 cpu_based_vm_exec_control; if (!cpu_has_virtual_nmis() || - vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) { + vmcs_read32(vcpu, GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) { enable_irq_window(vcpu); return; } - cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); + cpu_based_vm_exec_control = vmcs_read32(vcpu, CPU_BASED_VM_EXEC_CONTROL); cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_NMI_PENDING; - vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); + vmcs_write32(vcpu, CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); } static void vmx_inject_irq(struct kvm_vcpu *vcpu) @@ -5262,25 +3977,23 @@ static void vmx_inject_irq(struct kvm_vcpu *vcpu) uint32_t intr; int irq = vcpu->arch.interrupt.nr; - trace_kvm_inj_virq(irq); - ++vcpu->stat.irq_injections; if (vmx->rmode.vm86_active) { int inc_eip = 0; if (vcpu->arch.interrupt.soft) inc_eip = vcpu->arch.event_exit_inst_len; if (kvm_inject_realmode_interrupt(vcpu, irq, inc_eip) != EMULATE_DONE) - kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); + kvm_make_request(GVM_REQ_TRIPLE_FAULT, vcpu); return; } 
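	/*
	 * Illustrative note, not part of the original patch: the value
	 * assembled below is written to VM_ENTRY_INTR_INFO_FIELD, whose
	 * architectural layout is
	 *   bits 7:0   vector
	 *   bits 10:8  interruption type (0 = external interrupt, 2 = NMI,
	 *              3 = hardware exception, 4 = software interrupt, ...)
	 *   bit 11     deliver error code
	 *   bit 31     valid
	 * so an external interrupt on vector 0x20, for example, is encoded
	 * as 0x80000020.
	 */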
intr = irq | INTR_INFO_VALID_MASK; if (vcpu->arch.interrupt.soft) { intr |= INTR_TYPE_SOFT_INTR; - vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, + vmcs_write32(vcpu, VM_ENTRY_INSTRUCTION_LEN, vmx->vcpu.arch.event_exit_inst_len); } else intr |= INTR_TYPE_EXT_INTR; - vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr); + vmcs_write32(vcpu, VM_ENTRY_INTR_INFO_FIELD, intr); } static void vmx_inject_nmi(struct kvm_vcpu *vcpu) @@ -5307,11 +4020,11 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu) if (vmx->rmode.vm86_active) { if (kvm_inject_realmode_interrupt(vcpu, NMI_VECTOR, 0) != EMULATE_DONE) - kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); + kvm_make_request(GVM_REQ_TRIPLE_FAULT, vcpu); return; } - vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, + vmcs_write32(vcpu, VM_ENTRY_INTR_INFO_FIELD, INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR); } @@ -5321,7 +4034,7 @@ static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu) return to_vmx(vcpu)->soft_vnmi_blocked; if (to_vmx(vcpu)->nmi_known_unmasked) return false; - return vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_NMI; + return vmcs_read32(vcpu, GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_NMI; } static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked) @@ -5336,10 +4049,10 @@ static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked) } else { vmx->nmi_known_unmasked = !masked; if (masked) - vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, + vmcs_set_bits(vcpu, GUEST_INTERRUPTIBILITY_INFO, GUEST_INTR_STATE_NMI); else - vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO, + vmcs_clear_bits(vcpu, GUEST_INTERRUPTIBILITY_INFO, GUEST_INTR_STATE_NMI); } } @@ -5352,7 +4065,7 @@ static int vmx_nmi_allowed(struct kvm_vcpu *vcpu) if (!cpu_has_virtual_nmis() && to_vmx(vcpu)->soft_vnmi_blocked) return 0; - return !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & + return !(vmcs_read32(vcpu, GUEST_INTERRUPTIBILITY_INFO) & (GUEST_INTR_STATE_MOV_SS | GUEST_INTR_STATE_STI | GUEST_INTR_STATE_NMI)); } @@ -5360,8 +4073,8 @@ static int vmx_nmi_allowed(struct kvm_vcpu *vcpu) static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu) { return (!to_vmx(vcpu)->nested.nested_run_pending && - vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) && - !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & + vmcs_readl(vcpu, GUEST_RFLAGS) & X86_EFLAGS_IF) && + !(vmcs_read32(vcpu, GUEST_INTERRUPTIBILITY_INFO) & (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS)); } @@ -5386,13 +4099,13 @@ static bool rmode_exception(struct kvm_vcpu *vcpu, int vec) * from user space while in guest debugging mode. */ to_vmx(vcpu)->vcpu.arch.event_exit_inst_len = - vmcs_read32(VM_EXIT_INSTRUCTION_LEN); - if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) + vmcs_read32(vcpu, VM_EXIT_INSTRUCTION_LEN); + if (vcpu->guest_debug & GVM_GUESTDBG_USE_SW_BP) return false; /* fall through */ case DB_VECTOR: if (vcpu->guest_debug & - (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) + (GVM_GUESTDBG_SINGLESTEP | GVM_GUESTDBG_USE_HW_BP)) return false; /* fall through */ case DE_VECTOR: @@ -5445,14 +4158,14 @@ static int handle_rmode_exception(struct kvm_vcpu *vcpu, */ static void kvm_machine_check(void) { -#if defined(CONFIG_X86_MCE) && defined(CONFIG_X86_64) - struct pt_regs regs = { - .cs = 3, /* Fake ring 3 no matter what the guest ran on */ - .flags = X86_EFLAGS_IF, - }; + /* + * On an #MC intercept the MCE handler is not called automatically in + * the host. So do it by hand here. 
+ */ + __int12(); + /* not sure if we ever come back to this point */ - do_machine_check(&regs, 0); -#endif + return; } static int handle_machine_check(struct kvm_vcpu *vcpu) @@ -5466,7 +4179,7 @@ static int handle_exception(struct kvm_vcpu *vcpu) struct vcpu_vmx *vmx = to_vmx(vcpu); struct kvm_run *kvm_run = vcpu->run; u32 intr_info, ex_no, error_code; - unsigned long cr2, rip, dr6; + size_t cr2, rip, dr6; u32 vect_info; enum emulation_result er; @@ -5479,11 +4192,6 @@ static int handle_exception(struct kvm_vcpu *vcpu) if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR) return 1; /* already handled by vmx_vcpu_run() */ - if (is_no_device(intr_info)) { - vmx_fpu_activate(vcpu); - return 1; - } - if (is_invalid_opcode(intr_info)) { if (is_guest_mode(vcpu)) { kvm_queue_exception(vcpu, UD_VECTOR); @@ -5497,7 +4205,7 @@ static int handle_exception(struct kvm_vcpu *vcpu) error_code = 0; if (intr_info & INTR_INFO_DELIVER_CODE_MASK) - error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE); + error_code = vmcs_read32(vcpu, VM_EXIT_INTR_ERROR_CODE); /* * The #PF with PFEC.RSVD = 1 indicates the guest is accessing @@ -5506,8 +4214,8 @@ static int handle_exception(struct kvm_vcpu *vcpu) */ if ((vect_info & VECTORING_INFO_VALID_MASK) && !(is_page_fault(intr_info) && !(error_code & PFERR_RSVD_MASK))) { - vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; - vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_SIMUL_EX; + vcpu->run->exit_reason = GVM_EXIT_INTERNAL_ERROR; + vcpu->run->internal.suberror = GVM_INTERNAL_ERROR_SIMUL_EX; vcpu->run->internal.ndata = 3; vcpu->run->internal.data[0] = vect_info; vcpu->run->internal.data[1] = intr_info; @@ -5518,8 +4226,7 @@ static int handle_exception(struct kvm_vcpu *vcpu) if (is_page_fault(intr_info)) { /* EPT won't cause page fault directly */ BUG_ON(enable_ept); - cr2 = vmcs_readl(EXIT_QUALIFICATION); - trace_kvm_page_fault(cr2, error_code); + cr2 = vmcs_readl(vcpu, EXIT_QUALIFICATION); if (kvm_event_needs_reinjection(vcpu)) kvm_mmu_unprotect_page_virt(vcpu, cr2); @@ -5536,9 +4243,9 @@ static int handle_exception(struct kvm_vcpu *vcpu) kvm_queue_exception_e(vcpu, AC_VECTOR, error_code); return 1; case DB_VECTOR: - dr6 = vmcs_readl(EXIT_QUALIFICATION); + dr6 = vmcs_readl(vcpu, EXIT_QUALIFICATION); if (!(vcpu->guest_debug & - (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))) { + (GVM_GUESTDBG_SINGLESTEP | GVM_GUESTDBG_USE_HW_BP))) { vcpu->arch.dr6 &= ~15; vcpu->arch.dr6 |= dr6 | DR6_RTM; if (!(dr6 & ~DR6_RESERVED)) /* icebp */ @@ -5548,7 +4255,7 @@ static int handle_exception(struct kvm_vcpu *vcpu) return 1; } kvm_run->debug.arch.dr6 = dr6 | DR6_FIXED_1; - kvm_run->debug.arch.dr7 = vmcs_readl(GUEST_DR7); + kvm_run->debug.arch.dr7 = vmcs_readl(vcpu, GUEST_DR7); /* fall through */ case BP_VECTOR: /* @@ -5557,14 +4264,14 @@ static int handle_exception(struct kvm_vcpu *vcpu) * #DB as well causes no harm, it is not used in that case.
*/ vmx->vcpu.arch.event_exit_inst_len = - vmcs_read32(VM_EXIT_INSTRUCTION_LEN); - kvm_run->exit_reason = KVM_EXIT_DEBUG; + vmcs_read32(vcpu, VM_EXIT_INSTRUCTION_LEN); + kvm_run->exit_reason = GVM_EXIT_DEBUG; rip = kvm_rip_read(vcpu); - kvm_run->debug.arch.pc = vmcs_readl(GUEST_CS_BASE) + rip; + kvm_run->debug.arch.pc = vmcs_readl(vcpu, GUEST_CS_BASE) + rip; kvm_run->debug.arch.exception = ex_no; break; default: - kvm_run->exit_reason = KVM_EXIT_EXCEPTION; + kvm_run->exit_reason = GVM_EXIT_EXCEPTION; kvm_run->ex.exception = ex_no; kvm_run->ex.error_code = error_code; break; @@ -5580,17 +4287,17 @@ static int handle_external_interrupt(struct kvm_vcpu *vcpu) static int handle_triple_fault(struct kvm_vcpu *vcpu) { - vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN; + vcpu->run->exit_reason = GVM_EXIT_SHUTDOWN; return 0; } static int handle_io(struct kvm_vcpu *vcpu) { - unsigned long exit_qualification; + size_t exit_qualification; int size, in, string; unsigned port; - exit_qualification = vmcs_readl(EXIT_QUALIFICATION); + exit_qualification = vmcs_readl(vcpu, EXIT_QUALIFICATION); string = (exit_qualification & 16) != 0; in = (exit_qualification & 8) != 0; @@ -5606,20 +4313,9 @@ static int handle_io(struct kvm_vcpu *vcpu) return kvm_fast_pio_out(vcpu, size, port); } -static void -vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall) +static bool nested_cr0_valid(struct kvm_vcpu *vcpu, size_t val) { - /* - * Patch in the VMCALL instruction: - */ - hypercall[0] = 0x0f; - hypercall[1] = 0x01; - hypercall[2] = 0xc1; -} - -static bool nested_cr0_valid(struct kvm_vcpu *vcpu, unsigned long val) -{ - unsigned long always_on = VMXON_CR0_ALWAYSON; + size_t always_on = VMXON_CR0_ALWAYSON; struct vmcs12 *vmcs12 = get_vmcs12(vcpu); if (to_vmx(vcpu)->nested.nested_vmx_secondary_ctls_high & @@ -5630,11 +4326,11 @@ static bool nested_cr0_valid(struct kvm_vcpu *vcpu, unsigned long val) } /* called to set cr0 as appropriate for a mov-to-cr0 exit. */ -static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val) +static int handle_set_cr0(struct kvm_vcpu *vcpu, size_t val) { if (is_guest_mode(vcpu)) { struct vmcs12 *vmcs12 = get_vmcs12(vcpu); - unsigned long orig_val = val; + size_t orig_val = val; /* * We get here when L2 changed cr0 in a way that did not change @@ -5652,7 +4348,7 @@ static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val) if (kvm_set_cr0(vcpu, val)) return 1; - vmcs_writel(CR0_READ_SHADOW, orig_val); + vmcs_writel(vcpu, CR0_READ_SHADOW, orig_val); return 0; } else { if (to_vmx(vcpu)->nested.vmxon && @@ -5662,53 +4358,36 @@ static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val) } } -static int handle_set_cr4(struct kvm_vcpu *vcpu, unsigned long val) +static int handle_set_cr4(struct kvm_vcpu *vcpu, size_t val) { if (is_guest_mode(vcpu)) { struct vmcs12 *vmcs12 = get_vmcs12(vcpu); - unsigned long orig_val = val; + size_t orig_val = val; /* analogously to handle_set_cr0 */ val = (val & ~vmcs12->cr4_guest_host_mask) | (vmcs12->guest_cr4 & vmcs12->cr4_guest_host_mask); if (kvm_set_cr4(vcpu, val)) return 1; - vmcs_writel(CR4_READ_SHADOW, orig_val); + vmcs_writel(vcpu, CR4_READ_SHADOW, orig_val); return 0; } else return kvm_set_cr4(vcpu, val); } -/* called to set cr0 as appropriate for clts instruction exit. */ -static void handle_clts(struct kvm_vcpu *vcpu) -{ - if (is_guest_mode(vcpu)) { - /* - * We get here when L2 did CLTS, and L1 didn't shadow CR0.TS - * but we did (!fpu_active). 
We need to keep GUEST_CR0.TS on, - * just pretend it's off (also in arch.cr0 for fpu_activate). - */ - vmcs_writel(CR0_READ_SHADOW, - vmcs_readl(CR0_READ_SHADOW) & ~X86_CR0_TS); - vcpu->arch.cr0 &= ~X86_CR0_TS; - } else - vmx_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS)); -} - static int handle_cr(struct kvm_vcpu *vcpu) { - unsigned long exit_qualification, val; + size_t exit_qualification, val; int cr; int reg; int err; - exit_qualification = vmcs_readl(EXIT_QUALIFICATION); + exit_qualification = vmcs_readl(vcpu, EXIT_QUALIFICATION); cr = exit_qualification & 15; reg = (exit_qualification >> 8) & 15; switch ((exit_qualification >> 4) & 3) { case 0: /* mov to cr */ val = kvm_register_readl(vcpu, reg); - trace_kvm_cr_write(cr, val); switch (cr) { case 0: err = handle_set_cr0(vcpu, val); @@ -5731,36 +4410,31 @@ static int handle_cr(struct kvm_vcpu *vcpu) return 1; if (cr8_prev <= cr8) return 1; - vcpu->run->exit_reason = KVM_EXIT_SET_TPR; + vcpu->run->exit_reason = GVM_EXIT_SET_TPR; return 0; } } break; case 2: /* clts */ - handle_clts(vcpu); - trace_kvm_cr_write(0, kvm_read_cr0(vcpu)); + vmx_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS)); skip_emulated_instruction(vcpu); - vmx_fpu_activate(vcpu); return 1; case 1: /*mov from cr*/ switch (cr) { case 3: val = kvm_read_cr3(vcpu); kvm_register_write(vcpu, reg, val); - trace_kvm_cr_read(cr, val); skip_emulated_instruction(vcpu); return 1; case 8: val = kvm_get_cr8(vcpu); kvm_register_write(vcpu, reg, val); - trace_kvm_cr_read(cr, val); skip_emulated_instruction(vcpu); return 1; } break; case 3: /* lmsw */ val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f; - trace_kvm_cr_write(0, (kvm_read_cr0(vcpu) & ~0xful) | val); kvm_lmsw(vcpu, val); skip_emulated_instruction(vcpu); @@ -5776,10 +4450,10 @@ static int handle_cr(struct kvm_vcpu *vcpu) static int handle_dr(struct kvm_vcpu *vcpu) { - unsigned long exit_qualification; + size_t exit_qualification; int dr, dr7, reg; - exit_qualification = vmcs_readl(EXIT_QUALIFICATION); + exit_qualification = vmcs_readl(vcpu, EXIT_QUALIFICATION); dr = exit_qualification & DEBUG_REG_ACCESS_NUM; /* First, if DR does not exist, trigger UD */ @@ -5789,19 +4463,19 @@ static int handle_dr(struct kvm_vcpu *vcpu) /* Do not handle if the CPL > 0, will trigger GP on re-entry */ if (!kvm_require_cpl(vcpu, 0)) return 1; - dr7 = vmcs_readl(GUEST_DR7); + dr7 = vmcs_readl(vcpu, GUEST_DR7); if (dr7 & DR7_GD) { /* * As the vm-exit takes precedence over the debug trap, we * need to emulate the latter, either for the host or the * guest debugging itself. */ - if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) { + if (vcpu->guest_debug & GVM_GUESTDBG_USE_HW_BP) { vcpu->run->debug.arch.dr6 = vcpu->arch.dr6; vcpu->run->debug.arch.dr7 = dr7; vcpu->run->debug.arch.pc = kvm_get_linear_rip(vcpu); vcpu->run->debug.arch.exception = DB_VECTOR; - vcpu->run->exit_reason = KVM_EXIT_DEBUG; + vcpu->run->exit_reason = GVM_EXIT_DEBUG; return 0; } else { vcpu->arch.dr6 &= ~15; @@ -5812,7 +4486,7 @@ static int handle_dr(struct kvm_vcpu *vcpu) } if (vcpu->guest_debug == 0) { - vmcs_clear_bits(CPU_BASED_VM_EXEC_CONTROL, + vmcs_clear_bits(vcpu, CPU_BASED_VM_EXEC_CONTROL, CPU_BASED_MOV_DR_EXITING); /* @@ -5820,13 +4494,13 @@ static int handle_dr(struct kvm_vcpu *vcpu) * and reenter on this instruction. The next vmexit will * retrieve the full state of the debug registers. 
*/ - vcpu->arch.switch_db_regs |= KVM_DEBUGREG_WONT_EXIT; + vcpu->arch.switch_db_regs |= GVM_DEBUGREG_WONT_EXIT; return 1; } reg = DEBUG_REG_ACCESS_REG(exit_qualification); if (exit_qualification & TYPE_MOV_FROM_DR) { - unsigned long val; + size_t val; if (kvm_get_dr(vcpu, dr, &val)) return 1; @@ -5844,7 +4518,7 @@ static u64 vmx_get_dr6(struct kvm_vcpu *vcpu) return vcpu->arch.dr6; } -static void vmx_set_dr6(struct kvm_vcpu *vcpu, unsigned long val) +static void vmx_set_dr6(struct kvm_vcpu *vcpu, size_t val) { } @@ -5855,15 +4529,15 @@ static void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu) get_debugreg(vcpu->arch.db[2], 2); get_debugreg(vcpu->arch.db[3], 3); get_debugreg(vcpu->arch.dr6, 6); - vcpu->arch.dr7 = vmcs_readl(GUEST_DR7); + vcpu->arch.dr7 = vmcs_readl(vcpu, GUEST_DR7); - vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT; - vmcs_set_bits(CPU_BASED_VM_EXEC_CONTROL, CPU_BASED_MOV_DR_EXITING); + vcpu->arch.switch_db_regs &= ~GVM_DEBUGREG_WONT_EXIT; + vmcs_set_bits(vcpu, CPU_BASED_VM_EXEC_CONTROL, CPU_BASED_MOV_DR_EXITING); } -static void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val) +static void vmx_set_dr7(struct kvm_vcpu *vcpu, size_t val) { - vmcs_writel(GUEST_DR7, val); + vmcs_writel(vcpu, GUEST_DR7, val); } static int handle_cpuid(struct kvm_vcpu *vcpu) @@ -5880,16 +4554,13 @@ static int handle_rdmsr(struct kvm_vcpu *vcpu) msr_info.index = ecx; msr_info.host_initiated = false; if (vmx_get_msr(vcpu, &msr_info)) { - trace_kvm_msr_read_ex(ecx); kvm_inject_gp(vcpu, 0); return 1; } - trace_kvm_msr_read(ecx, msr_info.data); - /* FIXME: handling of bits 32:63 of rax, rdx */ - vcpu->arch.regs[VCPU_REGS_RAX] = msr_info.data & -1u; - vcpu->arch.regs[VCPU_REGS_RDX] = (msr_info.data >> 32) & -1u; + vcpu->arch.regs[VCPU_REGS_RAX] = msr_info.data & (unsigned)-1; + vcpu->arch.regs[VCPU_REGS_RDX] = (msr_info.data >> 32) & (unsigned)-1; skip_emulated_instruction(vcpu); return 1; } @@ -5898,26 +4569,24 @@ static int handle_wrmsr(struct kvm_vcpu *vcpu) { struct msr_data msr; u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX]; - u64 data = (vcpu->arch.regs[VCPU_REGS_RAX] & -1u) - | ((u64)(vcpu->arch.regs[VCPU_REGS_RDX] & -1u) << 32); + u64 data = (vcpu->arch.regs[VCPU_REGS_RAX] & (unsigned)-1) + | ((u64)(vcpu->arch.regs[VCPU_REGS_RDX] & (unsigned)-1) << 32); msr.data = data; msr.index = ecx; msr.host_initiated = false; if (kvm_set_msr(vcpu, &msr) != 0) { - trace_kvm_msr_write_ex(ecx, data); kvm_inject_gp(vcpu, 0); return 1; } - trace_kvm_msr_write(ecx, data); skip_emulated_instruction(vcpu); return 1; } static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu) { - kvm_make_request(KVM_REQ_EVENT, vcpu); + kvm_make_request(GVM_REQ_EVENT, vcpu); return 1; } @@ -5926,11 +4595,11 @@ static int handle_interrupt_window(struct kvm_vcpu *vcpu) u32 cpu_based_vm_exec_control; /* clear pending irq */ - cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); + cpu_based_vm_exec_control = vmcs_read32(vcpu, CPU_BASED_VM_EXEC_CONTROL); cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING; - vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); + vmcs_write32(vcpu, CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); - kvm_make_request(KVM_REQ_EVENT, vcpu); + kvm_make_request(GVM_REQ_EVENT, vcpu); ++vcpu->stat.irq_window_exits; return 1; @@ -5941,11 +4610,6 @@ static int handle_halt(struct kvm_vcpu *vcpu) return kvm_emulate_halt(vcpu); } -static int handle_vmcall(struct kvm_vcpu *vcpu) -{ - return kvm_emulate_hypercall(vcpu); -} - static int handle_invd(struct kvm_vcpu 
*vcpu) { return emulate_instruction(vcpu, 0) == EMULATE_DONE; @@ -5953,7 +4617,7 @@ static int handle_invd(struct kvm_vcpu *vcpu) static int handle_invlpg(struct kvm_vcpu *vcpu) { - unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); + size_t exit_qualification = vmcs_readl(vcpu, EXIT_QUALIFICATION); kvm_mmu_invlpg(vcpu, exit_qualification); skip_emulated_instruction(vcpu); @@ -5962,10 +4626,12 @@ static int handle_invlpg(struct kvm_vcpu *vcpu) static int handle_rdpmc(struct kvm_vcpu *vcpu) { +#if 0 int err; err = kvm_rdpmc(vcpu); kvm_complete_insn_gp(vcpu, err); +#endif return 1; } @@ -5989,21 +4655,21 @@ static int handle_xsetbv(struct kvm_vcpu *vcpu) static int handle_xsaves(struct kvm_vcpu *vcpu) { skip_emulated_instruction(vcpu); - WARN(1, "this should never happen\n"); + //WARN(1, "this should never happen\n"); return 1; } static int handle_xrstors(struct kvm_vcpu *vcpu) { skip_emulated_instruction(vcpu); - WARN(1, "this should never happen\n"); + //WARN(1, "this should never happen\n"); return 1; } static int handle_apic_access(struct kvm_vcpu *vcpu) { if (likely(fasteoi)) { - unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); + size_t exit_qualification = vmcs_readl(vcpu, EXIT_QUALIFICATION); int access_type, offset; access_type = exit_qualification & APIC_ACCESS_TYPE; @@ -6025,7 +4691,7 @@ static int handle_apic_access(struct kvm_vcpu *vcpu) static int handle_apic_eoi_induced(struct kvm_vcpu *vcpu) { - unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); + size_t exit_qualification = vmcs_readl(vcpu, EXIT_QUALIFICATION); int vector = exit_qualification & 0xff; /* EOI-induced VM exit is trap-like and thus no need to adjust IP */ @@ -6035,7 +4701,7 @@ static int handle_apic_eoi_induced(struct kvm_vcpu *vcpu) static int handle_apic_write(struct kvm_vcpu *vcpu) { - unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); + size_t exit_qualification = vmcs_readl(vcpu, EXIT_QUALIFICATION); u32 offset = exit_qualification & 0xfff; /* APIC-write VM exit is trap-like and thus no need to adjust IP */ @@ -6046,7 +4712,7 @@ static int handle_apic_write(struct kvm_vcpu *vcpu) static int handle_task_switch(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); - unsigned long exit_qualification; + size_t exit_qualification; bool has_error_code = false; u32 error_code = 0; u16 tss_selector; @@ -6056,7 +4722,7 @@ static int handle_task_switch(struct kvm_vcpu *vcpu) idt_index = (vmx->idt_vectoring_info & VECTORING_INFO_VECTOR_MASK); type = (vmx->idt_vectoring_info & VECTORING_INFO_TYPE_MASK); - exit_qualification = vmcs_readl(EXIT_QUALIFICATION); + exit_qualification = vmcs_readl(vcpu, EXIT_QUALIFICATION); reason = (u32)exit_qualification >> 30; if (reason == TASK_SWITCH_GATE && idt_v) { @@ -6074,7 +4740,7 @@ static int handle_task_switch(struct kvm_vcpu *vcpu) VECTORING_INFO_DELIVER_CODE_MASK) { has_error_code = true; error_code = - vmcs_read32(IDT_VECTORING_ERROR_CODE); + vmcs_read32(vcpu, IDT_VECTORING_ERROR_CODE); } /* fall through */ case INTR_TYPE_SOFT_EXCEPTION: @@ -6094,8 +4760,8 @@ static int handle_task_switch(struct kvm_vcpu *vcpu) if (kvm_task_switch(vcpu, tss_selector, type == INTR_TYPE_SOFT_INTR ? 
idt_index : -1, reason, has_error_code, error_code) == EMULATE_FAIL) { - vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; - vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; + vcpu->run->exit_reason = GVM_EXIT_INTERNAL_ERROR; + vcpu->run->internal.suberror = GVM_INTERNAL_ERROR_EMULATION; vcpu->run->internal.ndata = 0; return 0; } @@ -6110,22 +4776,22 @@ static int handle_task_switch(struct kvm_vcpu *vcpu) static int handle_ept_violation(struct kvm_vcpu *vcpu) { - unsigned long exit_qualification; + size_t exit_qualification; gpa_t gpa; u32 error_code; int gla_validity; - exit_qualification = vmcs_readl(EXIT_QUALIFICATION); + exit_qualification = vmcs_readl(vcpu, EXIT_QUALIFICATION); gla_validity = (exit_qualification >> 7) & 0x3; if (gla_validity == 0x2) { printk(KERN_ERR "EPT: Handling EPT violation failed!\n"); printk(KERN_ERR "EPT: GPA: 0x%lx, GVA: 0x%lx\n", - (long unsigned int)vmcs_read64(GUEST_PHYSICAL_ADDRESS), - vmcs_readl(GUEST_LINEAR_ADDRESS)); + (long unsigned int)vmcs_read64(vcpu, GUEST_PHYSICAL_ADDRESS), + vmcs_readl(vcpu, GUEST_LINEAR_ADDRESS)); printk(KERN_ERR "EPT: Exit qualification is 0x%lx\n", (long unsigned int)exit_qualification); - vcpu->run->exit_reason = KVM_EXIT_UNKNOWN; + vcpu->run->exit_reason = GVM_EXIT_UNKNOWN; vcpu->run->hw.hardware_exit_reason = EXIT_REASON_EPT_VIOLATION; return 0; } @@ -6139,10 +4805,9 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu) if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) && cpu_has_virtual_nmis() && (exit_qualification & INTR_INFO_UNBLOCK_NMI)) - vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, GUEST_INTR_STATE_NMI); + vmcs_set_bits(vcpu, GUEST_INTERRUPTIBILITY_INFO, GUEST_INTR_STATE_NMI); - gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); - trace_kvm_page_fault(gpa, exit_qualification); + gpa = vmcs_read64(vcpu, GUEST_PHYSICAL_ADDRESS); /* it is a read fault? 
*/ error_code = (exit_qualification << 2) & PFERR_USER_MASK; @@ -6163,10 +4828,9 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu) int ret; gpa_t gpa; - gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); - if (!kvm_io_bus_write(vcpu, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) { + gpa = vmcs_read64(vcpu, GUEST_PHYSICAL_ADDRESS); + if (!kvm_io_bus_write(vcpu, GVM_FAST_MMIO_BUS, gpa, 0, NULL)) { skip_emulated_instruction(vcpu); - trace_kvm_fast_mmio(gpa); return 1; } @@ -6184,7 +4848,7 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu) /* It is the real ept misconfig */ WARN_ON(1); - vcpu->run->exit_reason = KVM_EXIT_UNKNOWN; + vcpu->run->exit_reason = GVM_EXIT_UNKNOWN; vcpu->run->hw.hardware_exit_reason = EXIT_REASON_EPT_MISCONFIG; return 0; @@ -6195,11 +4859,11 @@ static int handle_nmi_window(struct kvm_vcpu *vcpu) u32 cpu_based_vm_exec_control; /* clear pending NMI */ - cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); + cpu_based_vm_exec_control = vmcs_read32(vcpu, CPU_BASED_VM_EXEC_CONTROL); cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING; - vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); + vmcs_write32(vcpu, CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); ++vcpu->stat.nmi_window_exits; - kvm_make_request(KVM_REQ_EVENT, vcpu); + kvm_make_request(GVM_REQ_EVENT, vcpu); return 1; } @@ -6213,14 +4877,14 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) bool intr_window_requested; unsigned count = 130; - cpu_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); + cpu_exec_ctrl = vmcs_read32(vcpu, CPU_BASED_VM_EXEC_CONTROL); intr_window_requested = cpu_exec_ctrl & CPU_BASED_VIRTUAL_INTR_PENDING; while (vmx->emulation_required && count-- != 0) { if (intr_window_requested && vmx_interrupt_allowed(vcpu)) return handle_interrupt_window(&vmx->vcpu); - if (test_bit(KVM_REQ_EVENT, &vcpu->requests)) + if (test_bit(GVM_REQ_EVENT, &vcpu->requests)) return 1; err = emulate_instruction(vcpu, EMULTYPE_NO_REEXECUTE); @@ -6232,8 +4896,8 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) } if (err != EMULATE_DONE) { - vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; - vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; + vcpu->run->exit_reason = GVM_EXIT_INTERNAL_ERROR; + vcpu->run->internal.suberror = GVM_INTERNAL_ERROR_EMULATION; vcpu->run->internal.ndata = 0; return 0; } @@ -6244,155 +4908,65 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) goto out; } +#if 0 if (signal_pending(current)) goto out; if (need_resched()) schedule(); +#endif } out: return ret; } -static int __grow_ple_window(int val) -{ - if (ple_window_grow < 1) - return ple_window; - - val = min(val, ple_window_actual_max); - - if (ple_window_grow < ple_window) - val *= ple_window_grow; - else - val += ple_window_grow; - - return val; -} - -static int __shrink_ple_window(int val, int modifier, int minimum) -{ - if (modifier < 1) - return ple_window; - - if (modifier < ple_window) - val /= modifier; - else - val -= modifier; - - return max(val, minimum); -} - -static void grow_ple_window(struct kvm_vcpu *vcpu) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - int old = vmx->ple_window; - - vmx->ple_window = __grow_ple_window(old); - - if (vmx->ple_window != old) - vmx->ple_window_dirty = true; - - trace_kvm_ple_window_grow(vcpu->vcpu_id, vmx->ple_window, old); -} - -static void shrink_ple_window(struct kvm_vcpu *vcpu) +static int hardware_setup(void) { - struct vcpu_vmx *vmx = to_vmx(vcpu); - int old = vmx->ple_window; - - vmx->ple_window = 
__shrink_ple_window(old, - ple_window_shrink, ple_window); - - if (vmx->ple_window != old) - vmx->ple_window_dirty = true; - - trace_kvm_ple_window_shrink(vcpu->vcpu_id, vmx->ple_window, old); -} - -/* - * ple_window_actual_max is computed to be one grow_ple_window() below - * ple_window_max. (See __grow_ple_window for the reason.) - * This prevents overflows, because ple_window_max is int. - * ple_window_max effectively rounded down to a multiple of ple_window_grow in - * this process. - * ple_window_max is also prevented from setting vmx->ple_window < ple_window. - */ -static void update_ple_window_actual_max(void) -{ - ple_window_actual_max = - __shrink_ple_window(max(ple_window_max, ple_window), - ple_window_grow, INT_MIN); -} - -/* - * Handler for POSTED_INTERRUPT_WAKEUP_VECTOR. - */ -static void wakeup_handler(void) -{ - struct kvm_vcpu *vcpu; - int cpu = smp_processor_id(); - - spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu)); - list_for_each_entry(vcpu, &per_cpu(blocked_vcpu_on_cpu, cpu), - blocked_vcpu_list) { - struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); - - if (pi_test_on(pi_desc) == 1) - kvm_vcpu_kick(vcpu); - } - spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu)); -} - -static __init int hardware_setup(void) -{ - int r = -ENOMEM, i, msr; + int r = -ENOMEM, msr; rdmsrl_safe(MSR_EFER, &host_efer); - for (i = 0; i < ARRAY_SIZE(vmx_msr_index); ++i) - kvm_define_shared_msr(i, vmx_msr_index[i]); - - vmx_io_bitmap_a = (unsigned long *)__get_free_page(GFP_KERNEL); + vmx_io_bitmap_a = (size_t *)__get_free_page(GFP_KERNEL); if (!vmx_io_bitmap_a) return r; - vmx_io_bitmap_b = (unsigned long *)__get_free_page(GFP_KERNEL); + vmx_io_bitmap_b = (size_t *)__get_free_page(GFP_KERNEL); if (!vmx_io_bitmap_b) goto out; - vmx_msr_bitmap_legacy = (unsigned long *)__get_free_page(GFP_KERNEL); + vmx_msr_bitmap_legacy = (size_t *)__get_free_page(GFP_KERNEL); if (!vmx_msr_bitmap_legacy) goto out1; vmx_msr_bitmap_legacy_x2apic = - (unsigned long *)__get_free_page(GFP_KERNEL); + (size_t *)__get_free_page(GFP_KERNEL); if (!vmx_msr_bitmap_legacy_x2apic) goto out2; vmx_msr_bitmap_legacy_x2apic_apicv_inactive = - (unsigned long *)__get_free_page(GFP_KERNEL); + (size_t *)__get_free_page(GFP_KERNEL); if (!vmx_msr_bitmap_legacy_x2apic_apicv_inactive) goto out3; - vmx_msr_bitmap_longmode = (unsigned long *)__get_free_page(GFP_KERNEL); + vmx_msr_bitmap_longmode = (size_t *)__get_free_page(GFP_KERNEL); if (!vmx_msr_bitmap_longmode) goto out4; vmx_msr_bitmap_longmode_x2apic = - (unsigned long *)__get_free_page(GFP_KERNEL); + (size_t *)__get_free_page(GFP_KERNEL); if (!vmx_msr_bitmap_longmode_x2apic) goto out5; vmx_msr_bitmap_longmode_x2apic_apicv_inactive = - (unsigned long *)__get_free_page(GFP_KERNEL); + (size_t *)__get_free_page(GFP_KERNEL); if (!vmx_msr_bitmap_longmode_x2apic_apicv_inactive) goto out6; - vmx_vmread_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL); + vmx_vmread_bitmap = (size_t *)__get_free_page(GFP_KERNEL); if (!vmx_vmread_bitmap) goto out7; - vmx_vmwrite_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL); + vmx_vmwrite_bitmap = (size_t *)__get_free_page(GFP_KERNEL); if (!vmx_vmwrite_bitmap) goto out8; @@ -6421,10 +4995,6 @@ static __init int hardware_setup(void) if (!cpu_has_vmx_vpid()) enable_vpid = 0; - if (!cpu_has_vmx_shadow_vmcs()) - enable_shadow_vmcs = 0; - if (enable_shadow_vmcs) - init_vmcs_shadow_fields(); if (!cpu_has_vmx_ept() || !cpu_has_vmx_ept_4levels()) { @@ -6453,28 +5023,15 @@ static __init int hardware_setup(void) if (!cpu_has_vmx_tpr_shadow()) 
kvm_x86_ops->update_cr8_intercept = NULL; - if (enable_ept && !cpu_has_vmx_ept_2m_page()) - kvm_disable_largepages(); - - if (!cpu_has_vmx_ple()) - ple_gap = 0; - if (!cpu_has_vmx_apicv()) enable_apicv = 0; - if (cpu_has_vmx_tsc_scaling()) { - kvm_has_tsc_control = true; - kvm_max_tsc_scaling_ratio = KVM_VMX_TSC_MULTIPLIER_MAX; - kvm_tsc_scaling_ratio_frac_bits = 48; - } - vmx_disable_intercept_for_msr(MSR_FS_BASE, false); vmx_disable_intercept_for_msr(MSR_GS_BASE, false); vmx_disable_intercept_for_msr(MSR_KERNEL_GS_BASE, true); vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_CS, false); vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_ESP, false); vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false); - vmx_disable_intercept_for_msr(MSR_IA32_BNDCFGS, true); memcpy(vmx_msr_bitmap_legacy_x2apic, vmx_msr_bitmap_legacy, PAGE_SIZE); @@ -6522,8 +5079,6 @@ static __init int hardware_setup(void) } else kvm_disable_tdp(); - update_ple_window_actual_max(); - /* * Only enable PML when hardware supports PML feature, and both EPT * and EPT A/D bit features are enabled -- PML depends on them to work. @@ -6538,78 +5093,50 @@ static __init int hardware_setup(void) kvm_x86_ops->enable_log_dirty_pt_masked = NULL; } - if (cpu_has_vmx_preemption_timer() && enable_preemption_timer) { - u64 vmx_msr; - - rdmsrl(MSR_IA32_VMX_MISC, vmx_msr); - cpu_preemption_timer_multi = - vmx_msr & VMX_MISC_PREEMPTION_TIMER_RATE_MASK; - } else { - kvm_x86_ops->set_hv_timer = NULL; - kvm_x86_ops->cancel_hv_timer = NULL; - } - - kvm_set_posted_intr_wakeup_handler(wakeup_handler); - - kvm_mce_cap_supported |= MCG_LMCE_P; + //kvm_set_posted_intr_wakeup_handler(wakeup_handler); return alloc_kvm_area(); out9: - free_page((unsigned long)vmx_vmwrite_bitmap); + free_page((size_t)vmx_vmwrite_bitmap); out8: - free_page((unsigned long)vmx_vmread_bitmap); + free_page((size_t)vmx_vmread_bitmap); out7: - free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic_apicv_inactive); + free_page((size_t)vmx_msr_bitmap_longmode_x2apic_apicv_inactive); out6: - free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic); + free_page((size_t)vmx_msr_bitmap_longmode_x2apic); out5: - free_page((unsigned long)vmx_msr_bitmap_longmode); + free_page((size_t)vmx_msr_bitmap_longmode); out4: - free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic_apicv_inactive); + free_page((size_t)vmx_msr_bitmap_legacy_x2apic_apicv_inactive); out3: - free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic); + free_page((size_t)vmx_msr_bitmap_legacy_x2apic); out2: - free_page((unsigned long)vmx_msr_bitmap_legacy); + free_page((size_t)vmx_msr_bitmap_legacy); out1: - free_page((unsigned long)vmx_io_bitmap_b); + free_page((size_t)vmx_io_bitmap_b); out: - free_page((unsigned long)vmx_io_bitmap_a); + free_page((size_t)vmx_io_bitmap_a); return r; } -static __exit void hardware_unsetup(void) +static void hardware_unsetup(void) { - free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic); - free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic_apicv_inactive); - free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic); - free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic_apicv_inactive); - free_page((unsigned long)vmx_msr_bitmap_legacy); - free_page((unsigned long)vmx_msr_bitmap_longmode); - free_page((unsigned long)vmx_io_bitmap_b); - free_page((unsigned long)vmx_io_bitmap_a); - free_page((unsigned long)vmx_vmwrite_bitmap); - free_page((unsigned long)vmx_vmread_bitmap); + free_page((size_t)vmx_msr_bitmap_legacy_x2apic); + 
free_page((size_t)vmx_msr_bitmap_legacy_x2apic_apicv_inactive); + free_page((size_t)vmx_msr_bitmap_longmode_x2apic); + free_page((size_t)vmx_msr_bitmap_longmode_x2apic_apicv_inactive); + free_page((size_t)vmx_msr_bitmap_legacy); + free_page((size_t)vmx_msr_bitmap_longmode); + free_page((size_t)vmx_io_bitmap_b); + free_page((size_t)vmx_io_bitmap_a); + free_page((size_t)vmx_vmwrite_bitmap); + free_page((size_t)vmx_vmread_bitmap); free_kvm_area(); } -/* - * Indicate a busy-waiting vcpu in spinlock. We do not enable the PAUSE - * exiting, so only get here on cpu with PAUSE-Loop-Exiting. - */ -static int handle_pause(struct kvm_vcpu *vcpu) -{ - if (ple_gap) - grow_ple_window(vcpu); - - skip_emulated_instruction(vcpu); - kvm_vcpu_on_spin(vcpu); - - return 1; -} - static int handle_nop(struct kvm_vcpu *vcpu) { skip_emulated_instruction(vcpu); @@ -6650,11 +5177,13 @@ static int handle_monitor(struct kvm_vcpu *vcpu) static struct loaded_vmcs *nested_get_current_vmcs02(struct vcpu_vmx *vmx) { struct vmcs02_list *item; +#define LIST_ENTRY_TYPE_INFO struct vmcs02_list list_for_each_entry(item, &vmx->nested.vmcs02_pool, list) if (item->vmptr == vmx->nested.current_vmptr) { list_move(&item->list, &vmx->nested.vmcs02_pool); return &item->vmcs02; } +#undef LIST_ENTRY_TYPE_INFO if (vmx->nested.vmcs02_num >= max(VMCS02_POOL_SIZE, 1)) { /* Recycle the least recently used VMCS. */ @@ -6686,6 +5215,7 @@ static struct loaded_vmcs *nested_get_current_vmcs02(struct vcpu_vmx *vmx) static void nested_free_vmcs02(struct vcpu_vmx *vmx, gpa_t vmptr) { struct vmcs02_list *item; +#define LIST_ENTRY_TYPE_INFO struct vmcs02_list list_for_each_entry(item, &vmx->nested.vmcs02_pool, list) if (item->vmptr == vmptr) { free_loaded_vmcs(&item->vmcs02); @@ -6694,6 +5224,7 @@ static void nested_free_vmcs02(struct vcpu_vmx *vmx, gpa_t vmptr) vmx->nested.vmcs02_num--; return; } +#undef LIST_ENTRY_TYPE_INFO } /* @@ -6706,6 +5237,7 @@ static void nested_free_all_saved_vmcss(struct vcpu_vmx *vmx) struct vmcs02_list *item, *n; WARN_ON(vmx->loaded_vmcs != &vmx->vmcs01); +#define LIST_ENTRY_TYPE_INFO struct vmcs02_list list_for_each_entry_safe(item, n, &vmx->nested.vmcs02_pool, list) { /* * Something will leak if the above WARN triggers. Better than @@ -6719,6 +5251,7 @@ static void nested_free_all_saved_vmcss(struct vcpu_vmx *vmx) kfree(item); vmx->nested.vmcs02_num--; } +#undef LIST_ENTRY_TYPE_INFO } /* @@ -6766,22 +5299,10 @@ static void nested_vmx_failValid(struct kvm_vcpu *vcpu, static void nested_vmx_abort(struct kvm_vcpu *vcpu, u32 indicator) { /* TODO: not to reset guest simply here. */ - kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); + kvm_make_request(GVM_REQ_TRIPLE_FAULT, vcpu); pr_debug_ratelimited("kvm: nested vmx abort, indicator %d\n", indicator); } -static enum hrtimer_restart vmx_preemption_timer_fn(struct hrtimer *timer) -{ - struct vcpu_vmx *vmx = - container_of(timer, struct vcpu_vmx, nested.preemption_timer); - - vmx->nested.preemption_timer_expired = true; - kvm_make_request(KVM_REQ_EVENT, &vmx->vcpu); - kvm_vcpu_kick(&vmx->vcpu); - - return HRTIMER_NORESTART; -} - /* * Decode the memory-address operand of a vmx instruction, as recorded on an * exit caused by such an instruction (run by a guest hypervisor). @@ -6789,7 +5310,7 @@ static enum hrtimer_restart vmx_preemption_timer_fn(struct hrtimer *timer) * #UD or #GP. 
*/ static int get_vmx_mem_address(struct kvm_vcpu *vcpu, - unsigned long exit_qualification, + size_t exit_qualification, u32 vmx_instruction_info, bool wr, gva_t *ret) { gva_t off; @@ -6892,12 +5413,12 @@ static int nested_vmx_check_vmptr(struct kvm_vcpu *vcpu, int exit_reason, gva_t gva; gpa_t vmptr; struct x86_exception e; - struct page *page; + PMDL kmap_mdl; struct vcpu_vmx *vmx = to_vmx(vcpu); int maxphyaddr = cpuid_maxphyaddr(vcpu); - if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION), - vmcs_read32(VMX_INSTRUCTION_INFO), false, &gva)) + if (get_vmx_mem_address(vcpu, vmcs_readl(vcpu, EXIT_QUALIFICATION), + vmcs_read32(vcpu, VMX_INSTRUCTION_INFO), false, &gva)) return 1; if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &vmptr, @@ -6924,15 +5445,15 @@ static int nested_vmx_check_vmptr(struct kvm_vcpu *vcpu, int exit_reason, return 1; } - page = nested_get_page(vcpu, vmptr); - if (page == NULL || - *(u32 *)kmap(page) != VMCS12_REVISION) { + kmap_mdl = nested_get_page(vcpu, vmptr); + if (kmap_mdl == NULL || + *(u32 *)kmap(kmap_mdl) != VMCS12_REVISION) { nested_vmx_failInvalid(vcpu); - kunmap(page); + kunmap(kmap_mdl); skip_emulated_instruction(vcpu); return 1; } - kunmap(page); + kunmap(kmap_mdl); vmx->nested.vmxon_ptr = vmptr; break; case EXIT_REASON_VMCLEAR: @@ -7030,7 +5551,7 @@ static int handle_vmon(struct kvm_vcpu *vcpu) if (cpu_has_vmx_msr_bitmap()) { vmx->nested.msr_bitmap = - (unsigned long *)__get_free_page(GFP_KERNEL); + (size_t *)__get_free_page(GFP_KERNEL); if (!vmx->nested.msr_bitmap) goto out_msr_bitmap; } @@ -7053,10 +5574,6 @@ static int handle_vmon(struct kvm_vcpu *vcpu) INIT_LIST_HEAD(&(vmx->nested.vmcs02_pool)); vmx->nested.vmcs02_num = 0; - hrtimer_init(&vmx->nested.preemption_timer, CLOCK_MONOTONIC, - HRTIMER_MODE_REL_PINNED); - vmx->nested.preemption_timer.function = vmx_preemption_timer_fn; - vmx->nested.vmxon = true; skip_emulated_instruction(vcpu); @@ -7067,7 +5584,7 @@ out_shadow_vmcs: kfree(vmx->nested.cached_vmcs12); out_cached_vmcs12: - free_page((unsigned long)vmx->nested.msr_bitmap); + free_page((size_t)vmx->nested.msr_bitmap); out_msr_bitmap: return -ENOMEM; @@ -7105,6 +5622,8 @@ static int nested_vmx_check_permission(struct kvm_vcpu *vcpu) static inline void nested_release_vmcs12(struct vcpu_vmx *vmx) { + struct kvm_vcpu* vcpu = &vmx->vcpu; + if (vmx->nested.current_vmptr == -1ull) return; @@ -7117,18 +5636,17 @@ static inline void nested_release_vmcs12(struct vcpu_vmx *vmx) they were modified */ copy_shadow_to_vmcs12(vmx); vmx->nested.sync_shadow_vmcs = false; - vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL, + vmcs_clear_bits(vcpu, SECONDARY_VM_EXEC_CONTROL, SECONDARY_EXEC_SHADOW_VMCS); - vmcs_write64(VMCS_LINK_POINTER, -1ull); + vmcs_write64(vcpu, VMCS_LINK_POINTER, -1ull); } - vmx->nested.posted_intr_nv = -1; /* Flush VMCS12 to guest memory */ memcpy(vmx->nested.current_vmcs12, vmx->nested.cached_vmcs12, VMCS12_SIZE); - kunmap(vmx->nested.current_vmcs12_page); - nested_release_page(vmx->nested.current_vmcs12_page); + kunmap(vmx->nested.current_vmcs12_mdl); + nested_release_page(vmx->nested.current_vmcs12_mdl); vmx->nested.current_vmptr = -1ull; vmx->nested.current_vmcs12 = NULL; } @@ -7146,7 +5664,7 @@ static void free_nested(struct vcpu_vmx *vmx) free_vpid(vmx->nested.vpid02); nested_release_vmcs12(vmx); if (vmx->nested.msr_bitmap) { - free_page((unsigned long)vmx->nested.msr_bitmap); + free_page((size_t)vmx->nested.msr_bitmap); vmx->nested.msr_bitmap = NULL; } if (enable_shadow_vmcs) { @@ -7156,19 +5674,13 @@ static void 
free_nested(struct vcpu_vmx *vmx) } kfree(vmx->nested.cached_vmcs12); /* Unpin physical memory we referred to in current vmcs02 */ - if (vmx->nested.apic_access_page) { - nested_release_page(vmx->nested.apic_access_page); - vmx->nested.apic_access_page = NULL; - } - if (vmx->nested.virtual_apic_page) { - nested_release_page(vmx->nested.virtual_apic_page); - vmx->nested.virtual_apic_page = NULL; + if (vmx->nested.apic_access_mdl) { + nested_release_page(vmx->nested.apic_access_mdl); + vmx->nested.apic_access_mdl = NULL; } - if (vmx->nested.pi_desc_page) { - kunmap(vmx->nested.pi_desc_page); - nested_release_page(vmx->nested.pi_desc_page); - vmx->nested.pi_desc_page = NULL; - vmx->nested.pi_desc = NULL; + if (vmx->nested.virtual_apic_mdl) { + nested_release_page(vmx->nested.virtual_apic_mdl); + vmx->nested.virtual_apic_mdl = NULL; } nested_free_all_saved_vmcss(vmx); @@ -7191,7 +5703,7 @@ static int handle_vmclear(struct kvm_vcpu *vcpu) struct vcpu_vmx *vmx = to_vmx(vcpu); gpa_t vmptr; struct vmcs12 *vmcs12; - struct page *page; + PMDL kmap_mdl; if (!nested_vmx_check_permission(vcpu)) return 1; @@ -7202,8 +5714,8 @@ static int handle_vmclear(struct kvm_vcpu *vcpu) if (vmptr == vmx->nested.current_vmptr) nested_release_vmcs12(vmx); - page = nested_get_page(vcpu, vmptr); - if (page == NULL) { + kmap_mdl = nested_get_page(vcpu, vmptr); + if (kmap_mdl == NULL) { /* * For accurate processor emulation, VMCLEAR beyond available * physical memory should do nothing at all. However, it is @@ -7211,13 +5723,13 @@ static int handle_vmclear(struct kvm_vcpu *vcpu) * resulted in this case, so let's shut down before doing any * more damage: */ - kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); + kvm_make_request(GVM_REQ_TRIPLE_FAULT, vcpu); return 1; } - vmcs12 = kmap(page); + vmcs12 = kmap(kmap_mdl); vmcs12->launch_state = 0; - kunmap(page); - nested_release_page(page); + kunmap(kmap_mdl); + nested_release_page(kmap_mdl); nested_free_vmcs02(vmx, vmptr); @@ -7248,14 +5760,14 @@ enum vmcs_field_type { VMCS_FIELD_TYPE_NATURAL_WIDTH = 3 }; -static inline int vmcs_field_type(unsigned long field) +static inline int vmcs_field_type(size_t field) { if (0x1 & field) /* the *_HIGH fields are all 32 bit */ return VMCS_FIELD_TYPE_U32; return (field >> 13) & 0x3 ; } -static inline int vmcs_field_readonly(unsigned long field) +static inline int vmcs_field_readonly(size_t field) { return (((field >> 10) & 0x3) == 1); } @@ -7268,7 +5780,7 @@ static inline int vmcs_field_readonly(unsigned long field) * 64-bit fields are to be returned). 
*/ static inline int vmcs12_read_any(struct kvm_vcpu *vcpu, - unsigned long field, u64 *ret) + size_t field, u64 *ret) { short offset = vmcs_field_to_offset(field); char *p; @@ -7299,7 +5811,7 @@ static inline int vmcs12_read_any(struct kvm_vcpu *vcpu, static inline int vmcs12_write_any(struct kvm_vcpu *vcpu, - unsigned long field, u64 field_value){ + size_t field, u64 field_value){ short offset = vmcs_field_to_offset(field); char *p = ((char *) get_vmcs12(vcpu)) + offset; if (offset < 0) @@ -7328,11 +5840,12 @@ static inline int vmcs12_write_any(struct kvm_vcpu *vcpu, static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx) { int i; - unsigned long field; + size_t field; u64 field_value; struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs; - const unsigned long *fields = shadow_read_write_fields; + const size_t *fields = shadow_read_write_fields; const int num_fields = max_shadow_read_write_fields; + struct kvm_vcpu* vcpu = &vmx->vcpu; preempt_disable(); @@ -7342,16 +5855,16 @@ static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx) field = fields[i]; switch (vmcs_field_type(field)) { case VMCS_FIELD_TYPE_U16: - field_value = vmcs_read16(field); + field_value = vmcs_read16(vcpu, field); break; case VMCS_FIELD_TYPE_U32: - field_value = vmcs_read32(field); + field_value = vmcs_read32(vcpu, field); break; case VMCS_FIELD_TYPE_U64: - field_value = vmcs_read64(field); + field_value = vmcs_read64(vcpu, field); break; case VMCS_FIELD_TYPE_NATURAL_WIDTH: - field_value = vmcs_readl(field); + field_value = vmcs_readl(vcpu, field); break; default: WARN_ON(1); @@ -7368,7 +5881,7 @@ static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx) static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx) { - const unsigned long *fields[] = { + const size_t *fields[] = { shadow_read_write_fields, shadow_read_only_fields }; @@ -7377,9 +5890,10 @@ static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx) max_shadow_read_only_fields }; int i, q; - unsigned long field; + size_t field; u64 field_value = 0; struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs; + struct kvm_vcpu* vcpu = &vmx->vcpu; vmcs_load(shadow_vmcs); @@ -7390,16 +5904,16 @@ static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx) switch (vmcs_field_type(field)) { case VMCS_FIELD_TYPE_U16: - vmcs_write16(field, (u16)field_value); + vmcs_write16(vcpu, field, (u16)field_value); break; case VMCS_FIELD_TYPE_U32: - vmcs_write32(field, (u32)field_value); + vmcs_write32(vcpu, field, (u32)field_value); break; case VMCS_FIELD_TYPE_U64: - vmcs_write64(field, (u64)field_value); + vmcs_write64(vcpu, field, (u64)field_value); break; case VMCS_FIELD_TYPE_NATURAL_WIDTH: - vmcs_writel(field, (long)field_value); + vmcs_writel(vcpu, field, (long)field_value); break; default: WARN_ON(1); @@ -7429,10 +5943,10 @@ static int nested_vmx_check_vmcs12(struct kvm_vcpu *vcpu) static int handle_vmread(struct kvm_vcpu *vcpu) { - unsigned long field; + size_t field; u64 field_value; - unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); - u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); + size_t exit_qualification = vmcs_readl(vcpu, EXIT_QUALIFICATION); + u32 vmx_instruction_info = vmcs_read32(vcpu, VMX_INSTRUCTION_INFO); gva_t gva = 0; if (!nested_vmx_check_permission(vcpu) || @@ -7472,10 +5986,10 @@ static int handle_vmread(struct kvm_vcpu *vcpu) static int handle_vmwrite(struct kvm_vcpu *vcpu) { - unsigned long field; + size_t field; gva_t gva; - unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); - u32 vmx_instruction_info = 
vmcs_read32(VMX_INSTRUCTION_INFO); + size_t exit_qualification = vmcs_readl(vcpu, EXIT_QUALIFICATION); + u32 vmx_instruction_info = vmcs_read32(vcpu, VMX_INSTRUCTION_INFO); /* The value to write might be 32 or 64 bits, depending on L1's long * mode, and eventually we need to write that into a field of several * possible lengths. The code below first zero-extends the value to 64 @@ -7537,17 +6051,17 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu) if (vmx->nested.current_vmptr != vmptr) { struct vmcs12 *new_vmcs12; - struct page *page; - page = nested_get_page(vcpu, vmptr); - if (page == NULL) { + PMDL kmap_mdl; + kmap_mdl = nested_get_page(vcpu, vmptr); + if (kmap_mdl == NULL) { nested_vmx_failInvalid(vcpu); skip_emulated_instruction(vcpu); return 1; } - new_vmcs12 = kmap(page); + new_vmcs12 = kmap(kmap_mdl); if (new_vmcs12->revision_id != VMCS12_REVISION) { - kunmap(page); - nested_release_page_clean(page); + kunmap(kmap_mdl); + nested_release_page(kmap_mdl); nested_vmx_failValid(vcpu, VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID); skip_emulated_instruction(vcpu); @@ -7557,7 +6071,7 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu) nested_release_vmcs12(vmx); vmx->nested.current_vmptr = vmptr; vmx->nested.current_vmcs12 = new_vmcs12; - vmx->nested.current_vmcs12_page = page; + vmx->nested.current_vmcs12_mdl = kmap_mdl; /* * Load VMCS12 from guest memory since it is not already * cached. @@ -7566,9 +6080,9 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu) vmx->nested.current_vmcs12, VMCS12_SIZE); if (enable_shadow_vmcs) { - vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL, + vmcs_set_bits(vcpu, SECONDARY_VM_EXEC_CONTROL, SECONDARY_EXEC_SHADOW_VMCS); - vmcs_write64(VMCS_LINK_POINTER, + vmcs_write64(vcpu, VMCS_LINK_POINTER, __pa(vmx->vmcs01.shadow_vmcs)); vmx->nested.sync_shadow_vmcs = true; } @@ -7582,8 +6096,8 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu) /* Emulate the VMPTRST instruction */ static int handle_vmptrst(struct kvm_vcpu *vcpu) { - unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); - u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); + size_t exit_qualification = vmcs_readl(vcpu, EXIT_QUALIFICATION); + u32 vmx_instruction_info = vmcs_read32(vcpu, VMX_INSTRUCTION_INFO); gva_t vmcs_gva; struct x86_exception e; @@ -7610,7 +6124,7 @@ static int handle_invept(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); u32 vmx_instruction_info, types; - unsigned long type; + size_t type; gva_t gva; struct x86_exception e; struct { @@ -7632,7 +6146,7 @@ static int handle_invept(struct kvm_vcpu *vcpu) return 1; } - vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); + vmx_instruction_info = vmcs_read32(vcpu, VMX_INSTRUCTION_INFO); type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf); types = (vmx->nested.nested_vmx_ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6; @@ -7647,7 +6161,7 @@ static int handle_invept(struct kvm_vcpu *vcpu) /* According to the Intel VMX instruction reference, the memory * operand is read even if it isn't needed (e.g., for type==global) */ - if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION), + if (get_vmx_mem_address(vcpu, vmcs_readl(vcpu, EXIT_QUALIFICATION), vmx_instruction_info, false, &gva)) return 1; if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &operand, @@ -7664,7 +6178,7 @@ static int handle_invept(struct kvm_vcpu *vcpu) */ case VMX_EPT_EXTENT_CONTEXT: kvm_mmu_sync_roots(vcpu); - kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); + kvm_make_request(GVM_REQ_TLB_FLUSH, vcpu); nested_vmx_succeed(vcpu); 
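In the nested-VMX hunks above, guest pages that were previously handled as struct page * are now tracked as PMDL: nested_get_page() hands back an MDL, and handle_vmclear()/handle_vmptrld() map it with kmap() and release it with kunmap()/nested_release_page(). A rough sketch of how kmap()/kunmap() over an MDL might be expressed with standard WDK calls follows; this is an assumption for illustration, not the commit's actual helpers.

	/* Sketch only: mapping an MDL's pages into system space and back. */
	static void *kmap(PMDL mdl)
	{
		/* Returns a system-space VA for the pages the MDL describes,
		 * or NULL if the mapping cannot be created. */
		return MmGetSystemAddressForMdlSafe(mdl, NormalPagePriority);
	}

	static void kunmap(PMDL mdl)
	{
		/* A mapping obtained via MmGetSystemAddressForMdlSafe() persists
		 * until the MDL's pages are unlocked and the MDL is freed, so an
		 * explicit MmUnmapLockedPages() is only needed for mappings made
		 * with MmMapLockedPagesSpecifyCache(). */
		UNREFERENCED_PARAMETER(mdl);
	}

Releasing the page itself (nested_release_page()) would then, under the same assumptions, unlock and free the MDL, e.g. MmUnlockPages() followed by IoFreeMdl().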
break; default: @@ -7680,7 +6194,7 @@ static int handle_invvpid(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); u32 vmx_instruction_info; - unsigned long type, types; + size_t type, types; gva_t gva; struct x86_exception e; int vpid; @@ -7695,7 +6209,7 @@ static int handle_invvpid(struct kvm_vcpu *vcpu) if (!nested_vmx_check_permission(vcpu)) return 1; - vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); + vmx_instruction_info = vmcs_read32(vcpu, VMX_INSTRUCTION_INFO); type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf); types = (vmx->nested.nested_vmx_vpid_caps >> 8) & 0x7; @@ -7710,7 +6224,7 @@ static int handle_invvpid(struct kvm_vcpu *vcpu) /* according to the intel vmx instruction reference, the memory * operand is read even if it isn't needed (e.g., for type==global) */ - if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION), + if (get_vmx_mem_address(vcpu, vmcs_readl(vcpu, EXIT_QUALIFICATION), vmx_instruction_info, false, &gva)) return 1; if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &vpid, @@ -7722,7 +6236,7 @@ static int handle_invvpid(struct kvm_vcpu *vcpu) switch (type) { case VMX_VPID_EXTENT_SINGLE_CONTEXT: /* - * Old versions of KVM use the single-context version so we + * Old versions of kvm use the single-context version so we * have to support it; just treat it the same as all-context. */ case VMX_VPID_EXTENT_ALL_CONTEXT: @@ -7741,11 +6255,9 @@ static int handle_invvpid(struct kvm_vcpu *vcpu) static int handle_pml_full(struct kvm_vcpu *vcpu) { - unsigned long exit_qualification; - - trace_kvm_pml_full(vcpu->vcpu_id); + size_t exit_qualification; - exit_qualification = vmcs_readl(EXIT_QUALIFICATION); + exit_qualification = vmcs_readl(vcpu, EXIT_QUALIFICATION); /* * PML buffer FULL happened while executing iret from NMI, @@ -7754,7 +6266,7 @@ static int handle_pml_full(struct kvm_vcpu *vcpu) if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) && cpu_has_virtual_nmis() && (exit_qualification & INTR_INFO_UNBLOCK_NMI)) - vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, + vmcs_set_bits(vcpu, GUEST_INTERRUPTIBILITY_INFO, GUEST_INTR_STATE_NMI); /* @@ -7764,12 +6276,6 @@ static int handle_pml_full(struct kvm_vcpu *vcpu) return 1; } -static int handle_preemption_timer(struct kvm_vcpu *vcpu) -{ - kvm_lapic_expired_hv_timer(vcpu); - return 1; -} - /* * The exit handlers return 1 if the exit was handled fully and guest execution * may resume. 
Otherwise they set the kvm_run parameter to indicate what needs @@ -7791,7 +6297,6 @@ static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { [EXIT_REASON_INVD] = handle_invd, [EXIT_REASON_INVLPG] = handle_invlpg, [EXIT_REASON_RDPMC] = handle_rdpmc, - [EXIT_REASON_VMCALL] = handle_vmcall, [EXIT_REASON_VMCLEAR] = handle_vmclear, [EXIT_REASON_VMLAUNCH] = handle_vmlaunch, [EXIT_REASON_VMPTRLD] = handle_vmptrld, @@ -7811,7 +6316,6 @@ static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { [EXIT_REASON_MCE_DURING_VMENTRY] = handle_machine_check, [EXIT_REASON_EPT_VIOLATION] = handle_ept_violation, [EXIT_REASON_EPT_MISCONFIG] = handle_ept_misconfig, - [EXIT_REASON_PAUSE_INSTRUCTION] = handle_pause, [EXIT_REASON_MWAIT_INSTRUCTION] = handle_mwait, [EXIT_REASON_MONITOR_TRAP_FLAG] = handle_monitor_trap, [EXIT_REASON_MONITOR_INSTRUCTION] = handle_monitor, @@ -7820,7 +6324,6 @@ static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { [EXIT_REASON_XSAVES] = handle_xsaves, [EXIT_REASON_XRSTORS] = handle_xrstors, [EXIT_REASON_PML_FULL] = handle_pml_full, - [EXIT_REASON_PREEMPTION_TIMER] = handle_preemption_timer, }; static const int kvm_vmx_max_exit_handlers = @@ -7829,7 +6332,7 @@ static const int kvm_vmx_max_exit_handlers = static bool nested_vmx_exit_handled_io(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { - unsigned long exit_qualification; + size_t exit_qualification; gpa_t bitmap, last_bitmap; unsigned int port; int size; @@ -7838,7 +6341,7 @@ static bool nested_vmx_exit_handled_io(struct kvm_vcpu *vcpu, if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS)) return nested_cpu_has(vmcs12, CPU_BASED_UNCOND_IO_EXITING); - exit_qualification = vmcs_readl(EXIT_QUALIFICATION); + exit_qualification = vmcs_readl(vcpu, EXIT_QUALIFICATION); port = exit_qualification >> 16; size = (exit_qualification & 7) + 1; @@ -7915,10 +6418,10 @@ static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu, static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { - unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); + size_t exit_qualification = vmcs_readl(vcpu, EXIT_QUALIFICATION); int cr = exit_qualification & 15; int reg = (exit_qualification >> 8) & 15; - unsigned long val = kvm_register_readl(vcpu, reg); + size_t val = kvm_register_readl(vcpu, reg); switch ((exit_qualification >> 4) & 3) { case 0: /* mov to cr */ @@ -7995,24 +6498,17 @@ static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu, */ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu) { - u32 intr_info = vmcs_read32(VM_EXIT_INTR_INFO); + u32 intr_info = vmcs_read32(vcpu, VM_EXIT_INTR_INFO); struct vcpu_vmx *vmx = to_vmx(vcpu); struct vmcs12 *vmcs12 = get_vmcs12(vcpu); u32 exit_reason = vmx->exit_reason; - trace_kvm_nested_vmexit(kvm_rip_read(vcpu), exit_reason, - vmcs_readl(EXIT_QUALIFICATION), - vmx->idt_vectoring_info, - intr_info, - vmcs_read32(VM_EXIT_INTR_ERROR_CODE), - KVM_ISA_VMX); - if (vmx->nested.nested_run_pending) return false; if (unlikely(vmx->fail)) { pr_info_ratelimited("%s failed vm entry %x\n", __func__, - vmcs_read32(VM_INSTRUCTION_ERROR)); + vmcs_read32(vcpu, VM_INSTRUCTION_ERROR)); return true; } @@ -8022,15 +6518,12 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu) return false; else if (is_page_fault(intr_info)) return enable_ept; - else if (is_no_device(intr_info) && - !(vmcs12->guest_cr0 & X86_CR0_TS)) - return false; else if (is_debug(intr_info) && vcpu->guest_debug & - (KVM_GUESTDBG_SINGLESTEP | 
KVM_GUESTDBG_USE_HW_BP)) + (GVM_GUESTDBG_SINGLESTEP | GVM_GUESTDBG_USE_HW_BP)) return false; else if (is_breakpoint(intr_info) && - vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) + vcpu->guest_debug & GVM_GUESTDBG_USE_SW_BP) return false; return vmcs12->exception_bitmap & (1u << (intr_info & INTR_INFO_VECTOR_MASK)); @@ -8129,8 +6622,6 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu) * the XSS exit bitmap in vmcs12. */ return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES); - case EXIT_REASON_PREEMPTION_TIMER: - return false; default: return true; } @@ -8138,8 +6629,8 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu) static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2) { - *info1 = vmcs_readl(EXIT_QUALIFICATION); - *info2 = vmcs_read32(VM_EXIT_INTR_INFO); + *info1 = vmcs_readl(vcpu, EXIT_QUALIFICATION); + *info2 = vmcs_read32(vcpu, VM_EXIT_INTR_INFO); } static void vmx_destroy_pml_buffer(struct vcpu_vmx *vmx) @@ -8156,7 +6647,7 @@ static void vmx_flush_pml_buffer(struct kvm_vcpu *vcpu) u64 *pml_buf; u16 pml_idx; - pml_idx = vmcs_read16(GUEST_PML_INDEX); + pml_idx = vmcs_read16(vcpu, GUEST_PML_INDEX); /* Do nothing if PML buffer is empty */ if (pml_idx == (PML_ENTITY_NUM - 1)) @@ -8178,7 +6669,7 @@ static void vmx_flush_pml_buffer(struct kvm_vcpu *vcpu) } /* reset PML index */ - vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1); + vmcs_write16(vcpu, GUEST_PML_INDEX, PML_ENTITY_NUM - 1); } /* @@ -8199,160 +6690,156 @@ static void kvm_flush_pml_buffers(struct kvm *kvm) kvm_vcpu_kick(vcpu); } -static void vmx_dump_sel(char *name, uint32_t sel) +static void vmx_dump_sel(struct kvm_vcpu* vcpu, char *name, uint32_t sel) { - pr_err("%s sel=0x%04x, attr=0x%05x, limit=0x%08x, base=0x%016lx\n", - name, vmcs_read32(sel), - vmcs_read32(sel + GUEST_ES_AR_BYTES - GUEST_ES_SELECTOR), - vmcs_read32(sel + GUEST_ES_LIMIT - GUEST_ES_SELECTOR), - vmcs_readl(sel + GUEST_ES_BASE - GUEST_ES_SELECTOR)); + pr_err("%s sel=0x%04x, attr=0x%05x, limit=0x%08x, base=0x%016llx\n", + name, vmcs_read32(vcpu, sel), + vmcs_read32(vcpu, sel + GUEST_ES_AR_BYTES - GUEST_ES_SELECTOR), + vmcs_read32(vcpu, sel + GUEST_ES_LIMIT - GUEST_ES_SELECTOR), + vmcs_readl(vcpu, sel + GUEST_ES_BASE - GUEST_ES_SELECTOR)); } -static void vmx_dump_dtsel(char *name, uint32_t limit) +static void vmx_dump_dtsel(struct kvm_vcpu* vcpu, char *name, uint32_t limit) { - pr_err("%s limit=0x%08x, base=0x%016lx\n", - name, vmcs_read32(limit), - vmcs_readl(limit + GUEST_GDTR_BASE - GUEST_GDTR_LIMIT)); + pr_err("%s limit=0x%08x, base=0x%016llx\n", + name, vmcs_read32(vcpu, limit), + vmcs_readl(vcpu, limit + GUEST_GDTR_BASE - GUEST_GDTR_LIMIT)); } -static void dump_vmcs(void) +static void dump_vmcs(struct kvm_vcpu* vcpu) { - u32 vmentry_ctl = vmcs_read32(VM_ENTRY_CONTROLS); - u32 vmexit_ctl = vmcs_read32(VM_EXIT_CONTROLS); - u32 cpu_based_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); - u32 pin_based_exec_ctrl = vmcs_read32(PIN_BASED_VM_EXEC_CONTROL); + u32 vmentry_ctl = vmcs_read32(vcpu, VM_ENTRY_CONTROLS); + u32 vmexit_ctl = vmcs_read32(vcpu, VM_EXIT_CONTROLS); + u32 cpu_based_exec_ctrl = vmcs_read32(vcpu, CPU_BASED_VM_EXEC_CONTROL); + u32 pin_based_exec_ctrl = vmcs_read32(vcpu, PIN_BASED_VM_EXEC_CONTROL); u32 secondary_exec_control = 0; - unsigned long cr4 = vmcs_readl(GUEST_CR4); - u64 efer = vmcs_read64(GUEST_IA32_EFER); + size_t cr4 = vmcs_readl(vcpu, GUEST_CR4); + u64 efer = vmcs_read64(vcpu, GUEST_IA32_EFER); int i, n; if (cpu_has_secondary_exec_ctrls()) - secondary_exec_control = 
vmcs_read32(SECONDARY_VM_EXEC_CONTROL); + secondary_exec_control = vmcs_read32(vcpu, SECONDARY_VM_EXEC_CONTROL); pr_err("*** Guest State ***\n"); - pr_err("CR0: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n", - vmcs_readl(GUEST_CR0), vmcs_readl(CR0_READ_SHADOW), - vmcs_readl(CR0_GUEST_HOST_MASK)); - pr_err("CR4: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n", - cr4, vmcs_readl(CR4_READ_SHADOW), vmcs_readl(CR4_GUEST_HOST_MASK)); - pr_err("CR3 = 0x%016lx\n", vmcs_readl(GUEST_CR3)); + pr_err("CR0: actual=0x%016llx, shadow=0x%016llx, gh_mask=%016llx\n", + vmcs_readl(vcpu, GUEST_CR0), vmcs_readl(vcpu, CR0_READ_SHADOW), + vmcs_readl(vcpu, CR0_GUEST_HOST_MASK)); + pr_err("CR4: actual=0x%016llx, shadow=0x%016llx, gh_mask=%016llx\n", + cr4, vmcs_readl(vcpu, CR4_READ_SHADOW), vmcs_readl(vcpu, CR4_GUEST_HOST_MASK)); + pr_err("CR3 = 0x%016llx\n", vmcs_readl(vcpu, GUEST_CR3)); if ((secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT) && (cr4 & X86_CR4_PAE) && !(efer & EFER_LMA)) { pr_err("PDPTR0 = 0x%016llx PDPTR1 = 0x%016llx\n", - vmcs_read64(GUEST_PDPTR0), vmcs_read64(GUEST_PDPTR1)); + vmcs_read64(vcpu, GUEST_PDPTR0), vmcs_read64(vcpu, GUEST_PDPTR1)); pr_err("PDPTR2 = 0x%016llx PDPTR3 = 0x%016llx\n", - vmcs_read64(GUEST_PDPTR2), vmcs_read64(GUEST_PDPTR3)); - } - pr_err("RSP = 0x%016lx RIP = 0x%016lx\n", - vmcs_readl(GUEST_RSP), vmcs_readl(GUEST_RIP)); - pr_err("RFLAGS=0x%08lx DR7 = 0x%016lx\n", - vmcs_readl(GUEST_RFLAGS), vmcs_readl(GUEST_DR7)); - pr_err("Sysenter RSP=%016lx CS:RIP=%04x:%016lx\n", - vmcs_readl(GUEST_SYSENTER_ESP), - vmcs_read32(GUEST_SYSENTER_CS), vmcs_readl(GUEST_SYSENTER_EIP)); - vmx_dump_sel("CS: ", GUEST_CS_SELECTOR); - vmx_dump_sel("DS: ", GUEST_DS_SELECTOR); - vmx_dump_sel("SS: ", GUEST_SS_SELECTOR); - vmx_dump_sel("ES: ", GUEST_ES_SELECTOR); - vmx_dump_sel("FS: ", GUEST_FS_SELECTOR); - vmx_dump_sel("GS: ", GUEST_GS_SELECTOR); - vmx_dump_dtsel("GDTR:", GUEST_GDTR_LIMIT); - vmx_dump_sel("LDTR:", GUEST_LDTR_SELECTOR); - vmx_dump_dtsel("IDTR:", GUEST_IDTR_LIMIT); - vmx_dump_sel("TR: ", GUEST_TR_SELECTOR); + vmcs_read64(vcpu, GUEST_PDPTR2), vmcs_read64(vcpu, GUEST_PDPTR3)); + } + pr_err("RSP = 0x%016llx RIP = 0x%016llx\n", + vmcs_readl(vcpu, GUEST_RSP), vmcs_readl(vcpu, GUEST_RIP)); + pr_err("RFLAGS=0x%08lx DR7 = 0x%016llx\n", + vmcs_readl(vcpu, GUEST_RFLAGS), vmcs_readl(vcpu, GUEST_DR7)); + pr_err("Sysenter RSP=%016llx CS:RIP=%04x:%016llx\n", + vmcs_readl(vcpu, GUEST_SYSENTER_ESP), + vmcs_read32(vcpu, GUEST_SYSENTER_CS), vmcs_readl(vcpu, GUEST_SYSENTER_EIP)); + vmx_dump_sel(vcpu, "CS: ", GUEST_CS_SELECTOR); + vmx_dump_sel(vcpu, "DS: ", GUEST_DS_SELECTOR); + vmx_dump_sel(vcpu, "SS: ", GUEST_SS_SELECTOR); + vmx_dump_sel(vcpu, "ES: ", GUEST_ES_SELECTOR); + vmx_dump_sel(vcpu, "FS: ", GUEST_FS_SELECTOR); + vmx_dump_sel(vcpu, "GS: ", GUEST_GS_SELECTOR); + vmx_dump_dtsel(vcpu, "GDTR:", GUEST_GDTR_LIMIT); + vmx_dump_sel(vcpu, "LDTR:", GUEST_LDTR_SELECTOR); + vmx_dump_dtsel(vcpu, "IDTR:", GUEST_IDTR_LIMIT); + vmx_dump_sel(vcpu, "TR: ", GUEST_TR_SELECTOR); if ((vmexit_ctl & (VM_EXIT_SAVE_IA32_PAT | VM_EXIT_SAVE_IA32_EFER)) || (vmentry_ctl & (VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_LOAD_IA32_EFER))) pr_err("EFER = 0x%016llx PAT = 0x%016llx\n", - efer, vmcs_read64(GUEST_IA32_PAT)); - pr_err("DebugCtl = 0x%016llx DebugExceptions = 0x%016lx\n", - vmcs_read64(GUEST_IA32_DEBUGCTL), - vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS)); + efer, vmcs_read64(vcpu, GUEST_IA32_PAT)); + pr_err("DebugCtl = 0x%016llx DebugExceptions = 0x%016llx\n", + vmcs_read64(vcpu, GUEST_IA32_DEBUGCTL), + vmcs_readl(vcpu, 
GUEST_PENDING_DBG_EXCEPTIONS)); if (vmentry_ctl & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) pr_err("PerfGlobCtl = 0x%016llx\n", - vmcs_read64(GUEST_IA32_PERF_GLOBAL_CTRL)); + vmcs_read64(vcpu, GUEST_IA32_PERF_GLOBAL_CTRL)); if (vmentry_ctl & VM_ENTRY_LOAD_BNDCFGS) - pr_err("BndCfgS = 0x%016llx\n", vmcs_read64(GUEST_BNDCFGS)); + pr_err("BndCfgS = 0x%016llx\n", vmcs_read64(vcpu, GUEST_BNDCFGS)); pr_err("Interruptibility = %08x ActivityState = %08x\n", - vmcs_read32(GUEST_INTERRUPTIBILITY_INFO), - vmcs_read32(GUEST_ACTIVITY_STATE)); + vmcs_read32(vcpu, GUEST_INTERRUPTIBILITY_INFO), + vmcs_read32(vcpu, GUEST_ACTIVITY_STATE)); if (secondary_exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) pr_err("InterruptStatus = %04x\n", - vmcs_read16(GUEST_INTR_STATUS)); + vmcs_read16(vcpu, GUEST_INTR_STATUS)); pr_err("*** Host State ***\n"); - pr_err("RIP = 0x%016lx RSP = 0x%016lx\n", - vmcs_readl(HOST_RIP), vmcs_readl(HOST_RSP)); + pr_err("RIP = 0x%016llx RSP = 0x%016llx\n", + vmcs_readl(vcpu, HOST_RIP), vmcs_readl(vcpu, HOST_RSP)); pr_err("CS=%04x SS=%04x DS=%04x ES=%04x FS=%04x GS=%04x TR=%04x\n", - vmcs_read16(HOST_CS_SELECTOR), vmcs_read16(HOST_SS_SELECTOR), - vmcs_read16(HOST_DS_SELECTOR), vmcs_read16(HOST_ES_SELECTOR), - vmcs_read16(HOST_FS_SELECTOR), vmcs_read16(HOST_GS_SELECTOR), - vmcs_read16(HOST_TR_SELECTOR)); - pr_err("FSBase=%016lx GSBase=%016lx TRBase=%016lx\n", - vmcs_readl(HOST_FS_BASE), vmcs_readl(HOST_GS_BASE), - vmcs_readl(HOST_TR_BASE)); - pr_err("GDTBase=%016lx IDTBase=%016lx\n", - vmcs_readl(HOST_GDTR_BASE), vmcs_readl(HOST_IDTR_BASE)); - pr_err("CR0=%016lx CR3=%016lx CR4=%016lx\n", - vmcs_readl(HOST_CR0), vmcs_readl(HOST_CR3), - vmcs_readl(HOST_CR4)); - pr_err("Sysenter RSP=%016lx CS:RIP=%04x:%016lx\n", - vmcs_readl(HOST_IA32_SYSENTER_ESP), - vmcs_read32(HOST_IA32_SYSENTER_CS), - vmcs_readl(HOST_IA32_SYSENTER_EIP)); + vmcs_read16(vcpu, HOST_CS_SELECTOR), vmcs_read16(vcpu, HOST_SS_SELECTOR), + vmcs_read16(vcpu, HOST_DS_SELECTOR), vmcs_read16(vcpu, HOST_ES_SELECTOR), + vmcs_read16(vcpu, HOST_FS_SELECTOR), vmcs_read16(vcpu, HOST_GS_SELECTOR), + vmcs_read16(vcpu, HOST_TR_SELECTOR)); + pr_err("FSBase=%016llx GSBase=%016llx TRBase=%016llx\n", + vmcs_readl(vcpu, HOST_FS_BASE), vmcs_readl(vcpu, HOST_GS_BASE), + vmcs_readl(vcpu, HOST_TR_BASE)); + pr_err("GDTBase=%016llx IDTBase=%016llx\n", + vmcs_readl(vcpu, HOST_GDTR_BASE), vmcs_readl(vcpu, HOST_IDTR_BASE)); + pr_err("CR0=%016llx CR3=%016llx CR4=%016llx\n", + vmcs_readl(vcpu, HOST_CR0), vmcs_readl(vcpu, HOST_CR3), + vmcs_readl(vcpu, HOST_CR4)); + pr_err("Sysenter RSP=%016llx CS:RIP=%04x:%016llx\n", + vmcs_readl(vcpu, HOST_IA32_SYSENTER_ESP), + vmcs_read32(vcpu, HOST_IA32_SYSENTER_CS), + vmcs_readl(vcpu, HOST_IA32_SYSENTER_EIP)); if (vmexit_ctl & (VM_EXIT_LOAD_IA32_PAT | VM_EXIT_LOAD_IA32_EFER)) pr_err("EFER = 0x%016llx PAT = 0x%016llx\n", - vmcs_read64(HOST_IA32_EFER), - vmcs_read64(HOST_IA32_PAT)); + vmcs_read64(vcpu, HOST_IA32_EFER), + vmcs_read64(vcpu, HOST_IA32_PAT)); if (vmexit_ctl & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) pr_err("PerfGlobCtl = 0x%016llx\n", - vmcs_read64(HOST_IA32_PERF_GLOBAL_CTRL)); + vmcs_read64(vcpu, HOST_IA32_PERF_GLOBAL_CTRL)); pr_err("*** Control State ***\n"); pr_err("PinBased=%08x CPUBased=%08x SecondaryExec=%08x\n", pin_based_exec_ctrl, cpu_based_exec_ctrl, secondary_exec_control); pr_err("EntryControls=%08x ExitControls=%08x\n", vmentry_ctl, vmexit_ctl); pr_err("ExceptionBitmap=%08x PFECmask=%08x PFECmatch=%08x\n", - vmcs_read32(EXCEPTION_BITMAP), - vmcs_read32(PAGE_FAULT_ERROR_CODE_MASK), - 
vmcs_read32(PAGE_FAULT_ERROR_CODE_MATCH)); + vmcs_read32(vcpu, EXCEPTION_BITMAP), + vmcs_read32(vcpu, PAGE_FAULT_ERROR_CODE_MASK), + vmcs_read32(vcpu, PAGE_FAULT_ERROR_CODE_MATCH)); pr_err("VMEntry: intr_info=%08x errcode=%08x ilen=%08x\n", - vmcs_read32(VM_ENTRY_INTR_INFO_FIELD), - vmcs_read32(VM_ENTRY_EXCEPTION_ERROR_CODE), - vmcs_read32(VM_ENTRY_INSTRUCTION_LEN)); + vmcs_read32(vcpu, VM_ENTRY_INTR_INFO_FIELD), + vmcs_read32(vcpu, VM_ENTRY_EXCEPTION_ERROR_CODE), + vmcs_read32(vcpu, VM_ENTRY_INSTRUCTION_LEN)); pr_err("VMExit: intr_info=%08x errcode=%08x ilen=%08x\n", - vmcs_read32(VM_EXIT_INTR_INFO), - vmcs_read32(VM_EXIT_INTR_ERROR_CODE), - vmcs_read32(VM_EXIT_INSTRUCTION_LEN)); - pr_err(" reason=%08x qualification=%016lx\n", - vmcs_read32(VM_EXIT_REASON), vmcs_readl(EXIT_QUALIFICATION)); + vmcs_read32(vcpu, VM_EXIT_INTR_INFO), + vmcs_read32(vcpu, VM_EXIT_INTR_ERROR_CODE), + vmcs_read32(vcpu, VM_EXIT_INSTRUCTION_LEN)); + pr_err(" reason=%08x qualification=%016llx\n", + vmcs_read32(vcpu, VM_EXIT_REASON), vmcs_readl(vcpu, EXIT_QUALIFICATION)); + pr_err(" gpa=%016llx\n", vmcs_read64(vcpu, GUEST_PHYSICAL_ADDRESS)); pr_err("IDTVectoring: info=%08x errcode=%08x\n", - vmcs_read32(IDT_VECTORING_INFO_FIELD), - vmcs_read32(IDT_VECTORING_ERROR_CODE)); - pr_err("TSC Offset = 0x%016llx\n", vmcs_read64(TSC_OFFSET)); - if (secondary_exec_control & SECONDARY_EXEC_TSC_SCALING) - pr_err("TSC Multiplier = 0x%016llx\n", - vmcs_read64(TSC_MULTIPLIER)); + vmcs_read32(vcpu, IDT_VECTORING_INFO_FIELD), + vmcs_read32(vcpu, IDT_VECTORING_ERROR_CODE)); + pr_err("TSC Offset = 0x%016llx\n", vmcs_read64(vcpu, TSC_OFFSET)); if (cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW) - pr_err("TPR Threshold = 0x%02x\n", vmcs_read32(TPR_THRESHOLD)); - if (pin_based_exec_ctrl & PIN_BASED_POSTED_INTR) - pr_err("PostedIntrVec = 0x%02x\n", vmcs_read16(POSTED_INTR_NV)); + pr_err("TPR Threshold = 0x%02x\n", vmcs_read32(vcpu, TPR_THRESHOLD)); if ((secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT)) - pr_err("EPT pointer = 0x%016llx\n", vmcs_read64(EPT_POINTER)); - n = vmcs_read32(CR3_TARGET_COUNT); + pr_err("EPT pointer = 0x%016llx\n", vmcs_read64(vcpu, EPT_POINTER)); + n = vmcs_read32(vcpu, CR3_TARGET_COUNT); for (i = 0; i + 1 < n; i += 4) - pr_err("CR3 target%u=%016lx target%u=%016lx\n", - i, vmcs_readl(CR3_TARGET_VALUE0 + i * 2), - i + 1, vmcs_readl(CR3_TARGET_VALUE0 + i * 2 + 2)); + pr_err("CR3 target%u=%016llx target%u=%016llx\n", + i, vmcs_readl(vcpu, CR3_TARGET_VALUE0 + i * 2), + i + 1, vmcs_readl(vcpu, CR3_TARGET_VALUE0 + i * 2 + 2)); if (i < n) - pr_err("CR3 target%u=%016lx\n", - i, vmcs_readl(CR3_TARGET_VALUE0 + i * 2)); + pr_err("CR3 target%u=%016llx\n", + i, vmcs_readl(vcpu, CR3_TARGET_VALUE0 + i * 2)); if (secondary_exec_control & SECONDARY_EXEC_PAUSE_LOOP_EXITING) pr_err("PLE Gap=%08x Window=%08x\n", - vmcs_read32(PLE_GAP), vmcs_read32(PLE_WINDOW)); + vmcs_read32(vcpu, PLE_GAP), vmcs_read32(vcpu, PLE_WINDOW)); if (secondary_exec_control & SECONDARY_EXEC_ENABLE_VPID) pr_err("Virtual processor ID = 0x%04x\n", - vmcs_read16(VIRTUAL_PROCESSOR_ID)); + vmcs_read16(vcpu, VIRTUAL_PROCESSOR_ID)); } /* @@ -8365,8 +6852,6 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu) u32 exit_reason = vmx->exit_reason; u32 vectoring_info = vmx->idt_vectoring_info; - trace_kvm_exit(exit_reason, vcpu, KVM_ISA_VMX); - /* * Flush logged GPAs PML buffer, this will make dirty_bitmap more * updated. 
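
The dump_vmcs() conversion above also trades unsigned long for size_t and %016lx for %016llx in the format strings. This follows from the Windows build target being LLP64: under MSVC on x64, unsigned long remains 32 bits wide, so only size_t (or unsigned long long) matches the natural width of VMCS fields and guest addresses. A minimal, standalone illustration of the difference (a hypothetical test program, not part of the driver):

    #include <stdio.h>
    #include <stddef.h>

    int main(void)
    {
        /* On LP64 Linux both lines print 8; on LLP64 Windows x64 the
         * first prints 4, which is why natural-width VMCS values move
         * to size_t and the %llx format in this port. */
        printf("sizeof(unsigned long) = %zu\n", sizeof(unsigned long));
        printf("sizeof(size_t)        = %zu\n", sizeof(size_t));
        return 0;
    }
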
Another good is, in kvm_vm_ioctl_get_dirty_log, before @@ -8383,23 +6868,23 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu) if (is_guest_mode(vcpu) && nested_vmx_exit_handled(vcpu)) { nested_vmx_vmexit(vcpu, exit_reason, - vmcs_read32(VM_EXIT_INTR_INFO), - vmcs_readl(EXIT_QUALIFICATION)); + vmcs_read32(vcpu, VM_EXIT_INTR_INFO), + vmcs_readl(vcpu, EXIT_QUALIFICATION)); return 1; } if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) { - dump_vmcs(); - vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY; + dump_vmcs(vcpu); + vcpu->run->exit_reason = GVM_EXIT_FAIL_ENTRY; vcpu->run->fail_entry.hardware_entry_failure_reason = exit_reason; return 0; } if (unlikely(vmx->fail)) { - vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY; + vcpu->run->exit_reason = GVM_EXIT_FAIL_ENTRY; vcpu->run->fail_entry.hardware_entry_failure_reason - = vmcs_read32(VM_INSTRUCTION_ERROR); + = vmcs_read32(vcpu, VM_INSTRUCTION_ERROR); return 0; } @@ -8415,8 +6900,8 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu) exit_reason != EXIT_REASON_EPT_VIOLATION && exit_reason != EXIT_REASON_PML_FULL && exit_reason != EXIT_REASON_TASK_SWITCH)) { - vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; - vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_DELIVERY_EV; + vcpu->run->exit_reason = GVM_EXIT_INTERNAL_ERROR; + vcpu->run->internal.suberror = GVM_INTERNAL_ERROR_DELIVERY_EV; vcpu->run->internal.ndata = 2; vcpu->run->internal.data[0] = vectoring_info; vcpu->run->internal.data[1] = exit_reason; @@ -8447,7 +6932,7 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu) && kvm_vmx_exit_handlers[exit_reason]) return kvm_vmx_exit_handlers[exit_reason](vcpu); else { - WARN_ONCE(1, "vmx: unexpected exit reason 0x%x\n", exit_reason); + //WARN_ONCE(1, "vmx: unexpected exit reason 0x%x\n", exit_reason); kvm_queue_exception(vcpu, UD_VECTOR); return 1; } @@ -8462,11 +6947,11 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr) return; if (irr == -1 || tpr < irr) { - vmcs_write32(TPR_THRESHOLD, 0); + vmcs_write32(vcpu, TPR_THRESHOLD, 0); return; } - vmcs_write32(TPR_THRESHOLD, irr); + vmcs_write32(vcpu, TPR_THRESHOLD, irr); } static void vmx_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set) @@ -8485,7 +6970,7 @@ static void vmx_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set) if (!cpu_need_tpr_shadow(vcpu)) return; - sec_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); + sec_exec_control = vmcs_read32(vcpu, SECONDARY_VM_EXEC_CONTROL); if (set) { sec_exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; @@ -8494,14 +6979,14 @@ static void vmx_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set) sec_exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE; sec_exec_control |= SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; } - vmcs_write32(SECONDARY_VM_EXEC_CONTROL, sec_exec_control); + vmcs_write32(vcpu, SECONDARY_VM_EXEC_CONTROL, sec_exec_control); vmx_set_msr_bitmap(vcpu); } static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu, hpa_t hpa) { - struct vcpu_vmx *vmx = to_vmx(vcpu); + //struct vcpu_vmx *vmx = to_vmx(vcpu); /* * Currently we do not handle the nested case where L2 has an @@ -8516,10 +7001,10 @@ static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu, hpa_t hpa) * prepare_vmcs02. If the latter, the vmcs01 will be updated in * the next L2->L1 exit. 
*/ - if (!is_guest_mode(vcpu) || - !nested_cpu_has2(get_vmcs12(&vmx->vcpu), - SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) - vmcs_write64(APIC_ACCESS_ADDR, hpa); + //if (!is_guest_mode(vcpu) || + //!nested_cpu_has2(get_vmcs12(&vmx->vcpu), + //SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) + vmcs_write64(vcpu, APIC_ACCESS_ADDR, hpa); } static void vmx_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr) @@ -8530,16 +7015,16 @@ static void vmx_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr) if (max_isr == -1) max_isr = 0; - status = vmcs_read16(GUEST_INTR_STATUS); + status = vmcs_read16(vcpu, GUEST_INTR_STATUS); old = status >> 8; if (max_isr != old) { status &= 0xff; status |= max_isr << 8; - vmcs_write16(GUEST_INTR_STATUS, status); + vmcs_write16(vcpu, GUEST_INTR_STATUS, status); } } -static void vmx_set_rvi(int vector) +static void vmx_set_rvi(struct kvm_vcpu *vcpu, int vector) { u16 status; u8 old; @@ -8547,19 +7032,19 @@ static void vmx_set_rvi(int vector) if (vector == -1) vector = 0; - status = vmcs_read16(GUEST_INTR_STATUS); + status = vmcs_read16(vcpu, GUEST_INTR_STATUS); old = (u8)status & 0xff; if ((u8)vector != old) { status &= ~0xff; status |= (u8)vector; - vmcs_write16(GUEST_INTR_STATUS, status); + vmcs_write16(vcpu, GUEST_INTR_STATUS, status); } } static void vmx_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr) { if (!is_guest_mode(vcpu)) { - vmx_set_rvi(max_irr); + vmx_set_rvi(vcpu, max_irr); return; } @@ -8589,12 +7074,13 @@ static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) if (!kvm_vcpu_apicv_active(vcpu)) return; - vmcs_write64(EOI_EXIT_BITMAP0, eoi_exit_bitmap[0]); - vmcs_write64(EOI_EXIT_BITMAP1, eoi_exit_bitmap[1]); - vmcs_write64(EOI_EXIT_BITMAP2, eoi_exit_bitmap[2]); - vmcs_write64(EOI_EXIT_BITMAP3, eoi_exit_bitmap[3]); + vmcs_write64(vcpu, EOI_EXIT_BITMAP0, eoi_exit_bitmap[0]); + vmcs_write64(vcpu, EOI_EXIT_BITMAP1, eoi_exit_bitmap[1]); + vmcs_write64(vcpu, EOI_EXIT_BITMAP2, eoi_exit_bitmap[2]); + vmcs_write64(vcpu, EOI_EXIT_BITMAP3, eoi_exit_bitmap[3]); } +static u64 nmi_count = 0; static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx) { u32 exit_intr_info; @@ -8603,7 +7089,7 @@ static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx) || vmx->exit_reason == EXIT_REASON_EXCEPTION_NMI)) return; - vmx->exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); + vmx->exit_intr_info = vmcs_read32(&vmx->vcpu, VM_EXIT_INTR_INFO); exit_intr_info = vmx->exit_intr_info; /* Handle machine checks before interrupts are enabled */ @@ -8614,15 +7100,15 @@ static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx) if ((exit_intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR && (exit_intr_info & INTR_INFO_VALID_MASK)) { kvm_before_handle_nmi(&vmx->vcpu); - asm("int $2"); + __int2(); + nmi_count++; kvm_after_handle_nmi(&vmx->vcpu); } } static void vmx_handle_external_intr(struct kvm_vcpu *vcpu) { - u32 exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); - register void *__sp asm(_ASM_SP); + u32 exit_intr_info = vmcs_read32(vcpu, VM_EXIT_INTR_INFO); /* * If external interrupt exists, IF bit is set in rflags/eflags on the @@ -8632,36 +7118,13 @@ static void vmx_handle_external_intr(struct kvm_vcpu *vcpu) if ((exit_intr_info & (INTR_INFO_VALID_MASK | INTR_INFO_INTR_TYPE_MASK)) == (INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR)) { unsigned int vector; - unsigned long entry; + size_t entry; gate_desc *desc; - struct vcpu_vmx *vmx = to_vmx(vcpu); -#ifdef CONFIG_X86_64 - unsigned long tmp; -#endif vector = exit_intr_info & INTR_INFO_VECTOR_MASK; - desc = 
(gate_desc *)vmx->host_idt_base + vector; + desc = (gate_desc *)(this_cpu_ptr(&host_idt))->address + vector; entry = gate_offset(*desc); - asm volatile( -#ifdef CONFIG_X86_64 - "mov %%" _ASM_SP ", %[sp]\n\t" - "and $0xfffffffffffffff0, %%" _ASM_SP "\n\t" - "push $%c[ss]\n\t" - "push %[sp]\n\t" -#endif - "pushf\n\t" - __ASM_SIZE(push) " $%c[cs]\n\t" - "call *%[entry]\n\t" - : -#ifdef CONFIG_X86_64 - [sp]"=&r"(tmp), -#endif - "+r"(__sp) - : - [entry]"r"(entry), - [ss]"i"(__KERNEL_DS), - [cs]"i"(__KERNEL_CS) - ); + __asm_vmx_handle_external_intr(entry); } } @@ -8698,7 +7161,7 @@ static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx) * Can't use vmx->exit_intr_info since we're not sure what * the exit reason is. */ - exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); + exit_intr_info = vmcs_read32(&vmx->vcpu, VM_EXIT_INTR_INFO); unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0; vector = exit_intr_info & INTR_INFO_VECTOR_MASK; /* @@ -8713,11 +7176,11 @@ static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx) */ if ((exit_intr_info & INTR_INFO_VALID_MASK) && unblock_nmi && vector != DF_VECTOR && !idtv_info_valid) - vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, + vmcs_set_bits(&vmx->vcpu, GUEST_INTERRUPTIBILITY_INFO, GUEST_INTR_STATE_NMI); else vmx->nmi_known_unmasked = - !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) + !(vmcs_read32(&vmx->vcpu, GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_NMI); } else if (unlikely(vmx->soft_vnmi_blocked)) vmx->vnmi_blocked_time += @@ -8742,7 +7205,7 @@ static void __vmx_complete_interrupts(struct kvm_vcpu *vcpu, if (!idtv_info_valid) return; - kvm_make_request(KVM_REQ_EVENT, vcpu); + kvm_make_request(GVM_REQ_EVENT, vcpu); vector = idt_vectoring_info & VECTORING_INFO_VECTOR_MASK; type = idt_vectoring_info & VECTORING_INFO_TYPE_MASK; @@ -8758,17 +7221,17 @@ static void __vmx_complete_interrupts(struct kvm_vcpu *vcpu, vmx_set_nmi_mask(vcpu, false); break; case INTR_TYPE_SOFT_EXCEPTION: - vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field); + vcpu->arch.event_exit_inst_len = vmcs_read32(vcpu, instr_len_field); /* fall through */ case INTR_TYPE_HARD_EXCEPTION: if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) { - u32 err = vmcs_read32(error_code_field); + u32 err = vmcs_read32(vcpu, error_code_field); kvm_requeue_exception_e(vcpu, vector, err); } else kvm_requeue_exception(vcpu, vector); break; case INTR_TYPE_SOFT_INTR: - vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field); + vcpu->arch.event_exit_inst_len = vmcs_read32(vcpu, instr_len_field); /* fall through */ case INTR_TYPE_EXT_INTR: kvm_queue_interrupt(vcpu, vector, type == INTR_TYPE_SOFT_INTR); @@ -8788,15 +7251,16 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx) static void vmx_cancel_injection(struct kvm_vcpu *vcpu) { __vmx_complete_interrupts(vcpu, - vmcs_read32(VM_ENTRY_INTR_INFO_FIELD), + vmcs_read32(vcpu, VM_ENTRY_INTR_INFO_FIELD), VM_ENTRY_INSTRUCTION_LEN, VM_ENTRY_EXCEPTION_ERROR_CODE); - vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); + vmcs_write32(vcpu, VM_ENTRY_INTR_INFO_FIELD, 0); } static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx) { +#if 0 int i, nr_msrs; struct perf_guest_switch_msr *msrs; @@ -8811,32 +7275,26 @@ static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx) else add_atomic_switch_msr(vmx, msrs[i].msr, msrs[i].guest, msrs[i].host); +#endif } -void vmx_arm_hv_timer(struct kvm_vcpu *vcpu) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - u64 tscl; - u32 delta_tsc; - - if (vmx->hv_deadline_tsc == -1) - return; - - tscl = rdtsc(); - if 
(vmx->hv_deadline_tsc > tscl) - /* sure to be 32 bit only because checked on set_hv_timer */ - delta_tsc = (u32)((vmx->hv_deadline_tsc - tscl) >> - cpu_preemption_timer_multi); - else - delta_tsc = 0; - - vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, delta_tsc); -} +u64 last_vmexit_rip = 0; +u64 last_vmexit_rsp = 0; +u64 rip = 0xffffffffffffffff; +u8 do_print = 1; +u8 do_print1 = 1; -static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) +static void __declspec(noinline) vmx_vcpu_run(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); - unsigned long debugctlmsr, cr4; + //size_t debugctlmsr, cr4; + size_t cr4; + struct desc_ptr *gdt = this_cpu_ptr(&host_gdt); + struct desc_ptr *idt = this_cpu_ptr(&host_idt); + size_t sysenter_esp; + unsigned int i; + struct msr_autoload *m = &vmx->msr_autoload; + /* Record the guest's net vcpu time for enforced NMI injections. */ if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked)) @@ -8847,24 +7305,19 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) if (vmx->emulation_required) return; - if (vmx->ple_window_dirty) { - vmx->ple_window_dirty = false; - vmcs_write32(PLE_WINDOW, vmx->ple_window); - } - if (vmx->nested.sync_shadow_vmcs) { copy_vmcs12_to_shadow(vmx); vmx->nested.sync_shadow_vmcs = false; } - if (test_bit(VCPU_REGS_RSP, (unsigned long *)&vcpu->arch.regs_dirty)) - vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]); - if (test_bit(VCPU_REGS_RIP, (unsigned long *)&vcpu->arch.regs_dirty)) - vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]); + if (test_bit(VCPU_REGS_RSP, (size_t *)&vcpu->arch.regs_dirty)) + vmcs_writel(vcpu, GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]); + if (test_bit(VCPU_REGS_RIP, (size_t *)&vcpu->arch.regs_dirty)) + vmcs_writel(vcpu, GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]); cr4 = cr4_read_shadow(); if (unlikely(cr4 != vmx->host_state.vmcs_host_cr4)) { - vmcs_writel(HOST_CR4, cr4); + vmcs_writel(vcpu, HOST_CR4, cr4); vmx->host_state.vmcs_host_cr4 = cr4; } @@ -8873,126 +7326,50 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) * vmentry fails as it then expects bit 14 (BS) in pending debug * exceptions being set, but that's not correct for the guest debugging * case. */ - if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) + if (vcpu->guest_debug & GVM_GUESTDBG_SINGLESTEP) vmx_set_interrupt_shadow(vcpu, 0); - if (vmx->guest_pkru_valid) - __write_pkru(vmx->guest_pkru); + vmcs_writel(vcpu, HOST_TR_BASE, kvm_read_tr_base()); /* 22.2.4 */ + vmcs_writel(vcpu, HOST_GDTR_BASE, gdt->address); /* 22.2.4 */ + vmcs_writel(vcpu, HOST_IDTR_BASE, idt->address); /* 22.2.4 */ + rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp); + vmcs_writel(vcpu, HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */ +#if 0 atomic_switch_perf_msrs(vmx); debugctlmsr = get_debugctlmsr(); - - vmx_arm_hv_timer(vcpu); - - vmx->__launched = vmx->loaded_vmcs->launched; - asm( - /* Store host registers */ - "push %%" _ASM_DX "; push %%" _ASM_BP ";" - "push %%" _ASM_CX " \n\t" /* placeholder for guest rcx */ - "push %%" _ASM_CX " \n\t" - "cmp %%" _ASM_SP ", %c[host_rsp](%0) \n\t" - "je 1f \n\t" - "mov %%" _ASM_SP ", %c[host_rsp](%0) \n\t" - __ex(ASM_VMX_VMWRITE_RSP_RDX) "\n\t" - "1: \n\t" - /* Reload cr2 if changed */ - "mov %c[cr2](%0), %%" _ASM_AX " \n\t" - "mov %%cr2, %%" _ASM_DX " \n\t" - "cmp %%" _ASM_AX ", %%" _ASM_DX " \n\t" - "je 2f \n\t" - "mov %%" _ASM_AX", %%cr2 \n\t" - "2: \n\t" - /* Check if vmlaunch of vmresume is needed */ - "cmpl $0, %c[launched](%0) \n\t" - /* Load guest registers. Don't clobber flags. 
*/ - "mov %c[rax](%0), %%" _ASM_AX " \n\t" - "mov %c[rbx](%0), %%" _ASM_BX " \n\t" - "mov %c[rdx](%0), %%" _ASM_DX " \n\t" - "mov %c[rsi](%0), %%" _ASM_SI " \n\t" - "mov %c[rdi](%0), %%" _ASM_DI " \n\t" - "mov %c[rbp](%0), %%" _ASM_BP " \n\t" -#ifdef CONFIG_X86_64 - "mov %c[r8](%0), %%r8 \n\t" - "mov %c[r9](%0), %%r9 \n\t" - "mov %c[r10](%0), %%r10 \n\t" - "mov %c[r11](%0), %%r11 \n\t" - "mov %c[r12](%0), %%r12 \n\t" - "mov %c[r13](%0), %%r13 \n\t" - "mov %c[r14](%0), %%r14 \n\t" - "mov %c[r15](%0), %%r15 \n\t" -#endif - "mov %c[rcx](%0), %%" _ASM_CX " \n\t" /* kills %0 (ecx) */ - - /* Enter guest mode */ - "jne 1f \n\t" - __ex(ASM_VMX_VMLAUNCH) "\n\t" - "jmp 2f \n\t" - "1: " __ex(ASM_VMX_VMRESUME) "\n\t" - "2: " - /* Save guest registers, load host registers, keep flags */ - "mov %0, %c[wordsize](%%" _ASM_SP ") \n\t" - "pop %0 \n\t" - "mov %%" _ASM_AX ", %c[rax](%0) \n\t" - "mov %%" _ASM_BX ", %c[rbx](%0) \n\t" - __ASM_SIZE(pop) " %c[rcx](%0) \n\t" - "mov %%" _ASM_DX ", %c[rdx](%0) \n\t" - "mov %%" _ASM_SI ", %c[rsi](%0) \n\t" - "mov %%" _ASM_DI ", %c[rdi](%0) \n\t" - "mov %%" _ASM_BP ", %c[rbp](%0) \n\t" -#ifdef CONFIG_X86_64 - "mov %%r8, %c[r8](%0) \n\t" - "mov %%r9, %c[r9](%0) \n\t" - "mov %%r10, %c[r10](%0) \n\t" - "mov %%r11, %c[r11](%0) \n\t" - "mov %%r12, %c[r12](%0) \n\t" - "mov %%r13, %c[r13](%0) \n\t" - "mov %%r14, %c[r14](%0) \n\t" - "mov %%r15, %c[r15](%0) \n\t" -#endif - "mov %%cr2, %%" _ASM_AX " \n\t" - "mov %%" _ASM_AX ", %c[cr2](%0) \n\t" - - "pop %%" _ASM_BP "; pop %%" _ASM_DX " \n\t" - "setbe %c[fail](%0) \n\t" - ".pushsection .rodata \n\t" - ".global vmx_return \n\t" - "vmx_return: " _ASM_PTR " 2b \n\t" - ".popsection" - : : "c"(vmx), "d"((unsigned long)HOST_RSP), - [launched]"i"(offsetof(struct vcpu_vmx, __launched)), - [fail]"i"(offsetof(struct vcpu_vmx, fail)), - [host_rsp]"i"(offsetof(struct vcpu_vmx, host_rsp)), - [rax]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RAX])), - [rbx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBX])), - [rcx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RCX])), - [rdx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RDX])), - [rsi]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RSI])), - [rdi]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RDI])), - [rbp]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBP])), -#ifdef CONFIG_X86_64 - [r8]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R8])), - [r9]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R9])), - [r10]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R10])), - [r11]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R11])), - [r12]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R12])), - [r13]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R13])), - [r14]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R14])), - [r15]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R15])), #endif - [cr2]"i"(offsetof(struct vcpu_vmx, vcpu.arch.cr2)), - [wordsize]"i"(sizeof(ulong)) - : "cc", "memory" -#ifdef CONFIG_X86_64 - , "rax", "rbx", "rdi", "rsi" - , "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15" -#else - , "eax", "ebx", "edi", "esi" -#endif - ); + if (do_print1) { + dump_vmcs(vcpu); + do_print1 = 0; + } + vmcs_load(vmx->loaded_vmcs->vmcs); + + for (i = 0; i < m->nr; i++) + wrmsrl(m->guest[i].index, m->guest[i].value); + /* Calls to low-level assembly functions*/ + __asm_vmx_vcpu_run(vmx); + for (i = 0; i < m->nr; i++) + wrmsrl(m->host[i].index, m->host[i].value); + 
vmcs_clear(vmx->loaded_vmcs->vmcs); + + if (vcpu->vcpu_id == 0) { + last_vmexit_rip = vmcs_read64(vcpu, GUEST_RIP); + last_vmexit_rsp = vmcs_read64(vcpu, GUEST_RSP); + } + if (do_print && (vcpu->vcpu_id == 0)) { + DbgPrint("-------------------vcpu 0-----------------------------------------------------------\n"); + dump_vmcs(vcpu); + do_print = 0; + } + if (last_vmexit_rip == rip) + DbgBreakPoint(); +#if 0 /* MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. Restore it if needed */ if (debugctlmsr) update_debugctlmsr(debugctlmsr); +#endif #ifndef CONFIG_X86_64 /* @@ -9014,33 +7391,17 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) | (1 << VCPU_EXREG_CR3)); vcpu->arch.regs_dirty = 0; - vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); - - vmx->loaded_vmcs->launched = 1; - - vmx->exit_reason = vmcs_read32(VM_EXIT_REASON); + vmx->idt_vectoring_info = vmcs_read32(vcpu, IDT_VECTORING_INFO_FIELD); - /* - * eager fpu is enabled if PKEY is supported and CR4 is switched - * back on host, so it is safe to read guest PKRU from current - * XSAVE. - */ - if (boot_cpu_has(X86_FEATURE_OSPKE)) { - vmx->guest_pkru = __read_pkru(); - if (vmx->guest_pkru != vmx->host_pkru) { - vmx->guest_pkru_valid = true; - __write_pkru(vmx->host_pkru); - } else - vmx->guest_pkru_valid = false; - } + vmx->exit_reason = vmcs_read32(vcpu, VM_EXIT_REASON); /* - * the KVM_REQ_EVENT optimization bit is only on for one entry, and if + * the GVM_REQ_EVENT optimization bit is only on for one entry, and if * we did not inject a still-pending event to L1 now because of * nested_run_pending, we need to re-enable this bit. */ if (vmx->nested.nested_run_pending) - kvm_make_request(KVM_REQ_EVENT, vcpu); + kvm_make_request(GVM_REQ_EVENT, vcpu); vmx->nested.nested_run_pending = 0; @@ -9072,13 +7433,9 @@ static void vmx_load_vmcs01(struct kvm_vcpu *vcpu) static void vmx_free_vcpu_nested(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); - int r; - r = vcpu_load(vcpu); - BUG_ON(r); vmx_load_vmcs01(vcpu); free_nested(vmx); - vcpu_put(vcpu); } static void vmx_free_vcpu(struct kvm_vcpu *vcpu) @@ -9091,16 +7448,14 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu) leave_guest_mode(vcpu); vmx_free_vcpu_nested(vcpu); free_loaded_vmcs(vmx->loaded_vmcs); - kfree(vmx->guest_msrs); kvm_vcpu_uninit(vcpu); - kmem_cache_free(kvm_vcpu_cache, vmx); + kfree(vmx); } static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) { int err; - struct vcpu_vmx *vmx = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL); - int cpu; + struct vcpu_vmx *vmx = kzalloc_fast(sizeof(struct vcpu_vmx), GFP_KERNEL); if (!vmx) return ERR_PTR(-ENOMEM); @@ -9125,30 +7480,19 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) goto uninit_vcpu; } - vmx->guest_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL); - BUILD_BUG_ON(ARRAY_SIZE(vmx_msr_index) * sizeof(vmx->guest_msrs[0]) - > PAGE_SIZE); - - if (!vmx->guest_msrs) - goto free_pml; - vmx->loaded_vmcs = &vmx->vmcs01; vmx->loaded_vmcs->vmcs = alloc_vmcs(); + DbgPrint("vmcs allocated with phys %llx on cpu %d\n", __pa(vmx->loaded_vmcs->vmcs), smp_processor_id()); vmx->loaded_vmcs->shadow_vmcs = NULL; if (!vmx->loaded_vmcs->vmcs) - goto free_msrs; + goto free_pml; if (!vmm_exclusive) kvm_cpu_vmxon(__pa(per_cpu(vmxarea, raw_smp_processor_id()))); loaded_vmcs_init(vmx->loaded_vmcs); if (!vmm_exclusive) kvm_cpu_vmxoff(); - cpu = get_cpu(); - vmx_vcpu_load(&vmx->vcpu, cpu); - vmx->vcpu.cpu = cpu; err = vmx_vcpu_setup(vmx); - vmx_vcpu_put(&vmx->vcpu); - put_cpu(); if (err) goto 
free_vmcs; if (cpu_need_virtualize_apic_accesses(&vmx->vcpu)) { @@ -9171,7 +7515,6 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) vmx->nested.vpid02 = allocate_vpid(); } - vmx->nested.posted_intr_nv = -1; vmx->nested.current_vmptr = -1ull; vmx->nested.current_vmcs12 = NULL; @@ -9182,19 +7525,17 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) free_vmcs: free_vpid(vmx->nested.vpid02); free_loaded_vmcs(vmx->loaded_vmcs); -free_msrs: - kfree(vmx->guest_msrs); free_pml: vmx_destroy_pml_buffer(vmx); uninit_vcpu: kvm_vcpu_uninit(&vmx->vcpu); free_vcpu: free_vpid(vmx->vpid); - kmem_cache_free(kvm_vcpu_cache, vmx); + kfree(vmx); return ERR_PTR(err); } -static void __init vmx_check_processor_compat(void *rtn) +static void vmx_check_processor_compat(void *rtn) { struct vmcs_config vmcs_conf; @@ -9234,22 +7575,14 @@ static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) goto exit; } - if (!kvm_arch_has_noncoherent_dma(vcpu->kvm)) { + //if (!kvm_arch_has_noncoherent_dma(vcpu->kvm)) { + { ipat = VMX_EPT_IPAT_BIT; cache = MTRR_TYPE_WRBACK; goto exit; } - if (kvm_read_cr0(vcpu) & X86_CR0_CD) { - ipat = VMX_EPT_IPAT_BIT; - if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED)) - cache = MTRR_TYPE_WRBACK; - else - cache = MTRR_TYPE_UNCACHABLE; - goto exit; - } - - cache = kvm_mtrr_get_guest_memory_type(vcpu, gfn); + //cache = kvm_mtrr_get_guest_memory_type(vcpu, gfn); exit: return (cache << VMX_EPT_MT_EPTE_SHIFT) | ipat; @@ -9264,7 +7597,7 @@ static int vmx_get_lpage_level(void) return PT_PDPE_LEVEL; } -static void vmcs_set_secondary_exec_control(u32 new_ctl) +static void vmcs_set_secondary_exec_control(struct kvm_vcpu *vcpu, u32 new_ctl) { /* * These bits in the secondary execution controls field @@ -9277,15 +7610,15 @@ static void vmcs_set_secondary_exec_control(u32 new_ctl) SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; - u32 cur_ctl = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); + u32 cur_ctl = vmcs_read32(vcpu, SECONDARY_VM_EXEC_CONTROL); - vmcs_write32(SECONDARY_VM_EXEC_CONTROL, + vmcs_write32(vcpu, SECONDARY_VM_EXEC_CONTROL, (new_ctl & ~mask) | (cur_ctl & mask)); } static void vmx_cpuid_update(struct kvm_vcpu *vcpu) { - struct kvm_cpuid_entry2 *best; + struct kvm_cpuid_entry *best; struct vcpu_vmx *vmx = to_vmx(vcpu); u32 secondary_exec_ctl = vmx_secondary_exec_control(vmx); @@ -9316,7 +7649,7 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu) } if (cpu_has_secondary_exec_ctrls()) - vmcs_set_secondary_exec_control(secondary_exec_ctl); + vmcs_set_secondary_exec_control(vcpu, secondary_exec_ctl); if (nested_vmx_allowed(vcpu)) to_vmx(vcpu)->msr_ia32_feature_control_valid_bits |= @@ -9326,7 +7659,7 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu) ~FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX; } -static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry) +static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry *entry) { if (func == 1 && nested) entry->ecx |= bit(X86_FEATURE_VMX); @@ -9348,7 +7681,7 @@ static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu, /* Callbacks for nested_ept_init_mmu_context: */ -static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu) +static size_t nested_ept_get_cr3(struct kvm_vcpu *vcpu) { /* return the page table to be shadowed - in our case, EPT12 */ return get_vmcs12(vcpu)->ept_pointer; @@ -9393,8 +7726,8 @@ static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu, if 
(nested_vmx_is_page_fault_vmexit(vmcs12, fault->error_code)) nested_vmx_vmexit(vcpu, to_vmx(vcpu)->exit_reason, - vmcs_read32(VM_EXIT_INTR_INFO), - vmcs_readl(EXIT_QUALIFICATION)); + vmcs_read32(vcpu, VM_EXIT_INTR_INFO), + vmcs_readl(vcpu, EXIT_QUALIFICATION)); else kvm_inject_page_fault(vcpu, fault); } @@ -9416,9 +7749,9 @@ static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu, * physical address remains valid. We keep a reference * to it so we can release it later. */ - if (vmx->nested.apic_access_page) /* shouldn't happen */ - nested_release_page(vmx->nested.apic_access_page); - vmx->nested.apic_access_page = + if (vmx->nested.apic_access_mdl) /* shouldn't happen */ + nested_release_page(vmx->nested.apic_access_mdl); + vmx->nested.apic_access_mdl = nested_get_page(vcpu, vmcs12->apic_access_addr); } @@ -9427,9 +7760,9 @@ static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu, vmcs12->virtual_apic_page_addr >> maxphyaddr) return false; - if (vmx->nested.virtual_apic_page) /* shouldn't happen */ - nested_release_page(vmx->nested.virtual_apic_page); - vmx->nested.virtual_apic_page = + if (vmx->nested.virtual_apic_mdl) /* shouldn't happen */ + nested_release_page(vmx->nested.virtual_apic_mdl); + vmx->nested.virtual_apic_mdl = nested_get_page(vcpu, vmcs12->virtual_apic_page_addr); /* @@ -9442,61 +7775,13 @@ static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu, * the execution control. But such a configuration is useless, * so let's keep the code simple. */ - if (!vmx->nested.virtual_apic_page) + if (!vmx->nested.virtual_apic_mdl) return false; } - if (nested_cpu_has_posted_intr(vmcs12)) { - if (!IS_ALIGNED(vmcs12->posted_intr_desc_addr, 64) || - vmcs12->posted_intr_desc_addr >> maxphyaddr) - return false; - - if (vmx->nested.pi_desc_page) { /* shouldn't happen */ - kunmap(vmx->nested.pi_desc_page); - nested_release_page(vmx->nested.pi_desc_page); - } - vmx->nested.pi_desc_page = - nested_get_page(vcpu, vmcs12->posted_intr_desc_addr); - if (!vmx->nested.pi_desc_page) - return false; - - vmx->nested.pi_desc = - (struct pi_desc *)kmap(vmx->nested.pi_desc_page); - if (!vmx->nested.pi_desc) { - nested_release_page_clean(vmx->nested.pi_desc_page); - return false; - } - vmx->nested.pi_desc = - (struct pi_desc *)((void *)vmx->nested.pi_desc + - (unsigned long)(vmcs12->posted_intr_desc_addr & - (PAGE_SIZE - 1))); - } - return true; } -static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu) -{ - u64 preemption_timeout = get_vmcs12(vcpu)->vmx_preemption_timer_value; - struct vcpu_vmx *vmx = to_vmx(vcpu); - - if (vcpu->arch.virtual_tsc_khz == 0) - return; - - /* Make sure short timeouts reliably trigger an immediate vmexit. - * hrtimer_start does not guarantee this. 
*/ - if (preemption_timeout <= 1) { - vmx_preemption_timer_fn(&vmx->nested.preemption_timer); - return; - } - - preemption_timeout <<= VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE; - preemption_timeout *= 1000000; - do_div(preemption_timeout, vcpu->arch.virtual_tsc_khz); - hrtimer_start(&vmx->nested.preemption_timer, - ns_to_ktime(preemption_timeout), HRTIMER_MODE_REL); -} - static int nested_vmx_check_msr_bitmap_controls(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { @@ -9527,22 +7812,22 @@ static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { int msr; - struct page *page; - unsigned long *msr_bitmap_l1; - unsigned long *msr_bitmap_l0 = to_vmx(vcpu)->nested.msr_bitmap; + PMDL kmap_mdl; + size_t *msr_bitmap_l1; + size_t *msr_bitmap_l0 = to_vmx(vcpu)->nested.msr_bitmap; /* This shortcut is ok because we support only x2APIC MSRs so far. */ if (!nested_cpu_has_virt_x2apic_mode(vmcs12)) return false; - page = nested_get_page(vcpu, vmcs12->msr_bitmap); - if (!page) { + kmap_mdl = nested_get_page(vcpu, vmcs12->msr_bitmap); + if (!kmap_mdl) { WARN_ON(1); return false; } - msr_bitmap_l1 = (unsigned long *)kmap(page); + msr_bitmap_l1 = (size_t *)kmap(kmap_mdl); if (!msr_bitmap_l1) { - nested_release_page_clean(page); + nested_release_page(kmap_mdl); WARN_ON(1); return false; } @@ -9572,8 +7857,8 @@ static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu, MSR_TYPE_W); } } - kunmap(page); - nested_release_page_clean(page); + kunmap(kmap_mdl); + nested_release_page(kmap_mdl); return true; } @@ -9583,8 +7868,7 @@ static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu, { if (!nested_cpu_has_virt_x2apic_mode(vmcs12) && !nested_cpu_has_apic_reg_virt(vmcs12) && - !nested_cpu_has_vid(vmcs12) && - !nested_cpu_has_posted_intr(vmcs12)) + !nested_cpu_has_vid(vmcs12)) return 0; /* @@ -9603,17 +7887,6 @@ static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu, !nested_exit_on_intr(vcpu)) return -EINVAL; - /* - * bits 15:8 should be zero in posted_intr_nv, - * the descriptor address has been already checked - * in nested_get_vmcs12_pages. - */ - if (nested_cpu_has_posted_intr(vmcs12) && - (!nested_cpu_has_vid(vmcs12) || - !nested_exit_intr_ack_set(vcpu) || - vmcs12->posted_intr_nv & 0xff00)) - return -EINVAL; - /* tpr shadow is needed by all apicv features. 
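
In nested_vmx_merge_msr_bitmap() above, the Linux struct page / kmap() pairing is replaced by a Windows MDL (the PMDL kmap_mdl), with kmap()/kunmap() reimplemented over it. Purely as a hedged sketch of what such a mapping helper could look like, using the documented MmGetSystemAddressForMdlSafe API (the driver's actual kmap() is defined elsewhere in the patch and may differ):

    #include <ntddk.h>

    /* Hypothetical helper: map the locked pages an MDL describes into
     * system address space and return the kernel-visible address. */
    static void *example_kmap(PMDL mdl)
    {
        return MmGetSystemAddressForMdlSafe(mdl, NormalPagePriority);
    }

A system-space mapping obtained this way stays valid until the MDL's pages are unlocked and the MDL is freed, which is presumably why the error paths above only call nested_release_page() on the MDL.
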
*/ if (!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) return -EINVAL; @@ -9622,8 +7895,8 @@ static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu, } static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu, - unsigned long count_field, - unsigned long addr_field) + size_t count_field, + size_t addr_field) { int maxphyaddr; u64 count, addr; @@ -9792,98 +8065,74 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) struct vcpu_vmx *vmx = to_vmx(vcpu); u32 exec_control; - vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector); - vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector); - vmcs_write16(GUEST_SS_SELECTOR, vmcs12->guest_ss_selector); - vmcs_write16(GUEST_DS_SELECTOR, vmcs12->guest_ds_selector); - vmcs_write16(GUEST_FS_SELECTOR, vmcs12->guest_fs_selector); - vmcs_write16(GUEST_GS_SELECTOR, vmcs12->guest_gs_selector); - vmcs_write16(GUEST_LDTR_SELECTOR, vmcs12->guest_ldtr_selector); - vmcs_write16(GUEST_TR_SELECTOR, vmcs12->guest_tr_selector); - vmcs_write32(GUEST_ES_LIMIT, vmcs12->guest_es_limit); - vmcs_write32(GUEST_CS_LIMIT, vmcs12->guest_cs_limit); - vmcs_write32(GUEST_SS_LIMIT, vmcs12->guest_ss_limit); - vmcs_write32(GUEST_DS_LIMIT, vmcs12->guest_ds_limit); - vmcs_write32(GUEST_FS_LIMIT, vmcs12->guest_fs_limit); - vmcs_write32(GUEST_GS_LIMIT, vmcs12->guest_gs_limit); - vmcs_write32(GUEST_LDTR_LIMIT, vmcs12->guest_ldtr_limit); - vmcs_write32(GUEST_TR_LIMIT, vmcs12->guest_tr_limit); - vmcs_write32(GUEST_GDTR_LIMIT, vmcs12->guest_gdtr_limit); - vmcs_write32(GUEST_IDTR_LIMIT, vmcs12->guest_idtr_limit); - vmcs_write32(GUEST_ES_AR_BYTES, vmcs12->guest_es_ar_bytes); - vmcs_write32(GUEST_CS_AR_BYTES, vmcs12->guest_cs_ar_bytes); - vmcs_write32(GUEST_SS_AR_BYTES, vmcs12->guest_ss_ar_bytes); - vmcs_write32(GUEST_DS_AR_BYTES, vmcs12->guest_ds_ar_bytes); - vmcs_write32(GUEST_FS_AR_BYTES, vmcs12->guest_fs_ar_bytes); - vmcs_write32(GUEST_GS_AR_BYTES, vmcs12->guest_gs_ar_bytes); - vmcs_write32(GUEST_LDTR_AR_BYTES, vmcs12->guest_ldtr_ar_bytes); - vmcs_write32(GUEST_TR_AR_BYTES, vmcs12->guest_tr_ar_bytes); - vmcs_writel(GUEST_ES_BASE, vmcs12->guest_es_base); - vmcs_writel(GUEST_CS_BASE, vmcs12->guest_cs_base); - vmcs_writel(GUEST_SS_BASE, vmcs12->guest_ss_base); - vmcs_writel(GUEST_DS_BASE, vmcs12->guest_ds_base); - vmcs_writel(GUEST_FS_BASE, vmcs12->guest_fs_base); - vmcs_writel(GUEST_GS_BASE, vmcs12->guest_gs_base); - vmcs_writel(GUEST_LDTR_BASE, vmcs12->guest_ldtr_base); - vmcs_writel(GUEST_TR_BASE, vmcs12->guest_tr_base); - vmcs_writel(GUEST_GDTR_BASE, vmcs12->guest_gdtr_base); - vmcs_writel(GUEST_IDTR_BASE, vmcs12->guest_idtr_base); + vmcs_write16(vcpu, GUEST_ES_SELECTOR, vmcs12->guest_es_selector); + vmcs_write16(vcpu, GUEST_CS_SELECTOR, vmcs12->guest_cs_selector); + vmcs_write16(vcpu, GUEST_SS_SELECTOR, vmcs12->guest_ss_selector); + vmcs_write16(vcpu, GUEST_DS_SELECTOR, vmcs12->guest_ds_selector); + vmcs_write16(vcpu, GUEST_FS_SELECTOR, vmcs12->guest_fs_selector); + vmcs_write16(vcpu, GUEST_GS_SELECTOR, vmcs12->guest_gs_selector); + vmcs_write16(vcpu, GUEST_LDTR_SELECTOR, vmcs12->guest_ldtr_selector); + vmcs_write16(vcpu, GUEST_TR_SELECTOR, vmcs12->guest_tr_selector); + vmcs_write32(vcpu, GUEST_ES_LIMIT, vmcs12->guest_es_limit); + vmcs_write32(vcpu, GUEST_CS_LIMIT, vmcs12->guest_cs_limit); + vmcs_write32(vcpu, GUEST_SS_LIMIT, vmcs12->guest_ss_limit); + vmcs_write32(vcpu, GUEST_DS_LIMIT, vmcs12->guest_ds_limit); + vmcs_write32(vcpu, GUEST_FS_LIMIT, vmcs12->guest_fs_limit); + vmcs_write32(vcpu, GUEST_GS_LIMIT, vmcs12->guest_gs_limit); + 
vmcs_write32(vcpu, GUEST_LDTR_LIMIT, vmcs12->guest_ldtr_limit); + vmcs_write32(vcpu, GUEST_TR_LIMIT, vmcs12->guest_tr_limit); + vmcs_write32(vcpu, GUEST_GDTR_LIMIT, vmcs12->guest_gdtr_limit); + vmcs_write32(vcpu, GUEST_IDTR_LIMIT, vmcs12->guest_idtr_limit); + vmcs_write32(vcpu, GUEST_ES_AR_BYTES, vmcs12->guest_es_ar_bytes); + vmcs_write32(vcpu, GUEST_CS_AR_BYTES, vmcs12->guest_cs_ar_bytes); + vmcs_write32(vcpu, GUEST_SS_AR_BYTES, vmcs12->guest_ss_ar_bytes); + vmcs_write32(vcpu, GUEST_DS_AR_BYTES, vmcs12->guest_ds_ar_bytes); + vmcs_write32(vcpu, GUEST_FS_AR_BYTES, vmcs12->guest_fs_ar_bytes); + vmcs_write32(vcpu, GUEST_GS_AR_BYTES, vmcs12->guest_gs_ar_bytes); + vmcs_write32(vcpu, GUEST_LDTR_AR_BYTES, vmcs12->guest_ldtr_ar_bytes); + vmcs_write32(vcpu, GUEST_TR_AR_BYTES, vmcs12->guest_tr_ar_bytes); + vmcs_writel(vcpu, GUEST_ES_BASE, vmcs12->guest_es_base); + vmcs_writel(vcpu, GUEST_CS_BASE, vmcs12->guest_cs_base); + vmcs_writel(vcpu, GUEST_SS_BASE, vmcs12->guest_ss_base); + vmcs_writel(vcpu, GUEST_DS_BASE, vmcs12->guest_ds_base); + vmcs_writel(vcpu, GUEST_FS_BASE, vmcs12->guest_fs_base); + vmcs_writel(vcpu, GUEST_GS_BASE, vmcs12->guest_gs_base); + vmcs_writel(vcpu, GUEST_LDTR_BASE, vmcs12->guest_ldtr_base); + vmcs_writel(vcpu, GUEST_TR_BASE, vmcs12->guest_tr_base); + vmcs_writel(vcpu, GUEST_GDTR_BASE, vmcs12->guest_gdtr_base); + vmcs_writel(vcpu, GUEST_IDTR_BASE, vmcs12->guest_idtr_base); if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) { kvm_set_dr(vcpu, 7, vmcs12->guest_dr7); - vmcs_write64(GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl); + vmcs_write64(vcpu, GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl); } else { kvm_set_dr(vcpu, 7, vcpu->arch.dr7); - vmcs_write64(GUEST_IA32_DEBUGCTL, vmx->nested.vmcs01_debugctl); + vmcs_write64(vcpu, GUEST_IA32_DEBUGCTL, vmx->nested.vmcs01_debugctl); } - vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, + vmcs_write32(vcpu, VM_ENTRY_INTR_INFO_FIELD, vmcs12->vm_entry_intr_info_field); - vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, + vmcs_write32(vcpu, VM_ENTRY_EXCEPTION_ERROR_CODE, vmcs12->vm_entry_exception_error_code); - vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, + vmcs_write32(vcpu, VM_ENTRY_INSTRUCTION_LEN, vmcs12->vm_entry_instruction_len); - vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, + vmcs_write32(vcpu, GUEST_INTERRUPTIBILITY_INFO, vmcs12->guest_interruptibility_info); - vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs); + vmcs_write32(vcpu, GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs); vmx_set_rflags(vcpu, vmcs12->guest_rflags); - vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, + vmcs_writel(vcpu, GUEST_PENDING_DBG_EXCEPTIONS, vmcs12->guest_pending_dbg_exceptions); - vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->guest_sysenter_esp); - vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->guest_sysenter_eip); + vmcs_writel(vcpu, GUEST_SYSENTER_ESP, vmcs12->guest_sysenter_esp); + vmcs_writel(vcpu, GUEST_SYSENTER_EIP, vmcs12->guest_sysenter_eip); if (nested_cpu_has_xsaves(vmcs12)) - vmcs_write64(XSS_EXIT_BITMAP, vmcs12->xss_exit_bitmap); - vmcs_write64(VMCS_LINK_POINTER, -1ull); + vmcs_write64(vcpu, XSS_EXIT_BITMAP, vmcs12->xss_exit_bitmap); + vmcs_write64(vcpu, VMCS_LINK_POINTER, -1ull); exec_control = vmcs12->pin_based_vm_exec_control; - /* Preemption timer setting is only taken from vmcs01. */ - exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER; exec_control |= vmcs_config.pin_based_exec_ctrl; - if (vmx->hv_deadline_tsc == -1) - exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER; - - /* Posted interrupts setting is only taken from vmcs12. 
*/ - if (nested_cpu_has_posted_intr(vmcs12)) { - /* - * Note that we use L0's vector here and in - * vmx_deliver_nested_posted_interrupt. - */ - vmx->nested.posted_intr_nv = vmcs12->posted_intr_nv; - vmx->nested.pi_pending = false; - vmcs_write16(POSTED_INTR_NV, POSTED_INTR_VECTOR); - vmcs_write64(POSTED_INTR_DESC_ADDR, - page_to_phys(vmx->nested.pi_desc_page) + - (unsigned long)(vmcs12->posted_intr_desc_addr & - (PAGE_SIZE - 1))); - } else - exec_control &= ~PIN_BASED_POSTED_INTR; - - vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, exec_control); - vmx->nested.preemption_timer_expired = false; - if (nested_cpu_has_preemption_timer(vmcs12)) - vmx_start_preemption_timer(vcpu); + vmcs_write32(vcpu, PIN_BASED_VM_EXEC_CONTROL, exec_control); /* * Whether page-faults are trapped is determined by a combination of @@ -9905,9 +8154,9 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) * To fix this, we will need to emulate the PFEC checking (on the L1 * page tables), using walk_addr(), when injecting PFs to L1. */ - vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, + vmcs_write32(vcpu, PAGE_FAULT_ERROR_CODE_MASK, enable_ept ? vmcs12->page_fault_error_code_mask : 0); - vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, + vmcs_write32(vcpu, PAGE_FAULT_ERROR_CODE_MATCH, enable_ept ? vmcs12->page_fault_error_code_match : 0); if (cpu_has_secondary_exec_ctrls()) { @@ -9929,12 +8178,12 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) * can never be accessed, this feature won't do * anything anyway. */ - if (!vmx->nested.apic_access_page) + if (!vmx->nested.apic_access_mdl) exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; else - vmcs_write64(APIC_ACCESS_ADDR, - page_to_phys(vmx->nested.apic_access_page)); + vmcs_write64(vcpu, APIC_ACCESS_ADDR, + mdl_to_phys(vmx->nested.apic_access_mdl)); } else if (!(nested_cpu_has_virt_x2apic_mode(vmcs12)) && cpu_need_virtualize_apic_accesses(&vmx->vcpu)) { exec_control |= @@ -9943,19 +8192,19 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) } if (exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) { - vmcs_write64(EOI_EXIT_BITMAP0, + vmcs_write64(vcpu, EOI_EXIT_BITMAP0, vmcs12->eoi_exit_bitmap0); - vmcs_write64(EOI_EXIT_BITMAP1, + vmcs_write64(vcpu, EOI_EXIT_BITMAP1, vmcs12->eoi_exit_bitmap1); - vmcs_write64(EOI_EXIT_BITMAP2, + vmcs_write64(vcpu, EOI_EXIT_BITMAP2, vmcs12->eoi_exit_bitmap2); - vmcs_write64(EOI_EXIT_BITMAP3, + vmcs_write64(vcpu, EOI_EXIT_BITMAP3, vmcs12->eoi_exit_bitmap3); - vmcs_write16(GUEST_INTR_STATUS, + vmcs_write16(vcpu, GUEST_INTR_STATUS, vmcs12->guest_intr_status); } - vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); + vmcs_write32(vcpu, SECONDARY_VM_EXEC_CONTROL, exec_control); } @@ -9983,9 +8232,9 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) exec_control |= vmcs12->cpu_based_vm_exec_control; if (exec_control & CPU_BASED_TPR_SHADOW) { - vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, - page_to_phys(vmx->nested.virtual_apic_page)); - vmcs_write32(TPR_THRESHOLD, vmcs12->tpr_threshold); + vmcs_write64(vcpu, VIRTUAL_APIC_PAGE_ADDR, + mdl_to_phys(vmx->nested.virtual_apic_mdl)); + vmcs_write32(vcpu, TPR_THRESHOLD, vmcs12->tpr_threshold); } if (cpu_has_vmx_msr_bitmap() && @@ -10002,7 +8251,7 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) exec_control &= ~CPU_BASED_USE_IO_BITMAPS; exec_control |= CPU_BASED_UNCOND_IO_EXITING; - vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, exec_control); + vmcs_write32(vcpu, CPU_BASED_VM_EXEC_CONTROL, 
exec_control); /* EXCEPTION_BITMAP and CR0_GUEST_HOST_MASK should basically be the * bitwise-or of what L1 wants to trap for L2, and what we want to @@ -10010,13 +8259,13 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) */ update_exception_bitmap(vcpu); vcpu->arch.cr0_guest_owned_bits &= ~vmcs12->cr0_guest_host_mask; - vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits); + vmcs_writel(vcpu, CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits); /* L2->L1 exit controls are emulated - the hardware exit is to L0 so * we should use its exit controls. Note that VM_EXIT_LOAD_IA32_EFER * bits are further modified by vmx_set_efer() below. */ - vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl); + vmcs_write32(vcpu, VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl); /* vmcs12's VM_ENTRY_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE are * emulated by vmx_set_efer(), below. @@ -10027,24 +8276,22 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) (vmcs_config.vmentry_ctrl & ~VM_ENTRY_IA32E_MODE)); if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT) { - vmcs_write64(GUEST_IA32_PAT, vmcs12->guest_ia32_pat); + vmcs_write64(vcpu, GUEST_IA32_PAT, vmcs12->guest_ia32_pat); vcpu->arch.pat = vmcs12->guest_ia32_pat; } else if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) - vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat); + vmcs_write64(vcpu, GUEST_IA32_PAT, vmx->vcpu.arch.pat); set_cr4_guest_host_mask(vmx); if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS) - vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs); + vmcs_write64(vcpu, GUEST_BNDCFGS, vmcs12->guest_bndcfgs); if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING) - vmcs_write64(TSC_OFFSET, + vmcs_write64(vcpu, TSC_OFFSET, vcpu->arch.tsc_offset + vmcs12->tsc_offset); else - vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset); - if (kvm_has_tsc_control) - decache_tsc_multiplier(vmx); + vmcs_write64(vcpu, TSC_OFFSET, vcpu->arch.tsc_offset); if (enable_vpid) { /* @@ -10056,13 +8303,13 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) * even if spawn a lot of nested vCPUs. */ if (nested_cpu_has_vpid(vmcs12) && vmx->nested.vpid02) { - vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->nested.vpid02); + vmcs_write16(vcpu, VIRTUAL_PROCESSOR_ID, vmx->nested.vpid02); if (vmcs12->virtual_processor_id != vmx->nested.last_vpid) { vmx->nested.last_vpid = vmcs12->virtual_processor_id; __vmx_flush_tlb(vcpu, to_vmx(vcpu)->nested.vpid02); } } else { - vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid); + vmcs_write16(vcpu, VIRTUAL_PROCESSOR_ID, vmx->vpid); vmx_flush_tlb(vcpu); } @@ -10091,10 +8338,10 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) * have more bits than L1 expected. 
*/ vmx_set_cr0(vcpu, vmcs12->guest_cr0); - vmcs_writel(CR0_READ_SHADOW, nested_read_cr0(vmcs12)); + vmcs_writel(vcpu, CR0_READ_SHADOW, nested_read_cr0(vmcs12)); vmx_set_cr4(vcpu, vmcs12->guest_cr4); - vmcs_writel(CR4_READ_SHADOW, nested_read_cr4(vmcs12)); + vmcs_writel(vcpu, CR4_READ_SHADOW, nested_read_cr4(vmcs12)); /* shadow page tables on either EPT or shadow page tables */ kvm_set_cr3(vcpu, vmcs12->guest_cr3); @@ -10107,10 +8354,10 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) * L1 may access the L2's PDPTR, so save them to construct vmcs12 */ if (enable_ept) { - vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0); - vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1); - vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2); - vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3); + vmcs_write64(vcpu, GUEST_PDPTR0, vmcs12->guest_pdptr0); + vmcs_write64(vcpu, GUEST_PDPTR1, vmcs12->guest_pdptr1); + vmcs_write64(vcpu, GUEST_PDPTR2, vmcs12->guest_pdptr2); + vmcs_write64(vcpu, GUEST_PDPTR3, vmcs12->guest_pdptr3); } kvm_register_write(vcpu, VCPU_REGS_RSP, vmcs12->guest_rsp); @@ -10273,7 +8520,7 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) enter_guest_mode(vcpu); if (!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) - vmx->nested.vmcs01_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL); + vmx->nested.vmcs01_debugctl = vmcs_read64(vcpu, GUEST_IA32_DEBUGCTL); cpu = get_cpu(); vmx->loaded_vmcs = vmcs02; @@ -10330,23 +8577,23 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) * didn't necessarily allow them to be changed in GUEST_CR0 - and rather * put them in vmcs02 CR0_READ_SHADOW. So take these bits from there. */ -static inline unsigned long +static inline size_t vmcs12_guest_cr0(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { return - /*1*/ (vmcs_readl(GUEST_CR0) & vcpu->arch.cr0_guest_owned_bits) | + /*1*/ (vmcs_readl(vcpu, GUEST_CR0) & vcpu->arch.cr0_guest_owned_bits) | /*2*/ (vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask) | - /*3*/ (vmcs_readl(CR0_READ_SHADOW) & ~(vmcs12->cr0_guest_host_mask | + /*3*/ (vmcs_readl(vcpu, CR0_READ_SHADOW) & ~(vmcs12->cr0_guest_host_mask | vcpu->arch.cr0_guest_owned_bits)); } -static inline unsigned long +static inline size_t vmcs12_guest_cr4(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { return - /*1*/ (vmcs_readl(GUEST_CR4) & vcpu->arch.cr4_guest_owned_bits) | + /*1*/ (vmcs_readl(vcpu, GUEST_CR4) & vcpu->arch.cr4_guest_owned_bits) | /*2*/ (vmcs12->guest_cr4 & vmcs12->cr4_guest_host_mask) | - /*3*/ (vmcs_readl(CR4_READ_SHADOW) & ~(vmcs12->cr4_guest_host_mask | + /*3*/ (vmcs_readl(vcpu, CR4_READ_SHADOW) & ~(vmcs12->cr4_guest_host_mask | vcpu->arch.cr4_guest_owned_bits)); } @@ -10396,14 +8643,6 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu, bool external_intr) { struct vcpu_vmx *vmx = to_vmx(vcpu); - if (nested_cpu_has_preemption_timer(get_vmcs12(vcpu)) && - vmx->nested.preemption_timer_expired) { - if (vmx->nested.nested_run_pending) - return -EBUSY; - nested_vmx_vmexit(vcpu, EXIT_REASON_PREEMPTION_TIMER, 0, 0); - return 0; - } - if (vcpu->arch.nmi_pending && nested_exit_on_nmi(vcpu)) { if (vmx->nested.nested_run_pending || vcpu->arch.interrupt.pending) @@ -10428,21 +8667,7 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu, bool external_intr) return 0; } - return vmx_complete_nested_posted_interrupt(vcpu); -} - -static u32 vmx_get_preemption_timer_value(struct kvm_vcpu *vcpu) -{ - ktime_t remaining = - hrtimer_get_remaining(&to_vmx(vcpu)->nested.preemption_timer); - u64 value; - 
- if (ktime_to_ns(remaining) <= 0) - return 0; - - value = ktime_to_ns(remaining) * vcpu->arch.virtual_tsc_khz; - do_div(value, 1000000); - return value >> VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE; + return 0; } /* @@ -10458,7 +8683,7 @@ static u32 vmx_get_preemption_timer_value(struct kvm_vcpu *vcpu) */ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, u32 exit_reason, u32 exit_intr_info, - unsigned long exit_qualification) + size_t exit_qualification) { /* update guest state fields: */ vmcs12->guest_cr0 = vmcs12_guest_cr0(vcpu, vmcs12); @@ -10466,62 +8691,54 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, vmcs12->guest_rsp = kvm_register_read(vcpu, VCPU_REGS_RSP); vmcs12->guest_rip = kvm_register_read(vcpu, VCPU_REGS_RIP); - vmcs12->guest_rflags = vmcs_readl(GUEST_RFLAGS); - - vmcs12->guest_es_selector = vmcs_read16(GUEST_ES_SELECTOR); - vmcs12->guest_cs_selector = vmcs_read16(GUEST_CS_SELECTOR); - vmcs12->guest_ss_selector = vmcs_read16(GUEST_SS_SELECTOR); - vmcs12->guest_ds_selector = vmcs_read16(GUEST_DS_SELECTOR); - vmcs12->guest_fs_selector = vmcs_read16(GUEST_FS_SELECTOR); - vmcs12->guest_gs_selector = vmcs_read16(GUEST_GS_SELECTOR); - vmcs12->guest_ldtr_selector = vmcs_read16(GUEST_LDTR_SELECTOR); - vmcs12->guest_tr_selector = vmcs_read16(GUEST_TR_SELECTOR); - vmcs12->guest_es_limit = vmcs_read32(GUEST_ES_LIMIT); - vmcs12->guest_cs_limit = vmcs_read32(GUEST_CS_LIMIT); - vmcs12->guest_ss_limit = vmcs_read32(GUEST_SS_LIMIT); - vmcs12->guest_ds_limit = vmcs_read32(GUEST_DS_LIMIT); - vmcs12->guest_fs_limit = vmcs_read32(GUEST_FS_LIMIT); - vmcs12->guest_gs_limit = vmcs_read32(GUEST_GS_LIMIT); - vmcs12->guest_ldtr_limit = vmcs_read32(GUEST_LDTR_LIMIT); - vmcs12->guest_tr_limit = vmcs_read32(GUEST_TR_LIMIT); - vmcs12->guest_gdtr_limit = vmcs_read32(GUEST_GDTR_LIMIT); - vmcs12->guest_idtr_limit = vmcs_read32(GUEST_IDTR_LIMIT); - vmcs12->guest_es_ar_bytes = vmcs_read32(GUEST_ES_AR_BYTES); - vmcs12->guest_cs_ar_bytes = vmcs_read32(GUEST_CS_AR_BYTES); - vmcs12->guest_ss_ar_bytes = vmcs_read32(GUEST_SS_AR_BYTES); - vmcs12->guest_ds_ar_bytes = vmcs_read32(GUEST_DS_AR_BYTES); - vmcs12->guest_fs_ar_bytes = vmcs_read32(GUEST_FS_AR_BYTES); - vmcs12->guest_gs_ar_bytes = vmcs_read32(GUEST_GS_AR_BYTES); - vmcs12->guest_ldtr_ar_bytes = vmcs_read32(GUEST_LDTR_AR_BYTES); - vmcs12->guest_tr_ar_bytes = vmcs_read32(GUEST_TR_AR_BYTES); - vmcs12->guest_es_base = vmcs_readl(GUEST_ES_BASE); - vmcs12->guest_cs_base = vmcs_readl(GUEST_CS_BASE); - vmcs12->guest_ss_base = vmcs_readl(GUEST_SS_BASE); - vmcs12->guest_ds_base = vmcs_readl(GUEST_DS_BASE); - vmcs12->guest_fs_base = vmcs_readl(GUEST_FS_BASE); - vmcs12->guest_gs_base = vmcs_readl(GUEST_GS_BASE); - vmcs12->guest_ldtr_base = vmcs_readl(GUEST_LDTR_BASE); - vmcs12->guest_tr_base = vmcs_readl(GUEST_TR_BASE); - vmcs12->guest_gdtr_base = vmcs_readl(GUEST_GDTR_BASE); - vmcs12->guest_idtr_base = vmcs_readl(GUEST_IDTR_BASE); + vmcs12->guest_rflags = vmcs_readl(vcpu, GUEST_RFLAGS); + + vmcs12->guest_es_selector = vmcs_read16(vcpu, GUEST_ES_SELECTOR); + vmcs12->guest_cs_selector = vmcs_read16(vcpu, GUEST_CS_SELECTOR); + vmcs12->guest_ss_selector = vmcs_read16(vcpu, GUEST_SS_SELECTOR); + vmcs12->guest_ds_selector = vmcs_read16(vcpu, GUEST_DS_SELECTOR); + vmcs12->guest_fs_selector = vmcs_read16(vcpu, GUEST_FS_SELECTOR); + vmcs12->guest_gs_selector = vmcs_read16(vcpu, GUEST_GS_SELECTOR); + vmcs12->guest_ldtr_selector = vmcs_read16(vcpu, GUEST_LDTR_SELECTOR); + vmcs12->guest_tr_selector = vmcs_read16(vcpu, 
GUEST_TR_SELECTOR); + vmcs12->guest_es_limit = vmcs_read32(vcpu, GUEST_ES_LIMIT); + vmcs12->guest_cs_limit = vmcs_read32(vcpu, GUEST_CS_LIMIT); + vmcs12->guest_ss_limit = vmcs_read32(vcpu, GUEST_SS_LIMIT); + vmcs12->guest_ds_limit = vmcs_read32(vcpu, GUEST_DS_LIMIT); + vmcs12->guest_fs_limit = vmcs_read32(vcpu, GUEST_FS_LIMIT); + vmcs12->guest_gs_limit = vmcs_read32(vcpu, GUEST_GS_LIMIT); + vmcs12->guest_ldtr_limit = vmcs_read32(vcpu, GUEST_LDTR_LIMIT); + vmcs12->guest_tr_limit = vmcs_read32(vcpu, GUEST_TR_LIMIT); + vmcs12->guest_gdtr_limit = vmcs_read32(vcpu, GUEST_GDTR_LIMIT); + vmcs12->guest_idtr_limit = vmcs_read32(vcpu, GUEST_IDTR_LIMIT); + vmcs12->guest_es_ar_bytes = vmcs_read32(vcpu, GUEST_ES_AR_BYTES); + vmcs12->guest_cs_ar_bytes = vmcs_read32(vcpu, GUEST_CS_AR_BYTES); + vmcs12->guest_ss_ar_bytes = vmcs_read32(vcpu, GUEST_SS_AR_BYTES); + vmcs12->guest_ds_ar_bytes = vmcs_read32(vcpu, GUEST_DS_AR_BYTES); + vmcs12->guest_fs_ar_bytes = vmcs_read32(vcpu, GUEST_FS_AR_BYTES); + vmcs12->guest_gs_ar_bytes = vmcs_read32(vcpu, GUEST_GS_AR_BYTES); + vmcs12->guest_ldtr_ar_bytes = vmcs_read32(vcpu, GUEST_LDTR_AR_BYTES); + vmcs12->guest_tr_ar_bytes = vmcs_read32(vcpu, GUEST_TR_AR_BYTES); + vmcs12->guest_es_base = vmcs_readl(vcpu, GUEST_ES_BASE); + vmcs12->guest_cs_base = vmcs_readl(vcpu, GUEST_CS_BASE); + vmcs12->guest_ss_base = vmcs_readl(vcpu, GUEST_SS_BASE); + vmcs12->guest_ds_base = vmcs_readl(vcpu, GUEST_DS_BASE); + vmcs12->guest_fs_base = vmcs_readl(vcpu, GUEST_FS_BASE); + vmcs12->guest_gs_base = vmcs_readl(vcpu, GUEST_GS_BASE); + vmcs12->guest_ldtr_base = vmcs_readl(vcpu, GUEST_LDTR_BASE); + vmcs12->guest_tr_base = vmcs_readl(vcpu, GUEST_TR_BASE); + vmcs12->guest_gdtr_base = vmcs_readl(vcpu, GUEST_GDTR_BASE); + vmcs12->guest_idtr_base = vmcs_readl(vcpu, GUEST_IDTR_BASE); vmcs12->guest_interruptibility_info = - vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); + vmcs_read32(vcpu, GUEST_INTERRUPTIBILITY_INFO); vmcs12->guest_pending_dbg_exceptions = - vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS); - if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED) + vmcs_readl(vcpu, GUEST_PENDING_DBG_EXCEPTIONS); + if (vcpu->arch.mp_state == GVM_MP_STATE_HALTED) vmcs12->guest_activity_state = GUEST_ACTIVITY_HLT; else vmcs12->guest_activity_state = GUEST_ACTIVITY_ACTIVE; - if (nested_cpu_has_preemption_timer(vmcs12)) { - if (vmcs12->vm_exit_controls & - VM_EXIT_SAVE_VMX_PREEMPTION_TIMER) - vmcs12->vmx_preemption_timer_value = - vmx_get_preemption_timer_value(vcpu); - hrtimer_cancel(&to_vmx(vcpu)->nested.preemption_timer); - } - /* * In some cases (usually, nested EPT), L2 is allowed to change its * own CR3 without exiting. If it has changed it, we must keep it. @@ -10531,41 +8748,41 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, * Additionally, restore L2's PDPTR to vmcs12. 
*/ if (enable_ept) { - vmcs12->guest_cr3 = vmcs_readl(GUEST_CR3); - vmcs12->guest_pdptr0 = vmcs_read64(GUEST_PDPTR0); - vmcs12->guest_pdptr1 = vmcs_read64(GUEST_PDPTR1); - vmcs12->guest_pdptr2 = vmcs_read64(GUEST_PDPTR2); - vmcs12->guest_pdptr3 = vmcs_read64(GUEST_PDPTR3); + vmcs12->guest_cr3 = vmcs_readl(vcpu, GUEST_CR3); + vmcs12->guest_pdptr0 = vmcs_read64(vcpu, GUEST_PDPTR0); + vmcs12->guest_pdptr1 = vmcs_read64(vcpu, GUEST_PDPTR1); + vmcs12->guest_pdptr2 = vmcs_read64(vcpu, GUEST_PDPTR2); + vmcs12->guest_pdptr3 = vmcs_read64(vcpu, GUEST_PDPTR3); } if (nested_cpu_has_ept(vmcs12)) - vmcs12->guest_linear_address = vmcs_readl(GUEST_LINEAR_ADDRESS); + vmcs12->guest_linear_address = vmcs_readl(vcpu, GUEST_LINEAR_ADDRESS); if (nested_cpu_has_vid(vmcs12)) - vmcs12->guest_intr_status = vmcs_read16(GUEST_INTR_STATUS); + vmcs12->guest_intr_status = vmcs_read16(vcpu, GUEST_INTR_STATUS); vmcs12->vm_entry_controls = (vmcs12->vm_entry_controls & ~VM_ENTRY_IA32E_MODE) | (vm_entry_controls_get(to_vmx(vcpu)) & VM_ENTRY_IA32E_MODE); if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_DEBUG_CONTROLS) { - kvm_get_dr(vcpu, 7, (unsigned long *)&vmcs12->guest_dr7); - vmcs12->guest_ia32_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL); + kvm_get_dr(vcpu, 7, (size_t *)&vmcs12->guest_dr7); + vmcs12->guest_ia32_debugctl = vmcs_read64(vcpu, GUEST_IA32_DEBUGCTL); } /* TODO: These cannot have changed unless we have MSR bitmaps and * the relevant bit asks not to trap the change */ if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_PAT) - vmcs12->guest_ia32_pat = vmcs_read64(GUEST_IA32_PAT); + vmcs12->guest_ia32_pat = vmcs_read64(vcpu, GUEST_IA32_PAT); if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_EFER) vmcs12->guest_ia32_efer = vcpu->arch.efer; - vmcs12->guest_sysenter_cs = vmcs_read32(GUEST_SYSENTER_CS); - vmcs12->guest_sysenter_esp = vmcs_readl(GUEST_SYSENTER_ESP); - vmcs12->guest_sysenter_eip = vmcs_readl(GUEST_SYSENTER_EIP); + vmcs12->guest_sysenter_cs = vmcs_read32(vcpu, GUEST_SYSENTER_CS); + vmcs12->guest_sysenter_esp = vmcs_readl(vcpu, GUEST_SYSENTER_ESP); + vmcs12->guest_sysenter_eip = vmcs_readl(vcpu, GUEST_SYSENTER_EIP); if (kvm_mpx_supported()) - vmcs12->guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS); + vmcs12->guest_bndcfgs = vmcs_read64(vcpu, GUEST_BNDCFGS); if (nested_cpu_has_xsaves(vmcs12)) - vmcs12->xss_exit_bitmap = vmcs_read64(XSS_EXIT_BITMAP); + vmcs12->xss_exit_bitmap = vmcs_read64(vcpu, XSS_EXIT_BITMAP); /* update exit information fields: */ @@ -10577,10 +8794,10 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, (INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK)) == (INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK)) vmcs12->vm_exit_intr_error_code = - vmcs_read32(VM_EXIT_INTR_ERROR_CODE); + vmcs_read32(vcpu, VM_EXIT_INTR_ERROR_CODE); vmcs12->idt_vectoring_info_field = 0; - vmcs12->vm_exit_instruction_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN); - vmcs12->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); + vmcs12->vm_exit_instruction_len = vmcs_read32(vcpu, VM_EXIT_INSTRUCTION_LEN); + vmcs12->vmx_instruction_info = vmcs_read32(vcpu, VMX_INSTRUCTION_INFO); if (!(vmcs12->vm_exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) { /* vm_entry_intr_info_field is cleared on exit. Emulate this @@ -10641,14 +8858,14 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, * but we also need to update cr0_guest_host_mask and exception_bitmap. */ update_exception_bitmap(vcpu); - vcpu->arch.cr0_guest_owned_bits = (vcpu->fpu_active ? 
X86_CR0_TS : 0); - vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits); + vcpu->arch.cr0_guest_owned_bits = X86_CR0_TS; + vmcs_writel(vcpu, CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits); /* * Note that CR4_GUEST_HOST_MASK is already set in the original vmcs01 - * (KVM doesn't change it)- no reason to call set_cr4_guest_host_mask(); + * (kvm doesn't change it)- no reason to call set_cr4_guest_host_mask(); */ - vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK); + vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(vcpu, CR4_GUEST_HOST_MASK); kvm_set_cr4(vcpu, vmcs12->host_cr4); nested_ept_uninit_mmu_context(vcpu); @@ -10669,22 +8886,22 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, } - vmcs_write32(GUEST_SYSENTER_CS, vmcs12->host_ia32_sysenter_cs); - vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->host_ia32_sysenter_esp); - vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->host_ia32_sysenter_eip); - vmcs_writel(GUEST_IDTR_BASE, vmcs12->host_idtr_base); - vmcs_writel(GUEST_GDTR_BASE, vmcs12->host_gdtr_base); + vmcs_write32(vcpu, GUEST_SYSENTER_CS, vmcs12->host_ia32_sysenter_cs); + vmcs_writel(vcpu, GUEST_SYSENTER_ESP, vmcs12->host_ia32_sysenter_esp); + vmcs_writel(vcpu, GUEST_SYSENTER_EIP, vmcs12->host_ia32_sysenter_eip); + vmcs_writel(vcpu, GUEST_IDTR_BASE, vmcs12->host_idtr_base); + vmcs_writel(vcpu, GUEST_GDTR_BASE, vmcs12->host_gdtr_base); /* If not VM_EXIT_CLEAR_BNDCFGS, the L2 value propagates to L1. */ if (vmcs12->vm_exit_controls & VM_EXIT_CLEAR_BNDCFGS) - vmcs_write64(GUEST_BNDCFGS, 0); + vmcs_write64(vcpu, GUEST_BNDCFGS, 0); if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) { - vmcs_write64(GUEST_IA32_PAT, vmcs12->host_ia32_pat); + vmcs_write64(vcpu, GUEST_IA32_PAT, vmcs12->host_ia32_pat); vcpu->arch.pat = vmcs12->host_ia32_pat; } if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) - vmcs_write64(GUEST_IA32_PERF_GLOBAL_CTRL, + vmcs_write64(vcpu, GUEST_IA32_PERF_GLOBAL_CTRL, vmcs12->host_ia32_perf_global_ctrl); /* Set L1 segment info according to Intel SDM @@ -10734,7 +8951,7 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, vmx_set_segment(vcpu, &seg, VCPU_SREG_TR); kvm_set_dr(vcpu, 7, 0x400); - vmcs_write64(GUEST_IA32_DEBUGCTL, 0); + vmcs_write64(vcpu, GUEST_IA32_DEBUGCTL, 0); if (cpu_has_vmx_msr_bitmap()) vmx_set_msr_bitmap(vcpu); @@ -10751,7 +8968,7 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, */ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, u32 exit_intr_info, - unsigned long exit_qualification) + size_t exit_qualification) { struct vcpu_vmx *vmx = to_vmx(vcpu); struct vmcs12 *vmcs12 = get_vmcs12(vcpu); @@ -10777,13 +8994,6 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR; } - trace_kvm_nested_vmexit_inject(vmcs12->vm_exit_reason, - vmcs12->exit_qualification, - vmcs12->idt_vectoring_info_field, - vmcs12->vm_exit_intr_info, - vmcs12->vm_exit_intr_error_code, - KVM_ISA_VMX); - vm_entry_controls_reset_shadow(vmx); vm_exit_controls_reset_shadow(vmx); vmx_segment_cache_clear(vmx); @@ -10795,15 +9005,7 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, load_vmcs12_host_state(vcpu, vmcs12); /* Update any VMCS fields that might have changed while L2 ran */ - vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset); - if (vmx->hv_deadline_tsc == -1) - vmcs_clear_bits(PIN_BASED_VM_EXEC_CONTROL, - PIN_BASED_VMX_PREEMPTION_TIMER); - else - vmcs_set_bits(PIN_BASED_VM_EXEC_CONTROL, - 
PIN_BASED_VMX_PREEMPTION_TIMER); - if (kvm_has_tsc_control) - decache_tsc_multiplier(vmx); + vmcs_write64(vcpu, TSC_OFFSET, vcpu->arch.tsc_offset); if (vmx->nested.change_vmcs01_virtual_x2apic_mode) { vmx->nested.change_vmcs01_virtual_x2apic_mode = false; @@ -10815,26 +9017,20 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, vmx->host_rsp = 0; /* Unpin physical memory we referred to in vmcs02 */ - if (vmx->nested.apic_access_page) { - nested_release_page(vmx->nested.apic_access_page); - vmx->nested.apic_access_page = NULL; - } - if (vmx->nested.virtual_apic_page) { - nested_release_page(vmx->nested.virtual_apic_page); - vmx->nested.virtual_apic_page = NULL; + if (vmx->nested.apic_access_mdl) { + nested_release_page(vmx->nested.apic_access_mdl); + vmx->nested.apic_access_mdl = NULL; } - if (vmx->nested.pi_desc_page) { - kunmap(vmx->nested.pi_desc_page); - nested_release_page(vmx->nested.pi_desc_page); - vmx->nested.pi_desc_page = NULL; - vmx->nested.pi_desc = NULL; + if (vmx->nested.virtual_apic_mdl) { + nested_release_page(vmx->nested.virtual_apic_mdl); + vmx->nested.virtual_apic_mdl = NULL; } /* * We are now running in L2, mmu_notifier will force to reload the * page's hpa for L2 vmcs. Need to reload it for L1 before entering L1. */ - kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu); + kvm_make_request(GVM_REQ_APIC_PAGE_RELOAD, vcpu); /* * Exiting from L2 to L1, we're now back to L1 which thinks it just @@ -10843,14 +9039,14 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, */ if (unlikely(vmx->fail)) { vmx->fail = 0; - nested_vmx_failValid(vcpu, vmcs_read32(VM_INSTRUCTION_ERROR)); + nested_vmx_failValid(vcpu, vmcs_read32(vcpu, VM_INSTRUCTION_ERROR)); } else nested_vmx_succeed(vcpu); if (enable_shadow_vmcs) vmx->nested.sync_shadow_vmcs = true; /* in case we halted in L2 */ - vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; + vcpu->arch.mp_state = GVM_MP_STATE_RUNNABLE; } /* @@ -10872,7 +9068,7 @@ static void vmx_leave_nested(struct kvm_vcpu *vcpu) */ static void nested_vmx_entry_failure(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, - u32 reason, unsigned long qualification) + u32 reason, size_t qualification) { load_vmcs12_host_state(vcpu, vmcs12); vmcs12->vm_exit_reason = reason | VMX_EXIT_REASONS_FAILED_VMENTRY; @@ -10889,75 +9085,10 @@ static int vmx_check_intercept(struct kvm_vcpu *vcpu, return X86EMUL_CONTINUE; } -#ifdef CONFIG_X86_64 -/* (a << shift) / divisor, return 1 if overflow otherwise 0 */ -static inline int u64_shl_div_u64(u64 a, unsigned int shift, - u64 divisor, u64 *result) -{ - u64 low = a << shift, high = a >> (64 - shift); - - /* To avoid the overflow on divq */ - if (high >= divisor) - return 1; - - /* Low hold the result, high hold rem which is discarded */ - asm("divq %2\n\t" : "=a" (low), "=d" (high) : - "rm" (divisor), "0" (low), "1" (high)); - *result = low; - - return 0; -} - -static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - u64 tscl = rdtsc(); - u64 guest_tscl = kvm_read_l1_tsc(vcpu, tscl); - u64 delta_tsc = max(guest_deadline_tsc, guest_tscl) - guest_tscl; - - /* Convert to host delta tsc if tsc scaling is enabled */ - if (vcpu->arch.tsc_scaling_ratio != kvm_default_tsc_scaling_ratio && - u64_shl_div_u64(delta_tsc, - kvm_tsc_scaling_ratio_frac_bits, - vcpu->arch.tsc_scaling_ratio, - &delta_tsc)) - return -ERANGE; - - /* - * If the delta tsc can't fit in the 32 bit after the multi shift, - * we can't use the preemption timer. 
- * It's possible that it fits on later vmentries, but checking - * on every vmentry is costly so we just use an hrtimer. - */ - if (delta_tsc >> (cpu_preemption_timer_multi + 32)) - return -ERANGE; - - vmx->hv_deadline_tsc = tscl + delta_tsc; - vmcs_set_bits(PIN_BASED_VM_EXEC_CONTROL, - PIN_BASED_VMX_PREEMPTION_TIMER); - return 0; -} - -static void vmx_cancel_hv_timer(struct kvm_vcpu *vcpu) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - vmx->hv_deadline_tsc = -1; - vmcs_clear_bits(PIN_BASED_VM_EXEC_CONTROL, - PIN_BASED_VMX_PREEMPTION_TIMER); -} -#endif - -static void vmx_sched_in(struct kvm_vcpu *vcpu, int cpu) -{ - if (ple_gap) - shrink_ple_window(vcpu); -} - static void vmx_slot_enable_log_dirty(struct kvm *kvm, struct kvm_memory_slot *slot) { kvm_mmu_slot_leaf_clear_dirty(kvm, slot); - kvm_mmu_slot_largepage_remove_write_access(kvm, slot); } static void vmx_slot_disable_log_dirty(struct kvm *kvm, @@ -10968,257 +9099,17 @@ static void vmx_slot_disable_log_dirty(struct kvm *kvm, static void vmx_flush_log_dirty(struct kvm *kvm) { - kvm_flush_pml_buffers(kvm); + //kvm_flush_pml_buffers(kvm); } static void vmx_enable_log_dirty_pt_masked(struct kvm *kvm, struct kvm_memory_slot *memslot, - gfn_t offset, unsigned long mask) + gfn_t offset, size_t mask) { kvm_mmu_clear_dirty_pt_masked(kvm, memslot, offset, mask); } -/* - * This routine does the following things for vCPU which is going - * to be blocked if VT-d PI is enabled. - * - Store the vCPU to the wakeup list, so when interrupts happen - * we can find the right vCPU to wake up. - * - Change the Posted-interrupt descriptor as below: - * 'NDST' <-- vcpu->pre_pcpu - * 'NV' <-- POSTED_INTR_WAKEUP_VECTOR - * - If 'ON' is set during this process, which means at least one - * interrupt is posted for this vCPU, we cannot block it, in - * this case, return 1, otherwise, return 0. - * - */ -static int pi_pre_block(struct kvm_vcpu *vcpu) -{ - unsigned long flags; - unsigned int dest; - struct pi_desc old, new; - struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); - - if (!kvm_arch_has_assigned_device(vcpu->kvm) || - !irq_remapping_cap(IRQ_POSTING_CAP) || - !kvm_vcpu_apicv_active(vcpu)) - return 0; - - vcpu->pre_pcpu = vcpu->cpu; - spin_lock_irqsave(&per_cpu(blocked_vcpu_on_cpu_lock, - vcpu->pre_pcpu), flags); - list_add_tail(&vcpu->blocked_vcpu_list, - &per_cpu(blocked_vcpu_on_cpu, - vcpu->pre_pcpu)); - spin_unlock_irqrestore(&per_cpu(blocked_vcpu_on_cpu_lock, - vcpu->pre_pcpu), flags); - - do { - old.control = new.control = pi_desc->control; - - /* - * We should not block the vCPU if - * an interrupt is posted for it. - */ - if (pi_test_on(pi_desc) == 1) { - spin_lock_irqsave(&per_cpu(blocked_vcpu_on_cpu_lock, - vcpu->pre_pcpu), flags); - list_del(&vcpu->blocked_vcpu_list); - spin_unlock_irqrestore( - &per_cpu(blocked_vcpu_on_cpu_lock, - vcpu->pre_pcpu), flags); - vcpu->pre_pcpu = -1; - - return 1; - } - - WARN((pi_desc->sn == 1), - "Warning: SN field of posted-interrupts " - "is set before blocking\n"); - - /* - * Since vCPU can be preempted during this process, - * vcpu->cpu could be different with pre_pcpu, we - * need to set pre_pcpu as the destination of wakeup - * notification event, then we can find the right vCPU - * to wakeup in wakeup handler if interrupts happen - * when the vCPU is in blocked state. 
- */ - dest = cpu_physical_id(vcpu->pre_pcpu); - - if (x2apic_enabled()) - new.ndst = dest; - else - new.ndst = (dest << 8) & 0xFF00; - - /* set 'NV' to 'wakeup vector' */ - new.nv = POSTED_INTR_WAKEUP_VECTOR; - } while (cmpxchg(&pi_desc->control, old.control, - new.control) != old.control); - - return 0; -} - -static int vmx_pre_block(struct kvm_vcpu *vcpu) -{ - if (pi_pre_block(vcpu)) - return 1; - - if (kvm_lapic_hv_timer_in_use(vcpu)) - kvm_lapic_switch_to_sw_timer(vcpu); - - return 0; -} - -static void pi_post_block(struct kvm_vcpu *vcpu) -{ - struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); - struct pi_desc old, new; - unsigned int dest; - unsigned long flags; - - if (!kvm_arch_has_assigned_device(vcpu->kvm) || - !irq_remapping_cap(IRQ_POSTING_CAP) || - !kvm_vcpu_apicv_active(vcpu)) - return; - - do { - old.control = new.control = pi_desc->control; - - dest = cpu_physical_id(vcpu->cpu); - - if (x2apic_enabled()) - new.ndst = dest; - else - new.ndst = (dest << 8) & 0xFF00; - - /* Allow posting non-urgent interrupts */ - new.sn = 0; - - /* set 'NV' to 'notification vector' */ - new.nv = POSTED_INTR_VECTOR; - } while (cmpxchg(&pi_desc->control, old.control, - new.control) != old.control); - - if(vcpu->pre_pcpu != -1) { - spin_lock_irqsave( - &per_cpu(blocked_vcpu_on_cpu_lock, - vcpu->pre_pcpu), flags); - list_del(&vcpu->blocked_vcpu_list); - spin_unlock_irqrestore( - &per_cpu(blocked_vcpu_on_cpu_lock, - vcpu->pre_pcpu), flags); - vcpu->pre_pcpu = -1; - } -} - -static void vmx_post_block(struct kvm_vcpu *vcpu) -{ - if (kvm_x86_ops->set_hv_timer) - kvm_lapic_switch_to_hv_timer(vcpu); - - pi_post_block(vcpu); -} - -/* - * vmx_update_pi_irte - set IRTE for Posted-Interrupts - * - * @kvm: kvm - * @host_irq: host irq of the interrupt - * @guest_irq: gsi of the interrupt - * @set: set or unset PI - * returns 0 on success, < 0 on failure - */ -static int vmx_update_pi_irte(struct kvm *kvm, unsigned int host_irq, - uint32_t guest_irq, bool set) -{ - struct kvm_kernel_irq_routing_entry *e; - struct kvm_irq_routing_table *irq_rt; - struct kvm_lapic_irq irq; - struct kvm_vcpu *vcpu; - struct vcpu_data vcpu_info; - int idx, ret = -EINVAL; - - if (!kvm_arch_has_assigned_device(kvm) || - !irq_remapping_cap(IRQ_POSTING_CAP) || - !kvm_vcpu_apicv_active(kvm->vcpus[0])) - return 0; - - idx = srcu_read_lock(&kvm->irq_srcu); - irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu); - BUG_ON(guest_irq >= irq_rt->nr_rt_entries); - - hlist_for_each_entry(e, &irq_rt->map[guest_irq], link) { - if (e->type != KVM_IRQ_ROUTING_MSI) - continue; - /* - * VT-d PI cannot support posting multicast/broadcast - * interrupts to a vCPU, we still use interrupt remapping - * for these kind of interrupts. - * - * For lowest-priority interrupts, we only support - * those with single CPU as the destination, e.g. user - * configures the interrupts via /proc/irq or uses - * irqbalance to make the interrupts single-CPU. - * - * We will support full lowest-priority interrupt later. - */ - - kvm_set_msi_irq(kvm, e, &irq); - if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu)) { - /* - * Make sure the IRTE is in remapped mode if - * we don't handle it in posted mode. 
- */ - ret = irq_set_vcpu_affinity(host_irq, NULL); - if (ret < 0) { - printk(KERN_INFO - "failed to back to remapped mode, irq: %u\n", - host_irq); - goto out; - } - - continue; - } - - vcpu_info.pi_desc_addr = __pa(vcpu_to_pi_desc(vcpu)); - vcpu_info.vector = irq.vector; - - trace_kvm_pi_irte_update(vcpu->vcpu_id, host_irq, e->gsi, - vcpu_info.vector, vcpu_info.pi_desc_addr, set); - - if (set) - ret = irq_set_vcpu_affinity(host_irq, &vcpu_info); - else { - /* suppress notification event before unposting */ - pi_set_sn(vcpu_to_pi_desc(vcpu)); - ret = irq_set_vcpu_affinity(host_irq, NULL); - pi_clear_sn(vcpu_to_pi_desc(vcpu)); - } - - if (ret < 0) { - printk(KERN_INFO "%s: failed to update PI IRTE\n", - __func__); - goto out; - } - } - - ret = 0; -out: - srcu_read_unlock(&kvm->irq_srcu, idx); - return ret; -} - -static void vmx_setup_mce(struct kvm_vcpu *vcpu) -{ - if (vcpu->arch.mcg_cap & MCG_LMCE_P) - to_vmx(vcpu)->msr_ia32_feature_control_valid_bits |= - FEATURE_CONTROL_LMCE; - else - to_vmx(vcpu)->msr_ia32_feature_control_valid_bits &= - ~FEATURE_CONTROL_LMCE; -} - -static struct kvm_x86_ops vmx_x86_ops __ro_after_init = { +static struct kvm_x86_ops vmx_x86_ops = { .cpu_has_kvm_support = cpu_has_kvm_support, .disabled_by_bios = vmx_disabled_by_bios, .hardware_setup = hardware_setup, @@ -11233,7 +9124,8 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = { .vcpu_free = vmx_free_vcpu, .vcpu_reset = vmx_vcpu_reset, - .prepare_guest_switch = vmx_save_host_state, + .save_host_state = vmx_save_host_state, + .load_host_state = vmx_load_host_state, .vcpu_load = vmx_vcpu_load, .vcpu_put = vmx_vcpu_put, @@ -11264,11 +9156,6 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = { .get_rflags = vmx_get_rflags, .set_rflags = vmx_set_rflags, - .get_pkru = vmx_get_pkru, - - .fpu_activate = vmx_fpu_activate, - .fpu_deactivate = vmx_fpu_deactivate, - .tlb_flush = vmx_flush_tlb, .run = vmx_vcpu_run, @@ -11276,7 +9163,6 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = { .skip_emulated_instruction = skip_emulated_instruction, .set_interrupt_shadow = vmx_set_interrupt_shadow, .get_interrupt_shadow = vmx_get_interrupt_shadow, - .patch_hypercall = vmx_patch_hypercall, .set_irq = vmx_inject_irq, .set_nmi = vmx_inject_nmi, .queue_exception = vmx_queue_exception, @@ -11295,8 +9181,6 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = { .load_eoi_exitmap = vmx_load_eoi_exitmap, .hwapic_irr_update = vmx_hwapic_irr_update, .hwapic_isr_update = vmx_hwapic_isr_update, - .sync_pir_to_irr = vmx_sync_pir_to_irr, - .deliver_posted_interrupt = vmx_deliver_posted_interrupt, .set_tss_addr = vmx_set_tss_addr, .get_tdp_level = get_ept_level, @@ -11326,52 +9210,22 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = { .check_nested_events = vmx_check_nested_events, - .sched_in = vmx_sched_in, - .slot_enable_log_dirty = vmx_slot_enable_log_dirty, .slot_disable_log_dirty = vmx_slot_disable_log_dirty, .flush_log_dirty = vmx_flush_log_dirty, .enable_log_dirty_pt_masked = vmx_enable_log_dirty_pt_masked, - .pre_block = vmx_pre_block, - .post_block = vmx_post_block, - - .pmu_ops = &intel_pmu_ops, - .update_pi_irte = vmx_update_pi_irte, - -#ifdef CONFIG_X86_64 - .set_hv_timer = vmx_set_hv_timer, - .cancel_hv_timer = vmx_cancel_hv_timer, -#endif - - .setup_mce = vmx_setup_mce, + //.pmu_ops = &intel_pmu_ops, }; -static int __init vmx_init(void) +int vmx_init(void) { - int r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx), - __alignof__(struct vcpu_vmx), THIS_MODULE); - if (r) - return r; - -#ifdef 
CONFIG_KEXEC_CORE - rcu_assign_pointer(crash_vmclear_loaded_vmcss, - crash_vmclear_local_loaded_vmcss); -#endif - - return 0; + return kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx), 0); } -static void __exit vmx_exit(void) +void vmx_exit(void) { -#ifdef CONFIG_KEXEC_CORE - RCU_INIT_POINTER(crash_vmclear_loaded_vmcss, NULL); - synchronize_rcu(); -#endif - kvm_exit(); } -module_init(vmx_init) -module_exit(vmx_exit) diff --git a/arch/x86/kvm/vmx_def.h b/arch/x86/kvm/vmx_def.h new file mode 100755 index 0000000..89ff76a --- /dev/null +++ b/arch/x86/kvm/vmx_def.h @@ -0,0 +1,425 @@ +/* + * Kernel-based Virtual Machine driver for Linux + * + * This module enables machines with Intel VT-x extensions to run virtual + * machines without emulation or binary translation. + * + * Copyright (C) 2006 Qumranet, Inc. + * Copyright 2010 Red Hat, Inc. and/or its affiliates. + * Copyright 2019 Google LLC + * + * Authors: + * Avi Kivity <avi@qumranet.com> + * Yaniv Kamay <yaniv@qumranet.com> + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + */ + +#include "irq.h" +#include "mmu.h" +#include "cpuid.h" +#include "lapic.h" + +#include <linux/kvm_host.h> +#include <linux/list.h> +#include <ntkrutils.h> +#include <__asm.h> +#include "kvm_cache_regs.h" +#include "x86.h" +#include <asm/vmx.h> + +#include "pmu.h" + +/* MTRR memory types, which are defined in SDM */ +#define MTRR_TYPE_UNCACHABLE 0 +#define MTRR_TYPE_WRCOMB 1 +/*#define MTRR_TYPE_ 2*/ +/*#define MTRR_TYPE_ 3*/ +#define MTRR_TYPE_WRTHROUGH 4 +#define MTRR_TYPE_WRPROT 5 +#define MTRR_TYPE_WRBACK 6 +#define MTRR_NUM_TYPES 7 + + +#define GVM_GUEST_CR0_MASK (X86_CR0_NW | X86_CR0_CD) +#define GVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST (X86_CR0_WP | X86_CR0_NE) +#define GVM_VM_CR0_ALWAYS_ON \ + (GVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE) +#define GVM_CR4_GUEST_OWNED_BITS \ + (X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR \ + | X86_CR4_OSXMMEXCPT | X86_CR4_TSD) + +#define GVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE) +#define GVM_RMODE_VM_CR4_ALWAYS_ON (X86_CR4_VME | X86_CR4_PAE | X86_CR4_VMXE) + +#define RMODE_GUEST_OWNED_EFLAGS_BITS (~(X86_EFLAGS_IOPL | X86_EFLAGS_VM)) + +#define VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE 5 + +#define NR_AUTOLOAD_MSRS 8 +#define VMCS02_POOL_SIZE 1 + +struct vmcs { + u32 revision_id; + u32 abort; + char data[1016]; +}; + +/* + * Track a VMCS that may be loaded on a certain CPU. If it is (cpu!=-1), also + * remember whether it was VMLAUNCHed, and maintain a linked list of all VMCSs + * loaded on this CPU (so we can clear them if the CPU goes down). + */ +struct loaded_vmcs { + struct vmcs *vmcs; + struct vmcs *shadow_vmcs; + int cpu; + int launched; +}; + +/* + * struct vmcs12 describes the state that our guest hypervisor (L1) keeps for a + * single nested guest (L2), hence the name vmcs12. Any VMX implementation has + * a VMCS structure, and vmcs12 is our emulated VMX's VMCS. This structure is + * stored in guest memory specified by VMPTRLD, but is opaque to the guest, + * which must access it using VMREAD/VMWRITE/VMCLEAR instructions. + * More than one of these structures may exist, if L1 runs multiple L2 guests. + * nested_vmx_run() will use the data here to build a vmcs02: a VMCS for the + * underlying hardware which will be used to run L2. + * This structure is packed to ensure that its layout is identical across + * machines (necessary for live migration). 
+ * If there are changes in this struct, VMCS12_REVISION must be changed. + */ +typedef u64 natural_width; +struct __packed vmcs12 { + /* According to the Intel spec, a VMCS region must start with the + * following two fields. Then follow implementation-specific data. + */ + u32 revision_id; + u32 abort; + + u32 launch_state; /* set to 0 by VMCLEAR, to 1 by VMLAUNCH */ + u32 padding[7]; /* room for future expansion */ + + u64 io_bitmap_a; + u64 io_bitmap_b; + u64 msr_bitmap; + u64 vm_exit_msr_store_addr; + u64 vm_exit_msr_load_addr; + u64 vm_entry_msr_load_addr; + u64 tsc_offset; + u64 virtual_apic_page_addr; + u64 apic_access_addr; + u64 posted_intr_desc_addr; + u64 ept_pointer; + u64 eoi_exit_bitmap0; + u64 eoi_exit_bitmap1; + u64 eoi_exit_bitmap2; + u64 eoi_exit_bitmap3; + u64 xss_exit_bitmap; + u64 guest_physical_address; + u64 vmcs_link_pointer; + u64 guest_ia32_debugctl; + u64 guest_ia32_pat; + u64 guest_ia32_efer; + u64 guest_ia32_perf_global_ctrl; + u64 guest_pdptr0; + u64 guest_pdptr1; + u64 guest_pdptr2; + u64 guest_pdptr3; + u64 guest_bndcfgs; + u64 host_ia32_pat; + u64 host_ia32_efer; + u64 host_ia32_perf_global_ctrl; + u64 padding64[8]; /* room for future expansion */ + /* + * To allow migration of L1 (complete with its L2 guests) between + * machines of different natural widths (32 or 64 bit), we cannot have + * size_t fields with no explict size. We use u64 (aliased + * natural_width) instead. Luckily, x86 is little-endian. + */ + natural_width cr0_guest_host_mask; + natural_width cr4_guest_host_mask; + natural_width cr0_read_shadow; + natural_width cr4_read_shadow; + natural_width cr3_target_value0; + natural_width cr3_target_value1; + natural_width cr3_target_value2; + natural_width cr3_target_value3; + natural_width exit_qualification; + natural_width guest_linear_address; + natural_width guest_cr0; + natural_width guest_cr3; + natural_width guest_cr4; + natural_width guest_es_base; + natural_width guest_cs_base; + natural_width guest_ss_base; + natural_width guest_ds_base; + natural_width guest_fs_base; + natural_width guest_gs_base; + natural_width guest_ldtr_base; + natural_width guest_tr_base; + natural_width guest_gdtr_base; + natural_width guest_idtr_base; + natural_width guest_dr7; + natural_width guest_rsp; + natural_width guest_rip; + natural_width guest_rflags; + natural_width guest_pending_dbg_exceptions; + natural_width guest_sysenter_esp; + natural_width guest_sysenter_eip; + natural_width host_cr0; + natural_width host_cr3; + natural_width host_cr4; + natural_width host_fs_base; + natural_width host_gs_base; + natural_width host_tr_base; + natural_width host_gdtr_base; + natural_width host_idtr_base; + natural_width host_ia32_sysenter_esp; + natural_width host_ia32_sysenter_eip; + natural_width host_rsp; + natural_width host_rip; + natural_width paddingl[8]; /* room for future expansion */ + u32 pin_based_vm_exec_control; + u32 cpu_based_vm_exec_control; + u32 exception_bitmap; + u32 page_fault_error_code_mask; + u32 page_fault_error_code_match; + u32 cr3_target_count; + u32 vm_exit_controls; + u32 vm_exit_msr_store_count; + u32 vm_exit_msr_load_count; + u32 vm_entry_controls; + u32 vm_entry_msr_load_count; + u32 vm_entry_intr_info_field; + u32 vm_entry_exception_error_code; + u32 vm_entry_instruction_len; + u32 tpr_threshold; + u32 secondary_vm_exec_control; + u32 vm_instruction_error; + u32 vm_exit_reason; + u32 vm_exit_intr_info; + u32 vm_exit_intr_error_code; + u32 idt_vectoring_info_field; + u32 idt_vectoring_error_code; + u32 vm_exit_instruction_len; + 
u32 vmx_instruction_info; + u32 guest_es_limit; + u32 guest_cs_limit; + u32 guest_ss_limit; + u32 guest_ds_limit; + u32 guest_fs_limit; + u32 guest_gs_limit; + u32 guest_ldtr_limit; + u32 guest_tr_limit; + u32 guest_gdtr_limit; + u32 guest_idtr_limit; + u32 guest_es_ar_bytes; + u32 guest_cs_ar_bytes; + u32 guest_ss_ar_bytes; + u32 guest_ds_ar_bytes; + u32 guest_fs_ar_bytes; + u32 guest_gs_ar_bytes; + u32 guest_ldtr_ar_bytes; + u32 guest_tr_ar_bytes; + u32 guest_interruptibility_info; + u32 guest_activity_state; + u32 guest_sysenter_cs; + u32 host_ia32_sysenter_cs; + u32 vmx_preemption_timer_value; + u32 padding32[7]; /* room for future expansion */ + u16 virtual_processor_id; + u16 posted_intr_nv; + u16 guest_es_selector; + u16 guest_cs_selector; + u16 guest_ss_selector; + u16 guest_ds_selector; + u16 guest_fs_selector; + u16 guest_gs_selector; + u16 guest_ldtr_selector; + u16 guest_tr_selector; + u16 guest_intr_status; + u16 host_es_selector; + u16 host_cs_selector; + u16 host_ss_selector; + u16 host_ds_selector; + u16 host_fs_selector; + u16 host_gs_selector; + u16 host_tr_selector; +}; + +/* + * VMCS12_REVISION is an arbitrary id that should be changed if the content or + * layout of struct vmcs12 is changed. MSR_IA32_VMX_BASIC returns this id, and + * VMPTRLD verifies that the VMCS region that L1 is loading contains this id. + */ +#define VMCS12_REVISION 0x11e57ed0 + +/* + * VMCS12_SIZE is the number of bytes L1 should allocate for the VMXON region + * and any VMCS region. Although only sizeof(struct vmcs12) are used by the + * current implementation, 4K are reserved to avoid future complications. + */ +#define VMCS12_SIZE 0x1000 + +/* Used to remember the last vmcs02 used for some recently used vmcs12s */ +struct vmcs02_list { + struct list_head list; + gpa_t vmptr; + struct loaded_vmcs vmcs02; +}; + +/* + * The nested_vmx structure is part of vcpu_vmx, and holds information we need + * for correct emulation of VMX (i.e., nested VMX) on this vcpu. + */ +struct nested_vmx { + /* Has the level1 guest done vmxon? */ + bool vmxon; + gpa_t vmxon_ptr; + + /* The guest-physical address of the current VMCS L1 keeps for L2 */ + gpa_t current_vmptr; + /* The host-usable pointer to the above */ + PMDL current_vmcs12_mdl; + struct vmcs12 *current_vmcs12; + /* + * Cache of the guest's VMCS, existing outside of guest memory. + * Loaded from guest memory during VMPTRLD. Flushed to guest + * memory during VMXOFF, VMCLEAR, VMPTRLD. + */ + struct vmcs12 *cached_vmcs12; + /* + * Indicates if the shadow vmcs must be updated with the + * data hold by vmcs12 + */ + bool sync_shadow_vmcs; + + /* vmcs02_list cache of VMCSs recently used to run L2 guests */ + struct list_head vmcs02_pool; + int vmcs02_num; + bool change_vmcs01_virtual_x2apic_mode; + /* L2 must run next, and mustn't decide to exit to L1. */ + bool nested_run_pending; + /* + * Guest pages referred to in vmcs02 with host-physical pointers, so + * we must keep them pinned while L2 runs. 
+ */ + PMDL apic_access_mdl; + PMDL virtual_apic_mdl; + + size_t *msr_bitmap; + + /* to migrate it to L2 if VM_ENTRY_LOAD_DEBUG_CONTROLS is off */ + u64 vmcs01_debugctl; + + u16 vpid02; + u16 last_vpid; + + u32 nested_vmx_procbased_ctls_low; + u32 nested_vmx_procbased_ctls_high; + u32 nested_vmx_true_procbased_ctls_low; + u32 nested_vmx_secondary_ctls_low; + u32 nested_vmx_secondary_ctls_high; + u32 nested_vmx_pinbased_ctls_low; + u32 nested_vmx_pinbased_ctls_high; + u32 nested_vmx_exit_ctls_low; + u32 nested_vmx_exit_ctls_high; + u32 nested_vmx_true_exit_ctls_low; + u32 nested_vmx_entry_ctls_low; + u32 nested_vmx_entry_ctls_high; + u32 nested_vmx_true_entry_ctls_low; + u32 nested_vmx_misc_low; + u32 nested_vmx_misc_high; + u32 nested_vmx_ept_caps; + u32 nested_vmx_vpid_caps; +}; + +struct vcpu_vmx { + struct kvm_vcpu vcpu; + size_t host_rsp; + u8 fail; + bool nmi_known_unmasked; + u32 exit_intr_info; + u32 idt_vectoring_info; + ulong rflags; +#ifdef CONFIG_X86_64 + u64 msr_host_kernel_gs_base; + u64 msr_guest_kernel_gs_base; +#endif + u32 vm_entry_controls_shadow; + u32 vm_exit_controls_shadow; + /* + * loaded_vmcs points to the VMCS currently used in this vcpu. For a + * non-nested (L1) guest, it always points to vmcs01. For a nested + * guest (L2), it points to a different VMCS. + */ + struct loaded_vmcs vmcs01; + struct loaded_vmcs *loaded_vmcs; + bool __launched; /* temporary, used in vmx_vcpu_run */ + struct msr_autoload { + unsigned nr; + struct vmx_msr_entry guest[NR_AUTOLOAD_MSRS]; + struct vmx_msr_entry host[NR_AUTOLOAD_MSRS]; + } msr_autoload; + struct { + u16 fs_sel, gs_sel; +#ifdef CONFIG_X86_64 + u16 ds_sel, es_sel; +#endif + int gs_reload_needed; + int fs_reload_needed; + u64 msr_host_bndcfgs; + size_t vmcs_host_cr4; /* May not match real cr4 */ + } host_state; + struct { + int vm86_active; + ulong save_rflags; + struct kvm_segment segs[8]; + } rmode; + struct { + u32 bitmask; /* 4 bits per segment (1 bit per field) */ + struct kvm_save_segment { + u16 selector; + size_t base; + u32 limit; + u32 ar; + } seg[8]; + } segment_cache; + int vpid; + bool emulation_required; + + /* Support for vnmi-less CPUs */ + int soft_vnmi_blocked; + ktime_t entry_time; + s64 vnmi_blocked_time; + u32 exit_reason; + + /* Support for a guest hypervisor (nested VMX) */ + struct nested_vmx nested; + + /* Support for PML */ +#define PML_ENTITY_NUM 512 + struct page *pml_pg; + + /* + * Only bits masked by msr_ia32_feature_control_valid_bits can be set in + * msr_ia32_feature_control. FEATURE_CONTROL_LOCKED is always included + * in msr_ia32_feature_control_valid_bits. + */ + u64 msr_ia32_feature_control; + u64 msr_ia32_feature_control_valid_bits; +}; + +enum segment_cache_field { + SEG_FIELD_SEL = 0, + SEG_FIELD_BASE = 1, + SEG_FIELD_LIMIT = 2, + SEG_FIELD_AR = 3, + + SEG_FIELD_NR = 4 +}; + diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 04c5d96..44637f3 100644..100755 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7,6 +7,7 @@ * Copyright (C) 2008 Qumranet, Inc. * Copyright IBM Corporation, 2008 * Copyright 2010 Red Hat, Inc. and/or its affiliates. 
+ * Copyright 2019 Google LLC * * Authors: * Avi Kivity <avi@qumranet.com> @@ -19,67 +20,28 @@ * */ +#include <gvm_types.h> +#include <ntkrutils.h> +#include <gvm-main.h> #include <linux/kvm_host.h> #include "irq.h" #include "mmu.h" -#include "i8254.h" #include "tss.h" #include "kvm_cache_regs.h" #include "x86.h" #include "cpuid.h" -#include "assigned-dev.h" #include "pmu.h" -#include "hyperv.h" - -#include <linux/clocksource.h> -#include <linux/interrupt.h> -#include <linux/kvm.h> -#include <linux/fs.h> -#include <linux/vmalloc.h> -#include <linux/export.h> -#include <linux/moduleparam.h> -#include <linux/mman.h> -#include <linux/highmem.h> -#include <linux/iommu.h> -#include <linux/intel-iommu.h> -#include <linux/cpufreq.h> -#include <linux/user-return-notifier.h> -#include <linux/srcu.h> -#include <linux/slab.h> -#include <linux/perf_event.h> -#include <linux/uaccess.h> -#include <linux/hash.h> -#include <linux/pci.h> -#include <linux/timekeeper_internal.h> -#include <linux/pvclock_gtod.h> -#include <linux/kvm_irqfd.h> -#include <linux/irqbypass.h> -#include <trace/events/kvm.h> - -#include <asm/debugreg.h> -#include <asm/msr.h> -#include <asm/desc.h> -#include <asm/mce.h> -#include <linux/kernel_stat.h> -#include <asm/fpu/internal.h> /* Ugh! */ -#include <asm/pvclock.h> -#include <asm/div64.h> -#include <asm/irq_remapping.h> - -#define CREATE_TRACE_POINTS -#include "trace.h" +#include <asm/vmx.h> + #define MAX_IO_MSRS 256 -#define KVM_MAX_MCE_BANKS 32 -u64 __read_mostly kvm_mce_cap_supported = MCG_CTL_P | MCG_SER_P; -EXPORT_SYMBOL_GPL(kvm_mce_cap_supported); #define emul_to_vcpu(ctxt) \ container_of(ctxt, struct kvm_vcpu, arch.emulate_ctxt) /* EFER defaults: - * - enable syscall per default because its emulated by KVM - * - enable LME and LMA per default on 64 bit KVM + * - enable syscall per default because its emulated by kvm + * - enable LME and LMA per default on 64 bit kvm */ #ifdef CONFIG_X86_64 static @@ -88,219 +50,39 @@ u64 __read_mostly efer_reserved_bits = ~((u64)(EFER_SCE | EFER_LME | EFER_LMA)); static u64 __read_mostly efer_reserved_bits = ~((u64)EFER_SCE); #endif -#define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM -#define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU +#define VM_STAT(x) offsetof(struct kvm, stat.x), GVM_STAT_VM +#define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), GVM_STAT_VCPU -#define KVM_X2APIC_API_VALID_FLAGS (KVM_X2APIC_API_USE_32BIT_IDS | \ - KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK) +#define GVM_X2APIC_API_VALID_FLAGS (GVM_X2APIC_API_USE_32BIT_IDS | \ + GVM_X2APIC_API_DISABLE_BROADCAST_QUIRK) static void update_cr8_intercept(struct kvm_vcpu *vcpu); static void process_nmi(struct kvm_vcpu *vcpu); static void enter_smm(struct kvm_vcpu *vcpu); -static void __kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags); +static void __kvm_set_rflags(struct kvm_vcpu *vcpu, size_t rflags); struct kvm_x86_ops *kvm_x86_ops __read_mostly; -EXPORT_SYMBOL_GPL(kvm_x86_ops); static bool __read_mostly ignore_msrs = 0; -module_param(ignore_msrs, bool, S_IRUGO | S_IWUSR); unsigned int min_timer_period_us = 500; -module_param(min_timer_period_us, uint, S_IRUGO | S_IWUSR); - -static bool __read_mostly kvmclock_periodic_sync = true; -module_param(kvmclock_periodic_sync, bool, S_IRUGO); - -bool __read_mostly kvm_has_tsc_control; -EXPORT_SYMBOL_GPL(kvm_has_tsc_control); -u32 __read_mostly kvm_max_guest_tsc_khz; -EXPORT_SYMBOL_GPL(kvm_max_guest_tsc_khz); -u8 __read_mostly kvm_tsc_scaling_ratio_frac_bits; 
-EXPORT_SYMBOL_GPL(kvm_tsc_scaling_ratio_frac_bits); -u64 __read_mostly kvm_max_tsc_scaling_ratio; -EXPORT_SYMBOL_GPL(kvm_max_tsc_scaling_ratio); -u64 __read_mostly kvm_default_tsc_scaling_ratio; -EXPORT_SYMBOL_GPL(kvm_default_tsc_scaling_ratio); /* tsc tolerance in parts per million - default to 1/2 of the NTP threshold */ static u32 __read_mostly tsc_tolerance_ppm = 250; -module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR); /* lapic timer advance (tscdeadline mode only) in nanoseconds */ unsigned int __read_mostly lapic_timer_advance_ns = 0; -module_param(lapic_timer_advance_ns, uint, S_IRUGO | S_IWUSR); static bool __read_mostly vector_hashing = true; -module_param(vector_hashing, bool, S_IRUGO); static bool __read_mostly backwards_tsc_observed = false; -#define KVM_NR_SHARED_MSRS 16 - -struct kvm_shared_msrs_global { - int nr; - u32 msrs[KVM_NR_SHARED_MSRS]; -}; - -struct kvm_shared_msrs { - struct user_return_notifier urn; - bool registered; - struct kvm_shared_msr_values { - u64 host; - u64 curr; - } values[KVM_NR_SHARED_MSRS]; -}; - -static struct kvm_shared_msrs_global __read_mostly shared_msrs_global; -static struct kvm_shared_msrs __percpu *shared_msrs; - -struct kvm_stats_debugfs_item debugfs_entries[] = { - { "pf_fixed", VCPU_STAT(pf_fixed) }, - { "pf_guest", VCPU_STAT(pf_guest) }, - { "tlb_flush", VCPU_STAT(tlb_flush) }, - { "invlpg", VCPU_STAT(invlpg) }, - { "exits", VCPU_STAT(exits) }, - { "io_exits", VCPU_STAT(io_exits) }, - { "mmio_exits", VCPU_STAT(mmio_exits) }, - { "signal_exits", VCPU_STAT(signal_exits) }, - { "irq_window", VCPU_STAT(irq_window_exits) }, - { "nmi_window", VCPU_STAT(nmi_window_exits) }, - { "halt_exits", VCPU_STAT(halt_exits) }, - { "halt_successful_poll", VCPU_STAT(halt_successful_poll) }, - { "halt_attempted_poll", VCPU_STAT(halt_attempted_poll) }, - { "halt_poll_invalid", VCPU_STAT(halt_poll_invalid) }, - { "halt_wakeup", VCPU_STAT(halt_wakeup) }, - { "hypercalls", VCPU_STAT(hypercalls) }, - { "request_irq", VCPU_STAT(request_irq_exits) }, - { "irq_exits", VCPU_STAT(irq_exits) }, - { "host_state_reload", VCPU_STAT(host_state_reload) }, - { "efer_reload", VCPU_STAT(efer_reload) }, - { "fpu_reload", VCPU_STAT(fpu_reload) }, - { "insn_emulation", VCPU_STAT(insn_emulation) }, - { "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) }, - { "irq_injections", VCPU_STAT(irq_injections) }, - { "nmi_injections", VCPU_STAT(nmi_injections) }, - { "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) }, - { "mmu_pte_write", VM_STAT(mmu_pte_write) }, - { "mmu_pte_updated", VM_STAT(mmu_pte_updated) }, - { "mmu_pde_zapped", VM_STAT(mmu_pde_zapped) }, - { "mmu_flooded", VM_STAT(mmu_flooded) }, - { "mmu_recycled", VM_STAT(mmu_recycled) }, - { "mmu_cache_miss", VM_STAT(mmu_cache_miss) }, - { "mmu_unsync", VM_STAT(mmu_unsync) }, - { "remote_tlb_flush", VM_STAT(remote_tlb_flush) }, - { "largepages", VM_STAT(lpages) }, - { NULL } -}; - u64 __read_mostly host_xcr0; -static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt); - -static inline void kvm_async_pf_hash_reset(struct kvm_vcpu *vcpu) -{ - int i; - for (i = 0; i < roundup_pow_of_two(ASYNC_PF_PER_VCPU); i++) - vcpu->arch.apf.gfns[i] = ~0; -} - -static void kvm_on_user_return(struct user_return_notifier *urn) -{ - unsigned slot; - struct kvm_shared_msrs *locals - = container_of(urn, struct kvm_shared_msrs, urn); - struct kvm_shared_msr_values *values; - unsigned long flags; - - /* - * Disabling irqs at this point since the following code could be - * interrupted and executed through kvm_arch_hardware_disable() 
- */ - local_irq_save(flags); - if (locals->registered) { - locals->registered = false; - user_return_notifier_unregister(urn); - } - local_irq_restore(flags); - for (slot = 0; slot < shared_msrs_global.nr; ++slot) { - values = &locals->values[slot]; - if (values->host != values->curr) { - wrmsrl(shared_msrs_global.msrs[slot], values->host); - values->curr = values->host; - } - } -} - -static void shared_msr_update(unsigned slot, u32 msr) -{ - u64 value; - unsigned int cpu = smp_processor_id(); - struct kvm_shared_msrs *smsr = per_cpu_ptr(shared_msrs, cpu); - - /* only read, and nobody should modify it at this time, - * so don't need lock */ - if (slot >= shared_msrs_global.nr) { - printk(KERN_ERR "kvm: invalid MSR slot!"); - return; - } - rdmsrl_safe(msr, &value); - smsr->values[slot].host = value; - smsr->values[slot].curr = value; -} - -void kvm_define_shared_msr(unsigned slot, u32 msr) -{ - BUG_ON(slot >= KVM_NR_SHARED_MSRS); - shared_msrs_global.msrs[slot] = msr; - if (slot >= shared_msrs_global.nr) - shared_msrs_global.nr = slot + 1; -} -EXPORT_SYMBOL_GPL(kvm_define_shared_msr); - -static void kvm_shared_msr_cpu_online(void) -{ - unsigned i; - - for (i = 0; i < shared_msrs_global.nr; ++i) - shared_msr_update(i, shared_msrs_global.msrs[i]); -} - -int kvm_set_shared_msr(unsigned slot, u64 value, u64 mask) -{ - unsigned int cpu = smp_processor_id(); - struct kvm_shared_msrs *smsr = per_cpu_ptr(shared_msrs, cpu); - int err; - - if (((value ^ smsr->values[slot].curr) & mask) == 0) - return 0; - smsr->values[slot].curr = value; - err = wrmsrl_safe(shared_msrs_global.msrs[slot], value); - if (err) - return 1; - - if (!smsr->registered) { - smsr->urn.on_user_return = kvm_on_user_return; - user_return_notifier_register(&smsr->urn); - smsr->registered = true; - } - return 0; -} -EXPORT_SYMBOL_GPL(kvm_set_shared_msr); - -static void drop_user_return_notifiers(void) -{ - unsigned int cpu = smp_processor_id(); - struct kvm_shared_msrs *smsr = per_cpu_ptr(shared_msrs, cpu); - - if (smsr->registered) - kvm_on_user_return(&smsr->urn); -} - u64 kvm_get_apic_base(struct kvm_vcpu *vcpu) { return vcpu->arch.apic_base; } -EXPORT_SYMBOL_GPL(kvm_get_apic_base); int kvm_set_apic_base(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { @@ -323,14 +105,6 @@ int kvm_set_apic_base(struct kvm_vcpu *vcpu, struct msr_data *msr_info) kvm_lapic_set_base(vcpu, msr_info->data); return 0; } -EXPORT_SYMBOL_GPL(kvm_set_apic_base); - -asmlinkage __visible void kvm_spurious_fault(void) -{ - /* Fault while not rebooting. We want the trace. 
*/ - BUG(); -} -EXPORT_SYMBOL_GPL(kvm_spurious_fault); #define EXCPT_BENIGN 0 #define EXCPT_CONTRIBUTORY 1 @@ -385,7 +159,7 @@ static void kvm_multiple_exception(struct kvm_vcpu *vcpu, u32 prev_nr; int class1, class2; - kvm_make_request(KVM_REQ_EVENT, vcpu); + kvm_make_request(GVM_REQ_EVENT, vcpu); if (!vcpu->arch.exception.pending) { queue: @@ -403,7 +177,7 @@ static void kvm_multiple_exception(struct kvm_vcpu *vcpu, prev_nr = vcpu->arch.exception.nr; if (prev_nr == DF_VECTOR) { /* triple fault -> shutdown */ - kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); + kvm_make_request(GVM_REQ_TRIPLE_FAULT, vcpu); return; } class1 = exception_class(prev_nr); @@ -426,13 +200,11 @@ void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr) { kvm_multiple_exception(vcpu, nr, false, 0, false); } -EXPORT_SYMBOL_GPL(kvm_queue_exception); void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr) { kvm_multiple_exception(vcpu, nr, false, 0, true); } -EXPORT_SYMBOL_GPL(kvm_requeue_exception); void kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err) { @@ -441,7 +213,6 @@ void kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err) else kvm_x86_ops->skip_emulated_instruction(vcpu); } -EXPORT_SYMBOL_GPL(kvm_complete_insn_gp); void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault) { @@ -449,7 +220,6 @@ void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault) vcpu->arch.cr2 = fault->address; kvm_queue_exception_e(vcpu, PF_VECTOR, fault->error_code); } -EXPORT_SYMBOL_GPL(kvm_inject_page_fault); static bool kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault) { @@ -464,21 +234,18 @@ static bool kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fau void kvm_inject_nmi(struct kvm_vcpu *vcpu) { atomic_inc(&vcpu->arch.nmi_queued); - kvm_make_request(KVM_REQ_NMI, vcpu); + kvm_make_request(GVM_REQ_NMI, vcpu); } -EXPORT_SYMBOL_GPL(kvm_inject_nmi); void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code) { kvm_multiple_exception(vcpu, nr, true, error_code, false); } -EXPORT_SYMBOL_GPL(kvm_queue_exception_e); void kvm_requeue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code) { kvm_multiple_exception(vcpu, nr, true, error_code, true); } -EXPORT_SYMBOL_GPL(kvm_requeue_exception_e); /* * Checks if cpl <= required_cpl; if true, return true. Otherwise queue @@ -491,7 +258,6 @@ bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl) kvm_queue_exception_e(vcpu, GP_VECTOR, 0); return false; } -EXPORT_SYMBOL_GPL(kvm_require_cpl); bool kvm_require_dr(struct kvm_vcpu *vcpu, int dr) { @@ -501,7 +267,6 @@ bool kvm_require_dr(struct kvm_vcpu *vcpu, int dr) kvm_queue_exception(vcpu, UD_VECTOR); return false; } -EXPORT_SYMBOL_GPL(kvm_require_dr); /* * This function will be used to read from the physical memory of the currently @@ -525,7 +290,6 @@ int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, return kvm_vcpu_read_guest_page(vcpu, real_gfn, data, offset, len); } -EXPORT_SYMBOL_GPL(kvm_read_guest_page_mmu); static int kvm_read_nested_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, void *data, int offset, int len, u32 access) @@ -537,7 +301,7 @@ static int kvm_read_nested_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, /* * Load the pae pdptrs. Return true is they are all valid. 
*/ -int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3) +int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, size_t cr3) { gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT; unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2; @@ -564,14 +328,13 @@ int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3) memcpy(mmu->pdptrs, pdpte, sizeof(mmu->pdptrs)); __set_bit(VCPU_EXREG_PDPTR, - (unsigned long *)&vcpu->arch.regs_avail); + (size_t *)&vcpu->arch.regs_avail); __set_bit(VCPU_EXREG_PDPTR, - (unsigned long *)&vcpu->arch.regs_dirty); + (size_t *)&vcpu->arch.regs_dirty); out: return ret; } -EXPORT_SYMBOL_GPL(load_pdptrs); static bool pdptrs_changed(struct kvm_vcpu *vcpu) { @@ -585,7 +348,7 @@ static bool pdptrs_changed(struct kvm_vcpu *vcpu) return false; if (!test_bit(VCPU_EXREG_PDPTR, - (unsigned long *)&vcpu->arch.regs_avail)) + (size_t *)&vcpu->arch.regs_avail)) return true; gfn = (kvm_read_cr3(vcpu) & ~31u) >> PAGE_SHIFT; @@ -600,10 +363,10 @@ out: return changed; } -int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) +int kvm_set_cr0(struct kvm_vcpu *vcpu, size_t cr0) { - unsigned long old_cr0 = kvm_read_cr0(vcpu); - unsigned long update_bits = X86_CR0_PG | X86_CR0_WP; + size_t old_cr0 = kvm_read_cr0(vcpu); + size_t update_bits = X86_CR0_PG | X86_CR0_WP; cr0 |= X86_CR0_ET; @@ -642,28 +405,21 @@ int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) kvm_x86_ops->set_cr0(vcpu, cr0); - if ((cr0 ^ old_cr0) & X86_CR0_PG) { - kvm_clear_async_pf_completion_queue(vcpu); - kvm_async_pf_hash_reset(vcpu); - } - if ((cr0 ^ old_cr0) & update_bits) kvm_mmu_reset_context(vcpu); if (((cr0 ^ old_cr0) & X86_CR0_CD) && - kvm_arch_has_noncoherent_dma(vcpu->kvm) && - !kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED)) + //kvm_arch_has_noncoherent_dma(vcpu->kvm) && + !kvm_check_has_quirk(vcpu->kvm, GVM_X86_QUIRK_CD_NW_CLEARED)) kvm_zap_gfn_range(vcpu->kvm, 0, ~0ULL); return 0; } -EXPORT_SYMBOL_GPL(kvm_set_cr0); -void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw) +void kvm_lmsw(struct kvm_vcpu *vcpu, size_t msw) { (void)kvm_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~0x0eul) | (msw & 0x0f)); } -EXPORT_SYMBOL_GPL(kvm_lmsw); static void kvm_load_guest_xcr0(struct kvm_vcpu *vcpu) { @@ -733,12 +489,11 @@ int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) } return 0; } -EXPORT_SYMBOL_GPL(kvm_set_xcr); -int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) +int kvm_set_cr4(struct kvm_vcpu *vcpu, size_t cr4) { - unsigned long old_cr4 = kvm_read_cr4(vcpu); - unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE | + size_t old_cr4 = kvm_read_cr4(vcpu); + size_t pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE; if (cr4 & CR4_RESERVED_BITS) @@ -789,9 +544,8 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) return 0; } -EXPORT_SYMBOL_GPL(kvm_set_cr4); -int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) +int kvm_set_cr3(struct kvm_vcpu *vcpu, size_t cr3) { #ifdef CONFIG_X86_64 cr3 &= ~CR3_PCID_INVD; @@ -799,7 +553,7 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) if (cr3 == kvm_read_cr3(vcpu) && !pdptrs_changed(vcpu)) { kvm_mmu_sync_roots(vcpu); - kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); + kvm_make_request(GVM_REQ_TLB_FLUSH, vcpu); return 0; } @@ -815,9 +569,8 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) kvm_mmu_new_cr3(vcpu); return 0; } -EXPORT_SYMBOL_GPL(kvm_set_cr3); -int kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8) +int 
kvm_set_cr8(struct kvm_vcpu *vcpu, size_t cr8) { if (cr8 & CR8_RESERVED_BITS) return 1; @@ -827,46 +580,44 @@ int kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8) vcpu->arch.cr8 = cr8; return 0; } -EXPORT_SYMBOL_GPL(kvm_set_cr8); -unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu) +size_t kvm_get_cr8(struct kvm_vcpu *vcpu) { if (lapic_in_kernel(vcpu)) return kvm_lapic_get_cr8(vcpu); else return vcpu->arch.cr8; } -EXPORT_SYMBOL_GPL(kvm_get_cr8); static void kvm_update_dr0123(struct kvm_vcpu *vcpu) { int i; - if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) { - for (i = 0; i < KVM_NR_DB_REGS; i++) + if (!(vcpu->guest_debug & GVM_GUESTDBG_USE_HW_BP)) { + for (i = 0; i < GVM_NR_DB_REGS; i++) vcpu->arch.eff_db[i] = vcpu->arch.db[i]; - vcpu->arch.switch_db_regs |= KVM_DEBUGREG_RELOAD; + vcpu->arch.switch_db_regs |= GVM_DEBUGREG_RELOAD; } } static void kvm_update_dr6(struct kvm_vcpu *vcpu) { - if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) + if (!(vcpu->guest_debug & GVM_GUESTDBG_USE_HW_BP)) kvm_x86_ops->set_dr6(vcpu, vcpu->arch.dr6); } static void kvm_update_dr7(struct kvm_vcpu *vcpu) { - unsigned long dr7; + size_t dr7; - if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) + if (vcpu->guest_debug & GVM_GUESTDBG_USE_HW_BP) dr7 = vcpu->arch.guest_debug_dr7; else dr7 = vcpu->arch.dr7; kvm_x86_ops->set_dr7(vcpu, dr7); - vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_BP_ENABLED; + vcpu->arch.switch_db_regs &= ~GVM_DEBUGREG_BP_ENABLED; if (dr7 & DR7_BP_EN_MASK) - vcpu->arch.switch_db_regs |= KVM_DEBUGREG_BP_ENABLED; + vcpu->arch.switch_db_regs |= GVM_DEBUGREG_BP_ENABLED; } static u64 kvm_dr6_fixed(struct kvm_vcpu *vcpu) @@ -878,12 +629,15 @@ static u64 kvm_dr6_fixed(struct kvm_vcpu *vcpu) return fixed; } -static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val) +static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, size_t val) { switch (dr) { - case 0 ... 3: + case 0: + case 1: + case 2: + case 3: vcpu->arch.db[dr] = val; - if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) + if (!(vcpu->guest_debug & GVM_GUESTDBG_USE_HW_BP)) vcpu->arch.eff_db[dr] = val; break; case 4: @@ -907,7 +661,7 @@ static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val) return 0; } -int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val) +int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, size_t val) { if (__kvm_set_dr(vcpu, dr, val)) { kvm_inject_gp(vcpu, 0); @@ -915,18 +669,20 @@ int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val) } return 0; } -EXPORT_SYMBOL_GPL(kvm_set_dr); -int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val) +int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, size_t *val) { switch (dr) { - case 0 ... 3: + case 0: + case 1: + case 2: + case 3: *val = vcpu->arch.db[dr]; break; case 4: /* fall through */ case 6: - if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) + if (vcpu->guest_debug & GVM_GUESTDBG_USE_HW_BP) *val = vcpu->arch.dr6; else *val = kvm_x86_ops->get_dr6(vcpu); @@ -939,8 +695,8 @@ int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val) } return 0; } -EXPORT_SYMBOL_GPL(kvm_get_dr); +#if 0 bool kvm_rdpmc(struct kvm_vcpu *vcpu) { u32 ecx = kvm_register_read(vcpu, VCPU_REGS_RCX); @@ -954,11 +710,11 @@ bool kvm_rdpmc(struct kvm_vcpu *vcpu) kvm_register_write(vcpu, VCPU_REGS_RDX, data >> 32); return err; } -EXPORT_SYMBOL_GPL(kvm_rdpmc); +#endif /* - * List of msr numbers which we expose to userspace through KVM_GET_MSRS - * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST. 
+ * List of msr numbers which we expose to userspace through GVM_GET_MSRS + * and GVM_SET_MSRS, and GVM_GET_MSR_INDEX_LIST. * * This list is modified at module load time to reflect the * capabilities of the host cpu. This capabilities test skips MSRs that are @@ -972,45 +728,19 @@ static u32 msrs_to_save[] = { #ifdef CONFIG_X86_64 MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR, #endif - MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA, - MSR_IA32_FEATURE_CONTROL, MSR_IA32_BNDCFGS, MSR_TSC_AUX, + MSR_IA32_TSC, MSR_IA32_CR_PAT, //MSR_VM_HSAVE_PA, + MSR_IA32_FEATURE_CONTROL, //MSR_IA32_BNDCFGS, MSR_TSC_AUX, }; static unsigned num_msrs_to_save; -static u32 emulated_msrs[] = { - MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK, - MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW, - HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL, - HV_X64_MSR_TIME_REF_COUNT, HV_X64_MSR_REFERENCE_TSC, - HV_X64_MSR_CRASH_P0, HV_X64_MSR_CRASH_P1, HV_X64_MSR_CRASH_P2, - HV_X64_MSR_CRASH_P3, HV_X64_MSR_CRASH_P4, HV_X64_MSR_CRASH_CTL, - HV_X64_MSR_RESET, - HV_X64_MSR_VP_INDEX, - HV_X64_MSR_VP_RUNTIME, - HV_X64_MSR_SCONTROL, - HV_X64_MSR_STIMER0_CONFIG, - HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME, - MSR_KVM_PV_EOI_EN, - - MSR_IA32_TSC_ADJUST, - MSR_IA32_TSCDEADLINE, - MSR_IA32_MISC_ENABLE, - MSR_IA32_MCG_STATUS, - MSR_IA32_MCG_CTL, - MSR_IA32_MCG_EXT_CTL, - MSR_IA32_SMBASE, -}; - -static unsigned num_emulated_msrs; - bool kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer) { if (efer & efer_reserved_bits) return false; if (efer & EFER_FFXSR) { - struct kvm_cpuid_entry2 *feat; + struct kvm_cpuid_entry *feat; feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0); if (!feat || !(feat->edx & bit(X86_FEATURE_FXSR_OPT))) @@ -1018,7 +748,7 @@ bool kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer) } if (efer & EFER_SVME) { - struct kvm_cpuid_entry2 *feat; + struct kvm_cpuid_entry *feat; feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0); if (!feat || !(feat->ecx & bit(X86_FEATURE_SVM))) @@ -1027,7 +757,6 @@ bool kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer) return true; } -EXPORT_SYMBOL_GPL(kvm_valid_efer); static int set_efer(struct kvm_vcpu *vcpu, u64 efer) { @@ -1056,7 +785,6 @@ void kvm_enable_efer_bits(u64 mask) { efer_reserved_bits &= ~mask; } -EXPORT_SYMBOL_GPL(kvm_enable_efer_bits); /* * Writes msr value into into the appropriate "register". 
@@ -1092,7 +820,6 @@ int kvm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) } return kvm_x86_ops->set_msr(vcpu, msr); } -EXPORT_SYMBOL_GPL(kvm_set_msr); /* * Adapt set_msr() to msr_io()'s calling convention @@ -1122,257 +849,22 @@ static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data) return kvm_set_msr(vcpu, &msr); } -#ifdef CONFIG_X86_64 -struct pvclock_gtod_data { - seqcount_t seq; - - struct { /* extract of a clocksource struct */ - int vclock_mode; - cycle_t cycle_last; - cycle_t mask; - u32 mult; - u32 shift; - } clock; - - u64 boot_ns; - u64 nsec_base; -}; - -static struct pvclock_gtod_data pvclock_gtod_data; - -static void update_pvclock_gtod(struct timekeeper *tk) -{ - struct pvclock_gtod_data *vdata = &pvclock_gtod_data; - u64 boot_ns; - - boot_ns = ktime_to_ns(ktime_add(tk->tkr_mono.base, tk->offs_boot)); - - write_seqcount_begin(&vdata->seq); - - /* copy pvclock gtod data */ - vdata->clock.vclock_mode = tk->tkr_mono.clock->archdata.vclock_mode; - vdata->clock.cycle_last = tk->tkr_mono.cycle_last; - vdata->clock.mask = tk->tkr_mono.mask; - vdata->clock.mult = tk->tkr_mono.mult; - vdata->clock.shift = tk->tkr_mono.shift; - - vdata->boot_ns = boot_ns; - vdata->nsec_base = tk->tkr_mono.xtime_nsec; - - write_seqcount_end(&vdata->seq); -} -#endif - void kvm_set_pending_timer(struct kvm_vcpu *vcpu) { /* - * Note: KVM_REQ_PENDING_TIMER is implicitly checked in + * Note: GVM_REQ_PENDING_TIMER is implicitly checked in * vcpu_enter_guest. This function is only called from * the physical CPU that is running vcpu. */ - kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu); -} - -static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock) -{ - int version; - int r; - struct pvclock_wall_clock wc; - struct timespec64 boot; - - if (!wall_clock) - return; - - r = kvm_read_guest(kvm, wall_clock, &version, sizeof(version)); - if (r) - return; - - if (version & 1) - ++version; /* first time write, random junk */ - - ++version; - - if (kvm_write_guest(kvm, wall_clock, &version, sizeof(version))) - return; - - /* - * The guest calculates current wall clock time by adding - * system time (updated by kvm_guest_time_update below) to the - * wall clock specified here. guest system time equals host - * system time for us, thus we must fill in host boot time here. 
- */ - getboottime64(&boot); - - if (kvm->arch.kvmclock_offset) { - struct timespec64 ts = ns_to_timespec64(kvm->arch.kvmclock_offset); - boot = timespec64_sub(boot, ts); - } - wc.sec = (u32)boot.tv_sec; /* overflow in 2106 guest time */ - wc.nsec = boot.tv_nsec; - wc.version = version; - - kvm_write_guest(kvm, wall_clock, &wc, sizeof(wc)); - - version++; - kvm_write_guest(kvm, wall_clock, &version, sizeof(version)); -} - -static uint32_t div_frac(uint32_t dividend, uint32_t divisor) -{ - do_shl32_div32(dividend, divisor); - return dividend; -} - -static void kvm_get_time_scale(uint64_t scaled_hz, uint64_t base_hz, - s8 *pshift, u32 *pmultiplier) -{ - uint64_t scaled64; - int32_t shift = 0; - uint64_t tps64; - uint32_t tps32; - - tps64 = base_hz; - scaled64 = scaled_hz; - while (tps64 > scaled64*2 || tps64 & 0xffffffff00000000ULL) { - tps64 >>= 1; - shift--; - } - - tps32 = (uint32_t)tps64; - while (tps32 <= scaled64 || scaled64 & 0xffffffff00000000ULL) { - if (scaled64 & 0xffffffff00000000ULL || tps32 & 0x80000000) - scaled64 >>= 1; - else - tps32 <<= 1; - shift++; - } - - *pshift = shift; - *pmultiplier = div_frac(scaled64, tps32); - - pr_debug("%s: base_hz %llu => %llu, shift %d, mul %u\n", - __func__, base_hz, scaled_hz, shift, *pmultiplier); + kvm_make_request(GVM_REQ_PENDING_TIMER, vcpu); } #ifdef CONFIG_X86_64 static atomic_t kvm_guest_has_master_clock = ATOMIC_INIT(0); #endif -static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz); -static unsigned long max_tsc_khz; - -static u32 adjust_tsc_khz(u32 khz, s32 ppm) -{ - u64 v = (u64)khz * (1000000 + ppm); - do_div(v, 1000000); - return v; -} - -static int set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale) -{ - u64 ratio; - - /* Guest TSC same frequency as host TSC? */ - if (!scale) { - vcpu->arch.tsc_scaling_ratio = kvm_default_tsc_scaling_ratio; - return 0; - } - - /* TSC scaling supported? */ - if (!kvm_has_tsc_control) { - if (user_tsc_khz > tsc_khz) { - vcpu->arch.tsc_catchup = 1; - vcpu->arch.tsc_always_catchup = 1; - return 0; - } else { - WARN(1, "user requested TSC rate below hardware speed\n"); - return -1; - } - } - - /* TSC scaling required - calculate ratio */ - ratio = mul_u64_u32_div(1ULL << kvm_tsc_scaling_ratio_frac_bits, - user_tsc_khz, tsc_khz); - - if (ratio == 0 || ratio >= kvm_max_tsc_scaling_ratio) { - WARN_ONCE(1, "Invalid TSC scaling ratio - virtual-tsc-khz=%u\n", - user_tsc_khz); - return -1; - } - - vcpu->arch.tsc_scaling_ratio = ratio; - return 0; -} - -static int kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz) -{ - u32 thresh_lo, thresh_hi; - int use_scaling = 0; - - /* tsc_khz can be zero if TSC calibration fails */ - if (user_tsc_khz == 0) { - /* set tsc_scaling_ratio to a safe value */ - vcpu->arch.tsc_scaling_ratio = kvm_default_tsc_scaling_ratio; - return -1; - } - - /* Compute a scale to convert nanoseconds in TSC cycles */ - kvm_get_time_scale(user_tsc_khz * 1000LL, NSEC_PER_SEC, - &vcpu->arch.virtual_tsc_shift, - &vcpu->arch.virtual_tsc_mult); - vcpu->arch.virtual_tsc_khz = user_tsc_khz; - - /* - * Compute the variation in TSC rate which is acceptable - * within the range of tolerance and decide if the - * rate being applied is within that bounds of the hardware - * rate. If so, no scaling or compensation need be done. 
- */ - thresh_lo = adjust_tsc_khz(tsc_khz, -tsc_tolerance_ppm); - thresh_hi = adjust_tsc_khz(tsc_khz, tsc_tolerance_ppm); - if (user_tsc_khz < thresh_lo || user_tsc_khz > thresh_hi) { - pr_debug("kvm: requested TSC rate %u falls outside tolerance [%u,%u]\n", user_tsc_khz, thresh_lo, thresh_hi); - use_scaling = 1; - } - return set_tsc_khz(vcpu, user_tsc_khz, use_scaling); -} - -static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns) -{ - u64 tsc = pvclock_scale_delta(kernel_ns-vcpu->arch.this_tsc_nsec, - vcpu->arch.virtual_tsc_mult, - vcpu->arch.virtual_tsc_shift); - tsc += vcpu->arch.this_tsc_write; - return tsc; -} - -static void kvm_track_tsc_matching(struct kvm_vcpu *vcpu) -{ -#ifdef CONFIG_X86_64 - bool vcpus_matched; - struct kvm_arch *ka = &vcpu->kvm->arch; - struct pvclock_gtod_data *gtod = &pvclock_gtod_data; - - vcpus_matched = (ka->nr_vcpus_matched_tsc + 1 == - atomic_read(&vcpu->kvm->online_vcpus)); - - /* - * Once the masterclock is enabled, always perform request in - * order to update it. - * - * In order to enable masterclock, the host clocksource must be TSC - * and the vcpus need to have matched TSCs. When that happens, - * perform request to enable masterclock. - */ - if (ka->use_master_clock || - (gtod->clock.vclock_mode == VCLOCK_TSC && vcpus_matched)) - kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu); - - trace_kvm_track_tsc(vcpu->vcpu_id, ka->nr_vcpus_matched_tsc, - atomic_read(&vcpu->kvm->online_vcpus), - ka->use_master_clock, gtod->clock.vclock_mode); -#endif -} +static DEFINE_PER_CPU(size_t, cpu_tsc_khz); +static size_t max_tsc_khz; static void update_ia32_tsc_adjust_msr(struct kvm_vcpu *vcpu, s64 offset) { @@ -1380,47 +872,19 @@ static void update_ia32_tsc_adjust_msr(struct kvm_vcpu *vcpu, s64 offset) vcpu->arch.ia32_tsc_adjust_msr += offset - curr_offset; } -/* - * Multiply tsc by a fixed point number represented by ratio. - * - * The most significant 64-N bits (mult) of ratio represent the - * integral part of the fixed point number; the remaining N bits - * (frac) represent the fractional part, ie. ratio represents a fixed - * point number (mult + frac * 2^(-N)). - * - * N equals to kvm_tsc_scaling_ratio_frac_bits. 
- */ -static inline u64 __scale_tsc(u64 ratio, u64 tsc) -{ - return mul_u64_u64_shr(tsc, ratio, kvm_tsc_scaling_ratio_frac_bits); -} - -u64 kvm_scale_tsc(struct kvm_vcpu *vcpu, u64 tsc) -{ - u64 _tsc = tsc; - u64 ratio = vcpu->arch.tsc_scaling_ratio; - - if (ratio != kvm_default_tsc_scaling_ratio) - _tsc = __scale_tsc(ratio, tsc); - - return _tsc; -} -EXPORT_SYMBOL_GPL(kvm_scale_tsc); - static u64 kvm_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc) { u64 tsc; - tsc = kvm_scale_tsc(vcpu, rdtsc()); + tsc = __rdtsc(); return target_tsc - tsc; } u64 kvm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc) { - return vcpu->arch.tsc_offset + kvm_scale_tsc(vcpu, host_tsc); + return vcpu->arch.tsc_offset + host_tsc; } -EXPORT_SYMBOL_GPL(kvm_read_l1_tsc); static void kvm_vcpu_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) { @@ -1430,128 +894,19 @@ static void kvm_vcpu_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr) { - struct kvm *kvm = vcpu->kvm; - u64 offset, ns, elapsed; - unsigned long flags; - s64 usdiff; - bool matched; - bool already_matched; + u64 offset; + //size_t flags; u64 data = msr->data; - raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags); + //spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags); offset = kvm_compute_tsc_offset(vcpu, data); - ns = ktime_get_boot_ns(); - elapsed = ns - kvm->arch.last_tsc_nsec; - - if (vcpu->arch.virtual_tsc_khz) { - int faulted = 0; - - /* n.b - signed multiplication and division required */ - usdiff = data - kvm->arch.last_tsc_write; -#ifdef CONFIG_X86_64 - usdiff = (usdiff * 1000) / vcpu->arch.virtual_tsc_khz; -#else - /* do_div() only does unsigned */ - asm("1: idivl %[divisor]\n" - "2: xor %%edx, %%edx\n" - " movl $0, %[faulted]\n" - "3:\n" - ".section .fixup,\"ax\"\n" - "4: movl $1, %[faulted]\n" - " jmp 3b\n" - ".previous\n" - - _ASM_EXTABLE(1b, 4b) - - : "=A"(usdiff), [faulted] "=r" (faulted) - : "A"(usdiff * 1000), [divisor] "rm"(vcpu->arch.virtual_tsc_khz)); - -#endif - do_div(elapsed, 1000); - usdiff -= elapsed; - if (usdiff < 0) - usdiff = -usdiff; - - /* idivl overflow => difference is larger than USEC_PER_SEC */ - if (faulted) - usdiff = USEC_PER_SEC; - } else - usdiff = USEC_PER_SEC; /* disable TSC match window below */ - - /* - * Special case: TSC write with a small delta (1 second) of virtual - * cycle time against real time is interpreted as an attempt to - * synchronize the CPU. - * - * For a reliable TSC, we can match TSC offsets, and for an unstable - * TSC, we add elapsed time in this computation. We could let the - * compensation code attempt to catch up if we fall behind, but - * it's better to try to match offsets from the beginning. - */ - if (usdiff < USEC_PER_SEC && - vcpu->arch.virtual_tsc_khz == kvm->arch.last_tsc_khz) { - if (!check_tsc_unstable()) { - offset = kvm->arch.cur_tsc_offset; - pr_debug("kvm: matched tsc offset for %llu\n", data); - } else { - u64 delta = nsec_to_cycles(vcpu, elapsed); - data += delta; - offset = kvm_compute_tsc_offset(vcpu, data); - pr_debug("kvm: adjusted tsc offset by %llu\n", delta); - } - matched = true; - already_matched = (vcpu->arch.this_tsc_generation == kvm->arch.cur_tsc_generation); - } else { - /* - * We split periods of matched TSC writes into generations. 
- * For each generation, we track the original measured - * nanosecond time, offset, and write, so if TSCs are in - * sync, we can match exact offset, and if not, we can match - * exact software computation in compute_guest_tsc() - * - * These values are tracked in kvm->arch.cur_xxx variables. - */ - kvm->arch.cur_tsc_generation++; - kvm->arch.cur_tsc_nsec = ns; - kvm->arch.cur_tsc_write = data; - kvm->arch.cur_tsc_offset = offset; - matched = false; - pr_debug("kvm: new tsc generation %llu, clock %llu\n", - kvm->arch.cur_tsc_generation, data); - } - - /* - * We also track th most recent recorded KHZ, write and time to - * allow the matching interval to be extended at each write. - */ - kvm->arch.last_tsc_nsec = ns; - kvm->arch.last_tsc_write = data; - kvm->arch.last_tsc_khz = vcpu->arch.virtual_tsc_khz; - - vcpu->arch.last_guest_tsc = data; - - /* Keep track of which generation this VCPU has synchronized to */ - vcpu->arch.this_tsc_generation = kvm->arch.cur_tsc_generation; - vcpu->arch.this_tsc_nsec = kvm->arch.cur_tsc_nsec; - vcpu->arch.this_tsc_write = kvm->arch.cur_tsc_write; - if (guest_cpuid_has_tsc_adjust(vcpu) && !msr->host_initiated) update_ia32_tsc_adjust_msr(vcpu, offset); kvm_vcpu_write_tsc_offset(vcpu, offset); - raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags); - - spin_lock(&kvm->arch.pvclock_gtod_sync_lock); - if (!matched) { - kvm->arch.nr_vcpus_matched_tsc = 0; - } else if (!already_matched) { - kvm->arch.nr_vcpus_matched_tsc++; - } + //spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags); - kvm_track_tsc_matching(vcpu); - spin_unlock(&kvm->arch.pvclock_gtod_sync_lock); } -EXPORT_SYMBOL_GPL(kvm_write_tsc); static inline void adjust_tsc_offset_guest(struct kvm_vcpu *vcpu, s64 adjustment) @@ -1559,549 +914,16 @@ static inline void adjust_tsc_offset_guest(struct kvm_vcpu *vcpu, kvm_vcpu_write_tsc_offset(vcpu, vcpu->arch.tsc_offset + adjustment); } -static inline void adjust_tsc_offset_host(struct kvm_vcpu *vcpu, s64 adjustment) -{ - if (vcpu->arch.tsc_scaling_ratio != kvm_default_tsc_scaling_ratio) - WARN_ON(adjustment < 0); - adjustment = kvm_scale_tsc(vcpu, (u64) adjustment); - adjust_tsc_offset_guest(vcpu, adjustment); -} - -#ifdef CONFIG_X86_64 - -static cycle_t read_tsc(void) -{ - cycle_t ret = (cycle_t)rdtsc_ordered(); - u64 last = pvclock_gtod_data.clock.cycle_last; - - if (likely(ret >= last)) - return ret; - - /* - * GCC likes to generate cmov here, but this branch is extremely - * predictable (it's just a function of time and the likely is - * very likely) and there's a data dependence, so force GCC - * to generate a branch instead. I don't barrier() because - * we don't actually need a barrier, and if this function - * ever gets inlined it will generate worse code. 
- */ - asm volatile (""); - return last; -} - -static inline u64 vgettsc(cycle_t *cycle_now) -{ - long v; - struct pvclock_gtod_data *gtod = &pvclock_gtod_data; - - *cycle_now = read_tsc(); - - v = (*cycle_now - gtod->clock.cycle_last) & gtod->clock.mask; - return v * gtod->clock.mult; -} - -static int do_monotonic_boot(s64 *t, cycle_t *cycle_now) -{ - struct pvclock_gtod_data *gtod = &pvclock_gtod_data; - unsigned long seq; - int mode; - u64 ns; - - do { - seq = read_seqcount_begin(&gtod->seq); - mode = gtod->clock.vclock_mode; - ns = gtod->nsec_base; - ns += vgettsc(cycle_now); - ns >>= gtod->clock.shift; - ns += gtod->boot_ns; - } while (unlikely(read_seqcount_retry(&gtod->seq, seq))); - *t = ns; - - return mode; -} - -/* returns true if host is using tsc clocksource */ -static bool kvm_get_time_and_clockread(s64 *kernel_ns, cycle_t *cycle_now) -{ - /* checked again under seqlock below */ - if (pvclock_gtod_data.clock.vclock_mode != VCLOCK_TSC) - return false; - - return do_monotonic_boot(kernel_ns, cycle_now) == VCLOCK_TSC; -} -#endif - -/* - * - * Assuming a stable TSC across physical CPUS, and a stable TSC - * across virtual CPUs, the following condition is possible. - * Each numbered line represents an event visible to both - * CPUs at the next numbered event. - * - * "timespecX" represents host monotonic time. "tscX" represents - * RDTSC value. - * - * VCPU0 on CPU0 | VCPU1 on CPU1 - * - * 1. read timespec0,tsc0 - * 2. | timespec1 = timespec0 + N - * | tsc1 = tsc0 + M - * 3. transition to guest | transition to guest - * 4. ret0 = timespec0 + (rdtsc - tsc0) | - * 5. | ret1 = timespec1 + (rdtsc - tsc1) - * | ret1 = timespec0 + N + (rdtsc - (tsc0 + M)) - * - * Since ret0 update is visible to VCPU1 at time 5, to obey monotonicity: - * - * - ret0 < ret1 - * - timespec0 + (rdtsc - tsc0) < timespec0 + N + (rdtsc - (tsc0 + M)) - * ... - * - 0 < N - M => M < N - * - * That is, when timespec0 != timespec1, M < N. Unfortunately that is not - * always the case (the difference between two distinct xtime instances - * might be smaller than the difference between corresponding TSC reads, - * when updating guest vcpus pvclock areas). - * - * To avoid that problem, do not allow visibility of distinct - * system_timestamp/tsc_timestamp values simultaneously: use a master - * copy of host monotonic time values. Update that master copy - * in lockstep. - * - * Rely on synchronization of host TSCs and guest TSCs for monotonicity. - * - */ - -static void pvclock_update_vm_gtod_copy(struct kvm *kvm) -{ -#ifdef CONFIG_X86_64 - struct kvm_arch *ka = &kvm->arch; - int vclock_mode; - bool host_tsc_clocksource, vcpus_matched; - - vcpus_matched = (ka->nr_vcpus_matched_tsc + 1 == - atomic_read(&kvm->online_vcpus)); - - /* - * If the host uses TSC clock, then passthrough TSC as stable - * to the guest. 
- */ - host_tsc_clocksource = kvm_get_time_and_clockread( - &ka->master_kernel_ns, - &ka->master_cycle_now); - - ka->use_master_clock = host_tsc_clocksource && vcpus_matched - && !backwards_tsc_observed - && !ka->boot_vcpu_runs_old_kvmclock; - - if (ka->use_master_clock) - atomic_set(&kvm_guest_has_master_clock, 1); - - vclock_mode = pvclock_gtod_data.clock.vclock_mode; - trace_kvm_update_master_clock(ka->use_master_clock, vclock_mode, - vcpus_matched); -#endif -} - -void kvm_make_mclock_inprogress_request(struct kvm *kvm) -{ - kvm_make_all_cpus_request(kvm, KVM_REQ_MCLOCK_INPROGRESS); -} - -static void kvm_gen_update_masterclock(struct kvm *kvm) -{ -#ifdef CONFIG_X86_64 - int i; - struct kvm_vcpu *vcpu; - struct kvm_arch *ka = &kvm->arch; - - spin_lock(&ka->pvclock_gtod_sync_lock); - kvm_make_mclock_inprogress_request(kvm); - /* no guest entries from this point */ - pvclock_update_vm_gtod_copy(kvm); - - kvm_for_each_vcpu(i, vcpu, kvm) - kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); - - /* guest entries allowed */ - kvm_for_each_vcpu(i, vcpu, kvm) - clear_bit(KVM_REQ_MCLOCK_INPROGRESS, &vcpu->requests); - - spin_unlock(&ka->pvclock_gtod_sync_lock); -#endif -} - -static u64 __get_kvmclock_ns(struct kvm *kvm) -{ - struct kvm_arch *ka = &kvm->arch; - struct pvclock_vcpu_time_info hv_clock; - - spin_lock(&ka->pvclock_gtod_sync_lock); - if (!ka->use_master_clock) { - spin_unlock(&ka->pvclock_gtod_sync_lock); - return ktime_get_boot_ns() + ka->kvmclock_offset; - } - - hv_clock.tsc_timestamp = ka->master_cycle_now; - hv_clock.system_time = ka->master_kernel_ns + ka->kvmclock_offset; - spin_unlock(&ka->pvclock_gtod_sync_lock); - - kvm_get_time_scale(NSEC_PER_SEC, __this_cpu_read(cpu_tsc_khz) * 1000LL, - &hv_clock.tsc_shift, - &hv_clock.tsc_to_system_mul); - return __pvclock_read_cycles(&hv_clock, rdtsc()); -} - -u64 get_kvmclock_ns(struct kvm *kvm) -{ - unsigned long flags; - s64 ns; - - local_irq_save(flags); - ns = __get_kvmclock_ns(kvm); - local_irq_restore(flags); - - return ns; -} - -static void kvm_setup_pvclock_page(struct kvm_vcpu *v) -{ - struct kvm_vcpu_arch *vcpu = &v->arch; - struct pvclock_vcpu_time_info guest_hv_clock; - - if (unlikely(kvm_read_guest_cached(v->kvm, &vcpu->pv_time, - &guest_hv_clock, sizeof(guest_hv_clock)))) - return; - - /* This VCPU is paused, but it's legal for a guest to read another - * VCPU's kvmclock, so we really have to follow the specification where - * it says that version is odd if data is being modified, and even after - * it is consistent. - * - * Version field updates must be kept separate. This is because - * kvm_write_guest_cached might use a "rep movs" instruction, and - * writes within a string instruction are weakly ordered. So there - * are three writes overall. - * - * As a small optimization, only write the version field in the first - * and third write. The vcpu->pv_time cache is still valid, because the - * version field is the first in the struct. 
- */ - BUILD_BUG_ON(offsetof(struct pvclock_vcpu_time_info, version) != 0); - - vcpu->hv_clock.version = guest_hv_clock.version + 1; - kvm_write_guest_cached(v->kvm, &vcpu->pv_time, - &vcpu->hv_clock, - sizeof(vcpu->hv_clock.version)); - - smp_wmb(); - - /* retain PVCLOCK_GUEST_STOPPED if set in guest copy */ - vcpu->hv_clock.flags |= (guest_hv_clock.flags & PVCLOCK_GUEST_STOPPED); - - if (vcpu->pvclock_set_guest_stopped_request) { - vcpu->hv_clock.flags |= PVCLOCK_GUEST_STOPPED; - vcpu->pvclock_set_guest_stopped_request = false; - } - - trace_kvm_pvclock_update(v->vcpu_id, &vcpu->hv_clock); - - kvm_write_guest_cached(v->kvm, &vcpu->pv_time, - &vcpu->hv_clock, - sizeof(vcpu->hv_clock)); - - smp_wmb(); - - vcpu->hv_clock.version++; - kvm_write_guest_cached(v->kvm, &vcpu->pv_time, - &vcpu->hv_clock, - sizeof(vcpu->hv_clock.version)); -} - -static int kvm_guest_time_update(struct kvm_vcpu *v) -{ - unsigned long flags, tgt_tsc_khz; - struct kvm_vcpu_arch *vcpu = &v->arch; - struct kvm_arch *ka = &v->kvm->arch; - s64 kernel_ns; - u64 tsc_timestamp, host_tsc; - u8 pvclock_flags; - bool use_master_clock; - - kernel_ns = 0; - host_tsc = 0; - - /* - * If the host uses TSC clock, then passthrough TSC as stable - * to the guest. - */ - spin_lock(&ka->pvclock_gtod_sync_lock); - use_master_clock = ka->use_master_clock; - if (use_master_clock) { - host_tsc = ka->master_cycle_now; - kernel_ns = ka->master_kernel_ns; - } - spin_unlock(&ka->pvclock_gtod_sync_lock); - - /* Keep irq disabled to prevent changes to the clock */ - local_irq_save(flags); - tgt_tsc_khz = __this_cpu_read(cpu_tsc_khz); - if (unlikely(tgt_tsc_khz == 0)) { - local_irq_restore(flags); - kvm_make_request(KVM_REQ_CLOCK_UPDATE, v); - return 1; - } - if (!use_master_clock) { - host_tsc = rdtsc(); - kernel_ns = ktime_get_boot_ns(); - } - - tsc_timestamp = kvm_read_l1_tsc(v, host_tsc); - - /* - * We may have to catch up the TSC to match elapsed wall clock - * time for two reasons, even if kvmclock is used. - * 1) CPU could have been running below the maximum TSC rate - * 2) Broken TSC compensation resets the base at each VCPU - * entry to avoid unknown leaps of TSC even when running - * again on the same CPU. This may cause apparent elapsed - * time to disappear, and the guest to stand still or run - * very slowly. 
- */ - if (vcpu->tsc_catchup) { - u64 tsc = compute_guest_tsc(v, kernel_ns); - if (tsc > tsc_timestamp) { - adjust_tsc_offset_guest(v, tsc - tsc_timestamp); - tsc_timestamp = tsc; - } - } - - local_irq_restore(flags); - - /* With all the info we got, fill in the values */ - - if (kvm_has_tsc_control) - tgt_tsc_khz = kvm_scale_tsc(v, tgt_tsc_khz); - - if (unlikely(vcpu->hw_tsc_khz != tgt_tsc_khz)) { - kvm_get_time_scale(NSEC_PER_SEC, tgt_tsc_khz * 1000LL, - &vcpu->hv_clock.tsc_shift, - &vcpu->hv_clock.tsc_to_system_mul); - vcpu->hw_tsc_khz = tgt_tsc_khz; - } - - vcpu->hv_clock.tsc_timestamp = tsc_timestamp; - vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset; - vcpu->last_guest_tsc = tsc_timestamp; - - /* If the host uses TSC clocksource, then it is stable */ - pvclock_flags = 0; - if (use_master_clock) - pvclock_flags |= PVCLOCK_TSC_STABLE_BIT; - - vcpu->hv_clock.flags = pvclock_flags; - - if (vcpu->pv_time_enabled) - kvm_setup_pvclock_page(v); - if (v == kvm_get_vcpu(v->kvm, 0)) - kvm_hv_setup_tsc_page(v->kvm, &vcpu->hv_clock); - return 0; -} - -/* - * kvmclock updates which are isolated to a given vcpu, such as - * vcpu->cpu migration, should not allow system_timestamp from - * the rest of the vcpus to remain static. Otherwise ntp frequency - * correction applies to one vcpu's system_timestamp but not - * the others. - * - * So in those cases, request a kvmclock update for all vcpus. - * We need to rate-limit these requests though, as they can - * considerably slow guests that have a large number of vcpus. - * The time for a remote vcpu to update its kvmclock is bound - * by the delay we use to rate-limit the updates. - */ - -#define KVMCLOCK_UPDATE_DELAY msecs_to_jiffies(100) - -static void kvmclock_update_fn(struct work_struct *work) -{ - int i; - struct delayed_work *dwork = to_delayed_work(work); - struct kvm_arch *ka = container_of(dwork, struct kvm_arch, - kvmclock_update_work); - struct kvm *kvm = container_of(ka, struct kvm, arch); - struct kvm_vcpu *vcpu; - - kvm_for_each_vcpu(i, vcpu, kvm) { - kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); - kvm_vcpu_kick(vcpu); - } -} - -static void kvm_gen_kvmclock_update(struct kvm_vcpu *v) -{ - struct kvm *kvm = v->kvm; - - kvm_make_request(KVM_REQ_CLOCK_UPDATE, v); - schedule_delayed_work(&kvm->arch.kvmclock_update_work, - KVMCLOCK_UPDATE_DELAY); -} - -#define KVMCLOCK_SYNC_PERIOD (300 * HZ) - -static void kvmclock_sync_fn(struct work_struct *work) -{ - struct delayed_work *dwork = to_delayed_work(work); - struct kvm_arch *ka = container_of(dwork, struct kvm_arch, - kvmclock_sync_work); - struct kvm *kvm = container_of(ka, struct kvm, arch); - - if (!kvmclock_periodic_sync) - return; - - schedule_delayed_work(&kvm->arch.kvmclock_update_work, 0); - schedule_delayed_work(&kvm->arch.kvmclock_sync_work, - KVMCLOCK_SYNC_PERIOD); -} - -static int set_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 data) -{ - u64 mcg_cap = vcpu->arch.mcg_cap; - unsigned bank_num = mcg_cap & 0xff; - - switch (msr) { - case MSR_IA32_MCG_STATUS: - vcpu->arch.mcg_status = data; - break; - case MSR_IA32_MCG_CTL: - if (!(mcg_cap & MCG_CTL_P)) - return 1; - if (data != 0 && data != ~(u64)0) - return -1; - vcpu->arch.mcg_ctl = data; - break; - default: - if (msr >= MSR_IA32_MC0_CTL && - msr < MSR_IA32_MCx_CTL(bank_num)) { - u32 offset = msr - MSR_IA32_MC0_CTL; - /* only 0 or all 1s can be written to IA32_MCi_CTL - * some Linux kernels though clear bit 10 in bank 4 to - * workaround a BIOS/GART TBL issue on AMD K8s, ignore - * this to avoid an uncatched #GP 
in the guest - */ - if ((offset & 0x3) == 0 && - data != 0 && (data | (1 << 10)) != ~(u64)0) - return -1; - vcpu->arch.mce_banks[offset] = data; - break; - } - return 1; - } - return 0; -} - -static int xen_hvm_config(struct kvm_vcpu *vcpu, u64 data) -{ - struct kvm *kvm = vcpu->kvm; - int lm = is_long_mode(vcpu); - u8 *blob_addr = lm ? (u8 *)(long)kvm->arch.xen_hvm_config.blob_addr_64 - : (u8 *)(long)kvm->arch.xen_hvm_config.blob_addr_32; - u8 blob_size = lm ? kvm->arch.xen_hvm_config.blob_size_64 - : kvm->arch.xen_hvm_config.blob_size_32; - u32 page_num = data & ~PAGE_MASK; - u64 page_addr = data & PAGE_MASK; - u8 *page; - int r; - - r = -E2BIG; - if (page_num >= blob_size) - goto out; - r = -ENOMEM; - page = memdup_user(blob_addr + (page_num * PAGE_SIZE), PAGE_SIZE); - if (IS_ERR(page)) { - r = PTR_ERR(page); - goto out; - } - if (kvm_vcpu_write_guest(vcpu, page_addr, page, PAGE_SIZE)) - goto out_free; - r = 0; -out_free: - kfree(page); -out: - return r; -} - -static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data) -{ - gpa_t gpa = data & ~0x3f; - - /* Bits 2:5 are reserved, Should be zero */ - if (data & 0x3c) - return 1; - - vcpu->arch.apf.msr_val = data; - - if (!(data & KVM_ASYNC_PF_ENABLED)) { - kvm_clear_async_pf_completion_queue(vcpu); - kvm_async_pf_hash_reset(vcpu); - return 0; - } - - if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.apf.data, gpa, - sizeof(u32))) - return 1; - - vcpu->arch.apf.send_user_only = !(data & KVM_ASYNC_PF_SEND_ALWAYS); - kvm_async_pf_wakeup_all(vcpu); - return 0; -} - -static void kvmclock_reset(struct kvm_vcpu *vcpu) -{ - vcpu->arch.pv_time_enabled = false; -} - -static void record_steal_time(struct kvm_vcpu *vcpu) -{ - if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED)) - return; - - if (unlikely(kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.st.stime, - &vcpu->arch.st.steal, sizeof(struct kvm_steal_time)))) - return; - - if (vcpu->arch.st.steal.version & 1) - vcpu->arch.st.steal.version += 1; /* first time write, random junk */ - - vcpu->arch.st.steal.version += 1; - - kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.st.stime, - &vcpu->arch.st.steal, sizeof(struct kvm_steal_time)); - - smp_wmb(); - - vcpu->arch.st.steal.steal += current->sched_info.run_delay - - vcpu->arch.st.last_steal; - vcpu->arch.st.last_steal = current->sched_info.run_delay; - - kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.st.stime, - &vcpu->arch.st.steal, sizeof(struct kvm_steal_time)); - - smp_wmb(); - - vcpu->arch.st.steal.version += 1; - - kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.st.stime, - &vcpu->arch.st.steal, sizeof(struct kvm_steal_time)); -} - int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { - bool pr = false; + //bool pr = false; u32 msr = msr_info->index; u64 data = msr_info->data; + if (msr >= 0x200 && msr <= 0x2ff) + return kvm_mtrr_set_msr(vcpu, msr, data); + if (msr >= APIC_BASE_MSR && msr <= (APIC_BASE_MSR + 0x3ff)) + return kvm_x2apic_msr_write(vcpu, msr, data); switch (msr) { case MSR_AMD64_NB_CFG: case MSR_IA32_UCODE_REV: @@ -2143,15 +965,8 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) vcpu_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTLMSR 0x%llx, nop\n", __func__, data); break; - case 0x200 ... 0x2ff: - return kvm_mtrr_set_msr(vcpu, msr, data); case MSR_IA32_APICBASE: return kvm_set_apic_base(vcpu, msr_info); - case APIC_BASE_MSR ... 
APIC_BASE_MSR + 0x3ff: - return kvm_x2apic_msr_write(vcpu, msr, data); - case MSR_IA32_TSCDEADLINE: - kvm_set_lapic_tscdeadline_msr(vcpu, data); - break; case MSR_IA32_TSC_ADJUST: if (guest_cpuid_has_tsc_adjust(vcpu)) { if (!msr_info->host_initiated) { @@ -2169,81 +984,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return 1; vcpu->arch.smbase = data; break; - case MSR_KVM_WALL_CLOCK_NEW: - case MSR_KVM_WALL_CLOCK: - vcpu->kvm->arch.wall_clock = data; - kvm_write_wall_clock(vcpu->kvm, data); - break; - case MSR_KVM_SYSTEM_TIME_NEW: - case MSR_KVM_SYSTEM_TIME: { - u64 gpa_offset; - struct kvm_arch *ka = &vcpu->kvm->arch; - - kvmclock_reset(vcpu); - - if (vcpu->vcpu_id == 0 && !msr_info->host_initiated) { - bool tmp = (msr == MSR_KVM_SYSTEM_TIME); - - if (ka->boot_vcpu_runs_old_kvmclock != tmp) - set_bit(KVM_REQ_MASTERCLOCK_UPDATE, - &vcpu->requests); - - ka->boot_vcpu_runs_old_kvmclock = tmp; - } - - vcpu->arch.time = data; - kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu); - - /* we verify if the enable bit is set... */ - if (!(data & 1)) - break; - - gpa_offset = data & ~(PAGE_MASK | 1); - - if (kvm_gfn_to_hva_cache_init(vcpu->kvm, - &vcpu->arch.pv_time, data & ~1ULL, - sizeof(struct pvclock_vcpu_time_info))) - vcpu->arch.pv_time_enabled = false; - else - vcpu->arch.pv_time_enabled = true; - - break; - } - case MSR_KVM_ASYNC_PF_EN: - if (kvm_pv_enable_async_pf(vcpu, data)) - return 1; - break; - case MSR_KVM_STEAL_TIME: - - if (unlikely(!sched_info_on())) - return 1; - - if (data & KVM_STEAL_RESERVED_MASK) - return 1; - - if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.st.stime, - data & KVM_STEAL_VALID_BITS, - sizeof(struct kvm_steal_time))) - return 1; - - vcpu->arch.st.msr_val = data; - - if (!(data & KVM_MSR_ENABLED)) - break; - - kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu); - - break; - case MSR_KVM_PV_EOI_EN: - if (kvm_lapic_enable_pv_eoi(vcpu, data)) - return 1; - break; - - case MSR_IA32_MCG_CTL: - case MSR_IA32_MCG_STATUS: - case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1: - return set_msr_mce(vcpu, msr, data); - +#if 0 case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3: case MSR_P6_PERFCTR0 ... MSR_P6_PERFCTR1: pr = true; /* fall through */ @@ -2256,6 +997,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) vcpu_unimpl(vcpu, "disabled perfctr wrmsr: " "0x%x data 0x%llx\n", msr, data); break; +#endif case MSR_K7_CLK_CTL: /* * Ignore all writes to this no longer documented MSR. @@ -2266,18 +1008,14 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) * the need to ignore the workaround. */ break; - case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15: - case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4: - case HV_X64_MSR_CRASH_CTL: - case HV_X64_MSR_STIMER0_CONFIG ... HV_X64_MSR_STIMER3_COUNT: - return kvm_hv_set_msr_common(vcpu, msr, data, - msr_info->host_initiated); +#if 0 case MSR_IA32_BBL_CR_CTL3: /* Drop writes to this legacy MSR -- see rdmsr * counterpart for further detail. 
*/ vcpu_unimpl(vcpu, "ignored wrmsr: 0x%x data 0x%llx\n", msr, data); break; +#endif case MSR_AMD64_OSVW_ID_LENGTH: if (!guest_cpuid_has_osvw(vcpu)) return 1; @@ -2289,8 +1027,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) vcpu->arch.osvw.status = data; break; default: - if (msr && (msr == vcpu->kvm->arch.xen_hvm_config.msr)) - return xen_hvm_config(vcpu, data); +#if 0 if (kvm_pmu_is_valid_msr(vcpu, msr)) return kvm_pmu_set_msr(vcpu, msr_info); if (!ignore_msrs) { @@ -2302,10 +1039,11 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) msr, data); break; } +#endif + break; } return 0; } -EXPORT_SYMBOL_GPL(kvm_set_msr_common); /* @@ -2317,45 +1055,13 @@ int kvm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) { return kvm_x86_ops->get_msr(vcpu, msr); } -EXPORT_SYMBOL_GPL(kvm_get_msr); - -static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) -{ - u64 data; - u64 mcg_cap = vcpu->arch.mcg_cap; - unsigned bank_num = mcg_cap & 0xff; - - switch (msr) { - case MSR_IA32_P5_MC_ADDR: - case MSR_IA32_P5_MC_TYPE: - data = 0; - break; - case MSR_IA32_MCG_CAP: - data = vcpu->arch.mcg_cap; - break; - case MSR_IA32_MCG_CTL: - if (!(mcg_cap & MCG_CTL_P)) - return 1; - data = vcpu->arch.mcg_ctl; - break; - case MSR_IA32_MCG_STATUS: - data = vcpu->arch.mcg_status; - break; - default: - if (msr >= MSR_IA32_MC0_CTL && - msr < MSR_IA32_MCx_CTL(bank_num)) { - u32 offset = msr - MSR_IA32_MC0_CTL; - data = vcpu->arch.mce_banks[offset]; - break; - } - return 1; - } - *pdata = data; - return 0; -} int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { + if (msr_info->index >= 0x200 && msr_info->index <= 0x2ff) + return kvm_mtrr_get_msr(vcpu, msr_info->index, &msr_info->data); + if (msr_info->index >= APIC_BASE_MSR && msr_info->index <= (APIC_BASE_MSR + 0x3ff)) + return kvm_x2apic_msr_read(vcpu, msr_info->index, &msr_info->data); switch (msr_info->index) { case MSR_IA32_PLATFORM_ID: case MSR_IA32_EBL_CR_POWERON: @@ -2376,6 +1082,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_IA32_PERF_CTL: msr_info->data = 0; break; +#if 0 case MSR_K7_EVNTSEL0 ... MSR_K7_EVNTSEL3: case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3: case MSR_P6_PERFCTR0 ... MSR_P6_PERFCTR1: @@ -2384,11 +1091,11 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return kvm_pmu_get_msr(vcpu, msr_info->index, &msr_info->data); msr_info->data = 0; break; +#endif case MSR_IA32_UCODE_REV: msr_info->data = 0x100000000ULL; break; case MSR_MTRRcap: - case 0x200 ... 0x2ff: return kvm_mtrr_get_msr(vcpu, msr_info->index, &msr_info->data); case 0xcd: /* fsb frequency */ msr_info->data = 3; @@ -2410,12 +1117,6 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_IA32_APICBASE: msr_info->data = kvm_get_apic_base(vcpu); break; - case APIC_BASE_MSR ... 
APIC_BASE_MSR + 0x3ff: - return kvm_x2apic_msr_read(vcpu, msr_info->index, &msr_info->data); - break; - case MSR_IA32_TSCDEADLINE: - msr_info->data = kvm_get_lapic_tscdeadline_msr(vcpu); - break; case MSR_IA32_TSC_ADJUST: msr_info->data = (u64)vcpu->arch.ia32_tsc_adjust_msr; break; @@ -2436,30 +1137,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_EFER: msr_info->data = vcpu->arch.efer; break; - case MSR_KVM_WALL_CLOCK: - case MSR_KVM_WALL_CLOCK_NEW: - msr_info->data = vcpu->kvm->arch.wall_clock; - break; - case MSR_KVM_SYSTEM_TIME: - case MSR_KVM_SYSTEM_TIME_NEW: - msr_info->data = vcpu->arch.time; - break; - case MSR_KVM_ASYNC_PF_EN: - msr_info->data = vcpu->arch.apf.msr_val; - break; - case MSR_KVM_STEAL_TIME: - msr_info->data = vcpu->arch.st.msr_val; - break; - case MSR_KVM_PV_EOI_EN: - msr_info->data = vcpu->arch.pv_eoi.msr_val; - break; - case MSR_IA32_P5_MC_ADDR: - case MSR_IA32_P5_MC_TYPE: - case MSR_IA32_MCG_CAP: - case MSR_IA32_MCG_CTL: - case MSR_IA32_MCG_STATUS: - case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1: - return get_msr_mce(vcpu, msr_info->index, &msr_info->data); +#if 0 case MSR_K7_CLK_CTL: /* * Provide expected ramp-up count for K7. All other @@ -2472,13 +1150,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) */ msr_info->data = 0x20000000; break; - case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15: - case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4: - case HV_X64_MSR_CRASH_CTL: - case HV_X64_MSR_STIMER0_CONFIG ... HV_X64_MSR_STIMER3_COUNT: - return kvm_hv_get_msr_common(vcpu, - msr_info->index, &msr_info->data); - break; +#endif case MSR_IA32_BBL_CR_CTL3: /* This legacy MSR exists but isn't fully documented in current * silicon. It is however accessed by winxp in very narrow @@ -2503,6 +1175,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) msr_info->data = vcpu->arch.osvw.status; break; default: +#if 0 if (kvm_pmu_is_valid_msr(vcpu, msr_info->index)) return kvm_pmu_get_msr(vcpu, msr_info->index, &msr_info->data); if (!ignore_msrs) { @@ -2513,10 +1186,11 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) msr_info->data = 0; } break; +#endif + break; } return 0; } -EXPORT_SYMBOL_GPL(kvm_get_msr_common); /* * Read or write a bunch of msrs. All parameters are kernel addresses. @@ -2544,7 +1218,8 @@ static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs, * * @return number of msrs set successfully. 
*/ -static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs __user *user_msrs, +static int msr_io(PIRP pIrp, struct kvm_vcpu *vcpu, + struct kvm_msrs __user *user_msrs, int (*do_msr)(struct kvm_vcpu *vcpu, unsigned index, u64 *data), int writeback) @@ -2573,10 +1248,17 @@ static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs __user *user_msrs, if (r < 0) goto out_free; - r = -EFAULT; - if (writeback && copy_to_user(user_msrs->entries, entries, size)) + /* write back n of msrs handled here*/ + r = gvmUpdateReturnBuffer(pIrp, 0, &n, sizeof(n)); + if (r) goto out_free; + if (writeback) { + r = gvmUpdateReturnBuffer(pIrp, sizeof(msrs), entries, size); + if (r) + goto out_free; + } + r = n; out_free: @@ -2590,56 +1272,30 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) int r; switch (ext) { - case KVM_CAP_IRQCHIP: - case KVM_CAP_HLT: - case KVM_CAP_MMU_SHADOW_CACHE_CONTROL: - case KVM_CAP_SET_TSS_ADDR: - case KVM_CAP_EXT_CPUID: - case KVM_CAP_EXT_EMUL_CPUID: - case KVM_CAP_CLOCKSOURCE: - case KVM_CAP_PIT: - case KVM_CAP_NOP_IO_DELAY: - case KVM_CAP_MP_STATE: - case KVM_CAP_SYNC_MMU: - case KVM_CAP_USER_NMI: - case KVM_CAP_REINJECT_CONTROL: - case KVM_CAP_IRQ_INJECT_STATUS: - case KVM_CAP_IOEVENTFD: - case KVM_CAP_IOEVENTFD_NO_LENGTH: - case KVM_CAP_PIT2: - case KVM_CAP_PIT_STATE2: - case KVM_CAP_SET_IDENTITY_MAP_ADDR: - case KVM_CAP_XEN_HVM: - case KVM_CAP_VCPU_EVENTS: - case KVM_CAP_HYPERV: - case KVM_CAP_HYPERV_VAPIC: - case KVM_CAP_HYPERV_SPIN: - case KVM_CAP_HYPERV_SYNIC: - case KVM_CAP_PCI_SEGMENT: - case KVM_CAP_DEBUGREGS: - case KVM_CAP_X86_ROBUST_SINGLESTEP: - case KVM_CAP_XSAVE: - case KVM_CAP_ASYNC_PF: - case KVM_CAP_GET_TSC_KHZ: - case KVM_CAP_KVMCLOCK_CTRL: - case KVM_CAP_READONLY_MEM: - case KVM_CAP_HYPERV_TIME: - case KVM_CAP_IOAPIC_POLARITY_IGNORED: - case KVM_CAP_TSC_DEADLINE_TIMER: - case KVM_CAP_ENABLE_CAP_VM: - case KVM_CAP_DISABLE_QUIRKS: - case KVM_CAP_SET_BOOT_CPU_ID: - case KVM_CAP_SPLIT_IRQCHIP: -#ifdef CONFIG_KVM_DEVICE_ASSIGNMENT - case KVM_CAP_ASSIGN_DEV_IRQ: - case KVM_CAP_PCI_2_3: -#endif + case GVM_CAP_IRQCHIP: + case GVM_CAP_HLT: + case GVM_CAP_MMU_SHADOW_CACHE_CONTROL: + case GVM_CAP_EXT_EMUL_CPUID: + case GVM_CAP_NOP_IO_DELAY: + case GVM_CAP_SYNC_MMU: + case GVM_CAP_USER_NMI: + case GVM_CAP_REINJECT_CONTROL: + case GVM_CAP_SET_IDENTITY_MAP_ADDR: + case GVM_CAP_VCPU_EVENTS: r = 1; break; - case KVM_CAP_ADJUST_CLOCK: - r = KVM_CLOCK_TSC_STABLE; + case GVM_CAP_PCI_SEGMENT: + case GVM_CAP_DEBUGREGS: + case GVM_CAP_X86_ROBUST_SINGLESTEP: + case GVM_CAP_XSAVE: + case GVM_CAP_READONLY_MEM: + case GVM_CAP_IOAPIC_POLARITY_IGNORED: + case GVM_CAP_ENABLE_CAP_VM: + case GVM_CAP_DISABLE_QUIRKS: + case GVM_CAP_SET_BOOT_CPU_ID: + r = 0; break; - case KVM_CAP_X86_SMM: + case GVM_CAP_X86_SMM: /* SMBASE is usually relocated above 1M on modern chipsets, * and SMM handlers might indeed rely on 4G segment limits, * so do not report SMM to be available if real mode is @@ -2650,41 +1306,21 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) */ r = kvm_x86_ops->cpu_has_high_real_mode_segbase(); break; - case KVM_CAP_COALESCED_MMIO: - r = KVM_COALESCED_MMIO_PAGE_OFFSET; - break; - case KVM_CAP_VAPIC: + case GVM_CAP_VAPIC: r = !kvm_x86_ops->cpu_has_accelerated_tpr(); break; - case KVM_CAP_NR_VCPUS: - r = KVM_SOFT_MAX_VCPUS; - break; - case KVM_CAP_MAX_VCPUS: - r = KVM_MAX_VCPUS; - break; - case KVM_CAP_NR_MEMSLOTS: - r = KVM_USER_MEM_SLOTS; - break; - case KVM_CAP_PV_MMU: /* obsolete */ - r = 0; + case GVM_CAP_NR_VCPUS: + r = GVM_SOFT_MAX_VCPUS; break; 
-#ifdef CONFIG_KVM_DEVICE_ASSIGNMENT - case KVM_CAP_IOMMU: - r = iommu_present(&pci_bus_type); + case GVM_CAP_MAX_VCPUS: + r = GVM_MAX_VCPUS; break; -#endif - case KVM_CAP_MCE: - r = KVM_MAX_MCE_BANKS; + case GVM_CAP_NR_MEMSLOTS: + r = GVM_USER_MEM_SLOTS; break; - case KVM_CAP_XCRS: + case GVM_CAP_XCRS: r = boot_cpu_has(X86_FEATURE_XSAVE); break; - case KVM_CAP_TSC_CONTROL: - r = kvm_has_tsc_control; - break; - case KVM_CAP_X2APIC_API: - r = KVM_X2APIC_API_VALID_FLAGS; - break; default: r = 0; break; @@ -2693,64 +1329,53 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) } -long kvm_arch_dev_ioctl(struct file *filp, - unsigned int ioctl, unsigned long arg) +long kvm_arch_dev_ioctl(struct gvm_device_extension *devext, + PIRP pIrp, unsigned int ioctl) { - void __user *argp = (void __user *)arg; + void __user *argp = (void __user *)pIrp->AssociatedIrp.SystemBuffer; + size_t args = IoGetCurrentIrpStackLocation(pIrp)->Parameters.DeviceIoControl.InputBufferLength; long r; switch (ioctl) { - case KVM_GET_MSR_INDEX_LIST: { - struct kvm_msr_list __user *user_msr_list = argp; - struct kvm_msr_list msr_list; + case GVM_GET_MSR_INDEX_LIST: { + struct kvm_msr_list *msr_list = argp; unsigned n; - r = -EFAULT; - if (copy_from_user(&msr_list, user_msr_list, sizeof msr_list)) - goto out; - n = msr_list.nmsrs; - msr_list.nmsrs = num_msrs_to_save + num_emulated_msrs; - if (copy_to_user(user_msr_list, &msr_list, sizeof msr_list)) - goto out; - r = -E2BIG; - if (n < msr_list.nmsrs) + if (args < sizeof(struct kvm_msr_list)) { + r = -EINVAL; goto out; - r = -EFAULT; - if (copy_to_user(user_msr_list->indices, &msrs_to_save, - num_msrs_to_save * sizeof(u32))) + } + + r = STATUS_SUCCESS; + n = msr_list->nmsrs; + __u32 nmsrs = num_msrs_to_save; + r = gvmUpdateReturnBuffer(pIrp, 0, &nmsrs, sizeof(nmsrs)); + if (r) goto out; - if (copy_to_user(user_msr_list->indices + num_msrs_to_save, - &emulated_msrs, - num_emulated_msrs * sizeof(u32))) + + if (n < nmsrs) { + r = -E2BIG; goto out; - r = 0; + } + + r = gvmUpdateReturnBuffer(pIrp, sizeof(nmsrs), &msrs_to_save, + num_msrs_to_save * sizeof(u32)); break; } - case KVM_GET_SUPPORTED_CPUID: - case KVM_GET_EMULATED_CPUID: { - struct kvm_cpuid2 __user *cpuid_arg = argp; - struct kvm_cpuid2 cpuid; + case GVM_GET_SUPPORTED_CPUID: + case GVM_GET_EMULATED_CPUID: { + struct kvm_cpuid __user *cpuid_arg = argp; + struct kvm_cpuid cpuid; r = -EFAULT; if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid)) goto out; - r = kvm_dev_ioctl_get_cpuid(&cpuid, cpuid_arg->entries, + r = kvm_dev_ioctl_get_cpuid(pIrp, &cpuid, cpuid_arg->entries, ioctl); if (r) goto out; - r = -EFAULT; - if (copy_to_user(cpuid_arg, &cpuid, sizeof cpuid)) - goto out; - r = 0; - break; - } - case KVM_X86_GET_MCE_CAP_SUPPORTED: { - r = -EFAULT; - if (copy_to_user(argp, &kvm_mce_cap_supported, - sizeof(kvm_mce_cap_supported))) - goto out; r = 0; break; } @@ -2761,84 +1386,20 @@ out: return r; } -static void wbinvd_ipi(void *garbage) -{ - wbinvd(); -} - -static bool need_emulate_wbinvd(struct kvm_vcpu *vcpu) -{ - return kvm_arch_has_noncoherent_dma(vcpu->kvm); -} - -static inline void kvm_migrate_timers(struct kvm_vcpu *vcpu) -{ - set_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests); -} - void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { - /* Address WBINVD may be executed by guest */ - if (need_emulate_wbinvd(vcpu)) { - if (kvm_x86_ops->has_wbinvd_exit()) - cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask); - else if (vcpu->cpu != -1 && vcpu->cpu != cpu) - smp_call_function_single(vcpu->cpu, - 
wbinvd_ipi, NULL, 1); - } - kvm_x86_ops->vcpu_load(vcpu, cpu); - - /* Apply any externally detected TSC adjustments (due to suspend) */ - if (unlikely(vcpu->arch.tsc_offset_adjustment)) { - adjust_tsc_offset_host(vcpu, vcpu->arch.tsc_offset_adjustment); - vcpu->arch.tsc_offset_adjustment = 0; - kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); - } - - if (unlikely(vcpu->cpu != cpu) || check_tsc_unstable()) { - s64 tsc_delta = !vcpu->arch.last_host_tsc ? 0 : - rdtsc() - vcpu->arch.last_host_tsc; - if (tsc_delta < 0) - mark_tsc_unstable("KVM discovered backwards TSC"); - - if (check_tsc_unstable()) { - u64 offset = kvm_compute_tsc_offset(vcpu, - vcpu->arch.last_guest_tsc); - kvm_vcpu_write_tsc_offset(vcpu, offset); - vcpu->arch.tsc_catchup = 1; - } - if (kvm_lapic_hv_timer_in_use(vcpu) && - kvm_x86_ops->set_hv_timer(vcpu, - kvm_get_lapic_tscdeadline_msr(vcpu))) - kvm_lapic_switch_to_sw_timer(vcpu); - /* - * On a host with synchronized TSC, there is no need to update - * kvmclock on vcpu->cpu migration - */ - if (!vcpu->kvm->arch.use_master_clock || vcpu->cpu == -1) - kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu); - if (vcpu->cpu != cpu) - kvm_migrate_timers(vcpu); - vcpu->cpu = cpu; - } - - kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu); + vcpu->cpu = cpu; } void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) { kvm_x86_ops->vcpu_put(vcpu); - kvm_put_guest_fpu(vcpu); - vcpu->arch.last_host_tsc = rdtsc(); } static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s) { - if (vcpu->arch.apicv_active) - kvm_x86_ops->sync_pir_to_irr(vcpu); - return kvm_apic_get_state(vcpu, s); } @@ -2878,12 +1439,12 @@ static int kvm_vcpu_ready_for_interrupt_injection(struct kvm_vcpu *vcpu) static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, struct kvm_interrupt *irq) { - if (irq->irq >= KVM_NR_INTERRUPTS) + if (irq->irq >= GVM_NR_INTERRUPTS) return -EINVAL; if (!irqchip_in_kernel(vcpu->kvm)) { kvm_queue_interrupt(vcpu, irq->irq, false); - kvm_make_request(KVM_REQ_EVENT, vcpu); + kvm_make_request(GVM_REQ_EVENT, vcpu); return 0; } @@ -2898,7 +1459,7 @@ static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, return -EEXIST; vcpu->arch.pending_external_vector = irq->irq; - kvm_make_request(KVM_REQ_EVENT, vcpu); + kvm_make_request(GVM_REQ_EVENT, vcpu); return 0; } @@ -2911,7 +1472,7 @@ static int kvm_vcpu_ioctl_nmi(struct kvm_vcpu *vcpu) static int kvm_vcpu_ioctl_smi(struct kvm_vcpu *vcpu) { - kvm_make_request(KVM_REQ_SMI, vcpu); + kvm_make_request(GVM_REQ_SMI, vcpu); return 0; } @@ -2925,80 +1486,6 @@ static int vcpu_ioctl_tpr_access_reporting(struct kvm_vcpu *vcpu, return 0; } -static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu, - u64 mcg_cap) -{ - int r; - unsigned bank_num = mcg_cap & 0xff, bank; - - r = -EINVAL; - if (!bank_num || bank_num >= KVM_MAX_MCE_BANKS) - goto out; - if (mcg_cap & ~(kvm_mce_cap_supported | 0xff | 0xff0000)) - goto out; - r = 0; - vcpu->arch.mcg_cap = mcg_cap; - /* Init IA32_MCG_CTL to all 1s */ - if (mcg_cap & MCG_CTL_P) - vcpu->arch.mcg_ctl = ~(u64)0; - /* Init IA32_MCi_CTL to all 1s */ - for (bank = 0; bank < bank_num; bank++) - vcpu->arch.mce_banks[bank*4] = ~(u64)0; - - if (kvm_x86_ops->setup_mce) - kvm_x86_ops->setup_mce(vcpu); -out: - return r; -} - -static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu, - struct kvm_x86_mce *mce) -{ - u64 mcg_cap = vcpu->arch.mcg_cap; - unsigned bank_num = mcg_cap & 0xff; - u64 *banks = vcpu->arch.mce_banks; - - if (mce->bank >= bank_num || !(mce->status & MCI_STATUS_VAL)) - return -EINVAL; - /* - * if 
IA32_MCG_CTL is not all 1s, the uncorrected error - * reporting is disabled - */ - if ((mce->status & MCI_STATUS_UC) && (mcg_cap & MCG_CTL_P) && - vcpu->arch.mcg_ctl != ~(u64)0) - return 0; - banks += 4 * mce->bank; - /* - * if IA32_MCi_CTL is not all 1s, the uncorrected error - * reporting is disabled for the bank - */ - if ((mce->status & MCI_STATUS_UC) && banks[0] != ~(u64)0) - return 0; - if (mce->status & MCI_STATUS_UC) { - if ((vcpu->arch.mcg_status & MCG_STATUS_MCIP) || - !kvm_read_cr4_bits(vcpu, X86_CR4_MCE)) { - kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); - return 0; - } - if (banks[1] & MCI_STATUS_VAL) - mce->status |= MCI_STATUS_OVER; - banks[2] = mce->addr; - banks[3] = mce->misc; - vcpu->arch.mcg_status = mce->mcg_status; - banks[1] = mce->status; - kvm_queue_exception(vcpu, MC_VECTOR); - } else if (!(banks[1] & MCI_STATUS_VAL) - || !(banks[1] & MCI_STATUS_UC)) { - if (banks[1] & MCI_STATUS_VAL) - mce->status |= MCI_STATUS_OVER; - banks[2] = mce->addr; - banks[3] = mce->misc; - banks[1] = mce->status; - } else - banks[1] |= MCI_STATUS_OVER; - return 0; -} - static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu, struct kvm_vcpu_events *events) { @@ -3030,19 +1517,19 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu, !!(vcpu->arch.hflags & HF_SMM_INSIDE_NMI_MASK); events->smi.latched_init = kvm_lapic_latched_init(vcpu); - events->flags = (KVM_VCPUEVENT_VALID_NMI_PENDING - | KVM_VCPUEVENT_VALID_SHADOW - | KVM_VCPUEVENT_VALID_SMM); + events->flags = (GVM_VCPUEVENT_VALID_NMI_PENDING + | GVM_VCPUEVENT_VALID_SHADOW + | GVM_VCPUEVENT_VALID_SMM); memset(&events->reserved, 0, sizeof(events->reserved)); } static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, struct kvm_vcpu_events *events) { - if (events->flags & ~(KVM_VCPUEVENT_VALID_NMI_PENDING - | KVM_VCPUEVENT_VALID_SIPI_VECTOR - | KVM_VCPUEVENT_VALID_SHADOW - | KVM_VCPUEVENT_VALID_SMM)) + if (events->flags & ~(GVM_VCPUEVENT_VALID_NMI_PENDING + | GVM_VCPUEVENT_VALID_SIPI_VECTOR + | GVM_VCPUEVENT_VALID_SHADOW + | GVM_VCPUEVENT_VALID_SMM)) return -EINVAL; if (events->exception.injected && @@ -3058,20 +1545,20 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, vcpu->arch.interrupt.pending = events->interrupt.injected; vcpu->arch.interrupt.nr = events->interrupt.nr; vcpu->arch.interrupt.soft = events->interrupt.soft; - if (events->flags & KVM_VCPUEVENT_VALID_SHADOW) + if (events->flags & GVM_VCPUEVENT_VALID_SHADOW) kvm_x86_ops->set_interrupt_shadow(vcpu, events->interrupt.shadow); vcpu->arch.nmi_injected = events->nmi.injected; - if (events->flags & KVM_VCPUEVENT_VALID_NMI_PENDING) + if (events->flags & GVM_VCPUEVENT_VALID_NMI_PENDING) vcpu->arch.nmi_pending = events->nmi.pending; kvm_x86_ops->set_nmi_mask(vcpu, events->nmi.masked); - if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR && + if (events->flags & GVM_VCPUEVENT_VALID_SIPI_VECTOR && lapic_in_kernel(vcpu)) vcpu->arch.apic->sipi_vector = events->sipi_vector; - if (events->flags & KVM_VCPUEVENT_VALID_SMM) { + if (events->flags & GVM_VCPUEVENT_VALID_SMM) { if (events->smi.smm) vcpu->arch.hflags |= HF_SMM_MASK; else @@ -3083,13 +1570,13 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, vcpu->arch.hflags &= ~HF_SMM_INSIDE_NMI_MASK; if (lapic_in_kernel(vcpu)) { if (events->smi.latched_init) - set_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events); + set_bit(GVM_APIC_INIT, &vcpu->arch.apic->pending_events); else - clear_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events); + 
clear_bit(GVM_APIC_INIT, &vcpu->arch.apic->pending_events); } } - kvm_make_request(KVM_REQ_EVENT, vcpu); + kvm_make_request(GVM_REQ_EVENT, vcpu); return 0; } @@ -3097,7 +1584,7 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, static void kvm_vcpu_ioctl_x86_get_debugregs(struct kvm_vcpu *vcpu, struct kvm_debugregs *dbgregs) { - unsigned long val; + size_t val; memcpy(dbgregs->db, vcpu->arch.db, sizeof(vcpu->arch.db)); kvm_get_dr(vcpu, 6, &val); @@ -3128,11 +1615,87 @@ static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu, return 0; } +u64 xfeatures_mask; +static unsigned int xstate_offsets[XFEATURE_MAX] = { 0 }; +static unsigned int xstate_sizes[XFEATURE_MAX] = { 0 }; +static unsigned int xstate_comp_offsets[sizeof(xfeatures_mask)*8]; + +/* + * Note that in the future we will likely need a pair of + * functions here: one for user xstates and the other for + * system xstates. For now, they are the same. + */ +static int xfeature_enabled(enum xfeature xfeature) +{ + return !!(xfeatures_mask & ((u64)1 << xfeature)); +} + +/* + * Given an xstate feature mask, calculate where in the xsave + * buffer the state is. Callers should ensure that the buffer + * is valid. + * + * Note: does not work for compacted buffers. + */ +static void *__raw_xsave_addr(struct xregs_state *xsave, int xstate_feature_mask) +{ + int feature_nr = fls64(xstate_feature_mask) - 1; + + if (!xfeature_enabled(feature_nr)) { + return NULL; + } + + return (u8 *)xsave + xstate_comp_offsets[feature_nr]; +} + +/* + * Given the xsave area and a state inside, this function returns the + * address of the state. + * + * This is the API that is called to get xstate address in either + * standard format or compacted format of xsave area. + * + * Note that if there is no data for the field in the xsave buffer + * this will return NULL. + * + * Inputs: + * xstate: the thread's storage area for all FPU data + * xstate_feature: state which is defined in xsave.h (e.g. + * XFEATURE_MASK_FP, XFEATURE_MASK_SSE, etc...) + * Output: + * address of the state in the xsave area, or NULL if the + * field is not present in the xsave buffer. + */ +void *get_xsave_addr(struct xregs_state *xsave, int xstate_feature) +{ + /* + * Do we even *have* xsave state? + */ + if (!boot_cpu_has(X86_FEATURE_XSAVE)) + return NULL; + + /* + * This assumes the last 'xsave*' instruction to + * have requested that 'xstate_feature' be saved. + * If it did not, we might be seeing an old value + * of the field in the buffer. + * + * This can happen because the last 'xsave' did not + * request that this feature be saved (unlikely) + * or because the "init optimization" caused it + * to not be saved. 
+ */ + if (!(xsave->header.xfeatures & xstate_feature)) + return NULL; + + return __raw_xsave_addr(xsave, xstate_feature); +} + #define XSTATE_COMPACTION_ENABLED (1ULL << 63) static void fill_xsave(u8 *dest, struct kvm_vcpu *vcpu) { - struct xregs_state *xsave = &vcpu->arch.guest_fpu.state.xsave; + struct xregs_state *xsave = &vcpu->arch.guest_fpu.xsave; u64 xstate_bv = xsave->header.xfeatures; u64 valid; @@ -3151,7 +1714,7 @@ static void fill_xsave(u8 *dest, struct kvm_vcpu *vcpu) */ valid = xstate_bv & ~XFEATURE_MASK_FPSSE; while (valid) { - u64 feature = valid & -valid; + u64 feature = valid & -(s64)valid; int index = fls64(feature) - 1; void *src = get_xsave_addr(xsave, feature); @@ -3168,7 +1731,7 @@ static void fill_xsave(u8 *dest, struct kvm_vcpu *vcpu) static void load_xsave(struct kvm_vcpu *vcpu, u8 *src) { - struct xregs_state *xsave = &vcpu->arch.guest_fpu.state.xsave; + struct xregs_state *xsave = &vcpu->arch.guest_fpu.xsave; u64 xstate_bv = *(u64 *)(src + XSAVE_HDR_OFFSET); u64 valid; @@ -3189,7 +1752,7 @@ static void load_xsave(struct kvm_vcpu *vcpu, u8 *src) */ valid = xstate_bv & ~XFEATURE_MASK_FPSSE; while (valid) { - u64 feature = valid & -valid; + u64 feature = valid & -(s64)valid; int index = fls64(feature) - 1; void *dest = get_xsave_addr(xsave, feature); @@ -3212,7 +1775,7 @@ static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu, fill_xsave((u8 *) guest_xsave->region, vcpu); } else { memcpy(guest_xsave->region, - &vcpu->arch.guest_fpu.state.fxsave, + &vcpu->arch.guest_fpu.fxsave, sizeof(struct fxregs_state)); *(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)] = XFEATURE_MASK_FPSSE; @@ -3237,7 +1800,7 @@ static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu, } else { if (xstate_bv & ~XFEATURE_MASK_FPSSE) return -EINVAL; - memcpy(&vcpu->arch.guest_fpu.state.fxsave, + memcpy(&vcpu->arch.guest_fpu.fxsave, guest_xsave->region, sizeof(struct fxregs_state)); } return 0; @@ -3265,7 +1828,7 @@ static int kvm_vcpu_ioctl_x86_set_xcrs(struct kvm_vcpu *vcpu, if (!boot_cpu_has(X86_FEATURE_XSAVE)) return -EINVAL; - if (guest_xcrs->nr_xcrs > KVM_MAX_XCRS || guest_xcrs->flags) + if (guest_xcrs->nr_xcrs > GVM_MAX_XCRS || guest_xcrs->flags) return -EINVAL; for (i = 0; i < guest_xcrs->nr_xcrs; i++) @@ -3280,40 +1843,11 @@ static int kvm_vcpu_ioctl_x86_set_xcrs(struct kvm_vcpu *vcpu, return r; } -/* - * kvm_set_guest_paused() indicates to the guest kernel that it has been - * stopped by the hypervisor. This function will be called from the host only. - * EINVAL is returned when the host attempts to set the flag for a guest that - * does not support pv clocks. 
- */ -static int kvm_set_guest_paused(struct kvm_vcpu *vcpu) -{ - if (!vcpu->arch.pv_time_enabled) - return -EINVAL; - vcpu->arch.pvclock_set_guest_stopped_request = true; - kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); - return 0; -} - -static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu, - struct kvm_enable_cap *cap) +long kvm_arch_vcpu_ioctl(struct gvm_device_extension *devext, + PIRP pIrp, unsigned int ioctl) { - if (cap->flags) - return -EINVAL; - - switch (cap->cap) { - case KVM_CAP_HYPERV_SYNIC: - return kvm_hv_activate_synic(vcpu); - default: - return -EINVAL; - } -} - -long kvm_arch_vcpu_ioctl(struct file *filp, - unsigned int ioctl, unsigned long arg) -{ - struct kvm_vcpu *vcpu = filp->private_data; - void __user *argp = (void __user *)arg; + struct kvm_vcpu *vcpu = devext->PrivData; + void __user *argp = (void __user *)pIrp->AssociatedIrp.SystemBuffer; int r; union { struct kvm_lapic_state *lapic; @@ -3324,7 +1858,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp, u.buffer = NULL; switch (ioctl) { - case KVM_GET_LAPIC: { + case GVM_GET_LAPIC: { r = -EINVAL; if (!lapic_in_kernel(vcpu)) goto out; @@ -3336,13 +1870,11 @@ long kvm_arch_vcpu_ioctl(struct file *filp, r = kvm_vcpu_ioctl_get_lapic(vcpu, u.lapic); if (r) goto out; - r = -EFAULT; - if (copy_to_user(argp, u.lapic, sizeof(struct kvm_lapic_state))) - goto out; - r = 0; + r = gvmUpdateReturnBuffer(pIrp, 0, u.lapic, + sizeof(struct kvm_lapic_state)); break; } - case KVM_SET_LAPIC: { + case GVM_SET_LAPIC: { r = -EINVAL; if (!lapic_in_kernel(vcpu)) goto out; @@ -3353,7 +1885,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp, r = kvm_vcpu_ioctl_set_lapic(vcpu, u.lapic); break; } - case KVM_INTERRUPT: { + case GVM_INTERRUPT: { struct kvm_interrupt irq; r = -EFAULT; @@ -3362,59 +1894,50 @@ long kvm_arch_vcpu_ioctl(struct file *filp, r = kvm_vcpu_ioctl_interrupt(vcpu, &irq); break; } - case KVM_NMI: { + case GVM_NMI: { r = kvm_vcpu_ioctl_nmi(vcpu); break; } - case KVM_SMI: { + case GVM_SMI: { r = kvm_vcpu_ioctl_smi(vcpu); break; } - case KVM_SET_CPUID: { + case GVM_SET_CPUID: { struct kvm_cpuid __user *cpuid_arg = argp; struct kvm_cpuid cpuid; r = -EFAULT; if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid)) goto out; - r = kvm_vcpu_ioctl_set_cpuid(vcpu, &cpuid, cpuid_arg->entries); - break; - } - case KVM_SET_CPUID2: { - struct kvm_cpuid2 __user *cpuid_arg = argp; - struct kvm_cpuid2 cpuid; - - r = -EFAULT; - if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid)) - goto out; - r = kvm_vcpu_ioctl_set_cpuid2(vcpu, &cpuid, + r = kvm_vcpu_ioctl_set_cpuid(vcpu, &cpuid, cpuid_arg->entries); break; } - case KVM_GET_CPUID2: { - struct kvm_cpuid2 __user *cpuid_arg = argp; - struct kvm_cpuid2 cpuid; + case GVM_GET_CPUID: { + struct kvm_cpuid __user *cpuid_arg = argp; + struct kvm_cpuid cpuid; r = -EFAULT; if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid)) goto out; - r = kvm_vcpu_ioctl_get_cpuid2(vcpu, &cpuid, + r = kvm_vcpu_ioctl_get_cpuid(vcpu, &cpuid, cpuid_arg->entries); if (r) goto out; - r = -EFAULT; - if (copy_to_user(cpuid_arg, &cpuid, sizeof cpuid)) + r = gvmUpdateReturnBuffer(pIrp, 0, &cpuid, sizeof(cpuid)); + if (r) goto out; - r = 0; + r = gvmUpdateReturnBuffer(pIrp, sizeof(cpuid), &vcpu->arch.cpuid_entries, + vcpu->arch.cpuid_nent * sizeof(struct kvm_cpuid_entry)); break; } - case KVM_GET_MSRS: - r = msr_io(vcpu, argp, do_get_msr, 1); + case GVM_GET_MSRS: + r = msr_io(pIrp, vcpu, argp, do_get_msr, 1); break; - case KVM_SET_MSRS: - r = msr_io(vcpu, argp, do_set_msr, 0); + case GVM_SET_MSRS: + r = msr_io(pIrp, vcpu, argp, 
do_set_msr, 0); break; - case KVM_TPR_ACCESS_REPORTING: { + case GVM_TPR_ACCESS_REPORTING: { struct kvm_tpr_access_ctl tac; r = -EFAULT; @@ -3423,13 +1946,10 @@ long kvm_arch_vcpu_ioctl(struct file *filp, r = vcpu_ioctl_tpr_access_reporting(vcpu, &tac); if (r) goto out; - r = -EFAULT; - if (copy_to_user(argp, &tac, sizeof tac)) - goto out; - r = 0; + r = gvmUpdateReturnBuffer(pIrp, 0, &tac, sizeof(tac)); break; }; - case KVM_SET_VAPIC_ADDR: { + case GVM_SET_VAPIC_ADDR: { struct kvm_vapic_addr va; int idx; @@ -3444,36 +1964,16 @@ long kvm_arch_vcpu_ioctl(struct file *filp, srcu_read_unlock(&vcpu->kvm->srcu, idx); break; } - case KVM_X86_SETUP_MCE: { - u64 mcg_cap; - - r = -EFAULT; - if (copy_from_user(&mcg_cap, argp, sizeof mcg_cap)) - goto out; - r = kvm_vcpu_ioctl_x86_setup_mce(vcpu, mcg_cap); - break; - } - case KVM_X86_SET_MCE: { - struct kvm_x86_mce mce; - - r = -EFAULT; - if (copy_from_user(&mce, argp, sizeof mce)) - goto out; - r = kvm_vcpu_ioctl_x86_set_mce(vcpu, &mce); - break; - } - case KVM_GET_VCPU_EVENTS: { + case GVM_GET_VCPU_EVENTS: { struct kvm_vcpu_events events; kvm_vcpu_ioctl_x86_get_vcpu_events(vcpu, &events); - r = -EFAULT; - if (copy_to_user(argp, &events, sizeof(struct kvm_vcpu_events))) - break; - r = 0; + r = gvmUpdateReturnBuffer(pIrp, 0, &events, + sizeof(struct kvm_vcpu_events)); break; } - case KVM_SET_VCPU_EVENTS: { + case GVM_SET_VCPU_EVENTS: { struct kvm_vcpu_events events; r = -EFAULT; @@ -3483,19 +1983,16 @@ long kvm_arch_vcpu_ioctl(struct file *filp, r = kvm_vcpu_ioctl_x86_set_vcpu_events(vcpu, &events); break; } - case KVM_GET_DEBUGREGS: { + case GVM_GET_DEBUGREGS: { struct kvm_debugregs dbgregs; kvm_vcpu_ioctl_x86_get_debugregs(vcpu, &dbgregs); - r = -EFAULT; - if (copy_to_user(argp, &dbgregs, - sizeof(struct kvm_debugregs))) - break; - r = 0; + r = gvmUpdateReturnBuffer(pIrp, 0, &dbgregs, + sizeof(struct kvm_debugregs)); break; } - case KVM_SET_DEBUGREGS: { + case GVM_SET_DEBUGREGS: { struct kvm_debugregs dbgregs; r = -EFAULT; @@ -3506,7 +2003,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp, r = kvm_vcpu_ioctl_x86_set_debugregs(vcpu, &dbgregs); break; } - case KVM_GET_XSAVE: { + case GVM_GET_XSAVE: { u.xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL); r = -ENOMEM; if (!u.xsave) @@ -3514,13 +2011,11 @@ long kvm_arch_vcpu_ioctl(struct file *filp, kvm_vcpu_ioctl_x86_get_xsave(vcpu, u.xsave); - r = -EFAULT; - if (copy_to_user(argp, u.xsave, sizeof(struct kvm_xsave))) - break; - r = 0; + r = gvmUpdateReturnBuffer(pIrp, 0, u.xsave, + sizeof(struct kvm_xsave)); break; } - case KVM_SET_XSAVE: { + case GVM_SET_XSAVE: { u.xsave = memdup_user(argp, sizeof(*u.xsave)); if (IS_ERR(u.xsave)) return PTR_ERR(u.xsave); @@ -3528,7 +2023,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp, r = kvm_vcpu_ioctl_x86_set_xsave(vcpu, u.xsave); break; } - case KVM_GET_XCRS: { + case GVM_GET_XCRS: { u.xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL); r = -ENOMEM; if (!u.xcrs) @@ -3536,14 +2031,11 @@ long kvm_arch_vcpu_ioctl(struct file *filp, kvm_vcpu_ioctl_x86_get_xcrs(vcpu, u.xcrs); - r = -EFAULT; - if (copy_to_user(argp, u.xcrs, - sizeof(struct kvm_xcrs))) - break; - r = 0; + r = gvmUpdateReturnBuffer(pIrp, 0, u.xcrs, + sizeof(struct kvm_xcrs)); break; } - case KVM_SET_XCRS: { + case GVM_SET_XCRS: { u.xcrs = memdup_user(argp, sizeof(*u.xcrs)); if (IS_ERR(u.xcrs)) return PTR_ERR(u.xcrs); @@ -3551,40 +2043,6 @@ long kvm_arch_vcpu_ioctl(struct file *filp, r = kvm_vcpu_ioctl_x86_set_xcrs(vcpu, u.xcrs); break; } - case KVM_SET_TSC_KHZ: { - u32 user_tsc_khz; - - r = -EINVAL; - 
user_tsc_khz = (u32)arg; - - if (user_tsc_khz >= kvm_max_guest_tsc_khz) - goto out; - - if (user_tsc_khz == 0) - user_tsc_khz = tsc_khz; - - if (!kvm_set_tsc_khz(vcpu, user_tsc_khz)) - r = 0; - - goto out; - } - case KVM_GET_TSC_KHZ: { - r = vcpu->arch.virtual_tsc_khz; - goto out; - } - case KVM_KVMCLOCK_CTRL: { - r = kvm_set_guest_paused(vcpu); - goto out; - } - case KVM_ENABLE_CAP: { - struct kvm_enable_cap cap; - - r = -EFAULT; - if (copy_from_user(&cap, argp, sizeof(cap))) - goto out; - r = kvm_vcpu_ioctl_enable_cap(vcpu, &cap); - break; - } default: r = -EINVAL; } @@ -3593,12 +2051,7 @@ out: return r; } -int kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf) -{ - return VM_FAULT_SIGBUS; -} - -static int kvm_vm_ioctl_set_tss_addr(struct kvm *kvm, unsigned long addr) +static int kvm_vm_ioctl_set_tss_addr(struct kvm *kvm, size_t addr) { int ret; @@ -3618,7 +2071,7 @@ static int kvm_vm_ioctl_set_identity_map_addr(struct kvm *kvm, static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm, u32 kvm_nr_mmu_pages) { - if (kvm_nr_mmu_pages < KVM_MIN_ALLOC_MMU_PAGES) + if (kvm_nr_mmu_pages < GVM_MIN_ALLOC_MMU_PAGES) return -EINVAL; mutex_lock(&kvm->slots_lock); @@ -3641,17 +2094,17 @@ static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) r = 0; switch (chip->chip_id) { - case KVM_IRQCHIP_PIC_MASTER: + case GVM_IRQCHIP_PIC_MASTER: memcpy(&chip->chip.pic, &pic_irqchip(kvm)->pics[0], sizeof(struct kvm_pic_state)); break; - case KVM_IRQCHIP_PIC_SLAVE: + case GVM_IRQCHIP_PIC_SLAVE: memcpy(&chip->chip.pic, &pic_irqchip(kvm)->pics[1], sizeof(struct kvm_pic_state)); break; - case KVM_IRQCHIP_IOAPIC: + case GVM_IRQCHIP_IOAPIC: r = kvm_get_ioapic(kvm, &chip->chip.ioapic); break; default: @@ -3667,21 +2120,21 @@ static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) r = 0; switch (chip->chip_id) { - case KVM_IRQCHIP_PIC_MASTER: + case GVM_IRQCHIP_PIC_MASTER: spin_lock(&pic_irqchip(kvm)->lock); memcpy(&pic_irqchip(kvm)->pics[0], &chip->chip.pic, sizeof(struct kvm_pic_state)); spin_unlock(&pic_irqchip(kvm)->lock); break; - case KVM_IRQCHIP_PIC_SLAVE: + case GVM_IRQCHIP_PIC_SLAVE: spin_lock(&pic_irqchip(kvm)->lock); memcpy(&pic_irqchip(kvm)->pics[1], &chip->chip.pic, sizeof(struct kvm_pic_state)); spin_unlock(&pic_irqchip(kvm)->lock); break; - case KVM_IRQCHIP_IOAPIC: + case GVM_IRQCHIP_IOAPIC: r = kvm_set_ioapic(kvm, &chip->chip.ioapic); break; default: @@ -3692,83 +2145,6 @@ static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) return r; } -static int kvm_vm_ioctl_get_pit(struct kvm *kvm, struct kvm_pit_state *ps) -{ - struct kvm_kpit_state *kps = &kvm->arch.vpit->pit_state; - - BUILD_BUG_ON(sizeof(*ps) != sizeof(kps->channels)); - - mutex_lock(&kps->lock); - memcpy(ps, &kps->channels, sizeof(*ps)); - mutex_unlock(&kps->lock); - return 0; -} - -static int kvm_vm_ioctl_set_pit(struct kvm *kvm, struct kvm_pit_state *ps) -{ - int i; - struct kvm_pit *pit = kvm->arch.vpit; - - mutex_lock(&pit->pit_state.lock); - memcpy(&pit->pit_state.channels, ps, sizeof(*ps)); - for (i = 0; i < 3; i++) - kvm_pit_load_count(pit, i, ps->channels[i].count, 0); - mutex_unlock(&pit->pit_state.lock); - return 0; -} - -static int kvm_vm_ioctl_get_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps) -{ - mutex_lock(&kvm->arch.vpit->pit_state.lock); - memcpy(ps->channels, &kvm->arch.vpit->pit_state.channels, - sizeof(ps->channels)); - ps->flags = kvm->arch.vpit->pit_state.flags; - mutex_unlock(&kvm->arch.vpit->pit_state.lock); - memset(&ps->reserved, 0, 
sizeof(ps->reserved)); - return 0; -} - -static int kvm_vm_ioctl_set_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps) -{ - int start = 0; - int i; - u32 prev_legacy, cur_legacy; - struct kvm_pit *pit = kvm->arch.vpit; - - mutex_lock(&pit->pit_state.lock); - prev_legacy = pit->pit_state.flags & KVM_PIT_FLAGS_HPET_LEGACY; - cur_legacy = ps->flags & KVM_PIT_FLAGS_HPET_LEGACY; - if (!prev_legacy && cur_legacy) - start = 1; - memcpy(&pit->pit_state.channels, &ps->channels, - sizeof(pit->pit_state.channels)); - pit->pit_state.flags = ps->flags; - for (i = 0; i < 3; i++) - kvm_pit_load_count(pit, i, pit->pit_state.channels[i].count, - start && i == 0); - mutex_unlock(&pit->pit_state.lock); - return 0; -} - -static int kvm_vm_ioctl_reinject(struct kvm *kvm, - struct kvm_reinject_control *control) -{ - struct kvm_pit *pit = kvm->arch.vpit; - - if (!pit) - return -ENXIO; - - /* pit->pit_state.lock was overloaded to prevent userspace from getting - * an inconsistent state after running multiple KVM_REINJECT_CONTROL - * ioctls in parallel. Use a separate lock if that ioctl isn't rare. - */ - mutex_lock(&pit->pit_state.lock); - kvm_pit_set_reinject(pit, control->pit_reinject); - mutex_unlock(&pit->pit_state.lock); - - return 0; -} - /** * kvm_vm_ioctl_get_dirty_log - get and clear the log of dirty pages in a slot * @kvm: kvm instance @@ -3779,7 +2155,7 @@ static int kvm_vm_ioctl_reinject(struct kvm *kvm, * * We call kvm_get_dirty_log_protect() to handle steps 1-3, upon return we * always flush the TLB (step 4) even if previous step failed and the dirty - * bitmap may be corrupt. Regardless of previous outcome the KVM logging API + * bitmap may be corrupt. Regardless of previous outcome the kvm logging API * does not preclude user space subsequent dirty log read. Flushing TLB ensures * writes will be marked dirty for next log read. * @@ -3791,7 +2167,7 @@ static int kvm_vm_ioctl_reinject(struct kvm *kvm, int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log) { bool is_dirty = false; - int r; + int r = 0; mutex_lock(&kvm->slots_lock); @@ -3807,7 +2183,6 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log) * All the TLBs can be flushed out of mmu lock, see the comments in * kvm_mmu_slot_remove_write_access(). */ - lockdep_assert_held(&kvm->slots_lock); if (is_dirty) kvm_flush_remote_tlbs(kvm); @@ -3821,7 +2196,7 @@ int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event, if (!irqchip_in_kernel(kvm)) return -ENXIO; - irq_event->status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, + irq_event->status = kvm_set_irq(kvm, GVM_USERSPACE_IRQ_SOURCE_ID, irq_event->irq, irq_event->level, line_status); return 0; @@ -3836,44 +2211,10 @@ static int kvm_vm_ioctl_enable_cap(struct kvm *kvm, return -EINVAL; switch (cap->cap) { - case KVM_CAP_DISABLE_QUIRKS: + case GVM_CAP_DISABLE_QUIRKS: kvm->arch.disabled_quirks = cap->args[0]; r = 0; break; - case KVM_CAP_SPLIT_IRQCHIP: { - mutex_lock(&kvm->lock); - r = -EINVAL; - if (cap->args[0] > MAX_NR_RESERVED_IOAPIC_PINS) - goto split_irqchip_unlock; - r = -EEXIST; - if (irqchip_in_kernel(kvm)) - goto split_irqchip_unlock; - if (kvm->created_vcpus) - goto split_irqchip_unlock; - r = kvm_setup_empty_irq_routing(kvm); - if (r) - goto split_irqchip_unlock; - /* Pairs with irqchip_in_kernel. 
*/ - smp_wmb(); - kvm->arch.irqchip_split = true; - kvm->arch.nr_reserved_ioapic_pins = cap->args[0]; - r = 0; -split_irqchip_unlock: - mutex_unlock(&kvm->lock); - break; - } - case KVM_CAP_X2APIC_API: - r = -EINVAL; - if (cap->args[0] & ~KVM_X2APIC_API_VALID_FLAGS) - break; - - if (cap->args[0] & KVM_X2APIC_API_USE_32BIT_IDS) - kvm->arch.x2apic_format = true; - if (cap->args[0] & KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK) - kvm->arch.x2apic_broadcast_quirk_disabled = true; - - r = 0; - break; default: r = -EINVAL; break; @@ -3881,43 +2222,39 @@ split_irqchip_unlock: return r; } -long kvm_arch_vm_ioctl(struct file *filp, - unsigned int ioctl, unsigned long arg) +long kvm_arch_vm_ioctl(struct gvm_device_extension *devext, + PIRP pIrp, unsigned int ioctl) { - struct kvm *kvm = filp->private_data; - void __user *argp = (void __user *)arg; + struct kvm *kvm = devext->PrivData; + void __user *argp = (void __user *)pIrp->AssociatedIrp.SystemBuffer; int r = -ENOTTY; - /* - * This union makes it completely explicit to gcc-3.x - * that these two variables' stack usage should be - * combined, not added together. - */ - union { - struct kvm_pit_state ps; - struct kvm_pit_state2 ps2; - struct kvm_pit_config pit_config; - } u; switch (ioctl) { - case KVM_SET_TSS_ADDR: - r = kvm_vm_ioctl_set_tss_addr(kvm, arg); + case GVM_SET_TSS_ADDR: + r = -EFAULT; + if (IoGetCurrentIrpStackLocation(pIrp)->Parameters.DeviceIoControl.InputBufferLength + < sizeof(size_t)) + goto out; + r = kvm_vm_ioctl_set_tss_addr(kvm, *(size_t *)argp); break; - case KVM_SET_IDENTITY_MAP_ADDR: { + case GVM_SET_IDENTITY_MAP_ADDR: { u64 ident_addr; r = -EFAULT; - if (copy_from_user(&ident_addr, argp, sizeof ident_addr)) + if (IoGetCurrentIrpStackLocation(pIrp)->Parameters.DeviceIoControl.InputBufferLength + < sizeof(ident_addr)) goto out; + ident_addr = *(u64 *)argp; r = kvm_vm_ioctl_set_identity_map_addr(kvm, ident_addr); break; } - case KVM_SET_NR_MMU_PAGES: - r = kvm_vm_ioctl_set_nr_mmu_pages(kvm, arg); + case GVM_SET_NR_MMU_PAGES: + r = kvm_vm_ioctl_set_nr_mmu_pages(kvm, *(unsigned int*)argp); break; - case KVM_GET_NR_MMU_PAGES: + case GVM_GET_NR_MMU_PAGES: r = kvm_vm_ioctl_get_nr_mmu_pages(kvm); break; - case KVM_CREATE_IRQCHIP: { + case GVM_CREATE_IRQCHIP: { struct kvm_pic *vpic; mutex_lock(&kvm->lock); @@ -3956,27 +2293,7 @@ long kvm_arch_vm_ioctl(struct file *filp, mutex_unlock(&kvm->lock); break; } - case KVM_CREATE_PIT: - u.pit_config.flags = KVM_PIT_SPEAKER_DUMMY; - goto create_pit; - case KVM_CREATE_PIT2: - r = -EFAULT; - if (copy_from_user(&u.pit_config, argp, - sizeof(struct kvm_pit_config))) - goto out; - create_pit: - mutex_lock(&kvm->lock); - r = -EEXIST; - if (kvm->arch.vpit) - goto create_pit_unlock; - r = -ENOMEM; - kvm->arch.vpit = kvm_create_pit(kvm, u.pit_config.flags); - if (kvm->arch.vpit) - r = 0; - create_pit_unlock: - mutex_unlock(&kvm->lock); - break; - case KVM_GET_IRQCHIP: { + case GVM_GET_IRQCHIP: { /* 0: PIC master, 1: PIC slave, 2: IOAPIC */ struct kvm_irqchip *chip; @@ -3987,20 +2304,17 @@ long kvm_arch_vm_ioctl(struct file *filp, } r = -ENXIO; - if (!irqchip_in_kernel(kvm) || irqchip_split(kvm)) + if (!irqchip_in_kernel(kvm)) goto get_irqchip_out; r = kvm_vm_ioctl_get_irqchip(kvm, chip); if (r) goto get_irqchip_out; - r = -EFAULT; - if (copy_to_user(argp, chip, sizeof *chip)) - goto get_irqchip_out; - r = 0; + r = gvmUpdateReturnBuffer(pIrp, 0, chip, sizeof(*chip)); get_irqchip_out: kfree(chip); break; } - case KVM_SET_IRQCHIP: { + case GVM_SET_IRQCHIP: { /* 0: PIC master, 1: PIC slave, 2: IOAPIC */ 
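 /*
  * Note: on this port both the ioctl input and its result live in the
  * IRP's shared system buffer (pIrp->AssociatedIrp.SystemBuffer), so
  * data is handed back by copying it at an explicit offset with
  * gvmUpdateReturnBuffer() rather than with copy_to_user().
  */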
struct kvm_irqchip *chip; @@ -4011,7 +2325,7 @@ long kvm_arch_vm_ioctl(struct file *filp, } r = -ENXIO; - if (!irqchip_in_kernel(kvm) || irqchip_split(kvm)) + if (!irqchip_in_kernel(kvm)) goto set_irqchip_out; r = kvm_vm_ioctl_set_irqchip(kvm, chip); if (r) @@ -4021,121 +2335,16 @@ long kvm_arch_vm_ioctl(struct file *filp, kfree(chip); break; } - case KVM_GET_PIT: { - r = -EFAULT; - if (copy_from_user(&u.ps, argp, sizeof(struct kvm_pit_state))) - goto out; - r = -ENXIO; - if (!kvm->arch.vpit) - goto out; - r = kvm_vm_ioctl_get_pit(kvm, &u.ps); - if (r) - goto out; - r = -EFAULT; - if (copy_to_user(argp, &u.ps, sizeof(struct kvm_pit_state))) - goto out; - r = 0; - break; - } - case KVM_SET_PIT: { - r = -EFAULT; - if (copy_from_user(&u.ps, argp, sizeof u.ps)) - goto out; - r = -ENXIO; - if (!kvm->arch.vpit) - goto out; - r = kvm_vm_ioctl_set_pit(kvm, &u.ps); - break; - } - case KVM_GET_PIT2: { - r = -ENXIO; - if (!kvm->arch.vpit) - goto out; - r = kvm_vm_ioctl_get_pit2(kvm, &u.ps2); - if (r) - goto out; - r = -EFAULT; - if (copy_to_user(argp, &u.ps2, sizeof(u.ps2))) - goto out; - r = 0; - break; - } - case KVM_SET_PIT2: { - r = -EFAULT; - if (copy_from_user(&u.ps2, argp, sizeof(u.ps2))) - goto out; - r = -ENXIO; - if (!kvm->arch.vpit) - goto out; - r = kvm_vm_ioctl_set_pit2(kvm, &u.ps2); - break; - } - case KVM_REINJECT_CONTROL: { - struct kvm_reinject_control control; - r = -EFAULT; - if (copy_from_user(&control, argp, sizeof(control))) - goto out; - r = kvm_vm_ioctl_reinject(kvm, &control); - break; - } - case KVM_SET_BOOT_CPU_ID: + case GVM_SET_BOOT_CPU_ID: r = 0; mutex_lock(&kvm->lock); if (kvm->created_vcpus) r = -EBUSY; else - kvm->arch.bsp_vcpu_id = arg; + kvm->arch.bsp_vcpu_id = *(u32 *)argp; mutex_unlock(&kvm->lock); break; - case KVM_XEN_HVM_CONFIG: { - r = -EFAULT; - if (copy_from_user(&kvm->arch.xen_hvm_config, argp, - sizeof(struct kvm_xen_hvm_config))) - goto out; - r = -EINVAL; - if (kvm->arch.xen_hvm_config.flags) - goto out; - r = 0; - break; - } - case KVM_SET_CLOCK: { - struct kvm_clock_data user_ns; - u64 now_ns; - - r = -EFAULT; - if (copy_from_user(&user_ns, argp, sizeof(user_ns))) - goto out; - - r = -EINVAL; - if (user_ns.flags) - goto out; - - r = 0; - local_irq_disable(); - now_ns = __get_kvmclock_ns(kvm); - kvm->arch.kvmclock_offset += user_ns.clock - now_ns; - local_irq_enable(); - kvm_gen_update_masterclock(kvm); - break; - } - case KVM_GET_CLOCK: { - struct kvm_clock_data user_ns; - u64 now_ns; - - local_irq_disable(); - now_ns = __get_kvmclock_ns(kvm); - user_ns.clock = now_ns; - user_ns.flags = kvm->arch.use_master_clock ? 
KVM_CLOCK_TSC_STABLE : 0; - local_irq_enable(); - memset(&user_ns.pad, 0, sizeof(user_ns.pad)); - - r = -EFAULT; - if (copy_to_user(argp, &user_ns, sizeof(user_ns))) - goto out; - r = 0; - break; - } - case KVM_ENABLE_CAP: { + case GVM_ENABLE_CAP: { struct kvm_enable_cap cap; r = -EFAULT; @@ -4145,7 +2354,7 @@ long kvm_arch_vm_ioctl(struct file *filp, break; } default: - r = kvm_vm_ioctl_assigned_device(kvm, ioctl, arg); + break; } out: return r; @@ -4183,6 +2392,7 @@ static void kvm_init_msr_list(void) } num_msrs_to_save = j; +#if 0 for (i = j = 0; i < ARRAY_SIZE(emulated_msrs); i++) { switch (emulated_msrs[i]) { case MSR_IA32_SMBASE: @@ -4198,6 +2408,7 @@ static void kvm_init_msr_list(void) j++; } num_emulated_msrs = j; +#endif } static int vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len, @@ -4205,17 +2416,20 @@ static int vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len, { int handled = 0; int n; + const char *__v = v; do { n = min(len, 8); if (!(lapic_in_kernel(vcpu) && !kvm_iodevice_write(vcpu, &vcpu->arch.apic->dev, addr, n, v)) - && kvm_io_bus_write(vcpu, KVM_MMIO_BUS, addr, n, v)) + && kvm_io_bus_write(vcpu, GVM_MMIO_BUS, addr, n, v)) break; handled += n; addr += n; len -= n; - v += n; + __v = (char *)v; + __v += n; + v = (void *)__v; } while (len); return handled; @@ -4225,19 +2439,21 @@ static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v) { int handled = 0; int n; + char *__v; do { n = min(len, 8); if (!(lapic_in_kernel(vcpu) && !kvm_iodevice_read(vcpu, &vcpu->arch.apic->dev, addr, n, v)) - && kvm_io_bus_read(vcpu, KVM_MMIO_BUS, addr, n, v)) + && kvm_io_bus_read(vcpu, GVM_MMIO_BUS, addr, n, v)) break; - trace_kvm_mmio(KVM_TRACE_MMIO_READ, n, addr, *(u64 *)v); handled += n; addr += n; len -= n; - v += n; + __v = (char *)v; + __v += n; + v = (void *)__v; } while (len); return handled; @@ -4304,6 +2520,7 @@ static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes, struct x86_exception *exception) { void *data = val; + char *__data; int r = X86EMUL_CONTINUE; while (bytes) { @@ -4323,7 +2540,9 @@ static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes, } bytes -= toread; - data += toread; + __data = (char *)data; + __data += toread; + data = (void *)__data; addr += toread; } out: @@ -4367,7 +2586,6 @@ int kvm_read_guest_virt(struct x86_emulate_ctxt *ctxt, return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access, exception); } -EXPORT_SYMBOL_GPL(kvm_read_guest_virt); static int kvm_read_guest_virt_system(struct x86_emulate_ctxt *ctxt, gva_t addr, void *val, unsigned int bytes, @@ -4378,7 +2596,7 @@ static int kvm_read_guest_virt_system(struct x86_emulate_ctxt *ctxt, } static int kvm_read_guest_phys_system(struct x86_emulate_ctxt *ctxt, - unsigned long addr, void *val, unsigned int bytes) + size_t addr, void *val, unsigned int bytes) { struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); int r = kvm_vcpu_read_guest(vcpu, addr, val, bytes); @@ -4393,6 +2611,7 @@ int kvm_write_guest_virt_system(struct x86_emulate_ctxt *ctxt, { struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); void *data = val; + char *__data; int r = X86EMUL_CONTINUE; while (bytes) { @@ -4412,15 +2631,16 @@ int kvm_write_guest_virt_system(struct x86_emulate_ctxt *ctxt, } bytes -= towrite; - data += towrite; + __data = (char *)data; + __data += towrite; + data = (void *)__data; addr += towrite; } out: return r; } -EXPORT_SYMBOL_GPL(kvm_write_guest_virt_system); -static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva, 
+static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, size_t gva, gpa_t *gpa, struct x86_exception *exception, bool write) { @@ -4437,7 +2657,6 @@ static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva, vcpu->arch.access, 0, access)) { *gpa = vcpu->arch.mmio_gfn << PAGE_SHIFT | (gva & (PAGE_SIZE - 1)); - trace_vcpu_match_mmio(gva, *gpa, write, false); return 1; } @@ -4451,7 +2670,6 @@ static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva, return 1; if (vcpu_match_mmio_gpa(vcpu, *gpa)) { - trace_vcpu_match_mmio(gva, *gpa, write, true); return 1; } @@ -4485,8 +2703,6 @@ struct read_write_emulator_ops { static int read_prepare(struct kvm_vcpu *vcpu, void *val, int bytes) { if (vcpu->mmio_read_completed) { - trace_kvm_mmio(KVM_TRACE_MMIO_READ, bytes, - vcpu->mmio_fragments[0].gpa, *(u64 *)val); vcpu->mmio_read_completed = 0; return 1; } @@ -4508,14 +2724,12 @@ static int write_emulate(struct kvm_vcpu *vcpu, gpa_t gpa, static int write_mmio(struct kvm_vcpu *vcpu, gpa_t gpa, int bytes, void *val) { - trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, *(u64 *)val); return vcpu_mmio_write(vcpu, gpa, bytes, val); } static int read_exit_mmio(struct kvm_vcpu *vcpu, gpa_t gpa, void *val, int bytes) { - trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, bytes, gpa, 0); return X86EMUL_IO_NEEDED; } @@ -4542,7 +2756,7 @@ static const struct read_write_emulator_ops write_emultor = { .write = true, }; -static int emulator_read_write_onepage(unsigned long addr, void *val, +static int emulator_read_write_onepage(size_t addr, void *val, unsigned int bytes, struct x86_exception *exception, struct kvm_vcpu *vcpu, @@ -4552,6 +2766,7 @@ static int emulator_read_write_onepage(unsigned long addr, void *val, int handled, ret; bool write = ops->write; struct kvm_mmio_fragment *frag; + char *__val; ret = vcpu_mmio_gva_to_gpa(vcpu, addr, &gpa, exception, write); @@ -4575,9 +2790,11 @@ mmio: gpa += handled; bytes -= handled; - val += handled; + __val = val; + __val += handled; + val = __val; - WARN_ON(vcpu->mmio_nr_fragments >= KVM_MAX_MMIO_FRAGMENTS); + WARN_ON(vcpu->mmio_nr_fragments >= GVM_MAX_MMIO_FRAGMENTS); frag = &vcpu->mmio_fragments[vcpu->mmio_nr_fragments++]; frag->gpa = gpa; frag->data = val; @@ -4586,7 +2803,7 @@ mmio: } static int emulator_read_write(struct x86_emulate_ctxt *ctxt, - unsigned long addr, + size_t addr, void *val, unsigned int bytes, struct x86_exception *exception, const struct read_write_emulator_ops *ops) @@ -4594,6 +2811,7 @@ static int emulator_read_write(struct x86_emulate_ctxt *ctxt, struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); gpa_t gpa; int rc; + char *__val; if (ops->read_write_prepare && ops->read_write_prepare(vcpu, val, bytes)) @@ -4605,7 +2823,7 @@ static int emulator_read_write(struct x86_emulate_ctxt *ctxt, if (((addr + bytes - 1) ^ addr) & PAGE_MASK) { int now; - now = -addr & ~PAGE_MASK; + now = -(ssize_t)addr & ~PAGE_MASK; rc = emulator_read_write_onepage(addr, val, now, exception, vcpu, ops); @@ -4614,7 +2832,9 @@ static int emulator_read_write(struct x86_emulate_ctxt *ctxt, addr += now; if (ctxt->mode != X86EMUL_MODE_PROT64) addr = (u32)addr; - val += now; + __val = val; + __val += now; + val = __val; bytes -= now; } @@ -4633,14 +2853,14 @@ static int emulator_read_write(struct x86_emulate_ctxt *ctxt, vcpu->run->mmio.len = min(8u, vcpu->mmio_fragments[0].len); vcpu->run->mmio.is_write = vcpu->mmio_is_write = ops->write; - vcpu->run->exit_reason = KVM_EXIT_MMIO; + vcpu->run->exit_reason = GVM_EXIT_MMIO; vcpu->run->mmio.phys_addr = gpa; return 
ops->read_write_exit_mmio(vcpu, gpa, val, bytes); } static int emulator_read_emulated(struct x86_emulate_ctxt *ctxt, - unsigned long addr, + size_t addr, void *val, unsigned int bytes, struct x86_exception *exception) @@ -4650,7 +2870,7 @@ static int emulator_read_emulated(struct x86_emulate_ctxt *ctxt, } static int emulator_write_emulated(struct x86_emulate_ctxt *ctxt, - unsigned long addr, + size_t addr, const void *val, unsigned int bytes, struct x86_exception *exception) @@ -4670,7 +2890,7 @@ static int emulator_write_emulated(struct x86_emulate_ctxt *ctxt, #endif static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt, - unsigned long addr, + size_t addr, const void *old, const void *new, unsigned int bytes, @@ -4678,9 +2898,10 @@ static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt, { struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); gpa_t gpa; - struct page *page; char *kaddr; bool exchanged; + size_t hva; + PMDL kmap_mdl; /* guests cmpxchg8b have to be emulated atomically */ if (bytes > 8 || (bytes & (bytes - 1))) @@ -4695,11 +2916,16 @@ static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt, if (((gpa + bytes - 1) & PAGE_MASK) != (gpa & PAGE_MASK)) goto emul_write; - page = kvm_vcpu_gfn_to_page(vcpu, gpa >> PAGE_SHIFT); - if (is_error_page(page)) + hva = gfn_to_hva(vcpu->kvm, gpa >> PAGE_SHIFT); + if (kvm_is_error_hva(hva)) goto emul_write; - kaddr = kmap_atomic(page); + if (get_user_pages_fast(hva, 1, 1, &kmap_mdl) != 1) + goto emul_write; + + kaddr = kmap_atomic(kmap_mdl); + if (!kaddr) + goto emul_write; kaddr += offset_in_page(gpa); switch (bytes) { case 1: @@ -4717,8 +2943,7 @@ static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt, default: BUG(); } - kunmap_atomic(kaddr); - kvm_release_page_dirty(page); + kunmap_atomic(kmap_mdl); if (!exchanged) return X86EMUL_CMPXCHG_FAILED; @@ -4740,10 +2965,10 @@ static int kernel_pio(struct kvm_vcpu *vcpu, void *pd) int r; if (vcpu->arch.pio.in) - r = kvm_io_bus_read(vcpu, KVM_PIO_BUS, vcpu->arch.pio.port, + r = kvm_io_bus_read(vcpu, GVM_PIO_BUS, vcpu->arch.pio.port, vcpu->arch.pio.size, pd); else - r = kvm_io_bus_write(vcpu, KVM_PIO_BUS, + r = kvm_io_bus_write(vcpu, GVM_PIO_BUS, vcpu->arch.pio.port, vcpu->arch.pio.size, pd); return r; @@ -4763,10 +2988,10 @@ static int emulator_pio_in_out(struct kvm_vcpu *vcpu, int size, return 1; } - vcpu->run->exit_reason = KVM_EXIT_IO; - vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT; + vcpu->run->exit_reason = GVM_EXIT_IO; + vcpu->run->io.direction = in ? 
GVM_EXIT_IO_IN : GVM_EXIT_IO_OUT; vcpu->run->io.size = size; - vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE; + vcpu->run->io.data_offset = GVM_PIO_PAGE_OFFSET * PAGE_SIZE; vcpu->run->io.count = count; vcpu->run->io.port = port; @@ -4787,7 +3012,6 @@ static int emulator_pio_in_emulated(struct x86_emulate_ctxt *ctxt, if (ret) { data_avail: memcpy(val, vcpu->arch.pio_data, size * count); - trace_kvm_pio(KVM_PIO_IN, port, size, count, vcpu->arch.pio_data); vcpu->arch.pio.count = 0; return 1; } @@ -4802,11 +3026,10 @@ static int emulator_pio_out_emulated(struct x86_emulate_ctxt *ctxt, struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); memcpy(vcpu->arch.pio_data, val, size * count); - trace_kvm_pio(KVM_PIO_OUT, port, size, count, vcpu->arch.pio_data); return emulator_pio_in_out(vcpu, size, port, (void *)val, count, false); } -static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg) +static size_t get_segment_base(struct kvm_vcpu *vcpu, int seg) { return kvm_x86_ops->get_segment_base(vcpu, seg); } @@ -4818,19 +3041,6 @@ static void emulator_invlpg(struct x86_emulate_ctxt *ctxt, ulong address) int kvm_emulate_wbinvd_noskip(struct kvm_vcpu *vcpu) { - if (!need_emulate_wbinvd(vcpu)) - return X86EMUL_CONTINUE; - - if (kvm_x86_ops->has_wbinvd_exit()) { - int cpu = get_cpu(); - - cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask); - smp_call_function_many(vcpu->arch.wbinvd_dirty_mask, - wbinvd_ipi, NULL, 1); - put_cpu(); - cpumask_clear(vcpu->arch.wbinvd_dirty_mask); - } else - wbinvd(); return X86EMUL_CONTINUE; } @@ -4839,7 +3049,6 @@ int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu) kvm_x86_ops->skip_emulated_instruction(vcpu); return kvm_emulate_wbinvd_noskip(vcpu); } -EXPORT_SYMBOL_GPL(kvm_emulate_wbinvd); @@ -4849,16 +3058,17 @@ static void emulator_wbinvd(struct x86_emulate_ctxt *ctxt) } static int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, - unsigned long *dest) + size_t *dest) { - return kvm_get_dr(emul_to_vcpu(ctxt), dr, dest); + //return kvm_get_dr(emul_to_vcpu(ctxt), dr, dest); + return 0; } static int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, - unsigned long value) + size_t value) { - - return __kvm_set_dr(emul_to_vcpu(ctxt), dr, value); + return 0; + //return __kvm_set_dr(emul_to_vcpu(ctxt), dr, value); } static u64 mk_cr_64(u64 curr_cr, u32 new_val) @@ -4866,10 +3076,10 @@ static u64 mk_cr_64(u64 curr_cr, u32 new_val) return (curr_cr & ~((1ULL << 32) - 1)) | new_val; } -static unsigned long emulator_get_cr(struct x86_emulate_ctxt *ctxt, int cr) +static size_t emulator_get_cr(struct x86_emulate_ctxt *ctxt, int cr) { struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); - unsigned long value; + size_t value; switch (cr) { case 0: @@ -4888,7 +3098,7 @@ static unsigned long emulator_get_cr(struct x86_emulate_ctxt *ctxt, int cr) value = kvm_get_cr8(vcpu); break; default: - kvm_err("%s: unexpected cr %u\n", __func__, cr); + //kvm_err("%s: unexpected cr %u\n", __func__, cr); return 0; } @@ -4917,7 +3127,7 @@ static int emulator_set_cr(struct x86_emulate_ctxt *ctxt, int cr, ulong val) res = kvm_set_cr8(vcpu, val); break; default: - kvm_err("%s: unexpected cr %u\n", __func__, cr); + //kvm_err("%s: unexpected cr %u\n", __func__, cr); res = -1; } @@ -4949,7 +3159,7 @@ static void emulator_set_idt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt) kvm_x86_ops->set_idt(emul_to_vcpu(ctxt), dt); } -static unsigned long emulator_get_cached_segment_base( +static size_t emulator_get_cached_segment_base( struct x86_emulate_ctxt *ctxt, int seg) { return 
get_segment_base(emul_to_vcpu(ctxt), seg); @@ -4972,7 +3182,7 @@ static bool emulator_get_segment(struct x86_emulate_ctxt *ctxt, u16 *selector, if (var.g) var.limit >>= 12; set_desc_limit(desc, var.limit); - set_desc_base(desc, (unsigned long)var.base); + set_desc_base(desc, (size_t)var.base); #ifdef CONFIG_X86_64 if (base3) *base3 = var.base >> 32; @@ -5063,13 +3273,15 @@ static void emulator_set_smbase(struct x86_emulate_ctxt *ctxt, u64 smbase) static int emulator_check_pmc(struct x86_emulate_ctxt *ctxt, u32 pmc) { - return kvm_pmu_is_valid_msr_idx(emul_to_vcpu(ctxt), pmc); + //return kvm_pmu_is_valid_msr_idx(emul_to_vcpu(ctxt), pmc); + return 0; } static int emulator_read_pmc(struct x86_emulate_ctxt *ctxt, u32 pmc, u64 *pdata) { - return kvm_pmu_rdpmc(emul_to_vcpu(ctxt), pmc, pdata); + //return kvm_pmu_rdpmc(emul_to_vcpu(ctxt), pmc, pdata); + return 0; } static void emulator_halt(struct x86_emulate_ctxt *ctxt) @@ -5085,11 +3297,12 @@ static void emulator_get_fpu(struct x86_emulate_ctxt *ctxt) * CR0.TS may reference the host fpu state, not the guest fpu state, * so it may be clear at this point. */ - clts(); + __clts(); } static void emulator_put_fpu(struct x86_emulate_ctxt *ctxt) { + kvm_save_guest_fpu(emul_to_vcpu(ctxt)); preempt_enable(); } @@ -5154,7 +3367,6 @@ static const struct x86_emulate_ops emulate_ops = { .read_pmc = emulator_read_pmc, .halt = emulator_halt, .wbinvd = emulator_wbinvd, - .fix_hypercall = emulator_fix_hypercall, .get_fpu = emulator_get_fpu, .put_fpu = emulator_put_fpu, .intercept = emulator_intercept, @@ -5177,7 +3389,7 @@ static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask) if (unlikely(int_shadow || mask)) { kvm_x86_ops->set_interrupt_shadow(vcpu, mask); if (!mask) - kvm_make_request(KVM_REQ_EVENT, vcpu); + kvm_make_request(GVM_REQ_EVENT, vcpu); } } @@ -5209,9 +3421,6 @@ static void init_emulate_ctxt(struct kvm_vcpu *vcpu) (cs_l && is_long_mode(vcpu)) ? X86EMUL_MODE_PROT64 : cs_db ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16; - BUILD_BUG_ON(HF_GUEST_MASK != X86EMUL_GUEST_MASK); - BUILD_BUG_ON(HF_SMM_MASK != X86EMUL_SMM_MASK); - BUILD_BUG_ON(HF_SMM_INSIDE_NMI_MASK != X86EMUL_SMM_INSIDE_NMI_MASK); ctxt->emul_flags = vcpu->arch.hflags; init_decode_cache(ctxt); @@ -5244,17 +3453,15 @@ int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip) return EMULATE_DONE; } -EXPORT_SYMBOL_GPL(kvm_inject_realmode_interrupt); static int handle_emulation_failure(struct kvm_vcpu *vcpu) { int r = EMULATE_DONE; ++vcpu->stat.insn_emulation_fail; - trace_kvm_emulate_insn_failed(vcpu); if (!is_guest_mode(vcpu) && kvm_x86_ops->get_cpl(vcpu) == 0) { - vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; - vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; + vcpu->run->exit_reason = GVM_EXIT_INTERNAL_ERROR; + vcpu->run->internal.suberror = GVM_INTERNAL_ERROR_EMULATION; vcpu->run->internal.ndata = 0; r = EMULATE_FAIL; } @@ -5303,8 +3510,6 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t cr2, if (is_error_noslot_pfn(pfn)) return false; - kvm_release_pfn_clean(pfn); - /* The instructions are well-emulated on direct mmu. 
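 * (With a direct-map MMU there are no shadowed guest page tables to
 * keep in sync, so it is enough to unprotect the faulting gfn and let
 * the guest re-execute the instruction instead of emulating it.)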
*/ if (vcpu->arch.mmu.direct_map) { unsigned int indirect_shadow_pages; @@ -5335,10 +3540,10 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t cr2, } static bool retry_instruction(struct x86_emulate_ctxt *ctxt, - unsigned long cr2, int emulation_type) + size_t cr2, int emulation_type) { struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); - unsigned long last_retry_eip, last_retry_addr, gpa = cr2; + size_t last_retry_eip, last_retry_addr, gpa = cr2; last_retry_eip = vcpu->arch.last_retry_eip; last_retry_addr = vcpu->arch.last_retry_addr; @@ -5384,11 +3589,8 @@ static int complete_emulated_pio(struct kvm_vcpu *vcpu); static void kvm_smm_changed(struct kvm_vcpu *vcpu) { if (!(vcpu->arch.hflags & HF_SMM_MASK)) { - /* This is a good place to trace that we are exiting SMM. */ - trace_kvm_enter_smm(vcpu->vcpu_id, vcpu->arch.smbase, false); - /* Process a latched INIT or SMI, if any. */ - kvm_make_request(KVM_REQ_EVENT, vcpu); + kvm_make_request(GVM_REQ_EVENT, vcpu); } kvm_mmu_reset_context(vcpu); @@ -5404,8 +3606,8 @@ static void kvm_set_hflags(struct kvm_vcpu *vcpu, unsigned emul_flags) kvm_smm_changed(vcpu); } -static int kvm_vcpu_check_hw_bp(unsigned long addr, u32 type, u32 dr7, - unsigned long *db) +static int kvm_vcpu_check_hw_bp(size_t addr, u32 type, u32 dr7, + size_t *db) { u32 dr6 = 0; int i; @@ -5419,7 +3621,7 @@ static int kvm_vcpu_check_hw_bp(unsigned long addr, u32 type, u32 dr7, return dr6; } -static void kvm_vcpu_check_singlestep(struct kvm_vcpu *vcpu, unsigned long rflags, int *r) +static void kvm_vcpu_check_singlestep(struct kvm_vcpu *vcpu, size_t rflags, int *r) { struct kvm_run *kvm_run = vcpu->run; @@ -5432,12 +3634,12 @@ static void kvm_vcpu_check_singlestep(struct kvm_vcpu *vcpu, unsigned long rflag * that sets the TF flag". 
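 * (Below, GVM_GUESTDBG_SINGLESTEP reports the trap to user space as a
 * GVM_EXIT_DEBUG exit; otherwise TF is cleared in the emulated EFLAGS
 * and a #DB is delivered to the guest, as the hardware would have
 * done.)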
*/ if (unlikely(rflags & X86_EFLAGS_TF)) { - if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) { + if (vcpu->guest_debug & GVM_GUESTDBG_SINGLESTEP) { kvm_run->debug.arch.dr6 = DR6_BS | DR6_FIXED_1 | DR6_RTM; kvm_run->debug.arch.pc = vcpu->arch.singlestep_rip; kvm_run->debug.arch.exception = DB_VECTOR; - kvm_run->exit_reason = KVM_EXIT_DEBUG; + kvm_run->exit_reason = GVM_EXIT_DEBUG; *r = EMULATE_USER_EXIT; } else { vcpu->arch.emulate_ctxt.eflags &= ~X86_EFLAGS_TF; @@ -5455,10 +3657,10 @@ static void kvm_vcpu_check_singlestep(struct kvm_vcpu *vcpu, unsigned long rflag static bool kvm_vcpu_check_breakpoint(struct kvm_vcpu *vcpu, int *r) { - if (unlikely(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) && + if (unlikely(vcpu->guest_debug & GVM_GUESTDBG_USE_HW_BP) && (vcpu->arch.guest_debug_dr7 & DR7_BP_EN_MASK)) { struct kvm_run *kvm_run = vcpu->run; - unsigned long eip = kvm_get_linear_rip(vcpu); + size_t eip = kvm_get_linear_rip(vcpu); u32 dr6 = kvm_vcpu_check_hw_bp(eip, 0, vcpu->arch.guest_debug_dr7, vcpu->arch.eff_db); @@ -5467,7 +3669,7 @@ static bool kvm_vcpu_check_breakpoint(struct kvm_vcpu *vcpu, int *r) kvm_run->debug.arch.dr6 = dr6 | DR6_FIXED_1 | DR6_RTM; kvm_run->debug.arch.pc = eip; kvm_run->debug.arch.exception = DB_VECTOR; - kvm_run->exit_reason = KVM_EXIT_DEBUG; + kvm_run->exit_reason = GVM_EXIT_DEBUG; *r = EMULATE_USER_EXIT; return true; } @@ -5475,7 +3677,7 @@ static bool kvm_vcpu_check_breakpoint(struct kvm_vcpu *vcpu, int *r) if (unlikely(vcpu->arch.dr7 & DR7_BP_EN_MASK) && !(kvm_get_rflags(vcpu) & X86_EFLAGS_RF)) { - unsigned long eip = kvm_get_linear_rip(vcpu); + size_t eip = kvm_get_linear_rip(vcpu); u32 dr6 = kvm_vcpu_check_hw_bp(eip, 0, vcpu->arch.dr7, vcpu->arch.db); @@ -5493,7 +3695,7 @@ static bool kvm_vcpu_check_breakpoint(struct kvm_vcpu *vcpu, int *r) } int x86_emulate_instruction(struct kvm_vcpu *vcpu, - unsigned long cr2, + size_t cr2, int emulation_type, void *insn, int insn_len) @@ -5531,7 +3733,6 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, r = x86_decode_insn(ctxt, insn, insn_len); - trace_kvm_emulate_insn_start(vcpu); ++vcpu->stat.insn_emulation; if (r != EMULATION_OK) { if (emulation_type & EMULTYPE_TRAP_UD) @@ -5600,7 +3801,7 @@ restart: r = EMULATE_DONE; if (writeback) { - unsigned long rflags = kvm_x86_ops->get_rflags(vcpu); + size_t rflags = kvm_x86_ops->get_rflags(vcpu); toggle_interruptibility(vcpu, ctxt->interruptibility); vcpu->arch.emulate_regs_need_sync_to_vcpu = false; if (vcpu->arch.hflags != ctxt->emul_flags) @@ -5613,214 +3814,40 @@ restart: __kvm_set_rflags(vcpu, ctxt->eflags); /* - * For STI, interrupts are shadowed; so KVM_REQ_EVENT will + * For STI, interrupts are shadowed; so GVM_REQ_EVENT will * do nothing, and it will be requested again as soon as * the shadow expires. But we still need to check here, * because POPF has no interrupt shadow. 
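 * (The check below fires when emulation set EFLAGS.IF that was
 * previously clear: a pending interrupt may have become deliverable,
 * so another round of event injection is requested.)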
*/ if (unlikely((ctxt->eflags & ~rflags) & X86_EFLAGS_IF)) - kvm_make_request(KVM_REQ_EVENT, vcpu); + kvm_make_request(GVM_REQ_EVENT, vcpu); } else vcpu->arch.emulate_regs_need_sync_to_vcpu = true; return r; } -EXPORT_SYMBOL_GPL(x86_emulate_instruction); int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port) { - unsigned long val = kvm_register_read(vcpu, VCPU_REGS_RAX); + size_t val = kvm_register_read(vcpu, VCPU_REGS_RAX); int ret = emulator_pio_out_emulated(&vcpu->arch.emulate_ctxt, size, port, &val, 1); /* do not return to emulator after return from userspace */ vcpu->arch.pio.count = 0; return ret; } -EXPORT_SYMBOL_GPL(kvm_fast_pio_out); - -static int kvmclock_cpu_down_prep(unsigned int cpu) -{ - __this_cpu_write(cpu_tsc_khz, 0); - return 0; -} - -static void tsc_khz_changed(void *data) -{ - struct cpufreq_freqs *freq = data; - unsigned long khz = 0; - - if (data) - khz = freq->new; - else if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) - khz = cpufreq_quick_get(raw_smp_processor_id()); - if (!khz) - khz = tsc_khz; - __this_cpu_write(cpu_tsc_khz, khz); -} - -static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long val, - void *data) -{ - struct cpufreq_freqs *freq = data; - struct kvm *kvm; - struct kvm_vcpu *vcpu; - int i, send_ipi = 0; - - /* - * We allow guests to temporarily run on slowing clocks, - * provided we notify them after, or to run on accelerating - * clocks, provided we notify them before. Thus time never - * goes backwards. - * - * However, we have a problem. We can't atomically update - * the frequency of a given CPU from this function; it is - * merely a notifier, which can be called from any CPU. - * Changing the TSC frequency at arbitrary points in time - * requires a recomputation of local variables related to - * the TSC for each VCPU. We must flag these local variables - * to be updated and be sure the update takes place with the - * new frequency before any guests proceed. - * - * Unfortunately, the combination of hotplug CPU and frequency - * change creates an intractable locking scenario; the order - * of when these callouts happen is undefined with respect to - * CPU hotplug, and they can race with each other. As such, - * merely setting per_cpu(cpu_tsc_khz) = X during a hotadd is - * undefined; you can actually have a CPU frequency change take - * place in between the computation of X and the setting of the - * variable. To protect against this problem, all updates of - * the per_cpu tsc_khz variable are done in an interrupt - * protected IPI, and all callers wishing to update the value - * must wait for a synchronous IPI to complete (which is trivial - * if the caller is on the CPU already). This establishes the - * necessary total order on variable updates. - * - * Note that because a guest time update may take place - * anytime after the setting of the VCPU's request bit, the - * correct TSC value must be set before the request. However, - * to ensure the update actually makes it to any guest which - * starts running in hardware virtualization between the set - * and the acquisition of the spinlock, we must also ping the - * CPU after setting the request bit. 
- * - */ - - if (val == CPUFREQ_PRECHANGE && freq->old > freq->new) - return 0; - if (val == CPUFREQ_POSTCHANGE && freq->old < freq->new) - return 0; - - smp_call_function_single(freq->cpu, tsc_khz_changed, freq, 1); - - spin_lock(&kvm_lock); - list_for_each_entry(kvm, &vm_list, vm_list) { - kvm_for_each_vcpu(i, vcpu, kvm) { - if (vcpu->cpu != freq->cpu) - continue; - kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); - if (vcpu->cpu != smp_processor_id()) - send_ipi = 1; - } - } - spin_unlock(&kvm_lock); - - if (freq->old < freq->new && send_ipi) { - /* - * We upscale the frequency. Must make the guest - * doesn't see old kvmclock values while running with - * the new frequency, otherwise we risk the guest sees - * time go backwards. - * - * In case we update the frequency for another cpu - * (which might be in guest context) send an interrupt - * to kick the cpu out of guest context. Next time - * guest context is entered kvmclock will be updated, - * so the guest will not see stale values. - */ - smp_call_function_single(freq->cpu, tsc_khz_changed, freq, 1); - } - return 0; -} - -static struct notifier_block kvmclock_cpufreq_notifier_block = { - .notifier_call = kvmclock_cpufreq_notifier -}; - -static int kvmclock_cpu_online(unsigned int cpu) -{ - tsc_khz_changed(NULL); - return 0; -} - -static void kvm_timer_init(void) -{ - max_tsc_khz = tsc_khz; - - if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) { -#ifdef CONFIG_CPU_FREQ - struct cpufreq_policy policy; - int cpu; - - memset(&policy, 0, sizeof(policy)); - cpu = get_cpu(); - cpufreq_get_policy(&policy, cpu); - if (policy.cpuinfo.max_freq) - max_tsc_khz = policy.cpuinfo.max_freq; - put_cpu(); -#endif - cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block, - CPUFREQ_TRANSITION_NOTIFIER); - } - pr_debug("kvm: max_tsc_khz = %ld\n", max_tsc_khz); - - cpuhp_setup_state(CPUHP_AP_X86_KVM_CLK_ONLINE, "AP_X86_KVM_CLK_ONLINE", - kvmclock_cpu_online, kvmclock_cpu_down_prep); -} static DEFINE_PER_CPU(struct kvm_vcpu *, current_vcpu); -int kvm_is_in_guest(void) -{ - return __this_cpu_read(current_vcpu) != NULL; -} - -static int kvm_is_user_mode(void) -{ - int user_mode = 3; - - if (__this_cpu_read(current_vcpu)) - user_mode = kvm_x86_ops->get_cpl(__this_cpu_read(current_vcpu)); - - return user_mode != 0; -} - -static unsigned long kvm_get_guest_ip(void) -{ - unsigned long ip = 0; - - if (__this_cpu_read(current_vcpu)) - ip = kvm_rip_read(__this_cpu_read(current_vcpu)); - - return ip; -} - -static struct perf_guest_info_callbacks kvm_guest_cbs = { - .is_in_guest = kvm_is_in_guest, - .is_user_mode = kvm_is_user_mode, - .get_guest_ip = kvm_get_guest_ip, -}; - void kvm_before_handle_nmi(struct kvm_vcpu *vcpu) { __this_cpu_write(current_vcpu, vcpu); } -EXPORT_SYMBOL_GPL(kvm_before_handle_nmi); void kvm_after_handle_nmi(struct kvm_vcpu *vcpu) { __this_cpu_write(current_vcpu, NULL); } -EXPORT_SYMBOL_GPL(kvm_after_handle_nmi); static void kvm_set_mmio_spte_mask(void) { @@ -5852,53 +3879,9 @@ static void kvm_set_mmio_spte_mask(void) kvm_mmu_set_mmio_spte_mask(mask); } -#ifdef CONFIG_X86_64 -static void pvclock_gtod_update_fn(struct work_struct *work) -{ - struct kvm *kvm; - - struct kvm_vcpu *vcpu; - int i; - - spin_lock(&kvm_lock); - list_for_each_entry(kvm, &vm_list, vm_list) - kvm_for_each_vcpu(i, vcpu, kvm) - kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu); - atomic_set(&kvm_guest_has_master_clock, 0); - spin_unlock(&kvm_lock); -} - -static DECLARE_WORK(pvclock_gtod_work, pvclock_gtod_update_fn); - -/* - * Notification about pvclock gtod data update. 
- */ -static int pvclock_gtod_notify(struct notifier_block *nb, unsigned long unused, - void *priv) -{ - struct pvclock_gtod_data *gtod = &pvclock_gtod_data; - struct timekeeper *tk = priv; - - update_pvclock_gtod(tk); - - /* disable master clock if host does not trust, or does not - * use, TSC clocksource - */ - if (gtod->clock.vclock_mode != VCLOCK_TSC && - atomic_read(&kvm_guest_has_master_clock) != 0) - queue_work(system_long_wq, &pvclock_gtod_work); - - return 0; -} - -static struct notifier_block pvclock_gtod_notifier = { - .notifier_call = pvclock_gtod_notify, -}; -#endif - int kvm_arch_init(void *opaque) { - int r; + int r = -EFAULT, i; struct kvm_x86_ops *ops = opaque; if (kvm_x86_ops) { @@ -5918,17 +3901,6 @@ int kvm_arch_init(void *opaque) goto out; } - r = -ENOMEM; - shared_msrs = alloc_percpu(struct kvm_shared_msrs); - if (!shared_msrs) { - printk(KERN_ERR "kvm: failed to allocate percpu kvm_shared_msrs\n"); - goto out; - } - - r = kvm_mmu_module_init(); - if (r) - goto out_free_percpu; - kvm_set_mmio_spte_mask(); kvm_x86_ops = ops; @@ -5936,79 +3908,46 @@ int kvm_arch_init(void *opaque) kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK, PT_DIRTY_MASK, PT64_NX_MASK, 0, PT_PRESENT_MASK); - kvm_timer_init(); - - perf_register_guest_info_callbacks(&kvm_guest_cbs); if (boot_cpu_has(X86_FEATURE_XSAVE)) host_xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); + /* We have to move array initialization here since gcc's extension + * of array initialization is not supported here. + */ + for (i = 0; i < XFEATURE_MAX; i++) + xstate_offsets[i] = xstate_sizes[i] = -1; kvm_lapic_init(); -#ifdef CONFIG_X86_64 - pvclock_gtod_register_notifier(&pvclock_gtod_notifier); -#endif return 0; -out_free_percpu: - free_percpu(shared_msrs); out: return r; } void kvm_arch_exit(void) { - perf_unregister_guest_info_callbacks(&kvm_guest_cbs); - - if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) - cpufreq_unregister_notifier(&kvmclock_cpufreq_notifier_block, - CPUFREQ_TRANSITION_NOTIFIER); - cpuhp_remove_state_nocalls(CPUHP_AP_X86_KVM_CLK_ONLINE); -#ifdef CONFIG_X86_64 - pvclock_gtod_unregister_notifier(&pvclock_gtod_notifier); -#endif kvm_x86_ops = NULL; kvm_mmu_module_exit(); - free_percpu(shared_msrs); } int kvm_vcpu_halt(struct kvm_vcpu *vcpu) { ++vcpu->stat.halt_exits; if (lapic_in_kernel(vcpu)) { - vcpu->arch.mp_state = KVM_MP_STATE_HALTED; + vcpu->arch.mp_state = GVM_MP_STATE_HALTED; return 1; } else { - vcpu->run->exit_reason = KVM_EXIT_HLT; + vcpu->run->exit_reason = GVM_EXIT_HLT; return 0; } } -EXPORT_SYMBOL_GPL(kvm_vcpu_halt); int kvm_emulate_halt(struct kvm_vcpu *vcpu) { kvm_x86_ops->skip_emulated_instruction(vcpu); return kvm_vcpu_halt(vcpu); } -EXPORT_SYMBOL_GPL(kvm_emulate_halt); - -/* - * kvm_pv_kick_cpu_op: Kick a vcpu. - * - * @apicid - apicid of vcpu to be kicked. 
- */ -static void kvm_pv_kick_cpu_op(struct kvm *kvm, unsigned long flags, int apicid) -{ - struct kvm_lapic_irq lapic_irq; - - lapic_irq.shorthand = 0; - lapic_irq.dest_mode = 0; - lapic_irq.dest_id = apicid; - lapic_irq.msi_redir_hint = false; - - lapic_irq.delivery_mode = APIC_DM_REMRD; - kvm_irq_delivery_to_apic(kvm, NULL, &lapic_irq, NULL); -} void kvm_vcpu_deactivate_apicv(struct kvm_vcpu *vcpu) { @@ -6016,70 +3955,6 @@ void kvm_vcpu_deactivate_apicv(struct kvm_vcpu *vcpu) kvm_x86_ops->refresh_apicv_exec_ctrl(vcpu); } -int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) -{ - unsigned long nr, a0, a1, a2, a3, ret; - int op_64_bit, r = 1; - - kvm_x86_ops->skip_emulated_instruction(vcpu); - - if (kvm_hv_hypercall_enabled(vcpu->kvm)) - return kvm_hv_hypercall(vcpu); - - nr = kvm_register_read(vcpu, VCPU_REGS_RAX); - a0 = kvm_register_read(vcpu, VCPU_REGS_RBX); - a1 = kvm_register_read(vcpu, VCPU_REGS_RCX); - a2 = kvm_register_read(vcpu, VCPU_REGS_RDX); - a3 = kvm_register_read(vcpu, VCPU_REGS_RSI); - - trace_kvm_hypercall(nr, a0, a1, a2, a3); - - op_64_bit = is_64_bit_mode(vcpu); - if (!op_64_bit) { - nr &= 0xFFFFFFFF; - a0 &= 0xFFFFFFFF; - a1 &= 0xFFFFFFFF; - a2 &= 0xFFFFFFFF; - a3 &= 0xFFFFFFFF; - } - - if (kvm_x86_ops->get_cpl(vcpu) != 0) { - ret = -KVM_EPERM; - goto out; - } - - switch (nr) { - case KVM_HC_VAPIC_POLL_IRQ: - ret = 0; - break; - case KVM_HC_KICK_CPU: - kvm_pv_kick_cpu_op(vcpu->kvm, a0, a1); - ret = 0; - break; - default: - ret = -KVM_ENOSYS; - break; - } -out: - if (!op_64_bit) - ret = (u32)ret; - kvm_register_write(vcpu, VCPU_REGS_RAX, ret); - ++vcpu->stat.hypercalls; - return r; -} -EXPORT_SYMBOL_GPL(kvm_emulate_hypercall); - -static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt) -{ - struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); - char instruction[3]; - unsigned long rip = kvm_rip_read(vcpu); - - kvm_x86_ops->patch_hypercall(vcpu, instruction); - - return emulator_write_emulated(ctxt, rip, instruction, 3, NULL); -} - static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu) { return vcpu->run->request_interrupt_window && @@ -6091,7 +3966,7 @@ static void post_kvm_run_save(struct kvm_vcpu *vcpu) struct kvm_run *kvm_run = vcpu->run; kvm_run->if_flag = (kvm_get_rflags(vcpu) & X86_EFLAGS_IF) != 0; - kvm_run->flags = is_smm(vcpu) ? KVM_RUN_X86_SMM : 0; + kvm_run->flags = is_smm(vcpu) ? GVM_RUN_X86_SMM : 0; kvm_run->cr8 = kvm_get_cr8(vcpu); kvm_run->apic_base = kvm_get_apic_base(vcpu); kvm_run->ready_for_interrupt_injection = @@ -6131,10 +4006,6 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool req_int_win) /* try to reinject previous events if any */ if (vcpu->arch.exception.pending) { - trace_kvm_inj_exception(vcpu->arch.exception.nr, - vcpu->arch.exception.has_error_code, - vcpu->arch.exception.error_code); - if (exception_type(vcpu->arch.exception.nr) == EXCPT_FAULT) __kvm_set_rflags(vcpu, kvm_get_rflags(vcpu) | X86_EFLAGS_RF); @@ -6182,7 +4053,7 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool req_int_win) * calling check_nested_events again here to avoid a race condition. * See https://lkml.org/lkml/2014/7/2/60 for discussion about this * proposal and current concerns. Perhaps we should be setting - * KVM_REQ_EVENT only on certain events and not unconditionally? + * GVM_REQ_EVENT only on certain events and not unconditionally? 
*/ if (is_guest_mode(vcpu) && kvm_x86_ops->check_nested_events) { r = kvm_x86_ops->check_nested_events(vcpu, req_int_win); @@ -6213,7 +4084,7 @@ static void process_nmi(struct kvm_vcpu *vcpu) vcpu->arch.nmi_pending += atomic_xchg(&vcpu->arch.nmi_queued, 0); vcpu->arch.nmi_pending = min(vcpu->arch.nmi_pending, limit); - kvm_make_request(KVM_REQ_EVENT, vcpu); + kvm_make_request(GVM_REQ_EVENT, vcpu); } #define put_smstate(type, buf, offset, val) \ @@ -6273,7 +4144,7 @@ static void enter_smm_save_state_32(struct kvm_vcpu *vcpu, char *buf) { struct desc_ptr dt; struct kvm_segment seg; - unsigned long val; + size_t val; int i; put_smstate(u32, buf, 0x7ffc, kvm_read_cr0(vcpu)); @@ -6324,7 +4195,7 @@ static void enter_smm_save_state_64(struct kvm_vcpu *vcpu, char *buf) #ifdef CONFIG_X86_64 struct desc_ptr dt; struct kvm_segment seg; - unsigned long val; + size_t val; int i; for (i = 0; i < 16; i++) @@ -6383,7 +4254,6 @@ static void enter_smm(struct kvm_vcpu *vcpu) char buf[512]; u32 cr0; - trace_kvm_enter_smm(vcpu->vcpu_id, vcpu->arch.smbase, true); vcpu->arch.hflags |= HF_SMM_MASK; memset(buf, 0, 512); if (guest_cpuid_has_longmode(vcpu)) @@ -6448,12 +4318,12 @@ static void enter_smm(struct kvm_vcpu *vcpu) static void process_smi(struct kvm_vcpu *vcpu) { vcpu->arch.smi_pending = true; - kvm_make_request(KVM_REQ_EVENT, vcpu); + kvm_make_request(GVM_REQ_EVENT, vcpu); } void kvm_make_scan_ioapic_request(struct kvm *kvm) { - kvm_make_all_cpus_request(kvm, KVM_REQ_SCAN_IOAPIC); + kvm_make_all_cpus_request(kvm, GVM_REQ_SCAN_IOAPIC); } static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu) @@ -6465,15 +4335,8 @@ static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu) bitmap_zero(vcpu->arch.ioapic_handled_vectors, 256); - if (irqchip_split(vcpu->kvm)) - kvm_scan_ioapic_routes(vcpu, vcpu->arch.ioapic_handled_vectors); - else { - if (vcpu->arch.apicv_active) - kvm_x86_ops->sync_pir_to_irr(vcpu); - kvm_ioapic_scan_entry(vcpu, vcpu->arch.ioapic_handled_vectors); - } - bitmap_or((ulong *)eoi_exit_bitmap, vcpu->arch.ioapic_handled_vectors, - vcpu_to_synic(vcpu)->vec_bitmap, 256); + kvm_ioapic_scan_entry(vcpu, vcpu->arch.ioapic_handled_vectors); + bitmap_copy((ulong *)eoi_exit_bitmap, vcpu->arch.ioapic_handled_vectors, 256); kvm_x86_ops->load_eoi_exitmap(vcpu, eoi_exit_bitmap); } @@ -6485,7 +4348,7 @@ static void kvm_vcpu_flush_tlb(struct kvm_vcpu *vcpu) void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu) { - struct page *page = NULL; + pfn_t pfn = 0; if (!lapic_in_kernel(vcpu)) return; @@ -6493,29 +4356,128 @@ void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu) if (!kvm_x86_ops->set_apic_access_page_addr) return; - page = gfn_to_page(vcpu->kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT); - if (is_error_page(page)) + pfn = gfn_to_pfn(vcpu->kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT); + if (is_error_noslot_pfn(pfn)) return; - kvm_x86_ops->set_apic_access_page_addr(vcpu, page_to_phys(page)); + kvm_x86_ops->set_apic_access_page_addr(vcpu, pfn << PAGE_SHIFT); /* * Do not pin apic access page in memory, the MMU notifier * will call us again if it is migrated or swapped out. */ - put_page(page); + //put_page(page); } -EXPORT_SYMBOL_GPL(kvm_vcpu_reload_apic_access_page); void kvm_arch_mmu_notifier_invalidate_page(struct kvm *kvm, - unsigned long address) + size_t address) { /* * The physical address of apic access page is stored in the VMCS. * Update it when it becomes invalid. 
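 * (If the invalidated host mapping backs the guest's default APIC
 * base, every vCPU is asked to reload the APIC access page before it
 * next enters guest mode.)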
*/ if (address == gfn_to_hva(kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT)) - kvm_make_all_cpus_request(kvm, KVM_REQ_APIC_PAGE_RELOAD); + kvm_make_all_cpus_request(kvm, GVM_REQ_APIC_PAGE_RELOAD); +} + +//#define HOST_STAT_DEBUG +/* + * A useful tool to check whether host state remains the same across + * host->guest->host switches. In theory, host state should be saved/restored + * only when it is subject to change. However, without source code and + * documentation, you never know. When something goes terribly wrong, this tool + * can help check whether it is caused by incomplete host state restore. + */ +#ifdef HOST_STAT_DEBUG +#include <intrin.h> +struct host_stat { + struct desc_ptr gdt; + struct desc_ptr idt; + u16 cs_sel; + u16 ss_sel; + u16 ds_sel; + u16 es_sel; + u16 fs_sel; + u16 gs_sel; + u16 ldt_sel; + u16 tr_sel; + struct desc_struct cs; + struct desc_struct ss; + struct desc_struct ds; + struct desc_struct es; + struct desc_struct fs; + struct desc_struct gs; + struct desc_struct ldt; + struct desc_struct tr; + u64 fs_base; + u64 gs_base; + u64 kernel_gs_base; + u64 cr0; + u64 cr2; + u64 cr3; + u64 cr4; + u64 cr8; + u64 efer; + u64 star; + u64 lstar; + u64 cstar; + u64 sf_mask; + u64 sysenter_cs; + u64 sysenter_eip; + u64 sysenter_esp; +}; + +static void save_host_stat_full(struct host_stat *hs) +{ + struct desc_struct *gdt; + + _sgdt(&hs->gdt); + __sidt(&hs->idt); + + savesegment(cs, hs->cs_sel); + savesegment(ss, hs->ss_sel); + savesegment(ds, hs->ds_sel); + savesegment(es, hs->es_sel); + savesegment(fs, hs->fs_sel); + savesegment(gs, hs->gs_sel); + hs->ldt_sel = gvm_read_ldt(); + hs->tr_sel = gvm_read_tr(); + + gdt = (struct desc_struct *)hs->gdt.address; + hs->cs = gdt[hs->cs_sel >> 3]; + hs->ss = gdt[hs->ss_sel >> 3]; + hs->ds = gdt[hs->ds_sel >> 3]; + hs->es = gdt[hs->es_sel >> 3]; + hs->fs = gdt[hs->fs_sel >> 3]; + hs->gs = gdt[hs->gs_sel >> 3]; + hs->ldt = gdt[hs->ldt_sel >> 3]; + hs->tr = gdt[hs->tr_sel >> 3]; + + hs->fs_base = __readmsr(MSR_FS_BASE); + hs->gs_base = __readmsr(MSR_GS_BASE); + hs->kernel_gs_base = __readmsr(MSR_KERNEL_GS_BASE); + + hs->cr0 = __readcr0(); + hs->cr2 = __readcr2(); + hs->cr3 = __readcr3(); + hs->cr4 = __readcr4(); + hs->cr8 = __readcr8(); + + hs->efer = __readmsr(MSR_EFER); + hs->star = __readmsr(MSR_STAR); + hs->lstar = __readmsr(MSR_LSTAR); + hs->cstar = __readmsr(MSR_CSTAR); + hs->sf_mask = __readmsr(MSR_SYSCALL_MASK); + + hs->sysenter_cs = __readmsr(MSR_IA32_SYSENTER_CS); + hs->sysenter_eip = __readmsr(MSR_IA32_SYSENTER_EIP); + hs->sysenter_esp = __readmsr(MSR_IA32_SYSENTER_ESP); +} + +static int check_host_stat(struct host_stat *a, struct host_stat *b) +{ + return 0; } +#endif /* * Returns 1 to let vcpu_run() continue the guest execution loop without @@ -6530,100 +4492,46 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) kvm_cpu_accept_dm_intr(vcpu); bool req_immediate_exit = false; +#ifdef HOST_STAT_DEBUG + struct host_stat *enter = kzalloc(sizeof(struct host_stat), GFP_KERNEL); + struct host_stat *exit = kzalloc(sizeof(struct host_stat), GFP_KERNEL); +#endif if (vcpu->requests) { - if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu)) + if (kvm_check_request(GVM_REQ_MMU_RELOAD, vcpu)) kvm_mmu_unload(vcpu); - if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu)) - __kvm_migrate_timers(vcpu); - if (kvm_check_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu)) - kvm_gen_update_masterclock(vcpu->kvm); - if (kvm_check_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu)) - kvm_gen_kvmclock_update(vcpu); - if (kvm_check_request(KVM_REQ_CLOCK_UPDATE, vcpu)) { - r = 
kvm_guest_time_update(vcpu); - if (unlikely(r)) - goto out; - } - if (kvm_check_request(KVM_REQ_MMU_SYNC, vcpu)) + if (kvm_check_request(GVM_REQ_MMU_SYNC, vcpu)) kvm_mmu_sync_roots(vcpu); - if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu)) + if (kvm_check_request(GVM_REQ_TLB_FLUSH, vcpu)) kvm_vcpu_flush_tlb(vcpu); - if (kvm_check_request(KVM_REQ_REPORT_TPR_ACCESS, vcpu)) { - vcpu->run->exit_reason = KVM_EXIT_TPR_ACCESS; + if (kvm_check_request(GVM_REQ_REPORT_TPR_ACCESS, vcpu)) { + vcpu->run->exit_reason = GVM_EXIT_TPR_ACCESS; r = 0; goto out; } - if (kvm_check_request(KVM_REQ_TRIPLE_FAULT, vcpu)) { - vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN; + if (kvm_check_request(GVM_REQ_TRIPLE_FAULT, vcpu)) { + vcpu->run->exit_reason = GVM_EXIT_SHUTDOWN; r = 0; goto out; } - if (kvm_check_request(KVM_REQ_DEACTIVATE_FPU, vcpu)) { - vcpu->fpu_active = 0; - kvm_x86_ops->fpu_deactivate(vcpu); - } - if (kvm_check_request(KVM_REQ_APF_HALT, vcpu)) { - /* Page is swapped out. Do synthetic halt */ - vcpu->arch.apf.halted = true; - r = 1; - goto out; - } - if (kvm_check_request(KVM_REQ_STEAL_UPDATE, vcpu)) - record_steal_time(vcpu); - if (kvm_check_request(KVM_REQ_SMI, vcpu)) + if (kvm_check_request(GVM_REQ_SMI, vcpu)) process_smi(vcpu); - if (kvm_check_request(KVM_REQ_NMI, vcpu)) + if (kvm_check_request(GVM_REQ_NMI, vcpu)) process_nmi(vcpu); - if (kvm_check_request(KVM_REQ_PMU, vcpu)) +#if 0 + if (kvm_check_request(GVM_REQ_PMU, vcpu)) kvm_pmu_handle_event(vcpu); - if (kvm_check_request(KVM_REQ_PMI, vcpu)) + if (kvm_check_request(GVM_REQ_PMI, vcpu)) kvm_pmu_deliver_pmi(vcpu); - if (kvm_check_request(KVM_REQ_IOAPIC_EOI_EXIT, vcpu)) { - BUG_ON(vcpu->arch.pending_ioapic_eoi > 255); - if (test_bit(vcpu->arch.pending_ioapic_eoi, - vcpu->arch.ioapic_handled_vectors)) { - vcpu->run->exit_reason = KVM_EXIT_IOAPIC_EOI; - vcpu->run->eoi.vector = - vcpu->arch.pending_ioapic_eoi; - r = 0; - goto out; - } - } - if (kvm_check_request(KVM_REQ_SCAN_IOAPIC, vcpu)) +#endif + if (kvm_check_request(GVM_REQ_SCAN_IOAPIC, vcpu)) vcpu_scan_ioapic(vcpu); - if (kvm_check_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu)) + if (kvm_check_request(GVM_REQ_APIC_PAGE_RELOAD, vcpu)) kvm_vcpu_reload_apic_access_page(vcpu); - if (kvm_check_request(KVM_REQ_HV_CRASH, vcpu)) { - vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT; - vcpu->run->system_event.type = KVM_SYSTEM_EVENT_CRASH; - r = 0; - goto out; - } - if (kvm_check_request(KVM_REQ_HV_RESET, vcpu)) { - vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT; - vcpu->run->system_event.type = KVM_SYSTEM_EVENT_RESET; - r = 0; - goto out; - } - if (kvm_check_request(KVM_REQ_HV_EXIT, vcpu)) { - vcpu->run->exit_reason = KVM_EXIT_HYPERV; - vcpu->run->hyperv = vcpu->arch.hyperv.exit; - r = 0; - goto out; - } - - /* - * KVM_REQ_HV_STIMER has to be processed after - * KVM_REQ_CLOCK_UPDATE, because Hyper-V SynIC timers - * depend on the guest clock being up-to-date - */ - if (kvm_check_request(KVM_REQ_HV_STIMER, vcpu)) - kvm_hv_process_stimers(vcpu); } /* - * KVM_REQ_EVENT is not set when posted interrupts are set by + * GVM_REQ_EVENT is not set when posted interrupts are set by * VT-d hardware, so we have to update RVI unconditionally. 
*/ if (kvm_lapic_enabled(vcpu)) { @@ -6636,9 +4544,9 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) kvm_lapic_find_highest_irr(vcpu)); } - if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) { + if (kvm_check_request(GVM_REQ_EVENT, vcpu) || req_int_win) { kvm_apic_accept_events(vcpu); - if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) { + if (vcpu->arch.mp_state == GVM_MP_STATE_INIT_RECEIVED) { r = 1; goto out; } @@ -6674,14 +4582,15 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) goto cancel_injection; } - preempt_disable(); + srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); - kvm_x86_ops->prepare_guest_switch(vcpu); - if (vcpu->fpu_active) - kvm_load_guest_fpu(vcpu); + local_irq_disable(); +#ifdef HOST_STAT_DEBUG + save_host_stat_full(enter); +#endif + kvm_x86_ops->save_host_state(vcpu); vcpu->mode = IN_GUEST_MODE; - - srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); + vcpu->cpu = smp_processor_id(); /* * We should set ->mode before check ->requests, @@ -6690,16 +4599,13 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) * to the page tables done while the VCPU is running. * Please see the comment in kvm_flush_remote_tlbs. */ - smp_mb__after_srcu_read_unlock(); + smp_mb(); - local_irq_disable(); - - if (vcpu->mode == EXITING_GUEST_MODE || vcpu->requests - || need_resched() || signal_pending(current)) { + if (vcpu->mode == EXITING_GUEST_MODE || vcpu->requests) { vcpu->mode = OUTSIDE_GUEST_MODE; smp_wmb(); + kvm_x86_ops->load_host_state(vcpu); local_irq_enable(); - preempt_enable(); vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); r = 1; goto cancel_injection; @@ -6708,14 +4614,10 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) kvm_load_guest_xcr0(vcpu); if (req_immediate_exit) { - kvm_make_request(KVM_REQ_EVENT, vcpu); + kvm_make_request(GVM_REQ_EVENT, vcpu); smp_send_reschedule(vcpu->cpu); } - trace_kvm_entry(vcpu->vcpu_id); - wait_lapic_expire(vcpu); - guest_enter_irqoff(); - if (unlikely(vcpu->arch.switch_db_regs)) { set_debugreg(0, 7); set_debugreg(vcpu->arch.eff_db[0], 0); @@ -6723,26 +4625,29 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) set_debugreg(vcpu->arch.eff_db[2], 2); set_debugreg(vcpu->arch.eff_db[3], 3); set_debugreg(vcpu->arch.dr6, 6); - vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_RELOAD; + vcpu->arch.switch_db_regs &= ~GVM_DEBUGREG_RELOAD; } + kvm_load_guest_fpu(vcpu); + kvm_x86_ops->run(vcpu); /* * Do this here before restoring debug registers on the host. And * since we do this before handling the vmexit, a DR access vmexit * can (a) read the correct value of the debug registers, (b) set - * KVM_DEBUGREG_WONT_EXIT again. + * GVM_DEBUGREG_WONT_EXIT again. */ - if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)) { - WARN_ON(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP); + if (unlikely(vcpu->arch.switch_db_regs & GVM_DEBUGREG_WONT_EXIT)) { + WARN_ON(vcpu->guest_debug & GVM_GUESTDBG_USE_HW_BP); kvm_x86_ops->sync_dirty_debug_regs(vcpu); kvm_update_dr0123(vcpu); kvm_update_dr6(vcpu); kvm_update_dr7(vcpu); - vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_RELOAD; + vcpu->arch.switch_db_regs &= ~GVM_DEBUGREG_RELOAD; } +#if 0 /* * If the guest has used debug registers, at least dr7 * will be disabled while returning to the host. 
@@ -6752,36 +4657,34 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) */ if (hw_breakpoint_active()) hw_breakpoint_restore(); +#endif vcpu->arch.last_guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc()); + kvm_save_guest_fpu(vcpu); + + //Set CPU to -1 since we don't know when we got scheduled to another + //cpu by Windows scheduler. + vcpu->cpu = -1; vcpu->mode = OUTSIDE_GUEST_MODE; smp_wmb(); kvm_put_guest_xcr0(vcpu); + kvm_x86_ops->load_host_state(vcpu); + kvm_x86_ops->vcpu_put(vcpu); +#ifdef HOST_STAT_DEBUG + save_host_stat_full(exit); + BUG_ON(check_host_stat(enter, exit)); +#endif kvm_x86_ops->handle_external_intr(vcpu); ++vcpu->stat.exits; - guest_exit_irqoff(); - local_irq_enable(); - preempt_enable(); vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); - /* - * Profile KVM exit RIPs: - */ - if (unlikely(prof_on == KVM_PROFILING)) { - unsigned long rip = kvm_rip_read(vcpu); - profile_hit(KVM_PROFILING, (void *)rip); - } - - if (unlikely(vcpu->arch.tsc_always_catchup)) - kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); - if (vcpu->arch.apic_attention) kvm_lapic_sync_from_vapic(vcpu); @@ -6790,7 +4693,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) cancel_injection: kvm_x86_ops->cancel_injection(vcpu); - if (unlikely(vcpu->arch.apic_attention)) + if ((vcpu->arch.apic_attention)) kvm_lapic_sync_from_vapic(vcpu); out: return r; @@ -6798,29 +4701,23 @@ out: static inline int vcpu_block(struct kvm *kvm, struct kvm_vcpu *vcpu) { - if (!kvm_arch_vcpu_runnable(vcpu) && - (!kvm_x86_ops->pre_block || kvm_x86_ops->pre_block(vcpu) == 0)) { + if (!kvm_arch_vcpu_runnable(vcpu)) { srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); kvm_vcpu_block(vcpu); vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); - if (kvm_x86_ops->post_block) - kvm_x86_ops->post_block(vcpu); - - if (!kvm_check_request(KVM_REQ_UNHALT, vcpu)) + if (!kvm_check_request(GVM_REQ_UNHALT, vcpu)) return 1; } kvm_apic_accept_events(vcpu); switch(vcpu->arch.mp_state) { - case KVM_MP_STATE_HALTED: - vcpu->arch.pv.pv_unhalted = false; + case GVM_MP_STATE_HALTED: vcpu->arch.mp_state = - KVM_MP_STATE_RUNNABLE; - case KVM_MP_STATE_RUNNABLE: - vcpu->arch.apf.halted = false; + GVM_MP_STATE_RUNNABLE; + case GVM_MP_STATE_RUNNABLE: break; - case KVM_MP_STATE_INIT_RECEIVED: + case GVM_MP_STATE_INIT_RECEIVED: break; default: return -EINTR; @@ -6831,8 +4728,7 @@ static inline int vcpu_block(struct kvm *kvm, struct kvm_vcpu *vcpu) static inline bool kvm_vcpu_running(struct kvm_vcpu *vcpu) { - return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE && - !vcpu->arch.apf.halted); + return (vcpu->arch.mp_state == GVM_MP_STATE_RUNNABLE); } static int vcpu_run(struct kvm_vcpu *vcpu) @@ -6852,31 +4748,21 @@ static int vcpu_run(struct kvm_vcpu *vcpu) if (r <= 0) break; - clear_bit(KVM_REQ_PENDING_TIMER, &vcpu->requests); + clear_bit(GVM_REQ_PENDING_TIMER, &vcpu->requests); if (kvm_cpu_has_pending_timer(vcpu)) kvm_inject_pending_timer_irqs(vcpu); if (dm_request_for_irq_injection(vcpu) && kvm_vcpu_ready_for_interrupt_injection(vcpu)) { r = 0; - vcpu->run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN; + vcpu->run->exit_reason = GVM_EXIT_IRQ_WINDOW_OPEN; ++vcpu->stat.request_irq_exits; break; } - - kvm_check_async_pf_completion(vcpu); - - if (signal_pending(current)) { - r = -EINTR; - vcpu->run->exit_reason = KVM_EXIT_INTR; - ++vcpu->stat.signal_exits; + if (test_and_clear_bit(0, (size_t *)&vcpu->run->user_event_pending)) { + vcpu->run->exit_reason = GVM_EXIT_INTR; break; } - if (need_resched()) { - srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); - cond_resched(); - vcpu->srcu_idx = 
srcu_read_lock(&kvm->srcu); - } } srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); @@ -6925,6 +4811,7 @@ static int complete_emulated_mmio(struct kvm_vcpu *vcpu) struct kvm_run *run = vcpu->run; struct kvm_mmio_fragment *frag; unsigned len; + char *__data; BUG_ON(!vcpu->mmio_needed); @@ -6940,7 +4827,9 @@ static int complete_emulated_mmio(struct kvm_vcpu *vcpu) vcpu->mmio_cur_fragment++; } else { /* Go forward to the next mmio piece. */ - frag->data += len; + __data = frag->data; + __data += len; + frag->data = __data; frag->gpa += len; frag->len -= len; } @@ -6955,7 +4844,7 @@ static int complete_emulated_mmio(struct kvm_vcpu *vcpu) return complete_emulated_io(vcpu); } - run->exit_reason = KVM_EXIT_MMIO; + run->exit_reason = GVM_EXIT_MMIO; run->mmio.phys_addr = frag->gpa; if (vcpu->mmio_is_write) memcpy(run->mmio.data, frag->data, min(8u, frag->len)); @@ -6968,19 +4857,12 @@ static int complete_emulated_mmio(struct kvm_vcpu *vcpu) int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { - struct fpu *fpu = ¤t->thread.fpu; int r; - sigset_t sigsaved; - - fpu__activate_curr(fpu); - if (vcpu->sigset_active) - sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved); - - if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) { + if (unlikely(vcpu->arch.mp_state == GVM_MP_STATE_UNINITIALIZED)) { kvm_vcpu_block(vcpu); kvm_apic_accept_events(vcpu); - clear_bit(KVM_REQ_UNHALT, &vcpu->requests); + clear_bit(GVM_REQ_UNHALT, &vcpu->requests); r = -EAGAIN; goto out; } @@ -7006,9 +4888,6 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) out: post_kvm_run_save(vcpu); - if (vcpu->sigset_active) - sigprocmask(SIG_SETMASK, &sigsaved, NULL); - return r; } @@ -7079,7 +4958,7 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) vcpu->arch.exception.pending = false; - kvm_make_request(KVM_REQ_EVENT, vcpu); + kvm_make_request(GVM_REQ_EVENT, vcpu); return 0; } @@ -7092,7 +4971,6 @@ void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) *db = cs.db; *l = cs.l; } -EXPORT_SYMBOL_GPL(kvm_get_cs_db_l_bits); int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) @@ -7128,7 +5006,7 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, if (vcpu->arch.interrupt.pending && !vcpu->arch.interrupt.soft) set_bit(vcpu->arch.interrupt.nr, - (unsigned long *)sregs->interrupt_bitmap); + (size_t *)sregs->interrupt_bitmap); return 0; } @@ -7137,11 +5015,7 @@ int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, struct kvm_mp_state *mp_state) { kvm_apic_accept_events(vcpu); - if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED && - vcpu->arch.pv.pv_unhalted) - mp_state->mp_state = KVM_MP_STATE_RUNNABLE; - else - mp_state->mp_state = vcpu->arch.mp_state; + mp_state->mp_state = vcpu->arch.mp_state; return 0; } @@ -7150,15 +5024,15 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, struct kvm_mp_state *mp_state) { if (!lapic_in_kernel(vcpu) && - mp_state->mp_state != KVM_MP_STATE_RUNNABLE) + mp_state->mp_state != GVM_MP_STATE_RUNNABLE) return -EINVAL; - if (mp_state->mp_state == KVM_MP_STATE_SIPI_RECEIVED) { - vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED; - set_bit(KVM_APIC_SIPI, &vcpu->arch.apic->pending_events); + if (mp_state->mp_state == GVM_MP_STATE_SIPI_RECEIVED) { + vcpu->arch.mp_state = GVM_MP_STATE_INIT_RECEIVED; + set_bit(GVM_APIC_SIPI, &vcpu->arch.apic->pending_events); } else vcpu->arch.mp_state = mp_state->mp_state; - kvm_make_request(KVM_REQ_EVENT, vcpu); + 
kvm_make_request(GVM_REQ_EVENT, vcpu); return 0; } @@ -7178,10 +5052,9 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index, kvm_rip_write(vcpu, ctxt->eip); kvm_set_rflags(vcpu, ctxt->eflags); - kvm_make_request(KVM_REQ_EVENT, vcpu); + kvm_make_request(GVM_REQ_EVENT, vcpu); return EMULATE_DONE; } -EXPORT_SYMBOL_GPL(kvm_task_switch); int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) @@ -7233,9 +5106,9 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, if (mmu_reset_needed) kvm_mmu_reset_context(vcpu); - max_bits = KVM_NR_INTERRUPTS; + max_bits = GVM_NR_INTERRUPTS; pending_vec = find_first_bit( - (const unsigned long *)sregs->interrupt_bitmap, max_bits); + (const size_t *)sregs->interrupt_bitmap, max_bits); if (pending_vec < max_bits) { kvm_queue_interrupt(vcpu, pending_vec, false); pr_debug("Set back pending irq %d\n", pending_vec); @@ -7257,9 +5130,9 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, if (kvm_vcpu_is_bsp(vcpu) && kvm_rip_read(vcpu) == 0xfff0 && sregs->cs.selector == 0xf000 && sregs->cs.base == 0xffff0000 && !is_protmode(vcpu)) - vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; + vcpu->arch.mp_state = GVM_MP_STATE_RUNNABLE; - kvm_make_request(KVM_REQ_EVENT, vcpu); + kvm_make_request(GVM_REQ_EVENT, vcpu); return 0; } @@ -7267,14 +5140,14 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg) { - unsigned long rflags; + size_t rflags; int i, r; - if (dbg->control & (KVM_GUESTDBG_INJECT_DB | KVM_GUESTDBG_INJECT_BP)) { + if (dbg->control & (GVM_GUESTDBG_INJECT_DB | GVM_GUESTDBG_INJECT_BP)) { r = -EBUSY; if (vcpu->arch.exception.pending) goto out; - if (dbg->control & KVM_GUESTDBG_INJECT_DB) + if (dbg->control & GVM_GUESTDBG_INJECT_DB) kvm_queue_exception(vcpu, DB_VECTOR); else kvm_queue_exception(vcpu, BP_VECTOR); @@ -7287,20 +5160,20 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, rflags = kvm_get_rflags(vcpu); vcpu->guest_debug = dbg->control; - if (!(vcpu->guest_debug & KVM_GUESTDBG_ENABLE)) + if (!(vcpu->guest_debug & GVM_GUESTDBG_ENABLE)) vcpu->guest_debug = 0; - if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) { - for (i = 0; i < KVM_NR_DB_REGS; ++i) + if (vcpu->guest_debug & GVM_GUESTDBG_USE_HW_BP) { + for (i = 0; i < GVM_NR_DB_REGS; ++i) vcpu->arch.eff_db[i] = dbg->arch.debugreg[i]; vcpu->arch.guest_debug_dr7 = dbg->arch.debugreg[7]; } else { - for (i = 0; i < KVM_NR_DB_REGS; i++) + for (i = 0; i < GVM_NR_DB_REGS; i++) vcpu->arch.eff_db[i] = vcpu->arch.db[i]; } kvm_update_dr7(vcpu); - if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) + if (vcpu->guest_debug & GVM_GUESTDBG_SINGLESTEP) vcpu->arch.singlestep_rip = kvm_rip_read(vcpu) + get_segment_base(vcpu, VCPU_SREG_CS); @@ -7325,7 +5198,7 @@ out: int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, struct kvm_translation *tr) { - unsigned long vaddr = tr->linear_address; + size_t vaddr = tr->linear_address; gpa_t gpa; int idx; @@ -7343,7 +5216,7 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) { struct fxregs_state *fxsave = - &vcpu->arch.guest_fpu.state.fxsave; + &vcpu->arch.guest_fpu.fxsave; memcpy(fpu->fpr, fxsave->st_space, 128); fpu->fcw = fxsave->cwd; @@ -7360,7 +5233,7 @@ int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) { struct fxregs_state 
*fxsave = - &vcpu->arch.guest_fpu.state.fxsave; + &vcpu->arch.guest_fpu.fxsave; memcpy(fxsave->st_space, fpu->fpr, 128); fxsave->cwd = fpu->fcw; @@ -7374,11 +5247,28 @@ int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) return 0; } +static inline void fpstate_init_fxstate(struct fxregs_state *fx) +{ + fx->cwd = 0x37f; + fx->mxcsr = 0x1f80; +} + +static void fpstate_init(union fpu_state *state) +{ + memset(state, 0, PAGE_SIZE); + +#if 0 + if (static_cpu_has(X86_FEATURE_XSAVES)) + fpstate_init_xstate(&state->xsave); +#endif + fpstate_init_fxstate(&state->fxsave); +} + static void fx_init(struct kvm_vcpu *vcpu) { - fpstate_init(&vcpu->arch.guest_fpu.state); + fpstate_init(&vcpu->arch.guest_fpu); if (boot_cpu_has(X86_FEATURE_XSAVES)) - vcpu->arch.guest_fpu.state.xsave.header.xcomp_bv = + vcpu->arch.guest_fpu.xsave.header.xcomp_bv = host_xcr0 | XSTATE_COMPACTION_ENABLED; /* @@ -7389,54 +5279,78 @@ static void fx_init(struct kvm_vcpu *vcpu) vcpu->arch.cr0 |= X86_CR0_ET; } -void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) +/* + * These must be called with preempt disabled. Returns + * 'true' if the FPU state is still intact and we can + * keep registers active. + * + * The legacy FNSAVE instruction cleared all FPU state + * unconditionally, so registers are essentially destroyed. + * Modern FPU state can be kept in registers, if there are + * no pending FP exceptions. + */ +static inline void fpu_fxsave(union fpu_state *fpu) { - if (vcpu->guest_fpu_loaded) - return; +#if 0 + if (likely(use_xsave())) { + copy_xregs_to_kernel(&fpu->state.xsave); + } +#endif - /* - * Restore all possible states in the guest, - * and assume host would use all available bits. - * Guest xcr0 would be loaded later. - */ - vcpu->guest_fpu_loaded = 1; - __kernel_fpu_begin(); - __copy_kernel_to_fpregs(&vcpu->arch.guest_fpu.state); - trace_kvm_fpu(1); +#ifdef _WIN64 + _fxsave64(&fpu->fxsave); +#else + _fxsave(&fpu->fxsave); +#endif } -void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) +static inline void fpu_fxstore(union fpu_state *fpu) { - if (!vcpu->guest_fpu_loaded) { - vcpu->fpu_counter = 0; +#if 0 + if (use_xsave()) { + copy_kernel_to_xregs(&fpstate->xsave, mask); return; } +#endif +#ifdef _WIN64 + _fxrstor64(&fpu->fxsave); +#else + _fxrstor(&fpu->fxsave); +#endif +} - vcpu->guest_fpu_loaded = 0; - copy_fpregs_to_fpstate(&vcpu->arch.guest_fpu); - __kernel_fpu_end(); - ++vcpu->stat.fpu_reload; - /* - * If using eager FPU mode, or if the guest is a frequent user - * of the FPU, just leave the FPU active for next time. - * Every 255 times fpu_counter rolls over to 0; a guest that uses - * the FPU in bursts will revert to loading it on demand. 
- */ - if (!use_eager_fpu()) { - if (++vcpu->fpu_counter < 5) - kvm_make_request(KVM_REQ_DEACTIVATE_FPU, vcpu); - } - trace_kvm_fpu(0); +void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) +{ + uint64_t efer; + + rdmsrl(MSR_EFER, efer); + wrmsrl(MSR_EFER, efer & ~EFER_FFXSR); + + fpu_fxsave(&vcpu->arch.host_fpu); + fpu_fxstore(&vcpu->arch.guest_fpu); + + if (efer & EFER_FFXSR) + wrmsrl(MSR_EFER, efer); } -void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu) +void kvm_save_guest_fpu(struct kvm_vcpu *vcpu) { - void *wbinvd_dirty_mask = vcpu->arch.wbinvd_dirty_mask; + uint64_t efer; - kvmclock_reset(vcpu); + rdmsrl(MSR_EFER, efer); + if (efer & EFER_FFXSR) + wrmsrl(MSR_EFER, efer & ~EFER_FFXSR); + fpu_fxsave(&vcpu->arch.guest_fpu); + fpu_fxstore(&vcpu->arch.host_fpu); + + if (efer & EFER_FFXSR) + wrmsrl(MSR_EFER, efer); +} + +void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu) +{ kvm_x86_ops->vcpu_free(vcpu); - free_cpumask_var(wbinvd_dirty_mask); } struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, @@ -7456,47 +5370,25 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) { - int r; - kvm_vcpu_mtrr_init(vcpu); - r = vcpu_load(vcpu); - if (r) - return r; kvm_vcpu_reset(vcpu, false); kvm_mmu_setup(vcpu); - vcpu_put(vcpu); - return r; + return 0; } void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) { struct msr_data msr; - struct kvm *kvm = vcpu->kvm; - if (vcpu_load(vcpu)) - return; msr.data = 0x0; msr.index = MSR_IA32_TSC; msr.host_initiated = true; kvm_write_tsc(vcpu, &msr); - vcpu_put(vcpu); - - if (!kvmclock_periodic_sync) - return; - - schedule_delayed_work(&kvm->arch.kvmclock_sync_work, - KVMCLOCK_SYNC_PERIOD); } void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) { - int r; - vcpu->arch.apf.msr_val = 0; - - r = vcpu_load(vcpu); - BUG_ON(r); kvm_mmu_unload(vcpu); - vcpu_put(vcpu); kvm_x86_ops->vcpu_free(vcpu); } @@ -7521,18 +5413,10 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) vcpu->arch.cr2 = 0; - kvm_make_request(KVM_REQ_EVENT, vcpu); - vcpu->arch.apf.msr_val = 0; - vcpu->arch.st.msr_val = 0; - - kvmclock_reset(vcpu); - - kvm_clear_async_pf_completion_queue(vcpu); - kvm_async_pf_hash_reset(vcpu); - vcpu->arch.apf.halted = false; + kvm_make_request(GVM_REQ_EVENT, vcpu); if (!init_event) { - kvm_pmu_reset(vcpu); + //kvm_pmu_reset(vcpu); vcpu->arch.smbase = 0x30000; } @@ -7556,99 +5440,12 @@ void kvm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector) int kvm_arch_hardware_enable(void) { - struct kvm *kvm; - struct kvm_vcpu *vcpu; - int i; - int ret; - u64 local_tsc; - u64 max_tsc = 0; - bool stable, backwards_tsc = false; - - kvm_shared_msr_cpu_online(); - ret = kvm_x86_ops->hardware_enable(); - if (ret != 0) - return ret; - - local_tsc = rdtsc(); - stable = !check_tsc_unstable(); - list_for_each_entry(kvm, &vm_list, vm_list) { - kvm_for_each_vcpu(i, vcpu, kvm) { - if (!stable && vcpu->cpu == smp_processor_id()) - kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); - if (stable && vcpu->arch.last_host_tsc > local_tsc) { - backwards_tsc = true; - if (vcpu->arch.last_host_tsc > max_tsc) - max_tsc = vcpu->arch.last_host_tsc; - } - } - } - - /* - * Sometimes, even reliable TSCs go backwards. This happens on - * platforms that reset TSC during suspend or hibernate actions, but - * maintain synchronization. We must compensate. Fortunately, we can - * detect that condition here, which happens early in CPU bringup, - * before any KVM threads can be running. 
Unfortunately, we can't - * bring the TSCs fully up to date with real time, as we aren't yet far - * enough into CPU bringup that we know how much real time has actually - * elapsed; our helper function, ktime_get_boot_ns() will be using boot - * variables that haven't been updated yet. - * - * So we simply find the maximum observed TSC above, then record the - * adjustment to TSC in each VCPU. When the VCPU later gets loaded, - * the adjustment will be applied. Note that we accumulate - * adjustments, in case multiple suspend cycles happen before some VCPU - * gets a chance to run again. In the event that no KVM threads get a - * chance to run, we will miss the entire elapsed period, as we'll have - * reset last_host_tsc, so VCPUs will not have the TSC adjusted and may - * loose cycle time. This isn't too big a deal, since the loss will be - * uniform across all VCPUs (not to mention the scenario is extremely - * unlikely). It is possible that a second hibernate recovery happens - * much faster than a first, causing the observed TSC here to be - * smaller; this would require additional padding adjustment, which is - * why we set last_host_tsc to the local tsc observed here. - * - * N.B. - this code below runs only on platforms with reliable TSC, - * as that is the only way backwards_tsc is set above. Also note - * that this runs for ALL vcpus, which is not a bug; all VCPUs should - * have the same delta_cyc adjustment applied if backwards_tsc - * is detected. Note further, this adjustment is only done once, - * as we reset last_host_tsc on all VCPUs to stop this from being - * called multiple times (one for each physical CPU bringup). - * - * Platforms with unreliable TSCs don't have to deal with this, they - * will be compensated by the logic in vcpu_load, which sets the TSC to - * catchup mode. This will catchup all VCPUs to real time, but cannot - * guarantee that they stay in perfect synchronization. - */ - if (backwards_tsc) { - u64 delta_cyc = max_tsc - local_tsc; - backwards_tsc_observed = true; - list_for_each_entry(kvm, &vm_list, vm_list) { - kvm_for_each_vcpu(i, vcpu, kvm) { - vcpu->arch.tsc_offset_adjustment += delta_cyc; - vcpu->arch.last_host_tsc = local_tsc; - kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu); - } - - /* - * We have to disable TSC offset matching.. if you were - * booting a VM while issuing an S4 host suspend.... - * you may have some problem. Solving this issue is - * left as an exercise to the reader. - */ - kvm->arch.last_tsc_nsec = 0; - kvm->arch.last_tsc_write = 0; - } - - } - return 0; + return kvm_x86_ops->hardware_enable(); } void kvm_arch_hardware_disable(void) { kvm_x86_ops->hardware_disable(); - drop_user_return_notifiers(); } int kvm_arch_hardware_setup(void) @@ -7659,20 +5456,6 @@ int kvm_arch_hardware_setup(void) if (r != 0) return r; - if (kvm_has_tsc_control) { - /* - * Make sure the user can only configure tsc_khz values that - * fit into a signed integer. - * A min value is not calculated needed because it will always - * be 1 on all machines. 
- */ - u64 max = min(0x7fffffffULL, - __scale_tsc(kvm_max_tsc_scaling_ratio, tsc_khz)); - kvm_max_guest_tsc_khz = max; - - kvm_default_tsc_scaling_ratio = 1ULL << kvm_tsc_scaling_ratio_frac_bits; - } - kvm_init_msr_list(); return 0; } @@ -7691,19 +5474,16 @@ bool kvm_vcpu_is_reset_bsp(struct kvm_vcpu *vcpu) { return vcpu->kvm->arch.bsp_vcpu_id == vcpu->vcpu_id; } -EXPORT_SYMBOL_GPL(kvm_vcpu_is_reset_bsp); bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu) { return (vcpu->arch.apic_base & MSR_IA32_APICBASE_BSP) != 0; } -struct static_key kvm_no_apic_vcpu __read_mostly; -EXPORT_SYMBOL_GPL(kvm_no_apic_vcpu); +int kvm_no_apic_vcpu = 1; int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) { - struct page *page; struct kvm *kvm; int r; @@ -7711,50 +5491,27 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) kvm = vcpu->kvm; vcpu->arch.apicv_active = kvm_x86_ops->get_enable_apicv(); - vcpu->arch.pv.pv_unhalted = false; vcpu->arch.emulate_ctxt.ops = &emulate_ops; if (!irqchip_in_kernel(kvm) || kvm_vcpu_is_reset_bsp(vcpu)) - vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; + vcpu->arch.mp_state = GVM_MP_STATE_RUNNABLE; else - vcpu->arch.mp_state = KVM_MP_STATE_UNINITIALIZED; - - page = alloc_page(GFP_KERNEL | __GFP_ZERO); - if (!page) { - r = -ENOMEM; - goto fail; - } - vcpu->arch.pio_data = page_address(page); + vcpu->arch.mp_state = GVM_MP_STATE_UNINITIALIZED; - kvm_set_tsc_khz(vcpu, max_tsc_khz); + vcpu->arch.pio_data = (void *)((size_t)vcpu->run + PAGE_SIZE); r = kvm_mmu_create(vcpu); if (r < 0) - goto fail_free_pio_data; + goto fail; if (irqchip_in_kernel(kvm)) { r = kvm_create_lapic(vcpu); if (r < 0) goto fail_mmu_destroy; - } else - static_key_slow_inc(&kvm_no_apic_vcpu); - - vcpu->arch.mce_banks = kzalloc(KVM_MAX_MCE_BANKS * sizeof(u64) * 4, - GFP_KERNEL); - if (!vcpu->arch.mce_banks) { - r = -ENOMEM; - goto fail_free_lapic; - } - vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS; - - if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask, GFP_KERNEL)) { - r = -ENOMEM; - goto fail_free_mce_banks; - } + } fx_init(vcpu); vcpu->arch.ia32_tsc_adjust_msr = 0x0; - vcpu->arch.pv_time_enabled = false; vcpu->arch.guest_supported_xcr0 = 0; vcpu->arch.guest_xstate_size = XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET; @@ -7763,23 +5520,14 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) vcpu->arch.pat = MSR_IA32_CR_PAT_DEFAULT; - kvm_async_pf_hash_reset(vcpu); - kvm_pmu_init(vcpu); + //kvm_pmu_init(vcpu); vcpu->arch.pending_external_vector = -1; - kvm_hv_vcpu_init(vcpu); - return 0; -fail_free_mce_banks: - kfree(vcpu->arch.mce_banks); -fail_free_lapic: - kvm_free_lapic(vcpu); fail_mmu_destroy: kvm_mmu_destroy(vcpu); -fail_free_pio_data: - free_page((unsigned long)vcpu->arch.pio_data); fail: return r; } @@ -7788,24 +5536,14 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) { int idx; - kvm_hv_vcpu_uninit(vcpu); - kvm_pmu_destroy(vcpu); - kfree(vcpu->arch.mce_banks); + //kvm_pmu_destroy(vcpu); kvm_free_lapic(vcpu); idx = srcu_read_lock(&vcpu->kvm->srcu); kvm_mmu_destroy(vcpu); srcu_read_unlock(&vcpu->kvm->srcu, idx); - free_page((unsigned long)vcpu->arch.pio_data); - if (!lapic_in_kernel(vcpu)) - static_key_slow_dec(&kvm_no_apic_vcpu); -} - -void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) -{ - kvm_x86_ops->sched_in(vcpu, cpu); } -int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) +int kvm_arch_init_vm(struct kvm *kvm, size_t type) { if (type) return -EINVAL; @@ -7813,24 +5551,12 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) INIT_HLIST_HEAD(&kvm->arch.mask_notifier_list); INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); 
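Side note on the pio_data change above: kvm_arch_vcpu_init no longer allocates a separate page for the PIO scratch area and instead derives it from the kvm_run mapping, which only works if vcpu->run points at a contiguous, page-aligned region of at least two pages. A minimal sketch of that assumed layout follows (plain C, buffer and names are illustrative only, not taken from this driver):

#include <stdio.h>
#include <string.h>

#define PAGE_SIZE 4096

/* Two adjacent "pages" in one buffer: the first stands in for struct kvm_run,
 * the second for pio_data, mirroring the pointer arithmetic
 * "pio_data = (void *)((size_t)run + PAGE_SIZE)" used in the patch. */
static unsigned char shared[2 * PAGE_SIZE];

int main(void)
{
	void *run = shared;                   /* page 0: kvm_run        */
	void *pio_data = shared + PAGE_SIZE;  /* page 1: PIO data area  */

	memset(run, 0, PAGE_SIZE);
	memset(pio_data, 0, PAGE_SIZE);
	printf("pio_data sits %d bytes past run\n",
	       (int)((unsigned char *)pio_data - (unsigned char *)run));
	return 0;
}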
INIT_LIST_HEAD(&kvm->arch.zapped_obsolete_pages); - INIT_LIST_HEAD(&kvm->arch.assigned_dev_head); - atomic_set(&kvm->arch.noncoherent_dma_count, 0); /* Reserve bit 0 of irq_sources_bitmap for userspace irq source */ - set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap); - /* Reserve bit 1 of irq_sources_bitmap for irqfd-resampler */ - set_bit(KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID, - &kvm->arch.irq_sources_bitmap); + set_bit(GVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap); raw_spin_lock_init(&kvm->arch.tsc_write_lock); mutex_init(&kvm->arch.apic_map_lock); - spin_lock_init(&kvm->arch.pvclock_gtod_sync_lock); - - kvm->arch.kvmclock_offset = -ktime_get_boot_ns(); - pvclock_update_vm_gtod_copy(kvm); - - INIT_DELAYED_WORK(&kvm->arch.kvmclock_update_work, kvmclock_update_fn); - INIT_DELAYED_WORK(&kvm->arch.kvmclock_sync_work, kvmclock_sync_fn); kvm_page_track_init(kvm); kvm_mmu_init_vm(kvm); @@ -7843,11 +5569,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu) { - int r; - r = vcpu_load(vcpu); - BUG_ON(r); kvm_mmu_unload(vcpu); - vcpu_put(vcpu); } static void kvm_free_vcpus(struct kvm *kvm) @@ -7859,7 +5581,6 @@ static void kvm_free_vcpus(struct kvm *kvm) * Unpin any mmu pages first. */ kvm_for_each_vcpu(i, vcpu, kvm) { - kvm_clear_async_pf_completion_queue(vcpu); kvm_unload_vcpu_mmu(vcpu); } kvm_for_each_vcpu(i, vcpu, kvm) @@ -7873,23 +5594,15 @@ static void kvm_free_vcpus(struct kvm *kvm) mutex_unlock(&kvm->lock); } -void kvm_arch_sync_events(struct kvm *kvm) -{ - cancel_delayed_work_sync(&kvm->arch.kvmclock_sync_work); - cancel_delayed_work_sync(&kvm->arch.kvmclock_update_work); - kvm_free_all_assigned_devices(kvm); - kvm_free_pit(kvm); -} - int __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size) { int i, r; - unsigned long hva; + size_t hva; struct kvm_memslots *slots = kvm_memslots(kvm); struct kvm_memory_slot *slot, old; /* Called with kvm->slots_lock held. 
*/ - if (WARN_ON(id >= KVM_MEM_SLOTS_NUM)) + if (WARN_ON(id >= GVM_MEM_SLOTS_NUM)) return -EINVAL; slot = id_to_memslot(slots, id); @@ -7913,7 +5626,7 @@ int __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size) } old = *slot; - for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { + for (i = 0; i < GVM_ADDRESS_SPACE_NUM; i++) { struct kvm_userspace_memory_region m; m.slot = id | (i << 16); @@ -7933,7 +5646,6 @@ int __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size) return 0; } -EXPORT_SYMBOL_GPL(__x86_set_memory_region); int x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size) { @@ -7945,11 +5657,10 @@ int x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size) return r; } -EXPORT_SYMBOL_GPL(x86_set_memory_region); void kvm_arch_destroy_vm(struct kvm *kvm) { - if (current->mm == kvm->mm) { + if (IoGetCurrentProcess() == kvm->process) { /* * Free memory regions allocated on behalf of userspace, * unless the the memory map has changed due to process exit @@ -7961,82 +5672,31 @@ void kvm_arch_destroy_vm(struct kvm *kvm) } if (kvm_x86_ops->vm_destroy) kvm_x86_ops->vm_destroy(kvm); - kvm_iommu_unmap_guest(kvm); kfree(kvm->arch.vpic); kfree(kvm->arch.vioapic); kvm_free_vcpus(kvm); - kvfree(rcu_dereference_check(kvm->arch.apic_map, 1)); + kvfree(rcu_dereference(kvm->arch.apic_map)); kvm_mmu_uninit_vm(kvm); + kvm_page_track_destroy(kvm); } void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free, struct kvm_memory_slot *dont) { - int i; - - for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) { - if (!dont || free->arch.rmap[i] != dont->arch.rmap[i]) { - kvfree(free->arch.rmap[i]); - free->arch.rmap[i] = NULL; - } - if (i == 0) - continue; - - if (!dont || free->arch.lpage_info[i - 1] != - dont->arch.lpage_info[i - 1]) { - kvfree(free->arch.lpage_info[i - 1]); - free->arch.lpage_info[i - 1] = NULL; - } + if (!dont || free->arch.rmap != dont->arch.rmap) { + kvfree(free->arch.rmap); + free->arch.rmap = NULL; } - kvm_page_track_free_memslot(free, dont); } int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot, - unsigned long npages) + size_t npages) { - int i; - - for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) { - struct kvm_lpage_info *linfo; - unsigned long ugfn; - int lpages; - int level = i + 1; - - lpages = gfn_to_index(slot->base_gfn + npages - 1, - slot->base_gfn, level) + 1; - - slot->arch.rmap[i] = - kvm_kvzalloc(lpages * sizeof(*slot->arch.rmap[i])); - if (!slot->arch.rmap[i]) - goto out_free; - if (i == 0) - continue; - - linfo = kvm_kvzalloc(lpages * sizeof(*linfo)); - if (!linfo) - goto out_free; - - slot->arch.lpage_info[i - 1] = linfo; - - if (slot->base_gfn & (KVM_PAGES_PER_HPAGE(level) - 1)) - linfo[0].disallow_lpage = 1; - if ((slot->base_gfn + npages) & (KVM_PAGES_PER_HPAGE(level) - 1)) - linfo[lpages - 1].disallow_lpage = 1; - ugfn = slot->userspace_addr >> PAGE_SHIFT; - /* - * If the gfn and userspace address are not aligned wrt each - * other, or if explicitly asked to, disable large page - * support for this slot - */ - if ((slot->base_gfn ^ ugfn) & (KVM_PAGES_PER_HPAGE(level) - 1) || - !kvm_largepages_enabled()) { - unsigned long j; - - for (j = 0; j < lpages; ++j) - linfo[j].disallow_lpage = 1; - } - } + slot->arch.rmap = + kvm_kvzalloc(npages * sizeof(*slot->arch.rmap)); + if (!slot->arch.rmap) + goto out_free; if (kvm_page_track_create_memslot(slot, npages)) goto out_free; @@ -8044,15 +5704,7 @@ int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot, return 0; out_free: - for (i = 0; i 
< KVM_NR_PAGE_SIZES; ++i) { - kvfree(slot->arch.rmap[i]); - slot->arch.rmap[i] = NULL; - if (i == 0) - continue; - - kvfree(slot->arch.lpage_info[i - 1]); - slot->arch.lpage_info[i - 1] = NULL; - } + kvfree(slot->arch.rmap); return -ENOMEM; } @@ -8077,7 +5729,7 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm, struct kvm_memory_slot *new) { /* Still write protect RO slot */ - if (new->flags & KVM_MEM_READONLY) { + if (new->flags & GVM_MEM_READONLY) { kvm_mmu_slot_remove_write_access(kvm, new); return; } @@ -8087,8 +5739,8 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm, * * kvm_x86_ops->slot_disable_log_dirty is called when: * - * - KVM_MR_CREATE with dirty logging is disabled - * - KVM_MR_FLAGS_ONLY with dirty logging is disabled in new flag + * - GVM_MR_CREATE with dirty logging is disabled + * - GVM_MR_FLAGS_ONLY with dirty logging is disabled in new flag * * The reason is, in case of PML, we need to set D-bit for any slots * with dirty logging disabled in order to eliminate unnecessary GPA @@ -8112,7 +5764,7 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm, * * See the comments in fast_page_fault(). */ - if (new->flags & KVM_MEM_LOG_DIRTY_PAGES) { + if (new->flags & GVM_MEM_LOG_DIRTY_PAGES) { if (kvm_x86_ops->slot_enable_log_dirty) kvm_x86_ops->slot_enable_log_dirty(kvm, new); else @@ -8149,22 +5801,22 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, * which can be collapsed into a single large-page spte. Later * page faults will create the large-page sptes. */ - if ((change != KVM_MR_DELETE) && - (old->flags & KVM_MEM_LOG_DIRTY_PAGES) && - !(new->flags & KVM_MEM_LOG_DIRTY_PAGES)) + if ((change != GVM_MR_DELETE) && + (old->flags & GVM_MEM_LOG_DIRTY_PAGES) && + !(new->flags & GVM_MEM_LOG_DIRTY_PAGES)) kvm_mmu_zap_collapsible_sptes(kvm, new); /* * Set up write protection and/or dirty logging for the new slot. * - * For KVM_MR_DELETE and KVM_MR_MOVE, the shadow pages of old slot have + * For GVM_MR_DELETE and GVM_MR_MOVE, the shadow pages of old slot have * been zapped so no dirty logging staff is needed for old slot. For - * KVM_MR_FLAGS_ONLY, the old slot is essentially the same one as the + * GVM_MR_FLAGS_ONLY, the old slot is essentially the same one as the * new and it's also covered when dealing with the new slot. * * FIXME: const-ify all uses of struct kvm_memory_slot. 
*/ - if (change != KVM_MR_DELETE) + if (change != GVM_MR_DELETE) kvm_mmu_slot_apply_flags(kvm, (struct kvm_memory_slot *) new); } @@ -8181,28 +5833,19 @@ void kvm_arch_flush_shadow_memslot(struct kvm *kvm, static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu) { - if (!list_empty_careful(&vcpu->async_pf.done)) - return true; - if (kvm_apic_has_events(vcpu)) return true; - if (vcpu->arch.pv.pv_unhalted) - return true; - if (atomic_read(&vcpu->arch.nmi_queued)) return true; - if (test_bit(KVM_REQ_SMI, &vcpu->requests)) + if (test_bit(GVM_REQ_SMI, &vcpu->requests)) return true; if (kvm_arch_interrupt_allowed(vcpu) && kvm_cpu_has_interrupt(vcpu)) return true; - if (kvm_hv_has_stimer_pending(vcpu)) - return true; - return false; } @@ -8224,295 +5867,45 @@ int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu) return kvm_x86_ops->interrupt_allowed(vcpu); } -unsigned long kvm_get_linear_rip(struct kvm_vcpu *vcpu) +size_t kvm_get_linear_rip(struct kvm_vcpu *vcpu) { if (is_64_bit_mode(vcpu)) return kvm_rip_read(vcpu); return (u32)(get_segment_base(vcpu, VCPU_SREG_CS) + kvm_rip_read(vcpu)); } -EXPORT_SYMBOL_GPL(kvm_get_linear_rip); -bool kvm_is_linear_rip(struct kvm_vcpu *vcpu, unsigned long linear_rip) +bool kvm_is_linear_rip(struct kvm_vcpu *vcpu, size_t linear_rip) { return kvm_get_linear_rip(vcpu) == linear_rip; } -EXPORT_SYMBOL_GPL(kvm_is_linear_rip); -unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu) +size_t kvm_get_rflags(struct kvm_vcpu *vcpu) { - unsigned long rflags; + size_t rflags; rflags = kvm_x86_ops->get_rflags(vcpu); - if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) + if (vcpu->guest_debug & GVM_GUESTDBG_SINGLESTEP) rflags &= ~X86_EFLAGS_TF; return rflags; } -EXPORT_SYMBOL_GPL(kvm_get_rflags); -static void __kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) +static void __kvm_set_rflags(struct kvm_vcpu *vcpu, size_t rflags) { - if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP && + if (vcpu->guest_debug & GVM_GUESTDBG_SINGLESTEP && kvm_is_linear_rip(vcpu, vcpu->arch.singlestep_rip)) rflags |= X86_EFLAGS_TF; kvm_x86_ops->set_rflags(vcpu, rflags); } -void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) +void kvm_set_rflags(struct kvm_vcpu *vcpu, size_t rflags) { __kvm_set_rflags(vcpu, rflags); - kvm_make_request(KVM_REQ_EVENT, vcpu); -} -EXPORT_SYMBOL_GPL(kvm_set_rflags); - -void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work) -{ - int r; - - if ((vcpu->arch.mmu.direct_map != work->arch.direct_map) || - work->wakeup_all) - return; - - r = kvm_mmu_reload(vcpu); - if (unlikely(r)) - return; - - if (!vcpu->arch.mmu.direct_map && - work->arch.cr3 != vcpu->arch.mmu.get_cr3(vcpu)) - return; - - vcpu->arch.mmu.page_fault(vcpu, work->gva, 0, true); -} - -static inline u32 kvm_async_pf_hash_fn(gfn_t gfn) -{ - return hash_32(gfn & 0xffffffff, order_base_2(ASYNC_PF_PER_VCPU)); -} - -static inline u32 kvm_async_pf_next_probe(u32 key) -{ - return (key + 1) & (roundup_pow_of_two(ASYNC_PF_PER_VCPU) - 1); -} - -static void kvm_add_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn) -{ - u32 key = kvm_async_pf_hash_fn(gfn); - - while (vcpu->arch.apf.gfns[key] != ~0) - key = kvm_async_pf_next_probe(key); - - vcpu->arch.apf.gfns[key] = gfn; -} - -static u32 kvm_async_pf_gfn_slot(struct kvm_vcpu *vcpu, gfn_t gfn) -{ - int i; - u32 key = kvm_async_pf_hash_fn(gfn); - - for (i = 0; i < roundup_pow_of_two(ASYNC_PF_PER_VCPU) && - (vcpu->arch.apf.gfns[key] != gfn && - vcpu->arch.apf.gfns[key] != ~0); i++) - key = kvm_async_pf_next_probe(key); - - return key; 
-} - -bool kvm_find_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn) -{ - return vcpu->arch.apf.gfns[kvm_async_pf_gfn_slot(vcpu, gfn)] == gfn; -} - -static void kvm_del_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn) -{ - u32 i, j, k; - - i = j = kvm_async_pf_gfn_slot(vcpu, gfn); - while (true) { - vcpu->arch.apf.gfns[i] = ~0; - do { - j = kvm_async_pf_next_probe(j); - if (vcpu->arch.apf.gfns[j] == ~0) - return; - k = kvm_async_pf_hash_fn(vcpu->arch.apf.gfns[j]); - /* - * k lies cyclically in ]i,j] - * | i.k.j | - * |....j i.k.| or |.k..j i...| - */ - } while ((i <= j) ? (i < k && k <= j) : (i < k || k <= j)); - vcpu->arch.apf.gfns[i] = vcpu->arch.apf.gfns[j]; - i = j; - } -} - -static int apf_put_user(struct kvm_vcpu *vcpu, u32 val) -{ - - return kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.apf.data, &val, - sizeof(val)); -} - -void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu, - struct kvm_async_pf *work) -{ - struct x86_exception fault; - - trace_kvm_async_pf_not_present(work->arch.token, work->gva); - kvm_add_async_pf_gfn(vcpu, work->arch.gfn); - - if (!(vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED) || - (vcpu->arch.apf.send_user_only && - kvm_x86_ops->get_cpl(vcpu) == 0)) - kvm_make_request(KVM_REQ_APF_HALT, vcpu); - else if (!apf_put_user(vcpu, KVM_PV_REASON_PAGE_NOT_PRESENT)) { - fault.vector = PF_VECTOR; - fault.error_code_valid = true; - fault.error_code = 0; - fault.nested_page_fault = false; - fault.address = work->arch.token; - kvm_inject_page_fault(vcpu, &fault); - } -} - -void kvm_arch_async_page_present(struct kvm_vcpu *vcpu, - struct kvm_async_pf *work) -{ - struct x86_exception fault; - - trace_kvm_async_pf_ready(work->arch.token, work->gva); - if (work->wakeup_all) - work->arch.token = ~0; /* broadcast wakeup */ - else - kvm_del_async_pf_gfn(vcpu, work->arch.gfn); - - if ((vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED) && - !apf_put_user(vcpu, KVM_PV_REASON_PAGE_READY)) { - fault.vector = PF_VECTOR; - fault.error_code_valid = true; - fault.error_code = 0; - fault.nested_page_fault = false; - fault.address = work->arch.token; - kvm_inject_page_fault(vcpu, &fault); - } - vcpu->arch.apf.halted = false; - vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; -} - -bool kvm_arch_can_inject_async_page_present(struct kvm_vcpu *vcpu) -{ - if (!(vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED)) - return true; - else - return !kvm_event_needs_reinjection(vcpu) && - kvm_x86_ops->interrupt_allowed(vcpu); -} - -void kvm_arch_start_assignment(struct kvm *kvm) -{ - atomic_inc(&kvm->arch.assigned_device_count); -} -EXPORT_SYMBOL_GPL(kvm_arch_start_assignment); - -void kvm_arch_end_assignment(struct kvm *kvm) -{ - atomic_dec(&kvm->arch.assigned_device_count); -} -EXPORT_SYMBOL_GPL(kvm_arch_end_assignment); - -bool kvm_arch_has_assigned_device(struct kvm *kvm) -{ - return atomic_read(&kvm->arch.assigned_device_count); -} -EXPORT_SYMBOL_GPL(kvm_arch_has_assigned_device); - -void kvm_arch_register_noncoherent_dma(struct kvm *kvm) -{ - atomic_inc(&kvm->arch.noncoherent_dma_count); -} -EXPORT_SYMBOL_GPL(kvm_arch_register_noncoherent_dma); - -void kvm_arch_unregister_noncoherent_dma(struct kvm *kvm) -{ - atomic_dec(&kvm->arch.noncoherent_dma_count); -} -EXPORT_SYMBOL_GPL(kvm_arch_unregister_noncoherent_dma); - -bool kvm_arch_has_noncoherent_dma(struct kvm *kvm) -{ - return atomic_read(&kvm->arch.noncoherent_dma_count); -} -EXPORT_SYMBOL_GPL(kvm_arch_has_noncoherent_dma); - -bool kvm_arch_has_irq_bypass(void) -{ - return kvm_x86_ops->update_pi_irte != NULL; -} - -int 
kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons, - struct irq_bypass_producer *prod) -{ - struct kvm_kernel_irqfd *irqfd = - container_of(cons, struct kvm_kernel_irqfd, consumer); - - irqfd->producer = prod; - - return kvm_x86_ops->update_pi_irte(irqfd->kvm, - prod->irq, irqfd->gsi, 1); -} - -void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons, - struct irq_bypass_producer *prod) -{ - int ret; - struct kvm_kernel_irqfd *irqfd = - container_of(cons, struct kvm_kernel_irqfd, consumer); - - WARN_ON(irqfd->producer != prod); - irqfd->producer = NULL; - - /* - * When producer of consumer is unregistered, we change back to - * remapped mode, so we can re-use the current implementation - * when the irq is masked/disabled or the consumer side (KVM - * int this case doesn't want to receive the interrupts. - */ - ret = kvm_x86_ops->update_pi_irte(irqfd->kvm, prod->irq, irqfd->gsi, 0); - if (ret) - printk(KERN_INFO "irq bypass consumer (token %p) unregistration" - " fails: %d\n", irqfd->consumer.token, ret); -} - -int kvm_arch_update_irqfd_routing(struct kvm *kvm, unsigned int host_irq, - uint32_t guest_irq, bool set) -{ - if (!kvm_x86_ops->update_pi_irte) - return -EINVAL; - - return kvm_x86_ops->update_pi_irte(kvm, host_irq, guest_irq, set); + kvm_make_request(GVM_REQ_EVENT, vcpu); } bool kvm_vector_hashing_enabled(void) { return vector_hashing; } -EXPORT_SYMBOL_GPL(kvm_vector_hashing_enabled); - -EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit); -EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_fast_mmio); -EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq); -EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_page_fault); -EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_msr); -EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_cr); -EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmrun); -EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmexit); -EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmexit_inject); -EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intr_vmexit); -EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_invlpga); -EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_skinit); -EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intercepts); -EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_write_tsc_offset); -EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_ple_window); -EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pml_full); -EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pi_irte_update); -EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_unaccelerated_access); -EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_incomplete_ipi); + diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index e8ff3e4..0b6b308 100644..100755 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -1,9 +1,14 @@ -#ifndef ARCH_X86_KVM_X86_H -#define ARCH_X86_KVM_X86_H +/* + * Copyright 2019 Google LLC + */ + +#ifndef ARCH_X86_GVM_X86_H +#define ARCH_X86_GVM_X86_H #include <linux/kvm_host.h> -#include <asm/pvclock.h> +#include <gvm_types.h> #include "kvm_cache_regs.h" +#include <asm/msr-index.h> #define MSR_IA32_CR_PAT_DEFAULT 0x0007040600070406ULL @@ -67,17 +72,17 @@ static inline bool mmu_is_nested(struct kvm_vcpu *vcpu) static inline int is_pae(struct kvm_vcpu *vcpu) { - return kvm_read_cr4_bits(vcpu, X86_CR4_PAE); + return (int)kvm_read_cr4_bits(vcpu, X86_CR4_PAE); } static inline int is_pse(struct kvm_vcpu *vcpu) { - return kvm_read_cr4_bits(vcpu, X86_CR4_PSE); + return (int)kvm_read_cr4_bits(vcpu, X86_CR4_PSE); } static inline int is_paging(struct kvm_vcpu *vcpu) { - return likely(kvm_read_cr0_bits(vcpu, X86_CR0_PG)); + return likely((int)kvm_read_cr0_bits(vcpu, X86_CR0_PG)); } static inline u32 bit(int bitno) @@ -113,7 +118,7 @@ static inline void vcpu_clear_mmio_info(struct kvm_vcpu *vcpu, gva_t gva) 
vcpu->arch.mmio_gva = 0; } -static inline bool vcpu_match_mmio_gva(struct kvm_vcpu *vcpu, unsigned long gva) +static inline bool vcpu_match_mmio_gva(struct kvm_vcpu *vcpu, size_t gva) { if (vcpu_match_mmio_gen(vcpu) && vcpu->arch.mmio_gva && vcpu->arch.mmio_gva == (gva & PAGE_MASK)) @@ -131,21 +136,21 @@ static inline bool vcpu_match_mmio_gpa(struct kvm_vcpu *vcpu, gpa_t gpa) return false; } -static inline unsigned long kvm_register_readl(struct kvm_vcpu *vcpu, +static inline size_t kvm_register_readl(struct kvm_vcpu *vcpu, enum kvm_reg reg) { - unsigned long val = kvm_register_read(vcpu, reg); + size_t val = kvm_register_read(vcpu, reg); return is_64_bit_mode(vcpu) ? val : (u32)val; } static inline void kvm_register_writel(struct kvm_vcpu *vcpu, enum kvm_reg reg, - unsigned long val) + size_t val) { if (!is_64_bit_mode(vcpu)) val = (u32)val; - return kvm_register_write(vcpu, reg, val); + kvm_register_write(vcpu, reg, val); } static inline bool kvm_check_has_quirk(struct kvm *kvm, u64 quirk) @@ -178,7 +183,7 @@ bool kvm_mtrr_check_gfn_range_consistency(struct kvm_vcpu *vcpu, gfn_t gfn, int page_num); bool kvm_vector_hashing_enabled(void); -#define KVM_SUPPORTED_XCR0 (XFEATURE_MASK_FP | XFEATURE_MASK_SSE \ +#define GVM_SUPPORTED_XCR0 (XFEATURE_MASK_FP | XFEATURE_MASK_SSE \ | XFEATURE_MASK_YMM | XFEATURE_MASK_BNDREGS \ | XFEATURE_MASK_BNDCSR | XFEATURE_MASK_AVX512 \ | XFEATURE_MASK_PKRU) @@ -190,13 +195,7 @@ extern unsigned int min_timer_period_us; extern unsigned int lapic_timer_advance_ns; -extern struct static_key kvm_no_apic_vcpu; - -static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec) -{ - return pvclock_scale_delta(nsec, vcpu->arch.virtual_tsc_mult, - vcpu->arch.virtual_tsc_shift); -} +extern int kvm_no_apic_vcpu; /* Same "calling convention" as do_div: * - divide (n << 32) by base diff --git a/asmgen/asmgen.c b/asmgen/asmgen.c new file mode 100755 index 0000000..8e65723 --- /dev/null +++ b/asmgen/asmgen.c @@ -0,0 +1,80 @@ +/* + * Copyright 2019 Google LLC + + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 2 as published by the Free Software Foundation. + + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +/* + * This program prepares data definitions needed by assembly code + * in driver. 
+ */ +#pragma warning(disable:4146) +#pragma warning(disable:4013) +#include <stdio.h> +#include <stdlib.h> +#include <linux\kvm_host.h> +#include <arch\x86\kvm\vmx_def.h> +#include <arch\x86\kvm\svm_def.h> +#include <asm\kvm_emulate.h> +#include <intrin.h> + +#define ASM_GEN_OFFSET(name, type, field) \ + printf("\t" #name "\tEQU 0%zxh\n", offsetof(type, field)) + +int main(void) +{ + _ReadWriteBarrier(); + printf("; This is generated by asmgen\n"); + printf("; Please make sure to rerun asmgen after updating\n"); + printf("; key data structures used by both assembly and C.\n\n"); + + //struct vcpu_vmx + printf("\n"); + ASM_GEN_OFFSET(VMX_TO_LAUNCHED, struct vcpu_vmx, __launched); + ASM_GEN_OFFSET(VMX_TO_FAIL, struct vcpu_vmx, fail); + ASM_GEN_OFFSET(VMX_TO_RSP, struct vcpu_vmx, host_rsp); + ASM_GEN_OFFSET(VMX_TO_RAX, struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RAX]); + ASM_GEN_OFFSET(VMX_TO_RBX, struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBX]); + ASM_GEN_OFFSET(VMX_TO_RCX, struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RCX]); + ASM_GEN_OFFSET(VMX_TO_RDX, struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RDX]); + ASM_GEN_OFFSET(VMX_TO_RSI, struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RSI]); + ASM_GEN_OFFSET(VMX_TO_RDI, struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RDI]); + ASM_GEN_OFFSET(VMX_TO_RBP, struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBP]); + ASM_GEN_OFFSET(VMX_TO_R8, struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R8]); + ASM_GEN_OFFSET(VMX_TO_R9, struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R9]); + ASM_GEN_OFFSET(VMX_TO_R10, struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R10]); + ASM_GEN_OFFSET(VMX_TO_R11, struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R11]); + ASM_GEN_OFFSET(VMX_TO_R12, struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R12]); + ASM_GEN_OFFSET(VMX_TO_R13, struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R13]); + ASM_GEN_OFFSET(VMX_TO_R14, struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R14]); + ASM_GEN_OFFSET(VMX_TO_R15, struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R15]); + ASM_GEN_OFFSET(VMX_TO_CR2, struct vcpu_vmx, vcpu.arch.cr2); + + //struct vcpu_svm + ASM_GEN_OFFSET(SVM_TO_VMCB_PA, struct vcpu_svm, vmcb_pa); + ASM_GEN_OFFSET(SVM_TO_RBX, struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RBX]); + ASM_GEN_OFFSET(SVM_TO_RCX, struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RCX]); + ASM_GEN_OFFSET(SVM_TO_RDX, struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RDX]); + ASM_GEN_OFFSET(SVM_TO_RSI, struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RSI]); + ASM_GEN_OFFSET(SVM_TO_RDI, struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RDI]); + ASM_GEN_OFFSET(SVM_TO_RBP, struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RBP]); + ASM_GEN_OFFSET(SVM_TO_R8, struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R8]); + ASM_GEN_OFFSET(SVM_TO_R9, struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R9]); + ASM_GEN_OFFSET(SVM_TO_R10, struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R10]); + ASM_GEN_OFFSET(SVM_TO_R11, struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R11]); + ASM_GEN_OFFSET(SVM_TO_R12, struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R12]); + ASM_GEN_OFFSET(SVM_TO_R13, struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R13]); + ASM_GEN_OFFSET(SVM_TO_R14, struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R14]); + ASM_GEN_OFFSET(SVM_TO_R15, struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R15]); + + ASM_GEN_OFFSET(CXT_TO_DST, struct x86_emulate_ctxt, dst.val); + ASM_GEN_OFFSET(CXT_TO_SRC, struct x86_emulate_ctxt, src.val); + ASM_GEN_OFFSET(CXT_TO_SRC2, struct x86_emulate_ctxt, src2.val); +} diff --git a/asmgen/asmgen.vcxproj b/asmgen/asmgen.vcxproj new file mode 100755 index 0000000..5f31287 --- /dev/null +++ b/asmgen/asmgen.vcxproj @@ -0,0 
+1,131 @@ +<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" ToolsVersion="15.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+ <ItemGroup Label="ProjectConfigurations">
+ <ProjectConfiguration Include="Debug|Win32">
+ <Configuration>Debug</Configuration>
+ <Platform>Win32</Platform>
+ </ProjectConfiguration>
+ <ProjectConfiguration Include="Release|Win32">
+ <Configuration>Release</Configuration>
+ <Platform>Win32</Platform>
+ </ProjectConfiguration>
+ <ProjectConfiguration Include="Debug|x64">
+ <Configuration>Debug</Configuration>
+ <Platform>x64</Platform>
+ </ProjectConfiguration>
+ <ProjectConfiguration Include="Release|x64">
+ <Configuration>Release</Configuration>
+ <Platform>x64</Platform>
+ </ProjectConfiguration>
+ </ItemGroup>
+ <PropertyGroup Label="Globals">
+ <ProjectGuid>{07877F58-4EE6-4C6E-A6AA-AF42B477A5BE}</ProjectGuid>
+ <RootNamespace>asmgen</RootNamespace>
+ <WindowsTargetPlatformVersion>10.0</WindowsTargetPlatformVersion>
+ </PropertyGroup>
+ <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
+ <ConfigurationType>Application</ConfigurationType>
+ <UseDebugLibraries>true</UseDebugLibraries>
+ <PlatformToolset>v142</PlatformToolset>
+ <CharacterSet>MultiByte</CharacterSet>
+ </PropertyGroup>
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
+ <ConfigurationType>Application</ConfigurationType>
+ <UseDebugLibraries>false</UseDebugLibraries>
+ <PlatformToolset>v142</PlatformToolset>
+ <WholeProgramOptimization>true</WholeProgramOptimization>
+ <CharacterSet>MultiByte</CharacterSet>
+ </PropertyGroup>
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
+ <ConfigurationType>Application</ConfigurationType>
+ <UseDebugLibraries>true</UseDebugLibraries>
+ <PlatformToolset>v142</PlatformToolset>
+ <CharacterSet>MultiByte</CharacterSet>
+ <SpectreMitigation>false</SpectreMitigation>
+ </PropertyGroup>
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
+ <ConfigurationType>Application</ConfigurationType>
+ <UseDebugLibraries>false</UseDebugLibraries>
+ <PlatformToolset>v142</PlatformToolset>
+ <WholeProgramOptimization>true</WholeProgramOptimization>
+ <CharacterSet>MultiByte</CharacterSet>
+ <SpectreMitigation>false</SpectreMitigation>
+ </PropertyGroup>
+ <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+ <ImportGroup Label="ExtensionSettings">
+ </ImportGroup>
+ <ImportGroup Label="Shared">
+ </ImportGroup>
+ <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+ </ImportGroup>
+ <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+ </ImportGroup>
+ <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+ </ImportGroup>
+ <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+ </ImportGroup>
+ <PropertyGroup Label="UserMacros" />
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ <OutDir>$(ProjectDir)..\build\$(ProjectName)\$(Platform)\$(Configuration)\</OutDir>
+ <IntDir>$(ProjectDir)..\build\$(ProjectName)\$(Platform)\$(Configuration)\</IntDir>
+ </PropertyGroup>
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ <OutDir>$(ProjectDir)..\build\$(ProjectName)\$(Platform)\$(Configuration)\</OutDir>
+ <IntDir>$(ProjectDir)..\build\$(ProjectName)\$(Platform)\$(Configuration)\</IntDir>
+ </PropertyGroup>
+ <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ <ClCompile>
+ <WarningLevel>Level3</WarningLevel>
+ <Optimization>Disabled</Optimization>
+ <SDLCheck>true</SDLCheck>
+ </ClCompile>
+ </ItemDefinitionGroup>
+ <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ <ClCompile>
+ <WarningLevel>Level3</WarningLevel>
+ <Optimization>Disabled</Optimization>
+ <SDLCheck>true</SDLCheck>
+ <AdditionalIncludeDirectories>$(KIT_SHARED_IncludePath)\..\km;$(ProjectDir)..\arch\x86\include;$(ProjectDir)..\include;$(ProjectDir)..;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+ <PreprocessorDefinitions>CONFIG_X86_64;CONFIG_X86_LOCAL_APIC;WINNT=1;_AMD64_;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+ </ClCompile>
+ </ItemDefinitionGroup>
+ <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ <ClCompile>
+ <WarningLevel>Level3</WarningLevel>
+ <Optimization>MaxSpeed</Optimization>
+ <FunctionLevelLinking>true</FunctionLevelLinking>
+ <IntrinsicFunctions>true</IntrinsicFunctions>
+ <SDLCheck>true</SDLCheck>
+ </ClCompile>
+ <Link>
+ <EnableCOMDATFolding>true</EnableCOMDATFolding>
+ <OptimizeReferences>true</OptimizeReferences>
+ </Link>
+ </ItemDefinitionGroup>
+ <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ <ClCompile>
+ <WarningLevel>Level3</WarningLevel>
+ <Optimization>MaxSpeed</Optimization>
+ <FunctionLevelLinking>true</FunctionLevelLinking>
+ <IntrinsicFunctions>true</IntrinsicFunctions>
+ <SDLCheck>true</SDLCheck>
+ <AdditionalIncludeDirectories>$(KIT_SHARED_IncludePath)\..\km;$(ProjectDir)..\arch\x86\include;$(ProjectDir)..\include;$(ProjectDir)..;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+ <PreprocessorDefinitions>CONFIG_X86_64;CONFIG_X86_LOCAL_APIC;WINNT=1;_AMD64_;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+ </ClCompile>
+ <Link>
+ <EnableCOMDATFolding>true</EnableCOMDATFolding>
+ <OptimizeReferences>true</OptimizeReferences>
+ </Link>
+ </ItemDefinitionGroup>
+ <ItemGroup>
+ <ClCompile Include="asmgen.c" />
+ </ItemGroup>
+ <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+ <ImportGroup Label="ExtensionTargets">
+ </ImportGroup>
+</Project>
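For reference on the asmgen.c program above: it writes a MASM include file to stdout, which is presumably captured as the __asm.inc pulled in by assembly/x64/assembly.asm below (the redirection step itself is not shown in this commit). A sketch of the generated output, with made-up offset values, looks like:

	; This is generated by asmgen
	; Please make sure to rerun asmgen after updating
	; key data structures used by both assembly and C.

	VMX_TO_LAUNCHED	EQU 02f40h	; offset values here are illustrative only
	VMX_TO_RSP	EQU 02f58h
	VMX_TO_RAX	EQU 00150h
	SVM_TO_VMCB_PA	EQU 02e18h
	CXT_TO_DST	EQU 000c8h

The EQU symbols are then used as displacements in the assembly (e.g. "mov rax, qword ptr VMX_TO_RAX[rcx]"), which is why asmgen must be rerun whenever struct vcpu_vmx, struct vcpu_svm or struct x86_emulate_ctxt changes.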
\ No newline at end of file diff --git a/asmgen/asmgen.vcxproj.user b/asmgen/asmgen.vcxproj.user new file mode 100644 index 0000000..6e2aec7 --- /dev/null +++ b/asmgen/asmgen.vcxproj.user @@ -0,0 +1,4 @@ +<?xml version="1.0" encoding="utf-8"?>
+<Project ToolsVersion="15.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+ <PropertyGroup />
+</Project>
\ No newline at end of file diff --git a/assembly/x64/assembly.asm b/assembly/x64/assembly.asm new file mode 100755 index 0000000..5b5294c --- /dev/null +++ b/assembly/x64/assembly.asm @@ -0,0 +1,2270 @@ +; Copyright 2019 Google LLC + +; This program is free software; you can redistribute it and/or +; modify it under the terms of the GNU General Public License +; version 2 as published by the Free Software Foundation. + +; This program is distributed in the hope that it will be useful, +; but WITHOUT ANY WARRANTY; without even the implied warranty of +; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +; GNU General Public License for more details. + +; low-level assembly code for gvm as there is no inline assembly support +; from microsoft c++ compiler. +include <__asm.inc> + +public vmx_return + .data +vmx_return qword offset ret_from_nonroot + + .code +__spin_lock proc + xor edx, edx + inc edx + jmp __spin_lock_try +__spin_lock_retry: + pause +__spin_lock_try: + xor eax, eax + lock cmpxchg [rcx], edx + jnz __spin_lock_retry + ret +__spin_lock endp + +read_flags proc + pushfq + pop rax + ret +read_flags endp + +__fninit proc + fninit + ret +__fninit endp + +__fnstsw proc + fnstsw word ptr[rcx] + ret +__fnstsw endp + +__fnstcw proc + fnstcw word ptr[rcx] + ret +__fnstcw endp + +__fwait proc + fwait + ret +__fwait endp + +__clts proc + clts + ret +__clts endp + +__bswap64 proc + mov rax, qword ptr[rcx] + bswap rax + mov qword ptr[rcx], rax + ret +__bswap64 endp + +__bswap32 proc + mov eax, dword ptr[rcx] + bswap eax + mov dword ptr[rcx], eax + ret +__bswap32 endp + +align 16 +__int2 proc + int 2 + ret +__int2 endp + +__divq proc + mov rax, rcx + div r8 + ret +__divq endp + +xchg8 proc + mov al, dl + lock xchg [rcx], al + ret +xchg8 endp + +xchg16 proc + mov ax, dx + lock xchg [rcx], ax + ret +xchg16 endp + +cmpxchg8 proc + mov al, dl + lock cmpxchg [rcx], r8b + ret +cmpxchg8 endp + +cmpxchg16 proc + mov ax, dx + lock cmpxchg [rcx], r8w + ret +cmpxchg16 endp + +load_TR_desc proc + mov rcx, 40h + ltr cx + ret +load_TR_desc endp + +gvm_read_ldt proc + sldt ax + ret +gvm_read_ldt endp + +gvm_load_ldt proc + lldt cx + ret +gvm_load_ldt endp + +gvm_read_tr proc + str ax + ret +gvm_read_tr endp + +gvm_load_tr proc + ltr cx + ret +gvm_load_tr endp + +load_ss_segment proc frame + push rbp + .pushreg rbp + mov rbp, rsp + .setframe rbp, 0 + .endprolog + + mov ss, cx + + mov rsp, rbp + pop rbp + ret +load_ss_segment endp + +load_ds_segment proc frame + push rbp + .pushreg rbp + mov rbp, rsp + .setframe rbp, 0 + .endprolog + + mov ds, cx + + mov rsp, rbp + pop rbp + ret +load_ds_segment endp + +load_es_segment proc frame + push rbp + .pushreg rbp + mov rbp, rsp + .setframe rbp, 0 + .endprolog + + mov es, cx + + mov rsp, rbp + pop rbp + ret +load_es_segment endp + +load_fs_segment proc frame + push rbp + .pushreg rbp + mov rbp, rsp + .setframe rbp, 0 + .endprolog + + mov fs, cx + + mov rsp, rbp + pop rbp + ret +load_fs_segment endp + +load_gs_segment proc frame + push rbp + .pushreg rbp + mov rbp, rsp + .setframe rbp, 0 + .endprolog + + mov gs, cx + + mov rsp, rbp + pop rbp + ret +load_gs_segment endp + +load_gs_index proc frame + push rbp + .pushreg rbp + mov rbp, rsp + .setframe rbp, 0 + .endprolog + + swapgs + mov gs, cx + swapgs + + mov rsp, rbp; + pop rbp; + ret +load_gs_index endp + +save_cs_segment proc + mov ax, cs + ret +save_cs_segment endp + +save_ss_segment proc + mov ax, ss + ret +save_ss_segment endp + +save_ds_segment proc + mov ax, ds + ret +save_ds_segment endp + 
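; The one-instruction helpers in this file exist because the Microsoft x64
; compiler has no inline assembler (see the file header): every privileged or
; register-specific instruction the driver needs is wrapped as a callable
; procedure.  The matching C declarations are assumed to look roughly like the
; following (hypothetical prototypes, inferred from the register usage rather
; than quoted from the driver headers):
;	u16  save_es_segment(void);	returns the selector in ax
;	void load_es_segment(u16 sel);	selector is passed in cx (first argument)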
+save_es_segment proc + mov ax, es + ret +save_es_segment endp + +save_fs_segment proc + mov ax, fs + ret +save_fs_segment endp + +save_gs_segment proc + mov ax, gs + ret +save_gs_segment endp + +__asm_vmx_vcpu_run proc + ;save abi non-volatile registers + push r12 + push r13 + push r14 + push r15 + push rdi + push rsi + push rbx + ;save host flags + pushfq + ;refer to KVM + push rbp + push rcx + push rcx + cmp rsp, qword ptr VMX_TO_RSP[rcx] + je skip_save_rsp + mov qword ptr VMX_TO_RSP[rcx], rsp + mov rdx, 6c14h + vmwrite rdx, rsp +skip_save_rsp: + mov rax, qword ptr VMX_TO_CR2[rcx] + mov rdx, cr2 + cmp rax, rdx + je skip_load_cr2 + mov cr2, rax +skip_load_cr2: + cmp byte ptr VMX_TO_LAUNCHED[rcx], 0h + mov rax, qword ptr VMX_TO_RAX[rcx] + mov rbx, qword ptr VMX_TO_RBX[rcx] + mov rdx, qword ptr VMX_TO_RDX[rcx] + mov rsi, qword ptr VMX_TO_RSI[rcx] + mov rdi, qword ptr VMX_TO_RDI[rcx] + mov rbp, qword ptr VMX_TO_RBP[rcx] + mov r8, qword ptr VMX_TO_R8[rcx] + mov r9, qword ptr VMX_TO_R9[rcx] + mov r10, qword ptr VMX_TO_R10[rcx] + mov r11, qword ptr VMX_TO_R11[rcx] + mov r12, qword ptr VMX_TO_R12[rcx] + mov r13, qword ptr VMX_TO_R13[rcx] + mov r14, qword ptr VMX_TO_R14[rcx] + mov r15, qword ptr VMX_TO_R15[rcx] + mov rcx, qword ptr VMX_TO_RCX[rcx] + jne go_resume + vmlaunch + jmp ret_from_nonroot +go_resume: + vmresume +ret_from_nonroot:: + mov qword ptr 8h[rsp], rcx + pop rcx + mov qword ptr VMX_TO_RAX[rcx], rax + mov qword ptr VMX_TO_RBX[rcx], rbx + pop qword ptr VMX_TO_RCX[rcx] + mov qword ptr VMX_TO_RDX[rcx], rdx + mov qword ptr VMX_TO_RSI[rcx], rsi + mov qword ptr VMX_TO_RDI[rcx], rdi + mov qword ptr VMX_TO_RBP[rcx], rbp + mov qword ptr VMX_TO_R8[rcx], r8 + mov qword ptr VMX_TO_R9[rcx], r9 + mov qword ptr VMX_TO_R10[rcx], r10 + mov qword ptr VMX_TO_R11[rcx], r11 + mov qword ptr VMX_TO_R12[rcx], r12 + mov qword ptr VMX_TO_R13[rcx], r13 + mov qword ptr VMX_TO_R14[rcx], r14 + mov qword ptr VMX_TO_R15[rcx], r15 + mov rax, cr2 + mov qword ptr VMX_TO_CR2[rcx], rax + setbe byte ptr VMX_TO_FAIL[rcx] + pop rbp + ;restore host flags + popfq + pop rbx + pop rsi + pop rdi + pop r15 + pop r14 + pop r13 + pop r12 + ret +__asm_vmx_vcpu_run endp + +__asm_vmx_handle_external_intr proc + mov rax, rsp + and rsp, 0fffffffffffffff0h + push 18h + push rax + pushfq + push 10h + call rcx + ret +__asm_vmx_handle_external_intr endp + +;-----mov mmx------- +__asm_save_mm0 proc + movq [rcx], mm0 + ret +__asm_save_mm0 endp + +__asm_save_mm1 proc + movq [rcx], mm1 + ret +__asm_save_mm1 endp + +__asm_save_mm2 proc + movq [rcx], mm2 + ret +__asm_save_mm2 endp + +__asm_save_mm3 proc + movq [rcx], mm3 + ret +__asm_save_mm3 endp + +__asm_save_mm4 proc + movq [rcx], mm4 + ret +__asm_save_mm4 endp + +__asm_save_mm5 proc + movq [rcx], mm5 + ret +__asm_save_mm5 endp + +__asm_save_mm6 proc + movq [rcx], mm6 + ret +__asm_save_mm6 endp + +__asm_save_mm7 proc + movq [rcx], mm7 + ret +__asm_save_mm7 endp + +__asm_store_mm0 proc + movq mm0, [rcx] + ret +__asm_store_mm0 endp + +__asm_store_mm1 proc + movq mm1, [rcx] + ret +__asm_store_mm1 endp + +__asm_store_mm2 proc + movq mm2, [rcx] + ret +__asm_store_mm2 endp + +__asm_store_mm3 proc + movq mm3, [rcx] + ret +__asm_store_mm3 endp + +__asm_store_mm4 proc + movq mm4, [rcx] + ret +__asm_store_mm4 endp + +__asm_store_mm5 proc + movq mm5, [rcx] + ret +__asm_store_mm5 endp + +__asm_store_mm6 proc + movq mm6, [rcx] + ret +__asm_store_mm6 endp + +__asm_store_mm7 proc + movq mm7, [rcx] + ret +__asm_store_mm7 endp + +;-----movdqa------- +__asm_save_xmm0 proc + movdqa xmmword ptr[rcx], xmm0 + 
ret +__asm_save_xmm0 endp + +__asm_store_xmm0 proc + movdqa xmm0, xmmword ptr[rcx] + ret +__asm_store_xmm0 endp + +__asm_save_xmm1 proc + movdqa xmmword ptr[rcx], xmm1 + ret +__asm_save_xmm1 endp + +__asm_store_xmm1 proc + movdqa xmm1, xmmword ptr[rcx] + ret +__asm_store_xmm1 endp + +__asm_save_xmm2 proc + movdqa xmmword ptr[rcx], xmm2 + ret +__asm_save_xmm2 endp + +__asm_store_xmm2 proc + movdqa xmm2, xmmword ptr[rcx] + ret +__asm_store_xmm2 endp + +__asm_save_xmm3 proc + movdqa xmmword ptr[rcx], xmm3 + ret +__asm_save_xmm3 endp + +__asm_store_xmm3 proc + movdqa xmm3, xmmword ptr[rcx] + ret +__asm_store_xmm3 endp + +__asm_save_xmm4 proc + movdqa xmmword ptr[rcx], xmm4 + ret +__asm_save_xmm4 endp + +__asm_store_xmm4 proc + movdqa xmm4, xmmword ptr[rcx] + ret +__asm_store_xmm4 endp + +__asm_save_xmm5 proc + movdqa xmmword ptr[rcx], xmm5 + ret +__asm_save_xmm5 endp + +__asm_store_xmm5 proc + movdqa xmm5, xmmword ptr[rcx] + ret +__asm_store_xmm5 endp + +__asm_save_xmm6 proc + movdqa xmmword ptr[rcx], xmm6 + ret +__asm_save_xmm6 endp + +__asm_store_xmm6 proc + movdqa xmm6, xmmword ptr[rcx] + ret +__asm_store_xmm6 endp + +__asm_save_xmm7 proc + movdqa xmmword ptr[rcx], xmm7 + ret +__asm_save_xmm7 endp + +__asm_store_xmm7 proc + movdqa xmm7, xmmword ptr[rcx] + ret +__asm_store_xmm7 endp + +__asm_save_xmm8 proc + movdqa xmmword ptr[rcx], xmm8 + ret +__asm_save_xmm8 endp + +__asm_store_xmm8 proc + movdqa xmm8, xmmword ptr[rcx] + ret +__asm_store_xmm8 endp + +__asm_save_xmm9 proc + movdqa xmmword ptr[rcx], xmm9 + ret +__asm_save_xmm9 endp + +__asm_store_xmm9 proc + movdqa xmm9, xmmword ptr[rcx] + ret +__asm_store_xmm9 endp + +__asm_save_xmm10 proc + movdqa xmmword ptr[rcx], xmm10 + ret +__asm_save_xmm10 endp + +__asm_store_xmm10 proc + movdqa xmm10, xmmword ptr[rcx] + ret +__asm_store_xmm10 endp + +__asm_save_xmm11 proc + movdqa xmmword ptr[rcx], xmm11 + ret +__asm_save_xmm11 endp + +__asm_store_xmm11 proc + movdqa xmm11, xmmword ptr[rcx] + ret +__asm_store_xmm11 endp + +__asm_save_xmm12 proc + movdqa xmmword ptr[rcx], xmm12 + ret +__asm_save_xmm12 endp + +__asm_store_xmm12 proc + movdqa xmm12, xmmword ptr[rcx] + ret +__asm_store_xmm12 endp + +__asm_save_xmm13 proc + movdqa xmmword ptr[rcx], xmm13 + ret +__asm_save_xmm13 endp + +__asm_store_xmm13 proc + movdqa xmm13, xmmword ptr[rcx] + ret +__asm_store_xmm13 endp + +__asm_save_xmm14 proc + movdqa xmmword ptr[rcx], xmm14 + ret +__asm_save_xmm14 endp + +__asm_store_xmm14 proc + movdqa xmm14, xmmword ptr[rcx] + ret +__asm_store_xmm14 endp + +__asm_save_xmm15 proc + movdqa xmmword ptr[rcx], xmm15 + ret +__asm_save_xmm15 endp + +__asm_store_xmm15 proc + movdqa xmm15, xmmword ptr[rcx] + ret +__asm_store_xmm15 endp + +;-----Fastop Functions------ +; Fastop functions's entry is __asm_fastop. +; Never call underlying functions directly as it is not written following +; normal ABI. 
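; From the register usage below: __asm_fastop receives a pointer to the saved
; flags in rcx, the address of one of the em_* stubs in rdx and the
; x86_emulate_ctxt in r8.  It loads dst/src/src2 from the context, replays the
; flags with popfq, calls the stub (the stubs operate directly on rax/rdx/rcx
; rather than taking arguments per the Windows ABI, hence the warning above),
; then writes the flags and results back.  A hypothetical C-side declaration,
; inferred from this rather than quoted from the sources:
;	void __asm_fastop(size_t *flags, void *fop, struct x86_emulate_ctxt *ctxt);
; __asm_test_cc likewise takes a setcc stub in rcx and flags in rdx, and
; returns the condition result in al.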
+ public __asm_test_cc +__asm_test_cc proc frame + push rbp + .pushreg rbp + mov rbp, rsp + .setframe rbp, 0 + .endprolog + + push rdx + popfq + call rcx + + mov rsp, rbp + pop rbp + ret +__asm_test_cc endp + + public __asm_fastop +__asm_fastop proc frame + push rbp + .pushreg rbp + mov rbp, rsp + .setframe rbp, 0 + .endprolog + + push rdi + mov rdi, rcx + push rsi + mov rsi, rdx + mov rax, qword ptr CXT_TO_DST[r8] + mov rdx, qword ptr CXT_TO_SRC[r8] + mov rcx, qword ptr CXT_TO_SRC2[r8] + + push qword ptr[rdi] + popfq + call rsi + pushfq + pop qword ptr[rdi] + + mov qword ptr CXT_TO_DST[r8], rax + mov qword ptr CXT_TO_SRC[r8], rdx + pop rsi + pop rdi + + mov rsp, rbp + pop rbp + ret +__asm_fastop endp + + public kvm_fastop_exception +kvm_fastop_exception proc + xor esi, esi + ret +kvm_fastop_exception endp + +; --------------------------------------------------------------------------- + align 8 + +; =============== S U B R O U T I N E ======================================= + public em_setcc +em_setcc proc +__seto proc + seto al + ret +__seto endp +em_setcc endp + +; --------------------------------------------------------------------------- + align 4 + +; =============== S U B R O U T I N E ======================================= +__setno proc + setno al + ret +__setno endp + +; --------------------------------------------------------------------------- + align 4 + +; =============== S U B R O U T I N E ======================================= +__setc proc + setb al + ret +__setc endp + +; --------------------------------------------------------------------------- + align 4 + +; =============== S U B R O U T I N E ======================================= +__setnc proc + setnb al + ret +__setnc endp + +; --------------------------------------------------------------------------- + align 4 + +; =============== S U B R O U T I N E ======================================= +__setz proc + setz al + ret +__setz endp + +; --------------------------------------------------------------------------- + align 4 + +; =============== S U B R O U T I N E ======================================= +__setnz proc + setnz al + ret +__setnz endp + +; --------------------------------------------------------------------------- + align 4 + +; =============== S U B R O U T I N E ======================================= +__setbe proc + setbe al + ret +__setbe endp + +; --------------------------------------------------------------------------- + align 4 + +; =============== S U B R O U T I N E ======================================= +__setnbe proc + setnbe al + ret +__setnbe endp + +; --------------------------------------------------------------------------- + align 4 + +; =============== S U B R O U T I N E ======================================= +__sets proc + sets al + ret +__sets endp + +; --------------------------------------------------------------------------- + align 4 + +; =============== S U B R O U T I N E ======================================= +__setns proc + setns al + ret +__setns endp + +; --------------------------------------------------------------------------- + align 4 + +; =============== S U B R O U T I N E ======================================= +__setp proc + setp al + ret +__setp endp + +; --------------------------------------------------------------------------- + align 4 + +; =============== S U B R O U T I N E ======================================= +__setnp proc + setnp al + ret +__setnp endp + +; --------------------------------------------------------------------------- + align 4 + +; 
=============== S U B R O U T I N E ======================================= +__setl proc + setl al + ret +__setl endp + +; --------------------------------------------------------------------------- + align 4 + +; =============== S U B R O U T I N E ======================================= +__setnl proc + setnl al + ret +__setnl endp + +; --------------------------------------------------------------------------- + align 4 + +; =============== S U B R O U T I N E ======================================= +__setle proc + setle al + ret +__setle endp + +; --------------------------------------------------------------------------- + align 4 + +; =============== S U B R O U T I N E ======================================= +__setnle proc + setnle al + ret +__setnle endp + +; =============== S U B R O U T I N E ======================================= + public em_salc +em_salc proc + pushfq + sbb al, al + popfq + ret +em_salc endp + +; --------------------------------------------------------------------------- + align 8 + +; =============== S U B R O U T I N E ======================================= + public em_add +em_add proc +__addb_al_dl proc + add al, dl + ret +__addb_al_dl endp +em_add endp + +; --------------------------------------------------------------------------- + align 8 + +; =============== S U B R O U T I N E ======================================= +__addw_ax_dx proc + add ax, dx + ret +__addw_ax_dx endp + +; --------------------------------------------------------------------------- + align 8 + +; =============== S U B R O U T I N E ======================================= +__addl_eax_edx proc + add eax, edx + ret +__addl_eax_edx endp + +; --------------------------------------------------------------------------- + align 8 + +; =============== S U B R O U T I N E ======================================= +__addq_rax_rdx proc + add rax, rdx + ret +__addq_rax_rdx endp + +; --------------------------------------------------------------------------- + align 8 + +; =============== S U B R O U T I N E ======================================= + public em_or +em_or proc +__orb_al_dl proc + or al, dl + ret +__orb_al_dl endp +em_or endp + +; --------------------------------------------------------------------------- + align 8 + +; =============== S U B R O U T I N E ======================================= +__orw_ax_dx proc + or ax, dx + ret +__orw_ax_dx endp + +; --------------------------------------------------------------------------- + align 8 + +; =============== S U B R O U T I N E ======================================= +__orl_eax_edx proc + or eax, edx + ret +__orl_eax_edx endp + +; --------------------------------------------------------------------------- + align 8 + +; =============== S U B R O U T I N E ======================================= +__orq_rax_rdx proc + or rax, rdx + ret +__orq_rax_rdx endp + +; --------------------------------------------------------------------------- + align 8 + +; =============== S U B R O U T I N E ======================================= + public em_adc +em_adc proc +__adcb_al_dl proc + adc al, dl + ret +__adcb_al_dl endp +em_adc endp + +; --------------------------------------------------------------------------- + align 8 + +; =============== S U B R O U T I N E ======================================= +__adcw_ax_dx proc + adc ax, dx + ret +__adcw_ax_dx endp + +; --------------------------------------------------------------------------- + align 8 + +; =============== S U B R O U T I N E ======================================= +__adcl_eax_edx proc + 
adc eax, edx + ret +__adcl_eax_edx endp + +; --------------------------------------------------------------------------- + align 8 + +; =============== S U B R O U T I N E ======================================= +__adcq_rax_rdx proc + adc rax, rdx + ret +__adcq_rax_rdx endp + +; --------------------------------------------------------------------------- + align 8 + +; =============== S U B R O U T I N E ======================================= + public em_sbb +em_sbb proc +__sbbb_al_dl proc + sbb al, dl + ret +__sbbb_al_dl endp +em_sbb endp + +; --------------------------------------------------------------------------- + align 8 + +; =============== S U B R O U T I N E ======================================= +__sbbw_ax_dx proc + sbb ax, dx + ret +__sbbw_ax_dx endp + +; --------------------------------------------------------------------------- + align 8 + +; =============== S U B R O U T I N E ======================================= +__sbbl_eax_edx proc + sbb eax, edx + ret +__sbbl_eax_edx endp + +; --------------------------------------------------------------------------- + align 8 + +; =============== S U B R O U T I N E ======================================= +__sbbq_rax_rdx proc + sbb rax, rdx + ret +__sbbq_rax_rdx endp + +; --------------------------------------------------------------------------- + align 8 + +; =============== S U B R O U T I N E ======================================= + public em_and +em_and proc +__andb_al_dl proc + and al, dl + ret +__andb_al_dl endp +em_and endp + +; --------------------------------------------------------------------------- + align 8 + +; =============== S U B R O U T I N E ======================================= +__andw_ax_dx proc + and ax, dx + ret +__andw_ax_dx endp + +; --------------------------------------------------------------------------- + align 8 + +; =============== S U B R O U T I N E ======================================= +__andl_eax_edx proc + and eax, edx + ret +__andl_eax_edx endp + +; --------------------------------------------------------------------------- + align 8 + +; =============== S U B R O U T I N E ======================================= +__andq_rax_rdx proc + and rax, rdx + ret +__andq_rax_rdx endp + +; --------------------------------------------------------------------------- + align 8 + +; =============== S U B R O U T I N E ======================================= + public em_sub +em_sub proc +__subb_al_dl proc + sub al, dl + ret +__subb_al_dl endp +em_sub endp + +; --------------------------------------------------------------------------- + align 8 + +; =============== S U B R O U T I N E ======================================= +__subw_ax_dx proc + sub ax, dx + ret +__subw_ax_dx endp + +; --------------------------------------------------------------------------- + align 8 + +; =============== S U B R O U T I N E ======================================= +__subl_eax_edx proc + sub eax, edx + ret +__subl_eax_edx endp + +; --------------------------------------------------------------------------- + align 8 + +; =============== S U B R O U T I N E ======================================= +__subq_rax_rdx proc + sub rax, rdx + ret +__subq_rax_rdx endp + +; --------------------------------------------------------------------------- + align 8 + +; =============== S U B R O U T I N E ======================================= + public em_xor +em_xor proc +__xorb_al_dl proc + xor al, dl + ret +__xorb_al_dl endp +em_xor endp + +; --------------------------------------------------------------------------- + align 8 + +; 
=============== S U B R O U T I N E ======================================= +__xorw_ax_dx proc + xor ax, dx + ret +__xorw_ax_dx endp + +; --------------------------------------------------------------------------- + align 8 + +; =============== S U B R O U T I N E ======================================= +__xorl_eax_edx proc + xor eax, edx + ret +__xorl_eax_edx endp + +; --------------------------------------------------------------------------- + align 8 + +; =============== S U B R O U T I N E ======================================= +__xorq_rax_rdx proc + xor rax, rdx + ret +__xorq_rax_rdx endp + +; --------------------------------------------------------------------------- + align 8 + +; =============== S U B R O U T I N E ======================================= + public em_cmp +em_cmp proc +__cmpb_al_dl proc + cmp al, dl + ret +__cmpb_al_dl endp +em_cmp endp + +; --------------------------------------------------------------------------- + align 8 + +; =============== S U B R O U T I N E ======================================= +__cmpw_ax_dx proc + cmp ax, dx + ret +__cmpw_ax_dx endp + +; --------------------------------------------------------------------------- + align 8 + +; =============== S U B R O U T I N E ======================================= +__cmpl_eax_edx proc + cmp eax, edx + ret +__cmpl_eax_edx endp + +; --------------------------------------------------------------------------- + align 8 + +; =============== S U B R O U T I N E ======================================= +__cmpq_rax_rdx proc + cmp rax, rdx + ret +__cmpq_rax_rdx endp + +; --------------------------------------------------------------------------- + align 8 + +; =============== S U B R O U T I N E ======================================= + public em_test +em_test proc +__testb_al_dl proc + test al, dl + ret +__testb_al_dl endp +em_test endp + +; --------------------------------------------------------------------------- + align 8 + +; =============== S U B R O U T I N E ======================================= +__testw_ax_dx proc + test ax, dx + ret +__testw_ax_dx endp + +; --------------------------------------------------------------------------- + align 8 + +; =============== S U B R O U T I N E ======================================= +__testl_eax_edx proc + test eax, edx + ret +__testl_eax_edx endp + +; --------------------------------------------------------------------------- + align 8 + +; =============== S U B R O U T I N E ======================================= +__testq_rax_rdx proc + test rax, rdx + ret +__testq_rax_rdx endp + +; --------------------------------------------------------------------------- + align 8 + +; =============== S U B R O U T I N E ======================================= + public em_mul_ex +em_mul_ex proc +__mul_cl proc + mul cl + ret +__mul_cl endp +em_mul_ex endp + +; --------------------------------------------------------------------------- + align 8 + +; =============== S U B R O U T I N E ======================================= +__mul_cx proc + mul cx + ret +__mul_cx endp + +; --------------------------------------------------------------------------- + align 8 + +; =============== S U B R O U T I N E ======================================= +__mul_ecx proc + mul ecx + ret +__mul_ecx endp + +; --------------------------------------------------------------------------- + align 8 + +; =============== S U B R O U T I N E ======================================= +__mul_rcx proc + mul rcx + ret +__mul_rcx endp + +; 
--------------------------------------------------------------------------- + align 8 + +; =============== S U B R O U T I N E ======================================= + public em_imul_ex +em_imul_ex proc +__imul_cl proc + imul cl + ret +__imul_cl endp +em_imul_ex endp + +; --------------------------------------------------------------------------- + align 8 + +; =============== S U B R O U T I N E ======================================= +__imul_cx proc + imul cx + ret +__imul_cx endp + +; --------------------------------------------------------------------------- + align 8 + +; =============== S U B R O U T I N E ======================================= +__imul_ecx proc + imul ecx + ret +__imul_ecx endp + +; --------------------------------------------------------------------------- + align 8 + +; =============== S U B R O U T I N E ======================================= +__imul_rcx proc + imul rcx + ret +__imul_rcx endp + +; --------------------------------------------------------------------------- + align 8 + +; =============== S U B R O U T I N E ======================================= + public em_div_ex +em_div_ex proc +__div_cl proc + div cl + ret +__div_cl endp +em_div_ex endp + +; --------------------------------------------------------------------------- + align 8 + +; =============== S U B R O U T I N E ======================================= +__div_cx proc + div cx + ret +__div_cx endp + +; --------------------------------------------------------------------------- + align 8 + +; =============== S U B R O U T I N E ======================================= +__div_ecx proc + div ecx + ret +__div_ecx endp + +; --------------------------------------------------------------------------- + align 8 + +; =============== S U B R O U T I N E ======================================= +__div_rcx proc + div rcx + ret +__div_rcx endp + +; --------------------------------------------------------------------------- + align 8 + +; =============== S U B R O U T I N E ======================================= + public em_idiv_ex +em_idiv_ex proc +__idiv_cl proc + idiv cl + ret +__idiv_cl endp +em_idiv_ex endp + +; --------------------------------------------------------------------------- + align 8 + +; =============== S U B R O U T I N E ======================================= +__idiv_cx proc + idiv cx + ret +__idiv_cx endp + +; --------------------------------------------------------------------------- + align 8 + +; =============== S U B R O U T I N E ======================================= +__idiv_ecx proc + idiv ecx + ret +__idiv_ecx endp + +; --------------------------------------------------------------------------- + align 8 + +; =============== S U B R O U T I N E ======================================= +__idiv_rcx proc + idiv rcx + ret +__idiv_rcx endp + +; --------------------------------------------------------------------------- + align 8 + +; =============== S U B R O U T I N E ======================================= + public em_shld +em_shld proc + ret +em_shld endp + +; --------------------------------------------------------------------------- + align 8 + +; =============== S U B R O U T I N E ======================================= +__shldw_ax_dx_cl proc + shld ax, dx, cl + ret +__shldw_ax_dx_cl endp + +; --------------------------------------------------------------------------- + align 8 + +; =============== S U B R O U T I N E ======================================= +__shldl_eax_edx_cl proc + shld eax, edx, cl + ret +__shldl_eax_edx_cl endp + +; 
--------------------------------------------------------------------------- + align 8 + +; =============== S U B R O U T I N E ======================================= +__shldq_rax_rdx_cl proc + shld rax, rdx, cl + ret +__shldq_rax_rdx_cl endp + +; --------------------------------------------------------------------------- + align 8 + +; =============== S U B R O U T I N E ======================================= + public em_shrd +em_shrd proc + ret +em_shrd endp + +; --------------------------------------------------------------------------- + align 8 + +; =============== S U B R O U T I N E ======================================= +__shrdw_ax_dx_cl proc + shrd ax, dx, cl + ret +__shrdw_ax_dx_cl endp + +; --------------------------------------------------------------------------- + align 8 + +; =============== S U B R O U T I N E ======================================= +__shrdl_eax_edx_cl proc + shrd eax, edx, cl + ret +__shrdl_eax_edx_cl endp + +; --------------------------------------------------------------------------- + align 8 + +; =============== S U B R O U T I N E ======================================= +__shrdq_rax_rdx_cl proc + shrd rax, rdx, cl + ret +__shrdq_rax_rdx_cl endp + +; --------------------------------------------------------------------------- + align 8 + +; =============== S U B R O U T I N E ======================================= + public em_imul +em_imul proc + ret +em_imul endp + +; --------------------------------------------------------------------------- + align 8 + +; =============== S U B R O U T I N E ======================================= +__imulw_ax_dx proc + imul ax, dx + ret +__imulw_ax_dx endp + +; --------------------------------------------------------------------------- + align 8 + +; =============== S U B R O U T I N E ======================================= +__imull_eax_edx proc + imul eax, edx + ret +__imull_eax_edx endp + +; --------------------------------------------------------------------------- + align 8 + +; =============== S U B R O U T I N E ======================================= +__imulq_rax_rdx proc + imul rax, rdx + ret +__imulq_rax_rdx endp + +; --------------------------------------------------------------------------- + align 8 + +; =============== S U B R O U T I N E ======================================= + public em_not +em_not proc +__notb_al proc + not al + ret +__notb_al endp +em_not endp + +; --------------------------------------------------------------------------- + align 8 + +; =============== S U B R O U T I N E ======================================= +__notw_ax proc + not ax + ret +__notw_ax endp + +; --------------------------------------------------------------------------- + align 8 + +; =============== S U B R O U T I N E ======================================= +__notl_eax proc + not eax + ret +__notl_eax endp + +; --------------------------------------------------------------------------- + align 8 + +; =============== S U B R O U T I N E ======================================= +__notq_rax proc + not rax + ret +__notq_rax endp + +; --------------------------------------------------------------------------- + align 8 + +; =============== S U B R O U T I N E ======================================= + public em_neg +em_neg proc +__negb_al proc + neg al + ret +__negb_al endp +em_neg endp + +; --------------------------------------------------------------------------- + align 8 + +; =============== S U B R O U T I N E ======================================= +__negw_ax proc + neg ax + ret +__negw_ax endp + +; 
--------------------------------------------------------------------------- + align 8 + +; =============== S U B R O U T I N E ======================================= +__negl_eax proc + neg eax + ret +__negl_eax endp + +; --------------------------------------------------------------------------- + align 8 + +; =============== S U B R O U T I N E ======================================= +__negq_rax proc + neg rax + ret +__negq_rax endp + +; --------------------------------------------------------------------------- + align 8 + +; =============== S U B R O U T I N E ======================================= + public em_inc +em_inc proc +__incb_al proc + inc al + ret +__incb_al endp +em_inc endp + +; --------------------------------------------------------------------------- + align 8 + +; =============== S U B R O U T I N E ======================================= +__incw_ax proc + inc ax + ret +__incw_ax endp + +; --------------------------------------------------------------------------- + align 8 + +; =============== S U B R O U T I N E ======================================= +__incl_eax proc + inc eax + ret +__incl_eax endp + +; --------------------------------------------------------------------------- + align 8 + +; =============== S U B R O U T I N E ======================================= +__incq_rax proc + inc rax + ret +__incq_rax endp + +; --------------------------------------------------------------------------- + align 8 + +; =============== S U B R O U T I N E ======================================= + public em_dec +em_dec proc +__decb_al proc + dec al + ret +__decb_al endp +em_dec endp + +; --------------------------------------------------------------------------- + align 8 + +; =============== S U B R O U T I N E ======================================= +__decw_ax proc + dec ax + ret +__decw_ax endp + +; --------------------------------------------------------------------------- + align 8 + +; =============== S U B R O U T I N E ======================================= +__decl_eax proc + dec eax + ret +__decl_eax endp + +; --------------------------------------------------------------------------- + align 8 + +; =============== S U B R O U T I N E ======================================= +__decq_rax proc + dec rax + ret +__decq_rax endp + +; --------------------------------------------------------------------------- + align 8 + +; =============== S U B R O U T I N E ======================================= + public em_rol +em_rol proc +__rolb_al_cl proc + rol al, cl + ret +__rolb_al_cl endp +em_rol endp + +; --------------------------------------------------------------------------- + align 8 + +; =============== S U B R O U T I N E ======================================= +__rolw_ax_cl proc + rol ax, cl + ret +__rolw_ax_cl endp + +; --------------------------------------------------------------------------- + align 8 + +; =============== S U B R O U T I N E ======================================= +__roll_eax_cl proc + rol eax, cl + ret +__roll_eax_cl endp + +; --------------------------------------------------------------------------- + align 8 + +; =============== S U B R O U T I N E ======================================= +__rolq_rax_cl proc + rol rax, cl + ret +__rolq_rax_cl endp + +; --------------------------------------------------------------------------- + align 8 + +; =============== S U B R O U T I N E ======================================= + public em_ror +em_ror proc +__rorb_al_cl proc + ror al, cl + ret +__rorb_al_cl endp +em_ror endp + +; 
--------------------------------------------------------------------------- + align 8 + +; =============== S U B R O U T I N E ======================================= +__rorw_ax_cl proc + ror ax, cl + ret +__rorw_ax_cl endp + +; --------------------------------------------------------------------------- + align 8 + +; =============== S U B R O U T I N E ======================================= +__rorl_eax_cl proc + ror eax, cl + ret +__rorl_eax_cl endp + +; --------------------------------------------------------------------------- + align 8 + +; =============== S U B R O U T I N E ======================================= +__rorq_rax_cl proc + ror rax, cl + ret +__rorq_rax_cl endp + +; --------------------------------------------------------------------------- + align 8 + +; =============== S U B R O U T I N E ======================================= + public em_rcl +em_rcl proc +__rclb_al_cl proc + rcl al, cl + ret +__rclb_al_cl endp +em_rcl endp + +; --------------------------------------------------------------------------- + align 8 + +; =============== S U B R O U T I N E ======================================= +__rclw_ax_cl proc + rcl ax, cl + ret +__rclw_ax_cl endp + +; --------------------------------------------------------------------------- + align 8 + +; =============== S U B R O U T I N E ======================================= +__rcll_eax_cl proc + rcl eax, cl + ret +__rcll_eax_cl endp + +; --------------------------------------------------------------------------- + align 8 + +; =============== S U B R O U T I N E ======================================= +__rclq_rax_cl proc + rcl rax, cl + ret +__rclq_rax_cl endp + +; --------------------------------------------------------------------------- + align 8 + +; =============== S U B R O U T I N E ======================================= + public em_rcr +em_rcr proc +__rcrb_al_cl proc + rcr al, cl + ret +__rcrb_al_cl endp +em_rcr endp + +; --------------------------------------------------------------------------- + align 8 + +; =============== S U B R O U T I N E ======================================= +__rcrw_ax_cl proc + rcr ax, cl + ret +__rcrw_ax_cl endp + +; --------------------------------------------------------------------------- + align 8 + +; =============== S U B R O U T I N E ======================================= +__rcrl_eax_cl proc + rcr eax, cl + ret +__rcrl_eax_cl endp + +; --------------------------------------------------------------------------- + align 8 + +; =============== S U B R O U T I N E ======================================= +__rcrq_rax_cl proc + rcr rax, cl + ret +__rcrq_rax_cl endp + +; --------------------------------------------------------------------------- + align 8 + +; =============== S U B R O U T I N E ======================================= + public em_shl +em_shl proc +__shlb_al_cl proc + shl al, cl + ret +__shlb_al_cl endp +em_shl endp + +; --------------------------------------------------------------------------- + align 8 + +; =============== S U B R O U T I N E ======================================= +__shlw_ax_cl proc + shl ax, cl + ret +__shlw_ax_cl endp + +; --------------------------------------------------------------------------- + align 8 + +; =============== S U B R O U T I N E ======================================= +__shll_eax_cl proc + shl eax, cl + ret +__shll_eax_cl endp + +; --------------------------------------------------------------------------- + align 8 + +; =============== S U B R O U T I N E ======================================= +__shlq_rax_cl proc + shl rax, cl 
+ ret +__shlq_rax_cl endp + +; --------------------------------------------------------------------------- + align 8 + +; =============== S U B R O U T I N E ======================================= + public em_shr +em_shr proc +__shrb_al_cl proc + shr al, cl + ret +__shrb_al_cl endp +em_shr endp + +; --------------------------------------------------------------------------- + align 8 + +; =============== S U B R O U T I N E ======================================= +__shrw_ax_cl proc + shr ax, cl + ret +__shrw_ax_cl endp + +; --------------------------------------------------------------------------- + align 8 + +; =============== S U B R O U T I N E ======================================= +__shrl_eax_cl proc + shr eax, cl + ret +__shrl_eax_cl endp + +; --------------------------------------------------------------------------- + align 8 + +; =============== S U B R O U T I N E ======================================= +__shrq_rax_cl proc + shr rax, cl + ret +__shrq_rax_cl endp + +; --------------------------------------------------------------------------- + align 8 + +; =============== S U B R O U T I N E ======================================= + public em_sar +em_sar proc +__sarb_al_cl proc + sar al, cl + ret +__sarb_al_cl endp +em_sar endp + +; --------------------------------------------------------------------------- + align 8 + +; =============== S U B R O U T I N E ======================================= +__sarw_ax_cl proc + sar ax, cl + ret +__sarw_ax_cl endp + +; --------------------------------------------------------------------------- + align 8 + +; =============== S U B R O U T I N E ======================================= +__sarl_eax_cl proc + sar eax, cl + ret +__sarl_eax_cl endp + +; --------------------------------------------------------------------------- + align 8 + +; =============== S U B R O U T I N E ======================================= +__sarq_rax_cl proc + sar rax, cl + ret +__sarq_rax_cl endp + +; --------------------------------------------------------------------------- + align 8 + +; =============== S U B R O U T I N E ======================================= + public em_bsf +em_bsf proc + ret +em_bsf endp + +; --------------------------------------------------------------------------- + align 8 + +; =============== S U B R O U T I N E ======================================= +__bsfw_ax_dx proc + bsf ax, dx + ret +__bsfw_ax_dx endp + +; --------------------------------------------------------------------------- + align 8 + +; =============== S U B R O U T I N E ======================================= +__bsfl_eax_edx proc + bsf eax, edx + ret +__bsfl_eax_edx endp + +; --------------------------------------------------------------------------- + align 8 + +; =============== S U B R O U T I N E ======================================= +__bsfq_rax_rdx proc + bsf rax, rdx + ret +__bsfq_rax_rdx endp + +; --------------------------------------------------------------------------- + align 8 + +; =============== S U B R O U T I N E ======================================= + public em_bsr +em_bsr proc + ret +em_bsr endp + +; --------------------------------------------------------------------------- + align 8 + +; =============== S U B R O U T I N E ======================================= +__bsrw_ax_dx proc + bsr ax, dx + ret +__bsrw_ax_dx endp + +; --------------------------------------------------------------------------- + align 8 + +; =============== S U B R O U T I N E ======================================= +__bsrl_eax_edx proc + bsr eax, edx + ret +__bsrl_eax_edx 
endp + +; --------------------------------------------------------------------------- + align 8 + +; =============== S U B R O U T I N E ======================================= +__bsrq_rax_rdx proc + bsr rax, rdx + ret +__bsrq_rax_rdx endp + +; --------------------------------------------------------------------------- + align 8 + +; =============== S U B R O U T I N E ======================================= + public em_bt +em_bt proc + ret +em_bt endp + +; --------------------------------------------------------------------------- + align 8 + +; =============== S U B R O U T I N E ======================================= +__btw_ax_dx proc + bt ax, dx + ret +__btw_ax_dx endp + +; --------------------------------------------------------------------------- + align 8 + +; =============== S U B R O U T I N E ======================================= +__btl_eax_edx proc + bt eax, edx + ret +__btl_eax_edx endp + +; --------------------------------------------------------------------------- + align 8 + +; =============== S U B R O U T I N E ======================================= +__btq_rax_rdx proc + bt rax, rdx + ret +__btq_rax_rdx endp + +; --------------------------------------------------------------------------- + align 8 + +; =============== S U B R O U T I N E ======================================= + public em_bts +em_bts proc + ret +em_bts endp + +; --------------------------------------------------------------------------- + align 8 + +; =============== S U B R O U T I N E ======================================= +__btsw_ax_dx proc + bts ax, dx + ret +__btsw_ax_dx endp + +; --------------------------------------------------------------------------- + align 8 + +; =============== S U B R O U T I N E ======================================= +__btsl_eax_edx proc + bts eax, edx + ret +__btsl_eax_edx endp + +; --------------------------------------------------------------------------- + align 8 + +; =============== S U B R O U T I N E ======================================= +__btsq_rax_rdx proc + bts rax, rdx + ret +__btsq_rax_rdx endp + +; --------------------------------------------------------------------------- + align 8 + +; =============== S U B R O U T I N E ======================================= + public em_btr +em_btr proc + ret +em_btr endp + +; --------------------------------------------------------------------------- + align 8 + +; =============== S U B R O U T I N E ======================================= +__btrw_ax_dx proc + btr ax, dx + ret +__btrw_ax_dx endp + +; --------------------------------------------------------------------------- + align 8 + +; =============== S U B R O U T I N E ======================================= +__btrl_eax_edx proc + btr eax, edx + ret +__btrl_eax_edx endp + +; --------------------------------------------------------------------------- + align 8 + +; =============== S U B R O U T I N E ======================================= +__btrq_rax_rdx proc + btr rax, rdx + ret +__btrq_rax_rdx endp + +; --------------------------------------------------------------------------- + align 8 + +; =============== S U B R O U T I N E ======================================= + public em_btc +em_btc proc + ret +em_btc endp + +; --------------------------------------------------------------------------- + align 8 + +; =============== S U B R O U T I N E ======================================= +__btcw_ax_dx proc + btc ax, dx + ret +__btcw_ax_dx endp + +; --------------------------------------------------------------------------- + align 8 + +; =============== S U B R 
O U T I N E ======================================= +__btcl_eax_edx proc + btc eax, edx + ret +__btcl_eax_edx endp + +; --------------------------------------------------------------------------- + align 8 + +; =============== S U B R O U T I N E ======================================= +__btcq_rax_rdx proc + btc rax, rdx + ret +__btcq_rax_rdx endp + +; --------------------------------------------------------------------------- + align 8 + +; =============== S U B R O U T I N E ======================================= + public em_xadd +em_xadd proc +__xaddb_al_dl proc + xadd al, dl + ret +__xaddb_al_dl endp +em_xadd endp + +; --------------------------------------------------------------------------- + align 8 + +; =============== S U B R O U T I N E ======================================= +__xaddw_ax_dx proc + xadd ax, dx + ret +__xaddw_ax_dx endp + +; --------------------------------------------------------------------------- + align 8 + +; =============== S U B R O U T I N E ======================================= +__xaddl_eax_edx proc + xadd eax, edx + ret +__xaddl_eax_edx endp + +; --------------------------------------------------------------------------- + align 8 + +; =============== S U B R O U T I N E ======================================= +__xaddq_rax_rdx proc + xadd rax, rdx + ret +__xaddq_rax_rdx endp + +; --------------------------------------------------------------------------- + align 8 + +; =============== S U B R O U T I N E ======================================= + public em_cmp_r +em_cmp_r proc +__cmpb_dl_al proc + cmp dl, al + ret +__cmpb_dl_al endp +em_cmp_r endp + +; --------------------------------------------------------------------------- + align 8 + +; =============== S U B R O U T I N E ======================================= +__cmpw_dx_ax proc + cmp dx, ax + ret +__cmpw_dx_ax endp + +; --------------------------------------------------------------------------- + align 8 + +; =============== S U B R O U T I N E ======================================= +__cmpl_edx_eax proc + cmp edx, eax + ret +__cmpl_edx_eax endp + +; --------------------------------------------------------------------------- + align 8 +; =============== S U B R O U T I N E ======================================= +__cmpq_rdx_rax proc + cmp rdx, rax + ret +__cmpq_rdx_rax endp + +__int12 proc + int 12h + ret +__int12 endp + +__read_dr0 proc + mov rax, dr0 + ret +__read_dr0 endp + +__read_dr1 proc + mov rax, dr1 + ret +__read_dr1 endp + +__read_dr2 proc + mov rax, dr2 + ret +__read_dr2 endp + +__read_dr3 proc + mov rax, dr3 + ret +__read_dr3 endp + +__read_dr6 proc + mov rax, dr6 + ret +__read_dr6 endp + +__read_dr7 proc + mov rax, dr7 + ret +__read_dr7 endp + +__write_dr0 proc + mov dr0, rcx + ret +__write_dr0 endp + +__write_dr1 proc + mov dr1, rcx + ret +__write_dr1 endp + +__write_dr2 proc + mov dr2, rcx + ret +__write_dr2 endp + +__write_dr3 proc + mov dr3, rcx + ret +__write_dr3 endp + +__write_dr6 proc + mov dr6, rcx + ret +__write_dr6 endp + +__write_dr7 proc + mov dr7, rcx + ret +__write_dr7 endp + +__asm_svm_vcpu_run proc + ;save abi non-volatile ergisters + push r12 + push r13 + push r14 + push r15 + push rdi + push rsi + push rbx + ;refer to KVM svm.c + mov rax, rcx + push rbp + mov rbx, qword ptr SVM_TO_RBX[rax] + mov rcx, qword ptr SVM_TO_RCX[rax] + mov rdx, qword ptr SVM_TO_RDX[rax] + mov rsi, qword ptr SVM_TO_RSI[rax] + mov rdi, qword ptr SVM_TO_RDI[rax] + mov rbp, qword ptr SVM_TO_RBP[rax] + mov r8, qword ptr SVM_TO_R8[rax] + mov r9, qword ptr SVM_TO_R9[rax] + mov r10, qword ptr 
SVM_TO_R10[rax] + mov r11, qword ptr SVM_TO_R11[rax] + mov r12, qword ptr SVM_TO_R12[rax] + mov r13, qword ptr SVM_TO_R13[rax] + mov r14, qword ptr SVM_TO_R14[rax] + mov r15, qword ptr SVM_TO_R15[rax] + ;Enter guest mode + push rax + mov rax, qword ptr SVM_TO_VMCB_PA[rax] + vmload rax + vmrun rax + vmsave rax + pop rax + ;Save guest registers, load host registers + mov qword ptr SVM_TO_RBX[rax], rbx + mov qword ptr SVM_TO_RCX[rax], rcx + mov qword ptr SVM_TO_RDX[rax], rdx + mov qword ptr SVM_TO_RSI[rax], rsi + mov qword ptr SVM_TO_RDI[rax], rdi + mov qword ptr SVM_TO_RBP[rax], rbp + mov qword ptr SVM_TO_R8[rax], r8 + mov qword ptr SVM_TO_R9[rax], r9 + mov qword ptr SVM_TO_R10[rax], r10 + mov qword ptr SVM_TO_R11[rax], r11 + mov qword ptr SVM_TO_R12[rax], r12 + mov qword ptr SVM_TO_R13[rax], r13 + mov qword ptr SVM_TO_R14[rax], r14 + mov qword ptr SVM_TO_R15[rax], r15 + pop rbp + + ;restore abi non-volatile registers + pop rbx + pop rsi + pop rdi + pop r15 + pop r14 + pop r13 + pop r12 + ret +__asm_svm_vcpu_run endp + + end diff --git a/gvm-main.c b/gvm-main.c new file mode 100755 index 0000000..02e4dc9 --- /dev/null +++ b/gvm-main.c @@ -0,0 +1,429 @@ +/* + * Copyright 2019 Google LLC + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#include <ntddk.h> +#include <gvm-main.h> +#include <ntkrutils.h> +#include <linux/kvm_host.h> + +#define ClearFlag(_F,_SF) ((_F) &= ~(_SF)) + +struct cpuinfo_x86 boot_cpu_data; + +/* Device Name */ +#define GVM_DEVICE_NAME L"\\Device\\gvm" +#define GVM_DOS_DEVICE_NAME L"\\DosDevices\\gvm" +#define POWER_CALL_BACK_NAME L"\\Callback\\PowerState" + +static PCALLBACK_OBJECT power_callback; +static PVOID power_callback_handle; +static int suspend; +static atomic_t suspend_wait; + +DRIVER_INITIALIZE DriverEntry; + +PDRIVER_OBJECT gpDrvObj; + +PVOID pZeroPage = NULL; + +extern int vmx_init(void); +extern void vmx_exit(void); +extern int svm_init(void); +extern void svm_exit(void); +extern int kvm_suspend(void); +extern void kvm_resume(void); + +int gvmUpdateReturnBuffer(PIRP pIrp, size_t start, void *src, size_t size) +{ + PIO_STACK_LOCATION pIoStack = IoGetCurrentIrpStackLocation(pIrp); + unsigned char *pBuff = pIrp->AssociatedIrp.SystemBuffer; + size_t buffSize = pIoStack->Parameters.DeviceIoControl.OutputBufferLength; + + if ((start + size) > buffSize) + return -E2BIG; + + RtlCopyBytes(pBuff + start, src, size); + pIrp->IoStatus.Information = start + size; + return 0; +} + +VOID NTAPI gvmWaitSuspend( + _In_ PKAPC Apc, + _Inout_ PKNORMAL_ROUTINE* NormalRoutine, + _Inout_ PVOID* NormalContext, + _Inout_ PVOID* SystemArgument1, + _Inout_ PVOID* SystemArgument2) +{ + UNREFERENCED_PARAMETER(NormalRoutine); + UNREFERENCED_PARAMETER(NormalContext); + UNREFERENCED_PARAMETER(SystemArgument1); + UNREFERENCED_PARAMETER(SystemArgument2); + + atomic_inc(&suspend_wait); + + while (suspend) + _mm_pause(); + + atomic_dec(&suspend_wait); +} + +VOID gvmDriverUnload(PDRIVER_OBJECT pDrvObj) +{ + //XXX: Clean up other devices? 
+ PDEVICE_OBJECT pDevObj = pDrvObj->DeviceObject; + UNICODE_STRING DosDeviceName; + char CPUString[13]; + unsigned int eax = 0; + + if (power_callback_handle) + ExUnregisterCallback(power_callback_handle); + if (power_callback) + ObDereferenceObject(power_callback); + + RtlInitUnicodeString(&DosDeviceName, GVM_DOS_DEVICE_NAME); + IoDeleteSymbolicLink(&DosDeviceName); + IoDeleteDevice(pDevObj); + + RtlZeroBytes(CPUString, 13); + cpuid(0, &eax, + (unsigned int *)&CPUString[0], + (unsigned int *)&CPUString[8], + (unsigned int *)&CPUString[4]); + if (strcmp("GenuineIntel", CPUString) == 0) + vmx_exit(); + else if (strcmp("AuthenticAMD", CPUString) == 0) + svm_exit(); + + ExFreePoolWithTag(pZeroPage, GVM_POOL_TAG); + NtKrUtilsExit(); +} + +NTSTATUS kvm_vcpu_release(PDEVICE_OBJECT pDevObj, PIRP pIrp); +NTSTATUS kvm_vm_release(PDEVICE_OBJECT pDevObj, PIRP pIrp); +NTSTATUS gvmDeviceClose(PDEVICE_OBJECT pDevObj, PIRP pIrp) +{ + NTSTATUS rc = STATUS_INVALID_PARAMETER; + struct gvm_device_extension *pDevExt; + + DbgPrint("GVM device close\n"); + + pDevExt = pDevObj->DeviceExtension; + switch (pDevExt->DevType) { + case GVM_DEVICE_TOP: + rc = STATUS_SUCCESS; + break; + case GVM_DEVICE_VM: + rc = kvm_vm_release(pDevObj, pIrp); + break; + case GVM_DEVICE_VCPU: + rc = kvm_vcpu_release(pDevObj, pIrp); + break; + default: + DbgPrint("gvm Device Close with incorrect device type!\n"); + } + + if (pDevExt->DevType != GVM_DEVICE_TOP) + IoDeleteDevice(pDevObj); + + // Completing the device control + pIrp->IoStatus.Status = rc; + pIrp->IoStatus.Information = 0; + IoCompleteRequest(pIrp, IO_NO_INCREMENT); + + return rc; +} + +NTSTATUS gvmDeviceCreate(PDEVICE_OBJECT pDevObj, PIRP pIrp) +{ + DbgPrint("GVM device open\n"); + UNREFERENCED_PARAMETER(pDevObj); + + pIrp->IoStatus.Status = STATUS_SUCCESS; + pIrp->IoStatus.Information = 0; + + IoCompleteRequest(pIrp, IO_NO_INCREMENT); + return STATUS_SUCCESS; +} + +NTSTATUS gvmCreateVMDevice(PHANDLE pHandle, + UINT32 vmNumber, INT32 vcpuNumber, PVOID PrivData) +{ + UNICODE_STRING deviceName; + WCHAR wDeviceName[64] = { 0 }; + PDEVICE_OBJECT pDevObj = NULL; + OBJECT_ATTRIBUTES objAttr; + NTSTATUS rc; + HANDLE handle; + IO_STATUS_BLOCK ioStatBlock; + struct gvm_device_extension *pDevExt; + + RtlInitEmptyUnicodeString(&deviceName, wDeviceName, 64); + + if (vcpuNumber == -1) + RtlUnicodeStringPrintf(&deviceName, + L"\\Device\\gvm_vm%d", vmNumber); + else if(vcpuNumber >= 0 && vcpuNumber <= 128 ) + RtlUnicodeStringPrintf(&deviceName, + L"\\Device\\gvm_vm%d_vcpu%d", + vmNumber, + vcpuNumber); + + rc = IoCreateDevice(gpDrvObj, + sizeof(struct gvm_device_extension), + &deviceName, + FILE_DEVICE_GVM, + FILE_DEVICE_SECURE_OPEN, + FALSE, + &pDevObj); + if (!NT_SUCCESS(rc)) + return rc; + + pDevExt = pDevObj->DeviceExtension; + if (vcpuNumber == -1) + pDevExt->DevType = GVM_DEVICE_VM; + else + pDevExt->DevType = GVM_DEVICE_VCPU; + pDevExt->PrivData = PrivData; + + ClearFlag(pDevObj->Flags, DO_DEVICE_INITIALIZING); + + InitializeObjectAttributes(&objAttr, &deviceName, 0, NULL, NULL); + + rc = ZwCreateFile(&handle, + GENERIC_ALL, + &objAttr, + &ioStatBlock, + NULL, + FILE_ATTRIBUTE_NORMAL, + FILE_SHARE_READ | FILE_SHARE_WRITE, + FILE_OPEN, + FILE_NON_DIRECTORY_FILE, + 0, 0); + if (NT_SUCCESS(rc)) + *pHandle = handle; + + return rc; +} + +NTSTATUS gvmDeleteVMDevice(PDEVICE_OBJECT pDevObj, + UINT32 vmNumber, INT32 vcpuNumber) +{ + UNICODE_STRING deviceName; + WCHAR wDeviceName[32] = { 0 }; + PFILE_OBJECT pFileObj = NULL; + NTSTATUS rc; + + // If Device Object is already specified, simple 
delete it + if (pDevObj) + IoDeleteDevice(pDevObj); + + // We need to locate the device object first + RtlInitEmptyUnicodeString(&deviceName, wDeviceName, 32); + + if (vcpuNumber == -1) + RtlUnicodeStringPrintf(&deviceName, + L"\\Device\\gvm_vm%d", vmNumber); + else if (vcpuNumber >= 0 && vcpuNumber <= 128) + RtlUnicodeStringPrintf(&deviceName, + L"\\Device\\gvm_vm%d_vcpu%d", + vmNumber, + vcpuNumber); + + rc = IoGetDeviceObjectPointer(&deviceName, + FILE_ALL_ACCESS, + &pFileObj, + &pDevObj); + ObDereferenceObject(pFileObj); + if (!NT_SUCCESS(rc)) + goto out; + + IoDeleteDevice(pDevObj); +out: + return rc; +} + +NTSTATUS gvmDeviceControl(PDEVICE_OBJECT pDevObj, PIRP pIrp) +{ + NTSTATUS rc = STATUS_INVALID_PARAMETER; + PIO_STACK_LOCATION pIoStackLocation; + ULONG ioctl; + size_t arg; + struct gvm_device_extension *pDevExt; + + pIoStackLocation = IoGetCurrentIrpStackLocation(pIrp); + NT_ASSERT(pIoStackLocation != NULL); + + ioctl = pIoStackLocation->Parameters.DeviceIoControl.IoControlCode; + arg = (size_t)pIrp->AssociatedIrp.SystemBuffer; + + pDevExt = pDevObj->DeviceExtension; + switch (pDevExt->DevType) { + case GVM_DEVICE_TOP: + rc = kvm_dev_ioctl(pDevObj, pIrp, ioctl); + break; + case GVM_DEVICE_VM: + rc = kvm_vm_ioctl(pDevObj, pIrp, ioctl); + break; + case GVM_DEVICE_VCPU: + rc = kvm_vcpu_ioctl(pDevObj, pIrp, ioctl); + break; + default: + DbgPrint("gvm Device Control with incorrect device type!\n"); + } + + switch (rc) { + case -EINVAL: + rc = STATUS_INVALID_PARAMETER; + break; + case -EAGAIN: + rc = STATUS_RETRY; + break; + case -E2BIG: + rc = STATUS_BUFFER_OVERFLOW; + break; + case -EFAULT: + rc = STATUS_INTERNAL_ERROR; + break; + default: + break; + } + + // Completing the device control + pIrp->IoStatus.Status = rc; + IoCompleteRequest(pIrp, IO_NO_INCREMENT); + + return rc; +} + +static void gvmPowerCallback(void *notused, void *arg1, void *arg2) +{ + struct kvm *kvm; + struct kvm_vcpu *vcpu; + int i, wait; + + if (arg1 != (PVOID) PO_CB_SYSTEM_STATE_LOCK) + return; + + if (arg2 == (PVOID) 0) { + // About to enter suspend mode + suspend = 1; + wait = 0; +#define LIST_ENTRY_TYPE_INFO struct kvm + list_for_each_entry(kvm, &vm_list, vm_list) { + kvm_for_each_vcpu(i, vcpu, kvm) { + if (KeInsertQueueApc(&vcpu->apc, 0, 0, 0)) + wait++; + } + } +#undef LIST_ENTRY_TYPE_INFO + // Wait APC preempted vcpu threads + while (wait != suspend_wait) + _mm_pause(); + kvm_suspend(); + } else if (arg2 == (PVOID)1) { + // Resume from suspend mode + kvm_resume(); + suspend = 0; + } +} + +NTSTATUS _stdcall DriverEntry(PDRIVER_OBJECT pDrvObj, PUNICODE_STRING pRegPath) +{ + UNICODE_STRING DeviceName; + UNICODE_STRING DosDeviceName; + UNICODE_STRING PowerCallbackName;; + OBJECT_ATTRIBUTES PowerCallbackAttr; + PDEVICE_OBJECT pDevObj = NULL; + struct gvm_device_extension *pDevExt; + NTSTATUS rc; + int r; + char CPUString[13]; + unsigned int eax = 0; + + rc = NtKrUtilsInit(); + if (!NT_SUCCESS(rc)) + return rc; + + // Allocate and Initialize a zero page + pZeroPage = ExAllocatePoolWithTag(NonPagedPool, + PAGE_SIZE, GVM_POOL_TAG); + if (!pZeroPage) + return STATUS_NO_MEMORY; + RtlZeroBytes(pZeroPage, PAGE_SIZE); + + RtlZeroBytes(CPUString, 13); + cpuid(0, &eax, + (unsigned int *)&CPUString[0], + (unsigned int *)&CPUString[8], + (unsigned int *)&CPUString[4]); + if (strcmp("GenuineIntel", CPUString) == 0) + r = vmx_init(); + else if (strcmp("AuthenticAMD", CPUString) == 0) + r = svm_init(); + else { + DbgPrint("Processor %s is not supported\n", CPUString); + r = STATUS_NOT_SUPPORTED; + } + if (r) + return r; + + 
gpDrvObj = pDrvObj; + + RtlInitUnicodeString(&DeviceName, GVM_DEVICE_NAME); + + rc = IoCreateDevice(pDrvObj, + sizeof(struct gvm_device_extension), + &DeviceName, + FILE_DEVICE_GVM, + FILE_DEVICE_SECURE_OPEN, + FALSE, + &pDevObj); + + if (!NT_SUCCESS(rc)) + goto out_free1; + + pDevExt = pDevObj->DeviceExtension; + pDevExt->DevType = GVM_DEVICE_TOP; + + pDrvObj->DriverUnload = gvmDriverUnload; + pDrvObj->MajorFunction[IRP_MJ_CREATE] = gvmDeviceCreate; + pDrvObj->MajorFunction[IRP_MJ_CLOSE] = gvmDeviceClose; + pDrvObj->MajorFunction[IRP_MJ_DEVICE_CONTROL] = gvmDeviceControl; + + /* Register callback for system sleep transitions. + * According to OSR online document, the other way available + * is to convert the driver to be PNP compliant. + */ + RtlInitUnicodeString(&PowerCallbackName, POWER_CALL_BACK_NAME); + InitializeObjectAttributes(&PowerCallbackAttr, + &PowerCallbackName, 0, NULL, NULL); + rc = ExCreateCallback(&power_callback, &PowerCallbackAttr, + true, true); + if (NT_SUCCESS(rc)) + power_callback_handle = ExRegisterCallback(power_callback, + gvmPowerCallback, + NULL); + + RtlInitUnicodeString(&DosDeviceName, GVM_DOS_DEVICE_NAME); + + rc = IoCreateSymbolicLink(&DosDeviceName, &DeviceName); + if (!NT_SUCCESS(rc)) + goto out_free2; + + return STATUS_SUCCESS; + +out_free2: + IoDeleteDevice(pDevObj); +out_free1: + return rc; +} diff --git a/gvm-main.h b/gvm-main.h new file mode 100755 index 0000000..a0007c2 --- /dev/null +++ b/gvm-main.h @@ -0,0 +1,43 @@ +/* + * Copyright 2019 Google LLC + + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 2 as published by the Free Software Foundation. + + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#pragma once + +#include <ntddk.h> +#include <ntstrsafe.h> +#include <gvm_types.h> +#include <ntkrutils.h> + +#define GVM_DEVICE_TOP 0 +#define GVM_DEVICE_VM 1 +#define GVM_DEVICE_VCPU 2 +struct gvm_device_extension { + UINT32 DevType; + PVOID PrivData; +}; + +extern PVOID pZeroPage; + +extern int gvmUpdateReturnBuffer(PIRP pIrp, size_t start, void *src, size_t size); +extern void gvmWaitSuspend( + _In_ PKAPC Apc, + _Inout_ PKNORMAL_ROUTINE* NormalRoutine, + _Inout_ PVOID* NormalContext, + _Inout_ PVOID* SystemArgument1, + _Inout_ PVOID* SystemArgument2) ; +extern long kvm_dev_ioctl(PDEVICE_OBJECT pDevObj, PIRP pIrp, unsigned int ioctl); +extern long kvm_vm_ioctl(PDEVICE_OBJECT pDevObj, PIRP pIrp, unsigned int ioctl); +extern long kvm_vcpu_ioctl(PDEVICE_OBJECT pDevObj, PIRP pIrp, unsigned int ioctl); +extern NTSTATUS gvmCreateVMDevice(PHANDLE pHandle, UINT32 vmNumber, INT32 vcpuNumber, + PVOID PrivData); +extern NTSTATUS gvmDeleteVMDevice(PDEVICE_OBJECT pDevObj, UINT32 vmNumber, INT32 vcpuNumber); @@ -0,0 +1,41 @@ +/*
+ * Copyright 2019 Google LLC
+
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * version 2 as published by the Free Software Foundation.
+
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#include <gvm_ver.h>
+#include <windows.h>
+
+#define VER_DEBUG 2
+#define VER_PRERELEASE 0
+#define VER_FILEFLAGSMASK VS_FFI_FILEFLAGSMASK
+#define VER_FILEOS VOS_NT_WINDOWS32
+#define VER_FILEFLAGS (VER_PRERELEASE|VER_DEBUG)
+
+#define VER_FILETYPE VFT_DRV
+#define VER_FILESUBTYPE VFT2_DRV_SYSTEM
+
+#define VER_COMPANYNAME_STR "Google LLC"
+#define VER_PRODUCTNAME_STR "Android Emulator Hypervisor Driver for AMD Processors"
+#define VER_LEGALCOPYRIGHT_YEARS "2019"
+#define VER_LEGALCOPYRIGHT_STR "Copyright (c) " VER_LEGALCOPYRIGHT_YEARS " " VER_COMPANYNAME_STR
+#define VER_LEGALTRADEMARKS_STR VER_LEGALCOPYRIGHT_STR
+
+#define VER_PRODUCTVERSION GVM_RC_VERSION
+#define VER_PRODUCTVERSION_STR GVM_RC_VERSION_STR
+#define VER_PRODUCTVERSION_W (0x0200)
+#define VER_PRODUCTVERSION_DW (0x0200)
+#define VER_FILEDESCRIPTION_STR "Android Emulator Hypervisor Driver for AMD Processors"
+#define VER_INTERNALNAME_STR "Android Emulator Hypervisor Driver for AMD Processors"
+#define VER_ORIGINALFILENAME_STR "gvm.sys"
+
+#include "common.ver"
+
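Note on the version fields above: VER_PRODUCTVERSION and VER_PRODUCTVERSION_STR resolve to GVM_RC_VERSION and GVM_RC_VERSION_STR from gvm_ver.h (added later in this change), which stringize GVM_MAJOR_VERSION 1 and GVM_MINOR_VERSION 0 before common.ver consumes them. A minimal user-mode sketch, not part of the driver build and assuming gvm_ver.h is on the include path, showing what the macros expand to for this 1.0 release:

    #include <stdio.h>
    #include "gvm_ver.h"  /* provides GVM_RC_VERSION_STR and GVM_VERSION */

    int main(void)
    {
        /* _XSTR()-based stringizing yields "1" "." "0" "\0", i.e. "1.0" */
        printf("version string: %s\n", GVM_RC_VERSION_STR);
        /* GVM_VERSION packs major/minor: (1 << 16) | 0 == 0x10000 */
        printf("packed version: 0x%x\n", GVM_VERSION);
        return 0;
    }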
diff --git a/gvm/gvm.sln b/gvm/gvm.sln
new file mode 100755
index 0000000..1eec919
--- /dev/null
+++ b/gvm/gvm.sln
@@ -0,0 +1,36 @@
+
+Microsoft Visual Studio Solution File, Format Version 12.00
+# Visual Studio 15
+VisualStudioVersion = 15.0.26228.57
+MinimumVisualStudioVersion = 10.0.40219.1
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "gvm", "gvm.vcxproj", "{9CDEE243-5FEC-44CD-9C26-A6B8AE76245E}"
+ ProjectSection(ProjectDependencies) = postProject
+ {07877F58-4EE6-4C6E-A6AA-AF42B477A5BE} = {07877F58-4EE6-4C6E-A6AA-AF42B477A5BE}
+ EndProjectSection
+EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "asmgen", "..\asmgen\asmgen.vcxproj", "{07877F58-4EE6-4C6E-A6AA-AF42B477A5BE}"
+EndProject
+Global
+ GlobalSection(SolutionConfigurationPlatforms) = preSolution
+ Debug|x64 = Debug|x64
+ Release|x64 = Release|x64
+ EndGlobalSection
+ GlobalSection(ProjectConfigurationPlatforms) = postSolution
+ {9CDEE243-5FEC-44CD-9C26-A6B8AE76245E}.Debug|x64.ActiveCfg = Debug|x64
+ {9CDEE243-5FEC-44CD-9C26-A6B8AE76245E}.Debug|x64.Build.0 = Debug|x64
+ {9CDEE243-5FEC-44CD-9C26-A6B8AE76245E}.Debug|x64.Deploy.0 = Debug|x64
+ {9CDEE243-5FEC-44CD-9C26-A6B8AE76245E}.Release|x64.ActiveCfg = Release|x64
+ {9CDEE243-5FEC-44CD-9C26-A6B8AE76245E}.Release|x64.Build.0 = Release|x64
+ {9CDEE243-5FEC-44CD-9C26-A6B8AE76245E}.Release|x64.Deploy.0 = Release|x64
+ {07877F58-4EE6-4C6E-A6AA-AF42B477A5BE}.Debug|x64.ActiveCfg = Debug|x64
+ {07877F58-4EE6-4C6E-A6AA-AF42B477A5BE}.Debug|x64.Build.0 = Debug|x64
+ {07877F58-4EE6-4C6E-A6AA-AF42B477A5BE}.Release|x64.ActiveCfg = Release|x64
+ {07877F58-4EE6-4C6E-A6AA-AF42B477A5BE}.Release|x64.Build.0 = Release|x64
+ EndGlobalSection
+ GlobalSection(SolutionProperties) = preSolution
+ HideSolutionNode = FALSE
+ EndGlobalSection
+ GlobalSection(ExtensibilityGlobals) = postSolution
+ SolutionGuid = {47E32EEA-C78F-41B3-9BCA-D6354BE920E8}
+ EndGlobalSection
+EndGlobal
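The ProjectDependencies entry above makes gvm build after asmgen: the pre-build events in gvm.vcxproj (below) run asmgen.exe and redirect its output into __asm.inc, which assembly.asm includes for the SVM_TO_* offsets used by __asm_svm_vcpu_run. The asmgen sources are not shown in this excerpt, so the following is only a hypothetical sketch of how such a generator typically emits MASM equates with offsetof; the structure here is an illustrative stand-in, not the real vcpu_svm layout from arch/x86/kvm/svm.c:

    #include <stdio.h>
    #include <stddef.h>

    /* Illustrative stand-in only; the real layout lives in svm.c. */
    struct vcpu_regs_example {
        unsigned long long vmcb_pa;
        unsigned long long regs[16];
    };

    /* Emit one MASM equate of the shape consumed from __asm.inc. */
    static void emit(const char *name, size_t offset)
    {
        printf("%s EQU 0%llxh\n", name, (unsigned long long)offset);
    }

    int main(void)
    {
        emit("SVM_TO_VMCB_PA", offsetof(struct vcpu_regs_example, vmcb_pa));
        emit("SVM_TO_RAX", offsetof(struct vcpu_regs_example, regs[0]));
        emit("SVM_TO_RBX", offsetof(struct vcpu_regs_example, regs[3]));
        return 0;
    }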
diff --git a/gvm/gvm.vcxproj b/gvm/gvm.vcxproj
new file mode 100755
index 0000000..ab038cf
--- /dev/null
+++ b/gvm/gvm.vcxproj
@@ -0,0 +1,189 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" ToolsVersion="15.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+ <ItemGroup Label="ProjectConfigurations">
+ <ProjectConfiguration Include="Debug|Win32">
+ <Configuration>Debug</Configuration>
+ <Platform>Win32</Platform>
+ </ProjectConfiguration>
+ <ProjectConfiguration Include="Release|Win32">
+ <Configuration>Release</Configuration>
+ <Platform>Win32</Platform>
+ </ProjectConfiguration>
+ <ProjectConfiguration Include="Debug|x64">
+ <Configuration>Debug</Configuration>
+ <Platform>x64</Platform>
+ </ProjectConfiguration>
+ <ProjectConfiguration Include="Release|x64">
+ <Configuration>Release</Configuration>
+ <Platform>x64</Platform>
+ </ProjectConfiguration>
+ </ItemGroup>
+ <PropertyGroup Label="Globals">
+ <ProjectGuid>{9CDEE243-5FEC-44CD-9C26-A6B8AE76245E}</ProjectGuid>
+ <TemplateGuid>{dd38f7fc-d7bd-488b-9242-7d8754cde80d}</TemplateGuid>
+ <TargetFrameworkVersion>v4.6.1</TargetFrameworkVersion>
+ <MinimumVisualStudioVersion>12.0</MinimumVisualStudioVersion>
+ <Configuration>Debug</Configuration>
+ <Platform Condition="'$(Platform)' == ''">Win32</Platform>
+ <RootNamespace>gvm</RootNamespace>
+ <WindowsTargetPlatformVersion>$(LatestTargetPlatformVersion)</WindowsTargetPlatformVersion>
+ </PropertyGroup>
+ <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
+ <TargetVersion>Windows10</TargetVersion>
+ <UseDebugLibraries>true</UseDebugLibraries>
+ <PlatformToolset>WindowsKernelModeDriver10.0</PlatformToolset>
+ <ConfigurationType>Driver</ConfigurationType>
+ <DriverType>WDM</DriverType>
+ <DriverTargetPlatform>Desktop</DriverTargetPlatform>
+ <CharacterSet>Unicode</CharacterSet>
+ </PropertyGroup>
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
+ <TargetVersion>Windows10</TargetVersion>
+ <UseDebugLibraries>false</UseDebugLibraries>
+ <PlatformToolset>WindowsKernelModeDriver10.0</PlatformToolset>
+ <ConfigurationType>Driver</ConfigurationType>
+ <DriverType>WDM</DriverType>
+ </PropertyGroup>
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
+ <TargetVersion>Windows7</TargetVersion>
+ <UseDebugLibraries>true</UseDebugLibraries>
+ <PlatformToolset>WindowsKernelModeDriver10.0</PlatformToolset>
+ <ConfigurationType>Driver</ConfigurationType>
+ <DriverType>WDM</DriverType>
+ <SpectreMitigation>false</SpectreMitigation>
+ <DriverTargetPlatform>Desktop</DriverTargetPlatform>
+ <_NT_TARGET_VERSION>0x0601</_NT_TARGET_VERSION>
+ </PropertyGroup>
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
+ <TargetVersion>Windows7</TargetVersion>
+ <UseDebugLibraries>false</UseDebugLibraries>
+ <PlatformToolset>WindowsKernelModeDriver10.0</PlatformToolset>
+ <ConfigurationType>Driver</ConfigurationType>
+ <DriverType>WDM</DriverType>
+ <SpectreMitigation>false</SpectreMitigation>
+ <DriverTargetPlatform>Desktop</DriverTargetPlatform>
+ <_NT_TARGET_VERSION>0x0601</_NT_TARGET_VERSION>
+ </PropertyGroup>
+ <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+ <ImportGroup Label="PropertySheets">
+ <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+ </ImportGroup>
+ <PropertyGroup Label="UserMacros" />
+ <PropertyGroup />
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ <DebuggerFlavor>DbgengKernelDebugger</DebuggerFlavor>
+ </PropertyGroup>
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ <DebuggerFlavor>DbgengKernelDebugger</DebuggerFlavor>
+ </PropertyGroup>
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ <DebuggerFlavor>DbgengKernelDebugger</DebuggerFlavor>
+ <IntDir>$(ProjectDir)..\build\$(ProjectName)\$(Platform)\$(Configuration)\</IntDir>
+ <OutDir>$(ProjectDir)..\build\$(ProjectName)\$(Platform)\$(Configuration)\</OutDir>
+ </PropertyGroup>
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ <DebuggerFlavor>DbgengKernelDebugger</DebuggerFlavor>
+ <RunCodeAnalysis>false</RunCodeAnalysis>
+ <CodeAnalysisRuleSet>..\..\..\Program Files (x86)\Windows Kits\10\CodeAnalysis\DriverMinimumRules.ruleset</CodeAnalysisRuleSet>
+ <OutDir>$(SolutionDir)..\$(ConfigurationName)\</OutDir>
+ <IntDir>$(ProjectDir)..\build\$(ProjectName)\$(Platform)\$(Configuration)\</IntDir>
+ </PropertyGroup>
+ <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ <ClCompile>
+ <AdditionalIncludeDirectories>$(ProjectDir)..\arch\x86\include;$(ProjectDir)..\include;$(ProjectDir)..;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+ <PreprocessorDefinitions>CONFIG_X86_64;CONFIG_X86_LOCAL_APIC;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+ <TreatWarningAsError>false</TreatWarningAsError>
+ <PreprocessToFile>false</PreprocessToFile>
+ <EnablePREfast>false</EnablePREfast>
+ </ClCompile>
+ <MASM>
+ <PreprocessorDefinitions>X64;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+ <IncludePaths>$(ProjectDir)..\;%(IncludePaths)</IncludePaths>
+ </MASM>
+ <PreBuildEvent>
+ <Command>$(SolutionDir)\..\build\asmgen\x64\$(Configuration)\asmgen.exe > $(ProjectDir)..\__asm.inc</Command>
+ </PreBuildEvent>
+ </ItemDefinitionGroup>
+ <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ <ClCompile>
+ <AdditionalIncludeDirectories>$(ProjectDir)..\arch\x86\include;$(ProjectDir)..\include;$(ProjectDir)..;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+ <PreprocessorDefinitions>CONFIG_X86_64;CONFIG_X86_LOCAL_APIC;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+ <TreatWarningAsError>false</TreatWarningAsError>
+ </ClCompile>
+ <PreBuildEvent>
+ <Command>$(SolutionDir)\..\build\asmgen\x64\$(Configuration)\asmgen.exe > $(ProjectDir)..\__asm.inc</Command>
+ </PreBuildEvent>
+ <MASM>
+ <IncludePaths>$(ProjectDir)..\;%(IncludePaths)</IncludePaths>
+ </MASM>
+ </ItemDefinitionGroup>
+ <ItemGroup>
+ <FilesToPackage Include="$(TargetPath)" />
+ </ItemGroup>
+ <ItemGroup>
+ <ClCompile Include="..\arch\x86\kvm\cpuid.c" />
+ <ClCompile Include="..\arch\x86\kvm\emulate.c" />
+ <ClCompile Include="..\arch\x86\kvm\i8259.c" />
+ <ClCompile Include="..\arch\x86\kvm\ioapic.c" />
+ <ClCompile Include="..\arch\x86\kvm\irq.c" />
+ <ClCompile Include="..\arch\x86\kvm\irq_comm.c" />
+ <ClCompile Include="..\arch\x86\kvm\lapic.c" />
+ <ClCompile Include="..\arch\x86\kvm\mmu.c" />
+ <ClCompile Include="..\arch\x86\kvm\mmu_audit.c" />
+ <ClCompile Include="..\arch\x86\kvm\mtrr.c" />
+ <ClCompile Include="..\arch\x86\kvm\page_track.c" />
+ <ClCompile Include="..\arch\x86\kvm\pmu.c" />
+ <ClCompile Include="..\arch\x86\kvm\pmu_amd.c" />
+ <ClCompile Include="..\arch\x86\kvm\pmu_intel.c" />
+ <ClCompile Include="..\arch\x86\kvm\svm.c" />
+ <ClCompile Include="..\arch\x86\kvm\vmx.c" />
+ <ClCompile Include="..\arch\x86\kvm\x86.c" />
+ <ClCompile Include="..\gvm-main.c" />
+ <ClCompile Include="..\ntkrutils.c" />
+ <ClCompile Include="..\virt\kvm\irqchip.c" />
+ <ClCompile Include="..\virt\kvm\kvm_main.c" />
+ </ItemGroup>
+ <ItemGroup>
+ <ClInclude Include="..\arch\x86\include\asm\kvm_emulate.h" />
+ <ClInclude Include="..\arch\x86\include\asm\kvm_guest.h" />
+ <ClInclude Include="..\arch\x86\include\asm\kvm_host.h" />
+ <ClInclude Include="..\arch\x86\include\asm\kvm_page_track.h" />
+ <ClInclude Include="..\arch\x86\include\uapi\asm\debugreg.h" />
+ <ClInclude Include="..\arch\x86\include\uapi\asm\kvm.h" />
+ <ClInclude Include="..\arch\x86\include\uapi\asm\kvm_perf.h" />
+ <ClInclude Include="..\arch\x86\include\uapi\asm\processor-flags.h" />
+ <ClInclude Include="..\arch\x86\include\uapi\asm\vmx.h" />
+ <ClInclude Include="..\arch\x86\kvm\cpuid.h" />
+ <ClInclude Include="..\arch\x86\kvm\ioapic.h" />
+ <ClInclude Include="..\arch\x86\kvm\irq.h" />
+ <ClInclude Include="..\arch\x86\kvm\kvm_cache_regs.h" />
+ <ClInclude Include="..\arch\x86\kvm\lapic.h" />
+ <ClInclude Include="..\arch\x86\kvm\mmu.h" />
+ <ClInclude Include="..\arch\x86\kvm\mmutrace.h" />
+ <ClInclude Include="..\arch\x86\kvm\paging_tmpl.h" />
+ <ClInclude Include="..\arch\x86\kvm\pmu.h" />
+ <ClInclude Include="..\arch\x86\kvm\tss.h" />
+ <ClInclude Include="..\arch\x86\kvm\x86.h" />
+ <ClInclude Include="..\gvm-main.h" />
+ <ClInclude Include="..\gvm_types.h" />
+ <ClInclude Include="..\include\kvm\iodev.h" />
+ <ClInclude Include="..\include\linux\kvm_host.h" />
+ <ClInclude Include="..\include\linux\kvm_types.h" />
+ <ClInclude Include="..\include\linux\list.h" />
+ <ClInclude Include="..\ntkrutils.h" />
+ <ClInclude Include="..\__asm.h" />
+ </ItemGroup>
+ <ItemGroup>
+ <None Include="..\arch\x86\kvm\Kconfig" />
+ <None Include="..\arch\x86\kvm\Makefile" />
+ <None Include="..\virt\kvm\Kconfig" />
+ </ItemGroup>
+ <ItemGroup>
+ <MASM Include="..\assembly\x64\assembly.asm" />
+ </ItemGroup>
+ <ItemGroup>
+ <ResourceCompile Include="..\gvm.rc" />
+ </ItemGroup>
+ <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+</Project>
\ No newline at end of file diff --git a/gvm_types.h b/gvm_types.h new file mode 100755 index 0000000..9c54710 --- /dev/null +++ b/gvm_types.h @@ -0,0 +1,1534 @@ +/* + * Copyright 2019 Google LLC + + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 2 as published by the Free Software Foundation. + + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#pragma once +#pragma warning(disable : 4018) +#pragma warning(disable : 4100) +#pragma warning(disable : 4152) +#pragma warning(disable : 4389) +#pragma warning(disable : 4267) +#pragma warning(disable : 4242) +#pragma warning(disable : 4244) +#pragma warning(disable : 4245) +#include <intrin.h> +#include <ntddk.h> + +#define __align(a) __declspec(align(a)) +#define inline __inline +#define __always_inline __forceinline +#define __alwaysinline __forceinline + +typedef unsigned char uint8_t; +typedef char int8_t; +typedef unsigned short uint16_t; +typedef short int16_t; +typedef unsigned int uint32_t; +typedef int int32_t; +typedef unsigned long long uint64_t; +typedef long long int64_t; + +typedef unsigned char u8; +typedef char s8; +typedef unsigned short u16; +typedef short s16; +typedef unsigned int u32; +typedef int s32; +typedef unsigned long long u64; +typedef long long s64; + +typedef unsigned char __u8; +typedef char __s8; +typedef unsigned short __u16; +typedef short __s16; +typedef unsigned int __u32; +typedef int __s32; +typedef unsigned long long __u64; +typedef long long __s64; + +/* This is a hack. We should really replace ulong to size_t */ +typedef size_t ulong; + +#define bool _Bool +#define null NULL + +/* It seems VS has size_t but not ssize_t*/ +typedef intptr_t ssize_t; + +// per-cpu implementation +#define MAX_CPU_NUMBERS 512 +#define DEFINE_PER_CPU(type, name) \ + type name[MAX_CPU_NUMBERS] + +#define DECLARE_PER_CPU(type, name) \ + extern type name[MAX_CPU_NUMBERS] + +#define per_cpu(name, cpu) \ + name[cpu] +#define this_cpu_ptr(pname) \ + pname[raw_smp_processor_id()] +#define __this_cpu_write(name, val) \ + name[smp_processor_id()] = val + +//intel pmc stuff +#define INTEL_PMC_MAX_GENERIC 32 +#define INTEL_PMC_MAX_FIXED 3 + +struct irq_work { + int DONOTCARE2; +}; + +typedef u8 mtrr_type; + +#define PAGE_MASK (~(unsigned long long)(PAGE_SIZE - 1)) + +#define kvm_PAGE_TRACK_MAX 1 + +/* +* These are used to make use of C type-checking.. +*/ +typedef size_t pteval_t; +typedef size_t pmdval_t; +typedef size_t pudval_t; +typedef size_t pgdval_t; +typedef size_t pgprotval_t; + +typedef struct { pteval_t pte; } pte_t; + +#define __default_cpu_present_to_apicid(a) 0 + +#define NR_CPU_REGS 17 + +/* BITS_PER_LONG is coming from linux kernel where long int has 64bits for + * x86_64 and 32bits for x86. Microsoft VC always treats long as int. So + * We keep the linux kernel definitions here. Since we replaced long(ulong) + * to ssize_t(size_t). This definition is indeed BITS_PER_SIZET. 
+ */ +#ifdef _WIN64 +#define BITS_PER_LONG 64 +#else +#define BITS_PER_LONG 32 +#endif + +#define atomic_read(a) *a + +#define __must_check + +#define false (unsigned char)0 +#define true (unsigned char)1 + +#pragma warning(disable : 4201) +#pragma pack(push, 1) +struct desc_struct { + union { + struct { + unsigned int a; + unsigned int b; + }; + struct { + u16 limit0; + u16 base0; + unsigned base1 : 8, type : 4, s : 1, dpl : 2, p : 1; + unsigned limit : 4, avl : 1, l : 1, d : 1, g : 1, base2 : 8; + }; + }; +}; + +/* LDT or TSS descriptor in the GDT. 16 bytes. */ +struct ldttss_desc64 { + u16 limit0; + u16 base0; + unsigned base1 : 8, type : 5, dpl : 2, p : 1; + unsigned limit1 : 4, zero0 : 3, g : 1, base2 : 8; + u32 base3; + u32 zero1; +}; +#pragma pack(pop) + +static __inline size_t get_desc_base(const struct desc_struct *desc) +{ + return (size_t)(desc->base0 | ((desc->base1) << 16) | ((desc->base2) << 24)); +} + +static __inline void set_desc_base(struct desc_struct *desc, size_t base) +{ + desc->base0 = base & 0xffff; + desc->base1 = (base >> 16) & 0xff; + desc->base2 = (base >> 24) & 0xff; +} + +static __inline size_t get_desc_limit(const struct desc_struct *desc) +{ + return desc->limit0 | (desc->limit << 16); +} + +static __inline void set_desc_limit(struct desc_struct *desc, size_t limit) +{ + desc->limit0 = limit & 0xffff; + desc->limit = (limit >> 16) & 0xf; +} + +#define __user + +#ifndef EPERM +#define EPERM 1 /* Operation not permitted */ +#endif + +#ifndef ENOENT +#define ENOENT 2 /* No such file or directory */ +#endif + +#ifndef ESRCH +#define ESRCH 3 /* No such process */ +#endif + +#ifndef EINTR +#define EINTR 4 /* Interrupted system call */ +#endif + +#ifndef EIO +#define EIO 5 /* I/O error */ +#endif + +#ifndef ENXIO +#define ENXIO 6 /* No such device or address */ +#endif + +#ifndef E2BIG +#define E2BIG 7 /* Arg list too long */ +#endif + +#ifndef ENOEXEC +#define ENOEXEC 8 /* Exec format error */ +#endif + +#ifndef EBADF +#define EBADF 9 /* Bad file number */ +#endif + +#ifndef ECHILD +#define ECHILD 10 /* No child processes */ +#endif + +#ifndef EAGAIN +#define EAGAIN 11 /* Try again */ +#endif + +#ifndef ENOMEM +#define ENOMEM 12 /* Out of memory */ +#endif + +#ifndef EACCES +#define EACCES 13 /* Permission denied */ +#endif + +#ifndef EFAULT +#define EFAULT 14 /* Bad address */ +#endif + +#ifndef ENOTBLK +#define ENOTBLK 15 /* Block device required */ +#endif + +#ifndef EBUSY +#define EBUSY 16 /* Device or resource busy */ +#endif + +#ifndef EEXIST +#define EEXIST 17 /* File exists */ +#endif + +#ifndef EXDEV +#define EXDEV 18 /* Cross-device link */ +#endif + +#ifndef ENODEV +#define ENODEV 19 /* No such device */ +#endif + +#ifndef ENOTDIR +#define ENOTDIR 20 /* Not a directory */ +#endif + +#ifndef EISDIR +#define EISDIR 21 /* Is a directory */ +#endif + +#ifndef EINVAL +#define EINVAL 22 /* Invalid argument */ +#endif + +#ifndef ENFILE +#define ENFILE 23 /* File table overflow */ +#endif + +#ifndef EMFILE +#define EMFILE 24 /* Too many open files */ +#endif + +#ifndef ENOTTY +#define ENOTTY 25 /* Not a typewriter */ +#endif + +#ifndef ETXTBSY +#define ETXTBSY 26 /* Text file busy */ +#endif + +#ifndef EFBIG +#define EFBIG 27 /* File too large */ +#endif + +#ifndef ENOSPC +#define ENOSPC 28 /* No space left on device */ +#endif + +#ifndef ESPIPE +#define ESPIPE 29 /* Illegal seek */ +#endif + +#ifndef EROFS +#define EROFS 30 /* Read-only file system */ +#endif + +#ifndef EMLINK +#define EMLINK 31 /* Too many links */ +#endif + +#ifndef EPIPE +#define EPIPE 32 
/* Broken pipe */ +#endif + +#ifndef EDOM +#define EDOM 33 /* Math argument out of domain of func */ +#endif + +#ifndef ERANGE +#define ERANGE 34 /* Math result not representable */ +#endif + +#ifndef EDEADLK +#define EDEADLK 35 /* Resource deadlock would occur */ +#endif + +#ifndef ENAMETOOLONG +#define ENAMETOOLONG 36 /* File name too long */ +#endif + +#ifndef ENOLCK +#define ENOLCK 37 /* No record locks available */ +#endif + +#ifndef ENOSYS +#define ENOSYS 38 /* Function not implemented */ +#endif + +#ifndef ENOTEMPTY +#define ENOTEMPTY 39 /* Directory not empty */ +#endif + +#ifndef ELOOP +#define ELOOP 40 /* Too many symbolic links encountered */ +#endif + +#ifndef EWOULDBLOCK +#define EWOULDBLOCK EAGAIN /* Operation would block */ +#endif + +#ifndef ENOMSG +#define ENOMSG 42 /* No message of desired type */ +#endif + +#ifndef EIDRM +#define EIDRM 43 /* Identifier removed */ +#endif + +#ifndef ECHRNG +#define ECHRNG 44 /* Channel number out of range */ +#endif + +#ifndef EL2NSYNC +#define EL2NSYNC 45 /* Level 2 not synchronized */ +#endif + +#ifndef EL3HLT +#define EL3HLT 46 /* Level 3 halted */ +#endif + +#ifndef EL3RST +#define EL3RST 47 /* Level 3 reset */ +#endif + +#ifndef ELNRNG +#define ELNRNG 48 /* Link number out of range */ +#endif + +#ifndef EUNATCH +#define EUNATCH 49 /* Protocol driver not attached */ +#endif + +#ifndef ENOCSI +#define ENOCSI 50 /* No CSI structure available */ +#endif + +#ifndef EL2HLT +#define EL2HLT 51 /* Level 2 halted */ +#endif + +#ifndef EBADE +#define EBADE 52 /* Invalid exchange */ +#endif + +#ifndef EBADR +#define EBADR 53 /* Invalid request descriptor */ +#endif + +#ifndef EXFULL +#define EXFULL 54 /* Exchange full */ +#endif + +#ifndef ENOANO +#define ENOANO 55 /* No anode */ +#endif + +#ifndef EBADRQC +#define EBADRQC 56 /* Invalid request code */ +#endif + +#ifndef EBADSLT +#define EBADSLT 57 /* Invalid slot */ +#endif + +#ifndef EDEADLOCK +#define EDEADLOCK EDEADLK +#endif + +#ifndef EBFONT +#define EBFONT 59 /* Bad font file format */ +#endif + +#ifndef ENOSTR +#define ENOSTR 60 /* Device not a stream */ +#endif + +#ifndef ENODATA +#define ENODATA 61 /* No data available */ +#endif + +#ifndef ETIME +#define ETIME 62 /* Timer expired */ +#endif + +#ifndef ENOSR +#define ENOSR 63 /* Out of streams resources */ +#endif + +#ifndef ENONET +#define ENONET 64 /* Machine is not on the network */ +#endif + +#ifndef ENOPKG +#define ENOPKG 65 /* Package not installed */ +#endif + +#ifndef EREMOTE +#define EREMOTE 66 /* Object is remote */ +#endif + +#ifndef ENOLINK +#define ENOLINK 67 /* Link has been severed */ +#endif + +#ifndef EADV +#define EADV 68 /* Advertise error */ +#endif + +#ifndef ESRMNT +#define ESRMNT 69 /* Srmount error */ +#endif + +#ifndef ECOMM +#define ECOMM 70 /* Communication error on send */ +#endif + +#ifndef EPROTO +#define EPROTO 71 /* Protocol error */ +#endif + +#ifndef EMULTIHOP +#define EMULTIHOP 72 /* Multihop attempted */ +#endif + +#ifndef EDOTDOT +#define EDOTDOT 73 /* RFS specific error */ +#endif + +#ifndef EBADMSG +#define EBADMSG 74 /* Not a data message */ +#endif + +#ifndef EOVERFLOW +#define EOVERFLOW 75 /* Value too large for defined data type */ +#endif + +#ifndef ENOTUNIQ +#define ENOTUNIQ 76 /* Name not unique on network */ +#endif + +#ifndef EBADFD +#define EBADFD 77 /* File descriptor in bad state */ +#endif + +#ifndef EREMCHG +#define EREMCHG 78 /* Remote address changed */ +#endif + +#ifndef ELIBACC +#define ELIBACC 79 /* Can not access a needed shared library */ +#endif + +#ifndef ELIBBAD +#define 
ELIBBAD 80 /* Accessing a corrupted shared library */ +#endif + +#ifndef ELIBSCN +#define ELIBSCN 81 /* .lib section in a.out corrupted */ +#endif + +#ifndef ELIBMAX +#define ELIBMAX 82 /* Attempting to link in too many shared libraries */ +#endif + +#ifndef ELIBEXEC +#define ELIBEXEC 83 /* Cannot exec a shared library directly */ +#endif + +#ifndef EILSEQ +#define EILSEQ 84 /* Illegal byte sequence */ +#endif + +#ifndef ERESTART +#define ERESTART 85 /* Interrupted system call should be restarted */ +#endif + +#ifndef ESTRPIPE +#define ESTRPIPE 86 /* Streams pipe error */ +#endif + +#ifndef EUSERS +#define EUSERS 87 /* Too many users */ +#endif + +#ifndef ENOTSOCK +#define ENOTSOCK 88 /* Socket operation on non-socket */ +#endif + +#ifndef EDESTADDRREQ +#define EDESTADDRREQ 89 /* Destination address required */ +#endif + +#ifndef EMSGSIZE +#define EMSGSIZE 90 /* Message too long */ +#endif + +#ifndef EPROTOTYPE +#define EPROTOTYPE 91 /* Protocol wrong type for socket */ +#endif + +#ifndef ENOPROTOOPT +#define ENOPROTOOPT 92 /* Protocol not available */ +#endif + +#ifndef EPROTONOSUPPORT +#define EPROTONOSUPPORT 93 /* Protocol not supported */ +#endif + +#ifndef ESOCKTNOSUPPORT +#define ESOCKTNOSUPPORT 94 /* Socket type not supported */ +#endif + +#ifndef EOPNOTSUPP +#define EOPNOTSUPP 95 /* Operation not supported on transport endpoint */ +#endif + +#ifndef EPFNOSUPPORT +#define EPFNOSUPPORT 96 /* Protocol family not supported */ +#endif + +#ifndef EAFNOSUPPORT +#define EAFNOSUPPORT 97 /* Address family not supported by protocol */ +#endif + +#ifndef EADDRINUSE +#define EADDRINUSE 98 /* Address already in use */ +#endif + +#ifndef EADDRNOTAVAIL +#define EADDRNOTAVAIL 99 /* Cannot assign requested address */ +#endif + +#ifndef ENETDOWN +#define ENETDOWN 100 /* Network is down */ +#endif + +#ifndef ENETUNREACH +#define ENETUNREACH 101 /* Network is unreachable */ +#endif + +#ifndef ENETRESET +#define ENETRESET 102 /* Network dropped connection because of reset */ +#endif + +#ifndef ECONNABORTED +#define ECONNABORTED 103 /* Software caused connection abort */ +#endif + +#ifndef ECONNRESET +#define ECONNRESET 104 /* Connection reset by peer */ +#endif + +#ifndef ENOBUFS +#define ENOBUFS 105 /* No buffer space available */ +#endif + +#ifndef EISCONN +#define EISCONN 106 /* Transport endpoint is already connected */ +#endif + +#ifndef ENOTCONN +#define ENOTCONN 107 /* Transport endpoint is not connected */ +#endif + +#ifndef ESHUTDOWN +#define ESHUTDOWN 108 /* Cannot send after transport endpoint shutdown */ +#endif + +#ifndef ETOOMANYREFS +#define ETOOMANYREFS 109 /* Too many references: cannot splice */ +#endif + +#ifndef ETIMEDOUT +#define ETIMEDOUT 110 /* Connection timed out */ +#endif + +#ifndef ECONNREFUSED +#define ECONNREFUSED 111 /* Connection refused */ +#endif + +#ifndef EHOSTDOWN +#define EHOSTDOWN 112 /* Host is down */ +#endif + +#ifndef EHOSTUNREACH +#define EHOSTUNREACH 113 /* No route to host */ +#endif + +#ifndef EALREADY +#define EALREADY 114 /* Operation already in progress */ +#endif + +#ifndef EINPROGRESS +#define EINPROGRESS 115 /* Operation now in progress */ +#endif + +#ifndef ESTALE +#define ESTALE 116 /* Stale NFS file handle */ +#endif + +#ifndef EUCLEAN +#define EUCLEAN 117 /* Structure needs cleaning */ +#endif + +#ifndef ENOTNAM +#define ENOTNAM 118 /* Not a XENIX named type file */ +#endif + +#ifndef ENAVAIL +#define ENAVAIL 119 /* No XENIX semaphores available */ +#endif + +#ifndef EISNAM +#define EISNAM 120 /* Is a named type file */ +#endif + +#ifndef 
EREMOTEIO +#define EREMOTEIO 121 /* Remote I/O error */ +#endif + +#ifndef EDQUOT +#define EDQUOT 122 /* Quota exceeded */ +#endif + +#ifndef ENOMEDIUM +#define ENOMEDIUM 123 /* No medium found */ +#endif + +#ifndef EMEDIUMTYPE +#define EMEDIUMTYPE 124 /* Wrong medium type */ +#endif + +#ifndef ECANCELED +#define ECANCELED 125 /* Operation Cancelled */ +#endif + +#ifndef ENOKEY +#define ENOKEY 126 /* Required key not available */ +#endif + +#ifndef EKEYEXPIRED +#define EKEYEXPIRED 127 /* Key has expired */ +#endif + +#ifndef EKEYREVOKED +#define EKEYREVOKED 128 /* Key has been revoked */ +#endif + +#ifndef EKEYREJECTED +#define EKEYREJECTED 129 /* Key was rejected by service */ +#endif + +#ifndef MAX_ERRNO +#define MAX_ERRNO 4095 +#endif + +#define IS_ERR_VALUE(x) ((x) >= (size_t)-MAX_ERRNO) + +static __inline void* ERR_PTR(ssize_t error) +{ + return (void *)error; +} + +static __inline size_t PTR_ERR(const void *ptr) +{ + return (size_t)ptr; +} + +static __inline size_t IS_ERR(const void *ptr) +{ + return IS_ERR_VALUE((size_t)ptr); +} + +#define FOLL_NOWAIT 0 +#define FOLL_HWPOISON 0 +#define FOLL_WRITE 0 +#define FOLL_TOUCH 0 +#define FOLL_NOWAIT 0 + +#define VM_READ + +#define down_read(a) +#define up_read(a) + +#define WRITE_ONCE(a, b) \ +do { \ + _ReadWriteBarrier(); \ + a = b; \ +} while(0) +#define ACCESS_ONCE(a, b) \ +do { \ + _ReadWriteBarrier(); \ + b = a; \ +} while(0) +#define READ_ONCE(a, b) ACCESS_ONCE(a, b) + +#define WARN_ON(a) 0 + +#define PIDTYPE_PID 0 + +#define NOTIFY_OK 0 + +#define atomic_set(a, b) WRITE_ONCE(*a, b) + +#define XSAVE_HDR_SIZE 0 +#define XSAVE_HDR_OFFSET 0x10 +#define XFEATURE_MASK_EXTEND 0x0 + +#define might_sleep() 0 + +// visual c compiler does not support branch hint +#define likely(a) a +#define unlikely(a) a + +#define kvm_pmu_refresh(a) 0 +#define printk DbgPrint +#define pr_info_ratelimited DbgPrint +#define printk_ratelimited DbgPrint +#define printk_once DbgPrint +#define kdprint DbgPrint +#define pr_info DbgPrint +#define pr_warn_once DbgPrint + +// cpuid.c +enum cpuid_leafs +{ + CPUID_1_EDX = 0, + CPUID_8000_0001_EDX, + CPUID_8086_0001_EDX, + CPUID_LNX_1, + CPUID_1_ECX, + CPUID_C000_0001_EDX, + CPUID_8000_0001_ECX, + CPUID_LNX_2, + CPUID_LNX_3, + CPUID_7_0_EBX, + CPUID_D_1_EAX, + CPUID_F_0_EDX, + CPUID_F_1_EDX, + CPUID_8000_0008_EBX, + CPUID_6_EAX, + CPUID_8000_000A_EDX, + CPUID_7_ECX, + CPUID_8000_0007_EBX, +}; + +extern int CPU_HAS_X86_FEATURE_XSAVE; +extern int CPU_HAS_X86_FEATURE_PKU; +extern int CPU_HAS_X86_FEATURE_GBPAGES; +extern int CPU_HAS_X86_FEATURE_HLE; +extern int CPU_HAS_X86_FEATURE_RTM; +extern int CPU_HAS_X86_FEATURE_NX; +extern int CPU_HAS_X86_FEATURE_FXSR_OPT; +extern int CPU_HAS_X86_FEATURE_NPT; +extern int CPU_HAS_X86_FEATURE_AVIC; +extern int CPU_HAS_X86_FEATURE_DECODEASSISTS; +extern int CPU_HAS_X86_FEATURE_RDTSCP; +extern int CPU_HAS_X86_FEATURE_LBRV; +extern int CPU_HAS_X86_FEATURE_NRIPS; +extern int CPU_HAS_X86_FEATURE_SMEP; +extern int CPU_HAS_X86_FEATURE_MPX; +extern int CPU_HAS_X86_FEATURE_XSAVES; +extern int CPU_HAS_X86_FEATURE_CONSTANT_TSC; +extern int CPU_HAS_X86_BUG_AMD_TLB_MMATCH; +extern int CPU_HAS_X86_FEATURE_FLUSHBYASID; +extern int CPU_HAS_X86_FEATURE_OSVW; +extern int CPU_HAS_X86_FEATURE_SVM; + +#define cpu_has(notused, feature) (CPU_HAS_##feature) +#define boot_cpu_has(feature) (CPU_HAS_##feature) +#define static_cpu_has(feature) (CPU_HAS_##feature) + +#define WARN_ON_ONCE(a) 0 + +#define ARRAY_SIZE(x) (sizeof(x) / sizeof(*(x))) + +#define __min_t_func_type(a) \ +static __inline a __##a##_min(a b, a c) 
\ +{ \ + return (b < c) ? b : c; \ +} + +__min_t_func_type(unsigned) +__min_t_func_type(u64) +__min_t_func_type(u32) +__min_t_func_type(int) + +#define min_t(a, b, c) __##a##_min((b), (c)) + +#define offset_in_page(p) ((size_t)(p) & ~PAGE_MASK) +// Let's borrow MS's HYPERVISOR_ERR here +#define BUG() KeBugCheck(0x00020001) +#define BUG_ON(cond) do { if (cond) BUG();} while (0) +#define volatile + +#define min3(a, b, c) min(min(a, b),c) + +#pragma pack(push, 1) +struct desc_ptr { + unsigned short size; + size_t address; +}; +#pragma pack(pop) + +/* + * Bottom two bits of selector give the ring + * privilege level + */ +#define SEGMENT_RPL_MASK 0x3 + +/* User mode is privilege level 3: */ +#define USER_RPL 0x3 + +/* Bit 2 is Table Indicator (TI): selects between LDT or GDT */ +#define SEGMENT_TI_MASK 0x4 +/* LDT segment has TI set ... */ +#define SEGMENT_LDT 0x4 +/* ... GDT has it cleared */ +#define SEGMENT_GDT 0x0 + +#define GDT_ENTRY_INVALID_SEG 0 + +#define swab16 RtlUshortByteSwap +#define swab32 RtlUlongByteSwap +#define swab64 RtlUlonglongByteSwap + +#define container_of CONTAINING_RECORD +#define KERN_WARNING +#define KERN_INFO +#define KERN_ERR +#define KERN_CRIT +#define KERN_DEBUG + +// Bitmaps +#define BITS_TO_LONGS(bits) (bits + BITS_PER_LONG - 1)/BITS_PER_LONG +#define DECLARE_BITMAP(name, bits) \ + size_t name[BITS_TO_LONGS(bits)] + +#define BITMAP_FIRST_WORD_MASK(start) (~(size_t)0 << ((start) & (BITS_PER_LONG - 1))) +#define BITMAP_LAST_WORD_MASK(nbits) (~(size_t)0 >> (-((ssize_t)nbits) & (BITS_PER_LONG - 1))) + +#define small_const_nbits(nbits) \ + ((nbits) <= BITS_PER_LONG) + +static __inline int __bitmap_and(size_t *dst, const size_t *bitmap1, + const size_t *bitmap2, unsigned int bits) +{ + unsigned int k; + unsigned int lim = bits / BITS_PER_LONG; + size_t result = 0; + + for (k = 0; k < lim; k++) + result |= (dst[k] = bitmap1[k] & bitmap2[k]); + if (bits % BITS_PER_LONG) + result |= (dst[k] = bitmap1[k] & bitmap2[k] & + BITMAP_LAST_WORD_MASK(bits)); + return result != 0; +} + +static __inline void __bitmap_or(size_t *dst, const size_t *bitmap1, + const size_t *bitmap2, unsigned int bits) +{ + unsigned int k; + unsigned int nr = BITS_TO_LONGS(bits); + + for (k = 0; k < nr; k++) + dst[k] = bitmap1[k] | bitmap2[k]; +} + +static __inline void __bitmap_xor(size_t *dst, const size_t *bitmap1, + const size_t *bitmap2, unsigned int bits) +{ + unsigned int k; + unsigned int nr = BITS_TO_LONGS(bits); + + for (k = 0; k < nr; k++) + dst[k] = bitmap1[k] ^ bitmap2[k]; +} + +static __inline int __bitmap_andnot(size_t *dst, const size_t *bitmap1, + const size_t *bitmap2, unsigned int bits) +{ + unsigned int k; + unsigned int lim = bits / BITS_PER_LONG; + size_t result = 0; + + for (k = 0; k < lim; k++) + result |= (dst[k] = bitmap1[k] & ~bitmap2[k]); + if (bits % BITS_PER_LONG) + result |= (dst[k] = bitmap1[k] & ~bitmap2[k] & + BITMAP_LAST_WORD_MASK(bits)); + return result != 0; +} + +static __inline void __bitmap_complement(size_t *dst, const size_t *src, unsigned int bits) +{ + unsigned int k, lim = bits / BITS_PER_LONG; + for (k = 0; k < lim; ++k) + dst[k] = ~src[k]; + + if (bits % BITS_PER_LONG) + dst[k] = ~src[k]; +} + +static __inline void bitmap_zero(size_t *dst, unsigned int nbits) +{ + if (small_const_nbits(nbits)) + *dst = 0UL; + else { + unsigned int len = BITS_TO_LONGS(nbits) * sizeof(size_t); + memset(dst, 0, len); + } +} + +static __inline void bitmap_copy(size_t *dst, const size_t *src, + unsigned int nbits) +{ + if (small_const_nbits(nbits)) + *dst = *src; + else { + 
unsigned int len = BITS_TO_LONGS(nbits) * sizeof(size_t); + memcpy(dst, src, len); + } +} + +static __inline int bitmap_and(size_t *dst, const size_t *src1, + const size_t *src2, unsigned int nbits) +{ + if (small_const_nbits(nbits)) + return (*dst = *src1 & *src2 & BITMAP_LAST_WORD_MASK(nbits)) != 0; + return __bitmap_and(dst, src1, src2, nbits); +} + +static __inline void bitmap_or(size_t *dst, const size_t *src1, + const size_t *src2, unsigned int nbits) +{ + if (small_const_nbits(nbits)) + *dst = *src1 | *src2; + else + __bitmap_or(dst, src1, src2, nbits); +} + +static __inline void bitmap_xor(size_t *dst, const size_t *src1, + const size_t *src2, unsigned int nbits) +{ + if (small_const_nbits(nbits)) + *dst = *src1 ^ *src2; + else + __bitmap_xor(dst, src1, src2, nbits); +} + +static __inline int bitmap_andnot(size_t *dst, const size_t *src1, + const size_t *src2, unsigned int nbits) +{ + if (small_const_nbits(nbits)) + return (*dst = *src1 & ~(*src2) & BITMAP_LAST_WORD_MASK(nbits)) != 0; + return __bitmap_andnot(dst, src1, src2, nbits); +} + +static inline void bitmap_complement(size_t *dst, const size_t *src, + unsigned int nbits) +{ + if (small_const_nbits(nbits)) + *dst = ~(*src); + else + __bitmap_complement(dst, src, nbits); +} + +// Bitwise operations +#ifdef _WIN64 +// Non-atomic +static __forceinline bool __test_and_set_bit(size_t pos, volatile size_t *bitmap) +{ + return _bittestandset64((LONG64 *)bitmap, (LONG64)pos); +} + +// Non-atomic +static __forceinline bool __test_and_clear_bit(size_t pos, volatile size_t *bitmap) +{ + return _bittestandreset64((LONG64 *)bitmap, (LONG64)pos); +} + +// Atomic +static __forceinline bool test_and_set_bit(size_t pos, volatile size_t *bitmap) +{ + return _interlockedbittestandset64((LONG64 *)bitmap, (LONG64)pos); +} + +// Atomic +static __forceinline bool test_and_clear_bit(size_t pos, volatile size_t *bitmap) +{ + return _interlockedbittestandreset64((LONG64 *)bitmap, (LONG64)pos); +} + +// Non-atomic +static __forceinline void __set_bit(size_t nr, volatile size_t *addr) +{ + _bittestandset64((LONG64 *)addr, (LONG64)nr); +} + +// Non-atomic +static __forceinline void __clear_bit(size_t nr, volatile size_t *addr) +{ + _bittestandreset64((LONG64 *)addr, (LONG64)nr); +} + +// Atomic +static __forceinline void set_bit(size_t nr, volatile size_t *addr) +{ + _interlockedbittestandset64((LONG64 *)addr, (LONG64)nr); +} + +// Atomic +static __forceinline void clear_bit(size_t nr, volatile size_t *addr) +{ + _interlockedbittestandreset64((LONG64 *)addr, (LONG64)nr); +} + +static __forceinline unsigned char test_bit(size_t nr, volatile size_t *addr) +{ + return _bittest64((LONG64 *)addr, (LONG64)nr); +} +#else +// Non-atomic +static __forceinline bool __test_and_set_bit(size_t pos, volatile size_t *bitmap) +{ + return _bittestandset((LONG *)bitmap, (LONG)pos); +} + +// Non-atomic +static __forceinline bool __test_and_clear_bit(size_t pos, volatile size_t *bitmap) +{ + return _bittestandreset((LONG *)bitmap, (LONG)pos); +} + +// Atomic +static __forceinline bool test_and_set_bit(size_t pos, volatile size_t *bitmap) +{ + return _interlockedbittestandset((LONG *)bitmap, (LONG)pos); +} + +// Atomic +static __forceinline bool test_and_clear_bit(size_t pos, volatile size_t *bitmap) +{ + return _interlockedbittestandreset((LONG *)bitmap, (LONG)pos); +} + +// Non-atomic +static __forceinline void __set_bit(size_t nr, volatile size_t *addr) +{ + _bittestandset((LONG *)addr, (LONG)nr); +} + +// Non-atomic +static __forceinline void __clear_bit(size_t nr, 
volatile size_t *addr) +{ + _bittestandreset((LONG *)addr, (LONG)nr); +} + +// Atomic +static __forceinline void set_bit(size_t nr, volatile size_t *addr) +{ + _interlockedbittestandset((LONG *)addr, (LONG)nr); +} + +// Atomic +static __forceinline void clear_bit(size_t nr, volatile size_t *addr) +{ + _interlockedbittestandreset((LONG *)addr, (LONG)nr); +} + +static __forceinline unsigned char test_bit(size_t nr, volatile size_t *addr) +{ + return _bittest((LONG *)addr, (LONG)nr); +} +#endif + +#ifdef _WIN64 +static __forceinline size_t __ffs(size_t mask) +{ + unsigned long pos; + _BitScanForward64(&pos, mask); + return pos; +} + +static __forceinline size_t __fls(size_t mask) +{ + unsigned long pos; + _BitScanReverse64(&pos, mask); + return pos; +} +#else +static __forceinline size_t __ffs(size_t mask) +{ + unsigned long pos; + _BitScanForward(&pos, mask); + return pos; +} + +static __forceinline size_t __fls(size_t mask) +{ + unsigned long pos; + _BitScanReverse(&pos, mask); + return pos; +} +#endif + + +// Note the difference of linux kernel ffs with BitScanForward +static __forceinline unsigned int ffs(int x) +{ + unsigned long pos; + unsigned char ret = _BitScanForward(&pos, x); + return ret ? pos + 1 : ret; +} + +static __forceinline size_t ffz(size_t x) +{ + return __ffs(~x); +} + +static __forceinline unsigned int fls(int x) +{ + unsigned long pos; + unsigned char ret = _BitScanReverse(&pos, x); + return ret ? pos + 1 : ret; +} + +#ifdef _WIN64 +static __forceinline int fls64(size_t x) +{ + unsigned long pos; + unsigned char ret = _BitScanReverse64(&pos, x); + return ret ? pos + 1 : ret; +} +#else +static __forceinline int fls64(__u64 x) +{ + __u32 h = x >> 32; + if (h) + return fls(h) + 32; + return fls(x); +} +#endif + +static __forceinline u64 do_div(u64 *n, u64 base) +{ + u64 rem = (*n) % base; + *n = (*n) / base; + + return rem; +} + +#ifdef _WIN64 +static __inline uint64_t div64_u64(uint64_t dividend, uint64_t divisor) +{ + return dividend / divisor; +} +#else +static __inline uint64_t div64_u64(uint64_t dividend, uint64_t divisor) +{ + uint32_t high, d; + + high = divisor >> 32; + if (high) + { + unsigned int shift = __fls(high); + + d = divisor >> shift; + dividend >>= shift; + } + else + { + d = divisor; + } + + do_div(dividend, d); + + return dividend; +} +#endif + +#define __read_mostly + +#define HZ 100 + +#define module_param_named(a, b, c, d) 0 +#define module_param(a, b, c) 0 + +#define GDT_ENTRY_TSS 8 + +#define _PAGE_BIT_PRESENT 0 /* is present */ +#define _PAGE_BIT_RW 1 /* writeable */ +#define _PAGE_BIT_USER 2 /* userspace addressable */ +#define _PAGE_BIT_PWT 3 /* page write through */ +#define _PAGE_BIT_PCD 4 /* page cache disabled */ +#define _PAGE_BIT_ACCESSED 5 /* was accessed (raised by CPU) */ +#define _PAGE_BIT_DIRTY 6 /* was written to (raised by CPU) */ +#define _PAGE_BIT_PSE 7 /* 4 MB (or 2MB) page */ +#define _PAGE_BIT_PAT 7 /* on 4KB pages */ +#define _PAGE_BIT_GLOBAL 8 /* Global TLB entry PPro+ */ +#define _PAGE_BIT_SOFTW1 9 /* available for programmer */ +#define _PAGE_BIT_SOFTW2 10 /* " */ +#define _PAGE_BIT_SOFTW3 11 /* " */ +#define _PAGE_BIT_PAT_LARGE 12 /* On 2MB or 1GB pages */ +#define _PAGE_BIT_SOFTW4 58 /* available for programmer */ +#define _PAGE_BIT_PKEY_BIT0 59 /* Protection Keys, bit 1/4 */ +#define _PAGE_BIT_PKEY_BIT1 60 /* Protection Keys, bit 2/4 */ +#define _PAGE_BIT_PKEY_BIT2 61 /* Protection Keys, bit 3/4 */ +#define _PAGE_BIT_PKEY_BIT3 62 /* Protection Keys, bit 4/4 */ +#define _PAGE_BIT_NX 63 /* No execute: only valid after 
cpuid check */ + +#define _PAGE_BIT_SPECIAL _PAGE_BIT_SOFTW1 +#define _PAGE_BIT_CPA_TEST _PAGE_BIT_SOFTW1 +#define _PAGE_BIT_HIDDEN _PAGE_BIT_SOFTW3 /* hidden by kmemcheck */ +#define _PAGE_BIT_SOFT_DIRTY _PAGE_BIT_SOFTW3 /* software dirty tracking */ +#define _PAGE_BIT_DEVMAP _PAGE_BIT_SOFTW4 + +/* If _PAGE_BIT_PRESENT is clear, we use these: */ +/* - if the user mapped it with PROT_NONE; pte_present gives true */ +#define _PAGE_BIT_PROTNONE _PAGE_BIT_GLOBAL + +#define _AT(x, y) y + +#define _PAGE_PRESENT (_AT(pteval_t, 1) << _PAGE_BIT_PRESENT) +#define _PAGE_RW (_AT(pteval_t, 1) << _PAGE_BIT_RW) +#define _PAGE_USER (_AT(pteval_t, 1) << _PAGE_BIT_USER) +#define _PAGE_PWT (_AT(pteval_t, 1) << _PAGE_BIT_PWT) +#define _PAGE_PCD (_AT(pteval_t, 1) << _PAGE_BIT_PCD) +#define _PAGE_ACCESSED (_AT(pteval_t, 1) << _PAGE_BIT_ACCESSED) +#define _PAGE_DIRTY (_AT(pteval_t, 1) << _PAGE_BIT_DIRTY) +#define _PAGE_PSE (_AT(pteval_t, 1) << _PAGE_BIT_PSE) +#define _PAGE_GLOBAL (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL) +#define _PAGE_SOFTW1 (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW1) +#define _PAGE_SOFTW2 (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW2) +#define _PAGE_PAT (_AT(pteval_t, 1) << _PAGE_BIT_PAT) +#define _PAGE_PAT_LARGE (_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE) +#define _PAGE_SPECIAL (_AT(pteval_t, 1) << _PAGE_BIT_SPECIAL) +#define _PAGE_CPA_TEST (_AT(pteval_t, 1) << _PAGE_BIT_CPA_TEST) + +#define NMI_VECTOR 2 + +#define pr_err_ratelimited DbgPrint +#define pr_err DbgPrint +#define pr_debug DbgPrint + +//TODO:IOW/R +#define FILE_DEVICE_GVM 0xE3E3 +#define _IO(a, b) CTL_CODE(FILE_DEVICE_GVM,b,METHOD_BUFFERED,FILE_ANY_ACCESS) +#define _IOR(a, b, c) CTL_CODE(FILE_DEVICE_GVM,b,METHOD_BUFFERED,FILE_ANY_ACCESS) +#define _IOW(a, b, c) CTL_CODE(FILE_DEVICE_GVM,b,METHOD_BUFFERED,FILE_ANY_ACCESS) +#define _IOWR(a, b, c) CTL_CODE(FILE_DEVICE_GVM,b,METHOD_BUFFERED,FILE_ANY_ACCESS) + +// bit maps + +/* +* This looks more complex than it should be. But we need to +* get the type for the ~ right in round_down (it needs to be +* as wide as the result!), and we want to evaluate the macro +* arguments just once each. +*/ +#define __round_mask(x, y) ((size_t)((y)-1)) +#define round_up(x, y) ((((x)-1) | __round_mask(x, y))+1) +#define round_down(x, y) ((x) & ~__round_mask(x, y)) + +/* +* This is a common helper function for find_next_bit and +* find_next_zero_bit. The difference is the "invert" argument, which +* is XORed with each fetched word before searching it for one bits. +*/ + +static size_t _find_next_bit(const size_t *addr, + size_t nbits, size_t start, size_t invert) +{ + size_t tmp; + + if (!nbits || start >= nbits) + return nbits; + + tmp = addr[start / BITS_PER_LONG] ^ invert; + + /* Handle 1st word. 
*/ + tmp &= BITMAP_FIRST_WORD_MASK(start); + start = round_down(start, BITS_PER_LONG); + + while (!tmp) { + start += BITS_PER_LONG; + if (start >= nbits) + return nbits; + + tmp = addr[start / BITS_PER_LONG] ^ invert; + } + + return min(start + __ffs(tmp), nbits); +} + +static size_t find_next_bit(const size_t *addr, size_t size, size_t offset) +{ + return _find_next_bit(addr, size, offset, (size_t)0); +} + +static size_t find_next_zero_bit(const size_t *addr, size_t size, size_t offset) +{ + return _find_next_bit(addr, size, offset, ~(size_t)0); +} +/* +* Find the first zero bit in a memory region +*/ +static size_t find_first_zero_bit(const size_t *addr, size_t size) +{ + size_t idx; + + for (idx = 0; idx * BITS_PER_LONG < size; idx++) { + if (addr[idx] != ~0UL) + return min(idx * BITS_PER_LONG + ffz(addr[idx]), size); + } + + return size; +} + +/* +* Find the first set bit in a memory region. +*/ +static __inline size_t find_first_bit(const size_t *addr, size_t size) +{ + size_t idx; + + for (idx = 0; idx * BITS_PER_LONG < size; idx++) { + if (addr[idx]) + return min(idx * BITS_PER_LONG + __ffs(addr[idx]), size); + } + + return size; +} + +#define for_each_set_bit(bit, addr, size) \ + for ((bit) = find_first_bit((addr), (size)); \ + (bit) < (size); \ + (bit) = find_next_bit((addr), (size), (bit) + 1)) + +#define REPEAT_BYTE(x) ((~0ull / 0xff) * (x)) + +//cpumask +#define NR_CPUS 512 +struct cpumask { DECLARE_BITMAP(bits, NR_CPUS); }; +typedef struct cpumask cpumask_t; +typedef struct cpumask *cpumask_var_t; +#define cpumask_bits(maskp) (&((maskp)->bits[0])) + +static inline unsigned int cpumask_check(unsigned int cpu) +{ + return cpu; +} + +static __inline void cpumask_set_cpu(unsigned int cpu, struct cpumask *dstp) +{ + set_bit(cpumask_check(cpu), cpumask_bits(dstp)); +} + +static __inline void cpumask_clear_cpu(int cpu, struct cpumask *dstp) +{ + clear_bit(cpumask_check(cpu), cpumask_bits(dstp)); +} + +static __inline void cpumask_clear(struct cpumask *dstp) +{ + memset(dstp->bits, 0, NR_CPUS / 8); +} + +static __inline unsigned char cpumask_test_cpu(int cpu, struct cpumask *dstp) +{ + return test_bit(cpumask_check(cpu), cpumask_bits(dstp)); +} + +static inline bool cpumask_empty(const struct cpumask *srcp) +{ + return find_first_bit(cpumask_bits(srcp), MAX_CPU_NUMBERS) + == MAX_CPU_NUMBERS; +} + +static _forceinline unsigned int cpumask_next(int n, const struct cpumask *srcp) +{ + return (unsigned int)_find_next_bit(cpumask_bits(srcp), MAX_CPU_NUMBERS, n+1, 0); +} + +/** +* for_each_cpu - iterate over every cpu in a mask +* @cpu: the (optionally unsigned) integer iterator +* @mask: the cpumask pointer +* +* After the loop, cpu is >= nr_cpu_ids. +*/ +#define for_each_cpu(cpu, mask) \ + for ((cpu) = -1; \ + (cpu) = cpumask_next((cpu), (mask)), \ + (cpu) < MAX_CPU_NUMBERS;) + +#define for_each_online_cpu(cpu) \ + for_each_cpu(cpu, cpu_online_mask) + +#define for_each_possible_cpu(cpu) \ + for_each_cpu(cpu, cpu_online_mask) + +#define VM_FAULT_SIGBUS 0x0002 + +/* +* Defines x86 CPU feature bits +*/ +#define NCAPINTS 18 /* N 32-bit words worth of info */ +#define NBUGINTS 1 /* N 32-bit bug flags */ + +/* +* CPU type and hardware bug flags. Kept separately for each CPU. +* Members of this structure are referenced in head.S, so think twice +* before touching them. 
[mj] +*/ + +struct cpuinfo_x86 { + __u8 x86; /* CPU family */ + __u8 x86_vendor; /* CPU vendor */ + __u8 x86_model; + __u8 x86_mask; +#ifdef CONFIG_X86_32 + char wp_works_ok; /* It doesn't on 386's */ + + /* Problems on some 486Dx4's and old 386's: */ + char rfu; + char pad0; + char pad1; +#else + /* Number of 4K pages in DTLB/ITLB combined(in pages): */ + int x86_tlbsize; +#endif + __u8 x86_virt_bits; + __u8 x86_phys_bits; + /* CPUID returned core id bits: */ + __u8 x86_coreid_bits; + /* Max extended CPUID function supported: */ + __u32 extended_cpuid_level; + /* Maximum supported CPUID level, -1=no CPUID: */ + int cpuid_level; + __u32 x86_capability[NCAPINTS + NBUGINTS]; + char x86_vendor_id[16]; + char x86_model_id[64]; + /* in KB - valid for CPUS which support this call: */ + int x86_cache_size; + int x86_cache_alignment; /* In bytes */ + /* Cache QoS architectural values: */ + int x86_cache_max_rmid; /* max index */ + int x86_cache_occ_scale; /* scale to bytes */ + int x86_power; + unsigned long loops_per_jiffy; + /* cpuid returned max cores value: */ + u16 x86_max_cores; + u16 apicid; + u16 initial_apicid; + u16 x86_clflush_size; + /* number of cores as seen by the OS: */ + u16 booted_cores; + /* Physical processor id: */ + u16 phys_proc_id; + /* Logical processor id: */ + u16 logical_proc_id; + /* Core id: */ + u16 cpu_core_id; + /* Index into per_cpu list: */ + u16 cpu_index; + u32 microcode; +}; + +extern struct cpuinfo_x86 boot_cpu_data; + +#pragma warning(disable : 4214) +/* 16byte gate */ +#pragma pack(push, 1) +struct gate_struct64 { + u16 offset_low; + u16 segment; + u16 ist : 3, zero0 : 5, type : 5, dpl : 2, p : 1; + u16 offset_middle; + u32 offset_high; + u32 zero1; +}; +#pragma pack(pop) +#ifdef CONFIG_X86_64 +typedef struct gate_struct64 gate_desc; +#define gate_offset(g) ((g).offset_low | ((size_t)(g).offset_middle << 16) | ((size_t)(g).offset_high << 32)) +#endif + diff --git a/gvm_ver.h b/gvm_ver.h new file mode 100644 index 0000000..34d1aa3 --- /dev/null +++ b/gvm_ver.h @@ -0,0 +1,25 @@ +/* + * Copyright 2019 Google LLC + + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 2 as published by the Free Software Foundation. + + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#pragma once + +#define _STR(str) #str +#define _XSTR(str) _STR(str) + +#define GVM_MAJOR_VERSION 1 +#define GVM_MINOR_VERSION 0 + +#define GVM_VERSION ((GVM_MAJOR_VERSION << 16) | GVM_MINOR_VERSION) + +#define GVM_RC_VERSION GVM_MAJOR_VERSION,GVM_MINOR_VERSION +#define GVM_RC_VERSION_STR _XSTR(GVM_MAJOR_VERSION) "." _XSTR(GVM_MINOR_VERSION) "\0" diff --git a/include/asm-generic/kvm_para.h b/include/asm-generic/kvm_para.h deleted file mode 100644 index fa25bec..0000000 --- a/include/asm-generic/kvm_para.h +++ /dev/null @@ -1,26 +0,0 @@ -#ifndef _ASM_GENERIC_KVM_PARA_H -#define _ASM_GENERIC_KVM_PARA_H - -#include <uapi/asm-generic/kvm_para.h> - - -/* - * This function is used by architectures that support kvm to avoid issuing - * false soft lockup messages. 
- */ -static inline bool kvm_check_and_clear_guest_paused(void) -{ - return false; -} - -static inline unsigned int kvm_arch_para_features(void) -{ - return 0; -} - -static inline bool kvm_para_available(void) -{ - return false; -} - -#endif diff --git a/include/kvm/arm_arch_timer.h b/include/kvm/arm_arch_timer.h deleted file mode 100644 index dda39d8..0000000 --- a/include/kvm/arm_arch_timer.h +++ /dev/null @@ -1,79 +0,0 @@ -/* - * Copyright (C) 2012 ARM Ltd. - * Author: Marc Zyngier <marc.zyngier@arm.com> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - */ - -#ifndef __ASM_ARM_KVM_ARCH_TIMER_H -#define __ASM_ARM_KVM_ARCH_TIMER_H - -#include <linux/clocksource.h> -#include <linux/hrtimer.h> -#include <linux/workqueue.h> - -struct arch_timer_kvm { - /* Virtual offset */ - cycle_t cntvoff; -}; - -struct arch_timer_cpu { - /* Registers: control register, timer value */ - u32 cntv_ctl; /* Saved/restored */ - cycle_t cntv_cval; /* Saved/restored */ - - /* - * Anything that is not used directly from assembly code goes - * here. - */ - - /* Background timer used when the guest is not running */ - struct hrtimer timer; - - /* Work queued with the above timer expires */ - struct work_struct expired; - - /* Background timer active */ - bool armed; - - /* Timer IRQ */ - struct kvm_irq_level irq; - - /* Active IRQ state caching */ - bool active_cleared_last; - - /* Is the timer enabled */ - bool enabled; -}; - -int kvm_timer_hyp_init(void); -int kvm_timer_enable(struct kvm_vcpu *vcpu); -void kvm_timer_init(struct kvm *kvm); -int kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu, - const struct kvm_irq_level *irq); -void kvm_timer_vcpu_init(struct kvm_vcpu *vcpu); -void kvm_timer_flush_hwstate(struct kvm_vcpu *vcpu); -void kvm_timer_sync_hwstate(struct kvm_vcpu *vcpu); -void kvm_timer_vcpu_terminate(struct kvm_vcpu *vcpu); - -u64 kvm_arm_timer_get_reg(struct kvm_vcpu *, u64 regid); -int kvm_arm_timer_set_reg(struct kvm_vcpu *, u64 regid, u64 value); - -bool kvm_timer_should_fire(struct kvm_vcpu *vcpu); -void kvm_timer_schedule(struct kvm_vcpu *vcpu); -void kvm_timer_unschedule(struct kvm_vcpu *vcpu); - -void kvm_timer_vcpu_put(struct kvm_vcpu *vcpu); - -#endif diff --git a/include/kvm/arm_pmu.h b/include/kvm/arm_pmu.h deleted file mode 100644 index 92e7e97..0000000 --- a/include/kvm/arm_pmu.h +++ /dev/null @@ -1,110 +0,0 @@ -/* - * Copyright (C) 2015 Linaro Ltd. - * Author: Shannon Zhao <shannon.zhao@linaro.org> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - */ - -#ifndef __ASM_ARM_KVM_PMU_H -#define __ASM_ARM_KVM_PMU_H - -#include <linux/perf_event.h> -#include <asm/perf_event.h> - -#define ARMV8_PMU_CYCLE_IDX (ARMV8_PMU_MAX_COUNTERS - 1) - -#ifdef CONFIG_KVM_ARM_PMU - -struct kvm_pmc { - u8 idx; /* index into the pmu->pmc array */ - struct perf_event *perf_event; - u64 bitmask; -}; - -struct kvm_pmu { - int irq_num; - struct kvm_pmc pmc[ARMV8_PMU_MAX_COUNTERS]; - bool ready; - bool irq_level; -}; - -#define kvm_arm_pmu_v3_ready(v) ((v)->arch.pmu.ready) -#define kvm_arm_pmu_irq_initialized(v) ((v)->arch.pmu.irq_num >= VGIC_NR_SGIS) -u64 kvm_pmu_get_counter_value(struct kvm_vcpu *vcpu, u64 select_idx); -void kvm_pmu_set_counter_value(struct kvm_vcpu *vcpu, u64 select_idx, u64 val); -u64 kvm_pmu_valid_counter_mask(struct kvm_vcpu *vcpu); -void kvm_pmu_vcpu_reset(struct kvm_vcpu *vcpu); -void kvm_pmu_vcpu_destroy(struct kvm_vcpu *vcpu); -void kvm_pmu_disable_counter(struct kvm_vcpu *vcpu, u64 val); -void kvm_pmu_enable_counter(struct kvm_vcpu *vcpu, u64 val); -void kvm_pmu_overflow_set(struct kvm_vcpu *vcpu, u64 val); -void kvm_pmu_flush_hwstate(struct kvm_vcpu *vcpu); -void kvm_pmu_sync_hwstate(struct kvm_vcpu *vcpu); -void kvm_pmu_software_increment(struct kvm_vcpu *vcpu, u64 val); -void kvm_pmu_handle_pmcr(struct kvm_vcpu *vcpu, u64 val); -void kvm_pmu_set_counter_event_type(struct kvm_vcpu *vcpu, u64 data, - u64 select_idx); -bool kvm_arm_support_pmu_v3(void); -int kvm_arm_pmu_v3_set_attr(struct kvm_vcpu *vcpu, - struct kvm_device_attr *attr); -int kvm_arm_pmu_v3_get_attr(struct kvm_vcpu *vcpu, - struct kvm_device_attr *attr); -int kvm_arm_pmu_v3_has_attr(struct kvm_vcpu *vcpu, - struct kvm_device_attr *attr); -#else -struct kvm_pmu { -}; - -#define kvm_arm_pmu_v3_ready(v) (false) -#define kvm_arm_pmu_irq_initialized(v) (false) -static inline u64 kvm_pmu_get_counter_value(struct kvm_vcpu *vcpu, - u64 select_idx) -{ - return 0; -} -static inline void kvm_pmu_set_counter_value(struct kvm_vcpu *vcpu, - u64 select_idx, u64 val) {} -static inline u64 kvm_pmu_valid_counter_mask(struct kvm_vcpu *vcpu) -{ - return 0; -} -static inline void kvm_pmu_vcpu_reset(struct kvm_vcpu *vcpu) {} -static inline void kvm_pmu_vcpu_destroy(struct kvm_vcpu *vcpu) {} -static inline void kvm_pmu_disable_counter(struct kvm_vcpu *vcpu, u64 val) {} -static inline void kvm_pmu_enable_counter(struct kvm_vcpu *vcpu, u64 val) {} -static inline void kvm_pmu_overflow_set(struct kvm_vcpu *vcpu, u64 val) {} -static inline void kvm_pmu_flush_hwstate(struct kvm_vcpu *vcpu) {} -static inline void kvm_pmu_sync_hwstate(struct kvm_vcpu *vcpu) {} -static inline void kvm_pmu_software_increment(struct kvm_vcpu *vcpu, u64 val) {} -static inline void kvm_pmu_handle_pmcr(struct kvm_vcpu *vcpu, u64 val) {} -static inline void kvm_pmu_set_counter_event_type(struct kvm_vcpu *vcpu, - u64 data, u64 select_idx) {} -static inline bool kvm_arm_support_pmu_v3(void) { return false; } -static inline int kvm_arm_pmu_v3_set_attr(struct kvm_vcpu *vcpu, - struct kvm_device_attr *attr) -{ - return -ENXIO; -} -static inline int kvm_arm_pmu_v3_get_attr(struct kvm_vcpu *vcpu, - struct kvm_device_attr *attr) -{ - return -ENXIO; -} -static inline int kvm_arm_pmu_v3_has_attr(struct kvm_vcpu *vcpu, - struct kvm_device_attr *attr) -{ - return -ENXIO; -} -#endif - -#endif diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h deleted file mode 100644 index 
002f092..0000000 --- a/include/kvm/arm_vgic.h +++ /dev/null @@ -1,326 +0,0 @@ -/* - * Copyright (C) 2015, 2016 ARM Ltd. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - */ -#ifndef __KVM_ARM_VGIC_H -#define __KVM_ARM_VGIC_H - -#include <linux/kernel.h> -#include <linux/kvm.h> -#include <linux/irqreturn.h> -#include <linux/spinlock.h> -#include <linux/static_key.h> -#include <linux/types.h> -#include <kvm/iodev.h> -#include <linux/list.h> -#include <linux/jump_label.h> - -#define VGIC_V3_MAX_CPUS 255 -#define VGIC_V2_MAX_CPUS 8 -#define VGIC_NR_IRQS_LEGACY 256 -#define VGIC_NR_SGIS 16 -#define VGIC_NR_PPIS 16 -#define VGIC_NR_PRIVATE_IRQS (VGIC_NR_SGIS + VGIC_NR_PPIS) -#define VGIC_MAX_PRIVATE (VGIC_NR_PRIVATE_IRQS - 1) -#define VGIC_MAX_SPI 1019 -#define VGIC_MAX_RESERVED 1023 -#define VGIC_MIN_LPI 8192 -#define KVM_IRQCHIP_NUM_PINS (1020 - 32) - -enum vgic_type { - VGIC_V2, /* Good ol' GICv2 */ - VGIC_V3, /* New fancy GICv3 */ -}; - -/* same for all guests, as depending only on the _host's_ GIC model */ -struct vgic_global { - /* type of the host GIC */ - enum vgic_type type; - - /* Physical address of vgic virtual cpu interface */ - phys_addr_t vcpu_base; - - /* GICV mapping */ - void __iomem *vcpu_base_va; - - /* virtual control interface mapping */ - void __iomem *vctrl_base; - - /* Number of implemented list registers */ - int nr_lr; - - /* Maintenance IRQ number */ - unsigned int maint_irq; - - /* maximum number of VCPUs allowed (GICv2 limits us to 8) */ - int max_gic_vcpus; - - /* Only needed for the legacy KVM_CREATE_IRQCHIP */ - bool can_emulate_gicv2; - - /* GIC system register CPU interface */ - struct static_key_false gicv3_cpuif; -}; - -extern struct vgic_global kvm_vgic_global_state; - -#define VGIC_V2_MAX_LRS (1 << 6) -#define VGIC_V3_MAX_LRS 16 -#define VGIC_V3_LR_INDEX(lr) (VGIC_V3_MAX_LRS - 1 - lr) - -enum vgic_irq_config { - VGIC_CONFIG_EDGE = 0, - VGIC_CONFIG_LEVEL -}; - -struct vgic_irq { - spinlock_t irq_lock; /* Protects the content of the struct */ - struct list_head lpi_list; /* Used to link all LPIs together */ - struct list_head ap_list; - - struct kvm_vcpu *vcpu; /* SGIs and PPIs: The VCPU - * SPIs and LPIs: The VCPU whose ap_list - * this is queued on. - */ - - struct kvm_vcpu *target_vcpu; /* The VCPU that this interrupt should - * be sent to, as a result of the - * targets reg (v2) or the - * affinity reg (v3). 
- */ - - u32 intid; /* Guest visible INTID */ - bool pending; - bool line_level; /* Level only */ - bool soft_pending; /* Level only */ - bool active; /* not used for LPIs */ - bool enabled; - bool hw; /* Tied to HW IRQ */ - struct kref refcount; /* Used for LPIs */ - u32 hwintid; /* HW INTID number */ - union { - u8 targets; /* GICv2 target VCPUs mask */ - u32 mpidr; /* GICv3 target VCPU */ - }; - u8 source; /* GICv2 SGIs only */ - u8 priority; - enum vgic_irq_config config; /* Level or edge */ -}; - -struct vgic_register_region; -struct vgic_its; - -enum iodev_type { - IODEV_CPUIF, - IODEV_DIST, - IODEV_REDIST, - IODEV_ITS -}; - -struct vgic_io_device { - gpa_t base_addr; - union { - struct kvm_vcpu *redist_vcpu; - struct vgic_its *its; - }; - const struct vgic_register_region *regions; - enum iodev_type iodev_type; - int nr_regions; - struct kvm_io_device dev; -}; - -struct vgic_its { - /* The base address of the ITS control register frame */ - gpa_t vgic_its_base; - - bool enabled; - bool initialized; - struct vgic_io_device iodev; - struct kvm_device *dev; - - /* These registers correspond to GITS_BASER{0,1} */ - u64 baser_device_table; - u64 baser_coll_table; - - /* Protects the command queue */ - struct mutex cmd_lock; - u64 cbaser; - u32 creadr; - u32 cwriter; - - /* Protects the device and collection lists */ - struct mutex its_lock; - struct list_head device_list; - struct list_head collection_list; -}; - -struct vgic_dist { - bool in_kernel; - bool ready; - bool initialized; - - /* vGIC model the kernel emulates for the guest (GICv2 or GICv3) */ - u32 vgic_model; - - /* Do injected MSIs require an additional device ID? */ - bool msis_require_devid; - - int nr_spis; - - /* TODO: Consider moving to global state */ - /* Virtual control interface mapping */ - void __iomem *vctrl_base; - - /* base addresses in guest physical address space: */ - gpa_t vgic_dist_base; /* distributor */ - union { - /* either a GICv2 CPU interface */ - gpa_t vgic_cpu_base; - /* or a number of GICv3 redistributor regions */ - gpa_t vgic_redist_base; - }; - - /* distributor enabled */ - bool enabled; - - struct vgic_irq *spis; - - struct vgic_io_device dist_iodev; - - bool has_its; - - /* - * Contains the attributes and gpa of the LPI configuration table. - * Since we report GICR_TYPER.CommonLPIAff as 0b00, we can share - * one address across all redistributors. - * GICv3 spec: 6.1.2 "LPI Configuration tables" - */ - u64 propbaser; - - /* Protects the lpi_list and the count value below. 
*/ - spinlock_t lpi_list_lock; - struct list_head lpi_list_head; - int lpi_list_count; -}; - -struct vgic_v2_cpu_if { - u32 vgic_hcr; - u32 vgic_vmcr; - u32 vgic_misr; /* Saved only */ - u64 vgic_eisr; /* Saved only */ - u64 vgic_elrsr; /* Saved only */ - u32 vgic_apr; - u32 vgic_lr[VGIC_V2_MAX_LRS]; -}; - -struct vgic_v3_cpu_if { - u32 vgic_hcr; - u32 vgic_vmcr; - u32 vgic_sre; /* Restored only, change ignored */ - u32 vgic_misr; /* Saved only */ - u32 vgic_eisr; /* Saved only */ - u32 vgic_elrsr; /* Saved only */ - u32 vgic_ap0r[4]; - u32 vgic_ap1r[4]; - u64 vgic_lr[VGIC_V3_MAX_LRS]; -}; - -struct vgic_cpu { - /* CPU vif control registers for world switch */ - union { - struct vgic_v2_cpu_if vgic_v2; - struct vgic_v3_cpu_if vgic_v3; - }; - - unsigned int used_lrs; - struct vgic_irq private_irqs[VGIC_NR_PRIVATE_IRQS]; - - spinlock_t ap_list_lock; /* Protects the ap_list */ - - /* - * List of IRQs that this VCPU should consider because they are either - * Active or Pending (hence the name; AP list), or because they recently - * were one of the two and need to be migrated off this list to another - * VCPU. - */ - struct list_head ap_list_head; - - u64 live_lrs; - - /* - * Members below are used with GICv3 emulation only and represent - * parts of the redistributor. - */ - struct vgic_io_device rd_iodev; - struct vgic_io_device sgi_iodev; - - /* Contains the attributes and gpa of the LPI pending tables. */ - u64 pendbaser; - - bool lpis_enabled; -}; - -extern struct static_key_false vgic_v2_cpuif_trap; - -int kvm_vgic_addr(struct kvm *kvm, unsigned long type, u64 *addr, bool write); -void kvm_vgic_early_init(struct kvm *kvm); -int kvm_vgic_create(struct kvm *kvm, u32 type); -void kvm_vgic_destroy(struct kvm *kvm); -void kvm_vgic_vcpu_early_init(struct kvm_vcpu *vcpu); -void kvm_vgic_vcpu_destroy(struct kvm_vcpu *vcpu); -int kvm_vgic_map_resources(struct kvm *kvm); -int kvm_vgic_hyp_init(void); - -int kvm_vgic_inject_irq(struct kvm *kvm, int cpuid, unsigned int intid, - bool level); -int kvm_vgic_inject_mapped_irq(struct kvm *kvm, int cpuid, unsigned int intid, - bool level); -int kvm_vgic_map_phys_irq(struct kvm_vcpu *vcpu, u32 virt_irq, u32 phys_irq); -int kvm_vgic_unmap_phys_irq(struct kvm_vcpu *vcpu, unsigned int virt_irq); -bool kvm_vgic_map_is_active(struct kvm_vcpu *vcpu, unsigned int virt_irq); - -int kvm_vgic_vcpu_pending_irq(struct kvm_vcpu *vcpu); - -#define irqchip_in_kernel(k) (!!((k)->arch.vgic.in_kernel)) -#define vgic_initialized(k) ((k)->arch.vgic.initialized) -#define vgic_ready(k) ((k)->arch.vgic.ready) -#define vgic_valid_spi(k, i) (((i) >= VGIC_NR_PRIVATE_IRQS) && \ - ((i) < (k)->arch.vgic.nr_spis + VGIC_NR_PRIVATE_IRQS)) - -bool kvm_vcpu_has_pending_irqs(struct kvm_vcpu *vcpu); -void kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu); -void kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu); - -void vgic_v3_dispatch_sgi(struct kvm_vcpu *vcpu, u64 reg); - -/** - * kvm_vgic_get_max_vcpus - Get the maximum number of VCPUs allowed by HW - * - * The host's GIC naturally limits the maximum amount of VCPUs a guest - * can use. 
- */ -static inline int kvm_vgic_get_max_vcpus(void) -{ - return kvm_vgic_global_state.max_gic_vcpus; -} - -int kvm_send_userspace_msi(struct kvm *kvm, struct kvm_msi *msi); - -/** - * kvm_vgic_setup_default_irq_routing: - * Setup a default flat gsi routing table mapping all SPIs - */ -int kvm_vgic_setup_default_irq_routing(struct kvm *kvm); - -#endif /* __KVM_ARM_VGIC_H */ diff --git a/include/kvm/iodev.h b/include/kvm/iodev.h index a6d208b..23228e9 100644..100755 --- a/include/kvm/iodev.h +++ b/include/kvm/iodev.h @@ -1,4 +1,6 @@ /* + * Copyright 2019 Google LLC + * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License. @@ -16,7 +18,6 @@ #define __KVM_IODEV_H__ #include <linux/kvm_types.h> -#include <linux/errno.h> struct kvm_io_device; struct kvm_vcpu; diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 01c0b9c..9fbffc8 100644..100755 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1,3 +1,7 @@ +/* + * Copyright 2019 Google LLC + */ + #ifndef __KVM_HOST_H #define __KVM_HOST_H @@ -6,37 +10,17 @@ * the COPYING file in the top-level directory. */ -#include <linux/types.h> -#include <linux/hardirq.h> -#include <linux/list.h> -#include <linux/mutex.h> -#include <linux/spinlock.h> -#include <linux/signal.h> -#include <linux/sched.h> -#include <linux/bug.h> -#include <linux/mm.h> -#include <linux/mmu_notifier.h> -#include <linux/preempt.h> -#include <linux/msi.h> -#include <linux/slab.h> -#include <linux/rcupdate.h> -#include <linux/ratelimit.h> -#include <linux/err.h> -#include <linux/irqflags.h> -#include <linux/context_tracking.h> -#include <linux/irqbypass.h> -#include <linux/swait.h> -#include <asm/signal.h> - -#include <linux/kvm.h> -#include <linux/kvm_para.h> +#include <uapi/linux/kvm.h> #include <linux/kvm_types.h> #include <asm/kvm_host.h> -#ifndef KVM_MAX_VCPU_ID -#define KVM_MAX_VCPU_ID KVM_MAX_VCPUS +#include <gvm-main.h> +#include <ntkrutils.h> + +#ifndef GVM_MAX_VCPU_ID +#define GVM_MAX_VCPU_ID GVM_MAX_VCPUS #endif /* @@ -44,14 +28,14 @@ * in kvm, other bits are visible for userspace which are defined in * include/linux/kvm_h. */ -#define KVM_MEMSLOT_INVALID (1UL << 16) -#define KVM_MEMSLOT_INCOHERENT (1UL << 17) +#define GVM_MEMSLOT_INVALID (1ULL << 16) +#define GVM_MEMSLOT_INCOHERENT (1ULL << 17) /* Two fragments for cross MMIO pages. */ -#define KVM_MAX_MMIO_FRAGMENTS 2 +#define GVM_MAX_MMIO_FRAGMENTS 2 -#ifndef KVM_ADDRESS_SPACE_NUM -#define KVM_ADDRESS_SPACE_NUM 1 +#ifndef GVM_ADDRESS_SPACE_NUM +#define GVM_ADDRESS_SPACE_NUM 1 #endif /* @@ -59,13 +43,13 @@ * so we can mask bit 62 ~ bit 52 to indicate the error pfn, * mask bit 63 to indicate the noslot pfn. 
*/ -#define KVM_PFN_ERR_MASK (0x7ffULL << 52) -#define KVM_PFN_ERR_NOSLOT_MASK (0xfffULL << 52) -#define KVM_PFN_NOSLOT (0x1ULL << 63) +#define GVM_PFN_ERR_MASK (0x7ffULL << 52) +#define GVM_PFN_ERR_NOSLOT_MASK (0xfffULL << 52) +#define GVM_PFN_NOSLOT (0x1ULL << 63) -#define KVM_PFN_ERR_FAULT (KVM_PFN_ERR_MASK) -#define KVM_PFN_ERR_HWPOISON (KVM_PFN_ERR_MASK + 1) -#define KVM_PFN_ERR_RO_FAULT (KVM_PFN_ERR_MASK + 2) +#define GVM_PFN_ERR_FAULT (GVM_PFN_ERR_MASK) +#define GVM_PFN_ERR_HWPOISON (GVM_PFN_ERR_MASK + 1) +#define GVM_PFN_ERR_RO_FAULT (GVM_PFN_ERR_MASK + 2) /* * error pfns indicate that the gfn is in slot but faild to @@ -73,7 +57,7 @@ */ static inline bool is_error_pfn(kvm_pfn_t pfn) { - return !!(pfn & KVM_PFN_ERR_MASK); + return !!(pfn & GVM_PFN_ERR_MASK); } /* @@ -83,32 +67,29 @@ static inline bool is_error_pfn(kvm_pfn_t pfn) */ static inline bool is_error_noslot_pfn(kvm_pfn_t pfn) { - return !!(pfn & KVM_PFN_ERR_NOSLOT_MASK); + return !!(pfn & GVM_PFN_ERR_NOSLOT_MASK); } /* noslot pfn indicates that the gfn is not in slot. */ static inline bool is_noslot_pfn(kvm_pfn_t pfn) { - return pfn == KVM_PFN_NOSLOT; + return pfn == GVM_PFN_NOSLOT; } /* - * architectures with KVM_HVA_ERR_BAD other than PAGE_OFFSET (e.g. s390) - * provide own defines and kvm_is_error_hva + * According to Windows Virtual Space, it is the middle of [0, 2^64-1] + * that is least likely to be used. We grab two to server as our + * bad hva. */ -#ifndef KVM_HVA_ERR_BAD - -#define KVM_HVA_ERR_BAD (PAGE_OFFSET) -#define KVM_HVA_ERR_RO_BAD (PAGE_OFFSET + PAGE_SIZE) +#define GVM_HVA_ERR_BAD (0x8000000000000000) +#define GVM_HVA_ERR_RO_BAD (GVM_HVA_ERR_BAD + PAGE_SIZE) -static inline bool kvm_is_error_hva(unsigned long addr) +static inline bool kvm_is_error_hva(size_t addr) { - return addr >= PAGE_OFFSET; + return addr == GVM_HVA_ERR_BAD || addr == GVM_HVA_ERR_RO_BAD; } -#endif - -#define KVM_ERR_PTR_BAD_PAGE (ERR_PTR(-ENOENT)) +#define GVM_ERR_PTR_BAD_PAGE (ERR_PTR(-ENOENT)) static inline bool is_error_page(struct page *page) { @@ -119,13 +100,12 @@ static inline bool is_error_page(struct page *page) * Architecture-independent vcpu->requests bit members * Bits 4-7 are reserved for more arch-independent bits. 
*/ -#define KVM_REQ_TLB_FLUSH 0 -#define KVM_REQ_MMU_RELOAD 1 -#define KVM_REQ_PENDING_TIMER 2 -#define KVM_REQ_UNHALT 3 +#define GVM_REQ_TLB_FLUSH 0 +#define GVM_REQ_MMU_RELOAD 1 +#define GVM_REQ_PENDING_TIMER 2 +#define GVM_REQ_UNHALT 3 -#define KVM_USERSPACE_IRQ_SOURCE_ID 0 -#define KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID 1 +#define GVM_USERSPACE_IRQ_SOURCE_ID 0 extern struct kmem_cache *kvm_vcpu_cache; @@ -142,16 +122,15 @@ struct kvm_io_range { struct kvm_io_bus { int dev_count; - int ioeventfd_count; struct kvm_io_range range[]; }; enum kvm_bus { - KVM_MMIO_BUS, - KVM_PIO_BUS, - KVM_VIRTIO_CCW_NOTIFY_BUS, - KVM_FAST_MMIO_BUS, - KVM_NR_BUSES + GVM_MMIO_BUS, + GVM_PIO_BUS, + GVM_VIRTIO_CCW_NOTIFY_BUS, + GVM_FAST_MMIO_BUS, + GVM_NR_BUSES }; int kvm_io_bus_write(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr, @@ -167,26 +146,6 @@ int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx, struct kvm_io_device *kvm_io_bus_get_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr); -#ifdef CONFIG_KVM_ASYNC_PF -struct kvm_async_pf { - struct work_struct work; - struct list_head link; - struct list_head queue; - struct kvm_vcpu *vcpu; - struct mm_struct *mm; - gva_t gva; - unsigned long addr; - struct kvm_arch_async_pf arch; - bool wakeup_all; -}; - -void kvm_clear_async_pf_completion_queue(struct kvm_vcpu *vcpu); -void kvm_check_async_pf_completion(struct kvm_vcpu *vcpu); -int kvm_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, unsigned long hva, - struct kvm_arch_async_pf *arch); -int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu); -#endif - enum { OUTSIDE_GUEST_MODE, IN_GUEST_MODE, @@ -206,66 +165,37 @@ struct kvm_mmio_fragment { struct kvm_vcpu { struct kvm *kvm; -#ifdef CONFIG_PREEMPT_NOTIFIERS - struct preempt_notifier preempt_notifier; -#endif int cpu; int vcpu_id; int srcu_idx; int mode; - unsigned long requests; - unsigned long guest_debug; + size_t requests; + size_t guest_debug; int pre_pcpu; struct list_head blocked_vcpu_list; struct mutex mutex; struct kvm_run *run; + size_t run_userva; - int fpu_active; - int guest_fpu_loaded, guest_xcr0_loaded; - unsigned char fpu_counter; - struct swait_queue_head wq; - struct pid *pid; - int sigset_active; - sigset_t sigset; + int guest_xcr0_loaded; + KEVENT kick_event; + u64 blocked; + PETHREAD thread; + KAPC apc; struct kvm_vcpu_stat stat; - unsigned int halt_poll_ns; bool valid_wakeup; -#ifdef CONFIG_HAS_IOMEM int mmio_needed; int mmio_read_completed; int mmio_is_write; int mmio_cur_fragment; int mmio_nr_fragments; - struct kvm_mmio_fragment mmio_fragments[KVM_MAX_MMIO_FRAGMENTS]; -#endif - -#ifdef CONFIG_KVM_ASYNC_PF - struct { - u32 queued; - struct list_head queue; - struct list_head done; - spinlock_t lock; - } async_pf; -#endif + struct kvm_mmio_fragment mmio_fragments[GVM_MAX_MMIO_FRAGMENTS]; -#ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT - /* - * Cpu relax intercept or pause loop exit optimization - * in_spin_loop: set when a vcpu does a pause loop exit - * or cpu relax intercepted. - * dy_eligible: indicates whether vcpu is eligible for directed yield. - */ - struct { - bool in_spin_loop; - bool dy_eligible; - } spin_loop; -#endif bool preempted; struct kvm_vcpu_arch arch; - struct dentry *debugfs_dentry; }; static inline int kvm_vcpu_exiting_guest_mode(struct kvm_vcpu *vcpu) @@ -277,36 +207,38 @@ static inline int kvm_vcpu_exiting_guest_mode(struct kvm_vcpu *vcpu) * Some of the bitops functions do not support too long bitmaps. * This number must be determined not to exceed such limits. 
*/ -#define KVM_MEM_MAX_NR_PAGES ((1UL << 31) - 1) +#define GVM_MEM_MAX_NR_PAGES ((1ULL << 31) - 1) + +struct pmem_lock { + /* Lock to prevent multiple fault in to the same pfn + * but allow to different pfns. + */ + spinlock_t lock; + PMDL lock_mdl; +}; struct kvm_memory_slot { gfn_t base_gfn; - unsigned long npages; - unsigned long *dirty_bitmap; + size_t npages; + size_t *dirty_bitmap; struct kvm_arch_memory_slot arch; - unsigned long userspace_addr; + size_t userspace_addr; u32 flags; short id; + struct pmem_lock *pmem_lock; + /* A link back to KVM for rp_bitmap */ + struct kvm *kvm; }; -static inline unsigned long kvm_dirty_bitmap_bytes(struct kvm_memory_slot *memslot) +#define ALIGN(x, mask) (((x) + (mask - 1)) & ~(mask - 1)) +#define IS_ALIGNED(x, a) (((x) & ((u64)(a) - 1)) == 0) +#define PAGE_ALIGNED(addr) IS_ALIGNED((size_t)(addr), PAGE_SIZE) + +static inline size_t kvm_dirty_bitmap_bytes(struct kvm_memory_slot *memslot) { - return ALIGN(memslot->npages, BITS_PER_LONG) / 8; + return ALIGN(memslot->npages, (size_t)BITS_PER_LONG) / 8; } -struct kvm_s390_adapter_int { - u64 ind_addr; - u64 summary_addr; - u64 ind_offset; - u32 summary_offset; - u32 adapter_id; -}; - -struct kvm_hv_sint { - u32 vcpu; - u32 sint; -}; - struct kvm_kernel_irq_routing_entry { u32 gsi; u32 type; @@ -325,15 +257,12 @@ struct kvm_kernel_irq_routing_entry { u32 flags; u32 devid; } msi; - struct kvm_s390_adapter_int adapter; - struct kvm_hv_sint hv_sint; }; struct hlist_node link; }; -#ifdef CONFIG_HAVE_KVM_IRQ_ROUTING struct kvm_irq_routing_table { - int chip[KVM_NR_IRQCHIPS][KVM_IRQCHIP_NUM_PINS]; + int chip[GVM_NR_IRQCHIPS][GVM_IRQCHIP_NUM_PINS]; u32 nr_rt_entries; /* * Array indexed by gsi. Each entry contains list of irq chips @@ -341,17 +270,16 @@ struct kvm_irq_routing_table { */ struct hlist_head map[0]; }; -#endif -#ifndef KVM_PRIVATE_MEM_SLOTS -#define KVM_PRIVATE_MEM_SLOTS 0 +#ifndef GVM_PRIVATE_MEM_SLOTS +#define GVM_PRIVATE_MEM_SLOTS 0 #endif -#ifndef KVM_MEM_SLOTS_NUM -#define KVM_MEM_SLOTS_NUM (KVM_USER_MEM_SLOTS + KVM_PRIVATE_MEM_SLOTS) +#ifndef GVM_MEM_SLOTS_NUM +#define GVM_MEM_SLOTS_NUM (GVM_USER_MEM_SLOTS + GVM_PRIVATE_MEM_SLOTS) #endif -#ifndef __KVM_VCPU_MULTIPLE_ADDRESS_SPACE +#ifndef __GVM_VCPU_MULTIPLE_ADDRESS_SPACE static inline int kvm_arch_vcpu_memslots_id(struct kvm_vcpu *vcpu) { return 0; @@ -365,9 +293,9 @@ static inline int kvm_arch_vcpu_memslots_id(struct kvm_vcpu *vcpu) */ struct kvm_memslots { u64 generation; - struct kvm_memory_slot memslots[KVM_MEM_SLOTS_NUM]; + struct kvm_memory_slot memslots[GVM_MEM_SLOTS_NUM]; /* The mapping table from slot id to the index in memslots[]. */ - short id_to_index[KVM_MEM_SLOTS_NUM]; + short id_to_index[GVM_MEM_SLOTS_NUM]; atomic_t lru_slot; int used_slots; }; @@ -375,15 +303,16 @@ struct kvm_memslots { struct kvm { spinlock_t mmu_lock; struct mutex slots_lock; - struct mm_struct *mm; /* userspace tied to this vm */ - struct kvm_memslots *memslots[KVM_ADDRESS_SPACE_NUM]; - struct srcu_struct srcu; - struct srcu_struct irq_srcu; - struct kvm_vcpu *vcpus[KVM_MAX_VCPUS]; + PEPROCESS process; + u64 vm_id; + struct kvm_memslots *memslots[GVM_ADDRESS_SPACE_NUM]; + size_t *rp_bitmap; + u64 rp_bitmap_size; + struct kvm_vcpu *vcpus[GVM_MAX_VCPUS]; /* * created_vcpus is protected by kvm->lock, and is incremented - * at the beginning of KVM_CREATE_VCPU. online_vcpus is only + * at the beginning of GVM_CREATE_VCPU. online_vcpus is only * incremented after storing the kvm_vcpu pointer in vcpus, * and is accessed atomically. 
*/ @@ -392,56 +321,31 @@ struct kvm { int last_boosted_vcpu; struct list_head vm_list; struct mutex lock; - struct kvm_io_bus *buses[KVM_NR_BUSES]; -#ifdef CONFIG_HAVE_KVM_EVENTFD - struct { - spinlock_t lock; - struct list_head items; - struct list_head resampler_list; - struct mutex resampler_lock; - } irqfds; - struct list_head ioeventfds; -#endif + struct kvm_io_bus *buses[GVM_NR_BUSES]; struct kvm_vm_stat stat; struct kvm_arch arch; atomic_t users_count; -#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET - struct kvm_coalesced_mmio_ring *coalesced_mmio_ring; - spinlock_t ring_lock; - struct list_head coalesced_zones; -#endif struct mutex irq_lock; -#ifdef CONFIG_HAVE_KVM_IRQCHIP /* * Update side is protected by irq_lock. */ - struct kvm_irq_routing_table __rcu *irq_routing; -#endif -#ifdef CONFIG_HAVE_KVM_IRQFD - struct hlist_head irq_ack_notifier_list; -#endif + struct kvm_irq_routing_table *irq_routing; -#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER) - struct mmu_notifier mmu_notifier; - unsigned long mmu_notifier_seq; - long mmu_notifier_count; -#endif long tlbs_dirty; - struct list_head devices; - struct dentry *debugfs_dentry; - struct kvm_stat_data **debugfs_stat_data; + struct srcu_struct srcu; + struct srcu_struct irq_srcu; }; #define kvm_err(fmt, ...) \ - pr_err("kvm [%i]: " fmt, task_pid_nr(current), ## __VA_ARGS__) + pr_err("kvm: " fmt, ## __VA_ARGS__) #define kvm_info(fmt, ...) \ - pr_info("kvm [%i]: " fmt, task_pid_nr(current), ## __VA_ARGS__) + pr_info("kvm: " fmt, ## __VA_ARGS__) #define kvm_debug(fmt, ...) \ - pr_debug("kvm [%i]: " fmt, task_pid_nr(current), ## __VA_ARGS__) + pr_debug("kvm: " fmt, ## __VA_ARGS__) #define kvm_pr_unimpl(fmt, ...) \ - pr_err_ratelimited("kvm [%i]: " fmt, \ - task_tgid_nr(current), ## __VA_ARGS__) + pr_err_ratelimited("kvm: " fmt, \ + ## __VA_ARGS__) /* The guest did something we don't support. */ #define vcpu_unimpl(vcpu, fmt, ...) 
\ @@ -476,7 +380,7 @@ static inline struct kvm_vcpu *kvm_get_vcpu_by_id(struct kvm *kvm, int id) if (id < 0) return NULL; - if (id < KVM_MAX_VCPUS) + if (id < GVM_MAX_VCPUS) vcpu = kvm_get_vcpu(kvm, id); if (vcpu && vcpu->vcpu_id == id) return vcpu; @@ -486,9 +390,23 @@ static inline struct kvm_vcpu *kvm_get_vcpu_by_id(struct kvm *kvm, int id) return NULL; } +static inline struct kvm_vcpu *kvm_get_vcpu_by_thread(struct kvm *kvm, + PETHREAD thread) +{ + struct kvm_vcpu *vcpu = NULL; + int i; + + if (!thread < 0) + return NULL; + kvm_for_each_vcpu(i, vcpu, kvm) + if (vcpu->thread == thread) + return vcpu; + return NULL; +} + #define kvm_for_each_memslot(memslot, slots) \ for (memslot = &slots->memslots[0]; \ - memslot < slots->memslots + KVM_MEM_SLOTS_NUM && memslot->npages;\ + memslot < slots->memslots + GVM_MEM_SLOTS_NUM && memslot->npages;\ memslot++) int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id); @@ -497,33 +415,10 @@ void kvm_vcpu_uninit(struct kvm_vcpu *vcpu); int __must_check vcpu_load(struct kvm_vcpu *vcpu); void vcpu_put(struct kvm_vcpu *vcpu); -#ifdef __KVM_HAVE_IOAPIC void kvm_vcpu_request_scan_ioapic(struct kvm *kvm); void kvm_arch_post_irq_routing_update(struct kvm *kvm); -#else -static inline void kvm_vcpu_request_scan_ioapic(struct kvm *kvm) -{ -} -static inline void kvm_arch_post_irq_routing_update(struct kvm *kvm) -{ -} -#endif -#ifdef CONFIG_HAVE_KVM_IRQFD -int kvm_irqfd_init(void); -void kvm_irqfd_exit(void); -#else -static inline int kvm_irqfd_init(void) -{ - return 0; -} - -static inline void kvm_irqfd_exit(void) -{ -} -#endif -int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align, - struct module *module); +int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align); void kvm_exit(void); void kvm_get_kvm(struct kvm *kvm); @@ -531,9 +426,12 @@ void kvm_put_kvm(struct kvm *kvm); static inline struct kvm_memslots *__kvm_memslots(struct kvm *kvm, int as_id) { + return kvm->memslots[as_id]; +#if 0 return rcu_dereference_check(kvm->memslots[as_id], srcu_read_lock_held(&kvm->srcu) || lockdep_is_held(&kvm->slots_lock)); +#endif } static inline struct kvm_memslots *kvm_memslots(struct kvm *kvm) @@ -561,7 +459,7 @@ id_to_memslot(struct kvm_memslots *slots, int id) } /* - * KVM_SET_USER_MEMORY_REGION ioctl allows the following operations: + * GVM_SET_USER_MEMORY_REGION ioctl allows the following operations: * - create a new memory slot * - delete an existing memory slot * - modify an existing memory slot @@ -572,10 +470,10 @@ id_to_memslot(struct kvm_memslots *slots, int id) * differentiation is the best we can do for __kvm_set_memory_region(): */ enum kvm_mr_change { - KVM_MR_CREATE, - KVM_MR_DELETE, - KVM_MR_MOVE, - KVM_MR_FLAGS_ONLY, + GVM_MR_CREATE, + GVM_MR_DELETE, + GVM_MR_MOVE, + GVM_MR_FLAGS_ONLY, }; int kvm_set_memory_region(struct kvm *kvm, @@ -585,7 +483,7 @@ int __kvm_set_memory_region(struct kvm *kvm, void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free, struct kvm_memory_slot *dont); int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot, - unsigned long npages); + size_t npages); void kvm_arch_memslots_updated(struct kvm *kvm, struct kvm_memslots *slots); int kvm_arch_prepare_memory_region(struct kvm *kvm, struct kvm_memory_slot *memslot, @@ -596,27 +494,20 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, const struct kvm_memory_slot *old, const struct kvm_memory_slot *new, enum kvm_mr_change change); -bool kvm_largepages_enabled(void); -void kvm_disable_largepages(void); /* 
flush all memory translations */ void kvm_arch_flush_shadow_all(struct kvm *kvm); /* flush memory translations pointing to 'slot' */ void kvm_arch_flush_shadow_memslot(struct kvm *kvm, struct kvm_memory_slot *slot); -int gfn_to_page_many_atomic(struct kvm_memory_slot *slot, gfn_t gfn, - struct page **pages, int nr_pages); +int gfn_to_pfn_many_atomic(struct kvm_memory_slot *slot, gfn_t gfn, + pfn_t *pfn, int nr_pages); -struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn); -unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn); -unsigned long gfn_to_hva_prot(struct kvm *kvm, gfn_t gfn, bool *writable); -unsigned long gfn_to_hva_memslot(struct kvm_memory_slot *slot, gfn_t gfn); -unsigned long gfn_to_hva_memslot_prot(struct kvm_memory_slot *slot, gfn_t gfn, +size_t gfn_to_hva(struct kvm *kvm, gfn_t gfn); +size_t gfn_to_hva_prot(struct kvm *kvm, gfn_t gfn, bool *writable); +size_t gfn_to_hva_memslot(struct kvm_memory_slot *slot, gfn_t gfn); +size_t gfn_to_hva_memslot_prot(struct kvm_memory_slot *slot, gfn_t gfn, bool *writable); -void kvm_release_page_clean(struct page *page); -void kvm_release_page_dirty(struct page *page); -void kvm_set_page_accessed(struct page *page); - kvm_pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn); kvm_pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn); kvm_pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault, @@ -627,31 +518,24 @@ kvm_pfn_t __gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn, bool atomic, bool *async, bool write_fault, bool *writable); -void kvm_release_pfn_clean(kvm_pfn_t pfn); -void kvm_set_pfn_dirty(kvm_pfn_t pfn); -void kvm_set_pfn_accessed(kvm_pfn_t pfn); -void kvm_get_pfn(kvm_pfn_t pfn); - int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset, int len); -int kvm_read_guest_atomic(struct kvm *kvm, gpa_t gpa, void *data, - unsigned long len); -int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len); +int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, size_t len); int kvm_read_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, - void *data, unsigned long len); + void *data, size_t len); int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn, const void *data, int offset, int len); int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data, - unsigned long len); + size_t len); int kvm_write_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, - void *data, unsigned long len); + void *data, size_t len); int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc, - gpa_t gpa, unsigned long len); + gpa_t gpa, size_t len); int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len); -int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len); +int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, size_t len); struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn); bool kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn); -unsigned long kvm_host_page_size(struct kvm *kvm, gfn_t gfn); +size_t kvm_host_page_size(struct kvm *kvm, gfn_t gfn); void mark_page_dirty(struct kvm *kvm, gfn_t gfn); struct kvm_memslots *kvm_vcpu_memslots(struct kvm_vcpu *vcpu); @@ -659,18 +543,18 @@ struct kvm_memory_slot *kvm_vcpu_gfn_to_memslot(struct kvm_vcpu *vcpu, gfn_t gfn kvm_pfn_t kvm_vcpu_gfn_to_pfn_atomic(struct kvm_vcpu *vcpu, gfn_t gfn); kvm_pfn_t kvm_vcpu_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn); struct page *kvm_vcpu_gfn_to_page(struct kvm_vcpu *vcpu, gfn_t gfn); -unsigned long kvm_vcpu_gfn_to_hva(struct kvm_vcpu *vcpu, gfn_t gfn); 
-unsigned long kvm_vcpu_gfn_to_hva_prot(struct kvm_vcpu *vcpu, gfn_t gfn, bool *writable); +size_t kvm_vcpu_gfn_to_hva(struct kvm_vcpu *vcpu, gfn_t gfn); +size_t kvm_vcpu_gfn_to_hva_prot(struct kvm_vcpu *vcpu, gfn_t gfn, bool *writable); int kvm_vcpu_read_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, void *data, int offset, int len); int kvm_vcpu_read_guest_atomic(struct kvm_vcpu *vcpu, gpa_t gpa, void *data, - unsigned long len); + size_t len); int kvm_vcpu_read_guest(struct kvm_vcpu *vcpu, gpa_t gpa, void *data, - unsigned long len); + size_t len); int kvm_vcpu_write_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, const void *data, int offset, int len); int kvm_vcpu_write_guest(struct kvm_vcpu *vcpu, gpa_t gpa, const void *data, - unsigned long len); + size_t len); void kvm_vcpu_mark_page_dirty(struct kvm_vcpu *vcpu, gfn_t gfn); void kvm_vcpu_block(struct kvm_vcpu *vcpu); @@ -681,17 +565,16 @@ void kvm_vcpu_kick(struct kvm_vcpu *vcpu); int kvm_vcpu_yield_to(struct kvm_vcpu *target); void kvm_vcpu_on_spin(struct kvm_vcpu *vcpu); void kvm_load_guest_fpu(struct kvm_vcpu *vcpu); -void kvm_put_guest_fpu(struct kvm_vcpu *vcpu); +void kvm_save_guest_fpu(struct kvm_vcpu *vcpu); void kvm_flush_remote_tlbs(struct kvm *kvm); void kvm_reload_remote_mmus(struct kvm *kvm); bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req); -long kvm_arch_dev_ioctl(struct file *filp, - unsigned int ioctl, unsigned long arg); -long kvm_arch_vcpu_ioctl(struct file *filp, - unsigned int ioctl, unsigned long arg); -int kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf); +long kvm_arch_dev_ioctl(struct gvm_device_extension *devext, PIRP pIrp, + unsigned int ioctl); +long kvm_arch_vcpu_ioctl(struct gvm_device_extension *devext, PIRP pIrp, + unsigned int ioctl); int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext); @@ -704,15 +587,14 @@ int kvm_get_dirty_log_protect(struct kvm *kvm, void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn_offset, - unsigned long mask); + size_t mask); -int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, - struct kvm_dirty_log *log); +int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log); int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_level, bool line_status); -long kvm_arch_vm_ioctl(struct file *filp, - unsigned int ioctl, unsigned long arg); +long kvm_arch_vm_ioctl(struct gvm_device_extension *devext, PIRP pIrp, + unsigned int ioctl); int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu); int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu); @@ -750,9 +632,6 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu); void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu); void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu); -bool kvm_arch_has_vcpu_debugfs(void); -int kvm_arch_create_vcpu_debugfs(struct kvm_vcpu *vcpu); - int kvm_arch_hardware_enable(void); void kvm_arch_hardware_disable(void); int kvm_arch_hardware_setup(void); @@ -761,9 +640,9 @@ void kvm_arch_check_processor_compat(void *rtn); int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu); int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu); -void *kvm_kvzalloc(unsigned long size); +void *kvm_kvzalloc(size_t size); -#ifndef __KVM_HAVE_ARCH_VM_ALLOC +#ifndef __GVM_HAVE_ARCH_VM_ALLOC static inline struct kvm *kvm_arch_alloc_vm(void) { return kzalloc(sizeof(struct kvm), GFP_KERNEL); @@ -775,53 +654,7 @@ static inline void kvm_arch_free_vm(struct kvm *kvm) } #endif -#ifdef 
__KVM_HAVE_ARCH_NONCOHERENT_DMA -void kvm_arch_register_noncoherent_dma(struct kvm *kvm); -void kvm_arch_unregister_noncoherent_dma(struct kvm *kvm); -bool kvm_arch_has_noncoherent_dma(struct kvm *kvm); -#else -static inline void kvm_arch_register_noncoherent_dma(struct kvm *kvm) -{ -} - -static inline void kvm_arch_unregister_noncoherent_dma(struct kvm *kvm) -{ -} - -static inline bool kvm_arch_has_noncoherent_dma(struct kvm *kvm) -{ - return false; -} -#endif -#ifdef __KVM_HAVE_ARCH_ASSIGNED_DEVICE -void kvm_arch_start_assignment(struct kvm *kvm); -void kvm_arch_end_assignment(struct kvm *kvm); -bool kvm_arch_has_assigned_device(struct kvm *kvm); -#else -static inline void kvm_arch_start_assignment(struct kvm *kvm) -{ -} - -static inline void kvm_arch_end_assignment(struct kvm *kvm) -{ -} - -static inline bool kvm_arch_has_assigned_device(struct kvm *kvm) -{ - return false; -} -#endif - -static inline struct swait_queue_head *kvm_arch_vcpu_wq(struct kvm_vcpu *vcpu) -{ -#ifdef __KVM_HAVE_ARCH_WQP - return vcpu->arch.wqp; -#else - return &vcpu->wq; -#endif -} - -#ifdef __KVM_HAVE_ARCH_INTC_INITIALIZED +#ifdef __GVM_HAVE_ARCH_INTC_INITIALIZED /* * returns true if the virtual interrupt controller is initialized and * ready to accept virtual IRQ. On some architectures the virtual interrupt @@ -835,21 +668,12 @@ static inline bool kvm_arch_intc_initialized(struct kvm *kvm) } #endif -int kvm_arch_init_vm(struct kvm *kvm, unsigned long type); +int kvm_arch_init_vm(struct kvm *kvm, size_t type); void kvm_arch_destroy_vm(struct kvm *kvm); -void kvm_arch_sync_events(struct kvm *kvm); int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu); void kvm_vcpu_kick(struct kvm_vcpu *vcpu); -bool kvm_is_reserved_pfn(kvm_pfn_t pfn); - -struct kvm_irq_ack_notifier { - struct hlist_node link; - unsigned gsi; - void (*irq_acked)(struct kvm_irq_ack_notifier *kian); -}; - int kvm_irq_map_gsi(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *entries, int gsi); int kvm_irq_map_chip_pin(struct kvm *kvm, unsigned irqchip, unsigned pin); @@ -864,29 +688,9 @@ int kvm_arch_set_irq_inatomic(struct kvm_kernel_irq_routing_entry *e, bool kvm_irq_has_notifier(struct kvm *kvm, unsigned irqchip, unsigned pin); void kvm_notify_acked_gsi(struct kvm *kvm, int gsi); void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin); -void kvm_register_irq_ack_notifier(struct kvm *kvm, - struct kvm_irq_ack_notifier *kian); -void kvm_unregister_irq_ack_notifier(struct kvm *kvm, - struct kvm_irq_ack_notifier *kian); int kvm_request_irq_source_id(struct kvm *kvm); void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id); -#ifdef CONFIG_KVM_DEVICE_ASSIGNMENT -int kvm_iommu_map_pages(struct kvm *kvm, struct kvm_memory_slot *slot); -void kvm_iommu_unmap_pages(struct kvm *kvm, struct kvm_memory_slot *slot); -#else -static inline int kvm_iommu_map_pages(struct kvm *kvm, - struct kvm_memory_slot *slot) -{ - return 0; -} - -static inline void kvm_iommu_unmap_pages(struct kvm *kvm, - struct kvm_memory_slot *slot) -{ -} -#endif - /* * search_memslots() and __gfn_to_memslot() are here because they are * used in non-modular code in arch/powerpc/kvm/book3s_hv_rm_mmu.c. 
@@ -928,7 +732,7 @@ __gfn_to_memslot(struct kvm_memslots *slots, gfn_t gfn) return search_memslots(slots, gfn); } -static inline unsigned long +static inline size_t __gfn_to_hva_memslot(struct kvm_memory_slot *slot, gfn_t gfn) { return slot->userspace_addr + (gfn - slot->base_gfn) * PAGE_SIZE; @@ -940,7 +744,7 @@ static inline int memslot_id(struct kvm *kvm, gfn_t gfn) } static inline gfn_t -hva_to_gfn_memslot(unsigned long hva, struct kvm_memory_slot *slot) +hva_to_gfn_memslot(size_t hva, struct kvm_memory_slot *slot) { gfn_t gfn_offset = (hva - slot->userspace_addr) >> PAGE_SHIFT; @@ -964,31 +768,13 @@ static inline hpa_t pfn_to_hpa(kvm_pfn_t pfn) static inline bool kvm_is_error_gpa(struct kvm *kvm, gpa_t gpa) { - unsigned long hva = gfn_to_hva(kvm, gpa_to_gfn(gpa)); + size_t hva = gfn_to_hva(kvm, gpa_to_gfn(gpa)); return kvm_is_error_hva(hva); } -enum kvm_stat_kind { - KVM_STAT_VM, - KVM_STAT_VCPU, -}; - -struct kvm_stat_data { - int offset; - struct kvm *kvm; -}; - -struct kvm_stats_debugfs_item { - const char *name; - int offset; - enum kvm_stat_kind kind; -}; -extern struct kvm_stats_debugfs_item debugfs_entries[]; -extern struct dentry *kvm_debugfs_dir; - -#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER) -static inline int mmu_notifier_retry(struct kvm *kvm, unsigned long mmu_seq) +#if defined(CONFIG_MMU_NOTIFIER) && defined(GVM_ARCH_WANT_MMU_NOTIFIER) +static inline int mmu_notifier_retry(struct kvm *kvm, size_t mmu_seq) { if (unlikely(kvm->mmu_notifier_count)) return 1; @@ -998,7 +784,7 @@ static inline int mmu_notifier_retry(struct kvm *kvm, unsigned long mmu_seq) * mmu_notifier_invalidate_range_end to make sure that the caller * either sees the old (non-zero) value of mmu_notifier_count or * the new (incremented) value of mmu_notifier_seq. - * PowerPC Book3s HV KVM calls this under a per-page lock + * PowerPC Book3s HV kvm calls this under a per-page lock * rather than under kvm->mmu_lock, for scalability, so * can't rely on kvm->mmu_lock to keep things ordered. */ @@ -1009,15 +795,8 @@ static inline int mmu_notifier_retry(struct kvm *kvm, unsigned long mmu_seq) } #endif -#ifdef CONFIG_HAVE_KVM_IRQ_ROUTING -#ifdef CONFIG_S390 -#define KVM_MAX_IRQ_ROUTES 4096 //FIXME: we can have more than that... 
-#elif defined(CONFIG_ARM64) -#define KVM_MAX_IRQ_ROUTES 4096 -#else -#define KVM_MAX_IRQ_ROUTES 1024 -#endif +#define GVM_MAX_IRQ_ROUTES 1024 int kvm_set_irq_routing(struct kvm *kvm, const struct kvm_irq_routing_entry *entries, @@ -1028,57 +807,8 @@ int kvm_set_routing_entry(struct kvm *kvm, const struct kvm_irq_routing_entry *ue); void kvm_free_irq_routing(struct kvm *kvm); -#else - -static inline void kvm_free_irq_routing(struct kvm *kvm) {} - -#endif - int kvm_send_userspace_msi(struct kvm *kvm, struct kvm_msi *msi); -#ifdef CONFIG_HAVE_KVM_EVENTFD - -void kvm_eventfd_init(struct kvm *kvm); -int kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args); - -#ifdef CONFIG_HAVE_KVM_IRQFD -int kvm_irqfd(struct kvm *kvm, struct kvm_irqfd *args); -void kvm_irqfd_release(struct kvm *kvm); -void kvm_irq_routing_update(struct kvm *); -#else -static inline int kvm_irqfd(struct kvm *kvm, struct kvm_irqfd *args) -{ - return -EINVAL; -} - -static inline void kvm_irqfd_release(struct kvm *kvm) {} -#endif - -#else - -static inline void kvm_eventfd_init(struct kvm *kvm) {} - -static inline int kvm_irqfd(struct kvm *kvm, struct kvm_irqfd *args) -{ - return -EINVAL; -} - -static inline void kvm_irqfd_release(struct kvm *kvm) {} - -#ifdef CONFIG_HAVE_KVM_IRQCHIP -static inline void kvm_irq_routing_update(struct kvm *kvm) -{ -} -#endif -void kvm_arch_irq_routing_update(struct kvm *kvm); - -static inline int kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args) -{ - return -ENOSYS; -} - -#endif /* CONFIG_HAVE_KVM_EVENTFD */ - static inline void kvm_make_request(int req, struct kvm_vcpu *vcpu) { /* @@ -1107,93 +837,7 @@ static inline bool kvm_check_request(int req, struct kvm_vcpu *vcpu) extern bool kvm_rebooting; -struct kvm_device { - struct kvm_device_ops *ops; - struct kvm *kvm; - void *private; - struct list_head vm_node; -}; - -/* create, destroy, and name are mandatory */ -struct kvm_device_ops { - const char *name; - - /* - * create is called holding kvm->lock and any operations not suitable - * to do while holding the lock should be deferred to init (see - * below). - */ - int (*create)(struct kvm_device *dev, u32 type); - - /* - * init is called after create if create is successful and is called - * outside of holding kvm->lock. - */ - void (*init)(struct kvm_device *dev); - - /* - * Destroy is responsible for freeing dev. - * - * Destroy may be called before or after destructors are called - * on emulated I/O regions, depending on whether a reference is - * held by a vcpu or other kvm component that gets destroyed - * after the emulated I/O. 
- */ - void (*destroy)(struct kvm_device *dev); - - int (*set_attr)(struct kvm_device *dev, struct kvm_device_attr *attr); - int (*get_attr)(struct kvm_device *dev, struct kvm_device_attr *attr); - int (*has_attr)(struct kvm_device *dev, struct kvm_device_attr *attr); - long (*ioctl)(struct kvm_device *dev, unsigned int ioctl, - unsigned long arg); -}; - -void kvm_device_get(struct kvm_device *dev); -void kvm_device_put(struct kvm_device *dev); -struct kvm_device *kvm_device_from_filp(struct file *filp); -int kvm_register_device_ops(struct kvm_device_ops *ops, u32 type); -void kvm_unregister_device_ops(u32 type); - -extern struct kvm_device_ops kvm_mpic_ops; -extern struct kvm_device_ops kvm_xics_ops; -extern struct kvm_device_ops kvm_arm_vgic_v2_ops; -extern struct kvm_device_ops kvm_arm_vgic_v3_ops; - -#ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT - -static inline void kvm_vcpu_set_in_spin_loop(struct kvm_vcpu *vcpu, bool val) -{ - vcpu->spin_loop.in_spin_loop = val; -} -static inline void kvm_vcpu_set_dy_eligible(struct kvm_vcpu *vcpu, bool val) -{ - vcpu->spin_loop.dy_eligible = val; -} - -#else /* !CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT */ - -static inline void kvm_vcpu_set_in_spin_loop(struct kvm_vcpu *vcpu, bool val) -{ -} - -static inline void kvm_vcpu_set_dy_eligible(struct kvm_vcpu *vcpu, bool val) -{ -} -#endif /* CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT */ - -#ifdef CONFIG_HAVE_KVM_IRQ_BYPASS -bool kvm_arch_has_irq_bypass(void); -int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *, - struct irq_bypass_producer *); -void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *, - struct irq_bypass_producer *); -void kvm_arch_irq_bypass_stop(struct irq_bypass_consumer *); -void kvm_arch_irq_bypass_start(struct irq_bypass_consumer *); -int kvm_arch_update_irqfd_routing(struct kvm *kvm, unsigned int host_irq, - uint32_t guest_irq, bool set); -#endif /* CONFIG_HAVE_KVM_IRQ_BYPASS */ - -#ifdef CONFIG_HAVE_KVM_INVALID_WAKEUPS +#ifdef CONFIG_HAVE_GVM_INVALID_WAKEUPS /* If we wakeup during the poll time, was it a sucessful poll? */ static inline bool vcpu_valid_wakeup(struct kvm_vcpu *vcpu) { @@ -1205,6 +849,6 @@ static inline bool vcpu_valid_wakeup(struct kvm_vcpu *vcpu) { return true; } -#endif /* CONFIG_HAVE_KVM_INVALID_WAKEUPS */ +#endif /* CONFIG_HAVE_GVM_INVALID_WAKEUPS */ #endif diff --git a/include/linux/kvm_irqfd.h b/include/linux/kvm_irqfd.h deleted file mode 100644 index 0c1de05..0000000 --- a/include/linux/kvm_irqfd.h +++ /dev/null @@ -1,71 +0,0 @@ -/* - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * irqfd: Allows an fd to be used to inject an interrupt to the guest - * Credit goes to Avi Kivity for the original idea. - */ - -#ifndef __LINUX_KVM_IRQFD_H -#define __LINUX_KVM_IRQFD_H - -#include <linux/kvm_host.h> -#include <linux/poll.h> - -/* - * Resampling irqfds are a special variety of irqfds used to emulate - * level triggered interrupts. The interrupt is asserted on eventfd - * trigger. On acknowledgment through the irq ack notifier, the - * interrupt is de-asserted and userspace is notified through the - * resamplefd. 
All resamplers on the same gsi are de-asserted - * together, so we don't need to track the state of each individual - * user. We can also therefore share the same irq source ID. - */ -struct kvm_kernel_irqfd_resampler { - struct kvm *kvm; - /* - * List of resampling struct _irqfd objects sharing this gsi. - * RCU list modified under kvm->irqfds.resampler_lock - */ - struct list_head list; - struct kvm_irq_ack_notifier notifier; - /* - * Entry in list of kvm->irqfd.resampler_list. Use for sharing - * resamplers among irqfds on the same gsi. - * Accessed and modified under kvm->irqfds.resampler_lock - */ - struct list_head link; -}; - -struct kvm_kernel_irqfd { - /* Used for MSI fast-path */ - struct kvm *kvm; - wait_queue_t wait; - /* Update side is protected by irqfds.lock */ - struct kvm_kernel_irq_routing_entry irq_entry; - seqcount_t irq_entry_sc; - /* Used for level IRQ fast-path */ - int gsi; - struct work_struct inject; - /* The resampler used by this irqfd (resampler-only) */ - struct kvm_kernel_irqfd_resampler *resampler; - /* Eventfd notified on resample (resampler-only) */ - struct eventfd_ctx *resamplefd; - /* Entry in list of irqfds for a resampler (resampler-only) */ - struct list_head resampler_link; - /* Used for setup/shutdown */ - struct eventfd_ctx *eventfd; - struct list_head list; - poll_table pt; - struct work_struct shutdown; - struct irq_bypass_consumer consumer; - struct irq_bypass_producer *producer; -}; - -#endif /* __LINUX_KVM_IRQFD_H */ diff --git a/include/linux/kvm_para.h b/include/linux/kvm_para.h deleted file mode 100644 index 35e568f..0000000 --- a/include/linux/kvm_para.h +++ /dev/null @@ -1,11 +0,0 @@ -#ifndef __LINUX_KVM_PARA_H -#define __LINUX_KVM_PARA_H - -#include <uapi/linux/kvm_para.h> - - -static inline bool kvm_para_has_feature(unsigned int feature) -{ - return !!(kvm_arch_para_features() & (1UL << feature)); -} -#endif /* __LINUX_KVM_PARA_H */ diff --git a/include/linux/kvm_types.h b/include/linux/kvm_types.h index 8bf259d..8f5b576 100644..100755 --- a/include/linux/kvm_types.h +++ b/include/linux/kvm_types.h @@ -1,4 +1,6 @@ /* + * Copyright 2019 Google LLC + * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License. 
@@ -18,8 +20,6 @@ #define __KVM_TYPES_H__ struct kvm; -struct kvm_async_pf; -struct kvm_device_ops; struct kvm_interrupt; struct kvm_irq_routing_table; struct kvm_memory_slot; @@ -27,12 +27,11 @@ struct kvm_one_reg; struct kvm_run; struct kvm_userspace_memory_region; struct kvm_vcpu; -struct kvm_vcpu_init; struct kvm_memslots; enum kvm_mr_change; -#include <asm/types.h> +#include <gvm_types.h> /* * Address types: @@ -45,11 +44,11 @@ enum kvm_mr_change; * hfn - host frame number */ -typedef unsigned long gva_t; +typedef size_t gva_t; typedef u64 gpa_t; typedef u64 gfn_t; -typedef unsigned long hva_t; +typedef size_t hva_t; typedef u64 hpa_t; typedef u64 hfn_t; @@ -58,8 +57,8 @@ typedef hfn_t kvm_pfn_t; struct gfn_to_hva_cache { u64 generation; gpa_t gpa; - unsigned long hva; - unsigned long len; + size_t hva; + size_t len; struct kvm_memory_slot *memslot; }; diff --git a/include/linux/list.h b/include/linux/list.h new file mode 100755 index 0000000..4f38f41 --- /dev/null +++ b/include/linux/list.h @@ -0,0 +1,802 @@ +/* + * Copyright 2019 Google LLC + */ + +#ifndef _LINUX_LIST_H +#define _LINUX_LIST_H + +/* We don't want to include ntkrutil.h here as we do in other + * places, because nrktutil.h depends on list.h. + * So this header should be made to work with all dependencies + * in gvm_types.h. + */ +#include <gvm_types.h> + +/* +* These are non-NULL pointers that will result in page faults +* under normal circumstances, used to verify that nobody uses +* non-initialized list entries. +*/ +#define LIST_POISON1 ((void *) 0x100) +#define LIST_POISON2 ((void *) 0x200) + +struct list_head { + struct list_head *next, *prev; +}; + +struct hlist_head { + struct hlist_node *first; +}; + +struct hlist_node { + struct hlist_node *next, **pprev; +}; + +/* + * since there is not typeof in MSVC C portion, have to + * ask programmer to define the type info befor calling + * to these macros. +*/ +#define typeof(a) LIST_ENTRY_TYPE_INFO +/* + * Simple doubly linked list implementation. + * + * Some of the internal functions ("__xxx") are useful when + * manipulating whole lists rather than single entries, as + * sometimes we already know the next/prev entries and we can + * generate better code by using them directly rather than + * using the generic single-entry routines. + */ + +#define LIST_HEAD_INIT(name) { &(name), &(name) } + +#define LIST_HEAD(name) \ + struct list_head name = LIST_HEAD_INIT(name) + +static inline void INIT_LIST_HEAD(struct list_head *list) +{ + WRITE_ONCE(list->next, list); + list->prev = list; +} + +/* + * Insert a new__ entry between two known consecutive entries. + * + * This is only for internal list manipulation where we know + * the prev/next entries already! + */ +#ifndef CONFIG_DEBUG_LIST +static inline void __list_add(struct list_head *new__, + struct list_head *prev, + struct list_head *next) +{ + next->prev = new__; + new__->next = next; + new__->prev = prev; + WRITE_ONCE(prev->next, new__); +} +#else +extern void __list_add(struct list_head *new__, + struct list_head *prev, + struct list_head *next); +#endif + +/** + * list_add - add a new__ entry + * @new__: new__ entry to be added + * @head: list head to add it after + * + * Insert a new__ entry after the specified head. + * This is good for implementing stacks. 
+ */ +static inline void list_add(struct list_head *new__, struct list_head *head) +{ + __list_add(new__, head, head->next); +} + + +/** + * list_add_tail - add a new__ entry + * @new__: new__ entry to be added + * @head: list head to add it before + * + * Insert a new__ entry before the specified head. + * This is useful for implementing queues. + */ +static inline void list_add_tail(struct list_head *new__, struct list_head *head) +{ + __list_add(new__, head->prev, head); +} + +/* + * Delete a list entry by making the prev/next entries + * point to each other. + * + * This is only for internal list manipulation where we know + * the prev/next entries already! + */ +static inline void __list_del(struct list_head * prev, struct list_head * next) +{ + next->prev = prev; + WRITE_ONCE(prev->next, next); +} + +/** + * list_del - deletes entry from list. + * @entry: the element to delete from the list. + * Note: list_empty() on entry does not return true after this, the entry is + * in an undefined state. + */ +#ifndef CONFIG_DEBUG_LIST +static inline void __list_del_entry(struct list_head *entry) +{ + __list_del(entry->prev, entry->next); +} + +static inline void list_del(struct list_head *entry) +{ + __list_del(entry->prev, entry->next); + entry->next = LIST_POISON1; + entry->prev = LIST_POISON2; +} +#else +extern void __list_del_entry(struct list_head *entry); +extern void list_del(struct list_head *entry); +#endif + +/** + * list_replace - replace old entry by new__ one + * @old : the element to be replaced + * @new__ : the new__ element to insert + * + * If @old was empty, it will be overwritten. + */ +static inline void list_replace(struct list_head *old, + struct list_head *new__) +{ + new__->next = old->next; + new__->next->prev = new__; + new__->prev = old->prev; + new__->prev->next = new__; +} + +static inline void list_replace_init(struct list_head *old, + struct list_head *new__) +{ + list_replace(old, new__); + INIT_LIST_HEAD(old); +} + +/** + * list_del_init - deletes entry from list and reinitialize it. + * @entry: the element to delete from the list. + */ +static inline void list_del_init(struct list_head *entry) +{ + __list_del_entry(entry); + INIT_LIST_HEAD(entry); +} + +/** + * list_move - delete from one list and add as another's head + * @list: the entry to move + * @head: the head that will precede our entry + */ +static inline void list_move(struct list_head *list, struct list_head *head) +{ + __list_del_entry(list); + list_add(list, head); +} + +/** + * list_move_tail - delete from one list and add as another's tail + * @list: the entry to move + * @head: the head that will follow our entry + */ +static inline void list_move_tail(struct list_head *list, + struct list_head *head) +{ + __list_del_entry(list); + list_add_tail(list, head); +} + +/** + * list_is_last - tests whether @list is the last entry in list @head + * @list: the entry to test + * @head: the head of the list + */ +static inline int list_is_last(const struct list_head *list, + const struct list_head *head) +{ + return list->next == head; +} + +/** + * list_empty - tests whether a list is empty + * @head: the list to test. 
+ */ +static inline int list_empty(const struct list_head *head) +{ + struct list_head *temp; + READ_ONCE(head->next, temp); + return temp == head; +} + +/** + * list_empty_careful - tests whether a list is empty and not being modified + * @head: the list to test + * + * Description: + * tests whether a list is empty _and_ checks that no other CPU might be + * in the process of modifying either member (next or prev) + * + * NOTE: using list_empty_careful() without synchronization + * can only be safe if the only activity that can happen + * to the list entry is list_del_init(). Eg. it cannot be used + * if another CPU could re-list_add() it. + */ +static inline int list_empty_careful(const struct list_head *head) +{ + struct list_head *next = head->next; + return (next == head) && (next == head->prev); +} + +/** + * list_rotate_left - rotate the list to the left + * @head: the head of the list + */ +static inline void list_rotate_left(struct list_head *head) +{ + struct list_head *first; + + if (!list_empty(head)) { + first = head->next; + list_move_tail(first, head); + } +} + +/** + * list_is_singular - tests whether a list has just one entry. + * @head: the list to test. + */ +static inline int list_is_singular(const struct list_head *head) +{ + return !list_empty(head) && (head->next == head->prev); +} + +static inline void __list_cut_position(struct list_head *list, + struct list_head *head, struct list_head *entry) +{ + struct list_head *new___first = entry->next; + list->next = head->next; + list->next->prev = list; + list->prev = entry; + entry->next = list; + head->next = new___first; + new___first->prev = head; +} + +/** + * list_cut_position - cut a list into two + * @list: a new__ list to add all removed entries + * @head: a list with entries + * @entry: an entry within head, could be the head itself + * and if so we won't cut the list + * + * This helper moves the initial part of @head, up to and + * including @entry, from @head to @list. You should + * pass on @entry an element you know is on @head. @list + * should be an empty list or a list you do not care about + * losing its data. + * + */ +static inline void list_cut_position(struct list_head *list, + struct list_head *head, struct list_head *entry) +{ + if (list_empty(head)) + return; + if (list_is_singular(head) && + (head->next != entry && head != entry)) + return; + if (entry == head) + INIT_LIST_HEAD(list); + else + __list_cut_position(list, head, entry); +} + +static inline void __list_splice(const struct list_head *list, + struct list_head *prev, + struct list_head *next) +{ + struct list_head *first = list->next; + struct list_head *last = list->prev; + + first->prev = prev; + prev->next = first; + + last->next = next; + next->prev = last; +} + +/** + * list_splice - join two lists, this is designed for stacks + * @list: the new__ list to add. + * @head: the place to add it in the first list. + */ +static inline void list_splice(const struct list_head *list, + struct list_head *head) +{ + if (!list_empty(list)) + __list_splice(list, head, head->next); +} + +/** + * list_splice_tail - join two lists, each list being a queue + * @list: the new__ list to add. + * @head: the place to add it in the first list. + */ +static inline void list_splice_tail(struct list_head *list, + struct list_head *head) +{ + if (!list_empty(list)) + __list_splice(list, head->prev, head); +} + +/** + * list_splice_init - join two lists and reinitialise the emptied list. + * @list: the new__ list to add. 
+ * @head: the place to add it in the first list. + * + * The list at @list is reinitialised + */ +static inline void list_splice_init(struct list_head *list, + struct list_head *head) +{ + if (!list_empty(list)) { + __list_splice(list, head, head->next); + INIT_LIST_HEAD(list); + } +} + +/** + * list_splice_tail_init - join two lists and reinitialise the emptied list + * @list: the new__ list to add. + * @head: the place to add it in the first list. + * + * Each of the lists is a queue. + * The list at @list is reinitialised + */ +static inline void list_splice_tail_init(struct list_head *list, + struct list_head *head) +{ + if (!list_empty(list)) { + __list_splice(list, head->prev, head); + INIT_LIST_HEAD(list); + } +} + +/** + * list_entry - get the struct for this entry + * @ptr: the &struct list_head pointer. + * @type: the type of the struct this is embedded in. + * @member: the name of the list_head within the struct. + */ +#define list_entry(ptr, type, member) \ + container_of(ptr, type, member) + +/** + * list_first_entry - get the first element from a list + * @ptr: the list head to take the element from. + * @type: the type of the struct this is embedded in. + * @member: the name of the list_head within the struct. + * + * Note, that list is expected to be not empty. + */ +#define list_first_entry(ptr, type, member) \ + list_entry((ptr)->next, type, member) + +/** + * list_last_entry - get the last element from a list + * @ptr: the list head to take the element from. + * @type: the type of the struct this is embedded in. + * @member: the name of the list_head within the struct. + * + * Note, that list is expected to be not empty. + */ +#define list_last_entry(ptr, type, member) \ + list_entry((ptr)->prev, type, member) + +/** + * list_first_entry_or_null - get the first element from a list + * @ptr: the list head to take the element from. + * @type: the type of the struct this is embedded in. + * @member: the name of the list_head within the struct. + * + * Note that if the list is empty, it returns NULL. + */ +#define list_first_entry_or_null(ptr, type, member) ({ \ + struct list_head *head__ = (ptr); \ + struct list_head *pos__ = READ_ONCE(head__->next); \ + pos__ != head__ ? list_entry(pos__, type, member) : NULL; \ +}) + +/** + * list_next_entry - get the next element in list + * @pos: the type * to cursor + * @member: the name of the list_head within the struct. + */ +#define list_next_entry(pos, member) \ + list_entry((pos)->member.next, typeof(*(pos)), member) + +/** + * list_prev_entry - get the prev element in list + * @pos: the type * to cursor + * @member: the name of the list_head within the struct. + */ +#define list_prev_entry(pos, member) \ + list_entry((pos)->member.prev, typeof(*(pos)), member) + +/** + * list_for_each - iterate over a list + * @pos: the &struct list_head to use as a loop cursor. + * @head: the head for your list. + */ +#define list_for_each(pos, head) \ + for (pos = (head)->next; pos != (head); pos = pos->next) + +/** + * list_for_each_prev - iterate over a list backwards + * @pos: the &struct list_head to use as a loop cursor. + * @head: the head for your list. + */ +#define list_for_each_prev(pos, head) \ + for (pos = (head)->prev; pos != (head); pos = pos->prev) + +/** + * list_for_each_safe - iterate over a list safe against removal of list entry + * @pos: the &struct list_head to use as a loop cursor. + * @n: another &struct list_head to use as temporary storage + * @head: the head for your list. 
+ */ +#define list_for_each_safe(pos, n, head) \ + for (pos = (head)->next, n = pos->next; pos != (head); \ + pos = n, n = pos->next) + +/** + * list_for_each_prev_safe - iterate over a list backwards safe against removal of list entry + * @pos: the &struct list_head to use as a loop cursor. + * @n: another &struct list_head to use as temporary storage + * @head: the head for your list. + */ +#define list_for_each_prev_safe(pos, n, head) \ + for (pos = (head)->prev, n = pos->prev; \ + pos != (head); \ + pos = n, n = pos->prev) + +/** + * list_for_each_entry - iterate over list of given type + * @pos: the type * to use as a loop cursor. + * @head: the head for your list. + * @member: the name of the list_head within the struct. + */ +#define list_for_each_entry(pos, head, member) \ + for (pos = list_first_entry(head, typeof(*pos), member); \ + &pos->member != (head); \ + pos = list_next_entry(pos, member)) + +/** + * list_for_each_entry_reverse - iterate backwards over list of given type. + * @pos: the type * to use as a loop cursor. + * @head: the head for your list. + * @member: the name of the list_head within the struct. + */ +#define list_for_each_entry_reverse(pos, head, member) \ + for (pos = list_last_entry(head, typeof(*pos), member); \ + &pos->member != (head); \ + pos = list_prev_entry(pos, member)) + +/** + * list_prepare_entry - prepare a pos entry for use in list_for_each_entry_continue() + * @pos: the type * to use as a start point + * @head: the head of the list + * @member: the name of the list_head within the struct. + * + * Prepares a pos entry for use as a start point in list_for_each_entry_continue(). + */ +#define list_prepare_entry(pos, head, member) \ + ((pos) ? (pos) : list_entry(head, typeof(*pos), member)) + +/** + * list_for_each_entry_continue - continue iteration over list of given type + * @pos: the type * to use as a loop cursor. + * @head: the head for your list. + * @member: the name of the list_head within the struct. + * + * Continue to iterate over list of given type, continuing after + * the current position. + */ +#define list_for_each_entry_continue(pos, head, member) \ + for (pos = list_next_entry(pos, member); \ + &pos->member != (head); \ + pos = list_next_entry(pos, member)) + +/** + * list_for_each_entry_continue_reverse - iterate backwards from the given point + * @pos: the type * to use as a loop cursor. + * @head: the head for your list. + * @member: the name of the list_head within the struct. + * + * Start to iterate over list of given type backwards, continuing after + * the current position. + */ +#define list_for_each_entry_continue_reverse(pos, head, member) \ + for (pos = list_prev_entry(pos, member); \ + &pos->member != (head); \ + pos = list_prev_entry(pos, member)) + +/** + * list_for_each_entry_from - iterate over list of given type from the current point + * @pos: the type * to use as a loop cursor. + * @head: the head for your list. + * @member: the name of the list_head within the struct. + * + * Iterate over list of given type, continuing from current position. + */ +#define list_for_each_entry_from(pos, head, member) \ + for (; &pos->member != (head); \ + pos = list_next_entry(pos, member)) + +/** + * list_for_each_entry_safe - iterate over list of given type safe against removal of list entry + * @pos: the type * to use as a loop cursor. + * @n: another type * to use as temporary storage + * @head: the head for your list. + * @member: the name of the list_head within the struct. 
+ */ +#define list_for_each_entry_safe(pos, n, head, member) \ + for (pos = list_first_entry(head, typeof(*pos), member), \ + n = list_next_entry(pos, member); \ + &pos->member != (head); \ + pos = n, n = list_next_entry(n, member)) + +/** + * list_for_each_entry_safe_continue - continue list iteration safe against removal + * @pos: the type * to use as a loop cursor. + * @n: another type * to use as temporary storage + * @head: the head for your list. + * @member: the name of the list_head within the struct. + * + * Iterate over list of given type, continuing after current point, + * safe against removal of list entry. + */ +#define list_for_each_entry_safe_continue(pos, n, head, member) \ + for (pos = list_next_entry(pos, member), \ + n = list_next_entry(pos, member); \ + &pos->member != (head); \ + pos = n, n = list_next_entry(n, member)) + +/** + * list_for_each_entry_safe_from - iterate over list from current point safe against removal + * @pos: the type * to use as a loop cursor. + * @n: another type * to use as temporary storage + * @head: the head for your list. + * @member: the name of the list_head within the struct. + * + * Iterate over list of given type from current point, safe against + * removal of list entry. + */ +#define list_for_each_entry_safe_from(pos, n, head, member) \ + for (n = list_next_entry(pos, member); \ + &pos->member != (head); \ + pos = n, n = list_next_entry(n, member)) + +/** + * list_for_each_entry_safe_reverse - iterate backwards over list safe against removal + * @pos: the type * to use as a loop cursor. + * @n: another type * to use as temporary storage + * @head: the head for your list. + * @member: the name of the list_head within the struct. + * + * Iterate backwards over list of given type, safe against removal + * of list entry. + */ +#define list_for_each_entry_safe_reverse(pos, n, head, member) \ + for (pos = list_last_entry(head, typeof(*pos), member), \ + n = list_prev_entry(pos, member); \ + &pos->member != (head); \ + pos = n, n = list_prev_entry(n, member)) + +/** + * list_safe_reset_next - reset a stale list_for_each_entry_safe loop + * @pos: the loop cursor used in the list_for_each_entry_safe loop + * @n: temporary storage used in list_for_each_entry_safe + * @member: the name of the list_head within the struct. + * + * list_safe_reset_next is not safe to use in general if the list may be + * modified concurrently (eg. the lock is dropped in the loop body). An + * exception to this is if the cursor element (pos) is pinned in the list, + * and list_safe_reset_next is called after re-taking the lock and before + * completing the current iteration of the loop body. + */ +#define list_safe_reset_next(pos, n, member) \ + n = list_next_entry(pos, member) + +/* + * Double linked lists with a single pointer list head. + * Mostly useful for hash tables where the two pointer list head is + * too wasteful. + * You lose the ability to access the tail in O(1). 
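Before the hash-list variant below, a usage sketch of the list_head API defined above. The struct my_node element type and the list_usage_sketch helper are hypothetical names, not part of this header; the LIST_ENTRY_TYPE_INFO define/undef pattern around the iteration macros is the same one ntkrutils.c uses later in this patch, since MSVC C has no typeof.

struct my_node {                            /* hypothetical element type */
	int value;
	struct list_head link;              /* embedded list linkage */
};

static void list_usage_sketch(void)
{
	struct list_head queue;
	struct my_node a = { 1 }, b = { 2 };
	struct my_node *pos, *n;

	INIT_LIST_HEAD(&queue);             /* empty head points to itself */
	list_add_tail(&a.link, &queue);     /* queue-style insertion at the tail */
	list_add_tail(&b.link, &queue);

	/* MSVC has no typeof, so name the element type for the entry macros. */
#define LIST_ENTRY_TYPE_INFO struct my_node
	list_for_each_entry_safe(pos, n, &queue, link)
		list_del(&pos->link);       /* the _safe variant tolerates removal */
#undef LIST_ENTRY_TYPE_INFO
}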
+ */ + +#define HLIST_HEAD_INIT { .first = NULL } +#define HLIST_HEAD(name) struct hlist_head name = { .first = NULL } +#define INIT_HLIST_HEAD(ptr) ((ptr)->first = NULL) +static inline void INIT_HLIST_NODE(struct hlist_node *h) +{ + h->next = NULL; + h->pprev = NULL; +} + +static inline int hlist_unhashed(const struct hlist_node *h) +{ + return !h->pprev; +} + +static inline int hlist_empty(const struct hlist_head *h) +{ + struct hlist_node *temp; + READ_ONCE(h->first, temp); + return !temp; +} + +static inline void __hlist_del(struct hlist_node *n) +{ + struct hlist_node *next = n->next; + struct hlist_node **pprev = n->pprev; + + WRITE_ONCE(*pprev, next); + if (next) + next->pprev = pprev; +} + +static inline void hlist_del(struct hlist_node *n) +{ + __hlist_del(n); + n->next = LIST_POISON1; + n->pprev = LIST_POISON2; +} + +static inline void hlist_del_init(struct hlist_node *n) +{ + if (!hlist_unhashed(n)) { + __hlist_del(n); + INIT_HLIST_NODE(n); + } +} + +static inline void hlist_add_head(struct hlist_node *n, struct hlist_head *h) +{ + struct hlist_node *first = h->first; + n->next = first; + if (first) + first->pprev = &n->next; + WRITE_ONCE(h->first, n); + n->pprev = &h->first; +} + +/* next must be != NULL */ +static inline void hlist_add_before(struct hlist_node *n, + struct hlist_node *next) +{ + n->pprev = next->pprev; + n->next = next; + next->pprev = &n->next; + WRITE_ONCE(*(n->pprev), n); +} + +static inline void hlist_add_behind(struct hlist_node *n, + struct hlist_node *prev) +{ + n->next = prev->next; + WRITE_ONCE(prev->next, n); + n->pprev = &prev->next; + + if (n->next) + n->next->pprev = &n->next; +} + +/* after that we'll appear to be on some hlist and hlist_del will work */ +static inline void hlist_add_fake(struct hlist_node *n) +{ + n->pprev = &n->next; +} + +static inline bool hlist_fake(struct hlist_node *h) +{ + return h->pprev == &h->next; +} + +/* + * Check whether the node is the only node of the head without + * accessing head: + */ +static inline bool +hlist_is_singular_node(struct hlist_node *n, struct hlist_head *h) +{ + return !n->next && n->pprev == &h->first; +} + +/* + * Move a list from one list head to another. Fixup the pprev + * reference of the first entry if it exists. + */ +static inline void hlist_move_list(struct hlist_head *old, + struct hlist_head *new__) +{ + new__->first = old->first; + if (new__->first) + new__->first->pprev = &new__->first; + old->first = NULL; +} + +#define hlist_entry(ptr, type, member) container_of(ptr,type,member) + +#define hlist_for_each(pos, head) \ + for (pos = (head)->first; pos ; pos = pos->next) + +#define hlist_for_each_safe(pos, n, head) \ + for (pos = (head)->first; pos && ({ n = pos->next; 1; }); \ + pos = n) + +#if 0 +#define hlist_entry_safe(ptr, type, member) \ + ({ typeof(ptr) ____ptr = (ptr); \ + ____ptr ? hlist_entry(____ptr, type, member) : NULL; \ + }) +#endif +#define hlist_entry_safe(ptr, type, member) \ + ( (ptr) ? hlist_entry(ptr, type, member) : NULL ) + +/** + * hlist_for_each_entry - iterate over list of given type + * @pos: the type * to use as a loop cursor. + * @head: the head for your list. + * @member: the name of the hlist_node within the struct. 
+ */ +#define hlist_for_each_entry(pos, head, member) \ + for (pos = hlist_entry_safe((head)->first, typeof(*(pos)), member);\ + pos; \ + pos = hlist_entry_safe((pos)->member.next, typeof(*(pos)), member)) + +/** + * hlist_for_each_entry_continue - iterate over a hlist continuing after current point + * @pos: the type * to use as a loop cursor. + * @member: the name of the hlist_node within the struct. + */ +#define hlist_for_each_entry_continue(pos, member) \ + for (pos = hlist_entry_safe((pos)->member.next, typeof(*(pos)), member);\ + pos; \ + pos = hlist_entry_safe((pos)->member.next, typeof(*(pos)), member)) + +/** + * hlist_for_each_entry_from - iterate over a hlist continuing from current point + * @pos: the type * to use as a loop cursor. + * @member: the name of the hlist_node within the struct. + */ +#define hlist_for_each_entry_from(pos, member) \ + for (; pos; \ + pos = hlist_entry_safe((pos)->member.next, typeof(*(pos)), member)) + +/** + * hlist_for_each_entry_safe - iterate over list of given type safe against removal of list entry + * @pos: the type * to use as a loop cursor. + * @n: another &struct hlist_node to use as temporary storage + * @head: the head for your list. + * @member: the name of the hlist_node within the struct. + */ +#define hlist_for_each_entry_safe(pos, n, head, member) \ + for (pos = hlist_entry_safe((head)->first, typeof(*pos), member);\ + pos; \ + pos = hlist_entry_safe(n, typeof(*pos), member)) + +#endif diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h new file mode 100755 index 0000000..9213889 --- /dev/null +++ b/include/uapi/linux/kvm.h @@ -0,0 +1,567 @@ +/* + * Copyright 2019 Google LLC + */ + +#ifndef __LINUX_KVM_H +#define __LINUX_KVM_H + +/* + * Userspace interface for /dev/kvm - kernel based virtual machine + * + * Note: you must update GVM_API_VERSION if you change this interface. + */ + +#include <uapi/asm/kvm.h> +#include <gvm_types.h> +#include <gvm_ver.h> + +/* for GVM_CREATE_MEMORY_REGION */ +struct kvm_memory_region { + __u32 slot; + __u32 flags; + __u64 guest_phys_addr; + __u64 memory_size; /* bytes */ +}; + +/* for GVM_SET_USER_MEMORY_REGION */ +struct kvm_userspace_memory_region { + __u32 slot; + __u32 flags; + __u64 guest_phys_addr; + __u64 memory_size; /* bytes */ + __u64 userspace_addr; /* start of the userspace allocated memory */ +}; + +/* + * The bit 0 ~ bit 15 of kvm_memory_region::flags are visible for userspace, + * other bits are reserved for kvm internal use which are defined in + * include/linux/kvm_host.h. + */ +#define GVM_MEM_LOG_DIRTY_PAGES (1ULL << 0) +#define GVM_MEM_READONLY (1ULL << 1) + +/* for GVM_IRQ_LINE */ +struct kvm_irq_level { + /* + * ACPI gsi notion of irq. + * For IA-64 (APIC model) IOAPIC0: irq 0-23; IOAPIC1: irq 24-47.. + * For X86 (standard AT mode) PIC0/1: irq 0-15. IOAPIC0: 0-23.. 
+ * For ARM: See Documentation/virtual/kvm/api.txt + */ + union { + __u32 irq; + __s32 status; + }; + __u32 level; +}; + + +struct kvm_irqchip { + __u32 chip_id; + __u32 pad; + union { + char dummy[512]; /* reserving space */ + struct kvm_pic_state pic; + struct kvm_ioapic_state ioapic; + } chip; +}; + +#define GVM_EXIT_UNKNOWN 0 +#define GVM_EXIT_EXCEPTION 1 +#define GVM_EXIT_IO 2 +#define GVM_EXIT_HYPERCALL 3 +#define GVM_EXIT_DEBUG 4 +#define GVM_EXIT_HLT 5 +#define GVM_EXIT_MMIO 6 +#define GVM_EXIT_IRQ_WINDOW_OPEN 7 +#define GVM_EXIT_SHUTDOWN 8 +#define GVM_EXIT_FAIL_ENTRY 9 +#define GVM_EXIT_INTR 10 +#define GVM_EXIT_SET_TPR 11 +#define GVM_EXIT_TPR_ACCESS 12 +#define GVM_EXIT_NMI 16 +#define GVM_EXIT_INTERNAL_ERROR 17 +#define GVM_EXIT_OSI 18 +#define GVM_EXIT_PAPR_HCALL 19 +#define GVM_EXIT_WATCHDOG 21 +#define GVM_EXIT_EPR 23 +#define GVM_EXIT_SYSTEM_EVENT 24 +#define GVM_EXIT_IOAPIC_EOI 26 +#define GVM_EXIT_RAM_PROT 27 + +/* For GVM_EXIT_INTERNAL_ERROR */ +/* Emulate instruction failed. */ +#define GVM_INTERNAL_ERROR_EMULATION 1 +/* Encounter unexpected simultaneous exceptions. */ +#define GVM_INTERNAL_ERROR_SIMUL_EX 2 +/* Encounter unexpected vm-exit due to delivery event. */ +#define GVM_INTERNAL_ERROR_DELIVERY_EV 3 + +/* for GVM_RUN, returned by mmap(vcpu_fd, offset=0) */ +struct kvm_run { + /* in */ + __u8 request_interrupt_window; + __u8 user_event_pending; + __u8 padding1[6]; + + /* out */ + __u32 exit_reason; + __u8 ready_for_interrupt_injection; + __u8 if_flag; + __u16 flags; + + /* in (pre_kvm_run), out (post_kvm_run) */ + __u64 cr8; + __u64 apic_base; + + union { + /* GVM_EXIT_UNKNOWN */ + struct { + __u64 hardware_exit_reason; + } hw; + /* GVM_EXIT_FAIL_ENTRY */ + struct { + __u64 hardware_entry_failure_reason; + } fail_entry; + /* GVM_EXIT_EXCEPTION */ + struct { + __u32 exception; + __u32 error_code; + } ex; + /* GVM_EXIT_IO */ + struct { +#define GVM_EXIT_IO_IN 0 +#define GVM_EXIT_IO_OUT 1 + __u8 direction; + __u8 size; /* bytes */ + __u16 port; + __u32 count; + __u64 data_offset; /* relative to kvm_run start */ + } io; + /* GVM_EXIT_DEBUG */ + struct { + struct kvm_debug_exit_arch arch; + } debug; + /* GVM_EXIT_MMIO */ + struct { + __u64 phys_addr; + __u8 data[8]; + __u32 len; + __u8 is_write; + } mmio; + /* GVM_EXIT_HYPERCALL */ + struct { + __u64 nr; + __u64 args[6]; + __u64 ret; + __u32 longmode; + __u32 pad; + } hypercall; + /* GVM_EXIT_TPR_ACCESS */ + struct { + __u64 rip; + __u32 is_write; + __u32 pad; + } tpr_access; + /* GVM_EXIT_INTERNAL_ERROR */ + struct { + __u32 suberror; + /* Available with GVM_CAP_INTERNAL_ERROR_DATA: */ + __u32 ndata; + __u64 data[16]; + } internal; + /* GVM_EXIT_OSI */ + struct { + __u64 gprs[32]; + } osi; + /* GVM_EXIT_PAPR_HCALL */ + struct { + __u64 nr; + __u64 ret; + __u64 args[9]; + } papr_hcall; + /* GVM_EXIT_EPR */ + struct { + __u32 epr; + } epr; + /* GVM_EXIT_SYSTEM_EVENT */ + struct { +#define GVM_SYSTEM_EVENT_SHUTDOWN 1 +#define GVM_SYSTEM_EVENT_RESET 2 +#define GVM_SYSTEM_EVENT_CRASH 3 + __u32 type; + __u64 flags; + } system_event; + /* GVM_EXIT_IOAPIC_EOI */ + struct { + __u8 vector; + } eoi; + /* GVM_EXIT_RAM_PROT */ + struct { + __u64 gfn; + } rp; + /* Fix the size of the union. */ + char padding[256]; + }; + + /* + * shared registers between kvm and userspace. 
+ * kvm_valid_regs specifies the register classes set by the host + * kvm_dirty_regs specified the register classes dirtied by userspace + * struct kvm_sync_regs is architecture specific, as well as the + * bits for kvm_valid_regs and kvm_dirty_regs + */ + __u64 kvm_valid_regs; + __u64 kvm_dirty_regs; + union { + struct kvm_sync_regs regs; + char padding[2048]; + } s; +}; + +/* for GVM_TRANSLATE */ +struct kvm_translation { + /* in */ + __u64 linear_address; + + /* out */ + __u64 physical_address; + __u8 valid; + __u8 writeable; + __u8 usermode; + __u8 pad[5]; +}; + +/* for GVM_INTERRUPT */ +struct kvm_interrupt { + /* in */ + __u32 irq; +}; + +/* for GVM_GET_DIRTY_LOG */ +struct kvm_dirty_log { + __u32 slot; + __u32 padding1; + union { + void __user *dirty_bitmap; /* one bit per page */ + __u64 padding2; + }; +}; + +/* for GVM_TPR_ACCESS_REPORTING */ +struct kvm_tpr_access_ctl { + __u32 enabled; + __u32 flags; + __u32 reserved[8]; +}; + +/* for GVM_SET_VAPIC_ADDR */ +struct kvm_vapic_addr { + __u64 vapic_addr; +}; + +/* for GVM_SET_MP_STATE */ + +/* not all states are valid on all architectures */ +#define GVM_MP_STATE_RUNNABLE 0 +#define GVM_MP_STATE_UNINITIALIZED 1 +#define GVM_MP_STATE_INIT_RECEIVED 2 +#define GVM_MP_STATE_HALTED 3 +#define GVM_MP_STATE_SIPI_RECEIVED 4 +#define GVM_MP_STATE_STOPPED 5 +#define GVM_MP_STATE_CHECK_STOP 6 +#define GVM_MP_STATE_OPERATING 7 +#define GVM_MP_STATE_LOAD 8 + +struct kvm_mp_state { + __u32 mp_state; +}; + +/* for GVM_SET_GUEST_DEBUG */ + +#define GVM_GUESTDBG_ENABLE 0x00000001 +#define GVM_GUESTDBG_SINGLESTEP 0x00000002 + +struct kvm_guest_debug { + __u32 control; + __u32 pad; + struct kvm_guest_debug_arch arch; +}; + +/* for GVM_ENABLE_CAP */ +struct kvm_enable_cap { + /* in */ + __u32 cap; + __u32 flags; + __u64 args[4]; + __u8 pad[64]; +}; + +#define KVMIO 0xAE + +/* + * ioctls for /dev/kvm fds: + */ +#define GVM_GET_API_VERSION _IO(KVMIO, 0x00) +#define GVM_CREATE_VM _IO(KVMIO, 0x01) /* returns a VM fd */ +#define GVM_GET_MSR_INDEX_LIST _IOWR(KVMIO, 0x02, struct kvm_msr_list) +/* + * Check if a kvm extension is available. Argument is extension number, + * return is 1 (yes) or 0 (no, sorry). + */ +#define GVM_CHECK_EXTENSION _IO(KVMIO, 0x03) +/* + * Get size for mmap(vcpu_fd) + */ +#define GVM_GET_VCPU_MMAP_SIZE _IO(KVMIO, 0x04) /* in bytes */ +#define GVM_GET_SUPPORTED_CPUID _IOWR(KVMIO, 0x05, struct kvm_cpuid) +#define GVM_GET_EMULATED_CPUID _IOWR(KVMIO, 0x09, struct kvm_cpuid) +/* + * Extension capability list. 
+ */ +#define GVM_CAP_IRQCHIP 0 +#define GVM_CAP_HLT 1 +#define GVM_CAP_MMU_SHADOW_CACHE_CONTROL 2 +#define GVM_CAP_VAPIC 6 +#define GVM_CAP_NR_VCPUS 9 /* returns recommended max vcpus per vm */ +#define GVM_CAP_NR_MEMSLOTS 10 /* returns max memory slots per vm */ +#define GVM_CAP_NOP_IO_DELAY 12 +#define GVM_CAP_SYNC_MMU 16 /* Changes to host mmap are reflected in guest */ +#define GVM_CAP_USER_NMI 22 +#ifdef __GVM_HAVE_GUEST_DEBUG +#define GVM_CAP_SET_GUEST_DEBUG 23 +#endif +#define GVM_CAP_REINJECT_CONTROL 24 +#define GVM_CAP_IRQ_ROUTING 25 +#define GVM_CAP_SET_BOOT_CPU_ID 34 +#define GVM_CAP_SET_IDENTITY_MAP_ADDR 37 +#ifdef __GVM_HAVE_VCPU_EVENTS +#define GVM_CAP_VCPU_EVENTS 41 +#endif +#define GVM_CAP_PCI_SEGMENT 47 +#define GVM_CAP_INTR_SHADOW 49 +#ifdef __GVM_HAVE_DEBUGREGS +#define GVM_CAP_DEBUGREGS 50 +#endif +#define GVM_CAP_X86_ROBUST_SINGLESTEP 51 +#define GVM_CAP_ENABLE_CAP 54 +#ifdef __GVM_HAVE_XSAVE +#define GVM_CAP_XSAVE 55 +#endif +#ifdef __GVM_HAVE_XCRS +#define GVM_CAP_XCRS 56 +#endif +#define GVM_CAP_MAX_VCPUS 66 /* returns max vcpus per vm */ +#define GVM_CAP_SW_TLB 69 +#define GVM_CAP_SYNC_REGS 74 +#define GVM_CAP_SIGNAL_MSI 77 +#define GVM_CAP_READONLY_MEM 81 +#define GVM_CAP_EXT_EMUL_CPUID 95 +#define GVM_CAP_IOAPIC_POLARITY_IGNORED 97 +#define GVM_CAP_ENABLE_CAP_VM 98 +#define GVM_CAP_VM_ATTRIBUTES 101 +#define GVM_CAP_DISABLE_QUIRKS 116 +#define GVM_CAP_X86_SMM 117 +#define GVM_CAP_MULTI_ADDRESS_SPACE 118 +#define GVM_CAP_GUEST_DEBUG_HW_BPS 119 +#define GVM_CAP_GUEST_DEBUG_HW_WPS 120 +#define GVM_CAP_VCPU_ATTRIBUTES 127 +#define GVM_CAP_MAX_VCPU_ID 128 + +struct kvm_irq_routing_irqchip { + __u32 irqchip; + __u32 pin; +}; + +struct kvm_irq_routing_msi { + __u32 address_lo; + __u32 address_hi; + __u32 data; + union { + __u32 pad; + __u32 devid; + }; +}; + +struct kvm_irq_routing_hv_sint { + __u32 vcpu; + __u32 sint; +}; + +/* gsi routing entry types */ +#define GVM_IRQ_ROUTING_IRQCHIP 1 +#define GVM_IRQ_ROUTING_MSI 2 + +struct kvm_irq_routing_entry { + __u32 gsi; + __u32 type; + __u32 flags; + __u32 pad; + union { + struct kvm_irq_routing_irqchip irqchip; + struct kvm_irq_routing_msi msi; + struct kvm_irq_routing_hv_sint hv_sint; + __u32 pad[8]; + } u; +}; + +struct kvm_irq_routing { + __u32 nr; + __u32 flags; + struct kvm_irq_routing_entry entries[0]; +}; + +/* For GVM_CAP_SW_TLB */ + +#define GVM_MMU_FSL_BOOKE_NOHV 0 +#define GVM_MMU_FSL_BOOKE_HV 1 + +struct kvm_config_tlb { + __u64 params; + __u64 array; + __u32 mmu_type; + __u32 array_len; +}; + +struct kvm_dirty_tlb { + __u64 bitmap; + __u32 num_dirty; +}; + +/* Available with GVM_CAP_ONE_REG */ + +#define GVM_REG_ARCH_MASK 0xff00000000000000ULL +#define GVM_REG_GENERIC 0x0000000000000000ULL + +/* + * Architecture specific registers are to be defined in arch headers and + * ORed with the arch identifier. 
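As a quick illustration of that encoding, using the GVM_REG_* masks defined just below: a 64-bit register id combines the architecture mask, a size mask, and an arch-specific index. The helper name and the index value here are hypothetical, and this header does not itself show a GET/SET_ONE_REG ioctl, so this is only a sketch of the bit layout.

/* Sketch: register id = arch mask | size mask | arch-specific index. */
static __u64 make_x86_u64_reg_id(__u64 index)   /* hypothetical helper */
{
	return GVM_REG_X86 | GVM_REG_SIZE_U64 | index;
}

A struct kvm_one_reg would then carry such an id in its id field together with the userspace address of the value buffer in addr.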
+ */ +#define GVM_REG_X86 0x2000000000000000ULL + +#define GVM_REG_SIZE_SHIFT 52 +#define GVM_REG_SIZE_MASK 0x00f0000000000000ULL +#define GVM_REG_SIZE_U8 0x0000000000000000ULL +#define GVM_REG_SIZE_U16 0x0010000000000000ULL +#define GVM_REG_SIZE_U32 0x0020000000000000ULL +#define GVM_REG_SIZE_U64 0x0030000000000000ULL +#define GVM_REG_SIZE_U128 0x0040000000000000ULL +#define GVM_REG_SIZE_U256 0x0050000000000000ULL +#define GVM_REG_SIZE_U512 0x0060000000000000ULL +#define GVM_REG_SIZE_U1024 0x0070000000000000ULL + +struct kvm_reg_list { + __u64 n; /* number of regs */ + __u64 reg[0]; +}; + +struct kvm_one_reg { + __u64 id; + __u64 addr; +}; + +#define GVM_MSI_VALID_DEVID (1U << 0) +struct kvm_msi { + __u32 address_lo; + __u32 address_hi; + __u32 data; + __u32 flags; + __u32 devid; + __u8 pad[12]; +}; + +#define RP_NOACCESS 0 +#define RP_RDWREX 7 +struct gvm_ram_protect { + __u64 pa; + __u64 size; + __u32 flags; + __u32 reserved; +}; + +/* + * ioctls for VM fds + */ +#define GVM_SET_MEMORY_REGION _IOW(KVMIO, 0x40, struct kvm_memory_region) +/* + * GVM_CREATE_VCPU receives as a parameter the vcpu slot, and returns + * a vcpu fd. + */ +#define GVM_CREATE_VCPU _IO(KVMIO, 0x41) +#define GVM_GET_DIRTY_LOG _IOW(KVMIO, 0x42, struct kvm_dirty_log) +/* GVM_SET_MEMORY_ALIAS is obsolete: */ +#define GVM_SET_MEMORY_ALIAS _IOW(KVMIO, 0x43, struct kvm_memory_alias) +#define GVM_SET_NR_MMU_PAGES _IO(KVMIO, 0x44) +#define GVM_GET_NR_MMU_PAGES _IO(KVMIO, 0x45) +#define GVM_SET_USER_MEMORY_REGION _IOW(KVMIO, 0x46, \ + struct kvm_userspace_memory_region) +#define GVM_SET_TSS_ADDR _IO(KVMIO, 0x47) +#define GVM_SET_IDENTITY_MAP_ADDR _IOW(KVMIO, 0x48, __u64) +#define GVM_KICK_VCPU _IO(KVMIO, 0x49) +#define GVM_RAM_PROTECT _IOW(KVMIO, 0x50, struct gvm_ram_protect) + +/* Device model IOC */ +#define GVM_CREATE_IRQCHIP _IO(KVMIO, 0x60) +#define GVM_GET_IRQCHIP _IOWR(KVMIO, 0x62, struct kvm_irqchip) +#define GVM_SET_IRQCHIP _IOR(KVMIO, 0x63, struct kvm_irqchip) +#define GVM_IRQ_LINE_STATUS _IOWR(KVMIO, 0x67, struct kvm_irq_level) +#define GVM_SET_GSI_ROUTING _IOW(KVMIO, 0x6a, struct kvm_irq_routing) +#define GVM_SET_BOOT_CPU_ID _IO(KVMIO, 0x78) +/* Available with GVM_CAP_SIGNAL_MSI */ +#define GVM_SIGNAL_MSI _IOW(KVMIO, 0xa5, struct kvm_msi) + +/* + * ioctls for vcpu fds + */ +#define GVM_RUN _IO(KVMIO, 0x80) +#define GVM_VCPU_MMAP _IO(KVMIO, 0x87) +#define GVM_GET_REGS _IOR(KVMIO, 0x81, struct kvm_regs) +#define GVM_SET_REGS _IOW(KVMIO, 0x82, struct kvm_regs) +#define GVM_GET_SREGS _IOR(KVMIO, 0x83, struct kvm_sregs) +#define GVM_SET_SREGS _IOW(KVMIO, 0x84, struct kvm_sregs) +#define GVM_TRANSLATE _IOWR(KVMIO, 0x85, struct kvm_translation) +#define GVM_INTERRUPT _IOW(KVMIO, 0x86, struct kvm_interrupt) +#define GVM_GET_MSRS _IOWR(KVMIO, 0x88, struct kvm_msrs) +#define GVM_SET_MSRS _IOW(KVMIO, 0x89, struct kvm_msrs) +#define GVM_GET_FPU _IOR(KVMIO, 0x8c, struct kvm_fpu) +#define GVM_SET_FPU _IOW(KVMIO, 0x8d, struct kvm_fpu) +#define GVM_GET_LAPIC _IOR(KVMIO, 0x8e, struct kvm_lapic_state) +#define GVM_SET_LAPIC _IOW(KVMIO, 0x8f, struct kvm_lapic_state) +#define GVM_SET_CPUID _IOW(KVMIO, 0x90, struct kvm_cpuid) +#define GVM_GET_CPUID _IOWR(KVMIO, 0x91, struct kvm_cpuid) +/* Available with GVM_CAP_VAPIC */ +#define GVM_TPR_ACCESS_REPORTING _IOWR(KVMIO, 0x92, struct kvm_tpr_access_ctl) +/* Available with GVM_CAP_VAPIC */ +#define GVM_SET_VAPIC_ADDR _IOW(KVMIO, 0x93, struct kvm_vapic_addr) +#define GVM_GET_MP_STATE _IOR(KVMIO, 0x98, struct kvm_mp_state) +#define GVM_SET_MP_STATE _IOW(KVMIO, 0x99, struct kvm_mp_state) +/* 
Available with GVM_CAP_USER_NMI */ +#define GVM_NMI _IO(KVMIO, 0x9a) +/* Available with GVM_CAP_SET_GUEST_DEBUG */ +#define GVM_SET_GUEST_DEBUG _IOW(KVMIO, 0x9b, struct kvm_guest_debug) +/* Available with GVM_CAP_VCPU_EVENTS */ +#define GVM_GET_VCPU_EVENTS _IOR(KVMIO, 0x9f, struct kvm_vcpu_events) +#define GVM_SET_VCPU_EVENTS _IOW(KVMIO, 0xa0, struct kvm_vcpu_events) +/* Available with GVM_CAP_DEBUGREGS */ +#define GVM_GET_DEBUGREGS _IOR(KVMIO, 0xa1, struct kvm_debugregs) +#define GVM_SET_DEBUGREGS _IOW(KVMIO, 0xa2, struct kvm_debugregs) +/* + * vcpu version available with GVM_ENABLE_CAP + * vm version available with GVM_CAP_ENABLE_CAP_VM + */ +#define GVM_ENABLE_CAP _IOW(KVMIO, 0xa3, struct kvm_enable_cap) +/* Available with GVM_CAP_XSAVE */ +#define GVM_GET_XSAVE _IOR(KVMIO, 0xa4, struct kvm_xsave) +#define GVM_SET_XSAVE _IOW(KVMIO, 0xa5, struct kvm_xsave) +/* Available with GVM_CAP_XCRS */ +#define GVM_GET_XCRS _IOR(KVMIO, 0xa6, struct kvm_xcrs) +#define GVM_SET_XCRS _IOW(KVMIO, 0xa7, struct kvm_xcrs) +/* Available with GVM_CAP_SW_TLB */ +#define GVM_DIRTY_TLB _IOW(KVMIO, 0xaa, struct kvm_dirty_tlb) +/* Available with GVM_CAP_X86_SMM */ +#define GVM_SMI _IO(KVMIO, 0xb7) + +#define GVM_X2APIC_API_USE_32BIT_IDS (1ULL << 0) +#define GVM_X2APIC_API_DISABLE_BROADCAST_QUIRK (1ULL << 1) + +#endif /* __LINUX_KVM_H */ diff --git a/ntkrutils.c b/ntkrutils.c new file mode 100644 index 0000000..2509940 --- /dev/null +++ b/ntkrutils.c @@ -0,0 +1,599 @@ +/* + * Copyright 2019 Google LLC + + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 2 as published by the Free Software Foundation. + + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ */ + +#include <ntddk.h> +#include <gvm_types.h> +#include <ntkrutils.h> +#include <linux/list.h> + +LIST_HEAD(global_malloc_list); +DEFINE_SPINLOCK(global_malloc_lock); +struct page** pglist; +DEFINE_SPINLOCK(global_page_lock); + +int CPU_HAS_X86_FEATURE_XSAVE; +int CPU_HAS_X86_FEATURE_PKU; +int CPU_HAS_X86_FEATURE_GBPAGES; +int CPU_HAS_X86_FEATURE_HLE; +int CPU_HAS_X86_FEATURE_RTM; +int CPU_HAS_X86_FEATURE_NX; +int CPU_HAS_X86_FEATURE_FXSR_OPT; +int CPU_HAS_X86_FEATURE_NPT; +int CPU_HAS_X86_FEATURE_AVIC; +int CPU_HAS_X86_FEATURE_DECODEASSISTS; +int CPU_HAS_X86_FEATURE_RDTSCP; +int CPU_HAS_X86_FEATURE_LBRV; +int CPU_HAS_X86_FEATURE_NRIPS; +int CPU_HAS_X86_FEATURE_SMEP; +int CPU_HAS_X86_FEATURE_MPX; +int CPU_HAS_X86_FEATURE_XSAVES; +int CPU_HAS_X86_FEATURE_CONSTANT_TSC; +int CPU_HAS_X86_BUG_AMD_TLB_MMATCH; +int CPU_HAS_X86_FEATURE_FLUSHBYASID; +int CPU_HAS_X86_FEATURE_OSVW; +int CPU_HAS_X86_FEATURE_SVM; + +struct cpumask __cpu_online_mask; +struct cpumask *cpu_online_mask = &__cpu_online_mask; +unsigned int cpu_online_count; +u64 max_pagen; + +DEFINE_PER_CPU(struct cpu_getput_cxt, cpu_getput_cxt); + +typedef struct _KAFFINITY_EX { + uint16_t Count; + uint16_t Size; + uint32_t Padding; + uint64_t bitmap[20]; +} KAFFINITYEX, *PKAFFINITYEX; + +typedef void (NTAPI *PFNHALREQUESTIPI)(uint32_t, PKAFFINITYEX); +typedef void (NTAPI *PFNKEINITIALIZEAFFINITYEX)(PKAFFINITYEX); +typedef void (NTAPI *PFNKEADDPROCESSORAFFINITYEX)(PKAFFINITYEX, uint32_t); + +PFNHALREQUESTIPI pHalRequestIpi; +PFNKEINITIALIZEAFFINITYEX pKeInitializeAffinityEx; +PFNKEADDPROCESSORAFFINITYEX pKeAddProcessorAffinityEx; + +// Fix me: We assume there is not cpu online at this time + +NTSTATUS gvmGetCpuOnlineMap(void) +{ + NTSTATUS rc; + SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX *inf = NULL; + PPROCESSOR_GROUP_INFO pginf = NULL; + PROCESSOR_NUMBER pn; + ULONG buffSize = 0; + u32 ig; + u32 ip; + u32 cpuIndex; + + cpu_online_count = KeQueryActiveProcessorCountEx(ALL_PROCESSOR_GROUPS); + + rc = KeQueryLogicalProcessorRelationship(NULL, + RelationGroup, NULL, &buffSize); + NT_ASSERT(rc == STATUS_INFO_LENGTH_MISMATCH); + + inf = ExAllocatePoolWithTag(NonPagedPool, buffSize, GVM_POOL_TAG); + + if (!inf) + return STATUS_INSUFFICIENT_RESOURCES; + + rc = KeQueryLogicalProcessorRelationship(NULL, RelationGroup, + inf, &buffSize); + + if (!NT_SUCCESS(rc)) + goto mapout; + + for (ig = 0; NT_SUCCESS(rc) && ig < inf->Group.ActiveGroupCount; ig++) { + pginf = &inf->Group.GroupInfo[ig]; + + for (ip = 0; ip < pginf->MaximumProcessorCount; ip++) { + pn.Group = ig; + pn.Number = ip; + pn.Reserved = 0; + + cpuIndex = KeGetProcessorIndexFromNumber(&pn); + + if (cpuIndex == INVALID_PROCESSOR_INDEX) { + DbgPrint("Cannot find CPU Index for processor \ + in group %d[%d", ig, ip); + continue; + } + + if (test_bit(ip, &pginf->ActiveProcessorMask)) + cpumask_set_cpu(cpuIndex, cpu_online_mask); + else + DbgPrint("Processor %d inside group %d[%d] \ + is not active", cpuIndex, ig, ip); + } + } + +mapout: + ExFreePoolWithTag(inf, GVM_POOL_TAG); + return rc; +} + +/* + Timer Stuffs + */ +void timer_dpc_fn(struct _KDPC *Dpc, + PVOID DeferredContext, + PVOID SystemArgument1, + PVOID SystemArgument2) +{ + struct hrtimer *timer = (struct hrtimer*)DeferredContext; + enum hrtimer_restart ret = timer->function(timer); + if(ret == HRTIMER_RESTART) + hrtimer_restart(timer); +} + +void hrtimer_init(struct hrtimer *timer, clockid_t clock_id, enum hrtimer_mode mode) +{ + KeInitializeTimerEx(&timer->ktimer, SynchronizationTimer); + timer->base = &timer->base_hack; + 
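	/*
	 * Emulation note: each Linux hrtimer is backed here by a Windows
	 * KTIMER plus a DPC. timer_dpc_fn above re-arms the KTIMER via
	 * hrtimer_restart() whenever the callback returns HRTIMER_RESTART,
	 * and timer->base is pointed at the embedded base_hack so that code
	 * dereferencing timer->base->get_time keeps working.
	 */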
timer->base->get_time = ktime_get; + KeInitializeDpc(&timer->kdpc, (PKDEFERRED_ROUTINE)timer_dpc_fn, timer); +} + +int hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode) +{ + int r; + // We only emulate hrtimer mode that KVM uses + ASSERTMSG("Unsupported hrtimer mode", mode == HRTIMER_MODE_ABS_PINNED); + timer->due_time.QuadPart = ktime_to_ns(tim); + timer->node.expires = tim; + do_div(&(u64)timer->due_time.QuadPart, 100); + r = (int)KeSetTimer(&timer->ktimer, timer->due_time, &timer->kdpc); + return r; +} + +int hrtimer_cancel(struct hrtimer *timer) +{ + int r; + r = KeCancelTimer(&timer->ktimer); + return r; +} + +int hrtimer_restart(struct hrtimer* timer) +{ + int r; + //timer->due_time.QuadPart = (ktime_to_ns(ktime_get()) - ktime_to_ns(timer->node.expires)) / 100; + timer->due_time.QuadPart = ktime_to_ns(timer->node.expires); + do_div(&(u64)timer->due_time.QuadPart, 100); + r = (int)KeSetTimer(&timer->ktimer, timer->due_time, &timer->kdpc); + return r; +} + +struct list_head gvm_mmap_list; +DEFINE_RAW_SPINLOCK(gvm_mmap_lock); + +size_t vm_mmap(struct file *notused, size_t addr, size_t len, size_t prot, + size_t flag, size_t offset) +{ + return __vm_mmap(notused, addr, len, prot, flag, offset, 0); +} + +size_t __declspec(noinline) __vm_mmap(struct file *notused, size_t addr, + size_t len, size_t prot, size_t flag, size_t offset, size_t keva) +{ + PMDL pMDL = NULL; + PVOID pMem = NULL; + PVOID UserVA = NULL; + struct gvm_mmap_node *node; + + node = ExAllocatePoolWithTag(NonPagedPool, + sizeof(struct gvm_mmap_node), + GVM_POOL_TAG); + if (!node) + return (size_t)NULL; + + if (keva) + pMem = (PVOID)keva; + else { + pMem = ExAllocatePoolWithTag(NonPagedPool, len, GVM_POOL_TAG); + if (!pMem) + goto free_node; + RtlZeroMemory(pMem, len); + } + + pMDL = IoAllocateMdl(pMem, len, FALSE, FALSE, NULL); + if (!pMDL) + goto free_pmem; + + MmBuildMdlForNonPagedPool(pMDL); + UserVA = MmMapLockedPagesSpecifyCache(pMDL, UserMode, MmCached, + 0, 0, NormalPagePriority); + + if (!UserVA) + goto free_mdl; + + node->UserVA = UserVA; + node->pMDL = pMDL; + node->pMem = pMem; + + raw_spin_lock(&gvm_mmap_lock); + list_add_tail(&node->list, &gvm_mmap_list); + raw_spin_unlock(&gvm_mmap_lock); + + return (size_t)UserVA; + + free_mdl: + IoFreeMdl(pMDL); + free_pmem: + if (keva) + ExFreePoolWithTag(pMem, GVM_POOL_TAG); + free_node: + ExFreePoolWithTag(node, GVM_POOL_TAG); + + return (size_t)NULL; +} + +int vm_munmap(size_t start, size_t len) +{ + return __vm_munmap(start, len, true); +} + +int __declspec(noinline) __vm_munmap(size_t start, size_t len, bool freepage) +{ + struct gvm_mmap_node *node = NULL; + int find = 0; + + raw_spin_lock(&gvm_mmap_lock); +#define LIST_ENTRY_TYPE_INFO struct gvm_mmap_node + list_for_each_entry(node, &gvm_mmap_list, list) + if (node->UserVA == (PVOID)start) { + find = 1; + break; + } +#undef LIST_ENTRY_TYPE_INFO + if (find) + list_del(&node->list); + raw_spin_unlock(&gvm_mmap_lock); + + if (!find) + return -1; + + BUG_ON(!node->UserVA); + BUG_ON(!node->pMDL); + BUG_ON(!node->pMem); + + MmUnmapLockedPages(node->UserVA, node->pMDL); + IoFreeMdl(node->pMDL); + + if (freepage) + ExFreePoolWithTag(node->pMem, GVM_POOL_TAG); + + ExFreePoolWithTag(node, GVM_POOL_TAG); + return 0; +} + +struct sfc_data { + void (*func)(void *info); + void *info; + int done; + struct spin_lock lock; +}; + +DEFINE_PER_CPU(KDPC, ipi_dpc); +DEFINE_PER_CPU(struct sfc_data, smp_call_function_data); + +static void sfc_dpc_routine(KDPC *Dpc, PVOID DeferredContext, + PVOID func, PVOID 
info) +{ + struct sfc_data *sfc_data; + sfc_data = &per_cpu(smp_processor_id(), smp_call_function_data); + if (sfc_data->func) + sfc_data->func(sfc_data->info); + sfc_data->done = 1; +} + +/* + * smp_call_function_xxx has been changed several times from KeIpiGenericCall + * to HalRequestIpi... + * Current version used DPC with HighImportance to emulate physical IPIs. + * The major concern here is making code easy to debug. Playing with physical + * IPIs incorrectly (some time even correctly) can hang the system and WinDbg + * cannot debug these cases. + * We may later to switch to physical IPIs. + * Note: a DPC (or an IPI) issued to current processor just preempts the + * code. + */ +int smp_call_function_many(cpumask_var_t mask, + void(*func) (void *info), void *info, int wait) +{ + int cpu; + struct sfc_data *sfc_data; + + for_each_cpu(cpu, mask) { + sfc_data = &per_cpu(cpu, smp_call_function_data); + spin_lock(&sfc_data->lock); + sfc_data->func = func; + sfc_data->info = info; + sfc_data->done = 0; + if (!KeInsertQueueDpc(&per_cpu(cpu, ipi_dpc), + NULL, NULL)) + DbgBreakPoint(); + } + + for_each_cpu(cpu, mask) { + sfc_data = &per_cpu(cpu, smp_call_function_data); + while (!sfc_data->done) + _mm_pause(); + spin_unlock(&sfc_data->lock); + } + + return 0; +} + +int smp_call_function_single(int cpu, void(*func)(void *info), + void *info, int wait) +{ + struct sfc_data *sfc_data; + + sfc_data = &per_cpu(cpu, smp_call_function_data); + spin_lock(&sfc_data->lock); + sfc_data->func = func; + sfc_data->info = info; + sfc_data->done = 0; + if (!KeInsertQueueDpc(&per_cpu(cpu, ipi_dpc), + func, info)) + DbgBreakPoint(); + while (!sfc_data->done) + _mm_pause(); + spin_unlock(&sfc_data->lock); + return 0; +} + + +void smp_send_reschedule(int cpu) +{ + KAFFINITYEX target; + + pKeInitializeAffinityEx(&target); + pKeAddProcessorAffinityEx(&target, cpu); + pHalRequestIpi(0, &target); +} + +enum cpuid_reg { + CPUID_EAX = 0, + CPUID_EBX, + CPUID_ECX, + CPUID_EDX, +}; + +#define check_cpu_has(name, leaf, reg, bitpos) \ + do { \ + __cpuid(cpuid_info, leaf); \ + CPU_HAS_##name = !!(cpuid_info[reg] & (1 << bitpos)); \ + } while (0) + +#define check_cpu_has_ex(name, leaf, level, reg, bitpos) \ + do { \ + __cpuidex(cpuid_info, leaf, level); \ + CPU_HAS_##name = !!(cpuid_info[reg] & (1 << bitpos)); \ + } while (0) + + +static void cpu_features_init(void) +{ + int cpuid_info[4] = { 0 }; + + check_cpu_has(X86_FEATURE_XSAVE, 1, CPUID_ECX, 26); + + check_cpu_has(X86_FEATURE_OSVW, 0x80000001, CPUID_ECX, 9); + check_cpu_has(X86_FEATURE_SVM, 0x80000001, CPUID_ECX, 2); + + check_cpu_has(X86_FEATURE_NX, 0x80000001, CPUID_EDX, 20); + check_cpu_has(X86_FEATURE_FXSR_OPT, 0x80000001, CPUID_EDX, 25); + check_cpu_has(X86_FEATURE_GBPAGES, 0x80000001, CPUID_EDX, 26); + check_cpu_has(X86_FEATURE_RDTSCP, 0x80000001, CPUID_EDX, 27); + + check_cpu_has_ex(X86_FEATURE_HLE, 7, 0, CPUID_EBX, 4); + check_cpu_has_ex(X86_FEATURE_RTM, 7, 0, CPUID_EBX, 11); + check_cpu_has_ex(X86_FEATURE_MPX, 7, 0, CPUID_EBX, 14); + + check_cpu_has_ex(X86_FEATURE_PKU, 7, 0, CPUID_ECX, 3); + check_cpu_has_ex(X86_FEATURE_SMEP, 7, 0, CPUID_ECX, 7); + + check_cpu_has(X86_FEATURE_NPT, 0x8000000a, CPUID_EDX, 0); + check_cpu_has(X86_FEATURE_LBRV, 0x8000000a, CPUID_EDX, 1); + check_cpu_has(X86_FEATURE_NRIPS, 0x8000000a, CPUID_EDX, 3); + check_cpu_has(X86_FEATURE_FLUSHBYASID, 0x8000000a, CPUID_EDX, 6); + check_cpu_has(X86_FEATURE_DECODEASSISTS, 0x8000000a, CPUID_EDX, 7); + check_cpu_has(X86_FEATURE_AVIC, 0x8000000a, CPUID_EDX, 13); + + 
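	/*
	 * CPUID leaf 0x8000000a (EDX), queried above, reports the AMD SVM
	 * sub-features (nested paging, LBR virtualization, NRIP save,
	 * flush-by-ASID, decode assists, AVIC); leaf 0xd sub-leaf 1
	 * (EAX bit 3), queried below, reports XSAVES support.
	 */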
check_cpu_has_ex(X86_FEATURE_XSAVES, 0xd, 1, CPUID_EAX, 3); +} + +static NTSTATUS prepare_boot_cpu_data(void) +{ + /* Check Physical Address Bit*/ + unsigned int eax, ebx, ecx, edx; + + boot_cpu_data.extended_cpuid_level = cpuid_eax(0x80000000); + boot_cpu_data.x86_phys_bits = 36; + + cpuid(0x80000001, &eax, &ebx, &ecx, &edx); + if (boot_cpu_data.extended_cpuid_level >= 0x80000008) + if (edx & (1 << 29)) { + cpuid(0x80000008, &eax, &ebx, &ecx, &edx); + boot_cpu_data.x86_phys_bits = eax & 0xFF; + } + + return STATUS_SUCCESS; +} + +#define RegName L"\\Registry\\Machine\\HARDWARE\\RESOURCEMAP\\System Resources\\Physical Memory" +static NTSTATUS get_physical_memsize(u64 *size) +{ + OBJECT_ATTRIBUTES keyAttribute; + UNICODE_STRING keyName, valName; + HANDLE keyHandle; + NTSTATUS rc; + ULONG buffSize, count; + PKEY_VALUE_FULL_INFORMATION buff; + PCM_RESOURCE_LIST res; + PCM_PARTIAL_RESOURCE_LIST list; + PCM_PARTIAL_RESOURCE_DESCRIPTOR pres; + + RtlInitUnicodeString(&keyName, RegName); + InitializeObjectAttributes(&keyAttribute, + &keyName, + OBJ_CASE_INSENSITIVE | OBJ_KERNEL_HANDLE, + NULL, + NULL); + rc = ZwOpenKey(&keyHandle, KEY_READ, &keyAttribute); + if (!NT_SUCCESS(rc)) + return rc; + + RtlInitUnicodeString(&valName, L".Translated"); + rc = ZwQueryValueKey(keyHandle, + &valName, + KeyValueFullInformation, + NULL, + 0, + &buffSize); + if (!(rc == STATUS_BUFFER_TOO_SMALL || + rc == STATUS_BUFFER_OVERFLOW)) + goto key_close; + + buff = ExAllocatePoolWithTag(NonPagedPool, buffSize, GVM_POOL_TAG); + if (!buff) { + rc = STATUS_NO_MEMORY; + goto key_close; + } + + RtlZeroMemory(buff, buffSize); + rc = ZwQueryValueKey(keyHandle, + &valName, + KeyValueFullInformation, + buff, + buffSize, + &buffSize); + if (!NT_SUCCESS(rc)) + goto free_buff; + + ASSERT(buff->Type == REG_RESOURCE_LIST); + res = (PCM_RESOURCE_LIST)((char *)buff + buff->DataOffset); + ASSERT(res->Count == 1); + list = &res->List[0].PartialResourceList; + count = list->Count; + pres = &list->PartialDescriptors[count - 1]; + + switch (pres->Type) { + case CmResourceTypeMemory: + *size = pres->u.Memory.Start.QuadPart + + pres->u.Memory.Length; + break; + case CmResourceTypeMemoryLarge: + switch (pres->Flags) { + case CM_RESOURCE_MEMORY_LARGE_40: + *size = pres->u.Memory40.Start.QuadPart + + ((u64)pres->u.Memory40.Length40 << 8); + break; + case CM_RESOURCE_MEMORY_LARGE_48: + *size = pres->u.Memory48.Start.QuadPart + + ((u64)pres->u.Memory48.Length48 << 16); + break; + case CM_RESOURCE_MEMORY_LARGE_64: + *size = pres->u.Memory64.Start.QuadPart + + ((u64)pres->u.Memory64.Length64 << 32); + break; + } + break; + } + + rc = STATUS_SUCCESS; + + free_buff: + ExFreePoolWithTag(buff, GVM_POOL_TAG); + key_close: + ZwClose(keyHandle); + return rc; +} + +/* + * Init/Deinit Nt Kernel Support Routines + */ + +NTSTATUS NtKrUtilsInit(void) +{ + u64 phy_memsize = 0; + UNICODE_STRING FuncName; + NTSTATUS rc; + int cpu; + PROCESSOR_NUMBER cpu_number; + + cpu_features_init(); + + rc = get_physical_memsize(&phy_memsize); + if (!NT_SUCCESS(rc)) + return rc; + max_pagen = (phy_memsize >> PAGE_SHIFT) + 1; + + rc = prepare_boot_cpu_data(); + if (!NT_SUCCESS(rc)) + return rc; + + rc = gvmGetCpuOnlineMap(); + if (!NT_SUCCESS(rc)) + return rc; + + // Prepare smp call function stuffs + RtlInitUnicodeString(&FuncName, L"HalRequestIpi"); + pHalRequestIpi = MmGetSystemRoutineAddress(&FuncName); + RtlInitUnicodeString(&FuncName, L"KeInitializeAffinityEx"); + pKeInitializeAffinityEx = MmGetSystemRoutineAddress(&FuncName); + RtlInitUnicodeString(&FuncName, 
L"KeAddProcessorAffinityEx"); + pKeAddProcessorAffinityEx = MmGetSystemRoutineAddress(&FuncName); + for (cpu = 0; cpu < cpu_online_count; cpu++) { + KeInitializeDpc(&per_cpu(cpu, ipi_dpc), + sfc_dpc_routine, NULL); + rc = KeGetProcessorNumberFromIndex(cpu, &cpu_number); + if (!NT_SUCCESS(rc)) + return rc; + rc = KeSetTargetProcessorDpcEx( + &per_cpu(cpu, ipi_dpc), + &cpu_number); + if (!NT_SUCCESS(rc)) + return rc; + KeSetImportanceDpc(&per_cpu(cpu, ipi_dpc), + HighImportance); + } + + pglist = (struct page**)ExAllocatePoolWithTag(NonPagedPool, + max_pagen*sizeof(struct page *), + GVM_POOL_TAG); + if (!pglist) + return STATUS_NO_MEMORY; + + RtlZeroMemory(pglist, max_pagen*sizeof(struct page *)); + INIT_LIST_HEAD(&gvm_mmap_list); + spin_lock_init(&gvm_mmap_lock); + + return STATUS_SUCCESS; +} + +void NtKrUtilsExit(void) +{ + u64 i; + + /* Well implemented code won't rely on freeing here */ + for (i = 0; i < max_pagen; i++) + if (pglist[i]) + ExFreePoolWithTag(pglist[i], GVM_POOL_TAG); + ExFreePoolWithTag(pglist, GVM_POOL_TAG); + pglist = NULL; +} + diff --git a/ntkrutils.h b/ntkrutils.h new file mode 100644 index 0000000..bea0a30 --- /dev/null +++ b/ntkrutils.h @@ -0,0 +1,1269 @@ +/* + * Copyright 2019 Google LLC + + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 2 as published by the Free Software Foundation. + + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#pragma once +#include <ntddk.h> +#include <intrin.h> +#include <gvm_types.h> +#include <string.h> +#include <dos.h> +#include <linux/list.h> + +// APC definitions (undocumented) +typedef enum _KAPC_ENVIRONMENT +{ + OriginalApcEnvironment, + AttachedApcEnvironment, + CurrentApcEnvironment, + InsertApcEnvironment +} KAPC_ENVIRONMENT; + +typedef +VOID +(NTAPI *PKNORMAL_ROUTINE)( + _In_ PVOID NormalContext, + _In_ PVOID SystemArgument1, + _In_ PVOID SystemArgument2 + ); + +typedef +VOID +(NTAPI *PKKERNEL_ROUTINE)( + _In_ PKAPC Apc, + _Inout_ PKNORMAL_ROUTINE* NormalRoutine, + _Inout_ PVOID* NormalContext, + _Inout_ PVOID* SystemArgument1, + _Inout_ PVOID* SystemArgument2 + ); + +typedef +VOID +(NTAPI *PKRUNDOWN_ROUTINE) ( + _In_ PKAPC Apc + ); + +NTKERNELAPI +VOID +NTAPI +KeInitializeApc( + _Out_ PRKAPC Apc, + _In_ PETHREAD Thread, + _In_ KAPC_ENVIRONMENT Environment, + _In_ PKKERNEL_ROUTINE KernelRoutine, + _In_opt_ PKRUNDOWN_ROUTINE RundownRoutine, + _In_opt_ PKNORMAL_ROUTINE NormalRoutine, + _In_opt_ KPROCESSOR_MODE ApcMode, + _In_opt_ PVOID NormalContext + ); + +NTKERNELAPI +BOOLEAN +NTAPI +KeInsertQueueApc( + _Inout_ PRKAPC Apc, + _In_opt_ PVOID SystemArgument1, + _In_opt_ PVOID SystemArgument2, + _In_ KPRIORITY Increment + ); + +// MSDN recommends the string in reverse order +#define GVM_POOL_TAG '_MVG' + +// cpuid +static __forceinline void cpuid(unsigned int op, + unsigned int *eax, + unsigned int *ebx, + unsigned int *ecx, + unsigned int *edx) +{ + int cpuInfo[4]; + __cpuid(cpuInfo, op); + *eax = cpuInfo[0]; + *ebx = cpuInfo[1]; + *ecx = cpuInfo[2]; + *edx = cpuInfo[3]; +} + +static __forceinline void cpuid_count(unsigned int op, + unsigned int count, + unsigned int *eax, + unsigned int *ebx, + unsigned int *ecx, + unsigned int *edx) +{ + int cpuInfo[4]; + __cpuidex(cpuInfo, op, count); + *eax = cpuInfo[0]; + *ebx = cpuInfo[1]; + 
*ecx = cpuInfo[2]; + *edx = cpuInfo[3]; +} + +static __inline unsigned int cpuid_eax(unsigned int op) +{ + unsigned int eax, ebx, ecx, edx; + + cpuid(op, &eax, &ebx, &ecx, &edx); + + return eax; +} + +static __inline unsigned int cpuid_ebx(unsigned int op) +{ + unsigned int eax, ebx, ecx, edx; + + cpuid(op, &eax, &ebx, &ecx, &edx); + + return ebx; +} + +static __inline unsigned int cpuid_ecx(unsigned int op) +{ + unsigned int eax, ebx, ecx, edx; + + cpuid(op, &eax, &ebx, &ecx, &edx); + + return ecx; +} + +static __inline unsigned int cpuid_edx(unsigned int op) +{ + unsigned int eax, ebx, ecx, edx; + + cpuid(op, &eax, &ebx, &ecx, &edx); + + return edx; +} + +static __forceinline unsigned int x86_family(unsigned int sig) +{ + unsigned int x86; + + x86 = (sig >> 8) & 0xf; + + if (x86 == 0xf) + x86 += (sig >> 20) & 0xff; + + return x86; +} + +static __forceinline unsigned int x86_cpuid_family(void) +{ + return x86_family(cpuid_eax(1)); +} + +static __forceinline unsigned int x86_model(unsigned int sig) +{ + unsigned int fam, model; + + fam = x86_family(sig); + + model = (sig >> 4) & 0xf; + + if (fam >= 0x6) + model += ((sig >> 16) & 0xf) << 4; + + return model; +} + +static __forceinline unsigned int x86_cpuid_model(void) +{ + return x86_model(cpuid_eax(1)); +} + +static __forceinline unsigned int x86_stepping(unsigned int sig) +{ + return sig & 0xf; +} + +/* + * cpu_has_vmx + */ +static __inline int cpu_has_vmx(void) +{ + size_t ecx = cpuid_ecx(1); + return test_bit(5, &ecx); /* CPUID.1:ECX.VMX[bit 5] -> VT */ +} + +/* + * Memory Barriers + */ +#define smp_mb() _mm_mfence() +#define smp_rmb() _mm_lfence() +#define smp_wmb() _mm_sfence() +#define mb() _mm_mfence() +#define rmb() _mm_lfence() +#define wmb() _mm_sfence() +#define smp_mb__after_atomic() _mm_mfence(); + +// smp_processor_id +static __inline unsigned int raw_smp_processor_id(void) +{ + return KeGetCurrentProcessorNumberEx(NULL); +} + +static __inline unsigned int smp_processor_id(void) +{ + return raw_smp_processor_id(); +} + +/* + * cpu_get/put for ensure vmx safety + */ + +struct cpu_getput_cxt { + long count; + KIRQL irql; +}; + +DECLARE_PER_CPU(struct cpu_getput_cxt, cpu_getput_cxt); + +static __inline unsigned int get_cpu() +{ + KIRQL oldIrql = KeRaiseIrqlToDpcLevel(); + unsigned int cpu = smp_processor_id(); + long newcount = InterlockedIncrement(&per_cpu(cpu_getput_cxt, cpu).count); + + if (newcount == 1) + per_cpu(cpu_getput_cxt, cpu).irql = oldIrql; + + return cpu; +} + +static __inline void put_cpu() +{ + unsigned int cpu = smp_processor_id(); + long newcount = InterlockedDecrement(&per_cpu(cpu_getput_cxt, cpu).count); + BUG_ON(newcount < 0); + if (newcount == 0) { + KIRQL oldIrql = per_cpu(cpu_getput_cxt, cpu).irql; + per_cpu(cpu_getput_cxt, cpu).irql = 0; + KeLowerIrql(oldIrql); + } +} + +#define preempt_disable() KeRaiseIrqlToDpcLevel() +#define preempt_enable() KeLowerIrql(PASSIVE_LEVEL) + +// msr access +static _forceinline void wrmsrl(unsigned int msr, u64 val) +{ + __writemsr(msr, val); +} + +extern struct cpumask *cpu_online_mask; +extern unsigned int cpu_online_count; + +/* + * SpinLock Implementation + * Compared with Windows Native Support, this implementation does not raise IRQL to DPC level. + * KVM has nasty lock nesting that might work on Linux but not directly on Windows. 
+ */ +struct spin_lock { + volatile LONG lock; +}; + +typedef struct spin_lock spinlock_t; +typedef struct spin_lock raw_spinlock_t; + +#define DEFINE_SPINLOCK(x) spinlock_t x +#define DECLARE_SPINLOCK(x) extern spinlock_t x +#define DEFINE_RAW_SPINLOCK(x) spinlock_t x +#define DECLARE_RAW_SPINLOCK(x) extern spinlock_t x + +static __forceinline void spin_lock_init(spinlock_t *lock) +{ + lock->lock = 0; +} + +extern __forceinline void __spin_lock(spinlock_t *lock); +static __forceinline void spin_lock(spinlock_t *lock) +{ + __spin_lock(lock); +} + +static __forceinline void spin_unlock(spinlock_t *lock) +{ + lock->lock = 0; +} + +static __forceinline void raw_spin_lock_init(spinlock_t *lock) +{ + spin_lock_init(lock); +} + +static __forceinline void raw_spin_lock(spinlock_t *lock) +{ + spin_lock(lock); +} + +static __forceinline void raw_spin_unlock(spinlock_t *lock) +{ + spin_unlock(lock); +} + +/* + Mutex Windows Implementation + */ +struct mutex +{ + FAST_MUTEX mutex; +}; +typedef struct mutex mutex; + +static __forceinline void mutex_init(struct mutex *lock) +{ + ExInitializeFastMutex(&lock->mutex); +} + +static __forceinline void mutex_lock(struct mutex *lock) +{ + ExAcquireFastMutex(&lock->mutex); +} + +static __forceinline void mutex_unlock(struct mutex *lock) +{ + ExReleaseFastMutex(&lock->mutex); +} + +#define __KERNEL_CS 0x10 +#define __KERNEL_DS 0x28 +#define __KERNEL_SS 0x18 +#define __KERNEL_FS 0x53 + +/* + MSR access + */ +static __inline void __rdmsr(u32 index, u32 *low, u32 *high) +{ + u64 val = __readmsr(index); + *low = (u32)val; + *high = (u32)(val >> 32); +} + +static __inline int __rdmsr_safe(u32 index, u32 *low, u32 *high) +{ + u64 val = 0; + __try { + val = __readmsr(index); + *low = (u32)val; + *high = (u32)(val >> 32); + } __except(EXCEPTION_EXECUTE_HANDLER) { + return -1; + } + return 0; +} + +static __inline int __rdmsrl_safe(u32 index, u64 *val) +{ + __try { + *val = __readmsr(index); + } __except(EXCEPTION_EXECUTE_HANDLER) { + return -1; + } + return 0; +} + +static __inline u64 native_read_msr_safe(u32 index, int *err) +{ + u64 value = 0; + *err = __rdmsrl_safe(index, &value); + return value; +} + +static __inline int __wrmsr_safe(u32 index, u32 low, u32 high) +{ + u64 val = (((u64)high) << 32) | low; + __try { + __writemsr(index, val); + } __except(EXCEPTION_EXECUTE_HANDLER) { + return -1; + } + return 0; +} + +static __inline int __wrmsrl_safe(u32 index, u64 val) +{ + __try { + __writemsr(index, val); + } __except(EXCEPTION_EXECUTE_HANDLER) { + return -1; + } + return 0; +} + +static __inline int native_write_msr_safe(u32 index, u32 low, u32 high) +{ + return __wrmsr_safe(index, low, high); +} + +#define rdmsr(a, b, c) __rdmsr(a, &b, &c) +#define rdmsr_safe(a, b, c) __rdmsr_safe(a, b, c) +#define rdmsrl(a, b) b=__readmsr(a) +#define rdmsrl_safe(a, b) __rdmsrl_safe(a, b) + +#define wrmsr(a,b) __writemsr(a,b) +#define wrmsrl(a,b) __writemsr(a,b) +#define wrmsr_safe(a, b, c) __wrmsr_safe(a, b, c) +#define wrmsrl_safe(a,b) __wrmsrl_safe(a,b) + +/* + Local Irq Disable + */ +static __forceinline void local_irq_disable(void) +{ + _disable(); +} + +static __forceinline void local_irq_enable(void) +{ + _enable(); +} + +/* + Timer Stuffs + */ + +#define MSEC_PER_SEC 1000L +#define USEC_PER_MSEC 1000L +#define NSEC_PER_USEC 1000L +#define NSEC_PER_MSEC 1000000L +#define USEC_PER_SEC 1000000L +#define NSEC_PER_SEC 1000000000L +#define FSEC_PER_SEC 1000000000000000LL + +union ktime +{ + s64 tv64; + struct { + s32 nsec, sec; + } tv; +}; + +typedef union ktime ktime_t; + 
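(Editorial aside, not part of the commit: the ktime_t union defined just above stores time as a signed 64-bit nanosecond count in tv64, so the ktime_set/ktime_add_ns/ktime_compare helpers that follow reduce to plain integer arithmetic. A minimal usage sketch under that assumption; the function name ktime_example is hypothetical and only illustrates the helpers declared in this header:)

static __inline void ktime_example(void)
{
        /* 2 s + 500,000 ns  ->  tv64 == 2000500000 */
        ktime_t deadline = ktime_set(2, 500000);

        /* push the deadline out by 1 us, expressed in nanoseconds */
        deadline = ktime_add_ns(deadline, 1000);

        /* ktime_get() below converts KeQuerySystemTime's 100 ns units to ns */
        if (ktime_compare(deadline, ktime_get()) > 0) {
                /* deadline is still in the future */
        }
}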
+#define KTIME_MAX ((s64)~((u64)1 << 63)) +#define KTIME_SEC_MAX LONG_MAX + +#pragma warning(disable : 4204) +static __forceinline ktime_t ktime_set(const long secs, const size_t nsecs) +{ +#if 0 + if (unlikely(secs >= KTIME_SEC_MAX)) + return (ktime_t){ .tv64 = KTIME_MAX }; +#endif + return (ktime_t) { .tv64 = (s64)secs * NSEC_PER_SEC + (s64)nsecs }; +} + +/* Subtract two ktime_t variables. rem = lhs -rhs: */ +#define ktime_sub(lhs, rhs) \ + (ktime_t){ .tv64 = (lhs).tv64 - (rhs).tv64 } + +/* Add two ktime_t variables. res = lhs + rhs: */ +#define ktime_add(lhs, rhs) \ + (ktime_t){ .tv64 = (lhs).tv64 + (rhs).tv64 } + +/* + * Add a ktime_t variable and a scalar nanosecond value. + * res = kt + nsval: + */ +#define ktime_add_ns(kt, nsval) \ + (ktime_t){ .tv64 = (kt).tv64 + (nsval) } + +/* + * Subtract a scalar nanosecod from a ktime_t variable + * res = kt - nsval: + */ +#define ktime_sub_ns(kt, nsval) \ + (ktime_t){ .tv64 = (kt).tv64 - (nsval) } + + +/* Map the ktime_t to timespec conversion to ns_to_timespec function */ +#define ktime_to_timespec(kt) ns_to_timespec((kt).tv64) + +/* Map the ktime_t to timeval conversion to ns_to_timeval function */ +#define ktime_to_timeval(kt) ns_to_timeval((kt).tv64) + +/* Convert ktime_t to nanoseconds - NOP in the scalar storage format: */ +#define ktime_to_ns(kt) ((kt).tv64) + +static __forceinline int ktime_equal(const ktime_t cmp1, const ktime_t cmp2) +{ + return cmp1.tv64 == cmp2.tv64; +} + +/** + * ktime_compare - Compares two ktime_t variables for less, greater or equal + * @cmp1: comparable1 + * @cmp2: comparable2 + * + * Returns ... + * cmp1 < cmp2: return <0 + * cmp1 == cmp2: return 0 + * cmp1 > cmp2: return >0 + */ +static __forceinline int ktime_compare(const ktime_t cmp1, const ktime_t cmp2) +{ + if (cmp1.tv64 < cmp2.tv64) + return -1; + if (cmp1.tv64 > cmp2.tv64) + return 1; + return 0; +} + +static __forceinline ktime_t ktime_add_us(const ktime_t kt, const u64 usec) +{ + return ktime_add_ns(kt, usec * 1000); +} + +static __forceinline ktime_t ktime_sub_us(const ktime_t kt, const u64 usec) +{ + return ktime_sub_ns(kt, usec * 1000); +} + +static __forceinline ktime_t ns_to_ktime(u64 ns) +{ + static const ktime_t ktime_zero = { .tv64 = 0 }; + return ktime_add_ns(ktime_zero, ns); +} + +static __forceinline ktime_t ktime_get(void) +{ + s64 nsecs = 0; + LARGE_INTEGER time; + KeQuerySystemTime(&time); + nsecs = time.QuadPart; + nsecs *= 100; + + return (ktime_t){.tv64 = nsecs}; +} +typedef size_t clockid_t; +#define CLOCK_REALTIME 0 +#define CLOCK_MONOTONIC 1 +#define CLOCK_PROCESS_CPUTIME_ID 2 +#define CLOCK_THREAD_CPUTIME_ID 3 +#define CLOCK_MONOTONIC_RAW 4 +#define CLOCK_REALTIME_COARSE 5 +#define CLOCK_MONOTONIC_COARSE 6 +#define CLOCK_BOOTTIME 7 +#define CLOCK_REALTIME_ALARM 8 +#define CLOCK_BOOTTIME_ALARM 9 + +enum hrtimer_mode +{ + HRTIMER_MODE_ABS = 0x0, /* Time value is absolute */ + HRTIMER_MODE_REL = 0x1, /* Time value is relative to now */ + HRTIMER_MODE_PINNED = 0x02, /* Timer is bound to CPU */ + HRTIMER_MODE_ABS_PINNED = 0x02, + HRTIMER_MODE_REL_PINNED = 0x03, +}; + +enum hrtimer_restart +{ + HRTIMER_NORESTART, /* Timer is not restarted */ + HRTIMER_RESTART, /* Timer must be restarted */ +}; + +struct timerqueue_node +{ + ktime_t expires; +}; + +struct hrtimer_clock_base +{ + int index; + ktime_t resolution; + ktime_t (*get_time)(void); + ktime_t softirq_time; + ktime_t offset; +}; + +struct hrtimer +{ + struct timerqueue_node node; + ktime_t _softexpires; + enum hrtimer_restart (*function)(struct hrtimer *); + struct 
hrtimer_clock_base *base; + size_t state; + KTIMER ktimer; + KDPC kdpc; + LARGE_INTEGER due_time; + struct hrtimer_clock_base base_hack; +}; + +void hrtimer_init(struct hrtimer *timer, clockid_t clock_id, enum hrtimer_mode mode); +int hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode); +int hrtimer_cancel(struct hrtimer *timer); +int hrtimer_restart(struct hrtimer* timer); + +static __forceinline void hrtimer_add_expires_ns(struct hrtimer *timer, u64 delta) +{ + timer->node.expires = ktime_add_ns(timer->node.expires, delta); +} + +static __forceinline ktime_t hrtimer_get_expires(struct hrtimer *timer) +{ + return timer->node.expires; +} + +static __forceinline u64 hrtimer_get_expires_ns(struct hrtimer *timer) +{ + return ktime_to_ns(timer->node.expires); +} + +static __forceinline void hrtimer_start_expires(struct hrtimer *timer, int mode) +{ + hrtimer_start(timer, timer->node.expires, mode); +} + +static __forceinline ktime_t hrtimer_expires_remaining(const struct hrtimer *timer) +{ + return ktime_sub(timer->node.expires, timer->base->get_time()); +} + +static __forceinline ktime_t hrtimer_get_remaining(const struct hrtimer *timer) +{ + ktime_t rem; + rem = hrtimer_expires_remaining(timer); + return rem; +} + +/* + Memory Management Stuffs + */ + +#define BIT(nr) ((size_t)(1) << (nr)) +#define GFP_KERNEL BIT(0) +#define GFP_ATOMIC BIT(1) +#define __GFP_ZERO BIT(3) +#define GFP_UNALLOC BIT(5) + + /* + * Address types: + * + * gva - guest virtual address + * gpa - guest physical address + * gfn - guest frame number + * hva - host virtual address + * hpa - host physical address + * hfn - host frame number + */ + +typedef size_t gva_t; +typedef u64 gpa_t; +typedef u64 gfn_t; +typedef u64 phys_addr_t; + +typedef size_t hva_t; +typedef u64 hpa_t; +typedef u64 hfn_t; + +typedef hfn_t pfn_t; + +typedef struct page +{ + void* hva; + void* kmap_hva; + size_t __private; + hpa_t hpa; + pfn_t pfn; + size_t gfp_mask; + PEPROCESS proc; +}page; + +extern u64 max_pagen; +extern struct page** pglist; +DECLARE_RAW_SPINLOCK(global_page_lock); + +#define page_private(page) ((page)->__private) +#define set_page_private(page, v) ((page)->__private = (v)) + +#define __free_page(page) __free_pages((page), 0) +#define free_page(addr) free_pages((addr), 0) + +#define clear_page(page) memset((page), 0, PAGE_SIZE) + +#define virt_to_page(kaddr) pfn_to_page((__pa(kaddr) >> PAGE_SHIFT)) + + +static __inline void *kmalloc(size_t size, size_t flags) +{ + void* ret = NULL; + int zero = 0; + + if (flags & __GFP_ZERO) + zero = 1; + + ret = ExAllocatePoolWithTag(NonPagedPool, size, GVM_POOL_TAG); + + if(ret && zero) + { + memset(ret, 0, size); + } + return ret; +} + +static __inline void *kzalloc(size_t size, size_t flags) +{ + return kmalloc(size, flags | __GFP_ZERO); +} + +static __inline void kfree(void* hva) +{ + if (!hva) + return; + ExFreePoolWithTag(hva, GVM_POOL_TAG); +} + +static __inline void *vmalloc(size_t size) +{ + return ExAllocatePoolWithTag(NonPagedPool, size, GVM_POOL_TAG); +} + +static __inline void vfree(void* hva) +{ + if (!hva) + return; + ExFreePoolWithTag(hva, GVM_POOL_TAG); +} + +static __inline void *vzalloc(size_t size) +{ + void *addr = vmalloc(size); + if (addr) + { + memset(addr, 0, size); + } + return addr; +} + +static __inline void *kmalloc_fast(size_t size, size_t flags) +{ + return kmalloc(size, flags); +} + +static __inline void *kzalloc_fast(size_t size, size_t flags) +{ + return kmalloc_fast(size, flags | __GFP_ZERO); +} + +static __inline void 
kfree_fast(void* hva) +{ + if (!hva) + return; + ExFreePoolWithTag(hva, GVM_POOL_TAG); +} + +#define kvfree kfree_fast + +#define VERIFY_READ 0 +#define VERIFY_WRITE 1 + +static __inline pfn_t page_to_pfn(struct page* page) +{ + return page->pfn; +} + +static __inline void* page_to_hva(struct page* page) +{ + return page->hva; +} + +static __inline hpa_t page_to_phys(struct page* page) +{ + return page->hpa; +} + +static __inline hpa_t mdl_to_phys(PMDL mdl) +{ + return (hpa_t)MmGetPhysicalAddress(mdl->StartVa).QuadPart; +} + +static __inline struct page* pfn_to_page(pfn_t pfn) +{ + return pglist[pfn]; +} + +static __inline hpa_t __pa(void* va) +{ + PHYSICAL_ADDRESS addr_phys; + addr_phys = MmGetPhysicalAddress(va); + return (hpa_t)(addr_phys.QuadPart); +} + +static __inline void* __va(hpa_t pa) +{ + void* ret = 0; + ret = page_to_hva(pfn_to_page(pa >> PAGE_SHIFT)); + if(!ret) + { + printk("vmmr0: __va: invalid hpa %p\n", pa); + } + return ret; +} + +static __inline struct page *alloc_page(unsigned int gfp_mask) +{ + void* page_hva = NULL; + PHYSICAL_ADDRESS pageaddr_phys; + int zero = 0; + struct page* page = ExAllocatePoolWithTag(NonPagedPool, + sizeof(*page), + GVM_POOL_TAG); + if(!page) + goto out_error; + + page_hva = ExAllocatePoolWithTag(NonPagedPool, PAGE_SIZE, GVM_POOL_TAG); + if(!page_hva) + goto out_error_free; + + if (gfp_mask & __GFP_ZERO) + zero = 0; + + ASSERT(!((size_t)page_hva & 0xfffull)); + + if(zero) + memset(page_hva, 0, PAGE_SIZE); + + pageaddr_phys = MmGetPhysicalAddress(page_hva); + page->hpa = pageaddr_phys.QuadPart; + page->pfn = page->hpa >> PAGE_SHIFT; + page->hva = page_hva; + page->gfp_mask = gfp_mask; + page->proc = IoGetCurrentProcess(); + raw_spin_lock(&global_page_lock); + pglist[page->pfn] = page; + raw_spin_unlock(&global_page_lock); + return page; + + out_error_free: + ExFreePoolWithTag(page, GVM_POOL_TAG); + out_error: + return 0; +} + +static __inline void __free_pages(struct page* page, unsigned int order) +{ + ExFreePoolWithTag(page->hva, GVM_POOL_TAG); + + raw_spin_lock(&global_page_lock); + pglist[page->pfn] = 0; + raw_spin_unlock(&global_page_lock); + + ExFreePoolWithTag(page, GVM_POOL_TAG); +} + +static __inline void free_pages(size_t addr, unsigned int order) +{ + if (addr != 0) + { + __free_pages(virt_to_page((void *)addr), order); + } +} + +static __inline void* kmap(PMDL mdl) +{ + + if (!mdl) + return NULL; + + return MmGetSystemAddressForMdlSafe(mdl, NormalPagePriority); +} + +static __inline void kunmap(PMDL mdl) +{ +} + +static __inline void* page_address(struct page* page) +{ + BUG_ON(!page->hva); + return page->hva; +} + +static __inline void* get_zeroed_page(unsigned int gfp_mask) +{ + struct page* page = alloc_page(gfp_mask); + memset(page->hva, 0, PAGE_SIZE); + return page->hva; +} + +static __inline size_t __get_free_page(unsigned int gfp_mask) +{ + struct page *page; + page = alloc_page(gfp_mask); + if (!page) + return 0; + return (size_t) page_address(page); +} + +static __inline int get_user_pages_fast(size_t start, int nr_pages, int write, + PMDL *mdl) +{ + PMDL _mdl; + + start &= PAGE_MASK; + _mdl = IoAllocateMdl((void *)start, nr_pages * PAGE_SIZE, + FALSE, FALSE, NULL); + if (!_mdl) + return 0; + + MmProbeAndLockPages(_mdl, KernelMode, IoWriteAccess); + *mdl = _mdl; + + return nr_pages; +} + +static __inline void kvm_release_page(PMDL mdl) +{ + if (!mdl) + return; + + MmUnlockPages(mdl); + IoFreeMdl(mdl); +} + +/* We actually did not copy from *user* here. This function in kvm is used to + * ioctl parameters. 
On Windows, we always use buffered io for device control. + * Thus the address supplied to copy_from_user is address in kernel space. + * Simple keep the function name here. + * __copy_from/to_user is really copying from user space. + */ +static __inline size_t copy_from_user(void *dst, const void *src, size_t size) +{ + memcpy(dst, src, size); + return 0; +} + +static __inline size_t __copy_user(void *dst, const void *src, size_t size, + int from) +{ + PMDL lock_mdl; + HANDLE handle; + + lock_mdl = IoAllocateMdl(from? src : dst, size, FALSE, FALSE, NULL); + if (!lock_mdl) + return size; + MmProbeAndLockPages(lock_mdl, UserMode, IoWriteAccess); + handle = MmSecureVirtualMemory(from? src : dst, size, PAGE_READWRITE); + if (!handle) + return size; + memcpy(dst, src, size); + MmUnsecureVirtualMemory(handle); + MmUnlockPages(lock_mdl); + IoFreeMdl(lock_mdl); + return 0; +} + +static __inline size_t __copy_to_user(void *dst, const void *src, size_t size) +{ + return __copy_user(dst, src, size, 0); +} + +static __inline size_t __copy_from_user(void *dst, const void *src, size_t size) +{ + return __copy_user(dst, src, size, 1); +} + +static __inline void *kmap_atomic(PMDL mdl) +{ + return kmap(mdl); +} + +static __inline void kunmap_atomic(PMDL mdl) +{ + kunmap(mdl); +} + +static __inline void *memdup_user(const void *user, size_t size) +{ + void *buf = kzalloc(size, GFP_KERNEL); + + if (!buf) + return ERR_PTR(-ENOMEM); + if (copy_from_user(buf, user, size)) + return ERR_PTR(-EFAULT); + return buf; +} + +/* + TSC + */ +static __forceinline u64 rdtsc(void) +{ + return __rdtsc(); +} + +static __forceinline int check_tsc_unstable(void) +{ + return 0; +} + +static __forceinline int mark_tsc_unstable(void) +{ + return 0; +} + + +/* + File + */ +struct file { + void *private_data; +}; + +/* +Atomic Operations +*/ +typedef long atomic_t; +#define ATOMIC_INIT(n) (n) +static __forceinline void atomic_inc(atomic_t *v) +{ + InterlockedIncrement(v); +} + +static __forceinline void atomic_dec(atomic_t *v) +{ + InterlockedDecrement(v); +} + +static __forceinline int atomic_dec_and_test(atomic_t *v) +{ + return !InterlockedDecrement(v); +} + +static __forceinline int atomic_xchg(atomic_t *v, int val) +{ + return InterlockedExchange(v, val); +} + +extern u8 xchg8(u8 *a, u8 b); +extern u16 xchg16(u16 *a, u16 b); +#define xchg32(a, b) InterlockedExchange((LONG *)a, b) +#define xchg64(a, b) InterlockedExchange64((LONG64 *)a, b) +extern u8 cmpxchg8(u8 *a, u8 b, u8 c); +extern u16 cmpxchg16(u16 *a, u16 b, u16 c); +#define cmpxchg32(a, b, c) InterlockedCompareExchange((LONG *)a, c, b) +#define cmpxchg64(a, b, c) InterlockedCompareExchange64((LONG64 *)a, c, b) + +#define xchg(a, b) ((sizeof(*a) == 8)? xchg64((u64 *)a, b) : \ + ((sizeof(*a) == 4)? xchg32((u32 *)a, b) : \ + ((sizeof(*a) == 2)? xchg16((u16 *)a, b) : \ + ((sizeof(*a) == 1)? xchg8((u8 *)a, b) : 0)))) +#define cmpxchg(a, b, c) ((sizeof(*a) == 8)? cmpxchg64((u64 *)a, b, c) : \ + ((sizeof(*a) == 4)? cmpxchg32((u32 *)a, b, c) : \ + ((sizeof(*a) == 2)? cmpxchg16((u16 *)a, b, c) : \ + ((sizeof(*a) == 1)? 
cmpxchg8((u8 *)a, b, c) : 0)))) + +#define atomic_cmpxchg(a, b, c) cmpxchg(a, b, c) + +static __forceinline int atomic_dec_if_positive(atomic_t *v) +{ + int c, old, dec; + c = atomic_read(v); + + for (;;) { + dec = c - 1; + if (unlikely(dec < 0)) + break; + old = atomic_cmpxchg((v), c, dec); + if (likely(old == c)) + break; + c = old; + } + return dec; +} + +#define smp_store_mb(var, value) do { (void)xchg(&var, value); } while (0) +#define smp_store_release(p, v) \ +do { \ + smp_mb(); \ + *p = v; \ +} while (0) + + +/* + cpumask + */ +static __inline bool zalloc_cpumask_var(cpumask_var_t *mask, int flags) +{ + *mask = NULL; + *mask = kmalloc(sizeof(cpumask_t), flags | __GFP_ZERO); + return !!(*mask); +} +static __inline void free_cpumask_var(cpumask_var_t mask) +{ + kfree(mask); +} + +/* + vm_mmap/unmap + */ +#define PROT_READ 0x1 /* page can be read */ +#define PROT_WRITE 0x2 /* page can be written */ +#define PROT_EXEC 0x4 /* page can be executed */ +#define PROT_SEM 0x8 /* page may be used for atomic ops */ +#define PROT_NONE 0x0 /* page can not be accessed */ +#define PROT_GROWSDOWN 0x01000000 /* mprotect flag: extend change to start of growsdown vma */ +#define PROT_GROWSUP 0x02000000 /* mprotect flag: extend change to end of growsup vma */ + +#define MAP_SHARED 0x01 /* Share changes */ +#define MAP_PRIVATE 0x02 /* Changes are private */ +#define MAP_TYPE 0x0f /* Mask for type of mapping */ +#define MAP_FIXED 0x10 /* Interpret addr exactly */ +#define MAP_ANONYMOUS 0x20 /* don't use a file */ +#define MAP_UNINITIALIZED 0x0 /* Don't support this flag */ + +typedef struct gvm_mmap_node +{ + PMDL pMDL; + PVOID pMem; + PVOID UserVA; + struct list_head list; +}gvm_mmap_node; + +extern struct list_head gvm_mmap_list; + +extern size_t vm_mmap(struct file *file, size_t addr, + size_t len, size_t prot, size_t flag, size_t offset); +extern size_t __vm_mmap(struct file *file, size_t addr, + size_t len, size_t prot, size_t flag, size_t offset, size_t keva); +extern int vm_munmap(size_t start, size_t len); +extern int __vm_munmap(size_t start, size_t len, bool freepage); + +/* + smp_call_function + */ +extern int smp_call_function_single(int cpu, void(*func)(void *info), void *info, int wait); +extern int smp_call_function_many(cpumask_var_t mask, void(*func) (void *info), void *info, int wait); +extern void smp_send_reschedule(int cpu); + +/* + * srcu tranlation to windows ERESOURCE + */ +struct srcu_struct { + ERESOURCE eres; +}; + +static __inline int srcu_read_lock(struct srcu_struct *sp) +{ + ExAcquireResourceSharedLite(&sp->eres, true); + return 0; +} + +static __inline void __srcu_read_unlock(struct srcu_struct *sp) +{ + ExReleaseResourceLite(&sp->eres); +} +#define srcu_read_unlock(sp, idx) __srcu_read_unlock(sp) + +static __inline void *srcu_dereference(void *p, struct srcu_struct *sp) +{ + return p; +} + +static __inline void synchronize_srcu_expedited(struct srcu_struct *sp) +{ + ExAcquireResourceExclusiveLite(&sp->eres, true); + ExReleaseResourceLite(&sp->eres); +} + +#define synchronize_srcu(srcu) synchronize_srcu_expedited(srcu) + +static __inline int init_srcu_struct(struct srcu_struct *sp) +{ + NTSTATUS rc = ExInitializeResourceLite(&sp->eres); + return !NT_SUCCESS(rc); +} + +static __inline int cleanup_srcu_struct(struct srcu_struct *sp) +{ + NTSTATUS rc = ExDeleteResourceLite(&sp->eres); + return !NT_SUCCESS(rc); +} + +/* + * RCU + */ +static __inline __rcu_assign_pointer(void **p, void *v) +{ + *p = v; + smp_mb(); +} + +#define __rcu +#define rcu_assign_pointer(p, v) 
__rcu_assign_pointer(&(void *)p, (void *)v) +#define rcu_read_lock() +#define rcu_read_unlock() + +static __inline void *rcu_dereference_raw(void *p) +{ + return p; +} + +#define rcu_dereference(a) rcu_dereference_raw(a) +#define hlist_first_rcu(head) (*((struct hlist_node __rcu **)(&(head)->first))) +#define hlist_next_rcu(node) (*((struct hlist_node __rcu **)(&(node)->next))) +#define hlist_pprev_rcu(node) (*((struct hlist_node __rcu **)((node)->pprev))) + +static __inline void hlist_add_head_rcu(struct hlist_node *n, + struct hlist_head *h) +{ + struct hlist_node *first = h->first; + + n->next = first; + n->pprev = &h->first; + rcu_assign_pointer(hlist_first_rcu(h), n);
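+	/*
+	 * Editor's note, not in the original commit: n->next and n->pprev are
+	 * written before the store above publishes n as the new list head, and
+	 * readers in this shim walk only the ->next chain (see
+	 * hlist_for_each_entry_rcu below), so fixing up the old head's pprev
+	 * after publication does not affect concurrent readers.
+	 */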
+ if (first) + first->pprev = &n->next; +} + +static __inline void hlist_del_rcu(struct hlist_node *n) +{ + __hlist_del(n); + n->pprev = LIST_POISON2; +} + +#define hlist_for_each_entry_rcu(pos, head, member) \ + for (pos = hlist_entry_safe (rcu_dereference_raw(hlist_first_rcu(head)),\ + typeof(*(pos)), member); \ + pos; \ + pos = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(\ + &(pos)->member)), typeof(*(pos)), member)) + +/* + * It is said there is no cpu online/offline for Windows, + * so always return true. + */ +static bool cpu_online(int cpu) +{ + return true; +} + +/* + * xsave related functions + */ +#define XSTATE_CPUID 0x0000000d +#define XCR_XFEATURE_ENABLED_MASK 0x00000000 + +static inline u64 xgetbv(u32 index) +{ + return _xgetbv(index); +} + +static inline void xsetbv(u32 index, u64 value) +{ + _xsetbv(index, value); +} + +extern NTSTATUS NtKrUtilsInit(void); +extern void NtKrUtilsExit(void); diff --git a/package/make_package.sh b/package/make_package.sh new file mode 100644 index 0000000..5dfa5ce --- /dev/null +++ b/package/make_package.sh @@ -0,0 +1,17 @@ +# Copyright 2019 Google LLC + +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# version 2 as published by the Free Software Foundation. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# Package source codes together with release binary and symbols +# Run under WSL on Windows 10. + +#!/bin/bash +cd .. +zip -r package/gvm-`date +%Y%m%d-%H%M`.zip Release/ arch/ asmgen/ gvm* virt/ __asm.* ntkrutils.* include/ assembly/ diff --git a/sign/gvm/.gitignore b/sign/gvm/.gitignore new file mode 100644 index 0000000..d7e43e8 --- /dev/null +++ b/sign/gvm/.gitignore @@ -0,0 +1,3 @@ +setup.* +disk1/ +gvm.sys diff --git a/sign/gvm/gvm.ddf b/sign/gvm/gvm.ddf new file mode 100644 index 0000000..60f757f --- /dev/null +++ b/sign/gvm/gvm.ddf @@ -0,0 +1,26 @@ +; Copyright 2019 Google LLC + +; This program is free software; you can redistribute it and/or +; modify it under the terms of the GNU General Public License +; version 2 as published by the Free Software Foundation. + +; This program is distributed in the hope that it will be useful, +; but WITHOUT ANY WARRANTY; without even the implied warranty of +; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +; GNU General Public License for more details. + +.OPTION EXPLICIT ; Generate errors +.Set CabinetFileCountThreshold=0 +.Set FolderFileCountThreshold=0 +.Set FolderSizeThreshold=0 +.Set MaxCabinetSize=0 +.Set MaxDiskFileCount=0 +.Set MaxDiskSize=0 +.Set CompressionType=MSZIP +.Set Cabinet=on +.Set Compress=on +.Set CabinetNameTemplate=gvm.cab +.Set DestinationDir=gvm +;Specify files to be included in cab file +gvm.Inf +gvm.Sys diff --git a/sign/gvm/gvm.inf b/sign/gvm/gvm.inf new file mode 100644 index 0000000..681c2b1 --- /dev/null +++ b/sign/gvm/gvm.inf @@ -0,0 +1,53 @@ +; Copyright 2019 Google LLC + +; This program is free software; you can redistribute it and/or +; modify it under the terms of the GNU General Public License +; version 2 as published by the Free Software Foundation. + +; This program is distributed in the hope that it will be useful, +; but WITHOUT ANY WARRANTY; without even the implied warranty of +; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the +; GNU General Public License for more details. + +[Version] +Signature = "$Windows NT$" +Provider = Google +Class = System +ClassGuid = {4d36e97d-e325-11ce-bfc1-08002be10318} +DriverVer = 09/11/2019,1.0.0.0 +DriverPackageType = KernelService +CatalogFile = gvm.cat + +[DestinationDirs] +DefaultDestDir = 12 + +[DefaultInstall] +CopyFiles = gvm.DriverFiles + +[DefaultInstall.Services] +AddService = gvm,0x00000002,gvm.Service + +[DefaultUninstall] +DelFiles = gvm.DriverFiles + +[DefaultUninstall.Services] +DelService = gvm,0x200 + +[SourceDisksFiles] +gvm.sys = 1 + +[SourceDisksNames] +1 = %DISK_NAME%, + +[gvm.DriverFiles] +gvm.sys + +[gvm.Service] +DisplayName = gvm Service +ServiceType = 1 ; SERVICE_KERNEL_DRIVER +StartType = 3 ; SERVICE_DEMAND_START +ErrorControl = 1 ; SERVICE_ERROR_NORMAL +ServiceBinary = %12%\gvm.sys + +[Strings] +DISK_NAME = "Android Emulator Hypervisor Driver for AMD Processors Installation Media" diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig deleted file mode 100644 index b0cc1a3..0000000 --- a/virt/kvm/Kconfig +++ /dev/null @@ -1,52 +0,0 @@ -# KVM common configuration items and defaults - -config HAVE_KVM - bool - -config HAVE_KVM_IRQCHIP - bool - -config HAVE_KVM_IRQFD - bool - -config HAVE_KVM_IRQ_ROUTING - bool - -config HAVE_KVM_EVENTFD - bool - select EVENTFD - -config KVM_MMIO - bool - -config KVM_ASYNC_PF - bool - -# Toggle to switch between direct notification and batch job -config KVM_ASYNC_PF_SYNC - bool - -config HAVE_KVM_MSI - bool - -config HAVE_KVM_CPU_RELAX_INTERCEPT - bool - -config KVM_VFIO - bool - -config HAVE_KVM_ARCH_TLB_FLUSH_ALL - bool - -config HAVE_KVM_INVALID_WAKEUPS - bool - -config KVM_GENERIC_DIRTYLOG_READ_PROTECT - bool - -config KVM_COMPAT - def_bool y - depends on KVM && COMPAT && !S390 - -config HAVE_KVM_IRQ_BYPASS - bool diff --git a/virt/kvm/arm/aarch32.c b/virt/kvm/arm/aarch32.c deleted file mode 100644 index 528af4b..0000000 --- a/virt/kvm/arm/aarch32.c +++ /dev/null @@ -1,152 +0,0 @@ -/* - * (not much of an) Emulation layer for 32bit guests. - * - * Copyright (C) 2012,2013 - ARM Ltd - * Author: Marc Zyngier <marc.zyngier@arm.com> - * - * based on arch/arm/kvm/emulate.c - * Copyright (C) 2012 - Virtual Open Systems and Columbia University - * Author: Christoffer Dall <c.dall@virtualopensystems.com> - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - */ - -#include <linux/kvm_host.h> -#include <asm/kvm_emulate.h> -#include <asm/kvm_hyp.h> - -#ifndef CONFIG_ARM64 -#define COMPAT_PSR_T_BIT PSR_T_BIT -#define COMPAT_PSR_IT_MASK PSR_IT_MASK -#endif - -/* - * stolen from arch/arm/kernel/opcodes.c - * - * condition code lookup table - * index into the table is test code: EQ, NE, ... 
LT, GT, AL, NV - * - * bit position in short is condition code: NZCV - */ -static const unsigned short cc_map[16] = { - 0xF0F0, /* EQ == Z set */ - 0x0F0F, /* NE */ - 0xCCCC, /* CS == C set */ - 0x3333, /* CC */ - 0xFF00, /* MI == N set */ - 0x00FF, /* PL */ - 0xAAAA, /* VS == V set */ - 0x5555, /* VC */ - 0x0C0C, /* HI == C set && Z clear */ - 0xF3F3, /* LS == C clear || Z set */ - 0xAA55, /* GE == (N==V) */ - 0x55AA, /* LT == (N!=V) */ - 0x0A05, /* GT == (!Z && (N==V)) */ - 0xF5FA, /* LE == (Z || (N!=V)) */ - 0xFFFF, /* AL always */ - 0 /* NV */ -}; - -/* - * Check if a trapped instruction should have been executed or not. - */ -bool kvm_condition_valid32(const struct kvm_vcpu *vcpu) -{ - unsigned long cpsr; - u32 cpsr_cond; - int cond; - - /* Top two bits non-zero? Unconditional. */ - if (kvm_vcpu_get_hsr(vcpu) >> 30) - return true; - - /* Is condition field valid? */ - cond = kvm_vcpu_get_condition(vcpu); - if (cond == 0xE) - return true; - - cpsr = *vcpu_cpsr(vcpu); - - if (cond < 0) { - /* This can happen in Thumb mode: examine IT state. */ - unsigned long it; - - it = ((cpsr >> 8) & 0xFC) | ((cpsr >> 25) & 0x3); - - /* it == 0 => unconditional. */ - if (it == 0) - return true; - - /* The cond for this insn works out as the top 4 bits. */ - cond = (it >> 4); - } - - cpsr_cond = cpsr >> 28; - - if (!((cc_map[cond] >> cpsr_cond) & 1)) - return false; - - return true; -} - -/** - * adjust_itstate - adjust ITSTATE when emulating instructions in IT-block - * @vcpu: The VCPU pointer - * - * When exceptions occur while instructions are executed in Thumb IF-THEN - * blocks, the ITSTATE field of the CPSR is not advanced (updated), so we have - * to do this little bit of work manually. The fields map like this: - * - * IT[7:0] -> CPSR[26:25],CPSR[15:10] - */ -static void __hyp_text kvm_adjust_itstate(struct kvm_vcpu *vcpu) -{ - unsigned long itbits, cond; - unsigned long cpsr = *vcpu_cpsr(vcpu); - bool is_arm = !(cpsr & COMPAT_PSR_T_BIT); - - if (is_arm || !(cpsr & COMPAT_PSR_IT_MASK)) - return; - - cond = (cpsr & 0xe000) >> 13; - itbits = (cpsr & 0x1c00) >> (10 - 2); - itbits |= (cpsr & (0x3 << 25)) >> 25; - - /* Perform ITAdvance (see page A2-52 in ARM DDI 0406C) */ - if ((itbits & 0x7) == 0) - itbits = cond = 0; - else - itbits = (itbits << 1) & 0x1f; - - cpsr &= ~COMPAT_PSR_IT_MASK; - cpsr |= cond << 13; - cpsr |= (itbits & 0x1c) << (10 - 2); - cpsr |= (itbits & 0x3) << 25; - *vcpu_cpsr(vcpu) = cpsr; -} - -/** - * kvm_skip_instr - skip a trapped instruction and proceed to the next - * @vcpu: The vcpu pointer - */ -void __hyp_text kvm_skip_instr32(struct kvm_vcpu *vcpu, bool is_wide_instr) -{ - bool is_thumb; - - is_thumb = !!(*vcpu_cpsr(vcpu) & COMPAT_PSR_T_BIT); - if (is_thumb && !is_wide_instr) - *vcpu_pc(vcpu) += 2; - else - *vcpu_pc(vcpu) += 4; - kvm_adjust_itstate(vcpu); -} diff --git a/virt/kvm/arm/arch_timer.c b/virt/kvm/arm/arch_timer.c deleted file mode 100644 index 27a1f63..0000000 --- a/virt/kvm/arm/arch_timer.c +++ /dev/null @@ -1,519 +0,0 @@ -/* - * Copyright (C) 2012 ARM Ltd. - * Author: Marc Zyngier <marc.zyngier@arm.com> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - */ - -#include <linux/cpu.h> -#include <linux/kvm.h> -#include <linux/kvm_host.h> -#include <linux/interrupt.h> -#include <linux/irq.h> - -#include <clocksource/arm_arch_timer.h> -#include <asm/arch_timer.h> - -#include <kvm/arm_vgic.h> -#include <kvm/arm_arch_timer.h> - -#include "trace.h" - -static struct timecounter *timecounter; -static unsigned int host_vtimer_irq; -static u32 host_vtimer_irq_flags; - -void kvm_timer_vcpu_put(struct kvm_vcpu *vcpu) -{ - vcpu->arch.timer_cpu.active_cleared_last = false; -} - -static cycle_t kvm_phys_timer_read(void) -{ - return timecounter->cc->read(timecounter->cc); -} - -static bool timer_is_armed(struct arch_timer_cpu *timer) -{ - return timer->armed; -} - -/* timer_arm: as in "arm the timer", not as in ARM the company */ -static void timer_arm(struct arch_timer_cpu *timer, u64 ns) -{ - timer->armed = true; - hrtimer_start(&timer->timer, ktime_add_ns(ktime_get(), ns), - HRTIMER_MODE_ABS); -} - -static void timer_disarm(struct arch_timer_cpu *timer) -{ - if (timer_is_armed(timer)) { - hrtimer_cancel(&timer->timer); - cancel_work_sync(&timer->expired); - timer->armed = false; - } -} - -static irqreturn_t kvm_arch_timer_handler(int irq, void *dev_id) -{ - struct kvm_vcpu *vcpu = *(struct kvm_vcpu **)dev_id; - - /* - * We disable the timer in the world switch and let it be - * handled by kvm_timer_sync_hwstate(). Getting a timer - * interrupt at this point is a sure sign of some major - * breakage. - */ - pr_warn("Unexpected interrupt %d on vcpu %p\n", irq, vcpu); - return IRQ_HANDLED; -} - -/* - * Work function for handling the backup timer that we schedule when a vcpu is - * no longer running, but had a timer programmed to fire in the future. - */ -static void kvm_timer_inject_irq_work(struct work_struct *work) -{ - struct kvm_vcpu *vcpu; - - vcpu = container_of(work, struct kvm_vcpu, arch.timer_cpu.expired); - vcpu->arch.timer_cpu.armed = false; - - WARN_ON(!kvm_timer_should_fire(vcpu)); - - /* - * If the vcpu is blocked we want to wake it up so that it will see - * the timer has expired when entering the guest. - */ - kvm_vcpu_kick(vcpu); -} - -static u64 kvm_timer_compute_delta(struct kvm_vcpu *vcpu) -{ - cycle_t cval, now; - - cval = vcpu->arch.timer_cpu.cntv_cval; - now = kvm_phys_timer_read() - vcpu->kvm->arch.timer.cntvoff; - - if (now < cval) { - u64 ns; - - ns = cyclecounter_cyc2ns(timecounter->cc, - cval - now, - timecounter->mask, - &timecounter->frac); - return ns; - } - - return 0; -} - -static enum hrtimer_restart kvm_timer_expire(struct hrtimer *hrt) -{ - struct arch_timer_cpu *timer; - struct kvm_vcpu *vcpu; - u64 ns; - - timer = container_of(hrt, struct arch_timer_cpu, timer); - vcpu = container_of(timer, struct kvm_vcpu, arch.timer_cpu); - - /* - * Check that the timer has really expired from the guest's - * PoV (NTP on the host may have forced it to expire - * early). If we should have slept longer, restart it. 
- */ - ns = kvm_timer_compute_delta(vcpu); - if (unlikely(ns)) { - hrtimer_forward_now(hrt, ns_to_ktime(ns)); - return HRTIMER_RESTART; - } - - schedule_work(&timer->expired); - return HRTIMER_NORESTART; -} - -static bool kvm_timer_irq_can_fire(struct kvm_vcpu *vcpu) -{ - struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; - - return !(timer->cntv_ctl & ARCH_TIMER_CTRL_IT_MASK) && - (timer->cntv_ctl & ARCH_TIMER_CTRL_ENABLE); -} - -bool kvm_timer_should_fire(struct kvm_vcpu *vcpu) -{ - struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; - cycle_t cval, now; - - if (!kvm_timer_irq_can_fire(vcpu)) - return false; - - cval = timer->cntv_cval; - now = kvm_phys_timer_read() - vcpu->kvm->arch.timer.cntvoff; - - return cval <= now; -} - -static void kvm_timer_update_irq(struct kvm_vcpu *vcpu, bool new_level) -{ - int ret; - struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; - - BUG_ON(!vgic_initialized(vcpu->kvm)); - - timer->active_cleared_last = false; - timer->irq.level = new_level; - trace_kvm_timer_update_irq(vcpu->vcpu_id, timer->irq.irq, - timer->irq.level); - ret = kvm_vgic_inject_mapped_irq(vcpu->kvm, vcpu->vcpu_id, - timer->irq.irq, - timer->irq.level); - WARN_ON(ret); -} - -/* - * Check if there was a change in the timer state (should we raise or lower - * the line level to the GIC). - */ -static int kvm_timer_update_state(struct kvm_vcpu *vcpu) -{ - struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; - - /* - * If userspace modified the timer registers via SET_ONE_REG before - * the vgic was initialized, we mustn't set the timer->irq.level value - * because the guest would never see the interrupt. Instead wait - * until we call this function from kvm_timer_flush_hwstate. - */ - if (!vgic_initialized(vcpu->kvm) || !timer->enabled) - return -ENODEV; - - if (kvm_timer_should_fire(vcpu) != timer->irq.level) - kvm_timer_update_irq(vcpu, !timer->irq.level); - - return 0; -} - -/* - * Schedule the background timer before calling kvm_vcpu_block, so that this - * thread is removed from its waitqueue and made runnable when there's a timer - * interrupt to handle. - */ -void kvm_timer_schedule(struct kvm_vcpu *vcpu) -{ - struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; - - BUG_ON(timer_is_armed(timer)); - - /* - * No need to schedule a background timer if the guest timer has - * already expired, because kvm_vcpu_block will return before putting - * the thread to sleep. - */ - if (kvm_timer_should_fire(vcpu)) - return; - - /* - * If the timer is not capable of raising interrupts (disabled or - * masked), then there's no more work for us to do. - */ - if (!kvm_timer_irq_can_fire(vcpu)) - return; - - /* The timer has not yet expired, schedule a background timer */ - timer_arm(timer, kvm_timer_compute_delta(vcpu)); -} - -void kvm_timer_unschedule(struct kvm_vcpu *vcpu) -{ - struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; - timer_disarm(timer); -} - -/** - * kvm_timer_flush_hwstate - prepare to move the virt timer to the cpu - * @vcpu: The vcpu pointer - * - * Check if the virtual timer has expired while we were running in the host, - * and inject an interrupt if that was the case. 
- */ -void kvm_timer_flush_hwstate(struct kvm_vcpu *vcpu) -{ - struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; - bool phys_active; - int ret; - - if (kvm_timer_update_state(vcpu)) - return; - - /* - * If we enter the guest with the virtual input level to the VGIC - * asserted, then we have already told the VGIC what we need to, and - * we don't need to exit from the guest until the guest deactivates - * the already injected interrupt, so therefore we should set the - * hardware active state to prevent unnecessary exits from the guest. - * - * Also, if we enter the guest with the virtual timer interrupt active, - * then it must be active on the physical distributor, because we set - * the HW bit and the guest must be able to deactivate the virtual and - * physical interrupt at the same time. - * - * Conversely, if the virtual input level is deasserted and the virtual - * interrupt is not active, then always clear the hardware active state - * to ensure that hardware interrupts from the timer triggers a guest - * exit. - */ - phys_active = timer->irq.level || - kvm_vgic_map_is_active(vcpu, timer->irq.irq); - - /* - * We want to avoid hitting the (re)distributor as much as - * possible, as this is a potentially expensive MMIO access - * (not to mention locks in the irq layer), and a solution for - * this is to cache the "active" state in memory. - * - * Things to consider: we cannot cache an "active set" state, - * because the HW can change this behind our back (it becomes - * "clear" in the HW). We must then restrict the caching to - * the "clear" state. - * - * The cache is invalidated on: - * - vcpu put, indicating that the HW cannot be trusted to be - * in a sane state on the next vcpu load, - * - any change in the interrupt state - * - * Usage conditions: - * - cached value is "active clear" - * - value to be programmed is "active clear" - */ - if (timer->active_cleared_last && !phys_active) - return; - - ret = irq_set_irqchip_state(host_vtimer_irq, - IRQCHIP_STATE_ACTIVE, - phys_active); - WARN_ON(ret); - - timer->active_cleared_last = !phys_active; -} - -/** - * kvm_timer_sync_hwstate - sync timer state from cpu - * @vcpu: The vcpu pointer - * - * Check if the virtual timer has expired while we were running in the guest, - * and inject an interrupt if that was the case. - */ -void kvm_timer_sync_hwstate(struct kvm_vcpu *vcpu) -{ - struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; - - BUG_ON(timer_is_armed(timer)); - - /* - * The guest could have modified the timer registers or the timer - * could have expired, update the timer state. - */ - kvm_timer_update_state(vcpu); -} - -int kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu, - const struct kvm_irq_level *irq) -{ - struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; - - /* - * The vcpu timer irq number cannot be determined in - * kvm_timer_vcpu_init() because it is called much before - * kvm_vcpu_set_target(). To handle this, we determine - * vcpu timer irq number when the vcpu is reset. - */ - timer->irq.irq = irq->irq; - - /* - * The bits in CNTV_CTL are architecturally reset to UNKNOWN for ARMv8 - * and to 0 for ARMv7. We provide an implementation that always - * resets the timer to be disabled and unmasked and is compliant with - * the ARMv7 architecture. 
- */ - timer->cntv_ctl = 0; - kvm_timer_update_state(vcpu); - - return 0; -} - -void kvm_timer_vcpu_init(struct kvm_vcpu *vcpu) -{ - struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; - - INIT_WORK(&timer->expired, kvm_timer_inject_irq_work); - hrtimer_init(&timer->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); - timer->timer.function = kvm_timer_expire; -} - -static void kvm_timer_init_interrupt(void *info) -{ - enable_percpu_irq(host_vtimer_irq, host_vtimer_irq_flags); -} - -int kvm_arm_timer_set_reg(struct kvm_vcpu *vcpu, u64 regid, u64 value) -{ - struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; - - switch (regid) { - case KVM_REG_ARM_TIMER_CTL: - timer->cntv_ctl = value; - break; - case KVM_REG_ARM_TIMER_CNT: - vcpu->kvm->arch.timer.cntvoff = kvm_phys_timer_read() - value; - break; - case KVM_REG_ARM_TIMER_CVAL: - timer->cntv_cval = value; - break; - default: - return -1; - } - - kvm_timer_update_state(vcpu); - return 0; -} - -u64 kvm_arm_timer_get_reg(struct kvm_vcpu *vcpu, u64 regid) -{ - struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; - - switch (regid) { - case KVM_REG_ARM_TIMER_CTL: - return timer->cntv_ctl; - case KVM_REG_ARM_TIMER_CNT: - return kvm_phys_timer_read() - vcpu->kvm->arch.timer.cntvoff; - case KVM_REG_ARM_TIMER_CVAL: - return timer->cntv_cval; - } - return (u64)-1; -} - -static int kvm_timer_starting_cpu(unsigned int cpu) -{ - kvm_timer_init_interrupt(NULL); - return 0; -} - -static int kvm_timer_dying_cpu(unsigned int cpu) -{ - disable_percpu_irq(host_vtimer_irq); - return 0; -} - -int kvm_timer_hyp_init(void) -{ - struct arch_timer_kvm_info *info; - int err; - - info = arch_timer_get_kvm_info(); - timecounter = &info->timecounter; - - if (info->virtual_irq <= 0) { - kvm_err("kvm_arch_timer: invalid virtual timer IRQ: %d\n", - info->virtual_irq); - return -ENODEV; - } - host_vtimer_irq = info->virtual_irq; - - host_vtimer_irq_flags = irq_get_trigger_type(host_vtimer_irq); - if (host_vtimer_irq_flags != IRQF_TRIGGER_HIGH && - host_vtimer_irq_flags != IRQF_TRIGGER_LOW) { - kvm_err("Invalid trigger for IRQ%d, assuming level low\n", - host_vtimer_irq); - host_vtimer_irq_flags = IRQF_TRIGGER_LOW; - } - - err = request_percpu_irq(host_vtimer_irq, kvm_arch_timer_handler, - "kvm guest timer", kvm_get_running_vcpus()); - if (err) { - kvm_err("kvm_arch_timer: can't request interrupt %d (%d)\n", - host_vtimer_irq, err); - return err; - } - - kvm_info("virtual timer IRQ%d\n", host_vtimer_irq); - - cpuhp_setup_state(CPUHP_AP_KVM_ARM_TIMER_STARTING, - "AP_KVM_ARM_TIMER_STARTING", kvm_timer_starting_cpu, - kvm_timer_dying_cpu); - return err; -} - -void kvm_timer_vcpu_terminate(struct kvm_vcpu *vcpu) -{ - struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; - - timer_disarm(timer); - kvm_vgic_unmap_phys_irq(vcpu, timer->irq.irq); -} - -int kvm_timer_enable(struct kvm_vcpu *vcpu) -{ - struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; - struct irq_desc *desc; - struct irq_data *data; - int phys_irq; - int ret; - - if (timer->enabled) - return 0; - - /* - * Find the physical IRQ number corresponding to the host_vtimer_irq - */ - desc = irq_to_desc(host_vtimer_irq); - if (!desc) { - kvm_err("%s: no interrupt descriptor\n", __func__); - return -EINVAL; - } - - data = irq_desc_get_irq_data(desc); - while (data->parent_data) - data = data->parent_data; - - phys_irq = data->hwirq; - - /* - * Tell the VGIC that the virtual interrupt is tied to a - * physical interrupt. We do that once per VCPU. 
- */ - ret = kvm_vgic_map_phys_irq(vcpu, timer->irq.irq, phys_irq); - if (ret) - return ret; - - - /* - * There is a potential race here between VCPUs starting for the first - * time, which may be enabling the timer multiple times. That doesn't - * hurt though, because we're just setting a variable to the same - * variable that it already was. The important thing is that all - * VCPUs have the enabled variable set, before entering the guest, if - * the arch timers are enabled. - */ - if (timecounter) - timer->enabled = 1; - - return 0; -} - -void kvm_timer_init(struct kvm *kvm) -{ - kvm->arch.timer.cntvoff = kvm_phys_timer_read(); -} diff --git a/virt/kvm/arm/hyp/timer-sr.c b/virt/kvm/arm/hyp/timer-sr.c deleted file mode 100644 index 798866a..0000000 --- a/virt/kvm/arm/hyp/timer-sr.c +++ /dev/null @@ -1,68 +0,0 @@ -/* - * Copyright (C) 2012-2015 - ARM Ltd - * Author: Marc Zyngier <marc.zyngier@arm.com> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - */ - -#include <clocksource/arm_arch_timer.h> -#include <linux/compiler.h> -#include <linux/kvm_host.h> - -#include <asm/kvm_hyp.h> - -/* vcpu is already in the HYP VA space */ -void __hyp_text __timer_save_state(struct kvm_vcpu *vcpu) -{ - struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; - u64 val; - - if (timer->enabled) { - timer->cntv_ctl = read_sysreg_el0(cntv_ctl); - timer->cntv_cval = read_sysreg_el0(cntv_cval); - } - - /* Disable the virtual timer */ - write_sysreg_el0(0, cntv_ctl); - - /* Allow physical timer/counter access for the host */ - val = read_sysreg(cnthctl_el2); - val |= CNTHCTL_EL1PCTEN | CNTHCTL_EL1PCEN; - write_sysreg(val, cnthctl_el2); - - /* Clear cntvoff for the host */ - write_sysreg(0, cntvoff_el2); -} - -void __hyp_text __timer_restore_state(struct kvm_vcpu *vcpu) -{ - struct kvm *kvm = kern_hyp_va(vcpu->kvm); - struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; - u64 val; - - /* - * Disallow physical timer access for the guest - * Physical counter access is allowed - */ - val = read_sysreg(cnthctl_el2); - val &= ~CNTHCTL_EL1PCEN; - val |= CNTHCTL_EL1PCTEN; - write_sysreg(val, cnthctl_el2); - - if (timer->enabled) { - write_sysreg(kvm->arch.timer.cntvoff, cntvoff_el2); - write_sysreg_el0(timer->cntv_cval, cntv_cval); - isb(); - write_sysreg_el0(timer->cntv_ctl, cntv_ctl); - } -} diff --git a/virt/kvm/arm/hyp/vgic-v2-sr.c b/virt/kvm/arm/hyp/vgic-v2-sr.c deleted file mode 100644 index c8aeb7b..0000000 --- a/virt/kvm/arm/hyp/vgic-v2-sr.c +++ /dev/null @@ -1,226 +0,0 @@ -/* - * Copyright (C) 2012-2015 - ARM Ltd - * Author: Marc Zyngier <marc.zyngier@arm.com> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - */ - -#include <linux/compiler.h> -#include <linux/irqchip/arm-gic.h> -#include <linux/kvm_host.h> - -#include <asm/kvm_emulate.h> -#include <asm/kvm_hyp.h> - -static void __hyp_text save_maint_int_state(struct kvm_vcpu *vcpu, - void __iomem *base) -{ - struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2; - int nr_lr = (kern_hyp_va(&kvm_vgic_global_state))->nr_lr; - u32 eisr0, eisr1; - int i; - bool expect_mi; - - expect_mi = !!(cpu_if->vgic_hcr & GICH_HCR_UIE); - - for (i = 0; i < nr_lr; i++) { - if (!(vcpu->arch.vgic_cpu.live_lrs & (1UL << i))) - continue; - - expect_mi |= (!(cpu_if->vgic_lr[i] & GICH_LR_HW) && - (cpu_if->vgic_lr[i] & GICH_LR_EOI)); - } - - if (expect_mi) { - cpu_if->vgic_misr = readl_relaxed(base + GICH_MISR); - - if (cpu_if->vgic_misr & GICH_MISR_EOI) { - eisr0 = readl_relaxed(base + GICH_EISR0); - if (unlikely(nr_lr > 32)) - eisr1 = readl_relaxed(base + GICH_EISR1); - else - eisr1 = 0; - } else { - eisr0 = eisr1 = 0; - } - } else { - cpu_if->vgic_misr = 0; - eisr0 = eisr1 = 0; - } - -#ifdef CONFIG_CPU_BIG_ENDIAN - cpu_if->vgic_eisr = ((u64)eisr0 << 32) | eisr1; -#else - cpu_if->vgic_eisr = ((u64)eisr1 << 32) | eisr0; -#endif -} - -static void __hyp_text save_elrsr(struct kvm_vcpu *vcpu, void __iomem *base) -{ - struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2; - int nr_lr = (kern_hyp_va(&kvm_vgic_global_state))->nr_lr; - u32 elrsr0, elrsr1; - - elrsr0 = readl_relaxed(base + GICH_ELRSR0); - if (unlikely(nr_lr > 32)) - elrsr1 = readl_relaxed(base + GICH_ELRSR1); - else - elrsr1 = 0; - -#ifdef CONFIG_CPU_BIG_ENDIAN - cpu_if->vgic_elrsr = ((u64)elrsr0 << 32) | elrsr1; -#else - cpu_if->vgic_elrsr = ((u64)elrsr1 << 32) | elrsr0; -#endif -} - -static void __hyp_text save_lrs(struct kvm_vcpu *vcpu, void __iomem *base) -{ - struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2; - int nr_lr = (kern_hyp_va(&kvm_vgic_global_state))->nr_lr; - int i; - - for (i = 0; i < nr_lr; i++) { - if (!(vcpu->arch.vgic_cpu.live_lrs & (1UL << i))) - continue; - - if (cpu_if->vgic_elrsr & (1UL << i)) - cpu_if->vgic_lr[i] &= ~GICH_LR_STATE; - else - cpu_if->vgic_lr[i] = readl_relaxed(base + GICH_LR0 + (i * 4)); - - writel_relaxed(0, base + GICH_LR0 + (i * 4)); - } -} - -/* vcpu is already in the HYP VA space */ -void __hyp_text __vgic_v2_save_state(struct kvm_vcpu *vcpu) -{ - struct kvm *kvm = kern_hyp_va(vcpu->kvm); - struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2; - struct vgic_dist *vgic = &kvm->arch.vgic; - void __iomem *base = kern_hyp_va(vgic->vctrl_base); - - if (!base) - return; - - cpu_if->vgic_vmcr = readl_relaxed(base + GICH_VMCR); - - if (vcpu->arch.vgic_cpu.live_lrs) { - cpu_if->vgic_apr = readl_relaxed(base + GICH_APR); - - save_maint_int_state(vcpu, base); - save_elrsr(vcpu, base); - save_lrs(vcpu, base); - - writel_relaxed(0, base + GICH_HCR); - - vcpu->arch.vgic_cpu.live_lrs = 0; - } else { - cpu_if->vgic_eisr = 0; - cpu_if->vgic_elrsr = ~0UL; - cpu_if->vgic_misr = 0; - cpu_if->vgic_apr = 0; - } -} - -/* vcpu is already in the HYP VA space */ -void __hyp_text __vgic_v2_restore_state(struct kvm_vcpu *vcpu) -{ - struct kvm *kvm = kern_hyp_va(vcpu->kvm); - struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2; - struct vgic_dist *vgic = &kvm->arch.vgic; - void __iomem *base = kern_hyp_va(vgic->vctrl_base); - int nr_lr = 
(kern_hyp_va(&kvm_vgic_global_state))->nr_lr; - int i; - u64 live_lrs = 0; - - if (!base) - return; - - - for (i = 0; i < nr_lr; i++) - if (cpu_if->vgic_lr[i] & GICH_LR_STATE) - live_lrs |= 1UL << i; - - if (live_lrs) { - writel_relaxed(cpu_if->vgic_hcr, base + GICH_HCR); - writel_relaxed(cpu_if->vgic_apr, base + GICH_APR); - for (i = 0; i < nr_lr; i++) { - if (!(live_lrs & (1UL << i))) - continue; - - writel_relaxed(cpu_if->vgic_lr[i], - base + GICH_LR0 + (i * 4)); - } - } - - writel_relaxed(cpu_if->vgic_vmcr, base + GICH_VMCR); - vcpu->arch.vgic_cpu.live_lrs = live_lrs; -} - -#ifdef CONFIG_ARM64 -/* - * __vgic_v2_perform_cpuif_access -- perform a GICV access on behalf of the - * guest. - * - * @vcpu: the offending vcpu - * - * Returns: - * 1: GICV access successfully performed - * 0: Not a GICV access - * -1: Illegal GICV access - */ -int __hyp_text __vgic_v2_perform_cpuif_access(struct kvm_vcpu *vcpu) -{ - struct kvm *kvm = kern_hyp_va(vcpu->kvm); - struct vgic_dist *vgic = &kvm->arch.vgic; - phys_addr_t fault_ipa; - void __iomem *addr; - int rd; - - /* Build the full address */ - fault_ipa = kvm_vcpu_get_fault_ipa(vcpu); - fault_ipa |= kvm_vcpu_get_hfar(vcpu) & GENMASK(11, 0); - - /* If not for GICV, move on */ - if (fault_ipa < vgic->vgic_cpu_base || - fault_ipa >= (vgic->vgic_cpu_base + KVM_VGIC_V2_CPU_SIZE)) - return 0; - - /* Reject anything but a 32bit access */ - if (kvm_vcpu_dabt_get_as(vcpu) != sizeof(u32)) - return -1; - - /* Not aligned? Don't bother */ - if (fault_ipa & 3) - return -1; - - rd = kvm_vcpu_dabt_get_rd(vcpu); - addr = kern_hyp_va((kern_hyp_va(&kvm_vgic_global_state))->vcpu_base_va); - addr += fault_ipa - vgic->vgic_cpu_base; - - if (kvm_vcpu_dabt_iswrite(vcpu)) { - u32 data = vcpu_data_guest_to_host(vcpu, - vcpu_get_reg(vcpu, rd), - sizeof(u32)); - writel_relaxed(data, addr); - } else { - u32 data = readl_relaxed(addr); - vcpu_set_reg(vcpu, rd, vcpu_data_host_to_guest(vcpu, data, - sizeof(u32))); - } - - return 1; -} -#endif diff --git a/virt/kvm/arm/hyp/vgic-v3-sr.c b/virt/kvm/arm/hyp/vgic-v3-sr.c deleted file mode 100644 index 3947095..0000000 --- a/virt/kvm/arm/hyp/vgic-v3-sr.c +++ /dev/null @@ -1,328 +0,0 @@ -/* - * Copyright (C) 2012-2015 - ARM Ltd - * Author: Marc Zyngier <marc.zyngier@arm.com> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. 
- */ - -#include <linux/compiler.h> -#include <linux/irqchip/arm-gic-v3.h> -#include <linux/kvm_host.h> - -#include <asm/kvm_hyp.h> - -#define vtr_to_max_lr_idx(v) ((v) & 0xf) -#define vtr_to_nr_pri_bits(v) (((u32)(v) >> 29) + 1) - -static u64 __hyp_text __gic_v3_get_lr(unsigned int lr) -{ - switch (lr & 0xf) { - case 0: - return read_gicreg(ICH_LR0_EL2); - case 1: - return read_gicreg(ICH_LR1_EL2); - case 2: - return read_gicreg(ICH_LR2_EL2); - case 3: - return read_gicreg(ICH_LR3_EL2); - case 4: - return read_gicreg(ICH_LR4_EL2); - case 5: - return read_gicreg(ICH_LR5_EL2); - case 6: - return read_gicreg(ICH_LR6_EL2); - case 7: - return read_gicreg(ICH_LR7_EL2); - case 8: - return read_gicreg(ICH_LR8_EL2); - case 9: - return read_gicreg(ICH_LR9_EL2); - case 10: - return read_gicreg(ICH_LR10_EL2); - case 11: - return read_gicreg(ICH_LR11_EL2); - case 12: - return read_gicreg(ICH_LR12_EL2); - case 13: - return read_gicreg(ICH_LR13_EL2); - case 14: - return read_gicreg(ICH_LR14_EL2); - case 15: - return read_gicreg(ICH_LR15_EL2); - } - - unreachable(); -} - -static void __hyp_text __gic_v3_set_lr(u64 val, int lr) -{ - switch (lr & 0xf) { - case 0: - write_gicreg(val, ICH_LR0_EL2); - break; - case 1: - write_gicreg(val, ICH_LR1_EL2); - break; - case 2: - write_gicreg(val, ICH_LR2_EL2); - break; - case 3: - write_gicreg(val, ICH_LR3_EL2); - break; - case 4: - write_gicreg(val, ICH_LR4_EL2); - break; - case 5: - write_gicreg(val, ICH_LR5_EL2); - break; - case 6: - write_gicreg(val, ICH_LR6_EL2); - break; - case 7: - write_gicreg(val, ICH_LR7_EL2); - break; - case 8: - write_gicreg(val, ICH_LR8_EL2); - break; - case 9: - write_gicreg(val, ICH_LR9_EL2); - break; - case 10: - write_gicreg(val, ICH_LR10_EL2); - break; - case 11: - write_gicreg(val, ICH_LR11_EL2); - break; - case 12: - write_gicreg(val, ICH_LR12_EL2); - break; - case 13: - write_gicreg(val, ICH_LR13_EL2); - break; - case 14: - write_gicreg(val, ICH_LR14_EL2); - break; - case 15: - write_gicreg(val, ICH_LR15_EL2); - break; - } -} - -static void __hyp_text save_maint_int_state(struct kvm_vcpu *vcpu, int nr_lr) -{ - struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3; - int i; - bool expect_mi; - - expect_mi = !!(cpu_if->vgic_hcr & ICH_HCR_UIE); - - for (i = 0; i < nr_lr; i++) { - if (!(vcpu->arch.vgic_cpu.live_lrs & (1UL << i))) - continue; - - expect_mi |= (!(cpu_if->vgic_lr[i] & ICH_LR_HW) && - (cpu_if->vgic_lr[i] & ICH_LR_EOI)); - } - - if (expect_mi) { - cpu_if->vgic_misr = read_gicreg(ICH_MISR_EL2); - - if (cpu_if->vgic_misr & ICH_MISR_EOI) - cpu_if->vgic_eisr = read_gicreg(ICH_EISR_EL2); - else - cpu_if->vgic_eisr = 0; - } else { - cpu_if->vgic_misr = 0; - cpu_if->vgic_eisr = 0; - } -} - -void __hyp_text __vgic_v3_save_state(struct kvm_vcpu *vcpu) -{ - struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3; - u64 val; - - /* - * Make sure stores to the GIC via the memory mapped interface - * are now visible to the system register interface. 
- */ - if (!cpu_if->vgic_sre) - dsb(st); - - cpu_if->vgic_vmcr = read_gicreg(ICH_VMCR_EL2); - - if (vcpu->arch.vgic_cpu.live_lrs) { - int i; - u32 max_lr_idx, nr_pri_bits; - - cpu_if->vgic_elrsr = read_gicreg(ICH_ELSR_EL2); - - write_gicreg(0, ICH_HCR_EL2); - val = read_gicreg(ICH_VTR_EL2); - max_lr_idx = vtr_to_max_lr_idx(val); - nr_pri_bits = vtr_to_nr_pri_bits(val); - - save_maint_int_state(vcpu, max_lr_idx + 1); - - for (i = 0; i <= max_lr_idx; i++) { - if (!(vcpu->arch.vgic_cpu.live_lrs & (1UL << i))) - continue; - - if (cpu_if->vgic_elrsr & (1 << i)) - cpu_if->vgic_lr[i] &= ~ICH_LR_STATE; - else - cpu_if->vgic_lr[i] = __gic_v3_get_lr(i); - - __gic_v3_set_lr(0, i); - } - - switch (nr_pri_bits) { - case 7: - cpu_if->vgic_ap0r[3] = read_gicreg(ICH_AP0R3_EL2); - cpu_if->vgic_ap0r[2] = read_gicreg(ICH_AP0R2_EL2); - case 6: - cpu_if->vgic_ap0r[1] = read_gicreg(ICH_AP0R1_EL2); - default: - cpu_if->vgic_ap0r[0] = read_gicreg(ICH_AP0R0_EL2); - } - - switch (nr_pri_bits) { - case 7: - cpu_if->vgic_ap1r[3] = read_gicreg(ICH_AP1R3_EL2); - cpu_if->vgic_ap1r[2] = read_gicreg(ICH_AP1R2_EL2); - case 6: - cpu_if->vgic_ap1r[1] = read_gicreg(ICH_AP1R1_EL2); - default: - cpu_if->vgic_ap1r[0] = read_gicreg(ICH_AP1R0_EL2); - } - - vcpu->arch.vgic_cpu.live_lrs = 0; - } else { - cpu_if->vgic_misr = 0; - cpu_if->vgic_eisr = 0; - cpu_if->vgic_elrsr = 0xffff; - cpu_if->vgic_ap0r[0] = 0; - cpu_if->vgic_ap0r[1] = 0; - cpu_if->vgic_ap0r[2] = 0; - cpu_if->vgic_ap0r[3] = 0; - cpu_if->vgic_ap1r[0] = 0; - cpu_if->vgic_ap1r[1] = 0; - cpu_if->vgic_ap1r[2] = 0; - cpu_if->vgic_ap1r[3] = 0; - } - - val = read_gicreg(ICC_SRE_EL2); - write_gicreg(val | ICC_SRE_EL2_ENABLE, ICC_SRE_EL2); - - if (!cpu_if->vgic_sre) { - /* Make sure ENABLE is set at EL2 before setting SRE at EL1 */ - isb(); - write_gicreg(1, ICC_SRE_EL1); - } -} - -void __hyp_text __vgic_v3_restore_state(struct kvm_vcpu *vcpu) -{ - struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3; - u64 val; - u32 max_lr_idx, nr_pri_bits; - u16 live_lrs = 0; - int i; - - /* - * VFIQEn is RES1 if ICC_SRE_EL1.SRE is 1. This causes a - * Group0 interrupt (as generated in GICv2 mode) to be - * delivered as a FIQ to the guest, with potentially fatal - * consequences. So we must make sure that ICC_SRE_EL1 has - * been actually programmed with the value we want before - * starting to mess with the rest of the GIC. - */ - if (!cpu_if->vgic_sre) { - write_gicreg(0, ICC_SRE_EL1); - isb(); - } - - val = read_gicreg(ICH_VTR_EL2); - max_lr_idx = vtr_to_max_lr_idx(val); - nr_pri_bits = vtr_to_nr_pri_bits(val); - - for (i = 0; i <= max_lr_idx; i++) { - if (cpu_if->vgic_lr[i] & ICH_LR_STATE) - live_lrs |= (1 << i); - } - - write_gicreg(cpu_if->vgic_vmcr, ICH_VMCR_EL2); - - if (live_lrs) { - write_gicreg(cpu_if->vgic_hcr, ICH_HCR_EL2); - - switch (nr_pri_bits) { - case 7: - write_gicreg(cpu_if->vgic_ap0r[3], ICH_AP0R3_EL2); - write_gicreg(cpu_if->vgic_ap0r[2], ICH_AP0R2_EL2); - case 6: - write_gicreg(cpu_if->vgic_ap0r[1], ICH_AP0R1_EL2); - default: - write_gicreg(cpu_if->vgic_ap0r[0], ICH_AP0R0_EL2); - } - - switch (nr_pri_bits) { - case 7: - write_gicreg(cpu_if->vgic_ap1r[3], ICH_AP1R3_EL2); - write_gicreg(cpu_if->vgic_ap1r[2], ICH_AP1R2_EL2); - case 6: - write_gicreg(cpu_if->vgic_ap1r[1], ICH_AP1R1_EL2); - default: - write_gicreg(cpu_if->vgic_ap1r[0], ICH_AP1R0_EL2); - } - - for (i = 0; i <= max_lr_idx; i++) { - if (!(live_lrs & (1 << i))) - continue; - - __gic_v3_set_lr(cpu_if->vgic_lr[i], i); - } - } - - /* - * Ensures that the above will have reached the - * (re)distributors. 
This ensure the guest will read the - * correct values from the memory-mapped interface. - */ - if (!cpu_if->vgic_sre) { - isb(); - dsb(sy); - } - vcpu->arch.vgic_cpu.live_lrs = live_lrs; - - /* - * Prevent the guest from touching the GIC system registers if - * SRE isn't enabled for GICv3 emulation. - */ - write_gicreg(read_gicreg(ICC_SRE_EL2) & ~ICC_SRE_EL2_ENABLE, - ICC_SRE_EL2); -} - -void __hyp_text __vgic_v3_init_lrs(void) -{ - int max_lr_idx = vtr_to_max_lr_idx(read_gicreg(ICH_VTR_EL2)); - int i; - - for (i = 0; i <= max_lr_idx; i++) - __gic_v3_set_lr(0, i); -} - -u64 __hyp_text __vgic_v3_get_ich_vtr_el2(void) -{ - return read_gicreg(ICH_VTR_EL2); -} diff --git a/virt/kvm/arm/pmu.c b/virt/kvm/arm/pmu.c deleted file mode 100644 index 69ccce3..0000000 --- a/virt/kvm/arm/pmu.c +++ /dev/null @@ -1,543 +0,0 @@ -/* - * Copyright (C) 2015 Linaro Ltd. - * Author: Shannon Zhao <shannon.zhao@linaro.org> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - */ - -#include <linux/cpu.h> -#include <linux/kvm.h> -#include <linux/kvm_host.h> -#include <linux/perf_event.h> -#include <linux/uaccess.h> -#include <asm/kvm_emulate.h> -#include <kvm/arm_pmu.h> -#include <kvm/arm_vgic.h> - -/** - * kvm_pmu_get_counter_value - get PMU counter value - * @vcpu: The vcpu pointer - * @select_idx: The counter index - */ -u64 kvm_pmu_get_counter_value(struct kvm_vcpu *vcpu, u64 select_idx) -{ - u64 counter, reg, enabled, running; - struct kvm_pmu *pmu = &vcpu->arch.pmu; - struct kvm_pmc *pmc = &pmu->pmc[select_idx]; - - reg = (select_idx == ARMV8_PMU_CYCLE_IDX) - ? PMCCNTR_EL0 : PMEVCNTR0_EL0 + select_idx; - counter = vcpu_sys_reg(vcpu, reg); - - /* The real counter value is equal to the value of counter register plus - * the value perf event counts. - */ - if (pmc->perf_event) - counter += perf_event_read_value(pmc->perf_event, &enabled, - &running); - - return counter & pmc->bitmask; -} - -/** - * kvm_pmu_set_counter_value - set PMU counter value - * @vcpu: The vcpu pointer - * @select_idx: The counter index - * @val: The counter value - */ -void kvm_pmu_set_counter_value(struct kvm_vcpu *vcpu, u64 select_idx, u64 val) -{ - u64 reg; - - reg = (select_idx == ARMV8_PMU_CYCLE_IDX) - ? PMCCNTR_EL0 : PMEVCNTR0_EL0 + select_idx; - vcpu_sys_reg(vcpu, reg) += (s64)val - kvm_pmu_get_counter_value(vcpu, select_idx); -} - -/** - * kvm_pmu_stop_counter - stop PMU counter - * @pmc: The PMU counter pointer - * - * If this counter has been configured to monitor some event, release it here. - */ -static void kvm_pmu_stop_counter(struct kvm_vcpu *vcpu, struct kvm_pmc *pmc) -{ - u64 counter, reg; - - if (pmc->perf_event) { - counter = kvm_pmu_get_counter_value(vcpu, pmc->idx); - reg = (pmc->idx == ARMV8_PMU_CYCLE_IDX) - ? 
PMCCNTR_EL0 : PMEVCNTR0_EL0 + pmc->idx; - vcpu_sys_reg(vcpu, reg) = counter; - perf_event_disable(pmc->perf_event); - perf_event_release_kernel(pmc->perf_event); - pmc->perf_event = NULL; - } -} - -/** - * kvm_pmu_vcpu_reset - reset pmu state for cpu - * @vcpu: The vcpu pointer - * - */ -void kvm_pmu_vcpu_reset(struct kvm_vcpu *vcpu) -{ - int i; - struct kvm_pmu *pmu = &vcpu->arch.pmu; - - for (i = 0; i < ARMV8_PMU_MAX_COUNTERS; i++) { - kvm_pmu_stop_counter(vcpu, &pmu->pmc[i]); - pmu->pmc[i].idx = i; - pmu->pmc[i].bitmask = 0xffffffffUL; - } -} - -/** - * kvm_pmu_vcpu_destroy - free perf event of PMU for cpu - * @vcpu: The vcpu pointer - * - */ -void kvm_pmu_vcpu_destroy(struct kvm_vcpu *vcpu) -{ - int i; - struct kvm_pmu *pmu = &vcpu->arch.pmu; - - for (i = 0; i < ARMV8_PMU_MAX_COUNTERS; i++) { - struct kvm_pmc *pmc = &pmu->pmc[i]; - - if (pmc->perf_event) { - perf_event_disable(pmc->perf_event); - perf_event_release_kernel(pmc->perf_event); - pmc->perf_event = NULL; - } - } -} - -u64 kvm_pmu_valid_counter_mask(struct kvm_vcpu *vcpu) -{ - u64 val = vcpu_sys_reg(vcpu, PMCR_EL0) >> ARMV8_PMU_PMCR_N_SHIFT; - - val &= ARMV8_PMU_PMCR_N_MASK; - if (val == 0) - return BIT(ARMV8_PMU_CYCLE_IDX); - else - return GENMASK(val - 1, 0) | BIT(ARMV8_PMU_CYCLE_IDX); -} - -/** - * kvm_pmu_enable_counter - enable selected PMU counter - * @vcpu: The vcpu pointer - * @val: the value guest writes to PMCNTENSET register - * - * Call perf_event_enable to start counting the perf event - */ -void kvm_pmu_enable_counter(struct kvm_vcpu *vcpu, u64 val) -{ - int i; - struct kvm_pmu *pmu = &vcpu->arch.pmu; - struct kvm_pmc *pmc; - - if (!(vcpu_sys_reg(vcpu, PMCR_EL0) & ARMV8_PMU_PMCR_E) || !val) - return; - - for (i = 0; i < ARMV8_PMU_MAX_COUNTERS; i++) { - if (!(val & BIT(i))) - continue; - - pmc = &pmu->pmc[i]; - if (pmc->perf_event) { - perf_event_enable(pmc->perf_event); - if (pmc->perf_event->state != PERF_EVENT_STATE_ACTIVE) - kvm_debug("fail to enable perf event\n"); - } - } -} - -/** - * kvm_pmu_disable_counter - disable selected PMU counter - * @vcpu: The vcpu pointer - * @val: the value guest writes to PMCNTENCLR register - * - * Call perf_event_disable to stop counting the perf event - */ -void kvm_pmu_disable_counter(struct kvm_vcpu *vcpu, u64 val) -{ - int i; - struct kvm_pmu *pmu = &vcpu->arch.pmu; - struct kvm_pmc *pmc; - - if (!val) - return; - - for (i = 0; i < ARMV8_PMU_MAX_COUNTERS; i++) { - if (!(val & BIT(i))) - continue; - - pmc = &pmu->pmc[i]; - if (pmc->perf_event) - perf_event_disable(pmc->perf_event); - } -} - -static u64 kvm_pmu_overflow_status(struct kvm_vcpu *vcpu) -{ - u64 reg = 0; - - if ((vcpu_sys_reg(vcpu, PMCR_EL0) & ARMV8_PMU_PMCR_E)) { - reg = vcpu_sys_reg(vcpu, PMOVSSET_EL0); - reg &= vcpu_sys_reg(vcpu, PMCNTENSET_EL0); - reg &= vcpu_sys_reg(vcpu, PMINTENSET_EL1); - reg &= kvm_pmu_valid_counter_mask(vcpu); - } - - return reg; -} - -/** - * kvm_pmu_overflow_set - set PMU overflow interrupt - * @vcpu: The vcpu pointer - * @val: the value guest writes to PMOVSSET register - */ -void kvm_pmu_overflow_set(struct kvm_vcpu *vcpu, u64 val) -{ - u64 reg; - - if (val == 0) - return; - - vcpu_sys_reg(vcpu, PMOVSSET_EL0) |= val; - reg = kvm_pmu_overflow_status(vcpu); - if (reg != 0) - kvm_vcpu_kick(vcpu); -} - -static void kvm_pmu_update_state(struct kvm_vcpu *vcpu) -{ - struct kvm_pmu *pmu = &vcpu->arch.pmu; - bool overflow; - - if (!kvm_arm_pmu_v3_ready(vcpu)) - return; - - overflow = !!kvm_pmu_overflow_status(vcpu); - if (pmu->irq_level != overflow) { - pmu->irq_level = overflow; - 
kvm_vgic_inject_irq(vcpu->kvm, vcpu->vcpu_id, - pmu->irq_num, overflow); - } -} - -/** - * kvm_pmu_flush_hwstate - flush pmu state to cpu - * @vcpu: The vcpu pointer - * - * Check if the PMU has overflowed while we were running in the host, and inject - * an interrupt if that was the case. - */ -void kvm_pmu_flush_hwstate(struct kvm_vcpu *vcpu) -{ - kvm_pmu_update_state(vcpu); -} - -/** - * kvm_pmu_sync_hwstate - sync pmu state from cpu - * @vcpu: The vcpu pointer - * - * Check if the PMU has overflowed while we were running in the guest, and - * inject an interrupt if that was the case. - */ -void kvm_pmu_sync_hwstate(struct kvm_vcpu *vcpu) -{ - kvm_pmu_update_state(vcpu); -} - -static inline struct kvm_vcpu *kvm_pmc_to_vcpu(struct kvm_pmc *pmc) -{ - struct kvm_pmu *pmu; - struct kvm_vcpu_arch *vcpu_arch; - - pmc -= pmc->idx; - pmu = container_of(pmc, struct kvm_pmu, pmc[0]); - vcpu_arch = container_of(pmu, struct kvm_vcpu_arch, pmu); - return container_of(vcpu_arch, struct kvm_vcpu, arch); -} - -/** - * When perf event overflows, call kvm_pmu_overflow_set to set overflow status. - */ -static void kvm_pmu_perf_overflow(struct perf_event *perf_event, - struct perf_sample_data *data, - struct pt_regs *regs) -{ - struct kvm_pmc *pmc = perf_event->overflow_handler_context; - struct kvm_vcpu *vcpu = kvm_pmc_to_vcpu(pmc); - int idx = pmc->idx; - - kvm_pmu_overflow_set(vcpu, BIT(idx)); -} - -/** - * kvm_pmu_software_increment - do software increment - * @vcpu: The vcpu pointer - * @val: the value guest writes to PMSWINC register - */ -void kvm_pmu_software_increment(struct kvm_vcpu *vcpu, u64 val) -{ - int i; - u64 type, enable, reg; - - if (val == 0) - return; - - enable = vcpu_sys_reg(vcpu, PMCNTENSET_EL0); - for (i = 0; i < ARMV8_PMU_CYCLE_IDX; i++) { - if (!(val & BIT(i))) - continue; - type = vcpu_sys_reg(vcpu, PMEVTYPER0_EL0 + i) - & ARMV8_PMU_EVTYPE_EVENT; - if ((type == ARMV8_PMUV3_PERFCTR_SW_INCR) - && (enable & BIT(i))) { - reg = vcpu_sys_reg(vcpu, PMEVCNTR0_EL0 + i) + 1; - reg = lower_32_bits(reg); - vcpu_sys_reg(vcpu, PMEVCNTR0_EL0 + i) = reg; - if (!reg) - kvm_pmu_overflow_set(vcpu, BIT(i)); - } - } -} - -/** - * kvm_pmu_handle_pmcr - handle PMCR register - * @vcpu: The vcpu pointer - * @val: the value guest writes to PMCR register - */ -void kvm_pmu_handle_pmcr(struct kvm_vcpu *vcpu, u64 val) -{ - struct kvm_pmu *pmu = &vcpu->arch.pmu; - struct kvm_pmc *pmc; - u64 mask; - int i; - - mask = kvm_pmu_valid_counter_mask(vcpu); - if (val & ARMV8_PMU_PMCR_E) { - kvm_pmu_enable_counter(vcpu, - vcpu_sys_reg(vcpu, PMCNTENSET_EL0) & mask); - } else { - kvm_pmu_disable_counter(vcpu, mask); - } - - if (val & ARMV8_PMU_PMCR_C) - kvm_pmu_set_counter_value(vcpu, ARMV8_PMU_CYCLE_IDX, 0); - - if (val & ARMV8_PMU_PMCR_P) { - for (i = 0; i < ARMV8_PMU_CYCLE_IDX; i++) - kvm_pmu_set_counter_value(vcpu, i, 0); - } - - if (val & ARMV8_PMU_PMCR_LC) { - pmc = &pmu->pmc[ARMV8_PMU_CYCLE_IDX]; - pmc->bitmask = 0xffffffffffffffffUL; - } -} - -static bool kvm_pmu_counter_is_enabled(struct kvm_vcpu *vcpu, u64 select_idx) -{ - return (vcpu_sys_reg(vcpu, PMCR_EL0) & ARMV8_PMU_PMCR_E) && - (vcpu_sys_reg(vcpu, PMCNTENSET_EL0) & BIT(select_idx)); -} - -/** - * kvm_pmu_set_counter_event_type - set selected counter to monitor some event - * @vcpu: The vcpu pointer - * @data: The data guest writes to PMXEVTYPER_EL0 - * @select_idx: The number of selected counter - * - * When OS accesses PMXEVTYPER_EL0, that means it wants to set a PMC to count an - * event with given hardware event number. 
Here we call perf_event API to - * emulate this action and create a kernel perf event for it. - */ -void kvm_pmu_set_counter_event_type(struct kvm_vcpu *vcpu, u64 data, - u64 select_idx) -{ - struct kvm_pmu *pmu = &vcpu->arch.pmu; - struct kvm_pmc *pmc = &pmu->pmc[select_idx]; - struct perf_event *event; - struct perf_event_attr attr; - u64 eventsel, counter; - - kvm_pmu_stop_counter(vcpu, pmc); - eventsel = data & ARMV8_PMU_EVTYPE_EVENT; - - /* Software increment event does't need to be backed by a perf event */ - if (eventsel == ARMV8_PMUV3_PERFCTR_SW_INCR && - select_idx != ARMV8_PMU_CYCLE_IDX) - return; - - memset(&attr, 0, sizeof(struct perf_event_attr)); - attr.type = PERF_TYPE_RAW; - attr.size = sizeof(attr); - attr.pinned = 1; - attr.disabled = !kvm_pmu_counter_is_enabled(vcpu, select_idx); - attr.exclude_user = data & ARMV8_PMU_EXCLUDE_EL0 ? 1 : 0; - attr.exclude_kernel = data & ARMV8_PMU_EXCLUDE_EL1 ? 1 : 0; - attr.exclude_hv = 1; /* Don't count EL2 events */ - attr.exclude_host = 1; /* Don't count host events */ - attr.config = (select_idx == ARMV8_PMU_CYCLE_IDX) ? - ARMV8_PMUV3_PERFCTR_CPU_CYCLES : eventsel; - - counter = kvm_pmu_get_counter_value(vcpu, select_idx); - /* The initial sample period (overflow count) of an event. */ - attr.sample_period = (-counter) & pmc->bitmask; - - event = perf_event_create_kernel_counter(&attr, -1, current, - kvm_pmu_perf_overflow, pmc); - if (IS_ERR(event)) { - pr_err_once("kvm: pmu event creation failed %ld\n", - PTR_ERR(event)); - return; - } - - pmc->perf_event = event; -} - -bool kvm_arm_support_pmu_v3(void) -{ - /* - * Check if HW_PERF_EVENTS are supported by checking the number of - * hardware performance counters. This could ensure the presence of - * a physical PMU and CONFIG_PERF_EVENT is selected. - */ - return (perf_num_counters() > 0); -} - -static int kvm_arm_pmu_v3_init(struct kvm_vcpu *vcpu) -{ - if (!kvm_arm_support_pmu_v3()) - return -ENODEV; - - /* - * We currently require an in-kernel VGIC to use the PMU emulation, - * because we do not support forwarding PMU overflow interrupts to - * userspace yet. - */ - if (!irqchip_in_kernel(vcpu->kvm) || !vgic_initialized(vcpu->kvm)) - return -ENODEV; - - if (!test_bit(KVM_ARM_VCPU_PMU_V3, vcpu->arch.features) || - !kvm_arm_pmu_irq_initialized(vcpu)) - return -ENXIO; - - if (kvm_arm_pmu_v3_ready(vcpu)) - return -EBUSY; - - kvm_pmu_vcpu_reset(vcpu); - vcpu->arch.pmu.ready = true; - - return 0; -} - -#define irq_is_ppi(irq) ((irq) >= VGIC_NR_SGIS && (irq) < VGIC_NR_PRIVATE_IRQS) - -/* - * For one VM the interrupt type must be same for each vcpu. - * As a PPI, the interrupt number is the same for all vcpus, - * while as an SPI it must be a separate number per vcpu. - */ -static bool pmu_irq_is_valid(struct kvm *kvm, int irq) -{ - int i; - struct kvm_vcpu *vcpu; - - kvm_for_each_vcpu(i, vcpu, kvm) { - if (!kvm_arm_pmu_irq_initialized(vcpu)) - continue; - - if (irq_is_ppi(irq)) { - if (vcpu->arch.pmu.irq_num != irq) - return false; - } else { - if (vcpu->arch.pmu.irq_num == irq) - return false; - } - } - - return true; -} - -int kvm_arm_pmu_v3_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr) -{ - switch (attr->attr) { - case KVM_ARM_VCPU_PMU_V3_IRQ: { - int __user *uaddr = (int __user *)(long)attr->addr; - int irq; - - if (!test_bit(KVM_ARM_VCPU_PMU_V3, vcpu->arch.features)) - return -ENODEV; - - if (get_user(irq, uaddr)) - return -EFAULT; - - /* The PMU overflow interrupt can be a PPI or a valid SPI. 
*/ - if (!(irq_is_ppi(irq) || vgic_valid_spi(vcpu->kvm, irq))) - return -EINVAL; - - if (!pmu_irq_is_valid(vcpu->kvm, irq)) - return -EINVAL; - - if (kvm_arm_pmu_irq_initialized(vcpu)) - return -EBUSY; - - kvm_debug("Set kvm ARM PMU irq: %d\n", irq); - vcpu->arch.pmu.irq_num = irq; - return 0; - } - case KVM_ARM_VCPU_PMU_V3_INIT: - return kvm_arm_pmu_v3_init(vcpu); - } - - return -ENXIO; -} - -int kvm_arm_pmu_v3_get_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr) -{ - switch (attr->attr) { - case KVM_ARM_VCPU_PMU_V3_IRQ: { - int __user *uaddr = (int __user *)(long)attr->addr; - int irq; - - if (!test_bit(KVM_ARM_VCPU_PMU_V3, vcpu->arch.features)) - return -ENODEV; - - if (!kvm_arm_pmu_irq_initialized(vcpu)) - return -ENXIO; - - irq = vcpu->arch.pmu.irq_num; - return put_user(irq, uaddr); - } - } - - return -ENXIO; -} - -int kvm_arm_pmu_v3_has_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr) -{ - switch (attr->attr) { - case KVM_ARM_VCPU_PMU_V3_IRQ: - case KVM_ARM_VCPU_PMU_V3_INIT: - if (kvm_arm_support_pmu_v3() && - test_bit(KVM_ARM_VCPU_PMU_V3, vcpu->arch.features)) - return 0; - } - - return -ENXIO; -} diff --git a/virt/kvm/arm/trace.h b/virt/kvm/arm/trace.h deleted file mode 100644 index 37d8b98..0000000 --- a/virt/kvm/arm/trace.h +++ /dev/null @@ -1,63 +0,0 @@ -#if !defined(_TRACE_KVM_H) || defined(TRACE_HEADER_MULTI_READ) -#define _TRACE_KVM_H - -#include <linux/tracepoint.h> - -#undef TRACE_SYSTEM -#define TRACE_SYSTEM kvm - -/* - * Tracepoints for vgic - */ -TRACE_EVENT(vgic_update_irq_pending, - TP_PROTO(unsigned long vcpu_id, __u32 irq, bool level), - TP_ARGS(vcpu_id, irq, level), - - TP_STRUCT__entry( - __field( unsigned long, vcpu_id ) - __field( __u32, irq ) - __field( bool, level ) - ), - - TP_fast_assign( - __entry->vcpu_id = vcpu_id; - __entry->irq = irq; - __entry->level = level; - ), - - TP_printk("VCPU: %ld, IRQ %d, level: %d", - __entry->vcpu_id, __entry->irq, __entry->level) -); - -/* - * Tracepoints for arch_timer - */ -TRACE_EVENT(kvm_timer_update_irq, - TP_PROTO(unsigned long vcpu_id, __u32 irq, int level), - TP_ARGS(vcpu_id, irq, level), - - TP_STRUCT__entry( - __field( unsigned long, vcpu_id ) - __field( __u32, irq ) - __field( int, level ) - ), - - TP_fast_assign( - __entry->vcpu_id = vcpu_id; - __entry->irq = irq; - __entry->level = level; - ), - - TP_printk("VCPU: %ld, IRQ %d, level %d", - __entry->vcpu_id, __entry->irq, __entry->level) -); - -#endif /* _TRACE_KVM_H */ - -#undef TRACE_INCLUDE_PATH -#define TRACE_INCLUDE_PATH ../../../virt/kvm/arm -#undef TRACE_INCLUDE_FILE -#define TRACE_INCLUDE_FILE trace - -/* This part must be outside protection */ -#include <trace/define_trace.h> diff --git a/virt/kvm/arm/vgic/vgic-init.c b/virt/kvm/arm/vgic/vgic-init.c deleted file mode 100644 index 8cebfbc..0000000 --- a/virt/kvm/arm/vgic/vgic-init.c +++ /dev/null @@ -1,445 +0,0 @@ -/* - * Copyright (C) 2015, 2016 ARM Ltd. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. 
- */ - -#include <linux/uaccess.h> -#include <linux/interrupt.h> -#include <linux/cpu.h> -#include <linux/kvm_host.h> -#include <kvm/arm_vgic.h> -#include <asm/kvm_mmu.h> -#include "vgic.h" - -/* - * Initialization rules: there are multiple stages to the vgic - * initialization, both for the distributor and the CPU interfaces. - * - * Distributor: - * - * - kvm_vgic_early_init(): initialization of static data that doesn't - * depend on any sizing information or emulation type. No allocation - * is allowed there. - * - * - vgic_init(): allocation and initialization of the generic data - * structures that depend on sizing information (number of CPUs, - * number of interrupts). Also initializes the vcpu specific data - * structures. Can be executed lazily for GICv2. - * - * CPU Interface: - * - * - kvm_vgic_cpu_early_init(): initialization of static data that - * doesn't depend on any sizing information or emulation type. No - * allocation is allowed there. - */ - -/* EARLY INIT */ - -/* - * Those 2 functions should not be needed anymore but they - * still are called from arm.c - */ -void kvm_vgic_early_init(struct kvm *kvm) -{ -} - -void kvm_vgic_vcpu_early_init(struct kvm_vcpu *vcpu) -{ -} - -/* CREATION */ - -/** - * kvm_vgic_create: triggered by the instantiation of the VGIC device by - * user space, either through the legacy KVM_CREATE_IRQCHIP ioctl (v2 only) - * or through the generic KVM_CREATE_DEVICE API ioctl. - * irqchip_in_kernel() tells you if this function succeeded or not. - * @kvm: kvm struct pointer - * @type: KVM_DEV_TYPE_ARM_VGIC_V[23] - */ -int kvm_vgic_create(struct kvm *kvm, u32 type) -{ - int i, vcpu_lock_idx = -1, ret; - struct kvm_vcpu *vcpu; - - if (irqchip_in_kernel(kvm)) - return -EEXIST; - - /* - * This function is also called by the KVM_CREATE_IRQCHIP handler, - * which had no chance yet to check the availability of the GICv2 - * emulation. So check this here again. KVM_CREATE_DEVICE does - * the proper checks already. - */ - if (type == KVM_DEV_TYPE_ARM_VGIC_V2 && - !kvm_vgic_global_state.can_emulate_gicv2) - return -ENODEV; - - /* - * Any time a vcpu is run, vcpu_load is called which tries to grab the - * vcpu->mutex. By grabbing the vcpu->mutex of all VCPUs we ensure - * that no other VCPUs are run while we create the vgic. 
- */ - ret = -EBUSY; - kvm_for_each_vcpu(i, vcpu, kvm) { - if (!mutex_trylock(&vcpu->mutex)) - goto out_unlock; - vcpu_lock_idx = i; - } - - kvm_for_each_vcpu(i, vcpu, kvm) { - if (vcpu->arch.has_run_once) - goto out_unlock; - } - ret = 0; - - if (type == KVM_DEV_TYPE_ARM_VGIC_V2) - kvm->arch.max_vcpus = VGIC_V2_MAX_CPUS; - else - kvm->arch.max_vcpus = VGIC_V3_MAX_CPUS; - - if (atomic_read(&kvm->online_vcpus) > kvm->arch.max_vcpus) { - ret = -E2BIG; - goto out_unlock; - } - - kvm->arch.vgic.in_kernel = true; - kvm->arch.vgic.vgic_model = type; - - /* - * kvm_vgic_global_state.vctrl_base is set on vgic probe (kvm_arch_init) - * it is stored in distributor struct for asm save/restore purpose - */ - kvm->arch.vgic.vctrl_base = kvm_vgic_global_state.vctrl_base; - - kvm->arch.vgic.vgic_dist_base = VGIC_ADDR_UNDEF; - kvm->arch.vgic.vgic_cpu_base = VGIC_ADDR_UNDEF; - kvm->arch.vgic.vgic_redist_base = VGIC_ADDR_UNDEF; - -out_unlock: - for (; vcpu_lock_idx >= 0; vcpu_lock_idx--) { - vcpu = kvm_get_vcpu(kvm, vcpu_lock_idx); - mutex_unlock(&vcpu->mutex); - } - return ret; -} - -/* INIT/DESTROY */ - -/** - * kvm_vgic_dist_init: initialize the dist data structures - * @kvm: kvm struct pointer - * @nr_spis: number of spis, frozen by caller - */ -static int kvm_vgic_dist_init(struct kvm *kvm, unsigned int nr_spis) -{ - struct vgic_dist *dist = &kvm->arch.vgic; - struct kvm_vcpu *vcpu0 = kvm_get_vcpu(kvm, 0); - int i; - - INIT_LIST_HEAD(&dist->lpi_list_head); - spin_lock_init(&dist->lpi_list_lock); - - dist->spis = kcalloc(nr_spis, sizeof(struct vgic_irq), GFP_KERNEL); - if (!dist->spis) - return -ENOMEM; - - /* - * In the following code we do not take the irq struct lock since - * no other action on irq structs can happen while the VGIC is - * not initialized yet: - * If someone wants to inject an interrupt or does a MMIO access, we - * require prior initialization in case of a virtual GICv3 or trigger - * initialization when using a virtual GICv2. - */ - for (i = 0; i < nr_spis; i++) { - struct vgic_irq *irq = &dist->spis[i]; - - irq->intid = i + VGIC_NR_PRIVATE_IRQS; - INIT_LIST_HEAD(&irq->ap_list); - spin_lock_init(&irq->irq_lock); - irq->vcpu = NULL; - irq->target_vcpu = vcpu0; - kref_init(&irq->refcount); - if (dist->vgic_model == KVM_DEV_TYPE_ARM_VGIC_V2) - irq->targets = 0; - else - irq->mpidr = 0; - } - return 0; -} - -/** - * kvm_vgic_vcpu_init: initialize the vcpu data structures and - * enable the VCPU interface - * @vcpu: the VCPU which's VGIC should be initialized - */ -static void kvm_vgic_vcpu_init(struct kvm_vcpu *vcpu) -{ - struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; - int i; - - INIT_LIST_HEAD(&vgic_cpu->ap_list_head); - spin_lock_init(&vgic_cpu->ap_list_lock); - - /* - * Enable and configure all SGIs to be edge-triggered and - * configure all PPIs as level-triggered. 
- */ - for (i = 0; i < VGIC_NR_PRIVATE_IRQS; i++) { - struct vgic_irq *irq = &vgic_cpu->private_irqs[i]; - - INIT_LIST_HEAD(&irq->ap_list); - spin_lock_init(&irq->irq_lock); - irq->intid = i; - irq->vcpu = NULL; - irq->target_vcpu = vcpu; - irq->targets = 1U << vcpu->vcpu_id; - kref_init(&irq->refcount); - if (vgic_irq_is_sgi(i)) { - /* SGIs */ - irq->enabled = 1; - irq->config = VGIC_CONFIG_EDGE; - } else { - /* PPIs */ - irq->config = VGIC_CONFIG_LEVEL; - } - } - if (kvm_vgic_global_state.type == VGIC_V2) - vgic_v2_enable(vcpu); - else - vgic_v3_enable(vcpu); -} - -/* - * vgic_init: allocates and initializes dist and vcpu data structures - * depending on two dimensioning parameters: - * - the number of spis - * - the number of vcpus - * The function is generally called when nr_spis has been explicitly set - * by the guest through the KVM DEVICE API. If not nr_spis is set to 256. - * vgic_initialized() returns true when this function has succeeded. - * Must be called with kvm->lock held! - */ -int vgic_init(struct kvm *kvm) -{ - struct vgic_dist *dist = &kvm->arch.vgic; - struct kvm_vcpu *vcpu; - int ret = 0, i; - - if (vgic_initialized(kvm)) - return 0; - - /* freeze the number of spis */ - if (!dist->nr_spis) - dist->nr_spis = VGIC_NR_IRQS_LEGACY - VGIC_NR_PRIVATE_IRQS; - - ret = kvm_vgic_dist_init(kvm, dist->nr_spis); - if (ret) - goto out; - - if (vgic_has_its(kvm)) - dist->msis_require_devid = true; - - kvm_for_each_vcpu(i, vcpu, kvm) - kvm_vgic_vcpu_init(vcpu); - - ret = kvm_vgic_setup_default_irq_routing(kvm); - if (ret) - goto out; - - dist->initialized = true; -out: - return ret; -} - -static void kvm_vgic_dist_destroy(struct kvm *kvm) -{ - struct vgic_dist *dist = &kvm->arch.vgic; - - mutex_lock(&kvm->lock); - - dist->ready = false; - dist->initialized = false; - - kfree(dist->spis); - dist->nr_spis = 0; - - mutex_unlock(&kvm->lock); -} - -void kvm_vgic_vcpu_destroy(struct kvm_vcpu *vcpu) -{ - struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; - - INIT_LIST_HEAD(&vgic_cpu->ap_list_head); -} - -void kvm_vgic_destroy(struct kvm *kvm) -{ - struct kvm_vcpu *vcpu; - int i; - - kvm_vgic_dist_destroy(kvm); - - kvm_for_each_vcpu(i, vcpu, kvm) - kvm_vgic_vcpu_destroy(vcpu); -} - -/** - * vgic_lazy_init: Lazy init is only allowed if the GIC exposed to the guest - * is a GICv2. A GICv3 must be explicitly initialized by the guest using the - * KVM_DEV_ARM_VGIC_GRP_CTRL KVM_DEVICE group. - * @kvm: kvm struct pointer - */ -int vgic_lazy_init(struct kvm *kvm) -{ - int ret = 0; - - if (unlikely(!vgic_initialized(kvm))) { - /* - * We only provide the automatic initialization of the VGIC - * for the legacy case of a GICv2. Any other type must - * be explicitly initialized once setup with the respective - * KVM device call. - */ - if (kvm->arch.vgic.vgic_model != KVM_DEV_TYPE_ARM_VGIC_V2) - return -EBUSY; - - mutex_lock(&kvm->lock); - ret = vgic_init(kvm); - mutex_unlock(&kvm->lock); - } - - return ret; -} - -/* RESOURCE MAPPING */ - -/** - * Map the MMIO regions depending on the VGIC model exposed to the guest - * called on the first VCPU run. - * Also map the virtual CPU interface into the VM. - * v2/v3 derivatives call vgic_init if not already done. - * vgic_ready() returns true if this function has succeeded. 
- * @kvm: kvm struct pointer - */ -int kvm_vgic_map_resources(struct kvm *kvm) -{ - struct vgic_dist *dist = &kvm->arch.vgic; - int ret = 0; - - mutex_lock(&kvm->lock); - if (!irqchip_in_kernel(kvm)) - goto out; - - if (dist->vgic_model == KVM_DEV_TYPE_ARM_VGIC_V2) - ret = vgic_v2_map_resources(kvm); - else - ret = vgic_v3_map_resources(kvm); -out: - mutex_unlock(&kvm->lock); - return ret; -} - -/* GENERIC PROBE */ - -static int vgic_init_cpu_starting(unsigned int cpu) -{ - enable_percpu_irq(kvm_vgic_global_state.maint_irq, 0); - return 0; -} - - -static int vgic_init_cpu_dying(unsigned int cpu) -{ - disable_percpu_irq(kvm_vgic_global_state.maint_irq); - return 0; -} - -static irqreturn_t vgic_maintenance_handler(int irq, void *data) -{ - /* - * We cannot rely on the vgic maintenance interrupt to be - * delivered synchronously. This means we can only use it to - * exit the VM, and we perform the handling of EOIed - * interrupts on the exit path (see vgic_process_maintenance). - */ - return IRQ_HANDLED; -} - -/** - * kvm_vgic_hyp_init: populates the kvm_vgic_global_state variable - * according to the host GIC model. Accordingly calls either - * vgic_v2/v3_probe which registers the KVM_DEVICE that can be - * instantiated by a guest later on . - */ -int kvm_vgic_hyp_init(void) -{ - const struct gic_kvm_info *gic_kvm_info; - int ret; - - gic_kvm_info = gic_get_kvm_info(); - if (!gic_kvm_info) - return -ENODEV; - - if (!gic_kvm_info->maint_irq) { - kvm_err("No vgic maintenance irq\n"); - return -ENXIO; - } - - switch (gic_kvm_info->type) { - case GIC_V2: - ret = vgic_v2_probe(gic_kvm_info); - break; - case GIC_V3: - ret = vgic_v3_probe(gic_kvm_info); - if (!ret) { - static_branch_enable(&kvm_vgic_global_state.gicv3_cpuif); - kvm_info("GIC system register CPU interface enabled\n"); - } - break; - default: - ret = -ENODEV; - }; - - if (ret) - return ret; - - kvm_vgic_global_state.maint_irq = gic_kvm_info->maint_irq; - ret = request_percpu_irq(kvm_vgic_global_state.maint_irq, - vgic_maintenance_handler, - "vgic", kvm_get_running_vcpus()); - if (ret) { - kvm_err("Cannot register interrupt %d\n", - kvm_vgic_global_state.maint_irq); - return ret; - } - - ret = cpuhp_setup_state(CPUHP_AP_KVM_ARM_VGIC_INIT_STARTING, - "AP_KVM_ARM_VGIC_INIT_STARTING", - vgic_init_cpu_starting, vgic_init_cpu_dying); - if (ret) { - kvm_err("Cannot register vgic CPU notifier\n"); - goto out_free_irq; - } - - kvm_info("vgic interrupt IRQ%d\n", kvm_vgic_global_state.maint_irq); - return 0; - -out_free_irq: - free_percpu_irq(kvm_vgic_global_state.maint_irq, - kvm_get_running_vcpus()); - return ret; -} diff --git a/virt/kvm/arm/vgic/vgic-irqfd.c b/virt/kvm/arm/vgic/vgic-irqfd.c deleted file mode 100644 index d918dcf..0000000 --- a/virt/kvm/arm/vgic/vgic-irqfd.c +++ /dev/null @@ -1,126 +0,0 @@ -/* - * Copyright (C) 2015, 2016 ARM Ltd. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. 
- */ - -#include <linux/kvm.h> -#include <linux/kvm_host.h> -#include <trace/events/kvm.h> -#include <kvm/arm_vgic.h> -#include "vgic.h" - -/** - * vgic_irqfd_set_irq: inject the IRQ corresponding to the - * irqchip routing entry - * - * This is the entry point for irqfd IRQ injection - */ -static int vgic_irqfd_set_irq(struct kvm_kernel_irq_routing_entry *e, - struct kvm *kvm, int irq_source_id, - int level, bool line_status) -{ - unsigned int spi_id = e->irqchip.pin + VGIC_NR_PRIVATE_IRQS; - - if (!vgic_valid_spi(kvm, spi_id)) - return -EINVAL; - return kvm_vgic_inject_irq(kvm, 0, spi_id, level); -} - -/** - * kvm_set_routing_entry: populate a kvm routing entry - * from a user routing entry - * - * @kvm: the VM this entry is applied to - * @e: kvm kernel routing entry handle - * @ue: user api routing entry handle - * return 0 on success, -EINVAL on errors. - */ -int kvm_set_routing_entry(struct kvm *kvm, - struct kvm_kernel_irq_routing_entry *e, - const struct kvm_irq_routing_entry *ue) -{ - int r = -EINVAL; - - switch (ue->type) { - case KVM_IRQ_ROUTING_IRQCHIP: - e->set = vgic_irqfd_set_irq; - e->irqchip.irqchip = ue->u.irqchip.irqchip; - e->irqchip.pin = ue->u.irqchip.pin; - if ((e->irqchip.pin >= KVM_IRQCHIP_NUM_PINS) || - (e->irqchip.irqchip >= KVM_NR_IRQCHIPS)) - goto out; - break; - case KVM_IRQ_ROUTING_MSI: - e->set = kvm_set_msi; - e->msi.address_lo = ue->u.msi.address_lo; - e->msi.address_hi = ue->u.msi.address_hi; - e->msi.data = ue->u.msi.data; - e->msi.flags = ue->flags; - e->msi.devid = ue->u.msi.devid; - break; - default: - goto out; - } - r = 0; -out: - return r; -} - -/** - * kvm_set_msi: inject the MSI corresponding to the - * MSI routing entry - * - * This is the entry point for irqfd MSI injection - * and userspace MSI injection. - */ -int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e, - struct kvm *kvm, int irq_source_id, - int level, bool line_status) -{ - struct kvm_msi msi; - - msi.address_lo = e->msi.address_lo; - msi.address_hi = e->msi.address_hi; - msi.data = e->msi.data; - msi.flags = e->msi.flags; - msi.devid = e->msi.devid; - - if (!vgic_has_its(kvm)) - return -ENODEV; - - return vgic_its_inject_msi(kvm, &msi); -} - -int kvm_vgic_setup_default_irq_routing(struct kvm *kvm) -{ - struct kvm_irq_routing_entry *entries; - struct vgic_dist *dist = &kvm->arch.vgic; - u32 nr = dist->nr_spis; - int i, ret; - - entries = kcalloc(nr, sizeof(struct kvm_kernel_irq_routing_entry), - GFP_KERNEL); - if (!entries) - return -ENOMEM; - - for (i = 0; i < nr; i++) { - entries[i].gsi = i; - entries[i].type = KVM_IRQ_ROUTING_IRQCHIP; - entries[i].u.irqchip.irqchip = 0; - entries[i].u.irqchip.pin = i; - } - ret = kvm_set_irq_routing(kvm, entries, nr, 0); - kfree(entries); - return ret; -} diff --git a/virt/kvm/arm/vgic/vgic-its.c b/virt/kvm/arm/vgic/vgic-its.c deleted file mode 100644 index 4660a7d..0000000 --- a/virt/kvm/arm/vgic/vgic-its.c +++ /dev/null @@ -1,1570 +0,0 @@ -/* - * GICv3 ITS emulation - * - * Copyright (C) 2015,2016 ARM Ltd. - * Author: Andre Przywara <andre.przywara@arm.com> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - */ - -#include <linux/cpu.h> -#include <linux/kvm.h> -#include <linux/kvm_host.h> -#include <linux/interrupt.h> -#include <linux/list.h> -#include <linux/uaccess.h> - -#include <linux/irqchip/arm-gic-v3.h> - -#include <asm/kvm_emulate.h> -#include <asm/kvm_arm.h> -#include <asm/kvm_mmu.h> - -#include "vgic.h" -#include "vgic-mmio.h" - -/* - * Creates a new (reference to a) struct vgic_irq for a given LPI. - * If this LPI is already mapped on another ITS, we increase its refcount - * and return a pointer to the existing structure. - * If this is a "new" LPI, we allocate and initialize a new struct vgic_irq. - * This function returns a pointer to the _unlocked_ structure. - */ -static struct vgic_irq *vgic_add_lpi(struct kvm *kvm, u32 intid) -{ - struct vgic_dist *dist = &kvm->arch.vgic; - struct vgic_irq *irq = vgic_get_irq(kvm, NULL, intid), *oldirq; - - /* In this case there is no put, since we keep the reference. */ - if (irq) - return irq; - - irq = kzalloc(sizeof(struct vgic_irq), GFP_KERNEL); - if (!irq) - return ERR_PTR(-ENOMEM); - - INIT_LIST_HEAD(&irq->lpi_list); - INIT_LIST_HEAD(&irq->ap_list); - spin_lock_init(&irq->irq_lock); - - irq->config = VGIC_CONFIG_EDGE; - kref_init(&irq->refcount); - irq->intid = intid; - - spin_lock(&dist->lpi_list_lock); - - /* - * There could be a race with another vgic_add_lpi(), so we need to - * check that we don't add a second list entry with the same LPI. - */ - list_for_each_entry(oldirq, &dist->lpi_list_head, lpi_list) { - if (oldirq->intid != intid) - continue; - - /* Someone was faster with adding this LPI, lets use that. */ - kfree(irq); - irq = oldirq; - - /* - * This increases the refcount, the caller is expected to - * call vgic_put_irq() on the returned pointer once it's - * finished with the IRQ. - */ - vgic_get_irq_kref(irq); - - goto out_unlock; - } - - list_add_tail(&irq->lpi_list, &dist->lpi_list_head); - dist->lpi_list_count++; - -out_unlock: - spin_unlock(&dist->lpi_list_lock); - - return irq; -} - -struct its_device { - struct list_head dev_list; - - /* the head for the list of ITTEs */ - struct list_head itt_head; - u32 device_id; -}; - -#define COLLECTION_NOT_MAPPED ((u32)~0) - -struct its_collection { - struct list_head coll_list; - - u32 collection_id; - u32 target_addr; -}; - -#define its_is_collection_mapped(coll) ((coll) && \ - ((coll)->target_addr != COLLECTION_NOT_MAPPED)) - -struct its_itte { - struct list_head itte_list; - - struct vgic_irq *irq; - struct its_collection *collection; - u32 lpi; - u32 event_id; -}; - -/* - * Find and returns a device in the device table for an ITS. - * Must be called with the its_lock mutex held. - */ -static struct its_device *find_its_device(struct vgic_its *its, u32 device_id) -{ - struct its_device *device; - - list_for_each_entry(device, &its->device_list, dev_list) - if (device_id == device->device_id) - return device; - - return NULL; -} - -/* - * Find and returns an interrupt translation table entry (ITTE) for a given - * Device ID/Event ID pair on an ITS. - * Must be called with the its_lock mutex held. 
- */ -static struct its_itte *find_itte(struct vgic_its *its, u32 device_id, - u32 event_id) -{ - struct its_device *device; - struct its_itte *itte; - - device = find_its_device(its, device_id); - if (device == NULL) - return NULL; - - list_for_each_entry(itte, &device->itt_head, itte_list) - if (itte->event_id == event_id) - return itte; - - return NULL; -} - -/* To be used as an iterator this macro misses the enclosing parentheses */ -#define for_each_lpi_its(dev, itte, its) \ - list_for_each_entry(dev, &(its)->device_list, dev_list) \ - list_for_each_entry(itte, &(dev)->itt_head, itte_list) - -/* - * We only implement 48 bits of PA at the moment, although the ITS - * supports more. Let's be restrictive here. - */ -#define BASER_ADDRESS(x) ((x) & GENMASK_ULL(47, 16)) -#define CBASER_ADDRESS(x) ((x) & GENMASK_ULL(47, 12)) -#define PENDBASER_ADDRESS(x) ((x) & GENMASK_ULL(47, 16)) -#define PROPBASER_ADDRESS(x) ((x) & GENMASK_ULL(47, 12)) - -#define GIC_LPI_OFFSET 8192 - -/* - * Finds and returns a collection in the ITS collection table. - * Must be called with the its_lock mutex held. - */ -static struct its_collection *find_collection(struct vgic_its *its, int coll_id) -{ - struct its_collection *collection; - - list_for_each_entry(collection, &its->collection_list, coll_list) { - if (coll_id == collection->collection_id) - return collection; - } - - return NULL; -} - -#define LPI_PROP_ENABLE_BIT(p) ((p) & LPI_PROP_ENABLED) -#define LPI_PROP_PRIORITY(p) ((p) & 0xfc) - -/* - * Reads the configuration data for a given LPI from guest memory and - * updates the fields in struct vgic_irq. - * If filter_vcpu is not NULL, applies only if the IRQ is targeting this - * VCPU. Unconditionally applies if filter_vcpu is NULL. - */ -static int update_lpi_config(struct kvm *kvm, struct vgic_irq *irq, - struct kvm_vcpu *filter_vcpu) -{ - u64 propbase = PROPBASER_ADDRESS(kvm->arch.vgic.propbaser); - u8 prop; - int ret; - - ret = kvm_read_guest(kvm, propbase + irq->intid - GIC_LPI_OFFSET, - &prop, 1); - - if (ret) - return ret; - - spin_lock(&irq->irq_lock); - - if (!filter_vcpu || filter_vcpu == irq->target_vcpu) { - irq->priority = LPI_PROP_PRIORITY(prop); - irq->enabled = LPI_PROP_ENABLE_BIT(prop); - - vgic_queue_irq_unlock(kvm, irq); - } else { - spin_unlock(&irq->irq_lock); - } - - return 0; -} - -/* - * Create a snapshot of the current LPI list, so that we can enumerate all - * LPIs without holding any lock. - * Returns the array length and puts the kmalloc'ed array into intid_ptr. - */ -static int vgic_copy_lpi_list(struct kvm *kvm, u32 **intid_ptr) -{ - struct vgic_dist *dist = &kvm->arch.vgic; - struct vgic_irq *irq; - u32 *intids; - int irq_count = dist->lpi_list_count, i = 0; - - /* - * We use the current value of the list length, which may change - * after the kmalloc. We don't care, because the guest shouldn't - * change anything while the command handling is still running, - * and in the worst case we would miss a new IRQ, which one wouldn't - * expect to be covered by this command anyway. - */ - intids = kmalloc_array(irq_count, sizeof(intids[0]), GFP_KERNEL); - if (!intids) - return -ENOMEM; - - spin_lock(&dist->lpi_list_lock); - list_for_each_entry(irq, &dist->lpi_list_head, lpi_list) { - /* We don't need to "get" the IRQ, as we hold the list lock. 
*/ - intids[i] = irq->intid; - if (++i == irq_count) - break; - } - spin_unlock(&dist->lpi_list_lock); - - *intid_ptr = intids; - return irq_count; -} - -/* - * Promotes the ITS view of affinity of an ITTE (which redistributor this LPI - * is targeting) to the VGIC's view, which deals with target VCPUs. - * Needs to be called whenever either the collection for a LPIs has - * changed or the collection itself got retargeted. - */ -static void update_affinity_itte(struct kvm *kvm, struct its_itte *itte) -{ - struct kvm_vcpu *vcpu; - - if (!its_is_collection_mapped(itte->collection)) - return; - - vcpu = kvm_get_vcpu(kvm, itte->collection->target_addr); - - spin_lock(&itte->irq->irq_lock); - itte->irq->target_vcpu = vcpu; - spin_unlock(&itte->irq->irq_lock); -} - -/* - * Updates the target VCPU for every LPI targeting this collection. - * Must be called with the its_lock mutex held. - */ -static void update_affinity_collection(struct kvm *kvm, struct vgic_its *its, - struct its_collection *coll) -{ - struct its_device *device; - struct its_itte *itte; - - for_each_lpi_its(device, itte, its) { - if (!itte->collection || coll != itte->collection) - continue; - - update_affinity_itte(kvm, itte); - } -} - -static u32 max_lpis_propbaser(u64 propbaser) -{ - int nr_idbits = (propbaser & 0x1f) + 1; - - return 1U << min(nr_idbits, INTERRUPT_ID_BITS_ITS); -} - -/* - * Scan the whole LPI pending table and sync the pending bit in there - * with our own data structures. This relies on the LPI being - * mapped before. - */ -static int its_sync_lpi_pending_table(struct kvm_vcpu *vcpu) -{ - gpa_t pendbase = PENDBASER_ADDRESS(vcpu->arch.vgic_cpu.pendbaser); - struct vgic_irq *irq; - int last_byte_offset = -1; - int ret = 0; - u32 *intids; - int nr_irqs, i; - - nr_irqs = vgic_copy_lpi_list(vcpu->kvm, &intids); - if (nr_irqs < 0) - return nr_irqs; - - for (i = 0; i < nr_irqs; i++) { - int byte_offset, bit_nr; - u8 pendmask; - - byte_offset = intids[i] / BITS_PER_BYTE; - bit_nr = intids[i] % BITS_PER_BYTE; - - /* - * For contiguously allocated LPIs chances are we just read - * this very same byte in the last iteration. Reuse that. - */ - if (byte_offset != last_byte_offset) { - ret = kvm_read_guest(vcpu->kvm, pendbase + byte_offset, - &pendmask, 1); - if (ret) { - kfree(intids); - return ret; - } - last_byte_offset = byte_offset; - } - - irq = vgic_get_irq(vcpu->kvm, NULL, intids[i]); - spin_lock(&irq->irq_lock); - irq->pending = pendmask & (1U << bit_nr); - vgic_queue_irq_unlock(vcpu->kvm, irq); - vgic_put_irq(vcpu->kvm, irq); - } - - kfree(intids); - - return ret; -} - -static unsigned long vgic_mmio_read_its_ctlr(struct kvm *vcpu, - struct vgic_its *its, - gpa_t addr, unsigned int len) -{ - u32 reg = 0; - - mutex_lock(&its->cmd_lock); - if (its->creadr == its->cwriter) - reg |= GITS_CTLR_QUIESCENT; - if (its->enabled) - reg |= GITS_CTLR_ENABLE; - mutex_unlock(&its->cmd_lock); - - return reg; -} - -static void vgic_mmio_write_its_ctlr(struct kvm *kvm, struct vgic_its *its, - gpa_t addr, unsigned int len, - unsigned long val) -{ - its->enabled = !!(val & GITS_CTLR_ENABLE); -} - -static unsigned long vgic_mmio_read_its_typer(struct kvm *kvm, - struct vgic_its *its, - gpa_t addr, unsigned int len) -{ - u64 reg = GITS_TYPER_PLPIS; - - /* - * We use linear CPU numbers for redistributor addressing, - * so GITS_TYPER.PTA is 0. - * Also we force all PROPBASER registers to be the same, so - * CommonLPIAff is 0 as well. 
- * To avoid memory waste in the guest, we keep the number of IDBits and - * DevBits low - as least for the time being. - */ - reg |= 0x0f << GITS_TYPER_DEVBITS_SHIFT; - reg |= 0x0f << GITS_TYPER_IDBITS_SHIFT; - - return extract_bytes(reg, addr & 7, len); -} - -static unsigned long vgic_mmio_read_its_iidr(struct kvm *kvm, - struct vgic_its *its, - gpa_t addr, unsigned int len) -{ - return (PRODUCT_ID_KVM << 24) | (IMPLEMENTER_ARM << 0); -} - -static unsigned long vgic_mmio_read_its_idregs(struct kvm *kvm, - struct vgic_its *its, - gpa_t addr, unsigned int len) -{ - switch (addr & 0xffff) { - case GITS_PIDR0: - return 0x92; /* part number, bits[7:0] */ - case GITS_PIDR1: - return 0xb4; /* part number, bits[11:8] */ - case GITS_PIDR2: - return GIC_PIDR2_ARCH_GICv3 | 0x0b; - case GITS_PIDR4: - return 0x40; /* This is a 64K software visible page */ - /* The following are the ID registers for (any) GIC. */ - case GITS_CIDR0: - return 0x0d; - case GITS_CIDR1: - return 0xf0; - case GITS_CIDR2: - return 0x05; - case GITS_CIDR3: - return 0xb1; - } - - return 0; -} - -/* - * Find the target VCPU and the LPI number for a given devid/eventid pair - * and make this IRQ pending, possibly injecting it. - * Must be called with the its_lock mutex held. - * Returns 0 on success, a positive error value for any ITS mapping - * related errors and negative error values for generic errors. - */ -static int vgic_its_trigger_msi(struct kvm *kvm, struct vgic_its *its, - u32 devid, u32 eventid) -{ - struct kvm_vcpu *vcpu; - struct its_itte *itte; - - if (!its->enabled) - return -EBUSY; - - itte = find_itte(its, devid, eventid); - if (!itte || !its_is_collection_mapped(itte->collection)) - return E_ITS_INT_UNMAPPED_INTERRUPT; - - vcpu = kvm_get_vcpu(kvm, itte->collection->target_addr); - if (!vcpu) - return E_ITS_INT_UNMAPPED_INTERRUPT; - - if (!vcpu->arch.vgic_cpu.lpis_enabled) - return -EBUSY; - - spin_lock(&itte->irq->irq_lock); - itte->irq->pending = true; - vgic_queue_irq_unlock(kvm, itte->irq); - - return 0; -} - -static struct vgic_io_device *vgic_get_its_iodev(struct kvm_io_device *dev) -{ - struct vgic_io_device *iodev; - - if (dev->ops != &kvm_io_gic_ops) - return NULL; - - iodev = container_of(dev, struct vgic_io_device, dev); - - if (iodev->iodev_type != IODEV_ITS) - return NULL; - - return iodev; -} - -/* - * Queries the KVM IO bus framework to get the ITS pointer from the given - * doorbell address. - * We then call vgic_its_trigger_msi() with the decoded data. - * According to the KVM_SIGNAL_MSI API description returns 1 on success. - */ -int vgic_its_inject_msi(struct kvm *kvm, struct kvm_msi *msi) -{ - u64 address; - struct kvm_io_device *kvm_io_dev; - struct vgic_io_device *iodev; - int ret; - - if (!vgic_has_its(kvm)) - return -ENODEV; - - if (!(msi->flags & KVM_MSI_VALID_DEVID)) - return -EINVAL; - - address = (u64)msi->address_hi << 32 | msi->address_lo; - - kvm_io_dev = kvm_io_bus_get_dev(kvm, KVM_MMIO_BUS, address); - if (!kvm_io_dev) - return -EINVAL; - - iodev = vgic_get_its_iodev(kvm_io_dev); - if (!iodev) - return -EINVAL; - - mutex_lock(&iodev->its->its_lock); - ret = vgic_its_trigger_msi(kvm, iodev->its, msi->devid, msi->data); - mutex_unlock(&iodev->its->its_lock); - - if (ret < 0) - return ret; - - /* - * KVM_SIGNAL_MSI demands a return value > 0 for success and 0 - * if the guest has blocked the MSI. So we map any LPI mapping - * related error to that. - */ - if (ret) - return 0; - else - return 1; -} - -/* Requires the its_lock to be held. 
*/ -static void its_free_itte(struct kvm *kvm, struct its_itte *itte) -{ - list_del(&itte->itte_list); - - /* This put matches the get in vgic_add_lpi. */ - if (itte->irq) - vgic_put_irq(kvm, itte->irq); - - kfree(itte); -} - -static u64 its_cmd_mask_field(u64 *its_cmd, int word, int shift, int size) -{ - return (le64_to_cpu(its_cmd[word]) >> shift) & (BIT_ULL(size) - 1); -} - -#define its_cmd_get_command(cmd) its_cmd_mask_field(cmd, 0, 0, 8) -#define its_cmd_get_deviceid(cmd) its_cmd_mask_field(cmd, 0, 32, 32) -#define its_cmd_get_id(cmd) its_cmd_mask_field(cmd, 1, 0, 32) -#define its_cmd_get_physical_id(cmd) its_cmd_mask_field(cmd, 1, 32, 32) -#define its_cmd_get_collection(cmd) its_cmd_mask_field(cmd, 2, 0, 16) -#define its_cmd_get_target_addr(cmd) its_cmd_mask_field(cmd, 2, 16, 32) -#define its_cmd_get_validbit(cmd) its_cmd_mask_field(cmd, 2, 63, 1) - -/* - * The DISCARD command frees an Interrupt Translation Table Entry (ITTE). - * Must be called with the its_lock mutex held. - */ -static int vgic_its_cmd_handle_discard(struct kvm *kvm, struct vgic_its *its, - u64 *its_cmd) -{ - u32 device_id = its_cmd_get_deviceid(its_cmd); - u32 event_id = its_cmd_get_id(its_cmd); - struct its_itte *itte; - - - itte = find_itte(its, device_id, event_id); - if (itte && itte->collection) { - /* - * Though the spec talks about removing the pending state, we - * don't bother here since we clear the ITTE anyway and the - * pending state is a property of the ITTE struct. - */ - its_free_itte(kvm, itte); - return 0; - } - - return E_ITS_DISCARD_UNMAPPED_INTERRUPT; -} - -/* - * The MOVI command moves an ITTE to a different collection. - * Must be called with the its_lock mutex held. - */ -static int vgic_its_cmd_handle_movi(struct kvm *kvm, struct vgic_its *its, - u64 *its_cmd) -{ - u32 device_id = its_cmd_get_deviceid(its_cmd); - u32 event_id = its_cmd_get_id(its_cmd); - u32 coll_id = its_cmd_get_collection(its_cmd); - struct kvm_vcpu *vcpu; - struct its_itte *itte; - struct its_collection *collection; - - itte = find_itte(its, device_id, event_id); - if (!itte) - return E_ITS_MOVI_UNMAPPED_INTERRUPT; - - if (!its_is_collection_mapped(itte->collection)) - return E_ITS_MOVI_UNMAPPED_COLLECTION; - - collection = find_collection(its, coll_id); - if (!its_is_collection_mapped(collection)) - return E_ITS_MOVI_UNMAPPED_COLLECTION; - - itte->collection = collection; - vcpu = kvm_get_vcpu(kvm, collection->target_addr); - - spin_lock(&itte->irq->irq_lock); - itte->irq->target_vcpu = vcpu; - spin_unlock(&itte->irq->irq_lock); - - return 0; -} - -/* - * Check whether an ID can be stored into the corresponding guest table. - * For a direct table this is pretty easy, but gets a bit nasty for - * indirect tables. We check whether the resulting guest physical address - * is actually valid (covered by a memslot and guest accessbible). - * For this we have to read the respective first level entry. 
- */ -static bool vgic_its_check_id(struct vgic_its *its, u64 baser, int id) -{ - int l1_tbl_size = GITS_BASER_NR_PAGES(baser) * SZ_64K; - int index; - u64 indirect_ptr; - gfn_t gfn; - - if (!(baser & GITS_BASER_INDIRECT)) { - phys_addr_t addr; - - if (id >= (l1_tbl_size / GITS_BASER_ENTRY_SIZE(baser))) - return false; - - addr = BASER_ADDRESS(baser) + id * GITS_BASER_ENTRY_SIZE(baser); - gfn = addr >> PAGE_SHIFT; - - return kvm_is_visible_gfn(its->dev->kvm, gfn); - } - - /* calculate and check the index into the 1st level */ - index = id / (SZ_64K / GITS_BASER_ENTRY_SIZE(baser)); - if (index >= (l1_tbl_size / sizeof(u64))) - return false; - - /* Each 1st level entry is represented by a 64-bit value. */ - if (kvm_read_guest(its->dev->kvm, - BASER_ADDRESS(baser) + index * sizeof(indirect_ptr), - &indirect_ptr, sizeof(indirect_ptr))) - return false; - - indirect_ptr = le64_to_cpu(indirect_ptr); - - /* check the valid bit of the first level entry */ - if (!(indirect_ptr & BIT_ULL(63))) - return false; - - /* - * Mask the guest physical address and calculate the frame number. - * Any address beyond our supported 48 bits of PA will be caught - * by the actual check in the final step. - */ - indirect_ptr &= GENMASK_ULL(51, 16); - - /* Find the address of the actual entry */ - index = id % (SZ_64K / GITS_BASER_ENTRY_SIZE(baser)); - indirect_ptr += index * GITS_BASER_ENTRY_SIZE(baser); - gfn = indirect_ptr >> PAGE_SHIFT; - - return kvm_is_visible_gfn(its->dev->kvm, gfn); -} - -static int vgic_its_alloc_collection(struct vgic_its *its, - struct its_collection **colp, - u32 coll_id) -{ - struct its_collection *collection; - - if (!vgic_its_check_id(its, its->baser_coll_table, coll_id)) - return E_ITS_MAPC_COLLECTION_OOR; - - collection = kzalloc(sizeof(*collection), GFP_KERNEL); - - collection->collection_id = coll_id; - collection->target_addr = COLLECTION_NOT_MAPPED; - - list_add_tail(&collection->coll_list, &its->collection_list); - *colp = collection; - - return 0; -} - -static void vgic_its_free_collection(struct vgic_its *its, u32 coll_id) -{ - struct its_collection *collection; - struct its_device *device; - struct its_itte *itte; - - /* - * Clearing the mapping for that collection ID removes the - * entry from the list. If there wasn't any before, we can - * go home early. - */ - collection = find_collection(its, coll_id); - if (!collection) - return; - - for_each_lpi_its(device, itte, its) - if (itte->collection && - itte->collection->collection_id == coll_id) - itte->collection = NULL; - - list_del(&collection->coll_list); - kfree(collection); -} - -/* - * The MAPTI and MAPI commands map LPIs to ITTEs. - * Must be called with its_lock mutex held. - */ -static int vgic_its_cmd_handle_mapi(struct kvm *kvm, struct vgic_its *its, - u64 *its_cmd) -{ - u32 device_id = its_cmd_get_deviceid(its_cmd); - u32 event_id = its_cmd_get_id(its_cmd); - u32 coll_id = its_cmd_get_collection(its_cmd); - struct its_itte *itte; - struct its_device *device; - struct its_collection *collection, *new_coll = NULL; - int lpi_nr; - struct vgic_irq *irq; - - device = find_its_device(its, device_id); - if (!device) - return E_ITS_MAPTI_UNMAPPED_DEVICE; - - if (its_cmd_get_command(its_cmd) == GITS_CMD_MAPTI) - lpi_nr = its_cmd_get_physical_id(its_cmd); - else - lpi_nr = event_id; - if (lpi_nr < GIC_LPI_OFFSET || - lpi_nr >= max_lpis_propbaser(kvm->arch.vgic.propbaser)) - return E_ITS_MAPTI_PHYSICALID_OOR; - - /* If there is an existing mapping, behavior is UNPREDICTABLE. 
*/ - if (find_itte(its, device_id, event_id)) - return 0; - - collection = find_collection(its, coll_id); - if (!collection) { - int ret = vgic_its_alloc_collection(its, &collection, coll_id); - if (ret) - return ret; - new_coll = collection; - } - - itte = kzalloc(sizeof(struct its_itte), GFP_KERNEL); - if (!itte) { - if (new_coll) - vgic_its_free_collection(its, coll_id); - return -ENOMEM; - } - - itte->event_id = event_id; - list_add_tail(&itte->itte_list, &device->itt_head); - - itte->collection = collection; - itte->lpi = lpi_nr; - - irq = vgic_add_lpi(kvm, lpi_nr); - if (IS_ERR(irq)) { - if (new_coll) - vgic_its_free_collection(its, coll_id); - its_free_itte(kvm, itte); - return PTR_ERR(irq); - } - itte->irq = irq; - - update_affinity_itte(kvm, itte); - - /* - * We "cache" the configuration table entries in out struct vgic_irq's. - * However we only have those structs for mapped IRQs, so we read in - * the respective config data from memory here upon mapping the LPI. - */ - update_lpi_config(kvm, itte->irq, NULL); - - return 0; -} - -/* Requires the its_lock to be held. */ -static void vgic_its_unmap_device(struct kvm *kvm, struct its_device *device) -{ - struct its_itte *itte, *temp; - - /* - * The spec says that unmapping a device with still valid - * ITTEs associated is UNPREDICTABLE. We remove all ITTEs, - * since we cannot leave the memory unreferenced. - */ - list_for_each_entry_safe(itte, temp, &device->itt_head, itte_list) - its_free_itte(kvm, itte); - - list_del(&device->dev_list); - kfree(device); -} - -/* - * MAPD maps or unmaps a device ID to Interrupt Translation Tables (ITTs). - * Must be called with the its_lock mutex held. - */ -static int vgic_its_cmd_handle_mapd(struct kvm *kvm, struct vgic_its *its, - u64 *its_cmd) -{ - u32 device_id = its_cmd_get_deviceid(its_cmd); - bool valid = its_cmd_get_validbit(its_cmd); - struct its_device *device; - - if (!vgic_its_check_id(its, its->baser_device_table, device_id)) - return E_ITS_MAPD_DEVICE_OOR; - - device = find_its_device(its, device_id); - - /* - * The spec says that calling MAPD on an already mapped device - * invalidates all cached data for this device. We implement this - * by removing the mapping and re-establishing it. - */ - if (device) - vgic_its_unmap_device(kvm, device); - - /* - * The spec does not say whether unmapping a not-mapped device - * is an error, so we are done in any case. - */ - if (!valid) - return 0; - - device = kzalloc(sizeof(struct its_device), GFP_KERNEL); - if (!device) - return -ENOMEM; - - device->device_id = device_id; - INIT_LIST_HEAD(&device->itt_head); - - list_add_tail(&device->dev_list, &its->device_list); - - return 0; -} - -/* - * The MAPC command maps collection IDs to redistributors. - * Must be called with the its_lock mutex held. 
- */ -static int vgic_its_cmd_handle_mapc(struct kvm *kvm, struct vgic_its *its, - u64 *its_cmd) -{ - u16 coll_id; - u32 target_addr; - struct its_collection *collection; - bool valid; - - valid = its_cmd_get_validbit(its_cmd); - coll_id = its_cmd_get_collection(its_cmd); - target_addr = its_cmd_get_target_addr(its_cmd); - - if (target_addr >= atomic_read(&kvm->online_vcpus)) - return E_ITS_MAPC_PROCNUM_OOR; - - if (!valid) { - vgic_its_free_collection(its, coll_id); - } else { - collection = find_collection(its, coll_id); - - if (!collection) { - int ret; - - ret = vgic_its_alloc_collection(its, &collection, - coll_id); - if (ret) - return ret; - collection->target_addr = target_addr; - } else { - collection->target_addr = target_addr; - update_affinity_collection(kvm, its, collection); - } - } - - return 0; -} - -/* - * The CLEAR command removes the pending state for a particular LPI. - * Must be called with the its_lock mutex held. - */ -static int vgic_its_cmd_handle_clear(struct kvm *kvm, struct vgic_its *its, - u64 *its_cmd) -{ - u32 device_id = its_cmd_get_deviceid(its_cmd); - u32 event_id = its_cmd_get_id(its_cmd); - struct its_itte *itte; - - - itte = find_itte(its, device_id, event_id); - if (!itte) - return E_ITS_CLEAR_UNMAPPED_INTERRUPT; - - itte->irq->pending = false; - - return 0; -} - -/* - * The INV command syncs the configuration bits from the memory table. - * Must be called with the its_lock mutex held. - */ -static int vgic_its_cmd_handle_inv(struct kvm *kvm, struct vgic_its *its, - u64 *its_cmd) -{ - u32 device_id = its_cmd_get_deviceid(its_cmd); - u32 event_id = its_cmd_get_id(its_cmd); - struct its_itte *itte; - - - itte = find_itte(its, device_id, event_id); - if (!itte) - return E_ITS_INV_UNMAPPED_INTERRUPT; - - return update_lpi_config(kvm, itte->irq, NULL); -} - -/* - * The INVALL command requests flushing of all IRQ data in this collection. - * Find the VCPU mapped to that collection, then iterate over the VM's list - * of mapped LPIs and update the configuration for each IRQ which targets - * the specified vcpu. The configuration will be read from the in-memory - * configuration table. - * Must be called with the its_lock mutex held. - */ -static int vgic_its_cmd_handle_invall(struct kvm *kvm, struct vgic_its *its, - u64 *its_cmd) -{ - u32 coll_id = its_cmd_get_collection(its_cmd); - struct its_collection *collection; - struct kvm_vcpu *vcpu; - struct vgic_irq *irq; - u32 *intids; - int irq_count, i; - - collection = find_collection(its, coll_id); - if (!its_is_collection_mapped(collection)) - return E_ITS_INVALL_UNMAPPED_COLLECTION; - - vcpu = kvm_get_vcpu(kvm, collection->target_addr); - - irq_count = vgic_copy_lpi_list(kvm, &intids); - if (irq_count < 0) - return irq_count; - - for (i = 0; i < irq_count; i++) { - irq = vgic_get_irq(kvm, NULL, intids[i]); - if (!irq) - continue; - update_lpi_config(kvm, irq, vcpu); - vgic_put_irq(kvm, irq); - } - - kfree(intids); - - return 0; -} - -/* - * The MOVALL command moves the pending state of all IRQs targeting one - * redistributor to another. We don't hold the pending state in the VCPUs, - * but in the IRQs instead, so there is really not much to do for us here. - * However the spec says that no IRQ must target the old redistributor - * afterwards, so we make sure that no LPI is using the associated target_vcpu. - * This command affects all LPIs in the system that target that redistributor. 
- */ -static int vgic_its_cmd_handle_movall(struct kvm *kvm, struct vgic_its *its, - u64 *its_cmd) -{ - struct vgic_dist *dist = &kvm->arch.vgic; - u32 target1_addr = its_cmd_get_target_addr(its_cmd); - u32 target2_addr = its_cmd_mask_field(its_cmd, 3, 16, 32); - struct kvm_vcpu *vcpu1, *vcpu2; - struct vgic_irq *irq; - - if (target1_addr >= atomic_read(&kvm->online_vcpus) || - target2_addr >= atomic_read(&kvm->online_vcpus)) - return E_ITS_MOVALL_PROCNUM_OOR; - - if (target1_addr == target2_addr) - return 0; - - vcpu1 = kvm_get_vcpu(kvm, target1_addr); - vcpu2 = kvm_get_vcpu(kvm, target2_addr); - - spin_lock(&dist->lpi_list_lock); - - list_for_each_entry(irq, &dist->lpi_list_head, lpi_list) { - spin_lock(&irq->irq_lock); - - if (irq->target_vcpu == vcpu1) - irq->target_vcpu = vcpu2; - - spin_unlock(&irq->irq_lock); - } - - spin_unlock(&dist->lpi_list_lock); - - return 0; -} - -/* - * The INT command injects the LPI associated with that DevID/EvID pair. - * Must be called with the its_lock mutex held. - */ -static int vgic_its_cmd_handle_int(struct kvm *kvm, struct vgic_its *its, - u64 *its_cmd) -{ - u32 msi_data = its_cmd_get_id(its_cmd); - u64 msi_devid = its_cmd_get_deviceid(its_cmd); - - return vgic_its_trigger_msi(kvm, its, msi_devid, msi_data); -} - -/* - * This function is called with the its_cmd lock held, but the ITS data - * structure lock dropped. - */ -static int vgic_its_handle_command(struct kvm *kvm, struct vgic_its *its, - u64 *its_cmd) -{ - int ret = -ENODEV; - - mutex_lock(&its->its_lock); - switch (its_cmd_get_command(its_cmd)) { - case GITS_CMD_MAPD: - ret = vgic_its_cmd_handle_mapd(kvm, its, its_cmd); - break; - case GITS_CMD_MAPC: - ret = vgic_its_cmd_handle_mapc(kvm, its, its_cmd); - break; - case GITS_CMD_MAPI: - ret = vgic_its_cmd_handle_mapi(kvm, its, its_cmd); - break; - case GITS_CMD_MAPTI: - ret = vgic_its_cmd_handle_mapi(kvm, its, its_cmd); - break; - case GITS_CMD_MOVI: - ret = vgic_its_cmd_handle_movi(kvm, its, its_cmd); - break; - case GITS_CMD_DISCARD: - ret = vgic_its_cmd_handle_discard(kvm, its, its_cmd); - break; - case GITS_CMD_CLEAR: - ret = vgic_its_cmd_handle_clear(kvm, its, its_cmd); - break; - case GITS_CMD_MOVALL: - ret = vgic_its_cmd_handle_movall(kvm, its, its_cmd); - break; - case GITS_CMD_INT: - ret = vgic_its_cmd_handle_int(kvm, its, its_cmd); - break; - case GITS_CMD_INV: - ret = vgic_its_cmd_handle_inv(kvm, its, its_cmd); - break; - case GITS_CMD_INVALL: - ret = vgic_its_cmd_handle_invall(kvm, its, its_cmd); - break; - case GITS_CMD_SYNC: - /* we ignore this command: we are in sync all of the time */ - ret = 0; - break; - } - mutex_unlock(&its->its_lock); - - return ret; -} - -static u64 vgic_sanitise_its_baser(u64 reg) -{ - reg = vgic_sanitise_field(reg, GITS_BASER_SHAREABILITY_MASK, - GITS_BASER_SHAREABILITY_SHIFT, - vgic_sanitise_shareability); - reg = vgic_sanitise_field(reg, GITS_BASER_INNER_CACHEABILITY_MASK, - GITS_BASER_INNER_CACHEABILITY_SHIFT, - vgic_sanitise_inner_cacheability); - reg = vgic_sanitise_field(reg, GITS_BASER_OUTER_CACHEABILITY_MASK, - GITS_BASER_OUTER_CACHEABILITY_SHIFT, - vgic_sanitise_outer_cacheability); - - /* Bits 15:12 contain bits 51:48 of the PA, which we don't support. 
*/ - reg &= ~GENMASK_ULL(15, 12); - - /* We support only one (ITS) page size: 64K */ - reg = (reg & ~GITS_BASER_PAGE_SIZE_MASK) | GITS_BASER_PAGE_SIZE_64K; - - return reg; -} - -static u64 vgic_sanitise_its_cbaser(u64 reg) -{ - reg = vgic_sanitise_field(reg, GITS_CBASER_SHAREABILITY_MASK, - GITS_CBASER_SHAREABILITY_SHIFT, - vgic_sanitise_shareability); - reg = vgic_sanitise_field(reg, GITS_CBASER_INNER_CACHEABILITY_MASK, - GITS_CBASER_INNER_CACHEABILITY_SHIFT, - vgic_sanitise_inner_cacheability); - reg = vgic_sanitise_field(reg, GITS_CBASER_OUTER_CACHEABILITY_MASK, - GITS_CBASER_OUTER_CACHEABILITY_SHIFT, - vgic_sanitise_outer_cacheability); - - /* - * Sanitise the physical address to be 64k aligned. - * Also limit the physical addresses to 48 bits. - */ - reg &= ~(GENMASK_ULL(51, 48) | GENMASK_ULL(15, 12)); - - return reg; -} - -static unsigned long vgic_mmio_read_its_cbaser(struct kvm *kvm, - struct vgic_its *its, - gpa_t addr, unsigned int len) -{ - return extract_bytes(its->cbaser, addr & 7, len); -} - -static void vgic_mmio_write_its_cbaser(struct kvm *kvm, struct vgic_its *its, - gpa_t addr, unsigned int len, - unsigned long val) -{ - /* When GITS_CTLR.Enable is 1, this register is RO. */ - if (its->enabled) - return; - - mutex_lock(&its->cmd_lock); - its->cbaser = update_64bit_reg(its->cbaser, addr & 7, len, val); - its->cbaser = vgic_sanitise_its_cbaser(its->cbaser); - its->creadr = 0; - /* - * CWRITER is architecturally UNKNOWN on reset, but we need to reset - * it to CREADR to make sure we start with an empty command buffer. - */ - its->cwriter = its->creadr; - mutex_unlock(&its->cmd_lock); -} - -#define ITS_CMD_BUFFER_SIZE(baser) ((((baser) & 0xff) + 1) << 12) -#define ITS_CMD_SIZE 32 -#define ITS_CMD_OFFSET(reg) ((reg) & GENMASK(19, 5)) - -/* - * By writing to CWRITER the guest announces new commands to be processed. - * To avoid any races in the first place, we take the its_cmd lock, which - * protects our ring buffer variables, so that there is only one user - * per ITS handling commands at a given time. - */ -static void vgic_mmio_write_its_cwriter(struct kvm *kvm, struct vgic_its *its, - gpa_t addr, unsigned int len, - unsigned long val) -{ - gpa_t cbaser; - u64 cmd_buf[4]; - u32 reg; - - if (!its) - return; - - mutex_lock(&its->cmd_lock); - - reg = update_64bit_reg(its->cwriter, addr & 7, len, val); - reg = ITS_CMD_OFFSET(reg); - if (reg >= ITS_CMD_BUFFER_SIZE(its->cbaser)) { - mutex_unlock(&its->cmd_lock); - return; - } - - its->cwriter = reg; - cbaser = CBASER_ADDRESS(its->cbaser); - - while (its->cwriter != its->creadr) { - int ret = kvm_read_guest(kvm, cbaser + its->creadr, - cmd_buf, ITS_CMD_SIZE); - /* - * If kvm_read_guest() fails, this could be due to the guest - * programming a bogus value in CBASER or something else going - * wrong from which we cannot easily recover. - * According to section 6.3.2 in the GICv3 spec we can just - * ignore that command then. 
- */ - if (!ret) - vgic_its_handle_command(kvm, its, cmd_buf); - - its->creadr += ITS_CMD_SIZE; - if (its->creadr == ITS_CMD_BUFFER_SIZE(its->cbaser)) - its->creadr = 0; - } - - mutex_unlock(&its->cmd_lock); -} - -static unsigned long vgic_mmio_read_its_cwriter(struct kvm *kvm, - struct vgic_its *its, - gpa_t addr, unsigned int len) -{ - return extract_bytes(its->cwriter, addr & 0x7, len); -} - -static unsigned long vgic_mmio_read_its_creadr(struct kvm *kvm, - struct vgic_its *its, - gpa_t addr, unsigned int len) -{ - return extract_bytes(its->creadr, addr & 0x7, len); -} - -#define BASER_INDEX(addr) (((addr) / sizeof(u64)) & 0x7) -static unsigned long vgic_mmio_read_its_baser(struct kvm *kvm, - struct vgic_its *its, - gpa_t addr, unsigned int len) -{ - u64 reg; - - switch (BASER_INDEX(addr)) { - case 0: - reg = its->baser_device_table; - break; - case 1: - reg = its->baser_coll_table; - break; - default: - reg = 0; - break; - } - - return extract_bytes(reg, addr & 7, len); -} - -#define GITS_BASER_RO_MASK (GENMASK_ULL(52, 48) | GENMASK_ULL(58, 56)) -static void vgic_mmio_write_its_baser(struct kvm *kvm, - struct vgic_its *its, - gpa_t addr, unsigned int len, - unsigned long val) -{ - u64 entry_size, device_type; - u64 reg, *regptr, clearbits = 0; - - /* When GITS_CTLR.Enable is 1, we ignore write accesses. */ - if (its->enabled) - return; - - switch (BASER_INDEX(addr)) { - case 0: - regptr = &its->baser_device_table; - entry_size = 8; - device_type = GITS_BASER_TYPE_DEVICE; - break; - case 1: - regptr = &its->baser_coll_table; - entry_size = 8; - device_type = GITS_BASER_TYPE_COLLECTION; - clearbits = GITS_BASER_INDIRECT; - break; - default: - return; - } - - reg = update_64bit_reg(*regptr, addr & 7, len, val); - reg &= ~GITS_BASER_RO_MASK; - reg &= ~clearbits; - - reg |= (entry_size - 1) << GITS_BASER_ENTRY_SIZE_SHIFT; - reg |= device_type << GITS_BASER_TYPE_SHIFT; - reg = vgic_sanitise_its_baser(reg); - - *regptr = reg; -} - -#define REGISTER_ITS_DESC(off, rd, wr, length, acc) \ -{ \ - .reg_offset = off, \ - .len = length, \ - .access_flags = acc, \ - .its_read = rd, \ - .its_write = wr, \ -} - -static void its_mmio_write_wi(struct kvm *kvm, struct vgic_its *its, - gpa_t addr, unsigned int len, unsigned long val) -{ - /* Ignore */ -} - -static struct vgic_register_region its_registers[] = { - REGISTER_ITS_DESC(GITS_CTLR, - vgic_mmio_read_its_ctlr, vgic_mmio_write_its_ctlr, 4, - VGIC_ACCESS_32bit), - REGISTER_ITS_DESC(GITS_IIDR, - vgic_mmio_read_its_iidr, its_mmio_write_wi, 4, - VGIC_ACCESS_32bit), - REGISTER_ITS_DESC(GITS_TYPER, - vgic_mmio_read_its_typer, its_mmio_write_wi, 8, - VGIC_ACCESS_64bit | VGIC_ACCESS_32bit), - REGISTER_ITS_DESC(GITS_CBASER, - vgic_mmio_read_its_cbaser, vgic_mmio_write_its_cbaser, 8, - VGIC_ACCESS_64bit | VGIC_ACCESS_32bit), - REGISTER_ITS_DESC(GITS_CWRITER, - vgic_mmio_read_its_cwriter, vgic_mmio_write_its_cwriter, 8, - VGIC_ACCESS_64bit | VGIC_ACCESS_32bit), - REGISTER_ITS_DESC(GITS_CREADR, - vgic_mmio_read_its_creadr, its_mmio_write_wi, 8, - VGIC_ACCESS_64bit | VGIC_ACCESS_32bit), - REGISTER_ITS_DESC(GITS_BASER, - vgic_mmio_read_its_baser, vgic_mmio_write_its_baser, 0x40, - VGIC_ACCESS_64bit | VGIC_ACCESS_32bit), - REGISTER_ITS_DESC(GITS_IDREGS_BASE, - vgic_mmio_read_its_idregs, its_mmio_write_wi, 0x30, - VGIC_ACCESS_32bit), -}; - -/* This is called on setting the LPI enable bit in the redistributor. 
*/ -void vgic_enable_lpis(struct kvm_vcpu *vcpu) -{ - if (!(vcpu->arch.vgic_cpu.pendbaser & GICR_PENDBASER_PTZ)) - its_sync_lpi_pending_table(vcpu); -} - -static int vgic_register_its_iodev(struct kvm *kvm, struct vgic_its *its) -{ - struct vgic_io_device *iodev = &its->iodev; - int ret; - - if (!its->initialized) - return -EBUSY; - - if (IS_VGIC_ADDR_UNDEF(its->vgic_its_base)) - return -ENXIO; - - iodev->regions = its_registers; - iodev->nr_regions = ARRAY_SIZE(its_registers); - kvm_iodevice_init(&iodev->dev, &kvm_io_gic_ops); - - iodev->base_addr = its->vgic_its_base; - iodev->iodev_type = IODEV_ITS; - iodev->its = its; - mutex_lock(&kvm->slots_lock); - ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, iodev->base_addr, - KVM_VGIC_V3_ITS_SIZE, &iodev->dev); - mutex_unlock(&kvm->slots_lock); - - return ret; -} - -#define INITIAL_BASER_VALUE \ - (GIC_BASER_CACHEABILITY(GITS_BASER, INNER, RaWb) | \ - GIC_BASER_CACHEABILITY(GITS_BASER, OUTER, SameAsInner) | \ - GIC_BASER_SHAREABILITY(GITS_BASER, InnerShareable) | \ - ((8ULL - 1) << GITS_BASER_ENTRY_SIZE_SHIFT) | \ - GITS_BASER_PAGE_SIZE_64K) - -#define INITIAL_PROPBASER_VALUE \ - (GIC_BASER_CACHEABILITY(GICR_PROPBASER, INNER, RaWb) | \ - GIC_BASER_CACHEABILITY(GICR_PROPBASER, OUTER, SameAsInner) | \ - GIC_BASER_SHAREABILITY(GICR_PROPBASER, InnerShareable)) - -static int vgic_its_create(struct kvm_device *dev, u32 type) -{ - struct vgic_its *its; - - if (type != KVM_DEV_TYPE_ARM_VGIC_ITS) - return -ENODEV; - - its = kzalloc(sizeof(struct vgic_its), GFP_KERNEL); - if (!its) - return -ENOMEM; - - mutex_init(&its->its_lock); - mutex_init(&its->cmd_lock); - - its->vgic_its_base = VGIC_ADDR_UNDEF; - - INIT_LIST_HEAD(&its->device_list); - INIT_LIST_HEAD(&its->collection_list); - - dev->kvm->arch.vgic.has_its = true; - its->initialized = false; - its->enabled = false; - its->dev = dev; - - its->baser_device_table = INITIAL_BASER_VALUE | - ((u64)GITS_BASER_TYPE_DEVICE << GITS_BASER_TYPE_SHIFT); - its->baser_coll_table = INITIAL_BASER_VALUE | - ((u64)GITS_BASER_TYPE_COLLECTION << GITS_BASER_TYPE_SHIFT); - dev->kvm->arch.vgic.propbaser = INITIAL_PROPBASER_VALUE; - - dev->private = its; - - return 0; -} - -static void vgic_its_destroy(struct kvm_device *kvm_dev) -{ - struct kvm *kvm = kvm_dev->kvm; - struct vgic_its *its = kvm_dev->private; - struct its_device *dev; - struct its_itte *itte; - struct list_head *dev_cur, *dev_temp; - struct list_head *cur, *temp; - - /* - * We may end up here without the lists ever having been initialized. - * Check this and bail out early to avoid dereferencing a NULL pointer. 
- */ - if (!its->device_list.next) - return; - - mutex_lock(&its->its_lock); - list_for_each_safe(dev_cur, dev_temp, &its->device_list) { - dev = container_of(dev_cur, struct its_device, dev_list); - list_for_each_safe(cur, temp, &dev->itt_head) { - itte = (container_of(cur, struct its_itte, itte_list)); - its_free_itte(kvm, itte); - } - list_del(dev_cur); - kfree(dev); - } - - list_for_each_safe(cur, temp, &its->collection_list) { - list_del(cur); - kfree(container_of(cur, struct its_collection, coll_list)); - } - mutex_unlock(&its->its_lock); - - kfree(its); -} - -static int vgic_its_has_attr(struct kvm_device *dev, - struct kvm_device_attr *attr) -{ - switch (attr->group) { - case KVM_DEV_ARM_VGIC_GRP_ADDR: - switch (attr->attr) { - case KVM_VGIC_ITS_ADDR_TYPE: - return 0; - } - break; - case KVM_DEV_ARM_VGIC_GRP_CTRL: - switch (attr->attr) { - case KVM_DEV_ARM_VGIC_CTRL_INIT: - return 0; - } - break; - } - return -ENXIO; -} - -static int vgic_its_set_attr(struct kvm_device *dev, - struct kvm_device_attr *attr) -{ - struct vgic_its *its = dev->private; - int ret; - - switch (attr->group) { - case KVM_DEV_ARM_VGIC_GRP_ADDR: { - u64 __user *uaddr = (u64 __user *)(long)attr->addr; - unsigned long type = (unsigned long)attr->attr; - u64 addr; - - if (type != KVM_VGIC_ITS_ADDR_TYPE) - return -ENODEV; - - if (copy_from_user(&addr, uaddr, sizeof(addr))) - return -EFAULT; - - ret = vgic_check_ioaddr(dev->kvm, &its->vgic_its_base, - addr, SZ_64K); - if (ret) - return ret; - - its->vgic_its_base = addr; - - return 0; - } - case KVM_DEV_ARM_VGIC_GRP_CTRL: - switch (attr->attr) { - case KVM_DEV_ARM_VGIC_CTRL_INIT: - its->initialized = true; - - return 0; - } - break; - } - return -ENXIO; -} - -static int vgic_its_get_attr(struct kvm_device *dev, - struct kvm_device_attr *attr) -{ - switch (attr->group) { - case KVM_DEV_ARM_VGIC_GRP_ADDR: { - struct vgic_its *its = dev->private; - u64 addr = its->vgic_its_base; - u64 __user *uaddr = (u64 __user *)(long)attr->addr; - unsigned long type = (unsigned long)attr->attr; - - if (type != KVM_VGIC_ITS_ADDR_TYPE) - return -ENODEV; - - if (copy_to_user(uaddr, &addr, sizeof(addr))) - return -EFAULT; - break; - default: - return -ENXIO; - } - } - - return 0; -} - -static struct kvm_device_ops kvm_arm_vgic_its_ops = { - .name = "kvm-arm-vgic-its", - .create = vgic_its_create, - .destroy = vgic_its_destroy, - .set_attr = vgic_its_set_attr, - .get_attr = vgic_its_get_attr, - .has_attr = vgic_its_has_attr, -}; - -int kvm_vgic_register_its_device(void) -{ - return kvm_register_device_ops(&kvm_arm_vgic_its_ops, - KVM_DEV_TYPE_ARM_VGIC_ITS); -} - -/* - * Registers all ITSes with the kvm_io_bus framework. - * To follow the existing VGIC initialization sequence, this has to be - * done as late as possible, just before the first VCPU runs. - */ -int vgic_register_its_iodevs(struct kvm *kvm) -{ - struct kvm_device *dev; - int ret = 0; - - list_for_each_entry(dev, &kvm->devices, vm_node) { - if (dev->ops != &kvm_arm_vgic_its_ops) - continue; - - ret = vgic_register_its_iodev(kvm, dev->private); - if (ret) - return ret; - /* - * We don't need to care about tearing down previously - * registered ITSes, as the kvm_io_bus framework removes - * them for us if the VM gets destroyed. - */ - } - - return ret; -} diff --git a/virt/kvm/arm/vgic/vgic-kvm-device.c b/virt/kvm/arm/vgic/vgic-kvm-device.c deleted file mode 100644 index ce1f4ed..0000000 --- a/virt/kvm/arm/vgic/vgic-kvm-device.c +++ /dev/null @@ -1,474 +0,0 @@ -/* - * VGIC: KVM DEVICE API - * - * Copyright (C) 2015 ARM Ltd. 
- * Author: Marc Zyngier <marc.zyngier@arm.com> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - */ -#include <linux/kvm_host.h> -#include <kvm/arm_vgic.h> -#include <linux/uaccess.h> -#include <asm/kvm_mmu.h> -#include "vgic.h" - -/* common helpers */ - -int vgic_check_ioaddr(struct kvm *kvm, phys_addr_t *ioaddr, - phys_addr_t addr, phys_addr_t alignment) -{ - if (addr & ~KVM_PHYS_MASK) - return -E2BIG; - - if (!IS_ALIGNED(addr, alignment)) - return -EINVAL; - - if (!IS_VGIC_ADDR_UNDEF(*ioaddr)) - return -EEXIST; - - return 0; -} - -/** - * kvm_vgic_addr - set or get vgic VM base addresses - * @kvm: pointer to the vm struct - * @type: the VGIC addr type, one of KVM_VGIC_V[23]_ADDR_TYPE_XXX - * @addr: pointer to address value - * @write: if true set the address in the VM address space, if false read the - * address - * - * Set or get the vgic base addresses for the distributor and the virtual CPU - * interface in the VM physical address space. These addresses are properties - * of the emulated core/SoC and therefore user space initially knows this - * information. - * Check them for sanity (alignment, double assignment). We can't check for - * overlapping regions in case of a virtual GICv3 here, since we don't know - * the number of VCPUs yet, so we defer this check to map_resources(). - */ -int kvm_vgic_addr(struct kvm *kvm, unsigned long type, u64 *addr, bool write) -{ - int r = 0; - struct vgic_dist *vgic = &kvm->arch.vgic; - int type_needed; - phys_addr_t *addr_ptr, alignment; - - mutex_lock(&kvm->lock); - switch (type) { - case KVM_VGIC_V2_ADDR_TYPE_DIST: - type_needed = KVM_DEV_TYPE_ARM_VGIC_V2; - addr_ptr = &vgic->vgic_dist_base; - alignment = SZ_4K; - break; - case KVM_VGIC_V2_ADDR_TYPE_CPU: - type_needed = KVM_DEV_TYPE_ARM_VGIC_V2; - addr_ptr = &vgic->vgic_cpu_base; - alignment = SZ_4K; - break; - case KVM_VGIC_V3_ADDR_TYPE_DIST: - type_needed = KVM_DEV_TYPE_ARM_VGIC_V3; - addr_ptr = &vgic->vgic_dist_base; - alignment = SZ_64K; - break; - case KVM_VGIC_V3_ADDR_TYPE_REDIST: - type_needed = KVM_DEV_TYPE_ARM_VGIC_V3; - addr_ptr = &vgic->vgic_redist_base; - alignment = SZ_64K; - break; - default: - r = -ENODEV; - goto out; - } - - if (vgic->vgic_model != type_needed) { - r = -ENODEV; - goto out; - } - - if (write) { - r = vgic_check_ioaddr(kvm, addr_ptr, *addr, alignment); - if (!r) - *addr_ptr = *addr; - } else { - *addr = *addr_ptr; - } - -out: - mutex_unlock(&kvm->lock); - return r; -} - -static int vgic_set_common_attr(struct kvm_device *dev, - struct kvm_device_attr *attr) -{ - int r; - - switch (attr->group) { - case KVM_DEV_ARM_VGIC_GRP_ADDR: { - u64 __user *uaddr = (u64 __user *)(long)attr->addr; - u64 addr; - unsigned long type = (unsigned long)attr->attr; - - if (copy_from_user(&addr, uaddr, sizeof(addr))) - return -EFAULT; - - r = kvm_vgic_addr(dev->kvm, type, &addr, true); - return (r == -ENODEV) ? 
-ENXIO : r; - } - case KVM_DEV_ARM_VGIC_GRP_NR_IRQS: { - u32 __user *uaddr = (u32 __user *)(long)attr->addr; - u32 val; - int ret = 0; - - if (get_user(val, uaddr)) - return -EFAULT; - - /* - * We require: - * - at least 32 SPIs on top of the 16 SGIs and 16 PPIs - * - at most 1024 interrupts - * - a multiple of 32 interrupts - */ - if (val < (VGIC_NR_PRIVATE_IRQS + 32) || - val > VGIC_MAX_RESERVED || - (val & 31)) - return -EINVAL; - - mutex_lock(&dev->kvm->lock); - - if (vgic_ready(dev->kvm) || dev->kvm->arch.vgic.nr_spis) - ret = -EBUSY; - else - dev->kvm->arch.vgic.nr_spis = - val - VGIC_NR_PRIVATE_IRQS; - - mutex_unlock(&dev->kvm->lock); - - return ret; - } - case KVM_DEV_ARM_VGIC_GRP_CTRL: { - switch (attr->attr) { - case KVM_DEV_ARM_VGIC_CTRL_INIT: - mutex_lock(&dev->kvm->lock); - r = vgic_init(dev->kvm); - mutex_unlock(&dev->kvm->lock); - return r; - } - break; - } - } - - return -ENXIO; -} - -static int vgic_get_common_attr(struct kvm_device *dev, - struct kvm_device_attr *attr) -{ - int r = -ENXIO; - - switch (attr->group) { - case KVM_DEV_ARM_VGIC_GRP_ADDR: { - u64 __user *uaddr = (u64 __user *)(long)attr->addr; - u64 addr; - unsigned long type = (unsigned long)attr->attr; - - r = kvm_vgic_addr(dev->kvm, type, &addr, false); - if (r) - return (r == -ENODEV) ? -ENXIO : r; - - if (copy_to_user(uaddr, &addr, sizeof(addr))) - return -EFAULT; - break; - } - case KVM_DEV_ARM_VGIC_GRP_NR_IRQS: { - u32 __user *uaddr = (u32 __user *)(long)attr->addr; - - r = put_user(dev->kvm->arch.vgic.nr_spis + - VGIC_NR_PRIVATE_IRQS, uaddr); - break; - } - } - - return r; -} - -static int vgic_create(struct kvm_device *dev, u32 type) -{ - return kvm_vgic_create(dev->kvm, type); -} - -static void vgic_destroy(struct kvm_device *dev) -{ - kfree(dev); -} - -int kvm_register_vgic_device(unsigned long type) -{ - int ret = -ENODEV; - - switch (type) { - case KVM_DEV_TYPE_ARM_VGIC_V2: - ret = kvm_register_device_ops(&kvm_arm_vgic_v2_ops, - KVM_DEV_TYPE_ARM_VGIC_V2); - break; - case KVM_DEV_TYPE_ARM_VGIC_V3: - ret = kvm_register_device_ops(&kvm_arm_vgic_v3_ops, - KVM_DEV_TYPE_ARM_VGIC_V3); - -#ifdef CONFIG_KVM_ARM_VGIC_V3_ITS - if (ret) - break; - ret = kvm_vgic_register_its_device(); -#endif - break; - } - - return ret; -} - -struct vgic_reg_attr { - struct kvm_vcpu *vcpu; - gpa_t addr; -}; - -static int parse_vgic_v2_attr(struct kvm_device *dev, - struct kvm_device_attr *attr, - struct vgic_reg_attr *reg_attr) -{ - int cpuid; - - cpuid = (attr->attr & KVM_DEV_ARM_VGIC_CPUID_MASK) >> - KVM_DEV_ARM_VGIC_CPUID_SHIFT; - - if (cpuid >= atomic_read(&dev->kvm->online_vcpus)) - return -EINVAL; - - reg_attr->vcpu = kvm_get_vcpu(dev->kvm, cpuid); - reg_attr->addr = attr->attr & KVM_DEV_ARM_VGIC_OFFSET_MASK; - - return 0; -} - -/* unlocks vcpus from @vcpu_lock_idx and smaller */ -static void unlock_vcpus(struct kvm *kvm, int vcpu_lock_idx) -{ - struct kvm_vcpu *tmp_vcpu; - - for (; vcpu_lock_idx >= 0; vcpu_lock_idx--) { - tmp_vcpu = kvm_get_vcpu(kvm, vcpu_lock_idx); - mutex_unlock(&tmp_vcpu->mutex); - } -} - -static void unlock_all_vcpus(struct kvm *kvm) -{ - unlock_vcpus(kvm, atomic_read(&kvm->online_vcpus) - 1); -} - -/* Returns true if all vcpus were locked, false otherwise */ -static bool lock_all_vcpus(struct kvm *kvm) -{ - struct kvm_vcpu *tmp_vcpu; - int c; - - /* - * Any time a vcpu is run, vcpu_load is called which tries to grab the - * vcpu->mutex. By grabbing the vcpu->mutex of all VCPUs we ensure - * that no other VCPUs are run and fiddle with the vgic state while we - * access it. 
- */ - kvm_for_each_vcpu(c, tmp_vcpu, kvm) { - if (!mutex_trylock(&tmp_vcpu->mutex)) { - unlock_vcpus(kvm, c - 1); - return false; - } - } - - return true; -} - -/** - * vgic_attr_regs_access_v2 - allows user space to access VGIC v2 state - * - * @dev: kvm device handle - * @attr: kvm device attribute - * @reg: address the value is read or written - * @is_write: true if userspace is writing a register - */ -static int vgic_attr_regs_access_v2(struct kvm_device *dev, - struct kvm_device_attr *attr, - u32 *reg, bool is_write) -{ - struct vgic_reg_attr reg_attr; - gpa_t addr; - struct kvm_vcpu *vcpu; - int ret; - - ret = parse_vgic_v2_attr(dev, attr, ®_attr); - if (ret) - return ret; - - vcpu = reg_attr.vcpu; - addr = reg_attr.addr; - - mutex_lock(&dev->kvm->lock); - - ret = vgic_init(dev->kvm); - if (ret) - goto out; - - if (!lock_all_vcpus(dev->kvm)) { - ret = -EBUSY; - goto out; - } - - switch (attr->group) { - case KVM_DEV_ARM_VGIC_GRP_CPU_REGS: - ret = vgic_v2_cpuif_uaccess(vcpu, is_write, addr, reg); - break; - case KVM_DEV_ARM_VGIC_GRP_DIST_REGS: - ret = vgic_v2_dist_uaccess(vcpu, is_write, addr, reg); - break; - default: - ret = -EINVAL; - break; - } - - unlock_all_vcpus(dev->kvm); -out: - mutex_unlock(&dev->kvm->lock); - return ret; -} - -static int vgic_v2_set_attr(struct kvm_device *dev, - struct kvm_device_attr *attr) -{ - int ret; - - ret = vgic_set_common_attr(dev, attr); - if (ret != -ENXIO) - return ret; - - switch (attr->group) { - case KVM_DEV_ARM_VGIC_GRP_DIST_REGS: - case KVM_DEV_ARM_VGIC_GRP_CPU_REGS: { - u32 __user *uaddr = (u32 __user *)(long)attr->addr; - u32 reg; - - if (get_user(reg, uaddr)) - return -EFAULT; - - return vgic_attr_regs_access_v2(dev, attr, ®, true); - } - } - - return -ENXIO; -} - -static int vgic_v2_get_attr(struct kvm_device *dev, - struct kvm_device_attr *attr) -{ - int ret; - - ret = vgic_get_common_attr(dev, attr); - if (ret != -ENXIO) - return ret; - - switch (attr->group) { - case KVM_DEV_ARM_VGIC_GRP_DIST_REGS: - case KVM_DEV_ARM_VGIC_GRP_CPU_REGS: { - u32 __user *uaddr = (u32 __user *)(long)attr->addr; - u32 reg = 0; - - ret = vgic_attr_regs_access_v2(dev, attr, ®, false); - if (ret) - return ret; - return put_user(reg, uaddr); - } - } - - return -ENXIO; -} - -static int vgic_v2_has_attr(struct kvm_device *dev, - struct kvm_device_attr *attr) -{ - switch (attr->group) { - case KVM_DEV_ARM_VGIC_GRP_ADDR: - switch (attr->attr) { - case KVM_VGIC_V2_ADDR_TYPE_DIST: - case KVM_VGIC_V2_ADDR_TYPE_CPU: - return 0; - } - break; - case KVM_DEV_ARM_VGIC_GRP_DIST_REGS: - case KVM_DEV_ARM_VGIC_GRP_CPU_REGS: - return vgic_v2_has_attr_regs(dev, attr); - case KVM_DEV_ARM_VGIC_GRP_NR_IRQS: - return 0; - case KVM_DEV_ARM_VGIC_GRP_CTRL: - switch (attr->attr) { - case KVM_DEV_ARM_VGIC_CTRL_INIT: - return 0; - } - } - return -ENXIO; -} - -struct kvm_device_ops kvm_arm_vgic_v2_ops = { - .name = "kvm-arm-vgic-v2", - .create = vgic_create, - .destroy = vgic_destroy, - .set_attr = vgic_v2_set_attr, - .get_attr = vgic_v2_get_attr, - .has_attr = vgic_v2_has_attr, -}; - -static int vgic_v3_set_attr(struct kvm_device *dev, - struct kvm_device_attr *attr) -{ - return vgic_set_common_attr(dev, attr); -} - -static int vgic_v3_get_attr(struct kvm_device *dev, - struct kvm_device_attr *attr) -{ - return vgic_get_common_attr(dev, attr); -} - -static int vgic_v3_has_attr(struct kvm_device *dev, - struct kvm_device_attr *attr) -{ - switch (attr->group) { - case KVM_DEV_ARM_VGIC_GRP_ADDR: - switch (attr->attr) { - case KVM_VGIC_V3_ADDR_TYPE_DIST: - case 
KVM_VGIC_V3_ADDR_TYPE_REDIST: - return 0; - } - break; - case KVM_DEV_ARM_VGIC_GRP_NR_IRQS: - return 0; - case KVM_DEV_ARM_VGIC_GRP_CTRL: - switch (attr->attr) { - case KVM_DEV_ARM_VGIC_CTRL_INIT: - return 0; - } - } - return -ENXIO; -} - -struct kvm_device_ops kvm_arm_vgic_v3_ops = { - .name = "kvm-arm-vgic-v3", - .create = vgic_create, - .destroy = vgic_destroy, - .set_attr = vgic_v3_set_attr, - .get_attr = vgic_v3_get_attr, - .has_attr = vgic_v3_has_attr, -}; diff --git a/virt/kvm/arm/vgic/vgic-mmio-v2.c b/virt/kvm/arm/vgic/vgic-mmio-v2.c deleted file mode 100644 index b44b359..0000000 --- a/virt/kvm/arm/vgic/vgic-mmio-v2.c +++ /dev/null @@ -1,456 +0,0 @@ -/* - * VGICv2 MMIO handling functions - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - */ - -#include <linux/irqchip/arm-gic.h> -#include <linux/kvm.h> -#include <linux/kvm_host.h> -#include <kvm/iodev.h> -#include <kvm/arm_vgic.h> - -#include "vgic.h" -#include "vgic-mmio.h" - -static unsigned long vgic_mmio_read_v2_misc(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len) -{ - u32 value; - - switch (addr & 0x0c) { - case GIC_DIST_CTRL: - value = vcpu->kvm->arch.vgic.enabled ? GICD_ENABLE : 0; - break; - case GIC_DIST_CTR: - value = vcpu->kvm->arch.vgic.nr_spis + VGIC_NR_PRIVATE_IRQS; - value = (value >> 5) - 1; - value |= (atomic_read(&vcpu->kvm->online_vcpus) - 1) << 5; - break; - case GIC_DIST_IIDR: - value = (PRODUCT_ID_KVM << 24) | (IMPLEMENTER_ARM << 0); - break; - default: - return 0; - } - - return value; -} - -static void vgic_mmio_write_v2_misc(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len, - unsigned long val) -{ - struct vgic_dist *dist = &vcpu->kvm->arch.vgic; - bool was_enabled = dist->enabled; - - switch (addr & 0x0c) { - case GIC_DIST_CTRL: - dist->enabled = val & GICD_ENABLE; - if (!was_enabled && dist->enabled) - vgic_kick_vcpus(vcpu->kvm); - break; - case GIC_DIST_CTR: - case GIC_DIST_IIDR: - /* Nothing to do */ - return; - } -} - -static void vgic_mmio_write_sgir(struct kvm_vcpu *source_vcpu, - gpa_t addr, unsigned int len, - unsigned long val) -{ - int nr_vcpus = atomic_read(&source_vcpu->kvm->online_vcpus); - int intid = val & 0xf; - int targets = (val >> 16) & 0xff; - int mode = (val >> 24) & 0x03; - int c; - struct kvm_vcpu *vcpu; - - switch (mode) { - case 0x0: /* as specified by targets */ - break; - case 0x1: - targets = (1U << nr_vcpus) - 1; /* all, ... 
*/ - targets &= ~(1U << source_vcpu->vcpu_id); /* but self */ - break; - case 0x2: /* this very vCPU only */ - targets = (1U << source_vcpu->vcpu_id); - break; - case 0x3: /* reserved */ - return; - } - - kvm_for_each_vcpu(c, vcpu, source_vcpu->kvm) { - struct vgic_irq *irq; - - if (!(targets & (1U << c))) - continue; - - irq = vgic_get_irq(source_vcpu->kvm, vcpu, intid); - - spin_lock(&irq->irq_lock); - irq->pending = true; - irq->source |= 1U << source_vcpu->vcpu_id; - - vgic_queue_irq_unlock(source_vcpu->kvm, irq); - vgic_put_irq(source_vcpu->kvm, irq); - } -} - -static unsigned long vgic_mmio_read_target(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len) -{ - u32 intid = VGIC_ADDR_TO_INTID(addr, 8); - int i; - u64 val = 0; - - for (i = 0; i < len; i++) { - struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); - - val |= (u64)irq->targets << (i * 8); - - vgic_put_irq(vcpu->kvm, irq); - } - - return val; -} - -static void vgic_mmio_write_target(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len, - unsigned long val) -{ - u32 intid = VGIC_ADDR_TO_INTID(addr, 8); - int i; - - /* GICD_ITARGETSR[0-7] are read-only */ - if (intid < VGIC_NR_PRIVATE_IRQS) - return; - - for (i = 0; i < len; i++) { - struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, NULL, intid + i); - int target; - - spin_lock(&irq->irq_lock); - - irq->targets = (val >> (i * 8)) & 0xff; - target = irq->targets ? __ffs(irq->targets) : 0; - irq->target_vcpu = kvm_get_vcpu(vcpu->kvm, target); - - spin_unlock(&irq->irq_lock); - vgic_put_irq(vcpu->kvm, irq); - } -} - -static unsigned long vgic_mmio_read_sgipend(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len) -{ - u32 intid = addr & 0x0f; - int i; - u64 val = 0; - - for (i = 0; i < len; i++) { - struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); - - val |= (u64)irq->source << (i * 8); - - vgic_put_irq(vcpu->kvm, irq); - } - return val; -} - -static void vgic_mmio_write_sgipendc(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len, - unsigned long val) -{ - u32 intid = addr & 0x0f; - int i; - - for (i = 0; i < len; i++) { - struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); - - spin_lock(&irq->irq_lock); - - irq->source &= ~((val >> (i * 8)) & 0xff); - if (!irq->source) - irq->pending = false; - - spin_unlock(&irq->irq_lock); - vgic_put_irq(vcpu->kvm, irq); - } -} - -static void vgic_mmio_write_sgipends(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len, - unsigned long val) -{ - u32 intid = addr & 0x0f; - int i; - - for (i = 0; i < len; i++) { - struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); - - spin_lock(&irq->irq_lock); - - irq->source |= (val >> (i * 8)) & 0xff; - - if (irq->source) { - irq->pending = true; - vgic_queue_irq_unlock(vcpu->kvm, irq); - } else { - spin_unlock(&irq->irq_lock); - } - vgic_put_irq(vcpu->kvm, irq); - } -} - -static void vgic_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr) -{ - if (kvm_vgic_global_state.type == VGIC_V2) - vgic_v2_set_vmcr(vcpu, vmcr); - else - vgic_v3_set_vmcr(vcpu, vmcr); -} - -static void vgic_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr) -{ - if (kvm_vgic_global_state.type == VGIC_V2) - vgic_v2_get_vmcr(vcpu, vmcr); - else - vgic_v3_get_vmcr(vcpu, vmcr); -} - -#define GICC_ARCH_VERSION_V2 0x2 - -/* These are for userland accesses only, there is no guest-facing emulation. 
*/ -static unsigned long vgic_mmio_read_vcpuif(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len) -{ - struct vgic_vmcr vmcr; - u32 val; - - vgic_get_vmcr(vcpu, &vmcr); - - switch (addr & 0xff) { - case GIC_CPU_CTRL: - val = vmcr.ctlr; - break; - case GIC_CPU_PRIMASK: - val = vmcr.pmr; - break; - case GIC_CPU_BINPOINT: - val = vmcr.bpr; - break; - case GIC_CPU_ALIAS_BINPOINT: - val = vmcr.abpr; - break; - case GIC_CPU_IDENT: - val = ((PRODUCT_ID_KVM << 20) | - (GICC_ARCH_VERSION_V2 << 16) | - IMPLEMENTER_ARM); - break; - default: - return 0; - } - - return val; -} - -static void vgic_mmio_write_vcpuif(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len, - unsigned long val) -{ - struct vgic_vmcr vmcr; - - vgic_get_vmcr(vcpu, &vmcr); - - switch (addr & 0xff) { - case GIC_CPU_CTRL: - vmcr.ctlr = val; - break; - case GIC_CPU_PRIMASK: - vmcr.pmr = val; - break; - case GIC_CPU_BINPOINT: - vmcr.bpr = val; - break; - case GIC_CPU_ALIAS_BINPOINT: - vmcr.abpr = val; - break; - } - - vgic_set_vmcr(vcpu, &vmcr); -} - -static const struct vgic_register_region vgic_v2_dist_registers[] = { - REGISTER_DESC_WITH_LENGTH(GIC_DIST_CTRL, - vgic_mmio_read_v2_misc, vgic_mmio_write_v2_misc, 12, - VGIC_ACCESS_32bit), - REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_IGROUP, - vgic_mmio_read_rao, vgic_mmio_write_wi, 1, - VGIC_ACCESS_32bit), - REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_ENABLE_SET, - vgic_mmio_read_enable, vgic_mmio_write_senable, 1, - VGIC_ACCESS_32bit), - REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_ENABLE_CLEAR, - vgic_mmio_read_enable, vgic_mmio_write_cenable, 1, - VGIC_ACCESS_32bit), - REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_PENDING_SET, - vgic_mmio_read_pending, vgic_mmio_write_spending, 1, - VGIC_ACCESS_32bit), - REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_PENDING_CLEAR, - vgic_mmio_read_pending, vgic_mmio_write_cpending, 1, - VGIC_ACCESS_32bit), - REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_ACTIVE_SET, - vgic_mmio_read_active, vgic_mmio_write_sactive, 1, - VGIC_ACCESS_32bit), - REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_ACTIVE_CLEAR, - vgic_mmio_read_active, vgic_mmio_write_cactive, 1, - VGIC_ACCESS_32bit), - REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_PRI, - vgic_mmio_read_priority, vgic_mmio_write_priority, 8, - VGIC_ACCESS_32bit | VGIC_ACCESS_8bit), - REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_TARGET, - vgic_mmio_read_target, vgic_mmio_write_target, 8, - VGIC_ACCESS_32bit | VGIC_ACCESS_8bit), - REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_CONFIG, - vgic_mmio_read_config, vgic_mmio_write_config, 2, - VGIC_ACCESS_32bit), - REGISTER_DESC_WITH_LENGTH(GIC_DIST_SOFTINT, - vgic_mmio_read_raz, vgic_mmio_write_sgir, 4, - VGIC_ACCESS_32bit), - REGISTER_DESC_WITH_LENGTH(GIC_DIST_SGI_PENDING_CLEAR, - vgic_mmio_read_sgipend, vgic_mmio_write_sgipendc, 16, - VGIC_ACCESS_32bit | VGIC_ACCESS_8bit), - REGISTER_DESC_WITH_LENGTH(GIC_DIST_SGI_PENDING_SET, - vgic_mmio_read_sgipend, vgic_mmio_write_sgipends, 16, - VGIC_ACCESS_32bit | VGIC_ACCESS_8bit), -}; - -static const struct vgic_register_region vgic_v2_cpu_registers[] = { - REGISTER_DESC_WITH_LENGTH(GIC_CPU_CTRL, - vgic_mmio_read_vcpuif, vgic_mmio_write_vcpuif, 4, - VGIC_ACCESS_32bit), - REGISTER_DESC_WITH_LENGTH(GIC_CPU_PRIMASK, - vgic_mmio_read_vcpuif, vgic_mmio_write_vcpuif, 4, - VGIC_ACCESS_32bit), - REGISTER_DESC_WITH_LENGTH(GIC_CPU_BINPOINT, - vgic_mmio_read_vcpuif, vgic_mmio_write_vcpuif, 4, - VGIC_ACCESS_32bit), - REGISTER_DESC_WITH_LENGTH(GIC_CPU_ALIAS_BINPOINT, - vgic_mmio_read_vcpuif, vgic_mmio_write_vcpuif, 4, - VGIC_ACCESS_32bit), - 
REGISTER_DESC_WITH_LENGTH(GIC_CPU_ACTIVEPRIO, - vgic_mmio_read_raz, vgic_mmio_write_wi, 16, - VGIC_ACCESS_32bit), - REGISTER_DESC_WITH_LENGTH(GIC_CPU_IDENT, - vgic_mmio_read_vcpuif, vgic_mmio_write_vcpuif, 4, - VGIC_ACCESS_32bit), -}; - -unsigned int vgic_v2_init_dist_iodev(struct vgic_io_device *dev) -{ - dev->regions = vgic_v2_dist_registers; - dev->nr_regions = ARRAY_SIZE(vgic_v2_dist_registers); - - kvm_iodevice_init(&dev->dev, &kvm_io_gic_ops); - - return SZ_4K; -} - -int vgic_v2_has_attr_regs(struct kvm_device *dev, struct kvm_device_attr *attr) -{ - int nr_irqs = dev->kvm->arch.vgic.nr_spis + VGIC_NR_PRIVATE_IRQS; - const struct vgic_register_region *regions; - gpa_t addr; - int nr_regions, i, len; - - addr = attr->attr & KVM_DEV_ARM_VGIC_OFFSET_MASK; - - switch (attr->group) { - case KVM_DEV_ARM_VGIC_GRP_DIST_REGS: - regions = vgic_v2_dist_registers; - nr_regions = ARRAY_SIZE(vgic_v2_dist_registers); - break; - case KVM_DEV_ARM_VGIC_GRP_CPU_REGS: - regions = vgic_v2_cpu_registers; - nr_regions = ARRAY_SIZE(vgic_v2_cpu_registers); - break; - default: - return -ENXIO; - } - - /* We only support aligned 32-bit accesses. */ - if (addr & 3) - return -ENXIO; - - for (i = 0; i < nr_regions; i++) { - if (regions[i].bits_per_irq) - len = (regions[i].bits_per_irq * nr_irqs) / 8; - else - len = regions[i].len; - - if (regions[i].reg_offset <= addr && - regions[i].reg_offset + len > addr) - return 0; - } - - return -ENXIO; -} - -/* - * When userland tries to access the VGIC register handlers, we need to - * create a usable struct vgic_io_device to be passed to the handlers and we - * have to set up a buffer similar to what would have happened if a guest MMIO - * access occurred, including doing endian conversions on BE systems. - */ -static int vgic_uaccess(struct kvm_vcpu *vcpu, struct vgic_io_device *dev, - bool is_write, int offset, u32 *val) -{ - unsigned int len = 4; - u8 buf[4]; - int ret; - - if (is_write) { - vgic_data_host_to_mmio_bus(buf, len, *val); - ret = kvm_io_gic_ops.write(vcpu, &dev->dev, offset, len, buf); - } else { - ret = kvm_io_gic_ops.read(vcpu, &dev->dev, offset, len, buf); - if (!ret) - *val = vgic_data_mmio_bus_to_host(buf, len); - } - - return ret; -} - -int vgic_v2_cpuif_uaccess(struct kvm_vcpu *vcpu, bool is_write, - int offset, u32 *val) -{ - struct vgic_io_device dev = { - .regions = vgic_v2_cpu_registers, - .nr_regions = ARRAY_SIZE(vgic_v2_cpu_registers), - .iodev_type = IODEV_CPUIF, - }; - - return vgic_uaccess(vcpu, &dev, is_write, offset, val); -} - -int vgic_v2_dist_uaccess(struct kvm_vcpu *vcpu, bool is_write, - int offset, u32 *val) -{ - struct vgic_io_device dev = { - .regions = vgic_v2_dist_registers, - .nr_regions = ARRAY_SIZE(vgic_v2_dist_registers), - .iodev_type = IODEV_DIST, - }; - - return vgic_uaccess(vcpu, &dev, is_write, offset, val); -} diff --git a/virt/kvm/arm/vgic/vgic-mmio-v3.c b/virt/kvm/arm/vgic/vgic-mmio-v3.c deleted file mode 100644 index 0d3c76a..0000000 --- a/virt/kvm/arm/vgic/vgic-mmio-v3.c +++ /dev/null @@ -1,656 +0,0 @@ -/* - * VGICv3 MMIO handling functions - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- */ - -#include <linux/irqchip/arm-gic-v3.h> -#include <linux/kvm.h> -#include <linux/kvm_host.h> -#include <kvm/iodev.h> -#include <kvm/arm_vgic.h> - -#include <asm/kvm_emulate.h> - -#include "vgic.h" -#include "vgic-mmio.h" - -/* extract @num bytes at @offset bytes offset in data */ -unsigned long extract_bytes(u64 data, unsigned int offset, - unsigned int num) -{ - return (data >> (offset * 8)) & GENMASK_ULL(num * 8 - 1, 0); -} - -/* allows updates of any half of a 64-bit register (or the whole thing) */ -u64 update_64bit_reg(u64 reg, unsigned int offset, unsigned int len, - unsigned long val) -{ - int lower = (offset & 4) * 8; - int upper = lower + 8 * len - 1; - - reg &= ~GENMASK_ULL(upper, lower); - val &= GENMASK_ULL(len * 8 - 1, 0); - - return reg | ((u64)val << lower); -} - -#ifdef CONFIG_KVM_ARM_VGIC_V3_ITS -bool vgic_has_its(struct kvm *kvm) -{ - struct vgic_dist *dist = &kvm->arch.vgic; - - if (dist->vgic_model != KVM_DEV_TYPE_ARM_VGIC_V3) - return false; - - return dist->has_its; -} -#endif - -static unsigned long vgic_mmio_read_v3_misc(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len) -{ - u32 value = 0; - - switch (addr & 0x0c) { - case GICD_CTLR: - if (vcpu->kvm->arch.vgic.enabled) - value |= GICD_CTLR_ENABLE_SS_G1; - value |= GICD_CTLR_ARE_NS | GICD_CTLR_DS; - break; - case GICD_TYPER: - value = vcpu->kvm->arch.vgic.nr_spis + VGIC_NR_PRIVATE_IRQS; - value = (value >> 5) - 1; - if (vgic_has_its(vcpu->kvm)) { - value |= (INTERRUPT_ID_BITS_ITS - 1) << 19; - value |= GICD_TYPER_LPIS; - } else { - value |= (INTERRUPT_ID_BITS_SPIS - 1) << 19; - } - break; - case GICD_IIDR: - value = (PRODUCT_ID_KVM << 24) | (IMPLEMENTER_ARM << 0); - break; - default: - return 0; - } - - return value; -} - -static void vgic_mmio_write_v3_misc(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len, - unsigned long val) -{ - struct vgic_dist *dist = &vcpu->kvm->arch.vgic; - bool was_enabled = dist->enabled; - - switch (addr & 0x0c) { - case GICD_CTLR: - dist->enabled = val & GICD_CTLR_ENABLE_SS_G1; - - if (!was_enabled && dist->enabled) - vgic_kick_vcpus(vcpu->kvm); - break; - case GICD_TYPER: - case GICD_IIDR: - return; - } -} - -static unsigned long vgic_mmio_read_irouter(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len) -{ - int intid = VGIC_ADDR_TO_INTID(addr, 64); - struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, NULL, intid); - unsigned long ret = 0; - - if (!irq) - return 0; - - /* The upper word is RAZ for us. */ - if (!(addr & 4)) - ret = extract_bytes(READ_ONCE(irq->mpidr), addr & 7, len); - - vgic_put_irq(vcpu->kvm, irq); - return ret; -} - -static void vgic_mmio_write_irouter(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len, - unsigned long val) -{ - int intid = VGIC_ADDR_TO_INTID(addr, 64); - struct vgic_irq *irq; - - /* The upper word is WI for us since we don't implement Aff3. */ - if (addr & 4) - return; - - irq = vgic_get_irq(vcpu->kvm, NULL, intid); - - if (!irq) - return; - - spin_lock(&irq->irq_lock); - - /* We only care about and preserve Aff0, Aff1 and Aff2. */ - irq->mpidr = val & GENMASK(23, 0); - irq->target_vcpu = kvm_mpidr_to_vcpu(vcpu->kvm, irq->mpidr); - - spin_unlock(&irq->irq_lock); - vgic_put_irq(vcpu->kvm, irq); -} - -static unsigned long vgic_mmio_read_v3r_ctlr(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len) -{ - struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; - - return vgic_cpu->lpis_enabled ? 
GICR_CTLR_ENABLE_LPIS : 0; -} - - -static void vgic_mmio_write_v3r_ctlr(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len, - unsigned long val) -{ - struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; - bool was_enabled = vgic_cpu->lpis_enabled; - - if (!vgic_has_its(vcpu->kvm)) - return; - - vgic_cpu->lpis_enabled = val & GICR_CTLR_ENABLE_LPIS; - - if (!was_enabled && vgic_cpu->lpis_enabled) - vgic_enable_lpis(vcpu); -} - -static unsigned long vgic_mmio_read_v3r_typer(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len) -{ - unsigned long mpidr = kvm_vcpu_get_mpidr_aff(vcpu); - int target_vcpu_id = vcpu->vcpu_id; - u64 value; - - value = (u64)(mpidr & GENMASK(23, 0)) << 32; - value |= ((target_vcpu_id & 0xffff) << 8); - if (target_vcpu_id == atomic_read(&vcpu->kvm->online_vcpus) - 1) - value |= GICR_TYPER_LAST; - if (vgic_has_its(vcpu->kvm)) - value |= GICR_TYPER_PLPIS; - - return extract_bytes(value, addr & 7, len); -} - -static unsigned long vgic_mmio_read_v3r_iidr(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len) -{ - return (PRODUCT_ID_KVM << 24) | (IMPLEMENTER_ARM << 0); -} - -static unsigned long vgic_mmio_read_v3_idregs(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len) -{ - switch (addr & 0xffff) { - case GICD_PIDR2: - /* report a GICv3 compliant implementation */ - return 0x3b; - } - - return 0; -} - -/* We want to avoid outer shareable. */ -u64 vgic_sanitise_shareability(u64 field) -{ - switch (field) { - case GIC_BASER_OuterShareable: - return GIC_BASER_InnerShareable; - default: - return field; - } -} - -/* Avoid any inner non-cacheable mapping. */ -u64 vgic_sanitise_inner_cacheability(u64 field) -{ - switch (field) { - case GIC_BASER_CACHE_nCnB: - case GIC_BASER_CACHE_nC: - return GIC_BASER_CACHE_RaWb; - default: - return field; - } -} - -/* Non-cacheable or same-as-inner are OK. 
*/ -u64 vgic_sanitise_outer_cacheability(u64 field) -{ - switch (field) { - case GIC_BASER_CACHE_SameAsInner: - case GIC_BASER_CACHE_nC: - return field; - default: - return GIC_BASER_CACHE_nC; - } -} - -u64 vgic_sanitise_field(u64 reg, u64 field_mask, int field_shift, - u64 (*sanitise_fn)(u64)) -{ - u64 field = (reg & field_mask) >> field_shift; - - field = sanitise_fn(field) << field_shift; - return (reg & ~field_mask) | field; -} - -#define PROPBASER_RES0_MASK \ - (GENMASK_ULL(63, 59) | GENMASK_ULL(55, 52) | GENMASK_ULL(6, 5)) -#define PENDBASER_RES0_MASK \ - (BIT_ULL(63) | GENMASK_ULL(61, 59) | GENMASK_ULL(55, 52) | \ - GENMASK_ULL(15, 12) | GENMASK_ULL(6, 0)) - -static u64 vgic_sanitise_pendbaser(u64 reg) -{ - reg = vgic_sanitise_field(reg, GICR_PENDBASER_SHAREABILITY_MASK, - GICR_PENDBASER_SHAREABILITY_SHIFT, - vgic_sanitise_shareability); - reg = vgic_sanitise_field(reg, GICR_PENDBASER_INNER_CACHEABILITY_MASK, - GICR_PENDBASER_INNER_CACHEABILITY_SHIFT, - vgic_sanitise_inner_cacheability); - reg = vgic_sanitise_field(reg, GICR_PENDBASER_OUTER_CACHEABILITY_MASK, - GICR_PENDBASER_OUTER_CACHEABILITY_SHIFT, - vgic_sanitise_outer_cacheability); - - reg &= ~PENDBASER_RES0_MASK; - reg &= ~GENMASK_ULL(51, 48); - - return reg; -} - -static u64 vgic_sanitise_propbaser(u64 reg) -{ - reg = vgic_sanitise_field(reg, GICR_PROPBASER_SHAREABILITY_MASK, - GICR_PROPBASER_SHAREABILITY_SHIFT, - vgic_sanitise_shareability); - reg = vgic_sanitise_field(reg, GICR_PROPBASER_INNER_CACHEABILITY_MASK, - GICR_PROPBASER_INNER_CACHEABILITY_SHIFT, - vgic_sanitise_inner_cacheability); - reg = vgic_sanitise_field(reg, GICR_PROPBASER_OUTER_CACHEABILITY_MASK, - GICR_PROPBASER_OUTER_CACHEABILITY_SHIFT, - vgic_sanitise_outer_cacheability); - - reg &= ~PROPBASER_RES0_MASK; - reg &= ~GENMASK_ULL(51, 48); - return reg; -} - -static unsigned long vgic_mmio_read_propbase(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len) -{ - struct vgic_dist *dist = &vcpu->kvm->arch.vgic; - - return extract_bytes(dist->propbaser, addr & 7, len); -} - -static void vgic_mmio_write_propbase(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len, - unsigned long val) -{ - struct vgic_dist *dist = &vcpu->kvm->arch.vgic; - struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; - u64 old_propbaser, propbaser; - - /* Storing a value with LPIs already enabled is undefined */ - if (vgic_cpu->lpis_enabled) - return; - - do { - old_propbaser = dist->propbaser; - propbaser = old_propbaser; - propbaser = update_64bit_reg(propbaser, addr & 4, len, val); - propbaser = vgic_sanitise_propbaser(propbaser); - } while (cmpxchg64(&dist->propbaser, old_propbaser, - propbaser) != old_propbaser); -} - -static unsigned long vgic_mmio_read_pendbase(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len) -{ - struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; - - return extract_bytes(vgic_cpu->pendbaser, addr & 7, len); -} - -static void vgic_mmio_write_pendbase(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len, - unsigned long val) -{ - struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; - u64 old_pendbaser, pendbaser; - - /* Storing a value with LPIs already enabled is undefined */ - if (vgic_cpu->lpis_enabled) - return; - - do { - old_pendbaser = vgic_cpu->pendbaser; - pendbaser = old_pendbaser; - pendbaser = update_64bit_reg(pendbaser, addr & 4, len, val); - pendbaser = vgic_sanitise_pendbaser(pendbaser); - } while (cmpxchg64(&vgic_cpu->pendbaser, old_pendbaser, - pendbaser) != old_pendbaser); -} - -/* - * The GICv3 per-IRQ registers are split to control PPIs and SGIs 
in the - * redistributors, while SPIs are covered by registers in the distributor - * block. Trying to set private IRQs in this block gets ignored. - * We take some special care here to fix the calculation of the register - * offset. - */ -#define REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(off, rd, wr, bpi, acc) \ - { \ - .reg_offset = off, \ - .bits_per_irq = bpi, \ - .len = (bpi * VGIC_NR_PRIVATE_IRQS) / 8, \ - .access_flags = acc, \ - .read = vgic_mmio_read_raz, \ - .write = vgic_mmio_write_wi, \ - }, { \ - .reg_offset = off + (bpi * VGIC_NR_PRIVATE_IRQS) / 8, \ - .bits_per_irq = bpi, \ - .len = (bpi * (1024 - VGIC_NR_PRIVATE_IRQS)) / 8, \ - .access_flags = acc, \ - .read = rd, \ - .write = wr, \ - } - -static const struct vgic_register_region vgic_v3_dist_registers[] = { - REGISTER_DESC_WITH_LENGTH(GICD_CTLR, - vgic_mmio_read_v3_misc, vgic_mmio_write_v3_misc, 16, - VGIC_ACCESS_32bit), - REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_IGROUPR, - vgic_mmio_read_rao, vgic_mmio_write_wi, 1, - VGIC_ACCESS_32bit), - REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ISENABLER, - vgic_mmio_read_enable, vgic_mmio_write_senable, 1, - VGIC_ACCESS_32bit), - REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ICENABLER, - vgic_mmio_read_enable, vgic_mmio_write_cenable, 1, - VGIC_ACCESS_32bit), - REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ISPENDR, - vgic_mmio_read_pending, vgic_mmio_write_spending, 1, - VGIC_ACCESS_32bit), - REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ICPENDR, - vgic_mmio_read_pending, vgic_mmio_write_cpending, 1, - VGIC_ACCESS_32bit), - REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ISACTIVER, - vgic_mmio_read_active, vgic_mmio_write_sactive, 1, - VGIC_ACCESS_32bit), - REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ICACTIVER, - vgic_mmio_read_active, vgic_mmio_write_cactive, 1, - VGIC_ACCESS_32bit), - REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_IPRIORITYR, - vgic_mmio_read_priority, vgic_mmio_write_priority, 8, - VGIC_ACCESS_32bit | VGIC_ACCESS_8bit), - REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ITARGETSR, - vgic_mmio_read_raz, vgic_mmio_write_wi, 8, - VGIC_ACCESS_32bit | VGIC_ACCESS_8bit), - REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ICFGR, - vgic_mmio_read_config, vgic_mmio_write_config, 2, - VGIC_ACCESS_32bit), - REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_IGRPMODR, - vgic_mmio_read_raz, vgic_mmio_write_wi, 1, - VGIC_ACCESS_32bit), - REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_IROUTER, - vgic_mmio_read_irouter, vgic_mmio_write_irouter, 64, - VGIC_ACCESS_64bit | VGIC_ACCESS_32bit), - REGISTER_DESC_WITH_LENGTH(GICD_IDREGS, - vgic_mmio_read_v3_idregs, vgic_mmio_write_wi, 48, - VGIC_ACCESS_32bit), -}; - -static const struct vgic_register_region vgic_v3_rdbase_registers[] = { - REGISTER_DESC_WITH_LENGTH(GICR_CTLR, - vgic_mmio_read_v3r_ctlr, vgic_mmio_write_v3r_ctlr, 4, - VGIC_ACCESS_32bit), - REGISTER_DESC_WITH_LENGTH(GICR_IIDR, - vgic_mmio_read_v3r_iidr, vgic_mmio_write_wi, 4, - VGIC_ACCESS_32bit), - REGISTER_DESC_WITH_LENGTH(GICR_TYPER, - vgic_mmio_read_v3r_typer, vgic_mmio_write_wi, 8, - VGIC_ACCESS_64bit | VGIC_ACCESS_32bit), - REGISTER_DESC_WITH_LENGTH(GICR_PROPBASER, - vgic_mmio_read_propbase, vgic_mmio_write_propbase, 8, - VGIC_ACCESS_64bit | VGIC_ACCESS_32bit), - REGISTER_DESC_WITH_LENGTH(GICR_PENDBASER, - vgic_mmio_read_pendbase, vgic_mmio_write_pendbase, 8, - VGIC_ACCESS_64bit | VGIC_ACCESS_32bit), - REGISTER_DESC_WITH_LENGTH(GICR_IDREGS, - vgic_mmio_read_v3_idregs, vgic_mmio_write_wi, 48, - VGIC_ACCESS_32bit), -}; - -static const struct vgic_register_region vgic_v3_sgibase_registers[] = { - 
REGISTER_DESC_WITH_LENGTH(GICR_IGROUPR0, - vgic_mmio_read_rao, vgic_mmio_write_wi, 4, - VGIC_ACCESS_32bit), - REGISTER_DESC_WITH_LENGTH(GICR_ISENABLER0, - vgic_mmio_read_enable, vgic_mmio_write_senable, 4, - VGIC_ACCESS_32bit), - REGISTER_DESC_WITH_LENGTH(GICR_ICENABLER0, - vgic_mmio_read_enable, vgic_mmio_write_cenable, 4, - VGIC_ACCESS_32bit), - REGISTER_DESC_WITH_LENGTH(GICR_ISPENDR0, - vgic_mmio_read_pending, vgic_mmio_write_spending, 4, - VGIC_ACCESS_32bit), - REGISTER_DESC_WITH_LENGTH(GICR_ICPENDR0, - vgic_mmio_read_pending, vgic_mmio_write_cpending, 4, - VGIC_ACCESS_32bit), - REGISTER_DESC_WITH_LENGTH(GICR_ISACTIVER0, - vgic_mmio_read_active, vgic_mmio_write_sactive, 4, - VGIC_ACCESS_32bit), - REGISTER_DESC_WITH_LENGTH(GICR_ICACTIVER0, - vgic_mmio_read_active, vgic_mmio_write_cactive, 4, - VGIC_ACCESS_32bit), - REGISTER_DESC_WITH_LENGTH(GICR_IPRIORITYR0, - vgic_mmio_read_priority, vgic_mmio_write_priority, 32, - VGIC_ACCESS_32bit | VGIC_ACCESS_8bit), - REGISTER_DESC_WITH_LENGTH(GICR_ICFGR0, - vgic_mmio_read_config, vgic_mmio_write_config, 8, - VGIC_ACCESS_32bit), - REGISTER_DESC_WITH_LENGTH(GICR_IGRPMODR0, - vgic_mmio_read_raz, vgic_mmio_write_wi, 4, - VGIC_ACCESS_32bit), - REGISTER_DESC_WITH_LENGTH(GICR_NSACR, - vgic_mmio_read_raz, vgic_mmio_write_wi, 4, - VGIC_ACCESS_32bit), -}; - -unsigned int vgic_v3_init_dist_iodev(struct vgic_io_device *dev) -{ - dev->regions = vgic_v3_dist_registers; - dev->nr_regions = ARRAY_SIZE(vgic_v3_dist_registers); - - kvm_iodevice_init(&dev->dev, &kvm_io_gic_ops); - - return SZ_64K; -} - -int vgic_register_redist_iodevs(struct kvm *kvm, gpa_t redist_base_address) -{ - struct kvm_vcpu *vcpu; - int c, ret = 0; - - kvm_for_each_vcpu(c, vcpu, kvm) { - gpa_t rd_base = redist_base_address + c * SZ_64K * 2; - gpa_t sgi_base = rd_base + SZ_64K; - struct vgic_io_device *rd_dev = &vcpu->arch.vgic_cpu.rd_iodev; - struct vgic_io_device *sgi_dev = &vcpu->arch.vgic_cpu.sgi_iodev; - - kvm_iodevice_init(&rd_dev->dev, &kvm_io_gic_ops); - rd_dev->base_addr = rd_base; - rd_dev->iodev_type = IODEV_REDIST; - rd_dev->regions = vgic_v3_rdbase_registers; - rd_dev->nr_regions = ARRAY_SIZE(vgic_v3_rdbase_registers); - rd_dev->redist_vcpu = vcpu; - - mutex_lock(&kvm->slots_lock); - ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, rd_base, - SZ_64K, &rd_dev->dev); - mutex_unlock(&kvm->slots_lock); - - if (ret) - break; - - kvm_iodevice_init(&sgi_dev->dev, &kvm_io_gic_ops); - sgi_dev->base_addr = sgi_base; - sgi_dev->iodev_type = IODEV_REDIST; - sgi_dev->regions = vgic_v3_sgibase_registers; - sgi_dev->nr_regions = ARRAY_SIZE(vgic_v3_sgibase_registers); - sgi_dev->redist_vcpu = vcpu; - - mutex_lock(&kvm->slots_lock); - ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, sgi_base, - SZ_64K, &sgi_dev->dev); - mutex_unlock(&kvm->slots_lock); - if (ret) { - kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS, - &rd_dev->dev); - break; - } - } - - if (ret) { - /* The current c failed, so we start with the previous one. */ - for (c--; c >= 0; c--) { - struct vgic_cpu *vgic_cpu; - - vcpu = kvm_get_vcpu(kvm, c); - vgic_cpu = &vcpu->arch.vgic_cpu; - kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS, - &vgic_cpu->rd_iodev.dev); - kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS, - &vgic_cpu->sgi_iodev.dev); - } - } - - return ret; -} - -/* - * Compare a given affinity (level 1-3 and a level 0 mask, from the SGI - * generation register ICC_SGI1R_EL1) with a given VCPU. - * If the VCPU's MPIDR matches, return the level0 affinity, otherwise - * return -1. 
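- * For example, with Aff3.Aff2.Aff1 = 0.0.1 and a target list of 0x5, only the
- * VCPUs whose MPIDR is 0.0.1.0 or 0.0.1.2 match (level 0 values 0 and 2).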
- */ -static int match_mpidr(u64 sgi_aff, u16 sgi_cpu_mask, struct kvm_vcpu *vcpu) -{ - unsigned long affinity; - int level0; - - /* - * Split the current VCPU's MPIDR into affinity level 0 and the - * rest as this is what we have to compare against. - */ - affinity = kvm_vcpu_get_mpidr_aff(vcpu); - level0 = MPIDR_AFFINITY_LEVEL(affinity, 0); - affinity &= ~MPIDR_LEVEL_MASK; - - /* bail out if the upper three levels don't match */ - if (sgi_aff != affinity) - return -1; - - /* Is this VCPU's bit set in the mask ? */ - if (!(sgi_cpu_mask & BIT(level0))) - return -1; - - return level0; -} - -/* - * The ICC_SGI* registers encode the affinity differently from the MPIDR, - * so provide a wrapper to use the existing defines to isolate a certain - * affinity level. - */ -#define SGI_AFFINITY_LEVEL(reg, level) \ - ((((reg) & ICC_SGI1R_AFFINITY_## level ##_MASK) \ - >> ICC_SGI1R_AFFINITY_## level ##_SHIFT) << MPIDR_LEVEL_SHIFT(level)) - -/** - * vgic_v3_dispatch_sgi - handle SGI requests from VCPUs - * @vcpu: The VCPU requesting a SGI - * @reg: The value written into the ICC_SGI1R_EL1 register by that VCPU - * - * With GICv3 (and ARE=1) CPUs trigger SGIs by writing to a system register. - * This will trap in sys_regs.c and call this function. - * This ICC_SGI1R_EL1 register contains the upper three affinity levels of the - * target processors as well as a bitmask of 16 Aff0 CPUs. - * If the interrupt routing mode bit is not set, we iterate over all VCPUs to - * check for matching ones. If this bit is set, we signal all, but not the - * calling VCPU. - */ -void vgic_v3_dispatch_sgi(struct kvm_vcpu *vcpu, u64 reg) -{ - struct kvm *kvm = vcpu->kvm; - struct kvm_vcpu *c_vcpu; - u16 target_cpus; - u64 mpidr; - int sgi, c; - int vcpu_id = vcpu->vcpu_id; - bool broadcast; - - sgi = (reg & ICC_SGI1R_SGI_ID_MASK) >> ICC_SGI1R_SGI_ID_SHIFT; - broadcast = reg & BIT_ULL(ICC_SGI1R_IRQ_ROUTING_MODE_BIT); - target_cpus = (reg & ICC_SGI1R_TARGET_LIST_MASK) >> ICC_SGI1R_TARGET_LIST_SHIFT; - mpidr = SGI_AFFINITY_LEVEL(reg, 3); - mpidr |= SGI_AFFINITY_LEVEL(reg, 2); - mpidr |= SGI_AFFINITY_LEVEL(reg, 1); - - /* - * We iterate over all VCPUs to find the MPIDRs matching the request. - * If we have handled one CPU, we clear its bit to detect early - * if we are already finished. This avoids iterating through all - * VCPUs when most of the times we just signal a single VCPU. - */ - kvm_for_each_vcpu(c, c_vcpu, kvm) { - struct vgic_irq *irq; - - /* Exit early if we have dealt with all requested CPUs */ - if (!broadcast && target_cpus == 0) - break; - - /* Don't signal the calling VCPU */ - if (broadcast && c == vcpu_id) - continue; - - if (!broadcast) { - int level0; - - level0 = match_mpidr(mpidr, target_cpus, c_vcpu); - if (level0 == -1) - continue; - - /* remove this matching VCPU from the mask */ - target_cpus &= ~BIT(level0); - } - - irq = vgic_get_irq(vcpu->kvm, c_vcpu, sgi); - - spin_lock(&irq->irq_lock); - irq->pending = true; - - vgic_queue_irq_unlock(vcpu->kvm, irq); - vgic_put_irq(vcpu->kvm, irq); - } -} diff --git a/virt/kvm/arm/vgic/vgic-mmio.c b/virt/kvm/arm/vgic/vgic-mmio.c deleted file mode 100644 index ebe1b9f..0000000 --- a/virt/kvm/arm/vgic/vgic-mmio.c +++ /dev/null @@ -1,583 +0,0 @@ -/* - * VGIC MMIO handling functions - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. 
- * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - */ - -#include <linux/bitops.h> -#include <linux/bsearch.h> -#include <linux/kvm.h> -#include <linux/kvm_host.h> -#include <kvm/iodev.h> -#include <kvm/arm_vgic.h> - -#include "vgic.h" -#include "vgic-mmio.h" - -unsigned long vgic_mmio_read_raz(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len) -{ - return 0; -} - -unsigned long vgic_mmio_read_rao(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len) -{ - return -1UL; -} - -void vgic_mmio_write_wi(struct kvm_vcpu *vcpu, gpa_t addr, - unsigned int len, unsigned long val) -{ - /* Ignore */ -} - -/* - * Read accesses to both GICD_ICENABLER and GICD_ISENABLER return the value - * of the enabled bit, so there is only one function for both here. - */ -unsigned long vgic_mmio_read_enable(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len) -{ - u32 intid = VGIC_ADDR_TO_INTID(addr, 1); - u32 value = 0; - int i; - - /* Loop over all IRQs affected by this read */ - for (i = 0; i < len * 8; i++) { - struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); - - if (irq->enabled) - value |= (1U << i); - - vgic_put_irq(vcpu->kvm, irq); - } - - return value; -} - -void vgic_mmio_write_senable(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len, - unsigned long val) -{ - u32 intid = VGIC_ADDR_TO_INTID(addr, 1); - int i; - - for_each_set_bit(i, &val, len * 8) { - struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); - - spin_lock(&irq->irq_lock); - irq->enabled = true; - vgic_queue_irq_unlock(vcpu->kvm, irq); - - vgic_put_irq(vcpu->kvm, irq); - } -} - -void vgic_mmio_write_cenable(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len, - unsigned long val) -{ - u32 intid = VGIC_ADDR_TO_INTID(addr, 1); - int i; - - for_each_set_bit(i, &val, len * 8) { - struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); - - spin_lock(&irq->irq_lock); - - irq->enabled = false; - - spin_unlock(&irq->irq_lock); - vgic_put_irq(vcpu->kvm, irq); - } -} - -unsigned long vgic_mmio_read_pending(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len) -{ - u32 intid = VGIC_ADDR_TO_INTID(addr, 1); - u32 value = 0; - int i; - - /* Loop over all IRQs affected by this read */ - for (i = 0; i < len * 8; i++) { - struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); - - if (irq->pending) - value |= (1U << i); - - vgic_put_irq(vcpu->kvm, irq); - } - - return value; -} - -void vgic_mmio_write_spending(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len, - unsigned long val) -{ - u32 intid = VGIC_ADDR_TO_INTID(addr, 1); - int i; - - for_each_set_bit(i, &val, len * 8) { - struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); - - spin_lock(&irq->irq_lock); - irq->pending = true; - if (irq->config == VGIC_CONFIG_LEVEL) - irq->soft_pending = true; - - vgic_queue_irq_unlock(vcpu->kvm, irq); - vgic_put_irq(vcpu->kvm, irq); - } -} - -void vgic_mmio_write_cpending(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len, - unsigned long val) -{ - u32 intid = VGIC_ADDR_TO_INTID(addr, 1); - int i; - - for_each_set_bit(i, &val, len * 8) { - struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); - - spin_lock(&irq->irq_lock); - - if (irq->config == VGIC_CONFIG_LEVEL) { - irq->soft_pending = false; - irq->pending = irq->line_level; - } else { - irq->pending = false; - } - - 
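- /* Either way only the latched pending state changes; the line level itself is untouched. */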
spin_unlock(&irq->irq_lock); - vgic_put_irq(vcpu->kvm, irq); - } -} - -unsigned long vgic_mmio_read_active(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len) -{ - u32 intid = VGIC_ADDR_TO_INTID(addr, 1); - u32 value = 0; - int i; - - /* Loop over all IRQs affected by this read */ - for (i = 0; i < len * 8; i++) { - struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); - - if (irq->active) - value |= (1U << i); - - vgic_put_irq(vcpu->kvm, irq); - } - - return value; -} - -static void vgic_mmio_change_active(struct kvm_vcpu *vcpu, struct vgic_irq *irq, - bool new_active_state) -{ - spin_lock(&irq->irq_lock); - /* - * If this virtual IRQ was written into a list register, we - * have to make sure the CPU that runs the VCPU thread has - * synced back LR state to the struct vgic_irq. We can only - * know this for sure, when either this irq is not assigned to - * anyone's AP list anymore, or the VCPU thread is not - * running on any CPUs. - * - * In the opposite case, we know the VCPU thread may be on its - * way back from the guest and still has to sync back this - * IRQ, so we release and re-acquire the spin_lock to let the - * other thread sync back the IRQ. - */ - while (irq->vcpu && /* IRQ may have state in an LR somewhere */ - irq->vcpu->cpu != -1) /* VCPU thread is running */ - cond_resched_lock(&irq->irq_lock); - - irq->active = new_active_state; - if (new_active_state) - vgic_queue_irq_unlock(vcpu->kvm, irq); - else - spin_unlock(&irq->irq_lock); -} - -/* - * If we are fiddling with an IRQ's active state, we have to make sure the IRQ - * is not queued on some running VCPU's LRs, because then the change to the - * active state can be overwritten when the VCPU's state is synced coming back - * from the guest. - * - * For shared interrupts, we have to stop all the VCPUs because interrupts can - * be migrated while we don't hold the IRQ locks and we don't want to be - * chasing moving targets. - * - * For private interrupts, we only have to make sure the single and only VCPU - * that can potentially queue the IRQ is stopped. 
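- * The prepare/finish helpers below therefore stop either a single VCPU or the
- * whole guest, depending on whether the INTID falls in the private range.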
- */ -static void vgic_change_active_prepare(struct kvm_vcpu *vcpu, u32 intid) -{ - if (intid < VGIC_NR_PRIVATE_IRQS) - kvm_arm_halt_vcpu(vcpu); - else - kvm_arm_halt_guest(vcpu->kvm); -} - -/* See vgic_change_active_prepare */ -static void vgic_change_active_finish(struct kvm_vcpu *vcpu, u32 intid) -{ - if (intid < VGIC_NR_PRIVATE_IRQS) - kvm_arm_resume_vcpu(vcpu); - else - kvm_arm_resume_guest(vcpu->kvm); -} - -void vgic_mmio_write_cactive(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len, - unsigned long val) -{ - u32 intid = VGIC_ADDR_TO_INTID(addr, 1); - int i; - - vgic_change_active_prepare(vcpu, intid); - for_each_set_bit(i, &val, len * 8) { - struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); - vgic_mmio_change_active(vcpu, irq, false); - vgic_put_irq(vcpu->kvm, irq); - } - vgic_change_active_finish(vcpu, intid); -} - -void vgic_mmio_write_sactive(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len, - unsigned long val) -{ - u32 intid = VGIC_ADDR_TO_INTID(addr, 1); - int i; - - vgic_change_active_prepare(vcpu, intid); - for_each_set_bit(i, &val, len * 8) { - struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); - vgic_mmio_change_active(vcpu, irq, true); - vgic_put_irq(vcpu->kvm, irq); - } - vgic_change_active_finish(vcpu, intid); -} - -unsigned long vgic_mmio_read_priority(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len) -{ - u32 intid = VGIC_ADDR_TO_INTID(addr, 8); - int i; - u64 val = 0; - - for (i = 0; i < len; i++) { - struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); - - val |= (u64)irq->priority << (i * 8); - - vgic_put_irq(vcpu->kvm, irq); - } - - return val; -} - -/* - * We currently don't handle changing the priority of an interrupt that - * is already pending on a VCPU. If there is a need for this, we would - * need to make this VCPU exit and re-evaluate the priorities, potentially - * leading to this interrupt getting presented now to the guest (if it has - * been masked by the priority mask before). - */ -void vgic_mmio_write_priority(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len, - unsigned long val) -{ - u32 intid = VGIC_ADDR_TO_INTID(addr, 8); - int i; - - for (i = 0; i < len; i++) { - struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); - - spin_lock(&irq->irq_lock); - /* Narrow the priority range to what we actually support */ - irq->priority = (val >> (i * 8)) & GENMASK(7, 8 - VGIC_PRI_BITS); - spin_unlock(&irq->irq_lock); - - vgic_put_irq(vcpu->kvm, irq); - } -} - -unsigned long vgic_mmio_read_config(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len) -{ - u32 intid = VGIC_ADDR_TO_INTID(addr, 2); - u32 value = 0; - int i; - - for (i = 0; i < len * 4; i++) { - struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); - - if (irq->config == VGIC_CONFIG_EDGE) - value |= (2U << (i * 2)); - - vgic_put_irq(vcpu->kvm, irq); - } - - return value; -} - -void vgic_mmio_write_config(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len, - unsigned long val) -{ - u32 intid = VGIC_ADDR_TO_INTID(addr, 2); - int i; - - for (i = 0; i < len * 4; i++) { - struct vgic_irq *irq; - - /* - * The configuration cannot be changed for SGIs in general, - * for PPIs this is IMPLEMENTATION DEFINED. The arch timer - * code relies on PPIs being level triggered, so we also - * make them read-only here. 
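- * For SPIs, bit 1 of each two-bit field selects edge-triggered; bit 0 is ignored.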
- */ - if (intid + i < VGIC_NR_PRIVATE_IRQS) - continue; - - irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); - spin_lock(&irq->irq_lock); - - if (test_bit(i * 2 + 1, &val)) { - irq->config = VGIC_CONFIG_EDGE; - } else { - irq->config = VGIC_CONFIG_LEVEL; - irq->pending = irq->line_level | irq->soft_pending; - } - - spin_unlock(&irq->irq_lock); - vgic_put_irq(vcpu->kvm, irq); - } -} - -static int match_region(const void *key, const void *elt) -{ - const unsigned int offset = (unsigned long)key; - const struct vgic_register_region *region = elt; - - if (offset < region->reg_offset) - return -1; - - if (offset >= region->reg_offset + region->len) - return 1; - - return 0; -} - -/* Find the proper register handler entry given a certain address offset. */ -static const struct vgic_register_region * -vgic_find_mmio_region(const struct vgic_register_region *region, int nr_regions, - unsigned int offset) -{ - return bsearch((void *)(uintptr_t)offset, region, nr_regions, - sizeof(region[0]), match_region); -} - -/* - * kvm_mmio_read_buf() returns a value in a format where it can be converted - * to a byte array and be directly observed as the guest wanted it to appear - * in memory if it had done the store itself, which is LE for the GIC, as the - * guest knows the GIC is always LE. - * - * We convert this value to the CPUs native format to deal with it as a data - * value. - */ -unsigned long vgic_data_mmio_bus_to_host(const void *val, unsigned int len) -{ - unsigned long data = kvm_mmio_read_buf(val, len); - - switch (len) { - case 1: - return data; - case 2: - return le16_to_cpu(data); - case 4: - return le32_to_cpu(data); - default: - return le64_to_cpu(data); - } -} - -/* - * kvm_mmio_write_buf() expects a value in a format such that if converted to - * a byte array it is observed as the guest would see it if it could perform - * the load directly. Since the GIC is LE, and the guest knows this, the - * guest expects a value in little endian format. - * - * We convert the data value from the CPUs native format to LE so that the - * value is returned in the proper format. - */ -void vgic_data_host_to_mmio_bus(void *buf, unsigned int len, - unsigned long data) -{ - switch (len) { - case 1: - break; - case 2: - data = cpu_to_le16(data); - break; - case 4: - data = cpu_to_le32(data); - break; - default: - data = cpu_to_le64(data); - } - - kvm_mmio_write_buf(buf, len, data); -} - -static -struct vgic_io_device *kvm_to_vgic_iodev(const struct kvm_io_device *dev) -{ - return container_of(dev, struct vgic_io_device, dev); -} - -static bool check_region(const struct kvm *kvm, - const struct vgic_register_region *region, - gpa_t addr, int len) -{ - int flags, nr_irqs = kvm->arch.vgic.nr_spis + VGIC_NR_PRIVATE_IRQS; - - switch (len) { - case sizeof(u8): - flags = VGIC_ACCESS_8bit; - break; - case sizeof(u32): - flags = VGIC_ACCESS_32bit; - break; - case sizeof(u64): - flags = VGIC_ACCESS_64bit; - break; - default: - return false; - } - - if ((region->access_flags & flags) && IS_ALIGNED(addr, len)) { - if (!region->bits_per_irq) - return true; - - /* Do we access a non-allocated IRQ? 
*/ - return VGIC_ADDR_TO_INTID(addr, region->bits_per_irq) < nr_irqs; - } - - return false; -} - -static int dispatch_mmio_read(struct kvm_vcpu *vcpu, struct kvm_io_device *dev, - gpa_t addr, int len, void *val) -{ - struct vgic_io_device *iodev = kvm_to_vgic_iodev(dev); - const struct vgic_register_region *region; - unsigned long data = 0; - - region = vgic_find_mmio_region(iodev->regions, iodev->nr_regions, - addr - iodev->base_addr); - if (!region || !check_region(vcpu->kvm, region, addr, len)) { - memset(val, 0, len); - return 0; - } - - switch (iodev->iodev_type) { - case IODEV_CPUIF: - data = region->read(vcpu, addr, len); - break; - case IODEV_DIST: - data = region->read(vcpu, addr, len); - break; - case IODEV_REDIST: - data = region->read(iodev->redist_vcpu, addr, len); - break; - case IODEV_ITS: - data = region->its_read(vcpu->kvm, iodev->its, addr, len); - break; - } - - vgic_data_host_to_mmio_bus(val, len, data); - return 0; -} - -static int dispatch_mmio_write(struct kvm_vcpu *vcpu, struct kvm_io_device *dev, - gpa_t addr, int len, const void *val) -{ - struct vgic_io_device *iodev = kvm_to_vgic_iodev(dev); - const struct vgic_register_region *region; - unsigned long data = vgic_data_mmio_bus_to_host(val, len); - - region = vgic_find_mmio_region(iodev->regions, iodev->nr_regions, - addr - iodev->base_addr); - if (!region || !check_region(vcpu->kvm, region, addr, len)) - return 0; - - switch (iodev->iodev_type) { - case IODEV_CPUIF: - region->write(vcpu, addr, len, data); - break; - case IODEV_DIST: - region->write(vcpu, addr, len, data); - break; - case IODEV_REDIST: - region->write(iodev->redist_vcpu, addr, len, data); - break; - case IODEV_ITS: - region->its_write(vcpu->kvm, iodev->its, addr, len, data); - break; - } - - return 0; -} - -struct kvm_io_device_ops kvm_io_gic_ops = { - .read = dispatch_mmio_read, - .write = dispatch_mmio_write, -}; - -int vgic_register_dist_iodev(struct kvm *kvm, gpa_t dist_base_address, - enum vgic_type type) -{ - struct vgic_io_device *io_device = &kvm->arch.vgic.dist_iodev; - int ret = 0; - unsigned int len; - - switch (type) { - case VGIC_V2: - len = vgic_v2_init_dist_iodev(io_device); - break; - case VGIC_V3: - len = vgic_v3_init_dist_iodev(io_device); - break; - default: - BUG_ON(1); - } - - io_device->base_addr = dist_base_address; - io_device->iodev_type = IODEV_DIST; - io_device->redist_vcpu = NULL; - - mutex_lock(&kvm->slots_lock); - ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, dist_base_address, - len, &io_device->dev); - mutex_unlock(&kvm->slots_lock); - - return ret; -} diff --git a/virt/kvm/arm/vgic/vgic-mmio.h b/virt/kvm/arm/vgic/vgic-mmio.h deleted file mode 100644 index 84961b4..0000000 --- a/virt/kvm/arm/vgic/vgic-mmio.h +++ /dev/null @@ -1,171 +0,0 @@ -/* - * Copyright (C) 2015, 2016 ARM Ltd. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. 
- */ -#ifndef __KVM_ARM_VGIC_MMIO_H__ -#define __KVM_ARM_VGIC_MMIO_H__ - -struct vgic_register_region { - unsigned int reg_offset; - unsigned int len; - unsigned int bits_per_irq; - unsigned int access_flags; - union { - unsigned long (*read)(struct kvm_vcpu *vcpu, gpa_t addr, - unsigned int len); - unsigned long (*its_read)(struct kvm *kvm, struct vgic_its *its, - gpa_t addr, unsigned int len); - }; - union { - void (*write)(struct kvm_vcpu *vcpu, gpa_t addr, - unsigned int len, unsigned long val); - void (*its_write)(struct kvm *kvm, struct vgic_its *its, - gpa_t addr, unsigned int len, - unsigned long val); - }; -}; - -extern struct kvm_io_device_ops kvm_io_gic_ops; - -#define VGIC_ACCESS_8bit 1 -#define VGIC_ACCESS_32bit 2 -#define VGIC_ACCESS_64bit 4 - -/* - * Generate a mask that covers the number of bytes required to address - * up to 1024 interrupts, each represented by <bits> bits. This assumes - * that <bits> is a power of two. - */ -#define VGIC_ADDR_IRQ_MASK(bits) (((bits) * 1024 / 8) - 1) - -/* - * (addr & mask) gives us the _byte_ offset for the INT ID. - * We multiply this by 8 the get the _bit_ offset, then divide this by - * the number of bits to learn the actual INT ID. - * But instead of a division (which requires a "long long div" implementation), - * we shift by the binary logarithm of <bits>. - * This assumes that <bits> is a power of two. - */ -#define VGIC_ADDR_TO_INTID(addr, bits) (((addr) & VGIC_ADDR_IRQ_MASK(bits)) * \ - 8 >> ilog2(bits)) - -/* - * Some VGIC registers store per-IRQ information, with a different number - * of bits per IRQ. For those registers this macro is used. - * The _WITH_LENGTH version instantiates registers with a fixed length - * and is mutually exclusive with the _PER_IRQ version. - */ -#define REGISTER_DESC_WITH_BITS_PER_IRQ(off, rd, wr, bpi, acc) \ - { \ - .reg_offset = off, \ - .bits_per_irq = bpi, \ - .len = bpi * 1024 / 8, \ - .access_flags = acc, \ - .read = rd, \ - .write = wr, \ - } - -#define REGISTER_DESC_WITH_LENGTH(off, rd, wr, length, acc) \ - { \ - .reg_offset = off, \ - .bits_per_irq = 0, \ - .len = length, \ - .access_flags = acc, \ - .read = rd, \ - .write = wr, \ - } - -int kvm_vgic_register_mmio_region(struct kvm *kvm, struct kvm_vcpu *vcpu, - struct vgic_register_region *reg_desc, - struct vgic_io_device *region, - int nr_irqs, bool offset_private); - -unsigned long vgic_data_mmio_bus_to_host(const void *val, unsigned int len); - -void vgic_data_host_to_mmio_bus(void *buf, unsigned int len, - unsigned long data); - -unsigned long extract_bytes(u64 data, unsigned int offset, - unsigned int num); - -u64 update_64bit_reg(u64 reg, unsigned int offset, unsigned int len, - unsigned long val); - -unsigned long vgic_mmio_read_raz(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len); - -unsigned long vgic_mmio_read_rao(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len); - -void vgic_mmio_write_wi(struct kvm_vcpu *vcpu, gpa_t addr, - unsigned int len, unsigned long val); - -unsigned long vgic_mmio_read_enable(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len); - -void vgic_mmio_write_senable(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len, - unsigned long val); - -void vgic_mmio_write_cenable(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len, - unsigned long val); - -unsigned long vgic_mmio_read_pending(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len); - -void vgic_mmio_write_spending(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len, - unsigned long val); - -void vgic_mmio_write_cpending(struct 
kvm_vcpu *vcpu, - gpa_t addr, unsigned int len, - unsigned long val); - -unsigned long vgic_mmio_read_active(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len); - -void vgic_mmio_write_cactive(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len, - unsigned long val); - -void vgic_mmio_write_sactive(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len, - unsigned long val); - -unsigned long vgic_mmio_read_priority(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len); - -void vgic_mmio_write_priority(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len, - unsigned long val); - -unsigned long vgic_mmio_read_config(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len); - -void vgic_mmio_write_config(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len, - unsigned long val); - -unsigned int vgic_v2_init_dist_iodev(struct vgic_io_device *dev); - -unsigned int vgic_v3_init_dist_iodev(struct vgic_io_device *dev); - -u64 vgic_sanitise_outer_cacheability(u64 reg); -u64 vgic_sanitise_inner_cacheability(u64 reg); -u64 vgic_sanitise_shareability(u64 reg); -u64 vgic_sanitise_field(u64 reg, u64 field_mask, int field_shift, - u64 (*sanitise_fn)(u64)); - -#endif diff --git a/virt/kvm/arm/vgic/vgic-v2.c b/virt/kvm/arm/vgic/vgic-v2.c deleted file mode 100644 index 0a063af..0000000 --- a/virt/kvm/arm/vgic/vgic-v2.c +++ /dev/null @@ -1,379 +0,0 @@ -/* - * Copyright (C) 2015, 2016 ARM Ltd. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - */ - -#include <linux/irqchip/arm-gic.h> -#include <linux/kvm.h> -#include <linux/kvm_host.h> -#include <kvm/arm_vgic.h> -#include <asm/kvm_mmu.h> - -#include "vgic.h" - -/* - * Call this function to convert a u64 value to an unsigned long * bitmask - * in a way that works on both 32-bit and 64-bit LE and BE platforms. - * - * Warning: Calling this function may modify *val. - */ -static unsigned long *u64_to_bitmask(u64 *val) -{ -#if defined(CONFIG_CPU_BIG_ENDIAN) && BITS_PER_LONG == 32 - *val = (*val >> 32) | (*val << 32); -#endif - return (unsigned long *)val; -} - -void vgic_v2_process_maintenance(struct kvm_vcpu *vcpu) -{ - struct vgic_v2_cpu_if *cpuif = &vcpu->arch.vgic_cpu.vgic_v2; - - if (cpuif->vgic_misr & GICH_MISR_EOI) { - u64 eisr = cpuif->vgic_eisr; - unsigned long *eisr_bmap = u64_to_bitmask(&eisr); - int lr; - - for_each_set_bit(lr, eisr_bmap, kvm_vgic_global_state.nr_lr) { - u32 intid = cpuif->vgic_lr[lr] & GICH_LR_VIRTUALID; - - WARN_ON(cpuif->vgic_lr[lr] & GICH_LR_STATE); - - kvm_notify_acked_irq(vcpu->kvm, 0, - intid - VGIC_NR_PRIVATE_IRQS); - } - } - - /* check and disable underflow maintenance IRQ */ - cpuif->vgic_hcr &= ~GICH_HCR_UIE; - - /* - * In the next iterations of the vcpu loop, if we sync the - * vgic state after flushing it, but before entering the guest - * (this happens for pending signals and vmid rollovers), then - * make sure we don't pick up any old maintenance interrupts - * here. 
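- * Clearing the cached EISR below ensures a later sync pass sees no stale EOI work.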
- */ - cpuif->vgic_eisr = 0; -} - -void vgic_v2_set_underflow(struct kvm_vcpu *vcpu) -{ - struct vgic_v2_cpu_if *cpuif = &vcpu->arch.vgic_cpu.vgic_v2; - - cpuif->vgic_hcr |= GICH_HCR_UIE; -} - -/* - * transfer the content of the LRs back into the corresponding ap_list: - * - active bit is transferred as is - * - pending bit is - * - transferred as is in case of edge sensitive IRQs - * - set to the line-level (resample time) for level sensitive IRQs - */ -void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu) -{ - struct vgic_v2_cpu_if *cpuif = &vcpu->arch.vgic_cpu.vgic_v2; - int lr; - - for (lr = 0; lr < vcpu->arch.vgic_cpu.used_lrs; lr++) { - u32 val = cpuif->vgic_lr[lr]; - u32 intid = val & GICH_LR_VIRTUALID; - struct vgic_irq *irq; - - irq = vgic_get_irq(vcpu->kvm, vcpu, intid); - - spin_lock(&irq->irq_lock); - - /* Always preserve the active bit */ - irq->active = !!(val & GICH_LR_ACTIVE_BIT); - - /* Edge is the only case where we preserve the pending bit */ - if (irq->config == VGIC_CONFIG_EDGE && - (val & GICH_LR_PENDING_BIT)) { - irq->pending = true; - - if (vgic_irq_is_sgi(intid)) { - u32 cpuid = val & GICH_LR_PHYSID_CPUID; - - cpuid >>= GICH_LR_PHYSID_CPUID_SHIFT; - irq->source |= (1 << cpuid); - } - } - - /* - * Clear soft pending state when level irqs have been acked. - * Always regenerate the pending state. - */ - if (irq->config == VGIC_CONFIG_LEVEL) { - if (!(val & GICH_LR_PENDING_BIT)) - irq->soft_pending = false; - - irq->pending = irq->line_level || irq->soft_pending; - } - - spin_unlock(&irq->irq_lock); - vgic_put_irq(vcpu->kvm, irq); - } -} - -/* - * Populates the particular LR with the state of a given IRQ: - * - for an edge sensitive IRQ the pending state is cleared in struct vgic_irq - * - for a level sensitive IRQ the pending state value is unchanged; - * it is dictated directly by the input level - * - * If @irq describes an SGI with multiple sources, we choose the - * lowest-numbered source VCPU and clear that bit in the source bitmap. - * - * The irq_lock must be held by the caller. - */ -void vgic_v2_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr) -{ - u32 val = irq->intid; - - if (irq->pending) { - val |= GICH_LR_PENDING_BIT; - - if (irq->config == VGIC_CONFIG_EDGE) - irq->pending = false; - - if (vgic_irq_is_sgi(irq->intid)) { - u32 src = ffs(irq->source); - - BUG_ON(!src); - val |= (src - 1) << GICH_LR_PHYSID_CPUID_SHIFT; - irq->source &= ~(1 << (src - 1)); - if (irq->source) - irq->pending = true; - } - } - - if (irq->active) - val |= GICH_LR_ACTIVE_BIT; - - if (irq->hw) { - val |= GICH_LR_HW; - val |= irq->hwintid << GICH_LR_PHYSID_CPUID_SHIFT; - } else { - if (irq->config == VGIC_CONFIG_LEVEL) - val |= GICH_LR_EOI; - } - - /* The GICv2 LR only holds five bits of priority. 
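- * For example, an 8-bit priority of 0xa0 is stored as 0x14 in the LR field.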
*/ - val |= (irq->priority >> 3) << GICH_LR_PRIORITY_SHIFT; - - vcpu->arch.vgic_cpu.vgic_v2.vgic_lr[lr] = val; -} - -void vgic_v2_clear_lr(struct kvm_vcpu *vcpu, int lr) -{ - vcpu->arch.vgic_cpu.vgic_v2.vgic_lr[lr] = 0; -} - -void vgic_v2_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcrp) -{ - u32 vmcr; - - vmcr = (vmcrp->ctlr << GICH_VMCR_CTRL_SHIFT) & GICH_VMCR_CTRL_MASK; - vmcr |= (vmcrp->abpr << GICH_VMCR_ALIAS_BINPOINT_SHIFT) & - GICH_VMCR_ALIAS_BINPOINT_MASK; - vmcr |= (vmcrp->bpr << GICH_VMCR_BINPOINT_SHIFT) & - GICH_VMCR_BINPOINT_MASK; - vmcr |= (vmcrp->pmr << GICH_VMCR_PRIMASK_SHIFT) & - GICH_VMCR_PRIMASK_MASK; - - vcpu->arch.vgic_cpu.vgic_v2.vgic_vmcr = vmcr; -} - -void vgic_v2_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcrp) -{ - u32 vmcr = vcpu->arch.vgic_cpu.vgic_v2.vgic_vmcr; - - vmcrp->ctlr = (vmcr & GICH_VMCR_CTRL_MASK) >> - GICH_VMCR_CTRL_SHIFT; - vmcrp->abpr = (vmcr & GICH_VMCR_ALIAS_BINPOINT_MASK) >> - GICH_VMCR_ALIAS_BINPOINT_SHIFT; - vmcrp->bpr = (vmcr & GICH_VMCR_BINPOINT_MASK) >> - GICH_VMCR_BINPOINT_SHIFT; - vmcrp->pmr = (vmcr & GICH_VMCR_PRIMASK_MASK) >> - GICH_VMCR_PRIMASK_SHIFT; -} - -void vgic_v2_enable(struct kvm_vcpu *vcpu) -{ - /* - * By forcing VMCR to zero, the GIC will restore the binary - * points to their reset values. Anything else resets to zero - * anyway. - */ - vcpu->arch.vgic_cpu.vgic_v2.vgic_vmcr = 0; - vcpu->arch.vgic_cpu.vgic_v2.vgic_elrsr = ~0; - - /* Get the show on the road... */ - vcpu->arch.vgic_cpu.vgic_v2.vgic_hcr = GICH_HCR_EN; -} - -/* check for overlapping regions and for regions crossing the end of memory */ -static bool vgic_v2_check_base(gpa_t dist_base, gpa_t cpu_base) -{ - if (dist_base + KVM_VGIC_V2_DIST_SIZE < dist_base) - return false; - if (cpu_base + KVM_VGIC_V2_CPU_SIZE < cpu_base) - return false; - - if (dist_base + KVM_VGIC_V2_DIST_SIZE <= cpu_base) - return true; - if (cpu_base + KVM_VGIC_V2_CPU_SIZE <= dist_base) - return true; - - return false; -} - -int vgic_v2_map_resources(struct kvm *kvm) -{ - struct vgic_dist *dist = &kvm->arch.vgic; - int ret = 0; - - if (vgic_ready(kvm)) - goto out; - - if (IS_VGIC_ADDR_UNDEF(dist->vgic_dist_base) || - IS_VGIC_ADDR_UNDEF(dist->vgic_cpu_base)) { - kvm_err("Need to set vgic cpu and dist addresses first\n"); - ret = -ENXIO; - goto out; - } - - if (!vgic_v2_check_base(dist->vgic_dist_base, dist->vgic_cpu_base)) { - kvm_err("VGIC CPU and dist frames overlap\n"); - ret = -EINVAL; - goto out; - } - - /* - * Initialize the vgic if this hasn't already been done on demand by - * accessing the vgic state from userspace. 
- */ - ret = vgic_init(kvm); - if (ret) { - kvm_err("Unable to initialize VGIC dynamic data structures\n"); - goto out; - } - - ret = vgic_register_dist_iodev(kvm, dist->vgic_dist_base, VGIC_V2); - if (ret) { - kvm_err("Unable to register VGIC MMIO regions\n"); - goto out; - } - - if (!static_branch_unlikely(&vgic_v2_cpuif_trap)) { - ret = kvm_phys_addr_ioremap(kvm, dist->vgic_cpu_base, - kvm_vgic_global_state.vcpu_base, - KVM_VGIC_V2_CPU_SIZE, true); - if (ret) { - kvm_err("Unable to remap VGIC CPU to VCPU\n"); - goto out; - } - } - - dist->ready = true; - -out: - if (ret) - kvm_vgic_destroy(kvm); - return ret; -} - -DEFINE_STATIC_KEY_FALSE(vgic_v2_cpuif_trap); - -/** - * vgic_v2_probe - probe for a GICv2 compatible interrupt controller in DT - * @node: pointer to the DT node - * - * Returns 0 if a GICv2 has been found, returns an error code otherwise - */ -int vgic_v2_probe(const struct gic_kvm_info *info) -{ - int ret; - u32 vtr; - - if (!info->vctrl.start) { - kvm_err("GICH not present in the firmware table\n"); - return -ENXIO; - } - - if (!PAGE_ALIGNED(info->vcpu.start) || - !PAGE_ALIGNED(resource_size(&info->vcpu))) { - kvm_info("GICV region size/alignment is unsafe, using trapping (reduced performance)\n"); - kvm_vgic_global_state.vcpu_base_va = ioremap(info->vcpu.start, - resource_size(&info->vcpu)); - if (!kvm_vgic_global_state.vcpu_base_va) { - kvm_err("Cannot ioremap GICV\n"); - return -ENOMEM; - } - - ret = create_hyp_io_mappings(kvm_vgic_global_state.vcpu_base_va, - kvm_vgic_global_state.vcpu_base_va + resource_size(&info->vcpu), - info->vcpu.start); - if (ret) { - kvm_err("Cannot map GICV into hyp\n"); - goto out; - } - - static_branch_enable(&vgic_v2_cpuif_trap); - } - - kvm_vgic_global_state.vctrl_base = ioremap(info->vctrl.start, - resource_size(&info->vctrl)); - if (!kvm_vgic_global_state.vctrl_base) { - kvm_err("Cannot ioremap GICH\n"); - ret = -ENOMEM; - goto out; - } - - vtr = readl_relaxed(kvm_vgic_global_state.vctrl_base + GICH_VTR); - kvm_vgic_global_state.nr_lr = (vtr & 0x3f) + 1; - - ret = create_hyp_io_mappings(kvm_vgic_global_state.vctrl_base, - kvm_vgic_global_state.vctrl_base + - resource_size(&info->vctrl), - info->vctrl.start); - if (ret) { - kvm_err("Cannot map VCTRL into hyp\n"); - goto out; - } - - ret = kvm_register_vgic_device(KVM_DEV_TYPE_ARM_VGIC_V2); - if (ret) { - kvm_err("Cannot register GICv2 KVM device\n"); - goto out; - } - - kvm_vgic_global_state.can_emulate_gicv2 = true; - kvm_vgic_global_state.vcpu_base = info->vcpu.start; - kvm_vgic_global_state.type = VGIC_V2; - kvm_vgic_global_state.max_gic_vcpus = VGIC_V2_MAX_CPUS; - - kvm_info("vgic-v2@%llx\n", info->vctrl.start); - - return 0; -out: - if (kvm_vgic_global_state.vctrl_base) - iounmap(kvm_vgic_global_state.vctrl_base); - if (kvm_vgic_global_state.vcpu_base_va) - iounmap(kvm_vgic_global_state.vcpu_base_va); - - return ret; -} diff --git a/virt/kvm/arm/vgic/vgic-v3.c b/virt/kvm/arm/vgic/vgic-v3.c deleted file mode 100644 index 9f0dae3..0000000 --- a/virt/kvm/arm/vgic/vgic-v3.c +++ /dev/null @@ -1,363 +0,0 @@ -/* - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - */ - -#include <linux/irqchip/arm-gic-v3.h> -#include <linux/kvm.h> -#include <linux/kvm_host.h> -#include <kvm/arm_vgic.h> -#include <asm/kvm_mmu.h> -#include <asm/kvm_asm.h> - -#include "vgic.h" - -void vgic_v3_process_maintenance(struct kvm_vcpu *vcpu) -{ - struct vgic_v3_cpu_if *cpuif = &vcpu->arch.vgic_cpu.vgic_v3; - u32 model = vcpu->kvm->arch.vgic.vgic_model; - - if (cpuif->vgic_misr & ICH_MISR_EOI) { - unsigned long eisr_bmap = cpuif->vgic_eisr; - int lr; - - for_each_set_bit(lr, &eisr_bmap, kvm_vgic_global_state.nr_lr) { - u32 intid; - u64 val = cpuif->vgic_lr[lr]; - - if (model == KVM_DEV_TYPE_ARM_VGIC_V3) - intid = val & ICH_LR_VIRTUAL_ID_MASK; - else - intid = val & GICH_LR_VIRTUALID; - - WARN_ON(cpuif->vgic_lr[lr] & ICH_LR_STATE); - - kvm_notify_acked_irq(vcpu->kvm, 0, - intid - VGIC_NR_PRIVATE_IRQS); - } - - /* - * In the next iterations of the vcpu loop, if we sync - * the vgic state after flushing it, but before - * entering the guest (this happens for pending - * signals and vmid rollovers), then make sure we - * don't pick up any old maintenance interrupts here. - */ - cpuif->vgic_eisr = 0; - } - - cpuif->vgic_hcr &= ~ICH_HCR_UIE; -} - -void vgic_v3_set_underflow(struct kvm_vcpu *vcpu) -{ - struct vgic_v3_cpu_if *cpuif = &vcpu->arch.vgic_cpu.vgic_v3; - - cpuif->vgic_hcr |= ICH_HCR_UIE; -} - -void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu) -{ - struct vgic_v3_cpu_if *cpuif = &vcpu->arch.vgic_cpu.vgic_v3; - u32 model = vcpu->kvm->arch.vgic.vgic_model; - int lr; - - for (lr = 0; lr < vcpu->arch.vgic_cpu.used_lrs; lr++) { - u64 val = cpuif->vgic_lr[lr]; - u32 intid; - struct vgic_irq *irq; - - if (model == KVM_DEV_TYPE_ARM_VGIC_V3) - intid = val & ICH_LR_VIRTUAL_ID_MASK; - else - intid = val & GICH_LR_VIRTUALID; - irq = vgic_get_irq(vcpu->kvm, vcpu, intid); - if (!irq) /* An LPI could have been unmapped. */ - continue; - - spin_lock(&irq->irq_lock); - - /* Always preserve the active bit */ - irq->active = !!(val & ICH_LR_ACTIVE_BIT); - - /* Edge is the only case where we preserve the pending bit */ - if (irq->config == VGIC_CONFIG_EDGE && - (val & ICH_LR_PENDING_BIT)) { - irq->pending = true; - - if (vgic_irq_is_sgi(intid) && - model == KVM_DEV_TYPE_ARM_VGIC_V2) { - u32 cpuid = val & GICH_LR_PHYSID_CPUID; - - cpuid >>= GICH_LR_PHYSID_CPUID_SHIFT; - irq->source |= (1 << cpuid); - } - } - - /* - * Clear soft pending state when level irqs have been acked. - * Always regenerate the pending state. 
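- * A level interrupt whose input line is still asserted therefore remains
- * pending even after the guest has acked it.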
- */ - if (irq->config == VGIC_CONFIG_LEVEL) { - if (!(val & ICH_LR_PENDING_BIT)) - irq->soft_pending = false; - - irq->pending = irq->line_level || irq->soft_pending; - } - - spin_unlock(&irq->irq_lock); - vgic_put_irq(vcpu->kvm, irq); - } -} - -/* Requires the irq to be locked already */ -void vgic_v3_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr) -{ - u32 model = vcpu->kvm->arch.vgic.vgic_model; - u64 val = irq->intid; - - if (irq->pending) { - val |= ICH_LR_PENDING_BIT; - - if (irq->config == VGIC_CONFIG_EDGE) - irq->pending = false; - - if (vgic_irq_is_sgi(irq->intid) && - model == KVM_DEV_TYPE_ARM_VGIC_V2) { - u32 src = ffs(irq->source); - - BUG_ON(!src); - val |= (src - 1) << GICH_LR_PHYSID_CPUID_SHIFT; - irq->source &= ~(1 << (src - 1)); - if (irq->source) - irq->pending = true; - } - } - - if (irq->active) - val |= ICH_LR_ACTIVE_BIT; - - if (irq->hw) { - val |= ICH_LR_HW; - val |= ((u64)irq->hwintid) << ICH_LR_PHYS_ID_SHIFT; - } else { - if (irq->config == VGIC_CONFIG_LEVEL) - val |= ICH_LR_EOI; - } - - /* - * We currently only support Group1 interrupts, which is a - * known defect. This needs to be addressed at some point. - */ - if (model == KVM_DEV_TYPE_ARM_VGIC_V3) - val |= ICH_LR_GROUP; - - val |= (u64)irq->priority << ICH_LR_PRIORITY_SHIFT; - - vcpu->arch.vgic_cpu.vgic_v3.vgic_lr[lr] = val; -} - -void vgic_v3_clear_lr(struct kvm_vcpu *vcpu, int lr) -{ - vcpu->arch.vgic_cpu.vgic_v3.vgic_lr[lr] = 0; -} - -void vgic_v3_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcrp) -{ - u32 vmcr; - - vmcr = (vmcrp->ctlr << ICH_VMCR_CTLR_SHIFT) & ICH_VMCR_CTLR_MASK; - vmcr |= (vmcrp->abpr << ICH_VMCR_BPR1_SHIFT) & ICH_VMCR_BPR1_MASK; - vmcr |= (vmcrp->bpr << ICH_VMCR_BPR0_SHIFT) & ICH_VMCR_BPR0_MASK; - vmcr |= (vmcrp->pmr << ICH_VMCR_PMR_SHIFT) & ICH_VMCR_PMR_MASK; - - vcpu->arch.vgic_cpu.vgic_v3.vgic_vmcr = vmcr; -} - -void vgic_v3_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcrp) -{ - u32 vmcr = vcpu->arch.vgic_cpu.vgic_v3.vgic_vmcr; - - vmcrp->ctlr = (vmcr & ICH_VMCR_CTLR_MASK) >> ICH_VMCR_CTLR_SHIFT; - vmcrp->abpr = (vmcr & ICH_VMCR_BPR1_MASK) >> ICH_VMCR_BPR1_SHIFT; - vmcrp->bpr = (vmcr & ICH_VMCR_BPR0_MASK) >> ICH_VMCR_BPR0_SHIFT; - vmcrp->pmr = (vmcr & ICH_VMCR_PMR_MASK) >> ICH_VMCR_PMR_SHIFT; -} - -#define INITIAL_PENDBASER_VALUE \ - (GIC_BASER_CACHEABILITY(GICR_PENDBASER, INNER, RaWb) | \ - GIC_BASER_CACHEABILITY(GICR_PENDBASER, OUTER, SameAsInner) | \ - GIC_BASER_SHAREABILITY(GICR_PENDBASER, InnerShareable)) - -void vgic_v3_enable(struct kvm_vcpu *vcpu) -{ - struct vgic_v3_cpu_if *vgic_v3 = &vcpu->arch.vgic_cpu.vgic_v3; - - /* - * By forcing VMCR to zero, the GIC will restore the binary - * points to their reset values. Anything else resets to zero - * anyway. - */ - vgic_v3->vgic_vmcr = 0; - vgic_v3->vgic_elrsr = ~0; - - /* - * If we are emulating a GICv3, we do it in an non-GICv2-compatible - * way, so we force SRE to 1 to demonstrate this to the guest. - * This goes with the spec allowing the value to be RAO/WI. - */ - if (vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3) { - vgic_v3->vgic_sre = ICC_SRE_EL1_SRE; - vcpu->arch.vgic_cpu.pendbaser = INITIAL_PENDBASER_VALUE; - } else { - vgic_v3->vgic_sre = 0; - } - - /* Get the show on the road... 
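- * Setting the En bit in ICH_HCR_EL2 switches on the virtual CPU interface.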
*/ - vgic_v3->vgic_hcr = ICH_HCR_EN; -} - -/* check for overlapping regions and for regions crossing the end of memory */ -static bool vgic_v3_check_base(struct kvm *kvm) -{ - struct vgic_dist *d = &kvm->arch.vgic; - gpa_t redist_size = KVM_VGIC_V3_REDIST_SIZE; - - redist_size *= atomic_read(&kvm->online_vcpus); - - if (d->vgic_dist_base + KVM_VGIC_V3_DIST_SIZE < d->vgic_dist_base) - return false; - if (d->vgic_redist_base + redist_size < d->vgic_redist_base) - return false; - - if (d->vgic_dist_base + KVM_VGIC_V3_DIST_SIZE <= d->vgic_redist_base) - return true; - if (d->vgic_redist_base + redist_size <= d->vgic_dist_base) - return true; - - return false; -} - -int vgic_v3_map_resources(struct kvm *kvm) -{ - int ret = 0; - struct vgic_dist *dist = &kvm->arch.vgic; - - if (vgic_ready(kvm)) - goto out; - - if (IS_VGIC_ADDR_UNDEF(dist->vgic_dist_base) || - IS_VGIC_ADDR_UNDEF(dist->vgic_redist_base)) { - kvm_err("Need to set vgic distributor addresses first\n"); - ret = -ENXIO; - goto out; - } - - if (!vgic_v3_check_base(kvm)) { - kvm_err("VGIC redist and dist frames overlap\n"); - ret = -EINVAL; - goto out; - } - - /* - * For a VGICv3 we require the userland to explicitly initialize - * the VGIC before we need to use it. - */ - if (!vgic_initialized(kvm)) { - ret = -EBUSY; - goto out; - } - - ret = vgic_register_dist_iodev(kvm, dist->vgic_dist_base, VGIC_V3); - if (ret) { - kvm_err("Unable to register VGICv3 dist MMIO regions\n"); - goto out; - } - - ret = vgic_register_redist_iodevs(kvm, dist->vgic_redist_base); - if (ret) { - kvm_err("Unable to register VGICv3 redist MMIO regions\n"); - goto out; - } - - if (vgic_has_its(kvm)) { - ret = vgic_register_its_iodevs(kvm); - if (ret) { - kvm_err("Unable to register VGIC ITS MMIO regions\n"); - goto out; - } - } - - dist->ready = true; - -out: - if (ret) - kvm_vgic_destroy(kvm); - return ret; -} - -/** - * vgic_v3_probe - probe for a GICv3 compatible interrupt controller in DT - * @node: pointer to the DT node - * - * Returns 0 if a GICv3 has been found, returns an error code otherwise - */ -int vgic_v3_probe(const struct gic_kvm_info *info) -{ - u32 ich_vtr_el2 = kvm_call_hyp(__vgic_v3_get_ich_vtr_el2); - int ret; - - /* - * The ListRegs field is 5 bits, but there is a architectural - * maximum of 16 list registers. Just ignore bit 4... 
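- * The field encodes the number of implemented LRs minus one, so the masked
- * value below always yields between 1 and 16 list registers.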
- */ - kvm_vgic_global_state.nr_lr = (ich_vtr_el2 & 0xf) + 1; - kvm_vgic_global_state.can_emulate_gicv2 = false; - - if (!info->vcpu.start) { - kvm_info("GICv3: no GICV resource entry\n"); - kvm_vgic_global_state.vcpu_base = 0; - } else if (!PAGE_ALIGNED(info->vcpu.start)) { - pr_warn("GICV physical address 0x%llx not page aligned\n", - (unsigned long long)info->vcpu.start); - kvm_vgic_global_state.vcpu_base = 0; - } else if (!PAGE_ALIGNED(resource_size(&info->vcpu))) { - pr_warn("GICV size 0x%llx not a multiple of page size 0x%lx\n", - (unsigned long long)resource_size(&info->vcpu), - PAGE_SIZE); - kvm_vgic_global_state.vcpu_base = 0; - } else { - kvm_vgic_global_state.vcpu_base = info->vcpu.start; - kvm_vgic_global_state.can_emulate_gicv2 = true; - ret = kvm_register_vgic_device(KVM_DEV_TYPE_ARM_VGIC_V2); - if (ret) { - kvm_err("Cannot register GICv2 KVM device.\n"); - return ret; - } - kvm_info("vgic-v2@%llx\n", info->vcpu.start); - } - ret = kvm_register_vgic_device(KVM_DEV_TYPE_ARM_VGIC_V3); - if (ret) { - kvm_err("Cannot register GICv3 KVM device.\n"); - kvm_unregister_device_ops(KVM_DEV_TYPE_ARM_VGIC_V2); - return ret; - } - - if (kvm_vgic_global_state.vcpu_base == 0) - kvm_info("disabling GICv2 emulation\n"); - - kvm_vgic_global_state.vctrl_base = NULL; - kvm_vgic_global_state.type = VGIC_V3; - kvm_vgic_global_state.max_gic_vcpus = VGIC_V3_MAX_CPUS; - - return 0; -} diff --git a/virt/kvm/arm/vgic/vgic.c b/virt/kvm/arm/vgic/vgic.c deleted file mode 100644 index 6440b56..0000000 --- a/virt/kvm/arm/vgic/vgic.c +++ /dev/null @@ -1,731 +0,0 @@ -/* - * Copyright (C) 2015, 2016 ARM Ltd. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - */ - -#include <linux/kvm.h> -#include <linux/kvm_host.h> -#include <linux/list_sort.h> - -#include "vgic.h" - -#define CREATE_TRACE_POINTS -#include "../trace.h" - -#ifdef CONFIG_DEBUG_SPINLOCK -#define DEBUG_SPINLOCK_BUG_ON(p) BUG_ON(p) -#else -#define DEBUG_SPINLOCK_BUG_ON(p) -#endif - -struct vgic_global __section(.hyp.text) kvm_vgic_global_state = {.gicv3_cpuif = STATIC_KEY_FALSE_INIT,}; - -/* - * Locking order is always: - * its->cmd_lock (mutex) - * its->its_lock (mutex) - * vgic_cpu->ap_list_lock - * kvm->lpi_list_lock - * vgic_irq->irq_lock - * - * If you need to take multiple locks, always take the upper lock first, - * then the lower ones, e.g. first take the its_lock, then the irq_lock. - * If you are already holding a lock and need to take a higher one, you - * have to drop the lower ranking lock first and re-aquire it after having - * taken the upper one. - * - * When taking more than one ap_list_lock at the same time, always take the - * lowest numbered VCPU's ap_list_lock first, so: - * vcpuX->vcpu_id < vcpuY->vcpu_id: - * spin_lock(vcpuX->arch.vgic_cpu.ap_list_lock); - * spin_lock(vcpuY->arch.vgic_cpu.ap_list_lock); - */ - -/* - * Iterate over the VM's list of mapped LPIs to find the one with a - * matching interrupt ID and return a reference to the IRQ structure. 
- */ -static struct vgic_irq *vgic_get_lpi(struct kvm *kvm, u32 intid) -{ - struct vgic_dist *dist = &kvm->arch.vgic; - struct vgic_irq *irq = NULL; - - spin_lock(&dist->lpi_list_lock); - - list_for_each_entry(irq, &dist->lpi_list_head, lpi_list) { - if (irq->intid != intid) - continue; - - /* - * This increases the refcount, the caller is expected to - * call vgic_put_irq() later once it's finished with the IRQ. - */ - vgic_get_irq_kref(irq); - goto out_unlock; - } - irq = NULL; - -out_unlock: - spin_unlock(&dist->lpi_list_lock); - - return irq; -} - -/* - * This looks up the virtual interrupt ID to get the corresponding - * struct vgic_irq. It also increases the refcount, so any caller is expected - * to call vgic_put_irq() once it's finished with this IRQ. - */ -struct vgic_irq *vgic_get_irq(struct kvm *kvm, struct kvm_vcpu *vcpu, - u32 intid) -{ - /* SGIs and PPIs */ - if (intid <= VGIC_MAX_PRIVATE) - return &vcpu->arch.vgic_cpu.private_irqs[intid]; - - /* SPIs */ - if (intid <= VGIC_MAX_SPI) - return &kvm->arch.vgic.spis[intid - VGIC_NR_PRIVATE_IRQS]; - - /* LPIs */ - if (intid >= VGIC_MIN_LPI) - return vgic_get_lpi(kvm, intid); - - WARN(1, "Looking up struct vgic_irq for reserved INTID"); - return NULL; -} - -/* - * We can't do anything in here, because we lack the kvm pointer to - * lock and remove the item from the lpi_list. So we keep this function - * empty and use the return value of kref_put() to trigger the freeing. - */ -static void vgic_irq_release(struct kref *ref) -{ -} - -void vgic_put_irq(struct kvm *kvm, struct vgic_irq *irq) -{ - struct vgic_dist *dist = &kvm->arch.vgic; - - if (irq->intid < VGIC_MIN_LPI) - return; - - spin_lock(&dist->lpi_list_lock); - if (!kref_put(&irq->refcount, vgic_irq_release)) { - spin_unlock(&dist->lpi_list_lock); - return; - }; - - list_del(&irq->lpi_list); - dist->lpi_list_count--; - spin_unlock(&dist->lpi_list_lock); - - kfree(irq); -} - -/** - * kvm_vgic_target_oracle - compute the target vcpu for an irq - * - * @irq: The irq to route. Must be already locked. - * - * Based on the current state of the interrupt (enabled, pending, - * active, vcpu and target_vcpu), compute the next vcpu this should be - * given to. Return NULL if this shouldn't be injected at all. - * - * Requires the IRQ lock to be held. - */ -static struct kvm_vcpu *vgic_target_oracle(struct vgic_irq *irq) -{ - DEBUG_SPINLOCK_BUG_ON(!spin_is_locked(&irq->irq_lock)); - - /* If the interrupt is active, it must stay on the current vcpu */ - if (irq->active) - return irq->vcpu ? : irq->target_vcpu; - - /* - * If the IRQ is not active but enabled and pending, we should direct - * it to its configured target VCPU. - * If the distributor is disabled, pending interrupts shouldn't be - * forwarded. - */ - if (irq->enabled && irq->pending) { - if (unlikely(irq->target_vcpu && - !irq->target_vcpu->kvm->arch.vgic.enabled)) - return NULL; - - return irq->target_vcpu; - } - - /* If neither active nor pending and enabled, then this IRQ should not - * be queued to any VCPU. - */ - return NULL; -} - -/* - * The order of items in the ap_lists defines how we'll pack things in LRs as - * well, the first items in the list being the first things populated in the - * LRs. - * - * A hard rule is that active interrupts can never be pushed out of the LRs - * (and therefore take priority) since we cannot reliably trap on deactivation - * of IRQs and therefore they have to be present in the LRs. 
- * - * Otherwise things should be sorted by the priority field and the GIC - * hardware support will take care of preemption of priority groups etc. - * - * Return negative if "a" sorts before "b", 0 to preserve order, and positive - * to sort "b" before "a". - */ -static int vgic_irq_cmp(void *priv, struct list_head *a, struct list_head *b) -{ - struct vgic_irq *irqa = container_of(a, struct vgic_irq, ap_list); - struct vgic_irq *irqb = container_of(b, struct vgic_irq, ap_list); - bool penda, pendb; - int ret; - - spin_lock(&irqa->irq_lock); - spin_lock_nested(&irqb->irq_lock, SINGLE_DEPTH_NESTING); - - if (irqa->active || irqb->active) { - ret = (int)irqb->active - (int)irqa->active; - goto out; - } - - penda = irqa->enabled && irqa->pending; - pendb = irqb->enabled && irqb->pending; - - if (!penda || !pendb) { - ret = (int)pendb - (int)penda; - goto out; - } - - /* Both pending and enabled, sort by priority */ - ret = irqa->priority - irqb->priority; -out: - spin_unlock(&irqb->irq_lock); - spin_unlock(&irqa->irq_lock); - return ret; -} - -/* Must be called with the ap_list_lock held */ -static void vgic_sort_ap_list(struct kvm_vcpu *vcpu) -{ - struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; - - DEBUG_SPINLOCK_BUG_ON(!spin_is_locked(&vgic_cpu->ap_list_lock)); - - list_sort(NULL, &vgic_cpu->ap_list_head, vgic_irq_cmp); -} - -/* - * Only valid injection if changing level for level-triggered IRQs or for a - * rising edge. - */ -static bool vgic_validate_injection(struct vgic_irq *irq, bool level) -{ - switch (irq->config) { - case VGIC_CONFIG_LEVEL: - return irq->line_level != level; - case VGIC_CONFIG_EDGE: - return level; - } - - return false; -} - -/* - * Check whether an IRQ needs to (and can) be queued to a VCPU's ap list. - * Do the queuing if necessary, taking the right locks in the right order. - * Returns true when the IRQ was queued, false otherwise. - * - * Needs to be entered with the IRQ lock already held, but will return - * with all locks dropped. - */ -bool vgic_queue_irq_unlock(struct kvm *kvm, struct vgic_irq *irq) -{ - struct kvm_vcpu *vcpu; - - DEBUG_SPINLOCK_BUG_ON(!spin_is_locked(&irq->irq_lock)); - -retry: - vcpu = vgic_target_oracle(irq); - if (irq->vcpu || !vcpu) { - /* - * If this IRQ is already on a VCPU's ap_list, then it - * cannot be moved or modified and there is no more work for - * us to do. - * - * Otherwise, if the irq is not pending and enabled, it does - * not need to be inserted into an ap_list and there is also - * no more work for us to do. - */ - spin_unlock(&irq->irq_lock); - - /* - * We have to kick the VCPU here, because we could be - * queueing an edge-triggered interrupt for which we - * get no EOI maintenance interrupt. In that case, - * while the IRQ is already on the VCPU's AP list, the - * VCPU could have EOI'ed the original interrupt and - * won't see this one until it exits for some other - * reason. - */ - if (vcpu) - kvm_vcpu_kick(vcpu); - return false; - } - - /* - * We must unlock the irq lock to take the ap_list_lock where - * we are going to insert this new pending interrupt. - */ - spin_unlock(&irq->irq_lock); - - /* someone can do stuff here, which we re-check below */ - - spin_lock(&vcpu->arch.vgic_cpu.ap_list_lock); - spin_lock(&irq->irq_lock); - - /* - * Did something change behind our backs? - * - * There are two cases: - * 1) The irq lost its pending state or was disabled behind our - * backs and/or it was queued to another VCPU's ap_list. 
- * 2) Someone changed the affinity on this irq behind our - * backs and we are now holding the wrong ap_list_lock. - * - * In both cases, drop the locks and retry. - */ - - if (unlikely(irq->vcpu || vcpu != vgic_target_oracle(irq))) { - spin_unlock(&irq->irq_lock); - spin_unlock(&vcpu->arch.vgic_cpu.ap_list_lock); - - spin_lock(&irq->irq_lock); - goto retry; - } - - /* - * Grab a reference to the irq to reflect the fact that it is - * now in the ap_list. - */ - vgic_get_irq_kref(irq); - list_add_tail(&irq->ap_list, &vcpu->arch.vgic_cpu.ap_list_head); - irq->vcpu = vcpu; - - spin_unlock(&irq->irq_lock); - spin_unlock(&vcpu->arch.vgic_cpu.ap_list_lock); - - kvm_vcpu_kick(vcpu); - - return true; -} - -static int vgic_update_irq_pending(struct kvm *kvm, int cpuid, - unsigned int intid, bool level, - bool mapped_irq) -{ - struct kvm_vcpu *vcpu; - struct vgic_irq *irq; - int ret; - - trace_vgic_update_irq_pending(cpuid, intid, level); - - ret = vgic_lazy_init(kvm); - if (ret) - return ret; - - vcpu = kvm_get_vcpu(kvm, cpuid); - if (!vcpu && intid < VGIC_NR_PRIVATE_IRQS) - return -EINVAL; - - irq = vgic_get_irq(kvm, vcpu, intid); - if (!irq) - return -EINVAL; - - if (irq->hw != mapped_irq) { - vgic_put_irq(kvm, irq); - return -EINVAL; - } - - spin_lock(&irq->irq_lock); - - if (!vgic_validate_injection(irq, level)) { - /* Nothing to see here, move along... */ - spin_unlock(&irq->irq_lock); - vgic_put_irq(kvm, irq); - return 0; - } - - if (irq->config == VGIC_CONFIG_LEVEL) { - irq->line_level = level; - irq->pending = level || irq->soft_pending; - } else { - irq->pending = true; - } - - vgic_queue_irq_unlock(kvm, irq); - vgic_put_irq(kvm, irq); - - return 0; -} - -/** - * kvm_vgic_inject_irq - Inject an IRQ from a device to the vgic - * @kvm: The VM structure pointer - * @cpuid: The CPU for PPIs - * @intid: The INTID to inject a new state to. - * @level: Edge-triggered: true: to trigger the interrupt - * false: to ignore the call - * Level-sensitive true: raise the input signal - * false: lower the input signal - * - * The VGIC is not concerned with devices being active-LOW or active-HIGH for - * level-sensitive interrupts. You can think of the level parameter as 1 - * being HIGH and 0 being LOW and all devices being active-HIGH. - */ -int kvm_vgic_inject_irq(struct kvm *kvm, int cpuid, unsigned int intid, - bool level) -{ - return vgic_update_irq_pending(kvm, cpuid, intid, level, false); -} - -int kvm_vgic_inject_mapped_irq(struct kvm *kvm, int cpuid, unsigned int intid, - bool level) -{ - return vgic_update_irq_pending(kvm, cpuid, intid, level, true); -} - -int kvm_vgic_map_phys_irq(struct kvm_vcpu *vcpu, u32 virt_irq, u32 phys_irq) -{ - struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, virt_irq); - - BUG_ON(!irq); - - spin_lock(&irq->irq_lock); - - irq->hw = true; - irq->hwintid = phys_irq; - - spin_unlock(&irq->irq_lock); - vgic_put_irq(vcpu->kvm, irq); - - return 0; -} - -int kvm_vgic_unmap_phys_irq(struct kvm_vcpu *vcpu, unsigned int virt_irq) -{ - struct vgic_irq *irq; - - if (!vgic_initialized(vcpu->kvm)) - return -EAGAIN; - - irq = vgic_get_irq(vcpu->kvm, vcpu, virt_irq); - BUG_ON(!irq); - - spin_lock(&irq->irq_lock); - - irq->hw = false; - irq->hwintid = 0; - - spin_unlock(&irq->irq_lock); - vgic_put_irq(vcpu->kvm, irq); - - return 0; -} - -/** - * vgic_prune_ap_list - Remove non-relevant interrupts from the list - * - * @vcpu: The VCPU pointer - * - * Go over the list of "interesting" interrupts, and prune those that we - * won't have to consider in the near future. 
- */ -static void vgic_prune_ap_list(struct kvm_vcpu *vcpu) -{ - struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; - struct vgic_irq *irq, *tmp; - -retry: - spin_lock(&vgic_cpu->ap_list_lock); - - list_for_each_entry_safe(irq, tmp, &vgic_cpu->ap_list_head, ap_list) { - struct kvm_vcpu *target_vcpu, *vcpuA, *vcpuB; - - spin_lock(&irq->irq_lock); - - BUG_ON(vcpu != irq->vcpu); - - target_vcpu = vgic_target_oracle(irq); - - if (!target_vcpu) { - /* - * We don't need to process this interrupt any - * further, move it off the list. - */ - list_del(&irq->ap_list); - irq->vcpu = NULL; - spin_unlock(&irq->irq_lock); - - /* - * This vgic_put_irq call matches the - * vgic_get_irq_kref in vgic_queue_irq_unlock, - * where we added the LPI to the ap_list. As - * we remove the irq from the list, we drop - * also drop the refcount. - */ - vgic_put_irq(vcpu->kvm, irq); - continue; - } - - if (target_vcpu == vcpu) { - /* We're on the right CPU */ - spin_unlock(&irq->irq_lock); - continue; - } - - /* This interrupt looks like it has to be migrated. */ - - spin_unlock(&irq->irq_lock); - spin_unlock(&vgic_cpu->ap_list_lock); - - /* - * Ensure locking order by always locking the smallest - * ID first. - */ - if (vcpu->vcpu_id < target_vcpu->vcpu_id) { - vcpuA = vcpu; - vcpuB = target_vcpu; - } else { - vcpuA = target_vcpu; - vcpuB = vcpu; - } - - spin_lock(&vcpuA->arch.vgic_cpu.ap_list_lock); - spin_lock_nested(&vcpuB->arch.vgic_cpu.ap_list_lock, - SINGLE_DEPTH_NESTING); - spin_lock(&irq->irq_lock); - - /* - * If the affinity has been preserved, move the - * interrupt around. Otherwise, it means things have - * changed while the interrupt was unlocked, and we - * need to replay this. - * - * In all cases, we cannot trust the list not to have - * changed, so we restart from the beginning. - */ - if (target_vcpu == vgic_target_oracle(irq)) { - struct vgic_cpu *new_cpu = &target_vcpu->arch.vgic_cpu; - - list_del(&irq->ap_list); - irq->vcpu = target_vcpu; - list_add_tail(&irq->ap_list, &new_cpu->ap_list_head); - } - - spin_unlock(&irq->irq_lock); - spin_unlock(&vcpuB->arch.vgic_cpu.ap_list_lock); - spin_unlock(&vcpuA->arch.vgic_cpu.ap_list_lock); - goto retry; - } - - spin_unlock(&vgic_cpu->ap_list_lock); -} - -static inline void vgic_process_maintenance_interrupt(struct kvm_vcpu *vcpu) -{ - if (kvm_vgic_global_state.type == VGIC_V2) - vgic_v2_process_maintenance(vcpu); - else - vgic_v3_process_maintenance(vcpu); -} - -static inline void vgic_fold_lr_state(struct kvm_vcpu *vcpu) -{ - if (kvm_vgic_global_state.type == VGIC_V2) - vgic_v2_fold_lr_state(vcpu); - else - vgic_v3_fold_lr_state(vcpu); -} - -/* Requires the irq_lock to be held. */ -static inline void vgic_populate_lr(struct kvm_vcpu *vcpu, - struct vgic_irq *irq, int lr) -{ - DEBUG_SPINLOCK_BUG_ON(!spin_is_locked(&irq->irq_lock)); - - if (kvm_vgic_global_state.type == VGIC_V2) - vgic_v2_populate_lr(vcpu, irq, lr); - else - vgic_v3_populate_lr(vcpu, irq, lr); -} - -static inline void vgic_clear_lr(struct kvm_vcpu *vcpu, int lr) -{ - if (kvm_vgic_global_state.type == VGIC_V2) - vgic_v2_clear_lr(vcpu, lr); - else - vgic_v3_clear_lr(vcpu, lr); -} - -static inline void vgic_set_underflow(struct kvm_vcpu *vcpu) -{ - if (kvm_vgic_global_state.type == VGIC_V2) - vgic_v2_set_underflow(vcpu); - else - vgic_v3_set_underflow(vcpu); -} - -/* Requires the ap_list_lock to be held. 
*/ -static int compute_ap_list_depth(struct kvm_vcpu *vcpu) -{ - struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; - struct vgic_irq *irq; - int count = 0; - - DEBUG_SPINLOCK_BUG_ON(!spin_is_locked(&vgic_cpu->ap_list_lock)); - - list_for_each_entry(irq, &vgic_cpu->ap_list_head, ap_list) { - spin_lock(&irq->irq_lock); - /* GICv2 SGIs can count for more than one... */ - if (vgic_irq_is_sgi(irq->intid) && irq->source) - count += hweight8(irq->source); - else - count++; - spin_unlock(&irq->irq_lock); - } - return count; -} - -/* Requires the VCPU's ap_list_lock to be held. */ -static void vgic_flush_lr_state(struct kvm_vcpu *vcpu) -{ - struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; - struct vgic_irq *irq; - int count = 0; - - DEBUG_SPINLOCK_BUG_ON(!spin_is_locked(&vgic_cpu->ap_list_lock)); - - if (compute_ap_list_depth(vcpu) > kvm_vgic_global_state.nr_lr) { - vgic_set_underflow(vcpu); - vgic_sort_ap_list(vcpu); - } - - list_for_each_entry(irq, &vgic_cpu->ap_list_head, ap_list) { - spin_lock(&irq->irq_lock); - - if (unlikely(vgic_target_oracle(irq) != vcpu)) - goto next; - - /* - * If we get an SGI with multiple sources, try to get - * them in all at once. - */ - do { - vgic_populate_lr(vcpu, irq, count++); - } while (irq->source && count < kvm_vgic_global_state.nr_lr); - -next: - spin_unlock(&irq->irq_lock); - - if (count == kvm_vgic_global_state.nr_lr) - break; - } - - vcpu->arch.vgic_cpu.used_lrs = count; - - /* Nuke remaining LRs */ - for ( ; count < kvm_vgic_global_state.nr_lr; count++) - vgic_clear_lr(vcpu, count); -} - -/* Sync back the hardware VGIC state into our emulation after a guest's run. */ -void kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu) -{ - if (unlikely(!vgic_initialized(vcpu->kvm))) - return; - - vgic_process_maintenance_interrupt(vcpu); - vgic_fold_lr_state(vcpu); - vgic_prune_ap_list(vcpu); -} - -/* Flush our emulation state into the GIC hardware before entering the guest. */ -void kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu) -{ - if (unlikely(!vgic_initialized(vcpu->kvm))) - return; - - spin_lock(&vcpu->arch.vgic_cpu.ap_list_lock); - vgic_flush_lr_state(vcpu); - spin_unlock(&vcpu->arch.vgic_cpu.ap_list_lock); -} - -int kvm_vgic_vcpu_pending_irq(struct kvm_vcpu *vcpu) -{ - struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; - struct vgic_irq *irq; - bool pending = false; - - if (!vcpu->kvm->arch.vgic.enabled) - return false; - - spin_lock(&vgic_cpu->ap_list_lock); - - list_for_each_entry(irq, &vgic_cpu->ap_list_head, ap_list) { - spin_lock(&irq->irq_lock); - pending = irq->pending && irq->enabled; - spin_unlock(&irq->irq_lock); - - if (pending) - break; - } - - spin_unlock(&vgic_cpu->ap_list_lock); - - return pending; -} - -void vgic_kick_vcpus(struct kvm *kvm) -{ - struct kvm_vcpu *vcpu; - int c; - - /* - * We've injected an interrupt, time to find out who deserves - * a good kick... - */ - kvm_for_each_vcpu(c, vcpu, kvm) { - if (kvm_vgic_vcpu_pending_irq(vcpu)) - kvm_vcpu_kick(vcpu); - } -} - -bool kvm_vgic_map_is_active(struct kvm_vcpu *vcpu, unsigned int virt_irq) -{ - struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, virt_irq); - bool map_is_active; - - spin_lock(&irq->irq_lock); - map_is_active = irq->hw && irq->active; - spin_unlock(&irq->irq_lock); - vgic_put_irq(vcpu->kvm, irq); - - return map_is_active; -} - diff --git a/virt/kvm/arm/vgic/vgic.h b/virt/kvm/arm/vgic/vgic.h deleted file mode 100644 index 9d9e014..0000000 --- a/virt/kvm/arm/vgic/vgic.h +++ /dev/null @@ -1,123 +0,0 @@ -/* - * Copyright (C) 2015, 2016 ARM Ltd. 
- * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - */ -#ifndef __KVM_ARM_VGIC_NEW_H__ -#define __KVM_ARM_VGIC_NEW_H__ - -#include <linux/irqchip/arm-gic-common.h> - -#define PRODUCT_ID_KVM 0x4b /* ASCII code K */ -#define IMPLEMENTER_ARM 0x43b - -#define VGIC_ADDR_UNDEF (-1) -#define IS_VGIC_ADDR_UNDEF(_x) ((_x) == VGIC_ADDR_UNDEF) - -#define INTERRUPT_ID_BITS_SPIS 10 -#define INTERRUPT_ID_BITS_ITS 16 -#define VGIC_PRI_BITS 5 - -#define vgic_irq_is_sgi(intid) ((intid) < VGIC_NR_SGIS) - -struct vgic_vmcr { - u32 ctlr; - u32 abpr; - u32 bpr; - u32 pmr; -}; - -struct vgic_irq *vgic_get_irq(struct kvm *kvm, struct kvm_vcpu *vcpu, - u32 intid); -void vgic_put_irq(struct kvm *kvm, struct vgic_irq *irq); -bool vgic_queue_irq_unlock(struct kvm *kvm, struct vgic_irq *irq); -void vgic_kick_vcpus(struct kvm *kvm); - -int vgic_check_ioaddr(struct kvm *kvm, phys_addr_t *ioaddr, - phys_addr_t addr, phys_addr_t alignment); - -void vgic_v2_process_maintenance(struct kvm_vcpu *vcpu); -void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu); -void vgic_v2_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr); -void vgic_v2_clear_lr(struct kvm_vcpu *vcpu, int lr); -void vgic_v2_set_underflow(struct kvm_vcpu *vcpu); -int vgic_v2_has_attr_regs(struct kvm_device *dev, struct kvm_device_attr *attr); -int vgic_v2_dist_uaccess(struct kvm_vcpu *vcpu, bool is_write, - int offset, u32 *val); -int vgic_v2_cpuif_uaccess(struct kvm_vcpu *vcpu, bool is_write, - int offset, u32 *val); -void vgic_v2_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr); -void vgic_v2_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr); -void vgic_v2_enable(struct kvm_vcpu *vcpu); -int vgic_v2_probe(const struct gic_kvm_info *info); -int vgic_v2_map_resources(struct kvm *kvm); -int vgic_register_dist_iodev(struct kvm *kvm, gpa_t dist_base_address, - enum vgic_type); - -static inline void vgic_get_irq_kref(struct vgic_irq *irq) -{ - if (irq->intid < VGIC_MIN_LPI) - return; - - kref_get(&irq->refcount); -} - -void vgic_v3_process_maintenance(struct kvm_vcpu *vcpu); -void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu); -void vgic_v3_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr); -void vgic_v3_clear_lr(struct kvm_vcpu *vcpu, int lr); -void vgic_v3_set_underflow(struct kvm_vcpu *vcpu); -void vgic_v3_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr); -void vgic_v3_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr); -void vgic_v3_enable(struct kvm_vcpu *vcpu); -int vgic_v3_probe(const struct gic_kvm_info *info); -int vgic_v3_map_resources(struct kvm *kvm); -int vgic_register_redist_iodevs(struct kvm *kvm, gpa_t dist_base_address); - -#ifdef CONFIG_KVM_ARM_VGIC_V3_ITS -int vgic_register_its_iodevs(struct kvm *kvm); -bool vgic_has_its(struct kvm *kvm); -int kvm_vgic_register_its_device(void); -void vgic_enable_lpis(struct kvm_vcpu *vcpu); -int vgic_its_inject_msi(struct kvm *kvm, struct kvm_msi *msi); -#else -static inline int vgic_register_its_iodevs(struct kvm *kvm) -{ - 
return -ENODEV; -} - -static inline bool vgic_has_its(struct kvm *kvm) -{ - return false; -} - -static inline int kvm_vgic_register_its_device(void) -{ - return -ENODEV; -} - -static inline void vgic_enable_lpis(struct kvm_vcpu *vcpu) -{ -} - -static inline int vgic_its_inject_msi(struct kvm *kvm, struct kvm_msi *msi) -{ - return -ENODEV; -} -#endif - -int kvm_register_vgic_device(unsigned long type); -int vgic_lazy_init(struct kvm *kvm); -int vgic_init(struct kvm *kvm); - -#endif diff --git a/virt/kvm/async_pf.c b/virt/kvm/async_pf.c deleted file mode 100644 index efeceb0..0000000 --- a/virt/kvm/async_pf.c +++ /dev/null @@ -1,246 +0,0 @@ -/* - * kvm asynchronous fault support - * - * Copyright 2010 Red Hat, Inc. - * - * Author: - * Gleb Natapov <gleb@redhat.com> - * - * This file is free software; you can redistribute it and/or modify - * it under the terms of version 2 of the GNU General Public License - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. - */ - -#include <linux/kvm_host.h> -#include <linux/slab.h> -#include <linux/module.h> -#include <linux/mmu_context.h> - -#include "async_pf.h" -#include <trace/events/kvm.h> - -static inline void kvm_async_page_present_sync(struct kvm_vcpu *vcpu, - struct kvm_async_pf *work) -{ -#ifdef CONFIG_KVM_ASYNC_PF_SYNC - kvm_arch_async_page_present(vcpu, work); -#endif -} -static inline void kvm_async_page_present_async(struct kvm_vcpu *vcpu, - struct kvm_async_pf *work) -{ -#ifndef CONFIG_KVM_ASYNC_PF_SYNC - kvm_arch_async_page_present(vcpu, work); -#endif -} - -static struct kmem_cache *async_pf_cache; - -int kvm_async_pf_init(void) -{ - async_pf_cache = KMEM_CACHE(kvm_async_pf, 0); - - if (!async_pf_cache) - return -ENOMEM; - - return 0; -} - -void kvm_async_pf_deinit(void) -{ - kmem_cache_destroy(async_pf_cache); - async_pf_cache = NULL; -} - -void kvm_async_pf_vcpu_init(struct kvm_vcpu *vcpu) -{ - INIT_LIST_HEAD(&vcpu->async_pf.done); - INIT_LIST_HEAD(&vcpu->async_pf.queue); - spin_lock_init(&vcpu->async_pf.lock); -} - -static void async_pf_execute(struct work_struct *work) -{ - struct kvm_async_pf *apf = - container_of(work, struct kvm_async_pf, work); - struct mm_struct *mm = apf->mm; - struct kvm_vcpu *vcpu = apf->vcpu; - unsigned long addr = apf->addr; - gva_t gva = apf->gva; - - might_sleep(); - - /* - * This work is run asynchromously to the task which owns - * mm and might be done in another context, so we must - * use FOLL_REMOTE. 
- */ - __get_user_pages_unlocked(NULL, mm, addr, 1, NULL, - FOLL_WRITE | FOLL_REMOTE); - - kvm_async_page_present_sync(vcpu, apf); - - spin_lock(&vcpu->async_pf.lock); - list_add_tail(&apf->link, &vcpu->async_pf.done); - apf->vcpu = NULL; - spin_unlock(&vcpu->async_pf.lock); - - /* - * apf may be freed by kvm_check_async_pf_completion() after - * this point - */ - - trace_kvm_async_pf_completed(addr, gva); - - /* - * This memory barrier pairs with prepare_to_wait's set_current_state() - */ - smp_mb(); - if (swait_active(&vcpu->wq)) - swake_up(&vcpu->wq); - - mmput(mm); - kvm_put_kvm(vcpu->kvm); -} - -void kvm_clear_async_pf_completion_queue(struct kvm_vcpu *vcpu) -{ - spin_lock(&vcpu->async_pf.lock); - - /* cancel outstanding work queue item */ - while (!list_empty(&vcpu->async_pf.queue)) { - struct kvm_async_pf *work = - list_first_entry(&vcpu->async_pf.queue, - typeof(*work), queue); - list_del(&work->queue); - - /* - * We know it's present in vcpu->async_pf.done, do - * nothing here. - */ - if (!work->vcpu) - continue; - - spin_unlock(&vcpu->async_pf.lock); -#ifdef CONFIG_KVM_ASYNC_PF_SYNC - flush_work(&work->work); -#else - if (cancel_work_sync(&work->work)) { - mmput(work->mm); - kvm_put_kvm(vcpu->kvm); /* == work->vcpu->kvm */ - kmem_cache_free(async_pf_cache, work); - } -#endif - spin_lock(&vcpu->async_pf.lock); - } - - while (!list_empty(&vcpu->async_pf.done)) { - struct kvm_async_pf *work = - list_first_entry(&vcpu->async_pf.done, - typeof(*work), link); - list_del(&work->link); - kmem_cache_free(async_pf_cache, work); - } - spin_unlock(&vcpu->async_pf.lock); - - vcpu->async_pf.queued = 0; -} - -void kvm_check_async_pf_completion(struct kvm_vcpu *vcpu) -{ - struct kvm_async_pf *work; - - while (!list_empty_careful(&vcpu->async_pf.done) && - kvm_arch_can_inject_async_page_present(vcpu)) { - spin_lock(&vcpu->async_pf.lock); - work = list_first_entry(&vcpu->async_pf.done, typeof(*work), - link); - list_del(&work->link); - spin_unlock(&vcpu->async_pf.lock); - - kvm_arch_async_page_ready(vcpu, work); - kvm_async_page_present_async(vcpu, work); - - list_del(&work->queue); - vcpu->async_pf.queued--; - kmem_cache_free(async_pf_cache, work); - } -} - -int kvm_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, unsigned long hva, - struct kvm_arch_async_pf *arch) -{ - struct kvm_async_pf *work; - - if (vcpu->async_pf.queued >= ASYNC_PF_PER_VCPU) - return 0; - - /* setup delayed work */ - - /* - * do alloc nowait since if we are going to sleep anyway we - * may as well sleep faulting in page - */ - work = kmem_cache_zalloc(async_pf_cache, GFP_NOWAIT | __GFP_NOWARN); - if (!work) - return 0; - - work->wakeup_all = false; - work->vcpu = vcpu; - work->gva = gva; - work->addr = hva; - work->arch = *arch; - work->mm = current->mm; - atomic_inc(&work->mm->mm_users); - kvm_get_kvm(work->vcpu->kvm); - - /* this can't really happen otherwise gfn_to_pfn_async - would succeed */ - if (unlikely(kvm_is_error_hva(work->addr))) - goto retry_sync; - - INIT_WORK(&work->work, async_pf_execute); - if (!schedule_work(&work->work)) - goto retry_sync; - - list_add_tail(&work->queue, &vcpu->async_pf.queue); - vcpu->async_pf.queued++; - kvm_arch_async_page_not_present(vcpu, work); - return 1; -retry_sync: - kvm_put_kvm(work->vcpu->kvm); - mmput(work->mm); - kmem_cache_free(async_pf_cache, work); - return 0; -} - -int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu) -{ - struct kvm_async_pf *work; - - if (!list_empty_careful(&vcpu->async_pf.done)) - return 0; - - work = kmem_cache_zalloc(async_pf_cache, GFP_ATOMIC); - if 
(!work) - return -ENOMEM; - - work->wakeup_all = true; - INIT_LIST_HEAD(&work->queue); /* for list_del to work */ - - spin_lock(&vcpu->async_pf.lock); - list_add_tail(&work->link, &vcpu->async_pf.done); - spin_unlock(&vcpu->async_pf.lock); - - vcpu->async_pf.queued++; - return 0; -} diff --git a/virt/kvm/async_pf.h b/virt/kvm/async_pf.h deleted file mode 100644 index ec4cfa2..0000000 --- a/virt/kvm/async_pf.h +++ /dev/null @@ -1,36 +0,0 @@ -/* - * kvm asynchronous fault support - * - * Copyright 2010 Red Hat, Inc. - * - * Author: - * Gleb Natapov <gleb@redhat.com> - * - * This file is free software; you can redistribute it and/or modify - * it under the terms of version 2 of the GNU General Public License - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. - */ - -#ifndef __KVM_ASYNC_PF_H__ -#define __KVM_ASYNC_PF_H__ - -#ifdef CONFIG_KVM_ASYNC_PF -int kvm_async_pf_init(void); -void kvm_async_pf_deinit(void); -void kvm_async_pf_vcpu_init(struct kvm_vcpu *vcpu); -#else -#define kvm_async_pf_init() (0) -#define kvm_async_pf_deinit() do {} while (0) -#define kvm_async_pf_vcpu_init(C) do {} while (0) -#endif - -#endif diff --git a/virt/kvm/coalesced_mmio.c b/virt/kvm/coalesced_mmio.c deleted file mode 100644 index 571c1ce..0000000 --- a/virt/kvm/coalesced_mmio.c +++ /dev/null @@ -1,183 +0,0 @@ -/* - * KVM coalesced MMIO - * - * Copyright (c) 2008 Bull S.A.S. - * Copyright 2009 Red Hat, Inc. and/or its affiliates. - * - * Author: Laurent Vivier <Laurent.Vivier@bull.net> - * - */ - -#include <kvm/iodev.h> - -#include <linux/kvm_host.h> -#include <linux/slab.h> -#include <linux/kvm.h> - -#include "coalesced_mmio.h" - -static inline struct kvm_coalesced_mmio_dev *to_mmio(struct kvm_io_device *dev) -{ - return container_of(dev, struct kvm_coalesced_mmio_dev, dev); -} - -static int coalesced_mmio_in_range(struct kvm_coalesced_mmio_dev *dev, - gpa_t addr, int len) -{ - /* is it in a batchable area ? - * (addr,len) is fully included in - * (zone->addr, zone->size) - */ - if (len < 0) - return 0; - if (addr + len < addr) - return 0; - if (addr < dev->zone.addr) - return 0; - if (addr + len > dev->zone.addr + dev->zone.size) - return 0; - return 1; -} - -static int coalesced_mmio_has_room(struct kvm_coalesced_mmio_dev *dev) -{ - struct kvm_coalesced_mmio_ring *ring; - unsigned avail; - - /* Are we able to batch it ? 
*/ - - /* last is the first free entry - * check if we don't meet the first used entry - * there is always one unused entry in the buffer - */ - ring = dev->kvm->coalesced_mmio_ring; - avail = (ring->first - ring->last - 1) % KVM_COALESCED_MMIO_MAX; - if (avail == 0) { - /* full */ - return 0; - } - - return 1; -} - -static int coalesced_mmio_write(struct kvm_vcpu *vcpu, - struct kvm_io_device *this, gpa_t addr, - int len, const void *val) -{ - struct kvm_coalesced_mmio_dev *dev = to_mmio(this); - struct kvm_coalesced_mmio_ring *ring = dev->kvm->coalesced_mmio_ring; - - if (!coalesced_mmio_in_range(dev, addr, len)) - return -EOPNOTSUPP; - - spin_lock(&dev->kvm->ring_lock); - - if (!coalesced_mmio_has_room(dev)) { - spin_unlock(&dev->kvm->ring_lock); - return -EOPNOTSUPP; - } - - /* copy data in first free entry of the ring */ - - ring->coalesced_mmio[ring->last].phys_addr = addr; - ring->coalesced_mmio[ring->last].len = len; - memcpy(ring->coalesced_mmio[ring->last].data, val, len); - smp_wmb(); - ring->last = (ring->last + 1) % KVM_COALESCED_MMIO_MAX; - spin_unlock(&dev->kvm->ring_lock); - return 0; -} - -static void coalesced_mmio_destructor(struct kvm_io_device *this) -{ - struct kvm_coalesced_mmio_dev *dev = to_mmio(this); - - list_del(&dev->list); - - kfree(dev); -} - -static const struct kvm_io_device_ops coalesced_mmio_ops = { - .write = coalesced_mmio_write, - .destructor = coalesced_mmio_destructor, -}; - -int kvm_coalesced_mmio_init(struct kvm *kvm) -{ - struct page *page; - int ret; - - ret = -ENOMEM; - page = alloc_page(GFP_KERNEL | __GFP_ZERO); - if (!page) - goto out_err; - - ret = 0; - kvm->coalesced_mmio_ring = page_address(page); - - /* - * We're using this spinlock to sync access to the coalesced ring. - * The list doesn't need it's own lock since device registration and - * unregistration should only happen when kvm->slots_lock is held. 
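(Aside, not part of the patch: the ring written by coalesced_mmio_write() above is a single-producer/single-consumer buffer — the kernel appends at ->last under ring_lock, and userspace alone advances ->first. A minimal consumer-side sketch, illustrative only; it assumes "ring" points at the coalesced-MMIO page userspace has already mapped from the vCPU region, and handle_mmio_write() is a hypothetical callback:

    static void drain_coalesced_mmio(struct kvm_coalesced_mmio_ring *ring)
    {
            /* Userspace is the only consumer; only it advances ring->first. */
            while (ring->first != ring->last) {
                    struct kvm_coalesced_mmio *ent =
                            &ring->coalesced_mmio[ring->first];

                    /* replay the deferred write (hypothetical helper) */
                    handle_mmio_write(ent->phys_addr, ent->data, ent->len);

                    /* finish reading the entry before releasing the slot */
                    smp_wmb();
                    ring->first = (ring->first + 1) % KVM_COALESCED_MMIO_MAX;
            }
    }
)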
- */ - spin_lock_init(&kvm->ring_lock); - INIT_LIST_HEAD(&kvm->coalesced_zones); - -out_err: - return ret; -} - -void kvm_coalesced_mmio_free(struct kvm *kvm) -{ - if (kvm->coalesced_mmio_ring) - free_page((unsigned long)kvm->coalesced_mmio_ring); -} - -int kvm_vm_ioctl_register_coalesced_mmio(struct kvm *kvm, - struct kvm_coalesced_mmio_zone *zone) -{ - int ret; - struct kvm_coalesced_mmio_dev *dev; - - dev = kzalloc(sizeof(struct kvm_coalesced_mmio_dev), GFP_KERNEL); - if (!dev) - return -ENOMEM; - - kvm_iodevice_init(&dev->dev, &coalesced_mmio_ops); - dev->kvm = kvm; - dev->zone = *zone; - - mutex_lock(&kvm->slots_lock); - ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, zone->addr, - zone->size, &dev->dev); - if (ret < 0) - goto out_free_dev; - list_add_tail(&dev->list, &kvm->coalesced_zones); - mutex_unlock(&kvm->slots_lock); - - return 0; - -out_free_dev: - mutex_unlock(&kvm->slots_lock); - kfree(dev); - - return ret; -} - -int kvm_vm_ioctl_unregister_coalesced_mmio(struct kvm *kvm, - struct kvm_coalesced_mmio_zone *zone) -{ - struct kvm_coalesced_mmio_dev *dev, *tmp; - - mutex_lock(&kvm->slots_lock); - - list_for_each_entry_safe(dev, tmp, &kvm->coalesced_zones, list) - if (coalesced_mmio_in_range(dev, zone->addr, zone->size)) { - kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS, &dev->dev); - kvm_iodevice_destructor(&dev->dev); - } - - mutex_unlock(&kvm->slots_lock); - - return 0; -} diff --git a/virt/kvm/coalesced_mmio.h b/virt/kvm/coalesced_mmio.h deleted file mode 100644 index 6bca74c..0000000 --- a/virt/kvm/coalesced_mmio.h +++ /dev/null @@ -1,38 +0,0 @@ -#ifndef __KVM_COALESCED_MMIO_H__ -#define __KVM_COALESCED_MMIO_H__ - -/* - * KVM coalesced MMIO - * - * Copyright (c) 2008 Bull S.A.S. - * - * Author: Laurent Vivier <Laurent.Vivier@bull.net> - * - */ - -#ifdef CONFIG_KVM_MMIO - -#include <linux/list.h> - -struct kvm_coalesced_mmio_dev { - struct list_head list; - struct kvm_io_device dev; - struct kvm *kvm; - struct kvm_coalesced_mmio_zone zone; -}; - -int kvm_coalesced_mmio_init(struct kvm *kvm); -void kvm_coalesced_mmio_free(struct kvm *kvm); -int kvm_vm_ioctl_register_coalesced_mmio(struct kvm *kvm, - struct kvm_coalesced_mmio_zone *zone); -int kvm_vm_ioctl_unregister_coalesced_mmio(struct kvm *kvm, - struct kvm_coalesced_mmio_zone *zone); - -#else - -static inline int kvm_coalesced_mmio_init(struct kvm *kvm) { return 0; } -static inline void kvm_coalesced_mmio_free(struct kvm *kvm) { } - -#endif - -#endif diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c deleted file mode 100644 index a29786d..0000000 --- a/virt/kvm/eventfd.c +++ /dev/null @@ -1,956 +0,0 @@ -/* - * kvm eventfd support - use eventfd objects to signal various KVM events - * - * Copyright 2009 Novell. All Rights Reserved. - * Copyright 2010 Red Hat, Inc. and/or its affiliates. - * - * Author: - * Gregory Haskins <ghaskins@novell.com> - * - * This file is free software; you can redistribute it and/or modify - * it under the terms of version 2 of the GNU General Public License - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. 
- */ - -#include <linux/kvm_host.h> -#include <linux/kvm.h> -#include <linux/kvm_irqfd.h> -#include <linux/workqueue.h> -#include <linux/syscalls.h> -#include <linux/wait.h> -#include <linux/poll.h> -#include <linux/file.h> -#include <linux/list.h> -#include <linux/eventfd.h> -#include <linux/kernel.h> -#include <linux/srcu.h> -#include <linux/slab.h> -#include <linux/seqlock.h> -#include <linux/irqbypass.h> -#include <trace/events/kvm.h> - -#include <kvm/iodev.h> - -#ifdef CONFIG_HAVE_KVM_IRQFD - -static struct workqueue_struct *irqfd_cleanup_wq; - -static void -irqfd_inject(struct work_struct *work) -{ - struct kvm_kernel_irqfd *irqfd = - container_of(work, struct kvm_kernel_irqfd, inject); - struct kvm *kvm = irqfd->kvm; - - if (!irqfd->resampler) { - kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 1, - false); - kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 0, - false); - } else - kvm_set_irq(kvm, KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID, - irqfd->gsi, 1, false); -} - -/* - * Since resampler irqfds share an IRQ source ID, we de-assert once - * then notify all of the resampler irqfds using this GSI. We can't - * do multiple de-asserts or we risk racing with incoming re-asserts. - */ -static void -irqfd_resampler_ack(struct kvm_irq_ack_notifier *kian) -{ - struct kvm_kernel_irqfd_resampler *resampler; - struct kvm *kvm; - struct kvm_kernel_irqfd *irqfd; - int idx; - - resampler = container_of(kian, - struct kvm_kernel_irqfd_resampler, notifier); - kvm = resampler->kvm; - - kvm_set_irq(kvm, KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID, - resampler->notifier.gsi, 0, false); - - idx = srcu_read_lock(&kvm->irq_srcu); - - list_for_each_entry_rcu(irqfd, &resampler->list, resampler_link) - eventfd_signal(irqfd->resamplefd, 1); - - srcu_read_unlock(&kvm->irq_srcu, idx); -} - -static void -irqfd_resampler_shutdown(struct kvm_kernel_irqfd *irqfd) -{ - struct kvm_kernel_irqfd_resampler *resampler = irqfd->resampler; - struct kvm *kvm = resampler->kvm; - - mutex_lock(&kvm->irqfds.resampler_lock); - - list_del_rcu(&irqfd->resampler_link); - synchronize_srcu(&kvm->irq_srcu); - - if (list_empty(&resampler->list)) { - list_del(&resampler->link); - kvm_unregister_irq_ack_notifier(kvm, &resampler->notifier); - kvm_set_irq(kvm, KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID, - resampler->notifier.gsi, 0, false); - kfree(resampler); - } - - mutex_unlock(&kvm->irqfds.resampler_lock); -} - -/* - * Race-free decouple logic (ordering is critical) - */ -static void -irqfd_shutdown(struct work_struct *work) -{ - struct kvm_kernel_irqfd *irqfd = - container_of(work, struct kvm_kernel_irqfd, shutdown); - u64 cnt; - - /* - * Synchronize with the wait-queue and unhook ourselves to prevent - * further events. - */ - eventfd_ctx_remove_wait_queue(irqfd->eventfd, &irqfd->wait, &cnt); - - /* - * We know no new events will be scheduled at this point, so block - * until all previously outstanding events have completed - */ - flush_work(&irqfd->inject); - - if (irqfd->resampler) { - irqfd_resampler_shutdown(irqfd); - eventfd_ctx_put(irqfd->resamplefd); - } - - /* - * It is now safe to release the object's resources - */ -#ifdef CONFIG_HAVE_KVM_IRQ_BYPASS - irq_bypass_unregister_consumer(&irqfd->consumer); -#endif - eventfd_ctx_put(irqfd->eventfd); - kfree(irqfd); -} - - -/* assumes kvm->irqfds.lock is held */ -static bool -irqfd_is_active(struct kvm_kernel_irqfd *irqfd) -{ - return list_empty(&irqfd->list) ? 
false : true; -} - -/* - * Mark the irqfd as inactive and schedule it for removal - * - * assumes kvm->irqfds.lock is held - */ -static void -irqfd_deactivate(struct kvm_kernel_irqfd *irqfd) -{ - BUG_ON(!irqfd_is_active(irqfd)); - - list_del_init(&irqfd->list); - - queue_work(irqfd_cleanup_wq, &irqfd->shutdown); -} - -int __attribute__((weak)) kvm_arch_set_irq_inatomic( - struct kvm_kernel_irq_routing_entry *irq, - struct kvm *kvm, int irq_source_id, - int level, - bool line_status) -{ - return -EWOULDBLOCK; -} - -/* - * Called with wqh->lock held and interrupts disabled - */ -static int -irqfd_wakeup(wait_queue_t *wait, unsigned mode, int sync, void *key) -{ - struct kvm_kernel_irqfd *irqfd = - container_of(wait, struct kvm_kernel_irqfd, wait); - unsigned long flags = (unsigned long)key; - struct kvm_kernel_irq_routing_entry irq; - struct kvm *kvm = irqfd->kvm; - unsigned seq; - int idx; - - if (flags & POLLIN) { - idx = srcu_read_lock(&kvm->irq_srcu); - do { - seq = read_seqcount_begin(&irqfd->irq_entry_sc); - irq = irqfd->irq_entry; - } while (read_seqcount_retry(&irqfd->irq_entry_sc, seq)); - /* An event has been signaled, inject an interrupt */ - if (kvm_arch_set_irq_inatomic(&irq, kvm, - KVM_USERSPACE_IRQ_SOURCE_ID, 1, - false) == -EWOULDBLOCK) - schedule_work(&irqfd->inject); - srcu_read_unlock(&kvm->irq_srcu, idx); - } - - if (flags & POLLHUP) { - /* The eventfd is closing, detach from KVM */ - unsigned long flags; - - spin_lock_irqsave(&kvm->irqfds.lock, flags); - - /* - * We must check if someone deactivated the irqfd before - * we could acquire the irqfds.lock since the item is - * deactivated from the KVM side before it is unhooked from - * the wait-queue. If it is already deactivated, we can - * simply return knowing the other side will cleanup for us. 
- * We cannot race against the irqfd going away since the - * other side is required to acquire wqh->lock, which we hold - */ - if (irqfd_is_active(irqfd)) - irqfd_deactivate(irqfd); - - spin_unlock_irqrestore(&kvm->irqfds.lock, flags); - } - - return 0; -} - -static void -irqfd_ptable_queue_proc(struct file *file, wait_queue_head_t *wqh, - poll_table *pt) -{ - struct kvm_kernel_irqfd *irqfd = - container_of(pt, struct kvm_kernel_irqfd, pt); - add_wait_queue(wqh, &irqfd->wait); -} - -/* Must be called under irqfds.lock */ -static void irqfd_update(struct kvm *kvm, struct kvm_kernel_irqfd *irqfd) -{ - struct kvm_kernel_irq_routing_entry *e; - struct kvm_kernel_irq_routing_entry entries[KVM_NR_IRQCHIPS]; - int n_entries; - - n_entries = kvm_irq_map_gsi(kvm, entries, irqfd->gsi); - - write_seqcount_begin(&irqfd->irq_entry_sc); - - e = entries; - if (n_entries == 1) - irqfd->irq_entry = *e; - else - irqfd->irq_entry.type = 0; - - write_seqcount_end(&irqfd->irq_entry_sc); -} - -#ifdef CONFIG_HAVE_KVM_IRQ_BYPASS -void __attribute__((weak)) kvm_arch_irq_bypass_stop( - struct irq_bypass_consumer *cons) -{ -} - -void __attribute__((weak)) kvm_arch_irq_bypass_start( - struct irq_bypass_consumer *cons) -{ -} - -int __attribute__((weak)) kvm_arch_update_irqfd_routing( - struct kvm *kvm, unsigned int host_irq, - uint32_t guest_irq, bool set) -{ - return 0; -} -#endif - -static int -kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args) -{ - struct kvm_kernel_irqfd *irqfd, *tmp; - struct fd f; - struct eventfd_ctx *eventfd = NULL, *resamplefd = NULL; - int ret; - unsigned int events; - int idx; - - if (!kvm_arch_intc_initialized(kvm)) - return -EAGAIN; - - irqfd = kzalloc(sizeof(*irqfd), GFP_KERNEL); - if (!irqfd) - return -ENOMEM; - - irqfd->kvm = kvm; - irqfd->gsi = args->gsi; - INIT_LIST_HEAD(&irqfd->list); - INIT_WORK(&irqfd->inject, irqfd_inject); - INIT_WORK(&irqfd->shutdown, irqfd_shutdown); - seqcount_init(&irqfd->irq_entry_sc); - - f = fdget(args->fd); - if (!f.file) { - ret = -EBADF; - goto out; - } - - eventfd = eventfd_ctx_fileget(f.file); - if (IS_ERR(eventfd)) { - ret = PTR_ERR(eventfd); - goto fail; - } - - irqfd->eventfd = eventfd; - - if (args->flags & KVM_IRQFD_FLAG_RESAMPLE) { - struct kvm_kernel_irqfd_resampler *resampler; - - resamplefd = eventfd_ctx_fdget(args->resamplefd); - if (IS_ERR(resamplefd)) { - ret = PTR_ERR(resamplefd); - goto fail; - } - - irqfd->resamplefd = resamplefd; - INIT_LIST_HEAD(&irqfd->resampler_link); - - mutex_lock(&kvm->irqfds.resampler_lock); - - list_for_each_entry(resampler, - &kvm->irqfds.resampler_list, link) { - if (resampler->notifier.gsi == irqfd->gsi) { - irqfd->resampler = resampler; - break; - } - } - - if (!irqfd->resampler) { - resampler = kzalloc(sizeof(*resampler), GFP_KERNEL); - if (!resampler) { - ret = -ENOMEM; - mutex_unlock(&kvm->irqfds.resampler_lock); - goto fail; - } - - resampler->kvm = kvm; - INIT_LIST_HEAD(&resampler->list); - resampler->notifier.gsi = irqfd->gsi; - resampler->notifier.irq_acked = irqfd_resampler_ack; - INIT_LIST_HEAD(&resampler->link); - - list_add(&resampler->link, &kvm->irqfds.resampler_list); - kvm_register_irq_ack_notifier(kvm, - &resampler->notifier); - irqfd->resampler = resampler; - } - - list_add_rcu(&irqfd->resampler_link, &irqfd->resampler->list); - synchronize_srcu(&kvm->irq_srcu); - - mutex_unlock(&kvm->irqfds.resampler_lock); - } - - /* - * Install our own custom wake-up handling so we are notified via - * a callback whenever someone signals the underlying eventfd - */ - 
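(Aside, not part of the patch: the wake-up callback installed just below fires when the eventfd is signalled from outside this module. A minimal userspace sketch of how such an eventfd is bound to a guest interrupt line via the upstream KVM uapi that this deleted file served — illustrative only; vm_fd and gsi are assumed to exist:

    #include <linux/kvm.h>
    #include <sys/eventfd.h>
    #include <sys/ioctl.h>

    struct kvm_irqfd req = {
            .fd  = eventfd(0, EFD_CLOEXEC), /* signalling this fd ...        */
            .gsi = gsi,                     /* ... raises this guest IRQ line */
    };

    if (ioctl(vm_fd, KVM_IRQFD, &req) < 0)
            perror("KVM_IRQFD");
)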
init_waitqueue_func_entry(&irqfd->wait, irqfd_wakeup); - init_poll_funcptr(&irqfd->pt, irqfd_ptable_queue_proc); - - spin_lock_irq(&kvm->irqfds.lock); - - ret = 0; - list_for_each_entry(tmp, &kvm->irqfds.items, list) { - if (irqfd->eventfd != tmp->eventfd) - continue; - /* This fd is used for another irq already. */ - ret = -EBUSY; - spin_unlock_irq(&kvm->irqfds.lock); - goto fail; - } - - idx = srcu_read_lock(&kvm->irq_srcu); - irqfd_update(kvm, irqfd); - srcu_read_unlock(&kvm->irq_srcu, idx); - - list_add_tail(&irqfd->list, &kvm->irqfds.items); - - spin_unlock_irq(&kvm->irqfds.lock); - - /* - * Check if there was an event already pending on the eventfd - * before we registered, and trigger it as if we didn't miss it. - */ - events = f.file->f_op->poll(f.file, &irqfd->pt); - - if (events & POLLIN) - schedule_work(&irqfd->inject); - - /* - * do not drop the file until the irqfd is fully initialized, otherwise - * we might race against the POLLHUP - */ - fdput(f); -#ifdef CONFIG_HAVE_KVM_IRQ_BYPASS - if (kvm_arch_has_irq_bypass()) { - irqfd->consumer.token = (void *)irqfd->eventfd; - irqfd->consumer.add_producer = kvm_arch_irq_bypass_add_producer; - irqfd->consumer.del_producer = kvm_arch_irq_bypass_del_producer; - irqfd->consumer.stop = kvm_arch_irq_bypass_stop; - irqfd->consumer.start = kvm_arch_irq_bypass_start; - ret = irq_bypass_register_consumer(&irqfd->consumer); - if (ret) - pr_info("irq bypass consumer (token %p) registration fails: %d\n", - irqfd->consumer.token, ret); - } -#endif - - return 0; - -fail: - if (irqfd->resampler) - irqfd_resampler_shutdown(irqfd); - - if (resamplefd && !IS_ERR(resamplefd)) - eventfd_ctx_put(resamplefd); - - if (eventfd && !IS_ERR(eventfd)) - eventfd_ctx_put(eventfd); - - fdput(f); - -out: - kfree(irqfd); - return ret; -} - -bool kvm_irq_has_notifier(struct kvm *kvm, unsigned irqchip, unsigned pin) -{ - struct kvm_irq_ack_notifier *kian; - int gsi, idx; - - idx = srcu_read_lock(&kvm->irq_srcu); - gsi = kvm_irq_map_chip_pin(kvm, irqchip, pin); - if (gsi != -1) - hlist_for_each_entry_rcu(kian, &kvm->irq_ack_notifier_list, - link) - if (kian->gsi == gsi) { - srcu_read_unlock(&kvm->irq_srcu, idx); - return true; - } - - srcu_read_unlock(&kvm->irq_srcu, idx); - - return false; -} -EXPORT_SYMBOL_GPL(kvm_irq_has_notifier); - -void kvm_notify_acked_gsi(struct kvm *kvm, int gsi) -{ - struct kvm_irq_ack_notifier *kian; - - hlist_for_each_entry_rcu(kian, &kvm->irq_ack_notifier_list, - link) - if (kian->gsi == gsi) - kian->irq_acked(kian); -} - -void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin) -{ - int gsi, idx; - - trace_kvm_ack_irq(irqchip, pin); - - idx = srcu_read_lock(&kvm->irq_srcu); - gsi = kvm_irq_map_chip_pin(kvm, irqchip, pin); - if (gsi != -1) - kvm_notify_acked_gsi(kvm, gsi); - srcu_read_unlock(&kvm->irq_srcu, idx); -} - -void kvm_register_irq_ack_notifier(struct kvm *kvm, - struct kvm_irq_ack_notifier *kian) -{ - mutex_lock(&kvm->irq_lock); - hlist_add_head_rcu(&kian->link, &kvm->irq_ack_notifier_list); - mutex_unlock(&kvm->irq_lock); - kvm_vcpu_request_scan_ioapic(kvm); -} - -void kvm_unregister_irq_ack_notifier(struct kvm *kvm, - struct kvm_irq_ack_notifier *kian) -{ - mutex_lock(&kvm->irq_lock); - hlist_del_init_rcu(&kian->link); - mutex_unlock(&kvm->irq_lock); - synchronize_srcu(&kvm->irq_srcu); - kvm_vcpu_request_scan_ioapic(kvm); -} -#endif - -void -kvm_eventfd_init(struct kvm *kvm) -{ -#ifdef CONFIG_HAVE_KVM_IRQFD - spin_lock_init(&kvm->irqfds.lock); - INIT_LIST_HEAD(&kvm->irqfds.items); - 
INIT_LIST_HEAD(&kvm->irqfds.resampler_list); - mutex_init(&kvm->irqfds.resampler_lock); -#endif - INIT_LIST_HEAD(&kvm->ioeventfds); -} - -#ifdef CONFIG_HAVE_KVM_IRQFD -/* - * shutdown any irqfd's that match fd+gsi - */ -static int -kvm_irqfd_deassign(struct kvm *kvm, struct kvm_irqfd *args) -{ - struct kvm_kernel_irqfd *irqfd, *tmp; - struct eventfd_ctx *eventfd; - - eventfd = eventfd_ctx_fdget(args->fd); - if (IS_ERR(eventfd)) - return PTR_ERR(eventfd); - - spin_lock_irq(&kvm->irqfds.lock); - - list_for_each_entry_safe(irqfd, tmp, &kvm->irqfds.items, list) { - if (irqfd->eventfd == eventfd && irqfd->gsi == args->gsi) { - /* - * This clearing of irq_entry.type is needed for when - * another thread calls kvm_irq_routing_update before - * we flush workqueue below (we synchronize with - * kvm_irq_routing_update using irqfds.lock). - */ - write_seqcount_begin(&irqfd->irq_entry_sc); - irqfd->irq_entry.type = 0; - write_seqcount_end(&irqfd->irq_entry_sc); - irqfd_deactivate(irqfd); - } - } - - spin_unlock_irq(&kvm->irqfds.lock); - eventfd_ctx_put(eventfd); - - /* - * Block until we know all outstanding shutdown jobs have completed - * so that we guarantee there will not be any more interrupts on this - * gsi once this deassign function returns. - */ - flush_workqueue(irqfd_cleanup_wq); - - return 0; -} - -int -kvm_irqfd(struct kvm *kvm, struct kvm_irqfd *args) -{ - if (args->flags & ~(KVM_IRQFD_FLAG_DEASSIGN | KVM_IRQFD_FLAG_RESAMPLE)) - return -EINVAL; - - if (args->flags & KVM_IRQFD_FLAG_DEASSIGN) - return kvm_irqfd_deassign(kvm, args); - - return kvm_irqfd_assign(kvm, args); -} - -/* - * This function is called as the kvm VM fd is being released. Shutdown all - * irqfds that still remain open - */ -void -kvm_irqfd_release(struct kvm *kvm) -{ - struct kvm_kernel_irqfd *irqfd, *tmp; - - spin_lock_irq(&kvm->irqfds.lock); - - list_for_each_entry_safe(irqfd, tmp, &kvm->irqfds.items, list) - irqfd_deactivate(irqfd); - - spin_unlock_irq(&kvm->irqfds.lock); - - /* - * Block until we know all outstanding shutdown jobs have completed - * since we do not take a kvm* reference. - */ - flush_workqueue(irqfd_cleanup_wq); - -} - -/* - * Take note of a change in irq routing. - * Caller must invoke synchronize_srcu(&kvm->irq_srcu) afterwards. - */ -void kvm_irq_routing_update(struct kvm *kvm) -{ - struct kvm_kernel_irqfd *irqfd; - - spin_lock_irq(&kvm->irqfds.lock); - - list_for_each_entry(irqfd, &kvm->irqfds.items, list) { - irqfd_update(kvm, irqfd); - -#ifdef CONFIG_HAVE_KVM_IRQ_BYPASS - if (irqfd->producer) { - int ret = kvm_arch_update_irqfd_routing( - irqfd->kvm, irqfd->producer->irq, - irqfd->gsi, 1); - WARN_ON(ret); - } -#endif - } - - spin_unlock_irq(&kvm->irqfds.lock); -} - -/* - * create a host-wide workqueue for issuing deferred shutdown requests - * aggregated from all vm* instances. We need our own isolated - * queue to ease flushing work items when a VM exits. - */ -int kvm_irqfd_init(void) -{ - irqfd_cleanup_wq = alloc_workqueue("kvm-irqfd-cleanup", 0, 0); - if (!irqfd_cleanup_wq) - return -ENOMEM; - - return 0; -} - -void kvm_irqfd_exit(void) -{ - destroy_workqueue(irqfd_cleanup_wq); -} -#endif - -/* - * -------------------------------------------------------------------- - * ioeventfd: translate a PIO/MMIO memory write to an eventfd signal. - * - * userspace can register a PIO/MMIO address with an eventfd for receiving - * notification when the memory has been touched. 
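(Aside, not part of the patch: a hedged userspace sketch of that registration, using the upstream KVM uapi implemented by the deleted code below — illustrative only; vm_fd and mmio_addr are assumptions:

    #include <linux/kvm.h>
    #include <sys/eventfd.h>
    #include <sys/ioctl.h>

    struct kvm_ioeventfd io = {
            .addr      = mmio_addr,              /* guest-physical address to watch */
            .len       = 4,                      /* 0, 1, 2, 4 or 8 bytes           */
            .fd        = eventfd(0, EFD_CLOEXEC),
            .flags     = KVM_IOEVENTFD_FLAG_DATAMATCH,
            .datamatch = 0x1,                    /* signal only on this written value */
    };

    if (ioctl(vm_fd, KVM_IOEVENTFD, &io) < 0)
            perror("KVM_IOEVENTFD");
)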
- * -------------------------------------------------------------------- - */ - -struct _ioeventfd { - struct list_head list; - u64 addr; - int length; - struct eventfd_ctx *eventfd; - u64 datamatch; - struct kvm_io_device dev; - u8 bus_idx; - bool wildcard; -}; - -static inline struct _ioeventfd * -to_ioeventfd(struct kvm_io_device *dev) -{ - return container_of(dev, struct _ioeventfd, dev); -} - -static void -ioeventfd_release(struct _ioeventfd *p) -{ - eventfd_ctx_put(p->eventfd); - list_del(&p->list); - kfree(p); -} - -static bool -ioeventfd_in_range(struct _ioeventfd *p, gpa_t addr, int len, const void *val) -{ - u64 _val; - - if (addr != p->addr) - /* address must be precise for a hit */ - return false; - - if (!p->length) - /* length = 0 means only look at the address, so always a hit */ - return true; - - if (len != p->length) - /* address-range must be precise for a hit */ - return false; - - if (p->wildcard) - /* all else equal, wildcard is always a hit */ - return true; - - /* otherwise, we have to actually compare the data */ - - BUG_ON(!IS_ALIGNED((unsigned long)val, len)); - - switch (len) { - case 1: - _val = *(u8 *)val; - break; - case 2: - _val = *(u16 *)val; - break; - case 4: - _val = *(u32 *)val; - break; - case 8: - _val = *(u64 *)val; - break; - default: - return false; - } - - return _val == p->datamatch ? true : false; -} - -/* MMIO/PIO writes trigger an event if the addr/val match */ -static int -ioeventfd_write(struct kvm_vcpu *vcpu, struct kvm_io_device *this, gpa_t addr, - int len, const void *val) -{ - struct _ioeventfd *p = to_ioeventfd(this); - - if (!ioeventfd_in_range(p, addr, len, val)) - return -EOPNOTSUPP; - - eventfd_signal(p->eventfd, 1); - return 0; -} - -/* - * This function is called as KVM is completely shutting down. 
We do not - * need to worry about locking just nuke anything we have as quickly as possible - */ -static void -ioeventfd_destructor(struct kvm_io_device *this) -{ - struct _ioeventfd *p = to_ioeventfd(this); - - ioeventfd_release(p); -} - -static const struct kvm_io_device_ops ioeventfd_ops = { - .write = ioeventfd_write, - .destructor = ioeventfd_destructor, -}; - -/* assumes kvm->slots_lock held */ -static bool -ioeventfd_check_collision(struct kvm *kvm, struct _ioeventfd *p) -{ - struct _ioeventfd *_p; - - list_for_each_entry(_p, &kvm->ioeventfds, list) - if (_p->bus_idx == p->bus_idx && - _p->addr == p->addr && - (!_p->length || !p->length || - (_p->length == p->length && - (_p->wildcard || p->wildcard || - _p->datamatch == p->datamatch)))) - return true; - - return false; -} - -static enum kvm_bus ioeventfd_bus_from_flags(__u32 flags) -{ - if (flags & KVM_IOEVENTFD_FLAG_PIO) - return KVM_PIO_BUS; - if (flags & KVM_IOEVENTFD_FLAG_VIRTIO_CCW_NOTIFY) - return KVM_VIRTIO_CCW_NOTIFY_BUS; - return KVM_MMIO_BUS; -} - -static int kvm_assign_ioeventfd_idx(struct kvm *kvm, - enum kvm_bus bus_idx, - struct kvm_ioeventfd *args) -{ - - struct eventfd_ctx *eventfd; - struct _ioeventfd *p; - int ret; - - eventfd = eventfd_ctx_fdget(args->fd); - if (IS_ERR(eventfd)) - return PTR_ERR(eventfd); - - p = kzalloc(sizeof(*p), GFP_KERNEL); - if (!p) { - ret = -ENOMEM; - goto fail; - } - - INIT_LIST_HEAD(&p->list); - p->addr = args->addr; - p->bus_idx = bus_idx; - p->length = args->len; - p->eventfd = eventfd; - - /* The datamatch feature is optional, otherwise this is a wildcard */ - if (args->flags & KVM_IOEVENTFD_FLAG_DATAMATCH) - p->datamatch = args->datamatch; - else - p->wildcard = true; - - mutex_lock(&kvm->slots_lock); - - /* Verify that there isn't a match already */ - if (ioeventfd_check_collision(kvm, p)) { - ret = -EEXIST; - goto unlock_fail; - } - - kvm_iodevice_init(&p->dev, &ioeventfd_ops); - - ret = kvm_io_bus_register_dev(kvm, bus_idx, p->addr, p->length, - &p->dev); - if (ret < 0) - goto unlock_fail; - - kvm->buses[bus_idx]->ioeventfd_count++; - list_add_tail(&p->list, &kvm->ioeventfds); - - mutex_unlock(&kvm->slots_lock); - - return 0; - -unlock_fail: - mutex_unlock(&kvm->slots_lock); - -fail: - kfree(p); - eventfd_ctx_put(eventfd); - - return ret; -} - -static int -kvm_deassign_ioeventfd_idx(struct kvm *kvm, enum kvm_bus bus_idx, - struct kvm_ioeventfd *args) -{ - struct _ioeventfd *p, *tmp; - struct eventfd_ctx *eventfd; - int ret = -ENOENT; - - eventfd = eventfd_ctx_fdget(args->fd); - if (IS_ERR(eventfd)) - return PTR_ERR(eventfd); - - mutex_lock(&kvm->slots_lock); - - list_for_each_entry_safe(p, tmp, &kvm->ioeventfds, list) { - bool wildcard = !(args->flags & KVM_IOEVENTFD_FLAG_DATAMATCH); - - if (p->bus_idx != bus_idx || - p->eventfd != eventfd || - p->addr != args->addr || - p->length != args->len || - p->wildcard != wildcard) - continue; - - if (!p->wildcard && p->datamatch != args->datamatch) - continue; - - kvm_io_bus_unregister_dev(kvm, bus_idx, &p->dev); - kvm->buses[bus_idx]->ioeventfd_count--; - ioeventfd_release(p); - ret = 0; - break; - } - - mutex_unlock(&kvm->slots_lock); - - eventfd_ctx_put(eventfd); - - return ret; -} - -static int kvm_deassign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args) -{ - enum kvm_bus bus_idx = ioeventfd_bus_from_flags(args->flags); - int ret = kvm_deassign_ioeventfd_idx(kvm, bus_idx, args); - - if (!args->len && bus_idx == KVM_MMIO_BUS) - kvm_deassign_ioeventfd_idx(kvm, KVM_FAST_MMIO_BUS, args); - - return ret; -} - -static int 
-kvm_assign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args) -{ - enum kvm_bus bus_idx; - int ret; - - bus_idx = ioeventfd_bus_from_flags(args->flags); - /* must be natural-word sized, or 0 to ignore length */ - switch (args->len) { - case 0: - case 1: - case 2: - case 4: - case 8: - break; - default: - return -EINVAL; - } - - /* check for range overflow */ - if (args->addr + args->len < args->addr) - return -EINVAL; - - /* check for extra flags that we don't understand */ - if (args->flags & ~KVM_IOEVENTFD_VALID_FLAG_MASK) - return -EINVAL; - - /* ioeventfd with no length can't be combined with DATAMATCH */ - if (!args->len && (args->flags & KVM_IOEVENTFD_FLAG_DATAMATCH)) - return -EINVAL; - - ret = kvm_assign_ioeventfd_idx(kvm, bus_idx, args); - if (ret) - goto fail; - - /* When length is ignored, MMIO is also put on a separate bus, for - * faster lookups. - */ - if (!args->len && bus_idx == KVM_MMIO_BUS) { - ret = kvm_assign_ioeventfd_idx(kvm, KVM_FAST_MMIO_BUS, args); - if (ret < 0) - goto fast_fail; - } - - return 0; - -fast_fail: - kvm_deassign_ioeventfd_idx(kvm, bus_idx, args); -fail: - return ret; -} - -int -kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args) -{ - if (args->flags & KVM_IOEVENTFD_FLAG_DEASSIGN) - return kvm_deassign_ioeventfd(kvm, args); - - return kvm_assign_ioeventfd(kvm, args); -} diff --git a/virt/kvm/irqchip.c b/virt/kvm/irqchip.c index 3bcc999..3885f42 100644..100755 --- a/virt/kvm/irqchip.c +++ b/virt/kvm/irqchip.c @@ -3,6 +3,7 @@ * Copyright (c) 2007, Intel Corporation. * Copyright 2010 Red Hat, Inc. and/or its affiliates. * Copyright (c) 2013, Alexander Graf <agraf@suse.de> + * Copyright 2019 Google LLC * * This program is free software; you can redistribute it and/or modify it * under the terms and conditions of the GNU General Public License, @@ -25,11 +26,7 @@ */ #include <linux/kvm_host.h> -#include <linux/slab.h> -#include <linux/srcu.h> -#include <linux/export.h> -#include <trace/events/kvm.h> -#include "irq.h" +#include "arch\x86\kvm\irq.h" int kvm_irq_map_gsi(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *entries, int gsi) @@ -38,13 +35,15 @@ int kvm_irq_map_gsi(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e; int n = 0; - irq_rt = srcu_dereference_check(kvm->irq_routing, &kvm->irq_srcu, - lockdep_is_held(&kvm->irq_lock)); + irq_rt = kvm->irq_routing; + if (irq_rt && gsi < irq_rt->nr_rt_entries) { +#define LIST_ENTRY_TYPE_INFO struct kvm_kernel_irq_routing_entry hlist_for_each_entry(e, &irq_rt->map[gsi], link) { entries[n] = *e; ++n; } +#undef LIST_ENTRY_TYPE_INFO } return n; @@ -62,7 +61,7 @@ int kvm_send_userspace_msi(struct kvm *kvm, struct kvm_msi *msi) { struct kvm_kernel_irq_routing_entry route; - if (!irqchip_in_kernel(kvm) || (msi->flags & ~KVM_MSI_VALID_DEVID)) + if (!irqchip_in_kernel(kvm) || (msi->flags & ~GVM_MSI_VALID_DEVID)) return -EINVAL; route.msi.address_lo = msi->address_lo; @@ -71,7 +70,7 @@ int kvm_send_userspace_msi(struct kvm *kvm, struct kvm_msi *msi) route.msi.flags = msi->flags; route.msi.devid = msi->devid; - return kvm_set_msi(&route, kvm, KVM_USERSPACE_IRQ_SOURCE_ID, 1, false); + return kvm_set_msi(&route, kvm, GVM_USERSPACE_IRQ_SOURCE_ID, 1, false); } /* @@ -83,11 +82,9 @@ int kvm_send_userspace_msi(struct kvm *kvm, struct kvm_msi *msi) int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level, bool line_status) { - struct kvm_kernel_irq_routing_entry irq_set[KVM_NR_IRQCHIPS]; + struct kvm_kernel_irq_routing_entry irq_set[GVM_NR_IRQCHIPS]; int ret = -1, i, idx; - 
trace_kvm_set_irq(irq, level, irq_source_id); - /* Not possible to detect if the guest uses the PIC or the * IOAPIC. So set the bit in both. The guest will ignore * writes to the unused one. @@ -120,10 +117,13 @@ static void free_irq_routing_table(struct kvm_irq_routing_table *rt) struct kvm_kernel_irq_routing_entry *e; struct hlist_node *n; +#define LIST_ENTRY_TYPE_INFO struct kvm_kernel_irq_routing_entry hlist_for_each_entry_safe(e, n, &rt->map[i], link) { + n = e->link.next; hlist_del(&e->link); kfree(e); } +#undef LIST_ENTRY_TYPE_INFO } kfree(rt); @@ -133,7 +133,7 @@ void kvm_free_irq_routing(struct kvm *kvm) { /* Called only during vm destruction. Nobody can use the pointer at this stage */ - struct kvm_irq_routing_table *rt = rcu_access_pointer(kvm->irq_routing); + struct kvm_irq_routing_table *rt = kvm->irq_routing; free_irq_routing_table(rt); } @@ -149,18 +149,20 @@ static int setup_routing_entry(struct kvm *kvm, * Do not allow GSI to be mapped to the same irqchip more than once. * Allow only one to one mapping between GSI and non-irqchip routing. */ +#define LIST_ENTRY_TYPE_INFO struct kvm_kernel_irq_routing_entry hlist_for_each_entry(ei, &rt->map[ue->gsi], link) - if (ei->type != KVM_IRQ_ROUTING_IRQCHIP || - ue->type != KVM_IRQ_ROUTING_IRQCHIP || + if (ei->type != GVM_IRQ_ROUTING_IRQCHIP || + ue->type != GVM_IRQ_ROUTING_IRQCHIP || ue->u.irqchip.irqchip == ei->irqchip.irqchip) return r; +#undef LIST_ENTRY_TYPE_INFO e->gsi = ue->gsi; e->type = ue->type; r = kvm_set_routing_entry(kvm, e, ue); if (r) goto out; - if (e->type == KVM_IRQ_ROUTING_IRQCHIP) + if (e->type == GVM_IRQ_ROUTING_IRQCHIP) rt->chip[e->irqchip.irqchip][e->irqchip.pin] = e->gsi; hlist_add_head(&e->link, &rt->map[e->gsi]); @@ -169,9 +171,10 @@ out: return r; } -void __attribute__((weak)) kvm_arch_irq_routing_update(struct kvm *kvm) +void kvm_arch_irq_routing_update_default(struct kvm *kvm) { } +#pragma comment(linker, "/alternatename:kvm_arch_irq_routing_update=kvm_arch_irq_routing_update_default") int kvm_set_irq_routing(struct kvm *kvm, const struct kvm_irq_routing_entry *ue, @@ -184,7 +187,7 @@ int kvm_set_irq_routing(struct kvm *kvm, int r; for (i = 0; i < nr; ++i) { - if (ue[i].gsi >= KVM_MAX_IRQ_ROUTES) + if (ue[i].gsi >= GVM_MAX_IRQ_ROUTES) return -EINVAL; nr_rt_entries = max(nr_rt_entries, ue[i].gsi); } @@ -198,8 +201,8 @@ int kvm_set_irq_routing(struct kvm *kvm, return -ENOMEM; new->nr_rt_entries = nr_rt_entries; - for (i = 0; i < KVM_NR_IRQCHIPS; i++) - for (j = 0; j < KVM_IRQCHIP_NUM_PINS; j++) + for (i = 0; i < GVM_NR_IRQCHIPS; i++) + for (j = 0; j < GVM_IRQCHIP_NUM_PINS; j++) new->chip[i][j] = -1; for (i = 0; i < nr; ++i) { @@ -210,8 +213,8 @@ int kvm_set_irq_routing(struct kvm *kvm, r = -EINVAL; switch (ue->type) { - case KVM_IRQ_ROUTING_MSI: - if (ue->flags & ~KVM_MSI_VALID_DEVID) + case GVM_IRQ_ROUTING_MSI: + if (ue->flags & ~GVM_MSI_VALID_DEVID) goto free_entry; break; default: @@ -227,9 +230,7 @@ int kvm_set_irq_routing(struct kvm *kvm, mutex_lock(&kvm->irq_lock); old = kvm->irq_routing; - rcu_assign_pointer(kvm->irq_routing, new); - kvm_irq_routing_update(kvm); - kvm_arch_irq_routing_update(kvm); + kvm->irq_routing = new; mutex_unlock(&kvm->irq_lock); kvm_arch_post_irq_routing_update(kvm); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 5c36034..e521da2 100644..100755 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -6,6 +6,7 @@ * * Copyright (C) 2006 Qumranet, Inc. * Copyright 2010 Red Hat, Inc. and/or its affiliates. 
+ * Copyright 2019 Google LLC * * Authors: * Avi Kivity <avi@qumranet.com> @@ -19,68 +20,13 @@ #include <kvm/iodev.h> #include <linux/kvm_host.h> -#include <linux/kvm.h> -#include <linux/module.h> -#include <linux/errno.h> -#include <linux/percpu.h> -#include <linux/mm.h> -#include <linux/miscdevice.h> -#include <linux/vmalloc.h> -#include <linux/reboot.h> -#include <linux/debugfs.h> -#include <linux/highmem.h> -#include <linux/file.h> -#include <linux/syscore_ops.h> -#include <linux/cpu.h> -#include <linux/sched.h> -#include <linux/cpumask.h> -#include <linux/smp.h> -#include <linux/anon_inodes.h> -#include <linux/profile.h> -#include <linux/kvm_para.h> -#include <linux/pagemap.h> -#include <linux/mman.h> -#include <linux/swap.h> -#include <linux/bitops.h> -#include <linux/spinlock.h> -#include <linux/compat.h> -#include <linux/srcu.h> -#include <linux/hugetlb.h> -#include <linux/slab.h> -#include <linux/sort.h> -#include <linux/bsearch.h> - -#include <asm/processor.h> -#include <asm/io.h> -#include <asm/ioctl.h> -#include <asm/uaccess.h> -#include <asm/pgtable.h> - -#include "coalesced_mmio.h" -#include "async_pf.h" -#include "vfio.h" - -#define CREATE_TRACE_POINTS -#include <trace/events/kvm.h> +#include <uapi/linux/kvm.h> +#include <ntkrutils.h> +#include <gvm-main.h> /* Worst case buffer size needed for holding an integer. */ #define ITOA_MAX_LEN 12 -MODULE_AUTHOR("Qumranet"); -MODULE_LICENSE("GPL"); - -/* Architectures should define their poll value according to the halt latency */ -static unsigned int halt_poll_ns = KVM_HALT_POLL_NS_DEFAULT; -module_param(halt_poll_ns, uint, S_IRUGO | S_IWUSR); - -/* Default doubles per-vcpu halt_poll_ns. */ -static unsigned int halt_poll_ns_grow = 2; -module_param(halt_poll_ns_grow, uint, S_IRUGO | S_IWUSR); - -/* Default resets per-vcpu halt_poll_ns . */ -static unsigned int halt_poll_ns_shrink; -module_param(halt_poll_ns_shrink, uint, S_IRUGO | S_IWUSR); - /* * Ordering of locks: * @@ -90,47 +36,129 @@ module_param(halt_poll_ns_shrink, uint, S_IRUGO | S_IWUSR); DEFINE_SPINLOCK(kvm_lock); static DEFINE_RAW_SPINLOCK(kvm_count_lock); LIST_HEAD(vm_list); +static LONG64 global_vm_id = -1; static cpumask_var_t cpus_hardware_enabled; static int kvm_usage_count; static atomic_t hardware_enable_failed; struct kmem_cache *kvm_vcpu_cache; -EXPORT_SYMBOL_GPL(kvm_vcpu_cache); - -static __read_mostly struct preempt_ops kvm_preempt_ops; - -struct dentry *kvm_debugfs_dir; -EXPORT_SYMBOL_GPL(kvm_debugfs_dir); -static int kvm_debugfs_num_entries; -static const struct file_operations *stat_fops_per_vm[]; - -static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl, - unsigned long arg); -#ifdef CONFIG_KVM_COMPAT -static long kvm_vcpu_compat_ioctl(struct file *file, unsigned int ioctl, - unsigned long arg); -#endif static int hardware_enable_all(void); static void hardware_disable_all(void); static void kvm_io_bus_destroy(struct kvm_io_bus *bus); -static void kvm_release_pfn_dirty(kvm_pfn_t pfn); +void kvm_release_pfn_dirty(kvm_pfn_t pfn); static void mark_page_dirty_in_slot(struct kvm_memory_slot *memslot, gfn_t gfn); -__visible bool kvm_rebooting; -EXPORT_SYMBOL_GPL(kvm_rebooting); - -static bool largepages_enabled = true; +/* +* bsearch - binary search an array of elements +* @key: pointer to item being searched for +* @base: pointer to first element to search +* @num: number of elements +* @size: size of each element +* @cmp: pointer to comparison function +* +* This function does a binary search on the given array. 
The +* contents of the array should already be in ascending sorted order +* under the provided comparison function. +* +* Note that the key need not have the same type as the elements in +* the array, e.g. key could be a string and the comparison function +* could compare the string with the struct's name field. However, if +* the key and elements in the array are of the same type, you can use +* the same comparison function for both sort() and bsearch(). +*/ +void *bsearch(const void *key, const void *base, size_t num, size_t size, + int(*cmp)(const void *key, const void *elt)) +{ + size_t start = 0, end = num; + int result; + const char *__base = base; + + while (start < end) { + size_t mid = start + (end - start) / 2; + + result = cmp(key, __base + mid * size); + if (result < 0) + end = mid; + else if (result > 0) + start = mid + 1; + else + return (void *)(__base + mid * size); + } + + return NULL; +} + +static void generic_swap(void *a, void *b, int size) +{ + char t; + char *__a = a, *__b = b; + + do { + t = *__a; + *__a++ = *__b; + *__b++ = t; + } while (--size > 0); +} -bool kvm_is_reserved_pfn(kvm_pfn_t pfn) -{ - if (pfn_valid(pfn)) - return PageReserved(pfn_to_page(pfn)); +/** +* sort - sort an array of elements +* @base: pointer to data to sort +* @num: number of elements +* @size: size of each element +* @cmp_func: pointer to comparison function +* @swap_func: pointer to swap function or NULL +* +* This function does a heapsort on the given array. You may provide a +* swap_func function optimized to your element type. +* +* Sorting time is O(n log n) both on average and worst-case. While +* qsort is about 20% faster on average, it suffers from exploitable +* O(n*n) worst-case behavior and extra memory requirements that make +* it less suitable for kernel use. 
+*/ + +static void sort(void *base, size_t num, size_t size, + int(*cmp_func)(const void *, const void *), + void(*swap_func)(void *, void *, int size)) +{ + /* pre-scale counters for performance */ + int i = (num / 2 - 1) * size, n = num * size, c, r; + char *__base = base; + + if (!swap_func) { + swap_func = generic_swap; +} + + /* heapify */ + for (; i >= 0; i -= size) { + for (r = i; r * 2 + size < n; r = c) { + c = r * 2 + size; + if (c < n - size && + cmp_func(__base + c, __base + c + size) < 0) + c += size; + if (cmp_func(__base + r, __base + c) >= 0) + break; + swap_func(__base + r, __base + c, size); + } + } - return true; + /* sort */ + for (i = n - size; i > 0; i -= size) { + swap_func(__base, __base + i, size); + for (r = 0; r * 2 + size < i; r = c) { + c = r * 2 + size; + if (c < i - size && + cmp_func(__base + c, __base + c + size) < 0) + c += size; + if (cmp_func(__base + r, __base + c) >= 0) + break; + swap_func(__base + r, __base + c, size); + } + } } /* @@ -140,27 +168,20 @@ int vcpu_load(struct kvm_vcpu *vcpu) { int cpu; - if (mutex_lock_killable(&vcpu->mutex)) - return -EINTR; + mutex_lock(&vcpu->mutex); cpu = get_cpu(); - preempt_notifier_register(&vcpu->preempt_notifier); kvm_arch_vcpu_load(vcpu, cpu); - put_cpu(); - return 0; + return cpu; } -EXPORT_SYMBOL_GPL(vcpu_load); void vcpu_put(struct kvm_vcpu *vcpu) { - preempt_disable(); kvm_arch_vcpu_put(vcpu); - preempt_notifier_unregister(&vcpu->preempt_notifier); - preempt_enable(); + put_cpu(); mutex_unlock(&vcpu->mutex); } -EXPORT_SYMBOL_GPL(vcpu_put); -static void ack_flush(void *_completed) +void ack_flush(void *_completed) { } @@ -173,7 +194,7 @@ bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req) zalloc_cpumask_var(&cpus, GFP_ATOMIC); - me = get_cpu(); + me = smp_processor_id(); kvm_for_each_vcpu(i, vcpu, kvm) { kvm_make_request(req, vcpu); cpu = vcpu->cpu; @@ -191,19 +212,18 @@ bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req) smp_call_function_many(cpus, ack_flush, NULL, 1); else called = false; - put_cpu(); free_cpumask_var(cpus); return called; } -#ifndef CONFIG_HAVE_KVM_ARCH_TLB_FLUSH_ALL void kvm_flush_remote_tlbs(struct kvm *kvm) { /* - * Read tlbs_dirty before setting KVM_REQ_TLB_FLUSH in + * Read tlbs_dirty before setting GVM_REQ_TLB_FLUSH in * kvm_make_all_cpus_request. */ - long dirty_count = smp_load_acquire(&kvm->tlbs_dirty); + long dirty_count; + READ_ONCE(kvm->tlbs_dirty, dirty_count); /* * We want to publish modifications to the page tables before reading @@ -216,66 +236,65 @@ void kvm_flush_remote_tlbs(struct kvm *kvm) * kvm_make_all_cpus_request() reads vcpu->mode. We reuse that * barrier here. 
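Since the Windows build cannot pull in lib/bsearch.c and lib/sort.c, both helpers are inlined above. A small usage sketch, assuming visibility within this translation unit (both helpers are file-static in kvm_main.c):

    /* cmp() follows the key-vs-element convention described in the bsearch()
     * comment; for same-typed keys and elements it can be shared with sort(). */
    static int cmp_int(const void *a, const void *b)
    {
        int x = *(const int *)a;
        int y = *(const int *)b;

        return (x > y) - (x < y);
    }

    static void sort_and_search_example(void)
    {
        int vals[] = { 7, 3, 9, 1 };
        int key = 9;
        int *hit;

        sort(vals, 4, sizeof(int), cmp_int, NULL);          /* 1 3 7 9 */
        hit = bsearch(&key, vals, 4, sizeof(int), cmp_int); /* &vals[3] */
        (void)hit;
    }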
*/ - if (kvm_make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH)) + if (kvm_make_all_cpus_request(kvm, GVM_REQ_TLB_FLUSH)) ++kvm->stat.remote_tlb_flush; cmpxchg(&kvm->tlbs_dirty, dirty_count, 0); } -EXPORT_SYMBOL_GPL(kvm_flush_remote_tlbs); -#endif void kvm_reload_remote_mmus(struct kvm *kvm) { - kvm_make_all_cpus_request(kvm, KVM_REQ_MMU_RELOAD); + kvm_make_all_cpus_request(kvm, GVM_REQ_MMU_RELOAD); } int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id) { - struct page *page; int r; mutex_init(&vcpu->mutex); vcpu->cpu = -1; vcpu->kvm = kvm; vcpu->vcpu_id = id; - vcpu->pid = NULL; - init_swait_queue_head(&vcpu->wq); - kvm_async_pf_vcpu_init(vcpu); + vcpu->thread = NULL; vcpu->pre_pcpu = -1; INIT_LIST_HEAD(&vcpu->blocked_vcpu_list); - page = alloc_page(GFP_KERNEL | __GFP_ZERO); - if (!page) { + /* + * KVM(Lin) allocates two seperate pages for vcpu->run and MMIO Emulation page + * vcpu->arch.piodata. These two pages will be mapped to userland as continuous + * virtual address space. Linux API allows to do that but I did not find a + * Windows equivalent API. So keep the physical pages also continuous. + */ + vcpu->run = ExAllocatePoolWithTag(NonPagedPool, 2 * PAGE_SIZE, GVM_POOL_TAG); + if (!vcpu->run) { r = -ENOMEM; goto fail; } - vcpu->run = page_address(page); - kvm_vcpu_set_in_spin_loop(vcpu, false); - kvm_vcpu_set_dy_eligible(vcpu, false); vcpu->preempted = false; + KeInitializeEvent(&vcpu->kick_event, SynchronizationEvent, FALSE); + r = kvm_arch_vcpu_init(vcpu); if (r < 0) goto fail_free_run; return 0; fail_free_run: - free_page((unsigned long)vcpu->run); + ExFreePoolWithTag(vcpu->run, GVM_POOL_TAG); fail: return r; } -EXPORT_SYMBOL_GPL(kvm_vcpu_init); void kvm_vcpu_uninit(struct kvm_vcpu *vcpu) { - put_pid(vcpu->pid); kvm_arch_vcpu_uninit(vcpu); - free_page((unsigned long)vcpu->run); + if (vcpu->run_userva) + __vm_munmap(vcpu->run_userva, 2 * PAGE_SIZE, false); + ExFreePoolWithTag(vcpu->run, GVM_POOL_TAG); } -EXPORT_SYMBOL_GPL(kvm_vcpu_uninit); -#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER) +#if defined(CONFIG_MMU_NOTIFIER) && defined(GVM_ARCH_WANT_MMU_NOTIFIER) static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn) { return container_of(mn, struct kvm, mmu_notifier); @@ -283,7 +302,7 @@ static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn) static void kvm_mmu_notifier_invalidate_page(struct mmu_notifier *mn, struct mm_struct *mm, - unsigned long address) + size_t address) { struct kvm *kvm = mmu_notifier_to_kvm(mn); int need_tlb_flush, idx; @@ -324,7 +343,7 @@ static void kvm_mmu_notifier_invalidate_page(struct mmu_notifier *mn, static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn, struct mm_struct *mm, - unsigned long address, + size_t address, pte_t pte) { struct kvm *kvm = mmu_notifier_to_kvm(mn); @@ -340,8 +359,8 @@ static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn, static void kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn, struct mm_struct *mm, - unsigned long start, - unsigned long end) + size_t start, + size_t end) { struct kvm *kvm = mmu_notifier_to_kvm(mn); int need_tlb_flush = 0, idx; @@ -366,8 +385,8 @@ static void kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn, static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn, struct mm_struct *mm, - unsigned long start, - unsigned long end) + size_t start, + size_t end) { struct kvm *kvm = mmu_notifier_to_kvm(mn); @@ -392,8 +411,8 @@ static void 
kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn, static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn, struct mm_struct *mm, - unsigned long start, - unsigned long end) + size_t start, + size_t end) { struct kvm *kvm = mmu_notifier_to_kvm(mn); int young, idx; @@ -413,8 +432,8 @@ static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn, static int kvm_mmu_notifier_clear_young(struct mmu_notifier *mn, struct mm_struct *mm, - unsigned long start, - unsigned long end) + size_t start, + size_t end) { struct kvm *kvm = mmu_notifier_to_kvm(mn); int young, idx; @@ -443,7 +462,7 @@ static int kvm_mmu_notifier_clear_young(struct mmu_notifier *mn, static int kvm_mmu_notifier_test_young(struct mmu_notifier *mn, struct mm_struct *mm, - unsigned long address) + size_t address) { struct kvm *kvm = mmu_notifier_to_kvm(mn); int young, idx; @@ -485,16 +504,16 @@ static int kvm_init_mmu_notifier(struct kvm *kvm) return mmu_notifier_register(&kvm->mmu_notifier, current->mm); } -#else /* !(CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER) */ +#else /* !(CONFIG_MMU_NOTIFIER && GVM_ARCH_WANT_MMU_NOTIFIER) */ static int kvm_init_mmu_notifier(struct kvm *kvm) { return 0; } -#endif /* CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER */ +#endif /* CONFIG_MMU_NOTIFIER && GVM_ARCH_WANT_MMU_NOTIFIER */ -static struct kvm_memslots *kvm_alloc_memslots(void) +static struct kvm_memslots *kvm_alloc_memslots(struct kvm *kvm) { int i; struct kvm_memslots *slots; @@ -508,8 +527,10 @@ static struct kvm_memslots *kvm_alloc_memslots(void) * code of handling generation number wrap-around. */ slots->generation = -150; - for (i = 0; i < KVM_MEM_SLOTS_NUM; i++) + for (i = 0; i < GVM_MEM_SLOTS_NUM; i++) { slots->id_to_index[i] = slots->memslots[i].id = i; + slots->memslots[i].kvm = kvm; + } return slots; } @@ -529,9 +550,27 @@ static void kvm_destroy_dirty_bitmap(struct kvm_memory_slot *memslot) static void kvm_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free, struct kvm_memory_slot *dont) { + struct pmem_lock *pl; + int i; + if (!dont || free->dirty_bitmap != dont->dirty_bitmap) kvm_destroy_dirty_bitmap(free); + if (!dont || free->pmem_lock != dont->pmem_lock) + if (free->pmem_lock) { + for (i = 0; i < free->npages; i++) { + pl = &free->pmem_lock[i]; + if (!pl->lock_mdl) + continue; + spin_lock(&pl->lock); + MmUnlockPages(pl->lock_mdl); + IoFreeMdl(pl->lock_mdl); + pl->lock_mdl = NULL; + spin_unlock(&pl->lock); + } + kfree(free->pmem_lock); + } + kvm_arch_free_memslot(kvm, free, dont); free->npages = 0; @@ -550,61 +589,7 @@ static void kvm_free_memslots(struct kvm *kvm, struct kvm_memslots *slots) kvfree(slots); } -static void kvm_destroy_vm_debugfs(struct kvm *kvm) -{ - int i; - - if (!kvm->debugfs_dentry) - return; - - debugfs_remove_recursive(kvm->debugfs_dentry); - - if (kvm->debugfs_stat_data) { - for (i = 0; i < kvm_debugfs_num_entries; i++) - kfree(kvm->debugfs_stat_data[i]); - kfree(kvm->debugfs_stat_data); - } -} - -static int kvm_create_vm_debugfs(struct kvm *kvm, int fd) -{ - char dir_name[ITOA_MAX_LEN * 2]; - struct kvm_stat_data *stat_data; - struct kvm_stats_debugfs_item *p; - - if (!debugfs_initialized()) - return 0; - - snprintf(dir_name, sizeof(dir_name), "%d-%d", task_pid_nr(current), fd); - kvm->debugfs_dentry = debugfs_create_dir(dir_name, - kvm_debugfs_dir); - if (!kvm->debugfs_dentry) - return -ENOMEM; - - kvm->debugfs_stat_data = kcalloc(kvm_debugfs_num_entries, - sizeof(*kvm->debugfs_stat_data), - GFP_KERNEL); - if (!kvm->debugfs_stat_data) - return 
-ENOMEM; - - for (p = debugfs_entries; p->name; p++) { - stat_data = kzalloc(sizeof(*stat_data), GFP_KERNEL); - if (!stat_data) - return -ENOMEM; - - stat_data->kvm = kvm; - stat_data->offset = p->offset; - kvm->debugfs_stat_data[p - debugfs_entries] = stat_data; - if (!debugfs_create_file(p->name, 0444, - kvm->debugfs_dentry, - stat_data, - stat_fops_per_vm[p->kind])) - return -ENOMEM; - } - return 0; -} - -static struct kvm *kvm_create_vm(unsigned long type) +static struct kvm *kvm_create_vm(size_t type) { int r, i; struct kvm *kvm = kvm_arch_alloc_vm(); @@ -613,14 +598,12 @@ static struct kvm *kvm_create_vm(unsigned long type) return ERR_PTR(-ENOMEM); spin_lock_init(&kvm->mmu_lock); - atomic_inc(¤t->mm->mm_count); - kvm->mm = current->mm; - kvm_eventfd_init(kvm); + kvm->process = IoGetCurrentProcess(); + kvm->vm_id = InterlockedIncrement64(&global_vm_id); mutex_init(&kvm->lock); mutex_init(&kvm->irq_lock); mutex_init(&kvm->slots_lock); atomic_set(&kvm->users_count, 1); - INIT_LIST_HEAD(&kvm->devices); r = kvm_arch_init_vm(kvm, type); if (r) @@ -630,15 +613,9 @@ static struct kvm *kvm_create_vm(unsigned long type) if (r) goto out_err_no_disable; -#ifdef CONFIG_HAVE_KVM_IRQFD - INIT_HLIST_HEAD(&kvm->irq_ack_notifier_list); -#endif - - BUILD_BUG_ON(KVM_MEM_SLOTS_NUM > SHRT_MAX); - r = -ENOMEM; - for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { - kvm->memslots[i] = kvm_alloc_memslots(); + for (i = 0; i < GVM_ADDRESS_SPACE_NUM; i++) { + kvm->memslots[i] = kvm_alloc_memslots(kvm); if (!kvm->memslots[i]) goto out_err_no_srcu; } @@ -647,7 +624,7 @@ static struct kvm *kvm_create_vm(unsigned long type) goto out_err_no_srcu; if (init_srcu_struct(&kvm->irq_srcu)) goto out_err_no_irq_srcu; - for (i = 0; i < KVM_NR_BUSES; i++) { + for (i = 0; i < GVM_NR_BUSES; i++) { kvm->buses[i] = kzalloc(sizeof(struct kvm_io_bus), GFP_KERNEL); if (!kvm->buses[i]) @@ -662,8 +639,6 @@ static struct kvm *kvm_create_vm(unsigned long type) list_add(&kvm->vm_list, &vm_list); spin_unlock(&kvm_lock); - preempt_notifier_inc(); - return kvm; out_err: @@ -673,12 +648,11 @@ out_err_no_irq_srcu: out_err_no_srcu: hardware_disable_all(); out_err_no_disable: - for (i = 0; i < KVM_NR_BUSES; i++) + for (i = 0; i < GVM_NR_BUSES; i++) kfree(kvm->buses[i]); - for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) + for (i = 0; i < GVM_ADDRESS_SPACE_NUM; i++) kvm_free_memslots(kvm, kvm->memslots[i]); kvm_arch_free_vm(kvm); - mmdrop(current->mm); return ERR_PTR(r); } @@ -686,7 +660,7 @@ out_err_no_disable: * Avoid using vmalloc for a small buffer. * Should not be used when the size is statically known. */ -void *kvm_kvzalloc(unsigned long size) +void *kvm_kvzalloc(size_t size) { if (size > PAGE_SIZE) return vzalloc(size); @@ -694,71 +668,43 @@ void *kvm_kvzalloc(unsigned long size) return kzalloc(size, GFP_KERNEL); } -static void kvm_destroy_devices(struct kvm *kvm) -{ - struct kvm_device *dev, *tmp; - - /* - * We do not need to take the kvm->lock here, because nobody else - * has a reference to the struct kvm at this point and therefore - * cannot access the devices list anyhow. 
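kvm_create_vm() above drops the mm refcounting and instead records the owning process (IoGetCurrentProcess()) plus a driver-global VM id. A short sketch of the id scheme, with an illustrative wrapper name:

    /* global_vm_id starts at -1, so the first VM gets id 0, the next 1, and
     * so on; InterlockedIncrement64() makes the allocation atomic without
     * holding kvm_lock. The id is later handed to gvmCreateVMDevice() when
     * per-vCPU device objects are created. */
    static LONG64 global_vm_id = -1;

    static LONG64 alloc_vm_id(void)
    {
        return InterlockedIncrement64(&global_vm_id);
    }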
- */ - list_for_each_entry_safe(dev, tmp, &kvm->devices, vm_node) { - list_del(&dev->vm_node); - dev->ops->destroy(dev); - } -} - static void kvm_destroy_vm(struct kvm *kvm) { int i; - struct mm_struct *mm = kvm->mm; - kvm_destroy_vm_debugfs(kvm); - kvm_arch_sync_events(kvm); spin_lock(&kvm_lock); list_del(&kvm->vm_list); spin_unlock(&kvm_lock); kvm_free_irq_routing(kvm); - for (i = 0; i < KVM_NR_BUSES; i++) + for (i = 0; i < GVM_NR_BUSES; i++) kvm_io_bus_destroy(kvm->buses[i]); - kvm_coalesced_mmio_free(kvm); -#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER) - mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm); -#else kvm_arch_flush_shadow_all(kvm); -#endif kvm_arch_destroy_vm(kvm); - kvm_destroy_devices(kvm); - for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) + for (i = 0; i < GVM_ADDRESS_SPACE_NUM; i++) kvm_free_memslots(kvm, kvm->memslots[i]); + kfree(kvm->rp_bitmap); cleanup_srcu_struct(&kvm->irq_srcu); cleanup_srcu_struct(&kvm->srcu); kvm_arch_free_vm(kvm); - preempt_notifier_dec(); hardware_disable_all(); - mmdrop(mm); } void kvm_get_kvm(struct kvm *kvm) { atomic_inc(&kvm->users_count); } -EXPORT_SYMBOL_GPL(kvm_get_kvm); void kvm_put_kvm(struct kvm *kvm) { if (atomic_dec_and_test(&kvm->users_count)) kvm_destroy_vm(kvm); } -EXPORT_SYMBOL_GPL(kvm_put_kvm); -static int kvm_vm_release(struct inode *inode, struct file *filp) +NTSTATUS kvm_vm_release(PDEVICE_OBJECT pDevObj, PIRP pIrp) { - struct kvm *kvm = filp->private_data; - - kvm_irqfd_release(kvm); + struct gvm_device_extension *devext = pDevObj->DeviceExtension; + struct kvm *kvm = devext->PrivData; kvm_put_kvm(kvm); return 0; @@ -770,7 +716,7 @@ static int kvm_vm_release(struct inode *inode, struct file *filp) */ static int kvm_create_dirty_bitmap(struct kvm_memory_slot *memslot) { - unsigned long dirty_bytes = 2 * kvm_dirty_bitmap_bytes(memslot); + size_t dirty_bytes = 2 * kvm_dirty_bitmap_bytes(memslot); memslot->dirty_bitmap = kvm_kvzalloc(dirty_bytes); if (!memslot->dirty_bitmap) @@ -802,7 +748,7 @@ static void update_memslots(struct kvm_memslots *slots, slots->used_slots++; } - while (i < KVM_MEM_SLOTS_NUM - 1 && + while (i < GVM_MEM_SLOTS_NUM - 1 && new->base_gfn <= mslots[i + 1].base_gfn) { if (!mslots[i + 1].npages) break; @@ -836,10 +782,10 @@ static void update_memslots(struct kvm_memslots *slots, static int check_memory_region_flags(const struct kvm_userspace_memory_region *mem) { - u32 valid_flags = KVM_MEM_LOG_DIRTY_PAGES; + u32 valid_flags = GVM_MEM_LOG_DIRTY_PAGES; -#ifdef __KVM_HAVE_READONLY_MEM - valid_flags |= KVM_MEM_READONLY; +#ifdef __GVM_HAVE_READONLY_MEM + valid_flags |= GVM_MEM_READONLY; #endif if (mem->flags & ~valid_flags) @@ -888,7 +834,7 @@ int __kvm_set_memory_region(struct kvm *kvm, { int r; gfn_t base_gfn; - unsigned long npages; + size_t npages; struct kvm_memory_slot *slot; struct kvm_memory_slot old, new; struct kvm_memslots *slots = NULL, *old_memslots; @@ -908,14 +854,7 @@ int __kvm_set_memory_region(struct kvm *kvm, goto out; if (mem->guest_phys_addr & (PAGE_SIZE - 1)) goto out; - /* We can read the guest memory with __xxx_user() later on. 
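With the file_operations release path gone, kvm_vm_release() above is a WDM-style handler that recovers the struct kvm from the per-device extension. A hedged sketch of that pattern; the PrivData field matches the diff, while the IRP completion boilerplate is an assumption about how the dispatch wrapper finishes the request:

    static NTSTATUS example_vm_close(PDEVICE_OBJECT pDevObj, PIRP pIrp)
    {
        struct gvm_device_extension *devext = pDevObj->DeviceExtension;
        struct kvm *kvm = devext->PrivData;

        /* drop the reference taken when the VM device was created */
        kvm_put_kvm(kvm);

        pIrp->IoStatus.Status = STATUS_SUCCESS;
        pIrp->IoStatus.Information = 0;
        IoCompleteRequest(pIrp, IO_NO_INCREMENT);
        return STATUS_SUCCESS;
    }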
*/ - if ((id < KVM_USER_MEM_SLOTS) && - ((mem->userspace_addr & (PAGE_SIZE - 1)) || - !access_ok(VERIFY_WRITE, - (void __user *)(unsigned long)mem->userspace_addr, - mem->memory_size))) - goto out; - if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_MEM_SLOTS_NUM) + if (as_id >= GVM_ADDRESS_SPACE_NUM || id >= GVM_MEM_SLOTS_NUM) goto out; if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr) goto out; @@ -924,7 +863,7 @@ int __kvm_set_memory_region(struct kvm *kvm, base_gfn = mem->guest_phys_addr >> PAGE_SHIFT; npages = mem->memory_size >> PAGE_SHIFT; - if (npages > KVM_MEM_MAX_NR_PAGES) + if (npages > GVM_MEM_MAX_NR_PAGES) goto out; new = old = *slot; @@ -936,17 +875,17 @@ int __kvm_set_memory_region(struct kvm *kvm, if (npages) { if (!old.npages) - change = KVM_MR_CREATE; + change = GVM_MR_CREATE; else { /* Modify an existing slot. */ if ((mem->userspace_addr != old.userspace_addr) || (npages != old.npages) || - ((new.flags ^ old.flags) & KVM_MEM_READONLY)) + ((new.flags ^ old.flags) & GVM_MEM_READONLY)) goto out; if (base_gfn != old.base_gfn) - change = KVM_MR_MOVE; + change = GVM_MR_MOVE; else if (new.flags != old.flags) - change = KVM_MR_FLAGS_ONLY; + change = GVM_MR_FLAGS_ONLY; else { /* Nothing to change. */ r = 0; goto out; @@ -956,16 +895,16 @@ int __kvm_set_memory_region(struct kvm *kvm, if (!old.npages) goto out; - change = KVM_MR_DELETE; + change = GVM_MR_DELETE; new.base_gfn = 0; new.flags = 0; } - if ((change == KVM_MR_CREATE) || (change == KVM_MR_MOVE)) { + if ((change == GVM_MR_CREATE) || (change == GVM_MR_MOVE)) { /* Check for overlaps */ r = -EEXIST; kvm_for_each_memslot(slot, __kvm_memslots(kvm, as_id)) { - if ((slot->id >= KVM_USER_MEM_SLOTS) || + if ((slot->id >= GVM_USER_MEM_SLOTS) || (slot->id == id)) continue; if (!((base_gfn + npages <= slot->base_gfn) || @@ -975,36 +914,43 @@ int __kvm_set_memory_region(struct kvm *kvm, } /* Free page dirty bitmap if unneeded */ - if (!(new.flags & KVM_MEM_LOG_DIRTY_PAGES)) + if (!(new.flags & GVM_MEM_LOG_DIRTY_PAGES)) new.dirty_bitmap = NULL; r = -ENOMEM; - if (change == KVM_MR_CREATE) { + if (change == GVM_MR_CREATE) { new.userspace_addr = mem->userspace_addr; if (kvm_arch_create_memslot(kvm, &new, npages)) goto out_free; + } /* Allocate page dirty bitmap if needed */ - if ((new.flags & KVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) { + if ((new.flags & GVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) { if (kvm_create_dirty_bitmap(&new) < 0) goto out_free; } + /* Allocate physical page pinning data structure */ + if (!new.pmem_lock) { + new.pmem_lock = + kzalloc(sizeof(struct pmem_lock) * new.npages, GFP_KERNEL); + if (!new.pmem_lock) + goto out_free; + } + slots = kvm_kvzalloc(sizeof(struct kvm_memslots)); if (!slots) goto out_free; memcpy(slots, __kvm_memslots(kvm, as_id), sizeof(struct kvm_memslots)); - if ((change == KVM_MR_DELETE) || (change == KVM_MR_MOVE)) { + if ((change == GVM_MR_DELETE) || (change == GVM_MR_MOVE)) { slot = id_to_memslot(slots, id); - slot->flags |= KVM_MEMSLOT_INVALID; + slot->flags |= GVM_MEMSLOT_INVALID; old_memslots = install_new_memslots(kvm, as_id, slots); - /* slot was deleted or moved, clear iommu mapping */ - kvm_iommu_unmap_pages(kvm, &old); /* From this point no new shadow pages pointing to a deleted, * or moved, memslot will be created. 
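__kvm_set_memory_region() now allocates one pmem_lock per guest page of the slot. The structure itself is not shown in this file; from its uses here (the spin lock taken around lock_mdl in kvm_free_memslot() and in the pinning path further down), a plausible shape is:

    /* Inferred layout, not taken from the headers in this commit: */
    struct pmem_lock {
        spinlock_t lock;    /* serializes pinning and unpinning of one page */
        PMDL lock_mdl;      /* MDL for the pinned page, NULL while unpinned */
    };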
* @@ -1027,8 +973,9 @@ int __kvm_set_memory_region(struct kvm *kvm, goto out_slots; /* actual memory is freed via old in kvm_free_memslot below */ - if (change == KVM_MR_DELETE) { + if (change == GVM_MR_DELETE) { new.dirty_bitmap = NULL; + new.pmem_lock = NULL; memset(&new.arch, 0, sizeof(new.arch)); } @@ -1040,20 +987,6 @@ int __kvm_set_memory_region(struct kvm *kvm, kvm_free_memslot(kvm, &old, &new); kvfree(old_memslots); - /* - * IOMMU mapping: New slots need to be mapped. Old slots need to be - * un-mapped and re-mapped if their base changes. Since base change - * unmapping is handled above with slot deletion, mapping alone is - * needed here. Anything else the iommu might care about for existing - * slots (size changes, userspace addr changes and read-only flag - * changes) is disallowed above, so any other attribute changes getting - * here can be skipped. - */ - if ((change == KVM_MR_CREATE) || (change == KVM_MR_MOVE)) { - r = kvm_iommu_map_pages(kvm, &new); - return r; - } - return 0; out_slots: @@ -1063,7 +996,6 @@ out_free: out: return r; } -EXPORT_SYMBOL_GPL(__kvm_set_memory_region); int kvm_set_memory_region(struct kvm *kvm, const struct kvm_userspace_memory_region *mem) @@ -1075,12 +1007,11 @@ int kvm_set_memory_region(struct kvm *kvm, mutex_unlock(&kvm->slots_lock); return r; } -EXPORT_SYMBOL_GPL(kvm_set_memory_region); static int kvm_vm_ioctl_set_memory_region(struct kvm *kvm, struct kvm_userspace_memory_region *mem) { - if ((u16)mem->slot >= KVM_USER_MEM_SLOTS) + if ((u16)mem->slot >= GVM_USER_MEM_SLOTS) return -EINVAL; return kvm_set_memory_region(kvm, mem); @@ -1092,13 +1023,13 @@ int kvm_get_dirty_log(struct kvm *kvm, struct kvm_memslots *slots; struct kvm_memory_slot *memslot; int r, i, as_id, id; - unsigned long n; - unsigned long any = 0; + size_t n; + size_t any = 0; r = -EINVAL; as_id = log->slot >> 16; id = (u16)log->slot; - if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS) + if (as_id >= GVM_ADDRESS_SPACE_NUM || id >= GVM_USER_MEM_SLOTS) goto out; slots = __kvm_memslots(kvm, as_id); @@ -1113,7 +1044,7 @@ int kvm_get_dirty_log(struct kvm *kvm, any = memslot->dirty_bitmap[i]; r = -EFAULT; - if (copy_to_user(log->dirty_bitmap, memslot->dirty_bitmap, n)) + if ( __copy_to_user(log->dirty_bitmap, memslot->dirty_bitmap, n)) goto out; if (any) @@ -1123,9 +1054,7 @@ int kvm_get_dirty_log(struct kvm *kvm, out: return r; } -EXPORT_SYMBOL_GPL(kvm_get_dirty_log); -#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT /** * kvm_get_dirty_log_protect - get a snapshot of dirty pages, and if any pages * are dirty write protect them for next write. 
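kvm_create_dirty_bitmap() (earlier in this file) sizes the allocation at twice kvm_dirty_bitmap_bytes(); the second half is the scratch snapshot that kvm_get_dirty_log_protect(), whose body follows, copies to userspace after write-protecting the dirty pages. A small layout sketch:

    static size_t *dirty_bitmap_snapshot(struct kvm_memory_slot *memslot)
    {
        size_t n = kvm_dirty_bitmap_bytes(memslot);
        size_t *live = (size_t *)memslot->dirty_bitmap; /* set via mark_page_dirty_in_slot() */

        /* second half of the 2x allocation made in kvm_create_dirty_bitmap() */
        return live + n / sizeof(size_t);
    }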
@@ -1154,14 +1083,14 @@ int kvm_get_dirty_log_protect(struct kvm *kvm, struct kvm_memslots *slots; struct kvm_memory_slot *memslot; int r, i, as_id, id; - unsigned long n; - unsigned long *dirty_bitmap; - unsigned long *dirty_bitmap_buffer; + size_t n; + size_t *dirty_bitmap; + size_t *dirty_bitmap_buffer; r = -EINVAL; as_id = log->slot >> 16; id = (u16)log->slot; - if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS) + if (as_id >= GVM_ADDRESS_SPACE_NUM || id >= GVM_USER_MEM_SLOTS) goto out; slots = __kvm_memslots(kvm, as_id); @@ -1174,13 +1103,13 @@ int kvm_get_dirty_log_protect(struct kvm *kvm, n = kvm_dirty_bitmap_bytes(memslot); - dirty_bitmap_buffer = dirty_bitmap + n / sizeof(long); + dirty_bitmap_buffer = dirty_bitmap + n / sizeof(size_t); memset(dirty_bitmap_buffer, 0, n); spin_lock(&kvm->mmu_lock); *is_dirty = false; - for (i = 0; i < n / sizeof(long); i++) { - unsigned long mask; + for (i = 0; i < n / sizeof(size_t); i++) { + size_t mask; gfn_t offset; if (!dirty_bitmap[i]) @@ -1201,32 +1130,18 @@ int kvm_get_dirty_log_protect(struct kvm *kvm, spin_unlock(&kvm->mmu_lock); r = -EFAULT; - if (copy_to_user(log->dirty_bitmap, dirty_bitmap_buffer, n)) + if ( __copy_to_user(log->dirty_bitmap, dirty_bitmap_buffer, n)) goto out; r = 0; out: return r; } -EXPORT_SYMBOL_GPL(kvm_get_dirty_log_protect); -#endif - -bool kvm_largepages_enabled(void) -{ - return largepages_enabled; -} - -void kvm_disable_largepages(void) -{ - largepages_enabled = false; -} -EXPORT_SYMBOL_GPL(kvm_disable_largepages); struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn) { return __gfn_to_memslot(kvm_memslots(kvm), gfn); } -EXPORT_SYMBOL_GPL(gfn_to_memslot); struct kvm_memory_slot *kvm_vcpu_gfn_to_memslot(struct kvm_vcpu *vcpu, gfn_t gfn) { @@ -1237,51 +1152,31 @@ bool kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn) { struct kvm_memory_slot *memslot = gfn_to_memslot(kvm, gfn); - if (!memslot || memslot->id >= KVM_USER_MEM_SLOTS || - memslot->flags & KVM_MEMSLOT_INVALID) + if (!memslot || memslot->id >= GVM_USER_MEM_SLOTS || + memslot->flags & GVM_MEMSLOT_INVALID) return false; return true; } -EXPORT_SYMBOL_GPL(kvm_is_visible_gfn); -unsigned long kvm_host_page_size(struct kvm *kvm, gfn_t gfn) +size_t kvm_host_page_size(struct kvm *kvm, gfn_t gfn) { - struct vm_area_struct *vma; - unsigned long addr, size; - - size = PAGE_SIZE; - - addr = gfn_to_hva(kvm, gfn); - if (kvm_is_error_hva(addr)) - return PAGE_SIZE; - - down_read(¤t->mm->mmap_sem); - vma = find_vma(current->mm, addr); - if (!vma) - goto out; - - size = vma_kernel_pagesize(vma); - -out: - up_read(¤t->mm->mmap_sem); - - return size; + return PAGE_SIZE; } static bool memslot_is_readonly(struct kvm_memory_slot *slot) { - return slot->flags & KVM_MEM_READONLY; + return slot->flags & GVM_MEM_READONLY; } -static unsigned long __gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn, +static size_t __gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn, gfn_t *nr_pages, bool write) { - if (!slot || slot->flags & KVM_MEMSLOT_INVALID) - return KVM_HVA_ERR_BAD; + if (!slot || slot->flags & GVM_MEMSLOT_INVALID) + return GVM_HVA_ERR_BAD; if (memslot_is_readonly(slot) && write) - return KVM_HVA_ERR_RO_BAD; + return GVM_HVA_ERR_RO_BAD; if (nr_pages) *nr_pages = slot->npages - (gfn - slot->base_gfn); @@ -1289,39 +1184,36 @@ static unsigned long __gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn, return __gfn_to_hva_memslot(slot, gfn); } -static unsigned long gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn, +static size_t 
gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn, gfn_t *nr_pages) { return __gfn_to_hva_many(slot, gfn, nr_pages, true); } -unsigned long gfn_to_hva_memslot(struct kvm_memory_slot *slot, +size_t gfn_to_hva_memslot(struct kvm_memory_slot *slot, gfn_t gfn) { return gfn_to_hva_many(slot, gfn, NULL); } -EXPORT_SYMBOL_GPL(gfn_to_hva_memslot); -unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn) +size_t gfn_to_hva(struct kvm *kvm, gfn_t gfn) { return gfn_to_hva_many(gfn_to_memslot(kvm, gfn), gfn, NULL); } -EXPORT_SYMBOL_GPL(gfn_to_hva); -unsigned long kvm_vcpu_gfn_to_hva(struct kvm_vcpu *vcpu, gfn_t gfn) +size_t kvm_vcpu_gfn_to_hva(struct kvm_vcpu *vcpu, gfn_t gfn) { return gfn_to_hva_many(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn, NULL); } -EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_hva); /* * If writable is set to false, the hva returned by this function is only * allowed to be read. */ -unsigned long gfn_to_hva_memslot_prot(struct kvm_memory_slot *slot, +size_t gfn_to_hva_memslot_prot(struct kvm_memory_slot *slot, gfn_t gfn, bool *writable) { - unsigned long hva = __gfn_to_hva_many(slot, gfn, NULL, false); + size_t hva = __gfn_to_hva_many(slot, gfn, NULL, false); if (!kvm_is_error_hva(hva) && writable) *writable = !memslot_is_readonly(slot); @@ -1329,177 +1221,39 @@ unsigned long gfn_to_hva_memslot_prot(struct kvm_memory_slot *slot, return hva; } -unsigned long gfn_to_hva_prot(struct kvm *kvm, gfn_t gfn, bool *writable) +size_t gfn_to_hva_prot(struct kvm *kvm, gfn_t gfn, bool *writable) { struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn); return gfn_to_hva_memslot_prot(slot, gfn, writable); } -unsigned long kvm_vcpu_gfn_to_hva_prot(struct kvm_vcpu *vcpu, gfn_t gfn, bool *writable) +size_t kvm_vcpu_gfn_to_hva_prot(struct kvm_vcpu *vcpu, gfn_t gfn, bool *writable) { struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); return gfn_to_hva_memslot_prot(slot, gfn, writable); } -static int get_user_page_nowait(unsigned long start, int write, - struct page **page) -{ - int flags = FOLL_NOWAIT | FOLL_HWPOISON; - - if (write) - flags |= FOLL_WRITE; - - return get_user_pages(start, 1, flags, page, NULL); -} - -static inline int check_user_page_hwpoison(unsigned long addr) -{ - int rc, flags = FOLL_HWPOISON | FOLL_WRITE; - - rc = get_user_pages(addr, 1, flags, NULL, NULL); - return rc == -EHWPOISON; -} - /* * The atomic path to get the writable pfn which will be stored in @pfn, * true indicates success, otherwise false is returned. */ -static bool hva_to_pfn_fast(unsigned long addr, bool atomic, bool *async, +static bool __hva_to_pfn(size_t addr, bool write_fault, bool *writable, kvm_pfn_t *pfn) { - struct page *page[1]; - int npages; - - if (!(async || atomic)) - return false; - - /* - * Fast pin a writable pfn only if it is a write fault request - * or the caller allows to map a writable pfn for a read fault - * request. - */ - if (!(write_fault || writable)) - return false; - - npages = __get_user_pages_fast(addr, 1, 1, page); - if (npages == 1) { - *pfn = page_to_pfn(page[0]); - - if (writable) - *writable = true; - return true; - } - - return false; -} - -/* - * The slow path to get the pfn of the specified host virtual address, - * 1 indicates success, -errno is returned if error is detected. 
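All of the gfn_to_hva*() wrappers above bottom out in the same arithmetic. Assuming this port keeps the upstream __gfn_to_hva_memslot() from kvm_host.h, the conversion is:

    static inline size_t example_gfn_to_hva(struct kvm_memory_slot *slot,
                                            gfn_t gfn)
    {
        return slot->userspace_addr + (gfn - slot->base_gfn) * PAGE_SIZE;
    }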
- */ -static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault, - bool *writable, kvm_pfn_t *pfn) -{ - struct page *page[1]; - int npages = 0; - - might_sleep(); - if (writable) *writable = write_fault; - if (async) { - down_read(¤t->mm->mmap_sem); - npages = get_user_page_nowait(addr, write_fault, page); - up_read(¤t->mm->mmap_sem); - } else { - unsigned int flags = FOLL_TOUCH | FOLL_HWPOISON; - - if (write_fault) - flags |= FOLL_WRITE; - - npages = __get_user_pages_unlocked(current, current->mm, addr, 1, - page, flags); - } - if (npages != 1) - return npages; - /* map read fault as writable if possible */ - if (unlikely(!write_fault) && writable) { - struct page *wpage[1]; - - npages = __get_user_pages_fast(addr, 1, 1, wpage); - if (npages == 1) { - *writable = true; - put_page(page[0]); - page[0] = wpage[0]; - } + if (!write_fault && writable) + *writable = true; - npages = 1; - } - *pfn = page_to_pfn(page[0]); - return npages; -} - -static bool vma_is_valid(struct vm_area_struct *vma, bool write_fault) -{ - if (unlikely(!(vma->vm_flags & VM_READ))) - return false; - - if (write_fault && (unlikely(!(vma->vm_flags & VM_WRITE)))) - return false; + *pfn = __pa((void *)addr) >> PAGE_SHIFT; return true; } -static int hva_to_pfn_remapped(struct vm_area_struct *vma, - unsigned long addr, bool *async, - bool write_fault, kvm_pfn_t *p_pfn) -{ - unsigned long pfn; - int r; - - r = follow_pfn(vma, addr, &pfn); - if (r) { - /* - * get_user_pages fails for VM_IO and VM_PFNMAP vmas and does - * not call the fault handler, so do it here. - */ - bool unlocked = false; - r = fixup_user_fault(current, current->mm, addr, - (write_fault ? FAULT_FLAG_WRITE : 0), - &unlocked); - if (unlocked) - return -EAGAIN; - if (r) - return r; - - r = follow_pfn(vma, addr, &pfn); - if (r) - return r; - - } - - - /* - * Get a reference here because callers of *hva_to_pfn* and - * *gfn_to_pfn* ultimately call kvm_release_pfn_clean on the - * returned pfn. This is only needed if the VMA has VM_MIXEDMAP - * set, but the kvm_get_pfn/kvm_release_pfn_clean pair will - * simply do nothing for reserved pfns. - * - * Whoever called remap_pfn_range is also going to call e.g. - * unmap_mapping_range before the underlying pages are freed, - * causing a call to our MMU notifier. - */ - kvm_get_pfn(pfn); - - *p_pfn = pfn; - return 0; -} - /* * Pin guest page in memory and return its pfn. * @addr: host virtual address which maps memory to the guest @@ -1514,70 +1268,67 @@ static int hva_to_pfn_remapped(struct vm_area_struct *vma, * 2): @write_fault = false && @writable, @writable will tell the caller * whether the mapping is writable. 
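The fast/slow get_user_pages() paths removed above collapse into __hva_to_pfn(), which simply takes the physical address behind the user VA. That is only safe because the caller has already pinned the page with an MDL (gvm_pin_user_memory(), next hunk) and runs in the context of the owning process. A sketch of the translation, assuming ntkrutils.h backs __pa() with MmGetPhysicalAddress():

    static kvm_pfn_t example_hva_to_pfn(size_t addr)
    {
        /* valid only while the page is locked and we are attached to the
         * process that owns the mapping */
        PHYSICAL_ADDRESS pa = MmGetPhysicalAddress((PVOID)addr);

        return (kvm_pfn_t)(pa.QuadPart >> PAGE_SHIFT);
    }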
*/ -static kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool *async, +static kvm_pfn_t hva_to_pfn(size_t addr, bool write_fault, bool *writable) { - struct vm_area_struct *vma; kvm_pfn_t pfn = 0; - int npages, r; - - /* we can do it either atomically or asynchronously, not both */ - BUG_ON(atomic && async); - if (hva_to_pfn_fast(addr, atomic, async, write_fault, writable, &pfn)) + if (__hva_to_pfn(addr, write_fault, writable, &pfn)) return pfn; - if (atomic) - return KVM_PFN_ERR_FAULT; + return GVM_PFN_ERR_FAULT; +} - npages = hva_to_pfn_slow(addr, async, write_fault, writable, &pfn); - if (npages == 1) - return pfn; +static int gvm_pin_user_memory(size_t addr, struct pmem_lock *pmem_lock) +{ + pmem_lock->lock_mdl = IoAllocateMdl((PVOID)addr, PAGE_SIZE, + FALSE, FALSE, NULL); + if (!pmem_lock->lock_mdl) + return -1; + MmProbeAndLockPages(pmem_lock->lock_mdl, UserMode, + IoWriteAccess); + return 0; +} - down_read(¤t->mm->mmap_sem); - if (npages == -EHWPOISON || - (!async && check_user_page_hwpoison(addr))) { - pfn = KVM_PFN_ERR_HWPOISON; - goto exit; - } +static int kvm_is_ram_prot(struct kvm* kvm, gfn_t gfn); +static int kvm_should_ram_prot_exit(struct kvm *kvm, gfn_t gfn) +{ + struct kvm_vcpu* vcpu; -retry: - vma = find_vma_intersection(current->mm, addr, addr + 1); - - if (vma == NULL) - pfn = KVM_PFN_ERR_FAULT; - else if (vma->vm_flags & (VM_IO | VM_PFNMAP)) { - r = hva_to_pfn_remapped(vma, addr, async, write_fault, &pfn); - if (r == -EAGAIN) - goto retry; - if (r < 0) - pfn = KVM_PFN_ERR_FAULT; - } else { - if (async && vma_is_valid(vma, write_fault)) - *async = true; - pfn = KVM_PFN_ERR_FAULT; - } -exit: - up_read(¤t->mm->mmap_sem); - return pfn; + if (!kvm_is_ram_prot(kvm, gfn)) + return 0; + + /* + * We assume get user pages always run + * in the vcpu thread requesting that + * page. + */ + vcpu = kvm_get_vcpu_by_thread(kvm, PsGetCurrentThread()); + vcpu->run->exit_reason = GVM_EXIT_RAM_PROT; + vcpu->run->rp.gfn = gfn; + return 1; } kvm_pfn_t __gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn, bool atomic, bool *async, bool write_fault, bool *writable) { - unsigned long addr = __gfn_to_hva_many(slot, gfn, NULL, write_fault); + size_t addr = __gfn_to_hva_many(slot, gfn, NULL, write_fault); + struct pmem_lock *pmem_lock = NULL; + + /* We removed async pafe fault support for gvm*/ + BUG_ON(async); - if (addr == KVM_HVA_ERR_RO_BAD) { + if (addr == GVM_HVA_ERR_RO_BAD) { if (writable) *writable = false; - return KVM_PFN_ERR_RO_FAULT; + return GVM_PFN_ERR_RO_FAULT; } if (kvm_is_error_hva(addr)) { if (writable) *writable = false; - return KVM_PFN_NOSLOT; + return GVM_PFN_NOSLOT; } /* Do not map writable pfn in the readonly memslot. 
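gvm_pin_user_memory() above calls MmProbeAndLockPages() bare; the WDK-documented pattern wraps that call in SEH, since it raises an exception rather than returning a status when the user page cannot be locked. A hedged sketch of the pin/unpin pair; the unpin side mirrors kvm_free_memslot() earlier in this file:

    static int example_pin_page(size_t addr, struct pmem_lock *pl)
    {
        pl->lock_mdl = IoAllocateMdl((PVOID)addr, PAGE_SIZE, FALSE, FALSE, NULL);
        if (!pl->lock_mdl)
            return -1;

        __try {
            MmProbeAndLockPages(pl->lock_mdl, UserMode, IoWriteAccess);
        } __except (EXCEPTION_EXECUTE_HANDLER) {
            IoFreeMdl(pl->lock_mdl);
            pl->lock_mdl = NULL;
            return -1;
        }
        return 0;
    }

    static void example_unpin_page(struct pmem_lock *pl)
    {
        if (!pl->lock_mdl)
            return;
        MmUnlockPages(pl->lock_mdl);
        IoFreeMdl(pl->lock_mdl);
        pl->lock_mdl = NULL;
    }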
*/ @@ -1586,10 +1337,22 @@ kvm_pfn_t __gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn, writable = NULL; } - return hva_to_pfn(addr, atomic, async, write_fault, - writable); + if (kvm_should_ram_prot_exit(slot->kvm, gfn)) + return 0; + + pmem_lock = &slot->pmem_lock[gfn - slot->base_gfn]; + spin_lock(&pmem_lock->lock); + if (!pmem_lock->lock_mdl) { + gvm_pin_user_memory(addr, pmem_lock); + if (!pmem_lock->lock_mdl) { + spin_unlock(&pmem_lock->lock); + return GVM_PFN_ERR_FAULT; + } + } + spin_unlock(&pmem_lock->lock); + + return hva_to_pfn(addr, write_fault, writable); } -EXPORT_SYMBOL_GPL(__gfn_to_pfn_memslot); kvm_pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault, bool *writable) @@ -1597,49 +1360,44 @@ kvm_pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault, return __gfn_to_pfn_memslot(gfn_to_memslot(kvm, gfn), gfn, false, NULL, write_fault, writable); } -EXPORT_SYMBOL_GPL(gfn_to_pfn_prot); kvm_pfn_t gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn) { return __gfn_to_pfn_memslot(slot, gfn, false, NULL, true, NULL); } -EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot); kvm_pfn_t gfn_to_pfn_memslot_atomic(struct kvm_memory_slot *slot, gfn_t gfn) { return __gfn_to_pfn_memslot(slot, gfn, true, NULL, true, NULL); } -EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot_atomic); kvm_pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn) { return gfn_to_pfn_memslot_atomic(gfn_to_memslot(kvm, gfn), gfn); } -EXPORT_SYMBOL_GPL(gfn_to_pfn_atomic); kvm_pfn_t kvm_vcpu_gfn_to_pfn_atomic(struct kvm_vcpu *vcpu, gfn_t gfn) { return gfn_to_pfn_memslot_atomic(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn); } -EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_pfn_atomic); kvm_pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn) { return gfn_to_pfn_memslot(gfn_to_memslot(kvm, gfn), gfn); } -EXPORT_SYMBOL_GPL(gfn_to_pfn); kvm_pfn_t kvm_vcpu_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn) { return gfn_to_pfn_memslot(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn); } -EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_pfn); -int gfn_to_page_many_atomic(struct kvm_memory_slot *slot, gfn_t gfn, - struct page **pages, int nr_pages) +int gfn_to_pfn_many_atomic(struct kvm_memory_slot *slot, gfn_t gfn, + pfn_t *pfn, int nr_pages) { - unsigned long addr; + size_t addr; gfn_t entry; + size_t i; + struct pmem_lock *pmem_lock; addr = gfn_to_hva_many(slot, gfn, &entry); if (kvm_is_error_hva(addr)) @@ -1648,32 +1406,36 @@ int gfn_to_page_many_atomic(struct kvm_memory_slot *slot, gfn_t gfn, if (entry < nr_pages) return 0; - return __get_user_pages_fast(addr, nr_pages, 1, pages); -} -EXPORT_SYMBOL_GPL(gfn_to_page_many_atomic); + for (i = 0; i < nr_pages; i++) { + if (kvm_should_ram_prot_exit(slot->kvm, gfn + i)) + return 0; -static struct page *kvm_pfn_to_page(kvm_pfn_t pfn) -{ - if (is_error_noslot_pfn(pfn)) - return KVM_ERR_PTR_BAD_PAGE; - - if (kvm_is_reserved_pfn(pfn)) { - WARN_ON(1); - return KVM_ERR_PTR_BAD_PAGE; + pmem_lock = &slot->pmem_lock[gfn + i - slot->base_gfn]; + spin_lock(&pmem_lock->lock); + if (!pmem_lock->lock_mdl) { + gvm_pin_user_memory(addr + i * PAGE_SIZE, pmem_lock); + if (!pmem_lock->lock_mdl) { + spin_unlock(&pmem_lock->lock); + break; + } + } + spin_unlock(&pmem_lock->lock); } - return pfn_to_page(pfn); + nr_pages = i; + + while(i--) + pfn[i] = __pa((void*)(addr + i * PAGE_SIZE)) >> PAGE_SHIFT; + return nr_pages; } -struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn) +static struct page *kvm_pfn_to_page(kvm_pfn_t pfn) { - kvm_pfn_t pfn; - - pfn = gfn_to_pfn(kvm, gfn); + if (is_error_noslot_pfn(pfn)) + return GVM_ERR_PTR_BAD_PAGE; - 
return kvm_pfn_to_page(pfn); + return pfn_to_page(pfn); } -EXPORT_SYMBOL_GPL(gfn_to_page); struct page *kvm_vcpu_gfn_to_page(struct kvm_vcpu *vcpu, gfn_t gfn) { @@ -1683,63 +1445,8 @@ struct page *kvm_vcpu_gfn_to_page(struct kvm_vcpu *vcpu, gfn_t gfn) return kvm_pfn_to_page(pfn); } -EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_page); - -void kvm_release_page_clean(struct page *page) -{ - WARN_ON(is_error_page(page)); - - kvm_release_pfn_clean(page_to_pfn(page)); -} -EXPORT_SYMBOL_GPL(kvm_release_page_clean); - -void kvm_release_pfn_clean(kvm_pfn_t pfn) -{ - if (!is_error_noslot_pfn(pfn) && !kvm_is_reserved_pfn(pfn)) - put_page(pfn_to_page(pfn)); -} -EXPORT_SYMBOL_GPL(kvm_release_pfn_clean); - -void kvm_release_page_dirty(struct page *page) -{ - WARN_ON(is_error_page(page)); - - kvm_release_pfn_dirty(page_to_pfn(page)); -} -EXPORT_SYMBOL_GPL(kvm_release_page_dirty); - -static void kvm_release_pfn_dirty(kvm_pfn_t pfn) -{ - kvm_set_pfn_dirty(pfn); - kvm_release_pfn_clean(pfn); -} - -void kvm_set_pfn_dirty(kvm_pfn_t pfn) -{ - if (!kvm_is_reserved_pfn(pfn)) { - struct page *page = pfn_to_page(pfn); - - if (!PageReserved(page)) - SetPageDirty(page); - } -} -EXPORT_SYMBOL_GPL(kvm_set_pfn_dirty); -void kvm_set_pfn_accessed(kvm_pfn_t pfn) -{ - if (!kvm_is_reserved_pfn(pfn)) - mark_page_accessed(pfn_to_page(pfn)); -} -EXPORT_SYMBOL_GPL(kvm_set_pfn_accessed); - -void kvm_get_pfn(kvm_pfn_t pfn) -{ - if (!kvm_is_reserved_pfn(pfn)) - get_page(pfn_to_page(pfn)); -} -EXPORT_SYMBOL_GPL(kvm_get_pfn); - -static int next_segment(unsigned long len, int offset) +static int next_segment(size_t len, int offset) { if (len > PAGE_SIZE - offset) return PAGE_SIZE - offset; @@ -1750,13 +1457,13 @@ static int next_segment(unsigned long len, int offset) static int __kvm_read_guest_page(struct kvm_memory_slot *slot, gfn_t gfn, void *data, int offset, int len) { - int r; - unsigned long addr; + int r = 0; + size_t addr; addr = gfn_to_hva_memslot_prot(slot, gfn, NULL); if (kvm_is_error_hva(addr)) return -EFAULT; - r = __copy_from_user(data, (void __user *)addr + offset, len); + r = __copy_from_user(data, (char __user *)addr + offset, len); if (r) return -EFAULT; return 0; @@ -1769,7 +1476,6 @@ int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset, return __kvm_read_guest_page(slot, gfn, data, offset, len); } -EXPORT_SYMBOL_GPL(kvm_read_guest_page); int kvm_vcpu_read_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, void *data, int offset, int len) @@ -1778,9 +1484,8 @@ int kvm_vcpu_read_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, void *data, return __kvm_read_guest_page(slot, gfn, data, offset, len); } -EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest_page); -int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len) +int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, size_t len) { gfn_t gfn = gpa >> PAGE_SHIFT; int seg; @@ -1793,14 +1498,13 @@ int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len) return ret; offset = 0; len -= seg; - data += seg; + //data += seg; ++gfn; } return 0; } -EXPORT_SYMBOL_GPL(kvm_read_guest); -int kvm_vcpu_read_guest(struct kvm_vcpu *vcpu, gpa_t gpa, void *data, unsigned long len) +int kvm_vcpu_read_guest(struct kvm_vcpu *vcpu, gpa_t gpa, void *data, size_t len) { gfn_t gfn = gpa >> PAGE_SHIFT; int seg; @@ -1813,43 +1517,29 @@ int kvm_vcpu_read_guest(struct kvm_vcpu *vcpu, gpa_t gpa, void *data, unsigned l return ret; offset = 0; len -= seg; - data += seg; + //data += seg; ++gfn; } return 0; } -EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest); static int 
__kvm_read_guest_atomic(struct kvm_memory_slot *slot, gfn_t gfn, - void *data, int offset, unsigned long len) + void *data, int offset, size_t len) { - int r; - unsigned long addr; + int r = 0; + size_t addr; addr = gfn_to_hva_memslot_prot(slot, gfn, NULL); if (kvm_is_error_hva(addr)) return -EFAULT; - pagefault_disable(); - r = __copy_from_user_inatomic(data, (void __user *)addr + offset, len); - pagefault_enable(); + r = __copy_from_user(data, (char __user *)addr + offset, len); if (r) return -EFAULT; return 0; } -int kvm_read_guest_atomic(struct kvm *kvm, gpa_t gpa, void *data, - unsigned long len) -{ - gfn_t gfn = gpa >> PAGE_SHIFT; - struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn); - int offset = offset_in_page(gpa); - - return __kvm_read_guest_atomic(slot, gfn, data, offset, len); -} -EXPORT_SYMBOL_GPL(kvm_read_guest_atomic); - int kvm_vcpu_read_guest_atomic(struct kvm_vcpu *vcpu, gpa_t gpa, - void *data, unsigned long len) + void *data, size_t len) { gfn_t gfn = gpa >> PAGE_SHIFT; struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); @@ -1857,18 +1547,17 @@ int kvm_vcpu_read_guest_atomic(struct kvm_vcpu *vcpu, gpa_t gpa, return __kvm_read_guest_atomic(slot, gfn, data, offset, len); } -EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest_atomic); static int __kvm_write_guest_page(struct kvm_memory_slot *memslot, gfn_t gfn, const void *data, int offset, int len) { - int r; - unsigned long addr; + int r = 0; + size_t addr; addr = gfn_to_hva_memslot(memslot, gfn); if (kvm_is_error_hva(addr)) return -EFAULT; - r = __copy_to_user((void __user *)addr + offset, data, len); + r = __copy_to_user((void __user *)(addr + offset), data, len); if (r) return -EFAULT; mark_page_dirty_in_slot(memslot, gfn); @@ -1882,7 +1571,6 @@ int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn, return __kvm_write_guest_page(slot, gfn, data, offset, len); } -EXPORT_SYMBOL_GPL(kvm_write_guest_page); int kvm_vcpu_write_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, const void *data, int offset, int len) @@ -1891,10 +1579,9 @@ int kvm_vcpu_write_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, return __kvm_write_guest_page(slot, gfn, data, offset, len); } -EXPORT_SYMBOL_GPL(kvm_vcpu_write_guest_page); int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data, - unsigned long len) + size_t len) { gfn_t gfn = gpa >> PAGE_SHIFT; int seg; @@ -1907,15 +1594,14 @@ int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data, return ret; offset = 0; len -= seg; - data += seg; + //data += seg; ++gfn; } return 0; } -EXPORT_SYMBOL_GPL(kvm_write_guest); int kvm_vcpu_write_guest(struct kvm_vcpu *vcpu, gpa_t gpa, const void *data, - unsigned long len) + size_t len) { gfn_t gfn = gpa >> PAGE_SHIFT; int seg; @@ -1928,15 +1614,14 @@ int kvm_vcpu_write_guest(struct kvm_vcpu *vcpu, gpa_t gpa, const void *data, return ret; offset = 0; len -= seg; - data += seg; + //data += seg; ++gfn; } return 0; } -EXPORT_SYMBOL_GPL(kvm_vcpu_write_guest); int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc, - gpa_t gpa, unsigned long len) + gpa_t gpa, size_t len) { struct kvm_memslots *slots = kvm_memslots(kvm); int offset = offset_in_page(gpa); @@ -1970,10 +1655,9 @@ int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc, } return 0; } -EXPORT_SYMBOL_GPL(kvm_gfn_to_hva_cache_init); int kvm_write_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, - void *data, unsigned long len) + void *data, size_t len) { struct kvm_memslots *slots = kvm_memslots(kvm); int r; @@ -1996,10 +1680,9 @@ int 
kvm_write_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, return 0; } -EXPORT_SYMBOL_GPL(kvm_write_guest_cached); int kvm_read_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, - void *data, unsigned long len) + void *data, size_t len) { struct kvm_memslots *slots = kvm_memslots(kvm); int r; @@ -2021,17 +1704,13 @@ int kvm_read_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, return 0; } -EXPORT_SYMBOL_GPL(kvm_read_guest_cached); int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len) { - const void *zero_page = (const void *) __va(page_to_phys(ZERO_PAGE(0))); - - return kvm_write_guest_page(kvm, gfn, zero_page, offset, len); + return kvm_write_guest_page(kvm, gfn, pZeroPage, offset, len); } -EXPORT_SYMBOL_GPL(kvm_clear_guest_page); -int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len) +int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, size_t len) { gfn_t gfn = gpa >> PAGE_SHIFT; int seg; @@ -2048,15 +1727,14 @@ int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len) } return 0; } -EXPORT_SYMBOL_GPL(kvm_clear_guest); static void mark_page_dirty_in_slot(struct kvm_memory_slot *memslot, gfn_t gfn) { if (memslot && memslot->dirty_bitmap) { - unsigned long rel_gfn = gfn - memslot->base_gfn; + size_t rel_gfn = gfn - memslot->base_gfn; - set_bit_le(rel_gfn, memslot->dirty_bitmap); + set_bit(rel_gfn, memslot->dirty_bitmap); } } @@ -2067,7 +1745,6 @@ void mark_page_dirty(struct kvm *kvm, gfn_t gfn) memslot = gfn_to_memslot(kvm, gfn); mark_page_dirty_in_slot(memslot, gfn); } -EXPORT_SYMBOL_GPL(mark_page_dirty); void kvm_vcpu_mark_page_dirty(struct kvm_vcpu *vcpu, gfn_t gfn) { @@ -2076,138 +1753,52 @@ void kvm_vcpu_mark_page_dirty(struct kvm_vcpu *vcpu, gfn_t gfn) memslot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); mark_page_dirty_in_slot(memslot, gfn); } -EXPORT_SYMBOL_GPL(kvm_vcpu_mark_page_dirty); - -static void grow_halt_poll_ns(struct kvm_vcpu *vcpu) -{ - unsigned int old, val, grow; - - old = val = vcpu->halt_poll_ns; - grow = READ_ONCE(halt_poll_ns_grow); - /* 10us base */ - if (val == 0 && grow) - val = 10000; - else - val *= grow; - - if (val > halt_poll_ns) - val = halt_poll_ns; - - vcpu->halt_poll_ns = val; - trace_kvm_halt_poll_ns_grow(vcpu->vcpu_id, val, old); -} - -static void shrink_halt_poll_ns(struct kvm_vcpu *vcpu) -{ - unsigned int old, val, shrink; - - old = val = vcpu->halt_poll_ns; - shrink = READ_ONCE(halt_poll_ns_shrink); - if (shrink == 0) - val = 0; - else - val /= shrink; - - vcpu->halt_poll_ns = val; - trace_kvm_halt_poll_ns_shrink(vcpu->vcpu_id, val, old); -} static int kvm_vcpu_check_block(struct kvm_vcpu *vcpu) { if (kvm_arch_vcpu_runnable(vcpu)) { - kvm_make_request(KVM_REQ_UNHALT, vcpu); + kvm_make_request(GVM_REQ_UNHALT, vcpu); return -EINTR; } if (kvm_cpu_has_pending_timer(vcpu)) return -EINTR; - if (signal_pending(current)) + if (vcpu->run->user_event_pending) return -EINTR; return 0; } +static void hardware_disable_nolock(void *junk); +static void hardware_enable_nolock(void *junk); + /* * The vCPU has executed a HLT instruction with in-kernel mode enabled. */ void kvm_vcpu_block(struct kvm_vcpu *vcpu) { - ktime_t start, cur; - DECLARE_SWAITQUEUE(wait); - bool waited = false; - u64 block_ns; - - start = cur = ktime_get(); - if (vcpu->halt_poll_ns) { - ktime_t stop = ktime_add_ns(ktime_get(), vcpu->halt_poll_ns); - - ++vcpu->stat.halt_attempted_poll; - do { - /* - * This sets KVM_REQ_UNHALT if an interrupt - * arrives. 
- */ - if (kvm_vcpu_check_block(vcpu) < 0) { - ++vcpu->stat.halt_successful_poll; - if (!vcpu_valid_wakeup(vcpu)) - ++vcpu->stat.halt_poll_invalid; - goto out; - } - cur = ktime_get(); - } while (single_task_running() && ktime_before(cur, stop)); - } + LARGE_INTEGER expire; + expire.QuadPart = (u64)-1000000; kvm_arch_vcpu_blocking(vcpu); - for (;;) { - prepare_to_swait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE); - - if (kvm_vcpu_check_block(vcpu) < 0) + vcpu->blocked = 1; + for (;;) + { + if (kvm_vcpu_check_block(vcpu)) break; - - waited = true; - schedule(); + KeWaitForSingleObject(&vcpu->kick_event, Executive, KernelMode, FALSE, &expire); } - - finish_swait(&vcpu->wq, &wait); - cur = ktime_get(); - + vcpu->blocked = 0; + KeClearEvent(&vcpu->kick_event); kvm_arch_vcpu_unblocking(vcpu); -out: - block_ns = ktime_to_ns(cur) - ktime_to_ns(start); - - if (!vcpu_valid_wakeup(vcpu)) - shrink_halt_poll_ns(vcpu); - else if (halt_poll_ns) { - if (block_ns <= vcpu->halt_poll_ns) - ; - /* we had a long block, shrink polling */ - else if (vcpu->halt_poll_ns && block_ns > halt_poll_ns) - shrink_halt_poll_ns(vcpu); - /* we had a short halt and our poll time is too small */ - else if (vcpu->halt_poll_ns < halt_poll_ns && - block_ns < halt_poll_ns) - grow_halt_poll_ns(vcpu); - } else - vcpu->halt_poll_ns = 0; - - trace_kvm_vcpu_wakeup(block_ns, waited, vcpu_valid_wakeup(vcpu)); kvm_arch_vcpu_block_finish(vcpu); } -EXPORT_SYMBOL_GPL(kvm_vcpu_block); -#ifndef CONFIG_S390 void kvm_vcpu_wake_up(struct kvm_vcpu *vcpu) { - struct swait_queue_head *wqp; - - wqp = kvm_arch_vcpu_wq(vcpu); - if (swait_active(wqp)) { - swake_up(wqp); - ++vcpu->stat.halt_wakeup; - } - + if(vcpu->blocked) + KeSetEvent(&vcpu->kick_event, IO_NO_INCREMENT, FALSE); } -EXPORT_SYMBOL_GPL(kvm_vcpu_wake_up); /* * Kick a sleeping VCPU, or a guest VCPU in guest mode, into host kernel mode. @@ -2218,224 +1809,36 @@ void kvm_vcpu_kick(struct kvm_vcpu *vcpu) int cpu = vcpu->cpu; kvm_vcpu_wake_up(vcpu); - me = get_cpu(); - if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu)) + me = smp_processor_id(); + if (cpu != -1 && cpu != me && cpu_online(cpu)) if (kvm_arch_vcpu_should_kick(vcpu)) smp_send_reschedule(cpu); - put_cpu(); -} -EXPORT_SYMBOL_GPL(kvm_vcpu_kick); -#endif /* !CONFIG_S390 */ - -int kvm_vcpu_yield_to(struct kvm_vcpu *target) -{ - struct pid *pid; - struct task_struct *task = NULL; - int ret = 0; - - rcu_read_lock(); - pid = rcu_dereference(target->pid); - if (pid) - task = get_pid_task(pid, PIDTYPE_PID); - rcu_read_unlock(); - if (!task) - return ret; - ret = yield_to(task, 1); - put_task_struct(task); - - return ret; -} -EXPORT_SYMBOL_GPL(kvm_vcpu_yield_to); - -/* - * Helper that checks whether a VCPU is eligible for directed yield. - * Most eligible candidate to yield is decided by following heuristics: - * - * (a) VCPU which has not done pl-exit or cpu relax intercepted recently - * (preempted lock holder), indicated by @in_spin_loop. - * Set at the beiginning and cleared at the end of interception/PLE handler. - * - * (b) VCPU which has done pl-exit/ cpu relax intercepted but did not get - * chance last time (mostly it has become eligible now since we have probably - * yielded to lockholder in last iteration. This is done by toggling - * @dy_eligible each time a VCPU checked for eligibility.) - * - * Yielding to a recently pl-exited/cpu relax intercepted VCPU before yielding - * to preempted lock-holder could result in wrong VCPU selection and CPU - * burning. 
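The swait-based halt path is replaced above by a kick event plus a bounded wait: a negative LARGE_INTEGER is a relative timeout in 100 ns units, so (u64)-1000000 caps each sleep at 100 ms even if no kick arrives, and kvm_vcpu_wake_up() ends the wait early by signalling the event. A condensed sketch of the two sides:

    static void example_wait_for_kick(struct kvm_vcpu *vcpu)
    {
        LARGE_INTEGER timeout;

        timeout.QuadPart = -1000000;    /* relative: 1,000,000 * 100 ns = 100 ms */
        KeWaitForSingleObject(&vcpu->kick_event, Executive, KernelMode,
                              FALSE, &timeout);
    }

    static void example_kick(struct kvm_vcpu *vcpu)
    {
        if (vcpu->blocked)
            KeSetEvent(&vcpu->kick_event, IO_NO_INCREMENT, FALSE);
    }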
Giving priority for a potential lock-holder increases lock - * progress. - * - * Since algorithm is based on heuristics, accessing another VCPU data without - * locking does not harm. It may result in trying to yield to same VCPU, fail - * and continue with next VCPU and so on. - */ -static bool kvm_vcpu_eligible_for_directed_yield(struct kvm_vcpu *vcpu) -{ -#ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT - bool eligible; - - eligible = !vcpu->spin_loop.in_spin_loop || - vcpu->spin_loop.dy_eligible; - - if (vcpu->spin_loop.in_spin_loop) - kvm_vcpu_set_dy_eligible(vcpu, !vcpu->spin_loop.dy_eligible); - - return eligible; -#else - return true; -#endif -} - -void kvm_vcpu_on_spin(struct kvm_vcpu *me) -{ - struct kvm *kvm = me->kvm; - struct kvm_vcpu *vcpu; - int last_boosted_vcpu = me->kvm->last_boosted_vcpu; - int yielded = 0; - int try = 3; - int pass; - int i; - - kvm_vcpu_set_in_spin_loop(me, true); - /* - * We boost the priority of a VCPU that is runnable but not - * currently running, because it got preempted by something - * else and called schedule in __vcpu_run. Hopefully that - * VCPU is holding the lock that we need and will release it. - * We approximate round-robin by starting at the last boosted VCPU. - */ - for (pass = 0; pass < 2 && !yielded && try; pass++) { - kvm_for_each_vcpu(i, vcpu, kvm) { - if (!pass && i <= last_boosted_vcpu) { - i = last_boosted_vcpu; - continue; - } else if (pass && i > last_boosted_vcpu) - break; - if (!ACCESS_ONCE(vcpu->preempted)) - continue; - if (vcpu == me) - continue; - if (swait_active(&vcpu->wq) && !kvm_arch_vcpu_runnable(vcpu)) - continue; - if (!kvm_vcpu_eligible_for_directed_yield(vcpu)) - continue; - - yielded = kvm_vcpu_yield_to(vcpu); - if (yielded > 0) { - kvm->last_boosted_vcpu = i; - break; - } else if (yielded < 0) { - try--; - if (!try) - break; - } - } - } - kvm_vcpu_set_in_spin_loop(me, false); - - /* Ensure vcpu is not eligible during next spinloop */ - kvm_vcpu_set_dy_eligible(me, false); } -EXPORT_SYMBOL_GPL(kvm_vcpu_on_spin); -static int kvm_vcpu_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +NTSTATUS kvm_vcpu_release(PDEVICE_OBJECT pDevObj, PIRP pIrp) { - struct kvm_vcpu *vcpu = vma->vm_file->private_data; - struct page *page; - - if (vmf->pgoff == 0) - page = virt_to_page(vcpu->run); -#ifdef CONFIG_X86 - else if (vmf->pgoff == KVM_PIO_PAGE_OFFSET) - page = virt_to_page(vcpu->arch.pio_data); -#endif -#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET - else if (vmf->pgoff == KVM_COALESCED_MMIO_PAGE_OFFSET) - page = virt_to_page(vcpu->kvm->coalesced_mmio_ring); -#endif - else - return kvm_arch_vcpu_fault(vcpu, vmf); - get_page(page); - vmf->page = page; - return 0; -} - -static const struct vm_operations_struct kvm_vcpu_vm_ops = { - .fault = kvm_vcpu_fault, -}; + struct gvm_device_extension *devext = pDevObj->DeviceExtension; + struct kvm_vcpu *vcpu = devext->PrivData; -static int kvm_vcpu_mmap(struct file *file, struct vm_area_struct *vma) -{ - vma->vm_ops = &kvm_vcpu_vm_ops; - return 0; -} - -static int kvm_vcpu_release(struct inode *inode, struct file *filp) -{ - struct kvm_vcpu *vcpu = filp->private_data; - - debugfs_remove_recursive(vcpu->debugfs_dentry); kvm_put_kvm(vcpu->kvm); return 0; } -static struct file_operations kvm_vcpu_fops = { - .release = kvm_vcpu_release, - .unlocked_ioctl = kvm_vcpu_ioctl, -#ifdef CONFIG_KVM_COMPAT - .compat_ioctl = kvm_vcpu_compat_ioctl, -#endif - .mmap = kvm_vcpu_mmap, - .llseek = noop_llseek, -}; - -/* - * Allocates an inode for the vcpu. 
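With kvm_vcpu_fops and the fault/mmap handlers removed above, the two-page run/pio buffer has to reach userspace another way; the port records the user mapping in vcpu->run_userva and tears it down with __vm_munmap() in kvm_vcpu_uninit(). The creation side is not shown in this hunk; one conventional way to expose nonpaged kernel memory to a user process is sketched below, as an assumption rather than code from the port:

    static PVOID example_map_run_to_user(void *kva, SIZE_T size, PMDL *out_mdl)
    {
        PVOID userva = NULL;
        PMDL mdl = IoAllocateMdl(kva, (ULONG)size, FALSE, FALSE, NULL);

        if (!mdl)
            return NULL;
        MmBuildMdlForNonPagedPool(mdl);

        __try {
            /* mapping into user space can raise, so wrap it in SEH */
            userva = MmMapLockedPagesSpecifyCache(mdl, UserMode, MmCached,
                                                  NULL, FALSE, NormalPagePriority);
        } __except (EXCEPTION_EXECUTE_HANDLER) {
            userva = NULL;
        }
        if (!userva) {
            IoFreeMdl(mdl);
            return NULL;
        }
        *out_mdl = mdl;
        return userva;
    }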
- */ -static int create_vcpu_fd(struct kvm_vcpu *vcpu) -{ - return anon_inode_getfd("kvm-vcpu", &kvm_vcpu_fops, vcpu, O_RDWR | O_CLOEXEC); -} - -static int kvm_create_vcpu_debugfs(struct kvm_vcpu *vcpu) -{ - char dir_name[ITOA_MAX_LEN * 2]; - int ret; - - if (!kvm_arch_has_vcpu_debugfs()) - return 0; - - if (!debugfs_initialized()) - return 0; - - snprintf(dir_name, sizeof(dir_name), "vcpu%d", vcpu->vcpu_id); - vcpu->debugfs_dentry = debugfs_create_dir(dir_name, - vcpu->kvm->debugfs_dentry); - if (!vcpu->debugfs_dentry) - return -ENOMEM; - - ret = kvm_arch_create_vcpu_debugfs(vcpu); - if (ret < 0) { - debugfs_remove_recursive(vcpu->debugfs_dentry); - return ret; - } - - return 0; -} - -/* - * Creates some virtual cpus. Good luck creating more than one. - */ -static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id) +static int kvm_vm_ioctl_create_vcpu(PDEVICE_OBJECT pDevObj, PIRP pIrp, void *arg) { int r; struct kvm_vcpu *vcpu; + struct gvm_device_extension *devext = pDevObj->DeviceExtension; + struct kvm *kvm = devext->PrivData; + HANDLE handle; + int id = *(int *)arg; + KAFFINITY Affinity; - if (id >= KVM_MAX_VCPU_ID) + mutex_lock(&kvm->lock); + if (id >= GVM_MAX_VCPU_ID) return -EINVAL; - mutex_lock(&kvm->lock); - if (kvm->created_vcpus == KVM_MAX_VCPUS) { + if (kvm->created_vcpus == GVM_MAX_VCPUS) { mutex_unlock(&kvm->lock); return -EINVAL; } @@ -2449,16 +1852,10 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id) goto vcpu_decrement; } - preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops); - r = kvm_arch_vcpu_setup(vcpu); if (r) goto vcpu_destroy; - r = kvm_create_vcpu_debugfs(vcpu); - if (r) - goto vcpu_destroy; - mutex_lock(&kvm->lock); if (kvm_get_vcpu_by_id(kvm, id)) { r = -EEXIST; @@ -2469,8 +1866,14 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id) /* Now it's all set up, let userspace reach it */ kvm_get_kvm(kvm); - r = create_vcpu_fd(vcpu); - if (r < 0) { + r = gvmCreateVMDevice(&handle, kvm->vm_id, id, vcpu); + if (!NT_SUCCESS(r)) { + kvm_put_kvm(kvm); + goto unlock_vcpu_destroy; + } + r = gvmUpdateReturnBuffer(pIrp, 0, &handle, sizeof(handle)); + if (r) { + gvmDeleteVMDevice(NULL, 0, id); kvm_put_kvm(kvm); goto unlock_vcpu_destroy; } @@ -2486,11 +1889,16 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id) mutex_unlock(&kvm->lock); kvm_arch_vcpu_postcreate(vcpu); + + Affinity = (KAFFINITY)1 << ( + cpu_online_count - 1 + - 2 * vcpu->vcpu_id / cpu_online_count % 2 + - vcpu->vcpu_id * 2 % cpu_online_count); + KeSetSystemAffinityThread(Affinity); return r; unlock_vcpu_destroy: mutex_unlock(&kvm->lock); - debugfs_remove_recursive(vcpu->debugfs_dentry); vcpu_destroy: kvm_arch_vcpu_destroy(vcpu); vcpu_decrement: @@ -2500,64 +1908,171 @@ vcpu_decrement: return r; } -static int kvm_vcpu_ioctl_set_sigmask(struct kvm_vcpu *vcpu, sigset_t *sigset) +static int kvm_vm_ioctl_kick_vcpu(PDEVICE_OBJECT pDevObj, PIRP pIrp, void *arg) { - if (sigset) { - sigdelsetmask(sigset, sigmask(SIGKILL)|sigmask(SIGSTOP)); - vcpu->sigset_active = 1; - vcpu->sigset = *sigset; - } else - vcpu->sigset_active = 0; + struct kvm_vcpu *vcpu; + struct gvm_device_extension *devext = pDevObj->DeviceExtension; + struct kvm *kvm = devext->PrivData; + int id = *(int *)arg; + + if (id >= GVM_MAX_VCPU_ID) + return -EINVAL; + + vcpu = kvm_get_vcpu_by_id(kvm, id); + if (!vcpu) + return -EINVAL; + + kvm_vcpu_kick(vcpu); + return 0; } -static long kvm_vcpu_ioctl(struct file *filp, - unsigned int ioctl, unsigned long arg) +static bool kvm_is_valid_prot_flags(u32 flags) { - 
struct kvm_vcpu *vcpu = filp->private_data; - void __user *argp = (void __user *)arg; - int r; - struct kvm_fpu *fpu = NULL; - struct kvm_sregs *kvm_sregs = NULL; + return (flags == RP_NOACCESS || flags == RP_RDWREX); +} - if (vcpu->kvm->mm != current->mm) - return -EIO; +static int kvm_adjust_rp_bitmap(struct kvm *kvm, u64 size) +{ + int old_size, new_size; + size_t *old_bitmap, *new_bitmap; - if (unlikely(_IOC_TYPE(ioctl) != KVMIO)) - return -EINVAL; + if (kvm->rp_bitmap_size >= size) + return 0; -#if defined(CONFIG_S390) || defined(CONFIG_PPC) || defined(CONFIG_MIPS) - /* - * Special cases: vcpu ioctls that are asynchronous to vcpu execution, - * so vcpu_load() would break it. - */ - if (ioctl == KVM_S390_INTERRUPT || ioctl == KVM_S390_IRQ || ioctl == KVM_INTERRUPT) - return kvm_arch_vcpu_ioctl(filp, ioctl, arg); -#endif + new_size = ALIGN(size, (u64)BITS_PER_LONG) / 8; + new_bitmap = kvm_kvzalloc(new_size); + if (!new_bitmap) + return -ENOMEM; + + old_size = kvm->rp_bitmap_size; + old_bitmap = kvm->rp_bitmap; + memcpy(new_bitmap, old_bitmap, old_size); + + kvm->rp_bitmap = new_bitmap; + kvm->rp_bitmap_size = new_size; + + return 0; +} + +/* + * For set bulk bitmap instead of looping set_bit + */ +static inline void set_bits_in_long(size_t *byte, int start, int nbits, bool set) +{ + size_t mask; - r = vcpu_load(vcpu); + BUG_ON(byte == NULL); + BUG_ON(start < 0 || start > BITS_PER_LONG); + BUG_ON(nbits < 0 || start + nbits > BITS_PER_LONG); + + mask = ((1 << nbits) - 1) << start; + if (set) + *byte |= mask; + else + *byte &= ~mask; +} + +static void set_bit_block(size_t *bitmap, u64 start, u64 nbits, bool set) +{ + u64 first_long_index = start / BITS_PER_LONG; + u64 last_long_index = (start + nbits - 1) / BITS_PER_LONG; + u64 i; + int first_bit_index = (int)(start % BITS_PER_LONG); + int last_bit_index = (int)((start + nbits - 1) % BITS_PER_LONG); + + if (first_long_index == last_long_index) { + set_bits_in_long(&bitmap[first_long_index], first_bit_index, (int)nbits, + set); + return; + } + + set_bits_in_long(&bitmap[first_long_index], first_bit_index, + BITS_PER_LONG - first_bit_index, set); + for (i = first_long_index + 1; i < last_long_index; i++) { + bitmap[i] = set ? 
(size_t)-1 : 0; + } + set_bits_in_long(&bitmap[last_long_index], 0, last_bit_index + 1, set); +} + +static int kvm_is_ram_prot(struct kvm *kvm, gfn_t gfn) +{ + if (!kvm->rp_bitmap) + return 0; + + return test_bit(gfn, kvm->rp_bitmap); +} + +static int kvm_vm_ioctl_ram_prot(struct kvm *kvm, struct gvm_ram_protect *rp) +{ + int r = -EFAULT; + gfn_t first_gfn = rp->pa >> PAGE_SHIFT; + gfn_t last_gfn = (rp->pa + rp->size - 1) >> PAGE_SHIFT; + + if (!rp->reserved) + return -EINVAL; + + if (!kvm_is_valid_prot_flags(rp->flags)) + return -EINVAL; + + r = kvm_adjust_rp_bitmap(kvm, last_gfn + 1); if (r) return r; + + set_bit_block(kvm->rp_bitmap, first_gfn, last_gfn + 1 - first_gfn, + rp->flags == RP_NOACCESS); + + /* only need flush shadow when page access right is lowered */ + if (rp->flags == RP_NOACCESS) + kvm_arch_flush_shadow_all(kvm); + + return 0; +} + +NTSTATUS kvm_vcpu_ioctl(PDEVICE_OBJECT pDevObj, PIRP pIrp, + unsigned int ioctl) +{ + struct gvm_device_extension *devext = pDevObj->DeviceExtension; + struct kvm_vcpu *vcpu = devext->PrivData; + void __user *argp = (void __user *)pIrp->AssociatedIrp.SystemBuffer; + int r; + struct kvm_fpu *fpu = NULL; + struct kvm_sregs *kvm_sregs = NULL; + + if (vcpu->kvm->process != IoGetCurrentProcess()) + return -EIO; + switch (ioctl) { - case KVM_RUN: + case GVM_RUN: r = -EINVAL; - if (arg) - goto out; - if (unlikely(vcpu->pid != current->pids[PIDTYPE_PID].pid)) { - /* The thread running this VCPU changed. */ - struct pid *oldpid = vcpu->pid; - struct pid *newpid = get_task_pid(current, PIDTYPE_PID); - - rcu_assign_pointer(vcpu->pid, newpid); - if (oldpid) - synchronize_rcu(); - put_pid(oldpid); + if (vcpu->thread != PsGetCurrentThread()) { + vcpu->thread = PsGetCurrentThread(); + KeInitializeApc(&vcpu->apc, vcpu->thread, + OriginalApcEnvironment, + gvmWaitSuspend, + NULL, + NULL, + KernelMode, + NULL); } r = kvm_arch_vcpu_ioctl_run(vcpu, vcpu->run); - trace_kvm_userspace_exit(vcpu->run->exit_reason, r); break; - case KVM_GET_REGS: { + case GVM_VCPU_MMAP: + r = -EINVAL; + size_t mmap_size = 2 * PAGE_SIZE; + size_t userva = __vm_mmap(NULL, 0, mmap_size, PROT_READ |PROT_WRITE, + MAP_SHARED | MAP_ANONYMOUS, 0, (size_t)vcpu->run); + if (!userva) + break; + r = gvmUpdateReturnBuffer(pIrp, 0, &userva, sizeof(userva)); + if (r) { + __vm_munmap(userva, 2 * PAGE_SIZE, false); + break; + } + vcpu->run_userva = userva; + break; + case GVM_GET_REGS: { struct kvm_regs *kvm_regs; r = -ENOMEM; @@ -2567,15 +2082,15 @@ static long kvm_vcpu_ioctl(struct file *filp, r = kvm_arch_vcpu_ioctl_get_regs(vcpu, kvm_regs); if (r) goto out_free1; - r = -EFAULT; - if (copy_to_user(argp, kvm_regs, sizeof(struct kvm_regs))) + r = gvmUpdateReturnBuffer(pIrp, 0, kvm_regs, sizeof(struct kvm_regs)); + if (r) goto out_free1; r = 0; out_free1: kfree(kvm_regs); break; } - case KVM_SET_REGS: { + case GVM_SET_REGS: { struct kvm_regs *kvm_regs; r = -ENOMEM; @@ -2588,7 +2103,7 @@ out_free1: kfree(kvm_regs); break; } - case KVM_GET_SREGS: { + case GVM_GET_SREGS: { kvm_sregs = kzalloc(sizeof(struct kvm_sregs), GFP_KERNEL); r = -ENOMEM; if (!kvm_sregs) @@ -2596,13 +2111,13 @@ out_free1: r = kvm_arch_vcpu_ioctl_get_sregs(vcpu, kvm_sregs); if (r) goto out; - r = -EFAULT; - if (copy_to_user(argp, kvm_sregs, sizeof(struct kvm_sregs))) + r = gvmUpdateReturnBuffer(pIrp, 0, kvm_sregs, sizeof(struct kvm_sregs)); + if (r) goto out; r = 0; break; } - case KVM_SET_SREGS: { + case GVM_SET_SREGS: { kvm_sregs = memdup_user(argp, sizeof(*kvm_sregs)); if (IS_ERR(kvm_sregs)) { r = PTR_ERR(kvm_sregs); @@ -2612,19 
+2127,16 @@ out_free1: r = kvm_arch_vcpu_ioctl_set_sregs(vcpu, kvm_sregs); break; } - case KVM_GET_MP_STATE: { + case GVM_GET_MP_STATE: { struct kvm_mp_state mp_state; r = kvm_arch_vcpu_ioctl_get_mpstate(vcpu, &mp_state); if (r) goto out; - r = -EFAULT; - if (copy_to_user(argp, &mp_state, sizeof(mp_state))) - goto out; - r = 0; + r = gvmUpdateReturnBuffer(pIrp, 0, &mp_state, sizeof(mp_state)); break; } - case KVM_SET_MP_STATE: { + case GVM_SET_MP_STATE: { struct kvm_mp_state mp_state; r = -EFAULT; @@ -2633,7 +2145,7 @@ out_free1: r = kvm_arch_vcpu_ioctl_set_mpstate(vcpu, &mp_state); break; } - case KVM_TRANSLATE: { + case GVM_TRANSLATE: { struct kvm_translation tr; r = -EFAULT; @@ -2642,13 +2154,10 @@ out_free1: r = kvm_arch_vcpu_ioctl_translate(vcpu, &tr); if (r) goto out; - r = -EFAULT; - if (copy_to_user(argp, &tr, sizeof(tr))) - goto out; - r = 0; + r = gvmUpdateReturnBuffer(pIrp, 0, &tr, sizeof(tr)); break; } - case KVM_SET_GUEST_DEBUG: { + case GVM_SET_GUEST_DEBUG: { struct kvm_guest_debug dbg; r = -EFAULT; @@ -2657,30 +2166,7 @@ out_free1: r = kvm_arch_vcpu_ioctl_set_guest_debug(vcpu, &dbg); break; } - case KVM_SET_SIGNAL_MASK: { - struct kvm_signal_mask __user *sigmask_arg = argp; - struct kvm_signal_mask kvm_sigmask; - sigset_t sigset, *p; - - p = NULL; - if (argp) { - r = -EFAULT; - if (copy_from_user(&kvm_sigmask, argp, - sizeof(kvm_sigmask))) - goto out; - r = -EINVAL; - if (kvm_sigmask.len != sizeof(sigset)) - goto out; - r = -EFAULT; - if (copy_from_user(&sigset, sigmask_arg->sigset, - sizeof(sigset))) - goto out; - p = &sigset; - } - r = kvm_vcpu_ioctl_set_sigmask(vcpu, p); - break; - } - case KVM_GET_FPU: { + case GVM_GET_FPU: { fpu = kzalloc(sizeof(struct kvm_fpu), GFP_KERNEL); r = -ENOMEM; if (!fpu) @@ -2688,13 +2174,10 @@ out_free1: r = kvm_arch_vcpu_ioctl_get_fpu(vcpu, fpu); if (r) goto out; - r = -EFAULT; - if (copy_to_user(argp, fpu, sizeof(struct kvm_fpu))) - goto out; - r = 0; + r = gvmUpdateReturnBuffer(pIrp, 0, fpu, sizeof(struct kvm_fpu)); break; } - case KVM_SET_FPU: { + case GVM_SET_FPU: { fpu = memdup_user(argp, sizeof(*fpu)); if (IS_ERR(fpu)) { r = PTR_ERR(fpu); @@ -2705,260 +2188,57 @@ out_free1: break; } default: - r = kvm_arch_vcpu_ioctl(filp, ioctl, arg); + r = kvm_arch_vcpu_ioctl(devext, pIrp, ioctl); } out: - vcpu_put(vcpu); kfree(fpu); kfree(kvm_sregs); return r; } -#ifdef CONFIG_KVM_COMPAT -static long kvm_vcpu_compat_ioctl(struct file *filp, - unsigned int ioctl, unsigned long arg) -{ - struct kvm_vcpu *vcpu = filp->private_data; - void __user *argp = compat_ptr(arg); - int r; - - if (vcpu->kvm->mm != current->mm) - return -EIO; - - switch (ioctl) { - case KVM_SET_SIGNAL_MASK: { - struct kvm_signal_mask __user *sigmask_arg = argp; - struct kvm_signal_mask kvm_sigmask; - compat_sigset_t csigset; - sigset_t sigset; - - if (argp) { - r = -EFAULT; - if (copy_from_user(&kvm_sigmask, argp, - sizeof(kvm_sigmask))) - goto out; - r = -EINVAL; - if (kvm_sigmask.len != sizeof(csigset)) - goto out; - r = -EFAULT; - if (copy_from_user(&csigset, sigmask_arg->sigset, - sizeof(csigset))) - goto out; - sigset_from_compat(&sigset, &csigset); - r = kvm_vcpu_ioctl_set_sigmask(vcpu, &sigset); - } else - r = kvm_vcpu_ioctl_set_sigmask(vcpu, NULL); - break; - } - default: - r = kvm_vcpu_ioctl(filp, ioctl, arg); - } - -out: - return r; -} -#endif - -static int kvm_device_ioctl_attr(struct kvm_device *dev, - int (*accessor)(struct kvm_device *dev, - struct kvm_device_attr *attr), - unsigned long arg) -{ - struct kvm_device_attr attr; - - if (!accessor) - return -EPERM; - - 
if (copy_from_user(&attr, (void __user *)arg, sizeof(attr))) - return -EFAULT; - - return accessor(dev, &attr); -} - -static long kvm_device_ioctl(struct file *filp, unsigned int ioctl, - unsigned long arg) -{ - struct kvm_device *dev = filp->private_data; - - switch (ioctl) { - case KVM_SET_DEVICE_ATTR: - return kvm_device_ioctl_attr(dev, dev->ops->set_attr, arg); - case KVM_GET_DEVICE_ATTR: - return kvm_device_ioctl_attr(dev, dev->ops->get_attr, arg); - case KVM_HAS_DEVICE_ATTR: - return kvm_device_ioctl_attr(dev, dev->ops->has_attr, arg); - default: - if (dev->ops->ioctl) - return dev->ops->ioctl(dev, ioctl, arg); - - return -ENOTTY; - } -} - -static int kvm_device_release(struct inode *inode, struct file *filp) -{ - struct kvm_device *dev = filp->private_data; - struct kvm *kvm = dev->kvm; - - kvm_put_kvm(kvm); - return 0; -} - -static const struct file_operations kvm_device_fops = { - .unlocked_ioctl = kvm_device_ioctl, -#ifdef CONFIG_KVM_COMPAT - .compat_ioctl = kvm_device_ioctl, -#endif - .release = kvm_device_release, -}; - -struct kvm_device *kvm_device_from_filp(struct file *filp) -{ - if (filp->f_op != &kvm_device_fops) - return NULL; - - return filp->private_data; -} - -static struct kvm_device_ops *kvm_device_ops_table[KVM_DEV_TYPE_MAX] = { -#ifdef CONFIG_KVM_MPIC - [KVM_DEV_TYPE_FSL_MPIC_20] = &kvm_mpic_ops, - [KVM_DEV_TYPE_FSL_MPIC_42] = &kvm_mpic_ops, -#endif - -#ifdef CONFIG_KVM_XICS - [KVM_DEV_TYPE_XICS] = &kvm_xics_ops, -#endif -}; - -int kvm_register_device_ops(struct kvm_device_ops *ops, u32 type) -{ - if (type >= ARRAY_SIZE(kvm_device_ops_table)) - return -ENOSPC; - - if (kvm_device_ops_table[type] != NULL) - return -EEXIST; - - kvm_device_ops_table[type] = ops; - return 0; -} - -void kvm_unregister_device_ops(u32 type) -{ - if (kvm_device_ops_table[type] != NULL) - kvm_device_ops_table[type] = NULL; -} - -static int kvm_ioctl_create_device(struct kvm *kvm, - struct kvm_create_device *cd) -{ - struct kvm_device_ops *ops = NULL; - struct kvm_device *dev; - bool test = cd->flags & KVM_CREATE_DEVICE_TEST; - int ret; - - if (cd->type >= ARRAY_SIZE(kvm_device_ops_table)) - return -ENODEV; - - ops = kvm_device_ops_table[cd->type]; - if (ops == NULL) - return -ENODEV; - - if (test) - return 0; - - dev = kzalloc(sizeof(*dev), GFP_KERNEL); - if (!dev) - return -ENOMEM; - - dev->ops = ops; - dev->kvm = kvm; - - mutex_lock(&kvm->lock); - ret = ops->create(dev, cd->type); - if (ret < 0) { - mutex_unlock(&kvm->lock); - kfree(dev); - return ret; - } - list_add(&dev->vm_node, &kvm->devices); - mutex_unlock(&kvm->lock); - - if (ops->init) - ops->init(dev); - - ret = anon_inode_getfd(ops->name, &kvm_device_fops, dev, O_RDWR | O_CLOEXEC); - if (ret < 0) { - ops->destroy(dev); - mutex_lock(&kvm->lock); - list_del(&dev->vm_node); - mutex_unlock(&kvm->lock); - return ret; - } - - kvm_get_kvm(kvm); - cd->fd = ret; - return 0; -} - static long kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg) { switch (arg) { - case KVM_CAP_USER_MEMORY: - case KVM_CAP_DESTROY_MEMORY_REGION_WORKS: - case KVM_CAP_JOIN_MEMORY_REGIONS_WORKS: - case KVM_CAP_INTERNAL_ERROR_DATA: -#ifdef CONFIG_HAVE_KVM_MSI - case KVM_CAP_SIGNAL_MSI: -#endif -#ifdef CONFIG_HAVE_KVM_IRQFD - case KVM_CAP_IRQFD: - case KVM_CAP_IRQFD_RESAMPLE: -#endif - case KVM_CAP_IOEVENTFD_ANY_LENGTH: - case KVM_CAP_CHECK_EXTENSION_VM: - return 1; -#ifdef CONFIG_HAVE_KVM_IRQ_ROUTING - case KVM_CAP_IRQ_ROUTING: - return KVM_MAX_IRQ_ROUTES; +#ifdef CONFIG_HAVE_GVM_MSI + case GVM_CAP_SIGNAL_MSI: #endif -#if KVM_ADDRESS_SPACE_NUM > 1 - 
case KVM_CAP_MULTI_ADDRESS_SPACE: - return KVM_ADDRESS_SPACE_NUM; + case GVM_CAP_IRQ_ROUTING: + return GVM_MAX_IRQ_ROUTES; +#if GVM_ADDRESS_SPACE_NUM > 1 + case GVM_CAP_MULTI_ADDRESS_SPACE: + return GVM_ADDRESS_SPACE_NUM; #endif - case KVM_CAP_MAX_VCPU_ID: - return KVM_MAX_VCPU_ID; + case GVM_CAP_MAX_VCPU_ID: + return GVM_MAX_VCPU_ID; default: break; } return kvm_vm_ioctl_check_extension(kvm, arg); } -static long kvm_vm_ioctl(struct file *filp, - unsigned int ioctl, unsigned long arg) +NTSTATUS kvm_vm_ioctl(PDEVICE_OBJECT pDevObj, PIRP pIrp, + unsigned int ioctl) { - struct kvm *kvm = filp->private_data; - void __user *argp = (void __user *)arg; + struct gvm_device_extension *devext = pDevObj->DeviceExtension; + struct kvm *kvm = devext->PrivData; + void __user *argp = (void __user *)pIrp->AssociatedIrp.SystemBuffer; int r; - if (kvm->mm != current->mm) + if (kvm->process != IoGetCurrentProcess()) return -EIO; switch (ioctl) { - case KVM_CREATE_VCPU: - r = kvm_vm_ioctl_create_vcpu(kvm, arg); + case GVM_CREATE_VCPU: + r = kvm_vm_ioctl_create_vcpu(pDevObj, pIrp, argp); break; - case KVM_SET_USER_MEMORY_REGION: { + case GVM_SET_USER_MEMORY_REGION: { struct kvm_userspace_memory_region kvm_userspace_mem; r = -EFAULT; - if (copy_from_user(&kvm_userspace_mem, argp, - sizeof(kvm_userspace_mem))) - goto out; - + RtlCopyBytes(&kvm_userspace_mem, pIrp->AssociatedIrp.SystemBuffer, sizeof(kvm_userspace_mem)); r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem); break; } - case KVM_GET_DIRTY_LOG: { + case GVM_GET_DIRTY_LOG: { struct kvm_dirty_log log; r = -EFAULT; @@ -2967,46 +2247,18 @@ static long kvm_vm_ioctl(struct file *filp, r = kvm_vm_ioctl_get_dirty_log(kvm, &log); break; } -#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET - case KVM_REGISTER_COALESCED_MMIO: { - struct kvm_coalesced_mmio_zone zone; - - r = -EFAULT; - if (copy_from_user(&zone, argp, sizeof(zone))) - goto out; - r = kvm_vm_ioctl_register_coalesced_mmio(kvm, &zone); + case GVM_KICK_VCPU: + r = kvm_vm_ioctl_kick_vcpu(pDevObj, pIrp, argp); break; - } - case KVM_UNREGISTER_COALESCED_MMIO: { - struct kvm_coalesced_mmio_zone zone; + case GVM_RAM_PROTECT: + struct gvm_ram_protect rp; r = -EFAULT; - if (copy_from_user(&zone, argp, sizeof(zone))) - goto out; - r = kvm_vm_ioctl_unregister_coalesced_mmio(kvm, &zone); + RtlCopyBytes(&rp, pIrp->AssociatedIrp.SystemBuffer, sizeof(rp)); + r = kvm_vm_ioctl_ram_prot(kvm, &rp); break; - } -#endif - case KVM_IRQFD: { - struct kvm_irqfd data; - - r = -EFAULT; - if (copy_from_user(&data, argp, sizeof(data))) - goto out; - r = kvm_irqfd(kvm, &data); - break; - } - case KVM_IOEVENTFD: { - struct kvm_ioeventfd data; - - r = -EFAULT; - if (copy_from_user(&data, argp, sizeof(data))) - goto out; - r = kvm_ioeventfd(kvm, &data); - break; - } -#ifdef CONFIG_HAVE_KVM_MSI - case KVM_SIGNAL_MSI: { +#ifdef CONFIG_HAVE_GVM_MSI + case GVM_SIGNAL_MSI: { struct kvm_msi msi; r = -EFAULT; @@ -3016,32 +2268,28 @@ static long kvm_vm_ioctl(struct file *filp, break; } #endif -#ifdef __KVM_HAVE_IRQ_LINE - case KVM_IRQ_LINE_STATUS: - case KVM_IRQ_LINE: { + case GVM_IRQ_LINE_STATUS: { struct kvm_irq_level irq_event; r = -EFAULT; if (copy_from_user(&irq_event, argp, sizeof(irq_event))) goto out; - r = kvm_vm_ioctl_irq_line(kvm, &irq_event, - ioctl == KVM_IRQ_LINE_STATUS); + r = kvm_vm_ioctl_irq_line(kvm, &irq_event, true); if (r) goto out; - r = -EFAULT; - if (ioctl == KVM_IRQ_LINE_STATUS) { - if (copy_to_user(argp, &irq_event, sizeof(irq_event))) + if (ioctl == GVM_IRQ_LINE_STATUS) { + r = gvmUpdateReturnBuffer(pIrp, 0, 
&irq_event, + sizeof(irq_event)); + if (r) goto out; } r = 0; break; } -#endif -#ifdef CONFIG_HAVE_KVM_IRQ_ROUTING - case KVM_SET_GSI_ROUTING: { + case GVM_SET_GSI_ROUTING: { struct kvm_irq_routing routing; struct kvm_irq_routing __user *urouting; struct kvm_irq_routing_entry *entries = NULL; @@ -3050,7 +2298,7 @@ static long kvm_vm_ioctl(struct file *filp, if (copy_from_user(&routing, argp, sizeof(routing))) goto out; r = -EINVAL; - if (routing.nr > KVM_MAX_IRQ_ROUTES) + if (routing.nr > GVM_MAX_IRQ_ROUTES) goto out; if (routing.flags) goto out; @@ -3071,178 +2319,66 @@ out_free_irq_routing: vfree(entries); break; } -#endif /* CONFIG_HAVE_KVM_IRQ_ROUTING */ - case KVM_CREATE_DEVICE: { - struct kvm_create_device cd; - - r = -EFAULT; - if (copy_from_user(&cd, argp, sizeof(cd))) - goto out; - - r = kvm_ioctl_create_device(kvm, &cd); - if (r) - goto out; - - r = -EFAULT; - if (copy_to_user(argp, &cd, sizeof(cd))) - goto out; - - r = 0; - break; - } - case KVM_CHECK_EXTENSION: - r = kvm_vm_ioctl_check_extension_generic(kvm, arg); + case GVM_CHECK_EXTENSION: + r = kvm_vm_ioctl_check_extension_generic(kvm, *(long *)argp); + gvmUpdateReturnBuffer(pIrp, 0, &r, sizeof(r)); + r = STATUS_SUCCESS; break; default: - r = kvm_arch_vm_ioctl(filp, ioctl, arg); + r = kvm_arch_vm_ioctl(devext, pIrp, ioctl); } out: return r; } -#ifdef CONFIG_KVM_COMPAT -struct compat_kvm_dirty_log { - __u32 slot; - __u32 padding1; - union { - compat_uptr_t dirty_bitmap; /* one bit per page */ - __u64 padding2; - }; -}; - -static long kvm_vm_compat_ioctl(struct file *filp, - unsigned int ioctl, unsigned long arg) +static int kvm_dev_ioctl_create_vm(PDEVICE_OBJECT pDevObj, PIRP pIrp, unsigned long arg) { - struct kvm *kvm = filp->private_data; - int r; - - if (kvm->mm != current->mm) - return -EIO; - switch (ioctl) { - case KVM_GET_DIRTY_LOG: { - struct compat_kvm_dirty_log compat_log; - struct kvm_dirty_log log; - - r = -EFAULT; - if (copy_from_user(&compat_log, (void __user *)arg, - sizeof(compat_log))) - goto out; - log.slot = compat_log.slot; - log.padding1 = compat_log.padding1; - log.padding2 = compat_log.padding2; - log.dirty_bitmap = compat_ptr(compat_log.dirty_bitmap); - - r = kvm_vm_ioctl_get_dirty_log(kvm, &log); - break; - } - default: - r = kvm_vm_ioctl(filp, ioctl, arg); - } - -out: - return r; -} -#endif - -static struct file_operations kvm_vm_fops = { - .release = kvm_vm_release, - .unlocked_ioctl = kvm_vm_ioctl, -#ifdef CONFIG_KVM_COMPAT - .compat_ioctl = kvm_vm_compat_ioctl, -#endif - .llseek = noop_llseek, -}; - -static int kvm_dev_ioctl_create_vm(unsigned long type) -{ - int r; struct kvm *kvm; - struct file *file; + NTSTATUS rc; + HANDLE handle; + unsigned int type = arg; kvm = kvm_create_vm(type); if (IS_ERR(kvm)) return PTR_ERR(kvm); -#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET - r = kvm_coalesced_mmio_init(kvm); - if (r < 0) { - kvm_put_kvm(kvm); - return r; - } -#endif - r = get_unused_fd_flags(O_CLOEXEC); - if (r < 0) { - kvm_put_kvm(kvm); - return r; - } - file = anon_inode_getfile("kvm-vm", &kvm_vm_fops, kvm, O_RDWR); - if (IS_ERR(file)) { - put_unused_fd(r); - kvm_put_kvm(kvm); - return PTR_ERR(file); - } - if (kvm_create_vm_debugfs(kvm, r) < 0) { - put_unused_fd(r); - fput(file); - return -ENOMEM; - } - - fd_install(r, file); - return r; + rc = gvmCreateVMDevice(&handle, kvm->vm_id, -1, kvm); + if (NT_SUCCESS(rc)) + gvmUpdateReturnBuffer(pIrp, 0, &handle, sizeof(handle)); + return rc; } -static long kvm_dev_ioctl(struct file *filp, - unsigned int ioctl, unsigned long arg) +NTSTATUS 
kvm_dev_ioctl(PDEVICE_OBJECT pDevObj, PIRP pIrp, + unsigned int ioctl) { long r = -EINVAL; + struct gvm_device_extension *devext = pDevObj->DeviceExtension; + void* pin = pIrp->AssociatedIrp.SystemBuffer; switch (ioctl) { - case KVM_GET_API_VERSION: - if (arg) - goto out; - r = KVM_API_VERSION; + case GVM_GET_API_VERSION: + r = GVM_VERSION; + gvmUpdateReturnBuffer(pIrp, 0, &r, sizeof(r)); + r = STATUS_SUCCESS; break; - case KVM_CREATE_VM: - r = kvm_dev_ioctl_create_vm(arg); + case GVM_CREATE_VM: + r = kvm_dev_ioctl_create_vm(pDevObj, pIrp, 0); break; - case KVM_CHECK_EXTENSION: - r = kvm_vm_ioctl_check_extension_generic(NULL, arg); + case GVM_CHECK_EXTENSION: + r = kvm_vm_ioctl_check_extension_generic(NULL, *(long *)pin); + gvmUpdateReturnBuffer(pIrp, 0, &r, sizeof(r)); + r = STATUS_SUCCESS; break; - case KVM_GET_VCPU_MMAP_SIZE: - if (arg) - goto out; - r = PAGE_SIZE; /* struct kvm_run */ -#ifdef CONFIG_X86 - r += PAGE_SIZE; /* pio data page */ -#endif -#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET - r += PAGE_SIZE; /* coalesced mmio ring page */ -#endif - break; - case KVM_TRACE_ENABLE: - case KVM_TRACE_PAUSE: - case KVM_TRACE_DISABLE: - r = -EOPNOTSUPP; + case GVM_GET_VCPU_MMAP_SIZE: + long mmap_size = 2 * PAGE_SIZE; + r = gvmUpdateReturnBuffer(pIrp, 0, &mmap_size, sizeof(mmap_size)); break; default: - return kvm_arch_dev_ioctl(filp, ioctl, arg); + return kvm_arch_dev_ioctl(devext, pIrp, ioctl); } -out: return r; } -static struct file_operations kvm_chardev_ops = { - .unlocked_ioctl = kvm_dev_ioctl, - .compat_ioctl = kvm_dev_ioctl, - .llseek = noop_llseek, -}; - -static struct miscdevice kvm_dev = { - KVM_MINOR, - "kvm", - &kvm_chardev_ops, -}; - static void hardware_enable_nolock(void *junk) { int cpu = raw_smp_processor_id(); @@ -3260,6 +2396,8 @@ static void hardware_enable_nolock(void *junk) atomic_inc(&hardware_enable_failed); pr_info("kvm: enabling virtualization on CPU%d failed\n", cpu); } + + return; } static int kvm_starting_cpu(unsigned int cpu) @@ -3296,7 +2434,7 @@ static void hardware_disable_all_nolock(void) kvm_usage_count--; if (!kvm_usage_count) - on_each_cpu(hardware_disable_nolock, NULL, 1); + smp_call_function_many(cpu_online_mask, hardware_disable_nolock, NULL, 1); } static void hardware_disable_all(void) @@ -3315,8 +2453,7 @@ static int hardware_enable_all(void) kvm_usage_count++; if (kvm_usage_count == 1) { atomic_set(&hardware_enable_failed, 0); - on_each_cpu(hardware_enable_nolock, NULL, 1); - + smp_call_function_many(cpu_online_mask, hardware_enable_nolock, NULL, 1); if (atomic_read(&hardware_enable_failed)) { hardware_disable_all_nolock(); r = -EBUSY; @@ -3328,26 +2465,6 @@ static int hardware_enable_all(void) return r; } -static int kvm_reboot(struct notifier_block *notifier, unsigned long val, - void *v) -{ - /* - * Some (well, at least mine) BIOSes hang on reboot if - * in vmx root mode. - * - * And Intel TXT required VMX off for all cpu when system shutdown. - */ - pr_info("kvm: exiting hardware virtualization\n"); - kvm_rebooting = true; - on_each_cpu(hardware_disable_nolock, NULL, 1); - return NOTIFY_OK; -} - -static struct notifier_block kvm_reboot_notifier = { - .notifier_call = kvm_reboot, - .priority = 0, -}; - static void kvm_io_bus_destroy(struct kvm_io_bus *bus) { int i; @@ -3463,6 +2580,7 @@ int kvm_io_bus_write(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr, }; bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu); + bus = vcpu->kvm->buses[bus_idx]; r = __kvm_io_bus_write(vcpu, bus, &range, val); return r < 0 ? 
r : 0; } @@ -3480,6 +2598,7 @@ int kvm_io_bus_write_cookie(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, }; bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu); + bus = vcpu->kvm->buses[bus_idx]; /* First try the device referenced by cookie. */ if ((cookie >= 0) && (cookie < bus->dev_count) && @@ -3514,7 +2633,6 @@ static int __kvm_io_bus_read(struct kvm_vcpu *vcpu, struct kvm_io_bus *bus, return -EOPNOTSUPP; } -EXPORT_SYMBOL_GPL(kvm_io_bus_write); /* kvm_io_bus_read - called under kvm->slots_lock */ int kvm_io_bus_read(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr, @@ -3530,6 +2648,7 @@ int kvm_io_bus_read(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr, }; bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu); + bus = vcpu->kvm->buses[bus_idx]; r = __kvm_io_bus_read(vcpu, bus, &range, val); return r < 0 ? r : 0; } @@ -3542,8 +2661,7 @@ int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, struct kvm_io_bus *new_bus, *bus; bus = kvm->buses[bus_idx]; - /* exclude ioeventfd which is limited by maximum fd */ - if (bus->dev_count - bus->ioeventfd_count > NR_IOBUS_DEVS - 1) + if (bus->dev_count > NR_IOBUS_DEVS - 1) return -ENOSPC; new_bus = kmalloc(sizeof(*bus) + ((bus->dev_count + 1) * @@ -3604,6 +2722,7 @@ struct kvm_io_device *kvm_io_bus_get_dev(struct kvm *kvm, enum kvm_bus bus_idx, srcu_idx = srcu_read_lock(&kvm->srcu); bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu); + bus = kvm->buses[bus_idx]; dev_idx = kvm_io_bus_get_first_dev(bus, addr, 1); if (dev_idx < 0) @@ -3616,241 +2735,35 @@ out_unlock: return iodev; } -EXPORT_SYMBOL_GPL(kvm_io_bus_get_dev); - -static int kvm_debugfs_open(struct inode *inode, struct file *file, - int (*get)(void *, u64 *), int (*set)(void *, u64), - const char *fmt) -{ - struct kvm_stat_data *stat_data = (struct kvm_stat_data *) - inode->i_private; - - /* The debugfs files are a reference to the kvm struct which - * is still valid when kvm_destroy_vm is called. - * To avoid the race between open and the removal of the debugfs - * directory we test against the users count. 
- */ - if (!atomic_add_unless(&stat_data->kvm->users_count, 1, 0)) - return -ENOENT; - - if (simple_attr_open(inode, file, get, set, fmt)) { - kvm_put_kvm(stat_data->kvm); - return -ENOMEM; - } - - return 0; -} - -static int kvm_debugfs_release(struct inode *inode, struct file *file) -{ - struct kvm_stat_data *stat_data = (struct kvm_stat_data *) - inode->i_private; - - simple_attr_release(inode, file); - kvm_put_kvm(stat_data->kvm); - - return 0; -} - -static int vm_stat_get_per_vm(void *data, u64 *val) -{ - struct kvm_stat_data *stat_data = (struct kvm_stat_data *)data; - - *val = *(ulong *)((void *)stat_data->kvm + stat_data->offset); - - return 0; -} - -static int vm_stat_get_per_vm_open(struct inode *inode, struct file *file) -{ - __simple_attr_check_format("%llu\n", 0ull); - return kvm_debugfs_open(inode, file, vm_stat_get_per_vm, - NULL, "%llu\n"); -} - -static const struct file_operations vm_stat_get_per_vm_fops = { - .owner = THIS_MODULE, - .open = vm_stat_get_per_vm_open, - .release = kvm_debugfs_release, - .read = simple_attr_read, - .write = simple_attr_write, - .llseek = generic_file_llseek, -}; - -static int vcpu_stat_get_per_vm(void *data, u64 *val) -{ - int i; - struct kvm_stat_data *stat_data = (struct kvm_stat_data *)data; - struct kvm_vcpu *vcpu; - - *val = 0; - kvm_for_each_vcpu(i, vcpu, stat_data->kvm) - *val += *(u64 *)((void *)vcpu + stat_data->offset); - - return 0; -} - -static int vcpu_stat_get_per_vm_open(struct inode *inode, struct file *file) -{ - __simple_attr_check_format("%llu\n", 0ull); - return kvm_debugfs_open(inode, file, vcpu_stat_get_per_vm, - NULL, "%llu\n"); -} - -static const struct file_operations vcpu_stat_get_per_vm_fops = { - .owner = THIS_MODULE, - .open = vcpu_stat_get_per_vm_open, - .release = kvm_debugfs_release, - .read = simple_attr_read, - .write = simple_attr_write, - .llseek = generic_file_llseek, -}; - -static const struct file_operations *stat_fops_per_vm[] = { - [KVM_STAT_VCPU] = &vcpu_stat_get_per_vm_fops, - [KVM_STAT_VM] = &vm_stat_get_per_vm_fops, -}; - -static int vm_stat_get(void *_offset, u64 *val) -{ - unsigned offset = (long)_offset; - struct kvm *kvm; - struct kvm_stat_data stat_tmp = {.offset = offset}; - u64 tmp_val; - - *val = 0; - spin_lock(&kvm_lock); - list_for_each_entry(kvm, &vm_list, vm_list) { - stat_tmp.kvm = kvm; - vm_stat_get_per_vm((void *)&stat_tmp, &tmp_val); - *val += tmp_val; - } - spin_unlock(&kvm_lock); - return 0; -} - -DEFINE_SIMPLE_ATTRIBUTE(vm_stat_fops, vm_stat_get, NULL, "%llu\n"); - -static int vcpu_stat_get(void *_offset, u64 *val) -{ - unsigned offset = (long)_offset; - struct kvm *kvm; - struct kvm_stat_data stat_tmp = {.offset = offset}; - u64 tmp_val; - - *val = 0; - spin_lock(&kvm_lock); - list_for_each_entry(kvm, &vm_list, vm_list) { - stat_tmp.kvm = kvm; - vcpu_stat_get_per_vm((void *)&stat_tmp, &tmp_val); - *val += tmp_val; - } - spin_unlock(&kvm_lock); - return 0; -} - -DEFINE_SIMPLE_ATTRIBUTE(vcpu_stat_fops, vcpu_stat_get, NULL, "%llu\n"); - -static const struct file_operations *stat_fops[] = { - [KVM_STAT_VCPU] = &vcpu_stat_fops, - [KVM_STAT_VM] = &vm_stat_fops, -}; - -static int kvm_init_debug(void) -{ - int r = -EEXIST; - struct kvm_stats_debugfs_item *p; - - kvm_debugfs_dir = debugfs_create_dir("kvm", NULL); - if (kvm_debugfs_dir == NULL) - goto out; - - kvm_debugfs_num_entries = 0; - for (p = debugfs_entries; p->name; ++p, kvm_debugfs_num_entries++) { - if (!debugfs_create_file(p->name, 0444, kvm_debugfs_dir, - (void *)(long)p->offset, - stat_fops[p->kind])) - goto out_dir; - } - - 
return 0; - -out_dir: - debugfs_remove_recursive(kvm_debugfs_dir); -out: - return r; -} - -static int kvm_suspend(void) +/* + * The following two functions are kept here so that they + * could be used once hooking driver with Windows Power State + * chage. + */ +int kvm_suspend(void) { if (kvm_usage_count) - hardware_disable_nolock(NULL); + smp_call_function_many(cpu_online_mask, + hardware_disable_nolock, NULL, 1); return 0; } -static void kvm_resume(void) -{ - if (kvm_usage_count) { - WARN_ON(raw_spin_is_locked(&kvm_count_lock)); - hardware_enable_nolock(NULL); - } -} - -static struct syscore_ops kvm_syscore_ops = { - .suspend = kvm_suspend, - .resume = kvm_resume, -}; - -static inline -struct kvm_vcpu *preempt_notifier_to_vcpu(struct preempt_notifier *pn) -{ - return container_of(pn, struct kvm_vcpu, preempt_notifier); -} - -static void kvm_sched_in(struct preempt_notifier *pn, int cpu) +void kvm_resume(void) { - struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn); - - if (vcpu->preempted) - vcpu->preempted = false; - - kvm_arch_sched_in(vcpu, cpu); - - kvm_arch_vcpu_load(vcpu, cpu); -} - -static void kvm_sched_out(struct preempt_notifier *pn, - struct task_struct *next) -{ - struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn); - - if (current->state == TASK_RUNNING) - vcpu->preempted = true; - kvm_arch_vcpu_put(vcpu); + if (kvm_usage_count) + smp_call_function_many(cpu_online_mask, + hardware_enable_nolock, NULL, 1); } -int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align, - struct module *module) +int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align) { int r; - int cpu; r = kvm_arch_init(opaque); if (r) goto out_fail; - /* - * kvm_arch_init makes sure there's at most one caller - * for architectures that support multiple implementations, - * like intel and amd on x86. - * kvm_arch_init must be called before kvm_irqfd_init to avoid creating - * conflicts in case kvm is already setup for another implementation. - */ - r = kvm_irqfd_init(); - if (r) - goto out_irqfd; - if (!zalloc_cpumask_var(&cpus_hardware_enabled, GFP_KERNEL)) { r = -ENOMEM; goto out_free_0; @@ -3860,98 +2773,27 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align, if (r < 0) goto out_free_0a; - for_each_online_cpu(cpu) { - smp_call_function_single(cpu, - kvm_arch_check_processor_compat, - &r, 1); - if (r < 0) - goto out_free_1; - } - - r = cpuhp_setup_state_nocalls(CPUHP_AP_KVM_STARTING, "AP_KVM_STARTING", - kvm_starting_cpu, kvm_dying_cpu); - if (r) - goto out_free_2; - register_reboot_notifier(&kvm_reboot_notifier); - - /* A kmem cache lets us meet the alignment requirements of fx_save. 
*/ - if (!vcpu_align) - vcpu_align = __alignof__(struct kvm_vcpu); - kvm_vcpu_cache = kmem_cache_create("kvm_vcpu", vcpu_size, vcpu_align, - 0, NULL); - if (!kvm_vcpu_cache) { - r = -ENOMEM; - goto out_free_3; - } - - r = kvm_async_pf_init(); - if (r) - goto out_free; - - kvm_chardev_ops.owner = module; - kvm_vm_fops.owner = module; - kvm_vcpu_fops.owner = module; - - r = misc_register(&kvm_dev); - if (r) { - pr_err("kvm: misc device register failed\n"); - goto out_unreg; - } - - register_syscore_ops(&kvm_syscore_ops); - - kvm_preempt_ops.sched_in = kvm_sched_in; - kvm_preempt_ops.sched_out = kvm_sched_out; - - r = kvm_init_debug(); - if (r) { - pr_err("kvm: create debugfs files failed\n"); - goto out_undebugfs; - } - - r = kvm_vfio_ops_init(); - WARN_ON(r); + kvm_arch_check_processor_compat(&r); + if (r < 0) + goto out_free_1; return 0; -out_undebugfs: - unregister_syscore_ops(&kvm_syscore_ops); - misc_deregister(&kvm_dev); -out_unreg: - kvm_async_pf_deinit(); -out_free: - kmem_cache_destroy(kvm_vcpu_cache); -out_free_3: - unregister_reboot_notifier(&kvm_reboot_notifier); - cpuhp_remove_state_nocalls(CPUHP_AP_KVM_STARTING); -out_free_2: out_free_1: kvm_arch_hardware_unsetup(); out_free_0a: free_cpumask_var(cpus_hardware_enabled); out_free_0: - kvm_irqfd_exit(); -out_irqfd: kvm_arch_exit(); out_fail: return r; } -EXPORT_SYMBOL_GPL(kvm_init); void kvm_exit(void) { - debugfs_remove_recursive(kvm_debugfs_dir); - misc_deregister(&kvm_dev); - kmem_cache_destroy(kvm_vcpu_cache); - kvm_async_pf_deinit(); - unregister_syscore_ops(&kvm_syscore_ops); - unregister_reboot_notifier(&kvm_reboot_notifier); - cpuhp_remove_state_nocalls(CPUHP_AP_KVM_STARTING); - on_each_cpu(hardware_disable_nolock, NULL, 1); + smp_call_function_many(cpu_online_mask, + hardware_disable_nolock, NULL, 1); kvm_arch_hardware_unsetup(); kvm_arch_exit(); - kvm_irqfd_exit(); free_cpumask_var(cpus_hardware_enabled); - kvm_vfio_ops_exit(); } -EXPORT_SYMBOL_GPL(kvm_exit); diff --git a/virt/kvm/vfio.c b/virt/kvm/vfio.c deleted file mode 100644 index 1dd087d..0000000 --- a/virt/kvm/vfio.c +++ /dev/null @@ -1,295 +0,0 @@ -/* - * VFIO-KVM bridge pseudo device - * - * Copyright (C) 2013 Red Hat, Inc. All rights reserved. - * Author: Alex Williamson <alex.williamson@redhat.com> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. 
- */ - -#include <linux/errno.h> -#include <linux/file.h> -#include <linux/kvm_host.h> -#include <linux/list.h> -#include <linux/module.h> -#include <linux/mutex.h> -#include <linux/slab.h> -#include <linux/uaccess.h> -#include <linux/vfio.h> -#include "vfio.h" - -struct kvm_vfio_group { - struct list_head node; - struct vfio_group *vfio_group; -}; - -struct kvm_vfio { - struct list_head group_list; - struct mutex lock; - bool noncoherent; -}; - -static struct vfio_group *kvm_vfio_group_get_external_user(struct file *filep) -{ - struct vfio_group *vfio_group; - struct vfio_group *(*fn)(struct file *); - - fn = symbol_get(vfio_group_get_external_user); - if (!fn) - return ERR_PTR(-EINVAL); - - vfio_group = fn(filep); - - symbol_put(vfio_group_get_external_user); - - return vfio_group; -} - -static void kvm_vfio_group_put_external_user(struct vfio_group *vfio_group) -{ - void (*fn)(struct vfio_group *); - - fn = symbol_get(vfio_group_put_external_user); - if (!fn) - return; - - fn(vfio_group); - - symbol_put(vfio_group_put_external_user); -} - -static bool kvm_vfio_group_is_coherent(struct vfio_group *vfio_group) -{ - long (*fn)(struct vfio_group *, unsigned long); - long ret; - - fn = symbol_get(vfio_external_check_extension); - if (!fn) - return false; - - ret = fn(vfio_group, VFIO_DMA_CC_IOMMU); - - symbol_put(vfio_external_check_extension); - - return ret > 0; -} - -/* - * Groups can use the same or different IOMMU domains. If the same then - * adding a new group may change the coherency of groups we've previously - * been told about. We don't want to care about any of that so we retest - * each group and bail as soon as we find one that's noncoherent. This - * means we only ever [un]register_noncoherent_dma once for the whole device. - */ -static void kvm_vfio_update_coherency(struct kvm_device *dev) -{ - struct kvm_vfio *kv = dev->private; - bool noncoherent = false; - struct kvm_vfio_group *kvg; - - mutex_lock(&kv->lock); - - list_for_each_entry(kvg, &kv->group_list, node) { - if (!kvm_vfio_group_is_coherent(kvg->vfio_group)) { - noncoherent = true; - break; - } - } - - if (noncoherent != kv->noncoherent) { - kv->noncoherent = noncoherent; - - if (kv->noncoherent) - kvm_arch_register_noncoherent_dma(dev->kvm); - else - kvm_arch_unregister_noncoherent_dma(dev->kvm); - } - - mutex_unlock(&kv->lock); -} - -static int kvm_vfio_set_group(struct kvm_device *dev, long attr, u64 arg) -{ - struct kvm_vfio *kv = dev->private; - struct vfio_group *vfio_group; - struct kvm_vfio_group *kvg; - int32_t __user *argp = (int32_t __user *)(unsigned long)arg; - struct fd f; - int32_t fd; - int ret; - - switch (attr) { - case KVM_DEV_VFIO_GROUP_ADD: - if (get_user(fd, argp)) - return -EFAULT; - - f = fdget(fd); - if (!f.file) - return -EBADF; - - vfio_group = kvm_vfio_group_get_external_user(f.file); - fdput(f); - - if (IS_ERR(vfio_group)) - return PTR_ERR(vfio_group); - - mutex_lock(&kv->lock); - - list_for_each_entry(kvg, &kv->group_list, node) { - if (kvg->vfio_group == vfio_group) { - mutex_unlock(&kv->lock); - kvm_vfio_group_put_external_user(vfio_group); - return -EEXIST; - } - } - - kvg = kzalloc(sizeof(*kvg), GFP_KERNEL); - if (!kvg) { - mutex_unlock(&kv->lock); - kvm_vfio_group_put_external_user(vfio_group); - return -ENOMEM; - } - - list_add_tail(&kvg->node, &kv->group_list); - kvg->vfio_group = vfio_group; - - kvm_arch_start_assignment(dev->kvm); - - mutex_unlock(&kv->lock); - - kvm_vfio_update_coherency(dev); - - return 0; - - case KVM_DEV_VFIO_GROUP_DEL: - if (get_user(fd, argp)) - return 
-EFAULT; - - f = fdget(fd); - if (!f.file) - return -EBADF; - - vfio_group = kvm_vfio_group_get_external_user(f.file); - fdput(f); - - if (IS_ERR(vfio_group)) - return PTR_ERR(vfio_group); - - ret = -ENOENT; - - mutex_lock(&kv->lock); - - list_for_each_entry(kvg, &kv->group_list, node) { - if (kvg->vfio_group != vfio_group) - continue; - - list_del(&kvg->node); - kvm_vfio_group_put_external_user(kvg->vfio_group); - kfree(kvg); - ret = 0; - break; - } - - kvm_arch_end_assignment(dev->kvm); - - mutex_unlock(&kv->lock); - - kvm_vfio_group_put_external_user(vfio_group); - - kvm_vfio_update_coherency(dev); - - return ret; - } - - return -ENXIO; -} - -static int kvm_vfio_set_attr(struct kvm_device *dev, - struct kvm_device_attr *attr) -{ - switch (attr->group) { - case KVM_DEV_VFIO_GROUP: - return kvm_vfio_set_group(dev, attr->attr, attr->addr); - } - - return -ENXIO; -} - -static int kvm_vfio_has_attr(struct kvm_device *dev, - struct kvm_device_attr *attr) -{ - switch (attr->group) { - case KVM_DEV_VFIO_GROUP: - switch (attr->attr) { - case KVM_DEV_VFIO_GROUP_ADD: - case KVM_DEV_VFIO_GROUP_DEL: - return 0; - } - - break; - } - - return -ENXIO; -} - -static void kvm_vfio_destroy(struct kvm_device *dev) -{ - struct kvm_vfio *kv = dev->private; - struct kvm_vfio_group *kvg, *tmp; - - list_for_each_entry_safe(kvg, tmp, &kv->group_list, node) { - kvm_vfio_group_put_external_user(kvg->vfio_group); - list_del(&kvg->node); - kfree(kvg); - kvm_arch_end_assignment(dev->kvm); - } - - kvm_vfio_update_coherency(dev); - - kfree(kv); - kfree(dev); /* alloc by kvm_ioctl_create_device, free by .destroy */ -} - -static int kvm_vfio_create(struct kvm_device *dev, u32 type); - -static struct kvm_device_ops kvm_vfio_ops = { - .name = "kvm-vfio", - .create = kvm_vfio_create, - .destroy = kvm_vfio_destroy, - .set_attr = kvm_vfio_set_attr, - .has_attr = kvm_vfio_has_attr, -}; - -static int kvm_vfio_create(struct kvm_device *dev, u32 type) -{ - struct kvm_device *tmp; - struct kvm_vfio *kv; - - /* Only one VFIO "device" per VM */ - list_for_each_entry(tmp, &dev->kvm->devices, vm_node) - if (tmp->ops == &kvm_vfio_ops) - return -EBUSY; - - kv = kzalloc(sizeof(*kv), GFP_KERNEL); - if (!kv) - return -ENOMEM; - - INIT_LIST_HEAD(&kv->group_list); - mutex_init(&kv->lock); - - dev->private = kv; - - return 0; -} - -int kvm_vfio_ops_init(void) -{ - return kvm_register_device_ops(&kvm_vfio_ops, KVM_DEV_TYPE_VFIO); -} - -void kvm_vfio_ops_exit(void) -{ - kvm_unregister_device_ops(KVM_DEV_TYPE_VFIO); -} diff --git a/virt/kvm/vfio.h b/virt/kvm/vfio.h deleted file mode 100644 index ab88c7d..0000000 --- a/virt/kvm/vfio.h +++ /dev/null @@ -1,17 +0,0 @@ -#ifndef __KVM_VFIO_H -#define __KVM_VFIO_H - -#ifdef CONFIG_KVM_VFIO -int kvm_vfio_ops_init(void); -void kvm_vfio_ops_exit(void); -#else -static inline int kvm_vfio_ops_init(void) -{ - return 0; -} -static inline void kvm_vfio_ops_exit(void) -{ -} -#endif - -#endif |
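
A note on the kvm_vcpu_block()/kvm_vcpu_wake_up() rewrite near the top of this hunk: the Linux halt-polling and swait machinery is dropped, and a blocked vCPU instead waits on its kick_event with a relative timeout. A negative LARGE_INTEGER passed to KeWaitForSingleObject means "relative time in 100 ns units", so (u64)-1000000 is 1,000,000 x 100 ns = 100 ms between re-checks of kvm_vcpu_check_block(). Below is a minimal sketch of that wait/kick idiom with hypothetical names (my_event, my_blocked); it assumes a manual-reset NotificationEvent, since the driver calls KeClearEvent explicitly but its KeInitializeEvent call is outside this hunk.

    /* Sketch only; not the driver's actual per-vCPU fields. */
    #include <ntddk.h>

    static KEVENT my_event;            /* stands in for vcpu->kick_event */
    static volatile LONG my_blocked;   /* stands in for vcpu->blocked */

    VOID my_event_init(VOID)
    {
        KeInitializeEvent(&my_event, NotificationEvent, FALSE);
    }

    VOID my_block(BOOLEAN (*should_wake)(VOID))
    {
        LARGE_INTEGER expire;
        expire.QuadPart = -1000000LL;  /* negative => relative; 1,000,000 * 100 ns = 100 ms */

        InterlockedExchange(&my_blocked, 1);
        for (;;) {
            if (should_wake())         /* plays the role of kvm_vcpu_check_block() */
                break;
            /* Wake up on a kick or after 100 ms, then re-check. */
            KeWaitForSingleObject(&my_event, Executive, KernelMode, FALSE, &expire);
        }
        InterlockedExchange(&my_blocked, 0);
        KeClearEvent(&my_event);       /* manual-reset event: clear before leaving */
    }

    VOID my_kick(VOID)
    {
        if (my_blocked)                /* mirrors kvm_vcpu_wake_up() */
            KeSetEvent(&my_event, IO_NO_INCREMENT, FALSE);
    }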
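
The GVM_RAM_PROTECT path above tracks protected guest frames in kvm->rp_bitmap: the gpa range is converted to a gfn range (first_gfn = pa >> PAGE_SHIFT, last_gfn = (pa + size - 1) >> PAGE_SHIFT) and the bits are set or cleared in bulk instead of looping set_bit(). A user-space-testable sketch of the same block-fill idea follows, with hypothetical helper names and a mask that also handles a full 64-bit word.

    /* Sketch of the set_bit_block() approach: mask the partial first/last
     * words, fill the whole words in between.  Plain C99, hypothetical names. */
    #include <stdint.h>

    #define WORD_BITS 64

    static void fill_bits(uint64_t *word, int start, int nbits, int set)
    {
        uint64_t mask = (nbits >= WORD_BITS) ? ~0ULL
                                             : (((1ULL << nbits) - 1) << start);
        if (set)
            *word |= mask;
        else
            *word &= ~mask;
    }

    static void fill_bit_block(uint64_t *bitmap, uint64_t start, uint64_t nbits, int set)
    {
        uint64_t first = start / WORD_BITS;
        uint64_t last  = (start + nbits - 1) / WORD_BITS;
        int first_bit  = (int)(start % WORD_BITS);
        int last_bit   = (int)((start + nbits - 1) % WORD_BITS);

        if (first == last) {                    /* run fits inside one word */
            fill_bits(&bitmap[first], first_bit, (int)nbits, set);
            return;
        }
        fill_bits(&bitmap[first], first_bit, WORD_BITS - first_bit, set);
        for (uint64_t i = first + 1; i < last; i++)
            bitmap[i] = set ? ~0ULL : 0;        /* full words in the middle */
        fill_bits(&bitmap[last], 0, last_bit + 1, set);
    }

    /* e.g. protecting gpa 0x1000..0x4fff marks gfns 1..4:
     *     fill_bit_block(rp_bitmap, 1, 4, 1);  */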
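
After a vCPU is created, kvm_vm_ioctl_create_vcpu() pins the creating thread with KeSetSystemAffinityThread() using the bit index cpu_online_count - 1 - ((2*id/count) % 2) - ((id*2) % count). The snippet below only evaluates that expression for an assumed 8-logical-processor host: it places the first count/2 vCPUs on every other logical processor from the top down, then fills in the remaining ones, which reads like an attempt to keep early vCPUs off each other's hyperthread siblings (that last interpretation is an inference, not something stated in the source).

    /* Stand-alone evaluation of the affinity bit index used above.
     * User-space C; `count` stands in for cpu_online_count. */
    #include <stdio.h>

    int main(void)
    {
        int count = 8;                       /* assumed example: 8 logical CPUs */
        for (int id = 0; id < count; id++) {
            int bit = count - 1
                      - 2 * id / count % 2   /* ((2*id)/count) % 2 */
                      - id * 2 % count;      /* (id*2) % count */
            printf("vcpu %d -> logical CPU %d\n", id, bit);
        }
        return 0;                            /* prints 7, 5, 3, 1, 6, 4, 2, 0 */
    }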
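
Throughout these handlers, copy_to_user() is replaced by gvmUpdateReturnBuffer(pIrp, offset, buf, len). Its implementation is not part of this hunk, so the sketch below only illustrates the generic METHOD_BUFFERED convention such a helper would typically follow on Windows (input and output share the IRP system buffer, and the byte count is reported via IoStatus.Information); it is an assumption, not the driver's actual code.

    /* Hypothetical sketch of a buffered-ioctl output helper. */
    #include <ntddk.h>

    static NTSTATUS sketch_update_return_buffer(PIRP pIrp, ULONG offset,
                                                const void *src, ULONG len)
    {
        PIO_STACK_LOCATION sp = IoGetCurrentIrpStackLocation(pIrp);
        ULONG out_len = sp->Parameters.DeviceIoControl.OutputBufferLength;

        if (offset > out_len || len > out_len - offset)
            return STATUS_BUFFER_TOO_SMALL;

        /* METHOD_BUFFERED: input and output share AssociatedIrp.SystemBuffer. */
        RtlCopyMemory((PUCHAR)pIrp->AssociatedIrp.SystemBuffer + offset, src, len);

        /* The I/O manager copies this many bytes back to the caller on completion. */
        if (pIrp->IoStatus.Information < offset + len)
            pIrp->IoStatus.Information = offset + len;
        return STATUS_SUCCESS;
    }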