Diffstat (limited to 'ntkrutils.c')
-rw-r--r--	ntkrutils.c	599
1 file changed, 599 insertions, 0 deletions
diff --git a/ntkrutils.c b/ntkrutils.c
new file mode 100644
index 0000000..2509940
--- /dev/null
+++ b/ntkrutils.c
@@ -0,0 +1,599 @@
+/*
+ * Copyright 2019 Google LLC
+
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * version 2 as published by the Free Software Foundation.
+
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#include <ntddk.h>
+#include <gvm_types.h>
+#include <ntkrutils.h>
+#include <linux/list.h>
+
+LIST_HEAD(global_malloc_list);
+DEFINE_SPINLOCK(global_malloc_lock);
+struct page** pglist;
+DEFINE_SPINLOCK(global_page_lock);
+
+int CPU_HAS_X86_FEATURE_XSAVE;
+int CPU_HAS_X86_FEATURE_PKU;
+int CPU_HAS_X86_FEATURE_GBPAGES;
+int CPU_HAS_X86_FEATURE_HLE;
+int CPU_HAS_X86_FEATURE_RTM;
+int CPU_HAS_X86_FEATURE_NX;
+int CPU_HAS_X86_FEATURE_FXSR_OPT;
+int CPU_HAS_X86_FEATURE_NPT;
+int CPU_HAS_X86_FEATURE_AVIC;
+int CPU_HAS_X86_FEATURE_DECODEASSISTS;
+int CPU_HAS_X86_FEATURE_RDTSCP;
+int CPU_HAS_X86_FEATURE_LBRV;
+int CPU_HAS_X86_FEATURE_NRIPS;
+int CPU_HAS_X86_FEATURE_SMEP;
+int CPU_HAS_X86_FEATURE_MPX;
+int CPU_HAS_X86_FEATURE_XSAVES;
+int CPU_HAS_X86_FEATURE_CONSTANT_TSC;
+int CPU_HAS_X86_BUG_AMD_TLB_MMATCH;
+int CPU_HAS_X86_FEATURE_FLUSHBYASID;
+int CPU_HAS_X86_FEATURE_OSVW;
+int CPU_HAS_X86_FEATURE_SVM;
+
+struct cpumask __cpu_online_mask;
+struct cpumask *cpu_online_mask = &__cpu_online_mask;
+unsigned int cpu_online_count;
+u64 max_pagen;
+
+DEFINE_PER_CPU(struct cpu_getput_cxt, cpu_getput_cxt);
+
+typedef struct _KAFFINITY_EX {
+ uint16_t Count;
+ uint16_t Size;
+ uint32_t Padding;
+ uint64_t bitmap[20];
+} KAFFINITYEX, *PKAFFINITYEX;
+
+typedef void (NTAPI *PFNHALREQUESTIPI)(uint32_t, PKAFFINITYEX);
+typedef void (NTAPI *PFNKEINITIALIZEAFFINITYEX)(PKAFFINITYEX);
+typedef void (NTAPI *PFNKEADDPROCESSORAFFINITYEX)(PKAFFINITYEX, uint32_t);
+
+PFNHALREQUESTIPI pHalRequestIpi;
+PFNKEINITIALIZEAFFINITYEX pKeInitializeAffinityEx;
+PFNKEADDPROCESSORAFFINITYEX pKeAddProcessorAffinityEx;
+
+// FIXME: We assume the set of online CPUs does not change after this point.
+
+NTSTATUS gvmGetCpuOnlineMap(void)
+{
+ NTSTATUS rc;
+ SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX *inf = NULL;
+ PPROCESSOR_GROUP_INFO pginf = NULL;
+ PROCESSOR_NUMBER pn;
+ ULONG buffSize = 0;
+ u32 ig;
+ u32 ip;
+ u32 cpuIndex;
+
+ cpu_online_count = KeQueryActiveProcessorCountEx(ALL_PROCESSOR_GROUPS);
+
+ rc = KeQueryLogicalProcessorRelationship(NULL,
+ RelationGroup, NULL, &buffSize);
+ NT_ASSERT(rc == STATUS_INFO_LENGTH_MISMATCH);
+
+ inf = ExAllocatePoolWithTag(NonPagedPool, buffSize, GVM_POOL_TAG);
+
+ if (!inf)
+ return STATUS_INSUFFICIENT_RESOURCES;
+
+ rc = KeQueryLogicalProcessorRelationship(NULL, RelationGroup,
+ inf, &buffSize);
+
+ if (!NT_SUCCESS(rc))
+ goto mapout;
+
+ for (ig = 0; NT_SUCCESS(rc) && ig < inf->Group.ActiveGroupCount; ig++) {
+ pginf = &inf->Group.GroupInfo[ig];
+
+ for (ip = 0; ip < pginf->MaximumProcessorCount; ip++) {
+ pn.Group = ig;
+ pn.Number = ip;
+ pn.Reserved = 0;
+
+ cpuIndex = KeGetProcessorIndexFromNumber(&pn);
+
+ if (cpuIndex == INVALID_PROCESSOR_INDEX) {
+				DbgPrint("Cannot find CPU index for processor "
+					 "in group %d[%d]", ig, ip);
+ continue;
+ }
+
+ if (test_bit(ip, &pginf->ActiveProcessorMask))
+ cpumask_set_cpu(cpuIndex, cpu_online_mask);
+ else
+				DbgPrint("Processor %d in group %d[%d] "
+					 "is not active", cpuIndex, ig, ip);
+ }
+ }
+
+mapout:
+ ExFreePoolWithTag(inf, GVM_POOL_TAG);
+ return rc;
+}
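+
+/*
+ * Illustration (hypothetical topology): on a machine with two 36-processor
+ * groups, KeGetProcessorIndexFromNumber typically flattens (Group=1,
+ * Number=3) to the system-wide index 39, and that is the bit set in
+ * cpu_online_mask above.
+ */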
+
+/*
+ * Timer support
+ */
+void timer_dpc_fn(struct _KDPC *Dpc,
+ PVOID DeferredContext,
+ PVOID SystemArgument1,
+ PVOID SystemArgument2)
+{
+	struct hrtimer *timer = (struct hrtimer *)DeferredContext;
+	enum hrtimer_restart ret = timer->function(timer);
+	if (ret == HRTIMER_RESTART)
+		hrtimer_restart(timer);
+}
+
+void hrtimer_init(struct hrtimer *timer, clockid_t clock_id, enum hrtimer_mode mode)
+{
+ KeInitializeTimerEx(&timer->ktimer, SynchronizationTimer);
+ timer->base = &timer->base_hack;
+ timer->base->get_time = ktime_get;
+ KeInitializeDpc(&timer->kdpc, (PKDEFERRED_ROUTINE)timer_dpc_fn, timer);
+}
+
+int hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode)
+{
+ int r;
+	// We only emulate the hrtimer mode that KVM uses
+	ASSERTMSG("Unsupported hrtimer mode", mode == HRTIMER_MODE_ABS_PINNED);
+	timer->due_time.QuadPart = ktime_to_ns(tim);
+	timer->node.expires = tim;
+	/* KTIMER due times are in 100ns units; convert from nanoseconds */
+	do_div((u64 *)&timer->due_time.QuadPart, 100);
+ r = (int)KeSetTimer(&timer->ktimer, timer->due_time, &timer->kdpc);
+ return r;
+}
+
+int hrtimer_cancel(struct hrtimer *timer)
+{
+ int r;
+ r = KeCancelTimer(&timer->ktimer);
+ return r;
+}
+
+int hrtimer_restart(struct hrtimer* timer)
+{
+ int r;
+ //timer->due_time.QuadPart = (ktime_to_ns(ktime_get()) - ktime_to_ns(timer->node.expires)) / 100;
+	timer->due_time.QuadPart = ktime_to_ns(timer->node.expires);
+	do_div((u64 *)&timer->due_time.QuadPart, 100);
+ r = (int)KeSetTimer(&timer->ktimer, timer->due_time, &timer->kdpc);
+ return r;
+}
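+
+/*
+ * Usage sketch for the hrtimer shim above, with a hypothetical callback:
+ * my_timer_fn and PERIOD_NS are illustrative only, and ktime_add_ns is
+ * assumed to be available as in Linux.
+ *
+ *	static enum hrtimer_restart my_timer_fn(struct hrtimer *t)
+ *	{
+ *		// runs from timer_dpc_fn, i.e. in DPC context
+ *		t->node.expires = ktime_add_ns(t->node.expires, PERIOD_NS);
+ *		return HRTIMER_RESTART;	// timer_dpc_fn re-arms via hrtimer_restart
+ *	}
+ *
+ *	struct hrtimer t;
+ *	hrtimer_init(&t, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
+ *	t.function = my_timer_fn;
+ *	hrtimer_start(&t, ktime_add_ns(ktime_get(), PERIOD_NS),
+ *		      HRTIMER_MODE_ABS_PINNED);
+ */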
+
+struct list_head gvm_mmap_list;
+DEFINE_RAW_SPINLOCK(gvm_mmap_lock);
+
+size_t vm_mmap(struct file *notused, size_t addr, size_t len, size_t prot,
+ size_t flag, size_t offset)
+{
+ return __vm_mmap(notused, addr, len, prot, flag, offset, 0);
+}
+
+size_t __declspec(noinline) __vm_mmap(struct file *notused, size_t addr,
+ size_t len, size_t prot, size_t flag, size_t offset, size_t keva)
+{
+ PMDL pMDL = NULL;
+ PVOID pMem = NULL;
+ PVOID UserVA = NULL;
+ struct gvm_mmap_node *node;
+
+ node = ExAllocatePoolWithTag(NonPagedPool,
+ sizeof(struct gvm_mmap_node),
+ GVM_POOL_TAG);
+ if (!node)
+ return (size_t)NULL;
+
+ if (keva)
+ pMem = (PVOID)keva;
+ else {
+ pMem = ExAllocatePoolWithTag(NonPagedPool, len, GVM_POOL_TAG);
+ if (!pMem)
+ goto free_node;
+ RtlZeroMemory(pMem, len);
+ }
+
+ pMDL = IoAllocateMdl(pMem, len, FALSE, FALSE, NULL);
+ if (!pMDL)
+ goto free_pmem;
+
+ MmBuildMdlForNonPagedPool(pMDL);
+ UserVA = MmMapLockedPagesSpecifyCache(pMDL, UserMode, MmCached,
+ 0, 0, NormalPagePriority);
+
+ if (!UserVA)
+ goto free_mdl;
+
+ node->UserVA = UserVA;
+ node->pMDL = pMDL;
+ node->pMem = pMem;
+
+ raw_spin_lock(&gvm_mmap_lock);
+ list_add_tail(&node->list, &gvm_mmap_list);
+ raw_spin_unlock(&gvm_mmap_lock);
+
+ return (size_t)UserVA;
+
+ free_mdl:
+ IoFreeMdl(pMDL);
+ free_pmem:
+	if (!keva)	/* only free pool pages we allocated ourselves */
+		ExFreePoolWithTag(pMem, GVM_POOL_TAG);
+ free_node:
+ ExFreePoolWithTag(node, GVM_POOL_TAG);
+
+ return (size_t)NULL;
+}
+
+int vm_munmap(size_t start, size_t len)
+{
+ return __vm_munmap(start, len, true);
+}
+
+int __declspec(noinline) __vm_munmap(size_t start, size_t len, bool freepage)
+{
+ struct gvm_mmap_node *node = NULL;
+ int find = 0;
+
+ raw_spin_lock(&gvm_mmap_lock);
+#define LIST_ENTRY_TYPE_INFO struct gvm_mmap_node
+ list_for_each_entry(node, &gvm_mmap_list, list)
+ if (node->UserVA == (PVOID)start) {
+ find = 1;
+ break;
+ }
+#undef LIST_ENTRY_TYPE_INFO
+ if (find)
+ list_del(&node->list);
+ raw_spin_unlock(&gvm_mmap_lock);
+
+ if (!find)
+ return -1;
+
+ BUG_ON(!node->UserVA);
+ BUG_ON(!node->pMDL);
+ BUG_ON(!node->pMem);
+
+ MmUnmapLockedPages(node->UserVA, node->pMDL);
+ IoFreeMdl(node->pMDL);
+
+ if (freepage)
+ ExFreePoolWithTag(node->pMem, GVM_POOL_TAG);
+
+ ExFreePoolWithTag(node, GVM_POOL_TAG);
+ return 0;
+}
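+
+/*
+ * Usage sketch for the pair above (illustrative only): map len bytes of
+ * zeroed non-paged pool into the current user process, then tear the
+ * mapping down again with the address vm_mmap returned. prot/flag/offset
+ * are accepted for Linux API compatibility but unused here.
+ *
+ *	size_t uva = vm_mmap(NULL, 0, len, 0, 0, 0);
+ *	if (!uva)
+ *		return STATUS_NO_MEMORY;
+ *	...
+ *	vm_munmap(uva, len);	// unmaps, frees the MDL and the pool pages
+ */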
+
+struct sfc_data {
+ void (*func)(void *info);
+ void *info;
+	volatile int done;	/* set by the DPC routine, polled by the requester */
+ struct spin_lock lock;
+};
+
+DEFINE_PER_CPU(KDPC, ipi_dpc);
+DEFINE_PER_CPU(struct sfc_data, smp_call_function_data);
+
+static void sfc_dpc_routine(KDPC *Dpc, PVOID DeferredContext,
+ PVOID func, PVOID info)
+{
+ struct sfc_data *sfc_data;
+ sfc_data = &per_cpu(smp_processor_id(), smp_call_function_data);
+ if (sfc_data->func)
+ sfc_data->func(sfc_data->info);
+ sfc_data->done = 1;
+}
+
+/*
+ * smp_call_function_xxx has been changed several times, from KeIpiGenericCall
+ * to HalRequestIpi...
+ * The current version uses DPCs with HighImportance to emulate physical IPIs.
+ * The major concern here is keeping the code easy to debug: playing with
+ * physical IPIs incorrectly (sometimes even correctly) can hang the system,
+ * and WinDbg cannot debug those cases.
+ * We may switch to physical IPIs later.
+ * Note: a DPC (or an IPI) issued to the current processor simply preempts
+ * the running code.
+ */
+int smp_call_function_many(cpumask_var_t mask,
+ void(*func) (void *info), void *info, int wait)
+{
+ int cpu;
+ struct sfc_data *sfc_data;
+
+ for_each_cpu(cpu, mask) {
+ sfc_data = &per_cpu(cpu, smp_call_function_data);
+ spin_lock(&sfc_data->lock);
+ sfc_data->func = func;
+ sfc_data->info = info;
+ sfc_data->done = 0;
+ if (!KeInsertQueueDpc(&per_cpu(cpu, ipi_dpc),
+ NULL, NULL))
+ DbgBreakPoint();
+ }
+
+ for_each_cpu(cpu, mask) {
+ sfc_data = &per_cpu(cpu, smp_call_function_data);
+ while (!sfc_data->done)
+ _mm_pause();
+ spin_unlock(&sfc_data->lock);
+ }
+
+ return 0;
+}
+
+int smp_call_function_single(int cpu, void(*func)(void *info),
+ void *info, int wait)
+{
+ struct sfc_data *sfc_data;
+
+ sfc_data = &per_cpu(cpu, smp_call_function_data);
+ spin_lock(&sfc_data->lock);
+ sfc_data->func = func;
+ sfc_data->info = info;
+ sfc_data->done = 0;
+ if (!KeInsertQueueDpc(&per_cpu(cpu, ipi_dpc),
+ func, info))
+ DbgBreakPoint();
+ while (!sfc_data->done)
+ _mm_pause();
+ spin_unlock(&sfc_data->lock);
+ return 0;
+}
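+
+/*
+ * Usage sketch (hypothetical; invalidate_cb and addr are illustrative):
+ * func runs in each target CPU's HighImportance DPC, i.e. at
+ * DISPATCH_LEVEL, so it must not block; the requester spins with
+ * _mm_pause() until every targeted CPU has set done.
+ *
+ *	static void invalidate_cb(void *info)
+ *	{
+ *		__invlpg(info);	// flush the TLB entry covering info
+ *	}
+ *
+ *	smp_call_function_many(cpu_online_mask, invalidate_cb, addr, 1);
+ */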
+
+
+void smp_send_reschedule(int cpu)
+{
+ KAFFINITYEX target;
+
+ pKeInitializeAffinityEx(&target);
+ pKeAddProcessorAffinityEx(&target, cpu);
+ pHalRequestIpi(0, &target);
+}
+
+enum cpuid_reg {
+ CPUID_EAX = 0,
+ CPUID_EBX,
+ CPUID_ECX,
+ CPUID_EDX,
+};
+
+#define check_cpu_has(name, leaf, reg, bitpos) \
+ do { \
+ __cpuid(cpuid_info, leaf); \
+ CPU_HAS_##name = !!(cpuid_info[reg] & (1 << bitpos)); \
+ } while (0)
+
+#define check_cpu_has_ex(name, leaf, level, reg, bitpos) \
+ do { \
+ __cpuidex(cpuid_info, leaf, level); \
+ CPU_HAS_##name = !!(cpuid_info[reg] & (1 << bitpos)); \
+ } while (0)
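+
+/*
+ * For reference, check_cpu_has(X86_FEATURE_XSAVE, 1, CPUID_ECX, 26) below
+ * expands to roughly:
+ *
+ *	__cpuid(cpuid_info, 1);
+ *	CPU_HAS_X86_FEATURE_XSAVE = !!(cpuid_info[CPUID_ECX] & (1 << 26));
+ *
+ * i.e. cpuid_info[2] holds ECX of CPUID leaf 1, whose bit 26 is XSAVE.
+ */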
+
+
+static void cpu_features_init(void)
+{
+ int cpuid_info[4] = { 0 };
+
+ check_cpu_has(X86_FEATURE_XSAVE, 1, CPUID_ECX, 26);
+
+ check_cpu_has(X86_FEATURE_OSVW, 0x80000001, CPUID_ECX, 9);
+ check_cpu_has(X86_FEATURE_SVM, 0x80000001, CPUID_ECX, 2);
+
+ check_cpu_has(X86_FEATURE_NX, 0x80000001, CPUID_EDX, 20);
+ check_cpu_has(X86_FEATURE_FXSR_OPT, 0x80000001, CPUID_EDX, 25);
+ check_cpu_has(X86_FEATURE_GBPAGES, 0x80000001, CPUID_EDX, 26);
+ check_cpu_has(X86_FEATURE_RDTSCP, 0x80000001, CPUID_EDX, 27);
+
+ check_cpu_has_ex(X86_FEATURE_HLE, 7, 0, CPUID_EBX, 4);
+ check_cpu_has_ex(X86_FEATURE_RTM, 7, 0, CPUID_EBX, 11);
+ check_cpu_has_ex(X86_FEATURE_MPX, 7, 0, CPUID_EBX, 14);
+
+ check_cpu_has_ex(X86_FEATURE_PKU, 7, 0, CPUID_ECX, 3);
+ check_cpu_has_ex(X86_FEATURE_SMEP, 7, 0, CPUID_ECX, 7);
+
+ check_cpu_has(X86_FEATURE_NPT, 0x8000000a, CPUID_EDX, 0);
+ check_cpu_has(X86_FEATURE_LBRV, 0x8000000a, CPUID_EDX, 1);
+ check_cpu_has(X86_FEATURE_NRIPS, 0x8000000a, CPUID_EDX, 3);
+ check_cpu_has(X86_FEATURE_FLUSHBYASID, 0x8000000a, CPUID_EDX, 6);
+ check_cpu_has(X86_FEATURE_DECODEASSISTS, 0x8000000a, CPUID_EDX, 7);
+ check_cpu_has(X86_FEATURE_AVIC, 0x8000000a, CPUID_EDX, 13);
+
+ check_cpu_has_ex(X86_FEATURE_XSAVES, 0xd, 1, CPUID_EAX, 3);
+}
+
+static NTSTATUS prepare_boot_cpu_data(void)
+{
+	/* Check physical address bits */
+ unsigned int eax, ebx, ecx, edx;
+
+ boot_cpu_data.extended_cpuid_level = cpuid_eax(0x80000000);
+ boot_cpu_data.x86_phys_bits = 36;
+
+	cpuid(0x80000001, &eax, &ebx, &ecx, &edx);
+	if (boot_cpu_data.extended_cpuid_level >= 0x80000008 &&
+	    (edx & (1 << 29))) {
+		cpuid(0x80000008, &eax, &ebx, &ecx, &edx);
+		boot_cpu_data.x86_phys_bits = eax & 0xFF;
+	}
+
+ return STATUS_SUCCESS;
+}
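+
+/*
+ * Worked example for the above: a CPU reporting EAX = 0x3028 for leaf
+ * 0x80000008 yields EAX & 0xFF = 0x28 = 40, so x86_phys_bits becomes 40
+ * (bits 15:8 encode the linear address width, 0x30 = 48). Without leaf
+ * 0x80000008 the conservative 36-bit default above is kept.
+ */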
+
+#define RegName L"\\Registry\\Machine\\HARDWARE\\RESOURCEMAP\\System Resources\\Physical Memory"
+static NTSTATUS get_physical_memsize(u64 *size)
+{
+ OBJECT_ATTRIBUTES keyAttribute;
+ UNICODE_STRING keyName, valName;
+ HANDLE keyHandle;
+ NTSTATUS rc;
+ ULONG buffSize, count;
+ PKEY_VALUE_FULL_INFORMATION buff;
+ PCM_RESOURCE_LIST res;
+ PCM_PARTIAL_RESOURCE_LIST list;
+ PCM_PARTIAL_RESOURCE_DESCRIPTOR pres;
+
+ RtlInitUnicodeString(&keyName, RegName);
+ InitializeObjectAttributes(&keyAttribute,
+ &keyName,
+ OBJ_CASE_INSENSITIVE | OBJ_KERNEL_HANDLE,
+ NULL,
+ NULL);
+ rc = ZwOpenKey(&keyHandle, KEY_READ, &keyAttribute);
+ if (!NT_SUCCESS(rc))
+ return rc;
+
+ RtlInitUnicodeString(&valName, L".Translated");
+ rc = ZwQueryValueKey(keyHandle,
+ &valName,
+ KeyValueFullInformation,
+ NULL,
+ 0,
+ &buffSize);
+ if (!(rc == STATUS_BUFFER_TOO_SMALL ||
+ rc == STATUS_BUFFER_OVERFLOW))
+ goto key_close;
+
+ buff = ExAllocatePoolWithTag(NonPagedPool, buffSize, GVM_POOL_TAG);
+ if (!buff) {
+ rc = STATUS_NO_MEMORY;
+ goto key_close;
+ }
+
+ RtlZeroMemory(buff, buffSize);
+ rc = ZwQueryValueKey(keyHandle,
+ &valName,
+ KeyValueFullInformation,
+ buff,
+ buffSize,
+ &buffSize);
+ if (!NT_SUCCESS(rc))
+ goto free_buff;
+
+ ASSERT(buff->Type == REG_RESOURCE_LIST);
+ res = (PCM_RESOURCE_LIST)((char *)buff + buff->DataOffset);
+ ASSERT(res->Count == 1);
+ list = &res->List[0].PartialResourceList;
+ count = list->Count;
+ pres = &list->PartialDescriptors[count - 1];
+
+ switch (pres->Type) {
+ case CmResourceTypeMemory:
+ *size = pres->u.Memory.Start.QuadPart +
+ pres->u.Memory.Length;
+ break;
+ case CmResourceTypeMemoryLarge:
+ switch (pres->Flags) {
+ case CM_RESOURCE_MEMORY_LARGE_40:
+ *size = pres->u.Memory40.Start.QuadPart +
+ ((u64)pres->u.Memory40.Length40 << 8);
+ break;
+ case CM_RESOURCE_MEMORY_LARGE_48:
+ *size = pres->u.Memory48.Start.QuadPart +
+ ((u64)pres->u.Memory48.Length48 << 16);
+ break;
+ case CM_RESOURCE_MEMORY_LARGE_64:
+ *size = pres->u.Memory64.Start.QuadPart +
+ ((u64)pres->u.Memory64.Length64 << 32);
+ break;
+ }
+ break;
+ }
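+
+	/*
+	 * Example of the large-descriptor decoding above (illustrative
+	 * values): a CM_RESOURCE_MEMORY_LARGE_40 descriptor stores its
+	 * length right-shifted by 8, so Length40 = 0x100000 describes
+	 * 0x100000 << 8 = 0x10000000 bytes (256MB), making *size
+	 * Start.QuadPart + 0x10000000.
+	 */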
+
+ rc = STATUS_SUCCESS;
+
+ free_buff:
+ ExFreePoolWithTag(buff, GVM_POOL_TAG);
+ key_close:
+ ZwClose(keyHandle);
+ return rc;
+}
+
+/*
+ * Init/Deinit Nt Kernel Support Routines
+ */
+
+NTSTATUS NtKrUtilsInit(void)
+{
+ u64 phy_memsize = 0;
+ UNICODE_STRING FuncName;
+ NTSTATUS rc;
+ int cpu;
+ PROCESSOR_NUMBER cpu_number;
+
+ cpu_features_init();
+
+ rc = get_physical_memsize(&phy_memsize);
+ if (!NT_SUCCESS(rc))
+ return rc;
+ max_pagen = (phy_memsize >> PAGE_SHIFT) + 1;
+
+ rc = prepare_boot_cpu_data();
+ if (!NT_SUCCESS(rc))
+ return rc;
+
+ rc = gvmGetCpuOnlineMap();
+ if (!NT_SUCCESS(rc))
+ return rc;
+
+	// Prepare smp_call_function support
+	RtlInitUnicodeString(&FuncName, L"HalRequestIpi");
+	pHalRequestIpi = MmGetSystemRoutineAddress(&FuncName);
+	RtlInitUnicodeString(&FuncName, L"KeInitializeAffinityEx");
+	pKeInitializeAffinityEx = MmGetSystemRoutineAddress(&FuncName);
+	RtlInitUnicodeString(&FuncName, L"KeAddProcessorAffinityEx");
+	pKeAddProcessorAffinityEx = MmGetSystemRoutineAddress(&FuncName);
+	if (!pHalRequestIpi || !pKeInitializeAffinityEx ||
+	    !pKeAddProcessorAffinityEx)
+		return STATUS_PROCEDURE_NOT_FOUND;
+ for (cpu = 0; cpu < cpu_online_count; cpu++) {
+ KeInitializeDpc(&per_cpu(cpu, ipi_dpc),
+ sfc_dpc_routine, NULL);
+ rc = KeGetProcessorNumberFromIndex(cpu, &cpu_number);
+ if (!NT_SUCCESS(rc))
+ return rc;
+ rc = KeSetTargetProcessorDpcEx(
+ &per_cpu(cpu, ipi_dpc),
+ &cpu_number);
+ if (!NT_SUCCESS(rc))
+ return rc;
+ KeSetImportanceDpc(&per_cpu(cpu, ipi_dpc),
+ HighImportance);
+ }
+
+ pglist = (struct page**)ExAllocatePoolWithTag(NonPagedPool,
+ max_pagen*sizeof(struct page *),
+ GVM_POOL_TAG);
+ if (!pglist)
+ return STATUS_NO_MEMORY;
+
+ RtlZeroMemory(pglist, max_pagen*sizeof(struct page *));
+ INIT_LIST_HEAD(&gvm_mmap_list);
+ spin_lock_init(&gvm_mmap_lock);
+
+ return STATUS_SUCCESS;
+}
+
+void NtKrUtilsExit(void)
+{
+ u64 i;
+
+	/* Well-implemented code should not rely on pages being freed here */
+ for (i = 0; i < max_pagen; i++)
+ if (pglist[i])
+ ExFreePoolWithTag(pglist[i], GVM_POOL_TAG);
+ ExFreePoolWithTag(pglist, GVM_POOL_TAG);
+ pglist = NULL;
+}
+