--- sys/amd64/amd64/pmap.c.orig
+++ sys/amd64/amd64/pmap.c
@@ -1206,6 +1206,9 @@
 	vm_size_t s;
 	int error, i, pv_npg;
 
+	/* L1TF, reserve page @0 unconditionally */
+	vm_page_blacklist_add(0, bootverbose);
+
 	/*
 	 * Initialize the vm page array entries for the kernel pmap's
 	 * page table pages.
--- sys/amd64/vmm/intel/vmx.c.orig
+++ sys/amd64/vmm/intel/vmx.c
@@ -183,6 +183,12 @@
 SYSCTL_UINT(_hw_vmm_vmx, OID_AUTO, vpid_alloc_failed, CTLFLAG_RD,
     &vpid_alloc_failed, 0, NULL);
 
+static int guest_l1d_flush;
+SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, l1d_flush, CTLFLAG_RD,
+    &guest_l1d_flush, 0, NULL);
+
+uint64_t vmx_msr_flush_cmd;
+
 /*
  * Use the last page below 4GB as the APIC access address. This address is
  * occupied by the boot firmware so it is guaranteed that it will not conflict
@@ -718,6 +724,12 @@
 		return (error);
 	}
 
+	guest_l1d_flush = (cpu_ia32_arch_caps & IA32_ARCH_CAP_RDCL_NO) == 0;
+	TUNABLE_INT_FETCH("hw.vmm.l1d_flush", &guest_l1d_flush);
+	if (guest_l1d_flush &&
+	    (cpu_stdext_feature3 & CPUID_STDEXT3_L1D_FLUSH) != 0)
+		vmx_msr_flush_cmd = IA32_FLUSH_CMD_L1D;
+
 	/*
 	 * Stash the cr0 and cr4 bits that must be fixed to 0 or 1
 	 */
--- sys/amd64/vmm/intel/vmx_genassym.c.orig
+++ sys/amd64/vmm/intel/vmx_genassym.c
@@ -36,6 +36,7 @@
 #include <vm/vm.h>
 #include <vm/pmap.h>
 
+#include <machine/vmparam.h>
 #include <machine/vmm.h>
 #include "vmx_cpufunc.h"
 
@@ -86,3 +87,6 @@
 
 ASSYM(KERNEL_SS, GSEL(GDATA_SEL, SEL_KPL));
 ASSYM(KERNEL_CS, GSEL(GCODE_SEL, SEL_KPL));
+
+ASSYM(PAGE_SIZE, PAGE_SIZE);
+ASSYM(KERNBASE, KERNBASE);
--- sys/amd64/vmm/intel/vmx_support.S.orig
+++ sys/amd64/vmm/intel/vmx_support.S
@@ -28,6 +28,7 @@
  */
 
 #include <machine/asmacros.h>
+#include <machine/specialreg.h>
 
 #include "vmx_assym.h"
 
@@ -136,9 +137,47 @@
 	jbe	invept_error		/* Check invept instruction error */
 
 guest_restore:
-	cmpl	$0, %edx
+
+	/*
+	 * Flush the L1D cache if requested.  Use the IA32_FLUSH_CMD MSR if
+	 * available; otherwise read enough data via the KERNBASE mapping to
+	 * displace the existing L1D content.
+	 */
+#define	L1D_FLUSH_SIZE	(64 * 1024)
+	movl	%edx, %r8d
+	cmpb	$0, guest_l1d_flush(%rip)
+	je	after_l1d
+	movq	vmx_msr_flush_cmd(%rip), %rax
+	testq	%rax, %rax
+	jz	1f
+	movq	%rax, %rdx
+	shrq	$32, %rdx
+	movl	$MSR_IA32_FLUSH_CMD, %ecx
+	wrmsr
+	jmp	after_l1d
+1:	movq	$KERNBASE, %r9
+	movq	$-L1D_FLUSH_SIZE, %rcx
+	/*
+	 * pass 1: Preload TLB.
+	 * Kernel text is mapped using superpages.  TLB preload is
+	 * done for the benefit of older CPUs which split a 2M page
+	 * into 4K TLB entries.
+	 */
+2:	movb	L1D_FLUSH_SIZE(%r9, %rcx), %al
+	addq	$PAGE_SIZE, %rcx
+	jne	2b
+	xorl	%eax, %eax
+	cpuid
+	movq	$-L1D_FLUSH_SIZE, %rcx
+	/* pass 2: Read each cache line */
+3:	movb	L1D_FLUSH_SIZE(%r9, %rcx), %al
+	addq	$64, %rcx
+	jne	3b
+	lfence
+#undef	L1D_FLUSH_SIZE
+after_l1d:
+	cmpl	$0, %r8d
 	je	do_launch
-
 	VMX_GUEST_RESTORE
 	vmresume
 	/*
--- sys/vm/vm_page.c.orig
+++ sys/vm/vm_page.c
@@ -290,6 +290,27 @@
 	return (0);
 }
 
+bool
+vm_page_blacklist_add(vm_paddr_t pa, bool verbose)
+{
+	vm_page_t m;
+	int ret;
+
+	m = vm_phys_paddr_to_vm_page(pa);
+	if (m == NULL)
+		return (true); /* page does not exist, no failure */
+
+	mtx_lock(&vm_page_queue_free_mtx);
+	ret = vm_phys_unfree_page(m);
+	mtx_unlock(&vm_page_queue_free_mtx);
+	if (ret) {
+		TAILQ_INSERT_TAIL(&blacklist_head, m, listq);
+		if (verbose)
+			printf("Skipping page with pa 0x%jx\n", (uintmax_t)pa);
+	}
+	return (ret);
+}
+
 /*
  * vm_page_blacklist_check:
  *
@@ -301,26 +322,13 @@
 vm_page_blacklist_check(char *list, char *end)
 {
 	vm_paddr_t pa;
-	vm_page_t m;
 	char *next;
-	int ret;
 
 	next = list;
 	while (next != NULL) {
 		if ((pa = vm_page_blacklist_next(&next, end)) == 0)
 			continue;
-		m = vm_phys_paddr_to_vm_page(pa);
-		if (m == NULL)
-			continue;
-		mtx_lock(&vm_page_queue_free_mtx);
-		ret = vm_phys_unfree_page(m);
-		mtx_unlock(&vm_page_queue_free_mtx);
-		if (ret == TRUE) {
-			TAILQ_INSERT_TAIL(&blacklist_head, m, listq);
-			if (bootverbose)
-				printf("Skipping page with pa 0x%jx\n",
-				    (uintmax_t)pa);
-		}
+		vm_page_blacklist_add(pa, bootverbose);
 	}
 }
 
--- sys/vm/vm_page.h.orig
+++ sys/vm/vm_page.h
@@ -448,6 +448,7 @@
     u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment,
     vm_paddr_t boundary, vm_memattr_t memattr);
 vm_page_t vm_page_alloc_freelist(int, int);
+bool vm_page_blacklist_add(vm_paddr_t pa, bool verbose);
 vm_page_t vm_page_grab (vm_object_t, vm_pindex_t, int);
 int vm_page_try_to_free (vm_page_t);
 void vm_page_deactivate (vm_page_t);
--- sys/x86/include/specialreg.h.orig
+++ sys/x86/include/specialreg.h
@@ -378,6 +378,7 @@
  */
 #define	CPUID_STDEXT3_IBPB	0x04000000
 #define	CPUID_STDEXT3_STIBP	0x08000000
+#define	CPUID_STDEXT3_L1D_FLUSH	0x10000000
 #define	CPUID_STDEXT3_ARCH_CAP	0x20000000
 
 /* MSR IA32_ARCH_CAP(ABILITIES) bits */
@@ -427,6 +428,7 @@
 #define	MSR_IA32_EXT_CONFIG	0x0ee	/* Undocumented. Core Solo/Duo only */
 #define	MSR_MTRRcap		0x0fe
 #define	MSR_IA32_ARCH_CAP	0x10a
+#define	MSR_IA32_FLUSH_CMD	0x10b
 #define	MSR_BBL_CR_ADDR		0x116
 #define	MSR_BBL_CR_DECC		0x118
 #define	MSR_BBL_CR_CTL		0x119
@@ -580,6 +582,9 @@
 /* MSR IA32_PRED_CMD */
 #define	IA32_PRED_CMD_IBPB_BARRIER	0x0000000000000001ULL
 
+/* MSR IA32_FLUSH_CMD */
+#define	IA32_FLUSH_CMD_L1D	0x00000001
+
 /*
  * PAT modes.
  */
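
For reference, the detection logic added to vmx.c above can be exercised
from userland.  The sketch below is not part of the patch; the file name
and the use of the compiler's <cpuid.h> helper are assumptions for
illustration.  It probes the same CPUID bits the code above consults:
CPUID.(EAX=7,ECX=0):EDX bit 28 (0x10000000) advertises the IA32_FLUSH_CMD
MSR used on the fast path, and bit 29 (0x20000000) advertises the
IA32_ARCH_CAPABILITIES MSR, whose RDCL_NO bit (readable only at ring 0)
marks the CPU as unaffected by L1TF.

/* l1dchk.c: cc -o l1dchk l1dchk.c (recent gcc or clang assumed) */
#include <stdio.h>
#include <cpuid.h>

#define	CPUID_STDEXT3_L1D_FLUSH	0x10000000	/* leaf 7, EDX bit 28 */
#define	CPUID_STDEXT3_ARCH_CAP	0x20000000	/* leaf 7, EDX bit 29 */

int
main(void)
{
	unsigned int eax, ebx, ecx, edx;

	/* Leaf 7, subleaf 0: structured extended feature flags. */
	if (__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx) == 0) {
		printf("CPUID leaf 7 not supported\n");
		return (1);
	}
	printf("IA32_FLUSH_CMD MSR (hardware L1D flush): %s\n",
	    (edx & CPUID_STDEXT3_L1D_FLUSH) != 0 ? "present" : "absent");
	printf("IA32_ARCH_CAPABILITIES MSR (RDCL_NO):    %s\n",
	    (edx & CPUID_STDEXT3_ARCH_CAP) != 0 ? "present" : "absent");
	return (0);
}

On a patched kernel the outcome of this negotiation is visible as the
read-only hw.vmm.vmx.l1d_flush sysctl once vmm.ko is loaded, and the
mitigation can be overridden with the hw.vmm.l1d_flush loader tunable,
both introduced by the vmx.c hunks above.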
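Likewise, the software fallback added to vmx_support.S reduces to two read
passes over a 64KB region: one byte per 4K page to preload the TLB, a
serializing cpuid, then one byte per 64-byte line intended to displace the
entire L1D.  A C rendering of the access pattern follows; it is
illustrative only, the buffer and function names are invented, and the
kernel reads a fixed, always-mapped region at ring 0 rather than a heap
allocation.

#include <stdlib.h>

#define	L1D_FLUSH_SIZE	(64 * 1024)	/* matches the patch */
#define	LINE_SIZE	64		/* L1D cache line */
#define	PAGE_SIZE_4K	4096

static volatile unsigned char sink;	/* defeat dead-read elimination */

static void
l1d_displace(const unsigned char *buf)
{
	size_t i;

	/* Pass 1: touch one byte per page to preload the TLB. */
	for (i = 0; i < L1D_FLUSH_SIZE; i += PAGE_SIZE_4K)
		sink = buf[i];
	/* (The kernel serializes here with cpuid.) */
	/* Pass 2: read one byte per cache line to displace the L1D. */
	for (i = 0; i < L1D_FLUSH_SIZE; i += LINE_SIZE)
		sink = buf[i];
}

int
main(void)
{
	unsigned char *buf;

	if ((buf = calloc(1, L1D_FLUSH_SIZE)) == NULL)
		return (1);
	l1d_displace(buf);
	free(buf);
	return (0);
}

The negative-index idiom in the assembly (base at KERNBASE plus
L1D_FLUSH_SIZE, with %rcx counting up from -L1D_FLUSH_SIZE to zero) lets
the flags set by each addq double as the loop-termination test, so no
separate compare is needed inside either loop.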