Index: /trunk/include/VBox/vmm/cpum.mac
===================================================================
--- /trunk/include/VBox/vmm/cpum.mac	(revision 55737)
+++ /trunk/include/VBox/vmm/cpum.mac	(revision 55738)
@@ -28,4 +28,12 @@
 
 %include "iprt/asmdefs.mac"
+
+
+;;
+; The volatile XSAVE components when VBOX_WITH_KERNEL_USING_XMM is active.
+; @note ASSUMED to be at most 32 bits in width at the moment.
+%ifdef VBOX_WITH_KERNEL_USING_XMM
+ %define CPUM_VOLATILE_XSAVE_GUEST_COMPONENTS (XSAVE_C_SSE | XSAVE_C_YMM | XSAVE_C_ZMM_HI256 | XSAVE_C_ZMM_16HI)
+%endif
 
 ;;
Index: /trunk/src/VBox/VMM/VMMR0/CPUMR0A.asm
===================================================================
--- /trunk/src/VBox/VMM/VMMR0/CPUMR0A.asm	(revision 55737)
+++ /trunk/src/VBox/VMM/VMMR0/CPUMR0A.asm	(revision 55738)
@@ -379,4 +379,7 @@
         ; XSAVE
         mov     edx, [pCpumCpu + CPUMCPU.Guest.fXStateMask + 4]
+%ifdef VBOX_WITH_KERNEL_USING_XMM
+        and     eax, ~CPUM_VOLATILE_XSAVE_GUEST_COMPONENTS ; Already saved in HMR0A.asm.
+%endif
 %ifdef RT_ARCH_AMD64
         o64 xsave [pXState]
@@ -473,4 +476,7 @@
         ; XRSTOR
         mov     edx, [pCpumCpu + CPUMCPU.Guest.fXStateMask + 4]
+%ifdef VBOX_WITH_KERNEL_USING_XMM
+        and     eax, ~CPUM_VOLATILE_XSAVE_GUEST_COMPONENTS ; Will be loaded by HMR0A.asm.
+%endif
 %ifdef RT_ARCH_AMD64
         o64 xrstor [pXState]
Index: /trunk/src/VBox/VMM/VMMR0/HMR0A.asm
===================================================================
--- /trunk/src/VBox/VMM/VMMR0/HMR0A.asm	(revision 55737)
+++ /trunk/src/VBox/VMM/VMMR0/HMR0A.asm	(revision 55738)
@@ -69,4 +69,5 @@
 ; Use define because I'm too lazy to convert the struct.
 %define XMM_OFF_IN_X86FXSTATE   160
+
 
 ;; @def MYPUSHAD
@@ -1173,5 +1174,5 @@
 ALIGNCODE(8)
 .guest_fpu_state_active:
-        ; Save the host XMM registers.
+        ; Save the non-volatile host XMM registers.
         movdqa  [rsp + 040h + 000h], xmm6
         movdqa  [rsp + 040h + 010h], xmm7
@@ -1185,6 +1186,59 @@
         movdqa  [rsp + 040h + 090h], xmm15
 
+        mov     r10, [xBP + 018h]       ; pCtx
+        mov     eax, [r10 + CPUMCTX.fXStateMask]
+        test    eax, eax
+        jz      .guest_fpu_state_manually
+
+        ;
+        ; Using XSAVE to load the guest XMM, YMM and ZMM registers.
+        ;
+        and     eax, CPUM_VOLATILE_XSAVE_GUEST_COMPONENTS
+        xor     edx, edx
+        mov     r10, [r10 + CPUMCTX.pXStateR0]
+        xrstor  [r10]
+
+        ; Make the call (same as in the other case).
+        mov     r11, [xBP + 38h]        ; pfnStartVM
+        mov     r10, [xBP + 30h]        ; pVCpu
+        mov     [xSP + 020h], r10
+        mov     rcx, [xBP + 010h]       ; fResumeVM
+        mov     rdx, [xBP + 018h]       ; pCtx
+        mov     r8,  [xBP + 020h]       ; pVMCSCache
+        mov     r9,  [xBP + 028h]       ; pVM
+        call    r11
+
+        mov     r11d, eax               ; save return value (xsave below uses eax)
+
+        ; Save the guest XMM registers.
+        mov     r10, [xBP + 018h]       ; pCtx
+        mov     eax, [r10 + CPUMCTX.fXStateMask]
+        and     eax, CPUM_VOLATILE_XSAVE_GUEST_COMPONENTS
+        xor     edx, edx
+        mov     r10, [r10 + CPUMCTX.pXStateR0]
+        xsave  [r10]
+
+        mov     eax, r11d               ; restore return value.
+
+.restore_non_volatile_host_xmm_regs:
+        ; Load the non-volatile host XMM registers.
+        movdqa  xmm6,  [rsp + 040h + 000h]
+        movdqa  xmm7,  [rsp + 040h + 010h]
+        movdqa  xmm8,  [rsp + 040h + 020h]
+        movdqa  xmm9,  [rsp + 040h + 030h]
+        movdqa  xmm10, [rsp + 040h + 040h]
+        movdqa  xmm11, [rsp + 040h + 050h]
+        movdqa  xmm12, [rsp + 040h + 060h]
+        movdqa  xmm13, [rsp + 040h + 070h]
+        movdqa  xmm14, [rsp + 040h + 080h]
+        movdqa  xmm15, [rsp + 040h + 090h]
+        leave
+        ret
+
+        ;
+        ; No XSAVE, load and save the guest XMM registers manually.
+        ;
+.guest_fpu_state_manually:
         ; Load the full guest XMM register state.
-        mov     r10, [xBP + 018h]       ; pCtx
         mov     r10, [r10 + CPUMCTX.pXStateR0]
         movdqa  xmm0,  [r10 + XMM_OFF_IN_X86FXSTATE + 000h]
@@ -1234,6 +1288,107 @@
         movdqa  [r10 + XMM_OFF_IN_X86FXSTATE + 0e0h], xmm14
         movdqa  [r10 + XMM_OFF_IN_X86FXSTATE + 0f0h], xmm15
-
-        ; Load the host XMM registers.
+        jmp     .restore_non_volatile_host_xmm_regs
+ENDPROC   HMR0VMXStartVMWrapXMM
+
+;;
+; Wrapper around svm.pfnVMRun that preserves host XMM registers and
+; loads the guest ones when necessary.
+;
+; @cproto       DECLASM(int) HMR0SVMRunWrapXMM(RTHCPHYS pVMCBHostPhys, RTHCPHYS pVMCBPhys, PCPUMCTX pCtx, PVM pVM, PVMCPU pVCpu, PFNHMSVMVMRUN pfnVMRun);
+;
+; @returns      eax
+;
+; @param        pVMCBHostPhys   msc:rcx
+; @param        pVMCBPhys       msc:rdx
+; @param        pCtx            msc:r8
+; @param        pVM             msc:r9
+; @param        pVCpu           msc:[rbp+30h]
+; @param        pfnVMRun        msc:[rbp+38h]
+;
+; @remarks      This is essentially the same code as HMR0VMXStartVMWrapXMM, only the parameters differ a little bit.
+;
+; ASSUMING 64-bit and Windows for now.
+ALIGNCODE(16)
+BEGINPROC HMR0SVMRunWrapXMM
+        push    xBP
+        mov     xBP, xSP
+        sub     xSP, 0a0h + 040h        ; Don't bother optimizing the frame size.
+
+        ; spill input parameters.
+        mov     [xBP + 010h], rcx       ; pVMCBHostPhys
+        mov     [xBP + 018h], rdx       ; pVMCBPhys
+        mov     [xBP + 020h], r8        ; pCtx
+        mov     [xBP + 028h], r9        ; pVM
+
+        ; Ask CPUM whether we've started using the FPU yet.
+        mov     rcx, [xBP + 30h]        ; pVCpu
+        call    NAME(CPUMIsGuestFPUStateActive)
+        test    al, al
+        jnz     .guest_fpu_state_active
+
+        ; No need to mess with XMM registers just call the start routine and return.
+        mov     r11, [xBP + 38h]        ; pfnVMRun
+        mov     r10, [xBP + 30h]        ; pVCpu
+        mov     [xSP + 020h], r10
+        mov     rcx, [xBP + 010h]       ; pVMCBHostPhys
+        mov     rdx, [xBP + 018h]       ; pVMCBPhys
+        mov     r8,  [xBP + 020h]       ; pCtx
+        mov     r9,  [xBP + 028h]       ; pVM
+        call    r11
+
+        leave
+        ret
+
+ALIGNCODE(8)
+.guest_fpu_state_active:
+        ; Save the non-volatile host XMM registers.
+        movdqa  [rsp + 040h + 000h], xmm6
+        movdqa  [rsp + 040h + 010h], xmm7
+        movdqa  [rsp + 040h + 020h], xmm8
+        movdqa  [rsp + 040h + 030h], xmm9
+        movdqa  [rsp + 040h + 040h], xmm10
+        movdqa  [rsp + 040h + 050h], xmm11
+        movdqa  [rsp + 040h + 060h], xmm12
+        movdqa  [rsp + 040h + 070h], xmm13
+        movdqa  [rsp + 040h + 080h], xmm14
+        movdqa  [rsp + 040h + 090h], xmm15
+
+        mov     r10, [xBP + 020h]       ; pCtx
+        mov     eax, [r10 + CPUMCTX.fXStateMask]
+        test    eax, eax
+        jz      .guest_fpu_state_manually
+
+        ;
+        ; Using XSAVE.
+        ;
+        and     eax, CPUM_VOLATILE_XSAVE_GUEST_COMPONENTS
+        xor     edx, edx
+        mov     r10, [r10 + CPUMCTX.pXStateR0]
+        xrstor  [r10]
+
+        ; Make the call (same as in the other case).
+        mov     r11, [xBP + 38h]        ; pfnVMRun
+        mov     r10, [xBP + 30h]        ; pVCpu
+        mov     [xSP + 020h], r10
+        mov     rcx, [xBP + 010h]       ; pVMCBHostPhys
+        mov     rdx, [xBP + 018h]       ; pVMCBPhys
+        mov     r8,  [xBP + 020h]       ; pCtx
+        mov     r9,  [xBP + 028h]       ; pVM
+        call    r11
+
+        mov     r11d, eax               ; save return value (xsave below uses eax)
+
+        ; Save the guest XMM registers.
+        mov     r10, [xBP + 020h]       ; pCtx
+        mov     eax, [r10 + CPUMCTX.fXStateMask]
+        and     eax, CPUM_VOLATILE_XSAVE_GUEST_COMPONENTS
+        xor     edx, edx
+        mov     r10, [r10 + CPUMCTX.pXStateR0]
+        xsave  [r10]
+
+        mov     eax, r11d               ; restore return value.
+
+.restore_non_volatile_host_xmm_regs:
+        ; Load the non-volatile host XMM registers.
         movdqa  xmm6,  [rsp + 040h + 000h]
         movdqa  xmm7,  [rsp + 040h + 010h]
@@ -1248,71 +1403,10 @@
         leave
         ret
-ENDPROC   HMR0VMXStartVMWrapXMM
-
-;;
-; Wrapper around svm.pfnVMRun that preserves host XMM registers and
-; load the guest ones when necessary.
-;
-; @cproto       DECLASM(int) HMR0SVMRunWrapXMM(RTHCPHYS pVMCBHostPhys, RTHCPHYS pVMCBPhys, PCPUMCTX pCtx, PVM pVM, PVMCPU pVCpu, PFNHMSVMVMRUN pfnVMRun);
-;
-; @returns      eax
-;
-; @param        pVMCBHostPhys   msc:rcx
-; @param        pVMCBPhys       msc:rdx
-; @param        pCtx            msc:r8
-; @param        pVM             msc:r9
-; @param        pVCpu           msc:[rbp+30h]
-; @param        pfnVMRun        msc:[rbp+38h]
-;
-; @remarks      This is essentially the same code as HMR0VMXStartVMWrapXMM, only the parameters differ a little bit.
-;
-; ASSUMING 64-bit and windows for now.
-ALIGNCODE(16)
-BEGINPROC HMR0SVMRunWrapXMM
-        push    xBP
-        mov     xBP, xSP
-        sub     xSP, 0a0h + 040h        ; Don't bother optimizing the frame size.
-
-        ; spill input parameters.
-        mov     [xBP + 010h], rcx       ; pVMCBHostPhys
-        mov     [xBP + 018h], rdx       ; pVMCBPhys
-        mov     [xBP + 020h], r8        ; pCtx
-        mov     [xBP + 028h], r9        ; pVM
-
-        ; Ask CPUM whether we've started using the FPU yet.
-        mov     rcx, [xBP + 30h]        ; pVCpu
-        call    NAME(CPUMIsGuestFPUStateActive)
-        test    al, al
-        jnz     .guest_fpu_state_active
-
-        ; No need to mess with XMM registers just call the start routine and return.
-        mov     r11, [xBP + 38h]        ; pfnVMRun
-        mov     r10, [xBP + 30h]        ; pVCpu
-        mov     [xSP + 020h], r10
-        mov     rcx, [xBP + 010h]       ; pVMCBHostPhys
-        mov     rdx, [xBP + 018h]       ; pVMCBPhys
-        mov     r8,  [xBP + 020h]       ; pCtx
-        mov     r9,  [xBP + 028h]       ; pVM
-        call    r11
-
-        leave
-        ret
-
-ALIGNCODE(8)
-.guest_fpu_state_active:
-        ; Save the host XMM registers.
-        movdqa  [rsp + 040h + 000h], xmm6
-        movdqa  [rsp + 040h + 010h], xmm7
-        movdqa  [rsp + 040h + 020h], xmm8
-        movdqa  [rsp + 040h + 030h], xmm9
-        movdqa  [rsp + 040h + 040h], xmm10
-        movdqa  [rsp + 040h + 050h], xmm11
-        movdqa  [rsp + 040h + 060h], xmm12
-        movdqa  [rsp + 040h + 070h], xmm13
-        movdqa  [rsp + 040h + 080h], xmm14
-        movdqa  [rsp + 040h + 090h], xmm15
-
+
+        ;
+        ; No XSAVE, load and save the guest XMM registers manually.
+        ;
+.guest_fpu_state_manually:
         ; Load the full guest XMM register state.
-        mov     r10, [xBP + 020h]       ; pCtx
         mov     r10, [r10 + CPUMCTX.pXStateR0]
         movdqa  xmm0,  [r10 + XMM_OFF_IN_X86FXSTATE + 000h]
@@ -1362,18 +1456,5 @@
         movdqa  [r10 + XMM_OFF_IN_X86FXSTATE + 0e0h], xmm14
         movdqa  [r10 + XMM_OFF_IN_X86FXSTATE + 0f0h], xmm15
-
-        ; Load the host XMM registers.
-        movdqa  xmm6,  [rsp + 040h + 000h]
-        movdqa  xmm7,  [rsp + 040h + 010h]
-        movdqa  xmm8,  [rsp + 040h + 020h]
-        movdqa  xmm9,  [rsp + 040h + 030h]
-        movdqa  xmm10, [rsp + 040h + 040h]
-        movdqa  xmm11, [rsp + 040h + 050h]
-        movdqa  xmm12, [rsp + 040h + 060h]
-        movdqa  xmm13, [rsp + 040h + 070h]
-        movdqa  xmm14, [rsp + 040h + 080h]
-        movdqa  xmm15, [rsp + 040h + 090h]
-        leave
-        ret
+        jmp     .restore_non_volatile_host_xmm_regs
 ENDPROC   HMR0SVMRunWrapXMM
 
