Index: /trunk/include/VBox/vmm/cpum.mac
===================================================================
--- /trunk/include/VBox/vmm/cpum.mac	(revision 55105)
+++ /trunk/include/VBox/vmm/cpum.mac	(revision 55106)
@@ -235,7 +235,10 @@
     .msrApicBase        resb    8
     alignb 8
+    .xcr0               resq    1
+    .fXStateMask        resq    1
     .pXStateR0      RTR0PTR_RES 1
     .pXStateR3      RTR3PTR_RES 1
     .pXStateRC      RTRCPTR_RES 1
+    .aoffXState         resw    64
     alignb 64
 endstruc
Index: /trunk/include/VBox/vmm/cpumctx.h
===================================================================
--- /trunk/include/VBox/vmm/cpumctx.h	(revision 55105)
+++ /trunk/include/VBox/vmm/cpumctx.h	(revision 55106)
@@ -399,4 +399,10 @@
     /** @} */
 
+    /** The XCR0 register. */
+    uint64_t                    xcr0;
+    /** The mask to pass to XSAVE/XRSTOR in EDX:EAX.  If zero we use
+     *  FXSAVE/FXRSTOR (since bit 0 will always be set, we only need to test it). */
+    uint64_t                    fXStateMask;
+
     /** Pointer to the FPU/SSE/AVX/XXXX state ring-0 mapping. */
     R0PTRTYPE(PX86XSAVEAREA)    pXStateR0;
@@ -405,7 +411,9 @@
     /** Pointer to the FPU/SSE/AVX/XXXX state raw-mode mapping. */
     RCPTRTYPE(PX86XSAVEAREA)    pXStateRC;
+    /** State component offsets into pXState, UINT16_MAX if not present. */
+    uint16_t                    aoffXState[64];
 
     /** Size padding. */
-    uint32_t        au32SizePadding[HC_ARCH_BITS == 32 ? 3 : 1];
+    uint32_t        au32SizePadding[HC_ARCH_BITS == 32 ? 15 : 13];
 } CPUMCTX;
 #pragma pack()
Index: /trunk/src/VBox/VMM/VMMR0/CPUMR0A.asm
===================================================================
--- /trunk/src/VBox/VMM/VMMR0/CPUMR0A.asm	(revision 55105)
+++ /trunk/src/VBox/VMM/VMMR0/CPUMR0A.asm	(revision 55106)
@@ -41,4 +41,11 @@
 %define DS_OFF_IN_X86FXSTATE    14h
 
+;; For numeric expressions
+%ifdef RT_ARCH_AMD64
+ %define CPUMR0_IS_AMD64        1
+%else
+ %define CPUMR0_IS_AMD64        0
+%endif
+
 
 ;*******************************************************************************
@@ -105,49 +112,4 @@
 %endmacro
 %endif ; Unused.
-
-
-;; Macro for FXSAVE for the guest FPU but tries to figure out whether to
-;  save the 32-bit FPU state or 64-bit FPU state.
-;
-; @param    %1      Pointer to CPUMCPU.
-; @param    %2      Pointer to XState.
-; @uses     xAX, xDX, EFLAGS, 20h of stack.
-;
-%macro SAVE_32_OR_64_FPU 2
-        o64 fxsave [%2]
-
-        xor     edx, edx
-        cmp     dword [%2 + CS_OFF_IN_X86FXSTATE], 0
-        jne     short %%save_done
-
-        sub     rsp, 20h                ; Only need 1ch bytes but keep stack aligned otherwise we #GP(0).
-        fnstenv [rsp]
-        movzx   eax, word [rsp + 10h]
-        mov     [%2 + CS_OFF_IN_X86FXSTATE], eax
-        movzx   eax, word [rsp + 18h]
-        add     rsp, 20h
-        mov     [%2 + DS_OFF_IN_X86FXSTATE], eax
-        mov     edx, X86_FXSTATE_RSVD_32BIT_MAGIC
-
-%%save_done:
-        mov     dword [%2 + X86_OFF_FXSTATE_RSVD], edx
-%endmacro
-
-;;
-; Wrapper for selecting 32-bit or 64-bit FXRSTOR according to what SAVE_32_OR_64_FPU did.
-;
-; @param    %1      Pointer to CPUMCPU.
-; @param    %2      Pointer to XState.
-; @uses     xAX, xDX, EFLAGS
-;
-%macro RESTORE_32_OR_64_FPU 2
-        cmp     dword [%2 + X86_OFF_FXSTATE_RSVD], X86_FXSTATE_RSVD_32BIT_MAGIC
-        jne     short %%restore_64bit_fpu
-        fxrstor [%2]
-        jmp     short %%restore_fpu_done
-%%restore_64bit_fpu:
-        o64 fxrstor [%2]
-%%restore_fpu_done:
-%endmacro
 
 
@@ -186,4 +148,345 @@
 
 ;;
+; Saves the host state.
+;
+; @uses     rax, rdx
+; @param    pCpumCpu    Define for the register containing the CPUMCPU pointer.
+; @param    pXState     Define for the register containing the extended state pointer.
+;
+%macro CPUMR0_SAVE_HOST 0
+        ;
+        ; Load a couple of registers we'll use later in all branches.
+        ;
+        mov     pXState, [pCpumCpu + CPUMCPU.Host.pXStateR0]
+        mov     eax, [pCpumCpu + CPUMCPU.Host.fXStateMask]
+
+%ifdef VBOX_WITH_HYBRID_32BIT_KERNEL
+        ; The joy of 32-bit darwin kernels that run the CPU in 64-bit mode.
+        cmp     byte [NAME(g_fCPUMIs64bitHost)], 0
+        jz      %%host_legacy_mode
+        db      0xea                    ; jmp far .sixtyfourbit_mode
+        dd      %%host_sixtyfourbit_mode, NAME(SUPR0Abs64bitKernelCS)
+BITS 64
+%%host_sixtyfourbit_mode:
+        or      eax, eax
+        jz      %%host_sixtyfourbit_fxsave
+
+        ; XSAVE
+        mov     edx, [pCpumCpu + CPUMCPU.Host.fXStateMask + 4]
+        o64 xsave [pXState]
+        jmp     %%host_sixtyfourbit_done
+
+        ; FXSAVE
+%%host_sixtyfourbit_fxsave:
+        o64 fxsave [pXState]
+
+%%host_sixtyfourbit_done:
+        jmp far [%%host_fpret wrt rip]
+%%host_fpret:                           ; 16:32 Pointer to %%host_done.
+        dd      %%host_done, NAME(SUPR0AbsKernelCS)
+BITS 32
+
+%%host_legacy_mode:
+%endif
+
+        ;
+        ; XSAVE or FXSAVE?
+        ;
+        or      eax, eax
+        jz      %%host_fxsave
+
+        ; XSAVE
+        mov     edx, [pCpumCpu + CPUMCPU.Host.fXStateMask + 4]
+%ifdef RT_ARCH_AMD64
+        o64 xsave [pXState]
+%else
+        xsave   [pXState]
+%endif
+        jmp     %%host_done
+
+        ; FXSAVE
+%%host_fxsave:
+%ifdef RT_ARCH_AMD64
+        o64 fxsave [pXState]            ; Use explicit REX prefix. See @bugref{6398}.
+%else
+        fxsave  [pXState]
+%endif
+
+%%host_done:
+%endmacro ; CPUMR0_SAVE_HOST
+
+
+;;
+; Loads the host state.
+;
+; @uses     rax, rdx
+; @param    pCpumCpu    Define for the register containing the CPUMCPU pointer.
+; @param    pXState     Define for the register containing the extended state pointer.
+;
+%macro CPUMR0_LOAD_HOST 0
+        ;
+        ; Load a couple of registers we'll use later in all branches.
+        ;
+        mov     pXState, [pCpumCpu + CPUMCPU.Host.pXStateR0]
+        mov     eax, [pCpumCpu + CPUMCPU.Host.fXStateMask]
+
+%ifdef VBOX_WITH_HYBRID_32BIT_KERNEL
+        ; The joy of 32-bit darwin kernels that run the CPU in 64-bit mode.
+        cmp     byte [NAME(g_fCPUMIs64bitHost)], 0
+        jz      %%host_legacy_mode
+        db      0xea                    ; jmp far .sixtyfourbit_mode
+        dd      %%host_sixtyfourbit_mode, NAME(SUPR0Abs64bitKernelCS)
+BITS 64
+%%host_sixtyfourbit_mode:
+        or      eax, eax
+        jz      %%host_sixtyfourbit_fxrstor
+
+        ; XRSTOR
+        mov     edx, [pCpumCpu + CPUMCPU.Host.fXStateMask + 4]
+        o64 xrstor [pXState]
+        jmp     %%host_sixtyfourbit_done
+
+        ; FXRSTOR
+%%host_sixtyfourbit_fxrstor:
+        o64 fxrstor [pXState]
+
+%%host_sixtyfourbit_done:
+        jmp far [%%host_fpret wrt rip]
+%%host_fpret:                           ; 16:32 Pointer to %%host_done.
+        dd      %%host_done, NAME(SUPR0AbsKernelCS)
+BITS 32
+
+%%host_legacy_mode:
+%endif
+
+        ;
+        ; XRSTOR or FXRSTOR?
+        ;
+        or      eax, eax
+        jz      %%host_fxrstor
+
+        ; XRSTOR
+        mov     edx, [pCpumCpu + CPUMCPU.Host.fXStateMask + 4]
+%ifdef RT_ARCH_AMD64
+        o64 xrstor [pXState]
+%else
+        xrstor  [pXState]
+%endif
+        jmp     %%host_done
+
+        ; FXRSTOR
+%%host_fxrstor:
+%ifdef RT_ARCH_AMD64
+        o64 fxrstor [pXState]           ; Use explicit REX prefix. See @bugref{6398}.
+%else
+        fxrstor [pXState]
+%endif
+
+%%host_done:
+%endmacro ; CPUMR0_LOAD_HOST
+
+
+
+;; Macro for FXSAVE for the guest FPU but tries to figure out whether to
+;  save the 32-bit FPU state or 64-bit FPU state.
+;
+; @param    %1      Pointer to CPUMCPU - unused; the body uses the pCpumCpu define directly.
+; @param    %2      Pointer to XState - unused; the body uses the pXState define directly.
+; @param    %3      Force AMD64
+; @uses     xAX, xDX, EFLAGS, 20h of stack.
+;
+%macro SAVE_32_OR_64_FPU 3
+%if CPUMR0_IS_AMD64 || %3
+        ; Save the guest FPU (32-bit or 64-bit), preserves existing broken state. See @bugref{7138}.
+        test    dword [pCpumCpu + CPUMCPU.fUseFlags], CPUM_USE_SUPPORTS_LONGMODE
+        jnz     short %%save_long_mode_guest
+%endif
+        fxsave  [pXState]
+%if CPUMR0_IS_AMD64 || %3
+        jmp     %%save_done_32bit_cs_ds
+
+%%save_long_mode_guest:
+        o64 fxsave [pXState]
+
+        xor     edx, edx
+        cmp     dword [pXState + CS_OFF_IN_X86FXSTATE], 0
+        jne     short %%save_done
+
+        sub     rsp, 20h                ; Only need 1ch bytes but keep stack aligned otherwise we #GP(0).
+        fnstenv [rsp]
+        movzx   eax, word [rsp + 10h]
+        mov     [pXState + CS_OFF_IN_X86FXSTATE], eax
+        movzx   eax, word [rsp + 18h]
+        add     rsp, 20h
+        mov     [pXState + DS_OFF_IN_X86FXSTATE], eax
+%endif
+%%save_done_32bit_cs_ds:
+        mov     edx, X86_FXSTATE_RSVD_32BIT_MAGIC
+%%save_done:
+        mov     dword [pXState + X86_OFF_FXSTATE_RSVD], edx
+%endmacro ; SAVE_32_OR_64_FPU
+
+
+;;
+; Save the guest state.
+;
+; @uses     rax, rdx
+; @param    pCpumCpu    Define for the register containing the CPUMCPU pointer.
+; @param    pXState     Define for the register containing the extended state pointer.
+;
+%macro CPUMR0_SAVE_GUEST 0
+        ;
+        ; Load a couple of registers we'll use later in all branches.
+        ;
+        mov     pXState, [pCpumCpu + CPUMCPU.Guest.pXStateR0]
+        mov     eax, [pCpumCpu + CPUMCPU.Guest.fXStateMask]
+
+%ifdef VBOX_WITH_HYBRID_32BIT_KERNEL
+        ; The joy of 32-bit darwin kernels that run the CPU in 64-bit mode.
+        cmp     byte [NAME(g_fCPUMIs64bitHost)], 0
+        jz      %%guest_legacy_mode
+        db      0xea                    ; jmp far .sixtyfourbit_mode
+        dd      %%guest_sixtyfourbit_mode, NAME(SUPR0Abs64bitKernelCS)
+BITS 64
+%%guest_sixtyfourbit_mode:
+        or      eax, eax
+        jz      %%guest_sixtyfourbit_fxsave
+
+        ; XSAVE
+        mov     edx, [pCpumCpu + CPUMCPU.Guest.fXStateMask + 4]
+        o64 xsave [pXState]
+        jmp     %%guest_sixtyfourbit_done
+
+        ; FXSAVE
+%%guest_sixtyfourbit_fxsave:
+        SAVE_32_OR_64_FPU pCpumCpu, pXState, 1
+
+%%guest_sixtyfourbit_done:
+        jmp far [%%guest_fpret wrt rip]
+%%guest_fpret:                          ; 16:32 Pointer to %%guest_done.
+        dd      %%guest_done, NAME(SUPR0AbsKernelCS)
+BITS 32
+
+%%guest_legacy_mode:
+%endif
+
+        ;
+        ; XSAVE or FXSAVE?
+        ;
+        or      eax, eax
+        jz      %%guest_fxsave
+
+        ; XSAVE
+        mov     edx, [pCpumCpu + CPUMCPU.Guest.fXStateMask + 4]
+%ifdef RT_ARCH_AMD64
+        o64 xsave [pXState]
+%else
+        xsave   [pXState]
+%endif
+        jmp     %%guest_done
+
+        ; FXSAVE
+%%guest_fxsave:
+        SAVE_32_OR_64_FPU pCpumCpu, pXState, 0
+
+%%guest_done:
+%endmacro ; CPUMR0_SAVE_GUEST
+
+
+;;
+; Wrapper for selecting 32-bit or 64-bit FXRSTOR according to what SAVE_32_OR_64_FPU did.
+;
+; @param    %1      Pointer to CPUMCPU - unused; the body uses the pCpumCpu define directly.
+; @param    %2      Pointer to XState - unused; the body uses the pXState define directly.
+; @param    %3      Force AMD64.
+; @uses     xAX, xDX, EFLAGS
+;
+%macro RESTORE_32_OR_64_FPU 3
+%if CPUMR0_IS_AMD64 || %3
+        ; Restore the guest FPU (32-bit or 64-bit), preserves existing broken state. See @bugref{7138}.
+        test    dword [pCpumCpu + CPUMCPU.fUseFlags], CPUM_USE_SUPPORTS_LONGMODE
+        jz      %%restore_32bit_fpu
+        cmp     dword [pXState + X86_OFF_FXSTATE_RSVD], X86_FXSTATE_RSVD_32BIT_MAGIC
+        jne     short %%restore_64bit_fpu
+%%restore_32bit_fpu:
+%endif
+        fxrstor [pXState]
+%if CPUMR0_IS_AMD64 || %3
+        ; TODO: Restore XMM8-XMM15!
+        jmp     short %%restore_fpu_done
+%%restore_64bit_fpu:
+        o64 fxrstor [pXState]
+%%restore_fpu_done:
+%endif
+%endmacro ; RESTORE_32_OR_64_FPU
+
+
+;;
+; Loads the guest state.
+;
+; @uses     rax, rdx
+; @param    pCpumCpu    Define for the register containing the CPUMCPU pointer.
+; @param    pXState     Define for the register containing the extended state pointer.
+;
+%macro CPUMR0_LOAD_GUEST 0
+        ;
+        ; Load a couple of registers we'll use later in all branches.
+        ;
+        mov     pXState, [pCpumCpu + CPUMCPU.Guest.pXStateR0]
+        mov     eax, [pCpumCpu + CPUMCPU.Guest.fXStateMask]
+
+%ifdef VBOX_WITH_HYBRID_32BIT_KERNEL
+        ; The joy of 32-bit darwin kernels that run the CPU in 64-bit mode.
+        cmp     byte [NAME(g_fCPUMIs64bitHost)], 0
+        jz      %%guest_legacy_mode
+        db      0xea                    ; jmp far .sixtyfourbit_mode
+        dd      %%guest_sixtyfourbit_mode, NAME(SUPR0Abs64bitKernelCS)
+BITS 64
+%%guest_sixtyfourbit_mode:
+        or      eax, eax
+        jz      %%guest_sixtyfourbit_fxrstor
+
+        ; XRSTOR
+        mov     edx, [pCpumCpu + CPUMCPU.Guest.fXStateMask + 4]
+        o64 xrstor [pXState]
+        jmp     %%guest_sixtyfourbit_done
+
+        ; FXRSTOR
+%%guest_sixtyfourbit_fxrstor:
+        RESTORE_32_OR_64_FPU pCpumCpu, pXState, 1
+
+%%guest_sixtyfourbit_done:
+        jmp far [%%guest_fpret wrt rip]
+%%guest_fpret:                          ; 16:32 Pointer to %%guest_done.
+        dd      %%guest_done, NAME(SUPR0AbsKernelCS)
+BITS 32
+
+%%guest_legacy_mode:
+%endif
+
+        ;
+        ; XRSTOR or FXRSTOR?
+        ;
+        or      eax, eax
+        jz      %%guest_fxrstor
+
+        ; XRSTOR
+        mov     edx, [pCpumCpu + CPUMCPU.Guest.fXStateMask + 4]
+%ifdef RT_ARCH_AMD64
+        o64 xrstor [pXState]
+%else
+        xrstor  [pXState]
+%endif
+        jmp     %%guest_done
+
+        ; FXRSTOR
+%%guest_fxrstor:
+        RESTORE_32_OR_64_FPU pCpumCpu, pXState, 0
+
+%%guest_done:
+%endmacro ; CPUMR0_LOAD_GUEST
+
+
+;;
 ; Saves the host FPU/SSE/AVX state and restores the guest FPU/SSE/AVX state.
 ;
@@ -219,34 +522,6 @@
         SAVE_CR0_CLEAR_FPU_TRAPS xCX, xAX ; xCX is now old CR0 value, don't use!
 
-        ;
-        ; Switch state.
-        ;
-        mov     pXState, [pCpumCpu + CPUMCPU.Host.pXStateR0]
-
-%ifdef VBOX_WITH_HYBRID_32BIT_KERNEL
-        cmp     byte [NAME(g_fCPUMIs64bitHost)], 0
-        jz      .legacy_mode
-        db      0xea                    ; jmp far .sixtyfourbit_mode
-        dd      .sixtyfourbit_mode, NAME(SUPR0Abs64bitKernelCS)
-.legacy_mode:
-%endif
-
-%ifdef RT_ARCH_AMD64
-        o64 fxsave [pXState]            ; Use explicit REX prefix. See @bugref{6398}.
-
-        ; Restore the guest FPU (32-bit or 64-bit), preserves existing broken state. See @bugref{7138}.
-        mov     pXState, [pCpumCpu + CPUMCPU.Guest.pXStateR0]
-        test    dword [pCpumCpu + CPUMCPU.fUseFlags], CPUM_USE_SUPPORTS_LONGMODE
-        jnz     short .fpu_load_32_or_64
-        fxrstor [pXState]
-        jmp     short .fpu_load_done
-.fpu_load_32_or_64:
-        RESTORE_32_OR_64_FPU pCpumCpu, pXState
-.fpu_load_done:
-%else
-        fxsave  [pXState]
-        mov     pXState, [pCpumCpu + CPUMCPU.Guest.pXStateR0]
-        fxrstor [pXState]
-%endif
+        CPUMR0_SAVE_HOST
+        CPUMR0_LOAD_GUEST
 
 %ifdef VBOX_WITH_KERNEL_USING_XMM
@@ -265,5 +540,4 @@
 %endif
 
-.done:
         RESTORE_CR0 xCX
         or      dword [pCpumCpu + CPUMCPU.fUseFlags], (CPUM_USED_FPU | CPUM_USED_FPU_SINCE_REM)
@@ -277,26 +551,4 @@
         xor     eax, eax
         ret
-
-%ifdef VBOX_WITH_HYBRID_32BIT_KERNEL_IN_R0
-ALIGNCODE(16)
-BITS 64
-.sixtyfourbit_mode:
-        o64 fxsave  [pXState]
-
-        ; Restore the guest FPU (32-bit or 64-bit), preserves existing broken state. See @bugref{7138}.
-        test    dword [pCpumCpu + CPUMCPU.fUseFlags], CPUM_USE_SUPPORTS_LONGMODE
-        jnz     short .fpu_load_32_or_64_darwin
-        mov     pXState, [pCpumCpu + CPUMCPU.Guest.pXStateR0]
-        fxrstor [pXState]
-        jmp     short .fpu_load_done_darwin
-.fpu_load_32_or_64_darwin:
-        RESTORE_32_OR_64_FPU pCpumCpu, pXState
-.fpu_load_done_darwin:
-
-        jmp far [.fpret wrt rip]
-.fpret:                                 ; 16:32 Pointer to .the_end.
-        dd      .done, NAME(SUPR0AbsKernelCS)
-BITS 32
-%endif
 ENDPROC   cpumR0SaveHostRestoreGuestFPUState
 
@@ -338,24 +590,6 @@
         SAVE_CR0_CLEAR_FPU_TRAPS xCX, xAX ; xCX is now old CR0 value, don't use!
 
-        ;
-        ; Save the host state.
-        ;
-        mov     pXState, [pCpumCpu + CPUMCPU.Host.pXStateR0]
-
-%ifdef VBOX_WITH_HYBRID_32BIT_KERNEL
-        cmp     byte [NAME(g_fCPUMIs64bitHost)], 0
-        jz      .legacy_mode
-        db      0xea                    ; jmp far .sixtyfourbit_mode
-        dd      .sixtyfourbit_mode, NAME(SUPR0Abs64bitKernelCS)
-.legacy_mode:
-%endif
-
-%ifdef RT_ARCH_AMD64
-        o64 fxsave [pXstate]
-%else
-        fxsave  [pXState]
-%endif
-
-.done:
+        CPUMR0_SAVE_HOST
+
         RESTORE_CR0 xCX
         or      dword [pCpumCpu + CPUMCPU.fUseFlags], (CPUM_USED_FPU | CPUM_USED_FPU_SINCE_REM)
@@ -428,33 +662,7 @@
         SAVE_CR0_CLEAR_FPU_TRAPS xCX, xAX ; xCX is now old CR0 value, don't use!
 
-        mov     pXState, [pCpumCpu + CPUMCPU.Guest.pXStateR0]
-
-%ifdef VBOX_WITH_HYBRID_32BIT_KERNEL
-        cmp     byte [NAME(g_fCPUMIs64bitHost)], 0
-        jz      .legacy_mode
-        db      0xea                    ; jmp far .sixtyfourbit_mode
-        dd      .sixtyfourbit_mode, NAME(SUPR0Abs64bitKernelCS)
-.legacy_mode:
-%endif
-
-%ifdef RT_ARCH_AMD64
-        ; Save the guest FPU (32-bit or 64-bit), preserves existing broken state. See @bugref{7138}.
-        test    dword [pCpumCpu + CPUMCPU.fUseFlags], CPUM_USE_SUPPORTS_LONGMODE
-        jnz     short .fpu_save_32_or_64
-        fxsave  [pXState]
-        jmp     short .fpu_save_done
-.fpu_save_32_or_64:
-        SAVE_32_OR_64_FPU pCpumCpu, pXState
-.fpu_save_done:
-
-        mov     pXState, [pCpumCpu + CPUMCPU.Guest.pXStateR0]
-        o64 fxrstor [pXState]           ; Use explicit REX prefix. See @bugref{6398}.
-%else
-        fxsave  [pXState]               ; ASSUMES that all VT-x/AMD-V boxes support fxsave/fxrstor (safe assumption)
-        mov     pXState, [pCpumCpu + CPUMCPU.Guest.pXStateR0]
-        fxrstor [pXState]
-%endif
-
-.done:
+        CPUMR0_SAVE_GUEST
+        CPUMR0_LOAD_HOST
+
         RESTORE_CR0 xCX
         and     dword [pCpumCpu + CPUMCPU.fUseFlags], ~CPUM_USED_FPU
@@ -469,25 +677,4 @@
         xor     eax, eax
         ret
-
-%ifdef VBOX_WITH_HYBRID_32BIT_KERNEL_IN_R0
-ALIGNCODE(16)
-BITS 64
-.sixtyfourbit_mode:
-        ; Save the guest FPU (32-bit or 64-bit), preserves existing broken state. See @bugref{7138}.
-        test    dword [pCpumCpu + CPUMCPU.fUseFlags], CPUM_USE_SUPPORTS_LONGMODE
-        jnz     short .fpu_save_32_or_64_darwin
-        fxsave  [pXState]
-        jmp     short .fpu_save_done_darwin
-.fpu_save_32_or_64_darwin:
-        SAVE_32_OR_64_FPU pCpumCpu, pXState
-.fpu_save_done_darwin:
-
-        mov     pXState, [pCpumCpu + CPUMCPU.Guest.pXStateR0]
-        o64 fxrstor [pXstate]
-        jmp far [.fpret wrt rip]
-.fpret:                                 ; 16:32 Pointer to .the_end.
-        dd      .done, NAME(SUPR0AbsKernelCS)
-BITS 32
-%endif
 %undef pCpumCpu
 %undef pXState
@@ -534,21 +721,6 @@
         SAVE_CR0_CLEAR_FPU_TRAPS xCX, xAX ; xCX is now old CR0 value, don't use!
 
-        mov     pXState, [pCpumCpu + CPUMCPU.Host.pXStateR0]
-
-%ifdef VBOX_WITH_HYBRID_32BIT_KERNEL
-        cmp     byte [NAME(g_fCPUMIs64bitHost)], 0
-        jz      .legacy_mode
-        db      0xea                    ; jmp far .sixtyfourbit_mode
-        dd      .sixtyfourbit_mode, NAME(SUPR0Abs64bitKernelCS)
-.legacy_mode:
-%endif
-
-%ifdef RT_ARCH_AMD64
-        o64 fxrstor [pXState]
-%else
-        fxrstor [pXState]
-%endif
-
-.done:
+        CPUMR0_LOAD_HOST
+
         RESTORE_CR0 xCX
         and     dword [pCpumCpu + CPUMCPU.fUseFlags], ~CPUM_USED_FPU
@@ -563,15 +735,4 @@
         xor     eax, eax
         ret
-
-%ifdef VBOX_WITH_HYBRID_32BIT_KERNEL_IN_R0
-ALIGNCODE(16)
-BITS 64
-.sixtyfourbit_mode:
-        o64 fxrstor [pXState]
-        jmp far [.fpret wrt rip]
-.fpret:                                 ; 16:32 Pointer to .the_end.
-        dd      .done, NAME(SUPR0AbsKernelCS)
-BITS 32
-%endif
 %undef pCpumCPu
 %undef pXState
Index: /trunk/src/VBox/VMM/VMMRC/CPUMRCA.asm
===================================================================
--- /trunk/src/VBox/VMM/VMMRC/CPUMRCA.asm	(revision 55105)
+++ /trunk/src/VBox/VMM/VMMRC/CPUMRCA.asm	(revision 55106)
@@ -145,8 +145,25 @@
         mov     cr0, edx                ; Clear flags so we don't trap here.
 
+        mov     eax, [pCpumCpu + CPUMCPU.Host.fXStateMask]
         mov     pXState, [pCpumCpu + CPUMCPU.Host.pXStateRC]
+        or      eax, eax
+        jz      hlfpua_host_fxsave
+        mov     edx, [pCpumCpu + CPUMCPU.Host.fXStateMask + 4]
+        xsave   [pXState]
+        jmp     hlfpua_host_done
+hlfpua_host_fxsave:
         fxsave  [pXState]
+hlfpua_host_done:
+
+        mov     eax, [pCpumCpu + CPUMCPU.Guest.fXStateMask]
         mov     pXState, [pCpumCpu + CPUMCPU.Guest.pXStateRC]
+        or      eax, eax
+        jz      hlfpua_guest_fxrstor
+        mov     edx, [pCpumCpu + CPUMCPU.Guest.fXStateMask + 4]
+        xrstor  [pXState]
+        jmp     hlfpua_guest_done
+hlfpua_guest_fxrstor:
         fxrstor [pXState]
+hlfpua_guest_done:
 
 hlfpua_finished_switch:
Index: /trunk/src/VBox/VMM/VMMSwitcher/AMD64andLegacy.mac
===================================================================
--- /trunk/src/VBox/VMM/VMMSwitcher/AMD64andLegacy.mac	(revision 55105)
+++ /trunk/src/VBox/VMM/VMMSwitcher/AMD64andLegacy.mac	(revision 55106)
@@ -1139,14 +1139,35 @@
     mov     esi, [rdx + r8 + CPUMCPU.fUseFlags] ; esi == use flags.
     test    esi, CPUM_USED_FPU
-    jz short gth_fpu_no
+    jz      gth_fpu_no
     mov     rcx, cr0
     and     rcx, ~(X86_CR0_TS | X86_CR0_EM)
     mov     cr0, rcx
 
-    mov     rax, [rdx + r8 + CPUMCPU.Guest.pXStateR0]
-    fxsave  [rax]
-    mov     rax, [rdx + r8 + CPUMCPU.Host.pXStateR0]
-    fxrstor [rax]                       ; We saved 32-bit state, so only restore 32-bit.
-    jmp short gth_fpu_no
+    mov     r10, rdx                    ; Save rdx.
+
+    mov     eax, [r10 + r8 + CPUMCPU.Guest.fXStateMask]
+    mov     r9, [r10 + r8 + CPUMCPU.Guest.pXStateR0]
+    or      eax, eax
+    jz      gth_fpu_guest_fxsave
+    mov     edx, [r10 + r8 + CPUMCPU.Guest.fXStateMask + 4]
+    xsave   [r9]
+    jmp     gth_fpu_host
+gth_fpu_guest_fxsave:
+    fxsave  [r9]
+
+gth_fpu_host:
+    mov     eax, [r10 + r8 + CPUMCPU.Host.fXStateMask]
+    mov     r9, [r10 + r8 + CPUMCPU.Host.pXStateR0]
+    or      eax, eax
+    jz      gth_fpu_host_fxrstor
+    mov     edx, [r10 + r8 + CPUMCPU.Host.fXStateMask + 4]
+    xrstor  [r9]                        ; Restore the state XSAVE wrote, as directed by the EDX:EAX mask.
+    jmp     gth_fpu_done
+gth_fpu_host_fxrstor:
+    fxrstor [r9]                        ; We saved 32-bit state, so only restore 32-bit.
+
+gth_fpu_done:
+    mov     rdx, r10                    ; Restore rdx.
+    jmp     gth_fpu_no
 
 ALIGNCODE(16)
Index: /trunk/src/VBox/VMM/VMMSwitcher/LegacyandAMD64.mac
===================================================================
--- /trunk/src/VBox/VMM/VMMSwitcher/LegacyandAMD64.mac	(revision 55105)
+++ /trunk/src/VBox/VMM/VMMSwitcher/LegacyandAMD64.mac	(revision 55106)
@@ -664,6 +664,17 @@
     and     rax, ~(X86_CR0_TS | X86_CR0_EM)
     mov     cr0, rax
-    mov     eax, [rdx + CPUMCPU.Guest.pXStateRC]
-    o64 fxrstor [rax]                   ; (use explicit REX prefix, see @bugref{6398})
+
+    mov     eax, [rdx + CPUMCPU.Guest.fXStateMask]
+    mov     ebx, [rdx + CPUMCPU.Guest.pXStateRC]
+    or      eax, eax
+    jz      htg_fpu_fxrstor
+    mov     r9, rdx
+    mov     edx, [rdx + CPUMCPU.Guest.fXStateMask + 4]
+    o64 xrstor [rbx]
+    mov     rdx, r9
+    jmp     htg_fpu_done
+htg_fpu_fxrstor:
+    o64 fxrstor [rbx]                   ; (use explicit REX prefix, see @bugref{6398})
+htg_fpu_done:
     mov     cr0, rcx                    ; and restore old CR0 again
 
@@ -1258,7 +1269,16 @@
     mov     cr0, rax
 
-    mov     eax, [rsi + CPUMCTX.pXStateRC]
-    o64 fxsave  [rax]                   ; (use explicit REX prefix, see @bugref{6398})
-
+    mov     eax, [rsi + CPUMCTX.fXStateMask]
+    mov     r9d, [rsi + CPUMCTX.pXStateRC] ; use r9, not rcx: rcx holds the old CR0 value restored below
+    test    eax, eax
+    jz      .use_fxsave
+    mov     edx, [rsi + CPUMCTX.fXStateMask + 4]
+    o64 xsave [r9]
+    jmp     .done
+
+.use_fxsave:
+    o64 fxsave  [r9]                    ; (use explicit REX prefix, see @bugref{6398})
+
+.done:
     mov     cr0, rcx                    ; and restore old CR0 again
 
Index: /trunk/src/VBox/VMM/VMMSwitcher/PAEand32Bit.mac
===================================================================
--- /trunk/src/VBox/VMM/VMMSwitcher/PAEand32Bit.mac	(revision 55105)
+++ /trunk/src/VBox/VMM/VMMSwitcher/PAEand32Bit.mac	(revision 55106)
@@ -990,8 +990,29 @@
     mov     cr0, ecx
 
-    mov     eax, [edx + CPUMCPU.Guest.pXStateR0]
-    mov     ecx, [edx + CPUMCPU.Host.pXStateR0]
-    fxsave  [eax]
+    mov     ebx, edx                    ; save edx
+
+    mov     eax, [ebx + CPUMCPU.Guest.fXStateMask]
+    mov     ecx, [ebx + CPUMCPU.Guest.pXStateR0]
+    test    eax, eax
+    jz      gth_fpu_guest_fxsave
+    mov     edx, [ebx + CPUMCPU.Guest.fXStateMask + 4]
+    xsave   [ecx]
+    jmp     gth_fpu_host
+gth_fpu_guest_fxsave:
+    fxsave  [ecx]
+
+gth_fpu_host:
+    mov     eax, [ebx + CPUMCPU.Host.fXStateMask]
+    mov     ecx, [ebx + CPUMCPU.Host.pXStateR0]
+    test    eax, eax
+    jz      gth_fpu_host_fxrstor
+    mov     edx, [ebx + CPUMCPU.Host.fXStateMask + 4]
+    xrstor  [ecx]
+    jmp     gth_fpu_done
+gth_fpu_host_fxrstor:
     fxrstor [ecx]
+
+gth_fpu_done:
+    mov     edx, ebx                    ; restore edx
 gth_fpu_no:
 
@@ -1003,5 +1024,5 @@
     mov     ecx, [edx + CPUMCPU.Host.cr0]
     mov     cr0, ecx
-    ;mov     ecx, [edx + CPUMCPU.Host.cr2] ; assumes this is waste of time.
+    ;mov     ecx, [edx + CPUMCPU.Host.cr2] ; assumes this is a waste of time.
     ;mov     cr2, ecx
 
Index: /trunk/src/VBox/VMM/include/CPUMInternal.h
===================================================================
--- /trunk/src/VBox/VMM/include/CPUMInternal.h	(revision 55105)
+++ /trunk/src/VBox/VMM/include/CPUMInternal.h	(revision 55106)
@@ -410,5 +410,5 @@
 
     /* padding to get 64byte aligned size */
-    uint8_t         auPadding[16+20];
+    uint8_t         auPadding[20];
 
 #elif HC_ARCH_BITS == 64 || defined(VBOX_WITH_HYBRID_32BIT_KERNEL)
@@ -456,7 +456,7 @@
     /* padding to get 32byte aligned size */
 # ifdef VBOX_WITH_HYBRID_32BIT_KERNEL
+    uint8_t         auPadding[52];
+# else
     uint8_t         auPadding[4];
-# else
-    uint8_t         auPadding[8+12];
 # endif
 
@@ -471,4 +471,9 @@
     /** Pointer to the FPU/SSE/AVX/XXXX state ring-3 mapping. */
     R3PTRTYPE(PX86XSAVEAREA)    pXStateR3;
+    /** The XCR0 register. */
+    uint64_t                    xcr0;
+    /** The mask to pass to XSAVE/XRSTOR in EDX:EAX.  If zero we use
+     *  FXSAVE/FXRSTOR (since bit 0 will always be set, we only need to test it). */
+    uint64_t                    fXStateMask;
 } CPUMHOSTCTX;
 AssertCompileSizeAlignment(CPUMHOSTCTX, 64);
Index: /trunk/src/VBox/VMM/include/CPUMInternal.mac
===================================================================
--- /trunk/src/VBox/VMM/include/CPUMInternal.mac	(revision 55105)
+++ /trunk/src/VBox/VMM/include/CPUMInternal.mac	(revision 55106)
@@ -217,7 +217,10 @@
     .Guest.msrKERNELGSBASE    resb    8
     .Guest.msrApicBase        resb    8
-    .Guest.pXStateR0          RTR0PTR_RES 1
-    .Guest.pXStateR3          RTR3PTR_RES 1
-    .Guest.pXStateRC          RTRCPTR_RES 1
+    .Guest.xcr0               resq    1
+    .Guest.fXStateMask        resq    1
+    .Guest.pXStateR0      RTR0PTR_RES 1
+    .Guest.pXStateR3      RTR3PTR_RES 1
+    .Guest.pXStateRC      RTRCPTR_RES 1
+    .Guest.aoffXState         resw    64
 
     alignb 64
@@ -324,5 +327,5 @@
     .Host.SysEnter.esp   resq    1
     .Host.efer           resq    1
-    .Host.auPadding      resb    (16+20)
+    .Host.auPadding      resb    (20)
 
 %else ; 64-bit
@@ -357,7 +360,7 @@
     .Host.efer           resq    1
  %if fVBOX_WITH_HYBRID_32BIT_KERNEL
+    .Host.auPadding      resb    54  ; BUGBUG: CPUMInternal.h declares auPadding[52] for this case - verify the two stay in sync
+ %else
     .Host.auPadding      resb    4
- %else
-    .Host.auPadding      resb   (8+12)
  %endif
 %endif ; 64-bit
@@ -366,4 +369,7 @@
     .Host.pXStateR0 RTR0PTR_RES  1
     .Host.pXStateR3 RTR3PTR_RES  1
+    alignb 8
+    .Host.xcr0           resq    1
+    .Host.fXStateMask    resq    1
 
     ;
@@ -470,7 +476,10 @@
     .Hyper.msrKERNELGSBASE    resb    8
     .Hyper.msrApicBase        resb    8
-    .Hyper.pXStateR0          RTR0PTR_RES 1
-    .Hyper.pXStateR3          RTR3PTR_RES 1
-    .Hyper.pXStateRC          RTRCPTR_RES 1
+    .Hyper.xcr0               resq    1
+    .Hyper.fXStateMask        resq    1
+    .Hyper.pXStateR0      RTR0PTR_RES 1
+    .Hyper.pXStateR3      RTR3PTR_RES 1
+    .Hyper.pXStateRC      RTRCPTR_RES 1
+    .Hyper.aoffXState         resw    64
     alignb 64
 
