Index: /trunk/include/VBox/vmm/cpum.mac
===================================================================
--- /trunk/include/VBox/vmm/cpum.mac	(revision 55047)
+++ /trunk/include/VBox/vmm/cpum.mac	(revision 55048)
@@ -136,5 +136,4 @@
 
 struc CPUMCTX
-    .XState             resb    XSTATE_SIZE
     .eax                resq    1
     .ecx                resq    1
@@ -235,5 +234,9 @@
     .msrKERNELGSBASE    resb    8
     .msrApicBase        resb    8
-    .au32SizePadding    resb    24
+    alignb 8
+    .pXStateR0      RTR0PTR_RES 1
+    .pXStateR3      RTR3PTR_RES 1
+    .pXStateRC      RTRCPTR_RES 1
+    alignb 64
 endstruc
 
Index: /trunk/include/VBox/vmm/cpumctx.h
===================================================================
--- /trunk/include/VBox/vmm/cpumctx.h	(revision 55047)
+++ /trunk/include/VBox/vmm/cpumctx.h	(revision 55048)
@@ -247,9 +247,4 @@
 typedef struct CPUMCTX
 {
-    /** FPU state. (16-byte alignment)
-     * @todo This doesn't have to be in X86FXSTATE on CPUs without fxsr - we need a type for the
-     *       actual format or convert it (waste of time).  */
-    X86XSAVEAREA        XState;
-
     /** CPUMCTXCORE Part.
      * @{ */
@@ -404,6 +399,13 @@
     /** @} */
 
+    /** Pointer to the FPU/SSE/AVX/XXXX state ring-0 mapping. */
+    R0PTRTYPE(PX86XSAVEAREA)    pXStateR0;
+    /** Pointer to the FPU/SSE/AVX/XXXX state ring-3 mapping. */
+    R3PTRTYPE(PX86XSAVEAREA)    pXStateR3;
+    /** Pointer to the FPU/SSE/AVX/XXXX state raw-mode mapping. */
+    RCPTRTYPE(PX86XSAVEAREA)    pXStateRC;
+
     /** Size padding. */
-    uint32_t        au32SizePadding[6];
+    uint32_t        au32SizePadding[HC_ARCH_BITS == 32 ? 3 : 1];
 } CPUMCTX;
 #pragma pack()
Index: /trunk/include/VBox/vmm/ssm.h
===================================================================
--- /trunk/include/VBox/vmm/ssm.h	(revision 55047)
+++ /trunk/include/VBox/vmm/ssm.h	(revision 55048)
@@ -378,4 +378,9 @@
 /** Saved using SSMR3PutMem, don't be too strict. */
 #define SSMSTRUCT_FLAGS_SAVED_AS_MEM        RT_BIT_32(3)
+/** No introductory structure marker.  Use when splitting up structures.  */
+#define SSMSTRUCT_FLAGS_NO_LEAD_MARKER      RT_BIT_32(4)
+/** No trailing structure marker.  Use when splitting up structures.  */
+#define SSMSTRUCT_FLAGS_NO_TAIL_MARKER      RT_BIT_32(5)
+
 /** Band-aid for old SSMR3PutMem/SSMR3GetMem of structurs with host pointers.
  * @remarks This type is normally only used up to the first changes to the
@@ -390,5 +395,5 @@
                                               | SSMSTRUCT_FLAGS_NO_MARKERS  | SSMSTRUCT_FLAGS_SAVED_AS_MEM)
 /** Mask of the valid bits. */
-#define SSMSTRUCT_FLAGS_VALID_MASK          UINT32_C(0x0000000f)
+#define SSMSTRUCT_FLAGS_VALID_MASK          UINT32_C(0x0000003f)
 /** @} */
 
Index: /trunk/include/VBox/vmm/vm.h
===================================================================
--- /trunk/include/VBox/vmm/vm.h	(revision 55047)
+++ /trunk/include/VBox/vmm/vm.h	(revision 55048)
@@ -246,5 +246,5 @@
         struct CPUMCPU      s;
 #endif
-        uint8_t             padding[28672];      /* multiple of 4096 */
+        uint8_t             padding[4096];      /* multiple of 4096 */
     } cpum;
 
Index: /trunk/include/VBox/vmm/vm.mac
===================================================================
--- /trunk/include/VBox/vmm/vm.mac	(revision 55047)
+++ /trunk/include/VBox/vmm/vm.mac	(revision 55048)
@@ -148,5 +148,5 @@
     .pgm                    resb 4096
     alignb 4096
-    .cpum                   resb 28672
+    .cpum                   resb 4096
     alignb 4096
 endstruc
Index: /trunk/include/iprt/x86.h
===================================================================
--- /trunk/include/iprt/x86.h	(revision 55047)
+++ /trunk/include/iprt/x86.h	(revision 55048)
@@ -2457,5 +2457,6 @@
 #define X86_OFF_FXSTATE_RSVD            0x1d0
 /** The 32-bit magic used to recognize if this a 32-bit FPU state. Don't
- *  forget to update x86.mac if you change this! */
+ *  forget to update x86.mac if you change this!
+ * @todo r=bird: This has nothing what-so-ever to do here.... */
 #define X86_FXSTATE_RSVD_32BIT_MAGIC    0x32b3232b
 #ifndef VBOX_FOR_DTRACE_LIB
@@ -2791,4 +2792,8 @@
 AssertCompileMemberOffset(X86XSAVEAREA, u.Intel.Zmm16Hi,    0x680 /* 1664 */);
 #endif
+/** Pointer to a XSAVE area. */
+typedef X86XSAVEAREA *PX86XSAVEAREA;
+/** Pointer to a const XSAVE area. */
+typedef X86XSAVEAREA const *PCX86XSAVEAREA;
 
 
Index: /trunk/src/VBox/VMM/VMMAll/IEMAll.cpp
===================================================================
--- /trunk/src/VBox/VMM/VMMAll/IEMAll.cpp	(revision 55047)
+++ /trunk/src/VBox/VMM/VMMAll/IEMAll.cpp	(revision 55048)
@@ -5226,34 +5226,36 @@
  * @param   pIemCpu             The IEM per CPU data.
  * @param   pCtx                The CPU context.
- */
-DECLINLINE(void) iemFpuUpdateOpcodeAndIpWorker(PIEMCPU pIemCpu, PCPUMCTX pCtx)
-{
-    pCtx->XState.x87.FOP   = pIemCpu->abOpcode[pIemCpu->offFpuOpcode]
-                    | ((uint16_t)(pIemCpu->abOpcode[pIemCpu->offFpuOpcode - 1] & 0x7) << 8);
-    /** @todo XState.x87.CS and FPUIP needs to be kept seperately. */
+ * @param   pFpuCtx             The FPU context.
+ */
+DECLINLINE(void) iemFpuUpdateOpcodeAndIpWorker(PIEMCPU pIemCpu, PCPUMCTX pCtx, PX86FXSTATE pFpuCtx)
+{
+    pFpuCtx->FOP       = pIemCpu->abOpcode[pIemCpu->offFpuOpcode]
+                       | ((uint16_t)(pIemCpu->abOpcode[pIemCpu->offFpuOpcode - 1] & 0x7) << 8);
+    /** @todo x87.CS and FPUIP need to be kept separately. */
     if (IEM_IS_REAL_OR_V86_MODE(pIemCpu))
     {
         /** @todo Testcase: making assumptions about how FPUIP and FPUDP are handled
          *        happens in real mode here based on the fnsave and fnstenv images. */
-        pCtx->XState.x87.CS    = 0;
-        pCtx->XState.x87.FPUIP = pCtx->eip | ((uint32_t)pCtx->cs.Sel << 4);
+        pFpuCtx->CS    = 0;
+        pFpuCtx->FPUIP = pCtx->eip | ((uint32_t)pCtx->cs.Sel << 4);
     }
     else
     {
-        pCtx->XState.x87.CS    = pCtx->cs.Sel;
-        pCtx->XState.x87.FPUIP = pCtx->rip;
-    }
-}
-
-
-/**
- * Updates the XState.x87.DS and FPUDP registers.
+        pFpuCtx->CS    = pCtx->cs.Sel;
+        pFpuCtx->FPUIP = pCtx->rip;
+    }
+}
+
+
+/**
+ * Updates the x87.DS and FPUDP registers.
  *
  * @param   pIemCpu             The IEM per CPU data.
  * @param   pCtx                The CPU context.
+ * @param   pFpuCtx             The FPU context.
  * @param   iEffSeg             The effective segment register.
  * @param   GCPtrEff            The effective address relative to @a iEffSeg.
  */
-DECLINLINE(void) iemFpuUpdateDP(PIEMCPU pIemCpu, PCPUMCTX pCtx, uint8_t iEffSeg, RTGCPTR GCPtrEff)
+DECLINLINE(void) iemFpuUpdateDP(PIEMCPU pIemCpu, PCPUMCTX pCtx, PX86FXSTATE pFpuCtx, uint8_t iEffSeg, RTGCPTR GCPtrEff)
 {
     RTSEL sel;
@@ -5270,14 +5272,14 @@
             sel = pCtx->ds.Sel;
     }
-    /** @todo XState.x87.DS and FPUDP needs to be kept seperately. */
+    /** @todo pFpuCtx->DS and FPUDP need to be kept separately. */
     if (IEM_IS_REAL_OR_V86_MODE(pIemCpu))
     {
-        pCtx->XState.x87.DS    = 0;
-        pCtx->XState.x87.FPUDP = (uint32_t)GCPtrEff | ((uint32_t)sel << 4);
+        pFpuCtx->DS    = 0;
+        pFpuCtx->FPUDP = (uint32_t)GCPtrEff | ((uint32_t)sel << 4);
     }
     else
     {
-        pCtx->XState.x87.DS    = sel;
-        pCtx->XState.x87.FPUDP = GCPtrEff;
+        pFpuCtx->DS    = sel;
+        pFpuCtx->FPUDP = GCPtrEff;
     }
 }
@@ -5287,19 +5289,19 @@
  * Rotates the stack registers in the push direction.
  *
- * @param   pCtx                The CPU context.
+ * @param   pFpuCtx             The FPU context.
  * @remarks This is a complete waste of time, but fxsave stores the registers in
  *          stack order.
  */
-DECLINLINE(void) iemFpuRotateStackPush(PCPUMCTX pCtx)
-{
-    RTFLOAT80U r80Tmp = pCtx->XState.x87.aRegs[7].r80;
-    pCtx->XState.x87.aRegs[7].r80 = pCtx->XState.x87.aRegs[6].r80;
-    pCtx->XState.x87.aRegs[6].r80 = pCtx->XState.x87.aRegs[5].r80;
-    pCtx->XState.x87.aRegs[5].r80 = pCtx->XState.x87.aRegs[4].r80;
-    pCtx->XState.x87.aRegs[4].r80 = pCtx->XState.x87.aRegs[3].r80;
-    pCtx->XState.x87.aRegs[3].r80 = pCtx->XState.x87.aRegs[2].r80;
-    pCtx->XState.x87.aRegs[2].r80 = pCtx->XState.x87.aRegs[1].r80;
-    pCtx->XState.x87.aRegs[1].r80 = pCtx->XState.x87.aRegs[0].r80;
-    pCtx->XState.x87.aRegs[0].r80 = r80Tmp;
+DECLINLINE(void) iemFpuRotateStackPush(PX86FXSTATE pFpuCtx)
+{
+    RTFLOAT80U r80Tmp = pFpuCtx->aRegs[7].r80;
+    pFpuCtx->aRegs[7].r80 = pFpuCtx->aRegs[6].r80;
+    pFpuCtx->aRegs[6].r80 = pFpuCtx->aRegs[5].r80;
+    pFpuCtx->aRegs[5].r80 = pFpuCtx->aRegs[4].r80;
+    pFpuCtx->aRegs[4].r80 = pFpuCtx->aRegs[3].r80;
+    pFpuCtx->aRegs[3].r80 = pFpuCtx->aRegs[2].r80;
+    pFpuCtx->aRegs[2].r80 = pFpuCtx->aRegs[1].r80;
+    pFpuCtx->aRegs[1].r80 = pFpuCtx->aRegs[0].r80;
+    pFpuCtx->aRegs[0].r80 = r80Tmp;
 }
 
@@ -5308,19 +5310,19 @@
  * Rotates the stack registers in the pop direction.
  *
- * @param   pCtx                The CPU context.
+ * @param   pFpuCtx             The FPU context.
  * @remarks This is a complete waste of time, but fxsave stores the registers in
  *          stack order.
  */
-DECLINLINE(void) iemFpuRotateStackPop(PCPUMCTX pCtx)
-{
-    RTFLOAT80U r80Tmp = pCtx->XState.x87.aRegs[0].r80;
-    pCtx->XState.x87.aRegs[0].r80 = pCtx->XState.x87.aRegs[1].r80;
-    pCtx->XState.x87.aRegs[1].r80 = pCtx->XState.x87.aRegs[2].r80;
-    pCtx->XState.x87.aRegs[2].r80 = pCtx->XState.x87.aRegs[3].r80;
-    pCtx->XState.x87.aRegs[3].r80 = pCtx->XState.x87.aRegs[4].r80;
-    pCtx->XState.x87.aRegs[4].r80 = pCtx->XState.x87.aRegs[5].r80;
-    pCtx->XState.x87.aRegs[5].r80 = pCtx->XState.x87.aRegs[6].r80;
-    pCtx->XState.x87.aRegs[6].r80 = pCtx->XState.x87.aRegs[7].r80;
-    pCtx->XState.x87.aRegs[7].r80 = r80Tmp;
+DECLINLINE(void) iemFpuRotateStackPop(PX86FXSTATE pFpuCtx)
+{
+    RTFLOAT80U r80Tmp = pFpuCtx->aRegs[0].r80;
+    pFpuCtx->aRegs[0].r80 = pFpuCtx->aRegs[1].r80;
+    pFpuCtx->aRegs[1].r80 = pFpuCtx->aRegs[2].r80;
+    pFpuCtx->aRegs[2].r80 = pFpuCtx->aRegs[3].r80;
+    pFpuCtx->aRegs[3].r80 = pFpuCtx->aRegs[4].r80;
+    pFpuCtx->aRegs[4].r80 = pFpuCtx->aRegs[5].r80;
+    pFpuCtx->aRegs[5].r80 = pFpuCtx->aRegs[6].r80;
+    pFpuCtx->aRegs[6].r80 = pFpuCtx->aRegs[7].r80;
+    pFpuCtx->aRegs[7].r80 = r80Tmp;
 }
 
@@ -5332,36 +5334,36 @@
  * @param   pIemCpu             The IEM per CPU data.
  * @param   pResult             The FPU operation result to push.
- * @param   pCtx                The CPU context.
- */
-static void iemFpuMaybePushResult(PIEMCPU pIemCpu, PIEMFPURESULT pResult, PCPUMCTX pCtx)
+ * @param   pFpuCtx             The FPU context.
+ */
+static void iemFpuMaybePushResult(PIEMCPU pIemCpu, PIEMFPURESULT pResult, PX86FXSTATE pFpuCtx)
 {
     /* Update FSW and bail if there are pending exceptions afterwards. */
-    uint16_t fFsw = pCtx->XState.x87.FSW & ~X86_FSW_C_MASK;
+    uint16_t fFsw = pFpuCtx->FSW & ~X86_FSW_C_MASK;
     fFsw |= pResult->FSW & ~X86_FSW_TOP_MASK;
-    if (   (fFsw                 & (X86_FSW_IE | X86_FSW_ZE | X86_FSW_DE))
-        & ~(pCtx->XState.x87.FCW & (X86_FCW_IM | X86_FCW_ZM | X86_FCW_DM)))
-    {
-        pCtx->XState.x87.FSW = fFsw;
+    if (   (fFsw             & (X86_FSW_IE | X86_FSW_ZE | X86_FSW_DE))
+        & ~(pFpuCtx->FCW & (X86_FCW_IM | X86_FCW_ZM | X86_FCW_DM)))
+    {
+        pFpuCtx->FSW = fFsw;
         return;
     }
 
     uint16_t iNewTop = (X86_FSW_TOP_GET(fFsw) + 7) & X86_FSW_TOP_SMASK;
-    if (!(pCtx->XState.x87.FTW & RT_BIT(iNewTop)))
+    if (!(pFpuCtx->FTW & RT_BIT(iNewTop)))
     {
         /* All is fine, push the actual value. */
-        pCtx->XState.x87.FTW |= RT_BIT(iNewTop);
-        pCtx->XState.x87.aRegs[7].r80 = pResult->r80Result;
-    }
-    else if (pCtx->XState.x87.FCW & X86_FCW_IM)
+        pFpuCtx->FTW |= RT_BIT(iNewTop);
+        pFpuCtx->aRegs[7].r80 = pResult->r80Result;
+    }
+    else if (pFpuCtx->FCW & X86_FCW_IM)
     {
         /* Masked stack overflow, push QNaN. */
         fFsw |= X86_FSW_IE | X86_FSW_SF | X86_FSW_C1;
-        iemFpuStoreQNan(&pCtx->XState.x87.aRegs[7].r80);
+        iemFpuStoreQNan(&pFpuCtx->aRegs[7].r80);
     }
     else
     {
         /* Raise stack overflow, don't push anything. */
-        pCtx->XState.x87.FSW |= pResult->FSW & ~X86_FSW_C_MASK;
-        pCtx->XState.x87.FSW |= X86_FSW_IE | X86_FSW_SF | X86_FSW_C1 | X86_FSW_B | X86_FSW_ES;
+        pFpuCtx->FSW |= pResult->FSW & ~X86_FSW_C_MASK;
+        pFpuCtx->FSW |= X86_FSW_IE | X86_FSW_SF | X86_FSW_C1 | X86_FSW_B | X86_FSW_ES;
         return;
     }
@@ -5369,7 +5371,7 @@
     fFsw &= ~X86_FSW_TOP_MASK;
     fFsw |= iNewTop << X86_FSW_TOP_SHIFT;
-    pCtx->XState.x87.FSW = fFsw;
-
-    iemFpuRotateStackPush(pCtx);
+    pFpuCtx->FSW = fFsw;
+
+    iemFpuRotateStackPush(pFpuCtx);
 }
 
@@ -5377,4 +5379,153 @@
 /**
  * Stores a result in a FPU register and updates the FSW and FTW.
+ *
+ * @param   pFpuCtx             The FPU context.
+ * @param   pResult             The result to store.
+ * @param   iStReg              Which FPU register to store it in.
+ */
+static void iemFpuStoreResultOnly(PX86FXSTATE pFpuCtx, PIEMFPURESULT pResult, uint8_t iStReg)
+{
+    Assert(iStReg < 8);
+    uint16_t iReg = (X86_FSW_TOP_GET(pFpuCtx->FSW) + iStReg) & X86_FSW_TOP_SMASK;
+    pFpuCtx->FSW &= ~X86_FSW_C_MASK;
+    pFpuCtx->FSW |= pResult->FSW & ~X86_FSW_TOP_MASK;
+    pFpuCtx->FTW |= RT_BIT(iReg);
+    pFpuCtx->aRegs[iStReg].r80 = pResult->r80Result;
+}
+
+
+/**
+ * Only updates the FPU status word (FSW) with the result of the current
+ * instruction.
+ *
+ * @param   pFpuCtx             The FPU context.
+ * @param   u16FSW              The FSW output of the current instruction.
+ */
+static void iemFpuUpdateFSWOnly(PX86FXSTATE pFpuCtx, uint16_t u16FSW)
+{
+    pFpuCtx->FSW &= ~X86_FSW_C_MASK;
+    pFpuCtx->FSW |= u16FSW & ~X86_FSW_TOP_MASK;
+}
+
+
+/**
+ * Pops one item off the FPU stack if no pending exception prevents it.
+ *
+ * @param   pFpuCtx             The FPU context.
+ */
+static void iemFpuMaybePopOne(PX86FXSTATE pFpuCtx)
+{
+    /* Check pending exceptions. */
+    uint16_t uFSW = pFpuCtx->FSW;
+    if (   (pFpuCtx->FSW & (X86_FSW_IE | X86_FSW_ZE | X86_FSW_DE))
+        & ~(pFpuCtx->FCW & (X86_FCW_IM | X86_FCW_ZM | X86_FCW_DM)))
+        return;
+
+    /* TOP--. */
+    uint16_t iOldTop = uFSW & X86_FSW_TOP_MASK;
+    uFSW &= ~X86_FSW_TOP_MASK;
+    uFSW |= (iOldTop + (UINT16_C(9) << X86_FSW_TOP_SHIFT)) & X86_FSW_TOP_MASK;
+    pFpuCtx->FSW = uFSW;
+
+    /* Mark the previous ST0 as empty. */
+    iOldTop >>= X86_FSW_TOP_SHIFT;
+    pFpuCtx->FTW &= ~RT_BIT(iOldTop);
+
+    /* Rotate the registers. */
+    iemFpuRotateStackPop(pFpuCtx);
+}
+
+
+/**
+ * Pushes a FPU result onto the FPU stack if no pending exception prevents it.
+ *
+ * @param   pIemCpu             The IEM per CPU data.
+ * @param   pResult             The FPU operation result to push.
+ */
+static void iemFpuPushResult(PIEMCPU pIemCpu, PIEMFPURESULT pResult)
+{
+    PCPUMCTX    pCtx    = pIemCpu->CTX_SUFF(pCtx);
+    PX86FXSTATE pFpuCtx = &pCtx->CTX_SUFF(pXState)->x87;
+    iemFpuUpdateOpcodeAndIpWorker(pIemCpu, pCtx, pFpuCtx);
+    iemFpuMaybePushResult(pIemCpu, pResult, pFpuCtx);
+}
+
+
+/**
+ * Pushes a FPU result onto the FPU stack if no pending exception prevents it,
+ * and sets FPUDP and FPUDS.
+ *
+ * @param   pIemCpu             The IEM per CPU data.
+ * @param   pResult             The FPU operation result to push.
+ * @param   iEffSeg             The effective segment register.
+ * @param   GCPtrEff            The effective address relative to @a iEffSeg.
+ */
+static void iemFpuPushResultWithMemOp(PIEMCPU pIemCpu, PIEMFPURESULT pResult, uint8_t iEffSeg, RTGCPTR GCPtrEff)
+{
+    PCPUMCTX    pCtx    = pIemCpu->CTX_SUFF(pCtx);
+    PX86FXSTATE pFpuCtx = &pCtx->CTX_SUFF(pXState)->x87;
+    iemFpuUpdateDP(pIemCpu, pCtx, pFpuCtx, iEffSeg, GCPtrEff);
+    iemFpuUpdateOpcodeAndIpWorker(pIemCpu, pCtx, pFpuCtx);
+    iemFpuMaybePushResult(pIemCpu, pResult, pFpuCtx);
+}
+
+
+/**
+ * Replace ST0 with the first value and push the second onto the FPU stack,
+ * unless a pending exception prevents it.
+ *
+ * @param   pIemCpu             The IEM per CPU data.
+ * @param   pResult             The FPU operation result to store and push.
+ */
+static void iemFpuPushResultTwo(PIEMCPU pIemCpu, PIEMFPURESULTTWO pResult)
+{
+    PCPUMCTX    pCtx    = pIemCpu->CTX_SUFF(pCtx);
+    PX86FXSTATE pFpuCtx = &pCtx->CTX_SUFF(pXState)->x87;
+    iemFpuUpdateOpcodeAndIpWorker(pIemCpu, pCtx, pFpuCtx);
+
+    /* Update FSW and bail if there are pending exceptions afterwards. */
+    uint16_t fFsw = pFpuCtx->FSW & ~X86_FSW_C_MASK;
+    fFsw |= pResult->FSW & ~X86_FSW_TOP_MASK;
+    if (   (fFsw             & (X86_FSW_IE | X86_FSW_ZE | X86_FSW_DE))
+        & ~(pFpuCtx->FCW & (X86_FCW_IM | X86_FCW_ZM | X86_FCW_DM)))
+    {
+        pFpuCtx->FSW = fFsw;
+        return;
+    }
+
+    uint16_t iNewTop = (X86_FSW_TOP_GET(fFsw) + 7) & X86_FSW_TOP_SMASK;
+    if (!(pFpuCtx->FTW & RT_BIT(iNewTop)))
+    {
+        /* All is fine, push the actual value. */
+        pFpuCtx->FTW |= RT_BIT(iNewTop);
+        pFpuCtx->aRegs[0].r80 = pResult->r80Result1;
+        pFpuCtx->aRegs[7].r80 = pResult->r80Result2;
+    }
+    else if (pFpuCtx->FCW & X86_FCW_IM)
+    {
+        /* Masked stack overflow, push QNaN. */
+        fFsw |= X86_FSW_IE | X86_FSW_SF | X86_FSW_C1;
+        iemFpuStoreQNan(&pFpuCtx->aRegs[0].r80);
+        iemFpuStoreQNan(&pFpuCtx->aRegs[7].r80);
+    }
+    else
+    {
+        /* Raise stack overflow, don't push anything. */
+        pFpuCtx->FSW |= pResult->FSW & ~X86_FSW_C_MASK;
+        pFpuCtx->FSW |= X86_FSW_IE | X86_FSW_SF | X86_FSW_C1 | X86_FSW_B | X86_FSW_ES;
+        return;
+    }
+
+    fFsw &= ~X86_FSW_TOP_MASK;
+    fFsw |= iNewTop << X86_FSW_TOP_SHIFT;
+    pFpuCtx->FSW = fFsw;
+
+    iemFpuRotateStackPush(pFpuCtx);
+}
+
+
+/**
+ * Stores a result in a FPU register, updates the FSW, FTW, FPUIP, FPUCS, and
+ * FOP.
  *
  * @param   pIemCpu             The IEM per CPU data.
@@ -5383,139 +5534,10 @@
  * @param   pCtx                The CPU context.
  */
-static void iemFpuStoreResultOnly(PIEMCPU pIemCpu, PIEMFPURESULT pResult, uint8_t iStReg, PCPUMCTX pCtx)
-{
-    Assert(iStReg < 8);
-    uint16_t  iReg = (X86_FSW_TOP_GET(pCtx->XState.x87.FSW) + iStReg) & X86_FSW_TOP_SMASK;
-    pCtx->XState.x87.FSW &= ~X86_FSW_C_MASK;
-    pCtx->XState.x87.FSW |= pResult->FSW & ~X86_FSW_TOP_MASK;
-    pCtx->XState.x87.FTW |= RT_BIT(iReg);
-    pCtx->XState.x87.aRegs[iStReg].r80 = pResult->r80Result;
-}
-
-
-/**
- * Only updates the FPU status word (FSW) with the result of the current
- * instruction.
- *
- * @param   pCtx                The CPU context.
- * @param   u16FSW              The FSW output of the current instruction.
- */
-static void iemFpuUpdateFSWOnly(PCPUMCTX pCtx, uint16_t u16FSW)
-{
-    pCtx->XState.x87.FSW &= ~X86_FSW_C_MASK;
-    pCtx->XState.x87.FSW |= u16FSW & ~X86_FSW_TOP_MASK;
-}
-
-
-/**
- * Pops one item off the FPU stack if no pending exception prevents it.
- *
- * @param   pCtx                The CPU context.
- */
-static void iemFpuMaybePopOne(PCPUMCTX pCtx)
-{
-    /* Check pending exceptions. */
-    uint16_t uFSW = pCtx->XState.x87.FSW;
-    if (   (pCtx->XState.x87.FSW & (X86_FSW_IE | X86_FSW_ZE | X86_FSW_DE))
-        & ~(pCtx->XState.x87.FCW & (X86_FCW_IM | X86_FCW_ZM | X86_FCW_DM)))
-        return;
-
-    /* TOP--. */
-    uint16_t iOldTop = uFSW & X86_FSW_TOP_MASK;
-    uFSW &= ~X86_FSW_TOP_MASK;
-    uFSW |= (iOldTop + (UINT16_C(9) << X86_FSW_TOP_SHIFT)) & X86_FSW_TOP_MASK;
-    pCtx->XState.x87.FSW = uFSW;
-
-    /* Mark the previous ST0 as empty. */
-    iOldTop >>= X86_FSW_TOP_SHIFT;
-    pCtx->XState.x87.FTW &= ~RT_BIT(iOldTop);
-
-    /* Rotate the registers. */
-    iemFpuRotateStackPop(pCtx);
-}
-
-
-/**
- * Pushes a FPU result onto the FPU stack if no pending exception prevents it.
- *
- * @param   pIemCpu             The IEM per CPU data.
- * @param   pResult             The FPU operation result to push.
- */
-static void iemFpuPushResult(PIEMCPU pIemCpu, PIEMFPURESULT pResult)
-{
-    PCPUMCTX pCtx = pIemCpu->CTX_SUFF(pCtx);
-    iemFpuUpdateOpcodeAndIpWorker(pIemCpu, pCtx);
-    iemFpuMaybePushResult(pIemCpu, pResult, pCtx);
-}
-
-
-/**
- * Pushes a FPU result onto the FPU stack if no pending exception prevents it,
- * and sets FPUDP and FPUDS.
- *
- * @param   pIemCpu             The IEM per CPU data.
- * @param   pResult             The FPU operation result to push.
- * @param   iEffSeg             The effective segment register.
- * @param   GCPtrEff            The effective address relative to @a iEffSeg.
- */
-static void iemFpuPushResultWithMemOp(PIEMCPU pIemCpu, PIEMFPURESULT pResult, uint8_t iEffSeg, RTGCPTR GCPtrEff)
-{
-    PCPUMCTX pCtx = pIemCpu->CTX_SUFF(pCtx);
-    iemFpuUpdateDP(pIemCpu, pCtx, iEffSeg, GCPtrEff);
-    iemFpuUpdateOpcodeAndIpWorker(pIemCpu, pCtx);
-    iemFpuMaybePushResult(pIemCpu, pResult, pCtx);
-}
-
-
-/**
- * Replace ST0 with the first value and push the second onto the FPU stack,
- * unless a pending exception prevents it.
- *
- * @param   pIemCpu             The IEM per CPU data.
- * @param   pResult             The FPU operation result to store and push.
- */
-static void iemFpuPushResultTwo(PIEMCPU pIemCpu, PIEMFPURESULTTWO pResult)
-{
-    PCPUMCTX pCtx = pIemCpu->CTX_SUFF(pCtx);
-    iemFpuUpdateOpcodeAndIpWorker(pIemCpu, pCtx);
-
-    /* Update FSW and bail if there are pending exceptions afterwards. */
-    uint16_t fFsw = pCtx->XState.x87.FSW & ~X86_FSW_C_MASK;
-    fFsw |= pResult->FSW & ~X86_FSW_TOP_MASK;
-    if (   (fFsw          & (X86_FSW_IE | X86_FSW_ZE | X86_FSW_DE))
-        & ~(pCtx->XState.x87.FCW & (X86_FCW_IM | X86_FCW_ZM | X86_FCW_DM)))
-    {
-        pCtx->XState.x87.FSW = fFsw;
-        return;
-    }
-
-    uint16_t iNewTop = (X86_FSW_TOP_GET(fFsw) + 7) & X86_FSW_TOP_SMASK;
-    if (!(pCtx->XState.x87.FTW & RT_BIT(iNewTop)))
-    {
-        /* All is fine, push the actual value. */
-        pCtx->XState.x87.FTW |= RT_BIT(iNewTop);
-        pCtx->XState.x87.aRegs[0].r80 = pResult->r80Result1;
-        pCtx->XState.x87.aRegs[7].r80 = pResult->r80Result2;
-    }
-    else if (pCtx->XState.x87.FCW & X86_FCW_IM)
-    {
-        /* Masked stack overflow, push QNaN. */
-        fFsw |= X86_FSW_IE | X86_FSW_SF | X86_FSW_C1;
-        iemFpuStoreQNan(&pCtx->XState.x87.aRegs[0].r80);
-        iemFpuStoreQNan(&pCtx->XState.x87.aRegs[7].r80);
-    }
-    else
-    {
-        /* Raise stack overflow, don't push anything. */
-        pCtx->XState.x87.FSW |= pResult->FSW & ~X86_FSW_C_MASK;
-        pCtx->XState.x87.FSW |= X86_FSW_IE | X86_FSW_SF | X86_FSW_C1 | X86_FSW_B | X86_FSW_ES;
-        return;
-    }
-
-    fFsw &= ~X86_FSW_TOP_MASK;
-    fFsw |= iNewTop << X86_FSW_TOP_SHIFT;
-    pCtx->XState.x87.FSW = fFsw;
-
-    iemFpuRotateStackPush(pCtx);
+static void iemFpuStoreResult(PIEMCPU pIemCpu, PIEMFPURESULT pResult, uint8_t iStReg)
+{
+    PCPUMCTX    pCtx    = pIemCpu->CTX_SUFF(pCtx);
+    PX86FXSTATE pFpuCtx = &pCtx->CTX_SUFF(pXState)->x87;
+    iemFpuUpdateOpcodeAndIpWorker(pIemCpu, pCtx, pFpuCtx);
+    iemFpuStoreResultOnly(pFpuCtx, pResult, iStReg);
 }
 
@@ -5523,5 +5545,5 @@
 /**
  * Stores a result in a FPU register, updates the FSW, FTW, FPUIP, FPUCS, and
- * FOP.
+ * FOP, and then pops the stack.
  *
  * @param   pIemCpu             The IEM per CPU data.
@@ -5530,27 +5552,11 @@
  * @param   pCtx                The CPU context.
  */
-static void iemFpuStoreResult(PIEMCPU pIemCpu, PIEMFPURESULT pResult, uint8_t iStReg)
-{
-    PCPUMCTX pCtx = pIemCpu->CTX_SUFF(pCtx);
-    iemFpuUpdateOpcodeAndIpWorker(pIemCpu, pCtx);
-    iemFpuStoreResultOnly(pIemCpu, pResult, iStReg, pCtx);
-}
-
-
-/**
- * Stores a result in a FPU register, updates the FSW, FTW, FPUIP, FPUCS, and
- * FOP, and then pops the stack.
- *
- * @param   pIemCpu             The IEM per CPU data.
- * @param   pResult             The result to store.
- * @param   iStReg              Which FPU register to store it in.
- * @param   pCtx                The CPU context.
- */
 static void iemFpuStoreResultThenPop(PIEMCPU pIemCpu, PIEMFPURESULT pResult, uint8_t iStReg)
 {
-    PCPUMCTX pCtx = pIemCpu->CTX_SUFF(pCtx);
-    iemFpuUpdateOpcodeAndIpWorker(pIemCpu, pCtx);
-    iemFpuStoreResultOnly(pIemCpu, pResult, iStReg, pCtx);
-    iemFpuMaybePopOne(pCtx);
+    PCPUMCTX    pCtx    = pIemCpu->CTX_SUFF(pCtx);
+    PX86FXSTATE pFpuCtx = &pCtx->CTX_SUFF(pXState)->x87;
+    iemFpuUpdateOpcodeAndIpWorker(pIemCpu, pCtx, pFpuCtx);
+    iemFpuStoreResultOnly(pFpuCtx, pResult, iStReg);
+    iemFpuMaybePopOne(pFpuCtx);
 }
 
@@ -5569,8 +5575,9 @@
 static void iemFpuStoreResultWithMemOp(PIEMCPU pIemCpu, PIEMFPURESULT pResult, uint8_t iStReg, uint8_t iEffSeg, RTGCPTR GCPtrEff)
 {
-    PCPUMCTX pCtx = pIemCpu->CTX_SUFF(pCtx);
-    iemFpuUpdateDP(pIemCpu, pIemCpu->CTX_SUFF(pCtx), iEffSeg, GCPtrEff);
-    iemFpuUpdateOpcodeAndIpWorker(pIemCpu, pCtx);
-    iemFpuStoreResultOnly(pIemCpu, pResult, iStReg, pCtx);
+    PCPUMCTX    pCtx    = pIemCpu->CTX_SUFF(pCtx);
+    PX86FXSTATE pFpuCtx = &pCtx->CTX_SUFF(pXState)->x87;
+    iemFpuUpdateDP(pIemCpu, pCtx, pFpuCtx, iEffSeg, GCPtrEff);
+    iemFpuUpdateOpcodeAndIpWorker(pIemCpu, pCtx, pFpuCtx);
+    iemFpuStoreResultOnly(pFpuCtx, pResult, iStReg);
 }
 
@@ -5590,9 +5597,10 @@
                                               uint8_t iStReg, uint8_t iEffSeg, RTGCPTR GCPtrEff)
 {
-    PCPUMCTX pCtx = pIemCpu->CTX_SUFF(pCtx);
-    iemFpuUpdateDP(pIemCpu, pCtx, iEffSeg, GCPtrEff);
-    iemFpuUpdateOpcodeAndIpWorker(pIemCpu, pCtx);
-    iemFpuStoreResultOnly(pIemCpu, pResult, iStReg, pCtx);
-    iemFpuMaybePopOne(pCtx);
+    PCPUMCTX    pCtx    = pIemCpu->CTX_SUFF(pCtx);
+    PX86FXSTATE pFpuCtx = &pCtx->CTX_SUFF(pXState)->x87;
+    iemFpuUpdateDP(pIemCpu, pCtx, pFpuCtx, iEffSeg, GCPtrEff);
+    iemFpuUpdateOpcodeAndIpWorker(pIemCpu, pCtx, pFpuCtx);
+    iemFpuStoreResultOnly(pFpuCtx, pResult, iStReg);
+    iemFpuMaybePopOne(pFpuCtx);
 }
 
@@ -5605,5 +5613,7 @@
 static void iemFpuUpdateOpcodeAndIp(PIEMCPU pIemCpu)
 {
-    iemFpuUpdateOpcodeAndIpWorker(pIemCpu, pIemCpu->CTX_SUFF(pCtx));
+    PCPUMCTX    pCtx    = pIemCpu->CTX_SUFF(pCtx);
+    PX86FXSTATE pFpuCtx = &pCtx->CTX_SUFF(pXState)->x87;
+    iemFpuUpdateOpcodeAndIpWorker(pIemCpu, pCtx, pFpuCtx);
 }
 
@@ -5618,7 +5628,7 @@
 {
     Assert(iStReg < 8);
-    PCPUMCTX pCtx = pIemCpu->CTX_SUFF(pCtx);
-    uint8_t iReg = (X86_FSW_TOP_GET(pCtx->XState.x87.FSW) + iStReg) & X86_FSW_TOP_SMASK;
-    pCtx->XState.x87.FTW &= ~RT_BIT(iReg);
+    PX86FXSTATE pFpuCtx = &pIemCpu->CTX_SUFF(pCtx)->CTX_SUFF(pXState)->x87;
+    uint8_t     iReg    = (X86_FSW_TOP_GET(pFpuCtx->FSW) + iStReg) & X86_FSW_TOP_SMASK;
+    pFpuCtx->FTW &= ~RT_BIT(iReg);
 }
 
@@ -5631,11 +5641,11 @@
 static void iemFpuStackIncTop(PIEMCPU pIemCpu)
 {
-    PCPUMCTX pCtx = pIemCpu->CTX_SUFF(pCtx);
-    uint16_t uFsw = pCtx->XState.x87.FSW;
-    uint16_t uTop = uFsw & X86_FSW_TOP_MASK;
+    PX86FXSTATE pFpuCtx = &pIemCpu->CTX_SUFF(pCtx)->CTX_SUFF(pXState)->x87;
+    uint16_t    uFsw    = pFpuCtx->FSW;
+    uint16_t    uTop    = uFsw & X86_FSW_TOP_MASK;
     uTop  = (uTop + (1 << X86_FSW_TOP_SHIFT)) & X86_FSW_TOP_MASK;
     uFsw &= ~X86_FSW_TOP_MASK;
     uFsw |= uTop;
-    pCtx->XState.x87.FSW = uFsw;
+    pFpuCtx->FSW = uFsw;
 }
 
@@ -5648,11 +5658,11 @@
 static void iemFpuStackDecTop(PIEMCPU pIemCpu)
 {
-    PCPUMCTX pCtx = pIemCpu->CTX_SUFF(pCtx);
-    uint16_t uFsw = pCtx->XState.x87.FSW;
-    uint16_t uTop = uFsw & X86_FSW_TOP_MASK;
+    PX86FXSTATE pFpuCtx = &pIemCpu->CTX_SUFF(pCtx)->CTX_SUFF(pXState)->x87;
+    uint16_t    uFsw    = pFpuCtx->FSW;
+    uint16_t    uTop    = uFsw & X86_FSW_TOP_MASK;
     uTop  = (uTop + (7 << X86_FSW_TOP_SHIFT)) & X86_FSW_TOP_MASK;
     uFsw &= ~X86_FSW_TOP_MASK;
     uFsw |= uTop;
-    pCtx->XState.x87.FSW = uFsw;
+    pFpuCtx->FSW = uFsw;
 }
 
@@ -5666,7 +5676,8 @@
 static void iemFpuUpdateFSW(PIEMCPU pIemCpu, uint16_t u16FSW)
 {
-    PCPUMCTX pCtx = pIemCpu->CTX_SUFF(pCtx);
-    iemFpuUpdateOpcodeAndIpWorker(pIemCpu, pCtx);
-    iemFpuUpdateFSWOnly(pCtx, u16FSW);
+    PCPUMCTX    pCtx    = pIemCpu->CTX_SUFF(pCtx);
+    PX86FXSTATE pFpuCtx = &pCtx->CTX_SUFF(pXState)->x87;
+    iemFpuUpdateOpcodeAndIpWorker(pIemCpu, pCtx, pFpuCtx);
+    iemFpuUpdateFSWOnly(pFpuCtx, u16FSW);
 }
 
@@ -5680,8 +5691,9 @@
 static void iemFpuUpdateFSWThenPop(PIEMCPU pIemCpu, uint16_t u16FSW)
 {
-    PCPUMCTX pCtx = pIemCpu->CTX_SUFF(pCtx);
-    iemFpuUpdateOpcodeAndIpWorker(pIemCpu, pCtx);
-    iemFpuUpdateFSWOnly(pCtx, u16FSW);
-    iemFpuMaybePopOne(pCtx);
+    PCPUMCTX    pCtx    = pIemCpu->CTX_SUFF(pCtx);
+    PX86FXSTATE pFpuCtx = &pCtx->CTX_SUFF(pXState)->x87;
+    iemFpuUpdateOpcodeAndIpWorker(pIemCpu, pCtx, pFpuCtx);
+    iemFpuUpdateFSWOnly(pFpuCtx, u16FSW);
+    iemFpuMaybePopOne(pFpuCtx);
 }
 
@@ -5697,8 +5709,9 @@
 static void iemFpuUpdateFSWWithMemOp(PIEMCPU pIemCpu, uint16_t u16FSW, uint8_t iEffSeg, RTGCPTR GCPtrEff)
 {
-    PCPUMCTX pCtx = pIemCpu->CTX_SUFF(pCtx);
-    iemFpuUpdateDP(pIemCpu, pCtx, iEffSeg, GCPtrEff);
-    iemFpuUpdateOpcodeAndIpWorker(pIemCpu, pCtx);
-    iemFpuUpdateFSWOnly(pCtx, u16FSW);
+    PCPUMCTX    pCtx    = pIemCpu->CTX_SUFF(pCtx);
+    PX86FXSTATE pFpuCtx = &pCtx->CTX_SUFF(pXState)->x87;
+    iemFpuUpdateDP(pIemCpu, pCtx, pFpuCtx, iEffSeg, GCPtrEff);
+    iemFpuUpdateOpcodeAndIpWorker(pIemCpu, pCtx, pFpuCtx);
+    iemFpuUpdateFSWOnly(pFpuCtx, u16FSW);
 }
 
@@ -5712,9 +5725,10 @@
 static void iemFpuUpdateFSWThenPopPop(PIEMCPU pIemCpu, uint16_t u16FSW)
 {
-    PCPUMCTX pCtx = pIemCpu->CTX_SUFF(pCtx);
-    iemFpuUpdateOpcodeAndIpWorker(pIemCpu, pCtx);
-    iemFpuUpdateFSWOnly(pCtx, u16FSW);
-    iemFpuMaybePopOne(pCtx);
-    iemFpuMaybePopOne(pCtx);
+    PCPUMCTX    pCtx    = pIemCpu->CTX_SUFF(pCtx);
+    PX86FXSTATE pFpuCtx = &pCtx->CTX_SUFF(pXState)->x87;
+    iemFpuUpdateOpcodeAndIpWorker(pIemCpu, pCtx, pFpuCtx);
+    iemFpuUpdateFSWOnly(pFpuCtx, u16FSW);
+    iemFpuMaybePopOne(pFpuCtx);
+    iemFpuMaybePopOne(pFpuCtx);
 }
 
@@ -5730,9 +5744,10 @@
 static void iemFpuUpdateFSWWithMemOpThenPop(PIEMCPU pIemCpu, uint16_t u16FSW, uint8_t iEffSeg, RTGCPTR GCPtrEff)
 {
-    PCPUMCTX pCtx = pIemCpu->CTX_SUFF(pCtx);
-    iemFpuUpdateDP(pIemCpu, pCtx, iEffSeg, GCPtrEff);
-    iemFpuUpdateOpcodeAndIpWorker(pIemCpu, pCtx);
-    iemFpuUpdateFSWOnly(pCtx, u16FSW);
-    iemFpuMaybePopOne(pCtx);
+    PCPUMCTX    pCtx    = pIemCpu->CTX_SUFF(pCtx);
+    PX86FXSTATE pFpuCtx = &pCtx->CTX_SUFF(pXState)->x87;
+    iemFpuUpdateDP(pIemCpu, pCtx, pFpuCtx, iEffSeg, GCPtrEff);
+    iemFpuUpdateOpcodeAndIpWorker(pIemCpu, pCtx, pFpuCtx);
+    iemFpuUpdateFSWOnly(pFpuCtx, u16FSW);
+    iemFpuMaybePopOne(pFpuCtx);
 }
 
@@ -5742,26 +5757,26 @@
  *
  * @param   pIemCpu             The IEM per CPU data.
+ * @param   pFpuCtx             The FPU context.
  * @param   iStReg              The stack register being accessed.
- * @param   pCtx                The CPU context.
- */
-static void iemFpuStackUnderflowOnly(PIEMCPU pIemCpu, uint8_t iStReg, PCPUMCTX pCtx)
+ */
+static void iemFpuStackUnderflowOnly(PIEMCPU pIemCpu, PX86FXSTATE pFpuCtx, uint8_t iStReg)
 {
     Assert(iStReg < 8 || iStReg == UINT8_MAX);
-    if (pCtx->XState.x87.FCW & X86_FCW_IM)
+    if (pFpuCtx->FCW & X86_FCW_IM)
     {
         /* Masked underflow. */
-        pCtx->XState.x87.FSW &= ~X86_FSW_C_MASK;
-        pCtx->XState.x87.FSW |= X86_FSW_IE | X86_FSW_SF;
-        uint16_t iReg = (X86_FSW_TOP_GET(pCtx->XState.x87.FSW) + iStReg) & X86_FSW_TOP_SMASK;
+        pFpuCtx->FSW &= ~X86_FSW_C_MASK;
+        pFpuCtx->FSW |= X86_FSW_IE | X86_FSW_SF;
+        uint16_t iReg = (X86_FSW_TOP_GET(pFpuCtx->FSW) + iStReg) & X86_FSW_TOP_SMASK;
         if (iStReg != UINT8_MAX)
         {
-            pCtx->XState.x87.FTW |= RT_BIT(iReg);
-            iemFpuStoreQNan(&pCtx->XState.x87.aRegs[iStReg].r80);
+            pFpuCtx->FTW |= RT_BIT(iReg);
+            iemFpuStoreQNan(&pFpuCtx->aRegs[iStReg].r80);
         }
     }
     else
     {
-        pCtx->XState.x87.FSW &= ~X86_FSW_C_MASK;
-        pCtx->XState.x87.FSW |= X86_FSW_IE | X86_FSW_SF | X86_FSW_ES | X86_FSW_B;
+        pFpuCtx->FSW &= ~X86_FSW_C_MASK;
+        pFpuCtx->FSW |= X86_FSW_IE | X86_FSW_SF | X86_FSW_ES | X86_FSW_B;
     }
 }
@@ -5778,7 +5793,8 @@
 DECL_NO_INLINE(static, void) iemFpuStackUnderflow(PIEMCPU pIemCpu, uint8_t iStReg)
 {
-    PCPUMCTX pCtx = pIemCpu->CTX_SUFF(pCtx);
-    iemFpuUpdateOpcodeAndIpWorker(pIemCpu, pCtx);
-    iemFpuStackUnderflowOnly(pIemCpu, iStReg, pCtx);
+    PCPUMCTX    pCtx    = pIemCpu->CTX_SUFF(pCtx);
+    PX86FXSTATE pFpuCtx = &pCtx->CTX_SUFF(pXState)->x87;
+    iemFpuUpdateOpcodeAndIpWorker(pIemCpu, pCtx, pFpuCtx);
+    iemFpuStackUnderflowOnly(pIemCpu, pFpuCtx, iStReg);
 }
 
@@ -5787,8 +5803,9 @@
 iemFpuStackUnderflowWithMemOp(PIEMCPU pIemCpu, uint8_t iStReg, uint8_t iEffSeg, RTGCPTR GCPtrEff)
 {
-    PCPUMCTX pCtx = pIemCpu->CTX_SUFF(pCtx);
-    iemFpuUpdateDP(pIemCpu, pCtx, iEffSeg, GCPtrEff);
-    iemFpuUpdateOpcodeAndIpWorker(pIemCpu, pCtx);
-    iemFpuStackUnderflowOnly(pIemCpu, iStReg, pCtx);
+    PCPUMCTX    pCtx    = pIemCpu->CTX_SUFF(pCtx);
+    PX86FXSTATE pFpuCtx = &pCtx->CTX_SUFF(pXState)->x87;
+    iemFpuUpdateDP(pIemCpu, pCtx, pFpuCtx, iEffSeg, GCPtrEff);
+    iemFpuUpdateOpcodeAndIpWorker(pIemCpu, pCtx, pFpuCtx);
+    iemFpuStackUnderflowOnly(pIemCpu, pFpuCtx, iStReg);
 }
 
@@ -5796,8 +5813,9 @@
 DECL_NO_INLINE(static, void) iemFpuStackUnderflowThenPop(PIEMCPU pIemCpu, uint8_t iStReg)
 {
-    PCPUMCTX pCtx = pIemCpu->CTX_SUFF(pCtx);
-    iemFpuUpdateOpcodeAndIpWorker(pIemCpu, pCtx);
-    iemFpuStackUnderflowOnly(pIemCpu, iStReg, pCtx);
-    iemFpuMaybePopOne(pCtx);
+    PCPUMCTX    pCtx    = pIemCpu->CTX_SUFF(pCtx);
+    PX86FXSTATE pFpuCtx = &pCtx->CTX_SUFF(pXState)->x87;
+    iemFpuUpdateOpcodeAndIpWorker(pIemCpu, pCtx, pFpuCtx);
+    iemFpuStackUnderflowOnly(pIemCpu, pFpuCtx, iStReg);
+    iemFpuMaybePopOne(pFpuCtx);
 }
 
@@ -5806,9 +5824,10 @@
 iemFpuStackUnderflowWithMemOpThenPop(PIEMCPU pIemCpu, uint8_t iStReg, uint8_t iEffSeg, RTGCPTR GCPtrEff)
 {
-    PCPUMCTX pCtx = pIemCpu->CTX_SUFF(pCtx);
-    iemFpuUpdateDP(pIemCpu, pCtx, iEffSeg, GCPtrEff);
-    iemFpuUpdateOpcodeAndIpWorker(pIemCpu, pCtx);
-    iemFpuStackUnderflowOnly(pIemCpu, iStReg, pCtx);
-    iemFpuMaybePopOne(pCtx);
+    PCPUMCTX    pCtx    = pIemCpu->CTX_SUFF(pCtx);
+    PX86FXSTATE pFpuCtx = &pCtx->CTX_SUFF(pXState)->x87;
+    iemFpuUpdateDP(pIemCpu, pCtx, pFpuCtx, iEffSeg, GCPtrEff);
+    iemFpuUpdateOpcodeAndIpWorker(pIemCpu, pCtx, pFpuCtx);
+    iemFpuStackUnderflowOnly(pIemCpu, pFpuCtx, iStReg);
+    iemFpuMaybePopOne(pFpuCtx);
 }
 
@@ -5816,9 +5835,10 @@
 DECL_NO_INLINE(static, void) iemFpuStackUnderflowThenPopPop(PIEMCPU pIemCpu)
 {
-    PCPUMCTX pCtx = pIemCpu->CTX_SUFF(pCtx);
-    iemFpuUpdateOpcodeAndIpWorker(pIemCpu, pCtx);
-    iemFpuStackUnderflowOnly(pIemCpu, UINT8_MAX, pCtx);
-    iemFpuMaybePopOne(pCtx);
-    iemFpuMaybePopOne(pCtx);
+    PCPUMCTX    pCtx    = pIemCpu->CTX_SUFF(pCtx);
+    PX86FXSTATE pFpuCtx = &pCtx->CTX_SUFF(pXState)->x87;
+    iemFpuUpdateOpcodeAndIpWorker(pIemCpu, pCtx, pFpuCtx);
+    iemFpuStackUnderflowOnly(pIemCpu, pFpuCtx, UINT8_MAX);
+    iemFpuMaybePopOne(pFpuCtx);
+    iemFpuMaybePopOne(pFpuCtx);
 }
 
@@ -5827,23 +5847,24 @@
 iemFpuStackPushUnderflow(PIEMCPU pIemCpu)
 {
-    PCPUMCTX pCtx = pIemCpu->CTX_SUFF(pCtx);
-    iemFpuUpdateOpcodeAndIpWorker(pIemCpu, pCtx);
-
-    if (pCtx->XState.x87.FCW & X86_FCW_IM)
+    PCPUMCTX    pCtx    = pIemCpu->CTX_SUFF(pCtx);
+    PX86FXSTATE pFpuCtx = &pCtx->CTX_SUFF(pXState)->x87;
+    iemFpuUpdateOpcodeAndIpWorker(pIemCpu, pCtx, pFpuCtx);
+
+    if (pFpuCtx->FCW & X86_FCW_IM)
     {
         /* Masked overflow - Push QNaN. */
-        uint16_t iNewTop = (X86_FSW_TOP_GET(pCtx->XState.x87.FSW) + 7) & X86_FSW_TOP_SMASK;
-        pCtx->XState.x87.FSW &= ~(X86_FSW_TOP_MASK | X86_FSW_C_MASK);
-        pCtx->XState.x87.FSW |= X86_FSW_IE | X86_FSW_SF;
-        pCtx->XState.x87.FSW |= iNewTop << X86_FSW_TOP_SHIFT;
-        pCtx->XState.x87.FTW |= RT_BIT(iNewTop);
-        iemFpuStoreQNan(&pCtx->XState.x87.aRegs[7].r80);
-        iemFpuRotateStackPush(pCtx);
+        uint16_t iNewTop = (X86_FSW_TOP_GET(pFpuCtx->FSW) + 7) & X86_FSW_TOP_SMASK;
+        pFpuCtx->FSW &= ~(X86_FSW_TOP_MASK | X86_FSW_C_MASK);
+        pFpuCtx->FSW |= X86_FSW_IE | X86_FSW_SF;
+        pFpuCtx->FSW |= iNewTop << X86_FSW_TOP_SHIFT;
+        pFpuCtx->FTW |= RT_BIT(iNewTop);
+        iemFpuStoreQNan(&pFpuCtx->aRegs[7].r80);
+        iemFpuRotateStackPush(pFpuCtx);
     }
     else
     {
         /* Exception pending - don't change TOP or the register stack. */
-        pCtx->XState.x87.FSW &= ~X86_FSW_C_MASK;
-        pCtx->XState.x87.FSW |= X86_FSW_IE | X86_FSW_SF | X86_FSW_ES | X86_FSW_B;
+        pFpuCtx->FSW &= ~X86_FSW_C_MASK;
+        pFpuCtx->FSW |= X86_FSW_IE | X86_FSW_SF | X86_FSW_ES | X86_FSW_B;
     }
 }
@@ -5853,24 +5874,25 @@
 iemFpuStackPushUnderflowTwo(PIEMCPU pIemCpu)
 {
-    PCPUMCTX pCtx = pIemCpu->CTX_SUFF(pCtx);
-    iemFpuUpdateOpcodeAndIpWorker(pIemCpu, pCtx);
-
-    if (pCtx->XState.x87.FCW & X86_FCW_IM)
+    PCPUMCTX    pCtx    = pIemCpu->CTX_SUFF(pCtx);
+    PX86FXSTATE pFpuCtx = &pCtx->CTX_SUFF(pXState)->x87;
+    iemFpuUpdateOpcodeAndIpWorker(pIemCpu, pCtx, pFpuCtx);
+
+    if (pFpuCtx->FCW & X86_FCW_IM)
     {
         /* Masked overflow - Push QNaN. */
-        uint16_t iNewTop = (X86_FSW_TOP_GET(pCtx->XState.x87.FSW) + 7) & X86_FSW_TOP_SMASK;
-        pCtx->XState.x87.FSW &= ~(X86_FSW_TOP_MASK | X86_FSW_C_MASK);
-        pCtx->XState.x87.FSW |= X86_FSW_IE | X86_FSW_SF;
-        pCtx->XState.x87.FSW |= iNewTop << X86_FSW_TOP_SHIFT;
-        pCtx->XState.x87.FTW |= RT_BIT(iNewTop);
-        iemFpuStoreQNan(&pCtx->XState.x87.aRegs[0].r80);
-        iemFpuStoreQNan(&pCtx->XState.x87.aRegs[7].r80);
-        iemFpuRotateStackPush(pCtx);
+        uint16_t iNewTop = (X86_FSW_TOP_GET(pFpuCtx->FSW) + 7) & X86_FSW_TOP_SMASK;
+        pFpuCtx->FSW &= ~(X86_FSW_TOP_MASK | X86_FSW_C_MASK);
+        pFpuCtx->FSW |= X86_FSW_IE | X86_FSW_SF;
+        pFpuCtx->FSW |= iNewTop << X86_FSW_TOP_SHIFT;
+        pFpuCtx->FTW |= RT_BIT(iNewTop);
+        iemFpuStoreQNan(&pFpuCtx->aRegs[0].r80);
+        iemFpuStoreQNan(&pFpuCtx->aRegs[7].r80);
+        iemFpuRotateStackPush(pFpuCtx);
     }
     else
     {
         /* Exception pending - don't change TOP or the register stack. */
-        pCtx->XState.x87.FSW &= ~X86_FSW_C_MASK;
-        pCtx->XState.x87.FSW |= X86_FSW_IE | X86_FSW_SF | X86_FSW_ES | X86_FSW_B;
+        pFpuCtx->FSW &= ~X86_FSW_C_MASK;
+        pFpuCtx->FSW |= X86_FSW_IE | X86_FSW_SF | X86_FSW_ES | X86_FSW_B;
     }
 }
@@ -5880,39 +5902,39 @@
  * Worker routine for raising an FPU stack overflow exception on a push.
  *
+ * @param   pFpuCtx             The FPU context.
+ */
+static void iemFpuStackPushOverflowOnly(PX86FXSTATE pFpuCtx)
+{
+    if (pFpuCtx->FCW & X86_FCW_IM)
+    {
+        /* Masked overflow. */
+        uint16_t iNewTop = (X86_FSW_TOP_GET(pFpuCtx->FSW) + 7) & X86_FSW_TOP_SMASK;
+        pFpuCtx->FSW &= ~(X86_FSW_TOP_MASK | X86_FSW_C_MASK);
+        pFpuCtx->FSW |= X86_FSW_C1 | X86_FSW_IE | X86_FSW_SF;
+        pFpuCtx->FSW |= iNewTop << X86_FSW_TOP_SHIFT;
+        pFpuCtx->FTW |= RT_BIT(iNewTop);
+        iemFpuStoreQNan(&pFpuCtx->aRegs[7].r80);
+        iemFpuRotateStackPush(pFpuCtx);
+    }
+    else
+    {
+        /* Exception pending - don't change TOP or the register stack. */
+        pFpuCtx->FSW &= ~X86_FSW_C_MASK;
+        pFpuCtx->FSW |= X86_FSW_C1 | X86_FSW_IE | X86_FSW_SF | X86_FSW_ES | X86_FSW_B;
+    }
+}
+
+
+/**
+ * Raises an FPU stack overflow exception on a push.
+ *
  * @param   pIemCpu             The IEM per CPU data.
- * @param   pCtx                The CPU context.
- */
-static void iemFpuStackPushOverflowOnly(PIEMCPU pIemCpu, PCPUMCTX pCtx)
-{
-    if (pCtx->XState.x87.FCW & X86_FCW_IM)
-    {
-        /* Masked overflow. */
-        uint16_t iNewTop = (X86_FSW_TOP_GET(pCtx->XState.x87.FSW) + 7) & X86_FSW_TOP_SMASK;
-        pCtx->XState.x87.FSW &= ~(X86_FSW_TOP_MASK | X86_FSW_C_MASK);
-        pCtx->XState.x87.FSW |= X86_FSW_C1 | X86_FSW_IE | X86_FSW_SF;
-        pCtx->XState.x87.FSW |= iNewTop << X86_FSW_TOP_SHIFT;
-        pCtx->XState.x87.FTW |= RT_BIT(iNewTop);
-        iemFpuStoreQNan(&pCtx->XState.x87.aRegs[7].r80);
-        iemFpuRotateStackPush(pCtx);
-    }
-    else
-    {
-        /* Exception pending - don't change TOP or the register stack. */
-        pCtx->XState.x87.FSW &= ~X86_FSW_C_MASK;
-        pCtx->XState.x87.FSW |= X86_FSW_C1 | X86_FSW_IE | X86_FSW_SF | X86_FSW_ES | X86_FSW_B;
-    }
-}
-
-
-/**
- * Raises a FPU stack overflow exception on a push.
- *
- * @param   pIemCpu             The IEM per CPU data.
  */
 DECL_NO_INLINE(static, void) iemFpuStackPushOverflow(PIEMCPU pIemCpu)
 {
-    PCPUMCTX pCtx = pIemCpu->CTX_SUFF(pCtx);
-    iemFpuUpdateOpcodeAndIpWorker(pIemCpu, pCtx);
-    iemFpuStackPushOverflowOnly(pIemCpu, pCtx);
+    PCPUMCTX    pCtx    = pIemCpu->CTX_SUFF(pCtx);
+    PX86FXSTATE pFpuCtx = &pCtx->CTX_SUFF(pXState)->x87;
+    iemFpuUpdateOpcodeAndIpWorker(pIemCpu, pCtx, pFpuCtx);
+    iemFpuStackPushOverflowOnly(pFpuCtx);
 }
 
@@ -5928,8 +5950,9 @@
 iemFpuStackPushOverflowWithMemOp(PIEMCPU pIemCpu, uint8_t iEffSeg, RTGCPTR GCPtrEff)
 {
-    PCPUMCTX pCtx = pIemCpu->CTX_SUFF(pCtx);
-    iemFpuUpdateDP(pIemCpu, pCtx, iEffSeg, GCPtrEff);
-    iemFpuUpdateOpcodeAndIpWorker(pIemCpu, pCtx);
-    iemFpuStackPushOverflowOnly(pIemCpu, pCtx);
+    PCPUMCTX    pCtx    = pIemCpu->CTX_SUFF(pCtx);
+    PX86FXSTATE pFpuCtx = &pCtx->CTX_SUFF(pXState)->x87;
+    iemFpuUpdateDP(pIemCpu, pCtx, pFpuCtx, iEffSeg, GCPtrEff);
+    iemFpuUpdateOpcodeAndIpWorker(pIemCpu, pCtx, pFpuCtx);
+    iemFpuStackPushOverflowOnly(pFpuCtx);
 }
 
@@ -5937,7 +5960,7 @@
 static int iemFpuStRegNotEmpty(PIEMCPU pIemCpu, uint8_t iStReg)
 {
-    PCPUMCTX pCtx = pIemCpu->CTX_SUFF(pCtx);
-    uint16_t iReg = (X86_FSW_TOP_GET(pCtx->XState.x87.FSW) + iStReg) & X86_FSW_TOP_SMASK;
-    if (pCtx->XState.x87.FTW & RT_BIT(iReg))
+    PX86FXSTATE pFpuCtx = &pIemCpu->CTX_SUFF(pCtx)->CTX_SUFF(pXState)->x87;
+    uint16_t    iReg    = (X86_FSW_TOP_GET(pFpuCtx->FSW) + iStReg) & X86_FSW_TOP_SMASK;
+    if (pFpuCtx->FTW & RT_BIT(iReg))
         return VINF_SUCCESS;
     return VERR_NOT_FOUND;
@@ -5947,9 +5970,9 @@
 static int iemFpuStRegNotEmptyRef(PIEMCPU pIemCpu, uint8_t iStReg, PCRTFLOAT80U *ppRef)
 {
-    PCPUMCTX pCtx = pIemCpu->CTX_SUFF(pCtx);
-    uint16_t iReg = (X86_FSW_TOP_GET(pCtx->XState.x87.FSW) + iStReg) & X86_FSW_TOP_SMASK;
-    if (pCtx->XState.x87.FTW & RT_BIT(iReg))
-    {
-        *ppRef = &pCtx->XState.x87.aRegs[iStReg].r80;
+    PX86FXSTATE pFpuCtx = &pIemCpu->CTX_SUFF(pCtx)->CTX_SUFF(pXState)->x87;
+    uint16_t    iReg    = (X86_FSW_TOP_GET(pFpuCtx->FSW) + iStReg) & X86_FSW_TOP_SMASK;
+    if (pFpuCtx->FTW & RT_BIT(iReg))
+    {
+        *ppRef = &pFpuCtx->aRegs[iStReg].r80;
         return VINF_SUCCESS;
     }
@@ -5961,12 +5984,12 @@
                                     uint8_t iStReg1, PCRTFLOAT80U *ppRef1)
 {
-    PCPUMCTX pCtx  = pIemCpu->CTX_SUFF(pCtx);
-    uint16_t iTop  = X86_FSW_TOP_GET(pCtx->XState.x87.FSW);
-    uint16_t iReg0 = (iTop + iStReg0) & X86_FSW_TOP_SMASK;
-    uint16_t iReg1 = (iTop + iStReg1) & X86_FSW_TOP_SMASK;
-    if ((pCtx->XState.x87.FTW & (RT_BIT(iReg0) | RT_BIT(iReg1))) == (RT_BIT(iReg0) | RT_BIT(iReg1)))
-    {
-        *ppRef0 = &pCtx->XState.x87.aRegs[iStReg0].r80;
-        *ppRef1 = &pCtx->XState.x87.aRegs[iStReg1].r80;
+    PX86FXSTATE pFpuCtx = &pIemCpu->CTX_SUFF(pCtx)->CTX_SUFF(pXState)->x87;
+    uint16_t    iTop    = X86_FSW_TOP_GET(pFpuCtx->FSW);
+    uint16_t    iReg0   = (iTop + iStReg0) & X86_FSW_TOP_SMASK;
+    uint16_t    iReg1   = (iTop + iStReg1) & X86_FSW_TOP_SMASK;
+    if ((pFpuCtx->FTW & (RT_BIT(iReg0) | RT_BIT(iReg1))) == (RT_BIT(iReg0) | RT_BIT(iReg1)))
+    {
+        *ppRef0 = &pFpuCtx->aRegs[iStReg0].r80;
+        *ppRef1 = &pFpuCtx->aRegs[iStReg1].r80;
         return VINF_SUCCESS;
     }
@@ -5977,11 +6000,11 @@
 static int iemFpu2StRegsNotEmptyRefFirst(PIEMCPU pIemCpu, uint8_t iStReg0, PCRTFLOAT80U *ppRef0, uint8_t iStReg1)
 {
-    PCPUMCTX pCtx  = pIemCpu->CTX_SUFF(pCtx);
-    uint16_t iTop  = X86_FSW_TOP_GET(pCtx->XState.x87.FSW);
-    uint16_t iReg0 = (iTop + iStReg0) & X86_FSW_TOP_SMASK;
-    uint16_t iReg1 = (iTop + iStReg1) & X86_FSW_TOP_SMASK;
-    if ((pCtx->XState.x87.FTW & (RT_BIT(iReg0) | RT_BIT(iReg1))) == (RT_BIT(iReg0) | RT_BIT(iReg1)))
-    {
-        *ppRef0 = &pCtx->XState.x87.aRegs[iStReg0].r80;
+    PX86FXSTATE pFpuCtx = &pIemCpu->CTX_SUFF(pCtx)->CTX_SUFF(pXState)->x87;
+    uint16_t    iTop    = X86_FSW_TOP_GET(pFpuCtx->FSW);
+    uint16_t    iReg0   = (iTop + iStReg0) & X86_FSW_TOP_SMASK;
+    uint16_t    iReg1   = (iTop + iStReg1) & X86_FSW_TOP_SMASK;
+    if ((pFpuCtx->FTW & (RT_BIT(iReg0) | RT_BIT(iReg1))) == (RT_BIT(iReg0) | RT_BIT(iReg1)))
+    {
+        *ppRef0 = &pFpuCtx->aRegs[iStReg0].r80;
         return VINF_SUCCESS;
     }
@@ -5993,14 +6016,14 @@
  * Updates the FPU exception status after FCW is changed.
  *
- * @param   pCtx                The CPU context.
- */
-static void iemFpuRecalcExceptionStatus(PCPUMCTX pCtx)
-{
-    uint16_t u16Fsw = pCtx->XState.x87.FSW;
-    if ((u16Fsw & X86_FSW_XCPT_MASK) & ~(pCtx->XState.x87.FCW & X86_FCW_XCPT_MASK))
+ * @param   pFpuCtx             The FPU context.
+ */
+static void iemFpuRecalcExceptionStatus(PX86FXSTATE pFpuCtx)
+{
+    uint16_t u16Fsw = pFpuCtx->FSW;
+    if ((u16Fsw & X86_FSW_XCPT_MASK) & ~(pFpuCtx->FCW & X86_FCW_XCPT_MASK))
         u16Fsw |= X86_FSW_ES | X86_FSW_B;
     else
         u16Fsw &= ~(X86_FSW_ES | X86_FSW_B);
-    pCtx->XState.x87.FSW = u16Fsw;
+    pFpuCtx->FSW = u16Fsw;
 }
 
@@ -6010,11 +6033,11 @@
  *
  * @returns The full FTW.
- * @param   pCtx                The CPU state.
- */
-static uint16_t iemFpuCalcFullFtw(PCCPUMCTX pCtx)
-{
-    uint8_t const   u8Ftw  = (uint8_t)pCtx->XState.x87.FTW;
+ * @param   pFpuCtx             The FPU context.
+ */
+static uint16_t iemFpuCalcFullFtw(PCX86FXSTATE pFpuCtx)
+{
+    uint8_t const   u8Ftw  = (uint8_t)pFpuCtx->FTW;
     uint16_t        u16Ftw = 0;
-    unsigned const  iTop   = X86_FSW_TOP_GET(pCtx->XState.x87.FSW);
+    unsigned const  iTop   = X86_FSW_TOP_GET(pFpuCtx->FSW);
     for (unsigned iSt = 0; iSt < 8; iSt++)
     {
@@ -6025,5 +6048,5 @@
         {
             uint16_t uTag;
-            PCRTFLOAT80U const pr80Reg = &pCtx->XState.x87.aRegs[iSt].r80;
+            PCRTFLOAT80U const pr80Reg = &pFpuCtx->aRegs[iSt].r80;
             if (pr80Reg->s.uExponent == 0x7fff)
                 uTag = 2; /* Exponent is all 1's => Special. */
@@ -7172,5 +7195,6 @@
     /* The lazy approach for now... */
     /** @todo testcase: Ordering of \#SS(0) vs \#GP() vs \#PF on SSE stuff. */
-    if ((GCPtrMem & 15) && !(pIemCpu->CTX_SUFF(pCtx)->XState.x87.MXCSR & X86_MXSCR_MM)) /** @todo should probably check this *after* applying seg.u64Base... Check real HW. */
+    if (   (GCPtrMem & 15)
+        && !(pIemCpu->CTX_SUFF(pCtx)->CTX_SUFF(pXState)->x87.MXCSR & X86_MXSCR_MM)) /** @todo should probably check this *after* applying seg.u64Base... Check real HW. */
         return iemRaiseGeneralProtectionFault0(pIemCpu);
 
@@ -7372,5 +7396,6 @@
 {
     /* The lazy approach for now... */
-    if ((GCPtrMem & 15) && !(pIemCpu->CTX_SUFF(pCtx)->XState.x87.MXCSR & X86_MXSCR_MM)) /** @todo should probably check this *after* applying seg.u64Base... Check real HW. */
+    if (   (GCPtrMem & 15)
+        && !(pIemCpu->CTX_SUFF(pCtx)->CTX_SUFF(pXState)->x87.MXCSR & X86_MXSCR_MM)) /** @todo should probably check this *after* applying seg.u64Base... Check real HW. */
         return iemRaiseGeneralProtectionFault0(pIemCpu);
 
@@ -8304,5 +8329,5 @@
 #define IEM_MC_MAYBE_RAISE_FPU_XCPT() \
     do { \
-        if ((pIemCpu)->CTX_SUFF(pCtx)->XState.x87.FSW & X86_FSW_ES) \
+        if ((pIemCpu)->CTX_SUFF(pCtx)->CTX_SUFF(pXState)->x87.FSW & X86_FSW_ES) \
             return iemRaiseMathFault(pIemCpu); \
     } while (0)
@@ -8387,6 +8412,6 @@
 #define IEM_MC_FETCH_EFLAGS(a_EFlags)                   (a_EFlags) = (pIemCpu)->CTX_SUFF(pCtx)->eflags.u
 #define IEM_MC_FETCH_EFLAGS_U8(a_EFlags)                (a_EFlags) = (uint8_t)(pIemCpu)->CTX_SUFF(pCtx)->eflags.u
-#define IEM_MC_FETCH_FSW(a_u16Fsw)                      (a_u16Fsw) = pIemCpu->CTX_SUFF(pCtx)->XState.x87.FSW
-#define IEM_MC_FETCH_FCW(a_u16Fcw)                      (a_u16Fcw) = pIemCpu->CTX_SUFF(pCtx)->XState.x87.FCW
+#define IEM_MC_FETCH_FSW(a_u16Fsw)                      (a_u16Fsw) = pIemCpu->CTX_SUFF(pCtx)->CTX_SUFF(pXState)->x87.FSW
+#define IEM_MC_FETCH_FCW(a_u16Fcw)                      (a_u16Fcw) = pIemCpu->CTX_SUFF(pCtx)->CTX_SUFF(pXState)->x87.FCW
 
 #define IEM_MC_STORE_GREG_U8(a_iGReg, a_u8Value)        *iemGRegRefU8(pIemCpu, (a_iGReg)) = (a_u8Value)
@@ -8401,5 +8426,5 @@
 #define IEM_MC_CLEAR_HIGH_GREG_U64_BY_REF(a_pu32Dst)    do { (a_pu32Dst)[1] = 0; } while (0)
 #define IEM_MC_STORE_FPUREG_R80_SRC_REF(a_iSt, a_pr80Src) \
-    do { pIemCpu->CTX_SUFF(pCtx)->XState.x87.aRegs[a_iSt].r80 = *(a_pr80Src); } while (0)
+    do { pIemCpu->CTX_SUFF(pCtx)->CTX_SUFF(pXState)->x87.aRegs[a_iSt].r80 = *(a_pr80Src); } while (0)
 
 #define IEM_MC_REF_GREG_U8(a_pu8Dst, a_iGReg)           (a_pu8Dst) = iemGRegRefU8(pIemCpu, (a_iGReg))
@@ -8492,44 +8517,44 @@
 #define IEM_MC_FLIP_EFL_BIT(a_fBit)                     do { (pIemCpu)->CTX_SUFF(pCtx)->eflags.u ^= (a_fBit); } while (0)
 
-#define IEM_MC_CLEAR_FSW_EX()   do { (pIemCpu)->CTX_SUFF(pCtx)->XState.x87.FSW &= X86_FSW_C_MASK | X86_FSW_TOP_MASK; } while (0)
+#define IEM_MC_CLEAR_FSW_EX()   do { (pIemCpu)->CTX_SUFF(pCtx)->CTX_SUFF(pXState)->x87.FSW &= X86_FSW_C_MASK | X86_FSW_TOP_MASK; } while (0)
 
 
 #define IEM_MC_FETCH_MREG_U64(a_u64Value, a_iMReg) \
-    do { (a_u64Value) = pIemCpu->CTX_SUFF(pCtx)->XState.x87.aRegs[(a_iMReg)].mmx; } while (0)
+    do { (a_u64Value) = pIemCpu->CTX_SUFF(pCtx)->CTX_SUFF(pXState)->x87.aRegs[(a_iMReg)].mmx; } while (0)
 #define IEM_MC_FETCH_MREG_U32(a_u32Value, a_iMReg) \
-    do { (a_u32Value) = pIemCpu->CTX_SUFF(pCtx)->XState.x87.aRegs[(a_iMReg)].au32[0]; } while (0)
+    do { (a_u32Value) = pIemCpu->CTX_SUFF(pCtx)->CTX_SUFF(pXState)->x87.aRegs[(a_iMReg)].au32[0]; } while (0)
 #define IEM_MC_STORE_MREG_U64(a_iMReg, a_u64Value) \
-    do { pIemCpu->CTX_SUFF(pCtx)->XState.x87.aRegs[(a_iMReg)].mmx = (a_u64Value); } while (0)
+    do { pIemCpu->CTX_SUFF(pCtx)->CTX_SUFF(pXState)->x87.aRegs[(a_iMReg)].mmx = (a_u64Value); } while (0)
 #define IEM_MC_STORE_MREG_U32_ZX_U64(a_iMReg, a_u32Value) \
-    do { pIemCpu->CTX_SUFF(pCtx)->XState.x87.aRegs[(a_iMReg)].mmx = (uint32_t)(a_u32Value); } while (0)
+    do { pIemCpu->CTX_SUFF(pCtx)->CTX_SUFF(pXState)->x87.aRegs[(a_iMReg)].mmx = (uint32_t)(a_u32Value); } while (0)
 #define IEM_MC_REF_MREG_U64(a_pu64Dst, a_iMReg)         \
-        (a_pu64Dst) = (&pIemCpu->CTX_SUFF(pCtx)->XState.x87.aRegs[(a_iMReg)].mmx)
+        (a_pu64Dst) = (&pIemCpu->CTX_SUFF(pCtx)->CTX_SUFF(pXState)->x87.aRegs[(a_iMReg)].mmx)
 #define IEM_MC_REF_MREG_U64_CONST(a_pu64Dst, a_iMReg) \
-        (a_pu64Dst) = ((uint64_t const *)&pIemCpu->CTX_SUFF(pCtx)->XState.x87.aRegs[(a_iMReg)].mmx)
+        (a_pu64Dst) = ((uint64_t const *)&pIemCpu->CTX_SUFF(pCtx)->CTX_SUFF(pXState)->x87.aRegs[(a_iMReg)].mmx)
 #define IEM_MC_REF_MREG_U32_CONST(a_pu32Dst, a_iMReg) \
-        (a_pu32Dst) = ((uint32_t const *)&pIemCpu->CTX_SUFF(pCtx)->XState.x87.aRegs[(a_iMReg)].mmx)
+        (a_pu32Dst) = ((uint32_t const *)&pIemCpu->CTX_SUFF(pCtx)->CTX_SUFF(pXState)->x87.aRegs[(a_iMReg)].mmx)
 
 #define IEM_MC_FETCH_XREG_U128(a_u128Value, a_iXReg) \
-    do { (a_u128Value) = pIemCpu->CTX_SUFF(pCtx)->XState.x87.aXMM[(a_iXReg)].xmm; } while (0)
+    do { (a_u128Value) = pIemCpu->CTX_SUFF(pCtx)->CTX_SUFF(pXState)->x87.aXMM[(a_iXReg)].xmm; } while (0)
 #define IEM_MC_FETCH_XREG_U64(a_u64Value, a_iXReg) \
-    do { (a_u64Value) = pIemCpu->CTX_SUFF(pCtx)->XState.x87.aXMM[(a_iXReg)].au64[0]; } while (0)
+    do { (a_u64Value) = pIemCpu->CTX_SUFF(pCtx)->CTX_SUFF(pXState)->x87.aXMM[(a_iXReg)].au64[0]; } while (0)
 #define IEM_MC_FETCH_XREG_U32(a_u32Value, a_iXReg) \
-    do { (a_u32Value) = pIemCpu->CTX_SUFF(pCtx)->XState.x87.aXMM[(a_iXReg)].au32[0]; } while (0)
+    do { (a_u32Value) = pIemCpu->CTX_SUFF(pCtx)->CTX_SUFF(pXState)->x87.aXMM[(a_iXReg)].au32[0]; } while (0)
 #define IEM_MC_STORE_XREG_U128(a_iXReg, a_u128Value) \
-    do { pIemCpu->CTX_SUFF(pCtx)->XState.x87.aXMM[(a_iXReg)].xmm = (a_u128Value); } while (0)
+    do { pIemCpu->CTX_SUFF(pCtx)->CTX_SUFF(pXState)->x87.aXMM[(a_iXReg)].xmm = (a_u128Value); } while (0)
 #define IEM_MC_STORE_XREG_U64_ZX_U128(a_iXReg, a_u64Value) \
-    do { pIemCpu->CTX_SUFF(pCtx)->XState.x87.aXMM[(a_iXReg)].au64[0] = (a_u64Value); \
-         pIemCpu->CTX_SUFF(pCtx)->XState.x87.aXMM[(a_iXReg)].au64[1] = 0; \
+    do { pIemCpu->CTX_SUFF(pCtx)->CTX_SUFF(pXState)->x87.aXMM[(a_iXReg)].au64[0] = (a_u64Value); \
+         pIemCpu->CTX_SUFF(pCtx)->CTX_SUFF(pXState)->x87.aXMM[(a_iXReg)].au64[1] = 0; \
     } while (0)
 #define IEM_MC_STORE_XREG_U32_ZX_U128(a_iXReg, a_u32Value) \
-    do { pIemCpu->CTX_SUFF(pCtx)->XState.x87.aXMM[(a_iXReg)].au64[0] = (uint32_t)(a_u32Value); \
-         pIemCpu->CTX_SUFF(pCtx)->XState.x87.aXMM[(a_iXReg)].au64[1] = 0; \
+    do { pIemCpu->CTX_SUFF(pCtx)->CTX_SUFF(pXState)->x87.aXMM[(a_iXReg)].au64[0] = (uint32_t)(a_u32Value); \
+         pIemCpu->CTX_SUFF(pCtx)->CTX_SUFF(pXState)->x87.aXMM[(a_iXReg)].au64[1] = 0; \
     } while (0)
 #define IEM_MC_REF_XREG_U128(a_pu128Dst, a_iXReg)       \
-    (a_pu128Dst) = (&pIemCpu->CTX_SUFF(pCtx)->XState.x87.aXMM[(a_iXReg)].xmm)
+    (a_pu128Dst) = (&pIemCpu->CTX_SUFF(pCtx)->CTX_SUFF(pXState)->x87.aXMM[(a_iXReg)].xmm)
 #define IEM_MC_REF_XREG_U128_CONST(a_pu128Dst, a_iXReg) \
-    (a_pu128Dst) = ((uint128_t const *)&pIemCpu->CTX_SUFF(pCtx)->XState.x87.aXMM[(a_iXReg)].xmm)
+    (a_pu128Dst) = ((uint128_t const *)&pIemCpu->CTX_SUFF(pCtx)->CTX_SUFF(pXState)->x87.aXMM[(a_iXReg)].xmm)
 #define IEM_MC_REF_XREG_U64_CONST(a_pu64Dst, a_iXReg) \
-    (a_pu64Dst) = ((uint64_t const *)&pIemCpu->CTX_SUFF(pCtx)->XState.x87.aXMM[(a_iXReg)].au64[0])
+    (a_pu64Dst) = ((uint64_t const *)&pIemCpu->CTX_SUFF(pCtx)->CTX_SUFF(pXState)->x87.aXMM[(a_iXReg)].au64[0])
 
 #define IEM_MC_FETCH_MEM_U8(a_u8Dst, a_iSeg, a_GCPtrMem) \
@@ -8739,5 +8764,5 @@
         if (   !(a_u16FSW & X86_FSW_ES) \
             || !(  (a_u16FSW & (X86_FSW_UE | X86_FSW_OE | X86_FSW_IE)) \
-                 & ~(pIemCpu->CTX_SUFF(pCtx)->XState.x87.FCW & X86_FCW_MASK_ALL) ) ) \
+                 & ~(pIemCpu->CTX_SUFF(pCtx)->CTX_SUFF(pXState)->x87.FCW & X86_FCW_MASK_ALL) ) ) \
             IEM_MC_RETURN_ON_FAILURE(iemMemCommitAndUnmap(pIemCpu, (a_pvMem), (a_fAccess))); \
     } while (0)
@@ -8875,5 +8900,5 @@
     do { \
         iemFpuPrepareUsage(pIemCpu); \
-        a_pfnAImpl(&pIemCpu->CTX_SUFF(pCtx)->XState.x87, (a0)); \
+        a_pfnAImpl(&pIemCpu->CTX_SUFF(pCtx)->CTX_SUFF(pXState)->x87, (a0)); \
     } while (0)
 
@@ -8888,5 +8913,5 @@
     do { \
         iemFpuPrepareUsage(pIemCpu); \
-        a_pfnAImpl(&pIemCpu->CTX_SUFF(pCtx)->XState.x87, (a0), (a1)); \
+        a_pfnAImpl(&pIemCpu->CTX_SUFF(pCtx)->CTX_SUFF(pXState)->x87, (a0), (a1)); \
     } while (0)
 
@@ -8902,5 +8927,5 @@
     do { \
         iemFpuPrepareUsage(pIemCpu); \
-        a_pfnAImpl(&pIemCpu->CTX_SUFF(pCtx)->XState.x87, (a0), (a1), (a2)); \
+        a_pfnAImpl(&pIemCpu->CTX_SUFF(pCtx)->CTX_SUFF(pXState)->x87, (a0), (a1), (a2)); \
     } while (0)
 
@@ -9019,5 +9044,5 @@
     do { \
         iemFpuPrepareUsage(pIemCpu); \
-        a_pfnAImpl(&pIemCpu->CTX_SUFF(pCtx)->XState.x87, (a0), (a1)); \
+        a_pfnAImpl(&pIemCpu->CTX_SUFF(pCtx)->CTX_SUFF(pXState)->x87, (a0), (a1)); \
     } while (0)
 
@@ -9033,5 +9058,5 @@
     do { \
         iemFpuPrepareUsage(pIemCpu); \
-        a_pfnAImpl(&pIemCpu->CTX_SUFF(pCtx)->XState.x87, (a0), (a1), (a2)); \
+        a_pfnAImpl(&pIemCpu->CTX_SUFF(pCtx)->CTX_SUFF(pXState)->x87, (a0), (a1), (a2)); \
     } while (0)
 
@@ -9047,5 +9072,5 @@
     do { \
         iemFpuPrepareUsageSse(pIemCpu); \
-        a_pfnAImpl(&pIemCpu->CTX_SUFF(pCtx)->XState.x87, (a0), (a1)); \
+        a_pfnAImpl(&pIemCpu->CTX_SUFF(pCtx)->CTX_SUFF(pXState)->x87, (a0), (a1)); \
     } while (0)
 
@@ -9061,5 +9086,5 @@
     do { \
         iemFpuPrepareUsageSse(pIemCpu); \
-        a_pfnAImpl(&pIemCpu->CTX_SUFF(pCtx)->XState.x87, (a0), (a1), (a2)); \
+        a_pfnAImpl(&pIemCpu->CTX_SUFF(pCtx)->CTX_SUFF(pXState)->x87, (a0), (a1), (a2)); \
     } while (0)
 
@@ -9131,5 +9156,5 @@
     if (iemFpu2StRegsNotEmptyRefFirst(pIemCpu, (a_iSt0), &(a_pr80Dst0), (a_iSt1)) == VINF_SUCCESS) {
 #define IEM_MC_IF_FCW_IM() \
-    if (pIemCpu->CTX_SUFF(pCtx)->XState.x87.FCW & X86_FCW_IM) {
+    if (pIemCpu->CTX_SUFF(pCtx)->CTX_SUFF(pXState)->x87.FCW & X86_FCW_IM) {
 
 #define IEM_MC_ELSE()                                   } else {
@@ -10195,4 +10220,20 @@
         } \
     } while (0)
+#  define CHECK_XSTATE_FIELD(a_Field) \
+    do \
+    { \
+        if (pOrgXState->a_Field != pDebugXState->a_Field) \
+        { \
+            switch (sizeof(pOrgCtx->a_Field)) \
+            { \
+                case 1: RTAssertMsg2Weak("  %8s differs - iem=%02x - %s=%02x\n", #a_Field, pDebugXState->a_Field, pszWho, pOrgXState->a_Field); break; \
+                case 2: RTAssertMsg2Weak("  %8s differs - iem=%04x - %s=%04x\n", #a_Field, pDebugXState->a_Field, pszWho, pOrgXState->a_Field); break; \
+                case 4: RTAssertMsg2Weak("  %8s differs - iem=%08x - %s=%08x\n", #a_Field, pDebugXState->a_Field, pszWho, pOrgXState->a_Field); break; \
+                case 8: RTAssertMsg2Weak("  %8s differs - iem=%016llx - %s=%016llx\n", #a_Field, pDebugXState->a_Field, pszWho, pOrgXState->a_Field); break; \
+                default: RTAssertMsg2Weak("  %8s differs\n", #a_Field); break; \
+            } \
+            cDiffs++; \
+        } \
+    } while (0)
 
 #  define CHECK_BIT_FIELD(a_Field) \
@@ -10216,61 +10257,64 @@
     } while (0)
 
+        PX86XSAVEAREA pOrgXState   = pOrgCtx->CTX_SUFF(pXState);
+        PX86XSAVEAREA pDebugXState = pDebugCtx->CTX_SUFF(pXState);
+
 #if 1 /* The recompiler doesn't update these the intel way. */
         if (fRem)
         {
-            pOrgCtx->XState.x87.FOP        = pDebugCtx->XState.x87.FOP;
-            pOrgCtx->XState.x87.FPUIP      = pDebugCtx->XState.x87.FPUIP;
-            pOrgCtx->XState.x87.CS         = pDebugCtx->XState.x87.CS;
-            pOrgCtx->XState.x87.Rsrvd1     = pDebugCtx->XState.x87.Rsrvd1;
-            pOrgCtx->XState.x87.FPUDP      = pDebugCtx->XState.x87.FPUDP;
-            pOrgCtx->XState.x87.DS         = pDebugCtx->XState.x87.DS;
-            pOrgCtx->XState.x87.Rsrvd2     = pDebugCtx->XState.x87.Rsrvd2;
-            //pOrgCtx->XState.x87.MXCSR_MASK = pDebugCtx->XState.x87.MXCSR_MASK;
-            if ((pOrgCtx->XState.x87.FSW & X86_FSW_TOP_MASK) == (pDebugCtx->XState.x87.FSW & X86_FSW_TOP_MASK))
-                pOrgCtx->XState.x87.FSW = pDebugCtx->XState.x87.FSW;
+            pOrgXState->x87.FOP        = pDebugXState->x87.FOP;
+            pOrgXState->x87.FPUIP      = pDebugXState->x87.FPUIP;
+            pOrgXState->x87.CS         = pDebugXState->x87.CS;
+            pOrgXState->x87.Rsrvd1     = pDebugXState->x87.Rsrvd1;
+            pOrgXState->x87.FPUDP      = pDebugXState->x87.FPUDP;
+            pOrgXState->x87.DS         = pDebugXState->x87.DS;
+            pOrgXState->x87.Rsrvd2     = pDebugXState->x87.Rsrvd2;
+            //pOrgXState->x87.MXCSR_MASK = pDebugXState->x87.MXCSR_MASK;
+            if ((pOrgXState->x87.FSW & X86_FSW_TOP_MASK) == (pDebugXState->x87.FSW & X86_FSW_TOP_MASK))
+                pOrgXState->x87.FSW = pDebugXState->x87.FSW;
         }
 #endif
-        if (memcmp(&pOrgCtx->XState.x87, &pDebugCtx->XState.x87, sizeof(pDebugCtx->XState.x87)))
+        if (memcmp(&pOrgXState->x87, &pDebugXState->x87, sizeof(pDebugXState->x87)))
         {
             RTAssertMsg2Weak("  the FPU state differs\n");
             cDiffs++;
-            CHECK_FIELD(XState.x87.FCW);
-            CHECK_FIELD(XState.x87.FSW);
-            CHECK_FIELD(XState.x87.FTW);
-            CHECK_FIELD(XState.x87.FOP);
-            CHECK_FIELD(XState.x87.FPUIP);
-            CHECK_FIELD(XState.x87.CS);
-            CHECK_FIELD(XState.x87.Rsrvd1);
-            CHECK_FIELD(XState.x87.FPUDP);
-            CHECK_FIELD(XState.x87.DS);
-            CHECK_FIELD(XState.x87.Rsrvd2);
-            CHECK_FIELD(XState.x87.MXCSR);
-            CHECK_FIELD(XState.x87.MXCSR_MASK);
-            CHECK_FIELD(XState.x87.aRegs[0].au64[0]); CHECK_FIELD(XState.x87.aRegs[0].au64[1]);
-            CHECK_FIELD(XState.x87.aRegs[1].au64[0]); CHECK_FIELD(XState.x87.aRegs[1].au64[1]);
-            CHECK_FIELD(XState.x87.aRegs[2].au64[0]); CHECK_FIELD(XState.x87.aRegs[2].au64[1]);
-            CHECK_FIELD(XState.x87.aRegs[3].au64[0]); CHECK_FIELD(XState.x87.aRegs[3].au64[1]);
-            CHECK_FIELD(XState.x87.aRegs[4].au64[0]); CHECK_FIELD(XState.x87.aRegs[4].au64[1]);
-            CHECK_FIELD(XState.x87.aRegs[5].au64[0]); CHECK_FIELD(XState.x87.aRegs[5].au64[1]);
-            CHECK_FIELD(XState.x87.aRegs[6].au64[0]); CHECK_FIELD(XState.x87.aRegs[6].au64[1]);
-            CHECK_FIELD(XState.x87.aRegs[7].au64[0]); CHECK_FIELD(XState.x87.aRegs[7].au64[1]);
-            CHECK_FIELD(XState.x87.aXMM[ 0].au64[0]);  CHECK_FIELD(XState.x87.aXMM[ 0].au64[1]);
-            CHECK_FIELD(XState.x87.aXMM[ 1].au64[0]);  CHECK_FIELD(XState.x87.aXMM[ 1].au64[1]);
-            CHECK_FIELD(XState.x87.aXMM[ 2].au64[0]);  CHECK_FIELD(XState.x87.aXMM[ 2].au64[1]);
-            CHECK_FIELD(XState.x87.aXMM[ 3].au64[0]);  CHECK_FIELD(XState.x87.aXMM[ 3].au64[1]);
-            CHECK_FIELD(XState.x87.aXMM[ 4].au64[0]);  CHECK_FIELD(XState.x87.aXMM[ 4].au64[1]);
-            CHECK_FIELD(XState.x87.aXMM[ 5].au64[0]);  CHECK_FIELD(XState.x87.aXMM[ 5].au64[1]);
-            CHECK_FIELD(XState.x87.aXMM[ 6].au64[0]);  CHECK_FIELD(XState.x87.aXMM[ 6].au64[1]);
-            CHECK_FIELD(XState.x87.aXMM[ 7].au64[0]);  CHECK_FIELD(XState.x87.aXMM[ 7].au64[1]);
-            CHECK_FIELD(XState.x87.aXMM[ 8].au64[0]);  CHECK_FIELD(XState.x87.aXMM[ 8].au64[1]);
-            CHECK_FIELD(XState.x87.aXMM[ 9].au64[0]);  CHECK_FIELD(XState.x87.aXMM[ 9].au64[1]);
-            CHECK_FIELD(XState.x87.aXMM[10].au64[0]);  CHECK_FIELD(XState.x87.aXMM[10].au64[1]);
-            CHECK_FIELD(XState.x87.aXMM[11].au64[0]);  CHECK_FIELD(XState.x87.aXMM[11].au64[1]);
-            CHECK_FIELD(XState.x87.aXMM[12].au64[0]);  CHECK_FIELD(XState.x87.aXMM[12].au64[1]);
-            CHECK_FIELD(XState.x87.aXMM[13].au64[0]);  CHECK_FIELD(XState.x87.aXMM[13].au64[1]);
-            CHECK_FIELD(XState.x87.aXMM[14].au64[0]);  CHECK_FIELD(XState.x87.aXMM[14].au64[1]);
-            CHECK_FIELD(XState.x87.aXMM[15].au64[0]);  CHECK_FIELD(XState.x87.aXMM[15].au64[1]);
-            for (unsigned i = 0; i < RT_ELEMENTS(pOrgCtx->XState.x87.au32RsrvdRest); i++)
-                CHECK_FIELD(XState.x87.au32RsrvdRest[i]);
+            CHECK_XSTATE_FIELD(x87.FCW);
+            CHECK_XSTATE_FIELD(x87.FSW);
+            CHECK_XSTATE_FIELD(x87.FTW);
+            CHECK_XSTATE_FIELD(x87.FOP);
+            CHECK_XSTATE_FIELD(x87.FPUIP);
+            CHECK_XSTATE_FIELD(x87.CS);
+            CHECK_XSTATE_FIELD(x87.Rsrvd1);
+            CHECK_XSTATE_FIELD(x87.FPUDP);
+            CHECK_XSTATE_FIELD(x87.DS);
+            CHECK_XSTATE_FIELD(x87.Rsrvd2);
+            CHECK_XSTATE_FIELD(x87.MXCSR);
+            CHECK_XSTATE_FIELD(x87.MXCSR_MASK);
+            CHECK_XSTATE_FIELD(x87.aRegs[0].au64[0]); CHECK_XSTATE_FIELD(x87.aRegs[0].au64[1]);
+            CHECK_XSTATE_FIELD(x87.aRegs[1].au64[0]); CHECK_XSTATE_FIELD(x87.aRegs[1].au64[1]);
+            CHECK_XSTATE_FIELD(x87.aRegs[2].au64[0]); CHECK_XSTATE_FIELD(x87.aRegs[2].au64[1]);
+            CHECK_XSTATE_FIELD(x87.aRegs[3].au64[0]); CHECK_XSTATE_FIELD(x87.aRegs[3].au64[1]);
+            CHECK_XSTATE_FIELD(x87.aRegs[4].au64[0]); CHECK_XSTATE_FIELD(x87.aRegs[4].au64[1]);
+            CHECK_XSTATE_FIELD(x87.aRegs[5].au64[0]); CHECK_XSTATE_FIELD(x87.aRegs[5].au64[1]);
+            CHECK_XSTATE_FIELD(x87.aRegs[6].au64[0]); CHECK_XSTATE_FIELD(x87.aRegs[6].au64[1]);
+            CHECK_XSTATE_FIELD(x87.aRegs[7].au64[0]); CHECK_XSTATE_FIELD(x87.aRegs[7].au64[1]);
+            CHECK_XSTATE_FIELD(x87.aXMM[ 0].au64[0]);  CHECK_XSTATE_FIELD(x87.aXMM[ 0].au64[1]);
+            CHECK_XSTATE_FIELD(x87.aXMM[ 1].au64[0]);  CHECK_XSTATE_FIELD(x87.aXMM[ 1].au64[1]);
+            CHECK_XSTATE_FIELD(x87.aXMM[ 2].au64[0]);  CHECK_XSTATE_FIELD(x87.aXMM[ 2].au64[1]);
+            CHECK_XSTATE_FIELD(x87.aXMM[ 3].au64[0]);  CHECK_XSTATE_FIELD(x87.aXMM[ 3].au64[1]);
+            CHECK_XSTATE_FIELD(x87.aXMM[ 4].au64[0]);  CHECK_XSTATE_FIELD(x87.aXMM[ 4].au64[1]);
+            CHECK_XSTATE_FIELD(x87.aXMM[ 5].au64[0]);  CHECK_XSTATE_FIELD(x87.aXMM[ 5].au64[1]);
+            CHECK_XSTATE_FIELD(x87.aXMM[ 6].au64[0]);  CHECK_XSTATE_FIELD(x87.aXMM[ 6].au64[1]);
+            CHECK_XSTATE_FIELD(x87.aXMM[ 7].au64[0]);  CHECK_XSTATE_FIELD(x87.aXMM[ 7].au64[1]);
+            CHECK_XSTATE_FIELD(x87.aXMM[ 8].au64[0]);  CHECK_XSTATE_FIELD(x87.aXMM[ 8].au64[1]);
+            CHECK_XSTATE_FIELD(x87.aXMM[ 9].au64[0]);  CHECK_XSTATE_FIELD(x87.aXMM[ 9].au64[1]);
+            CHECK_XSTATE_FIELD(x87.aXMM[10].au64[0]);  CHECK_XSTATE_FIELD(x87.aXMM[10].au64[1]);
+            CHECK_XSTATE_FIELD(x87.aXMM[11].au64[0]);  CHECK_XSTATE_FIELD(x87.aXMM[11].au64[1]);
+            CHECK_XSTATE_FIELD(x87.aXMM[12].au64[0]);  CHECK_XSTATE_FIELD(x87.aXMM[12].au64[1]);
+            CHECK_XSTATE_FIELD(x87.aXMM[13].au64[0]);  CHECK_XSTATE_FIELD(x87.aXMM[13].au64[1]);
+            CHECK_XSTATE_FIELD(x87.aXMM[14].au64[0]);  CHECK_XSTATE_FIELD(x87.aXMM[14].au64[1]);
+            CHECK_XSTATE_FIELD(x87.aXMM[15].au64[0]);  CHECK_XSTATE_FIELD(x87.aXMM[15].au64[1]);
+            for (unsigned i = 0; i < RT_ELEMENTS(pOrgXState->x87.au32RsrvdRest); i++)
+                CHECK_XSTATE_FIELD(x87.au32RsrvdRest[i]);
         }
         CHECK_FIELD(rip);
@@ -10518,4 +10562,5 @@
         }
 
+        PCX86FXSTATE pFpuCtx = &pCtx->CTX_SUFF(pXState)->x87;
         Log2(("****\n"
               " eax=%08x ebx=%08x ecx=%08x edx=%08x esi=%08x edi=%08x\n"
@@ -10529,5 +10574,5 @@
               pCtx->cs.Sel, pCtx->ss.Sel, pCtx->ds.Sel, pCtx->es.Sel,
               pCtx->fs.Sel, pCtx->gs.Sel, pCtx->eflags.u,
-              pCtx->XState.x87.FSW, pCtx->XState.x87.FCW, pCtx->XState.x87.FTW, pCtx->XState.x87.MXCSR, pCtx->XState.x87.MXCSR_MASK,
+              pFpuCtx->FSW, pFpuCtx->FCW, pFpuCtx->FTW, pFpuCtx->MXCSR, pFpuCtx->MXCSR_MASK,
               szInstr));
 
Index: /trunk/src/VBox/VMM/VMMAll/IEMAllCImpl.cpp.h
===================================================================
--- /trunk/src/VBox/VMM/VMMAll/IEMAllCImpl.cpp.h	(revision 55047)
+++ /trunk/src/VBox/VMM/VMMAll/IEMAllCImpl.cpp.h	(revision 55048)
@@ -6043,20 +6043,21 @@
      */
 
+    PX86XSAVEAREA pXState = pCtx->CTX_SUFF(pXState);
     if (iemFRegIsFxSaveFormat(pIemCpu))
     {
-        pCtx->XState.x87.FCW   = 0x37f;
-        pCtx->XState.x87.FSW   = 0;
-        pCtx->XState.x87.FTW   = 0x00;         /* 0 - empty. */
-        pCtx->XState.x87.FPUDP = 0;
-        pCtx->XState.x87.DS    = 0; //??
-        pCtx->XState.x87.Rsrvd2= 0;
-        pCtx->XState.x87.FPUIP = 0;
-        pCtx->XState.x87.CS    = 0; //??
-        pCtx->XState.x87.Rsrvd1= 0;
-        pCtx->XState.x87.FOP   = 0;
+        pXState->x87.FCW   = 0x37f;
+        pXState->x87.FSW   = 0;
+        pXState->x87.FTW   = 0x00;         /* 0 - empty. */
+        pXState->x87.FPUDP = 0;
+        pXState->x87.DS    = 0; //??
+        pXState->x87.Rsrvd2= 0;
+        pXState->x87.FPUIP = 0;
+        pXState->x87.CS    = 0; //??
+        pXState->x87.Rsrvd1= 0;
+        pXState->x87.FOP   = 0;
     }
     else
     {
-        PX86FPUSTATE pFpu = (PX86FPUSTATE)&pCtx->XState.x87;
+        PX86FPUSTATE pFpu = (PX86FPUSTATE)&pXState->x87;
         pFpu->FCW       = 0x37f;
         pFpu->FSW       = 0;
@@ -6112,5 +6113,6 @@
     if (rcStrict != VINF_SUCCESS)
         return rcStrict;
-    PX86FXSTATE pDst = (PX86FXSTATE)pvMem512;
+    PX86FXSTATE  pDst = (PX86FXSTATE)pvMem512;
+    PCX86FXSTATE pSrc = &pCtx->CTX_SUFF(pXState)->x87;
 
     /*
@@ -6121,30 +6123,30 @@
 
     /* common for all formats */
-    pDst->FCW           = pCtx->XState.x87.FCW;
-    pDst->FSW           = pCtx->XState.x87.FSW;
-    pDst->FTW           = pCtx->XState.x87.FTW & UINT16_C(0xff);
-    pDst->FOP           = pCtx->XState.x87.FOP;
-    pDst->MXCSR         = pCtx->XState.x87.MXCSR;
-    pDst->MXCSR_MASK    = pCtx->XState.x87.MXCSR_MASK;
+    pDst->FCW           = pSrc->FCW;
+    pDst->FSW           = pSrc->FSW;
+    pDst->FTW           = pSrc->FTW & UINT16_C(0xff);
+    pDst->FOP           = pSrc->FOP;
+    pDst->MXCSR         = pSrc->MXCSR;
+    pDst->MXCSR_MASK    = pSrc->MXCSR_MASK;
     for (uint32_t i = 0; i < RT_ELEMENTS(pDst->aRegs); i++)
     {
         /** @todo Testcase: What actually happens to the 6 reserved bytes? I'm clearing
          *        them for now... */
-        pDst->aRegs[i].au32[0] = pCtx->XState.x87.aRegs[i].au32[0];
-        pDst->aRegs[i].au32[1] = pCtx->XState.x87.aRegs[i].au32[1];
-        pDst->aRegs[i].au32[2] = pCtx->XState.x87.aRegs[i].au32[2] & UINT32_C(0xffff);
+        pDst->aRegs[i].au32[0] = pSrc->aRegs[i].au32[0];
+        pDst->aRegs[i].au32[1] = pSrc->aRegs[i].au32[1];
+        pDst->aRegs[i].au32[2] = pSrc->aRegs[i].au32[2] & UINT32_C(0xffff);
         pDst->aRegs[i].au32[3] = 0;
     }
 
     /* FPU IP, CS, DP and DS. */
-    pDst->FPUIP  = pCtx->XState.x87.FPUIP;
-    pDst->CS     = pCtx->XState.x87.CS;
-    pDst->FPUDP  = pCtx->XState.x87.FPUDP;
-    pDst->DS     = pCtx->XState.x87.DS;
+    pDst->FPUIP  = pSrc->FPUIP;
+    pDst->CS     = pSrc->CS;
+    pDst->FPUDP  = pSrc->FPUDP;
+    pDst->DS     = pSrc->DS;
     if (enmEffOpSize == IEMMODE_64BIT)
     {
         /* Save upper 16-bits of FPUIP (IP:CS:Rsvd1) and FPUDP (DP:DS:Rsvd2). */
-        pDst->Rsrvd1 = pCtx->XState.x87.Rsrvd1;
-        pDst->Rsrvd2 = pCtx->XState.x87.Rsrvd2;
+        pDst->Rsrvd1 = pSrc->Rsrvd1;
+        pDst->Rsrvd2 = pSrc->Rsrvd2;
         pDst->au32RsrvdForSoftware[0] = 0;
     }
@@ -6163,5 +6165,5 @@
         uint32_t cXmmRegs = enmEffOpSize == IEMMODE_64BIT ? 16 : 8;
         for (uint32_t i = 0; i < cXmmRegs; i++)
-            pDst->aXMM[i] = pCtx->XState.x87.aXMM[i];
+            pDst->aXMM[i] = pSrc->aXMM[i];
         /** @todo Testcase: What happens to the reserved XMM registers? Untouched,
          *        right? */
@@ -6217,4 +6219,5 @@
         return rcStrict;
     PCX86FXSTATE pSrc = (PCX86FXSTATE)pvMem512;
+    PX86FXSTATE  pDst = &pCtx->CTX_SUFF(pXState)->x87;
 
     /*
@@ -6222,5 +6225,5 @@
      */
     uint32_t const fMXCSR      = pSrc->MXCSR;
-    uint32_t const fMXCSR_MASK = pCtx->XState.x87.MXCSR_MASK ? pCtx->XState.x87.MXCSR_MASK : UINT32_C(0xffbf);
+    uint32_t const fMXCSR_MASK = pDst->MXCSR_MASK ? pDst->MXCSR_MASK : UINT32_C(0xffbf);
     if (fMXCSR & ~fMXCSR_MASK)
     {
@@ -6236,16 +6239,16 @@
 
     /* common for all formats */
-    pCtx->XState.x87.FCW       = pSrc->FCW;
-    pCtx->XState.x87.FSW       = pSrc->FSW;
-    pCtx->XState.x87.FTW       = pSrc->FTW & UINT16_C(0xff);
-    pCtx->XState.x87.FOP       = pSrc->FOP;
-    pCtx->XState.x87.MXCSR     = fMXCSR;
+    pDst->FCW       = pSrc->FCW;
+    pDst->FSW       = pSrc->FSW;
+    pDst->FTW       = pSrc->FTW & UINT16_C(0xff);
+    pDst->FOP       = pSrc->FOP;
+    pDst->MXCSR     = fMXCSR;
     /* (MXCSR_MASK is read-only) */
     for (uint32_t i = 0; i < RT_ELEMENTS(pSrc->aRegs); i++)
     {
-        pCtx->XState.x87.aRegs[i].au32[0] = pSrc->aRegs[i].au32[0];
-        pCtx->XState.x87.aRegs[i].au32[1] = pSrc->aRegs[i].au32[1];
-        pCtx->XState.x87.aRegs[i].au32[2] = pSrc->aRegs[i].au32[2] & UINT32_C(0xffff);
-        pCtx->XState.x87.aRegs[i].au32[3] = 0;
+        pDst->aRegs[i].au32[0] = pSrc->aRegs[i].au32[0];
+        pDst->aRegs[i].au32[1] = pSrc->aRegs[i].au32[1];
+        pDst->aRegs[i].au32[2] = pSrc->aRegs[i].au32[2] & UINT32_C(0xffff);
+        pDst->aRegs[i].au32[3] = 0;
     }
 
@@ -6253,19 +6256,19 @@
     if (pIemCpu->enmCpuMode == IEMMODE_64BIT)
     {
-        pCtx->XState.x87.FPUIP  = pSrc->FPUIP;
-        pCtx->XState.x87.CS     = pSrc->CS;
-        pCtx->XState.x87.Rsrvd1 = pSrc->Rsrvd1;
-        pCtx->XState.x87.FPUDP  = pSrc->FPUDP;
-        pCtx->XState.x87.DS     = pSrc->DS;
-        pCtx->XState.x87.Rsrvd2 = pSrc->Rsrvd2;
+        pDst->FPUIP  = pSrc->FPUIP;
+        pDst->CS     = pSrc->CS;
+        pDst->Rsrvd1 = pSrc->Rsrvd1;
+        pDst->FPUDP  = pSrc->FPUDP;
+        pDst->DS     = pSrc->DS;
+        pDst->Rsrvd2 = pSrc->Rsrvd2;
     }
     else
     {
-        pCtx->XState.x87.FPUIP  = pSrc->FPUIP;
-        pCtx->XState.x87.CS     = pSrc->CS;
-        pCtx->XState.x87.Rsrvd1 = 0;
-        pCtx->XState.x87.FPUDP  = pSrc->FPUDP;
-        pCtx->XState.x87.DS     = pSrc->DS;
-        pCtx->XState.x87.Rsrvd2 = 0;
+        pDst->FPUIP  = pSrc->FPUIP;
+        pDst->CS     = pSrc->CS;
+        pDst->Rsrvd1 = 0;
+        pDst->FPUDP  = pSrc->FPUDP;
+        pDst->DS     = pSrc->DS;
+        pDst->Rsrvd2 = 0;
     }
 
@@ -6277,5 +6280,5 @@
         uint32_t cXmmRegs = enmEffOpSize == IEMMODE_64BIT ? 16 : 8;
         for (uint32_t i = 0; i < cXmmRegs; i++)
-            pCtx->XState.x87.aXMM[i] = pSrc->aXMM[i];
+            pDst->aXMM[i] = pSrc->aXMM[i];
     }
 
@@ -6301,9 +6304,10 @@
 static void iemCImplCommonFpuStoreEnv(PIEMCPU pIemCpu, IEMMODE enmEffOpSize, RTPTRUNION uPtr, PCCPUMCTX pCtx)
 {
+    PCX86FXSTATE pSrcX87 = &pCtx->CTX_SUFF(pXState)->x87;
     if (enmEffOpSize == IEMMODE_16BIT)
     {
-        uPtr.pu16[0] = pCtx->XState.x87.FCW;
-        uPtr.pu16[1] = pCtx->XState.x87.FSW;
-        uPtr.pu16[2] = iemFpuCalcFullFtw(pCtx);
+        uPtr.pu16[0] = pSrcX87->FCW;
+        uPtr.pu16[1] = pSrcX87->FSW;
+        uPtr.pu16[2] = iemFpuCalcFullFtw(pSrcX87);
         if (IEM_IS_REAL_OR_V86_MODE(pIemCpu))
         {
@@ -6313,15 +6317,15 @@
              *        effective address ((CS << 4) + IP) in the offset register and not
              *        doing any address calculations here. */
-            uPtr.pu16[3] = (uint16_t)pCtx->XState.x87.FPUIP;
-            uPtr.pu16[4] = ((pCtx->XState.x87.FPUIP >> 4) & UINT16_C(0xf000)) | pCtx->XState.x87.FOP;
-            uPtr.pu16[5] = (uint16_t)pCtx->XState.x87.FPUDP;
-            uPtr.pu16[6] = (pCtx->XState.x87.FPUDP  >> 4) & UINT16_C(0xf000);
+            uPtr.pu16[3] = (uint16_t)pSrcX87->FPUIP;
+            uPtr.pu16[4] = ((pSrcX87->FPUIP >> 4) & UINT16_C(0xf000)) | pSrcX87->FOP;
+            uPtr.pu16[5] = (uint16_t)pSrcX87->FPUDP;
+            uPtr.pu16[6] = (pSrcX87->FPUDP  >> 4) & UINT16_C(0xf000);
         }
         else
         {
-            uPtr.pu16[3] = pCtx->XState.x87.FPUIP;
-            uPtr.pu16[4] = pCtx->XState.x87.CS;
-            uPtr.pu16[5] = pCtx->XState.x87.FPUDP;
-            uPtr.pu16[6] = pCtx->XState.x87.DS;
+            uPtr.pu16[3] = pSrcX87->FPUIP;
+            uPtr.pu16[4] = pSrcX87->CS;
+            uPtr.pu16[5] = pSrcX87->FPUDP;
+            uPtr.pu16[6] = pSrcX87->DS;
         }
     }
@@ -6329,21 +6333,21 @@
     {
         /** @todo Testcase: what is stored in the "gray" areas? (figure 8-9 and 8-10) */
-        uPtr.pu16[0*2] = pCtx->XState.x87.FCW;
-        uPtr.pu16[1*2] = pCtx->XState.x87.FSW;
-        uPtr.pu16[2*2] = iemFpuCalcFullFtw(pCtx);
+        uPtr.pu16[0*2] = pSrcX87->FCW;
+        uPtr.pu16[1*2] = pSrcX87->FSW;
+        uPtr.pu16[2*2] = iemFpuCalcFullFtw(pSrcX87);
         if (IEM_IS_REAL_OR_V86_MODE(pIemCpu))
         {
-            uPtr.pu16[3*2]  = (uint16_t)pCtx->XState.x87.FPUIP;
-            uPtr.pu32[4]    = ((pCtx->XState.x87.FPUIP & UINT32_C(0xffff0000)) >> 4) | pCtx->XState.x87.FOP;
-            uPtr.pu16[5*2]  = (uint16_t)pCtx->XState.x87.FPUDP;
-            uPtr.pu32[6]    = (pCtx->XState.x87.FPUDP  & UINT32_C(0xffff0000)) >> 4;
+            uPtr.pu16[3*2]  = (uint16_t)pSrcX87->FPUIP;
+            uPtr.pu32[4]    = ((pSrcX87->FPUIP & UINT32_C(0xffff0000)) >> 4) | pSrcX87->FOP;
+            uPtr.pu16[5*2]  = (uint16_t)pSrcX87->FPUDP;
+            uPtr.pu32[6]    = (pSrcX87->FPUDP  & UINT32_C(0xffff0000)) >> 4;
         }
         else
         {
-            uPtr.pu32[3]    = pCtx->XState.x87.FPUIP;
-            uPtr.pu16[4*2]  = pCtx->XState.x87.CS;
-            uPtr.pu16[4*2+1]= pCtx->XState.x87.FOP;
-            uPtr.pu32[5]    = pCtx->XState.x87.FPUDP;
-            uPtr.pu16[6*2]  = pCtx->XState.x87.DS;
+            uPtr.pu32[3]    = pSrcX87->FPUIP;
+            uPtr.pu16[4*2]  = pSrcX87->CS;
+            uPtr.pu16[4*2+1]= pSrcX87->FOP;
+            uPtr.pu32[5]    = pSrcX87->FPUDP;
+            uPtr.pu16[6*2]  = pSrcX87->DS;
         }
     }
@@ -6359,27 +6363,28 @@
 static void iemCImplCommonFpuRestoreEnv(PIEMCPU pIemCpu, IEMMODE enmEffOpSize, RTCPTRUNION uPtr, PCPUMCTX pCtx)
 {
+    PX86FXSTATE pDstX87 = &pCtx->CTX_SUFF(pXState)->x87;
     if (enmEffOpSize == IEMMODE_16BIT)
     {
-        pCtx->XState.x87.FCW = uPtr.pu16[0];
-        pCtx->XState.x87.FSW = uPtr.pu16[1];
-        pCtx->XState.x87.FTW = uPtr.pu16[2];
+        pDstX87->FCW = uPtr.pu16[0];
+        pDstX87->FSW = uPtr.pu16[1];
+        pDstX87->FTW = uPtr.pu16[2];
         if (IEM_IS_REAL_OR_V86_MODE(pIemCpu))
         {
-            pCtx->XState.x87.FPUIP = uPtr.pu16[3] | ((uint32_t)(uPtr.pu16[4] & UINT16_C(0xf000)) << 4);
-            pCtx->XState.x87.FPUDP = uPtr.pu16[5] | ((uint32_t)(uPtr.pu16[6] & UINT16_C(0xf000)) << 4);
-            pCtx->XState.x87.FOP   = uPtr.pu16[4] & UINT16_C(0x07ff);
-            pCtx->XState.x87.CS    = 0;
-            pCtx->XState.x87.Rsrvd1= 0;
-            pCtx->XState.x87.DS    = 0;
-            pCtx->XState.x87.Rsrvd2= 0;
+            pDstX87->FPUIP = uPtr.pu16[3] | ((uint32_t)(uPtr.pu16[4] & UINT16_C(0xf000)) << 4);
+            pDstX87->FPUDP = uPtr.pu16[5] | ((uint32_t)(uPtr.pu16[6] & UINT16_C(0xf000)) << 4);
+            pDstX87->FOP   = uPtr.pu16[4] & UINT16_C(0x07ff);
+            pDstX87->CS    = 0;
+            pDstX87->Rsrvd1= 0;
+            pDstX87->DS    = 0;
+            pDstX87->Rsrvd2= 0;
         }
         else
         {
-            pCtx->XState.x87.FPUIP = uPtr.pu16[3];
-            pCtx->XState.x87.CS    = uPtr.pu16[4];
-            pCtx->XState.x87.Rsrvd1= 0;
-            pCtx->XState.x87.FPUDP = uPtr.pu16[5];
-            pCtx->XState.x87.DS    = uPtr.pu16[6];
-            pCtx->XState.x87.Rsrvd2= 0;
+            pDstX87->FPUIP = uPtr.pu16[3];
+            pDstX87->CS    = uPtr.pu16[4];
+            pDstX87->Rsrvd1= 0;
+            pDstX87->FPUDP = uPtr.pu16[5];
+            pDstX87->DS    = uPtr.pu16[6];
+            pDstX87->Rsrvd2= 0;
             /** @todo Testcase: Is FOP cleared when doing 16-bit protected mode fldenv? */
         }
@@ -6387,33 +6392,33 @@
     else
     {
-        pCtx->XState.x87.FCW = uPtr.pu16[0*2];
-        pCtx->XState.x87.FSW = uPtr.pu16[1*2];
-        pCtx->XState.x87.FTW = uPtr.pu16[2*2];
+        pDstX87->FCW = uPtr.pu16[0*2];
+        pDstX87->FSW = uPtr.pu16[1*2];
+        pDstX87->FTW = uPtr.pu16[2*2];
         if (IEM_IS_REAL_OR_V86_MODE(pIemCpu))
         {
-            pCtx->XState.x87.FPUIP = uPtr.pu16[3*2] | ((uPtr.pu32[4] & UINT32_C(0x0ffff000)) << 4);
-            pCtx->XState.x87.FOP   = uPtr.pu32[4] & UINT16_C(0x07ff);
-            pCtx->XState.x87.FPUDP = uPtr.pu16[5*2] | ((uPtr.pu32[6] & UINT32_C(0x0ffff000)) << 4);
-            pCtx->XState.x87.CS    = 0;
-            pCtx->XState.x87.Rsrvd1= 0;
-            pCtx->XState.x87.DS    = 0;
-            pCtx->XState.x87.Rsrvd2= 0;
+            pDstX87->FPUIP = uPtr.pu16[3*2] | ((uPtr.pu32[4] & UINT32_C(0x0ffff000)) << 4);
+            pDstX87->FOP   = uPtr.pu32[4] & UINT16_C(0x07ff);
+            pDstX87->FPUDP = uPtr.pu16[5*2] | ((uPtr.pu32[6] & UINT32_C(0x0ffff000)) << 4);
+            pDstX87->CS    = 0;
+            pDstX87->Rsrvd1= 0;
+            pDstX87->DS    = 0;
+            pDstX87->Rsrvd2= 0;
         }
         else
         {
-            pCtx->XState.x87.FPUIP = uPtr.pu32[3];
-            pCtx->XState.x87.CS    = uPtr.pu16[4*2];
-            pCtx->XState.x87.Rsrvd1= 0;
-            pCtx->XState.x87.FOP   = uPtr.pu16[4*2+1];
-            pCtx->XState.x87.FPUDP = uPtr.pu32[5];
-            pCtx->XState.x87.DS    = uPtr.pu16[6*2];
-            pCtx->XState.x87.Rsrvd2= 0;
+            pDstX87->FPUIP = uPtr.pu32[3];
+            pDstX87->CS    = uPtr.pu16[4*2];
+            pDstX87->Rsrvd1= 0;
+            pDstX87->FOP   = uPtr.pu16[4*2+1];
+            pDstX87->FPUDP = uPtr.pu32[5];
+            pDstX87->DS    = uPtr.pu16[6*2];
+            pDstX87->Rsrvd2= 0;
         }
     }
 
     /* Make adjustments. */
-    pCtx->XState.x87.FTW = iemFpuCompressFtw(pCtx->XState.x87.FTW);
-    pCtx->XState.x87.FCW &= ~X86_FCW_ZERO_MASK;
-    iemFpuRecalcExceptionStatus(pCtx);
+    pDstX87->FTW = iemFpuCompressFtw(pDstX87->FTW);
+    pDstX87->FCW &= ~X86_FCW_ZERO_MASK;
+    iemFpuRecalcExceptionStatus(pDstX87);
     /** @todo Testcase: Check if ES and/or B are automatically cleared if no
      *        exceptions are pending after loading the saved state? */
@@ -6464,11 +6469,12 @@
         return rcStrict;
 
+    PX86FXSTATE pFpuCtx = &pCtx->CTX_SUFF(pXState)->x87;
     iemCImplCommonFpuStoreEnv(pIemCpu, enmEffOpSize, uPtr, pCtx);
     PRTFLOAT80U paRegs = (PRTFLOAT80U)(uPtr.pu8 + (enmEffOpSize == IEMMODE_16BIT ? 14 : 28));
-    for (uint32_t i = 0; i < RT_ELEMENTS(pCtx->XState.x87.aRegs); i++)
-    {
-        paRegs[i].au32[0] = pCtx->XState.x87.aRegs[i].au32[0];
-        paRegs[i].au32[1] = pCtx->XState.x87.aRegs[i].au32[1];
-        paRegs[i].au16[4] = pCtx->XState.x87.aRegs[i].au16[4];
+    for (uint32_t i = 0; i < RT_ELEMENTS(pFpuCtx->aRegs); i++)
+    {
+        paRegs[i].au32[0] = pFpuCtx->aRegs[i].au32[0];
+        paRegs[i].au32[1] = pFpuCtx->aRegs[i].au32[1];
+        paRegs[i].au16[4] = pFpuCtx->aRegs[i].au16[4];
     }
 
@@ -6478,16 +6484,16 @@
 
     /*
-     * Re-initialize the XState.x87.
-     */
-    pCtx->XState.x87.FCW   = 0x37f;
-    pCtx->XState.x87.FSW   = 0;
-    pCtx->XState.x87.FTW   = 0x00;       /* 0 - empty */
-    pCtx->XState.x87.FPUDP = 0;
-    pCtx->XState.x87.DS    = 0;
-    pCtx->XState.x87.Rsrvd2= 0;
-    pCtx->XState.x87.FPUIP = 0;
-    pCtx->XState.x87.CS    = 0;
-    pCtx->XState.x87.Rsrvd1= 0;
-    pCtx->XState.x87.FOP   = 0;
+     * Re-initialize the FPU context.
+     */
+    pFpuCtx->FCW   = 0x37f;
+    pFpuCtx->FSW   = 0;
+    pFpuCtx->FTW   = 0x00;       /* 0 - empty */
+    pFpuCtx->FPUDP = 0;
+    pFpuCtx->DS    = 0;
+    pFpuCtx->Rsrvd2= 0;
+    pFpuCtx->FPUIP = 0;
+    pFpuCtx->CS    = 0;
+    pFpuCtx->Rsrvd1= 0;
+    pFpuCtx->FOP   = 0;
 
     iemHlpUsedFpu(pIemCpu);
@@ -6541,12 +6547,13 @@
         return rcStrict;
 
+    PX86FXSTATE pFpuCtx = &pCtx->CTX_SUFF(pXState)->x87;
     iemCImplCommonFpuRestoreEnv(pIemCpu, enmEffOpSize, uPtr, pCtx);
     PCRTFLOAT80U paRegs = (PCRTFLOAT80U)(uPtr.pu8 + (enmEffOpSize == IEMMODE_16BIT ? 14 : 28));
-    for (uint32_t i = 0; i < RT_ELEMENTS(pCtx->XState.x87.aRegs); i++)
-    {
-        pCtx->XState.x87.aRegs[i].au32[0] = paRegs[i].au32[0];
-        pCtx->XState.x87.aRegs[i].au32[1] = paRegs[i].au32[1];
-        pCtx->XState.x87.aRegs[i].au32[2] = paRegs[i].au16[4];
-        pCtx->XState.x87.aRegs[i].au32[3] = 0;
+    for (uint32_t i = 0; i < RT_ELEMENTS(pFpuCtx->aRegs); i++)
+    {
+        pFpuCtx->aRegs[i].au32[0] = paRegs[i].au32[0];
+        pFpuCtx->aRegs[i].au32[1] = paRegs[i].au32[1];
+        pFpuCtx->aRegs[i].au32[2] = paRegs[i].au16[4];
+        pFpuCtx->aRegs[i].au32[3] = 0;
     }
 
@@ -6575,6 +6582,7 @@
     /** @todo Testcase: Test that it raises and loweres the FPU exception bits
      *        according to FSW. (This is was is currently implemented.) */
-    pCtx->XState.x87.FCW = u16Fcw & ~X86_FCW_ZERO_MASK;
-    iemFpuRecalcExceptionStatus(pCtx);
+    PX86FXSTATE pFpuCtx = &pCtx->CTX_SUFF(pXState)->x87;
+    pFpuCtx->FCW = u16Fcw & ~X86_FCW_ZERO_MASK;
+    iemFpuRecalcExceptionStatus(pFpuCtx);
 
     /* Note: C0, C1, C2 and C3 are documented as undefined, we leave them untouched! */
@@ -6595,37 +6603,38 @@
     PCPUMCTX pCtx = pIemCpu->CTX_SUFF(pCtx);
 
-    unsigned const iReg1 = X86_FSW_TOP_GET(pCtx->XState.x87.FSW);
+    PX86FXSTATE pFpuCtx = &pCtx->CTX_SUFF(pXState)->x87;
+    unsigned const iReg1 = X86_FSW_TOP_GET(pFpuCtx->FSW);
     unsigned const iReg2 = (iReg1 + iStReg) & X86_FSW_TOP_SMASK;
-    Assert(!(RT_BIT(iReg1) & pCtx->XState.x87.FTW) || !(RT_BIT(iReg2) & pCtx->XState.x87.FTW));
+    Assert(!(RT_BIT(iReg1) & pFpuCtx->FTW) || !(RT_BIT(iReg2) & pFpuCtx->FTW));
 
     /** @todo Testcase: fxch underflow. Making assumptions that underflowed
      *        registers are read as QNaN and then exchanged. This could be
      *        wrong... */
-    if (pCtx->XState.x87.FCW & X86_FCW_IM)
-    {
-        if (RT_BIT(iReg1) & pCtx->XState.x87.FTW)
-        {
-            if (RT_BIT(iReg2) & pCtx->XState.x87.FTW)
-                iemFpuStoreQNan(&pCtx->XState.x87.aRegs[0].r80);
+    if (pFpuCtx->FCW & X86_FCW_IM)
+    {
+        if (RT_BIT(iReg1) & pFpuCtx->FTW)
+        {
+            if (RT_BIT(iReg2) & pFpuCtx->FTW)
+                iemFpuStoreQNan(&pFpuCtx->aRegs[0].r80);
             else
-                pCtx->XState.x87.aRegs[0].r80 = pCtx->XState.x87.aRegs[iStReg].r80;
-            iemFpuStoreQNan(&pCtx->XState.x87.aRegs[iStReg].r80);
+                pFpuCtx->aRegs[0].r80 = pFpuCtx->aRegs[iStReg].r80;
+            iemFpuStoreQNan(&pFpuCtx->aRegs[iStReg].r80);
         }
         else
         {
-            pCtx->XState.x87.aRegs[iStReg].r80 = pCtx->XState.x87.aRegs[0].r80;
-            iemFpuStoreQNan(&pCtx->XState.x87.aRegs[0].r80);
-        }
-        pCtx->XState.x87.FSW &= ~X86_FSW_C_MASK;
-        pCtx->XState.x87.FSW |= X86_FSW_C1 | X86_FSW_IE | X86_FSW_SF;
+            pFpuCtx->aRegs[iStReg].r80 = pFpuCtx->aRegs[0].r80;
+            iemFpuStoreQNan(&pFpuCtx->aRegs[0].r80);
+        }
+        pFpuCtx->FSW &= ~X86_FSW_C_MASK;
+        pFpuCtx->FSW |= X86_FSW_C1 | X86_FSW_IE | X86_FSW_SF;
     }
     else
     {
         /* raise underflow exception, don't change anything. */
-        pCtx->XState.x87.FSW &= ~(X86_FSW_TOP_MASK | X86_FSW_XCPT_MASK);
-        pCtx->XState.x87.FSW |= X86_FSW_C1 | X86_FSW_IE | X86_FSW_SF | X86_FSW_ES | X86_FSW_B;
-    }
-
-    iemFpuUpdateOpcodeAndIpWorker(pIemCpu, pCtx);
+        pFpuCtx->FSW &= ~(X86_FSW_TOP_MASK | X86_FSW_XCPT_MASK);
+        pFpuCtx->FSW |= X86_FSW_C1 | X86_FSW_IE | X86_FSW_SF | X86_FSW_ES | X86_FSW_B;
+    }
+
+    iemFpuUpdateOpcodeAndIpWorker(pIemCpu, pCtx, pFpuCtx);
     iemHlpUsedFpu(pIemCpu);
     iemRegAddToRipAndClearRF(pIemCpu, cbInstr);
@@ -6649,5 +6658,7 @@
     if (pCtx->cr0 & (X86_CR0_EM | X86_CR0_TS))
         return iemRaiseDeviceNotAvailable(pIemCpu);
-    uint16_t u16Fsw = pCtx->XState.x87.FSW;
+
+    PX86FXSTATE pFpuCtx = &pCtx->CTX_SUFF(pXState)->x87;
+    uint16_t u16Fsw = pFpuCtx->FSW;
     if (u16Fsw & X86_FSW_ES)
         return iemRaiseMathFault(pIemCpu);
@@ -6658,14 +6669,13 @@
     unsigned const iReg1 = X86_FSW_TOP_GET(u16Fsw);
     unsigned const iReg2 = (iReg1 + iStReg) & X86_FSW_TOP_SMASK;
-    if ((pCtx->XState.x87.FTW & (RT_BIT(iReg1) | RT_BIT(iReg2))) == (RT_BIT(iReg1) | RT_BIT(iReg2)))
-    {
-        uint32_t u32Eflags = pfnAImpl(&pCtx->XState.x87, &u16Fsw,
-                                      &pCtx->XState.x87.aRegs[0].r80, &pCtx->XState.x87.aRegs[iStReg].r80);
+    if ((pFpuCtx->FTW & (RT_BIT(iReg1) | RT_BIT(iReg2))) == (RT_BIT(iReg1) | RT_BIT(iReg2)))
+    {
+        uint32_t u32Eflags = pfnAImpl(pFpuCtx, &u16Fsw, &pFpuCtx->aRegs[0].r80, &pFpuCtx->aRegs[iStReg].r80);
         NOREF(u32Eflags);
 
-        pCtx->XState.x87.FSW &= ~X86_FSW_C1;
-        pCtx->XState.x87.FSW |= u16Fsw & ~X86_FSW_TOP_MASK;
+        pFpuCtx->FSW &= ~X86_FSW_C1;
+        pFpuCtx->FSW |= u16Fsw & ~X86_FSW_TOP_MASK;
         if (   !(u16Fsw & X86_FSW_IE)
-            || (pCtx->XState.x87.FCW & X86_FCW_IM) )
+            || (pFpuCtx->FCW & X86_FCW_IM) )
         {
             pCtx->eflags.u &= ~(X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF);
@@ -6673,9 +6683,9 @@
         }
     }
-    else if (pCtx->XState.x87.FCW & X86_FCW_IM)
+    else if (pFpuCtx->FCW & X86_FCW_IM)
     {
         /* Masked underflow. */
-        pCtx->XState.x87.FSW &= ~X86_FSW_C1;
-        pCtx->XState.x87.FSW |= X86_FSW_IE | X86_FSW_SF;
+        pFpuCtx->FSW &= ~X86_FSW_C1;
+        pFpuCtx->FSW |= X86_FSW_IE | X86_FSW_SF;
         pCtx->eflags.u &= ~(X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF);
         pCtx->eflags.u |= X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF;
@@ -6684,6 +6694,6 @@
     {
         /* Raise underflow - don't touch EFLAGS or TOP. */
-        pCtx->XState.x87.FSW &= ~X86_FSW_C1;
-        pCtx->XState.x87.FSW |= X86_FSW_IE | X86_FSW_SF | X86_FSW_ES | X86_FSW_B;
+        pFpuCtx->FSW &= ~X86_FSW_C1;
+        pFpuCtx->FSW |= X86_FSW_IE | X86_FSW_SF | X86_FSW_ES | X86_FSW_B;
         fPop = false;
     }
@@ -6694,10 +6704,10 @@
     if (fPop)
     {
-        pCtx->XState.x87.FTW &= ~RT_BIT(iReg1);
-        pCtx->XState.x87.FSW &= X86_FSW_TOP_MASK;
-        pCtx->XState.x87.FSW |= ((iReg1 + 7) & X86_FSW_TOP_SMASK) << X86_FSW_TOP_SHIFT;
-    }
-
-    iemFpuUpdateOpcodeAndIpWorker(pIemCpu, pCtx);
+        pFpuCtx->FTW &= ~RT_BIT(iReg1);
+        pFpuCtx->FSW &= X86_FSW_TOP_MASK;
+        pFpuCtx->FSW |= ((iReg1 + 7) & X86_FSW_TOP_SMASK) << X86_FSW_TOP_SHIFT;
+    }
+
+    iemFpuUpdateOpcodeAndIpWorker(pIemCpu, pCtx, pFpuCtx);
     iemHlpUsedFpu(pIemCpu);
     iemRegAddToRipAndClearRF(pIemCpu, cbInstr);
Index: /trunk/src/VBox/VMM/VMMR0/CPUMR0.cpp
===================================================================
--- /trunk/src/VBox/VMM/VMMR0/CPUMR0.cpp	(revision 55047)
+++ /trunk/src/VBox/VMM/VMMR0/CPUMR0.cpp	(revision 55048)
@@ -488,5 +488,5 @@
          *        We could just all this in assembly. */
         uint128_t aGuestXmmRegs[16];
-        memcpy(&aGuestXmmRegs[0], &pVCpu->cpum.s.Guest.XState.x87.aXMM[0], sizeof(aGuestXmmRegs));
+        memcpy(&aGuestXmmRegs[0], &pVCpu->cpum.s.Guest.CTX_SUFF(pXState)->x87.aXMM[0], sizeof(aGuestXmmRegs));
 #endif
 
@@ -511,5 +511,5 @@
 
 #ifdef VBOX_WITH_KERNEL_USING_XMM
-        memcpy(&pVCpu->cpum.s.Guest.XState.x87.aXMM[0], &aGuestXmmRegs[0], sizeof(aGuestXmmRegs));
+        memcpy(&pVCpu->cpum.s.Guest.CTX_SUFF(pXState)->x87.aXMM[0], &aGuestXmmRegs[0], sizeof(aGuestXmmRegs));
 #endif
     }
Index: /trunk/src/VBox/VMM/VMMR0/CPUMR0A.asm
===================================================================
--- /trunk/src/VBox/VMM/VMMR0/CPUMR0A.asm	(revision 55047)
+++ /trunk/src/VBox/VMM/VMMR0/CPUMR0A.asm	(revision 55048)
@@ -1,5 +1,5 @@
 ; $Id$
 ;; @file
-; CPUM - Guest Context Assembly Routines.
+; CPUM - Ring-0 Assembly Routines (supporting HM and IEM).
 ;
 
@@ -59,4 +59,8 @@
 %ifdef VBOX_WITH_HYBRID_32BIT_KERNEL
 BEGINDATA
+%if 0 ; Currently not used.
+g_r32_Zero:    dd 0.0
+%endif
+
 ;;
 ; Store the SUPR0AbsIs64bit absolute value here so we can cmp/test without
@@ -70,28 +74,35 @@
 BEGINCODE
 
-;; Macro for FXSAVE/FXRSTOR leaky behaviour on AMD CPUs, see cpumR3CheckLeakyFpu().
+%if 0 ; Currently not used anywhere.
+;;
+; Macro for FXSAVE/FXRSTOR leaky behaviour on AMD CPUs, see cpumR3CheckLeakyFpu().
+;
 ; Cleans the FPU state, if necessary, before restoring the FPU.
 ;
 ; This macro ASSUMES CR0.TS is not set!
-; @remarks Trashes xAX!!
+;
+; @param    xDX     Pointer to CPUMCPU.
+; @uses     xAX, EFLAGS
+;
 ; Changes here should also be reflected in CPUMRCA.asm's copy!
+;
 %macro CLEANFPU 0
-    test    dword [xDX + CPUMCPU.fUseFlags], CPUM_USE_FFXSR_LEAKY
-    jz      .nothing_to_clean
-
-    xor     eax, eax
-    fnstsw  ax               ; Get FSW
-    test    eax, RT_BIT(7)   ; If FSW.ES (bit 7) is set, clear it to not cause FPU exceptions
-                             ; while clearing & loading the FPU bits in 'clean_fpu'
-    jz      .clean_fpu
-    fnclex
+        test    dword [xDX + CPUMCPU.fUseFlags], CPUM_USE_FFXSR_LEAKY
+        jz      .nothing_to_clean
+
+        xor     eax, eax
+        fnstsw  ax                      ; FSW -> AX.
+        test    eax, RT_BIT(7)          ; If FSW.ES (bit 7) is set, clear it to not cause FPU exceptions
+                                        ; while clearing & loading the FPU bits in 'clean_fpu' below.
+        jz      .clean_fpu
+        fnclex
 
 .clean_fpu:
-    ffree   st7              ; Clear FPU stack register(7)'s tag entry to prevent overflow if a wraparound occurs
-                             ; for the upcoming push (load)
-    fild    dword [xDX + CPUMCPU.Guest.XState] ; Explicit FPU load to overwrite FIP, FOP, FDP registers in the FPU.
-
+        ffree   st7                     ; Clear FPU stack register(7)'s tag entry to prevent overflow if a wraparound occurs.
+                                        ; for the upcoming push (load)
+        fild    dword [g_r32_Zero xWrtRIP] ; Explicit FPU load to overwrite FIP, FOP, FDP registers in the FPU.
 .nothing_to_clean:
 %endmacro
+%endif ; Unused.
 
 
@@ -99,65 +110,75 @@
 ;  save the 32-bit FPU state or 64-bit FPU state.
 ;
-; @remarks Requires CPUMCPU pointer in RDX
-%macro SAVE_32_OR_64_FPU 0
-    o64 fxsave  [rdx + CPUMCPU.Guest.XState]
-
-    ; Shouldn't be necessary to check if the entire 64-bit FIP is 0 (i.e. guest hasn't used its FPU yet) because it should
-    ; be taken care of by the calling code, i.e. hmR0[Vmx|Svm]LoadSharedCR0() and hmR0[Vmx|Svm]ExitXcptNm() which ensure
-    ; we swap the guest FPU state when it starts using it (#NM). In any case it's only a performance optimization.
-    ; cmp         qword [rdx + CPUMCPU.Guest.XState + IP_OFF_IN_X86FXSTATE], 0
-    ; je          short %%save_done
-
-    cmp         dword [rdx + CPUMCPU.Guest.XState + CS_OFF_IN_X86FXSTATE], 0
-    jne         short %%save_done
-    sub         rsp, 20h                         ; Only need 1ch bytes but keep stack aligned otherwise we #GP(0)
-    fnstenv     [rsp]
-    movzx       eax, word [rsp + 10h]
-    mov         [rdx + CPUMCPU.Guest.XState + CS_OFF_IN_X86FXSTATE], eax
-    movzx       eax, word [rsp + 18h]
-    mov         [rdx + CPUMCPU.Guest.XState + DS_OFF_IN_X86FXSTATE], eax
-    add         rsp, 20h
-    mov         dword [rdx + CPUMCPU.Guest.XState + X86_OFF_FXSTATE_RSVD], X86_FXSTATE_RSVD_32BIT_MAGIC
+; @param    %1      Pointer to CPUMCPU.
+; @param    %2      Pointer to XState.
+; @uses     xAX, xDX, EFLAGS, 20h of stack.
+;
+%macro SAVE_32_OR_64_FPU 2
+        o64 fxsave [%2]
+
+        xor     edx, edx
+        cmp     dword [%2 + CS_OFF_IN_X86FXSTATE], 0
+        jne     short %%save_done
+
+        sub     rsp, 20h                ; Only need 1ch bytes but keep stack aligned otherwise we #GP(0).
+        fnstenv [rsp]
+        movzx   eax, word [rsp + 10h]
+        mov     [%2 + CS_OFF_IN_X86FXSTATE], eax
+        movzx   eax, word [rsp + 18h]
+        add     rsp, 20h
+        mov     [%2 + DS_OFF_IN_X86FXSTATE], eax
+        mov     edx, X86_FXSTATE_RSVD_32BIT_MAGIC
+
 %%save_done:
+        mov     dword [%2 + X86_OFF_FXSTATE_RSVD], edx
 %endmacro
 
-;; Macro for FXRSTOR for the guest FPU but loads the one based on what
-;  was saved before using SAVE_32_OR_64_FPU().
-;
-; @remarks Requires CPUMCPU pointer in RDX
-%macro RESTORE_32_OR_64_FPU 0
-    cmp         dword [rdx + CPUMCPU.Guest.XState + X86_OFF_FXSTATE_RSVD], X86_FXSTATE_RSVD_32BIT_MAGIC
-    jne         short %%restore_64bit_fpu
-    fxrstor     [rdx + CPUMCPU.Guest.XState]
-    jmp         short %%restore_fpu_done
+;;
+; Wrapper for selecting 32-bit or 64-bit FXRSTOR according to what SAVE_32_OR_64_FPU did.
+;
+; @param    %1      Pointer to CPUMCPU.
+; @param    %2      Pointer to XState.
+; @uses     xAX, xDX, EFLAGS
+;
+%macro RESTORE_32_OR_64_FPU 2
+        cmp     dword [%2 + X86_OFF_FXSTATE_RSVD], X86_FXSTATE_RSVD_32BIT_MAGIC
+        jne     short %%restore_64bit_fpu
+        fxrstor [%2]
+        jmp     short %%restore_fpu_done
 %%restore_64bit_fpu:
-    o64 fxrstor [rdx + CPUMCPU.Guest.XState]
+        o64 fxrstor [%2]
 %%restore_fpu_done:
 %endmacro
 
 
-;; Macro to save and modify CR0 (if necessary) before touching the FPU state
-;  so as to not cause any FPU exceptions.
-;
-; @remarks Uses xCX for backing-up CR0 (if CR0 needs to be modified) otherwise clears xCX.
-; @remarks Trashes xAX.
-%macro SAVE_CR0_CLEAR_FPU_TRAPS 0
-    xor     ecx, ecx
-    mov     xAX, cr0
-    test    eax, X86_CR0_TS | X86_CR0_EM    ; Make sure its safe to access the FPU state.
-    jz      %%skip_cr0_write
-    mov     xCX, xAX                        ; Save old CR0
-    and     xAX, ~(X86_CR0_TS | X86_CR0_EM)
-    mov     cr0, xAX
+;;
+; Clears CR0.TS and CR0.EM if necessary, saving the previous result.
+;
+; This is used to avoid FPU exceptions when touching the FPU state.
+;
+; @param    %1      Register to save the old CR0 in (pass to RESTORE_CR0).
+; @param    %2      Temporary scratch register.
+; @uses     EFLAGS, CR0
+;
+%macro SAVE_CR0_CLEAR_FPU_TRAPS 2
+        xor     %1, %1
+        mov     %2, cr0
+        test    %2, X86_CR0_TS | X86_CR0_EM ; Make sure its safe to access the FPU state.
+        jz      %%skip_cr0_write
+        mov     %1, %2                  ; Save old CR0
+        and     %2, ~(X86_CR0_TS | X86_CR0_EM)
+        mov     cr0, %2
 %%skip_cr0_write:
 %endmacro
 
-;; Macro to restore CR0 from xCX if necessary.
-;
-; @remarks xCX should contain the CR0 value to restore or 0 if no restoration is needed.
-%macro RESTORE_CR0 0
-    cmp     ecx, 0
-    je      %%skip_cr0_restore
-    mov     cr0, xCX
+;;
+; Restore CR0.TS and CR0.EM state if SAVE_CR0_CLEAR_FPU_TRAPS changed it.
+;
+; @param    %1      The register that SAVE_CR0_CLEAR_FPU_TRAPS saved the old CR0 in.
+;
+%macro RESTORE_CR0 1
+        cmp     %1, 0
+        je      %%skip_cr0_restore
+        mov     cr0, %1
 %%skip_cr0_restore:
 %endmacro
@@ -165,76 +186,92 @@
 
 ;;
-; Saves the host FPU/XMM state and restores the guest state.
+; Saves the host FPU/SSE/AVX state and restores the guest FPU/SSE/AVX state.
 ;
 ; @returns  0
-; @param    pCPUMCPU  x86:[esp+4] gcc:rdi msc:rcx     CPUMCPU pointer
+; @param    pCpumCpu  x86:[esp+4] gcc:rdi msc:rcx     CPUMCPU pointer
 ;
 align 16
 BEGINPROC cpumR0SaveHostRestoreGuestFPUState
+        ;
+        ; Prologue - xAX+xDX must be free for XSAVE/XRSTOR input.
+        ;
 %ifdef RT_ARCH_AMD64
  %ifdef RT_OS_WINDOWS
-    mov     xDX, rcx
+        mov     r11, rcx
  %else
-    mov     xDX, rdi
+        mov     r11, rdi
  %endif
-%else
-    mov     xDX, dword [esp + 4]
-%endif
-    pushf                               ; The darwin kernel can get upset or upset things if an
-    cli                                 ; interrupt occurs while we're doing fxsave/fxrstor/cr0.
-
-    ; Switch the state.
-    or      dword [xDX + CPUMCPU.fUseFlags], (CPUM_USED_FPU | CPUM_USED_FPU_SINCE_REM)
-
-    ; Clear CR0 FPU bits to not cause exceptions, uses xCX
-    SAVE_CR0_CLEAR_FPU_TRAPS
-    ; Do NOT use xCX from this point!
-
-%ifdef VBOX_WITH_HYBRID_32BIT_KERNEL
-    cmp     byte [NAME(g_fCPUMIs64bitHost)], 0
-    jz      .legacy_mode
-    db      0xea                        ; jmp far .sixtyfourbit_mode
-    dd      .sixtyfourbit_mode, NAME(SUPR0Abs64bitKernelCS)
+ %define pCpumCpu   r11
+ %define pXState    r10
+%else
+        push    ebx
+        push    esi
+        mov     ebx, dword [esp + 4]
+ %define pCpumCpu ebx
+ %define pXState  esi
+%endif
+
+        pushf                           ; The darwin kernel can get upset or upset things if an
+        cli                             ; interrupt occurs while we're doing fxsave/fxrstor/cr0.
+
+        SAVE_CR0_CLEAR_FPU_TRAPS xCX, xAX ; xCX is now old CR0 value, don't use!
+
+        ;
+        ; Switch state.
+        ;
+        mov     pXState, [pCpumCpu + CPUMCPU.Host.pXStateR0]
+
+%ifdef VBOX_WITH_HYBRID_32BIT_KERNEL
+        cmp     byte [NAME(g_fCPUMIs64bitHost)], 0
+        jz      .legacy_mode
+        db      0xea                    ; jmp far .sixtyfourbit_mode
+        dd      .sixtyfourbit_mode, NAME(SUPR0Abs64bitKernelCS)
 .legacy_mode:
-%endif ; VBOX_WITH_HYBRID_32BIT_KERNEL
-
-%ifdef RT_ARCH_AMD64
-    ; Use explicit REX prefix. See @bugref{6398}.
-    o64 fxsave  [rdx + CPUMCPU.Host.XState] ; ASSUMES that all VT-x/AMD-V boxes sports fxsave/fxrstor (safe assumption)
-
-    ; Restore the guest FPU (32-bit or 64-bit), preserves existing broken state. See @bugref{7138}.
-    test    dword [rdx + CPUMCPU.fUseFlags], CPUM_USE_SUPPORTS_LONGMODE
-    jnz     short .fpu_load_32_or_64
-    fxrstor [rdx + CPUMCPU.Guest.XState]
-    jmp     short .fpu_load_done
+%endif
+
+%ifdef RT_ARCH_AMD64
+        o64 fxsave [pXState]            ; Use explicit REX prefix. See @bugref{6398}.
+
+        ; Restore the guest FPU (32-bit or 64-bit), preserves existing broken state. See @bugref{7138}.
+        mov     pXState, [pCpumCpu + CPUMCPU.Guest.pXStateR0]
+        test    dword [pCpumCpu + CPUMCPU.fUseFlags], CPUM_USE_SUPPORTS_LONGMODE
+        jnz     short .fpu_load_32_or_64
+        fxrstor [pXState]
+        jmp     short .fpu_load_done
 .fpu_load_32_or_64:
-    RESTORE_32_OR_64_FPU
+        RESTORE_32_OR_64_FPU pCpumCpu, pXState
 .fpu_load_done:
 %else
-    fxsave  [edx + CPUMCPU.Host.XState]     ; ASSUMES that all VT-x/AMD-V boxes sports fxsave/fxrstor (safe assumption)
-    fxrstor [edx + CPUMCPU.Guest.XState]
+        fxsave  [pXState]
+        mov     pXState, [pCpumCpu + CPUMCPU.Guest.pXStateR0]
+        fxrstor [pXState]
 %endif
 
 %ifdef VBOX_WITH_KERNEL_USING_XMM
-    ; Restore the non-volatile xmm registers. ASSUMING 64-bit windows
-    lea     r11, [xDX + CPUMCPU.Host.XState + XMM_OFF_IN_X86FXSTATE]
-    movdqa  xmm6,  [r11 + 060h]
-    movdqa  xmm7,  [r11 + 070h]
-    movdqa  xmm8,  [r11 + 080h]
-    movdqa  xmm9,  [r11 + 090h]
-    movdqa  xmm10, [r11 + 0a0h]
-    movdqa  xmm11, [r11 + 0b0h]
-    movdqa  xmm12, [r11 + 0c0h]
-    movdqa  xmm13, [r11 + 0d0h]
-    movdqa  xmm14, [r11 + 0e0h]
-    movdqa  xmm15, [r11 + 0f0h]
+        ; Restore the non-volatile xmm registers. ASSUMING 64-bit host.
+        mov     pXState, [pCpumCpu + CPUMCPU.Host.pXStateR0]
+        movdqa  xmm6,  [pXState + XMM_OFF_IN_X86FXSTATE + 060h]
+        movdqa  xmm7,  [pXState + XMM_OFF_IN_X86FXSTATE + 070h]
+        movdqa  xmm8,  [pXState + XMM_OFF_IN_X86FXSTATE + 080h]
+        movdqa  xmm9,  [pXState + XMM_OFF_IN_X86FXSTATE + 090h]
+        movdqa  xmm10, [pXState + XMM_OFF_IN_X86FXSTATE + 0a0h]
+        movdqa  xmm11, [pXState + XMM_OFF_IN_X86FXSTATE + 0b0h]
+        movdqa  xmm12, [pXState + XMM_OFF_IN_X86FXSTATE + 0c0h]
+        movdqa  xmm13, [pXState + XMM_OFF_IN_X86FXSTATE + 0d0h]
+        movdqa  xmm14, [pXState + XMM_OFF_IN_X86FXSTATE + 0e0h]
+        movdqa  xmm15, [pXState + XMM_OFF_IN_X86FXSTATE + 0f0h]
 %endif
 
 .done:
-    ; Restore CR0 from xCX if it was previously saved.
-    RESTORE_CR0
-    popf
-    xor     eax, eax
-    ret
+        RESTORE_CR0 xCX
+        or      dword [pCpumCpu + CPUMCPU.fUseFlags], (CPUM_USED_FPU | CPUM_USED_FPU_SINCE_REM)
+        popf
+
+%ifdef RT_ARCH_X86
+        pop     esi
+        pop     ebx
+%endif
+        xor     eax, eax
+        ret
 
 %ifdef VBOX_WITH_HYBRID_32BIT_KERNEL_IN_R0
@@ -242,19 +279,19 @@
 BITS 64
 .sixtyfourbit_mode:
-    and     edx, 0ffffffffh
-    o64 fxsave  [rdx + CPUMCPU.Host.XState]
-
-    ; Restore the guest FPU (32-bit or 64-bit), preserves existing broken state. See @bugref{7138}.
-    test    dword [rdx + CPUMCPU.fUseFlags], CPUM_USE_SUPPORTS_LONGMODE
-    jnz     short .fpu_load_32_or_64_darwin
-    fxrstor [rdx + CPUMCPU.Guest.XState]
-    jmp     short .fpu_load_done_darwin
+        o64 fxsave  [pXState]
+
+        ; Restore the guest FPU (32-bit or 64-bit), preserves existing broken state. See @bugref{7138}.
+        mov     pXState, [pCpumCpu + CPUMCPU.Guest.pXStateR0]
+        test    dword [pCpumCpu + CPUMCPU.fUseFlags], CPUM_USE_SUPPORTS_LONGMODE
+        jnz     short .fpu_load_32_or_64_darwin
+        fxrstor [pXState]
+        jmp     short .fpu_load_done_darwin
 .fpu_load_32_or_64_darwin:
-    RESTORE_32_OR_64_FPU
+        RESTORE_32_OR_64_FPU pCpumCpu, pXState
 .fpu_load_done_darwin:
 
-    jmp far [.fpret wrt rip]
+        jmp far [.fpret wrt rip]
 .fpret:                                 ; 16:32 Pointer to .the_end.
-    dd      .done, NAME(SUPR0AbsKernelCS)
+        dd      .done, NAME(SUPR0AbsKernelCS)
 BITS 32
 %endif
@@ -266,30 +303,78 @@
 %ifndef VBOX_WITH_HYBRID_32BIT_KERNEL
 ;;
-; Saves the host FPU/XMM state
-;
-; @returns  0
-; @param    pCPUMCPU  x86:[esp+4] gcc:rdi msc:rcx     CPUMCPU pointer
+; Saves the host FPU/SSE/AVX state.
+;
+; @returns  VINF_SUCCESS (0) in EAX
+; @param    pCpumCpu  x86:[esp+4] gcc:rdi msc:rcx     CPUMCPU pointer
 ;
 align 16
 BEGINPROC cpumR0SaveHostFPUState
-    mov     xDX, dword [esp + 4]
-    pushf                               ; The darwin kernel can get upset or upset things if an
-    cli                                 ; interrupt occurs while we're doing fxsave/fxrstor/cr0.
-
-    ; Switch the state.
-    or      dword [xDX + CPUMCPU.fUseFlags], (CPUM_USED_FPU | CPUM_USED_FPU_SINCE_REM)
-
-    ; Clear CR0 FPU bits to not cause exceptions, uses xCX
-    SAVE_CR0_CLEAR_FPU_TRAPS
-    ; Do NOT use xCX from this point!
-
-    fxsave  [xDX + CPUMCPU.Host.XState] ; ASSUMES that all VT-x/AMD-V boxes support fxsave/fxrstor (safe assumption)
-
-    ; Restore CR0 from xCX if it was saved previously.
-    RESTORE_CR0
-
-    popf
-    xor     eax, eax
-    ret
+        ;
+        ; Prologue - xAX+xDX must be free for XSAVE/XRSTOR input.
+        ;
+%ifdef RT_ARCH_AMD64
+ %ifdef RT_OS_WINDOWS
+        mov     r11, rcx
+ %else
+        mov     r11, rdi
+ %endif
+ %define pCpumCpu   r11
+ %define pXState    r10
+%else
+        push    ebx
+        push    esi
+        mov     ebx, dword [esp + 4]
+ %define pCpumCpu ebx
+ %define pXState  esi
+%endif
+
+        pushf                           ; The darwin kernel can get upset or upset things if an
+        cli                             ; interrupt occurs while we're doing fxsave/fxrstor/cr0.
+        SAVE_CR0_CLEAR_FPU_TRAPS xCX, xAX ; xCX is now old CR0 value, don't use!
+
+        ;
+        ; Save the host state.
+        ;
+        mov     pXState, [pCpumCpu + CPUMCPU.Host.pXStateR0]
+
+%ifdef VBOX_WITH_HYBRID_32BIT_KERNEL
+        cmp     byte [NAME(g_fCPUMIs64bitHost)], 0
+        jz      .legacy_mode
+        db      0xea                    ; jmp far .sixtyfourbit_mode
+        dd      .sixtyfourbit_mode, NAME(SUPR0Abs64bitKernelCS)
+.legacy_mode:
+%endif
+
+%ifdef RT_ARCH_AMD64
+        o64 fxsave [pXState]
+%else
+        fxsave  [pXState]
+%endif
+
+.done:
+        RESTORE_CR0 xCX
+        or      dword [pCpumCpu + CPUMCPU.fUseFlags], (CPUM_USED_FPU | CPUM_USED_FPU_SINCE_REM)
+        popf
+
+%ifdef RT_ARCH_X86
+        pop     esi
+        pop     ebx
+%endif
+        xor     eax, eax
+        ret
+
+%ifdef VBOX_WITH_HYBRID_32BIT_KERNEL_IN_R0
+ALIGNCODE(16)
+BITS 64
+.sixtyfourbit_mode:
+        ; Save the guest FPU (32-bit or 64-bit), preserves existing broken state. See @bugref{7138}.
+        o64 fxsave [pXState]
+        jmp far [.fpret wrt rip]
+.fpret:                                 ; 16:32 Pointer to .the_end.
+        dd      .done, NAME(SUPR0AbsKernelCS)
+BITS 32
+%endif
+%undef pCpumCpu
+%undef pXState
 ENDPROC   cpumR0SaveHostFPUState
 %endif
@@ -299,66 +384,80 @@
 
 ;;
-; Saves the guest FPU/XMM state and restores the host state.
-;
-; @returns  0
-; @param    pCPUMCPU  x86:[esp+4] gcc:rdi msc:rcx     CPUMCPU pointer
+; Saves the guest FPU/SSE/AVX state and restores the host FPU/SSE/AVX state.
+;
+; @returns  VINF_SUCCESS (0) in eax.
+; @param    pCpumCpu  x86:[esp+4] gcc:rdi msc:rcx     CPUMCPU pointer
 ;
 align 16
 BEGINPROC cpumR0SaveGuestRestoreHostFPUState
+        ;
+        ; Prologue - xAX+xDX must be free for XSAVE/XRSTOR input.
+        ;
 %ifdef RT_ARCH_AMD64
  %ifdef RT_OS_WINDOWS
-    mov     xDX, rcx
+        mov     r11, rcx
  %else
-    mov     xDX, rdi
+        mov     r11, rdi
  %endif
-%else
-    mov     xDX, dword [esp + 4]
-%endif
-
-    ; Only restore FPU if guest has used it.
-    ; Using fxrstor should ensure that we're not causing unwanted exception on the host.
-    test    dword [xDX + CPUMCPU.fUseFlags], CPUM_USED_FPU
-    jz      .fpu_not_used
-
-    pushf                               ; The darwin kernel can get upset or upset things if an
-    cli                                 ; interrupt occurs while we're doing fxsave/fxrstor/cr0.
-
-    ; Clear CR0 FPU bits to not cause exceptions, uses xCX
-    SAVE_CR0_CLEAR_FPU_TRAPS
-    ; Do NOT use xCX from this point!
-
-%ifdef VBOX_WITH_HYBRID_32BIT_KERNEL
-    cmp     byte [NAME(g_fCPUMIs64bitHost)], 0
-    jz      .legacy_mode
-    db      0xea                        ; jmp far .sixtyfourbit_mode
-    dd      .sixtyfourbit_mode, NAME(SUPR0Abs64bitKernelCS)
+ %define pCpumCpu   r11
+ %define pXState    r10
+%else
+        push    ebx
+        push    esi
+        mov     ebx, dword [esp + 4]
+ %define pCpumCpu ebx
+ %define pXState  esi
+%endif
+
+        ;
+        ; Only restore FPU if guest has used it.
+        ;
+        test    dword [pCpumCpu + CPUMCPU.fUseFlags], CPUM_USED_FPU
+        jz      .fpu_not_used
+
+        pushf                           ; The darwin kernel can get upset or upset things if an
+        cli                             ; interrupt occurs while we're doing fxsave/fxrstor/cr0.
+        SAVE_CR0_CLEAR_FPU_TRAPS xCX, xAX ; xCX is now old CR0 value, don't use!
+
+        mov     pXState, [pCpumCpu + CPUMCPU.Guest.pXStateR0]
+
+%ifdef VBOX_WITH_HYBRID_32BIT_KERNEL
+        cmp     byte [NAME(g_fCPUMIs64bitHost)], 0
+        jz      .legacy_mode
+        db      0xea                    ; jmp far .sixtyfourbit_mode
+        dd      .sixtyfourbit_mode, NAME(SUPR0Abs64bitKernelCS)
 .legacy_mode:
-%endif ; VBOX_WITH_HYBRID_32BIT_KERNEL
-
-%ifdef RT_ARCH_AMD64
-    ; Save the guest FPU (32-bit or 64-bit), preserves existing broken state. See @bugref{7138}.
-    test    dword [rdx + CPUMCPU.fUseFlags], CPUM_USE_SUPPORTS_LONGMODE
-    jnz     short .fpu_save_32_or_64
-    fxsave  [rdx + CPUMCPU.Guest.XState]
-    jmp     short .fpu_save_done
+%endif
+
+%ifdef RT_ARCH_AMD64
+        ; Save the guest FPU (32-bit or 64-bit), preserves existing broken state. See @bugref{7138}.
+        test    dword [pCpumCpu + CPUMCPU.fUseFlags], CPUM_USE_SUPPORTS_LONGMODE
+        jnz     short .fpu_save_32_or_64
+        fxsave  [pXState]
+        jmp     short .fpu_save_done
 .fpu_save_32_or_64:
-    SAVE_32_OR_64_FPU
+        SAVE_32_OR_64_FPU pCpumCpu, pXState
 .fpu_save_done:
 
-    ; Use explicit REX prefix. See @bugref{6398}.
-    o64 fxrstor [rdx + CPUMCPU.Host.XState]
-%else
-    fxsave  [edx + CPUMCPU.Guest.XState]    ; ASSUMES that all VT-x/AMD-V boxes support fxsave/fxrstor (safe assumption)
-    fxrstor [edx + CPUMCPU.Host.XState]
+        mov     pXState, [pCpumCpu + CPUMCPU.Host.pXStateR0]
+        o64 fxrstor [pXState]           ; Use explicit REX prefix. See @bugref{6398}.
+%else
+        fxsave  [pXState]               ; ASSUMES that all VT-x/AMD-V boxes support fxsave/fxrstor (safe assumption)
+        mov     pXState, [pCpumCpu + CPUMCPU.Host.pXStateR0]
+        fxrstor [pXState]
 %endif
 
 .done:
-    ; Restore CR0 from xCX if it was previously saved.
-    RESTORE_CR0
-    and     dword [xDX + CPUMCPU.fUseFlags], ~CPUM_USED_FPU
-    popf
+        RESTORE_CR0 xCX
+        and     dword [pCpumCpu + CPUMCPU.fUseFlags], ~CPUM_USED_FPU
+        popf
+
 .fpu_not_used:
-    xor     eax, eax
-    ret
+%ifdef RT_ARCH_X86
+        pop     esi
+        pop     ebx
+%endif
+        xor     eax, eax
+        ret
 
 %ifdef VBOX_WITH_HYBRID_32BIT_KERNEL_IN_R0
@@ -366,76 +465,90 @@
 BITS 64
 .sixtyfourbit_mode:
-    and     edx, 0ffffffffh
-
-    ; Save the guest FPU (32-bit or 64-bit), preserves existing broken state. See @bugref{7138}.
-    test    dword [rdx + CPUMCPU.fUseFlags], CPUM_USE_SUPPORTS_LONGMODE
-    jnz     short .fpu_save_32_or_64_darwin
-    fxsave  [rdx + CPUMCPU.Guest.XState]
-    jmp     short .fpu_save_done_darwin
+        ; Save the guest FPU (32-bit or 64-bit), preserves existing broken state. See @bugref{7138}.
+        test    dword [pCpumCpu + CPUMCPU.fUseFlags], CPUM_USE_SUPPORTS_LONGMODE
+        jnz     short .fpu_save_32_or_64_darwin
+        fxsave  [pXState]
+        jmp     short .fpu_save_done_darwin
 .fpu_save_32_or_64_darwin:
-    SAVE_32_OR_64_FPU
+        SAVE_32_OR_64_FPU pCpumCpu, pXState
 .fpu_save_done_darwin:
 
-    o64 fxrstor [rdx + CPUMCPU.Host.XState]
-    jmp far [.fpret wrt rip]
+        mov     pXState, [pCpumCpu + CPUMCPU.Host.pXStateR0]
+        o64 fxrstor [pXState]
+        jmp far [.fpret wrt rip]
 .fpret:                                 ; 16:32 Pointer to .the_end.
-    dd      .done, NAME(SUPR0AbsKernelCS)
+        dd      .done, NAME(SUPR0AbsKernelCS)
 BITS 32
 %endif
+%undef pCpumCpu
+%undef pXState
 ENDPROC   cpumR0SaveGuestRestoreHostFPUState
 
 
 ;;
-; Sets the host's FPU/XMM state
+; Restores the host's FPU/SSE/AVX state from pCpumCpu->Host.
 ;
 ; @returns  0
-; @param    pCPUMCPU  x86:[esp+4] gcc:rdi msc:rcx     CPUMCPU pointer
+; @param    pCpumCpu  x86:[esp+4] gcc:rdi msc:rcx     CPUMCPU pointer
 ;
 align 16
 BEGINPROC cpumR0RestoreHostFPUState
+        ;
+        ; Prologue - xAX+xDX must be free for XSAVE/XRSTOR input.
+        ;
 %ifdef RT_ARCH_AMD64
  %ifdef RT_OS_WINDOWS
-    mov     xDX, rcx
+        mov     r11, rcx
  %else
-    mov     xDX, rdi
+        mov     r11, rdi
  %endif
-%else
-    mov     xDX, dword [esp + 4]
-%endif
-
-    ; Restore FPU if guest has used it.
-    ; Using fxrstor should ensure that we're not causing unwanted exception on the host.
-    test    dword [xDX + CPUMCPU.fUseFlags], CPUM_USED_FPU
-    jz short .fpu_not_used
-
-    pushf                               ; The darwin kernel can get upset or upset things if an
-    cli                                 ; interrupt occurs while we're doing fxsave/fxrstor/cr0.
-
-    ; Clear CR0 FPU bits to not cause exceptions, uses xCX
-    SAVE_CR0_CLEAR_FPU_TRAPS
-    ; Do NOT use xCX from this point!
-
-%ifdef VBOX_WITH_HYBRID_32BIT_KERNEL
-    cmp     byte [NAME(g_fCPUMIs64bitHost)], 0
-    jz      .legacy_mode
-    db      0xea                        ; jmp far .sixtyfourbit_mode
-    dd      .sixtyfourbit_mode, NAME(SUPR0Abs64bitKernelCS)
+ %define pCpumCpu   r11
+ %define pXState    r10
+%else
+        push    ebx
+        push    esi
+        mov     ebx, dword [esp + 4]
+ %define pCpumCpu ebx
+ %define pXState  esi
+%endif
+
+        ;
+        ; Restore FPU if guest has used it.
+        ;
+        test    dword [pCpumCpu + CPUMCPU.fUseFlags], CPUM_USED_FPU
+        jz short .fpu_not_used
+
+        pushf                           ; The darwin kernel can get upset or upset things if an
+        cli                             ; interrupt occurs while we're doing fxsave/fxrstor/cr0.
+        SAVE_CR0_CLEAR_FPU_TRAPS xCX, xAX ; xCX is now old CR0 value, don't use!
+
+        mov     pXState, [pCpumCpu + CPUMCPU.Host.pXStateR0]
+
+%ifdef VBOX_WITH_HYBRID_32BIT_KERNEL
+        cmp     byte [NAME(g_fCPUMIs64bitHost)], 0
+        jz      .legacy_mode
+        db      0xea                    ; jmp far .sixtyfourbit_mode
+        dd      .sixtyfourbit_mode, NAME(SUPR0Abs64bitKernelCS)
 .legacy_mode:
-%endif ; VBOX_WITH_HYBRID_32BIT_KERNEL
-
-%ifdef RT_ARCH_AMD64
-    o64 fxrstor [xDX + CPUMCPU.Host.XState]
-%else
-    fxrstor [xDX + CPUMCPU.Host.XState]
+%endif
+
+%ifdef RT_ARCH_AMD64
+        o64 fxrstor [pXState]
+%else
+        fxrstor [pXState]
 %endif
 
 .done:
-    ; Restore CR0 from xCX if it was previously saved.
-    RESTORE_CR0
-    and     dword [xDX + CPUMCPU.fUseFlags], ~CPUM_USED_FPU
-    popf
+        RESTORE_CR0 xCX
+        and     dword [pCpumCpu + CPUMCPU.fUseFlags], ~CPUM_USED_FPU
+        popf
+
 .fpu_not_used:
-    xor     eax, eax
-    ret
+%ifdef RT_ARCH_X86
+        pop     esi
+        pop     ebx
+%endif
+        xor     eax, eax
+        ret
 
 %ifdef VBOX_WITH_HYBRID_32BIT_KERNEL_IN_R0
@@ -443,11 +556,12 @@
 BITS 64
 .sixtyfourbit_mode:
-    and     edx, 0ffffffffh
-    o64 fxrstor [rdx + CPUMCPU.Host.XState]
-    jmp far [.fpret wrt rip]
+        o64 fxrstor [pXState]
+        jmp far [.fpret wrt rip]
 .fpret:                                 ; 16:32 Pointer to .the_end.
-    dd      .done, NAME(SUPR0AbsKernelCS)
+        dd      .done, NAME(SUPR0AbsKernelCS)
 BITS 32
 %endif
+%undef pCpumCpu
+%undef pXState
 ENDPROC   cpumR0RestoreHostFPUState
 
Index: /trunk/src/VBox/VMM/VMMR0/HMR0.cpp
===================================================================
--- /trunk/src/VBox/VMM/VMMR0/HMR0.cpp	(revision 55047)
+++ /trunk/src/VBox/VMM/VMMR0/HMR0.cpp	(revision 55048)
@@ -1971,4 +1971,5 @@
              pCtx->SysEnter.cs, pCtx->SysEnter.eip, pCtx->SysEnter.esp));
 
+    PX86FXSTATE pFpuCtx = &pCtx->CTX_SUFF(pXState)->x87;
     Log(("FPU:\n"
         "FCW=%04x FSW=%04x FTW=%02x\n"
@@ -1976,8 +1977,8 @@
         "FPUDP=%04x DS=%04x Rsvrd2=%04x MXCSR=%08x MXCSR_MASK=%08x\n"
         ,
-        pCtx->XState.x87.FCW, pCtx->XState.x87.FSW, pCtx->XState.x87.FTW,
-        pCtx->XState.x87.FOP, pCtx->XState.x87.FPUIP, pCtx->XState.x87.CS, pCtx->XState.x87.Rsrvd1,
-        pCtx->XState.x87.FPUDP, pCtx->XState.x87.DS, pCtx->XState.x87.Rsrvd2,
-        pCtx->XState.x87.MXCSR, pCtx->XState.x87.MXCSR_MASK));
+        pFpuCtx->FCW,   pFpuCtx->FSW,   pFpuCtx->FTW,
+        pFpuCtx->FOP,   pFpuCtx->FPUIP, pFpuCtx->CS, pFpuCtx->Rsrvd1,
+        pFpuCtx->FPUDP, pFpuCtx->DS,    pFpuCtx->Rsrvd2,
+        pFpuCtx->MXCSR, pFpuCtx->MXCSR_MASK));
 
     Log(("MSR:\n"
Index: /trunk/src/VBox/VMM/VMMR0/HMR0A.asm
===================================================================
--- /trunk/src/VBox/VMM/VMMR0/HMR0A.asm	(revision 55047)
+++ /trunk/src/VBox/VMM/VMMR0/HMR0A.asm	(revision 55048)
@@ -1186,21 +1186,21 @@
         ; Load the full guest XMM register state.
         mov     r10, [xBP + 018h]       ; pCtx
-        lea     r10, [r10 + XMM_OFF_IN_X86FXSTATE]
-        movdqa  xmm0,  [r10 + 000h]
-        movdqa  xmm1,  [r10 + 010h]
-        movdqa  xmm2,  [r10 + 020h]
-        movdqa  xmm3,  [r10 + 030h]
-        movdqa  xmm4,  [r10 + 040h]
-        movdqa  xmm5,  [r10 + 050h]
-        movdqa  xmm6,  [r10 + 060h]
-        movdqa  xmm7,  [r10 + 070h]
-        movdqa  xmm8,  [r10 + 080h]
-        movdqa  xmm9,  [r10 + 090h]
-        movdqa  xmm10, [r10 + 0a0h]
-        movdqa  xmm11, [r10 + 0b0h]
-        movdqa  xmm12, [r10 + 0c0h]
-        movdqa  xmm13, [r10 + 0d0h]
-        movdqa  xmm14, [r10 + 0e0h]
-        movdqa  xmm15, [r10 + 0f0h]
+        mov     r10, [r10 + CPUMCTX.pXStateR0]
+        movdqa  xmm0,  [r10 + XMM_OFF_IN_X86FXSTATE + 000h]
+        movdqa  xmm1,  [r10 + XMM_OFF_IN_X86FXSTATE + 010h]
+        movdqa  xmm2,  [r10 + XMM_OFF_IN_X86FXSTATE + 020h]
+        movdqa  xmm3,  [r10 + XMM_OFF_IN_X86FXSTATE + 030h]
+        movdqa  xmm4,  [r10 + XMM_OFF_IN_X86FXSTATE + 040h]
+        movdqa  xmm5,  [r10 + XMM_OFF_IN_X86FXSTATE + 050h]
+        movdqa  xmm6,  [r10 + XMM_OFF_IN_X86FXSTATE + 060h]
+        movdqa  xmm7,  [r10 + XMM_OFF_IN_X86FXSTATE + 070h]
+        movdqa  xmm8,  [r10 + XMM_OFF_IN_X86FXSTATE + 080h]
+        movdqa  xmm9,  [r10 + XMM_OFF_IN_X86FXSTATE + 090h]
+        movdqa  xmm10, [r10 + XMM_OFF_IN_X86FXSTATE + 0a0h]
+        movdqa  xmm11, [r10 + XMM_OFF_IN_X86FXSTATE + 0b0h]
+        movdqa  xmm12, [r10 + XMM_OFF_IN_X86FXSTATE + 0c0h]
+        movdqa  xmm13, [r10 + XMM_OFF_IN_X86FXSTATE + 0d0h]
+        movdqa  xmm14, [r10 + XMM_OFF_IN_X86FXSTATE + 0e0h]
+        movdqa  xmm15, [r10 + XMM_OFF_IN_X86FXSTATE + 0f0h]
 
         ; Make the call (same as in the other case ).
@@ -1216,21 +1216,21 @@
         ; Save the guest XMM registers.
         mov     r10, [xBP + 018h]       ; pCtx
-        lea     r10, [r10 + XMM_OFF_IN_X86FXSTATE]
-        movdqa  [r10 + 000h], xmm0
-        movdqa  [r10 + 010h], xmm1
-        movdqa  [r10 + 020h], xmm2
-        movdqa  [r10 + 030h], xmm3
-        movdqa  [r10 + 040h], xmm4
-        movdqa  [r10 + 050h], xmm5
-        movdqa  [r10 + 060h], xmm6
-        movdqa  [r10 + 070h], xmm7
-        movdqa  [r10 + 080h], xmm8
-        movdqa  [r10 + 090h], xmm9
-        movdqa  [r10 + 0a0h], xmm10
-        movdqa  [r10 + 0b0h], xmm11
-        movdqa  [r10 + 0c0h], xmm12
-        movdqa  [r10 + 0d0h], xmm13
-        movdqa  [r10 + 0e0h], xmm14
-        movdqa  [r10 + 0f0h], xmm15
+        mov     r10, [r10 + CPUMCTX.pXStateR0]
+        movdqa  [r10 + XMM_OFF_IN_X86FXSTATE + 000h], xmm0
+        movdqa  [r10 + XMM_OFF_IN_X86FXSTATE + 010h], xmm1
+        movdqa  [r10 + XMM_OFF_IN_X86FXSTATE + 020h], xmm2
+        movdqa  [r10 + XMM_OFF_IN_X86FXSTATE + 030h], xmm3
+        movdqa  [r10 + XMM_OFF_IN_X86FXSTATE + 040h], xmm4
+        movdqa  [r10 + XMM_OFF_IN_X86FXSTATE + 050h], xmm5
+        movdqa  [r10 + XMM_OFF_IN_X86FXSTATE + 060h], xmm6
+        movdqa  [r10 + XMM_OFF_IN_X86FXSTATE + 070h], xmm7
+        movdqa  [r10 + XMM_OFF_IN_X86FXSTATE + 080h], xmm8
+        movdqa  [r10 + XMM_OFF_IN_X86FXSTATE + 090h], xmm9
+        movdqa  [r10 + XMM_OFF_IN_X86FXSTATE + 0a0h], xmm10
+        movdqa  [r10 + XMM_OFF_IN_X86FXSTATE + 0b0h], xmm11
+        movdqa  [r10 + XMM_OFF_IN_X86FXSTATE + 0c0h], xmm12
+        movdqa  [r10 + XMM_OFF_IN_X86FXSTATE + 0d0h], xmm13
+        movdqa  [r10 + XMM_OFF_IN_X86FXSTATE + 0e0h], xmm14
+        movdqa  [r10 + XMM_OFF_IN_X86FXSTATE + 0f0h], xmm15
 
         ; Load the host XMM registers.
@@ -1314,21 +1314,21 @@
         ; Load the full guest XMM register state.
         mov     r10, [xBP + 020h]       ; pCtx
-        lea     r10, [r10 + XMM_OFF_IN_X86FXSTATE]
-        movdqa  xmm0,  [r10 + 000h]
-        movdqa  xmm1,  [r10 + 010h]
-        movdqa  xmm2,  [r10 + 020h]
-        movdqa  xmm3,  [r10 + 030h]
-        movdqa  xmm4,  [r10 + 040h]
-        movdqa  xmm5,  [r10 + 050h]
-        movdqa  xmm6,  [r10 + 060h]
-        movdqa  xmm7,  [r10 + 070h]
-        movdqa  xmm8,  [r10 + 080h]
-        movdqa  xmm9,  [r10 + 090h]
-        movdqa  xmm10, [r10 + 0a0h]
-        movdqa  xmm11, [r10 + 0b0h]
-        movdqa  xmm12, [r10 + 0c0h]
-        movdqa  xmm13, [r10 + 0d0h]
-        movdqa  xmm14, [r10 + 0e0h]
-        movdqa  xmm15, [r10 + 0f0h]
+        mov     r10, [r10 + CPUMCTX.pXStateR0]
+        movdqa  xmm0,  [r10 + XMM_OFF_IN_X86FXSTATE + 000h]
+        movdqa  xmm1,  [r10 + XMM_OFF_IN_X86FXSTATE + 010h]
+        movdqa  xmm2,  [r10 + XMM_OFF_IN_X86FXSTATE + 020h]
+        movdqa  xmm3,  [r10 + XMM_OFF_IN_X86FXSTATE + 030h]
+        movdqa  xmm4,  [r10 + XMM_OFF_IN_X86FXSTATE + 040h]
+        movdqa  xmm5,  [r10 + XMM_OFF_IN_X86FXSTATE + 050h]
+        movdqa  xmm6,  [r10 + XMM_OFF_IN_X86FXSTATE + 060h]
+        movdqa  xmm7,  [r10 + XMM_OFF_IN_X86FXSTATE + 070h]
+        movdqa  xmm8,  [r10 + XMM_OFF_IN_X86FXSTATE + 080h]
+        movdqa  xmm9,  [r10 + XMM_OFF_IN_X86FXSTATE + 090h]
+        movdqa  xmm10, [r10 + XMM_OFF_IN_X86FXSTATE + 0a0h]
+        movdqa  xmm11, [r10 + XMM_OFF_IN_X86FXSTATE + 0b0h]
+        movdqa  xmm12, [r10 + XMM_OFF_IN_X86FXSTATE + 0c0h]
+        movdqa  xmm13, [r10 + XMM_OFF_IN_X86FXSTATE + 0d0h]
+        movdqa  xmm14, [r10 + XMM_OFF_IN_X86FXSTATE + 0e0h]
+        movdqa  xmm15, [r10 + XMM_OFF_IN_X86FXSTATE + 0f0h]
 
         ; Make the call (same as in the other case ).
@@ -1344,21 +1344,21 @@
         ; Save the guest XMM registers.
         mov     r10, [xBP + 020h]       ; pCtx
-        lea     r10, [r10 + XMM_OFF_IN_X86FXSTATE]
-        movdqa  [r10 + 000h], xmm0
-        movdqa  [r10 + 010h], xmm1
-        movdqa  [r10 + 020h], xmm2
-        movdqa  [r10 + 030h], xmm3
-        movdqa  [r10 + 040h], xmm4
-        movdqa  [r10 + 050h], xmm5
-        movdqa  [r10 + 060h], xmm6
-        movdqa  [r10 + 070h], xmm7
-        movdqa  [r10 + 080h], xmm8
-        movdqa  [r10 + 090h], xmm9
-        movdqa  [r10 + 0a0h], xmm10
-        movdqa  [r10 + 0b0h], xmm11
-        movdqa  [r10 + 0c0h], xmm12
-        movdqa  [r10 + 0d0h], xmm13
-        movdqa  [r10 + 0e0h], xmm14
-        movdqa  [r10 + 0f0h], xmm15
+        mov     r10, [r10 + CPUMCTX.pXStateR0]
+        movdqa  [r10 + XMM_OFF_IN_X86FXSTATE + 000h], xmm0
+        movdqa  [r10 + XMM_OFF_IN_X86FXSTATE + 010h], xmm1
+        movdqa  [r10 + XMM_OFF_IN_X86FXSTATE + 020h], xmm2
+        movdqa  [r10 + XMM_OFF_IN_X86FXSTATE + 030h], xmm3
+        movdqa  [r10 + XMM_OFF_IN_X86FXSTATE + 040h], xmm4
+        movdqa  [r10 + XMM_OFF_IN_X86FXSTATE + 050h], xmm5
+        movdqa  [r10 + XMM_OFF_IN_X86FXSTATE + 060h], xmm6
+        movdqa  [r10 + XMM_OFF_IN_X86FXSTATE + 070h], xmm7
+        movdqa  [r10 + XMM_OFF_IN_X86FXSTATE + 080h], xmm8
+        movdqa  [r10 + XMM_OFF_IN_X86FXSTATE + 090h], xmm9
+        movdqa  [r10 + XMM_OFF_IN_X86FXSTATE + 0a0h], xmm10
+        movdqa  [r10 + XMM_OFF_IN_X86FXSTATE + 0b0h], xmm11
+        movdqa  [r10 + XMM_OFF_IN_X86FXSTATE + 0c0h], xmm12
+        movdqa  [r10 + XMM_OFF_IN_X86FXSTATE + 0d0h], xmm13
+        movdqa  [r10 + XMM_OFF_IN_X86FXSTATE + 0e0h], xmm14
+        movdqa  [r10 + XMM_OFF_IN_X86FXSTATE + 0f0h], xmm15
 
         ; Load the host XMM registers.
Index: /trunk/src/VBox/VMM/VMMR3/CPUM.cpp
===================================================================
--- /trunk/src/VBox/VMM/VMMR3/CPUM.cpp	(revision 55047)
+++ /trunk/src/VBox/VMM/VMMR3/CPUM.cpp	(revision 55048)
@@ -112,42 +112,48 @@
 *******************************************************************************/
 /** Saved state field descriptors for CPUMCTX. */
+static const SSMFIELD g_aCpumX87Fields[] =
+{
+    SSMFIELD_ENTRY(         X86FXSTATE, FCW),
+    SSMFIELD_ENTRY(         X86FXSTATE, FSW),
+    SSMFIELD_ENTRY(         X86FXSTATE, FTW),
+    SSMFIELD_ENTRY(         X86FXSTATE, FOP),
+    SSMFIELD_ENTRY(         X86FXSTATE, FPUIP),
+    SSMFIELD_ENTRY(         X86FXSTATE, CS),
+    SSMFIELD_ENTRY(         X86FXSTATE, Rsrvd1),
+    SSMFIELD_ENTRY(         X86FXSTATE, FPUDP),
+    SSMFIELD_ENTRY(         X86FXSTATE, DS),
+    SSMFIELD_ENTRY(         X86FXSTATE, Rsrvd2),
+    SSMFIELD_ENTRY(         X86FXSTATE, MXCSR),
+    SSMFIELD_ENTRY(         X86FXSTATE, MXCSR_MASK),
+    SSMFIELD_ENTRY(         X86FXSTATE, aRegs[0]),
+    SSMFIELD_ENTRY(         X86FXSTATE, aRegs[1]),
+    SSMFIELD_ENTRY(         X86FXSTATE, aRegs[2]),
+    SSMFIELD_ENTRY(         X86FXSTATE, aRegs[3]),
+    SSMFIELD_ENTRY(         X86FXSTATE, aRegs[4]),
+    SSMFIELD_ENTRY(         X86FXSTATE, aRegs[5]),
+    SSMFIELD_ENTRY(         X86FXSTATE, aRegs[6]),
+    SSMFIELD_ENTRY(         X86FXSTATE, aRegs[7]),
+    SSMFIELD_ENTRY(         X86FXSTATE, aXMM[0]),
+    SSMFIELD_ENTRY(         X86FXSTATE, aXMM[1]),
+    SSMFIELD_ENTRY(         X86FXSTATE, aXMM[2]),
+    SSMFIELD_ENTRY(         X86FXSTATE, aXMM[3]),
+    SSMFIELD_ENTRY(         X86FXSTATE, aXMM[4]),
+    SSMFIELD_ENTRY(         X86FXSTATE, aXMM[5]),
+    SSMFIELD_ENTRY(         X86FXSTATE, aXMM[6]),
+    SSMFIELD_ENTRY(         X86FXSTATE, aXMM[7]),
+    SSMFIELD_ENTRY(         X86FXSTATE, aXMM[8]),
+    SSMFIELD_ENTRY(         X86FXSTATE, aXMM[9]),
+    SSMFIELD_ENTRY(         X86FXSTATE, aXMM[10]),
+    SSMFIELD_ENTRY(         X86FXSTATE, aXMM[11]),
+    SSMFIELD_ENTRY(         X86FXSTATE, aXMM[12]),
+    SSMFIELD_ENTRY(         X86FXSTATE, aXMM[13]),
+    SSMFIELD_ENTRY(         X86FXSTATE, aXMM[14]),
+    SSMFIELD_ENTRY(         X86FXSTATE, aXMM[15]),
+    SSMFIELD_ENTRY_TERM()
+};
+
+/** Saved state field descriptors for CPUMCTX. */
 static const SSMFIELD g_aCpumCtxFields[] =
 {
-    SSMFIELD_ENTRY(         CPUMCTX, XState.x87.FCW),
-    SSMFIELD_ENTRY(         CPUMCTX, XState.x87.FSW),
-    SSMFIELD_ENTRY(         CPUMCTX, XState.x87.FTW),
-    SSMFIELD_ENTRY(         CPUMCTX, XState.x87.FOP),
-    SSMFIELD_ENTRY(         CPUMCTX, XState.x87.FPUIP),
-    SSMFIELD_ENTRY(         CPUMCTX, XState.x87.CS),
-    SSMFIELD_ENTRY(         CPUMCTX, XState.x87.Rsrvd1),
-    SSMFIELD_ENTRY(         CPUMCTX, XState.x87.FPUDP),
-    SSMFIELD_ENTRY(         CPUMCTX, XState.x87.DS),
-    SSMFIELD_ENTRY(         CPUMCTX, XState.x87.Rsrvd2),
-    SSMFIELD_ENTRY(         CPUMCTX, XState.x87.MXCSR),
-    SSMFIELD_ENTRY(         CPUMCTX, XState.x87.MXCSR_MASK),
-    SSMFIELD_ENTRY(         CPUMCTX, XState.x87.aRegs[0]),
-    SSMFIELD_ENTRY(         CPUMCTX, XState.x87.aRegs[1]),
-    SSMFIELD_ENTRY(         CPUMCTX, XState.x87.aRegs[2]),
-    SSMFIELD_ENTRY(         CPUMCTX, XState.x87.aRegs[3]),
-    SSMFIELD_ENTRY(         CPUMCTX, XState.x87.aRegs[4]),
-    SSMFIELD_ENTRY(         CPUMCTX, XState.x87.aRegs[5]),
-    SSMFIELD_ENTRY(         CPUMCTX, XState.x87.aRegs[6]),
-    SSMFIELD_ENTRY(         CPUMCTX, XState.x87.aRegs[7]),
-    SSMFIELD_ENTRY(         CPUMCTX, XState.x87.aXMM[0]),
-    SSMFIELD_ENTRY(         CPUMCTX, XState.x87.aXMM[1]),
-    SSMFIELD_ENTRY(         CPUMCTX, XState.x87.aXMM[2]),
-    SSMFIELD_ENTRY(         CPUMCTX, XState.x87.aXMM[3]),
-    SSMFIELD_ENTRY(         CPUMCTX, XState.x87.aXMM[4]),
-    SSMFIELD_ENTRY(         CPUMCTX, XState.x87.aXMM[5]),
-    SSMFIELD_ENTRY(         CPUMCTX, XState.x87.aXMM[6]),
-    SSMFIELD_ENTRY(         CPUMCTX, XState.x87.aXMM[7]),
-    SSMFIELD_ENTRY(         CPUMCTX, XState.x87.aXMM[8]),
-    SSMFIELD_ENTRY(         CPUMCTX, XState.x87.aXMM[9]),
-    SSMFIELD_ENTRY(         CPUMCTX, XState.x87.aXMM[10]),
-    SSMFIELD_ENTRY(         CPUMCTX, XState.x87.aXMM[11]),
-    SSMFIELD_ENTRY(         CPUMCTX, XState.x87.aXMM[12]),
-    SSMFIELD_ENTRY(         CPUMCTX, XState.x87.aXMM[13]),
-    SSMFIELD_ENTRY(         CPUMCTX, XState.x87.aXMM[14]),
-    SSMFIELD_ENTRY(         CPUMCTX, XState.x87.aXMM[15]),
     SSMFIELD_ENTRY(         CPUMCTX, rdi),
     SSMFIELD_ENTRY(         CPUMCTX, rsi),
@@ -246,43 +252,50 @@
 /** Saved state field descriptors for CPUMCTX in V4.1 before the hidden selector
  * registeres changed. */
+static const SSMFIELD g_aCpumX87FieldsMem[] =
+{
+    SSMFIELD_ENTRY(         X86FXSTATE, FCW),
+    SSMFIELD_ENTRY(         X86FXSTATE, FSW),
+    SSMFIELD_ENTRY(         X86FXSTATE, FTW),
+    SSMFIELD_ENTRY(         X86FXSTATE, FOP),
+    SSMFIELD_ENTRY(         X86FXSTATE, FPUIP),
+    SSMFIELD_ENTRY(         X86FXSTATE, CS),
+    SSMFIELD_ENTRY(         X86FXSTATE, Rsrvd1),
+    SSMFIELD_ENTRY(         X86FXSTATE, FPUDP),
+    SSMFIELD_ENTRY(         X86FXSTATE, DS),
+    SSMFIELD_ENTRY(         X86FXSTATE, Rsrvd2),
+    SSMFIELD_ENTRY(         X86FXSTATE, MXCSR),
+    SSMFIELD_ENTRY(         X86FXSTATE, MXCSR_MASK),
+    SSMFIELD_ENTRY(         X86FXSTATE, aRegs[0]),
+    SSMFIELD_ENTRY(         X86FXSTATE, aRegs[1]),
+    SSMFIELD_ENTRY(         X86FXSTATE, aRegs[2]),
+    SSMFIELD_ENTRY(         X86FXSTATE, aRegs[3]),
+    SSMFIELD_ENTRY(         X86FXSTATE, aRegs[4]),
+    SSMFIELD_ENTRY(         X86FXSTATE, aRegs[5]),
+    SSMFIELD_ENTRY(         X86FXSTATE, aRegs[6]),
+    SSMFIELD_ENTRY(         X86FXSTATE, aRegs[7]),
+    SSMFIELD_ENTRY(         X86FXSTATE, aXMM[0]),
+    SSMFIELD_ENTRY(         X86FXSTATE, aXMM[1]),
+    SSMFIELD_ENTRY(         X86FXSTATE, aXMM[2]),
+    SSMFIELD_ENTRY(         X86FXSTATE, aXMM[3]),
+    SSMFIELD_ENTRY(         X86FXSTATE, aXMM[4]),
+    SSMFIELD_ENTRY(         X86FXSTATE, aXMM[5]),
+    SSMFIELD_ENTRY(         X86FXSTATE, aXMM[6]),
+    SSMFIELD_ENTRY(         X86FXSTATE, aXMM[7]),
+    SSMFIELD_ENTRY(         X86FXSTATE, aXMM[8]),
+    SSMFIELD_ENTRY(         X86FXSTATE, aXMM[9]),
+    SSMFIELD_ENTRY(         X86FXSTATE, aXMM[10]),
+    SSMFIELD_ENTRY(         X86FXSTATE, aXMM[11]),
+    SSMFIELD_ENTRY(         X86FXSTATE, aXMM[12]),
+    SSMFIELD_ENTRY(         X86FXSTATE, aXMM[13]),
+    SSMFIELD_ENTRY(         X86FXSTATE, aXMM[14]),
+    SSMFIELD_ENTRY(         X86FXSTATE, aXMM[15]),
+    SSMFIELD_ENTRY_IGNORE(  X86FXSTATE, au32RsrvdRest),
+    SSMFIELD_ENTRY_IGNORE(  X86FXSTATE, au32RsrvdForSoftware),
+};
+
+/** Saved state field descriptors for CPUMCTX in V4.1 before the hidden selector
+ * registeres changed. */
 static const SSMFIELD g_aCpumCtxFieldsMem[] =
 {
-    SSMFIELD_ENTRY(         CPUMCTX, XState.x87.FCW),
-    SSMFIELD_ENTRY(         CPUMCTX, XState.x87.FSW),
-    SSMFIELD_ENTRY(         CPUMCTX, XState.x87.FTW),
-    SSMFIELD_ENTRY(         CPUMCTX, XState.x87.FOP),
-    SSMFIELD_ENTRY(         CPUMCTX, XState.x87.FPUIP),
-    SSMFIELD_ENTRY(         CPUMCTX, XState.x87.CS),
-    SSMFIELD_ENTRY(         CPUMCTX, XState.x87.Rsrvd1),
-    SSMFIELD_ENTRY(         CPUMCTX, XState.x87.FPUDP),
-    SSMFIELD_ENTRY(         CPUMCTX, XState.x87.DS),
-    SSMFIELD_ENTRY(         CPUMCTX, XState.x87.Rsrvd2),
-    SSMFIELD_ENTRY(         CPUMCTX, XState.x87.MXCSR),
-    SSMFIELD_ENTRY(         CPUMCTX, XState.x87.MXCSR_MASK),
-    SSMFIELD_ENTRY(         CPUMCTX, XState.x87.aRegs[0]),
-    SSMFIELD_ENTRY(         CPUMCTX, XState.x87.aRegs[1]),
-    SSMFIELD_ENTRY(         CPUMCTX, XState.x87.aRegs[2]),
-    SSMFIELD_ENTRY(         CPUMCTX, XState.x87.aRegs[3]),
-    SSMFIELD_ENTRY(         CPUMCTX, XState.x87.aRegs[4]),
-    SSMFIELD_ENTRY(         CPUMCTX, XState.x87.aRegs[5]),
-    SSMFIELD_ENTRY(         CPUMCTX, XState.x87.aRegs[6]),
-    SSMFIELD_ENTRY(         CPUMCTX, XState.x87.aRegs[7]),
-    SSMFIELD_ENTRY(         CPUMCTX, XState.x87.aXMM[0]),
-    SSMFIELD_ENTRY(         CPUMCTX, XState.x87.aXMM[1]),
-    SSMFIELD_ENTRY(         CPUMCTX, XState.x87.aXMM[2]),
-    SSMFIELD_ENTRY(         CPUMCTX, XState.x87.aXMM[3]),
-    SSMFIELD_ENTRY(         CPUMCTX, XState.x87.aXMM[4]),
-    SSMFIELD_ENTRY(         CPUMCTX, XState.x87.aXMM[5]),
-    SSMFIELD_ENTRY(         CPUMCTX, XState.x87.aXMM[6]),
-    SSMFIELD_ENTRY(         CPUMCTX, XState.x87.aXMM[7]),
-    SSMFIELD_ENTRY(         CPUMCTX, XState.x87.aXMM[8]),
-    SSMFIELD_ENTRY(         CPUMCTX, XState.x87.aXMM[9]),
-    SSMFIELD_ENTRY(         CPUMCTX, XState.x87.aXMM[10]),
-    SSMFIELD_ENTRY(         CPUMCTX, XState.x87.aXMM[11]),
-    SSMFIELD_ENTRY(         CPUMCTX, XState.x87.aXMM[12]),
-    SSMFIELD_ENTRY(         CPUMCTX, XState.x87.aXMM[13]),
-    SSMFIELD_ENTRY(         CPUMCTX, XState.x87.aXMM[14]),
-    SSMFIELD_ENTRY(         CPUMCTX, XState.x87.aXMM[15]),
-    SSMFIELD_ENTRY_IGNORE(  CPUMCTX, XState.x87.au32RsrvdRest),
     SSMFIELD_ENTRY(         CPUMCTX, rdi),
     SSMFIELD_ENTRY(         CPUMCTX, rsi),
@@ -376,43 +389,49 @@
 
 /** Saved state field descriptors for CPUMCTX_VER1_6. */
+static const SSMFIELD g_aCpumX87FieldsV16[] =
+{
+    SSMFIELD_ENTRY(         X86FXSTATE, FCW),
+    SSMFIELD_ENTRY(         X86FXSTATE, FSW),
+    SSMFIELD_ENTRY(         X86FXSTATE, FTW),
+    SSMFIELD_ENTRY(         X86FXSTATE, FOP),
+    SSMFIELD_ENTRY(         X86FXSTATE, FPUIP),
+    SSMFIELD_ENTRY(         X86FXSTATE, CS),
+    SSMFIELD_ENTRY(         X86FXSTATE, Rsrvd1),
+    SSMFIELD_ENTRY(         X86FXSTATE, FPUDP),
+    SSMFIELD_ENTRY(         X86FXSTATE, DS),
+    SSMFIELD_ENTRY(         X86FXSTATE, Rsrvd2),
+    SSMFIELD_ENTRY(         X86FXSTATE, MXCSR),
+    SSMFIELD_ENTRY(         X86FXSTATE, MXCSR_MASK),
+    SSMFIELD_ENTRY(         X86FXSTATE, aRegs[0]),
+    SSMFIELD_ENTRY(         X86FXSTATE, aRegs[1]),
+    SSMFIELD_ENTRY(         X86FXSTATE, aRegs[2]),
+    SSMFIELD_ENTRY(         X86FXSTATE, aRegs[3]),
+    SSMFIELD_ENTRY(         X86FXSTATE, aRegs[4]),
+    SSMFIELD_ENTRY(         X86FXSTATE, aRegs[5]),
+    SSMFIELD_ENTRY(         X86FXSTATE, aRegs[6]),
+    SSMFIELD_ENTRY(         X86FXSTATE, aRegs[7]),
+    SSMFIELD_ENTRY(         X86FXSTATE, aXMM[0]),
+    SSMFIELD_ENTRY(         X86FXSTATE, aXMM[1]),
+    SSMFIELD_ENTRY(         X86FXSTATE, aXMM[2]),
+    SSMFIELD_ENTRY(         X86FXSTATE, aXMM[3]),
+    SSMFIELD_ENTRY(         X86FXSTATE, aXMM[4]),
+    SSMFIELD_ENTRY(         X86FXSTATE, aXMM[5]),
+    SSMFIELD_ENTRY(         X86FXSTATE, aXMM[6]),
+    SSMFIELD_ENTRY(         X86FXSTATE, aXMM[7]),
+    SSMFIELD_ENTRY(         X86FXSTATE, aXMM[8]),
+    SSMFIELD_ENTRY(         X86FXSTATE, aXMM[9]),
+    SSMFIELD_ENTRY(         X86FXSTATE, aXMM[10]),
+    SSMFIELD_ENTRY(         X86FXSTATE, aXMM[11]),
+    SSMFIELD_ENTRY(         X86FXSTATE, aXMM[12]),
+    SSMFIELD_ENTRY(         X86FXSTATE, aXMM[13]),
+    SSMFIELD_ENTRY(         X86FXSTATE, aXMM[14]),
+    SSMFIELD_ENTRY(         X86FXSTATE, aXMM[15]),
+    SSMFIELD_ENTRY_IGNORE(  X86FXSTATE, au32RsrvdRest),
+    SSMFIELD_ENTRY_TERM()
+};
+
+/** Saved state field descriptors for CPUMCTX_VER1_6. */
 static const SSMFIELD g_aCpumCtxFieldsV16[] =
 {
-    SSMFIELD_ENTRY(             CPUMCTX, XState.x87.FCW),
-    SSMFIELD_ENTRY(             CPUMCTX, XState.x87.FSW),
-    SSMFIELD_ENTRY(             CPUMCTX, XState.x87.FTW),
-    SSMFIELD_ENTRY(             CPUMCTX, XState.x87.FOP),
-    SSMFIELD_ENTRY(             CPUMCTX, XState.x87.FPUIP),
-    SSMFIELD_ENTRY(             CPUMCTX, XState.x87.CS),
-    SSMFIELD_ENTRY(             CPUMCTX, XState.x87.Rsrvd1),
-    SSMFIELD_ENTRY(             CPUMCTX, XState.x87.FPUDP),
-    SSMFIELD_ENTRY(             CPUMCTX, XState.x87.DS),
-    SSMFIELD_ENTRY(             CPUMCTX, XState.x87.Rsrvd2),
-    SSMFIELD_ENTRY(             CPUMCTX, XState.x87.MXCSR),
-    SSMFIELD_ENTRY(             CPUMCTX, XState.x87.MXCSR_MASK),
-    SSMFIELD_ENTRY(             CPUMCTX, XState.x87.aRegs[0]),
-    SSMFIELD_ENTRY(             CPUMCTX, XState.x87.aRegs[1]),
-    SSMFIELD_ENTRY(             CPUMCTX, XState.x87.aRegs[2]),
-    SSMFIELD_ENTRY(             CPUMCTX, XState.x87.aRegs[3]),
-    SSMFIELD_ENTRY(             CPUMCTX, XState.x87.aRegs[4]),
-    SSMFIELD_ENTRY(             CPUMCTX, XState.x87.aRegs[5]),
-    SSMFIELD_ENTRY(             CPUMCTX, XState.x87.aRegs[6]),
-    SSMFIELD_ENTRY(             CPUMCTX, XState.x87.aRegs[7]),
-    SSMFIELD_ENTRY(             CPUMCTX, XState.x87.aXMM[0]),
-    SSMFIELD_ENTRY(             CPUMCTX, XState.x87.aXMM[1]),
-    SSMFIELD_ENTRY(             CPUMCTX, XState.x87.aXMM[2]),
-    SSMFIELD_ENTRY(             CPUMCTX, XState.x87.aXMM[3]),
-    SSMFIELD_ENTRY(             CPUMCTX, XState.x87.aXMM[4]),
-    SSMFIELD_ENTRY(             CPUMCTX, XState.x87.aXMM[5]),
-    SSMFIELD_ENTRY(             CPUMCTX, XState.x87.aXMM[6]),
-    SSMFIELD_ENTRY(             CPUMCTX, XState.x87.aXMM[7]),
-    SSMFIELD_ENTRY(             CPUMCTX, XState.x87.aXMM[8]),
-    SSMFIELD_ENTRY(             CPUMCTX, XState.x87.aXMM[9]),
-    SSMFIELD_ENTRY(             CPUMCTX, XState.x87.aXMM[10]),
-    SSMFIELD_ENTRY(             CPUMCTX, XState.x87.aXMM[11]),
-    SSMFIELD_ENTRY(             CPUMCTX, XState.x87.aXMM[12]),
-    SSMFIELD_ENTRY(             CPUMCTX, XState.x87.aXMM[13]),
-    SSMFIELD_ENTRY(             CPUMCTX, XState.x87.aXMM[14]),
-    SSMFIELD_ENTRY(             CPUMCTX, XState.x87.aXMM[15]),
-    SSMFIELD_ENTRY_IGNORE(      CPUMCTX, XState.x87.au32RsrvdRest),
     SSMFIELD_ENTRY(             CPUMCTX, rdi),
     SSMFIELD_ENTRY(             CPUMCTX, rsi),
@@ -574,4 +593,8 @@
 #endif
 
+    /*
+     * Initialize offsets.
+     */
+
     /* Calculate the offset from CPUM to CPUMCPU for the first CPU. */
     pVM->cpum.s.offCPUMCPU0 = RT_OFFSETOF(VM, aCpus[0].cpum) - RT_OFFSETOF(VM, cpum);
@@ -640,4 +663,34 @@
     AssertLogRelRCReturn(rc, rc);
     pVM->cpum.s.GuestFeatures.enmCpuVendor = pVM->cpum.s.HostFeatures.enmCpuVendor;
+
+    /*
+     * Allocate memory for the extended CPU state.
+     */
+    uint32_t cbMaxXState = sizeof(X86FXSTATE);
+    cbMaxXState = RT_ALIGN(cbMaxXState, 128);
+    uint8_t *pbXStates;
+    rc = MMR3HyperAllocOnceNoRelEx(pVM, cbMaxXState * 3 * pVM->cCpus, PAGE_SIZE, MM_TAG_CPUM_CTX,
+                                   MMHYPER_AONR_FLAGS_KERNEL_MAPPING, (void **)&pbXStates);
+    AssertLogRelRCReturn(rc, rc);
+
+    for (VMCPUID i = 0; i < pVM->cCpus; i++)
+    {
+        PVMCPU pVCpu = &pVM->aCpus[i];
+
+        pVCpu->cpum.s.Guest.pXStateR3 = (PX86XSAVEAREA)pbXStates;
+        pVCpu->cpum.s.Guest.pXStateR0 = MMHyperR3ToR0(pVM, pbXStates);
+        pVCpu->cpum.s.Guest.pXStateRC = MMHyperR3ToR0(pVM, pbXStates);
+        pbXStates += cbMaxXState;
+
+        pVCpu->cpum.s.Host.pXStateR3  = (PX86XSAVEAREA)pbXStates;
+        pVCpu->cpum.s.Host.pXStateR0 = MMHyperR3ToR0(pVM, pbXStates);
+        pVCpu->cpum.s.Host.pXStateRC = MMHyperR3ToR0(pVM, pbXStates);
+        pbXStates += cbMaxXState;
+
+        pVCpu->cpum.s.Hyper.pXStateR3 = (PX86XSAVEAREA)pbXStates;
+        pVCpu->cpum.s.Hyper.pXStateR0 = MMHyperR3ToR0(pVM, pbXStates);
+        pVCpu->cpum.s.Hyper.pXStateRC = MMHyperR3ToR0(pVM, pbXStates);
+        pbXStates += cbMaxXState;
+    }
 
     /*
@@ -701,7 +754,14 @@
     pVM->cpum.s.GuestInfo.paCpuIdLeavesRC = MMHyperR3ToRC(pVM, pVM->cpum.s.GuestInfo.paCpuIdLeavesR3);
 
-    /* Recheck the guest DRx values in raw-mode. */
     for (VMCPUID iCpu = 0; iCpu < pVM->cCpus; iCpu++)
-        CPUMRecalcHyperDRx(&pVM->aCpus[iCpu], UINT8_MAX, false);
+    {
+        PVMCPU pVCpu = &pVM->aCpus[iCpu];
+        pVCpu->cpum.s.Guest.pXStateRC = MMHyperR3ToRC(pVM, pVCpu->cpum.s.Guest.pXStateR3);
+        pVCpu->cpum.s.Host.pXStateRC  = MMHyperR3ToRC(pVM, pVCpu->cpum.s.Host.pXStateR3);
+        pVCpu->cpum.s.Hyper.pXStateRC = MMHyperR3ToRC(pVM, pVCpu->cpum.s.Hyper.pXStateR3); /** @todo remove me */
+
+        /* Recheck the guest DRx values in raw-mode. */
+        CPUMRecalcHyperDRx(pVCpu, UINT8_MAX, false);
+    }
 }
 
@@ -777,7 +837,11 @@
      * Initialize everything to ZERO first.
      */
-    uint32_t fUseFlags =  pVCpu->cpum.s.fUseFlags & ~CPUM_USED_FPU_SINCE_REM;
-    memset(pCtx, 0, sizeof(*pCtx));
-    pVCpu->cpum.s.fUseFlags  = fUseFlags;
+    uint32_t fUseFlags              =  pVCpu->cpum.s.fUseFlags & ~CPUM_USED_FPU_SINCE_REM;
+
+    AssertCompile(RT_OFFSETOF(CPUMCTX, pXStateR0) < RT_OFFSETOF(CPUMCTX, pXStateR3));
+    AssertCompile(RT_OFFSETOF(CPUMCTX, pXStateR0) < RT_OFFSETOF(CPUMCTX, pXStateRC));
+    memset(pCtx, 0, RT_OFFSETOF(CPUMCTX, pXStateR0));
+
+    pVCpu->cpum.s.fUseFlags         = fUseFlags;
 
     pCtx->cr0                       = X86_CR0_CD | X86_CR0_NW | X86_CR0_ET;  //0x60000010
@@ -841,11 +905,12 @@
     pCtx->dr[7]                     = X86_DR7_INIT_VAL;
 
-    pCtx->XState.x87.FTW            = 0x00;         /* All empty (abbridged tag reg edition). */
-    pCtx->XState.x87.FCW            = 0x37f;
+    PX86FXSTATE pFpuCtx = &pCtx->pXStateR3->x87; AssertReleaseMsg(RT_VALID_PTR(pFpuCtx), ("%p\n", pFpuCtx));
+    pFpuCtx->FTW                    = 0x00;         /* All empty (abbridged tag reg edition). */
+    pFpuCtx->FCW                    = 0x37f;
 
     /* Intel 64 and IA-32 Architectures Software Developer's Manual Volume 3A, Table 8-1.
        IA-32 Processor States Following Power-up, Reset, or INIT */
-    pCtx->XState.x87.MXCSR          = 0x1F80;
-    pCtx->XState.x87.MXCSR_MASK     = 0xffff; /** @todo REM always changed this for us. Should probably check if the HW really
+    pFpuCtx->MXCSR                  = 0x1F80;
+    pFpuCtx->MXCSR_MASK             = 0xffff; /** @todo REM always changed this for us. Should probably check if the HW really
                                                         supports all bits, since a zero value here should be read as 0xffbf. */
 
@@ -1021,9 +1086,16 @@
 
         uint32_t const  fLoad = uVersion > CPUM_SAVED_STATE_VERSION_MEM ? 0 : SSMSTRUCT_FLAGS_MEM_BAND_AID_RELAXED;
-        PCSSMFIELD      paCpumCtxFields = g_aCpumCtxFields;
+        PCSSMFIELD      paCpumCtx1Fields = g_aCpumX87Fields;
+        PCSSMFIELD      paCpumCtx2Fields = g_aCpumCtxFields;
         if (uVersion == CPUM_SAVED_STATE_VERSION_VER1_6)
-            paCpumCtxFields = g_aCpumCtxFieldsV16;
+        {
+            paCpumCtx1Fields = g_aCpumX87FieldsV16;
+            paCpumCtx2Fields = g_aCpumCtxFieldsV16;
+        }
         else if (uVersion <= CPUM_SAVED_STATE_VERSION_MEM)
-            paCpumCtxFields = g_aCpumCtxFieldsMem;
+        {
+            paCpumCtx1Fields = g_aCpumX87FieldsMem;
+            paCpumCtx2Fields = g_aCpumCtxFieldsMem;
+        }
 
         /*
@@ -1035,5 +1107,9 @@
             uint64_t uCR3  = pVCpu->cpum.s.Hyper.cr3;
             uint64_t uRSP  = pVCpu->cpum.s.Hyper.rsp; /* see VMMR3Relocate(). */
-            SSMR3GetStructEx(pSSM, &pVCpu->cpum.s.Hyper, sizeof(pVCpu->cpum.s.Hyper), fLoad, paCpumCtxFields, NULL);
+            /** @todo drop the FPU bits here! */
+            SSMR3GetStructEx(pSSM, &pVCpu->cpum.s.Hyper.pXStateR3->x87, sizeof(pVCpu->cpum.s.Hyper.pXStateR3->x87),
+                             fLoad | SSMSTRUCT_FLAGS_NO_TAIL_MARKER, paCpumCtx1Fields, NULL);
+            SSMR3GetStructEx(pSSM, &pVCpu->cpum.s.Hyper, sizeof(pVCpu->cpum.s.Hyper),
+                             fLoad | SSMSTRUCT_FLAGS_NO_LEAD_MARKER, paCpumCtx2Fields, NULL);
             pVCpu->cpum.s.Hyper.cr3 = uCR3;
             pVCpu->cpum.s.Hyper.rsp = uRSP;
@@ -1065,6 +1141,8 @@
         {
             PVMCPU  pVCpu = &pVM->aCpus[iCpu];
-            SSMR3GetStructEx(pSSM, &pVCpu->cpum.s.Guest, sizeof(pVCpu->cpum.s.Guest), fLoad,
-                             paCpumCtxFields, NULL);
+            SSMR3GetStructEx(pSSM, &pVCpu->cpum.s.Guest.pXStateR3->x87, sizeof(pVCpu->cpum.s.Guest.pXStateR3->x87),
+                             fLoad | SSMSTRUCT_FLAGS_NO_TAIL_MARKER, paCpumCtx1Fields, NULL);
+            SSMR3GetStructEx(pSSM, &pVCpu->cpum.s.Guest, sizeof(pVCpu->cpum.s.Guest),
+                             fLoad | SSMSTRUCT_FLAGS_NO_LEAD_MARKER, paCpumCtx2Fields, NULL);
             SSMR3GetU32(pSSM, &pVCpu->cpum.s.fUseFlags);
             SSMR3GetU32(pSSM, &pVCpu->cpum.s.fChanged);
@@ -1516,35 +1594,36 @@
                     pszPrefix, pCtx->SysEnter.cs, pCtx->SysEnter.eip, pCtx->SysEnter.esp);
 
+            PX86FXSTATE pFpuCtx = &pCtx->CTX_SUFF(pXState)->x87;
             pHlp->pfnPrintf(pHlp,
                 "%sFCW=%04x %sFSW=%04x %sFTW=%04x %sFOP=%04x %sMXCSR=%08x %sMXCSR_MASK=%08x\n"
                 "%sFPUIP=%08x %sCS=%04x %sRsrvd1=%04x  %sFPUDP=%08x %sDS=%04x %sRsvrd2=%04x\n"
                 ,
-                pszPrefix, pCtx->XState.x87.FCW,   pszPrefix, pCtx->XState.x87.FSW, pszPrefix, pCtx->XState.x87.FTW, pszPrefix, pCtx->XState.x87.FOP,
-                pszPrefix, pCtx->XState.x87.MXCSR, pszPrefix, pCtx->XState.x87.MXCSR_MASK,
-                pszPrefix, pCtx->XState.x87.FPUIP, pszPrefix, pCtx->XState.x87.CS,  pszPrefix, pCtx->XState.x87.Rsrvd1,
-                pszPrefix, pCtx->XState.x87.FPUDP, pszPrefix, pCtx->XState.x87.DS,  pszPrefix, pCtx->XState.x87.Rsrvd2
+                pszPrefix, pFpuCtx->FCW,   pszPrefix, pFpuCtx->FSW, pszPrefix, pFpuCtx->FTW, pszPrefix, pFpuCtx->FOP,
+                pszPrefix, pFpuCtx->MXCSR, pszPrefix, pFpuCtx->MXCSR_MASK,
+                pszPrefix, pFpuCtx->FPUIP, pszPrefix, pFpuCtx->CS,  pszPrefix, pFpuCtx->Rsrvd1,
+                pszPrefix, pFpuCtx->FPUDP, pszPrefix, pFpuCtx->DS,  pszPrefix, pFpuCtx->Rsrvd2
                 );
-            unsigned iShift = (pCtx->XState.x87.FSW >> 11) & 7;
-            for (unsigned iST = 0; iST < RT_ELEMENTS(pCtx->XState.x87.aRegs); iST++)
+            unsigned iShift = (pFpuCtx->FSW >> 11) & 7;
+            for (unsigned iST = 0; iST < RT_ELEMENTS(pFpuCtx->aRegs); iST++)
             {
-                unsigned iFPR        = (iST + iShift) % RT_ELEMENTS(pCtx->XState.x87.aRegs);
-                unsigned uTag        = pCtx->XState.x87.FTW & (1 << iFPR) ? 1 : 0;
-                char     chSign      = pCtx->XState.x87.aRegs[0].au16[4] & 0x8000 ? '-' : '+';
-                unsigned iInteger    = (unsigned)(pCtx->XState.x87.aRegs[0].au64[0] >> 63);
-                uint64_t u64Fraction = pCtx->XState.x87.aRegs[0].au64[0] & UINT64_C(0x7fffffffffffffff);
-                unsigned uExponent   = pCtx->XState.x87.aRegs[0].au16[4] & 0x7fff;
+                unsigned iFPR        = (iST + iShift) % RT_ELEMENTS(pFpuCtx->aRegs);
+                unsigned uTag        = pFpuCtx->FTW & (1 << iFPR) ? 1 : 0;
+                char     chSign      = pFpuCtx->aRegs[0].au16[4] & 0x8000 ? '-' : '+';
+                unsigned iInteger    = (unsigned)(pFpuCtx->aRegs[0].au64[0] >> 63);
+                uint64_t u64Fraction = pFpuCtx->aRegs[0].au64[0] & UINT64_C(0x7fffffffffffffff);
+                unsigned uExponent   = pFpuCtx->aRegs[0].au16[4] & 0x7fff;
                 /** @todo This isn't entirenly correct and needs more work! */
                 pHlp->pfnPrintf(pHlp,
                                 "%sST(%u)=%sFPR%u={%04RX16'%08RX32'%08RX32} t%d %c%u.%022llu ^ %u",
                                 pszPrefix, iST, pszPrefix, iFPR,
-                                pCtx->XState.x87.aRegs[0].au16[4], pCtx->XState.x87.aRegs[0].au32[1], pCtx->XState.x87.aRegs[0].au32[0],
+                                pFpuCtx->aRegs[0].au16[4], pFpuCtx->aRegs[0].au32[1], pFpuCtx->aRegs[0].au32[0],
                                 uTag, chSign, iInteger, u64Fraction, uExponent);
-                if (pCtx->XState.x87.aRegs[0].au16[5] || pCtx->XState.x87.aRegs[0].au16[6] || pCtx->XState.x87.aRegs[0].au16[7])
+                if (pFpuCtx->aRegs[0].au16[5] || pFpuCtx->aRegs[0].au16[6] || pFpuCtx->aRegs[0].au16[7])
                     pHlp->pfnPrintf(pHlp, " res={%04RX16,%04RX16,%04RX16}\n",
-                                    pCtx->XState.x87.aRegs[0].au16[5], pCtx->XState.x87.aRegs[0].au16[6], pCtx->XState.x87.aRegs[0].au16[7]);
+                                    pFpuCtx->aRegs[0].au16[5], pFpuCtx->aRegs[0].au16[6], pFpuCtx->aRegs[0].au16[7]);
                 else
                     pHlp->pfnPrintf(pHlp, "\n");
             }
-            for (unsigned iXMM = 0; iXMM < RT_ELEMENTS(pCtx->XState.x87.aXMM); iXMM++)
+            for (unsigned iXMM = 0; iXMM < RT_ELEMENTS(pFpuCtx->aXMM); iXMM++)
                 pHlp->pfnPrintf(pHlp,
                                 iXMM & 1
@@ -1552,12 +1631,12 @@
                                 : "%sXMM%u%s=%08RX32'%08RX32'%08RX32'%08RX32  ",
                                 pszPrefix, iXMM, iXMM < 10 ? " " : "",
-                                pCtx->XState.x87.aXMM[iXMM].au32[3],
-                                pCtx->XState.x87.aXMM[iXMM].au32[2],
-                                pCtx->XState.x87.aXMM[iXMM].au32[1],
-                                pCtx->XState.x87.aXMM[iXMM].au32[0]);
-            for (unsigned i = 0; i < RT_ELEMENTS(pCtx->XState.x87.au32RsrvdRest); i++)
-                if (pCtx->XState.x87.au32RsrvdRest[i])
+                                pFpuCtx->aXMM[iXMM].au32[3],
+                                pFpuCtx->aXMM[iXMM].au32[2],
+                                pFpuCtx->aXMM[iXMM].au32[1],
+                                pFpuCtx->aXMM[iXMM].au32[0]);
+            for (unsigned i = 0; i < RT_ELEMENTS(pFpuCtx->au32RsrvdRest); i++)
+                if (pFpuCtx->au32RsrvdRest[i])
                     pHlp->pfnPrintf(pHlp, "%sRsrvdRest[i]=%RX32 (offset=%#x)\n",
-                                    pszPrefix, i, pCtx->XState.x87.au32RsrvdRest[i], RT_OFFSETOF(X86FXSTATE, au32RsrvdRest[i]) );
+                                    pszPrefix, i, pFpuCtx->au32RsrvdRest[i], RT_OFFSETOF(X86FXSTATE, au32RsrvdRest[i]) );
 
             pHlp->pfnPrintf(pHlp,
Index: /trunk/src/VBox/VMM/VMMR3/CPUMDbg.cpp
===================================================================
--- /trunk/src/VBox/VMM/VMMR3/CPUMDbg.cpp	(revision 55047)
+++ /trunk/src/VBox/VMM/VMMR3/CPUMDbg.cpp	(revision 55048)
@@ -30,4 +30,5 @@
 #include <VBox/log.h>
 #include <iprt/thread.h>
+#include <iprt/string.h>
 #include <iprt/uint128.h>
 
@@ -57,5 +58,5 @@
 
 /**
- * @interface_method_impl{DBGFREGDESC, pfnGet}
+ * @interface_method_impl{DBGFREGDESC, pfnSet}
  */
 static DECLCALLBACK(int) cpumR3RegSet_Generic(void *pvUser, PCDBGFREGDESC pDesc, PCDBGFREGVAL pValue, PCDBGFREGVAL pfMask)
@@ -100,4 +101,74 @@
     }
 }
+
+
+/**
+ * @interface_method_impl{DBGFREGDESC, pfnGet}
+ */
+static DECLCALLBACK(int) cpumR3RegGet_XStateGeneric(void *pvUser, PCDBGFREGDESC pDesc, PDBGFREGVAL pValue)
+{
+    PVMCPU      pVCpu   = (PVMCPU)pvUser;
+    void const *pv      = (uint8_t const *)&pVCpu->cpum.s.Guest.pXStateR3 + pDesc->offRegister;
+
+    VMCPU_ASSERT_EMT(pVCpu);
+
+    switch (pDesc->enmType)
+    {
+        case DBGFREGVALTYPE_U8:        pValue->u8   = *(uint8_t  const *)pv; return VINF_SUCCESS;
+        case DBGFREGVALTYPE_U16:       pValue->u16  = *(uint16_t const *)pv; return VINF_SUCCESS;
+        case DBGFREGVALTYPE_U32:       pValue->u32  = *(uint32_t const *)pv; return VINF_SUCCESS;
+        case DBGFREGVALTYPE_U64:       pValue->u64  = *(uint64_t const *)pv; return VINF_SUCCESS;
+        case DBGFREGVALTYPE_U128:      pValue->u128 = *(PCRTUINT128U    )pv; return VINF_SUCCESS;
+        default:
+            AssertMsgFailedReturn(("%d %s\n", pDesc->enmType, pDesc->pszName), VERR_IPE_NOT_REACHED_DEFAULT_CASE);
+    }
+}
+
+
+/**
+ * @interface_method_impl{DBGFREGDESC, pfnSet}
+ */
+static DECLCALLBACK(int) cpumR3RegSet_XStateGeneric(void *pvUser, PCDBGFREGDESC pDesc, PCDBGFREGVAL pValue, PCDBGFREGVAL pfMask)
+{
+    PVMCPU      pVCpu = (PVMCPU)pvUser;
+    void       *pv    = (uint8_t *)&pVCpu->cpum.s.Guest.pXStateR3 + pDesc->offRegister;
+
+    VMCPU_ASSERT_EMT(pVCpu);
+
+    switch (pDesc->enmType)
+    {
+        case DBGFREGVALTYPE_U8:
+            *(uint8_t *)pv &= ~pfMask->u8;
+            *(uint8_t *)pv |= pValue->u8 & pfMask->u8;
+            return VINF_SUCCESS;
+
+        case DBGFREGVALTYPE_U16:
+            *(uint16_t *)pv &= ~pfMask->u16;
+            *(uint16_t *)pv |= pValue->u16 & pfMask->u16;
+            return VINF_SUCCESS;
+
+        case DBGFREGVALTYPE_U32:
+            *(uint32_t *)pv &= ~pfMask->u32;
+            *(uint32_t *)pv |= pValue->u32 & pfMask->u32;
+            return VINF_SUCCESS;
+
+        case DBGFREGVALTYPE_U64:
+            *(uint64_t *)pv &= ~pfMask->u64;
+            *(uint64_t *)pv |= pValue->u64 & pfMask->u64;
+            return VINF_SUCCESS;
+
+        case DBGFREGVALTYPE_U128:
+        {
+            RTUINT128U Val;
+            RTUInt128AssignAnd((PRTUINT128U)pv, RTUInt128AssignBitwiseNot(RTUInt128Assign(&Val, &pfMask->u128)));
+            RTUInt128AssignOr((PRTUINT128U)pv, RTUInt128AssignAnd(RTUInt128Assign(&Val, &pValue->u128), &pfMask->u128));
+            return VINF_SUCCESS;
+        }
+
+        default:
+            AssertMsgFailedReturn(("%d %s\n", pDesc->enmType, pDesc->pszName), VERR_IPE_NOT_REACHED_DEFAULT_CASE);
+    }
+}
+
 
 
@@ -251,4 +322,41 @@
  */
 static DECLCALLBACK(int) cpumR3RegSet_ftw(void *pvUser, PCDBGFREGDESC pDesc, PCDBGFREGVAL pValue, PCDBGFREGVAL pfMask)
+{
+    NOREF(pvUser); NOREF(pDesc); NOREF(pValue); NOREF(pfMask);
+    return VERR_DBGF_READ_ONLY_REGISTER;
+}
+
+
+/**
+ * @interface_method_impl{DBGFREGDESC, pfnGet}
+ */
+static DECLCALLBACK(int) cpumR3RegGet_Dummy(void *pvUser, PCDBGFREGDESC pDesc, PDBGFREGVAL pValue)
+{
+    switch (pDesc->enmType)
+    {
+        case DBGFREGVALTYPE_U8:        pValue->u8   = 0; return VINF_SUCCESS;
+        case DBGFREGVALTYPE_U16:       pValue->u16  = 0; return VINF_SUCCESS;
+        case DBGFREGVALTYPE_U32:       pValue->u32  = 0; return VINF_SUCCESS;
+        case DBGFREGVALTYPE_U64:       pValue->u64  = 0; return VINF_SUCCESS;
+        case DBGFREGVALTYPE_U128:
+            RT_ZERO(pValue->u128);
+            return VINF_SUCCESS;
+        case DBGFREGVALTYPE_DTR:
+            pValue->dtr.u32Limit = 0;
+            pValue->dtr.u64Base  = 0;
+            return VINF_SUCCESS;
+        case DBGFREGVALTYPE_R80:
+            RT_ZERO(pValue->r80Ex);
+            return VINF_SUCCESS;
+        default:
+            AssertMsgFailedReturn(("%d %s\n", pDesc->enmType, pDesc->pszName), VERR_IPE_NOT_REACHED_DEFAULT_CASE);
+    }
+}
+
+
+/**
+ * @interface_method_impl{DBGFREGDESC, pfnSet}
+ */
+static DECLCALLBACK(int) cpumR3RegSet_Dummy(void *pvUser, PCDBGFREGDESC pDesc, PCDBGFREGVAL pValue, PCDBGFREGVAL pfMask)
 {
     NOREF(pvUser); NOREF(pDesc); NOREF(pValue); NOREF(pfMask);
@@ -511,20 +619,20 @@
     Assert(pDesc->enmType == DBGFREGVALTYPE_R80);
 
+    PX86FXSTATE pFpuCtx = &pVCpu->cpum.s.Guest.CTX_SUFF(pXState)->x87;
     if (cpumR3RegIsFxSaveFormat(pVCpu))
     {
-        unsigned iReg = (pVCpu->cpum.s.Guest.XState.x87.FSW >> 11) & 7;
+        unsigned iReg = (pFpuCtx->FSW >> 11) & 7;
         iReg += pDesc->offRegister;
         iReg &= 7;
-        pValue->r80Ex = pVCpu->cpum.s.Guest.XState.x87.aRegs[iReg].r80Ex;
+        pValue->r80Ex = pFpuCtx->aRegs[iReg].r80Ex;
     }
     else
     {
-        PCX86FPUSTATE pOldFpu = (PCX86FPUSTATE)&pVCpu->cpum.s.Guest.XState.x87;
-
-        unsigned iReg = (pOldFpu->FSW >> 11) & 7;
+        PCX86FPUSTATE pOldFpuCtx = (PCX86FPUSTATE)pFpuCtx;
+
+        unsigned iReg = (pOldFpuCtx->FSW >> 11) & 7;
         iReg += pDesc->offRegister;
         iReg &= 7;
-
-        pValue->r80Ex = pOldFpu->regs[iReg].r80Ex;
+        pValue->r80Ex = pOldFpuCtx->regs[iReg].r80Ex;
     }
 
@@ -663,48 +771,4 @@
     return VERR_ACCESS_DENIED;
 }
-
-
-/**
- * @interface_method_impl{DBGFREGDESC, pfnGet}
- */
-static DECLCALLBACK(int) cpumR3RegHyperGet_stN(void *pvUser, PCDBGFREGDESC pDesc, PDBGFREGVAL pValue)
-{
-    PVMCPU      pVCpu   = (PVMCPU)pvUser;
-
-    VMCPU_ASSERT_EMT(pVCpu);
-    Assert(pDesc->enmType == DBGFREGVALTYPE_R80);
-
-    if (cpumR3RegIsFxSaveFormat(pVCpu))
-    {
-        unsigned iReg = (pVCpu->cpum.s.Guest.XState.x87.FSW >> 11) & 7;
-        iReg += pDesc->offRegister;
-        iReg &= 7;
-        pValue->r80Ex = pVCpu->cpum.s.Guest.XState.x87.aRegs[iReg].r80Ex;
-    }
-    else
-    {
-        PCX86FPUSTATE pOldFpu = (PCX86FPUSTATE)&pVCpu->cpum.s.Guest.XState.x87;
-
-        unsigned iReg = (pOldFpu->FSW >> 11) & 7;
-        iReg += pDesc->offRegister;
-        iReg &= 7;
-
-        pValue->r80Ex = pOldFpu->regs[iReg].r80Ex;
-    }
-
-    return VINF_SUCCESS;
-}
-
-
-/**
- * @interface_method_impl{DBGFREGDESC, pfnGet}
- */
-static DECLCALLBACK(int) cpumR3RegHyperSet_stN(void *pvUser, PCDBGFREGDESC pDesc, PCDBGFREGVAL pValue, PCDBGFREGVAL pfMask)
-{
-    /* There isn't a FPU context for the hypervisor yet, so no point in trying to set stuff. */
-    NOREF(pvUser); NOREF(pDesc); NOREF(pValue); NOREF(pfMask);
-    return VERR_ACCESS_DENIED;
-}
-
 
 
@@ -1079,8 +1143,8 @@
 
 #define CPU_REG_MM(n) \
-    CPU_REG_RW_AS("mm" #n,          MM##n,          U64, XState.x87.aRegs[n].mmx, cpumR3RegGet_Generic, cpumR3RegSet_Generic, NULL,                       g_aCpumRegFields_mmN)
+    CPU_REG_XS_RW_AS("mm" #n,       MM##n,          U64, x87.aRegs[n].mmx, cpumR3RegGet_XStateGeneric, cpumR3RegSet_XStateGeneric, NULL,                       g_aCpumRegFields_mmN)
 
 #define CPU_REG_XMM(n) \
-    CPU_REG_RW_AS("xmm" #n,         XMM##n,         U128, XState.x87.aXMM[n].xmm, cpumR3RegGet_Generic, cpumR3RegSet_Generic, NULL,                       g_aCpumRegFields_xmmN)
+    CPU_REG_XS_RW_AS("xmm" #n,      XMM##n,         U128, x87.aXMM[n].xmm, cpumR3RegGet_XStateGeneric, cpumR3RegSet_XStateGeneric, NULL,                       g_aCpumRegFields_xmmN)
 /** @} */
 
@@ -1095,4 +1159,8 @@
 #define CPU_REG_RO_AS(a_szName, a_RegSuff, a_TypeSuff, a_CpumCtxMemb, a_pfnGet, a_pfnSet, a_paAliases, a_paSubFields) \
     { a_szName, DBGFREG_##a_RegSuff, DBGFREGVALTYPE_##a_TypeSuff, DBGFREG_FLAGS_READ_ONLY, RT_OFFSETOF(CPUMCPU, Guest.a_CpumCtxMemb), a_pfnGet, a_pfnSet, a_paAliases, a_paSubFields }
+#define CPU_REG_XS_RW_AS(a_szName, a_RegSuff, a_TypeSuff, a_XStateMemb, a_pfnGet, a_pfnSet, a_paAliases, a_paSubFields) \
+    { a_szName, DBGFREG_##a_RegSuff, DBGFREGVALTYPE_##a_TypeSuff, 0 /*fFlags*/,            RT_OFFSETOF(X86XSAVEAREA, a_XStateMemb),   a_pfnGet, a_pfnSet, a_paAliases, a_paSubFields }
+#define CPU_REG_XS_RO_AS(a_szName, a_RegSuff, a_TypeSuff, a_XStateMemb, a_pfnGet, a_pfnSet, a_paAliases, a_paSubFields) \
+    { a_szName, DBGFREG_##a_RegSuff, DBGFREGVALTYPE_##a_TypeSuff, DBGFREG_FLAGS_READ_ONLY, RT_OFFSETOF(X86XSAVEAREA, a_XStateMemb), a_pfnGet, a_pfnSet, a_paAliases, a_paSubFields }
 #define CPU_REG_MSR(a_szName, UName, a_TypeSuff, a_paSubFields) \
     CPU_REG_EX_AS(a_szName,         MSR_##UName,    a_TypeSuff, MSR_##UName,    cpumR3RegGstGet_msr,  cpumR3RegGstSet_msr,  NULL,                       a_paSubFields)
@@ -1123,15 +1191,15 @@
     CPU_REG_SEG(SS, ss),
     CPU_REG_REG(RIP, rip),
-    CPU_REG_RW_AS("rflags",         RFLAGS,         U64, rflags,                cpumR3RegGet_Generic,   cpumR3RegSet_Generic,   g_aCpumRegAliases_rflags,   g_aCpumRegFields_rflags ),
-    CPU_REG_RW_AS("fcw",            FCW,            U16, XState.x87.FCW,        cpumR3RegGet_Generic,   cpumR3RegSet_Generic,   NULL,                       g_aCpumRegFields_fcw    ),
-    CPU_REG_RW_AS("fsw",            FSW,            U16, XState.x87.FSW,        cpumR3RegGet_Generic,   cpumR3RegSet_Generic,   NULL,                       g_aCpumRegFields_fsw    ),
-    CPU_REG_RO_AS("ftw",            FTW,            U16, XState.x87,            cpumR3RegGet_ftw,       cpumR3RegSet_ftw,       NULL,                       g_aCpumRegFields_ftw    ),
-    CPU_REG_RW_AS("fop",            FOP,            U16, XState.x87.FOP,        cpumR3RegGet_Generic,   cpumR3RegSet_Generic,   NULL,                       NULL                    ),
-    CPU_REG_RW_AS("fpuip",          FPUIP,          U32, XState.x87.FPUIP,      cpumR3RegGet_Generic,   cpumR3RegSet_Generic,   g_aCpumRegAliases_fpuip,    NULL                    ),
-    CPU_REG_RW_AS("fpucs",          FPUCS,          U16, XState.x87.CS,         cpumR3RegGet_Generic,   cpumR3RegSet_Generic,   NULL,                       NULL                    ),
-    CPU_REG_RW_AS("fpudp",          FPUDP,          U32, XState.x87.FPUDP,      cpumR3RegGet_Generic,   cpumR3RegSet_Generic,   g_aCpumRegAliases_fpudp,    NULL                    ),
-    CPU_REG_RW_AS("fpuds",          FPUDS,          U16, XState.x87.DS,         cpumR3RegGet_Generic,   cpumR3RegSet_Generic,   NULL,                       NULL                    ),
-    CPU_REG_RW_AS("mxcsr",          MXCSR,          U32, XState.x87.MXCSR,      cpumR3RegGet_Generic,   cpumR3RegSet_Generic,   NULL,                       g_aCpumRegFields_mxcsr  ),
-    CPU_REG_RW_AS("mxcsr_mask",     MXCSR_MASK,     U32, XState.x87.MXCSR_MASK, cpumR3RegGet_Generic,   cpumR3RegSet_Generic,   NULL,                       g_aCpumRegFields_mxcsr  ),
+    CPU_REG_RW_AS("rflags",         RFLAGS,         U64, rflags,         cpumR3RegGet_Generic,         cpumR3RegSet_Generic,         g_aCpumRegAliases_rflags,   g_aCpumRegFields_rflags ),
+    CPU_REG_XS_RW_AS("fcw",         FCW,            U16, x87.FCW,        cpumR3RegGet_XStateGeneric,   cpumR3RegSet_XStateGeneric,   NULL,                       g_aCpumRegFields_fcw    ),
+    CPU_REG_XS_RW_AS("fsw",         FSW,            U16, x87.FSW,        cpumR3RegGet_XStateGeneric,   cpumR3RegSet_XStateGeneric,   NULL,                       g_aCpumRegFields_fsw    ),
+    CPU_REG_XS_RO_AS("ftw",         FTW,            U16, x87,            cpumR3RegGet_ftw,             cpumR3RegSet_ftw,             NULL,                       g_aCpumRegFields_ftw    ),
+    CPU_REG_XS_RW_AS("fop",         FOP,            U16, x87.FOP,        cpumR3RegGet_XStateGeneric,   cpumR3RegSet_XStateGeneric,   NULL,                       NULL                    ),
+    CPU_REG_XS_RW_AS("fpuip",       FPUIP,          U32, x87.FPUIP,      cpumR3RegGet_XStateGeneric,   cpumR3RegSet_XStateGeneric,   g_aCpumRegAliases_fpuip,    NULL                    ),
+    CPU_REG_XS_RW_AS("fpucs",       FPUCS,          U16, x87.CS,         cpumR3RegGet_XStateGeneric,   cpumR3RegSet_XStateGeneric,   NULL,                       NULL                    ),
+    CPU_REG_XS_RW_AS("fpudp",       FPUDP,          U32, x87.FPUDP,      cpumR3RegGet_XStateGeneric,   cpumR3RegSet_XStateGeneric,   g_aCpumRegAliases_fpudp,    NULL                    ),
+    CPU_REG_XS_RW_AS("fpuds",       FPUDS,          U16, x87.DS,         cpumR3RegGet_XStateGeneric,   cpumR3RegSet_XStateGeneric,   NULL,                       NULL                    ),
+    CPU_REG_XS_RW_AS("mxcsr",       MXCSR,          U32, x87.MXCSR,      cpumR3RegGet_XStateGeneric,   cpumR3RegSet_XStateGeneric,   NULL,                       g_aCpumRegFields_mxcsr  ),
+    CPU_REG_XS_RW_AS("mxcsr_mask",  MXCSR_MASK,     U32, x87.MXCSR_MASK, cpumR3RegGet_XStateGeneric,   cpumR3RegSet_XStateGeneric,   NULL,                       g_aCpumRegFields_mxcsr  ),
     CPU_REG_ST(0),
     CPU_REG_ST(1),
@@ -1223,8 +1291,8 @@
 #define CPU_REG_RO_AS(a_szName, a_RegSuff, a_TypeSuff, a_CpumCtxMemb, a_pfnGet, a_pfnSet, a_paAliases, a_paSubFields) \
     { a_szName, DBGFREG_##a_RegSuff, DBGFREGVALTYPE_##a_TypeSuff, DBGFREG_FLAGS_READ_ONLY, RT_OFFSETOF(CPUMCPU, Hyper.a_CpumCtxMemb), a_pfnGet, a_pfnSet, a_paAliases, a_paSubFields }
+#define CPU_REG_DUMMY(a_szName, a_RegSuff, a_TypeSuff) \
+    { a_szName, DBGFREG_##a_RegSuff, DBGFREGVALTYPE_##a_TypeSuff, DBGFREG_FLAGS_READ_ONLY, 0, cpumR3RegGet_Dummy, cpumR3RegSet_Dummy, NULL, NULL}
 #define CPU_REG_MSR(a_szName, UName, a_TypeSuff, a_paSubFields) \
     CPU_REG_EX_AS(a_szName,         MSR_##UName,    a_TypeSuff, MSR_##UName,    cpumR3RegHyperGet_msr,  cpumR3RegHyperSet_msr,  NULL,                       a_paSubFields)
-#define CPU_REG_ST(n) \
-    CPU_REG_EX_AS("st" #n,          ST##n,          R80, n,                     cpumR3RegHyperGet_stN,  cpumR3RegHyperSet_stN,  NULL,                       g_aCpumRegFields_stN)
 
     CPU_REG_REG(RAX, rax),
@@ -1252,46 +1320,46 @@
     CPU_REG_REG(RIP, rip),
     CPU_REG_RW_AS("rflags",         RFLAGS,         U64, rflags,                cpumR3RegGet_Generic,   cpumR3RegSet_Generic,   g_aCpumRegAliases_rflags,   g_aCpumRegFields_rflags ),
-    CPU_REG_RW_AS("fcw",            FCW,            U16, XState.x87.FCW,        cpumR3RegGet_Generic,   cpumR3RegSet_Generic,   NULL,                       g_aCpumRegFields_fcw    ),
-    CPU_REG_RW_AS("fsw",            FSW,            U16, XState.x87.FSW,        cpumR3RegGet_Generic,   cpumR3RegSet_Generic,   NULL,                       g_aCpumRegFields_fsw    ),
-    CPU_REG_RO_AS("ftw",            FTW,            U16, XState.x87,            cpumR3RegGet_ftw,       cpumR3RegSet_ftw,       NULL,                       g_aCpumRegFields_ftw    ),
-    CPU_REG_RW_AS("fop",            FOP,            U16, XState.x87.FOP,        cpumR3RegGet_Generic,   cpumR3RegSet_Generic,   NULL,                       NULL                    ),
-    CPU_REG_RW_AS("fpuip",          FPUIP,          U32, XState.x87.FPUIP,      cpumR3RegGet_Generic,   cpumR3RegSet_Generic,   g_aCpumRegAliases_fpuip,    NULL                    ),
-    CPU_REG_RW_AS("fpucs",          FPUCS,          U16, XState.x87.CS,         cpumR3RegGet_Generic,   cpumR3RegSet_Generic,   NULL,                       NULL                    ),
-    CPU_REG_RW_AS("fpudp",          FPUDP,          U32, XState.x87.FPUDP,      cpumR3RegGet_Generic,   cpumR3RegSet_Generic,   g_aCpumRegAliases_fpudp,    NULL                    ),
-    CPU_REG_RW_AS("fpuds",          FPUDS,          U16, XState.x87.DS,         cpumR3RegGet_Generic,   cpumR3RegSet_Generic,   NULL,                       NULL                    ),
-    CPU_REG_RW_AS("mxcsr",          MXCSR,          U32, XState.x87.MXCSR,      cpumR3RegGet_Generic,   cpumR3RegSet_Generic,   NULL,                       g_aCpumRegFields_mxcsr  ),
-    CPU_REG_RW_AS("mxcsr_mask",     MXCSR_MASK,     U32, XState.x87.MXCSR_MASK, cpumR3RegGet_Generic,   cpumR3RegSet_Generic,   NULL,                       g_aCpumRegFields_mxcsr  ),
-    CPU_REG_ST(0),
-    CPU_REG_ST(1),
-    CPU_REG_ST(2),
-    CPU_REG_ST(3),
-    CPU_REG_ST(4),
-    CPU_REG_ST(5),
-    CPU_REG_ST(6),
-    CPU_REG_ST(7),
-    CPU_REG_MM(0),
-    CPU_REG_MM(1),
-    CPU_REG_MM(2),
-    CPU_REG_MM(3),
-    CPU_REG_MM(4),
-    CPU_REG_MM(5),
-    CPU_REG_MM(6),
-    CPU_REG_MM(7),
-    CPU_REG_XMM(0),
-    CPU_REG_XMM(1),
-    CPU_REG_XMM(2),
-    CPU_REG_XMM(3),
-    CPU_REG_XMM(4),
-    CPU_REG_XMM(5),
-    CPU_REG_XMM(6),
-    CPU_REG_XMM(7),
-    CPU_REG_XMM(8),
-    CPU_REG_XMM(9),
-    CPU_REG_XMM(10),
-    CPU_REG_XMM(11),
-    CPU_REG_XMM(12),
-    CPU_REG_XMM(13),
-    CPU_REG_XMM(14),
-    CPU_REG_XMM(15),
+    CPU_REG_DUMMY("fcw",            FCW,            U16),
+    CPU_REG_DUMMY("fsw",            FSW,            U16),
+    CPU_REG_DUMMY("ftw",            FTW,            U16),
+    CPU_REG_DUMMY("fop",            FOP,            U16),
+    CPU_REG_DUMMY("fpuip",          FPUIP,          U32),
+    CPU_REG_DUMMY("fpucs",          FPUCS,          U16),
+    CPU_REG_DUMMY("fpudp",          FPUDP,          U32),
+    CPU_REG_DUMMY("fpuds",          FPUDS,          U16),
+    CPU_REG_DUMMY("mxcsr",          MXCSR,          U32),
+    CPU_REG_DUMMY("mxcsr_mask",     MXCSR_MASK,     U32),
+    CPU_REG_DUMMY("st0",            ST0,            R80),
+    CPU_REG_DUMMY("st1",            ST1,            R80),
+    CPU_REG_DUMMY("st2",            ST2,            R80),
+    CPU_REG_DUMMY("st3",            ST3,            R80),
+    CPU_REG_DUMMY("st4",            ST4,            R80),
+    CPU_REG_DUMMY("st5",            ST5,            R80),
+    CPU_REG_DUMMY("st6",            ST6,            R80),
+    CPU_REG_DUMMY("st7",            ST7,            R80),
+    CPU_REG_DUMMY("mm0",            MM0,            U64),
+    CPU_REG_DUMMY("mm1",            MM1,            U64),
+    CPU_REG_DUMMY("mm2",            MM2,            U64),
+    CPU_REG_DUMMY("mm3",            MM3,            U64),
+    CPU_REG_DUMMY("mm4",            MM4,            U64),
+    CPU_REG_DUMMY("mm5",            MM5,            U64),
+    CPU_REG_DUMMY("mm6",            MM6,            U64),
+    CPU_REG_DUMMY("mm7",            MM7,            U64),
+    CPU_REG_DUMMY("xmm0",           XMM0,           U128),
+    CPU_REG_DUMMY("xmm1",           XMM1,           U128),
+    CPU_REG_DUMMY("xmm2",           XMM2,           U128),
+    CPU_REG_DUMMY("xmm3",           XMM3,           U128),
+    CPU_REG_DUMMY("xmm4",           XMM4,           U128),
+    CPU_REG_DUMMY("xmm5",           XMM5,           U128),
+    CPU_REG_DUMMY("xmm6",           XMM6,           U128),
+    CPU_REG_DUMMY("xmm7",           XMM7,           U128),
+    CPU_REG_DUMMY("xmm8",           XMM8,           U128),
+    CPU_REG_DUMMY("xmm9",           XMM9,           U128),
+    CPU_REG_DUMMY("xmm10",          XMM10,          U128),
+    CPU_REG_DUMMY("xmm11",          XMM11,          U128),
+    CPU_REG_DUMMY("xmm12",          XMM12,          U128),
+    CPU_REG_DUMMY("xmm13",          XMM13,          U128),
+    CPU_REG_DUMMY("xmm14",          XMM14,          U128),
+    CPU_REG_DUMMY("xmm15",          XMM15,          U128),
     CPU_REG_RW_AS("gdtr_base",      GDTR_BASE,      U64, gdtr.pGdt,             cpumR3RegGet_Generic,   cpumR3RegSet_Generic,   NULL,                       NULL                    ),
     CPU_REG_RW_AS("gdtr_lim",       GDTR_LIMIT,     U16, gdtr.cbGdt,            cpumR3RegGet_Generic,   cpumR3RegSet_Generic,   NULL,                       NULL                    ),
Index: /trunk/src/VBox/VMM/VMMR3/SSM.cpp
===================================================================
--- /trunk/src/VBox/VMM/VMMR3/SSM.cpp	(revision 55047)
+++ /trunk/src/VBox/VMM/VMMR3/SSM.cpp	(revision 55048)
@@ -6672,5 +6672,5 @@
      * Begin marker.
      */
-    if (!(fFlags & SSMSTRUCT_FLAGS_NO_MARKERS))
+    if (!(fFlags & (SSMSTRUCT_FLAGS_NO_MARKERS | SSMSTRUCT_FLAGS_NO_LEAD_MARKER)))
     {
         rc = SSMR3GetU32(pSSM, &u32Magic);
@@ -6904,5 +6904,5 @@
      * End marker
      */
-    if (!(fFlags & SSMSTRUCT_FLAGS_NO_MARKERS))
+    if (!(fFlags & (SSMSTRUCT_FLAGS_NO_MARKERS | SSMSTRUCT_FLAGS_NO_TAIL_MARKER)))
     {
         rc = SSMR3GetU32(pSSM, &u32Magic);
Index: /trunk/src/VBox/VMM/VMMRC/CPUMRCA.asm
===================================================================
--- /trunk/src/VBox/VMM/VMMRC/CPUMRCA.asm	(revision 55047)
+++ /trunk/src/VBox/VMM/VMMRC/CPUMRCA.asm	(revision 55048)
@@ -62,4 +62,10 @@
 align 16
 BEGINPROC   cpumHandleLazyFPUAsm
+        push    ebx
+        push    esi
+        mov     ebx, [esp + 8 + 4]      ; pCpumCpu - stack arg, below our two pushes
+%define pCpumCpu ebx
+%define pXState  esi
+
         ;
         ; Figure out what to do.
@@ -92,16 +98,7 @@
         ; loaded the GC FPU. Because if we have, this is an trap for the guest - raw ring-3.
         ;
-%ifdef RT_ARCH_AMD64
- %ifdef RT_OS_WINDOWS
-        mov     xDX, rcx
- %else
-        mov     xDX, rdi
- %endif
-%else
-        mov     xDX, dword [esp + 4]
-%endif
-        test    dword [xDX + CPUMCPU.fUseFlags], CPUM_USED_FPU
+        test    dword [pCpumCpu + CPUMCPU.fUseFlags], CPUM_USED_FPU
         jz      hlfpua_not_loaded
-        jmp     hlfpua_to_host
+        jmp     hlfpua_guest_trap
 
     ;
@@ -110,12 +107,7 @@
 align 16
 hlfpua_not_loaded:
-        mov     eax, [xDX + CPUMCPU.Guest.cr0]
+        mov     eax, [pCpumCpu + CPUMCPU.Guest.cr0]
         and     eax, X86_CR0_MP | X86_CR0_EM | X86_CR0_TS
-%ifdef RT_ARCH_AMD64
-        lea     r8, [hlfpuajmp1 wrt rip]
-        jmp     qword [rax*4 + r8]
-%else
         jmp     dword [eax*2 + hlfpuajmp1]
-%endif
 align 16
 ;; jump table using fpu related cr0 flags as index.
@@ -126,7 +118,7 @@
         RTCCPTR_DEF hlfpua_switch_fpu_ctx
         RTCCPTR_DEF hlfpua_switch_fpu_ctx
-        RTCCPTR_DEF hlfpua_to_host
-        RTCCPTR_DEF hlfpua_switch_fpu_ctx
-        RTCCPTR_DEF hlfpua_to_host
+        RTCCPTR_DEF hlfpua_guest_trap
+        RTCCPTR_DEF hlfpua_switch_fpu_ctx
+        RTCCPTR_DEF hlfpua_guest_trap
 ;; and mask for cr0.
 hlfpu_afFlags:
@@ -145,65 +137,48 @@
 align 16
 hlfpua_switch_fpu_ctx:
-        ; Paranoia. This function was previously used in ring-0, not any longer.
-%ifdef IN_RING3
-%error "This function is not written for ring-3"
-%endif
-%ifdef IN_RING0
-%error "This function is not written for ring-0"
-%endif
-
-        mov     xCX, cr0
-%ifdef RT_ARCH_AMD64
-        lea     r8, [hlfpu_afFlags wrt rip]
-        and     rcx, [rax*4 + r8]       ; calc the new cr0 flags.
-%else
-        and     ecx, [eax*2 + hlfpu_afFlags] ; calc the new cr0 flags.
-%endif
-        mov     xAX, cr0
-        and     xAX, ~(X86_CR0_TS | X86_CR0_EM)
-        mov     cr0, xAX                ; clear flags so we don't trap here.
-%ifndef RT_ARCH_AMD64
-        mov     eax, edx                ; Calculate the PCPUM pointer
-        sub     eax, [edx + CPUMCPU.offCPUM]
+        mov     ecx, cr0
+        mov     edx, ecx
+        and     ecx, [eax*2 + hlfpu_afFlags] ; Calc the new cr0 flags. Do NOT use ECX until we restore it!
+        and     edx, ~(X86_CR0_TS | X86_CR0_EM)
+        mov     cr0, edx                ; Clear flags so we don't trap here.
+
+        mov     pXState, [pCpumCpu + CPUMCPU.Host.pXStateRC]
+        mov     eax, pCpumCpu           ; Calculate the PCPUM pointer
+        sub     eax, [pCpumCpu + CPUMCPU.offCPUM]
         test    dword [eax + CPUM.CPUFeatures.edx], X86_CPUID_FEATURE_EDX_FXSR
         jz short hlfpua_no_fxsave
-%endif
-
-%ifdef RT_ARCH_AMD64
-        ; Use explicit REX prefix. See @bugref{6398}.
-        o64 fxsave  [xDX + CPUMCPU.Host.XState]
-%else
-        fxsave  [xDX + CPUMCPU.Host.XState]
-%endif
-        or      dword [xDX + CPUMCPU.fUseFlags], (CPUM_USED_FPU | CPUM_USED_FPU_SINCE_REM)
-        fxrstor [xDX + CPUMCPU.Guest.XState] ; raw-mode guest is always 32-bit. See @bugref{7138}.
+
+        fxsave  [pXState]
+        mov     pXState, [pCpumCpu + CPUMCPU.Guest.pXStateRC]
+        fxrstor [pXState]
 
 hlfpua_finished_switch:
+        or      dword [pCpumCpu + CPUMCPU.fUseFlags], (CPUM_USED_FPU | CPUM_USED_FPU_SINCE_REM)
 
         ; Load new CR0 value.
-        ;; @todo Optimize the many unconditional CR0 writes.
-        mov     cr0, xCX                ; load the new cr0 flags.
+        mov     cr0, ecx                ; load the new cr0 flags.
 
         ; return continue execution.
+        pop     esi
+        pop     ebx
         xor     eax, eax
         ret
 
-%ifndef RT_ARCH_AMD64
-; legacy support.
+        ;
+        ; Legacy CPU support.
+        ;
 hlfpua_no_fxsave:
-        fnsave  [xDX + CPUMCPU.Host.XState]
-        or      dword [xDX + CPUMCPU.fUseFlags], dword (CPUM_USED_FPU | CPUM_USED_FPU_SINCE_REM) ; yasm / nasm
-        mov     eax, [xDX + CPUMCPU.Guest.XState]   ; control word
-        not     eax                                 ; 1 means exception ignored (6 LS bits)
-        and     eax, byte 03Fh                      ; 6 LS bits only
-        test    eax, [xDX + CPUMCPU.Guest.XState + 4] ; status word
+        fnsave  [pXState]
+        mov     pXState, [pCpumCpu + CPUMCPU.Guest.pXStateRC]
+        mov     eax, [pXState]          ; control word
+        not     eax                     ; 1 means exception ignored (6 LS bits)
+        and     eax, byte 03Fh          ; 6 LS bits only
+        test    eax, [pXState + 4]      ; status word
         jz short hlfpua_no_exceptions_pending
-        ; technically incorrect, but we certainly don't want any exceptions now!!
-        and     dword [xDX + CPUMCPU.Guest.XState + 4], ~03Fh
+        ; Technically incorrect, but we certainly don't want any exceptions now!!
+        and     dword [pXState + 4], ~03Fh
 hlfpua_no_exceptions_pending:
-        frstor  [xDX + CPUMCPU.Guest.XState]
+        frstor  [pXState]
         jmp near hlfpua_finished_switch
-%endif ; !RT_ARCH_AMD64
-
 
         ;
@@ -211,5 +186,7 @@
         ;
 hlfpua_action_4:
-hlfpua_to_host:
+hlfpua_guest_trap:
+        pop     esi
+        pop     ebx
         mov     eax, VINF_EM_RAW_GUEST_TRAP
         ret
Index: /trunk/src/VBox/VMM/VMMSwitcher/AMD64andLegacy.mac
===================================================================
--- /trunk/src/VBox/VMM/VMMSwitcher/AMD64andLegacy.mac	(revision 55047)
+++ /trunk/src/VBox/VMM/VMMSwitcher/AMD64andLegacy.mac	(revision 55048)
@@ -1144,6 +1144,8 @@
     mov     cr0, rcx
 
-    fxsave  [rdx + r8 + CPUMCPU.Guest.XState]
-    o64 fxrstor [rdx + r8 + CPUMCPU.Host.XState]    ; Restore 64-bit host FPU state. See @bugref{7138}
+    mov     rax, [rdx + r8 + CPUMCPU.Guest.pXStateR0]
+    fxsave  [rax]
+    mov     rax, [rdx + r8 + CPUMCPU.Host.pXStateR0]
+    fxrstor [rax]                       ; We saved 32-bit state, so only restore 32-bit.
     jmp short gth_fpu_no
 
Index: /trunk/src/VBox/VMM/VMMSwitcher/LegacyandAMD64.mac
===================================================================
--- /trunk/src/VBox/VMM/VMMSwitcher/LegacyandAMD64.mac	(revision 55047)
+++ /trunk/src/VBox/VMM/VMMSwitcher/LegacyandAMD64.mac	(revision 55048)
@@ -664,6 +664,6 @@
     and     rax, ~(X86_CR0_TS | X86_CR0_EM)
     mov     cr0, rax
-    ; Use explicit REX prefix. See @bugref{6398}.
-    o64 fxrstor [rdx + CPUMCPU.Guest.XState]
+    mov     eax, [rdx + CPUMCPU.Guest.pXStateRC]
+    o64 fxrstor [rax]                   ; (use explicit REX prefix, see @bugref{6398})
     mov     cr0, rcx                    ; and restore old CR0 again
 
@@ -725,5 +725,5 @@
     ; parameter for all helper functions (pCtx)
     DEBUG64_CHAR('9')
-    lea     rsi, [rdx + CPUMCPU.Guest.XState]
+    lea     rsi, [rdx + CPUMCPU.Guest]
     lea     rax, [htg_return wrt rip]
     push    rax                         ; return address
@@ -1258,6 +1258,6 @@
     mov     cr0, rax
 
-    ; Use explicit REX prefix. See @bugref{6398}.
-    o64 fxsave  [rsi + CPUMCTX.XState]
+    mov     eax, [rsi + CPUMCTX.pXStateRC]
+    o64 fxsave  [rax]                   ; (use explicit REX prefix, see @bugref{6398})
 
     mov     cr0, rcx                    ; and restore old CR0 again
Index: /trunk/src/VBox/VMM/VMMSwitcher/PAEand32Bit.mac
===================================================================
--- /trunk/src/VBox/VMM/VMMSwitcher/PAEand32Bit.mac	(revision 55047)
+++ /trunk/src/VBox/VMM/VMMSwitcher/PAEand32Bit.mac	(revision 55048)
@@ -990,22 +990,24 @@
     mov     cr0, ecx
 
+    mov     eax, [edx + CPUMCPU.Guest.pXStateR0]
+    mov     ecx, [edx + CPUMCPU.Host.pXStateR0]
     FIXUP FIX_NO_FXSAVE_JMP, 0, gth_no_fxsave - NAME(Start) ; this will insert a jmp gth_no_fxsave if fxsave isn't supported.
-    fxsave  [edx + CPUMCPU.Guest.XState]
-    fxrstor [edx + CPUMCPU.Host.XState]
+    fxsave  [eax]
+    fxrstor [ecx]
     jmp near gth_fpu_no
 
 gth_no_fxsave:
-    fnsave  [edx + CPUMCPU.Guest.XState]
-    mov     eax, [edx + CPUMCPU.Host.XState] ; control word
-    not     eax                            ; 1 means exception ignored (6 LS bits)
-    and     eax, byte 03Fh                 ; 6 LS bits only
-    test    eax, [edx + CPUMCPU.Host.XState + 4] ; status word
+    fnsave  [eax]
+    mov     eax, [ecx]                  ; control word
+    not     eax                         ; 1 means exception ignored (6 LS bits)
+    and     eax, byte 03Fh              ; 6 LS bits only
+    test    eax, [ecx + 4]              ; status word
     jz      gth_no_exceptions_pending
 
     ; technically incorrect, but we certainly don't want any exceptions now!!
-    and     dword [edx + CPUMCPU.Host.XState + 4], ~03Fh
+    and     dword [ecx + 4], ~03Fh
 
 gth_no_exceptions_pending:
-    frstor  [edx + CPUMCPU.Host.XState]
+    frstor  [ecx]
     jmp short gth_fpu_no
 
Index: /trunk/src/VBox/VMM/include/CPUMInternal.h
===================================================================
--- /trunk/src/VBox/VMM/include/CPUMInternal.h	(revision 55047)
+++ /trunk/src/VBox/VMM/include/CPUMInternal.h	(revision 55048)
@@ -286,8 +286,4 @@
 typedef struct CPUMHOSTCTX
 {
-    /** FPU state. (16-byte alignment)
-     * @remark On x86, the format isn't necessarily X86FXSTATE (not important). */
-    X86XSAVEAREA    XState;
-
     /** General purpose register, selectors, flags and more
      * @{ */
@@ -389,5 +385,5 @@
 
     /* padding to get 64byte aligned size */
-    uint8_t         auPadding[16+32];
+    uint8_t         auPadding[16+12];
 
 #elif HC_ARCH_BITS == 64 || defined(VBOX_WITH_HYBRID_32BIT_KERNEL)
@@ -435,7 +431,7 @@
     /* padding to get 32byte aligned size */
 # ifdef VBOX_WITH_HYBRID_32BIT_KERNEL
-    uint8_t         auPadding[16];
+    uint8_t         auPadding[4];
 # else
-    uint8_t         auPadding[8+32];
+    uint8_t         auPadding[8+12];
 # endif
 
@@ -443,4 +439,11 @@
 # error HC_ARCH_BITS not defined
 #endif
+
+    /** Pointer to the FPU/SSE/AVX/XXXX state raw-mode mapping. */
+    RCPTRTYPE(PX86XSAVEAREA)    pXStateRC;
+    /** Pointer to the FPU/SSE/AVX/XXXX state ring-0 mapping. */
+    R0PTRTYPE(PX86XSAVEAREA)    pXStateR0;
+    /** Pointer to the FPU/SSE/AVX/XXXX state ring-3 mapping. */
+    R3PTRTYPE(PX86XSAVEAREA)    pXStateR3;
 } CPUMHOSTCTX;
 /** Pointer to the saved host CPU state. */
Index: /trunk/src/VBox/VMM/include/CPUMInternal.mac
===================================================================
--- /trunk/src/VBox/VMM/include/CPUMInternal.mac	(revision 55047)
+++ /trunk/src/VBox/VMM/include/CPUMInternal.mac	(revision 55048)
@@ -126,6 +126,5 @@
     ; (Identical to the .Hyper chunk below.)
     ;
-    alignb 64
-    .Guest.XState             resb    XSTATE_SIZE
+    .Guest                    resq    0
     .Guest.eax                resq    1
     .Guest.ecx                resq    1
@@ -226,7 +225,10 @@
     .Guest.msrKERNELGSBASE    resb    8
     .Guest.msrApicBase        resb    8
-
+    .Guest.pXStateR0          RTR0PTR_RES 1
+    .Guest.pXStateR3          RTR3PTR_RES 1
+    .Guest.pXStateRC          RTRCPTR_RES 1
 
     alignb 64
+    .GuestMsrs                resq    0
     .GuestMsrs.au64           resq    64
 
@@ -256,6 +258,5 @@
     ;
     alignb 64
-    .Host.XState         resb    XSTATE_SIZE
-
+    .Host                resb    0
 %if HC_ARCH_BITS == 64 || fVBOX_WITH_HYBRID_32BIT_KERNEL
     ;.Host.rax            resq    1 - scratch
@@ -331,4 +332,5 @@
     .Host.SysEnter.esp   resq    1
     .Host.efer           resq    1
+    .Host.auPadding      resb    (16+12)
 
 %else ; 64-bit
@@ -362,5 +364,14 @@
     .Host.GSbase         resq    1
     .Host.efer           resq    1
+ %if fVBOX_WITH_HYBRID_32BIT_KERNEL
+    .Host.auPadding      resb    4
+ %else
+    .Host.auPadding      resb   (8+12)
+ %endif
 %endif ; 64-bit
+    .Host.pXStateRC RTRCPTR_RES  1
+    alignb RTR0PTR_CB
+    .Host.pXStateR0 RTR0PTR_RES  1
+    .Host.pXStateR3 RTR3PTR_RES  1
 
     ;
@@ -368,5 +379,5 @@
     ;
     alignb 64
-    .Hyper.XState             resb    XSTATE_SIZE
+    .Hyper                    resq    0
     .Hyper.eax                resq    1
     .Hyper.ecx                resq    1
@@ -467,4 +478,7 @@
     .Hyper.msrKERNELGSBASE    resb    8
     .Hyper.msrApicBase        resb    8
+    .Hyper.pXStateR0          RTR0PTR_RES 1
+    .Hyper.pXStateR3          RTR3PTR_RES 1
+    .Hyper.pXStateRC          RTRCPTR_RES 1
     alignb 64
 
Index: /trunk/src/VBox/VMM/testcase/tstVMStruct.h
===================================================================
--- /trunk/src/VBox/VMM/testcase/tstVMStruct.h	(revision 55047)
+++ /trunk/src/VBox/VMM/testcase/tstVMStruct.h	(revision 55048)
@@ -63,5 +63,7 @@
 
     GEN_CHECK_SIZE(CPUMHOSTCTX);
-    GEN_CHECK_OFF(CPUMHOSTCTX, XState);
+    GEN_CHECK_OFF(CPUMHOSTCTX, pXStateR3);
+    GEN_CHECK_OFF(CPUMHOSTCTX, pXStateR0);
+    GEN_CHECK_OFF(CPUMHOSTCTX, pXStateRC);
 #if HC_ARCH_BITS == 64 || defined(VBOX_WITH_HYBRID_32BIT_KERNEL)
     GEN_CHECK_OFF(CPUMHOSTCTX, rbx);
@@ -132,5 +134,7 @@
 
     GEN_CHECK_SIZE(CPUMCTX);
-    GEN_CHECK_OFF(CPUMCTX, XState);
+    GEN_CHECK_OFF(CPUMCTX, pXStateR0);
+    GEN_CHECK_OFF(CPUMCTX, pXStateR3);
+    GEN_CHECK_OFF(CPUMCTX, pXStateRC);
     GEN_CHECK_OFF(CPUMCTX, rdi);
     GEN_CHECK_OFF(CPUMCTX, rsi);
Index: /trunk/src/VBox/VMM/testcase/tstVMStructSize.cpp
===================================================================
--- /trunk/src/VBox/VMM/testcase/tstVMStructSize.cpp	(revision 55047)
+++ /trunk/src/VBox/VMM/testcase/tstVMStructSize.cpp	(revision 55048)
@@ -298,5 +298,4 @@
 
     /* cpumctx */
-    CHECK_MEMBER_ALIGNMENT(CPUMCTX, XState, 64);
     CHECK_MEMBER_ALIGNMENT(CPUMCTX, rax, 32);
     CHECK_MEMBER_ALIGNMENT(CPUMCTX, idtr.pIdt, 8);
Index: /trunk/src/recompiler/VBoxRecompiler.c
===================================================================
--- /trunk/src/recompiler/VBoxRecompiler.c	(revision 55047)
+++ /trunk/src/recompiler/VBoxRecompiler.c	(revision 55048)
@@ -2358,5 +2358,5 @@
         /* Sync FPU state after CR4, CPUID and EFER (!). */
         if (fFlags & CPUM_CHANGED_FPU_REM)
-            save_raw_fp_state(&pVM->rem.s.Env, (uint8_t *)&pCtx->XState.x87); /* 'save' is an excellent name. */
+            save_raw_fp_state(&pVM->rem.s.Env, (uint8_t *)&pCtx->pXStateR3->x87); /* 'save' is an excellent name. */
     }
 
@@ -2551,5 +2551,5 @@
 
     /** @todo check if FPU/XMM was actually used in the recompiler */
-    restore_raw_fp_state(&pVM->rem.s.Env, (uint8_t *)&pCtx->XState.x87);
+    restore_raw_fp_state(&pVM->rem.s.Env, (uint8_t *)&pCtx->pXStateR3->x87);
 ////    dprintf2(("FPU state CW=%04X TT=%04X SW=%04X (%04X)\n", env->fpuc, env->fpstt, env->fpus, pVMCtx->fpu.FSW));
 
@@ -2816,4 +2816,5 @@
      */
 
+    PX86FXSTATE pFpuCtx = &pCtx->pXStateR3->x87;
     /** @todo FOP */
     /** @todo FPUIP */
@@ -2822,9 +2823,9 @@
     /** @todo DS */
     /** @todo Fix MXCSR support in QEMU so we don't overwrite MXCSR with 0 when we shouldn't! */
-    pCtx->XState.x87.MXCSR       = 0;
-    pCtx->XState.x87.MXCSR_MASK  = 0;
+    pFpuCtx->MXCSR       = 0;
+    pFpuCtx->MXCSR_MASK  = 0;
 
     /** @todo check if FPU/XMM was actually used in the recompiler */
-    restore_raw_fp_state(&pVM->rem.s.Env, (uint8_t *)&pCtx->XState.x87);
+    restore_raw_fp_state(&pVM->rem.s.Env, (uint8_t *)pFpuCtx);
 ////    dprintf2(("FPU state CW=%04X TT=%04X SW=%04X (%04X)\n", env->fpuc, env->fpstt, env->fpus, pVMCtx->fpu.FSW));
 
