Index: /trunk/include/VBox/sup.h
===================================================================
--- /trunk/include/VBox/sup.h	(revision 64254)
+++ /trunk/include/VBox/sup.h	(revision 64255)
@@ -230,41 +230,38 @@
      * indicate stable data. Use this to make sure that the data items you fetch
      * are consistent. */
-    volatile uint32_t   u32TransactionId;
+    volatile uint32_t       u32TransactionId;
     /** The interval in TSC ticks between two NanoTS updates.
      * This is the average interval over the last 2, 4 or 8 updates + a little slack.
      * The slack makes the time go a tiny tiny bit slower and extends the interval enough
      * to avoid ending up with too many 1ns increments. */
-    volatile uint32_t   u32UpdateIntervalTSC;
+    volatile uint32_t       u32UpdateIntervalTSC;
     /** Current nanosecond timestamp. */
-    volatile uint64_t   u64NanoTS;
+    volatile uint64_t       u64NanoTS;
     /** The TSC at the time of u64NanoTS. */
-    volatile uint64_t   u64TSC;
+    volatile uint64_t       u64TSC;
     /** Current CPU Frequency. */
-    volatile uint64_t   u64CpuHz;
+    volatile uint64_t       u64CpuHz;
     /** The TSC delta with reference to the master TSC, subtract from RDTSC. */
-    volatile int64_t    i64TSCDelta;
+    volatile int64_t        i64TSCDelta;
     /** Number of errors during updating.
      * Typical errors are under/overflows. */
-    volatile uint32_t   cErrors;
+    volatile uint32_t       cErrors;
     /** Index of the head item in au32TSCHistory. */
-    volatile uint32_t   iTSCHistoryHead;
+    volatile uint32_t       iTSCHistoryHead;
     /** Array of recent TSC interval deltas.
      * The most recent item is at index iTSCHistoryHead.
      * This history is used to calculate u32UpdateIntervalTSC.
      */
-    volatile uint32_t   au32TSCHistory[8];
+    volatile uint32_t       au32TSCHistory[8];
     /** The interval between the last two NanoTS updates. (experiment for now) */
-    volatile uint32_t   u32PrevUpdateIntervalNS;
+    volatile uint32_t       u32PrevUpdateIntervalNS;
 
     /** Reserved for future per processor data. */
-    volatile uint32_t   au32Reserved0[5];
-
+    volatile uint32_t       u32Reserved;
     /** The TSC value read while doing TSC delta measurements across CPUs. */
-    volatile uint64_t   u64TSCSample;
-
+    volatile uint64_t       u64TSCSample;
     /** Reserved for future per processor data. */
-    volatile uint32_t   au32Reserved1[1];
-
-    /** @todo Add topology/NUMA info. */
+    volatile uint32_t       au32Reserved1[3];
+
     /** The CPU state. */
     SUPGIPCPUSTATE volatile enmState;
@@ -273,6 +270,12 @@
     /** The CPU set index of this CPU. */
     int16_t                 iCpuSet;
+    /** CPU group number (always zero, except on windows). */
+    uint16_t                iCpuGroup;
+    /** CPU group member number (same as iCpuSet, except on windows). */
+    uint16_t                iCpuGroupMember;
     /** The APIC ID of this CPU. */
     uint16_t                idApic;
+    /** @todo Add topology/NUMA info. */
+    uint32_t                iReservedForNumaNode;
 } SUPGIPCPU;
 AssertCompileSize(RTCPUID, 4);
@@ -280,4 +283,5 @@
 AssertCompileMemberAlignment(SUPGIPCPU, u64NanoTS, 8);
 AssertCompileMemberAlignment(SUPGIPCPU, u64TSC, 8);
+AssertCompileMemberAlignment(SUPGIPCPU, u64TSCSample, 8);
 
 /** Pointer to per cpu data.
@@ -314,4 +318,10 @@
 
 /** @name SUPGIPGETCPU_XXX - methods that aCPUs can be indexed.
+ *
+ * @note    Linux offers information via selector 0x78, and Windows via selector
+ *          0x53.  But since they both support RDTSCP as well, and because most
+ *          CPUs now have RDTSCP, we prefer it over LSL.  We can implement more
+ *          alternatives if it becomes necessary.
+ *
  * @{
  */
@@ -335,6 +345,15 @@
  * does with RDTSCP. */
 #define SUPGIPGETCPU_IDTR_LIMIT_MASK_MAX_SET_CPUS   RT_BIT_32(2)
-/* Linux also offers information via selector 0x78, but we'll settle for
-   RDTSCP for now. */
+/** Windows specific RDTSCP variant, where CH gives you the group and CL gives
+ * you the CPU number within that group.
+ *
+ * Use SUPGLOBALINFOPAGE::aiFirstCpuSetIdxFromCpuGroup to get the group base CPU
+ * set index, add the CPU number within the group to it, then translate the sum
+ * thru aiCpuFromCpuSetIdx to find the aCPUs entry.
+ *
+ * @note The group number is actually 16-bit wide (ECX[23:8]), but we simplify
+ *       it since we only support 256 CPUs/groups at the moment.
+ */
+#define SUPGIPGETCPU_RDTSCP_GROUP_IN_CH_NUMBER_IN_CL RT_BIT_32(3)
 /** @} */
 
@@ -381,5 +400,6 @@
     /** The highest number of CPUs possible. */
     uint16_t            cPossibleCpus;
-    uint16_t            u16Padding0;
+    /** The highest number of CPU groups possible. */
+    uint16_t            cPossibleCpuGroups;
     /** The max CPU ID (RTMpGetMaxCpuId). */
     RTCPUID             idCpuMax;
@@ -399,4 +419,7 @@
     /** CPU set index to CPU table index. */
     uint16_t            aiCpuFromCpuSetIdx[RTCPUSET_MAX_CPUS];
+    /** Table indexed by CPU group index to get the CPU set index of the first
+     *  CPU. */
+    uint16_t            aiFirstCpuSetIdxFromCpuGroup[RTCPUSET_MAX_CPUS];
 
     /** Array of per-cpu data.
@@ -415,4 +438,5 @@
 AssertCompileMemberAlignment(SUPGLOBALINFOPAGE, aCPUs, 256);
 #endif
+AssertCompile(sizeof(SUPGLOBALINFOPAGE) <= 0x1000); /* Keeping it less or equal to a page for raw-mode (saved state). */
 
 /** Pointer to the global info page.
@@ -426,5 +450,5 @@
  * Upper 16 bits is the major version. Major version is only changed with
  * incompatible changes in the GIP. */
-#define SUPGLOBALINFOPAGE_VERSION   0x00060001
+#define SUPGLOBALINFOPAGE_VERSION   0x00070000
 
 /**
@@ -569,4 +593,27 @@
     AssertFailed();
     return UINT64_MAX;
+}
+
+
+/**
+ * Gets the pointer to the per CPU data for a CPU given by its set index.
+ *
+ * @returns Pointer to the corresponding per CPU structure, or NULL if invalid.
+ * @param   pGip        The GIP pointer.
+ * @param   iCpuSet     The CPU set index of the CPU which we want.
+ */
+DECLINLINE(PSUPGIPCPU) SUPGetGipCpuBySetIndex(PSUPGLOBALINFOPAGE pGip, uint32_t iCpuSet)
+{
+    uint16_t idxGipCpu;
+
+    /* Bail out on a missing GIP, a bad magic, or a set index outside the table. */
+    if (RT_UNLIKELY(!pGip || pGip->u32Magic != SUPGLOBALINFOPAGE_MAGIC))
+        return NULL;
+    if (RT_UNLIKELY(iCpuSet >= RT_ELEMENTS(pGip->aiCpuFromCpuSetIdx)))
+        return NULL;
+
+    /* Map set index -> aCPUs index; values at or above cCpus name no entry. */
+    idxGipCpu = pGip->aiCpuFromCpuSetIdx[iCpuSet];
+    return RT_LIKELY(idxGipCpu < pGip->cCpus) ? &pGip->aCPUs[idxGipCpu] : NULL;
 }
 
Index: /trunk/include/iprt/mangling.h
===================================================================
--- /trunk/include/iprt/mangling.h	(revision 64254)
+++ /trunk/include/iprt/mangling.h	(revision 64255)
@@ -2104,4 +2104,6 @@
 # define RTTimeNanoTSLegacyAsyncUseRdtscp                       RT_MANGLER(RTTimeNanoTSLegacyAsyncUseRdtscp)
 # define RTTimeNanoTSLegacyAsyncUseRdtscp_EndProc               RT_MANGLER(RTTimeNanoTSLegacyAsyncUseRdtscp_EndProc)
+# define RTTimeNanoTSLegacyAsyncUseRdtscpGroupChNumCl           RT_MANGLER(RTTimeNanoTSLegacyAsyncUseRdtscpGroupChNumCl)
+# define RTTimeNanoTSLegacyAsyncUseRdtscpGroupChNumCl_EndProc   RT_MANGLER(RTTimeNanoTSLegacyAsyncUseRdtscpGroupChNumCl_EndProc)
 # define RTTimeNanoTSLegacyAsyncUseIdtrLim                      RT_MANGLER(RTTimeNanoTSLegacyAsyncUseIdtrLim)
 # define RTTimeNanoTSLegacyAsyncUseIdtrLim_EndProc              RT_MANGLER(RTTimeNanoTSLegacyAsyncUseIdtrLim_EndProc)
@@ -2122,4 +2124,6 @@
 # define RTTimeNanoTSLFenceAsyncUseRdtscp                       RT_MANGLER(RTTimeNanoTSLFenceAsyncUseRdtscp)
 # define RTTimeNanoTSLFenceAsyncUseRdtscp_EndProc               RT_MANGLER(RTTimeNanoTSLFenceAsyncUseRdtscp_EndProc)
+# define RTTimeNanoTSLFenceAsyncUseRdtscpGroupChNumCl           RT_MANGLER(RTTimeNanoTSLFenceAsyncUseRdtscpGroupChNumCl)
+# define RTTimeNanoTSLFenceAsyncUseRdtscpGroupChNumCl_EndProc   RT_MANGLER(RTTimeNanoTSLFenceAsyncUseRdtscpGroupChNumCl_EndProc)
 # define RTTimeNanoTSLFenceAsyncUseIdtrLim                      RT_MANGLER(RTTimeNanoTSLFenceAsyncUseIdtrLim)
 # define RTTimeNanoTSLFenceAsyncUseIdtrLim_EndProc              RT_MANGLER(RTTimeNanoTSLFenceAsyncUseIdtrLim_EndProc)
Index: /trunk/include/iprt/nt/nt.h
===================================================================
--- /trunk/include/iprt/nt/nt.h	(revision 64254)
+++ /trunk/include/iprt/nt/nt.h	(revision 64255)
@@ -2390,4 +2390,15 @@
 /*NTSYSAPI ULONG NTAPI RtlNtStatusToDosError(NTSTATUS rcNt);*/
 
+/** @def RTL_QUERY_REGISTRY_TYPECHECK
+ * WDK 8.1+, backported in updates, ignored in older. */
+#if !defined(RTL_QUERY_REGISTRY_TYPECHECK) || defined(DOXYGEN_RUNNING)
+# define RTL_QUERY_REGISTRY_TYPECHECK       UINT32_C(0x00000100)
+#endif
+/** @def RTL_QUERY_REGISTRY_TYPECHECK_SHIFT
+ * WDK 8.1+, backported in updates, ignored in older. */
+#if !defined(RTL_QUERY_REGISTRY_TYPECHECK_SHIFT) || defined(DOXYGEN_RUNNING)
+# define RTL_QUERY_REGISTRY_TYPECHECK_SHIFT 24
+#endif
+
 
 RT_C_DECLS_END
Index: /trunk/include/iprt/time.h
===================================================================
--- /trunk/include/iprt/time.h	(revision 64254)
+++ /trunk/include/iprt/time.h	(revision 64255)
@@ -954,4 +954,5 @@
 RTDECL(uint64_t) RTTimeNanoTSLegacyAsyncUseApicId(PRTTIMENANOTSDATA pData);
 RTDECL(uint64_t) RTTimeNanoTSLegacyAsyncUseRdtscp(PRTTIMENANOTSDATA pData);
+RTDECL(uint64_t) RTTimeNanoTSLegacyAsyncUseRdtscpGroupChNumCl(PRTTIMENANOTSDATA pData);
 RTDECL(uint64_t) RTTimeNanoTSLegacyAsyncUseIdtrLim(PRTTIMENANOTSDATA pData);
 RTDECL(uint64_t) RTTimeNanoTSLegacySyncInvarWithDeltaUseApicId(PRTTIMENANOTSDATA pData);
@@ -960,4 +961,5 @@
 RTDECL(uint64_t) RTTimeNanoTSLFenceAsyncUseApicId(PRTTIMENANOTSDATA pData);
 RTDECL(uint64_t) RTTimeNanoTSLFenceAsyncUseRdtscp(PRTTIMENANOTSDATA pData);
+RTDECL(uint64_t) RTTimeNanoTSLFenceAsyncUseRdtscpGroupChNumCl(PRTTIMENANOTSDATA pData);
 RTDECL(uint64_t) RTTimeNanoTSLFenceAsyncUseIdtrLim(PRTTIMENANOTSDATA pData);
 RTDECL(uint64_t) RTTimeNanoTSLFenceSyncInvarWithDeltaUseApicId(PRTTIMENANOTSDATA pData);
Index: /trunk/src/VBox/HostDrivers/Support/SUPDrvGip.cpp
===================================================================
--- /trunk/src/VBox/HostDrivers/Support/SUPDrvGip.cpp	(revision 64254)
+++ /trunk/src/VBox/HostDrivers/Support/SUPDrvGip.cpp	(revision 64255)
@@ -274,4 +274,6 @@
             && RT_IS_POWER_OF_TWO(RTCPUSET_MAX_CPUS))
         {
+            PSUPGIPCPU pGipCpu = SUPGetGipCpuBySetIndex(pGip, iCpuSet);
+
             /*
              * Check whether the IDTR.LIMIT contains a CPU number.
@@ -305,5 +307,6 @@
                     && (ASMCpuId_EDX(UINT32_C(0x80000001)) & X86_CPUID_EXT_FEATURE_EDX_RDTSCP) )
                 {
-                    uint32_t uAux;
+                    uint32_t const  uGroupedAux = (uint8_t)pGipCpu->iCpuGroupMember | ((uint32_t)pGipCpu->iCpuGroup << 8);
+                    uint32_t        uAux;
                     ASMReadTscWithAux(&uAux);
                     if ((uAux & (RTCPUSET_MAX_CPUS - 1)) == idCpu)
@@ -313,4 +316,13 @@
                         if ((uAux & (RTCPUSET_MAX_CPUS - 1)) == idCpu)
                             fSupported |= SUPGIPGETCPU_RDTSCP_MASK_MAX_SET_CPUS;
+                    }
+
+                    if (   (uAux & UINT16_MAX) == uGroupedAux
+                        && pGipCpu->iCpuGroupMember <= UINT8_MAX)
+                    {
+                        ASMNopPause();
+                        ASMReadTscWithAux(&uAux);
+                        if ((uAux & UINT16_MAX) == uGroupedAux)
+                            fSupported |= SUPGIPGETCPU_RDTSCP_GROUP_IN_CH_NUMBER_IN_CL;
                     }
                 }
@@ -1259,9 +1271,9 @@
 static void supdrvGipMpEventOnlineOrInitOnCpu(PSUPDRVDEVEXT pDevExt, RTCPUID idCpu)
 {
-    int         iCpuSet = 0;
-    uint16_t    idApic = UINT16_MAX;
-    uint32_t    i = 0;
-    uint64_t    u64NanoTS = 0;
-    PSUPGLOBALINFOPAGE pGip = pDevExt->pGip;
+    PSUPGLOBALINFOPAGE  pGip      = pDevExt->pGip;
+    int                 iCpuSet   = 0;
+    uint16_t            idApic    = UINT16_MAX;
+    uint32_t            i         = 0;
+    uint64_t            u64NanoTS = 0;
 
     AssertPtrReturnVoid(pGip);
@@ -1301,4 +1313,10 @@
     ASMAtomicWriteS16(&pGip->aCPUs[i].iCpuSet, (int16_t)iCpuSet);
     ASMAtomicWriteSize(&pGip->aCPUs[i].idCpu,  idCpu);
+
+    pGip->aCPUs[i].iCpuGroup = 0;
+    pGip->aCPUs[i].iCpuGroupMember = iCpuSet;
+#ifdef RT_OS_WINDOWS
+    pGip->aCPUs[i].iCpuGroup = supdrvOSGipGetGroupFromCpu(pDevExt, idCpu, &pGip->aCPUs[i].iCpuGroupMember);
+#endif
 
     /*
@@ -1683,8 +1701,11 @@
     pCpu->i64TSCDelta        = pGip->enmUseTscDelta > SUPGIPUSETSCDELTA_ZERO_CLAIMED ? INT64_MAX : 0;
 
-    ASMAtomicWriteSize(&pCpu->enmState, SUPGIPCPUSTATE_INVALID);
-    ASMAtomicWriteU32(&pCpu->idCpu,     NIL_RTCPUID);
-    ASMAtomicWriteS16(&pCpu->iCpuSet,   -1);
-    ASMAtomicWriteU16(&pCpu->idApic,    UINT16_MAX);
+    ASMAtomicWriteSize(&pCpu->enmState,             SUPGIPCPUSTATE_INVALID);
+    ASMAtomicWriteU32(&pCpu->idCpu,                 NIL_RTCPUID);
+    ASMAtomicWriteS16(&pCpu->iCpuSet,               -1);
+    ASMAtomicWriteU16(&pCpu->iCpuGroup,             0);
+    ASMAtomicWriteU16(&pCpu->iCpuGroupMember,       UINT16_MAX);
+    ASMAtomicWriteU16(&pCpu->idApic,                UINT16_MAX);
+    ASMAtomicWriteU32(&pCpu->iReservedForNumaNode,  0);
 
     /*
@@ -1764,4 +1785,5 @@
     pGip->cPresentCpus            = RTMpGetPresentCount();
     pGip->cPossibleCpus           = RTMpGetCount();
+    pGip->cPossibleCpuGroups      = 1;
     pGip->idCpuMax                = RTMpGetMaxCpuId();
     for (i = 0; i < RT_ELEMENTS(pGip->aiCpuFromApicId); i++)
@@ -1769,4 +1791,10 @@
     for (i = 0; i < RT_ELEMENTS(pGip->aiCpuFromCpuSetIdx); i++)
         pGip->aiCpuFromCpuSetIdx[i] = UINT16_MAX;
+    pGip->aiFirstCpuSetIdxFromCpuGroup[0] = 0;
+    for (i = 1; i < RT_ELEMENTS(pGip->aiFirstCpuSetIdxFromCpuGroup); i++)
+        pGip->aiFirstCpuSetIdxFromCpuGroup[i] = UINT16_MAX;
+#ifdef RT_OS_WINDOWS
+    supdrvOSInitGipGroupTable(pDevExt, pGip);
+#endif
     for (i = 0; i < cCpus; i++)
         supdrvGipInitCpu(pGip, &pGip->aCPUs[i], u64NanoTS, 0 /*uCpuHz*/);
@@ -2394,5 +2422,7 @@
      * run.
      */
-    if (RT_UNLIKELY(iTick == 1))
+    if (RT_LIKELY(iTick != 1))
+    { /* likely*/ }
+    else
     {
         iCpu = supdrvGipFindOrAllocCpuIndexForCpuId(pGip, idCpu);
Index: /trunk/src/VBox/HostDrivers/Support/SUPDrvIOC.h
===================================================================
--- /trunk/src/VBox/HostDrivers/Support/SUPDrvIOC.h	(revision 64254)
+++ /trunk/src/VBox/HostDrivers/Support/SUPDrvIOC.h	(revision 64255)
@@ -215,5 +215,5 @@
  *          - nothing.
  */
-#define SUPDRV_IOC_VERSION                              0x00260000
+#define SUPDRV_IOC_VERSION                              0x00270000
 
 /** SUP_IOCTL_COOKIE. */
Index: /trunk/src/VBox/HostDrivers/Support/SUPDrvInternal.h
===================================================================
--- /trunk/src/VBox/HostDrivers/Support/SUPDrvInternal.h	(revision 64254)
+++ /trunk/src/VBox/HostDrivers/Support/SUPDrvInternal.h	(revision 64255)
@@ -800,4 +800,27 @@
 void VBOXCALL   supdrvOSSessionHashTabRemoved(PSUPDRVDEVEXT pDevExt, PSUPDRVSESSION pSession, void *pvUser);
 
+/**
+ * Called during GIP initialization to set up the group table and group count.
+ *
+ * This is currently only implemented on windows [lazy bird].
+ *
+ * @param   pDevExt             The device globals.
+ * @param   pGip                The GIP whose group table needs initialization.
+ *                              It's only partially initialized at this point.
+ */
+void VBOXCALL   supdrvOSInitGipGroupTable(PSUPDRVDEVEXT pDevExt, PSUPGLOBALINFOPAGE pGip);
+
+/**
+ * Gets the CPU group and member indexes for the given CPU ID.
+ *
+ * This is currently only implemented on windows [lazy bird].
+ *
+ * @returns CPU group number.
+ * @param   pDevExt             The device globals.
+ * @param   idCpu               The ID of the CPU.
+ * @param   piCpuGroupMember    Where to return the group member number.
+ */
+uint16_t VBOXCALL supdrvOSGipGetGroupFromCpu(PSUPDRVDEVEXT pDevExt, RTCPUID idCpu, uint16_t *piCpuGroupMember);
+
 void VBOXCALL   supdrvOSObjInitCreator(PSUPDRVOBJ pObj, PSUPDRVSESSION pSession);
 bool VBOXCALL   supdrvOSObjCanAccess(PSUPDRVOBJ pObj, PSUPDRVSESSION pSession, const char *pszObjName, int *prc);
Index: /trunk/src/VBox/HostDrivers/Support/testcase/tstGIP-2.cpp
===================================================================
--- /trunk/src/VBox/HostDrivers/Support/testcase/tstGIP-2.cpp	(revision 64254)
+++ /trunk/src/VBox/HostDrivers/Support/testcase/tstGIP-2.cpp	(revision 64255)
@@ -131,15 +131,23 @@
                 SUPR3GipSetFlags(SUPGIP_FLAGS_TESTING_ENABLE, UINT32_MAX);
 
-            RTPrintf("tstGIP-2: cCpus=%d  u32UpdateHz=%RU32  u32UpdateIntervalNS=%RU32  u64NanoTSLastUpdateHz=%RX64  u64CpuHz=%RU64  uCpuHzRef=%RU64  u32Mode=%d (%s)  fTestMode=%RTbool  u32Version=%#x\n",
+            RTPrintf("tstGIP-2: u32Mode=%d (%s)  fTestMode=%RTbool  u32Version=%#x  fGetGipCpu=%#RX32\n",
+                     g_pSUPGlobalInfoPage->u32Mode,
+                     SUPGetGIPModeName(g_pSUPGlobalInfoPage),
+                     fTestMode,
+                     g_pSUPGlobalInfoPage->u32Version,
+                     g_pSUPGlobalInfoPage->fGetGipCpu);
+            RTPrintf("tstGIP-2: cCpus=%d  cPossibleCpus=%d cPossibleCpuGroups=%d cPresentCpus=%d cOnlineCpus=%d\n",
                      g_pSUPGlobalInfoPage->cCpus,
+                     g_pSUPGlobalInfoPage->cPossibleCpus,
+                     g_pSUPGlobalInfoPage->cPossibleCpuGroups,
+                     g_pSUPGlobalInfoPage->cPresentCpus,
+                     g_pSUPGlobalInfoPage->cOnlineCpus);
+            RTPrintf("tstGIP-2: u32UpdateHz=%RU32  u32UpdateIntervalNS=%RU32  u64NanoTSLastUpdateHz=%RX64  u64CpuHz=%RU64  uCpuHzRef=%RU64\n",
                      g_pSUPGlobalInfoPage->u32UpdateHz,
                      g_pSUPGlobalInfoPage->u32UpdateIntervalNS,
                      g_pSUPGlobalInfoPage->u64NanoTSLastUpdateHz,
                      g_pSUPGlobalInfoPage->u64CpuHz,
-                     uCpuHzRef,
-                     g_pSUPGlobalInfoPage->u32Mode,
-                     SUPGetGIPModeName(g_pSUPGlobalInfoPage),
-                     fTestMode,
-                     g_pSUPGlobalInfoPage->u32Version);
+                     uCpuHzRef);
+
             RTPrintf(fHex
                      ? "tstGIP-2:     it: u64NanoTS        delta     u64TSC           UpIntTSC H  TransId      CpuHz      %sTSC Interval History...\n"
Index: /trunk/src/VBox/HostDrivers/Support/win/SUPDrv-win.cpp
===================================================================
--- /trunk/src/VBox/HostDrivers/Support/win/SUPDrv-win.cpp	(revision 64254)
+++ /trunk/src/VBox/HostDrivers/Support/win/SUPDrv-win.cpp	(revision 64255)
@@ -359,4 +359,38 @@
 #endif /* VBOXDRV_WITH_FAST_IO */
 
+/** Default ZERO value. */
+static ULONG                        g_fOptDefaultZero = 0;
+/** Registry values.
+ * We wrap these in a struct to ensure they are followed by a little zero
+ * padding in order to limit the chance of trouble on unpatched systems.  */
+struct
+{
+    /** The ForceAsyncTsc registry value. */
+    ULONG                           fOptForceAsyncTsc;
+    /** Padding. */
+    uint64_t                        au64Padding[2];
+}                                   g_Options = { FALSE, 0, 0 };
+/** Registry query table for RtlQueryRegistryValues. */
+static RTL_QUERY_REGISTRY_TABLE     g_aRegValues[] =
+{
+    {
+        /* .QueryRoutine = */   NULL,
+        /* .Flags = */          RTL_QUERY_REGISTRY_DIRECT | RTL_QUERY_REGISTRY_TYPECHECK,
+        /* .Name = */           L"ForceAsyncTsc",
+        /* .EntryContext = */   &g_Options.fOptForceAsyncTsc,
+        /* .DefaultType = */    (REG_DWORD << RTL_QUERY_REGISTRY_TYPECHECK_SHIFT) | REG_DWORD,
+        /* .DefaultData = */    &g_fOptDefaultZero,
+        /* .DefaultLength = */  sizeof(g_fOptDefaultZero),
+    },
+    {   NULL, 0, NULL, NULL, 0, NULL, 0 } /* terminator entry. */
+};
+
+/** Pointer to KeQueryMaximumGroupCount. */
+static PFNKEQUERYMAXIMUMGROUPCOUNT      g_pfnKeQueryMaximumGroupCount = NULL;
+/** Pointer to KeGetProcessorIndexFromNumber. */
+static PFNKEGETPROCESSORINDEXFROMNUMBER g_pfnKeGetProcessorIndexFromNumber = NULL;
+/** Pointer to KeGetProcessorNumberFromIndex. */
+static PFNKEGETPROCESSORNUMBERFROMINDEX g_pfnKeGetProcessorNumberFromIndex = NULL;
+
 #ifdef VBOX_WITH_HARDENING
 /** Pointer to the stub device instance. */
@@ -554,7 +588,41 @@
 
     /*
+     * Query options first so any overflows on unpatched machines will do less
+     * harm (see MS11-011 / 2393802 / 2011-03-18).
+     *
+     * Unfortunately, pRegPath isn't documented as zero terminated, even if it
+     * quite likely always is, so we have to make a copy here.
+     */
+    NTSTATUS rcNt;
+    PWSTR pwszCopy = (PWSTR)ExAllocatePoolWithTag(NonPagedPool, pRegPath->Length + sizeof(WCHAR), 'VBox');
+    if (pwszCopy)
+    {
+        memcpy(pwszCopy, pRegPath->Buffer, pRegPath->Length);
+        pwszCopy[pRegPath->Length / sizeof(WCHAR)] = '\0';
+        rcNt = RtlQueryRegistryValues(RTL_REGISTRY_ABSOLUTE | RTL_REGISTRY_OPTIONAL, pwszCopy,
+                                      g_aRegValues, NULL /*pvContext*/, NULL /*pvEnv*/);
+        ExFreePoolWithTag(pwszCopy, 'VBox');
+        /* Probably safe to ignore rcNt here. */
+    }
+
+    /*
+     * Resolve methods we want but which aren't available everywhere.
+     */
+    UNICODE_STRING RoutineName;
+    RtlInitUnicodeString(&RoutineName, L"KeQueryMaximumGroupCount");
+    g_pfnKeQueryMaximumGroupCount = (PFNKEQUERYMAXIMUMGROUPCOUNT)MmGetSystemRoutineAddress(&RoutineName);
+
+    RtlInitUnicodeString(&RoutineName, L"KeGetProcessorIndexFromNumber");
+    g_pfnKeGetProcessorIndexFromNumber = (PFNKEGETPROCESSORINDEXFROMNUMBER)MmGetSystemRoutineAddress(&RoutineName);
+
+    RtlInitUnicodeString(&RoutineName, L"KeGetProcessorNumberFromIndex");
+    g_pfnKeGetProcessorNumberFromIndex = (PFNKEGETPROCESSORNUMBERFROMINDEX)MmGetSystemRoutineAddress(&RoutineName);
+
+    Assert(   (g_pfnKeGetProcessorNumberFromIndex != NULL) == (g_pfnKeGetProcessorIndexFromNumber != NULL)
+           && (g_pfnKeGetProcessorNumberFromIndex != NULL) == (g_pfnKeQueryMaximumGroupCount != NULL)); /* all or nothing. */
+
+    /*
      * Initialize the runtime (IPRT).
      */
-    NTSTATUS rcNt;
     int vrc = RTR0Init(0);
     if (RT_SUCCESS(vrc))
@@ -1643,4 +1711,71 @@
 
 
+void VBOXCALL supdrvOSInitGipGroupTable(PSUPDRVDEVEXT pDevExt, PSUPGLOBALINFOPAGE pGip)
+{
+    NOREF(pDevExt);
+
+    /*
+     * Fill in the group count and table; indexes are assigned in group order (see initterm-r0drv-nt.cpp).
+     */
+    if (   g_pfnKeQueryMaximumGroupCount
+        && g_pfnKeGetProcessorIndexFromNumber)
+    {
+        unsigned cGroups = g_pfnKeQueryMaximumGroupCount();
+        AssertStmt(cGroups > 0, cGroups = 1);
+        AssertStmt(cGroups <= RT_ELEMENTS(pGip->aiFirstCpuSetIdxFromCpuGroup), /* equal still fits the table */
+                   cGroups = RT_ELEMENTS(pGip->aiFirstCpuSetIdxFromCpuGroup));
+        pGip->cPossibleCpuGroups = cGroups;
+
+        KEPROCESSORINDEX idxCpuMin = 0;
+        for (unsigned iGroup = 0; iGroup < cGroups; iGroup++)
+        {
+            PROCESSOR_NUMBER ProcNum;
+            ProcNum.Group    = (USHORT)iGroup;
+            ProcNum.Number   = 0;
+            ProcNum.Reserved = 0;
+            KEPROCESSORINDEX idxCpu = g_pfnKeGetProcessorIndexFromNumber(&ProcNum);
+            Assert(idxCpu != INVALID_PROCESSOR_INDEX);
+            Assert(idxCpu >= idxCpuMin);
+            idxCpuMin = idxCpu;
+            pGip->aiFirstCpuSetIdxFromCpuGroup[iGroup] = (uint16_t)idxCpu;
+        }
+    }
+    else
+    {
+        Assert(!g_pfnKeQueryMaximumGroupCount);
+        Assert(!g_pfnKeGetProcessorIndexFromNumber);
+        /* No processor group API: a single group starting at CPU set index 0. */
+        pGip->cPossibleCpuGroups              = 1;
+        pGip->aiFirstCpuSetIdxFromCpuGroup[0] = 0;
+    }
+}
+
+
+uint16_t VBOXCALL supdrvOSGipGetGroupFromCpu(PSUPDRVDEVEXT pDevExt, RTCPUID idCpu, uint16_t *piCpuGroupMember)
+{
+    NOREF(pDevExt);
+
+    /*
+     * Ask KeGetProcessorNumberFromIndex when the kernel exports it.
+     */
+    if (g_pfnKeGetProcessorNumberFromIndex)
+    {
+        PROCESSOR_NUMBER ProcNum = { UINT16_MAX, UINT8_MAX, 0 };
+        NTSTATUS rcNt = g_pfnKeGetProcessorNumberFromIndex(idCpu, &ProcNum);
+        if (NT_SUCCESS(rcNt))
+        {
+            Assert(ProcNum.Group < g_pfnKeQueryMaximumGroupCount());
+            *piCpuGroupMember = ProcNum.Number;
+            return ProcNum.Group;
+        }
+
+        AssertMsgFailed(("rcNt=%#x for idCpu=%u\n", rcNt, idCpu));
+    }
+
+    /* No group API or the lookup failed: report group 0 with the CPU index as member. */
+    *piCpuGroupMember = (uint16_t)idCpu;
+    return 0;
+}
+
+
 /**
  * Initializes any OS specific object creator fields.
@@ -1680,5 +1815,5 @@
 {
     RT_NOREF1(pDevExt);
-    return false;
+    return g_Options.fOptForceAsyncTsc != 0;
 }
 
Index: /trunk/src/VBox/Runtime/common/time/timesup.cpp
===================================================================
--- /trunk/src/VBox/Runtime/common/time/timesup.cpp	(revision 64254)
+++ /trunk/src/VBox/Runtime/common/time/timesup.cpp	(revision 64255)
@@ -203,4 +203,6 @@
                           : pGip->fGetGipCpu & SUPGIPGETCPU_RDTSCP_MASK_MAX_SET_CPUS
                           ? RTTimeNanoTSLFenceAsyncUseRdtscp
+                          : pGip->fGetGipCpu & SUPGIPGETCPU_RDTSCP_GROUP_IN_CH_NUMBER_IN_CL
+                          ? RTTimeNanoTSLFenceAsyncUseRdtscpGroupChNumCl
                           : pGip->fGetGipCpu & SUPGIPGETCPU_APIC_ID
                           ? RTTimeNanoTSLFenceAsyncUseApicId
@@ -239,4 +241,6 @@
                 pfnWorker = pGip->fGetGipCpu & SUPGIPGETCPU_RDTSCP_MASK_MAX_SET_CPUS
                           ? RTTimeNanoTSLegacyAsyncUseRdtscp
+                          : pGip->fGetGipCpu & SUPGIPGETCPU_RDTSCP_GROUP_IN_CH_NUMBER_IN_CL
+                          ? RTTimeNanoTSLegacyAsyncUseRdtscpGroupChNumCl
                           : pGip->fGetGipCpu & SUPGIPGETCPU_IDTR_LIMIT_MASK_MAX_SET_CPUS
                           ? RTTimeNanoTSLegacyAsyncUseIdtrLim
Index: /trunk/src/VBox/Runtime/common/time/timesupref.cpp
===================================================================
--- /trunk/src/VBox/Runtime/common/time/timesupref.cpp	(revision 64254)
+++ /trunk/src/VBox/Runtime/common/time/timesupref.cpp	(revision 64255)
@@ -112,4 +112,11 @@
 RT_EXPORT_SYMBOL(RTTimeNanoTSLegacyAsyncUseIdtrLim);
 
+# undef  TMPL_GET_CPU_METHOD
+# define TMPL_GET_CPU_METHOD     SUPGIPGETCPU_RDTSCP_GROUP_IN_CH_NUMBER_IN_CL
+# undef  rtTimeNanoTSInternalRef
+# define rtTimeNanoTSInternalRef RTTimeNanoTSLegacyAsyncUseRdtscpGroupChNumCl
+# include "timesupref.h"
+RT_EXPORT_SYMBOL(RTTimeNanoTSLegacyAsyncUseRdtscpGroupChNumCl);
+
 #else  /* IN_RC || IN_RING0: Disable interrupts and call getter function. */
 
@@ -196,4 +203,11 @@
 RT_EXPORT_SYMBOL(RTTimeNanoTSLFenceAsyncUseIdtrLim);
 
+# undef  TMPL_GET_CPU_METHOD
+# define TMPL_GET_CPU_METHOD     SUPGIPGETCPU_RDTSCP_GROUP_IN_CH_NUMBER_IN_CL
+# undef  rtTimeNanoTSInternalRef
+# define rtTimeNanoTSInternalRef RTTimeNanoTSLFenceAsyncUseRdtscpGroupChNumCl
+# include "timesupref.h"
+RT_EXPORT_SYMBOL(RTTimeNanoTSLFenceAsyncUseRdtscpGroupChNumCl);
+
 #else  /* IN_RC || IN_RING0: Disable interrupts and call getter function. */
 
Index: /trunk/src/VBox/Runtime/common/time/timesupref.h
===================================================================
--- /trunk/src/VBox/Runtime/common/time/timesupref.h	(revision 64254)
+++ /trunk/src/VBox/Runtime/common/time/timesupref.h	(revision 64255)
@@ -91,5 +91,6 @@
             uint8_t  const  idApic   = ASMGetApicId();
             uint16_t const  iGipCpu  = pGip->aiCpuFromApicId[idApic];
-# elif TMPL_GET_CPU_METHOD == SUPGIPGETCPU_RDTSCP_MASK_MAX_SET_CPUS
+# elif TMPL_GET_CPU_METHOD == SUPGIPGETCPU_RDTSCP_MASK_MAX_SET_CPUS \
+    || TMPL_GET_CPU_METHOD == SUPGIPGETCPU_RDTSCP_GROUP_IN_CH_NUMBER_IN_CL
 #  if TMPL_MODE != TMPL_MODE_ASYNC
             uint32_t const  u32TransactionId = pGip->aCPUs[0].u32TransactionId;
@@ -97,5 +98,9 @@
             uint32_t        uAux;
             ASMReadTscWithAux(&uAux);
+#  if TMPL_GET_CPU_METHOD == SUPGIPGETCPU_RDTSCP_MASK_MAX_SET_CPUS
             uint16_t const  iCpuSet  = uAux & (RTCPUSET_MAX_CPUS - 1);
+#  else
+            uint16_t const  iCpuSet  = pGip->aiFirstCpuSetIdxFromCpuGroup[(uAux >> 8) & UINT8_MAX] + (uAux & UINT8_MAX);
+#  endif
             uint16_t const  iGipCpu  = pGip->aiCpuFromCpuSetIdx[iCpuSet];
 # elif TMPL_GET_CPU_METHOD == SUPGIPGETCPU_IDTR_LIMIT_MASK_MAX_SET_CPUS
@@ -124,5 +129,6 @@
 #elif TMPL_MODE != TMPL_MODE_ASYNC \
    && TMPL_GET_CPU_METHOD != SUPGIPGETCPU_APIC_ID \
-   && TMPL_GET_CPU_METHOD != SUPGIPGETCPU_RDTSCP_MASK_MAX_SET_CPUS
+   && TMPL_GET_CPU_METHOD != SUPGIPGETCPU_RDTSCP_MASK_MAX_SET_CPUS \
+   && TMPL_GET_CPU_METHOD != SUPGIPGETCPU_RDTSCP_GROUP_IN_CH_NUMBER_IN_CL
                 uint32_t const u32TransactionId = pGip->aCPUs[0].u32TransactionId;
                 ASMCompilerBarrier();
@@ -149,5 +155,6 @@
 #endif
                 uint64_t u64PrevNanoTS          = ASMAtomicUoReadU64(pData->pu64Prev);
-#if TMPL_GET_CPU_METHOD == SUPGIPGETCPU_RDTSCP_MASK_MAX_SET_CPUS
+#if TMPL_GET_CPU_METHOD == SUPGIPGETCPU_RDTSCP_MASK_MAX_SET_CPUS \
+ || TMPL_GET_CPU_METHOD == SUPGIPGETCPU_RDTSCP_GROUP_IN_CH_NUMBER_IN_CL
                 ASMCompilerBarrier();
                 uint32_t uAux2;
@@ -168,5 +175,6 @@
 # if   TMPL_GET_CPU_METHOD == SUPGIPGETCPU_APIC_ID
                 if (RT_LIKELY(ASMGetApicId() == idApic))
-# elif TMPL_GET_CPU_METHOD == SUPGIPGETCPU_RDTSCP_MASK_MAX_SET_CPUS
+# elif TMPL_GET_CPU_METHOD == SUPGIPGETCPU_RDTSCP_MASK_MAX_SET_CPUS \
+    || TMPL_GET_CPU_METHOD == SUPGIPGETCPU_RDTSCP_GROUP_IN_CH_NUMBER_IN_CL
                 if (RT_LIKELY(uAux2 == uAux))
 # elif TMPL_GET_CPU_METHOD == SUPGIPGETCPU_IDTR_LIMIT_MASK_MAX_SET_CPUS
Index: /trunk/src/VBox/VMM/VMMAll/TMAllVirtual.cpp
===================================================================
--- /trunk/src/VBox/VMM/VMMAll/TMAllVirtual.cpp	(revision 64254)
+++ /trunk/src/VBox/VMM/VMMAll/TMAllVirtual.cpp	(revision 64255)
@@ -119,4 +119,6 @@
             else if (pGip->fGetGipCpu & SUPGIPGETCPU_RDTSCP_MASK_MAX_SET_CPUS)
                 pfnWorker = fLFence ? RTTimeNanoTSLFenceAsyncUseRdtscp      : RTTimeNanoTSLegacyAsyncUseRdtscp;
+            else if (pGip->fGetGipCpu & SUPGIPGETCPU_RDTSCP_GROUP_IN_CH_NUMBER_IN_CL)
+                pfnWorker = fLFence ? RTTimeNanoTSLFenceAsyncUseRdtscpGroupChNumCl : RTTimeNanoTSLegacyAsyncUseRdtscpGroupChNumCl;
             else
                 pfnWorker = fLFence ? RTTimeNanoTSLFenceAsyncUseApicId      : RTTimeNanoTSLegacyAsyncUseApicId;
