VirtualBox

source: vbox/trunk/src/VBox/VMM/VMMAll/PGMAllPool.cpp@ 100594

Last change on this file since 100594 was 100577, checked in by vboxsync, 17 months ago

VMM/PGM: Nested VMX: bugref:10318 More info on assertion.

  • Property svn:eol-style set to native
  • Property svn:keywords set to Id Revision
File size: 225.0 KB
Line 
1/* $Id: PGMAllPool.cpp 100577 2023-07-14 14:00:25Z vboxsync $ */
2/** @file
3 * PGM Shadow Page Pool.
4 */
5
6/*
7 * Copyright (C) 2006-2023 Oracle and/or its affiliates.
8 *
9 * This file is part of VirtualBox base platform packages, as
10 * available from https://www.virtualbox.org.
11 *
12 * This program is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU General Public License
14 * as published by the Free Software Foundation, in version 3 of the
15 * License.
16 *
17 * This program is distributed in the hope that it will be useful, but
18 * WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20 * General Public License for more details.
21 *
22 * You should have received a copy of the GNU General Public License
23 * along with this program; if not, see <https://www.gnu.org/licenses>.
24 *
25 * SPDX-License-Identifier: GPL-3.0-only
26 */
27
28
29/*********************************************************************************************************************************
30* Header Files *
31*********************************************************************************************************************************/
32#define LOG_GROUP LOG_GROUP_PGM_POOL
33#define VBOX_WITHOUT_PAGING_BIT_FIELDS /* 64-bit bitfields are just asking for trouble. See @bugref{9841} and others. */
34#include <VBox/vmm/pgm.h>
35#include <VBox/vmm/mm.h>
36#include <VBox/vmm/em.h>
37#include <VBox/vmm/cpum.h>
38#include "PGMInternal.h"
39#include <VBox/vmm/vmcc.h>
40#include "PGMInline.h"
41#include <VBox/vmm/hm_vmx.h>
42
43#include <VBox/log.h>
44#include <VBox/err.h>
45#include <iprt/asm.h>
46#include <iprt/string.h>
47
48
49/*********************************************************************************************************************************
50* Internal Functions *
51*********************************************************************************************************************************/
52RT_C_DECLS_BEGIN
53#if 0 /* unused */
54DECLINLINE(unsigned) pgmPoolTrackGetShadowEntrySize(PGMPOOLKIND enmKind);
55DECLINLINE(unsigned) pgmPoolTrackGetGuestEntrySize(PGMPOOLKIND enmKind);
56#endif /* unused */
57static void pgmPoolTrackClearPageUsers(PPGMPOOL pPool, PPGMPOOLPAGE pPage);
58static void pgmPoolTrackDeref(PPGMPOOL pPool, PPGMPOOLPAGE pPage);
59static int pgmPoolTrackAddUser(PPGMPOOL pPool, PPGMPOOLPAGE pPage, uint16_t iUser, uint32_t iUserTable);
60static void pgmPoolMonitorModifiedRemove(PPGMPOOL pPool, PPGMPOOLPAGE pPage);
61#if defined(LOG_ENABLED) || defined(VBOX_STRICT)
62static const char *pgmPoolPoolKindToStr(uint8_t enmKind);
63#endif
64#if 0 /*defined(VBOX_STRICT) && defined(PGMPOOL_WITH_OPTIMIZED_DIRTY_PT)*/
65static void pgmPoolTrackCheckPTPaePae(PPGMPOOL pPool, PPGMPOOLPAGE pPage, PPGMSHWPTPAE pShwPT, PCX86PTPAE pGstPT);
66#endif
67
68int pgmPoolTrackFlushGCPhysPTsSlow(PVMCC pVM, PPGMPAGE pPhysPage);
69PPGMPOOLPHYSEXT pgmPoolTrackPhysExtAlloc(PVMCC pVM, uint16_t *piPhysExt);
70void pgmPoolTrackPhysExtFree(PVMCC pVM, uint16_t iPhysExt);
71void pgmPoolTrackPhysExtFreeList(PVMCC pVM, uint16_t iPhysExt);
72
73RT_C_DECLS_END
74
75
#if 0 /* unused */
/**
 * Tells whether a shadow page-pool kind shadows a large (4MB or 2MB) guest page.
 *
 * @returns true if it's the shadow of a 4MB or 2MB guest page, otherwise false.
 * @param   enmKind     The page kind.
 */
DECLINLINE(bool) pgmPoolIsBigPage(PGMPOOLKIND enmKind)
{
    /* Only the three PT-for-big-page kinds qualify; everything else is small. */
    return enmKind == PGMPOOLKIND_32BIT_PT_FOR_32BIT_4MB
        || enmKind == PGMPOOLKIND_PAE_PT_FOR_32BIT_4MB
        || enmKind == PGMPOOLKIND_PAE_PT_FOR_PAE_2MB;
}
#endif /* unused */
96
97
98/**
99 * Flushes a chain of pages sharing the same access monitor.
100 *
101 * @param pPool The pool.
102 * @param pPage A page in the chain.
103 */
104void pgmPoolMonitorChainFlush(PPGMPOOL pPool, PPGMPOOLPAGE pPage)
105{
106 LogFlow(("pgmPoolMonitorChainFlush: Flush page %RGp type=%d\n", pPage->GCPhys, pPage->enmKind));
107
108 /*
109 * Find the list head.
110 */
111 uint16_t idx = pPage->idx;
112 if (pPage->iMonitoredPrev != NIL_PGMPOOL_IDX)
113 {
114 while (pPage->iMonitoredPrev != NIL_PGMPOOL_IDX)
115 {
116 idx = pPage->iMonitoredPrev;
117 Assert(idx != pPage->idx);
118 pPage = &pPool->aPages[idx];
119 }
120 }
121
122 /*
123 * Iterate the list flushing each shadow page.
124 */
125 for (;;)
126 {
127 idx = pPage->iMonitoredNext;
128 Assert(idx != pPage->idx);
129 if (pPage->idx >= PGMPOOL_IDX_FIRST)
130 {
131 int rc2 = pgmPoolFlushPage(pPool, pPage);
132 AssertRC(rc2);
133 }
134 /* next */
135 if (idx == NIL_PGMPOOL_IDX)
136 break;
137 pPage = &pPool->aPages[idx];
138 }
139}
140
141
142/**
143 * Wrapper for getting the current context pointer to the entry being modified.
144 *
145 * @returns VBox status code suitable for scheduling.
146 * @param pVM The cross context VM structure.
147 * @param pvDst Destination address
148 * @param pvSrc Pointer to the mapping of @a GCPhysSrc or NULL depending
149 * on the context (e.g. \#PF in R0 & RC).
150 * @param GCPhysSrc The source guest physical address.
151 * @param cb Size of data to read
152 */
153DECLINLINE(int) pgmPoolPhysSimpleReadGCPhys(PVMCC pVM, void *pvDst, void const *pvSrc, RTGCPHYS GCPhysSrc, size_t cb)
154{
155#if defined(IN_RING3)
156 NOREF(pVM); NOREF(GCPhysSrc);
157 memcpy(pvDst, (RTHCPTR)((uintptr_t)pvSrc & ~(RTHCUINTPTR)(cb - 1)), cb);
158 return VINF_SUCCESS;
159#else
160 /** @todo in RC we could attempt to use the virtual address, although this can cause many faults (PAE Windows XP guest). */
161 NOREF(pvSrc);
162 return PGMPhysSimpleReadGCPhys(pVM, pvDst, GCPhysSrc & ~(RTGCPHYS)(cb - 1), cb);
163#endif
164}
165
166
167/**
168 * Process shadow entries before they are changed by the guest.
169 *
170 * For PT entries we will clear them. For PD entries, we'll simply check
171 * for mapping conflicts and set the SyncCR3 FF if found.
172 *
173 * @param pVCpu The cross context virtual CPU structure.
174 * @param pPool The pool.
175 * @param pPage The head page.
176 * @param GCPhysFault The guest physical fault address.
177 * @param pvAddress Pointer to the mapping of @a GCPhysFault or NULL
178 * depending on the context (e.g. \#PF in R0 & RC).
179 * @param cbWrite Write size; might be zero if the caller knows we're not crossing entry boundaries
180 */
181static void pgmPoolMonitorChainChanging(PVMCPU pVCpu, PPGMPOOL pPool, PPGMPOOLPAGE pPage, RTGCPHYS GCPhysFault,
182 void const *pvAddress, unsigned cbWrite)
183{
184 AssertMsg(pPage->iMonitoredPrev == NIL_PGMPOOL_IDX, ("%u (idx=%u)\n", pPage->iMonitoredPrev, pPage->idx));
185 const unsigned off = GCPhysFault & GUEST_PAGE_OFFSET_MASK;
186 PVMCC pVM = pPool->CTX_SUFF(pVM);
187 NOREF(pVCpu);
188
189 LogFlow(("pgmPoolMonitorChainChanging: %RGv phys=%RGp cbWrite=%d\n",
190 (RTGCPTR)(CTXTYPE(RTGCPTR, uintptr_t, RTGCPTR))(uintptr_t)pvAddress, GCPhysFault, cbWrite));
191
192 if (PGMPOOL_PAGE_IS_NESTED(pPage))
193 Log7Func(("%RGv phys=%RGp cbWrite=%d\n", (RTGCPTR)(CTXTYPE(RTGCPTR, uintptr_t, RTGCPTR))(uintptr_t)pvAddress, GCPhysFault, cbWrite));
194
195 for (;;)
196 {
197 union
198 {
199 void *pv;
200 PX86PT pPT;
201 PPGMSHWPTPAE pPTPae;
202 PX86PD pPD;
203 PX86PDPAE pPDPae;
204 PX86PDPT pPDPT;
205 PX86PML4 pPML4;
206#ifdef VBOX_WITH_NESTED_HWVIRT_VMX_EPT
207 PEPTPDPT pEptPdpt;
208 PEPTPD pEptPd;
209 PEPTPT pEptPt;
210#endif
211 } uShw;
212
213 LogFlow(("pgmPoolMonitorChainChanging: page idx=%d phys=%RGp (next=%d) kind=%s write=%#x\n",
214 pPage->idx, pPage->GCPhys, pPage->iMonitoredNext, pgmPoolPoolKindToStr(pPage->enmKind), cbWrite));
215
216 uShw.pv = NULL;
217 switch (pPage->enmKind)
218 {
219 case PGMPOOLKIND_32BIT_PT_FOR_32BIT_PT:
220 {
221 STAM_COUNTER_INC(&pPool->CTX_MID_Z(StatMonitor,FaultPT));
222 uShw.pv = PGMPOOL_PAGE_2_PTR(pVM, pPage);
223 const unsigned iShw = off / sizeof(X86PTE);
224 LogFlow(("PGMPOOLKIND_32BIT_PT_FOR_32BIT_PT iShw=%x\n", iShw));
225 X86PGUINT const uPde = uShw.pPT->a[iShw].u;
226 if (uPde & X86_PTE_P)
227 {
228 X86PTE GstPte;
229 int rc = pgmPoolPhysSimpleReadGCPhys(pVM, &GstPte, pvAddress, GCPhysFault, sizeof(GstPte));
230 AssertRC(rc);
231 Log4(("pgmPoolMonitorChainChanging 32_32: deref %016RX64 GCPhys %08RX32\n", uPde & X86_PTE_PG_MASK, GstPte.u & X86_PTE_PG_MASK));
232 pgmPoolTracDerefGCPhysHint(pPool, pPage, uPde & X86_PTE_PG_MASK, GstPte.u & X86_PTE_PG_MASK, iShw);
233 ASMAtomicWriteU32(&uShw.pPT->a[iShw].u, 0);
234 }
235 break;
236 }
237
238 /* page/2 sized */
239 case PGMPOOLKIND_PAE_PT_FOR_32BIT_PT:
240 {
241 STAM_COUNTER_INC(&pPool->CTX_MID_Z(StatMonitor,FaultPT));
242 uShw.pv = PGMPOOL_PAGE_2_PTR(pVM, pPage);
243 if (!((off ^ pPage->GCPhys) & (PAGE_SIZE / 2)))
244 {
245 const unsigned iShw = (off / sizeof(X86PTE)) & (X86_PG_PAE_ENTRIES - 1);
246 LogFlow(("PGMPOOLKIND_PAE_PT_FOR_32BIT_PT iShw=%x\n", iShw));
247 if (PGMSHWPTEPAE_IS_P(uShw.pPTPae->a[iShw]))
248 {
249 X86PTE GstPte;
250 int rc = pgmPoolPhysSimpleReadGCPhys(pVM, &GstPte, pvAddress, GCPhysFault, sizeof(GstPte));
251 AssertRC(rc);
252
253 Log4(("pgmPoolMonitorChainChanging pae_32: deref %016RX64 GCPhys %08RX32\n", uShw.pPT->a[iShw].u & X86_PTE_PAE_PG_MASK, GstPte.u & X86_PTE_PG_MASK));
254 pgmPoolTracDerefGCPhysHint(pPool, pPage,
255 PGMSHWPTEPAE_GET_HCPHYS(uShw.pPTPae->a[iShw]),
256 GstPte.u & X86_PTE_PG_MASK,
257 iShw);
258 PGMSHWPTEPAE_ATOMIC_SET(uShw.pPTPae->a[iShw], 0);
259 }
260 }
261 break;
262 }
263
264 case PGMPOOLKIND_PAE_PD0_FOR_32BIT_PD:
265 case PGMPOOLKIND_PAE_PD1_FOR_32BIT_PD:
266 case PGMPOOLKIND_PAE_PD2_FOR_32BIT_PD:
267 case PGMPOOLKIND_PAE_PD3_FOR_32BIT_PD:
268 {
269 unsigned iGst = off / sizeof(X86PDE);
270 unsigned iShwPdpt = iGst / 256;
271 unsigned iShw = (iGst % 256) * 2;
272 uShw.pv = PGMPOOL_PAGE_2_PTR(pVM, pPage);
273
274 LogFlow(("pgmPoolMonitorChainChanging PAE for 32 bits: iGst=%x iShw=%x idx = %d page idx=%d\n", iGst, iShw, iShwPdpt, pPage->enmKind - PGMPOOLKIND_PAE_PD0_FOR_32BIT_PD));
275 STAM_COUNTER_INC(&pPool->CTX_MID_Z(StatMonitor,FaultPD));
276 if (iShwPdpt == pPage->enmKind - (unsigned)PGMPOOLKIND_PAE_PD0_FOR_32BIT_PD)
277 {
278 for (unsigned i = 0; i < 2; i++)
279 {
280 X86PGPAEUINT const uPde = uShw.pPDPae->a[iShw + i].u;
281 if (uPde & X86_PDE_P)
282 {
283 LogFlow(("pgmPoolMonitorChainChanging: pae pd iShw=%#x: %RX64 -> freeing it!\n", iShw + i, uPde));
284 pgmPoolFree(pVM, uPde & X86_PDE_PAE_PG_MASK, pPage->idx, iShw + i);
285 ASMAtomicWriteU64(&uShw.pPDPae->a[iShw + i].u, 0);
286 }
287
288 /* paranoia / a bit assumptive. */
289 if ( (off & 3)
290 && (off & 3) + cbWrite > 4)
291 {
292 const unsigned iShw2 = iShw + 2 + i;
293 if (iShw2 < RT_ELEMENTS(uShw.pPDPae->a))
294 {
295 X86PGPAEUINT const uPde2 = uShw.pPDPae->a[iShw2].u;
296 if (uPde2 & X86_PDE_P)
297 {
298 LogFlow(("pgmPoolMonitorChainChanging: pae pd iShw=%#x: %RX64 -> freeing it!\n", iShw2, uPde2));
299 pgmPoolFree(pVM, uPde2 & X86_PDE_PAE_PG_MASK, pPage->idx, iShw2);
300 ASMAtomicWriteU64(&uShw.pPDPae->a[iShw2].u, 0);
301 }
302 }
303 }
304 }
305 }
306 break;
307 }
308
309 case PGMPOOLKIND_PAE_PT_FOR_PAE_PT:
310 {
311 uShw.pv = PGMPOOL_PAGE_2_PTR(pVM, pPage);
312 const unsigned iShw = off / sizeof(X86PTEPAE);
313 STAM_COUNTER_INC(&pPool->CTX_MID_Z(StatMonitor,FaultPT));
314 if (PGMSHWPTEPAE_IS_P(uShw.pPTPae->a[iShw]))
315 {
316 X86PTEPAE GstPte;
317 int rc = pgmPoolPhysSimpleReadGCPhys(pVM, &GstPte, pvAddress, GCPhysFault, sizeof(GstPte));
318 AssertRC(rc);
319
320 Log4(("pgmPoolMonitorChainChanging pae: deref %016RX64 GCPhys %016RX64\n", PGMSHWPTEPAE_GET_HCPHYS(uShw.pPTPae->a[iShw]), GstPte.u & X86_PTE_PAE_PG_MASK));
321 pgmPoolTracDerefGCPhysHint(pPool, pPage,
322 PGMSHWPTEPAE_GET_HCPHYS(uShw.pPTPae->a[iShw]),
323 GstPte.u & X86_PTE_PAE_PG_MASK,
324 iShw);
325 PGMSHWPTEPAE_ATOMIC_SET(uShw.pPTPae->a[iShw], 0);
326 }
327
328 /* paranoia / a bit assumptive. */
329 if ( (off & 7)
330 && (off & 7) + cbWrite > sizeof(X86PTEPAE))
331 {
332 const unsigned iShw2 = (off + cbWrite - 1) / sizeof(X86PTEPAE);
333 AssertBreak(iShw2 < RT_ELEMENTS(uShw.pPTPae->a));
334
335 if (PGMSHWPTEPAE_IS_P(uShw.pPTPae->a[iShw2]))
336 {
337 X86PTEPAE GstPte;
338 int rc = pgmPoolPhysSimpleReadGCPhys(pVM, &GstPte,
339 pvAddress ? (uint8_t const *)pvAddress + sizeof(GstPte) : NULL,
340 GCPhysFault + sizeof(GstPte), sizeof(GstPte));
341 AssertRC(rc);
342 Log4(("pgmPoolMonitorChainChanging pae: deref %016RX64 GCPhys %016RX64\n", PGMSHWPTEPAE_GET_HCPHYS(uShw.pPTPae->a[iShw2]), GstPte.u & X86_PTE_PAE_PG_MASK));
343 pgmPoolTracDerefGCPhysHint(pPool, pPage,
344 PGMSHWPTEPAE_GET_HCPHYS(uShw.pPTPae->a[iShw2]),
345 GstPte.u & X86_PTE_PAE_PG_MASK,
346 iShw2);
347 PGMSHWPTEPAE_ATOMIC_SET(uShw.pPTPae->a[iShw2], 0);
348 }
349 }
350 break;
351 }
352
353 case PGMPOOLKIND_32BIT_PD:
354 {
355 uShw.pv = PGMPOOL_PAGE_2_PTR(pVM, pPage);
356 const unsigned iShw = off / sizeof(X86PTE); // ASSUMING 32-bit guest paging!
357
358 LogFlow(("pgmPoolMonitorChainChanging: PGMPOOLKIND_32BIT_PD %x\n", iShw));
359 STAM_COUNTER_INC(&pPool->CTX_MID_Z(StatMonitor,FaultPD));
360 X86PGUINT const uPde = uShw.pPD->a[iShw].u;
361 if (uPde & X86_PDE_P)
362 {
363 LogFlow(("pgmPoolMonitorChainChanging: 32 bit pd iShw=%#x: %RX64 -> freeing it!\n", iShw, uPde));
364 pgmPoolFree(pVM, uPde & X86_PDE_PG_MASK, pPage->idx, iShw);
365 ASMAtomicWriteU32(&uShw.pPD->a[iShw].u, 0);
366 }
367
368 /* paranoia / a bit assumptive. */
369 if ( (off & 3)
370 && (off & 3) + cbWrite > sizeof(X86PTE))
371 {
372 const unsigned iShw2 = (off + cbWrite - 1) / sizeof(X86PTE);
373 if ( iShw2 != iShw
374 && iShw2 < RT_ELEMENTS(uShw.pPD->a))
375 {
376 X86PGUINT const uPde2 = uShw.pPD->a[iShw2].u;
377 if (uPde2 & X86_PDE_P)
378 {
379 LogFlow(("pgmPoolMonitorChainChanging: 32 bit pd iShw=%#x: %RX64 -> freeing it!\n", iShw2, uPde2));
380 pgmPoolFree(pVM, uPde2 & X86_PDE_PG_MASK, pPage->idx, iShw2);
381 ASMAtomicWriteU32(&uShw.pPD->a[iShw2].u, 0);
382 }
383 }
384 }
385#if 0 /* useful when running PGMAssertCR3(), a bit too troublesome for general use (TLBs). - not working any longer... */
386 if ( uShw.pPD->a[iShw].n.u1Present
387 && !VMCPU_FF_IS_SET(pVCpu, VMCPU_FF_PGM_SYNC_CR3))
388 {
389 LogFlow(("pgmPoolMonitorChainChanging: iShw=%#x: %RX32 -> freeing it!\n", iShw, uShw.pPD->a[iShw].u));
390 pgmPoolFree(pVM, uShw.pPD->a[iShw].u & X86_PDE_PG_MASK, pPage->idx, iShw);
391 ASMAtomicWriteU32(&uShw.pPD->a[iShw].u, 0);
392 }
393#endif
394 break;
395 }
396
397 case PGMPOOLKIND_PAE_PD_FOR_PAE_PD:
398 {
399 uShw.pv = PGMPOOL_PAGE_2_PTR(pVM, pPage);
400 const unsigned iShw = off / sizeof(X86PDEPAE);
401 STAM_COUNTER_INC(&pPool->CTX_MID_Z(StatMonitor,FaultPD));
402
403 /*
404 * Causes trouble when the guest uses a PDE to refer to the whole page table level
405 * structure. (Invalidate here; faults later on when it tries to change the page
406 * table entries -> recheck; probably only applies to the RC case.)
407 */
408 X86PGPAEUINT const uPde = uShw.pPDPae->a[iShw].u;
409 if (uPde & X86_PDE_P)
410 {
411 LogFlow(("pgmPoolMonitorChainChanging: pae pd iShw=%#x: %RX64 -> freeing it!\n", iShw, uPde));
412 pgmPoolFree(pVM, uPde & X86_PDE_PAE_PG_MASK, pPage->idx, iShw);
413 ASMAtomicWriteU64(&uShw.pPDPae->a[iShw].u, 0);
414 }
415
416 /* paranoia / a bit assumptive. */
417 if ( (off & 7)
418 && (off & 7) + cbWrite > sizeof(X86PDEPAE))
419 {
420 const unsigned iShw2 = (off + cbWrite - 1) / sizeof(X86PDEPAE);
421 AssertBreak(iShw2 < RT_ELEMENTS(uShw.pPDPae->a));
422
423 X86PGPAEUINT const uPde2 = uShw.pPDPae->a[iShw2].u;
424 if (uPde2 & X86_PDE_P)
425 {
426 LogFlow(("pgmPoolMonitorChainChanging: pae pd iShw2=%#x: %RX64 -> freeing it!\n", iShw2, uPde2));
427 pgmPoolFree(pVM, uPde2 & X86_PDE_PAE_PG_MASK, pPage->idx, iShw2);
428 ASMAtomicWriteU64(&uShw.pPDPae->a[iShw2].u, 0);
429 }
430 }
431 break;
432 }
433
434 case PGMPOOLKIND_PAE_PDPT:
435 {
436 STAM_COUNTER_INC(&pPool->CTX_MID_Z(StatMonitor,FaultPDPT));
437 /*
438 * Hopefully this doesn't happen very often:
439 * - touching unused parts of the page
440 * - messing with the bits of pd pointers without changing the physical address
441 */
442 /* PDPT roots are not page aligned; 32 byte only! */
443 const unsigned offPdpt = GCPhysFault - pPage->GCPhys;
444
445 uShw.pv = PGMPOOL_PAGE_2_PTR(pVM, pPage);
446 const unsigned iShw = offPdpt / sizeof(X86PDPE);
447 if (iShw < X86_PG_PAE_PDPE_ENTRIES) /* don't use RT_ELEMENTS(uShw.pPDPT->a), because that's for long mode only */
448 {
449 X86PGPAEUINT const uPdpe = uShw.pPDPT->a[iShw].u;
450 if (uPdpe & X86_PDPE_P)
451 {
452 LogFlow(("pgmPoolMonitorChainChanging: pae pdpt iShw=%#x: %RX64 -> freeing it!\n", iShw, uShw.pPDPT->a[iShw].u));
453 pgmPoolFree(pVM, uPdpe & X86_PDPE_PG_MASK, pPage->idx, iShw);
454 ASMAtomicWriteU64(&uShw.pPDPT->a[iShw].u, 0);
455 }
456
457 /* paranoia / a bit assumptive. */
458 if ( (offPdpt & 7)
459 && (offPdpt & 7) + cbWrite > sizeof(X86PDPE))
460 {
461 const unsigned iShw2 = (offPdpt + cbWrite - 1) / sizeof(X86PDPE);
462 if ( iShw2 != iShw
463 && iShw2 < X86_PG_PAE_PDPE_ENTRIES)
464 {
465 X86PGPAEUINT const uPdpe2 = uShw.pPDPT->a[iShw2].u;
466 if (uPdpe2 & X86_PDPE_P)
467 {
468 LogFlow(("pgmPoolMonitorChainChanging: pae pdpt iShw=%#x: %RX64 -> freeing it!\n", iShw2, uShw.pPDPT->a[iShw2].u));
469 pgmPoolFree(pVM, uPdpe2 & X86_PDPE_PG_MASK, pPage->idx, iShw2);
470 ASMAtomicWriteU64(&uShw.pPDPT->a[iShw2].u, 0);
471 }
472 }
473 }
474 }
475 break;
476 }
477
478 case PGMPOOLKIND_64BIT_PD_FOR_64BIT_PD:
479 {
480 STAM_COUNTER_INC(&pPool->CTX_MID_Z(StatMonitor,FaultPD));
481 uShw.pv = PGMPOOL_PAGE_2_PTR(pVM, pPage);
482 const unsigned iShw = off / sizeof(X86PDEPAE);
483 X86PGPAEUINT const uPde = uShw.pPDPae->a[iShw].u;
484 if (uPde & X86_PDE_P)
485 {
486 LogFlow(("pgmPoolMonitorChainChanging: pae pd iShw=%#x: %RX64 -> freeing it!\n", iShw, uPde));
487 pgmPoolFree(pVM, uPde & X86_PDE_PAE_PG_MASK, pPage->idx, iShw);
488 ASMAtomicWriteU64(&uShw.pPDPae->a[iShw].u, 0);
489 }
490
491 /* paranoia / a bit assumptive. */
492 if ( (off & 7)
493 && (off & 7) + cbWrite > sizeof(X86PDEPAE))
494 {
495 const unsigned iShw2 = (off + cbWrite - 1) / sizeof(X86PDEPAE);
496 AssertBreak(iShw2 < RT_ELEMENTS(uShw.pPDPae->a));
497 X86PGPAEUINT const uPde2 = uShw.pPDPae->a[iShw2].u;
498 if (uPde2 & X86_PDE_P)
499 {
500 LogFlow(("pgmPoolMonitorChainChanging: pae pd iShw2=%#x: %RX64 -> freeing it!\n", iShw2, uPde2));
501 pgmPoolFree(pVM, uPde2 & X86_PDE_PAE_PG_MASK, pPage->idx, iShw2);
502 ASMAtomicWriteU64(&uShw.pPDPae->a[iShw2].u, 0);
503 }
504 }
505 break;
506 }
507
508 case PGMPOOLKIND_64BIT_PDPT_FOR_64BIT_PDPT:
509 {
510 STAM_COUNTER_INC(&pPool->CTX_MID_Z(StatMonitor,FaultPDPT));
511 /*
512 * Hopefully this doesn't happen very often:
513 * - messing with the bits of pd pointers without changing the physical address
514 */
515 uShw.pv = PGMPOOL_PAGE_2_PTR(pVM, pPage);
516 const unsigned iShw = off / sizeof(X86PDPE);
517 X86PGPAEUINT const uPdpe = uShw.pPDPT->a[iShw].u;
518 if (uPdpe & X86_PDPE_P)
519 {
520 LogFlow(("pgmPoolMonitorChainChanging: pdpt iShw=%#x: %RX64 -> freeing it!\n", iShw, uPdpe));
521 pgmPoolFree(pVM, uPdpe & X86_PDPE_PG_MASK, pPage->idx, iShw);
522 ASMAtomicWriteU64(&uShw.pPDPT->a[iShw].u, 0);
523 }
524 /* paranoia / a bit assumptive. */
525 if ( (off & 7)
526 && (off & 7) + cbWrite > sizeof(X86PDPE))
527 {
528 const unsigned iShw2 = (off + cbWrite - 1) / sizeof(X86PDPE);
529 X86PGPAEUINT const uPdpe2 = uShw.pPDPT->a[iShw2].u;
530 if (uPdpe2 & X86_PDPE_P)
531 {
532 LogFlow(("pgmPoolMonitorChainChanging: pdpt iShw2=%#x: %RX64 -> freeing it!\n", iShw2, uPdpe2));
533 pgmPoolFree(pVM, uPdpe2 & X86_PDPE_PG_MASK, pPage->idx, iShw2);
534 ASMAtomicWriteU64(&uShw.pPDPT->a[iShw2].u, 0);
535 }
536 }
537 break;
538 }
539
540 case PGMPOOLKIND_64BIT_PML4:
541 {
542 STAM_COUNTER_INC(&pPool->CTX_MID_Z(StatMonitor,FaultPML4));
543 /*
544 * Hopefully this doesn't happen very often:
545 * - messing with the bits of pd pointers without changing the physical address
546 */
547 uShw.pv = PGMPOOL_PAGE_2_PTR(pVM, pPage);
548 const unsigned iShw = off / sizeof(X86PDPE);
549 X86PGPAEUINT const uPml4e = uShw.pPML4->a[iShw].u;
550 if (uPml4e & X86_PML4E_P)
551 {
552 LogFlow(("pgmPoolMonitorChainChanging: pml4 iShw=%#x: %RX64 -> freeing it!\n", iShw, uPml4e));
553 pgmPoolFree(pVM, uPml4e & X86_PML4E_PG_MASK, pPage->idx, iShw);
554 ASMAtomicWriteU64(&uShw.pPML4->a[iShw].u, 0);
555 }
556 /* paranoia / a bit assumptive. */
557 if ( (off & 7)
558 && (off & 7) + cbWrite > sizeof(X86PDPE))
559 {
560 const unsigned iShw2 = (off + cbWrite - 1) / sizeof(X86PML4E);
561 X86PGPAEUINT const uPml4e2 = uShw.pPML4->a[iShw2].u;
562 if (uPml4e2 & X86_PML4E_P)
563 {
564 LogFlow(("pgmPoolMonitorChainChanging: pml4 iShw2=%#x: %RX64 -> freeing it!\n", iShw2, uPml4e2));
565 pgmPoolFree(pVM, uPml4e2 & X86_PML4E_PG_MASK, pPage->idx, iShw2);
566 ASMAtomicWriteU64(&uShw.pPML4->a[iShw2].u, 0);
567 }
568 }
569 break;
570 }
571
572#ifdef VBOX_WITH_NESTED_HWVIRT_VMX_EPT
573 case PGMPOOLKIND_EPT_PML4_FOR_EPT_PML4:
574 {
575 uShw.pv = PGMPOOL_PAGE_2_PTR(pVM, pPage);
576 const unsigned iShw = off / sizeof(EPTPML4E);
577 X86PGPAEUINT const uPml4e = uShw.pPML4->a[iShw].u;
578 if (uPml4e & EPT_PRESENT_MASK)
579 {
580 Log7Func(("PML4 iShw=%#x: %RX64 (%RGp) -> freeing it!\n", iShw, uPml4e, pPage->GCPhys));
581 pgmPoolFree(pVM, uPml4e & X86_PML4E_PG_MASK, pPage->idx, iShw);
582 ASMAtomicWriteU64(&uShw.pPML4->a[iShw].u, 0);
583 }
584
585 /* paranoia / a bit assumptive. */
586 if ( (off & 7)
587 && (off & 7) + cbWrite > sizeof(X86PML4E))
588 {
589 const unsigned iShw2 = (off + cbWrite - 1) / sizeof(X86PML4E);
590 X86PGPAEUINT const uPml4e2 = uShw.pPML4->a[iShw2].u;
591 if (uPml4e2 & EPT_PRESENT_MASK)
592 {
593 Log7Func(("PML4 iShw2=%#x: %RX64 -> freeing it!\n", iShw2, uPml4e2));
594 pgmPoolFree(pVM, uPml4e2 & X86_PML4E_PG_MASK, pPage->idx, iShw2);
595 ASMAtomicWriteU64(&uShw.pPML4->a[iShw2].u, 0);
596 }
597 }
598 break;
599 }
600
601 case PGMPOOLKIND_EPT_PDPT_FOR_EPT_PDPT:
602 {
603 uShw.pv = PGMPOOL_PAGE_2_PTR(pVM, pPage);
604 const unsigned iShw = off / sizeof(EPTPDPTE);
605 X86PGPAEUINT const uPdpte = uShw.pEptPdpt->a[iShw].u;
606 if (uPdpte & EPT_PRESENT_MASK)
607 {
608 Log7Func(("EPT PDPT iShw=%#x: %RX64 (%RGp) -> freeing it!\n", iShw, uPdpte, pPage->GCPhys));
609 pgmPoolFree(pVM, uPdpte & EPT_PDPTE_PG_MASK, pPage->idx, iShw);
610 ASMAtomicWriteU64(&uShw.pEptPdpt->a[iShw].u, 0);
611 }
612
613 /* paranoia / a bit assumptive. */
614 if ( (off & 7)
615 && (off & 7) + cbWrite > sizeof(EPTPDPTE))
616 {
617 const unsigned iShw2 = (off + cbWrite - 1) / sizeof(EPTPDPTE);
618 X86PGPAEUINT const uPdpte2 = uShw.pEptPdpt->a[iShw2].u;
619 if (uPdpte2 & EPT_PRESENT_MASK)
620 {
621 Log7Func(("EPT PDPT iShw2=%#x: %RX64 -> freeing it!\n", iShw2, uPdpte2));
622 pgmPoolFree(pVM, uPdpte2 & EPT_PDPTE_PG_MASK, pPage->idx, iShw2);
623 ASMAtomicWriteU64(&uShw.pEptPdpt->a[iShw2].u, 0);
624 }
625 }
626 break;
627 }
628
629 case PGMPOOLKIND_EPT_PD_FOR_EPT_PD:
630 {
631 uShw.pv = PGMPOOL_PAGE_2_PTR(pVM, pPage);
632 const unsigned iShw = off / sizeof(EPTPDE);
633 X86PGPAEUINT const uPde = uShw.pEptPd->a[iShw].u;
634 if (uPde & EPT_PRESENT_MASK)
635 {
636 Assert(!(uPde & EPT_E_LEAF));
637 Log7Func(("EPT PD iShw=%#x: %RX64 (%RGp) -> freeing it!\n", iShw, uPde, pPage->GCPhys));
638 pgmPoolFree(pVM, uPde & EPT_PDE_PG_MASK, pPage->idx, iShw);
639 ASMAtomicWriteU64(&uShw.pEptPd->a[iShw].u, 0);
640 }
641
642 /* paranoia / a bit assumptive. */
643 if ( (off & 7)
644 && (off & 7) + cbWrite > sizeof(EPTPDE))
645 {
646 const unsigned iShw2 = (off + cbWrite - 1) / sizeof(EPTPDE);
647 AssertBreak(iShw2 < RT_ELEMENTS(uShw.pEptPd->a));
648 X86PGPAEUINT const uPde2 = uShw.pEptPd->a[iShw2].u;
649 if (uPde2 & EPT_PRESENT_MASK)
650 {
651 Assert(!(uPde2 & EPT_E_LEAF));
652 Log7Func(("EPT PD (2): iShw2=%#x: %RX64 (%RGp) -> freeing it!\n", iShw2, uPde2, pPage->GCPhys));
653 pgmPoolFree(pVM, uPde2 & EPT_PDE_PG_MASK, pPage->idx, iShw2);
654 ASMAtomicWriteU64(&uShw.pEptPd->a[iShw2].u, 0);
655 }
656 }
657 break;
658 }
659
660 case PGMPOOLKIND_EPT_PT_FOR_EPT_PT:
661 {
662 uShw.pv = PGMPOOL_PAGE_2_PTR(pVM, pPage);
663 const unsigned iShw = off / sizeof(EPTPTE);
664 X86PGPAEUINT const uPte = uShw.pEptPt->a[iShw].u;
665 STAM_COUNTER_INC(&pPool->CTX_MID_Z(StatMonitor,FaultPT));
666 if (uPte & EPT_PRESENT_MASK)
667 {
668 EPTPTE GstPte;
669 int rc = pgmPoolPhysSimpleReadGCPhys(pVM, &GstPte, pvAddress, GCPhysFault, sizeof(GstPte));
670 AssertRC(rc);
671
672 Log7Func(("EPT PT: iShw=%#x %RX64 (%RGp)\n", iShw, uPte, pPage->GCPhys));
673 pgmPoolTracDerefGCPhysHint(pPool, pPage,
674 uShw.pEptPt->a[iShw].u & EPT_PTE_PG_MASK,
675 GstPte.u & EPT_PTE_PG_MASK,
676 iShw);
677 ASMAtomicWriteU64(&uShw.pEptPt->a[iShw].u, 0);
678 }
679
680 /* paranoia / a bit assumptive. */
681 if ( (off & 7)
682 && (off & 7) + cbWrite > sizeof(EPTPTE))
683 {
684 const unsigned iShw2 = (off + cbWrite - 1) / sizeof(EPTPTE);
685 AssertBreak(iShw2 < RT_ELEMENTS(uShw.pEptPt->a));
686 X86PGPAEUINT const uPte2 = uShw.pEptPt->a[iShw2].u;
687 if (uPte2 & EPT_PRESENT_MASK)
688 {
689 EPTPTE GstPte;
690 int rc = pgmPoolPhysSimpleReadGCPhys(pVM, &GstPte,
691 pvAddress ? (uint8_t const *)pvAddress + sizeof(GstPte) : NULL,
692 GCPhysFault + sizeof(GstPte), sizeof(GstPte));
693 AssertRC(rc);
694 Log7Func(("EPT PT (2): iShw=%#x %RX64 (%RGp)\n", iShw2, uPte2, pPage->GCPhys));
695 pgmPoolTracDerefGCPhysHint(pPool, pPage,
696 uShw.pEptPt->a[iShw2].u & EPT_PTE_PG_MASK,
697 GstPte.u & EPT_PTE_PG_MASK,
698 iShw2);
699 ASMAtomicWriteU64(&uShw.pEptPt->a[iShw2].u, 0);
700 }
701 }
702 break;
703 }
704#endif /* VBOX_WITH_NESTED_HWVIRT_VMX_EPT */
705
706 default:
707 AssertFatalMsgFailed(("enmKind=%d\n", pPage->enmKind));
708 }
709 PGM_DYNMAP_UNUSED_HINT_VM(pVM, uShw.pv);
710
711 /* next */
712 if (pPage->iMonitoredNext == NIL_PGMPOOL_IDX)
713 return;
714 pPage = &pPool->aPages[pPage->iMonitoredNext];
715 }
716}
717
718#ifndef IN_RING3
719
720/**
721 * Checks if a access could be a fork operation in progress.
722 *
723 * Meaning, that the guest is setting up the parent process for Copy-On-Write.
724 *
725 * @returns true if it's likely that we're forking, otherwise false.
726 * @param pPool The pool.
727 * @param pDis The disassembled instruction.
728 * @param offFault The access offset.
729 */
730DECLINLINE(bool) pgmRZPoolMonitorIsForking(PPGMPOOL pPool, PDISSTATE pDis, unsigned offFault)
731{
732 /*
733 * i386 linux is using btr to clear X86_PTE_RW.
734 * The functions involved are (2.6.16 source inspection):
735 * clear_bit
736 * ptep_set_wrprotect
737 * copy_one_pte
738 * copy_pte_range
739 * copy_pmd_range
740 * copy_pud_range
741 * copy_page_range
742 * dup_mmap
743 * dup_mm
744 * copy_mm
745 * copy_process
746 * do_fork
747 */
748 if ( pDis->pCurInstr->uOpcode == OP_BTR
749 && !(offFault & 4)
750 /** @todo Validate that the bit index is X86_PTE_RW. */
751 )
752 {
753 STAM_COUNTER_INC(&pPool->CTX_MID_Z(StatMonitorPf,Fork)); RT_NOREF_PV(pPool);
754 return true;
755 }
756 return false;
757}
758
759
760/**
761 * Determine whether the page is likely to have been reused.
762 *
763 * @returns true if we consider the page as being reused for a different purpose.
764 * @returns false if we consider it to still be a paging page.
765 * @param pVM The cross context VM structure.
766 * @param pVCpu The cross context virtual CPU structure.
767 * @param pCtx Pointer to the register context for the CPU.
768 * @param pDis The disassembly info for the faulting instruction.
769 * @param pvFault The fault address.
770 * @param pPage The pool page being accessed.
771 *
772 * @remark The REP prefix check is left to the caller because of STOSD/W.
773 */
774DECLINLINE(bool) pgmRZPoolMonitorIsReused(PVMCC pVM, PVMCPUCC pVCpu, PCPUMCTX pCtx, PDISSTATE pDis, RTGCPTR pvFault,
775 PPGMPOOLPAGE pPage)
776{
777 /* Locked (CR3, PDPTR*4) should not be reusable. Considering them as
778 such may cause loops booting tst-ubuntu-15_10-64-efi, ++. */
779 if (pPage->cLocked)
780 {
781 Log2(("pgmRZPoolMonitorIsReused: %RGv (%p) can't have been resued, because it's locked!\n", pvFault, pPage));
782 return false;
783 }
784
785 /** @todo could make this general, faulting close to rsp should be a safe reuse heuristic. */
786 if ( HMHasPendingIrq(pVM)
787 && pCtx->rsp - pvFault < 32)
788 {
789 /* Fault caused by stack writes while trying to inject an interrupt event. */
790 Log(("pgmRZPoolMonitorIsReused: reused %RGv for interrupt stack (rsp=%RGv).\n", pvFault, pCtx->rsp));
791 return true;
792 }
793
794 LogFlow(("Reused instr %RGv %d at %RGv param1.fUse=%llx param1.reg=%d\n", pCtx->rip, pDis->pCurInstr->uOpcode, pvFault, pDis->Param1.fUse, pDis->Param1.arch.x86.Base.idxGenReg));
795
796 /* Non-supervisor mode write means it's used for something else. */
797 if (CPUMGetGuestCPL(pVCpu) == 3)
798 return true;
799
800 switch (pDis->pCurInstr->uOpcode)
801 {
802 /* call implies the actual push of the return address faulted */
803 case OP_CALL:
804 Log4(("pgmRZPoolMonitorIsReused: CALL\n"));
805 return true;
806 case OP_PUSH:
807 Log4(("pgmRZPoolMonitorIsReused: PUSH\n"));
808 return true;
809 case OP_PUSHF:
810 Log4(("pgmRZPoolMonitorIsReused: PUSHF\n"));
811 return true;
812 case OP_PUSHA:
813 Log4(("pgmRZPoolMonitorIsReused: PUSHA\n"));
814 return true;
815 case OP_FXSAVE:
816 Log4(("pgmRZPoolMonitorIsReused: FXSAVE\n"));
817 return true;
818 case OP_MOVNTI: /* solaris - block_zero_no_xmm */
819 Log4(("pgmRZPoolMonitorIsReused: MOVNTI\n"));
820 return true;
821 case OP_MOVNTDQ: /* solaris - hwblkclr & hwblkpagecopy */
822 Log4(("pgmRZPoolMonitorIsReused: MOVNTDQ\n"));
823 return true;
824 case OP_MOVSWD:
825 case OP_STOSWD:
826 if ( pDis->arch.x86.fPrefix == (DISPREFIX_REP|DISPREFIX_REX)
827 && pCtx->rcx >= 0x40
828 )
829 {
830 Assert(pDis->uCpuMode == DISCPUMODE_64BIT);
831
832 Log(("pgmRZPoolMonitorIsReused: OP_STOSQ\n"));
833 return true;
834 }
835 break;
836
837 default:
838 /*
839 * Anything having ESP on the left side means stack writes.
840 */
841 if ( ( (pDis->Param1.fUse & DISUSE_REG_GEN32)
842 || (pDis->Param1.fUse & DISUSE_REG_GEN64))
843 && (pDis->Param1.arch.x86.Base.idxGenReg == DISGREG_ESP))
844 {
845 Log4(("pgmRZPoolMonitorIsReused: ESP\n"));
846 return true;
847 }
848 break;
849 }
850
851 /*
852 * Page table updates are very very unlikely to be crossing page boundraries,
853 * and we don't want to deal with that in pgmPoolMonitorChainChanging and such.
854 */
855 uint32_t const cbWrite = DISGetParamSize(pDis, &pDis->Param1);
856 if ( (((uintptr_t)pvFault + cbWrite) >> X86_PAGE_SHIFT) != ((uintptr_t)pvFault >> X86_PAGE_SHIFT) )
857 {
858 Log4(("pgmRZPoolMonitorIsReused: cross page write\n"));
859 return true;
860 }
861
862 /*
863 * Nobody does an unaligned 8 byte write to a page table, right.
864 */
865 if (cbWrite >= 8 && ((uintptr_t)pvFault & 7) != 0)
866 {
867 Log4(("pgmRZPoolMonitorIsReused: Unaligned 8+ byte write\n"));
868 return true;
869 }
870
871 return false;
872}
873
874
875/**
876 * Flushes the page being accessed.
877 *
878 * @returns VBox status code suitable for scheduling.
879 * @param pVM The cross context VM structure.
880 * @param pVCpu The cross context virtual CPU structure.
881 * @param pPool The pool.
882 * @param pPage The pool page (head).
883 * @param pDis The disassembly of the write instruction.
884 * @param pCtx Pointer to the register context for the CPU.
885 * @param GCPhysFault The fault address as guest physical address.
886 * @todo VBOXSTRICTRC
887 */
888static int pgmRZPoolAccessPfHandlerFlush(PVMCC pVM, PVMCPUCC pVCpu, PPGMPOOL pPool, PPGMPOOLPAGE pPage, PDISSTATE pDis,
889 PCPUMCTX pCtx, RTGCPHYS GCPhysFault)
890{
891 NOREF(pVM); NOREF(GCPhysFault);
892
893 /*
894 * First, do the flushing.
895 */
896 pgmPoolMonitorChainFlush(pPool, pPage);
897
898 /*
899 * Emulate the instruction (xp/w2k problem, requires pc/cr2/sp detection).
900 * Must do this in raw mode (!); XP boot will fail otherwise.
901 */
902 int rc = VINF_SUCCESS;
903 VBOXSTRICTRC rc2 = EMInterpretInstructionDisasState(pVCpu, pDis, pCtx->rip);
904 if (rc2 == VINF_SUCCESS)
905 { /* do nothing */ }
906 else if (rc2 == VINF_EM_RESCHEDULE)
907 {
908 rc = VBOXSTRICTRC_VAL(rc2);
909# ifndef IN_RING3
910 VMCPU_FF_SET(pVCpu, VMCPU_FF_TO_R3);
911# endif
912 }
913 else if (rc2 == VERR_EM_INTERPRETER)
914 {
915 rc = VINF_EM_RAW_EMULATE_INSTR;
916 STAM_COUNTER_INC(&pPool->CTX_MID_Z(StatMonitorPf,EmulateInstr));
917 }
918 else if (RT_FAILURE_NP(rc2))
919 rc = VBOXSTRICTRC_VAL(rc2);
920 else
921 AssertMsgFailed(("%Rrc\n", VBOXSTRICTRC_VAL(rc2))); /* ASSUMES no complicated stuff here. */
922
923 LogFlow(("pgmRZPoolAccessPfHandlerFlush: returns %Rrc (flushed)\n", rc));
924 return rc;
925}
926
927
928/**
929 * Handles the STOSD write accesses.
930 *
931 * @returns VBox status code suitable for scheduling.
932 * @param pVM The cross context VM structure.
933 * @param pPool The pool.
934 * @param pPage The pool page (head).
935 * @param pDis The disassembly of the write instruction.
936 * @param pCtx Pointer to the register context for the CPU.
937 * @param GCPhysFault The fault address as guest physical address.
938 * @param pvFault The fault address.
939 */
940DECLINLINE(int) pgmRZPoolAccessPfHandlerSTOSD(PVMCC pVM, PPGMPOOL pPool, PPGMPOOLPAGE pPage, PDISSTATE pDis,
941 PCPUMCTX pCtx, RTGCPHYS GCPhysFault, RTGCPTR pvFault)
942{
943 unsigned uIncrement = pDis->Param1.arch.x86.cb;
944 NOREF(pVM);
945
946 Assert(pDis->uCpuMode == DISCPUMODE_32BIT || pDis->uCpuMode == DISCPUMODE_64BIT);
947 Assert(pCtx->rcx <= 0x20);
948
949# ifdef VBOX_STRICT
950 if (pDis->arch.x86.uOpMode == DISCPUMODE_32BIT)
951 Assert(uIncrement == 4);
952 else
953 Assert(uIncrement == 8);
954# endif
955
956 Log3(("pgmRZPoolAccessPfHandlerSTOSD\n"));
957
958 /*
959 * Increment the modification counter and insert it into the list
960 * of modified pages the first time.
961 */
962 if (!pPage->cModifications++)
963 pgmPoolMonitorModifiedInsert(pPool, pPage);
964
965 /*
966 * Execute REP STOSD.
967 *
968 * This ASSUMES that we're not invoked by Trap0e on in a out-of-sync
969 * write situation, meaning that it's safe to write here.
970 */
971 PVMCPUCC pVCpu = VMMGetCpu(pPool->CTX_SUFF(pVM));
972 RTGCUINTPTR pu32 = (RTGCUINTPTR)pvFault;
973 while (pCtx->rcx)
974 {
975 pgmPoolMonitorChainChanging(pVCpu, pPool, pPage, GCPhysFault, NULL, uIncrement);
976 PGMPhysSimpleWriteGCPhys(pVM, GCPhysFault, &pCtx->rax, uIncrement);
977 pu32 += uIncrement;
978 GCPhysFault += uIncrement;
979 pCtx->rdi += uIncrement;
980 pCtx->rcx--;
981 }
982 pCtx->rip += pDis->cbInstr;
983
984 LogFlow(("pgmRZPoolAccessPfHandlerSTOSD: returns\n"));
985 return VINF_SUCCESS;
986}
987
988
989/**
990 * Handles the simple write accesses.
991 *
992 * @returns VBox status code suitable for scheduling.
993 * @param pVM The cross context VM structure.
994 * @param pVCpu The cross context virtual CPU structure.
995 * @param pPool The pool.
996 * @param pPage The pool page (head).
997 * @param pDis The disassembly of the write instruction.
998 * @param pCtx Pointer to the register context for the CPU.
999 * @param GCPhysFault The fault address as guest physical address.
1000 * @param pfReused Reused state (in/out)
1001 */
1002DECLINLINE(int) pgmRZPoolAccessPfHandlerSimple(PVMCC pVM, PVMCPUCC pVCpu, PPGMPOOL pPool, PPGMPOOLPAGE pPage, PDISSTATE pDis,
1003 PCPUMCTX pCtx, RTGCPHYS GCPhysFault, bool *pfReused)
1004{
1005 Log3(("pgmRZPoolAccessPfHandlerSimple\n"));
1006 NOREF(pVM);
1007 NOREF(pfReused); /* initialized by caller */
1008
1009 /*
1010 * Increment the modification counter and insert it into the list
1011 * of modified pages the first time.
1012 */
1013 if (!pPage->cModifications++)
1014 pgmPoolMonitorModifiedInsert(pPool, pPage);
1015
1016 /*
1017 * Clear all the pages.
1018 */
1019 uint32_t cbWrite = DISGetParamSize(pDis, &pDis->Param1);
1020 if (cbWrite <= 8)
1021 pgmPoolMonitorChainChanging(pVCpu, pPool, pPage, GCPhysFault, NULL, cbWrite);
1022 else if (cbWrite <= 16)
1023 {
1024 pgmPoolMonitorChainChanging(pVCpu, pPool, pPage, GCPhysFault, NULL, 8);
1025 pgmPoolMonitorChainChanging(pVCpu, pPool, pPage, GCPhysFault + 8, NULL, cbWrite - 8);
1026 }
1027 else
1028 {
1029 Assert(cbWrite <= 32);
1030 for (uint32_t off = 0; off < cbWrite; off += 8)
1031 pgmPoolMonitorChainChanging(pVCpu, pPool, pPage, GCPhysFault + off, NULL, RT_MIN(8, cbWrite - off));
1032 }
1033
1034 /*
1035 * Interpret the instruction.
1036 */
1037 VBOXSTRICTRC rc = EMInterpretInstructionDisasState(pVCpu, pDis, pCtx->rip);
1038 if (RT_SUCCESS(rc))
1039 AssertMsg(rc == VINF_SUCCESS, ("%Rrc\n", VBOXSTRICTRC_VAL(rc))); /* ASSUMES no complicated stuff here. */
1040 else if (rc == VERR_EM_INTERPRETER)
1041 {
1042 LogFlow(("pgmRZPoolAccessPfHandlerSimple: Interpretation failed for %04x:%RGv - opcode=%d\n",
1043 pCtx->cs.Sel, (RTGCPTR)pCtx->rip, pDis->pCurInstr->uOpcode));
1044 rc = VINF_EM_RAW_EMULATE_INSTR;
1045 STAM_COUNTER_INC(&pPool->CTX_MID_Z(StatMonitorPf,EmulateInstr));
1046 }
1047
1048# if 0 /* experimental code */
1049 if (rc == VINF_SUCCESS)
1050 {
1051 switch (pPage->enmKind)
1052 {
1053 case PGMPOOLKIND_PAE_PT_FOR_PAE_PT:
1054 {
1055 X86PTEPAE GstPte;
1056 int rc = pgmPoolPhysSimpleReadGCPhys(pVM, &GstPte, pvFault, GCPhysFault, sizeof(GstPte));
1057 AssertRC(rc);
1058
1059 /* Check the new value written by the guest. If present and with a bogus physical address, then
1060 * it's fairly safe to assume the guest is reusing the PT.
1061 */
1062 if (GstPte.n.u1Present)
1063 {
1064 RTHCPHYS HCPhys = -1;
1065 int rc = PGMPhysGCPhys2HCPhys(pVM, GstPte.u & X86_PTE_PAE_PG_MASK, &HCPhys);
1066 if (rc != VINF_SUCCESS)
1067 {
1068 *pfReused = true;
1069 STAM_COUNTER_INC(&pPool->StatForceFlushReused);
1070 }
1071 }
1072 break;
1073 }
1074 }
1075 }
1076# endif
1077
1078 LogFlow(("pgmRZPoolAccessPfHandlerSimple: returns %Rrc\n", VBOXSTRICTRC_VAL(rc)));
1079 return VBOXSTRICTRC_VAL(rc);
1080}
1081
1082
1083/**
1084 * @callback_method_impl{FNPGMRZPHYSPFHANDLER,
1085 * \#PF access handler callback for page table pages.}
1086 *
1087 * @remarks The @a uUser argument is the index of the PGMPOOLPAGE.
1088 */
1089DECLCALLBACK(VBOXSTRICTRC) pgmRZPoolAccessPfHandler(PVMCC pVM, PVMCPUCC pVCpu, RTGCUINT uErrorCode, PCPUMCTX pCtx,
1090 RTGCPTR pvFault, RTGCPHYS GCPhysFault, uint64_t uUser)
1091{
1092 STAM_PROFILE_START(&pVM->pgm.s.CTX_SUFF(pPool)->StatMonitorRZ, a);
1093 PPGMPOOL const pPool = pVM->pgm.s.CTX_SUFF(pPool);
1094 AssertReturn(uUser < pPool->cCurPages, VERR_PGM_POOL_IPE);
1095 PPGMPOOLPAGE const pPage = &pPool->aPages[uUser];
1096 unsigned cMaxModifications;
1097 bool fForcedFlush = false;
1098 RT_NOREF_PV(uErrorCode);
1099
1100# ifdef VBOX_WITH_NESTED_HWVIRT_VMX_EPT
1101 AssertMsg(pVCpu->pgm.s.enmGuestSlatMode == PGMSLAT_DIRECT,
1102 ("pvFault=%RGv pPage=%p:{.idx=%d} GCPhysFault=%RGp\n", pvFault, pPage, pPage->idx, GCPhysFault));
1103# endif
1104 LogFlow(("pgmRZPoolAccessPfHandler: pvFault=%RGv pPage=%p:{.idx=%d} GCPhysFault=%RGp\n", pvFault, pPage, pPage->idx, GCPhysFault));
1105
1106 PGM_LOCK_VOID(pVM);
1107 if (PHYS_PAGE_ADDRESS(GCPhysFault) != PHYS_PAGE_ADDRESS(pPage->GCPhys))
1108 {
1109 /* Pool page changed while we were waiting for the lock; ignore. */
1110 Log(("CPU%d: pgmRZPoolAccessPfHandler pgm pool page for %RGp changed (to %RGp) while waiting!\n", pVCpu->idCpu, PHYS_PAGE_ADDRESS(GCPhysFault), PHYS_PAGE_ADDRESS(pPage->GCPhys)));
1111 STAM_PROFILE_STOP_EX(&pVM->pgm.s.CTX_SUFF(pPool)->StatMonitorPfRZ, &pPool->StatMonitorPfRZHandled, a);
1112 PGM_UNLOCK(pVM);
1113 return VINF_SUCCESS;
1114 }
1115# ifdef PGMPOOL_WITH_OPTIMIZED_DIRTY_PT
1116 if (pPage->fDirty)
1117 {
1118# ifdef VBOX_WITH_NESTED_HWVIRT_VMX_EPT
1119 Assert(!PGMPOOL_PAGE_IS_NESTED(pPage));
1120# endif
1121 Assert(VMCPU_FF_IS_SET(pVCpu, VMCPU_FF_TLB_FLUSH));
1122 PGM_UNLOCK(pVM);
1123 return VINF_SUCCESS; /* SMP guest case where we were blocking on the pgm lock while the same page was being marked dirty. */
1124 }
1125# endif
1126
1127# if 0 /* test code defined(VBOX_STRICT) && defined(PGMPOOL_WITH_OPTIMIZED_DIRTY_PT) */
1128 if (pPage->enmKind == PGMPOOLKIND_PAE_PT_FOR_PAE_PT)
1129 {
1130 void *pvShw = PGMPOOL_PAGE_2_PTR(pPool->CTX_SUFF(pVM), pPage);
1131 void *pvGst;
1132 int rc = PGM_GCPHYS_2_PTR(pPool->CTX_SUFF(pVM), pPage->GCPhys, &pvGst); AssertReleaseRC(rc);
1133 pgmPoolTrackCheckPTPaePae(pPool, pPage, (PPGMSHWPTPAE)pvShw, (PCX86PTPAE)pvGst);
1134 PGM_DYNMAP_UNUSED_HINT_VM(pVM, pvGst);
1135 PGM_DYNMAP_UNUSED_HINT_VM(pVM, pvShw);
1136 }
1137# endif
1138
1139# ifdef VBOX_WITH_NESTED_HWVIRT_VMX_EPT
1140 if (PGMPOOL_PAGE_IS_NESTED(pPage))
1141 {
1142 Assert(!CPUMIsGuestInVmxNonRootMode(CPUMQueryGuestCtxPtr(pVCpu)));
1143 Log7Func(("Flushing pvFault=%RGv GCPhysFault=%RGp\n", pvFault, GCPhysFault));
1144 pgmPoolMonitorChainFlush(pPool, pPage);
1145 PGM_UNLOCK(pVM);
1146 return VINF_SUCCESS;
1147 }
1148# endif
1149
1150 /*
1151 * Disassemble the faulting instruction.
1152 */
1153 PDISSTATE pDis = &pVCpu->pgm.s.Dis;
1154 int rc = EMInterpretDisasCurrent(pVCpu, pDis, NULL);
1155 if (RT_UNLIKELY(rc != VINF_SUCCESS))
1156 {
1157 AssertMsg(rc == VERR_PAGE_NOT_PRESENT || rc == VERR_PAGE_TABLE_NOT_PRESENT, ("Unexpected rc %d\n", rc));
1158 PGM_UNLOCK(pVM);
1159 return rc;
1160 }
1161
1162 Assert(pPage->enmKind != PGMPOOLKIND_FREE);
1163
1164 /*
1165 * We should ALWAYS have the list head as user parameter. This
1166 * is because we use that page to record the changes.
1167 */
1168 Assert(pPage->iMonitoredPrev == NIL_PGMPOOL_IDX);
1169
1170# ifdef IN_RING0
1171 /* Maximum nr of modifications depends on the page type. */
1172 if ( pPage->enmKind == PGMPOOLKIND_PAE_PT_FOR_PAE_PT
1173 || pPage->enmKind == PGMPOOLKIND_PAE_PT_FOR_32BIT_PT)
1174 cMaxModifications = 4;
1175 else
1176 cMaxModifications = 24;
1177# else
1178 cMaxModifications = 48;
1179# endif
1180
1181 /*
1182 * Incremental page table updates should weigh more than random ones.
1183 * (Only applies when started from offset 0)
1184 */
1185 pVCpu->pgm.s.cPoolAccessHandler++;
1186 if ( pPage->GCPtrLastAccessHandlerRip >= pCtx->rip - 0x40 /* observed loops in Windows 7 x64 */
1187 && pPage->GCPtrLastAccessHandlerRip < pCtx->rip + 0x40
1188 && pvFault == (pPage->GCPtrLastAccessHandlerFault + pDis->Param1.arch.x86.cb)
1189 && pVCpu->pgm.s.cPoolAccessHandler == pPage->cLastAccessHandler + 1)
1190 {
1191 Log(("Possible page reuse cMods=%d -> %d (locked=%d type=%s)\n", pPage->cModifications, pPage->cModifications * 2, pgmPoolIsPageLocked(pPage), pgmPoolPoolKindToStr(pPage->enmKind)));
1192 Assert(pPage->cModifications < 32000);
1193 pPage->cModifications = pPage->cModifications * 2;
1194 pPage->GCPtrLastAccessHandlerFault = pvFault;
1195 pPage->cLastAccessHandler = pVCpu->pgm.s.cPoolAccessHandler;
1196 if (pPage->cModifications >= cMaxModifications)
1197 {
1198 STAM_COUNTER_INC(&pPool->StatMonitorPfRZFlushReinit);
1199 fForcedFlush = true;
1200 }
1201 }
1202
1203 if (pPage->cModifications >= cMaxModifications)
1204 Log(("Mod overflow %RGv cMods=%d (locked=%d type=%s)\n", pvFault, pPage->cModifications, pgmPoolIsPageLocked(pPage), pgmPoolPoolKindToStr(pPage->enmKind)));
1205
1206 /*
1207 * Check if it's worth dealing with.
1208 */
1209 bool fReused = false;
1210 bool fNotReusedNotForking = false;
1211 if ( ( pPage->cModifications < cMaxModifications /** @todo \#define */ /** @todo need to check that it's not mapping EIP. */ /** @todo adjust this! */
1212 || pgmPoolIsPageLocked(pPage)
1213 )
1214 && !(fReused = pgmRZPoolMonitorIsReused(pVM, pVCpu, pCtx, pDis, pvFault, pPage))
1215 && !pgmRZPoolMonitorIsForking(pPool, pDis, GCPhysFault & PAGE_OFFSET_MASK))
1216 {
1217 /*
1218 * Simple instructions, no REP prefix.
1219 */
1220 if (!(pDis->arch.x86.fPrefix & (DISPREFIX_REP | DISPREFIX_REPNE)))
1221 {
1222 rc = pgmRZPoolAccessPfHandlerSimple(pVM, pVCpu, pPool, pPage, pDis, pCtx, GCPhysFault, &fReused);
1223 if (fReused)
1224 goto flushPage;
1225
1226 /* A mov instruction to change the first page table entry will be remembered so we can detect
1227 * full page table changes early on. This will reduce the amount of unnecessary traps we'll take.
1228 */
1229 if ( rc == VINF_SUCCESS
1230 && !pPage->cLocked /* only applies to unlocked pages as we can't free locked ones (e.g. cr3 root). */
1231 && pDis->pCurInstr->uOpcode == OP_MOV
1232 && (pvFault & PAGE_OFFSET_MASK) == 0)
1233 {
1234 pPage->GCPtrLastAccessHandlerFault = pvFault;
1235 pPage->cLastAccessHandler = pVCpu->pgm.s.cPoolAccessHandler;
1236 pPage->GCPtrLastAccessHandlerRip = pCtx->rip;
1237 /* Make sure we don't kick out a page too quickly. */
1238 if (pPage->cModifications > 8)
1239 pPage->cModifications = 2;
1240 }
1241 else if (pPage->GCPtrLastAccessHandlerFault == pvFault)
1242 {
1243 /* ignore the 2nd write to this page table entry. */
1244 pPage->cLastAccessHandler = pVCpu->pgm.s.cPoolAccessHandler;
1245 }
1246 else
1247 {
1248 pPage->GCPtrLastAccessHandlerFault = NIL_RTGCPTR;
1249 pPage->GCPtrLastAccessHandlerRip = 0;
1250 }
1251
1252 STAM_PROFILE_STOP_EX(&pVM->pgm.s.CTX_SUFF(pPool)->StatMonitorPfRZ, &pPool->StatMonitorPfRZHandled, a);
1253 PGM_UNLOCK(pVM);
1254 return rc;
1255 }
1256
1257 /*
1258 * Windows is frequently doing small memset() operations (netio test 4k+).
1259 * We have to deal with these or we'll kill the cache and performance.
1260 */
1261 if ( pDis->pCurInstr->uOpcode == OP_STOSWD
1262 && !pCtx->eflags.Bits.u1DF
1263 && pDis->arch.x86.uOpMode == pDis->uCpuMode
1264 && pDis->arch.x86.uAddrMode == pDis->uCpuMode)
1265 {
1266 bool fValidStosd = false;
1267
1268 if ( pDis->uCpuMode == DISCPUMODE_32BIT
1269 && pDis->arch.x86.fPrefix == DISPREFIX_REP
1270 && pCtx->ecx <= 0x20
1271 && pCtx->ecx * 4 <= GUEST_PAGE_SIZE - ((uintptr_t)pvFault & GUEST_PAGE_OFFSET_MASK)
1272 && !((uintptr_t)pvFault & 3)
1273 && (pCtx->eax == 0 || pCtx->eax == 0x80) /* the two values observed. */
1274 )
1275 {
1276 fValidStosd = true;
1277 pCtx->rcx &= 0xffffffff; /* paranoia */
1278 }
1279 else
1280 if ( pDis->uCpuMode == DISCPUMODE_64BIT
1281 && pDis->arch.x86.fPrefix == (DISPREFIX_REP | DISPREFIX_REX)
1282 && pCtx->rcx <= 0x20
1283 && pCtx->rcx * 8 <= GUEST_PAGE_SIZE - ((uintptr_t)pvFault & GUEST_PAGE_OFFSET_MASK)
1284 && !((uintptr_t)pvFault & 7)
1285 && (pCtx->rax == 0 || pCtx->rax == 0x80) /* the two values observed. */
1286 )
1287 {
1288 fValidStosd = true;
1289 }
1290
1291 if (fValidStosd)
1292 {
1293 rc = pgmRZPoolAccessPfHandlerSTOSD(pVM, pPool, pPage, pDis, pCtx, GCPhysFault, pvFault);
1294 STAM_PROFILE_STOP_EX(&pVM->pgm.s.CTX_SUFF(pPool)->StatMonitorPfRZ, &pPool->StatMonitorPfRZRepStosd, a);
1295 PGM_UNLOCK(pVM);
1296 return rc;
1297 }
1298 }
1299
1300 /* REP prefix, don't bother. */
1301 STAM_COUNTER_INC(&pPool->StatMonitorPfRZRepPrefix);
1302 Log4(("pgmRZPoolAccessPfHandler: eax=%#x ecx=%#x edi=%#x esi=%#x rip=%RGv opcode=%d prefix=%#x\n",
1303 pCtx->eax, pCtx->ecx, pCtx->edi, pCtx->esi, (RTGCPTR)pCtx->rip, pDis->pCurInstr->uOpcode, pDis->arch.x86.fPrefix));
1304 fNotReusedNotForking = true;
1305 }
1306
1307# if defined(PGMPOOL_WITH_OPTIMIZED_DIRTY_PT) && defined(IN_RING0)
1308 /* E.g. Windows 7 x64 initializes page tables and touches some pages in the table during the process. This
1309 * leads to pgm pool trashing and an excessive amount of write faults due to page monitoring.
1310 */
1311 if ( pPage->cModifications >= cMaxModifications
1312 && !fForcedFlush
1313 && (pPage->enmKind == PGMPOOLKIND_PAE_PT_FOR_PAE_PT || pPage->enmKind == PGMPOOLKIND_PAE_PT_FOR_32BIT_PT)
1314 && ( fNotReusedNotForking
1315 || ( !pgmRZPoolMonitorIsReused(pVM, pVCpu, pCtx, pDis, pvFault, pPage)
1316 && !pgmRZPoolMonitorIsForking(pPool, pDis, GCPhysFault & PAGE_OFFSET_MASK))
1317 )
1318 )
1319 {
1320 Assert(!pgmPoolIsPageLocked(pPage));
1321 Assert(pPage->fDirty == false);
1322
1323 /* Flush any monitored duplicates as we will disable write protection. */
1324 if ( pPage->iMonitoredNext != NIL_PGMPOOL_IDX
1325 || pPage->iMonitoredPrev != NIL_PGMPOOL_IDX)
1326 {
1327 PPGMPOOLPAGE pPageHead = pPage;
1328
1329 /* Find the monitor head. */
1330 while (pPageHead->iMonitoredPrev != NIL_PGMPOOL_IDX)
1331 pPageHead = &pPool->aPages[pPageHead->iMonitoredPrev];
1332
1333 while (pPageHead)
1334 {
1335 unsigned idxNext = pPageHead->iMonitoredNext;
1336
1337 if (pPageHead != pPage)
1338 {
1339 STAM_COUNTER_INC(&pPool->StatDirtyPageDupFlush);
1340 Log(("Flush duplicate page idx=%d GCPhys=%RGp type=%s\n", pPageHead->idx, pPageHead->GCPhys, pgmPoolPoolKindToStr(pPageHead->enmKind)));
1341 int rc2 = pgmPoolFlushPage(pPool, pPageHead);
1342 AssertRC(rc2);
1343 }
1344
1345 if (idxNext == NIL_PGMPOOL_IDX)
1346 break;
1347
1348 pPageHead = &pPool->aPages[idxNext];
1349 }
1350 }
1351
1352 /* The flushing above might fail for locked pages, so double check. */
1353 if ( pPage->iMonitoredNext == NIL_PGMPOOL_IDX
1354 && pPage->iMonitoredPrev == NIL_PGMPOOL_IDX)
1355 {
1356 pgmPoolAddDirtyPage(pVM, pPool, pPage);
1357
1358 /* Temporarily allow write access to the page table again. */
1359 rc = PGMHandlerPhysicalPageTempOff(pVM,
1360 pPage->GCPhys & ~(RTGCPHYS)GUEST_PAGE_OFFSET_MASK,
1361 pPage->GCPhys & ~(RTGCPHYS)GUEST_PAGE_OFFSET_MASK);
1362 if (rc == VINF_SUCCESS)
1363 {
1364 rc = PGMShwMakePageWritable(pVCpu, pvFault, PGM_MK_PG_IS_WRITE_FAULT);
1365 AssertMsg(rc == VINF_SUCCESS
1366 /* In the SMP case the page table might be removed while we wait for the PGM lock in the trap handler. */
1367 || rc == VERR_PAGE_TABLE_NOT_PRESENT
1368 || rc == VERR_PAGE_NOT_PRESENT,
1369 ("PGMShwModifyPage -> GCPtr=%RGv rc=%d\n", pvFault, rc));
1370# ifdef VBOX_STRICT
1371 pPage->GCPtrDirtyFault = pvFault;
1372# endif
1373
1374 STAM_PROFILE_STOP(&pVM->pgm.s.CTX_SUFF(pPool)->StatMonitorPfRZ, a);
1375 PGM_UNLOCK(pVM);
1376 return rc;
1377 }
1378 }
1379 }
1380# endif /* PGMPOOL_WITH_OPTIMIZED_DIRTY_PT && IN_RING0 */
1381
1382 STAM_COUNTER_INC(&pPool->StatMonitorPfRZFlushModOverflow);
1383flushPage:
1384 /*
1385 * Not worth it, so flush it.
1386 *
1387 * If we considered it to be reused, don't go back to ring-3
1388 * to emulate failed instructions since we usually cannot
1389 * interpret then. This may be a bit risky, in which case
1390 * the reuse detection must be fixed.
1391 */
1392 rc = pgmRZPoolAccessPfHandlerFlush(pVM, pVCpu, pPool, pPage, pDis, pCtx, GCPhysFault);
1393 if ( rc == VINF_EM_RAW_EMULATE_INSTR
1394 && fReused)
1395 {
1396 Assert(!PGMPOOL_PAGE_IS_NESTED(pPage)); /* temporary, remove later. */
1397 /* Make sure that the current instruction still has shadow page backing, otherwise we'll end up in a loop. */
1398 if (PGMShwGetPage(pVCpu, pCtx->rip, NULL, NULL) == VINF_SUCCESS)
1399 rc = VINF_SUCCESS; /* safe to restart the instruction. */
1400 }
1401 STAM_PROFILE_STOP_EX(&pVM->pgm.s.CTX_SUFF(pPool)->StatMonitorPfRZ, &pPool->StatMonitorPfRZFlushPage, a);
1402 PGM_UNLOCK(pVM);
1403 return rc;
1404}
1405
1406#endif /* !IN_RING3 */
1407
1408/**
1409 * @callback_method_impl{FNPGMPHYSHANDLER,
1410 * Access handler for shadowed page table pages.}
1411 *
1412 * @remarks Only uses the VINF_PGM_HANDLER_DO_DEFAULT status.
1413 * @note The @a uUser argument is the index of the PGMPOOLPAGE.
1414 */
1415DECLCALLBACK(VBOXSTRICTRC)
1416pgmPoolAccessHandler(PVMCC pVM, PVMCPUCC pVCpu, RTGCPHYS GCPhys, void *pvPhys, void *pvBuf, size_t cbBuf,
1417 PGMACCESSTYPE enmAccessType, PGMACCESSORIGIN enmOrigin, uint64_t uUser)
1418{
1419 PPGMPOOL const pPool = pVM->pgm.s.CTX_SUFF(pPool);
1420 STAM_PROFILE_START(&pPool->CTX_SUFF_Z(StatMonitor), a);
1421 AssertReturn(uUser < pPool->cCurPages, VERR_PGM_POOL_IPE);
1422 PPGMPOOLPAGE const pPage = &pPool->aPages[uUser];
1423 LogFlow(("PGM_ALL_CB_DECL: GCPhys=%RGp %p:{.Core=%RHp, .idx=%d, .GCPhys=%RGp, .enmType=%d}\n",
1424 GCPhys, pPage, pPage->Core.Key, pPage->idx, pPage->GCPhys, pPage->enmKind));
1425
1426 NOREF(pvPhys); NOREF(pvBuf); NOREF(enmAccessType);
1427
1428 PGM_LOCK_VOID(pVM);
1429
1430#ifdef VBOX_WITH_STATISTICS
1431 /*
1432 * Collect stats on the access.
1433 */
1434 AssertCompile(RT_ELEMENTS(pPool->CTX_MID_Z(aStatMonitor,Sizes)) == 19);
1435 if (cbBuf <= 16 && cbBuf > 0)
1436 STAM_COUNTER_INC(&pPool->CTX_MID_Z(aStatMonitor,Sizes)[cbBuf - 1]);
1437 else if (cbBuf >= 17 && cbBuf < 32)
1438 STAM_COUNTER_INC(&pPool->CTX_MID_Z(aStatMonitor,Sizes)[16]);
1439 else if (cbBuf >= 32 && cbBuf < 64)
1440 STAM_COUNTER_INC(&pPool->CTX_MID_Z(aStatMonitor,Sizes)[17]);
1441 else if (cbBuf >= 64)
1442 STAM_COUNTER_INC(&pPool->CTX_MID_Z(aStatMonitor,Sizes)[18]);
1443
1444 uint8_t cbAlign;
1445 switch (pPage->enmKind)
1446 {
1447 default:
1448 cbAlign = 7;
1449 break;
1450 case PGMPOOLKIND_32BIT_PT_FOR_PHYS:
1451 case PGMPOOLKIND_32BIT_PT_FOR_32BIT_PT:
1452 case PGMPOOLKIND_32BIT_PT_FOR_32BIT_4MB:
1453 case PGMPOOLKIND_32BIT_PD:
1454 case PGMPOOLKIND_32BIT_PD_PHYS:
1455 cbAlign = 3;
1456 break;
1457 }
1458 AssertCompile(RT_ELEMENTS(pPool->CTX_MID_Z(aStatMonitor,Misaligned)) == 7);
1459 if ((uint8_t)GCPhys & cbAlign)
1460 STAM_COUNTER_INC(&pPool->CTX_MID_Z(aStatMonitor,Misaligned)[((uint8_t)GCPhys & cbAlign) - 1]);
1461#endif
1462
1463 /*
1464 * Make sure the pool page wasn't modified by a different CPU.
1465 */
1466 if (PHYS_PAGE_ADDRESS(GCPhys) == PHYS_PAGE_ADDRESS(pPage->GCPhys))
1467 {
1468 Assert(pPage->enmKind != PGMPOOLKIND_FREE);
1469
1470 /* The max modification count before flushing depends on the context and page type. */
1471#ifdef IN_RING3
1472 uint16_t const cMaxModifications = 96; /* it's cheaper here, right? */
1473#else
1474 uint16_t cMaxModifications;
1475 if ( pPage->enmKind == PGMPOOLKIND_PAE_PT_FOR_PAE_PT
1476 || pPage->enmKind == PGMPOOLKIND_PAE_PT_FOR_32BIT_PT)
1477 cMaxModifications = 4;
1478 else
1479 cMaxModifications = 24;
1480#endif
1481
1482 /*
1483 * We don't have to be very sophisticated about this since there are relativly few calls here.
1484 * However, we must try our best to detect any non-cpu accesses (disk / networking).
1485 */
1486 if ( ( pPage->cModifications < cMaxModifications
1487 || pgmPoolIsPageLocked(pPage) )
1488 && enmOrigin != PGMACCESSORIGIN_DEVICE
1489 && cbBuf <= 16)
1490 {
1491 /* Clear the shadow entry. */
1492 if (!pPage->cModifications++)
1493 pgmPoolMonitorModifiedInsert(pPool, pPage);
1494
1495 if (cbBuf <= 8)
1496 pgmPoolMonitorChainChanging(pVCpu, pPool, pPage, GCPhys, pvBuf, (uint32_t)cbBuf);
1497 else
1498 {
1499 pgmPoolMonitorChainChanging(pVCpu, pPool, pPage, GCPhys, pvBuf, 8);
1500 pgmPoolMonitorChainChanging(pVCpu, pPool, pPage, GCPhys + 8, (uint8_t *)pvBuf + 8, (uint32_t)cbBuf - 8);
1501 }
1502 }
1503 else
1504 pgmPoolMonitorChainFlush(pPool, pPage);
1505
1506 STAM_PROFILE_STOP_EX(&pPool->CTX_SUFF_Z(StatMonitor), &pPool->CTX_MID_Z(StatMonitor,FlushPage), a);
1507 }
1508 else
1509 Log(("CPU%d: PGM_ALL_CB_DECL pgm pool page for %RGp changed (to %RGp) while waiting!\n", pVCpu->idCpu, PHYS_PAGE_ADDRESS(GCPhys), PHYS_PAGE_ADDRESS(pPage->GCPhys)));
1510 PGM_UNLOCK(pVM);
1511 return VINF_PGM_HANDLER_DO_DEFAULT;
1512}
1513
1514
1515#ifdef PGMPOOL_WITH_OPTIMIZED_DIRTY_PT
1516
1517# if defined(VBOX_STRICT) && !defined(IN_RING3)
1518
1519/**
1520 * Check references to guest physical memory in a PAE / PAE page table.
1521 *
1522 * @param pPool The pool.
1523 * @param pPage The page.
1524 * @param pShwPT The shadow page table (mapping of the page).
1525 * @param pGstPT The guest page table.
1526 */
1527static void pgmPoolTrackCheckPTPaePae(PPGMPOOL pPool, PPGMPOOLPAGE pPage, PPGMSHWPTPAE pShwPT, PCX86PTPAE pGstPT)
1528{
1529 unsigned cErrors = 0;
1530 int LastRc = -1; /* initialized to shut up gcc */
1531 unsigned LastPTE = ~0U; /* initialized to shut up gcc */
1532 RTHCPHYS LastHCPhys = NIL_RTHCPHYS; /* initialized to shut up gcc */
1533 PVMCC pVM = pPool->CTX_SUFF(pVM);
1534
1535# ifdef VBOX_STRICT
1536 for (unsigned i = 0; i < RT_MIN(RT_ELEMENTS(pShwPT->a), pPage->iFirstPresent); i++)
1537 AssertMsg(!PGMSHWPTEPAE_IS_P(pShwPT->a[i]), ("Unexpected PTE: idx=%d %RX64 (first=%d)\n", i, PGMSHWPTEPAE_GET_LOG(pShwPT->a[i]), pPage->iFirstPresent));
1538# endif
1539 for (unsigned i = pPage->iFirstPresent; i < RT_ELEMENTS(pShwPT->a); i++)
1540 {
1541 if (PGMSHWPTEPAE_IS_P(pShwPT->a[i]))
1542 {
1543 RTHCPHYS HCPhys = NIL_RTHCPHYS;
1544 int rc = PGMPhysGCPhys2HCPhys(pVM, pGstPT->a[i].u & X86_PTE_PAE_PG_MASK, &HCPhys);
1545 if ( rc != VINF_SUCCESS
1546 || PGMSHWPTEPAE_GET_HCPHYS(pShwPT->a[i]) != HCPhys)
1547 {
1548 Log(("rc=%d idx=%d guest %RX64 shw=%RX64 vs %RHp\n", rc, i, pGstPT->a[i].u, PGMSHWPTEPAE_GET_LOG(pShwPT->a[i]), HCPhys));
1549 LastPTE = i;
1550 LastRc = rc;
1551 LastHCPhys = HCPhys;
1552 cErrors++;
1553
1554 RTHCPHYS HCPhysPT = NIL_RTHCPHYS;
1555 rc = PGMPhysGCPhys2HCPhys(pVM, pPage->GCPhys, &HCPhysPT);
1556 AssertRC(rc);
1557
1558 for (unsigned iPage = 0; iPage < pPool->cCurPages; iPage++)
1559 {
1560 PPGMPOOLPAGE pTempPage = &pPool->aPages[iPage];
1561
1562 if (pTempPage->enmKind == PGMPOOLKIND_PAE_PT_FOR_PAE_PT)
1563 {
1564 PPGMSHWPTPAE pShwPT2 = (PPGMSHWPTPAE)PGMPOOL_PAGE_2_PTR(pVM, pTempPage);
1565
1566 for (unsigned j = 0; j < RT_ELEMENTS(pShwPT->a); j++)
1567 {
1568 if ( PGMSHWPTEPAE_IS_P_RW(pShwPT2->a[j])
1569 && PGMSHWPTEPAE_GET_HCPHYS(pShwPT2->a[j]) == HCPhysPT)
1570 {
1571 Log(("GCPhys=%RGp idx=%d %RX64 vs %RX64\n", pTempPage->GCPhys, j, PGMSHWPTEPAE_GET_LOG(pShwPT->a[j]), PGMSHWPTEPAE_GET_LOG(pShwPT2->a[j])));
1572 }
1573 }
1574
1575 PGM_DYNMAP_UNUSED_HINT_VM(pVM, pShwPT2);
1576 }
1577 }
1578 }
1579 }
1580 }
1581 AssertMsg(!cErrors, ("cErrors=%d: last rc=%d idx=%d guest %RX64 shw=%RX64 vs %RHp\n", cErrors, LastRc, LastPTE, pGstPT->a[LastPTE].u, PGMSHWPTEPAE_GET_LOG(pShwPT->a[LastPTE]), LastHCPhys));
1582}
1583
1584
1585/**
1586 * Check references to guest physical memory in a PAE / 32-bit page table.
1587 *
1588 * @param pPool The pool.
1589 * @param pPage The page.
1590 * @param pShwPT The shadow page table (mapping of the page).
1591 * @param pGstPT The guest page table.
1592 */
1593static void pgmPoolTrackCheckPTPae32Bit(PPGMPOOL pPool, PPGMPOOLPAGE pPage, PPGMSHWPTPAE pShwPT, PCX86PT pGstPT)
1594{
1595 unsigned cErrors = 0;
1596 int LastRc = -1; /* initialized to shut up gcc */
1597 unsigned LastPTE = ~0U; /* initialized to shut up gcc */
1598 RTHCPHYS LastHCPhys = NIL_RTHCPHYS; /* initialized to shut up gcc */
1599 PVMCC pVM = pPool->CTX_SUFF(pVM);
1600
1601# ifdef VBOX_STRICT
1602 for (unsigned i = 0; i < RT_MIN(RT_ELEMENTS(pShwPT->a), pPage->iFirstPresent); i++)
1603 AssertMsg(!PGMSHWPTEPAE_IS_P(pShwPT->a[i]), ("Unexpected PTE: idx=%d %RX64 (first=%d)\n", i, PGMSHWPTEPAE_GET_LOG(pShwPT->a[i]), pPage->iFirstPresent));
1604# endif
1605 for (unsigned i = pPage->iFirstPresent; i < RT_ELEMENTS(pShwPT->a); i++)
1606 {
1607 if (PGMSHWPTEPAE_IS_P(pShwPT->a[i]))
1608 {
1609 RTHCPHYS HCPhys = NIL_RTHCPHYS;
1610 int rc = PGMPhysGCPhys2HCPhys(pVM, pGstPT->a[i].u & X86_PTE_PG_MASK, &HCPhys);
1611 if ( rc != VINF_SUCCESS
1612 || PGMSHWPTEPAE_GET_HCPHYS(pShwPT->a[i]) != HCPhys)
1613 {
1614 Log(("rc=%d idx=%d guest %x shw=%RX64 vs %RHp\n", rc, i, pGstPT->a[i].u, PGMSHWPTEPAE_GET_LOG(pShwPT->a[i]), HCPhys));
1615 LastPTE = i;
1616 LastRc = rc;
1617 LastHCPhys = HCPhys;
1618 cErrors++;
1619
1620 RTHCPHYS HCPhysPT = NIL_RTHCPHYS;
1621 rc = PGMPhysGCPhys2HCPhys(pVM, pPage->GCPhys, &HCPhysPT);
1622 AssertRC(rc);
1623
1624 for (unsigned iPage = 0; iPage < pPool->cCurPages; iPage++)
1625 {
1626 PPGMPOOLPAGE pTempPage = &pPool->aPages[iPage];
1627
1628 if (pTempPage->enmKind == PGMPOOLKIND_PAE_PT_FOR_32BIT_PT)
1629 {
1630 PPGMSHWPTPAE pShwPT2 = (PPGMSHWPTPAE)PGMPOOL_PAGE_2_PTR(pVM, pTempPage);
1631
1632 for (unsigned j = 0; j < RT_ELEMENTS(pShwPT->a); j++)
1633 {
1634 if ( PGMSHWPTEPAE_IS_P_RW(pShwPT2->a[j])
1635 && PGMSHWPTEPAE_GET_HCPHYS(pShwPT2->a[j]) == HCPhysPT)
1636 {
1637 Log(("GCPhys=%RGp idx=%d %RX64 vs %RX64\n", pTempPage->GCPhys, j, PGMSHWPTEPAE_GET_LOG(pShwPT->a[j]), PGMSHWPTEPAE_GET_LOG(pShwPT2->a[j])));
1638 }
1639 }
1640
1641 PGM_DYNMAP_UNUSED_HINT_VM(pVM, pShwPT2);
1642 }
1643 }
1644 }
1645 }
1646 }
1647 AssertMsg(!cErrors, ("cErrors=%d: last rc=%d idx=%d guest %x shw=%RX64 vs %RHp\n", cErrors, LastRc, LastPTE, pGstPT->a[LastPTE].u, PGMSHWPTEPAE_GET_LOG(pShwPT->a[LastPTE]), LastHCPhys));
1648}
1649
1650# endif /* VBOX_STRICT && !IN_RING3 */
1651
1652/**
1653 * Clear references to guest physical memory in a PAE / PAE page table.
1654 *
1655 * @returns nr of changed PTEs
1656 * @param pPool The pool.
1657 * @param pPage The page.
1658 * @param pShwPT The shadow page table (mapping of the page).
1659 * @param pGstPT The guest page table.
1660 * @param pOldGstPT The old cached guest page table.
1661 * @param fAllowRemoval Bail out as soon as we encounter an invalid PTE
1662 * @param pfFlush Flush reused page table (out)
1663 */
1664DECLINLINE(unsigned) pgmPoolTrackFlushPTPaePae(PPGMPOOL pPool, PPGMPOOLPAGE pPage, PPGMSHWPTPAE pShwPT, PCX86PTPAE pGstPT,
1665 PCX86PTPAE pOldGstPT, bool fAllowRemoval, bool *pfFlush)
1666{
1667 unsigned cChanged = 0;
1668
1669# ifdef VBOX_STRICT
1670 for (unsigned i = 0; i < RT_MIN(RT_ELEMENTS(pShwPT->a), pPage->iFirstPresent); i++)
1671 AssertMsg(!PGMSHWPTEPAE_IS_P(pShwPT->a[i]), ("Unexpected PTE: idx=%d %RX64 (first=%d)\n", i, PGMSHWPTEPAE_GET_LOG(pShwPT->a[i]), pPage->iFirstPresent));
1672# endif
1673 *pfFlush = false;
1674
1675 for (unsigned i = pPage->iFirstPresent; i < RT_ELEMENTS(pShwPT->a); i++)
1676 {
1677 /* Check the new value written by the guest. If present and with a bogus physical address, then
1678 * it's fairly safe to assume the guest is reusing the PT.
1679 */
1680 if ( fAllowRemoval
1681 && (pGstPT->a[i].u & X86_PTE_P))
1682 {
1683 if (!PGMPhysIsGCPhysValid(pPool->CTX_SUFF(pVM), pGstPT->a[i].u & X86_PTE_PAE_PG_MASK))
1684 {
1685 *pfFlush = true;
1686 return ++cChanged;
1687 }
1688 }
1689 if (PGMSHWPTEPAE_IS_P(pShwPT->a[i]))
1690 {
1691 /* If the old cached PTE is identical, then there's no need to flush the shadow copy. */
1692 if ((pGstPT->a[i].u & X86_PTE_PAE_PG_MASK) == (pOldGstPT->a[i].u & X86_PTE_PAE_PG_MASK))
1693 {
1694# ifdef VBOX_STRICT
1695 RTHCPHYS HCPhys = NIL_RTGCPHYS;
1696 int rc = PGMPhysGCPhys2HCPhys(pPool->CTX_SUFF(pVM), pGstPT->a[i].u & X86_PTE_PAE_PG_MASK, &HCPhys);
1697 AssertMsg(rc == VINF_SUCCESS && PGMSHWPTEPAE_GET_HCPHYS(pShwPT->a[i]) == HCPhys, ("rc=%d guest %RX64 old %RX64 shw=%RX64 vs %RHp\n", rc, pGstPT->a[i].u, pOldGstPT->a[i].u, PGMSHWPTEPAE_GET_LOG(pShwPT->a[i]), HCPhys));
1698# endif
1699 uint64_t uHostAttr = PGMSHWPTEPAE_GET_U(pShwPT->a[i]) & (X86_PTE_P | X86_PTE_US | X86_PTE_A | X86_PTE_D | X86_PTE_G | X86_PTE_PAE_NX);
1700 bool fHostRW = !!(PGMSHWPTEPAE_GET_U(pShwPT->a[i]) & X86_PTE_RW);
1701 uint64_t uGuestAttr = pGstPT->a[i].u & (X86_PTE_P | X86_PTE_US | X86_PTE_A | X86_PTE_D | X86_PTE_G | X86_PTE_PAE_NX);
1702 bool fGuestRW = !!(pGstPT->a[i].u & X86_PTE_RW);
1703
1704 if ( uHostAttr == uGuestAttr
1705 && fHostRW <= fGuestRW)
1706 continue;
1707 }
1708 cChanged++;
1709 /* Something was changed, so flush it. */
1710 Log4(("pgmPoolTrackDerefPTPaePae: i=%d pte=%RX64 hint=%RX64\n",
1711 i, PGMSHWPTEPAE_GET_HCPHYS(pShwPT->a[i]), pOldGstPT->a[i].u & X86_PTE_PAE_PG_MASK));
1712 pgmPoolTracDerefGCPhysHint(pPool, pPage, PGMSHWPTEPAE_GET_HCPHYS(pShwPT->a[i]), pOldGstPT->a[i].u & X86_PTE_PAE_PG_MASK, i);
1713 PGMSHWPTEPAE_ATOMIC_SET(pShwPT->a[i], 0);
1714 }
1715 }
1716 return cChanged;
1717}
1718
1719
1720/**
1721 * Clear references to guest physical memory in a PAE / PAE page table.
1722 *
1723 * @returns nr of changed PTEs
1724 * @param pPool The pool.
1725 * @param pPage The page.
1726 * @param pShwPT The shadow page table (mapping of the page).
1727 * @param pGstPT The guest page table.
1728 * @param pOldGstPT The old cached guest page table.
1729 * @param fAllowRemoval Bail out as soon as we encounter an invalid PTE
1730 * @param pfFlush Flush reused page table (out)
1731 */
1732DECLINLINE(unsigned) pgmPoolTrackFlushPTPae32Bit(PPGMPOOL pPool, PPGMPOOLPAGE pPage, PPGMSHWPTPAE pShwPT, PCX86PT pGstPT,
1733 PCX86PT pOldGstPT, bool fAllowRemoval, bool *pfFlush)
1734{
1735 unsigned cChanged = 0;
1736
1737# ifdef VBOX_STRICT
1738 for (unsigned i = 0; i < RT_MIN(RT_ELEMENTS(pShwPT->a), pPage->iFirstPresent); i++)
1739 AssertMsg(!PGMSHWPTEPAE_IS_P(pShwPT->a[i]), ("Unexpected PTE: idx=%d %RX64 (first=%d)\n", i, PGMSHWPTEPAE_GET_LOG(pShwPT->a[i]), pPage->iFirstPresent));
1740# endif
1741 *pfFlush = false;
1742
1743 for (unsigned i = pPage->iFirstPresent; i < RT_ELEMENTS(pShwPT->a); i++)
1744 {
1745 /* Check the new value written by the guest. If present and with a bogus physical address, then
1746 * it's fairly safe to assume the guest is reusing the PT. */
1747 if (fAllowRemoval)
1748 {
1749 X86PGUINT const uPte = pGstPT->a[i].u;
1750 if ( (uPte & X86_PTE_P)
1751 && !PGMPhysIsGCPhysValid(pPool->CTX_SUFF(pVM), uPte & X86_PTE_PG_MASK))
1752 {
1753 *pfFlush = true;
1754 return ++cChanged;
1755 }
1756 }
1757 if (PGMSHWPTEPAE_IS_P(pShwPT->a[i]))
1758 {
1759 /* If the old cached PTE is identical, then there's no need to flush the shadow copy. */
1760 if ((pGstPT->a[i].u & X86_PTE_PG_MASK) == (pOldGstPT->a[i].u & X86_PTE_PG_MASK))
1761 {
1762# ifdef VBOX_STRICT
1763 RTHCPHYS HCPhys = NIL_RTGCPHYS;
1764 int rc = PGMPhysGCPhys2HCPhys(pPool->CTX_SUFF(pVM), pGstPT->a[i].u & X86_PTE_PG_MASK, &HCPhys);
1765 AssertMsg(rc == VINF_SUCCESS && PGMSHWPTEPAE_GET_HCPHYS(pShwPT->a[i]) == HCPhys, ("rc=%d guest %x old %x shw=%RX64 vs %RHp\n", rc, pGstPT->a[i].u, pOldGstPT->a[i].u, PGMSHWPTEPAE_GET_LOG(pShwPT->a[i]), HCPhys));
1766# endif
1767 uint64_t uHostAttr = PGMSHWPTEPAE_GET_U(pShwPT->a[i]) & (X86_PTE_P | X86_PTE_US | X86_PTE_A | X86_PTE_D | X86_PTE_G);
1768 bool fHostRW = !!(PGMSHWPTEPAE_GET_U(pShwPT->a[i]) & X86_PTE_RW);
1769 uint64_t uGuestAttr = pGstPT->a[i].u & (X86_PTE_P | X86_PTE_US | X86_PTE_A | X86_PTE_D | X86_PTE_G);
1770 bool fGuestRW = !!(pGstPT->a[i].u & X86_PTE_RW);
1771
1772 if ( uHostAttr == uGuestAttr
1773 && fHostRW <= fGuestRW)
1774 continue;
1775 }
1776 cChanged++;
1777 /* Something was changed, so flush it. */
1778 Log4(("pgmPoolTrackDerefPTPaePae: i=%d pte=%RX64 hint=%x\n",
1779 i, PGMSHWPTEPAE_GET_HCPHYS(pShwPT->a[i]), pOldGstPT->a[i].u & X86_PTE_PG_MASK));
1780 pgmPoolTracDerefGCPhysHint(pPool, pPage, PGMSHWPTEPAE_GET_HCPHYS(pShwPT->a[i]), pOldGstPT->a[i].u & X86_PTE_PG_MASK, i);
1781 PGMSHWPTEPAE_ATOMIC_SET(pShwPT->a[i], 0);
1782 }
1783 }
1784 return cChanged;
1785}
1786
1787
/**
 * Flush a dirty page.
 *
 * Resynchronizes the shadow page table with the cached copy of the guest page
 * table, reinstates write monitoring of the guest page, and frees the dirty
 * array slot.
 *
 * @param   pVM             The cross context VM structure.
 * @param   pPool           The pool.
 * @param   idxSlot         Dirty array slot index
 * @param   fAllowRemoval   Allow a reused page table to be removed
 */
static void pgmPoolFlushDirtyPage(PVMCC pVM, PPGMPOOL pPool, unsigned idxSlot, bool fAllowRemoval = false)
{
    AssertCompile(RT_ELEMENTS(pPool->aidxDirtyPages) == RT_ELEMENTS(pPool->aDirtyPages));

    Assert(idxSlot < RT_ELEMENTS(pPool->aDirtyPages));
    unsigned idxPage = pPool->aidxDirtyPages[idxSlot];
    if (idxPage == NIL_PGMPOOL_IDX)
        return; /* Slot not in use. */

    PPGMPOOLPAGE pPage = &pPool->aPages[idxPage];
    Assert(pPage->idx == idxPage);
    Assert(pPage->iMonitoredNext == NIL_PGMPOOL_IDX && pPage->iMonitoredPrev == NIL_PGMPOOL_IDX);

    AssertMsg(pPage->fDirty, ("Page %RGp (slot=%d) not marked dirty!", pPage->GCPhys, idxSlot));
    Log(("Flush dirty page %RGp cMods=%d\n", pPage->GCPhys, pPage->cModifications));

    /* First write protect the page again to catch all write accesses. (before checking for changes -> SMP) */
    int rc = PGMHandlerPhysicalReset(pVM, pPage->GCPhys & ~(RTGCPHYS)GUEST_PAGE_OFFSET_MASK);
    Assert(rc == VINF_SUCCESS);
    pPage->fDirty = false;

# ifdef VBOX_STRICT
    /* The shadow mapping of the recorded fault address must now be gone or
       no longer writable through this shadow page. */
    uint64_t fFlags = 0;
    RTHCPHYS HCPhys;
    rc = PGMShwGetPage(VMMGetCpu(pVM), pPage->GCPtrDirtyFault, &fFlags, &HCPhys);
    AssertMsg(      (   rc == VINF_SUCCESS
                    &&  (!(fFlags & X86_PTE_RW) || HCPhys != pPage->Core.Key))
              /* In the SMP case the page table might be removed while we wait for the PGM lock in the trap handler. */
              ||    rc == VERR_PAGE_TABLE_NOT_PRESENT
              ||    rc == VERR_PAGE_NOT_PRESENT,
              ("PGMShwGetPage -> GCPtr=%RGv rc=%d flags=%RX64\n", pPage->GCPtrDirtyFault, rc, fFlags));
# endif

    /* Flush those PTEs that have changed. */
    STAM_PROFILE_START(&pPool->StatTrackDeref,a);
    void *pvShw = PGMPOOL_PAGE_2_PTR(pVM, pPage);
    void *pvGst;
    rc = PGM_GCPHYS_2_PTR_EX(pVM, pPage->GCPhys, &pvGst); AssertReleaseRC(rc);
    bool fFlush;
    unsigned cChanges;

    if (pPage->enmKind == PGMPOOLKIND_PAE_PT_FOR_PAE_PT)
        cChanges = pgmPoolTrackFlushPTPaePae(pPool, pPage, (PPGMSHWPTPAE)pvShw, (PCX86PTPAE)pvGst,
                                             (PCX86PTPAE)&pPool->aDirtyPages[idxSlot].aPage[0], fAllowRemoval, &fFlush);
    else
    {
        Assert(!PGMPOOL_PAGE_IS_NESTED(pPage)); /* temporary, remove later. */
        cChanges = pgmPoolTrackFlushPTPae32Bit(pPool, pPage, (PPGMSHWPTPAE)pvShw, (PCX86PT)pvGst,
                                               (PCX86PT)&pPool->aDirtyPages[idxSlot].aPage[0], fAllowRemoval, &fFlush);
    }

    PGM_DYNMAP_UNUSED_HINT_VM(pVM, pvGst);
    PGM_DYNMAP_UNUSED_HINT_VM(pVM, pvShw);
    STAM_PROFILE_STOP(&pPool->StatTrackDeref,a);
    /* Note: we might want to consider keeping the dirty page active in case there were many changes. */

    /* This page is likely to be modified again, so reduce the nr of modifications just a bit here. */
    Assert(pPage->cModifications);
    if (cChanges < 4)
        pPage->cModifications = 1; /* must use > 0 here */
    else
        pPage->cModifications = RT_MAX(1, pPage->cModifications / 2);

    STAM_COUNTER_INC(&pPool->StatResetDirtyPages);
    /* If the array was full, the freed slot becomes the next free-slot hint. */
    if (pPool->cDirtyPages == RT_ELEMENTS(pPool->aDirtyPages))
        pPool->idxFreeDirtyPage = idxSlot;

    pPool->cDirtyPages--;
    pPool->aidxDirtyPages[idxSlot] = NIL_PGMPOOL_IDX;
    Assert(pPool->cDirtyPages <= RT_ELEMENTS(pPool->aDirtyPages));
    if (fFlush)
    {
        /* The guest is reusing the page table for something else: throw the
           whole shadow page away. */
        Assert(fAllowRemoval);
        Log(("Flush reused page table!\n"));
        pgmPoolFlushPage(pPool, pPage);
        STAM_COUNTER_INC(&pPool->StatForceFlushReused);
    }
    else
        Log(("Removed dirty page %RGp cMods=%d cChanges=%d\n", pPage->GCPhys, pPage->cModifications, cChanges));
}
1876
1877
# ifndef IN_RING3
/**
 * Add a new dirty page
 *
 * Caches a copy of the guest page table so the shadow copy can be
 * resynchronized lazily later (see pgmPoolFlushDirtyPage).
 *
 * @param   pVM     The cross context VM structure.
 * @param   pPool   The pool.
 * @param   pPage   The page.
 */
void pgmPoolAddDirtyPage(PVMCC pVM, PPGMPOOL pPool, PPGMPOOLPAGE pPage)
{
    PGM_LOCK_ASSERT_OWNER(pVM);
    /* The free-slot advance below relies on the array size being a power of two. */
    AssertCompile(RT_ELEMENTS(pPool->aDirtyPages) == 8 || RT_ELEMENTS(pPool->aDirtyPages) == 16);
    Assert(!pPage->fDirty);
    Assert(!PGMPOOL_PAGE_IS_NESTED(pPage));

    unsigned idxFree = pPool->idxFreeDirtyPage;
    Assert(idxFree < RT_ELEMENTS(pPool->aDirtyPages));
    Assert(pPage->iMonitoredNext == NIL_PGMPOOL_IDX && pPage->iMonitoredPrev == NIL_PGMPOOL_IDX);

    /* Array full: evict the occupant of the free slot first. */
    if (pPool->cDirtyPages >= RT_ELEMENTS(pPool->aDirtyPages))
    {
        STAM_COUNTER_INC(&pPool->StatDirtyPageOverFlowFlush);
        pgmPoolFlushDirtyPage(pVM, pPool, idxFree, true /* allow removal of reused page tables*/);
    }
    Assert(pPool->cDirtyPages < RT_ELEMENTS(pPool->aDirtyPages));
    AssertMsg(pPool->aidxDirtyPages[idxFree] == NIL_PGMPOOL_IDX, ("idxFree=%d cDirtyPages=%d\n", idxFree, pPool->cDirtyPages));

    Log(("Add dirty page %RGp (slot=%d)\n", pPage->GCPhys, idxFree));

    /*
     * Make a copy of the guest page table as we require valid GCPhys addresses
     * when removing references to physical pages.
     * (The HCPhys linear lookup is *extremely* expensive!)
     * Whole page for the PAE/PAE case, half a page otherwise.
     */
    void *pvGst;
    int rc = PGM_GCPHYS_2_PTR_EX(pVM, pPage->GCPhys, &pvGst); AssertReleaseRC(rc);
    memcpy(&pPool->aDirtyPages[idxFree].aPage[0], pvGst,
           pPage->enmKind == PGMPOOLKIND_PAE_PT_FOR_PAE_PT ? PAGE_SIZE : PAGE_SIZE / 2);
# ifdef VBOX_STRICT
    /* Verify the shadow PT is consistent with the guest PT before we start
       letting writes through unmonitored. */
    void *pvShw = PGMPOOL_PAGE_2_PTR(pVM, pPage);
    if (pPage->enmKind == PGMPOOLKIND_PAE_PT_FOR_PAE_PT)
        pgmPoolTrackCheckPTPaePae(pPool, pPage, (PPGMSHWPTPAE)pvShw, (PCX86PTPAE)pvGst);
    else
        pgmPoolTrackCheckPTPae32Bit(pPool, pPage, (PPGMSHWPTPAE)pvShw, (PCX86PT)pvGst);
    PGM_DYNMAP_UNUSED_HINT_VM(pVM, pvShw);
# endif
    PGM_DYNMAP_UNUSED_HINT_VM(pVM, pvGst);

    STAM_COUNTER_INC(&pPool->StatDirtyPage);
    pPage->fDirty = true;
    pPage->idxDirtyEntry = (uint8_t)idxFree; Assert(pPage->idxDirtyEntry == idxFree);
    pPool->aidxDirtyPages[idxFree] = pPage->idx;
    pPool->cDirtyPages++;

    /* Advance the free-slot hint; if that slot is occupied, search the array
       (wrapping) for an unused one. */
    pPool->idxFreeDirtyPage = (pPool->idxFreeDirtyPage + 1) & (RT_ELEMENTS(pPool->aDirtyPages) - 1);
    if (    pPool->cDirtyPages < RT_ELEMENTS(pPool->aDirtyPages)
        &&  pPool->aidxDirtyPages[pPool->idxFreeDirtyPage] != NIL_PGMPOOL_IDX)
    {
        unsigned i;
        for (i = 1; i < RT_ELEMENTS(pPool->aDirtyPages); i++)
        {
            idxFree = (pPool->idxFreeDirtyPage + i) & (RT_ELEMENTS(pPool->aDirtyPages) - 1);
            if (pPool->aidxDirtyPages[idxFree] == NIL_PGMPOOL_IDX)
            {
                pPool->idxFreeDirtyPage = idxFree;
                break;
            }
        }
        Assert(i != RT_ELEMENTS(pPool->aDirtyPages));
    }

    Assert(pPool->cDirtyPages == RT_ELEMENTS(pPool->aDirtyPages) || pPool->aidxDirtyPages[pPool->idxFreeDirtyPage] == NIL_PGMPOOL_IDX);

    /*
     * Clear all references to this shadow table. See @bugref{7298}.
     */
    pgmPoolTrackClearPageUsers(pPool, pPage);
}
# endif /* !IN_RING3 */
1957
1958
1959/**
1960 * Check if the specified page is dirty (not write monitored)
1961 *
1962 * @return dirty or not
1963 * @param pVM The cross context VM structure.
1964 * @param GCPhys Guest physical address
1965 */
1966bool pgmPoolIsDirtyPageSlow(PVMCC pVM, RTGCPHYS GCPhys)
1967{
1968 PPGMPOOL pPool = pVM->pgm.s.CTX_SUFF(pPool);
1969 PGM_LOCK_ASSERT_OWNER(pVM);
1970 if (!pPool->cDirtyPages)
1971 return false;
1972
1973 GCPhys = GCPhys & ~(RTGCPHYS)PAGE_OFFSET_MASK;
1974
1975 for (unsigned i = 0; i < RT_ELEMENTS(pPool->aDirtyPages); i++)
1976 {
1977 unsigned idxPage = pPool->aidxDirtyPages[i];
1978 if (idxPage != NIL_PGMPOOL_IDX)
1979 {
1980 PPGMPOOLPAGE pPage = &pPool->aPages[idxPage];
1981 if (pPage->GCPhys == GCPhys)
1982 return true;
1983 }
1984 }
1985 return false;
1986}
1987
1988
/**
 * Reset all dirty pages by reinstating page monitoring.
 *
 * @param   pVM     The cross context VM structure.
 */
void pgmPoolResetDirtyPages(PVMCC pVM)
{
    PPGMPOOL pPool = pVM->pgm.s.CTX_SUFF(pPool);
    PGM_LOCK_ASSERT_OWNER(pVM);
    Assert(pPool->cDirtyPages <= RT_ELEMENTS(pPool->aDirtyPages));

    if (!pPool->cDirtyPages)
        return;

    Log(("pgmPoolResetDirtyPages\n"));
    /* Flush every slot; pgmPoolFlushDirtyPage returns early for unused ones. */
    for (unsigned i = 0; i < RT_ELEMENTS(pPool->aDirtyPages); i++)
        pgmPoolFlushDirtyPage(pVM, pPool, i, true /* allow removal of reused page tables*/);

    /* Re-establish the free-slot hint: slot 0, or the first unused slot if 0
       is (unexpectedly) still occupied. */
    pPool->idxFreeDirtyPage = 0;
    if (    pPool->cDirtyPages != RT_ELEMENTS(pPool->aDirtyPages)
        &&  pPool->aidxDirtyPages[pPool->idxFreeDirtyPage] != NIL_PGMPOOL_IDX)
    {
        unsigned i;
        for (i = 1; i < RT_ELEMENTS(pPool->aDirtyPages); i++)
        {
            if (pPool->aidxDirtyPages[i] == NIL_PGMPOOL_IDX)
            {
                pPool->idxFreeDirtyPage = i;
                break;
            }
        }
        AssertMsg(i != RT_ELEMENTS(pPool->aDirtyPages), ("cDirtyPages %d", pPool->cDirtyPages));
    }

    Assert(pPool->aidxDirtyPages[pPool->idxFreeDirtyPage] == NIL_PGMPOOL_IDX || pPool->cDirtyPages == RT_ELEMENTS(pPool->aDirtyPages));
    return;
}
2026
2027
/**
 * Invalidate the PT entry for the specified page
 *
 * @note NOTE(review): the loop below has an empty body - the invalidation was
 *       apparently never implemented, so this function currently only logs
 *       (see the upstream @todo inside the loop).
 *
 * @param   pVM         The cross context VM structure.
 * @param   GCPtrPage   Guest page to invalidate
 */
void pgmPoolResetDirtyPage(PVMCC pVM, RTGCPTR GCPtrPage)
{
    PPGMPOOL pPool = pVM->pgm.s.CTX_SUFF(pPool);
    PGM_LOCK_ASSERT_OWNER(pVM);
    Assert(pPool->cDirtyPages <= RT_ELEMENTS(pPool->aDirtyPages));

    if (!pPool->cDirtyPages)
        return;

    Log(("pgmPoolResetDirtyPage %RGv\n", GCPtrPage)); RT_NOREF_PV(GCPtrPage);
    for (unsigned i = 0; i < RT_ELEMENTS(pPool->aDirtyPages); i++)
    {
        /** @todo What was intended here??? This looks incomplete... */
    }
}
2049
2050
/**
 * Flush the dirty-page entry tracking the given page table, reinstating its
 * write monitoring.
 *
 * (The previous header comment was a copy/paste of pgmPoolResetDirtyPages';
 * this function only flushes the one entry matching GCPhysPT.)
 *
 * @param   pVM         The cross context VM structure.
 * @param   GCPhysPT    Physical address of the page table
 */
void pgmPoolInvalidateDirtyPage(PVMCC pVM, RTGCPHYS GCPhysPT)
{
    PPGMPOOL pPool = pVM->pgm.s.CTX_SUFF(pPool);
    PGM_LOCK_ASSERT_OWNER(pVM);
    Assert(pPool->cDirtyPages <= RT_ELEMENTS(pPool->aDirtyPages));
    unsigned idxDirtyPage = RT_ELEMENTS(pPool->aDirtyPages); /* not-found sentinel */

    if (!pPool->cDirtyPages)
        return;

    GCPhysPT = GCPhysPT & ~(RTGCPHYS)PAGE_OFFSET_MASK;

    /* Search the dirty array for a pool page backed by this page table. */
    for (unsigned i = 0; i < RT_ELEMENTS(pPool->aDirtyPages); i++)
    {
        unsigned idxPage = pPool->aidxDirtyPages[i];
        if (idxPage != NIL_PGMPOOL_IDX)
        {
            PPGMPOOLPAGE pPage = &pPool->aPages[idxPage];
            if (pPage->GCPhys == GCPhysPT)
            {
                idxDirtyPage = i;
                break;
            }
        }
    }

    if (idxDirtyPage != RT_ELEMENTS(pPool->aDirtyPages))
    {
        pgmPoolFlushDirtyPage(pVM, pPool, idxDirtyPage, true /* allow removal of reused page tables*/);
        /* Make sure the free-slot hint points at an unused slot again. */
        if (    pPool->cDirtyPages != RT_ELEMENTS(pPool->aDirtyPages)
            &&  pPool->aidxDirtyPages[pPool->idxFreeDirtyPage] != NIL_PGMPOOL_IDX)
        {
            unsigned i;
            for (i = 0; i < RT_ELEMENTS(pPool->aDirtyPages); i++)
            {
                if (pPool->aidxDirtyPages[i] == NIL_PGMPOOL_IDX)
                {
                    pPool->idxFreeDirtyPage = i;
                    break;
                }
            }
            AssertMsg(i != RT_ELEMENTS(pPool->aDirtyPages), ("cDirtyPages %d", pPool->cDirtyPages));
        }
    }
}
2102
2103#endif /* PGMPOOL_WITH_OPTIMIZED_DIRTY_PT */
2104
2105/**
2106 * Inserts a page into the GCPhys hash table.
2107 *
2108 * @param pPool The pool.
2109 * @param pPage The page.
2110 */
2111DECLINLINE(void) pgmPoolHashInsert(PPGMPOOL pPool, PPGMPOOLPAGE pPage)
2112{
2113 Log3(("pgmPoolHashInsert: %RGp\n", pPage->GCPhys));
2114 Assert(pPage->GCPhys != NIL_RTGCPHYS); Assert(pPage->iNext == NIL_PGMPOOL_IDX);
2115 uint16_t iHash = PGMPOOL_HASH(pPage->GCPhys);
2116 pPage->iNext = pPool->aiHash[iHash];
2117 pPool->aiHash[iHash] = pPage->idx;
2118}
2119
2120
2121/**
2122 * Removes a page from the GCPhys hash table.
2123 *
2124 * @param pPool The pool.
2125 * @param pPage The page.
2126 */
2127DECLINLINE(void) pgmPoolHashRemove(PPGMPOOL pPool, PPGMPOOLPAGE pPage)
2128{
2129 Log3(("pgmPoolHashRemove: %RGp\n", pPage->GCPhys));
2130 uint16_t iHash = PGMPOOL_HASH(pPage->GCPhys);
2131 if (pPool->aiHash[iHash] == pPage->idx)
2132 pPool->aiHash[iHash] = pPage->iNext;
2133 else
2134 {
2135 uint16_t iPrev = pPool->aiHash[iHash];
2136 for (;;)
2137 {
2138 const int16_t i = pPool->aPages[iPrev].iNext;
2139 if (i == pPage->idx)
2140 {
2141 pPool->aPages[iPrev].iNext = pPage->iNext;
2142 break;
2143 }
2144 if (i == NIL_PGMPOOL_IDX)
2145 {
2146 AssertReleaseMsgFailed(("GCPhys=%RGp idx=%d\n", pPage->GCPhys, pPage->idx));
2147 break;
2148 }
2149 iPrev = i;
2150 }
2151 }
2152 pPage->iNext = NIL_PGMPOOL_IDX;
2153}
2154
2155
/**
 * Frees up one cache page.
 *
 * Evicts the least recently used cached page so its shadow page can be
 * reused, skipping the caller's own user page and locked (active root/CR3)
 * pages.
 *
 * @returns VBox status code.
 * @retval  VINF_SUCCESS on success.
 * @param   pPool       The pool.
 * @param   iUser       The user index.
 */
static int pgmPoolCacheFreeOne(PPGMPOOL pPool, uint16_t iUser)
{
#ifndef VBOX_VMM_TARGET_ARMV8
    const PVMCC pVM = pPool->CTX_SUFF(pVM);
#endif
    Assert(pPool->iAgeHead != pPool->iAgeTail); /* We shouldn't be here if there < 2 cached entries! */
    STAM_COUNTER_INC(&pPool->StatCacheFreeUpOne);

    /*
     * Select one page from the tail of the age list.
     */
    PPGMPOOLPAGE pPage;
    for (unsigned iLoop = 0; ; iLoop++)
    {
        uint16_t iToFree = pPool->iAgeTail;
        /* Never evict the page the caller is about to add a reference to. */
        if (iToFree == iUser && iUser != NIL_PGMPOOL_IDX)
            iToFree = pPool->aPages[iToFree].iAgePrev;
/* This is the alternative to the SyncCR3 pgmPoolCacheUsed calls.
        if (pPool->aPages[iToFree].iUserHead != NIL_PGMPOOL_USER_INDEX)
        {
            uint16_t i = pPool->aPages[iToFree].iAgePrev;
            for (unsigned j = 0; j < 10 && i != NIL_PGMPOOL_USER_INDEX; j++, i = pPool->aPages[i].iAgePrev)
            {
                if (pPool->aPages[iToFree].iUserHead == NIL_PGMPOOL_USER_INDEX)
                    continue;
                iToFree = i;
                break;
            }
        }
*/
        Assert(iToFree != iUser);
        AssertReleaseMsg(iToFree != NIL_PGMPOOL_IDX,
                         ("iToFree=%#x (iAgeTail=%#x) iUser=%#x iLoop=%u - pPool=%p LB %#zx\n",
                          iToFree, pPool->iAgeTail, iUser, iLoop, pPool,
                          RT_UOFFSETOF_DYN(PGMPOOL, aPages[pPool->cMaxPages])
                          + pPool->cMaxUsers * sizeof(PGMPOOLUSER)
                          + pPool->cMaxPhysExts * sizeof(PGMPOOLPHYSEXT) ));

        pPage = &pPool->aPages[iToFree];

        /*
         * Reject any attempts at flushing the currently active shadow CR3 mapping.
         * Call pgmPoolCacheUsed to move the page to the head of the age list.
         */
        if (   !pgmPoolIsPageLocked(pPage)
            && pPage->idx >= PGMPOOL_IDX_FIRST /* paranoia (#6349) */)
            break;
        LogFlow(("pgmPoolCacheFreeOne: refuse CR3 mapping\n"));
        pgmPoolCacheUsed(pPool, pPage);
        /* Bail out rather than spin forever if every candidate is refused. */
        AssertLogRelReturn(iLoop < 8192, VERR_PGM_POOL_TOO_MANY_LOOPS);
    }

    /*
     * Found a usable page, flush it and return.
     */
    int rc = pgmPoolFlushPage(pPool, pPage);
    /* This flush was initiated by us and not the guest, so explicitly flush the TLB. */
    /** @todo find out why this is necessary; pgmPoolFlushPage should trigger a flush if one is really needed. */
    if (rc == VINF_SUCCESS)
        PGM_INVL_ALL_VCPU_TLBS(pVM);
    return rc;
}
2226
2227
/**
 * Checks if a kind mismatch is really a page being reused
 * or if it's just normal remappings.
 *
 * @returns true if reused and the cached page (enmKind1) should be flushed
 * @returns false if not reused.
 * @param   enmKind1    The kind of the cached page.
 * @param   enmKind2    The kind of the requested page.
 */
static bool pgmPoolCacheReusedByKind(PGMPOOLKIND enmKind1, PGMPOOLKIND enmKind2)
{
    switch (enmKind1)
    {
        /*
         * Never reuse them. There is no remapping in non-paging mode.
         */
        case PGMPOOLKIND_32BIT_PT_FOR_PHYS:
        case PGMPOOLKIND_32BIT_PD_PHYS:
        case PGMPOOLKIND_PAE_PT_FOR_PHYS:
        case PGMPOOLKIND_PAE_PD_PHYS:
        case PGMPOOLKIND_PAE_PDPT_PHYS:
        case PGMPOOLKIND_64BIT_PDPT_FOR_PHYS:
        case PGMPOOLKIND_64BIT_PD_FOR_PHYS:
        case PGMPOOLKIND_EPT_PT_FOR_PHYS:
        case PGMPOOLKIND_EPT_PD_FOR_PHYS:
        case PGMPOOLKIND_EPT_PDPT_FOR_PHYS:
        case PGMPOOLKIND_PAE_PDPT_FOR_32BIT: /* never reuse them for other types */
            return false;

        /*
         * It's perfectly fine to reuse these, except for PAE and non-paging stuff.
         */
        /* Cached page shadows a 32-bit guest structure; a request for a
           PAE/64-bit/phys kind at the same address indicates guest reuse. */
        case PGMPOOLKIND_PAE_PT_FOR_32BIT_4MB:
        case PGMPOOLKIND_32BIT_PT_FOR_32BIT_4MB:
        case PGMPOOLKIND_32BIT_PT_FOR_32BIT_PT:
        case PGMPOOLKIND_PAE_PT_FOR_32BIT_PT:
        case PGMPOOLKIND_PAE_PD0_FOR_32BIT_PD:
        case PGMPOOLKIND_PAE_PD1_FOR_32BIT_PD:
        case PGMPOOLKIND_PAE_PD2_FOR_32BIT_PD:
        case PGMPOOLKIND_PAE_PD3_FOR_32BIT_PD:
        case PGMPOOLKIND_32BIT_PD:
        case PGMPOOLKIND_PAE_PDPT:
            Assert(!PGMPOOL_PAGE_IS_KIND_NESTED(enmKind2));
            switch (enmKind2)
            {
                case PGMPOOLKIND_PAE_PD_FOR_PAE_PD:
                case PGMPOOLKIND_PAE_PT_FOR_PAE_PT:
                case PGMPOOLKIND_64BIT_PD_FOR_64BIT_PD:
                case PGMPOOLKIND_64BIT_PDPT_FOR_64BIT_PDPT:
                case PGMPOOLKIND_64BIT_PML4:
                case PGMPOOLKIND_PAE_PT_FOR_PAE_2MB:
                case PGMPOOLKIND_32BIT_PT_FOR_PHYS:
                case PGMPOOLKIND_PAE_PT_FOR_PHYS:
                case PGMPOOLKIND_64BIT_PDPT_FOR_PHYS:
                case PGMPOOLKIND_64BIT_PD_FOR_PHYS:
                case PGMPOOLKIND_EPT_PDPT_FOR_PHYS:
                case PGMPOOLKIND_EPT_PD_FOR_PHYS:
                case PGMPOOLKIND_EPT_PT_FOR_PHYS:
                    return true;
                default:
                    return false;
            }

        /*
         * It's perfectly fine to reuse these, except for PAE and non-paging stuff.
         */
        /* Mirror of the above: cached page shadows a PAE/64-bit guest
           structure; a 32-bit/phys request means reuse. */
        case PGMPOOLKIND_PAE_PD_FOR_PAE_PD:
        case PGMPOOLKIND_PAE_PT_FOR_PAE_PT:
        case PGMPOOLKIND_64BIT_PD_FOR_64BIT_PD:
        case PGMPOOLKIND_64BIT_PDPT_FOR_64BIT_PDPT:
        case PGMPOOLKIND_64BIT_PML4:
        case PGMPOOLKIND_PAE_PT_FOR_PAE_2MB:
            Assert(!PGMPOOL_PAGE_IS_KIND_NESTED(enmKind2));
            switch (enmKind2)
            {
                case PGMPOOLKIND_PAE_PT_FOR_32BIT_4MB:
                case PGMPOOLKIND_32BIT_PT_FOR_32BIT_4MB:
                case PGMPOOLKIND_32BIT_PT_FOR_32BIT_PT:
                case PGMPOOLKIND_PAE_PT_FOR_32BIT_PT:
                case PGMPOOLKIND_PAE_PD0_FOR_32BIT_PD:
                case PGMPOOLKIND_PAE_PD1_FOR_32BIT_PD:
                case PGMPOOLKIND_PAE_PD2_FOR_32BIT_PD:
                case PGMPOOLKIND_PAE_PD3_FOR_32BIT_PD:
                case PGMPOOLKIND_32BIT_PT_FOR_PHYS:
                case PGMPOOLKIND_PAE_PT_FOR_PHYS:
                case PGMPOOLKIND_64BIT_PDPT_FOR_PHYS:
                case PGMPOOLKIND_64BIT_PD_FOR_PHYS:
                case PGMPOOLKIND_EPT_PDPT_FOR_PHYS:
                case PGMPOOLKIND_EPT_PD_FOR_PHYS:
                case PGMPOOLKIND_EPT_PT_FOR_PHYS:
                    return true;
                default:
                    return false;
            }

#ifdef VBOX_WITH_NESTED_HWVIRT_VMX_EPT
        /* Nested (EPT) shadow pages are only ever reused by other nested kinds. */
        case PGMPOOLKIND_EPT_PT_FOR_EPT_PT:
        case PGMPOOLKIND_EPT_PT_FOR_EPT_2MB:
        case PGMPOOLKIND_EPT_PD_FOR_EPT_PD:
        case PGMPOOLKIND_EPT_PDPT_FOR_EPT_PDPT:
            return PGMPOOL_PAGE_IS_KIND_NESTED(enmKind2);

        case PGMPOOLKIND_EPT_PML4_FOR_EPT_PML4:
            return false;
#endif

        /*
         * These cannot be flushed, and it's common to reuse the PDs as PTs.
         */
        case PGMPOOLKIND_ROOT_NESTED:
            return false;

        default:
            AssertFatalMsgFailed(("enmKind1=%d\n", enmKind1));
    }
}
2344
2345
/**
 * Attempts to satisfy a pgmPoolAlloc request from the cache.
 *
 * @returns VBox status code.
 * @retval  VINF_PGM_CACHED_PAGE on success.
 * @retval  VERR_FILE_NOT_FOUND if not found.
 * @param   pPool       The pool.
 * @param   GCPhys      The GC physical address of the page we're gonna shadow.
 * @param   enmKind     The kind of mapping.
 * @param   enmAccess   Access type for the mapping (only relevant for big pages)
 * @param   fA20Enabled Whether the CPU has the A20 gate enabled.
 * @param   iUser       The shadow page pool index of the user table. This is
 *                      NIL_PGMPOOL_IDX for root pages.
 * @param   iUserTable  The index into the user table (shadowed). Ignored if
 *                      root page
 * @param   ppPage      Where to store the pointer to the page.
 */
static int pgmPoolCacheAlloc(PPGMPOOL pPool, RTGCPHYS GCPhys, PGMPOOLKIND enmKind, PGMPOOLACCESS enmAccess, bool fA20Enabled,
                             uint16_t iUser, uint32_t iUserTable, PPPGMPOOLPAGE ppPage)
{
    /*
     * Look up the GCPhys in the hash.
     */
    unsigned i = pPool->aiHash[PGMPOOL_HASH(GCPhys)];
    Log3(("pgmPoolCacheAlloc: %RGp kind %s iUser=%d iUserTable=%x SLOT=%d\n", GCPhys, pgmPoolPoolKindToStr(enmKind), iUser, iUserTable, i));
    if (i != NIL_PGMPOOL_IDX)
    {
        do
        {
            PPGMPOOLPAGE pPage = &pPool->aPages[i];
            Log4(("pgmPoolCacheAlloc: slot %d found page %RGp\n", i, pPage->GCPhys));
            if (pPage->GCPhys == GCPhys)
            {
                /* A cache hit requires exact kind, access and A20 state match. */
                if (   (PGMPOOLKIND)pPage->enmKind == enmKind
                    && (PGMPOOLACCESS)pPage->enmAccess == enmAccess
                    && pPage->fA20Enabled == fA20Enabled)
                {
                    /* Put it at the start of the use list to make sure pgmPoolTrackAddUser
                     * doesn't flush it in case there are no more free use records.
                     */
                    pgmPoolCacheUsed(pPool, pPage);

                    int rc = VINF_SUCCESS;
                    if (iUser != NIL_PGMPOOL_IDX)
                        rc = pgmPoolTrackAddUser(pPool, pPage, iUser, iUserTable);
                    if (RT_SUCCESS(rc))
                    {
                        Assert((PGMPOOLKIND)pPage->enmKind == enmKind);
                        *ppPage = pPage;
                        if (pPage->cModifications)
                            pPage->cModifications = 1; /* reset counter (can't use 0, or else it will be reinserted in the modified list) */
                        STAM_COUNTER_INC(&pPool->StatCacheHits);
                        return VINF_PGM_CACHED_PAGE;
                    }
                    return rc;
                }

                if ((PGMPOOLKIND)pPage->enmKind != enmKind)
                {
                    /*
                     * The kind is different. In some cases we should now flush the page
                     * as it has been reused, but in most cases this is normal remapping
                     * of PDs as PT or big pages using the GCPhys field in a slightly
                     * different way than the other kinds.
                     */
                    if (pgmPoolCacheReusedByKind((PGMPOOLKIND)pPage->enmKind, enmKind))
                    {
                        STAM_COUNTER_INC(&pPool->StatCacheKindMismatches);
                        pgmPoolFlushPage(pPool, pPage);
                        break; /* stop searching; the caller allocates a fresh page */
                    }
                }
            }

            /* next */
            i = pPage->iNext;
        } while (i != NIL_PGMPOOL_IDX);
    }

    Log3(("pgmPoolCacheAlloc: Missed GCPhys=%RGp enmKind=%s\n", GCPhys, pgmPoolPoolKindToStr(enmKind)));
    STAM_COUNTER_INC(&pPool->StatCacheMisses);
    return VERR_FILE_NOT_FOUND;
}
2429
2430
2431/**
2432 * Inserts a page into the cache.
2433 *
2434 * @param pPool The pool.
2435 * @param pPage The cached page.
2436 * @param fCanBeCached Set if the page is fit for caching from the caller's point of view.
2437 */
2438static void pgmPoolCacheInsert(PPGMPOOL pPool, PPGMPOOLPAGE pPage, bool fCanBeCached)
2439{
2440 /*
2441 * Insert into the GCPhys hash if the page is fit for that.
2442 */
2443 Assert(!pPage->fCached);
2444 if (fCanBeCached)
2445 {
2446 pPage->fCached = true;
2447 pgmPoolHashInsert(pPool, pPage);
2448 Log3(("pgmPoolCacheInsert: Caching %p:{.Core=%RHp, .idx=%d, .enmKind=%s, GCPhys=%RGp}\n",
2449 pPage, pPage->Core.Key, pPage->idx, pgmPoolPoolKindToStr(pPage->enmKind), pPage->GCPhys));
2450 STAM_COUNTER_INC(&pPool->StatCacheCacheable);
2451 }
2452 else
2453 {
2454 Log3(("pgmPoolCacheInsert: Not caching %p:{.Core=%RHp, .idx=%d, .enmKind=%s, GCPhys=%RGp}\n",
2455 pPage, pPage->Core.Key, pPage->idx, pgmPoolPoolKindToStr(pPage->enmKind), pPage->GCPhys));
2456 STAM_COUNTER_INC(&pPool->StatCacheUncacheable);
2457 }
2458
2459 /*
2460 * Insert at the head of the age list.
2461 */
2462 pPage->iAgePrev = NIL_PGMPOOL_IDX;
2463 pPage->iAgeNext = pPool->iAgeHead;
2464 if (pPool->iAgeHead != NIL_PGMPOOL_IDX)
2465 pPool->aPages[pPool->iAgeHead].iAgePrev = pPage->idx;
2466 else
2467 pPool->iAgeTail = pPage->idx;
2468 pPool->iAgeHead = pPage->idx;
2469}
2470
2471
2472/**
2473 * Flushes a cached page.
2474 *
2475 * @param pPool The pool.
2476 * @param pPage The cached page.
2477 */
2478static void pgmPoolCacheFlushPage(PPGMPOOL pPool, PPGMPOOLPAGE pPage)
2479{
2480 Log3(("pgmPoolCacheFlushPage: %RGp\n", pPage->GCPhys));
2481
2482 /*
2483 * Remove the page from the hash.
2484 */
2485 if (pPage->fCached)
2486 {
2487 pPage->fCached = false;
2488 pgmPoolHashRemove(pPool, pPage);
2489 }
2490 else
2491 Assert(pPage->iNext == NIL_PGMPOOL_IDX);
2492
2493 /*
2494 * Remove it from the age list.
2495 */
2496 if (pPage->iAgeNext != NIL_PGMPOOL_IDX)
2497 pPool->aPages[pPage->iAgeNext].iAgePrev = pPage->iAgePrev;
2498 else
2499 pPool->iAgeTail = pPage->iAgePrev;
2500 if (pPage->iAgePrev != NIL_PGMPOOL_IDX)
2501 pPool->aPages[pPage->iAgePrev].iAgeNext = pPage->iAgeNext;
2502 else
2503 pPool->iAgeHead = pPage->iAgeNext;
2504 pPage->iAgeNext = NIL_PGMPOOL_IDX;
2505 pPage->iAgePrev = NIL_PGMPOOL_IDX;
2506}
2507
2508
/**
 * Looks for pages sharing the monitor.
 *
 * Monitored pages shadowing the same guest page are chained together; only
 * the head of the chain owns the physical access handler. This locates that
 * head for the guest page pNewPage is about to shadow.
 *
 * @returns Pointer to the head page.
 * @returns NULL if not found.
 * @param   pPool       The Pool
 * @param   pNewPage    The page which is going to be monitored.
 */
static PPGMPOOLPAGE pgmPoolMonitorGetPageByGCPhys(PPGMPOOL pPool, PPGMPOOLPAGE pNewPage)
{
    /*
     * Look up the GCPhys in the hash.
     */
    RTGCPHYS GCPhys = pNewPage->GCPhys & ~(RTGCPHYS)PAGE_OFFSET_MASK;
    unsigned i = pPool->aiHash[PGMPOOL_HASH(GCPhys)];
    if (i == NIL_PGMPOOL_IDX)
        return NULL;
    do
    {
        PPGMPOOLPAGE pPage = &pPool->aPages[i];
        /* Unsigned compare: true only when pPage->GCPhys lies within the same
           page (pPage->GCPhys may carry sub-page info for some kinds). */
        if (    pPage->GCPhys - GCPhys < PAGE_SIZE
            &&  pPage != pNewPage)
        {
            switch (pPage->enmKind)
            {
                /* Kinds that are write monitored: walk back to the chain head. */
                case PGMPOOLKIND_32BIT_PT_FOR_32BIT_PT:
                case PGMPOOLKIND_PAE_PT_FOR_32BIT_PT:
                case PGMPOOLKIND_PAE_PT_FOR_PAE_PT:
                case PGMPOOLKIND_PAE_PD0_FOR_32BIT_PD:
                case PGMPOOLKIND_PAE_PD1_FOR_32BIT_PD:
                case PGMPOOLKIND_PAE_PD2_FOR_32BIT_PD:
                case PGMPOOLKIND_PAE_PD3_FOR_32BIT_PD:
                case PGMPOOLKIND_PAE_PD_FOR_PAE_PD:
                case PGMPOOLKIND_64BIT_PD_FOR_64BIT_PD:
                case PGMPOOLKIND_64BIT_PDPT_FOR_64BIT_PDPT:
                case PGMPOOLKIND_64BIT_PML4:
                case PGMPOOLKIND_32BIT_PD:
                case PGMPOOLKIND_PAE_PDPT:
#ifdef VBOX_WITH_NESTED_HWVIRT_VMX_EPT
                case PGMPOOLKIND_EPT_PT_FOR_EPT_PT:
                case PGMPOOLKIND_EPT_PD_FOR_EPT_PD:
                case PGMPOOLKIND_EPT_PDPT_FOR_EPT_PDPT:
#endif
                {
                    /* find the head */
                    while (pPage->iMonitoredPrev != NIL_PGMPOOL_IDX)
                    {
                        Assert(pPage->iMonitoredPrev != pPage->idx);
                        pPage = &pPool->aPages[pPage->iMonitoredPrev];
                    }
                    return pPage;
                }

                /* ignore, no monitoring. */
                case PGMPOOLKIND_32BIT_PT_FOR_32BIT_4MB:
                case PGMPOOLKIND_PAE_PT_FOR_PAE_2MB:
                case PGMPOOLKIND_PAE_PT_FOR_32BIT_4MB:
                case PGMPOOLKIND_32BIT_PT_FOR_PHYS:
                case PGMPOOLKIND_PAE_PT_FOR_PHYS:
                case PGMPOOLKIND_64BIT_PDPT_FOR_PHYS:
                case PGMPOOLKIND_64BIT_PD_FOR_PHYS:
                case PGMPOOLKIND_EPT_PDPT_FOR_PHYS:
                case PGMPOOLKIND_EPT_PD_FOR_PHYS:
                case PGMPOOLKIND_EPT_PT_FOR_PHYS:
                case PGMPOOLKIND_ROOT_NESTED:
                case PGMPOOLKIND_PAE_PD_PHYS:
                case PGMPOOLKIND_PAE_PDPT_PHYS:
                case PGMPOOLKIND_32BIT_PD_PHYS:
                case PGMPOOLKIND_PAE_PDPT_FOR_32BIT:
#ifdef VBOX_WITH_NESTED_HWVIRT_VMX_EPT
                case PGMPOOLKIND_EPT_PT_FOR_EPT_2MB:
                case PGMPOOLKIND_EPT_PML4_FOR_EPT_PML4:
#endif
                    break;
                default:
                    AssertFatalMsgFailed(("enmKind=%d idx=%d\n", pPage->enmKind, pPage->idx));
            }
        }

        /* next */
        i = pPage->iNext;
    } while (i != NIL_PGMPOOL_IDX);
    return NULL;
}
2593
2594
2595/**
2596 * Enabled write monitoring of a guest page.
2597 *
2598 * @returns VBox status code.
2599 * @retval VINF_SUCCESS on success.
2600 * @param pPool The pool.
2601 * @param pPage The cached page.
2602 */
static int pgmPoolMonitorInsert(PPGMPOOL pPool, PPGMPOOLPAGE pPage)
{
    LogFlow(("pgmPoolMonitorInsert %RGp\n", pPage->GCPhys & ~(RTGCPHYS)PAGE_OFFSET_MASK));

    /*
     * Filter out the relevant kinds.
     */
    switch (pPage->enmKind)
    {
        /* Shadows of guest paging structures: these need write monitoring so the
           shadow copy can be invalidated when the guest modifies its tables. */
        case PGMPOOLKIND_32BIT_PT_FOR_32BIT_PT:
        case PGMPOOLKIND_PAE_PT_FOR_32BIT_PT:
        case PGMPOOLKIND_PAE_PD_FOR_PAE_PD:
        case PGMPOOLKIND_PAE_PT_FOR_PAE_PT:
        case PGMPOOLKIND_64BIT_PD_FOR_64BIT_PD:
        case PGMPOOLKIND_64BIT_PDPT_FOR_64BIT_PDPT:
        case PGMPOOLKIND_64BIT_PML4:
        case PGMPOOLKIND_PAE_PD0_FOR_32BIT_PD:
        case PGMPOOLKIND_PAE_PD1_FOR_32BIT_PD:
        case PGMPOOLKIND_PAE_PD2_FOR_32BIT_PD:
        case PGMPOOLKIND_PAE_PD3_FOR_32BIT_PD:
        case PGMPOOLKIND_32BIT_PD:
        case PGMPOOLKIND_PAE_PDPT:
            break;

        /* Large-page splits, physical-only tables and the nested root do not
           mirror a guest structure at this GCPhys. */
        case PGMPOOLKIND_32BIT_PT_FOR_32BIT_4MB:
        case PGMPOOLKIND_PAE_PT_FOR_32BIT_4MB:
        case PGMPOOLKIND_PAE_PT_FOR_PAE_2MB:
        case PGMPOOLKIND_32BIT_PT_FOR_PHYS:
        case PGMPOOLKIND_PAE_PT_FOR_PHYS:
        case PGMPOOLKIND_64BIT_PDPT_FOR_PHYS:
        case PGMPOOLKIND_64BIT_PD_FOR_PHYS:
        case PGMPOOLKIND_EPT_PDPT_FOR_PHYS:
        case PGMPOOLKIND_EPT_PD_FOR_PHYS:
        case PGMPOOLKIND_EPT_PT_FOR_PHYS:
        case PGMPOOLKIND_ROOT_NESTED:
            /* Nothing to monitor here. */
            return VINF_SUCCESS;

        case PGMPOOLKIND_32BIT_PD_PHYS:
        case PGMPOOLKIND_PAE_PDPT_PHYS:
        case PGMPOOLKIND_PAE_PD_PHYS:
        case PGMPOOLKIND_PAE_PDPT_FOR_32BIT:
            /* Nothing to monitor here. */
            return VINF_SUCCESS;

#ifdef VBOX_WITH_NESTED_HWVIRT_VMX_EPT
        /* Shadows of nested-guest EPT structures are monitored as well. */
        case PGMPOOLKIND_EPT_PT_FOR_EPT_PT:
        case PGMPOOLKIND_EPT_PD_FOR_EPT_PD:
        case PGMPOOLKIND_EPT_PDPT_FOR_EPT_PDPT:
            break;

        case PGMPOOLKIND_EPT_PT_FOR_EPT_2MB:
        case PGMPOOLKIND_EPT_PML4_FOR_EPT_PML4:
            /* Nothing to monitor here. */
            return VINF_SUCCESS;
#endif

        default:
            AssertFatalMsgFailed(("This can't happen! enmKind=%d\n", pPage->enmKind));
    }

    /*
     * Install handler.
     */
    int rc;
    PPGMPOOLPAGE pPageHead = pgmPoolMonitorGetPageByGCPhys(pPool, pPage);
    if (pPageHead)
    {
        /* Another pool page already monitors this guest page: just join its
           doubly-linked monitoring list; the physical handler stays as-is. */
        Assert(pPageHead != pPage); Assert(pPageHead->iMonitoredNext != pPage->idx);
        Assert(pPageHead->iMonitoredPrev != pPage->idx);

#ifdef PGMPOOL_WITH_OPTIMIZED_DIRTY_PT
        if (pPageHead->fDirty)
            pgmPoolFlushDirtyPage(pPool->CTX_SUFF(pVM), pPool, pPageHead->idxDirtyEntry, false /* do not remove */);
#endif

        /* Insert pPage right after the list head. */
        pPage->iMonitoredPrev = pPageHead->idx;
        pPage->iMonitoredNext = pPageHead->iMonitoredNext;
        if (pPageHead->iMonitoredNext != NIL_PGMPOOL_IDX)
            pPool->aPages[pPageHead->iMonitoredNext].iMonitoredPrev = pPage->idx;
        pPageHead->iMonitoredNext = pPage->idx;
        rc = VINF_SUCCESS;
        if (PGMPOOL_PAGE_IS_NESTED(pPage))
            Log7Func(("Adding to monitoring list GCPhysPage=%RGp\n", pPage->GCPhys));
    }
    else
    {
        /* First monitor of this guest page: register a physical write handler
           covering the whole page, with this page's pool index as user arg. */
        if (PGMPOOL_PAGE_IS_NESTED(pPage))
            Log7Func(("Started monitoring GCPhysPage=%RGp HCPhys=%RHp enmKind=%s\n", pPage->GCPhys, pPage->Core.Key, pgmPoolPoolKindToStr(pPage->enmKind)));

        Assert(pPage->iMonitoredNext == NIL_PGMPOOL_IDX); Assert(pPage->iMonitoredPrev == NIL_PGMPOOL_IDX);
        PVMCC pVM = pPool->CTX_SUFF(pVM);
        const RTGCPHYS GCPhysPage = pPage->GCPhys & ~(RTGCPHYS)PAGE_OFFSET_MASK;
        rc = PGMHandlerPhysicalRegister(pVM, GCPhysPage, GCPhysPage + PAGE_OFFSET_MASK, pPool->hAccessHandlerType,
                                        pPage - &pPool->aPages[0], NIL_RTR3PTR /*pszDesc*/);
        /** @todo we should probably deal with out-of-memory conditions here, but for now increasing
         * the heap size should suffice. */
        AssertFatalMsgRC(rc, ("PGMHandlerPhysicalRegisterEx %RGp failed with %Rrc\n", GCPhysPage, rc));
        PVMCPU pVCpu = VMMGetCpu(pVM);
        AssertFatalMsg(!(pVCpu->pgm.s.fSyncFlags & PGM_SYNC_CLEAR_PGM_POOL) || VMCPU_FF_IS_SET(pVCpu, VMCPU_FF_PGM_SYNC_CR3), ("fSyncFlags=%x syncff=%d\n", pVCpu->pgm.s.fSyncFlags, VMCPU_FF_IS_SET(pVCpu, VMCPU_FF_PGM_SYNC_CR3)));
    }
    pPage->fMonitored = true;
    return rc;
}
2707
2708
/**
 * Disables write monitoring of a guest page.
 *
 * Undoes pgmPoolMonitorInsert: unlinks the page from the per-GCPhys
 * monitoring list, and deregisters the physical access handler when this
 * was the last pool page monitoring that guest page.
 *
 * @returns VBox status code.
 * @retval  VINF_SUCCESS on success.
 * @param   pPool       The pool.
 * @param   pPage       The cached page.
 */
static int pgmPoolMonitorFlush(PPGMPOOL pPool, PPGMPOOLPAGE pPage)
{
    /*
     * Filter out the relevant kinds.
     */
    switch (pPage->enmKind)
    {
        /* Shadows of guest paging structures - the monitored kinds. */
        case PGMPOOLKIND_32BIT_PT_FOR_32BIT_PT:
        case PGMPOOLKIND_PAE_PT_FOR_32BIT_PT:
        case PGMPOOLKIND_PAE_PD_FOR_PAE_PD:
        case PGMPOOLKIND_PAE_PT_FOR_PAE_PT:
        case PGMPOOLKIND_64BIT_PD_FOR_64BIT_PD:
        case PGMPOOLKIND_64BIT_PDPT_FOR_64BIT_PDPT:
        case PGMPOOLKIND_64BIT_PML4:
        case PGMPOOLKIND_32BIT_PD:
        case PGMPOOLKIND_PAE_PDPT:
        case PGMPOOLKIND_PAE_PD0_FOR_32BIT_PD:
        case PGMPOOLKIND_PAE_PD1_FOR_32BIT_PD:
        case PGMPOOLKIND_PAE_PD2_FOR_32BIT_PD:
        case PGMPOOLKIND_PAE_PD3_FOR_32BIT_PD:
            break;

        /* Kinds that pgmPoolMonitorInsert never monitors. */
        case PGMPOOLKIND_32BIT_PT_FOR_32BIT_4MB:
        case PGMPOOLKIND_PAE_PT_FOR_32BIT_4MB:
        case PGMPOOLKIND_PAE_PT_FOR_PAE_2MB:
        case PGMPOOLKIND_32BIT_PT_FOR_PHYS:
        case PGMPOOLKIND_PAE_PT_FOR_PHYS:
        case PGMPOOLKIND_64BIT_PDPT_FOR_PHYS:
        case PGMPOOLKIND_64BIT_PD_FOR_PHYS:
        case PGMPOOLKIND_EPT_PDPT_FOR_PHYS:
        case PGMPOOLKIND_EPT_PD_FOR_PHYS:
        case PGMPOOLKIND_EPT_PT_FOR_PHYS:
        case PGMPOOLKIND_ROOT_NESTED:
        case PGMPOOLKIND_PAE_PD_PHYS:
        case PGMPOOLKIND_PAE_PDPT_PHYS:
        case PGMPOOLKIND_32BIT_PD_PHYS:
            /* Nothing to monitor here. */
            Assert(!pPage->fMonitored);
            return VINF_SUCCESS;

#ifdef VBOX_WITH_NESTED_HWVIRT_VMX_EPT
        case PGMPOOLKIND_EPT_PT_FOR_EPT_PT:
        case PGMPOOLKIND_EPT_PD_FOR_EPT_PD:
        case PGMPOOLKIND_EPT_PDPT_FOR_EPT_PDPT:
            break;

        case PGMPOOLKIND_EPT_PT_FOR_EPT_2MB:
        case PGMPOOLKIND_EPT_PML4_FOR_EPT_PML4:
            /* Nothing to monitor here. */
            Assert(!pPage->fMonitored);
            return VINF_SUCCESS;
#endif

        default:
            AssertFatalMsgFailed(("This can't happen! enmKind=%d\n", pPage->enmKind));
    }
    Assert(pPage->fMonitored);

    /*
     * Remove the page from the monitored list or uninstall it if last.
     */
    const PVMCC pVM = pPool->CTX_SUFF(pVM);
    int rc;
    if (    pPage->iMonitoredNext != NIL_PGMPOOL_IDX
        ||  pPage->iMonitoredPrev != NIL_PGMPOOL_IDX)
    {
        if (pPage->iMonitoredPrev == NIL_PGMPOOL_IDX)
        {
            /* Head of the list: promote the next page to head and repoint the
               physical handler's user argument at it. */
            PPGMPOOLPAGE pNewHead = &pPool->aPages[pPage->iMonitoredNext];
            pNewHead->iMonitoredPrev = NIL_PGMPOOL_IDX;
            rc = PGMHandlerPhysicalChangeUserArg(pVM, pPage->GCPhys & ~(RTGCPHYS)PAGE_OFFSET_MASK, pPage->iMonitoredNext);

            AssertFatalRCSuccess(rc);
            pPage->iMonitoredNext = NIL_PGMPOOL_IDX;
        }
        else
        {
            /* Middle or tail: plain doubly-linked list unlink. */
            pPool->aPages[pPage->iMonitoredPrev].iMonitoredNext = pPage->iMonitoredNext;
            if (pPage->iMonitoredNext != NIL_PGMPOOL_IDX)
            {
                pPool->aPages[pPage->iMonitoredNext].iMonitoredPrev = pPage->iMonitoredPrev;
                pPage->iMonitoredNext = NIL_PGMPOOL_IDX;
            }
            pPage->iMonitoredPrev = NIL_PGMPOOL_IDX;
            rc = VINF_SUCCESS;
        }
    }
    else
    {
        /* Last page monitoring this GCPhys: drop the physical access handler. */
        rc = PGMHandlerPhysicalDeregister(pVM, pPage->GCPhys & ~(RTGCPHYS)PAGE_OFFSET_MASK);
        AssertFatalRC(rc);
        PVMCPU pVCpu = VMMGetCpu(pVM);
        AssertFatalMsg(!(pVCpu->pgm.s.fSyncFlags & PGM_SYNC_CLEAR_PGM_POOL) || VMCPU_FF_IS_SET(pVCpu, VMCPU_FF_PGM_SYNC_CR3),
                  ("%#x %#x\n", pVCpu->pgm.s.fSyncFlags, pVM->fGlobalForcedActions));
    }
    pPage->fMonitored = false;

    /*
     * Remove it from the list of modified pages (if in it).
     */
    pgmPoolMonitorModifiedRemove(pPool, pPage);

    if (PGMPOOL_PAGE_IS_NESTED(pPage))
        Log7Func(("Stopped monitoring %RGp\n", pPage->GCPhys));

    return rc;
}
2824
2825
2826/**
2827 * Inserts the page into the list of modified pages.
2828 *
2829 * @param pPool The pool.
2830 * @param pPage The page.
2831 */
2832void pgmPoolMonitorModifiedInsert(PPGMPOOL pPool, PPGMPOOLPAGE pPage)
2833{
2834 Log3(("pgmPoolMonitorModifiedInsert: idx=%d\n", pPage->idx));
2835 AssertMsg( pPage->iModifiedNext == NIL_PGMPOOL_IDX
2836 && pPage->iModifiedPrev == NIL_PGMPOOL_IDX
2837 && pPool->iModifiedHead != pPage->idx,
2838 ("Next=%d Prev=%d idx=%d cModifications=%d Head=%d cModifiedPages=%d\n",
2839 pPage->iModifiedNext, pPage->iModifiedPrev, pPage->idx, pPage->cModifications,
2840 pPool->iModifiedHead, pPool->cModifiedPages));
2841
2842 pPage->iModifiedNext = pPool->iModifiedHead;
2843 if (pPool->iModifiedHead != NIL_PGMPOOL_IDX)
2844 pPool->aPages[pPool->iModifiedHead].iModifiedPrev = pPage->idx;
2845 pPool->iModifiedHead = pPage->idx;
2846 pPool->cModifiedPages++;
2847#ifdef VBOX_WITH_STATISTICS
2848 if (pPool->cModifiedPages > pPool->cModifiedPagesHigh)
2849 pPool->cModifiedPagesHigh = pPool->cModifiedPages;
2850#endif
2851}
2852
2853
2854/**
2855 * Removes the page from the list of modified pages and resets the
2856 * modification counter.
2857 *
2858 * @param pPool The pool.
2859 * @param pPage The page which is believed to be in the list of modified pages.
2860 */
2861static void pgmPoolMonitorModifiedRemove(PPGMPOOL pPool, PPGMPOOLPAGE pPage)
2862{
2863 Log3(("pgmPoolMonitorModifiedRemove: idx=%d cModifications=%d\n", pPage->idx, pPage->cModifications));
2864 if (pPool->iModifiedHead == pPage->idx)
2865 {
2866 Assert(pPage->iModifiedPrev == NIL_PGMPOOL_IDX);
2867 pPool->iModifiedHead = pPage->iModifiedNext;
2868 if (pPage->iModifiedNext != NIL_PGMPOOL_IDX)
2869 {
2870 pPool->aPages[pPage->iModifiedNext].iModifiedPrev = NIL_PGMPOOL_IDX;
2871 pPage->iModifiedNext = NIL_PGMPOOL_IDX;
2872 }
2873 pPool->cModifiedPages--;
2874 }
2875 else if (pPage->iModifiedPrev != NIL_PGMPOOL_IDX)
2876 {
2877 pPool->aPages[pPage->iModifiedPrev].iModifiedNext = pPage->iModifiedNext;
2878 if (pPage->iModifiedNext != NIL_PGMPOOL_IDX)
2879 {
2880 pPool->aPages[pPage->iModifiedNext].iModifiedPrev = pPage->iModifiedPrev;
2881 pPage->iModifiedNext = NIL_PGMPOOL_IDX;
2882 }
2883 pPage->iModifiedPrev = NIL_PGMPOOL_IDX;
2884 pPool->cModifiedPages--;
2885 }
2886 else
2887 Assert(pPage->iModifiedPrev == NIL_PGMPOOL_IDX);
2888 pPage->cModifications = 0;
2889}
2890
2891
2892/**
2893 * Zaps the list of modified pages, resetting their modification counters in the process.
2894 *
2895 * @param pVM The cross context VM structure.
2896 */
2897static void pgmPoolMonitorModifiedClearAll(PVMCC pVM)
2898{
2899 PGM_LOCK_VOID(pVM);
2900 PPGMPOOL pPool = pVM->pgm.s.CTX_SUFF(pPool);
2901 LogFlow(("pgmPoolMonitorModifiedClearAll: cModifiedPages=%d\n", pPool->cModifiedPages));
2902
2903 unsigned cPages = 0; NOREF(cPages);
2904
2905#ifdef PGMPOOL_WITH_OPTIMIZED_DIRTY_PT
2906 pgmPoolResetDirtyPages(pVM);
2907#endif
2908
2909 uint16_t idx = pPool->iModifiedHead;
2910 pPool->iModifiedHead = NIL_PGMPOOL_IDX;
2911 while (idx != NIL_PGMPOOL_IDX)
2912 {
2913 PPGMPOOLPAGE pPage = &pPool->aPages[idx];
2914 idx = pPage->iModifiedNext;
2915 pPage->iModifiedNext = NIL_PGMPOOL_IDX;
2916 pPage->iModifiedPrev = NIL_PGMPOOL_IDX;
2917 pPage->cModifications = 0;
2918 Assert(++cPages);
2919 }
2920 AssertMsg(cPages == pPool->cModifiedPages, ("%d != %d\n", cPages, pPool->cModifiedPages));
2921 pPool->cModifiedPages = 0;
2922 PGM_UNLOCK(pVM);
2923}
2924
2925
/**
 * Handle SyncCR3 pool tasks
 *
 * @returns VBox status code.
 * @retval  VINF_SUCCESS if successfully added.
 * @retval  VINF_PGM_SYNC_CR3 is it needs to be deferred to ring 3 (GC only)
 * @param   pVCpu   The cross context virtual CPU structure.
 * @remark  Should only be used when monitoring is available, thus placed in
 *          the PGMPOOL_WITH_MONITORING \#ifdef.
 */
int pgmPoolSyncCR3(PVMCPUCC pVCpu)
{
    PVMCC pVM = pVCpu->CTX_SUFF(pVM);
    LogFlow(("pgmPoolSyncCR3 fSyncFlags=%x\n", pVCpu->pgm.s.fSyncFlags));

    /*
     * When monitoring shadowed pages, we reset the modification counters on CR3 sync.
     * Occasionally we will have to clear all the shadow page tables because we wanted
     * to monitor a page which was mapped by too many shadowed page tables. This operation
     * sometimes referred to as a 'lightweight flush'.
     */
# ifdef IN_RING3 /* Don't flush in ring-0 or raw mode, it's taking too long. */
    if (pVCpu->pgm.s.fSyncFlags & PGM_SYNC_CLEAR_PGM_POOL)
        pgmR3PoolClearAll(pVM, false /*fFlushRemTlb*/);
# else /* !IN_RING3 */
    if (pVCpu->pgm.s.fSyncFlags & PGM_SYNC_CLEAR_PGM_POOL)
    {
        /* In ring-0 the full clear is deferred to ring-3 via a forced action. */
        Log(("SyncCR3: PGM_SYNC_CLEAR_PGM_POOL is set -> VINF_PGM_SYNC_CR3\n"));
        VMCPU_FF_SET(pVCpu, VMCPU_FF_PGM_SYNC_CR3); /** @todo no need to do global sync, right? */

        /* Make sure all other VCPUs return to ring 3. */
        if (pVM->cCpus > 1)
        {
            VM_FF_SET(pVM, VM_FF_PGM_POOL_FLUSH_PENDING);
            PGM_INVL_ALL_VCPU_TLBS(pVM);
        }
        return VINF_PGM_SYNC_CR3;
    }
# endif /* !IN_RING3 */
    else
    {
        pgmPoolMonitorModifiedClearAll(pVM);

        /* pgmPoolMonitorModifiedClearAll can cause a pgm pool flush (dirty page clearing), so make sure we handle this! */
        /* NOTE(review): single-level self-recursion; presumably the second pass cannot set
           PGM_SYNC_CLEAR_PGM_POOL again - confirm against the dirty-page clearing code. */
        if (pVCpu->pgm.s.fSyncFlags & PGM_SYNC_CLEAR_PGM_POOL)
        {
            Log(("pgmPoolMonitorModifiedClearAll caused a pgm flush -> call pgmPoolSyncCR3 again!\n"));
            return pgmPoolSyncCR3(pVCpu);
        }
    }
    return VINF_SUCCESS;
}
2978
2979
2980/**
2981 * Frees up at least one user entry.
2982 *
2983 * @returns VBox status code.
2984 * @retval VINF_SUCCESS if successfully added.
2985 *
2986 * @param pPool The pool.
2987 * @param iUser The user index.
2988 */
2989static int pgmPoolTrackFreeOneUser(PPGMPOOL pPool, uint16_t iUser)
2990{
2991 STAM_COUNTER_INC(&pPool->StatTrackFreeUpOneUser);
2992 /*
2993 * Just free cached pages in a braindead fashion.
2994 */
2995 /** @todo walk the age list backwards and free the first with usage. */
2996 int rc = VINF_SUCCESS;
2997 do
2998 {
2999 int rc2 = pgmPoolCacheFreeOne(pPool, iUser);
3000 if (RT_FAILURE(rc2) && rc == VINF_SUCCESS)
3001 rc = rc2;
3002 } while (pPool->iUserFreeHead == NIL_PGMPOOL_USER_INDEX);
3003 return rc;
3004}
3005
3006
3007/**
3008 * Inserts a page into the cache.
3009 *
3010 * This will create user node for the page, insert it into the GCPhys
3011 * hash, and insert it into the age list.
3012 *
3013 * @returns VBox status code.
3014 * @retval VINF_SUCCESS if successfully added.
3015 *
3016 * @param pPool The pool.
3017 * @param pPage The cached page.
3018 * @param GCPhys The GC physical address of the page we're gonna shadow.
3019 * @param iUser The user index.
3020 * @param iUserTable The user table index.
3021 */
3022DECLINLINE(int) pgmPoolTrackInsert(PPGMPOOL pPool, PPGMPOOLPAGE pPage, RTGCPHYS GCPhys, uint16_t iUser, uint32_t iUserTable)
3023{
3024 int rc = VINF_SUCCESS;
3025 PPGMPOOLUSER paUsers = pPool->CTX_SUFF(paUsers);
3026
3027 LogFlow(("pgmPoolTrackInsert GCPhys=%RGp iUser=%d iUserTable=%x\n", GCPhys, iUser, iUserTable)); RT_NOREF_PV(GCPhys);
3028
3029 if (iUser != NIL_PGMPOOL_IDX)
3030 {
3031#ifdef VBOX_STRICT
3032 /*
3033 * Check that the entry doesn't already exists.
3034 */
3035 if (pPage->iUserHead != NIL_PGMPOOL_USER_INDEX)
3036 {
3037 uint16_t i = pPage->iUserHead;
3038 do
3039 {
3040 Assert(i < pPool->cMaxUsers);
3041 AssertMsg(paUsers[i].iUser != iUser || paUsers[i].iUserTable != iUserTable, ("%x %x vs new %x %x\n", paUsers[i].iUser, paUsers[i].iUserTable, iUser, iUserTable));
3042 i = paUsers[i].iNext;
3043 } while (i != NIL_PGMPOOL_USER_INDEX);
3044 }
3045#endif
3046
3047 /*
3048 * Find free a user node.
3049 */
3050 uint16_t i = pPool->iUserFreeHead;
3051 if (i == NIL_PGMPOOL_USER_INDEX)
3052 {
3053 rc = pgmPoolTrackFreeOneUser(pPool, iUser);
3054 if (RT_FAILURE(rc))
3055 return rc;
3056 i = pPool->iUserFreeHead;
3057 }
3058
3059 /*
3060 * Unlink the user node from the free list,
3061 * initialize and insert it into the user list.
3062 */
3063 pPool->iUserFreeHead = paUsers[i].iNext;
3064 paUsers[i].iNext = NIL_PGMPOOL_USER_INDEX;
3065 paUsers[i].iUser = iUser;
3066 paUsers[i].iUserTable = iUserTable;
3067 pPage->iUserHead = i;
3068 }
3069 else
3070 pPage->iUserHead = NIL_PGMPOOL_USER_INDEX;
3071
3072
3073 /*
3074 * Insert into cache and enable monitoring of the guest page if enabled.
3075 *
3076 * Until we implement caching of all levels, including the CR3 one, we'll
3077 * have to make sure we don't try monitor & cache any recursive reuse of
3078 * a monitored CR3 page. Because all windows versions are doing this we'll
3079 * have to be able to do combined access monitoring, CR3 + PT and
3080 * PD + PT (guest PAE).
3081 *
3082 * Update:
3083 * We're now cooperating with the CR3 monitor if an uncachable page is found.
3084 */
3085 const bool fCanBeMonitored = true;
3086 pgmPoolCacheInsert(pPool, pPage, fCanBeMonitored); /* This can be expanded. */
3087 if (fCanBeMonitored)
3088 {
3089 rc = pgmPoolMonitorInsert(pPool, pPage);
3090 AssertRC(rc);
3091 }
3092 return rc;
3093}
3094
3095
3096/**
3097 * Adds a user reference to a page.
3098 *
3099 * This will move the page to the head of the
3100 *
3101 * @returns VBox status code.
3102 * @retval VINF_SUCCESS if successfully added.
3103 *
3104 * @param pPool The pool.
3105 * @param pPage The cached page.
3106 * @param iUser The user index.
3107 * @param iUserTable The user table.
3108 */
3109static int pgmPoolTrackAddUser(PPGMPOOL pPool, PPGMPOOLPAGE pPage, uint16_t iUser, uint32_t iUserTable)
3110{
3111 Log3(("pgmPoolTrackAddUser: GCPhys=%RGp iUser=%x iUserTable=%x\n", pPage->GCPhys, iUser, iUserTable));
3112 PPGMPOOLUSER paUsers = pPool->CTX_SUFF(paUsers);
3113 Assert(iUser != NIL_PGMPOOL_IDX);
3114
3115# ifdef VBOX_STRICT
3116 /*
3117 * Check that the entry doesn't already exists. We only allow multiple
3118 * users of top-level paging structures (SHW_POOL_ROOT_IDX).
3119 */
3120 if (pPage->iUserHead != NIL_PGMPOOL_USER_INDEX)
3121 {
3122 uint16_t i = pPage->iUserHead;
3123 do
3124 {
3125 Assert(i < pPool->cMaxUsers);
3126 /** @todo this assertion looks odd... Shouldn't it be && here? */
3127 AssertMsg(paUsers[i].iUser != iUser || paUsers[i].iUserTable != iUserTable, ("%x %x vs new %x %x\n", paUsers[i].iUser, paUsers[i].iUserTable, iUser, iUserTable));
3128 i = paUsers[i].iNext;
3129 } while (i != NIL_PGMPOOL_USER_INDEX);
3130 }
3131# endif
3132
3133 /*
3134 * Allocate a user node.
3135 */
3136 uint16_t i = pPool->iUserFreeHead;
3137 if (i == NIL_PGMPOOL_USER_INDEX)
3138 {
3139 int rc = pgmPoolTrackFreeOneUser(pPool, iUser);
3140 if (RT_FAILURE(rc))
3141 return rc;
3142 i = pPool->iUserFreeHead;
3143 }
3144 pPool->iUserFreeHead = paUsers[i].iNext;
3145
3146 /*
3147 * Initialize the user node and insert it.
3148 */
3149 paUsers[i].iNext = pPage->iUserHead;
3150 paUsers[i].iUser = iUser;
3151 paUsers[i].iUserTable = iUserTable;
3152 pPage->iUserHead = i;
3153
3154# ifdef PGMPOOL_WITH_OPTIMIZED_DIRTY_PT
3155 if (pPage->fDirty)
3156 pgmPoolFlushDirtyPage(pPool->CTX_SUFF(pVM), pPool, pPage->idxDirtyEntry, false /* do not remove */);
3157# endif
3158
3159 /*
3160 * Tell the cache to update its replacement stats for this page.
3161 */
3162 pgmPoolCacheUsed(pPool, pPage);
3163 return VINF_SUCCESS;
3164}
3165
3166
3167/**
3168 * Frees a user record associated with a page.
3169 *
3170 * This does not clear the entry in the user table, it simply replaces the
3171 * user record to the chain of free records.
3172 *
3173 * @param pPool The pool.
3174 * @param pPage The shadow page.
3175 * @param iUser The shadow page pool index of the user table.
3176 * @param iUserTable The index into the user table (shadowed).
3177 *
3178 * @remarks Don't call this for root pages.
3179 */
3180static void pgmPoolTrackFreeUser(PPGMPOOL pPool, PPGMPOOLPAGE pPage, uint16_t iUser, uint32_t iUserTable)
3181{
3182 Log3(("pgmPoolTrackFreeUser %RGp %x %x\n", pPage->GCPhys, iUser, iUserTable));
3183 PPGMPOOLUSER paUsers = pPool->CTX_SUFF(paUsers);
3184 Assert(iUser != NIL_PGMPOOL_IDX);
3185
3186 /*
3187 * Unlink and free the specified user entry.
3188 */
3189
3190 /* Special: For PAE and 32-bit paging, there is usually no more than one user. */
3191 uint16_t i = pPage->iUserHead;
3192 if ( i != NIL_PGMPOOL_USER_INDEX
3193 && paUsers[i].iUser == iUser
3194 && paUsers[i].iUserTable == iUserTable)
3195 {
3196 pPage->iUserHead = paUsers[i].iNext;
3197
3198 paUsers[i].iUser = NIL_PGMPOOL_IDX;
3199 paUsers[i].iNext = pPool->iUserFreeHead;
3200 pPool->iUserFreeHead = i;
3201 return;
3202 }
3203
3204 /* General: Linear search. */
3205 uint16_t iPrev = NIL_PGMPOOL_USER_INDEX;
3206 while (i != NIL_PGMPOOL_USER_INDEX)
3207 {
3208 if ( paUsers[i].iUser == iUser
3209 && paUsers[i].iUserTable == iUserTable)
3210 {
3211 if (iPrev != NIL_PGMPOOL_USER_INDEX)
3212 paUsers[iPrev].iNext = paUsers[i].iNext;
3213 else
3214 pPage->iUserHead = paUsers[i].iNext;
3215
3216 paUsers[i].iUser = NIL_PGMPOOL_IDX;
3217 paUsers[i].iNext = pPool->iUserFreeHead;
3218 pPool->iUserFreeHead = i;
3219 return;
3220 }
3221 iPrev = i;
3222 i = paUsers[i].iNext;
3223 }
3224
3225 /* Fatal: didn't find it */
3226 AssertFatalMsgFailed(("Didn't find the user entry! iUser=%d iUserTable=%#x GCPhys=%RGp\n",
3227 iUser, iUserTable, pPage->GCPhys));
3228}
3229
3230
#if 0 /* unused */
/**
 * Gets the entry size of a shadow table.
 *
 * (Currently compiled out - kept for reference.)
 *
 * @param   enmKind     The kind of page.
 *
 * @returns The size of the entry in bytes. That is, 4 or 8.
 * @returns If the kind is not for a table, an assertion is raised and 0 is
 *          returned.
 */
DECLINLINE(unsigned) pgmPoolTrackGetShadowEntrySize(PGMPOOLKIND enmKind)
{
    switch (enmKind)
    {
        /* Legacy 32-bit shadow tables have 4 byte entries. */
        case PGMPOOLKIND_32BIT_PT_FOR_32BIT_PT:
        case PGMPOOLKIND_32BIT_PT_FOR_PHYS:
        case PGMPOOLKIND_32BIT_PT_FOR_32BIT_4MB:
        case PGMPOOLKIND_32BIT_PD:
        case PGMPOOLKIND_32BIT_PD_PHYS:
            return 4;

        /* PAE, long mode, EPT and nested shadow tables have 8 byte entries. */
        case PGMPOOLKIND_PAE_PT_FOR_PHYS:
        case PGMPOOLKIND_PAE_PT_FOR_32BIT_PT:
        case PGMPOOLKIND_PAE_PT_FOR_32BIT_4MB:
        case PGMPOOLKIND_PAE_PT_FOR_PAE_PT:
        case PGMPOOLKIND_PAE_PT_FOR_PAE_2MB:
        case PGMPOOLKIND_PAE_PD0_FOR_32BIT_PD:
        case PGMPOOLKIND_PAE_PD1_FOR_32BIT_PD:
        case PGMPOOLKIND_PAE_PD2_FOR_32BIT_PD:
        case PGMPOOLKIND_PAE_PD3_FOR_32BIT_PD:
        case PGMPOOLKIND_PAE_PD_FOR_PAE_PD:
        case PGMPOOLKIND_64BIT_PD_FOR_64BIT_PD:
        case PGMPOOLKIND_64BIT_PDPT_FOR_64BIT_PDPT:
        case PGMPOOLKIND_64BIT_PML4:
        case PGMPOOLKIND_PAE_PDPT:
        case PGMPOOLKIND_ROOT_NESTED:
        case PGMPOOLKIND_64BIT_PDPT_FOR_PHYS:
        case PGMPOOLKIND_64BIT_PD_FOR_PHYS:
        case PGMPOOLKIND_EPT_PDPT_FOR_PHYS:
        case PGMPOOLKIND_EPT_PD_FOR_PHYS:
        case PGMPOOLKIND_EPT_PT_FOR_PHYS:
        case PGMPOOLKIND_PAE_PD_PHYS:
        case PGMPOOLKIND_PAE_PDPT_PHYS:
            return 8;

        default:
            AssertFatalMsgFailed(("enmKind=%d\n", enmKind));
    }
}
#endif /* unused */
3281
#if 0 /* unused */
/**
 * Gets the entry size of a guest table.
 *
 * (Currently compiled out - kept for reference.)
 *
 * @param   enmKind     The kind of page.
 *
 * @returns The size of the entry in bytes. That is, 0, 4 or 8.
 * @returns If the kind is not for a table, an assertion is raised and 0 is
 *          returned.
 */
DECLINLINE(unsigned) pgmPoolTrackGetGuestEntrySize(PGMPOOLKIND enmKind)
{
    switch (enmKind)
    {
        /* Shadows of legacy 32-bit guest structures: 4 byte guest entries. */
        case PGMPOOLKIND_32BIT_PT_FOR_32BIT_PT:
        case PGMPOOLKIND_32BIT_PT_FOR_32BIT_4MB:
        case PGMPOOLKIND_32BIT_PD:
        case PGMPOOLKIND_PAE_PT_FOR_32BIT_PT:
        case PGMPOOLKIND_PAE_PT_FOR_32BIT_4MB:
        case PGMPOOLKIND_PAE_PD0_FOR_32BIT_PD:
        case PGMPOOLKIND_PAE_PD1_FOR_32BIT_PD:
        case PGMPOOLKIND_PAE_PD2_FOR_32BIT_PD:
        case PGMPOOLKIND_PAE_PD3_FOR_32BIT_PD:
            return 4;

        /* Shadows of PAE/long-mode guest structures: 8 byte guest entries. */
        case PGMPOOLKIND_PAE_PT_FOR_PAE_PT:
        case PGMPOOLKIND_PAE_PT_FOR_PAE_2MB:
        case PGMPOOLKIND_PAE_PD_FOR_PAE_PD:
        case PGMPOOLKIND_64BIT_PD_FOR_64BIT_PD:
        case PGMPOOLKIND_64BIT_PDPT_FOR_64BIT_PDPT:
        case PGMPOOLKIND_64BIT_PML4:
        case PGMPOOLKIND_PAE_PDPT:
            return 8;

        /* Kinds with no corresponding guest structure. */
        case PGMPOOLKIND_32BIT_PT_FOR_PHYS:
        case PGMPOOLKIND_PAE_PT_FOR_PHYS:
        case PGMPOOLKIND_64BIT_PDPT_FOR_PHYS:
        case PGMPOOLKIND_64BIT_PD_FOR_PHYS:
        case PGMPOOLKIND_EPT_PDPT_FOR_PHYS:
        case PGMPOOLKIND_EPT_PD_FOR_PHYS:
        case PGMPOOLKIND_EPT_PT_FOR_PHYS:
        case PGMPOOLKIND_ROOT_NESTED:
        case PGMPOOLKIND_PAE_PD_PHYS:
        case PGMPOOLKIND_PAE_PDPT_PHYS:
        case PGMPOOLKIND_32BIT_PD_PHYS:
            /** @todo can we return 0? (nobody is calling this...) */
            AssertFailed();
            return 0;

        default:
            AssertFatalMsgFailed(("enmKind=%d\n", enmKind));
    }
}
#endif /* unused */
3336
3337
/**
 * Checks one shadow page table entry for a mapping of a physical page.
 *
 * @returns true / false indicating removal of all relevant PTEs
 *          (true = PTE kept/updated rather than removed).
 *
 * @param   pVM         The cross context VM structure.
 * @param   pPhysPage   The guest page in question.
 * @param   fFlushPTEs  Flush PTEs or allow them to be updated (e.g. in case of an RW bit change)
 * @param   iShw        The shadow page table.
 * @param   iPte        Page table entry or NIL_PGMPOOL_PHYSEXT_IDX_PTE if unknown
 */
static bool pgmPoolTrackFlushGCPhysPTInt(PVM pVM, PCPGMPAGE pPhysPage, bool fFlushPTEs, uint16_t iShw, uint16_t iPte)
{
    LogFlow(("pgmPoolTrackFlushGCPhysPTInt: pPhysPage=%RHp iShw=%d iPte=%d\n", PGM_PAGE_GET_HCPHYS(pPhysPage), iShw, iPte));
    PPGMPOOL pPool = pVM->pgm.s.CTX_SUFF(pPool);
    bool fRet = false; /* set to true in the branches that keep (merely update) the PTE */

    /*
     * Assert sanity.
     */
    Assert(iPte != NIL_PGMPOOL_PHYSEXT_IDX_PTE);
    AssertFatalMsg(iShw < pPool->cCurPages && iShw != NIL_PGMPOOL_IDX, ("iShw=%d\n", iShw));
    PPGMPOOLPAGE pPage = &pPool->aPages[iShw];

    /*
     * Then, clear the actual mappings to the page in the shadow PT.
     */
    switch (pPage->enmKind)
    {
        /* 32-bit shadow page tables: 4 byte entries. */
        case PGMPOOLKIND_32BIT_PT_FOR_32BIT_PT:
        case PGMPOOLKIND_32BIT_PT_FOR_32BIT_4MB:
        case PGMPOOLKIND_32BIT_PT_FOR_PHYS:
        {
            /* Value the shadow PTE must match: HC physical address + present bit. */
            const uint32_t u32 = PGM_PAGE_GET_HCPHYS(pPhysPage) | X86_PTE_P;
            PX86PT pPT = (PX86PT)PGMPOOL_PAGE_2_PTR(pVM, pPage);
            uint32_t u32AndMask = 0;
            uint32_t u32OrMask = 0;

            if (!fFlushPTEs)
            {
                /* Keep the PTE, only toggle the RW bit according to the handler state. */
                /* Note! Disregarding the PGMPHYSHANDLER_F_NOT_IN_HM bit here. Should be harmless. */
                switch (PGM_PAGE_GET_HNDL_PHYS_STATE(pPhysPage))
                {
                    case PGM_PAGE_HNDL_PHYS_STATE_NONE:         /* No handler installed. */
                    case PGM_PAGE_HNDL_PHYS_STATE_DISABLED:     /* Monitoring is temporarily disabled. */
                        u32OrMask = X86_PTE_RW;
                        u32AndMask = UINT32_MAX;
                        fRet = true;
                        STAM_COUNTER_INC(&pPool->StatTrackFlushEntryKeep);
                        break;

                    case PGM_PAGE_HNDL_PHYS_STATE_WRITE:        /* Write access is monitored. */
                        u32OrMask = 0;
                        u32AndMask = ~X86_PTE_RW;
                        fRet = true;
                        STAM_COUNTER_INC(&pPool->StatTrackFlushEntryKeep);
                        break;
                    default:
                        /* We will end up here when called with an "ALL" access handler. */
                        STAM_COUNTER_INC(&pPool->StatTrackFlushEntry);
                        break;
                }
            }
            else
                STAM_COUNTER_INC(&pPool->StatTrackFlushEntry);

            /* Update the counter if we're removing references. */
            if (!u32AndMask)
            {
                Assert(pPage->cPresent);
                Assert(pPool->cPresent);
                pPage->cPresent--;
                pPool->cPresent--;
            }

            if ((pPT->a[iPte].u & (X86_PTE_PG_MASK | X86_PTE_P)) == u32)
            {
                Log4(("pgmPoolTrackFlushGCPhysPTs: i=%d pte=%RX32\n", iPte, pPT->a[iPte]));
                X86PTE Pte;
                Pte.u = (pPT->a[iPte].u & u32AndMask) | u32OrMask;
                if (Pte.u & PGM_PTFLAGS_TRACK_DIRTY)
                    Pte.u &= ~(X86PGUINT)X86_PTE_RW;    /* need to disallow writes when dirty bit tracking is still active. */

                ASMAtomicWriteU32(&pPT->a[iPte].u, Pte.u);
                PGM_DYNMAP_UNUSED_HINT_VM(pVM, pPT);
                return fRet;
            }
#ifdef LOG_ENABLED
            Log(("iFirstPresent=%d cPresent=%d\n", pPage->iFirstPresent, pPage->cPresent));
            for (unsigned i = 0, cFound = 0; i < RT_ELEMENTS(pPT->a); i++)
                if ((pPT->a[i].u & (X86_PTE_PG_MASK | X86_PTE_P)) == u32)
                {
                    Log(("i=%d cFound=%d\n", i, ++cFound));
                }
#endif
            /* The PTE did not reference pPhysPage as expected - fatal tracking inconsistency. */
            AssertFatalMsgFailed(("iFirstPresent=%d cPresent=%d u32=%RX32 poolkind=%x\n", pPage->iFirstPresent, pPage->cPresent, u32, pPage->enmKind));
            /*PGM_DYNMAP_UNUSED_HINT_VM(pVM, pPT);*/
            break;
        }

        /* PAE/EPT shadow page tables: 8 byte entries, RW bit in the same place. */
        case PGMPOOLKIND_PAE_PT_FOR_32BIT_PT:
        case PGMPOOLKIND_PAE_PT_FOR_32BIT_4MB:
        case PGMPOOLKIND_PAE_PT_FOR_PAE_PT:
        case PGMPOOLKIND_PAE_PT_FOR_PAE_2MB:
        case PGMPOOLKIND_PAE_PT_FOR_PHYS:
        case PGMPOOLKIND_EPT_PT_FOR_PHYS: /* physical mask the same as PAE; RW bit as well; be careful! */
#ifdef VBOX_WITH_NESTED_HWVIRT_VMX_EPT
        case PGMPOOLKIND_EPT_PT_FOR_EPT_PT:
# ifdef PGM_WITH_LARGE_PAGES
        case PGMPOOLKIND_EPT_PT_FOR_EPT_2MB:
# endif
#endif
        {
            /* Value the shadow PTE must match: HC physical address + present bit. */
            const uint64_t u64 = PGM_PAGE_GET_HCPHYS(pPhysPage) | X86_PTE_P;
            PPGMSHWPTPAE pPT = (PPGMSHWPTPAE)PGMPOOL_PAGE_2_PTR(pVM, pPage);
            uint64_t u64OrMask = 0;
            uint64_t u64AndMask = 0;

            if (!fFlushPTEs)
            {
                /* Keep the PTE, only toggle the RW bit according to the handler state. */
                /* Note! Disregarding the PGMPHYSHANDLER_F_NOT_IN_HM bit here. Should be harmless. */
                switch (PGM_PAGE_GET_HNDL_PHYS_STATE(pPhysPage))
                {
                    case PGM_PAGE_HNDL_PHYS_STATE_NONE:         /* No handler installed. */
                    case PGM_PAGE_HNDL_PHYS_STATE_DISABLED:     /* Monitoring is temporarily disabled. */
                        u64OrMask = X86_PTE_RW;
                        u64AndMask = UINT64_MAX;
                        fRet = true;
                        STAM_COUNTER_INC(&pPool->StatTrackFlushEntryKeep);
                        break;

                    case PGM_PAGE_HNDL_PHYS_STATE_WRITE:        /* Write access is monitored. */
                        u64OrMask = 0;
                        u64AndMask = ~(uint64_t)X86_PTE_RW;
                        fRet = true;
                        STAM_COUNTER_INC(&pPool->StatTrackFlushEntryKeep);
                        break;

                    default:
                        /* We will end up here when called with an "ALL" access handler. */
                        STAM_COUNTER_INC(&pPool->StatTrackFlushEntry);
                        break;
                }
            }
            else
                STAM_COUNTER_INC(&pPool->StatTrackFlushEntry);

            /* Update the counter if we're removing references. */
            if (!u64AndMask)
            {
                Assert(pPage->cPresent);
                Assert(pPool->cPresent);
                pPage->cPresent--;
                pPool->cPresent--;
            }

            if ((PGMSHWPTEPAE_GET_U(pPT->a[iPte]) & (X86_PTE_PAE_PG_MASK | X86_PTE_P | X86_PTE_PAE_MBZ_MASK_NX)) == u64)
            {
                Log4(("pgmPoolTrackFlushGCPhysPTs: i=%d pte=%RX64\n", iPte, PGMSHWPTEPAE_GET_LOG(pPT->a[iPte])));
                X86PTEPAE Pte;
                Pte.u = (PGMSHWPTEPAE_GET_U(pPT->a[iPte]) & u64AndMask) | u64OrMask;
                if (Pte.u & PGM_PTFLAGS_TRACK_DIRTY)
                    Pte.u &= ~(X86PGPAEUINT)X86_PTE_RW;    /* need to disallow writes when dirty bit tracking is still active. */

                PGMSHWPTEPAE_ATOMIC_SET(pPT->a[iPte], Pte.u);
                PGM_DYNMAP_UNUSED_HINT_VM(pVM, pPT);
                return fRet;
            }
#ifdef LOG_ENABLED
            Log(("iFirstPresent=%d cPresent=%d\n", pPage->iFirstPresent, pPage->cPresent));
            Log(("Found %RX64 expected %RX64\n", PGMSHWPTEPAE_GET_U(pPT->a[iPte]) & (X86_PTE_PAE_PG_MASK | X86_PTE_P | X86_PTE_PAE_MBZ_MASK_NX), u64));
            for (unsigned i = 0, cFound = 0; i < RT_ELEMENTS(pPT->a); i++)
                if ((PGMSHWPTEPAE_GET_U(pPT->a[i]) & (X86_PTE_PAE_PG_MASK | X86_PTE_P | X86_PTE_PAE_MBZ_MASK_NX)) == u64)
                    Log(("i=%d cFound=%d\n", i, ++cFound));
#endif
            /* The PTE did not reference pPhysPage as expected - fatal tracking inconsistency. */
            AssertFatalMsgFailed(("iFirstPresent=%d cPresent=%d u64=%RX64 poolkind=%x iPte=%d PT=%RX64\n", pPage->iFirstPresent, pPage->cPresent, u64, pPage->enmKind, iPte, PGMSHWPTEPAE_GET_LOG(pPT->a[iPte])));
            /*PGM_DYNMAP_UNUSED_HINT_VM(pVM, pPT);*/
            break;
        }

#ifdef PGM_WITH_LARGE_PAGES
        /* Large page case only. */
        case PGMPOOLKIND_EPT_PD_FOR_PHYS:
        case PGMPOOLKIND_EPT_PD_FOR_EPT_PD:
        {
            Assert(pVM->pgm.s.fNestedPaging);

            /* Value the shadow PDE must match: HC physical address + present + large-page bits. */
            const uint64_t u64 = PGM_PAGE_GET_HCPHYS(pPhysPage) | X86_PDE4M_P | X86_PDE4M_PS;
            PEPTPD pPD = (PEPTPD)PGMPOOL_PAGE_2_PTR(pVM, pPage);

            Assert(   pPage->enmKind != PGMPOOLKIND_EPT_PD_FOR_EPT_PD
                   || (pPD->a[iPte].u & EPT_E_LEAF));

            if ((pPD->a[iPte].u & (EPT_PDE2M_PG_MASK | X86_PDE4M_P | X86_PDE4M_PS)) == u64)
            {
                Log4(("pgmPoolTrackFlushGCPhysPTs: i=%d pde=%RX64\n", iPte, pPD->a[iPte]));
                STAM_COUNTER_INC(&pPool->StatTrackFlushEntry);
                pPD->a[iPte].u = 0;
                PGM_DYNMAP_UNUSED_HINT_VM(pVM, pPD);

                /* Update the counter as we're removing references. */
                Assert(pPage->cPresent);
                Assert(pPool->cPresent);
                pPage->cPresent--;
                pPool->cPresent--;

                return fRet;
            }
# ifdef LOG_ENABLED
            LogRel(("iFirstPresent=%d cPresent=%d\n", pPage->iFirstPresent, pPage->cPresent));
            for (unsigned i = 0, cFound = 0; i < RT_ELEMENTS(pPD->a); i++)
                if ((pPD->a[i].u & (EPT_PDE2M_PG_MASK | X86_PDE4M_P | X86_PDE4M_PS)) == u64)
                    LogRel(("i=%d cFound=%d\n", i, ++cFound));
# endif
            AssertFatalMsgFailed(("iFirstPresent=%d cPresent=%d enmKind=%d\n", pPage->iFirstPresent, pPage->cPresent, pPage->enmKind));
            /*PGM_DYNMAP_UNUSED_HINT_VM(pVM, pPD);*/
            break;
        }

        /* AMD-V nested paging */ /** @todo merge with EPT as we only check the parts that are identical. */
        case PGMPOOLKIND_PAE_PD_PHYS:
        {
            Assert(pVM->pgm.s.fNestedPaging);

            /* Value the shadow PDE must match: HC physical address + present + large-page bits. */
            const uint64_t u64 = PGM_PAGE_GET_HCPHYS(pPhysPage) | X86_PDE4M_P | X86_PDE4M_PS;
            PX86PDPAE pPD = (PX86PDPAE)PGMPOOL_PAGE_2_PTR(pVM, pPage);

            if ((pPD->a[iPte].u & (X86_PDE2M_PAE_PG_MASK | X86_PDE4M_P | X86_PDE4M_PS)) == u64)
            {
                Log4(("pgmPoolTrackFlushGCPhysPTs: i=%d pde=%RX64\n", iPte, pPD->a[iPte]));
                STAM_COUNTER_INC(&pPool->StatTrackFlushEntry);
                pPD->a[iPte].u = 0;
                PGM_DYNMAP_UNUSED_HINT_VM(pVM, pPD);

                /* Update the counter as we're removing references. */
                Assert(pPage->cPresent);
                Assert(pPool->cPresent);
                pPage->cPresent--;
                pPool->cPresent--;
                return fRet;
            }
# ifdef LOG_ENABLED
            Log(("iFirstPresent=%d cPresent=%d\n", pPage->iFirstPresent, pPage->cPresent));
            for (unsigned i = 0, cFound = 0; i < RT_ELEMENTS(pPD->a); i++)
                if ((pPD->a[i].u & (X86_PDE2M_PAE_PG_MASK | X86_PDE4M_P | X86_PDE4M_PS)) == u64)
                    Log(("i=%d cFound=%d\n", i, ++cFound));
# endif
            AssertFatalMsgFailed(("iFirstPresent=%d cPresent=%d\n", pPage->iFirstPresent, pPage->cPresent));
            /*PGM_DYNMAP_UNUSED_HINT_VM(pVM, pPD);*/
            break;
        }
#endif /* PGM_WITH_LARGE_PAGES */

        default:
            AssertFatalMsgFailed(("enmKind=%d iShw=%d\n", pPage->enmKind, iShw));
    }

    /* not reached. */
#ifndef _MSC_VER
    return fRet;
#endif
}
3600
3601
3602/**
3603 * Scans one shadow page table for mappings of a physical page.
3604 *
3605 * @param pVM The cross context VM structure.
3606 * @param pPhysPage The guest page in question.
3607 * @param fFlushPTEs Flush PTEs or allow them to be updated (e.g. in case of an RW bit change)
3608 * @param iShw The shadow page table.
3609 */
3610static void pgmPoolTrackFlushGCPhysPT(PVM pVM, PPGMPAGE pPhysPage, bool fFlushPTEs, uint16_t iShw)
3611{
3612 PPGMPOOL pPool = pVM->pgm.s.CTX_SUFF(pPool); NOREF(pPool);
3613
3614 /* We should only come here with when there's only one reference to this physical page. */
3615 Assert(PGMPOOL_TD_GET_CREFS(PGM_PAGE_GET_TRACKING(pPhysPage)) == 1);
3616
3617 Log2(("pgmPoolTrackFlushGCPhysPT: pPhysPage=%RHp iShw=%d\n", PGM_PAGE_GET_HCPHYS(pPhysPage), iShw));
3618 STAM_PROFILE_START(&pPool->StatTrackFlushGCPhysPT, f);
3619 bool fKeptPTEs = pgmPoolTrackFlushGCPhysPTInt(pVM, pPhysPage, fFlushPTEs, iShw, PGM_PAGE_GET_PTE_INDEX(pPhysPage));
3620 if (!fKeptPTEs)
3621 PGM_PAGE_SET_TRACKING(pVM, pPhysPage, 0);
3622 STAM_PROFILE_STOP(&pPool->StatTrackFlushGCPhysPT, f);
3623}
3624
3625
3626/**
3627 * Flushes a list of shadow page tables mapping the same physical page.
3628 *
3629 * @param pVM The cross context VM structure.
3630 * @param pPhysPage The guest page in question.
3631 * @param fFlushPTEs Flush PTEs or allow them to be updated (e.g. in case of an RW bit change)
3632 * @param iPhysExt The physical cross reference extent list to flush.
3633 */
3634static void pgmPoolTrackFlushGCPhysPTs(PVMCC pVM, PPGMPAGE pPhysPage, bool fFlushPTEs, uint16_t iPhysExt)
3635{
3636 PGM_LOCK_ASSERT_OWNER(pVM);
3637 PPGMPOOL pPool = pVM->pgm.s.CTX_SUFF(pPool);
3638 bool fKeepList = false;
3639
3640 STAM_PROFILE_START(&pPool->StatTrackFlushGCPhysPTs, f);
3641 Log2(("pgmPoolTrackFlushGCPhysPTs: pPhysPage=%RHp iPhysExt=%u\n", PGM_PAGE_GET_HCPHYS(pPhysPage), iPhysExt));
3642
3643 const uint16_t iPhysExtStart = iPhysExt;
3644 PPGMPOOLPHYSEXT pPhysExt;
3645 do
3646 {
3647 Assert(iPhysExt < pPool->cMaxPhysExts);
3648 pPhysExt = &pPool->CTX_SUFF(paPhysExts)[iPhysExt];
3649 for (unsigned i = 0; i < RT_ELEMENTS(pPhysExt->aidx); i++)
3650 {
3651 if (pPhysExt->aidx[i] != NIL_PGMPOOL_IDX)
3652 {
3653 bool fKeptPTEs = pgmPoolTrackFlushGCPhysPTInt(pVM, pPhysPage, fFlushPTEs, pPhysExt->aidx[i], pPhysExt->apte[i]);
3654 if (!fKeptPTEs)
3655 {
3656 pPhysExt->aidx[i] = NIL_PGMPOOL_IDX;
3657 pPhysExt->apte[i] = NIL_PGMPOOL_PHYSEXT_IDX_PTE;
3658 }
3659 else
3660 fKeepList = true;
3661 }
3662 }
3663 /* next */
3664 iPhysExt = pPhysExt->iNext;
3665 } while (iPhysExt != NIL_PGMPOOL_PHYSEXT_INDEX);
3666
3667 if (!fKeepList)
3668 {
3669 /* insert the list into the free list and clear the ram range entry. */
3670 pPhysExt->iNext = pPool->iPhysExtFreeHead;
3671 pPool->iPhysExtFreeHead = iPhysExtStart;
3672 /* Invalidate the tracking data. */
3673 PGM_PAGE_SET_TRACKING(pVM, pPhysPage, 0);
3674 }
3675
3676 STAM_PROFILE_STOP(&pPool->StatTrackFlushGCPhysPTs, f);
3677}
3678
3679
/**
 * Flushes all shadow page table mappings of the given guest page.
 *
 * This is typically called when the host page backing the guest one has been
 * replaced or when the page protection was changed due to a guest access
 * caught by the monitoring.
 *
 * @returns VBox status code.
 * @retval  VINF_SUCCESS if all references has been successfully cleared.
 * @retval  VINF_PGM_SYNC_CR3 if we're better off with a CR3 sync and a page
 *          pool cleaning.  FF and sync flags are set.
 *
 * @param   pVM         The cross context VM structure.
 * @param   GCPhysPage  GC physical address of the page in question
 * @param   pPhysPage   The guest page in question.
 * @param   fFlushPTEs  Flush PTEs or allow them to be updated (e.g. in case of an RW bit change)
 * @param   pfFlushTLBs This is set to @a true if the shadow TLBs should be
 *                      flushed, it is NOT touched if this isn't necessary.
 *                      The caller MUST initialized this to @a false.
 */
int pgmPoolTrackUpdateGCPhys(PVMCC pVM, RTGCPHYS GCPhysPage, PPGMPAGE pPhysPage, bool fFlushPTEs, bool *pfFlushTLBs)
{
    PVMCPUCC pVCpu = VMMGetCpu(pVM);
    PGM_LOCK_VOID(pVM);
    int rc = VINF_SUCCESS;

#ifdef PGM_WITH_LARGE_PAGES
    /* Is this page part of a large page? */
    if (PGM_PAGE_GET_PDE_TYPE(pPhysPage) == PGM_PAGE_PDE_TYPE_PDE)
    {
        RTGCPHYS GCPhysBase = GCPhysPage & X86_PDE2M_PAE_PG_MASK;
        GCPhysPage &= X86_PDE_PAE_PG_MASK;

        /* Fetch the large page base. */
        PPGMPAGE pLargePage;
        if (GCPhysBase != GCPhysPage)
        {
            /* Not the first page of the 2 MB range: look up the PGMPAGE of the base. */
            pLargePage = pgmPhysGetPage(pVM, GCPhysBase);
            AssertFatal(pLargePage);
        }
        else
            pLargePage = pPhysPage;

        Log(("pgmPoolTrackUpdateGCPhys: update large page PDE for %RGp (%RGp)\n", GCPhysBase, GCPhysPage));

        if (PGM_PAGE_GET_PDE_TYPE(pLargePage) == PGM_PAGE_PDE_TYPE_PDE)
        {
            /* Mark the large page as disabled as we need to break it up to change a single page in the 2 MB range. */
            PGM_PAGE_SET_PDE_TYPE(pVM, pLargePage, PGM_PAGE_PDE_TYPE_PDE_DISABLED);
            pVM->pgm.s.cLargePagesDisabled++;

            /* Update the base as that *only* that one has a reference and there's only one PDE to clear. */
            rc = pgmPoolTrackUpdateGCPhys(pVM, GCPhysBase, pLargePage, fFlushPTEs, pfFlushTLBs);

            *pfFlushTLBs = true;
            PGM_UNLOCK(pVM);
            return rc;
        }
    }
#else
    NOREF(GCPhysPage);
#endif /* PGM_WITH_LARGE_PAGES */

    const uint16_t u16 = PGM_PAGE_GET_TRACKING(pPhysPage);
    if (u16)
    {
        /*
         * The zero page is currently screwing up the tracking and we'll
         * have to flush the whole shebang. Unless VBOX_WITH_NEW_LAZY_PAGE_ALLOC
         * is defined, zero pages won't normally be mapped. Some kind of solution
         * will be needed for this problem of course, but it will have to wait...
         */
        if (   PGM_PAGE_IS_ZERO(pPhysPage)
            || PGM_PAGE_IS_BALLOONED(pPhysPage))
            rc = VINF_PGM_GCPHYS_ALIASED;
        else
        {
            if (PGMPOOL_TD_GET_CREFS(u16) != PGMPOOL_TD_CREFS_PHYSEXT)
            {
                /* Single shadow reference: flush just that one PT entry. */
                Assert(PGMPOOL_TD_GET_CREFS(u16) == 1);
                pgmPoolTrackFlushGCPhysPT(pVM,
                                          pPhysPage,
                                          fFlushPTEs,
                                          PGMPOOL_TD_GET_IDX(u16));
            }
            else if (u16 != PGMPOOL_TD_MAKE(PGMPOOL_TD_CREFS_PHYSEXT, PGMPOOL_TD_IDX_OVERFLOWED))
                /* Multiple references tracked via a physical extent list. */
                pgmPoolTrackFlushGCPhysPTs(pVM, pPhysPage, fFlushPTEs, PGMPOOL_TD_GET_IDX(u16));
            else
                /* Tracking overflowed: scan all shadow page tables the slow way. */
                rc = pgmPoolTrackFlushGCPhysPTsSlow(pVM, pPhysPage);
            *pfFlushTLBs = true;
        }
    }

    if (rc == VINF_PGM_GCPHYS_ALIASED)
    {
        /* Can't flush per-page: request a full pool clear + CR3 resync instead. */
        pVCpu->pgm.s.fSyncFlags |= PGM_SYNC_CLEAR_PGM_POOL;
        VMCPU_FF_SET(pVCpu, VMCPU_FF_PGM_SYNC_CR3);
        rc = VINF_PGM_SYNC_CR3;
    }
    PGM_UNLOCK(pVM);
    return rc;
}
3782
3783
/**
 * Scans all shadow page tables for mappings of a physical page.
 *
 * This may be slow, but it's most likely more efficient than cleaning
 * out the entire page pool / cache.
 *
 * @returns VBox status code.
 * @retval  VINF_SUCCESS if all references has been successfully cleared.
 * @retval  VINF_PGM_GCPHYS_ALIASED if we're better off with a CR3 sync and
 *          a page pool cleaning.
 *
 * @param   pVM         The cross context VM structure.
 * @param   pPhysPage   The guest page in question.
 */
int pgmPoolTrackFlushGCPhysPTsSlow(PVMCC pVM, PPGMPAGE pPhysPage)
{
    PPGMPOOL pPool = pVM->pgm.s.CTX_SUFF(pPool);
    STAM_PROFILE_START(&pPool->StatTrackFlushGCPhysPTsSlow, s);
    LogFlow(("pgmPoolTrackFlushGCPhysPTsSlow: cUsedPages=%d cPresent=%d pPhysPage=%R[pgmpage]\n",
             pPool->cUsedPages, pPool->cPresent, pPhysPage));

    /*
     * There is a limit to what makes sense.
     */
    if (   pPool->cPresent > 1024
        && pVM->cCpus == 1)
    {
        LogFlow(("pgmPoolTrackFlushGCPhysPTsSlow: giving up... (cPresent=%d)\n", pPool->cPresent));
        STAM_PROFILE_STOP(&pPool->StatTrackFlushGCPhysPTsSlow, s);
        return VINF_PGM_GCPHYS_ALIASED;
    }

    /*
     * Iterate all the pages until we've encountered all that in use.
     * This is simple but not quite optimal solution.
     */
    const uint64_t u64 = PGM_PAGE_GET_HCPHYS(pPhysPage); /* the HC physical address we hunt for */
    unsigned cLeft = pPool->cUsedPages;
    unsigned iPage = pPool->cCurPages;
    /* Walk downwards through the pool pages. */
    while (--iPage >= PGMPOOL_IDX_FIRST)
    {
        PPGMPOOLPAGE pPage = &pPool->aPages[iPage];
        if (   pPage->GCPhys != NIL_RTGCPHYS
            && pPage->cPresent)
        {
            Assert(!PGMPOOL_PAGE_IS_NESTED(pPage)); /* see if it hits */
            switch (pPage->enmKind)
            {
                /*
                 * We only care about shadow page tables.
                 */
                case PGMPOOLKIND_32BIT_PT_FOR_32BIT_PT:
                case PGMPOOLKIND_32BIT_PT_FOR_32BIT_4MB:
                case PGMPOOLKIND_32BIT_PT_FOR_PHYS:
                {
                    /* 32-bit PTEs: only the low 32 bits of the address are comparable. */
                    const uint32_t u32 = (uint32_t)u64;
                    unsigned cPresent = pPage->cPresent;
                    PX86PT pPT = (PX86PT)PGMPOOL_PAGE_2_PTR(pVM, pPage);
                    for (unsigned i = pPage->iFirstPresent; i < RT_ELEMENTS(pPT->a); i++)
                    {
                        const X86PGUINT uPte = pPT->a[i].u;
                        if (uPte & X86_PTE_P)
                        {
                            if ((uPte & X86_PTE_PG_MASK) == u32)
                            {
                                //Log4(("pgmPoolTrackFlushGCPhysPTsSlow: idx=%d i=%d pte=%RX32\n", iPage, i, pPT->a[i]));
                                ASMAtomicWriteU32(&pPT->a[i].u, 0);

                                /* Update the counter as we're removing references. */
                                Assert(pPage->cPresent);
                                Assert(pPool->cPresent);
                                pPage->cPresent--;
                                pPool->cPresent--;
                            }
                            if (!--cPresent)
                                break; /* seen all present entries of this PT */
                        }
                    }
                    PGM_DYNMAP_UNUSED_HINT_VM(pVM, pPT);
                    break;
                }

                case PGMPOOLKIND_PAE_PT_FOR_32BIT_PT:
                case PGMPOOLKIND_PAE_PT_FOR_32BIT_4MB:
                case PGMPOOLKIND_PAE_PT_FOR_PAE_PT:
                case PGMPOOLKIND_PAE_PT_FOR_PAE_2MB:
                case PGMPOOLKIND_PAE_PT_FOR_PHYS:
                {
                    unsigned cPresent = pPage->cPresent;
                    PPGMSHWPTPAE pPT = (PPGMSHWPTPAE)PGMPOOL_PAGE_2_PTR(pVM, pPage);
                    for (unsigned i = pPage->iFirstPresent; i < RT_ELEMENTS(pPT->a); i++)
                        if (PGMSHWPTEPAE_IS_P(pPT->a[i]))
                        {
                            if ((PGMSHWPTEPAE_GET_U(pPT->a[i]) & X86_PTE_PAE_PG_MASK) == u64)
                            {
                                //Log4(("pgmPoolTrackFlushGCPhysPTsSlow: idx=%d i=%d pte=%RX64\n", iPage, i, pPT->a[i]));
                                PGMSHWPTEPAE_ATOMIC_SET(pPT->a[i], 0); /// @todo why not atomic?

                                /* Update the counter as we're removing references. */
                                Assert(pPage->cPresent);
                                Assert(pPool->cPresent);
                                pPage->cPresent--;
                                pPool->cPresent--;
                            }
                            if (!--cPresent)
                                break; /* seen all present entries of this PT */
                        }
                    PGM_DYNMAP_UNUSED_HINT_VM(pVM, pPT);
                    break;
                }

                case PGMPOOLKIND_EPT_PT_FOR_PHYS:
                {
                    unsigned cPresent = pPage->cPresent;
                    PEPTPT pPT = (PEPTPT)PGMPOOL_PAGE_2_PTR(pVM, pPage);
                    for (unsigned i = pPage->iFirstPresent; i < RT_ELEMENTS(pPT->a); i++)
                    {
                        X86PGPAEUINT const uPte = pPT->a[i].u;
                        /* EPT entries have no P bit; read access is used as the present check here. */
                        if (uPte & EPT_E_READ)
                        {
                            if ((uPte & EPT_PTE_PG_MASK) == u64)
                            {
                                //Log4(("pgmPoolTrackFlushGCPhysPTsSlow: idx=%d i=%d pte=%RX64\n", iPage, i, pPT->a[i]));
                                ASMAtomicWriteU64(&pPT->a[i].u, 0);

                                /* Update the counter as we're removing references. */
                                Assert(pPage->cPresent);
                                Assert(pPool->cPresent);
                                pPage->cPresent--;
                                pPool->cPresent--;
                            }
                            if (!--cPresent)
                                break; /* seen all present entries of this PT */
                        }
                    }
                    PGM_DYNMAP_UNUSED_HINT_VM(pVM, pPT);
                    break;
                }
            }

            if (!--cLeft)
                break; /* visited every used page */
        }
    }

    PGM_PAGE_SET_TRACKING(pVM, pPhysPage, 0);
    STAM_PROFILE_STOP(&pPool->StatTrackFlushGCPhysPTsSlow, s);

    /*
     * There is a limit to what makes sense. The above search is very expensive, so force a pgm pool flush.
     */
    if (pPool->cPresent > 1024)
    {
        LogFlow(("pgmPoolTrackFlushGCPhysPTsSlow: giving up... (cPresent=%d)\n", pPool->cPresent));
        return VINF_PGM_GCPHYS_ALIASED;
    }

    return VINF_SUCCESS;
}
3943
3944
/**
 * Clears the user entry in a user table.
 *
 * This is used to remove all references to a page when flushing it.
 *
 * @param   pPool   The pool.
 * @param   pPage   The shadow page being flushed (used for logging and sanity checks).
 * @param   pUser   The user record identifying the parent (user) table and the
 *                  index of the entry to clear within it.
 */
static void pgmPoolTrackClearPageUser(PPGMPOOL pPool, PPGMPOOLPAGE pPage, PCPGMPOOLUSER pUser)
{
    Assert(pUser->iUser != NIL_PGMPOOL_IDX);
    Assert(pUser->iUser < pPool->cCurPages);
    uint32_t iUserTable = pUser->iUserTable;

    /*
     * Map the user page. Ignore references made by fictitious pages.
     */
    PPGMPOOLPAGE pUserPage = &pPool->aPages[pUser->iUser];
    LogFlow(("pgmPoolTrackClearPageUser: clear %x in %s (%RGp) (flushing %s)\n", iUserTable, pgmPoolPoolKindToStr(pUserPage->enmKind), pUserPage->Core.Key, pgmPoolPoolKindToStr(pPage->enmKind)));
    union
    {
        uint64_t *pau64;
        uint32_t *pau32;
    } u;
    if (pUserPage->idx < PGMPOOL_IDX_FIRST)
    {
        /* Fictitious page: no backing memory to edit, so nothing to clear. */
        Assert(!pUserPage->pvPageR3);
        return;
    }
    u.pau64 = (uint64_t *)PGMPOOL_PAGE_2_PTR(pPool->CTX_SUFF(pVM), pUserPage);


    /* Safety precaution in case we change the paging for other modes too in the future. */
    Assert(!pgmPoolIsPageLocked(pPage)); RT_NOREF_PV(pPage);

#ifdef VBOX_STRICT
    /*
     * Some sanity checks.
     */
    switch (pUserPage->enmKind)
    {
        case PGMPOOLKIND_32BIT_PD:
        case PGMPOOLKIND_32BIT_PD_PHYS:
            Assert(iUserTable < X86_PG_ENTRIES);
            break;
        case PGMPOOLKIND_PAE_PDPT:
        case PGMPOOLKIND_PAE_PDPT_FOR_32BIT:
        case PGMPOOLKIND_PAE_PDPT_PHYS:
            Assert(iUserTable < 4);
            Assert(!(u.pau64[iUserTable] & PGM_PLXFLAGS_PERMANENT));
            break;
        case PGMPOOLKIND_PAE_PD0_FOR_32BIT_PD:
        case PGMPOOLKIND_PAE_PD1_FOR_32BIT_PD:
        case PGMPOOLKIND_PAE_PD2_FOR_32BIT_PD:
        case PGMPOOLKIND_PAE_PD3_FOR_32BIT_PD:
        case PGMPOOLKIND_PAE_PD_FOR_PAE_PD:
        case PGMPOOLKIND_PAE_PD_PHYS:
            Assert(iUserTable < X86_PG_PAE_ENTRIES);
            break;
        case PGMPOOLKIND_64BIT_PD_FOR_64BIT_PD:
            Assert(iUserTable < X86_PG_PAE_ENTRIES);
            break;
        case PGMPOOLKIND_64BIT_PDPT_FOR_64BIT_PDPT:
            Assert(iUserTable < X86_PG_PAE_ENTRIES);
            Assert(!(u.pau64[iUserTable] & PGM_PLXFLAGS_PERMANENT));
            break;
        case PGMPOOLKIND_64BIT_PML4:
            Assert(!(u.pau64[iUserTable] & PGM_PLXFLAGS_PERMANENT));
            /* GCPhys >> PAGE_SHIFT is the index here */
            break;
        case PGMPOOLKIND_64BIT_PDPT_FOR_PHYS:
        case PGMPOOLKIND_64BIT_PD_FOR_PHYS:
            Assert(iUserTable < X86_PG_PAE_ENTRIES);
            break;

        case PGMPOOLKIND_EPT_PDPT_FOR_PHYS:
        case PGMPOOLKIND_EPT_PD_FOR_PHYS:
            Assert(iUserTable < X86_PG_PAE_ENTRIES);
            break;

        case PGMPOOLKIND_ROOT_NESTED:
            Assert(iUserTable < X86_PG_PAE_ENTRIES);
            break;

# ifdef VBOX_WITH_NESTED_HWVIRT_VMX_EPT
        case PGMPOOLKIND_EPT_PT_FOR_EPT_PT:
        case PGMPOOLKIND_EPT_PT_FOR_EPT_2MB:
        case PGMPOOLKIND_EPT_PD_FOR_EPT_PD:
        case PGMPOOLKIND_EPT_PDPT_FOR_EPT_PDPT:
        case PGMPOOLKIND_EPT_PML4_FOR_EPT_PML4:
            Assert(iUserTable < EPT_PG_ENTRIES);
            break;
# endif

        default:
            AssertMsgFailed(("enmKind=%d GCPhys=%RGp\n", pUserPage->enmKind, pPage->GCPhys));
            break;
    }
#endif /* VBOX_STRICT */

    /*
     * Clear the entry in the user page.
     */
    switch (pUserPage->enmKind)
    {
        /* 32-bit entries */
        case PGMPOOLKIND_32BIT_PD:
        case PGMPOOLKIND_32BIT_PD_PHYS:
            ASMAtomicWriteU32(&u.pau32[iUserTable], 0);
            break;

        /* 64-bit entries */
        case PGMPOOLKIND_PAE_PD0_FOR_32BIT_PD:
        case PGMPOOLKIND_PAE_PD1_FOR_32BIT_PD:
        case PGMPOOLKIND_PAE_PD2_FOR_32BIT_PD:
        case PGMPOOLKIND_PAE_PD3_FOR_32BIT_PD:
        case PGMPOOLKIND_PAE_PD_FOR_PAE_PD:
        case PGMPOOLKIND_PAE_PD_PHYS:
        case PGMPOOLKIND_PAE_PDPT_PHYS:
        case PGMPOOLKIND_64BIT_PD_FOR_64BIT_PD:
        case PGMPOOLKIND_64BIT_PDPT_FOR_64BIT_PDPT:
        case PGMPOOLKIND_64BIT_PML4:
        case PGMPOOLKIND_64BIT_PDPT_FOR_PHYS:
        case PGMPOOLKIND_64BIT_PD_FOR_PHYS:
        case PGMPOOLKIND_PAE_PDPT:
        case PGMPOOLKIND_PAE_PDPT_FOR_32BIT:
        case PGMPOOLKIND_ROOT_NESTED:
        case PGMPOOLKIND_EPT_PDPT_FOR_PHYS:
        case PGMPOOLKIND_EPT_PD_FOR_PHYS:
# ifdef VBOX_WITH_NESTED_HWVIRT_VMX_EPT
        case PGMPOOLKIND_EPT_PT_FOR_EPT_PT:
        case PGMPOOLKIND_EPT_PT_FOR_EPT_2MB:
        case PGMPOOLKIND_EPT_PD_FOR_EPT_PD:
        case PGMPOOLKIND_EPT_PDPT_FOR_EPT_PDPT:
        case PGMPOOLKIND_EPT_PML4_FOR_EPT_PML4:
#endif
            ASMAtomicWriteU64(&u.pau64[iUserTable], 0);
            break;

        default:
            AssertFatalMsgFailed(("enmKind=%d iUser=%d iUserTable=%#x\n", pUserPage->enmKind, pUser->iUser, pUser->iUserTable));
    }
    PGM_DYNMAP_UNUSED_HINT_VM(pPool->CTX_SUFF(pVM), u.pau64);
}
4086
4087
4088/**
4089 * Clears all users of a page.
4090 */
4091static void pgmPoolTrackClearPageUsers(PPGMPOOL pPool, PPGMPOOLPAGE pPage)
4092{
4093 /*
4094 * Free all the user records.
4095 */
4096 LogFlow(("pgmPoolTrackClearPageUsers %RGp\n", pPage->GCPhys));
4097
4098 PPGMPOOLUSER paUsers = pPool->CTX_SUFF(paUsers);
4099 uint16_t i = pPage->iUserHead;
4100 while (i != NIL_PGMPOOL_USER_INDEX)
4101 {
4102 /* Clear enter in user table. */
4103 pgmPoolTrackClearPageUser(pPool, pPage, &paUsers[i]);
4104
4105 /* Free it. */
4106 const uint16_t iNext = paUsers[i].iNext;
4107 paUsers[i].iUser = NIL_PGMPOOL_IDX;
4108 paUsers[i].iNext = pPool->iUserFreeHead;
4109 pPool->iUserFreeHead = i;
4110
4111 /* Next. */
4112 i = iNext;
4113 }
4114 pPage->iUserHead = NIL_PGMPOOL_USER_INDEX;
4115}
4116
4117
4118/**
4119 * Allocates a new physical cross reference extent.
4120 *
4121 * @returns Pointer to the allocated extent on success. NULL if we're out of them.
4122 * @param pVM The cross context VM structure.
4123 * @param piPhysExt Where to store the phys ext index.
4124 */
4125PPGMPOOLPHYSEXT pgmPoolTrackPhysExtAlloc(PVMCC pVM, uint16_t *piPhysExt)
4126{
4127 PGM_LOCK_ASSERT_OWNER(pVM);
4128 PPGMPOOL pPool = pVM->pgm.s.CTX_SUFF(pPool);
4129 uint16_t iPhysExt = pPool->iPhysExtFreeHead;
4130 if (iPhysExt == NIL_PGMPOOL_PHYSEXT_INDEX)
4131 {
4132 STAM_COUNTER_INC(&pPool->StamTrackPhysExtAllocFailures);
4133 return NULL;
4134 }
4135 PPGMPOOLPHYSEXT pPhysExt = &pPool->CTX_SUFF(paPhysExts)[iPhysExt];
4136 pPool->iPhysExtFreeHead = pPhysExt->iNext;
4137 pPhysExt->iNext = NIL_PGMPOOL_PHYSEXT_INDEX;
4138 *piPhysExt = iPhysExt;
4139 return pPhysExt;
4140}
4141
4142
4143/**
4144 * Frees a physical cross reference extent.
4145 *
4146 * @param pVM The cross context VM structure.
4147 * @param iPhysExt The extent to free.
4148 */
4149void pgmPoolTrackPhysExtFree(PVMCC pVM, uint16_t iPhysExt)
4150{
4151 PGM_LOCK_ASSERT_OWNER(pVM);
4152 PPGMPOOL pPool = pVM->pgm.s.CTX_SUFF(pPool);
4153 Assert(iPhysExt < pPool->cMaxPhysExts);
4154 PPGMPOOLPHYSEXT pPhysExt = &pPool->CTX_SUFF(paPhysExts)[iPhysExt];
4155 for (unsigned i = 0; i < RT_ELEMENTS(pPhysExt->aidx); i++)
4156 {
4157 pPhysExt->aidx[i] = NIL_PGMPOOL_IDX;
4158 pPhysExt->apte[i] = NIL_PGMPOOL_PHYSEXT_IDX_PTE;
4159 }
4160 pPhysExt->iNext = pPool->iPhysExtFreeHead;
4161 pPool->iPhysExtFreeHead = iPhysExt;
4162}
4163
4164
4165/**
4166 * Frees a physical cross reference extent.
4167 *
4168 * @param pVM The cross context VM structure.
4169 * @param iPhysExt The extent to free.
4170 */
4171void pgmPoolTrackPhysExtFreeList(PVMCC pVM, uint16_t iPhysExt)
4172{
4173 PGM_LOCK_ASSERT_OWNER(pVM);
4174 PPGMPOOL pPool = pVM->pgm.s.CTX_SUFF(pPool);
4175
4176 const uint16_t iPhysExtStart = iPhysExt;
4177 PPGMPOOLPHYSEXT pPhysExt;
4178 do
4179 {
4180 Assert(iPhysExt < pPool->cMaxPhysExts);
4181 pPhysExt = &pPool->CTX_SUFF(paPhysExts)[iPhysExt];
4182 for (unsigned i = 0; i < RT_ELEMENTS(pPhysExt->aidx); i++)
4183 {
4184 pPhysExt->aidx[i] = NIL_PGMPOOL_IDX;
4185 pPhysExt->apte[i] = NIL_PGMPOOL_PHYSEXT_IDX_PTE;
4186 }
4187
4188 /* next */
4189 iPhysExt = pPhysExt->iNext;
4190 } while (iPhysExt != NIL_PGMPOOL_PHYSEXT_INDEX);
4191
4192 pPhysExt->iNext = pPool->iPhysExtFreeHead;
4193 pPool->iPhysExtFreeHead = iPhysExtStart;
4194}
4195
4196
/**
 * Insert a reference into a list of physical cross reference extents.
 *
 * @returns The new tracking data for PGMPAGE.
 *
 * @param   pVM         The cross context VM structure.
 * @param   iPhysExt    The physical extent index of the list head.
 * @param   iShwPT      The shadow page table index.
 * @param   iPte        Page table entry
 *
 */
static uint16_t pgmPoolTrackPhysExtInsert(PVMCC pVM, uint16_t iPhysExt, uint16_t iShwPT, uint16_t iPte)
{
    PGM_LOCK_ASSERT_OWNER(pVM);
    PPGMPOOL pPool = pVM->pgm.s.CTX_SUFF(pPool);
    PPGMPOOLPHYSEXT paPhysExts = pPool->CTX_SUFF(paPhysExts);

    /*
     * Special common cases.
     */
    /* Fast path: slot 1 or 2 of the head extent is still free (slot 0 is
       presumably taken by the first reference - see pgmPoolTrackPhysExtAddref). */
    if (paPhysExts[iPhysExt].aidx[1] == NIL_PGMPOOL_IDX)
    {
        paPhysExts[iPhysExt].aidx[1] = iShwPT;
        paPhysExts[iPhysExt].apte[1] = iPte;
        STAM_COUNTER_INC(&pVM->pgm.s.Stats.StatTrackAliasedMany);
        LogFlow(("pgmPoolTrackPhysExtInsert: %d:{,%d pte %d,}\n", iPhysExt, iShwPT, iPte));
        return PGMPOOL_TD_MAKE(PGMPOOL_TD_CREFS_PHYSEXT, iPhysExt);
    }
    if (paPhysExts[iPhysExt].aidx[2] == NIL_PGMPOOL_IDX)
    {
        paPhysExts[iPhysExt].aidx[2] = iShwPT;
        paPhysExts[iPhysExt].apte[2] = iPte;
        STAM_COUNTER_INC(&pVM->pgm.s.Stats.StatTrackAliasedMany);
        LogFlow(("pgmPoolTrackPhysExtInsert: %d:{,,%d pte %d}\n", iPhysExt, iShwPT, iPte));
        return PGMPOOL_TD_MAKE(PGMPOOL_TD_CREFS_PHYSEXT, iPhysExt);
    }
    AssertCompile(RT_ELEMENTS(paPhysExts[iPhysExt].aidx) == 3);

    /*
     * General treatment.
     */
    const uint16_t iPhysExtStart = iPhysExt;
    unsigned cMax = 15; /* examine at most 15 extents before giving up (overflow) */
    for (;;)
    {
        Assert(iPhysExt < pPool->cMaxPhysExts);
        for (unsigned i = 0; i < RT_ELEMENTS(paPhysExts[iPhysExt].aidx); i++)
            if (paPhysExts[iPhysExt].aidx[i] == NIL_PGMPOOL_IDX)
            {
                /* Found a free slot in this extent. */
                paPhysExts[iPhysExt].aidx[i] = iShwPT;
                paPhysExts[iPhysExt].apte[i] = iPte;
                STAM_COUNTER_INC(&pVM->pgm.s.Stats.StatTrackAliasedMany);
                LogFlow(("pgmPoolTrackPhysExtInsert: %d:{%d pte %d} i=%d cMax=%d\n", iPhysExt, iShwPT, iPte, i, cMax));
                return PGMPOOL_TD_MAKE(PGMPOOL_TD_CREFS_PHYSEXT, iPhysExtStart);
            }
        if (!--cMax)
        {
            /* Chain too long: free the whole list and report overflow so the
               slow flush path gets used for this page from now on. */
            STAM_COUNTER_INC(&pVM->pgm.s.Stats.StatTrackOverflows);
            pgmPoolTrackPhysExtFreeList(pVM, iPhysExtStart);
            LogFlow(("pgmPoolTrackPhysExtInsert: overflow (1) iShwPT=%d\n", iShwPT));
            return PGMPOOL_TD_MAKE(PGMPOOL_TD_CREFS_PHYSEXT, PGMPOOL_TD_IDX_OVERFLOWED);
        }

        /* advance */
        iPhysExt = paPhysExts[iPhysExt].iNext;
        if (iPhysExt == NIL_PGMPOOL_PHYSEXT_INDEX)
            break;
    }

    /*
     * Add another extent to the list.
     */
    PPGMPOOLPHYSEXT pNew = pgmPoolTrackPhysExtAlloc(pVM, &iPhysExt);
    if (!pNew)
    {
        /* Out of extents: free the list and mark the page as overflowed. */
        STAM_COUNTER_INC(&pVM->pgm.s.Stats.StatTrackNoExtentsLeft);
        pgmPoolTrackPhysExtFreeList(pVM, iPhysExtStart);
        LogFlow(("pgmPoolTrackPhysExtInsert: pgmPoolTrackPhysExtAlloc failed iShwPT=%d\n", iShwPT));
        return PGMPOOL_TD_MAKE(PGMPOOL_TD_CREFS_PHYSEXT, PGMPOOL_TD_IDX_OVERFLOWED);
    }
    /* The new extent becomes the head of the list. */
    pNew->iNext = iPhysExtStart;
    pNew->aidx[0] = iShwPT;
    pNew->apte[0] = iPte;
    LogFlow(("pgmPoolTrackPhysExtInsert: added new extent %d:{%d pte %d}->%d\n", iPhysExt, iShwPT, iPte, iPhysExtStart));
    return PGMPOOL_TD_MAKE(PGMPOOL_TD_CREFS_PHYSEXT, iPhysExt);
}
4283
4284
/**
 * Add a reference to guest physical page where extents are in use.
 *
 * @returns The new tracking data for PGMPAGE.
 *
 * @param   pVM         The cross context VM structure.
 * @param   pPhysPage   Pointer to the aPages entry in the ram range.
 * @param   u16         The ram range flags (top 16-bits).
 * @param   iShwPT      The shadow page table index.
 * @param   iPte        Page table entry
 */
uint16_t pgmPoolTrackPhysExtAddref(PVMCC pVM, PPGMPAGE pPhysPage, uint16_t u16, uint16_t iShwPT, uint16_t iPte)
{
    PGM_LOCK_VOID(pVM);
    if (PGMPOOL_TD_GET_CREFS(u16) != PGMPOOL_TD_CREFS_PHYSEXT)
    {
        /*
         * Convert to extent list.
         */
        Assert(PGMPOOL_TD_GET_CREFS(u16) == 1);
        uint16_t iPhysExt;
        PPGMPOOLPHYSEXT pPhysExt = pgmPoolTrackPhysExtAlloc(pVM, &iPhysExt);
        if (pPhysExt)
        {
            /* Slot 0 takes over the existing single reference; slot 1 gets the new one. */
            LogFlow(("pgmPoolTrackPhysExtAddref: new extent: %d:{%d, %d}\n", iPhysExt, PGMPOOL_TD_GET_IDX(u16), iShwPT));
            STAM_COUNTER_INC(&pVM->pgm.s.Stats.StatTrackAliased);
            pPhysExt->aidx[0] = PGMPOOL_TD_GET_IDX(u16);
            pPhysExt->apte[0] = PGM_PAGE_GET_PTE_INDEX(pPhysPage);
            pPhysExt->aidx[1] = iShwPT;
            pPhysExt->apte[1] = iPte;
            u16 = PGMPOOL_TD_MAKE(PGMPOOL_TD_CREFS_PHYSEXT, iPhysExt);
        }
        else
            u16 = PGMPOOL_TD_MAKE(PGMPOOL_TD_CREFS_PHYSEXT, PGMPOOL_TD_IDX_OVERFLOWED);
    }
    else if (u16 != PGMPOOL_TD_MAKE(PGMPOOL_TD_CREFS_PHYSEXT, PGMPOOL_TD_IDX_OVERFLOWED))
    {
        /*
         * Insert into the extent list.
         */
        u16 = pgmPoolTrackPhysExtInsert(pVM, PGMPOOL_TD_GET_IDX(u16), iShwPT, iPte);
    }
    else
        /* Already overflowed: nothing to track, just count the event. */
        STAM_COUNTER_INC(&pVM->pgm.s.Stats.StatTrackAliasedLots);
    PGM_UNLOCK(pVM);
    return u16;
}
4332
4333
/**
 * Clear references to guest physical memory.
 *
 * @param   pPool       The pool.
 * @param   pPage       The page.
 * @param   pPhysPage   Pointer to the aPages entry in the ram range.
 * @param   iPte        Shadow PTE index
 */
void pgmPoolTrackPhysExtDerefGCPhys(PPGMPOOL pPool, PPGMPOOLPAGE pPage, PPGMPAGE pPhysPage, uint16_t iPte)
{
    PVMCC pVM = pPool->CTX_SUFF(pVM);
    const unsigned cRefs = PGM_PAGE_GET_TD_CREFS(pPhysPage);
    AssertFatalMsg(cRefs == PGMPOOL_TD_CREFS_PHYSEXT, ("cRefs=%d pPhysPage=%R[pgmpage] pPage=%p:{.idx=%d}\n", cRefs, pPhysPage, pPage, pPage->idx));

    uint16_t iPhysExt = PGM_PAGE_GET_TD_IDX(pPhysPage);
    if (iPhysExt != PGMPOOL_TD_IDX_OVERFLOWED)
    {
        PGM_LOCK_VOID(pVM);

        uint16_t        iPhysExtPrev = NIL_PGMPOOL_PHYSEXT_INDEX;
        PPGMPOOLPHYSEXT paPhysExts = pPool->CTX_SUFF(paPhysExts);
        do
        {
            Assert(iPhysExt < pPool->cMaxPhysExts);

            /*
             * Look for the shadow page and check if it's all freed.
             */
            for (unsigned i = 0; i < RT_ELEMENTS(paPhysExts[iPhysExt].aidx); i++)
            {
                if (   paPhysExts[iPhysExt].aidx[i] == pPage->idx
                    && paPhysExts[iPhysExt].apte[i] == iPte)
                {
                    /* Found it - clear the slot. */
                    paPhysExts[iPhysExt].aidx[i] = NIL_PGMPOOL_IDX;
                    paPhysExts[iPhysExt].apte[i] = NIL_PGMPOOL_PHYSEXT_IDX_PTE;

                    /* If any other slot of this extent is still in use, keep the node. */
                    for (i = 0; i < RT_ELEMENTS(paPhysExts[iPhysExt].aidx); i++)
                        if (paPhysExts[iPhysExt].aidx[i] != NIL_PGMPOOL_IDX)
                        {
                            Log2(("pgmPoolTrackPhysExtDerefGCPhys: pPhysPage=%R[pgmpage] idx=%d\n", pPhysPage, pPage->idx));
                            PGM_UNLOCK(pVM);
                            return;
                        }

                    /* we can free the node. */
                    const uint16_t iPhysExtNext = paPhysExts[iPhysExt].iNext;
                    if (   iPhysExtPrev == NIL_PGMPOOL_PHYSEXT_INDEX
                        && iPhysExtNext == NIL_PGMPOOL_PHYSEXT_INDEX)
                    {
                        /* lonely node */
                        pgmPoolTrackPhysExtFree(pVM, iPhysExt);
                        Log2(("pgmPoolTrackPhysExtDerefGCPhys: pPhysPage=%R[pgmpage] idx=%d lonely\n", pPhysPage, pPage->idx));
                        PGM_PAGE_SET_TRACKING(pVM, pPhysPage, 0);
                    }
                    else if (iPhysExtPrev == NIL_PGMPOOL_PHYSEXT_INDEX)
                    {
                        /* head */
                        Log2(("pgmPoolTrackPhysExtDerefGCPhys: pPhysPage=%R[pgmpage] idx=%d head\n", pPhysPage, pPage->idx));
                        PGM_PAGE_SET_TRACKING(pVM, pPhysPage, PGMPOOL_TD_MAKE(PGMPOOL_TD_CREFS_PHYSEXT, iPhysExtNext));
                        pgmPoolTrackPhysExtFree(pVM, iPhysExt);
                    }
                    else
                    {
                        /* in list */
                        Log2(("pgmPoolTrackPhysExtDerefGCPhys: pPhysPage=%R[pgmpage] idx=%d in list\n", pPhysPage, pPage->idx));
                        paPhysExts[iPhysExtPrev].iNext = iPhysExtNext;
                        pgmPoolTrackPhysExtFree(pVM, iPhysExt);
                    }
                    iPhysExt = iPhysExtNext;
                    PGM_UNLOCK(pVM);
                    return;
                }
            }

            /* next */
            iPhysExtPrev = iPhysExt;
            iPhysExt = paPhysExts[iPhysExt].iNext;
        } while (iPhysExt != NIL_PGMPOOL_PHYSEXT_INDEX);

        /* The reference was not found in the list - tracking state is broken; fatal. */
        PGM_UNLOCK(pVM);
        AssertFatalMsgFailed(("not-found! cRefs=%d pPhysPage=%R[pgmpage] pPage=%p:{.idx=%d}\n", cRefs, pPhysPage, pPage, pPage->idx));
    }
    else /* nothing to do */
        Log2(("pgmPoolTrackPhysExtDerefGCPhys: pPhysPage=%R[pgmpage]\n", pPhysPage));
}
/**
 * Clear references to guest physical memory.
 *
 * This is the same as pgmPoolTracDerefGCPhysHint except that the guest
 * physical address is assumed to be correct, so the linear search can be
 * skipped and we can assert at an earlier point.
 *
 * @param   pPool       The pool.
 * @param   pPage       The page.
 * @param   HCPhys      The host physical address corresponding to the guest page.
 * @param   GCPhys      The guest physical address corresponding to HCPhys.
 * @param   iPte        Shadow PTE index
 */
static void pgmPoolTracDerefGCPhys(PPGMPOOL pPool, PPGMPOOLPAGE pPage, RTHCPHYS HCPhys, RTGCPHYS GCPhys, uint16_t iPte)
{
    /*
     * Lookup the page and check if it checks out before derefing it.
     */
    PVMCC pVM = pPool->CTX_SUFF(pVM);
    PPGMPAGE pPhysPage = pgmPhysGetPage(pVM, GCPhys);
    if (pPhysPage)
    {
        Assert(PGM_PAGE_GET_HCPHYS(pPhysPage));
#ifdef LOG_ENABLED
        RTHCPHYS HCPhysPage = PGM_PAGE_GET_HCPHYS(pPhysPage);
        Log2(("pgmPoolTracDerefGCPhys %RHp vs %RHp\n", HCPhysPage, HCPhys));
#endif
        if (PGM_PAGE_GET_HCPHYS(pPhysPage) == HCPhys)
        {
            /* Drop the present counters and the tracking reference. */
            Assert(pPage->cPresent);
            Assert(pPool->cPresent);
            pPage->cPresent--;
            pPool->cPresent--;
            pgmTrackDerefGCPhys(pPool, pPage, pPhysPage, iPte);
            return;
        }

        /* HCPhys mismatch for the given GCPhys: tracking state is inconsistent; fatal. */
        AssertFatalMsgFailed(("HCPhys=%RHp GCPhys=%RGp; found page has HCPhys=%RHp iPte=%u fIsNested=%RTbool\n",
                              HCPhys, GCPhys, PGM_PAGE_GET_HCPHYS(pPhysPage), iPte, PGMPOOL_PAGE_IS_NESTED(pPage)));
    }
    AssertFatalMsgFailed(("HCPhys=%RHp GCPhys=%RGp\n", HCPhys, GCPhys));
}
4462
4463
/**
 * Clear references to guest physical memory.
 *
 * @param   pPool       The pool.
 * @param   pPage       The page.
 * @param   HCPhys      The host physical address corresponding to the guest page.
 * @param   GCPhysHint  The guest physical address which may corresponding to HCPhys.
 * @param   iPte        Shadow pte index
 */
void pgmPoolTracDerefGCPhysHint(PPGMPOOL pPool, PPGMPOOLPAGE pPage, RTHCPHYS HCPhys, RTGCPHYS GCPhysHint, uint16_t iPte)
{
    Log4(("pgmPoolTracDerefGCPhysHint %RHp %RGp\n", HCPhys, GCPhysHint));

    /*
     * Try the hint first.
     */
    RTHCPHYS HCPhysHinted;
    PVMCC pVM = pPool->CTX_SUFF(pVM);
    PPGMPAGE pPhysPage = pgmPhysGetPage(pVM, GCPhysHint);
    if (pPhysPage)
    {
        HCPhysHinted = PGM_PAGE_GET_HCPHYS(pPhysPage);
        Assert(HCPhysHinted);
        if (HCPhysHinted == HCPhys)
        {
            /* The hint checked out: deref and update the present counters. */
            Assert(pPage->cPresent);
            Assert(pPool->cPresent);
            pPage->cPresent--;
            pPool->cPresent--;
            pgmTrackDerefGCPhys(pPool, pPage, pPhysPage, iPte);
            return;
        }
    }
    else
        HCPhysHinted = UINT64_C(0xdeadbeefdeadbeef); /* poison for the assertion message below */

    /*
     * Damn, the hint didn't work. We'll have to do an expensive linear search.
     */
    STAM_COUNTER_INC(&pPool->StatTrackLinearRamSearches);
    PPGMRAMRANGE pRam = pPool->CTX_SUFF(pVM)->pgm.s.CTX_SUFF(pRamRangesX);
    while (pRam)
    {
        unsigned iPage = pRam->cb >> PAGE_SHIFT;
        while (iPage-- > 0)
        {
            if (PGM_PAGE_GET_HCPHYS(&pRam->aPages[iPage]) == HCPhys)
            {
                Log4(("pgmPoolTracDerefGCPhysHint: Linear HCPhys=%RHp GCPhysHint=%RGp GCPhysReal=%RGp\n",
                      HCPhys, GCPhysHint, pRam->GCPhys + (iPage << PAGE_SHIFT)));
                Assert(pPage->cPresent);
                Assert(pPool->cPresent);
                pPage->cPresent--;
                pPool->cPresent--;
                pgmTrackDerefGCPhys(pPool, pPage, &pRam->aPages[iPage], iPte);
                return;
            }
        }
        pRam = pRam->CTX_SUFF(pNext);
    }

    /* Not found anywhere: the tracking state is inconsistent; fatal. */
    AssertFatalMsgFailed(("HCPhys=%RHp GCPhysHint=%RGp (Hinted page has HCPhys = %RHp)\n", HCPhys, GCPhysHint, HCPhysHinted));
}
4527
4528
4529/**
4530 * Clear references to guest physical memory in a 32-bit / 32-bit page table.
4531 *
4532 * @param pPool The pool.
4533 * @param pPage The page.
4534 * @param pShwPT The shadow page table (mapping of the page).
4535 * @param pGstPT The guest page table.
4536 */
4537DECLINLINE(void) pgmPoolTrackDerefPT32Bit32Bit(PPGMPOOL pPool, PPGMPOOLPAGE pPage, PX86PT pShwPT, PCX86PT pGstPT)
4538{
4539 RTGCPHYS32 const fPgMask = pPage->fA20Enabled ? X86_PTE_PG_MASK : X86_PTE_PG_MASK & ~RT_BIT_32(20);
4540 for (unsigned i = pPage->iFirstPresent; i < RT_ELEMENTS(pShwPT->a); i++)
4541 {
4542 const X86PGUINT uPte = pShwPT->a[i].u;
4543 Assert(!(uPte & RT_BIT_32(10)));
4544 if (uPte & X86_PTE_P)
4545 {
4546 Log4(("pgmPoolTrackDerefPT32Bit32Bit: i=%d pte=%RX32 hint=%RX32\n",
4547 i, uPte & X86_PTE_PG_MASK, pGstPT->a[i].u & X86_PTE_PG_MASK));
4548 pgmPoolTracDerefGCPhysHint(pPool, pPage, uPte & X86_PTE_PG_MASK, pGstPT->a[i].u & fPgMask, i);
4549 if (!pPage->cPresent)
4550 break;
4551 }
4552 }
4553}
4554
4555
4556/**
4557 * Clear references to guest physical memory in a PAE / 32-bit page table.
4558 *
4559 * @param pPool The pool.
4560 * @param pPage The page.
4561 * @param pShwPT The shadow page table (mapping of the page).
4562 * @param pGstPT The guest page table (just a half one).
4563 */
4564DECLINLINE(void) pgmPoolTrackDerefPTPae32Bit(PPGMPOOL pPool, PPGMPOOLPAGE pPage, PPGMSHWPTPAE pShwPT, PCX86PT pGstPT)
4565{
4566 RTGCPHYS32 const fPgMask = pPage->fA20Enabled ? X86_PTE_PG_MASK : X86_PTE_PG_MASK & ~RT_BIT_32(20);
4567 for (unsigned i = pPage->iFirstPresent; i < RT_ELEMENTS(pShwPT->a); i++)
4568 {
4569 Assert( (PGMSHWPTEPAE_GET_U(pShwPT->a[i]) & UINT64_C(0x7ff0000000000400)) == 0
4570 || (PGMSHWPTEPAE_GET_U(pShwPT->a[i]) & UINT64_C(0x7ff0000000000400)) == UINT64_C(0x7ff0000000000000));
4571 if (PGMSHWPTEPAE_IS_P(pShwPT->a[i]))
4572 {
4573 Log4(("pgmPoolTrackDerefPTPae32Bit: i=%d pte=%RX64 hint=%RX32\n",
4574 i, PGMSHWPTEPAE_GET_HCPHYS(pShwPT->a[i]), pGstPT->a[i].u & X86_PTE_PG_MASK));
4575 pgmPoolTracDerefGCPhysHint(pPool, pPage, PGMSHWPTEPAE_GET_HCPHYS(pShwPT->a[i]), pGstPT->a[i].u & fPgMask, i);
4576 if (!pPage->cPresent)
4577 break;
4578 }
4579 }
4580}
4581
4582
4583/**
4584 * Clear references to guest physical memory in a PAE / PAE page table.
4585 *
4586 * @param pPool The pool.
4587 * @param pPage The page.
4588 * @param pShwPT The shadow page table (mapping of the page).
4589 * @param pGstPT The guest page table.
4590 */
4591DECLINLINE(void) pgmPoolTrackDerefPTPaePae(PPGMPOOL pPool, PPGMPOOLPAGE pPage, PPGMSHWPTPAE pShwPT, PCX86PTPAE pGstPT)
4592{
4593 RTGCPHYS const fPgMask = pPage->fA20Enabled ? X86_PTE_PAE_PG_MASK : X86_PTE_PAE_PG_MASK & ~RT_BIT_64(20);
4594 for (unsigned i = pPage->iFirstPresent; i < RT_ELEMENTS(pShwPT->a); i++)
4595 {
4596 Assert( (PGMSHWPTEPAE_GET_U(pShwPT->a[i]) & UINT64_C(0x7ff0000000000400)) == 0
4597 || (PGMSHWPTEPAE_GET_U(pShwPT->a[i]) & UINT64_C(0x7ff0000000000400)) == UINT64_C(0x7ff0000000000000));
4598 if (PGMSHWPTEPAE_IS_P(pShwPT->a[i]))
4599 {
4600 Log4(("pgmPoolTrackDerefPTPaePae: i=%d pte=%RX32 hint=%RX32\n",
4601 i, PGMSHWPTEPAE_GET_HCPHYS(pShwPT->a[i]), pGstPT->a[i].u & X86_PTE_PAE_PG_MASK));
4602 pgmPoolTracDerefGCPhysHint(pPool, pPage, PGMSHWPTEPAE_GET_HCPHYS(pShwPT->a[i]), pGstPT->a[i].u & fPgMask, i);
4603 if (!pPage->cPresent)
4604 break;
4605 }
4606 }
4607}
4608
4609
4610/**
4611 * Clear references to guest physical memory in a 32-bit / 4MB page table.
4612 *
4613 * @param pPool The pool.
4614 * @param pPage The page.
4615 * @param pShwPT The shadow page table (mapping of the page).
4616 */
4617DECLINLINE(void) pgmPoolTrackDerefPT32Bit4MB(PPGMPOOL pPool, PPGMPOOLPAGE pPage, PX86PT pShwPT)
4618{
4619 RTGCPHYS const GCPhysA20Mask = pPage->fA20Enabled ? UINT64_MAX : ~RT_BIT_64(20);
4620 RTGCPHYS GCPhys = pPage->GCPhys + PAGE_SIZE * pPage->iFirstPresent;
4621 for (unsigned i = pPage->iFirstPresent; i < RT_ELEMENTS(pShwPT->a); i++, GCPhys += PAGE_SIZE)
4622 {
4623 const X86PGUINT uPte = pShwPT->a[i].u;
4624 Assert(!(uPte & RT_BIT_32(10)));
4625 if (uPte & X86_PTE_P)
4626 {
4627 Log4(("pgmPoolTrackDerefPT32Bit4MB: i=%d pte=%RX32 GCPhys=%RGp\n",
4628 i, uPte & X86_PTE_PG_MASK, GCPhys));
4629 pgmPoolTracDerefGCPhys(pPool, pPage, uPte & X86_PTE_PG_MASK, GCPhys & GCPhysA20Mask, i);
4630 if (!pPage->cPresent)
4631 break;
4632 }
4633 }
4634}
4635
4636
4637/**
4638 * Clear references to guest physical memory in a PAE / 2/4MB page table.
4639 *
4640 * @param pPool The pool.
4641 * @param pPage The page.
4642 * @param pShwPT The shadow page table (mapping of the page).
4643 */
4644DECLINLINE(void) pgmPoolTrackDerefPTPaeBig(PPGMPOOL pPool, PPGMPOOLPAGE pPage, PPGMSHWPTPAE pShwPT)
4645{
4646 RTGCPHYS const GCPhysA20Mask = pPage->fA20Enabled ? UINT64_MAX : ~RT_BIT_64(20);
4647 RTGCPHYS GCPhys = pPage->GCPhys + PAGE_SIZE * pPage->iFirstPresent;
4648 for (unsigned i = pPage->iFirstPresent; i < RT_ELEMENTS(pShwPT->a); i++, GCPhys += PAGE_SIZE)
4649 {
4650 Assert( (PGMSHWPTEPAE_GET_U(pShwPT->a[i]) & UINT64_C(0x7ff0000000000400)) == 0
4651 || (PGMSHWPTEPAE_GET_U(pShwPT->a[i]) & UINT64_C(0x7ff0000000000400)) == UINT64_C(0x7ff0000000000000));
4652 if (PGMSHWPTEPAE_IS_P(pShwPT->a[i]))
4653 {
4654 Log4(("pgmPoolTrackDerefPTPaeBig: i=%d pte=%RX64 hint=%RGp\n",
4655 i, PGMSHWPTEPAE_GET_HCPHYS(pShwPT->a[i]), GCPhys));
4656 pgmPoolTracDerefGCPhys(pPool, pPage, PGMSHWPTEPAE_GET_HCPHYS(pShwPT->a[i]), GCPhys & GCPhysA20Mask, i);
4657 if (!pPage->cPresent)
4658 break;
4659 }
4660 }
4661}
4662
4663
4664/**
4665 * Clear references to shadowed pages in an EPT page table.
4666 *
4667 * @param pPool The pool.
4668 * @param pPage The page.
4669 * @param pShwPT The shadow page directory pointer table (mapping of the
4670 * page).
4671 */
4672DECLINLINE(void) pgmPoolTrackDerefPTEPT(PPGMPOOL pPool, PPGMPOOLPAGE pPage, PEPTPT pShwPT)
4673{
4674 RTGCPHYS const GCPhysA20Mask = pPage->fA20Enabled ? UINT64_MAX : ~RT_BIT_64(20);
4675 RTGCPHYS GCPhys = pPage->GCPhys + PAGE_SIZE * pPage->iFirstPresent;
4676 for (unsigned i = pPage->iFirstPresent; i < RT_ELEMENTS(pShwPT->a); i++, GCPhys += PAGE_SIZE)
4677 {
4678 X86PGPAEUINT const uPte = pShwPT->a[i].u;
4679 Assert((uPte & UINT64_C(0xfff0000000000f80)) == 0);
4680 if (uPte & EPT_E_READ)
4681 {
4682 Log4(("pgmPoolTrackDerefPTEPT: i=%d pte=%RX64 GCPhys=%RX64\n",
4683 i, uPte & EPT_PTE_PG_MASK, pPage->GCPhys));
4684 pgmPoolTracDerefGCPhys(pPool, pPage, uPte & EPT_PTE_PG_MASK, GCPhys & GCPhysA20Mask, i);
4685 if (!pPage->cPresent)
4686 break;
4687 }
4688 }
4689}
4690
4691#ifdef VBOX_WITH_NESTED_HWVIRT_VMX_EPT
4692
4693/**
4694 * Clears references to shadowed pages in a SLAT EPT page table.
4695 *
4696 * @param pPool The pool.
4697 * @param pPage The page.
4698 * @param pShwPT The shadow page table (mapping of the page).
4699 * @param pGstPT The guest page table.
4700 */
4701DECLINLINE(void) pgmPoolTrackDerefNestedPTEPT(PPGMPOOL pPool, PPGMPOOLPAGE pPage, PEPTPT pShwPT, PCEPTPT pGstPT)
4702{
4703 Assert(PGMPOOL_PAGE_IS_NESTED(pPage));
4704 for (unsigned i = pPage->iFirstPresent; i < RT_ELEMENTS(pShwPT->a); i++)
4705 {
4706 X86PGPAEUINT const uShwPte = pShwPT->a[i].u;
4707 Assert((uShwPte & UINT64_C(0xfff0000000000f80)) == 0); /* Access, Dirty, UserX (not supported) and ignored bits 7, 11. */
4708 if (uShwPte & EPT_PRESENT_MASK)
4709 {
4710 Log7Func(("Shw=%RX64 GstPte=%RX64\n", uShwPte, pGstPT->a[i].u));
4711 pgmPoolTracDerefGCPhys(pPool, pPage, uShwPte & EPT_PTE_PG_MASK, pGstPT->a[i].u & EPT_PTE_PG_MASK, i);
4712 if (!pPage->cPresent)
4713 break;
4714 }
4715 }
4716}
4717
4718
4719/**
4720 * Clear references to guest physical memory in a SLAT 2MB EPT page table.
4721 *
4722 * @param pPool The pool.
4723 * @param pPage The page.
4724 * @param pShwPT The shadow page table (mapping of the page).
4725 */
4726DECLINLINE(void) pgmPoolTrackDerefNestedPTEPT2MB(PPGMPOOL pPool, PPGMPOOLPAGE pPage, PEPTPT pShwPT)
4727{
4728 Assert(pPage->fA20Enabled);
4729 RTGCPHYS GCPhys = pPage->GCPhys + PAGE_SIZE * pPage->iFirstPresent;
4730 for (unsigned i = pPage->iFirstPresent; i < RT_ELEMENTS(pShwPT->a); i++, GCPhys += PAGE_SIZE)
4731 {
4732 X86PGPAEUINT const uShwPte = pShwPT->a[i].u;
4733 Assert((uShwPte & UINT64_C(0xfff0000000000f80)) == 0); /* Access, Dirty, UserX (not supported) and ignored bits 7, 11. */
4734 if (uShwPte & EPT_PRESENT_MASK)
4735 {
4736 Log7Func(("Shw=%RX64 GstPte=%RX64\n", uShwPte, GCPhys));
4737 pgmPoolTracDerefGCPhys(pPool, pPage, uShwPte & EPT_PTE_PG_MASK, GCPhys, i);
4738 if (!pPage->cPresent)
4739 break;
4740 }
4741 }
4742}
4743
4744
4745/**
4746 * Clear references to shadowed pages in a SLAT EPT page directory.
4747 *
4748 * @param pPool The pool.
4749 * @param pPage The page.
4750 * @param pShwPD The shadow page directory (mapping of the page).
4751 * @param pGstPD The guest page directory.
4752 */
4753DECLINLINE(void) pgmPoolTrackDerefNestedPDEpt(PPGMPOOL pPool, PPGMPOOLPAGE pPage, PEPTPD pShwPD, PCEPTPD pGstPD)
4754{
4755 for (unsigned i = 0; i < RT_ELEMENTS(pShwPD->a); i++)
4756 {
4757 X86PGPAEUINT const uPde = pShwPD->a[i].u;
4758#ifdef PGM_WITH_LARGE_PAGES
4759 AssertMsg((uPde & UINT64_C(0xfff0000000000f00)) == 0, ("uPde=%RX64\n", uPde));
4760#else
4761 AssertMsg((uPde & UINT64_C(0xfff0000000000f80)) == 0, ("uPde=%RX64\n", uPde));
4762#endif
4763 if (uPde & EPT_PRESENT_MASK)
4764 {
4765#ifdef PGM_WITH_LARGE_PAGES
4766 if (uPde & EPT_E_LEAF)
4767 {
4768 Log4(("pgmPoolTrackDerefPDEPT: i=%d pde=%RX64 GCPhys=%RX64\n", i, uPde & EPT_PDE2M_PG_MASK, pPage->GCPhys));
4769 pgmPoolTracDerefGCPhys(pPool, pPage, uPde & EPT_PDE2M_PG_MASK, pGstPD->a[i].u & EPT_PDE2M_PG_MASK, i);
4770 }
4771 else
4772#endif
4773 {
4774 PPGMPOOLPAGE pSubPage = (PPGMPOOLPAGE)RTAvloHCPhysGet(&pPool->HCPhysTree, uPde & EPT_PDE_PG_MASK);
4775 if (pSubPage)
4776 pgmPoolTrackFreeUser(pPool, pSubPage, pPage->idx, i);
4777 else
4778 AssertFatalMsgFailed(("%RX64\n", pShwPD->a[i].u & EPT_PDE_PG_MASK));
4779 }
4780 }
4781 }
4782}
4783
4784#endif /* VBOX_WITH_NESTED_HWVIRT_VMX_EPT */
4785
4786
4787/**
4788 * Clear references to shadowed pages in a 32 bits page directory.
4789 *
4790 * @param pPool The pool.
4791 * @param pPage The page.
4792 * @param pShwPD The shadow page directory (mapping of the page).
4793 */
4794DECLINLINE(void) pgmPoolTrackDerefPD(PPGMPOOL pPool, PPGMPOOLPAGE pPage, PX86PD pShwPD)
4795{
4796 for (unsigned i = 0; i < RT_ELEMENTS(pShwPD->a); i++)
4797 {
4798 X86PGUINT const uPde = pShwPD->a[i].u;
4799 if (uPde & X86_PDE_P)
4800 {
4801 PPGMPOOLPAGE pSubPage = (PPGMPOOLPAGE)RTAvloHCPhysGet(&pPool->HCPhysTree, pShwPD->a[i].u & X86_PDE_PG_MASK);
4802 if (pSubPage)
4803 pgmPoolTrackFreeUser(pPool, pSubPage, pPage->idx, i);
4804 else
4805 AssertFatalMsgFailed(("%x\n", pShwPD->a[i].u & X86_PDE_PG_MASK));
4806 }
4807 }
4808}
4809
4810
4811/**
4812 * Clear references to shadowed pages in a PAE (legacy or 64 bits) page directory.
4813 *
4814 * @param pPool The pool.
4815 * @param pPage The page.
4816 * @param pShwPD The shadow page directory (mapping of the page).
4817 */
4818DECLINLINE(void) pgmPoolTrackDerefPDPae(PPGMPOOL pPool, PPGMPOOLPAGE pPage, PX86PDPAE pShwPD)
4819{
4820 for (unsigned i = 0; i < RT_ELEMENTS(pShwPD->a); i++)
4821 {
4822 X86PGPAEUINT const uPde = pShwPD->a[i].u;
4823 if (uPde & X86_PDE_P)
4824 {
4825#ifdef PGM_WITH_LARGE_PAGES
4826 if (uPde & X86_PDE_PS)
4827 {
4828 Log4(("pgmPoolTrackDerefPDPae: i=%d pde=%RX64 GCPhys=%RX64\n",
4829 i, uPde & X86_PDE2M_PAE_PG_MASK, pPage->GCPhys));
4830 pgmPoolTracDerefGCPhys(pPool, pPage, uPde & X86_PDE2M_PAE_PG_MASK,
4831 pPage->GCPhys + i * 2 * _1M /* pPage->GCPhys = base address of the memory described by the PD */,
4832 i);
4833 }
4834 else
4835#endif
4836 {
4837 Assert((uPde & (X86_PDE_PAE_MBZ_MASK_NX | UINT64_C(0x7ff0000000000000))) == 0);
4838 PPGMPOOLPAGE pSubPage = (PPGMPOOLPAGE)RTAvloHCPhysGet(&pPool->HCPhysTree, uPde & X86_PDE_PAE_PG_MASK);
4839 if (pSubPage)
4840 pgmPoolTrackFreeUser(pPool, pSubPage, pPage->idx, i);
4841 else
4842 AssertFatalMsgFailed(("%RX64\n", uPde & X86_PDE_PAE_PG_MASK));
4843 /** @todo 64-bit guests: have to ensure that we're not exhausting the dynamic mappings! */
4844 }
4845 }
4846 }
4847}
4848
4849
4850/**
4851 * Clear references to shadowed pages in a PAE page directory pointer table.
4852 *
4853 * @param pPool The pool.
4854 * @param pPage The page.
4855 * @param pShwPDPT The shadow page directory pointer table (mapping of the page).
4856 */
4857DECLINLINE(void) pgmPoolTrackDerefPDPTPae(PPGMPOOL pPool, PPGMPOOLPAGE pPage, PX86PDPT pShwPDPT)
4858{
4859 for (unsigned i = 0; i < X86_PG_PAE_PDPE_ENTRIES; i++)
4860 {
4861 X86PGPAEUINT const uPdpe = pShwPDPT->a[i].u;
4862 Assert((uPdpe & (X86_PDPE_PAE_MBZ_MASK | UINT64_C(0x7ff0000000000200))) == 0);
4863 if (uPdpe & X86_PDPE_P)
4864 {
4865 PPGMPOOLPAGE pSubPage = (PPGMPOOLPAGE)RTAvloHCPhysGet(&pPool->HCPhysTree, uPdpe & X86_PDPE_PG_MASK);
4866 if (pSubPage)
4867 pgmPoolTrackFreeUser(pPool, pSubPage, pPage->idx, i);
4868 else
4869 AssertFatalMsgFailed(("%RX64\n", uPdpe & X86_PDPE_PG_MASK));
4870 }
4871 }
4872}
4873
4874
4875/**
4876 * Clear references to shadowed pages in a 64-bit page directory pointer table.
4877 *
4878 * @param pPool The pool.
4879 * @param pPage The page.
4880 * @param pShwPDPT The shadow page directory pointer table (mapping of the page).
4881 */
4882DECLINLINE(void) pgmPoolTrackDerefPDPT64Bit(PPGMPOOL pPool, PPGMPOOLPAGE pPage, PX86PDPT pShwPDPT)
4883{
4884 for (unsigned i = 0; i < RT_ELEMENTS(pShwPDPT->a); i++)
4885 {
4886 X86PGPAEUINT const uPdpe = pShwPDPT->a[i].u;
4887 Assert((uPdpe & (X86_PDPE_LM_MBZ_MASK_NX | UINT64_C(0x7ff0000000000200))) == 0);
4888 if (uPdpe & X86_PDPE_P)
4889 {
4890 PPGMPOOLPAGE pSubPage = (PPGMPOOLPAGE)RTAvloHCPhysGet(&pPool->HCPhysTree, uPdpe & X86_PDPE_PG_MASK);
4891 if (pSubPage)
4892 pgmPoolTrackFreeUser(pPool, pSubPage, pPage->idx, i);
4893 else
4894 AssertFatalMsgFailed(("%RX64\n", uPdpe & X86_PDPE_PG_MASK));
4895 /** @todo 64-bit guests: have to ensure that we're not exhausting the dynamic mappings! */
4896 }
4897 }
4898}
4899
4900
4901/**
4902 * Clear references to shadowed pages in a 64-bit level 4 page table.
4903 *
4904 * @param pPool The pool.
4905 * @param pPage The page.
4906 * @param pShwPML4 The shadow page directory pointer table (mapping of the page).
4907 */
4908DECLINLINE(void) pgmPoolTrackDerefPML464Bit(PPGMPOOL pPool, PPGMPOOLPAGE pPage, PX86PML4 pShwPML4)
4909{
4910 for (unsigned i = 0; i < RT_ELEMENTS(pShwPML4->a); i++)
4911 {
4912 X86PGPAEUINT const uPml4e = pShwPML4->a[i].u;
4913 Assert((uPml4e & (X86_PML4E_MBZ_MASK_NX | UINT64_C(0x7ff0000000000200))) == 0);
4914 if (uPml4e & X86_PML4E_P)
4915 {
4916 PPGMPOOLPAGE pSubPage = (PPGMPOOLPAGE)RTAvloHCPhysGet(&pPool->HCPhysTree, uPml4e & X86_PDPE_PG_MASK);
4917 if (pSubPage)
4918 pgmPoolTrackFreeUser(pPool, pSubPage, pPage->idx, i);
4919 else
4920 AssertFatalMsgFailed(("%RX64\n", uPml4e & X86_PML4E_PG_MASK));
4921 /** @todo 64-bit guests: have to ensure that we're not exhausting the dynamic mappings! */
4922 }
4923 }
4924}
4925
4926
4927/**
4928 * Clear references to shadowed pages in an EPT page directory.
4929 *
4930 * @param pPool The pool.
4931 * @param pPage The page.
4932 * @param pShwPD The shadow page directory (mapping of the page).
4933 */
4934DECLINLINE(void) pgmPoolTrackDerefPDEPT(PPGMPOOL pPool, PPGMPOOLPAGE pPage, PEPTPD pShwPD)
4935{
4936 for (unsigned i = 0; i < RT_ELEMENTS(pShwPD->a); i++)
4937 {
4938 X86PGPAEUINT const uPde = pShwPD->a[i].u;
4939#ifdef PGM_WITH_LARGE_PAGES
4940 AssertMsg((uPde & UINT64_C(0xfff0000000000f00)) == 0, ("uPde=%RX64\n", uPde));
4941#else
4942 AssertMsg((uPde & UINT64_C(0xfff0000000000f80)) == 0, ("uPde=%RX64\n", uPde));
4943#endif
4944 if (uPde & EPT_E_READ)
4945 {
4946#ifdef PGM_WITH_LARGE_PAGES
4947 if (uPde & EPT_E_LEAF)
4948 {
4949 Log4(("pgmPoolTrackDerefPDEPT: i=%d pde=%RX64 GCPhys=%RX64\n",
4950 i, uPde & EPT_PDE2M_PG_MASK, pPage->GCPhys));
4951 pgmPoolTracDerefGCPhys(pPool, pPage, uPde & EPT_PDE2M_PG_MASK,
4952 pPage->GCPhys + i * 2 * _1M /* pPage->GCPhys = base address of the memory described by the PD */,
4953 i);
4954 }
4955 else
4956#endif
4957 {
4958 PPGMPOOLPAGE pSubPage = (PPGMPOOLPAGE)RTAvloHCPhysGet(&pPool->HCPhysTree, uPde & EPT_PDE_PG_MASK);
4959 if (pSubPage)
4960 pgmPoolTrackFreeUser(pPool, pSubPage, pPage->idx, i);
4961 else
4962 AssertFatalMsgFailed(("%RX64\n", pShwPD->a[i].u & EPT_PDE_PG_MASK));
4963 }
4964 /** @todo 64-bit guests: have to ensure that we're not exhausting the dynamic mappings! */
4965 }
4966 }
4967}
4968
4969
4970/**
4971 * Clear references to shadowed pages in an EPT page directory pointer table.
4972 *
4973 * @param pPool The pool.
4974 * @param pPage The page.
4975 * @param pShwPDPT The shadow page directory pointer table (mapping of the page).
4976 */
4977DECLINLINE(void) pgmPoolTrackDerefPDPTEPT(PPGMPOOL pPool, PPGMPOOLPAGE pPage, PEPTPDPT pShwPDPT)
4978{
4979 for (unsigned i = 0; i < RT_ELEMENTS(pShwPDPT->a); i++)
4980 {
4981 X86PGPAEUINT const uPdpe = pShwPDPT->a[i].u;
4982 Assert((uPdpe & UINT64_C(0xfff0000000000f80)) == 0);
4983 if (uPdpe & EPT_E_READ)
4984 {
4985 PPGMPOOLPAGE pSubPage = (PPGMPOOLPAGE)RTAvloHCPhysGet(&pPool->HCPhysTree, uPdpe & EPT_PDPTE_PG_MASK);
4986 if (pSubPage)
4987 pgmPoolTrackFreeUser(pPool, pSubPage, pPage->idx, i);
4988 else
4989 AssertFatalMsgFailed(("%RX64\n", uPdpe & EPT_PDPTE_PG_MASK));
4990 /** @todo 64-bit guests: have to ensure that we're not exhausting the dynamic mappings! */
4991 }
4992 }
4993}
4994
4995
4996/**
4997 * Clears all references made by this page.
4998 *
4999 * This includes other shadow pages and GC physical addresses.
5000 *
5001 * @param pPool The pool.
5002 * @param pPage The page.
5003 */
5004static void pgmPoolTrackDeref(PPGMPOOL pPool, PPGMPOOLPAGE pPage)
5005{
5006 /*
5007 * Map the shadow page and take action according to the page kind.
5008 */
5009 PVMCC pVM = pPool->CTX_SUFF(pVM);
5010 void *pvShw = PGMPOOL_PAGE_2_PTR(pVM, pPage);
5011 switch