VirtualBox

source: vbox/trunk/src/VBox/VMM/VMMAll/PGMAllPool.cpp@ 43667

Last change on this file since 43667 was 43387, checked in by vboxsync, 12 years ago

VMM: HM cleanup.

  • Property svn:eol-style set to native
  • Property svn:keywords set to Id Revision
File size: 211.1 KB
Line 
1/* $Id: PGMAllPool.cpp 43387 2012-09-21 09:40:25Z vboxsync $ */
2/** @file
3 * PGM Shadow Page Pool.
4 */
5
6/*
7 * Copyright (C) 2006-2012 Oracle Corporation
8 *
9 * This file is part of VirtualBox Open Source Edition (OSE), as
10 * available from http://www.virtualbox.org. This file is free software;
11 * you can redistribute it and/or modify it under the terms of the GNU
12 * General Public License (GPL) as published by the Free Software
13 * Foundation, in version 2 as it comes in the "COPYING" file of the
14 * VirtualBox OSE distribution. VirtualBox OSE is distributed in the
15 * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind.
16 */
17
18
19/*******************************************************************************
20* Header Files *
21*******************************************************************************/
22#define LOG_GROUP LOG_GROUP_PGM_POOL
23#include <VBox/vmm/pgm.h>
24#include <VBox/vmm/mm.h>
25#include <VBox/vmm/em.h>
26#include <VBox/vmm/cpum.h>
27#ifdef IN_RC
28# include <VBox/vmm/patm.h>
29#endif
30#include "PGMInternal.h"
31#include <VBox/vmm/vm.h>
32#include "PGMInline.h"
33#include <VBox/disopcode.h>
34#include <VBox/vmm/hm_vmx.h>
35
36#include <VBox/log.h>
37#include <VBox/err.h>
38#include <iprt/asm.h>
39#include <iprt/asm-amd64-x86.h>
40#include <iprt/string.h>
41
42
43/*******************************************************************************
44* Internal Functions *
45*******************************************************************************/
46RT_C_DECLS_BEGIN
47DECLINLINE(unsigned) pgmPoolTrackGetShadowEntrySize(PGMPOOLKIND enmKind);
48DECLINLINE(unsigned) pgmPoolTrackGetGuestEntrySize(PGMPOOLKIND enmKind);
49static void pgmPoolTrackDeref(PPGMPOOL pPool, PPGMPOOLPAGE pPage);
50static int pgmPoolTrackAddUser(PPGMPOOL pPool, PPGMPOOLPAGE pPage, uint16_t iUser, uint32_t iUserTable);
51static void pgmPoolMonitorModifiedRemove(PPGMPOOL pPool, PPGMPOOLPAGE pPage);
52#ifndef IN_RING3
53DECLEXPORT(int) pgmPoolAccessHandler(PVM pVM, RTGCUINT uErrorCode, PCPUMCTXCORE pRegFrame, RTGCPTR pvFault, RTGCPHYS GCPhysFault, void *pvUser);
54#endif
55#ifdef LOG_ENABLED
56static const char *pgmPoolPoolKindToStr(uint8_t enmKind);
57#endif
58#if 0 /*defined(VBOX_STRICT) && defined(PGMPOOL_WITH_OPTIMIZED_DIRTY_PT)*/
59static void pgmPoolTrackCheckPTPaePae(PPGMPOOL pPool, PPGMPOOLPAGE pPage, PPGMSHWPTPAE pShwPT, PCX86PTPAE pGstPT);
60#endif
61
62int pgmPoolTrackFlushGCPhysPTsSlow(PVM pVM, PPGMPAGE pPhysPage);
63PPGMPOOLPHYSEXT pgmPoolTrackPhysExtAlloc(PVM pVM, uint16_t *piPhysExt);
64void pgmPoolTrackPhysExtFree(PVM pVM, uint16_t iPhysExt);
65void pgmPoolTrackPhysExtFreeList(PVM pVM, uint16_t iPhysExt);
66
67RT_C_DECLS_END
68
69
70/**
71 * Checks if the specified page pool kind is for a 4MB or 2MB guest page.
72 *
73 * @returns true if it's the shadow of a 4MB or 2MB guest page, otherwise false.
74 * @param enmKind The page kind.
75 */
76DECLINLINE(bool) pgmPoolIsBigPage(PGMPOOLKIND enmKind)
77{
78 switch (enmKind)
79 {
80 case PGMPOOLKIND_32BIT_PT_FOR_32BIT_4MB:
81 case PGMPOOLKIND_PAE_PT_FOR_32BIT_4MB:
82 case PGMPOOLKIND_PAE_PT_FOR_PAE_2MB:
83 return true;
84 default:
85 return false;
86 }
87}
88
89
90/**
91 * Flushes a chain of pages sharing the same access monitor.
92 *
93 * @returns VBox status code suitable for scheduling.
94 * @param pPool The pool.
95 * @param pPage A page in the chain.
96 * @todo VBOXSTRICTRC
97 */
98int pgmPoolMonitorChainFlush(PPGMPOOL pPool, PPGMPOOLPAGE pPage)
99{
100 LogFlow(("pgmPoolMonitorChainFlush: Flush page %RGp type=%d\n", pPage->GCPhys, pPage->enmKind));
101
102 /*
103 * Find the list head.
104 */
105 uint16_t idx = pPage->idx;
106 if (pPage->iMonitoredPrev != NIL_PGMPOOL_IDX)
107 {
108 while (pPage->iMonitoredPrev != NIL_PGMPOOL_IDX)
109 {
110 idx = pPage->iMonitoredPrev;
111 Assert(idx != pPage->idx);
112 pPage = &pPool->aPages[idx];
113 }
114 }
115
116 /*
117 * Iterate the list flushing each shadow page.
118 */
119 int rc = VINF_SUCCESS;
120 for (;;)
121 {
122 idx = pPage->iMonitoredNext;
123 Assert(idx != pPage->idx);
124 if (pPage->idx >= PGMPOOL_IDX_FIRST)
125 {
126 int rc2 = pgmPoolFlushPage(pPool, pPage);
127 AssertRC(rc2);
128 }
129 /* next */
130 if (idx == NIL_PGMPOOL_IDX)
131 break;
132 pPage = &pPool->aPages[idx];
133 }
134 return rc;
135}
136
137
138/**
139 * Wrapper for getting the current context pointer to the entry being modified.
140 *
141 * @returns VBox status code suitable for scheduling.
142 * @param pVM Pointer to the VM.
143 * @param pvDst Destination address
144 * @param pvSrc Source guest virtual address.
145 * @param GCPhysSrc The source guest physical address.
146 * @param cb Size of data to read
147 */
148DECLINLINE(int) pgmPoolPhysSimpleReadGCPhys(PVM pVM, void *pvDst, CTXTYPE(RTGCPTR, RTHCPTR, RTGCPTR) pvSrc,
149 RTGCPHYS GCPhysSrc, size_t cb)
150{
151#if defined(IN_RING3)
152 NOREF(pVM); NOREF(GCPhysSrc);
153 memcpy(pvDst, (RTHCPTR)((uintptr_t)pvSrc & ~(RTHCUINTPTR)(cb - 1)), cb);
154 return VINF_SUCCESS;
155#else
156 /* @todo in RC we could attempt to use the virtual address, although this can cause many faults (PAE Windows XP guest). */
157 NOREF(pvSrc);
158 return PGMPhysSimpleReadGCPhys(pVM, pvDst, GCPhysSrc & ~(RTGCPHYS)(cb - 1), cb);
159#endif
160}
161
162
163/**
164 * Process shadow entries before they are changed by the guest.
165 *
166 * For PT entries we will clear them. For PD entries, we'll simply check
167 * for mapping conflicts and set the SyncCR3 FF if found.
168 *
169 * @param pVCpu Pointer to the VMCPU.
170 * @param pPool The pool.
171 * @param pPage The head page.
172 * @param GCPhysFault The guest physical fault address.
173 * @param uAddress In R0 and GC this is the guest context fault address (flat).
174 * In R3 this is the host context 'fault' address.
175 * @param cbWrite Write size; might be zero if the caller knows we're not crossing entry boundaries
176 */
177void pgmPoolMonitorChainChanging(PVMCPU pVCpu, PPGMPOOL pPool, PPGMPOOLPAGE pPage, RTGCPHYS GCPhysFault,
178 CTXTYPE(RTGCPTR, RTHCPTR, RTGCPTR) pvAddress, unsigned cbWrite)
179{
180 AssertMsg(pPage->iMonitoredPrev == NIL_PGMPOOL_IDX, ("%u (idx=%u)\n", pPage->iMonitoredPrev, pPage->idx));
181 const unsigned off = GCPhysFault & PAGE_OFFSET_MASK;
182 PVM pVM = pPool->CTX_SUFF(pVM);
183 NOREF(pVCpu);
184
185 LogFlow(("pgmPoolMonitorChainChanging: %RGv phys=%RGp cbWrite=%d\n", (RTGCPTR)(CTXTYPE(RTGCPTR, uintptr_t, RTGCPTR))pvAddress, GCPhysFault, cbWrite));
186
187 for (;;)
188 {
189 union
190 {
191 void *pv;
192 PX86PT pPT;
193 PPGMSHWPTPAE pPTPae;
194 PX86PD pPD;
195 PX86PDPAE pPDPae;
196 PX86PDPT pPDPT;
197 PX86PML4 pPML4;
198 } uShw;
199
200 LogFlow(("pgmPoolMonitorChainChanging: page idx=%d phys=%RGp (next=%d) kind=%s\n", pPage->idx, pPage->GCPhys, pPage->iMonitoredNext, pgmPoolPoolKindToStr(pPage->enmKind), cbWrite));
201
202 uShw.pv = NULL;
203 switch (pPage->enmKind)
204 {
205 case PGMPOOLKIND_32BIT_PT_FOR_32BIT_PT:
206 {
207 STAM_COUNTER_INC(&pPool->CTX_MID_Z(StatMonitor,FaultPT));
208 uShw.pv = PGMPOOL_PAGE_2_PTR(pVM, pPage);
209 const unsigned iShw = off / sizeof(X86PTE);
210 LogFlow(("PGMPOOLKIND_32BIT_PT_FOR_32BIT_PT iShw=%x\n", iShw));
211 if (uShw.pPT->a[iShw].n.u1Present)
212 {
213 X86PTE GstPte;
214
215 int rc = pgmPoolPhysSimpleReadGCPhys(pVM, &GstPte, pvAddress, GCPhysFault, sizeof(GstPte));
216 AssertRC(rc);
217 Log4(("pgmPoolMonitorChainChanging 32_32: deref %016RX64 GCPhys %08RX32\n", uShw.pPT->a[iShw].u & X86_PTE_PAE_PG_MASK, GstPte.u & X86_PTE_PG_MASK));
218 pgmPoolTracDerefGCPhysHint(pPool, pPage,
219 uShw.pPT->a[iShw].u & X86_PTE_PAE_PG_MASK,
220 GstPte.u & X86_PTE_PG_MASK,
221 iShw);
222 ASMAtomicWriteU32(&uShw.pPT->a[iShw].u, 0);
223 }
224 break;
225 }
226
227 /* page/2 sized */
228 case PGMPOOLKIND_PAE_PT_FOR_32BIT_PT:
229 {
230 STAM_COUNTER_INC(&pPool->CTX_MID_Z(StatMonitor,FaultPT));
231 uShw.pv = PGMPOOL_PAGE_2_PTR(pVM, pPage);
232 if (!((off ^ pPage->GCPhys) & (PAGE_SIZE / 2)))
233 {
234 const unsigned iShw = (off / sizeof(X86PTE)) & (X86_PG_PAE_ENTRIES - 1);
235 LogFlow(("PGMPOOLKIND_PAE_PT_FOR_32BIT_PT iShw=%x\n", iShw));
236 if (PGMSHWPTEPAE_IS_P(uShw.pPTPae->a[iShw]))
237 {
238 X86PTE GstPte;
239 int rc = pgmPoolPhysSimpleReadGCPhys(pVM, &GstPte, pvAddress, GCPhysFault, sizeof(GstPte));
240 AssertRC(rc);
241
242 Log4(("pgmPoolMonitorChainChanging pae_32: deref %016RX64 GCPhys %08RX32\n", uShw.pPT->a[iShw].u & X86_PTE_PAE_PG_MASK, GstPte.u & X86_PTE_PG_MASK));
243 pgmPoolTracDerefGCPhysHint(pPool, pPage,
244 PGMSHWPTEPAE_GET_HCPHYS(uShw.pPTPae->a[iShw]),
245 GstPte.u & X86_PTE_PG_MASK,
246 iShw);
247 PGMSHWPTEPAE_ATOMIC_SET(uShw.pPTPae->a[iShw], 0);
248 }
249 }
250 break;
251 }
252
253 case PGMPOOLKIND_PAE_PD0_FOR_32BIT_PD:
254 case PGMPOOLKIND_PAE_PD1_FOR_32BIT_PD:
255 case PGMPOOLKIND_PAE_PD2_FOR_32BIT_PD:
256 case PGMPOOLKIND_PAE_PD3_FOR_32BIT_PD:
257 {
258 unsigned iGst = off / sizeof(X86PDE);
259 unsigned iShwPdpt = iGst / 256;
260 unsigned iShw = (iGst % 256) * 2;
261 uShw.pv = PGMPOOL_PAGE_2_PTR(pVM, pPage);
262
263 LogFlow(("pgmPoolMonitorChainChanging PAE for 32 bits: iGst=%x iShw=%x idx = %d page idx=%d\n", iGst, iShw, iShwPdpt, pPage->enmKind - PGMPOOLKIND_PAE_PD0_FOR_32BIT_PD));
264 STAM_COUNTER_INC(&pPool->CTX_MID_Z(StatMonitor,FaultPD));
265 if (iShwPdpt == pPage->enmKind - (unsigned)PGMPOOLKIND_PAE_PD0_FOR_32BIT_PD)
266 {
267 for (unsigned i = 0; i < 2; i++)
268 {
269# ifdef VBOX_WITH_RAW_MODE_NOT_R0
270 if ((uShw.pPDPae->a[iShw + i].u & (PGM_PDFLAGS_MAPPING | X86_PDE_P)) == (PGM_PDFLAGS_MAPPING | X86_PDE_P))
271 {
272 Assert(pgmMapAreMappingsEnabled(pVM));
273 VMCPU_FF_SET(pVCpu, VMCPU_FF_PGM_SYNC_CR3);
274 LogFlow(("pgmPoolMonitorChainChanging: Detected conflict at iShwPdpt=%#x iShw=%#x!\n", iShwPdpt, iShw+i));
275 break;
276 }
277# endif /* VBOX_WITH_RAW_MODE_NOT_R0 */
278 if (uShw.pPDPae->a[iShw+i].n.u1Present)
279 {
280 LogFlow(("pgmPoolMonitorChainChanging: pae pd iShw=%#x: %RX64 -> freeing it!\n", iShw+i, uShw.pPDPae->a[iShw+i].u));
281 pgmPoolFree(pVM,
282 uShw.pPDPae->a[iShw+i].u & X86_PDE_PAE_PG_MASK,
283 pPage->idx,
284 iShw + i);
285 ASMAtomicWriteU64(&uShw.pPDPae->a[iShw+i].u, 0);
286 }
287
288 /* paranoia / a bit assumptive. */
289 if ( (off & 3)
290 && (off & 3) + cbWrite > 4)
291 {
292 const unsigned iShw2 = iShw + 2 + i;
293 if (iShw2 < RT_ELEMENTS(uShw.pPDPae->a))
294 {
295# ifdef VBOX_WITH_RAW_MODE_NOT_R0
296 if ((uShw.pPDPae->a[iShw2].u & (PGM_PDFLAGS_MAPPING | X86_PDE_P)) == (PGM_PDFLAGS_MAPPING | X86_PDE_P))
297 {
298 Assert(pgmMapAreMappingsEnabled(pVM));
299 VMCPU_FF_SET(pVCpu, VMCPU_FF_PGM_SYNC_CR3);
300 LogFlow(("pgmPoolMonitorChainChanging: Detected conflict at iShwPdpt=%#x iShw2=%#x!\n", iShwPdpt, iShw2));
301 break;
302 }
303# endif /* VBOX_WITH_RAW_MODE_NOT_R0 */
304 if (uShw.pPDPae->a[iShw2].n.u1Present)
305 {
306 LogFlow(("pgmPoolMonitorChainChanging: pae pd iShw=%#x: %RX64 -> freeing it!\n", iShw2, uShw.pPDPae->a[iShw2].u));
307 pgmPoolFree(pVM,
308 uShw.pPDPae->a[iShw2].u & X86_PDE_PAE_PG_MASK,
309 pPage->idx,
310 iShw2);
311 ASMAtomicWriteU64(&uShw.pPDPae->a[iShw2].u, 0);
312 }
313 }
314 }
315 }
316 }
317 break;
318 }
319
320 case PGMPOOLKIND_PAE_PT_FOR_PAE_PT:
321 {
322 uShw.pv = PGMPOOL_PAGE_2_PTR(pVM, pPage);
323 const unsigned iShw = off / sizeof(X86PTEPAE);
324 STAM_COUNTER_INC(&pPool->CTX_MID_Z(StatMonitor,FaultPT));
325 if (PGMSHWPTEPAE_IS_P(uShw.pPTPae->a[iShw]))
326 {
327 X86PTEPAE GstPte;
328 int rc = pgmPoolPhysSimpleReadGCPhys(pVM, &GstPte, pvAddress, GCPhysFault, sizeof(GstPte));
329 AssertRC(rc);
330
331 Log4(("pgmPoolMonitorChainChanging pae: deref %016RX64 GCPhys %016RX64\n", PGMSHWPTEPAE_GET_HCPHYS(uShw.pPTPae->a[iShw]), GstPte.u & X86_PTE_PAE_PG_MASK));
332 pgmPoolTracDerefGCPhysHint(pPool, pPage,
333 PGMSHWPTEPAE_GET_HCPHYS(uShw.pPTPae->a[iShw]),
334 GstPte.u & X86_PTE_PAE_PG_MASK,
335 iShw);
336 PGMSHWPTEPAE_ATOMIC_SET(uShw.pPTPae->a[iShw], 0);
337 }
338
339 /* paranoia / a bit assumptive. */
340 if ( (off & 7)
341 && (off & 7) + cbWrite > sizeof(X86PTEPAE))
342 {
343 const unsigned iShw2 = (off + cbWrite - 1) / sizeof(X86PTEPAE);
344 AssertBreak(iShw2 < RT_ELEMENTS(uShw.pPTPae->a));
345
346 if (PGMSHWPTEPAE_IS_P(uShw.pPTPae->a[iShw2]))
347 {
348 X86PTEPAE GstPte;
349# ifdef IN_RING3
350 int rc = pgmPoolPhysSimpleReadGCPhys(pVM, &GstPte, (RTHCPTR)((RTHCUINTPTR)pvAddress + sizeof(GstPte)), GCPhysFault + sizeof(GstPte), sizeof(GstPte));
351# else
352 int rc = pgmPoolPhysSimpleReadGCPhys(pVM, &GstPte, pvAddress + sizeof(GstPte), GCPhysFault + sizeof(GstPte), sizeof(GstPte));
353# endif
354 AssertRC(rc);
355 Log4(("pgmPoolMonitorChainChanging pae: deref %016RX64 GCPhys %016RX64\n", PGMSHWPTEPAE_GET_HCPHYS(uShw.pPTPae->a[iShw2]), GstPte.u & X86_PTE_PAE_PG_MASK));
356 pgmPoolTracDerefGCPhysHint(pPool, pPage,
357 PGMSHWPTEPAE_GET_HCPHYS(uShw.pPTPae->a[iShw2]),
358 GstPte.u & X86_PTE_PAE_PG_MASK,
359 iShw2);
360 PGMSHWPTEPAE_ATOMIC_SET(uShw.pPTPae->a[iShw2], 0);
361 }
362 }
363 break;
364 }
365
366 case PGMPOOLKIND_32BIT_PD:
367 {
368 uShw.pv = PGMPOOL_PAGE_2_PTR(pVM, pPage);
369 const unsigned iShw = off / sizeof(X86PTE); // ASSUMING 32-bit guest paging!
370
371 LogFlow(("pgmPoolMonitorChainChanging: PGMPOOLKIND_32BIT_PD %x\n", iShw));
372 STAM_COUNTER_INC(&pPool->CTX_MID_Z(StatMonitor,FaultPD));
373# ifdef VBOX_WITH_RAW_MODE_NOT_R0
374 if (uShw.pPD->a[iShw].u & PGM_PDFLAGS_MAPPING)
375 {
376 Assert(pgmMapAreMappingsEnabled(pVM));
377 VMCPU_FF_SET(pVCpu, VMCPU_FF_PGM_SYNC_CR3);
378 STAM_COUNTER_INC(&(pVCpu->pgm.s.CTX_SUFF(pStats)->StatRZGuestCR3WriteConflict));
379 LogFlow(("pgmPoolMonitorChainChanging: Detected conflict at iShw=%#x!\n", iShw));
380 break;
381 }
382 else
383# endif /* VBOX_WITH_RAW_MODE_NOT_R0 */
384 {
385 if (uShw.pPD->a[iShw].n.u1Present)
386 {
387 LogFlow(("pgmPoolMonitorChainChanging: 32 bit pd iShw=%#x: %RX64 -> freeing it!\n", iShw, uShw.pPD->a[iShw].u));
388 pgmPoolFree(pVM,
389 uShw.pPD->a[iShw].u & X86_PDE_PAE_PG_MASK,
390 pPage->idx,
391 iShw);
392 ASMAtomicWriteU32(&uShw.pPD->a[iShw].u, 0);
393 }
394 }
395 /* paranoia / a bit assumptive. */
396 if ( (off & 3)
397 && (off & 3) + cbWrite > sizeof(X86PTE))
398 {
399 const unsigned iShw2 = (off + cbWrite - 1) / sizeof(X86PTE);
400 if ( iShw2 != iShw
401 && iShw2 < RT_ELEMENTS(uShw.pPD->a))
402 {
403# ifdef VBOX_WITH_RAW_MODE_NOT_R0
404 if (uShw.pPD->a[iShw2].u & PGM_PDFLAGS_MAPPING)
405 {
406 Assert(pgmMapAreMappingsEnabled(pVM));
407 STAM_COUNTER_INC(&(pVCpu->pgm.s.CTX_SUFF(pStats)->StatRZGuestCR3WriteConflict));
408 VMCPU_FF_SET(pVCpu, VMCPU_FF_PGM_SYNC_CR3);
409 LogFlow(("pgmPoolMonitorChainChanging: Detected conflict at iShw2=%#x!\n", iShw2));
410 break;
411 }
412# endif /* VBOX_WITH_RAW_MODE_NOT_R0 */
413 if (uShw.pPD->a[iShw2].n.u1Present)
414 {
415 LogFlow(("pgmPoolMonitorChainChanging: 32 bit pd iShw=%#x: %RX64 -> freeing it!\n", iShw2, uShw.pPD->a[iShw2].u));
416 pgmPoolFree(pVM,
417 uShw.pPD->a[iShw2].u & X86_PDE_PAE_PG_MASK,
418 pPage->idx,
419 iShw2);
420 ASMAtomicWriteU32(&uShw.pPD->a[iShw2].u, 0);
421 }
422 }
423 }
424#if 0 /* useful when running PGMAssertCR3(), a bit too troublesome for general use (TLBs). */
425 if ( uShw.pPD->a[iShw].n.u1Present
426 && !VMCPU_FF_ISSET(pVCpu, VMCPU_FF_PGM_SYNC_CR3))
427 {
428 LogFlow(("pgmPoolMonitorChainChanging: iShw=%#x: %RX32 -> freeing it!\n", iShw, uShw.pPD->a[iShw].u));
429# ifdef IN_RC /* TLB load - we're pushing things a bit... */
430 ASMProbeReadByte(pvAddress);
431# endif
432 pgmPoolFree(pVM, uShw.pPD->a[iShw].u & X86_PDE_PG_MASK, pPage->idx, iShw);
433 ASMAtomicWriteU32(&uShw.pPD->a[iShw].u, 0);
434 }
435#endif
436 break;
437 }
438
439 case PGMPOOLKIND_PAE_PD_FOR_PAE_PD:
440 {
441 uShw.pv = PGMPOOL_PAGE_2_PTR(pVM, pPage);
442 const unsigned iShw = off / sizeof(X86PDEPAE);
443 STAM_COUNTER_INC(&pPool->CTX_MID_Z(StatMonitor,FaultPD));
444#ifdef VBOX_WITH_RAW_MODE_NOT_R0
445 if (uShw.pPDPae->a[iShw].u & PGM_PDFLAGS_MAPPING)
446 {
447 Assert(pgmMapAreMappingsEnabled(pVM));
448 VMCPU_FF_SET(pVCpu, VMCPU_FF_PGM_SYNC_CR3);
449 STAM_COUNTER_INC(&(pVCpu->pgm.s.CTX_SUFF(pStats)->StatRZGuestCR3WriteConflict));
450 LogFlow(("pgmPoolMonitorChainChanging: Detected conflict at iShw=%#x!\n", iShw));
451 break;
452 }
453#endif /* VBOX_WITH_RAW_MODE_NOT_R0 */
454 /*
455 * Causes trouble when the guest uses a PDE to refer to the whole page table level
456 * structure. (Invalidate here; faults later on when it tries to change the page
457 * table entries -> recheck; probably only applies to the RC case.)
458 */
459#ifdef VBOX_WITH_RAW_MODE_NOT_R0
460 else
461#endif
462 {
463 if (uShw.pPDPae->a[iShw].n.u1Present)
464 {
465 LogFlow(("pgmPoolMonitorChainChanging: pae pd iShw=%#x: %RX64 -> freeing it!\n", iShw, uShw.pPDPae->a[iShw].u));
466 pgmPoolFree(pVM,
467 uShw.pPDPae->a[iShw].u & X86_PDE_PAE_PG_MASK,
468 pPage->idx,
469 iShw);
470 ASMAtomicWriteU64(&uShw.pPDPae->a[iShw].u, 0);
471 }
472 }
473 /* paranoia / a bit assumptive. */
474 if ( (off & 7)
475 && (off & 7) + cbWrite > sizeof(X86PDEPAE))
476 {
477 const unsigned iShw2 = (off + cbWrite - 1) / sizeof(X86PDEPAE);
478 AssertBreak(iShw2 < RT_ELEMENTS(uShw.pPDPae->a));
479
480#ifdef VBOX_WITH_RAW_MODE_NOT_R0
481 if ( iShw2 != iShw
482 && uShw.pPDPae->a[iShw2].u & PGM_PDFLAGS_MAPPING)
483 {
484 Assert(pgmMapAreMappingsEnabled(pVM));
485 VMCPU_FF_SET(pVCpu, VMCPU_FF_PGM_SYNC_CR3);
486 STAM_COUNTER_INC(&(pVCpu->pgm.s.CTX_SUFF(pStats)->StatRZGuestCR3WriteConflict));
487 LogFlow(("pgmPoolMonitorChainChanging: Detected conflict at iShw2=%#x!\n", iShw2));
488 break;
489 }
490 else
491#endif /* VBOX_WITH_RAW_MODE_NOT_R0 */
492 if (uShw.pPDPae->a[iShw2].n.u1Present)
493 {
494 LogFlow(("pgmPoolMonitorChainChanging: pae pd iShw2=%#x: %RX64 -> freeing it!\n", iShw2, uShw.pPDPae->a[iShw2].u));
495 pgmPoolFree(pVM,
496 uShw.pPDPae->a[iShw2].u & X86_PDE_PAE_PG_MASK,
497 pPage->idx,
498 iShw2);
499 ASMAtomicWriteU64(&uShw.pPDPae->a[iShw2].u, 0);
500 }
501 }
502 break;
503 }
504
505 case PGMPOOLKIND_PAE_PDPT:
506 {
507 STAM_COUNTER_INC(&pPool->CTX_MID_Z(StatMonitor,FaultPDPT));
508 /*
509 * Hopefully this doesn't happen very often:
510 * - touching unused parts of the page
511 * - messing with the bits of pd pointers without changing the physical address
512 */
513 /* PDPT roots are not page aligned; 32 byte only! */
514 const unsigned offPdpt = GCPhysFault - pPage->GCPhys;
515
516 uShw.pv = PGMPOOL_PAGE_2_PTR(pVM, pPage);
517 const unsigned iShw = offPdpt / sizeof(X86PDPE);
518 if (iShw < X86_PG_PAE_PDPE_ENTRIES) /* don't use RT_ELEMENTS(uShw.pPDPT->a), because that's for long mode only */
519 {
520# ifdef VBOX_WITH_RAW_MODE_NOT_R0
521 if (uShw.pPDPT->a[iShw].u & PGM_PLXFLAGS_MAPPING)
522 {
523 Assert(pgmMapAreMappingsEnabled(pVM));
524 STAM_COUNTER_INC(&(pVCpu->pgm.s.CTX_SUFF(pStats)->StatRZGuestCR3WriteConflict));
525 VMCPU_FF_SET(pVCpu, VMCPU_FF_PGM_SYNC_CR3);
526 LogFlow(("pgmPoolMonitorChainChanging: Detected pdpt conflict at iShw=%#x!\n", iShw));
527 break;
528 }
529 else
530# endif /* VBOX_WITH_RAW_MODE_NOT_R0 */
531 if (uShw.pPDPT->a[iShw].n.u1Present)
532 {
533 LogFlow(("pgmPoolMonitorChainChanging: pae pdpt iShw=%#x: %RX64 -> freeing it!\n", iShw, uShw.pPDPT->a[iShw].u));
534 pgmPoolFree(pVM,
535 uShw.pPDPT->a[iShw].u & X86_PDPE_PG_MASK,
536 pPage->idx,
537 iShw);
538 ASMAtomicWriteU64(&uShw.pPDPT->a[iShw].u, 0);
539 }
540
541 /* paranoia / a bit assumptive. */
542 if ( (offPdpt & 7)
543 && (offPdpt & 7) + cbWrite > sizeof(X86PDPE))
544 {
545 const unsigned iShw2 = (offPdpt + cbWrite - 1) / sizeof(X86PDPE);
546 if ( iShw2 != iShw
547 && iShw2 < X86_PG_PAE_PDPE_ENTRIES)
548 {
549# ifdef VBOX_WITH_RAW_MODE_NOT_R0
550 if (uShw.pPDPT->a[iShw2].u & PGM_PLXFLAGS_MAPPING)
551 {
552 Assert(pgmMapAreMappingsEnabled(pVM));
553 STAM_COUNTER_INC(&(pVCpu->pgm.s.CTX_SUFF(pStats)->StatRZGuestCR3WriteConflict));
554 VMCPU_FF_SET(pVCpu, VMCPU_FF_PGM_SYNC_CR3);
555 LogFlow(("pgmPoolMonitorChainChanging: Detected conflict at iShw2=%#x!\n", iShw2));
556 break;
557 }
558 else
559# endif /* VBOX_WITH_RAW_MODE_NOT_R0 */
560 if (uShw.pPDPT->a[iShw2].n.u1Present)
561 {
562 LogFlow(("pgmPoolMonitorChainChanging: pae pdpt iShw=%#x: %RX64 -> freeing it!\n", iShw2, uShw.pPDPT->a[iShw2].u));
563 pgmPoolFree(pVM,
564 uShw.pPDPT->a[iShw2].u & X86_PDPE_PG_MASK,
565 pPage->idx,
566 iShw2);
567 ASMAtomicWriteU64(&uShw.pPDPT->a[iShw2].u, 0);
568 }
569 }
570 }
571 }
572 break;
573 }
574
575#ifndef IN_RC
576 case PGMPOOLKIND_64BIT_PD_FOR_64BIT_PD:
577 {
578 STAM_COUNTER_INC(&pPool->CTX_MID_Z(StatMonitor,FaultPD));
579 uShw.pv = PGMPOOL_PAGE_2_PTR(pVM, pPage);
580 const unsigned iShw = off / sizeof(X86PDEPAE);
581 Assert(!(uShw.pPDPae->a[iShw].u & PGM_PDFLAGS_MAPPING));
582 if (uShw.pPDPae->a[iShw].n.u1Present)
583 {
584 LogFlow(("pgmPoolMonitorChainChanging: pae pd iShw=%#x: %RX64 -> freeing it!\n", iShw, uShw.pPDPae->a[iShw].u));
585 pgmPoolFree(pVM,
586 uShw.pPDPae->a[iShw].u & X86_PDE_PAE_PG_MASK,
587 pPage->idx,
588 iShw);
589 ASMAtomicWriteU64(&uShw.pPDPae->a[iShw].u, 0);
590 }
591 /* paranoia / a bit assumptive. */
592 if ( (off & 7)
593 && (off & 7) + cbWrite > sizeof(X86PDEPAE))
594 {
595 const unsigned iShw2 = (off + cbWrite - 1) / sizeof(X86PDEPAE);
596 AssertBreak(iShw2 < RT_ELEMENTS(uShw.pPDPae->a));
597
598 Assert(!(uShw.pPDPae->a[iShw2].u & PGM_PDFLAGS_MAPPING));
599 if (uShw.pPDPae->a[iShw2].n.u1Present)
600 {
601 LogFlow(("pgmPoolMonitorChainChanging: pae pd iShw2=%#x: %RX64 -> freeing it!\n", iShw2, uShw.pPDPae->a[iShw2].u));
602 pgmPoolFree(pVM,
603 uShw.pPDPae->a[iShw2].u & X86_PDE_PAE_PG_MASK,
604 pPage->idx,
605 iShw2);
606 ASMAtomicWriteU64(&uShw.pPDPae->a[iShw2].u, 0);
607 }
608 }
609 break;
610 }
611
612 case PGMPOOLKIND_64BIT_PDPT_FOR_64BIT_PDPT:
613 {
614 STAM_COUNTER_INC(&pPool->CTX_MID_Z(StatMonitor,FaultPDPT));
615 /*
616 * Hopefully this doesn't happen very often:
617 * - messing with the bits of pd pointers without changing the physical address
618 */
619 uShw.pv = PGMPOOL_PAGE_2_PTR(pVM, pPage);
620 const unsigned iShw = off / sizeof(X86PDPE);
621 if (uShw.pPDPT->a[iShw].n.u1Present)
622 {
623 LogFlow(("pgmPoolMonitorChainChanging: pdpt iShw=%#x: %RX64 -> freeing it!\n", iShw, uShw.pPDPT->a[iShw].u));
624 pgmPoolFree(pVM, uShw.pPDPT->a[iShw].u & X86_PDPE_PG_MASK, pPage->idx, iShw);
625 ASMAtomicWriteU64(&uShw.pPDPT->a[iShw].u, 0);
626 }
627 /* paranoia / a bit assumptive. */
628 if ( (off & 7)
629 && (off & 7) + cbWrite > sizeof(X86PDPE))
630 {
631 const unsigned iShw2 = (off + cbWrite - 1) / sizeof(X86PDPE);
632 if (uShw.pPDPT->a[iShw2].n.u1Present)
633 {
634 LogFlow(("pgmPoolMonitorChainChanging: pdpt iShw2=%#x: %RX64 -> freeing it!\n", iShw2, uShw.pPDPT->a[iShw2].u));
635 pgmPoolFree(pVM, uShw.pPDPT->a[iShw2].u & X86_PDPE_PG_MASK, pPage->idx, iShw2);
636 ASMAtomicWriteU64(&uShw.pPDPT->a[iShw2].u, 0);
637 }
638 }
639 break;
640 }
641
642 case PGMPOOLKIND_64BIT_PML4:
643 {
644 STAM_COUNTER_INC(&pPool->CTX_MID_Z(StatMonitor,FaultPML4));
645 /*
646 * Hopefully this doesn't happen very often:
647 * - messing with the bits of pd pointers without changing the physical address
648 */
649 uShw.pv = PGMPOOL_PAGE_2_PTR(pVM, pPage);
650 const unsigned iShw = off / sizeof(X86PDPE);
651 if (uShw.pPML4->a[iShw].n.u1Present)
652 {
653 LogFlow(("pgmPoolMonitorChainChanging: pml4 iShw=%#x: %RX64 -> freeing it!\n", iShw, uShw.pPML4->a[iShw].u));
654 pgmPoolFree(pVM, uShw.pPML4->a[iShw].u & X86_PML4E_PG_MASK, pPage->idx, iShw);
655 ASMAtomicWriteU64(&uShw.pPML4->a[iShw].u, 0);
656 }
657 /* paranoia / a bit assumptive. */
658 if ( (off & 7)
659 && (off & 7) + cbWrite > sizeof(X86PDPE))
660 {
661 const unsigned iShw2 = (off + cbWrite - 1) / sizeof(X86PML4E);
662 if (uShw.pPML4->a[iShw2].n.u1Present)
663 {
664 LogFlow(("pgmPoolMonitorChainChanging: pml4 iShw2=%#x: %RX64 -> freeing it!\n", iShw2, uShw.pPML4->a[iShw2].u));
665 pgmPoolFree(pVM, uShw.pPML4->a[iShw2].u & X86_PML4E_PG_MASK, pPage->idx, iShw2);
666 ASMAtomicWriteU64(&uShw.pPML4->a[iShw2].u, 0);
667 }
668 }
669 break;
670 }
671#endif /* IN_RING0 */
672
673 default:
674 AssertFatalMsgFailed(("enmKind=%d\n", pPage->enmKind));
675 }
676 PGM_DYNMAP_UNUSED_HINT_VM(pVM, uShw.pv);
677
678 /* next */
679 if (pPage->iMonitoredNext == NIL_PGMPOOL_IDX)
680 return;
681 pPage = &pPool->aPages[pPage->iMonitoredNext];
682 }
683}
684
685# ifndef IN_RING3
686
687/**
688 * Checks if a access could be a fork operation in progress.
689 *
690 * Meaning, that the guest is setting up the parent process for Copy-On-Write.
691 *
692 * @returns true if it's likely that we're forking, otherwise false.
693 * @param pPool The pool.
694 * @param pDis The disassembled instruction.
695 * @param offFault The access offset.
696 */
697DECLINLINE(bool) pgmPoolMonitorIsForking(PPGMPOOL pPool, PDISCPUSTATE pDis, unsigned offFault)
698{
699 /*
700 * i386 linux is using btr to clear X86_PTE_RW.
701 * The functions involved are (2.6.16 source inspection):
702 * clear_bit
703 * ptep_set_wrprotect
704 * copy_one_pte
705 * copy_pte_range
706 * copy_pmd_range
707 * copy_pud_range
708 * copy_page_range
709 * dup_mmap
710 * dup_mm
711 * copy_mm
712 * copy_process
713 * do_fork
714 */
715 if ( pDis->pCurInstr->uOpcode == OP_BTR
716 && !(offFault & 4)
717 /** @todo Validate that the bit index is X86_PTE_RW. */
718 )
719 {
720 STAM_COUNTER_INC(&pPool->CTX_MID_Z(StatMonitor,Fork));
721 return true;
722 }
723 return false;
724}
725
726
727/**
728 * Determine whether the page is likely to have been reused.
729 *
730 * @returns true if we consider the page as being reused for a different purpose.
731 * @returns false if we consider it to still be a paging page.
732 * @param pVM Pointer to the VM.
733 * @param pVCpu Pointer to the VMCPU.
734 * @param pRegFrame Trap register frame.
735 * @param pDis The disassembly info for the faulting instruction.
736 * @param pvFault The fault address.
737 *
738 * @remark The REP prefix check is left to the caller because of STOSD/W.
739 */
740DECLINLINE(bool) pgmPoolMonitorIsReused(PVM pVM, PVMCPU pVCpu, PCPUMCTXCORE pRegFrame, PDISCPUSTATE pDis, RTGCPTR pvFault)
741{
742#ifndef IN_RC
743 /** @todo could make this general, faulting close to rsp should be a safe reuse heuristic. */
744 if ( HMHasPendingIrq(pVM)
745 && (pRegFrame->rsp - pvFault) < 32)
746 {
747 /* Fault caused by stack writes while trying to inject an interrupt event. */
748 Log(("pgmPoolMonitorIsReused: reused %RGv for interrupt stack (rsp=%RGv).\n", pvFault, pRegFrame->rsp));
749 return true;
750 }
751#else
752 NOREF(pVM); NOREF(pvFault);
753#endif
754
755 LogFlow(("Reused instr %RGv %d at %RGv param1.fUse=%llx param1.reg=%d\n", pRegFrame->rip, pDis->pCurInstr->uOpcode, pvFault, pDis->Param1.fUse, pDis->Param1.Base.idxGenReg));
756
757 /* Non-supervisor mode write means it's used for something else. */
758 if (CPUMGetGuestCPL(pVCpu) != 0)
759 return true;
760
761 switch (pDis->pCurInstr->uOpcode)
762 {
763 /* call implies the actual push of the return address faulted */
764 case OP_CALL:
765 Log4(("pgmPoolMonitorIsReused: CALL\n"));
766 return true;
767 case OP_PUSH:
768 Log4(("pgmPoolMonitorIsReused: PUSH\n"));
769 return true;
770 case OP_PUSHF:
771 Log4(("pgmPoolMonitorIsReused: PUSHF\n"));
772 return true;
773 case OP_PUSHA:
774 Log4(("pgmPoolMonitorIsReused: PUSHA\n"));
775 return true;
776 case OP_FXSAVE:
777 Log4(("pgmPoolMonitorIsReused: FXSAVE\n"));
778 return true;
779 case OP_MOVNTI: /* solaris - block_zero_no_xmm */
780 Log4(("pgmPoolMonitorIsReused: MOVNTI\n"));
781 return true;
782 case OP_MOVNTDQ: /* solaris - hwblkclr & hwblkpagecopy */
783 Log4(("pgmPoolMonitorIsReused: MOVNTDQ\n"));
784 return true;
785 case OP_MOVSWD:
786 case OP_STOSWD:
787 if ( pDis->fPrefix == (DISPREFIX_REP|DISPREFIX_REX)
788 && pRegFrame->rcx >= 0x40
789 )
790 {
791 Assert(pDis->uCpuMode == DISCPUMODE_64BIT);
792
793 Log(("pgmPoolMonitorIsReused: OP_STOSQ\n"));
794 return true;
795 }
796 return false;
797 }
798 if ( ( (pDis->Param1.fUse & DISUSE_REG_GEN32)
799 || (pDis->Param1.fUse & DISUSE_REG_GEN64))
800 && (pDis->Param1.Base.idxGenReg == DISGREG_ESP))
801 {
802 Log4(("pgmPoolMonitorIsReused: ESP\n"));
803 return true;
804 }
805
806 return false;
807}
808
809
810/**
811 * Flushes the page being accessed.
812 *
813 * @returns VBox status code suitable for scheduling.
814 * @param pVM Pointer to the VM.
815 * @param pVCpu Pointer to the VMCPU.
816 * @param pPool The pool.
817 * @param pPage The pool page (head).
818 * @param pDis The disassembly of the write instruction.
819 * @param pRegFrame The trap register frame.
820 * @param GCPhysFault The fault address as guest physical address.
821 * @param pvFault The fault address.
822 * @todo VBOXSTRICTRC
823 */
824static int pgmPoolAccessHandlerFlush(PVM pVM, PVMCPU pVCpu, PPGMPOOL pPool, PPGMPOOLPAGE pPage, PDISCPUSTATE pDis,
825 PCPUMCTXCORE pRegFrame, RTGCPHYS GCPhysFault, RTGCPTR pvFault)
826{
827 NOREF(GCPhysFault);
828
829 /*
830 * First, do the flushing.
831 */
832 int rc = pgmPoolMonitorChainFlush(pPool, pPage);
833
834 /*
835 * Emulate the instruction (xp/w2k problem, requires pc/cr2/sp detection).
836 * Must do this in raw mode (!); XP boot will fail otherwise.
837 */
838 VBOXSTRICTRC rc2 = EMInterpretInstructionDisasState(pVCpu, pDis, pRegFrame, pvFault, EMCODETYPE_ALL);
839 if (rc2 == VINF_SUCCESS)
840 { /* do nothing */ }
841#ifdef VBOX_WITH_IEM
842 else if (rc2 == VINF_EM_RESCHEDULE)
843 {
844 if (rc == VINF_SUCCESS)
845 rc = rc2;
846# ifndef IN_RING3
847 VMCPU_FF_SET(pVCpu, VMCPU_FF_TO_R3);
848# endif
849 }
850#endif
851 else if (rc2 == VERR_EM_INTERPRETER)
852 {
853#ifdef IN_RC
854 if (PATMIsPatchGCAddr(pVM, pRegFrame->eip))
855 {
856 LogFlow(("pgmPoolAccessHandlerPTWorker: Interpretation failed for patch code %04x:%RGv, ignoring.\n",
857 pRegFrame->cs.Sel, (RTGCPTR)pRegFrame->eip));
858 rc = VINF_SUCCESS;
859 STAM_COUNTER_INC(&pPool->StatMonitorRZIntrFailPatch2);
860 }
861 else
862#endif
863 {
864 rc = VINF_EM_RAW_EMULATE_INSTR;
865 STAM_COUNTER_INC(&pPool->CTX_MID_Z(StatMonitor,EmulateInstr));
866 }
867 }
868 else if (RT_FAILURE_NP(rc2))
869 rc = VBOXSTRICTRC_VAL(rc2);
870 else
871 AssertMsgFailed(("%Rrc\n", VBOXSTRICTRC_VAL(rc2))); /* ASSUMES no complicated stuff here. */
872
873 LogFlow(("pgmPoolAccessHandlerPT: returns %Rrc (flushed)\n", rc));
874 return rc;
875}
876
877
878/**
879 * Handles the STOSD write accesses.
880 *
881 * @returns VBox status code suitable for scheduling.
882 * @param pVM Pointer to the VM.
883 * @param pPool The pool.
884 * @param pPage The pool page (head).
885 * @param pDis The disassembly of the write instruction.
886 * @param pRegFrame The trap register frame.
887 * @param GCPhysFault The fault address as guest physical address.
888 * @param pvFault The fault address.
889 */
890DECLINLINE(int) pgmPoolAccessHandlerSTOSD(PVM pVM, PPGMPOOL pPool, PPGMPOOLPAGE pPage, PDISCPUSTATE pDis,
891 PCPUMCTXCORE pRegFrame, RTGCPHYS GCPhysFault, RTGCPTR pvFault)
892{
893 unsigned uIncrement = pDis->Param1.cb;
894 NOREF(pVM);
895
896 Assert(pDis->uCpuMode == DISCPUMODE_32BIT || pDis->uCpuMode == DISCPUMODE_64BIT);
897 Assert(pRegFrame->rcx <= 0x20);
898
899#ifdef VBOX_STRICT
900 if (pDis->uOpMode == DISCPUMODE_32BIT)
901 Assert(uIncrement == 4);
902 else
903 Assert(uIncrement == 8);
904#endif
905
906 Log3(("pgmPoolAccessHandlerSTOSD\n"));
907
908 /*
909 * Increment the modification counter and insert it into the list
910 * of modified pages the first time.
911 */
912 if (!pPage->cModifications++)
913 pgmPoolMonitorModifiedInsert(pPool, pPage);
914
915 /*
916 * Execute REP STOSD.
917 *
918 * This ASSUMES that we're not invoked by Trap0e on in a out-of-sync
919 * write situation, meaning that it's safe to write here.
920 */
921 PVMCPU pVCpu = VMMGetCpu(pPool->CTX_SUFF(pVM));
922 RTGCUINTPTR pu32 = (RTGCUINTPTR)pvFault;
923 while (pRegFrame->rcx)
924 {
925#if defined(VBOX_WITH_2X_4GB_ADDR_SPACE_IN_R0) || defined(IN_RC)
926 uint32_t iPrevSubset = PGMRZDynMapPushAutoSubset(pVCpu);
927 pgmPoolMonitorChainChanging(pVCpu, pPool, pPage, GCPhysFault, (RTGCPTR)pu32, uIncrement);
928 PGMRZDynMapPopAutoSubset(pVCpu, iPrevSubset);
929#else
930 pgmPoolMonitorChainChanging(pVCpu, pPool, pPage, GCPhysFault, (RTGCPTR)pu32, uIncrement);
931#endif
932#ifdef IN_RC
933 *(uint32_t *)(uintptr_t)pu32 = pRegFrame->eax;
934#else
935 PGMPhysSimpleWriteGCPhys(pVM, GCPhysFault, &pRegFrame->rax, uIncrement);
936#endif
937 pu32 += uIncrement;
938 GCPhysFault += uIncrement;
939 pRegFrame->rdi += uIncrement;
940 pRegFrame->rcx--;
941 }
942 pRegFrame->rip += pDis->cbInstr;
943
944 LogFlow(("pgmPoolAccessHandlerSTOSD: returns\n"));
945 return VINF_SUCCESS;
946}
947
948
949/**
950 * Handles the simple write accesses.
951 *
952 * @returns VBox status code suitable for scheduling.
953 * @param pVM Pointer to the VM.
954 * @param pVCpu Pointer to the VMCPU.
955 * @param pPool The pool.
956 * @param pPage The pool page (head).
957 * @param pDis The disassembly of the write instruction.
958 * @param pRegFrame The trap register frame.
959 * @param GCPhysFault The fault address as guest physical address.
960 * @param pvFault The fault address.
961 * @param pfReused Reused state (in/out)
962 */
963DECLINLINE(int) pgmPoolAccessHandlerSimple(PVM pVM, PVMCPU pVCpu, PPGMPOOL pPool, PPGMPOOLPAGE pPage, PDISCPUSTATE pDis,
964 PCPUMCTXCORE pRegFrame, RTGCPHYS GCPhysFault, RTGCPTR pvFault, bool *pfReused)
965{
966 Log3(("pgmPoolAccessHandlerSimple\n"));
967 NOREF(pfReused); /* initialized by caller */
968
969 /*
970 * Increment the modification counter and insert it into the list
971 * of modified pages the first time.
972 */
973 if (!pPage->cModifications++)
974 pgmPoolMonitorModifiedInsert(pPool, pPage);
975
976 /*
977 * Clear all the pages. ASSUMES that pvFault is readable.
978 */
979#if defined(VBOX_WITH_2X_4GB_ADDR_SPACE_IN_R0) || defined(IN_RC)
980 uint32_t iPrevSubset = PGMRZDynMapPushAutoSubset(pVCpu);
981 pgmPoolMonitorChainChanging(pVCpu, pPool, pPage, GCPhysFault, pvFault, DISGetParamSize(pDis, &pDis->Param1));
982 PGMRZDynMapPopAutoSubset(pVCpu, iPrevSubset);
983#else
984 pgmPoolMonitorChainChanging(pVCpu, pPool, pPage, GCPhysFault, pvFault, DISGetParamSize(pDis, &pDis->Param1));
985#endif
986
987 /*
988 * Interpret the instruction.
989 */
990 VBOXSTRICTRC rc = EMInterpretInstructionDisasState(pVCpu, pDis, pRegFrame, pvFault, EMCODETYPE_ALL);
991 if (RT_SUCCESS(rc))
992 AssertMsg(rc == VINF_SUCCESS, ("%Rrc\n", VBOXSTRICTRC_VAL(rc))); /* ASSUMES no complicated stuff here. */
993 else if (rc == VERR_EM_INTERPRETER)
994 {
995 LogFlow(("pgmPoolAccessHandlerPTWorker: Interpretation failed for %04x:%RGv - opcode=%d\n",
996 pRegFrame->cs.Sel, (RTGCPTR)pRegFrame->rip, pDis->pCurInstr->uOpcode));
997 rc = VINF_EM_RAW_EMULATE_INSTR;
998 STAM_COUNTER_INC(&pPool->CTX_MID_Z(StatMonitor,EmulateInstr));
999 }
1000
1001#if 0 /* experimental code */
1002 if (rc == VINF_SUCCESS)
1003 {
1004 switch (pPage->enmKind)
1005 {
1006 case PGMPOOLKIND_PAE_PT_FOR_PAE_PT:
1007 {
1008 X86PTEPAE GstPte;
1009 int rc = pgmPoolPhysSimpleReadGCPhys(pVM, &GstPte, pvFault, GCPhysFault, sizeof(GstPte));
1010 AssertRC(rc);
1011
1012 /* Check the new value written by the guest. If present and with a bogus physical address, then
1013 * it's fairly safe to assume the guest is reusing the PT.
1014 */
1015 if (GstPte.n.u1Present)
1016 {
1017 RTHCPHYS HCPhys = -1;
1018 int rc = PGMPhysGCPhys2HCPhys(pVM, GstPte.u & X86_PTE_PAE_PG_MASK, &HCPhys);
1019 if (rc != VINF_SUCCESS)
1020 {
1021 *pfReused = true;
1022 STAM_COUNTER_INC(&pPool->StatForceFlushReused);
1023 }
1024 }
1025 break;
1026 }
1027 }
1028 }
1029#endif
1030
1031 LogFlow(("pgmPoolAccessHandlerSimple: returns %Rrc\n", VBOXSTRICTRC_VAL(rc)));
1032 return VBOXSTRICTRC_VAL(rc);
1033}
1034
1035
1036/**
1037 * \#PF Handler callback for PT write accesses.
1038 *
1039 * @returns VBox status code (appropriate for GC return).
1040 * @param pVM Pointer to the VM.
1041 * @param uErrorCode CPU Error code.
1042 * @param pRegFrame Trap register frame.
1043 * NULL on DMA and other non CPU access.
1044 * @param pvFault The fault address (cr2).
1045 * @param GCPhysFault The GC physical address corresponding to pvFault.
1046 * @param pvUser User argument.
 *               Cast to the pool page (PPGMPOOLPAGE) being monitored.
 *
 * @remarks Takes the PGM lock right after the entry logging and releases it
 *          on every return path.
 * @remarks Outcome is one of: the write is ignored (page changed or already
 *          dirty), the instruction is interpreted (simple write), a small
 *          REP STOS is emulated, the page is put on the dirty list with write
 *          monitoring temporarily off (ring-0 only), or the page is flushed.
 * @remarks NOTE(review): pRegFrame is dereferenced unconditionally below
 *          (pRegFrame->rip) even though the parameter note above allows
 *          NULL - confirm that this handler is never invoked with NULL.
1047 */
1048DECLEXPORT(int) pgmPoolAccessHandler(PVM pVM, RTGCUINT uErrorCode, PCPUMCTXCORE pRegFrame, RTGCPTR pvFault,
1049 RTGCPHYS GCPhysFault, void *pvUser)
1050{
1051 STAM_PROFILE_START(&pVM->pgm.s.CTX_SUFF(pPool)->CTX_SUFF_Z(StatMonitor), a);
1052 PPGMPOOL pPool = pVM->pgm.s.CTX_SUFF(pPool);
1053 PPGMPOOLPAGE pPage = (PPGMPOOLPAGE)pvUser;
1054 PVMCPU pVCpu = VMMGetCpu(pVM);
1055 unsigned cMaxModifications;
1056 bool fForcedFlush = false;
1057 NOREF(uErrorCode);
1058
1059 LogFlow(("pgmPoolAccessHandler: pvFault=%RGv pPage=%p:{.idx=%d} GCPhysFault=%RGp\n", pvFault, pPage, pPage->idx, GCPhysFault));
1060
    /* Everything from here on runs with the PGM lock held. */
1061 pgmLock(pVM);
1062 if (PHYS_PAGE_ADDRESS(GCPhysFault) != PHYS_PAGE_ADDRESS(pPage->GCPhys))
1063 {
1064 /* Pool page changed while we were waiting for the lock; ignore. */
1065 Log(("CPU%d: pgmPoolAccessHandler pgm pool page for %RGp changed (to %RGp) while waiting!\n", pVCpu->idCpu, PHYS_PAGE_ADDRESS(GCPhysFault), PHYS_PAGE_ADDRESS(pPage->GCPhys)));
1066 STAM_PROFILE_STOP_EX(&pVM->pgm.s.CTX_SUFF(pPool)->CTX_SUFF_Z(StatMonitor), &pPool->CTX_MID_Z(StatMonitor,Handled), a);
1067 pgmUnlock(pVM);
1068 return VINF_SUCCESS;
1069 }
1070#ifdef PGMPOOL_WITH_OPTIMIZED_DIRTY_PT
1071 if (pPage->fDirty)
1072 {
1073 Assert(VMCPU_FF_ISSET(pVCpu, VMCPU_FF_TLB_FLUSH));
1074 pgmUnlock(pVM);
1075 return VINF_SUCCESS; /* SMP guest case where we were blocking on the pgm lock while the same page was being marked dirty. */
1076 }
1077#endif
1078
1079#if 0 /* test code defined(VBOX_STRICT) && defined(PGMPOOL_WITH_OPTIMIZED_DIRTY_PT) */
1080 if (pPage->enmKind == PGMPOOLKIND_PAE_PT_FOR_PAE_PT)
1081 {
1082 void *pvShw = PGMPOOL_PAGE_2_PTR(pPool->CTX_SUFF(pVM), pPage);
1083 void *pvGst;
1084 int rc = PGM_GCPHYS_2_PTR(pPool->CTX_SUFF(pVM), pPage->GCPhys, &pvGst); AssertReleaseRC(rc);
1085 pgmPoolTrackCheckPTPaePae(pPool, pPage, (PPGMSHWPTPAE)pvShw, (PCX86PTPAE)pvGst);
1086 PGM_DYNMAP_UNUSED_HINT_VM(pVM, pvGst);
1087 PGM_DYNMAP_UNUSED_HINT_VM(pVM, pvShw);
1088 }
1089#endif
1090
1091 /*
1092 * Disassemble the faulting instruction.
1093 */
1094 PDISCPUSTATE pDis = &pVCpu->pgm.s.DisState;
1095 int rc = EMInterpretDisasCurrent(pVM, pVCpu, pDis, NULL);
1096 if (RT_UNLIKELY(rc != VINF_SUCCESS))
1097 {
1098 AssertMsg(rc == VERR_PAGE_NOT_PRESENT || rc == VERR_PAGE_TABLE_NOT_PRESENT, ("Unexpected rc %d\n", rc));
1099 pgmUnlock(pVM);
1100 return rc;
1101 }
1102
1103 Assert(pPage->enmKind != PGMPOOLKIND_FREE);
1104
1105 /*
1106 * We should ALWAYS have the list head as user parameter. This
1107 * is because we use that page to record the changes.
1108 */
1109 Assert(pPage->iMonitoredPrev == NIL_PGMPOOL_IDX);
1110
1111#ifdef IN_RING0
1112 /* Maximum nr of modifications depends on the page type. */
1113 if ( pPage->enmKind == PGMPOOLKIND_PAE_PT_FOR_PAE_PT
1114 || pPage->enmKind == PGMPOOLKIND_PAE_PT_FOR_32BIT_PT)
1115 cMaxModifications = 4;
1116 else
1117 cMaxModifications = 24;
1118#else
    /* Outside ring-0 a single fixed limit is used for all page kinds. */
1119 cMaxModifications = 48;
1120#endif
1121
1122 /*
1123 * Incremental page table updates should weigh more than random ones.
1124 * (Only applies when started from offset 0)
1125 */
1126 pVCpu->pgm.s.cPoolAccessHandler++;
    /* The heuristic matches when: the faulting RIP lies within 0x40 bytes of
     * the RIP recorded in the page, the fault address is exactly one operand
     * size past the recorded fault address, and this VCPU's handler call
     * counter is exactly one above the value recorded in the page.  On a
     * match the modification counter is doubled. */
1127 if ( pPage->GCPtrLastAccessHandlerRip >= pRegFrame->rip - 0x40 /* observed loops in Windows 7 x64 */
1128 && pPage->GCPtrLastAccessHandlerRip < pRegFrame->rip + 0x40
1129 && pvFault == (pPage->GCPtrLastAccessHandlerFault + pDis->Param1.cb)
1130 && pVCpu->pgm.s.cPoolAccessHandler == pPage->cLastAccessHandler + 1)
1131 {
1132 Log(("Possible page reuse cMods=%d -> %d (locked=%d type=%s)\n", pPage->cModifications, pPage->cModifications * 2, pgmPoolIsPageLocked(pPage), pgmPoolPoolKindToStr(pPage->enmKind)));
1133 Assert(pPage->cModifications < 32000);
1134 pPage->cModifications = pPage->cModifications * 2;
1135 pPage->GCPtrLastAccessHandlerFault = pvFault;
1136 pPage->cLastAccessHandler = pVCpu->pgm.s.cPoolAccessHandler;
1137 if (pPage->cModifications >= cMaxModifications)
1138 {
1139 STAM_COUNTER_INC(&pPool->CTX_MID_Z(StatMonitor,FlushReinit));
1140 fForcedFlush = true;
1141 }
1142 }
1143
1144 if (pPage->cModifications >= cMaxModifications)
1145 Log(("Mod overflow %RGv cMods=%d (locked=%d type=%s)\n", pvFault, pPage->cModifications, pgmPoolIsPageLocked(pPage), pgmPoolPoolKindToStr(pPage->enmKind)));
1146
1147 /*
1148 * Check if it's worth dealing with.
1149 */
1150 bool fReused = false;
1151 bool fNotReusedNotForking = false;
    /* Note: fReused is assigned inside the condition and therefore only gets
     * updated when the first clause (below the limit, or page locked) holds. */
1152 if ( ( pPage->cModifications < cMaxModifications /** @todo #define */ /** @todo need to check that it's not mapping EIP. */ /** @todo adjust this! */
1153 || pgmPoolIsPageLocked(pPage)
1154 )
1155 && !(fReused = pgmPoolMonitorIsReused(pVM, pVCpu, pRegFrame, pDis, pvFault))
1156 && !pgmPoolMonitorIsForking(pPool, pDis, GCPhysFault & PAGE_OFFSET_MASK))
1157 {
1158 /*
1159 * Simple instructions, no REP prefix.
1160 */
1161 if (!(pDis->fPrefix & (DISPREFIX_REP | DISPREFIX_REPNE)))
1162 {
1163 rc = pgmPoolAccessHandlerSimple(pVM, pVCpu, pPool, pPage, pDis, pRegFrame, GCPhysFault, pvFault, &fReused);
1164 if (fReused)
1165 goto flushPage;
1166
1167 /* A mov instruction to change the first page table entry will be remembered so we can detect
1168 * full page table changes early on. This will reduce the amount of unnecessary traps we'll take.
1169 */
1170 if ( rc == VINF_SUCCESS
1171 && !pPage->cLocked /* only applies to unlocked pages as we can't free locked ones (e.g. cr3 root). */
1172 && pDis->pCurInstr->uOpcode == OP_MOV
1173 && (pvFault & PAGE_OFFSET_MASK) == 0)
1174 {
1175 pPage->GCPtrLastAccessHandlerFault = pvFault;
1176 pPage->cLastAccessHandler = pVCpu->pgm.s.cPoolAccessHandler;
1177 pPage->GCPtrLastAccessHandlerRip = pRegFrame->rip;
1178 /* Make sure we don't kick out a page too quickly. */
1179 if (pPage->cModifications > 8)
1180 pPage->cModifications = 2;
1181 }
1182 else if (pPage->GCPtrLastAccessHandlerFault == pvFault)
1183 {
1184 /* ignore the 2nd write to this page table entry. */
1185 pPage->cLastAccessHandler = pVCpu->pgm.s.cPoolAccessHandler;
1186 }
1187 else
1188 {
1189 pPage->GCPtrLastAccessHandlerFault = NIL_RTGCPTR;
1190 pPage->GCPtrLastAccessHandlerRip = 0;
1191 }
1192
1193 STAM_PROFILE_STOP_EX(&pVM->pgm.s.CTX_SUFF(pPool)->CTX_SUFF_Z(StatMonitor), &pPool->CTX_MID_Z(StatMonitor,Handled), a);
1194 pgmUnlock(pVM);
1195 return rc;
1196 }
1197
1198 /*
1199 * Windows is frequently doing small memset() operations (netio test 4k+).
1200 * We have to deal with these or we'll kill the cache and performance.
1201 */
1202 if ( pDis->pCurInstr->uOpcode == OP_STOSWD
1203 && !pRegFrame->eflags.Bits.u1DF
1204 && pDis->uOpMode == pDis->uCpuMode
1205 && pDis->uAddrMode == pDis->uCpuMode)
1206 {
1207 bool fValidStosd = false;
1208
        /* 32-bit form: REP is the only prefix, at most 0x20 dwords, the whole
         * store stays inside the faulting page, and the address is dword aligned. */
1209 if ( pDis->uCpuMode == DISCPUMODE_32BIT
1210 && pDis->fPrefix == DISPREFIX_REP
1211 && pRegFrame->ecx <= 0x20
1212 && pRegFrame->ecx * 4 <= PAGE_SIZE - ((uintptr_t)pvFault & PAGE_OFFSET_MASK)
1213 && !((uintptr_t)pvFault & 3)
1214 && (pRegFrame->eax == 0 || pRegFrame->eax == 0x80) /* the two values observed. */
1215 )
1216 {
1217 fValidStosd = true;
1218 pRegFrame->rcx &= 0xffffffff; /* paranoia */
1219 }
1220 else
        /* 64-bit form: same checks with REP+REX prefixes, qword size and alignment. */
1221 if ( pDis->uCpuMode == DISCPUMODE_64BIT
1222 && pDis->fPrefix == (DISPREFIX_REP | DISPREFIX_REX)
1223 && pRegFrame->rcx <= 0x20
1224 && pRegFrame->rcx * 8 <= PAGE_SIZE - ((uintptr_t)pvFault & PAGE_OFFSET_MASK)
1225 && !((uintptr_t)pvFault & 7)
1226 && (pRegFrame->rax == 0 || pRegFrame->rax == 0x80) /* the two values observed. */
1227 )
1228 {
1229 fValidStosd = true;
1230 }
1231
1232 if (fValidStosd)
1233 {
1234 rc = pgmPoolAccessHandlerSTOSD(pVM, pPool, pPage, pDis, pRegFrame, GCPhysFault, pvFault);
1235 STAM_PROFILE_STOP_EX(&pVM->pgm.s.CTX_SUFF(pPool)->CTX_SUFF_Z(StatMonitor), &pPool->CTX_MID_Z(StatMonitor,RepStosd), a);
1236 pgmUnlock(pVM);
1237 return rc;
1238 }
1239 }
1240
1241 /* REP prefix, don't bother. */
1242 STAM_COUNTER_INC(&pPool->CTX_MID_Z(StatMonitor,RepPrefix));
1243 Log4(("pgmPoolAccessHandler: eax=%#x ecx=%#x edi=%#x esi=%#x rip=%RGv opcode=%d prefix=%#x\n",
1244 pRegFrame->eax, pRegFrame->ecx, pRegFrame->edi, pRegFrame->esi, (RTGCPTR)pRegFrame->rip, pDis->pCurInstr->uOpcode, pDis->fPrefix));
        /* Remember that the reuse and forking checks already came out negative
         * so the dirty page code below need not repeat them. */
1245 fNotReusedNotForking = true;
1246 }
1247
1248#if defined(PGMPOOL_WITH_OPTIMIZED_DIRTY_PT) && defined(IN_RING0)
1249 /* E.g. Windows 7 x64 initializes page tables and touches some pages in the table during the process. This
1250 * leads to pgm pool trashing and an excessive amount of write faults due to page monitoring.
1251 */
1252 if ( pPage->cModifications >= cMaxModifications
1253 && !fForcedFlush
1254 && (pPage->enmKind == PGMPOOLKIND_PAE_PT_FOR_PAE_PT || pPage->enmKind == PGMPOOLKIND_PAE_PT_FOR_32BIT_PT)
1255 && ( fNotReusedNotForking
1256 || ( !pgmPoolMonitorIsReused(pVM, pVCpu, pRegFrame, pDis, pvFault)
1257 && !pgmPoolMonitorIsForking(pPool, pDis, GCPhysFault & PAGE_OFFSET_MASK))
1258 )
1259 )
1260 {
1261 Assert(!pgmPoolIsPageLocked(pPage));
1262 Assert(pPage->fDirty == false);
1263
1264 /* Flush any monitored duplicates as we will disable write protection. */
1265 if ( pPage->iMonitoredNext != NIL_PGMPOOL_IDX
1266 || pPage->iMonitoredPrev != NIL_PGMPOOL_IDX)
1267 {
1268 PPGMPOOLPAGE pPageHead = pPage;
1269
1270 /* Find the monitor head. */
1271 while (pPageHead->iMonitoredPrev != NIL_PGMPOOL_IDX)
1272 pPageHead = &pPool->aPages[pPageHead->iMonitoredPrev];
1273
            /* Walk the chain from the head; the next index is fetched before
             * the current entry is flushed. */
1274 while (pPageHead)
1275 {
1276 unsigned idxNext = pPageHead->iMonitoredNext;
1277
1278 if (pPageHead != pPage)
1279 {
1280 STAM_COUNTER_INC(&pPool->StatDirtyPageDupFlush);
1281 Log(("Flush duplicate page idx=%d GCPhys=%RGp type=%s\n", pPageHead->idx, pPageHead->GCPhys, pgmPoolPoolKindToStr(pPageHead->enmKind)));
1282 int rc2 = pgmPoolFlushPage(pPool, pPageHead);
1283 AssertRC(rc2);
1284 }
1285
1286 if (idxNext == NIL_PGMPOOL_IDX)
1287 break;
1288
1289 pPageHead = &pPool->aPages[idxNext];
1290 }
1291 }
1292
1293 /* The flushing above might fail for locked pages, so double check. */
1294 if ( pPage->iMonitoredNext == NIL_PGMPOOL_IDX
1295 && pPage->iMonitoredPrev == NIL_PGMPOOL_IDX)
1296 {
1297 pgmPoolAddDirtyPage(pVM, pPool, pPage);
1298
1299 /* Temporarily allow write access to the page table again. */
1300 rc = PGMHandlerPhysicalPageTempOff(pVM, pPage->GCPhys & PAGE_BASE_GC_MASK, pPage->GCPhys & PAGE_BASE_GC_MASK);
1301 if (rc == VINF_SUCCESS)
1302 {
1303 rc = PGMShwMakePageWritable(pVCpu, pvFault, PGM_MK_PG_IS_WRITE_FAULT);
1304 AssertMsg(rc == VINF_SUCCESS
1305 /* In the SMP case the page table might be removed while we wait for the PGM lock in the trap handler. */
1306 || rc == VERR_PAGE_TABLE_NOT_PRESENT
1307 || rc == VERR_PAGE_NOT_PRESENT,
1308 ("PGMShwModifyPage -> GCPtr=%RGv rc=%d\n", pvFault, rc));
1309# ifdef VBOX_STRICT
1310 pPage->GCPtrDirtyFault = pvFault;
1311# endif
1312
1313 STAM_PROFILE_STOP(&pVM->pgm.s.CTX_SUFF(pPool)->CTX_SUFF_Z(StatMonitor), a);
1314 pgmUnlock(pVM);
1315 return rc;
1316 }
            /* If the handler could not be switched off we fall through to the flush below. */
1317 }
1318 }
1319#endif /* PGMPOOL_WITH_OPTIMIZED_DIRTY_PT */
1320
    /* Reached when none of the paths above returned; a 'goto flushPage' skips this counter. */
1321 STAM_COUNTER_INC(&pPool->CTX_MID_Z(StatMonitor,FlushModOverflow));
1322flushPage:
1323 /*
1324 * Not worth it, so flush it.
1325 *
1326 * If we considered it to be reused, don't go back to ring-3
1327 * to emulate failed instructions since we usually cannot
1328 * interpret then. This may be a bit risky, in which case
1329 * the reuse detection must be fixed.
1330 */
1331 rc = pgmPoolAccessHandlerFlush(pVM, pVCpu, pPool, pPage, pDis, pRegFrame, GCPhysFault, pvFault);
1332 if ( rc == VINF_EM_RAW_EMULATE_INSTR
1333 && fReused)
1334 {
1335 /* Make sure that the current instruction still has shadow page backing, otherwise we'll end up in a loop. */
1336 if (PGMShwGetPage(pVCpu, pRegFrame->rip, NULL, NULL) == VINF_SUCCESS)
1337 rc = VINF_SUCCESS; /* safe to restart the instruction. */
1338 }
1339 STAM_PROFILE_STOP_EX(&pVM->pgm.s.CTX_SUFF(pPool)->CTX_SUFF_Z(StatMonitor), &pPool->CTX_MID_Z(StatMonitor,FlushPage), a);
1340 pgmUnlock(pVM);
1341 return rc;
1342}
1343
1344# endif /* !IN_RING3 */
1345
1346# ifdef PGMPOOL_WITH_OPTIMIZED_DIRTY_PT
1347
1348# if defined(VBOX_STRICT) && !defined(IN_RING3)
1349
1350/**
1351 * Check references to guest physical memory in a PAE / PAE page table.
1352 *
1353 * @param pPool The pool.
1354 * @param pPage The page.
1355 * @param pShwPT The shadow page table (mapping of the page).
1356 * @param pGstPT The guest page table.
1357 */
1358static void pgmPoolTrackCheckPTPaePae(PPGMPOOL pPool, PPGMPOOLPAGE pPage, PPGMSHWPTPAE pShwPT, PCX86PTPAE pGstPT)
1359{
1360 unsigned cErrors = 0;
1361 int LastRc = -1; /* initialized to shut up gcc */
1362 unsigned LastPTE = ~0U; /* initialized to shut up gcc */
1363 RTHCPHYS LastHCPhys = NIL_RTHCPHYS; /* initialized to shut up gcc */
1364 PVM pVM = pPool->CTX_SUFF(pVM);
1365
1366#ifdef VBOX_STRICT
1367 for (unsigned i = 0; i < RT_MIN(RT_ELEMENTS(pShwPT->a), pPage->iFirstPresent); i++)
1368 AssertMsg(!PGMSHWPTEPAE_IS_P(pShwPT->a[i]), ("Unexpected PTE: idx=%d %RX64 (first=%d)\n", i, PGMSHWPTEPAE_GET_LOG(pShwPT->a[i]), pPage->iFirstPresent));
1369#endif
1370 for (unsigned i = pPage->iFirstPresent; i < RT_ELEMENTS(pShwPT->a); i++)
1371 {
1372 if (PGMSHWPTEPAE_IS_P(pShwPT->a[i]))
1373 {
1374 RTHCPHYS HCPhys = NIL_RTHCPHYS;
1375 int rc = PGMPhysGCPhys2HCPhys(pVM, pGstPT->a[i].u & X86_PTE_PAE_PG_MASK, &HCPhys);
1376 if ( rc != VINF_SUCCESS
1377 || PGMSHWPTEPAE_GET_HCPHYS(pShwPT->a[i]) != HCPhys)
1378 {
1379 Log(("rc=%d idx=%d guest %RX64 shw=%RX64 vs %RHp\n", rc, i, pGstPT->a[i].u, PGMSHWPTEPAE_GET_LOG(pShwPT->a[i]), HCPhys));
1380 LastPTE = i;
1381 LastRc = rc;
1382 LastHCPhys = HCPhys;
1383 cErrors++;
1384
1385 RTHCPHYS HCPhysPT = NIL_RTHCPHYS;
1386 rc = PGMPhysGCPhys2HCPhys(pVM, pPage->GCPhys, &HCPhysPT);
1387 AssertRC(rc);
1388
1389 for (unsigned iPage = 0; iPage < pPool->cCurPages; iPage++)
1390 {
1391 PPGMPOOLPAGE pTempPage = &pPool->aPages[iPage];
1392
1393 if (pTempPage->enmKind == PGMPOOLKIND_PAE_PT_FOR_PAE_PT)
1394 {
1395 PPGMSHWPTPAE pShwPT2 = (PPGMSHWPTPAE)PGMPOOL_PAGE_2_PTR(pVM, pTempPage);
1396
1397 for (unsigned j = 0; j < RT_ELEMENTS(pShwPT->a); j++)
1398 {
1399 if ( PGMSHWPTEPAE_IS_P_RW(pShwPT2->a[j])
1400 && PGMSHWPTEPAE_GET_HCPHYS(pShwPT2->a[j]) == HCPhysPT)
1401 {
1402 Log(("GCPhys=%RGp idx=%d %RX64 vs %RX64\n", pTempPage->GCPhys, j, PGMSHWPTEPAE_GET_LOG(pShwPT->a[j]), PGMSHWPTEPAE_GET_LOG(pShwPT2->a[j])));
1403 }
1404 }
1405
1406 PGM_DYNMAP_UNUSED_HINT_VM(pVM, pShwPT2);
1407 }
1408 }
1409 }
1410 }
1411 }
1412 AssertMsg(!cErrors, ("cErrors=%d: last rc=%d idx=%d guest %RX64 shw=%RX64 vs %RHp\n", cErrors, LastRc, LastPTE, pGstPT->a[LastPTE].u, PGMSHWPTEPAE_GET_LOG(pShwPT->a[LastPTE]), LastHCPhys));
1413}
1414
1415
1416/**
1417 * Check references to guest physical memory in a PAE / 32-bit page table.
1418 *
1419 * @param pPool The pool.
1420 * @param pPage The page.
1421 * @param pShwPT The shadow page table (mapping of the page).
1422 * @param pGstPT The guest page table.
1423 */
1424static void pgmPoolTrackCheckPTPae32Bit(PPGMPOOL pPool, PPGMPOOLPAGE pPage, PPGMSHWPTPAE pShwPT, PCX86PT pGstPT)
1425{
1426 unsigned cErrors = 0;
1427 int LastRc = -1; /* initialized to shut up gcc */
1428 unsigned LastPTE = ~0U; /* initialized to shut up gcc */
1429 RTHCPHYS LastHCPhys = NIL_RTHCPHYS; /* initialized to shut up gcc */
1430 PVM pVM = pPool->CTX_SUFF(pVM);
1431
1432#ifdef VBOX_STRICT
1433 for (unsigned i = 0; i < RT_MIN(RT_ELEMENTS(pShwPT->a), pPage->iFirstPresent); i++)
1434 AssertMsg(!PGMSHWPTEPAE_IS_P(pShwPT->a[i]), ("Unexpected PTE: idx=%d %RX64 (first=%d)\n", i, PGMSHWPTEPAE_GET_LOG(pShwPT->a[i]), pPage->iFirstPresent));
1435#endif
1436 for (unsigned i = pPage->iFirstPresent; i < RT_ELEMENTS(pShwPT->a); i++)
1437 {
1438 if (PGMSHWPTEPAE_IS_P(pShwPT->a[i]))
1439 {
1440 RTHCPHYS HCPhys = NIL_RTHCPHYS;
1441 int rc = PGMPhysGCPhys2HCPhys(pVM, pGstPT->a[i].u & X86_PTE_PG_MASK, &HCPhys);
1442 if ( rc != VINF_SUCCESS
1443 || PGMSHWPTEPAE_GET_HCPHYS(pShwPT->a[i]) != HCPhys)
1444 {
1445 Log(("rc=%d idx=%d guest %x shw=%RX64 vs %RHp\n", rc, i, pGstPT->a[i].u, PGMSHWPTEPAE_GET_LOG(pShwPT->a[i]), HCPhys));
1446 LastPTE = i;
1447 LastRc = rc;
1448 LastHCPhys = HCPhys;
1449 cErrors++;
1450
1451 RTHCPHYS HCPhysPT = NIL_RTHCPHYS;
1452 rc = PGMPhysGCPhys2HCPhys(pVM, pPage->GCPhys, &HCPhysPT);
1453 AssertRC(rc);
1454
1455 for (unsigned iPage = 0; iPage < pPool->cCurPages; iPage++)
1456 {
1457 PPGMPOOLPAGE pTempPage = &pPool->aPages[iPage];
1458
1459 if (pTempPage->enmKind == PGMPOOLKIND_PAE_PT_FOR_32BIT_PT)
1460 {
1461 PPGMSHWPTPAE pShwPT2 = (PPGMSHWPTPAE)PGMPOOL_PAGE_2_PTR(pVM, pTempPage);
1462
1463 for (unsigned j = 0; j < RT_ELEMENTS(pShwPT->a); j++)
1464 {
1465 if ( PGMSHWPTEPAE_IS_P_RW(pShwPT2->a[j])
1466 && PGMSHWPTEPAE_GET_HCPHYS(pShwPT2->a[j]) == HCPhysPT)
1467 {
1468 Log(("GCPhys=%RGp idx=%d %RX64 vs %RX64\n", pTempPage->GCPhys, j, PGMSHWPTEPAE_GET_LOG(pShwPT->a[j]), PGMSHWPTEPAE_GET_LOG(pShwPT2->a[j])));
1469 }
1470 }
1471
1472 PGM_DYNMAP_UNUSED_HINT_VM(pVM, pShwPT2);
1473 }
1474 }
1475 }
1476 }
1477 }
1478 AssertMsg(!cErrors, ("cErrors=%d: last rc=%d idx=%d guest %x shw=%RX64 vs %RHp\n", cErrors, LastRc, LastPTE, pGstPT->a[LastPTE].u, PGMSHWPTEPAE_GET_LOG(pShwPT->a[LastPTE]), LastHCPhys));
1479}
1480
1481# endif /* VBOX_STRICT && !IN_RING3 */
1482
1483/**
1484 * Clear references to guest physical memory in a PAE / PAE page table.
1485 *
1486 * @returns nr of changed PTEs
1487 * @param pPool The pool.
1488 * @param pPage The page.
1489 * @param pShwPT The shadow page table (mapping of the page).
1490 * @param pGstPT The guest page table.
1491 * @param pOldGstPT The old cached guest page table.
1492 * @param fAllowRemoval Bail out as soon as we encounter an invalid PTE
1493 * @param pfFlush Flush reused page table (out)
1494 */
1495DECLINLINE(unsigned) pgmPoolTrackFlushPTPaePae(PPGMPOOL pPool, PPGMPOOLPAGE pPage, PPGMSHWPTPAE pShwPT, PCX86PTPAE pGstPT,
1496 PCX86PTPAE pOldGstPT, bool fAllowRemoval, bool *pfFlush)
1497{
1498 unsigned cChanged = 0;
1499
1500#ifdef VBOX_STRICT
1501 for (unsigned i = 0; i < RT_MIN(RT_ELEMENTS(pShwPT->a), pPage->iFirstPresent); i++)
1502 AssertMsg(!PGMSHWPTEPAE_IS_P(pShwPT->a[i]), ("Unexpected PTE: idx=%d %RX64 (first=%d)\n", i, PGMSHWPTEPAE_GET_LOG(pShwPT->a[i]), pPage->iFirstPresent));
1503#endif
1504 *pfFlush = false;
1505
1506 for (unsigned i = pPage->iFirstPresent; i < RT_ELEMENTS(pShwPT->a); i++)
1507 {
1508 /* Check the new value written by the guest. If present and with a bogus physical address, then
1509 * it's fairly safe to assume the guest is reusing the PT.
1510 */
1511 if ( fAllowRemoval
1512 && pGstPT->a[i].n.u1Present)
1513 {
1514 if (!PGMPhysIsGCPhysValid(pPool->CTX_SUFF(pVM), pGstPT->a[i].u & X86_PTE_PAE_PG_MASK))
1515 {
1516 *pfFlush = true;
1517 return ++cChanged;
1518 }
1519 }
1520 if (PGMSHWPTEPAE_IS_P(pShwPT->a[i]))
1521 {
1522 /* If the old cached PTE is identical, then there's no need to flush the shadow copy. */
1523 if ((pGstPT->a[i].u & X86_PTE_PAE_PG_MASK) == (pOldGstPT->a[i].u & X86_PTE_PAE_PG_MASK))
1524 {
1525#ifdef VBOX_STRICT
1526 RTHCPHYS HCPhys = NIL_RTGCPHYS;
1527 int rc = PGMPhysGCPhys2HCPhys(pPool->CTX_SUFF(pVM), pGstPT->a[i].u & X86_PTE_PAE_PG_MASK, &HCPhys);
1528 AssertMsg(rc == VINF_SUCCESS && PGMSHWPTEPAE_GET_HCPHYS(pShwPT->a[i]) == HCPhys, ("rc=%d guest %RX64 old %RX64 shw=%RX64 vs %RHp\n", rc, pGstPT->a[i].u, pOldGstPT->a[i].u, PGMSHWPTEPAE_GET_LOG(pShwPT->a[i]), HCPhys));
1529#endif
1530 uint64_t uHostAttr = PGMSHWPTEPAE_GET_U(pShwPT->a[i]) & (X86_PTE_P | X86_PTE_US | X86_PTE_A | X86_PTE_D | X86_PTE_G | X86_PTE_PAE_NX);
1531 bool fHostRW = !!(PGMSHWPTEPAE_GET_U(pShwPT->a[i]) & X86_PTE_RW);
1532 uint64_t uGuestAttr = pGstPT->a[i].u & (X86_PTE_P | X86_PTE_US | X86_PTE_A | X86_PTE_D | X86_PTE_G | X86_PTE_PAE_NX);
1533 bool fGuestRW = !!(pGstPT->a[i].u & X86_PTE_RW);
1534
1535 if ( uHostAttr == uGuestAttr
1536 && fHostRW <= fGuestRW)
1537 continue;
1538 }
1539 cChanged++;
1540 /* Something was changed, so flush it. */
1541 Log4(("pgmPoolTrackDerefPTPaePae: i=%d pte=%RX64 hint=%RX64\n",
1542 i, PGMSHWPTEPAE_GET_HCPHYS(pShwPT->a[i]), pOldGstPT->a[i].u & X86_PTE_PAE_PG_MASK));
1543 pgmPoolTracDerefGCPhysHint(pPool, pPage, PGMSHWPTEPAE_GET_HCPHYS(pShwPT->a[i]), pOldGstPT->a[i].u & X86_PTE_PAE_PG_MASK, i);
1544 PGMSHWPTEPAE_ATOMIC_SET(pShwPT->a[i], 0);
1545 }
1546 }
1547 return cChanged;
1548}
1549
1550
1551/**
1552 * Clear references to guest physical memory in a PAE / PAE page table.
1553 *
1554 * @returns nr of changed PTEs
1555 * @param pPool The pool.
1556 * @param pPage The page.
1557 * @param pShwPT The shadow page table (mapping of the page).
1558 * @param pGstPT The guest page table.
1559 * @param pOldGstPT The old cached guest page table.
1560 * @param fAllowRemoval Bail out as soon as we encounter an invalid PTE
1561 * @param pfFlush Flush reused page table (out)
1562 */
1563DECLINLINE(unsigned) pgmPoolTrackFlushPTPae32Bit(PPGMPOOL pPool, PPGMPOOLPAGE pPage, PPGMSHWPTPAE pShwPT, PCX86PT pGstPT,
1564 PCX86PT pOldGstPT, bool fAllowRemoval, bool *pfFlush)
1565{
1566 unsigned cChanged = 0;
1567
1568#ifdef VBOX_STRICT
1569 for (unsigned i = 0; i < RT_MIN(RT_ELEMENTS(pShwPT->a), pPage->iFirstPresent); i++)
1570 AssertMsg(!PGMSHWPTEPAE_IS_P(pShwPT->a[i]), ("Unexpected PTE: idx=%d %RX64 (first=%d)\n", i, PGMSHWPTEPAE_GET_LOG(pShwPT->a[i]), pPage->iFirstPresent));
1571#endif
1572 *pfFlush = false;
1573
1574 for (unsigned i = pPage->iFirstPresent; i < RT_ELEMENTS(pShwPT->a); i++)
1575 {
1576 /* Check the new value written by the guest. If present and with a bogus physical address, then
1577 * it's fairly safe to assume the guest is reusing the PT.
1578 */
1579 if ( fAllowRemoval
1580 && pGstPT->a[i].n.u1Present)
1581 {
1582 if (!PGMPhysIsGCPhysValid(pPool->CTX_SUFF(pVM), pGstPT->a[i].u & X86_PTE_PG_MASK))
1583 {
1584 *pfFlush = true;
1585 return ++cChanged;
1586 }
1587 }
1588 if (PGMSHWPTEPAE_IS_P(pShwPT->a[i]))
1589 {
1590 /* If the old cached PTE is identical, then there's no need to flush the shadow copy. */
1591 if ((pGstPT->a[i].u & X86_PTE_PG_MASK) == (pOldGstPT->a[i].u & X86_PTE_PG_MASK))
1592 {
1593#ifdef VBOX_STRICT
1594 RTHCPHYS HCPhys = NIL_RTGCPHYS;
1595 int rc = PGMPhysGCPhys2HCPhys(pPool->CTX_SUFF(pVM), pGstPT->a[i].u & X86_PTE_PG_MASK, &HCPhys);
1596 AssertMsg(rc == VINF_SUCCESS && PGMSHWPTEPAE_GET_HCPHYS(pShwPT->a[i]) == HCPhys, ("rc=%d guest %x old %x shw=%RX64 vs %RHp\n", rc, pGstPT->a[i].u, pOldGstPT->a[i].u, PGMSHWPTEPAE_GET_LOG(pShwPT->a[i]), HCPhys));
1597#endif
1598 uint64_t uHostAttr = PGMSHWPTEPAE_GET_U(pShwPT->a[i]) & (X86_PTE_P | X86_PTE_US | X86_PTE_A | X86_PTE_D | X86_PTE_G);
1599 bool fHostRW = !!(PGMSHWPTEPAE_GET_U(pShwPT->a[i]) & X86_PTE_RW);
1600 uint64_t uGuestAttr = pGstPT->a[i].u & (X86_PTE_P | X86_PTE_US | X86_PTE_A | X86_PTE_D | X86_PTE_G);
1601 bool fGuestRW = !!(pGstPT->a[i].u & X86_PTE_RW);
1602
1603 if ( uHostAttr == uGuestAttr
1604 && fHostRW <= fGuestRW)
1605 continue;
1606 }
1607 cChanged++;
1608 /* Something was changed, so flush it. */
1609 Log4(("pgmPoolTrackDerefPTPaePae: i=%d pte=%RX64 hint=%x\n",
1610 i, PGMSHWPTEPAE_GET_HCPHYS(pShwPT->a[i]), pOldGstPT->a[i].u & X86_PTE_PG_MASK));
1611 pgmPoolTracDerefGCPhysHint(pPool, pPage, PGMSHWPTEPAE_GET_HCPHYS(pShwPT->a[i]), pOldGstPT->a[i].u & X86_PTE_PG_MASK, i);
1612 PGMSHWPTEPAE_ATOMIC_SET(pShwPT->a[i], 0);
1613 }
1614 }
1615 return cChanged;
1616}
1617
1618
1619/**
1620 * Flush a dirty page
1621 *
1622 * @param pVM Pointer to the VM.
1623 * @param pPool The pool.
1624 * @param idxSlot Dirty array slot index
1625 * @param fAllowRemoval Allow a reused page table to be removed
1626 */
1627static void pgmPoolFlushDirtyPage(PVM pVM, PPGMPOOL pPool, unsigned idxSlot, bool fAllowRemoval = false)
1628{
1629 PPGMPOOLPAGE pPage;
1630 unsigned idxPage;
1631
1632 Assert(idxSlot < RT_ELEMENTS(pPool->aDirtyPages));
1633 if (pPool->aDirtyPages[idxSlot].uIdx == NIL_PGMPOOL_IDX)
1634 return;
1635
1636 idxPage = pPool->aDirtyPages[idxSlot].uIdx;
1637 AssertRelease(idxPage != NIL_PGMPOOL_IDX);
1638 pPage = &pPool->aPages[idxPage];
1639 Assert(pPage->idx == idxPage);
1640 Assert(pPage->iMonitoredNext == NIL_PGMPOOL_IDX && pPage->iMonitoredPrev == NIL_PGMPOOL_IDX);
1641
1642 AssertMsg(pPage->fDirty, ("Page %RGp (slot=%d) not marked dirty!", pPage->GCPhys, idxSlot));
1643 Log(("Flush dirty page %RGp cMods=%d\n", pPage->GCPhys, pPage->cModifications));
1644
1645#if defined(VBOX_WITH_2X_4GB_ADDR_SPACE_IN_R0) || defined(IN_RC)
1646 PVMCPU pVCpu = VMMGetCpu(pVM);
1647 uint32_t iPrevSubset = PGMRZDynMapPushAutoSubset(pVCpu);
1648#endif
1649
1650 /* First write protect the page again to catch all write accesses. (before checking for changes -> SMP) */
1651 int rc = PGMHandlerPhysicalReset(pVM, pPage->GCPhys & PAGE_BASE_GC_MASK);
1652 Assert(rc == VINF_SUCCESS);
1653 pPage->fDirty = false;
1654
1655#ifdef VBOX_STRICT
1656 uint64_t fFlags = 0;
1657 RTHCPHYS HCPhys;
1658 rc = PGMShwGetPage(VMMGetCpu(pVM), pPage->GCPtrDirtyFault, &fFlags, &HCPhys);
1659 AssertMsg( ( rc == VINF_SUCCESS
1660 && (!(fFlags & X86_PTE_RW) || HCPhys != pPage->Core.Key))
1661 /* In the SMP case the page table might be removed while we wait for the PGM lock in the trap handler. */
1662 || rc == VERR_PAGE_TABLE_NOT_PRESENT
1663 || rc == VERR_PAGE_NOT_PRESENT,
1664 ("PGMShwGetPage -> GCPtr=%RGv rc=%d flags=%RX64\n", pPage->GCPtrDirtyFault, rc, fFlags));
1665#endif
1666
1667 /* Flush those PTEs that have changed. */
1668 STAM_PROFILE_START(&pPool->StatTrackDeref,a);
1669 void *pvShw = PGMPOOL_PAGE_2_PTR(pVM, pPage);
1670 void *pvGst;
1671 rc = PGM_GCPHYS_2_PTR_EX(pVM, pPage->GCPhys, &pvGst); AssertReleaseRC(rc);
1672 bool fFlush;
1673 unsigned cChanges;
1674
1675 if (pPage->enmKind == PGMPOOLKIND_PAE_PT_FOR_PAE_PT)
1676 cChanges = pgmPoolTrackFlushPTPaePae(pPool, pPage, (PPGMSHWPTPAE)pvShw, (PCX86PTPAE)pvGst,
1677 (PCX86PTPAE)&pPool->aDirtyPages[idxSlot].aPage[0], fAllowRemoval, &fFlush);
1678 else
1679 cChanges = pgmPoolTrackFlushPTPae32Bit(pPool, pPage, (PPGMSHWPTPAE)pvShw, (PCX86PT)pvGst,
1680 (PCX86PT)&pPool->aDirtyPages[idxSlot].aPage[0], fAllowRemoval, &fFlush);
1681
1682 PGM_DYNMAP_UNUSED_HINT_VM(pVM, pvGst);
1683 PGM_DYNMAP_UNUSED_HINT_VM(pVM, pvShw);
1684 STAM_PROFILE_STOP(&pPool->StatTrackDeref,a);
1685 /* Note: we might want to consider keeping the dirty page active in case there were many changes. */
1686
1687 /* This page is likely to be modified again, so reduce the nr of modifications just a bit here. */
1688 Assert(pPage->cModifications);
1689 if (cChanges < 4)
1690 pPage->cModifications = 1; /* must use > 0 here */
1691 else
1692 pPage->cModifications = RT_MAX(1, pPage->cModifications / 2);
1693
1694 STAM_COUNTER_INC(&pPool->StatResetDirtyPages);
1695 if (pPool->cDirtyPages == RT_ELEMENTS(pPool->aDirtyPages))
1696 pPool->idxFreeDirtyPage = idxSlot;
1697
1698 pPool->cDirtyPages--;
1699 pPool->aDirtyPages[idxSlot].uIdx = NIL_PGMPOOL_IDX;
1700 Assert(pPool->cDirtyPages <= RT_ELEMENTS(pPool->aDirtyPages));
1701 if (fFlush)
1702 {
1703 Assert(fAllowRemoval);
1704 Log(("Flush reused page table!\n"));
1705 pgmPoolFlushPage(pPool, pPage);
1706 STAM_COUNTER_INC(&pPool->StatForceFlushReused);
1707 }
1708 else
1709 Log(("Removed dirty page %RGp cMods=%d cChanges=%d\n", pPage->GCPhys, pPage->cModifications, cChanges));
1710
1711#if defined(VBOX_WITH_2X_4GB_ADDR_SPACE_IN_R0) || defined(IN_RC)
1712 PGMRZDynMapPopAutoSubset(pVCpu, iPrevSubset);
1713#endif
1714}
1715
1716
1717# ifndef IN_RING3
1718/**
1719 * Add a new dirty page
1720 *
1721 * @param pVM Pointer to the VM.
1722 * @param pPool The pool.
1723 * @param pPage The page.
1724 */
1725void pgmPoolAddDirtyPage(PVM pVM, PPGMPOOL pPool, PPGMPOOLPAGE pPage)
1726{
1727 unsigned idxFree;
1728
1729 PGM_LOCK_ASSERT_OWNER(pVM);
1730 AssertCompile(RT_ELEMENTS(pPool->aDirtyPages) == 8 || RT_ELEMENTS(pPool->aDirtyPages) == 16);
1731 Assert(!pPage->fDirty);
1732
1733 idxFree = pPool->idxFreeDirtyPage;
1734 Assert(idxFree < RT_ELEMENTS(pPool->aDirtyPages));
1735 Assert(pPage->iMonitoredNext == NIL_PGMPOOL_IDX && pPage->iMonitoredPrev == NIL_PGMPOOL_IDX);
1736
1737 if (pPool->cDirtyPages >= RT_ELEMENTS(pPool->aDirtyPages))
1738 {
1739 STAM_COUNTER_INC(&pPool->StatDirtyPageOverFlowFlush);
1740 pgmPoolFlushDirtyPage(pVM, pPool, idxFree, true /* allow removal of reused page tables*/);
1741 }
1742 Assert(pPool->cDirtyPages < RT_ELEMENTS(pPool->aDirtyPages));
1743 AssertMsg(pPool->aDirtyPages[idxFree].uIdx == NIL_PGMPOOL_IDX, ("idxFree=%d cDirtyPages=%d\n", idxFree, pPool->cDirtyPages));
1744
1745 Log(("Add dirty page %RGp (slot=%d)\n", pPage->GCPhys, idxFree));
1746
1747 /*
1748 * Make a copy of the guest page table as we require valid GCPhys addresses
1749 * when removing references to physical pages.
1750 * (The HCPhys linear lookup is *extremely* expensive!)
1751 */
1752 void *pvGst;
1753 int rc = PGM_GCPHYS_2_PTR_EX(pVM, pPage->GCPhys, &pvGst); AssertReleaseRC(rc);
1754 memcpy(&pPool->aDirtyPages[idxFree].aPage[0], pvGst, (pPage->enmKind == PGMPOOLKIND_PAE_PT_FOR_PAE_PT) ? PAGE_SIZE : PAGE_SIZE/2);
1755# ifdef VBOX_STRICT
1756 void *pvShw = PGMPOOL_PAGE_2_PTR(pVM, pPage);
1757 if (pPage->enmKind == PGMPOOLKIND_PAE_PT_FOR_PAE_PT)
1758 pgmPoolTrackCheckPTPaePae(pPool, pPage, (PPGMSHWPTPAE)pvShw, (PCX86PTPAE)pvGst);
1759 else
1760 pgmPoolTrackCheckPTPae32Bit(pPool, pPage, (PPGMSHWPTPAE)pvShw, (PCX86PT)pvGst);
1761 PGM_DYNMAP_UNUSED_HINT_VM(pVM, pvShw);
1762# endif
1763 PGM_DYNMAP_UNUSED_HINT_VM(pVM, pvGst);
1764
1765 STAM_COUNTER_INC(&pPool->StatDirtyPage);
1766 pPage->fDirty = true;
1767 pPage->idxDirtyEntry = (uint8_t)idxFree; Assert(pPage->idxDirtyEntry == idxFree);
1768 pPool->aDirtyPages[idxFree].uIdx = pPage->idx;
1769 pPool->cDirtyPages++;
1770
1771 pPool->idxFreeDirtyPage = (pPool->idxFreeDirtyPage + 1) & (RT_ELEMENTS(pPool->aDirtyPages) - 1);
1772 if ( pPool->cDirtyPages < RT_ELEMENTS(pPool->aDirtyPages)
1773 && pPool->aDirtyPages[pPool->idxFreeDirtyPage].uIdx != NIL_PGMPOOL_IDX)
1774 {
1775 unsigned i;
1776 for (i = 1; i < RT_ELEMENTS(pPool->aDirtyPages); i++)
1777 {
1778 idxFree = (pPool->idxFreeDirtyPage + i) & (RT_ELEMENTS(pPool->aDirtyPages) - 1);
1779 if (pPool->aDirtyPages[idxFree].uIdx == NIL_PGMPOOL_IDX)
1780 {
1781 pPool->idxFreeDirtyPage = idxFree;
1782 break;
1783 }
1784 }
1785 Assert(i != RT_ELEMENTS(pPool->aDirtyPages));
1786 }
1787
1788 Assert(pPool->cDirtyPages == RT_ELEMENTS(pPool->aDirtyPages) || pPool->aDirtyPages[pPool->idxFreeDirtyPage].uIdx == NIL_PGMPOOL_IDX);
1789 return;
1790}
1791# endif /* !IN_RING3 */
1792
1793
1794/**
1795 * Check if the specified page is dirty (not write monitored)
1796 *
1797 * @return dirty or not
1798 * @param pVM Pointer to the VM.
1799 * @param GCPhys Guest physical address
1800 */
1801bool pgmPoolIsDirtyPage(PVM pVM, RTGCPHYS GCPhys)
1802{
1803 PPGMPOOL pPool = pVM->pgm.s.CTX_SUFF(pPool);
1804 PGM_LOCK_ASSERT_OWNER(pVM);
1805 if (!pPool->cDirtyPages)
1806 return false;
1807
1808 GCPhys = GCPhys & ~(RTGCPHYS)PAGE_OFFSET_MASK;
1809
1810 for (unsigned i = 0; i < RT_ELEMENTS(pPool->aDirtyPages); i++)
1811 {
1812 if (pPool->aDirtyPages[i].uIdx != NIL_PGMPOOL_IDX)
1813 {
1814 PPGMPOOLPAGE pPage;
1815 unsigned idxPage = pPool->aDirtyPages[i].uIdx;
1816
1817 pPage = &pPool->aPages[idxPage];
1818 if (pPage->GCPhys == GCPhys)
1819 return true;
1820 }
1821 }
1822 return false;
1823}
1824
1825
1826/**
1827 * Reset all dirty pages by reinstating page monitoring.
1828 *
1829 * @param pVM Pointer to the VM.
1830 */
1831void pgmPoolResetDirtyPages(PVM pVM)
1832{
1833 PPGMPOOL pPool = pVM->pgm.s.CTX_SUFF(pPool);
1834 PGM_LOCK_ASSERT_OWNER(pVM);
1835 Assert(pPool->cDirtyPages <= RT_ELEMENTS(pPool->aDirtyPages));
1836
1837 if (!pPool->cDirtyPages)
1838 return;
1839
1840 Log(("pgmPoolResetDirtyPages\n"));
1841 for (unsigned i = 0; i < RT_ELEMENTS(pPool->aDirtyPages); i++)
1842 pgmPoolFlushDirtyPage(pVM, pPool, i, true /* allow removal of reused page tables*/);
1843
1844 pPool->idxFreeDirtyPage = 0;
1845 if ( pPool->cDirtyPages != RT_ELEMENTS(pPool->aDirtyPages)
1846 && pPool->aDirtyPages[pPool->idxFreeDirtyPage].uIdx != NIL_PGMPOOL_IDX)
1847 {
1848 unsigned i;
1849 for (i = 1; i < RT_ELEMENTS(pPool->aDirtyPages); i++)
1850 {
1851 if (pPool->aDirtyPages[i].uIdx == NIL_PGMPOOL_IDX)
1852 {
1853 pPool->idxFreeDirtyPage = i;
1854 break;
1855 }
1856 }
1857 AssertMsg(i != RT_ELEMENTS(pPool->aDirtyPages), ("cDirtyPages %d", pPool->cDirtyPages));
1858 }
1859
1860 Assert(pPool->aDirtyPages[pPool->idxFreeDirtyPage].uIdx == NIL_PGMPOOL_IDX || pPool->cDirtyPages == RT_ELEMENTS(pPool->aDirtyPages));
1861 return;
1862}
1863
1864
1865/**
1866 * Invalidate the PT entry for the specified page
1867 *
1868 * @param pVM Pointer to the VM.
1869 * @param GCPtrPage Guest page to invalidate
1870 */
1871void pgmPoolResetDirtyPage(PVM pVM, RTGCPTR GCPtrPage)
1872{
1873 PPGMPOOL pPool = pVM->pgm.s.CTX_SUFF(pPool);
1874 PGM_LOCK_ASSERT_OWNER(pVM);
1875 Assert(pPool->cDirtyPages <= RT_ELEMENTS(pPool->aDirtyPages));
1876
1877 if (!pPool->cDirtyPages)
1878 return;
1879
1880 Log(("pgmPoolResetDirtyPage %RGv\n", GCPtrPage));
1881 for (unsigned i = 0; i < RT_ELEMENTS(pPool->aDirtyPages); i++)
1882 {
1883 }
1884}
1885
1886
1887/**
1888 * Reset all dirty pages by reinstating page monitoring.
1889 *
1890 * @param pVM Pointer to the VM.
1891 * @param GCPhysPT Physical address of the page table
1892 */
1893void pgmPoolInvalidateDirtyPage(PVM pVM, RTGCPHYS GCPhysPT)
1894{
1895 PPGMPOOL pPool = pVM->pgm.s.CTX_SUFF(pPool);
1896 PGM_LOCK_ASSERT_OWNER(pVM);
1897 Assert(pPool->cDirtyPages <= RT_ELEMENTS(pPool->aDirtyPages));
1898 unsigned idxDirtyPage = RT_ELEMENTS(pPool->aDirtyPages);
1899
1900 if (!pPool->cDirtyPages)
1901 return;
1902
1903 GCPhysPT = GCPhysPT & ~(RTGCPHYS)PAGE_OFFSET_MASK;
1904
1905 for (unsigned i = 0; i < RT_ELEMENTS(pPool->aDirtyPages); i++)
1906 {
1907 if (pPool->aDirtyPages[i].uIdx != NIL_PGMPOOL_IDX)
1908 {
1909 unsigned idxPage = pPool->aDirtyPages[i].uIdx;
1910
1911 PPGMPOOLPAGE pPage = &pPool->aPages[idxPage];
1912 if (pPage->GCPhys == GCPhysPT)
1913 {
1914 idxDirtyPage = i;
1915 break;
1916 }
1917 }
1918 }
1919
1920 if (idxDirtyPage != RT_ELEMENTS(pPool->aDirtyPages))
1921 {
1922 pgmPoolFlushDirtyPage(pVM, pPool, idxDirtyPage, true /* allow removal of reused page tables*/);
1923 if ( pPool->cDirtyPages != RT_ELEMENTS(pPool->aDirtyPages)
1924 && pPool->aDirtyPages[pPool->idxFreeDirtyPage].uIdx != NIL_PGMPOOL_IDX)
1925 {
1926 unsigned i;
1927 for (i = 0; i < RT_ELEMENTS(pPool->aDirtyPages); i++)
1928 {
1929 if (pPool->aDirtyPages[i].uIdx == NIL_PGMPOOL_IDX)
1930 {
1931 pPool->idxFreeDirtyPage = i;
1932 break;
1933 }
1934 }
1935 AssertMsg(i != RT_ELEMENTS(pPool->aDirtyPages), ("cDirtyPages %d", pPool->cDirtyPages));
1936 }
1937 }
1938}
1939
1940# endif /* PGMPOOL_WITH_OPTIMIZED_DIRTY_PT */
1941
1942/**
1943 * Inserts a page into the GCPhys hash table.
1944 *
1945 * @param pPool The pool.
1946 * @param pPage The page.
1947 */
1948DECLINLINE(void) pgmPoolHashInsert(PPGMPOOL pPool, PPGMPOOLPAGE pPage)
1949{
1950 Log3(("pgmPoolHashInsert: %RGp\n", pPage->GCPhys));
1951 Assert(pPage->GCPhys != NIL_RTGCPHYS); Assert(pPage->iNext == NIL_PGMPOOL_IDX);
1952 uint16_t iHash = PGMPOOL_HASH(pPage->GCPhys);
1953 pPage->iNext = pPool->aiHash[iHash];
1954 pPool->aiHash[iHash] = pPage->idx;
1955}
1956
1957
1958/**
1959 * Removes a page from the GCPhys hash table.
1960 *
1961 * @param pPool The pool.
1962 * @param pPage The page.
1963 */
1964DECLINLINE(void) pgmPoolHashRemove(PPGMPOOL pPool, PPGMPOOLPAGE pPage)
1965{
1966 Log3(("pgmPoolHashRemove: %RGp\n", pPage->GCPhys));
1967 uint16_t iHash = PGMPOOL_HASH(pPage->GCPhys);
1968 if (pPool->aiHash[iHash] == pPage->idx)
1969 pPool->aiHash[iHash] = pPage->iNext;
1970 else
1971 {
1972 uint16_t iPrev = pPool->aiHash[iHash];
1973 for (;;)
1974 {
1975 const int16_t i = pPool->aPages[iPrev].iNext;
1976 if (i == pPage->idx)
1977 {
1978 pPool->aPages[iPrev].iNext = pPage->iNext;
1979 break;
1980 }
1981 if (i == NIL_PGMPOOL_IDX)
1982 {
1983 AssertReleaseMsgFailed(("GCPhys=%RGp idx=%d\n", pPage->GCPhys, pPage->idx));
1984 break;
1985 }
1986 iPrev = i;
1987 }
1988 }
1989 pPage->iNext = NIL_PGMPOOL_IDX;
1990}
1991
1992
1993/**
1994 * Frees up one cache page.
1995 *
1996 * @returns VBox status code.
1997 * @retval VINF_SUCCESS on success.
1998 * @param pPool The pool.
1999 * @param iUser The user index.
2000 */
2001static int pgmPoolCacheFreeOne(PPGMPOOL pPool, uint16_t iUser)
2002{
2003#ifndef IN_RC
2004 const PVM pVM = pPool->CTX_SUFF(pVM);
2005#endif
2006 Assert(pPool->iAgeHead != pPool->iAgeTail); /* We shouldn't be here if there < 2 cached entries! */
2007 STAM_COUNTER_INC(&pPool->StatCacheFreeUpOne);
2008
2009 /*
2010 * Select one page from the tail of the age list.
2011 */
2012 PPGMPOOLPAGE pPage;
2013 for (unsigned iLoop = 0; ; iLoop++)
2014 {
2015 uint16_t iToFree = pPool->iAgeTail;
2016 if (iToFree == iUser)
2017 iToFree = pPool->aPages[iToFree].iAgePrev;
2018/* This is the alternative to the SyncCR3 pgmPoolCacheUsed calls.
2019 if (pPool->aPages[iToFree].iUserHead != NIL_PGMPOOL_USER_INDEX)
2020 {
2021 uint16_t i = pPool->aPages[iToFree].iAgePrev;
2022 for (unsigned j = 0; j < 10 && i != NIL_PGMPOOL_USER_INDEX; j++, i = pPool->aPages[i].iAgePrev)
2023 {
2024 if (pPool->aPages[iToFree].iUserHead == NIL_PGMPOOL_USER_INDEX)
2025 continue;
2026 iToFree = i;
2027 break;
2028 }
2029 }
2030*/
2031 Assert(iToFree != iUser);
2032 AssertRelease(iToFree != NIL_PGMPOOL_IDX);
2033 pPage = &pPool->aPages[iToFree];
2034
2035 /*
2036 * Reject any attempts at flushing the currently active shadow CR3 mapping.
2037 * Call pgmPoolCacheUsed to move the page to the head of the age list.
2038 */
2039 if ( !pgmPoolIsPageLocked(pPage)
2040 && pPage->idx >= PGMPOOL_IDX_FIRST /* paranoia (#6349) */)
2041 break;
2042 LogFlow(("pgmPoolCacheFreeOne: refuse CR3 mapping\n"));
2043 pgmPoolCacheUsed(pPool, pPage);
2044 AssertLogRelReturn(iLoop < 8192, VERR_PGM_POOL_TOO_MANY_LOOPS);
2045 }
2046
2047 /*
2048 * Found a usable page, flush it and return.
2049 */
2050 int rc = pgmPoolFlushPage(pPool, pPage);
2051 /* This flush was initiated by us and not the guest, so explicitly flush the TLB. */
2052 /* todo: find out why this is necessary; pgmPoolFlushPage should trigger a flush if one is really needed. */
2053 if (rc == VINF_SUCCESS)
2054 PGM_INVL_ALL_VCPU_TLBS(pVM);
2055 return rc;
2056}
2057
2058
/**
 * Checks if a kind mismatch is really a page being reused
 * or if it's just normal remappings.
 *
 * The decision is a pure table lookup on the two kinds; no pool state is
 * consulted.
 *
 * @returns true if reused and the cached page (enmKind1) should be flushed
 * @returns false if not reused.
 * @param   enmKind1    The kind of the cached page.
 * @param   enmKind2    The kind of the requested page.
 */
static bool pgmPoolCacheReusedByKind(PGMPOOLKIND enmKind1, PGMPOOLKIND enmKind2)
{
    switch (enmKind1)
    {
        /*
         * Never reuse them. There is no remapping in non-paging mode.
         */
        case PGMPOOLKIND_32BIT_PT_FOR_PHYS:
        case PGMPOOLKIND_32BIT_PD_PHYS:
        case PGMPOOLKIND_PAE_PT_FOR_PHYS:
        case PGMPOOLKIND_PAE_PD_PHYS:
        case PGMPOOLKIND_PAE_PDPT_PHYS:
        case PGMPOOLKIND_64BIT_PDPT_FOR_PHYS:
        case PGMPOOLKIND_64BIT_PD_FOR_PHYS:
        case PGMPOOLKIND_EPT_PT_FOR_PHYS:
        case PGMPOOLKIND_EPT_PD_FOR_PHYS:
        case PGMPOOLKIND_EPT_PDPT_FOR_PHYS:
        case PGMPOOLKIND_PAE_PDPT_FOR_32BIT: /* never reuse them for other types */
            return false;

        /*
         * Cached page shadows a *_FOR_32BIT_* structure (or is a 32BIT_PD /
         * PAE_PDPT root): a mismatch against another kind of this group is
         * normal remapping.  It only counts as reuse when the requested kind
         * is one of the *_FOR_PAE_* / 64BIT_* kinds or a non-paging
         * (*_PHYS) kind listed below.
         */
        case PGMPOOLKIND_PAE_PT_FOR_32BIT_4MB:
        case PGMPOOLKIND_32BIT_PT_FOR_32BIT_4MB:
        case PGMPOOLKIND_32BIT_PT_FOR_32BIT_PT:
        case PGMPOOLKIND_PAE_PT_FOR_32BIT_PT:
        case PGMPOOLKIND_PAE_PD0_FOR_32BIT_PD:
        case PGMPOOLKIND_PAE_PD1_FOR_32BIT_PD:
        case PGMPOOLKIND_PAE_PD2_FOR_32BIT_PD:
        case PGMPOOLKIND_PAE_PD3_FOR_32BIT_PD:
        case PGMPOOLKIND_32BIT_PD:
        case PGMPOOLKIND_PAE_PDPT:
            switch (enmKind2)
            {
                case PGMPOOLKIND_PAE_PD_FOR_PAE_PD:
                case PGMPOOLKIND_PAE_PT_FOR_PAE_PT:
                case PGMPOOLKIND_64BIT_PD_FOR_64BIT_PD:
                case PGMPOOLKIND_64BIT_PDPT_FOR_64BIT_PDPT:
                case PGMPOOLKIND_64BIT_PML4:
                case PGMPOOLKIND_PAE_PT_FOR_PAE_2MB:
                case PGMPOOLKIND_32BIT_PT_FOR_PHYS:
                case PGMPOOLKIND_PAE_PT_FOR_PHYS:
                case PGMPOOLKIND_64BIT_PDPT_FOR_PHYS:
                case PGMPOOLKIND_64BIT_PD_FOR_PHYS:
                case PGMPOOLKIND_EPT_PDPT_FOR_PHYS:
                case PGMPOOLKIND_EPT_PD_FOR_PHYS:
                case PGMPOOLKIND_EPT_PT_FOR_PHYS:
                    return true;
                default:
                    return false;
            }

        /*
         * Cached page is one of the *_FOR_PAE_* / 64BIT_* kinds: the mirror
         * image of the group above.  It only counts as reuse when the
         * requested kind is one of the *_FOR_32BIT_* kinds or a non-paging
         * (*_PHYS) kind listed below.
         */
        case PGMPOOLKIND_PAE_PD_FOR_PAE_PD:
        case PGMPOOLKIND_PAE_PT_FOR_PAE_PT:
        case PGMPOOLKIND_64BIT_PD_FOR_64BIT_PD:
        case PGMPOOLKIND_64BIT_PDPT_FOR_64BIT_PDPT:
        case PGMPOOLKIND_64BIT_PML4:
        case PGMPOOLKIND_PAE_PT_FOR_PAE_2MB:
            switch (enmKind2)
            {
                case PGMPOOLKIND_PAE_PT_FOR_32BIT_4MB:
                case PGMPOOLKIND_32BIT_PT_FOR_32BIT_4MB:
                case PGMPOOLKIND_32BIT_PT_FOR_32BIT_PT:
                case PGMPOOLKIND_PAE_PT_FOR_32BIT_PT:
                case PGMPOOLKIND_PAE_PD0_FOR_32BIT_PD:
                case PGMPOOLKIND_PAE_PD1_FOR_32BIT_PD:
                case PGMPOOLKIND_PAE_PD2_FOR_32BIT_PD:
                case PGMPOOLKIND_PAE_PD3_FOR_32BIT_PD:
                case PGMPOOLKIND_32BIT_PT_FOR_PHYS:
                case PGMPOOLKIND_PAE_PT_FOR_PHYS:
                case PGMPOOLKIND_64BIT_PDPT_FOR_PHYS:
                case PGMPOOLKIND_64BIT_PD_FOR_PHYS:
                case PGMPOOLKIND_EPT_PDPT_FOR_PHYS:
                case PGMPOOLKIND_EPT_PD_FOR_PHYS:
                case PGMPOOLKIND_EPT_PT_FOR_PHYS:
                    return true;
                default:
                    return false;
            }

        /*
         * These cannot be flushed, and it's common to reuse the PDs as PTs.
         */
        case PGMPOOLKIND_ROOT_NESTED:
            return false;

        /* Any other cached kind is treated as a fatal internal error. */
        default:
            AssertFatalMsgFailed(("enmKind1=%d\n", enmKind1));
    }
}
2162
2163
2164/**
2165 * Attempts to satisfy a pgmPoolAlloc request from the cache.
2166 *
2167 * @returns VBox status code.
2168 * @retval VINF_PGM_CACHED_PAGE on success.
2169 * @retval VERR_FILE_NOT_FOUND if not found.
2170 * @param pPool The pool.
2171 * @param GCPhys The GC physical address of the page we're gonna shadow.
2172 * @param enmKind The kind of mapping.
2173 * @param enmAccess Access type for the mapping (only relevant for big pages)
2174 * @param fA20Enabled Whether the CPU has the A20 gate enabled.
2175 * @param iUser The shadow page pool index of the user table.
2176 * @param iUserTable The index into the user table (shadowed).
2177 * @param ppPage Where to store the pointer to the page.
2178 */
2179static int pgmPoolCacheAlloc(PPGMPOOL pPool, RTGCPHYS GCPhys, PGMPOOLKIND enmKind, PGMPOOLACCESS enmAccess, bool fA20Enabled,
2180 uint16_t iUser, uint32_t iUserTable, PPPGMPOOLPAGE ppPage)
2181{
2182 /*
2183 * Look up the GCPhys in the hash.
2184 */
2185 unsigned i = pPool->aiHash[PGMPOOL_HASH(GCPhys)];
2186 Log3(("pgmPoolCacheAlloc: %RGp kind %s iUser=%d iUserTable=%x SLOT=%d\n", GCPhys, pgmPoolPoolKindToStr(enmKind), iUser, iUserTable, i));
2187 if (i != NIL_PGMPOOL_IDX)
2188 {
2189 do
2190 {
2191 PPGMPOOLPAGE pPage = &pPool->aPages[i];
2192 Log4(("pgmPoolCacheAlloc: slot %d found page %RGp\n", i, pPage->GCPhys));
2193 if (pPage->GCPhys == GCPhys)
2194 {
2195 if ( (PGMPOOLKIND)pPage->enmKind == enmKind
2196 && (PGMPOOLACCESS)pPage->enmAccess == enmAccess
2197 && pPage->fA20Enabled == fA20Enabled)
2198 {
2199 /* Put it at the start of the use list to make sure pgmPoolTrackAddUser
2200 * doesn't flush it in case there are no more free use records.
2201 */
2202 pgmPoolCacheUsed(pPool, pPage);
2203
2204 int rc = pgmPoolTrackAddUser(pPool, pPage, iUser, iUserTable);
2205 if (RT_SUCCESS(rc))
2206 {
2207 Assert((PGMPOOLKIND)pPage->enmKind == enmKind);
2208 *ppPage = pPage;
2209 if (pPage->cModifications)
2210 pPage->cModifications = 1; /* reset counter (can't use 0, or else it will be reinserted in the modified list) */
2211 STAM_COUNTER_INC(&pPool->StatCacheHits);
2212 return VINF_PGM_CACHED_PAGE;
2213 }
2214 return rc;
2215 }
2216
2217 if ((PGMPOOLKIND)pPage->enmKind != enmKind)
2218 {
2219 /*
2220 * The kind is different. In some cases we should now flush the page
2221 * as it has been reused, but in most cases this is normal remapping
2222 * of PDs as PT or big pages using the GCPhys field in a slightly
2223 * different way than the other kinds.
2224 */
2225 if (pgmPoolCacheReusedByKind((PGMPOOLKIND)pPage->enmKind, enmKind))
2226 {
2227 STAM_COUNTER_INC(&pPool->StatCacheKindMismatches);
2228 pgmPoolFlushPage(pPool, pPage);
2229 break;
2230 }
2231 }
2232 }
2233
2234 /* next */
2235 i = pPage->iNext;
2236 } while (i != NIL_PGMPOOL_IDX);
2237 }
2238
2239 Log3(("pgmPoolCacheAlloc: Missed GCPhys=%RGp enmKind=%s\n", GCPhys, pgmPoolPoolKindToStr(enmKind)));
2240 STAM_COUNTER_INC(&pPool->StatCacheMisses);
2241 return VERR_FILE_NOT_FOUND;
2242}
2243
2244
2245/**
2246 * Inserts a page into the cache.
2247 *
2248 * @param pPool The pool.
2249 * @param pPage The cached page.
2250 * @param fCanBeCached Set if the page is fit for caching from the caller's point of view.
2251 */
2252static void pgmPoolCacheInsert(PPGMPOOL pPool, PPGMPOOLPAGE pPage, bool fCanBeCached)
2253{
2254 /*
2255 * Insert into the GCPhys hash if the page is fit for that.
2256 */
2257 Assert(!pPage->fCached);
2258 if (fCanBeCached)
2259 {
2260 pPage->fCached = true;
2261 pgmPoolHashInsert(pPool, pPage);
2262 Log3(("pgmPoolCacheInsert: Caching %p:{.Core=%RHp, .idx=%d, .enmKind=%s, GCPhys=%RGp}\n",
2263 pPage, pPage->Core.Key, pPage->idx, pgmPoolPoolKindToStr(pPage->enmKind), pPage->GCPhys));
2264 STAM_COUNTER_INC(&pPool->StatCacheCacheable);
2265 }
2266 else
2267 {
2268 Log3(("pgmPoolCacheInsert: Not caching %p:{.Core=%RHp, .idx=%d, .enmKind=%s, GCPhys=%RGp}\n",
2269 pPage, pPage->Core.Key, pPage->idx, pgmPoolPoolKindToStr(pPage->enmKind), pPage->GCPhys));
2270 STAM_COUNTER_INC(&pPool->StatCacheUncacheable);
2271 }
2272
2273 /*
2274 * Insert at the head of the age list.
2275 */
2276 pPage->iAgePrev = NIL_PGMPOOL_IDX;
2277 pPage->iAgeNext = pPool->iAgeHead;
2278 if (pPool->iAgeHead != NIL_PGMPOOL_IDX)
2279 pPool->aPages[pPool->iAgeHead].iAgePrev = pPage->idx;
2280 else
2281 pPool->iAgeTail = pPage->idx;
2282 pPool->iAgeHead = pPage->idx;
2283}
2284
2285
2286/**
2287 * Flushes a cached page.
2288 *
2289 * @param pPool The pool.
2290 * @param pPage The cached page.
2291 */
2292static void pgmPoolCacheFlushPage(PPGMPOOL pPool, PPGMPOOLPAGE pPage)
2293{
2294 Log3(("pgmPoolCacheFlushPage: %RGp\n", pPage->GCPhys));
2295
2296 /*
2297 * Remove the page from the hash.
2298 */
2299 if (pPage->fCached)
2300 {
2301 pPage->fCached = false;
2302 pgmPoolHashRemove(pPool, pPage);
2303 }
2304 else
2305 Assert(pPage->iNext == NIL_PGMPOOL_IDX);
2306
2307 /*
2308 * Remove it from the age list.
2309 */
2310 if (pPage->iAgeNext != NIL_PGMPOOL_IDX)
2311 pPool->aPages[pPage->iAgeNext].iAgePrev = pPage->iAgePrev;
2312 else
2313 pPool->iAgeTail = pPage->iAgePrev;
2314 if (pPage->iAgePrev != NIL_PGMPOOL_IDX)
2315 pPool->aPages[pPage->iAgePrev].iAgeNext = pPage->iAgeNext;
2316 else
2317 pPool->iAgeHead = pPage->iAgeNext;
2318 pPage->iAgeNext = NIL_PGMPOOL_IDX;
2319 pPage->iAgePrev = NIL_PGMPOOL_IDX;
2320}
2321
2322
2323/**
2324 * Looks for pages sharing the monitor.
2325 *
2326 * @returns Pointer to the head page.
2327 * @returns NULL if not found.
2328 * @param pPool The Pool
2329 * @param pNewPage The page which is going to be monitored.
2330 */
2331static PPGMPOOLPAGE pgmPoolMonitorGetPageByGCPhys(PPGMPOOL pPool, PPGMPOOLPAGE pNewPage)
2332{
2333 /*
2334 * Look up the GCPhys in the hash.
2335 */
2336 RTGCPHYS GCPhys = pNewPage->GCPhys & ~(RTGCPHYS)PAGE_OFFSET_MASK;
2337 unsigned i = pPool->aiHash[PGMPOOL_HASH(GCPhys)];
2338 if (i == NIL_PGMPOOL_IDX)
2339 return NULL;
2340 do
2341 {
2342 PPGMPOOLPAGE pPage = &pPool->aPages[i];
2343 if ( pPage->GCPhys - GCPhys < PAGE_SIZE
2344 && pPage != pNewPage)
2345 {
2346 switch (pPage->enmKind)
2347 {
2348 case PGMPOOLKIND_32BIT_PT_FOR_32BIT_PT:
2349 case PGMPOOLKIND_PAE_PT_FOR_32BIT_PT:
2350 case PGMPOOLKIND_PAE_PT_FOR_PAE_PT:
2351 case PGMPOOLKIND_PAE_PD0_FOR_32BIT_PD:
2352 case PGMPOOLKIND_PAE_PD1_FOR_32BIT_PD:
2353 case PGMPOOLKIND_PAE_PD2_FOR_32BIT_PD:
2354 case PGMPOOLKIND_PAE_PD3_FOR_32BIT_PD:
2355 case PGMPOOLKIND_PAE_PD_FOR_PAE_PD:
2356 case PGMPOOLKIND_64BIT_PD_FOR_64BIT_PD:
2357 case PGMPOOLKIND_64BIT_PDPT_FOR_64BIT_PDPT:
2358 case PGMPOOLKIND_64BIT_PML4:
2359 case PGMPOOLKIND_32BIT_PD:
2360 case PGMPOOLKIND_PAE_PDPT:
2361 {
2362 /* find the head */
2363 while (pPage->iMonitoredPrev != NIL_PGMPOOL_IDX)
2364 {
2365 Assert(pPage->iMonitoredPrev != pPage->idx);
2366 pPage = &pPool->aPages[pPage->iMonitoredPrev];
2367 }
2368 return pPage;
2369 }
2370
2371 /* ignore, no monitoring. */
2372 case PGMPOOLKIND_32BIT_PT_FOR_32BIT_4MB:
2373 case PGMPOOLKIND_PAE_PT_FOR_PAE_2MB:
2374 case PGMPOOLKIND_PAE_PT_FOR_32BIT_4MB:
2375 case PGMPOOLKIND_32BIT_PT_FOR_PHYS:
2376 case PGMPOOLKIND_PAE_PT_FOR_PHYS:
2377 case PGMPOOLKIND_64BIT_PDPT_FOR_PHYS:
2378 case PGMPOOLKIND_64BIT_PD_FOR_PHYS:
2379 case PGMPOOLKIND_EPT_PDPT_FOR_PHYS:
2380 case PGMPOOLKIND_EPT_PD_FOR_PHYS:
2381 case PGMPOOLKIND_EPT_PT_FOR_PHYS:
2382 case PGMPOOLKIND_ROOT_NESTED:
2383 case PGMPOOLKIND_PAE_PD_PHYS:
2384 case PGMPOOLKIND_PAE_PDPT_PHYS:
2385 case PGMPOOLKIND_32BIT_PD_PHYS:
2386 case PGMPOOLKIND_PAE_PDPT_FOR_32BIT:
2387 break;
2388 default:
2389 AssertFatalMsgFailed(("enmKind=%d idx=%d\n", pPage->enmKind, pPage->idx));
2390 }
2391 }
2392
2393 /* next */
2394 i = pPage->iNext;
2395 } while (i != NIL_PGMPOOL_IDX);
2396 return NULL;
2397}
2398
2399
2400/**
2401 * Enabled write monitoring of a guest page.
2402 *
2403 * @returns VBox status code.
2404 * @retval VINF_SUCCESS on success.
2405 * @param pPool The pool.
2406 * @param pPage The cached page.
2407 */
2408static int pgmPoolMonitorInsert(PPGMPOOL pPool, PPGMPOOLPAGE pPage)
2409{
2410 LogFlow(("pgmPoolMonitorInsert %RGp\n", pPage->GCPhys & ~(RTGCPHYS)PAGE_OFFSET_MASK));
2411
2412 /*
2413 * Filter out the relevant kinds.
2414 */
2415 switch (pPage->enmKind)
2416 {
2417 case PGMPOOLKIND_32BIT_PT_FOR_32BIT_PT:
2418 case PGMPOOLKIND_PAE_PT_FOR_32BIT_PT:
2419 case PGMPOOLKIND_PAE_PD_FOR_PAE_PD:
2420 case PGMPOOLKIND_PAE_PT_FOR_PAE_PT:
2421 case PGMPOOLKIND_64BIT_PD_FOR_64BIT_PD:
2422 case PGMPOOLKIND_64BIT_PDPT_FOR_64BIT_PDPT:
2423 case PGMPOOLKIND_64BIT_PML4:
2424 case PGMPOOLKIND_PAE_PD0_FOR_32BIT_PD:
2425 case PGMPOOLKIND_PAE_PD1_FOR_32BIT_PD:
2426 case PGMPOOLKIND_PAE_PD2_FOR_32BIT_PD:
2427 case PGMPOOLKIND_PAE_PD3_FOR_32BIT_PD:
2428 case PGMPOOLKIND_32BIT_PD:
2429 case PGMPOOLKIND_PAE_PDPT:
2430 break;
2431
2432 case PGMPOOLKIND_32BIT_PT_FOR_32BIT_4MB:
2433 case PGMPOOLKIND_PAE_PT_FOR_32BIT_4MB:
2434 case PGMPOOLKIND_PAE_PT_FOR_PAE_2MB:
2435 case PGMPOOLKIND_32BIT_PT_FOR_PHYS:
2436 case PGMPOOLKIND_PAE_PT_FOR_PHYS:
2437 case PGMPOOLKIND_64BIT_PDPT_FOR_PHYS:
2438 case PGMPOOLKIND_64BIT_PD_FOR_PHYS:
2439 case PGMPOOLKIND_EPT_PDPT_FOR_PHYS:
2440 case PGMPOOLKIND_EPT_PD_FOR_PHYS:
2441 case PGMPOOLKIND_EPT_PT_FOR_PHYS:
2442 case PGMPOOLKIND_ROOT_NESTED:
2443 /* Nothing to monitor here. */
2444 return VINF_SUCCESS;
2445
2446 case PGMPOOLKIND_32BIT_PD_PHYS:
2447 case PGMPOOLKIND_PAE_PDPT_PHYS:
2448 case PGMPOOLKIND_PAE_PD_PHYS:
2449 case PGMPOOLKIND_PAE_PDPT_FOR_32BIT:
2450 /* Nothing to monitor here. */
2451 return VINF_SUCCESS;
2452 default:
2453 AssertFatalMsgFailed(("This can't happen! enmKind=%d\n", pPage->enmKind));
2454 }
2455
2456 /*
2457 * Install handler.
2458 */
2459 int rc;
2460 PPGMPOOLPAGE pPageHead = pgmPoolMonitorGetPageByGCPhys(pPool, pPage);
2461 if (pPageHead)
2462 {
2463 Assert(pPageHead != pPage); Assert(pPageHead->iMonitoredNext != pPage->idx);
2464 Assert(pPageHead->iMonitoredPrev != pPage->idx);
2465
2466#ifdef PGMPOOL_WITH_OPTIMIZED_DIRTY_PT
2467 if (pPageHead->fDirty)
2468 pgmPoolFlushDirtyPage(pPool->CTX_SUFF(pVM), pPool, pPageHead->idxDirtyEntry, false /* do not remove */);
2469#endif
2470
2471 pPage->iMonitoredPrev = pPageHead->idx;
2472 pPage->iMonitoredNext = pPageHead->iMonitoredNext;
2473 if (pPageHead->iMonitoredNext != NIL_PGMPOOL_IDX)
2474 pPool->aPages[pPageHead->iMonitoredNext].iMonitoredPrev = pPage->idx;
2475 pPageHead->iMonitoredNext = pPage->idx;
2476 rc = VINF_SUCCESS;
2477 }
2478 else
2479 {
2480 Assert(pPage->iMonitoredNext == NIL_PGMPOOL_IDX); Assert(pPage->iMonitoredPrev == NIL_PGMPOOL_IDX);
2481 PVM pVM = pPool->CTX_SUFF(pVM);
2482 const RTGCPHYS GCPhysPage = pPage->GCPhys & ~(RTGCPHYS)PAGE_OFFSET_MASK;
2483 rc = PGMHandlerPhysicalRegisterEx(pVM, PGMPHYSHANDLERTYPE_PHYSICAL_WRITE,
2484 GCPhysPage, GCPhysPage + PAGE_OFFSET_MASK,
2485 pPool->pfnAccessHandlerR3, MMHyperCCToR3(pVM, pPage),
2486 pPool->pfnAccessHandlerR0, MMHyperCCToR0(pVM, pPage),
2487 pPool->pfnAccessHandlerRC, MMHyperCCToRC(pVM, pPage),
2488 pPool->pszAccessHandler);
2489 /** @todo we should probably deal with out-of-memory conditions here, but for now increasing
2490 * the heap size should suffice. */
2491 AssertFatalMsgRC(rc, ("PGMHandlerPhysicalRegisterEx %RGp failed with %Rrc\n", GCPhysPage, rc));
2492 PVMCPU pVCpu = VMMGetCpu(pVM);
2493 AssertFatalMsg(!(pVCpu->pgm.s.fSyncFlags & PGM_SYNC_CLEAR_PGM_POOL) || VMCPU_FF_ISSET(pVCpu, VMCPU_FF_PGM_SYNC_CR3), ("fSyncFlags=%x syncff=%d\n", pVCpu->pgm.s.fSyncFlags, VMCPU_FF_ISSET(pVCpu, VMCPU_FF_PGM_SYNC_CR3)));
2494 }
2495 pPage->fMonitored = true;
2496 return rc;
2497}
2498
2499
2500/**
2501 * Disables write monitoring of a guest page.
2502 *
2503 * @returns VBox status code.
2504 * @retval VINF_SUCCESS on success.
2505 * @param pPool The pool.
2506 * @param pPage The cached page.
2507 */
2508static int pgmPoolMonitorFlush(PPGMPOOL pPool, PPGMPOOLPAGE pPage)
2509{
2510 /*
2511 * Filter out the relevant kinds.
2512 */
2513 switch (pPage->enmKind)
2514 {
2515 case PGMPOOLKIND_32BIT_PT_FOR_32BIT_PT:
2516 case PGMPOOLKIND_PAE_PT_FOR_32BIT_PT:
2517 case PGMPOOLKIND_PAE_PD_FOR_PAE_PD:
2518 case PGMPOOLKIND_PAE_PT_FOR_PAE_PT:
2519 case PGMPOOLKIND_64BIT_PD_FOR_64BIT_PD:
2520 case PGMPOOLKIND_64BIT_PDPT_FOR_64BIT_PDPT:
2521 case PGMPOOLKIND_64BIT_PML4:
2522 case PGMPOOLKIND_32BIT_PD:
2523 case PGMPOOLKIND_PAE_PDPT:
2524 case PGMPOOLKIND_PAE_PD0_FOR_32BIT_PD:
2525 case PGMPOOLKIND_PAE_PD1_FOR_32BIT_PD:
2526 case PGMPOOLKIND_PAE_PD2_FOR_32BIT_PD:
2527 case PGMPOOLKIND_PAE_PD3_FOR_32BIT_PD:
2528 break;
2529
2530 case PGMPOOLKIND_32BIT_PT_FOR_32BIT_4MB:
2531 case PGMPOOLKIND_PAE_PT_FOR_32BIT_4MB:
2532 case PGMPOOLKIND_PAE_PT_FOR_PAE_2MB:
2533 case PGMPOOLKIND_32BIT_PT_FOR_PHYS:
2534 case PGMPOOLKIND_PAE_PT_FOR_PHYS:
2535 case PGMPOOLKIND_64BIT_PDPT_FOR_PHYS:
2536 case PGMPOOLKIND_64BIT_PD_FOR_PHYS:
2537 case PGMPOOLKIND_EPT_PDPT_FOR_PHYS:
2538 case PGMPOOLKIND_EPT_PD_FOR_PHYS:
2539 case PGMPOOLKIND_EPT_PT_FOR_PHYS:
2540 case PGMPOOLKIND_ROOT_NESTED:
2541 case PGMPOOLKIND_PAE_PD_PHYS:
2542 case PGMPOOLKIND_PAE_PDPT_PHYS:
2543 case PGMPOOLKIND_32BIT_PD_PHYS:
2544 /* Nothing to monitor here. */
2545 Assert(!pPage->fMonitored);
2546 return VINF_SUCCESS;
2547
2548 default:
2549 AssertFatalMsgFailed(("This can't happen! enmKind=%d\n", pPage->enmKind));
2550 }
2551 Assert(pPage->fMonitored);
2552
2553 /*
2554 * Remove the page from the monitored list or uninstall it if last.
2555 */
2556 const PVM pVM = pPool->CTX_SUFF(pVM);
2557 int rc;
2558 if ( pPage->iMonitoredNext != NIL_PGMPOOL_IDX
2559 || pPage->iMonitoredPrev != NIL_PGMPOOL_IDX)
2560 {
2561 if (pPage->iMonitoredPrev == NIL_PGMPOOL_IDX)
2562 {
2563 PPGMPOOLPAGE pNewHead = &pPool->aPages[pPage->iMonitoredNext];
2564 pNewHead->iMonitoredPrev = NIL_PGMPOOL_IDX;
2565 rc = PGMHandlerPhysicalChangeCallbacks(pVM, pPage->GCPhys & ~(RTGCPHYS)PAGE_OFFSET_MASK,
2566 pPool->pfnAccessHandlerR3, MMHyperCCToR3(pVM, pNewHead),
2567 pPool->pfnAccessHandlerR0, MMHyperCCToR0(pVM, pNewHead),
2568 pPool->pfnAccessHandlerRC, MMHyperCCToRC(pVM, pNewHead),
2569 pPool->pszAccessHandler);
2570 AssertFatalRCSuccess(rc);
2571 pPage->iMonitoredNext = NIL_PGMPOOL_IDX;
2572 }
2573 else
2574 {
2575 pPool->aPages[pPage->iMonitoredPrev].iMonitoredNext = pPage->iMonitoredNext;
2576 if (pPage->iMonitoredNext != NIL_PGMPOOL_IDX)
2577 {
2578 pPool->aPages[pPage->iMonitoredNext].iMonitoredPrev = pPage->iMonitoredPrev;
2579 pPage->iMonitoredNext = NIL_PGMPOOL_IDX;
2580 }
2581 pPage->iMonitoredPrev = NIL_PGMPOOL_IDX;
2582 rc = VINF_SUCCESS;
2583 }
2584 }
2585 else
2586 {
2587 rc = PGMHandlerPhysicalDeregister(pVM, pPage->GCPhys & ~(RTGCPHYS)PAGE_OFFSET_MASK);
2588 AssertFatalRC(rc);
2589 PVMCPU pVCpu = VMMGetCpu(pVM);
2590 AssertFatalMsg(!(pVCpu->pgm.s.fSyncFlags & PGM_SYNC_CLEAR_PGM_POOL) || VMCPU_FF_ISSET(pVCpu, VMCPU_FF_PGM_SYNC_CR3),
2591 ("%#x %#x\n", pVCpu->pgm.s.fSyncFlags, pVM->fGlobalForcedActions));
2592 }
2593 pPage->fMonitored = false;
2594
2595 /*
2596 * Remove it from the list of modified pages (if in it).
2597 */
2598 pgmPoolMonitorModifiedRemove(pPool, pPage);
2599
2600 return rc;
2601}
2602
2603
2604/**
2605 * Inserts the page into the list of modified pages.
2606 *
2607 * @param pPool The pool.
2608 * @param pPage The page.
2609 */
2610void pgmPoolMonitorModifiedInsert(PPGMPOOL pPool, PPGMPOOLPAGE pPage)
2611{
2612 Log3(("pgmPoolMonitorModifiedInsert: idx=%d\n", pPage->idx));
2613 AssertMsg( pPage->iModifiedNext == NIL_PGMPOOL_IDX
2614 && pPage->iModifiedPrev == NIL_PGMPOOL_IDX
2615 && pPool->iModifiedHead != pPage->idx,
2616 ("Next=%d Prev=%d idx=%d cModifications=%d Head=%d cModifiedPages=%d\n",
2617 pPage->iModifiedNext, pPage->iModifiedPrev, pPage->idx, pPage->cModifications,
2618 pPool->iModifiedHead, pPool->cModifiedPages));
2619
2620 pPage->iModifiedNext = pPool->iModifiedHead;
2621 if (pPool->iModifiedHead != NIL_PGMPOOL_IDX)
2622 pPool->aPages[pPool->iModifiedHead].iModifiedPrev = pPage->idx;
2623 pPool->iModifiedHead = pPage->idx;
2624 pPool->cModifiedPages++;
2625#ifdef VBOX_WITH_STATISTICS
2626 if (pPool->cModifiedPages > pPool->cModifiedPagesHigh)
2627 pPool->cModifiedPagesHigh = pPool->cModifiedPages;
2628#endif
2629}
2630
2631
2632/**
2633 * Removes the page from the list of modified pages and resets the
2634 * modification counter.
2635 *
2636 * @param pPool The pool.
2637 * @param pPage The page which is believed to be in the list of modified pages.
2638 */
2639static void pgmPoolMonitorModifiedRemove(PPGMPOOL pPool, PPGMPOOLPAGE pPage)
2640{
2641 Log3(("pgmPoolMonitorModifiedRemove: idx=%d cModifications=%d\n", pPage->idx, pPage->cModifications));
2642 if (pPool->iModifiedHead == pPage->idx)
2643 {
2644 Assert(pPage->iModifiedPrev == NIL_PGMPOOL_IDX);
2645 pPool->iModifiedHead = pPage->iModifiedNext;
2646 if (pPage->iModifiedNext != NIL_PGMPOOL_IDX)
2647 {
2648 pPool->aPages[pPage->iModifiedNext].iModifiedPrev = NIL_PGMPOOL_IDX;
2649 pPage->iModifiedNext = NIL_PGMPOOL_IDX;
2650 }
2651 pPool->cModifiedPages--;
2652 }
2653 else if (pPage->iModifiedPrev != NIL_PGMPOOL_IDX)
2654 {
2655 pPool->aPages[pPage->iModifiedPrev].iModifiedNext = pPage->iModifiedNext;
2656 if (pPage->iModifiedNext != NIL_PGMPOOL_IDX)
2657 {
2658 pPool->aPages[pPage->iModifiedNext].iModifiedPrev = pPage->iModifiedPrev;
2659 pPage->iModifiedNext = NIL_PGMPOOL_IDX;
2660 }
2661 pPage->iModifiedPrev = NIL_PGMPOOL_IDX;
2662 pPool->cModifiedPages--;
2663 }
2664 else
2665 Assert(pPage->iModifiedPrev == NIL_PGMPOOL_IDX);
2666 pPage->cModifications = 0;
2667}
2668
2669
2670/**
2671 * Zaps the list of modified pages, resetting their modification counters in the process.
2672 *
2673 * @param pVM Pointer to the VM.
2674 */
2675static void pgmPoolMonitorModifiedClearAll(PVM pVM)
2676{
2677 pgmLock(pVM);
2678 PPGMPOOL pPool = pVM->pgm.s.CTX_SUFF(pPool);
2679 LogFlow(("pgmPoolMonitorModifiedClearAll: cModifiedPages=%d\n", pPool->cModifiedPages));
2680
2681 unsigned cPages = 0; NOREF(cPages);
2682
2683#ifdef PGMPOOL_WITH_OPTIMIZED_DIRTY_PT
2684 pgmPoolResetDirtyPages(pVM);
2685#endif
2686
2687 uint16_t idx = pPool->iModifiedHead;
2688 pPool->iModifiedHead = NIL_PGMPOOL_IDX;
2689 while (idx != NIL_PGMPOOL_IDX)
2690 {
2691 PPGMPOOLPAGE pPage = &pPool->aPages[idx];
2692 idx = pPage->iModifiedNext;
2693 pPage->iModifiedNext = NIL_PGMPOOL_IDX;
2694 pPage->iModifiedPrev = NIL_PGMPOOL_IDX;
2695 pPage->cModifications = 0;
2696 Assert(++cPages);
2697 }
2698 AssertMsg(cPages == pPool->cModifiedPages, ("%d != %d\n", cPages, pPool->cModifiedPages));
2699 pPool->cModifiedPages = 0;
2700 pgmUnlock(pVM);
2701}
2702
2703
2704/**
2705 * Handle SyncCR3 pool tasks
2706 *
2707 * @returns VBox status code.
2708 * @retval VINF_SUCCESS if successfully added.
2709 * @retval VINF_PGM_SYNC_CR3 is it needs to be deferred to ring 3 (GC only)
2710 * @param pVCpu Pointer to the VMCPU.
2711 * @remark Should only be used when monitoring is available, thus placed in
2712 * the PGMPOOL_WITH_MONITORING #ifdef.
2713 */
2714int pgmPoolSyncCR3(PVMCPU pVCpu)
2715{
2716 PVM pVM = pVCpu->CTX_SUFF(pVM);
2717 LogFlow(("pgmPoolSyncCR3 fSyncFlags=%x\n", pVCpu->pgm.s.fSyncFlags));
2718
2719 /*
2720 * When monitoring shadowed pages, we reset the modification counters on CR3 sync.
2721 * Occasionally we will have to clear all the shadow page tables because we wanted
2722 * to monitor a page which was mapped by too many shadowed page tables. This operation
2723 * sometimes referred to as a 'lightweight flush'.
2724 */
2725# ifdef IN_RING3 /* Don't flush in ring-0 or raw mode, it's taking too long. */
2726 if (pVCpu->pgm.s.fSyncFlags & PGM_SYNC_CLEAR_PGM_POOL)
2727 pgmR3PoolClearAll(pVM, false /*fFlushRemTlb*/);
2728# else /* !IN_RING3 */
2729 if (pVCpu->pgm.s.fSyncFlags & PGM_SYNC_CLEAR_PGM_POOL)
2730 {
2731 Log(("SyncCR3: PGM_SYNC_CLEAR_PGM_POOL is set -> VINF_PGM_SYNC_CR3\n"));
2732 VMCPU_FF_SET(pVCpu, VMCPU_FF_PGM_SYNC_CR3); /** @todo no need to do global sync, right? */
2733
2734 /* Make sure all other VCPUs return to ring 3. */
2735 if (pVM->cCpus > 1)
2736 {
2737 VM_FF_SET(pVM, VM_FF_PGM_POOL_FLUSH_PENDING);
2738 PGM_INVL_ALL_VCPU_TLBS(pVM);
2739 }
2740 return VINF_PGM_SYNC_CR3;
2741 }
2742# endif /* !IN_RING3 */
2743 else
2744 {
2745 pgmPoolMonitorModifiedClearAll(pVM);
2746
2747 /* pgmPoolMonitorModifiedClearAll can cause a pgm pool flush (dirty page clearing), so make sure we handle this! */
2748 if (pVCpu->pgm.s.fSyncFlags & PGM_SYNC_CLEAR_PGM_POOL)
2749 {
2750 Log(("pgmPoolMonitorModifiedClearAll caused a pgm flush -> call pgmPoolSyncCR3 again!\n"));
2751 return pgmPoolSyncCR3(pVCpu);
2752 }
2753 }
2754 return VINF_SUCCESS;
2755}
2756
2757
2758/**
2759 * Frees up at least one user entry.
2760 *
2761 * @returns VBox status code.
2762 * @retval VINF_SUCCESS if successfully added.
2763 * @retval VERR_PGM_POOL_FLUSHED if the pool was flushed.
2764 * @param pPool The pool.
2765 * @param iUser The user index.
2766 */
2767static int pgmPoolTrackFreeOneUser(PPGMPOOL pPool, uint16_t iUser)
2768{
2769 STAM_COUNTER_INC(&pPool->StatTrackFreeUpOneUser);
2770 /*
2771 * Just free cached pages in a braindead fashion.
2772 */
2773 /** @todo walk the age list backwards and free the first with usage. */
2774 int rc = VINF_SUCCESS;
2775 do
2776 {
2777 int rc2 = pgmPoolCacheFreeOne(pPool, iUser);
2778 if (RT_FAILURE(rc2) && rc == VINF_SUCCESS)
2779 rc = rc2;
2780 } while (pPool->iUserFreeHead == NIL_PGMPOOL_USER_INDEX);
2781 return rc;
2782}
2783
2784
2785/**
2786 * Inserts a page into the cache.
2787 *
2788 * This will create user node for the page, insert it into the GCPhys
2789 * hash, and insert it into the age list.
2790 *
2791 * @returns VBox status code.
2792 * @retval VINF_SUCCESS if successfully added.
2793 * @retval VERR_PGM_POOL_FLUSHED if the pool was flushed.
2794 * @param pPool The pool.
2795 * @param pPage The cached page.
2796 * @param GCPhys The GC physical address of the page we're gonna shadow.
2797 * @param iUser The user index.
2798 * @param iUserTable The user table index.
2799 */
2800DECLINLINE(int) pgmPoolTrackInsert(PPGMPOOL pPool, PPGMPOOLPAGE pPage, RTGCPHYS GCPhys, uint16_t iUser, uint32_t iUserTable)
2801{
2802 int rc = VINF_SUCCESS;
2803 PPGMPOOLUSER paUsers = pPool->CTX_SUFF(paUsers);
2804
2805 LogFlow(("pgmPoolTrackInsert GCPhys=%RGp iUser=%d iUserTable=%x\n", GCPhys, iUser, iUserTable));
2806
2807#ifdef VBOX_STRICT
2808 /*
2809 * Check that the entry doesn't already exists.
2810 */
2811 if (pPage->iUserHead != NIL_PGMPOOL_USER_INDEX)
2812 {
2813 uint16_t i = pPage->iUserHead;
2814 do
2815 {
2816 Assert(i < pPool->cMaxUsers);
2817 AssertMsg(paUsers[i].iUser != iUser || paUsers[i].iUserTable != iUserTable, ("%x %x vs new %x %x\n", paUsers[i].iUser, paUsers[i].iUserTable, iUser, iUserTable));
2818 i = paUsers[i].iNext;
2819 } while (i != NIL_PGMPOOL_USER_INDEX);
2820 }
2821#endif
2822
2823 /*
2824 * Find free a user node.
2825 */
2826 uint16_t i = pPool->iUserFreeHead;
2827 if (i == NIL_PGMPOOL_USER_INDEX)
2828 {
2829 rc = pgmPoolTrackFreeOneUser(pPool, iUser);
2830 if (RT_FAILURE(rc))
2831 return rc;
2832 i = pPool->iUserFreeHead;
2833 }
2834
2835 /*
2836 * Unlink the user node from the free list,
2837 * initialize and insert it into the user list.
2838 */
2839 pPool->iUserFreeHead = paUsers[i].iNext;
2840 paUsers[i].iNext = NIL_PGMPOOL_USER_INDEX;
2841 paUsers[i].iUser = iUser;
2842 paUsers[i].iUserTable = iUserTable;
2843 pPage->iUserHead = i;
2844
2845 /*
2846 * Insert into cache and enable monitoring of the guest page if enabled.
2847 *
2848 * Until we implement caching of all levels, including the CR3 one, we'll
2849 * have to make sure we don't try monitor & cache any recursive reuse of
2850 * a monitored CR3 page. Because all windows versions are doing this we'll
2851 * have to be able to do combined access monitoring, CR3 + PT and
2852 * PD + PT (guest PAE).
2853 *
2854 * Update:
2855 * We're now cooperating with the CR3 monitor if an uncachable page is found.
2856 */
2857 const bool fCanBeMonitored = true;
2858 pgmPoolCacheInsert(pPool, pPage, fCanBeMonitored); /* This can be expanded. */
2859 if (fCanBeMonitored)
2860 {
2861 rc = pgmPoolMonitorInsert(pPool, pPage);
2862 AssertRC(rc);
2863 }
2864 return rc;
2865}
2866
2867
2868/**
2869 * Adds a user reference to a page.
2870 *
2871 * This will move the page to the head of the
2872 *
2873 * @returns VBox status code.
2874 * @retval VINF_SUCCESS if successfully added.
2875 * @retval VERR_PGM_POOL_FLUSHED if the pool was flushed.
2876 * @param pPool The pool.
2877 * @param pPage The cached page.
2878 * @param iUser The user index.
2879 * @param iUserTable The user table.
2880 */
2881static int pgmPoolTrackAddUser(PPGMPOOL pPool, PPGMPOOLPAGE pPage, uint16_t iUser, uint32_t iUserTable)
2882{
2883 PPGMPOOLUSER paUsers = pPool->CTX_SUFF(paUsers);
2884
2885 Log3(("pgmPoolTrackAddUser GCPhys = %RGp iUser %x iUserTable %x\n", pPage->GCPhys, iUser, iUserTable));
2886
2887# ifdef VBOX_STRICT
2888 /*
2889 * Check that the entry doesn't already exists. We only allow multiple
2890 * users of top-level paging structures (SHW_POOL_ROOT_IDX).
2891 */
2892 if (pPage->iUserHead != NIL_PGMPOOL_USER_INDEX)
2893 {
2894 uint16_t i = pPage->iUserHead;
2895 do
2896 {
2897 Assert(i < pPool->cMaxUsers);
2898 AssertMsg(iUser != PGMPOOL_IDX_PD || iUser != PGMPOOL_IDX_PDPT || iUser != PGMPOOL_IDX_NESTED_ROOT || iUser != PGMPOOL_IDX_AMD64_CR3 ||
2899 paUsers[i].iUser != iUser || paUsers[i].iUserTable != iUserTable, ("%x %x vs new %x %x\n", paUsers[i].iUser, paUsers[i].iUserTable, iUser, iUserTable));
2900 i = paUsers[i].iNext;
2901 } while (i != NIL_PGMPOOL_USER_INDEX);
2902 }
2903# endif
2904
2905 /*
2906 * Allocate a user node.
2907 */
2908 uint16_t i = pPool->iUserFreeHead;
2909 if (i == NIL_PGMPOOL_USER_INDEX)
2910 {
2911 int rc = pgmPoolTrackFreeOneUser(pPool, iUser);
2912 if (RT_FAILURE(rc))
2913 return rc;
2914 i = pPool->iUserFreeHead;
2915 }
2916 pPool->iUserFreeHead = paUsers[i].iNext;
2917
2918 /*
2919 * Initialize the user node and insert it.
2920 */
2921 paUsers[i].iNext = pPage->iUserHead;
2922 paUsers[i].iUser = iUser;
2923 paUsers[i].iUserTable = iUserTable;
2924 pPage->iUserHead = i;
2925
2926# ifdef PGMPOOL_WITH_OPTIMIZED_DIRTY_PT
2927 if (pPage->fDirty)
2928 pgmPoolFlushDirtyPage(pPool->CTX_SUFF(pVM), pPool, pPage->idxDirtyEntry, false /* do not remove */);
2929# endif
2930
2931 /*
2932 * Tell the cache to update its replacement stats for this page.
2933 */
2934 pgmPoolCacheUsed(pPool, pPage);
2935 return VINF_SUCCESS;
2936}
2937
2938
2939/**
2940 * Frees a user record associated with a page.
2941 *
2942 * This does not clear the entry in the user table, it simply replaces the
2943 * user record to the chain of free records.
2944 *
2945 * @param pPool The pool.
2946 * @param HCPhys The HC physical address of the shadow page.
2947 * @param iUser The shadow page pool index of the user table.
2948 * @param iUserTable The index into the user table (shadowed).
2949 */
2950static void pgmPoolTrackFreeUser(PPGMPOOL pPool, PPGMPOOLPAGE pPage, uint16_t iUser, uint32_t iUserTable)
2951{
2952 /*
2953 * Unlink and free the specified user entry.
2954 */
2955 PPGMPOOLUSER paUsers = pPool->CTX_SUFF(paUsers);
2956
2957 Log3(("pgmPoolTrackFreeUser %RGp %x %x\n", pPage->GCPhys, iUser, iUserTable));
2958 /* Special: For PAE and 32-bit paging, there is usually no more than one user. */
2959 uint16_t i = pPage->iUserHead;
2960 if ( i != NIL_PGMPOOL_USER_INDEX
2961 && paUsers[i].iUser == iUser
2962 && paUsers[i].iUserTable == iUserTable)
2963 {
2964 pPage->iUserHead = paUsers[i].iNext;
2965
2966 paUsers[i].iUser = NIL_PGMPOOL_IDX;
2967 paUsers[i].iNext = pPool->iUserFreeHead;
2968 pPool->iUserFreeHead = i;
2969 return;
2970 }
2971
2972 /* General: Linear search. */
2973 uint16_t iPrev = NIL_PGMPOOL_USER_INDEX;
2974 while (i != NIL_PGMPOOL_USER_INDEX)
2975 {
2976 if ( paUsers[i].iUser == iUser
2977 && paUsers[i].iUserTable == iUserTable)
2978 {
2979 if (iPrev != NIL_PGMPOOL_USER_INDEX)
2980 paUsers[iPrev].iNext = paUsers[i].iNext;
2981 else
2982 pPage->iUserHead = paUsers[i].iNext;
2983
2984 paUsers[i].iUser = NIL_PGMPOOL_IDX;
2985 paUsers[i].iNext = pPool->iUserFreeHead;
2986 pPool->iUserFreeHead = i;
2987 return;
2988 }
2989 iPrev = i;
2990 i = paUsers[i].iNext;
2991 }
2992
2993 /* Fatal: didn't find it */
2994 AssertFatalMsgFailed(("Didn't find the user entry! iUser=%d iUserTable=%#x GCPhys=%RGp\n",
2995 iUser, iUserTable, pPage->GCPhys));
2996}
2997
2998
2999/**
3000 * Gets the entry size of a shadow table.
3001 *
3002 * @param enmKind The kind of page.
3003 *
3004 * @returns The size of the entry in bytes. That is, 4 or 8.
3005 * @returns If the kind is not for a table, an assertion is raised and 0 is
3006 * returned.
3007 */
3008DECLINLINE(unsigned) pgmPoolTrackGetShadowEntrySize(PGMPOOLKIND enmKind)
3009{
3010 switch (enmKind)
3011 {
3012 case PGMPOOLKIND_32BIT_PT_FOR_32BIT_PT:
3013 case PGMPOOLKIND_32BIT_PT_FOR_PHYS:
3014 case PGMPOOLKIND_32BIT_PT_FOR_32BIT_4MB:
3015 case PGMPOOLKIND_32BIT_PD:
3016 case PGMPOOLKIND_32BIT_PD_PHYS:
3017 return 4;
3018
3019 case PGMPOOLKIND_PAE_PT_FOR_PHYS:
3020 case PGMPOOLKIND_PAE_PT_FOR_32BIT_PT:
3021 case PGMPOOLKIND_PAE_PT_FOR_32BIT_4MB:
3022 case PGMPOOLKIND_PAE_PT_FOR_PAE_PT:
3023 case PGMPOOLKIND_PAE_PT_FOR_PAE_2MB:
3024 case PGMPOOLKIND_PAE_PD0_FOR_32BIT_PD:
3025 case PGMPOOLKIND_PAE_PD1_FOR_32BIT_PD:
3026 case PGMPOOLKIND_PAE_PD2_FOR_32BIT_PD:
3027 case PGMPOOLKIND_PAE_PD3_FOR_32BIT_PD:
3028 case PGMPOOLKIND_PAE_PD_FOR_PAE_PD:
3029 case PGMPOOLKIND_64BIT_PD_FOR_64BIT_PD:
3030 case PGMPOOLKIND_64BIT_PDPT_FOR_64BIT_PDPT:
3031 case PGMPOOLKIND_64BIT_PML4:
3032 case PGMPOOLKIND_PAE_PDPT:
3033 case PGMPOOLKIND_ROOT_NESTED:
3034 case PGMPOOLKIND_64BIT_PDPT_FOR_PHYS:
3035 case PGMPOOLKIND_64BIT_PD_FOR_PHYS:
3036 case PGMPOOLKIND_EPT_PDPT_FOR_PHYS:
3037 case PGMPOOLKIND_EPT_PD_FOR_PHYS:
3038 case PGMPOOLKIND_EPT_PT_FOR_PHYS:
3039 case PGMPOOLKIND_PAE_PD_PHYS:
3040 case PGMPOOLKIND_PAE_PDPT_PHYS:
3041 return 8;
3042
3043 default:
3044 AssertFatalMsgFailed(("enmKind=%d\n", enmKind));
3045 }
3046}
3047
3048
3049/**
3050 * Gets the entry size of a guest table.
3051 *
3052 * @param enmKind The kind of page.
3053 *
3054 * @returns The size of the entry in bytes. That is, 0, 4 or 8.
3055 * @returns If the kind is not for a table, an assertion is raised and 0 is
3056 * returned.
3057 */
3058DECLINLINE(unsigned) pgmPoolTrackGetGuestEntrySize(PGMPOOLKIND enmKind)
3059{
3060 switch (enmKind)
3061 {
3062 case PGMPOOLKIND_32BIT_PT_FOR_32BIT_PT:
3063 case PGMPOOLKIND_32BIT_PT_FOR_32BIT_4MB:
3064 case PGMPOOLKIND_32BIT_PD:
3065 case PGMPOOLKIND_PAE_PT_FOR_32BIT_PT:
3066 case PGMPOOLKIND_PAE_PT_FOR_32BIT_4MB:
3067 case PGMPOOLKIND_PAE_PD0_FOR_32BIT_PD:
3068 case PGMPOOLKIND_PAE_PD1_FOR_32BIT_PD:
3069 case PGMPOOLKIND_PAE_PD2_FOR_32BIT_PD:
3070 case PGMPOOLKIND_PAE_PD3_FOR_32BIT_PD:
3071 return 4;
3072
3073 case PGMPOOLKIND_PAE_PT_FOR_PAE_PT:
3074 case PGMPOOLKIND_PAE_PT_FOR_PAE_2MB:
3075 case PGMPOOLKIND_PAE_PD_FOR_PAE_PD:
3076 case PGMPOOLKIND_64BIT_PD_FOR_64BIT_PD:
3077 case PGMPOOLKIND_64BIT_PDPT_FOR_64BIT_PDPT:
3078 case PGMPOOLKIND_64BIT_PML4:
3079 case PGMPOOLKIND_PAE_PDPT:
3080 return 8;
3081
3082 case PGMPOOLKIND_32BIT_PT_FOR_PHYS:
3083 case PGMPOOLKIND_PAE_PT_FOR_PHYS:
3084 case PGMPOOLKIND_64BIT_PDPT_FOR_PHYS:
3085 case PGMPOOLKIND_64BIT_PD_FOR_PHYS:
3086 case PGMPOOLKIND_EPT_PDPT_FOR_PHYS:
3087 case PGMPOOLKIND_EPT_PD_FOR_PHYS:
3088 case PGMPOOLKIND_EPT_PT_FOR_PHYS:
3089 case PGMPOOLKIND_ROOT_NESTED:
3090 case PGMPOOLKIND_PAE_PD_PHYS:
3091 case PGMPOOLKIND_PAE_PDPT_PHYS:
3092 case PGMPOOLKIND_32BIT_PD_PHYS:
3093 /** @todo can we return 0? (nobody is calling this...) */
3094 AssertFailed();
3095 return 0;
3096
3097 default:
3098 AssertFatalMsgFailed(("enmKind=%d\n", enmKind));
3099 }
3100}
3101
3102
3103/**
3104 * Checks one shadow page table entry for a mapping of a physical page.
3105 *
3106 * @returns true / false indicating removal of all relevant PTEs
3107 *
3108 * @param pVM Pointer to the VM.
3109 * @param pPhysPage The guest page in question.
3110 * @param fFlushPTEs Flush PTEs or allow them to be updated (e.g. in case of an RW bit change)
3111 * @param iShw The shadow page table.
3112 * @param iPte Page table entry or NIL_PGMPOOL_PHYSEXT_IDX_PTE if unknown
3113 */
3114static bool pgmPoolTrackFlushGCPhysPTInt(PVM pVM, PCPGMPAGE pPhysPage, bool fFlushPTEs, uint16_t iShw, uint16_t iPte)
3115{
3116 LogFlow(("pgmPoolTrackFlushGCPhysPTInt: pPhysPage=%RHp iShw=%d iPte=%d\n", PGM_PAGE_GET_HCPHYS(pPhysPage), iShw, iPte));
3117 PPGMPOOL pPool = pVM->pgm.s.CTX_SUFF(pPool);
3118 bool fRet = false;
3119
3120 /*
3121 * Assert sanity.
3122 */
3123 Assert(iPte != NIL_PGMPOOL_PHYSEXT_IDX_PTE);
3124 AssertFatalMsg(iShw < pPool->cCurPages && iShw != NIL_PGMPOOL_IDX, ("iShw=%d\n", iShw));
3125 PPGMPOOLPAGE pPage = &pPool->aPages[iShw];
3126
3127 /*
3128 * Then, clear the actual mappings to the page in the shadow PT.
3129 */
3130 switch (pPage->enmKind)
3131 {
3132 case PGMPOOLKIND_32BIT_PT_FOR_32BIT_PT:
3133 case PGMPOOLKIND_32BIT_PT_FOR_32BIT_4MB:
3134 case PGMPOOLKIND_32BIT_PT_FOR_PHYS:
3135 {
3136 const uint32_t u32 = PGM_PAGE_GET_HCPHYS(pPhysPage) | X86_PTE_P;
3137 PX86PT pPT = (PX86PT)PGMPOOL_PAGE_2_PTR(pVM, pPage);
3138 uint32_t u32AndMask = 0;
3139 uint32_t u32OrMask = 0;
3140
3141 if (!fFlushPTEs)
3142 {
3143 switch (PGM_PAGE_GET_HNDL_PHYS_STATE(pPhysPage))
3144 {
3145 case PGM_PAGE_HNDL_PHYS_STATE_NONE: /** No handler installed. */
3146 case PGM_PAGE_HNDL_PHYS_STATE_DISABLED: /** Monitoring is temporarily disabled. */
3147 u32OrMask = X86_PTE_RW;
3148 u32AndMask = UINT32_MAX;
3149 fRet = true;
3150 STAM_COUNTER_INC(&pPool->StatTrackFlushEntryKeep);
3151 break;
3152
3153 case PGM_PAGE_HNDL_PHYS_STATE_WRITE: /** Write access is monitored. */
3154 u32OrMask = 0;
3155 u32AndMask = ~X86_PTE_RW;
3156 fRet = true;
3157 STAM_COUNTER_INC(&pPool->StatTrackFlushEntryKeep);
3158 break;
3159 default:
3160 /* (shouldn't be here, will assert below) */
3161 STAM_COUNTER_INC(&pPool->StatTrackFlushEntry);
3162 break;
3163 }
3164 }
3165 else
3166 STAM_COUNTER_INC(&pPool->StatTrackFlushEntry);
3167
3168 /* Update the counter if we're removing references. */
3169 if (!u32AndMask)
3170 {
3171 Assert(pPage->cPresent);
3172 Assert(pPool->cPresent);
3173 pPage->cPresent--;
3174 pPool->cPresent--;
3175 }
3176
3177 if ((pPT->a[iPte].u & (X86_PTE_PG_MASK | X86_PTE_P)) == u32)
3178 {
3179 X86PTE Pte;
3180
3181 Log4(("pgmPoolTrackFlushGCPhysPTs: i=%d pte=%RX32\n", iPte, pPT->a[iPte]));
3182 Pte.u = (pPT->a[iPte].u & u32AndMask) | u32OrMask;
3183 if (Pte.u & PGM_PTFLAGS_TRACK_DIRTY)
3184 Pte.n.u1Write = 0; /* need to disallow writes when dirty bit tracking is still active. */
3185
3186 ASMAtomicWriteU32(&pPT->a[iPte].u, Pte.u);
3187 PGM_DYNMAP_UNUSED_HINT_VM(pVM, pPT);
3188 return fRet;
3189 }
3190#ifdef LOG_ENABLED
3191 Log(("iFirstPresent=%d cPresent=%d\n", pPage->iFirstPresent, pPage->cPresent));
3192 for (unsigned i = 0, cFound = 0; i < RT_ELEMENTS(pPT->a); i++)
3193 if ((pPT->a[i].u & (X86_PTE_PG_MASK | X86_PTE_P)) == u32)
3194 {
3195 Log(("i=%d cFound=%d\n", i, ++cFound));
3196 }
3197#endif
3198 AssertFatalMsgFailed(("iFirstPresent=%d cPresent=%d u32=%RX32 poolkind=%x\n", pPage->iFirstPresent, pPage->cPresent, u32, pPage->enmKind));
3199 /*PGM_DYNMAP_UNUSED_HINT_VM(pVM, pPT);*/
3200 break;
3201 }
3202
3203 case PGMPOOLKIND_PAE_PT_FOR_32BIT_PT:
3204 case PGMPOOLKIND_PAE_PT_FOR_32BIT_4MB:
3205 case PGMPOOLKIND_PAE_PT_FOR_PAE_PT:
3206 case PGMPOOLKIND_PAE_PT_FOR_PAE_2MB:
3207 case PGMPOOLKIND_PAE_PT_FOR_PHYS:
3208 case PGMPOOLKIND_EPT_PT_FOR_PHYS: /* physical mask the same as PAE; RW bit as well; be careful! */
3209 {
3210 const uint64_t u64 = PGM_PAGE_GET_HCPHYS(pPhysPage) | X86_PTE_P;
3211 PPGMSHWPTPAE pPT = (PPGMSHWPTPAE)PGMPOOL_PAGE_2_PTR(pVM, pPage);
3212 uint64_t u64OrMask = 0;
3213 uint64_t u64AndMask = 0;
3214
3215 if (!fFlushPTEs)
3216 {
3217 switch (PGM_PAGE_GET_HNDL_PHYS_STATE(pPhysPage))
3218 {
3219 case PGM_PAGE_HNDL_PHYS_STATE_NONE: /* No handler installed. */
3220 case PGM_PAGE_HNDL_PHYS_STATE_DISABLED: /* Monitoring is temporarily disabled. */
3221 u64OrMask = X86_PTE_RW;
3222 u64AndMask = UINT64_MAX;
3223 fRet = true;
3224 STAM_COUNTER_INC(&pPool->StatTrackFlushEntryKeep);
3225 break;
3226
3227 case PGM_PAGE_HNDL_PHYS_STATE_WRITE: /* Write access is monitored. */
3228 u64OrMask = 0;
3229 u64AndMask = ~(uint64_t)X86_PTE_RW;
3230 fRet = true;
3231 STAM_COUNTER_INC(&pPool->StatTrackFlushEntryKeep);
3232 break;
3233
3234 default:
3235 /* (shouldn't be here, will assert below) */
3236 STAM_COUNTER_INC(&pPool->StatTrackFlushEntry);
3237 break;
3238 }
3239 }
3240 else
3241 STAM_COUNTER_INC(&pPool->StatTrackFlushEntry);
3242
3243 /* Update the counter if we're removing references. */
3244 if (!u64AndMask)
3245 {
3246 Assert(pPage->cPresent);
3247 Assert(pPool->cPresent);
3248 pPage->cPresent--;
3249 pPool->cPresent--;
3250 }
3251
3252 if ((PGMSHWPTEPAE_GET_U(pPT->a[iPte]) & (X86_PTE_PAE_PG_MASK | X86_PTE_P | X86_PTE_PAE_MBZ_MASK_NX)) == u64)
3253 {
3254 X86PTEPAE Pte;
3255
3256 Log4(("pgmPoolTrackFlushGCPhysPTs: i=%d pte=%RX64\n", iPte, PGMSHWPTEPAE_GET_LOG(pPT->a[iPte])));
3257 Pte.u = (PGMSHWPTEPAE_GET_U(pPT->a[iPte]) & u64AndMask) | u64OrMask;
3258 if (Pte.u & PGM_PTFLAGS_TRACK_DIRTY)
3259 Pte.n.u1Write = 0; /* need to disallow writes when dirty bit tracking is still active. */
3260
3261 PGMSHWPTEPAE_ATOMIC_SET(pPT->a[iPte], Pte.u);
3262 PGM_DYNMAP_UNUSED_HINT_VM(pVM, pPT);
3263 return fRet;
3264 }
3265#ifdef LOG_ENABLED
3266 Log(("iFirstPresent=%d cPresent=%d\n", pPage->iFirstPresent, pPage->cPresent));
3267 Log(("Found %RX64 expected %RX64\n", PGMSHWPTEPAE_GET_U(pPT->a[iPte]) & (X86_PTE_PAE_PG_MASK | X86_PTE_P | X86_PTE_PAE_MBZ_MASK_NX), u64));
3268 for (unsigned i = 0, cFound = 0; i < RT_ELEMENTS(pPT->a); i++)
3269 if ((PGMSHWPTEPAE_GET_U(pPT->a[i]) & (X86_PTE_PAE_PG_MASK | X86_PTE_P | X86_PTE_PAE_MBZ_MASK_NX)) == u64)
3270 Log(("i=%d cFound=%d\n", i, ++cFound));
3271#endif
3272 AssertFatalMsgFailed(("iFirstPresent=%d cPresent=%d u64=%RX64 poolkind=%x iPte=%d PT=%RX64\n", pPage->iFirstPresent, pPage->cPresent, u64, pPage->enmKind, iPte, PGMSHWPTEPAE_GET_LOG(pPT->a[iPte])));
3273 /*PGM_DYNMAP_UNUSED_HINT_VM(pVM, pPT);*/
3274 break;
3275 }
3276
3277#ifdef PGM_WITH_LARGE_PAGES
3278 /* Large page case only. */
3279 case PGMPOOLKIND_EPT_PD_FOR_PHYS:
3280 {
3281 Assert(pVM->pgm.s.fNestedPaging);
3282
3283 const uint64_t u64 = PGM_PAGE_GET_HCPHYS(pPhysPage) | X86_PDE4M_P | X86_PDE4M_PS;
3284 PEPTPD pPD = (PEPTPD)PGMPOOL_PAGE_2_PTR(pVM, pPage);
3285
3286 if ((pPD->a[iPte].u & (EPT_PDE2M_PG_MASK | X86_PDE4M_P | X86_PDE4M_PS)) == u64)
3287 {
3288 Log4(("pgmPoolTrackFlushGCPhysPTs: i=%d pde=%RX64\n", iPte, pPD->a[iPte]));
3289 STAM_COUNTER_INC(&pPool->StatTrackFlushEntry);
3290 pPD->a[iPte].u = 0;
3291 PGM_DYNMAP_UNUSED_HINT_VM(pVM, pPD);
3292
3293 /* Update the counter as we're removing references. */
3294 Assert(pPage->cPresent);
3295 Assert(pPool->cPresent);
3296 pPage->cPresent--;
3297 pPool->cPresent--;
3298
3299 return fRet;
3300 }
3301# ifdef LOG_ENABLED
3302 Log(("iFirstPresent=%d cPresent=%d\n", pPage->iFirstPresent, pPage->cPresent));
3303 for (unsigned i = 0, cFound = 0; i < RT_ELEMENTS(pPD->a); i++)
3304 if ((pPD->a[i].u & (EPT_PDE2M_PG_MASK | X86_PDE4M_P | X86_PDE4M_PS)) == u64)
3305 Log(("i=%d cFound=%d\n", i, ++cFound));
3306# endif
3307 AssertFatalMsgFailed(("iFirstPresent=%d cPresent=%d\n", pPage->iFirstPresent, pPage->cPresent));
3308 /*PGM_DYNMAP_UNUSED_HINT_VM(pVM, pPD);*/
3309 break;
3310 }
3311
3312 /* AMD-V nested paging */ /** @todo merge with EPT as we only check the parts that are identical. */
3313 case PGMPOOLKIND_PAE_PD_PHYS:
3314 {
3315 Assert(pVM->pgm.s.fNestedPaging);
3316
3317 const uint64_t u64 = PGM_PAGE_GET_HCPHYS(pPhysPage) | X86_PDE4M_P | X86_PDE4M_PS;
3318 PX86PD pPD = (PX86PD)PGMPOOL_PAGE_2_PTR(pVM, pPage);
3319
3320 if ((pPD->a[iPte].u & (X86_PDE2M_PAE_PG_MASK | X86_PDE4M_P | X86_PDE4M_PS)) == u64)
3321 {
3322 Log4(("pgmPoolTrackFlushGCPhysPTs: i=%d pde=%RX64\n", iPte, pPD->a[iPte]));
3323 STAM_COUNTER_INC(&pPool->StatTrackFlushEntry);
3324 pPD->a[iPte].u = 0;
3325 PGM_DYNMAP_UNUSED_HINT_VM(pVM, pPD);
3326
3327 /* Update the counter as we're removing references. */
3328 Assert(pPage->cPresent);
3329 Assert(pPool->cPresent);
3330 pPage->cPresent--;
3331 pPool->cPresent--;
3332 return fRet;
3333 }
3334# ifdef LOG_ENABLED
3335 Log(("iFirstPresent=%d cPresent=%d\n", pPage->iFirstPresent, pPage->cPresent));
3336 for (unsigned i = 0, cFound = 0; i < RT_ELEMENTS(pPD->a); i++)
3337 if ((pPD->a[i].u & (X86_PDE2M_PAE_PG_MASK | X86_PDE4M_P | X86_PDE4M_PS)) == u64)
3338 Log(("i=%d cFound=%d\n", i, ++cFound));
3339# endif
3340 AssertFatalMsgFailed(("iFirstPresent=%d cPresent=%d\n", pPage->iFirstPresent, pPage->cPresent));
3341 /*PGM_DYNMAP_UNUSED_HINT_VM(pVM, pPD);*/
3342 break;
3343 }
3344#endif /* PGM_WITH_LARGE_PAGES */
3345
3346 default:
3347 AssertFatalMsgFailed(("enmKind=%d iShw=%d\n", pPage->enmKind, iShw));
3348 }
3349
3350 /* not reached. */
3351#ifndef _MSC_VER
3352 return fRet;
3353#endif
3354}
3355
3356
3357/**
3358 * Scans one shadow page table for mappings of a physical page.
3359 *
3360 * @param pVM Pointer to the VM.
3361 * @param pPhysPage The guest page in question.
3362 * @param fFlushPTEs Flush PTEs or allow them to be updated (e.g. in case of an RW bit change)
3363 * @param iShw The shadow page table.
3364 */
3365static void pgmPoolTrackFlushGCPhysPT(PVM pVM, PPGMPAGE pPhysPage, bool fFlushPTEs, uint16_t iShw)
3366{
3367 PPGMPOOL pPool = pVM->pgm.s.CTX_SUFF(pPool); NOREF(pPool);
3368
3369 /* We should only come here with when there's only one reference to this physical page. */
3370 Assert(PGMPOOL_TD_GET_CREFS(PGM_PAGE_GET_TRACKING(pPhysPage)) == 1);
3371
3372 Log2(("pgmPoolTrackFlushGCPhysPT: pPhysPage=%RHp iShw=%d\n", PGM_PAGE_GET_HCPHYS(pPhysPage), iShw));
3373 STAM_PROFILE_START(&pPool->StatTrackFlushGCPhysPT, f);
3374 bool fKeptPTEs = pgmPoolTrackFlushGCPhysPTInt(pVM, pPhysPage, fFlushPTEs, iShw, PGM_PAGE_GET_PTE_INDEX(pPhysPage));
3375 if (!fKeptPTEs)
3376 PGM_PAGE_SET_TRACKING(pVM, pPhysPage, 0);
3377 STAM_PROFILE_STOP(&pPool->StatTrackFlushGCPhysPT, f);
3378}
3379
3380
3381/**
3382 * Flushes a list of shadow page tables mapping the same physical page.
3383 *
3384 * @param pVM Pointer to the VM.
3385 * @param pPhysPage The guest page in question.
3386 * @param fFlushPTEs Flush PTEs or allow them to be updated (e.g. in case of an RW bit change)
3387 * @param iPhysExt The physical cross reference extent list to flush.
3388 */
3389static void pgmPoolTrackFlushGCPhysPTs(PVM pVM, PPGMPAGE pPhysPage, bool fFlushPTEs, uint16_t iPhysExt)
3390{
3391 PGM_LOCK_ASSERT_OWNER(pVM);
3392 PPGMPOOL pPool = pVM->pgm.s.CTX_SUFF(pPool);
3393 bool fKeepList = false;
3394
3395 STAM_PROFILE_START(&pPool->StatTrackFlushGCPhysPTs, f);
3396 Log2(("pgmPoolTrackFlushGCPhysPTs: pPhysPage=%RHp iPhysExt\n", PGM_PAGE_GET_HCPHYS(pPhysPage), iPhysExt));
3397
3398 const uint16_t iPhysExtStart = iPhysExt;
3399 PPGMPOOLPHYSEXT pPhysExt;
3400 do
3401 {
3402 Assert(iPhysExt < pPool->cMaxPhysExts);
3403 pPhysExt = &pPool->CTX_SUFF(paPhysExts)[iPhysExt];
3404 for (unsigned i = 0; i < RT_ELEMENTS(pPhysExt->aidx); i++)
3405 {
3406 if (pPhysExt->aidx[i] != NIL_PGMPOOL_IDX)
3407 {
3408 bool fKeptPTEs = pgmPoolTrackFlushGCPhysPTInt(pVM, pPhysPage, fFlushPTEs, pPhysExt->aidx[i], pPhysExt->apte[i]);
3409 if (!fKeptPTEs)
3410 {
3411 pPhysExt->aidx[i] = NIL_PGMPOOL_IDX;
3412 pPhysExt->apte[i] = NIL_PGMPOOL_PHYSEXT_IDX_PTE;
3413 }
3414 else
3415 fKeepList = true;
3416 }
3417 }
3418 /* next */
3419 iPhysExt = pPhysExt->iNext;
3420 } while (iPhysExt != NIL_PGMPOOL_PHYSEXT_INDEX);
3421
3422 if (!fKeepList)
3423 {
3424 /* insert the list into the free list and clear the ram range entry. */
3425 pPhysExt->iNext = pPool->iPhysExtFreeHead;
3426 pPool->iPhysExtFreeHead = iPhysExtStart;
3427 /* Invalidate the tracking data. */
3428 PGM_PAGE_SET_TRACKING(pVM, pPhysPage, 0);
3429 }
3430
3431 STAM_PROFILE_STOP(&pPool->StatTrackFlushGCPhysPTs, f);
3432}
3433
3434
3435/**
3436 * Flushes all shadow page table mappings of the given guest page.
3437 *
3438 * This is typically called when the host page backing the guest one has been
3439 * replaced or when the page protection was changed due to a guest access
3440 * caught by the monitoring.
3441 *
3442 * @returns VBox status code.
3443 * @retval VINF_SUCCESS if all references has been successfully cleared.
3444 * @retval VINF_PGM_SYNC_CR3 if we're better off with a CR3 sync and a page
3445 * pool cleaning. FF and sync flags are set.
3446 *
3447 * @param pVM Pointer to the VM.
3448 * @param GCPhysPage GC physical address of the page in question
3449 * @param pPhysPage The guest page in question.
3450 * @param fFlushPTEs Flush PTEs or allow them to be updated (e.g. in case of an RW bit change)
3451 * @param pfFlushTLBs This is set to @a true if the shadow TLBs should be
3452 * flushed, it is NOT touched if this isn't necessary.
3453 * The caller MUST initialized this to @a false.
3454 */
3455int pgmPoolTrackUpdateGCPhys(PVM pVM, RTGCPHYS GCPhysPage, PPGMPAGE pPhysPage, bool fFlushPTEs, bool *pfFlushTLBs)
3456{
3457 PVMCPU pVCpu = VMMGetCpu(pVM);
3458 pgmLock(pVM);
3459 int rc = VINF_SUCCESS;
3460
3461#ifdef PGM_WITH_LARGE_PAGES
3462 /* Is this page part of a large page? */
3463 if (PGM_PAGE_GET_PDE_TYPE(pPhysPage) == PGM_PAGE_PDE_TYPE_PDE)
3464 {
3465 RTGCPHYS GCPhysBase = GCPhysPage & X86_PDE2M_PAE_PG_MASK;
3466 GCPhysPage &= X86_PDE_PAE_PG_MASK;
3467
3468 /* Fetch the large page base. */
3469 PPGMPAGE pLargePage;
3470 if (GCPhysBase != GCPhysPage)
3471 {
3472 pLargePage = pgmPhysGetPage(pVM, GCPhysBase);
3473 AssertFatal(pLargePage);
3474 }
3475 else
3476 pLargePage = pPhysPage;
3477
3478 Log(("pgmPoolTrackUpdateGCPhys: update large page PDE for %RGp (%RGp)\n", GCPhysBase, GCPhysPage));
3479
3480 if (PGM_PAGE_GET_PDE_TYPE(pLargePage) == PGM_PAGE_PDE_TYPE_PDE)
3481 {
3482 /* Mark the large page as disabled as we need to break it up to change a single page in the 2 MB range. */
3483 PGM_PAGE_SET_PDE_TYPE(pVM, pLargePage, PGM_PAGE_PDE_TYPE_PDE_DISABLED);
3484 pVM->pgm.s.cLargePagesDisabled++;
3485
3486 /* Update the base as that *only* that one has a reference and there's only one PDE to clear. */
3487 rc = pgmPoolTrackUpdateGCPhys(pVM, GCPhysBase, pLargePage, fFlushPTEs, pfFlushTLBs);
3488
3489 *pfFlushTLBs = true;
3490 pgmUnlock(pVM);
3491 return rc;
3492 }
3493 }
3494#else
3495 NOREF(GCPhysPage);
3496#endif /* PGM_WITH_LARGE_PAGES */
3497
3498 const uint16_t u16 = PGM_PAGE_GET_TRACKING(pPhysPage);
3499 if (u16)
3500 {
3501 /*
3502 * The zero page is currently screwing up the tracking and we'll
3503 * have to flush the whole shebang. Unless VBOX_WITH_NEW_LAZY_PAGE_ALLOC
3504 * is defined, zero pages won't normally be mapped. Some kind of solution
3505 * will be needed for this problem of course, but it will have to wait...
3506 */
3507 if ( PGM_PAGE_IS_ZERO(pPhysPage)
3508 || PGM_PAGE_IS_BALLOONED(pPhysPage))
3509 rc = VINF_PGM_GCPHYS_ALIASED;
3510 else
3511 {
3512# if defined(VBOX_WITH_2X_4GB_ADDR_SPACE_IN_R0) || defined(IN_RC) /** @todo we can drop this now. */
3513 /* Start a subset here because pgmPoolTrackFlushGCPhysPTsSlow and
3514 pgmPoolTrackFlushGCPhysPTs will/may kill the pool otherwise. */
3515 uint32_t iPrevSubset = PGMRZDynMapPushAutoSubset(pVCpu);
3516# endif
3517
3518 if (PGMPOOL_TD_GET_CREFS(u16) != PGMPOOL_TD_CREFS_PHYSEXT)
3519 {
3520 Assert(PGMPOOL_TD_GET_CREFS(u16) == 1);
3521 pgmPoolTrackFlushGCPhysPT(pVM,
3522 pPhysPage,
3523 fFlushPTEs,
3524 PGMPOOL_TD_GET_IDX(u16));
3525 }
3526 else if (u16 != PGMPOOL_TD_MAKE(PGMPOOL_TD_CREFS_PHYSEXT, PGMPOOL_TD_IDX_OVERFLOWED))
3527 pgmPoolTrackFlushGCPhysPTs(pVM, pPhysPage, fFlushPTEs, PGMPOOL_TD_GET_IDX(u16));
3528 else
3529 rc = pgmPoolTrackFlushGCPhysPTsSlow(pVM, pPhysPage);
3530 *pfFlushTLBs = true;
3531
3532# if defined(VBOX_WITH_2X_4GB_ADDR_SPACE_IN_R0) || defined(IN_RC)
3533 PGMRZDynMapPopAutoSubset(pVCpu, iPrevSubset);
3534# endif
3535 }
3536 }
3537
3538 if (rc == VINF_PGM_GCPHYS_ALIASED)
3539 {
3540 pVCpu->pgm.s.fSyncFlags |= PGM_SYNC_CLEAR_PGM_POOL;
3541 VMCPU_FF_SET(pVCpu, VMCPU_FF_PGM_SYNC_CR3);
3542 rc = VINF_PGM_SYNC_CR3;
3543 }
3544 pgmUnlock(pVM);
3545 return rc;
3546}
3547
3548
3549/**
3550 * Scans all shadow page tables for mappings of a physical page.
3551 *
3552 * This may be slow, but it's most likely more efficient than cleaning
3553 * out the entire page pool / cache.
3554 *
3555 * @returns VBox status code.
3556 * @retval VINF_SUCCESS if all references has been successfully cleared.
3557 * @retval VINF_PGM_GCPHYS_ALIASED if we're better off with a CR3 sync and
3558 * a page pool cleaning.
3559 *
3560 * @param pVM Pointer to the VM.
3561 * @param pPhysPage The guest page in question.
3562 */
3563int pgmPoolTrackFlushGCPhysPTsSlow(PVM pVM, PPGMPAGE pPhysPage)
3564{
3565 PPGMPOOL pPool = pVM->pgm.s.CTX_SUFF(pPool);
3566 STAM_PROFILE_START(&pPool->StatTrackFlushGCPhysPTsSlow, s);
3567 LogFlow(("pgmPoolTrackFlushGCPhysPTsSlow: cUsedPages=%d cPresent=%d pPhysPage=%R[pgmpage]\n",
3568 pPool->cUsedPages, pPool->cPresent, pPhysPage));
3569
3570 /*
3571 * There is a limit to what makes sense.
3572 */
3573 if ( pPool->cPresent > 1024
3574 && pVM->cCpus == 1)
3575 {
3576 LogFlow(("pgmPoolTrackFlushGCPhysPTsSlow: giving up... (cPresent=%d)\n", pPool->cPresent));
3577 STAM_PROFILE_STOP(&pPool->StatTrackFlushGCPhysPTsSlow, s);
3578 return VINF_PGM_GCPHYS_ALIASED;
3579 }
3580
3581 /*
3582 * Iterate all the pages until we've encountered all that in use.
3583 * This is simple but not quite optimal solution.
3584 */
3585 const uint64_t u64 = PGM_PAGE_GET_HCPHYS(pPhysPage) | X86_PTE_P; /** @todo drop X86_PTE_P here as we always test if present separately, anyway. */
3586 const uint32_t u32 = u64; /** @todo move into the 32BIT_PT_xx case */
3587 unsigned cLeft = pPool->cUsedPages;
3588 unsigned iPage = pPool->cCurPages;
3589 while (--iPage >= PGMPOOL_IDX_FIRST)
3590 {
3591 PPGMPOOLPAGE pPage = &pPool->aPages[iPage];
3592 if ( pPage->GCPhys != NIL_RTGCPHYS
3593 && pPage->cPresent)
3594 {
3595 switch (pPage->enmKind)
3596 {
3597 /*
3598 * We only care about shadow page tables.
3599 */
3600 case PGMPOOLKIND_32BIT_PT_FOR_32BIT_PT:
3601 case PGMPOOLKIND_32BIT_PT_FOR_32BIT_4MB:
3602 case PGMPOOLKIND_32BIT_PT_FOR_PHYS:
3603 {
3604 unsigned cPresent = pPage->cPresent;
3605 PX86PT pPT = (PX86PT)PGMPOOL_PAGE_2_PTR(pVM, pPage);
3606 for (unsigned i = pPage->iFirstPresent; i < RT_ELEMENTS(pPT->a); i++)
3607 if (pPT->a[i].n.u1Present)
3608 {
3609 if ((pPT->a[i].u & (X86_PTE_PG_MASK | X86_PTE_P)) == u32)
3610 {
3611 //Log4(("pgmPoolTrackFlushGCPhysPTsSlow: idx=%d i=%d pte=%RX32\n", iPage, i, pPT->a[i]));
3612 pPT->a[i].u = 0;
3613
3614 /* Update the counter as we're removing references. */
3615 Assert(pPage->cPresent);
3616 Assert(pPool->cPresent);
3617 pPage->cPresent--;
3618 pPool->cPresent--;
3619 }
3620 if (!--cPresent)
3621 break;
3622 }
3623 PGM_DYNMAP_UNUSED_HINT_VM(pVM, pPT);
3624 break;
3625 }
3626
3627 case PGMPOOLKIND_PAE_PT_FOR_32BIT_PT:
3628 case PGMPOOLKIND_PAE_PT_FOR_32BIT_4MB:
3629 case PGMPOOLKIND_PAE_PT_FOR_PAE_PT:
3630 case PGMPOOLKIND_PAE_PT_FOR_PAE_2MB:
3631 case PGMPOOLKIND_PAE_PT_FOR_PHYS:
3632 {
3633 unsigned cPresent = pPage->cPresent;
3634 PPGMSHWPTPAE pPT = (PPGMSHWPTPAE)PGMPOOL_PAGE_2_PTR(pVM, pPage);
3635 for (unsigned i = pPage->iFirstPresent; i < RT_ELEMENTS(pPT->a); i++)
3636 if (PGMSHWPTEPAE_IS_P(pPT->a[i]))
3637 {
3638 if ((PGMSHWPTEPAE_GET_U(pPT->a[i]) & (X86_PTE_PAE_PG_MASK | X86_PTE_P)) == u64)
3639 {
3640 //Log4(("pgmPoolTrackFlushGCPhysPTsSlow: idx=%d i=%d pte=%RX64\n", iPage, i, pPT->a[i]));
3641 PGMSHWPTEPAE_SET(pPT->a[i], 0); /// @todo why not atomic?
3642
3643 /* Update the counter as we're removing references. */
3644 Assert(pPage->cPresent);
3645 Assert(pPool->cPresent);
3646 pPage->cPresent--;
3647 pPool->cPresent--;
3648 }
3649 if (!--cPresent)
3650 break;
3651 }
3652 PGM_DYNMAP_UNUSED_HINT_VM(pVM, pPT);
3653 break;
3654 }
3655#ifndef IN_RC
3656 case PGMPOOLKIND_EPT_PT_FOR_PHYS:
3657 {
3658 unsigned cPresent = pPage->cPresent;
3659 PEPTPT pPT = (PEPTPT)PGMPOOL_PAGE_2_PTR(pVM, pPage);
3660 for (unsigned i = pPage->iFirstPresent; i < RT_ELEMENTS(pPT->a); i++)
3661 if (pPT->a[i].n.u1Present)
3662 {
3663 if ((pPT->a[i].u & (EPT_PTE_PG_MASK | X86_PTE_P)) == u64)
3664 {
3665 //Log4(("pgmPoolTrackFlushGCPhysPTsSlow: idx=%d i=%d pte=%RX64\n", iPage, i, pPT->a[i]));
3666 pPT->a[i].u = 0;
3667
3668 /* Update the counter as we're removing references. */
3669 Assert(pPage->cPresent);
3670 Assert(pPool->cPresent);
3671 pPage->cPresent--;
3672 pPool->cPresent--;
3673 }
3674 if (!--cPresent)
3675 break;
3676 }
3677 PGM_DYNMAP_UNUSED_HINT_VM(pVM, pPT);
3678 break;
3679 }
3680#endif
3681 }
3682 if (!--cLeft)
3683 break;
3684 }
3685 }
3686
3687 PGM_PAGE_SET_TRACKING(pVM, pPhysPage, 0);
3688 STAM_PROFILE_STOP(&pPool->StatTrackFlushGCPhysPTsSlow, s);
3689
3690 /*
3691 * There is a limit to what makes sense. The above search is very expensive, so force a pgm pool flush.
3692 */
3693 if (pPool->cPresent > 1024)
3694 {
3695 LogFlow(("pgmPoolTrackFlushGCPhysPTsSlow: giving up... (cPresent=%d)\n", pPool->cPresent));
3696 return VINF_PGM_GCPHYS_ALIASED;
3697 }
3698
3699 return VINF_SUCCESS;
3700}
3701
3702
3703/**
3704 * Clears the user entry in a user table.
3705 *
3706 * This is used to remove all references to a page when flushing it.
3707 */
3708static void pgmPoolTrackClearPageUser(PPGMPOOL pPool, PPGMPOOLPAGE pPage, PCPGMPOOLUSER pUser)
3709{
3710 Assert(pUser->iUser != NIL_PGMPOOL_IDX);
3711 Assert(pUser->iUser < pPool->cCurPages);
3712 uint32_t iUserTable = pUser->iUserTable;
3713
3714 /*
3715 * Map the user page. Ignore references made by fictitious pages.
3716 */
3717 PPGMPOOLPAGE pUserPage = &pPool->aPages[pUser->iUser];
3718 LogFlow(("pgmPoolTrackClearPageUser: clear %x in %s (%RGp) (flushing %s)\n", iUserTable, pgmPoolPoolKindToStr(pUserPage->enmKind), pUserPage->Core.Key, pgmPoolPoolKindToStr(pPage->enmKind)));
3719 union
3720 {
3721 uint64_t *pau64;
3722 uint32_t *pau32;
3723 } u;
3724 if (pUserPage->idx < PGMPOOL_IDX_FIRST)
3725 {
3726 Assert(!pUserPage->pvPageR3);
3727 return;
3728 }
3729 u.pau64 = (uint64_t *)PGMPOOL_PAGE_2_PTR(pPool->CTX_SUFF(pVM), pUserPage);
3730
3731
3732 /* Safety precaution in case we change the paging for other modes too in the future. */
3733 Assert(!pgmPoolIsPageLocked(pPage));
3734
3735#ifdef VBOX_STRICT
3736 /*
3737 * Some sanity checks.
3738 */
3739 switch (pUserPage->enmKind)
3740 {
3741 case PGMPOOLKIND_32BIT_PD:
3742 case PGMPOOLKIND_32BIT_PD_PHYS:
3743 Assert(iUserTable < X86_PG_ENTRIES);
3744 break;
3745 case PGMPOOLKIND_PAE_PDPT:
3746 case PGMPOOLKIND_PAE_PDPT_FOR_32BIT:
3747 case PGMPOOLKIND_PAE_PDPT_PHYS:
3748 Assert(iUserTable < 4);
3749 Assert(!(u.pau64[iUserTable] & PGM_PLXFLAGS_PERMANENT));
3750 break;
3751 case PGMPOOLKIND_PAE_PD0_FOR_32BIT_PD:
3752 case PGMPOOLKIND_PAE_PD1_FOR_32BIT_PD:
3753 case PGMPOOLKIND_PAE_PD2_FOR_32BIT_PD:
3754 case PGMPOOLKIND_PAE_PD3_FOR_32BIT_PD:
3755 case PGMPOOLKIND_PAE_PD_FOR_PAE_PD:
3756 case PGMPOOLKIND_PAE_PD_PHYS:
3757 Assert(iUserTable < X86_PG_PAE_ENTRIES);
3758 break;
3759 case PGMPOOLKIND_64BIT_PD_FOR_64BIT_PD:
3760 Assert(iUserTable < X86_PG_PAE_ENTRIES);
3761 Assert(!(u.pau64[iUserTable] & PGM_PDFLAGS_MAPPING));
3762 break;
3763 case PGMPOOLKIND_64BIT_PDPT_FOR_64BIT_PDPT:
3764 Assert(iUserTable < X86_PG_PAE_ENTRIES);
3765 Assert(!(u.pau64[iUserTable] & PGM_PLXFLAGS_PERMANENT));
3766 break;
3767 case PGMPOOLKIND_64BIT_PML4:
3768 Assert(!(u.pau64[iUserTable] & PGM_PLXFLAGS_PERMANENT));
3769 /* GCPhys >> PAGE_SHIFT is the index here */
3770 break;
3771 case PGMPOOLKIND_64BIT_PDPT_FOR_PHYS:
3772 case PGMPOOLKIND_64BIT_PD_FOR_PHYS:
3773 Assert(iUserTable < X86_PG_PAE_ENTRIES);
3774 break;
3775
3776 case PGMPOOLKIND_EPT_PDPT_FOR_PHYS:
3777 case PGMPOOLKIND_EPT_PD_FOR_PHYS:
3778 Assert(iUserTable < X86_PG_PAE_ENTRIES);
3779 break;
3780
3781 case PGMPOOLKIND_ROOT_NESTED:
3782 Assert(iUserTable < X86_PG_PAE_ENTRIES);
3783 break;
3784
3785 default:
3786 AssertMsgFailed(("enmKind=%d\n", pUserPage->enmKind));
3787 break;
3788 }
3789#endif /* VBOX_STRICT */
3790
3791 /*
3792 * Clear the entry in the user page.
3793 */
3794 switch (pUserPage->enmKind)
3795 {
3796 /* 32-bit entries */
3797 case PGMPOOLKIND_32BIT_PD:
3798 case PGMPOOLKIND_32BIT_PD_PHYS:
3799 ASMAtomicWriteU32(&u.pau32[iUserTable], 0);
3800 break;
3801
3802 /* 64-bit entries */
3803 case PGMPOOLKIND_PAE_PD0_FOR_32BIT_PD:
3804 case PGMPOOLKIND_PAE_PD1_FOR_32BIT_PD:
3805 case PGMPOOLKIND_PAE_PD2_FOR_32BIT_PD:
3806 case PGMPOOLKIND_PAE_PD3_FOR_32BIT_PD:
3807 case PGMPOOLKIND_PAE_PD_FOR_PAE_PD:
3808#ifdef IN_RC
3809 /*
3810 * In 32 bits PAE mode we *must* invalidate the TLB when changing a
3811 * PDPT entry; the CPU fetches them only during cr3 load, so any
3812 * non-present PDPT will continue to cause page faults.
3813 */
3814 ASMReloadCR3();
3815 /* no break */
3816#endif
3817 case PGMPOOLKIND_PAE_PD_PHYS:
3818 case PGMPOOLKIND_PAE_PDPT_PHYS:
3819 case PGMPOOLKIND_64BIT_PD_FOR_64BIT_PD:
3820 case PGMPOOLKIND_64BIT_PDPT_FOR_64BIT_PDPT:
3821 case PGMPOOLKIND_64BIT_PML4:
3822 case PGMPOOLKIND_64BIT_PDPT_FOR_PHYS:
3823 case PGMPOOLKIND_64BIT_PD_FOR_PHYS:
3824 case PGMPOOLKIND_PAE_PDPT:
3825 case PGMPOOLKIND_PAE_PDPT_FOR_32BIT:
3826 case PGMPOOLKIND_ROOT_NESTED:
3827 case PGMPOOLKIND_EPT_PDPT_FOR_PHYS:
3828 case PGMPOOLKIND_EPT_PD_FOR_PHYS:
3829 ASMAtomicWriteU64(&u.pau64[iUserTable], 0);
3830 break;
3831
3832 default:
3833 AssertFatalMsgFailed(("enmKind=%d iUser=%d iUserTable=%#x\n", pUserPage->enmKind, pUser->iUser, pUser->iUserTable));
3834 }
3835 PGM_DYNMAP_UNUSED_HINT_VM(pPool->CTX_SUFF(pVM), u.pau64);
3836}
3837
3838
3839/**
3840 * Clears all users of a page.
3841 */
3842static void pgmPoolTrackClearPageUsers(PPGMPOOL pPool, PPGMPOOLPAGE pPage)
3843{
3844 /*
3845 * Free all the user records.
3846 */
3847 LogFlow(("pgmPoolTrackClearPageUsers %RGp\n", pPage->GCPhys));
3848
3849 PPGMPOOLUSER paUsers = pPool->CTX_SUFF(paUsers);
3850 uint16_t i = pPage->iUserHead;
3851 while (i != NIL_PGMPOOL_USER_INDEX)
3852 {
3853 /* Clear enter in user table. */
3854 pgmPoolTrackClearPageUser(pPool, pPage, &paUsers[i]);
3855
3856 /* Free it. */
3857 const uint16_t iNext = paUsers[i].iNext;
3858 paUsers[i].iUser = NIL_PGMPOOL_IDX;
3859 paUsers[i].iNext = pPool->iUserFreeHead;
3860 pPool->iUserFreeHead = i;
3861
3862 /* Next. */
3863 i = iNext;
3864 }
3865 pPage->iUserHead = NIL_PGMPOOL_USER_INDEX;
3866}
3867
3868
3869/**
3870 * Allocates a new physical cross reference extent.
3871 *
3872 * @returns Pointer to the allocated extent on success. NULL if we're out of them.
3873 * @param pVM Pointer to the VM.
3874 * @param piPhysExt Where to store the phys ext index.
3875 */
3876PPGMPOOLPHYSEXT pgmPoolTrackPhysExtAlloc(PVM pVM, uint16_t *piPhysExt)
3877{
3878 PGM_LOCK_ASSERT_OWNER(pVM);
3879 PPGMPOOL pPool = pVM->pgm.s.CTX_SUFF(pPool);
3880 uint16_t iPhysExt = pPool->iPhysExtFreeHead;
3881 if (iPhysExt == NIL_PGMPOOL_PHYSEXT_INDEX)
3882 {
3883 STAM_COUNTER_INC(&pPool->StamTrackPhysExtAllocFailures);
3884 return NULL;
3885 }
3886 PPGMPOOLPHYSEXT pPhysExt = &pPool->CTX_SUFF(paPhysExts)[iPhysExt];
3887 pPool->iPhysExtFreeHead = pPhysExt->iNext;
3888 pPhysExt->iNext = NIL_PGMPOOL_PHYSEXT_INDEX;
3889 *piPhysExt = iPhysExt;
3890 return pPhysExt;
3891}
3892
3893
3894/**
3895 * Frees a physical cross reference extent.
3896 *
3897 * @param pVM Pointer to the VM.
3898 * @param iPhysExt The extent to free.
3899 */
3900void pgmPoolTrackPhysExtFree(PVM pVM, uint16_t iPhysExt)
3901{
3902 PGM_LOCK_ASSERT_OWNER(pVM);
3903 PPGMPOOL pPool = pVM->pgm.s.CTX_SUFF(pPool);
3904 Assert(iPhysExt < pPool->cMaxPhysExts);
3905 PPGMPOOLPHYSEXT pPhysExt = &pPool->CTX_SUFF(paPhysExts)[iPhysExt];
3906 for (unsigned i = 0; i < RT_ELEMENTS(pPhysExt->aidx); i++)
3907 {
3908 pPhysExt->aidx[i] = NIL_PGMPOOL_IDX;
3909 pPhysExt->apte[i] = NIL_PGMPOOL_PHYSEXT_IDX_PTE;
3910 }
3911 pPhysExt->iNext = pPool->iPhysExtFreeHead;
3912 pPool->iPhysExtFreeHead = iPhysExt;
3913}
3914
3915
3916/**
3917 * Frees a physical cross reference extent.
3918 *
3919 * @param pVM Pointer to the VM.
3920 * @param iPhysExt The extent to free.
3921 */
3922void pgmPoolTrackPhysExtFreeList(PVM pVM, uint16_t iPhysExt)
3923{
3924 PGM_LOCK_ASSERT_OWNER(pVM);
3925 PPGMPOOL pPool = pVM->pgm.s.CTX_SUFF(pPool);
3926
3927 const uint16_t iPhysExtStart = iPhysExt;
3928 PPGMPOOLPHYSEXT pPhysExt;
3929 do
3930 {
3931 Assert(iPhysExt < pPool->cMaxPhysExts);
3932 pPhysExt = &pPool->CTX_SUFF(paPhysExts)[iPhysExt];
3933 for (unsigned i = 0; i < RT_ELEMENTS(pPhysExt->aidx); i++)
3934 {
3935 pPhysExt->aidx[i] = NIL_PGMPOOL_IDX;
3936 pPhysExt->apte[i] = NIL_PGMPOOL_PHYSEXT_IDX_PTE;
3937 }
3938
3939 /* next */
3940 iPhysExt = pPhysExt->iNext;
3941 } while (iPhysExt != NIL_PGMPOOL_PHYSEXT_INDEX);
3942
3943 pPhysExt->iNext = pPool->iPhysExtFreeHead;
3944 pPool->iPhysExtFreeHead = iPhysExtStart;
3945}
3946
3947
3948/**
3949 * Insert a reference into a list of physical cross reference extents.
3950 *
3951 * @returns The new tracking data for PGMPAGE.
3952 *
3953 * @param pVM Pointer to the VM.
3954 * @param iPhysExt The physical extent index of the list head.
3955 * @param iShwPT The shadow page table index.
3956 * @param iPte Page table entry
3957 *
3958 */
3959static uint16_t pgmPoolTrackPhysExtInsert(PVM pVM, uint16_t iPhysExt, uint16_t iShwPT, uint16_t iPte)
3960{
3961 PGM_LOCK_ASSERT_OWNER(pVM);
3962 PPGMPOOL pPool = pVM->pgm.s.CTX_SUFF(pPool);
3963 PPGMPOOLPHYSEXT paPhysExts = pPool->CTX_SUFF(paPhysExts);
3964
3965 /*
3966 * Special common cases.
3967 */
3968 if (paPhysExts[iPhysExt].aidx[1] == NIL_PGMPOOL_IDX)
3969 {
3970 paPhysExts[iPhysExt].aidx[1] = iShwPT;
3971 paPhysExts[iPhysExt].apte[1] = iPte;
3972 STAM_COUNTER_INC(&pVM->pgm.s.CTX_SUFF(pStats)->StatTrackAliasedMany);
3973 LogFlow(("pgmPoolTrackPhysExtInsert: %d:{,%d pte %d,}\n", iPhysExt, iShwPT, iPte));
3974 return PGMPOOL_TD_MAKE(PGMPOOL_TD_CREFS_PHYSEXT, iPhysExt);
3975 }
3976 if (paPhysExts[iPhysExt].aidx[2] == NIL_PGMPOOL_IDX)
3977 {
3978 paPhysExts[iPhysExt].aidx[2] = iShwPT;
3979 paPhysExts[iPhysExt].apte[2] = iPte;
3980 STAM_COUNTER_INC(&pVM->pgm.s.CTX_SUFF(pStats)->StatTrackAliasedMany);
3981 LogFlow(("pgmPoolTrackPhysExtInsert: %d:{,,%d pte %d}\n", iPhysExt, iShwPT, iPte));
3982 return PGMPOOL_TD_MAKE(PGMPOOL_TD_CREFS_PHYSEXT, iPhysExt);
3983 }
3984 AssertCompile(RT_ELEMENTS(paPhysExts[iPhysExt].aidx) == 3);
3985
3986 /*
3987 * General treatment.
3988 */
3989 const uint16_t iPhysExtStart = iPhysExt;
3990 unsigned cMax = 15;
3991 for (;;)
3992 {
3993 Assert(iPhysExt < pPool->cMaxPhysExts);
3994 for (unsigned i = 0; i < RT_ELEMENTS(paPhysExts[iPhysExt].aidx); i++)
3995 if (paPhysExts[iPhysExt].aidx[i] == NIL_PGMPOOL_IDX)
3996 {
3997 paPhysExts[iPhysExt].aidx[i] = iShwPT;
3998 paPhysExts[iPhysExt].apte[i] = iPte;
3999 STAM_COUNTER_INC(&pVM->pgm.s.CTX_SUFF(pStats)->StatTrackAliasedMany);
4000 LogFlow(("pgmPoolTrackPhysExtInsert: %d:{%d pte %d} i=%d cMax=%d\n", iPhysExt, iShwPT, iPte, i, cMax));
4001 return PGMPOOL_TD_MAKE(PGMPOOL_TD_CREFS_PHYSEXT, iPhysExtStart);
4002 }
4003 if (!--cMax)
4004 {
4005 STAM_COUNTER_INC(&pVM->pgm.s.CTX_SUFF(pStats)->StatTrackOverflows);
4006 pgmPoolTrackPhysExtFreeList(pVM, iPhysExtStart);
4007 LogFlow(("pgmPoolTrackPhysExtInsert: overflow (1) iShwPT=%d\n", iShwPT));
4008 return PGMPOOL_TD_MAKE(PGMPOOL_TD_CREFS_PHYSEXT, PGMPOOL_TD_IDX_OVERFLOWED);
4009 }
4010
4011 /* advance */
4012 iPhysExt = paPhysExts[iPhysExt].iNext;
4013 if (iPhysExt == NIL_PGMPOOL_PHYSEXT_INDEX)
4014 break;
4015 }
4016
4017 /*
4018 * Add another extent to the list.
4019 */
4020 PPGMPOOLPHYSEXT pNew = pgmPoolTrackPhysExtAlloc(pVM, &iPhysExt);
4021 if (!pNew)
4022 {
4023 STAM_COUNTER_INC(&pVM->pgm.s.CTX_SUFF(pStats)->StatTrackNoExtentsLeft);
4024 pgmPoolTrackPhysExtFreeList(pVM, iPhysExtStart);
4025 LogFlow(("pgmPoolTrackPhysExtInsert: pgmPoolTrackPhysExtAlloc failed iShwPT=%d\n", iShwPT));
4026 return PGMPOOL_TD_MAKE(PGMPOOL_TD_CREFS_PHYSEXT, PGMPOOL_TD_IDX_OVERFLOWED);
4027 }
4028 pNew->iNext = iPhysExtStart;
4029 pNew->aidx[0] = iShwPT;
4030 pNew->apte[0] = iPte;
4031 LogFlow(("pgmPoolTrackPhysExtInsert: added new extent %d:{%d pte %d}->%d\n", iPhysExt, iShwPT, iPte, iPhysExtStart));
4032 return PGMPOOL_TD_MAKE(PGMPOOL_TD_CREFS_PHYSEXT, iPhysExt);
4033}
4034
4035
4036/**
4037 * Add a reference to guest physical page where extents are in use.
4038 *
4039 * @returns The new tracking data for PGMPAGE.
4040 *
4041 * @param pVM Pointer to the VM.
4042 * @param pPhysPage Pointer to the aPages entry in the ram range.
4043 * @param u16 The ram range flags (top 16-bits).
4044 * @param iShwPT The shadow page table index.
4045 * @param iPte Page table entry
4046 */
4047uint16_t pgmPoolTrackPhysExtAddref(PVM pVM, PPGMPAGE pPhysPage, uint16_t u16, uint16_t iShwPT, uint16_t iPte)
4048{
4049 pgmLock(pVM);
4050 if (PGMPOOL_TD_GET_CREFS(u16) != PGMPOOL_TD_CREFS_PHYSEXT)
4051 {
4052 /*
4053 * Convert to extent list.
4054 */
4055 Assert(PGMPOOL_TD_GET_CREFS(u16) == 1);
4056 uint16_t iPhysExt;
4057 PPGMPOOLPHYSEXT pPhysExt = pgmPoolTrackPhysExtAlloc(pVM, &iPhysExt);
4058 if (pPhysExt)
4059 {
4060 LogFlow(("pgmPoolTrackPhysExtAddref: new extent: %d:{%d, %d}\n", iPhysExt, PGMPOOL_TD_GET_IDX(u16), iShwPT));
4061 STAM_COUNTER_INC(&pVM->pgm.s.CTX_SUFF(pStats)->StatTrackAliased);
4062 pPhysExt->aidx[0] = PGMPOOL_TD_GET_IDX(u16);
4063 pPhysExt->apte[0] = PGM_PAGE_GET_PTE_INDEX(pPhysPage);
4064 pPhysExt->aidx[1] = iShwPT;
4065 pPhysExt->apte[1] = iPte;
4066 u16 = PGMPOOL_TD_MAKE(PGMPOOL_TD_CREFS_PHYSEXT, iPhysExt);
4067 }
4068 else
4069 u16 = PGMPOOL_TD_MAKE(PGMPOOL_TD_CREFS_PHYSEXT, PGMPOOL_TD_IDX_OVERFLOWED);
4070 }
4071 else if (u16 != PGMPOOL_TD_MAKE(PGMPOOL_TD_CREFS_PHYSEXT, PGMPOOL_TD_IDX_OVERFLOWED))
4072 {
4073 /*
4074 * Insert into the extent list.
4075 */
4076 u16 = pgmPoolTrackPhysExtInsert(pVM, PGMPOOL_TD_GET_IDX(u16), iShwPT, iPte);
4077 }
4078 else
4079 STAM_COUNTER_INC(&pVM->pgm.s.CTX_SUFF(pStats)->StatTrackAliasedLots);
4080 pgmUnlock(pVM);
4081 return u16;
4082}
4083
4084
4085/**
4086 * Clear references to guest physical memory.
4087 *
4088 * @param pPool The pool.
4089 * @param pPage The page.
4090 * @param pPhysPage Pointer to the aPages entry in the ram range.
4091 * @param iPte Shadow PTE index
4092 */
4093void pgmPoolTrackPhysExtDerefGCPhys(PPGMPOOL pPool, PPGMPOOLPAGE pPage, PPGMPAGE pPhysPage, uint16_t iPte)
4094{
4095 PVM pVM = pPool->CTX_SUFF(pVM);
4096 const unsigned cRefs = PGM_PAGE_GET_TD_CREFS(pPhysPage);
4097 AssertFatalMsg(cRefs == PGMPOOL_TD_CREFS_PHYSEXT, ("cRefs=%d pPhysPage=%R[pgmpage] pPage=%p:{.idx=%d}\n", cRefs, pPhysPage, pPage, pPage->idx));
4098
4099 uint16_t iPhysExt = PGM_PAGE_GET_TD_IDX(pPhysPage);
4100 if (iPhysExt != PGMPOOL_TD_IDX_OVERFLOWED)
4101 {
4102 pgmLock(pVM);
4103
4104 uint16_t iPhysExtPrev = NIL_PGMPOOL_PHYSEXT_INDEX;
4105 PPGMPOOLPHYSEXT paPhysExts = pPool->CTX_SUFF(paPhysExts);
4106 do
4107 {
4108 Assert(iPhysExt < pPool->cMaxPhysExts);
4109
4110 /*
4111 * Look for the shadow page and check if it's all freed.
4112 */
4113 for (unsigned i = 0; i < RT_ELEMENTS(paPhysExts[iPhysExt].aidx); i++)
4114 {
4115 if ( paPhysExts[iPhysExt].aidx[i] == pPage->idx
4116 && paPhysExts[iPhysExt].apte[i] == iPte)
4117 {
4118 paPhysExts[iPhysExt].aidx[i] = NIL_PGMPOOL_IDX;
4119 paPhysExts[iPhysExt].apte[i] = NIL_PGMPOOL_PHYSEXT_IDX_PTE;
4120
4121 for (i = 0; i < RT_ELEMENTS(paPhysExts[iPhysExt].aidx); i++)
4122 if (paPhysExts[iPhysExt].aidx[i] != NIL_PGMPOOL_IDX)
4123 {
4124 Log2(("pgmPoolTrackPhysExtDerefGCPhys: pPhysPage=%R[pgmpage] idx=%d\n", pPhysPage, pPage->idx));
4125 pgmUnlock(pVM);
4126 return;
4127 }
4128
4129 /* we can free the node. */
4130 const uint16_t iPhysExtNext = paPhysExts[iPhysExt].iNext;
4131 if ( iPhysExtPrev == NIL_PGMPOOL_PHYSEXT_INDEX
4132 && iPhysExtNext == NIL_PGMPOOL_PHYSEXT_INDEX)
4133 {
4134 /* lonely node */
4135 pgmPoolTrackPhysExtFree(pVM, iPhysExt);
4136 Log2(("pgmPoolTrackPhysExtDerefGCPhys: pPhysPage=%R[pgmpage] idx=%d lonely\n", pPhysPage, pPage->idx));
4137 PGM_PAGE_SET_TRACKING(pVM, pPhysPage, 0);
4138 }
4139 else if (iPhysExtPrev == NIL_PGMPOOL_PHYSEXT_INDEX)
4140 {
4141 /* head */
4142 Log2(("pgmPoolTrackPhysExtDerefGCPhys: pPhysPage=%R[pgmpage] idx=%d head\n", pPhysPage, pPage->idx));
4143 PGM_PAGE_SET_TRACKING(pVM, pPhysPage, PGMPOOL_TD_MAKE(PGMPOOL_TD_CREFS_PHYSEXT, iPhysExtNext));
4144 pgmPoolTrackPhysExtFree(pVM, iPhysExt);
4145 }
4146 else
4147 {
4148 /* in list */
4149 Log2(("pgmPoolTrackPhysExtDerefGCPhys: pPhysPage=%R[pgmpage] idx=%d in list\n", pPhysPage, pPage->idx));
4150 paPhysExts[iPhysExtPrev].iNext = iPhysExtNext;
4151 pgmPoolTrackPhysExtFree(pVM, iPhysExt);
4152 }
4153 iPhysExt = iPhysExtNext;
4154 pgmUnlock(pVM);
4155 return;
4156 }
4157 }
4158
4159 /* next */
4160 iPhysExtPrev = iPhysExt;
4161 iPhysExt = paPhysExts[iPhysExt].iNext;
4162 } while (iPhysExt != NIL_PGMPOOL_PHYSEXT_INDEX);
4163
4164 pgmUnlock(pVM);
4165 AssertFatalMsgFailed(("not-found! cRefs=%d pPhysPage=%R[pgmpage] pPage=%p:{.idx=%d}\n", cRefs, pPhysPage, pPage, pPage->idx));
4166 }
4167 else /* nothing to do */
4168 Log2(("pgmPoolTrackPhysExtDerefGCPhys: pPhysPage=%R[pgmpage]\n", pPhysPage));
4169}
4170
4171/**
4172 * Clear references to guest physical memory.
4173 *
4174 * This is the same as pgmPoolTracDerefGCPhysHint except that the guest
4175 * physical address is assumed to be correct, so the linear search can be
4176 * skipped and we can assert at an earlier point.
4177 *
4178 * @param pPool The pool.
4179 * @param pPage The page.
4180 * @param HCPhys The host physical address corresponding to the guest page.
4181 * @param GCPhys The guest physical address corresponding to HCPhys.
4182 * @param iPte Shadow PTE index
4183 */
4184static void pgmPoolTracDerefGCPhys(PPGMPOOL pPool, PPGMPOOLPAGE pPage, RTHCPHYS HCPhys, RTGCPHYS GCPhys, uint16_t iPte)
4185{
4186 /*
4187 * Lookup the page and check if it checks out before derefing it.
4188 */
4189 PVM pVM = pPool->CTX_SUFF(pVM);
4190 PPGMPAGE pPhysPage = pgmPhysGetPage(pVM, GCPhys);
4191 if (pPhysPage)
4192 {
4193 Assert(PGM_PAGE_GET_HCPHYS(pPhysPage));
4194#ifdef LOG_ENABLED
4195 RTHCPHYS HCPhysPage = PGM_PAGE_GET_HCPHYS(pPhysPage);
4196 Log2(("pgmPoolTracDerefGCPhys %RHp vs %RHp\n", HCPhysPage, HCPhys));
4197#endif
4198 if (PGM_PAGE_GET_HCPHYS(pPhysPage) == HCPhys)
4199 {
4200 Assert(pPage->cPresent);
4201 Assert(pPool->cPresent);
4202 pPage->cPresent--;
4203 pPool->cPresent--;
4204 pgmTrackDerefGCPhys(pPool, pPage, pPhysPage, iPte);
4205 return;
4206 }
4207
4208 AssertFatalMsgFailed(("HCPhys=%RHp GCPhys=%RGp; found page has HCPhys=%RHp\n",
4209 HCPhys, GCPhys, PGM_PAGE_GET_HCPHYS(pPhysPage)));
4210 }
4211 AssertFatalMsgFailed(("HCPhys=%RHp GCPhys=%RGp\n", HCPhys, GCPhys));
4212}
4213
4214
4215/**
4216 * Clear references to guest physical memory.
4217 *
4218 * @param pPool The pool.
4219 * @param pPage The page.
4220 * @param HCPhys The host physical address corresponding to the guest page.
4221 * @param GCPhysHint The guest physical address which may corresponding to HCPhys.
4222 * @param iPte Shadow pte index
4223 */
4224void pgmPoolTracDerefGCPhysHint(PPGMPOOL pPool, PPGMPOOLPAGE pPage, RTHCPHYS HCPhys, RTGCPHYS GCPhysHint, uint16_t iPte)
4225{
4226 Log4(("pgmPoolTracDerefGCPhysHint %RHp %RGp\n", HCPhys, GCPhysHint));
4227
4228 /*
4229 * Try the hint first.
4230 */
4231 RTHCPHYS HCPhysHinted;
4232 PVM pVM = pPool->CTX_SUFF(pVM);
4233 PPGMPAGE pPhysPage = pgmPhysGetPage(pVM, GCPhysHint);
4234 if (pPhysPage)
4235 {
4236 HCPhysHinted = PGM_PAGE_GET_HCPHYS(pPhysPage);
4237 Assert(HCPhysHinted);
4238 if (HCPhysHinted == HCPhys)
4239 {
4240 Assert(pPage->cPresent);
4241 Assert(pPool->cPresent);
4242 pPage->cPresent--;
4243 pPool->cPresent--;
4244 pgmTrackDerefGCPhys(pPool, pPage, pPhysPage, iPte);
4245 return;
4246 }
4247 }
4248 else
4249 HCPhysHinted = UINT64_C(0xdeadbeefdeadbeef);
4250
4251 /*
4252 * Damn, the hint didn't work. We'll have to do an expensive linear search.
4253 */
4254 STAM_COUNTER_INC(&pPool->StatTrackLinearRamSearches);
4255 PPGMRAMRANGE pRam = pPool->CTX_SUFF(pVM)->pgm.s.CTX_SUFF(pRamRangesX);
4256 while (pRam)
4257 {
4258 unsigned iPage = pRam->cb >> PAGE_SHIFT;
4259 while (iPage-- > 0)
4260 {
4261 if (PGM_PAGE_GET_HCPHYS(&pRam->aPages[iPage]) == HCPhys)
4262 {
4263 Log4(("pgmPoolTracDerefGCPhysHint: Linear HCPhys=%RHp GCPhysHint=%RGp GCPhysReal=%RGp\n",
4264 HCPhys, GCPhysHint, pRam->GCPhys + (iPage << PAGE_SHIFT)));
4265 Assert(pPage->cPresent);
4266 Assert(pPool->cPresent);
4267 pPage->cPresent--;
4268 pPool->cPresent--;
4269 pgmTrackDerefGCPhys(pPool, pPage, &pRam->aPages[iPage], iPte);
4270 return;
4271 }
4272 }
4273 pRam = pRam->CTX_SUFF(pNext);
4274 }
4275
4276 AssertFatalMsgFailed(("HCPhys=%RHp GCPhysHint=%RGp (Hinted page has HCPhys = %RHp)\n", HCPhys, GCPhysHint, HCPhysHinted));
4277}
4278
4279
4280/**
4281 * Clear references to guest physical memory in a 32-bit / 32-bit page table.
4282 *
4283 * @param pPool The pool.
4284 * @param pPage The page.
4285 * @param pShwPT The shadow page table (mapping of the page).
4286 * @param pGstPT The guest page table.
4287 */
4288DECLINLINE(void) pgmPoolTrackDerefPT32Bit32Bit(PPGMPOOL pPool, PPGMPOOLPAGE pPage, PX86PT pShwPT, PCX86PT pGstPT)
4289{
4290 RTGCPHYS32 const fPgMask = pPage->fA20Enabled ? X86_PTE_PG_MASK : X86_PTE_PG_MASK & ~RT_BIT_32(20);
4291 for (unsigned i = pPage->iFirstPresent; i < RT_ELEMENTS(pShwPT->a); i++)
4292 {
4293 Assert(!(pShwPT->a[i].u & RT_BIT_32(10)));
4294 if (pShwPT->a[i].n.u1Present)
4295 {
4296 Log4(("pgmPoolTrackDerefPT32Bit32Bit: i=%d pte=%RX32 hint=%RX32\n",
4297 i, pShwPT->a[i].u & X86_PTE_PG_MASK, pGstPT->a[i].u & X86_PTE_PG_MASK));
4298 pgmPoolTracDerefGCPhysHint(pPool, pPage, pShwPT->a[i].u & X86_PTE_PG_MASK, pGstPT->a[i].u & fPgMask, i);
4299 if (!pPage->cPresent)
4300 break;
4301 }
4302 }
4303}
4304
4305
4306/**
4307 * Clear references to guest physical memory in a PAE / 32-bit page table.
4308 *
4309 * @param pPool The pool.
4310 * @param pPage The page.
4311 * @param pShwPT The shadow page table (mapping of the page).
4312 * @param pGstPT The guest page table (just a half one).
4313 */
4314DECLINLINE(void) pgmPoolTrackDerefPTPae32Bit(PPGMPOOL pPool, PPGMPOOLPAGE pPage, PPGMSHWPTPAE pShwPT, PCX86PT pGstPT)
4315{
4316 RTGCPHYS32 const fPgMask = pPage->fA20Enabled ? X86_PTE_PG_MASK : X86_PTE_PG_MASK & ~RT_BIT_32(20);
4317 for (unsigned i = pPage->iFirstPresent; i < RT_ELEMENTS(pShwPT->a); i++)
4318 {
4319 Assert( (PGMSHWPTEPAE_GET_U(pShwPT->a[i]) & UINT64_C(0x7ff0000000000400)) == 0
4320 || (PGMSHWPTEPAE_GET_U(pShwPT->a[i]) & UINT64_C(0x7ff0000000000400)) == UINT64_C(0x7ff0000000000000));
4321 if (PGMSHWPTEPAE_IS_P(pShwPT->a[i]))
4322 {
4323 Log4(("pgmPoolTrackDerefPTPae32Bit: i=%d pte=%RX64 hint=%RX32\n",
4324 i, PGMSHWPTEPAE_GET_HCPHYS(pShwPT->a[i]), pGstPT->a[i].u & X86_PTE_PG_MASK));
4325 pgmPoolTracDerefGCPhysHint(pPool, pPage, PGMSHWPTEPAE_GET_HCPHYS(pShwPT->a[i]), pGstPT->a[i].u & fPgMask, i);
4326 if (!pPage->cPresent)
4327 break;
4328 }
4329 }
4330}
4331
4332
4333/**
4334 * Clear references to guest physical memory in a PAE / PAE page table.
4335 *
4336 * @param pPool The pool.
4337 * @param pPage The page.
4338 * @param pShwPT The shadow page table (mapping of the page).
4339 * @param pGstPT The guest page table.
4340 */
4341DECLINLINE(void) pgmPoolTrackDerefPTPaePae(PPGMPOOL pPool, PPGMPOOLPAGE pPage, PPGMSHWPTPAE pShwPT, PCX86PTPAE pGstPT)
4342{
4343 RTGCPHYS const fPgMask = pPage->fA20Enabled ? X86_PTE_PAE_PG_MASK : X86_PTE_PAE_PG_MASK & ~RT_BIT_64(20);
4344 for (unsigned i = pPage->iFirstPresent; i < RT_ELEMENTS(pShwPT->a); i++)
4345 {
4346 Assert( (PGMSHWPTEPAE_GET_U(pShwPT->a[i]) & UINT64_C(0x7ff0000000000400)) == 0
4347 || (PGMSHWPTEPAE_GET_U(pShwPT->a[i]) & UINT64_C(0x7ff0000000000400)) == UINT64_C(0x7ff0000000000000));
4348 if (PGMSHWPTEPAE_IS_P(pShwPT->a[i]))
4349 {
4350 Log4(("pgmPoolTrackDerefPTPaePae: i=%d pte=%RX32 hint=%RX32\n",
4351 i, PGMSHWPTEPAE_GET_HCPHYS(pShwPT->a[i]), pGstPT->a[i].u & X86_PTE_PAE_PG_MASK));
4352 pgmPoolTracDerefGCPhysHint(pPool, pPage, PGMSHWPTEPAE_GET_HCPHYS(pShwPT->a[i]), pGstPT->a[i].u & fPgMask, i);
4353 if (!pPage->cPresent)
4354 break;
4355 }
4356 }
4357}
4358
4359
4360/**
4361 * Clear references to guest physical memory in a 32-bit / 4MB page table.
4362 *
4363 * @param pPool The pool.
4364 * @param pPage The page.
4365 * @param pShwPT The shadow page table (mapping of the page).
4366 */
4367DECLINLINE(void) pgmPoolTrackDerefPT32Bit4MB(PPGMPOOL pPool, PPGMPOOLPAGE pPage, PX86PT pShwPT)
4368{
4369 RTGCPHYS const GCPhysA20Mask = pPage->fA20Enabled ? UINT64_MAX : ~RT_BIT_64(20);
4370 RTGCPHYS GCPhys = pPage->GCPhys + PAGE_SIZE * pPage->iFirstPresent;
4371 for (unsigned i = pPage->iFirstPresent; i < RT_ELEMENTS(pShwPT->a); i++, GCPhys += PAGE_SIZE)
4372 {
4373 Assert(!(pShwPT->a[i].u & RT_BIT_32(10)));
4374 if (pShwPT->a[i].n.u1Present)
4375 {
4376 Log4(("pgmPoolTrackDerefPT32Bit4MB: i=%d pte=%RX32 GCPhys=%RGp\n",
4377 i, pShwPT->a[i].u & X86_PTE_PG_MASK, GCPhys));
4378 pgmPoolTracDerefGCPhys(pPool, pPage, pShwPT->a[i].u & X86_PTE_PG_MASK, GCPhys & GCPhysA20Mask, i);
4379 if (!pPage->cPresent)
4380 break;
4381 }
4382 }
4383}
4384
4385
4386/**
4387 * Clear references to guest physical memory in a PAE / 2/4MB page table.
4388 *
4389 * @param pPool The pool.
4390 * @param pPage The page.
4391 * @param pShwPT The shadow page table (mapping of the page).
4392 */
4393DECLINLINE(void) pgmPoolTrackDerefPTPaeBig(PPGMPOOL pPool, PPGMPOOLPAGE pPage, PPGMSHWPTPAE pShwPT)
4394{
4395 RTGCPHYS const GCPhysA20Mask = pPage->fA20Enabled ? UINT64_MAX : ~RT_BIT_64(20);
4396 RTGCPHYS GCPhys = pPage->GCPhys + PAGE_SIZE * pPage->iFirstPresent;
4397 for (unsigned i = pPage->iFirstPresent; i < RT_ELEMENTS(pShwPT->a); i++, GCPhys += PAGE_SIZE)
4398 {
4399 Assert( (PGMSHWPTEPAE_GET_U(pShwPT->a[i]) & UINT64_C(0x7ff0000000000400)) == 0
4400 || (PGMSHWPTEPAE_GET_U(pShwPT->a[i]) & UINT64_C(0x7ff0000000000400)) == UINT64_C(0x7ff0000000000000));
4401 if (PGMSHWPTEPAE_IS_P(pShwPT->a[i]))
4402 {
4403 Log4(("pgmPoolTrackDerefPTPaeBig: i=%d pte=%RX64 hint=%RGp\n",
4404 i, PGMSHWPTEPAE_GET_HCPHYS(pShwPT->a[i]), GCPhys));
4405 pgmPoolTracDerefGCPhys(pPool, pPage, PGMSHWPTEPAE_GET_HCPHYS(pShwPT->a[i]), GCPhys & GCPhysA20Mask, i);
4406 if (!pPage->cPresent)
4407 break;
4408 }
4409 }
4410}
4411
4412
4413/**
4414 * Clear references to shadowed pages in an EPT page table.
4415 *
4416 * @param pPool The pool.
4417 * @param pPage The page.
4418 * @param pShwPML4 The shadow page directory pointer table (mapping of the page).
4419 */
4420DECLINLINE(void) pgmPoolTrackDerefPTEPT(PPGMPOOL pPool, PPGMPOOLPAGE pPage, PEPTPT pShwPT)
4421{
4422 RTGCPHYS const GCPhysA20Mask = pPage->fA20Enabled ? UINT64_MAX : ~RT_BIT_64(20);
4423 RTGCPHYS GCPhys = pPage->GCPhys + PAGE_SIZE * pPage->iFirstPresent;
4424 for (unsigned i = pPage->iFirstPresent; i < RT_ELEMENTS(pShwPT->a); i++, GCPhys += PAGE_SIZE)
4425 {
4426 Assert((pShwPT->a[i].u & UINT64_C(0xfff0000000000f80)) == 0);
4427 if (pShwPT->a[i].n.u1Present)
4428 {
4429 Log4(("pgmPoolTrackDerefPTEPT: i=%d pte=%RX64 GCPhys=%RX64\n",
4430 i, pShwPT->a[i].u & EPT_PTE_PG_MASK, pPage->GCPhys));
4431 pgmPoolTracDerefGCPhys(pPool, pPage, pShwPT->a[i].u & EPT_PTE_PG_MASK, GCPhys & GCPhysA20Mask, i);
4432 if (!pPage->cPresent)
4433 break;
4434 }
4435 }
4436}
4437
4438
4439/**
4440 * Clear references to shadowed pages in a 32 bits page directory.
4441 *
4442 * @param pPool The pool.
4443 * @param pPage The page.
4444 * @param pShwPD The shadow page directory (mapping of the page).
4445 */
4446DECLINLINE(void) pgmPoolTrackDerefPD(PPGMPOOL pPool, PPGMPOOLPAGE pPage, PX86PD pShwPD)
4447{
4448 for (unsigned i = 0; i < RT_ELEMENTS(pShwPD->a); i++)
4449 {
4450 Assert(!(pShwPD->a[i].u & RT_BIT_32(9)));
4451 if ( pShwPD->a[i].n.u1Present
4452 && !(pShwPD->a[i].u & PGM_PDFLAGS_MAPPING)
4453 )
4454 {
4455 PPGMPOOLPAGE pSubPage = (PPGMPOOLPAGE)RTAvloHCPhysGet(&pPool->HCPhysTree, pShwPD->a[i].u & X86_PDE_PG_MASK);
4456 if (pSubPage)
4457 pgmPoolTrackFreeUser(pPool, pSubPage, pPage->idx, i);
4458 else
4459 AssertFatalMsgFailed(("%x\n", pShwPD->a[i].u & X86_PDE_PG_MASK));
4460 }
4461 }
4462}
4463
4464
4465/**
4466 * Clear references to shadowed pages in a PAE (legacy or 64 bits) page directory.
4467 *
4468 * @param pPool The pool.
4469 * @param pPage The page.
4470 * @param pShwPD The shadow page directory (mapping of the page).
4471 */
4472DECLINLINE(void) pgmPoolTrackDerefPDPae(PPGMPOOL pPool, PPGMPOOLPAGE pPage, PX86PDPAE pShwPD)
4473{
4474 for (unsigned i = 0; i < RT_ELEMENTS(pShwPD->a); i++)
4475 {
4476 if ( pShwPD->a[i].n.u1Present
4477 && !(pShwPD->a[i].u & PGM_PDFLAGS_MAPPING))
4478 {
4479#ifdef PGM_WITH_LARGE_PAGES
4480 if (pShwPD->a[i].b.u1Size)
4481 {
4482 Log4(("pgmPoolTrackDerefPDPae: i=%d pde=%RX64 GCPhys=%RX64\n",
4483 i, pShwPD->a[i].u & X86_PDE2M_PAE_PG_MASK, pPage->GCPhys));
4484 pgmPoolTracDerefGCPhys(pPool, pPage, pShwPD->a[i].u & X86_PDE2M_PAE_PG_MASK,
4485 pPage->GCPhys + i * 2 * _1M /* pPage->GCPhys = base address of the memory described by the PD */,
4486 i);
4487 }
4488 else
4489#endif
4490 {
4491 Assert((pShwPD->a[i].u & (X86_PDE_PAE_MBZ_MASK_NX | UINT64_C(0x7ff0000000000200))) == 0);
4492 PPGMPOOLPAGE pSubPage = (PPGMPOOLPAGE)RTAvloHCPhysGet(&pPool->HCPhysTree, pShwPD->a[i].u & X86_PDE_PAE_PG_MASK);
4493 if (pSubPage)
4494 pgmPoolTrackFreeUser(pPool, pSubPage, pPage->idx, i);
4495 else
4496 AssertFatalMsgFailed(("%RX64\n", pShwPD->a[i].u & X86_PDE_PAE_PG_MASK));
4497 /** @todo 64-bit guests: have to ensure that we're not exhausting the dynamic mappings! */
4498 }
4499 }
4500 }
4501}
4502
4503
4504/**
4505 * Clear references to shadowed pages in a PAE page directory pointer table.
4506 *
4507 * @param pPool The pool.
4508 * @param pPage The page.
4509 * @param pShwPDPT The shadow page directory pointer table (mapping of the page).
4510 */
4511DECLINLINE(void) pgmPoolTrackDerefPDPTPae(PPGMPOOL pPool, PPGMPOOLPAGE pPage, PX86PDPT pShwPDPT)
4512{
4513 for (unsigned i = 0; i < X86_PG_PAE_PDPE_ENTRIES; i++)
4514 {
4515 Assert((pShwPDPT->a[i].u & (X86_PDPE_PAE_MBZ_MASK | UINT64_C(0x7ff0000000000200))) == 0);
4516 if ( pShwPDPT->a[i].n.u1Present
4517 && !(pShwPDPT->a[i].u & PGM_PLXFLAGS_MAPPING)
4518 )
4519 {
4520 PPGMPOOLPAGE pSubPage = (PPGMPOOLPAGE)RTAvloHCPhysGet(&pPool->HCPhysTree, pShwPDPT->a[i].u & X86_PDPE_PG_MASK);
4521 if (pSubPage)
4522 pgmPoolTrackFreeUser(pPool, pSubPage, pPage->idx, i);
4523 else
4524 AssertFatalMsgFailed(("%RX64\n", pShwPDPT->a[i].u & X86_PDPE_PG_MASK));
4525 }
4526 }
4527}
4528
4529
4530/**
4531 * Clear references to shadowed pages in a 64-bit page directory pointer table.
4532 *
4533 * @param pPool The pool.
4534 * @param pPage The page.
4535 * @param pShwPDPT The shadow page directory pointer table (mapping of the page).
4536 */
4537DECLINLINE(void) pgmPoolTrackDerefPDPT64Bit(PPGMPOOL pPool, PPGMPOOLPAGE pPage, PX86PDPT pShwPDPT)
4538{
4539 for (unsigned i = 0; i < RT_ELEMENTS(pShwPDPT->a); i++)
4540 {
4541 Assert((pShwPDPT->a[i].u & (X86_PDPE_LM_MBZ_MASK_NX | UINT64_C(0x7ff0000000000200))) == 0);
4542 if (pShwPDPT->a[i].n.u1Present)
4543 {
4544 PPGMPOOLPAGE pSubPage = (PPGMPOOLPAGE)RTAvloHCPhysGet(&pPool->HCPhysTree, pShwPDPT->a[i].u & X86_PDPE_PG_MASK);
4545 if (pSubPage)
4546 pgmPoolTrackFreeUser(pPool, pSubPage, pPage->idx, i);
4547 else
4548 AssertFatalMsgFailed(("%RX64\n", pShwPDPT->a[i].u & X86_PDPE_PG_MASK));
4549 /** @todo 64-bit guests: have to ensure that we're not exhausting the dynamic mappings! */
4550 }
4551 }
4552}
4553
4554
4555/**
4556 * Clear references to shadowed pages in a 64-bit level 4 page table.
4557 *
4558 * @param pPool The pool.
4559 * @param pPage The page.
4560 * @param pShwPML4 The shadow page directory pointer table (mapping of the page).
4561 */
4562DECLINLINE(void) pgmPoolTrackDerefPML464Bit(PPGMPOOL pPool, PPGMPOOLPAGE pPage, PX86PML4 pShwPML4)
4563{
4564 for (unsigned i = 0; i < RT_ELEMENTS(pShwPML4->a); i++)
4565 {
4566 Assert((pShwPML4->a[i].u & (X86_PML4E_MBZ_MASK_NX | UINT64_C(0x7ff0000000000200))) == 0);
4567 if (pShwPML4->a[i].n.u1Present)
4568 {
4569 PPGMPOOLPAGE pSubPage = (PPGMPOOLPAGE)RTAvloHCPhysGet(&pPool->HCPhysTree, pShwPML4->a[i].u & X86_PDPE_PG_MASK);
4570 if (pSubPage)
4571 pgmPoolTrackFreeUser(pPool, pSubPage, pPage->idx, i);
4572 else
4573 AssertFatalMsgFailed(("%RX64\n", pShwPML4->a[i].u & X86_PML4E_PG_MASK));
4574 /** @todo 64-bit guests: have to ensure that we're not exhausting the dynamic mappings! */
4575 }
4576 }
4577}
4578
4579
4580/**
4581 * Clear references to shadowed pages in an EPT page directory.
4582 *
4583 * @param pPool The pool.
4584 * @param pPage The page.
4585 * @param pShwPD The shadow page directory (mapping of the page).
4586 */
4587DECLINLINE(void) pgmPoolTrackDerefPDEPT(PPGMPOOL pPool, PPGMPOOLPAGE pPage, PEPTPD pShwPD)
4588{
4589 for (unsigned i = 0; i < RT_ELEMENTS(pShwPD->a); i++)
4590 {
4591 Assert((pShwPD->a[i].u & UINT64_C(0xfff0000000000f80)) == 0);
4592 if (pShwPD->a[i].n.u1Present)
4593 {
4594#ifdef PGM_WITH_LARGE_PAGES
4595 if (pShwPD->a[i].b.u1Size)
4596 {
4597 Log4(("pgmPoolTrackDerefPDEPT: i=%d pde=%RX64 GCPhys=%RX64\n",
4598 i, pShwPD->a[i].u & X86_PDE2M_PAE_PG_MASK, pPage->GCPhys));
4599 pgmPoolTracDerefGCPhys(pPool, pPage, pShwPD->a[i].u & X86_PDE2M_PAE_PG_MASK,
4600 pPage->GCPhys + i * 2 * _1M /* pPage->GCPhys = base address of the memory described by the PD */,
4601 i);
4602 }
4603 else
4604#endif
4605 {
4606 PPGMPOOLPAGE pSubPage = (PPGMPOOLPAGE)RTAvloHCPhysGet(&pPool->HCPhysTree, pShwPD->a[i].u & EPT_PDE_PG_MASK);
4607 if (pSubPage)
4608 pgmPoolTrackFreeUser(pPool, pSubPage, pPage->idx, i);
4609 else
4610 AssertFatalMsgFailed(("%RX64\n", pShwPD->a[i].u & EPT_PDE_PG_MASK));
4611 }
4612 /** @todo 64-bit guests: have to ensure that we're not exhausting the dynamic mappings! */
4613 }
4614 }
4615}
4616
4617
4618/**
4619 * Clear references to shadowed pages in an EPT page directory pointer table.
4620 *
4621 * @param pPool The pool.
4622 * @param pPage The page.
4623 * @param pShwPDPT The shadow page directory pointer table (mapping of the page).
4624 */
4625DECLINLINE(void) pgmPoolTrackDerefPDPTEPT(PPGMPOOL pPool, PPGMPOOLPAGE pPage, PEPTPDPT pShwPDPT)
4626{
4627 for (unsigned i = 0; i < RT_ELEMENTS(pShwPDPT->a); i++)
4628 {
4629 Assert((pShwPDPT->a[i].u & UINT64_C(0xfff0000000000f80)) == 0);
4630 if (pShwPDPT->a[i].n.u1Present)
4631 {
4632 PPGMPOOLPAGE pSubPage = (PPGMPOOLPAGE)RTAvloHCPhysGet(&pPool->HCPhysTree, pShwPDPT->a[i].u & EPT_PDPTE_PG_MASK);
4633 if (pSubPage)
4634 pgmPoolTrackFreeUser(pPool, pSubPage, pPage->idx, i);
4635 else
4636 AssertFatalMsgFailed(("%RX64\n", pShwPDPT->a[i].u & EPT_PDPTE_PG_MASK));
4637 /** @todo 64-bit guests: have to ensure that we're not exhausting the dynamic mappings! */
4638 }
4639 }
4640}
4641
4642
4643/**
4644 * Clears all references made by this page.
4645 *
4646 * This includes other shadow pages and GC physical addresses.
4647 *
4648 * @param pPool The pool.
4649 * @param pPage The page.
4650 */
4651static void pgmPoolTrackDeref(PPGMPOOL pPool, PPGMPOOLPAGE pPage)
4652{
4653 /*
4654 * Map the shadow page and take action according to the page kind.
4655 */
4656 PVM pVM = pPool->CTX_SUFF(pVM);
4657 void *pvShw = PGMPOOL_PAGE_2_PTR(pVM, pPage);
4658 switch (pPage->enmKind)
4659 {
4660 case PGMPOOLKIND_32BIT_PT_FOR_32BIT_PT:
4661 {
4662 STAM_PROFILE_START(&pPool->StatTrackDerefGCPhys, g);
4663 void *pvGst;
4664 int rc = PGM_GCPHYS_2_PTR(pVM, pPage->GCPhys, &pvGst); AssertReleaseRC(rc);
4665 pgmPoolTrackDerefPT32Bit32Bit(pPool, pPage, (PX86PT)pvShw, (PCX86PT)pvGst);
4666 PGM_DYNMAP_UNUSED_HINT_VM(pVM, pvGst);
4667 STAM_PROFILE_STOP(&pPool->StatTrackDerefGCPhys, g);
4668 break;
4669 }
4670
4671 case PGMPOOLKIND_PAE_PT_FOR_32BIT_PT:
4672 {
4673 STAM_PROFILE_START(&pPool->StatTrackDerefGCPhys, g);
4674 void *pvGst;
4675 int rc = PGM_GCPHYS_2_PTR_EX(pVM, pPage->GCPhys, &pvGst); AssertReleaseRC(rc);
4676 pgmPoolTrackDerefPTPae32Bit(pPool, pPage, (PPGMSHWPTPAE)pvShw, (PCX86PT)pvGst);
4677 PGM_DYNMAP_UNUSED_HINT_VM(pVM, pvGst);
4678 STAM_PROFILE_STOP(&pPool->StatTrackDerefGCPhys, g);
4679 break;
4680 }
4681
4682 case PGMPOOLKIND_PAE_PT_FOR_PAE_PT:
4683 {
4684 STAM_PROFILE_START(&pPool->StatTrackDerefGCPhys, g);
4685 void *pvGst;
4686 int rc = PGM_GCPHYS_2_PTR(pVM, pPage->GCPhys, &pvGst); AssertReleaseRC(rc);
4687 pgmPoolTrackDerefPTPaePae(pPool, pPage, (PPGMSHWPTPAE)pvShw, (PCX86PTPAE)pvGst);
4688 PGM_DYNMAP_UNUSED_HINT_VM(pVM, pvGst);
4689 STAM_PROFILE_STOP(&pPool->StatTrackDerefGCPhys, g);
4690 break;
4691 }
4692
4693 case PGMPOOLKIND_32BIT_PT_FOR_PHYS: /* treat it like a 4 MB page */
4694 case PGMPOOLKIND_32BIT_PT_FOR_32BIT_4MB:
4695 {
4696 STAM_PROFILE_START(&pPool->StatTrackDerefGCPhys, g);
4697 pgmPoolTrackDerefPT32Bit4MB(pPool, pPage, (PX86PT)pvShw);
4698 STAM_PROFILE_STOP(&pPool->StatTrackDerefGCPhys, g);
4699 break;
4700 }
4701
4702 case PGMPOOLKIND_PAE_PT_FOR_PHYS: /* treat it like a 2 MB page */
4703 case PGMPOOLKIND_PAE_PT_FOR_PAE_2MB:
4704 case PGMPOOLKIND_PAE_PT_FOR_32BIT_4MB:
4705 {
4706 STAM_PROFILE_START(&pPool->StatTrackDerefGCPhys, g);
4707 pgmPoolTrackDerefPTPaeBig(pPool, pPage, (PPGMSHWPTPAE)pvShw);
4708 STAM_PROFILE_STOP(&pPool->StatTrackDerefGCPhys, g);
4709 break;
4710 }
4711
4712 case PGMPOOLKIND_PAE_PD0_FOR_32BIT_PD:
4713 case PGMPOOLKIND_PAE_PD1_FOR_32BIT_PD:
4714 case PGMPOOLKIND_PAE_PD2_FOR_32BIT_PD:
4715 case PGMPOOLKIND_PAE_PD3_FOR_32BIT_PD:
4716 case PGMPOOLKIND_PAE_PD_FOR_PAE_PD:
4717 case PGMPOOLKIND_PAE_PD_PHYS:
4718 case PGMPOOLKIND_64BIT_PD_FOR_64BIT_PD:
4719 case PGMPOOLKIND_64BIT_PD_FOR_PHYS:
4720 pgmPoolTrackDerefPDPae(pPool, pPage, (PX86PDPAE)pvShw);
4721 break;
4722
4723 case PGMPOOLKIND_32BIT_PD_PHYS:
4724 case PGMPOOLKIND_32BIT_PD:
4725 pgmPoolTrackDerefPD(pPool, pPage, (PX86PD)pvShw);
4726 break;
4727
4728 case PGMPOOLKIND_PAE_PDPT_FOR_32BIT:
4729 case PGMPOOLKIND_PAE_PDPT:
4730 case PGMPOOLKIND_PAE_PDPT_PHYS:
4731 pgmPoolTrackDerefPDPTPae(pPool, pPage, (PX86PDPT)pvShw);
4732 break;
4733
4734 case PGMPOOLKIND_64BIT_PDPT_FOR_PHYS:
4735 case PGMPOOLKIND_64BIT_PDPT_FOR_64BIT_PDPT:
4736 pgmPoolTrackDerefPDPT64Bit(pPool, pPage, (PX86PDPT)pvShw);
4737 break;
4738
4739 case PGMPOOLKIND_64BIT_PML4:
4740 pgmPoolTrackDerefPML464Bit(pPool, pPage, (PX86PML4)pvShw);
4741 break;
4742
4743 case PGMPOOLKIND_EPT_PT_FOR_PHYS:
4744 pgmPoolTrackDerefPTEPT(pPool, pPage, (PEPTPT)pvShw);
4745 break;
4746
4747 case PGMPOOLKIND_EPT_PD_FOR_PHYS:
4748 pgmPoolTrackDerefPDEPT(pPool, pPage, (PEPTPD)pvShw);
4749 break;
4750
4751 case PGMPOOLKIND_EPT_PDPT_FOR_PHYS:
4752 pgmPoolTrackDerefPDPTEPT(pPool, pPage, (PEPTPDPT)pvShw);
4753 break;
4754
4755 default:
4756 AssertFatalMsgFailed(("enmKind=%d\n", pPage->enmKind));
4757 }
4758
4759 /* paranoia, clear the shadow page. Remove this laser (i.e. let Alloc and ClearAll do it). */
4760 STAM_PROFILE_START(&pPool->StatZeroPage, z);
4761 ASMMemZeroPage(pvShw);
4762 STAM_PROFILE_STOP(&pPool->StatZeroPage, z);
4763 pPage->fZeroed = true;
4764 Assert(!pPage->cPresent);
4765 PGM_DYNMAP_UNUSED_HINT_VM(pVM, pvShw);
4766}
4767
4768
4769/**
4770 * Flushes a pool page.
4771 *
4772 * This moves the page to the free list after removing all user references to it.
4773 *
4774 * @returns VBox status code.
4775 * @retval VINF_SUCCESS on success.
4776 * @param pPool The pool.
4777 * @param HCPhys The HC physical address of the shadow page.
4778 * @param fFlush Flush the TLBS when required (should only be false in very specific use cases!!)
4779 */
4780int pgmPoolFlushPage(PPGMPOOL pPool, PPGMPOOLPAGE pPage, bool fFlush)
4781{
4782 PVM pVM = pPool->CTX_SUFF(pVM);
4783 bool fFlushRequired = false;
4784
4785 int rc = VINF_SUCCESS;
4786 STAM_PROFILE_START(&pPool->StatFlushPage, f);
4787 LogFlow(("pgmPoolFlushPage: pPage=%p:{.Key=%RHp, .idx=%d, .enmKind=%s, .GCPhys=%RGp}\n",
4788 pPage, pPage->Core.Key, pPage->idx, pgmPoolPoolKindToStr(pPage->enmKind), pPage->GCPhys));
4789
4790 /*
4791 * Reject any attempts at flushing any of the special root pages (shall
4792 * not happen).
4793 */
4794 AssertMsgReturn(pPage->idx >= PGMPOOL_IDX_FIRST,
4795 ("pgmPoolFlushPage: special root page, rejected. enmKind=%s idx=%d\n",
4796 pgmPoolPoolKindToStr(pPage->enmKind), pPage->idx),
4797 VINF_SUCCESS);
4798
4799 pgmLock(pVM);
4800
4801 /*
4802 * Quietly reject any attempts at flushing the currently active shadow CR3 mapping
4803 */
4804 if (pgmPoolIsPageLocked(pPage))
4805 {
4806 AssertMsg( pPage->enmKind == PGMPOOLKIND_64BIT_PML4
4807 || pPage->enmKind == PGMPOOLKIND_PAE_PDPT
4808 || pPage->enmKind == PGMPOOLKIND_PAE_PDPT_FOR_32BIT
4809 || pPage->enmKind == PGMPOOLKIND_32BIT_PD
4810 || pPage->enmKind == PGMPOOLKIND_PAE_PD_FOR_PAE_PD
4811 || pPage->enmKind == PGMPOOLKIND_PAE_PD0_FOR_32BIT_PD
4812 || pPage->enmKind == PGMPOOLKIND_PAE_PD1_FOR_32BIT_PD
4813 || pPage->enmKind == PGMPOOLKIND_PAE_PD2_FOR_32BIT_PD
4814 || pPage->enmKind == PGMPOOLKIND_PAE_PD3_FOR_32BIT_PD
4815 || pPage->enmKind == PGMPOOLKIND_ROOT_NESTED,
4816 ("Can't free the shadow CR3! (%RHp vs %RHp kind=%d\n", PGMGetHyperCR3(VMMGetCpu(pVM)), pPage->Core.Key, pPage->enmKind));
4817 Log(("pgmPoolFlushPage: current active shadow CR3, rejected. enmKind=%s idx=%d\n", pgmPoolPoolKindToStr(pPage->enmKind), pPage->idx));
4818 pgmUnlock(pVM);
4819 return VINF_SUCCESS;
4820 }
4821
4822#if defined(VBOX_WITH_2X_4GB_ADDR_SPACE_IN_R0) || defined(IN_RC)
4823 /* Start a subset so we won't run out of mapping space. */
4824 PVMCPU pVCpu = VMMGetCpu(pVM);
4825 uint32_t iPrevSubset = PGMRZDynMapPushAutoSubset(pVCpu);
4826#endif
4827
4828 /*
4829 * Mark the page as being in need of an ASMMemZeroPage().
4830 */
4831 pPage->fZeroed = false;
4832
4833#ifdef PGMPOOL_WITH_OPTIMIZED_DIRTY_PT
4834 if (pPage->fDirty)
4835 pgmPoolFlushDirtyPage(pVM, pPool, pPage->idxDirtyEntry, false /* do not remove */);
4836#endif
4837
4838 /* If there are any users of this table, then we *must* issue a tlb flush on all VCPUs. */
4839 if (pPage->iUserHead != NIL_PGMPOOL_USER_INDEX)
4840 fFlushRequired = true;
4841
4842 /*
4843 * Clear the page.
4844 */
4845 pgmPoolTrackClearPageUsers(pPool, pPage);
4846 STAM_PROFILE_START(&pPool->StatTrackDeref,a);
4847 pgmPoolTrackDeref(pPool, pPage);
4848 STAM_PROFILE_STOP(&pPool->StatTrackDeref,a);
4849
4850 /*
4851 * Flush it from the cache.
4852 */
4853 pgmPoolCacheFlushPage(pPool, pPage);
4854
4855#if defined(VBOX_WITH_2X_4GB_ADDR_SPACE_IN_R0) || defined(IN_RC)
4856 /* Heavy stuff done. */
4857 PGMRZDynMapPopAutoSubset(pVCpu, iPrevSubset);
4858#endif
4859
4860 /*
4861 * Deregistering the monitoring.
4862 */
4863 if (pPage->fMonitored)
4864 rc = pgmPoolMonitorFlush(pPool, pPage);
4865
4866 /*
4867 * Free the page.
4868 */
4869 Assert(pPage->iNext == NIL_PGMPOOL_IDX);
4870 pPage->iNext = pPool->iFreeHead;
4871 pPool->iFreeHead = pPage->idx;
4872 pPage->enmKind = PGMPOOLKIND_FREE;
4873 pPage->enmAccess = PGMPOOLACCESS_DONTCARE;
4874 pPage->GCPhys = NIL_RTGCPHYS;
4875 pPage->fReusedFlushPending = false;
4876
4877 pPool->cUsedPages--;
4878
4879 /* Flush the TLBs of all VCPUs if required. */
4880 if ( fFlushRequired
4881 && fFlush)
4882 {
4883 PGM_INVL_ALL_VCPU_TLBS(pVM);
4884 }
4885
4886 pgmUnlock(pVM);
4887 STAM_PROFILE_STOP(&pPool->StatFlushPage, f);
4888 return rc;
4889}
4890
4891
4892/**
4893 * Frees a usage of a pool page.
4894 *
4895 * The caller is responsible to updating the user table so that it no longer
4896 * references the shadow page.
4897 *
4898 * @param pPool The pool.
4899 * @param HCPhys The HC physical address of the shadow page.
4900 * @param iUser The shadow page pool index of the user table.
4901 * @param iUserTable The index into the user table (shadowed).
4902 */
4903void pgmPoolFreeByPage(PPGMPOOL pPool, PPGMPOOLPAGE pPage, uint16_t iUser, uint32_t iUserTable)
4904{
4905 PVM pVM = pPool->CTX_SUFF(pVM);
4906
4907 STAM_PROFILE_START(&pPool->StatFree, a);
4908 LogFlow(("pgmPoolFreeByPage: pPage=%p:{.Key=%RHp, .idx=%d, enmKind=%s} iUser=%d iUserTable=%#x\n",
4909 pPage, pPage->Core.Key, pPage->idx, pgmPoolPoolKindToStr(pPage->enmKind), iUser, iUserTable));
4910 AssertReturnVoid(pPage->idx >= PGMPOOL_IDX_FIRST); /* paranoia (#6349) */
4911
4912 pgmLock(pVM);
4913 pgmPoolTrackFreeUser(pPool, pPage, iUser, iUserTable);
4914 if (!pPage->fCached)
4915 pgmPoolFlushPage(pPool, pPage);
4916 pgmUnlock(pVM);
4917 STAM_PROFILE_STOP(&pPool->StatFree, a);
4918}
4919
4920
4921/**
4922 * Makes one or more free page free.
4923 *
4924 * @returns VBox status code.
4925 * @retval VINF_SUCCESS on success.
4926 * @retval VERR_PGM_POOL_FLUSHED if the pool was flushed.
4927 *
4928 * @param pPool The pool.
4929 * @param enmKind Page table kind
4930 * @param iUser The user of the page.
4931 */
4932static int pgmPoolMakeMoreFreePages(PPGMPOOL pPool, PGMPOOLKIND enmKind, uint16_t iUser)
4933{
4934 PVM pVM = pPool->CTX_SUFF(pVM);
4935 LogFlow(("pgmPoolMakeMoreFreePages: iUser=%d\n", iUser));
4936 NOREF(enmKind);
4937
4938 /*
4939 * If the pool isn't full grown yet, expand it.
4940 */
4941 if ( pPool->cCurPages < pPool->cMaxPages
4942#if defined(IN_RC)
4943 /* Hack alert: we can't deal with jumps to ring 3 when called from MapCR3 and allocating pages for PAE PDs. */
4944 && enmKind != PGMPOOLKIND_PAE_PD_FOR_PAE_PD
4945 && (enmKind < PGMPOOLKIND_PAE_PD0_FOR_32BIT_PD || enmKind > PGMPOOLKIND_PAE_PD3_FOR_32BIT_PD)
4946#endif
4947 )
4948 {
4949 STAM_PROFILE_ADV_SUSPEND(&pPool->StatAlloc, a);
4950#ifdef IN_RING3
4951 int rc = PGMR3PoolGrow(pVM);
4952#else
4953 int rc = VMMRZCallRing3NoCpu(pVM, VMMCALLRING3_PGM_POOL_GROW, 0);
4954#endif
4955 if (RT_FAILURE(rc))
4956 return rc;
4957 STAM_PROFILE_ADV_RESUME(&pPool->StatAlloc, a);
4958 if (pPool->iFreeHead != NIL_PGMPOOL_IDX)
4959 return VINF_SUCCESS;
4960 }
4961
4962 /*
4963 * Free one cached page.
4964 */
4965 return pgmPoolCacheFreeOne(pPool, iUser);
4966}
4967
4968
4969/**
4970 * Allocates a page from the pool.
4971 *
4972 * This page may actually be a cached page and not in need of any processing
4973 * on the callers part.
4974 *
4975 * @returns VBox status code.
4976 * @retval VINF_SUCCESS if a NEW page was allocated.
4977 * @retval VINF_PGM_CACHED_PAGE if a CACHED page was returned.
4978 * @retval VERR_PGM_POOL_FLUSHED if the pool was flushed.
4979 *
4980 * @param pVM Pointer to the VM.
4981 * @param GCPhys The GC physical address of the page we're gonna shadow.
4982 * For 4MB and 2MB PD entries, it's the first address the
4983 * shadow PT is covering.
4984 * @param enmKind The kind of mapping.
4985 * @param enmAccess Access type for the mapping (only relevant for big pages)
4986 * @param fA20Enabled Whether the A20 gate is enabled or not.
4987 * @param iUser The shadow page pool index of the user table.
4988 * @param iUserTable The index into the user table (shadowed).
4989 * @param fLockPage Lock the page
4990 * @param ppPage Where to store the pointer to the page. NULL is stored here on failure.
4991 */
4992int pgmPoolAlloc(PVM pVM, RTGCPHYS GCPhys, PGMPOOLKIND enmKind, PGMPOOLACCESS enmAccess, bool fA20Enabled,
4993 uint16_t iUser, uint32_t iUserTable, bool fLockPage, PPPGMPOOLPAGE ppPage)
4994{
4995 PPGMPOOL pPool = pVM->pgm.s.CTX_SUFF(pPool);
4996 STAM_PROFILE_ADV_START(&pPool->StatAlloc, a);
4997 LogFlow(("pgmPoolAlloc: GCPhys=%RGp enmKind=%s iUser=%d iUserTable=%#x\n", GCPhys, pgmPoolPoolKindToStr(enmKind), iUser, iUserTable));
4998 *ppPage = NULL;
4999 /** @todo CSAM/PGMPrefetchPage messes up here during CSAMR3CheckGates
5000 * (TRPMR3SyncIDT) because of FF priority. Try fix that?
5001 * Assert(!(pVM->pgm.s.fGlobalSyncFlags & PGM_SYNC_CLEAR_PGM_POOL)); */
5002
5003 pgmLock(pVM);
5004
5005 if (pPool->fCacheEnabled)
5006 {
5007 int rc2 = pgmPoolCacheAlloc(pPool, GCPhys, enmKind, enmAccess, fA20Enabled, iUser, iUserTable, ppPage);
5008 if (RT_SUCCESS(rc2))
5009 {
5010 if (fLockPage)
5011 pgmPoolLockPage(pPool, *ppPage);
5012 pgmUnlock(pVM);
5013 STAM_PROFILE_ADV_STOP(&pPool->StatAlloc, a);
5014 LogFlow(("pgmPoolAlloc: cached returns %Rrc *ppPage=%p:{.Key=%RHp, .idx=%d}\n", rc2, *ppPage, (*ppPage)->Core.Key, (*ppPage)->idx));
5015 return rc2;
5016 }
5017 }
5018
5019 /*
5020 * Allocate a new one.
5021 */
5022 int rc = VINF_SUCCESS;
5023 uint16_t iNew = pPool->iFreeHead;
5024 if (iNew == NIL_PGMPOOL_IDX)
5025 {
5026 rc = pgmPoolMakeMoreFreePages(pPool, enmKind, iUser);
5027 if (RT_FAILURE(rc))
5028 {
5029 pgmUnlock(pVM);
5030 Log(("pgmPoolAlloc: returns %Rrc (Free)\n", rc));
5031 STAM_PROFILE_ADV_STOP(&pPool->StatAlloc, a);
5032 return rc;
5033 }
5034 iNew = pPool->iFreeHead;
5035 AssertReleaseReturn(iNew != NIL_PGMPOOL_IDX, VERR_PGM_POOL_IPE);
5036 }
5037
5038 /* unlink the free head */
5039 PPGMPOOLPAGE pPage = &pPool->aPages[iNew];
5040 pPool->iFreeHead = pPage->iNext;
5041 pPage->iNext = NIL_PGMPOOL_IDX;
5042
5043 /*
5044 * Initialize it.
5045 */
5046 pPool->cUsedPages++; /* physical handler registration / pgmPoolTrackFlushGCPhysPTsSlow requirement. */
5047 pPage->enmKind = enmKind;
5048 pPage->enmAccess = enmAccess;
5049 pPage->GCPhys = GCPhys;
5050 pPage->fA20Enabled = fA20Enabled;
5051 pPage->fSeenNonGlobal = false; /* Set this to 'true' to disable this feature. */
5052 pPage->fMonitored = false;
5053 pPage->fCached = false;
5054 pPage->fDirty = false;
5055 pPage->fReusedFlushPending = false;
5056 pPage->cModifications = 0;
5057 pPage->iModifiedNext = NIL_PGMPOOL_IDX;
5058 pPage->iModifiedPrev = NIL_PGMPOOL_IDX;
5059 pPage->cPresent = 0;
5060 pPage->iFirstPresent = NIL_PGMPOOL_PRESENT_INDEX;
5061 pPage->idxDirtyEntry = 0;
5062 pPage->GCPtrLastAccessHandlerFault = NIL_RTGCPTR;
5063 pPage->GCPtrLastAccessHandlerRip = NIL_RTGCPTR;
5064 pPage->cLastAccessHandler = 0;
5065 pPage->cLocked = 0;
5066# ifdef VBOX_STRICT
5067 pPage->GCPtrDirtyFault = NIL_RTGCPTR;
5068# endif
5069
5070 /*
5071 * Insert into the tracking and cache. If this fails, free the page.
5072 */
5073 int rc3 = pgmPoolTrackInsert(pPool, pPage, GCPhys, iUser, iUserTable);
5074 if (RT_FAILURE(rc3))
5075 {
5076 pPool->cUsedPages--;
5077 pPage->enmKind = PGMPOOLKIND_FREE;
5078 pPage->enmAccess = PGMPOOLACCESS_DONTCARE;
5079 pPage->GCPhys = NIL_RTGCPHYS;
5080 pPage->iNext = pPool->iFreeHead;
5081 pPool->iFreeHead = pPage->idx;
5082 pgmUnlock(pVM);
5083 STAM_PROFILE_ADV_STOP(&pPool->StatAlloc, a);
5084 Log(("pgmPoolAlloc: returns %Rrc (Insert)\n", rc3));
5085 return rc3;
5086 }
5087
5088 /*
5089 * Commit the allocation, clear the page and return.
5090 */
5091#ifdef VBOX_WITH_STATISTICS
5092 if (pPool->cUsedPages > pPool->cUsedPagesHigh)
5093 pPool->cUsedPagesHigh = pPool->cUsedPages;
5094#endif
5095
5096 if (!pPage->fZeroed)
5097 {
5098 STAM_PROFILE_START(&pPool->StatZeroPage, z);
5099 void *pv = PGMPOOL_PAGE_2_PTR(pVM, pPage);
5100 ASMMemZeroPage(pv);
5101 STAM_PROFILE_STOP(&pPool->StatZeroPage, z);
5102 }
5103
5104 *ppPage = pPage;
5105 if (fLockPage)
5106 pgmPoolLockPage(pPool, pPage);
5107 pgmUnlock(pVM);
5108 LogFlow(("pgmPoolAlloc: returns %Rrc *ppPage=%p:{.Key=%RHp, .idx=%d, .fCached=%RTbool, .fMonitored=%RTbool}\n",
5109 rc, pPage, pPage->Core.Key, pPage->idx, pPage->fCached, pPage->fMonitored));
5110 STAM_PROFILE_ADV_STOP(&pPool->StatAlloc, a);
5111 return rc;
5112}
5113
5114
5115/**
5116 * Frees a usage of a pool page.
5117 *
5118 * @param pVM Pointer to the VM.
5119 * @param HCPhys The HC physical address of the shadow page.
5120 * @param iUser The shadow page pool index of the user table.
5121 * @param iUserTable The index into the user table (shadowed).
5122 */
5123void pgmPoolFree(PVM pVM, RTHCPHYS HCPhys, uint16_t iUser, uint32_t iUserTable)
5124{
5125 LogFlow(("pgmPoolFree: HCPhys=%RHp iUser=%d iUserTable=%#x\n", HCPhys, iUser, iUserTable));
5126 PPGMPOOL pPool = pVM->pgm.s.CTX_SUFF(pPool);
5127 pgmPoolFreeByPage(pPool, pgmPoolGetPage(pPool, HCPhys), iUser, iUserTable);
5128}
5129
5130
5131/**
5132 * Internal worker for finding a 'in-use' shadow page give by it's physical address.
5133 *
5134 * @returns Pointer to the shadow page structure.
5135 * @param pPool The pool.
5136 * @param HCPhys The HC physical address of the shadow page.
5137 */
5138PPGMPOOLPAGE pgmPoolGetPage(PPGMPOOL pPool, RTHCPHYS HCPhys)
5139{
5140 PGM_LOCK_ASSERT_OWNER(pPool->CTX_SUFF(pVM));
5141
5142 /*
5143 * Look up the page.
5144 */
5145 PPGMPOOLPAGE pPage = (PPGMPOOLPAGE)RTAvloHCPhysGet(&pPool->HCPhysTree, HCPhys & X86_PTE_PAE_PG_MASK);
5146
5147 AssertFatalMsg(pPage && pPage->enmKind != PGMPOOLKIND_FREE, ("HCPhys=%RHp pPage=%p idx=%d\n", HCPhys, pPage, (pPage) ? pPage->idx : 0));
5148 return pPage;
5149}
5150
5151
5152/**
5153 * Internal worker for finding a page for debugging purposes, no assertions.
5154 *
5155 * @returns Pointer to the shadow page structure. NULL on if not found.
5156 * @param pPool The pool.
5157 * @param HCPhys The HC physical address of the shadow page.
5158 */
5159PPGMPOOLPAGE pgmPoolQueryPageForDbg(PPGMPOOL pPool, RTHCPHYS HCPhys)
5160{
5161 PGM_LOCK_ASSERT_OWNER(pPool->CTX_SUFF(pVM));
5162 return (PPGMPOOLPAGE)RTAvloHCPhysGet(&pPool->HCPhysTree, HCPhys & X86_PTE_PAE_PG_MASK);
5163}
5164
5165#ifdef IN_RING3 /* currently only used in ring 3; save some space in the R0 & GC modules (left it here as we might need it elsewhere later on) */
5166
5167/**
5168 * Flush the specified page if present
5169 *
5170 * @param pVM Pointer to the VM.
5171 * @param GCPhys Guest physical address of the page to flush
5172 */
5173void pgmPoolFlushPageByGCPhys(PVM pVM, RTGCPHYS GCPhys)
5174{
5175 PPGMPOOL pPool = pVM->pgm.s.CTX_SUFF(pPool);
5176
5177 VM_ASSERT_EMT(pVM);
5178
5179 /*
5180 * Look up the GCPhys in the hash.
5181 */
5182 GCPhys = GCPhys & ~(RTGCPHYS)PAGE_OFFSET_MASK;
5183 unsigned i = pPool->aiHash[PGMPOOL_HASH(GCPhys)];
5184 if (i == NIL_PGMPOOL_IDX)
5185 return;
5186
5187 do
5188 {
5189 PPGMPOOLPAGE pPage = &pPool->aPages[i];
5190 if (pPage->GCPhys - GCPhys < PAGE_SIZE)
5191 {
5192 switch (pPage->enmKind)
5193 {
5194 case PGMPOOLKIND_32BIT_PT_FOR_32BIT_PT:
5195 case PGMPOOLKIND_PAE_PT_FOR_32BIT_PT:
5196 case PGMPOOLKIND_PAE_PT_FOR_PAE_PT:
5197 case PGMPOOLKIND_PAE_PD0_FOR_32BIT_PD:
5198 case PGMPOOLKIND_PAE_PD1_FOR_32BIT_PD:
5199 case PGMPOOLKIND_PAE_PD2_FOR_32BIT_PD:
5200 case PGMPOOLKIND_PAE_PD3_FOR_32BIT_PD:
5201 case PGMPOOLKIND_PAE_PD_FOR_PAE_PD:
5202 case PGMPOOLKIND_64BIT_PD_FOR_64BIT_PD:
5203 case PGMPOOLKIND_64BIT_PDPT_FOR_64BIT_PDPT:
5204 case PGMPOOLKIND_64BIT_PML4:
5205 case PGMPOOLKIND_32BIT_PD:
5206 case PGMPOOLKIND_PAE_PDPT:
5207 {
5208 Log(("PGMPoolFlushPage: found pgm pool pages for %RGp\n", GCPhys));
5209#ifdef PGMPOOL_WITH_OPTIMIZED_DIRTY_PT
5210 if (pPage->fDirty)
5211 STAM_COUNTER_INC(&pPool->StatForceFlushDirtyPage);
5212 else
5213#endif
5214 STAM_COUNTER_INC(&pPool->StatForceFlushPage);
5215 Assert(!pgmPoolIsPageLocked(pPage));
5216 pgmPoolMonitorChainFlush(pPool, pPage);
5217 return;
5218 }
5219
5220 /* ignore, no monitoring. */
5221 case PGMPOOLKIND_32BIT_PT_FOR_32BIT_4MB:
5222 case PGMPOOLKIND_PAE_PT_FOR_PAE_2MB:
5223 case PGMPOOLKIND_PAE_PT_FOR_32BIT_4MB:
5224 case PGMPOOLKIND_32BIT_PT_FOR_PHYS:
5225 case PGMPOOLKIND_PAE_PT_FOR_PHYS:
5226 case PGMPOOLKIND_64BIT_PDPT_FOR_PHYS:
5227 case PGMPOOLKIND_64BIT_PD_FOR_PHYS:
5228 case PGMPOOLKIND_EPT_PDPT_FOR_PHYS:
5229 case PGMPOOLKIND_EPT_PD_FOR_PHYS:
5230 case PGMPOOLKIND_EPT_PT_FOR_PHYS:
5231 case PGMPOOLKIND_ROOT_NESTED:
5232 case PGMPOOLKIND_PAE_PD_PHYS:
5233 case PGMPOOLKIND_PAE_PDPT_PHYS:
5234 case PGMPOOLKIND_32BIT_PD_PHYS:
5235 case PGMPOOLKIND_PAE_PDPT_FOR_32BIT:
5236 break;
5237
5238 default:
5239 AssertFatalMsgFailed(("enmKind=%d idx=%d\n", pPage->enmKind, pPage->idx));
5240 }
5241 }
5242
5243 /* next */
5244 i = pPage->iNext;
5245 } while (i != NIL_PGMPOOL_IDX);
5246 return;
5247}
5248
5249#endif /* IN_RING3 */
5250#ifdef IN_RING3
5251
5252/**
5253 * Reset CPU on hot plugging.
5254 *
5255 * @param pVM Pointer to the VM.
5256 * @param pVCpu The virtual CPU.
5257 */
5258void pgmR3PoolResetUnpluggedCpu(PVM pVM, PVMCPU pVCpu)
5259{
5260 pgmR3ExitShadowModeBeforePoolFlush(pVCpu);
5261
5262 pgmR3ReEnterShadowModeAfterPoolFlush(pVM, pVCpu);
5263 VMCPU_FF_SET(pVCpu, VMCPU_FF_PGM_SYNC_CR3);
5264 VMCPU_FF_SET(pVCpu, VMCPU_FF_TLB_FLUSH);
5265}
5266
5267
5268/**
5269 * Flushes the entire cache.
5270 *
5271 * It will assert a global CR3 flush (FF) and assumes the caller is aware of
5272 * this and execute this CR3 flush.
5273 *
5274 * @param pPool The pool.
5275 */
5276void pgmR3PoolReset(PVM pVM)
5277{
5278 PPGMPOOL pPool = pVM->pgm.s.CTX_SUFF(pPool);
5279
5280 PGM_LOCK_ASSERT_OWNER(pVM);
5281 STAM_PROFILE_START(&pPool->StatR3Reset, a);
5282 LogFlow(("pgmR3PoolReset:\n"));
5283
5284 /*
5285 * If there are no pages in the pool, there is nothing to do.
5286 */
5287 if (pPool->cCurPages <= PGMPOOL_IDX_FIRST)
5288 {
5289 STAM_PROFILE_STOP(&pPool->StatR3Reset, a);
5290 return;
5291 }
5292
5293 /*
5294 * Exit the shadow mode since we're going to clear everything,
5295 * including the root page.
5296 */
5297 for (VMCPUID i = 0; i < pVM->cCpus; i++)
5298 pgmR3ExitShadowModeBeforePoolFlush(&pVM->aCpus[i]);
5299
5300 /*
5301 * Nuke the free list and reinsert all pages into it.
5302 */
5303 for (unsigned i = pPool->cCurPages - 1; i >= PGMPOOL_IDX_FIRST; i--)
5304 {
5305 PPGMPOOLPAGE pPage = &pPool->aPages[i];
5306
5307 Assert(pPage->Core.Key == MMPage2Phys(pVM, pPage->pvPageR3));
5308 if (pPage->fMonitored)
5309 pgmPoolMonitorFlush(pPool, pPage);
5310 pPage->iModifiedNext = NIL_PGMPOOL_IDX;
5311 pPage->iModifiedPrev = NIL_PGMPOOL_IDX;
5312 pPage->iMonitoredNext = NIL_PGMPOOL_IDX;
5313 pPage->iMonitoredPrev = NIL_PGMPOOL_IDX;
5314 pPage->cModifications = 0;
5315 pPage->GCPhys = NIL_RTGCPHYS;
5316 pPage->enmKind = PGMPOOLKIND_FREE;
5317 pPage->enmAccess = PGMPOOLACCESS_DONTCARE;
5318 Assert(pPage->idx == i);
5319 pPage->iNext = i + 1;
5320 pPage->fA20Enabled = true;
5321 pPage->fZeroed = false; /* This could probably be optimized, but better safe than sorry. */
5322 pPage->fSeenNonGlobal = false;
5323 pPage->fMonitored = false;
5324 pPage->fDirty = false;
5325 pPage->fCached = false;
5326 pPage->fReusedFlushPending = false;
5327 pPage->iUserHead = NIL_PGMPOOL_USER_INDEX;
5328 pPage->iAgeNext = NIL_PGMPOOL_IDX;
5329 pPage->iAgePrev = NIL_PGMPOOL_IDX;
5330 pPage->GCPtrLastAccessHandlerRip = NIL_RTGCPTR;
5331 pPage->GCPtrLastAccessHandlerFault = NIL_RTGCPTR;
5332 pPage->cLastAccessHandler = 0;
5333 pPage->cLocked = 0;
5334#ifdef VBOX_STRICT
5335 pPage->GCPtrDirtyFault = NIL_RTGCPTR;
5336#endif
5337 }
5338 pPool->aPages[pPool->cCurPages - 1].iNext = NIL_PGMPOOL_IDX;
5339 pPool->iFreeHead = PGMPOOL_IDX_FIRST;
5340 pPool->cUsedPages = 0;
5341
5342 /*
5343 * Zap and reinitialize the user records.
5344 */
5345 pPool->cPresent = 0;
5346 pPool->iUserFreeHead = 0;
5347 PPGMPOOLUSER paUsers = pPool->CTX_SUFF(paUsers);
5348 const unsigned cMaxUsers = pPool->cMaxUsers;
5349 for (unsigned i = 0; i < cMaxUsers; i++)
5350 {
5351 paUsers[i].iNext = i + 1;
5352 paUsers[i].iUser = NIL_PGMPOOL_IDX;
5353 paUsers[i].iUserTable = 0xfffffffe;
5354 }
5355 paUsers[cMaxUsers - 1].iNext = NIL_PGMPOOL_USER_INDEX;
5356
5357 /*
5358 * Clear all the GCPhys links and rebuild the phys ext free list.
5359 */
5360 for (PPGMRAMRANGE pRam = pVM->pgm.s.CTX_SUFF(pRamRangesX);
5361 pRam;
5362 pRam = pRam->CTX_SUFF(pNext))
5363 {
5364 unsigned iPage = pRam->cb >> PAGE_SHIFT;
5365 while (iPage-- > 0)
5366 PGM_PAGE_SET_TRACKING(pVM, &pRam->aPages[iPage], 0);
5367 }
5368
5369 pPool->iPhysExtFreeHead = 0;
5370 PPGMPOOLPHYSEXT paPhysExts = pPool->CTX_SUFF(paPhysExts);
5371 const unsigned cMaxPhysExts = pPool->cMaxPhysExts;
5372 for (unsigned i = 0; i < cMaxPhysExts; i++)
5373 {
5374 paPhysExts[i].iNext = i + 1;
5375 paPhysExts[i].aidx[0] = NIL_PGMPOOL_IDX;
5376 paPhysExts[i].apte[0] = NIL_PGMPOOL_PHYSEXT_IDX_PTE;
5377 paPhysExts[i].aidx[1] = NIL_PGMPOOL_IDX;
5378 paPhysExts[i].apte[1] = NIL_PGMPOOL_PHYSEXT_IDX_PTE;
5379 paPhysExts[i].aidx[2] = NIL_PGMPOOL_IDX;
5380 paPhysExts[i].apte[2] = NIL_PGMPOOL_PHYSEXT_IDX_PTE;
5381 }
5382 paPhysExts[cMaxPhysExts - 1].iNext = NIL_PGMPOOL_PHYSEXT_INDEX;
5383
5384 /*
5385 * Just zap the modified list.
5386 */
5387 pPool->cModifiedPages = 0;
5388 pPool->iModifiedHead = NIL_PGMPOOL_IDX;
5389
5390 /*
5391 * Clear the GCPhys hash and the age list.
5392 */
5393 for (unsigned i = 0; i < RT_ELEMENTS(pPool->aiHash); i++)
5394 pPool->aiHash[i] = NIL_PGMPOOL_IDX;
5395 pPool->iAgeHead = NIL_PGMPOOL_IDX;
5396 pPool->iAgeTail = NIL_PGMPOOL_IDX;
5397
5398#ifdef PGMPOOL_WITH_OPTIMIZED_DIRTY_PT
5399 /* Clear all dirty pages. */
5400 pPool->idxFreeDirtyPage = 0;
5401 pPool->cDirtyPages = 0;
5402 for (unsigned i = 0; i < RT_ELEMENTS(pPool->aDirtyPages); i++)
5403 pPool->aDirtyPages[i].uIdx = NIL_PGMPOOL_IDX;
5404#endif
5405
5406 /*
5407 * Reinsert active pages into the hash and ensure monitoring chains are correct.
5408 */
5409 for (unsigned i = PGMPOOL_IDX_FIRST_SPECIAL; i < PGMPOOL_IDX_FIRST; i++)
5410 {
5411 PPGMPOOLPAGE pPage = &pPool->aPages[i];
5412
5413 /** @todo r=bird: Is this code still needed in any way? The special root
5414 * pages should not be monitored or anything these days AFAIK. */
5415 Assert(pPage->iNext == NIL_PGMPOOL_IDX);
5416 Assert(pPage->iModifiedNext == NIL_PGMPOOL_IDX);
5417 Assert(pPage->iModifiedPrev == NIL_PGMPOOL_IDX);
5418 Assert(pPage->iMonitoredNext == NIL_PGMPOOL_IDX);
5419 Assert(pPage->iMonitoredPrev == NIL_PGMPOOL_IDX);
5420 Assert(!pPage->fMonitored);
5421
5422 pPage->iNext = NIL_PGMPOOL_IDX;
5423 pPage->iModifiedNext = NIL_PGMPOOL_IDX;
5424 pPage->iModifiedPrev = NIL_PGMPOOL_IDX;
5425 pPage->cModifications = 0;
5426 /* ASSUMES that we're not sharing with any of the other special pages (safe for now). */
5427 pPage->iMonitoredNext = NIL_PGMPOOL_IDX;
5428 pPage->iMonitoredPrev = NIL_PGMPOOL_IDX;
5429 if (pPage->fMonitored)
5430 {
5431 int rc = PGMHandlerPhysicalChangeCallbacks(pVM, pPage->GCPhys & ~(RTGCPHYS)PAGE_OFFSET_MASK,
5432 pPool->pfnAccessHandlerR3, MMHyperCCToR3(pVM, pPage),
5433 pPool->pfnAccessHandlerR0, MMHyperCCToR0(pVM, pPage),
5434 pPool->pfnAccessHandlerRC, MMHyperCCToRC(pVM, pPage),
5435 pPool->pszAccessHandler);
5436 AssertFatalRCSuccess(rc);
5437 pgmPoolHashInsert(pPool, pPage);
5438 }
5439 Assert(pPage->iUserHead == NIL_PGMPOOL_USER_INDEX); /* for now */
5440 Assert(pPage->iAgeNext == NIL_PGMPOOL_IDX);
5441 Assert(pPage->iAgePrev == NIL_PGMPOOL_IDX);
5442 }
5443
5444 for (VMCPUID i = 0; i < pVM->cCpus; i++)
5445 {
5446 /*
5447 * Re-enter the shadowing mode and assert Sync CR3 FF.
5448 */
5449 PVMCPU pVCpu = &pVM->aCpus[i];
5450 pgmR3ReEnterShadowModeAfterPoolFlush(pVM, pVCpu);
5451 VMCPU_FF_SET(pVCpu, VMCPU_FF_PGM_SYNC_CR3);
5452 VMCPU_FF_SET(pVCpu, VMCPU_FF_TLB_FLUSH);
5453 }
5454
5455 STAM_PROFILE_STOP(&pPool->StatR3Reset, a);
5456}
5457
5458#endif /* IN_RING3 */
5459
5460#ifdef LOG_ENABLED
5461/**
5462 * Stringifies a PGMPOOLKIND value.
5463 */
5464static const char *pgmPoolPoolKindToStr(uint8_t enmKind)
5465{
5466 switch ((PGMPOOLKIND)enmKind)
5467 {
5468 case PGMPOOLKIND_INVALID:
5469 return "PGMPOOLKIND_INVALID";
5470 case PGMPOOLKIND_FREE:
5471 return "PGMPOOLKIND_FREE";
5472 case PGMPOOLKIND_32BIT_PT_FOR_PHYS:
5473 return "PGMPOOLKIND_32BIT_PT_FOR_PHYS";
5474 case PGMPOOLKIND_32BIT_PT_FOR_32BIT_PT:
5475 return "PGMPOOLKIND_32BIT_PT_FOR_32BIT_PT";
5476 case PGMPOOLKIND_32BIT_PT_FOR_32BIT_4MB:
5477 return "PGMPOOLKIND_32BIT_PT_FOR_32BIT_4MB";
5478 case PGMPOOLKIND_PAE_PT_FOR_PHYS:
5479 return "PGMPOOLKIND_PAE_PT_FOR_PHYS";
5480 case PGMPOOLKIND_PAE_PT_FOR_32BIT_PT:
5481 return "PGMPOOLKIND_PAE_PT_FOR_32BIT_PT";
5482 case PGMPOOLKIND_PAE_PT_FOR_32BIT_4MB:
5483 return "PGMPOOLKIND_PAE_PT_FOR_32BIT_4MB";
5484 case PGMPOOLKIND_PAE_PT_FOR_PAE_PT:
5485 return "PGMPOOLKIND_PAE_PT_FOR_PAE_PT";
5486 case PGMPOOLKIND_PAE_PT_FOR_PAE_2MB:
5487 return "PGMPOOLKIND_PAE_PT_FOR_PAE_2MB";
5488 case PGMPOOLKIND_32BIT_PD:
5489 return "PGMPOOLKIND_32BIT_PD";
5490 case PGMPOOLKIND_32BIT_PD_PHYS:
5491 return "PGMPOOLKIND_32BIT_PD_PHYS";
5492 case PGMPOOLKIND_PAE_PD0_FOR_32BIT_PD:
5493 return "PGMPOOLKIND_PAE_PD0_FOR_32BIT_PD";
5494 case PGMPOOLKIND_PAE_PD1_FOR_32BIT_PD:
5495 return "PGMPOOLKIND_PAE_PD1_FOR_32BIT_PD";
5496 case PGMPOOLKIND_PAE_PD2_FOR_32BIT_PD:
5497 return "PGMPOOLKIND_PAE_PD2_FOR_32BIT_PD";
5498 case PGMPOOLKIND_PAE_PD3_FOR_32BIT_PD:
5499 return "PGMPOOLKIND_PAE_PD3_FOR_32BIT_PD";
5500 case PGMPOOLKIND_PAE_PD_FOR_PAE_PD:
5501 return "PGMPOOLKIND_PAE_PD_FOR_PAE_PD";
5502 case PGMPOOLKIND_PAE_PD_PHYS:
5503 return "PGMPOOLKIND_PAE_PD_PHYS";
5504 case PGMPOOLKIND_PAE_PDPT_FOR_32BIT:
5505 return "PGMPOOLKIND_PAE_PDPT_FOR_32BIT";
5506 case PGMPOOLKIND_PAE_PDPT:
5507 return "PGMPOOLKIND_PAE_PDPT";
5508 case PGMPOOLKIND_PAE_PDPT_PHYS:
5509 return "PGMPOOLKIND_PAE_PDPT_PHYS";
5510 case PGMPOOLKIND_64BIT_PDPT_FOR_64BIT_PDPT:
5511 return "PGMPOOLKIND_64BIT_PDPT_FOR_64BIT_PDPT";
5512 case PGMPOOLKIND_64BIT_PDPT_FOR_PHYS:
5513 return "PGMPOOLKIND_64BIT_PDPT_FOR_PHYS";
5514 case PGMPOOLKIND_64BIT_PD_FOR_64BIT_PD:
5515 return "PGMPOOLKIND_64BIT_PD_FOR_64BIT_PD";
5516 case PGMPOOLKIND_64BIT_PD_FOR_PHYS:
5517 return "PGMPOOLKIND_64BIT_PD_FOR_PHYS";
5518 case PGMPOOLKIND_64BIT_PML4:
5519 return "PGMPOOLKIND_64BIT_PML4";
5520 case PGMPOOLKIND_EPT_PDPT_FOR_PHYS:
5521 return "PGMPOOLKIND_EPT_PDPT_FOR_PHYS";
5522 case PGMPOOLKIND_EPT_PD_FOR_PHYS:
5523 return "PGMPOOLKIND_EPT_PD_FOR_PHYS";
5524 case PGMPOOLKIND_EPT_PT_FOR_PHYS:
5525 return "PGMPOOLKIND_EPT_PT_FOR_PHYS";
5526 case PGMPOOLKIND_ROOT_NESTED:
5527 return "PGMPOOLKIND_ROOT_NESTED";
5528 }
5529 return "Unknown kind!";
5530}
5531#endif /* LOG_ENABLED*/
5532
Note: See TracBrowser for help on using the repository browser.

© 2023 Oracle
Contact | Privacy policy | Terms of Use