VirtualBox

source: vbox/trunk/src/VBox/Storage/VD.cpp@ 103068

Last change on this file since 103068 was 100078, checked in by vboxsync, 18 months ago

Main/src-server and Storage: Immutable media handling flexibility added bugref:5995

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 339.5 KB
Line 
1/* $Id: VD.cpp 100078 2023-06-06 05:15:22Z vboxsync $ */
2/** @file
3 * VD - Virtual disk container implementation.
4 */
5
6/*
7 * Copyright (C) 2006-2023 Oracle and/or its affiliates.
8 *
9 * This file is part of VirtualBox base platform packages, as
10 * available from https://www.virtualbox.org.
11 *
12 * This program is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU General Public License
14 * as published by the Free Software Foundation, in version 3 of the
15 * License.
16 *
17 * This program is distributed in the hope that it will be useful, but
18 * WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20 * General Public License for more details.
21 *
22 * You should have received a copy of the GNU General Public License
23 * along with this program; if not, see <https://www.gnu.org/licenses>.
24 *
25 * SPDX-License-Identifier: GPL-3.0-only
26 */
27
28
29/*********************************************************************************************************************************
30* Header Files *
31*********************************************************************************************************************************/
32#define LOG_GROUP LOG_GROUP_VD
33#include <VBox/vd.h>
34#include <VBox/err.h>
35#include <VBox/sup.h>
36#include <VBox/log.h>
37
38#include <iprt/alloc.h>
39#include <iprt/assert.h>
40#include <iprt/uuid.h>
41#include <iprt/file.h>
42#include <iprt/string.h>
43#include <iprt/asm.h>
44#include <iprt/param.h>
45#include <iprt/path.h>
46#include <iprt/sg.h>
47#include <iprt/semaphore.h>
48#include <iprt/vector.h>
49
50#include "VDInternal.h"
51
/** Size of the buffer used while merging images. */
#define VD_MERGE_BUFFER_SIZE            (16 * _1M)

/** Upper bound for the number of segments a single I/O task may have. */
#define VD_IO_TASK_SEGMENTS_MAX         64

/** Threshold after which blocks that were not recently used are removed
 * from the list. */
#define VD_DISCARD_REMOVE_THRESHOLD     (10 * _1M) /** @todo experiment */
60
61/**
62 * VD async I/O interface storage descriptor.
63 */
64typedef struct VDIIOFALLBACKSTORAGE
65{
66 /** File handle. */
67 RTFILE File;
68 /** Completion callback. */
69 PFNVDCOMPLETED pfnCompleted;
70 /** Thread for async access. */
71 RTTHREAD ThreadAsync;
72} VDIIOFALLBACKSTORAGE, *PVDIIOFALLBACKSTORAGE;
73
/** @name uModified bit flags.
 * @{ */
#define VD_IMAGE_MODIFIED_FLAG                  RT_BIT(0)
#define VD_IMAGE_MODIFIED_FIRST                 RT_BIT(1)
#define VD_IMAGE_MODIFIED_DISABLE_UUID_UPDATE   RT_BIT(2)
/** @} */
80
81
/**
 * Asserts that the fLocked member of the given disk is set, i.e. that the
 * disk lock is held by the time this is evaluated.
 *
 * @param   a_pDisk     The disk whose lock state is checked.
 */
#define VD_IS_LOCKED(a_pDisk) \
    do \
    { \
        NOREF(a_pDisk); \
        AssertMsg((a_pDisk)->fLocked, ("Lock not held\n")); \
    } while (0)
89
90/**
91 * VBox parent read descriptor, used internally for compaction.
92 */
93typedef struct VDPARENTSTATEDESC
94{
95 /** Pointer to disk descriptor. */
96 PVDISK pDisk;
97 /** Pointer to image descriptor. */
98 PVDIMAGE pImage;
99} VDPARENTSTATEDESC, *PVDPARENTSTATEDESC;
100
/**
 * Direction (kind) of a transfer handled by an I/O context.
 */
typedef enum VDIOCTXTXDIR
{
    /** Read request. */
    VDIOCTXTXDIR_READ       = 0,
    /** Write request. */
    VDIOCTXTXDIR_WRITE      = 1,
    /** Flush request. */
    VDIOCTXTXDIR_FLUSH      = 2,
    /** Discard request. */
    VDIOCTXTXDIR_DISCARD    = 3,
    /** Forces the enum to 32 bits. */
    VDIOCTXTXDIR_32BIT_HACK = 0x7fffffff
} VDIOCTXTXDIR, *PVDIOCTXTXDIR;
117
118/** Transfer function */
119typedef DECLCALLBACKTYPE(int, FNVDIOCTXTRANSFER ,(PVDIOCTX pIoCtx));
120/** Pointer to a transfer function. */
121typedef FNVDIOCTXTRANSFER *PFNVDIOCTXTRANSFER;
122
123/**
124 * I/O context
125 */
126typedef struct VDIOCTX
127{
128 /** Pointer to the next I/O context. */
129 struct VDIOCTX * volatile pIoCtxNext;
130 /** Disk this is request is for. */
131 PVDISK pDisk;
132 /** Return code. */
133 int rcReq;
134 /** Various flags for the I/O context. */
135 uint32_t fFlags;
136 /** Number of data transfers currently pending. */
137 volatile uint32_t cDataTransfersPending;
138 /** How many meta data transfers are pending. */
139 volatile uint32_t cMetaTransfersPending;
140 /** Flag whether the request finished */
141 volatile bool fComplete;
142 /** Temporary allocated memory which is freed
143 * when the context completes. */
144 void *pvAllocation;
145 /** Transfer function. */
146 PFNVDIOCTXTRANSFER pfnIoCtxTransfer;
147 /** Next transfer part after the current one completed. */
148 PFNVDIOCTXTRANSFER pfnIoCtxTransferNext;
149 /** Transfer direction */
150 VDIOCTXTXDIR enmTxDir;
151 /** Request type dependent data. */
152 union
153 {
154 /** I/O request (read/write). */
155 struct
156 {
157 /** Number of bytes left until this context completes. */
158 volatile uint32_t cbTransferLeft;
159 /** Current offset */
160 volatile uint64_t uOffset;
161 /** Number of bytes to transfer */
162 volatile size_t cbTransfer;
163 /** Current image in the chain. */
164 PVDIMAGE pImageCur;
165 /** Start image to read from. pImageCur is reset to this
166 * value after it reached the first image in the chain. */
167 PVDIMAGE pImageStart;
168 /** S/G buffer */
169 RTSGBUF SgBuf;
170 /** Number of bytes to clear in the buffer before the current read. */
171 size_t cbBufClear;
172 /** Number of images to read. */
173 unsigned cImagesRead;
174 /** Override for the parent image to start reading from. */
175 PVDIMAGE pImageParentOverride;
176 /** Original offset of the transfer - required for filtering read requests. */
177 uint64_t uOffsetXferOrig;
178 /** Original size of the transfer - required for fitlering read requests. */
179 size_t cbXferOrig;
180 } Io;
181 /** Discard requests. */
182 struct
183 {
184 /** Pointer to the range descriptor array. */
185 PCRTRANGE paRanges;
186 /** Number of ranges in the array. */
187 unsigned cRanges;
188 /** Range descriptor index which is processed. */
189 unsigned idxRange;
190 /** Start offset to discard currently. */
191 uint64_t offCur;
192 /** How many bytes left to discard in the current range. */
193 size_t cbDiscardLeft;
194 /** How many bytes to discard in the current block (<= cbDiscardLeft). */
195 size_t cbThisDiscard;
196 /** Discard block handled currently. */
197 PVDDISCARDBLOCK pBlock;
198 } Discard;
199 } Req;
200 /** Parent I/O context if any. Sets the type of the context (root/child) */
201 PVDIOCTX pIoCtxParent;
202 /** Type dependent data (root/child) */
203 union
204 {
205 /** Root data */
206 struct
207 {
208 /** Completion callback */
209 PFNVDASYNCTRANSFERCOMPLETE pfnComplete;
210 /** User argument 1 passed on completion. */
211 void *pvUser1;
212 /** User argument 2 passed on completion. */
213 void *pvUser2;
214 } Root;
215 /** Child data */
216 struct
217 {
218 /** Saved start offset */
219 uint64_t uOffsetSaved;
220 /** Saved transfer size */
221 size_t cbTransferLeftSaved;
222 /** Number of bytes transferred from the parent if this context completes. */
223 size_t cbTransferParent;
224 /** Number of bytes to pre read */
225 size_t cbPreRead;
226 /** Number of bytes to post read. */
227 size_t cbPostRead;
228 /** Number of bytes to write left in the parent. */
229 size_t cbWriteParent;
230 /** Write type dependent data. */
231 union
232 {
233 /** Optimized */
234 struct
235 {
236 /** Bytes to fill to satisfy the block size. Not part of the virtual disk. */
237 size_t cbFill;
238 /** Bytes to copy instead of reading from the parent */
239 size_t cbWriteCopy;
240 /** Bytes to read from the image. */
241 size_t cbReadImage;
242 } Optimized;
243 } Write;
244 } Child;
245 } Type;
246} VDIOCTX;
247
/** @name I/O context flags.
 * @{ */
/** Default flags of an I/O context, i.e. not blocked and asynchronous. */
#define VDIOCTX_FLAGS_DEFAULT                   (0)
/** The context is blocked. */
#define VDIOCTX_FLAGS_BLOCKED                   RT_BIT_32(0)
/** The I/O context uses synchronous I/O. */
#define VDIOCTX_FLAGS_SYNC                      RT_BIT_32(1)
/** The read should update the cache. */
#define VDIOCTX_FLAGS_READ_UPDATE_CACHE         RT_BIT_32(2)
/** Free blocks should be zeroed.
 * If not set and no image has data for the specified range,
 * VERR_VD_BLOCK_FREE is returned for the I/O context.
 * Note that unallocated blocks are still zeroed if at least one image
 * has valid data for a part of the range.
 */
#define VDIOCTX_FLAGS_ZERO_FREE_BLOCKS          RT_BIT_32(3)
/** Don't free the I/O context on completion because it was allocated
 * elsewhere (stack, ...). */
#define VDIOCTX_FLAGS_DONT_FREE                 RT_BIT_32(4)
/** Don't set the modified flag for this I/O context when writing. */
#define VDIOCTX_FLAGS_DONT_SET_MODIFIED_FLAG    RT_BIT_32(5)
/** The write filter was applied already and must not be applied a second time.
 * Used at the beginning of vdWriteHelperAsync() because it might be called
 * multiple times.
 */
#define VDIOCTX_FLAGS_WRITE_FILTER_APPLIED      RT_BIT_32(6)
/** @} */

/** NIL value for an I/O context pointer. */
#define NIL_VDIOCTX                             ((PVDIOCTX)0)
277
278/**
279 * List node for deferred I/O contexts.
280 */
281typedef struct VDIOCTXDEFERRED
282{
283 /** Node in the list of deferred requests.
284 * A request can be deferred if the image is growing
285 * and the request accesses the same range or if
286 * the backend needs to read or write metadata from the disk
287 * before it can continue. */
288 RTLISTNODE NodeDeferred;
289 /** I/O context this entry points to. */
290 PVDIOCTX pIoCtx;
291} VDIOCTXDEFERRED, *PVDIOCTXDEFERRED;
292
293/**
294 * I/O task.
295 */
296typedef struct VDIOTASK
297{
298 /** Next I/O task waiting in the list. */
299 struct VDIOTASK * volatile pNext;
300 /** Storage this task belongs to. */
301 PVDIOSTORAGE pIoStorage;
302 /** Optional completion callback. */
303 PFNVDXFERCOMPLETED pfnComplete;
304 /** Opaque user data. */
305 void *pvUser;
306 /** Completion status code for the task. */
307 int rcReq;
308 /** Flag whether this is a meta data transfer. */
309 bool fMeta;
310 /** Type dependent data. */
311 union
312 {
313 /** User data transfer. */
314 struct
315 {
316 /** Number of bytes this task transferred. */
317 uint32_t cbTransfer;
318 /** Pointer to the I/O context the task belongs. */
319 PVDIOCTX pIoCtx;
320 } User;
321 /** Meta data transfer. */
322 struct
323 {
324 /** Meta transfer this task is for. */
325 PVDMETAXFER pMetaXfer;
326 } Meta;
327 } Type;
328} VDIOTASK;
329
330/**
331 * Storage handle.
332 */
333typedef struct VDIOSTORAGE
334{
335 /** Image I/O state this storage handle belongs to. */
336 PVDIO pVDIo;
337 /** AVL tree for pending async metadata transfers. */
338 PAVLRFOFFTREE pTreeMetaXfers;
339 /** Storage handle */
340 void *pStorage;
341} VDIOSTORAGE;
342
343/**
344 * Metadata transfer.
345 *
346 * @note This entry can't be freed if either the list is not empty or
347 * the reference counter is not 0.
348 * The assumption is that the backends don't need to read huge amounts of
349 * metadata to complete a transfer so the additional memory overhead should
350 * be relatively small.
351 */
352typedef struct VDMETAXFER
353{
354 /** AVL core for fast search (the file offset is the key) */
355 AVLRFOFFNODECORE Core;
356 /** I/O storage for this transfer. */
357 PVDIOSTORAGE pIoStorage;
358 /** Flags. */
359 uint32_t fFlags;
360 /** List of I/O contexts waiting for this metadata transfer to complete. */
361 RTLISTNODE ListIoCtxWaiting;
362 /** Number of references to this entry. */
363 unsigned cRefs;
364 /** Size of the data stored with this entry. */
365 size_t cbMeta;
366 /** Shadow buffer which is used in case a write is still active and other
367 * writes update the shadow buffer. */
368 uint8_t *pbDataShw;
369 /** List of I/O contexts updating the shadow buffer while there is a write
370 * in progress. */
371 RTLISTNODE ListIoCtxShwWrites;
372 /** Data stored - variable size. */
373 uint8_t abData[1];
374} VDMETAXFER;
375
/** @name Transfer direction of a metadata transfer.
 *
 * The direction occupies the bits covered by VDMETAXFER_TXDIR_MASK of the
 * flags value.
 * @{ */
/** Mask selecting the transfer direction bits. */
#define VDMETAXFER_TXDIR_MASK               0x3
/** No transfer direction. */
#define VDMETAXFER_TXDIR_NONE               0x0
/** Write transfer. */
#define VDMETAXFER_TXDIR_WRITE              0x1
/** Read transfer. */
#define VDMETAXFER_TXDIR_READ               0x2
/** Flush transfer. */
#define VDMETAXFER_TXDIR_FLUSH              0x3
/** Extracts the transfer direction from @a flags. */
#define VDMETAXFER_TXDIR_GET(flags)         ((flags) & VDMETAXFER_TXDIR_MASK)
/** Replaces the transfer direction stored in @a flags (an lvalue) with @a dir,
 * leaving all other bits untouched. Every use of an argument is parenthesized
 * so that arbitrary lvalue expressions expand correctly. */
#define VDMETAXFER_TXDIR_SET(flags, dir)    ((flags) = ((flags) & ~VDMETAXFER_TXDIR_MASK) | (dir))
/** @} */
386
387/** Forward declaration of the async discard helper. */
388static DECLCALLBACK(int) vdDiscardHelperAsync(PVDIOCTX pIoCtx);
389static DECLCALLBACK(int) vdWriteHelperAsync(PVDIOCTX pIoCtx);
390static void vdDiskProcessBlockedIoCtx(PVDISK pDisk);
391static int vdDiskUnlock(PVDISK pDisk, PVDIOCTX pIoCtxRc);
392static DECLCALLBACK(void) vdIoCtxSyncComplete(void *pvUser1, void *pvUser2, int rcReq);
393
394/**
395 * internal: issue error message.
396 */
397static int vdError(PVDISK pDisk, int rc, RT_SRC_POS_DECL,
398 const char *pszFormat, ...)
399{
400 va_list va;
401 va_start(va, pszFormat);
402 if (pDisk->pInterfaceError)
403 pDisk->pInterfaceError->pfnError(pDisk->pInterfaceError->Core.pvUser, rc, RT_SRC_POS_ARGS, pszFormat, va);
404 va_end(va);
405 return rc;
406}
407
408/**
409 * internal: thread synchronization, start read.
410 */
411DECLINLINE(int) vdThreadStartRead(PVDISK pDisk)
412{
413 int rc = VINF_SUCCESS;
414 if (RT_UNLIKELY(pDisk->pInterfaceThreadSync))
415 rc = pDisk->pInterfaceThreadSync->pfnStartRead(pDisk->pInterfaceThreadSync->Core.pvUser);
416 return rc;
417}
418
419/**
420 * internal: thread synchronization, finish read.
421 */
422DECLINLINE(int) vdThreadFinishRead(PVDISK pDisk)
423{
424 int rc = VINF_SUCCESS;
425 if (RT_UNLIKELY(pDisk->pInterfaceThreadSync))
426 rc = pDisk->pInterfaceThreadSync->pfnFinishRead(pDisk->pInterfaceThreadSync->Core.pvUser);
427 return rc;
428}
429
430/**
431 * internal: thread synchronization, start write.
432 */
433DECLINLINE(int) vdThreadStartWrite(PVDISK pDisk)
434{
435 int rc = VINF_SUCCESS;
436 if (RT_UNLIKELY(pDisk->pInterfaceThreadSync))
437 rc = pDisk->pInterfaceThreadSync->pfnStartWrite(pDisk->pInterfaceThreadSync->Core.pvUser);
438 return rc;
439}
440
441/**
442 * internal: thread synchronization, finish write.
443 */
444DECLINLINE(int) vdThreadFinishWrite(PVDISK pDisk)
445{
446 int rc = VINF_SUCCESS;
447 if (RT_UNLIKELY(pDisk->pInterfaceThreadSync))
448 rc = pDisk->pInterfaceThreadSync->pfnFinishWrite(pDisk->pInterfaceThreadSync->Core.pvUser);
449 return rc;
450}
451
452/**
453 * internal: add image structure to the end of images list.
454 */
455static void vdAddImageToList(PVDISK pDisk, PVDIMAGE pImage)
456{
457 pImage->pPrev = NULL;
458 pImage->pNext = NULL;
459
460 if (pDisk->pBase)
461 {
462 Assert(pDisk->cImages > 0);
463 pImage->pPrev = pDisk->pLast;
464 pDisk->pLast->pNext = pImage;
465 pDisk->pLast = pImage;
466 }
467 else
468 {
469 Assert(pDisk->cImages == 0);
470 pDisk->pBase = pImage;
471 pDisk->pLast = pImage;
472 }
473
474 pDisk->cImages++;
475}
476
477/**
478 * internal: remove image structure from the images list.
479 */
480static void vdRemoveImageFromList(PVDISK pDisk, PVDIMAGE pImage)
481{
482 Assert(pDisk->cImages > 0);
483
484 if (pImage->pPrev)
485 pImage->pPrev->pNext = pImage->pNext;
486 else
487 pDisk->pBase = pImage->pNext;
488
489 if (pImage->pNext)
490 pImage->pNext->pPrev = pImage->pPrev;
491 else
492 pDisk->pLast = pImage->pPrev;
493
494 pImage->pPrev = NULL;
495 pImage->pNext = NULL;
496
497 pDisk->cImages--;
498}
499
500/**
501 * Release a referene to the filter decrementing the counter and destroying the filter
502 * when the counter reaches zero.
503 *
504 * @returns The new reference count.
505 * @param pFilter The filter to release.
506 */
507static uint32_t vdFilterRelease(PVDFILTER pFilter)
508{
509 uint32_t cRefs = ASMAtomicDecU32(&pFilter->cRefs);
510 if (!cRefs)
511 {
512 pFilter->pBackend->pfnDestroy(pFilter->pvBackendData);
513 RTMemFree(pFilter);
514 }
515
516 return cRefs;
517}
518
519/**
520 * Increments the reference counter of the given filter.
521 *
522 * @return The new reference count.
523 * @param pFilter The filter.
524 */
525static uint32_t vdFilterRetain(PVDFILTER pFilter)
526{
527 return ASMAtomicIncU32(&pFilter->cRefs);
528}
529
530/**
531 * internal: find image by index into the images list.
532 */
533static PVDIMAGE vdGetImageByNumber(PVDISK pDisk, unsigned nImage)
534{
535 PVDIMAGE pImage = pDisk->pBase;
536 if (nImage == VD_LAST_IMAGE)
537 return pDisk->pLast;
538 while (pImage && nImage)
539 {
540 pImage = pImage->pNext;
541 nImage--;
542 }
543 return pImage;
544}
545
546/**
547 * Creates a new region list from the given one converting to match the flags if necessary.
548 *
549 * @returns VBox status code.
550 * @param pRegionList The region list to convert from.
551 * @param fFlags The flags for the new region list.
552 * @param ppRegionList Where to store the new region list on success.
553 */
554static int vdRegionListConv(PCVDREGIONLIST pRegionList, uint32_t fFlags, PPVDREGIONLIST ppRegionList)
555{
556 int rc = VINF_SUCCESS;
557 PVDREGIONLIST pRegionListNew = (PVDREGIONLIST)RTMemDup(pRegionList,
558 RT_UOFFSETOF_DYN(VDREGIONLIST, aRegions[pRegionList->cRegions]));
559 if (RT_LIKELY(pRegionListNew))
560 {
561 /* Do we have to convert anything? */
562 if (pRegionList->fFlags != fFlags)
563 {
564 uint64_t offRegionNext = 0;
565
566 pRegionListNew->fFlags = fFlags;
567 for (unsigned i = 0; i < pRegionListNew->cRegions; i++)
568 {
569 PVDREGIONDESC pRegion = &pRegionListNew->aRegions[i];
570
571 if ( (fFlags & VD_REGION_LIST_F_LOC_SIZE_BLOCKS)
572 && !(pRegionList->fFlags & VD_REGION_LIST_F_LOC_SIZE_BLOCKS))
573 {
574 Assert(!(pRegion->cRegionBlocksOrBytes % pRegion->cbBlock));
575
576 /* Convert from bytes to logical blocks. */
577 pRegion->offRegion = offRegionNext;
578 pRegion->cRegionBlocksOrBytes = pRegion->cRegionBlocksOrBytes / pRegion->cbBlock;
579 offRegionNext += pRegion->cRegionBlocksOrBytes;
580 }
581 else
582 {
583 /* Convert from logical blocks to bytes. */
584 pRegion->offRegion = offRegionNext;
585 pRegion->cRegionBlocksOrBytes = pRegion->cRegionBlocksOrBytes * pRegion->cbBlock;
586 offRegionNext += pRegion->cRegionBlocksOrBytes;
587 }
588 }
589 }
590
591 *ppRegionList = pRegionListNew;
592 }
593 else
594 rc = VERR_NO_MEMORY;
595
596 return rc;
597}
598
599/**
600 * Returns the virtual size of the image in bytes.
601 *
602 * @returns Size of the given image in bytes.
603 * @param pImage The image to get the size from.
604 */
605static uint64_t vdImageGetSize(PVDIMAGE pImage)
606{
607 uint64_t cbImage = 0;
608
609 if (pImage->cbImage == VD_IMAGE_SIZE_UNINITIALIZED)
610 {
611 PCVDREGIONLIST pRegionList = NULL;
612 int rc = pImage->Backend->pfnQueryRegions(pImage->pBackendData, &pRegionList);
613 if (RT_SUCCESS(rc))
614 {
615 if (pRegionList->fFlags & VD_REGION_LIST_F_LOC_SIZE_BLOCKS)
616 {
617 PVDREGIONLIST pRegionListConv = NULL;
618 rc = vdRegionListConv(pRegionList, 0, &pRegionListConv);
619 if (RT_SUCCESS(rc))
620 {
621 for (uint32_t i = 0; i < pRegionListConv->cRegions; i++)
622 cbImage += pRegionListConv->aRegions[i].cRegionBlocksOrBytes;
623
624 VDRegionListFree(pRegionListConv);
625 }
626 }
627 else
628 for (uint32_t i = 0; i < pRegionList->cRegions; i++)
629 cbImage += pRegionList->aRegions[i].cRegionBlocksOrBytes;
630
631 AssertPtr(pImage->Backend->pfnRegionListRelease);
632 pImage->Backend->pfnRegionListRelease(pImage->pBackendData, pRegionList);
633 pImage->cbImage = cbImage; /* Cache the value. */
634 }
635 }
636 else
637 cbImage = pImage->cbImage;
638
639 return cbImage;
640}
641
642/**
643 * Applies the filter chain to the given write request.
644 *
645 * @returns VBox status code.
646 * @param pDisk The HDD container.
647 * @param uOffset The start offset of the write.
648 * @param cbWrite Number of bytes to write.
649 * @param pIoCtx The I/O context associated with the request.
650 */
651static int vdFilterChainApplyWrite(PVDISK pDisk, uint64_t uOffset, size_t cbWrite,
652 PVDIOCTX pIoCtx)
653{
654 int rc = VINF_SUCCESS;
655
656 VD_IS_LOCKED(pDisk);
657
658 PVDFILTER pFilter;
659 RTListForEach(&pDisk->ListFilterChainWrite, pFilter, VDFILTER, ListNodeChainWrite)
660 {
661 rc = pFilter->pBackend->pfnFilterWrite(pFilter->pvBackendData, uOffset, cbWrite, pIoCtx);
662 if (RT_FAILURE(rc))
663 break;
664 /* Reset S/G buffer for the next filter. */
665 RTSgBufReset(&pIoCtx->Req.Io.SgBuf);
666 }
667
668 return rc;
669}
670
671/**
672 * Applies the filter chain to the given read request.
673 *
674 * @returns VBox status code.
675 * @param pDisk The HDD container.
676 * @param uOffset The start offset of the read.
677 * @param cbRead Number of bytes read.
678 * @param pIoCtx The I/O context associated with the request.
679 */
680static int vdFilterChainApplyRead(PVDISK pDisk, uint64_t uOffset, size_t cbRead,
681 PVDIOCTX pIoCtx)
682{
683 int rc = VINF_SUCCESS;
684
685 VD_IS_LOCKED(pDisk);
686
687 /* Reset buffer before starting. */
688 RTSgBufReset(&pIoCtx->Req.Io.SgBuf);
689
690 PVDFILTER pFilter;
691 RTListForEach(&pDisk->ListFilterChainRead, pFilter, VDFILTER, ListNodeChainRead)
692 {
693 rc = pFilter->pBackend->pfnFilterRead(pFilter->pvBackendData, uOffset, cbRead, pIoCtx);
694 if (RT_FAILURE(rc))
695 break;
696 /* Reset S/G buffer for the next filter. */
697 RTSgBufReset(&pIoCtx->Req.Io.SgBuf);
698 }
699
700 return rc;
701}
702
703DECLINLINE(void) vdIoCtxRootComplete(PVDISK pDisk, PVDIOCTX pIoCtx)
704{
705 if ( RT_SUCCESS(pIoCtx->rcReq)
706 && pIoCtx->enmTxDir == VDIOCTXTXDIR_READ)
707 pIoCtx->rcReq = vdFilterChainApplyRead(pDisk, pIoCtx->Req.Io.uOffsetXferOrig,
708 pIoCtx->Req.Io.cbXferOrig, pIoCtx);
709
710 pIoCtx->Type.Root.pfnComplete(pIoCtx->Type.Root.pvUser1,
711 pIoCtx->Type.Root.pvUser2,
712 pIoCtx->rcReq);
713}
714
715/**
716 * Initialize the structure members of a given I/O context.
717 */
718DECLINLINE(void) vdIoCtxInit(PVDIOCTX pIoCtx, PVDISK pDisk, VDIOCTXTXDIR enmTxDir,
719 uint64_t uOffset, size_t cbTransfer, PVDIMAGE pImageStart,
720 PCRTSGBUF pSgBuf, void *pvAllocation,
721 PFNVDIOCTXTRANSFER pfnIoCtxTransfer, uint32_t fFlags)
722{
723 pIoCtx->pDisk = pDisk;
724 pIoCtx->enmTxDir = enmTxDir;
725 pIoCtx->Req.Io.cbTransferLeft = (uint32_t)cbTransfer; Assert((uint32_t)cbTransfer == cbTransfer);
726 pIoCtx->Req.Io.uOffset = uOffset;
727 pIoCtx->Req.Io.cbTransfer = cbTransfer;
728 pIoCtx->Req.Io.pImageStart = pImageStart;
729 pIoCtx->Req.Io.pImageCur = pImageStart;
730 pIoCtx->Req.Io.cbBufClear = 0;
731 pIoCtx->Req.Io.pImageParentOverride = NULL;
732 pIoCtx->Req.Io.uOffsetXferOrig = uOffset;
733 pIoCtx->Req.Io.cbXferOrig = cbTransfer;
734 pIoCtx->cDataTransfersPending = 0;
735 pIoCtx->cMetaTransfersPending = 0;
736 pIoCtx->fComplete = false;
737 pIoCtx->fFlags = fFlags;
738 pIoCtx->pvAllocation = pvAllocation;
739 pIoCtx->pfnIoCtxTransfer = pfnIoCtxTransfer;
740 pIoCtx->pfnIoCtxTransferNext = NULL;
741 pIoCtx->rcReq = VINF_SUCCESS;
742 pIoCtx->pIoCtxParent = NULL;
743
744 /* There is no S/G list for a flush request. */
745 if ( enmTxDir != VDIOCTXTXDIR_FLUSH
746 && enmTxDir != VDIOCTXTXDIR_DISCARD)
747 RTSgBufClone(&pIoCtx->Req.Io.SgBuf, pSgBuf);
748 else
749 memset(&pIoCtx->Req.Io.SgBuf, 0, sizeof(RTSGBUF));
750}
751
752/**
753 * Internal: Tries to read the desired range from the given cache.
754 *
755 * @returns VBox status code.
756 * @retval VERR_VD_BLOCK_FREE if the block is not in the cache.
757 * pcbRead will be set to the number of bytes not in the cache.
758 * Everything thereafter might be in the cache.
759 * @param pCache The cache to read from.
760 * @param uOffset Offset of the virtual disk to read.
761 * @param cbRead How much to read.
762 * @param pIoCtx The I/O context to read into.
763 * @param pcbRead Where to store the number of bytes actually read.
764 * On success this indicates the number of bytes read from the cache.
765 * If VERR_VD_BLOCK_FREE is returned this gives the number of bytes
766 * which are not in the cache.
767 * In both cases everything beyond this value
768 * might or might not be in the cache.
769 */
770static int vdCacheReadHelper(PVDCACHE pCache, uint64_t uOffset,
771 size_t cbRead, PVDIOCTX pIoCtx, size_t *pcbRead)
772{
773 int rc = VINF_SUCCESS;
774
775 LogFlowFunc(("pCache=%#p uOffset=%llu pIoCtx=%p cbRead=%zu pcbRead=%#p\n",
776 pCache, uOffset, pIoCtx, cbRead, pcbRead));
777
778 AssertPtr(pCache);
779 AssertPtr(pcbRead);
780
781 rc = pCache->Backend->pfnRead(pCache->pBackendData, uOffset, cbRead,
782 pIoCtx, pcbRead);
783
784 LogFlowFunc(("returns rc=%Rrc pcbRead=%zu\n", rc, *pcbRead));
785 return rc;
786}
787
788/**
789 * Internal: Writes data for the given block into the cache.
790 *
791 * @returns VBox status code.
792 * @param pCache The cache to write to.
793 * @param uOffset Offset of the virtual disk to write to the cache.
794 * @param cbWrite How much to write.
795 * @param pIoCtx The I/O context to write from.
796 * @param pcbWritten How much data could be written, optional.
797 */
798static int vdCacheWriteHelper(PVDCACHE pCache, uint64_t uOffset, size_t cbWrite,
799 PVDIOCTX pIoCtx, size_t *pcbWritten)
800{
801 int rc = VINF_SUCCESS;
802
803 LogFlowFunc(("pCache=%#p uOffset=%llu pIoCtx=%p cbWrite=%zu pcbWritten=%#p\n",
804 pCache, uOffset, pIoCtx, cbWrite, pcbWritten));
805
806 AssertPtr(pCache);
807 AssertPtr(pIoCtx);
808 Assert(cbWrite > 0);
809
810 if (pcbWritten)
811 rc = pCache->Backend->pfnWrite(pCache->pBackendData, uOffset, cbWrite,
812 pIoCtx, pcbWritten);
813 else
814 {
815 size_t cbWritten = 0;
816
817 do
818 {
819 rc = pCache->Backend->pfnWrite(pCache->pBackendData, uOffset, cbWrite,
820 pIoCtx, &cbWritten);
821 uOffset += cbWritten;
822 cbWrite -= cbWritten;
823 } while ( cbWrite
824 && ( RT_SUCCESS(rc)
825 || rc == VERR_VD_ASYNC_IO_IN_PROGRESS));
826 }
827
828 LogFlowFunc(("returns rc=%Rrc pcbWritten=%zu\n",
829 rc, pcbWritten ? *pcbWritten : cbWrite));
830 return rc;
831}
832
833/**
834 * Creates a new empty discard state.
835 *
836 * @returns Pointer to the new discard state or NULL if out of memory.
837 */
838static PVDDISCARDSTATE vdDiscardStateCreate(void)
839{
840 PVDDISCARDSTATE pDiscard = (PVDDISCARDSTATE)RTMemAllocZ(sizeof(VDDISCARDSTATE));
841
842 if (pDiscard)
843 {
844 RTListInit(&pDiscard->ListLru);
845 pDiscard->pTreeBlocks = (PAVLRU64TREE)RTMemAllocZ(sizeof(AVLRU64TREE));
846 if (!pDiscard->pTreeBlocks)
847 {
848 RTMemFree(pDiscard);
849 pDiscard = NULL;
850 }
851 }
852
853 return pDiscard;
854}
855
856/**
857 * Removes the least recently used blocks from the waiting list until
858 * the new value is reached.
859 *
860 * @returns VBox status code.
861 * @param pDisk VD disk container.
862 * @param pDiscard The discard state.
863 * @param cbDiscardingNew How many bytes should be waiting on success.
864 * The number of bytes waiting can be less.
865 */
866static int vdDiscardRemoveBlocks(PVDISK pDisk, PVDDISCARDSTATE pDiscard, size_t cbDiscardingNew)
867{
868 int rc = VINF_SUCCESS;
869
870 LogFlowFunc(("pDisk=%#p pDiscard=%#p cbDiscardingNew=%zu\n",
871 pDisk, pDiscard, cbDiscardingNew));
872
873 while (pDiscard->cbDiscarding > cbDiscardingNew)
874 {
875 PVDDISCARDBLOCK pBlock = RTListGetLast(&pDiscard->ListLru, VDDISCARDBLOCK, NodeLru);
876
877 Assert(!RTListIsEmpty(&pDiscard->ListLru));
878
879 /* Go over the allocation bitmap and mark all discarded sectors as unused. */
880 uint64_t offStart = pBlock->Core.Key;
881 uint32_t idxStart = 0;
882 size_t cbLeft = pBlock->cbDiscard;
883 bool fAllocated = ASMBitTest(pBlock->pbmAllocated, idxStart);
884 uint32_t cSectors = (uint32_t)(pBlock->cbDiscard / 512);
885
886 while (cbLeft > 0)
887 {
888 int32_t idxEnd;
889 size_t cbThis = cbLeft;
890
891 if (fAllocated)
892 {
893 /* Check for the first unallocated bit. */
894 idxEnd = ASMBitNextClear(pBlock->pbmAllocated, cSectors, idxStart);
895 if (idxEnd != -1)
896 {
897 cbThis = (idxEnd - idxStart) * 512;
898 fAllocated = false;
899 }
900 }
901 else
902 {
903 /* Mark as unused and check for the first set bit. */
904 idxEnd = ASMBitNextSet(pBlock->pbmAllocated, cSectors, idxStart);
905 if (idxEnd != -1)
906 cbThis = (idxEnd - idxStart) * 512;
907
908
909 VDIOCTX IoCtx;
910 vdIoCtxInit(&IoCtx, pDisk, VDIOCTXTXDIR_DISCARD, 0, 0, NULL,
911 NULL, NULL, NULL, VDIOCTX_FLAGS_SYNC);
912 rc = pDisk->pLast->Backend->pfnDiscard(pDisk->pLast->pBackendData,
913 &IoCtx, offStart, cbThis, NULL,
914 NULL, &cbThis, NULL,
915 VD_DISCARD_MARK_UNUSED);
916 if (RT_FAILURE(rc))
917 break;
918
919 fAllocated = true;
920 }
921
922 idxStart = idxEnd;
923 offStart += cbThis;
924 cbLeft -= cbThis;
925 }
926
927 if (RT_FAILURE(rc))
928 break;
929
930 PVDDISCARDBLOCK pBlockRemove = (PVDDISCARDBLOCK)RTAvlrU64RangeRemove(pDiscard->pTreeBlocks, pBlock->Core.Key);
931 Assert(pBlockRemove == pBlock); NOREF(pBlockRemove);
932 RTListNodeRemove(&pBlock->NodeLru);
933
934 pDiscard->cbDiscarding -= pBlock->cbDiscard;
935 RTMemFree(pBlock->pbmAllocated);
936 RTMemFree(pBlock);
937 }
938
939 Assert(RT_FAILURE(rc) || pDiscard->cbDiscarding <= cbDiscardingNew);
940
941 LogFlowFunc(("returns rc=%Rrc\n", rc));
942 return rc;
943}
944
945/**
946 * Destroys the current discard state, writing any waiting blocks to the image.
947 *
948 * @returns VBox status code.
949 * @param pDisk VD disk container.
950 */
951static int vdDiscardStateDestroy(PVDISK pDisk)
952{
953 int rc = VINF_SUCCESS;
954
955 if (pDisk->pDiscard)
956 {
957 rc = vdDiscardRemoveBlocks(pDisk, pDisk->pDiscard, 0 /* Remove all blocks. */);
958 AssertRC(rc);
959 RTMemFree(pDisk->pDiscard->pTreeBlocks);
960 RTMemFree(pDisk->pDiscard);
961 pDisk->pDiscard = NULL;
962 }
963
964 return rc;
965}
966
967/**
968 * Marks the given range as allocated in the image.
969 * Required if there are discards in progress and a write to a block which can get discarded
970 * is written to.
971 *
972 * @returns VBox status code.
973 * @param pDisk VD container data.
974 * @param uOffset First byte to mark as allocated.
975 * @param cbRange Number of bytes to mark as allocated.
976 */
977static int vdDiscardSetRangeAllocated(PVDISK pDisk, uint64_t uOffset, size_t cbRange)
978{
979 PVDDISCARDSTATE pDiscard = pDisk->pDiscard;
980 int rc = VINF_SUCCESS;
981
982 if (pDiscard)
983 {
984 do
985 {
986 size_t cbThisRange = cbRange;
987 PVDDISCARDBLOCK pBlock = (PVDDISCARDBLOCK)RTAvlrU64RangeGet(pDiscard->pTreeBlocks, uOffset);
988
989 if (pBlock)
990 {
991 int32_t idxStart, idxEnd;
992
993 Assert(!(cbThisRange % 512));
994 Assert(!((uOffset - pBlock->Core.Key) % 512));
995
996 cbThisRange = RT_MIN(cbThisRange, pBlock->Core.KeyLast - uOffset + 1);
997
998 idxStart = (uOffset - pBlock->Core.Key) / 512;
999 idxEnd = idxStart + (int32_t)(cbThisRange / 512);
1000 ASMBitSetRange(pBlock->pbmAllocated, idxStart, idxEnd);
1001 }
1002 else
1003 {
1004 pBlock = (PVDDISCARDBLOCK)RTAvlrU64GetBestFit(pDiscard->pTreeBlocks, uOffset, true);
1005 if (pBlock)
1006 cbThisRange = RT_MIN(cbThisRange, pBlock->Core.Key - uOffset);
1007 }
1008
1009 Assert(cbRange >= cbThisRange);
1010
1011 uOffset += cbThisRange;
1012 cbRange -= cbThisRange;
1013 } while (cbRange != 0);
1014 }
1015
1016 return rc;
1017}
1018
1019DECLINLINE(PVDIOCTX) vdIoCtxAlloc(PVDISK pDisk, VDIOCTXTXDIR enmTxDir,
1020 uint64_t uOffset, size_t cbTransfer,
1021 PVDIMAGE pImageStart,PCRTSGBUF pSgBuf,
1022 void *pvAllocation, PFNVDIOCTXTRANSFER pfnIoCtxTransfer,
1023 uint32_t fFlags)
1024{
1025 PVDIOCTX pIoCtx = NULL;
1026
1027 pIoCtx = (PVDIOCTX)RTMemCacheAlloc(pDisk->hMemCacheIoCtx);
1028 if (RT_LIKELY(pIoCtx))
1029 {
1030 vdIoCtxInit(pIoCtx, pDisk, enmTxDir, uOffset, cbTransfer, pImageStart,
1031 pSgBuf, pvAllocation, pfnIoCtxTransfer, fFlags);
1032 }
1033
1034 return pIoCtx;
1035}
1036
1037DECLINLINE(PVDIOCTX) vdIoCtxRootAlloc(PVDISK pDisk, VDIOCTXTXDIR enmTxDir,
1038 uint64_t uOffset, size_t cbTransfer,
1039 PVDIMAGE pImageStart, PCRTSGBUF pSgBuf,
1040 PFNVDASYNCTRANSFERCOMPLETE pfnComplete,
1041 void *pvUser1, void *pvUser2,
1042 void *pvAllocation,
1043 PFNVDIOCTXTRANSFER pfnIoCtxTransfer,
1044 uint32_t fFlags)
1045{
1046 PVDIOCTX pIoCtx = vdIoCtxAlloc(pDisk, enmTxDir, uOffset, cbTransfer, pImageStart,
1047 pSgBuf, pvAllocation, pfnIoCtxTransfer, fFlags);
1048
1049 if (RT_LIKELY(pIoCtx))
1050 {
1051 pIoCtx->pIoCtxParent = NULL;
1052 pIoCtx->Type.Root.pfnComplete = pfnComplete;
1053 pIoCtx->Type.Root.pvUser1 = pvUser1;
1054 pIoCtx->Type.Root.pvUser2 = pvUser2;
1055 }
1056
1057 LogFlow(("Allocated root I/O context %#p\n", pIoCtx));
1058 return pIoCtx;
1059}
1060
1061DECLINLINE(void) vdIoCtxDiscardInit(PVDIOCTX pIoCtx, PVDISK pDisk, PCRTRANGE paRanges,
1062 unsigned cRanges, PFNVDASYNCTRANSFERCOMPLETE pfnComplete,
1063 void *pvUser1, void *pvUser2, void *pvAllocation,
1064 PFNVDIOCTXTRANSFER pfnIoCtxTransfer, uint32_t fFlags)
1065{
1066 pIoCtx->pIoCtxNext = NULL;
1067 pIoCtx->pDisk = pDisk;
1068 pIoCtx->enmTxDir = VDIOCTXTXDIR_DISCARD;
1069 pIoCtx->cDataTransfersPending = 0;
1070 pIoCtx->cMetaTransfersPending = 0;
1071 pIoCtx->fComplete = false;
1072 pIoCtx->fFlags = fFlags;
1073 pIoCtx->pvAllocation = pvAllocation;
1074 pIoCtx->pfnIoCtxTransfer = pfnIoCtxTransfer;
1075 pIoCtx->pfnIoCtxTransferNext = NULL;
1076 pIoCtx->rcReq = VINF_SUCCESS;
1077 pIoCtx->Req.Discard.paRanges = paRanges;
1078 pIoCtx->Req.Discard.cRanges = cRanges;
1079 pIoCtx->Req.Discard.idxRange = 0;
1080 pIoCtx->Req.Discard.cbDiscardLeft = 0;
1081 pIoCtx->Req.Discard.offCur = 0;
1082 pIoCtx->Req.Discard.cbThisDiscard = 0;
1083
1084 pIoCtx->pIoCtxParent = NULL;
1085 pIoCtx->Type.Root.pfnComplete = pfnComplete;
1086 pIoCtx->Type.Root.pvUser1 = pvUser1;
1087 pIoCtx->Type.Root.pvUser2 = pvUser2;
1088}
1089
1090DECLINLINE(PVDIOCTX) vdIoCtxDiscardAlloc(PVDISK pDisk, PCRTRANGE paRanges,
1091 unsigned cRanges,
1092 PFNVDASYNCTRANSFERCOMPLETE pfnComplete,
1093 void *pvUser1, void *pvUser2,
1094 void *pvAllocation,
1095 PFNVDIOCTXTRANSFER pfnIoCtxTransfer,
1096 uint32_t fFlags)
1097{
1098 PVDIOCTX pIoCtx = NULL;
1099
1100 pIoCtx = (PVDIOCTX)RTMemCacheAlloc(pDisk->hMemCacheIoCtx);
1101 if (RT_LIKELY(pIoCtx))
1102 {
1103 vdIoCtxDiscardInit(pIoCtx, pDisk, paRanges, cRanges, pfnComplete, pvUser1,
1104 pvUser2, pvAllocation, pfnIoCtxTransfer, fFlags);
1105 }
1106
1107 LogFlow(("Allocated discard I/O context %#p\n", pIoCtx));
1108 return pIoCtx;
1109}
1110
1111DECLINLINE(PVDIOCTX) vdIoCtxChildAlloc(PVDISK pDisk, VDIOCTXTXDIR enmTxDir,
1112 uint64_t uOffset, size_t cbTransfer,
1113 PVDIMAGE pImageStart, PCRTSGBUF pSgBuf,
1114 PVDIOCTX pIoCtxParent, size_t cbTransferParent,
1115 size_t cbWriteParent, void *pvAllocation,
1116 PFNVDIOCTXTRANSFER pfnIoCtxTransfer)
1117{
1118 PVDIOCTX pIoCtx = vdIoCtxAlloc(pDisk, enmTxDir, uOffset, cbTransfer, pImageStart,
1119 pSgBuf, pvAllocation, pfnIoCtxTransfer, pIoCtxParent->fFlags & ~VDIOCTX_FLAGS_DONT_FREE);
1120
1121 AssertPtr(pIoCtxParent);
1122 Assert(!pIoCtxParent->pIoCtxParent);
1123
1124 if (RT_LIKELY(pIoCtx))
1125 {
1126 pIoCtx->pIoCtxParent = pIoCtxParent;
1127 pIoCtx->Type.Child.uOffsetSaved = uOffset;
1128 pIoCtx->Type.Child.cbTransferLeftSaved = cbTransfer;
1129 pIoCtx->Type.Child.cbTransferParent = cbTransferParent;
1130 pIoCtx->Type.Child.cbWriteParent = cbWriteParent;
1131 }
1132
1133 LogFlow(("Allocated child I/O context %#p\n", pIoCtx));
1134 return pIoCtx;
1135}
1136
1137DECLINLINE(PVDIOTASK) vdIoTaskUserAlloc(PVDIOSTORAGE pIoStorage, PFNVDXFERCOMPLETED pfnComplete, void *pvUser, PVDIOCTX pIoCtx, uint32_t cbTransfer)
1138{
1139 PVDIOTASK pIoTask = NULL;
1140
1141 pIoTask = (PVDIOTASK)RTMemCacheAlloc(pIoStorage->pVDIo->pDisk->hMemCacheIoTask);
1142 if (pIoTask)
1143 {
1144 pIoTask->pIoStorage = pIoStorage;
1145 pIoTask->pfnComplete = pfnComplete;
1146 pIoTask->pvUser = pvUser;
1147 pIoTask->fMeta = false;
1148 pIoTask->Type.User.cbTransfer = cbTransfer;
1149 pIoTask->Type.User.pIoCtx = pIoCtx;
1150 }
1151
1152 return pIoTask;
1153}
1154
1155DECLINLINE(PVDIOTASK) vdIoTaskMetaAlloc(PVDIOSTORAGE pIoStorage, PFNVDXFERCOMPLETED pfnComplete, void *pvUser, PVDMETAXFER pMetaXfer)
1156{
1157 PVDIOTASK pIoTask = NULL;
1158
1159 pIoTask = (PVDIOTASK)RTMemCacheAlloc(pIoStorage->pVDIo->pDisk->hMemCacheIoTask);
1160 if (pIoTask)
1161 {
1162 pIoTask->pIoStorage = pIoStorage;
1163 pIoTask->pfnComplete = pfnComplete;
1164 pIoTask->pvUser = pvUser;
1165 pIoTask->fMeta = true;
1166 pIoTask->Type.Meta.pMetaXfer = pMetaXfer;
1167 }
1168
1169 return pIoTask;
1170}
1171
1172DECLINLINE(void) vdIoCtxFree(PVDISK pDisk, PVDIOCTX pIoCtx)
1173{
1174 Log(("Freeing I/O context %#p\n", pIoCtx));
1175
1176 if (!(pIoCtx->fFlags & VDIOCTX_FLAGS_DONT_FREE))
1177 {
1178 if (pIoCtx->pvAllocation)
1179 RTMemFree(pIoCtx->pvAllocation);
1180#ifdef DEBUG
1181 memset(&pIoCtx->pDisk, 0xff, sizeof(void *));
1182#endif
1183 RTMemCacheFree(pDisk->hMemCacheIoCtx, pIoCtx);
1184 }
1185}
1186
1187DECLINLINE(void) vdIoTaskFree(PVDISK pDisk, PVDIOTASK pIoTask)
1188{
1189#ifdef DEBUG
1190 memset(pIoTask, 0xff, sizeof(VDIOTASK));
1191#endif
1192 RTMemCacheFree(pDisk->hMemCacheIoTask, pIoTask);
1193}
1194
1195DECLINLINE(void) vdIoCtxChildReset(PVDIOCTX pIoCtx)
1196{
1197 AssertPtr(pIoCtx->pIoCtxParent);
1198
1199 RTSgBufReset(&pIoCtx->Req.Io.SgBuf);
1200 pIoCtx->Req.Io.uOffset = pIoCtx->Type.Child.uOffsetSaved;
1201 pIoCtx->Req.Io.cbTransferLeft = (uint32_t)pIoCtx->Type.Child.cbTransferLeftSaved;
1202 Assert((uint32_t)pIoCtx->Type.Child.cbTransferLeftSaved == pIoCtx->Type.Child.cbTransferLeftSaved);
1203}
1204
1205DECLINLINE(PVDMETAXFER) vdMetaXferAlloc(PVDIOSTORAGE pIoStorage, uint64_t uOffset, size_t cb)
1206{
1207 PVDMETAXFER pMetaXfer = (PVDMETAXFER)RTMemAlloc(RT_UOFFSETOF_DYN(VDMETAXFER, abData[cb]));
1208
1209 if (RT_LIKELY(pMetaXfer))
1210 {
1211 pMetaXfer->Core.Key = uOffset;
1212 pMetaXfer->Core.KeyLast = uOffset + cb - 1;
1213 pMetaXfer->fFlags = VDMETAXFER_TXDIR_NONE;
1214 pMetaXfer->cbMeta = cb;
1215 pMetaXfer->pIoStorage = pIoStorage;
1216 pMetaXfer->cRefs = 0;
1217 pMetaXfer->pbDataShw = NULL;
1218 RTListInit(&pMetaXfer->ListIoCtxWaiting);
1219 RTListInit(&pMetaXfer->ListIoCtxShwWrites);
1220 }
1221 return pMetaXfer;
1222}
1223
1224DECLINLINE(void) vdIoCtxAddToWaitingList(volatile PVDIOCTX *ppList, PVDIOCTX pIoCtx)
1225{
1226 /* Put it on the waiting list. */
1227 PVDIOCTX pNext = ASMAtomicUoReadPtrT(ppList, PVDIOCTX);
1228 PVDIOCTX pHeadOld;
1229 pIoCtx->pIoCtxNext = pNext;
1230 while (!ASMAtomicCmpXchgExPtr(ppList, pIoCtx, pNext, &pHeadOld))
1231 {
1232 pNext = pHeadOld;
1233 Assert(pNext != pIoCtx);
1234 pIoCtx->pIoCtxNext = pNext;
1235 ASMNopPause();
1236 }
1237}
1238
1239DECLINLINE(void) vdIoCtxDefer(PVDISK pDisk, PVDIOCTX pIoCtx)
1240{
1241 LogFlowFunc(("Deferring I/O context pIoCtx=%#p\n", pIoCtx));
1242
1243 Assert(!pIoCtx->pIoCtxParent && !(pIoCtx->fFlags & VDIOCTX_FLAGS_BLOCKED));
1244 pIoCtx->fFlags |= VDIOCTX_FLAGS_BLOCKED;
1245 vdIoCtxAddToWaitingList(&pDisk->pIoCtxBlockedHead, pIoCtx);
1246}
1247
1248static size_t vdIoCtxCopy(PVDIOCTX pIoCtxDst, PVDIOCTX pIoCtxSrc, size_t cbData)
1249{
1250 return RTSgBufCopy(&pIoCtxDst->Req.Io.SgBuf, &pIoCtxSrc->Req.Io.SgBuf, cbData);
1251}
1252
#if 0 /* unused */
/**
 * Compares the S/G buffers of two I/O contexts using RTSgBufCmp().
 * Currently compiled out.
 *
 * @returns The value returned by RTSgBufCmp().
 * @param   pIoCtx1     The first I/O context.
 * @param   pIoCtx2     The second I/O context.
 * @param   cbData      Number of bytes to compare.
 */
static int vdIoCtxCmp(PVDIOCTX pIoCtx1, PVDIOCTX pIoCtx2, size_t cbData)
{
    PRTSGBUF pSgBuf1 = &pIoCtx1->Req.Io.SgBuf;
    PRTSGBUF pSgBuf2 = &pIoCtx2->Req.Io.SgBuf;

    return RTSgBufCmp(pSgBuf1, pSgBuf2, cbData);
}
#endif
1259
1260static size_t vdIoCtxCopyTo(PVDIOCTX pIoCtx, const uint8_t *pbData, size_t cbData)
1261{
1262 return RTSgBufCopyFromBuf(&pIoCtx->Req.Io.SgBuf, pbData, cbData);
1263}
1264
1265static size_t vdIoCtxCopyFrom(PVDIOCTX pIoCtx, uint8_t *pbData, size_t cbData)
1266{
1267 return RTSgBufCopyToBuf(&pIoCtx->Req.Io.SgBuf, pbData, cbData);
1268}
1269
1270static size_t vdIoCtxSet(PVDIOCTX pIoCtx, uint8_t ch, size_t cbData)
1271{
1272 return RTSgBufSet(&pIoCtx->Req.Io.SgBuf, ch, cbData);
1273}
1274
1275/**
1276 * Returns whether the given I/O context has completed.
1277 *
1278 * @returns Flag whether the I/O context is complete.
1279 * @param pIoCtx The I/O context to check.
1280 */
1281DECLINLINE(bool) vdIoCtxIsComplete(PVDIOCTX pIoCtx)
1282{
1283 if ( !pIoCtx->cMetaTransfersPending
1284 && !pIoCtx->cDataTransfersPending
1285 && !pIoCtx->pfnIoCtxTransfer)
1286 return true;
1287
1288 /*
1289 * We complete the I/O context in case of an error
1290 * if there is no I/O task pending.
1291 */
1292 if ( RT_FAILURE(pIoCtx->rcReq)
1293 && !pIoCtx->cMetaTransfersPending
1294 && !pIoCtx->cDataTransfersPending)
1295 return true;
1296
1297 return false;
1298}
1299
1300/**
1301 * Returns whether the given I/O context is blocked due to a metadata transfer
1302 * or because the backend blocked it.
1303 *
1304 * @returns Flag whether the I/O context is blocked.
1305 * @param pIoCtx The I/O context to check.
1306 */
1307DECLINLINE(bool) vdIoCtxIsBlocked(PVDIOCTX pIoCtx)
1308{
1309 /* Don't change anything if there is a metadata transfer pending or we are blocked. */
1310 if ( pIoCtx->cMetaTransfersPending
1311 || (pIoCtx->fFlags & VDIOCTX_FLAGS_BLOCKED))
1312 return true;
1313
1314 return false;
1315}
1316
1317/**
1318 * Process the I/O context, core method which assumes that the I/O context
1319 * acquired the lock.
1320 *
1321 * @returns VBox status code.
1322 * @param pIoCtx I/O context to process.
1323 */
1324static int vdIoCtxProcessLocked(PVDIOCTX pIoCtx)
1325{
1326 int rc = VINF_SUCCESS;
1327
1328 VD_IS_LOCKED(pIoCtx->pDisk);
1329
1330 LogFlowFunc(("pIoCtx=%#p\n", pIoCtx));
1331
1332 if (!vdIoCtxIsComplete(pIoCtx))
1333 {
1334 if (!vdIoCtxIsBlocked(pIoCtx))
1335 {
1336 if (pIoCtx->pfnIoCtxTransfer)
1337 {
1338 /* Call the transfer function advancing to the next while there is no error. */
1339 while ( pIoCtx->pfnIoCtxTransfer
1340 && !pIoCtx->cMetaTransfersPending
1341 && RT_SUCCESS(rc))
1342 {
1343 LogFlowFunc(("calling transfer function %#p\n", pIoCtx->pfnIoCtxTransfer));
1344 rc = pIoCtx->pfnIoCtxTransfer(pIoCtx);
1345
1346 /* Advance to the next part of the transfer if the current one succeeded. */
1347 if (RT_SUCCESS(rc))
1348 {
1349 pIoCtx->pfnIoCtxTransfer = pIoCtx->pfnIoCtxTransferNext;
1350 pIoCtx->pfnIoCtxTransferNext = NULL;
1351 }
1352 }
1353 }
1354
1355 if ( RT_SUCCESS(rc)
1356 && !pIoCtx->cMetaTransfersPending
1357 && !pIoCtx->cDataTransfersPending
1358 && !(pIoCtx->fFlags & VDIOCTX_FLAGS_BLOCKED))
1359 rc = VINF_VD_ASYNC_IO_FINISHED;
1360 else if ( RT_SUCCESS(rc)
1361 || rc == VERR_VD_NOT_ENOUGH_METADATA
1362 || rc == VERR_VD_IOCTX_HALT)
1363 rc = VERR_VD_ASYNC_IO_IN_PROGRESS;
1364 else if ( RT_FAILURE(rc)
1365 && (rc != VERR_VD_ASYNC_IO_IN_PROGRESS))
1366 {
1367 ASMAtomicCmpXchgS32(&pIoCtx->rcReq, rc, VINF_SUCCESS);
1368
1369 /*
1370 * The I/O context completed if we have an error and there is no data
1371 * or meta data transfer pending.
1372 */
1373 if ( !pIoCtx->cMetaTransfersPending
1374 && !pIoCtx->cDataTransfersPending)
1375 rc = VINF_VD_ASYNC_IO_FINISHED;
1376 else
1377 rc = VERR_VD_ASYNC_IO_IN_PROGRESS;
1378 }
1379 }
1380 else
1381 rc = VERR_VD_ASYNC_IO_IN_PROGRESS;
1382 }
1383 else
1384 rc = VINF_VD_ASYNC_IO_FINISHED;
1385
1386 LogFlowFunc(("pIoCtx=%#p rc=%Rrc cDataTransfersPending=%u cMetaTransfersPending=%u fComplete=%RTbool\n",
1387 pIoCtx, rc, pIoCtx->cDataTransfersPending, pIoCtx->cMetaTransfersPending,
1388 pIoCtx->fComplete));
1389
1390 return rc;
1391}
1392
1393/**
1394 * Processes the list of waiting I/O contexts.
1395 *
1396 * @returns VBox status code, only valid if pIoCtxRc is not NULL, treat as void
1397 * function otherwise.
1398 * @param pDisk The disk structure.
1399 * @param pIoCtxRc An I/O context handle which waits on the list. When processed
1400 * The status code is returned. NULL if there is no I/O context
1401 * to return the status code for.
1402 */
1403static int vdDiskProcessWaitingIoCtx(PVDISK pDisk, PVDIOCTX pIoCtxRc)
1404{
1405 int rc = VERR_VD_ASYNC_IO_IN_PROGRESS;
1406
1407 LogFlowFunc(("pDisk=%#p pIoCtxRc=%#p\n", pDisk, pIoCtxRc));
1408
1409 VD_IS_LOCKED(pDisk);
1410
1411 /* Get the waiting list and process it in FIFO order. */
1412 PVDIOCTX pIoCtxHead = ASMAtomicXchgPtrT(&pDisk->pIoCtxHead, NULL, PVDIOCTX);
1413
1414 /* Reverse it. */
1415 PVDIOCTX pCur = pIoCtxHead;
1416 pIoCtxHead = NULL;
1417 while (pCur)
1418 {
1419 PVDIOCTX pInsert = pCur;
1420 pCur = pCur->pIoCtxNext;
1421 pInsert->pIoCtxNext = pIoCtxHead;
1422 pIoCtxHead = pInsert;
1423 }
1424
1425 /* Process now. */
1426 pCur = pIoCtxHead;
1427 while (pCur)
1428 {
1429 int rcTmp;
1430 PVDIOCTX pTmp = pCur;
1431
1432 pCur = pCur->pIoCtxNext;
1433 pTmp->pIoCtxNext = NULL;
1434
1435 /*
1436 * Need to clear the sync flag here if there is a new I/O context
1437 * with it set and the context is not given in pIoCtxRc.
1438 * This happens most likely on a different thread and that one shouldn't
1439 * process the context synchronously.
1440 *
1441 * The thread who issued the context will wait on the event semaphore
1442 * anyway which is signalled when the completion handler is called.
1443 */
1444 if ( pTmp->fFlags & VDIOCTX_FLAGS_SYNC
1445 && pTmp != pIoCtxRc)
1446 pTmp->fFlags &= ~VDIOCTX_FLAGS_SYNC;
1447
1448 rcTmp = vdIoCtxProcessLocked(pTmp);
1449 if (pTmp == pIoCtxRc)
1450 {
1451 if ( rcTmp == VINF_VD_ASYNC_IO_FINISHED
1452 && RT_SUCCESS(pTmp->rcReq)
1453 && pTmp->enmTxDir == VDIOCTXTXDIR_READ)
1454 {
1455 int rc2 = vdFilterChainApplyRead(pDisk, pTmp->Req.Io.uOffsetXferOrig,
1456 pTmp->Req.Io.cbXferOrig, pTmp);
1457 if (RT_FAILURE(rc2))
1458 rcTmp = rc2;
1459 }
1460
1461 /* The given I/O context was processed, pass the return code to the caller. */
1462 if ( rcTmp == VINF_VD_ASYNC_IO_FINISHED
1463 && (pTmp->fFlags & VDIOCTX_FLAGS_SYNC))
1464 rc = pTmp->rcReq;
1465 else
1466 rc = rcTmp;
1467 }
1468 else if ( rcTmp == VINF_VD_ASYNC_IO_FINISHED
1469 && ASMAtomicCmpXchgBool(&pTmp->fComplete, true, false))
1470 {
1471 LogFlowFunc(("Waiting I/O context completed pTmp=%#p\n", pTmp));
1472 vdThreadFinishWrite(pDisk);
1473
1474 bool fFreeCtx = RT_BOOL(!(pTmp->fFlags & VDIOCTX_FLAGS_DONT_FREE));
1475 vdIoCtxRootComplete(pDisk, pTmp);
1476
1477 if (fFreeCtx)
1478 vdIoCtxFree(pDisk, pTmp);
1479 }
1480 }
1481
1482 LogFlowFunc(("returns rc=%Rrc\n", rc));
1483 return rc;
1484}
1485
1486/**
1487 * Processes the list of blocked I/O contexts.
1488 *
1489 * @param pDisk The disk structure.
1490 */
1491static void vdDiskProcessBlockedIoCtx(PVDISK pDisk)
1492{
1493 LogFlowFunc(("pDisk=%#p\n", pDisk));
1494
1495 VD_IS_LOCKED(pDisk);
1496
1497 /* Get the waiting list and process it in FIFO order. */
1498 PVDIOCTX pIoCtxHead = ASMAtomicXchgPtrT(&pDisk->pIoCtxBlockedHead, NULL, PVDIOCTX);
1499
1500 /* Reverse it. */
1501 PVDIOCTX pCur = pIoCtxHead;
1502 pIoCtxHead = NULL;
1503 while (pCur)
1504 {
1505 PVDIOCTX pInsert = pCur;
1506 pCur = pCur->pIoCtxNext;
1507 pInsert->pIoCtxNext = pIoCtxHead;
1508 pIoCtxHead = pInsert;
1509 }
1510
1511 /* Process now. */
1512 pCur = pIoCtxHead;
1513 while (pCur)
1514 {
1515 int rc;
1516 PVDIOCTX pTmp = pCur;
1517
1518 pCur = pCur->pIoCtxNext;
1519 pTmp->pIoCtxNext = NULL;
1520
1521 Assert(!pTmp->pIoCtxParent);
1522 Assert(pTmp->fFlags & VDIOCTX_FLAGS_BLOCKED);
1523 pTmp->fFlags &= ~VDIOCTX_FLAGS_BLOCKED;
1524
1525 rc = vdIoCtxProcessLocked(pTmp);
1526 if ( rc == VINF_VD_ASYNC_IO_FINISHED
1527 && ASMAtomicCmpXchgBool(&pTmp->fComplete, true, false))
1528 {
1529 LogFlowFunc(("Waiting I/O context completed pTmp=%#p\n", pTmp));
1530 vdThreadFinishWrite(pDisk);
1531
1532 bool fFreeCtx = RT_BOOL(!(pTmp->fFlags & VDIOCTX_FLAGS_DONT_FREE));
1533 vdIoCtxRootComplete(pDisk, pTmp);
1534 if (fFreeCtx)
1535 vdIoCtxFree(pDisk, pTmp);
1536 }
1537 }
1538
1539 LogFlowFunc(("returns\n"));
1540}
1541
1542/**
1543 * Processes the I/O context trying to lock the criticial section.
1544 * The context is deferred if the critical section is busy.
1545 *
1546 * @returns VBox status code.
1547 * @param pIoCtx The I/O context to process.
1548 */
1549static int vdIoCtxProcessTryLockDefer(PVDIOCTX pIoCtx)
1550{
1551 int rc = VINF_SUCCESS;
1552 PVDISK pDisk = pIoCtx->pDisk;
1553
1554 Log(("Defer pIoCtx=%#p\n", pIoCtx));
1555
1556 /* Put it on the waiting list first. */
1557 vdIoCtxAddToWaitingList(&pDisk->pIoCtxHead, pIoCtx);
1558
1559 if (ASMAtomicCmpXchgBool(&pDisk->fLocked, true, false))
1560 {
1561 /* Leave it again, the context will be processed just before leaving the lock. */
1562 LogFlowFunc(("Successfully acquired the lock\n"));
1563 rc = vdDiskUnlock(pDisk, pIoCtx);
1564 }
1565 else
1566 {
1567 LogFlowFunc(("Lock is held\n"));
1568 rc = VERR_VD_ASYNC_IO_IN_PROGRESS;
1569 }
1570
1571 return rc;
1572}
1573
1574/**
1575 * Process the I/O context in a synchronous manner, waiting
1576 * for it to complete.
1577 *
1578 * @returns VBox status code of the completed request.
1579 * @param pIoCtx The sync I/O context.
1580 * @param hEventComplete Event sempahore to wait on for completion.
1581 */
1582static int vdIoCtxProcessSync(PVDIOCTX pIoCtx, RTSEMEVENT hEventComplete)
1583{
1584 int rc = VINF_SUCCESS;
1585 PVDISK pDisk = pIoCtx->pDisk;
1586
1587 LogFlowFunc(("pIoCtx=%p\n", pIoCtx));
1588
1589 AssertMsg(pIoCtx->fFlags & (VDIOCTX_FLAGS_SYNC | VDIOCTX_FLAGS_DONT_FREE),
1590 ("I/O context is not marked as synchronous\n"));
1591
1592 rc = vdIoCtxProcessTryLockDefer(pIoCtx);
1593 if (rc == VINF_VD_ASYNC_IO_FINISHED)
1594 rc = VINF_SUCCESS;
1595
1596 if (rc == VERR_VD_ASYNC_IO_IN_PROGRESS)
1597 {
1598 rc = RTSemEventWait(hEventComplete, RT_INDEFINITE_WAIT);
1599 AssertRC(rc);
1600 }
1601
1602 rc = pIoCtx->rcReq;
1603 vdIoCtxFree(pDisk, pIoCtx);
1604
1605 return rc;
1606}
1607
1608DECLINLINE(bool) vdIoCtxIsDiskLockOwner(PVDISK pDisk, PVDIOCTX pIoCtx)
1609{
1610 return pDisk->pIoCtxLockOwner == pIoCtx;
1611}
1612
1613static int vdIoCtxLockDisk(PVDISK pDisk, PVDIOCTX pIoCtx)
1614{
1615 int rc = VINF_SUCCESS;
1616
1617 VD_IS_LOCKED(pDisk);
1618
1619 LogFlowFunc(("pDisk=%#p pIoCtx=%#p\n", pDisk, pIoCtx));
1620
1621 if (!ASMAtomicCmpXchgPtr(&pDisk->pIoCtxLockOwner, pIoCtx, NIL_VDIOCTX))
1622 {
1623 Assert(pDisk->pIoCtxLockOwner != pIoCtx); /* No nesting allowed. */
1624 vdIoCtxDefer(pDisk, pIoCtx);
1625 rc = VERR_VD_ASYNC_IO_IN_PROGRESS;
1626 }
1627
1628 LogFlowFunc(("returns -> %Rrc\n", rc));
1629 return rc;
1630}
1631
1632static void vdIoCtxUnlockDisk(PVDISK pDisk, PVDIOCTX pIoCtx, bool fProcessBlockedReqs)
1633{
1634 RT_NOREF1(pIoCtx);
1635 LogFlowFunc(("pDisk=%#p pIoCtx=%#p fProcessBlockedReqs=%RTbool\n",
1636 pDisk, pIoCtx, fProcessBlockedReqs));
1637
1638 VD_IS_LOCKED(pDisk);
1639
1640 LogFlow(("Unlocking disk lock owner is %#p\n", pDisk->pIoCtxLockOwner));
1641 Assert(pDisk->pIoCtxLockOwner == pIoCtx);
1642 ASMAtomicXchgPtrT(&pDisk->pIoCtxLockOwner, NIL_VDIOCTX, PVDIOCTX);
1643
1644 if (fProcessBlockedReqs)
1645 {
1646 /* Process any blocked writes if the current request didn't caused another growing. */
1647 vdDiskProcessBlockedIoCtx(pDisk);
1648 }
1649
1650 LogFlowFunc(("returns\n"));
1651}
1652
1653/**
1654 * Internal: Reads a given amount of data from the image chain of the disk.
1655 **/
1656static int vdDiskReadHelper(PVDISK pDisk, PVDIMAGE pImage, PVDIMAGE pImageParentOverride,
1657 uint64_t uOffset, size_t cbRead, PVDIOCTX pIoCtx, size_t *pcbThisRead)
1658{
1659 RT_NOREF1(pDisk);
1660 int rc = VINF_SUCCESS;
1661 size_t cbThisRead = cbRead;
1662
1663 AssertPtr(pcbThisRead);
1664
1665 *pcbThisRead = 0;
1666
1667 /*
1668 * Try to read from the given image.
1669 * If the block is not allocated read from override chain if present.
1670 */
1671 rc = pImage->Backend->pfnRead(pImage->pBackendData,
1672 uOffset, cbThisRead, pIoCtx,
1673 &cbThisRead);
1674
1675 if (rc == VERR_VD_BLOCK_FREE)
1676 {
1677 for (PVDIMAGE pCurrImage = pImageParentOverride ? pImageParentOverride : pImage->pPrev;
1678 pCurrImage != NULL && rc == VERR_VD_BLOCK_FREE;
1679 pCurrImage = pCurrImage->pPrev)
1680 {
1681 rc = pCurrImage->Backend->pfnRead(pCurrImage->pBackendData,
1682 uOffset, cbThisRead, pIoCtx,
1683 &cbThisRead);
1684 }
1685 }
1686
1687 if (RT_SUCCESS(rc) || rc == VERR_VD_BLOCK_FREE)
1688 *pcbThisRead = cbThisRead;
1689
1690 return rc;
1691}
1692
/**
 * internal: read the specified amount of data in whatever blocks the backend
 * will give us - async version.
 *
 * Used as I/O context transfer callback.  The read position, the remaining
 * size and the current image are taken from the request part of the context
 * and are written back to it when the read has to be interrupted with
 * VERR_VD_NOT_ENOUGH_METADATA or VERR_VD_IOCTX_HALT.
 *
 * @returns VBox status code.  VERR_VD_BLOCK_FREE is returned whenever
 *          VDIOCTX_FLAGS_ZERO_FREE_BLOCKS is not set in the context flags on
 *          exit, otherwise the status of the last processing step.
 * @param   pIoCtx    The I/O context describing the read.
 */
static DECLCALLBACK(int) vdReadHelperAsync(PVDIOCTX pIoCtx)
{
    int rc;
    PVDISK pDisk = pIoCtx->pDisk;
    size_t cbToRead = pIoCtx->Req.Io.cbTransfer;
    uint64_t uOffset = pIoCtx->Req.Io.uOffset;
    PVDIMAGE pCurrImage = pIoCtx->Req.Io.pImageCur;
    PVDIMAGE pImageParentOverride = pIoCtx->Req.Io.pImageParentOverride;
    unsigned cImagesRead = pIoCtx->Req.Io.cImagesRead;
    size_t cbThisRead;

    /*
     * Check whether there is a full block write in progress which was not allocated.
     * Defer I/O if the range interferes but only if it does not belong to the
     * write doing the allocation.
     */
    if (   pDisk->pIoCtxLockOwner != NIL_VDIOCTX
        && uOffset >= pDisk->uOffsetStartLocked
        && uOffset < pDisk->uOffsetEndLocked
        && (   !pIoCtx->pIoCtxParent
            || pIoCtx->pIoCtxParent != pDisk->pIoCtxLockOwner))
    {
        Log(("Interferring read while allocating a new block => deferring read\n"));
        vdIoCtxDefer(pDisk, pIoCtx);
        return VERR_VD_ASYNC_IO_IN_PROGRESS;
    }

    /* Loop until all reads started or we have a backend which needs to read metadata. */
    do
    {
        /* Search for image with allocated block. Do not attempt to read more
         * than the previous reads marked as valid. Otherwise this would return
         * stale data when different block sizes are used for the images. */
        cbThisRead = cbToRead;

        /* The cache is only consulted when no parent override is in effect. */
        if (   pDisk->pCache
            && !pImageParentOverride)
        {
            rc = vdCacheReadHelper(pDisk->pCache, uOffset, cbThisRead,
                                   pIoCtx, &cbThisRead);
            if (rc == VERR_VD_BLOCK_FREE)
            {
                /* Not in the cache, go through the image chain instead. */
                rc = vdDiskReadHelper(pDisk, pCurrImage, NULL, uOffset, cbThisRead,
                                      pIoCtx, &cbThisRead);

                /* If the read was successful, write the data back into the cache. */
                if (   RT_SUCCESS(rc)
                    && pIoCtx->fFlags & VDIOCTX_FLAGS_READ_UPDATE_CACHE)
                {
                    rc = vdCacheWriteHelper(pDisk->pCache, uOffset, cbThisRead,
                                            pIoCtx, NULL);
                }
            }
        }
        else
        {
            /*
             * Try to read from the given image.
             * If the block is not allocated read from override chain if present.
             */
            rc = pCurrImage->Backend->pfnRead(pCurrImage->pBackendData,
                                              uOffset, cbThisRead, pIoCtx,
                                              &cbThisRead);

            /* A cImagesRead value of 1 restricts the read to the image tried above. */
            if (   rc == VERR_VD_BLOCK_FREE
                && cImagesRead != 1)
            {
                unsigned cImagesToProcess = cImagesRead;

                /* Continue with the override image if there is one, with the real
                 * parent otherwise.  The override stored in the context is cleared. */
                pCurrImage = pImageParentOverride ? pImageParentOverride : pCurrImage->pPrev;
                pIoCtx->Req.Io.pImageParentOverride = NULL;

                while (pCurrImage && rc == VERR_VD_BLOCK_FREE)
                {
                    rc = pCurrImage->Backend->pfnRead(pCurrImage->pBackendData,
                                                      uOffset, cbThisRead,
                                                      pIoCtx, &cbThisRead);
                    /* Stop when the image budget is used up; a budget of 0 never
                     * reaches 1 here, so the walk is not cut off in that case. */
                    if (cImagesToProcess == 1)
                        break;
                    else if (cImagesToProcess > 0)
                        cImagesToProcess--;

                    if (rc == VERR_VD_BLOCK_FREE)
                        pCurrImage = pCurrImage->pPrev;
                }
            }
        }

        /* The task state will be updated on success already, don't do it here!. */
        if (rc == VERR_VD_BLOCK_FREE)
        {
            /* No image in the chain contains the data for the block. */
            ASMAtomicSubU32(&pIoCtx->Req.Io.cbTransferLeft, (uint32_t)cbThisRead); Assert(cbThisRead == (uint32_t)cbThisRead);

            /* Fill the free space with 0 if we are told to do so
             * or a previous read returned valid data. */
            if (pIoCtx->fFlags & VDIOCTX_FLAGS_ZERO_FREE_BLOCKS)
                vdIoCtxSet(pIoCtx, '\0', cbThisRead);
            else
                pIoCtx->Req.Io.cbBufClear += cbThisRead; /* Only remember how much would have to be cleared. */

            if (pIoCtx->Req.Io.pImageCur->uOpenFlags & VD_OPEN_FLAGS_INFORM_ABOUT_ZERO_BLOCKS)
                rc = VINF_VD_NEW_ZEROED_BLOCK;
            else
                rc = VINF_SUCCESS;
        }
        else if (rc == VERR_VD_IOCTX_HALT)
        {
            /* Advance past the current chunk and mark the context as blocked.  The
             * status is left as is, so the failure check below ends the loop and
             * the position is saved after the loop. */
            uOffset  += cbThisRead;
            cbToRead -= cbThisRead;
            pIoCtx->fFlags |= VDIOCTX_FLAGS_BLOCKED;
        }
        else if (   RT_SUCCESS(rc)
                 || rc == VERR_VD_ASYNC_IO_IN_PROGRESS)
        {
            /* First not free block, fill the space before with 0. */
            if (   pIoCtx->Req.Io.cbBufClear
                && !(pIoCtx->fFlags & VDIOCTX_FLAGS_ZERO_FREE_BLOCKS))
            {
                /* Work on a clone reset to the start so the position of the
                 * context's own S/G buffer is not changed. */
                RTSGBUF SgBuf;
                RTSgBufClone(&SgBuf, &pIoCtx->Req.Io.SgBuf);
                RTSgBufReset(&SgBuf);
                RTSgBufSet(&SgBuf, 0, pIoCtx->Req.Io.cbBufClear);
                pIoCtx->Req.Io.cbBufClear = 0;
                /* From now on free blocks get zeroed directly. */
                pIoCtx->fFlags |= VDIOCTX_FLAGS_ZERO_FREE_BLOCKS;
            }
            rc = VINF_SUCCESS;
        }

        if (RT_FAILURE(rc))
            break;

        cbToRead -= cbThisRead;
        uOffset  += cbThisRead;
        pCurrImage = pIoCtx->Req.Io.pImageStart; /* Start with the highest image in the chain. */
    } while (cbToRead != 0 && RT_SUCCESS(rc));

    if (   rc == VERR_VD_NOT_ENOUGH_METADATA
        || rc == VERR_VD_IOCTX_HALT)
    {
        /* Save the current state. */
        pIoCtx->Req.Io.uOffset    = uOffset;
        pIoCtx->Req.Io.cbTransfer = cbToRead;
        pIoCtx->Req.Io.pImageCur  = pCurrImage ? pCurrImage : pIoCtx->Req.Io.pImageStart;
    }

    /* Without the zero-free-blocks flag (neither requested by the caller nor set
     * above after valid data was found) the result is always VERR_VD_BLOCK_FREE. */
    return (!(pIoCtx->fFlags & VDIOCTX_FLAGS_ZERO_FREE_BLOCKS))
           ? VERR_VD_BLOCK_FREE
           : rc;
}
1847
1848/**
1849 * internal: parent image read wrapper for compacting.
1850 */
1851static DECLCALLBACK(int) vdParentRead(void *pvUser, uint64_t uOffset, void *pvBuf,
1852 size_t cbRead)
1853{
1854 PVDPARENTSTATEDESC pParentState = (PVDPARENTSTATEDESC)pvUser;
1855
1856 /** @todo
1857 * Only used for compaction so far which is not possible to mix with async I/O.
1858 * Needs to be changed if we want to support online compaction of images.
1859 */
1860 bool fLocked = ASMAtomicXchgBool(&pParentState->pDisk->fLocked, true);
1861 AssertMsgReturn(!fLocked,
1862 ("Calling synchronous parent read while another thread holds the disk lock\n"),
1863 VERR_VD_INVALID_STATE);
1864
1865 /* Fake an I/O context. */
1866 RTSGSEG Segment;
1867 RTSGBUF SgBuf;
1868 VDIOCTX IoCtx;
1869
1870 Segment.pvSeg = pvBuf;
1871 Segment.cbSeg = cbRead;
1872 RTSgBufInit(&SgBuf, &Segment, 1);
1873 vdIoCtxInit(&IoCtx, pParentState->pDisk, VDIOCTXTXDIR_READ, uOffset, cbRead, pParentState->pImage,
1874 &SgBuf, NULL, NULL, VDIOCTX_FLAGS_SYNC | VDIOCTX_FLAGS_ZERO_FREE_BLOCKS);
1875 int rc = vdReadHelperAsync(&IoCtx);
1876 ASMAtomicXchgBool(&pParentState->pDisk->fLocked, false);
1877 return rc;
1878}
1879
1880/**
1881 * Extended version of vdReadHelper(), implementing certain optimizations
1882 * for image cloning.
1883 *
1884 * @returns VBox status code.
1885 * @param pDisk The disk to read from.
1886 * @param pImage The image to start reading from.
1887 * @param pImageParentOverride The parent image to read from
1888 * if the starting image returns a free block.
1889 * If NULL is passed the real parent of the image
1890 * in the chain is used.
1891 * @param uOffset Offset in the disk to start reading from.
1892 * @param pvBuf Where to store the read data.
1893 * @param cbRead How much to read.
1894 * @param fZeroFreeBlocks Flag whether free blocks should be zeroed.
1895 * If false and no image has data for sepcified
1896 * range VERR_VD_BLOCK_FREE is returned.
1897 * Note that unallocated blocks are still zeroed
1898 * if at least one image has valid data for a part
1899 * of the range.
1900 * @param fUpdateCache Flag whether to update the attached cache if
1901 * available.
1902 * @param cImagesRead Number of images in the chain to read until
1903 * the read is cut off. A value of 0 disables the cut off.
1904 */
1905static int vdReadHelperEx(PVDISK pDisk, PVDIMAGE pImage, PVDIMAGE pImageParentOverride,
1906 uint64_t uOffset, void *pvBuf, size_t cbRead,
1907 bool fZeroFreeBlocks, bool fUpdateCache, unsigned cImagesRead)
1908{
1909 int rc = VINF_SUCCESS;
1910 uint32_t fFlags = VDIOCTX_FLAGS_SYNC | VDIOCTX_FLAGS_DONT_FREE;
1911 RTSGSEG Segment;
1912 RTSGBUF SgBuf;
1913 VDIOCTX IoCtx;
1914 RTSEMEVENT hEventComplete = NIL_RTSEMEVENT;
1915
1916 rc = RTSemEventCreate(&hEventComplete);
1917 if (RT_FAILURE(rc))
1918 return rc;
1919
1920 if (fZeroFreeBlocks)
1921 fFlags |= VDIOCTX_FLAGS_ZERO_FREE_BLOCKS;
1922 if (fUpdateCache)
1923 fFlags |= VDIOCTX_FLAGS_READ_UPDATE_CACHE;
1924
1925 Segment.pvSeg = pvBuf;
1926 Segment.cbSeg = cbRead;
1927 RTSgBufInit(&SgBuf, &Segment, 1);
1928 vdIoCtxInit(&IoCtx, pDisk, VDIOCTXTXDIR_READ, uOffset, cbRead, pImage, &SgBuf,
1929 NULL, vdReadHelperAsync, fFlags);
1930
1931 IoCtx.Req.Io.pImageParentOverride = pImageParentOverride;
1932 IoCtx.Req.Io.cImagesRead = cImagesRead;
1933 IoCtx.Type.Root.pfnComplete = vdIoCtxSyncComplete;
1934 IoCtx.Type.Root.pvUser1 = pDisk;
1935 IoCtx.Type.Root.pvUser2 = hEventComplete;
1936 rc = vdIoCtxProcessSync(&IoCtx, hEventComplete);
1937 RTSemEventDestroy(hEventComplete);
1938 return rc;
1939}
1940
1941/**
1942 * internal: read the specified amount of data in whatever blocks the backend
1943 * will give us.
1944 */
1945static int vdReadHelper(PVDISK pDisk, PVDIMAGE pImage, uint64_t uOffset,
1946 void *pvBuf, size_t cbRead, bool fUpdateCache)
1947{
1948 return vdReadHelperEx(pDisk, pImage, NULL, uOffset, pvBuf, cbRead,
1949 true /* fZeroFreeBlocks */, fUpdateCache, 0);
1950}
1951
1952/**
1953 * internal: mark the disk as not modified.
1954 */
1955static void vdResetModifiedFlag(PVDISK pDisk)
1956{
1957 if (pDisk->uModified & VD_IMAGE_MODIFIED_FLAG)
1958 {
1959 /* generate new last-modified uuid */
1960 if (!(pDisk->uModified & VD_IMAGE_MODIFIED_DISABLE_UUID_UPDATE))
1961 {
1962 RTUUID Uuid;
1963
1964 RTUuidCreate(&Uuid);
1965 pDisk->pLast->Backend->pfnSetModificationUuid(pDisk->pLast->pBackendData,
1966 &Uuid);
1967
1968 if (pDisk->pCache)
1969 pDisk->pCache->Backend->pfnSetModificationUuid(pDisk->pCache->pBackendData,
1970 &Uuid);
1971 }
1972
1973 pDisk->uModified &= ~VD_IMAGE_MODIFIED_FLAG;
1974 }
1975}
1976
1977/**
1978 * internal: mark the disk as modified.
1979 */
1980static void vdSetModifiedFlag(PVDISK pDisk)
1981{
1982 pDisk->uModified |= VD_IMAGE_MODIFIED_FLAG;
1983 if (pDisk->uModified & VD_IMAGE_MODIFIED_FIRST)
1984 {
1985 pDisk->uModified &= ~VD_IMAGE_MODIFIED_FIRST;
1986
1987 /* First modify, so create a UUID and ensure it's written to disk. */
1988 vdResetModifiedFlag(pDisk);
1989
1990 if (!(pDisk->uModified & VD_IMAGE_MODIFIED_DISABLE_UUID_UPDATE))
1991 {
1992 VDIOCTX IoCtx;
1993 vdIoCtxInit(&IoCtx, pDisk, VDIOCTXTXDIR_FLUSH, 0, 0, NULL,
1994 NULL, NULL, NULL, VDIOCTX_FLAGS_SYNC);
1995 pDisk->pLast->Backend->pfnFlush(pDisk->pLast->pBackendData, &IoCtx);
1996 }
1997 }
1998}
1999
2000/**
2001 * internal: write buffer to the image, taking care of block boundaries and
2002 * write optimizations.
2003 */
2004static int vdWriteHelperEx(PVDISK pDisk, PVDIMAGE pImage,
2005 PVDIMAGE pImageParentOverride, uint64_t uOffset,
2006 const void *pvBuf, size_t cbWrite,
2007 uint32_t fFlags, unsigned cImagesRead)
2008{
2009 int rc = VINF_SUCCESS;
2010 RTSGSEG Segment;
2011 RTSGBUF SgBuf;
2012 VDIOCTX IoCtx;
2013 RTSEMEVENT hEventComplete = NIL_RTSEMEVENT;
2014
2015 rc = RTSemEventCreate(&hEventComplete);
2016 if (RT_FAILURE(rc))
2017 return rc;
2018
2019 fFlags |= VDIOCTX_FLAGS_SYNC | VDIOCTX_FLAGS_DONT_FREE;
2020
2021 Segment.pvSeg = (void *)pvBuf;
2022 Segment.cbSeg = cbWrite;
2023 RTSgBufInit(&SgBuf, &Segment, 1);
2024 vdIoCtxInit(&IoCtx, pDisk, VDIOCTXTXDIR_WRITE, uOffset, cbWrite, pImage, &SgBuf,
2025 NULL, vdWriteHelperAsync, fFlags);
2026
2027 IoCtx.Req.Io.pImageParentOverride = pImageParentOverride;
2028 IoCtx.Req.Io.cImagesRead = cImagesRead;
2029 IoCtx.pIoCtxParent = NULL;
2030 IoCtx.Type.Root.pfnComplete = vdIoCtxSyncComplete;
2031 IoCtx.Type.Root.pvUser1 = pDisk;
2032 IoCtx.Type.Root.pvUser2 = hEventComplete;
2033 if (RT_SUCCESS(rc))
2034 rc = vdIoCtxProcessSync(&IoCtx, hEventComplete);
2035
2036 RTSemEventDestroy(hEventComplete);
2037 return rc;
2038}
2039
2040/**
2041 * internal: write buffer to the image, taking care of block boundaries and
2042 * write optimizations.
2043 */
2044static int vdWriteHelper(PVDISK pDisk, PVDIMAGE pImage, uint64_t uOffset,
2045 const void *pvBuf, size_t cbWrite, uint32_t fFlags)
2046{
2047 return vdWriteHelperEx(pDisk, pImage, NULL, uOffset, pvBuf, cbWrite,
2048 fFlags, 0);
2049}
2050
2051/**
2052 * Internal: Copies the content of one disk to another one applying optimizations
2053 * to speed up the copy process if possible.
2054 */
2055static int vdCopyHelper(PVDISK pDiskFrom, PVDIMAGE pImageFrom, PVDISK pDiskTo, PVDIMAGE pImageTo,
2056 uint64_t cbSize, unsigned cImagesFromRead, unsigned cImagesToRead,
2057 bool fSuppressRedundantIo, PVDINTERFACEPROGRESS pIfProgress,
2058 PVDINTERFACEPROGRESS pDstIfProgress)
2059{
2060 int rc = VINF_SUCCESS;
2061 int rc2;
2062 uint64_t uOffset = 0;
2063 uint64_t cbRemaining = cbSize;
2064 void *pvBuf = NULL;
2065 bool fLockReadFrom = false;
2066 bool fLockWriteTo = false;
2067 bool fBlockwiseCopy = false;
2068 unsigned uProgressOld = 0;
2069
2070 LogFlowFunc(("pDiskFrom=%#p pImageFrom=%#p pDiskTo=%#p pImageTo=%#p cbSize=%llu cImagesFromRead=%u cImagesToRead=%u fSuppressRedundantIo=%RTbool pIfProgress=%#p pDstIfProgress=%#p\n",
2071 pDiskFrom, pImageFrom, pDiskTo, pImageTo, cbSize, cImagesFromRead, cImagesToRead, fSuppressRedundantIo, pDstIfProgress, pDstIfProgress));
2072
2073 if ( (fSuppressRedundantIo || (cImagesFromRead > 0))
2074 && RTListIsEmpty(&pDiskFrom->ListFilterChainRead))
2075 fBlockwiseCopy = true;
2076
2077 /* Allocate tmp buffer. */
2078 pvBuf = RTMemTmpAlloc(VD_MERGE_BUFFER_SIZE);
2079 if (!pvBuf)
2080 return rc;
2081
2082 do
2083 {
2084 size_t cbThisRead = RT_MIN(VD_MERGE_BUFFER_SIZE, cbRemaining);
2085
2086 /* Note that we don't attempt to synchronize cross-disk accesses.
2087 * It wouldn't be very difficult to do, just the lock order would
2088 * need to be defined somehow to prevent deadlocks. Postpone such
2089 * magic as there is no use case for this. */
2090
2091 rc2 = vdThreadStartRead(pDiskFrom);
2092 AssertRC(rc2);
2093 fLockReadFrom = true;
2094
2095 if (fBlockwiseCopy)
2096 {
2097 RTSGSEG SegmentBuf;
2098 RTSGBUF SgBuf;
2099 VDIOCTX IoCtx;
2100
2101 SegmentBuf.pvSeg = pvBuf;
2102 SegmentBuf.cbSeg = VD_MERGE_BUFFER_SIZE;
2103 RTSgBufInit(&SgBuf, &SegmentBuf, 1);
2104 vdIoCtxInit(&IoCtx, pDiskFrom, VDIOCTXTXDIR_READ, 0, 0, NULL,
2105 &SgBuf, NULL, NULL, VDIOCTX_FLAGS_SYNC);
2106
2107 /* Read the source data. */
2108 rc = pImageFrom->Backend->pfnRead(pImageFrom->pBackendData,
2109 uOffset, cbThisRead, &IoCtx,
2110 &cbThisRead);
2111
2112 if ( rc == VERR_VD_BLOCK_FREE
2113 && cImagesFromRead != 1)
2114 {
2115 unsigned cImagesToProcess = cImagesFromRead;
2116
2117 for (PVDIMAGE pCurrImage = pImageFrom->pPrev;
2118 pCurrImage != NULL && rc == VERR_VD_BLOCK_FREE;
2119 pCurrImage = pCurrImage->pPrev)
2120 {
2121 rc = pCurrImage->Backend->pfnRead(pCurrImage->pBackendData,
2122 uOffset, cbThisRead,
2123 &IoCtx, &cbThisRead);
2124 if (cImagesToProcess == 1)
2125 break;
2126 else if (cImagesToProcess > 0)
2127 cImagesToProcess--;
2128 }
2129 }
2130 }
2131 else
2132 rc = vdReadHelper(pDiskFrom, pImageFrom, uOffset, pvBuf, cbThisRead,
2133 false /* fUpdateCache */);
2134
2135 if (RT_FAILURE(rc) && rc != VERR_VD_BLOCK_FREE)
2136 break;
2137
2138 rc2 = vdThreadFinishRead(pDiskFrom);
2139 AssertRC(rc2);
2140 fLockReadFrom = false;
2141
2142 if (rc != VERR_VD_BLOCK_FREE)
2143 {
2144 rc2 = vdThreadStartWrite(pDiskTo);
2145 AssertRC(rc2);
2146 fLockWriteTo = true;
2147
2148 /* Only do collapsed I/O if we are copying the data blockwise. */
2149 rc = vdWriteHelperEx(pDiskTo, pImageTo, NULL, uOffset, pvBuf,
2150 cbThisRead, VDIOCTX_FLAGS_DONT_SET_MODIFIED_FLAG /* fFlags */,
2151 fBlockwiseCopy ? cImagesToRead : 0);
2152 if (RT_FAILURE(rc))
2153 break;
2154
2155 rc2 = vdThreadFinishWrite(pDiskTo);
2156 AssertRC(rc2);
2157 fLockWriteTo = false;
2158 }
2159 else /* Don't propagate the error to the outside */
2160 rc = VINF_SUCCESS;
2161
2162 uOffset += cbThisRead;
2163 cbRemaining -= cbThisRead;
2164
2165 unsigned uProgressNew = uOffset * 99 / cbSize;
2166 if (uProgressNew != uProgressOld)
2167 {
2168 uProgressOld = uProgressNew;
2169
2170 if (pIfProgress && pIfProgress->pfnProgress)
2171 {
2172 rc = pIfProgress->pfnProgress(pIfProgress->Core.pvUser,
2173 uProgressOld);
2174 if (RT_FAILURE(rc))
2175 break;
2176 }
2177 if (pDstIfProgress && pDstIfProgress->pfnProgress)
2178 {
2179 rc = pDstIfProgress->pfnProgress(pDstIfProgress->Core.pvUser,
2180 uProgressOld);
2181 if (RT_FAILURE(rc))
2182 break;
2183 }
2184 }
2185 } while (uOffset < cbSize);
2186
2187 RTMemFree(pvBuf);
2188
2189 if (fLockReadFrom)
2190 {
2191 rc2 = vdThreadFinishRead(pDiskFrom);
2192 AssertRC(rc2);
2193 }
2194
2195 if (fLockWriteTo)
2196 {
2197 rc2 = vdThreadFinishWrite(pDiskTo);
2198 AssertRC(rc2);
2199 }
2200
2201 LogFlowFunc(("returns rc=%Rrc\n", rc));
2202 return rc;
2203}
2204
2205/**
2206 * Flush helper async version.
2207 */
2208static DECLCALLBACK(int) vdSetModifiedHelperAsync(PVDIOCTX pIoCtx)
2209{
2210 int rc = VINF_SUCCESS;
2211 PVDIMAGE pImage = pIoCtx->Req.Io.pImageCur;
2212
2213 rc = pImage->Backend->pfnFlush(pImage->pBackendData, pIoCtx);
2214 if (rc == VERR_VD_ASYNC_IO_IN_PROGRESS)
2215 rc = VINF_SUCCESS;
2216
2217 return rc;
2218}
2219
2220/**
2221 * internal: mark the disk as modified - async version.
2222 */
2223static int vdSetModifiedFlagAsync(PVDISK pDisk, PVDIOCTX pIoCtx)
2224{
2225 int rc = VINF_SUCCESS;
2226
2227 VD_IS_LOCKED(pDisk);
2228
2229 pDisk->uModified |= VD_IMAGE_MODIFIED_FLAG;
2230 if (pDisk->uModified & VD_IMAGE_MODIFIED_FIRST)
2231 {
2232 rc = vdIoCtxLockDisk(pDisk, pIoCtx);
2233 if (RT_SUCCESS(rc))
2234 {
2235 pDisk->uModified &= ~VD_IMAGE_MODIFIED_FIRST;
2236
2237 /* First modify, so create a UUID and ensure it's written to disk. */
2238 vdResetModifiedFlag(pDisk);
2239
2240 if (!(pDisk->uModified & VD_IMAGE_MODIFIED_DISABLE_UUID_UPDATE))
2241 {
2242 PVDIOCTX pIoCtxFlush = vdIoCtxChildAlloc(pDisk, VDIOCTXTXDIR_FLUSH,
2243 0, 0, pDisk->pLast,
2244 NULL, pIoCtx, 0, 0, NULL,
2245 vdSetModifiedHelperAsync);
2246
2247 if (pIoCtxFlush)
2248 {
2249 rc = vdIoCtxProcessLocked(pIoCtxFlush);
2250 if (rc == VINF_VD_ASYNC_IO_FINISHED)
2251 {
2252 vdIoCtxUnlockDisk(pDisk, pIoCtx, false /* fProcessDeferredReqs */);
2253 vdIoCtxFree(pDisk, pIoCtxFlush);
2254 }
2255 else if (rc == VERR_VD_ASYNC_IO_IN_PROGRESS)
2256 {
2257 ASMAtomicIncU32(&pIoCtx->cDataTransfersPending);
2258 pIoCtx->fFlags |= VDIOCTX_FLAGS_BLOCKED;
2259 }
2260 else /* Another error */
2261 vdIoCtxFree(pDisk, pIoCtxFlush);
2262 }
2263 else
2264 rc = VERR_NO_MEMORY;
2265 }
2266 }
2267 }
2268
2269 return rc;
2270}
2271
2272static DECLCALLBACK(int) vdWriteHelperCommitAsync(PVDIOCTX pIoCtx)
2273{
2274 int rc = VINF_SUCCESS;
2275 PVDIMAGE pImage = pIoCtx->Req.Io.pImageStart;
2276 size_t cbPreRead = pIoCtx->Type.Child.cbPreRead;
2277 size_t cbPostRead = pIoCtx->Type.Child.cbPostRead;
2278 size_t cbThisWrite = pIoCtx->Type.Child.cbTransferParent;
2279
2280 LogFlowFunc(("pIoCtx=%#p\n", pIoCtx));
2281 rc = pImage->Backend->pfnWrite(pImage->pBackendData,
2282 pIoCtx->Req.Io.uOffset - cbPreRead,
2283 cbPreRead + cbThisWrite + cbPostRead,
2284 pIoCtx, NULL, &cbPreRead, &cbPostRead, 0);
2285 Assert(rc != VERR_VD_BLOCK_FREE);
2286 Assert(rc == VERR_VD_NOT_ENOUGH_METADATA || cbPreRead == 0);
2287 Assert(rc == VERR_VD_NOT_ENOUGH_METADATA || cbPostRead == 0);
2288 if (rc == VERR_VD_ASYNC_IO_IN_PROGRESS)
2289 rc = VINF_SUCCESS;
2290 else if (rc == VERR_VD_IOCTX_HALT)
2291 {
2292 pIoCtx->fFlags |= VDIOCTX_FLAGS_BLOCKED;
2293 rc = VINF_SUCCESS;
2294 }
2295
2296 LogFlowFunc(("returns rc=%Rrc\n", rc));
2297 return rc;
2298}
2299
2300static DECLCALLBACK(int) vdWriteHelperOptimizedCmpAndWriteAsync(PVDIOCTX pIoCtx)
2301{
2302 int rc = VINF_SUCCESS;
2303 size_t cbThisWrite = 0;
2304 size_t cbPreRead = pIoCtx->Type.Child.cbPreRead;
2305 size_t cbPostRead = pIoCtx->Type.Child.cbPostRead;
2306 size_t cbWriteCopy = pIoCtx->Type.Child.Write.Optimized.cbWriteCopy;
2307 size_t cbFill = pIoCtx->Type.Child.Write.Optimized.cbFill;
2308 size_t cbReadImage = pIoCtx->Type.Child.Write.Optimized.cbReadImage;
2309 PVDIOCTX pIoCtxParent = pIoCtx->pIoCtxParent;
2310
2311 LogFlowFunc(("pIoCtx=%#p\n", pIoCtx));
2312
2313 AssertPtr(pIoCtxParent);
2314 Assert(!pIoCtxParent->pIoCtxParent);
2315 Assert(!pIoCtx->Req.Io.cbTransferLeft && !pIoCtx->cMetaTransfersPending);
2316
2317 vdIoCtxChildReset(pIoCtx);
2318 cbThisWrite = pIoCtx->Type.Child.cbTransferParent;
2319 RTSgBufAdvance(&pIoCtx->Req.Io.SgBuf, cbPreRead);
2320
2321 /* Check if the write would modify anything in this block. */
2322 if (!RTSgBufCmp(&pIoCtx->Req.Io.SgBuf, &pIoCtxParent->Req.Io.SgBuf, cbThisWrite))
2323 {
2324 RTSGBUF SgBufSrcTmp;
2325
2326 RTSgBufClone(&SgBufSrcTmp, &pIoCtxParent->Req.Io.SgBuf);
2327 RTSgBufAdvance(&SgBufSrcTmp, cbThisWrite);
2328 RTSgBufAdvance(&pIoCtx->Req.Io.SgBuf, cbThisWrite);
2329
2330 if (!cbWriteCopy || !RTSgBufCmp(&pIoCtx->Req.Io.SgBuf, &SgBufSrcTmp, cbWriteCopy))
2331 {
2332 /* Block is completely unchanged, so no need to write anything. */
2333 LogFlowFunc(("Block didn't changed\n"));
2334 ASMAtomicWriteU32(&pIoCtx->Req.Io.cbTransferLeft, 0);
2335 RTSgBufAdvance(&pIoCtxParent->Req.Io.SgBuf, cbThisWrite);
2336 return VINF_VD_ASYNC_IO_FINISHED;
2337 }
2338 }
2339
2340 /* Copy the data to the right place in the buffer. */
2341 RTSgBufReset(&pIoCtx->Req.Io.SgBuf);
2342 RTSgBufAdvance(&pIoCtx->Req.Io.SgBuf, cbPreRead);
2343 vdIoCtxCopy(pIoCtx, pIoCtxParent, cbThisWrite);
2344
2345 /* Handle the data that goes after the write to fill the block. */
2346 if (cbPostRead)
2347 {
2348 /* Now assemble the remaining data. */
2349 if (cbWriteCopy)
2350 {
2351 /*
2352 * The S/G buffer of the parent needs to be cloned because
2353 * it is not allowed to modify the state.
2354 */
2355 RTSGBUF SgBufParentTmp;
2356
2357 RTSgBufClone(&SgBufParentTmp, &pIoCtxParent->Req.Io.SgBuf);
2358 RTSgBufCopy(&pIoCtx->Req.Io.SgBuf, &SgBufParentTmp, cbWriteCopy);
2359 }
2360
2361 /* Zero out the remainder of this block. Will never be visible, as this
2362 * is beyond the limit of the image. */
2363 if (cbFill)
2364 {
2365 RTSgBufAdvance(&pIoCtx->Req.Io.SgBuf, cbReadImage);
2366 vdIoCtxSet(pIoCtx, '\0', cbFill);
2367 }
2368 }
2369
2370 /* Write the full block to the virtual disk. */
2371 RTSgBufReset(&pIoCtx->Req.Io.SgBuf);
2372 pIoCtx->pfnIoCtxTransferNext = vdWriteHelperCommitAsync;
2373
2374 return rc;
2375}
2376
2377static DECLCALLBACK(int) vdWriteHelperOptimizedPreReadAsync(PVDIOCTX pIoCtx)
2378{
2379 int rc = VINF_SUCCESS;
2380
2381 LogFlowFunc(("pIoCtx=%#p\n", pIoCtx));
2382
2383 pIoCtx->fFlags |= VDIOCTX_FLAGS_ZERO_FREE_BLOCKS;
2384
2385 if ( pIoCtx->Req.Io.cbTransferLeft
2386 && !pIoCtx->cDataTransfersPending)
2387 rc = vdReadHelperAsync(pIoCtx);
2388
2389 if ( ( RT_SUCCESS(rc)
2390 || (rc == VERR_VD_ASYNC_IO_IN_PROGRESS))
2391 && ( pIoCtx->Req.Io.cbTransferLeft
2392 || pIoCtx->cMetaTransfersPending))
2393 rc = VERR_VD_ASYNC_IO_IN_PROGRESS;
2394 else
2395 pIoCtx->pfnIoCtxTransferNext = vdWriteHelperOptimizedCmpAndWriteAsync;
2396
2397 return rc;
2398}
2399
2400/**
2401 * internal: write a complete block (only used for diff images), taking the
2402 * remaining data from parent images. This implementation optimizes out writes
2403 * that do not change the data relative to the state as of the parent images.
2404 * All backends which support differential/growing images support this - async version.
2405 */
2406static DECLCALLBACK(int) vdWriteHelperOptimizedAsync(PVDIOCTX pIoCtx)
2407{
2408 PVDISK pDisk = pIoCtx->pDisk;
2409 uint64_t uOffset = pIoCtx->Type.Child.uOffsetSaved;
2410 size_t cbThisWrite = pIoCtx->Type.Child.cbTransferParent;
2411 size_t cbPreRead = pIoCtx->Type.Child.cbPreRead;
2412 size_t cbPostRead = pIoCtx->Type.Child.cbPostRead;
2413 size_t cbWrite = pIoCtx->Type.Child.cbWriteParent;
2414 size_t cbFill = 0;
2415 size_t cbWriteCopy = 0;
2416 size_t cbReadImage = 0;
2417
2418 LogFlowFunc(("pIoCtx=%#p\n", pIoCtx));
2419
2420 AssertPtr(pIoCtx->pIoCtxParent);
2421 Assert(!pIoCtx->pIoCtxParent->pIoCtxParent);
2422
2423 if (cbPostRead)
2424 {
2425 /* Figure out how much we cannot read from the image, because
2426 * the last block to write might exceed the nominal size of the
2427 * image for technical reasons. */
2428 if (uOffset + cbThisWrite + cbPostRead > pDisk->cbSize)
2429 cbFill = uOffset + cbThisWrite + cbPostRead - pDisk->cbSize;
2430
2431 /* If we have data to be written, use that instead of reading
2432 * data from the image. */
2433 if (cbWrite > cbThisWrite)
2434 cbWriteCopy = RT_MIN(cbWrite - cbThisWrite, cbPostRead);
2435
2436 /* The rest must be read from the image. */
2437 cbReadImage = cbPostRead - cbWriteCopy - cbFill;
2438 }
2439
2440 pIoCtx->Type.Child.Write.Optimized.cbFill = cbFill;
2441 pIoCtx->Type.Child.Write.Optimized.cbWriteCopy = cbWriteCopy;
2442 pIoCtx->Type.Child.Write.Optimized.cbReadImage = cbReadImage;
2443
2444 /* Read the entire data of the block so that we can compare whether it will
2445 * be modified by the write or not. */
2446 size_t cbTmp = cbPreRead + cbThisWrite + cbPostRead - cbFill; Assert(cbTmp == (uint32_t)cbTmp);
2447 pIoCtx->Req.Io.cbTransferLeft = (uint32_t)cbTmp;
2448 pIoCtx->Req.Io.cbTransfer = pIoCtx->Req.Io.cbTransferLeft;
2449 pIoCtx->Req.Io.uOffset -= cbPreRead;
2450
2451 /* Next step */
2452 pIoCtx->pfnIoCtxTransferNext = vdWriteHelperOptimizedPreReadAsync;
2453 return VINF_SUCCESS;
2454}
2455
2456static DECLCALLBACK(int) vdWriteHelperStandardReadImageAsync(PVDIOCTX pIoCtx)
2457{
2458 int rc = VINF_SUCCESS;
2459
2460 LogFlowFunc(("pIoCtx=%#p\n", pIoCtx));
2461
2462 pIoCtx->fFlags |= VDIOCTX_FLAGS_ZERO_FREE_BLOCKS;
2463
2464 if ( pIoCtx->Req.Io.cbTransferLeft
2465 && !pIoCtx->cDataTransfersPending)
2466 rc = vdReadHelperAsync(pIoCtx);
2467
2468 if ( RT_SUCCESS(rc)
2469 && ( pIoCtx->Req.Io.cbTransferLeft
2470 || pIoCtx->cMetaTransfersPending))
2471 rc = VERR_VD_ASYNC_IO_IN_PROGRESS;
2472 else
2473 {
2474 size_t cbFill = pIoCtx->Type.Child.Write.Optimized.cbFill;
2475
2476 /* Zero out the remainder of this block. Will never be visible, as this
2477 * is beyond the limit of the image. */
2478 if (cbFill)
2479 vdIoCtxSet(pIoCtx, '\0', cbFill);
2480
2481 /* Write the full block to the virtual disk. */
2482 RTSgBufReset(&pIoCtx->Req.Io.SgBuf);
2483
2484 vdIoCtxChildReset(pIoCtx);
2485 pIoCtx->pfnIoCtxTransferNext = vdWriteHelperCommitAsync;
2486 }
2487
2488 return rc;
2489}
2490
2491static DECLCALLBACK(int) vdWriteHelperStandardAssemble(PVDIOCTX pIoCtx)
2492{
2493 int rc = VINF_SUCCESS;
2494 size_t cbPostRead = pIoCtx->Type.Child.cbPostRead;
2495 size_t cbThisWrite = pIoCtx->Type.Child.cbTransferParent;
2496 PVDIOCTX pIoCtxParent = pIoCtx->pIoCtxParent;
2497
2498 LogFlowFunc(("pIoCtx=%#p\n", pIoCtx));
2499
2500 vdIoCtxCopy(pIoCtx, pIoCtxParent, cbThisWrite);
2501 if (cbPostRead)
2502 {
2503 size_t cbFill = pIoCtx->Type.Child.Write.Optimized.cbFill;
2504 size_t cbWriteCopy = pIoCtx->Type.Child.Write.Optimized.cbWriteCopy;
2505 size_t cbReadImage = pIoCtx->Type.Child.Write.Optimized.cbReadImage;
2506
2507 /* Now assemble the remaining data. */
2508 if (cbWriteCopy)
2509 {
2510 /*
2511 * The S/G buffer of the parent needs to be cloned because
2512 * it is not allowed to modify the state.
2513 */
2514 RTSGBUF SgBufParentTmp;
2515
2516 RTSgBufClone(&SgBufParentTmp, &pIoCtxParent->Req.Io.SgBuf);
2517 RTSgBufCopy(&pIoCtx->Req.Io.SgBuf, &SgBufParentTmp, cbWriteCopy);
2518 }
2519
2520 if (cbReadImage)
2521 {
2522 /* Read remaining data. */
2523 pIoCtx->pfnIoCtxTransferNext = vdWriteHelperStandardReadImageAsync;
2524
2525 /* Read the data that goes before the write to fill the block. */
2526 pIoCtx->Req.Io.cbTransferLeft = (uint32_t)cbReadImage; Assert(cbReadImage == (uint32_t)cbReadImage);
2527 pIoCtx->Req.Io.cbTransfer = pIoCtx->Req.Io.cbTransferLeft;
2528 pIoCtx->Req.Io.uOffset += cbWriteCopy;
2529 }
2530 else
2531 {
2532 /* Zero out the remainder of this block. Will never be visible, as this
2533 * is beyond the limit of the image. */
2534 if (cbFill)
2535 vdIoCtxSet(pIoCtx, '\0', cbFill);
2536
2537 /* Write the full block to the virtual disk. */
2538 RTSgBufReset(&pIoCtx->Req.Io.SgBuf);
2539 vdIoCtxChildReset(pIoCtx);
2540 pIoCtx->pfnIoCtxTransferNext = vdWriteHelperCommitAsync;
2541 }
2542 }
2543 else
2544 {
2545 /* Write the full block to the virtual disk. */
2546 RTSgBufReset(&pIoCtx->Req.Io.SgBuf);
2547 vdIoCtxChildReset(pIoCtx);
2548 pIoCtx->pfnIoCtxTransferNext = vdWriteHelperCommitAsync;
2549 }
2550
2551 return rc;
2552}
2553
2554static DECLCALLBACK(int) vdWriteHelperStandardPreReadAsync(PVDIOCTX pIoCtx)
2555{
2556 int rc = VINF_SUCCESS;
2557
2558 LogFlowFunc(("pIoCtx=%#p\n", pIoCtx));
2559
2560 pIoCtx->fFlags |= VDIOCTX_FLAGS_ZERO_FREE_BLOCKS;
2561
2562 if ( pIoCtx->Req.Io.cbTransferLeft
2563 && !pIoCtx->cDataTransfersPending)
2564 rc = vdReadHelperAsync(pIoCtx);
2565
2566 if ( RT_SUCCESS(rc)
2567 && ( pIoCtx->Req.Io.cbTransferLeft
2568 || pIoCtx->cMetaTransfersPending))
2569 rc = VERR_VD_ASYNC_IO_IN_PROGRESS;
2570 else
2571 pIoCtx->pfnIoCtxTransferNext = vdWriteHelperStandardAssemble;
2572
2573 return rc;
2574}
2575
2576static DECLCALLBACK(int) vdWriteHelperStandardAsync(PVDIOCTX pIoCtx)
2577{
2578 PVDISK pDisk = pIoCtx->pDisk;
2579 uint64_t uOffset = pIoCtx->Type.Child.uOffsetSaved;
2580 size_t cbThisWrite = pIoCtx->Type.Child.cbTransferParent;
2581 size_t cbPreRead = pIoCtx->Type.Child.cbPreRead;
2582 size_t cbPostRead = pIoCtx->Type.Child.cbPostRead;
2583 size_t cbWrite = pIoCtx->Type.Child.cbWriteParent;
2584 size_t cbFill = 0;
2585 size_t cbWriteCopy = 0;
2586 size_t cbReadImage = 0;
2587
2588 LogFlowFunc(("pIoCtx=%#p\n", pIoCtx));
2589
2590 AssertPtr(pIoCtx->pIoCtxParent);
2591 Assert(!pIoCtx->pIoCtxParent->pIoCtxParent);
2592
2593 /* Calculate the amount of data to read that goes after the write to fill the block. */
2594 if (cbPostRead)
2595 {
2596 /* If we have data to be written, use that instead of reading
2597 * data from the image. */
2598 if (cbWrite > cbThisWrite)
2599 cbWriteCopy = RT_MIN(cbWrite - cbThisWrite, cbPostRead);
2600 else
2601 cbWriteCopy = 0;
2602
2603 /* Figure out how much we cannot read from the image, because
2604 * the last block to write might exceed the nominal size of the
2605 * image for technical reasons. */
2606 if (uOffset + cbThisWrite + cbPostRead > pDisk->cbSize)
2607 cbFill = uOffset + cbThisWrite + cbPostRead - pDisk->cbSize;
2608
2609 /* The rest must be read from the image. */
2610 cbReadImage = cbPostRead - cbWriteCopy - cbFill;
2611 }
2612
2613 pIoCtx->Type.Child.Write.Optimized.cbFill = cbFill;
2614 pIoCtx->Type.Child.Write.Optimized.cbWriteCopy = cbWriteCopy;
2615 pIoCtx->Type.Child.Write.Optimized.cbReadImage = cbReadImage;
2616
2617 /* Next step */
2618 if (cbPreRead)
2619 {
2620 pIoCtx->pfnIoCtxTransferNext = vdWriteHelperStandardPreReadAsync;
2621
2622 /* Read the data that goes before the write to fill the block. */
2623 pIoCtx->Req.Io.cbTransferLeft = (uint32_t)cbPreRead; Assert(cbPreRead == (uint32_t)cbPreRead);
2624 pIoCtx->Req.Io.cbTransfer = pIoCtx->Req.Io.cbTransferLeft;
2625 pIoCtx->Req.Io.uOffset -= cbPreRead;
2626 }
2627 else
2628 pIoCtx->pfnIoCtxTransferNext = vdWriteHelperStandardAssemble;
2629
2630 return VINF_SUCCESS;
2631}
2632
2633/**
2634 * internal: write buffer to the image, taking care of block boundaries and
2635 * write optimizations - async version.
2636 */
2637static DECLCALLBACK(int) vdWriteHelperAsync(PVDIOCTX pIoCtx)
2638{
2639 int rc;
2640 size_t cbWrite = pIoCtx->Req.Io.cbTransfer;
2641 uint64_t uOffset = pIoCtx->Req.Io.uOffset;
2642 PVDIMAGE pImage = pIoCtx->Req.Io.pImageCur;
2643 PVDISK pDisk = pIoCtx->pDisk;
2644 unsigned fWrite;
2645 size_t cbThisWrite;
2646 size_t cbPreRead, cbPostRead;
2647
2648 /* Apply write filter chain here if it was not done already. */
2649 if (!(pIoCtx->fFlags & VDIOCTX_FLAGS_WRITE_FILTER_APPLIED))
2650 {
2651 rc = vdFilterChainApplyWrite(pDisk, uOffset, cbWrite, pIoCtx);
2652 if (RT_FAILURE(rc))
2653 return rc;
2654 pIoCtx->fFlags |= VDIOCTX_FLAGS_WRITE_FILTER_APPLIED;
2655 }
2656
2657 if (!(pIoCtx->fFlags & VDIOCTX_FLAGS_DONT_SET_MODIFIED_FLAG))
2658 {
2659 rc = vdSetModifiedFlagAsync(pDisk, pIoCtx);
2660 if (RT_FAILURE(rc)) /* Includes I/O in progress. */
2661 return rc;
2662 }
2663
2664 rc = vdDiscardSetRangeAllocated(pDisk, uOffset, cbWrite);
2665 if (RT_FAILURE(rc))
2666 return rc;
2667
2668 /* Loop until all written. */
2669 do
2670 {
2671 /* Try to write the possibly partial block to the last opened image.
2672 * This works when the block is already allocated in this image or
2673 * if it is a full-block write (and allocation isn't suppressed below).
2674 * For image formats which don't support zero blocks, it's beneficial
2675 * to avoid unnecessarily allocating unchanged blocks. This prevents
2676 * unwanted expanding of images. VMDK is an example. */
2677 cbThisWrite = cbWrite;
2678
2679 /*
2680 * Check whether there is a full block write in progress which was not allocated.
2681 * Defer I/O if the range interferes.
2682 */
2683 if ( pDisk->pIoCtxLockOwner != NIL_VDIOCTX
2684 && uOffset >= pDisk->uOffsetStartLocked
2685 && uOffset < pDisk->uOffsetEndLocked)
2686 {
2687 Log(("Interferring write while allocating a new block => deferring write\n"));
2688 vdIoCtxDefer(pDisk, pIoCtx);
2689 rc = VERR_VD_ASYNC_IO_IN_PROGRESS;
2690 break;
2691 }
2692
2693 fWrite = (pImage->uOpenFlags & VD_OPEN_FLAGS_HONOR_SAME)
2694 ? 0 : VD_WRITE_NO_ALLOC;
2695 rc = pImage->Backend->pfnWrite(pImage->pBackendData, uOffset, cbThisWrite,
2696 pIoCtx, &cbThisWrite, &cbPreRead, &cbPostRead,
2697 fWrite);
2698 if (rc == VERR_VD_BLOCK_FREE)
2699 {
2700 /* Lock the disk .*/
2701 rc = vdIoCtxLockDisk(pDisk, pIoCtx);
2702 if (RT_SUCCESS(rc))
2703 {
2704 /*
2705 * Allocate segment and buffer in one go.
2706 * A bit hackish but avoids the need to allocate memory twice.
2707 */
2708 PRTSGBUF pTmp = (PRTSGBUF)RTMemAlloc(cbPreRead + cbThisWrite + cbPostRead + sizeof(RTSGSEG) + sizeof(RTSGBUF));
2709 AssertBreakStmt(pTmp, rc = VERR_NO_MEMORY);
2710 PRTSGSEG pSeg = (PRTSGSEG)(pTmp + 1);
2711
2712 pSeg->pvSeg = pSeg + 1;
2713 pSeg->cbSeg = cbPreRead + cbThisWrite + cbPostRead;
2714 RTSgBufInit(pTmp, pSeg, 1);
2715
2716 PVDIOCTX pIoCtxWrite = vdIoCtxChildAlloc(pDisk, VDIOCTXTXDIR_WRITE,
2717 uOffset, pSeg->cbSeg, pImage,
2718 pTmp,
2719 pIoCtx, cbThisWrite,
2720 cbWrite,
2721 pTmp,
2722 (pImage->uOpenFlags & VD_OPEN_FLAGS_HONOR_SAME)
2723 ? vdWriteHelperStandardAsync
2724 : vdWriteHelperOptimizedAsync);
2725 if (!pIoCtxWrite)
2726 {
2727 RTMemTmpFree(pTmp);
2728 rc = VERR_NO_MEMORY;
2729 break;
2730 }
2731
2732 LogFlowFunc(("Disk is growing because of pIoCtx=%#p pIoCtxWrite=%#p\n",
2733 pIoCtx, pIoCtxWrite));
2734
2735 /* Save the current range for the growing operation to check for intersecting requests later. */
2736 pDisk->uOffsetStartLocked = uOffset - cbPreRead;
2737 pDisk->uOffsetEndLocked = uOffset + cbThisWrite + cbPostRead;
2738
2739 pIoCtxWrite->Type.Child.cbPreRead = cbPreRead;
2740 pIoCtxWrite->Type.Child.cbPostRead = cbPostRead;
2741 pIoCtxWrite->Req.Io.pImageParentOverride = pIoCtx->Req.Io.pImageParentOverride;
2742
2743 /* Process the write request */
2744 rc = vdIoCtxProcessLocked(pIoCtxWrite);
2745
2746 if (RT_FAILURE(rc) && (rc != VERR_VD_ASYNC_IO_IN_PROGRESS))
2747 {
2748 vdIoCtxUnlockDisk(pDisk, pIoCtx, false /* fProcessDeferredReqs*/ );
2749 vdIoCtxFree(pDisk, pIoCtxWrite);
2750 break;
2751 }
2752 else if ( rc == VINF_VD_ASYNC_IO_FINISHED
2753 && ASMAtomicCmpXchgBool(&pIoCtxWrite->fComplete, true, false))
2754 {
2755 LogFlow(("Child write request completed\n"));
2756 Assert(pIoCtx->Req.Io.cbTransferLeft >= cbThisWrite);
2757 Assert(cbThisWrite == (uint32_t)cbThisWrite);
2758 rc = pIoCtxWrite->rcReq;
2759 ASMAtomicSubU32(&pIoCtx->Req.Io.cbTransferLeft, (uint32_t)cbThisWrite);
2760 vdIoCtxUnlockDisk(pDisk, pIoCtx, false /* fProcessDeferredReqs*/ );
2761 vdIoCtxFree(pDisk, pIoCtxWrite);
2762 }
2763 else
2764 {
2765 LogFlow(("Child write pending\n"));
2766 ASMAtomicIncU32(&pIoCtx->cDataTransfersPending);
2767 pIoCtx->fFlags |= VDIOCTX_FLAGS_BLOCKED;
2768 rc = VERR_VD_ASYNC_IO_IN_PROGRESS;
2769 cbWrite -= cbThisWrite;
2770 uOffset += cbThisWrite;
2771 break;
2772 }
2773 }
2774 else
2775 {
2776 rc = VERR_VD_ASYNC_IO_IN_PROGRESS;
2777 break;
2778 }
2779 }
2780
2781 if (rc == VERR_VD_IOCTX_HALT)
2782 {
2783 cbWrite -= cbThisWrite;
2784 uOffset += cbThisWrite;
2785 pIoCtx->fFlags |= VDIOCTX_FLAGS_BLOCKED;
2786 break;
2787 }
2788 else if (rc == VERR_VD_NOT_ENOUGH_METADATA)
2789 break;
2790
2791 cbWrite -= cbThisWrite;
2792 uOffset += cbThisWrite;
2793 } while (cbWrite != 0 && (RT_SUCCESS(rc) || rc == VERR_VD_ASYNC_IO_IN_PROGRESS));
2794
2795 if ( rc == VERR_VD_ASYNC_IO_IN_PROGRESS
2796 || rc == VERR_VD_NOT_ENOUGH_METADATA
2797 || rc == VERR_VD_IOCTX_HALT)
2798 {
2799 /*
2800 * Tell the caller that we don't need to go back here because all
2801 * writes are initiated.
2802 */
2803 if ( !cbWrite
2804 && rc != VERR_VD_IOCTX_HALT)
2805 rc = VINF_SUCCESS;
2806
2807 pIoCtx->Req.Io.uOffset = uOffset;
2808 pIoCtx->Req.Io.cbTransfer = cbWrite;
2809 }
2810
2811 return rc;
2812}
2813
2814/**
2815 * Flush helper async version.
2816 */
2817static DECLCALLBACK(int) vdFlushHelperAsync(PVDIOCTX pIoCtx)
2818{
2819 int rc = VINF_SUCCESS;
2820 PVDISK pDisk = pIoCtx->pDisk;
2821 PVDIMAGE pImage = pIoCtx->Req.Io.pImageCur;
2822
2823 rc = vdIoCtxLockDisk(pDisk, pIoCtx);
2824 if (RT_SUCCESS(rc))
2825 {
2826 /* Mark the whole disk as locked. */
2827 pDisk->uOffsetStartLocked = 0;
2828 pDisk->uOffsetEndLocked = UINT64_C(0xffffffffffffffff);
2829
2830 vdResetModifiedFlag(pDisk);
2831 rc = pImage->Backend->pfnFlush(pImage->pBackendData, pIoCtx);
2832 if ( ( RT_SUCCESS(rc)
2833 || rc == VERR_VD_ASYNC_IO_IN_PROGRESS
2834 || rc == VERR_VD_IOCTX_HALT)
2835 && pDisk->pCache)
2836 {
2837 rc = pDisk->pCache->Backend->pfnFlush(pDisk->pCache->pBackendData, pIoCtx);
2838 if ( RT_SUCCESS(rc)
2839 || ( rc != VERR_VD_ASYNC_IO_IN_PROGRESS
2840 && rc != VERR_VD_IOCTX_HALT))
2841 vdIoCtxUnlockDisk(pDisk, pIoCtx, true /* fProcessBlockedReqs */);
2842 else if (rc != VERR_VD_IOCTX_HALT)
2843 rc = VINF_SUCCESS;
2844 }
2845 else if (rc == VERR_VD_ASYNC_IO_IN_PROGRESS)
2846 rc = VINF_SUCCESS;
2847 else if (rc != VERR_VD_IOCTX_HALT)/* Some other error. */
2848 vdIoCtxUnlockDisk(pDisk, pIoCtx, true /* fProcessBlockedReqs */);
2849 }
2850
2851 return rc;
2852}
2853
2854/**
2855 * Async discard helper - discards a whole block which is recorded in the block
2856 * tree.
2857 *
2858 * @returns VBox status code.
2859 * @param pIoCtx The I/O context to operate on.
2860 */
2861static DECLCALLBACK(int) vdDiscardWholeBlockAsync(PVDIOCTX pIoCtx)
2862{
2863 int rc = VINF_SUCCESS;
2864 PVDISK pDisk = pIoCtx->pDisk;
2865 PVDDISCARDSTATE pDiscard = pDisk->pDiscard;
2866 PVDDISCARDBLOCK pBlock = pIoCtx->Req.Discard.pBlock;
2867 size_t cbPreAllocated, cbPostAllocated, cbActuallyDiscarded;
2868
2869 LogFlowFunc(("pIoCtx=%#p\n", pIoCtx));
2870
2871 AssertPtr(pBlock);
2872
2873 rc = pDisk->pLast->Backend->pfnDiscard(pDisk->pLast->pBackendData, pIoCtx,
2874 pBlock->Core.Key, pBlock->cbDiscard,
2875 &cbPreAllocated, &cbPostAllocated,
2876 &cbActuallyDiscarded, NULL, 0);
2877 Assert(rc != VERR_VD_DISCARD_ALIGNMENT_NOT_MET);
2878 Assert(!cbPreAllocated);
2879 Assert(!cbPostAllocated);
2880 Assert(cbActuallyDiscarded == pBlock->cbDiscard || RT_FAILURE(rc));
2881
2882 /* Remove the block on success. */
2883 if ( RT_SUCCESS(rc)
2884 || rc == VERR_VD_ASYNC_IO_IN_PROGRESS)
2885 {
2886 PVDDISCARDBLOCK pBlockRemove = (PVDDISCARDBLOCK)RTAvlrU64RangeRemove(pDiscard->pTreeBlocks, pBlock->Core.Key);
2887 Assert(pBlockRemove == pBlock); RT_NOREF1(pBlockRemove);
2888
2889 pDiscard->cbDiscarding -= pBlock->cbDiscard;
2890 RTListNodeRemove(&pBlock->NodeLru);
2891 RTMemFree(pBlock->pbmAllocated);
2892 RTMemFree(pBlock);
2893 pIoCtx->Req.Discard.pBlock = NULL;/* Safety precaution. */
2894 pIoCtx->pfnIoCtxTransferNext = vdDiscardHelperAsync; /* Next part. */
2895 rc = VINF_SUCCESS;
2896 }
2897
2898 LogFlowFunc(("returns rc=%Rrc\n", rc));
2899 return rc;
2900}
2901
2902/**
2903 * Removes the least recently used blocks from the waiting list until
2904 * the new value is reached - version for async I/O.
2905 *
2906 * @returns VBox status code.
2907 * @param pDisk VD disk container.
2908 * @param pIoCtx The I/O context associated with this discard operation.
2909 * @param cbDiscardingNew How many bytes should be waiting on success.
2910 * The number of bytes waiting can be less.
2911 */
2912static int vdDiscardRemoveBlocksAsync(PVDISK pDisk, PVDIOCTX pIoCtx, size_t cbDiscardingNew)
2913{
2914 int rc = VINF_SUCCESS;
2915 PVDDISCARDSTATE pDiscard = pDisk->pDiscard;
2916
2917 LogFlowFunc(("pDisk=%#p pDiscard=%#p cbDiscardingNew=%zu\n",
2918 pDisk, pDiscard, cbDiscardingNew));
2919
2920 while (pDiscard->cbDiscarding > cbDiscardingNew)
2921 {
2922 PVDDISCARDBLOCK pBlock = RTListGetLast(&pDiscard->ListLru, VDDISCARDBLOCK, NodeLru);
2923
2924 Assert(!RTListIsEmpty(&pDiscard->ListLru));
2925
2926 /* Go over the allocation bitmap and mark all discarded sectors as unused. */
2927 uint64_t offStart = pBlock->Core.Key;
2928 uint32_t idxStart = 0;
2929 size_t cbLeft = pBlock->cbDiscard;
2930 bool fAllocated = ASMBitTest(pBlock->pbmAllocated, idxStart);
2931 uint32_t cSectors = (uint32_t)(pBlock->cbDiscard / 512);
2932
2933 while (cbLeft > 0)
2934 {
2935 int32_t idxEnd;
2936 size_t cbThis = cbLeft;
2937
2938 if (fAllocated)
2939 {
2940 /* Check for the first unallocated bit. */
2941 idxEnd = ASMBitNextClear(pBlock->pbmAllocated, cSectors, idxStart);
2942 if (idxEnd != -1)
2943 {
2944 cbThis = (idxEnd - idxStart) * 512;
2945 fAllocated = false;
2946 }
2947 }
2948 else
2949 {
2950 /* Mark as unused and check for the first set bit. */
2951 idxEnd = ASMBitNextSet(pBlock->pbmAllocated, cSectors, idxStart);
2952 if (idxEnd != -1)
2953 cbThis = (idxEnd - idxStart) * 512;
2954
2955 rc = pDisk->pLast->Backend->pfnDiscard(pDisk->pLast->pBackendData, pIoCtx,
2956 offStart, cbThis, NULL, NULL, &cbThis,
2957 NULL, VD_DISCARD_MARK_UNUSED);
2958 if ( RT_FAILURE(rc)
2959 && rc != VERR_VD_ASYNC_IO_IN_PROGRESS)
2960 break;
2961
2962 fAllocated = true;
2963 }
2964
2965 idxStart = idxEnd;
2966 offStart += cbThis;
2967 cbLeft -= cbThis;
2968 }
2969
2970 if ( RT_FAILURE(rc)
2971 && rc != VERR_VD_ASYNC_IO_IN_PROGRESS)
2972 break;
2973
2974 PVDDISCARDBLOCK pBlockRemove = (PVDDISCARDBLOCK)RTAvlrU64RangeRemove(pDiscard->pTreeBlocks, pBlock->Core.Key);
2975 Assert(pBlockRemove == pBlock); NOREF(pBlockRemove);
2976 RTListNodeRemove(&pBlock->NodeLru);
2977
2978 pDiscard->cbDiscarding -= pBlock->cbDiscard;
2979 RTMemFree(pBlock->pbmAllocated);
2980 RTMemFree(pBlock);
2981 }
2982
2983 if (rc == VERR_VD_ASYNC_IO_IN_PROGRESS)
2984 rc = VINF_SUCCESS;
2985
2986 Assert(RT_FAILURE(rc) || pDiscard->cbDiscarding <= cbDiscardingNew);
2987
2988 LogFlowFunc(("returns rc=%Rrc\n", rc));
2989 return rc;
2990}
2991
2992/**
2993 * Async discard helper - discards the current range if there is no matching
2994 * block in the tree.
2995 *
2996 * @returns VBox status code.
2997 * @param pIoCtx The I/O context to operate on.
2998 */
2999static DECLCALLBACK(int) vdDiscardCurrentRangeAsync(PVDIOCTX pIoCtx)
3000{
3001 PVDISK pDisk = pIoCtx->pDisk;
3002 PVDDISCARDSTATE pDiscard = pDisk->pDiscard;
3003 uint64_t offStart = pIoCtx->Req.Discard.offCur;
3004 size_t cbThisDiscard = pIoCtx->Req.Discard.cbThisDiscard;
3005 void *pbmAllocated = NULL;
3006 size_t cbPreAllocated, cbPostAllocated;
3007 int rc = VINF_SUCCESS;
3008
3009 LogFlowFunc(("pIoCtx=%#p\n", pIoCtx));
3010
3011 /* No block found, try to discard using the backend first. */
3012 rc = pDisk->pLast->Backend->pfnDiscard(pDisk->pLast->pBackendData, pIoCtx,
3013 offStart, cbThisDiscard, &cbPreAllocated,
3014 &cbPostAllocated, &cbThisDiscard,
3015 &pbmAllocated, 0);
3016 if (rc == VERR_VD_DISCARD_ALIGNMENT_NOT_MET)
3017 {
3018 /* Create new discard block. */
3019 PVDDISCARDBLOCK pBlock = (PVDDISCARDBLOCK)RTMemAllocZ(sizeof(VDDISCARDBLOCK));
3020 if (pBlock)
3021 {
3022 pBlock->Core.Key = offStart - cbPreAllocated;
3023 pBlock->Core.KeyLast = offStart + cbThisDiscard + cbPostAllocated - 1;
3024 pBlock->cbDiscard = cbPreAllocated + cbThisDiscard + cbPostAllocated;
3025 pBlock->pbmAllocated = pbmAllocated;
3026 bool fInserted = RTAvlrU64Insert(pDiscard->pTreeBlocks, &pBlock->Core);
3027 Assert(fInserted); NOREF(fInserted);
3028
3029 RTListPrepend(&pDiscard->ListLru, &pBlock->NodeLru);
3030 pDiscard->cbDiscarding += pBlock->cbDiscard;
3031
3032 Assert(pIoCtx->Req.Discard.cbDiscardLeft >= cbThisDiscard);
3033 pIoCtx->Req.Discard.cbDiscardLeft -= cbThisDiscard;
3034 pIoCtx->Req.Discard.offCur += cbThisDiscard;
3035 pIoCtx->Req.Discard.cbThisDiscard = cbThisDiscard;
3036
3037 if (pDiscard->cbDiscarding > VD_DISCARD_REMOVE_THRESHOLD)
3038 rc = vdDiscardRemoveBlocksAsync(pDisk, pIoCtx, VD_DISCARD_REMOVE_THRESHOLD);
3039 else
3040 rc = VINF_SUCCESS;
3041
3042 if (RT_SUCCESS(rc))
3043 pIoCtx->pfnIoCtxTransferNext = vdDiscardHelperAsync; /* Next part. */
3044 }
3045 else
3046 {
3047 RTMemFree(pbmAllocated);
3048 rc = VERR_NO_MEMORY;
3049 }
3050 }
3051 else if ( RT_SUCCESS(rc)
3052 || rc == VERR_VD_ASYNC_IO_IN_PROGRESS) /* Save state and andvance to next range. */
3053 {
3054 Assert(pIoCtx->Req.Discard.cbDiscardLeft >= cbThisDiscard);
3055 pIoCtx->Req.Discard.cbDiscardLeft -= cbThisDiscard;
3056 pIoCtx->Req.Discard.offCur += cbThisDiscard;
3057 pIoCtx->Req.Discard.cbThisDiscard = cbThisDiscard;
3058 pIoCtx->pfnIoCtxTransferNext = vdDiscardHelperAsync;
3059 rc = VINF_SUCCESS;
3060 }
3061
3062 LogFlowFunc(("returns rc=%Rrc\n", rc));
3063 return rc;
3064}
3065
3066/**
3067 * Async discard helper - entry point.
3068 *
3069 * @returns VBox status code.
3070 * @param pIoCtx The I/O context to operate on.
3071 */
3072static DECLCALLBACK(int) vdDiscardHelperAsync(PVDIOCTX pIoCtx)
3073{
3074 int rc = VINF_SUCCESS;
3075 PVDISK pDisk = pIoCtx->pDisk;
3076 PCRTRANGE paRanges = pIoCtx->Req.Discard.paRanges;
3077 unsigned cRanges = pIoCtx->Req.Discard.cRanges;
3078 PVDDISCARDSTATE pDiscard = pDisk->pDiscard;
3079
3080 LogFlowFunc(("pIoCtx=%#p\n", pIoCtx));
3081
3082 /* Check if the I/O context processed all ranges. */
3083 if ( pIoCtx->Req.Discard.idxRange == cRanges
3084 && !pIoCtx->Req.Discard.cbDiscardLeft)
3085 {
3086 LogFlowFunc(("All ranges discarded, completing\n"));
3087 vdIoCtxUnlockDisk(pDisk, pIoCtx, true /* fProcessDeferredReqs*/);
3088 return VINF_SUCCESS;
3089 }
3090
3091 if (pDisk->pIoCtxLockOwner != pIoCtx)
3092 rc = vdIoCtxLockDisk(pDisk, pIoCtx);
3093
3094 if (RT_SUCCESS(rc))
3095 {
3096 uint64_t offStart = pIoCtx->Req.Discard.offCur;
3097 size_t cbDiscardLeft = pIoCtx->Req.Discard.cbDiscardLeft;
3098 size_t cbThisDiscard;
3099
3100 pDisk->uOffsetStartLocked = offStart;
3101 pDisk->uOffsetEndLocked = offStart + cbDiscardLeft;
3102
3103 if (RT_UNLIKELY(!pDiscard))
3104 {
3105 pDiscard = vdDiscardStateCreate();
3106 if (!pDiscard)
3107 return VERR_NO_MEMORY;
3108
3109 pDisk->pDiscard = pDiscard;
3110 }
3111
3112 if (!pIoCtx->Req.Discard.cbDiscardLeft)
3113 {
3114 offStart = paRanges[pIoCtx->Req.Discard.idxRange].offStart;
3115 cbDiscardLeft = paRanges[pIoCtx->Req.Discard.idxRange].cbRange;
3116 LogFlowFunc(("New range descriptor loaded (%u) offStart=%llu cbDiscard=%zu\n",
3117 pIoCtx->Req.Discard.idxRange, offStart, cbDiscardLeft));
3118 pIoCtx->Req.Discard.idxRange++;
3119 }
3120
3121 /* Look for a matching block in the AVL tree first. */
3122 PVDDISCARDBLOCK pBlock = (PVDDISCARDBLOCK)RTAvlrU64GetBestFit(pDiscard->pTreeBlocks, offStart, false);
3123 if (!pBlock || pBlock->Core.KeyLast < offStart)
3124 {
3125 PVDDISCARDBLOCK pBlockAbove = (PVDDISCARDBLOCK)RTAvlrU64GetBestFit(pDiscard->pTreeBlocks, offStart, true);
3126
3127 /* Clip range to remain in the current block. */
3128 if (pBlockAbove)
3129 cbThisDiscard = RT_MIN(cbDiscardLeft, pBlockAbove->Core.KeyLast - offStart + 1);
3130 else
3131 cbThisDiscard = cbDiscardLeft;
3132
3133 Assert(!(cbThisDiscard % 512));
3134 pIoCtx->Req.Discard.pBlock = NULL;
3135 pIoCtx->pfnIoCtxTransferNext = vdDiscardCurrentRangeAsync;
3136 }
3137 else
3138 {
3139 /* Range lies partly in the block, update allocation bitmap. */
3140 int32_t idxStart, idxEnd;
3141
3142 cbThisDiscard = RT_MIN(cbDiscardLeft, pBlock->Core.KeyLast - offStart + 1);
3143
3144 AssertPtr(pBlock);
3145
3146 Assert(!(cbThisDiscard % 512));
3147 Assert(!((offStart - pBlock->Core.Key) % 512));
3148
3149 idxStart = (offStart - pBlock->Core.Key) / 512;
3150 idxEnd = idxStart + (int32_t)(cbThisDiscard / 512);
3151
3152 ASMBitClearRange(pBlock->pbmAllocated, idxStart, idxEnd);
3153
3154 cbDiscardLeft -= cbThisDiscard;
3155 offStart += cbThisDiscard;
3156
3157 /* Call the backend to discard the block if it is completely unallocated now. */
3158 if (ASMBitFirstSet((volatile void *)pBlock->pbmAllocated, (uint32_t)(pBlock->cbDiscard / 512)) == -1)
3159 {
3160 pIoCtx->Req.Discard.pBlock = pBlock;
3161 pIoCtx->pfnIoCtxTransferNext = vdDiscardWholeBlockAsync;
3162 rc = VINF_SUCCESS;
3163 }
3164 else
3165 {
3166 RTListNodeRemove(&pBlock->NodeLru);
3167 RTListPrepend(&pDiscard->ListLru, &pBlock->NodeLru);
3168
3169 /* Start with next range. */
3170 pIoCtx->pfnIoCtxTransferNext = vdDiscardHelperAsync;
3171 rc = VINF_SUCCESS;
3172 }
3173 }
3174
3175 /* Save state in the context. */
3176 pIoCtx->Req.Discard.offCur = offStart;
3177 pIoCtx->Req.Discard.cbDiscardLeft = cbDiscardLeft;
3178 pIoCtx->Req.Discard.cbThisDiscard = cbThisDiscard;
3179 }
3180
3181 LogFlowFunc(("returns rc=%Rrc\n", rc));
3182 return rc;
3183}
3184
3185/**
3186 * VD async I/O interface open callback.
3187 */
3188static DECLCALLBACK(int) vdIOOpenFallback(void *pvUser, const char *pszLocation,
3189 uint32_t fOpen, PFNVDCOMPLETED pfnCompleted,
3190 void **ppStorage)
3191{
3192 RT_NOREF1(pvUser);
3193 PVDIIOFALLBACKSTORAGE pStorage = (PVDIIOFALLBACKSTORAGE)RTMemAllocZ(sizeof(VDIIOFALLBACKSTORAGE));
3194
3195 if (!pStorage)
3196 return VERR_NO_MEMORY;
3197
3198 pStorage->pfnCompleted = pfnCompleted;
3199
3200 /* Open the file. */
3201 int rc = RTFileOpen(&pStorage->File, pszLocation, fOpen);
3202 if (RT_SUCCESS(rc))
3203 {
3204 *ppStorage = pStorage;
3205 return VINF_SUCCESS;
3206 }
3207
3208 RTMemFree(pStorage);
3209 return rc;
3210}
3211
3212/**
3213 * VD async I/O interface close callback.
3214 */
3215static DECLCALLBACK(int) vdIOCloseFallback(void *pvUser, void *pvStorage)
3216{
3217 RT_NOREF1(pvUser);
3218 PVDIIOFALLBACKSTORAGE pStorage = (PVDIIOFALLBACKSTORAGE)pvStorage;
3219
3220 RTFileClose(pStorage->File);
3221 RTMemFree(pStorage);
3222 return VINF_SUCCESS;
3223}
3224
3225static DECLCALLBACK(int) vdIODeleteFallback(void *pvUser, const char *pcszFilename)
3226{
3227 RT_NOREF1(pvUser);
3228 return RTFileDelete(pcszFilename);
3229}
3230
3231static DECLCALLBACK(int) vdIOMoveFallback(void *pvUser, const char *pcszSrc, const char *pcszDst, unsigned fMove)
3232{
3233 RT_NOREF1(pvUser);
3234 return RTFileMove(pcszSrc, pcszDst, fMove);
3235}
3236
3237static DECLCALLBACK(int) vdIOGetFreeSpaceFallback(void *pvUser, const char *pcszFilename, int64_t *pcbFreeSpace)
3238{
3239 RT_NOREF1(pvUser);
3240 return RTFsQuerySizes(pcszFilename, NULL, pcbFreeSpace, NULL, NULL);
3241}
3242
3243static DECLCALLBACK(int) vdIOGetModificationTimeFallback(void *pvUser, const char *pcszFilename, PRTTIMESPEC pModificationTime)
3244{
3245 RT_NOREF1(pvUser);
3246 RTFSOBJINFO info;
3247 int rc = RTPathQueryInfo(pcszFilename, &info, RTFSOBJATTRADD_NOTHING);
3248 if (RT_SUCCESS(rc))
3249 *pModificationTime = info.ModificationTime;
3250 return rc;
3251}
3252
3253/**
3254 * VD async I/O interface callback for retrieving the file size.
3255 */
3256static DECLCALLBACK(int) vdIOGetSizeFallback(void *pvUser, void *pvStorage, uint64_t *pcbSize)
3257{
3258 RT_NOREF1(pvUser);
3259 PVDIIOFALLBACKSTORAGE pStorage = (PVDIIOFALLBACKSTORAGE)pvStorage;
3260
3261 return RTFileQuerySize(pStorage->File, pcbSize);
3262}
3263
3264/**
3265 * VD async I/O interface callback for setting the file size.
3266 */
3267static DECLCALLBACK(int) vdIOSetSizeFallback(void *pvUser, void *pvStorage, uint64_t cbSize)
3268{
3269 RT_NOREF1(pvUser);
3270 PVDIIOFALLBACKSTORAGE pStorage = (PVDIIOFALLBACKSTORAGE)pvStorage;
3271
3272 return RTFileSetSize(pStorage->File, cbSize);
3273}
3274
3275/**
3276 * VD async I/O interface callback for setting the file allocation size.
3277 */
3278static DECLCALLBACK(int) vdIOSetAllocationSizeFallback(void *pvUser, void *pvStorage, uint64_t cbSize,
3279 uint32_t fFlags)
3280{
3281 RT_NOREF2(pvUser, fFlags);
3282 PVDIIOFALLBACKSTORAGE pStorage = (PVDIIOFALLBACKSTORAGE)pvStorage;
3283
3284 return RTFileSetAllocationSize(pStorage->File, cbSize, RTFILE_ALLOC_SIZE_F_DEFAULT);
3285}
3286
3287/**
3288 * VD async I/O interface callback for a synchronous write to the file.
3289 */
3290static DECLCALLBACK(int) vdIOWriteSyncFallback(void *pvUser, void *pvStorage, uint64_t uOffset,
3291 const void *pvBuf, size_t cbWrite, size_t *pcbWritten)
3292{
3293 RT_NOREF1(pvUser);
3294 PVDIIOFALLBACKSTORAGE pStorage = (PVDIIOFALLBACKSTORAGE)pvStorage;
3295
3296 return RTFileWriteAt(pStorage->File, uOffset, pvBuf, cbWrite, pcbWritten);
3297}
3298
3299/**
3300 * VD async I/O interface callback for a synchronous read from the file.
3301 */
3302static DECLCALLBACK(int) vdIOReadSyncFallback(void *pvUser, void *pvStorage, uint64_t uOffset,
3303 void *pvBuf, size_t cbRead, size_t *pcbRead)
3304{
3305 RT_NOREF1(pvUser);
3306 PVDIIOFALLBACKSTORAGE pStorage = (PVDIIOFALLBACKSTORAGE)pvStorage;
3307
3308 return RTFileReadAt(pStorage->File, uOffset, pvBuf, cbRead, pcbRead);
3309}
3310
3311/**
3312 * VD async I/O interface callback for a synchronous flush of the file data.
3313 */
3314static DECLCALLBACK(int) vdIOFlushSyncFallback(void *pvUser, void *pvStorage)
3315{
3316 RT_NOREF1(pvUser);
3317 PVDIIOFALLBACKSTORAGE pStorage = (PVDIIOFALLBACKSTORAGE)pvStorage;
3318
3319 return RTFileFlush(pStorage->File);
3320}
3321
3322/**
3323 * Internal - Continues an I/O context after
3324 * it was halted because of an active transfer.
3325 */
3326static int vdIoCtxContinue(PVDIOCTX pIoCtx, int rcReq)
3327{
3328 PVDISK pDisk = pIoCtx->pDisk;
3329 int rc = VINF_SUCCESS;
3330
3331 VD_IS_LOCKED(pDisk);
3332
3333 if (RT_FAILURE(rcReq))
3334 ASMAtomicCmpXchgS32(&pIoCtx->rcReq, rcReq, VINF_SUCCESS);
3335
3336 if (!(pIoCtx->fFlags & VDIOCTX_FLAGS_BLOCKED))
3337 {
3338 /* Continue the transfer */
3339 rc = vdIoCtxProcessLocked(pIoCtx);
3340
3341 if ( rc == VINF_VD_ASYNC_IO_FINISHED
3342 && ASMAtomicCmpXchgBool(&pIoCtx->fComplete, true, false))
3343 {
3344 LogFlowFunc(("I/O context completed pIoCtx=%#p\n", pIoCtx));
3345 bool fFreeCtx = RT_BOOL(!(pIoCtx->fFlags & VDIOCTX_FLAGS_DONT_FREE));
3346 if (pIoCtx->pIoCtxParent)
3347 {
3348 PVDIOCTX pIoCtxParent = pIoCtx->pIoCtxParent;
3349
3350 Assert(!pIoCtxParent->pIoCtxParent);
3351 if (RT_FAILURE(pIoCtx->rcReq))
3352 ASMAtomicCmpXchgS32(&pIoCtxParent->rcReq, pIoCtx->rcReq, VINF_SUCCESS);
3353
3354 ASMAtomicDecU32(&pIoCtxParent->cDataTransfersPending);
3355
3356 if (pIoCtx->enmTxDir == VDIOCTXTXDIR_WRITE)
3357 {
3358 LogFlowFunc(("I/O context transferred %u bytes for the parent pIoCtxParent=%p\n",
3359 pIoCtx->Type.Child.cbTransferParent, pIoCtxParent));
3360
3361 /* Update the parent state. */
3362 Assert(pIoCtxParent->Req.Io.cbTransferLeft >= pIoCtx->Type.Child.cbTransferParent);
3363 ASMAtomicSubU32(&pIoCtxParent->Req.Io.cbTransferLeft, (uint32_t)pIoCtx->Type.Child.cbTransferParent);
3364 }
3365 else
3366 Assert(pIoCtx->enmTxDir == VDIOCTXTXDIR_FLUSH);
3367
3368 /*
3369 * A completed child write means that we finished growing the image.
3370 * We have to process any pending writes now.
3371 */
3372 vdIoCtxUnlockDisk(pDisk, pIoCtxParent, false /* fProcessDeferredReqs */);
3373
3374 /* Unblock the parent */
3375 pIoCtxParent->fFlags &= ~VDIOCTX_FLAGS_BLOCKED;
3376
3377 rc = vdIoCtxProcessLocked(pIoCtxParent);
3378
3379 if ( rc == VINF_VD_ASYNC_IO_FINISHED
3380 && ASMAtomicCmpXchgBool(&pIoCtxParent->fComplete, true, false))
3381 {
3382 LogFlowFunc(("Parent I/O context completed pIoCtxParent=%#p rcReq=%Rrc\n", pIoCtxParent, pIoCtxParent->rcReq));
3383 bool fFreeParentCtx = RT_BOOL(!(pIoCtxParent->fFlags & VDIOCTX_FLAGS_DONT_FREE));
3384 vdIoCtxRootComplete(pDisk, pIoCtxParent);
3385 vdThreadFinishWrite(pDisk);
3386
3387 if (fFreeParentCtx)
3388 vdIoCtxFree(pDisk, pIoCtxParent);
3389 vdDiskProcessBlockedIoCtx(pDisk);
3390 }
3391 else if (!vdIoCtxIsDiskLockOwner(pDisk, pIoCtx))
3392 {
3393 /* Process any pending writes if the current request didn't caused another growing. */
3394 vdDiskProcessBlockedIoCtx(pDisk);
3395 }
3396 }
3397 else
3398 {
3399 if (pIoCtx->enmTxDir == VDIOCTXTXDIR_FLUSH)
3400 {
3401 vdIoCtxUnlockDisk(pDisk, pIoCtx, true /* fProcessDerredReqs */);
3402 vdThreadFinishWrite(pDisk);
3403 }
3404 else if ( pIoCtx->enmTxDir == VDIOCTXTXDIR_WRITE
3405 || pIoCtx->enmTxDir == VDIOCTXTXDIR_DISCARD)
3406 vdThreadFinishWrite(pDisk);
3407 else
3408 {
3409 Assert(pIoCtx->enmTxDir == VDIOCTXTXDIR_READ);
3410 vdThreadFinishRead(pDisk);
3411 }
3412
3413 LogFlowFunc(("I/O context completed pIoCtx=%#p rcReq=%Rrc\n", pIoCtx, pIoCtx->rcReq));
3414 vdIoCtxRootComplete(pDisk, pIoCtx);
3415 }
3416
3417 if (fFreeCtx)
3418 vdIoCtxFree(pDisk, pIoCtx);
3419 }
3420 }
3421
3422 return VINF_SUCCESS;
3423}
3424
3425/**
3426 * Internal - Called when user transfer completed.
3427 */
3428static int vdUserXferCompleted(PVDIOSTORAGE pIoStorage, PVDIOCTX pIoCtx,
3429 PFNVDXFERCOMPLETED pfnComplete, void *pvUser,
3430 size_t cbTransfer, int rcReq)
3431{
3432 int rc = VINF_SUCCESS;
3433 PVDISK pDisk = pIoCtx->pDisk;
3434
3435 LogFlowFunc(("pIoStorage=%#p pIoCtx=%#p pfnComplete=%#p pvUser=%#p cbTransfer=%zu rcReq=%Rrc\n",
3436 pIoStorage, pIoCtx, pfnComplete, pvUser, cbTransfer, rcReq));
3437
3438 VD_IS_LOCKED(pDisk);
3439
3440 Assert(pIoCtx->Req.Io.cbTransferLeft >= cbTransfer);
3441 ASMAtomicSubU32(&pIoCtx->Req.Io.cbTransferLeft, (uint32_t)cbTransfer); Assert(cbTransfer == (uint32_t)cbTransfer);
3442 ASMAtomicDecU32(&pIoCtx->cDataTransfersPending);
3443
3444 if (pfnComplete)
3445 rc = pfnComplete(pIoStorage->pVDIo->pBackendData, pIoCtx, pvUser, rcReq);
3446
3447 if (RT_SUCCESS(rc))
3448 rc = vdIoCtxContinue(pIoCtx, rcReq);
3449 else if (rc == VERR_VD_ASYNC_IO_IN_PROGRESS)
3450 rc = VINF_SUCCESS;
3451
3452 return rc;
3453}
3454
3455static void vdIoCtxContinueDeferredList(PVDIOSTORAGE pIoStorage, PRTLISTANCHOR pListWaiting,
3456 PFNVDXFERCOMPLETED pfnComplete, void *pvUser, int rcReq)
3457{
3458 LogFlowFunc(("pIoStorage=%#p pListWaiting=%#p pfnComplete=%#p pvUser=%#p rcReq=%Rrc\n",
3459 pIoStorage, pListWaiting, pfnComplete, pvUser, rcReq));
3460
3461 /* Go through the waiting list and continue the I/O contexts. */
3462 while (!RTListIsEmpty(pListWaiting))
3463 {
3464 int rc = VINF_SUCCESS;
3465 PVDIOCTXDEFERRED pDeferred = RTListGetFirst(pListWaiting, VDIOCTXDEFERRED, NodeDeferred);
3466 PVDIOCTX pIoCtx = pDeferred->pIoCtx;
3467 RTListNodeRemove(&pDeferred->NodeDeferred);
3468
3469 RTMemFree(pDeferred);
3470 ASMAtomicDecU32(&pIoCtx->cMetaTransfersPending);
3471
3472 if (pfnComplete)
3473 rc = pfnComplete(pIoStorage->pVDIo->pBackendData, pIoCtx, pvUser, rcReq);
3474
3475 LogFlow(("Completion callback for I/O context %#p returned %Rrc\n", pIoCtx, rc));
3476
3477 if (RT_SUCCESS(rc))
3478 {
3479 rc = vdIoCtxContinue(pIoCtx, rcReq);
3480 AssertRC(rc);
3481 }
3482 else
3483 Assert(rc == VERR_VD_ASYNC_IO_IN_PROGRESS);
3484 }
3485}
3486
3487/**
3488 * Internal - Called when a meta transfer completed.
3489 */
3490static int vdMetaXferCompleted(PVDIOSTORAGE pIoStorage, PFNVDXFERCOMPLETED pfnComplete, void *pvUser,
3491 PVDMETAXFER pMetaXfer, int rcReq)
3492{
3493 PVDISK pDisk = pIoStorage->pVDIo->pDisk;
3494 RTLISTANCHOR ListIoCtxWaiting;
3495 bool fFlush;
3496
3497 LogFlowFunc(("pIoStorage=%#p pfnComplete=%#p pvUser=%#p pMetaXfer=%#p rcReq=%Rrc\n",
3498 pIoStorage, pfnComplete, pvUser, pMetaXfer, rcReq));
3499
3500 VD_IS_LOCKED(pDisk);
3501
3502 fFlush = VDMETAXFER_TXDIR_GET(pMetaXfer->fFlags) == VDMETAXFER_TXDIR_FLUSH;
3503
3504 if (!fFlush)
3505 {
3506 RTListMove(&ListIoCtxWaiting, &pMetaXfer->ListIoCtxWaiting);
3507
3508 if (RT_FAILURE(rcReq))
3509 {
3510 /* Remove from the AVL tree. */
3511 LogFlow(("Removing meta xfer=%#p\n", pMetaXfer));
3512 bool fRemoved = RTAvlrFileOffsetRemove(pIoStorage->pTreeMetaXfers, pMetaXfer->Core.Key) != NULL;
3513 Assert(fRemoved); NOREF(fRemoved);
3514 /* If this was a write check if there is a shadow buffer with updated data. */
3515 if (pMetaXfer->pbDataShw)
3516 {
3517 Assert(VDMETAXFER_TXDIR_GET(pMetaXfer->fFlags) == VDMETAXFER_TXDIR_WRITE);
3518 Assert(!RTListIsEmpty(&pMetaXfer->ListIoCtxShwWrites));
3519 RTListConcatenate(&ListIoCtxWaiting, &pMetaXfer->ListIoCtxShwWrites);
3520 RTMemFree(pMetaXfer->pbDataShw);
3521 pMetaXfer->pbDataShw = NULL;
3522 }
3523 RTMemFree(pMetaXfer);
3524 }
3525 else
3526 {
3527 /* Increase the reference counter to make sure it doesn't go away before the last context is processed. */
3528 pMetaXfer->cRefs++;
3529 }
3530 }
3531 else
3532 RTListMove(&ListIoCtxWaiting, &pMetaXfer->ListIoCtxWaiting);
3533
3534 VDMETAXFER_TXDIR_SET(pMetaXfer->fFlags, VDMETAXFER_TXDIR_NONE);
3535 vdIoCtxContinueDeferredList(pIoStorage, &ListIoCtxWaiting, pfnComplete, pvUser, rcReq);
3536
3537 /*
3538 * If there is a shadow buffer and the previous write was successful update with the
3539 * new data and trigger a new write.
3540 */
3541 if ( pMetaXfer->pbDataShw
3542 && RT_SUCCESS(rcReq)
3543 && VDMETAXFER_TXDIR_GET(pMetaXfer->fFlags) == VDMETAXFER_TXDIR_NONE)
3544 {
3545 LogFlowFunc(("pMetaXfer=%#p Updating from shadow buffer and triggering new write\n", pMetaXfer));
3546 memcpy(pMetaXfer->abData, pMetaXfer->pbDataShw, pMetaXfer->cbMeta);
3547 RTMemFree(pMetaXfer->pbDataShw);
3548 pMetaXfer->pbDataShw = NULL;
3549 Assert(!RTListIsEmpty(&pMetaXfer->ListIoCtxShwWrites));
3550
3551 /* Setup a new I/O write. */
3552 PVDIOTASK pIoTask = vdIoTaskMetaAlloc(pIoStorage, pfnComplete, pvUser, pMetaXfer);
3553 if (RT_LIKELY(pIoTask))
3554 {
3555 void *pvTask = NULL;
3556 RTSGSEG Seg;
3557
3558 Seg.cbSeg = pMetaXfer->cbMeta;
3559 Seg.pvSeg = pMetaXfer->abData;
3560
3561 VDMETAXFER_TXDIR_SET(pMetaXfer->fFlags, VDMETAXFER_TXDIR_WRITE);
3562 rcReq = pIoStorage->pVDIo->pInterfaceIo->pfnWriteAsync(pIoStorage->pVDIo->pInterfaceIo->Core.pvUser,
3563 pIoStorage->pStorage,
3564 pMetaXfer->Core.Key, &Seg, 1,
3565 pMetaXfer->cbMeta, pIoTask,
3566 &pvTask);
3567 if ( RT_SUCCESS(rcReq)
3568 || rcReq != VERR_VD_ASYNC_IO_IN_PROGRESS)
3569 {
3570 VDMETAXFER_TXDIR_SET(pMetaXfer->fFlags, VDMETAXFER_TXDIR_NONE);
3571 vdIoTaskFree(pDisk, pIoTask);
3572 }
3573 else
3574 RTListMove(&pMetaXfer->ListIoCtxWaiting, &pMetaXfer->ListIoCtxShwWrites);
3575 }
3576 else
3577 rcReq = VERR_NO_MEMORY;
3578
3579 /* Cleanup if there was an error or the request completed already. */
3580 if (rcReq != VERR_VD_ASYNC_IO_IN_PROGRESS)
3581 vdIoCtxContinueDeferredList(pIoStorage, &pMetaXfer->ListIoCtxShwWrites, pfnComplete, pvUser, rcReq);
3582 }
3583
3584 /* Remove if not used anymore. */
3585 if (!fFlush)
3586 {
3587 pMetaXfer->cRefs--;
3588 if (!pMetaXfer->cRefs && RTListIsEmpty(&pMetaXfer->ListIoCtxWaiting))
3589 {
3590 /* Remove from the AVL tree. */
3591 LogFlow(("Removing meta xfer=%#p\n", pMetaXfer));
3592 bool fRemoved = RTAvlrFileOffsetRemove(pIoStorage->pTreeMetaXfers, pMetaXfer->Core.Key) != NULL;
3593 Assert(fRemoved); NOREF(fRemoved);
3594 RTMemFree(pMetaXfer);
3595 }
3596 }
3597 else if (fFlush)
3598 RTMemFree(pMetaXfer);
3599
3600 return VINF_SUCCESS;
3601}
3602
3603/**
3604 * Processes a list of waiting I/O tasks. The disk lock must be held by caller.
3605 *
3606 * @param pDisk The disk to process the list for.
3607 */
3608static void vdIoTaskProcessWaitingList(PVDISK pDisk)
3609{
3610 LogFlowFunc(("pDisk=%#p\n", pDisk));
3611
3612 VD_IS_LOCKED(pDisk);
3613
3614 PVDIOTASK pHead = ASMAtomicXchgPtrT(&pDisk->pIoTasksPendingHead, NULL, PVDIOTASK);
3615
3616 Log(("I/O task list cleared\n"));
3617
3618 /* Reverse order. */
3619 PVDIOTASK pCur = pHead;
3620 pHead = NULL;
3621 while (pCur)
3622 {
3623 PVDIOTASK pInsert = pCur;
3624 pCur = pCur->pNext;
3625 pInsert->pNext = pHead;
3626 pHead = pInsert;
3627 }
3628
3629 while (pHead)
3630 {
3631 PVDIOSTORAGE pIoStorage = pHead->pIoStorage;
3632
3633 if (!pHead->fMeta)
3634 vdUserXferCompleted(pIoStorage, pHead->Type.User.pIoCtx,
3635 pHead->pfnComplete, pHead->pvUser,
3636 pHead->Type.User.cbTransfer, pHead->rcReq);
3637 else
3638 vdMetaXferCompleted(pIoStorage, pHead->pfnComplete, pHead->pvUser,
3639 pHead->Type.Meta.pMetaXfer, pHead->rcReq);
3640
3641 pCur = pHead;
3642 pHead = pHead->pNext;
3643 vdIoTaskFree(pDisk, pCur);
3644 }
3645}
3646
3647/**
3648 * Process any I/O context on the halted list.
3649 *
3650 * @param pDisk The disk.
3651 */
3652static void vdIoCtxProcessHaltedList(PVDISK pDisk)
3653{
3654 LogFlowFunc(("pDisk=%#p\n", pDisk));
3655
3656 VD_IS_LOCKED(pDisk);
3657
3658 /* Get the waiting list and process it in FIFO order. */
3659 PVDIOCTX pIoCtxHead = ASMAtomicXchgPtrT(&pDisk->pIoCtxHaltedHead, NULL, PVDIOCTX);
3660
3661 /* Reverse it. */
3662 PVDIOCTX pCur = pIoCtxHead;
3663 pIoCtxHead = NULL;
3664 while (pCur)
3665 {
3666 PVDIOCTX pInsert = pCur;
3667 pCur = pCur->pIoCtxNext;
3668 pInsert->pIoCtxNext = pIoCtxHead;
3669 pIoCtxHead = pInsert;
3670 }
3671
3672 /* Process now. */
3673 pCur = pIoCtxHead;
3674 while (pCur)
3675 {
3676 PVDIOCTX pTmp = pCur;
3677
3678 pCur = pCur->pIoCtxNext;
3679 pTmp->pIoCtxNext = NULL;
3680
3681 /* Continue */
3682 pTmp->fFlags &= ~VDIOCTX_FLAGS_BLOCKED;
3683 vdIoCtxContinue(pTmp, pTmp->rcReq);
3684 }
3685}
3686
3687/**
3688 * Unlock the disk and process pending tasks.
3689 *
3690 * @returns VBox status code.
3691 * @param pDisk The disk to unlock.
3692 * @param pIoCtxRc The I/O context to get the status code from, optional.
3693 */
3694static int vdDiskUnlock(PVDISK pDisk, PVDIOCTX pIoCtxRc)
3695{
3696 int rc = VINF_SUCCESS;
3697
3698 VD_IS_LOCKED(pDisk);
3699
3700 /*
3701 * Process the list of waiting I/O tasks first
3702 * because they might complete I/O contexts.
3703 * Same for the list of halted I/O contexts.
3704 * Afterwards comes the list of new I/O contexts.
3705 */
3706 vdIoTaskProcessWaitingList(pDisk);
3707 vdIoCtxProcessHaltedList(pDisk);
3708 rc = vdDiskProcessWaitingIoCtx(pDisk, pIoCtxRc);
3709 ASMAtomicXchgBool(&pDisk->fLocked, false);
3710
3711 /*
3712 * Need to check for new I/O tasks and waiting I/O contexts now
3713 * again as other threads might added them while we processed
3714 * previous lists.
3715 */
3716 while ( ASMAtomicUoReadPtrT(&pDisk->pIoCtxHead, PVDIOCTX) != NULL
3717 || ASMAtomicUoReadPtrT(&pDisk->pIoTasksPendingHead, PVDIOTASK) != NULL
3718 || ASMAtomicUoReadPtrT(&pDisk->pIoCtxHaltedHead, PVDIOCTX) != NULL)
3719 {
3720 /* Try lock disk again. */
3721 if (ASMAtomicCmpXchgBool(&pDisk->fLocked, true, false))
3722 {
3723 vdIoTaskProcessWaitingList(pDisk);
3724 vdIoCtxProcessHaltedList(pDisk);
3725 vdDiskProcessWaitingIoCtx(pDisk, NULL);
3726 ASMAtomicXchgBool(&pDisk->fLocked, false);
3727 }
3728 else /* Let the other thread everything when he unlocks the disk. */
3729 break;
3730 }
3731
3732 return rc;
3733}
3734
3735/**
3736 * Try to lock the disk to complete pressing of the I/O task.
3737 * The completion is deferred if the disk is locked already.
3738 *
3739 * @param pIoTask The I/O task to complete.
3740 */
3741static void vdXferTryLockDiskDeferIoTask(PVDIOTASK pIoTask)
3742{
3743 PVDIOSTORAGE pIoStorage = pIoTask->pIoStorage;
3744 PVDISK pDisk = pIoStorage->pVDIo->pDisk;
3745
3746 Log(("Deferring I/O task pIoTask=%p\n", pIoTask));
3747
3748 /* Put it on the waiting list. */
3749 PVDIOTASK pNext = ASMAtomicUoReadPtrT(&pDisk->pIoTasksPendingHead, PVDIOTASK);
3750 PVDIOTASK pHeadOld;
3751 pIoTask->pNext = pNext;
3752 while (!ASMAtomicCmpXchgExPtr(&pDisk->pIoTasksPendingHead, pIoTask, pNext, &pHeadOld))
3753 {
3754 pNext = pHeadOld;
3755 Assert(pNext != pIoTask);
3756 pIoTask->pNext = pNext;
3757 ASMNopPause();
3758 }
3759
3760 if (ASMAtomicCmpXchgBool(&pDisk->fLocked, true, false))
3761 {
3762 /* Release disk lock, it will take care of processing all lists. */
3763 vdDiskUnlock(pDisk, NULL);
3764 }
3765}
3766
3767static DECLCALLBACK(int) vdIOIntReqCompleted(void *pvUser, int rcReq)
3768{
3769 PVDIOTASK pIoTask = (PVDIOTASK)pvUser;
3770
3771 LogFlowFunc(("Task completed pIoTask=%#p\n", pIoTask));
3772
3773 pIoTask->rcReq = rcReq;
3774 vdXferTryLockDiskDeferIoTask(pIoTask);
3775 return VINF_SUCCESS;
3776}
3777
3778/**
3779 * VD I/O interface callback for opening a file.
3780 */
3781static DECLCALLBACK(int) vdIOIntOpen(void *pvUser, const char *pszLocation,
3782 unsigned uOpenFlags, PPVDIOSTORAGE ppIoStorage)
3783{
3784 int rc = VINF_SUCCESS;
3785 PVDIO pVDIo = (PVDIO)pvUser;
3786 PVDIOSTORAGE pIoStorage = (PVDIOSTORAGE)RTMemAllocZ(sizeof(VDIOSTORAGE));
3787
3788 if (!pIoStorage)
3789 return VERR_NO_MEMORY;
3790
3791 /* Create the AVl tree. */
3792 pIoStorage->pTreeMetaXfers = (PAVLRFOFFTREE)RTMemAllocZ(sizeof(AVLRFOFFTREE));
3793 if (pIoStorage->pTreeMetaXfers)
3794 {
3795 rc = pVDIo->pInterfaceIo->pfnOpen(pVDIo->pInterfaceIo->Core.pvUser,
3796 pszLocation, uOpenFlags,
3797 vdIOIntReqCompleted,
3798 &pIoStorage->pStorage);
3799 if (RT_SUCCESS(rc))
3800 {
3801 pIoStorage->pVDIo = pVDIo;
3802 *ppIoStorage = pIoStorage;
3803 return VINF_SUCCESS;
3804 }
3805
3806 RTMemFree(pIoStorage->pTreeMetaXfers);
3807 }
3808 else
3809 rc = VERR_NO_MEMORY;
3810
3811 RTMemFree(pIoStorage);
3812 return rc;
3813}
3814
3815static DECLCALLBACK(int) vdIOIntTreeMetaXferDestroy(PAVLRFOFFNODECORE pNode, void *pvUser)
3816{
3817 RT_NOREF2(pNode, pvUser);
3818 AssertMsgFailed(("Tree should be empty at this point!\n"));
3819 return VINF_SUCCESS;
3820}
3821
3822static DECLCALLBACK(int) vdIOIntClose(void *pvUser, PVDIOSTORAGE pIoStorage)
3823{
3824 int rc = VINF_SUCCESS;
3825 PVDIO pVDIo = (PVDIO)pvUser;
3826
3827 /* We free everything here, even if closing the file failed for some reason. */
3828 rc = pVDIo->pInterfaceIo->pfnClose(pVDIo->pInterfaceIo->Core.pvUser, pIoStorage->pStorage);
3829 RTAvlrFileOffsetDestroy(pIoStorage->pTreeMetaXfers, vdIOIntTreeMetaXferDestroy, NULL);
3830 RTMemFree(pIoStorage->pTreeMetaXfers);
3831 RTMemFree(pIoStorage);
3832 return rc;
3833}
3834
3835static DECLCALLBACK(int) vdIOIntDelete(void *pvUser, const char *pcszFilename)
3836{
3837 PVDIO pVDIo = (PVDIO)pvUser;
3838 return pVDIo->pInterfaceIo->pfnDelete(pVDIo->pInterfaceIo->Core.pvUser,
3839 pcszFilename);
3840}
3841
3842static DECLCALLBACK(int) vdIOIntMove(void *pvUser, const char *pcszSrc, const char *pcszDst,
3843 unsigned fMove)
3844{
3845 PVDIO pVDIo = (PVDIO)pvUser;
3846 return pVDIo->pInterfaceIo->pfnMove(pVDIo->pInterfaceIo->Core.pvUser,
3847 pcszSrc, pcszDst, fMove);
3848}
3849
3850static DECLCALLBACK(int) vdIOIntGetFreeSpace(void *pvUser, const char *pcszFilename,
3851 int64_t *pcbFreeSpace)
3852{
3853 PVDIO pVDIo = (PVDIO)pvUser;
3854 return pVDIo->pInterfaceIo->pfnGetFreeSpace(pVDIo->pInterfaceIo->Core.pvUser,
3855 pcszFilename, pcbFreeSpace);
3856}
3857
3858static DECLCALLBACK(int) vdIOIntGetModificationTime(void *pvUser, const char *pcszFilename,
3859 PRTTIMESPEC pModificationTime)
3860{
3861 PVDIO pVDIo = (PVDIO)pvUser;
3862 return pVDIo->pInterfaceIo->pfnGetModificationTime(pVDIo->pInterfaceIo->Core.pvUser,
3863 pcszFilename, pModificationTime);
3864}
3865
3866static DECLCALLBACK(int) vdIOIntGetSize(void *pvUser, PVDIOSTORAGE pIoStorage,
3867 uint64_t *pcbSize)
3868{
3869 PVDIO pVDIo = (PVDIO)pvUser;
3870 return pVDIo->pInterfaceIo->pfnGetSize(pVDIo->pInterfaceIo->Core.pvUser,
3871 pIoStorage->pStorage, pcbSize);
3872}
3873
3874static DECLCALLBACK(int) vdIOIntSetSize(void *pvUser, PVDIOSTORAGE pIoStorage,
3875 uint64_t cbSize)
3876{
3877 PVDIO pVDIo = (PVDIO)pvUser;
3878 return pVDIo->pInterfaceIo->pfnSetSize(pVDIo->pInterfaceIo->Core.pvUser,
3879 pIoStorage->pStorage, cbSize);
3880}
3881
3882static DECLCALLBACK(int) vdIOIntSetAllocationSize(void *pvUser, PVDIOSTORAGE pIoStorage,
3883 uint64_t cbSize, uint32_t fFlags,
3884 PVDINTERFACEPROGRESS pIfProgress,
3885 unsigned uPercentStart, unsigned uPercentSpan)
3886{
3887 PVDIO pVDIo = (PVDIO)pvUser;
3888 int rc = pVDIo->pInterfaceIo->pfnSetAllocationSize(pVDIo->pInterfaceIo->Core.pvUser,
3889 pIoStorage->pStorage, cbSize, fFlags);
3890 if (rc == VERR_NOT_SUPPORTED)
3891 {
3892 /* Fallback if the underlying medium does not support optimized storage allocation. */
3893 uint64_t cbSizeCur = 0;
3894 rc = pVDIo->pInterfaceIo->pfnGetSize(pVDIo->pInterfaceIo->Core.pvUser,
3895 pIoStorage->pStorage, &cbSizeCur);
3896 if (RT_SUCCESS(rc))
3897 {
3898 if (cbSizeCur < cbSize)
3899 {
3900 const size_t cbBuf = 128 * _1K;
3901 void *pvBuf = RTMemTmpAllocZ(cbBuf);
3902 if (RT_LIKELY(pvBuf))
3903 {
3904 uint64_t cbFill = cbSize - cbSizeCur;
3905 uint64_t uOff = 0;
3906
3907 /* Write data to all blocks. */
3908 while ( uOff < cbFill
3909 && RT_SUCCESS(rc))
3910 {
3911 size_t cbChunk = (size_t)RT_MIN(cbFill - uOff, cbBuf);
3912
3913 rc = pVDIo->pInterfaceIo->pfnWriteSync(pVDIo->pInterfaceIo->Core.pvUser,
3914 pIoStorage->pStorage, cbSizeCur + uOff,
3915 pvBuf, cbChunk, NULL);
3916 if (RT_SUCCESS(rc))
3917 {
3918 uOff += cbChunk;
3919
3920 rc = vdIfProgress(pIfProgress, uPercentStart + uOff * uPercentSpan / cbFill);
3921 }
3922 }
3923
3924 RTMemTmpFree(pvBuf);
3925 }
3926 else
3927 rc = VERR_NO_MEMORY;
3928 }
3929 else if (cbSizeCur > cbSize)
3930 rc = pVDIo->pInterfaceIo->pfnSetSize(pVDIo->pInterfaceIo->Core.pvUser,
3931 pIoStorage->pStorage, cbSize);
3932 }
3933 }
3934
3935 if (RT_SUCCESS(rc))
3936 rc = vdIfProgress(pIfProgress, uPercentStart + uPercentSpan);
3937
3938 return rc;
3939}
3940
3941static DECLCALLBACK(int) vdIOIntReadUser(void *pvUser, PVDIOSTORAGE pIoStorage, uint64_t uOffset,
3942 PVDIOCTX pIoCtx, size_t cbRead)
3943{
3944 int rc = VINF_SUCCESS;
3945 PVDIO pVDIo = (PVDIO)pvUser;
3946 PVDISK pDisk = pVDIo->pDisk;
3947
3948 LogFlowFunc(("pvUser=%#p pIoStorage=%#p uOffset=%llu pIoCtx=%#p cbRead=%u\n",
3949 pvUser, pIoStorage, uOffset, pIoCtx, cbRead));
3950
3951 /** @todo Enable check for sync I/O later. */
3952 if (!(pIoCtx->fFlags & VDIOCTX_FLAGS_SYNC))
3953 VD_IS_LOCKED(pDisk);
3954
3955 Assert(cbRead > 0);
3956
3957 if ( (pIoCtx->fFlags & VDIOCTX_FLAGS_SYNC)
3958 || !pVDIo->pInterfaceIo->pfnReadAsync)
3959 {
3960 RTSGSEG Seg;
3961 unsigned cSegments = 1;
3962 size_t cbTaskRead = 0;
3963
3964 /* Synchronous I/O contexts only have one buffer segment. */
3965 AssertMsgReturn(pIoCtx->Req.Io.SgBuf.cSegs == 1,
3966 ("Invalid number of buffer segments for synchronous I/O context"),
3967 VERR_INVALID_PARAMETER);
3968
3969 cbTaskRead = RTSgBufSegArrayCreate(&pIoCtx->Req.Io.SgBuf, &Seg, &cSegments, cbRead);
3970 Assert(cbRead == cbTaskRead);
3971 Assert(cSegments == 1);
3972 rc = pVDIo->pInterfaceIo->pfnReadSync(pVDIo->pInterfaceIo->Core.pvUser,
3973 pIoStorage->pStorage, uOffset,
3974 Seg.pvSeg, cbRead, NULL);
3975 if (RT_SUCCESS(rc))
3976 {
3977 Assert(cbRead == (uint32_t)cbRead);
3978 ASMAtomicSubU32(&pIoCtx->Req.Io.cbTransferLeft, (uint32_t)cbRead);
3979 }
3980 }
3981 else
3982 {
3983 /* Build the S/G array and spawn a new I/O task */
3984 while (cbRead)
3985 {
3986 RTSGSEG aSeg[VD_IO_TASK_SEGMENTS_MAX];
3987 unsigned cSegments = VD_IO_TASK_SEGMENTS_MAX;
3988 size_t cbTaskRead = RTSgBufSegArrayCreate(&pIoCtx->Req.Io.SgBuf, aSeg, &cSegments, cbRead);
3989
3990 Assert(cSegments > 0);
3991 Assert(cbTaskRead > 0);
3992 AssertMsg(cbTaskRead <= cbRead, ("Invalid number of bytes to read\n"));
3993
3994 LogFlow(("Reading %u bytes into %u segments\n", cbTaskRead, cSegments));
3995
3996#ifdef RT_STRICT
3997 for (unsigned i = 0; i < cSegments; i++)
3998 AssertMsg(aSeg[i].pvSeg && !(aSeg[i].cbSeg % 512),
3999 ("Segment %u is invalid\n", i));
4000#endif
4001
4002 Assert(cbTaskRead == (uint32_t)cbTaskRead);
4003 PVDIOTASK pIoTask = vdIoTaskUserAlloc(pIoStorage, NULL, NULL, pIoCtx, (uint32_t)cbTaskRead);
4004
4005 if (!pIoTask)
4006 return VERR_NO_MEMORY;
4007
4008 ASMAtomicIncU32(&pIoCtx->cDataTransfersPending);
4009
4010 void *pvTask;
4011 Log(("Spawning pIoTask=%p pIoCtx=%p\n", pIoTask, pIoCtx));
4012 rc = pVDIo->pInterfaceIo->pfnReadAsync(pVDIo->pInterfaceIo->Core.pvUser,
4013 pIoStorage->pStorage, uOffset,
4014 aSeg, cSegments, cbTaskRead, pIoTask,
4015 &pvTask);
4016 if (RT_SUCCESS(rc))
4017 {
4018 AssertMsg(cbTaskRead <= pIoCtx->Req.Io.cbTransferLeft, ("Impossible!\n"));
4019 ASMAtomicSubU32(&pIoCtx->Req.Io.cbTransferLeft, (uint32_t)cbTaskRead);
4020 ASMAtomicDecU32(&pIoCtx->cDataTransfersPending);
4021 vdIoTaskFree(pDisk, pIoTask);
4022 }
4023 else if (rc != VERR_VD_ASYNC_IO_IN_PROGRESS)
4024 {
4025 ASMAtomicDecU32(&pIoCtx->cDataTransfersPending);
4026 vdIoTaskFree(pDisk, pIoTask);
4027 break;
4028 }
4029
4030 uOffset += cbTaskRead;
4031 cbRead -= cbTaskRead;
4032 }
4033 }
4034
4035 LogFlowFunc(("returns rc=%Rrc\n", rc));
4036 return rc;
4037}
4038
4039static DECLCALLBACK(int) vdIOIntWriteUser(void *pvUser, PVDIOSTORAGE pIoStorage, uint64_t uOffset,
4040 PVDIOCTX pIoCtx, size_t cbWrite, PFNVDXFERCOMPLETED pfnComplete,
4041 void *pvCompleteUser)
4042{
4043 int rc = VINF_SUCCESS;
4044 PVDIO pVDIo = (PVDIO)pvUser;
4045 PVDISK pDisk = pVDIo->pDisk;
4046
4047 LogFlowFunc(("pvUser=%#p pIoStorage=%#p uOffset=%llu pIoCtx=%#p cbWrite=%u\n",
4048 pvUser, pIoStorage, uOffset, pIoCtx, cbWrite));
4049
4050 /** @todo Enable check for sync I/O later. */
4051 if (!(pIoCtx->fFlags & VDIOCTX_FLAGS_SYNC))
4052 VD_IS_LOCKED(pDisk);
4053
4054 Assert(cbWrite > 0);
4055
4056 if ( (pIoCtx->fFlags & VDIOCTX_FLAGS_SYNC)
4057 || !pVDIo->pInterfaceIo->pfnWriteAsync)
4058 {
4059 RTSGSEG Seg;
4060 unsigned cSegments = 1;
4061 size_t cbTaskWrite = 0;
4062
4063 /* Synchronous I/O contexts only have one buffer segment. */
4064 AssertMsgReturn(pIoCtx->Req.Io.SgBuf.cSegs == 1,
4065 ("Invalid number of buffer segments for synchronous I/O context"),
4066 VERR_INVALID_PARAMETER);
4067
4068 cbTaskWrite = RTSgBufSegArrayCreate(&pIoCtx->Req.Io.SgBuf, &Seg, &cSegments, cbWrite);
4069 Assert(cbWrite == cbTaskWrite);
4070 Assert(cSegments == 1);
4071 rc = pVDIo->pInterfaceIo->pfnWriteSync(pVDIo->pInterfaceIo->Core.pvUser,
4072 pIoStorage->pStorage, uOffset,
4073 Seg.pvSeg, cbWrite, NULL);
4074 if (RT_SUCCESS(rc))
4075 {
4076 Assert(pIoCtx->Req.Io.cbTransferLeft >= cbWrite);
4077 ASMAtomicSubU32(&pIoCtx->Req.Io.cbTransferLeft, (uint32_t)cbWrite);
4078 }
4079 }
4080 else
4081 {
4082 /* Build the S/G array and spawn a new I/O task */
4083 while (cbWrite)
4084 {
4085 RTSGSEG aSeg[VD_IO_TASK_SEGMENTS_MAX];
4086 unsigned cSegments = VD_IO_TASK_SEGMENTS_MAX;
4087 size_t cbTaskWrite = 0;
4088
4089 cbTaskWrite = RTSgBufSegArrayCreate(&pIoCtx->Req.Io.SgBuf, aSeg, &cSegments, cbWrite);
4090
4091 Assert(cSegments > 0);
4092 Assert(cbTaskWrite > 0);
4093 AssertMsg(cbTaskWrite <= cbWrite, ("Invalid number of bytes to write\n"));
4094
4095 LogFlow(("Writing %u bytes from %u segments\n", cbTaskWrite, cSegments));
4096
4097#ifdef DEBUG
4098 for (unsigned i = 0; i < cSegments; i++)
4099 AssertMsg(aSeg[i].pvSeg && !(aSeg[i].cbSeg % 512),
4100 ("Segment %u is invalid\n", i));
4101#endif
4102
4103 Assert(cbTaskWrite == (uint32_t)cbTaskWrite);
4104 PVDIOTASK pIoTask = vdIoTaskUserAlloc(pIoStorage, pfnComplete, pvCompleteUser, pIoCtx, (uint32_t)cbTaskWrite);
4105
4106 if (!pIoTask)
4107 return VERR_NO_MEMORY;
4108
4109 ASMAtomicIncU32(&pIoCtx->cDataTransfersPending);
4110
4111 void *pvTask;
4112 Log(("Spawning pIoTask=%p pIoCtx=%p\n", pIoTask, pIoCtx));
4113 rc = pVDIo->pInterfaceIo->pfnWriteAsync(pVDIo->pInterfaceIo->Core.pvUser,
4114 pIoStorage->pStorage,
4115 uOffset, aSeg, cSegments,
4116 cbTaskWrite, pIoTask, &pvTask);
4117 if (RT_SUCCESS(rc))
4118 {
4119 AssertMsg(cbTaskWrite <= pIoCtx->Req.Io.cbTransferLeft, ("Impossible!\n"));
4120 ASMAtomicSubU32(&pIoCtx->Req.Io.cbTransferLeft, (uint32_t)cbTaskWrite);
4121 ASMAtomicDecU32(&pIoCtx->cDataTransfersPending);
4122 vdIoTaskFree(pDisk, pIoTask);
4123 }
4124 else if (rc != VERR_VD_ASYNC_IO_IN_PROGRESS)
4125 {
4126 ASMAtomicDecU32(&pIoCtx->cDataTransfersPending);
4127 vdIoTaskFree(pDisk, pIoTask);
4128 break;
4129 }
4130
4131 uOffset += cbTaskWrite;
4132 cbWrite -= cbTaskWrite;
4133 }
4134 }
4135
4136 LogFlowFunc(("returns rc=%Rrc\n", rc));
4137 return rc;
4138}
4139
4140static DECLCALLBACK(int) vdIOIntReadMeta(void *pvUser, PVDIOSTORAGE pIoStorage, uint64_t uOffset,
4141 void *pvBuf, size_t cbRead, PVDIOCTX pIoCtx,
4142 PPVDMETAXFER ppMetaXfer, PFNVDXFERCOMPLETED pfnComplete,
4143 void *pvCompleteUser)
4144{
4145 PVDIO pVDIo = (PVDIO)pvUser;
4146 PVDISK pDisk = pVDIo->pDisk;
4147 int rc = VINF_SUCCESS;
4148 RTSGSEG Seg;
4149 PVDIOTASK pIoTask;
4150 PVDMETAXFER pMetaXfer = NULL;
4151 void *pvTask = NULL;
4152
4153 LogFlowFunc(("pvUser=%#p pIoStorage=%#p uOffset=%llu pvBuf=%#p cbRead=%u\n",
4154 pvUser, pIoStorage, uOffset, pvBuf, cbRead));
4155
4156 AssertMsgReturn( pIoCtx
4157 || (!ppMetaXfer && !pfnComplete && !pvCompleteUser),
4158 ("A synchronous metadata read is requested but the parameters are wrong\n"),
4159 VERR_INVALID_POINTER);
4160
4161 /** @todo Enable check for sync I/O later. */
4162 if ( pIoCtx
4163 && !(pIoCtx->fFlags & VDIOCTX_FLAGS_SYNC))
4164 VD_IS_LOCKED(pDisk);
4165
4166 if ( !pIoCtx
4167 || pIoCtx->fFlags & VDIOCTX_FLAGS_SYNC
4168 || !pVDIo->pInterfaceIo->pfnReadAsync)
4169 {
4170 /* Handle synchronous metadata I/O. */
4171 /** @todo Integrate with metadata transfers below. */
4172 rc = pVDIo->pInterfaceIo->pfnReadSync(pVDIo->pInterfaceIo->Core.pvUser,
4173 pIoStorage->pStorage, uOffset,
4174 pvBuf, cbRead, NULL);
4175 if (ppMetaXfer)
4176 *ppMetaXfer = NULL;
4177 }
4178 else
4179 {
4180 pMetaXfer = (PVDMETAXFER)RTAvlrFileOffsetGet(pIoStorage->pTreeMetaXfers, uOffset);
4181 if (!pMetaXfer)
4182 {
4183#ifdef RT_STRICT
4184 pMetaXfer = (PVDMETAXFER)RTAvlrFileOffsetGetBestFit(pIoStorage->pTreeMetaXfers, uOffset, false /* fAbove */);
4185 AssertMsg(!pMetaXfer || (pMetaXfer->Core.Key + (RTFOFF)pMetaXfer->cbMeta <= (RTFOFF)uOffset),
4186 ("Overlapping meta transfers!\n"));
4187#endif
4188
4189 /* Allocate a new meta transfer. */
4190 pMetaXfer = vdMetaXferAlloc(pIoStorage, uOffset, cbRead);
4191 if (!pMetaXfer)
4192 return VERR_NO_MEMORY;
4193
4194 pIoTask = vdIoTaskMetaAlloc(pIoStorage, pfnComplete, pvCompleteUser, pMetaXfer);
4195 if (!pIoTask)
4196 {
4197 RTMemFree(pMetaXfer);
4198 return VERR_NO_MEMORY;
4199 }
4200
4201 Seg.cbSeg = cbRead;
4202 Seg.pvSeg = pMetaXfer->abData;
4203
4204 VDMETAXFER_TXDIR_SET(pMetaXfer->fFlags, VDMETAXFER_TXDIR_READ);
4205 rc = pVDIo->pInterfaceIo->pfnReadAsync(pVDIo->pInterfaceIo->Core.pvUser,
4206 pIoStorage->pStorage,
4207 uOffset, &Seg, 1,
4208 cbRead, pIoTask, &pvTask);
4209
4210 if (RT_SUCCESS(rc) || rc == VERR_VD_ASYNC_IO_IN_PROGRESS)
4211 {
4212 bool fInserted = RTAvlrFileOffsetInsert(pIoStorage->pTreeMetaXfers, &pMetaXfer->Core);
4213 Assert(fInserted); NOREF(fInserted);
4214 }
4215 else
4216 RTMemFree(pMetaXfer);
4217
4218 if (RT_SUCCESS(rc))
4219 {
4220 VDMETAXFER_TXDIR_SET(pMetaXfer->fFlags, VDMETAXFER_TXDIR_NONE);
4221 vdIoTaskFree(pDisk, pIoTask);
4222 }
4223 else if (rc == VERR_VD_ASYNC_IO_IN_PROGRESS && !pfnComplete)
4224 rc = VERR_VD_NOT_ENOUGH_METADATA;
4225 }
4226
4227 Assert(RT_VALID_PTR(pMetaXfer) || RT_FAILURE(rc));
4228
4229 if (RT_SUCCESS(rc) || rc == VERR_VD_NOT_ENOUGH_METADATA || rc == VERR_VD_ASYNC_IO_IN_PROGRESS)
4230 {
4231 /* If it is pending add the request to the list. */
4232 if (VDMETAXFER_TXDIR_GET(pMetaXfer->fFlags) == VDMETAXFER_TXDIR_READ)
4233 {
4234 PVDIOCTXDEFERRED pDeferred = (PVDIOCTXDEFERRED)RTMemAllocZ(sizeof(VDIOCTXDEFERRED));
4235 AssertPtr(pDeferred);
4236
4237 RTListInit(&pDeferred->NodeDeferred);
4238 pDeferred->pIoCtx = pIoCtx;
4239
4240 ASMAtomicIncU32(&pIoCtx->cMetaTransfersPending);
4241 RTListAppend(&pMetaXfer->ListIoCtxWaiting, &pDeferred->NodeDeferred);
4242 rc = VERR_VD_NOT_ENOUGH_METADATA;
4243 }
4244 else
4245 {
4246 /* Transfer the data. */
4247 pMetaXfer->cRefs++;
4248 Assert(pMetaXfer->cbMeta >= cbRead);
4249 Assert(pMetaXfer->Core.Key == (RTFOFF)uOffset);
4250 if (pMetaXfer->pbDataShw)
4251 memcpy(pvBuf, pMetaXfer->pbDataShw, cbRead);
4252 else
4253 memcpy(pvBuf, pMetaXfer->abData, cbRead);
4254 *ppMetaXfer = pMetaXfer;
4255 }
4256 }
4257 }
4258
4259 LogFlowFunc(("returns rc=%Rrc\n", rc));
4260 return rc;
4261}
4262
4263static DECLCALLBACK(int) vdIOIntWriteMeta(void *pvUser, PVDIOSTORAGE pIoStorage, uint64_t uOffset,
4264 const void *pvBuf, size_t cbWrite, PVDIOCTX pIoCtx,
4265 PFNVDXFERCOMPLETED pfnComplete, void *pvCompleteUser)
4266{
4267 PVDIO pVDIo = (PVDIO)pvUser;
4268 PVDISK pDisk = pVDIo->pDisk;
4269 int rc = VINF_SUCCESS;
4270 RTSGSEG Seg;
4271 PVDIOTASK pIoTask;
4272 PVDMETAXFER pMetaXfer = NULL;
4273 bool fInTree = false;
4274 void *pvTask = NULL;
4275
4276 LogFlowFunc(("pvUser=%#p pIoStorage=%#p uOffset=%llu pvBuf=%#p cbWrite=%u\n",
4277 pvUser, pIoStorage, uOffset, pvBuf, cbWrite));
4278
4279 AssertMsgReturn( pIoCtx
4280 || (!pfnComplete && !pvCompleteUser),
4281 ("A synchronous metadata write is requested but the parameters are wrong\n"),
4282 VERR_INVALID_POINTER);
4283
4284 /** @todo Enable check for sync I/O later. */
4285 if ( pIoCtx
4286 && !(pIoCtx->fFlags & VDIOCTX_FLAGS_SYNC))
4287 VD_IS_LOCKED(pDisk);
4288
4289 if ( !pIoCtx
4290 || pIoCtx->fFlags & VDIOCTX_FLAGS_SYNC
4291 || !pVDIo->pInterfaceIo->pfnWriteAsync)
4292 {
4293 /* Handle synchronous metadata I/O. */
4294 /** @todo Integrate with metadata transfers below. */
4295 rc = pVDIo->pInterfaceIo->pfnWriteSync(pVDIo->pInterfaceIo->Core.pvUser,
4296 pIoStorage->pStorage, uOffset,
4297 pvBuf, cbWrite, NULL);
4298 }
4299 else
4300 {
4301 pMetaXfer = (PVDMETAXFER)RTAvlrFileOffsetGet(pIoStorage->pTreeMetaXfers, uOffset);
4302 if (!pMetaXfer)
4303 {
4304 /* Allocate a new meta transfer. */
4305 pMetaXfer = vdMetaXferAlloc(pIoStorage, uOffset, cbWrite);
4306 if (!pMetaXfer)
4307 return VERR_NO_MEMORY;
4308 }
4309 else
4310 {
4311 Assert(pMetaXfer->cbMeta >= cbWrite);
4312 Assert(pMetaXfer->Core.Key == (RTFOFF)uOffset);
4313 fInTree = true;
4314 }
4315
4316 if (VDMETAXFER_TXDIR_GET(pMetaXfer->fFlags) == VDMETAXFER_TXDIR_NONE)
4317 {
4318 pIoTask = vdIoTaskMetaAlloc(pIoStorage, pfnComplete, pvCompleteUser, pMetaXfer);
4319 if (!pIoTask)
4320 {
4321 RTMemFree(pMetaXfer);
4322 return VERR_NO_MEMORY;
4323 }
4324
4325 memcpy(pMetaXfer->abData, pvBuf, cbWrite);
4326 Seg.cbSeg = cbWrite;
4327 Seg.pvSeg = pMetaXfer->abData;
4328
4329 ASMAtomicIncU32(&pIoCtx->cMetaTransfersPending);
4330
4331 VDMETAXFER_TXDIR_SET(pMetaXfer->fFlags, VDMETAXFER_TXDIR_WRITE);
4332 rc = pVDIo->pInterfaceIo->pfnWriteAsync(pVDIo->pInterfaceIo->Core.pvUser,
4333 pIoStorage->pStorage,
4334 uOffset, &Seg, 1, cbWrite, pIoTask,
4335 &pvTask);
4336 if (RT_SUCCESS(rc))
4337 {
4338 VDMETAXFER_TXDIR_SET(pMetaXfer->fFlags, VDMETAXFER_TXDIR_NONE);
4339 ASMAtomicDecU32(&pIoCtx->cMetaTransfersPending);
4340 vdIoTaskFree(pDisk, pIoTask);
4341 if (fInTree && !pMetaXfer->cRefs)
4342 {
4343 LogFlow(("Removing meta xfer=%#p\n", pMetaXfer));
4344 bool fRemoved = RTAvlrFileOffsetRemove(pIoStorage->pTreeMetaXfers, pMetaXfer->Core.Key) != NULL;
4345 AssertMsg(fRemoved, ("Metadata transfer wasn't removed\n")); NOREF(fRemoved);
4346 RTMemFree(pMetaXfer);
4347 pMetaXfer = NULL;
4348 }
4349 }
4350 else if (rc == VERR_VD_ASYNC_IO_IN_PROGRESS)
4351 {
4352 PVDIOCTXDEFERRED pDeferred = (PVDIOCTXDEFERRED)RTMemAllocZ(sizeof(VDIOCTXDEFERRED));
4353 AssertPtr(pDeferred);
4354
4355 RTListInit(&pDeferred->NodeDeferred);
4356 pDeferred->pIoCtx = pIoCtx;
4357
4358 if (!fInTree)
4359 {
4360 bool fInserted = RTAvlrFileOffsetInsert(pIoStorage->pTreeMetaXfers, &pMetaXfer->Core);
4361 Assert(fInserted); NOREF(fInserted);
4362 }
4363
4364 RTListAppend(&pMetaXfer->ListIoCtxWaiting, &pDeferred->NodeDeferred);
4365 }
4366 else
4367 {
4368 RTMemFree(pMetaXfer);
4369 pMetaXfer = NULL;
4370 }
4371 }
4372 else
4373 {
4374 /* I/O is in progress, update shadow buffer and add to waiting list. */
4375 Assert(VDMETAXFER_TXDIR_GET(pMetaXfer->fFlags) == VDMETAXFER_TXDIR_WRITE);
4376 if (!pMetaXfer->pbDataShw)
4377 {
4378 /* Allocate shadow buffer and set initial state. */
4379 LogFlowFunc(("pMetaXfer=%#p Creating shadow buffer\n", pMetaXfer));
4380 pMetaXfer->pbDataShw = (uint8_t *)RTMemAlloc(pMetaXfer->cbMeta);
4381 if (RT_LIKELY(pMetaXfer->pbDataShw))
4382 memcpy(pMetaXfer->pbDataShw, pMetaXfer->abData, pMetaXfer->cbMeta);
4383 else
4384 rc = VERR_NO_MEMORY;
4385 }
4386
4387 if (RT_SUCCESS(rc))
4388 {
4389 /* Update with written data and append to waiting list. */
4390 PVDIOCTXDEFERRED pDeferred = (PVDIOCTXDEFERRED)RTMemAllocZ(sizeof(VDIOCTXDEFERRED));
4391 if (pDeferred)
4392 {
4393 LogFlowFunc(("pMetaXfer=%#p Updating shadow buffer\n", pMetaXfer));
4394
4395 RTListInit(&pDeferred->NodeDeferred);
4396 pDeferred->pIoCtx = pIoCtx;
4397 ASMAtomicIncU32(&pIoCtx->cMetaTransfersPending);
4398 memcpy(pMetaXfer->pbDataShw, pvBuf, cbWrite);
4399 RTListAppend(&pMetaXfer->ListIoCtxShwWrites, &pDeferred->NodeDeferred);
4400 }
4401 else
4402 {
4403 /*
4404 * Free shadow buffer if there is no one depending on it, i.e.
4405 * we just allocated it.
4406 */
4407 if (RTListIsEmpty(&pMetaXfer->ListIoCtxShwWrites))
4408 {
4409 RTMemFree(pMetaXfer->pbDataShw);
4410 pMetaXfer->pbDataShw = NULL;
4411 }
4412 rc = VERR_NO_MEMORY;
4413 }
4414 }
4415 }
4416 }
4417
4418 LogFlowFunc(("returns rc=%Rrc\n", rc));
4419 return rc;
4420}
4421
4422static DECLCALLBACK(void) vdIOIntMetaXferRelease(void *pvUser, PVDMETAXFER pMetaXfer)
4423{
4424 PVDIO pVDIo = (PVDIO)pvUser;
4425 PVDISK pDisk = pVDIo->pDisk;
4426 PVDIOSTORAGE pIoStorage;
4427
4428 /*
4429 * It is possible that we get called with a NULL metadata xfer handle
4430 * for synchronous I/O. Just exit.
4431 */
4432 if (!pMetaXfer)
4433 return;
4434
4435 pIoStorage = pMetaXfer->pIoStorage;
4436
4437 VD_IS_LOCKED(pDisk);
4438
4439 Assert( VDMETAXFER_TXDIR_GET(pMetaXfer->fFlags) == VDMETAXFER_TXDIR_NONE
4440 || VDMETAXFER_TXDIR_GET(pMetaXfer->fFlags) == VDMETAXFER_TXDIR_WRITE);
4441 Assert(pMetaXfer->cRefs > 0);
4442
4443 pMetaXfer->cRefs--;
4444 if ( !pMetaXfer->cRefs
4445 && RTListIsEmpty(&pMetaXfer->ListIoCtxWaiting)
4446 && VDMETAXFER_TXDIR_GET(pMetaXfer->fFlags) == VDMETAXFER_TXDIR_NONE)
4447 {
4448 /* Free the meta data entry. */
4449 LogFlow(("Removing meta xfer=%#p\n", pMetaXfer));
4450 bool fRemoved = RTAvlrFileOffsetRemove(pIoStorage->pTreeMetaXfers, pMetaXfer->Core.Key) != NULL;
4451 AssertMsg(fRemoved, ("Metadata transfer wasn't removed\n")); NOREF(fRemoved);
4452
4453 RTMemFree(pMetaXfer);
4454 }
4455}
4456
4457static DECLCALLBACK(int) vdIOIntFlush(void *pvUser, PVDIOSTORAGE pIoStorage, PVDIOCTX pIoCtx,
4458 PFNVDXFERCOMPLETED pfnComplete, void *pvCompleteUser)
4459{
4460 PVDIO pVDIo = (PVDIO)pvUser;
4461 PVDISK pDisk = pVDIo->pDisk;
4462 int rc = VINF_SUCCESS;
4463 PVDIOTASK pIoTask;
4464 PVDMETAXFER pMetaXfer = NULL;
4465 void *pvTask = NULL;
4466
4467 LogFlowFunc(("pvUser=%#p pIoStorage=%#p pIoCtx=%#p\n",
4468 pvUser, pIoStorage, pIoCtx));
4469
4470 AssertMsgReturn( pIoCtx
4471 || (!pfnComplete && !pvCompleteUser),
4472 ("A synchronous metadata write is requested but the parameters are wrong\n"),
4473 VERR_INVALID_POINTER);
4474
4475 /** @todo Enable check for sync I/O later. */
4476 if ( pIoCtx
4477 && !(pIoCtx->fFlags & VDIOCTX_FLAGS_SYNC))
4478 VD_IS_LOCKED(pDisk);
4479
4480 if (pVDIo->fIgnoreFlush)
4481 return VINF_SUCCESS;
4482
4483 if ( !pIoCtx
4484 || pIoCtx->fFlags & VDIOCTX_FLAGS_SYNC
4485 || !pVDIo->pInterfaceIo->pfnFlushAsync)
4486 {
4487 /* Handle synchronous flushes. */
4488 /** @todo Integrate with metadata transfers below. */
4489 rc = pVDIo->pInterfaceIo->pfnFlushSync(pVDIo->pInterfaceIo->Core.pvUser,
4490 pIoStorage->pStorage);
4491 }
4492 else
4493 {
4494 /* Allocate a new meta transfer. */
4495 pMetaXfer = vdMetaXferAlloc(pIoStorage, 0, 0);
4496 if (!pMetaXfer)
4497 return VERR_NO_MEMORY;
4498
4499 pIoTask = vdIoTaskMetaAlloc(pIoStorage, pfnComplete, pvUser, pMetaXfer);
4500 if (!pIoTask)
4501 {
4502 RTMemFree(pMetaXfer);
4503 return VERR_NO_MEMORY;
4504 }
4505
4506 ASMAtomicIncU32(&pIoCtx->cMetaTransfersPending);
4507
4508 PVDIOCTXDEFERRED pDeferred = (PVDIOCTXDEFERRED)RTMemAllocZ(sizeof(VDIOCTXDEFERRED));
4509 AssertPtr(pDeferred);
4510
4511 RTListInit(&pDeferred->NodeDeferred);
4512 pDeferred->pIoCtx = pIoCtx;
4513
4514 RTListAppend(&pMetaXfer->ListIoCtxWaiting, &pDeferred->NodeDeferred);
4515 VDMETAXFER_TXDIR_SET(pMetaXfer->fFlags, VDMETAXFER_TXDIR_FLUSH);
4516 rc = pVDIo->pInterfaceIo->pfnFlushAsync(pVDIo->pInterfaceIo->Core.pvUser,
4517 pIoStorage->pStorage,
4518 pIoTask, &pvTask);
4519 if (RT_SUCCESS(rc))
4520 {
4521 VDMETAXFER_TXDIR_SET(pMetaXfer->fFlags, VDMETAXFER_TXDIR_NONE);
4522 ASMAtomicDecU32(&pIoCtx->cMetaTransfersPending);
4523 vdIoTaskFree(pDisk, pIoTask);
4524 RTMemFree(pDeferred);
4525 RTMemFree(pMetaXfer);
4526 }
4527 else if (rc != VERR_VD_ASYNC_IO_IN_PROGRESS)
4528 RTMemFree(pMetaXfer);
4529 }
4530
4531 LogFlowFunc(("returns rc=%Rrc\n", rc));
4532 return rc;
4533}
4534
4535static DECLCALLBACK(size_t) vdIOIntIoCtxCopyTo(void *pvUser, PVDIOCTX pIoCtx,
4536 const void *pvBuf, size_t cbBuf)
4537{
4538 PVDIO pVDIo = (PVDIO)pvUser;
4539 PVDISK pDisk = pVDIo->pDisk;
4540 size_t cbCopied = 0;
4541
4542 /** @todo Enable check for sync I/O later. */
4543 if (!(pIoCtx->fFlags & VDIOCTX_FLAGS_SYNC))
4544 VD_IS_LOCKED(pDisk);
4545
4546 cbCopied = vdIoCtxCopyTo(pIoCtx, (uint8_t *)pvBuf, cbBuf);
4547 Assert(cbCopied == cbBuf);
4548
4549 /// @todo Assert(pIoCtx->Req.Io.cbTransferLeft >= cbCopied); - triggers with vdCopyHelper/dmgRead.
4550 ASMAtomicSubU32(&pIoCtx->Req.Io.cbTransferLeft, (uint32_t)cbCopied);
4551
4552 return cbCopied;
4553}
4554
4555static DECLCALLBACK(size_t) vdIOIntIoCtxCopyFrom(void *pvUser, PVDIOCTX pIoCtx,
4556 void *pvBuf, size_t cbBuf)
4557{
4558 PVDIO pVDIo = (PVDIO)pvUser;
4559 PVDISK pDisk = pVDIo->pDisk;
4560 size_t cbCopied = 0;
4561
4562 /** @todo Enable check for sync I/O later. */
4563 if (!(pIoCtx->fFlags & VDIOCTX_FLAGS_SYNC))
4564 VD_IS_LOCKED(pDisk);
4565
4566 cbCopied = vdIoCtxCopyFrom(pIoCtx, (uint8_t *)pvBuf, cbBuf);
4567 Assert(cbCopied == cbBuf);
4568
4569 /// @todo Assert(pIoCtx->Req.Io.cbTransferLeft > cbCopied); - triggers with vdCopyHelper/dmgRead.
4570 ASMAtomicSubU32(&pIoCtx->Req.Io.cbTransferLeft, (uint32_t)cbCopied);
4571
4572 return cbCopied;
4573}
4574
4575static DECLCALLBACK(size_t) vdIOIntIoCtxSet(void *pvUser, PVDIOCTX pIoCtx, int ch, size_t cb)
4576{
4577 PVDIO pVDIo = (PVDIO)pvUser;
4578 PVDISK pDisk = pVDIo->pDisk;
4579 size_t cbSet = 0;
4580
4581 /** @todo Enable check for sync I/O later. */
4582 if (!(pIoCtx->fFlags & VDIOCTX_FLAGS_SYNC))
4583 VD_IS_LOCKED(pDisk);
4584
4585 cbSet = vdIoCtxSet(pIoCtx, ch, cb);
4586 Assert(cbSet == cb);
4587
4588 /// @todo Assert(pIoCtx->Req.Io.cbTransferLeft >= cbSet); - triggers with vdCopyHelper/dmgRead.
4589 ASMAtomicSubU32(&pIoCtx->Req.Io.cbTransferLeft, (uint32_t)cbSet);
4590
4591 return cbSet;
4592}
4593
4594static DECLCALLBACK(size_t) vdIOIntIoCtxSegArrayCreate(void *pvUser, PVDIOCTX pIoCtx,
4595 PRTSGSEG paSeg, unsigned *pcSeg,
4596 size_t cbData)
4597{
4598 PVDIO pVDIo = (PVDIO)pvUser;
4599 PVDISK pDisk = pVDIo->pDisk;
4600 size_t cbCreated = 0;
4601
4602 /** @todo It is possible that this gets called from a filter plugin
4603 * outside of the disk lock. Refine assertion or remove completely. */
4604#if 0
4605 /** @todo Enable check for sync I/O later. */
4606 if (!(pIoCtx->fFlags & VDIOCTX_FLAGS_SYNC))
4607 VD_IS_LOCKED(pDisk);
4608#else
4609 NOREF(pDisk);
4610#endif
4611
4612 cbCreated = RTSgBufSegArrayCreate(&pIoCtx->Req.Io.SgBuf, paSeg, pcSeg, cbData);
4613 Assert(!paSeg || cbData == cbCreated);
4614
4615 return cbCreated;
4616}
4617
4618static DECLCALLBACK(void) vdIOIntIoCtxCompleted(void *pvUser, PVDIOCTX pIoCtx, int rcReq,
4619 size_t cbCompleted)
4620{
4621 PVDIO pVDIo = (PVDIO)pvUser;
4622 PVDISK pDisk = pVDIo->pDisk;
4623
4624 LogFlowFunc(("pvUser=%#p pIoCtx=%#p rcReq=%Rrc cbCompleted=%zu\n",
4625 pvUser, pIoCtx, rcReq, cbCompleted));
4626
4627 /*
4628 * Grab the disk critical section to avoid races with other threads which
4629 * might still modify the I/O context.
4630 * Example is that iSCSI is doing an asynchronous write but calls us already
4631 * while the other thread is still hanging in vdWriteHelperAsync and couldn't update
4632 * the blocked state yet.
4633 * It can overwrite the state to true before we call vdIoCtxContinue and the
4634 * the request would hang indefinite.
4635 */
4636 ASMAtomicCmpXchgS32(&pIoCtx->rcReq, rcReq, VINF_SUCCESS);
4637 Assert(pIoCtx->Req.Io.cbTransferLeft >= cbCompleted);
4638 ASMAtomicSubU32(&pIoCtx->Req.Io.cbTransferLeft, (uint32_t)cbCompleted);
4639
4640 /* Set next transfer function if the current one finished.
4641 * @todo: Find a better way to prevent vdIoCtxContinue from calling the current helper again. */
4642 if (!pIoCtx->Req.Io.cbTransferLeft)
4643 {
4644 pIoCtx->pfnIoCtxTransfer = pIoCtx->pfnIoCtxTransferNext;
4645 pIoCtx->pfnIoCtxTransferNext = NULL;
4646 }
4647
4648 vdIoCtxAddToWaitingList(&pDisk->pIoCtxHaltedHead, pIoCtx);
4649 if (ASMAtomicCmpXchgBool(&pDisk->fLocked, true, false))
4650 {
4651 /* Immediately drop the lock again, it will take care of processing the list. */
4652 vdDiskUnlock(pDisk, NULL);
4653 }
4654}
4655
4656static DECLCALLBACK(bool) vdIOIntIoCtxIsSynchronous(void *pvUser, PVDIOCTX pIoCtx)
4657{
4658 NOREF(pvUser);
4659 return !!(pIoCtx->fFlags & VDIOCTX_FLAGS_SYNC);
4660}
4661
4662static DECLCALLBACK(bool) vdIOIntIoCtxIsZero(void *pvUser, PVDIOCTX pIoCtx, size_t cbCheck,
4663 bool fAdvance)
4664{
4665 NOREF(pvUser);
4666
4667 bool fIsZero = RTSgBufIsZero(&pIoCtx->Req.Io.SgBuf, cbCheck);
4668 if (fIsZero && fAdvance)
4669 RTSgBufAdvance(&pIoCtx->Req.Io.SgBuf, cbCheck);
4670
4671 return fIsZero;
4672}
4673
4674static DECLCALLBACK(size_t) vdIOIntIoCtxGetDataUnitSize(void *pvUser, PVDIOCTX pIoCtx)
4675{
4676 RT_NOREF1(pIoCtx);
4677 PVDIO pVDIo = (PVDIO)pvUser;
4678 PVDISK pDisk = pVDIo->pDisk;
4679 size_t cbSector = 0;
4680
4681 PVDIMAGE pImage = vdGetImageByNumber(pDisk, VD_LAST_IMAGE);
4682 AssertPtrReturn(pImage, 0);
4683
4684 PCVDREGIONLIST pRegionList = NULL;
4685 int rc = pImage->Backend->pfnQueryRegions(pImage->pBackendData, &pRegionList);
4686 if (RT_SUCCESS(rc))
4687 {
4688 cbSector = pRegionList->aRegions[0].cbBlock;
4689
4690 AssertPtr(pImage->Backend->pfnRegionListRelease);
4691 pImage->Backend->pfnRegionListRelease(pImage->pBackendData, pRegionList);
4692 }
4693
4694 return cbSector;
4695}
4696
4697/**
4698 * VD I/O interface callback for opening a file (limited version for VDGetFormat).
4699 */
4700static DECLCALLBACK(int) vdIOIntOpenLimited(void *pvUser, const char *pszLocation,
4701 uint32_t fOpen, PPVDIOSTORAGE ppIoStorage)
4702{
4703 int rc = VINF_SUCCESS;
4704 PVDINTERFACEIO pInterfaceIo = (PVDINTERFACEIO)pvUser;
4705 PVDIOSTORAGE pIoStorage = (PVDIOSTORAGE)RTMemAllocZ(sizeof(VDIOSTORAGE));
4706
4707 if (!pIoStorage)
4708 return VERR_NO_MEMORY;
4709
4710 rc = pInterfaceIo->pfnOpen(NULL, pszLocation, fOpen, NULL, &pIoStorage->pStorage);
4711 if (RT_SUCCESS(rc))
4712 *ppIoStorage = pIoStorage;
4713 else
4714 RTMemFree(pIoStorage);
4715
4716 return rc;
4717}
4718
4719static DECLCALLBACK(int) vdIOIntCloseLimited(void *pvUser, PVDIOSTORAGE pIoStorage)
4720{
4721 PVDINTERFACEIO pInterfaceIo = (PVDINTERFACEIO)pvUser;
4722 int rc = pInterfaceIo->pfnClose(NULL, pIoStorage->pStorage);
4723
4724 RTMemFree(pIoStorage);
4725 return rc;
4726}
4727
4728static DECLCALLBACK(int) vdIOIntDeleteLimited(void *pvUser, const char *pcszFilename)
4729{
4730 PVDINTERFACEIO pInterfaceIo = (PVDINTERFACEIO)pvUser;
4731 return pInterfaceIo->pfnDelete(NULL, pcszFilename);
4732}
4733
4734static DECLCALLBACK(int) vdIOIntMoveLimited(void *pvUser, const char *pcszSrc,
4735 const char *pcszDst, unsigned fMove)
4736{
4737 PVDINTERFACEIO pInterfaceIo = (PVDINTERFACEIO)pvUser;
4738 return pInterfaceIo->pfnMove(NULL, pcszSrc, pcszDst, fMove);
4739}
4740
4741static DECLCALLBACK(int) vdIOIntGetFreeSpaceLimited(void *pvUser, const char *pcszFilename,
4742 int64_t *pcbFreeSpace)
4743{
4744 PVDINTERFACEIO pInterfaceIo = (PVDINTERFACEIO)pvUser;
4745 return pInterfaceIo->pfnGetFreeSpace(NULL, pcszFilename, pcbFreeSpace);
4746}
4747
4748static DECLCALLBACK(int) vdIOIntGetModificationTimeLimited(void *pvUser,
4749 const char *pcszFilename,
4750 PRTTIMESPEC pModificationTime)
4751{
4752 PVDINTERFACEIO pInterfaceIo = (PVDINTERFACEIO)pvUser;
4753 return pInterfaceIo->pfnGetModificationTime(NULL, pcszFilename, pModificationTime);
4754}
4755
4756static DECLCALLBACK(int) vdIOIntGetSizeLimited(void *pvUser, PVDIOSTORAGE pIoStorage,
4757 uint64_t *pcbSize)
4758{
4759 PVDINTERFACEIO pInterfaceIo = (PVDINTERFACEIO)pvUser;
4760 return pInterfaceIo->pfnGetSize(NULL, pIoStorage->pStorage, pcbSize);
4761}
4762
4763static DECLCALLBACK(int) vdIOIntSetSizeLimited(void *pvUser, PVDIOSTORAGE pIoStorage,
4764 uint64_t cbSize)
4765{
4766 PVDINTERFACEIO pInterfaceIo = (PVDINTERFACEIO)pvUser;
4767 return pInterfaceIo->pfnSetSize(NULL, pIoStorage->pStorage, cbSize);
4768}
4769
4770static DECLCALLBACK(int) vdIOIntWriteUserLimited(void *pvUser, PVDIOSTORAGE pStorage,
4771 uint64_t uOffset, PVDIOCTX pIoCtx,
4772 size_t cbWrite,
4773 PFNVDXFERCOMPLETED pfnComplete,
4774 void *pvCompleteUser)
4775{
4776 NOREF(pvUser);
4777 NOREF(pStorage);
4778 NOREF(uOffset);
4779 NOREF(pIoCtx);
4780 NOREF(cbWrite);
4781 NOREF(pfnComplete);
4782 NOREF(pvCompleteUser);
4783 AssertMsgFailedReturn(("This needs to be implemented when called\n"), VERR_NOT_IMPLEMENTED);
4784}
4785
4786static DECLCALLBACK(int) vdIOIntReadUserLimited(void *pvUser, PVDIOSTORAGE pStorage,
4787 uint64_t uOffset, PVDIOCTX pIoCtx,
4788 size_t cbRead)
4789{
4790 NOREF(pvUser);
4791 NOREF(pStorage);
4792 NOREF(uOffset);
4793 NOREF(pIoCtx);
4794 NOREF(cbRead);
4795 AssertMsgFailedReturn(("This needs to be implemented when called\n"), VERR_NOT_IMPLEMENTED);
4796}
4797
4798static DECLCALLBACK(int) vdIOIntWriteMetaLimited(void *pvUser, PVDIOSTORAGE pStorage,
4799 uint64_t uOffset, const void *pvBuffer,
4800 size_t cbBuffer, PVDIOCTX pIoCtx,
4801 PFNVDXFERCOMPLETED pfnComplete,
4802 void *pvCompleteUser)
4803{
4804 PVDINTERFACEIO pInterfaceIo = (PVDINTERFACEIO)pvUser;
4805
4806 AssertMsgReturn(!pIoCtx && !pfnComplete && !pvCompleteUser,
4807 ("Async I/O not implemented for the limited interface"),
4808 VERR_NOT_SUPPORTED);
4809
4810 return pInterfaceIo->pfnWriteSync(NULL, pStorage->pStorage, uOffset, pvBuffer, cbBuffer, NULL);
4811}
4812
4813static DECLCALLBACK(int) vdIOIntReadMetaLimited(void *pvUser, PVDIOSTORAGE pStorage,
4814 uint64_t uOffset, void *pvBuffer,
4815 size_t cbBuffer, PVDIOCTX pIoCtx,
4816 PPVDMETAXFER ppMetaXfer,
4817 PFNVDXFERCOMPLETED pfnComplete,
4818 void *pvCompleteUser)
4819{
4820 PVDINTERFACEIO pInterfaceIo = (PVDINTERFACEIO)pvUser;
4821
4822 AssertMsgReturn(!pIoCtx && !ppMetaXfer && !pfnComplete && !pvCompleteUser,
4823 ("Async I/O not implemented for the limited interface"),
4824 VERR_NOT_SUPPORTED);
4825
4826 return pInterfaceIo->pfnReadSync(NULL, pStorage->pStorage, uOffset, pvBuffer, cbBuffer, NULL);
4827}
4828
#if 0 /* unused */
/**
 * Meta transfer release callback for the limited interface (compiled out).
 *
 * @returns VINF_SUCCESS.
 * @param   pvUser      Unused.
 * @param   pMetaXfer   Unused.
 */
static int vdIOIntMetaXferReleaseLimited(void *pvUser, PVDMETAXFER pMetaXfer)
{
    /* This is a NOP in this case. */
    NOREF(pvUser);
    NOREF(pMetaXfer);
    return VINF_SUCCESS;
}
#endif
4838
4839static DECLCALLBACK(int) vdIOIntFlushLimited(void *pvUser, PVDIOSTORAGE pStorage,
4840 PVDIOCTX pIoCtx,
4841 PFNVDXFERCOMPLETED pfnComplete,
4842 void *pvCompleteUser)
4843{
4844 PVDINTERFACEIO pInterfaceIo = (PVDINTERFACEIO)pvUser;
4845
4846 AssertMsgReturn(!pIoCtx && !pfnComplete && !pvCompleteUser,
4847 ("Async I/O not implemented for the limited interface"),
4848 VERR_NOT_SUPPORTED);
4849
4850 return pInterfaceIo->pfnFlushSync(NULL, pStorage->pStorage);
4851}
4852
4853/**
4854 * internal: send output to the log (unconditionally).
4855 */
4856static DECLCALLBACK(int) vdLogMessage(void *pvUser, const char *pszFormat, va_list args)
4857{
4858 NOREF(pvUser);
4859 RTLogPrintfV(pszFormat, args);
4860 return VINF_SUCCESS;
4861}
4862
4863DECLINLINE(int) vdMessageWrapper(PVDISK pDisk, const char *pszFormat, ...)
4864{
4865 va_list va;
4866 va_start(va, pszFormat);
4867 int rc = pDisk->pInterfaceError->pfnMessage(pDisk->pInterfaceError->Core.pvUser,
4868 pszFormat, va);
4869 va_end(va);
4870 return rc;
4871}
4872
4873
4874/**
4875 * internal: adjust PCHS geometry
4876 */
4877static void vdFixupPCHSGeometry(PVDGEOMETRY pPCHS, uint64_t cbSize)
4878{
4879 /* Fix broken PCHS geometry. Can happen for two reasons: either the backend
4880 * mixes up PCHS and LCHS, or the application used to create the source
4881 * image has put garbage in it. Additionally, if the PCHS geometry covers
4882 * more than the image size, set it back to the default. */
4883 if ( pPCHS->cHeads > 16
4884 || pPCHS->cSectors > 63
4885 || pPCHS->cCylinders == 0
4886 || (uint64_t)pPCHS->cHeads * pPCHS->cSectors * pPCHS->cCylinders * 512 > cbSize)
4887 {
4888 Assert(!(RT_MIN(cbSize / 512 / 16 / 63, 16383) - (uint32_t)RT_MIN(cbSize / 512 / 16 / 63, 16383)));
4889 pPCHS->cCylinders = (uint32_t)RT_MIN(cbSize / 512 / 16 / 63, 16383);
4890 pPCHS->cHeads = 16;
4891 pPCHS->cSectors = 63;
4892 }
4893}
4894
4895/**
4896 * internal: adjust LCHS geometry
4897 */
4898static void vdFixupLCHSGeometry(PVDGEOMETRY pLCHS, uint64_t cbSize)
4899{
4900 /* Fix broken LCHS geometry. Can happen for two reasons: either the backend
4901 * mixes up PCHS and LCHS, or the application used to create the source
4902 * image has put garbage in it. The fix in this case is to clear the LCHS
4903 * geometry to trigger autodetection when it is used next. If the geometry
4904 * already says "please autodetect" (cylinders=0) keep it. */
4905 if ( ( pLCHS->cHeads > 255
4906 || pLCHS->cHeads == 0
4907 || pLCHS->cSectors > 63
4908 || pLCHS->cSectors == 0)
4909 && pLCHS->cCylinders != 0)
4910 {
4911 pLCHS->cCylinders = 0;
4912 pLCHS->cHeads = 0;
4913 pLCHS->cSectors = 0;
4914 }
4915 /* Always recompute the number of cylinders stored in the LCHS
4916 * geometry if it isn't set to "autotedetect" at the moment.
4917 * This is very useful if the destination image size is
4918 * larger or smaller than the source image size. Do not modify
4919 * the number of heads and sectors. Windows guests hate it. */
4920 if ( pLCHS->cCylinders != 0
4921 && pLCHS->cHeads != 0 /* paranoia */
4922 && pLCHS->cSectors != 0 /* paranoia */)
4923 {
4924 Assert(!(RT_MIN(cbSize / 512 / pLCHS->cHeads / pLCHS->cSectors, 1024) - (uint32_t)RT_MIN(cbSize / 512 / pLCHS->cHeads / pLCHS->cSectors, 1024)));
4925 pLCHS->cCylinders = (uint32_t)RT_MIN(cbSize / 512 / pLCHS->cHeads / pLCHS->cSectors, 1024);
4926 }
4927}
4928
4929/**
4930 * Sets the I/O callbacks of the given interface to the fallback methods
4931 *
4932 * @param pIfIo The I/O interface to setup.
4933 */
4934static void vdIfIoFallbackCallbacksSetup(PVDINTERFACEIO pIfIo)
4935{
4936 pIfIo->pfnOpen = vdIOOpenFallback;
4937 pIfIo->pfnClose = vdIOCloseFallback;
4938 pIfIo->pfnDelete = vdIODeleteFallback;
4939 pIfIo->pfnMove = vdIOMoveFallback;
4940 pIfIo->pfnGetFreeSpace = vdIOGetFreeSpaceFallback;
4941 pIfIo->pfnGetModificationTime = vdIOGetModificationTimeFallback;
4942 pIfIo->pfnGetSize = vdIOGetSizeFallback;
4943 pIfIo->pfnSetSize = vdIOSetSizeFallback;
4944 pIfIo->pfnSetAllocationSize = vdIOSetAllocationSizeFallback;
4945 pIfIo->pfnReadSync = vdIOReadSyncFallback;
4946 pIfIo->pfnWriteSync = vdIOWriteSyncFallback;
4947 pIfIo->pfnFlushSync = vdIOFlushSyncFallback;
4948 pIfIo->pfnReadAsync = NULL;
4949 pIfIo->pfnWriteAsync = NULL;
4950 pIfIo->pfnFlushAsync = NULL;
4951}
4952
4953/**
4954 * Sets the internal I/O callbacks of the given interface.
4955 *
4956 * @param pIfIoInt The internal I/O interface to setup.
4957 */
4958static void vdIfIoIntCallbacksSetup(PVDINTERFACEIOINT pIfIoInt)
4959{
4960 pIfIoInt->pfnOpen = vdIOIntOpen;
4961 pIfIoInt->pfnClose = vdIOIntClose;
4962 pIfIoInt->pfnDelete = vdIOIntDelete;
4963 pIfIoInt->pfnMove = vdIOIntMove;
4964 pIfIoInt->pfnGetFreeSpace = vdIOIntGetFreeSpace;
4965 pIfIoInt->pfnGetModificationTime = vdIOIntGetModificationTime;
4966 pIfIoInt->pfnGetSize = vdIOIntGetSize;
4967 pIfIoInt->pfnSetSize = vdIOIntSetSize;
4968 pIfIoInt->pfnSetAllocationSize = vdIOIntSetAllocationSize;
4969 pIfIoInt->pfnReadUser = vdIOIntReadUser;
4970 pIfIoInt->pfnWriteUser = vdIOIntWriteUser;
4971 pIfIoInt->pfnReadMeta = vdIOIntReadMeta;
4972 pIfIoInt->pfnWriteMeta = vdIOIntWriteMeta;
4973 pIfIoInt->pfnMetaXferRelease = vdIOIntMetaXferRelease;
4974 pIfIoInt->pfnFlush = vdIOIntFlush;
4975 pIfIoInt->pfnIoCtxCopyFrom = vdIOIntIoCtxCopyFrom;
4976 pIfIoInt->pfnIoCtxCopyTo = vdIOIntIoCtxCopyTo;
4977 pIfIoInt->pfnIoCtxSet = vdIOIntIoCtxSet;
4978 pIfIoInt->pfnIoCtxSegArrayCreate = vdIOIntIoCtxSegArrayCreate;
4979 pIfIoInt->pfnIoCtxCompleted = vdIOIntIoCtxCompleted;
4980 pIfIoInt->pfnIoCtxIsSynchronous = vdIOIntIoCtxIsSynchronous;
4981 pIfIoInt->pfnIoCtxIsZero = vdIOIntIoCtxIsZero;
4982 pIfIoInt->pfnIoCtxGetDataUnitSize = vdIOIntIoCtxGetDataUnitSize;
4983}
4984
4985/**
4986 * Internally used completion handler for synchronous I/O contexts.
4987 */
4988static DECLCALLBACK(void) vdIoCtxSyncComplete(void *pvUser1, void *pvUser2, int rcReq)
4989{
4990 RT_NOREF2(pvUser1, rcReq);
4991 RTSEMEVENT hEvent = (RTSEMEVENT)pvUser2;
4992
4993 RTSemEventSignal(hEvent);
4994}
4995
4996
4997VBOXDDU_DECL(int) VDInit(void)
4998{
4999 int rc = vdPluginInit();
5000 LogRel(("VD: VDInit finished with %Rrc\n", rc));
5001 return rc;
5002}
5003
5004
5005VBOXDDU_DECL(int) VDShutdown(void)
5006{
5007 return vdPluginTerm();
5008}
5009
5010
5011VBOXDDU_DECL(int) VDPluginLoadFromFilename(const char *pszFilename)
5012{
5013 if (!vdPluginIsInitialized())
5014 {
5015 int rc = VDInit();
5016 if (RT_FAILURE(rc))
5017 return rc;
5018 }
5019
5020 return vdPluginLoadFromFilename(pszFilename);
5021}
5022
5023/**
5024 * Load all plugins from a given path.
5025 *
5026 * @returns VBox statuse code.
5027 * @param pszPath The path to load plugins from.
5028 */
5029VBOXDDU_DECL(int) VDPluginLoadFromPath(const char *pszPath)
5030{
5031 if (!vdPluginIsInitialized())
5032 {
5033 int rc = VDInit();
5034 if (RT_FAILURE(rc))
5035 return rc;
5036 }
5037
5038 return vdPluginLoadFromPath(pszPath);
5039}
5040
5041
5042VBOXDDU_DECL(int) VDPluginUnloadFromFilename(const char *pszFilename)
5043{
5044 if (!vdPluginIsInitialized())
5045 {
5046 int rc = VDInit();
5047 if (RT_FAILURE(rc))
5048 return rc;
5049 }
5050
5051 return vdPluginUnloadFromFilename(pszFilename);
5052}
5053
5054
5055VBOXDDU_DECL(int) VDPluginUnloadFromPath(const char *pszPath)
5056{
5057 if (!vdPluginIsInitialized())
5058 {
5059 int rc = VDInit();
5060 if (RT_FAILURE(rc))
5061 return rc;
5062 }
5063
5064 return vdPluginUnloadFromPath(pszPath);
5065}
5066
5067
5068VBOXDDU_DECL(int) VDBackendInfo(unsigned cEntriesAlloc, PVDBACKENDINFO pEntries,
5069 unsigned *pcEntriesUsed)
5070{
5071 int rc = VINF_SUCCESS;
5072
5073 LogFlowFunc(("cEntriesAlloc=%u pEntries=%#p pcEntriesUsed=%#p\n", cEntriesAlloc, pEntries, pcEntriesUsed));
5074 /* Check arguments. */
5075 AssertMsgReturn(cEntriesAlloc, ("cEntriesAlloc=%u\n", cEntriesAlloc), VERR_INVALID_PARAMETER);
5076 AssertPtrReturn(pEntries, VERR_INVALID_POINTER);
5077 AssertPtrReturn(pcEntriesUsed, VERR_INVALID_POINTER);
5078 if (!vdPluginIsInitialized())
5079 VDInit();
5080
5081 uint32_t cBackends = vdGetImageBackendCount();
5082 if (cEntriesAlloc < cBackends)
5083 {
5084 *pcEntriesUsed = cBackends;
5085 return VERR_BUFFER_OVERFLOW;
5086 }
5087
5088 for (unsigned i = 0; i < cBackends; i++)
5089 {
5090 PCVDIMAGEBACKEND pBackend;
5091 rc = vdQueryImageBackend(i, &pBackend);
5092 AssertRC(rc);
5093
5094 pEntries[i].pszBackend = pBackend->pszBackendName;
5095 pEntries[i].uBackendCaps = pBackend->uBackendCaps;
5096 pEntries[i].paFileExtensions = pBackend->paFileExtensions;
5097 pEntries[i].paConfigInfo = pBackend->paConfigInfo;
5098 pEntries[i].pfnComposeLocation = pBackend->pfnComposeLocation;
5099 pEntries[i].pfnComposeName = pBackend->pfnComposeName;
5100 }
5101
5102 LogFlowFunc(("returns %Rrc *pcEntriesUsed=%u\n", rc, cBackends));
5103 *pcEntriesUsed = cBackends;
5104 return rc;
5105}
5106
5107
5108VBOXDDU_DECL(int) VDBackendInfoOne(const char *pszBackend, PVDBACKENDINFO pEntry)
5109{
5110 LogFlowFunc(("pszBackend=%#p pEntry=%#p\n", pszBackend, pEntry));
5111 /* Check arguments. */
5112 AssertPtrReturn(pszBackend, VERR_INVALID_POINTER);
5113 AssertPtrReturn(pEntry, VERR_INVALID_POINTER);
5114 if (!vdPluginIsInitialized())
5115 VDInit();
5116
5117 PCVDIMAGEBACKEND pBackend;
5118 int rc = vdFindImageBackend(pszBackend, &pBackend);
5119 if (RT_SUCCESS(rc))
5120 {
5121 pEntry->pszBackend = pBackend->pszBackendName;
5122 pEntry->uBackendCaps = pBackend->uBackendCaps;
5123 pEntry->paFileExtensions = pBackend->paFileExtensions;
5124 pEntry->paConfigInfo = pBackend->paConfigInfo;
5125 }
5126
5127 return rc;
5128}
5129
5130
5131VBOXDDU_DECL(int) VDFilterInfo(unsigned cEntriesAlloc, PVDFILTERINFO pEntries,
5132 unsigned *pcEntriesUsed)
5133{
5134 int rc = VINF_SUCCESS;
5135
5136 LogFlowFunc(("cEntriesAlloc=%u pEntries=%#p pcEntriesUsed=%#p\n", cEntriesAlloc, pEntries, pcEntriesUsed));
5137 /* Check arguments. */
5138 AssertMsgReturn(cEntriesAlloc,
5139 ("cEntriesAlloc=%u\n", cEntriesAlloc),
5140 VERR_INVALID_PARAMETER);
5141 AssertPtrReturn(pEntries, VERR_INVALID_POINTER);
5142 AssertPtrReturn(pcEntriesUsed, VERR_INVALID_POINTER);
5143 if (!vdPluginIsInitialized())
5144 VDInit();
5145
5146 uint32_t cBackends = vdGetFilterBackendCount();
5147 if (cEntriesAlloc < cBackends)
5148 {
5149 *pcEntriesUsed = cBackends;
5150 return VERR_BUFFER_OVERFLOW;
5151 }
5152
5153 for (unsigned i = 0; i < cBackends; i++)
5154 {
5155 PCVDFILTERBACKEND pBackend;
5156 rc = vdQueryFilterBackend(i, &pBackend);
5157 pEntries[i].pszFilter = pBackend->pszBackendName;
5158 pEntries[i].paConfigInfo = pBackend->paConfigInfo;
5159 }
5160
5161 LogFlowFunc(("returns %Rrc *pcEntriesUsed=%u\n", rc, cBackends));
5162 *pcEntriesUsed = cBackends;
5163 return rc;
5164}
5165
5166
5167VBOXDDU_DECL(int) VDFilterInfoOne(const char *pszFilter, PVDFILTERINFO pEntry)
5168{
5169 LogFlowFunc(("pszFilter=%#p pEntry=%#p\n", pszFilter, pEntry));
5170 /* Check arguments. */
5171 AssertPtrReturn(pszFilter, VERR_INVALID_POINTER);
5172 AssertPtrReturn(pEntry, VERR_INVALID_POINTER);
5173 if (!vdPluginIsInitialized())
5174 VDInit();
5175
5176 PCVDFILTERBACKEND pBackend;
5177 int rc = vdFindFilterBackend(pszFilter, &pBackend);
5178 if (RT_SUCCESS(rc))
5179 {
5180 pEntry->pszFilter = pBackend->pszBackendName;
5181 pEntry->paConfigInfo = pBackend->paConfigInfo;
5182 }
5183
5184 return rc;
5185}
5186
5187
5188VBOXDDU_DECL(int) VDCreate(PVDINTERFACE pVDIfsDisk, VDTYPE enmType, PVDISK *ppDisk)
5189{
5190 int rc = VINF_SUCCESS;
5191 PVDISK pDisk = NULL;
5192
5193 LogFlowFunc(("pVDIfsDisk=%#p\n", pVDIfsDisk));
5194 /* Check arguments. */
5195 AssertPtrReturn(ppDisk, VERR_INVALID_POINTER);
5196
5197 do
5198 {
5199 pDisk = (PVDISK)RTMemAllocZ(sizeof(VDISK));
5200 if (pDisk)
5201 {
5202 pDisk->u32Signature = VDISK_SIGNATURE;
5203 pDisk->enmType = enmType;
5204 pDisk->cImages = 0;
5205 pDisk->pBase = NULL;
5206 pDisk->pLast = NULL;
5207 pDisk->cbSize = 0;
5208 pDisk->PCHSGeometry.cCylinders = 0;
5209 pDisk->PCHSGeometry.cHeads = 0;
5210 pDisk->PCHSGeometry.cSectors = 0;
5211 pDisk->LCHSGeometry.cCylinders = 0;
5212 pDisk->LCHSGeometry.cHeads = 0;
5213 pDisk->LCHSGeometry.cSectors = 0;
5214 pDisk->pVDIfsDisk = pVDIfsDisk;
5215 pDisk->pInterfaceError = NULL;
5216 pDisk->pInterfaceThreadSync = NULL;
5217 pDisk->pIoCtxLockOwner = NULL;
5218 pDisk->pIoCtxHead = NULL;
5219 pDisk->fLocked = false;
5220 pDisk->hMemCacheIoCtx = NIL_RTMEMCACHE;
5221 pDisk->hMemCacheIoTask = NIL_RTMEMCACHE;
5222 RTListInit(&pDisk->ListFilterChainWrite);
5223 RTListInit(&pDisk->ListFilterChainRead);
5224
5225 /* Create the I/O ctx cache */
5226 rc = RTMemCacheCreate(&pDisk->hMemCacheIoCtx, sizeof(VDIOCTX), 0, UINT32_MAX,
5227 NULL, NULL, NULL, 0);
5228 if (RT_FAILURE(rc))
5229 break;
5230
5231 /* Create the I/O task cache */
5232 rc = RTMemCacheCreate(&pDisk->hMemCacheIoTask, sizeof(VDIOTASK), 0, UINT32_MAX,
5233 NULL, NULL, NULL, 0);
5234 if (RT_FAILURE(rc))
5235 break;
5236
5237 pDisk->pInterfaceError = VDIfErrorGet(pVDIfsDisk);
5238 pDisk->pInterfaceThreadSync = VDIfThreadSyncGet(pVDIfsDisk);
5239
5240 *ppDisk = pDisk;
5241 }
5242 else
5243 {
5244 rc = VERR_NO_MEMORY;
5245 break;
5246 }
5247 } while (0);
5248
5249 if ( RT_FAILURE(rc)
5250 && pDisk)
5251 {
5252 if (pDisk->hMemCacheIoCtx != NIL_RTMEMCACHE)
5253 RTMemCacheDestroy(pDisk->hMemCacheIoCtx);
5254 if (pDisk->hMemCacheIoTask != NIL_RTMEMCACHE)
5255 RTMemCacheDestroy(pDisk->hMemCacheIoTask);
5256 }
5257
5258 LogFlowFunc(("returns %Rrc (pDisk=%#p)\n", rc, pDisk));
5259 return rc;
5260}
5261
5262
5263VBOXDDU_DECL(int) VDDestroy(PVDISK pDisk)
5264{
5265 int rc = VINF_SUCCESS;
5266 LogFlowFunc(("pDisk=%#p\n", pDisk));
5267 do
5268 {
5269 /* sanity check */
5270 AssertPtrBreak(pDisk);
5271 AssertMsg(pDisk->u32Signature == VDISK_SIGNATURE, ("u32Signature=%08x\n", pDisk->u32Signature));
5272 Assert(!pDisk->fLocked);
5273
5274 rc = VDCloseAll(pDisk);
5275 int rc2 = VDFilterRemoveAll(pDisk);
5276 if (RT_SUCCESS(rc))
5277 rc = rc2;
5278
5279 RTMemCacheDestroy(pDisk->hMemCacheIoCtx);
5280 RTMemCacheDestroy(pDisk