Index: /trunk/src/VBox/Devices/Storage/ATAController.cpp
===================================================================
--- /trunk/src/VBox/Devices/Storage/ATAController.cpp	(revision 24095)
+++ /trunk/src/VBox/Devices/Storage/ATAController.cpp	(revision 24096)
@@ -248,9 +248,7 @@
     rc = RTSemMutexRelease(pCtl->AsyncIORequestMutex);
     AssertRC(rc);
-    LogBird(("ata: %x: signalling\n", pCtl->IOPortBase1));
     rc = PDMR3CritSectScheduleExitEvent(&pCtl->lock, pCtl->AsyncIOSem);
     if (RT_FAILURE(rc))
     {
-        LogBird(("ata: %x: schedule failed, rc=%Rrc\n", pCtl->IOPortBase1, rc));
         rc = RTSemEventSignal(pCtl->AsyncIOSem);
         AssertRC(rc);
@@ -3083,21 +3081,39 @@
 {
     uint64_t        u64Start;
+    bool            fRc;
+
+    /* Hope for the simple way out...  */
+    if (ataAsyncIOIsIdle(pCtl, false /*fStrict*/))
+        return true;
 
     /*
-     * Wait for any pending async operation to finish
+     * Have to wait. Do the setup while owning the mutex to avoid races.
      */
+    RTSemMutexRequest(pCtl->AsyncIORequestMutex, RT_INDEFINITE_WAIT);
+
+    RTThreadUserReset(pCtl->AsyncIOThread);
+    ASMAtomicWriteBool(&pCtl->fSignalIdle, true);
+
+    RTSemMutexRelease(pCtl->AsyncIORequestMutex);
+
     u64Start = RTTimeMilliTS();
     for (;;)
     {
-        if (ataAsyncIOIsIdle(pCtl, false))
-            return true;
+        fRc = ataAsyncIOIsIdle(pCtl, false /*fStrict*/);
+        if (fRc)
+            break;
+
         if (RTTimeMilliTS() - u64Start >= cMillies)
             break;
 
-        /* Sleep for a bit. */
-        RTThreadSleep(100);
-    }
-
-    return false;
+        int rc = RTThreadUserWait(pCtl->AsyncIOThread, 100 /*ms*/);
+        AssertMsg(   (   RT_SUCCESS(rc)
+                      && ataAsyncIOIsIdle(pCtl, false /*fStrict*/))
+                  || rc == VERR_TIMEOUT,
+                  ("rc=%Rrc irq=%u\n", rc, pCtl->irq));
+    }
+
+    ASMAtomicWriteBool(&pCtl->fSignalIdle, false);
+    return fRc;
 }
 
@@ -3830,4 +3846,26 @@
 }
 
+
+/**
+ * Signal ataWaitForAsyncIOIsIdle (via the I/O thread's user event) that we're idle.
+ * Does nothing unless fSignalIdle is set and ataAsyncIOIsIdle() confirms that we actually are.
+ * @param   pCtl        The controller whose AsyncIOThread user event gets signalled.
+ */
+static void ataAsyncSignalIdle(PAHCIATACONTROLLER pCtl)
+{
+    /*
+     * Take the mutex here and recheck both conditions: the waiter arms fSignalIdle under this
+     * same mutex, and there might be interesting races, like in the ataReset code.
+     */
+    int rc = RTSemMutexRequest(pCtl->AsyncIORequestMutex, RT_INDEFINITE_WAIT); AssertRC(rc);
+
+    if (    pCtl->fSignalIdle
+        &&  ataAsyncIOIsIdle(pCtl, false /*fStrict*/))
+        RTThreadUserSignal(pCtl->AsyncIOThread);
+
+    rc = RTSemMutexRelease(pCtl->AsyncIORequestMutex); AssertRC(rc);
+}
+
+
 /** Asynch I/O thread for an interface. Once upon a time this was readable
  * code with several loops and a different semaphore for each purpose. But
@@ -3850,4 +3888,6 @@
         while (pCtl->fRedoIdle)
         {
+            if (pCtl->fSignalIdle)
+                ataAsyncSignalIdle(pCtl);
             rc = RTSemEventWait(pCtl->SuspendIOSem, RT_INDEFINITE_WAIT);
             if (RT_FAILURE(rc) || pCtl->fShutdown)
@@ -3860,7 +3900,7 @@
         if (pReq == NULL)
         {
-            LogBird(("ata: %x: going to sleep...\n", pCtl->IOPortBase1));
+            if (pCtl->fSignalIdle)
+                ataAsyncSignalIdle(pCtl);
             rc = RTSemEventWait(pCtl->AsyncIOSem, RT_INDEFINITE_WAIT);
-            LogBird(("ata: %x: waking up\n", pCtl->IOPortBase1));
             if (RT_FAILURE(rc) || pCtl->fShutdown)
                 break;
@@ -3892,7 +3932,5 @@
         {
         STAM_PROFILE_START(&pCtl->StatLockWait, a);
-        LogBird(("ata: %x: entering critsect\n", pCtl->IOPortBase1));
         PDMCritSectEnter(&pCtl->lock, VINF_SUCCESS);
-        LogBird(("ata: %x: entered\n", pCtl->IOPortBase1));
         STAM_PROFILE_STOP(&pCtl->StatLockWait, a);
         }
@@ -4291,23 +4329,12 @@
         }
 
-        LogBird(("ata: %x: leaving critsect\n", pCtl->IOPortBase1));
         PDMCritSectLeave(&pCtl->lock);
     }
 
-    /* Cleanup the state.  */
-    if (pCtl->AsyncIOSem)
-    {
-        RTSemEventDestroy(pCtl->AsyncIOSem);
-        pCtl->AsyncIOSem = NIL_RTSEMEVENT;
-    }
-    if (pCtl->SuspendIOSem)
-    {
-        RTSemEventDestroy(pCtl->SuspendIOSem);
-        pCtl->SuspendIOSem = NIL_RTSEMEVENT;
-    }
+    /* Signal the ultimate idleness. */
+    RTThreadUserSignal(ThreadSelf);
+
     /* Do not destroy request mutex yet, still needed for proper shutdown. */
     pCtl->fShutdown = false;
-    /* This must be last, as it also signals thread exit to EMT. */
-    pCtl->AsyncIOThread = NIL_RTTHREAD;
 
     Log2(("%s: Ctl: return %Rrc\n", __FUNCTION__, rc));
@@ -4606,7 +4633,5 @@
     else
         AssertMsgFailed(("ataIOPortWrite1: unsupported write to port %x val=%x size=%d\n", Port, u32, cb));
-    LogBird(("ata: leaving critsect\n"));
     PDMCritSectLeave(&pCtl->lock);
-    LogBird(("ata: left critsect\n"));
     return rc;
 }
@@ -4861,40 +4886,19 @@
 
     /*
-     * Terminate all async helper threads
+     * Terminate the async helper thread and wait for it to finish up.
      */
     if (pCtl->AsyncIOThread != NIL_RTTHREAD)
     {
-        ASMAtomicXchgU32(&pCtl->fShutdown, true);
+        ASMAtomicWriteU32(&pCtl->fShutdown, true);
         rc = RTSemEventSignal(pCtl->AsyncIOSem);
         AssertRC(rc);
-    }
-
-    if (pCtl->CTX_SUFF(pDevIns))
-    {
-        /*
-         * Wait for them to complete whatever they are doing and then
-         * for them to terminate.
-         */
-        if (ataWaitForAllAsyncIOIsIdle(pCtl, 20000))
-        {
-            uint64_t    u64Start = RTTimeMilliTS();
-            bool        fAllDone;
-            for (;;)
-            {
-                /* check */
-                fAllDone = true;
-                fAllDone &= (pCtl->AsyncIOThread == NIL_RTTHREAD);
-
-                if (    fAllDone
-                    ||  RTTimeMilliTS() - u64Start >= 500)
-                    break;
-
-                /* Sleep for a bit. */
-                RTThreadSleep(100);
-            }
-            AssertMsg(fAllDone, ("Some of the async I/O threads are still running!\n"));
-        }
+
+        rc = RTThreadWait(pCtl->AsyncIOThread, 30000 /* 30 s*/, NULL);
+        if (RT_SUCCESS(rc))
+            pCtl->AsyncIOThread = NIL_RTTHREAD;
         else
-            AssertMsgFailed(("Async I/O is still busy!\n"));
+            LogRel(("PIIX3 ATA Dtor: Ctl/irq=%u is still executing, DevSel=%d AIOIf=%d CmdIf0=%#04x CmdIf1=%#04x rc=%Rrc\n",
+                    pCtl->irq, pCtl->iSelectedIf, pCtl->iAIOIf,
+                    pCtl->aIfs[0].uATARegCommand, pCtl->aIfs[1].uATARegCommand, rc));
     }
 
@@ -4902,8 +4906,29 @@
      * Now the request mutexes are no longer needed. Free resources.
      */
-    if (pCtl->AsyncIORequestMutex)
+    if (pCtl->AsyncIORequestMutex != NIL_RTSEMMUTEX)
     {
         RTSemMutexDestroy(pCtl->AsyncIORequestMutex);
-        pCtl->AsyncIORequestMutex = NIL_RTSEMEVENT;
+        pCtl->AsyncIORequestMutex = NIL_RTSEMMUTEX;
+    }
+    if (pCtl->AsyncIOSem != NIL_RTSEMEVENT)
+    {
+        RTSemEventDestroy(pCtl->AsyncIOSem);
+        pCtl->AsyncIOSem = NIL_RTSEMEVENT;
+    }
+    if (pCtl->SuspendIOSem != NIL_RTSEMEVENT)
+    {
+        RTSemEventDestroy(pCtl->SuspendIOSem);
+        pCtl->SuspendIOSem = NIL_RTSEMEVENT;
+    }
+
+    /* try one final time */
+    if (pCtl->AsyncIOThread != NIL_RTTHREAD)
+    {
+        rc = RTThreadWait(pCtl->AsyncIOThread, 1 /*ms*/, NULL);
+        if (RT_SUCCESS(rc))
+        {
+            pCtl->AsyncIOThread = NIL_RTTHREAD;
+            LogRel(("AHCI ATA Dtor: Ctl/irq=%u actually completed.\n", pCtl->irq));
+        }
     }
 
@@ -5475,4 +5500,8 @@
     pCtl->pDevInsR0 = PDMDEVINS_2_R0PTR(pDevIns);
     pCtl->pDevInsRC = PDMDEVINS_2_RCPTR(pDevIns);
+    pCtl->AsyncIOSem = NIL_RTSEMEVENT;
+    pCtl->SuspendIOSem = NIL_RTSEMEVENT;
+    pCtl->AsyncIORequestMutex = NIL_RTSEMMUTEX;
+    pCtl->AsyncIOThread = NIL_RTTHREAD;
 
     for (uint32_t j = 0; j < RT_ELEMENTS(pCtl->aIfs); j++)
@@ -5505,12 +5534,13 @@
     pCtl->uAsyncIOState = AHCIATA_AIO_NEW;
     rc = RTSemEventCreate(&pCtl->AsyncIOSem);
-    AssertRC(rc);
+    AssertRCReturn(rc, rc);
     rc = RTSemEventCreate(&pCtl->SuspendIOSem);
-    AssertRC(rc);
+    AssertRCReturn(rc, rc);
     rc = RTSemMutexCreate(&pCtl->AsyncIORequestMutex);
-    AssertRC(rc);
+    AssertRCReturn(rc, rc);
     ataAsyncIOClearRequests(pCtl);
-    rc = RTThreadCreate(&pCtl->AsyncIOThread, ataAsyncIOLoop, (void *)pCtl, 128*1024, RTTHREADTYPE_IO, 0, "ATA");
-    AssertRC(rc);
+    rc = RTThreadCreateF(&pCtl->AsyncIOThread, ataAsyncIOLoop, (void *)pCtl, 128*1024,
+                         RTTHREADTYPE_IO, RTTHREADFLAGS_WAITABLE, "AHCI-ATA-%u", pCtl->irq);
+    AssertRCReturn(rc, rc);
     Assert(pCtl->AsyncIOThread != NIL_RTTHREAD && pCtl->AsyncIOSem != NIL_RTSEMEVENT && pCtl->SuspendIOSem != NIL_RTSEMEVENT && pCtl->AsyncIORequestMutex != NIL_RTSEMMUTEX);
     Log(("%s: controller AIO thread id %#x; sem %p susp_sem %p mutex %p\n", __FUNCTION__, pCtl->AsyncIOThread, pCtl->AsyncIOSem, pCtl->SuspendIOSem, pCtl->AsyncIORequestMutex));
Index: /trunk/src/VBox/Devices/Storage/ATAController.h
===================================================================
--- /trunk/src/VBox/Devices/Storage/ATAController.h	(revision 24095)
+++ /trunk/src/VBox/Devices/Storage/ATAController.h	(revision 24096)
@@ -366,5 +366,8 @@
     /** The position at which to get a new request for the AIO thread. */
     uint8_t             AsyncIOReqTail;
-    uint8_t             Alignment3[2]; /**< Explicit padding of the 2 byte gap. */
+    /** Whether to call RTThreadUserSignal when idle.
+     * Before setting this, call RTThreadUserReset. */
+    bool volatile       fSignalIdle;
+    uint8_t             Alignment3[1]; /**< Explicit padding of the 1 byte gap. */
     /** Magic delay before triggering interrupts in DMA mode. */
     uint32_t            DelayIRQMillies;
Index: /trunk/src/VBox/Devices/Storage/DevAHCI.cpp
===================================================================
--- /trunk/src/VBox/Devices/Storage/DevAHCI.cpp	(revision 24095)
+++ /trunk/src/VBox/Devices/Storage/DevAHCI.cpp	(revision 24096)
@@ -574,4 +574,10 @@
     PDMCRITSECT                     lock;
 
+    /** Event semaphore signalled when a port may have gone idle while fSignalIdle is set. */
+    RTSEMEVENT                      hEvtIdle;
+#if HC_ARCH_BITS == 32
+    uint32_t                        Alignment7;
+#endif
+
     /** Bitmask of ports which asserted an interrupt. */
     uint32_t                        u32PortsInterrupted;
@@ -586,4 +592,8 @@
     /** If the new async interface is used if available. */
     bool                            fUseAsyncInterfaceIfAvailable;
+    /** Indicates that hEvtIdle should be signalled when a port is entering the
+     * idle state. */
+    bool volatile                   fSignalIdle;
+    bool                            afAlignment8[1];
 
     /** Number of usable ports on this controller. */
@@ -591,5 +601,5 @@
 
 #if HC_ARCH_BITS == 64
-    uint32_t                        Alignment7;
+    uint32_t                        Alignment9;
 #endif
 
@@ -4917,5 +4927,9 @@
              __FUNCTION__, pInterface, pvUser, pAhciPortTaskState->uTag));
 
-    return ahciTransferComplete(pAhciPort, pAhciPortTaskState);
+    int rc = ahciTransferComplete(pAhciPort, pAhciPortTaskState);
+
+    if (pAhciPort->uActTasksActive == 0 && pAhciPort->pAhciR3->fSignalIdle)
+        RTSemEventSignal(pAhciPort->pAhciR3->hEvtIdle);
+    return rc;
 }
 
@@ -5374,4 +5388,6 @@
 
         ASMAtomicXchgBool(&pAhciPort->fAsyncIOThreadIdle, true);
+        if (pAhci->fSignalIdle)
+            RTSemEventSignal(pAhci->hEvtIdle);
 
         rc = RTSemEventWait(pAhciPort->AsyncIORequestSem, 1000);
@@ -5637,4 +5653,7 @@
     }
 
+    if (pAhci->fSignalIdle)
+        RTSemEventSignal(pAhci->hEvtIdle);
+
     /* Free task state memory */
     if (pAhciPortTaskState->pSGListHead)
@@ -5768,4 +5787,7 @@
             ataControllerDestroy(&pAhci->aCts[i]);
 
+        RTSemEventDestroy(pAhci->hEvtIdle);
+        pAhci->hEvtIdle = NIL_RTSEMEVENT;
+
         PDMR3CritSectDelete(&pAhci->lock);
     }
@@ -5882,4 +5904,5 @@
     bool      fAllFinished;
 
+    ASMAtomicWriteBool(&pAhci->fSignalIdle, true);
     u64Start = RTTimeMilliTS();
     for (;;)
@@ -5905,7 +5928,10 @@
             break;
 
-        /* Sleep a bit. */
-        RTThreadSleep(100); /** @todo wait on something which can be woken up. 100ms is too long for teleporting VMs! */
-    }
+        /* Wait for a port to signal idleness. */
+        int rc = RTSemEventWait(pAhci->hEvtIdle, 100 /*ms*/);
+        AssertMsg(RT_SUCCESS(rc) || rc == VERR_TIMEOUT, ("%Rrc\n", rc)); NOREF(rc);
+    }
+
+    ASMAtomicWriteBool(&pAhci->fSignalIdle, false);
     return fAllFinished;
 }
@@ -6502,4 +6528,5 @@
     pThis->pDevInsR0 = PDMDEVINS_2_R0PTR(pDevIns);
     pThis->pDevInsRC = PDMDEVINS_2_RCPTR(pDevIns);
+    pThis->hEvtIdle  = NIL_RTSEMEVENT;
 
     PCIDevSetVendorId    (&pThis->dev, 0x8086); /* Intel */
@@ -6582,4 +6609,7 @@
         return rc;
     }
+
+    rc = RTSemEventCreate(&pThis->hEvtIdle);
+    AssertRCReturn(rc, rc);
 
     /* Create the timer for command completion coalescing feature. */
