VirtualBox

source: vbox/trunk/src/VBox/Runtime/r3/linux/ioqueue-iouringfile-provider.cpp@ 103795

Last change on this file since 103795 was 99739, checked in by vboxsync, 19 months ago

*: doxygen corrections (mostly about removing @returns from functions returning void).

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 35.2 KB
Line 
1/* $Id: ioqueue-iouringfile-provider.cpp 99739 2023-05-11 01:01:08Z vboxsync $ */
2/** @file
3 * IPRT - I/O queue, Linux io_uring interface I/O file provider.
4 */
5
6/*
7 * Copyright (C) 2019-2023 Oracle and/or its affiliates.
8 *
9 * This file is part of VirtualBox base platform packages, as
10 * available from https://www.virtualbox.org.
11 *
12 * This program is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU General Public License
14 * as published by the Free Software Foundation, in version 3 of the
15 * License.
16 *
17 * This program is distributed in the hope that it will be useful, but
18 * WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20 * General Public License for more details.
21 *
22 * You should have received a copy of the GNU General Public License
23 * along with this program; if not, see <https://www.gnu.org/licenses>.
24 *
25 * The contents of this file may alternatively be used under the terms
26 * of the Common Development and Distribution License Version 1.0
27 * (CDDL), a copy of it is provided in the "COPYING.CDDL" file included
28 * in the VirtualBox distribution, in which case the provisions of the
29 * CDDL are applicable instead of those of the GPL.
30 *
31 * You may elect to license modified versions of this file under the
32 * terms and conditions of either the GPL or the CDDL or both.
33 *
34 * SPDX-License-Identifier: GPL-3.0-only OR CDDL-1.0
35 */
36
37/** @page pg_rtioqueue_linux RTIoQueue - Linux io_uring implementation notes
38 * @internal
39 *
40 * The io_uring interface is the most recent interface added to the Linux kernel
41 * to deliver fast and efficient I/O. It was first added with kernel version 5.1 and is
42 * thus not available on most systems as of writing this backend (July 2019).
43 * It supersedes the old async I/O interface and does away with some of its restrictions, like
44 * having to disable caching for the file.
45 * The interface is centered around a submission and completion queue to queue multiple new
46 * requests for the kernel to process and get notified about completions to reduce the amount
47 * of context switches to an absolute minimum. It also offers advanced features like
48 * registering a fixed set of memory buffers for I/O upfront to reduce the processing overhead
49 * even more.
50 *
51 * The first implementation will only make use of the basic features and more advanced features
52 * will be added later.
53 * The adept developer probably noticed that the public IPRT I/O queue API resembles the io_uring
54 * interface in many aspects. This is not by accident but to reduce our own overhead as much as possible
55 * while still keeping a consistent platform independent API which allows efficient implementations on
56 * other hosts when they come up.
57 *
58 * The public kernel io_uring interface is completely defined in this file to avoid dragging in additional
59 * dependencies and to avoid compile problems on older hosts missing the interface just like it is done
60 * for the Linux RTFileAio* API. The necessary interface definitions and descriptions were retrieved from:
61 * * http://kernel.dk/io_uring.pdf
62 * * https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/include/uapi/linux/io_uring.h
63 */
64
65
66/*********************************************************************************************************************************
67* Header Files *
68*********************************************************************************************************************************/
69#define LOG_GROUP RTLOGGROUP_IOQUEUE
70#include <iprt/ioqueue.h>
71
72#include <iprt/assertcompile.h>
73#include <iprt/asm.h>
74#include <iprt/errcore.h>
75#include <iprt/file.h>
76#include <iprt/log.h>
77#include <iprt/mem.h>
78#include <iprt/string.h>
79
80#include <errno.h>
81#include <unistd.h>
82#include <signal.h>
83#include <sys/mman.h>
84#include <sys/syscall.h>
85#include <sys/uio.h>
86
87#include "internal/ioqueue.h"
88
89
90/*********************************************************************************************************************************
91* Defined Constants And Macros *
92*********************************************************************************************************************************/
93
/** The syscall number of io_uring_setup(). */
#define LNX_IOURING_SYSCALL_SETUP       425
/** The syscall number of io_uring_enter(). */
#define LNX_IOURING_SYSCALL_ENTER       426
/** The syscall number of io_uring_register(). */
#define LNX_IOURING_SYSCALL_REGISTER    427
/** eventfd2() syscall not associated with io_uring but used for kicking waiters.
 *
 * The eventfd2 syscall number is architecture specific (290 is the x86-64
 * value), so the definition from <sys/syscall.h> is used whenever the host
 * headers provide one; the literal is only a fallback. */
#ifdef SYS_eventfd2
# define LNX_SYSCALL_EVENTFD2           SYS_eventfd2
#else
# define LNX_SYSCALL_EVENTFD2           290
#endif
102
103
104/*********************************************************************************************************************************
105* Structures and Typedefs *
106*********************************************************************************************************************************/
107
108/**
109 * Linux io_uring completion event.
110 */
111typedef struct LNXIOURINGCQE
112{
113 /** Opaque user data associated with the completed request. */
114 uint64_t u64User;
115 /** The status code of the request. */
116 int32_t rcLnx;
117 /** Some flags which are not used as of now. */
118 uint32_t fFlags;
119} LNXIOURINGCQE;
120AssertCompileSize(LNXIOURINGCQE, 16);
121/** Pointer to a Linux io_uring completion event. */
122typedef LNXIOURINGCQE *PLNXIOURINGCQE;
123/** Pointer to a constant linux io_uring completion event. */
124typedef const LNXIOURINGCQE *PCLNXIOURINGCQE;
125
126
127/**
128 * Linux io_uring submission queue entry.
129 */
130typedef struct LNXIOURINGSQE
131{
132 /** The opcode for the request. */
133 uint8_t u8Opc;
134 /** Common flags for the request. */
135 uint8_t u8Flags;
136 /** Assigned I/O priority. */
137 uint16_t u16IoPrio;
138 /** The file descriptor the request is for. */
139 int32_t i32Fd;
140 /** The start offset into the file for the request. */
141 uint64_t u64OffStart;
142 /** Buffer pointer or Pointer to io vector array depending on opcode. */
143 uint64_t u64AddrBufIoVec;
144 /** Size of the buffer in bytes or number of io vectors. */
145 uint32_t u32BufIoVecSz;
146 /** Opcode dependent data. */
147 union
148 {
149 /** Flags for read/write requests. */
150 uint32_t u32KrnlRwFlags;
151 /** Flags for fsync() like requests. */
152 uint32_t u32FsyncFlags;
153 /** Flags for poll() like requests. */
154 uint16_t u16PollFlags;
155 /** Flags for sync_file_range() like requests. */
156 uint32_t u32SyncFileRangeFlags;
157 /** Flags for requests requiring a msg structure. */
158 uint32_t u32MsgFlags;
159 } uOpc;
160 /** Opaque user data associated with the request and returned durign completion. */
161 uint64_t u64User;
162 /** Request type dependent data. */
163 union
164 {
165 /** Fixed buffer index if indicated by the request flags. */
166 uint16_t u16FixedBufIdx;
167 /** Padding to align the structure to 64 bytes. */
168 uint64_t au64Padding[3];
169 } uReq;
170} LNXIOURINGSQE;
171AssertCompileSize(LNXIOURINGSQE, 64);
172/** Pointer to a Linux io_uring submission queue entry. */
173typedef LNXIOURINGSQE *PLNXIOURINGSQE;
174/** Pointer to a constant Linux io_uring submission queue entry. */
175typedef const LNXIOURINGSQE *PCLNXIOURINGSQE;
176
177
178/**
179 * Linux u_ioring SQ ring header structure to maintain the queue.
180 */
181typedef struct LNXIOURINGSQ
182{
183 /** The current head position to fill in new requests. */
184 uint32_t u32OffHead;
185 /** The current tail position the kernel starts processing from. */
186 uint32_t u32OffTail;
187 /** The mask for the head and tail counters to apply to retrieve the index. */
188 uint32_t u32OffRingMask;
189 /** Number of entries in the SQ ring. */
190 uint32_t u32OffRingEntries;
191 /** Flags set asychronously by the kernel. */
192 uint32_t u32OffFlags;
193 /** Counter of dropped requests. */
194 uint32_t u32OffDroppedReqs;
195 /** Offset where to find the array of SQ entries. */
196 uint32_t u32OffArray;
197 /** Reserved. */
198 uint32_t u32Rsvd0;
199 /** Reserved. */
200 uint64_t u64Rsvd1;
201} LNXIOURINGSQ;
202AssertCompileSize(LNXIOURINGSQ, 40);
203/** Pointer to a Linux u_ioring SQ ring header. */
204typedef LNXIOURINGSQ *PLNXIOURINGSQ;
205/** Pointer to a constant Linux u_ioring SQ ring header. */
206typedef const LNXIOURINGSQ *PCLNXIOURINGSQ;
207
208
209/**
210 * Linux io_uring CQ ring header structure to maintain the queue.
211 */
212typedef struct LNXIOURINGCQ
213{
214 /** The current head position the kernel modifies when completion events happen. */
215 uint32_t u32OffHead;
216 /** The current tail position to read completion events from. */
217 uint32_t u32OffTail;
218 /** The mask for the head and tail counters to apply to retrieve the index. */
219 uint32_t u32OffRingMask;
220 /** Number of entries in the CQ ring. */
221 uint32_t u32OffRingEntries;
222 /** Number of CQ overflows happened. */
223 uint32_t u32OffOverflowCnt;
224 /** */
225 uint32_t u32OffCqes;
226 /** Reserved. */
227 uint64_t au64Rsvd0[2];
228} LNXIOURINGCQ;
229AssertCompileSize(LNXIOURINGCQ, 40);
230/** Pointer to a Linux u_ioring CQ ring header. */
231typedef LNXIOURINGCQ *PLNXIOURINGCQ;
232/** Pointer to a constant Linux u_ioring CQ ring header. */
233typedef const LNXIOURINGCQ *PCLNXIOURINGCQ;
234
235
236/**
237 * Linux io_uring parameters passed to io_uring_setup().
238 */
239typedef struct LNXIOURINGPARAMS
240{
241 /** Number of SQ entries requested, must be power of 2. */
242 uint32_t u32SqEntriesCnt;
243 /** Number of CQ entries requested, must be power of 2. */
244 uint32_t u32CqEntriesCnt;
245 /** Flags for the ring, , see LNX_IOURING_SETUP_F_*. */
246 uint32_t u32Flags;
247 /** Affinity of the kernel side SQ polling thread if enabled. */
248 uint32_t u32SqPollCpu;
249 /** Milliseconds after the kernel side SQ polling thread goes to sleep
250 * if there is are no requests to process. */
251 uint32_t u32SqPollIdleMs;
252 /** Reserved. */
253 uint32_t au32Rsvd0[5];
254 /** Offsets returned for the submission queue. */
255 LNXIOURINGSQ SqOffsets;
256 /** Offsets returned for the completion queue. */
257 LNXIOURINGCQ CqOffsets;
258} LNXIOURINGPARAMS;
259/** Pointer to Linux io_uring parameters. */
260typedef LNXIOURINGPARAMS *PLNXIOURINGPARAMS;
261/** Pointer to constant Linux io_uring parameters. */
262typedef const LNXIOURINGPARAMS *PCLNXIOURINGPARAMS;
263
264
/** @name Opcodes valid for LNXIOURINGSQE::u8Opc.
 * @{ */
/** No operation, useful for profiling the interface. */
#define LNX_IOURING_OPC_NOP                 0
/** Vectored read, like preadv(). */
#define LNX_IOURING_OPC_READV               1
/** Vectored write, like pwritev(). */
#define LNX_IOURING_OPC_WRITEV              2
/** File synchronisation, like fsync(). */
#define LNX_IOURING_OPC_FSYNC               3
/** Read into a fixed, previously registered buffer. */
#define LNX_IOURING_OPC_READ_FIXED          4
/** Write from a fixed, previously registered buffer. */
#define LNX_IOURING_OPC_WRITE_FIXED         5
/** Adds a file descriptor to the pollset. */
#define LNX_IOURING_OPC_POLL_ADD            6
/** Removes a file descriptor from the pollset. */
#define LNX_IOURING_OPC_POLL_REMOVE         7
/** Range synchronisation, like sync_file_range(). */
#define LNX_IOURING_OPC_SYNC_FILE_RANGE     8
/** Message send, like sendmsg(). */
#define LNX_IOURING_OPC_SENDMSG             9
/** Message receive, like recvmsg(). */
#define LNX_IOURING_OPC_RECVMSG             10
/** @} */
290
291
/** @name Extra flags for LNX_IOURING_OPC_FSYNC requests (LNXIOURINGSQE::uOpc.u32FsyncFlags).
 * @{ */
/** Requests data sync behaviour for the fsync request (presumably the
 * fdatasync() equivalent - verify against the kernel documentation). */
#define LNX_IOURING_OPC_FSYNC_DATASYNC      RT_BIT_32(0)
/** @} */
297
298
/** @name Ring flags passed to the LNX_IOURING_SYSCALL_SETUP syscall (LNXIOURINGPARAMS::u32Flags).
 * @{ */
/** The I/O context is polled. */
#define LNX_IOURING_SETUP_F_IOPOLL          RT_BIT_32(0)
/** Let the kernel poll the submission queue. */
#define LNX_IOURING_SETUP_F_SQPOLL          RT_BIT_32(1)
/** Apply the given CPU affinity to the kernel thread polling the submission queue. */
#define LNX_IOURING_SETUP_F_SQAFF           RT_BIT_32(2)
/** @} */
308
309
/** @name Request flags for LNXIOURINGSQE::u8Flags.
 * @{ */
/** The file descriptor refers to a previously registered file. */
#define LNX_IOURING_SQE_F_FIXED_FILE        RT_BIT(0)
/** All active requests must complete before the flagged request is issued. */
#define LNX_IOURING_SQE_F_IO_DRAIN          RT_BIT(1)
/** The flagged request is linked to the one following it. */
#define LNX_IOURING_SQE_F_IO_LINK           RT_BIT(2)
/** @} */
319
320
/** @name Magic mmap() offsets selecting which part of the ring gets mapped.
 * @{ */
/** Selects the submission queue ring. */
#define LNX_IOURING_MMAP_OFF_SQ             UINT64_C(0)
/** Selects the completion queue ring. */
#define LNX_IOURING_MMAP_OFF_CQ             UINT64_C(0x8000000)
/** Selects the array of submission queue entries. */
#define LNX_IOURING_MMAP_OFF_SQES           UINT64_C(0x10000000)
/** @} */
330
331
/** @name Flags found in the SQ ring flags word.
 * @{ */
/** The kernel thread has to be woken up through io_uring_enter() to continue processing requests. */
#define LNX_IOURING_SQ_RING_F_NEED_WAKEUP   RT_BIT_32(0)
/** @} */
337
338
/** @name Flags accepted by the LNX_IOURING_SYSCALL_ENTER syscall.
 * @{ */
/** Fetch completion events for the completion queue. */
#define LNX_IOURING_ENTER_F_GETEVENTS       RT_BIT_32(0)
/** Wake up the suspended kernel thread processing the requests. */
#define LNX_IOURING_ENTER_F_SQ_WAKEUP       RT_BIT_32(1)
/** @} */
346
347
/** @name Operations understood by the LNX_IOURING_SYSCALL_REGISTER syscall.
 * @{ */
/** Registers a fixed set of buffers. */
#define LNX_IOURING_REGISTER_OPC_BUFFERS_REGISTER       0
/** Drops a previously registered fixed set of buffers. */
#define LNX_IOURING_REGISTER_OPC_BUFFERS_UNREGISTER     1
/** Registers a fixed set of files. */
#define LNX_IOURING_REGISTER_OPC_FILES_REGISTER         2
/** Drops a previously registered fixed set of files. */
#define LNX_IOURING_REGISTER_OPC_FILES_UNREGISTER       3
/** Associates an eventfd with the I/O ring. */
#define LNX_IOURING_REGISTER_OPC_EVENTFD_REGISTER       4
/** Removes a previously registered eventfd from the I/O ring. */
#define LNX_IOURING_REGISTER_OPC_EVENTFD_UNREGISTER     5
/** @} */
363
364
/**
 * Userspace view of the submission queue ring.
 *
 * @note The pointer members reference the SQ ring mapping which is shared
 *       with the kernel, which is why they are declared volatile. The mask
 *       and entry count are copied out of the mapping once during init.
 */
typedef struct RTIOQUEUESQ
{
    /** Location of the head counter. */
    volatile uint32_t           *pidxHead;
    /** Location of the tail counter. */
    volatile uint32_t           *pidxTail;
    /** Mask turning a counter value into a ring index. */
    uint32_t                    fRingMask;
    /** Number of entries in the ring. */
    uint32_t                    cEntries;
    /** Location of the global flags. */
    volatile uint32_t           *pfFlags;
    /** The indirection array holding the indices of the real SQ entries. */
    volatile uint32_t           *paidxSqes;
} RTIOQUEUESQ;
386
387
388/**
389 * CQ ring structure.
390 *
391 * @note Some members of this structure point to memory shared with the kernel,
392 * hence the volatile keyword.
393 */
394typedef struct RTIOQUEUECQ
395{
396 /** Pointer to the head counter. */
397 volatile uint32_t *pidxHead;
398 /** Pointer to the tail counter. */
399 volatile uint32_t *pidxTail;
400 /** Mask to apply for the counters to get to the index. */
401 uint32_t fRingMask;
402 /** Number of entries in the ring. */
403 uint32_t cEntries;
404 /** Pointer to the completion entry ring. */
405 volatile LNXIOURINGCQE *paCqes;
406} RTIOQUEUECQ;
407
408
409/**
410 * Internal I/O queue provider instance data.
411 */
412typedef struct RTIOQUEUEPROVINT
413{
414 /** The io_uring file descriptor. */
415 int iFdIoCtx;
416 /** The eventfd file descriptor registered with the ring. */
417 int iFdEvt;
418 /** The submission queue. */
419 RTIOQUEUESQ Sq;
420 /** The currently uncommitted tail for the SQ. */
421 uint32_t idxSqTail;
422 /** Numbere of uncommitted SQEs. */
423 uint32_t cSqesToCommit;
424 /** The completion queue. */
425 RTIOQUEUECQ Cq;
426 /** Pointer to the mapped SQES entries. */
427 PLNXIOURINGSQE paSqes;
428 /** Pointer to the iovec structure used for non S/G requests. */
429 struct iovec *paIoVecs;
430 /** Pointer returned by mmap() for the SQ ring, used for unmapping. */
431 void *pvMMapSqRing;
432 /** Pointer returned by mmap() for the CQ ring, used for unmapping. */
433 void *pvMMapCqRing;
434 /** Pointer returned by mmap() for the SQ entries array, used for unmapping. */
435 void *pvMMapSqes;
436 /** Size of the mapped SQ ring, used for unmapping. */
437 size_t cbMMapSqRing;
438 /** Size of the mapped CQ ring, used for unmapping. */
439 size_t cbMMapCqRing;
440 /** Size of the mapped SQ entries array, used for unmapping. */
441 size_t cbMMapSqes;
442 /** Flag whether the waiter was woken up externally. */
443 volatile bool fExtIntr;
444} RTIOQUEUEPROVINT;
445/** Pointer to the internal I/O queue provider instance data. */
446typedef RTIOQUEUEPROVINT *PRTIOQUEUEPROVINT;
447
448
449/*********************************************************************************************************************************
450* Internal Functions *
451*********************************************************************************************************************************/
452
453/**
454 * Syscall wrapper for io_uring_setup().
455 *
456 * @returns IPRT status code.
457 * @param cEntries Number of entries for submission and completion queues.
458 * @param pParams Additional parameters for the I/O ring and updated return values
459 * on success.
460 * @param piFdIoCtx Where to store the file descriptor of the I/O ring on success.
461 */
462DECLINLINE(int) rtIoQueueLnxIoURingSetup(uint32_t cEntries, PLNXIOURINGPARAMS pParams, int32_t *piFdIoCtx)
463{
464 int rcLnx = syscall(LNX_IOURING_SYSCALL_SETUP, cEntries, pParams);
465 if (RT_UNLIKELY(rcLnx == -1))
466 return RTErrConvertFromErrno(errno);
467
468 *piFdIoCtx = rcLnx;
469 return VINF_SUCCESS;
470}
471
472
473/**
474 * Syscall wrapper for io_uring_enter().
475 *
476 * @returns IPRT status code.
477 * @param iFdIoCtx The I/O ring file descriptor.
478 * @param cToSubmit Maximum number of requests waiting for processing.
479 * @param cMinComplete Minimum number of completion events to accumulate before returning.
480 * @param fFlags Flags for io_uring_enter(), see LNX_IOURING_ENTER_F_*.
481 */
482DECLINLINE(int) rtIoQueueLnxIoURingEnter(int32_t iFdIoCtx, uint32_t cToSubmit, uint32_t cMinComplete,
483 uint32_t fFlags)
484{
485 int rcLnx = syscall(LNX_IOURING_SYSCALL_ENTER, iFdIoCtx, cToSubmit, cMinComplete, fFlags,
486 NULL, 0);
487 if (RT_UNLIKELY(rcLnx == -1))
488 return RTErrConvertFromErrno(errno);
489
490 return VINF_SUCCESS;
491}
492
493
494/**
495 * Syscall wrapper for io_uring_register().
496 *
497 * @returns IPRT status code.
498 * @param iFdIoCtx The I/O ring file descriptor.
499 * @param uOpc Operation to perform, see LNX_IOURING_REGISTER_OPC_*.
500 * @param pvArg Opaque arguments.
501 * @param cArgs Number of arguments.
502 */
503DECLINLINE(int) rtIoQueueLnxIoURingRegister(int32_t iFdIoCtx, uint32_t uOpc, void *pvArg,
504 uint32_t cArgs)
505{
506 int rcLnx = syscall(LNX_IOURING_SYSCALL_REGISTER, iFdIoCtx, uOpc, pvArg, cArgs);
507 if (RT_UNLIKELY(rcLnx == -1))
508 return RTErrConvertFromErrno(errno);
509
510 return VINF_SUCCESS;
511}
512
513
514/**
515 * mmap() wrapper for the common bits and returning an IPRT status code.
516 *
517 * @returns IPRT status code.
518 * @param iFdIoCtx The I/O ring file descriptor.
519 * @param offMmap The mmap() offset.
520 * @param cbMmap How much to map.
521 * @param ppv Where to store the pointer to the mapping on success.
522 */
523DECLINLINE(int) rtIoQueueLnxIoURingMmap(int iFdIoCtx, off_t offMmap, size_t cbMmap, void **ppv)
524{
525 void *pv = mmap(0, cbMmap, PROT_READ | PROT_WRITE , MAP_SHARED | MAP_POPULATE, iFdIoCtx, offMmap);
526 if (pv != MAP_FAILED)
527 {
528 *ppv = pv;
529 return VINF_SUCCESS;
530 }
531
532 return RTErrConvertFromErrno(errno);
533}
534
535
536/**
537 * eventfd2() syscall wrapper.
538 *
539 * @returns IPRT status code.
540 * @param uValInit The initial value of the maintained counter.
541 * @param fFlags Flags controlling the eventfd behavior.
542 * @param piFdEvt Where to store the file descriptor of the eventfd object on success.
543 */
544DECLINLINE(int) rtIoQueueLnxEventfd2(uint32_t uValInit, uint32_t fFlags, int *piFdEvt)
545{
546 int rcLnx = syscall(LNX_SYSCALL_EVENTFD2, uValInit, fFlags);
547 if (RT_UNLIKELY(rcLnx == -1))
548 return RTErrConvertFromErrno(errno);
549
550 *piFdEvt = rcLnx;
551 return VINF_SUCCESS;
552}
553
554
555/**
556 * Checks the completion event queue for pending events.
557 *
558 * @param pThis The provider instance.
559 * @param paCEvt Pointer to the array of completion events.
560 * @param cCEvt Maximum number of completion events the array can hold.
561 * @param pcCEvtSeen Where to store the number of completion events processed.
562 */
563static void rtIoQueueLnxIoURingFileProvCqCheck(PRTIOQUEUEPROVINT pThis, PRTIOQUEUECEVT paCEvt,
564 uint32_t cCEvt, uint32_t *pcCEvtSeen)
565{
566 /* The fencing and atomic accesses are kind of overkill and probably not required (dev paranoia). */
567 ASMReadFence();
568 uint32_t idxCqHead = ASMAtomicReadU32(pThis->Cq.pidxHead);
569 uint32_t idxCqTail = ASMAtomicReadU32(pThis->Cq.pidxTail);
570 ASMReadFence();
571
572 uint32_t cCEvtSeen = 0;
573
574 while ( idxCqTail != idxCqHead
575 && cCEvtSeen < cCEvt)
576 {
577 /* Get the index. */
578 uint32_t idxCqe = idxCqHead & pThis->Cq.fRingMask;
579 volatile LNXIOURINGCQE *pCqe = &pThis->Cq.paCqes[idxCqe];
580
581 paCEvt->pvUser = (void *)(uintptr_t)pCqe->u64User;
582 if (pCqe->rcLnx >= 0)
583 {
584 paCEvt->rcReq = VINF_SUCCESS;
585 paCEvt->cbXfered = (size_t)pCqe->rcLnx;
586 }
587 else
588 paCEvt->rcReq = RTErrConvertFromErrno(-pCqe->rcLnx);
589
590#ifdef RT_STRICT /* poison */
591 memset((void *)pCqe, 0xff, sizeof(*pCqe));
592#endif
593
594 paCEvt++;
595 cCEvtSeen++;
596 idxCqHead++;
597 }
598
599 *pcCEvtSeen = cCEvtSeen;
600
601 /* Paranoia strikes again. */
602 ASMWriteFence();
603 ASMAtomicWriteU32(pThis->Cq.pidxHead, idxCqHead);
604 ASMWriteFence();
605}
606
607
608/** @interface_method_impl{RTIOQUEUEPROVVTABLE,pfnIsSupported} */
609static DECLCALLBACK(bool) rtIoQueueLnxIoURingFileProv_IsSupported(void)
610{
611 /*
612 * Try to create a simple I/O ring and close it again.
613 * The common code/public API already checked for the proper handle type.
614 */
615 int iFdIoCtx = 0;
616 bool fSupp = false;
617 LNXIOURINGPARAMS Params;
618 RT_ZERO(Params);
619
620 int rc = rtIoQueueLnxIoURingSetup(16, &Params, &iFdIoCtx);
621 if (RT_SUCCESS(rc))
622 {
623 /*
624 * Check that we can register an eventfd descriptor to get notified about
625 * completion events while being able to kick the waiter externally out of the wait.
626 */
627 int iFdEvt = 0;
628 rc = rtIoQueueLnxEventfd2(0 /*uValInit*/, 0 /*fFlags*/, &iFdEvt);
629 if (RT_SUCCESS(rc))
630 {
631 rc = rtIoQueueLnxIoURingRegister(iFdIoCtx, LNX_IOURING_REGISTER_OPC_EVENTFD_REGISTER,
632 &iFdEvt, 1 /*cArgs*/);
633 if (RT_SUCCESS(rc))
634 fSupp = true;
635
636 int rcLnx = close(iFdEvt); Assert(!rcLnx); RT_NOREF(rcLnx);
637 }
638 int rcLnx = close(iFdIoCtx); Assert(!rcLnx); RT_NOREF(rcLnx);
639 }
640
641 return fSupp;
642}
643
644
645/** @interface_method_impl{RTIOQUEUEPROVVTABLE,pfnQueueInit} */
646static DECLCALLBACK(int) rtIoQueueLnxIoURingFileProv_QueueInit(RTIOQUEUEPROV hIoQueueProv, uint32_t fFlags,
647 uint32_t cSqEntries, uint32_t cCqEntries)
648{
649 RT_NOREF(fFlags, cCqEntries);
650
651 PRTIOQUEUEPROVINT pThis = hIoQueueProv;
652 LNXIOURINGPARAMS Params;
653 RT_ZERO(Params);
654
655 pThis->cSqesToCommit = 0;
656 pThis->fExtIntr = false;
657
658 int rc = rtIoQueueLnxIoURingSetup(cSqEntries, &Params, &pThis->iFdIoCtx);
659 if (RT_SUCCESS(rc))
660 {
661 /* Map the rings into userspace. */
662 pThis->cbMMapSqRing = Params.SqOffsets.u32OffArray + Params.u32SqEntriesCnt * sizeof(uint32_t);
663 pThis->cbMMapCqRing = Params.CqOffsets.u32OffCqes + Params.u32CqEntriesCnt * sizeof(LNXIOURINGCQE);
664 pThis->cbMMapSqes = Params.u32SqEntriesCnt * sizeof(LNXIOURINGSQE);
665
666 pThis->paIoVecs = (struct iovec *)RTMemAllocZ(Params.u32SqEntriesCnt * sizeof(struct iovec));
667 if (RT_LIKELY(pThis->paIoVecs))
668 {
669 rc = rtIoQueueLnxEventfd2(0 /*uValInit*/, 0 /*fFlags*/, &pThis->iFdEvt);
670 if (RT_SUCCESS(rc))
671 {
672 rc = rtIoQueueLnxIoURingRegister(pThis->iFdIoCtx, LNX_IOURING_REGISTER_OPC_EVENTFD_REGISTER, &pThis->iFdEvt, 1 /*cArgs*/);
673 if (RT_SUCCESS(rc))
674 {
675 rc = rtIoQueueLnxIoURingMmap(pThis->iFdIoCtx, LNX_IOURING_MMAP_OFF_SQ, pThis->cbMMapSqRing, &pThis->pvMMapSqRing);
676 if (RT_SUCCESS(rc))
677 {
678 rc = rtIoQueueLnxIoURingMmap(pThis->iFdIoCtx, LNX_IOURING_MMAP_OFF_CQ, pThis->cbMMapCqRing, &pThis->pvMMapCqRing);
679 if (RT_SUCCESS(rc))
680 {
681 rc = rtIoQueueLnxIoURingMmap(pThis->iFdIoCtx, LNX_IOURING_MMAP_OFF_SQES, pThis->cbMMapSqes, &pThis->pvMMapSqes);
682 if (RT_SUCCESS(rc))
683 {
684 uint8_t *pbTmp = (uint8_t *)pThis->pvMMapSqRing;
685
686 pThis->Sq.pidxHead = (uint32_t *)(pbTmp + Params.SqOffsets.u32OffHead);
687 pThis->Sq.pidxTail = (uint32_t *)(pbTmp + Params.SqOffsets.u32OffTail);
688 pThis->Sq.fRingMask = *(uint32_t *)(pbTmp + Params.SqOffsets.u32OffRingMask);
689 pThis->Sq.cEntries = *(uint32_t *)(pbTmp + Params.SqOffsets.u32OffRingEntries);
690 pThis->Sq.pfFlags = (uint32_t *)(pbTmp + Params.SqOffsets.u32OffFlags);
691 pThis->Sq.paidxSqes = (uint32_t *)(pbTmp + Params.SqOffsets.u32OffArray);
692 pThis->idxSqTail = *pThis->Sq.pidxTail;
693
694 pThis->paSqes = (PLNXIOURINGSQE)pThis->pvMMapSqes;
695
696 pbTmp = (uint8_t *)pThis->pvMMapCqRing;
697
698 pThis->Cq.pidxHead = (uint32_t *)(pbTmp + Params.CqOffsets.u32OffHead);
699 pThis->Cq.pidxTail = (uint32_t *)(pbTmp + Params.CqOffsets.u32OffTail);
700 pThis->Cq.fRingMask = *(uint32_t *)(pbTmp + Params.CqOffsets.u32OffRingMask);
701 pThis->Cq.cEntries = *(uint32_t *)(pbTmp + Params.CqOffsets.u32OffRingEntries);
702 pThis->Cq.paCqes = (PLNXIOURINGCQE)(pbTmp + Params.CqOffsets.u32OffCqes);
703 return VINF_SUCCESS;
704 }
705
706 munmap(pThis->pvMMapCqRing, pThis->cbMMapCqRing);
707 }
708
709 munmap(pThis->pvMMapSqRing, pThis->cbMMapSqRing);
710 }
711
712 rc = rtIoQueueLnxIoURingRegister(pThis->iFdIoCtx, LNX_IOURING_REGISTER_OPC_EVENTFD_UNREGISTER, NULL, 0);
713 AssertRC(rc);
714 }
715
716 close(pThis->iFdEvt);
717 }
718
719 RTMemFree(pThis->paIoVecs);
720 }
721
722 int rcLnx = close(pThis->iFdIoCtx); Assert(!rcLnx); RT_NOREF(rcLnx);
723 }
724
725 return rc;
726}
727
728
729/** @interface_method_impl{RTIOQUEUEPROVVTABLE,pfnQueueDestroy} */
730static DECLCALLBACK(void) rtIoQueueLnxIoURingFileProv_QueueDestroy(RTIOQUEUEPROV hIoQueueProv)
731{
732 PRTIOQUEUEPROVINT pThis = hIoQueueProv;
733
734 int rcLnx = munmap(pThis->pvMMapSqRing, pThis->cbMMapSqRing); Assert(!rcLnx); RT_NOREF(rcLnx);
735 rcLnx = munmap(pThis->pvMMapCqRing, pThis->cbMMapCqRing); Assert(!rcLnx); RT_NOREF(rcLnx);
736 rcLnx = munmap(pThis->pvMMapSqes, pThis->cbMMapSqes); Assert(!rcLnx); RT_NOREF(rcLnx);
737
738 int rc = rtIoQueueLnxIoURingRegister(pThis->iFdIoCtx, LNX_IOURING_REGISTER_OPC_EVENTFD_UNREGISTER, NULL, 0);
739 AssertRC(rc);
740
741 close(pThis->iFdEvt);
742 close(pThis->iFdIoCtx);
743 RTMemFree(pThis->paIoVecs);
744
745 RT_ZERO(pThis);
746}
747
748
749/** @interface_method_impl{RTIOQUEUEPROVVTABLE,pfnHandleRegister} */
750static DECLCALLBACK(int) rtIoQueueLnxIoURingFileProv_HandleRegister(RTIOQUEUEPROV hIoQueueProv, PCRTHANDLE pHandle)
751{
752 RT_NOREF(hIoQueueProv, pHandle);
753 /** @todo Add support for fixed file sets later. */
754 return VINF_SUCCESS;
755}
756
757
758/** @interface_method_impl{RTIOQUEUEPROVVTABLE,pfnHandleDeregister} */
759static DECLCALLBACK(int) rtIoQueueLnxIoURingFileProv_HandleDeregister(RTIOQUEUEPROV hIoQueueProv, PCRTHANDLE pHandle)
760{
761 RT_NOREF(hIoQueueProv, pHandle);
762 /** @todo Add support for fixed file sets later. */
763 return VINF_SUCCESS;
764}
765
766
767/** @interface_method_impl{RTIOQUEUEPROVVTABLE,pfnReqPrepare} */
768static DECLCALLBACK(int) rtIoQueueLnxIoURingFileProv_ReqPrepare(RTIOQUEUEPROV hIoQueueProv, PCRTHANDLE pHandle, RTIOQUEUEOP enmOp,
769 uint64_t off, void *pvBuf, size_t cbBuf, uint32_t fReqFlags,
770 void *pvUser)
771{
772 PRTIOQUEUEPROVINT pThis = hIoQueueProv;
773 RT_NOREF(fReqFlags);
774
775 uint32_t idx = pThis->idxSqTail & pThis->Sq.fRingMask;
776 PLNXIOURINGSQE pSqe = &pThis->paSqes[idx];
777 struct iovec *pIoVec = &pThis->paIoVecs[idx];
778
779 pIoVec->iov_base = pvBuf;
780 pIoVec->iov_len = cbBuf;
781
782 pSqe->u8Flags = 0;
783 pSqe->u16IoPrio = 0;
784 pSqe->i32Fd = (int32_t)RTFileToNative(pHandle->u.hFile);
785 pSqe->u64OffStart = off;
786 pSqe->u64AddrBufIoVec = (uint64_t)(uintptr_t)pIoVec;
787 pSqe->u32BufIoVecSz = 1;
788 pSqe->u64User = (uint64_t)(uintptr_t)pvUser;
789
790 switch (enmOp)
791 {
792 case RTIOQUEUEOP_READ:
793 pSqe->u8Opc = LNX_IOURING_OPC_READV;
794 pSqe->uOpc.u32KrnlRwFlags = 0;
795 break;
796 case RTIOQUEUEOP_WRITE:
797 pSqe->u8Opc = LNX_IOURING_OPC_WRITEV;
798 pSqe->uOpc.u32KrnlRwFlags = 0;
799 break;
800 case RTIOQUEUEOP_SYNC:
801 pSqe->u8Opc = LNX_IOURING_OPC_FSYNC;
802 pSqe->uOpc.u32FsyncFlags = 0;
803 break;
804 default:
805 AssertMsgFailedReturn(("Invalid I/O queue operation: %d\n", enmOp),
806 VERR_INVALID_PARAMETER);
807 }
808
809 pThis->Sq.paidxSqes[idx] = idx;
810 pThis->idxSqTail++;
811 pThis->cSqesToCommit++;
812 return VINF_SUCCESS;
813}
814
815
816/** @interface_method_impl{RTIOQUEUEPROVVTABLE,pfnCommit} */
817static DECLCALLBACK(int) rtIoQueueLnxIoURingFileProv_Commit(RTIOQUEUEPROV hIoQueueProv, uint32_t *pcReqsCommitted)
818{
819 PRTIOQUEUEPROVINT pThis = hIoQueueProv;
820
821 ASMWriteFence();
822 ASMAtomicWriteU32(pThis->Sq.pidxTail, pThis->idxSqTail);
823 ASMWriteFence();
824
825 int rc = rtIoQueueLnxIoURingEnter(pThis->iFdIoCtx, pThis->cSqesToCommit, 0, 0 /*fFlags*/);
826 if (RT_SUCCESS(rc))
827 {
828 *pcReqsCommitted = pThis->cSqesToCommit;
829 pThis->cSqesToCommit = 0;
830 }
831
832 return rc;
833}
834
835
836/** @interface_method_impl{RTIOQUEUEPROVVTABLE,pfnEvtWait} */
837static DECLCALLBACK(int) rtIoQueueLnxIoURingFileProv_EvtWait(RTIOQUEUEPROV hIoQueueProv, PRTIOQUEUECEVT paCEvt, uint32_t cCEvt,
838 uint32_t cMinWait, uint32_t *pcCEvt, uint32_t fFlags)
839{
840 PRTIOQUEUEPROVINT pThis = hIoQueueProv;
841 int rc = VINF_SUCCESS;
842 uint32_t cCEvtSeen = 0;
843
844 RT_NOREF(fFlags);
845
846 /*
847 * Check the completion queue first for any completed events which might save us a
848 * context switch later on.
849 */
850 rtIoQueueLnxIoURingFileProvCqCheck(pThis, paCEvt, cCEvt, &cCEvtSeen);
851
852 while ( cCEvtSeen < cMinWait
853 && RT_SUCCESS(rc))
854 {
855 /*
856 * We can employ a blocking read on the event file descriptor, it will return
857 * either when woken up externally or when there are completion events pending.
858 */
859 uint64_t uCnt = 0; /**< The counter value returned upon a successful read(). */
860 ssize_t rcLnx = read(pThis->iFdEvt, &uCnt, sizeof(uCnt));
861 if (rcLnx == sizeof(uCnt))
862 {
863 uint32_t cCEvtThisSeen = 0;
864 rtIoQueueLnxIoURingFileProvCqCheck(pThis, &paCEvt[cCEvtSeen], cCEvt - cCEvtSeen, &cCEvtThisSeen);
865 cCEvtSeen += cCEvtThisSeen;
866
867 /* Whether we got woken up externally. */
868 if (ASMAtomicXchgBool(&pThis->fExtIntr, false))
869 rc = VERR_INTERRUPTED;
870 }
871 else if (rcLnx == -1)
872 rc = RTErrConvertFromErrno(errno);
873 else
874 AssertMsgFailed(("Unexpected read() -> 0\n"));
875 }
876
877 *pcCEvt = cCEvtSeen;
878 return rc;
879}
880
881
882/** @interface_method_impl{RTIOQUEUEPROVVTABLE,pfnEvtWaitWakeup} */
883static DECLCALLBACK(int) rtIoQueueLnxIoURingFileProv_EvtWaitWakeup(RTIOQUEUEPROV hIoQueueProv)
884{
885 PRTIOQUEUEPROVINT pThis = hIoQueueProv;
886 int rc = VINF_SUCCESS;
887
888 if (!ASMAtomicXchgBool(&pThis->fExtIntr, true))
889 {
890 const uint64_t uValAdd = 1;
891 ssize_t rcLnx = write(pThis->iFdEvt, &uValAdd, sizeof(uValAdd));
892
893 Assert(rcLnx == -1 || rcLnx == sizeof(uValAdd));
894 if (rcLnx == -1)
895 rc = RTErrConvertFromErrno(errno);
896 }
897
898 return rc;
899}
900
901
902/**
903 * Async file I/O queue provider virtual method table.
904 */
905RT_DECL_DATA_CONST(RTIOQUEUEPROVVTABLE const) g_RTIoQueueLnxIoURingProv =
906{
907 /** uVersion */
908 RTIOQUEUEPROVVTABLE_VERSION,
909 /** pszId */
910 "LnxIoURingFile",
911 /** cbIoQueueProv */
912 sizeof(RTIOQUEUEPROVINT),
913 /** enmHnd */
914 RTHANDLETYPE_FILE,
915 /** fFlags */
916 0,
917 /** pfnIsSupported */
918 rtIoQueueLnxIoURingFileProv_IsSupported,
919 /** pfnQueueInit */
920 rtIoQueueLnxIoURingFileProv_QueueInit,
921 /** pfnQueueDestroy */
922 rtIoQueueLnxIoURingFileProv_QueueDestroy,
923 /** pfnHandleRegister */
924 rtIoQueueLnxIoURingFileProv_HandleRegister,
925 /** pfnHandleDeregister */
926 rtIoQueueLnxIoURingFileProv_HandleDeregister,
927 /** pfnReqPrepare */
928 rtIoQueueLnxIoURingFileProv_ReqPrepare,
929 /** pfnReqPrepareSg */
930 NULL,
931 /** pfnCommit */
932 rtIoQueueLnxIoURingFileProv_Commit,
933 /** pfnEvtWait */
934 rtIoQueueLnxIoURingFileProv_EvtWait,
935 /** pfnEvtWaitWakeup */
936 rtIoQueueLnxIoURingFileProv_EvtWaitWakeup,
937 /** uEndMarker */
938 RTIOQUEUEPROVVTABLE_VERSION
939};
940
Note: See TracBrowser for help on using the repository browser.

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette