VirtualBox

source: vbox/trunk/src/VBox/Additions/linux/sharedfolders/regops.c@ 80712

Last change on this file since 80712 was 80712, checked in by vboxsync, 6 years ago

Additions/linux: ticketref:18917: VBox 6.0.10 GAs fail to compile on Red Hat/CentOS/OL 7.7; also Red Hat 8.1 Beta

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 148.2 KB
1/* $Id: regops.c 80712 2019-09-10 19:25:36Z vboxsync $ */
2/** @file
3 * vboxsf - VBox Linux Shared Folders VFS, regular file inode and file operations.
4 */
5
6/*
7 * Copyright (C) 2006-2019 Oracle Corporation
8 *
9 * Permission is hereby granted, free of charge, to any person
10 * obtaining a copy of this software and associated documentation
11 * files (the "Software"), to deal in the Software without
12 * restriction, including without limitation the rights to use,
13 * copy, modify, merge, publish, distribute, sublicense, and/or sell
14 * copies of the Software, and to permit persons to whom the
15 * Software is furnished to do so, subject to the following
16 * conditions:
17 *
18 * The above copyright notice and this permission notice shall be
19 * included in all copies or substantial portions of the Software.
20 *
21 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
22 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
23 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
24 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
25 * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
26 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
27 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
28 * OTHER DEALINGS IN THE SOFTWARE.
29 */
30
31
32/*********************************************************************************************************************************
33* Header Files *
34*********************************************************************************************************************************/
35#include "vfsmod.h"
36#include <linux/uio.h>
37#if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 5, 32)
38# include <linux/aio.h> /* struct kiocb before 4.1 */
39#endif
40#if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 5, 12)
41# include <linux/buffer_head.h>
42#endif
43#if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 5, 12) \
44 && LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 31)
45# include <linux/writeback.h>
46#endif
47#if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 23) \
48 && LINUX_VERSION_CODE < KERNEL_VERSION(3, 16, 0)
49# include <linux/splice.h>
50#endif
51#if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 17) \
52 && LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 23)
53# include <linux/pipe_fs_i.h>
54#endif
55#if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 4, 10)
56# include <linux/swap.h> /* for mark_page_accessed */
57#endif
58#include <iprt/err.h>
59
60#if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 18)
61# define SEEK_END 2
62#endif
63
64#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 16, 0)
65# define iter_is_iovec(a_pIter) ( !((a_pIter)->type & ITER_KVEC) )
66#elif LINUX_VERSION_CODE < KERNEL_VERSION(3, 19, 0)
67# define iter_is_iovec(a_pIter) ( !((a_pIter)->type & (ITER_KVEC | ITER_BVEC)) )
68#endif
69
70#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 17, 0)
71# define vm_fault_t int
72#endif
73
74#if LINUX_VERSION_CODE < KERNEL_VERSION(2, 5, 20)
75# define pgoff_t unsigned long
76#endif
77
78#if LINUX_VERSION_CODE < KERNEL_VERSION(2, 5, 12)
79# define PageUptodate(a_pPage) Page_Uptodate(a_pPage)
80#endif
81
82#ifdef RHEL_RELEASE_CODE
83# if RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(8, 1)
84# define RHEL_81
85# endif
86#endif
87
88/*********************************************************************************************************************************
89* Structures and Typedefs *
90*********************************************************************************************************************************/
91#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 16, 0)
92struct vbsf_iov_iter {
93 unsigned int type;
94 unsigned int v_write : 1;
95 size_t iov_offset;
96 size_t nr_segs;
97 struct iovec const *iov;
98# ifdef VBOX_STRICT
99 struct iovec const *iov_org;
100 size_t nr_segs_org;
101# endif
102};
103# ifdef VBOX_STRICT
104# define VBSF_IOV_ITER_INITIALIZER(a_cSegs, a_pIov, a_fWrite) \
105 { vbsf_iov_iter_detect_type(a_pIov, a_cSegs), a_fWrite, 0, a_cSegs, a_pIov, a_pIov, a_cSegs }
106# else
107# define VBSF_IOV_ITER_INITIALIZER(a_cSegs, a_pIov, a_fWrite) \
108 { vbsf_iov_iter_detect_type(a_pIov, a_cSegs), a_fWrite, 0, a_cSegs, a_pIov }
109# endif
110# define ITER_KVEC 1
111# define iov_iter vbsf_iov_iter
112#endif
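/*
 * Illustrative sketch (editor's addition, not part of the original source):
 * on pre-3.16 kernels the shim above is instantiated along these lines, with
 * pvBuf/cbBuf being hypothetical names:
 *
 *     struct iovec    aSegs[1] = { { pvBuf, cbBuf } };
 *     struct iov_iter Iter     = VBSF_IOV_ITER_INITIALIZER(1, aSegs, 0);
 *     size_t          cbLeft   = iov_iter_count(&Iter);
 *
 * The last initializer argument is the fWrite flag that
 * vbsf_iov_iter_get_pages() later hands to get_user_pages().
 */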
113
114#if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 19)
115/** Used by vbsf_iter_lock_pages() to keep the first page of the next segment. */
116struct vbsf_iter_stash {
117 struct page *pPage;
118 size_t off;
119 size_t cb;
120# if LINUX_VERSION_CODE < KERNEL_VERSION(4, 11, 0)
121 size_t offFromEnd;
122 struct iov_iter Copy;
123# endif
124};
125#endif /* >= 2.6.19 */
126/** Initializer for struct vbsf_iter_stash. */
127#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 11, 0)
128# define VBSF_ITER_STASH_INITIALIZER { NULL, 0 }
129#else
130# define VBSF_ITER_STASH_INITIALIZER { NULL, 0, ~(size_t)0 }
131#endif
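/*
 * Illustrative note (editor's addition): the stash is typically declared on
 * the stack by the iter workers later in this file, e.g.
 *
 *     struct vbsf_iter_stash Stash = VBSF_ITER_STASH_INITIALIZER;
 *
 * so a partially consumed page can be carried over to the next segment.
 */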
132
133
134/*********************************************************************************************************************************
135* Internal Functions *
136*********************************************************************************************************************************/
137DECLINLINE(void) vbsf_put_page(struct page *pPage);
138static void vbsf_unlock_user_pages(struct page **papPages, size_t cPages, bool fSetDirty, bool fLockPgHack);
139static void vbsf_reg_write_sync_page_cache(struct address_space *mapping, loff_t offFile, uint32_t cbRange,
140 uint8_t const *pbSrcBuf, struct page **papSrcPages,
141 uint32_t offSrcPage, size_t cSrcPages);
142
143
144/*********************************************************************************************************************************
145* Provide more recent uio.h functionality to older kernels. *
146*********************************************************************************************************************************/
147#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 16, 0) && LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 19)
148
149/**
150 * Detects the vector type.
151 */
152static int vbsf_iov_iter_detect_type(struct iovec const *paIov, size_t cSegs)
153{
154 /* Check the first segment with a non-zero length. */
155 while (cSegs-- > 0) {
156 if (paIov->iov_len > 0) {
157 if (access_ok(VERIFY_READ, paIov->iov_base, paIov->iov_len))
158 return (uintptr_t)paIov->iov_base >= USER_DS.seg ? ITER_KVEC : 0;
159 AssertMsgFailed(("%p LB %#zx\n", paIov->iov_base, paIov->iov_len));
160 break;
161 }
162 paIov++;
163 }
164 return 0;
165}
166
167
168# undef iov_iter_count
169# define iov_iter_count(a_pIter) vbsf_iov_iter_count(a_pIter)
170static size_t vbsf_iov_iter_count(struct vbsf_iov_iter const *iter)
171{
172 size_t cbRet = 0;
173 size_t cLeft = iter->nr_segs;
174 struct iovec const *iov = iter->iov;
175 while (cLeft-- > 0) {
176 cbRet += iov->iov_len;
177 iov++;
178 }
179 return cbRet - iter->iov_offset;
180}
181
182
183# undef iov_iter_single_seg_count
184# define iov_iter_single_seg_count(a_pIter) vbsf_iov_iter_single_seg_count(a_pIter)
185static size_t vbsf_iov_iter_single_seg_count(struct vbsf_iov_iter const *iter)
186{
187 if (iter->nr_segs > 0)
188 return iter->iov->iov_len - iter->iov_offset;
189 return 0;
190}
191
192
193# undef iov_iter_advance
194# define iov_iter_advance(a_pIter, a_cbSkip) vbsf_iov_iter_advance(a_pIter, a_cbSkip)
195static void vbsf_iov_iter_advance(struct vbsf_iov_iter *iter, size_t cbSkip)
196{
197 SFLOG2(("vbsf_iov_iter_advance: cbSkip=%#zx\n", cbSkip));
198 if (iter->nr_segs > 0) {
199 size_t const cbLeftCur = iter->iov->iov_len - iter->iov_offset;
200 Assert(iter->iov_offset <= iter->iov->iov_len);
201 if (cbLeftCur > cbSkip) {
202 iter->iov_offset += cbSkip;
203 } else {
204 cbSkip -= cbLeftCur;
205 iter->iov_offset = 0;
206 iter->iov++;
207 iter->nr_segs--;
208 while (iter->nr_segs > 0) {
209 size_t const cbSeg = iter->iov->iov_len;
210 if (cbSeg > cbSkip) {
211 iter->iov_offset = cbSkip;
212 break;
213 }
214 cbSkip -= cbSeg;
215 iter->iov++;
216 iter->nr_segs--;
217 }
218 }
219 }
220}
221
222
223# undef iov_iter_get_pages
224# define iov_iter_get_pages(a_pIter, a_papPages, a_cbMax, a_cMaxPages, a_poffPg0) \
225 vbsf_iov_iter_get_pages(a_pIter, a_papPages, a_cbMax, a_cMaxPages, a_poffPg0)
226static ssize_t vbsf_iov_iter_get_pages(struct vbsf_iov_iter *iter, struct page **papPages,
227 size_t cbMax, unsigned cMaxPages, size_t *poffPg0)
228{
229 while (iter->nr_segs > 0) {
230 size_t const cbLeft = iter->iov->iov_len - iter->iov_offset;
231 Assert(iter->iov->iov_len >= iter->iov_offset);
232 if (cbLeft > 0) {
233 uintptr_t uPtrFrom = (uintptr_t)iter->iov->iov_base + iter->iov_offset;
234 size_t offPg0 = *poffPg0 = uPtrFrom & PAGE_OFFSET_MASK;
235 size_t cPagesLeft = RT_ALIGN_Z(offPg0 + cbLeft, PAGE_SIZE) >> PAGE_SHIFT;
236 size_t cPages = RT_MIN(cPagesLeft, cMaxPages);
237 struct task_struct *pTask = current;
238 size_t cPagesLocked;
239
240 down_read(&pTask->mm->mmap_sem);
241 cPagesLocked = get_user_pages(pTask, pTask->mm, uPtrFrom, cPages, iter->v_write, 1 /*force*/, papPages, NULL);
242 up_read(&pTask->mm->mmap_sem);
243 if (cPagesLocked == cPages) {
244 size_t cbRet = (cPages << PAGE_SHIFT) - offPg0;
245 if (cPages == cPagesLeft) {
246 size_t offLastPg = (uPtrFrom + cbLeft) & PAGE_OFFSET_MASK;
247 if (offLastPg)
248 cbRet -= PAGE_SIZE - offLastPg;
249 }
250 Assert(cbRet <= cbLeft);
251 return cbRet;
252 }
253 if (cPagesLocked > 0)
254 vbsf_unlock_user_pages(papPages, cPagesLocked, false /*fSetDirty*/, false /*fLockPgHack*/);
255 return -EFAULT;
256 }
257 iter->iov_offset = 0;
258 iter->iov++;
259 iter->nr_segs--;
260 }
261 AssertFailed();
262 return 0;
263}
264
265
266# undef iov_iter_truncate
267# define iov_iter_truncate(iter, cbNew) vbsf_iov_iter_truncate(iter, cbNew)
268static void vbsf_iov_iter_truncate(struct vbsf_iov_iter *iter, size_t cbNew)
269{
270 /* we have no counter or stuff, so it's a no-op. */
271 RT_NOREF(iter, cbNew);
272}
273
274
275# undef iov_iter_revert
276# define iov_iter_revert(a_pIter, a_cbRewind) vbsf_iov_iter_revert(a_pIter, a_cbRewind)
277void vbsf_iov_iter_revert(struct vbsf_iov_iter *iter, size_t cbRewind)
278{
279 SFLOG2(("vbsf_iov_iter_revert: cbRewind=%#zx\n", cbRewind));
280 if (iter->iov_offset > 0) {
281 if (cbRewind <= iter->iov_offset) {
282 iter->iov_offset -= cbRewind;
283 return;
284 }
285 cbRewind -= iter->iov_offset;
286 iter->iov_offset = 0;
287 }
288
289 while (cbRewind > 0) {
290 struct iovec const *pIov = --iter->iov;
291 size_t const cbSeg = pIov->iov_len;
292 iter->nr_segs++;
293
294 Assert((uintptr_t)pIov >= (uintptr_t)iter->iov_org);
295 Assert(iter->nr_segs <= iter->nr_segs_org);
296
297 if (cbRewind <= cbSeg) {
298 iter->iov_offset = cbSeg - cbRewind;
299 break;
300 }
301 cbRewind -= cbSeg;
302 }
303}
304
305#endif /* 2.6.19 <= linux < 3.16.0 */
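/*
 * Illustrative sketch (editor's addition, not part of the original source):
 * the shimmed helpers above are driven by the *_iter read/write paths later
 * in this file in roughly this pattern (simplified, hypothetical names):
 *
 *     while (iov_iter_count(pIter) > 0) {
 *         size_t  offPg0;
 *         ssize_t cbChunk = iov_iter_get_pages(pIter, papPages, cbMax, cMaxPages, &offPg0);
 *         if (cbChunk <= 0)
 *             break;
 *         // ... host I/O against papPages, yielding cbActual bytes ...
 *         iov_iter_advance(pIter, cbActual);
 *     }
 *
 * iov_iter_revert() is the counterpart used to back an advance out again when
 * the host transferred less than expected.
 */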
306#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 16, 0) && LINUX_VERSION_CODE < KERNEL_VERSION(3, 16, 35)
307
308/** This is for implementing cMaxPages on 3.16 which doesn't have it. */
309static ssize_t vbsf_iov_iter_get_pages_3_16(struct iov_iter *iter, struct page **papPages,
310 size_t cbMax, unsigned cMaxPages, size_t *poffPg0)
311{
312 if (!(iter->type & ITER_BVEC)) {
313 size_t const offPg0 = iter->iov_offset & PAGE_OFFSET_MASK;
314 size_t const cbMaxPages = ((size_t)cMaxPages << PAGE_SHIFT) - offPg0;
315 if (cbMax > cbMaxPages)
316 cbMax = cbMaxPages;
317 }
318 /* else: BVEC works a page at a time and shouldn't have much of a problem here. */
319 return iov_iter_get_pages(iter, papPages, cbMax, poffPg0);
320}
321# undef iov_iter_get_pages
322# define iov_iter_get_pages(a_pIter, a_papPages, a_cbMax, a_cMaxPages, a_poffPg0) \
323 vbsf_iov_iter_get_pages_3_16(a_pIter, a_papPages, a_cbMax, a_cMaxPages, a_poffPg0)
324
325#endif /* 3.16.0-3.16.34 */
326#if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 19) && LINUX_VERSION_CODE < KERNEL_VERSION(3, 18, 0)
327
328static size_t copy_from_iter(uint8_t *pbDst, size_t cbToCopy, struct iov_iter *pSrcIter)
329{
330 size_t const cbTotal = cbToCopy;
331 Assert(iov_iter_count(pSrcIter) >= cbToCopy);
332# if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 16, 0)
333 if (pSrcIter->type & ITER_BVEC) {
334 while (cbToCopy > 0) {
335 size_t const offPage = (uintptr_t)pbDst & PAGE_OFFSET_MASK;
336 size_t const cbThisCopy = RT_MIN(PAGE_SIZE - offPage, cbToCopy);
337 struct page *pPage = rtR0MemObjLinuxVirtToPage(pbDst);
338 size_t cbCopied = copy_page_from_iter(pPage, offPage, cbThisCopy, pSrcIter);
339 AssertStmt(cbCopied <= cbThisCopy, cbCopied = cbThisCopy);
340 pbDst += cbCopied;
341 cbToCopy -= cbCopied;
342 if (cbCopied != cbToCopy)
343 break;
344 }
345 } else
346# endif
347 {
348 while (cbToCopy > 0) {
349 size_t cbThisCopy = iov_iter_single_seg_count(pSrcIter);
350 if (cbThisCopy > 0) {
351 if (cbThisCopy > cbToCopy)
352 cbThisCopy = cbToCopy;
353 if (pSrcIter->type & ITER_KVEC)
354 memcpy(pbDst, (void *)pSrcIter->iov->iov_base + pSrcIter->iov_offset, cbThisCopy);
355 else if (copy_from_user(pbDst, pSrcIter->iov->iov_base + pSrcIter->iov_offset, cbThisCopy) != 0)
356 break;
357 pbDst += cbThisCopy;
358 cbToCopy -= cbThisCopy;
359 }
360 iov_iter_advance(pSrcIter, cbThisCopy);
361 }
362 }
363 return cbTotal - cbToCopy;
364}
365
366
367static size_t copy_to_iter(uint8_t const *pbSrc, size_t cbToCopy, struct iov_iter *pDstIter)
368{
369 size_t const cbTotal = cbToCopy;
370 Assert(iov_iter_count(pDstIter) >= cbToCopy);
371# if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 16, 0)
372 if (pDstIter->type & ITER_BVEC) {
373 while (cbToCopy > 0) {
374 size_t const offPage = (uintptr_t)pbSrc & PAGE_OFFSET_MASK;
375 size_t const cbThisCopy = RT_MIN(PAGE_SIZE - offPage, cbToCopy);
376 struct page *pPage = rtR0MemObjLinuxVirtToPage((void *)pbSrc);
377 size_t cbCopied = copy_page_to_iter(pPage, offPage, cbThisCopy, pDstIter);
378 AssertStmt(cbCopied <= cbThisCopy, cbCopied = cbThisCopy);
379 pbSrc += cbCopied;
380 cbToCopy -= cbCopied;
381 if (cbCopied != cbToCopy)
382 break;
383 }
384 } else
385# endif
386 {
387 while (cbToCopy > 0) {
388 size_t cbThisCopy = iov_iter_single_seg_count(pDstIter);
389 if (cbThisCopy > 0) {
390 if (cbThisCopy > cbToCopy)
391 cbThisCopy = cbToCopy;
392 if (pDstIter->type & ITER_KVEC)
393 memcpy((void *)pDstIter->iov->iov_base + pDstIter->iov_offset, pbSrc, cbThisCopy);
394 else if (copy_to_user(pDstIter->iov->iov_base + pDstIter->iov_offset, pbSrc, cbThisCopy) != 0) {
395 break;
396 }
397 pbSrc += cbThisCopy;
398 cbToCopy -= cbThisCopy;
399 }
400 iov_iter_advance(pDstIter, cbThisCopy);
401 }
402 }
403 return cbTotal - cbToCopy;
404}
405
406#endif /* 3.16.0 <= linux < 3.18.0 */
407
408
409
410/*********************************************************************************************************************************
411* Handle management *
412*********************************************************************************************************************************/
413
414/**
415 * Called when an inode is released to unlink all handles that might still be
416 * associated with it.
417 *
418 * @param pInodeInfo The inode whose handles should be dropped.
419 */
420void vbsf_handle_drop_chain(struct vbsf_inode_info *pInodeInfo)
421{
422 struct vbsf_handle *pCur, *pNext;
423 unsigned long fSavedFlags;
424 SFLOGFLOW(("vbsf_handle_drop_chain: %p\n", pInodeInfo));
425 spin_lock_irqsave(&g_SfHandleLock, fSavedFlags);
426
427 RTListForEachSafe(&pInodeInfo->HandleList, pCur, pNext, struct vbsf_handle, Entry) {
428 AssertMsg( (pCur->fFlags & (VBSF_HANDLE_F_MAGIC_MASK | VBSF_HANDLE_F_ON_LIST))
429 == (VBSF_HANDLE_F_MAGIC | VBSF_HANDLE_F_ON_LIST), ("%p %#x\n", pCur, pCur->fFlags));
430 pCur->fFlags &= ~VBSF_HANDLE_F_ON_LIST;
431 RTListNodeRemove(&pCur->Entry);
432 }
433
434 spin_unlock_irqrestore(&g_SfHandleLock, fSavedFlags);
435}
436
437
438/**
439 * Locates a handle that matches all the flags in @a fFlags.
440 *
441 * @returns Pointer to handle on success (retained), use vbsf_handle_release() to
442 * release it. NULL if no suitable handle was found.
443 * @param pInodeInfo The inode info to search.
444 * @param fFlagsSet The flags that must be set.
445 * @param fFlagsClear The flags that must be clear.
446 */
447struct vbsf_handle *vbsf_handle_find(struct vbsf_inode_info *pInodeInfo, uint32_t fFlagsSet, uint32_t fFlagsClear)
448{
449 struct vbsf_handle *pCur;
450 unsigned long fSavedFlags;
451 spin_lock_irqsave(&g_SfHandleLock, fSavedFlags);
452
453 RTListForEach(&pInodeInfo->HandleList, pCur, struct vbsf_handle, Entry) {
454 AssertMsg( (pCur->fFlags & (VBSF_HANDLE_F_MAGIC_MASK | VBSF_HANDLE_F_ON_LIST))
455 == (VBSF_HANDLE_F_MAGIC | VBSF_HANDLE_F_ON_LIST), ("%p %#x\n", pCur, pCur->fFlags));
456 if ((pCur->fFlags & (fFlagsSet | fFlagsClear)) == fFlagsSet) {
457 uint32_t cRefs = ASMAtomicIncU32(&pCur->cRefs);
458 if (cRefs > 1) {
459 spin_unlock_irqrestore(&g_SfHandleLock, fSavedFlags);
460 SFLOGFLOW(("vbsf_handle_find: returns %p\n", pCur));
461 return pCur;
462 }
463 /* Oops, already being closed (safe as it's only ever increased here). */
464 ASMAtomicDecU32(&pCur->cRefs);
465 }
466 }
467
468 spin_unlock_irqrestore(&g_SfHandleLock, fSavedFlags);
469 SFLOGFLOW(("vbsf_handle_find: returns NULL!\n"));
470 return NULL;
471}
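/*
 * Illustrative sketch (editor's addition): typical caller pattern, assuming
 * a flag such as VBSF_HANDLE_F_WRITE and the vbsf_handle_release() helper
 * declared in vfsmod.h:
 *
 *     struct vbsf_handle *pHandle = vbsf_handle_find(sf_i, VBSF_HANDLE_F_WRITE, 0);
 *     if (pHandle) {
 *         // ... issue host requests using pHandle->hHost ...
 *         vbsf_handle_release(pHandle, pSuperInfo, "some-caller");
 *     }
 */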
472
473
474/**
475 * Slow worker for vbsf_handle_release() that does the freeing.
476 *
477 * @returns 0 (ref count).
478 * @param pHandle The handle to release.
479 * @param pSuperInfo The info structure for the shared folder associated with
480 * the handle.
481 * @param pszCaller The caller name (for logging failures).
482 */
483uint32_t vbsf_handle_release_slow(struct vbsf_handle *pHandle, struct vbsf_super_info *pSuperInfo, const char *pszCaller)
484{
485 int rc;
486 unsigned long fSavedFlags;
487
488 SFLOGFLOW(("vbsf_handle_release_slow: %p (%s)\n", pHandle, pszCaller));
489
490 /*
491 * Remove from the list.
492 */
493 spin_lock_irqsave(&g_SfHandleLock, fSavedFlags);
494
495 AssertMsg((pHandle->fFlags & VBSF_HANDLE_F_MAGIC_MASK) == VBSF_HANDLE_F_MAGIC, ("%p %#x\n", pHandle, pHandle->fFlags));
496 Assert(pHandle->pInodeInfo);
497 Assert(pHandle->pInodeInfo && pHandle->pInodeInfo->u32Magic == SF_INODE_INFO_MAGIC);
498
499 if (pHandle->fFlags & VBSF_HANDLE_F_ON_LIST) {
500 pHandle->fFlags &= ~VBSF_HANDLE_F_ON_LIST;
501 RTListNodeRemove(&pHandle->Entry);
502 }
503
504 spin_unlock_irqrestore(&g_SfHandleLock, fSavedFlags);
505
506 /*
507 * Actually destroy it.
508 */
509 rc = VbglR0SfHostReqCloseSimple(pSuperInfo->map.root, pHandle->hHost);
510 if (RT_FAILURE(rc))
511 LogFunc(("Caller %s: VbglR0SfHostReqCloseSimple %#RX64 failed with rc=%Rrc\n", pszCaller, pHandle->hHost, rc));
512 pHandle->hHost = SHFL_HANDLE_NIL;
513 pHandle->fFlags = VBSF_HANDLE_F_MAGIC_DEAD;
514 kfree(pHandle);
515 return 0;
516}
517
518
519/**
520 * Appends a handle to a handle list.
521 *
522 * @param pInodeInfo The inode to add it to.
523 * @param pHandle The handle to add.
524 */
525void vbsf_handle_append(struct vbsf_inode_info *pInodeInfo, struct vbsf_handle *pHandle)
526{
527#ifdef VBOX_STRICT
528 struct vbsf_handle *pCur;
529#endif
530 unsigned long fSavedFlags;
531
532 SFLOGFLOW(("vbsf_handle_append: %p (to %p)\n", pHandle, pInodeInfo));
533 AssertMsg((pHandle->fFlags & (VBSF_HANDLE_F_MAGIC_MASK | VBSF_HANDLE_F_ON_LIST)) == VBSF_HANDLE_F_MAGIC,
534 ("%p %#x\n", pHandle, pHandle->fFlags));
535 Assert(pInodeInfo->u32Magic == SF_INODE_INFO_MAGIC);
536
537 spin_lock_irqsave(&g_SfHandleLock, fSavedFlags);
538
539 AssertMsg((pHandle->fFlags & (VBSF_HANDLE_F_MAGIC_MASK | VBSF_HANDLE_F_ON_LIST)) == VBSF_HANDLE_F_MAGIC,
540 ("%p %#x\n", pHandle, pHandle->fFlags));
541#ifdef VBOX_STRICT
542 RTListForEach(&pInodeInfo->HandleList, pCur, struct vbsf_handle, Entry) {
543 Assert(pCur != pHandle);
544 AssertMsg( (pCur->fFlags & (VBSF_HANDLE_F_MAGIC_MASK | VBSF_HANDLE_F_ON_LIST))
545 == (VBSF_HANDLE_F_MAGIC | VBSF_HANDLE_F_ON_LIST), ("%p %#x\n", pCur, pCur->fFlags));
546 }
547 pHandle->pInodeInfo = pInodeInfo;
548#endif
549
550 pHandle->fFlags |= VBSF_HANDLE_F_ON_LIST;
551 RTListAppend(&pInodeInfo->HandleList, &pHandle->Entry);
552
553 spin_unlock_irqrestore(&g_SfHandleLock, fSavedFlags);
554}
555
556
557
558/*********************************************************************************************************************************
559* Misc *
560*********************************************************************************************************************************/
561
562#if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 6)
563/** Any writable mappings? */
564DECLINLINE(bool) mapping_writably_mapped(struct address_space const *mapping)
565{
566# if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 5, 6)
567 return !list_empty(&mapping->i_mmap_shared);
568# else
569 return mapping->i_mmap_shared != NULL;
570# endif
571}
572#endif
573
574
575#if LINUX_VERSION_CODE < KERNEL_VERSION(2, 5, 12)
576/** Missing in 2.4.x, so just stub it for now. */
577DECLINLINE(bool) PageWriteback(struct page const *page)
578{
579 return false;
580}
581#endif
582
583
584/**
585 * Helper for deciding whether we should do a read via the page cache or not.
586 *
587 * By default we will only use the page cache if there is a writable memory
588 * mapping of the file with a chance that it may have modified any of the pages
589 * already.
590 */
591DECLINLINE(bool) vbsf_should_use_cached_read(struct file *file, struct address_space *mapping, struct vbsf_super_info *pSuperInfo)
592{
593 if ( (file->f_flags & O_DIRECT)
594 || pSuperInfo->enmCacheMode == kVbsfCacheMode_None)
595 return false;
596 if ( pSuperInfo->enmCacheMode == kVbsfCacheMode_Read
597 || pSuperInfo->enmCacheMode == kVbsfCacheMode_ReadWrite)
598 return true;
599 Assert(pSuperInfo->enmCacheMode == kVbsfCacheMode_Strict);
600 return mapping
601 && mapping->nrpages > 0
602 && mapping_writably_mapped(mapping);
603}
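/*
 * Editor's summary of the decision above (not part of the original source):
 *
 *     O_DIRECT or kVbsfCacheMode_None                 -> bypass the page cache
 *     kVbsfCacheMode_Read / kVbsfCacheMode_ReadWrite  -> use the page cache
 *     kVbsfCacheMode_Strict                           -> use the page cache only
 *                                                        while the file has pages
 *                                                        and writable mappings
 */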
604
605
606
607/*********************************************************************************************************************************
608* Pipe / splice stuff mainly for 2.6.17 <= linux < 2.6.31 (where no fallbacks were available) *
609*********************************************************************************************************************************/
610
611#if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 17) \
612 && LINUX_VERSION_CODE < KERNEL_VERSION(3, 16, 0)
613
614# if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 30)
615# define LOCK_PIPE(a_pPipe) do { if ((a_pPipe)->inode) mutex_lock(&(a_pPipe)->inode->i_mutex); } while (0)
616# define UNLOCK_PIPE(a_pPipe) do { if ((a_pPipe)->inode) mutex_unlock(&(a_pPipe)->inode->i_mutex); } while (0)
617# else
618# define LOCK_PIPE(a_pPipe) pipe_lock(a_pPipe)
619# define UNLOCK_PIPE(a_pPipe) pipe_unlock(a_pPipe)
620# endif
621
622
623/** Waits for the pipe buffer status to change. */
624static void vbsf_wait_pipe(struct pipe_inode_info *pPipe)
625{
626 DEFINE_WAIT(WaitStuff);
627# ifdef TASK_NONINTERACTIVE
628 prepare_to_wait(&pPipe->wait, &WaitStuff, TASK_INTERRUPTIBLE | TASK_NONINTERACTIVE);
629# else
630 prepare_to_wait(&pPipe->wait, &WaitStuff, TASK_INTERRUPTIBLE);
631# endif
632 UNLOCK_PIPE(pPipe);
633
634 schedule();
635
636 finish_wait(&pPipe->wait, &WaitStuff);
637 LOCK_PIPE(pPipe);
638}
639
640
641/** Worker for vbsf_feed_pages_to_pipe that wakes up readers. */
642static void vbsf_wake_up_pipe(struct pipe_inode_info *pPipe, bool fReaders)
643{
644 smp_mb();
645 if (waitqueue_active(&pPipe->wait))
646 wake_up_interruptible_sync(&pPipe->wait);
647 if (fReaders)
648 kill_fasync(&pPipe->fasync_readers, SIGIO, POLL_IN);
649 else
650 kill_fasync(&pPipe->fasync_writers, SIGIO, POLL_OUT);
651}
652
653#endif
654#if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 17) \
655 && LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 31)
656
657/** Verify pipe buffer content (needed for page-cache to ensure idle page). */
658static int vbsf_pipe_buf_confirm(struct pipe_inode_info *pPipe, struct pipe_buffer *pPipeBuf)
659{
660 /*SFLOG3(("vbsf_pipe_buf_confirm: %p\n", pPipeBuf));*/
661 return 0;
662}
663
664
665/** Maps the buffer page. */
666static void *vbsf_pipe_buf_map(struct pipe_inode_info *pPipe, struct pipe_buffer *pPipeBuf, int atomic)
667{
668 void *pvRet;
669 if (!atomic)
670 pvRet = kmap(pPipeBuf->page);
671 else {
672 pPipeBuf->flags |= PIPE_BUF_FLAG_ATOMIC;
673 pvRet = kmap_atomic(pPipeBuf->page, KM_USER0);
674 }
675 /*SFLOG3(("vbsf_pipe_buf_map: %p -> %p\n", pPipeBuf, pvRet));*/
676 return pvRet;
677}
678
679
680/** Unmaps the buffer page. */
681static void vbsf_pipe_buf_unmap(struct pipe_inode_info *pPipe, struct pipe_buffer *pPipeBuf, void *pvMapping)
682{
683 /*SFLOG3(("vbsf_pipe_buf_unmap: %p/%p\n", pPipeBuf, pvMapping)); */
684 if (!(pPipeBuf->flags & PIPE_BUF_FLAG_ATOMIC))
685 kunmap(pPipeBuf->page);
686 else {
687 pPipeBuf->flags &= ~PIPE_BUF_FLAG_ATOMIC;
688 kunmap_atomic(pvMapping, KM_USER0);
689 }
690}
691
692
693/** Gets a reference to the page. */
694static void vbsf_pipe_buf_get(struct pipe_inode_info *pPipe, struct pipe_buffer *pPipeBuf)
695{
696 page_cache_get(pPipeBuf->page);
697 /*SFLOG3(("vbsf_pipe_buf_get: %p (return count=%d)\n", pPipeBuf, page_count(pPipeBuf->page)));*/
698}
699
700
701/** Release the buffer page (counter to vbsf_pipe_buf_get). */
702static void vbsf_pipe_buf_release(struct pipe_inode_info *pPipe, struct pipe_buffer *pPipeBuf)
703{
704 /*SFLOG3(("vbsf_pipe_buf_release: %p (incoming count=%d)\n", pPipeBuf, page_count(pPipeBuf->page)));*/
705 page_cache_release(pPipeBuf->page);
706}
707
708
709/** Attempt to steal the page.
710 * @returns 0 on success, 1 on failure. */
711static int vbsf_pipe_buf_steal(struct pipe_inode_info *pPipe, struct pipe_buffer *pPipeBuf)
712{
713 if (page_count(pPipeBuf->page) == 1) {
714 lock_page(pPipeBuf->page);
715 SFLOG3(("vbsf_pipe_buf_steal: %p -> 0\n", pPipeBuf));
716 return 0;
717 }
718 SFLOG3(("vbsf_pipe_buf_steal: %p -> 1\n", pPipeBuf));
719 return 1;
720}
721
722
723/**
724 * Pipe buffer operations used by vbsf_feed_pages_to_pipe.
725 */
726static struct pipe_buf_operations vbsf_pipe_buf_ops = {
727 .can_merge = 0,
728# if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 23)
729 .confirm = vbsf_pipe_buf_confirm,
730# else
731 .pin = vbsf_pipe_buf_confirm,
732# endif
733 .map = vbsf_pipe_buf_map,
734 .unmap = vbsf_pipe_buf_unmap,
735 .get = vbsf_pipe_buf_get,
736 .release = vbsf_pipe_buf_release,
737 .steal = vbsf_pipe_buf_steal,
738};
739
740
741/**
742 * Feeds the pages to the pipe.
743 *
744 * Pages given to the pipe are set to NULL in papPages.
745 */
746static ssize_t vbsf_feed_pages_to_pipe(struct pipe_inode_info *pPipe, struct page **papPages, size_t cPages, uint32_t offPg0,
747 uint32_t cbActual, unsigned fFlags)
748{
749 ssize_t cbRet = 0;
750 size_t iPage = 0;
751 bool fNeedWakeUp = false;
752
753 LOCK_PIPE(pPipe);
754 for (;;) {
755 if ( pPipe->readers > 0
756 && pPipe->nrbufs < PIPE_BUFFERS) {
757 struct pipe_buffer *pPipeBuf = &pPipe->bufs[(pPipe->curbuf + pPipe->nrbufs) % PIPE_BUFFERS];
758 uint32_t const cbThisPage = RT_MIN(cbActual, PAGE_SIZE - offPg0);
759 pPipeBuf->len = cbThisPage;
760 pPipeBuf->offset = offPg0;
761# if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 23)
762 pPipeBuf->private = 0;
763# endif
764 pPipeBuf->ops = &vbsf_pipe_buf_ops;
765 pPipeBuf->flags = fFlags & SPLICE_F_GIFT ? PIPE_BUF_FLAG_GIFT : 0;
766 pPipeBuf->page = papPages[iPage];
767
768 papPages[iPage++] = NULL;
769 pPipe->nrbufs++;
770 fNeedWakeUp |= pPipe->inode != NULL;
771 offPg0 = 0;
772 cbRet += cbThisPage;
773
774 /* done? */
775 cbActual -= cbThisPage;
776 if (!cbActual)
777 break;
778 } else if (pPipe->readers == 0) {
779 SFLOGFLOW(("vbsf_feed_pages_to_pipe: no readers!\n"));
780 send_sig(SIGPIPE, current, 0);
781 if (cbRet == 0)
782 cbRet = -EPIPE;
783 break;
784 } else if (fFlags & SPLICE_F_NONBLOCK) {
785 if (cbRet == 0)
786 cbRet = -EAGAIN;
787 break;
788 } else if (signal_pending(current)) {
789 if (cbRet == 0)
790 cbRet = -ERESTARTSYS;
791 SFLOGFLOW(("vbsf_feed_pages_to_pipe: pending signal! (%zd)\n", cbRet));
792 break;
793 } else {
794 if (fNeedWakeUp) {
795 vbsf_wake_up_pipe(pPipe, true /*fReaders*/);
796 fNeedWakeUp = 0;
797 }
798 pPipe->waiting_writers++;
799 vbsf_wait_pipe(pPipe);
800 pPipe->waiting_writers--;
801 }
802 }
803 UNLOCK_PIPE(pPipe);
804
805 if (fNeedWakeUp)
806 vbsf_wake_up_pipe(pPipe, true /*fReaders*/);
807
808 return cbRet;
809}
810
811
812/**
813 * For splicing from a file to a pipe.
814 */
815static ssize_t vbsf_splice_read(struct file *file, loff_t *poffset, struct pipe_inode_info *pipe, size_t len, unsigned int flags)
816{
817 struct inode *inode = VBSF_GET_F_DENTRY(file)->d_inode;
818 struct vbsf_super_info *pSuperInfo = VBSF_GET_SUPER_INFO(inode->i_sb);
819 ssize_t cbRet;
820
821 SFLOGFLOW(("vbsf_splice_read: file=%p poffset=%p{%#RX64} pipe=%p len=%#zx flags=%#x\n", file, poffset, *poffset, pipe, len, flags));
822 if (vbsf_should_use_cached_read(file, inode->i_mapping, pSuperInfo)) {
823 cbRet = generic_file_splice_read(file, poffset, pipe, len, flags);
824 } else {
825 /*
826 * Create a read request.
827 */
828 loff_t offFile = *poffset;
829 size_t cPages = RT_MIN(RT_ALIGN_Z((offFile & ~PAGE_CACHE_MASK) + len, PAGE_CACHE_SIZE) >> PAGE_CACHE_SHIFT,
830 PIPE_BUFFERS);
831 VBOXSFREADPGLSTREQ *pReq = (VBOXSFREADPGLSTREQ *)VbglR0PhysHeapAlloc(RT_UOFFSETOF_DYN(VBOXSFREADPGLSTREQ,
832 PgLst.aPages[cPages]));
833 if (pReq) {
834 /*
835 * Allocate pages.
836 */
837 struct page *apPages[PIPE_BUFFERS];
838 size_t i;
839 pReq->PgLst.offFirstPage = (uint16_t)offFile & (uint16_t)PAGE_OFFSET_MASK;
840 cbRet = 0;
841 for (i = 0; i < cPages; i++) {
842 struct page *pPage;
843 apPages[i] = pPage = alloc_page(GFP_USER);
844 if (pPage) {
845 pReq->PgLst.aPages[i] = page_to_phys(pPage);
846# ifdef VBOX_STRICT
847 ASMMemFill32(kmap(pPage), PAGE_SIZE, UINT32_C(0xdeadbeef));
848 kunmap(pPage);
849# endif
850 } else {
851 cbRet = -ENOMEM;
852 break;
853 }
854 }
855 if (cbRet == 0) {
856 /*
857 * Do the reading.
858 */
859 uint32_t const cbToRead = RT_MIN((cPages << PAGE_SHIFT) - (offFile & PAGE_OFFSET_MASK), len);
860 struct vbsf_reg_info *sf_r = (struct vbsf_reg_info *)file->private_data;
861 int vrc = VbglR0SfHostReqReadPgLst(pSuperInfo->map.root, pReq, sf_r->Handle.hHost, offFile, cbToRead, cPages);
862 if (RT_SUCCESS(vrc)) {
863 /*
864 * Get the number of bytes read, jettison the request
865 * and, in case of EOF, any unnecessary pages.
866 */
867 uint32_t cbActual = pReq->Parms.cb32Read.u.value32;
868 AssertStmt(cbActual <= cbToRead, cbActual = cbToRead);
869 SFLOG2(("vbsf_splice_read: read -> %#x bytes @ %#RX64\n", cbActual, offFile));
870
871 VbglR0PhysHeapFree(pReq);
872 pReq = NULL;
873
874 /*
875 * Now, feed it to the pipe thingy.
877 * This will take ownership of all the pages no matter what happens.
877 */
878 cbRet = vbsf_feed_pages_to_pipe(pipe, apPages, cPages, offFile & PAGE_OFFSET_MASK, cbActual, flags);
879 if (cbRet > 0)
880 *poffset = offFile + cbRet;
881 } else {
882 cbRet = -RTErrConvertToErrno(vrc);
883 SFLOGFLOW(("vbsf_splice_read: Read failed: %Rrc -> %zd\n", vrc, cbRet));
884 }
885 i = cPages;
886 }
887
888 while (i-- > 0)
889 if (apPages[i])
890 __free_pages(apPages[i], 0);
891 if (pReq)
892 VbglR0PhysHeapFree(pReq);
893 } else {
894 cbRet = -ENOMEM;
895 }
896 }
897 SFLOGFLOW(("vbsf_splice_read: returns %zd (%#zx), *poffset=%#RX64\n", cbRet, cbRet, *poffset));
898 return cbRet;
899}
900
901#endif /* 2.6.17 <= LINUX_VERSION_CODE < 2.6.31 */
902#if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 17) \
903 && LINUX_VERSION_CODE < KERNEL_VERSION(3, 16, 0)
904
905/**
906 * For splicing from a pipe to a file.
907 *
908 * Since we can combine buffers and request allocations, this should be faster
909 * than the default implementation.
910 */
911static ssize_t vbsf_splice_write(struct pipe_inode_info *pPipe, struct file *file, loff_t *poffset, size_t len, unsigned int flags)
912{
913 struct inode *inode = VBSF_GET_F_DENTRY(file)->d_inode;
914 struct vbsf_super_info *pSuperInfo = VBSF_GET_SUPER_INFO(inode->i_sb);
915 ssize_t cbRet;
916
917 SFLOGFLOW(("vbsf_splice_write: pPipe=%p file=%p poffset=%p{%#RX64} len=%#zx flags=%#x\n", pPipe, file, poffset, *poffset, len, flags));
918 /** @todo later if (false) {
919 cbRet = generic_file_splice_write(pPipe, file, poffset, len, flags);
920 } else */ {
921 /*
922 * Prepare a write request.
923 */
924# ifdef PIPE_BUFFERS
925 uint32_t const cMaxPages = RT_MIN(PIPE_BUFFERS, RT_ALIGN_Z(len, PAGE_SIZE) >> PAGE_SHIFT);
926# else
927 uint32_t const cMaxPages = RT_MIN(RT_MAX(RT_MIN(pPipe->buffers, 256), PIPE_DEF_BUFFERS),
928 RT_ALIGN_Z(len, PAGE_SIZE) >> PAGE_SHIFT);
929# endif
930 VBOXSFWRITEPGLSTREQ *pReq = (VBOXSFWRITEPGLSTREQ *)VbglR0PhysHeapAlloc(RT_UOFFSETOF_DYN(VBOXSFREADPGLSTREQ,
931 PgLst.aPages[cMaxPages]));
932 if (pReq) {
933 /*
934 * Feed from the pipe.
935 */
936 struct vbsf_reg_info *sf_r = (struct vbsf_reg_info *)file->private_data;
937 struct address_space *mapping = inode->i_mapping;
938 loff_t offFile = *poffset;
939 bool fNeedWakeUp = false;
940 cbRet = 0;
941
942 LOCK_PIPE(pPipe);
943
944 for (;;) {
945 unsigned cBufs = pPipe->nrbufs;
946 /*SFLOG2(("vbsf_splice_write: nrbufs=%#x curbuf=%#x\n", cBufs, pPipe->curbuf));*/
947 if (cBufs) {
948 /*
949 * There is data available. Write it to the file.
950 */
951 int vrc;
952 struct pipe_buffer *pPipeBuf = &pPipe->bufs[pPipe->curbuf];
953 uint32_t cPagesToWrite = 1;
954 uint32_t cbToWrite = pPipeBuf->len;
955
956 Assert(pPipeBuf->offset < PAGE_SIZE);
957 Assert(pPipeBuf->offset + pPipeBuf->len <= PAGE_SIZE);
958
959 pReq->PgLst.offFirstPage = pPipeBuf->offset & PAGE_OFFSET_MASK;
960 pReq->PgLst.aPages[0] = page_to_phys(pPipeBuf->page);
961
962 /* Add any adjacent page buffers: */
963 while ( cPagesToWrite < cBufs
964 && cPagesToWrite < cMaxPages
965 && ((pReq->PgLst.offFirstPage + cbToWrite) & PAGE_OFFSET_MASK) == 0) {
966# ifdef PIPE_BUFFERS
967 struct pipe_buffer *pPipeBuf2 = &pPipe->bufs[(pPipe->curbuf + cPagesToWrite) % PIPE_BUFFERS];
968# else
969 struct pipe_buffer *pPipeBuf2 = &pPipe->bufs[(pPipe->curbuf + cPagesToWrite) % pPipe->buffers];
970# endif
971 Assert(pPipeBuf2->len <= PAGE_SIZE);
972 Assert(pPipeBuf2->offset < PAGE_SIZE);
973 if (pPipeBuf2->offset != 0)
974 break;
975 pReq->PgLst.aPages[cPagesToWrite] = page_to_phys(pPipeBuf2->page);
976 cbToWrite += pPipeBuf2->len;
977 cPagesToWrite += 1;
978 }
979
980 /* Check that we don't have signals pending before we issue the write, as
981 we'll only end up having to cancel the HGCM request 99% of the time: */
982 if (!signal_pending(current)) {
983 struct vbsf_inode_info *sf_i = VBSF_GET_INODE_INFO(inode);
984 vrc = VbglR0SfHostReqWritePgLst(pSuperInfo->map.root, pReq, sf_r->Handle.hHost, offFile,
985 cbToWrite, cPagesToWrite);
986 sf_i->ModificationTimeAtOurLastWrite = sf_i->ModificationTime;
987 } else
988 vrc = VERR_INTERRUPTED;
989 if (RT_SUCCESS(vrc)) {
990 /*
991 * Get the number of bytes actually written, update file position
992 * and return value, and advance the pipe buffer.
993 */
994 uint32_t cbActual = pReq->Parms.cb32Write.u.value32;
995 AssertStmt(cbActual <= cbToWrite, cbActual = cbToWrite);
996 SFLOG2(("vbsf_splice_write: write -> %#x bytes @ %#RX64\n", cbActual, offFile));
997
998 cbRet += cbActual;
999
1000 while (cbActual > 0) {
1001 uint32_t cbAdvance = RT_MIN(pPipeBuf->len, cbActual);
1002
1003 vbsf_reg_write_sync_page_cache(mapping, offFile, cbAdvance, NULL,
1004 &pPipeBuf->page, pPipeBuf->offset, 1);
1005
1006 offFile += cbAdvance;
1007 cbActual -= cbAdvance;
1008 pPipeBuf->offset += cbAdvance;
1009 pPipeBuf->len -= cbAdvance;
1010
1011 if (!pPipeBuf->len) {
1012 struct pipe_buf_operations const *pOps = pPipeBuf->ops;
1013 pPipeBuf->ops = NULL;
1014 pOps->release(pPipe, pPipeBuf);
1015
1016# ifdef PIPE_BUFFERS
1017 pPipe->curbuf = (pPipe->curbuf + 1) % PIPE_BUFFERS;
1018# else
1019 pPipe->curbuf = (pPipe->curbuf + 1) % pPipe->buffers;
1020# endif
1021 pPipe->nrbufs -= 1;
1022 pPipeBuf = &pPipe->bufs[pPipe->curbuf];
1023
1024# if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 30)
1025 fNeedWakeUp |= pPipe->inode != NULL;
1026# else
1027 fNeedWakeUp = true;
1028# endif
1029 } else {
1030 Assert(cbActual == 0);
1031 break;
1032 }
1033 }
1034
1035 *poffset = offFile;
1036 } else {
1037 if (cbRet == 0)
1038 cbRet = vrc == VERR_INTERRUPTED ? -ERESTARTSYS : -RTErrConvertToErrno(vrc);
1039 SFLOGFLOW(("vbsf_splice_write: Write failed: %Rrc -> %zd (cbRet=%#zx)\n",
1040 vrc, -RTErrConvertToErrno(vrc), cbRet));
1041 break;
1042 }
1043 } else {
1044 /*
1045 * Wait for data to become available, if there is a chance that'll happen.
1046 */
1047 /* Quit if there are no writers (think EOF): */
1048 if (pPipe->writers == 0) {
1049 SFLOGFLOW(("vbsf_splice_write: No buffers. No writers. The show is done!\n"));
1050 break;
1051 }
1052
1053 /* Quit if we've written some and no writers are waiting on the lock: */
1054 if (cbRet > 0 && pPipe->waiting_writers == 0) {
1055 SFLOGFLOW(("vbsf_splice_write: No waiting writers, returning what we've got.\n"));
1056 break;
1057 }
1058
1059 /* Quit with EAGAIN if non-blocking: */
1060 if (flags & SPLICE_F_NONBLOCK) {
1061 if (cbRet == 0)
1062 cbRet = -EAGAIN;
1063 break;
1064 }
1065
1066 /* Quit if we've got pending signals: */
1067 if (signal_pending(current)) {
1068 if (cbRet == 0)
1069 cbRet = -ERESTARTSYS;
1070 SFLOGFLOW(("vbsf_splice_write: pending signal! (%zd)\n", cbRet));
1071 break;
1072 }
1073
1074 /* Wake up writers before we start waiting: */
1075 if (fNeedWakeUp) {
1076 vbsf_wake_up_pipe(pPipe, false /*fReaders*/);
1077 fNeedWakeUp = false;
1078 }
1079 vbsf_wait_pipe(pPipe);
1080 }
1081 } /* feed loop */
1082
1083 if (fNeedWakeUp)
1084 vbsf_wake_up_pipe(pPipe, false /*fReaders*/);
1085
1086 UNLOCK_PIPE(pPipe);
1087
1088 VbglR0PhysHeapFree(pReq);
1089 } else {
1090 cbRet = -ENOMEM;
1091 }
1092 }
1093 SFLOGFLOW(("vbsf_splice_write: returns %zd (%#zx), *poffset=%#RX64\n", cbRet, cbRet, *poffset));
1094 return cbRet;
1095}
1096
1097#endif /* 2.6.17 <= LINUX_VERSION_CODE < 3.16.0 */
1098
1099#if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 5, 30) \
1100 && LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 23)
1101/**
1102 * Our own sendfile implementation that does not go via the page cache like
1103 * generic_file_sendfile() does.
1104 */
1105static ssize_t vbsf_reg_sendfile(struct file *pFile, loff_t *poffFile, size_t cbToSend, read_actor_t pfnActor,
1106# if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 8)
1107 void *pvUser
1108# else
1109 void __user *pvUser
1110# endif
1111 )
1112{
1113 struct inode *inode = VBSF_GET_F_DENTRY(pFile)->d_inode;
1114 struct vbsf_super_info *pSuperInfo = VBSF_GET_SUPER_INFO(inode->i_sb);
1115 ssize_t cbRet;
1116 SFLOGFLOW(("vbsf_reg_sendfile: pFile=%p poffFile=%p{%#RX64} cbToSend=%#zx pfnActor=%p pvUser=%p\n",
1117 pFile, poffFile, poffFile ? *poffFile : 0, cbToSend, pfnActor, pvUser));
1118 Assert(pSuperInfo);
1119
1120 /*
1121 * Return immediately if asked to send nothing.
1122 */
1123 if (cbToSend == 0)
1124 return 0;
1125
1126 /*
1127 * Like for vbsf_reg_read() and vbsf_reg_read_iter(), we allow going via
1128 * the page cache in some cases or configs.
1129 */
1130 if (vbsf_should_use_cached_read(pFile, inode->i_mapping, pSuperInfo)) {
1131 cbRet = generic_file_sendfile(pFile, poffFile, cbToSend, pfnActor, pvUser);
1132 SFLOGFLOW(("vbsf_reg_sendfile: returns %#zx *poffFile=%#RX64 [generic_file_sendfile]\n", cbRet, poffFile ? *poffFile : UINT64_MAX));
1133 } else {
1134 /*
1135 * Allocate a request and a bunch of pages for reading from the file.
1136 */
1137 struct page *apPages[16];
1138 loff_t offFile = poffFile ? *poffFile : 0;
1139 size_t const cPages = cbToSend + ((size_t)offFile & PAGE_OFFSET_MASK) >= RT_ELEMENTS(apPages) * PAGE_SIZE
1140 ? RT_ELEMENTS(apPages)
1141 : RT_ALIGN_Z(cbToSend + ((size_t)offFile & PAGE_OFFSET_MASK), PAGE_SIZE) >> PAGE_SHIFT;
1142 size_t iPage;
1143 VBOXSFREADPGLSTREQ *pReq = (VBOXSFREADPGLSTREQ *)VbglR0PhysHeapAlloc(RT_UOFFSETOF_DYN(VBOXSFREADPGLSTREQ,
1144 PgLst.aPages[cPages]));
1145 if (pReq) {
1146 Assert(cPages > 0);
1147 cbRet = 0;
1148 for (iPage = 0; iPage < cPages; iPage++) {
1149 struct page *pPage;
1150 apPages[iPage] = pPage = alloc_page(GFP_USER);
1151 if (pPage) {
1152 Assert(page_count(pPage) == 1);
1153 pReq->PgLst.aPages[iPage] = page_to_phys(pPage);
1154 } else {
1155 while (iPage-- > 0)
1156 vbsf_put_page(apPages[iPage]);
1157 cbRet = -ENOMEM;
1158 break;
1159 }
1160 }
1161 if (cbRet == 0) {
1162 /*
1163 * Do the job.
1164 */
1165 struct vbsf_reg_info *sf_r = (struct vbsf_reg_info *)pFile->private_data;
1166 read_descriptor_t RdDesc;
1167 RdDesc.count = cbToSend;
1168# if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 8)
1169 RdDesc.arg.data = pvUser;
1170# else
1171 RdDesc.buf = pvUser;
1172# endif
1173 RdDesc.written = 0;
1174 RdDesc.error = 0;
1175
1176 Assert(sf_r);
1177 Assert((sf_r->Handle.fFlags & VBSF_HANDLE_F_MAGIC_MASK) == VBSF_HANDLE_F_MAGIC);
1178
1179 while (cbToSend > 0) {
1180 /*
1181 * Read another chunk. For paranoid reasons, we keep data where the page cache
1182 * would keep it, i.e. the page offset bits correspond to the file offset bits.
1183 */
1184 uint32_t const offPg0 = (uint32_t)offFile & (uint32_t)PAGE_OFFSET_MASK;
1185 uint32_t const cbToRead = RT_MIN((cPages << PAGE_SHIFT) - offPg0, cbToSend);
1186 uint32_t const cPagesToRead = RT_ALIGN_Z(cbToRead + offPg0, PAGE_SIZE) >> PAGE_SHIFT;
1187 int vrc;
1188 pReq->PgLst.offFirstPage = (uint16_t)offPg0;
1189 if (!signal_pending(current))
1190 vrc = VbglR0SfHostReqReadPgLst(pSuperInfo->map.root, pReq, sf_r->Handle.hHost, offFile,
1191 cbToRead, cPagesToRead);
1192 else
1193 vrc = VERR_INTERRUPTED;
1194 if (RT_SUCCESS(vrc)) {
1195 /*
1196 * Pass what we read to the actor.
1197 */
1198 uint32_t off = offPg0;
1199 uint32_t cbActual = pReq->Parms.cb32Read.u.value32;
1200 bool const fIsEof = cbActual < cbToRead;
1201 AssertStmt(cbActual <= cbToRead, cbActual = cbToRead);
1202 SFLOG3(("vbsf_reg_sendfile: Read %#x bytes (offPg0=%#x), wanted %#x ...\n", cbActual, offPg0, cbToRead));
1203
1204 iPage = 0;
1205 while (cbActual > 0) {
1206 uint32_t const cbPage = RT_MIN(cbActual, PAGE_SIZE - off);
1207 int const cbRetActor = pfnActor(&RdDesc, apPages[iPage], off, cbPage);
1208 Assert(cbRetActor >= 0); /* Returns zero on failure, with RdDesc.error holding the status code. */
1209
1210 AssertMsg(iPage < cPages && iPage < cPagesToRead, ("iPage=%#x cPages=%#x cPagesToRead=%#x\n", iPage, cPages, cPagesToRead));
1211
1212 offFile += cbRetActor;
1213 if ((uint32_t)cbRetActor == cbPage && RdDesc.count > 0) {
1214 cbActual -= cbPage;
1215 cbToSend -= cbPage;
1216 iPage++;
1217 } else {
1218 SFLOG3(("vbsf_reg_sendfile: cbRetActor=%#x (%d) cbPage=%#x RdDesc{count=%#lx error=%d} iPage=%#x/%#x/%#x cbToSend=%#zx\n",
1219 cbRetActor, cbRetActor, cbPage, RdDesc.count, RdDesc.error, iPage, cPagesToRead, cPages, cbToSend));
1220 vrc = VERR_CALLBACK_RETURN;
1221 break;
1222 }
1223 off = 0;
1224 }
1225
1226 /*
1227 * Are we done yet?
1228 */
1229 if (RT_FAILURE_NP(vrc) || cbToSend == 0 || RdDesc.error != 0 || fIsEof) {
1230 break;
1231 }
1232
1233 /*
1234 * Replace pages held by the actor.
1235 */
1236 vrc = VINF_SUCCESS;
1237 for (iPage = 0; iPage < cPages; iPage++) {
1238 struct page *pPage = apPages[iPage];
1239 if (page_count(pPage) != 1) {
1240 struct page *pNewPage = alloc_page(GFP_USER);
1241 if (pNewPage) {
1242 SFLOGFLOW(("vbsf_reg_sendfile: Replacing page #%x: %p -> %p\n", iPage, pPage, pNewPage));
1243 vbsf_put_page(pPage);
1244 apPages[iPage] = pNewPage;
1245 } else {
1246 SFLOGFLOW(("vbsf_reg_sendfile: Failed to allocate a replacement page.\n"));
1247 vrc = VERR_NO_MEMORY;
1248 break;
1249 }
1250 }
1251 }
1252 if (RT_FAILURE(vrc))
1253 break; /* RdDesc.written should be non-zero, so don't bother with setting error. */
1254 } else {
1255 RdDesc.error = vrc == VERR_INTERRUPTED ? -ERESTARTSYS : -RTErrConvertToErrno(vrc);
1256 SFLOGFLOW(("vbsf_reg_sendfile: Read failed: %Rrc -> %zd (RdDesc.error=%#d)\n",
1257 vrc, -RTErrConvertToErrno(vrc), RdDesc.error));
1258 break;
1259 }
1260 }
1261
1262 /*
1263 * Free memory.
1264 */
1265 for (iPage = 0; iPage < cPages; iPage++)
1266 vbsf_put_page(apPages[iPage]);
1267
1268 /*
1269 * Set the return values.
1270 */
1271 if (RdDesc.written) {
1272 cbRet = RdDesc.written;
1273 if (poffFile)
1274 *poffFile = offFile;
1275 } else {
1276 cbRet = RdDesc.error;
1277 }
1278 }
1279 VbglR0PhysHeapFree(pReq);
1280 } else {
1281 cbRet = -ENOMEM;
1282 }
1283 SFLOGFLOW(("vbsf_reg_sendfile: returns %#zx offFile=%#RX64\n", cbRet, offFile));
1284 }
1285 return cbRet;
1286}
1287#endif /* 2.5.30 <= LINUX_VERSION_CODE < 2.6.23 */
1288
1289
1290/*********************************************************************************************************************************
1291* File operations on regular files *
1292*********************************************************************************************************************************/
1293
1294/** Wrapper around put_page / page_cache_release. */
1295DECLINLINE(void) vbsf_put_page(struct page *pPage)
1296{
1297#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 6, 0)
1298 put_page(pPage);
1299#else
1300 page_cache_release(pPage);
1301#endif
1302}
1303
1304
1305/** Wrapper around get_page / page_cache_get. */
1306DECLINLINE(void) vbsf_get_page(struct page *pPage)
1307{
1308#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 6, 0)
1309 get_page(pPage);
1310#else
1311 page_cache_get(pPage);
1312#endif
1313}
1314
1315
1316/** Companion to vbsf_lock_user_pages(). */
1317static void vbsf_unlock_user_pages(struct page **papPages, size_t cPages, bool fSetDirty, bool fLockPgHack)
1318{
1319 /* We don't mark kernel pages dirty: */
1320 if (fLockPgHack)
1321 fSetDirty = false;
1322
1323 while (cPages-- > 0)
1324 {
1325 struct page *pPage = papPages[cPages];
1326 Assert((ssize_t)cPages >= 0);
1327 if (fSetDirty && !PageReserved(pPage))
1328 set_page_dirty(pPage);
1329 vbsf_put_page(pPage);
1330 }
1331}
1332
1333
1334/**
1335 * Worker for vbsf_lock_user_pages_failed_check_kernel() and
1336 * vbsf_iter_lock_pages().
1337 */
1338static int vbsf_lock_kernel_pages(uint8_t *pbStart, bool fWrite, size_t cPages, struct page **papPages)
1339{
1340 uintptr_t const uPtrFrom = (uintptr_t)pbStart;
1341 uintptr_t const uPtrLast = (uPtrFrom & ~(uintptr_t)PAGE_OFFSET_MASK) + (cPages << PAGE_SHIFT) - 1;
1342 uint8_t *pbPage = (uint8_t *)uPtrLast;
1343 size_t iPage = cPages;
1344
1345 /*
1346 * Touch the pages first (paranoia^2).
1347 */
1348 if (fWrite) {
1349 uint8_t volatile *pbProbe = (uint8_t volatile *)uPtrFrom;
1350 while (iPage-- > 0) {
1351 *pbProbe = *pbProbe;
1352 pbProbe += PAGE_SIZE;
1353 }
1354 } else {
1355 uint8_t const *pbProbe = (uint8_t const *)uPtrFrom;
1356 while (iPage-- > 0) {
1357 ASMProbeReadByte(pbProbe);
1358 pbProbe += PAGE_SIZE;
1359 }
1360 }
1361
1362 /*
1363 * Get the pages.
1364 * Note! Fixes here probably applies to rtR0MemObjNativeLockKernel as well.
1365 */
1366 iPage = cPages;
1367 if ( uPtrFrom >= (unsigned long)__va(0)
1368 && uPtrLast < (unsigned long)high_memory) {
1369 /* The physical page mapping area: */
1370 while (iPage-- > 0) {
1371 struct page *pPage = papPages[iPage] = virt_to_page(pbPage);
1372 vbsf_get_page(pPage);
1373 pbPage -= PAGE_SIZE;
1374 }
1375 } else {
1376 /* This is vmalloc or some such thing, so go thru page tables: */
1377 while (iPage-- > 0) {
1378 struct page *pPage = rtR0MemObjLinuxVirtToPage(pbPage);
1379 if (pPage) {
1380 papPages[iPage] = pPage;
1381 vbsf_get_page(pPage);
1382 pbPage -= PAGE_SIZE;
1383 } else {
1384 while (++iPage < cPages) {
1385 pPage = papPages[iPage];
1386 vbsf_put_page(pPage);
1387 }
1388 return -EFAULT;
1389 }
1390 }
1391 }
1392 return 0;
1393}
1394
1395
1396/**
1397 * Catches kernel_read() and kernel_write() calls and works around them.
1398 *
1399 * The file_operations::read and file_operations::write callbacks supposedly
1400 * hand us the user buffers to read into and write out of. To allow the kernel
1401 * to read and write without allocating buffers in userland, kernel_read()
1402 * and kernel_write() increase the user space address limit before calling us
1403 * so that copyin/copyout won't reject it. Our problem is that get_user_pages()
1404 * works on the userspace address space structures and will not be fooled by an
1405 * increased addr_limit.
1406 *
1407 * This code tries to detect this situation and fake the user-page locking for
1408 * the kernel buffer instead (see vbsf_lock_kernel_pages()).
1409 */
1410static int vbsf_lock_user_pages_failed_check_kernel(uintptr_t uPtrFrom, size_t cPages, bool fWrite, int rcFailed,
1411 struct page **papPages, bool *pfLockPgHack)
1412{
1413 /*
1414 * Check that this is valid user memory that is actually in the kernel range.
1415 */
1416#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 0, 0) || defined(RHEL_81)
1417 if ( access_ok((void *)uPtrFrom, cPages << PAGE_SHIFT)
1418 && uPtrFrom >= USER_DS.seg)
1419#else
1420 if ( access_ok(fWrite ? VERIFY_WRITE : VERIFY_READ, (void *)uPtrFrom, cPages << PAGE_SHIFT)
1421 && uPtrFrom >= USER_DS.seg)
1422#endif
1423 {
1424 int rc = vbsf_lock_kernel_pages((uint8_t *)uPtrFrom, fWrite, cPages, papPages);
1425 if (rc == 0) {
1426 *pfLockPgHack = true;
1427 return 0;
1428 }
1429 }
1430
1431 return rcFailed;
1432}
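/*
 * Illustrative sketch (editor's addition, not part of the original source):
 * the situation described above arises when kernel code on older kernels does
 * something along these lines:
 *
 *     mm_segment_t fsSaved = get_fs();
 *     set_fs(KERNEL_DS);
 *     cbRead = pFile->f_op->read(pFile, (char __user *)pvKernelBuf, cb, &off);
 *     set_fs(fsSaved);
 *
 * The raised address limit lets copy_to_user()/copy_from_user() accept the
 * kernel buffer, but get_user_pages() still rejects it, which is what the
 * check above works around.
 */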
1433
1434
1435/** Wrapper around get_user_pages. */
1436DECLINLINE(int) vbsf_lock_user_pages(uintptr_t uPtrFrom, size_t cPages, bool fWrite, struct page **papPages, bool *pfLockPgHack)
1437{
1438# if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 9, 0)
1439 ssize_t cPagesLocked = get_user_pages_unlocked(uPtrFrom, cPages, papPages,
1440 fWrite ? FOLL_WRITE | FOLL_FORCE : FOLL_FORCE);
1441# elif LINUX_VERSION_CODE >= KERNEL_VERSION(4, 6, 0)
1442 ssize_t cPagesLocked = get_user_pages_unlocked(uPtrFrom, cPages, fWrite, 1 /*force*/, papPages);
1443# elif LINUX_VERSION_CODE >= KERNEL_VERSION(4, 4, 168) && LINUX_VERSION_CODE < KERNEL_VERSION(4, 5, 0)
1444 ssize_t cPagesLocked = get_user_pages_unlocked(current, current->mm, uPtrFrom, cPages, papPages,
1445 fWrite ? FOLL_WRITE | FOLL_FORCE : FOLL_FORCE);
1446# elif LINUX_VERSION_CODE >= KERNEL_VERSION(4, 0, 0)
1447 ssize_t cPagesLocked = get_user_pages_unlocked(current, current->mm, uPtrFrom, cPages, fWrite, 1 /*force*/, papPages);
1448# else
1449 struct task_struct *pTask = current;
1450 ssize_t cPagesLocked;
1451 down_read(&pTask->mm->mmap_sem);
1452 cPagesLocked = get_user_pages(pTask, pTask->mm, uPtrFrom, cPages, fWrite, 1 /*force*/, papPages, NULL);
1453 up_read(&pTask->mm->mmap_sem);
1454# endif
1455 *pfLockPgHack = false;
1456 if (cPagesLocked == cPages)
1457 return 0;
1458
1459 /*
1460 * It failed.
1461 */
1462 if (cPagesLocked < 0)
1463 return vbsf_lock_user_pages_failed_check_kernel(uPtrFrom, cPages, fWrite, (int)cPagesLocked, papPages, pfLockPgHack);
1464
1465 vbsf_unlock_user_pages(papPages, cPagesLocked, false /*fSetDirty*/, false /*fLockPgHack*/);
1466
1467 /* We could use uPtrFrom + cPagesLocked to get the correct status here... */
1468 return -EFAULT;
1469}
1470
1471
1472/**
1473 * Read function used when accessing files that are memory mapped.
1474 *
1475 * We read from the page cache here to present a coherent picture of the file
1476 * content.
1477 */
1478static ssize_t vbsf_reg_read_mapped(struct file *file, char /*__user*/ *buf, size_t size, loff_t *off)
1479{
1480#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 16, 0)
1481 struct iovec iov = { .iov_base = buf, .iov_len = size };
1482 struct iov_iter iter;
1483 struct kiocb kiocb;
1484 ssize_t cbRet;
1485
1486 init_sync_kiocb(&kiocb, file);
1487 kiocb.ki_pos = *off;
1488 iov_iter_init(&iter, READ, &iov, 1, size);
1489
1490 cbRet = generic_file_read_iter(&kiocb, &iter);
1491
1492 *off = kiocb.ki_pos;
1493 return cbRet;
1494
1495#elif LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 19)
1496 struct iovec iov = { .iov_base = buf, .iov_len = size };
1497 struct kiocb kiocb;
1498 ssize_t cbRet;
1499
1500 init_sync_kiocb(&kiocb, file);
1501 kiocb.ki_pos = *off;
1502
1503 cbRet = generic_file_aio_read(&kiocb, &iov, 1, *off);
1504 if (cbRet == -EIOCBQUEUED)
1505 cbRet = wait_on_sync_kiocb(&kiocb);
1506
1507 *off = kiocb.ki_pos;
1508 return cbRet;
1509
1510#else /* 2.6.18 or earlier: */
1511 return generic_file_read(file, buf, size, off);
1512#endif
1513}
1514
1515
1516/**
1517 * Fallback case of vbsf_reg_read() that locks the user buffers and let the host
1518 * write directly to them.
1519 */
1520static ssize_t vbsf_reg_read_locking(struct file *file, char /*__user*/ *buf, size_t size, loff_t *off,
1521 struct vbsf_super_info *pSuperInfo, struct vbsf_reg_info *sf_r)
1522{
1523 /*
1524 * Lock pages and execute the read, taking care not to pass the host
1525 * more than it can handle in one go or more than we care to allocate
1526 * page arrays for. The latter limit is set at just short of 32KB due
1527 * to how the physical heap works.
1528 */
1529 struct page *apPagesStack[16];
1530 struct page **papPages = &apPagesStack[0];
1531 struct page **papPagesFree = NULL;
1532 VBOXSFREADPGLSTREQ *pReq;
1533 loff_t offFile = *off;
1534 ssize_t cbRet = -ENOMEM;
1535 size_t cPages = (((uintptr_t)buf & PAGE_OFFSET_MASK) + size + PAGE_OFFSET_MASK) >> PAGE_SHIFT;
1536 size_t cMaxPages = RT_MIN(RT_MAX(pSuperInfo->cMaxIoPages, 1), cPages);
1537 bool fLockPgHack;
1538
1539 pReq = (VBOXSFREADPGLSTREQ *)VbglR0PhysHeapAlloc(RT_UOFFSETOF_DYN(VBOXSFREADPGLSTREQ, PgLst.aPages[cMaxPages]));
1540 while (!pReq && cMaxPages > 4) {
1541 cMaxPages /= 2;
1542 pReq = (VBOXSFREADPGLSTREQ *)VbglR0PhysHeapAlloc(RT_UOFFSETOF_DYN(VBOXSFREADPGLSTREQ, PgLst.aPages[cMaxPages]));
1543 }
1544 if (pReq && cMaxPages > RT_ELEMENTS(apPagesStack))
1545 papPagesFree = papPages = kmalloc(cMaxPages * sizeof(papPages[0]), GFP_KERNEL);
1546 if (pReq && papPages) {
1547 cbRet = 0;
1548 for (;;) {
1549 /*
1550 * Figure out how much to process now and lock the user pages.
1551 */
1552 int rc;
1553 size_t cbChunk = (uintptr_t)buf & PAGE_OFFSET_MASK;
1554 pReq->PgLst.offFirstPage = (uint16_t)cbChunk;
1555 cPages = RT_ALIGN_Z(cbChunk + size, PAGE_SIZE) >> PAGE_SHIFT;
1556 if (cPages <= cMaxPages)
1557 cbChunk = size;
1558 else {
1559 cPages = cMaxPages;
1560 cbChunk = (cMaxPages << PAGE_SHIFT) - cbChunk;
1561 }
1562
1563 rc = vbsf_lock_user_pages((uintptr_t)buf, cPages, true /*fWrite*/, papPages, &fLockPgHack);
1564 if (rc == 0) {
1565 size_t iPage = cPages;
1566 while (iPage-- > 0)
1567 pReq->PgLst.aPages[iPage] = page_to_phys(papPages[iPage]);
1568 } else {
1569 cbRet = rc;
1570 break;
1571 }
1572
1573 /*
1574 * Issue the request and unlock the pages.
1575 */
1576 rc = VbglR0SfHostReqReadPgLst(pSuperInfo->map.root, pReq, sf_r->Handle.hHost, offFile, cbChunk, cPages);
1577
1578 Assert(cPages <= cMaxPages);
1579 vbsf_unlock_user_pages(papPages, cPages, true /*fSetDirty*/, fLockPgHack);
1580
1581 if (RT_SUCCESS(rc)) {
1582 /*
1583 * Success, advance position and buffer.
1584 */
1585 uint32_t cbActual = pReq->Parms.cb32Read.u.value32;
1586 AssertStmt(cbActual <= cbChunk, cbActual = cbChunk);
1587 cbRet += cbActual;
1588 offFile += cbActual;
1589 buf = (uint8_t *)buf + cbActual;
1590 size -= cbActual;
1591
1592 /*
1593 * Are we done already? If so commit the new file offset.
1594 */
1595 if (!size || cbActual < cbChunk) {
1596 *off = offFile;
1597 break;
1598 }
1599 } else if (rc == VERR_NO_MEMORY && cMaxPages > 4) {
1600 /*
1601 * The host probably doesn't have enough heap to handle the
1602 * request, reduce the page count and retry.
1603 */
1604 cMaxPages /= 4;
1605 Assert(cMaxPages > 0);
1606 } else {
1607 /*
1608 * If we've successfully read stuff, return it rather than
1609 * the error. (Not sure if this is such a great idea...)
1610 */
1611 if (cbRet > 0) {
1612 SFLOGFLOW(("vbsf_reg_read: read at %#RX64 -> %Rrc; got cbRet=%#zx already\n", offFile, rc, cbRet));
1613 *off = offFile;
1614 } else {
1615 SFLOGFLOW(("vbsf_reg_read: read at %#RX64 -> %Rrc\n", offFile, rc));
1616 cbRet = -EPROTO;
1617 }
1618 break;
1619 }
1620 }
1621 }
1622 if (papPagesFree)
1623 kfree(papPages);
1624 if (pReq)
1625 VbglR0PhysHeapFree(pReq);
1626 SFLOGFLOW(("vbsf_reg_read: returns %zd (%#zx), *off=%RX64 [lock]\n", cbRet, cbRet, *off));
1627 return cbRet;
1628}
1629
1630
1631/**
1632 * Read from a regular file.
1633 *
1634 * @param file the file
1635 * @param buf the buffer
1636 * @param size length of the buffer
1637 * @param off offset within the file (in/out).
1638 * @returns the number of read bytes on success, Linux error code otherwise
1639 */
1640static ssize_t vbsf_reg_read(struct file *file, char /*__user*/ *buf, size_t size, loff_t *off)
1641{
1642 struct inode *inode = VBSF_GET_F_DENTRY(file)->d_inode;
1643 struct vbsf_super_info *pSuperInfo = VBSF_GET_SUPER_INFO(inode->i_sb);
1644 struct vbsf_reg_info *sf_r = file->private_data;
1645 struct address_space *mapping = inode->i_mapping;
1646
1647 SFLOGFLOW(("vbsf_reg_read: inode=%p file=%p buf=%p size=%#zx off=%#llx\n", inode, file, buf, size, *off));
1648
1649 if (!S_ISREG(inode->i_mode)) {
1650 LogFunc(("read from non regular file %d\n", inode->i_mode));
1651 return -EINVAL;
1652 }
1653
1654 /** @todo XXX Check read permission according to inode->i_mode! */
1655
1656 if (!size)
1657 return 0;
1658
1659 /*
1660 * If there is a mapping and O_DIRECT isn't in effect, we must heed
1661 * dirty pages in the mapping and read from them. For simplicity
1662 * though, we just do page cache reading when there are writable
1663 * mappings around with any kind of pages loaded.
1664 */
1665 if (vbsf_should_use_cached_read(file, mapping, pSuperInfo))
1666 return vbsf_reg_read_mapped(file, buf, size, off);
1667
1668 /*
1669 * For small requests, try to use an embedded buffer, provided we get a heap block
1670 * that does not cross page boundaries (see host code).
1671 */
1672 if (size <= PAGE_SIZE / 4 * 3 - RT_UOFFSETOF(VBOXSFREADEMBEDDEDREQ, abData[0]) /* see allocator */) {
1673 uint32_t const cbReq = RT_UOFFSETOF(VBOXSFREADEMBEDDEDREQ, abData[0]) + size;
1674 VBOXSFREADEMBEDDEDREQ *pReq = (VBOXSFREADEMBEDDEDREQ *)VbglR0PhysHeapAlloc(cbReq);
1675 if (pReq) {
1676 if ((PAGE_SIZE - ((uintptr_t)pReq & PAGE_OFFSET_MASK)) >= cbReq) {
1677 ssize_t cbRet;
1678 int vrc = VbglR0SfHostReqReadEmbedded(pSuperInfo->map.root, pReq, sf_r->Handle.hHost, *off, (uint32_t)size);
1679 if (RT_SUCCESS(vrc)) {
1680 cbRet = pReq->Parms.cb32Read.u.value32;
1681 AssertStmt(cbRet <= (ssize_t)size, cbRet = size);
1682 if (copy_to_user(buf, pReq->abData, cbRet) == 0)
1683 *off += cbRet;
1684 else
1685 cbRet = -EFAULT;
1686 } else
1687 cbRet = -EPROTO;
1688 VbglR0PhysHeapFree(pReq);
1689 SFLOGFLOW(("vbsf_reg_read: returns %zd (%#zx), *off=%RX64 [embed]\n", cbRet, cbRet, *off));
1690 return cbRet;
1691 }
1692 VbglR0PhysHeapFree(pReq);
1693 }
1694 }
1695
1696#if 0 /* Turns out this is slightly slower than locking the pages even for 4KB reads (4.19/amd64). */
1697 /*
1698 * For medium sized requests try use a bounce buffer.
1699 */
1700 if (size <= _64K /** @todo make this configurable? */) {
1701 void *pvBounce = kmalloc(size, GFP_KERNEL);
1702 if (pvBounce) {
1703 VBOXSFREADPGLSTREQ *pReq = (VBOXSFREADPGLSTREQ *)VbglR0PhysHeapAlloc(sizeof(*pReq));
1704 if (pReq) {
1705 ssize_t cbRet;
1706 int vrc = VbglR0SfHostReqReadContig(pSuperInfo->map.root, pReq, sf_r->Handle.hHost, *off,
1707 (uint32_t)size, pvBounce, virt_to_phys(pvBounce));
1708 if (RT_SUCCESS(vrc)) {
1709 cbRet = pReq->Parms.cb32Read.u.value32;
1710 AssertStmt(cbRet <= (ssize_t)size, cbRet = size);
1711 if (copy_to_user(buf, pvBounce, cbRet) == 0)
1712 *off += cbRet;
1713 else
1714 cbRet = -EFAULT;
1715 } else
1716 cbRet = -EPROTO;
1717 VbglR0PhysHeapFree(pReq);
1718 kfree(pvBounce);
1719 SFLOGFLOW(("vbsf_reg_read: returns %zd (%#zx), *off=%RX64 [bounce]\n", cbRet, cbRet, *off));
1720 return cbRet;
1721 }
1722 kfree(pvBounce);
1723 }
1724 }
1725#endif
1726
1727 return vbsf_reg_read_locking(file, buf, size, off, pSuperInfo, sf_r);
1728}
1729
1730
1731/**
1732 * Helper that synchronizes the page cache content with something we just wrote
1733 * to the host.
1734 */
1735static void vbsf_reg_write_sync_page_cache(struct address_space *mapping, loff_t offFile, uint32_t cbRange,
1736 uint8_t const *pbSrcBuf, struct page **papSrcPages,
1737 uint32_t offSrcPage, size_t cSrcPages)
1738{
1739 Assert(offSrcPage < PAGE_SIZE);
1740 if (mapping && mapping->nrpages > 0) {
1741 /*
1742 * Work the pages in the write range.
1743 */
1744 while (cbRange > 0) {
1745 /*
1746 * Look up the page at offFile. We're fine if there aren't
1747 * any there. We skip it if it's dirty or is being written
1748 * back, at least for now.
1749 */
1750 size_t const offDstPage = offFile & PAGE_OFFSET_MASK;
1751 size_t const cbToCopy = RT_MIN(PAGE_SIZE - offDstPage, cbRange);
1752 pgoff_t const idxPage = offFile >> PAGE_SHIFT;
1753 struct page *pDstPage = find_lock_page(mapping, idxPage);
1754 if (pDstPage) {
1755 if ( pDstPage->mapping == mapping /* ignore if re-purposed (paranoia) */
1756 && pDstPage->index == idxPage
1757 && !PageDirty(pDstPage) /* ignore if dirty */
1758 && !PageWriteback(pDstPage) /* ignore if being written back */ ) {
1759 /*
1760 * Map the page and do the copying.
1761 */
1762 uint8_t *pbDst = (uint8_t *)kmap(pDstPage);
1763 if (pbSrcBuf)
1764 memcpy(&pbDst[offDstPage], pbSrcBuf, cbToCopy);
1765 else {
1766 uint32_t const cbSrc0 = PAGE_SIZE - offSrcPage;
1767 uint8_t const *pbSrc = (uint8_t const *)kmap(papSrcPages[0]);
1768 AssertMsg(cSrcPages >= 1, ("offFile=%#llx cbRange=%#zx cbToCopy=%#zx\n", offFile, cbRange, cbToCopy));
1769 memcpy(&pbDst[offDstPage], &pbSrc[offSrcPage], RT_MIN(cbToCopy, cbSrc0));
1770 kunmap(papSrcPages[0]);
1771 if (cbToCopy > cbSrc0) {
1772 AssertMsg(cSrcPages >= 2, ("offFile=%#llx cbRange=%#zx cbToCopy=%#zx\n", offFile, cbRange, cbToCopy));
1773 pbSrc = (uint8_t const *)kmap(papSrcPages[1]);
1774 memcpy(&pbDst[offDstPage + cbSrc0], pbSrc, cbToCopy - cbSrc0);
1775 kunmap(papSrcPages[1]);
1776 }
1777 }
1778 kunmap(pDstPage);
1779 flush_dcache_page(pDstPage);
1780 if (cbToCopy == PAGE_SIZE)
1781 SetPageUptodate(pDstPage);
1782# if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 4, 10)
1783 mark_page_accessed(pDstPage);
1784# endif
1785 } else
1786 SFLOGFLOW(("vbsf_reg_write_sync_page_cache: Skipping page %p: mapping=%p (vs %p) writeback=%d offset=%#lx (vs%#lx)\n",
1787 pDstPage, pDstPage->mapping, mapping, PageWriteback(pDstPage), pDstPage->index, idxPage));
1788 unlock_page(pDstPage);
1789 vbsf_put_page(pDstPage);
1790 }
1791
1792 /*
1793 * Advance.
1794 */
1795 if (pbSrcBuf)
1796 pbSrcBuf += cbToCopy;
1797 else
1798 {
1799 offSrcPage += cbToCopy;
1800 Assert(offSrcPage < PAGE_SIZE * 2);
1801 if (offSrcPage >= PAGE_SIZE) {
1802 offSrcPage &= PAGE_OFFSET_MASK;
1803 papSrcPages++;
1804# ifdef VBOX_STRICT
1805 Assert(cSrcPages > 0);
1806 cSrcPages--;
1807# endif
1808 }
1809 }
1810 offFile += cbToCopy;
1811 cbRange -= cbToCopy;
1812 }
1813 }
1814 RT_NOREF(cSrcPages);
1815}
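/*
 * Illustration of the copy split above (hypothetical numbers, 4 KiB pages):
 * with offSrcPage = 0xe00 and cbToCopy = 0x1000, cbSrc0 = PAGE_SIZE - 0xe00
 * = 0x200 bytes come from papSrcPages[0] and the remaining 0xe00 bytes from
 * papSrcPages[1], because one destination page's worth of data can straddle
 * two locked source pages.
 */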
1816
1817
1818/**
1819 * Fallback case of vbsf_reg_write() that locks the user buffers and lets the
1820 * host read directly from them.
1821 */
1822static ssize_t vbsf_reg_write_locking(struct file *file, const char /*__user*/ *buf, size_t size, loff_t *off, loff_t offFile,
1823 struct inode *inode, struct vbsf_inode_info *sf_i,
1824 struct vbsf_super_info *pSuperInfo, struct vbsf_reg_info *sf_r)
1825{
1826 /*
1827 * Lock pages and execute the write, taking care not to pass the host
1828 * more than it can handle in one go or more than we care to allocate
1829 * page arrays for. The latter limit is set at just short of 32KB due
1830 * to how the physical heap works.
1831 */
1832 struct page *apPagesStack[16];
1833 struct page **papPages = &apPagesStack[0];
1834 struct page **papPagesFree = NULL;
1835 VBOXSFWRITEPGLSTREQ *pReq;
1836 ssize_t cbRet = -ENOMEM;
1837 size_t cPages = (((uintptr_t)buf & PAGE_OFFSET_MASK) + size + PAGE_OFFSET_MASK) >> PAGE_SHIFT;
1838 size_t cMaxPages = RT_MIN(RT_MAX(pSuperInfo->cMaxIoPages, 1), cPages);
1839 bool fLockPgHack;
1840
1841 pReq = (VBOXSFWRITEPGLSTREQ *)VbglR0PhysHeapAlloc(RT_UOFFSETOF_DYN(VBOXSFWRITEPGLSTREQ, PgLst.aPages[cMaxPages]));
1842 while (!pReq && cMaxPages > 4) {
1843 cMaxPages /= 2;
1844 pReq = (VBOXSFWRITEPGLSTREQ *)VbglR0PhysHeapAlloc(RT_UOFFSETOF_DYN(VBOXSFWRITEPGLSTREQ, PgLst.aPages[cMaxPages]));
1845 }
1846 if (pReq && cMaxPages > RT_ELEMENTS(apPagesStack))
1847 papPagesFree = papPages = kmalloc(cMaxPages * sizeof(papPages[0]), GFP_KERNEL);
1848 if (pReq && papPages) {
1849 cbRet = 0;
1850 for (;;) {
1851 /*
1852 * Figure out how much to process now and lock the user pages.
1853 */
1854 int rc;
1855 size_t cbChunk = (uintptr_t)buf & PAGE_OFFSET_MASK;
1856 pReq->PgLst.offFirstPage = (uint16_t)cbChunk;
1857 cPages = RT_ALIGN_Z(cbChunk + size, PAGE_SIZE) >> PAGE_SHIFT;
1858 if (cPages <= cMaxPages)
1859 cbChunk = size;
1860 else {
1861 cPages = cMaxPages;
1862 cbChunk = (cMaxPages << PAGE_SHIFT) - cbChunk;
1863 }
1864
1865 rc = vbsf_lock_user_pages((uintptr_t)buf, cPages, false /*fWrite*/, papPages, &fLockPgHack);
1866 if (rc == 0) {
1867 size_t iPage = cPages;
1868 while (iPage-- > 0)
1869 pReq->PgLst.aPages[iPage] = page_to_phys(papPages[iPage]);
1870 } else {
1871 cbRet = rc;
1872 break;
1873 }
1874
1875 /*
1876 * Issue the request and unlock the pages.
1877 */
1878 rc = VbglR0SfHostReqWritePgLst(pSuperInfo->map.root, pReq, sf_r->Handle.hHost, offFile, cbChunk, cPages);
1879 sf_i->ModificationTimeAtOurLastWrite = sf_i->ModificationTime;
1880 if (RT_SUCCESS(rc)) {
1881 /*
1882 * Success, advance position and buffer.
1883 */
1884 uint32_t cbActual = pReq->Parms.cb32Write.u.value32;
1885 AssertStmt(cbActual <= cbChunk, cbActual = cbChunk);
1886
1887 vbsf_reg_write_sync_page_cache(inode->i_mapping, offFile, cbActual, NULL /*pbKrnlBuf*/,
1888 papPages, (uintptr_t)buf & PAGE_OFFSET_MASK, cPages);
1889 Assert(cPages <= cMaxPages);
1890 vbsf_unlock_user_pages(papPages, cPages, false /*fSetDirty*/, fLockPgHack);
1891
1892 cbRet += cbActual;
1893 buf = (uint8_t *)buf + cbActual;
1894 size -= cbActual;
1895
1896 offFile += cbActual;
1897 if ((file->f_flags & O_APPEND) && (g_fSfFeatures & SHFL_FEATURE_WRITE_UPDATES_OFFSET))
1898 offFile = pReq->Parms.off64Write.u.value64;
1899 if (offFile > i_size_read(inode))
1900 i_size_write(inode, offFile);
1901
1902 sf_i->force_restat = 1; /* mtime (and size) may have changed */
1903
1904 /*
1905 * Are we done already? If so commit the new file offset.
1906 */
1907 if (!size || cbActual < cbChunk) {
1908 *off = offFile;
1909 break;
1910 }
1911 } else {
1912 vbsf_unlock_user_pages(papPages, cPages, false /*fSetDirty*/, fLockPgHack);
1913 if (rc == VERR_NO_MEMORY && cMaxPages > 4) {
1914 /*
1915 * The host probably doesn't have enough heap to handle the
1916 * request, reduce the page count and retry.
1917 */
1918 cMaxPages /= 4;
1919 Assert(cMaxPages > 0);
1920 } else {
1921 /*
1922 * If we've successfully written stuff, return it rather than
1923 * the error. (Not sure if this is such a great idea...)
1924 */
1925 if (cbRet > 0) {
1926 SFLOGFLOW(("vbsf_reg_write: write at %#RX64 -> %Rrc; got cbRet=%#zx already\n", offFile, rc, cbRet));
1927 *off = offFile;
1928 } else {
1929 SFLOGFLOW(("vbsf_reg_write: write at %#RX64 -> %Rrc\n", offFile, rc));
1930 cbRet = -EPROTO;
1931 }
1932 break;
1933 }
1934 }
1935 }
1936 }
1937 if (papPagesFree)
1938 kfree(papPages);
1939 if (pReq)
1940 VbglR0PhysHeapFree(pReq);
1941 SFLOGFLOW(("vbsf_reg_write: returns %zd (%#zx), *off=%RX64 [lock]\n", cbRet, cbRet, *off));
1942 return cbRet;
1943}
1944
1945
1946/**
1947 * Write to a regular file.
1948 *
1949 * @param file the file
1950 * @param buf the buffer
1951 * @param size length of the buffer
1952 * @param off offset within the file
1953 * @returns the number of written bytes on success, Linux error code otherwise
1954 */
1955static ssize_t vbsf_reg_write(struct file *file, const char *buf, size_t size, loff_t *off)
1956{
1957 struct inode *inode = VBSF_GET_F_DENTRY(file)->d_inode;
1958 struct vbsf_inode_info *sf_i = VBSF_GET_INODE_INFO(inode);
1959 struct vbsf_super_info *pSuperInfo = VBSF_GET_SUPER_INFO(inode->i_sb);
1960 struct vbsf_reg_info *sf_r = file->private_data;
1961 struct address_space *mapping = inode->i_mapping;
1962 loff_t pos;
1963
1964 SFLOGFLOW(("vbsf_reg_write: inode=%p file=%p buf=%p size=%#zx off=%#llx\n", inode, file, buf, size, *off));
1965 Assert(sf_i);
1966 Assert(pSuperInfo);
1967 Assert(sf_r);
1968 AssertReturn(S_ISREG(inode->i_mode), -EINVAL);
1969
1970 pos = *off;
1971 if (file->f_flags & O_APPEND)
1972 pos = i_size_read(inode);
1973
1974 /** @todo XXX Check write permission according to inode->i_mode! */
1975
1976 if (!size) {
1977 if (file->f_flags & O_APPEND) /** @todo check if this is the consensus behavior... */
1978 *off = pos;
1979 return 0;
1980 }
1981
1982 /** @todo Implement the read-write caching mode. */
1983
1984 /*
1985 * If there are active writable mappings, coordinate with any
1986 * pending writes via those.
1987 */
1988 if ( mapping
1989 && mapping->nrpages > 0
1990 && mapping_writably_mapped(mapping)) {
1991#if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 32)
1992 int err = filemap_fdatawait_range(mapping, pos, pos + size - 1);
1993 if (err)
1994 return err;
1995#else
1996 /** @todo ... */
1997#endif
1998 }
1999
2000 /*
2001 * For small requests, try to use an embedded buffer, provided we get a heap block
2002 * that does not cross page boundaries (see host code).
2003 */
2004 if (size <= PAGE_SIZE / 4 * 3 - RT_UOFFSETOF(VBOXSFWRITEEMBEDDEDREQ, abData[0]) /* see allocator */) {
2005 uint32_t const cbReq = RT_UOFFSETOF(VBOXSFWRITEEMBEDDEDREQ, abData[0]) + size;
2006 VBOXSFWRITEEMBEDDEDREQ *pReq = (VBOXSFWRITEEMBEDDEDREQ *)VbglR0PhysHeapAlloc(cbReq);
2007 if ( pReq
2008 && (PAGE_SIZE - ((uintptr_t)pReq & PAGE_OFFSET_MASK)) >= cbReq) {
2009 ssize_t cbRet;
2010 if (copy_from_user(pReq->abData, buf, size) == 0) {
2011 int vrc = VbglR0SfHostReqWriteEmbedded(pSuperInfo->map.root, pReq, sf_r->Handle.hHost,
2012 pos, (uint32_t)size);
2013 sf_i->ModificationTimeAtOurLastWrite = sf_i->ModificationTime;
2014 if (RT_SUCCESS(vrc)) {
2015 cbRet = pReq->Parms.cb32Write.u.value32;
2016 AssertStmt(cbRet <= (ssize_t)size, cbRet = size);
2017 vbsf_reg_write_sync_page_cache(mapping, pos, (uint32_t)cbRet, pReq->abData,
2018 NULL /*papSrcPages*/, 0 /*offSrcPage0*/, 0 /*cSrcPages*/);
2019 pos += cbRet;
2020 if ((file->f_flags & O_APPEND) && (g_fSfFeatures & SHFL_FEATURE_WRITE_UPDATES_OFFSET))
2021 pos = pReq->Parms.off64Write.u.value64;
2022 *off = pos;
2023 if (pos > i_size_read(inode))
2024 i_size_write(inode, pos);
2025 } else
2026 cbRet = -EPROTO;
2027 sf_i->force_restat = 1; /* mtime (and size) may have changed */
2028 } else
2029 cbRet = -EFAULT;
2030
2031 VbglR0PhysHeapFree(pReq);
2032 SFLOGFLOW(("vbsf_reg_write: returns %zd (%#zx), *off=%RX64 [embed]\n", cbRet, cbRet, *off));
2033 return cbRet;
2034 }
2035 if (pReq)
2036 VbglR0PhysHeapFree(pReq);
2037 }
2038
2039#if 0 /* Turns out this is slightly slower than locking the pages even for 4KB reads (4.19/amd64). */
2040 /*
2041 * For medium sized requests try use a bounce buffer.
2042 */
2043 if (size <= _64K /** @todo make this configurable? */) {
2044 void *pvBounce = kmalloc(size, GFP_KERNEL);
2045 if (pvBounce) {
2046 if (copy_from_user(pvBounce, buf, size) == 0) {
2047 VBOXSFWRITEPGLSTREQ *pReq = (VBOXSFWRITEPGLSTREQ *)VbglR0PhysHeapAlloc(sizeof(*pReq));
2048 if (pReq) {
2049 ssize_t cbRet;
2050 int vrc = VbglR0SfHostReqWriteContig(pSuperInfo->map.root, pReq, sf_r->Handle.hHost, pos,
2051 (uint32_t)size, pvBounce, virt_to_phys(pvBounce));
2052 sf_i->ModificationTimeAtOurLastWrite = sf_i->ModificationTime;
2053 if (RT_SUCCESS(vrc)) {
2054 cbRet = pReq->Parms.cb32Write.u.value32;
2055 AssertStmt(cbRet <= (ssize_t)size, cbRet = size);
2056 vbsf_reg_write_sync_page_cache(mapping, pos, (uint32_t)cbRet, (uint8_t const *)pvBounce,
2057 NULL /*papSrcPages*/, 0 /*offSrcPage0*/, 0 /*cSrcPages*/);
2058 pos += cbRet;
2059 *off = pos;
2060 if (pos > i_size_read(inode))
2061 i_size_write(inode, pos);
2062 } else
2063 cbRet = -EPROTO;
2064 sf_i->force_restat = 1; /* mtime (and size) may have changed */
2065 VbglR0PhysHeapFree(pReq);
2066 kfree(pvBounce);
2067 SFLOGFLOW(("vbsf_reg_write: returns %zd (%#zx), *off=%RX64 [bounce]\n", cbRet, cbRet, *off));
2068 return cbRet;
2069 }
2070 kfree(pvBounce);
2071 } else {
2072 kfree(pvBounce);
2073 SFLOGFLOW(("vbsf_reg_write: returns -EFAULT, *off=%RX64 [bounce]\n", *off));
2074 return -EFAULT;
2075 }
2076 }
2077 }
2078#endif
2079
2080 return vbsf_reg_write_locking(file, buf, size, off, pos, inode, sf_i, pSuperInfo, sf_r);
2081}
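/*
 * Sizing note for the embedded-buffer fast path above (assuming 4 KiB
 * pages): PAGE_SIZE / 4 * 3 = 3072, so requests up to 3072 bytes minus the
 * VBOXSFWRITEEMBEDDEDREQ header fit the embedded request and stay within a
 * single physical heap page as the host side requires; anything larger
 * falls through to vbsf_reg_write_locking().
 */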
2082
2083#if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 19)
2084
2085/**
2086 * Companion to vbsf_iter_lock_pages().
2087 */
2088DECLINLINE(void) vbsf_iter_unlock_pages(struct iov_iter *iter, struct page **papPages, size_t cPages, bool fSetDirty)
2089{
2090 /* We don't mark kernel pages dirty: */
2091 if (iter->type & ITER_KVEC)
2092 fSetDirty = false;
2093
2094 while (cPages-- > 0)
2095 {
2096 struct page *pPage = papPages[cPages];
2097 if (fSetDirty && !PageReserved(pPage))
2098 set_page_dirty(pPage);
2099 vbsf_put_page(pPage);
2100 }
2101}
2102
2103
2104/**
2105 * Locks up to @a cMaxPages from the I/O vector iterator, advancing the
2106 * iterator.
2107 *
2108 * @returns 0 on success, negative errno value on failure.
2109 * @param iter The iterator to lock pages from.
2110 * @param fWrite Whether to write (true) or read (false) lock the pages.
2111 * @param pStash Where we stash peek results.
2112 * @param cMaxPages The maximum number of pages to get.
2113 * @param papPages Where to return the locked pages.
2114 * @param pcPages Where to return the number of pages.
2115 * @param poffPage0 Where to return the offset into the first page.
2116 * @param pcbChunk Where to return the number of bytes covered.
2117 */
2118static int vbsf_iter_lock_pages(struct iov_iter *iter, bool fWrite, struct vbsf_iter_stash *pStash, size_t cMaxPages,
2119 struct page **papPages, size_t *pcPages, size_t *poffPage0, size_t *pcbChunk)
2120{
2121 size_t cbChunk = 0;
2122 size_t cPages = 0;
2123 size_t offPage0 = 0;
2124 int rc = 0;
2125
2126 Assert(iov_iter_count(iter) + pStash->cb > 0);
2127 if (!(iter->type & ITER_KVEC)) {
2128 /*
2129 * Do we have a stashed page?
2130 */
2131 if (pStash->pPage) {
2132 papPages[0] = pStash->pPage;
2133 offPage0 = pStash->off;
2134 cbChunk = pStash->cb;
2135 cPages = 1;
2136 pStash->pPage = NULL;
2137 pStash->off = 0;
2138 pStash->cb = 0;
2139 if ( offPage0 + cbChunk < PAGE_SIZE
2140 || iov_iter_count(iter) == 0) {
2141 *poffPage0 = offPage0;
2142 *pcbChunk = cbChunk;
2143 *pcPages = cPages;
2144 SFLOGFLOW(("vbsf_iter_lock_pages: returns %d - cPages=%#zx offPage0=%#zx cbChunk=%zx (stashed)\n",
2145 rc, cPages, offPage0, cbChunk));
2146 return 0;
2147 }
2148 cMaxPages -= 1;
2149 SFLOG3(("vbsf_iter_lock_pages: Picked up stashed page: %#zx LB %#zx\n", offPage0, cbChunk));
2150 } else {
2151# if LINUX_VERSION_CODE < KERNEL_VERSION(4, 11, 0)
2152 /*
2153 * Copy out our starting point to assist rewinding.
2154 */
2155 pStash->offFromEnd = iov_iter_count(iter);
2156 pStash->Copy = *iter;
2157# endif
2158 }
2159
2160 /*
2161 * Get pages segment by segment.
2162 */
2163 do {
2164 /*
2165 * Make a special case of the first time through here, since that's
2166 * the most typical scenario.
2167 */
2168 ssize_t cbSegRet;
2169 if (cPages == 0) {
2170# if LINUX_VERSION_CODE < KERNEL_VERSION(3, 19, 0)
2171 while (!iov_iter_single_seg_count(iter)) /* Old code didn't skip empty segments which caused EFAULTs. */
2172 iov_iter_advance(iter, 0);
2173# endif
2174 cbSegRet = iov_iter_get_pages(iter, papPages, iov_iter_count(iter), cMaxPages, &offPage0);
2175 if (cbSegRet > 0) {
2176 iov_iter_advance(iter, cbSegRet);
2177 cbChunk = (size_t)cbSegRet;
2178 cPages = RT_ALIGN_Z(offPage0 + cbSegRet, PAGE_SIZE) >> PAGE_SHIFT;
2179 cMaxPages -= cPages;
2180 SFLOG3(("vbsf_iter_lock_pages: iov_iter_get_pages -> %#zx @ %#zx; %#zx pages [first]\n", cbSegRet, offPage0, cPages));
2181 if ( cMaxPages == 0
2182 || ((offPage0 + (size_t)cbSegRet) & PAGE_OFFSET_MASK))
2183 break;
2184 } else {
2185 AssertStmt(cbSegRet < 0, cbSegRet = -EFAULT);
2186 rc = (int)cbSegRet;
2187 break;
2188 }
2189 } else {
2190 /*
2191 * Probe first page of new segment to check that we've got a zero offset and
2192 * can continue on the current chunk. Stash the page if the offset isn't zero.
2193 */
2194 size_t offPgProbe;
2195 size_t cbSeg = iov_iter_single_seg_count(iter);
2196 while (!cbSeg) {
2197 iov_iter_advance(iter, 0);
2198 cbSeg = iov_iter_single_seg_count(iter);
2199 }
2200 cbSegRet = iov_iter_get_pages(iter, &papPages[cPages], iov_iter_count(iter), 1, &offPgProbe);
2201 if (cbSegRet > 0) {
2202 iov_iter_advance(iter, cbSegRet); /** @todo maybe not do this if we stash the page? */
2203 Assert(offPgProbe + cbSegRet <= PAGE_SIZE);
2204 if (offPgProbe == 0) {
2205 cbChunk += cbSegRet;
2206 cPages += 1;
2207 cMaxPages -= 1;
2208 SFLOG3(("vbsf_iter_lock_pages: iov_iter_get_pages(1) -> %#zx @ %#zx\n", cbSegRet, offPgProbe));
2209 if ( cMaxPages == 0
2210 || cbSegRet != PAGE_SIZE)
2211 break;
2212
2213 /*
2214 * Get the rest of the segment (if anything remaining).
2215 */
2216 cbSeg -= cbSegRet;
2217 if (cbSeg > 0) {
2218 cbSegRet = iov_iter_get_pages(iter, &papPages[cPages], iov_iter_count(iter), cMaxPages, &offPgProbe);
2219 if (cbSegRet > 0) {
2220 size_t const cPgRet = RT_ALIGN_Z((size_t)cbSegRet, PAGE_SIZE) >> PAGE_SHIFT;
2221 Assert(offPgProbe == 0);
2222 iov_iter_advance(iter, cbSegRet);
2223 SFLOG3(("vbsf_iter_lock_pages: iov_iter_get_pages() -> %#zx; %#zx pages\n", cbSegRet, cPgRet));
2224 cPages += cPgRet;
2225 cMaxPages -= cPgRet;
2226 cbChunk += cbSegRet;
2227 if ( cMaxPages == 0
2228 || ((size_t)cbSegRet & PAGE_OFFSET_MASK))
2229 break;
2230 } else {
2231 AssertStmt(cbSegRet < 0, cbSegRet = -EFAULT);
2232 rc = (int)cbSegRet;
2233 break;
2234 }
2235 }
2236 } else {
2237 /* The segment didn't start at a page boundary, so stash it for
2238 the next round: */
2239 SFLOGFLOW(("vbsf_iter_lock_pages: iov_iter_get_pages(1) -> %#zx @ %#zx; stashed\n", cbSegRet, offPgProbe));
2240 Assert(papPages[cPages]);
2241 pStash->pPage = papPages[cPages];
2242 pStash->off = offPgProbe;
2243 pStash->cb = cbSegRet;
2244 break;
2245 }
2246 } else {
2247 AssertStmt(cbSegRet < 0, cbSegRet = -EFAULT);
2248 rc = (int)cbSegRet;
2249 break;
2250 }
2251 }
2252 Assert(cMaxPages > 0);
2253 } while (iov_iter_count(iter) > 0);
2254
2255 } else {
2256 /*
2257 * The silly iov_iter_get_pages_alloc() function doesn't handle KVECs,
2258 * so everyone needs to do that by themselves.
2259 *
2260 * Note! Fixes here may apply to rtR0MemObjNativeLockKernel()
2261 * and vbsf_lock_user_pages_failed_check_kernel() as well.
2262 */
2263# if LINUX_VERSION_CODE < KERNEL_VERSION(4, 11, 0)
2264 pStash->offFromEnd = iov_iter_count(iter);
2265 pStash->Copy = *iter;
2266# endif
2267 do {
2268 uint8_t *pbBuf;
2269 size_t offStart;
2270 size_t cPgSeg;
2271
2272 size_t cbSeg = iov_iter_single_seg_count(iter);
2273 while (!cbSeg) {
2274 iov_iter_advance(iter, 0);
2275 cbSeg = iov_iter_single_seg_count(iter);
2276 }
2277
2278# if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 19, 0)
2279 pbBuf = iter->kvec->iov_base + iter->iov_offset;
2280# else
2281 pbBuf = iter->iov->iov_base + iter->iov_offset;
2282# endif
2283 offStart = (uintptr_t)pbBuf & PAGE_OFFSET_MASK;
2284 if (!cPages)
2285 offPage0 = offStart;
2286 else if (offStart)
2287 break;
2288
2289 cPgSeg = RT_ALIGN_Z(cbSeg, PAGE_SIZE) >> PAGE_SHIFT;
2290 if (cPgSeg > cMaxPages) {
2291 cPgSeg = cMaxPages;
2292 cbSeg = (cPgSeg << PAGE_SHIFT) - offStart;
2293 }
2294
2295 rc = vbsf_lock_kernel_pages(pbBuf, fWrite, cPgSeg, &papPages[cPages]);
2296 if (rc == 0) {
2297 iov_iter_advance(iter, cbSeg);
2298 cbChunk += cbSeg;
2299 cPages += cPgSeg;
2300 cMaxPages -= cPgSeg;
2301 if ( cMaxPages == 0
2302 || ((offStart + cbSeg) & PAGE_OFFSET_MASK) != 0)
2303 break;
2304 } else
2305 break;
2306 } while (iov_iter_count(iter) > 0);
2307 }
2308
2309 /*
2310 * Clean up if we failed; set return values.
2311 */
2312 if (rc == 0) {
2313 /* likely */
2314 } else {
2315 if (cPages > 0)
2316 vbsf_iter_unlock_pages(iter, papPages, cPages, false /*fSetDirty*/);
2317 offPage0 = cbChunk = cPages = 0;
2318 }
2319 *poffPage0 = offPage0;
2320 *pcbChunk = cbChunk;
2321 *pcPages = cPages;
2322 SFLOGFLOW(("vbsf_iter_lock_pages: returns %d - cPages=%#zx offPage0=%#zx cbChunk=%zx\n", rc, cPages, offPage0, cbChunk));
2323 return rc;
2324}
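/*
 * Example of the stashing behaviour above (hypothetical iovec layout): with
 * segments { base = ...0000, len = 0x2000 } and { base = ...0800, len = 0x800 },
 * probing the second segment returns offPgProbe = 0x800, so that page is
 * stashed and the first chunk covers only the 0x2000 bytes of segment 0; the
 * next call then starts with the stashed page as papPages[0] at offset 0x800.
 */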
2325
2326
2327/**
2328 * Rewinds the I/O vector.
2329 */
2330static bool vbsf_iter_rewind(struct iov_iter *iter, struct vbsf_iter_stash *pStash, size_t cbToRewind, size_t cbChunk)
2331{
2332 size_t cbExtra;
2333 if (!pStash->pPage) {
2334 cbExtra = 0;
2335 } else {
2336 cbExtra = pStash->cb;
2337 vbsf_put_page(pStash->pPage);
2338 pStash->pPage = NULL;
2339 pStash->cb = 0;
2340 pStash->off = 0;
2341 }
2342
2343# if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 11, 0) || LINUX_VERSION_CODE < KERNEL_VERSION(3, 16, 0)
2344 iov_iter_revert(iter, cbToRewind + cbExtra);
2345 return true;
2346# else
2347 /** @todo impl this */
2348 return false;
2349# endif
2350}
2351
2352
2353/**
2354 * Cleans up the page locking stash.
2355 */
2356DECLINLINE(void) vbsf_iter_cleanup_stash(struct iov_iter *iter, struct vbsf_iter_stash *pStash)
2357{
2358 if (pStash->pPage)
2359 vbsf_iter_rewind(iter, pStash, 0, 0);
2360}
2361
2362
2363/**
2364 * Calculates the longest span of pages we could transfer to the host in a
2365 * single request.
2366 *
2367 * @returns Page count, non-zero.
2368 * @param iter The I/O vector iterator to inspect.
2369 */
2370static size_t vbsf_iter_max_span_of_pages(struct iov_iter *iter)
2371{
2372 size_t cPages;
2373# if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 16, 0)
2374 if (iter_is_iovec(iter) || (iter->type & ITER_KVEC)) {
2375# endif
2376 const struct iovec *pCurIov = iter->iov;
2377 size_t cLeft = iter->nr_segs;
2378 size_t cPagesSpan = 0;
2379
2380 /* iovec and kvec are identical, except for the __user tagging of iov_base. */
2381 AssertCompileMembersSameSizeAndOffset(struct iovec, iov_base, struct kvec, iov_base);
2382 AssertCompileMembersSameSizeAndOffset(struct iovec, iov_len, struct kvec, iov_len);
2383 AssertCompile(sizeof(struct iovec) == sizeof(struct kvec));
2384
2385 cPages = 1;
2386 AssertReturn(cLeft > 0, cPages);
2387
2388 /* Special case: segment offset. */
2389 if (iter->iov_offset > 0) {
2390 if (iter->iov_offset < pCurIov->iov_len) {
2391 size_t const cbSegLeft = pCurIov->iov_len - iter->iov_offset;
2392 size_t const offPage0 = ((uintptr_t)pCurIov->iov_base + iter->iov_offset) & PAGE_OFFSET_MASK;
2393 cPages = cPagesSpan = RT_ALIGN_Z(offPage0 + cbSegLeft, PAGE_SIZE) >> PAGE_SHIFT;
2394 if ((offPage0 + cbSegLeft) & PAGE_OFFSET_MASK)
2395 cPagesSpan = 0;
2396 }
2397 SFLOGFLOW(("vbsf_iter: seg[0]= %p LB %#zx\n", pCurIov->iov_base, pCurIov->iov_len));
2398 pCurIov++;
2399 cLeft--;
2400 }
2401
2402 /* Full segments. */
2403 while (cLeft-- > 0) {
2404 if (pCurIov->iov_len > 0) {
2405 size_t const offPage0 = (uintptr_t)pCurIov->iov_base & PAGE_OFFSET_MASK;
2406 if (offPage0 == 0) {
2407 if (!(pCurIov->iov_len & PAGE_OFFSET_MASK)) {
2408 cPagesSpan += pCurIov->iov_len >> PAGE_SHIFT;
2409 } else {
2410 cPagesSpan += RT_ALIGN_Z(pCurIov->iov_len, PAGE_SIZE) >> PAGE_SHIFT;
2411 if (cPagesSpan > cPages)
2412 cPages = cPagesSpan;
2413 cPagesSpan = 0;
2414 }
2415 } else {
2416 if (cPagesSpan > cPages)
2417 cPages = cPagesSpan;
2418 if (!((offPage0 + pCurIov->iov_len) & PAGE_OFFSET_MASK)) {
2419 cPagesSpan = pCurIov->iov_len >> PAGE_SHIFT;
2420 } else {
2421 cPagesSpan += RT_ALIGN_Z(offPage0 + pCurIov->iov_len, PAGE_SIZE) >> PAGE_SHIFT;
2422 if (cPagesSpan > cPages)
2423 cPages = cPagesSpan;
2424 cPagesSpan = 0;
2425 }
2426 }
2427 }
2428 SFLOGFLOW(("vbsf_iter: seg[%u]= %p LB %#zx\n", iter->nr_segs - cLeft, pCurIov->iov_base, pCurIov->iov_len));
2429 pCurIov++;
2430 }
2431 if (cPagesSpan > cPages)
2432 cPages = cPagesSpan;
2433# if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 16, 0)
2434 } else {
2435 /* Won't bother with accurate counts for the next two types, just make
2436 some rough estimates (do pipes have segments?): */
2437 size_t cSegs = iter->type & ITER_BVEC ? RT_MAX(1, iter->nr_segs) : 1;
2438 cPages = (iov_iter_count(iter) + (PAGE_SIZE * 2 - 2) * cSegs) >> PAGE_SHIFT;
2439 }
2440# endif
2441 SFLOGFLOW(("vbsf_iter_max_span_of_pages: returns %#zx\n", cPages));
2442 return cPages;
2443}
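/*
 * Example span calculation (hypothetical iovec layout, 4 KiB pages): two
 * segments of 0x3000 and 0x1800 bytes, where the first both starts and ends
 * on a page boundary and the second starts on one, give a maximum span of
 * 3 + 2 = 5 pages, i.e. both can be handed to the host in one request; a
 * segment starting in the middle of a page would instead terminate the span.
 */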
2444
2445
2446/**
2447 * Worker for vbsf_reg_read_iter() that deals with larger reads using page
2448 * locking.
2449 */
2450static ssize_t vbsf_reg_read_iter_locking(struct kiocb *kio, struct iov_iter *iter, size_t cbToRead,
2451 struct vbsf_super_info *pSuperInfo, struct vbsf_reg_info *sf_r)
2452{
2453 /*
2454 * Estimate how many pages we may possibly submit in a single request so
2455 * that we can allocate a matching request buffer and page array.
2456 */
2457 struct page *apPagesStack[16];
2458 struct page **papPages = &apPagesStack[0];
2459 struct page **papPagesFree = NULL;
2460 VBOXSFREADPGLSTREQ *pReq;
2461 ssize_t cbRet = 0;
2462 size_t cMaxPages = vbsf_iter_max_span_of_pages(iter);
2463 cMaxPages = RT_MIN(RT_MAX(pSuperInfo->cMaxIoPages, 2), cMaxPages);
2464
2465 pReq = (VBOXSFREADPGLSTREQ *)VbglR0PhysHeapAlloc(RT_UOFFSETOF_DYN(VBOXSFREADPGLSTREQ, PgLst.aPages[cMaxPages]));
2466 while (!pReq && cMaxPages > 4) {
2467 cMaxPages /= 2;
2468 pReq = (VBOXSFREADPGLSTREQ *)VbglR0PhysHeapAlloc(RT_UOFFSETOF_DYN(VBOXSFREADPGLSTREQ, PgLst.aPages[cMaxPages]));
2469 }
2470 if (pReq && cMaxPages > RT_ELEMENTS(apPagesStack))
2471 papPagesFree = papPages = kmalloc(cMaxPages * sizeof(papPages[0]), GFP_KERNEL);
2472 if (pReq && papPages) {
2473
2474 /*
2475 * The read loop.
2476 */
2477 struct vbsf_iter_stash Stash = VBSF_ITER_STASH_INITIALIZER;
2478 do {
2479 /*
2480 * Grab as many pages as we can. This means that if adjacent
2481 * segments both start and end at a page boundary, we can
2482 * do them both in the same transfer from the host.
2483 */
2484 size_t cPages = 0;
2485 size_t cbChunk = 0;
2486 size_t offPage0 = 0;
2487 int rc = vbsf_iter_lock_pages(iter, true /*fWrite*/, &Stash, cMaxPages, papPages, &cPages, &offPage0, &cbChunk);
2488 if (rc == 0) {
2489 size_t iPage = cPages;
2490 while (iPage-- > 0)
2491 pReq->PgLst.aPages[iPage] = page_to_phys(papPages[iPage]);
2492 pReq->PgLst.offFirstPage = (uint16_t)offPage0;
2493 AssertStmt(cbChunk <= cbToRead, cbChunk = cbToRead);
2494 } else {
2495 cbRet = rc;
2496 break;
2497 }
2498
2499 /*
2500 * Issue the request and unlock the pages.
2501 */
2502 rc = VbglR0SfHostReqReadPgLst(pSuperInfo->map.root, pReq, sf_r->Handle.hHost, kio->ki_pos, cbChunk, cPages);
2503 SFLOGFLOW(("vbsf_reg_read_iter_locking: VbglR0SfHostReqReadPgLst -> %d (cbActual=%#x cbChunk=%#zx of %#zx cPages=%#zx offPage0=%#x\n",
2504 rc, pReq->Parms.cb32Read.u.value32, cbChunk, cbToRead, cPages, offPage0));
2505
2506 vbsf_iter_unlock_pages(iter, papPages, cPages, true /*fSetDirty*/);
2507
2508 if (RT_SUCCESS(rc)) {
2509 /*
2510 * Success, advance position and buffer.
2511 */
2512 uint32_t cbActual = pReq->Parms.cb32Read.u.value32;
2513 AssertStmt(cbActual <= cbChunk, cbActual = cbChunk);
2514 cbRet += cbActual;
2515 kio->ki_pos += cbActual;
2516 cbToRead -= cbActual;
2517
2518 /*
2519 * Are we done already?
2520 */
2521 if (!cbToRead)
2522 break;
2523 if (cbActual < cbChunk) { /* We ASSUME end-of-file here. */
2524 if (vbsf_iter_rewind(iter, &Stash, cbChunk - cbActual, cbActual))
2525 iov_iter_truncate(iter, 0);
2526 break;
2527 }
2528 } else {
2529 /*
2530 * Try rewind the iter structure.
2531 */
2532 bool const fRewindOkay = vbsf_iter_rewind(iter, &Stash, cbChunk, cbChunk);
2533 if (rc == VERR_NO_MEMORY && cMaxPages > 4 && fRewindOkay) {
2534 /*
2535 * The host probably doesn't have enough heap to handle the
2536 * request, reduce the page count and retry.
2537 */
2538 cMaxPages /= 4;
2539 Assert(cMaxPages > 0);
2540 } else {
2541 /*
2542 * If we've successfully read stuff, return it rather than
2543 * the error. (Not sure if this is such a great idea...)
2544 */
2545 if (cbRet <= 0)
2546 cbRet = -EPROTO;
2547 break;
2548 }
2549 }
2550 } while (cbToRead > 0);
2551
2552 vbsf_iter_cleanup_stash(iter, &Stash);
2553 }
2554 else
2555 cbRet = -ENOMEM;
2556 if (papPagesFree)
2557 kfree(papPages);
2558 if (pReq)
2559 VbglR0PhysHeapFree(pReq);
2560 SFLOGFLOW(("vbsf_reg_read_iter_locking: returns %#zx (%zd)\n", cbRet, cbRet));
2561 return cbRet;
2562}
2563
2564
2565/**
2566 * Read into I/O vector iterator.
2567 *
2568 * @returns Number of bytes read on success, negative errno on error.
2569 * @param kio The kernel I/O control block (or something like that).
2570 * @param iter The I/O vector iterator describing the buffer.
2571 */
2572# if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 16, 0)
2573static ssize_t vbsf_reg_read_iter(struct kiocb *kio, struct iov_iter *iter)
2574# else
2575static ssize_t vbsf_reg_aio_read(struct kiocb *kio, const struct iovec *iov, unsigned long cSegs, loff_t offFile)
2576# endif
2577{
2578# if LINUX_VERSION_CODE < KERNEL_VERSION(3, 16, 0)
2579 struct vbsf_iov_iter fake_iter = VBSF_IOV_ITER_INITIALIZER(cSegs, iov, 0 /*write*/);
2580 struct vbsf_iov_iter *iter = &fake_iter;
2581# endif
2582 size_t cbToRead = iov_iter_count(iter);
2583 struct inode *inode = VBSF_GET_F_DENTRY(kio->ki_filp)->d_inode;
2584 struct address_space *mapping = inode->i_mapping;
2585
2586 struct vbsf_reg_info *sf_r = kio->ki_filp->private_data;
2587 struct vbsf_super_info *pSuperInfo = VBSF_GET_SUPER_INFO(inode->i_sb);
2588
2589 SFLOGFLOW(("vbsf_reg_read_iter: inode=%p file=%p size=%#zx off=%#llx type=%#x\n",
2590 inode, kio->ki_filp, cbToRead, kio->ki_pos, iter->type));
2591 AssertReturn(S_ISREG(inode->i_mode), -EINVAL);
2592
2593 /*
2594 * Do we have anything at all to do here?
2595 */
2596 if (!cbToRead)
2597 return 0;
2598
2599 /*
2600 * If there is a mapping and O_DIRECT isn't in effect, we must heed
2601 * dirty pages in the mapping and read from them. For simplicity
2602 * though, we just do page cache reading when there are writable
2603 * mappings around with any kind of pages loaded.
2604 */
2605 if (vbsf_should_use_cached_read(kio->ki_filp, mapping, pSuperInfo)) {
2606# if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 16, 0)
2607 return generic_file_read_iter(kio, iter);
2608# else
2609 return generic_file_aio_read(kio, iov, cSegs, offFile);
2610# endif
2611 }
2612
2613 /*
2614 * Now now we reject async I/O requests.
2615 */
2616 if (!is_sync_kiocb(kio)) {
2617 SFLOGFLOW(("vbsf_reg_read_iter: async I/O not yet supported\n")); /** @todo extend FsPerf with AIO tests. */
2618 return -EOPNOTSUPP;
2619 }
2620
2621 /*
2622 * For small requests, try to use an embedded buffer, provided we get a heap block
2623 * that does not cross page boundaries (see host code).
2624 */
2625 if (cbToRead <= PAGE_SIZE / 4 * 3 - RT_UOFFSETOF(VBOXSFREADEMBEDDEDREQ, abData[0]) /* see allocator */) {
2626 uint32_t const cbReq = RT_UOFFSETOF(VBOXSFREADEMBEDDEDREQ, abData[0]) + cbToRead;
2627 VBOXSFREADEMBEDDEDREQ *pReq = (VBOXSFREADEMBEDDEDREQ *)VbglR0PhysHeapAlloc(cbReq);
2628 if (pReq) {
2629 if ((PAGE_SIZE - ((uintptr_t)pReq & PAGE_OFFSET_MASK)) >= cbReq) {
2630 ssize_t cbRet;
2631 int vrc = VbglR0SfHostReqReadEmbedded(pSuperInfo->map.root, pReq, sf_r->Handle.hHost,
2632 kio->ki_pos, (uint32_t)cbToRead);
2633 if (RT_SUCCESS(vrc)) {
2634 cbRet = pReq->Parms.cb32Read.u.value32;
2635 AssertStmt(cbRet <= (ssize_t)cbToRead, cbRet = cbToRead);
2636 if (copy_to_iter(pReq->abData, cbRet, iter) == cbRet) {
2637 kio->ki_pos += cbRet;
2638 if (cbRet < cbToRead)
2639 iov_iter_truncate(iter, 0);
2640 } else
2641 cbRet = -EFAULT;
2642 } else
2643 cbRet = -EPROTO;
2644 VbglR0PhysHeapFree(pReq);
2645 SFLOGFLOW(("vbsf_reg_read_iter: returns %#zx (%zd)\n", cbRet, cbRet));
2646 return cbRet;
2647 }
2648 VbglR0PhysHeapFree(pReq);
2649 }
2650 }
2651
2652 /*
2653 * Otherwise do the page locking thing.
2654 */
2655 return vbsf_reg_read_iter_locking(kio, iter, cbToRead, pSuperInfo, sf_r);
2656}
2657
2658
2659/**
2660 * Worker for vbsf_reg_write_iter() that deals with larger writes using page
2661 * locking.
2662 */
2663static ssize_t vbsf_reg_write_iter_locking(struct kiocb *kio, struct iov_iter *iter, size_t cbToWrite, loff_t offFile,
2664 struct vbsf_super_info *pSuperInfo, struct vbsf_reg_info *sf_r, struct inode *inode,
2665 struct vbsf_inode_info *sf_i, struct address_space *mapping, bool fAppend)
2666{
2667 /*
2668 * Estimate how many pages we may possibly submit in a single request so
2669 * that we can allocate a matching request buffer and page array.
2670 */
2671 struct page *apPagesStack[16];
2672 struct page **papPages = &apPagesStack[0];
2673 struct page **papPagesFree = NULL;
2674 VBOXSFWRITEPGLSTREQ *pReq;
2675 ssize_t cbRet = 0;
2676 size_t cMaxPages = vbsf_iter_max_span_of_pages(iter);
2677 cMaxPages = RT_MIN(RT_MAX(pSuperInfo->cMaxIoPages, 2), cMaxPages);
2678
2679 pReq = (VBOXSFWRITEPGLSTREQ *)VbglR0PhysHeapAlloc(RT_UOFFSETOF_DYN(VBOXSFWRITEPGLSTREQ, PgLst.aPages[cMaxPages]));
2680 while (!pReq && cMaxPages > 4) {
2681 cMaxPages /= 2;
2682 pReq = (VBOXSFWRITEPGLSTREQ *)VbglR0PhysHeapAlloc(RT_UOFFSETOF_DYN(VBOXSFWRITEPGLSTREQ, PgLst.aPages[cMaxPages]));
2683 }
2684 if (pReq && cMaxPages > RT_ELEMENTS(apPagesStack))
2685 papPagesFree = papPages = kmalloc(cMaxPages * sizeof(papPages[0]), GFP_KERNEL);
2686 if (pReq && papPages) {
2687
2688 /*
2689 * The write loop.
2690 */
2691 struct vbsf_iter_stash Stash = VBSF_ITER_STASH_INITIALIZER;
2692 do {
2693 /*
2694 * Grab as many pages as we can. This means that if adjacent
2695 * segments both start and end at a page boundary, we can
2696 * do them both in the same transfer from the host.
2697 */
2698 size_t cPages = 0;
2699 size_t cbChunk = 0;
2700 size_t offPage0 = 0;
2701 int rc = vbsf_iter_lock_pages(iter, false /*fWrite*/, &Stash, cMaxPages, papPages, &cPages, &offPage0, &cbChunk);
2702 if (rc == 0) {
2703 size_t iPage = cPages;
2704 while (iPage-- > 0)
2705 pReq->PgLst.aPages[iPage] = page_to_phys(papPages[iPage]);
2706 pReq->PgLst.offFirstPage = (uint16_t)offPage0;
2707 AssertStmt(cbChunk <= cbToWrite, cbChunk = cbToWrite);
2708 } else {
2709 cbRet = rc;
2710 break;
2711 }
2712
2713 /*
2714 * Issue the request and unlock the pages.
2715 */
2716 rc = VbglR0SfHostReqWritePgLst(pSuperInfo->map.root, pReq, sf_r->Handle.hHost, offFile, cbChunk, cPages);
2717 sf_i->ModificationTimeAtOurLastWrite = sf_i->ModificationTime;
2718 SFLOGFLOW(("vbsf_reg_write_iter_locking: VbglR0SfHostReqWritePgLst -> %d (cbActual=%#x cbChunk=%#zx of %#zx cPages=%#zx offPage0=%#x\n",
2719 rc, pReq->Parms.cb32Write.u.value32, cbChunk, cbToWrite, cPages, offPage0));
2720 if (RT_SUCCESS(rc)) {
2721 /*
2722 * Success, advance position and buffer.
2723 */
2724 uint32_t cbActual = pReq->Parms.cb32Write.u.value32;
2725 AssertStmt(cbActual <= cbChunk, cbActual = cbChunk);
2726
2727 vbsf_reg_write_sync_page_cache(mapping, offFile, cbActual, NULL /*pbSrcBuf*/, papPages, offPage0, cPages);
2728 vbsf_iter_unlock_pages(iter, papPages, cPages, false /*fSetDirty*/);
2729
2730 cbRet += cbActual;
2731 cbToWrite -= cbActual;
2732
2733 offFile += cbActual;
2734 if (fAppend && (g_fSfFeatures & SHFL_FEATURE_WRITE_UPDATES_OFFSET))
2735 offFile = pReq->Parms.off64Write.u.value64;
2736 kio->ki_pos = offFile;
2737 if (offFile > i_size_read(inode))
2738 i_size_write(inode, offFile);
2739
2740 sf_i->force_restat = 1; /* mtime (and size) may have changed */
2741
2742 /*
2743 * Are we done already?
2744 */
2745 if (!cbToWrite)
2746 break;
2747 if (cbActual < cbChunk) { /* We ASSUME end-of-file here. */
2748 if (vbsf_iter_rewind(iter, &Stash, cbChunk - cbActual, cbActual))
2749 iov_iter_truncate(iter, 0);
2750 break;
2751 }
2752 } else {
2753 /*
2754 * Try to rewind the iter structure.
2755 */
2756 bool fRewindOkay;
2757 vbsf_iter_unlock_pages(iter, papPages, cPages, false /*fSetDirty*/);
2758 fRewindOkay = vbsf_iter_rewind(iter, &Stash, cbChunk, cbChunk);
2759 if (rc == VERR_NO_MEMORY && cMaxPages > 4 && fRewindOkay) {
2760 /*
2761 * The host probably doesn't have enough heap to handle the
2762 * request, reduce the page count and retry.
2763 */
2764 cMaxPages /= 4;
2765 Assert(cMaxPages > 0);
2766 } else {
2767 /*
2768 * If we've successfully written stuff, return it rather than
2769 * the error. (Not sure if this is such a great idea...)
2770 */
2771 if (cbRet <= 0)
2772 cbRet = -EPROTO;
2773 break;
2774 }
2775 }
2776 } while (cbToWrite > 0);
2777
2778 vbsf_iter_cleanup_stash(iter, &Stash);
2779 }
2780 else
2781 cbRet = -ENOMEM;
2782 if (papPagesFree)
2783 kfree(papPages);
2784 if (pReq)
2785 VbglR0PhysHeapFree(pReq);
2786 SFLOGFLOW(("vbsf_reg_write_iter_locking: returns %#zx (%zd)\n", cbRet, cbRet));
2787 return cbRet;
2788}
2789
2790
2791/**
2792 * Write from I/O vector iterator.
2793 *
2794 * @returns Number of bytes written on success, negative errno on error.
2795 * @param kio The kernel I/O control block (or something like that).
2796 * @param iter The I/O vector iterator describing the buffer.
2797 */
2798# if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 16, 0)
2799static ssize_t vbsf_reg_write_iter(struct kiocb *kio, struct iov_iter *iter)
2800# else
2801static ssize_t vbsf_reg_aio_write(struct kiocb *kio, const struct iovec *iov, unsigned long cSegs, loff_t offFile)
2802# endif
2803{
2804# if LINUX_VERSION_CODE < KERNEL_VERSION(3, 16, 0)
2805 struct vbsf_iov_iter fake_iter = VBSF_IOV_ITER_INITIALIZER(cSegs, iov, 1 /*write*/);
2806 struct vbsf_iov_iter *iter = &fake_iter;
2807# endif
2808 size_t cbToWrite = iov_iter_count(iter);
2809 struct inode *inode = VBSF_GET_F_DENTRY(kio->ki_filp)->d_inode;
2810 struct vbsf_inode_info *sf_i = VBSF_GET_INODE_INFO(inode);
2811 struct address_space *mapping = inode->i_mapping;
2812
2813 struct vbsf_reg_info *sf_r = kio->ki_filp->private_data;
2814 struct vbsf_super_info *pSuperInfo = VBSF_GET_SUPER_INFO(inode->i_sb);
2815# if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 16, 0)
2816 loff_t offFile = kio->ki_pos;
2817# endif
2818# if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 1, 0)
2819 bool const fAppend = RT_BOOL(kio->ki_flags & IOCB_APPEND);
2820# else
2821 bool const fAppend = RT_BOOL(kio->ki_filp->f_flags & O_APPEND);
2822# endif
2823
2824
2825 SFLOGFLOW(("vbsf_reg_write_iter: inode=%p file=%p size=%#zx off=%#llx type=%#x\n",
2826 inode, kio->ki_filp, cbToWrite, offFile, iter->type));
2827 AssertReturn(S_ISREG(inode->i_mode), -EINVAL);
2828
2829 /*
2830 * Enforce APPEND flag (more later).
2831 */
2832 if (fAppend)
2833 kio->ki_pos = offFile = i_size_read(inode);
2834
2835 /*
2836 * Do we have anything at all to do here?
2837 */
2838 if (!cbToWrite)
2839 return 0;
2840
2841 /** @todo Implement the read-write caching mode. */
2842
2843 /*
2844 * For now we reject async I/O requests.
2845 */
2846 if (!is_sync_kiocb(kio)) {
2847 SFLOGFLOW(("vbsf_reg_write_iter: async I/O not yet supported\n")); /** @todo extend FsPerf with AIO tests. */
2848 return -EOPNOTSUPP;
2849 }
2850
2851 /*
2852 * If there are active writable mappings, coordinate with any
2853 * pending writes via those.
2854 */
2855 if ( mapping
2856 && mapping->nrpages > 0
2857 && mapping_writably_mapped(mapping)) {
2858# if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 32)
2859 int err = filemap_fdatawait_range(mapping, offFile, offFile + cbToWrite - 1);
2860 if (err)
2861 return err;
2862# else
2863 /** @todo ... */
2864# endif
2865 }
2866
2867 /*
2868 * For small requests, try to use an embedded buffer, provided we get a heap block
2869 * that does not cross page boundaries (see host code).
2870 */
2871 if (cbToWrite <= PAGE_SIZE / 4 * 3 - RT_UOFFSETOF(VBOXSFWRITEEMBEDDEDREQ, abData[0]) /* see allocator */) {
2872 uint32_t const cbReq = RT_UOFFSETOF(VBOXSFWRITEEMBEDDEDREQ, abData[0]) + cbToWrite;
2873 VBOXSFWRITEEMBEDDEDREQ *pReq = (VBOXSFWRITEEMBEDDEDREQ *)VbglR0PhysHeapAlloc(cbReq);
2874 if (pReq) {
2875 if ((PAGE_SIZE - ((uintptr_t)pReq & PAGE_OFFSET_MASK)) >= cbReq) {
2876 ssize_t cbRet;
2877 if (copy_from_iter(pReq->abData, cbToWrite, iter) == cbToWrite) {
2878 int vrc = VbglR0SfHostReqWriteEmbedded(pSuperInfo->map.root, pReq, sf_r->Handle.hHost,
2879 offFile, (uint32_t)cbToWrite);
2880 sf_i->ModificationTimeAtOurLastWrite = sf_i->ModificationTime;
2881 if (RT_SUCCESS(vrc)) {
2882 cbRet = pReq->Parms.cb32Write.u.value32;
2883 AssertStmt(cbRet <= (ssize_t)cbToWrite, cbRet = cbToWrite);
2884 vbsf_reg_write_sync_page_cache(mapping, offFile, (uint32_t)cbRet, pReq->abData,
2885 NULL /*papSrcPages*/, 0 /*offSrcPage0*/, 0 /*cSrcPages*/);
2886
2887 offFile += cbRet;
2888 if (fAppend && (g_fSfFeatures & SHFL_FEATURE_WRITE_UPDATES_OFFSET))
2889 offFile = pReq->Parms.off64Write.u.value64;
2890 kio->ki_pos = offFile;
2891 if (offFile > i_size_read(inode))
2892 i_size_write(inode, offFile);
2893
2894# if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 11, 0)
2895 if ((size_t)cbRet < cbToWrite)
2896 iov_iter_revert(iter, cbToWrite - cbRet);
2897# endif
2898 } else
2899 cbRet = -EPROTO;
2900 sf_i->force_restat = 1; /* mtime (and size) may have changed */
2901 } else
2902 cbRet = -EFAULT;
2903 VbglR0PhysHeapFree(pReq);
2904 SFLOGFLOW(("vbsf_reg_write_iter: returns %#zx (%zd)\n", cbRet, cbRet));
2905 return cbRet;
2906 }
2907 VbglR0PhysHeapFree(pReq);
2908 }
2909 }
2910
2911 /*
2912 * Otherwise do the page locking thing.
2913 */
2914 return vbsf_reg_write_iter_locking(kio, iter, cbToWrite, offFile, pSuperInfo, sf_r, inode, sf_i, mapping, fAppend);
2915}
2916
2917#endif /* >= 2.6.19 */
2918
2919/**
2920 * Used by vbsf_reg_open() and vbsf_inode_atomic_open() to convert Linux open flags into shared folders create flags.
2921 *
2922 * @returns shared folders create flags.
2923 * @param fLnxOpen The linux O_XXX flags to convert.
2924 * @param pfHandle Pointer to vbsf_handle::fFlags.
2925 * @param pszCaller Caller, for logging purposes.
2926 */
2927uint32_t vbsf_linux_oflags_to_vbox(unsigned fLnxOpen, uint32_t *pfHandle, const char *pszCaller)
2928{
2929 uint32_t fVBoxFlags = SHFL_CF_ACCESS_DENYNONE;
2930
2931 /*
2932 * Disposition.
2933 */
2934 if (fLnxOpen & O_CREAT) {
2935 Log(("%s: O_CREAT set\n", pszCaller));
2936 fVBoxFlags |= SHFL_CF_ACT_CREATE_IF_NEW;
2937 if (fLnxOpen & O_EXCL) {
2938 Log(("%s: O_EXCL set\n", pszCaller));
2939 fVBoxFlags |= SHFL_CF_ACT_FAIL_IF_EXISTS;
2940 } else if (fLnxOpen & O_TRUNC) {
2941 Log(("%s: O_TRUNC set\n", pszCaller));
2942 fVBoxFlags |= SHFL_CF_ACT_OVERWRITE_IF_EXISTS;
2943 } else
2944 fVBoxFlags |= SHFL_CF_ACT_OPEN_IF_EXISTS;
2945 } else {
2946 fVBoxFlags |= SHFL_CF_ACT_FAIL_IF_NEW;
2947 if (fLnxOpen & O_TRUNC) {
2948 Log(("%s: O_TRUNC set\n", pszCaller));
2949 fVBoxFlags |= SHFL_CF_ACT_OVERWRITE_IF_EXISTS;
2950 }
2951 }
2952
2953 /*
2954 * Access.
2955 */
2956 switch (fLnxOpen & O_ACCMODE) {
2957 case O_RDONLY:
2958 fVBoxFlags |= SHFL_CF_ACCESS_READ;
2959 *pfHandle |= VBSF_HANDLE_F_READ;
2960 break;
2961
2962 case O_WRONLY:
2963 fVBoxFlags |= SHFL_CF_ACCESS_WRITE;
2964 *pfHandle |= VBSF_HANDLE_F_WRITE;
2965 break;
2966
2967 case O_RDWR:
2968 fVBoxFlags |= SHFL_CF_ACCESS_READWRITE;
2969 *pfHandle |= VBSF_HANDLE_F_READ | VBSF_HANDLE_F_WRITE;
2970 break;
2971
2972 default:
2973 BUG();
2974 }
2975
2976 if (fLnxOpen & O_APPEND) {
2977 Log(("%s: O_APPEND set\n", pszCaller));
2978 fVBoxFlags |= SHFL_CF_ACCESS_APPEND;
2979 *pfHandle |= VBSF_HANDLE_F_APPEND;
2980 }
2981
2982 /*
2983 * Only directories?
2984 */
2985 if (fLnxOpen & O_DIRECTORY) {
2986 Log(("%s: O_DIRECTORY set\n", pszCaller));
2987 fVBoxFlags |= SHFL_CF_DIRECTORY;
2988 }
2989
2990 return fVBoxFlags;
2991}
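/*
 * Example mapping (follows directly from the code above): O_CREAT | O_TRUNC
 * | O_WRONLY yields SHFL_CF_ACCESS_DENYNONE | SHFL_CF_ACT_CREATE_IF_NEW
 * | SHFL_CF_ACT_OVERWRITE_IF_EXISTS | SHFL_CF_ACCESS_WRITE, and *pfHandle
 * gets VBSF_HANDLE_F_WRITE set.
 */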
2992
2993
2994/**
2995 * Open a regular file.
2996 *
2997 * @param inode the inode
2998 * @param file the file
2999 * @returns 0 on success, Linux error code otherwise
3000 */
3001static int vbsf_reg_open(struct inode *inode, struct file *file)
3002{
3003 int rc, rc_linux = 0;
3004 struct vbsf_super_info *pSuperInfo = VBSF_GET_SUPER_INFO(inode->i_sb);
3005 struct vbsf_inode_info *sf_i = VBSF_GET_INODE_INFO(inode);
3006 struct dentry *dentry = VBSF_GET_F_DENTRY(file);
3007 struct vbsf_reg_info *sf_r;
3008 VBOXSFCREATEREQ *pReq;
3009
3010 SFLOGFLOW(("vbsf_reg_open: inode=%p file=%p flags=%#x %s\n", inode, file, file->f_flags, sf_i ? sf_i->path->String.ach : NULL));
3011 Assert(pSuperInfo);
3012 Assert(sf_i);
3013
3014 sf_r = kmalloc(sizeof(*sf_r), GFP_KERNEL);
3015 if (!sf_r) {
3016 LogRelFunc(("could not allocate reg info\n"));
3017 return -ENOMEM;
3018 }
3019
3020 RTListInit(&sf_r->Handle.Entry);
3021 sf_r->Handle.cRefs = 1;
3022 sf_r->Handle.fFlags = VBSF_HANDLE_F_FILE | VBSF_HANDLE_F_MAGIC;
3023 sf_r->Handle.hHost = SHFL_HANDLE_NIL;
3024
3025 /* Already open? */
3026 if (sf_i->handle != SHFL_HANDLE_NIL) {
3027 /*
3028 * This inode was created with vbsf_create_worker(). Check the CreateFlags:
3029 * O_CREAT, O_TRUNC: inherent true (file was just created). Not sure
3030 * about the access flags (SHFL_CF_ACCESS_*).
3031 */
3032 sf_i->force_restat = 1;
3033 sf_r->Handle.hHost = sf_i->handle;
3034 sf_i->handle = SHFL_HANDLE_NIL;
3035 file->private_data = sf_r;
3036
3037 sf_r->Handle.fFlags |= VBSF_HANDLE_F_READ | VBSF_HANDLE_F_WRITE; /** @todo fix */
3038 vbsf_handle_append(sf_i, &sf_r->Handle);
3039 SFLOGFLOW(("vbsf_reg_open: returns 0 (#1) - sf_i=%p hHost=%#llx\n", sf_i, sf_r->Handle.hHost));
3040 return 0;
3041 }
3042
3043 pReq = (VBOXSFCREATEREQ *)VbglR0PhysHeapAlloc(sizeof(*pReq) + sf_i->path->u16Size);
3044 if (!pReq) {
3045 kfree(sf_r);
3046 LogRelFunc(("Failed to allocate a VBOXSFCREATEREQ buffer!\n"));
3047 return -ENOMEM;
3048 }
3049 memcpy(&pReq->StrPath, sf_i->path, SHFLSTRING_HEADER_SIZE + sf_i->path->u16Size);
3050 RT_ZERO(pReq->CreateParms);
3051 pReq->CreateParms.Handle = SHFL_HANDLE_NIL;
3052
3053 /* We check the value of pReq->CreateParms.Handle afterwards to
3054 * find out if the call succeeded or failed, as the API does not seem
3055 * to cleanly distinguish error and informational messages.
3056 *
3057 * Furthermore, we must set pReq->CreateParms.Handle to SHFL_HANDLE_NIL
3058 * to make the shared folders host service use our fMode parameter */
3059
3060 /* We ignore O_EXCL, as the Linux kernel seems to call create
3061 beforehand itself, so O_EXCL should always fail. */
3062 pReq->CreateParms.CreateFlags = vbsf_linux_oflags_to_vbox(file->f_flags & ~O_EXCL, &sf_r->Handle.fFlags, __FUNCTION__);
3063 pReq->CreateParms.Info.Attr.fMode = inode->i_mode;
3064 LogFunc(("vbsf_reg_open: calling VbglR0SfHostReqCreate, file %s, flags=%#x, %#x\n",
3065 sf_i->path->String.utf8, file->f_flags, pReq->CreateParms.CreateFlags));
3066 rc = VbglR0SfHostReqCreate(pSuperInfo->map.root, pReq);
3067 if (RT_FAILURE(rc)) {
3068 LogFunc(("VbglR0SfHostReqCreate failed flags=%d,%#x rc=%Rrc\n", file->f_flags, pReq->CreateParms.CreateFlags, rc));
3069 kfree(sf_r);
3070 VbglR0PhysHeapFree(pReq);
3071 return -RTErrConvertToErrno(rc);
3072 }
3073
3074 if (pReq->CreateParms.Handle != SHFL_HANDLE_NIL) {
3075 vbsf_dentry_chain_increase_ttl(dentry);
3076 vbsf_update_inode(inode, sf_i, &pReq->CreateParms.Info, pSuperInfo, false /*fInodeLocked*/, 0 /*fSetAttrs*/);
3077 rc_linux = 0;
3078 } else {
3079 switch (pReq->CreateParms.Result) {
3080 case SHFL_PATH_NOT_FOUND:
3081 vbsf_dentry_invalidate_ttl(dentry);
3082 rc_linux = -ENOENT;
3083 break;
3084 case SHFL_FILE_NOT_FOUND:
3085 vbsf_dentry_invalidate_ttl(dentry);
3086 /** @todo sf_dentry_increase_parent_ttl(file->f_dentry); if we can trust it. */
3087 rc_linux = -ENOENT;
3088 break;
3089 case SHFL_FILE_EXISTS:
3090 vbsf_dentry_chain_increase_ttl(dentry);
3091 vbsf_update_inode(inode, sf_i, &pReq->CreateParms.Info, pSuperInfo, false /*fInodeLocked*/, 0 /*fSetAttrs*/);
3092 rc_linux = -EEXIST;
3093 break;
3094 default:
3095 vbsf_dentry_chain_increase_parent_ttl(dentry);
3096 rc_linux = 0;
3097 break;
3098 }
3099 }
3100
3101 sf_r->Handle.hHost = pReq->CreateParms.Handle;
3102 file->private_data = sf_r;
3103 vbsf_handle_append(sf_i, &sf_r->Handle);
3104 VbglR0PhysHeapFree(pReq);
3105 SFLOGFLOW(("vbsf_reg_open: returns 0 (#2) - sf_i=%p hHost=%#llx\n", sf_i, sf_r->Handle.hHost));
3106 return rc_linux;
3107}
3108
3109
3110/**
3111 * Close a regular file.
3112 *
3113 * @param inode the inode
3114 * @param file the file
3115 * @returns 0 on success, Linux error code otherwise
3116 */
3117static int vbsf_reg_release(struct inode *inode, struct file *file)
3118{
3119 struct vbsf_inode_info *sf_i = VBSF_GET_INODE_INFO(inode);
3120 struct vbsf_reg_info *sf_r = file->private_data;
3121
3122 SFLOGFLOW(("vbsf_reg_release: inode=%p file=%p\n", inode, file));
3123 if (sf_r) {
3124 struct vbsf_super_info *pSuperInfo = VBSF_GET_SUPER_INFO(inode->i_sb);
3125 struct address_space *mapping = inode->i_mapping;
3126 Assert(pSuperInfo);
3127
3128 /* If we're closing the last handle for this inode, make sure to flush
3129 the mapping or we'll end up in vbsf_writepage without a handle. */
3130 if ( mapping
3131 && mapping->nrpages > 0
3132 /** @todo && last writable handle */ ) {
3133#if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 4, 25)
3134 if (filemap_fdatawrite(mapping) != -EIO)
3135#else
3136 if ( filemap_fdatasync(mapping) == 0
3137 && fsync_inode_data_buffers(inode) == 0)
3138#endif
3139 filemap_fdatawait(inode->i_mapping);
3140 }
3141
3142 /* Release sf_r, closing the handle if we're the last user. */
3143 file->private_data = NULL;
3144 vbsf_handle_release(&sf_r->Handle, pSuperInfo, "vbsf_reg_release");
3145
3146 sf_i->handle = SHFL_HANDLE_NIL;
3147 }
3148 return 0;
3149}
3150
3151
3152/**
3153 * Wrapper around generic/default seek function that ensures that we've got
3154 * the up-to-date file size when doing anything relative to EOF.
3155 *
3156 * The issue is that the host may extend the file while we weren't looking and
3157 * if the caller wishes to append data, it may end up overwriting existing data
3158 * if we operate with a stale size. So, we always retrieve the file size on EOF
3159 * relative seeks.
3160 */
3161static loff_t vbsf_reg_llseek(struct file *file, loff_t off, int whence)
3162{
3163 SFLOGFLOW(("vbsf_reg_llseek: file=%p off=%lld whence=%d\n", file, off, whence));
3164
3165 switch (whence) {
3166#ifdef SEEK_HOLE
3167 case SEEK_HOLE:
3168 case SEEK_DATA:
3169#endif
3170 case SEEK_END: {
3171 struct vbsf_reg_info *sf_r = file->private_data;
3172 int rc = vbsf_inode_revalidate_with_handle(VBSF_GET_F_DENTRY(file), sf_r->Handle.hHost,
3173 true /*fForce*/, false /*fInodeLocked*/);
3174 if (rc == 0)
3175 break;
3176 return rc;
3177 }
3178 }
3179
3180#if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 4, 8)
3181 return generic_file_llseek(file, off, whence);
3182#else
3183 return default_llseek(file, off, whence);
3184#endif
3185}
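/*
 * Example of the race this guards against (hypothetical sequence): the host
 * appends 4 KiB to the file, then the guest does lseek(fd, 0, SEEK_END)
 * followed by a write().  Without the revalidation above, the seek would use
 * the stale cached size and the write would overwrite the data the host just
 * appended.
 */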
3186
3187
3188/**
3189 * Flush region of file - chiefly mmap/msync.
3190 *
3191 * We cannot use the noop_fsync / simple_sync_file here as that means
3192 * msync(,,MS_SYNC) will return before the data hits the host, thereby
3193 * causing coherency issues with O_DIRECT access to the same file as
3194 * well as any host interaction with the file.
3195 */
3196#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 1, 0)
3197static int vbsf_reg_fsync(struct file *file, loff_t start, loff_t end, int datasync)
3198{
3199# if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 16, 0)
3200 return __generic_file_fsync(file, start, end, datasync);
3201# else
3202 return generic_file_fsync(file, start, end, datasync);
3203# endif
3204}
3205#elif LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 35)
3206static int vbsf_reg_fsync(struct file *file, int datasync)
3207{
3208 return generic_file_fsync(file, datasync);
3209}
3210#else /* < 2.6.35 */
3211static int vbsf_reg_fsync(struct file *file, struct dentry *dentry, int datasync)
3212{
3213# if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 31)
3214 return simple_fsync(file, dentry, datasync);
3215# else
3216 int rc;
3217 struct inode *inode = dentry->d_inode;
3218 AssertReturn(inode, -EINVAL);
3219
3220 /** @todo What about file_fsync()? (<= 2.5.11) */
3221
3222# if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 5, 12)
3223 rc = sync_mapping_buffers(inode->i_mapping);
3224 if ( rc == 0
3225 && (inode->i_state & I_DIRTY)
3226 && ((inode->i_state & I_DIRTY_DATASYNC) || !datasync)
3227 ) {
3228 struct writeback_control wbc = {
3229 .sync_mode = WB_SYNC_ALL,
3230 .nr_to_write = 0
3231 };
3232 rc = sync_inode(inode, &wbc);
3233 }
3234# else /* < 2.5.12 */
3235 /** @todo
3236 * Something is buggy here or in the 2.4.21-27.EL kernel I'm testing on.
3237 *
3238 * In theory we shouldn't need to do anything here, since msync will call
3239 * writepage() on each dirty page and we write them out synchronously. So, the
3240 * problem is elsewhere... Doesn't happen all the time either. Sigh.
3241 */
3242 rc = fsync_inode_buffers(inode);
3243# if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 4, 10)
3244 if (rc == 0 && datasync)
3245 rc = fsync_inode_data_buffers(inode);
3246# endif
3247
3248# endif /* < 2.5.12 */
3249 return rc;
3250# endif
3251}
3252#endif /* < 2.6.35 */
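/*
 * The synchronous behaviour above matters for sequences like the following
 * (illustrative user space sketch, error handling omitted):
 *
 *     void *p = mmap(NULL, cb, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
 *     memcpy(p, pvData, cb);
 *     msync(p, cb, MS_SYNC);  // must not return before the host sees the data
 *
 * which is why a real fsync implementation is wired up instead of noop_fsync.
 */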
3253
3254
3255#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 5, 0)
3256/**
3257 * Copy a datablock from one file to another on the host side.
3258 */
3259static ssize_t vbsf_reg_copy_file_range(struct file *pFileSrc, loff_t offSrc, struct file *pFileDst, loff_t offDst,
3260 size_t cbRange, unsigned int fFlags)
3261{
3262 ssize_t cbRet;
3263 if (g_uSfLastFunction >= SHFL_FN_COPY_FILE_PART) {
3264 struct inode *pInodeSrc = pFileSrc->f_inode;
3265 struct vbsf_inode_info *pInodeInfoSrc = VBSF_GET_INODE_INFO(pInodeSrc);
3266 struct vbsf_super_info *pSuperInfoSrc = VBSF_GET_SUPER_INFO(pInodeSrc->i_sb);
3267 struct vbsf_reg_info *pFileInfoSrc = (struct vbsf_reg_info *)pFileSrc->private_data;
3268 struct inode *pInodeDst = pFileDst->f_inode;
3269 struct vbsf_inode_info *pInodeInfoDst = VBSF_GET_INODE_INFO(pInodeDst);
3270 struct vbsf_super_info *pSuperInfoDst = VBSF_GET_SUPER_INFO(pInodeDst->i_sb);
3271 struct vbsf_reg_info *pFileInfoDst = (struct vbsf_reg_info *)pFileDst->private_data;
3272 VBOXSFCOPYFILEPARTREQ *pReq;
3273
3274 /*
3275 * Some extra validation.
3276 */
3277 AssertPtrReturn(pInodeInfoSrc, -EOPNOTSUPP);
3278 Assert(pInodeInfoSrc->u32Magic == SF_INODE_INFO_MAGIC);
3279 AssertPtrReturn(pInodeInfoDst, -EOPNOTSUPP);
3280 Assert(pInodeInfoDst->u32Magic == SF_INODE_INFO_MAGIC);
3281
3282# if LINUX_VERSION_CODE < KERNEL_VERSION(4, 11, 0)
3283 if (!S_ISREG(pInodeSrc->i_mode) || !S_ISREG(pInodeDst->i_mode))
3284 return S_ISDIR(pInodeSrc->i_mode) || S_ISDIR(pInodeDst->i_mode) ? -EISDIR : -EINVAL;
3285# endif
3286
3287 /*
3288 * Allocate the request and issue it.
3289 */
3290 pReq = (VBOXSFCOPYFILEPARTREQ *)VbglR0PhysHeapAlloc(sizeof(*pReq));
3291 if (pReq) {
3292 int vrc = VbglR0SfHostReqCopyFilePart(pSuperInfoSrc->map.root, pFileInfoSrc->Handle.hHost, offSrc,
3293 pSuperInfoDst->map.root, pFileInfoDst->Handle.hHost, offDst,
3294 cbRange, 0 /*fFlags*/, pReq);
3295 if (RT_SUCCESS(vrc))
3296 cbRet = pReq->Parms.cb64ToCopy.u.value64;
3297 else if (vrc == VERR_NOT_IMPLEMENTED)
3298 cbRet = -EOPNOTSUPP;
3299 else
3300 cbRet = -RTErrConvertToErrno(vrc);
3301
3302 VbglR0PhysHeapFree(pReq);
3303 } else
3304 cbRet = -ENOMEM;
3305 } else {
3306 cbRet = -EOPNOTSUPP;
3307 }
3308 SFLOGFLOW(("vbsf_reg_copy_file_range: returns %zd\n", cbRet));
3309 return cbRet;
3310}
3311#endif /* >= 4.5 */
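
/*
 * For reference, a user-space sketch (not part of the module) of driving the
 * host-side copy above through copy_file_range(2).  It assumes a 4.5+ kernel,
 * glibc 2.27+ for the wrapper, and a host offering SHFL_FN_COPY_FILE_PART;
 * both paths are hypothetical vboxsf mounts.
 */
#if 0 /* illustration only, never compiled */
# define _GNU_SOURCE
# include <fcntl.h>
# include <stdio.h>
# include <unistd.h>

int main(void)
{
    int fdSrc = open("/media/sf_share/big.iso", O_RDONLY);
    int fdDst = open("/media/sf_backup/big.iso", O_WRONLY | O_CREAT, 0644);
    ssize_t cbCopied;
    if (fdSrc < 0 || fdDst < 0)
        return 1;

    /* The kernel forwards this to vbsf_reg_copy_file_range, which asks the
       host to copy the bytes without bouncing them through the guest. */
    cbCopied = copy_file_range(fdSrc, NULL, fdDst, NULL, 1 << 20, 0);
    printf("copied %zd bytes\n", cbCopied);

    close(fdDst);
    close(fdSrc);
    return 0;
}
#endif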
3312
3313
3314#ifdef SFLOG_ENABLED
3315/*
3316 * This is just for logging page faults and such.
3317 */
3318
3319/** Pointer to the ops generic_file_mmap returns the first time it's called. */
3320static struct vm_operations_struct const *g_pGenericFileVmOps = NULL;
3321/** Merge of g_LoggingVmOpsTemplate and g_pGenericFileVmOps. */
3322static struct vm_operations_struct g_LoggingVmOps;
3323
3324
3325/* Generic page fault callback: */
3326# if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 11, 0)
3327static vm_fault_t vbsf_vmlog_fault(struct vm_fault *vmf)
3328{
3329 vm_fault_t rc;
3330 SFLOGFLOW(("vbsf_vmlog_fault: vmf=%p flags=%#x addr=%p\n", vmf, vmf->flags, vmf->address));
3331 rc = g_pGenericFileVmOps->fault(vmf);
3332 SFLOGFLOW(("vbsf_vmlog_fault: returns %d\n", rc));
3333 return rc;
3334}
3335# elif LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 23)
3336static int vbsf_vmlog_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
3337{
3338 int rc;
3339# if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 10, 0)
3340 SFLOGFLOW(("vbsf_vmlog_fault: vma=%p vmf=%p flags=%#x addr=%p\n", vma, vmf, vmf->flags, vmf->address));
3341# else
3342 SFLOGFLOW(("vbsf_vmlog_fault: vma=%p vmf=%p flags=%#x addr=%p\n", vma, vmf, vmf->flags, vmf->virtual_address));
3343# endif
3344 rc = g_pGenericFileVmOps->fault(vma, vmf);
3345 SFLOGFLOW(("vbsf_vmlog_fault: returns %d\n", rc));
3346 return rc;
3347}
3348# endif
3349
3350
3351/* Special/generic page fault handler: */
3352# if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 26)
3353# elif LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 1)
3354static struct page *vbsf_vmlog_nopage(struct vm_area_struct *vma, unsigned long address, int *type)
3355{
3356 struct page *page;
3357 SFLOGFLOW(("vbsf_vmlog_nopage: vma=%p address=%p type=%p:{%#x}\n", vma, address, type, type ? *type : 0));
3358 page = g_pGenericFileVmOps->nopage(vma, address, type);
3359 SFLOGFLOW(("vbsf_vmlog_nopage: returns %p\n", page));
3360 return page;
3361}
3362# else
3363static struct page *vbsf_vmlog_nopage(struct vm_area_struct *vma, unsigned long address, int write_access_or_unused)
3364{
3365 struct page *page;
3366 SFLOGFLOW(("vbsf_vmlog_nopage: vma=%p address=%p wau=%d\n", vma, address, write_access_or_unused));
3367 page = g_pGenericFileVmOps->nopage(vma, address, write_access_or_unused);
3368 SFLOGFLOW(("vbsf_vmlog_nopage: returns %p\n", page));
3369 return page;
3370}
3371# endif /* < 2.6.26 */
3372
3373
3374/* Special page fault callback for making something writable: */
3375# if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 11, 0)
3376static vm_fault_t vbsf_vmlog_page_mkwrite(struct vm_fault *vmf)
3377{
3378 vm_fault_t rc;
3379 SFLOGFLOW(("vbsf_vmlog_page_mkwrite: vmf=%p flags=%#x addr=%p\n", vmf, vmf->flags, vmf->address));
3380 rc = g_pGenericFileVmOps->page_mkwrite(vmf);
3381 SFLOGFLOW(("vbsf_vmlog_page_mkwrite: returns %d\n", rc));
3382 return rc;
3383}
3384# elif LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 30)
3385static int vbsf_vmlog_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
3386{
3387 int rc;
3388# if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 10, 0)
3389 SFLOGFLOW(("vbsf_vmlog_page_mkwrite: vma=%p vmf=%p flags=%#x addr=%p\n", vma, vmf, vmf->flags, vmf->address));
3390# else
3391 SFLOGFLOW(("vbsf_vmlog_page_mkwrite: vma=%p vmf=%p flags=%#x addr=%p\n", vma, vmf, vmf->flags, vmf->virtual_address));
3392# endif
3393 rc = g_pGenericFileVmOps->page_mkwrite(vma, vmf);
3394 SFLOGFLOW(("vbsf_vmlog_page_mkwrite: returns %d\n", rc));
3395 return rc;
3396}
3397# elif LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 18)
3398static int vbsf_vmlog_page_mkwrite(struct vm_area_struct *vma, struct page *page)
3399{
3400 int rc;
3401 SFLOGFLOW(("vbsf_vmlog_page_mkwrite: vma=%p page=%p\n", vma, page));
3402 rc = g_pGenericFileVmOps->page_mkwrite(vma, page);
3403 SFLOGFLOW(("vbsf_vmlog_page_mkwrite: returns %d\n", rc));
3404 return rc;
3405}
3406# endif
3407
3408
3409/* Special page fault callback for mapping pages: */
3410# if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 10, 0)
3411static void vbsf_vmlog_map_pages(struct vm_fault *vmf, pgoff_t start, pgoff_t end)
3412{
3413 SFLOGFLOW(("vbsf_vmlog_map_pages: vmf=%p (flags=%#x addr=%p) start=%p end=%p\n", vmf, vmf->flags, vmf->address, start, end));
3414 g_pGenericFileVmOps->map_pages(vmf, start, end);
3415 SFLOGFLOW(("vbsf_vmlog_map_pages: returns\n"));
3416}
3417# elif LINUX_VERSION_CODE >= KERNEL_VERSION(4, 8, 0)
3418static void vbsf_vmlog_map_pages(struct fault_env *fenv, pgoff_t start, pgoff_t end)
3419{
3420 SFLOGFLOW(("vbsf_vmlog_map_pages: fenv=%p (flags=%#x addr=%p) start=%p end=%p\n", fenv, fenv->flags, fenv->address, start, end));
3421 g_pGenericFileVmOps->map_pages(fenv, start, end);
3422 SFLOGFLOW(("vbsf_vmlog_map_pages: returns\n"));
3423}
3424# elif LINUX_VERSION_CODE >= KERNEL_VERSION(3, 15, 0)
3425static void vbsf_vmlog_map_pages(struct vm_area_struct *vma, struct vm_fault *vmf)
3426{
3427 SFLOGFLOW(("vbsf_vmlog_map_pages: vma=%p vmf=%p (flags=%#x addr=%p)\n", vma, vmf, vmf->flags, vmf->virtual_address));
3428 g_pGenericFileVmOps->map_pages(vma, vmf);
3429 SFLOGFLOW(("vbsf_vmlog_map_pages: returns\n"));
3430}
3431# endif
3432
3433
3434/** Overload template. */
3435static struct vm_operations_struct const g_LoggingVmOpsTemplate = {
3436# if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 23)
3437 .fault = vbsf_vmlog_fault,
3438# endif
3439# if LINUX_VERSION_CODE <= KERNEL_VERSION(2, 6, 25)
3440 .nopage = vbsf_vmlog_nopage,
3441# endif
3442# if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 18)
3443 .page_mkwrite = vbsf_vmlog_page_mkwrite,
3444# endif
3445# if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 15, 0)
3446 .map_pages = vbsf_vmlog_map_pages,
3447# endif
3448};
3449
3450/** file_operations::mmap wrapper for logging purposes. */
3451extern int vbsf_reg_mmap(struct file *file, struct vm_area_struct *vma)
3452{
3453 int rc;
3454 SFLOGFLOW(("vbsf_reg_mmap: file=%p vma=%p\n", file, vma));
3455 rc = generic_file_mmap(file, vma);
3456 if (rc == 0) {
3457 /* Merge the ops and template the first time thru (there's a race here). */
3458 if (g_pGenericFileVmOps == NULL) {
3459 uintptr_t const *puSrc1 = (uintptr_t *)vma->vm_ops;
3460 uintptr_t const *puSrc2 = (uintptr_t *)&g_LoggingVmOpsTemplate;
3461 uintptr_t volatile *puDst = (uintptr_t *)&g_LoggingVmOps;
3462 size_t cbLeft = sizeof(g_LoggingVmOps) / sizeof(*puDst);
3463 while (cbLeft-- > 0) {
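                /* Use the logging wrapper only where both the template and the real
                   vm_ops table provide an entry; otherwise keep the real (possibly
                   NULL) pointer so we never advertise an operation the original
                   ops structure lacks. */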
3464 *puDst = *puSrc2 && *puSrc1 ? *puSrc2 : *puSrc1;
3465 puSrc1++;
3466 puSrc2++;
3467 puDst++;
3468 }
3469 g_pGenericFileVmOps = vma->vm_ops;
3470 vma->vm_ops = &g_LoggingVmOps;
3471 } else if (g_pGenericFileVmOps == vma->vm_ops)
3472 vma->vm_ops = &g_LoggingVmOps;
3473 else
3474 SFLOGFLOW(("vbsf_reg_mmap: Warning: vm_ops=%p, expected %p!\n", vma->vm_ops, g_pGenericFileVmOps));
3475 }
3476 SFLOGFLOW(("vbsf_reg_mmap: returns %d\n", rc));
3477 return rc;
3478}
3479
3480#endif /* SFLOG_ENABLED */
3481
3482
3483/**
3484 * File operations for regular files.
3485 *
3486 * Note on splice_read/splice_write/sendfile:
3487 * - Splice was introduced in 2.6.17. The generic_file_splice_read/write
3488 * methods go thru the page cache, which is undesirable and is why we
3489 * need to cook our own versions of the code as long as we cannot track
3490 * host-side writes and correctly invalidate the guest page-cache.
3491 * - Sendfile was reimplemented using splice in 2.6.23.
3492 * - The default_file_splice_read/write no-page-cache fallback functions
3493 * were introduced in 2.6.31. The write one works in page units.
3494 * - Since linux 3.16 there is iter_file_splice_write that uses iter_write.
3495 * - Since linux 4.9 the generic_file_splice_read function started using
3496 * read_iter.
3497 */
3498struct file_operations vbsf_reg_fops = {
3499 .open = vbsf_reg_open,
3500 .read = vbsf_reg_read,
3501 .write = vbsf_reg_write,
3502#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 16, 0)
3503 .read_iter = vbsf_reg_read_iter,
3504 .write_iter = vbsf_reg_write_iter,
3505#elif LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 19)
3506 .aio_read = vbsf_reg_aio_read,
3507 .aio_write = vbsf_reg_aio_write,
3508#endif
3509 .release = vbsf_reg_release,
3510#ifdef SFLOG_ENABLED
3511 .mmap = vbsf_reg_mmap,
3512#else
3513 .mmap = generic_file_mmap,
3514#endif
3515#if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 17) && LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 31)
3516 .splice_read = vbsf_splice_read,
3517#endif
3518#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 16, 0)
3519 .splice_write = iter_file_splice_write,
3520#elif LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 17)
3521 .splice_write = vbsf_splice_write,
3522#endif
3523#if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 5, 30) && LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 23)
3524 .sendfile = vbsf_reg_sendfile,
3525#endif
3526 .llseek = vbsf_reg_llseek,
3527 .fsync = vbsf_reg_fsync,
3528#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 5, 0)
3529 .copy_file_range = vbsf_reg_copy_file_range,
3530#endif
3531};
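
/*
 * For reference, a user-space sketch (not part of the module) of the
 * sendfile/splice usage discussed in the note above vbsf_reg_fops; which
 * read/splice path ends up servicing it depends on the kernel version, as
 * listed there.  Both paths are hypothetical.
 */
#if 0 /* illustration only, never compiled */
# include <fcntl.h>
# include <sys/sendfile.h>
# include <sys/stat.h>
# include <unistd.h>

int main(void)
{
    int fdIn  = open("/media/sf_share/payload.bin", O_RDONLY);
    int fdOut = open("/tmp/payload.bin", O_WRONLY | O_CREAT | O_TRUNC, 0644);
    struct stat st;
    off_t off = 0;

    if (fdIn < 0 || fdOut < 0 || fstat(fdIn, &st) != 0)
        return 1;

    /* sendfile() avoids the user-space copy; the vboxsf side still reads the
       data through the operations table above. */
    while (off < st.st_size)
        if (sendfile(fdOut, fdIn, &off, st.st_size - off) <= 0)
            break;

    close(fdOut);
    close(fdIn);
    return 0;
}
#endif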
3532
3533
3534/**
3535 * Inodes operations for regular files.
3536 */
3537struct inode_operations vbsf_reg_iops = {
3538#if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 5, 18)
3539 .getattr = vbsf_inode_getattr,
3540#else
3541 .revalidate = vbsf_inode_revalidate,
3542#endif
3543 .setattr = vbsf_inode_setattr,
3544};
3545
3546
3547
3548/*********************************************************************************************************************************
3549* Address Space Operations on Regular Files (for mmap, sendfile, direct I/O) *
3550*********************************************************************************************************************************/
3551
3552/**
3553 * Used to read the content of a page into the page cache.
3554 *
3555 * Needed for mmap and reads+writes when the file is mmapped in a
3556 * shared+writeable fashion.
3557 */
3558static int vbsf_readpage(struct file *file, struct page *page)
3559{
3560 struct inode *inode = VBSF_GET_F_DENTRY(file)->d_inode;
3561 int err;
3562
3563 SFLOGFLOW(("vbsf_readpage: inode=%p file=%p page=%p off=%#llx\n", inode, file, page, (uint64_t)page->index << PAGE_SHIFT));
3564 Assert(PageLocked(page));
3565
3566 if (PageUptodate(page)) {
3567 unlock_page(page);
3568 return 0;
3569 }
3570
3571 if (!is_bad_inode(inode)) {
3572 VBOXSFREADPGLSTREQ *pReq = (VBOXSFREADPGLSTREQ *)VbglR0PhysHeapAlloc(sizeof(*pReq));
3573 if (pReq) {
3574 struct vbsf_super_info *pSuperInfo = VBSF_GET_SUPER_INFO(inode->i_sb);
3575 struct vbsf_reg_info *sf_r = file->private_data;
3576 uint32_t cbRead;
3577 int vrc;
3578
3579 pReq->PgLst.offFirstPage = 0;
3580 pReq->PgLst.aPages[0] = page_to_phys(page);
3581 vrc = VbglR0SfHostReqReadPgLst(pSuperInfo->map.root,
3582 pReq,
3583 sf_r->Handle.hHost,
3584 (uint64_t)page->index << PAGE_SHIFT,
3585 PAGE_SIZE,
3586 1 /*cPages*/);
3587
3588 cbRead = pReq->Parms.cb32Read.u.value32;
3589 AssertStmt(cbRead <= PAGE_SIZE, cbRead = PAGE_SIZE);
3590 VbglR0PhysHeapFree(pReq);
3591
3592 if (RT_SUCCESS(vrc)) {
3593 if (cbRead == PAGE_SIZE) {
3594 /* likely */
3595 } else {
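                    /* Short read: the host hit end-of-file inside this page, so zero
                       the tail to avoid exposing stale data beyond the file size. */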
3596 uint8_t *pbMapped = (uint8_t *)kmap(page);
3597 RT_BZERO(&pbMapped[cbRead], PAGE_SIZE - cbRead);
3598 kunmap(page);
3599 /** @todo truncate the inode file size? */
3600 }
3601
3602 flush_dcache_page(page);
3603 SetPageUptodate(page);
3604 unlock_page(page);
3605 return 0;
3606 }
3607 err = -RTErrConvertToErrno(vrc);
3608 } else
3609 err = -ENOMEM;
3610 } else
3611 err = -EIO;
3612 SetPageError(page);
3613 unlock_page(page);
3614 return err;
3615}
3616
3617
3618/**
3619 * Used to write out the content of a dirty page cache page to the host file.
3620 *
3621 * Needed for mmap and writes when the file is mmapped in a shared+writeable
3622 * fashion.
3623 */
3624#if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 5, 52)
3625static int vbsf_writepage(struct page *page, struct writeback_control *wbc)
3626#else
3627static int vbsf_writepage(struct page *page)
3628#endif
3629{
3630 struct address_space *mapping = page->mapping;
3631 struct inode *inode = mapping->host;
3632 struct vbsf_inode_info *sf_i = VBSF_GET_INODE_INFO(inode);
3633 struct vbsf_handle *pHandle = vbsf_handle_find(sf_i, VBSF_HANDLE_F_WRITE, VBSF_HANDLE_F_APPEND);
3634 int err;
3635
3636 SFLOGFLOW(("vbsf_writepage: inode=%p page=%p off=%#llx pHandle=%p (%#llx)\n",
3637 inode, page, (uint64_t)page->index << PAGE_SHIFT, pHandle, pHandle ? pHandle->hHost : 0));
3638
3639 if (pHandle) {
3640 struct vbsf_super_info *pSuperInfo = VBSF_GET_SUPER_INFO(inode->i_sb);
3641 VBOXSFWRITEPGLSTREQ *pReq = (VBOXSFWRITEPGLSTREQ *)VbglR0PhysHeapAlloc(sizeof(*pReq));
3642 if (pReq) {
3643 uint64_t const cbFile = i_size_read(inode);
3644 uint64_t const offInFile = (uint64_t)page->index << PAGE_SHIFT;
3645 uint32_t const cbToWrite = page->index != (cbFile >> PAGE_SHIFT) ? PAGE_SIZE
3646 : (uint32_t)cbFile & (uint32_t)PAGE_OFFSET_MASK;
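            /* Example: with PAGE_SIZE = 4096 and cbFile = 0x2340, pages 0 and 1 are
               written in full, while page 2 (the one containing EOF) only writes
               0x340 bytes so nothing past the host file size gets pushed out. */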
3647 int vrc;
3648
3649 pReq->PgLst.offFirstPage = 0;
3650 pReq->PgLst.aPages[0] = page_to_phys(page);
3651 vrc = VbglR0SfHostReqWritePgLst(pSuperInfo->map.root,
3652 pReq,
3653 pHandle->hHost,
3654 offInFile,
3655 cbToWrite,
3656 1 /*cPages*/);
3657 sf_i->ModificationTimeAtOurLastWrite = sf_i->ModificationTime;
3658 AssertMsgStmt(pReq->Parms.cb32Write.u.value32 == cbToWrite || RT_FAILURE(vrc), /* lazy bird */
3659 ("%#x vs %#x\n", pReq->Parms.cb32Write, cbToWrite),
3660 vrc = VERR_WRITE_ERROR);
3661 VbglR0PhysHeapFree(pReq);
3662
3663 if (RT_SUCCESS(vrc)) {
3664 /* Update the inode if we've extended the file. */
3665 /** @todo is this necessary given the cbToWrite calc above? */
3666 uint64_t const offEndOfWrite = offInFile + cbToWrite;
3667 if ( offEndOfWrite > cbFile
3668 && offEndOfWrite > i_size_read(inode))
3669 i_size_write(inode, offEndOfWrite);
3670
3671 /* Update and unlock the page. */
3672 if (PageError(page))
3673 ClearPageError(page);
3674 SetPageUptodate(page);
3675 unlock_page(page);
3676
3677 vbsf_handle_release(pHandle, pSuperInfo, "vbsf_writepage");
3678 return 0;
3679 }
3680
3681 /*
3682 * We failed.
3683 */
3684 err = -EIO;
3685 } else
3686 err = -ENOMEM;
3687 vbsf_handle_release(pHandle, pSuperInfo, "vbsf_writepage");
3688 } else {
3689 /** @todo we could re-open the file here and deal with this... */
3690 static uint64_t volatile s_cCalls = 0;
3691 if (s_cCalls++ < 16)
3692 printk("vbsf_writepage: no writable handle for %s..\n", sf_i->path->String.ach);
3693 err = -EIO;
3694 }
3695 SetPageError(page);
3696 unlock_page(page);
3697 return err;
3698}
3699
3700
3701#if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 24)
3702/**
3703 * Called when writing thru the page cache (which we shouldn't be doing).
3704 */
3705int vbsf_write_begin(struct file *file, struct address_space *mapping, loff_t pos,
3706 unsigned len, unsigned flags, struct page **pagep, void **fsdata)
3707{
3708 /** @todo r=bird: We shouldn't ever get here, should we? Because we don't use
3709 * the page cache for any writes AFAIK. We could just as well use
3710 * simple_write_begin & simple_write_end here if we think we really
3711 * need to have non-NULL function pointers in the table... */
3712 static uint64_t volatile s_cCalls = 0;
3713 if (s_cCalls++ < 16) {
3714 printk("vboxsf: Unexpected call to vbsf_write_begin(pos=%#llx len=%#x flags=%#x)! Please report.\n",
3715 (unsigned long long)pos, len, flags);
3716 RTLogBackdoorPrintf("vboxsf: Unexpected call to vbsf_write_begin(pos=%#llx len=%#x flags=%#x)! Please report.\n",
3717 (unsigned long long)pos, len, flags);
3718# ifdef WARN_ON
3719 WARN_ON(1);
3720# endif
3721 }
3722 return simple_write_begin(file, mapping, pos, len, flags, pagep, fsdata);
3723}
3724#endif /* KERNEL_VERSION >= 2.6.24 */
3725
3726
3727#if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 4, 10)
3728
3729# ifdef VBOX_UEK
3730# undef iov_iter /* HACK ALERT! Don't put anything needing vbsf_iov_iter after this fun! */
3731# endif
3732
3733/**
3734 * This is needed to make open accept O_DIRECT, as well as to deal with direct
3735 * I/O requests if we don't intercept them earlier.
3736 */
3737# if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 7, 0)
3738static ssize_t vbsf_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
3739# elif LINUX_VERSION_CODE >= KERNEL_VERSION(4, 1, 0)
3740static ssize_t vbsf_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t offset)
3741# elif LINUX_VERSION_CODE >= KERNEL_VERSION(3, 16, 0) || defined(VBOX_UEK)
3742static ssize_t vbsf_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter, loff_t offset)
3743# elif LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 6)
3744static ssize_t vbsf_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, loff_t offset, unsigned long nr_segs)
3745# elif LINUX_VERSION_CODE >= KERNEL_VERSION(2, 5, 55)
3746static int vbsf_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, loff_t offset, unsigned long nr_segs)
3747# elif LINUX_VERSION_CODE >= KERNEL_VERSION(2, 5, 41)
3748static int vbsf_direct_IO(int rw, struct file *file, const struct iovec *iov, loff_t offset, unsigned long nr_segs)
3749# elif LINUX_VERSION_CODE >= KERNEL_VERSION(2, 5, 35)
3750static int vbsf_direct_IO(int rw, struct inode *inode, const struct iovec *iov, loff_t offset, unsigned long nr_segs)
3751# elif LINUX_VERSION_CODE >= KERNEL_VERSION(2, 5, 26)
3752static int vbsf_direct_IO(int rw, struct inode *inode, char *buf, loff_t offset, size_t count)
3753# elif LINUX_VERSION_CODE == KERNEL_VERSION(2, 4, 21) && defined(I_NEW) /* RHEL3 Frankenkernel. */
3754static int vbsf_direct_IO(int rw, struct file *file, struct kiobuf *buf, unsigned long whatever1, int whatever2)
3755# else
3756static int vbsf_direct_IO(int rw, struct inode *inode, struct kiobuf *buf, unsigned long whatever1, int whatever2)
3757# endif
3758{
3759 TRACE();
3760 return -EINVAL;
3761}
3762
3763#endif
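
/*
 * For reference, a user-space sketch (not part of the module) of what the
 * stub above enables: open(2) with O_DIRECT only succeeds when the inode's
 * address space has a non-NULL ->direct_IO, even though vboxsf services the
 * actual transfers elsewhere.  The path is hypothetical.
 */
#if 0 /* illustration only, never compiled */
# define _GNU_SOURCE
# include <fcntl.h>
# include <stdio.h>
# include <unistd.h>

int main(void)
{
    /* Without the vbsf_direct_IO stub this open would fail with EINVAL. */
    int fd = open("/media/sf_share/data.bin", O_RDONLY | O_DIRECT);
    if (fd < 0) {
        perror("open O_DIRECT");
        return 1;
    }
    printf("O_DIRECT open accepted by vboxsf\n");
    close(fd);
    return 0;
}
#endif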
3764
3765/**
3766 * Address space (for the page cache) operations for regular files.
3767 *
3768 * @todo the FsPerf touch/flush (mmap) test fails on 4.4.0 (ubuntu 16.04 lts).
3769 */
3770struct address_space_operations vbsf_reg_aops = {
3771 .readpage = vbsf_readpage,
3772 .writepage = vbsf_writepage,
3773 /** @todo Need .writepages if we want msync performance... */
3774#if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 5, 12)
3775 .set_page_dirty = __set_page_dirty_buffers,
3776#endif
3777#if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 24)
3778 .write_begin = vbsf_write_begin,
3779 .write_end = simple_write_end,
3780#elif LINUX_VERSION_CODE >= KERNEL_VERSION(2, 5, 45)
3781 .prepare_write = simple_prepare_write,
3782 .commit_write = simple_commit_write,
3783#endif
3784#if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 4, 10)
3785 .direct_IO = vbsf_direct_IO,
3786#endif
3787};
3788