VirtualBox

source: vbox/trunk/src/VBox/Devices/Network/slirp/socket.c@ 103068

Last change on this file since 103068 was 98103, checked in by vboxsync, 21 months ago

Copyright year updates by scm.

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 43.6 KB
RevLine 
[28449]1/* $Id: socket.c 98103 2023-01-17 14:15:46Z vboxsync $ */
2/** @file
3 * NAT - socket handling.
4 */
5
[1]6/*
[98103]7 * Copyright (C) 2006-2023 Oracle and/or its affiliates.
[28449]8 *
[96407]9 * This file is part of VirtualBox base platform packages, as
10 * available from https://www.virtualbox.org.
11 *
12 * This program is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU General Public License
14 * as published by the Free Software Foundation, in version 3 of the
15 * License.
16 *
17 * This program is distributed in the hope that it will be useful, but
18 * WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20 * General Public License for more details.
21 *
22 * You should have received a copy of the GNU General Public License
23 * along with this program; if not, see <https://www.gnu.org/licenses>.
24 *
25 * SPDX-License-Identifier: GPL-3.0-only
[28449]26 */
27
28/*
29 * This code is based on:
30 *
[1]31 * Copyright (c) 1995 Danny Gasparovski.
[926]32 *
33 * Please read the file COPYRIGHT for the
[1]34 * terms and conditions of the copyright.
35 */
36
37#include <slirp.h>
38#include "ip_icmp.h"
39#include "main.h"
40#ifdef __sun__
41#include <sys/filio.h>
42#endif
[35346]43#include <VBox/vmm/pdmdrv.h>
[15636]44#if defined (RT_OS_WINDOWS)
[62692]45#include <iprt/win/iphlpapi.h>
[15086]46#include <icmpapi.h>
47#endif
[71982]48#include <alias.h>
[1]49
/*
 * sorecvfrom() hands an iovec array to WSARecvFrom() through a WSABUF cast,
 * so make the compiler verify that the corresponding members line up.
 */
#if defined(RT_OS_WINDOWS) && defined(DECLARE_IOVEC)
AssertCompileMembersSameSizeAndOffset(struct iovec, iov_len,  WSABUF, len);
AssertCompileMembersSameSizeAndOffset(struct iovec, iov_base, WSABUF, buf);
#endif
54
#ifdef VBOX_WITH_NAT_SEND2HOME
/**
 * Sends a copy of a datagram to every configured "home" address.
 *
 * For each entry of pData->pInSockAddrHomeAddress a socket is obtained from
 * soCloneUDPSocketWithForegnAddr() and the buffer is sent to that address with
 * sendto(), using the foreign port of @a pSo as the destination port.  A
 * failing sendto() is only logged; the loop carries on with the next address.
 *
 * @returns true if at least one sendto() reported more than zero bytes sent,
 *          false otherwise.  Also bails out with false as soon as no socket
 *          could be obtained for an address.
 * @param   pData   NAT state holding the home address table.
 * @param   pSo     Socket the datagram originates from.
 * @param   pvBuf   Payload to send.
 * @param   cbBuf   Payload size in bytes.
 * @param   iFlags  Flags handed to sendto() unchanged.
 */
DECLINLINE(bool) slirpSend2Home(PNATState pData, struct socket *pSo, const void *pvBuf, uint32_t cbBuf, int iFlags)
{
    int idxAddr;
    int ret = 0;
    bool fSendDone = false;
    LogFlowFunc(("Enter pSo:%R[natsock] pvBuf: %p, cbBuf: %d, iFlags: %d\n", pSo, pvBuf, cbBuf, iFlags));
    for (idxAddr = 0; idxAddr < pData->cInHomeAddressSize; ++idxAddr)
    {
        struct socket *pNewSocket = soCloneUDPSocketWithForegnAddr(pData, pSo, pData->pInSockAddrHomeAddress[idxAddr].sin_addr);
        /*
         * AssertReturn() takes the expression and the return value as two
         * separate arguments; the former extra parentheses folded them into a
         * single comma expression.
         */
        AssertReturn(pNewSocket, false);
        pData->pInSockAddrHomeAddress[idxAddr].sin_port = pSo->so_fport;
        /** @todo more verbose on errors,
         * @note: we shouldn't care if this send fail or not (we're in broadcast).
         */
        LogFunc(("send %d bytes to %RTnaipv4 from %R[natsock]\n", cbBuf, pData->pInSockAddrHomeAddress[idxAddr].sin_addr.s_addr, pNewSocket));
        ret = sendto(pNewSocket->s, pvBuf, cbBuf, iFlags, (struct sockaddr *)&pData->pInSockAddrHomeAddress[idxAddr], sizeof(struct sockaddr_in));
        if (ret < 0)
            LogFunc(("Failed to send %d bytes to %RTnaipv4\n", cbBuf, pData->pInSockAddrHomeAddress[idxAddr].sin_addr.s_addr));
        fSendDone |= ret > 0;
    }
    LogFlowFunc(("Leave %RTbool\n", fSendDone));
    return fSendDone;
}
#endif /* VBOX_WITH_NAT_SEND2HOME */
[53399]81
82#if !defined(RT_OS_WINDOWS)
[39101]83static void send_icmp_to_guest(PNATState, char *, size_t, const struct sockaddr_in *);
[15086]84static void sorecvfrom_icmp_unix(PNATState, struct socket *);
[15636]85#endif /* !RT_OS_WINDOWS */
[15086]86
/**
 * Socket module initialisation hook.  There is currently nothing to set up.
 */
void so_init(void)
{
    /* intentionally empty */
}
91
92struct socket *
[14964]93solookup(struct socket *head, struct in_addr laddr,
94 u_int lport, struct in_addr faddr, u_int fport)
[1]95{
[14964]96 struct socket *so;
[926]97
[14964]98 for (so = head->so_next; so != head; so = so->so_next)
99 {
100 if ( so->so_lport == lport
101 && so->so_laddr.s_addr == laddr.s_addr
102 && so->so_faddr.s_addr == faddr.s_addr
103 && so->so_fport == fport)
104 return so;
105 }
[926]106
[14964]107 return (struct socket *)NULL;
[1]108}
109
110/*
111 * Create a new socket, initialise the fields
112 * It is the responsibility of the caller to
113 * insque() it into the correct linked-list
114 */
115struct socket *
[63013]116socreate(void)
[1]117{
[14964]118 struct socket *so;
[926]119
[16443]120 so = (struct socket *)RTMemAllocZ(sizeof(struct socket));
[23369]121 if (so)
[14964]122 {
123 so->so_state = SS_NOFDREF;
124 so->s = -1;
[18902]125#if !defined(RT_OS_WINDOWS)
[16653]126 so->so_poll_index = -1;
127#endif
[14964]128 }
129 return so;
[1]130}
131
132/*
133 * remque and free a socket, clobber cache
134 */
135void
[1033]136sofree(PNATState pData, struct socket *so)
[1]137{
[41227]138 LogFlowFunc(("ENTER:%R[natsock]\n", so));
139 /*
140 * We should not remove socket when polling routine do the polling
141 * instead we mark it for deletion.
142 */
143 if (so->fUnderPolling)
144 {
145 so->fShouldBeRemoved = 1;
146 LogFlowFunc(("LEAVE:%R[natsock] postponed deletion\n", so));
147 return;
148 }
[41806]149 /**
150 * Check that we don't freeng socket with tcbcb
151 */
152 Assert(!sototcpcb(so));
[45261]153 /* udp checks */
154 Assert(!so->so_timeout);
155 Assert(!so->so_timeout_arg);
[14964]156 if (so == tcp_last_so)
157 tcp_last_so = &tcb;
158 else if (so == udp_last_so)
159 udp_last_so = &udb;
[41227]160
[15447]161 /* check if mbuf haven't been already freed */
162 if (so->so_m != NULL)
[41227]163 {
[28443]164 m_freem(pData, so->so_m);
[41227]165 so->so_m = NULL;
166 }
167
[52798]168 if (so->so_ohdr != NULL)
169 {
170 RTMemFree(so->so_ohdr);
171 so->so_ohdr = NULL;
172 }
173
[41227]174 if (so->so_next && so->so_prev)
[16653]175 {
[41227]176 remque(pData, so); /* crashes if so is not in a queue */
177 NSOCK_DEC();
178 }
[41178]179
[41227]180 RTMemFree(so);
181 LogFlowFuncLeave();
[1]182}
183
[71982]184
[72283]185/*
186 * Worker for sobind() below.
187 */
188static int
189sobindto(struct socket *so, uint32_t addr, uint16_t port)
[71982]190{
191 struct sockaddr_in self;
[72283]192 int status;
[71984]193
[72283]194 if (addr == INADDR_ANY && port == 0 && so->so_type != IPPROTO_UDP)
195 {
[72292]196 /* TCP sockets without constraints don't need to be bound */
197 Log2(("NAT: sobind: %s guest %RTnaipv4:%d - nothing to do\n",
198 so->so_type == IPPROTO_UDP ? "udp" : "tcp",
199 so->so_laddr.s_addr, ntohs(so->so_lport)));
200 return 0;
[72283]201 }
[71982]202
203 RT_ZERO(self);
204#ifdef RT_OS_DARWIN
205 self.sin_len = sizeof(self);
206#endif
207 self.sin_family = AF_INET;
[72283]208 self.sin_addr.s_addr = addr;
209 self.sin_port = port;
[71984]210
[72283]211 status = bind(so->s, (struct sockaddr *)&self, sizeof(self));
212 if (status == 0)
[71982]213 {
[72292]214 Log2(("NAT: sobind: %s guest %RTnaipv4:%d to host %RTnaipv4:%d\n",
215 so->so_type == IPPROTO_UDP ? "udp" : "tcp",
216 so->so_laddr.s_addr, ntohs(so->so_lport), addr, ntohs(port)));
217 return 0;
[71982]218 }
219
[72283]220 Log2(("NAT: sobind: %s guest %RTnaipv4:%d to host %RTnaipv4:%d error %d%s\n",
[72292]221 so->so_type == IPPROTO_UDP ? "udp" : "tcp",
222 so->so_laddr.s_addr, ntohs(so->so_lport),
223 addr, ntohs(port),
224 errno, port ? " (will retry with random port)" : ""));
[71982]225
[72283]226 if (port) /* retry without */
[72292]227 status = sobindto(so, addr, 0);
[71982]228
[72283]229 if (addr)
[72292]230 return status;
[72283]231 else
[72292]232 return 0;
[72283]233}
[71982]234
235
[72283]236/*
237 * Bind the socket to specific host address and/or port if necessary.
238 * We also always bind udp sockets to force the local port to be
239 * allocated and known in advance.
240 */
241int
242sobind(PNATState pData, struct socket *so)
243{
244 uint32_t addr = pData->bindIP.s_addr; /* may be INADDR_ANY */
245 bool fSamePorts = !!(pData->i32AliasMode & PKT_ALIAS_SAME_PORTS);
246 uint16_t port;
247 int status;
[71982]248
[72283]249 if (fSamePorts)
250 {
251 int opt = 1;
252 setsockopt(so->s, SOL_SOCKET, SO_REUSEADDR, (char *)&opt, sizeof(opt));
[72292]253 port = so->so_lport;
[72283]254 }
255 else
256 {
[72292]257 port = 0;
[72283]258 }
[71982]259
[72283]260 status = sobindto(so, addr, port);
261 return status;
[71982]262}
263
264
[1]265/*
266 * Read from so's socket into sb_snd, updating all relevant sbuf fields
267 * NOTE: This will only be called if it is select()ed for reading, so
268 * a read() of 0 (or less) means it's disconnected
269 */
270int
[16501]271soread(PNATState pData, struct socket *so)
[1]272{
[14964]273 int n, nn, lss, total;
274 struct sbuf *sb = &so->so_snd;
[63013]275 u_int len = sb->sb_datalen - sb->sb_cc;
[14964]276 struct iovec iov[2];
277 int mss = so->so_tcpcb->t_maxseg;
[63672]278 int sockerr;
[20378]279
[20712]280 STAM_PROFILE_START(&pData->StatIOread, a);
281 STAM_COUNTER_RESET(&pData->StatIORead_in_1);
282 STAM_COUNTER_RESET(&pData->StatIORead_in_2);
[20378]283
[16443]284 QSOCKET_LOCK(tcb);
285 SOCKET_LOCK(so);
286 QSOCKET_UNLOCK(tcb);
[926]287
[37743]288 LogFlow(("soread: so = %R[natsock]\n", so));
[56377]289 Log2(("%s: so = %R[natsock] so->so_snd = %R[sbuf]\n", RT_GCC_EXTENSION __PRETTY_FUNCTION__, so, sb));
[926]290
[14964]291 /*
292 * No need to check if there's enough room to read.
293 * soread wouldn't have been called if there weren't
294 */
[926]295
[14964]296 len = sb->sb_datalen - sb->sb_cc;
[926]297
[14964]298 iov[0].iov_base = sb->sb_wptr;
299 iov[1].iov_base = 0;
300 iov[1].iov_len = 0;
301 if (sb->sb_wptr < sb->sb_rptr)
302 {
303 iov[0].iov_len = sb->sb_rptr - sb->sb_wptr;
304 /* Should never succeed, but... */
305 if (iov[0].iov_len > len)
306 iov[0].iov_len = len;
307 if (iov[0].iov_len > mss)
308 iov[0].iov_len -= iov[0].iov_len%mss;
309 n = 1;
310 }
311 else
312 {
313 iov[0].iov_len = (sb->sb_data + sb->sb_datalen) - sb->sb_wptr;
314 /* Should never succeed, but... */
[15207]315 if (iov[0].iov_len > len)
316 iov[0].iov_len = len;
[14964]317 len -= iov[0].iov_len;
318 if (len)
319 {
320 iov[1].iov_base = sb->sb_data;
321 iov[1].iov_len = sb->sb_rptr - sb->sb_data;
[23369]322 if (iov[1].iov_len > len)
[14964]323 iov[1].iov_len = len;
324 total = iov[0].iov_len + iov[1].iov_len;
325 if (total > mss)
326 {
327 lss = total % mss;
328 if (iov[1].iov_len > lss)
329 {
330 iov[1].iov_len -= lss;
331 n = 2;
[14470]332 }
[14964]333 else
334 {
335 lss -= iov[1].iov_len;
336 iov[0].iov_len -= lss;
337 n = 1;
338 }
339 }
340 else
341 n = 2;
[14470]342 }
[14964]343 else
344 {
345 if (iov[0].iov_len > mss)
346 iov[0].iov_len -= iov[0].iov_len%mss;
347 n = 1;
348 }
349 }
[926]350
[1]351#ifdef HAVE_READV
[14964]352 nn = readv(so->s, (struct iovec *)iov, n);
[1]353#else
[28034]354 nn = recv(so->s, iov[0].iov_base, iov[0].iov_len, (so->so_tcpcb->t_force? MSG_OOB:0));
[926]355#endif
[63676]356 if (nn < 0)
357 sockerr = errno; /* save it, as it may be clobbered by logging */
358 else
359 sockerr = 0;
360
[56377]361 Log2(("%s: read(1) nn = %d bytes\n", RT_GCC_EXTENSION __PRETTY_FUNCTION__, nn));
362 Log2(("%s: so = %R[natsock] so->so_snd = %R[sbuf]\n", RT_GCC_EXTENSION __PRETTY_FUNCTION__, so, sb));
[14964]363 if (nn <= 0)
364 {
[64617]365#ifdef RT_OS_WINDOWS
366 /*
367 * Windows reports ESHUTDOWN after SHUT_RD (SD_RECEIVE)
368 * instead of just returning EOF indication.
369 */
370 if (nn < 0 && sockerr == ESHUTDOWN)
371 {
372 nn = 0;
373 sockerr = 0;
374 }
375#endif
376
[63675]377 if (nn == 0) /* XXX: should this be inside #if defined(RT_OS_WINDOWS)? */
[16501]378 {
[63675]379 /*
380 * Special case for WSAEnumNetworkEvents: If we receive 0 bytes that
381 * _could_ mean that the connection is closed. But we will receive an
382 * FD_CLOSE event later if the connection was _really_ closed. With
383 * www.youtube.com I see this very often. Closing the socket too early
384 * would be dangerous.
385 */
386 int status;
387 unsigned long pending = 0;
388 status = ioctlsocket(so->s, FIONREAD, &pending);
389 if (status < 0)
390 Log(("NAT:%s: error in WSAIoctl: %d\n", RT_GCC_EXTENSION __PRETTY_FUNCTION__, errno));
391 if (pending != 0)
392 {
393 SOCKET_UNLOCK(so);
394 STAM_PROFILE_STOP(&pData->StatIOread, a);
395 return 0;
396 }
[16443]397 }
[63675]398
[23369]399 if ( nn < 0
[63672]400 && soIgnorableErrorCode(sockerr))
[16443]401 {
402 SOCKET_UNLOCK(so);
[20378]403 STAM_PROFILE_STOP(&pData->StatIOread, a);
[14964]404 return 0;
[16443]405 }
[14964]406 else
407 {
[63668]408 int fUninitializedTemplate = 0;
[64298]409 int shuterr;
410
[63668]411 fUninitializedTemplate = RT_BOOL(( sototcpcb(so)
[41455]412 && ( sototcpcb(so)->t_template.ti_src.s_addr == INADDR_ANY
413 || sototcpcb(so)->t_template.ti_dst.s_addr == INADDR_ANY)));
[14964]414 /* nn == 0 means peer has performed an orderly shutdown */
[37743]415 Log2(("%s: disconnected, nn = %d, errno = %d (%s)\n",
[63672]416 RT_GCC_EXTENSION __PRETTY_FUNCTION__, nn, sockerr, strerror(sockerr)));
[64298]417
418 shuterr = sofcantrcvmore(so);
419 if (!sockerr && !shuterr && !fUninitializedTemplate)
[41455]420 tcp_sockclosed(pData, sototcpcb(so));
421 else
[64346]422 {
[64535]423 LogRel2(("NAT: sockerr %d, shuterr %d - %R[natsock]\n", sockerr, shuterr, so));
[63672]424 tcp_drop(pData, sototcpcb(so), sockerr);
[64346]425 }
[16443]426 SOCKET_UNLOCK(so);
[20378]427 STAM_PROFILE_STOP(&pData->StatIOread, a);
[14964]428 return -1;
[14470]429 }
[14964]430 }
[20378]431 STAM_STATS(
432 if (n == 1)
[20712]433 {
[20378]434 STAM_COUNTER_INC(&pData->StatIORead_in_1);
435 STAM_COUNTER_ADD(&pData->StatIORead_in_1_bytes, nn);
436 }
437 else
438 {
439 STAM_COUNTER_INC(&pData->StatIORead_in_2);
440 STAM_COUNTER_ADD(&pData->StatIORead_in_2_1st_bytes, nn);
441 }
442 );
[926]443
[1]444#ifndef HAVE_READV
[14964]445 /*
446 * If there was no error, try and read the second time round
447 * We read again if n = 2 (ie, there's another part of the buffer)
448 * and we read as much as we could in the first read
449 * We don't test for <= 0 this time, because there legitimately
450 * might not be any more data (since the socket is non-blocking),
451 * a close will be detected on next iteration.
452 * A return of -1 wont (shouldn't) happen, since it didn't happen above
453 */
[63013]454 if (n == 2 && (unsigned)nn == iov[0].iov_len)
[14964]455 {
456 int ret;
[17191]457 ret = recv(so->s, iov[1].iov_base, iov[1].iov_len, 0);
[14964]458 if (ret > 0)
459 nn += ret;
[20378]460 STAM_STATS(
[23369]461 if (ret > 0)
[20378]462 {
[20712]463 STAM_COUNTER_INC(&pData->StatIORead_in_2);
464 STAM_COUNTER_ADD(&pData->StatIORead_in_2_2nd_bytes, ret);
[20378]465 }
466 );
[14964]467 }
[926]468
[56377]469 Log2(("%s: read(2) nn = %d bytes\n", RT_GCC_EXTENSION __PRETTY_FUNCTION__, nn));
[1]470#endif
[926]471
[14964]472 /* Update fields */
473 sb->sb_cc += nn;
474 sb->sb_wptr += nn;
[56377]475 Log2(("%s: update so_snd (readed nn = %d) %R[sbuf]\n", RT_GCC_EXTENSION __PRETTY_FUNCTION__, nn, sb));
[14964]476 if (sb->sb_wptr >= (sb->sb_data + sb->sb_datalen))
[37745]477 {
[14964]478 sb->sb_wptr -= sb->sb_datalen;
[56377]479 Log2(("%s: alter sb_wptr so_snd = %R[sbuf]\n", RT_GCC_EXTENSION __PRETTY_FUNCTION__, sb));
[37745]480 }
[20378]481 STAM_PROFILE_STOP(&pData->StatIOread, a);
[16443]482 SOCKET_UNLOCK(so);
[14964]483 return nn;
[1]484}
[926]485
[1]486/*
487 * Get urgent data
[926]488 *
[1]489 * When the socket is created, we set it SO_OOBINLINE,
490 * so when OOB data arrives, we soread() it and everything
491 * in the send buffer is sent as urgent data
492 */
493void
[1033]494sorecvoob(PNATState pData, struct socket *so)
[1]495{
[14964]496 struct tcpcb *tp = sototcpcb(so);
[28034]497 ssize_t ret;
[1]498
[37936]499 LogFlowFunc(("sorecvoob: so = %R[natsock]\n", so));
[926]500
[14964]501 /*
502 * We take a guess at how much urgent data has arrived.
503 * In most situations, when urgent data arrives, the next
504 * read() should get all the urgent data. This guess will
505 * be wrong however if more data arrives just after the
506 * urgent data, or the read() doesn't return all the
507 * urgent data.
508 */
[28034]509 ret = soread(pData, so);
[40120]510 if (RT_LIKELY(ret > 0))
511 {
[80279]512 /*
513 * @todo for now just scrub the URG pointer. To faithfully
514 * proxy URG we need to read the srteam until SIOCATMARK, and
515 * then mark the first byte of the next read ar urgent.
516 */
517#if 0
[40120]518 tp->snd_up = tp->snd_una + SBUF_LEN(&so->so_snd);
[80279]519#endif
[40120]520 tp->t_force = 1;
521 tcp_output(pData, tp);
522 tp->t_force = 0;
523 }
[1]524}
[54827]525
[1]526/*
527 * Send urgent data
528 * There's a lot duplicated code here, but...
529 */
530int
[14964]531sosendoob(struct socket *so)
[1]532{
[14964]533 struct sbuf *sb = &so->so_rcv;
534 char buff[2048]; /* XXX Shouldn't be sending more oob data than this */
[926]535
[14964]536 int n, len;
[13738]537
[37936]538 LogFlowFunc(("sosendoob so = %R[natsock]\n", so));
[926]539
[14964]540 if (so->so_urgc > sizeof(buff))
541 so->so_urgc = sizeof(buff); /* XXX */
[926]542
[14964]543 if (sb->sb_rptr < sb->sb_wptr)
544 {
545 /* We can send it directly */
546 n = send(so->s, sb->sb_rptr, so->so_urgc, (MSG_OOB)); /* |MSG_DONTWAIT)); */
547 so->so_urgc -= n;
[926]548
[34103]549 Log2((" --- sent %d bytes urgent data, %d urgent bytes left\n",
550 n, so->so_urgc));
[14964]551 }
552 else
553 {
554 /*
555 * Since there's no sendv or sendtov like writev,
556 * we must copy all data to a linear buffer then
557 * send it all
558 */
559 len = (sb->sb_data + sb->sb_datalen) - sb->sb_rptr;
[15293]560 if (len > so->so_urgc)
561 len = so->so_urgc;
[14964]562 memcpy(buff, sb->sb_rptr, len);
563 so->so_urgc -= len;
564 if (so->so_urgc)
565 {
566 n = sb->sb_wptr - sb->sb_data;
567 if (n > so->so_urgc)
568 n = so->so_urgc;
569 memcpy(buff + len, sb->sb_data, n);
570 so->so_urgc -= n;
571 len += n;
572 }
573 n = send(so->s, buff, len, (MSG_OOB)); /* |MSG_DONTWAIT)); */
[1]574#ifdef DEBUG
[14964]575 if (n != len)
[34103]576 Log(("Didn't send all data urgently XXXXX\n"));
[926]577#endif
[34103]578 Log2((" ---2 sent %d bytes urgent data, %d urgent bytes left\n",
579 n, so->so_urgc));
[14964]580 }
[926]581
[14964]582 sb->sb_cc -= n;
583 sb->sb_rptr += n;
584 if (sb->sb_rptr >= (sb->sb_data + sb->sb_datalen))
585 sb->sb_rptr -= sb->sb_datalen;
[926]586
[14964]587 return n;
[1]588}
589
590/*
[926]591 * Write data from so_rcv to so's socket,
[1]592 * updating all sbuf field as necessary
593 */
594int
[1033]595sowrite(PNATState pData, struct socket *so)
[1]596{
[17191]597 int n, nn;
[14964]598 struct sbuf *sb = &so->so_rcv;
[63013]599 u_int len = sb->sb_cc;
[14964]600 struct iovec iov[2];
[926]601
[20712]602 STAM_PROFILE_START(&pData->StatIOwrite, a);
603 STAM_COUNTER_RESET(&pData->StatIOWrite_in_1);
604 STAM_COUNTER_RESET(&pData->StatIOWrite_in_1_bytes);
605 STAM_COUNTER_RESET(&pData->StatIOWrite_in_2);
606 STAM_COUNTER_RESET(&pData->StatIOWrite_in_2_1st_bytes);
607 STAM_COUNTER_RESET(&pData->StatIOWrite_in_2_2nd_bytes);
608 STAM_COUNTER_RESET(&pData->StatIOWrite_no_w);
609 STAM_COUNTER_RESET(&pData->StatIOWrite_rest);
610 STAM_COUNTER_RESET(&pData->StatIOWrite_rest_bytes);
[37745]611 LogFlowFunc(("so = %R[natsock]\n", so));
[56377]612 Log2(("%s: so = %R[natsock] so->so_rcv = %R[sbuf]\n", RT_GCC_EXTENSION __PRETTY_FUNCTION__, so, sb));
[16443]613 QSOCKET_LOCK(tcb);
614 SOCKET_LOCK(so);
615 QSOCKET_UNLOCK(tcb);
[14964]616 if (so->so_urgc)
617 {
618 sosendoob(so);
619 if (sb->sb_cc == 0)
[16443]620 {
[16562]621 SOCKET_UNLOCK(so);
[20378]622 STAM_PROFILE_STOP(&pData->StatIOwrite, a);
[14964]623 return 0;
[16443]624 }
[14964]625 }
[1]626
[14964]627 /*
628 * No need to check if there's something to write,
629 * sowrite wouldn't have been called otherwise
630 */
[926]631
[14964]632 len = sb->sb_cc;
[926]633
[14964]634 iov[0].iov_base = sb->sb_rptr;
635 iov[1].iov_base = 0;
636 iov[1].iov_len = 0;
637 if (sb->sb_rptr < sb->sb_wptr)
638 {
639 iov[0].iov_len = sb->sb_wptr - sb->sb_rptr;
640 /* Should never succeed, but... */
[15207]641 if (iov[0].iov_len > len)
642 iov[0].iov_len = len;
[14964]643 n = 1;
644 }
645 else
646 {
647 iov[0].iov_len = (sb->sb_data + sb->sb_datalen) - sb->sb_rptr;
[15207]648 if (iov[0].iov_len > len)
649 iov[0].iov_len = len;
[14964]650 len -= iov[0].iov_len;
651 if (len)
652 {
653 iov[1].iov_base = sb->sb_data;
654 iov[1].iov_len = sb->sb_wptr - sb->sb_data;
655 if (iov[1].iov_len > len)
656 iov[1].iov_len = len;
657 n = 2;
[14470]658 }
[14964]659 else
660 n = 1;
661 }
[20378]662 STAM_STATS({
[20712]663 if (n == 1)
[20378]664 {
[20712]665 STAM_COUNTER_INC(&pData->StatIOWrite_in_1);
666 STAM_COUNTER_ADD(&pData->StatIOWrite_in_1_bytes, iov[0].iov_len);
[20378]667 }
668 else
669 {
[20712]670 STAM_COUNTER_INC(&pData->StatIOWrite_in_2);
671 STAM_COUNTER_ADD(&pData->StatIOWrite_in_2_1st_bytes, iov[0].iov_len);
672 STAM_COUNTER_ADD(&pData->StatIOWrite_in_2_2nd_bytes, iov[1].iov_len);
[20378]673 }
674 });
[14964]675 /* Check if there's urgent data to send, and if so, send it */
[1]676#ifdef HAVE_READV
[14964]677 nn = writev(so->s, (const struct iovec *)iov, n);
[1]678#else
[14964]679 nn = send(so->s, iov[0].iov_base, iov[0].iov_len, 0);
[1]680#endif
[56377]681 Log2(("%s: wrote(1) nn = %d bytes\n", RT_GCC_EXTENSION __PRETTY_FUNCTION__, nn));
[14964]682 /* This should never happen, but people tell me it does *shrug* */
[23369]683 if ( nn < 0
[41855]684 && soIgnorableErrorCode(errno))
[16443]685 {
[16562]686 SOCKET_UNLOCK(so);
[20378]687 STAM_PROFILE_STOP(&pData->StatIOwrite, a);
[14964]688 return 0;
[16443]689 }
[926]690
[14964]691 if (nn < 0 || (nn == 0 && iov[0].iov_len > 0))
692 {
[37743]693 Log2(("%s: disconnected, so->so_state = %x, errno = %d\n",
[56377]694 RT_GCC_EXTENSION __PRETTY_FUNCTION__, so->so_state, errno));
[14964]695 sofcantsendmore(so);
696 tcp_sockclosed(pData, sototcpcb(so));
[16562]697 SOCKET_UNLOCK(so);
[20378]698 STAM_PROFILE_STOP(&pData->StatIOwrite, a);
[14964]699 return -1;
700 }
[926]701
[1]702#ifndef HAVE_READV
[63013]703 if (n == 2 && (unsigned)nn == iov[0].iov_len)
[14964]704 {
705 int ret;
[17191]706 ret = send(so->s, iov[1].iov_base, iov[1].iov_len, 0);
[14964]707 if (ret > 0)
708 nn += ret;
[63016]709# ifdef VBOX_WITH_STATISTICS
710 if (ret > 0 && ret != (ssize_t)iov[1].iov_len)
711 {
712 STAM_COUNTER_INC(&pData->StatIOWrite_rest);
713 STAM_COUNTER_ADD(&pData->StatIOWrite_rest_bytes, (iov[1].iov_len - ret));
714 }
715#endif
[14964]716 }
[56377]717 Log2(("%s: wrote(2) nn = %d bytes\n", RT_GCC_EXTENSION __PRETTY_FUNCTION__, nn));
[1]718#endif
[926]719
[14964]720 /* Update sbuf */
721 sb->sb_cc -= nn;
722 sb->sb_rptr += nn;
[56377]723 Log2(("%s: update so_rcv (written nn = %d) %R[sbuf]\n", RT_GCC_EXTENSION __PRETTY_FUNCTION__, nn, sb));
[14964]724 if (sb->sb_rptr >= (sb->sb_data + sb->sb_datalen))
[37745]725 {
[14964]726 sb->sb_rptr -= sb->sb_datalen;
[56377]727 Log2(("%s: alter sb_rptr of so_rcv %R[sbuf]\n", RT_GCC_EXTENSION __PRETTY_FUNCTION__, sb));
[37745]728 }
[926]729
[14964]730 /*
731 * If in DRAIN mode, and there's no more data, set
732 * it CANTSENDMORE
733 */
734 if ((so->so_state & SS_FWDRAIN) && sb->sb_cc == 0)
735 sofcantsendmore(so);
[926]736
[16562]737 SOCKET_UNLOCK(so);
[20712]738 STAM_PROFILE_STOP(&pData->StatIOwrite, a);
[14964]739 return nn;
[1]740}
741
742/*
743 * recvfrom() a UDP socket
744 */
745void
[1033]746sorecvfrom(PNATState pData, struct socket *so)
[1]747{
[63016]748 LogFlowFunc(("sorecvfrom: so = %p\n", so));
[53624]749
[53399]750#ifdef RT_OS_WINDOWS
751 /* ping is handled with ICMP API in ip_icmpwin.c */
752 Assert(so->so_type == IPPROTO_UDP);
753#else
[15207]754 if (so->so_type == IPPROTO_ICMP)
755 {
756 /* This is a "ping" reply */
757 sorecvfrom_icmp_unix(pData, so);
758 udp_detach(pData, so);
759 }
760 else
[53399]761#endif /* !RT_OS_WINDOWS */
[15207]762 {
[63013]763 static char achBuf[64 * 1024];
[52154]764
[15207]765 /* A "normal" UDP packet */
[52154]766 struct sockaddr_in addr;
767 socklen_t addrlen = sizeof(struct sockaddr_in);
768 struct iovec iov[2];
769 ssize_t nread;
[15207]770 struct mbuf *m;
[1]771
[16443]772 QSOCKET_LOCK(udb);
773 SOCKET_LOCK(so);
774 QSOCKET_UNLOCK(udb);
775
[52154]776 m = m_getjcl(pData, M_NOWAIT, MT_HEADER, M_PKTHDR, slirp_size(pData));
777 if (m == NULL)
[23154]778 {
[52154]779 SOCKET_UNLOCK(so);
[33145]780 return;
[23369]781 }
[23154]782
[23369]783 m->m_data += ETH_HLEN;
[23154]784 m->m_pkthdr.header = mtod(m, void *);
[52154]785
[23369]786 m->m_data += sizeof(struct udpiphdr);
[34038]787
[52154]788 /* small packets will fit without copying */
789 iov[0].iov_base = mtod(m, char *);
790 iov[0].iov_len = M_TRAILINGSPACE(m);
791
792 /* large packets will spill into a temp buffer */
[63013]793 iov[1].iov_base = achBuf;
794 iov[1].iov_len = sizeof(achBuf);
[52154]795
796#if !defined(RT_OS_WINDOWS)
[34038]797 {
[52154]798 struct msghdr mh;
799 memset(&mh, 0, sizeof(mh));
800
801 mh.msg_iov = iov;
802 mh.msg_iovlen = 2;
803 mh.msg_name = &addr;
804 mh.msg_namelen = addrlen;
805
806 nread = recvmsg(so->s, &mh, 0);
[34038]807 }
[52154]808#else /* RT_OS_WINDOWS */
[34038]809 {
[52154]810 DWORD nbytes; /* NB: can't use nread b/c of different size */
[63013]811 DWORD flags = 0;
[52154]812 int status;
[63013]813 AssertCompile(sizeof(WSABUF) == sizeof(struct iovec));
814 AssertCompileMembersSameSizeAndOffset(WSABUF, len, struct iovec, iov_len);
815 AssertCompileMembersSameSizeAndOffset(WSABUF, buf, struct iovec, iov_base);
816 status = WSARecvFrom(so->s, (WSABUF *)&iov[0], 2, &nbytes, &flags,
[52154]817 (struct sockaddr *)&addr, &addrlen,
818 NULL, NULL);
[53510]819 if (status != SOCKET_ERROR)
820 nread = nbytes;
821 else
822 nread = -1;
[34038]823 }
[52154]824#endif
825 if (nread >= 0)
[22940]826 {
[52154]827 if (nread <= iov[0].iov_len)
828 m->m_len = nread;
829 else
[23241]830 {
[52154]831 m->m_len = iov[0].iov_len;
832 m_append(pData, m, nread - iov[0].iov_len, iov[1].iov_base);
[23241]833 }
[63121]834 Assert(m_length(m, NULL) == (size_t)nread);
[23241]835
[14964]836 /*
837 * Hack: domain name lookup will be used the most for UDP,
838 * and since they'll only be used once there's no need
839 * for the 4 minute (or whatever) timeout... So we time them
840 * out much quicker (10 seconds for now...)
841 */
842 if (so->so_expire)
843 {
[25822]844 if (so->so_fport != RT_H2N_U16_C(53))
[17302]845 so->so_expire = curtime + SO_EXPIRE;
846 }
[52154]847
[20712]848 /*
[55002]849 * DNS proxy requests are forwarded to the real resolver,
850 * but its socket's so_faddr is that of the DNS proxy
851 * itself.
852 *
853 * last argument should be changed if Slirp will inject IP attributes
[17302]854 */
[30013]855 if ( pData->fUseDnsProxy
[55002]856 && so->so_fport == RT_H2N_U16_C(53)
857 && CTL_CHECK(so->so_faddr.s_addr, CTL_DNS))
[20712]858 dnsproxy_answer(pData, so, m);
[1]859
[39269]860 /* packets definetly will be fragmented, could confuse receiver peer. */
[52154]861 if (nread > if_mtu)
[39268]862 m->m_flags |= M_SKIP_FIREWALL;
[52154]863
[14470]864 /*
865 * If this packet was destined for CTL_ADDR,
866 * make it look like that's where it came from, done by udp_output
867 */
868 udp_output(pData, so, m, &addr);
[52154]869 }
870 else
871 {
872 m_freem(pData, m);
873
874 if (!soIgnorableErrorCode(errno))
875 {
876 u_char code;
877 if (errno == EHOSTUNREACH)
878 code = ICMP_UNREACH_HOST;
879 else if (errno == ENETUNREACH)
880 code = ICMP_UNREACH_NET;
881 else
882 code = ICMP_UNREACH_PORT;
883
884 Log2((" rx error, tx icmp ICMP_UNREACH:%i\n", code));
885 icmp_error(pData, so->so_m, ICMP_UNREACH, code, 0, strerror(errno));
[53056]886 so->so_m = NULL;
[52154]887 }
888 }
889
890 SOCKET_UNLOCK(so);
891 }
[1]892}
893
894/*
895 * sendto() a socket
896 */
897int
[1033]898sosendto(PNATState pData, struct socket *so, struct mbuf *m)
[1]899{
[14964]900 int ret;
[21665]901 struct sockaddr_in *paddr;
902 struct sockaddr addr;
[3226]903#if 0
[14964]904 struct sockaddr_in host_addr;
[3226]905#endif
[35826]906 caddr_t buf = 0;
[22940]907 int mlen;
[1]908
[63016]909 LogFlowFunc(("sosendto: so = %R[natsock], m = %p\n", so, m));
[23369]910
[21665]911 memset(&addr, 0, sizeof(struct sockaddr));
912#ifdef RT_OS_DARWIN
913 addr.sa_len = sizeof(struct sockaddr_in);
914#endif
915 paddr = (struct sockaddr_in *)&addr;
916 paddr->sin_family = AF_INET;
[25822]917 if ((so->so_faddr.s_addr & RT_H2N_U32(pData->netmask)) == pData->special_addr.s_addr)
[14964]918 {
919 /* It's an alias */
[25822]920 uint32_t last_byte = RT_N2H_U32(so->so_faddr.s_addr) & ~pData->netmask;
[14964]921 switch(last_byte)
922 {
[8009]923#if 0
[14964]924 /* handle this case at 'default:' */
925 case CTL_BROADCAST:
926 addr.sin_addr.s_addr = INADDR_BROADCAST;
927 /* Send the packet to host to fully emulate broadcast */
928 /** @todo r=klaus: on Linux host this causes the host to receive
929 * the packet twice for some reason. And I cannot find any place
930 * in the man pages which states that sending a broadcast does not
931 * reach the host itself. */
932 host_addr.sin_family = AF_INET;
933 host_addr.sin_port = so->so_fport;
934 host_addr.sin_addr = our_addr;
935 sendto(so->s, m->m_data, m->m_len, 0,
936 (struct sockaddr *)&host_addr, sizeof (struct sockaddr));
937 break;
[1824]938#endif
[14964]939 case CTL_DNS:
940 case CTL_ALIAS:
941 default:
942 if (last_byte == ~pData->netmask)
[21665]943 paddr->sin_addr.s_addr = INADDR_BROADCAST;
[14964]944 else
[21665]945 paddr->sin_addr = loopback_addr;
[14964]946 break;
947 }
948 }
949 else
[21665]950 paddr->sin_addr = so->so_faddr;
951 paddr->sin_port = so->so_fport;
[1]952
[34103]953 Log2((" sendto()ing, addr.sin_port=%d, addr.sin_addr.s_addr=%.16s\n",
954 RT_N2H_U16(paddr->sin_port), inet_ntoa(paddr->sin_addr)));
[926]955
[14964]956 /* Don't care what port we get */
[35826]957 /*
[37423]958 * > nmap -sV -T4 -O -A -v -PU3483 255.255.255.255
[35826]959 * generates bodyless messages, annoying memmory management system.
960 */
[22940]961 mlen = m_length(m, NULL);
[35826]962 if (mlen > 0)
[22940]963 {
[35826]964 buf = RTMemAlloc(mlen);
965 if (buf == NULL)
966 {
967 return -1;
968 }
969 m_copydata(m, 0, mlen, buf);
[22940]970 }
971 ret = sendto(so->s, buf, mlen, 0,
972 (struct sockaddr *)&addr, sizeof (struct sockaddr));
[39287]973#ifdef VBOX_WITH_NAT_SEND2HOME
974 if (slirpIsWideCasting(pData, so->so_faddr.s_addr))
975 {
976 slirpSend2Home(pData, so, buf, mlen, 0);
977 }
978#endif
[35826]979 if (buf)
980 RTMemFree(buf);
[14964]981 if (ret < 0)
[15355]982 {
[24065]983 Log2(("UDP: sendto fails (%s)\n", strerror(errno)));
[14964]984 return -1;
[15355]985 }
[926]986
[14964]987 /*
988 * Kill the socket if there's no reply in 4 minutes,
989 * but only if it's an expirable socket
990 */
991 if (so->so_expire)
992 so->so_expire = curtime + SO_EXPIRE;
993 so->so_state = SS_ISFCONNECTED; /* So that it gets select()ed */
994 return 0;
[1]995}
996
997/*
998 * XXX This should really be tcp_listen
999 */
1000struct socket *
[21004]1001solisten(PNATState pData, u_int32_t bind_addr, u_int port, u_int32_t laddr, u_int lport, int flags)
[1]1002{
[14964]1003 struct sockaddr_in addr;
1004 struct socket *so;
1005 socklen_t addrlen = sizeof(addr);
1006 int s, opt = 1;
[20297]1007 int status;
[1]1008
[37936]1009 LogFlowFunc(("solisten: port = %d, laddr = %x, lport = %d, flags = %x\n", port, laddr, lport, flags));
[926]1010
[14964]1011 if ((so = socreate()) == NULL)
1012 {
[15792]1013 /* RTMemFree(so); Not sofree() ??? free(NULL) == NOP */
[14964]1014 return NULL;
1015 }
[926]1016
[14964]1017 /* Don't tcp_attach... we don't need so_snd nor so_rcv */
1018 if ((so->so_tcpcb = tcp_newtcpcb(pData, so)) == NULL)
1019 {
[15792]1020 RTMemFree(so);
[14964]1021 return NULL;
1022 }
[16291]1023
[16293]1024 SOCKET_LOCK_CREATE(so);
[16291]1025 SOCKET_LOCK(so);
1026 QSOCKET_LOCK(tcb);
[14964]1027 insque(pData, so,&tcb);
[16562]1028 NSOCK_INC();
[16291]1029 QSOCKET_UNLOCK(tcb);
[926]1030
[14964]1031 /*
1032 * SS_FACCEPTONCE sockets must time out.
1033 */
1034 if (flags & SS_FACCEPTONCE)
1035 so->so_tcpcb->t_timer[TCPT_KEEP] = TCPTV_KEEP_INIT*2;
[926]1036
[14964]1037 so->so_state = (SS_FACCEPTCONN|flags);
1038 so->so_lport = lport; /* Kept in network format */
1039 so->so_laddr.s_addr = laddr; /* Ditto */
[926]1040
[21049]1041 memset(&addr, 0, sizeof(addr));
1042#ifdef RT_OS_DARWIN
1043 addr.sin_len = sizeof(addr);
1044#endif
[14964]1045 addr.sin_family = AF_INET;
[21004]1046 addr.sin_addr.s_addr = bind_addr;
[14964]1047 addr.sin_port = port;
[926]1048
[38108]1049 /**
1050 * changing listen(,1->SOMAXCONN) shouldn't be harmful for NAT's TCP/IP stack,
1051 * kernel will choose the optimal value for requests queue length.
1052 * @note: MSDN recommends low (2-4) values for bluetooth networking devices.
1053 */
[17191]1054 if ( ((s = socket(AF_INET, SOCK_STREAM, 0)) < 0)
1055 || (setsockopt(s, SOL_SOCKET, SO_REUSEADDR,(char *)&opt, sizeof(int)) < 0)
[14964]1056 || (bind(s,(struct sockaddr *)&addr, sizeof(addr)) < 0)
[38111]1057 || (listen(s, pData->soMaxConn) < 0))
[14964]1058 {
[3666]1059#ifdef RT_OS_WINDOWS
[14964]1060 int tmperrno = WSAGetLastError(); /* Don't clobber the real reason we failed */
1061 closesocket(s);
[16443]1062 QSOCKET_LOCK(tcb);
[14964]1063 sofree(pData, so);
[16443]1064 QSOCKET_UNLOCK(tcb);
[14964]1065 /* Restore the real errno */
1066 WSASetLastError(tmperrno);
[1]1067#else
[14964]1068 int tmperrno = errno; /* Don't clobber the real reason we failed */
1069 close(s);
[43752]1070 if (sototcpcb(so))
1071 tcp_close(pData, sototcpcb(so));
1072 else
1073 sofree(pData, so);
[14964]1074 /* Restore the real errno */
1075 errno = tmperrno;
[1]1076#endif
[14964]1077 return NULL;
1078 }
[20297]1079 fd_nonblock(s);
[17191]1080 setsockopt(s, SOL_SOCKET, SO_OOBINLINE,(char *)&opt, sizeof(int));
[926]1081
[14964]1082 getsockname(s,(struct sockaddr *)&addr,&addrlen);
1083 so->so_fport = addr.sin_port;
[20297]1084 /* set socket buffers */
[20379]1085 opt = pData->socket_rcv;
[20297]1086 status = setsockopt(s, SOL_SOCKET, SO_RCVBUF, (char *)&opt, sizeof(int));
1087 if (status < 0)
1088 {
1089 LogRel(("NAT: Error(%d) while setting RCV capacity to (%d)\n", errno, opt));
1090 goto no_sockopt;
1091 }
[20379]1092 opt = pData->socket_snd;
[20297]1093 status = setsockopt(s, SOL_SOCKET, SO_SNDBUF, (char *)&opt, sizeof(int));
1094 if (status < 0)
1095 {
1096 LogRel(("NAT: Error(%d) while setting SND capacity to (%d)\n", errno, opt));
1097 goto no_sockopt;
1098 }
1099no_sockopt:
[14964]1100 if (addr.sin_addr.s_addr == 0 || addr.sin_addr.s_addr == loopback_addr.s_addr)
1101 so->so_faddr = alias_addr;
1102 else
1103 so->so_faddr = addr.sin_addr;
[1]1104
[14964]1105 so->s = s;
[16445]1106 SOCKET_UNLOCK(so);
[14964]1107 return so;
[1]1108}
1109
/**
 * Notification that data is available in so_rcv.
 *
 * Currently a no-op.  The historical implementation (calling sowrite() on the
 * socket and clearing its descriptor from writefds) has been disabled.
 *
 * @todo do we really need this function, what it's intended to do?
 *
 * @param   so      The socket (unused).
 */
void
sorwakeup(struct socket *so)
{
    NOREF(so);
}
[926]1125
/**
 * Notification that data has been freed in so_snd, i.e. there is room for a
 * read() if we want one.
 *
 * Nothing is read here; that is left to the main loop, so this is a no-op.
 *
 * @param   so      The socket (unused).
 */
void
sowwakeup(struct socket *so)
{
    NOREF(so);
}
1136
1137/*
1138 * Various session state calls
1139 * XXX Should be #define's
1140 * The socket state stuff needs work, these often get call 2 or 3
1141 * times each when only 1 was needed
1142 */
1143void
[14964]1144soisfconnecting(struct socket *so)
[1]1145{
[14964]1146 so->so_state &= ~(SS_NOFDREF|SS_ISFCONNECTED|SS_FCANTRCVMORE|
1147 SS_FCANTSENDMORE|SS_FWDRAIN);
1148 so->so_state |= SS_ISFCONNECTING; /* Clobber other states */
[1]1149}
1150
1151void
[14964]1152soisfconnected(struct socket *so)
[1]1153{
[39556]1154 LogFlowFunc(("ENTER: so:%R[natsock]\n", so));
[14964]1155 so->so_state &= ~(SS_ISFCONNECTING|SS_FWDRAIN|SS_NOFDREF);
1156 so->so_state |= SS_ISFCONNECTED; /* Clobber other states */
[39556]1157 LogFlowFunc(("LEAVE: so:%R[natsock]\n", so));
[1]1158}
1159
[64298]1160int
[14964]1161sofcantrcvmore(struct socket *so)
[1]1162{
[64298]1163 int err = 0;
1164
[40621]1165 LogFlowFunc(("ENTER: so:%R[natsock]\n", so));
[14964]1166 if ((so->so_state & SS_NOFDREF) == 0)
1167 {
[64298]1168 /*
1169 * If remote closes first and then sends an RST, the recv() in
1170 * soread() will keep reporting EOF without any error
[64534]1171 * indication. As far as I can tell the only way to detect
1172 * this on Linux is to check if shutdown() succeeds here (but
1173 * see below).
[65849]1174 *
[64534]1175 * OTOH on OS X shutdown() "helpfully" checks if remote has
1176 * already closed and then always returns ENOTCONN
1177 * immediately.
[64298]1178 */
[64534]1179 int status = shutdown(so->s, SHUT_RD);
1180#if defined(RT_OS_LINUX)
[64298]1181 if (status < 0)
1182 err = errno;
[64534]1183#else
1184 RT_NOREF(status);
1185#endif
[14964]1186 }
1187 so->so_state &= ~(SS_ISFCONNECTING);
1188 if (so->so_state & SS_FCANTSENDMORE)
[64298]1189 {
[64534]1190#if defined(RT_OS_LINUX)
[64298]1191 /*
1192 * If we have closed first, and remote closes, shutdown will
1193 * return ENOTCONN, but this is expected. Don't tell the
1194 * caller there was an error.
1195 */
1196 if (err == ENOTCONN)
1197 err = 0;
[64534]1198#endif
[15035]1199 so->so_state = SS_NOFDREF; /* Don't select it */
[14964]1200 /* XXX close() here as well? */
[64298]1201 }
[14964]1202 else
1203 so->so_state |= SS_FCANTRCVMORE;
[64298]1204
1205 LogFlowFunc(("LEAVE: %d\n", err));
1206 return err;
[1]1207}
1208
1209void
[14964]1210sofcantsendmore(struct socket *so)
[1]1211{
[40621]1212 LogFlowFunc(("ENTER: so:%R[natsock]\n", so));
[14964]1213 if ((so->so_state & SS_NOFDREF) == 0)
1214 shutdown(so->s, 1); /* send FIN to fhost */
1215
1216 so->so_state &= ~(SS_ISFCONNECTING);
1217 if (so->so_state & SS_FCANTRCVMORE)
1218 so->so_state = SS_NOFDREF; /* as above */
1219 else
1220 so->so_state |= SS_FCANTSENDMORE;
[40621]1221 LogFlowFuncLeave();
[1]1222}
1223
/**
 * Notification that the foreign side is disconnected.
 *
 * Currently a no-op.  The historical implementation (clearing
 * SS_ISFCONNECTING/SS_ISFCONNECTED, closing the host socket and setting
 * SS_ISFDISCONNECTED) has been disabled.
 *
 * @param   so      The socket (unused).
 */
void
soisfdisconnected(struct socket *so)
{
    NOREF(so);
}
1237
1238/*
1239 * Set write drain mode
1240 * Set CANTSENDMORE once all data has been write()n
1241 */
1242void
[14964]1243sofwdrain(struct socket *so)
[1]1244{
[30045]1245 if (SBUF_LEN(&so->so_rcv))
[14964]1246 so->so_state |= SS_FWDRAIN;
1247 else
1248 sofcantsendmore(so);
[1]1249}
[15086]1250
[53399]1251#if !defined(RT_OS_WINDOWS)
[15232]1252static void
[39101]1253send_icmp_to_guest(PNATState pData, char *buff, size_t len, const struct sockaddr_in *addr)
[15086]1254{
[15207]1255 struct ip *ip;
[17191]1256 uint32_t dst, src;
[15207]1257 char ip_copy[256];
1258 struct icmp *icp;
[16571]1259 int old_ip_len = 0;
[15573]1260 int hlen, original_hlen = 0;
[15207]1261 struct mbuf *m;
1262 struct icmp_msg *icm;
[15365]1263 uint8_t proto;
[16293]1264 int type = 0;
[15138]1265
[15207]1266 ip = (struct ip *)buff;
[27573]1267 /* Fix ip->ip_len to contain the total packet length including the header
1268 * in _host_ byte order for all OSes. On Darwin, that value already is in
1269 * host byte order. Solaris and Darwin report only the payload. */
[27568]1270#ifndef RT_OS_DARWIN
[27466]1271 ip->ip_len = RT_N2H_U16(ip->ip_len);
1272#endif
[27573]1273 hlen = (ip->ip_hl << 2);
[27568]1274#if defined(RT_OS_SOLARIS) || defined(RT_OS_DARWIN)
[27573]1275 ip->ip_len += hlen;
[27568]1276#endif
[27466]1277 if (ip->ip_len < hlen + ICMP_MINLEN)
[27441]1278 {
[27443]1279 Log(("send_icmp_to_guest: ICMP header is too small to understand which type/subtype of the datagram\n"));
[27448]1280 return;
[27441]1281 }
[15435]1282 icp = (struct icmp *)((char *)ip + hlen);
[15207]1283
[15590]1284 Log(("ICMP:received msg(t:%d, c:%d)\n", icp->icmp_type, icp->icmp_code));
[15573]1285 if ( icp->icmp_type != ICMP_ECHOREPLY
1286 && icp->icmp_type != ICMP_TIMXCEED
[15447]1287 && icp->icmp_type != ICMP_UNREACH)
[15232]1288 {
1289 return;
1290 }
[15207]1291
[27417]1292 /*
[27448]1293 * ICMP_ECHOREPLY, ICMP_TIMXCEED, ICMP_UNREACH minimal header size is
[27417]1294 * ICMP_ECHOREPLY assuming data 0
1295 * icmp_{type(8), code(8), cksum(16),identifier(16),seqnum(16)}
1296 */
[27466]1297 if (ip->ip_len < hlen + 8)
[27417]1298 {
1299 Log(("send_icmp_to_guest: NAT accept ICMP_{ECHOREPLY, TIMXCEED, UNREACH} the minimum size is 64 (see rfc792)\n"));
1300 return;
[27448]1301 }
[27417]1302
[16293]1303 type = icp->icmp_type;
1304 if ( type == ICMP_TIMXCEED
1305 || type == ICMP_UNREACH)
[15575]1306 {
[27417]1307 /*
[27448]1308 * ICMP_TIMXCEED, ICMP_UNREACH minimal header size is
[27417]1309 * icmp_{type(8), code(8), cksum(16),unused(32)} + IP header + 64 bit of original datagram
1310 */
[27466]1311 if (ip->ip_len < hlen + 2*8 + sizeof(struct ip))
[27417]1312 {
1313 Log(("send_icmp_to_guest: NAT accept ICMP_{TIMXCEED, UNREACH} the minimum size of ipheader + 64 bit of data (see rfc792)\n"));
1314 return;
[27448]1315 }
[15207]1316 ip = &icp->icmp_ip;
[15575]1317 }
[15207]1318
1319 icm = icmp_find_original_mbuf(pData, ip);
1320 if (icm == NULL)
1321 {
[15590]1322 Log(("NAT: Can't find the corresponding packet for the received ICMP\n"));
[15207]1323 return;
1324 }
1325
1326 m = icm->im_m;
[41198]1327 if (!m)
1328 {
1329 LogFunc(("%R[natsock] hasn't stored it's mbuf on sent\n", icm->im_so));
[55856]1330 goto done;
[41198]1331 }
[15207]1332
1333 src = addr->sin_addr.s_addr;
[27447]1334 if (type == ICMP_ECHOREPLY)
1335 {
1336 struct ip *ip0 = mtod(m, struct ip *);
1337 struct icmp *icp0 = (struct icmp *)((char *)ip0 + (ip0->ip_hl << 2));
1338 if (icp0->icmp_type != ICMP_ECHO)
1339 {
1340 Log(("NAT: we haven't found echo for this reply\n"));
[55856]1341 goto done;
[27447]1342 }
[27468]1343 /*
[27976]1344 * while combining buffer to send (see ip_icmp.c) we control ICMP header only,
1345 * IP header combined by OS network stack, our local copy of IP header contians values
[27468]1346 * in host byte order so no byte order conversion is required. IP headers fields are converting
[27976]1347 * in ip_output0 routine only.
[27466]1348 */
[27573]1349 if ( (ip->ip_len - hlen)
[27976]1350 != (ip0->ip_len - (ip0->ip_hl << 2)))
[27447]1351 {
[27466]1352 Log(("NAT: ECHO(%d) lenght doesn't match ECHOREPLY(%d)\n",
[27573]1353 (ip->ip_len - hlen), (ip0->ip_len - (ip0->ip_hl << 2))));
[55856]1354 goto done;
[27447]1355 }
1356 }
[15207]1357
[27466]1358 /* ip points on origianal ip header */
[15207]1359 ip = mtod(m, struct ip *);
[15365]1360 proto = ip->ip_p;
[15207]1361 /* Now ip is pointing on header we've sent from guest */
[15573]1362 if ( icp->icmp_type == ICMP_TIMXCEED
1363 || icp->icmp_type == ICMP_UNREACH)
[15207]1364 {
[15388]1365 old_ip_len = (ip->ip_hl << 2) + 64;
[15573]1366 if (old_ip_len > sizeof(ip_copy))
1367 old_ip_len = sizeof(ip_copy);
[15207]1368 memcpy(ip_copy, ip, old_ip_len);
1369 }
1370
1371 /* source address from original IP packet*/
1372 dst = ip->ip_src.s_addr;
1373
1374 /* overide ther tail of old packet */
1375 ip = mtod(m, struct ip *); /* ip is from mbuf we've overrided */
[15435]1376 original_hlen = ip->ip_hl << 2;
1377 /* saves original ip header and options */
[27568]1378 m_copyback(pData, m, original_hlen, len - hlen, buff + hlen);
1379 ip->ip_len = m_length(m, NULL);
[15458]1380 ip->ip_p = IPPROTO_ICMP; /* the original package could be whatever, but we're response via ICMP*/
[15207]1381
1382 icp = (struct icmp *)((char *)ip + (ip->ip_hl << 2));
[16293]1383 type = icp->icmp_type;
1384 if ( type == ICMP_TIMXCEED
1385 || type == ICMP_UNREACH)
[15207]1386 {
1387 /* according RFC 793 error messages required copy of initial IP header + 64 bit */
1388 memcpy(&icp->icmp_ip, ip_copy, old_ip_len);
[56957]1389
1390 /* undo byte order conversions done in ip_input() */
1391 HTONS(icp->icmp_ip.ip_len);
1392 HTONS(icp->icmp_ip.ip_id);
1393 HTONS(icp->icmp_ip.ip_off);
1394
[15293]1395 ip->ip_tos = ((ip->ip_tos & 0x1E) | 0xC0); /* high priority for errors */
[15207]1396 }
1397
1398 ip->ip_src.s_addr = src;
1399 ip->ip_dst.s_addr = dst;
1400 icmp_reflect(pData, m);
[55741]1401 /* m was freed */
1402 icm->im_m = NULL;
[55856]1403
1404 done:
1405 icmp_msg_delete(pData, icm);
[15086]1406}
[15087]1407
[15086]1408static void sorecvfrom_icmp_unix(PNATState pData, struct socket *so)
1409{
1410 struct sockaddr_in addr;
1411 socklen_t addrlen = sizeof(struct sockaddr_in);
[27399]1412 struct ip ip;
[23004]1413 char *buff;
1414 int len = 0;
[23369]1415
[27399]1416 /* 1- step: read the ip header */
1417 len = recvfrom(so->s, &ip, sizeof(struct ip), MSG_PEEK,
1418 (struct sockaddr *)&addr, &addrlen);
1419 if ( len < 0
[41855]1420 && ( soIgnorableErrorCode(errno)
[23369]1421 || errno == ENOTCONN))
[23004]1422 {
[27399]1423 Log(("sorecvfrom_icmp_unix: 1 - step can't read IP datagramm (would block)\n"));
[23241]1424 return;
1425 }
[15086]1426
[27399]1427 if ( len < sizeof(struct ip)
1428 || len < 0
1429 || len == 0)
[15086]1430 {
[23241]1431 u_char code;
1432 code = ICMP_UNREACH_PORT;
[15086]1433
1434 if (errno == EHOSTUNREACH)
[15293]1435 code = ICMP_UNREACH_HOST;
[23369]1436 else if (errno == ENETUNREACH)
[15293]1437 code = ICMP_UNREACH_NET;
[15086]1438
[58077]1439 LogRel(("NAT: UDP ICMP rx errno=%d (%s)\n", errno, strerror(errno)));
[17191]1440 icmp_error(pData, so->so_m, ICMP_UNREACH, code, 0, strerror(errno));
[17145]1441 so->so_m = NULL;
[34103]1442 Log(("sorecvfrom_icmp_unix: 1 - step can't read IP datagramm\n"));
[27399]1443 return;
[15086]1444 }
[27399]1445 /* basic check of IP header */
1446 if ( ip.ip_v != IPVERSION
[27573]1447# ifndef RT_OS_DARWIN
[27568]1448 || ip.ip_p != IPPROTO_ICMP
[27573]1449# endif
1450 )
[15086]1451 {
[34103]1452 Log(("sorecvfrom_icmp_unix: 1 - step IP isn't IPv4\n"));
[27399]1453 return;
[15086]1454 }
[27573]1455# ifndef RT_OS_DARWIN
1456 /* Darwin reports the IP length already in host byte order. */
[27568]1457 ip.ip_len = RT_N2H_U16(ip.ip_len);
[27573]1458# endif
1459# if defined(RT_OS_SOLARIS) || defined(RT_OS_DARWIN)
1460 /* Solaris and Darwin report the payload only */
1461 ip.ip_len += (ip.ip_hl << 2);
1462# endif
[27568]1463 /* Note: ip->ip_len in host byte order (all OS) */
1464 len = ip.ip_len;
[27399]1465 buff = RTMemAlloc(len);
1466 if (buff == NULL)
1467 {
1468 Log(("sorecvfrom_icmp_unix: 1 - step can't allocate enought room for datagram\n"));
1469 return;
1470 }
1471 /* 2 - step: we're reading rest of the datagramm to the buffer */
1472 addrlen = sizeof(struct sockaddr_in);
1473 memset(&addr, 0, addrlen);
1474 len = recvfrom(so->s, buff, len, 0,
1475 (struct sockaddr *)&addr, &addrlen);
1476 if ( len < 0
[41855]1477 && ( soIgnorableErrorCode(errno)
[27399]1478 || errno == ENOTCONN))
1479 {
[27448]1480 Log(("sorecvfrom_icmp_unix: 2 - step can't read IP body (would block expected:%d)\n",
[27568]1481 ip.ip_len));
[27399]1482 RTMemFree(buff);
1483 return;
1484 }
[27448]1485 if ( len < 0
[27399]1486 || len == 0)
1487 {
1488 Log(("sorecvfrom_icmp_unix: 2 - step read of the rest of datagramm is fallen (errno:%d, len:%d expected: %d)\n",
[27568]1489 errno, len, (ip.ip_len - sizeof(struct ip))));
[27399]1490 RTMemFree(buff);
1491 return;
1492 }
1493 /* len is modified in 2nd read, when the rest of the datagramm was read */
[39101]1494 send_icmp_to_guest(pData, buff, len, &addr);
[23004]1495 RTMemFree(buff);
[15086]1496}
[15636]1497#endif /* !RT_OS_WINDOWS */
Note: See TracBrowser for help on using the repository browser.

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette