VirtualBox

source: vbox/trunk/src/VBox/NetworkServices/NAT/pxtcp.c

Last change on this file was 106061, checked in by vboxsync, 3 months ago

Copyright year updates by scm.

  • Property svn:eol-style set to native
  • Property svn:keywords set to Id Revision
File size: 69.4 KB
Line 
1/* $Id: pxtcp.c 106061 2024-09-16 14:03:52Z vboxsync $ */
2/** @file
3 * NAT Network - TCP proxy.
4 */
5
6/*
7 * Copyright (C) 2013-2024 Oracle and/or its affiliates.
8 *
9 * This file is part of VirtualBox base platform packages, as
10 * available from https://www.virtualbox.org.
11 *
12 * This program is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU General Public License
14 * as published by the Free Software Foundation, in version 3 of the
15 * License.
16 *
17 * This program is distributed in the hope that it will be useful, but
18 * WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20 * General Public License for more details.
21 *
22 * You should have received a copy of the GNU General Public License
23 * along with this program; if not, see <https://www.gnu.org/licenses>.
24 *
25 * SPDX-License-Identifier: GPL-3.0-only
26 */
27
28#define LOG_GROUP LOG_GROUP_NAT_SERVICE
29
30#include "winutils.h"
31
32#include "pxtcp.h"
33
34#include "proxy.h"
35#include "proxy_pollmgr.h"
36#include "pxremap.h"
37#include "portfwd.h" /* fwspec */
38
39#ifndef RT_OS_WINDOWS
40#include <sys/types.h>
41#include <sys/socket.h>
42#include <sys/ioctl.h>
43#ifdef RT_OS_SOLARIS
44#include <sys/filio.h> /* FIONREAD is BSD'ism */
45#endif
46#include <stdlib.h>
47#include <stdint.h>
48#include <stdio.h>
49#include <string.h>
50#include <poll.h>
51
52#include <err.h> /* BSD'ism */
53#else
54#include <stdlib.h>
55#include <stdio.h>
56#include <string.h>
57
58#include <iprt/stdint.h>
59#include "winpoll.h"
60#endif
61
62#include "lwip/opt.h"
63
64#include "lwip/sys.h"
65#include "lwip/tcpip.h"
66#include "lwip/netif.h"
67#include "lwip/tcp_impl.h" /* XXX: to access tcp_abandon() */
68#include "lwip/icmp.h"
69#include "lwip/icmp6.h"
70
/*
 * Host OSes differ in when (and whether) they report POLLHUP for TCP
 * sockets.  HAVE_TCP_POLLHUP records that quirk as a mask of poll
 * event bits naming the closed directions that trigger POLLHUP.
 *
 * The shutdown(2) "how" constants would be more readable here, but
 * SHUT_RD is 0, which would leave no value to mean "none".
 */
#if defined(RT_OS_NETBSD) || defined(RT_OS_SOLARIS)
# define HAVE_TCP_POLLHUP 0                 /* not reported */
#elif defined(RT_OS_DARWIN) || defined(RT_OS_WINDOWS)
# define HAVE_TCP_POLLHUP POLLIN            /* reported when remote closes */
#else
# define HAVE_TCP_POLLHUP (POLLIN|POLLOUT)  /* reported when both directions are closed */
#endif
85
86
/**
 * Ring buffer for inbound data.
 *
 * The poll manager thread fills it with data read from the host
 * socket.  The data are consumed by scheduling tcp_write() to the pcb
 * on the lwIP thread.
 *
 * NB: the lwIP stack itself is a third party to this buffer, so it is
 * not split two ways (free vs. data) but three ways: free space, data
 * that are sent but not yet ACKed, and data that are not yet sent.
 */
struct ringbuf {
    char *buf;
    size_t bufsize;

    /*
     * Beginning of the free space.  The producer writes here, up to
     * "unacked".
     */
    volatile size_t vacant;

    /*
     * Beginning of the data that have been sent but not acknowledged.
     * The stack "owns" these data because it may have to retransmit
     * them.  This is also the limit of the producer's free space.
     */
    volatile size_t unacked;

    /*
     * Beginning of the data that have not been sent yet.  The
     * consumer reads/sends from here, up to "vacant".  Not volatile
     * because only the consumer thread accesses it.
     */
    size_t unsent;
};
119
120
121/**
122 */
123struct pxtcp {
124 /**
125 * Our poll manager handler. Must be first, strong/weak
126 * references depend on this "inheritance".
127 */
128 struct pollmgr_handler pmhdl;
129
130 /**
131 * lwIP (internal/guest) side of the proxied connection.
132 */
133 struct tcp_pcb *pcb;
134
135 /**
136 * Host (external) side of the proxied connection.
137 */
138 SOCKET sock;
139
140 /**
141 * Socket events we are currently polling for.
142 */
143 int events;
144
145 /**
146 * Socket error. Currently used to save connect(2) errors so that
147 * we can decide if we need to send ICMP error.
148 */
149 int sockerr;
150
151 /**
152 * Interface that we have got the SYN from. Needed to send ICMP
153 * with correct source address.
154 */
155 struct netif *netif;
156
157 /**
158 * For tentatively accepted connections for which we are in
159 * process of connecting to the real destination this is the
160 * initial pbuf that we might need to build ICMP error.
161 *
162 * When connection is established this is used to hold outbound
163 * pbuf chain received by pxtcp_pcb_recv() but not yet completely
164 * forwarded over the socket. We cannot "return" it to lwIP since
165 * the head of the chain is already sent and freed.
166 */
167 struct pbuf *unsent;
168
169 /**
170 * Guest has closed its side. Reported to pxtcp_pcb_recv() only
171 * once and we might not be able to forward it immediately if we
172 * have unsent pbuf.
173 */
174 int outbound_close;
175
176 /**
177 * Outbound half-close has been done on the socket.
178 */
179 int outbound_close_done;
180
181 /**
182 * External has closed its side. We might not be able to forward
183 * it immediately if we have unforwarded data.
184 */
185 int inbound_close;
186
187 /**
188 * Inbound half-close has been done on the pcb.
189 */
190 int inbound_close_done;
191
192 /**
193 * On systems that report POLLHUP as soon as the final FIN is
194 * received on a socket we cannot continue polling for the rest of
195 * input, so we have to read (pull) last data from the socket on
196 * the lwIP thread instead of polling/pushing it from the poll
197 * manager thread. See comment in pxtcp_pmgr_pump() POLLHUP case.
198 */
199 int inbound_pull;
200
201
202 /**
203 * When poll manager schedules delete we may not be able to delete
204 * a pxtcp immediately if not all inbound data has been acked by
205 * the guest: lwIP may need to resend and the data are in pxtcp's
206 * inbuf::buf. We defer delete until all data are acked to
207 * pxtcp_pcb_sent().
208 */
209 int deferred_delete;
210
211 /**
212 * Ring-buffer for inbound data.
213 */
214 struct ringbuf inbuf;
215
216 /**
217 * lwIP thread's strong reference to us.
218 */
219 struct pollmgr_refptr *rp;
220
221
222 /*
223 * We use static messages to call functions on the lwIP thread to
224 * void malloc/free overhead.
225 */
226 struct tcpip_msg msg_delete; /* delete pxtcp */
227 struct tcpip_msg msg_reset; /* reset connection and delete pxtcp */
228 struct tcpip_msg msg_accept; /* confirm accept of proxied connection */
229 struct tcpip_msg msg_outbound; /* trigger send of outbound data */
230 struct tcpip_msg msg_inbound; /* trigger send of inbound data */
231#if HAVE_TCP_POLLHUP
232 struct tcpip_msg msg_inpull; /* trigger pull of last inbound data */
233#endif
234};
235
236
237
238static struct pxtcp *pxtcp_allocate(void);
239static void pxtcp_free(struct pxtcp *);
240
241static void pxtcp_pcb_associate(struct pxtcp *, struct tcp_pcb *);
242static void pxtcp_pcb_dissociate(struct pxtcp *);
243
244/* poll manager callbacks for pxtcp related channels */
245static int pxtcp_pmgr_chan_add(struct pollmgr_handler *, SOCKET, int);
246static int pxtcp_pmgr_chan_pollout(struct pollmgr_handler *, SOCKET, int);
247static int pxtcp_pmgr_chan_pollin(struct pollmgr_handler *, SOCKET, int);
248#if !(HAVE_TCP_POLLHUP & POLLOUT)
249static int pxtcp_pmgr_chan_del(struct pollmgr_handler *, SOCKET, int);
250#endif
251static int pxtcp_pmgr_chan_reset(struct pollmgr_handler *, SOCKET, int);
252
253/* helper functions for sending/receiving pxtcp over poll manager channels */
254static ssize_t pxtcp_chan_send(enum pollmgr_slot_t, struct pxtcp *);
255static ssize_t pxtcp_chan_send_weak(enum pollmgr_slot_t, struct pxtcp *);
256static struct pxtcp *pxtcp_chan_recv(struct pollmgr_handler *, SOCKET, int);
257static struct pxtcp *pxtcp_chan_recv_strong(struct pollmgr_handler *, SOCKET, int);
258
259/* poll manager callbacks for individual sockets */
260static int pxtcp_pmgr_connect(struct pollmgr_handler *, SOCKET, int);
261static int pxtcp_pmgr_pump(struct pollmgr_handler *, SOCKET, int);
262
263/* get incoming traffic into ring buffer */
264static ssize_t pxtcp_sock_read(struct pxtcp *, int *);
265static ssize_t pxtcp_sock_recv(struct pxtcp *, IOVEC *, size_t); /* default */
266
267/* convenience functions for poll manager callbacks */
268static int pxtcp_schedule_delete(struct pxtcp *);
269static int pxtcp_schedule_reset(struct pxtcp *);
270static int pxtcp_schedule_reject(struct pxtcp *);
271
272/* lwip thread callbacks called via proxy_lwip_post() */
273static void pxtcp_pcb_delete_pxtcp(void *);
274static void pxtcp_pcb_reset_pxtcp(void *);
275static void pxtcp_pcb_accept_refuse(void *);
276static void pxtcp_pcb_accept_confirm(void *);
277static void pxtcp_pcb_write_outbound(void *);
278static void pxtcp_pcb_write_inbound(void *);
279#if HAVE_TCP_POLLHUP
280static void pxtcp_pcb_pull_inbound(void *);
281#endif
282
283/* tcp pcb callbacks */
284static err_t pxtcp_pcb_heard(void *, struct tcp_pcb *, struct pbuf *); /* global */
285static err_t pxtcp_pcb_accept(void *, struct tcp_pcb *, err_t);
286static err_t pxtcp_pcb_connected(void *, struct tcp_pcb *, err_t);
287static err_t pxtcp_pcb_recv(void *, struct tcp_pcb *, struct pbuf *, err_t);
288static err_t pxtcp_pcb_sent(void *, struct tcp_pcb *, u16_t);
289static err_t pxtcp_pcb_poll(void *, struct tcp_pcb *);
290static void pxtcp_pcb_err(void *, err_t);
291
292static err_t pxtcp_pcb_forward_outbound(struct pxtcp *, struct pbuf *);
293static void pxtcp_pcb_forward_outbound_close(struct pxtcp *);
294
295static ssize_t pxtcp_sock_send(struct pxtcp *, IOVEC *, size_t);
296
297static void pxtcp_pcb_forward_inbound(struct pxtcp *);
298static void pxtcp_pcb_forward_inbound_close(struct pxtcp *);
299DECLINLINE(int) pxtcp_pcb_forward_inbound_done(const struct pxtcp *);
300static void pxtcp_pcb_schedule_poll(struct pxtcp *);
301static void pxtcp_pcb_cancel_poll(struct pxtcp *);
302
303static void pxtcp_pcb_reject(struct tcp_pcb *, int, struct netif *, struct pbuf *);
304DECLINLINE(void) pxtcp_pcb_maybe_deferred_delete(struct pxtcp *);
305
306/* poll manager handlers for pxtcp channels */
307static struct pollmgr_handler pxtcp_pmgr_chan_add_hdl;
308static struct pollmgr_handler pxtcp_pmgr_chan_pollout_hdl;
309static struct pollmgr_handler pxtcp_pmgr_chan_pollin_hdl;
310#if !(HAVE_TCP_POLLHUP & POLLOUT)
311static struct pollmgr_handler pxtcp_pmgr_chan_del_hdl;
312#endif
313static struct pollmgr_handler pxtcp_pmgr_chan_reset_hdl;
314
315
316/**
317 * Init PXTCP - must be run when neither lwIP tcpip thread, nor poll
318 * manager threads haven't been created yet.
319 */
320void
321pxtcp_init(void)
322{
323 /*
324 * Create channels.
325 */
326#define CHANNEL(SLOT, NAME) do { \
327 NAME##_hdl.callback = NAME; \
328 NAME##_hdl.data = NULL; \
329 NAME##_hdl.slot = -1; \
330 pollmgr_add_chan(SLOT, &NAME##_hdl); \
331 } while (0)
332
333 CHANNEL(POLLMGR_CHAN_PXTCP_ADD, pxtcp_pmgr_chan_add);
334 CHANNEL(POLLMGR_CHAN_PXTCP_POLLIN, pxtcp_pmgr_chan_pollin);
335 CHANNEL(POLLMGR_CHAN_PXTCP_POLLOUT, pxtcp_pmgr_chan_pollout);
336#if !(HAVE_TCP_POLLHUP & POLLOUT)
337 CHANNEL(POLLMGR_CHAN_PXTCP_DEL, pxtcp_pmgr_chan_del);
338#endif
339 CHANNEL(POLLMGR_CHAN_PXTCP_RESET, pxtcp_pmgr_chan_reset);
340
341#undef CHANNEL
342
343 /*
344 * Listen to outgoing connection from guest(s).
345 */
346 tcp_proxy_accept(pxtcp_pcb_heard);
347}
348
349
350/**
351 * Syntactic sugar for sending pxtcp pointer over poll manager
352 * channel. Used by lwip thread functions.
353 */
354static ssize_t
355pxtcp_chan_send(enum pollmgr_slot_t slot, struct pxtcp *pxtcp)
356{
357 return pollmgr_chan_send(slot, &pxtcp, sizeof(pxtcp));
358}
359
360
361/**
362 * Syntactic sugar for sending weak reference to pxtcp over poll
363 * manager channel. Used by lwip thread functions.
364 */
365static ssize_t
366pxtcp_chan_send_weak(enum pollmgr_slot_t slot, struct pxtcp *pxtcp)
367{
368 pollmgr_refptr_weak_ref(pxtcp->rp);
369 return pollmgr_chan_send(slot, &pxtcp->rp, sizeof(pxtcp->rp));
370}
371
372
373/**
374 * Counterpart of pxtcp_chan_send().
375 */
376static struct pxtcp *
377pxtcp_chan_recv(struct pollmgr_handler *handler, SOCKET fd, int revents)
378{
379 struct pxtcp *pxtcp;
380
381 pxtcp = (struct pxtcp *)pollmgr_chan_recv_ptr(handler, fd, revents);
382 return pxtcp;
383}
384
385
386/**
387 * Counterpart of pxtcp_chan_send_weak().
388 */
389static struct pxtcp *
390pxtcp_chan_recv_strong(struct pollmgr_handler *handler, SOCKET fd, int revents)
391{
392 struct pollmgr_refptr *rp;
393 struct pollmgr_handler *base;
394 struct pxtcp *pxtcp;
395
396 rp = (struct pollmgr_refptr *)pollmgr_chan_recv_ptr(handler, fd, revents);
397 base = (struct pollmgr_handler *)pollmgr_refptr_get(rp);
398 pxtcp = (struct pxtcp *)base;
399
400 return pxtcp;
401}
402
403
404/**
405 * Register pxtcp with poll manager.
406 *
407 * Used for POLLMGR_CHAN_PXTCP_ADD and by port-forwarding. Since
408 * error handling is different in these two cases, we leave it up to
409 * the caller.
410 */
411int
412pxtcp_pmgr_add(struct pxtcp *pxtcp)
413{
414 int status;
415
416 LWIP_ASSERT1(pxtcp != NULL);
417#ifdef RT_OS_WINDOWS
418 LWIP_ASSERT1(pxtcp->sock != INVALID_SOCKET);
419#else
420 LWIP_ASSERT1(pxtcp->sock >= 0);
421#endif
422 LWIP_ASSERT1(pxtcp->pmhdl.callback != NULL);
423 LWIP_ASSERT1(pxtcp->pmhdl.data == (void *)pxtcp);
424 LWIP_ASSERT1(pxtcp->pmhdl.slot < 0);
425
426 status = pollmgr_add(&pxtcp->pmhdl, pxtcp->sock, pxtcp->events);
427 return status;
428}
429
430
431/**
432 * Unregister pxtcp with poll manager.
433 *
434 * Used for POLLMGR_CHAN_PXTCP_RESET and by port-forwarding (on error
435 * leg).
436 */
437void
438pxtcp_pmgr_del(struct pxtcp *pxtcp)
439{
440 LWIP_ASSERT1(pxtcp != NULL);
441
442 pollmgr_del_slot(pxtcp->pmhdl.slot);
443}
444
445
446/**
447 * POLLMGR_CHAN_PXTCP_ADD handler.
448 *
449 * Get new pxtcp from lwip thread and start polling its socket.
450 */
451static int
452pxtcp_pmgr_chan_add(struct pollmgr_handler *handler, SOCKET fd, int revents)
453{
454 struct pxtcp *pxtcp;
455 int status;
456
457 pxtcp = pxtcp_chan_recv(handler, fd, revents);
458 DPRINTF0(("pxtcp_add: new pxtcp %p; pcb %p; sock %d\n",
459 (void *)pxtcp, (void *)pxtcp->pcb, pxtcp->sock));
460
461 status = pxtcp_pmgr_add(pxtcp);
462 if (status < 0) {
463 (void) pxtcp_schedule_reset(pxtcp);
464 }
465
466 return POLLIN;
467}
468
469
470/**
471 * POLLMGR_CHAN_PXTCP_POLLOUT handler.
472 *
473 * pxtcp_pcb_forward_outbound() on the lwIP thread tried to send data
474 * and failed, it now requests us to poll the socket for POLLOUT and
475 * schedule pxtcp_pcb_forward_outbound() when sock is writable again.
476 */
477static int
478pxtcp_pmgr_chan_pollout(struct pollmgr_handler *handler, SOCKET fd, int revents)
479{
480 struct pxtcp *pxtcp;
481
482 pxtcp = pxtcp_chan_recv_strong(handler, fd, revents);
483 DPRINTF0(("pxtcp_pollout: pxtcp %p\n", (void *)pxtcp));
484
485 if (pxtcp == NULL) {
486 return POLLIN;
487 }
488
489 LWIP_ASSERT1(pxtcp->pmhdl.data == (void *)pxtcp);
490 LWIP_ASSERT1(pxtcp->pmhdl.slot > 0);
491
492 pxtcp->events |= POLLOUT;
493 pollmgr_update_events(pxtcp->pmhdl.slot, pxtcp->events);
494
495 return POLLIN;
496}
497
498
499/**
500 * POLLMGR_CHAN_PXTCP_POLLIN handler.
501 */
502static int
503pxtcp_pmgr_chan_pollin(struct pollmgr_handler *handler, SOCKET fd, int revents)
504{
505 struct pxtcp *pxtcp;
506
507 pxtcp = pxtcp_chan_recv_strong(handler, fd, revents);
508 DPRINTF2(("pxtcp_pollin: pxtcp %p\n", (void *)pxtcp));
509
510 if (pxtcp == NULL) {
511 return POLLIN;
512 }
513
514 LWIP_ASSERT1(pxtcp->pmhdl.data == (void *)pxtcp);
515 LWIP_ASSERT1(pxtcp->pmhdl.slot > 0);
516
517 if (pxtcp->inbound_close) {
518 return POLLIN;
519 }
520
521 pxtcp->events |= POLLIN;
522 pollmgr_update_events(pxtcp->pmhdl.slot, pxtcp->events);
523
524 return POLLIN;
525}
526
527
528#if !(HAVE_TCP_POLLHUP & POLLOUT)
529/**
530 * POLLMGR_CHAN_PXTCP_DEL handler.
531 *
532 * Schedule pxtcp deletion. We only need this if host system doesn't
533 * report POLLHUP for fully closed tcp sockets.
534 */
535static int
536pxtcp_pmgr_chan_del(struct pollmgr_handler *handler, SOCKET fd, int revents)
537{
538 struct pxtcp *pxtcp;
539
540 pxtcp = pxtcp_chan_recv_strong(handler, fd, revents);
541 if (pxtcp == NULL) {
542 return POLLIN;
543 }
544
545 DPRINTF(("PXTCP_DEL: pxtcp %p; pcb %p; sock %d\n",
546 (void *)pxtcp, (void *)pxtcp->pcb, pxtcp->sock));
547
548 LWIP_ASSERT1(pxtcp->pmhdl.callback != NULL);
549 LWIP_ASSERT1(pxtcp->pmhdl.data == (void *)pxtcp);
550
551 LWIP_ASSERT1(pxtcp->inbound_close); /* EOF read */
552 LWIP_ASSERT1(pxtcp->outbound_close_done); /* EOF sent */
553
554 pxtcp_pmgr_del(pxtcp);
555 (void) pxtcp_schedule_delete(pxtcp);
556
557 return POLLIN;
558}
559#endif /* !(HAVE_TCP_POLLHUP & POLLOUT) */
560
561
562/**
563 * POLLMGR_CHAN_PXTCP_RESET handler.
564 *
565 * Close the socket with RST and delete pxtcp.
566 */
567static int
568pxtcp_pmgr_chan_reset(struct pollmgr_handler *handler, SOCKET fd, int revents)
569{
570 struct pxtcp *pxtcp;
571
572 pxtcp = pxtcp_chan_recv_strong(handler, fd, revents);
573 if (pxtcp == NULL) {
574 return POLLIN;
575 }
576
577 DPRINTF0(("PXTCP_RESET: pxtcp %p; pcb %p; sock %d\n",
578 (void *)pxtcp, (void *)pxtcp->pcb, pxtcp->sock));
579
580 LWIP_ASSERT1(pxtcp->pmhdl.callback != NULL);
581 LWIP_ASSERT1(pxtcp->pmhdl.data == (void *)pxtcp);
582
583 pxtcp_pmgr_del(pxtcp);
584
585 proxy_reset_socket(pxtcp->sock);
586 pxtcp->sock = INVALID_SOCKET;
587
588 (void) pxtcp_schedule_reset(pxtcp);
589
590 return POLLIN;
591}
592
593
594static struct pxtcp *
595pxtcp_allocate(void)
596{
597 struct pxtcp *pxtcp;
598
599 pxtcp = (struct pxtcp *)malloc(sizeof(*pxtcp));
600 if (pxtcp == NULL) {
601 return NULL;
602 }
603
604 pxtcp->pmhdl.callback = NULL;
605 pxtcp->pmhdl.data = (void *)pxtcp;
606 pxtcp->pmhdl.slot = -1;
607
608 pxtcp->pcb = NULL;
609 pxtcp->sock = INVALID_SOCKET;
610 pxtcp->events = 0;
611 pxtcp->sockerr = 0;
612 pxtcp->netif = NULL;
613 pxtcp->unsent = NULL;
614 pxtcp->outbound_close = 0;
615 pxtcp->outbound_close_done = 0;
616 pxtcp->inbound_close = 0;
617 pxtcp->inbound_close_done = 0;
618 pxtcp->inbound_pull = 0;
619 pxtcp->deferred_delete = 0;
620
621 pxtcp->inbuf.bufsize = 64 * 1024;
622 pxtcp->inbuf.buf = (char *)malloc(pxtcp->inbuf.bufsize);
623 if (pxtcp->inbuf.buf == NULL) {
624 free(pxtcp);
625 return NULL;
626 }
627 pxtcp->inbuf.vacant = 0;
628 pxtcp->inbuf.unacked = 0;
629 pxtcp->inbuf.unsent = 0;
630
631 pxtcp->rp = pollmgr_refptr_create(&pxtcp->pmhdl);
632 if (pxtcp->rp == NULL) {
633 free(pxtcp->inbuf.buf);
634 free(pxtcp);
635 return NULL;
636 }
637
638#define CALLBACK_MSG(MSG, FUNC) \
639 do { \
640 pxtcp->MSG.type = TCPIP_MSG_CALLBACK_STATIC; \
641 pxtcp->MSG.sem = NULL; \
642 pxtcp->MSG.msg.cb.function = FUNC; \
643 pxtcp->MSG.msg.cb.ctx = (void *)pxtcp; \
644 } while (0)
645
646 CALLBACK_MSG(msg_delete, pxtcp_pcb_delete_pxtcp);
647 CALLBACK_MSG(msg_reset, pxtcp_pcb_reset_pxtcp);
648 CALLBACK_MSG(msg_accept, pxtcp_pcb_accept_confirm);
649 CALLBACK_MSG(msg_outbound, pxtcp_pcb_write_outbound);
650 CALLBACK_MSG(msg_inbound, pxtcp_pcb_write_inbound);
651#if HAVE_TCP_POLLHUP
652 CALLBACK_MSG(msg_inpull, pxtcp_pcb_pull_inbound);
653#endif
654
655#undef CALLBACK_MSG
656
657 return pxtcp;
658}
659
660
661/**
662 * Exported to fwtcp to create pxtcp for incoming port-forwarded
663 * connections. Completed with pcb in pxtcp_pcb_connect().
664 */
665struct pxtcp *
666pxtcp_create_forwarded(SOCKET sock)
667{
668 struct pxtcp *pxtcp;
669
670 pxtcp = pxtcp_allocate();
671 if (pxtcp == NULL) {
672 return NULL;
673 }
674
675 pxtcp->sock = sock;
676 pxtcp->pmhdl.callback = pxtcp_pmgr_pump;
677 pxtcp->events = 0;
678
679 return pxtcp;
680}
681
682
683static void
684pxtcp_pcb_associate(struct pxtcp *pxtcp, struct tcp_pcb *pcb)
685{
686 LWIP_ASSERT1(pxtcp != NULL);
687 LWIP_ASSERT1(pcb != NULL);
688
689 pxtcp->pcb = pcb;
690
691 tcp_arg(pcb, pxtcp);
692
693 tcp_recv(pcb, pxtcp_pcb_recv);
694 tcp_sent(pcb, pxtcp_pcb_sent);
695 tcp_poll(pcb, NULL, 255);
696 tcp_err(pcb, pxtcp_pcb_err);
697}
698
699
700static void
701pxtcp_free(struct pxtcp *pxtcp)
702{
703 if (pxtcp->unsent != NULL) {
704 pbuf_free(pxtcp->unsent);
705 }
706 if (pxtcp->inbuf.buf != NULL) {
707 free(pxtcp->inbuf.buf);
708 }
709 free(pxtcp);
710}
711
712
713/**
714 * Counterpart to pxtcp_create_forwarded() to destruct pxtcp that
715 * fwtcp failed to register with poll manager to post to lwip thread
716 * for doing connect.
717 */
718void
719pxtcp_cancel_forwarded(struct pxtcp *pxtcp)
720{
721 LWIP_ASSERT1(pxtcp->pcb == NULL);
722 pxtcp_pcb_reset_pxtcp(pxtcp);
723}
724
725
726static void
727pxtcp_pcb_dissociate(struct pxtcp *pxtcp)
728{
729 if (pxtcp == NULL || pxtcp->pcb == NULL) {
730 return;
731 }
732
733 DPRINTF(("%s: pxtcp %p <-> pcb %p\n",
734 __func__, (void *)pxtcp, (void *)pxtcp->pcb));
735
736 /*
737 * We must have dissociated from a fully closed pcb immediately
738 * since lwip recycles them and we don't wan't to mess with what
739 * would be someone else's pcb that we happen to have a stale
740 * pointer to.
741 */
742 LWIP_ASSERT1(pxtcp->pcb->callback_arg == pxtcp);
743
744 tcp_recv(pxtcp->pcb, NULL);
745 tcp_sent(pxtcp->pcb, NULL);
746 tcp_poll(pxtcp->pcb, NULL, 255);
747 tcp_err(pxtcp->pcb, NULL);
748 tcp_arg(pxtcp->pcb, NULL);
749 pxtcp->pcb = NULL;
750}
751
752
753/**
754 * Lwip thread callback invoked via pxtcp::msg_delete
755 *
756 * Since we use static messages to communicate to the lwip thread, we
757 * cannot delete pxtcp without making sure there are no unprocessed
758 * messages in the lwip thread mailbox.
759 *
760 * The easiest way to ensure that is to send this "delete" message as
761 * the last one and when it's processed we know there are no more and
762 * it's safe to delete pxtcp.
763 *
764 * Poll manager handlers should use pxtcp_schedule_delete()
765 * convenience function.
766 */
767static void
768pxtcp_pcb_delete_pxtcp(void *ctx)
769{
770 struct pxtcp *pxtcp = (struct pxtcp *)ctx;
771
772 DPRINTF(("%s: pxtcp %p, pcb %p, sock %d%s\n",
773 __func__, (void *)pxtcp, (void *)pxtcp->pcb, pxtcp->sock,
774 (pxtcp->deferred_delete && !pxtcp->inbound_pull
775 ? " (was deferred)" : "")));
776
777 LWIP_ASSERT1(pxtcp != NULL);
778 LWIP_ASSERT1(pxtcp->pmhdl.slot < 0);
779 LWIP_ASSERT1(pxtcp->outbound_close_done);
780 LWIP_ASSERT1(pxtcp->inbound_close); /* not necessarily done */
781
782
783 /*
784 * pxtcp is no longer registered with poll manager, so it's safe
785 * to close the socket.
786 */
787 if (pxtcp->sock != INVALID_SOCKET) {
788 closesocket(pxtcp->sock);
789 pxtcp->sock = INVALID_SOCKET;
790 }
791
792 /*
793 * We might have already dissociated from a fully closed pcb, or
794 * guest might have sent us a reset while msg_delete was in
795 * transit. If there's no pcb, we are done.
796 */
797 if (pxtcp->pcb == NULL) {
798 pollmgr_refptr_unref(pxtcp->rp);
799 pxtcp_free(pxtcp);
800 return;
801 }
802
803 /*
804 * Have we completely forwarded all inbound traffic to the guest?
805 *
806 * We may still be waiting for ACKs. We may have failed to send
807 * some of the data (tcp_write() failed with ERR_MEM). We may
808 * have failed to send the FIN (tcp_shutdown() failed with
809 * ERR_MEM).
810 */
811 if (pxtcp_pcb_forward_inbound_done(pxtcp)) {
812 pxtcp_pcb_dissociate(pxtcp);
813 pollmgr_refptr_unref(pxtcp->rp);
814 pxtcp_free(pxtcp);
815 }
816 else {
817 DPRINTF2(("delete: pxtcp %p; pcb %p:"
818 " unacked %d, unsent %d, vacant %d, %s - DEFER!\n",
819 (void *)pxtcp, (void *)pxtcp->pcb,
820 (int)pxtcp->inbuf.unacked,
821 (int)pxtcp->inbuf.unsent,
822 (int)pxtcp->inbuf.vacant,
823 pxtcp->inbound_close_done ? "FIN sent" : "FIN is NOT sent"));
824
825 LWIP_ASSERT1(!pxtcp->deferred_delete);
826 pxtcp->deferred_delete = 1;
827 }
828}
829
830
831/**
832 * If we couldn't delete pxtcp right away in the msg_delete callback
833 * from the poll manager thread, we repeat the check at the end of
834 * relevant pcb callbacks.
835 */
836DECLINLINE(void)
837pxtcp_pcb_maybe_deferred_delete(struct pxtcp *pxtcp)
838{
839 if (pxtcp->deferred_delete && pxtcp_pcb_forward_inbound_done(pxtcp)) {
840 pxtcp_pcb_delete_pxtcp(pxtcp);
841 }
842}
843
844
845/**
846 * Poll manager callbacks should use this convenience wrapper to
847 * schedule pxtcp deletion on the lwip thread and to deregister from
848 * the poll manager.
849 */
850static int
851pxtcp_schedule_delete(struct pxtcp *pxtcp)
852{
853 /*
854 * If pollmgr_refptr_get() is called by any channel before
855 * scheduled deletion happens, let them know we are gone.
856 */
857 pxtcp->pmhdl.slot = -1;
858
859 /*
860 * Schedule deletion. Since poll manager thread may be pre-empted
861 * right after we send the message, the deletion may actually
862 * happen on the lwip thread before we return from this function,
863 * so it's not safe to refer to pxtcp after this call.
864 */
865 proxy_lwip_post(&pxtcp->msg_delete);
866
867 /* tell poll manager to deregister us */
868 return -1;
869}
870
871
872/**
873 * Lwip thread callback invoked via pxtcp::msg_reset
874 *
875 * Like pxtcp_pcb_delete(), but sends RST to the guest before
876 * deleting this pxtcp.
877 */
878static void
879pxtcp_pcb_reset_pxtcp(void *ctx)
880{
881 struct pxtcp *pxtcp = (struct pxtcp *)ctx;
882 LWIP_ASSERT1(pxtcp != NULL);
883
884 DPRINTF0(("%s: pxtcp %p, pcb %p, sock %d\n",
885 __func__, (void *)pxtcp, (void *)pxtcp->pcb, pxtcp->sock));
886
887 if (pxtcp->sock != INVALID_SOCKET) {
888 proxy_reset_socket(pxtcp->sock);
889 pxtcp->sock = INVALID_SOCKET;
890 }
891
892 if (pxtcp->pcb != NULL) {
893 struct tcp_pcb *pcb = pxtcp->pcb;
894 pxtcp_pcb_dissociate(pxtcp);
895 tcp_abort(pcb);
896 }
897
898 pollmgr_refptr_unref(pxtcp->rp);
899 pxtcp_free(pxtcp);
900}
901
902
903
904/**
905 * Poll manager callbacks should use this convenience wrapper to
906 * schedule pxtcp reset and deletion on the lwip thread and to
907 * deregister from the poll manager.
908 *
909 * See pxtcp_schedule_delete() for additional comments.
910 */
911static int
912pxtcp_schedule_reset(struct pxtcp *pxtcp)
913{
914 pxtcp->pmhdl.slot = -1;
915 proxy_lwip_post(&pxtcp->msg_reset);
916 return -1;
917}
918
919
920/**
921 * Reject proxy connection attempt. Depending on the cause (sockerr)
922 * we may just drop the pcb silently, generate an ICMP datagram or
923 * send TCP reset.
924 */
925static void
926pxtcp_pcb_reject(struct tcp_pcb *pcb, int sockerr,
927 struct netif *netif, struct pbuf *p)
928{
929 int reset = 0;
930
931 if (sockerr == ECONNREFUSED) {
932 reset = 1;
933 }
934 else if (p != NULL) {
935 struct netif *oif;
936
937 LWIP_ASSERT1(netif != NULL);
938
939 oif = ip_current_netif();
940 ip_current_netif() = netif;
941
942 if (PCB_ISIPV6(pcb)) {
943 if (sockerr == EHOSTDOWN) {
944 icmp6_dest_unreach(p, ICMP6_DUR_ADDRESS); /* XXX: ??? */
945 }
946 else if (sockerr == EHOSTUNREACH
947 || sockerr == ENETDOWN
948 || sockerr == ENETUNREACH)
949 {
950 icmp6_dest_unreach(p, ICMP6_DUR_NO_ROUTE);
951 }
952 }
953 else {
954 if (sockerr == EHOSTDOWN
955 || sockerr == EHOSTUNREACH
956 || sockerr == ENETDOWN
957 || sockerr == ENETUNREACH)
958 {
959 icmp_dest_unreach(p, ICMP_DUR_HOST);
960 }
961 }
962
963 ip_current_netif() = oif;
964 }
965
966 tcp_abandon(pcb, reset);
967}
968
969
970/**
971 * Called from poll manager thread via pxtcp::msg_accept when proxy
972 * failed to connect to the destination. Also called when we failed
973 * to register pxtcp with poll manager.
974 *
975 * This is like pxtcp_pcb_reset_pxtcp() but is more discriminate in
976 * how this unestablished connection is terminated.
977 */
978static void
979pxtcp_pcb_accept_refuse(void *ctx)
980{
981 struct pxtcp *pxtcp = (struct pxtcp *)ctx;
982
983 DPRINTF0(("%s: pxtcp %p, pcb %p, sock %d: %R[sockerr]\n",
984 __func__, (void *)pxtcp, (void *)pxtcp->pcb,
985 pxtcp->sock, pxtcp->sockerr));
986
987 LWIP_ASSERT1(pxtcp != NULL);
988 LWIP_ASSERT1(pxtcp->sock == INVALID_SOCKET);
989
990 if (pxtcp->pcb != NULL) {
991 struct tcp_pcb *pcb = pxtcp->pcb;
992 pxtcp_pcb_dissociate(pxtcp);
993 pxtcp_pcb_reject(pcb, pxtcp->sockerr, pxtcp->netif, pxtcp->unsent);
994 }
995
996 pollmgr_refptr_unref(pxtcp->rp);
997 pxtcp_free(pxtcp);
998}
999
1000
1001/**
1002 * Convenience wrapper for poll manager connect callback to reject
1003 * connection attempt.
1004 *
1005 * Like pxtcp_schedule_reset(), but the callback is more discriminate
1006 * in how this unestablished connection is terminated.
1007 */
1008static int
1009pxtcp_schedule_reject(struct pxtcp *pxtcp)
1010{
1011 pxtcp->msg_accept.msg.cb.function = pxtcp_pcb_accept_refuse;
1012 pxtcp->pmhdl.slot = -1;
1013 proxy_lwip_post(&pxtcp->msg_accept);
1014 return -1;
1015}
1016
1017
1018/**
1019 * Global tcp_proxy_accept() callback for proxied outgoing TCP
1020 * connections from guest(s).
1021 */
1022static err_t
1023pxtcp_pcb_heard(void *arg, struct tcp_pcb *newpcb, struct pbuf *syn)
1024{
1025 LWIP_UNUSED_ARG(arg);
1026
1027 return pxtcp_pcb_accept_outbound(newpcb, syn,
1028 PCB_ISIPV6(newpcb), &newpcb->local_ip, newpcb->local_port);
1029}
1030
1031
1032err_t
1033pxtcp_pcb_accept_outbound(struct tcp_pcb *newpcb, struct pbuf *p,
1034 int is_ipv6, ipX_addr_t *dst_addr, u16_t dst_port)
1035{
1036 struct pxtcp *pxtcp;
1037 ipX_addr_t mapped_dst_addr;
1038 int sdom;
1039 SOCKET sock;
1040 ssize_t nsent;
1041 int sockerr = 0;
1042
1043 /*
1044 * TCP first calls accept callback when it receives the first SYN
1045 * and "tentatively accepts" new proxied connection attempt. When
1046 * proxy "confirms" the SYN and sends SYN|ACK and the guest
1047 * replies with ACK the accept callback is called again, this time
1048 * with the established connection.
1049 */
1050 LWIP_ASSERT1(newpcb->state == SYN_RCVD_0);
1051 tcp_accept(newpcb, pxtcp_pcb_accept);
1052 tcp_arg(newpcb, NULL);
1053
1054 tcp_setprio(newpcb, TCP_PRIO_MAX);
1055
1056 pxremap_outbound_ipX(is_ipv6, &mapped_dst_addr, dst_addr);
1057
1058 sdom = is_ipv6 ? PF_INET6 : PF_INET;
1059 sock = proxy_connected_socket(sdom, SOCK_STREAM,
1060 &mapped_dst_addr, dst_port);
1061 if (sock == INVALID_SOCKET) {
1062 sockerr = SOCKERRNO();
1063 goto abort;
1064 }
1065
1066 pxtcp = pxtcp_allocate();
1067 if (pxtcp == NULL) {
1068 proxy_reset_socket(sock);
1069 goto abort;
1070 }
1071
1072 /* save initial datagram in case we need to reply with ICMP */
1073 if (p != NULL) {
1074 pbuf_ref(p);
1075 pxtcp->unsent = p;
1076 pxtcp->netif = ip_current_netif();
1077 }
1078
1079 pxtcp_pcb_associate(pxtcp, newpcb);
1080 pxtcp->sock = sock;
1081
1082 pxtcp->pmhdl.callback = pxtcp_pmgr_connect;
1083 pxtcp->events = POLLOUT;
1084
1085 nsent = pxtcp_chan_send(POLLMGR_CHAN_PXTCP_ADD, pxtcp);
1086 if (nsent < 0) {
1087 pxtcp->sock = INVALID_SOCKET;
1088 proxy_reset_socket(sock);
1089 pxtcp_pcb_accept_refuse(pxtcp);
1090 return ERR_ABRT;
1091 }
1092
1093 return ERR_OK;
1094
1095 abort:
1096 DPRINTF0(("%s: pcb %p, sock %d: %R[sockerr]\n",
1097 __func__, (void *)newpcb, sock, sockerr));
1098 pxtcp_pcb_reject(newpcb, sockerr, ip_current_netif(), p);
1099 return ERR_ABRT;
1100}
1101
1102
1103/**
1104 * tcp_proxy_accept() callback for accepted proxied outgoing TCP
1105 * connections from guest(s). This is "real" accept with three-way
1106 * handshake completed.
1107 */
1108static err_t
1109pxtcp_pcb_accept(void *arg, struct tcp_pcb *pcb, err_t error)
1110{
1111 struct pxtcp *pxtcp = (struct pxtcp *)arg;
1112
1113 LWIP_UNUSED_ARG(pcb); /* used only in asserts */
1114 LWIP_UNUSED_ARG(error); /* always ERR_OK */
1115
1116 LWIP_ASSERT1(pxtcp != NULL);
1117 LWIP_ASSERT1(pxtcp->pcb = pcb);
1118 LWIP_ASSERT1(pcb->callback_arg == pxtcp);
1119
1120 /* send any inbound data that are already queued */
1121 pxtcp_pcb_forward_inbound(pxtcp);
1122 return ERR_OK;
1123}
1124
1125
/**
 * Initial poll manager callback for proxied outgoing TCP connections.
 * pxtcp_pcb_accept_outbound() sets pxtcp::pmhdl::callback to this.
 *
 * Waits for connect(2) to the destination to complete.  On success
 * replaces itself with pxtcp_pmgr_pump() callback common to all
 * established TCP connections.
 */
1134static int
1135pxtcp_pmgr_connect(struct pollmgr_handler *handler, SOCKET fd, int revents)
1136{
1137 struct pxtcp *pxtcp;
1138 RT_NOREF(fd);
1139
1140 pxtcp = (struct pxtcp *)handler->data;
1141 LWIP_ASSERT1(handler == &pxtcp->pmhdl);
1142 LWIP_ASSERT1(fd == pxtcp->sock);
1143 LWIP_ASSERT1(pxtcp->sockerr == 0);
1144
1145 if (revents & POLLNVAL) {
1146 pxtcp->sock = INVALID_SOCKET;
1147 pxtcp->sockerr = ETIMEDOUT;
1148 return pxtcp_schedule_reject(pxtcp);
1149 }
1150
1151 /*
1152 * Solaris and NetBSD don't report either POLLERR or POLLHUP when
1153 * connect(2) fails, just POLLOUT. In that case we always need to
1154 * check SO_ERROR.
1155 */
1156#if defined(RT_OS_SOLARIS) || defined(RT_OS_NETBSD)
1157# define CONNECT_CHECK_ERROR POLLOUT
1158#else
1159# define CONNECT_CHECK_ERROR (POLLERR | POLLHUP)
1160#endif
1161
1162 /*
1163 * Check the cause of the failure so that pxtcp_pcb_reject() may
1164 * behave accordingly.
1165 */
1166 if (revents & CONNECT_CHECK_ERROR) {
1167 socklen_t optlen = (socklen_t)sizeof(pxtcp->sockerr);
1168 int status;
1169 SOCKET s;
1170
1171 status = getsockopt(pxtcp->sock, SOL_SOCKET, SO_ERROR,
1172 (char *)&pxtcp->sockerr, &optlen);
1173 if (RT_UNLIKELY(status == SOCKET_ERROR)) { /* should not happen */
1174 DPRINTF(("%s: sock %d: SO_ERROR failed: %R[sockerr]\n",
1175 __func__, fd, SOCKERRNO()));
1176 pxtcp->sockerr = ETIMEDOUT;
1177 }
1178 else {
1179 /* don't spam this log on successful connect(2) */
1180 if ((revents & (POLLERR | POLLHUP)) /* we were told it's failed */
1181 || pxtcp->sockerr != 0) /* we determined it's failed */
1182 {
1183 DPRINTF(("%s: sock %d: connect: %R[sockerr]\n",
1184 __func__, fd, pxtcp->sockerr));
1185 }
1186
1187 if ((revents & (POLLERR | POLLHUP))
1188 && RT_UNLIKELY(pxtcp->sockerr == 0))
1189 {
1190 /* if we're told it's failed, make sure it's marked as such */
1191 pxtcp->sockerr = ETIMEDOUT;
1192 }
1193 }
1194
1195 if (pxtcp->sockerr != 0) {
1196 s = pxtcp->sock;
1197 pxtcp->sock = INVALID_SOCKET;
1198 closesocket(s);
1199 return pxtcp_schedule_reject(pxtcp);
1200 }
1201 }
1202
1203 if (revents & POLLOUT) { /* connect is successful */
1204 /* confirm accept to the guest */
1205 proxy_lwip_post(&pxtcp->msg_accept);
1206
1207 /*
1208 * Switch to common callback used for all established proxied
1209 * connections.
1210 */
1211 pxtcp->pmhdl.callback = pxtcp_pmgr_pump;
1212
1213 /*
1214 * Initially we poll for incoming traffic only. Outgoing
1215 * traffic is fast-forwarded by pxtcp_pcb_recv(); if it fails
1216 * it will ask us to poll for POLLOUT too.
1217 */
1218 pxtcp->events = POLLIN;
1219 return pxtcp->events;
1220 }
1221
1222 /* should never get here */
1223 DPRINTF0(("%s: pxtcp %p, sock %d: unexpected revents 0x%x\n",
1224 __func__, (void *)pxtcp, fd, revents));
1225 return pxtcp_schedule_reset(pxtcp);
1226}
1227
1228
1229/**
1230 * Called from poll manager thread via pxtcp::msg_accept when proxy
1231 * connected to the destination. Finalize accept by sending SYN|ACK
1232 * to the guest.
1233 */
1234static void
1235pxtcp_pcb_accept_confirm(void *ctx)
1236{
1237 struct pxtcp *pxtcp = (struct pxtcp *)ctx;
1238 err_t error;
1239
1240 LWIP_ASSERT1(pxtcp != NULL);
1241 if (pxtcp->pcb == NULL) {
1242 return;
1243 }
1244
1245 /* we are not going to reply with ICMP, so we can drop initial pbuf */
1246 if (pxtcp->unsent != NULL) {
1247 pbuf_free(pxtcp->unsent);
1248 pxtcp->unsent = NULL;
1249 }
1250
1251 error = tcp_proxy_accept_confirm(pxtcp->pcb);
1252
1253 /*
1254 * If lwIP failed to enqueue SYN|ACK because it's out of pbufs it
1255 * abandons the pcb. Retrying that is not very easy, since it
1256 * would require keeping "fractional state". From guest's point
1257 * of view there is no reply to its SYN so it will either resend
1258 * the SYN (effetively triggering full connection retry for us),
1259 * or it will eventually time out.
1260 */
1261 if (error == ERR_ABRT) {
1262 pxtcp->pcb = NULL; /* pcb is gone */
1263 pxtcp_chan_send_weak(POLLMGR_CHAN_PXTCP_RESET, pxtcp);
1264 }
1265
1266 /*
1267 * else if (error != ERR_OK): even if tcp_output() failed with
1268 * ERR_MEM - don't give up, that SYN|ACK is enqueued and will be
1269 * retransmitted eventually.
1270 */
1271}
1272
1273
1274/**
1275 * Entry point for port-forwarding.
1276 *
1277 * fwtcp accepts new incoming connection, creates pxtcp for the socket
1278 * (with no pcb yet) and adds it to the poll manager (polling for
1279 * errors only). Then it calls this function to construct the pcb and
1280 * perform connection to the guest.
1281 */
1282void
1283pxtcp_pcb_connect(struct pxtcp *pxtcp, const struct fwspec *fwspec)
1284{
1285 struct sockaddr_storage ss;
1286 socklen_t sslen;
1287 struct tcp_pcb *pcb;
1288 ipX_addr_t src_addr, dst_addr;
1289 u16_t src_port, dst_port;
1290 int status;
1291 err_t error;
1292
1293 LWIP_ASSERT1(pxtcp != NULL);
1294 LWIP_ASSERT1(pxtcp->pcb == NULL);
1295 LWIP_ASSERT1(fwspec->stype == SOCK_STREAM);
1296
1297 pcb = tcp_new();
1298 if (pcb == NULL) {
1299 goto reset;
1300 }
1301
1302 tcp_setprio(pcb, TCP_PRIO_MAX);
1303 pxtcp_pcb_associate(pxtcp, pcb);
1304
1305 sslen = sizeof(ss);
1306 status = getpeername(pxtcp->sock, (struct sockaddr *)&ss, &sslen);
1307 if (status == SOCKET_ERROR) {
1308 goto reset;
1309 }
1310
1311 /* nit: compares PF and AF, but they are the same everywhere */
1312 LWIP_ASSERT1(ss.ss_family == fwspec->sdom);
1313
1314 status = fwany_ipX_addr_set_src(&src_addr, (const struct sockaddr *)&ss);
1315 if (status == PXREMAP_FAILED) {
1316 goto reset;
1317 }
1318
1319 if (ss.ss_family == PF_INET) {
1320 const struct sockaddr_in *peer4 = (const struct sockaddr_in *)&ss;
1321
1322 src_port = peer4->sin_port;
1323
1324 memcpy(&dst_addr.ip4, &fwspec->dst.sin.sin_addr, sizeof(ip_addr_t));
1325 dst_port = fwspec->dst.sin.sin_port;
1326 }
1327 else { /* PF_INET6 */
1328 const struct sockaddr_in6 *peer6 = (const struct sockaddr_in6 *)&ss;
1329 ip_set_v6(pcb, 1);
1330
1331 src_port = peer6->sin6_port;
1332
1333 memcpy(&dst_addr.ip6, &fwspec->dst.sin6.sin6_addr, sizeof(ip6_addr_t));
1334 dst_port = fwspec->dst.sin6.sin6_port;
1335 }
1336
1337 /* lwip port arguments are in host order */
1338 src_port = ntohs(src_port);
1339 dst_port = ntohs(dst_port);
1340
1341 error = tcp_proxy_bind(pcb, ipX_2_ip(&src_addr), src_port);
1342 if (error != ERR_OK) {
1343 goto reset;
1344 }
1345
1346 error = tcp_connect(pcb, ipX_2_ip(&dst_addr), dst_port,
1347 /* callback: */ pxtcp_pcb_connected);
1348 if (error != ERR_OK) {
1349 goto reset;
1350 }
1351
1352 return;
1353
1354 reset:
1355 pxtcp_chan_send_weak(POLLMGR_CHAN_PXTCP_RESET, pxtcp);
1356}
1357
1358
1359/**
1360 * Port-forwarded connection to guest is successful, pump data.
1361 */
1362static err_t
1363pxtcp_pcb_connected(void *arg, struct tcp_pcb *pcb, err_t error)
1364{
1365 struct pxtcp *pxtcp = (struct pxtcp *)arg;
1366
1367 LWIP_ASSERT1(error == ERR_OK); /* always called with ERR_OK */
1368 LWIP_UNUSED_ARG(error);
1369
1370 LWIP_ASSERT1(pxtcp != NULL);
1371 LWIP_ASSERT1(pxtcp->pcb == pcb);
1372 LWIP_ASSERT1(pcb->callback_arg == pxtcp);
1373 LWIP_UNUSED_ARG(pcb);
1374
1375 DPRINTF0(("%s: new pxtcp %p; pcb %p; sock %d\n",
1376 __func__, (void *)pxtcp, (void *)pxtcp->pcb, pxtcp->sock));
1377
1378 /* ACK on connection is like ACK on data in pxtcp_pcb_sent() */
1379 pxtcp_chan_send_weak(POLLMGR_CHAN_PXTCP_POLLIN, pxtcp);
1380
1381 return ERR_OK;
1382}
1383
1384
1385/**
1386 * tcp_recv() callback.
1387 */
1388static err_t
1389pxtcp_pcb_recv(void *arg, struct tcp_pcb *pcb, struct pbuf *p, err_t error)
1390{
1391 struct pxtcp *pxtcp = (struct pxtcp *)arg;
1392
1393 LWIP_ASSERT1(error == ERR_OK); /* always called with ERR_OK */
1394 LWIP_UNUSED_ARG(error);
1395
1396 LWIP_ASSERT1(pxtcp != NULL);
1397 LWIP_ASSERT1(pxtcp->pcb == pcb);
1398 LWIP_ASSERT1(pcb->callback_arg == pxtcp);
1399 LWIP_UNUSED_ARG(pcb);
1400
1401
1402 /*
1403 * Have we done sending previous batch?
1404 */
1405 if (pxtcp->unsent != NULL) {
1406 if (p != NULL) {
1407 /*
1408 * Return an error to tell TCP to hold onto that pbuf.
1409 * It will be presented to us later from tcp_fasttmr().
1410 */
1411 return ERR_WOULDBLOCK;
1412 }
1413 else {
1414 /*
1415 * Unlike data, p == NULL indicating orderly shutdown is
1416 * NOT presented to us again
1417 */
1418 pxtcp->outbound_close = 1;
1419 return ERR_OK;
1420 }
1421 }
1422
1423
1424 /*
1425 * Guest closed?
1426 */
1427 if (p == NULL) {
1428 pxtcp->outbound_close = 1;
1429 pxtcp_pcb_forward_outbound_close(pxtcp);
1430 return ERR_OK;
1431 }
1432
1433
1434 /*
1435 * Got data, send what we can without blocking.
1436 */
1437 return pxtcp_pcb_forward_outbound(pxtcp, p);
1438}
1439
1440
1441/**
1442 * Guest half-closed its TX side of the connection.
1443 *
1444 * Called either immediately from pxtcp_pcb_recv() when it gets NULL,
1445 * or from pxtcp_pcb_forward_outbound() when it finishes forwarding
1446 * previously unsent data and sees pxtcp::outbound_close flag saved by
1447 * pxtcp_pcb_recv().
1448 */
1449static void
1450pxtcp_pcb_forward_outbound_close(struct pxtcp *pxtcp)
1451{
1452 struct tcp_pcb *pcb;
1453
1454 LWIP_ASSERT1(pxtcp != NULL);
1455 LWIP_ASSERT1(pxtcp->outbound_close);
1456 LWIP_ASSERT1(!pxtcp->outbound_close_done);
1457
1458 pcb = pxtcp->pcb;
1459 LWIP_ASSERT1(pcb != NULL);
1460
1461 DPRINTF(("outbound_close: pxtcp %p; pcb %p %s\n",
1462 (void *)pxtcp, (void *)pcb, tcp_debug_state_str(pcb->state)));
1463
1464
1465 /* set the flag first, since shutdown() may trigger POLLHUP */
1466 pxtcp->outbound_close_done = 1;
1467 shutdown(pxtcp->sock, SHUT_WR); /* half-close the socket */
1468
1469#if !(HAVE_TCP_POLLHUP & POLLOUT)
1470 /*
1471 * We need to nudge poll manager manually, since OS will not
1472 * report POLLHUP.
1473 */
1474 if (pxtcp->inbound_close) {
1475 pxtcp_chan_send_weak(POLLMGR_CHAN_PXTCP_DEL, pxtcp);
1476 }
1477#endif
1478
1479
1480 /* no more outbound data coming to us */
1481 tcp_recv(pcb, NULL);
1482
1483 /*
1484 * If we have already done inbound close previously (active close
1485 * on the pcb), then we must not hold onto a pcb in TIME_WAIT
1486 * state since those will be recycled by lwip when it runs out of
1487 * free pcbs in the pool.
1488 *
1489 * The test is true also for a pcb in CLOSING state that waits
1490 * just for the ACK of its FIN (to transition to TIME_WAIT).
1491 */
1492 if (pxtcp_pcb_forward_inbound_done(pxtcp)) {
1493 pxtcp_pcb_dissociate(pxtcp);
1494 }
1495}
1496
1497
1498/**
1499 * Forward outbound data from pcb to socket.
1500 *
1501 * Called by pxtcp_pcb_recv() to forward new data and by callout
1502 * triggered by POLLOUT on the socket to send previously unsent data.
1503 *
1504 * (Re)scehdules one-time callout if not all data are sent.
1505 */
1506static err_t
1507pxtcp_pcb_forward_outbound(struct pxtcp *pxtcp, struct pbuf *p)
1508{
1509 struct pbuf *qs, *q;
1510 size_t qoff;
1511 size_t forwarded;
1512 int sockerr;
1513
1514 LWIP_ASSERT1(pxtcp->unsent == NULL || pxtcp->unsent == p);
1515
1516 forwarded = 0;
1517 sockerr = 0;
1518
1519 q = NULL;
1520 qoff = 0;
1521
1522 qs = p;
1523 while (qs != NULL) {
1524 IOVEC iov[8];
1525 const size_t iovsize = sizeof(iov)/sizeof(iov[0]);
1526 size_t fwd1;
1527 ssize_t nsent;
1528 size_t i;
1529
1530 fwd1 = 0;
1531 for (i = 0, q = qs; i < iovsize && q != NULL; ++i, q = q->next) {
1532 LWIP_ASSERT1(q->len > 0);
1533 IOVEC_SET_BASE(iov[i], q->payload);
1534 IOVEC_SET_LEN(iov[i], q->len);
1535 fwd1 += q->len;
1536 }
1537
1538 /*
1539 * TODO: This is where application-level proxy can hook into
1540 * to process outbound traffic.
1541 */
1542 nsent = pxtcp_sock_send(pxtcp, iov, i);
1543
1544 if (nsent == (ssize_t)fwd1) {
1545 /* successfully sent this chain fragment completely */
1546 forwarded += nsent;
1547 qs = q;
1548 }
1549 else if (nsent >= 0) {
1550 /* successfully sent only some data */
1551 forwarded += nsent;
1552
1553 /* find the first pbuf that was not completely forwarded */
1554 qoff = nsent;
1555 for (i = 0, q = qs; i < iovsize && q != NULL; ++i, q = q->next) {
1556 if (qoff < q->len) {
1557 break;
1558 }
1559 qoff -= q->len;
1560 }
1561 LWIP_ASSERT1(q != NULL);
1562 LWIP_ASSERT1(qoff < q->len);
1563 break;
1564 }
1565 else {
1566 sockerr = -nsent;
1567
1568 /*
1569 * Some errors are really not errors - if we get them,
1570 * it's not different from getting nsent == 0, so filter
1571 * them out here.
1572 */
1573 if (proxy_error_is_transient(sockerr)) {
1574 sockerr = 0;
1575 }
1576 q = qs;
1577 qoff = 0;
1578 break;
1579 }
1580 }
1581
1582 if (forwarded > 0) {
1583 DPRINTF2(("forward_outbound: pxtcp %p, pcb %p: sent %d bytes\n",
1584 (void *)pxtcp, (void *)pxtcp->pcb, (int)forwarded));
1585 tcp_recved(pxtcp->pcb, (u16_t)forwarded);
1586 }
1587
1588 if (q == NULL) { /* everything is forwarded? */
1589 LWIP_ASSERT1(sockerr == 0);
1590 LWIP_ASSERT1(forwarded == p->tot_len);
1591
1592 pxtcp->unsent = NULL;
1593 pbuf_free(p);
1594 if (pxtcp->outbound_close) {
1595 pxtcp_pcb_forward_outbound_close(pxtcp);
1596 }
1597 }
1598 else {
1599 if (q != p) {
1600 /* free forwarded pbufs at the beginning of the chain */
1601 pbuf_ref(q);
1602 pbuf_free(p);
1603 }
1604 if (qoff > 0) {
1605 /* advance payload pointer past the forwarded part */
1606 pbuf_header(q, -(s16_t)qoff);
1607 }
1608 pxtcp->unsent = q;
1609 DPRINTF2(("forward_outbound: pxtcp %p, pcb %p: kept %d bytes\n",
1610 (void *)pxtcp, (void *)pxtcp->pcb, (int)q->tot_len));
1611
1612 /*
1613 * Have sendmsg() failed?
1614 *
1615 * Connection reset will be detected by poll and
1616 * pxtcp_schedule_reset() will be called.
1617 *
1618 * Otherwise something *really* unexpected must have happened,
1619 * so we'd better abort.
1620 */
1621 if (sockerr != 0 && sockerr != ECONNRESET) {
1622 struct tcp_pcb *pcb = pxtcp->pcb;
1623 DPRINTF2(("forward_outbound: pxtcp %p, pcb %p: %R[sockerr]\n",
1624 (void *)pxtcp, (void *)pcb, sockerr));
1625
1626 pxtcp_pcb_dissociate(pxtcp);
1627
1628 tcp_abort(pcb);
1629
1630 /* call error callback manually since we've already dissociated */
1631 pxtcp_pcb_err((void *)pxtcp, ERR_ABRT);
1632 return ERR_ABRT;
1633 }
1634
1635 /* schedule one-shot POLLOUT on the socket */
1636 pxtcp_chan_send_weak(POLLMGR_CHAN_PXTCP_POLLOUT, pxtcp);
1637 }
1638 return ERR_OK;
1639}
1640
1641
1642#if !defined(RT_OS_WINDOWS)
1643static ssize_t
1644pxtcp_sock_send(struct pxtcp *pxtcp, IOVEC *iov, size_t iovlen)
1645{
1646 struct msghdr mh;
1647 ssize_t nsent;
1648
1649#ifdef MSG_NOSIGNAL
1650 const int send_flags = MSG_NOSIGNAL;
1651#else
1652 const int send_flags = 0;
1653#endif
1654
1655 memset(&mh, 0, sizeof(mh));
1656
1657 mh.msg_iov = iov;
1658 mh.msg_iovlen = iovlen;
1659
1660 nsent = sendmsg(pxtcp->sock, &mh, send_flags);
1661 if (nsent < 0) {
1662 nsent = -SOCKERRNO();
1663 }
1664
1665 return nsent;
1666}
1667#else /* RT_OS_WINDOWS */
1668static ssize_t
1669pxtcp_sock_send(struct pxtcp *pxtcp, IOVEC *iov, size_t iovlen)
1670{
1671 DWORD nsent;
1672 int status;
1673
1674 status = WSASend(pxtcp->sock, iov, (DWORD)iovlen, &nsent,
1675 0, NULL, NULL);
1676 if (status == SOCKET_ERROR) {
1677 return -SOCKERRNO();
1678 }
1679
1680 return nsent;
1681}
1682#endif /* RT_OS_WINDOWS */
1683
1684
1685/**
1686 * Callback from poll manager (on POLLOUT) to send data from
1687 * pxtcp::unsent pbuf to socket.
1688 */
1689static void
1690pxtcp_pcb_write_outbound(void *ctx)
1691{
1692 struct pxtcp *pxtcp = (struct pxtcp *)ctx;
1693 LWIP_ASSERT1(pxtcp != NULL);
1694
1695 if (pxtcp->pcb == NULL) {
1696 return;
1697 }
1698
1699 pxtcp_pcb_forward_outbound(pxtcp, pxtcp->unsent);
1700}
1701
1702
1703/**
1704 * Common poll manager callback used by both outgoing and incoming
1705 * (port-forwarded) connections that has connected socket.
1706 */
1707static int
1708pxtcp_pmgr_pump(struct pollmgr_handler *handler, SOCKET fd, int revents)
1709{
1710 struct pxtcp *pxtcp;
1711 int status;
1712 int sockerr;
1713 RT_NOREF(fd);
1714
1715 pxtcp = (struct pxtcp *)handler->data;
1716 LWIP_ASSERT1(handler == &pxtcp->pmhdl);
1717 LWIP_ASSERT1(fd == pxtcp->sock);
1718
1719 if (revents & POLLNVAL) {
1720 pxtcp->sock = INVALID_SOCKET;
1721 return pxtcp_schedule_reset(pxtcp);
1722 }
1723
1724 if (revents & POLLERR) {
1725 socklen_t optlen = (socklen_t)sizeof(sockerr);
1726
1727 status = getsockopt(pxtcp->sock, SOL_SOCKET, SO_ERROR,
1728 (char *)&sockerr, &optlen);
1729 if (status == SOCKET_ERROR) { /* should not happen */
1730 DPRINTF(("sock %d: POLLERR: SO_ERROR failed: %R[sockerr]\n",
1731 fd, SOCKERRNO()));
1732 }
1733 else {
1734 DPRINTF0(("sock %d: POLLERR: %R[sockerr]\n", fd, sockerr));
1735 }
1736 return pxtcp_schedule_reset(pxtcp);
1737 }
1738
1739 if (revents & POLLOUT) {
1740 pxtcp->events &= ~POLLOUT;
1741 proxy_lwip_post(&pxtcp->msg_outbound);
1742 }
1743
1744 if (revents & POLLIN) {
1745 ssize_t nread;
1746 int stop_pollin;
1747
1748 nread = pxtcp_sock_read(pxtcp, &stop_pollin);
1749 if (nread < 0) {
1750 DPRINTF0(("sock %d: POLLIN: %R[sockerr]\n", fd, -(int)nread));
1751 return pxtcp_schedule_reset(pxtcp);
1752 }
1753
1754 if (stop_pollin) {
1755 pxtcp->events &= ~POLLIN;
1756 }
1757
1758 if (nread > 0) {
1759 proxy_lwip_post(&pxtcp->msg_inbound);
1760#if !HAVE_TCP_POLLHUP
1761 /*
1762 * If host does not report POLLHUP for closed sockets
1763 * (e.g. NetBSD) we should check for full close manually.
1764 */
1765 if (pxtcp->inbound_close && pxtcp->outbound_close_done) {
1766 LWIP_ASSERT1((revents & POLLHUP) == 0);
1767 return pxtcp_schedule_delete(pxtcp);
1768 }
1769#endif
1770 }
1771 }
1772
1773#if !HAVE_TCP_POLLHUP
1774 LWIP_ASSERT1((revents & POLLHUP) == 0);
1775#else
1776 if (revents & POLLHUP) {
1777 DPRINTF(("sock %d: HUP\n", fd));
1778
1779#if HAVE_TCP_POLLHUP == POLLIN
1780 /*
1781 * XXX: OSX reports POLLHUP once more when inbound is already
1782 * half-closed (which has already been reported as a "normal"
1783 * POLLHUP, handled below), the socket is polled for POLLOUT
1784 * (guest sends a lot of data that we can't push out fast
1785 * enough), and remote sends a reset - e.g. an http client
1786 * that half-closes after request and then aborts the transfer.
1787 *
1788 * It really should have been reported as POLLERR, but it
1789 * seems OSX never reports POLLERR for sockets.
1790 */
1791#if defined(RT_OS_DARWIN)
1792 {
1793 socklen_t optlen = (socklen_t)sizeof(sockerr);
1794
1795 status = getsockopt(pxtcp->sock, SOL_SOCKET, SO_ERROR,
1796 (char *)&sockerr, &optlen);
1797 if (status == SOCKET_ERROR) { /* should not happen */
1798 DPRINTF(("sock %d: POLLHUP: SO_ERROR failed: %R[sockerr]\n",
1799 fd, SOCKERRNO()));
1800 sockerr = ECONNRESET;
1801 }
1802 else if (sockerr != 0) {
1803 DPRINTF0(("sock %d: POLLHUP: %R[sockerr]\n", fd, sockerr));
1804 }
1805
1806 if (sockerr != 0) { /* XXX: should have been POLLERR */
1807 return pxtcp_schedule_reset(pxtcp);
1808 }
1809 }
1810#endif /* RT_OS_DARWIN */
1811
1812 /*
1813 * Remote closed inbound.
1814 */
1815 if (!pxtcp->outbound_close_done) {
1816 /*
1817 * We might still need to poll for POLLOUT, but we can not
1818 * poll for POLLIN anymore (even if not all data are read)
1819 * because we will be spammed by POLLHUP.
1820 */
1821 pxtcp->events &= ~POLLIN;
1822 if (!pxtcp->inbound_close) {
1823 /* the rest of the input has to be pulled */
1824 proxy_lwip_post(&pxtcp->msg_inpull);
1825 }
1826 }
1827 else
1828#endif
1829 /*
1830 * Both directions are closed.
1831 */
1832 {
1833 LWIP_ASSERT1(pxtcp->outbound_close_done);
1834
1835 if (pxtcp->inbound_close) {
1836 /* there's no unread data, we are done */
1837 return pxtcp_schedule_delete(pxtcp);
1838 }
1839 else {
1840 /* pull the rest of the input first (deferred_delete) */
1841 pxtcp->pmhdl.slot = -1;
1842 proxy_lwip_post(&pxtcp->msg_inpull);
1843 return -1;
1844 }
1845 /* NOTREACHED */
1846 }
1847
1848 }
1849#endif /* HAVE_TCP_POLLHUP */
1850
1851 return pxtcp->events;
1852}
1853
1854
1855/**
1856 * Read data from socket to ringbuf. This may be used both on lwip
1857 * and poll manager threads.
1858 *
1859 * Flag pointed to by pstop is set when further reading is impossible,
1860 * either temporary when buffer is full, or permanently when EOF is
1861 * received.
1862 *
1863 * Returns number of bytes read. NB: EOF is reported as 1!
1864 *
1865 * Returns zero if nothing was read, either because buffer is full, or
1866 * if no data is available (EWOULDBLOCK, EINTR &c).
1867 *
1868 * Returns -errno on real socket errors.
1869 */
1870static ssize_t
1871pxtcp_sock_read(struct pxtcp *pxtcp, int *pstop)
1872{
1873 IOVEC iov[2];
1874 size_t iovlen;
1875 ssize_t nread;
1876
1877 const size_t sz = pxtcp->inbuf.bufsize;
1878 size_t beg, lim, wrnew;
1879
1880 *pstop = 0;
1881
1882 beg = pxtcp->inbuf.vacant;
1883 IOVEC_SET_BASE(iov[0], &pxtcp->inbuf.buf[beg]);
1884
1885 /* lim is the index we can NOT write to */
1886 lim = pxtcp->inbuf.unacked;
1887 if (lim == 0) {
1888 lim = sz - 1; /* empty slot at the end */
1889 }
1890 else if (lim == 1 && beg != 0) {
1891 lim = sz; /* empty slot at the beginning */
1892 }
1893 else {
1894 --lim;
1895 }
1896
1897 if (beg == lim) {
1898 /*
1899 * Buffer is full, stop polling for POLLIN.
1900 *
1901 * pxtcp_pcb_sent() will re-enable POLLIN when guest ACKs
1902 * data, freeing space in the ring buffer.
1903 */
1904 *pstop = 1;
1905 return 0;
1906 }
1907
1908 if (beg < lim) {
1909 /* free space in one chunk */
1910 iovlen = 1;
1911 IOVEC_SET_LEN(iov[0], lim - beg);
1912 }
1913 else {
1914 /* free space in two chunks */
1915 iovlen = 2;
1916 IOVEC_SET_LEN(iov[0], sz - beg);
1917 IOVEC_SET_BASE(iov[1], &pxtcp->inbuf.buf[0]);
1918 IOVEC_SET_LEN(iov[1], lim);
1919 }
1920
1921 /*
1922 * TODO: This is where application-level proxy can hook into to
1923 * process inbound traffic.
1924 */
1925 nread = pxtcp_sock_recv(pxtcp, iov, iovlen);
1926
1927 if (nread > 0) {
1928 wrnew = beg + nread;
1929 if (wrnew >= sz) {
1930 wrnew -= sz;
1931 }
1932 pxtcp->inbuf.vacant = wrnew;
1933 DPRINTF2(("pxtcp %p: sock %d read %d bytes\n",
1934 (void *)pxtcp, pxtcp->sock, (int)nread));
1935 return nread;
1936 }
1937 else if (nread == 0) {
1938 *pstop = 1;
1939 pxtcp->inbound_close = 1;
1940 DPRINTF2(("pxtcp %p: sock %d read EOF\n",
1941 (void *)pxtcp, pxtcp->sock));
1942 return 1;
1943 }
1944 else {
1945 int sockerr = -nread;
1946
1947 if (proxy_error_is_transient(sockerr)) {
1948 /* haven't read anything, just return */
1949 DPRINTF2(("pxtcp %p: sock %d read cancelled\n",
1950 (void *)pxtcp, pxtcp->sock));
1951 return 0;
1952 }
1953 else {
1954 /* socket error! */
1955 DPRINTF0(("pxtcp %p: sock %d read: %R[sockerr]\n",
1956 (void *)pxtcp, pxtcp->sock, sockerr));
1957 return -sockerr;
1958 }
1959 }
1960}
1961
1962
1963#if !defined(RT_OS_WINDOWS)
1964static ssize_t
1965pxtcp_sock_recv(struct pxtcp *pxtcp, IOVEC *iov, size_t iovlen)
1966{
1967 struct msghdr mh;
1968 ssize_t nread;
1969
1970 memset(&mh, 0, sizeof(mh));
1971
1972 mh.msg_iov = iov;
1973 mh.msg_iovlen = iovlen;
1974
1975 nread = recvmsg(pxtcp->sock, &mh, 0);
1976 if (nread < 0) {
1977 nread = -SOCKERRNO();
1978 }
1979
1980 return nread;
1981}
1982#else /* RT_OS_WINDOWS */
1983static ssize_t
1984pxtcp_sock_recv(struct pxtcp *pxtcp, IOVEC *iov, size_t iovlen)
1985{
1986 DWORD flags;
1987 DWORD nread;
1988 int status;
1989
1990 flags = 0;
1991 status = WSARecv(pxtcp->sock, iov, (DWORD)iovlen, &nread,
1992 &flags, NULL, NULL);
1993 if (status == SOCKET_ERROR) {
1994 return -SOCKERRNO();
1995 }
1996
1997 return (ssize_t)nread;
1998}
1999#endif /* RT_OS_WINDOWS */
2000
2001
2002/**
2003 * Callback from poll manager (pxtcp::msg_inbound) to trigger output
2004 * from ringbuf to guest.
2005 */
2006static void
2007pxtcp_pcb_write_inbound(void *ctx)
2008{
2009 struct pxtcp *pxtcp = (struct pxtcp *)ctx;
2010 LWIP_ASSERT1(pxtcp != NULL);
2011
2012 if (pxtcp->pcb == NULL) {
2013 return;
2014 }
2015
2016 pxtcp_pcb_forward_inbound(pxtcp);
2017}
2018
2019
2020/**
2021 * tcp_poll() callback
2022 *
2023 * We swtich it on when tcp_write() or tcp_shutdown() fail with
2024 * ERR_MEM to prevent connection from stalling. If there are ACKs or
2025 * more inbound data then pxtcp_pcb_forward_inbound() will be
2026 * triggered again, but if neither happens, tcp_poll() comes to the
2027 * rescue.
2028 */
2029static err_t
2030pxtcp_pcb_poll(void *arg, struct tcp_pcb *pcb)
2031{
2032 struct pxtcp *pxtcp = (struct pxtcp *)arg;
2033 LWIP_UNUSED_ARG(pcb);
2034
2035 DPRINTF2(("%s: pxtcp %p; pcb %p\n",
2036 __func__, (void *)pxtcp, (void *)pxtcp->pcb));
2037
2038 pxtcp_pcb_forward_inbound(pxtcp);
2039
2040 /*
2041 * If the last thing holding up deletion of the pxtcp was failed
2042 * tcp_shutdown() and it succeeded, we may be the last callback.
2043 */
2044 pxtcp_pcb_maybe_deferred_delete(pxtcp);
2045
2046 return ERR_OK;
2047}
2048
2049
2050static void
2051pxtcp_pcb_schedule_poll(struct pxtcp *pxtcp)
2052{
2053 tcp_poll(pxtcp->pcb, pxtcp_pcb_poll, 0);
2054}
2055
2056
2057static void
2058pxtcp_pcb_cancel_poll(struct pxtcp *pxtcp)
2059{
2060 tcp_poll(pxtcp->pcb, NULL, 255);
2061}
2062
2063
2064/**
2065 * Forward inbound data from ring buffer to the guest.
2066 *
2067 * Scheduled by poll manager thread after it receives more data into
2068 * the ring buffer (we have more data to send).
2069
2070 * Also called from tcp_sent() callback when guest ACKs some data,
2071 * increasing pcb->snd_buf (we are permitted to send more data).
2072 *
2073 * Also called from tcp_poll() callback if previous attempt to forward
2074 * inbound data failed with ERR_MEM (we need to try again).
2075 */
2076static void
2077pxtcp_pcb_forward_inbound(struct pxtcp *pxtcp)
2078{
2079 struct tcp_pcb *pcb;
2080 size_t sndbuf;
2081 size_t beg, lim, sndlim;
2082 size_t toeob, tolim;
2083 size_t nsent;
2084 err_t error;
2085
2086 LWIP_ASSERT1(pxtcp != NULL);
2087 pcb = pxtcp->pcb;
2088 if (pcb == NULL) {
2089 return;
2090 }
2091
2092 if (/* __predict_false */ pcb->state < ESTABLISHED) {
2093 /*
2094 * If we have just confirmed accept of this connection, the
2095 * pcb is in SYN_RCVD state and we still haven't received the
2096 * ACK of our SYN. It's only in SYN_RCVD -> ESTABLISHED
2097 * transition that lwip decrements pcb->acked so that that ACK
2098 * is not reported to pxtcp_pcb_sent(). If we send something
2099 * now and immediately close (think "daytime", e.g.) while
2100 * still in SYN_RCVD state, we will move directly to
2101 * FIN_WAIT_1 and when our confirming SYN is ACK'ed lwip will
2102 * report it to pxtcp_pcb_sent().
2103 */
2104 DPRINTF2(("forward_inbound: pxtcp %p; pcb %p %s - later...\n",
2105 (void *)pxtcp, (void *)pcb, tcp_debug_state_str(pcb->state)));
2106 return;
2107 }
2108
2109
2110 beg = pxtcp->inbuf.unsent; /* private to lwip thread */
2111 lim = pxtcp->inbuf.vacant;
2112
2113 if (beg == lim) {
2114 if (pxtcp->inbound_close && !pxtcp->inbound_close_done) {
2115 pxtcp_pcb_forward_inbound_close(pxtcp);
2116 tcp_output(pcb);
2117 return;
2118 }
2119
2120 /*
2121 * Else, there's no data to send.
2122 *
2123 * If there is free space in the buffer, producer will
2124 * reschedule us as it receives more data and vacant (lim)
2125 * advances.
2126 *
2127 * If buffer is full when all data have been passed to
2128 * tcp_write() but not yet acknowledged, we will advance
2129 * unacked on ACK, freeing some space for producer to write to
2130 * (then see above).
2131 */
2132 return;
2133 }
2134
2135 sndbuf = tcp_sndbuf(pcb);
2136 if (sndbuf == 0) {
2137 /*
2138 * Can't send anything now. As guest ACKs some data, TCP will
2139 * call pxtcp_pcb_sent() callback and we will come here again.
2140 */
2141 return;
2142 }
2143
2144 nsent = 0;
2145
2146 /*
2147 * We have three limits to consider:
2148 * - how much data we have in the ringbuf
2149 * - how much data we are allowed to send
2150 * - ringbuf size
2151 */
2152 toeob = pxtcp->inbuf.bufsize - beg;
2153 if (lim < beg) { /* lim wrapped */
2154 if (sndbuf < toeob) { /* but we are limited by sndbuf */
2155 /* so beg is not going to wrap, treat sndbuf as lim */
2156 lim = beg + sndbuf; /* ... and proceed to the simple case */
2157 }
2158 else { /* we are limited by the end of the buffer, beg will wrap */
2159 u8_t maybemore;
2160 if (toeob == sndbuf || lim == 0) {
2161 maybemore = 0;
2162 }
2163 else {
2164 maybemore = TCP_WRITE_FLAG_MORE;
2165 }
2166
2167 Assert(toeob == (u16_t)toeob);
2168 error = tcp_write(pcb, &pxtcp->inbuf.buf[beg], (u16_t)toeob, maybemore);
2169 if (error != ERR_OK) {
2170 goto writeerr;
2171 }
2172 nsent += toeob;
2173 pxtcp->inbuf.unsent = 0; /* wrap */
2174
2175 if (maybemore) {
2176 beg = 0;
2177 sndbuf -= toeob;
2178 }
2179 else {
2180 /* we are done sending, but ... */
2181 goto check_inbound_close;
2182 }
2183 }
2184 }
2185
2186 LWIP_ASSERT1(beg < lim);
2187 sndlim = beg + sndbuf;
2188 if (lim > sndlim) {
2189 lim = sndlim;
2190 }
2191 tolim = lim - beg;
2192 if (tolim > 0) {
2193 error = tcp_write(pcb, &pxtcp->inbuf.buf[beg], (u16_t)tolim, 0);
2194 if (error != ERR_OK) {
2195 goto writeerr;
2196 }
2197 nsent += tolim;
2198 pxtcp->inbuf.unsent = lim;
2199 }
2200
2201 check_inbound_close:
2202 if (pxtcp->inbound_close && pxtcp->inbuf.unsent == pxtcp->inbuf.vacant) {
2203 pxtcp_pcb_forward_inbound_close(pxtcp);
2204 }
2205
2206 DPRINTF2(("forward_inbound: pxtcp %p, pcb %p: sent %d bytes\n",
2207 (void *)pxtcp, (void *)pcb, (int)nsent));
2208 tcp_output(pcb);
2209 pxtcp_pcb_cancel_poll(pxtcp);
2210 return;
2211
2212 writeerr:
2213 if (error == ERR_MEM) {
2214 if (nsent > 0) { /* first write succeeded, second failed */
2215 DPRINTF2(("forward_inbound: pxtcp %p, pcb %p: sent %d bytes only\n",
2216 (void *)pxtcp, (void *)pcb, (int)nsent));
2217 tcp_output(pcb);
2218 }
2219 DPRINTF(("forward_inbound: pxtcp %p, pcb %p: ERR_MEM\n",
2220 (void *)pxtcp, (void *)pcb));
2221 pxtcp_pcb_schedule_poll(pxtcp);
2222 }
2223 else {
2224 DPRINTF(("forward_inbound: pxtcp %p, pcb %p: %s\n",
2225 (void *)pxtcp, (void *)pcb, proxy_lwip_strerr(error)));
2226
2227 /* XXX: We shouldn't get ERR_ARG. Check ERR_CONN conditions early? */
2228 LWIP_ASSERT1(error == ERR_MEM);
2229 }
2230}
2231
2232
2233static void
2234pxtcp_pcb_forward_inbound_close(struct pxtcp *pxtcp)
2235{
2236 struct tcp_pcb *pcb;
2237 err_t error;
2238
2239 LWIP_ASSERT1(pxtcp != NULL);
2240 LWIP_ASSERT1(pxtcp->inbound_close);
2241 LWIP_ASSERT1(!pxtcp->inbound_close_done);
2242 LWIP_ASSERT1(pxtcp->inbuf.unsent == pxtcp->inbuf.vacant);
2243
2244 pcb = pxtcp->pcb;
2245 LWIP_ASSERT1(pcb != NULL);
2246
2247 DPRINTF(("inbound_close: pxtcp %p; pcb %p: %s\n",
2248 (void *)pxtcp, (void *)pcb, tcp_debug_state_str(pcb->state)));
2249
2250 error = tcp_shutdown(pcb, /*RX*/ 0, /*TX*/ 1);
2251 if (error != ERR_OK) {
2252 DPRINTF(("inbound_close: pxtcp %p; pcb %p:"
2253 " tcp_shutdown: error=%s\n",
2254 (void *)pxtcp, (void *)pcb, proxy_lwip_strerr(error)));
2255 pxtcp_pcb_schedule_poll(pxtcp);
2256 return;
2257 }
2258
2259 pxtcp_pcb_cancel_poll(pxtcp);
2260 pxtcp->inbound_close_done = 1;
2261
2262
2263 /*
2264 * If we have already done outbound close previously (passive
2265 * close on the pcb), then we must not hold onto a pcb in LAST_ACK
2266 * state since those will be deleted by lwip when that last ack
2267 * comes from the guest.
2268 *
2269 * NB: We do NOT check for deferred delete here, even though we
2270 * have just set one of its conditions, inbound_close_done. We
2271 * let pcb callbacks that called us do that. It's simpler and
2272 * cleaner that way.
2273 */
2274 if (pxtcp->outbound_close_done && pxtcp_pcb_forward_inbound_done(pxtcp)) {
2275 pxtcp_pcb_dissociate(pxtcp);
2276 }
2277}
2278
2279
2280/**
2281 * Check that all forwarded inbound data is sent and acked, and that
2282 * inbound close is scheduled (we aren't called back when it's acked).
2283 */
2284DECLINLINE(int)
2285pxtcp_pcb_forward_inbound_done(const struct pxtcp *pxtcp)
2286{
2287 return (pxtcp->inbound_close_done /* also implies that all data forwarded */
2288 && pxtcp->inbuf.unacked == pxtcp->inbuf.unsent);
2289}
2290
2291
2292/**
2293 * tcp_sent() callback - guest acknowledged len bytes.
2294 *
2295 * We can advance inbuf::unacked index, making more free space in the
2296 * ringbuf and wake up producer on poll manager thread.
2297 *
2298 * We can also try to send more data if we have any since pcb->snd_buf
2299 * was increased and we are now permitted to send more.
2300 */
2301static err_t
2302pxtcp_pcb_sent(void *arg, struct tcp_pcb *pcb, u16_t len)
2303{
2304 struct pxtcp *pxtcp = (struct pxtcp *)arg;
2305 size_t unacked;
2306
2307 LWIP_ASSERT1(pxtcp != NULL);
2308 LWIP_ASSERT1(pxtcp->pcb == pcb);
2309 LWIP_ASSERT1(pcb->callback_arg == pxtcp);
2310 LWIP_UNUSED_ARG(pcb); /* only in assert */
2311
2312 DPRINTF2(("%s: pxtcp %p; pcb %p: +%d ACKed:"
2313 " unacked %d, unsent %d, vacant %d\n",
2314 __func__, (void *)pxtcp, (void *)pcb, (int)len,
2315 (int)pxtcp->inbuf.unacked,
2316 (int)pxtcp->inbuf.unsent,
2317 (int)pxtcp->inbuf.vacant));
2318
2319 if (/* __predict_false */ len == 0) {
2320 /* we are notified to start pulling */
2321 LWIP_ASSERT1(!pxtcp->inbound_close);
2322 LWIP_ASSERT1(pxtcp->inbound_pull);
2323
2324 unacked = pxtcp->inbuf.unacked;
2325 }
2326 else {
2327 /*
2328 * Advance unacked index. Guest acknowledged the data, so it
2329 * won't be needed again for potential retransmits.
2330 */
2331 unacked = pxtcp->inbuf.unacked + len;
2332 if (unacked > pxtcp->inbuf.bufsize) {
2333 unacked -= pxtcp->inbuf.bufsize;
2334 }
2335 pxtcp->inbuf.unacked = unacked;
2336 }
2337
2338 /* arrange for more inbound data */
2339 if (!pxtcp->inbound_close) {
2340 if (!pxtcp->inbound_pull) {
2341 /* wake up producer, in case it has stopped polling for POLLIN */
2342 pxtcp_chan_send_weak(POLLMGR_CHAN_PXTCP_POLLIN, pxtcp);
2343#ifdef RT_OS_WINDOWS
2344 /**
2345 * We have't got enought room in ring buffer to read atm,
2346 * but we don't want to lose notification from WSAW4ME when
2347 * space would be available, so we reset event with empty recv
2348 */
2349 recv(pxtcp->sock, NULL, 0, 0);
2350#endif
2351 }
2352 else {
2353 ssize_t nread;
2354 int stop_pollin; /* ignored */
2355
2356 nread = pxtcp_sock_read(pxtcp, &stop_pollin);
2357
2358 if (nread < 0) {
2359 int sockerr = -(int)nread;
2360 LWIP_UNUSED_ARG(sockerr);
2361 DPRINTF0(("%s: sock %d: %R[sockerr]\n",
2362 __func__, pxtcp->sock, sockerr));
2363
2364#if HAVE_TCP_POLLHUP == POLLIN /* see counterpart in pxtcp_pmgr_pump() */
2365 /*
2366 * It may still be registered with poll manager for POLLOUT.
2367 */
2368 pxtcp_chan_send_weak(POLLMGR_CHAN_PXTCP_RESET, pxtcp);
2369 return ERR_OK;
2370#else
2371 /*
2372 * It is no longer registered with poll manager so we
2373 * can kill it directly.
2374 */
2375 pxtcp_pcb_reset_pxtcp(pxtcp);
2376 return ERR_ABRT;
2377#endif
2378 }
2379 }
2380 }
2381
2382 /* forward more data if we can */
2383 if (!pxtcp->inbound_close_done) {
2384 pxtcp_pcb_forward_inbound(pxtcp);
2385
2386 /*
2387 * NB: we might have dissociated from a pcb that transitioned
2388 * to LAST_ACK state, so don't refer to pcb below.
2389 */
2390 }
2391
2392
2393 /* have we got all the acks? */
2394 if (pxtcp->inbound_close /* no more new data */
2395 && pxtcp->inbuf.unsent == pxtcp->inbuf.vacant /* all data is sent */
2396 && unacked == pxtcp->inbuf.unsent) /* ... and is acked */
2397 {
2398 char *buf;
2399
2400 DPRINTF(("%s: pxtcp %p; pcb %p; all data ACKed\n",
2401 __func__, (void *)pxtcp, (void *)pxtcp->pcb));
2402
2403 /* no more retransmits, so buf is not needed */
2404 buf = pxtcp->inbuf.buf;
2405 pxtcp->inbuf.buf = NULL;
2406 free(buf);
2407
2408 /* no more acks, so no more callbacks */
2409 if (pxtcp->pcb != NULL) {
2410 tcp_sent(pxtcp->pcb, NULL);
2411 }
2412
2413 /*
2414 * We may be the last callback for this pcb if we have also
2415 * successfully forwarded inbound_close.
2416 */
2417 pxtcp_pcb_maybe_deferred_delete(pxtcp);
2418 }
2419
2420 return ERR_OK;
2421}
2422
2423
#if HAVE_TCP_POLLHUP
/**
 * Poll manager callback (pxtcp::msg_inpull) that switches
 * pxtcp_pcb_sent() into actively pulling the remaining input from
 * the socket.  See the POLLHUP comment in pxtcp_pmgr_pump().
 *
 * pxtcp::sock is deregistered from poll manager after this callback
 * is scheduled.
 *
 * @param ctx   The struct pxtcp to switch to pull mode.
 */
static void
pxtcp_pcb_pull_inbound(void *ctx)
{
    struct pxtcp *pxtcp = (struct pxtcp *)ctx;
    LWIP_ASSERT1(pxtcp != NULL);

    if (pxtcp->pcb != NULL) {
        pxtcp->inbound_pull = 1;

        if (pxtcp->pmhdl.slot >= 0) {
            DPRINTF(("%s: pxtcp %p: pcb %p\n",
                     __func__, (void *)pxtcp, (void *)pxtcp->pcb));
        }
        else {
            /* negative slot: also mark the pxtcp for deferred delete */
            DPRINTF(("%s: pxtcp %p: pcb %p (deferred delete)\n",
                     __func__, (void *)pxtcp, (void *)pxtcp->pcb));
            pxtcp->deferred_delete = 1;
        }

        /* len == 0 tells pxtcp_pcb_sent() to start pulling */
        pxtcp_pcb_sent(pxtcp, pxtcp->pcb, 0);
    }
    else {
        /* nobody left to forward the data to */
        DPRINTF(("%s: pxtcp %p: PCB IS GONE\n", __func__, (void *)pxtcp));
        pxtcp_pcb_reset_pxtcp(pxtcp);
    }
}
#endif /* HAVE_TCP_POLLHUP */
2459
2460
2461/**
2462 * tcp_err() callback.
2463 *
2464 * pcb is not passed to this callback since it may be already
2465 * deallocated by the stack, but we can't do anything useful with it
2466 * anyway since connection is gone.
2467 */
2468static void
2469pxtcp_pcb_err(void *arg, err_t error)
2470{
2471 struct pxtcp *pxtcp = (struct pxtcp *)arg;
2472 LWIP_ASSERT1(pxtcp != NULL);
2473
2474 /*
2475 * ERR_CLSD is special - it is reported here when:
2476 *
2477 * . guest has already half-closed
2478 * . we send FIN to guest when external half-closes
2479 * . guest acks that FIN
2480 *
2481 * Since connection is closed but receive has been already closed
2482 * lwip can only report this via tcp_err. At this point the pcb
2483 * is still alive, so we can peek at it if need be.
2484 *
2485 * The interesting twist is when the ACK from guest that akcs our
2486 * FIN also acks some data. In this scenario lwip will NOT call
2487 * tcp_sent() callback with the ACK for that last bit of data but
2488 * instead will call tcp_err with ERR_CLSD right away. Since that
2489 * ACK also acknowledges all the data, we should run some of
2490 * pxtcp_pcb_sent() logic here.
2491 */
2492 if (error == ERR_CLSD) {
2493 struct tcp_pcb *pcb = pxtcp->pcb; /* still alive */
2494
2495 DPRINTF2(("ERR_CLSD: pxtcp %p; pcb %p:"
2496 " pcb->acked %d;"
2497 " unacked %d, unsent %d, vacant %d\n",
2498 (void *)pxtcp, (void *)pcb,
2499 pcb->acked,
2500 (int)pxtcp->inbuf.unacked,
2501 (int)pxtcp->inbuf.unsent,
2502 (int)pxtcp->inbuf.vacant));
2503
2504 LWIP_ASSERT1(pxtcp->pcb == pcb);
2505 LWIP_ASSERT1(pcb->callback_arg == pxtcp);
2506
2507 if (pcb->acked > 0) {
2508 pxtcp_pcb_sent(pxtcp, pcb, pcb->acked);
2509 }
2510 return;
2511 }
2512
2513 DPRINTF0(("tcp_err: pxtcp=%p, error=%s\n",
2514 (void *)pxtcp, proxy_lwip_strerr(error)));
2515
2516 pxtcp->pcb = NULL; /* pcb is gone */
2517 if (pxtcp->deferred_delete) {
2518 pxtcp_pcb_reset_pxtcp(pxtcp);
2519 }
2520 else {
2521 pxtcp_chan_send_weak(POLLMGR_CHAN_PXTCP_RESET, pxtcp);
2522 }
2523}
Note: See TracBrowser for help on using the repository browser.

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette