VirtualBox

source: vbox/trunk/src/VBox/Devices/Network/slirp/tcp_input.c@ 103068

Last change on this file since 103068 was 100886, checked in by vboxsync, 13 months ago

Devices/Network/slirp: Check the complete network address instead of just the host part, fixes accessing any IP ending with .2, ticketref:21513

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 67.1 KB
Line 
1/* $Id: tcp_input.c 100886 2023-08-16 17:22:30Z vboxsync $ */
2/** @file
3 * NAT - TCP input.
4 */
5
6/*
7 * Copyright (C) 2006-2023 Oracle and/or its affiliates.
8 *
9 * This file is part of VirtualBox base platform packages, as
10 * available from https://www.virtualbox.org.
11 *
12 * This program is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU General Public License
14 * as published by the Free Software Foundation, in version 3 of the
15 * License.
16 *
17 * This program is distributed in the hope that it will be useful, but
18 * WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20 * General Public License for more details.
21 *
22 * You should have received a copy of the GNU General Public License
23 * along with this program; if not, see <https://www.gnu.org/licenses>.
24 *
25 * SPDX-License-Identifier: GPL-3.0-only
26 */
27
28/*
29 * This code is based on:
30 *
31 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994
32 * The Regents of the University of California. All rights reserved.
33 *
34 * Redistribution and use in source and binary forms, with or without
35 * modification, are permitted provided that the following conditions
36 * are met:
37 * 1. Redistributions of source code must retain the above copyright
38 * notice, this list of conditions and the following disclaimer.
39 * 2. Redistributions in binary form must reproduce the above copyright
40 * notice, this list of conditions and the following disclaimer in the
41 * documentation and/or other materials provided with the distribution.
42 * 3. Neither the name of the University nor the names of its contributors
43 * may be used to endorse or promote products derived from this software
44 * without specific prior written permission.
45 *
46 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
47 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
48 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
49 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
50 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
51 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
52 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
53 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
54 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
55 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
56 * SUCH DAMAGE.
57 *
58 * @(#)tcp_input.c 8.5 (Berkeley) 4/10/94
59 * tcp_input.c,v 1.10 1994/10/13 18:36:32 wollman Exp
60 */
61
62/*
63 * Changes and additions relating to SLiRP
64 * Copyright (c) 1995 Danny Gasparovski.
65 *
66 * Please read the file COPYRIGHT for the
67 * terms and conditions of the copyright.
68 */
69
70#include <slirp.h>
71#include "ip_icmp.h"
72
73
74#if 0 /* the code using these macros is commented out */
75# define TCP_PAWS_IDLE (24 * 24 * 60 * 60 * PR_SLOWHZ)
76
77/* for modulo comparisons of timestamps */
78# define TSTMP_LT(a, b) ((int)((a)-(b)) < 0)
79# define TSTMP_GEQ(a, b) ((int)((a)-(b)) >= 0)
80#endif
81
/*
 * DELAY_ACK(tp, ti) - note on the TCP control block how the next ACK is to
 * be scheduled.
 *
 * Without TCP_ACK_HACK: a segment carrying TH_PUSH sets TF_ACKNOW, any other
 * segment sets TF_DELACK.  With TCP_ACK_HACK: TF_DELACK is always set and
 * the second argument is ignored.
 *
 * The expansion is a single statement (do { } while (0)), so the macro must
 * be invoked with a trailing semicolon and is safe inside an un-braced
 * if/else.  Arguments are parenthesized so that arbitrary pointer
 * expressions may be passed.
 */
#ifndef TCP_ACK_HACK
# define DELAY_ACK(tp, ti) \
    do { \
        if ((ti)->ti_flags & TH_PUSH) \
            (tp)->t_flags |= TF_ACKNOW; \
        else \
            (tp)->t_flags |= TF_DELACK; \
    } while (0)
#else /* TCP_ACK_HACK */
# define DELAY_ACK(tp, ign) \
    do { \
        (tp)->t_flags |= TF_DELACK; \
    } while (0)
#endif /* !TCP_ACK_HACK */
92
93
94/*
95 * deps: netinet/tcp_reass.c
96 * tcp_reass_maxqlen = 48 (deafault)
97 * tcp_reass_maxseg = nmbclusters/16 (nmbclusters = 1024 + maxusers * 64 from kern/kern_mbuf.c let's say 256)
98 */
99int
100tcp_reass(PNATState pData, struct tcpcb *tp, struct tcphdr *th, int *tlenp, struct mbuf *m)
101{
102 struct tseg_qent *q;
103 struct tseg_qent *p = NULL;
104 struct tseg_qent *nq;
105 struct tseg_qent *te = NULL;
106 struct socket *so = tp->t_socket;
107 int flags;
108 STAM_PROFILE_START(&pData->StatTCP_reassamble, tcp_reassamble);
109 LogFlowFunc(("ENTER: pData:%p, tp:%R[tcpcb793], th:%p, tlenp:%p, m:%p\n", pData, tp, th, tlenp, m));
110
111 /*
112 * XXX: tcp_reass() is rather inefficient with its data structures
113 * and should be rewritten (see NetBSD for optimizations). While
114 * doing that it should move to its own file tcp_reass.c.
115 */
116
117 /*
118 * Call with th==NULL after become established to
119 * force pre-ESTABLISHED data up to user socket.
120 */
121 if (th == NULL)
122 {
123 LogFlowFunc(("%d -> present\n", __LINE__));
124 goto present;
125 }
126
127 /*
128 * Limit the number of segments in the reassembly queue to prevent
129 * holding on to too many segments (and thus running out of mbufs).
130 * Make sure to let the missing segment through which caused this
131 * queue. Always keep one global queue entry spare to be able to
132 * process the missing segment.
133 */
134 if ( th->th_seq != tp->rcv_nxt
135 && ( tcp_reass_qsize + 1 >= tcp_reass_maxseg
136 || tp->t_segqlen >= tcp_reass_maxqlen))
137 {
138 tcp_reass_overflows++;
139 tcpstat.tcps_rcvmemdrop++;
140 m_freem(pData, m);
141 *tlenp = 0;
142 STAM_PROFILE_STOP(&pData->StatTCP_reassamble, tcp_reassamble);
143 LogFlowFuncLeave();
144 return (0);
145 }
146
147 /*
148 * Allocate a new queue entry. If we can't, or hit the zone limit
149 * just drop the pkt.
150 */
151 te = RTMemAlloc(sizeof(struct tseg_qent));
152 if (te == NULL)
153 {
154 tcpstat.tcps_rcvmemdrop++;
155 m_freem(pData, m);
156 *tlenp = 0;
157 STAM_PROFILE_STOP(&pData->StatTCP_reassamble, tcp_reassamble);
158 LogFlowFuncLeave();
159 return (0);
160 }
161 tp->t_segqlen++;
162 tcp_reass_qsize++;
163
164 /*
165 * Find a segment which begins after this one does.
166 */
167 LIST_FOREACH(q, &tp->t_segq, tqe_q)
168 {
169 if (SEQ_GT(q->tqe_th->th_seq, th->th_seq))
170 break;
171 p = q;
172 }
173
174 /*
175 * If there is a preceding segment, it may provide some of
176 * our data already. If so, drop the data from the incoming
177 * segment. If it provides all of our data, drop us.
178 */
179 if (p != NULL)
180 {
181 int i;
182 /* conversion to int (in i) handles seq wraparound */
183 i = p->tqe_th->th_seq + p->tqe_len - th->th_seq;
184 if (i > 0)
185 {
186 if (i >= *tlenp)
187 {
188 tcpstat.tcps_rcvduppack++;
189 tcpstat.tcps_rcvdupbyte += *tlenp;
190 m_freem(pData, m);
191 RTMemFree(te);
192 tp->t_segqlen--;
193 tcp_reass_qsize--;
194 /*
195 * Try to present any queued data
196 * at the left window edge to the user.
197 * This is needed after the 3-WHS
198 * completes.
199 */
200 LogFlowFunc(("%d -> present\n", __LINE__));
201 goto present; /* ??? */
202 }
203 m_adj(m, i);
204 *tlenp -= i;
205 th->th_seq += i;
206 }
207 }
208 tcpstat.tcps_rcvoopack++;
209 tcpstat.tcps_rcvoobyte += *tlenp;
210
211 /*
212 * While we overlap succeeding segments trim them or,
213 * if they are completely covered, dequeue them.
214 */
215 while (q)
216 {
217 int i = (th->th_seq + *tlenp) - q->tqe_th->th_seq;
218 if (i <= 0)
219 break;
220 if (i < q->tqe_len)
221 {
222 q->tqe_th->th_seq += i;
223 q->tqe_len -= i;
224 m_adj(q->tqe_m, i);
225 break;
226 }
227
228 nq = LIST_NEXT(q, tqe_q);
229 LIST_REMOVE(q, tqe_q);
230 m_freem(pData, q->tqe_m);
231 RTMemFree(q);
232 tp->t_segqlen--;
233 tcp_reass_qsize--;
234 q = nq;
235 }
236
237 /* Insert the new segment queue entry into place. */
238 te->tqe_m = m;
239 te->tqe_th = th;
240 te->tqe_len = *tlenp;
241
242 if (p == NULL)
243 {
244 LIST_INSERT_HEAD(&tp->t_segq, te, tqe_q);
245 }
246 else
247 {
248 LIST_INSERT_AFTER(p, te, tqe_q);
249 }
250
251present:
252 /*
253 * Present data to user, advancing rcv_nxt through
254 * completed sequence space.
255 */
256 if (!TCPS_HAVEESTABLISHED(tp->t_state))
257 {
258 STAM_PROFILE_STOP(&pData->StatTCP_reassamble, tcp_reassamble);
259 return (0);
260 }
261 q = LIST_FIRST(&tp->t_segq);
262 if (!q || q->tqe_th->th_seq != tp->rcv_nxt)
263 {
264 STAM_PROFILE_STOP(&pData->StatTCP_reassamble, tcp_reassamble);
265 return (0);
266 }
267 do
268 {
269 tp->rcv_nxt += q->tqe_len;
270 flags = q->tqe_th->th_flags & TH_FIN;
271 nq = LIST_NEXT(q, tqe_q);
272 LIST_REMOVE(q, tqe_q);
273 /* XXX: This place should be checked for the same code in
274 * original BSD code for Slirp and current BSD used SS_FCANTRCVMORE
275 */
276 if (so->so_state & SS_FCANTSENDMORE)
277 m_freem(pData, q->tqe_m);
278 else
279 sbappend(pData, so, q->tqe_m);
280 RTMemFree(q);
281 tp->t_segqlen--;
282 tcp_reass_qsize--;
283 q = nq;
284 }
285 while (q && q->tqe_th->th_seq == tp->rcv_nxt);
286
287 STAM_PROFILE_STOP(&pData->StatTCP_reassamble, tcp_reassamble);
288 return flags;
289}
290
291/*
292 * TCP input routine, follows pages 65-76 of the
293 * protocol specification dated September, 1981 very closely.
294 */
295void
296tcp_input(PNATState pData, register struct mbuf *m, int iphlen, struct socket *inso)
297{
298 struct ip *ip, *save_ip;
299 register struct tcpiphdr *ti;
300 caddr_t optp = NULL;
301 int optlen = 0;
302 int len, off;
303 int tlen = 0; /* Shut up MSC (didn't check whether MSC was right). */
304 register struct tcpcb *tp = 0;
305 register int tiflags;
306 struct socket *so = 0;
307 int todrop, acked, ourfinisacked, needoutput = 0;
308/* int dropsocket = 0; */
309 int iss = 0;
310 u_long tiwin;
311/* int ts_present = 0; */
312 unsigned ohdrlen;
313 uint8_t ohdr[60 + 8]; /* max IP header plus 8 bytes of payload for icmp */
314
315 STAM_PROFILE_START(&pData->StatTCP_input, counter_input);
316
317 LogFlow(("tcp_input: m = %p, iphlen = %2d, inso = %R[natsock]\n", m, iphlen, inso));
318
319 if (inso != NULL)
320 {
321 QSOCKET_LOCK(tcb);
322 SOCKET_LOCK(inso);
323 QSOCKET_UNLOCK(tcb);
324 }
325 /*
326 * If called with m == 0, then we're continuing the connect
327 */
328 if (m == NULL)
329 {
330 so = inso;
331 Log4(("NAT: tcp_input: %R[natsock]\n", so));
332
333 /* Re-set a few variables */
334 tp = sototcpcb(so);
335
336 m = so->so_m;
337 optp = so->so_optp; /* points into m if set */
338 optlen = so->so_optlen;
339 so->so_m = NULL;
340 so->so_optp = 0;
341 so->so_optlen = 0;
342
343 if (RT_LIKELY(so->so_ohdr != NULL))
344 {
345 RTMemFree(so->so_ohdr);
346 so->so_ohdr = NULL;
347 }
348
349 ti = so->so_ti;
350
351 /** @todo (vvl) clarify why it might happens */
352 if (ti == NULL)
353 {
354 LogRel(("NAT: ti is null. can't do any reseting connection actions\n"));
355 /* mbuf should be cleared in sofree called from tcp_close */
356 tcp_close(pData, tp);
357 STAM_PROFILE_STOP(&pData->StatTCP_input, counter_input);
358 LogFlowFuncLeave();
359 return;
360 }
361
362 tiwin = ti->ti_win;
363 tiflags = ti->ti_flags;
364
365 LogFlowFunc(("%d -> cont_conn\n", __LINE__));
366 goto cont_conn;
367 }
368
369 tcpstat.tcps_rcvtotal++;
370
371 ip = mtod(m, struct ip *);
372
373 /* ip_input() subtracts iphlen from ip::ip_len */
374 AssertStmt(ip->ip_len + iphlen == (ssize_t)m_length(m, NULL), goto drop);
375 if (RT_UNLIKELY(ip->ip_len < sizeof(struct tcphdr)))
376 {
377 /* tcps_rcvshort++; */
378 goto drop;
379 }
380
381 /*
382 * Save a copy of the IP header in case we want to restore it for
383 * sending an ICMP error message in response.
384 *
385 * XXX: This function should really be fixed to not strip IP
386 * options, to not overwrite IP header and to use "tlen" local
387 * variable (instead of ti->ti_len), then "m" could be passed to
388 * icmp_error() directly.
389 */
390 ohdrlen = iphlen + 8;
391 m_copydata(m, 0, ohdrlen, (caddr_t)ohdr);
392 save_ip = (struct ip *)ohdr;
393 save_ip->ip_len += iphlen; /* undo change by ip_input() */
394
395
396 /*
397 * Get IP and TCP header together in first mbuf.
398 * Note: IP leaves IP header in first mbuf.
399 */
400 ti = mtod(m, struct tcpiphdr *);
401 if (iphlen > sizeof(struct ip))
402 {
403 ip_stripoptions(m, (struct mbuf *)0);
404 iphlen = sizeof(struct ip);
405 }
406
407 /*
408 * Checksum extended TCP header and data.
409 */
410 tlen = ((struct ip *)ti)->ip_len;
411 memset(ti->ti_x1, 0, 9);
412 ti->ti_len = RT_H2N_U16((u_int16_t)tlen);
413 len = sizeof(struct ip) + tlen;
414 /* keep checksum for ICMP reply
415 * ti->ti_sum = cksum(m, len);
416 * if (ti->ti_sum) { */
417 if (cksum(m, len))
418 {
419 tcpstat.tcps_rcvbadsum++;
420 LogFlowFunc(("%d -> drop\n", __LINE__));
421 goto drop;
422 }
423
424 /*
425 * Check that TCP offset makes sense,
426 * pull out TCP options and adjust length. XXX
427 */
428 off = ti->ti_off << 2;
429 if ( off < sizeof (struct tcphdr)
430 || off > tlen)
431 {
432 tcpstat.tcps_rcvbadoff++;
433 LogFlowFunc(("%d -> drop\n", __LINE__));
434 goto drop;
435 }
436 tlen -= off;
437 ti->ti_len = tlen;
438 if (off > sizeof (struct tcphdr))
439 {
440 optlen = off - sizeof (struct tcphdr);
441 optp = mtod(m, caddr_t) + sizeof (struct tcpiphdr);
442
443 /*
444 * Do quick retrieval of timestamp options ("options
445 * prediction?"). If timestamp is the only option and it's
446 * formatted as recommended in RFC 1323 appendix A, we
447 * quickly get the values now and not bother calling
448 * tcp_dooptions(), etc.
449 */
450#if 0
451 if (( optlen == TCPOLEN_TSTAMP_APPA
452 || ( optlen > TCPOLEN_TSTAMP_APPA
453 && optp[TCPOLEN_TSTAMP_APPA] == TCPOPT_EOL)) &&
454 *(u_int32_t *)optp == RT_H2N_U32_C(TCPOPT_TSTAMP_HDR) &&
455 (ti->ti_flags & TH_SYN) == 0)
456 {
457 ts_present = 1;
458 ts_val = RT_N2H_U32(*(u_int32_t *)(optp + 4));
459 ts_ecr = RT_N2H_U32(*(u_int32_t *)(optp + 8));
460 optp = NULL; / * we have parsed the options * /
461 }
462#endif
463 }
464 tiflags = ti->ti_flags;
465
466 /*
467 * Convert TCP protocol specific fields to host format.
468 */
469 NTOHL(ti->ti_seq);
470 NTOHL(ti->ti_ack);
471 NTOHS(ti->ti_win);
472 NTOHS(ti->ti_urp);
473
474 /*
475 * Drop TCP, IP headers and TCP options.
476 */
477 m->m_data += sizeof(struct tcpiphdr)+off-sizeof(struct tcphdr);
478 m->m_len -= sizeof(struct tcpiphdr)+off-sizeof(struct tcphdr);
479
480 /*
481 * Locate pcb for segment.
482 */
483findso:
484 LogFlowFunc(("(enter) findso: %R[natsock]\n", so));
485 if (so != NULL && so != &tcb)
486 SOCKET_UNLOCK(so);
487 QSOCKET_LOCK(tcb);
488 so = tcp_last_so;
489 if ( so->so_fport != ti->ti_dport
490 || so->so_lport != ti->ti_sport
491 || so->so_laddr.s_addr != ti->ti_src.s_addr
492 || so->so_faddr.s_addr != ti->ti_dst.s_addr)
493 {
494 QSOCKET_UNLOCK(tcb);
495 /** @todo fix SOLOOKUP macrodefinition to be usable here */
496 so = solookup(&tcb, ti->ti_src, ti->ti_sport,
497 ti->ti_dst, ti->ti_dport);
498 if (so)
499 {
500 tcp_last_so = so;
501 }
502 ++tcpstat.tcps_socachemiss;
503 }
504 else
505 {
506 SOCKET_LOCK(so);
507 QSOCKET_UNLOCK(tcb);
508 }
509 LogFlowFunc(("(leave) findso: %R[natsock]\n", so));
510
511 /*
512 * Check whether the packet is targeting CTL_ALIAS and drop it if the connection wasn't
513 * initiated by localhost (so == NULL), see @bugref{9896}.
514 */
515 if ( (CTL_CHECK(ti->ti_dst.s_addr, CTL_ALIAS))
516 && !pData->fLocalhostReachable
517 && !so)
518 {
519 LogFlowFunc(("Packet for CTL_ALIAS and fLocalhostReachable=false so=NULL -> drop\n"));
520 goto drop;
521 }
522
523 /*
524 * If the state is CLOSED (i.e., TCB does not exist) then
525 * all data in the incoming segment is discarded.
526 * If the TCB exists but is in CLOSED state, it is embryonic,
527 * but should either do a listen or a connect soon.
528 *
529 * state == CLOSED means we've done socreate() but haven't
530 * attached it to a protocol yet...
531 *
532 * XXX If a TCB does not exist, and the TH_SYN flag is
533 * the only flag set, then create a session, mark it
534 * as if it was LISTENING, and continue...
535 */
536 if (so == 0)
537 {
538 if ((tiflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) != TH_SYN)
539 {
540 LogFlowFunc(("%d -> dropwithreset\n", __LINE__));
541 goto dropwithreset;
542 }
543
544 if ((so = socreate()) == NULL)
545 {
546 LogFlowFunc(("%d -> dropwithreset\n", __LINE__));
547 goto dropwithreset;
548 }
549 if (tcp_attach(pData, so) < 0)
550 {
551 RTMemFree(so); /* Not sofree (if it failed, it's not insqued) */
552 LogFlowFunc(("%d -> dropwithreset\n", __LINE__));
553 goto dropwithreset;
554 }
555 SOCKET_LOCK(so);
556 sbreserve(pData, &so->so_snd, tcp_sndspace);
557 sbreserve(pData, &so->so_rcv, tcp_rcvspace);
558
559/* tcp_last_so = so; */ /* XXX ? */
560/* tp = sototcpcb(so); */
561
562 so->so_laddr = ti->ti_src;
563 so->so_lport = ti->ti_sport;
564 so->so_faddr = ti->ti_dst;
565 so->so_fport = ti->ti_dport;
566
567 so->so_iptos = ((struct ip *)ti)->ip_tos;
568
569 tp = sototcpcb(so);
570 TCP_STATE_SWITCH_TO(tp, TCPS_LISTEN);
571 }
572
573 /*
574 * If this is a still-connecting socket, this probably
575 * a retransmit of the SYN. Whether it's a retransmit SYN
576 * or something else, we nuke it.
577 */
578 if (so->so_state & SS_ISFCONNECTING)
579 {
580 LogFlowFunc(("%d -> drop\n", __LINE__));
581 goto drop;
582 }
583
584 tp = sototcpcb(so);
585
586 /* XXX Should never fail */
587 if (tp == 0)
588 {
589 LogFlowFunc(("%d -> dropwithreset\n", __LINE__));
590 goto dropwithreset;
591 }
592 if (tp->t_state == TCPS_CLOSED)
593 {
594 LogFlowFunc(("%d -> drop\n", __LINE__));
595 goto drop;
596 }
597
598 /* Unscale the window into a 32-bit value. */
599/* if ((tiflags & TH_SYN) == 0)
600 * tiwin = ti->ti_win << tp->snd_scale;
601 * else
602 */
603 tiwin = ti->ti_win;
604
605 /*
606 * Segment received on connection.
607 * Reset idle time and keep-alive timer.
608 */
609 tp->t_idle = 0;
610 if (so_options)
611 tp->t_timer[TCPT_KEEP] = tcp_keepintvl;
612 else
613 tp->t_timer[TCPT_KEEP] = tcp_keepidle;
614
615 /*
616 * Process options if not in LISTEN state,
617 * else do it below (after getting remote address).
618 */
619 if (optp && tp->t_state != TCPS_LISTEN)
620 tcp_dooptions(pData, tp, (u_char *)optp, optlen, ti);
621/* , */
622/* &ts_present, &ts_val, &ts_ecr); */
623
624 /*
625 * Header prediction: check for the two common cases
626 * of a uni-directional data xfer. If the packet has
627 * no control flags, is in-sequence, the window didn't
628 * change and we're not retransmitting, it's a
629 * candidate. If the length is zero and the ack moved
630 * forward, we're the sender side of the xfer. Just
631 * free the data acked & wake any higher level process
632 * that was blocked waiting for space. If the length
633 * is non-zero and the ack didn't move, we're the
634 * receiver side. If we're getting packets in-order
635 * (the reassembly queue is empty), add the data to
636 * the socket buffer and note that we need a delayed ack.
637 *
638 * XXX Some of these tests are not needed
639 * eg: the tiwin == tp->snd_wnd prevents many more
640 * predictions.. with no *real* advantage..
641 */
642 if ( tp->t_state == TCPS_ESTABLISHED
643 && (tiflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK
644/* && (!ts_present || TSTMP_GEQ(ts_val, tp->ts_recent)) */
645 && ti->ti_seq == tp->rcv_nxt
646 && tiwin && tiwin == tp->snd_wnd
647 && tp->snd_nxt == tp->snd_max)
648 {
649 /*
650 * If last ACK falls within this segment's sequence numbers,
651 * record the timestamp.
652 */
653#if 0
654 if (ts_present && SEQ_LEQ(ti->ti_seq, tp->last_ack_sent) &&
655 SEQ_LT(tp->last_ack_sent, ti->ti_seq + ti->ti_len))
656 {
657 tp->ts_recent_age = tcp_now;
658 tp->ts_recent = ts_val;
659 }
660#endif
661
662 if (ti->ti_len == 0)
663 {
664 if ( SEQ_GT(ti->ti_ack, tp->snd_una)
665 && SEQ_LEQ(ti->ti_ack, tp->snd_max)
666 && tp->snd_cwnd >= tp->snd_wnd)
667 {
668 /*
669 * this is a pure ack for outstanding data.
670 */
671 ++tcpstat.tcps_predack;
672#if 0
673 if (ts_present)
674 tcp_xmit_timer(tp, tcp_now-ts_ecr+1);
675 else
676#endif
677 if ( tp->t_rtt
678 && SEQ_GT(ti->ti_ack, tp->t_rtseq))
679 tcp_xmit_timer(pData, tp, tp->t_rtt);
680 acked = ti->ti_ack - tp->snd_una;
681 tcpstat.tcps_rcvackpack++;
682 tcpstat.tcps_rcvackbyte += acked;
683 sbdrop(&so->so_snd, acked);
684 tp->snd_una = ti->ti_ack;
685 m_freem(pData, m);
686
687 /*
688 * If all outstanding data are acked, stop
689 * retransmit timer, otherwise restart timer
690 * using current (possibly backed-off) value.
691 * If process is waiting for space,
692 * wakeup/selwakeup/signal. If data
693 * are ready to send, let tcp_output
694 * decide between more output or persist.
695 */
696 if (tp->snd_una == tp->snd_max)
697 tp->t_timer[TCPT_REXMT] = 0;
698 else if (tp->t_timer[TCPT_PERSIST] == 0)
699 tp->t_timer[TCPT_REXMT] = tp->t_rxtcur;
700
701 /*
702 * There's room in so_snd, sowwakup will read()
703 * from the socket if we can
704 */
705#if 0
706 if (so->so_snd.sb_flags & SB_NOTIFY)
707 sowwakeup(so);
708#endif
709 /*
710 * This is called because sowwakeup might have
711 * put data into so_snd. Since we don't so sowwakeup,
712 * we don't need this.. XXX???
713 */
714 if (SBUF_LEN(&so->so_snd))
715 (void) tcp_output(pData, tp);
716
717 SOCKET_UNLOCK(so);
718 STAM_PROFILE_STOP(&pData->StatTCP_input, counter_input);
719 return;
720 }
721 }
722 else if ( ti->ti_ack == tp->snd_una
723 && LIST_EMPTY(&tp->t_segq)
724 && ti->ti_len <= sbspace(&so->so_rcv))
725 {
726 /*
727 * this is a pure, in-sequence data packet
728 * with nothing on the reassembly queue and
729 * we have enough buffer space to take it.
730 */
731 ++tcpstat.tcps_preddat;
732 tp->rcv_nxt += ti->ti_len;
733 tcpstat.tcps_rcvpack++;
734 tcpstat.tcps_rcvbyte += ti->ti_len;
735 /*
736 * Add data to socket buffer.
737 */
738 sbappend(pData, so, m);
739
740 /*
741 * XXX This is called when data arrives. Later, check
742 * if we can actually write() to the socket
743 * XXX Need to check? It's be NON_BLOCKING
744 */
745/* sorwakeup(so); */
746
747 /*
748 * If this is a short packet, then ACK now - with Nagle
749 * congestion avoidance sender won't send more until
750 * he gets an ACK.
751 *
752 * It is better to not delay acks at all to maximize
753 * TCP throughput. See RFC 2581.
754 */
755 tp->t_flags |= TF_ACKNOW;
756 tcp_output(pData, tp);
757 SOCKET_UNLOCK(so);
758 STAM_PROFILE_STOP(&pData->StatTCP_input, counter_input);
759 return;
760 }
761 } /* header prediction */
762 /*
763 * Calculate amount of space in receive window,
764 * and then do TCP input processing.
765 * Receive window is amount of space in rcv queue,
766 * but not less than advertised window.
767 */
768 {
769 int win;
770 win = sbspace(&so->so_rcv);
771 if (win < 0)
772 win = 0;
773 tp->rcv_wnd = max(win, (int)(tp->rcv_adv - tp->rcv_nxt));
774 }
775
776 switch (tp->t_state)
777 {
778 /*
779 * If the state is LISTEN then ignore segment if it contains an RST.
780 * If the segment contains an ACK then it is bad and send a RST.
781 * If it does not contain a SYN then it is not interesting; drop it.
782 * Don't bother responding if the destination was a broadcast.
783 * Otherwise initialize tp->rcv_nxt, and tp->irs, select an initial
784 * tp->iss, and send a segment:
785 * <SEQ=ISS><ACK=RCV_NXT><CTL=SYN,ACK>
786 * Also initialize tp->snd_nxt to tp->iss+1 and tp->snd_una to tp->iss.
787 * Fill in remote peer address fields if not previously specified.
788 * Enter SYN_RECEIVED state, and process any other fields of this
789 * segment in this state.
790 */
791 case TCPS_LISTEN:
792 {
793 if (tiflags & TH_RST)
794 {
795 LogFlowFunc(("%d -> drop\n", __LINE__));
796 goto drop;
797 }
798 if (tiflags & TH_ACK)
799 {
800 LogFlowFunc(("%d -> dropwithreset\n", __LINE__));
801 goto dropwithreset;
802 }
803 if ((tiflags & TH_SYN) == 0)
804 {
805 LogFlowFunc(("%d -> drop\n", __LINE__));
806 goto drop;
807 }
808
809 /*
810 * This has way too many gotos...
811 * But a bit of spaghetti code never hurt anybody :)
812 */
813 if ( (tcp_fconnect(pData, so) == -1)
814 && errno != EINPROGRESS
815 && errno != EWOULDBLOCK)
816 {
817 u_char code = ICMP_UNREACH_NET;
818 Log2((" tcp fconnect errno = %d (%s)\n", errno, strerror(errno)));
819 if (errno == ECONNREFUSED)
820 {
821 /* ACK the SYN, send RST to refuse the connection */
822 tcp_respond(pData, tp, ti, m, ti->ti_seq+1, (tcp_seq)0,
823 TH_RST|TH_ACK);
824 }
825 else
826 {
827 if (errno == EHOSTUNREACH)
828 code = ICMP_UNREACH_HOST;
829 HTONL(ti->ti_seq); /* restore tcp header */
830 HTONL(ti->ti_ack);
831 HTONS(ti->ti_win);
832 HTONS(ti->ti_urp);
833 m->m_data -= sizeof(struct tcpiphdr)+off-sizeof(struct tcphdr);
834 m->m_len += sizeof(struct tcpiphdr)+off-sizeof(struct tcphdr);
835 *ip = *save_ip;
836 icmp_error(pData, m, ICMP_UNREACH, code, 0, strerror(errno));
837 tp->t_socket->so_m = NULL;
838 }
839 tp = tcp_close(pData, tp);
840 }
841 else
842 {
843 /*
844 * Haven't connected yet, save the current mbuf
845 * and ti, and return
846 * XXX Some OS's don't tell us whether the connect()
847 * succeeded or not. So we must time it out.
848 */
849 so->so_m = m;
850 so->so_ti = ti;
851 so->so_ohdr = RTMemDup(ohdr, ohdrlen);
852 so->so_optp = optp;
853 so->so_optlen = optlen;
854 tp->t_timer[TCPT_KEEP] = TCPTV_KEEP_INIT;
855 TCP_STATE_SWITCH_TO(tp, TCPS_SYN_RECEIVED);
856 }
857 SOCKET_UNLOCK(so);
858 STAM_PROFILE_STOP(&pData->StatTCP_input, counter_input);
859 LogFlowFuncLeave();
860 return;
861
862cont_conn:
863 /* m==NULL
864 * Check if the connect succeeded
865 */
866 LogFlowFunc(("cont_conn:\n"));
867 if (so->so_state & SS_NOFDREF)
868 {
869 tp = tcp_close(pData, tp);
870 LogFlowFunc(("%d -> dropwithreset\n", __LINE__));
871 goto dropwithreset;
872 }
873
874 tcp_template(tp);
875
876 if (optp)
877 tcp_dooptions(pData, tp, (u_char *)optp, optlen, ti);
878
879 if (iss)
880 tp->iss = iss;
881 else
882 tp->iss = tcp_iss;
883 tcp_iss += TCP_ISSINCR/2;
884 tp->irs = ti->ti_seq;
885 tcp_sendseqinit(tp);
886 tcp_rcvseqinit(tp);
887 tp->t_flags |= TF_ACKNOW;
888 TCP_STATE_SWITCH_TO(tp, TCPS_SYN_RECEIVED);
889 tp->t_timer[TCPT_KEEP] = TCPTV_KEEP_INIT;
890 tcpstat.tcps_accepts++;
891 LogFlowFunc(("%d -> trimthenstep6\n", __LINE__));
892 goto trimthenstep6;
893 } /* case TCPS_LISTEN */
894
895 /*
896 * If the state is SYN_SENT:
897 * if seg contains an ACK, but not for our SYN, drop the input.
898 * if seg contains a RST, then drop the connection.
899 * if seg does not contain SYN, then drop it.
900 * Otherwise this is an acceptable SYN segment
901 * initialize tp->rcv_nxt and tp->irs
902 * if seg contains ack then advance tp->snd_una
903 * if SYN has been acked change to ESTABLISHED else SYN_RCVD state
904 * arrange for segment to be acked (eventually)
905 * continue processing rest of data/controls, beginning with URG
906 */
907 case TCPS_SYN_SENT:
908 if ( (tiflags & TH_ACK)
909 && ( SEQ_LEQ(ti->ti_ack, tp->iss)
910 || SEQ_GT(ti->ti_ack, tp->snd_max)))
911 {
912 LogFlowFunc(("%d -> dropwithreset\n", __LINE__));
913 goto dropwithreset;
914 }
915
916 if (tiflags & TH_RST)
917 {
918 if (tiflags & TH_ACK)
919 tp = tcp_drop(pData, tp, 0); /* XXX Check t_softerror! */
920 LogFlowFunc(("%d -> drop\n", __LINE__));
921 goto drop;
922 }
923
924 if ((tiflags & TH_SYN) == 0)
925 {
926 LogFlowFunc(("%d -> drop\n", __LINE__));
927 goto drop;
928 }
929 if (tiflags & TH_ACK)
930 {
931 tp->snd_una = ti->ti_ack;
932 if (SEQ_LT(tp->snd_nxt, tp->snd_una))
933 tp->snd_nxt = tp->snd_una;
934 }
935
936 tp->t_timer[TCPT_REXMT] = 0;
937 tp->irs = ti->ti_seq;
938 tcp_rcvseqinit(tp);
939 tp->t_flags |= TF_ACKNOW;
940 if (tiflags & TH_ACK && SEQ_GT(tp->snd_una, tp->iss))
941 {
942 tcpstat.tcps_connects++;
943 soisfconnected(so);
944 TCP_STATE_SWITCH_TO(tp, TCPS_ESTABLISHED);
945
946 /* Do window scaling on this connection? */
947#if 0
948 if (( tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE))
949 == (TF_RCVD_SCALE|TF_REQ_SCALE))
950 {
951 tp->snd_scale = tp->requested_s_scale;
952 tp->rcv_scale = tp->request_r_scale;
953 }
954#endif
955 (void) tcp_reass(pData, tp, (struct tcphdr *)0, NULL, (struct mbuf *)0);
956 /*
957 * if we didn't have to retransmit the SYN,
958 * use its rtt as our initial srtt & rtt var.
959 */
960 if (tp->t_rtt)
961 tcp_xmit_timer(pData, tp, tp->t_rtt);
962 }
963 else
964 TCP_STATE_SWITCH_TO(tp, TCPS_SYN_RECEIVED);
965
966trimthenstep6:
967 LogFlowFunc(("trimthenstep6:\n"));
968 /*
969 * Advance ti->ti_seq to correspond to first data byte.
970 * If data, trim to stay within window,
971 * dropping FIN if necessary.
972 */
973 ti->ti_seq++;
974 if (ti->ti_len > tp->rcv_wnd)
975 {
976 todrop = ti->ti_len - tp->rcv_wnd;
977 m_adj(m, -todrop);
978 ti->ti_len = tp->rcv_wnd;
979 tiflags &= ~TH_FIN;
980 tcpstat.tcps_rcvpackafterwin++;
981 tcpstat.tcps_rcvbyteafterwin += todrop;
982 }
983 tp->snd_wl1 = ti->ti_seq - 1;
984 tp->rcv_up = ti->ti_seq;
985 LogFlowFunc(("%d -> step6\n", __LINE__));
986 goto step6;
987 } /* switch tp->t_state */
988 /*
989 * States other than LISTEN or SYN_SENT.
990 * First check timestamp, if present.
991 * Then check that at least some bytes of segment are within
992 * receive window. If segment begins before rcv_nxt,
993 * drop leading data (and SYN); if nothing left, just ack.
994 *
995 * RFC 1323 PAWS: If we have a timestamp reply on this segment
996 * and it's less than ts_recent, drop it.
997 */
998#if 0
999 if ( ts_present
1000 && (tiflags & TH_RST) == 0
1001 && tp->ts_recent
1002 && TSTMP_LT(ts_val, tp->ts_recent))
1003 {
1004 /* Check to see if ts_recent is over 24 days old. */
1005 if ((int)(tcp_now - tp->ts_recent_age) > TCP_PAWS_IDLE)
1006 {
1007 /*
1008 * Invalidate ts_recent. If this segment updates
1009 * ts_recent, the age will be reset later and ts_recent
1010 * will get a valid value. If it does not, setting
1011 * ts_recent to zero will at least satisfy the
1012 * requirement that zero be placed in the timestamp
1013 * echo reply when ts_recent isn't valid. The
1014 * age isn't reset until we get a valid ts_recent
1015 * because we don't want out-of-order segments to be
1016 * dropped when ts_recent is old.
1017 */
1018 tp->ts_recent = 0;
1019 }
1020 else
1021 {
1022 tcpstat.tcps_rcvduppack++;
1023 tcpstat.tcps_rcvdupbyte += ti->ti_len;
1024 tcpstat.tcps_pawsdrop++;
1025 goto dropafterack;
1026 }
1027 }
1028#endif
1029
1030 todrop = tp->rcv_nxt - ti->ti_seq;
1031 if (todrop > 0)
1032 {
1033 if (tiflags & TH_SYN)
1034 {
1035 tiflags &= ~TH_SYN;
1036 ti->ti_seq++;
1037 if (ti->ti_urp > 1)
1038 ti->ti_urp--;
1039 else
1040 tiflags &= ~TH_URG;
1041 todrop--;
1042 }
1043 /*
1044 * Following if statement from Stevens, vol. 2, p. 960.
1045 */
1046 if ( todrop > ti->ti_len
1047 || ( todrop == ti->ti_len
1048 && (tiflags & TH_FIN) == 0))
1049 {
1050 /*
1051 * Any valid FIN must be to the left of the window.
1052 * At this point the FIN must be a duplicate or out
1053 * of sequence; drop it.
1054 */
1055 tiflags &= ~TH_FIN;
1056
1057 /*
1058 * Send an ACK to resynchronize and drop any data.
1059 * But keep on processing for RST or ACK.
1060 */
1061 tp->t_flags |= TF_ACKNOW;
1062 todrop = ti->ti_len;
1063 tcpstat.tcps_rcvduppack++;
1064 tcpstat.tcps_rcvdupbyte += todrop;
1065 }
1066 else
1067 {
1068 tcpstat.tcps_rcvpartduppack++;
1069 tcpstat.tcps_rcvpartdupbyte += todrop;
1070 }
1071 m_adj(m, todrop);
1072 ti->ti_seq += todrop;
1073 ti->ti_len -= todrop;
1074 if (ti->ti_urp > todrop)
1075 ti->ti_urp -= todrop;
1076 else
1077 {
1078 tiflags &= ~TH_URG;
1079 ti->ti_urp = 0;
1080 }
1081 }
1082 /*
1083 * If new data are received on a connection after the
1084 * user processes are gone, then RST the other end.
1085 */
1086 if ( (so->so_state & SS_NOFDREF)
1087 && tp->t_state > TCPS_CLOSE_WAIT && ti->ti_len)
1088 {
1089 tp = tcp_close(pData, tp);
1090 tcpstat.tcps_rcvafterclose++;
1091 LogFlowFunc(("%d -> dropwithreset\n", __LINE__));
1092 goto dropwithreset;
1093 }
1094
1095 /*
1096 * If segment ends after window, drop trailing data
1097 * (and PUSH and FIN); if nothing left, just ACK.
1098 */
1099 todrop = (ti->ti_seq+ti->ti_len) - (tp->rcv_nxt+tp->rcv_wnd);
1100 if (todrop > 0)
1101 {
1102 tcpstat.tcps_rcvpackafterwin++;
1103 if (todrop >= ti->ti_len)
1104 {
1105 tcpstat.tcps_rcvbyteafterwin += ti->ti_len;
1106 /*
1107 * If a new connection request is received
1108 * while in TIME_WAIT, drop the old connection
1109 * and start over if the sequence numbers
1110 * are above the previous ones.
1111 */
1112 if ( tiflags & TH_SYN
1113 && tp->t_state == TCPS_TIME_WAIT
1114 && SEQ_GT(ti->ti_seq, tp->rcv_nxt))
1115 {
1116 iss = tp->rcv_nxt + TCP_ISSINCR;
1117 tp = tcp_close(pData, tp);
1118 SOCKET_UNLOCK(tp->t_socket);
1119 LogFlowFunc(("%d -> findso\n", __LINE__));
1120 goto findso;
1121 }
1122 /*
1123 * If window is closed can only take segments at
1124 * window edge, and have to drop data and PUSH from
1125 * incoming segments. Continue processing, but
1126 * remember to ack. Otherwise, drop segment
1127 * and ack.
1128 */
1129 if (tp->rcv_wnd == 0 && ti->ti_seq == tp->rcv_nxt)
1130 {
1131 tp->t_flags |= TF_ACKNOW;
1132 tcpstat.tcps_rcvwinprobe++;
1133 }
1134 else
1135 {
1136 LogFlowFunc(("%d -> dropafterack\n", __LINE__));
1137 goto dropafterack;
1138 }
1139 }
1140 else
1141 tcpstat.tcps_rcvbyteafterwin += todrop;
1142 m_adj(m, -todrop);
1143 ti->ti_len -= todrop;
1144 tiflags &= ~(TH_PUSH|TH_FIN);
1145 }
1146
1147 /*
1148 * If last ACK falls within this segment's sequence numbers,
1149 * record its timestamp.
1150 */
1151#if 0
1152 if ( ts_present
1153 && SEQ_LEQ(ti->ti_seq, tp->last_ack_sent)
1154 && SEQ_LT(tp->last_ack_sent, ti->ti_seq + ti->ti_len + ((tiflags & (TH_SYN|TH_FIN)) != 0)))
1155 {
1156 tp->ts_recent_age = tcp_now;
1157 tp->ts_recent = ts_val;
1158 }
1159#endif
1160
1161 /*
1162 * If the RST bit is set examine the state:
1163 * SYN_RECEIVED STATE:
1164 * If passive open, return to LISTEN state.
1165 * If active open, inform user that connection was refused.
1166 * ESTABLISHED, FIN_WAIT_1, FIN_WAIT2, CLOSE_WAIT STATES:
1167 * Inform user that connection was reset, and close tcb.
1168 * CLOSING, LAST_ACK, TIME_WAIT STATES
1169 * Close the tcb.
1170 */
1171 if (tiflags&TH_RST)
1172 switch (tp->t_state)
1173 {
1174 case TCPS_SYN_RECEIVED:
1175/* so->so_error = ECONNREFUSED; */
1176 LogFlowFunc(("%d -> close\n", __LINE__));
1177 goto close;
1178
1179 case TCPS_ESTABLISHED:
1180 case TCPS_FIN_WAIT_1:
1181 case TCPS_FIN_WAIT_2:
1182 case TCPS_CLOSE_WAIT:
1183/* so->so_error = ECONNRESET; */
1184close:
1185 LogFlowFunc(("close:\n"));
1186 TCP_STATE_SWITCH_TO(tp, TCPS_CLOSED);
1187 tcpstat.tcps_drops++;
1188 tp = tcp_close(pData, tp);
1189 LogFlowFunc(("%d -> drop\n", __LINE__));
1190 goto drop;
1191
1192 case TCPS_CLOSING:
1193 case TCPS_LAST_ACK:
1194 case TCPS_TIME_WAIT:
1195 tp = tcp_close(pData, tp);
1196 LogFlowFunc(("%d -> drop\n", __LINE__));
1197 goto drop;
1198 }
1199
1200 /*
1201 * If a SYN is in the window, then this is an
1202 * error and we send an RST and drop the connection.
1203 */
1204 if (tiflags & TH_SYN)
1205 {
1206 tp = tcp_drop(pData, tp, 0);
1207 LogFlowFunc(("%d -> dropwithreset\n", __LINE__));
1208 goto dropwithreset;
1209 }
1210
1211 /*
1212 * If the ACK bit is off we drop the segment and return.
1213 */
1214 if ((tiflags & TH_ACK) == 0)
1215 {
1216 LogFlowFunc(("%d -> drop\n", __LINE__));
1217 goto drop;
1218 }
1219
1220 /*
1221 * Ack processing.
1222 */
1223 switch (tp->t_state)
1224 {
1225 /*
1226 * In SYN_RECEIVED state if the ack ACKs our SYN then enter
1227 * ESTABLISHED state and continue processing, otherwise
1228 * send an RST. una<=ack<=max
1229 */
1230 case TCPS_SYN_RECEIVED:
1231 LogFlowFunc(("%d -> TCPS_SYN_RECEIVED\n", __LINE__));
1232 if ( SEQ_GT(tp->snd_una, ti->ti_ack)
1233 || SEQ_GT(ti->ti_ack, tp->snd_max))
1234 goto dropwithreset;
1235 tcpstat.tcps_connects++;
1236 TCP_STATE_SWITCH_TO(tp, TCPS_ESTABLISHED);
1237 /*
1238 * The sent SYN is ack'ed with our sequence number +1
1239 * The first data byte already in the buffer will get
1240 * lost if no correction is made. This is only needed for
1241 * SS_CTL since the buffer is empty otherwise.
1242 * tp->snd_una++; or:
1243 */
1244 tp->snd_una = ti->ti_ack;
1245 soisfconnected(so);
1246
1247 /* Do window scaling? */
1248#if 0
1249 if ( (tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE))
1250 == (TF_RCVD_SCALE|TF_REQ_SCALE))
1251 {
1252 tp->snd_scale = tp->requested_s_scale;
1253 tp->rcv_scale = tp->request_r_scale;
1254 }
1255#endif
1256 (void) tcp_reass(pData, tp, (struct tcphdr *)0, (int *)0, (struct mbuf *)0);
1257 tp->snd_wl1 = ti->ti_seq - 1;
1258 /* Avoid ack processing; snd_una==ti_ack => dup ack */
1259 LogFlowFunc(("%d -> synrx_to_est\n", __LINE__));
1260 goto synrx_to_est;
1261 /* fall into ... */
1262
1263 /*
1264 * In ESTABLISHED state: drop duplicate ACKs; ACK out of range
1265 * ACKs. If the ack is in the range
1266 * tp->snd_una < ti->ti_ack <= tp->snd_max
1267 * then advance tp->snd_una to ti->ti_ack and drop
1268 * data from the retransmission queue. If this ACK reflects
1269 * more up to date window information we update our window information.
1270 */
1271 case TCPS_ESTABLISHED:
1272 case TCPS_FIN_WAIT_1:
1273 case TCPS_FIN_WAIT_2:
1274 case TCPS_CLOSE_WAIT:
1275 case TCPS_CLOSING:
1276 case TCPS_LAST_ACK:
1277 case TCPS_TIME_WAIT:
1278 LogFlowFunc(("%d -> TCPS_ESTABLISHED|TCPS_FIN_WAIT_1|TCPS_FIN_WAIT_2|TCPS_CLOSE_WAIT|"
1279 "TCPS_CLOSING|TCPS_LAST_ACK|TCPS_TIME_WAIT\n", __LINE__));
1280 if (SEQ_LEQ(ti->ti_ack, tp->snd_una))
1281 {
1282 if (ti->ti_len == 0 && tiwin == tp->snd_wnd)
1283 {
1284 tcpstat.tcps_rcvdupack++;
1285 Log2((" dup ack m = %p, so = %p\n", m, so));
1286 /*
1287 * If we have outstanding data (other than
1288 * a window probe), this is a completely
1289 * duplicate ack (ie, window info didn't
1290 * change), the ack is the biggest we've
1291 * seen and we've seen exactly our rexmt
1292 * threshold of them, assume a packet
1293 * has been dropped and retransmit it.
1294 * Kludge snd_nxt & the congestion
1295 * window so we send only this one
1296 * packet.
1297 *
1298 * We know we're losing at the current
1299 * window size so do congestion avoidance
1300 * (set ssthresh to half the current window
1301 * and pull our congestion window back to
1302 * the new ssthresh).
1303 *
1304 * Dup acks mean that packets have left the
1305 * network (they're now cached at the receiver)
1306 * so bump cwnd by the amount in the receiver
1307 * to keep a constant cwnd packets in the
1308 * network.
1309 */
1310 if ( tp->t_timer[TCPT_REXMT] == 0
1311 || ti->ti_ack != tp->snd_una)
1312 tp->t_dupacks = 0;
1313 else if (++tp->t_dupacks == tcprexmtthresh)
1314 {
1315 tcp_seq onxt = tp->snd_nxt;
1316 u_int win = min(tp->snd_wnd, tp->snd_cwnd) / 2 / tp->t_maxseg;
1317 if (win < 2)
1318 win = 2;
1319 tp->snd_ssthresh = win * tp->t_maxseg;
1320 tp->t_timer[TCPT_REXMT] = 0;
1321 tp->t_rtt = 0;
1322 tp->snd_nxt = ti->ti_ack;
1323 tp->snd_cwnd = tp->t_maxseg;
1324 (void) tcp_output(pData, tp);
1325 tp->snd_cwnd = tp->snd_ssthresh +
1326 tp->t_maxseg * tp->t_dupacks;
1327 if (SEQ_GT(onxt, tp->snd_nxt))
1328 tp->snd_nxt = onxt;
1329 LogFlowFunc(("%d -> drop\n", __LINE__));
1330 goto drop;
1331 }
1332 else if (tp->t_dupacks > tcprexmtthresh)
1333 {
1334 tp->snd_cwnd += tp->t_maxseg;
1335 (void) tcp_output(pData, tp);
1336 LogFlowFunc(("%d -> drop\n", __LINE__));
1337 goto drop;
1338 }
1339 }
1340 else
1341 tp->t_dupacks = 0;
1342 break;
1343 }
1344synrx_to_est:
1345 LogFlowFunc(("synrx_to_est:\n"));
1346 /*
1347 * If the congestion window was inflated to account
1348 * for the other side's cached packets, retract it.
1349 */
1350 if ( tp->t_dupacks > tcprexmtthresh
1351 && tp->snd_cwnd > tp->snd_ssthresh)
1352 tp->snd_cwnd = tp->snd_ssthresh;
1353 tp->t_dupacks = 0;
1354 if (SEQ_GT(ti->ti_ack, tp->snd_max))
1355 {
1356 tcpstat.tcps_rcvacktoomuch++;
1357 LogFlowFunc(("%d -> dropafterack\n", __LINE__));
1358 goto dropafterack;
1359 }
1360 acked = ti->ti_ack - tp->snd_una;
1361 tcpstat.tcps_rcvackpack++;
1362 tcpstat.tcps_rcvackbyte += acked;
1363
1364 /*
1365 * If we have a timestamp reply, update smoothed
1366 * round trip time. If no timestamp is present but
1367 * transmit timer is running and timed sequence
1368 * number was acked, update smoothed round trip time.
1369 * Since we now have an rtt measurement, cancel the
1370 * timer backoff (cf., Phil Karn's retransmit alg.).
1371 * Recompute the initial retransmit timer.
1372 */
1373#if 0
1374 if (ts_present)
1375 tcp_xmit_timer(tp, tcp_now-ts_ecr+1);
1376 else
1377#endif
1378 if (tp->t_rtt && SEQ_GT(ti->ti_ack, tp->t_rtseq))
1379 tcp_xmit_timer(pData, tp, tp->t_rtt);
1380
1381 /*
1382 * If all outstanding data is acked, stop retransmit
1383 * timer and remember to restart (more output or persist).
1384 * If there is more data to be acked, restart retransmit
1385 * timer, using current (possibly backed-off) value.
1386 */
1387 if (ti->ti_ack == tp->snd_max)
1388 {
1389 tp->t_timer[TCPT_REXMT] = 0;
1390 needoutput = 1;
1391 }
1392 else if (tp->t_timer[TCPT_PERSIST] == 0)
1393 tp->t_timer[TCPT_REXMT] = tp->t_rxtcur;
1394 /*
1395 * When new data is acked, open the congestion window.
1396 * If the window gives us less than ssthresh packets
1397 * in flight, open exponentially (maxseg per packet).
1398 * Otherwise open linearly: maxseg per window
1399 * (maxseg^2 / cwnd per packet).
1400 */
1401 {
1402 register u_int cw = tp->snd_cwnd;
1403 register u_int incr = tp->t_maxseg;
1404
1405 if (cw > tp->snd_ssthresh)
1406 incr = incr * incr / cw;
1407 tp->snd_cwnd = min(cw + incr, TCP_MAXWIN<<tp->snd_scale);
1408 }
1409 if (acked > SBUF_LEN(&so->so_snd))
1410 {
1411 tp->snd_wnd -= SBUF_LEN(&so->so_snd);
1412 sbdrop(&so->so_snd, (int)so->so_snd.sb_cc);
1413 ourfinisacked = 1;
1414 }
1415 else
1416 {
1417 sbdrop(&so->so_snd, acked);
1418 tp->snd_wnd -= acked;
1419 ourfinisacked = 0;
1420 }
1421 /*
1422 * XXX sowwakeup is called when data is acked and there's room
1423 * for more data... it should read() the socket
1424 */
1425#if 0
1426 if (so->so_snd.sb_flags & SB_NOTIFY)
1427 sowwakeup(so);
1428#endif
1429 tp->snd_una = ti->ti_ack;
1430 if (SEQ_LT(tp->snd_nxt, tp->snd_una))
1431 tp->snd_nxt = tp->snd_una;
1432
1433 switch (tp->t_state)
1434 {
1435 /*
1436 * In FIN_WAIT_1 STATE in addition to the processing
1437 * for the ESTABLISHED state if our FIN is now acknowledged
1438 * then enter FIN_WAIT_2.
1439 */
1440 case TCPS_FIN_WAIT_1:
1441 if (ourfinisacked)
1442 {
1443 /*
1444 * If we can't receive any more
1445 * data, then closing user can proceed.
1446 * Starting the timer is contrary to the
1447 * specification, but if we don't get a FIN
1448 * we'll hang forever.
1449 */
1450 if (so->so_state & SS_FCANTRCVMORE)
1451 {
1452 soisfdisconnected(so);
1453 tp->t_timer[TCPT_2MSL] = tcp_maxidle;
1454 }
1455 TCP_STATE_SWITCH_TO(tp, TCPS_FIN_WAIT_2);
1456 }
1457 break;
1458
1459 /*
1460 * In CLOSING STATE in addition to the processing for
1461 * the ESTABLISHED state if the ACK acknowledges our FIN
1462 * then enter the TIME-WAIT state, otherwise ignore
1463 * the segment.
1464 */
1465 case TCPS_CLOSING:
1466 if (ourfinisacked)
1467 {
1468 TCP_STATE_SWITCH_TO(tp, TCPS_TIME_WAIT);
1469 tcp_canceltimers(tp);
1470 tp->t_timer[TCPT_2MSL] = 2 * TCPTV_MSL;
1471 soisfdisconnected(so);
1472 }
1473 break;
1474
1475 /*
1476 * In LAST_ACK, we may still be waiting for data to drain
1477 * and/or to be acked, as well as for the ack of our FIN.
1478 * If our FIN is now acknowledged, delete the TCB,
1479 * enter the closed state and return.
1480 */
1481 case TCPS_LAST_ACK:
1482 if (ourfinisacked)
1483 {
1484 tp = tcp_close(pData, tp);
1485 LogFlowFunc(("%d -> drop\n", __LINE__));
1486 goto drop;
1487 }
1488 break;
1489
1490 /*
1491 * In TIME_WAIT state the only thing that should arrive
1492 * is a retransmission of the remote FIN. Acknowledge
1493 * it and restart the finack timer.
1494 */
1495 case TCPS_TIME_WAIT:
1496 tp->t_timer[TCPT_2MSL] = 2 * TCPTV_MSL;
1497 LogFlowFunc(("%d -> dropafterack\n", __LINE__));
1498 goto dropafterack;
1499 }
1500 } /* switch(tp->t_state) */
1501
1502step6:
1503 LogFlowFunc(("step6:\n"));
1504 /*
1505 * Update window information.
1506 * Don't look at window if no ACK: TAC's send garbage on first SYN.
1507 */
1508 if ( (tiflags & TH_ACK)
1509 && ( SEQ_LT(tp->snd_wl1, ti->ti_seq)
1510 || ( tp->snd_wl1 == ti->ti_seq
1511 && ( SEQ_LT(tp->snd_wl2, ti->ti_ack)
1512 || ( tp->snd_wl2 == ti->ti_ack
1513 && tiwin > tp->snd_wnd)))))
1514 {
1515 /* keep track of pure window updates */
1516 if ( ti->ti_len == 0
1517 && tp->snd_wl2 == ti->ti_ack
1518 && tiwin > tp->snd_wnd)
1519 tcpstat.tcps_rcvwinupd++;
1520 tp->snd_wnd = tiwin;
1521 tp->snd_wl1 = ti->ti_seq;
1522 tp->snd_wl2 = ti->ti_ack;
1523 if (tp->snd_wnd > tp->max_sndwnd)
1524 tp->max_sndwnd = tp->snd_wnd;
1525 needoutput = 1;
1526 }
1527
1528 /*
1529 * Process segments with URG.
1530 */
1531 if ((tiflags & TH_URG) && ti->ti_urp &&
1532 TCPS_HAVERCVDFIN(tp->t_state) == 0)
1533 {
1534 /*
1535 * This is a kludge, but if we receive and accept
1536 * random urgent pointers, we'll crash in
1537 * soreceive. It's hard to imagine someone
1538 * actually wanting to send this much urgent data.
1539 */
1540 if (ti->ti_urp + so->so_rcv.sb_cc > so->so_rcv.sb_datalen)
1541 {
1542 ti->ti_urp = 0;
1543 tiflags &= ~TH_URG;
1544 LogFlowFunc(("%d -> dodata\n", __LINE__));
1545 goto dodata;
1546 }
1547
1548 /*
1549 * If this segment advances the known urgent pointer,
1550 * then mark the data stream. This should not happen
1551 * in CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since
1552 * a FIN has been received from the remote side.
1553 * In these states we ignore the URG.
1554 *
1555 * According to RFC961 (Assigned Protocols),
1556 * the urgent pointer points to the last octet
1557 * of urgent data. We continue, however,
1558 * to consider it to indicate the first octet
1559 * of data past the urgent section as the original
1560 * spec states (in one of two places).
1561 */
1562 if (SEQ_GT(ti->ti_seq+ti->ti_urp, tp->rcv_up))
1563 {
1564 tp->rcv_up = ti->ti_seq + ti->ti_urp;
1565 so->so_urgc = SBUF_LEN(&so->so_rcv) +
1566 (tp->rcv_up - tp->rcv_nxt); /* -1; */
1567 tp->rcv_up = ti->ti_seq + ti->ti_urp;
1568 }
1569 }
1570 else
1571 /*
1572 * If no out of band data is expected,
1573 * pull receive urgent pointer along
1574 * with the receive window.
1575 */
1576 if (SEQ_GT(tp->rcv_nxt, tp->rcv_up))
1577 tp->rcv_up = tp->rcv_nxt;
1578dodata:
1579 LogFlowFunc(("dodata:\n"));
1580
1581 /*
1582 * If this is a small packet, then ACK now - with Nagle
1583 * congestion avoidance sender won't send more until
1584 * he gets an ACK.
1585 *
1586 * XXX: In case you wonder... The magic "27" below is ESC that
1587 * presumably starts a terminal escape-sequence and that we want
1588 * to ACK ASAP. [Original slirp code had three different
1589 * heuristics to choose from here and in the header prediction case
1590 * above, but the commented out alternatives were lost and the
1591 * header prediction case that had an expanded comment about this
1592 * has been modified to always send an ACK].
1593 */
1594 if ( ti->ti_len
1595 && (unsigned)ti->ti_len <= 5
1596 && ((struct tcpiphdr_2 *)ti)->first_char == (char)27)
1597 {
1598 tp->t_flags |= TF_ACKNOW;
1599 }
1600
1601 /*
1602 * Process the segment text, merging it into the TCP sequencing queue,
1603 * and arranging for acknowledgment of receipt if necessary.
1604 * This process logically involves adjusting tp->rcv_wnd as data
1605 * is presented to the user (this happens in tcp_usrreq.c,
1606 * case PRU_RCVD). If a FIN has already been received on this
1607 * connection then we just ignore the text.
1608 */
1609 if ( (ti->ti_len || (tiflags&TH_FIN))
1610 && TCPS_HAVERCVDFIN(tp->t_state) == 0)
1611 {
1612 if ( ti->ti_seq == tp->rcv_nxt
1613 && LIST_EMPTY(&tp->t_segq)
1614 && tp->t_state == TCPS_ESTABLISHED)
1615 {
1616 DELAY_ACK(tp, ti); /* little bit different from BSD declaration see netinet/tcp_input.c */
1617 tp->rcv_nxt += tlen;
1618 tiflags = ti->ti_t.th_flags & TH_FIN;
1619 tcpstat.tcps_rcvpack++;
1620 tcpstat.tcps_rcvbyte += tlen;
1621 if (so->so_state & SS_FCANTRCVMORE)
1622 m_freem(pData, m);
1623 else
1624 sbappend(pData, so, m);
1625 }
1626 else
1627 {
1628 tiflags = tcp_reass(pData, tp, &ti->ti_t, &tlen, m);
1629 tp->t_flags |= TF_ACKNOW;
1630 }
1631 /*
1632 * Note the amount of data that peer has sent into
1633 * our window, in order to estimate the sender's
1634 * buffer size.
1635 */
1636 len = SBUF_SIZE(&so->so_rcv) - (tp->rcv_adv - tp->rcv_nxt);
1637 }
1638 else
1639 {
1640 m_freem(pData, m);
1641 tiflags &= ~TH_FIN;
1642 }
1643
1644 /*
1645 * If FIN is received ACK the FIN and let the user know
1646 * that the connection is closing.
1647 */
1648 if (tiflags & TH_FIN)
1649 {
1650 if (TCPS_HAVERCVDFIN(tp->t_state) == 0)
1651 {
1652 /*
1653 * If we receive a FIN we can't send more data,
1654 * set it SS_FDRAIN
1655 * Shutdown the socket if there is no rx data in the
1656 * buffer.
1657 * soread() is called on completion of shutdown() and
1658 * will go to TCPS_LAST_ACK, and use tcp_output()
1659 * to send the FIN.
1660 */
1661/* sofcantrcvmore(so); */
1662 sofwdrain(so);
1663
1664 tp->t_flags |= TF_ACKNOW;
1665 tp->rcv_nxt++;
1666 }
1667 switch (tp->t_state)
1668 {
1669 /*
1670 * In SYN_RECEIVED and ESTABLISHED STATES
1671 * enter the CLOSE_WAIT state.
1672 */
1673 case TCPS_SYN_RECEIVED:
1674 case TCPS_ESTABLISHED:
1675 TCP_STATE_SWITCH_TO(tp, TCPS_CLOSE_WAIT);
1676 break;
1677
1678 /*
1679 * If still in FIN_WAIT_1 STATE FIN has not been acked so
1680 * enter the CLOSING state.
1681 */
1682 case TCPS_FIN_WAIT_1:
1683 TCP_STATE_SWITCH_TO(tp, TCPS_CLOSING);
1684 break;
1685
1686 /*
1687 * In FIN_WAIT_2 state enter the TIME_WAIT state,
1688 * starting the time-wait timer, turning off the other
1689 * standard timers.
1690 */
1691 case TCPS_FIN_WAIT_2:
1692 TCP_STATE_SWITCH_TO(tp, TCPS_TIME_WAIT);
1693 tcp_canceltimers(tp);
1694 tp->t_timer[TCPT_2MSL] = 2 * TCPTV_MSL;
1695 soisfdisconnected(so);
1696 break;
1697
1698 /*
1699 * In TIME_WAIT state restart the 2 MSL time_wait timer.
1700 */
1701 case TCPS_TIME_WAIT:
1702 tp->t_timer[TCPT_2MSL] = 2 * TCPTV_MSL;
1703 break;
1704 }
1705 }
1706
1707 /*
1708 * Return any desired output.
1709 */
1710 if (needoutput || (tp->t_flags & TF_ACKNOW))
1711 tcp_output(pData, tp);
1712
1713 SOCKET_UNLOCK(so);
1714 STAM_PROFILE_STOP(&pData->StatTCP_input, counter_input);
1715 LogFlowFuncLeave();
1716 return;
1717
1718dropafterack:
1719 LogFlowFunc(("dropafterack:\n"));
1720 /*
1721 * Generate an ACK dropping incoming segment if it occupies
1722 * sequence space, where the ACK reflects our state.
1723 */
1724 if (tiflags & TH_RST)
1725 {
1726 LogFlowFunc(("%d -> drop\n", __LINE__));
1727 goto drop;
1728 }
1729 m_freem(pData, m);
1730 tp->t_flags |= TF_ACKNOW;
1731 (void) tcp_output(pData, tp);
1732 SOCKET_UNLOCK(so);
1733 STAM_PROFILE_STOP(&pData->StatTCP_input, counter_input);
1734 LogFlowFuncLeave();
1735 return;
1736
1737dropwithreset:
1738 LogFlowFunc(("dropwithreset:\n"));
1739 /* reuses m if m!=NULL, m_free() unnecessary */
1740 if (tiflags & TH_ACK)
1741 tcp_respond(pData, tp, ti, m, (tcp_seq)0, ti->ti_ack, TH_RST);
1742 else
1743 {
1744 if (tiflags & TH_SYN)
1745 ti->ti_len++;
1746 tcp_respond(pData, tp, ti, m, ti->ti_seq+ti->ti_len, (tcp_seq)0,
1747 TH_RST|TH_ACK);
1748 }
1749
1750 if (so != &tcb)
1751 SOCKET_UNLOCK(so);
1752 STAM_PROFILE_STOP(&pData->StatTCP_input, counter_input);
1753 LogFlowFuncLeave();
1754 return;
1755
1756drop:
1757 LogFlowFunc(("drop:\n"));
1758 /*
1759 * Drop space held by incoming segment and return.
1760 */
1761 m_freem(pData, m);
1762
1763#ifdef VBOX_WITH_SLIRP_MT
1764 if (RTCritSectIsOwned(&so->so_mutex))
1765 {
1766 SOCKET_UNLOCK(so);
1767 }
1768#endif
1769
1770 STAM_PROFILE_STOP(&pData->StatTCP_input, counter_input);
1771 LogFlowFuncLeave();
1772 return;
1773}
1774
1775
1776void
1777tcp_fconnect_failed(PNATState pData, struct socket *so, int sockerr)
1778{
1779 struct tcpcb *tp;
1780 int code;
1781
1782 Log2(("NAT: connect error %d %R[natsock]\n", sockerr, so));
1783
1784 Assert(so->so_state & SS_ISFCONNECTING);
1785 so->so_state = SS_NOFDREF;
1786
1787 if (sockerr == ECONNREFUSED || sockerr == ECONNRESET)
1788 {
1789 /* hand off to tcp_input():cont_conn to send RST */
1790 TCP_INPUT(pData, NULL, 0, so);
1791 return;
1792 }
1793
1794 tp = sototcpcb(so);
1795 if (RT_UNLIKELY(tp == NULL)) /* should never happen */
1796 {
1797 LogRel(("NAT: tp == NULL %R[natsock]\n", so));
1798 sofree(pData, so);
1799 return;
1800 }
1801
1802 if (sockerr == ENETUNREACH || sockerr == ENETDOWN)
1803 code = ICMP_UNREACH_NET;
1804 else if (sockerr == EHOSTUNREACH || sockerr == EHOSTDOWN)
1805 code = ICMP_UNREACH_HOST;
1806 else
1807 code = -1;
1808
1809 if (code >= 0)
1810 {
1811 struct ip *oip;
1812 unsigned ohdrlen;
1813 struct mbuf *m;
1814
1815 if (RT_UNLIKELY(so->so_ohdr == NULL))
1816 goto out;
1817
1818 oip = (struct ip *)so->so_ohdr;
1819 ohdrlen = oip->ip_hl * 4 + 8;
1820
1821 m = m_gethdr(pData, M_NOWAIT, MT_HEADER);
1822 if (RT_UNLIKELY(m == NULL))
1823 goto out;
1824
1825 m_copyback(pData, m, 0, ohdrlen, (caddr_t)so->so_ohdr);
1826 m->m_pkthdr.header = mtod(m, void *);
1827
1828 icmp_error(pData, m, ICMP_UNREACH, code, 0, NULL);
1829 }
1830
1831 out:
1832 tcp_close(pData, tp);
1833}
1834
1835
1836void
1837tcp_dooptions(PNATState pData, struct tcpcb *tp, u_char *cp, int cnt, struct tcpiphdr *ti)
1838{
1839 u_int16_t mss;
1840 int opt, optlen;
1841
1842 LogFlowFunc(("tcp_dooptions: tp = %R[tcpcb793], cnt=%i\n", tp, cnt));
1843
1844 for (; cnt > 0; cnt -= optlen, cp += optlen)
1845 {
1846 opt = cp[0];
1847 if (opt == TCPOPT_EOL)
1848 break;
1849 if (opt == TCPOPT_NOP)
1850 optlen = 1;
1851 else
1852 {
1853 optlen = cp[1];
1854 if (optlen <= 0)
1855 break;
1856 }
1857 switch (opt)
1858 {
1859 default:
1860 continue;
1861
1862 case TCPOPT_MAXSEG:
1863 if (optlen != TCPOLEN_MAXSEG)
1864 continue;
1865 if (!(ti->ti_flags & TH_SYN))
1866 continue;
1867 memcpy((char *) &mss, (char *) cp + 2, sizeof(mss));
1868 NTOHS(mss);
1869 (void) tcp_mss(pData, tp, mss); /* sets t_maxseg */
1870 break;
1871
1872#if 0
1873 case TCPOPT_WINDOW:
1874 if (optlen != TCPOLEN_WINDOW)
1875 continue;
1876 if (!(ti->ti_flags & TH_SYN))
1877 continue;
1878 tp->t_flags |= TF_RCVD_SCALE;
1879 tp->requested_s_scale = min(cp[2], TCP_MAX_WINSHIFT);
1880 break;
1881
1882 case TCPOPT_TIMESTAMP:
1883 if (optlen != TCPOLEN_TIMESTAMP)
1884 continue;
1885 *ts_present = 1;
1886 memcpy((char *) ts_val, (char *)cp + 2, sizeof(*ts_val));
1887 NTOHL(*ts_val);
1888 memcpy((char *) ts_ecr, (char *)cp + 6, sizeof(*ts_ecr));
1889 NTOHL(*ts_ecr);
1890
1891 /*
1892 * A timestamp received in a SYN makes
1893 * it ok to send timestamp requests and replies.
1894 */
1895 if (ti->ti_flags & TH_SYN)
1896 {
1897 tp->t_flags |= TF_RCVD_TSTMP;
1898 tp->ts_recent = *ts_val;
1899 tp->ts_recent_age = tcp_now;
1900 }
1901 break;
1902#endif
1903 }
1904 }
1905}
1906
1907
/*
 * Pull out of band byte out of a segment so
 * it doesn't appear in the user's data queue.
 * It is still reflected in the segment length for
 * sequencing purposes.
 *
 * Note: this function is currently compiled out (#if 0).
 */

#if 0
void
tcp_pulloutofband(struct socket *so, struct tcpiphdr *ti, struct mbuf *m)
{
    /* Offset of the out-of-band byte within the segment data. */
    int cnt = ti->ti_urp - 1;

    while (cnt >= 0)
    {
        if (m->m_len > cnt)
        {
            char *cp = mtod(m, caddr_t) + cnt;
            struct tcpcb *tp = sototcpcb(so);

            /* Remember the byte, then close the gap it leaves behind. */
            tp->t_iobc = *cp;
            tp->t_oobflags |= TCPOOB_HAVEDATA;
            /*
             * Source and destination overlap (the tail is shifted down by
             * one byte), so memmove() is required rather than memcpy().
             */
            memmove(cp, cp + 1, (unsigned)(m->m_len - cnt - 1));
            m->m_len--;
            return;
        }
        cnt -= m->m_len;
        m = m->m_next; /* XXX WRONG! Fix it! */
        if (m == 0)
            break;
    }
    panic("tcp_pulloutofband");
}
#endif
1942
1943/*
1944 * Collect new round-trip time estimate
1945 * and update averages and current timeout.
1946 */
1947
1948void
1949tcp_xmit_timer(PNATState pData, register struct tcpcb *tp, int rtt)
1950{
1951 register short delta;
1952
1953 LogFlowFunc(("ENTER: tcp_xmit_timer: tp = %R[tcpcb793] rtt = %d\n", tp, rtt));
1954
1955 tcpstat.tcps_rttupdated++;
1956 if (tp->t_srtt != 0)
1957 {
1958 /*
1959 * srtt is stored as fixed point with 3 bits after the
1960 * binary point (i.e., scaled by 8). The following magic
1961 * is equivalent to the smoothing algorithm in rfc793 with
1962 * an alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed
1963 * point). Adjust rtt to origin 0.
1964 */
1965 delta = rtt - 1 - (tp->t_srtt >> TCP_RTT_SHIFT);
1966 if ((tp->t_srtt += delta) <= 0)
1967 tp->t_srtt = 1;
1968 /*
1969 * We accumulate a smoothed rtt variance (actually, a
1970 * smoothed mean difference), then set the retransmit
1971 * timer to smoothed rtt + 4 times the smoothed variance.
1972 * rttvar is stored as fixed point with 2 bits after the
1973 * binary point (scaled by 4). The following is
1974 * equivalent to rfc793 smoothing with an alpha of .75
1975 * (rttvar = rttvar*3/4 + |delta| / 4). This replaces
1976 * rfc793's wired-in beta.
1977 */
1978 if (delta < 0)
1979 delta = -delta;
1980 delta -= (tp->t_rttvar >> TCP_RTTVAR_SHIFT);
1981 if ((tp->t_rttvar += delta) <= 0)
1982 tp->t_rttvar = 1;
1983 }
1984 else
1985 {
1986 /*
1987 * No rtt measurement yet - use the unsmoothed rtt.
1988 * Set the variance to half the rtt (so our first
1989 * retransmit happens at 3*rtt).
1990 */
1991 tp->t_srtt = rtt << TCP_RTT_SHIFT;
1992 tp->t_rttvar = rtt << (TCP_RTTVAR_SHIFT - 1);
1993 }
1994 tp->t_rtt = 0;
1995 tp->t_rxtshift = 0;
1996
1997 /*
1998 * the retransmit should happen at rtt + 4 * rttvar.
1999 * Because of the way we do the smoothing, srtt and rttvar
2000 * will each average +1/2 tick of bias. When we compute
2001 * the retransmit timer, we want 1/2 tick of rounding and
2002 * 1 extra tick because of +-1/2 tick uncertainty in the
2003 * firing of the timer. The bias will give us exactly the
2004 * 1.5 tick we need. But, because the bias is
2005 * statistical, we have to test that we don't drop below
2006 * the minimum feasible timer (which is 2 ticks).
2007 */
2008 TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp),
2009 (short)tp->t_rttmin, TCPTV_REXMTMAX); /* XXX */
2010
2011 /*
2012 * We received an ack for a packet that wasn't retransmitted;
2013 * it is probably safe to discard any error indications we've
2014 * received recently. This isn't quite right, but close enough
2015 * for now (a route might have failed after we sent a segment,
2016 * and the return path might not be symmetrical).
2017 */
2018 tp->t_softerror = 0;
2019}
2020
2021/*
2022 * Determine a reasonable value for maxseg size.
2023 * If the route is known, check route for mtu.
2024 * If none, use an mss that can be handled on the outgoing
2025 * interface without forcing IP to fragment; if bigger than
2026 * an mbuf cluster (MCLBYTES), round down to nearest multiple of MCLBYTES
2027 * to utilize large mbufs. If no route is found, route has no mtu,
2028 * or the destination isn't local, use a default, hopefully conservative
2029 * size (usually 512 or the default IP max size, but no more than the mtu
2030 * of the interface), as we can't discover anything about intervening
2031 * gateways or networks. We also initialize the congestion/slow start
2032 * window to be a single segment if the destination isn't local.
2033 * While looking at the routing entry, we also initialize other path-dependent
2034 * parameters from pre-set or cached values in the routing entry.
2035 */
2036
2037int
2038tcp_mss(PNATState pData, register struct tcpcb *tp, u_int offer)
2039{
2040 struct socket *so = tp->t_socket;
2041 int mss;
2042
2043 LogFlowFunc(("ENTER: tcp_mss: offer=%u, t_maxseg=%u; tp=%R[natsock]\n",
2044 offer, (unsigned int)tp->t_maxseg, so));
2045
2046 mss = min(if_mtu, if_mru) - sizeof(struct tcpiphdr);
2047 if (offer)
2048 mss = min(mss, offer);
2049 mss = max(mss, 32);
2050 if (mss < tp->t_maxseg || offer != 0)
2051 tp->t_maxseg = mss;
2052
2053 tp->snd_cwnd = mss;
2054
2055 sbreserve(pData, &so->so_snd, tcp_sndspace+((tcp_sndspace%mss)?(mss-(tcp_sndspace%mss)):0));
2056 sbreserve(pData, &so->so_rcv, tcp_rcvspace+((tcp_rcvspace%mss)?(mss-(tcp_rcvspace%mss)):0));
2057
2058 LogFlowFunc(("LEAVE: mss=%d\n", mss));
2059 return mss;
2060}
Note: See TracBrowser for help on using the repository browser.

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette