/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system. INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol(TCP).
 *
 * Version:	@(#)tcp.c 1.0.16 05/25/93
 *
 * Authors:	Ross Biro,
 *		Fred N. van Kempen,
 *		Mark Evans,
 *		Corey Minyard
 *		Florian La Roche,
 *
 * Fixes:
 *		Alan Cox	:	Numerous verify_area() calls
 *		Alan Cox	:	Set the ACK bit on a reset
 *		Alan Cox	:	Stopped it crashing if it closed while sk->inuse=1
 *					and was trying to connect (tcp_err()).
 *		Alan Cox	:	All icmp error handling was broken
 *					pointers passed were wrong and the
 *					socket was looked up backwards. Nobody
 *					tested any icmp error code obviously.
 *		Alan Cox	:	tcp_err() now handled properly. It wakes people
 *					on errors. select behaves and the icmp error race
 *					has gone by moving it into sock.c
 *		Alan Cox	:	tcp_reset() fixed to work for everything not just
 *					packets for unknown sockets.
 *		Alan Cox	:	tcp option processing.
 *		Alan Cox	:	Reset tweaked (still not 100%) [Had syn rule wrong]
 *		Herp Rosmanith	:	More reset fixes
 *		Alan Cox	:	No longer acks invalid rst frames. Acking
 *					any kind of RST is right out.
 *		Alan Cox	:	Sets an ignore me flag on an rst receive
 *					otherwise odd bits of prattle escape still
 *		Alan Cox	:	Fixed another acking RST frame bug. Should stop
 *					LAN workplace lockups.
 *		Alan Cox	:	Some tidyups using the new skb list facilities
 *		Alan Cox	:	sk->keepopen now seems to work
 *		Alan Cox	:	Pulls options out correctly on accepts
 *		Alan Cox	:	Fixed assorted sk->rqueue->next errors
 *		Alan Cox	:	PSH doesn't end a TCP read. Switched a bit to skb ops.
 *		Alan Cox	:	Tidied tcp_data to avoid a potential nasty.
 *		Alan Cox	:	Added some better commenting, as the tcp is hard to follow
 *		Alan Cox	:	Removed incorrect check for 20 * psh
 *	Michael O'Reilly	:	ack < copied bug fix.
 *	Johannes Stille		:	Misc tcp fixes (not all in yet).
 *		Alan Cox	:	FIN with no memory -> CRASH
 *		Alan Cox	:	Added socket option proto entries. Also added awareness of them to accept.
 *		Alan Cox	:	Added TCP options (SOL_TCP)
 *		Alan Cox	:	Switched wakeup calls to callbacks, so the kernel can layer network sockets.
 *		Alan Cox	:	Use ip_tos/ip_ttl settings.
 *		Alan Cox	:	Handle FIN (more) properly (we hope).
 *		Alan Cox	:	RST frames sent on unsynchronised state ack error/
 *		Alan Cox	:	Put in missing check for SYN bit.
 *		Alan Cox	:	Added tcp_select_window() aka NET2E
 *					window non shrink trick.
 *		Alan Cox	:	Added a couple of small NET2E timer fixes
 *		Charles Hedrick	:	TCP fixes
 *		Toomas Tamm	:	TCP window fixes
 *
 *
 * To Fix:
 *		Possibly a problem with accept(). BSD accept never fails after
 *		it causes a select. Linux can - given the official select semantics I
 *		feel that _really_ it's the BSD network programs that are bust (notably
 *		inetd, which hangs occasionally because of this).
 *		Add VJ Fastrecovery algorithm ?
 *		Protocol closedown badly messed up.
 *		Incompatibility with spider ports (tcp hangs on that
 *		socket occasionally).
 *		MSG_PEEK and read on same socket at once can cause crashes.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or(at your option) any later version.
*/ #include #include #include #include #include #include #include #include #include #include "inet.h" #include "dev.h" #include "ip.h" #include "protocol.h" #include "icmp.h" #include "tcp.h" #include "skbuff.h" #include "sock.h" #include "arp.h" #include #include #include #include #include #define SEQ_TICK 3 unsigned long seq_offset; #define SUBNETSARELOCAL static __inline__ int min(unsigned int a, unsigned int b) { if (a < b) return(a); return(b); } void print_th(struct tcphdr *th) { unsigned char *ptr; if (inet_debug != DBG_TCP) return; printk("TCP header:\n"); ptr =(unsigned char *)(th + 1); printk(" source=%d, dest=%d, seq =%ld, ack_seq = %ld\n", ntohs(th->source), ntohs(th->dest), ntohl(th->seq), ntohl(th->ack_seq)); printk(" fin=%d, syn=%d, rst=%d, psh=%d, ack=%d, urg=%d res1=%d res2=%d\n", th->fin, th->syn, th->rst, th->psh, th->ack, th->urg, th->res1, th->res2); printk(" window = %d, check = %d urg_ptr = %d\n", ntohs(th->window), ntohs(th->check), ntohs(th->urg_ptr)); printk(" doff = %d\n", th->doff); printk(" options = %d %d %d %d\n", ptr[0], ptr[1], ptr[2], ptr[3]); } /* This routine grabs the first thing off of a rcv queue. */ static struct sk_buff * get_firstr(struct sock *sk) { return skb_dequeue(&sk->rqueue); } /* * Difference between two values in tcp ack terms. */ static long diff(unsigned long seq1, unsigned long seq2) { long d; d = seq1 - seq2; if (d > 0) return(d); /* I hope this returns what I want. */ return(~d+1); } /* This routine picks a TCP windows for a socket based on the following constraints 1. The window can never be shrunk once it is offered (RFC 793) 2. We limit memory per socket For now we use NET2E3's heuristic of offering half the memory we have handy. All is not as bad as this seems however because of two things. Firstly we will bin packets even within the window in order to get the data we are waiting for into the memory limit. 
Secondly we bin common duplicate forms at receive time Better heuristics welcome */ static int tcp_select_window(struct sock *sk) { int new_window = sk->prot->rspace(sk); /* * two things are going on here. First, we don't ever offer a * window less than min(sk->mss, MAX_WINDOW/2). This is the * receiver side of SWS as specified in RFC1122. * Second, we always give them at least the window they * had before, in order to avoid retracting window. This * is technically allowed, but RFC1122 advises against it and * in practice it causes trouble. */ if (new_window < min(sk->mss, MAX_WINDOW/2) || new_window < sk->window) return(sk->window); return(new_window); } /* Enter the time wait state. */ static void tcp_time_wait(struct sock *sk) { sk->state = TCP_TIME_WAIT; sk->shutdown = SHUTDOWN_MASK; if (!sk->dead) sk->state_change(sk); reset_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN); } /* * A timer event has trigger a tcp retransmit timeout. The * socket xmit queue is ready and set up to send. Because * the ack receive code keeps the queue straight we do * nothing clever here. */ static void tcp_retransmit(struct sock *sk, int all) { if (all) { ip_retransmit(sk, all); return; } sk->ssthresh = sk->cong_window >> 1; /* remember window where we lost */ /* sk->ssthresh in theory can be zero. I guess that's OK */ sk->cong_count = 0; sk->cong_window = 1; /* Do the actual retransmit. */ ip_retransmit(sk, all); } /* * This routine is called by the ICMP module when it gets some * sort of error condition. If err < 0 then the socket should * be closed and the error returned to the user. If err > 0 * it's just the icmp type << 8 | icmp code. After adjustment * header points to the first 8 bytes of the tcp header. We need * to find the appropriate port. 
*/ void tcp_err(int err, unsigned char *header, unsigned long daddr, unsigned long saddr, struct inet_protocol *protocol) { struct tcphdr *th; struct sock *sk; struct iphdr *iph=(struct iphdr *)header; header+=4*iph->ihl; DPRINTF((DBG_TCP, "TCP: tcp_err(%d, hdr=%X, daddr=%X saddr=%X, protocol=%X)\n", err, header, daddr, saddr, protocol)); th =(struct tcphdr *)header; sk = get_sock(&tcp_prot, th->source/*dest*/, daddr, th->dest/*source*/, saddr); print_th(th); if (sk == NULL) return; if(err<0) { sk->err = -err; sk->error_report(sk); return; } if ((err & 0xff00) == (ICMP_SOURCE_QUENCH << 8)) { /* * FIXME: * For now we will just trigger a linear backoff. * The slow start code should cause a real backoff here. */ if (sk->cong_window > 4) sk->cong_window--; return; } DPRINTF((DBG_TCP, "TCP: icmp_err got error\n")); sk->err = icmp_err_convert[err & 0xff].errno; /* * If we've already connected we will keep trying * until we time out, or the user gives up. */ if (icmp_err_convert[err & 0xff].fatal) { if (sk->state == TCP_SYN_SENT) { sk->state = TCP_CLOSE; sk->error_report(sk); /* Wake people up to see the error (see connect in sock.c) */ } } return; } /* * Walk down the receive queue counting readable data until we hit the end or we find a gap * in the received data queue (ie a frame missing that needs sending to us) */ static int tcp_readable(struct sock *sk) { unsigned long counted; unsigned long amount; struct sk_buff *skb; int count=0; int sum; unsigned long flags; DPRINTF((DBG_TCP, "tcp_readable(sk=%X)\n", sk)); if(sk && sk->debug) printk("tcp_readable: %p - ",sk); if (sk == NULL || skb_peek(&sk->rqueue) == NULL) /* Empty sockets are easy! */ { if(sk && sk->debug) printk("empty\n"); return(0); } counted = sk->copied_seq+1; /* Where we are at the moment */ amount = 0; save_flags(flags); /* So nobody adds things at the wrong moment */ cli(); skb =(struct sk_buff *)sk->rqueue; /* Do until a push or until we are out of data. 
*/ do { count++; #ifdef OLD /* This is wrong: It breaks Chameleon amongst other stacks */ if (count > 20) { restore_flags(flags); DPRINTF((DBG_TCP, "tcp_readable, more than 20 packets without a psh\n")); printk("tcp_read: possible read_queue corruption.\n"); return(amount); } #endif if (before(counted, skb->h.th->seq)) /* Found a hole so stops here */ break; sum = skb->len -(counted - skb->h.th->seq); /* Length - header but start from where we are up to (avoid overlaps) */ if (skb->h.th->syn) sum++; if (skb->h.th->urg) { sum -= ntohs(skb->h.th->urg_ptr); /* Dont count urg data */ } if (sum >= 0) { /* Add it up, move on */ amount += sum; if (skb->h.th->syn) amount--; counted += sum; } if (amount && skb->h.th->psh) break; skb =(struct sk_buff *)skb->next; /* Move along */ } while(skb != sk->rqueue); restore_flags(flags); DPRINTF((DBG_TCP, "tcp readable returning %d bytes\n", amount)); if(sk->debug) printk("got %lu bytes.\n",amount); return(amount); } /* * Wait for a TCP event. Note the oddity with SEL_IN and reading. The * listening socket has a receive queue of sockets to accept. 
*/ static int tcp_select(struct sock *sk, int sel_type, select_table *wait) { DPRINTF((DBG_TCP, "tcp_select(sk=%X, sel_type = %d, wait = %X)\n", sk, sel_type, wait)); sk->inuse = 1; switch(sel_type) { case SEL_IN: if(sk->debug) printk("select in"); select_wait(sk->sleep, wait); if(sk->debug) printk("-select out"); if (skb_peek(&sk->rqueue) != NULL) { if (sk->state == TCP_LISTEN || tcp_readable(sk)) { release_sock(sk); if(sk->debug) printk("-select ok data\n"); return(1); } } if (sk->err != 0) /* Receiver error */ { release_sock(sk); if(sk->debug) printk("-select ok error"); return(1); } if (sk->shutdown & RCV_SHUTDOWN) { release_sock(sk); if(sk->debug) printk("-select ok down\n"); return(1); } else { release_sock(sk); if(sk->debug) printk("-select fail\n"); return(0); } case SEL_OUT: select_wait(sk->sleep, wait); if (sk->shutdown & SEND_SHUTDOWN) { DPRINTF((DBG_TCP, "write select on shutdown socket.\n")); /* FIXME: should this return an error? */ release_sock(sk); return(0); } /* * FIXME: * Hack so it will probably be able to write * something if it says it's ok to write. */ if (sk->prot->wspace(sk) >= sk->mss) { release_sock(sk); /* This should cause connect to work ok. 
*/ if (sk->state == TCP_SYN_RECV || sk->state == TCP_SYN_SENT) return(0); return(1); } DPRINTF((DBG_TCP, "tcp_select: sleeping on write sk->wmem_alloc = %d, " "sk->packets_out = %d\n" "sk->wback = %X, sk->wfront = %X\n" "sk->send_seq = %u, sk->window_seq=%u\n", sk->wmem_alloc, sk->packets_out, sk->wback, sk->wfront, sk->send_seq, sk->window_seq)); release_sock(sk); return(0); case SEL_EX: select_wait(sk->sleep,wait); if (sk->err) { release_sock(sk); return(1); } release_sock(sk); return(0); } release_sock(sk); return(0); } int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg) { int err; DPRINTF((DBG_TCP, "tcp_ioctl(sk=%X, cmd = %d, arg=%X)\n", sk, cmd, arg)); switch(cmd) { case DDIOCSDBG: return(dbg_ioctl((void *) arg, DBG_TCP)); case TIOCINQ: #ifdef FIXME /* FIXME: */ case FIONREAD: #endif { unsigned long amount; if (sk->state == TCP_LISTEN) return(-EINVAL); sk->inuse = 1; amount = tcp_readable(sk); release_sock(sk); DPRINTF((DBG_TCP, "returning %d\n", amount)); err=verify_area(VERIFY_WRITE,(void *)arg, sizeof(unsigned long)); if(err) return err; put_fs_long(amount,(unsigned long *)arg); return(0); } case SIOCATMARK: { struct sk_buff *skb; int answ = 0; /* * Try to figure out if we need to read * some urgent data. */ sk->inuse = 1; if ((skb=skb_peek(&sk->rqueue)) != NULL) { if (sk->copied_seq+1 == skb->h.th->seq && skb->h.th->urg) answ = 1; } release_sock(sk); err=verify_area(VERIFY_WRITE,(void *) arg, sizeof(unsigned long)); if(err) return err; put_fs_long(answ,(int *) arg); return(0); } case TIOCOUTQ: { unsigned long amount; if (sk->state == TCP_LISTEN) return(-EINVAL); amount = sk->prot->wspace(sk); err=verify_area(VERIFY_WRITE,(void *)arg, sizeof(unsigned long)); if(err) return err; put_fs_long(amount,(unsigned long *)arg); return(0); } default: return(-EINVAL); } } /* This routine computes a TCP checksum. 
 */
/*
 * Sum a pseudo-header (daddr, saddr, length and IPPROTO_TCP) together
 * with `len` bytes starting at `th`, and return the complemented
 * 16-bit result.
 *
 *	th	start of the TCP header; bytes are read sequentially from here
 *	len	number of bytes to sum
 *	saddr	source address; 0 is replaced by my_addr()
 *	daddr	destination address
 *
 * The arithmetic is i386 inline assembly built on add-with-carry, so the
 * order of the asm statements and their register constraints matter.
 */
unsigned short
tcp_check(struct tcphdr *th, int len,
	  unsigned long saddr, unsigned long daddr)
{
	unsigned long sum;

	if (saddr == 0) saddr = my_addr();
	print_th(th);

	/*
	 * Pseudo-header: daddr + saddr + (swapped length << 16 plus the
	 * protocol term), with each carry folded back in.
	 * NOTE(review): the length is in host order here, so htons() would
	 * state the intent better than ntohs() - confirm both are the same swap.
	 */
	__asm__("\t addl %%ecx,%%ebx\n"
		"\t adcl %%edx,%%ebx\n"
		"\t adcl $0, %%ebx\n"
		: "=b"(sum)
		: "0"(daddr), "c"(saddr), "d"((ntohs(len) << 16) + IPPROTO_TCP*256)
		: "cx","bx","dx" );

	/*
	 * Sum len/4 whole 32-bit words.  lodsl advances the source pointer,
	 * which is written back to th for the tail handling below.
	 */
	if (len > 3) {
		__asm__("\tclc\n"
			"1:\n"
			"\t lodsl\n"
			"\t adcl %%eax, %%ebx\n"
			"\t loop 1b\n"
			"\t adcl $0, %%ebx\n"
			: "=b"(sum) , "=S"(th)
			: "0"(sum), "c"(len/4) ,"1"(th)
			: "ax", "cx", "bx", "si" );
	}

	/* Convert from 32 bits to 16 bits. */
	__asm__("\t movl %%ebx, %%ecx\n"
		"\t shrl $16,%%ecx\n"
		"\t addw %%cx, %%bx\n"
		"\t adcw $0, %%bx\n"
		: "=b"(sum)
		: "0"(sum)
		: "bx", "cx");

	/* Check for an extra word. */
	if ((len & 2) != 0) {
		__asm__("\t lodsw\n"
			"\t addw %%ax,%%bx\n"
			"\t adcw $0, %%bx\n"
			: "=b"(sum), "=S"(th)
			: "0"(sum) ,"1"(th)
			: "si", "ax", "bx");
	}

	/* Now check for the extra byte; it is zero-extended before adding. */
	if ((len & 1) != 0) {
		__asm__("\t lodsb\n"
			"\t movb $0,%%ah\n"
			"\t addw %%ax,%%bx\n"
			"\t adcw $0, %%bx\n"
			: "=b"(sum)
			: "0"(sum) ,"S"(th)
			: "si", "ax", "bx");
	}

	/* We only want the bottom 16 bits, but we never cleared the top 16. */
	return((~sum) & 0xffff);
}


/*
 * Fill in th->check for a segment of `len` bytes.  The field is zeroed
 * first so that it does not contribute to its own checksum.  The `sk`
 * argument is not used in this function.
 */
void tcp_send_check(struct tcphdr *th, unsigned long saddr,
		unsigned long daddr, int len, struct sock *sk)
{
	th->check = 0;
	th->check = tcp_check(th, len, saddr, daddr);
	return;
}


/*
 * Finish a built segment (checksum, sequence stamp) and either hand it
 * to the protocol's queue_xmit() or append it to the socket's write
 * queue (wfront/wback) when it cannot be sent yet.  Malformed buffers
 * are reported with printk and freed.
 */
static void tcp_send_skb(struct sock *sk, struct sk_buff *skb)
{
	int size;

	/* length of packet (not counting length of pre-tcp headers) */
	size = skb->len - ((unsigned char *) skb->h.th - skb->data);

	/*
	 * sanity check it..
	 * NOTE(review): size is a signed int compared against sizeof();
	 * a negative size appears to be caught only by the second
	 * comparison - confirm against the type of skb->len.
	 */
	if (size < sizeof(struct tcphdr) || size > skb->len) {
		printk("tcp_send_skb: bad skb (skb = %p, data = %p, th = %p, len = %lu)\n",
			skb, skb->data, skb->h.th, skb->len);
		kfree_skb(skb, FREE_WRITE);
		return;
	}

	/* If we have queued a header size packet.. */
	if (size == sizeof(struct tcphdr)) {
		/* If its got a syn or fin its notionally included in the size..*/
		if(!skb->h.th->syn && !skb->h.th->fin) {
			printk("tcp_send_skb: attempt to queue a bogon.\n");
			kfree_skb(skb,FREE_WRITE);
			return;
		}
	}

	/* We need to complete and send the packet. */
	tcp_send_check(skb->h.th, sk->saddr, sk->daddr, size, sk);

	skb->h.seq = sk->send_seq;

	/*
	 * Hold the segment back when it lies beyond the offered window,
	 * when a retransmit is in progress under TIME_WRITE, or when the
	 * congestion window is already full.
	 */
	if (after(sk->send_seq , sk->window_seq) ||
	    (sk->retransmits && sk->timeout == TIME_WRITE) ||
	     sk->packets_out >= sk->cong_window) {
		DPRINTF((DBG_TCP, "sk->cong_window = %d, sk->packets_out = %d\n",
					sk->cong_window, sk->packets_out));
		DPRINTF((DBG_TCP, "sk->send_seq = %d, sk->window_seq = %d\n",
					sk->send_seq, sk->window_seq));

		/* Append to the tail of the singly linked write queue. */
		skb->next = NULL;
		skb->magic = TCP_WRITE_QUEUE_MAGIC;
		if (sk->wback == NULL) {
			sk->wfront = skb;
		} else {
			sk->wback->next = skb;
		}
		sk->wback = skb;

		/*
		 * Head of the queue is past the window, nothing is awaiting
		 * an ack and no ack is owed: start the TIME_PROBE0 timer.
		 */
		if (before(sk->window_seq, sk->wfront->h.seq) &&
		    sk->send_head == NULL &&
		    sk->ack_backlog == 0)
		  reset_timer(sk, TIME_PROBE0, sk->rto);
	} else {
		sk->prot->queue_xmit(sk, skb->dev, skb, 0);
	}
}


/*
 * Detach and return the socket's pending partial packet (NULL if there
 * is none), cancelling partial_timer.  Runs with interrupts disabled and
 * restores the caller's flags afterwards.
 */
struct sk_buff * tcp_dequeue_partial(struct sock * sk)
{
	struct sk_buff * skb;
	unsigned long flags;

	save_flags(flags);
	cli();
	skb = sk->partial;
	if (skb) {
		sk->partial = NULL;
		del_timer(&sk->partial_timer);
	}
	restore_flags(flags);
	return skb;
}


/*
 * Send every pending partial packet.  tcp_enqueue_partial() also
 * installs this function as the partial_timer callback, which is why a
 * NULL socket is tolerated.
 */
static void tcp_send_partial(struct sock *sk)
{
	struct sk_buff *skb;

	if (sk == NULL)
		return;
	while ((skb = tcp_dequeue_partial(sk)) != NULL)
		tcp_send_skb(sk, skb);
}


/*
 * Make skb the socket's pending partial packet and (re)arm
 * partial_timer with expires = HZ.  A partial packet that was already
 * pending is sent once interrupts have been restored.
 */
void tcp_enqueue_partial(struct sk_buff * skb, struct sock * sk)
{
	struct sk_buff * tmp;
	unsigned long flags;

	save_flags(flags);
	cli();
	tmp = sk->partial;
	if (tmp)
		del_timer(&sk->partial_timer);
	sk->partial = skb;
	sk->partial_timer.expires = HZ;
	sk->partial_timer.function = (void (*)(unsigned long)) tcp_send_partial;
	sk->partial_timer.data = (unsigned long) sk;
	add_timer(&sk->partial_timer);
	restore_flags(flags);
	if (tmp)
		tcp_send_skb(sk, tmp);
}


/* This routine sends an ack and also updates the window.
*/ static void tcp_send_ack(unsigned long sequence, unsigned long ack, struct sock *sk, struct tcphdr *th, unsigned long daddr) { struct sk_buff *buff; struct tcphdr *t1; struct device *dev = NULL; int tmp; if(sk->zapped) return; /* We have been reset, we may not send again */ /* * We need to grab some memory, and put together an ack, * and then put it into the queue to be sent. */ buff = sk->prot->wmalloc(sk, MAX_ACK_SIZE, 1, GFP_ATOMIC); if (buff == NULL) { /* Force it to send an ack. */ sk->ack_backlog++; if (sk->timeout != TIME_WRITE && tcp_connected(sk->state)) { reset_timer(sk, TIME_WRITE, 10); } if (inet_debug == DBG_SLIP) printk("\rtcp_ack: malloc failed\n"); return; } buff->mem_addr = buff; buff->mem_len = MAX_ACK_SIZE; buff->len = sizeof(struct tcphdr); buff->sk = sk; t1 =(struct tcphdr *) buff->data; /* Put in the IP header and routing stuff. */ tmp = sk->prot->build_header(buff, sk->saddr, daddr, &dev, IPPROTO_TCP, sk->opt, MAX_ACK_SIZE,sk->ip_tos,sk->ip_ttl); if (tmp < 0) { buff->free=1; sk->prot->wfree(sk, buff->mem_addr, buff->mem_len); if (inet_debug == DBG_SLIP) printk("\rtcp_ack: build_header failed\n"); return; } buff->len += tmp; t1 =(struct tcphdr *)((char *)t1 +tmp); /* FIXME: */ memcpy(t1, th, sizeof(*t1)); /* this should probably be removed */ /* swap the send and the receive. 
*/ t1->dest = th->source; t1->source = th->dest; t1->seq = ntohl(sequence); t1->ack = 1; sk->window = tcp_select_window(sk);/*sk->prot->rspace(sk);*/ t1->window = ntohs(sk->window); t1->res1 = 0; t1->res2 = 0; t1->rst = 0; t1->urg = 0; t1->syn = 0; t1->psh = 0; t1->fin = 0; if (ack == sk->acked_seq) { sk->ack_backlog = 0; sk->bytes_rcv = 0; sk->ack_timed = 0; if (sk->send_head == NULL && sk->wfront == NULL && sk->timeout == TIME_WRITE) { if(sk->keepopen) reset_timer(sk,TIME_KEEPOPEN,TCP_TIMEOUT_LEN); else delete_timer(sk); } } t1->ack_seq = ntohl(ack); t1->doff = sizeof(*t1)/4; tcp_send_check(t1, sk->saddr, daddr, sizeof(*t1), sk); if (sk->debug) printk("\rtcp_ack: seq %lx ack %lx\n", sequence, ack); sk->prot->queue_xmit(sk, dev, buff, 1); } /* This routine builds a generic TCP header. */ static int tcp_build_header(struct tcphdr *th, struct sock *sk, int push) { /* FIXME: want to get rid of this. */ memcpy(th,(void *) &(sk->dummy_th), sizeof(*th)); th->seq = htonl(sk->send_seq); th->psh =(push == 0) ? 1 : 0; th->doff = sizeof(*th)/4; th->ack = 1; th->fin = 0; sk->ack_backlog = 0; sk->bytes_rcv = 0; sk->ack_timed = 0; th->ack_seq = htonl(sk->acked_seq); sk->window = tcp_select_window(sk)/*sk->prot->rspace(sk)*/; th->window = htons(sk->window); return(sizeof(*th)); } /* * This routine copies from a user buffer into a socket, * and starts the transmit system. */ static int tcp_write(struct sock *sk, unsigned char *from, int len, int nonblock, unsigned flags) { int copied = 0; int copy; int tmp; struct sk_buff *skb; struct sk_buff *send_tmp; unsigned char *buff; struct proto *prot; struct device *dev = NULL; DPRINTF((DBG_TCP, "tcp_write(sk=%X, from=%X, len=%d, nonblock=%d, flags=%X)\n", sk, from, len, nonblock, flags)); sk->inuse=1; prot = sk->prot; while(len > 0) { if (sk->err) { /* Stop on an error */ release_sock(sk); if (copied) return(copied); tmp = -sk->err; sk->err = 0; return(tmp); } /* First thing we do is make sure that we are established. 
*/ if (sk->shutdown & SEND_SHUTDOWN) { release_sock(sk); sk->err = EPIPE; if (copied) return(copied); sk->err = 0; return(-EPIPE); } /* Wait for a connection to finish. */ while(sk->state != TCP_ESTABLISHED && sk->state != TCP_CLOSE_WAIT) { if (sk->err) { release_sock(sk); if (copied) return(copied); tmp = -sk->err; sk->err = 0; return(tmp); } if (sk->state != TCP_SYN_SENT && sk->state != TCP_SYN_RECV) { release_sock(sk); DPRINTF((DBG_TCP, "tcp_write: return 1\n")); if (copied) return(copied); if (sk->err) { tmp = -sk->err; sk->err = 0; return(tmp); } if (sk->keepopen) { send_sig(SIGPIPE, current, 0); } return(-EPIPE); } if (nonblock || copied) { release_sock(sk); DPRINTF((DBG_TCP, "tcp_write: return 2\n")); if (copied) return(copied); return(-EAGAIN); } release_sock(sk); cli(); if (sk->state != TCP_ESTABLISHED && sk->state != TCP_CLOSE_WAIT && sk->err == 0) { interruptible_sleep_on(sk->sleep); if (current->signal & ~current->blocked) { sti(); DPRINTF((DBG_TCP, "tcp_write: return 3\n")); if (copied) return(copied); return(-ERESTARTSYS); } } sk->inuse = 1; sti(); } /* * The following code can result in copy <= if sk->mss is ever * decreased. It shouldn't be. sk->mss is min(sk->mtu, sk->max_window). * sk->mtu is constant once SYN processing is finished. I.e. we * had better not get here until we've seen his SYN and at least one * valid ack. (The SYN sets sk->mtu and the ack sets sk->max_window.) * But ESTABLISHED should guarantee that. sk->max_window is by definition * non-decreasing. Note that any ioctl to set user_mss must be done * before the exchange of SYN's. If the initial ack from the other * end has a window of 0, max_window and thus mss will both be 0. */ /* Now we need to check if we have a half built packet. 
*/ if ((skb = tcp_dequeue_partial(sk)) != NULL) { int hdrlen; /* IP header + TCP header */ hdrlen = ((unsigned long)skb->h.th - (unsigned long)skb->data) + sizeof(struct tcphdr); /* Add more stuff to the end of skb->len */ if (!(flags & MSG_OOB)) { copy = min(sk->mss - (skb->len - hdrlen), len); /* FIXME: this is really a bug. */ if (copy <= 0) { printk("TCP: **bug**: \"copy\" <= 0!!\n"); copy = 0; } memcpy_fromfs(skb->data + skb->len, from, copy); skb->len += copy; from += copy; copied += copy; len -= copy; sk->send_seq += copy; } if ((skb->len - hdrlen) >= sk->mss || (flags & MSG_OOB) || !sk->packets_out) tcp_send_skb(sk, skb); else tcp_enqueue_partial(skb, sk); continue; } /* * We also need to worry about the window. * If window < 1/2 the maximum window we've seen from this * host, don't use it. This is sender side * silly window prevention, as specified in RFC1122. * (Note that this is diffferent than earlier versions of * SWS prevention, e.g. RFC813.). What we actually do is * use the whole MSS. Since the results in the right * edge of the packet being outside the window, it will * be queued for later rather than sent. */ copy = diff(sk->window_seq, sk->send_seq); /* what if max_window == 1? In that case max_window >> 1 is 0. * however in that case copy == max_window, so it's OK to use * the window */ if (copy < (sk->max_window >> 1)) copy = sk->mss; copy = min(copy, sk->mss); copy = min(copy, len); /* We should really check the window here also. */ send_tmp = NULL; if (copy < sk->mss && !(flags & MSG_OOB)) { /* We will release the socket incase we sleep here. */ release_sock(sk); /* NB: following must be mtu, because mss can be increased. * mss is always <= mtu */ skb = prot->wmalloc(sk, sk->mtu + 128 + prot->max_header + sizeof(*skb), 0, GFP_KERNEL); sk->inuse = 1; send_tmp = skb; } else { /* We will release the socket incase we sleep here. 
*/ release_sock(sk); skb = prot->wmalloc(sk, copy + prot->max_header + sizeof(*skb), 0, GFP_KERNEL); sk->inuse = 1; } /* If we didn't get any memory, we need to sleep. */ if (skb == NULL) { if (nonblock /* || copied */) { release_sock(sk); DPRINTF((DBG_TCP, "tcp_write: return 4\n")); if (copied) return(copied); return(-EAGAIN); } /* FIXME: here is another race condition. */ tmp = sk->wmem_alloc; release_sock(sk); cli(); /* Again we will try to avoid it. */ if (tmp <= sk->wmem_alloc && (sk->state == TCP_ESTABLISHED||sk->state == TCP_CLOSE_WAIT) && sk->err == 0) { interruptible_sleep_on(sk->sleep); if (current->signal & ~current->blocked) { sti(); DPRINTF((DBG_TCP, "tcp_write: return 5\n")); if (copied) return(copied); return(-ERESTARTSYS); } } sk->inuse = 1; sti(); continue; } skb->len = 0; skb->sk = sk; skb->free = 0; buff = skb->data; /* * FIXME: we need to optimize this. * Perhaps some hints here would be good. */ tmp = prot->build_header(skb, sk->saddr, sk->daddr, &dev, IPPROTO_TCP, sk->opt, skb->mem_len,sk->ip_tos,sk->ip_ttl); if (tmp < 0 ) { prot->wfree(sk, skb->mem_addr, skb->mem_len); release_sock(sk); DPRINTF((DBG_TCP, "tcp_write: return 6\n")); if (copied) return(copied); return(tmp); } skb->len += tmp; skb->dev = dev; buff += tmp; skb->h.th =(struct tcphdr *) buff; tmp = tcp_build_header((struct tcphdr *)buff, sk, len-copy); if (tmp < 0) { prot->wfree(sk, skb->mem_addr, skb->mem_len); release_sock(sk); DPRINTF((DBG_TCP, "tcp_write: return 7\n")); if (copied) return(copied); return(tmp); } if (flags & MSG_OOB) { ((struct tcphdr *)buff)->urg = 1; ((struct tcphdr *)buff)->urg_ptr = ntohs(copy); } skb->len += tmp; memcpy_fromfs(buff+tmp, from, copy); from += copy; copied += copy; len -= copy; skb->len += copy; skb->free = 0; sk->send_seq += copy; if (send_tmp != NULL && sk->packets_out) { tcp_enqueue_partial(send_tmp, sk); continue; } tcp_send_skb(sk, skb); } sk->err = 0; /* * Nagles rule. 
Turn Nagle off with TCP_NODELAY for highly * interactive fast network servers. It's meant to be on and * it really improves the throughput though not the echo time * on my slow slip link - Alan */ /* Avoid possible race on send_tmp - c/o Johannes Stille */ if(sk->partial && ((!sk->packets_out) /* If not nagling we can send on the before case too.. */ || (sk->nonagle && before(sk->send_seq , sk->window_seq)) )) tcp_send_partial(sk); /* -- */ release_sock(sk); DPRINTF((DBG_TCP, "tcp_write: return 8\n")); return(copied); } static int tcp_sendto(struct sock *sk, unsigned char *from, int len, int nonblock, unsigned flags, struct sockaddr_in *addr, int addr_len) { struct sockaddr_in sin; if (addr_len < sizeof(sin)) return(-EINVAL); memcpy_fromfs(&sin, addr, sizeof(sin)); if (sin.sin_family && sin.sin_family != AF_INET) return(-EINVAL); if (sin.sin_port != sk->dummy_th.dest) return(-EINVAL); if (sin.sin_addr.s_addr != sk->daddr) return(-EINVAL); return(tcp_write(sk, from, len, nonblock, flags)); } static void tcp_read_wakeup(struct sock *sk) { int tmp; struct device *dev = NULL; struct tcphdr *t1; struct sk_buff *buff; DPRINTF((DBG_TCP, "in tcp read wakeup\n")); if (!sk->ack_backlog) return; /* * FIXME: we need to put code here to prevent this routine from * being called. Being called once in a while is ok, so only check * if this is the second time in a row. */ /* * We need to grab some memory, and put together an ack, * and then put it into the queue to be sent. */ buff = sk->prot->wmalloc(sk,MAX_ACK_SIZE,1, GFP_ATOMIC); if (buff == NULL) { /* Try again real soon. */ reset_timer(sk, TIME_WRITE, 10); return; } buff->mem_addr = buff; buff->mem_len = MAX_ACK_SIZE; buff->len = sizeof(struct tcphdr); buff->sk = sk; /* Put in the IP header and routing stuff. 
*/ tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev, IPPROTO_TCP, sk->opt, MAX_ACK_SIZE,sk->ip_tos,sk->ip_ttl); if (tmp < 0) { buff->free=1; sk->prot->wfree(sk, buff->mem_addr, buff->mem_len); return; } buff->len += tmp; t1 =(struct tcphdr *)(buff->data +tmp); memcpy(t1,(void *) &sk->dummy_th, sizeof(*t1)); t1->seq = ntohl(sk->send_seq); t1->ack = 1; t1->res1 = 0; t1->res2 = 0; t1->rst = 0; t1->urg = 0; t1->syn = 0; t1->psh = 0; sk->ack_backlog = 0; sk->bytes_rcv = 0; sk->window = tcp_select_window(sk);/*sk->prot->rspace(sk);*/ t1->window = ntohs(sk->window); t1->ack_seq = ntohl(sk->acked_seq); t1->doff = sizeof(*t1)/4; tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), sk); sk->prot->queue_xmit(sk, dev, buff, 1); } /* * FIXME: * This routine frees used buffers. * It should consider sending an ACK to let the * other end know we now have a bigger window. */ static void cleanup_rbuf(struct sock *sk) { unsigned long flags; int left; struct sk_buff *skb; if(sk->debug) printk("cleaning rbuf for sk=%p\n", sk); save_flags(flags); cli(); left = sk->prot->rspace(sk); /* * We have to loop through all the buffer headers, * and try to free up all the space we can. */ while((skb=skb_peek(&sk->rqueue)) != NULL ) { if (!skb->used) break; skb_unlink(skb); skb->sk = sk; kfree_skb(skb, FREE_READ); } restore_flags(flags); /* * FIXME: * At this point we should send an ack if the difference * in the window, and the amount of space is bigger than * TCP_WINDOW_DIFF. */ DPRINTF((DBG_TCP, "sk->window left = %d, sk->prot->rspace(sk)=%d\n", sk->window - sk->bytes_rcv, sk->prot->rspace(sk))); if(sk->debug) printk("sk->rspace = %lu, was %d\n", sk->prot->rspace(sk), left); if (sk->prot->rspace(sk) != left) { /* * This area has caused the most trouble. The current strategy * is to simply do nothing if the other end has room to send at * least 3 full packets, because the ack from those will auto- * matically update the window. 
If the other end doesn't think * we have much space left, but we have room for atleast 1 more * complete packet than it thinks we do, we will send an ack * immediatedly. Otherwise we will wait up to .5 seconds in case * the user reads some more. */ sk->ack_backlog++; /* * It's unclear whether to use sk->mtu or sk->mss here. They differ only * if the other end is offering a window smaller than the agreed on MSS * (called sk->mtu here). In theory there's no connection between send * and receive, and so no reason to think that they're going to send * small packets. For the moment I'm using the hack of reducing the mss * only on the send side, so I'm putting mtu here. */ if ((sk->prot->rspace(sk) > (sk->window - sk->bytes_rcv + sk->mtu))) { /* Send an ack right now. */ tcp_read_wakeup(sk); } else { /* Force it to send an ack soon. */ int was_active = del_timer(&sk->timer); if (!was_active || TCP_ACK_TIME < sk->timer.expires) { reset_timer(sk, TIME_WRITE, TCP_ACK_TIME); } else add_timer(&sk->timer); } } } /* Handle reading urgent data. */ static int tcp_read_urg(struct sock * sk, int nonblock, unsigned char *to, int len, unsigned flags) { int copied = 0; struct sk_buff *skb; DPRINTF((DBG_TCP, "tcp_read_urg(sk=%X, to=%X, len=%d, flags=%X)\n", sk, to, len, flags)); while(len > 0) { sk->inuse = 1; while(sk->urg==0 || skb_peek(&sk->rqueue) == NULL) { if (sk->err) { int tmp; release_sock(sk); if (copied) return(copied); tmp = -sk->err; sk->err = 0; return(tmp); } if (sk->state == TCP_CLOSE || sk->done) { release_sock(sk); if (copied) return(copied); if (!sk->done) { sk->done = 1; return(0); } return(-ENOTCONN); } if (sk->shutdown & RCV_SHUTDOWN) { release_sock(sk); if (copied == 0) sk->done = 1; return(copied); } if (nonblock || copied) { release_sock(sk); if (copied) return(copied); return(-EAGAIN); } /* Now at this point, we may have gotten some data. 
*/ release_sock(sk); cli(); if ((sk->urg == 0 || skb_peek(&sk->rqueue) == NULL) && sk->err == 0 && !(sk->shutdown & RCV_SHUTDOWN)) { interruptible_sleep_on(sk->sleep); if (current->signal & ~current->blocked) { sti(); if (copied) return(copied); return(-ERESTARTSYS); } } sk->inuse = 1; sti(); } skb = skb_peek(&sk->rqueue); do { int amt; if (skb->h.th->urg && !skb->urg_used) { if (skb->h.th->urg_ptr == 0) { skb->h.th->urg_ptr = ntohs(skb->len); } amt = min(ntohs(skb->h.th->urg_ptr),len); if(amt) { memcpy_tofs(to,(unsigned char *)(skb->h.th) + skb->h.th->doff*4, amt); } if (!(flags & MSG_PEEK)) { skb->urg_used = 1; sk->urg--; } release_sock(sk); copied += amt; return(copied); } skb =(struct sk_buff *)skb->next; } while(skb != sk->rqueue); } /*sk->urg = 0;*/ release_sock(sk); return(0); } /* This routine copies from a sock struct into the user buffer. */ static int tcp_read(struct sock *sk, unsigned char *to, int len, int nonblock, unsigned flags) { int copied=0; /* will be used to say how much has been copied. */ struct sk_buff *skb; unsigned long offset; unsigned long used; int err; if (len == 0) return(0); if (len < 0) { return(-EINVAL); } err=verify_area(VERIFY_WRITE,to,len); if(err) return err; /* This error should be checked. */ if (sk->state == TCP_LISTEN) return(-ENOTCONN); /* Urgent data needs to be handled specially. */ if ((flags & MSG_OOB)) return(tcp_read_urg(sk, nonblock, to, len, flags)); /* So no-one else will use this socket. */ sk->inuse = 1; skb=skb_peek(&sk->rqueue); DPRINTF((DBG_TCP, "tcp_read(sk=%X, to=%X, len=%d, nonblock=%d, flags=%X)\n", sk, to, len, nonblock, flags)); while(len > 0) { /* skb->used just checks to see if we've gone all the way around. 
*/
	/* While no data, or first data indicates some is missing, or data is used */
	while(skb == NULL ||
	      before(sk->copied_seq+1, skb->h.th->seq) || skb->used) {
		DPRINTF((DBG_TCP, "skb = %X:\n", skb));
		cleanup_rbuf(sk);
		/* A pending socket error is reported once and then cleared. */
		if (sk->err) {
			int tmp;

			release_sock(sk);
			if (copied) {
				DPRINTF((DBG_TCP, "tcp_read: returning %d\n",
								copied));
				return(copied);
			}
			tmp = -sk->err;
			sk->err = 0;
			return(tmp);
		}

		/* Closed socket: first read returns 0, later ones -ENOTCONN. */
		if (sk->state == TCP_CLOSE) {
			release_sock(sk);
			if (copied) {
				DPRINTF((DBG_TCP, "tcp_read: returning %d\n",
								copied));
				return(copied);
			}
			if (!sk->done) {
				sk->done = 1;
				return(0);
			}
			return(-ENOTCONN);
		}

		if (sk->shutdown & RCV_SHUTDOWN) {
			release_sock(sk);
			if (copied == 0) sk->done = 1;
			DPRINTF((DBG_TCP, "tcp_read: returning %d\n", copied));
			return(copied);
		}

		/* Never sleep once something was copied or if non-blocking. */
		if (nonblock || copied) {
			release_sock(sk);
			if(sk->debug)
				printk("read: EAGAIN\n");
			if (copied) {
				DPRINTF((DBG_TCP, "tcp_read: returning %d\n",
								copied));
				return(copied);
			}
			return(-EAGAIN);
		}

		/*
		 * NOTE(review): this test cannot succeed here, because the
		 * branch above has already returned whenever copied != 0.
		 */
		if ((flags & MSG_PEEK) && copied != 0) {
			release_sock(sk);
			DPRINTF((DBG_TCP, "tcp_read: returning %d\n", copied));
			return(copied);
		}

		DPRINTF((DBG_TCP, "tcp_read about to sleep. state = %d\n",
								sk->state));
		release_sock(sk);

		/*
		 * Now we may have some data waiting or we could
		 * have changed state.
		 */
		cli();
		if (sk->shutdown & RCV_SHUTDOWN || sk->err != 0) {
			/* Go round again so the checks above report it. */
			sk->inuse = 1;
			sti();
			continue;
		}

		/* Sleep only if the next in-sequence byte is still missing. */
		if (skb_peek(&sk->rqueue) == NULL ||
		    before(sk->copied_seq+1, sk->rqueue->h.th->seq)) {
			if(sk->debug)
				printk("Read wait sleep\n");
			interruptible_sleep_on(sk->sleep);
			if(sk->debug)
				printk("Read wait wakes\n");
			if (current->signal & ~current->blocked) {
				sti();
				if (copied) {
					DPRINTF((DBG_TCP, "tcp_read: returning %d\n",
								copied));
					return(copied);
				}
				return(-ERESTARTSYS);
			}
		}
		sk->inuse = 1;
		sti();
		DPRINTF((DBG_TCP, "tcp_read woke up. \n"));

		skb=skb_peek(&sk->rqueue);
		/* That may have been null if we were beaten, if so we loop again */
	}

	/*
	 * Copy anything from the current block that needs
	 * to go into the user buffer.
	 */
	offset = sk->copied_seq+1 - skb->h.th->seq;

	/* A SYN occupies one sequence number but carries no data byte. */
	if (skb->h.th->syn) offset--;
	if (offset < skb->len) /* Some of the packet is useful */
	{
		/*
		 * If there is urgent data we must either
		 * return or skip over it.
		 */
		if (skb->h.th->urg) {
			if (skb->urg_used) {
				/* Urgent bytes were already read: step past them. */
				sk->copied_seq += ntohs(skb->h.th->urg_ptr);
				offset += ntohs(skb->h.th->urg_ptr);
				if (offset >= skb->len) {
					skb->used = 1;
					skb =(struct sk_buff *)skb->next;
					continue;
				}
			} else {
				/* Unread urgent data: stop and signal the reader. */
				release_sock(sk);
				if (copied)
					return(copied);
				send_sig(SIGURG, current, 0);
				return(-EINTR);
			}
		}

		/* Ok so how much can we use ? */
		used = min(skb->len - offset, len);

		/* Copy it */
		memcpy_tofs(to,((unsigned char *)skb->h.th) +
			    skb->h.th->doff*4 + offset, used);
		copied += used;
		len -= used;
		to += used;

		/* If we were reading the data is 'eaten' */
		if (!(flags & MSG_PEEK))
			sk->copied_seq += used;

		/*
		 * Mark this data used if we are really reading it,
		 * and if it doesn't contain any urgent data. And we
		 * have used all the data.
		 */
		if (!(flags & MSG_PEEK) &&
		    (!skb->h.th->urg || skb->urg_used) &&
		    (used + offset >= skb->len))
			skb->used = 1;

		/*
		 * See if this is the end of a message or if the
		 * remaining data is urgent.
		 */
		if (/*skb->h.th->psh || */skb->h.th->urg) {
			break;
		}
	} else {
		/* already used this data, must be a retransmit */
		skb->used = 1;
	}

	/* Move along a packet */
	skb =(struct sk_buff *)skb->next;
  }

  /* Clean up data we have read: This will do ACK frames */
  cleanup_rbuf(sk);
  release_sock(sk);
  DPRINTF((DBG_TCP, "tcp_read: returning %d\n", copied));
  if (copied == 0 && nonblock)
	return(-EAGAIN);
  return(copied);
}


/*
 * Send a FIN without closing the connection.
 * Not called at interrupt time.
 */
void
tcp_shutdown(struct sock *sk, int how)
{
  struct sk_buff *buff;
  struct tcphdr *t1, *th;
  struct proto *prot;
  int tmp;
  struct device *dev = NULL;

  /*
   * We need to grab some memory, and put together a FIN,
   * and then put it into the queue to be sent.
   * FIXME:
   * Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92.
* Most of this is guesswork, so maybe it will work... */ /* If we've already sent a FIN, return. */ if (sk->state == TCP_FIN_WAIT1 || sk->state == TCP_FIN_WAIT2) return; if (!(how & SEND_SHUTDOWN)) return; sk->inuse = 1; /* Clear out any half completed packets. */ if (sk->partial) tcp_send_partial(sk); prot =(struct proto *)sk->prot; th =(struct tcphdr *)&sk->dummy_th; release_sock(sk); /* incase the malloc sleeps. */ buff = prot->wmalloc(sk, MAX_RESET_SIZE,1 , GFP_KERNEL); if (buff == NULL) return; sk->inuse = 1; DPRINTF((DBG_TCP, "tcp_shutdown_send buff = %X\n", buff)); buff->mem_addr = buff; buff->mem_len = MAX_RESET_SIZE; buff->sk = sk; buff->len = sizeof(*t1); t1 =(struct tcphdr *) buff->data; /* Put in the IP header and routing stuff. */ tmp = prot->build_header(buff,sk->saddr, sk->daddr, &dev, IPPROTO_TCP, sk->opt, sizeof(struct tcphdr),sk->ip_tos,sk->ip_ttl); if (tmp < 0) { buff->free=1; prot->wfree(sk,buff->mem_addr, buff->mem_len); release_sock(sk); DPRINTF((DBG_TCP, "Unable to build header for fin.\n")); return; } t1 =(struct tcphdr *)((char *)t1 +tmp); buff->len += tmp; buff->dev = dev; memcpy(t1, th, sizeof(*t1)); t1->seq = ntohl(sk->send_seq); sk->send_seq++; buff->h.seq = sk->send_seq; t1->ack = 1; t1->ack_seq = ntohl(sk->acked_seq); t1->window = ntohs(sk->window=tcp_select_window(sk)/*sk->prot->rspace(sk)*/); t1->fin = 1; t1->rst = 0; t1->doff = sizeof(*t1)/4; tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), sk); /* * Can't just queue this up. * It should go at the end of the write queue. 
*/ if (sk->wback != NULL) { buff->free=0; buff->next = NULL; sk->wback->next = buff; sk->wback = buff; buff->magic = TCP_WRITE_QUEUE_MAGIC; } else { sk->prot->queue_xmit(sk, dev, buff, 0); } if (sk->state == TCP_ESTABLISHED) sk->state = TCP_FIN_WAIT1; else sk->state = TCP_FIN_WAIT2; release_sock(sk); } static int tcp_recvfrom(struct sock *sk, unsigned char *to, int to_len, int nonblock, unsigned flags, struct sockaddr_in *addr, int *addr_len) { struct sockaddr_in sin; int len; int err; int result; /* Have to check these first unlike the old code. If we check them after we lose data on an error which is wrong */ err = verify_area(VERIFY_WRITE,addr_len,sizeof(long)); if(err) return err; len = get_fs_long(addr_len); if(len > sizeof(sin)) len = sizeof(sin); err=verify_area(VERIFY_WRITE, addr, len); if(err) return err; result=tcp_read(sk, to, to_len, nonblock, flags); if (result < 0) return(result); sin.sin_family = AF_INET; sin.sin_port = sk->dummy_th.dest; sin.sin_addr.s_addr = sk->daddr; memcpy_tofs(addr, &sin, len); put_fs_long(len, addr_len); return(result); } /* This routine will send an RST to the other tcp. */ static void tcp_reset(unsigned long saddr, unsigned long daddr, struct tcphdr *th, struct proto *prot, struct options *opt, struct device *dev, int tos, int ttl) { struct sk_buff *buff; struct tcphdr *t1; int tmp; /* * We need to grab some memory, and put together an RST, * and then put it into the queue to be sent. */ buff = prot->wmalloc(NULL, MAX_RESET_SIZE, 1, GFP_ATOMIC); if (buff == NULL) return; DPRINTF((DBG_TCP, "tcp_reset buff = %X\n", buff)); buff->mem_addr = buff; buff->mem_len = MAX_RESET_SIZE; buff->len = sizeof(*t1); buff->sk = NULL; buff->dev = dev; t1 =(struct tcphdr *) buff->data; /* Put in the IP header and routing stuff. 
*/ tmp = prot->build_header(buff, saddr, daddr, &dev, IPPROTO_TCP, opt, sizeof(struct tcphdr),tos,ttl); if (tmp < 0) { buff->free = 1; prot->wfree(NULL, buff->mem_addr, buff->mem_len); return; } t1 =(struct tcphdr *)((char *)t1 +tmp); buff->len += tmp; memcpy(t1, th, sizeof(*t1)); /* Swap the send and the receive. */ t1->dest = th->source; t1->source = th->dest; t1->rst = 1; t1->window = 0; if(th->ack) { t1->ack=0; t1->seq=th->ack_seq; t1->ack_seq=0; } else { t1->ack=1; if(!th->syn) t1->ack_seq=htonl(th->seq); else t1->ack_seq=htonl(th->seq+1); t1->seq=0; } t1->syn = 0; t1->urg = 0; t1->fin = 0; t1->psh = 0; t1->doff = sizeof(*t1)/4; tcp_send_check(t1, saddr, daddr, sizeof(*t1), NULL); prot->queue_xmit(NULL, dev, buff, 1); } /* * Look for tcp options. Parses everything but only knows about MSS. * This routine is always called with the packet containing the SYN. * However it may also be called with the ack to the SYN. So you * can't assume this is always the SYN. It's always called after * we have set up sk->mtu to our own MTU. */ static void tcp_options(struct sock *sk, struct tcphdr *th) { unsigned char *ptr; int length=(th->doff*4)-sizeof(struct tcphdr); int mss_seen = 0; ptr = (unsigned char *)(th + 1); while(length>0) { int opcode=*ptr++; int opsize=*ptr++; switch(opcode) { case TCPOPT_EOL: return; case TCPOPT_NOP: length-=2; continue; default: if(opsize<=2) /* Avoid silly options looping forever */ return; switch(opcode) { case TCPOPT_MSS: if(opsize==4 && th->syn) { sk->mtu=min(sk->mtu,ntohs(*(unsigned short *)ptr)); mss_seen = 1; } break; /* Add other options here as people feel the urge to implement stuff like large windows */ } ptr+=opsize-2; length-=opsize; } } if (th->syn) { if (! 
mss_seen) sk->mtu=min(sk->mtu, 536); /* default MSS if none sent */ } sk->mss = min(sk->max_window, sk->mtu); } static inline unsigned long default_mask(unsigned long dst) { dst = ntohl(dst); if (IN_CLASSA(dst)) return htonl(IN_CLASSA_NET); if (IN_CLASSB(dst)) return htonl(IN_CLASSB_NET); return htonl(IN_CLASSC_NET); } /* * This routine handles a connection request. * It should make sure we haven't already responded. * Because of the way BSD works, we have to send a syn/ack now. * This also means it will be harder to close a socket which is * listening. */ static void tcp_conn_request(struct sock *sk, struct sk_buff *skb, unsigned long daddr, unsigned long saddr, struct options *opt, struct device *dev) { struct sk_buff *buff; struct tcphdr *t1; unsigned char *ptr; struct sock *newsk; struct tcphdr *th; int tmp; DPRINTF((DBG_TCP, "tcp_conn_request(sk = %X, skb = %X, daddr = %X, sadd4= %X, \n" " opt = %X, dev = %X)\n", sk, skb, daddr, saddr, opt, dev)); th = skb->h.th; /* If the socket is dead, don't accept the connection. */ if (!sk->dead) { sk->data_ready(sk,0); } else { DPRINTF((DBG_TCP, "tcp_conn_request on dead socket\n")); tcp_reset(daddr, saddr, th, sk->prot, opt, dev, sk->ip_tos,sk->ip_ttl); kfree_skb(skb, FREE_READ); return; } /* * Make sure we can accept more. This will prevent a * flurry of syns from eating up all our memory. */ if (sk->ack_backlog >= sk->max_ack_backlog) { kfree_skb(skb, FREE_READ); return; } /* * We need to build a new sock struct. * It is sort of bad to have a socket without an inode attached * to it, but the wake_up's will just wake up the listening socket, * and if the listening socket is destroyed before this is taken * off of the queue, this will take care of it. */ newsk = (struct sock *) kmalloc(sizeof(struct sock), GFP_ATOMIC); if (newsk == NULL) { /* just ignore the syn. It will get retransmitted. 
*/
	kfree_skb(skb, FREE_READ);
	return;
  }

  DPRINTF((DBG_TCP, "newsk = %X\n", newsk));
  /*
   * Start from a byte copy of the listening socket, then reset every
   * field that must not be shared with it.
   */
  memcpy((void *)newsk,(void *)sk, sizeof(*newsk));
  newsk->wback = NULL;
  newsk->wfront = NULL;
  newsk->rqueue = NULL;
  newsk->send_head = NULL;
  newsk->send_tail = NULL;
  newsk->back_log = NULL;
  newsk->rtt = TCP_CONNECT_TIME << 3;
  newsk->rto = TCP_CONNECT_TIME;
  newsk->mdev = 0;
  newsk->max_window = 0;
  newsk->cong_window = 1;
  newsk->cong_count = 0;
  newsk->ssthresh = 0;
  newsk->backoff = 0;
  newsk->blog = 0;
  newsk->intr = 0;
  newsk->proc = 0;
  newsk->done = 0;
  newsk->partial = NULL;
  newsk->pair = NULL;
  newsk->wmem_alloc = 0;
  newsk->rmem_alloc = 0;

  newsk->max_unacked = MAX_WINDOW - TCP_WINDOW_DIFF;

  newsk->err = 0;
  newsk->shutdown = 0;
  newsk->ack_backlog = 0;
  newsk->acked_seq = skb->h.th->seq+1;
  newsk->fin_seq = skb->h.th->seq;
  newsk->copied_seq = skb->h.th->seq;
  newsk->state = TCP_SYN_RECV;
  newsk->timeout = 0;
  /* Initial send sequence number is derived from the clock. */
  newsk->send_seq = jiffies * SEQ_TICK - seq_offset;
  newsk->window_seq = newsk->send_seq;
  newsk->rcv_ack_seq = newsk->send_seq;
  newsk->urg =0;
  newsk->retransmits = 0;
  newsk->destroy = 0;
  newsk->timer.data = (unsigned long)newsk;
  newsk->timer.function = &net_timer;
  newsk->dummy_th.source = skb->h.th->dest;
  newsk->dummy_th.dest = skb->h.th->source;

  /* Swap these two, they are from our point of view.
   */
  newsk->daddr = saddr;
  newsk->saddr = daddr;

  put_sock(newsk->num,newsk);
  newsk->dummy_th.res1 = 0;
  newsk->dummy_th.doff = 6;
  newsk->dummy_th.fin = 0;
  newsk->dummy_th.syn = 0;
  newsk->dummy_th.rst = 0;
  newsk->dummy_th.psh = 0;
  newsk->dummy_th.ack = 0;
  newsk->dummy_th.urg = 0;
  newsk->dummy_th.res2 = 0;
  /* These two repeat the assignments already made above. */
  newsk->acked_seq = skb->h.th->seq + 1;
  newsk->copied_seq = skb->h.th->seq;

  /* Grab the ttl and tos values and use them */
  newsk->ip_ttl=sk->ip_ttl;
  newsk->ip_tos=skb->ip_hdr->tos;

  /* use 512 or whatever user asked for */
  /* note use of sk->user_mss, since user has no direct access to newsk */
  if (sk->user_mss)
	newsk->mtu = sk->user_mss;
  else {
	/* Peer on another network: use the small default; otherwise
	   start from MAX_WINDOW. */
#ifdef SUBNETSARELOCAL
	if ((saddr ^ daddr) & default_mask(saddr))
#else
	if ((saddr ^ daddr) & dev->pa_mask)
#endif
		newsk->mtu = 576 - HEADER_SIZE;
	else
		newsk->mtu = MAX_WINDOW;
  }
  /* but not bigger than device MTU */
  newsk->mtu = min(newsk->mtu, dev->mtu - HEADER_SIZE);

  /* this will min with what arrived in the packet */
  tcp_options(newsk,skb->h.th);

  buff = newsk->prot->wmalloc(newsk, MAX_SYN_SIZE, 1, GFP_ATOMIC);
  if (buff == NULL) {
	/*
	 * NOTE(review): a negative errno is stored here, while tcp_read
	 * in this file returns -sk->err and tcp_data stores a positive
	 * EPIPE.  Confirm which sign readers of the listening socket's
	 * err field expect.
	 */
	sk->err = -ENOMEM;
	newsk->dead = 1;
	release_sock(newsk);
	kfree_skb(skb, FREE_READ);
	return;
  }

  buff->mem_addr = buff;
  buff->mem_len = MAX_SYN_SIZE;
  /* TCP header plus the 4 byte MSS option built below. */
  buff->len = sizeof(struct tcphdr)+4;
  buff->sk = newsk;

  t1 =(struct tcphdr *) buff->data;

  /* Put in the IP header and routing stuff. */
  tmp = sk->prot->build_header(buff, newsk->saddr, newsk->daddr, &dev,
			       IPPROTO_TCP, NULL, MAX_SYN_SIZE,sk->ip_tos,sk->ip_ttl);

  /* Something went wrong. */
  if (tmp < 0) {
	/* NOTE(review): tmp is negative here; see the sign note above. */
	sk->err = tmp;
	buff->free=1;
	kfree_skb(buff,FREE_WRITE);
	newsk->dead = 1;
	release_sock(newsk);
	skb->sk = sk;
	kfree_skb(skb, FREE_READ);
	return;
  }

  buff->len += tmp;
  t1 =(struct tcphdr *)((char *)t1 +tmp);

  memcpy(t1, skb->h.th, sizeof(*t1));
  buff->h.seq = newsk->send_seq;

  /* Swap the send and the receive.
   */
  t1->dest = skb->h.th->source;
  t1->source = newsk->dummy_th.source;
  t1->seq = ntohl(newsk->send_seq++);
  t1->ack = 1;
  newsk->window = tcp_select_window(newsk);/*newsk->prot->rspace(newsk);*/
  t1->window = ntohs(newsk->window);
  t1->res1 = 0;
  t1->res2 = 0;
  t1->rst = 0;
  t1->urg = 0;
  t1->psh = 0;
  t1->syn = 1;
  t1->ack_seq = ntohl(skb->h.th->seq+1);
  /* One extra 32 bit word of header for the MSS option. */
  t1->doff = sizeof(*t1)/4+1;

  /* MSS option: kind 2, length 4, value newsk->mtu high byte first. */
  ptr =(unsigned char *)(t1+1);
  ptr[0] = 2;
  ptr[1] = 4;
  ptr[2] = ((newsk->mtu) >> 8) & 0xff;
  ptr[3] =(newsk->mtu) & 0xff;

  tcp_send_check(t1, daddr, saddr, sizeof(*t1)+4, newsk);
  newsk->prot->queue_xmit(newsk, dev, buff, 0);

  reset_timer(newsk, TIME_WRITE /* -1 ? FIXME ??? */, TCP_CONNECT_TIME);
  skb->sk = newsk;

  /* Charge the sock_buff to newsk. */
  sk->rmem_alloc -= skb->mem_len;
  newsk->rmem_alloc += skb->mem_len;

  /* The SYN stays on the listening socket's receive queue. */
  skb_queue_tail(&sk->rqueue,skb);
  sk->ack_backlog++;
  release_sock(newsk);
}


/*
 * Close a TCP socket: flush the receive queue, push out any partial
 * packet and, depending on the current state, build and queue a FIN.
 */
static void
tcp_close(struct sock *sk, int timeout)
{
  struct sk_buff *buff;
  int need_reset = 0;
  struct tcphdr *t1, *th;
  struct proto *prot;
  struct device *dev=NULL;
  int tmp;

  /*
   * We need to grab some memory, and put together a FIN,
   * and then put it into the queue to be sent.
   */
  DPRINTF((DBG_TCP, "tcp_close((struct sock *)%X, %d)\n",sk, timeout));
  sk->inuse = 1;
  sk->keepopen = 1;
  sk->shutdown = SHUTDOWN_MASK;

  if (!sk->dead)
	sk->state_change(sk);

  /* We need to flush the recv. buffs. */
  if (skb_peek(&sk->rqueue) != NULL)
  {
	struct sk_buff *skb;
	if(sk->debug)
		printk("Clean rcv queue\n");
	while((skb=skb_dequeue(&sk->rqueue))!=NULL)
	{
		/* need_reset ends up in the rst bit of the FIN built below. */
		if(skb->len > 0 && after(skb->h.th->seq + skb->len + 1 , sk->copied_seq))
			need_reset = 1;
		kfree_skb(skb, FREE_READ);
	}
	if(sk->debug)
		printk("Cleaned.\n");
  }
  sk->rqueue = NULL;

  /* Get rid off any half-completed packets. */
  if (sk->partial) {
	tcp_send_partial(sk);
  }

  switch(sk->state) {
	case TCP_FIN_WAIT1:
	case TCP_FIN_WAIT2:
	case TCP_LAST_ACK:
		/* start a timer. */
		/* original code was 4 * sk->rtt.  In converting to the
		 * new rtt representation, we can't quite use that.
* it seems to make most sense to use the backed off value
		 */
		reset_timer(sk, TIME_CLOSE, 4 * sk->rto);
		if (timeout) tcp_time_wait(sk);
		release_sock(sk);
		return;	/* break causes a double release - messy */
	case TCP_TIME_WAIT:
		if (timeout) {
		  sk->state = TCP_CLOSE;
		}
		release_sock(sk);
		return;
	case TCP_LISTEN:
		sk->state = TCP_CLOSE;
		release_sock(sk);
		return;
	case TCP_CLOSE:
		release_sock(sk);
		return;
	case TCP_CLOSE_WAIT:
	case TCP_ESTABLISHED:
	case TCP_SYN_SENT:
	case TCP_SYN_RECV:
		/* These states still owe the peer a FIN: build one. */
		prot =(struct proto *)sk->prot;
		th =(struct tcphdr *)&sk->dummy_th;
		buff = prot->wmalloc(sk, MAX_FIN_SIZE, 1, GFP_ATOMIC);
		if (buff == NULL) {
			/* This will force it to try again later. */
			/* Or it would have if someone released the socket
			   first. Anyway it might work now */
			/*
			 * NOTE(review): sk->state is rewritten and the timer
			 * re-armed after release_sock(); confirm that this
			 * ordering is intended.
			 */
			release_sock(sk);
			if (sk->state != TCP_CLOSE_WAIT)
					sk->state = TCP_ESTABLISHED;
			reset_timer(sk, TIME_CLOSE, 100);
			return;
		}
		buff->mem_addr = buff;
		buff->mem_len = MAX_FIN_SIZE;
		buff->sk = sk;
		buff->free = 1;
		buff->len = sizeof(*t1);
		t1 =(struct tcphdr *) buff->data;

		/* Put in the IP header and routing stuff. */
		tmp = prot->build_header(buff,sk->saddr, sk->daddr, &dev,
					 IPPROTO_TCP, sk->opt,
					 sizeof(struct tcphdr),sk->ip_tos,sk->ip_ttl);
		if (tmp < 0) {
			kfree_skb(buff,FREE_WRITE);
			DPRINTF((DBG_TCP, "Unable to build header for fin.\n"));
			release_sock(sk);
			return;
		}

		t1 =(struct tcphdr *)((char *)t1 +tmp);
		buff->len += tmp;
		buff->dev = dev;
		memcpy(t1, th, sizeof(*t1));
		t1->seq = ntohl(sk->send_seq);
		/* The FIN consumes one sequence number. */
		sk->send_seq++;
		buff->h.seq = sk->send_seq;
		t1->ack = 1;

		/* Ack everything immediately from now on.
		 */
		sk->delay_acks = 0;
		t1->ack_seq = ntohl(sk->acked_seq);
		t1->window = ntohs(sk->window=tcp_select_window(sk)/*sk->prot->rspace(sk)*/);
		t1->fin = 1;
		/* Set when unread received data was thrown away above. */
		t1->rst = need_reset;
		t1->doff = sizeof(*t1)/4;
		tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), sk);

		/* Send at once if nothing is waiting, else append to the
		   write queue so the FIN follows the pending data. */
		if (sk->wfront == NULL) {
			prot->queue_xmit(sk, dev, buff, 0);
		} else {
			reset_timer(sk, TIME_WRITE, sk->rto);
			buff->next = NULL;
			if (sk->wback == NULL) {
				sk->wfront = buff;
			} else {
				sk->wback->next = buff;
			}
			sk->wback = buff;
			buff->magic = TCP_WRITE_QUEUE_MAGIC;
		}

		if (sk->state == TCP_CLOSE_WAIT) {
			sk->state = TCP_FIN_WAIT2;
		} else {
			sk->state = TCP_FIN_WAIT1;
		}
  }
  release_sock(sk);
}


/*
 * This routine takes stuff off of the write queue,
 * and puts it in the xmit queue.
 *
 * Buffers are taken from the front of the write queue while they fit
 * inside the send window (sk->window_seq), the congestion window still
 * has room (sk->packets_out < sk->cong_window) and we are not in the
 * middle of a retransmission.
 */
static void
tcp_write_xmit(struct sock *sk)
{
  struct sk_buff *skb;

  DPRINTF((DBG_TCP, "tcp_write_xmit(sk=%X)\n", sk));

  /* The bytes will have to remain here. In time closedown will
     empty the write queue and all will be happy */
  if(sk->zapped)
	return;

  while(sk->wfront != NULL &&
        before(sk->wfront->h.seq, sk->window_seq +1) &&
	(sk->retransmits == 0 ||
	 sk->timeout != TIME_WRITE ||
	 before(sk->wfront->h.seq, sk->rcv_ack_seq +1))
        && sk->packets_out < sk->cong_window) {
	/* Unlink the first buffer from the write queue. */
	skb = sk->wfront;
	IS_SKB(skb);
	sk->wfront = skb->next;
	if (sk->wfront == NULL) sk->wback = NULL;
	skb->next = NULL;
	/* A bad magic value means the queue is corrupt: drop all of it. */
	if (skb->magic != TCP_WRITE_QUEUE_MAGIC) {
		printk("tcp.c skb with bad magic(%X) on write queue. Squashing "
			"queue\n", skb->magic);
		sk->wfront = NULL;
		sk->wback = NULL;
		return;
	}
	skb->magic = 0;
	DPRINTF((DBG_TCP, "Sending a packet.\n"));

	/* See if we really need to send the packet. */
	if (before(skb->h.seq, sk->rcv_ack_seq +1)) {
		/* Already acknowledged: free it instead of sending. */
		sk->retransmits = 0;
		kfree_skb(skb, FREE_WRITE);
		if (!sk->dead) sk->write_space(sk);
	} else {
		sk->prot->queue_xmit(sk, skb->dev, skb, skb->free);
	}
  }
}


/*
 * This routine sorts the send list, and resets the
 * sk->send_head and sk->send_tail pointers.
*/ void sort_send(struct sock *sk) { struct sk_buff *list = NULL; struct sk_buff *skb,*skb2,*skb3; for (skb = sk->send_head; skb != NULL; skb = skb2) { skb2 = (struct sk_buff *)skb->link3; if (list == NULL || before (skb2->h.seq, list->h.seq)) { skb->link3 = list; sk->send_tail = skb; list = skb; } else { for (skb3 = list; ; skb3 = (struct sk_buff *)skb3->link3) { if (skb3->link3 == NULL || before(skb->h.seq, skb3->link3->h.seq)) { skb->link3 = skb3->link3; skb3->link3 = skb; if (skb->link3 == NULL) sk->send_tail = skb; break; } } } } sk->send_head = list; } /* This routine deals with incoming acks, but not outgoing ones. */ static int tcp_ack(struct sock *sk, struct tcphdr *th, unsigned long saddr, int len) { unsigned long ack; int flag = 0; /* * 1 - there was data in packet as well as ack or new data is sent or * in shutdown state * 2 - data from retransmit queue was acked and removed * 4 - window shrunk or data from retransmit queue was acked and removed */ if(sk->zapped) return(1); /* Dead, cant ack any more so why bother */ ack = ntohl(th->ack_seq); DPRINTF((DBG_TCP, "tcp_ack ack=%d, window=%d, " "sk->rcv_ack_seq=%d, sk->window_seq = %d\n", ack, ntohs(th->window), sk->rcv_ack_seq, sk->window_seq)); if (ntohs(th->window) > sk->max_window) { sk->max_window = ntohs(th->window); sk->mss = min(sk->max_window, sk->mtu); } if (sk->retransmits && sk->timeout == TIME_KEEPOPEN) sk->retransmits = 0; if (after(ack, sk->send_seq+1) || before(ack, sk->rcv_ack_seq-1)) { if (after(ack, sk->send_seq) || (sk->state != TCP_ESTABLISHED && sk->state != TCP_CLOSE_WAIT)) { return(0); } if (sk->keepopen) { reset_timer(sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN); } return(1); } if (len != th->doff*4) flag |= 1; /* See if our window has been shrunk. */ if (after(sk->window_seq, ack+ntohs(th->window))) { /* * We may need to move packets from the send queue * to the write queue, if the window has been shrunk on us. 
* The RFC says you are not allowed to shrink your window * like this, but if the other end does, you must be able * to deal with it. */ struct sk_buff *skb; struct sk_buff *skb2; struct sk_buff *wskb = NULL; skb2 = sk->send_head; sk->send_head = NULL; sk->send_tail = NULL; flag |= 4; sk->window_seq = ack + ntohs(th->window); cli(); while (skb2 != NULL) { skb = skb2; skb2 = (struct sk_buff *)skb->link3; skb->link3 = NULL; if (after(skb->h.seq, sk->window_seq)) { if (sk->packets_out > 0) sk->packets_out--; /* We may need to remove this from the dev send list. */ if (skb->next != NULL) { skb_unlink(skb); } /* Now add it to the write_queue. */ skb->magic = TCP_WRITE_QUEUE_MAGIC; if (wskb == NULL) { skb->next = sk->wfront; sk->wfront = skb; } else { skb->next = wskb->next; wskb->next = skb; } if (sk->wback == wskb) sk->wback = skb; wskb = skb; } else { if (sk->send_head == NULL) { sk->send_head = skb; sk->send_tail = skb; } else { sk->send_tail->link3 = skb; sk->send_tail = skb; } skb->link3 = NULL; } } sti(); } if (sk->send_tail == NULL || sk->send_head == NULL) { sk->send_head = NULL; sk->send_tail = NULL; sk->packets_out= 0; } sk->window_seq = ack + ntohs(th->window); /* We don't want too many packets out there. */ if (sk->timeout == TIME_WRITE && sk->cong_window < 2048 && after(ack, sk->rcv_ack_seq)) { /* * This is Jacobson's slow start and congestion avoidance. * SIGCOMM '88, p. 328. Because we keep cong_window in integral * mss's, we can't do cwnd += 1 / cwnd. Instead, maintain a * counter and increment it once every cwnd times. It's possible * that this should be done only if sk->retransmits == 0. I'm * interpreting "new data is acked" as including data that has * been retransmitted but is just now being acked. */ if (sk->cong_window < sk->ssthresh) /* in "safe" area, increase */ sk->cong_window++; else { /* in dangerous area, increase slowly. 
In theory this is sk->cong_window += 1 / sk->cong_window */ if (sk->cong_count >= sk->cong_window) { sk->cong_window++; sk->cong_count = 0; } else sk->cong_count++; } } DPRINTF((DBG_TCP, "tcp_ack: Updating rcv ack sequence.\n")); sk->rcv_ack_seq = ack; /* * if this ack opens up a zero window, clear backoff. It was * being used to time the probes, and is probably far higher than * it needs to be for normal retransmission */ if (sk->timeout == TIME_PROBE0) { if (sk->wfront != NULL && /* should always be non-null */ ! before (sk->window_seq, sk->wfront->h.seq)) { sk->retransmits = 0; sk->backoff = 0; /* recompute rto from rtt. this eliminates any backoff */ sk->rto = ((sk->rtt >> 2) + sk->mdev) >> 1; if (sk->rto > 120*HZ) sk->rto = 120*HZ; if (sk->rto < 1*HZ) sk->rto = 1*HZ; } } /* See if we can take anything off of the retransmit queue. */ while(sk->send_head != NULL) { /* Check for a bug. */ if (sk->send_head->link3 && after(sk->send_head->h.seq, sk->send_head->link3->h.seq)) { printk("INET: tcp.c: *** bug send_list out of order.\n"); sort_send(sk); } if (before(sk->send_head->h.seq, ack+1)) { struct sk_buff *oskb; if (sk->retransmits) { /* we were retransmitting. don't count this in RTT est */ flag |= 2; /* * even though we've gotten an ack, we're still * retransmitting as long as we're sending from * the retransmit queue. Keeping retransmits non-zero * prevents us from getting new data interspersed with * retransmissions. */ if (sk->send_head->link3) sk->retransmits = 1; else sk->retransmits = 0; } /* * Note that we only reset backoff and rto in the * rtt recomputation code. And that doesn't happen * if there were retransmissions in effect. So the * first new packet after the retransmissions is * sent with the backoff still in effect. Not until * we get an ack from a non-retransmitted packet do * we reset the backoff and rto. This allows us to deal * with a situation where the network delay has increased * suddenly. I.e. Karn's algorithm. (SIGCOMM '87, p5.) 
*/ /* We have one less packet out there. */ if (sk->packets_out > 0) sk->packets_out --; DPRINTF((DBG_TCP, "skb=%X skb->h.seq = %d acked ack=%d\n", sk->send_head, sk->send_head->h.seq, ack)); /* Wake up the process, it can probably write more. */ if (!sk->dead) sk->write_space(sk); oskb = sk->send_head; if (!(flag&2)) { long m; /* The following amusing code comes from Jacobson's * article in SIGCOMM '88. Note that rtt and mdev * are scaled versions of rtt and mean deviation. * This is designed to be as fast as possible * m stands for "measurement". */ m = jiffies - oskb->when; /* RTT */ m -= (sk->rtt >> 3); /* m is now error in rtt est */ sk->rtt += m; /* rtt = 7/8 rtt + 1/8 new */ if (m < 0) m = -m; /* m is now abs(error) */ m -= (sk->mdev >> 2); /* similar update on mdev */ sk->mdev += m; /* mdev = 3/4 mdev + 1/4 new */ /* now update timeout. Note that this removes any backoff */ sk->rto = ((sk->rtt >> 2) + sk->mdev) >> 1; if (sk->rto > 120*HZ) sk->rto = 120*HZ; if (sk->rto < 1*HZ) sk->rto = 1*HZ; sk->backoff = 0; } flag |= (2|4); cli(); oskb = sk->send_head; IS_SKB(oskb); sk->send_head =(struct sk_buff *)oskb->link3; if (sk->send_head == NULL) { sk->send_tail = NULL; } /* We may need to remove this from the dev send list. */ skb_unlink(oskb); /* Much easier! */ sti(); oskb->magic = 0; kfree_skb(oskb, FREE_WRITE); /* write. */ if (!sk->dead) sk->write_space(sk); } else { break; } } /* * Maybe we can take some stuff off of the write queue, * and put it onto the xmit queue. 
*/ if (sk->wfront != NULL) { if (after (sk->window_seq+1, sk->wfront->h.seq) && (sk->retransmits == 0 || sk->timeout != TIME_WRITE || before(sk->wfront->h.seq, sk->rcv_ack_seq +1)) && sk->packets_out < sk->cong_window) { flag |= 1; tcp_write_xmit(sk); } else if (before(sk->window_seq, sk->wfront->h.seq) && sk->send_head == NULL && sk->ack_backlog == 0 && sk->state != TCP_TIME_WAIT) { reset_timer(sk, TIME_PROBE0, sk->rto); } } else { if (sk->send_head == NULL && sk->ack_backlog == 0 && sk->state != TCP_TIME_WAIT && !sk->keepopen) { DPRINTF((DBG_TCP, "Nothing to do, going to sleep.\n")); if (!sk->dead) sk->write_space(sk); if (sk->keepopen) reset_timer(sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN); else delete_timer(sk); } else { if (sk->state != (unsigned char) sk->keepopen) { reset_timer(sk, TIME_WRITE, sk->rto); } if (sk->state == TCP_TIME_WAIT) { reset_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN); } } } if (sk->packets_out == 0 && sk->partial != NULL && sk->wfront == NULL && sk->send_head == NULL) { flag |= 1; tcp_send_partial(sk); } /* See if we are done. */ if (sk->state == TCP_TIME_WAIT) { if (!sk->dead) sk->state_change(sk); if (sk->rcv_ack_seq == sk->send_seq && sk->acked_seq == sk->fin_seq) { flag |= 1; sk->state = TCP_CLOSE; sk->shutdown = SHUTDOWN_MASK; } } if (sk->state == TCP_LAST_ACK || sk->state == TCP_FIN_WAIT2) { if (!sk->dead) sk->state_change(sk); if (sk->rcv_ack_seq == sk->send_seq) { flag |= 1; if (sk->acked_seq != sk->fin_seq) { tcp_time_wait(sk); } else { DPRINTF((DBG_TCP, "tcp_ack closing socket - %X\n", sk)); tcp_send_ack(sk->send_seq, sk->acked_seq, sk, th, sk->daddr); sk->shutdown = SHUTDOWN_MASK; sk->state = TCP_CLOSE; } } } /* * I make no guarantees about the first clause in the following * test, i.e. "(!flag) || (flag&4)". I'm not entirely sure under * what conditions "!flag" would be true. However I think the rest * of the conditions would prevent that from causing any * unnecessary retransmission. 
* Clearly if the first packet has expired it should be * retransmitted. The other alternative, "flag&2 && retransmits", is * harder to explain: You have to look carefully at how and when the * timer is set and with what timeout. The most recent transmission always * sets the timer. So in general if the most recent thing has timed * out, everything before it has as well. So we want to go ahead and * retransmit some more. If we didn't explicitly test for this * condition with "flag&2 && retransmits", chances are "when + rto < jiffies" * would not be true. If you look at the pattern of timing, you can * show that rto is increased fast enough that the next packet would * almost never be retransmitted immediately. Then you'd end up * waiting for a timeout to send each packet on the retranmission * queue. With my implementation of the Karn sampling algorithm, * the timeout would double each time. The net result is that it would * take a hideous amount of time to recover from a single dropped packet. * It's possible that there should also be a test for TIME_WRITE, but * I think as long as "send_head != NULL" and "retransmit" is on, we've * got to be in real retransmission mode. * Note that ip_do_retransmit is called with all==1. Setting cong_window * back to 1 at the timeout will cause us to send 1, then 2, etc. packets. * As long as no further losses occur, this seems reasonable. */ if (((!flag) || (flag&4)) && sk->send_head != NULL && (((flag&2) && sk->retransmits) || (sk->send_head->when + sk->rto < jiffies))) { ip_do_retransmit(sk, 1); reset_timer(sk, TIME_WRITE, sk->rto); } DPRINTF((DBG_TCP, "leaving tcp_ack\n")); return(1); } /* * This routine handles the data. If there is room in the buffer, * it will be have already been moved into it. If there is no * room, then we will just have to discard the packet. 
*/

/*
 * tcp_data: place the data portion of a received segment on the socket's
 * receive queue (sk->rqueue), advance sk->acked_seq over whatever is now
 * contiguous, and send (or schedule) an ack.
 *
 *	skb	the received buffer; skb->h.th is its TCP header
 *	sk	the socket the segment was matched to
 *	saddr	handed through to tcp_send_ack()
 *	len	segment length including the TCP header; the header
 *		(th->doff*4 bytes) is subtracted to obtain skb->len
 *
 * Every return in this function returns 0.
 *
 * NOTE(review): two early paths below kfree_skb() the buffer, and the
 * pruning loop near the end may do so as well, yet tcp_rcv() reads
 * th->fin after this function returns.  Whether `th' is still valid at
 * that point depends on kfree_skb() -- confirm.
 */
static int
tcp_data(struct sk_buff *skb, struct sock *sk,
	 unsigned long saddr, unsigned short len)
{
	struct sk_buff *skb1, *skb2;
	struct tcphdr *th;
	int dup_dumped=0;	/* set when we replaced a queued duplicate */

	th = skb->h.th;
	print_th(th);

	/* From here on skb->len counts payload bytes only. */
	skb->len = len -(th->doff*4);

	DPRINTF((DBG_TCP, "tcp_data len = %d sk = %X:\n", skb->len, sk));

	sk->bytes_rcv += skb->len;

	/*
	 * Nothing to queue: no payload and none of FIN/URG/PSH set.
	 * The segment is simply dropped.
	 */
	if (skb->len == 0 && !th->fin && !th->urg && !th->psh) {
		/*
		 * Don't want to keep passing ack's back and forth:
		 * only answer if the incoming segment was not itself an ack.
		 */
		if (!th->ack)
			tcp_send_ack(sk->send_seq, sk->acked_seq,sk, th, saddr);
		kfree_skb(skb, FREE_READ);
		return(0);
	}

	/*
	 * Data (or FIN/URG/PSH) arrived after the receive side was shut
	 * down: reset the peer and close the socket with EPIPE.
	 */
	if (sk->shutdown & RCV_SHUTDOWN) {
		sk->acked_seq = th->seq + skb->len + th->syn + th->fin;
		tcp_reset(sk->saddr, sk->daddr, skb->h.th,
			  sk->prot, NULL, skb->dev, sk->ip_tos, sk->ip_ttl);
		sk->state = TCP_CLOSE;
		sk->err = EPIPE;
		sk->shutdown = SHUTDOWN_MASK;
		DPRINTF((DBG_TCP, "tcp_data: closing socket - %X\n", sk));
		kfree_skb(skb, FREE_READ);
		if (!sk->dead)
			sk->state_change(sk);
		return(0);
	}

	/*
	 * Now we have to walk the chain, and figure out where this one
	 * goes into it.  This is set up so that the last packet we received
	 * will be the first one we look at, that way if everything comes
	 * in order, there will be no performance loss, and if they come
	 * out of order we will be able to fit things in nicely.
	 */

	/* This should start at the last one, and then go around forwards. */
	if (sk->rqueue == NULL) {
		/* Empty queue: the new buffer becomes the only entry. */
		DPRINTF((DBG_TCP, "tcp_data: skb = %X:\n", skb));
#ifdef OLDWAY
		sk->rqueue = skb;
		skb->next = skb;
		skb->prev = skb;
		skb->list = &sk->rqueue;
#else
		skb_queue_head(&sk->rqueue,skb);
#endif
		/* skb1 == NULL tells the ack logic below there is no predecessor. */
		skb1= NULL;
	} else {
		DPRINTF((DBG_TCP, "tcp_data adding to chain sk = %X:\n", sk));

		/*
		 * Walk backwards from sk->rqueue->prev.  The loop has no
		 * condition of its own; each arm below breaks out.
		 */
		for(skb1=sk->rqueue->prev; ;
		    skb1 =(struct sk_buff *)skb1->prev) {
			if(sk->debug) {
				printk("skb1=%p :", skb1);
				printk("skb1->h.th->seq = %ld: ", skb1->h.th->seq);
				printk("skb->h.th->seq = %ld\n",skb->h.th->seq);
				printk("copied_seq = %ld acked_seq = %ld\n",
				       sk->copied_seq,
				       sk->acked_seq);
			}
#ifdef OLD
			if (after(th->seq+1, skb1->h.th->seq)) {
				skb->prev = skb1;
				skb->next = skb1->next;
				skb->next->prev = skb;
				skb1->next = skb;
				if (skb1 == sk->rqueue)
					sk->rqueue = skb;
				break;
			}
			if (skb1->prev == sk->rqueue) {
				skb->next= skb1;
				skb->prev = skb1->prev;
				skb->prev->next = skb;
				skb1->prev = skb;
				skb1 = NULL;	/* so we know we might be able to ack stuff. */
				break;
			}
#else
			/*
			 * Same starting sequence number and at least as
			 * long: the new buffer replaces the queued one.
			 * dup_dumped is recorded and skb1 is cleared
			 * because the old buffer has just been freed.
			 */
			if (th->seq==skb1->h.th->seq && skb->len>= skb1->len) {
				skb_append(skb1,skb);
				skb_unlink(skb1);
				kfree_skb(skb1,FREE_READ);
				dup_dumped=1;
				skb1=NULL;
				break;
			}

			/* Found the entry the new buffer is linked in after. */
			if (after(th->seq+1, skb1->h.th->seq)) {
				skb_append(skb1,skb);
				break;
			}

			/*
			 * Walked all the way back to the head of the queue:
			 * put the new buffer in front.  Note skb1 is left
			 * pointing at the old head here.
			 */
			if (skb1 == sk->rqueue) {
				skb_queue_head(&sk->rqueue, skb);
				break;
			}
#endif
		}
		DPRINTF((DBG_TCP, "skb = %X:\n", skb));
	}

	/*
	 * Reuse the header's ack_seq field to remember the sequence number
	 * that follows this segment (payload plus one each for SYN and FIN).
	 */
	th->ack_seq = th->seq + skb->len;
	if (th->syn)
		th->ack_seq++;
	if (th->fin)
		th->ack_seq++;

	/* Sanity repair: acked_seq is never allowed to lag copied_seq. */
	if (before(sk->acked_seq, sk->copied_seq)) {
		printk("*** tcp.c:tcp_data bug acked < copied\n");
		sk->acked_seq = sk->copied_seq;
	}

	/* Now figure out if we can ack anything. */
	if ((!dup_dumped && (skb1 == NULL || skb1->acked)) ||
	    before(th->seq, sk->acked_seq+1)) {
		if (before(th->seq, sk->acked_seq+1)) {
			if (after(th->ack_seq, sk->acked_seq))
				sk->acked_seq = th->ack_seq;
			skb->acked = 1;

			/* When we ack the fin, we turn on the RCV_SHUTDOWN flag. */
			if (skb->h.th->fin) {
				if (!sk->dead)
					sk->state_change(sk);
				sk->shutdown |= RCV_SHUTDOWN;
			}

			/*
			 * Segments queued after this one may have become
			 * contiguous.  Walk forward, marking each acked,
			 * until the first one that still leaves a hole.
			 */
			for(skb2 = (struct sk_buff *)skb->next;
			    skb2 !=(struct sk_buff *) sk->rqueue;
			    skb2 = (struct sk_buff *)skb2->next) {
				if (before(skb2->h.th->seq, sk->acked_seq+1)) {
					if (after(skb2->h.th->ack_seq, sk->acked_seq)) {
						long old_acked_seq = sk->acked_seq;

						sk->acked_seq = skb2->h.th->ack_seq;

						/*
						 * Shrink sk->window by the
						 * amount acked_seq advanced,
						 * clamping at zero.
						 */
						if((int)(sk->acked_seq - old_acked_seq) >0) {
							int new_window=sk->window-sk->acked_seq+
								old_acked_seq;
							if(new_window<0)
								new_window=0;
							sk->window = new_window;
						}
					}
					skb2->acked = 1;

					/*
					 * When we ack the fin, we turn on
					 * the RCV_SHUTDOWN flag.
					 */
					if (skb2->h.th->fin) {
						sk->shutdown |= RCV_SHUTDOWN;
						if (!sk->dead)
							sk->state_change(sk);
					}

					/* Force an immediate ack. */
					sk->ack_backlog = sk->max_ack_backlog;
				} else {
					break;
				}
			}

			/*
			 * This also takes care of updating the window.
			 * This if statement needs to be simplified.
			 *
			 * The "ack now" arm is empty: its tcp_send_ack() is
			 * commented out, and the ack is sent by the else arm
			 * of the `if (!skb->acked)' test further down.
			 */
			if (!sk->delay_acks ||
			    sk->ack_backlog >= sk->max_ack_backlog ||
			    sk->bytes_rcv > sk->max_unacked || th->fin) {
				/* tcp_send_ack(sk->send_seq, sk->acked_seq,sk,th, saddr); */
			} else {
				sk->ack_backlog++;
				if(sk->debug)
					printk("Ack queued.\n");
				reset_timer(sk, TIME_WRITE, TCP_ACK_TIME);
			}
		}
	}

	/*
	 * If we've missed a packet, send an ack.
	 * Also start a timer to send another.
	 */
	if (!skb->acked) {
		/*
		 * This is important.  If we don't have much room left,
		 * we need to throw out a few packets so we have a good
		 * window.  Note that mtu is used, not mss, because mss is really
		 * for the send side.  He could be sending us stuff as large as mtu.
		 *
		 * NOTE(review): nothing visible here prevents skb1 from being
		 * skb itself (skb is on the queue and not acked).  If that
		 * happens, `th' passed to tcp_send_ack() below points into a
		 * buffer that was just handed to kfree_skb() -- confirm.
		 */
		while (sk->prot->rspace(sk) < sk->mtu) {
			skb1 = skb_peek(&sk->rqueue);
			if (skb1 == NULL) {
				printk("INET: tcp.c:tcp_data memory leak detected.\n");
				break;
			}

			/* Don't throw out something that has been acked. */
			if (skb1->acked) {
				break;
			}

			skb_unlink(skb1);
#ifdef OLDWAY
			if (skb1->prev == skb1) {
				sk->rqueue = NULL;
			} else {
				sk->rqueue = (struct sk_buff *)skb1->prev;
				skb1->next->prev = skb1->prev;
				skb1->prev->next = skb1->next;
			}
#endif
			kfree_skb(skb1, FREE_READ);
		}
		tcp_send_ack(sk->send_seq, sk->acked_seq, sk, th, saddr);
		sk->ack_backlog++;
		reset_timer(sk, TIME_WRITE, TCP_ACK_TIME);
	} else {
		/*
		 * The segment was marked acked above, i.e. it was in
		 * sequence.  Send the ack for it now.
		 */
		tcp_send_ack(sk->send_seq, sk->acked_seq, sk, th, saddr);
	}

	/* Now tell the user we may have some data. */
	if (!sk->dead) {
		if(sk->debug)
			printk("Data wakeup.\n");
		sk->data_ready(sk,0);
	} else {
		DPRINTF((DBG_TCP, "data received on dead socket.\n"));
	}

	/*
	 * In FIN_WAIT2, once the peer's FIN has been acked (acked_seq has
	 * reached fin_seq) and everything we sent has been acknowledged,
	 * shut both directions and move to LAST_ACK.
	 */
	if (sk->state == TCP_FIN_WAIT2 &&
	    sk->acked_seq == sk->fin_seq && sk->rcv_ack_seq == sk->send_seq) {
		DPRINTF((DBG_TCP, "tcp_data: entering last_ack state sk = %X\n", sk));

		/* tcp_send_ack(sk->send_seq, sk->acked_seq, sk, th, saddr); */
		sk->shutdown = SHUTDOWN_MASK;
		sk->state = TCP_LAST_ACK;
		if (!sk->dead)
			sk->state_change(sk);
	}

	return(0);
}


/*
 * tcp_urg: handle a segment that has the URG bit set.
 *
 * The reader is woken through sk->data_ready() unless the socket is dead.
 * With sk->urginline set, the URG bit in the header is cleared and PSH is
 * set instead, and nothing else is done.  Otherwise, if no urgent data is
 * already outstanding (sk->urg == 0) and sk->proc is non-zero, SIGURG is
 * sent: to process sk->proc when it is positive, to process group
 * -sk->proc when it is negative.  sk->urg is then incremented.
 *
 * saddr is not used.  Always returns 0.
 */
static int
tcp_urg(struct sock *sk, struct tcphdr *th, unsigned long saddr)
{
	extern int kill_pg(int pg, int sig, int priv);
	extern int kill_proc(int pid, int sig, int priv);

	if (!sk->dead)
		sk->data_ready(sk,0);

	if (sk->urginline) {
		th->urg = 0;
		th->psh = 1;
		return(0);
	}

	if (!sk->urg) {
		/* So if we get more urgent data, we don't signal the user again. */
		if (sk->proc != 0) {
			if (sk->proc > 0) {
				kill_proc(sk->proc, SIGURG, 1);
			} else {
				kill_pg(-sk->proc, SIGURG, 1);
			}
		}
	}
	sk->urg++;
	return(0);
}


/* This deals with incoming fins.
'Linus at 9 O'clock' 8-) */ static int tcp_fin(struct sock *sk, struct tcphdr *th, unsigned long saddr, struct device *dev) { DPRINTF((DBG_TCP, "tcp_fin(sk=%X, th=%X, saddr=%X, dev=%X)\n", sk, th, saddr, dev)); if (!sk->dead) { sk->state_change(sk); } switch(sk->state) { case TCP_SYN_RECV: case TCP_SYN_SENT: case TCP_ESTABLISHED: /* Contains the one that needs to be acked */ sk->fin_seq = th->seq+1; sk->state = TCP_CLOSE_WAIT; if (th->rst) sk->shutdown = SHUTDOWN_MASK; break; case TCP_CLOSE_WAIT: case TCP_FIN_WAIT2: break; /* we got a retransmit of the fin. */ case TCP_FIN_WAIT1: /* Contains the one that needs to be acked */ sk->fin_seq = th->seq+1; sk->state = TCP_FIN_WAIT2; break; default: case TCP_TIME_WAIT: sk->state = TCP_LAST_ACK; /* Start the timers. */ reset_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN); return(0); } sk->ack_backlog++; return(0); } /* This will accept the next outstanding connection. */ static struct sock * tcp_accept(struct sock *sk, int flags) { struct sock *newsk; struct sk_buff *skb; DPRINTF((DBG_TCP, "tcp_accept(sk=%X, flags=%X, addr=%s)\n", sk, flags, in_ntoa(sk->saddr))); /* * We need to make sure that this socket is listening, * and that it has something pending. */ if (sk->state != TCP_LISTEN) { sk->err = EINVAL; return(NULL); } /* avoid the race. */ cli(); sk->inuse = 1; while((skb = get_firstr(sk)) == NULL) { if (flags & O_NONBLOCK) { sti(); release_sock(sk); sk->err = EAGAIN; return(NULL); } release_sock(sk); interruptible_sleep_on(sk->sleep); if (current->signal & ~current->blocked) { sti(); sk->err = ERESTARTSYS; return(NULL); } sk->inuse = 1; } sti(); /* Now all we need to do is return skb->sk. */ newsk = skb->sk; kfree_skb(skb, FREE_READ); sk->ack_backlog--; release_sock(sk); return(newsk); } /* This will initiate an outgoing connection. 
*/

/*
 * tcp_connect: begin an active open.  Validates the user-supplied
 * address, builds a SYN segment carrying a 4-byte option with sk->mtu,
 * queues it for transmission and puts the socket into TCP_SYN_SENT.
 *
 *	sk		the socket, which must be in TCP_CLOSE
 *	usin		user-space pointer to the destination address
 *	addr_len	length of *usin, at least 8
 *
 * Returns 0 once the SYN has been queued, otherwise:
 *	-EISCONN	socket is not in TCP_CLOSE
 *	-EINVAL		addr_len is below 8
 *	-EAFNOSUPPORT	family is neither 0 nor AF_INET
 *	-ENETUNREACH	destination is a broadcast address, or
 *			build_header() failed
 *	-EBUSY		destination equals this socket's own address/port
 *	-ENOMEM		no buffer for the SYN
 * or whatever non-zero value verify_area() returned.
 */
static int
tcp_connect(struct sock *sk, struct sockaddr_in *usin, int addr_len)
{
	struct sk_buff *buff;
	struct sockaddr_in sin;
	struct device *dev=NULL;
	unsigned char *ptr;
	int tmp;
	struct tcphdr *t1;
	int err;

	if (sk->state != TCP_CLOSE)
		return(-EISCONN);
	if (addr_len < 8)
		return(-EINVAL);

	/* Copy the address in from user space, never more than sizeof(sin). */
	err=verify_area(VERIFY_READ, usin, addr_len);
	if(err)
		return err;

	memcpy_fromfs(&sin,usin, min(sizeof(sin), addr_len));

	/* A zero family is let through as well as AF_INET. */
	if (sin.sin_family && sin.sin_family != AF_INET)
		return(-EAFNOSUPPORT);

	DPRINTF((DBG_TCP, "TCP connect daddr=%s\n", in_ntoa(sin.sin_addr.s_addr)));

	/* Don't want a TCP connection going to a broadcast address */
	if (chk_addr(sin.sin_addr.s_addr) == IS_BROADCAST) {
		DPRINTF((DBG_TCP, "TCP connection to broadcast address not allowed\n"));
		return(-ENETUNREACH);
	}

	/* Connect back to the same socket: Blows up so disallow it */
	if(sk->saddr == sin.sin_addr.s_addr && sk->num==ntohs(sin.sin_port))
		return -EBUSY;

	/* Fill in the connection parameters with the socket marked busy. */
	sk->inuse = 1;
	sk->daddr = sin.sin_addr.s_addr;
	sk->send_seq = jiffies * SEQ_TICK - seq_offset;
	sk->window_seq = sk->send_seq;
	sk->rcv_ack_seq = sk->send_seq -1;
	sk->err = 0;
	sk->dummy_th.dest = sin.sin_port;
	release_sock(sk);

	/* The socket is not held across this GFP_KERNEL allocation. */
	buff = sk->prot->wmalloc(sk,MAX_SYN_SIZE,0, GFP_KERNEL);
	if (buff == NULL) {
		return(-ENOMEM);
	}
	sk->inuse = 1;
	buff->mem_addr = buff;
	buff->mem_len = MAX_SYN_SIZE;
	buff->len = 24;		/* matches doff = 6 below: 6 * 4 bytes */
	buff->sk = sk;
	buff->free = 1;
	t1 = (struct tcphdr *) buff->data;

	/* Put in the IP header and routing stuff. */
	/* We need to build the routing stuff from the things saved in skb. */
	tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
				     IPPROTO_TCP, NULL, MAX_SYN_SIZE,sk->ip_tos,sk->ip_ttl);
	if (tmp < 0) {
		sk->prot->wfree(sk, buff->mem_addr, buff->mem_len);
		release_sock(sk);
		return(-ENETUNREACH);
	}

	/* The TCP header starts tmp bytes in, after what build_header() wrote. */
	buff->len += tmp;
	t1 = (struct tcphdr *)((char *)t1 +tmp);

	memcpy(t1,(void *)&(sk->dummy_th), sizeof(*t1));
	t1->seq = ntohl(sk->send_seq++);
	buff->h.seq = sk->send_seq;
	t1->ack = 0;
	/* NOTE(review): stored without a byte-order conversion, unlike the
	 * window fields written elsewhere in this file -- confirm intent. */
	t1->window = 2;
	t1->res1=0;
	t1->res2=0;
	t1->rst = 0;
	t1->urg = 0;
	t1->psh = 0;
	t1->syn = 1;
	t1->urg_ptr = 0;
	t1->doff = 6;

	/*
	 * Pick the size to advertise: whatever the user asked for, else
	 * 576 - HEADER_SIZE when the address test below says the peer is
	 * on a different (sub)net, else MAX_WINDOW.
	 */
	if (sk->user_mss)
		sk->mtu = sk->user_mss;
	else {
#ifdef SUBNETSARELOCAL
		if ((sk->saddr ^ sk->daddr) & default_mask(sk->saddr))
#else
		if ((sk->saddr ^ sk->daddr) & dev->pa_mask)
#endif
			sk->mtu = 576 - HEADER_SIZE;
		else
			sk->mtu = MAX_WINDOW;
	}
	/* but not bigger than device MTU */
	sk->mtu = min(sk->mtu, dev->mtu - HEADER_SIZE);

	/*
	 * Put in the TCP options to say MTU: option kind 2, length 4,
	 * followed by sk->mtu high byte first.
	 */
	ptr = (unsigned char *)(t1+1);
	ptr[0] = 2;
	ptr[1] = 4;
	ptr[2] = (sk->mtu) >> 8;
	ptr[3] = (sk->mtu) & 0xff;
	tcp_send_check(t1, sk->saddr, sk->daddr,
		       sizeof(struct tcphdr) + 4, sk);

	/* This must go first otherwise a really quick response will get reset. */
	sk->state = TCP_SYN_SENT;
	sk->rtt = TCP_CONNECT_TIME;
	reset_timer(sk, TIME_WRITE, TCP_CONNECT_TIME);	/* Timer for repeating the SYN until an answer */
	sk->retransmits = TCP_RETR2 - TCP_SYN_RETRIES;

	sk->prot->queue_xmit(sk, dev, buff, 0);

	release_sock(sk);
	return(0);
}


/*
 * tcp_sequence: decide whether an incoming segment's sequence numbers are
 * acceptable for this socket.
 *
 *	sk	the receiving socket
 *	th	TCP header of the segment (th->seq already in host order)
 *	len	segment length including the TCP header
 *	opt	only used in the debug trace
 *	saddr	handed through to tcp_send_ack()
 *	dev	handed through to tcp_reset()
 *
 * Returns 1 if the caller should go on processing the segment, 0 if it
 * should be dropped.  On rejection this function itself may send a reset
 * (in SYN_SENT/SYN_RECV, where it still returns 1) or an ack.
 */
static int
tcp_sequence(struct sock *sk, struct tcphdr *th, short len,
	     struct options *opt, unsigned long saddr, struct device *dev)
{
	/*
	 * This isn't quite right.  sk->acked_seq could be more recent
	 * than sk->window.  This is however close enough.  We will accept
	 * slightly more packets than we should, but it should not cause
	 * problems unless someone is trying to forge packets.
	 */
	DPRINTF((DBG_TCP, "tcp_sequence(sk=%X, th=%X, len = %d, opt=%d, saddr=%X)\n",
		 sk, th, len, opt, saddr));

	/*
	 * Accept when the first byte is inside the window, or the last
	 * byte is, or the segment starts before the window and ends
	 * after it.
	 */
	if (between(th->seq, sk->acked_seq, sk->acked_seq + sk->window)||
	    between(th->seq + len-(th->doff*4), sk->acked_seq + 1,
		    sk->acked_seq + sk->window) ||
	    (before(th->seq, sk->acked_seq) &&
	     after(th->seq + len -(th->doff*4), sk->acked_seq + sk->window))) {
		return(1);
	}
	DPRINTF((DBG_TCP, "tcp_sequence: rejecting packet.\n"));

	/*
	 * Send a reset if we get something not ours and we are
	 * unsynchronized.  Note: We don't do anything to our end.  We
	 * are just killing the bogus remote connection then we will
	 * connect again and it will work (with luck).
	 */
	if(sk->state==TCP_SYN_SENT||sk->state==TCP_SYN_RECV) {
		tcp_reset(sk->saddr,sk->daddr,th,sk->prot,NULL,dev,
			  sk->ip_tos,sk->ip_ttl);
		return(1);
	}

	/*
	 * If it's too far ahead, send an ack to let the
	 * other end know what we expect.
	 */
	if (after(th->seq, sk->acked_seq + sk->window)) {
		if(!th->rst)
			tcp_send_ack(sk->send_seq, sk->acked_seq, sk, th, saddr);
		return(0);
	}

#ifdef undef
	/*
	 * if we do this, we won't respond to keepalive packets, since those
	 * are slightly out of window, and we have to generate an ack
	 * a late ack out still not to have a sequence number less than
	 * one we've seen before.  Berkeley doesn't seem to do this, but it's
	 * always hard to be sure.
	 */
	/* In case it's just a late ack, let it through. */
	if (th->ack && len == (th->doff * 4) &&
	    after(th->seq, sk->acked_seq - 32767) &&
	    !th->fin && !th->syn)
		return(1);
#endif

	/* Anything else that is not a reset gets an ack as well. */
	if (!th->rst) {
		/* Try to resync things. */
		tcp_send_ack(sk->send_seq, sk->acked_seq, sk, th, saddr);
	}
	return(0);
}


/*
 * tcp_rcv: entry point for a received TCP segment.
 *
 *	skb		buffer holding the segment; skb->h.th is the header
 *	dev		device it arrived on (a NULL dev is refused)
 *	opt		IP options, passed on to tcp_reset() and friends
 *	daddr, saddr	addresses handed in by the caller; see how they
 *			are passed to get_sock() and tcp_check() below
 *	len		segment length including the TCP header
 *	redo		zero for a fresh segment; non-zero skips the
 *			checksum, lookup failure handling and backlog
 *			queueing done in the `if (!redo)' part
 *	protocol	unused (the only check on it is compiled out)
 *
 * The socket is looked up, the segment is either put on sk->back_log
 * (socket busy) or processed according to sk->state.  Always returns 0.
 *
 * NOTE(review): the three early `return(0)' paths for redo with no
 * socket, for sk->prot == NULL, and the checks at the very top do not
 * kfree_skb(); the sk->prot one also leaves sk->inuse set -- confirm
 * these are intended as can't-happen diagnostics.
 */
int
tcp_rcv(struct sk_buff *skb, struct device *dev, struct options *opt,
	unsigned long daddr, unsigned short len,
	unsigned long saddr, int redo, struct inet_protocol * protocol)
{
	struct tcphdr *th;
	struct sock *sk;

	if (!skb) {
		DPRINTF((DBG_TCP, "tcp.c: tcp_rcv skb = NULL\n"));
		return(0);
	}
#if 0	/* FIXME: it's ok for protocol to be NULL */
	if (!protocol) {
		DPRINTF((DBG_TCP, "tcp.c: tcp_rcv protocol = NULL\n"));
		return(0);
	}

	if (!opt) {	/* FIXME: it's ok for opt to be NULL */
		DPRINTF((DBG_TCP, "tcp.c: tcp_rcv opt = NULL\n"));
	}
#endif
	if (!dev) {
		DPRINTF((DBG_TCP, "tcp.c: tcp_rcv dev = NULL\n"));
		return(0);
	}
	th = skb->h.th;

	/* Find the socket. */
	sk = get_sock(&tcp_prot, th->dest, saddr, th->source, daddr);
	DPRINTF((DBG_TCP, "<<\n"));
	DPRINTF((DBG_TCP, "len = %d, redo = %d, skb=%X\n", len, redo, skb));

	/* If this socket has got a reset it's to all intents and purposes really dead */
	if (sk!=NULL && sk->zapped)
		sk=NULL;

	if (sk) {
		DPRINTF((DBG_TCP, "sk = %X:\n", sk));
	}

	if (!redo) {
		if (tcp_check(th, len, saddr, daddr )) {
			skb->sk = NULL;
			DPRINTF((DBG_TCP, "packet dropped with bad checksum.\n"));
			if (inet_debug == DBG_SLIP)
				printk("\rtcp_rcv: bad checksum\n");
			kfree_skb(skb,FREE_READ);
			/*
			 * We don't release the socket because it was
			 * never marked in use.
			 */
			return(0);
		}

		/* See if we know about the socket. */
		if (sk == NULL) {
			/* No socket: answer anything but a reset with a reset. */
			if (!th->rst) {
				th->seq = ntohl(th->seq);
				/* So reset is always called with th->seq in host order */
				tcp_reset(daddr, saddr, th, &tcp_prot, opt,dev,skb->ip_hdr->tos,255);
			}
			skb->sk = NULL;
			kfree_skb(skb, FREE_READ);
			return(0);
		}

		/* Initialise the buffer's bookkeeping for this socket. */
		skb->len = len;
		skb->sk = sk;
		skb->acked = 0;
		skb->used = 0;
		skb->free = 0;
		skb->urg_used = 0;
		skb->saddr = daddr;
		skb->daddr = saddr;

		/* From here on th->seq is in host byte order. */
		th->seq = ntohl(th->seq);

		/*
		 * We may need to add it to the backlog here: if the socket
		 * is busy the buffer is linked onto the circular
		 * sk->back_log list and nothing more is done with it here.
		 */
		cli();
		if (sk->inuse) {
			if (sk->back_log == NULL) {
				sk->back_log = skb;
				skb->next = skb;
				skb->prev = skb;
			} else {
				skb->next = sk->back_log;
				skb->prev = sk->back_log->prev;
				skb->prev->next = skb;
				skb->next->prev = skb;
			}
			sti();
			return(0);
		}
		sk->inuse = 1;
		sti();
	} else {
		if (!sk) {
			DPRINTF((DBG_TCP, "tcp.c: tcp_rcv bug sk=NULL redo = 1\n"));
			return(0);
		}
	}

	if (!sk->prot) {
		DPRINTF((DBG_TCP, "tcp.c: tcp_rcv sk->prot = NULL \n"));
		return(0);
	}

	/* Charge the memory to the socket; drop the segment if it does not fit. */
	if (sk->rmem_alloc + skb->mem_len >= sk->rcvbuf) {
		skb->sk = NULL;
		DPRINTF((DBG_TCP, "dropping packet due to lack of buffer space.\n"));
		kfree_skb(skb, FREE_READ);
		release_sock(sk);
		return(0);
	}
	sk->rmem_alloc += skb->mem_len;

	DPRINTF((DBG_TCP, "About to do switch.\n"));

	/* Now deal with it. */
	switch(sk->state) {
	/*
	 * This should close the system down if it's waiting
	 * for an ack that is never going to be sent.
	 */
	case TCP_LAST_ACK:
		if (th->rst) {
			sk->zapped=1;
			sk->err = ECONNRESET;
			sk->state = TCP_CLOSE;
			sk->shutdown = SHUTDOWN_MASK;
			if (!sk->dead) {
				sk->state_change(sk);
			}
			kfree_skb(skb, FREE_READ);
			release_sock(sk);
			return(0);
		}
		/* No reset: falls through into the synchronised states. */

	case TCP_ESTABLISHED:
	case TCP_CLOSE_WAIT:
	case TCP_FIN_WAIT1:
	case TCP_FIN_WAIT2:
	case TCP_TIME_WAIT:
		if (!tcp_sequence(sk, th, len, opt, saddr,dev)) {
			if (inet_debug == DBG_SLIP)
				printk("\rtcp_rcv: not in seq\n");
#ifdef undef
			/* nice idea, but tcp_sequence already does this.  Maybe it shouldn't?? */
			if(!th->rst)
				tcp_send_ack(sk->send_seq, sk->acked_seq, sk, th, saddr);
#endif
			kfree_skb(skb, FREE_READ);
			release_sock(sk);
			return(0);
		}

		if (th->rst) {
			sk->zapped=1;
			/* This means the thing should really be closed. */
			sk->err = ECONNRESET;
			if (sk->state == TCP_CLOSE_WAIT) {
				sk->err = EPIPE;
			}

			/*
			 * A reset with a fin just means that
			 * the data was not all read.
			 */
			sk->state = TCP_CLOSE;
			sk->shutdown = SHUTDOWN_MASK;
			if (!sk->dead) {
				sk->state_change(sk);
			}
			kfree_skb(skb, FREE_READ);
			release_sock(sk);
			return(0);
		}

		/* A SYN on a synchronised connection: reset and close. */
		if (
#if 0
		if ((opt && (opt->security != 0 ||
			     opt->compartment != 0)) ||
#endif
			 th->syn) {
			sk->err = ECONNRESET;
			sk->state = TCP_CLOSE;
			sk->shutdown = SHUTDOWN_MASK;
			tcp_reset(daddr, saddr, th, sk->prot, opt,dev, sk->ip_tos,sk->ip_ttl);
			if (!sk->dead) {
				sk->state_change(sk);
			}
			kfree_skb(skb, FREE_READ);
			release_sock(sk);
			return(0);
		}

		/* Order matters from here: ack, then urgent, then data, then fin. */
		if (th->ack) {
			if (!tcp_ack(sk, th, saddr, len)) {
				kfree_skb(skb, FREE_READ);
				release_sock(sk);
				return(0);
			}
		}
		if (th->urg) {
			if (tcp_urg(sk, th, saddr)) {
				kfree_skb(skb, FREE_READ);
				release_sock(sk);
				return(0);
			}
		}

		if (tcp_data(skb, sk, saddr, len)) {
			kfree_skb(skb, FREE_READ);
			release_sock(sk);
			return(0);
		}

		/*
		 * Moved: you must do data then fin bit
		 *
		 * NOTE(review): tcp_data() has paths that kfree_skb() the
		 * buffer and still return 0; `th' points into that buffer,
		 * so the th->fin test here may read freed memory -- confirm.
		 */
		if (th->fin && tcp_fin(sk, th, saddr, dev)) {
			kfree_skb(skb, FREE_READ);
			release_sock(sk);
			return(0);
		}

		release_sock(sk);
		return(0);

	case TCP_CLOSE:
		if (sk->dead || sk->daddr) {
			DPRINTF((DBG_TCP, "packet received for closed,dead socket\n"));
			kfree_skb(skb, FREE_READ);
			release_sock(sk);
			return(0);
		}

		if (!th->rst) {
			if (!th->ack)
				th->ack_seq = 0;
			tcp_reset(daddr, saddr, th, sk->prot, opt,dev,sk->ip_tos,sk->ip_ttl);
		}
		kfree_skb(skb, FREE_READ);
		release_sock(sk);
		return(0);

	case TCP_LISTEN:
		/* Resets are ignored, acks are answered with a reset. */
		if (th->rst) {
			kfree_skb(skb, FREE_READ);
			release_sock(sk);
			return(0);
		}
		if (th->ack) {
			tcp_reset(daddr, saddr, th, sk->prot, opt,dev,sk->ip_tos,sk->ip_ttl);
			kfree_skb(skb, FREE_READ);
			release_sock(sk);
			return(0);
		}

		if (th->syn) {
#if 0
			if (opt->security != 0 || opt->compartment != 0) {
				tcp_reset(daddr, saddr, th, prot, opt,dev);
				release_sock(sk);
				return(0);
			}
#endif

			/*
			 * Now we just put the whole thing including
			 * the header and saddr, and protocol pointer
			 * into the buffer.  We can't respond until the
			 * user tells us to accept the connection.
			 *
			 * The buffer is handed to tcp_conn_request() and is
			 * not freed on this path.
			 */
			tcp_conn_request(sk, skb, daddr, saddr, opt, dev);
			release_sock(sk);
			return(0);
		}

		kfree_skb(skb, FREE_READ);
		release_sock(sk);
		return(0);

	default:
		/* States not listed elsewhere get a sequence check first ... */
		if (!tcp_sequence(sk, th, len, opt, saddr,dev)) {
			kfree_skb(skb, FREE_READ);
			release_sock(sk);
			return(0);
		}
		/* ... and then share the TCP_SYN_SENT code below. */

	case TCP_SYN_SENT:
		if (th->rst) {
			sk->err = ECONNREFUSED;
			sk->state = TCP_CLOSE;
			sk->shutdown = SHUTDOWN_MASK;
			sk->zapped = 1;
			if (!sk->dead) {
				sk->state_change(sk);
			}
			kfree_skb(skb, FREE_READ);
			release_sock(sk);
			return(0);
		}
#if 0
		if (opt->security != 0 ||
		    opt->compartment != 0) {
			sk->err = ECONNRESET;
			sk->state = TCP_CLOSE;
			sk->shutdown = SHUTDOWN_MASK;
			tcp_reset(daddr, saddr, th, sk->prot, opt, dev);
			if (!sk->dead) {
				wake_up_interruptible(sk->sleep);
			}
			kfree_skb(skb, FREE_READ);
			release_sock(sk);
			return(0);
		}
#endif
		/* No ack: a bare SYN moves us to SYN_RECV, and the segment is dropped. */
		if (!th->ack) {
			if (th->syn) {
				sk->state = TCP_SYN_RECV;
			}

			kfree_skb(skb, FREE_READ);
			release_sock(sk);
			return(0);
		}

		switch(sk->state) {
		case TCP_SYN_SENT:
			if (!tcp_ack(sk, th, saddr, len)) {
				tcp_reset(daddr, saddr, th,
					  sk->prot, opt,dev,sk->ip_tos,sk->ip_ttl);
				kfree_skb(skb, FREE_READ);
				release_sock(sk);
				return(0);
			}

			/*
			 * If the syn bit is also set, switch to
			 * tcp_syn_recv, and then to established.
			 */
			if (!th->syn) {
				kfree_skb(skb, FREE_READ);
				release_sock(sk);
				return(0);
			}

			/* Ack the syn and fall through. */
			sk->acked_seq = th->seq+1;
			sk->fin_seq = th->seq;
			tcp_send_ack(sk->send_seq, th->seq+1,
				     sk, th, sk->daddr);

		case TCP_SYN_RECV:
			if (!tcp_ack(sk, th, saddr, len)) {
				tcp_reset(daddr, saddr, th,
					  sk->prot, opt, dev,sk->ip_tos,sk->ip_ttl);
				kfree_skb(skb, FREE_READ);
				release_sock(sk);
				return(0);
			}
			sk->state = TCP_ESTABLISHED;

			/*
			 * Now we need to finish filling out
			 * some of the tcp header.
			 */
			/* We need to check for mtu info. */
			tcp_options(sk, th);
			sk->dummy_th.dest = th->source;
			sk->copied_seq = sk->acked_seq-1;
			if (!sk->dead) {
				sk->state_change(sk);
			}

			/*
			 * We've already processed his first
			 * ack.  In just about all cases that
			 * will have set max_window.  This is
			 * to protect us against the possibility
			 * that the initial window he sent was 0.
			 * This must occur after tcp_options, which
			 * sets sk->mtu.
			 */
			if (sk->max_window == 0) {
				sk->max_window = 32;
				sk->mss = min(sk->max_window, sk->mtu);
			}

			/*
			 * Now process the rest like we were
			 * already in the established state.
			 */
			if (th->urg) {
				if (tcp_urg(sk, th, saddr)) {
					kfree_skb(skb, FREE_READ);
					release_sock(sk);
					return(0);
				}
			}

			/*
			 * NOTE(review): unlike the other tcp_data() call
			 * sites in this function, a non-zero return frees
			 * the buffer here and then carries on to read
			 * th->fin -- confirm.
			 */
			if (tcp_data(skb, sk, saddr, len))
				kfree_skb(skb, FREE_READ);

			if (th->fin)
				tcp_fin(sk, th, saddr, dev);
			release_sock(sk);
			return(0);
		}

		/* Any other state reached through `default:' ends up here. */
		if (th->urg) {
			if (tcp_urg(sk, th, saddr)) {
				kfree_skb(skb, FREE_READ);
				release_sock(sk);
				return(0);
			}
		}

		if (tcp_data(skb, sk, saddr, len)) {
			kfree_skb(skb, FREE_READ);
			release_sock(sk);
			return(0);
		}

		if (!th->fin) {
			release_sock(sk);
			return(0);
		}
		tcp_fin(sk, th, saddr, dev);
		release_sock(sk);
		return(0);
	}
}


/*
 * tcp_write_wakeup: send a data-less ack whose sequence number is one
 * below sk->send_seq.
 *
 * This routine sends a packet with an out of date sequence
 * number.  It assumes the other end will try to ack it.
 *
 * Nothing is sent if the socket has been zapped, if its state is neither
 * TCP_ESTABLISHED nor TCP_CLOSE_WAIT, or if no buffer or header could be
 * obtained.
 */
static void
tcp_write_wakeup(struct sock *sk)
{
	struct sk_buff *buff;
	struct tcphdr *t1;
	struct device *dev=NULL;
	int tmp;

	if (sk->zapped)
		return;	/* After a valid reset we can send no more */

	if (sk -> state != TCP_ESTABLISHED && sk->state != TCP_CLOSE_WAIT) return;

	buff = sk->prot->wmalloc(sk,MAX_ACK_SIZE,1, GFP_ATOMIC);
	if (buff == NULL) return;

	buff->mem_addr = buff;
	buff->mem_len = MAX_ACK_SIZE;
	buff->len = sizeof(struct tcphdr);
	buff->free = 1;
	buff->sk = sk;
	DPRINTF((DBG_TCP, "in tcp_write_wakeup\n"));
	t1 = (struct tcphdr *) buff->data;

	/* Put in the IP header and routing stuff. */
	tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
				     IPPROTO_TCP, sk->opt, MAX_ACK_SIZE,sk->ip_tos,sk->ip_ttl);
	if (tmp < 0) {
		sk->prot->wfree(sk, buff->mem_addr, buff->mem_len);
		return;
	}

	/* The TCP header goes tmp bytes in, after what build_header() wrote. */
	buff->len += tmp;
	t1 = (struct tcphdr *)((char *)t1 +tmp);

	memcpy(t1,(void *) &sk->dummy_th, sizeof(*t1));

	/*
	 * Use a previous sequence.
	 * This should cause the other end to send an ack.
	 */
	t1->seq = ntohl(sk->send_seq-1);
	t1->ack = 1;
	t1->res1= 0;
	t1->res2= 0;
	t1->rst = 0;
	t1->urg = 0;
	t1->psh = 0;
	t1->fin = 0;
	t1->syn = 0;
	t1->ack_seq = ntohl(sk->acked_seq);
	t1->window = ntohs(tcp_select_window(sk)/*sk->prot->rspace(sk)*/);
	t1->doff = sizeof(*t1)/4;
	tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), sk);

	/* Send it and free it.
	 * This will prevent the timer from automatically being restarted.
	 */
	sk->prot->queue_xmit(sk, dev, buff, 1);
}

/*
 * tcp_send_probe0: probe a zero window.
 *
 * This routine probes a zero window.  It makes a copy of the first
 * packet in the write queue, but with just one byte of data.
 *
 * Nothing is sent if the socket has been zapped, if its state is not one
 * of ESTABLISHED, CLOSE_WAIT, FIN_WAIT1 or FIN_WAIT2, if sk->wfront is
 * empty, or if that buffer has no device.  If the copy cannot be
 * allocated the probe timer is re-armed with a value of 10 and the
 * function returns.  After a successful send, sk->backoff and the
 * retransmit counters are incremented, sk->rto is doubled (capped at
 * 120*HZ) and the probe timer restarted with it.
 */
void
tcp_send_probe0(struct sock *sk)
{
	unsigned char *raw;
	struct iphdr *iph;
	struct sk_buff *skb2, *skb;
	int len, hlen, data;
	struct tcphdr *t1;
	struct device *dev;

	if (sk->zapped)
		return;	/* After a valid reset we can send no more */

	if (sk -> state != TCP_ESTABLISHED && sk->state != TCP_CLOSE_WAIT &&
	    sk -> state != TCP_FIN_WAIT1 && sk->state != TCP_FIN_WAIT2)
		return;

	skb = sk->wfront;
	if (skb == NULL)
		return;

	dev = skb->dev;
	/* I know this can't happen but as it does.. */
	if(dev==NULL)
	{
		printk("tcp_send_probe0: NULL device bug!\n");
		return;
	}
	IS_SKB(skb);

	/*
	 * hlen: link-level header plus IP header (ihl words).
	 * data: what is left of the packet after a plain TCP header.
	 * len:  headers plus at most one byte of that data.
	 */
	raw = skb->data;
	iph = (struct iphdr *) (raw + dev->hard_header_len);

	hlen = (iph->ihl * sizeof(unsigned long)) + dev->hard_header_len;
	data = skb->len - hlen - sizeof(struct tcphdr);
	len = hlen + sizeof(struct tcphdr) + (data ? 1 : 0);

	/* Allocate buffer. */
	if ((skb2 = alloc_skb(sizeof(struct sk_buff) + len, GFP_ATOMIC)) == NULL) {
		/* printk("alloc failed raw %x th %x hlen %d data %d len %d\n",
		   raw, skb->h.th, hlen, data, len); */
		reset_timer (sk, TIME_PROBE0, 10);	/* try again real soon */
		return;
	}

	skb2->arp = skb->arp;
	skb2->len = len;
	skb2->h.raw = (char *)(skb2->data);

	sk->wmem_alloc += skb2->mem_len;

	/* Copy the packet header into the new buffer. */
	memcpy(skb2->h.raw, raw, len);

	skb2->h.raw += hlen;	/* it's now h.th -- pointer to the tcp header */

	t1 = skb2->h.th;

	/* source, dest, seq, from existing packet */
	t1->ack_seq = ntohl(sk->acked_seq);
	t1->res1 = 0;
	/* doff, fin, from existing packet.  Fin is safe because Linux always
	 * sends fin in a separate packet
	 * syn, rst, had better be zero in original */
	t1->ack = 1;
	t1->urg = 0;	/* urgent pointer might be beyond this fragment */
	t1->res2 = 0;
	t1->window = ntohs(tcp_select_window(sk)/*sk->prot->rspace(sk)*/);
	t1->urg_ptr = 0;
	tcp_send_check(t1, sk->saddr, sk->daddr, len - hlen, sk);

	/* Send it and free it.
	 * This will prevent the timer from automatically being restarted.
	 */
	sk->prot->queue_xmit(sk, dev, skb2, 1);
	sk->backoff++;

	/*
	 * in the case of retransmissions, there's good reason to limit
	 * rto to 120 sec, as that's the maximum legal RTT on the Internet.
	 * For probes it could reasonably be longer.  However making it
	 * much longer could cause unacceptable delays in some situation,
	 * so we might as well use the same value
	 */
	sk->rto = min(sk->rto << 1, 120*HZ);
	reset_timer (sk, TIME_PROBE0, sk->rto);
	sk->retransmits++;
	sk->prot->retransmits ++;
}

/*
 *	Socket option code for TCP.
 */

/*
 * tcp_setsockopt: set a SOL_TCP option; any other level is passed on to
 * ip_setsockopt() unchanged.
 *
 *	optval	user-space pointer to an int-sized value
 *	optlen	not examined at this level
 *
 * TCP_MAXSEG stores the value in sk->user_mss (1..MAX_WINDOW accepted);
 * TCP_NODELAY stores 0 or 1 in sk->nonagle.
 *
 * Returns 0 on success, -EINVAL for a NULL optval or an out-of-range
 * TCP_MAXSEG, -ENOPROTOOPT for an unknown option, or whatever non-zero
 * value verify_area() returned.
 */
int tcp_setsockopt(struct sock *sk, int level, int optname, char *optval, int optlen)
{
	int val,err;

	if(level!=SOL_TCP)
		return ip_setsockopt(sk,level,optname,optval,optlen);

	if (optval == NULL)
		return(-EINVAL);

	err=verify_area(VERIFY_READ, optval, sizeof(int));
	if(err)
		return err;

	val = get_fs_long((unsigned long *)optval);

	switch(optname)
	{
		case TCP_MAXSEG:
/*			if(val<200||val>2048 || val>sk->mtu) */
/*
 * values greater than interface MTU won't take effect.  however at
 * the point when this call is done we typically don't yet know
 * which interface is going to be used
 */
			if(val<1||val>MAX_WINDOW)
				return -EINVAL;
			sk->user_mss=val;
			return 0;
		case TCP_NODELAY:
			/* Any non-zero value turns the flag on. */
			sk->nonagle=(val==0)?0:1;
			return 0;
		default:
			return(-ENOPROTOOPT);
	}
}

/*
 * tcp_getsockopt: read back a SOL_TCP option; any other level is passed
 * on to ip_getsockopt() unchanged.
 *
 *	optval	user-space pointer that receives the int value
 *	optlen	user-space pointer that receives sizeof(int)
 *
 * TCP_MAXSEG reports sk->user_mss, TCP_NODELAY reports sk->nonagle.
 * *optlen is written before optval is verified, so a bad optval still
 * leaves *optlen updated.
 *
 * Returns 0 on success, -ENOPROTOOPT for an unknown option, or whatever
 * non-zero value verify_area() returned.
 */
int tcp_getsockopt(struct sock *sk, int level, int optname, char *optval, int *optlen)
{
	int val,err;

	if(level!=SOL_TCP)
		return ip_getsockopt(sk,level,optname,optval,optlen);

	switch(optname)
	{
		case TCP_MAXSEG:
			val=sk->user_mss;
			break;
		case TCP_NODELAY:
			val=sk->nonagle;	/* Until Johannes stuff is in */
			break;
		default:
			return(-ENOPROTOOPT);
	}
	err=verify_area(VERIFY_WRITE, optlen, sizeof(int));
	if(err)
		return err;
	put_fs_long(sizeof(int),(unsigned long *) optlen);

	err=verify_area(VERIFY_WRITE, optval, sizeof(int));
	if(err)
		return err;
	put_fs_long(val,(unsigned long *)optval);

	return(0);
}


/*
 * The TCP protocol operations table.  The initialiser is positional, so
 * the entries must stay in the member order of struct proto (declared
 * outside this file section).
 * NOTE(review): the meaning of the trailing 128, 0 and {NULL,} entries
 * cannot be told from here -- check against the struct proto declaration.
 */
struct proto tcp_prot = {
	sock_wmalloc,
	sock_rmalloc,
	sock_wfree,
	sock_rfree,
	sock_rspace,
	sock_wspace,
	tcp_close,
	tcp_read,
	tcp_write,
	tcp_sendto,
	tcp_recvfrom,
	ip_build_header,
	tcp_connect,
	tcp_accept,
	ip_queue_xmit,
	tcp_retransmit,
	tcp_write_wakeup,
	tcp_read_wakeup,
	tcp_rcv,
	tcp_select,
	tcp_ioctl,
	NULL,
	tcp_shutdown,
	tcp_setsockopt,
	tcp_getsockopt,
	128,
	0,
	{NULL,},
	"TCP"
};