sortix-mirror/kernel/net/tcp.cpp

2571 lines
71 KiB
C++

/*
* Copyright (c) 2016, 2017, 2018, 2022 Jonas 'Sortie' Termansen.
*
* Permission to use, copy, modify, and distribute this software for any
* purpose with or without fee is hereby granted, provided that the above
* copyright notice and this permission notice appear in all copies.
*
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
*
* net/tcp.cpp
* Transmission Control Protocol.
*/
// TODO: Implement sending back RST and such.
// TODO: PUSH.
// TODO: URG.
// TODO: Nagle's algorithm, MSG_MORE, TCP_CORK, TCP_NODELAY, etc.
// TODO: TCP options.
// TODO: Maximum Segment Size (respect TCP_MSS).
// TODO: Efficient receive queue when out of order.
// TODO: Efficient backlog / half-open. Avoid denial of service attacks.
// TODO: Measure average round trip time for efficient retransmission?
// TODO: High speed extensions.
// TODO: Anti-congestion extensions.
// TODO: Selective acknowledgements.
// TODO: Implement all RFC 1122 TCP requirements.
// TODO: Probing Zero Windows per RFC 1122 4.2.2.17.
// TODO: os-test all the things.
#include <sys/socket.h>
#include <sys/stat.h>
#include <assert.h>
#include <endian.h>
#include <errno.h>
#include <fcntl.h>
#include <limits.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <poll.h>
#include <signal.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <timespec.h>
#ifndef IOV_MAX
#include <sortix/limits.h>
#endif
#include <sortix/kernel/clock.h>
#include <sortix/kernel/copy.h>
#include <sortix/kernel/if.h>
#include <sortix/kernel/inode.h>
#include <sortix/kernel/ioctx.h>
#include <sortix/kernel/kernel.h>
#include <sortix/kernel/kthread.h>
#include <sortix/kernel/packet.h>
#include <sortix/kernel/poll.h>
#include <sortix/kernel/process.h>
#include <sortix/kernel/sockopt.h>
#include <sortix/kernel/thread.h>
#include <sortix/kernel/time.h>
#include <sortix/kernel/timer.h>
#include <sortix/kernel/worker.h>
#include "ip.h"
#include "tcp.h"
// Size in bytes of each per-socket ring buffer (the incoming and outgoing
// arrays in TCPSocket are both declared with this size).
#define BUFFER_SIZE 65536 // Documented in tcp(4).
// Once this many retransmissions have occurred without progress, TransmitLoop
// fails the connection with ETIMEDOUT.
#define NUM_RETRANSMISSIONS 6 // Documented in tcp(4)
namespace Sortix {
namespace TCP {
class TCPSocket;
// Storage for a socket address of either address family used in this file.
// NOTE(review): ImportAddress reads `family` after copying a sockaddr_in or
// sockaddr_in6 into the union, so the code presumably relies on the family
// field being the first member of both structures - confirm.
union tcp_sockaddr
{
// The address family of the stored address.
sa_family_t family;
// The address viewed as an IPv4 socket address.
struct sockaddr_in in;
// The address viewed as an IPv6 socket address.
struct sockaddr_in6 in6;
};
// The connection states of STD 7 (RFC 793). The enumerators are numbered
// consecutively from zero in the order the states are listed.
enum tcp_state
{
	TCP_STATE_CLOSED = 0,
	TCP_STATE_LISTEN = 1,
	TCP_STATE_SYN_SENT = 2,
	TCP_STATE_SYN_RECV = 3,
	TCP_STATE_ESTAB = 4,
	TCP_STATE_FIN_WAIT_1 = 5,
	TCP_STATE_FIN_WAIT_2 = 6,
	TCP_STATE_CLOSE_WAIT = 7,
	TCP_STATE_CLOSING = 8,
	TCP_STATE_LAST_ACK = 9,
	TCP_STATE_TIME_WAIT = 10,
};
// Progress of an outgoing control flag (SYN or FIN), which occupies one
// sequence number of its own.
enum tcp_special
{
	// The flag is not being sent.
	TCP_SPECIAL_NOT = 0,
	// The flag is wanted but has not yet been given room in the send window.
	TCP_SPECIAL_PENDING = 1,
	// The flag has been counted into the send window (send_nxt was advanced).
	TCP_SPECIAL_WINDOW = 2,
	// NOTE(review): presumably the remote has acknowledged the flag; the code
	// that sets this value is not visible here - confirm.
	TCP_SPECIAL_ACKED = 3,
};
// Global lock protecting all TCP sockets as they need to access each other.
static kthread_mutex_t tcp_lock = KTHREAD_MUTEX_INITIALIZER;
// Table of 65536 list heads, indexed by the local port in host byte order,
// of the IPv4 sockets bound to that port. Allocated by Init().
static TCPSocket** bindings_v4;
// The equivalent table for IPv6 sockets. Allocated by Init().
static TCPSocket** bindings_v6;
void Init()
{
if ( !(bindings_v4 = new TCPSocket*[65536]) ||
!(bindings_v6 = new TCPSocket*[65536]) )
Panic("Failed to allocate TCP Socket bindings");
for ( size_t i = 0; i < 65536; i++ )
{
bindings_v4[i] = NULL;
bindings_v6[i] = NULL;
}
}
// Whether sequence number a is at or before b, comparing modulo 2^32: the
// wrapped difference a - b is interpreted as a signed 32-bit distance.
static inline bool mod32_le(tcp_seq a, tcp_seq b)
{
	const int32_t distance = static_cast<int32_t>(a - b);
	return !(distance > 0);
}
// Whether sequence number a is strictly before b, comparing modulo 2^32: the
// wrapped difference a - b is interpreted as a signed 32-bit distance.
static inline bool mod32_lt(tcp_seq a, tcp_seq b)
{
	const int32_t distance = static_cast<int32_t>(a - b);
	return !(distance >= 0);
}
// Whether sequence number a is at or after b, comparing modulo 2^32: the
// wrapped difference a - b is interpreted as a signed 32-bit distance.
static inline bool mod32_ge(tcp_seq a, tcp_seq b)
{
	const int32_t distance = static_cast<int32_t>(a - b);
	return !(distance < 0);
}
// Whether sequence number a is strictly after b, comparing modulo 2^32: the
// wrapped difference a - b is interpreted as a signed 32-bit distance.
static inline bool mod32_gt(tcp_seq a, tcp_seq b)
{
	const int32_t distance = static_cast<int32_t>(a - b);
	return !(distance <= 0);
}
// Whether sockets can be made in the address family. Only IPv4 is accepted.
static bool IsSupportedAddressFamily(int af)
{
	switch ( af )
	{
	case AF_INET:
		return true;
	// TODO: case AF_INET6:
	default:
		return false;
	}
}
// The size of the socket address structure of the address family, or 0 if the
// address family is not IPv4 or IPv6.
static size_t AddressFamilySize(int af)
{
	if ( af == AF_INET )
		return sizeof(struct sockaddr_in);
	if ( af == AF_INET6 )
		return sizeof(struct sockaddr_in6);
	return 0;
}
// The TCP socket implementation. It is separate from the class TCPSocketNode
// as that class is reference counted, but this class manages its own lifetime
// so the socket is properly shut down after all references are closed.
//
// Bound sockets are in a doubly linked list starting from the appropriate
// bindings array indexed by the port, and then the sockets on that port are
// doubly linked using prev_socket and next_socket.
//
// Half-open sockets are in a doubly linked list starting from connecting_half
// in the listening socket, and then doubly linked with connecting_prev and
// connecting_next (with connecting_parent going back to the listening socket).
//
// Ready sockets that have not yet been accepted are in a doubly linked list
// starting from connecting_ready in the listening socket, and then doubly
// linked with connecting_prev and connecting_next (with connecting_parent going
// back to the listening socket).
//
// A socket wants to be deleted when it's in the CLOSED state and is not
// referenced by its TCPSocketNode anymore. Deletion is possible when the timer
// is not pending.
class TCPSocket
{
// HandleIP is granted access to the private members of the socket.
friend void HandleIP(Ref<Packet> pkt,
const struct in_addr* src,
const struct in_addr* dst,
bool dst_broadcast);
public:
// Creates a closed and unbound socket in the address family.
TCPSocket(int af);
~TCPSocket();
// The socket operations, named after the system calls they implement.
Ref<Inode> accept4(ioctx_t* ctx, uint8_t* addr, size_t* addrsize,
int flags);
int bind(ioctx_t* ctx, const uint8_t* addr, size_t addrsize);
int connect(ioctx_t* ctx, const uint8_t* addr, size_t addrsize);
int listen(ioctx_t* ctx, int backlog);
ssize_t recv(ioctx_t* ctx, uint8_t* buf, size_t count, int flags);
ssize_t send(ioctx_t* ctx, const uint8_t* buf, size_t count, int flags);
ssize_t sendmsg(ioctx_t* ctx, const struct msghdr* msg_ptr, int flags);
ssize_t read(ioctx_t* ctx, uint8_t* buf, size_t count);
ssize_t recvmsg(ioctx_t* ctx, struct msghdr* msg, int flags);
ssize_t write(ioctx_t* ctx, const uint8_t* buf, size_t count);
int poll(ioctx_t* ctx, PollNode* node);
int getsockopt(ioctx_t* ctx, int level, int option_name, void* option_value,
size_t* option_size_ptr);
int setsockopt(ioctx_t* ctx, int level, int option_name,
const void* option_value, size_t option_size);
int shutdown(ioctx_t* ctx, int how);
int getpeername(ioctx_t* ctx, uint8_t* addr, size_t* addrsize);
int getsockname(ioctx_t* ctx, uint8_t* addr, size_t* addrsize);
public:
// Drops the reference to the socket, disconnects it, and deletes it if that
// is already possible.
void Unreference();
void ProcessPacket(Ref<Packet> pkt, union tcp_sockaddr* pkt_src,
union tcp_sockaddr* pkt_dst);
void ReceivePacket(Ref<Packet> pkt, union tcp_sockaddr* pkt_src,
union tcp_sockaddr* pkt_dst);
// Runs when the timer fires (called through TCPSocket__OnTimer).
void OnTimer();
// Whether the socket is closed and no longer referenced.
inline bool want_destruction()
{
return state == TCP_STATE_CLOSED && !is_referenced;
}
// Whether the socket wants destruction and the timer is not pending, so the
// socket can be deleted right now.
inline bool can_destroy()
{
return want_destruction() && !timer_armed;
}
private:
short PollEventStatus();
bool ImportAddress(ioctx_t* ctx, union tcp_sockaddr* dest, const void* addr,
size_t addrsize);
bool CanBind(union tcp_sockaddr new_local);
bool BindDefault(const union tcp_sockaddr* new_local_ptr);
void UpdateWindow(uint16_t new_window);
void TransmitLoop();
bool Transmit();
void ScheduleTransmit();
void SetDeadline();
void SetTimer();
void Close();
void Destroy();
void Disconnect();
void Fail(int error);
ssize_t recv_unlocked(ioctx_t* ctx, uint8_t* buf, size_t count, int flags);
ssize_t send_unlocked(ioctx_t* ctx, const uint8_t* buf, size_t count,
int flags);
int shutdown_unlocked(int how);
public:
// The previous socket bound on the same port in the address family.
TCPSocket* prev_socket;
// The next socket bound on the same port in the address family.
TCPSocket* next_socket;
// The first half-connected socket in our listening queue.
TCPSocket* connecting_half;
// The first ready socket in our listening queue.
TCPSocket* connecting_ready;
// The previous half-connected or ready socket in our listening queue.
TCPSocket* connecting_prev;
// The next half-connected or ready socket in our listening queue.
TCPSocket* connecting_next;
// The listening socket this socket is in the listening queue for.
TCPSocket* connecting_parent;
// Condition variable that is signaled when new data can be received.
kthread_cond_t receive_cond;
// Condition variable that is signaled when new data can be transmitted.
kthread_cond_t transmit_cond;
// The local socket name, or the any address port 0 if not set.
union tcp_sockaddr local;
// The remote socket name, or the any address port 0 if not set.
union tcp_sockaddr remote;
// The network interface the socket is bound to, or 0 if none.
unsigned int ifindex;
// Whether the socket has been bound to a port.
bool bound;
// Whether the remote socket name has been set (it is set together with
// `remote` when a listening socket creates a socket for a connection).
bool remoted;
// Whether SO_REUSEADDR is set.
bool reuseaddr;
// Whether the socket is referenced from anywhere and must not deallocate.
bool is_referenced;
private:
// The timer used for retransmissions and timing out the connection.
Timer timer;
// The poll channel to publish poll bit changes on.
PollChannel poll_channel;
// The queue of incoming packets whose sequence numbers are too high to
// process right now, sorted by increasing sequence number.
Ref<Packet> receive_queue; // TODO: Not a good way to keep track of this.
// The deadline for the remote to acknowledge before retransmitting. A
// negative tv_sec means no deadline is set.
struct timespec deadline;
// The offset at which data begins in the incoming ring buffer.
size_t incoming_offset;
// The amount of bytes in the incoming ring buffer.
size_t incoming_used;
// The offset at which data begins in the outgoing ring buffer.
size_t outgoing_offset;
// The amount of bytes in the outgoing ring buffer.
size_t outgoing_used;
// Send unacknowledged (STD 7, RFC 793).
tcp_seq send_una;
// Send next (STD 7, RFC 793).
tcp_seq send_nxt;
// Send window (STD 7, RFC 793).
tcp_seq send_wnd;
// Send urgent pointer (STD 7, RFC 793).
tcp_seq send_up;
// Segment sequence number used for last window update (STD 7, RFC 793).
tcp_seq send_wl1;
// Segment acknowledgment number used for last window update (STD 7, RFC
// 793).
tcp_seq send_wl2;
// The sequence number of the next segment to transmit. It is rewound to
// send_una when a retransmission is due.
tcp_seq send_pos;
// Initial send sequence number (STD 7, RFC 793).
tcp_seq iss;
// Receive next (STD 7, RFC 793).
tcp_seq recv_nxt;
// Receive window (STD 7, RFC 793).
tcp_seq recv_wnd;
// Receive urgent pointer (STD 7, RFC 793).
tcp_seq recv_up;
// Last sequence acked (STD 7, RFC 793).
tcp_seq recv_acked;
// Last window size advertised (STD 7, RFC 793).
tcp_seq recv_wndlast;
// Initial receive sequence number (STD 7, RFC 793).
tcp_seq irs;
// The address family to which this socket belongs.
int af;
// Set to an errno value if a socket error has occurred, or 0 otherwise.
int sockerr;
// The number of sockets in the listening queue.
int backlog_used;
// The maximum number of sockets in the listening queue.
int backlog_max;
// The number of retransmissions that have occurred since the last
// acknowledgement from the remote socket.
unsigned int retransmissions;
// The current TCP state.
enum tcp_state state;
// The state of the outgoing SYN.
enum tcp_special outgoing_syn;
// The state of the outgoing FIN.
enum tcp_special outgoing_fin;
// Whether SYN has been received from the remote socket.
bool has_syn;
// Whether FIN has been received from the remote socket.
bool has_fin;
// Whether a transmission has been scheduled.
bool transmit_scheduled;
// Whether the timer is pending.
bool timer_armed;
// Whether the socket has been shut down for receive.
bool shutdown_receive;
// The incoming ring buffer.
unsigned char incoming[BUFFER_SIZE];
// The outgoing ring buffer.
unsigned char outgoing[BUFFER_SIZE];
};
// The TCP socket Inode with a reference counted lifetime. The backend class
// TCPSocket is separate as it may stay alive for a little while after all
// references to it have been lost.
//
// NOTE(review): the method implementations are outside this part of the file;
// each one presumably forwards to the TCPSocket method of the same name -
// confirm.
class TCPSocketNode : public AbstractInode
{
public:
// Wraps the given backend socket.
TCPSocketNode(TCPSocket* socket);
virtual ~TCPSocketNode();
virtual Ref<Inode> accept4(ioctx_t* ctx, uint8_t* addr, size_t* addrsize,
int flags);
virtual int bind(ioctx_t* ctx, const uint8_t* addr, size_t addrsize);
virtual int connect(ioctx_t* ctx, const uint8_t* addr, size_t addrsize);
virtual int listen(ioctx_t* ctx, int backlog);
virtual ssize_t recv(ioctx_t* ctx, uint8_t* buf, size_t count, int flags);
virtual ssize_t recvmsg(ioctx_t* ctx, struct msghdr* msg, int flags);
virtual ssize_t send(ioctx_t* ctx, const uint8_t* buf, size_t count,
int flags);
virtual ssize_t sendmsg(ioctx_t* ctx, const struct msghdr* msg_ptr,
int flags);
virtual ssize_t read(ioctx_t* ctx, uint8_t* buf, size_t count);
virtual ssize_t write(ioctx_t* ctx, const uint8_t* buf, size_t count);
virtual int poll(ioctx_t* ctx, PollNode* node);
virtual int getsockopt(ioctx_t* ctx, int level, int option_name,
void* option_value, size_t* option_size_ptr);
virtual int setsockopt(ioctx_t* ctx, int level, int option_name,
const void* option_value, size_t option_size);
virtual int shutdown(ioctx_t* ctx, int how);
virtual int getpeername(ioctx_t* ctx, uint8_t* addr, size_t* addrsize);
virtual int getsockname(ioctx_t* ctx, uint8_t* addr, size_t* addrsize);
private:
// The backend socket implementing the operations.
TCPSocket* socket;
};
// Timer callback trampoline. The user pointer is the TCPSocket that armed the
// timer, whose OnTimer method is invoked. The clock and timer are unused.
void TCPSocket__OnTimer(Clock* /*clock*/, Timer* /*timer*/, void* user)
{
	TCPSocket* socket = (TCPSocket*) user;
	socket->OnTimer();
}
// Constructs a socket in the address family that is closed, unbound,
// unreferenced, has empty ring buffers, no deadline, and whose timer is
// attached to the monotonic clock.
TCPSocket::TCPSocket(int af)
{
	// Links in the bindings list and the listening queues.
	prev_socket = next_socket = NULL;
	connecting_half = connecting_ready = NULL;
	connecting_prev = connecting_next = connecting_parent = NULL;

	// Synchronization.
	receive_cond = KTHREAD_COND_INITIALIZER;
	transmit_cond = KTHREAD_COND_INITIALIZER;

	// Addressing.
	memset(&local, 0, sizeof(local));
	memset(&remote, 0, sizeof(remote));
	ifindex = 0;
	bound = remoted = reuseaddr = false;

	// The timer, poll_channel, and receive_queue members are initialized by
	// their own constructors.
	timer.Attach(Time::GetClock(CLOCK_MONOTONIC));
	deadline = timespec_make(-1, 0);

	// Ring buffer positions.
	incoming_offset = incoming_used = 0;
	outgoing_offset = outgoing_used = 0;

	// Sequence number bookkeeping.
	send_una = send_nxt = send_wnd = send_up = 0;
	send_wl1 = send_wl2 = send_pos = iss = 0;
	recv_nxt = recv_wnd = recv_up = 0;
	recv_acked = recv_wndlast = irs = 0;

	// Connection state.
	this->af = af;
	sockerr = 0;
	backlog_used = backlog_max = 0;
	retransmissions = 0;
	state = TCP_STATE_CLOSED;
	outgoing_syn = outgoing_fin = TCP_SPECIAL_NOT;
	has_syn = has_fin = false;
	transmit_scheduled = is_referenced = false;
	timer_armed = shutdown_receive = false;

	// Ring buffer contents.
	memset(incoming, 0, sizeof(incoming));
	memset(outgoing, 0, sizeof(outgoing));
}
// Destroys the socket. The socket must already be closed, unbound,
// unreferenced, and unlinked from every bindings list and listening queue,
// which the assertions verify.
TCPSocket::~TCPSocket()
{
	assert(state == TCP_STATE_CLOSED);
	assert(!bound);
	assert(!prev_socket);
	assert(!next_socket);
	assert(!connecting_half);
	assert(!connecting_ready);
	assert(!connecting_prev);
	assert(!connecting_next);
	assert(!connecting_parent);
	assert(!is_referenced);
	// Avoid stack overflow in receive_queue recursive destructor by unlinking
	// the queued packets one at a time.
	while ( receive_queue )
	{
		Ref<Packet> packet = receive_queue;
		receive_queue = packet->next;
		packet->next.Reset();
	}
}
// Called when the socket is no longer referenced. Begins disconnecting the
// socket and deletes it at once if that is already possible.
void TCPSocket::Unreference()
{
	// The lock is released by hand rather than by a scoped guard because the
	// socket may be deleted before this method returns.
	kthread_mutex_lock(&tcp_lock);
	is_referenced = false;
	Disconnect();
	const bool destroy_now = can_destroy();
	kthread_mutex_unlock(&tcp_lock);
	if ( !destroy_now )
		return;
	delete this;
}
// Enters the CLOSED state: cancels the timer if possible, unlinks the socket
// from the bindings and listening queues, wakes all waiters, and clears the
// deadline.
void TCPSocket::Close() // tcp_lock taken
{
	if ( timer_armed )
	{
		if ( timer.TryCancel() )
			timer_armed = false;
	}
	// Destroy inspects the current state to find the right listening queue,
	// so it must run before the state is changed.
	Destroy();
	state = TCP_STATE_CLOSED;
	deadline = timespec_make(-1, 0);
	kthread_cond_broadcast(&transmit_cond);
	kthread_cond_broadcast(&receive_cond);
	SetTimer();
}
// Begins taking down the connection. A listening socket is closed at once,
// a socket that can still send is shut down for writing, and an unreferenced
// socket in FIN_WAIT_2 gets a fresh deadline so it eventually times out.
void TCPSocket::Disconnect() // tcp_lock taken
{
	if ( state == TCP_STATE_CLOSED )
		return;
	if ( state == TCP_STATE_LISTEN )
	{
		Close();
		return;
	}
	// TODO: Send back RST if the peer sends when we're not receiving.
	switch ( state )
	{
	case TCP_STATE_SYN_SENT:
	case TCP_STATE_SYN_RECV:
	case TCP_STATE_ESTAB:
	case TCP_STATE_CLOSE_WAIT:
		shutdown_unlocked(SHUT_WR);
		// CLOSED, LAST_ACK, FIN_WAIT_1.
		break;
	default:
		break;
	}
	// FIN_WAIT_1 will enter FIN_WAIT_2 or time out.
	// FIN_WAIT_2 will time out when unreferenced.
	// CLOSING will resend FIN or time out.
	// LAST_ACK will resend FIN or time out.
	// TIME_WAIT will time out and close.
	if ( !is_referenced && state == TCP_STATE_FIN_WAIT_2 )
	{
		// Discard any current deadline so SetDeadline computes a new one.
		deadline = timespec_make(-1, 0);
		SetDeadline();
		SetTimer();
	}
}
void TCPSocket::Fail(int error)
{
sockerr = error;
Close();
}
void TCPSocket::Destroy() // tcp_lock taken
{
if ( bound )
{
if ( af == AF_INET )
{
uint16_t port = be16toh(local.in.sin_port);
if ( prev_socket )
prev_socket->next_socket = next_socket;
else
bindings_v4[port] = next_socket;
if ( next_socket )
next_socket->prev_socket = prev_socket;
}
else if ( af == AF_INET6 )
{
uint16_t port = be16toh(local.in6.sin6_port);
if ( prev_socket )
prev_socket->next_socket = next_socket;
else
bindings_v6[port] = next_socket;
if ( next_socket )
next_socket->prev_socket = prev_socket;
}
prev_socket = NULL;
next_socket = NULL;
bound = false;
}
while ( connecting_half || connecting_ready )
{
TCPSocket* socket;
if ( connecting_half )
{
socket = connecting_half;
connecting_half = socket->connecting_next;
if ( connecting_half )
connecting_half->connecting_prev = NULL;
}
else
{
socket = connecting_ready;
connecting_ready = socket->connecting_next;
if ( connecting_ready )
connecting_ready->connecting_prev = NULL;
}
socket->connecting_prev = NULL;
socket->connecting_next = NULL;
socket->connecting_parent = NULL;
backlog_used--;
socket->Disconnect();
}
if ( connecting_parent )
{
if ( connecting_prev )
connecting_prev->connecting_next = connecting_next;
else if ( state == TCP_STATE_SYN_RECV )
connecting_parent->connecting_half = connecting_next;
else
connecting_parent->connecting_ready = connecting_next;
if ( connecting_next )
connecting_next->connecting_prev = connecting_prev;
connecting_prev = NULL;
connecting_next = NULL;
connecting_parent->backlog_used--;
connecting_parent = NULL;
}
}
// Accepts the first ready connection in the listening queue, waiting for one
// unless the context is non-blocking. If addr is given, as much of the remote
// address as fits in *addrsize_ptr is copied out and the size is updated.
// Returns the inode of the accepted socket, or a null reference with errno set.
Ref<Inode> TCPSocket::accept4(ioctx_t* ctx, uint8_t* addr, size_t* addrsize_ptr,
                              int flags)
{
	// No flags are supported, and an address requires a size.
	if ( flags != 0 || (addr && !addrsize_ptr) )
		return errno = EINVAL, Ref<Inode>(NULL);
	ScopedLock lock(&tcp_lock);
	if ( state != TCP_STATE_LISTEN )
		return errno = EINVAL, Ref<Inode>(NULL);
	for ( ; !connecting_ready; )
	{
		if ( ctx->dflags & O_NONBLOCK )
			return errno = EWOULDBLOCK, Ref<Inode>(NULL);
		if ( !kthread_cond_wait_signal(&receive_cond, &tcp_lock) )
			return errno = EINTR, Ref<Inode>(NULL);
	}
	TCPSocket* accepted = connecting_ready;
	if ( addr )
	{
		size_t length;
		if ( !ctx->copy_from_src(&length, addrsize_ptr, sizeof(length)) )
			return Ref<Inode>(NULL);
		// Truncate to the size of the address if the buffer is larger.
		const size_t full_length = AddressFamilySize(af);
		if ( full_length < length )
			length = full_length;
		if ( !ctx->copy_to_dest(addr, &accepted->remote, length) ||
		     !ctx->copy_to_dest(addrsize_ptr, &length, sizeof(length)) )
			return Ref<Inode>(NULL);
	}
	Ref<TCPSocketNode> node(new TCPSocketNode(accepted));
	if ( !node )
		return Ref<Inode>(NULL);
	// Only now that nothing can fail is the socket removed from the queue.
	connecting_ready = accepted->connecting_next;
	if ( connecting_ready )
		connecting_ready->connecting_prev = NULL;
	accepted->connecting_prev = NULL;
	accepted->connecting_next = NULL;
	accepted->connecting_parent = NULL;
	backlog_used--;
	return node;
}
// Copies a socket address from the caller into dest after validating it.
// The address must be exactly the size of the socket address structure of
// this socket's address family (else EINVAL) and must belong to that address
// family (else EAFNOSUPPORT). Returns true on success, or false with errno set
// on failure, in which case dest is left untouched.
bool TCPSocket::ImportAddress(ioctx_t* ctx,
                              union tcp_sockaddr* dest,
                              const void* addr,
                              size_t addrsize)
{
	// TODO: os-test whether AF_UNSPEC can disconnect.
	// This function returns bool, so failure must be reported as false: a -1
	// would convert to true and report success without filling in dest.
	if ( addrsize != AddressFamilySize(af) )
		return errno = EINVAL, false;
	// Copy into a zeroed temporary so dest is only written on success.
	union tcp_sockaddr copy;
	memset(&copy, 0, sizeof(copy));
	if ( !ctx->copy_from_src(&copy, addr, addrsize) )
		return false;
	if ( copy.family != af )
		return errno = EAFNOSUPPORT, false;
	memcpy(dest, &copy, sizeof(copy));
	return true;
}
// Determines whether this socket may be bound to the given local address and
// port. Returns true if so, or false with errno set: EADDRNOTAVAIL if no
// network interface has the address, EADDRINUSE if it conflicts with another
// bound socket, and EAFNOSUPPORT for IPv6 (not yet supported) and any other
// address family.
bool TCPSocket::CanBind(union tcp_sockaddr new_local) // tcp_lock taken
{
	if ( af == AF_INET )
	{
		// TODO: os-test binding to broadcast addresses.
		// Bind to either the any address or the address of a network interface.
		if ( new_local.in.sin_addr.s_addr != htobe32(INADDR_ANY) )
		{
			// TODO: What happens to sockets if the network interface changes
			//       its address?
			ScopedLock ifs_lock(&netifs_lock);
			bool found = false;
			// The search starts at index 1, so index 0 is never considered.
			for ( unsigned int i = 1; i < netifs_count; i++ )
			{
				NetworkInterface* netif = netifs[i];
				if ( !netif )
					continue;
				ScopedLock cfg_lock(&netif->cfg_lock);
				if ( memcmp(&netif->cfg.inet.address, &new_local.in.sin_addr,
				            sizeof(struct in_addr)) == 0 )
				{
					found = true;
					break;
				}
			}
			// No interface had the correct address.
			if ( !found )
				return errno = EADDRNOTAVAIL, false;
		}
		uint16_t port = be16toh(new_local.in.sin_port);
		for ( TCPSocket* socket = bindings_v4[port];
		      socket;
		      socket = socket->next_socket )
		{
			// TODO: os-test how SO_REUSEADDR works for TCP.
			// The any address conflicts with every socket on the port unless
			// both sockets have SO_REUSEADDR set.
			if ( new_local.in.sin_addr.s_addr == htobe32(INADDR_ANY) &&
			     !(reuseaddr && socket->reuseaddr) )
				return errno = EADDRINUSE, false;
			if ( socket->local.in.sin_addr.s_addr == htobe32(INADDR_ANY) &&
			     !(reuseaddr && socket->reuseaddr) )
				return errno = EADDRINUSE, false;
			// The exact same address always conflicts.
			if ( new_local.in.sin_addr.s_addr ==
			     socket->local.in.sin_addr.s_addr )
				return errno = EADDRINUSE, false;
		}
	}
	else if ( af == AF_INET6 )
	{
		// TODO: IPv6 support for seeing if any interface has the address.
		if ( true )
			return errno = EAFNOSUPPORT, false;
		// The remainder of this branch is unreachable until IPv6 is enabled
		// above. It applies the same three conflict rules as the IPv4 branch.
		uint16_t port = be16toh(new_local.in6.sin6_port);
		// NOTE(review): this refuses any port that already has a bound IPv6
		// socket, which makes the loop below redundant - confirm whether this
		// check should go away when IPv6 is enabled.
		if ( bindings_v6[port] )
			return errno = EADDRINUSE, false;
		for ( TCPSocket* socket = bindings_v6[port];
		      socket;
		      socket = socket->next_socket )
		{
			if ( !memcmp(&new_local.in6.sin6_addr, &in6addr_any,
			             sizeof(in6addr_any)) &&
			     !(reuseaddr && socket->reuseaddr) )
				return errno = EADDRINUSE, false;
			if ( !memcmp(&socket->local.in6.sin6_addr, &in6addr_any,
			             sizeof(in6addr_any)) &&
			     !(reuseaddr && socket->reuseaddr) )
				return errno = EADDRINUSE, false;
			if ( !memcmp(&new_local.in6.sin6_addr, &socket->local.in6.sin6_addr,
			             sizeof(new_local.in6.sin6_addr)) )
				return errno = EADDRINUSE, false;
		}
	}
	else
		return errno = EAFNOSUPPORT, false;
	return true;
}
// Binds the socket to the local address supplied by the caller. Port 0 asks
// for an automatically chosen port. Returns 0 on success, or -1 with errno set:
// EINVAL if the socket is already bound, EAFNOSUPPORT for an unsupported
// address family, or whatever ImportAddress, CanBind, or BindDefault reported.
int TCPSocket::bind(ioctx_t* ctx, const uint8_t* addr, size_t addrsize)
{
	ScopedLock lock(&tcp_lock);
	if ( bound )
		return errno = EINVAL, -1;
	union tcp_sockaddr requested;
	if ( !ImportAddress(ctx, &requested, addr, addrsize) )
		return -1;
	const bool is_inet = af == AF_INET;
	const bool is_inet6 = af == AF_INET6;
	if ( !is_inet && !is_inet6 )
		return errno = EAFNOSUPPORT, -1;
	const uint16_t port = is_inet ? be16toh(requested.in.sin_port)
	                              : be16toh(requested.in6.sin6_port);
	// TODO: Binding to the any address needs to pick the appropriate source
	//       interface and bind to its address. (Or really? udp doesn't?
	//       os-test?)
	// TODO: os-test a server listening on any, and then getsockname a
	//       connection received on that port.
	if ( port == 0 )
		return BindDefault(&requested) ? 0 : -1;
	if ( !CanBind(requested) )
		return -1;
	// Link the socket in as the new head of the list for the port.
	TCPSocket** head = is_inet ? &bindings_v4[port] : &bindings_v6[port];
	if ( *head )
	{
		// An IPv6 port that already has a socket is refused outright.
		if ( is_inet6 )
			return errno = EADDRINUSE, -1;
		(*head)->prev_socket = this;
	}
	next_socket = *head;
	prev_socket = NULL;
	*head = this;
	memcpy(&local, &requested, sizeof(requested));
	bound = true;
	return 0;
}
// tcp_lock locked
bool TCPSocket::BindDefault(const union tcp_sockaddr* new_local_ptr)
{
// TODO: This allocator becomes increasingly biased as more ports are
// allocated.
// TODO: Try not to allocate recently used ports.
union tcp_sockaddr new_local;
if ( new_local_ptr )
memcpy(&new_local, new_local_ptr, sizeof(union tcp_sockaddr));
else
{
memset(&new_local, 0, sizeof(new_local));
if ( af == AF_INET )
{
new_local.in.sin_family = AF_INET;
new_local.in.sin_addr.s_addr = htobe32(INADDR_ANY);
}
else if ( af == AF_INET6 )
{
new_local.in6.sin6_family = AF_INET6;
new_local.in6.sin6_addr = in6addr_any;
}
else
return errno = EAFNOSUPPORT, false;
}
uint16_t start = 32768; // Documented in tcp(4).
uint16_t end = 61000; // Documented in tcp(4).
uint16_t count = end - start;
uint16_t offset = arc4random_uniform(count);
for ( uint16_t i = 0; i < count; i++ )
{
uint16_t j = offset + i;
if ( count <= j )
j -= count;
uint16_t port = start + j;
if ( af == AF_INET )
new_local.in.sin_port = htobe16(port);
else if ( af == AF_INET6 )
new_local.in6.sin6_port = htobe16(port);
else
return errno = EAFNOSUPPORT, false;
if ( !CanBind(new_local) )
{
if ( errno == EADDRINUSE )
continue;
return false;
}
if ( af == AF_INET )
{
if ( bindings_v4[port] )
bindings_v4[port]->prev_socket = this;
next_socket = bindings_v4[port];
prev_socket = NULL;
bindings_v4[port] = this;
}
else if ( af == AF_INET6 )
{
if ( bindings_v6[port] )
bindings_v6[port]->prev_socket = this;
next_socket = bindings_v6[port];
prev_socket = NULL;
bindings_v6[port] = this;
}
else
return errno = EAFNOSUPPORT, false;
memcpy(&local, &new_local, sizeof(new_local));
bound = true;
return true;
}
return errno = EAGAIN, false;
}
// Attempts a transmission unless the socket is closed. The connection fails
// with ETIMEDOUT once the retransmission limit has been reached, and fails
// with the transmission error if transmitting fails on the last attempt.
void TCPSocket::TransmitLoop() // tcp_lock taken
{
	if ( state == TCP_STATE_CLOSED )
		return;
	if ( NUM_RETRANSMISSIONS <= retransmissions )
	{
		Fail(ETIMEDOUT);
		return;
	}
	if ( Transmit() )
		return;
	// A failed transmission is tolerated until the final retransmission.
	if ( NUM_RETRANSMISSIONS - 1 <= retransmissions )
		Fail(errno);
}
// Transmits whatever can currently be sent. First any pending SYN, buffered
// data, and pending FIN are moved into the send window by advancing send_nxt
// as far as the remote's window allows. Then segments are sent from send_pos
// until send_pos reaches send_nxt, the latest received sequence has been
// acknowledged, and the current receive window has been advertised. If any
// segment was sent, the deadline and timer are set up. Returns true on
// success, or false on failure (errno is set explicitly on some paths and is
// presumably set by the failing IP/packet helper on the others).
bool TCPSocket::Transmit() // tcp_lock taken
{
if ( state == TCP_STATE_CLOSED )
return (errno = sockerr ? sockerr : ENOTCONN), false;
// Move new outgoing data into the transmission window if there is room.
tcp_seq window_available = (tcp_seq) (send_una + send_wnd - send_nxt);
// A pending SYN takes up one sequence number in the window.
if ( window_available && outgoing_syn == TCP_SPECIAL_PENDING )
{
send_nxt++;
outgoing_syn = TCP_SPECIAL_WINDOW;
window_available--;
}
if ( window_available )
{
// Count the data bytes already in the window, which excludes the sequence
// numbers taken up by SYN and FIN.
tcp_seq window_data = (tcp_seq)(send_nxt - send_una);
if ( outgoing_syn == TCP_SPECIAL_WINDOW )
window_data--;
if ( outgoing_fin == TCP_SPECIAL_WINDOW )
window_data--;
assert(window_data <= outgoing_used);
// The buffered bytes not yet in the window, limited by the available room.
size_t outgoing_new = outgoing_used - window_data;
tcp_seq amount = window_available;
if ( outgoing_new < amount )
amount = outgoing_new;
send_nxt += amount;
window_available -= amount;
}
// A pending FIN takes up one sequence number after the data.
if ( window_available && outgoing_fin == TCP_SPECIAL_PENDING )
{
send_nxt++;
outgoing_fin = TCP_SPECIAL_WINDOW;
window_available--;
}
// Transmit packets.
bool any = false;
// Keep sending while there is unsent window content, an acknowledgement is
// owed to the remote, or the receive window changed since last advertised.
while ( mod32_lt(send_pos, send_nxt) ||
(has_syn && mod32_lt(recv_acked, recv_nxt)) ||
recv_wnd != recv_wndlast )
{
any = true;
size_t mtu;
union tcp_sockaddr sendfrom;
if ( af == AF_INET )
{
if ( !IP::GetSourceIP(&local.in.sin_addr, &remote.in.sin_addr,
&sendfrom.in.sin_addr, ifindex, &mtu) )
return false;
}
// TODO: IPv6 support.
else
return errno = EAFNOSUPPORT, false;
// From here on mtu is the room for data after the TCP header.
if ( mtu < sizeof(struct tcphdr) )
return errno = EINVAL, false;
mtu -= sizeof(struct tcphdr);
Ref<Packet> pkt = GetPacket();
if ( !pkt )
return false;
pkt->length = sizeof(struct tcphdr);
unsigned char* out = pkt->from;
struct tcphdr hdr;
if ( af == AF_INET )
{
hdr.th_sport = local.in.sin_port;
hdr.th_dport = remote.in.sin_port;
}
else if ( af == AF_INET6 )
{
hdr.th_sport = local.in6.sin6_port;
hdr.th_dport = remote.in6.sin6_port;
}
else
return errno = EAFNOSUPPORT, false;
hdr.th_seq = htobe32(send_pos);
hdr.th_offset = TCP_OFFSET_ENCODE(sizeof(struct tcphdr) / 4);
hdr.th_flags = 0;
// send_nxtpos tracks where send_pos will be after this segment.
tcp_seq send_nxtpos = send_pos;
assert(mod32_le(send_nxtpos, send_nxt));
// The SYN is only part of the segment that starts the window.
if ( outgoing_syn == TCP_SPECIAL_WINDOW && send_nxtpos == send_una )
{
hdr.th_flags |= TH_SYN;
send_nxtpos++;
}
assert(mod32_le(send_nxtpos, send_nxt));
// Acknowledge what has been received once the remote's SYN has arrived.
if ( has_syn )
{
// TODO: RFC 1122 4.2.2.6:
// "TCP SHOULD send an MSS (Maximum Segment Size) option in
// every SYN segment when its receive MSS differs from the
// default 536, and MAY send it always."
// "If an MSS option is not received at connection setup, TCP
// MUST assume a default send MSS of 536 (576-40)."
hdr.th_flags |= TH_ACK;
hdr.th_ack = htobe32(recv_nxt);
}
else
hdr.th_ack = htobe32(0);
hdr.th_win = htobe16(recv_wnd);
hdr.th_urp = htobe16(0);
// The checksum field is zero while the checksum is being computed.
hdr.th_sum = htobe16(0);
// Count the data bytes between send_pos and send_nxt, which excludes the
// sequence numbers taken up by SYN and FIN.
tcp_seq window_data = (tcp_seq)(send_nxt - send_pos);
if ( send_pos == send_una && outgoing_syn == TCP_SPECIAL_WINDOW )
window_data--;
if ( mod32_lt(send_pos, send_nxt) &&
outgoing_fin == TCP_SPECIAL_WINDOW )
window_data--;
if ( window_data )
{
// Send as much of the data as fits in a single segment.
size_t amount = mtu < window_data ? mtu : window_data;
assert(outgoing_offset <= sizeof(outgoing));
// window_length is how far into the buffered data this segment begins.
tcp_seq window_length = (tcp_seq) (send_nxtpos - send_una);
if ( outgoing_syn == TCP_SPECIAL_WINDOW )
window_length--;
assert(window_length <= sizeof(outgoing));
// Find where the segment's data begins in the outgoing ring buffer,
// wrapping around at the end of the buffer.
size_t outgoing_end = outgoing_offset + window_length;
if ( sizeof(outgoing) <= outgoing_end )
outgoing_end -= sizeof(outgoing);
assert(outgoing_end < sizeof(outgoing));
// Copy in up to two parts: the bytes up to the end of the ring buffer,
// and then the rest from the start of the ring buffer.
size_t until_end = sizeof(outgoing) - outgoing_end;
size_t first = until_end < amount ? until_end : amount;
assert(first <= sizeof(outgoing));
assert(first <= sizeof(outgoing) - outgoing_end);
size_t second = amount - first;
assert(second <= sizeof(outgoing));
memcpy(out + sizeof(hdr), outgoing + outgoing_end, first);
if ( second )
memcpy(out + sizeof(hdr) + first, outgoing, second);
pkt->length += amount;
send_nxtpos += amount;
}
assert(mod32_le(send_nxtpos, send_nxt));
// The FIN is only part of the segment that ends the window.
if ( outgoing_fin == TCP_SPECIAL_WINDOW &&
send_nxtpos + 1 == send_nxt )
{
hdr.th_flags |= TH_FIN;
send_nxtpos++;
}
assert(mod32_le(send_nxtpos, send_nxt));
memcpy(out, &hdr, sizeof(hdr));
// The checksum covers the source and destination addresses, the protocol,
// the segment length, and the segment itself.
uint16_t checksum = 0;
if ( af == AF_INET )
{
checksum = IP::ipsum_buf(checksum, &sendfrom.in.sin_addr,
sizeof(struct in_addr));
checksum = IP::ipsum_buf(checksum, &remote.in.sin_addr,
sizeof(struct in_addr));
}
else if ( af == AF_INET6 )
{
checksum = IP::ipsum_buf(checksum, &sendfrom.in6.sin6_addr,
sizeof(struct in6_addr));
checksum = IP::ipsum_buf(checksum, &remote.in6.sin6_addr,
sizeof(struct in6_addr));
}
else
return errno = EAFNOSUPPORT, false;
checksum = IP::ipsum_word(checksum, IPPROTO_TCP);
checksum = IP::ipsum_word(checksum, pkt->length);
checksum = IP::ipsum_buf(checksum, out, pkt->length);
hdr.th_sum = htobe16(IP::ipsum_finish(checksum));
// Store the header again now that it contains the checksum.
memcpy(out, &hdr, sizeof(hdr));
if ( af == AF_INET )
{
if ( !IP::Send(pkt, &sendfrom.in.sin_addr, &remote.in.sin_addr,
IPPROTO_TCP, ifindex, false) )
return false;
}
// TODO: IPv6 support.
else
return errno = EAFNOSUPPORT, false;
// The segment was sent, so remember what was acknowledged and advertised,
// and advance the transmission position.
if ( has_syn )
recv_acked = recv_nxt;
recv_wndlast = recv_wnd;
assert(mod32_le(send_nxtpos, send_nxt));
send_pos = send_nxtpos;
}
if ( any )
{
SetDeadline();
SetTimer();
}
return true;
}
// Runs when the timer fires. If the deadline has passed, a lingering socket
// (TIME_WAIT, or unreferenced FIN_WAIT_2) is closed, and otherwise any
// unacknowledged data is queued for retransmission. Then a transmission is
// attempted, the timer is rearmed if a deadline is set, and the socket is
// deleted if that has become possible.
void TCPSocket::OnTimer()
{
	ScopedLock lock(&tcp_lock);
	timer_armed = false;
	const bool expired = 0 <= deadline.tv_sec &&
	                     timespec_le(deadline, Time::Get(CLOCK_MONOTONIC));
	if ( expired )
	{
		deadline = timespec_make(-1, 0);
		const bool lingering =
			state == TCP_STATE_TIME_WAIT ||
			(state == TCP_STATE_FIN_WAIT_2 && !is_referenced);
		if ( lingering )
			Close();
		else if ( mod32_lt(send_una, send_pos) )
		{
			// Rewind the transmission position to retransmit everything that
			// has not been acknowledged.
			send_pos = send_una;
			retransmissions++;
		}
	}
	transmit_scheduled = false;
	TransmitLoop();
	if ( !(deadline.tv_sec < 0) )
		SetTimer();
	// The lock guard releases the global tcp_lock, which is not a member of
	// the socket, so it is not affected by deleting the socket here.
	if ( can_destroy() )
		delete this;
}
// Requests that a transmission happens shortly from the timer. Does nothing
// for closed and listening sockets, or if a transmission is already scheduled
// with the timer pending.
void TCPSocket::ScheduleTransmit() // tcp_lock locked
{
	switch ( state )
	{
	case TCP_STATE_CLOSED:
	case TCP_STATE_LISTEN:
		return;
	default:
		break;
	}
	if ( !(transmit_scheduled && timer_armed) )
	{
		transmit_scheduled = true;
		SetTimer();
	}
}
void TCPSocket::SetDeadline() // tcp_lock locked
{
if ( 0 <= deadline.tv_sec )
return;
if ( state == TCP_STATE_TIME_WAIT ||
(state == TCP_STATE_FIN_WAIT_2 && !is_referenced) )
{
struct timespec now = Time::Get(CLOCK_MONOTONIC);
struct timespec msl2 = timespec_make(60, 0); // Documented in tcp(4).
deadline = timespec_add(now, msl2);
}
else if ( mod32_le(send_una, send_pos) )
{
struct timespec now = Time::Get(CLOCK_MONOTONIC);
struct timespec delay = timespec_make(1 + 1 * retransmissions, 0);
deadline = timespec_add(now, delay);
}
}
void TCPSocket::SetTimer() // tcp_lock locked
{
if ( timer_armed )
{
if ( !timer.TryCancel() )
return;
timer_armed = false;
}
if ( state == TCP_STATE_CLOSED )
return;
bool destruction_is_wanted = want_destruction();
if ( transmit_scheduled || destruction_is_wanted || 0 <= deadline.tv_sec )
{
int flags = TIMER_FUNC_MAY_DEALLOCATE_TIMER;
struct itimerspec timeout;
memset(&timeout, 0, sizeof(timeout));
// Slightly delay transmission to batch together a better reply.
if ( transmit_scheduled )
timeout.it_value = timespec_make(0, 1);
else if ( 0 <= deadline.tv_sec )
{
timeout.it_value = deadline;
flags |= TIMER_ABSOLUTE;
}
timer.Set(&timeout, NULL, flags, TCPSocket__OnTimer, this);
timer_armed = true;
}
}
// Processes one incoming TCP segment for this socket, following the segment
// arrival procedure of STD 7, RFC 793 (pages 65-75).
//
// pkt:     The packet; the TCP header begins at pkt->from + pkt->offset.
// pkt_src: Source address of the segment. Must be non-null when this socket
//          is listening, as it becomes the remote address of the new socket.
// pkt_dst: Destination address of the segment. Must be non-null when this
//          socket is listening, as it becomes the new socket's local address.
//
// Segments that arrive ahead of recv_nxt are stored in receive_queue, ordered
// by their original sequence number, for later processing by ReceivePacket.
// This function does not transmit replies itself except where TransmitLoop is
// explicitly invoked below.
void TCPSocket::ProcessPacket(Ref<Packet> pkt,
                              union tcp_sockaddr* pkt_src,
                              union tcp_sockaddr* pkt_dst) // tcp_lock locked
{
	// Decode the header into host byte order and skip past it (including any
	// options) to reach the payload.
	const unsigned char* in = pkt->from + pkt->offset;
	size_t inlen = pkt->length - pkt->offset;
	struct tcphdr hdr;
	memcpy(&hdr, in, sizeof(hdr));
	hdr.th_sport = be16toh(hdr.th_sport);
	hdr.th_dport = be16toh(hdr.th_dport);
	hdr.th_seq = be32toh(hdr.th_seq);
	hdr.th_ack = be32toh(hdr.th_ack);
	hdr.th_win = be16toh(hdr.th_win);
	hdr.th_urp = be16toh(hdr.th_urp);
	size_t offset = TCP_OFFSET_DECODE(hdr.th_offset) * 4;
	in += offset;
	inlen -= offset;
	if ( state == TCP_STATE_CLOSED ) // STD 7, RFC 793, page 65.
	{
		if ( hdr.th_flags & TH_RST )
			return;
		// TODO: ACK the RST.
		return;
	}
	else if ( state == TCP_STATE_LISTEN ) // STD 7, RFC 793, page 65.
	{
		if ( hdr.th_flags & TH_RST )
			return;
		if ( hdr.th_flags & TH_ACK )
		{
			// TODO: Send <SEQ=SEG.ACK><CTL=RST>.
			return;
		}
		if ( !(hdr.th_flags & TH_SYN) )
			return;
		if ( !hdr.th_win )
			return;
		// Drop the connection attempt if the backlog is full.
		if ( backlog_max <= backlog_used )
			return;
		// TODO: Use SYN cache to mitigate SYN flood attack.
		TCPSocket* socket = new TCPSocket(af);
		if ( !socket )
			return;
		assert(pkt_src);
		assert(pkt_dst);
		socket->remote = *pkt_src;
		socket->local = *pkt_dst;
		socket->remoted = true;
		socket->bound = true;
		// Insert the new socket at the front of the port's binding list.
		if ( af == AF_INET )
		{
			uint16_t port = be16toh(socket->local.in.sin_port);
			socket->prev_socket = NULL;
			socket->next_socket = bindings_v4[port];
			if ( socket->next_socket )
				socket->next_socket->prev_socket = socket;
			bindings_v4[port] = socket;
		}
		else if ( af == AF_INET6 )
		{
			uint16_t port = be16toh(socket->local.in6.sin6_port);
			socket->prev_socket = NULL;
			socket->next_socket = bindings_v6[port];
			if ( socket->next_socket )
				socket->next_socket->prev_socket = socket;
			bindings_v6[port] = socket;
		}
		socket->iss = arc4random();
		socket->send_una = socket->iss;
		socket->send_nxt = socket->iss;
		socket->send_wnd = 1;
		socket->send_pos = socket->iss;
		socket->outgoing_syn = TCP_SPECIAL_PENDING;
		socket->recv_wnd = TCP_MAXWIN;
		socket->recv_acked = hdr.th_seq;
		socket->recv_nxt = hdr.th_seq + 1;
		socket->irs = hdr.th_seq;
		socket->has_syn = true;
		socket->state = TCP_STATE_SYN_RECV;
		socket->UpdateWindow(hdr.th_win);
		// Insert the new socket at the front of the half-open list.
		socket->connecting_parent = this;
		socket->connecting_prev = NULL;
		socket->connecting_next = connecting_half;
		if ( socket->connecting_next )
			socket->connecting_next->connecting_prev = socket;
		connecting_half = socket;
		backlog_used++;
		socket->TransmitLoop();
		return;
	}
	else if ( state == TCP_STATE_SYN_SENT ) // STD 7, RFC 793, page 66.
	{
		if ( hdr.th_flags & TH_ACK )
		{
			if ( mod32_le(hdr.th_ack, iss) || mod32_gt(hdr.th_ack, send_nxt) )
			{
				if ( hdr.th_flags & TH_RST )
					return;
				// TODO: Send RST.
				return;
			}
			if ( !(mod32_le(send_una, hdr.th_ack) &&
			       mod32_le(hdr.th_ack, send_nxt)) )
				return;
		}
		if ( hdr.th_flags & TH_RST )
		{
			Fail(ECONNREFUSED);
			return;
		}
		if ( !(hdr.th_flags & TH_SYN) )
			return;
		recv_acked = hdr.th_seq;
		recv_nxt = hdr.th_seq + 1;
		irs = hdr.th_seq;
		has_syn = true;
		// RFC 1122 4.2.2.20 (c), page 94.
		UpdateWindow(hdr.th_win);
		send_wl1 = hdr.th_seq;
		send_wl2 = hdr.th_ack;
		// TODO: Drop packet if the packet contains data/FIN beyond the SYN?
		if ( hdr.th_flags & TH_ACK )
		{
			send_una = hdr.th_ack;
			retransmissions = 0;
			deadline = timespec_make(-1, 0);
			SetDeadline();
			SetTimer();
			if ( mod32_lt(iss, send_una) )
			{
				outgoing_syn = TCP_SPECIAL_ACKED;
				state = TCP_STATE_ESTAB;
				kthread_cond_broadcast(&receive_cond); // Wake up connect.
				return;
			}
		}
		state = TCP_STATE_SYN_RECV;
		return;
	}
	// STD 7, RFC 793, page 69.
	bool acceptable;
	if ( inlen == 0 && recv_wnd == 0 )
		acceptable = hdr.th_seq == recv_nxt;
	else if ( inlen == 0 && 0 < recv_wnd )
		acceptable = mod32_le(recv_nxt, hdr.th_seq) &&
		             mod32_lt(hdr.th_seq, recv_nxt + recv_wnd);
	else if ( 0 < inlen && 0 < recv_wnd )
	{
		tcp_seq seg_end = (tcp_seq) (hdr.th_seq + inlen - 1);
		acceptable = (mod32_le(recv_nxt, hdr.th_seq) &&
		              mod32_lt(hdr.th_seq, recv_nxt + recv_wnd)) ||
		             (mod32_le(recv_nxt, seg_end) &&
		              mod32_lt(seg_end, recv_nxt + recv_wnd));
	}
	else
	{
		acceptable = false;
		// TODO: STD 7, RFC 793, page 69 "If the RCV.WND is zero, no segments
		//       will be acceptable, but special allowance should be made to
		//       accept valid ACKs, URGs and RSTs".
	}
	if ( !acceptable )
	{
		if ( hdr.th_flags & TH_RST )
			return;
		// Send <SEQ=SND.NXT><ACK=RCV.NXT><CTL=ACK>.
		recv_acked = recv_nxt - 1;
		return;
	}
	// STD 7, RFC 793, page 70. Process segments in the right order and trim the
	// segment to the receive window.
	// Remember the untrimmed sequence number at full 32-bit width, as it is
	// compared against the 32-bit sequence numbers of queued packets below.
	tcp_seq real_seq = hdr.th_seq;
	if ( mod32_lt(hdr.th_seq, recv_nxt) && (hdr.th_flags & TH_SYN) )
	{
		hdr.th_flags &= ~TH_SYN;
		hdr.th_seq++;
	}
	if ( mod32_lt(hdr.th_seq, recv_nxt) )
	{
		// Skip the leading data that was already received.
		tcp_seq skip = recv_nxt - hdr.th_seq;
		if ( inlen < skip )
			skip = inlen;
		hdr.th_seq += skip;
		in += skip;
		inlen -= skip;
	}
	if ( mod32_lt(hdr.th_seq, recv_nxt) && (hdr.th_flags & TH_FIN) )
	{
		hdr.th_flags &= ~TH_FIN;
		hdr.th_seq++;
	}
	if ( mod32_lt(hdr.th_seq, recv_nxt) ) // Already processed.
		return;
	if ( mod32_gt(hdr.th_seq, recv_nxt) ) // Can't process yet.
	{
		// Insert the segment in the receive queue.
		Ref<Packet> prev;
		Ref<Packet> iter = receive_queue;
		// TODO: For n packets in the worst order, this scales O(n^2).
		// TODO: This wastes a packet per byte in the worst case.
		while ( iter )
		{
			const unsigned char* iter_in = iter->from + iter->offset;
			const unsigned char* iter_in_seq =
				iter_in + offsetof(struct tcphdr, th_seq);
			tcp_seq iter_seq;
			memcpy(&iter_seq, iter_in_seq, sizeof(iter_seq));
			iter_seq = be32toh(iter_seq);
			if ( mod32_le(real_seq, iter_seq) )
				break;
			// TODO: Handle duplicate and overlapping segments.
			prev = iter;
			iter = iter->next;
		}
		if ( prev )
		{
			pkt->next = prev->next;
			prev->next = pkt;
		}
		else
		{
			pkt->next = receive_queue;
			receive_queue = pkt;
		}
		return;
	}
	if ( recv_wnd < inlen )
		inlen = recv_wnd;
	// STD 7, RFC 793, page 70.
	if ( hdr.th_flags & TH_RST )
	{
		if ( state == TCP_STATE_CLOSING ||
		     state == TCP_STATE_LAST_ACK ||
		     state == TCP_STATE_TIME_WAIT )
			Close();
		else
			Fail(ECONNRESET);
		return;
	}
	// STD 7, RFC 793, page 71.
	if ( hdr.th_flags & TH_SYN )
	{
		// TODO: Send RST.
		Fail(ECONNRESET);
		return;
	}
	// STD 7, RFC 793, page 72.
	if ( !(hdr.th_flags & TH_ACK) )
		return;
	// STD 7, RFC 793, page 72.
	if ( state == TCP_STATE_SYN_RECV )
	{
		// RFC 1122 4.2.2.20 (f), page 94.
		UpdateWindow(hdr.th_win);
		send_wl1 = hdr.th_seq;
		send_wl2 = hdr.th_ack;
		if ( mod32_le(send_una, hdr.th_ack) && mod32_le(hdr.th_ack, send_nxt) )
		{
			state = TCP_STATE_ESTAB;
			kthread_cond_broadcast(&receive_cond); // Wake up connect.
			if ( connecting_parent )
			{
				// Move this socket from the listener's half-open list to
				// its list of connections that are ready to be accepted.
				if ( connecting_prev )
					connecting_prev->connecting_next = connecting_next;
				else
					connecting_parent->connecting_half = connecting_next;
				if ( connecting_next )
					connecting_next->connecting_prev = connecting_prev;
				// TODO: This inserts the connection to the front of the
				//       accept queue, rather than the end, which is unfair to
				//       connections that have been waiting longer.
				connecting_prev = NULL;
				connecting_next = connecting_parent->connecting_ready;
				if ( connecting_next )
					connecting_next->connecting_prev = this;
				connecting_parent->connecting_ready = this;
				kthread_cond_broadcast(&connecting_parent->receive_cond);
				uint16_t status = connecting_parent->PollEventStatus();
				connecting_parent->poll_channel.Signal(status);
			}
		}
		else
		{
			// TODO: Send <SEQ=SEG.ACK><CTL=RST>.
			TransmitLoop();
			return;
		}
	}
	// STD 7, RFC 793, page 72.
	// TODO: RFC 1122 4.2.2.20 (g), page 94 says SEG.ACK =< SND.UNA however this
	//       causes incoming connections to fail.
	if ( mod32_lt(hdr.th_ack, send_una) )
		return; // Drop duplicate ack already seen.
	else if ( mod32_lt(send_nxt, hdr.th_ack) )
	{
		// TODO: Send ACK.
		return;
	}
	// STD 7, RFC 793, page 72. Remove acknowledged data from the window.
	tcp_seq old_send_una = send_una;
	tcp_seq acked = (tcp_seq) (hdr.th_ack - send_una);
	// The SYN occupies the first sequence number of the window.
	if ( outgoing_syn == TCP_SPECIAL_WINDOW && 0 < acked )
	{
		outgoing_syn = TCP_SPECIAL_ACKED;
		acked--;
		send_una++;
	}
	// The FIN occupies the last sequence number of the window and is not data.
	tcp_seq window_data = (tcp_seq) (send_nxt - send_una);
	if ( outgoing_fin == TCP_SPECIAL_WINDOW )
		window_data--;
	if ( window_data && acked )
	{
		// Release the acknowledged bytes from the outgoing ring buffer.
		size_t amount = window_data < acked ? window_data : acked;
		assert(outgoing_offset < sizeof(outgoing));
		outgoing_offset += amount;
		if ( sizeof(outgoing) <= outgoing_offset )
			outgoing_offset -= sizeof(outgoing);
		assert(outgoing_offset < sizeof(outgoing));
		assert(amount <= outgoing_used);
		outgoing_used -= amount;
		kthread_cond_broadcast(&transmit_cond);
		poll_channel.Signal(PollEventStatus());
		acked -= amount;
		send_una += amount;
	}
	bool fin_was_acked = false;
	if ( outgoing_fin == TCP_SPECIAL_WINDOW && 0 < acked )
	{
		outgoing_fin = TCP_SPECIAL_ACKED;
		acked--;
		send_una++;
		fin_was_acked = true;
	}
	if ( send_una != old_send_una )
	{
		// TODO: Possibly recalculate the average time to contact remote.
		retransmissions = 0;
		// New data was acknowledged, so restart the retransmission deadline
		// rather than letting the deadline of the old data linger and cause
		// a premature retransmission of newer data.
		deadline = timespec_make(-1, 0);
		SetDeadline();
		SetTimer();
	}
	// STD 7, RFC 793, page 72.
	if ( mod32_lt(send_wl1, hdr.th_seq) ||
	     (send_wl1 == hdr.th_seq && mod32_le(send_wl2, hdr.th_ack)) )
	{
		UpdateWindow(hdr.th_win);
		send_wl1 = hdr.th_seq;
		send_wl2 = hdr.th_ack;
	}
	// STD 7, RFC 793, page 73.
	if ( state == TCP_STATE_FIN_WAIT_1 )
	{
		if ( fin_was_acked )
		{
			state = TCP_STATE_FIN_WAIT_2;
			// Time out the connection if the socket is no longer referenced.
			if ( !is_referenced )
			{
				deadline = timespec_make(-1, 0);
				SetDeadline();
				SetTimer();
			}
		}
	}
	else if ( state == TCP_STATE_CLOSING )
	{
		if ( fin_was_acked )
		{
			state = TCP_STATE_TIME_WAIT;
			deadline = timespec_make(-1, 0);
			SetDeadline();
			SetTimer();
		}
		return;
	}
	else if ( state == TCP_STATE_LAST_ACK )
	{
		if ( fin_was_acked )
			Close();
		return;
	}
	// TODO: Urgent data per STD 7, RFC 793, page 73.
	// STD 7, RFC 793, page 74.
	if ( state == TCP_STATE_ESTAB ||
	     state == TCP_STATE_FIN_WAIT_1 ||
	     state == TCP_STATE_FIN_WAIT_2 )
	{
		// Copy the payload into the incoming ring buffer, in up to two pieces
		// if the free space wraps around the end of the buffer.
		assert(incoming_offset < sizeof(incoming));
		assert(incoming_used <= sizeof(incoming));
		size_t available = sizeof(incoming) - incoming_used;
		size_t amount = available < inlen ? available : inlen;
		assert(amount <= sizeof(incoming));
		assert(amount <= available);
		size_t newat = incoming_offset + incoming_used;
		if ( sizeof(incoming) <= newat )
			newat -= sizeof(incoming);
		assert(newat < sizeof(incoming));
		size_t until_end = sizeof(incoming) - newat;
		assert(until_end <= sizeof(incoming));
		size_t first = until_end < amount ? until_end : amount;
		assert(first <= amount);
		assert(first <= sizeof(incoming));
		size_t second = amount - first;
		assert(second <= amount);
		assert(second <= sizeof(incoming));
		assert(first + second == amount);
		assert(first + second <= sizeof(incoming));
		assert(first + second <= available);
		// The data is discarded (yet still acknowledged) if receiving has
		// been shut down.
		if ( !shutdown_receive )
		{
			memcpy(incoming + newat, in, first);
			if ( second )
				memcpy(incoming, in + first, second);
			incoming_used += amount;
		}
		available = sizeof(incoming) - incoming_used;
		if ( available < recv_wnd )
			recv_wnd = available;
		recv_nxt = hdr.th_seq + amount;
		// The FIN is only consumed if all the data before it was received.
		if ( amount == inlen && (hdr.th_flags & TH_FIN) )
		{
			recv_nxt++;
			has_fin = true;
		}
		if ( incoming_used || has_fin )
		{
			kthread_cond_broadcast(&receive_cond);
			poll_channel.Signal(PollEventStatus());
		}
	}
	// STD 7, RFC 793, page 75.
	if ( hdr.th_flags & TH_FIN )
	{
		if ( state == TCP_STATE_ESTAB )
		{
			state = TCP_STATE_CLOSE_WAIT;
			kthread_cond_broadcast(&receive_cond);
			poll_channel.Signal(PollEventStatus());
		}
		else if ( state == TCP_STATE_FIN_WAIT_1 )
		{
			// Our sent FIN hasn't been ACK'd or we'd be in FIN_WAIT_2.
			state = TCP_STATE_CLOSING;
		}
		else if ( state == TCP_STATE_FIN_WAIT_2 )
		{
			state = TCP_STATE_TIME_WAIT;
			deadline = timespec_make(-1, 0);
			SetDeadline();
			SetTimer();
		}
		else if ( state == TCP_STATE_TIME_WAIT )
		{
			// The timer is not reset like as by the standard to avoid a hostile
			// remote from staying forever in TIME-WAIT.
		}
	}
}
// Processes a newly arrived packet (if any) and then drains the front of the
// out-of-order receive queue for as long as its head is not ahead of
// recv_nxt. A transmission is scheduled afterwards.
void TCPSocket::ReceivePacket(Ref<Packet> pktnew,
                              union tcp_sockaddr* pkt_src,
                              union tcp_sockaddr* pkt_dst) // tcp_lock locked
{
	if ( pktnew )
		ProcessPacket(pktnew, pkt_src, pkt_dst);
	while ( receive_queue )
	{
		Ref<Packet> head = receive_queue;
		// Read the sequence number straight out of the queued TCP header.
		const unsigned char* seq_field =
			head->from + head->offset + offsetof(struct tcphdr, th_seq);
		tcp_seq head_seq;
		memcpy(&head_seq, seq_field, sizeof(head_seq));
		head_seq = be32toh(head_seq);
		// Stop at the first segment that is still in the future.
		if ( mod32_gt(head_seq, recv_nxt) )
			break;
		// Unlink the head; it is processed only if it is exactly the next
		// expected segment and is otherwise dropped.
		receive_queue = head->next;
		head->next.Reset();
		const bool is_next_expected = head_seq == recv_nxt;
		if ( is_next_expected )
			ProcessPacket(head, pkt_src, pkt_dst);
	}
	// Delay transmit to answer more efficiently based on upcoming packets.
	ScheduleTransmit();
}
void TCPSocket::UpdateWindow(uint16_t new_window)
{
tcp_seq pending = (tcp_seq) (send_nxt - send_una);
if ( new_window < pending )
send_nxt = (tcp_seq) (send_una + pending);
send_wnd = new_window;
}
// Connects the socket to the remote address (IPv4 only), binding it to a
// suitable local address first if it is not yet bound. Blocks until the
// handshake finishes unless the descriptor is non-blocking, in which case
// EINPROGRESS is reported. Returns 0 on success or -1 with errno set.
int TCPSocket::connect(ioctx_t* ctx, const uint8_t* addr, size_t addrsize)
{
	ScopedLock lock(&tcp_lock);
	// TODO: os-test listen + connect, what errno?
	const bool handshaking =
		state == TCP_STATE_SYN_SENT || state == TCP_STATE_SYN_RECV;
	if ( handshaking )
		return errno = EALREADY, -1;
	if ( state != TCP_STATE_CLOSED )
		return errno = EISCONN, -1; // TODO: Another errno if listening?
	union tcp_sockaddr target;
	if ( !ImportAddress(ctx, &target, addr, addrsize) )
		return -1;
	if ( af != AF_INET )
		return errno = EAFNOSUPPORT, -1;
	// Verify the port is non-zero.
	if ( be16toh(target.in.sin_port) == 0 )
		return errno = EADDRNOTAVAIL, -1;
	// TODO: os-test AF_UNSPEC
	// If the socket is not bound, find a route to the remote address and bind
	// to the appropriate source address.
	if ( !bound )
	{
		union tcp_sockaddr source;
		memset(&source, 0, sizeof(source));
		struct in_addr any;
		any.s_addr = htobe32(INADDR_ANY);
		source.in.sin_family = AF_INET;
		if ( !IP::GetSourceIP(&any, &target.in.sin_addr,
		                      &source.in.sin_addr, ifindex, NULL) )
			return -1;
		source.in.sin_port = htobe16(0);
		if ( !BindDefault(&source) )
			return -1;
	}
	// Test if there is a route from the local address to the remote address.
	// TODO: Does TCP also do this? Note that connecting to the any address
	//       should be forbidden, right?
	if ( af != AF_INET )
		return errno = EAFNOSUPPORT, -1;
	if ( !IP::GetSourceIP(&local.in.sin_addr, &target.in.sin_addr,
	                      NULL, ifindex, NULL) )
		return -1;
	memcpy(&remote, &target, sizeof(target));
	remoted = true;
	// Initialize the connection state and queue the initial SYN.
	iss = arc4random();
	recv_wnd = TCP_MAXWIN;
	send_una = iss;
	send_nxt = iss;
	send_wnd = 1;
	send_pos = iss;
	outgoing_syn = TCP_SPECIAL_PENDING;
	state = TCP_STATE_SYN_SENT;
	TransmitLoop();
	for ( ; !sockerr; )
	{
		if ( state != TCP_STATE_SYN_SENT && state != TCP_STATE_SYN_RECV )
			break;
		// TODO: os-test non-blocking connect.
		if ( ctx->dflags & O_NONBLOCK )
			return errno = EINPROGRESS, -1;
		if ( !kthread_cond_wait_signal(&receive_cond, &tcp_lock) )
			return errno = EINTR, -1;
	}
	if ( !sockerr )
		return 0;
	// TODO: This is not recoverable. Is that correct?
	// TODO: os-test whether reconnect is possible after failed connect?
	return errno = sockerr, -1;
}
// Puts a bound, closed socket into the listening state. The backlog is
// clamped to the range 1..SOMAXCONN. Returns 0 on success or -1 with errno
// set.
int TCPSocket::listen(ioctx_t* /*ctx*/, int backlog)
{
	if ( backlog < 0 )
		return errno = EINVAL, -1;
	// TODO: os-test if zero backlog allows connections.
	int effective_backlog = backlog;
	if ( effective_backlog == 0 )
		effective_backlog = 1;
	else if ( SOMAXCONN < effective_backlog )
		effective_backlog = SOMAXCONN;
	ScopedLock lock(&tcp_lock);
	if ( !bound )
		return errno = EDESTADDRREQ, -1;
	// TODO: os-test a regular connection, close, and then try to listen.
	if ( state != TCP_STATE_CLOSED )
		return errno = EINVAL, -1;
	backlog_max = effective_backlog;
	// A listening socket has the any address as its remote address.
	memset(&remote, 0, sizeof(remote));
	switch ( af )
	{
	case AF_INET:
		remote.in.sin_family = AF_INET;
		remote.in.sin_addr.s_addr = htobe32(INADDR_ANY);
		break;
	case AF_INET6:
		remote.in6.sin6_family = AF_INET6;
		remote.in6.sin6_addr = in6addr_any;
		break;
	default:
		return errno = EAFNOSUPPORT, -1;
	}
	remoted = true;
	state = TCP_STATE_LISTEN;
	return 0;
}
// Receives data under tcp_lock and then lets the remote learn about the
// updated receive window.
ssize_t TCPSocket::recv(ioctx_t* ctx, uint8_t* buf, size_t count, int flags)
{
	ScopedLock lock(&tcp_lock);
	const ssize_t received = recv_unlocked(ctx, buf, count, flags);
	// Respond immediately if receive window has become empty.
	if ( !incoming_used )
		TransmitLoop();
	else
		ScheduleTransmit();
	return received;
}
// Scatter-receive: fills the caller's I/O vectors in order using
// recv_unlocked, stopping at the first short or failed receive. Returns the
// total number of bytes received, or -1 if nothing was received before an
// error occurred.
ssize_t TCPSocket::recvmsg(ioctx_t* ctx, struct msghdr* msg_ptr, int flags)
{
	struct msghdr msg;
	if ( !ctx->copy_from_src(&msg, msg_ptr, sizeof(msg)) )
		return -1;
	if ( msg.msg_iovlen < 0 || IOV_MAX < msg.msg_iovlen )
		return errno = EINVAL, -1;
	// Take a kernel copy of the I/O vector array.
	struct iovec* user_iov = msg.msg_iov;
	struct iovec* iov = new struct iovec[msg.msg_iovlen];
	if ( !iov )
		return -1;
	const size_t iov_bytes = msg.msg_iovlen * sizeof(struct iovec);
	if ( !ctx->copy_from_src(iov, user_iov, iov_bytes) )
	{
		delete[] iov;
		return -1;
	}
	msg.msg_iov = iov;
	kthread_mutex_lock(&tcp_lock);
	ssize_t total = 0;
	int index = 0;
	while ( index < msg.msg_iovlen && total < SSIZE_MAX )
	{
		const struct iovec* vec = &iov[index++];
		// Never let the total exceed what ssize_t can represent.
		const size_t room = SSIZE_MAX - (size_t) total;
		const size_t want = vec->iov_len < room ? vec->iov_len : room;
		if ( !want )
			continue;
		uint8_t* dest = (uint8_t*) vec->iov_base;
		const ssize_t got = recv_unlocked(ctx, dest, want, flags);
		if ( got < 0 )
		{
			if ( total == 0 )
				total = -1;
			break;
		}
		total += got;
		if ( (size_t) got != want )
			break;
	}
	// Respond immediately if receive window has become empty.
	if ( !incoming_used )
		TransmitLoop();
	else
		ScheduleTransmit();
	kthread_mutex_unlock(&tcp_lock);
	// Hand the header back with the caller's own vector pointer restored.
	msg.msg_iov = user_iov;
	// TODO: os-test POSIX's requirement to ignore msg_name and msg_namelen,
	//       plus msg_controllen's behavior is unspecified.
	msg.msg_namelen = 0;
	msg.msg_controllen = 0;
	delete[] iov;
	if ( !ctx->copy_to_dest(msg_ptr, &msg, sizeof(msg)) )
		return -1;
	return total;
}
// Copies received data from the incoming ring buffer to the caller.
// Supports MSG_PEEK (data is left in the buffer) and MSG_WAITALL (keep
// waiting until count bytes have arrived). Returns the number of bytes
// copied, 0 at end of stream, or -1 with errno set.
ssize_t TCPSocket::recv_unlocked(ioctx_t* ctx,
                                 uint8_t* buf,
                                 size_t count,
                                 int flags) // tcp_lock taken
{
	const int supported_flags = MSG_PEEK | MSG_WAITALL; // TODO: MSG_OOB.
	if ( flags & ~supported_flags )
		return errno = EINVAL, -1;
	if ( sockerr )
		return errno = sockerr, -1;
	// TODO: os-test non-blocking connect + immediate recv.
	// TODO: CLOSED after it has been closed?
	switch ( state )
	{
	case TCP_STATE_CLOSED:
	case TCP_STATE_LISTEN:
	case TCP_STATE_SYN_SENT:
	case TCP_STATE_SYN_RECV:
		return errno = ENOTCONN, -1;
	default:
		break;
	}
	size_t done = 0;
	while ( done < count )
	{
		// Wait until there is data or the stream has ended.
		while ( !incoming_used && !has_fin && !shutdown_receive )
		{
			if ( sockerr )
				return done ? done : (errno = sockerr, -1);
			if ( state == TCP_STATE_CLOSED )
				return done;
			if ( done && !(flags & MSG_WAITALL) )
				return done;
			if ( ctx->dflags & O_NONBLOCK )
				return done ? done : (errno = EWOULDBLOCK, -1);
			if ( !kthread_cond_wait_signal(&receive_cond, &tcp_lock) )
				return done ? done : (errno = EINTR, -1);
			if ( sockerr )
				return done ? done : (errno = sockerr, -1);
		}
		// The buffer is empty and no more data will arrive.
		if ( !incoming_used && (has_fin || shutdown_receive) )
			return done;
		uint8_t* dest = buf + done;
		const size_t remaining = count - done;
		assert(incoming_used <= sizeof(incoming));
		const size_t chunk =
			incoming_used < remaining ? incoming_used : remaining;
		assert(incoming_offset < sizeof(incoming));
		// The chunk may wrap around the end of the ring buffer.
		const size_t contiguous = sizeof(incoming) - incoming_offset;
		const size_t head = contiguous < chunk ? contiguous : chunk;
		const size_t tail = chunk - head;
		if ( !ctx->copy_to_dest(dest, incoming + incoming_offset, head) )
			return done ? done : -1;
		if ( tail && !ctx->copy_to_dest(dest + head, incoming, tail) )
			return done ? done : -1;
		done += chunk;
		if ( flags & MSG_PEEK )
			return done;
		// Consume the data and reopen the receive window accordingly.
		incoming_offset += chunk;
		if ( sizeof(incoming) <= incoming_offset )
			incoming_offset -= sizeof(incoming);
		assert(incoming_offset < sizeof(incoming));
		incoming_used -= chunk;
		recv_wnd = sizeof(incoming) - incoming_used;
		if ( UINT16_MAX < recv_wnd )
			recv_wnd = UINT16_MAX;
		if ( TCP_MAXWIN < recv_wnd )
			recv_wnd = TCP_MAXWIN;
	}
	return done;
}
// Queues up to count bytes from buf for sending. Takes tcp_lock, delegates
// to send_unlocked, and then runs the transmit loop. Returns whatever
// send_unlocked returned.
ssize_t TCPSocket::send(ioctx_t* ctx,
const uint8_t* buf,
size_t count,
int flags)
{
ScopedLock lock(&tcp_lock);
ssize_t result = send_unlocked(ctx, buf, count, flags);
// Run the transmit loop regardless of whether queueing succeeded.
TransmitLoop();
return result;
}
// Gather-send: queues the caller's I/O vectors in order using
// send_unlocked, stopping at the first short or failed send. Returns the
// total number of bytes queued, or -1 if nothing was queued before an error
// occurred.
ssize_t TCPSocket::sendmsg(ioctx_t* ctx,
                           const struct msghdr* msg_ptr,
                           int flags)
{
	struct msghdr msg;
	if ( !ctx->copy_from_src(&msg, msg_ptr, sizeof(msg)) )
		return -1;
	if ( msg.msg_iovlen < 0 || IOV_MAX < msg.msg_iovlen )
		return errno = EINVAL, -1;
	// TODO: os-test if msg_name/msg_namelen/msg_control/msg_controllen are set.
	// Take a kernel copy of the I/O vector array.
	struct iovec* iov = new struct iovec[msg.msg_iovlen];
	if ( !iov )
		return -1;
	const size_t iov_bytes = msg.msg_iovlen * sizeof(struct iovec);
	if ( !ctx->copy_from_src(iov, msg.msg_iov, iov_bytes) )
	{
		delete[] iov;
		return -1;
	}
	msg.msg_iov = iov;
	kthread_mutex_lock(&tcp_lock);
	ssize_t total = 0;
	int index = 0;
	while ( index < msg.msg_iovlen && total < SSIZE_MAX )
	{
		const struct iovec* vec = &iov[index++];
		// Never let the total exceed what ssize_t can represent.
		const size_t room = SSIZE_MAX - (size_t) total;
		const size_t want = vec->iov_len < room ? vec->iov_len : room;
		const uint8_t* src = (const uint8_t*) vec->iov_base;
		const ssize_t sent = send_unlocked(ctx, src, want, flags);
		if ( sent < 0 )
		{
			if ( total == 0 )
				total = -1;
			break;
		}
		total += sent;
		if ( (size_t) sent != want )
			break;
	}
	TransmitLoop();
	kthread_mutex_unlock(&tcp_lock);
	delete[] iov;
	return total;
}
// Copies up to count bytes from the caller into the outgoing ring buffer.
// The data is not transmitted here; the caller runs the transmit loop.
//
// Only MSG_NOSIGNAL is supported in flags. Blocks while the buffer is full
// unless the descriptor is non-blocking or some data was already queued.
//
// Returns the number of bytes queued, or -1 with errno set to:
//   EINVAL      - unsupported flags.
//   ENOTCONN    - the socket has not completed a connection.
//   EWOULDBLOCK - non-blocking and the buffer is full.
//   EINTR       - interrupted by a signal while waiting.
//   EPIPE       - the connection can no longer send (SIGPIPE is also
//                 delivered unless MSG_NOSIGNAL is set).
//   sockerr     - a pending socket error.
ssize_t TCPSocket::send_unlocked(ioctx_t* ctx,
                                 const uint8_t* buf,
                                 size_t count,
                                 int flags) // tcp_lock taken
{
	// TODO: MSG_MORE (and implement TCP_CORK), MSG_OOB, MSG_DONTROUTE.
	if ( flags & ~(MSG_NOSIGNAL) )
		return errno = EINVAL, -1;
	if ( sockerr )
		return errno = sockerr, -1;
	if ( state == TCP_STATE_CLOSED ||
	     state == TCP_STATE_LISTEN ||
	     state == TCP_STATE_SYN_SENT ||
	     state == TCP_STATE_SYN_RECV )
		return errno = ENOTCONN, -1;
	size_t sofar = 0;
	while ( sofar < count )
	{
		// Wait for room in the buffer, but only while the connection is in a
		// state where sending is possible. Waiting in any other state would
		// never end and would keep the EPIPE check below from being reached.
		while ( (state == TCP_STATE_ESTAB || state == TCP_STATE_CLOSE_WAIT) &&
		        outgoing_used == sizeof(outgoing) )
		{
			if ( sofar )
				return sofar;
			if ( sockerr )
				return errno = sockerr, -1;
			if ( ctx->dflags & O_NONBLOCK )
				return errno = EWOULDBLOCK, -1;
			if ( !kthread_cond_wait_signal(&transmit_cond, &tcp_lock) )
				return errno = EINTR, -1;
		}
		if ( state != TCP_STATE_ESTAB && state != TCP_STATE_CLOSE_WAIT )
		{
			// Prefer reporting a pending socket error over a broken pipe.
			if ( sockerr )
				return errno = sockerr, -1;
			if ( !(flags & MSG_NOSIGNAL) )
				CurrentThread()->DeliverSignal(SIGPIPE);
			return errno = EPIPE, -1;
		}
		const uint8_t* data = buf + sofar;
		size_t left = count - sofar;
		assert(outgoing_offset < sizeof(outgoing));
		assert(outgoing_used <= sizeof(outgoing));
		size_t available = sizeof(outgoing) - outgoing_used;
		size_t amount = available < left ? available : left;
		// Locate the end of the queued data, which is where new data goes.
		size_t newat = outgoing_offset + outgoing_used;
		if ( sizeof(outgoing) <= newat )
			newat -= sizeof(outgoing);
		assert(newat < sizeof(outgoing));
		// The free space may wrap around the end of the ring buffer.
		size_t until_end = sizeof(outgoing) - newat;
		size_t first = until_end < amount ? until_end : amount;
		size_t second = amount - first;
		if ( !ctx->copy_from_src(outgoing + newat, data, first) )
			return sofar ? sofar : -1;
		if ( second && !ctx->copy_from_src(outgoing, data + first, second) )
			return sofar ? sofar : -1;
		outgoing_used += amount;
		assert(outgoing_used <= sizeof(outgoing));
		sofar += amount;
		// TODO: If there's a sent packet that hasn't been acknowledged, and
		//       there isn't a full packet yet, then just buffer and don't
		//       transmit yet.
		// TODO: TCP_NODELAY, TCP_NOPUSH, MSG_MORE.
		// TODO: Set PUSH appropriately.
	}
	return sofar;
}
// Reading is the same as receiving with no flags.
ssize_t TCPSocket::read(ioctx_t* ctx, uint8_t* buf, size_t count)
{
return recv(ctx, buf, count, 0);
}
// Writing is the same as sending with no flags.
ssize_t TCPSocket::write(ioctx_t* ctx, const uint8_t* buf, size_t count)
{
return send(ctx, buf, count, 0);
}
short TCPSocket::PollEventStatus()
{
// TODO: os-test the poll bits.
// TODO: OOB poll bits.
short status = 0;
if ( connecting_ready )
status |= POLLIN | POLLRDNORM;
if ( incoming_used || has_fin || shutdown_receive )
status |= POLLIN | POLLRDNORM;
if ( (state == TCP_STATE_ESTAB || state == TCP_STATE_CLOSE_WAIT) &&
outgoing_used < sizeof(outgoing) )
status |= POLLOUT | POLLWRNORM;
if ( state == TCP_STATE_CLOSE_WAIT ||
state == TCP_STATE_LAST_ACK ||
state == TCP_STATE_TIME_WAIT ||
state == TCP_STATE_CLOSED )
status |= POLLHUP;
if ( sockerr )
status |= POLLERR;
return status;
}
// Reports any requested events that are already pending; otherwise registers
// the node on the poll channel and fails with EAGAIN.
int TCPSocket::poll(ioctx_t* /*ctx*/, PollNode* node)
{
	ScopedLock lock(&tcp_lock);
	const short pending = PollEventStatus() & node->events;
	if ( !pending )
	{
		poll_channel.Register(node);
		return errno = EAGAIN, -1;
	}
	node->master->revents |= pending;
	return 0;
}
// Retrieves a socket option. SO_BINDTODEVICE yields the interface name as a
// string; every other supported option is returned as an integer through
// sockopt_return_uintmax. Returns 0 on success or -1 with errno set.
int TCPSocket::getsockopt(ioctx_t* ctx, int level, int option_name,
                          void* option_value, size_t* option_size_ptr)
{
	ScopedLock lock(&tcp_lock);
	if ( level == SOL_SOCKET && option_name == SO_BINDTODEVICE )
	{
		ScopedLock netifs_guard(&netifs_lock);
		// The name is the empty string when no such interface exists.
		const char* name = "";
		if ( ifindex < netifs_count && netifs[ifindex] )
			name = netifs[ifindex]->ifinfo.name;
		size_t capacity;
		if ( !CopyFromUser(&capacity, option_size_ptr, sizeof(capacity)) )
			return -1;
		// The terminating NUL byte is copied too.
		size_t needed = strlen(name) + 1;
		if ( capacity < needed )
			return errno = ERANGE, -1;
		if ( !CopyToUser(option_value, name, needed) )
			return -1;
		if ( !CopyToUser(option_size_ptr, &needed, sizeof(needed)) )
			return -1;
		return 0;
	}
	uintmax_t value = 0;
	if ( level == IPPROTO_TCP )
	{
		// TODO: TCP_NODELAY
		// TODO: TCP_MAXSEG
		// TODO: TCP_NOPUSH
		// TODO: TCP_CORK
		return errno = ENOPROTOOPT, -1;
	}
	else if ( level == SOL_SOCKET )
	{
		switch ( option_name )
		{
		case SO_BINDTOINDEX:
			value = ifindex;
			break;
		case SO_DEBUG:
			value = 0;
			break;
		case SO_DOMAIN:
			value = af;
			break;
		case SO_ERROR:
			value = sockerr;
			break;
		case SO_PROTOCOL:
			value = IPPROTO_TCP;
			break;
		case SO_RCVBUF:
			value = sizeof(incoming);
			break;
		case SO_REUSEADDR:
			value = reuseaddr;
			break;
		case SO_SNDBUF:
			value = sizeof(outgoing);
			break;
		case SO_TYPE:
			value = SOCK_STREAM;
			break;
		// TODO: SO_ACCEPTCONN
		// TODO: SO_LINGER
		// TODO: SO_OOBINLINE
		// TODO: SO_RCVLOWAT
		// TODO: SO_RCVTIMEO
		// TODO: SO_SNDLOWAT
		// TODO: SO_SNDTIMEO
		// TODO: SO_DONTROUTE
		// TODO: SO_BROADCAST
		default:
			return errno = ENOPROTOOPT, -1;
		}
	}
	else
		return errno = EINVAL, -1;
	if ( !sockopt_return_uintmax(value, ctx, option_value, option_size_ptr) )
		return -1;
	return 0;
}
// TODO: os-test socket options on shut down sockets. POSIX says EINVAL.
// TODO: os-test the errno for an invalid protocol.
// TODO: os-test the errno for an invalid option at a protocol level.
// Sets a socket option. SO_BINDTODEVICE takes an interface name (at most
// IF_NAMESIZE - 1 characters, NUL terminator optional); every other option
// takes an integer fetched through sockopt_fetch_uintmax. Several options
// are accepted but not yet implemented. Returns 0 on success or -1 with
// errno set (ENODEV for an unknown interface name, ENOPROTOOPT for an
// unknown option, EINVAL for an unknown level or out of range value).
int TCPSocket::setsockopt(ioctx_t* ctx, int level, int option_name,
                          const void* option_value, size_t option_size)
{
	ScopedLock lock(&tcp_lock);
	if ( level == SOL_SOCKET && option_name == SO_BINDTODEVICE )
	{
		char ifname[IF_NAMESIZE];
		if ( sizeof(ifname) < option_size )
			option_size = sizeof(ifname);
		if ( !CopyFromUser(ifname, option_value, option_size) )
			return -1;
		// A name that fills the entire buffer without a terminator is too
		// long to be any interface.
		size_t ifname_len = strnlen(ifname, option_size);
		if ( ifname_len == sizeof(ifname) )
			return errno = ENODEV, -1;
		// Terminate at the string's length, which is always inside the
		// buffer, rather than at option_size, which may be one past its end.
		ifname[ifname_len] = '\0';
		ScopedLock lock(&netifs_lock);
		for ( size_t i = 1; i < netifs_count; i++ )
		{
			if ( netifs[i] && !strcmp(ifname, netifs[i]->ifinfo.name) )
			{
				ifindex = i;
				return 0;
			}
		}
		return errno = ENODEV, -1;
	}
	uintmax_t value;
	if ( !sockopt_fetch_uintmax(&value, ctx, option_value, option_size) )
		return -1;
	if ( level == IPPROTO_TCP )
	{
		switch ( option_name )
		{
		case TCP_NODELAY: break; // TODO: Transmit if turned on?
		case TCP_MAXSEG: break; // TODO: Implement this.
		case TCP_NOPUSH: break; // TODO: Implement this.
		// TODO: TCP_CORK
		default:
			return errno = ENOPROTOOPT, -1;
		}
	}
	else if ( level == SOL_SOCKET )
	{
		switch ( option_name )
		{
		case SO_BINDTOINDEX:
			if ( UINT_MAX < value )
				return errno = EINVAL, -1;
			ifindex = value;
			break;
		case SO_DEBUG:
			if ( value != 0 )
				return errno = EPERM, -1;
			break;
		case SO_KEEPALIVE: break; // TODO: Implement this.
		case SO_REUSEADDR: reuseaddr = value; break;
		case SO_LINGER: break; // TODO: Implement this.
		case SO_RCVBUF: break; // TODO: Implement this.
		case SO_SNDBUF: break; // TODO: Implement this.
		// TODO: SO_BROADCAST
		// TODO: SO_DONTROUTE
		// TODO: SO_RCVLOWAT
		// TODO: SO_RCVTIMEO
		// TODO: SO_SNDLOWAT
		// TODO: SO_SNDTIMEO
		default: return errno = ENOPROTOOPT, -1;
		}
	}
	else
		return errno = EINVAL, -1;
	return 0;
}
// Shuts down the socket in the directions given by how. Takes tcp_lock and
// delegates to shutdown_unlocked.
int TCPSocket::shutdown(ioctx_t* /*ctx*/, int how)
{
ScopedLock lock(&tcp_lock);
return shutdown_unlocked(how);
}
// Shuts down sending (SHUT_WR) and/or receiving (SHUT_RD). Fails with
// ENOTCONN unless the socket is connecting, established, or in CLOSE-WAIT.
int TCPSocket::shutdown_unlocked(int how) // tcp_lock taken
{
	// STD 7, RFC 793, page 60.
	switch ( state )
	{
	case TCP_STATE_SYN_SENT:
	case TCP_STATE_SYN_RECV:
	case TCP_STATE_ESTAB:
	case TCP_STATE_CLOSE_WAIT:
		break;
	default:
		return errno = ENOTCONN, -1;
	}
	const bool stop_sending = how & SHUT_WR;
	const bool stop_receiving = how & SHUT_RD;
	if ( stop_sending )
	{
		// STD 7, RFC 793, page 60.
		if ( state == TCP_STATE_SYN_SENT )
			Close();
		else // TCP_STATE_SYN_RECV || TCP_STATE_ESTAB || TCP_STATE_CLOSE_WAIT
		{
			// Queue a FIN after any data that is still waiting to be sent.
			outgoing_fin = TCP_SPECIAL_PENDING;
			// TODO: Should this state transition be delayed until the FIN
			//       enters the window or is sent?
			// RFC 1122, 4.2.2.20 (a), page 93: CLOSE-WAIT goes to LAST-ACK.
			const bool remote_already_closed = state == TCP_STATE_CLOSE_WAIT;
			state = remote_already_closed ? TCP_STATE_LAST_ACK
			                              : TCP_STATE_FIN_WAIT_1;
			kthread_cond_broadcast(&transmit_cond);
			TransmitLoop();
		}
	}
	if ( stop_receiving )
	{
		shutdown_receive = true;
		kthread_cond_broadcast(&receive_cond);
	}
	return 0;
}
// Copies the remote address to the caller, truncated to the caller's buffer
// size, and stores the number of bytes copied in *addrsize_ptr. Fails with
// ENOTCONN if there is no remote address or the socket is listening.
int TCPSocket::getpeername(ioctx_t* ctx, uint8_t* addr, size_t* addrsize_ptr)
{
	ScopedLock lock(&tcp_lock);
	if ( !remoted || state == TCP_STATE_LISTEN )
		return errno = ENOTCONN, -1;
	size_t addrsize;
	if ( !ctx->copy_from_src(&addrsize, addrsize_ptr, sizeof(addrsize)) )
		return -1;
	// Size of the address structure for this address family.
	size_t full_size;
	switch ( af )
	{
	case AF_INET:
		full_size = sizeof(remote.in);
		break;
	case AF_INET6:
		full_size = sizeof(remote.in6);
		break;
	default:
		return errno = EAFNOSUPPORT, -1;
	}
	if ( full_size < addrsize )
		addrsize = full_size;
	if ( !ctx->copy_to_dest(addr, &remote, addrsize) ||
	     !ctx->copy_to_dest(addrsize_ptr, &addrsize, sizeof(addrsize)) )
		return -1;
	return 0;
}
// Copies the local address to the caller, truncated to the caller's buffer
// size, and stores the number of bytes copied in *addrsize_ptr.
int TCPSocket::getsockname(ioctx_t* ctx, uint8_t* addr, size_t* addrsize_ptr)
{
	ScopedLock lock(&tcp_lock);
	size_t addrsize;
	if ( !ctx->copy_from_src(&addrsize, addrsize_ptr, sizeof(addrsize)) )
		return -1;
	// Size of the address structure for this address family.
	size_t full_size;
	switch ( af )
	{
	case AF_INET:
		full_size = sizeof(local.in);
		break;
	case AF_INET6:
		full_size = sizeof(local.in6);
		break;
	default:
		return errno = EAFNOSUPPORT, -1;
	}
	if ( full_size < addrsize )
		addrsize = full_size;
	if ( !ctx->copy_to_dest(addr, &local, addrsize) ||
	     !ctx->copy_to_dest(addrsize_ptr, &addrsize, sizeof(addrsize)) )
		return -1;
	return 0;
}
// TODO: os-test fstat on a socket.
// Creates the inode through which a TCP socket is accessed. The socket is
// marked as referenced, and the inode is owned by the current process' user
// and group with mode 0600.
TCPSocketNode::TCPSocketNode(TCPSocket* socket)
{
	socket->is_referenced = true;
	this->socket = socket;
	// Inode identity and type.
	inode_type = INODE_TYPE_STREAM;
	type = S_IFSOCK;
	dev = (dev_t) this;
	ino = (ino_t) this;
	// Ownership is taken from the creating process.
	Process* owner = CurrentProcess();
	kthread_mutex_lock(&owner->idlock);
	stat_uid = owner->uid;
	stat_gid = owner->gid;
	kthread_mutex_unlock(&owner->idlock);
	stat_mode = 0600 | this->type;
}
// Tells the socket that its inode is gone by calling Unreference(). The
// socket object itself is not deleted here.
TCPSocketNode::~TCPSocketNode()
{
socket->Unreference();
}
// Forwards to the underlying socket's accept4.
Ref<Inode> TCPSocketNode::accept4(ioctx_t* ctx, uint8_t* addr, size_t* addrsize,
int flags)
{
return socket->accept4(ctx, addr, addrsize, flags);
}
// Forwards to the underlying socket's bind.
int TCPSocketNode::bind(ioctx_t* ctx, const uint8_t* addr, size_t addrsize)
{
return socket->bind(ctx, addr, addrsize);
}
// Forwards to the underlying socket's connect.
int TCPSocketNode::connect(ioctx_t* ctx, const uint8_t* addr, size_t addrsize)
{
return socket->connect(ctx, addr, addrsize);
}
// Forwards to the underlying socket's listen.
int TCPSocketNode::listen(ioctx_t* ctx, int backlog)
{
return socket->listen(ctx, backlog);
}
// Forwards to the underlying socket's recv.
ssize_t TCPSocketNode::recv(ioctx_t* ctx, uint8_t* buf, size_t count, int flags)
{
return socket->recv(ctx, buf, count, flags);
}
// Forwards to the underlying socket's recvmsg.
ssize_t TCPSocketNode::recvmsg(ioctx_t* ctx, struct msghdr* msg, int flags)
{
return socket->recvmsg(ctx, msg, flags);
}
// Forwards to the underlying socket's send.
ssize_t TCPSocketNode::send(ioctx_t* ctx, const uint8_t* buf, size_t count,
int flags)
{
return socket->send(ctx, buf, count, flags);
}
// Forwards to the underlying socket's sendmsg.
ssize_t TCPSocketNode::sendmsg(ioctx_t* ctx, const struct msghdr* msg,
int flags)
{
return socket->sendmsg(ctx, msg, flags);
}
// Forwards to the underlying socket's read.
ssize_t TCPSocketNode::read(ioctx_t* ctx, uint8_t* buf, size_t count)
{
return socket->read(ctx, buf, count);
}
// Forwards to the underlying socket's write.
ssize_t TCPSocketNode::write(ioctx_t* ctx, const uint8_t* buf, size_t count)
{
return socket->write(ctx, buf, count);
}
// Forwards to the underlying socket's poll.
int TCPSocketNode::poll(ioctx_t* ctx, PollNode* node)
{
return socket->poll(ctx, node);
}
// Forwards to the underlying socket's getsockopt.
int TCPSocketNode::getsockopt(ioctx_t* ctx, int level, int option_name,
void* option_value, size_t* option_size_ptr)
{
return socket->getsockopt(ctx, level, option_name, option_value,
option_size_ptr);
}
// Forwards to the underlying socket's setsockopt.
int TCPSocketNode::setsockopt(ioctx_t* ctx, int level, int option_name,
const void* option_value, size_t option_size)
{
return socket->setsockopt(ctx, level, option_name, option_value,
option_size);
}
// Forwards to the underlying socket's shutdown.
int TCPSocketNode::shutdown(ioctx_t* ctx, int how)
{
return socket->shutdown(ctx, how);
}
// Forwards to the underlying socket's getpeername.
int TCPSocketNode::getpeername(ioctx_t* ctx, uint8_t* addr, size_t* addrsize)
{
return socket->getpeername(ctx, addr, addrsize);
}
// Forwards to the underlying socket's getsockname.
int TCPSocketNode::getsockname(ioctx_t* ctx, uint8_t* addr, size_t* addrsize)
{
return socket->getsockname(ctx, addr, addrsize);
}
// Handles a TCP segment received inside an IPv4 datagram.
//
// pkt is the received packet, where the TCP segment spans from pkt->offset to
// pkt->length. src and dst are the IPv4 source and destination addresses of
// the datagram, and dst_broadcast is whether the datagram was sent to a
// broadcast address.
//
// The segment is silently dropped if it comes from the any address, was
// broadcast, is too short or too long, fails the checksum, has an invalid data
// offset, uses port 0, or if no suitable socket is bound to the destination
// port. Otherwise the packet is passed to the best matching socket, which is
// afterwards deleted if it reports that it can be destroyed.
void HandleIP(Ref<Packet> pkt,
              const struct in_addr* src,
              const struct in_addr* dst,
              bool dst_broadcast)
{
	// Drop segments claiming to come from the any address.
	if ( src->s_addr == htobe32(INADDR_ANY) )
		return;
	// Drop segments that were sent to a broadcast address.
	if ( dst_broadcast )
		return;
	const unsigned char* in = pkt->from + pkt->offset;
	size_t inlen = pkt->length - pkt->offset;
	struct tcphdr hdr;
	// The segment must contain at least the fixed header, and its length must
	// fit in the 16-bit length word of the checksum pseudo-header below.
	if ( inlen < sizeof(hdr) )
		return;
	if ( UINT16_MAX < inlen )
		return;
	// Copy the header out of the packet and convert the fields used below to
	// host byte order.
	memcpy(&hdr, in, sizeof(hdr));
	hdr.th_sport = be16toh(hdr.th_sport);
	hdr.th_dport = be16toh(hdr.th_dport);
	hdr.th_sum = be16toh(hdr.th_sum);
	// Verify the checksum over the pseudo-header (source address, destination
	// address, protocol, and segment length) followed by the whole segment.
	uint16_t sum = 0;
	sum = IP::ipsum_buf(sum, src, sizeof(struct in_addr));
	sum = IP::ipsum_buf(sum, dst, sizeof(struct in_addr));
	sum = IP::ipsum_word(sum, IPPROTO_TCP);
	sum = IP::ipsum_word(sum, inlen);
	sum = IP::ipsum_buf(sum, in, inlen);
	if ( sum != 0 && sum != 0xFFFF )
		return;
	// The data offset counts 32-bit words. It must cover at least the fixed
	// header and must not extend beyond the end of the segment.
	if ( TCP_OFFSET_DECODE(hdr.th_offset) < sizeof(hdr) / 4 ||
	     inlen < (size_t) TCP_OFFSET_DECODE(hdr.th_offset) * 4 )
		return;
	// Port 0 is not valid.
	if ( hdr.th_sport == 0 || hdr.th_dport == 0 )
		return;
	// TODO: TCP options. Respect TCPOPT_MAXSEG.
	TCPSocket* socket = NULL;
	TCPSocket* socket_listener = NULL;
	TCPSocket* any_socket_listener = NULL;
	// Take tcp_lock while looking up and operating on the socket.
	ScopedLock lock(&tcp_lock);
	// Search the sockets bound to the destination port (in host byte order).
	// The search stops at the first exact match, while a later listener in the
	// chain replaces an earlier listener of the same priority.
	for ( TCPSocket* iter = bindings_v4[hdr.th_dport];
	      !socket && iter;
	      iter = iter->next_socket )
	{
		// TODO: If a TCP socket is bound, and then connected to, what happens?
		//       What if the TCP socket then connects to the other side?
		if ( !iter->remoted )
			continue;
		// The datagram was sent to the socket's local address.
		if ( !memcmp(&iter->local.in.sin_addr, dst, sizeof(*dst)) )
		{
			// The first priority is to receive on a socket with the correct
			// local address and the correct remote address.
			if ( !memcmp(&iter->remote.in.sin_addr, src, sizeof(*src)) &&
			     be16toh(iter->remote.in.sin_port) == hdr.th_sport )
				socket = iter;
			// The second priority is to receive on a socket with the correct
			// local address and listening for connections from any address.
			else if ( iter->remote.in.sin_addr.s_addr == htobe32(INADDR_ANY) )
				socket_listener = iter;
		}
		// The socket is bound to the any address.
		if ( iter->local.in.sin_addr.s_addr == htobe32(INADDR_ANY) )
		{
			// The third priority is to receive on a socket bound to the any
			// address and listening for connections from any address.
			if ( iter->remote.in.sin_addr.s_addr == htobe32(INADDR_ANY) )
				any_socket_listener = iter;
		}
	}
	// Fall back to the lower priority candidates if there was no exact match.
	if ( !socket )
		socket = socket_listener;
	if ( !socket )
		socket = any_socket_listener;
	// No socket wanted to receive the packet.
	if ( !socket )
	{
		// TODO: Send RST.
		return;
	}
	// If the socket is bound to a network interface, require the packet to
	// have been received on that network interface.
	if ( socket->ifindex && socket->ifindex != pkt->netif->ifinfo.linkid )
	{
		// TODO: Send RST.
		return;
	}
	// Describe the endpoints of the segment as socket addresses with the port
	// numbers back in network byte order. Zero each union first so sin_zero and
	// any other unset bytes don't carry stale stack contents along with the
	// address when it is passed on.
	union tcp_sockaddr pkt_src;
	memset(&pkt_src, 0, sizeof(pkt_src));
	pkt_src.in.sin_family = AF_INET;
	pkt_src.in.sin_addr = *src;
	pkt_src.in.sin_port = htobe16(hdr.th_sport);
	union tcp_sockaddr pkt_dst;
	memset(&pkt_dst, 0, sizeof(pkt_dst));
	pkt_dst.in.sin_family = AF_INET;
	pkt_dst.in.sin_addr = *dst;
	pkt_dst.in.sin_port = htobe16(hdr.th_dport);
	// Receive the packet on the socket.
	socket->ReceivePacket(pkt, &pkt_src, &pkt_dst);
	// Delete the socket if it now reports that it can be destroyed.
	if ( socket->can_destroy() )
		delete socket;
}
// Creates a new TCP socket for the address family af and wraps it in an inode.
//
// Returns a null reference with errno set to EAFNOSUPPORT if the address family
// is not supported, and a null reference if either allocation fails.
Ref<Inode> Socket(int af)
{
	if ( !IsSupportedAddressFamily(af) )
	{
		errno = EAFNOSUPPORT;
		return Ref<Inode>(NULL);
	}
	TCPSocket* tcp_socket = new TCPSocket(af);
	if ( !tcp_socket )
		return Ref<Inode>();
	Ref<TCPSocketNode> node(new TCPSocketNode(tcp_socket));
	if ( !node )
	{
		// No node took the socket, so delete it here.
		delete tcp_socket;
		return Ref<Inode>();
	}
	return node;
}
} // namespace TCP
} // namespace Sortix