sortix-mirror/kernel/net/tcp.cpp

2571 lines
71 KiB
C++
Raw Normal View History

Add networking stack. This change adds all the kernel parts of a network stack. The network stack is partial but implements many of the important parts. Add if(4) network interface abstraction. Network interfaces are registered in a global list that can be iterated and each assigned an unique integer identifier. Add reference counted packets with a cache that recycles recent packets. Add support for lo(4) loopback and ether(4) ethernet network interfaces. The /dev/lo0 loopback device is created automatically on boot. Add arp(4) address resolution protocol driver for translation of inet(4) network layer addresses into ether(4) link layer addresses. arp(4) entries are cached and evicted from the cache when needed or when the entry has not been used for a while. The cache is limited to 256 entries for now. Add ip(4) internet protocol version 4 support. IP fragmentation and options are not implemented yet. Add tcp(4) transmission control protocol sockets for a reliable transport layer protocol that provides a reliable byte stream connection between two hosts. The implementation is incomplete and does not yet implement out of band data, options, and high performance extensions. Add udp(4) user datagram protocol sockets for a connectionless transport layer that provides best-effort delivery of datagrams. Add ping(4) sockets for a best-effort delivery of echo datagrams. Change type of sa_family_t from unsigned short to uint16_t. Add --disable-network-drivers to the kernel(7) options and expose it with a bootloader menu. tix-iso-bootconfig can set this option by default. Import CRC32 code from libz for the Ethernet checksum. This is a compatible ABI change that adds features to socket(2) (AF_INET, IPPROTO_TCP, IPPROTO_UDP, IPPROTO_PING), the ioctls for if(4), socket options, and the lo0 loopback interface. This commit is based on work by Meisaka Yukara contributed as the commit bbf7f1e8a5238a2bd1fe8eb1d2cc5c9c2421e2c4. 
Almost no lines of this work remain in this final commit as it has been rewritten or refactored away over the years, see the individual file headers for which files contain remnants of this work. Co-authored-by: Meisaka Yukara <Meisaka.Yukara@gmail.com>
2022-12-04 23:35:21 +00:00
/*
* Copyright (c) 2016, 2017, 2018, 2022 Jonas 'Sortie' Termansen.
*
* Permission to use, copy, modify, and distribute this software for any
* purpose with or without fee is hereby granted, provided that the above
* copyright notice and this permission notice appear in all copies.
*
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
*
* net/tcp.cpp
* Transmission Control Protocol.
*/
// TODO: Implement sending back RST and such.
// TODO: PUSH.
// TODO: URG.
// TODO: Nagle's algorithm, MSG_MORE, TCP_CORK, TCP_NODELAY, etc.
// TODO: TCP options.
// TODO: Maximum Segment Size (respect TCP_MSS).
// TODO: Efficient receive queue when out of order.
// TODO: Efficient backlog / half-open. Avoid denial of service attacks.
// TODO: Measure average round trip time for efficient retransmission?
// TODO: High speed extensions.
// TODO: Anti-congestion extensions.
// TODO: Selective acknowledgements.
// TODO: Implement all RFC 1122 TCP requirements.
// TODO: Probing Zero Windows per RFC 1122 4.2.2.17.
// TODO: os-test all the things.
#include <sys/socket.h>
#include <sys/stat.h>
#include <assert.h>
#include <endian.h>
#include <errno.h>
#include <fcntl.h>
#include <limits.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <poll.h>
#include <signal.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <timespec.h>
#ifndef IOV_MAX
#include <sortix/limits.h>
#endif
#include <sortix/kernel/clock.h>
#include <sortix/kernel/copy.h>
#include <sortix/kernel/if.h>
#include <sortix/kernel/inode.h>
#include <sortix/kernel/ioctx.h>
#include <sortix/kernel/kernel.h>
#include <sortix/kernel/kthread.h>
#include <sortix/kernel/packet.h>
#include <sortix/kernel/poll.h>
#include <sortix/kernel/process.h>
#include <sortix/kernel/sockopt.h>
#include <sortix/kernel/thread.h>
#include <sortix/kernel/time.h>
#include <sortix/kernel/timer.h>
#include <sortix/kernel/worker.h>
#include "ip.h"
#include "tcp.h"
#define BUFFER_SIZE 65536 // Documented in tcp(4).
#define NUM_RETRANSMISSIONS 6 // Documented in tcp(4)
namespace Sortix {
namespace TCP {
class TCPSocket;
// A local or remote TCP socket name. The family member aliases the
// sin_family / sin6_family field of the two per-family variants, so it can
// be used to discriminate which variant is stored.
union tcp_sockaddr
{
	sa_family_t family;
	struct sockaddr_in in;
	struct sockaddr_in6 in6;
};
// The TCP states per STD 7 (RFC 793).
enum tcp_state
{
	TCP_STATE_CLOSED,     // No connection (also the initial state).
	TCP_STATE_LISTEN,     // Waiting for a connection request from a remote.
	TCP_STATE_SYN_SENT,   // SYN sent, awaiting a matching SYN from the remote.
	TCP_STATE_SYN_RECV,   // SYN received and answered, awaiting the final ACK.
	TCP_STATE_ESTAB,      // Open connection, data flows in both directions.
	TCP_STATE_FIN_WAIT_1, // FIN sent, awaiting its ACK or the remote's FIN.
	TCP_STATE_FIN_WAIT_2, // Own FIN acknowledged, awaiting the remote's FIN.
	TCP_STATE_CLOSE_WAIT, // Remote's FIN received, awaiting local close.
	TCP_STATE_CLOSING,    // Both FINs in flight, awaiting ACK of own FIN.
	TCP_STATE_LAST_ACK,   // FIN sent after remote's FIN, awaiting its ACK.
	TCP_STATE_TIME_WAIT,  // Lingering to absorb delayed duplicate segments.
};
// The lifecycle of an outgoing SYN or FIN, each of which occupies a single
// sequence number in the send window (see Transmit below).
enum tcp_special
{
	TCP_SPECIAL_NOT,     // Not currently requested.
	TCP_SPECIAL_PENDING, // Requested, but not yet inside the send window.
	TCP_SPECIAL_WINDOW,  // Inside the send window, being (re)transmitted.
	TCP_SPECIAL_ACKED,   // Presumably acknowledged by the remote — the
	                     // transition is set outside this chunk; confirm in
	                     // the packet processing code.
};
// Global lock protecting all TCP sockets as they need to access each other.
static kthread_mutex_t tcp_lock = KTHREAD_MUTEX_INITIALIZER;
// Heads of the per-port doubly-linked lists of bound IPv4 sockets,
// indexed by port number (65536 entries, allocated in Init).
static TCPSocket** bindings_v4;
// Heads of the per-port doubly-linked lists of bound IPv6 sockets.
static TCPSocket** bindings_v6;
// Allocate the per-port binding tables at boot time. The trailing () value-
// initializes every entry to a null pointer, so the previous manual clearing
// loop is unnecessary. Panics on allocation failure since TCP cannot work
// without the tables.
void Init()
{
	if ( !(bindings_v4 = new TCPSocket*[65536]()) ||
	     !(bindings_v6 = new TCPSocket*[65536]()) )
		Panic("Failed to allocate TCP Socket bindings");
}
// Whether sequence number a is at or before b in modulo 2^32 arithmetic,
// i.e. the signed distance from b to a is not positive.
static inline bool mod32_le(tcp_seq a, tcp_seq b)
{
	int32_t distance = (int32_t) (a - b);
	return distance <= 0;
}
// Whether sequence number a is strictly before b in modulo 2^32 arithmetic.
static inline bool mod32_lt(tcp_seq a, tcp_seq b)
{
	int32_t distance = (int32_t) (a - b);
	return distance < 0;
}
// Whether sequence number a is at or after b in modulo 2^32 arithmetic.
static inline bool mod32_ge(tcp_seq a, tcp_seq b)
{
	int32_t distance = (int32_t) (a - b);
	return distance >= 0;
}
// Whether sequence number a is strictly after b in modulo 2^32 arithmetic.
static inline bool mod32_gt(tcp_seq a, tcp_seq b)
{
	int32_t distance = (int32_t) (a - b);
	return distance > 0;
}
// Whether sockets of the given address family can be created. Only IPv4 is
// supported until the IPv6 implementation is finished.
static bool IsSupportedAddressFamily(int af)
{
	switch ( af )
	{
	case AF_INET: return true;
	// TODO: case AF_INET6: return true;
	default: return false;
	}
}
// The size of the socket address structure of the given address family, or
// 0 if the address family is unknown.
static size_t AddressFamilySize(int af)
{
	if ( af == AF_INET )
		return sizeof(struct sockaddr_in);
	if ( af == AF_INET6 )
		return sizeof(struct sockaddr_in6);
	return 0;
}
// The TCP socket implementation. It is separate from the class TCPSocketNode
// as that class is reference counted, but this class manages its own lifetime
// so the socket is properly shut down after all references are closed.
//
// Bound sockets are in a double linked list starting from the appropriate
// bindings array indexed by the port, and then the sockets on that port are
// doubly linked using prev_socket and next_socket.
//
// Half-open sockets are in a doubly linked list starting from connecting_half
// in the listening socket, and then doubly linked with connecting_prev and
// connecting_next (with connecting_parent going back to the listening socket).
//
// Ready sockets that have not yet been accepted are in a doubly linked list
// starting from connecting_ready in the listening socket, and then doubly
// linked with connecting_prev and connecting_next (with connecting_parent going
// back to the listening socket).
//
// A socket wants to be deleted when it's in the CLOSED state and is not
// referenced by its TCPSocketNode anymore. Deletion is possible when the timer
// is not pending.
class TCPSocket
{
	// HandleIP delivers incoming TCP/IP packets directly to the socket.
	friend void HandleIP(Ref<Packet> pkt,
	                     const struct in_addr* src,
	                     const struct in_addr* dst,
	                     bool dst_broadcast);

public:
	TCPSocket(int af);
	~TCPSocket();

	// The socket operations exposed to userspace through TCPSocketNode.
	Ref<Inode> accept4(ioctx_t* ctx, uint8_t* addr, size_t* addrsize,
	                   int flags);
	int bind(ioctx_t* ctx, const uint8_t* addr, size_t addrsize);
	int connect(ioctx_t* ctx, const uint8_t* addr, size_t addrsize);
	int listen(ioctx_t* ctx, int backlog);
	ssize_t recv(ioctx_t* ctx, uint8_t* buf, size_t count, int flags);
	ssize_t send(ioctx_t* ctx, const uint8_t* buf, size_t count, int flags);
	ssize_t sendmsg(ioctx_t* ctx, const struct msghdr* msg_ptr, int flags);
	ssize_t read(ioctx_t* ctx, uint8_t* buf, size_t count);
	ssize_t recvmsg(ioctx_t* ctx, struct msghdr* msg, int flags);
	ssize_t write(ioctx_t* ctx, const uint8_t* buf, size_t count);
	int poll(ioctx_t* ctx, PollNode* node);
	int getsockopt(ioctx_t* ctx, int level, int option_name, void* option_value,
	               size_t* option_size_ptr);
	int setsockopt(ioctx_t* ctx, int level, int option_name,
	               const void* option_value, size_t option_size);
	int shutdown(ioctx_t* ctx, int how);
	int getpeername(ioctx_t* ctx, uint8_t* addr, size_t* addrsize);
	int getsockname(ioctx_t* ctx, uint8_t* addr, size_t* addrsize);

public:
	// Called when the owning TCPSocketNode loses its last reference.
	void Unreference();
	// Packet ingestion from the network layer (definitions not in view).
	void ProcessPacket(Ref<Packet> pkt, union tcp_sockaddr* pkt_src,
	                   union tcp_sockaddr* pkt_dst);
	void ReceivePacket(Ref<Packet> pkt, union tcp_sockaddr* pkt_src,
	                   union tcp_sockaddr* pkt_dst);
	// Timer callback handling retransmission, timeouts, scheduled transmits,
	// and final self-destruction.
	void OnTimer();
	// Whether the socket should be deleted once it safely can be.
	inline bool want_destruction()
	{
		return state == TCP_STATE_CLOSED && !is_referenced;
	}
	// Whether the socket can be deleted right now: destruction is wanted and
	// no armed timer may still fire with a pointer to this socket.
	inline bool can_destroy()
	{
		return want_destruction() && !timer_armed;
	}

private:
	short PollEventStatus();
	bool ImportAddress(ioctx_t* ctx, union tcp_sockaddr* dest, const void* addr,
	                   size_t addrsize);
	bool CanBind(union tcp_sockaddr new_local);
	bool BindDefault(const union tcp_sockaddr* new_local_ptr);
	void UpdateWindow(uint16_t new_window);
	void TransmitLoop();
	bool Transmit();
	void ScheduleTransmit();
	void SetDeadline();
	void SetTimer();
	void Close();
	void Destroy();
	void Disconnect();
	void Fail(int error);
	ssize_t recv_unlocked(ioctx_t* ctx, uint8_t* buf, size_t count, int flags);
	ssize_t send_unlocked(ioctx_t* ctx, const uint8_t* buf, size_t count,
	                      int flags);
	int shutdown_unlocked(int how);

public:
	// The previous socket bound on the same port in the address family.
	TCPSocket* prev_socket;
	// The next socket bound on the same port in the address family.
	TCPSocket* next_socket;
	// The first half-connected socket in our listening queue.
	TCPSocket* connecting_half;
	// The first ready socket in our listening queue.
	TCPSocket* connecting_ready;
	// The previous half-connected or ready socket in our listening queue.
	TCPSocket* connecting_prev;
	// The next half-connected or ready socket in our listening queue.
	TCPSocket* connecting_next;
	// The listening socket this socket is in the listening queue for.
	TCPSocket* connecting_parent;
	// Condition variable that is signaled when new data can be received.
	kthread_cond_t receive_cond;
	// Condition variable that is signaled when new data can be transmitted.
	kthread_cond_t transmit_cond;
	// The local socket name, or the any address port 0 if not set.
	union tcp_sockaddr local;
	// The remote socket name, or the any address port 0 if not set.
	union tcp_sockaddr remote;
	// The network interface the socket is bound to, or 0 if none.
	unsigned int ifindex;
	// Whether the socket has been bound to a port.
	bool bound;
	// Whether the socket is receiving datagrams.
	bool remoted;
	// Whether SO_REUSEADDR is set.
	bool reuseaddr;
	// Whether the socket is referenced from anywhere and must not deallocate.
	bool is_referenced;

private:
	// The timer used for retransmissions and timing out the connection.
	Timer timer;
	// The poll channel to publish poll bit changes on.
	PollChannel poll_channel;
	// The queue of incoming packets whose sequence numbers are too high to
	// process right now, sorted by increasing sequence number.
	Ref<Packet> receive_queue; // TODO: Not a good way to keep track of this.
	// The deadline for the remote to acknowledge before retransmitting.
	// tv_sec of -1 means no deadline is pending.
	struct timespec deadline;
	// The offset at which data begins in the incoming ring buffer.
	size_t incoming_offset;
	// The amount of bytes in the incoming ring buffer.
	size_t incoming_used;
	// The offset at which data begins in the outgoing ring buffer.
	size_t outgoing_offset;
	// The amount of bytes in the outgoing ring buffer.
	size_t outgoing_used;
	// Send unacknowledged (STD 7, RFC 793).
	tcp_seq send_una;
	// Send next (STD 7, RFC 793).
	tcp_seq send_nxt;
	// Send window (STD 7, RFC 793).
	tcp_seq send_wnd;
	// Send urgent pointer (STD 7, RFC 793).
	tcp_seq send_up;
	// Segment sequence number used for last window update (STD 7, RFC 793).
	tcp_seq send_wl1;
	// Segment acknowledgment number used for last window update (STD 7, RFC
	// 793).
	tcp_seq send_wl2;
	// Next sequence to send (STD 7, RFC 793).
	tcp_seq send_pos;
	// Initial send sequence number (STD 7, RFC 793).
	tcp_seq iss;
	// Receive next (STD 7, RFC 793).
	tcp_seq recv_nxt;
	// Receive window (STD 7, RFC 793).
	tcp_seq recv_wnd;
	// Receive urgent pointer (STD 7, RFC 793).
	tcp_seq recv_up;
	// Last sequence acked (STD 7, RFC 793).
	tcp_seq recv_acked;
	// Last window size advertised (STD 7, RFC 793).
	tcp_seq recv_wndlast;
	// Initial receive sequence number (STD 7, RFC 793).
	tcp_seq irs;
	// The address family to which this socket belongs.
	int af;
	// Set to an errno value if a socket error has occured, or 0 otherwise.
	int sockerr;
	// The number of sockets in the listening queue.
	int backlog_used;
	// The maximum number of sockets in the listening queue.
	int backlog_max;
	// The number of retransmissions that have occured since the last
	// acknowledgement from the remote socket.
	unsigned int retransmissions;
	// The current TCP state.
	enum tcp_state state;
	// The state of the outgoing SYN.
	enum tcp_special outgoing_syn;
	// The state of the outgoing FIN.
	enum tcp_special outgoing_fin;
	// Whether SYN has been received from the remote socket.
	bool has_syn;
	// Whether FIN has been received from the remote socket.
	bool has_fin;
	// Whether a transmission has been scheduled.
	bool transmit_scheduled;
	// Whether the timer is pending.
	bool timer_armed;
	// Whether the socket has been shut down for receive.
	bool shutdown_receive;
	// The incoming ring buffer.
	unsigned char incoming[BUFFER_SIZE];
	// The outgoing ring buffer.
	unsigned char outgoing[BUFFER_SIZE];
};
// The TCP socket Inode with a reference counted lifetime. The backend class
// TCPSocket is separate as it may stay alive for a little while after all
// references to it has been lost.
class TCPSocketNode : public AbstractInode
{
public:
	TCPSocketNode(TCPSocket* socket);
	virtual ~TCPSocketNode();

	// Each operation presumably forwards to the method of the same name on
	// the backing TCPSocket (definitions are outside this chunk — confirm).
	virtual Ref<Inode> accept4(ioctx_t* ctx, uint8_t* addr, size_t* addrsize,
	                           int flags);
	virtual int bind(ioctx_t* ctx, const uint8_t* addr, size_t addrsize);
	virtual int connect(ioctx_t* ctx, const uint8_t* addr, size_t addrsize);
	virtual int listen(ioctx_t* ctx, int backlog);
	virtual ssize_t recv(ioctx_t* ctx, uint8_t* buf, size_t count, int flags);
	virtual ssize_t recvmsg(ioctx_t* ctx, struct msghdr* msg, int flags);
	virtual ssize_t send(ioctx_t* ctx, const uint8_t* buf, size_t count,
	                     int flags);
	virtual ssize_t sendmsg(ioctx_t* ctx, const struct msghdr* msg_ptr,
	                        int flags);
	virtual ssize_t read(ioctx_t* ctx, uint8_t* buf, size_t count);
	virtual ssize_t write(ioctx_t* ctx, const uint8_t* buf, size_t count);
	virtual int poll(ioctx_t* ctx, PollNode* node);
	virtual int getsockopt(ioctx_t* ctx, int level, int option_name,
	                       void* option_value, size_t* option_size_ptr);
	virtual int setsockopt(ioctx_t* ctx, int level, int option_name,
	                       const void* option_value, size_t option_size);
	virtual int shutdown(ioctx_t* ctx, int how);
	virtual int getpeername(ioctx_t* ctx, uint8_t* addr, size_t* addrsize);
	virtual int getsockname(ioctx_t* ctx, uint8_t* addr, size_t* addrsize);

private:
	// The backing socket, which manages its own lifetime (see the comment
	// above class TCPSocket).
	TCPSocket* socket;
};
// Trampoline from the C-style timer callback to the socket's timer handler.
void TCPSocket__OnTimer(Clock* /*clock*/, Timer* /*timer*/, void* user)
{
	TCPSocket* socket = static_cast<TCPSocket*>(user);
	socket->OnTimer();
}
// Construct a fresh, unbound, unconnected socket in the CLOSED state for
// the given address family.
TCPSocket::TCPSocket(int af)
{
	prev_socket = NULL;
	next_socket = NULL;
	connecting_half = NULL;
	connecting_ready = NULL;
	connecting_prev = NULL;
	connecting_next = NULL;
	connecting_parent = NULL;
	receive_cond = KTHREAD_COND_INITIALIZER;
	transmit_cond = KTHREAD_COND_INITIALIZER;
	memset(&local, 0, sizeof(local));
	memset(&remote, 0, sizeof(remote));
	ifindex = 0;
	bound = false;
	remoted = false;
	reuseaddr = false;
	// timer is initialized by its constructor.
	timer.Attach(Time::GetClock(CLOCK_MONOTONIC));
	// poll_channel is initialized by its constructor.
	// receive_queue is initialized by its constructor.
	// A tv_sec of -1 means no deadline is pending.
	deadline = timespec_make(-1, 0);
	incoming_offset = 0;
	incoming_used = 0;
	outgoing_offset = 0;
	outgoing_used = 0;
	send_una = 0;
	send_nxt = 0;
	send_wnd = 0;
	send_up = 0;
	send_wl1 = 0;
	send_wl2 = 0;
	send_pos = 0;
	iss = 0;
	recv_nxt = 0;
	recv_wnd = 0;
	recv_up = 0;
	recv_acked = 0;
	recv_wndlast = 0;
	irs = 0;
	this->af = af;
	sockerr = 0;
	backlog_used = 0;
	backlog_max = 0;
	retransmissions = 0;
	state = TCP_STATE_CLOSED;
	outgoing_syn = TCP_SPECIAL_NOT;
	outgoing_fin = TCP_SPECIAL_NOT;
	has_syn = false;
	has_fin = false;
	transmit_scheduled = false;
	is_referenced = false;
	timer_armed = false;
	shutdown_receive = false;
	memset(incoming, 0, sizeof(incoming));
	memset(outgoing, 0, sizeof(outgoing));
}
// The socket must be fully closed, unbound, unreferenced, and unlinked from
// every list before it is destroyed. (A duplicated assert(!connecting_half)
// has been removed.)
TCPSocket::~TCPSocket()
{
	assert(state == TCP_STATE_CLOSED);
	assert(!bound);
	assert(!prev_socket);
	assert(!next_socket);
	assert(!connecting_half);
	assert(!connecting_ready);
	assert(!connecting_prev);
	assert(!connecting_next);
	assert(!connecting_parent);
	assert(!is_referenced);
	// Avoid stack overflow in receive_queue recursive destructor.
	while ( receive_queue )
	{
		Ref<Packet> packet = receive_queue;
		receive_queue = packet->next;
		packet->next.Reset();
	}
}
// Called when the owning TCPSocketNode is destroyed: begin disconnecting
// the connection, and delete the socket now if nothing keeps it alive.
void TCPSocket::Unreference()
{
	kthread_mutex_lock(&tcp_lock);
	is_referenced = false;
	Disconnect();
	bool do_delete = can_destroy();
	kthread_mutex_unlock(&tcp_lock);
	// can_destroy() guarantees no armed timer can still fire with a pointer
	// to this socket; the deletion happens after the lock is released.
	if ( do_delete )
		delete this;
}
// Fully close the socket: cancel any pending timer, unbind and unlink it,
// enter the CLOSED state, and wake every waiting thread.
void TCPSocket::Close() // tcp_lock taken
{
	// Disarm the timer unless its callback is already running.
	if ( timer_armed && timer.TryCancel() )
		timer_armed = false;
	// Unbind the port and leave all listening queues.
	Destroy();
	state = TCP_STATE_CLOSED;
	// Wake all blocked threads so they observe the closed state.
	kthread_cond_broadcast(&transmit_cond);
	kthread_cond_broadcast(&receive_cond);
	deadline = timespec_make(-1, 0);
	// SetTimer retries cancelling any timer that could not be cancelled
	// above, now that the state is CLOSED.
	SetTimer();
}
// Begin tearing down the connection appropriately for the current state;
// called when the socket loses its last reference or its listening parent
// is destroyed.
void TCPSocket::Disconnect() // tcp_lock taken
{
	if ( state == TCP_STATE_LISTEN )
		Close();
	else if ( state != TCP_STATE_CLOSED )
	{
		// TODO: Send back RST if the peer sends when we're not receiving.
		// Open connections begin a graceful close by sending FIN.
		if ( state == TCP_STATE_SYN_SENT ||
		     state == TCP_STATE_SYN_RECV ||
		     state == TCP_STATE_ESTAB ||
		     state == TCP_STATE_CLOSE_WAIT )
		{
			shutdown_unlocked(SHUT_WR);
			// CLOSED, LAST_ACK, FIN_WAIT_1.
		}
		// FIN_WAIT_1 will enter FIN_WAIT_2 or time out.
		// FIN_WAIT_2 will time out when unreferenced.
		// CLOSING will resend FIN or time out.
		// LAST_ACK will resend FIN or time out.
		// TIME_WAIT will time out and close.
		// Without a reference, FIN_WAIT_2 must be given a timeout rather
		// than linger forever waiting for the remote's FIN.
		if ( state == TCP_STATE_FIN_WAIT_2 && !is_referenced )
		{
			deadline = timespec_make(-1, 0);
			SetDeadline();
			SetTimer();
		}
	}
}
// Abort the connection, recording the error for subsequent operations to
// report.
void TCPSocket::Fail(int error)
{
	sockerr = error;
	Close();
}
// Unlink the socket from its port's binding list and from any listening
// queues, disconnecting every connection it was listening for.
void TCPSocket::Destroy() // tcp_lock taken
{
	// Unlink from the doubly-linked list of sockets bound to the same port.
	if ( bound )
	{
		if ( af == AF_INET )
		{
			uint16_t port = be16toh(local.in.sin_port);
			if ( prev_socket )
				prev_socket->next_socket = next_socket;
			else
				bindings_v4[port] = next_socket;
			if ( next_socket )
				next_socket->prev_socket = prev_socket;
		}
		else if ( af == AF_INET6 )
		{
			uint16_t port = be16toh(local.in6.sin6_port);
			if ( prev_socket )
				prev_socket->next_socket = next_socket;
			else
				bindings_v6[port] = next_socket;
			if ( next_socket )
				next_socket->prev_socket = prev_socket;
		}
		prev_socket = NULL;
		next_socket = NULL;
		bound = false;
	}
	// Pop and disconnect every half-open and ready connection in this
	// socket's listening queues.
	while ( connecting_half || connecting_ready )
	{
		TCPSocket* socket;
		if ( connecting_half )
		{
			socket = connecting_half;
			connecting_half = socket->connecting_next;
			if ( connecting_half )
				connecting_half->connecting_prev = NULL;
		}
		else
		{
			socket = connecting_ready;
			connecting_ready = socket->connecting_next;
			if ( connecting_ready )
				connecting_ready->connecting_prev = NULL;
		}
		socket->connecting_prev = NULL;
		socket->connecting_next = NULL;
		socket->connecting_parent = NULL;
		backlog_used--;
		socket->Disconnect();
	}
	// If this socket is itself in a listening socket's queue, unlink it.
	// SYN_RECV sockets live in the half-open list, others in the ready list.
	if ( connecting_parent )
	{
		if ( connecting_prev )
			connecting_prev->connecting_next = connecting_next;
		else if ( state == TCP_STATE_SYN_RECV )
			connecting_parent->connecting_half = connecting_next;
		else
			connecting_parent->connecting_ready = connecting_next;
		if ( connecting_next )
			connecting_next->connecting_prev = connecting_prev;
		connecting_prev = NULL;
		connecting_next = NULL;
		connecting_parent->backlog_used--;
		connecting_parent = NULL;
	}
}
// Accept a ready connection from the listening queue, optionally reporting
// the remote's address, blocking unless O_NONBLOCK is set.
Ref<Inode> TCPSocket::accept4(ioctx_t* ctx, uint8_t* addr, size_t* addrsize_ptr,
                              int flags)
{
	// No accept4 flags are supported at this time.
	if ( flags & ~(0) )
		return errno = EINVAL, Ref<Inode>(NULL);
	if ( addr && !addrsize_ptr )
		return errno = EINVAL, Ref<Inode>(NULL);
	ScopedLock lock(&tcp_lock);
	if ( state != TCP_STATE_LISTEN )
		return errno = EINVAL, Ref<Inode>(NULL);
	// Wait until a fully established connection is ready to be accepted.
	while ( !connecting_ready )
	{
		if ( ctx->dflags & O_NONBLOCK )
			return errno = EWOULDBLOCK, Ref<Inode>(NULL);
		if ( !kthread_cond_wait_signal(&receive_cond, &tcp_lock) )
			return errno = EINTR, Ref<Inode>(NULL);
	}
	TCPSocket* socket = connecting_ready;
	// Report the remote address, truncated to the caller's buffer size.
	if ( addr )
	{
		size_t addrsize;
		if ( !ctx->copy_from_src(&addrsize, addrsize_ptr, sizeof(addrsize)) )
			return Ref<Inode>(NULL);
		size_t af_addrsize = AddressFamilySize(af);
		if ( af_addrsize < addrsize )
			addrsize = af_addrsize;
		if ( !ctx->copy_to_dest(addr, &socket->remote, addrsize) )
			return Ref<Inode>(NULL);
		if ( !ctx->copy_to_dest(addrsize_ptr, &addrsize, sizeof(addrsize)) )
			return Ref<Inode>(NULL);
	}
	// Reference the socket through a new inode before unlinking it from the
	// ready queue, so a failed allocation leaves the queue intact.
	Ref<TCPSocketNode> result(new TCPSocketNode(socket));
	if ( !result )
		return Ref<Inode>(NULL);
	// Unlink the accepted socket from the head of the ready queue.
	connecting_ready = socket->connecting_next;
	if ( connecting_ready )
		connecting_ready->connecting_prev = NULL;
	socket->connecting_prev = NULL;
	socket->connecting_next = NULL;
	socket->connecting_parent = NULL;
	backlog_used--;
	return result;
}
// Copy a socket address from the user into dest, validating its size and
// address family. Returns false with errno set on failure.
bool TCPSocket::ImportAddress(ioctx_t* ctx,
                              union tcp_sockaddr* dest,
                              const void* addr,
                              size_t addrsize)
{
	// TODO: os-test whether AF_UNSPEC can disconnect.
	// Bug fix: this returned -1, which converts to true in a bool function,
	// so a wrongly sized address was reported as a successful import with
	// dest left uninitialized.
	if ( addrsize != AddressFamilySize(af) )
		return errno = EINVAL, false;
	union tcp_sockaddr copy;
	memset(&copy, 0, sizeof(copy));
	if ( !ctx->copy_from_src(&copy, addr, addrsize) )
		return false;
	if ( copy.family != af )
		return errno = EAFNOSUPPORT, false;
	memcpy(dest, &copy, sizeof(copy));
	return true;
}
// Whether the socket may be bound to the given local address: the address
// must belong to this machine and the port must not conflict with an
// existing binding.
bool TCPSocket::CanBind(union tcp_sockaddr new_local) // tcp_lock taken
{
	if ( af == AF_INET )
	{
		// TODO: os-test binding to broadcast addresses.
		// Bind to either the any address or the address of a network interface.
		if ( new_local.in.sin_addr.s_addr != htobe32(INADDR_ANY) )
		{
			// TODO: What happens to sockets if the network interface changes
			//       its address?
			ScopedLock ifs_lock(&netifs_lock);
			bool found = false;
			for ( unsigned int i = 1; i < netifs_count; i++ )
			{
				NetworkInterface* netif = netifs[i];
				if ( !netif )
					continue;
				ScopedLock cfg_lock(&netif->cfg_lock);
				if ( memcmp(&netif->cfg.inet.address, &new_local.in.sin_addr,
				            sizeof(struct in_addr)) == 0 )
				{
					found = true;
					break;
				}
			}
			// No interface had the correct address.
			if ( !found )
				return errno = EADDRNOTAVAIL, false;
		}
		// Conflict if either address is the any address (unless both sockets
		// set SO_REUSEADDR) or the addresses are identical.
		uint16_t port = be16toh(new_local.in.sin_port);
		for ( TCPSocket* socket = bindings_v4[port];
		      socket;
		      socket = socket->next_socket )
		{
			// TODO: os-test how SO_REUSEADDR works for TCP.
			if ( new_local.in.sin_addr.s_addr == htobe32(INADDR_ANY) &&
			     !(reuseaddr && socket->reuseaddr) )
				return errno = EADDRINUSE, false;
			if ( socket->local.in.sin_addr.s_addr == htobe32(INADDR_ANY) &&
			     !(reuseaddr && socket->reuseaddr) )
				return errno = EADDRINUSE, false;
			if ( new_local.in.sin_addr.s_addr ==
			     socket->local.in.sin_addr.s_addr )
				return errno = EADDRINUSE, false;
		}
	}
	else if ( af == AF_INET6 )
	{
		// TODO: IPv6 support for seeing if any interface has the address.
		if ( true )
			return errno = EAFNOSUPPORT, false;
		uint16_t port = be16toh(new_local.in6.sin6_port);
		if ( bindings_v6[port] )
			return errno = EADDRINUSE, false;
		// NOTE(review): Unlike the v4 logic above, these three checks are
		// nested (all must hold before EADDRINUSE is returned). This code is
		// unreachable due to the return above; revisit the nesting when IPv6
		// support is implemented.
		for ( TCPSocket* socket = bindings_v6[port];
		      socket;
		      socket = socket->next_socket )
		{
			if ( !memcmp(&new_local.in6.sin6_addr, &in6addr_any,
			             sizeof(in6addr_any)) &&
			     !(reuseaddr && socket->reuseaddr) )
				if ( !memcmp(&socket->local.in6.sin6_addr, &in6addr_any,
				             sizeof(in6addr_any)) &&
				     !(reuseaddr && socket->reuseaddr) )
					if ( !memcmp(&new_local.in6.sin6_addr, &socket->local.in6.sin6_addr,
					             sizeof(new_local.in6.sin6_addr)) )
						return errno = EADDRINUSE, false;
		}
	}
	else
		return errno = EAFNOSUPPORT, false;
	return true;
}
// Bind the socket to a local address and port, picking an ephemeral port if
// port 0 was requested.
int TCPSocket::bind(ioctx_t* ctx, const uint8_t* addr, size_t addrsize)
{
	ScopedLock lock(&tcp_lock);
	if ( bound )
		return errno = EINVAL, -1;
	union tcp_sockaddr new_local;
	if ( !ImportAddress(ctx, &new_local, addr, addrsize) )
		return -1;
	uint16_t port;
	if ( af == AF_INET )
		port = be16toh(new_local.in.sin_port);
	else if ( af == AF_INET6 )
		port = be16toh(new_local.in6.sin6_port);
	else
		return errno = EAFNOSUPPORT, -1;
	// TODO: Binding to the any address needs to pick the appropriate source
	//       interface and bind to its address. (Or really? udp doesn't?
	//       os-test?)
	// TODO: os-test a server listening on any, and then getsockname a
	//       connection received on that port.
	// Port 0 requests an ephemeral port picked by the system.
	if ( port == 0 )
		return BindDefault(&new_local) ? 0 : -1;
	if ( !CanBind(new_local) )
		return -1;
	// Insert the socket at the head of the port's doubly-linked list.
	if ( af == AF_INET )
	{
		uint16_t port = be16toh(new_local.in.sin_port);
		if ( bindings_v4[port] )
			bindings_v4[port]->prev_socket = this;
		next_socket = bindings_v4[port];
		prev_socket = NULL;
		bindings_v4[port] = this;
	}
	else if ( af == AF_INET6 )
	{
		uint16_t port = be16toh(new_local.in6.sin6_port);
		// NOTE(review): The v6 path refuses a nonempty list rather than
		// linking in like the v4 path above; revisit when IPv6 support is
		// implemented.
		if ( bindings_v6[port] )
			return errno = EADDRINUSE, -1;
		next_socket = bindings_v6[port];
		prev_socket = NULL;
		bindings_v6[port] = this;
	}
	else
		return errno = EAFNOSUPPORT, -1;
	memcpy(&local, &new_local, sizeof(new_local));
	bound = true;
	return 0;
}
// tcp_lock locked
// Bind to an ephemeral port in [32768, 61000), starting the search at a
// random offset, using the given local address (or the any address if none
// was requested).
bool TCPSocket::BindDefault(const union tcp_sockaddr* new_local_ptr)
{
	// TODO: This allocator becomes increasingly biased as more ports are
	//       allocated.
	// TODO: Try not to allocate recently used ports.
	union tcp_sockaddr new_local;
	if ( new_local_ptr )
		memcpy(&new_local, new_local_ptr, sizeof(union tcp_sockaddr));
	else
	{
		// No address requested: default to the any address.
		memset(&new_local, 0, sizeof(new_local));
		if ( af == AF_INET )
		{
			new_local.in.sin_family = AF_INET;
			new_local.in.sin_addr.s_addr = htobe32(INADDR_ANY);
		}
		else if ( af == AF_INET6 )
		{
			new_local.in6.sin6_family = AF_INET6;
			new_local.in6.sin6_addr = in6addr_any;
		}
		else
			return errno = EAFNOSUPPORT, false;
	}
	uint16_t start = 32768; // Documented in tcp(4).
	uint16_t end = 61000; // Documented in tcp(4).
	uint16_t count = end - start;
	// Begin at a random port to spread usage across the ephemeral range.
	uint16_t offset = arc4random_uniform(count);
	for ( uint16_t i = 0; i < count; i++ )
	{
		// Wrap the candidate index around the end of the range.
		uint16_t j = offset + i;
		if ( count <= j )
			j -= count;
		uint16_t port = start + j;
		if ( af == AF_INET )
			new_local.in.sin_port = htobe16(port);
		else if ( af == AF_INET6 )
			new_local.in6.sin6_port = htobe16(port);
		else
			return errno = EAFNOSUPPORT, false;
		if ( !CanBind(new_local) )
		{
			// Only a port collision warrants trying the next port.
			if ( errno == EADDRINUSE )
				continue;
			return false;
		}
		// Insert the socket at the head of the port's doubly-linked list.
		if ( af == AF_INET )
		{
			if ( bindings_v4[port] )
				bindings_v4[port]->prev_socket = this;
			next_socket = bindings_v4[port];
			prev_socket = NULL;
			bindings_v4[port] = this;
		}
		else if ( af == AF_INET6 )
		{
			if ( bindings_v6[port] )
				bindings_v6[port]->prev_socket = this;
			next_socket = bindings_v6[port];
			prev_socket = NULL;
			bindings_v6[port] = this;
		}
		else
			return errno = EAFNOSUPPORT, false;
		memcpy(&local, &new_local, sizeof(new_local));
		bound = true;
		return true;
	}
	// Every port in the ephemeral range was in use.
	return errno = EAGAIN, false;
}
// Transmit pending segments, aborting the connection if the remote has
// failed to acknowledge after too many retransmissions, or if transmission
// itself keeps failing.
void TCPSocket::TransmitLoop() // tcp_lock taken
{
	if ( state == TCP_STATE_CLOSED )
		return;
	if ( retransmissions >= NUM_RETRANSMISSIONS )
		Fail(ETIMEDOUT);
	else if ( !Transmit() && retransmissions >= NUM_RETRANSMISSIONS - 1 )
		Fail(errno);
}
// Transmit pending segments: move newly buffered data (and a pending SYN or
// FIN, which each consume a sequence number) into the send window if there
// is room, then send segments until all windowed data, any owed ACK, and
// any window update have been sent. Returns false with errno set if a
// segment could not be sent.
bool TCPSocket::Transmit() // tcp_lock taken
{
	if ( state == TCP_STATE_CLOSED )
		return (errno = sockerr ? sockerr : ENOTCONN), false;
	// Move new outgoing data into the transmission window if there is room.
	tcp_seq window_available = (tcp_seq) (send_una + send_wnd - send_nxt);
	// The SYN comes first and takes one sequence number.
	if ( window_available && outgoing_syn == TCP_SPECIAL_PENDING )
	{
		send_nxt++;
		outgoing_syn = TCP_SPECIAL_WINDOW;
		window_available--;
	}
	if ( window_available )
	{
		// How much buffered data is already inside the window, not counting
		// the sequence numbers taken by the SYN and FIN.
		tcp_seq window_data = (tcp_seq)(send_nxt - send_una);
		if ( outgoing_syn == TCP_SPECIAL_WINDOW )
			window_data--;
		if ( outgoing_fin == TCP_SPECIAL_WINDOW )
			window_data--;
		assert(window_data <= outgoing_used);
		// Admit as much of the remaining buffered data as the window allows.
		size_t outgoing_new = outgoing_used - window_data;
		tcp_seq amount = window_available;
		if ( outgoing_new < amount )
			amount = outgoing_new;
		send_nxt += amount;
		window_available -= amount;
	}
	// The FIN comes last and takes one sequence number.
	if ( window_available && outgoing_fin == TCP_SPECIAL_PENDING )
	{
		send_nxt++;
		outgoing_fin = TCP_SPECIAL_WINDOW;
		window_available--;
	}
	// Transmit packets.
	bool any = false;
	// Keep sending while windowed data remains unsent, an ACK is owed, or
	// the advertised receive window has changed.
	while ( mod32_lt(send_pos, send_nxt) ||
	        (has_syn && mod32_lt(recv_acked, recv_nxt)) ||
	        recv_wnd != recv_wndlast )
	{
		any = true;
		size_t mtu;
		union tcp_sockaddr sendfrom;
		// Select the source address and learn the path MTU.
		if ( af == AF_INET )
		{
			if ( !IP::GetSourceIP(&local.in.sin_addr, &remote.in.sin_addr,
			                      &sendfrom.in.sin_addr, ifindex, &mtu) )
				return false;
		}
		// TODO: IPv6 support.
		else
			return errno = EAFNOSUPPORT, false;
		if ( mtu < sizeof(struct tcphdr) )
			return errno = EINVAL, false;
		// The usable payload per segment is the MTU minus the TCP header.
		mtu -= sizeof(struct tcphdr);
		Ref<Packet> pkt = GetPacket();
		if ( !pkt )
			return false;
		pkt->length = sizeof(struct tcphdr);
		unsigned char* out = pkt->from;
		// Build the TCP header.
		struct tcphdr hdr;
		if ( af == AF_INET )
		{
			hdr.th_sport = local.in.sin_port;
			hdr.th_dport = remote.in.sin_port;
		}
		else if ( af == AF_INET6 )
		{
			hdr.th_sport = local.in6.sin6_port;
			hdr.th_dport = remote.in6.sin6_port;
		}
		else
			return errno = EAFNOSUPPORT, false;
		hdr.th_seq = htobe32(send_pos);
		hdr.th_offset = TCP_OFFSET_ENCODE(sizeof(struct tcphdr) / 4);
		hdr.th_flags = 0;
		// send_nxtpos tracks how far this segment advances send_pos.
		tcp_seq send_nxtpos = send_pos;
		assert(mod32_le(send_nxtpos, send_nxt));
		// The SYN is carried by the first segment of the window.
		if ( outgoing_syn == TCP_SPECIAL_WINDOW && send_nxtpos == send_una )
		{
			hdr.th_flags |= TH_SYN;
			send_nxtpos++;
		}
		assert(mod32_le(send_nxtpos, send_nxt));
		// ACK the remote once its SYN has been received.
		if ( has_syn )
		{
			// TODO: RFC 1122 4.2.2.6:
			// "TCP SHOULD send an MSS (Maximum Segment Size) option in
			// every SYN segment when its receive MSS differs from the
			// default 536, and MAY send it always."
			// "If an MSS option is not received at connection setup, TCP
			// MUST assume a default send MSS of 536 (576-40)."
			hdr.th_flags |= TH_ACK;
			hdr.th_ack = htobe32(recv_nxt);
		}
		else
			hdr.th_ack = htobe32(0);
		hdr.th_win = htobe16(recv_wnd);
		hdr.th_urp = htobe16(0);
		hdr.th_sum = htobe16(0);
		// How much buffered data remains to be sent from send_pos onwards,
		// not counting the sequence numbers taken by the SYN and FIN.
		tcp_seq window_data = (tcp_seq)(send_nxt - send_pos);
		if ( send_pos == send_una && outgoing_syn == TCP_SPECIAL_WINDOW )
			window_data--;
		if ( mod32_lt(send_pos, send_nxt) &&
		     outgoing_fin == TCP_SPECIAL_WINDOW )
			window_data--;
		if ( window_data )
		{
			// Copy at most one MTU of payload out of the outgoing ring
			// buffer, in two parts if it wraps around the end.
			size_t amount = mtu < window_data ? mtu : window_data;
			assert(outgoing_offset <= sizeof(outgoing));
			tcp_seq window_length = (tcp_seq) (send_nxtpos - send_una);
			if ( outgoing_syn == TCP_SPECIAL_WINDOW )
				window_length--;
			assert(window_length <= sizeof(outgoing));
			size_t outgoing_end = outgoing_offset + window_length;
			if ( sizeof(outgoing) <= outgoing_end )
				outgoing_end -= sizeof(outgoing);
			assert(outgoing_end < sizeof(outgoing));
			size_t until_end = sizeof(outgoing) - outgoing_end;
			size_t first = until_end < amount ? until_end : amount;
			assert(first <= sizeof(outgoing));
			assert(first <= sizeof(outgoing) - outgoing_end);
			size_t second = amount - first;
			assert(second <= sizeof(outgoing));
			memcpy(out + sizeof(hdr), outgoing + outgoing_end, first);
			if ( second )
				memcpy(out + sizeof(hdr) + first, outgoing, second);
			pkt->length += amount;
			send_nxtpos += amount;
		}
		assert(mod32_le(send_nxtpos, send_nxt));
		// The FIN is carried by the last segment of the window.
		if ( outgoing_fin == TCP_SPECIAL_WINDOW &&
		     send_nxtpos + 1 == send_nxt )
		{
			hdr.th_flags |= TH_FIN;
			send_nxtpos++;
		}
		assert(mod32_le(send_nxtpos, send_nxt));
		memcpy(out, &hdr, sizeof(hdr));
		// Compute the TCP checksum over the pseudo-header (addresses,
		// protocol, length) and the segment itself.
		uint16_t checksum = 0;
		if ( af == AF_INET )
		{
			checksum = IP::ipsum_buf(checksum, &sendfrom.in.sin_addr,
			                         sizeof(struct in_addr));
			checksum = IP::ipsum_buf(checksum, &remote.in.sin_addr,
			                         sizeof(struct in_addr));
		}
		else if ( af == AF_INET6 )
		{
			checksum = IP::ipsum_buf(checksum, &sendfrom.in6.sin6_addr,
			                         sizeof(struct in6_addr));
			checksum = IP::ipsum_buf(checksum, &remote.in6.sin6_addr,
			                         sizeof(struct in6_addr));
		}
		else
			return errno = EAFNOSUPPORT, false;
		checksum = IP::ipsum_word(checksum, IPPROTO_TCP);
		checksum = IP::ipsum_word(checksum, pkt->length);
		checksum = IP::ipsum_buf(checksum, out, pkt->length);
		hdr.th_sum = htobe16(IP::ipsum_finish(checksum));
		// Rewrite the header now that the checksum is known.
		memcpy(out, &hdr, sizeof(hdr));
		if ( af == AF_INET )
		{
			if ( !IP::Send(pkt, &sendfrom.in.sin_addr, &remote.in.sin_addr,
			               IPPROTO_TCP, ifindex, false) )
				return false;
		}
		// TODO: IPv6 support.
		else
			return errno = EAFNOSUPPORT, false;
		// The segment carried the latest ACK and window advertisement.
		if ( has_syn )
			recv_acked = recv_nxt;
		recv_wndlast = recv_wnd;
		assert(mod32_le(send_nxtpos, send_nxt));
		send_pos = send_nxtpos;
	}
	// Await acknowledgement of the transmitted segments before a deadline.
	if ( any )
	{
		SetDeadline();
		SetTimer();
	}
	return true;
}
// Timer callback: fire an expired deadline (connection death or
// retransmission), run the transmit loop, rearm the timer if a new deadline
// was set, and destroy the socket if it has become unwanted.
void TCPSocket::OnTimer()
{
	ScopedLock lock(&tcp_lock);
	timer_armed = false;
	// A non-negative tv_sec means a deadline is armed; act on it if expired.
	if ( 0 <= deadline.tv_sec &&
	     timespec_le(deadline, Time::Get(CLOCK_MONOTONIC)) )
	{
		// Disarm the deadline; SetDeadline below may arm a new one.
		deadline = timespec_make(-1, 0);
		// TIME-WAIT expiry, or FIN-WAIT-2 timeout on unreferenced sockets.
		if ( state == TCP_STATE_TIME_WAIT ||
		     (state == TCP_STATE_FIN_WAIT_2 && !is_referenced) )
			Close();
		// Otherwise unacknowledged data timed out: rewind the transmission
		// position so everything unacknowledged is retransmitted.
		else if ( mod32_lt(send_una, send_pos) )
		{
			retransmissions++;
			send_pos = send_una;
		}
	}
	transmit_scheduled = false;
	TransmitLoop();
	if ( 0 <= deadline.tv_sec )
		SetTimer();
	// The timer may have held the last use of the socket; destroy it if the
	// socket is unreferenced and in a destroyable state.
	if ( can_destroy() )
		delete this;
}
// Request that the transmit loop runs shortly, so multiple events can be
// batched into a single reply.
void TCPSocket::ScheduleTransmit() // tcp_lock locked
{
	// Nothing is ever transmitted in these states.
	if ( state == TCP_STATE_CLOSED || state == TCP_STATE_LISTEN )
		return;
	// If a transmission is already scheduled with a live timer, there is
	// nothing to do.
	bool already_pending = transmit_scheduled && timer_armed;
	if ( already_pending )
		return;
	transmit_scheduled = true;
	SetTimer();
}
// Arm the deadline (if not already armed) appropriate for the current state:
// the 2MSL timeout for dying connections, or a retransmission timeout when
// data is in the send window.
void TCPSocket::SetDeadline() // tcp_lock locked
{
	// A deadline is already armed; keep it.
	if ( 0 <= deadline.tv_sec )
		return;
	if ( state == TCP_STATE_TIME_WAIT ||
	     (state == TCP_STATE_FIN_WAIT_2 && !is_referenced) )
	{
		struct timespec now = Time::Get(CLOCK_MONOTONIC);
		struct timespec msl2 = timespec_make(60, 0); // Documented in tcp(4).
		deadline = timespec_add(now, msl2);
	}
	// NOTE(review): mod32_le also arms a retransmission deadline when
	// send_una == send_pos (nothing outstanding), causing a harmless spurious
	// timer fire; confirm whether strict mod32_lt was intended, matching the
	// retransmission check in OnTimer.
	else if ( mod32_le(send_una, send_pos) )
	{
		struct timespec now = Time::Get(CLOCK_MONOTONIC);
		// Linear backoff of the retransmission timeout.
		struct timespec delay = timespec_make(1 + 1 * retransmissions, 0);
		deadline = timespec_add(now, delay);
	}
}
// (Re)arm the underlying timer to fire at the next event: a scheduled
// transmission (slightly delayed for batching), the armed deadline, or a
// pending self-destruction.
void TCPSocket::SetTimer() // tcp_lock locked
{
	if ( timer_armed )
	{
		// If the timer is currently firing, OnTimer will rearm as needed.
		if ( !timer.TryCancel() )
			return;
		timer_armed = false;
	}
	if ( state == TCP_STATE_CLOSED )
		return;
	bool destruction_is_wanted = want_destruction();
	if ( transmit_scheduled || destruction_is_wanted || 0 <= deadline.tv_sec )
	{
		// The timer callback may delete this socket (see OnTimer).
		int flags = TIMER_FUNC_MAY_DEALLOCATE_TIMER;
		struct itimerspec timeout;
		memset(&timeout, 0, sizeof(timeout));
		// Slightly delay transmission to batch together a better reply.
		if ( transmit_scheduled )
			timeout.it_value = timespec_make(0, 1);
		else if ( 0 <= deadline.tv_sec )
		{
			timeout.it_value = deadline;
			flags |= TIMER_ABSOLUTE;
		}
		timer.Set(&timeout, NULL, flags, TCPSocket__OnTimer, this);
		timer_armed = true;
	}
}
// Handle a single incoming TCP segment for this socket per the segment
// arrival processing rules of STD 7, RFC 793, pages 65-76: connection
// establishment (LISTEN and SYN-SENT handling), sequence number
// acceptability, trimming the segment to the receive window, queueing of
// out-of-order segments, ACK processing, delivery of payload data into the
// incoming buffer, and FIN handling.
//
// Fix: real_seq was declared uint16_t, truncating the 32-bit sequence
// number; it is compared with mod32_le against full 32-bit sequence numbers
// to order the out-of-order receive queue, so the truncation corrupted the
// queue ordering. It is now the full tcp_seq.
void TCPSocket::ProcessPacket(Ref<Packet> pkt,
                              union tcp_sockaddr* pkt_src,
                              union tcp_sockaddr* pkt_dst) // tcp_lock locked
{
	const unsigned char* in = pkt->from + pkt->offset;
	size_t inlen = pkt->length - pkt->offset;
	// Decode the TCP header into host byte order.
	struct tcphdr hdr;
	memcpy(&hdr, in, sizeof(hdr));
	hdr.th_sport = be16toh(hdr.th_sport);
	hdr.th_dport = be16toh(hdr.th_dport);
	hdr.th_seq = be32toh(hdr.th_seq);
	hdr.th_ack = be32toh(hdr.th_ack);
	hdr.th_win = be16toh(hdr.th_win);
	hdr.th_urp = be16toh(hdr.th_urp);
	// Skip the header and any options to reach the segment payload.
	size_t offset = TCP_OFFSET_DECODE(hdr.th_offset) * 4;
	in += offset;
	inlen -= offset;
	if ( state == TCP_STATE_CLOSED ) // STD 7, RFC 793, page 65.
	{
		if ( hdr.th_flags & TH_RST )
			return;
		// TODO: ACK the RST.
		return;
	}
	else if ( state == TCP_STATE_LISTEN ) // STD 7, RFC 793, page 65.
	{
		if ( hdr.th_flags & TH_RST )
			return;
		if ( hdr.th_flags & TH_ACK )
		{
			// TODO: Send <SEQ=SEG.ACK><CTL=RST>.
			return;
		}
		if ( !(hdr.th_flags & TH_SYN) )
			return;
		if ( !hdr.th_win )
			return;
		if ( backlog_max <= backlog_used )
			return;
		// TODO: Use SYN cache to mitigate SYN flood attack.
		// Create a new socket for the incoming connection attempt.
		TCPSocket* socket = new TCPSocket(af);
		if ( !socket )
			return;
		assert(pkt_src);
		assert(pkt_dst);
		socket->remote = *pkt_src;
		socket->local = *pkt_dst;
		socket->remoted = true;
		socket->bound = true;
		// Insert the new socket into the binding table for its local port.
		if ( af == AF_INET )
		{
			uint16_t port = be16toh(socket->local.in.sin_port);
			socket->prev_socket = NULL;
			socket->next_socket = bindings_v4[port];
			if ( socket->next_socket )
				socket->next_socket->prev_socket = socket;
			bindings_v4[port] = socket;
		}
		else if ( af == AF_INET6 )
		{
			uint16_t port = be16toh(socket->local.in6.sin6_port);
			socket->prev_socket = NULL;
			socket->next_socket = bindings_v6[port];
			if ( socket->next_socket )
				socket->next_socket->prev_socket = socket;
			bindings_v6[port] = socket;
		}
		// Initialize the sequence spaces and schedule the SYN-ACK.
		socket->iss = arc4random();
		socket->send_una = socket->iss;
		socket->send_nxt = socket->iss;
		socket->send_wnd = 1;
		socket->send_pos = socket->iss;
		socket->outgoing_syn = TCP_SPECIAL_PENDING;
		socket->recv_wnd = TCP_MAXWIN;
		socket->recv_acked = hdr.th_seq;
		socket->recv_nxt = hdr.th_seq + 1;
		socket->irs = hdr.th_seq;
		socket->has_syn = true;
		socket->state = TCP_STATE_SYN_RECV;
		socket->UpdateWindow(hdr.th_win);
		// Insert into this listening socket's half-open connection list.
		socket->connecting_parent = this;
		socket->connecting_prev = NULL;
		socket->connecting_next = connecting_half;
		if ( socket->connecting_next )
			socket->connecting_next->connecting_prev = socket;
		connecting_half = socket;
		backlog_used++;
		socket->TransmitLoop();
		return;
	}
	else if ( state == TCP_STATE_SYN_SENT ) // STD 7, RFC 793, page 66.
	{
		if ( hdr.th_flags & TH_ACK )
		{
			if ( mod32_le(hdr.th_ack, iss) || mod32_gt(hdr.th_ack, send_nxt) )
			{
				if ( hdr.th_flags & TH_RST )
					return;
				// TODO: Send RST.
				return;
			}
			if ( !(mod32_le(send_una, hdr.th_ack) &&
			       mod32_le(hdr.th_ack, send_nxt)) )
				return;
		}
		if ( hdr.th_flags & TH_RST )
		{
			Fail(ECONNREFUSED);
			return;
		}
		if ( !(hdr.th_flags & TH_SYN) )
			return;
		// The remote's SYN synchronizes the receive sequence space.
		recv_acked = hdr.th_seq;
		recv_nxt = hdr.th_seq + 1;
		irs = hdr.th_seq;
		has_syn = true;
		// RFC 1122 4.2.2.20 (c), page 94.
		UpdateWindow(hdr.th_win);
		send_wl1 = hdr.th_seq;
		send_wl2 = hdr.th_ack;
		// TODO: Drop packet if the packet contains data/FIN beyond the SYN?
		if ( hdr.th_flags & TH_ACK )
		{
			send_una = hdr.th_ack;
			retransmissions = 0;
			deadline = timespec_make(-1, 0);
			SetDeadline();
			SetTimer();
			// Our SYN was acknowledged: the connection is established.
			if ( mod32_lt(iss, send_una) )
			{
				outgoing_syn = TCP_SPECIAL_ACKED;
				state = TCP_STATE_ESTAB;
				kthread_cond_broadcast(&receive_cond); // Wake up connect.
				return;
			}
		}
		// Simultaneous open: got a SYN but no ACK of our SYN.
		state = TCP_STATE_SYN_RECV;
		return;
	}
	// STD 7, RFC 793, page 69. Does the segment's sequence range overlap the
	// receive window?
	bool acceptable;
	if ( inlen == 0 && recv_wnd == 0 )
		acceptable = hdr.th_seq == recv_nxt;
	else if ( inlen == 0 && 0 < recv_wnd )
		acceptable = mod32_le(recv_nxt, hdr.th_seq) &&
		             mod32_lt(hdr.th_seq, recv_nxt + recv_wnd);
	else if ( 0 < inlen && 0 < recv_wnd )
	{
		tcp_seq seg_end = (tcp_seq) (hdr.th_seq + inlen - 1);
		acceptable = (mod32_le(recv_nxt, hdr.th_seq) &&
		              mod32_lt(hdr.th_seq, recv_nxt + recv_wnd)) ||
		             (mod32_le(recv_nxt, seg_end) &&
		              mod32_lt(seg_end, recv_nxt + recv_wnd));
	}
	else
	{
		acceptable = false;
		// TODO: STD 7, RFC 793, page 69 "If the RCV.WND is zero, no segments
		//       will be acceptable, but special allowance should be made to
		//       accept valid ACKs, URGs and RSTs".
	}
	if ( !acceptable )
	{
		if ( hdr.th_flags & TH_RST )
			return;
		// Send <SEQ=SND.NXT><ACK=RCV.NXT><CTL=ACK>.
		recv_acked = recv_nxt - 1;
		return;
	}
	// STD 7, RFC 793, page 70. Process segments in the right order and trim
	// the segment to the receive window.
	// Remember the untrimmed sequence number for receive queue ordering.
	// (Must be the full 32-bit tcp_seq; truncation breaks mod32 ordering.)
	tcp_seq real_seq = hdr.th_seq;
	// Trim an already-processed SYN from the front of the segment.
	if ( mod32_lt(hdr.th_seq, recv_nxt) && (hdr.th_flags & TH_SYN) )
	{
		hdr.th_flags &= ~TH_SYN;
		hdr.th_seq++;
	}
	// Trim already-received data from the front of the segment.
	if ( mod32_lt(hdr.th_seq, recv_nxt) )
	{
		tcp_seq skip = recv_nxt - hdr.th_seq;
		if ( inlen < skip )
			skip = inlen;
		hdr.th_seq += skip;
		in += skip;
		inlen -= skip;
	}
	// Trim an already-processed FIN.
	if ( mod32_lt(hdr.th_seq, recv_nxt) && (hdr.th_flags & TH_FIN) )
	{
		hdr.th_flags &= ~TH_FIN;
		hdr.th_seq++;
	}
	if ( mod32_lt(hdr.th_seq, recv_nxt) ) // Already processed.
		return;
	if ( mod32_gt(hdr.th_seq, recv_nxt) ) // Can't process yet.
	{
		// Insert the segment in the receive queue, sorted by sequence.
		Ref<Packet> prev;
		Ref<Packet> iter = receive_queue;
		// TODO: For n packets in the worst order, this scales O(n^2).
		// TODO: This wastes a packet per byte in the worst case.
		while ( iter )
		{
			// Peek the queued packet's sequence number (stored in network
			// byte order in the raw packet).
			const unsigned char* iter_in = iter->from + iter->offset;
			const unsigned char* iter_in_seq =
				iter_in + offsetof(struct tcphdr, th_seq);
			tcp_seq iter_seq;
			memcpy(&iter_seq, iter_in_seq, sizeof(iter_seq));
			iter_seq = be32toh(iter_seq);
			if ( mod32_le(real_seq, iter_seq) )
				break;
			// TODO: Handle duplicate and overlapping segments.
			prev = iter;
			iter = iter->next;
		}
		if ( prev )
		{
			pkt->next = prev->next;
			prev->next = pkt;
		}
		else
		{
			pkt->next = receive_queue;
			receive_queue = pkt;
		}
		return;
	}
	// Trim data beyond the receive window.
	if ( recv_wnd < inlen )
		inlen = recv_wnd;
	// STD 7, RFC 793, page 70.
	if ( hdr.th_flags & TH_RST )
	{
		if ( state == TCP_STATE_CLOSING ||
		     state == TCP_STATE_LAST_ACK ||
		     state == TCP_STATE_TIME_WAIT )
			Close();
		else
			Fail(ECONNRESET);
		return;
	}
	// STD 7, RFC 793, page 71. A SYN inside the window is an error.
	if ( hdr.th_flags & TH_SYN )
	{
		// TODO: Send RST.
		Fail(ECONNRESET);
		return;
	}
	// STD 7, RFC 793, page 72.
	if ( !(hdr.th_flags & TH_ACK) )
		return;
	// STD 7, RFC 793, page 72.
	if ( state == TCP_STATE_SYN_RECV )
	{
		// RFC 1122 4.2.2.20 (f), page 94.
		UpdateWindow(hdr.th_win);
		send_wl1 = hdr.th_seq;
		send_wl2 = hdr.th_ack;
		if ( mod32_le(send_una, hdr.th_ack) && mod32_le(hdr.th_ack, send_nxt) )
		{
			state = TCP_STATE_ESTAB;
			kthread_cond_broadcast(&receive_cond); // Wake up connect.
			// Move this socket from the listening parent's half-open list
			// to its ready-to-accept list and wake up accept.
			if ( connecting_parent )
			{
				if ( connecting_prev )
					connecting_prev->connecting_next = connecting_next;
				else
					connecting_parent->connecting_half = connecting_next;
				if ( connecting_next )
					connecting_next->connecting_prev = connecting_prev;
				// TODO: This inserts the connection to the front of the
				//       accept queue, rather than the end, which is unfair
				//       to connections that have been waiting longer.
				connecting_prev = NULL;
				connecting_next = connecting_parent->connecting_ready;
				if ( connecting_next )
					connecting_next->connecting_prev = this;
				connecting_parent->connecting_ready = this;
				kthread_cond_broadcast(&connecting_parent->receive_cond);
				uint16_t status = connecting_parent->PollEventStatus();
				connecting_parent->poll_channel.Signal(status);
			}
		}
		else
		{
			// TODO: Send <SEQ=SEG.ACK><CTL=RST>.
			TransmitLoop();
			return;
		}
	}
	// STD 7, RFC 793, page 72.
	// TODO: RFC 1122 4.2.2.20 (g), page 94 says SEG.ACK =< SND.UNA however
	//       this causes incoming connections to fail.
	if ( mod32_lt(hdr.th_ack, send_una) )
		return; // Drop duplicate ack already seen.
	else if ( mod32_lt(send_nxt, hdr.th_ack) )
	{
		// The remote acknowledged data that has not been sent.
		// TODO: Send ACK.
		return;
	}
	// STD 7, RFC 793, page 72. Remove acknowledged data from the window.
	tcp_seq old_send_una = send_una;
	tcp_seq acked = (tcp_seq) (hdr.th_ack - send_una);
	// The SYN occupies the first sequence number of the window.
	if ( outgoing_syn == TCP_SPECIAL_WINDOW && 0 < acked )
	{
		outgoing_syn = TCP_SPECIAL_ACKED;
		acked--;
		send_una++;
	}
	tcp_seq window_data = (tcp_seq) (send_nxt - send_una);
	if ( outgoing_fin == TCP_SPECIAL_WINDOW )
		window_data--;
	// Release acknowledged bytes from the circular outgoing buffer and wake
	// up senders waiting for buffer space.
	if ( window_data && acked )
	{
		size_t amount = window_data < acked ? window_data : acked;
		assert(outgoing_offset < sizeof(outgoing));
		outgoing_offset += amount;
		if ( sizeof(outgoing) <= outgoing_offset )
			outgoing_offset -= sizeof(outgoing);
		assert(outgoing_offset < sizeof(outgoing));
		assert(amount <= outgoing_used);
		outgoing_used -= amount;
		kthread_cond_broadcast(&transmit_cond);
		poll_channel.Signal(PollEventStatus());
		acked -= amount;
		send_una += amount;
	}
	// The FIN occupies the last sequence number of the window.
	bool fin_was_acked = false;
	if ( outgoing_fin == TCP_SPECIAL_WINDOW && 0 < acked )
	{
		outgoing_fin = TCP_SPECIAL_ACKED;
		acked--;
		send_una++;
		fin_was_acked = true;
	}
	if ( send_una != old_send_una )
	{
		// TODO: Possibly recalculate the average time to contact remote.
		retransmissions = 0;
		SetTimer();
	}
	// STD 7, RFC 793, page 72. Update the send window if this segment is
	// newer than the one that last updated it.
	if ( mod32_lt(send_wl1, hdr.th_seq) ||
	     (send_wl1 == hdr.th_seq && mod32_le(send_wl2, hdr.th_ack)) )
	{
		UpdateWindow(hdr.th_win);
		send_wl1 = hdr.th_seq;
		send_wl2 = hdr.th_ack;
	}
	// STD 7, RFC 793, page 73.
	if ( state == TCP_STATE_FIN_WAIT_1 )
	{
		if ( fin_was_acked )
		{
			state = TCP_STATE_FIN_WAIT_2;
			// Time out the connection if the socket is no longer referenced.
			if ( !is_referenced )
			{
				deadline = timespec_make(-1, 0);
				SetDeadline();
				SetTimer();
			}
		}
	}
	else if ( state == TCP_STATE_CLOSING )
	{
		if ( fin_was_acked )
		{
			state = TCP_STATE_TIME_WAIT;
			deadline = timespec_make(-1, 0);
			SetDeadline();
			SetTimer();
		}
		return;
	}
	else if ( state == TCP_STATE_LAST_ACK )
	{
		if ( fin_was_acked )
			Close();
		return;
	}
	// TODO: Urgent data per STD 7, RFC 793, page 73.
	// STD 7, RFC 793, page 74. Deliver the payload into the incoming buffer.
	if ( state == TCP_STATE_ESTAB ||
	     state == TCP_STATE_FIN_WAIT_1 ||
	     state == TCP_STATE_FIN_WAIT_2 )
	{
		assert(incoming_offset < sizeof(incoming));
		assert(incoming_used <= sizeof(incoming));
		size_t available = sizeof(incoming) - incoming_used;
		size_t amount = available < inlen ? available : inlen;
		assert(amount <= sizeof(incoming));
		assert(amount <= available);
		// The incoming buffer is circular; copy in up to two parts.
		size_t newat = incoming_offset + incoming_used;
		if ( sizeof(incoming) <= newat )
			newat -= sizeof(incoming);
		assert(newat < sizeof(incoming));
		size_t until_end = sizeof(incoming) - newat;
		assert(until_end <= sizeof(incoming));
		size_t first = until_end < amount ? until_end : amount;
		assert(first <= amount);
		assert(first <= sizeof(incoming));
		size_t second = amount - first;
		assert(second <= amount);
		assert(second <= sizeof(incoming));
		assert(first + second == amount);
		assert(first + second <= sizeof(incoming));
		assert(first + second <= available);
		// If receives are shut down, the data is still acknowledged below,
		// but discarded rather than buffered.
		if ( !shutdown_receive )
		{
			memcpy(incoming + newat, in, first);
			if ( second )
				memcpy(incoming, in + first, second);
			incoming_used += amount;
		}
		available = sizeof(incoming) - incoming_used;
		if ( available < recv_wnd )
			recv_wnd = available;
		recv_nxt = hdr.th_seq + amount;
		// The FIN occupies a sequence number after the data.
		if ( amount == inlen && (hdr.th_flags & TH_FIN) )
		{
			recv_nxt++;
			has_fin = true;
		}
		if ( incoming_used || has_fin )
		{
			kthread_cond_broadcast(&receive_cond);
			poll_channel.Signal(PollEventStatus());
		}
	}
	// STD 7, RFC 793, page 75.
	if ( hdr.th_flags & TH_FIN )
	{
		if ( state == TCP_STATE_ESTAB )
		{
			state = TCP_STATE_CLOSE_WAIT;
			kthread_cond_broadcast(&receive_cond);
			poll_channel.Signal(PollEventStatus());
		}
		else if ( state == TCP_STATE_FIN_WAIT_1 )
		{
			// Our sent FIN hasn't been ACK'd or we'd be in FIN_WAIT_2.
			state = TCP_STATE_CLOSING;
		}
		else if ( state == TCP_STATE_FIN_WAIT_2 )
		{
			state = TCP_STATE_TIME_WAIT;
			deadline = timespec_make(-1, 0);
			SetDeadline();
			SetTimer();
		}
		else if ( state == TCP_STATE_TIME_WAIT )
		{
			// The timer is not reset like as by the standard to avoid a
			// hostile remote from staying forever in TIME-WAIT.
		}
	}
}
// Process an incoming segment (if any) and then drain every queued
// out-of-order segment that has become processable, in sequence order, then
// schedule a (batched) reply transmission.
void TCPSocket::ReceivePacket(Ref<Packet> pktnew,
                              union tcp_sockaddr* pkt_src,
                              union tcp_sockaddr* pkt_dst) // tcp_lock locked
{
	if ( pktnew )
		ProcessPacket(pktnew, pkt_src, pkt_dst);
	while ( receive_queue )
	{
		Ref<Packet> pkt = receive_queue;
		// Peek at the queued packet's sequence number (network byte order).
		const unsigned char* in = pkt->from + pkt->offset;
		const unsigned char* in_seq = in + offsetof(struct tcphdr, th_seq);
		tcp_seq seq;
		memcpy(&seq, in_seq, sizeof(seq));
		seq = be32toh(seq);
		// The queue is sorted; stop at the first still-future segment.
		if ( mod32_gt(seq, recv_nxt) )
			break;
		receive_queue = pkt->next;
		pkt->next.Reset();
		// Segments now entirely in the past are simply dropped.
		if ( seq == recv_nxt )
			ProcessPacket(pkt, pkt_src, pkt_dst);
	}
	// Delay transmit to answer more efficiently based on upcoming packets.
	ScheduleTransmit();
}
void TCPSocket::UpdateWindow(uint16_t new_window)
{
tcp_seq pending = (tcp_seq) (send_nxt - send_una);
if ( new_window < pending )
send_nxt = (tcp_seq) (send_una + pending);
send_wnd = new_window;
}
// Connect the socket to the given remote address, binding to an appropriate
// local address first if not already bound, then initiate the three-way
// handshake and (unless O_NONBLOCK) block until it completes or fails.
int TCPSocket::connect(ioctx_t* ctx, const uint8_t* addr, size_t addrsize)
{
	ScopedLock lock(&tcp_lock);
	// TODO: os-test listen + connect, what errno?
	if ( state == TCP_STATE_SYN_SENT || state == TCP_STATE_SYN_RECV )
		return errno = EALREADY, -1;
	if ( state != TCP_STATE_CLOSED )
		return errno = EISCONN, -1; // TODO: Another errno if listening?
	union tcp_sockaddr new_remote;
	if ( !ImportAddress(ctx, &new_remote, addr, addrsize) )
		return -1;
	if ( af == AF_INET )
	{
		// Verify the port is non-zero.
		if ( be16toh(new_remote.in.sin_port) == 0 )
			return errno = EADDRNOTAVAIL, -1;
	}
	else
		return errno = EAFNOSUPPORT, -1;
	// TODO: os-test AF_UNSPEC
	// If the socket is not bound, find a route to the remote address and bind
	// to the appropriate source address.
	if ( !bound )
	{
		union tcp_sockaddr new_local;
		memset(&new_local, 0, sizeof(new_local));
		if ( af == AF_INET )
		{
			struct in_addr any;
			any.s_addr = htobe32(INADDR_ANY);
			new_local.in.sin_family = AF_INET;
			if ( !IP::GetSourceIP(&any, &new_remote.in.sin_addr,
			                      &new_local.in.sin_addr, ifindex, NULL) )
				return -1;
			// Port zero means any available port is assigned on bind.
			new_local.in.sin_port = htobe16(0);
		}
		else
			return errno = EAFNOSUPPORT, -1;
		if ( !BindDefault(&new_local) )
			return -1;
	}
	// Test if there is a route from the local address to the remote address.
	// TODO: Does TCP also do this? Note that connecting to the any address
	//       should be forbidden, right?
	if ( af == AF_INET )
	{
		if ( !IP::GetSourceIP(&local.in.sin_addr, &new_remote.in.sin_addr,
		                      NULL, ifindex, NULL) )
			return -1;
	}
	else
		return errno = EAFNOSUPPORT, -1;
	memcpy(&remote, &new_remote, sizeof(new_remote));
	remoted = true;
	// Initialize the send sequence space from a random initial sequence
	// number and schedule the SYN for transmission.
	iss = arc4random();
	recv_wnd = TCP_MAXWIN;
	send_una = iss;
	send_nxt = iss;
	send_wnd = 1;
	send_pos = iss;
	outgoing_syn = TCP_SPECIAL_PENDING;
	state = TCP_STATE_SYN_SENT;
	TransmitLoop();
	// Wait for the handshake to complete or the connection to fail.
	while ( !sockerr &&
	        (state == TCP_STATE_SYN_SENT || state == TCP_STATE_SYN_RECV) )
	{
		// TODO: os-test non-blocking connect.
		if ( ctx->dflags & O_NONBLOCK )
			return errno = EINPROGRESS, -1;
		if ( !kthread_cond_wait_signal(&receive_cond, &tcp_lock) )
			return errno = EINTR, -1;
	}
	if ( sockerr )
	{
		// TODO: This is not recoverable. Is that correct?
		// TODO: os-test whether reconnect is possible after failed connect?
		return errno = sockerr, -1;
	}
	return 0;
}
// Put the bound, closed socket into the listening state, accepting up to
// backlog pending connections, matching any remote address.
//
// Fix: the `backlog < 0` subcondition in the clamping else-if was dead code,
// since a negative backlog is rejected with EINVAL above; it was removed.
int TCPSocket::listen(ioctx_t* /*ctx*/, int backlog)
{
	if ( backlog < 0 )
		return errno = EINVAL, -1;
	// TODO: os-test if zero backlog allows connections.
	if ( backlog == 0 )
		backlog = 1;
	else if ( SOMAXCONN < backlog )
		backlog = SOMAXCONN;
	ScopedLock lock(&tcp_lock);
	if ( !bound )
		return errno = EDESTADDRREQ, -1;
	// TODO: os-test a regular connection, close, and then try to listen.
	if ( state != TCP_STATE_CLOSED )
		return errno = EINVAL, -1;
	backlog_max = backlog;
	// Listening sockets match any remote address.
	memset(&remote, 0, sizeof(remote));
	if ( af == AF_INET )
	{
		remote.in.sin_family = AF_INET;
		remote.in.sin_addr.s_addr = htobe32(INADDR_ANY);
	}
	else if ( af == AF_INET6 )
	{
		remote.in6.sin6_family = AF_INET6;
		remote.in6.sin6_addr = in6addr_any;
	}
	else
		return errno = EAFNOSUPPORT, -1;
	remoted = true;
	state = TCP_STATE_LISTEN;
	return 0;
}
// Receive data from the connection, then transmit (or schedule a transmit)
// so the updated receive window is advertised to the remote.
ssize_t TCPSocket::recv(ioctx_t* ctx, uint8_t* buf, size_t count, int flags)
{
	ScopedLock lock(&tcp_lock);
	ssize_t result = recv_unlocked(ctx, buf, count, flags);
	// Respond immediately if receive window has become empty.
	if ( !incoming_used )
		TransmitLoop();
	else
		ScheduleTransmit();
	return result;
}
// Receive into multiple buffers per recvmsg(2), gathering into the caller's
// iovec array. No address or control data is returned.
ssize_t TCPSocket::recvmsg(ioctx_t* ctx, struct msghdr* msg_ptr, int flags)
{
	struct msghdr msg;
	if ( !ctx->copy_from_src(&msg, msg_ptr, sizeof(msg)) )
		return -1;
	if ( msg.msg_iovlen < 0 || IOV_MAX < msg.msg_iovlen )
		return errno = EINVAL, -1;
	// Copy the iovec array from userspace.
	size_t iov_size = msg.msg_iovlen * sizeof(struct iovec);
	struct iovec* iov = new struct iovec[msg.msg_iovlen];
	if ( !iov )
		return -1;
	struct iovec* user_iov = msg.msg_iov;
	if ( !ctx->copy_from_src(iov, user_iov, iov_size) )
		return delete[] iov, -1;
	msg.msg_iov = iov;
	kthread_mutex_lock(&tcp_lock);
	ssize_t result = 0;
	for ( int i = 0; i < msg.msg_iovlen && result < SSIZE_MAX; i++ )
	{
		// Cap each receive so the total never exceeds SSIZE_MAX.
		size_t maximum = SSIZE_MAX - (size_t) result;
		uint8_t* buf = (uint8_t*) iov[i].iov_base;
		size_t count = iov[i].iov_len < maximum ? iov[i].iov_len : maximum;
		if ( !count )
			continue;
		ssize_t amount = recv_unlocked(ctx, buf, count, flags);
		if ( amount < 0 )
		{
			// Fail only if nothing was received yet.
			if ( result == 0 )
				result = -1;
			break;
		}
		result += amount;
		// A short read means no more data is available right now.
		if ( (size_t) amount != count )
			break;
	}
	// Respond immediately if receive window has become empty.
	!incoming_used ? TransmitLoop() : ScheduleTransmit();
	kthread_mutex_unlock(&tcp_lock);
	msg.msg_iov = user_iov;
	// TODO: os-test POSIX's requirement to ignore the msg_name and
	//       msg_namelen fields, plus msg_controllen's behavior is
	//       unspecified.
	msg.msg_namelen = 0;
	msg.msg_controllen = 0;
	delete[] iov;
	if ( !ctx->copy_to_dest(msg_ptr, &msg, sizeof(msg)) )
		return -1;
	return result;
}
// Receive up to count bytes from the circular incoming buffer into the user
// buffer, blocking as needed unless O_NONBLOCK. MSG_PEEK leaves the data in
// the buffer; MSG_WAITALL keeps waiting until the full count is received (or
// EOF/error). Returns the number of bytes received, 0 on EOF, or -1 with
// errno set.
ssize_t TCPSocket::recv_unlocked(ioctx_t* ctx,
                                 uint8_t* buf,
                                 size_t count,
                                 int flags) // tcp_lock taken
{
	if ( flags & ~(MSG_PEEK | MSG_WAITALL) ) // TODO: MSG_OOB.
		return errno = EINVAL, -1;
	if ( sockerr )
		return errno = sockerr, -1;
	// TODO: os-test non-blocking connect + immediate recv.
	// TODO: CLOSED after it has been closed?
	if ( state == TCP_STATE_CLOSED ||
	     state == TCP_STATE_LISTEN ||
	     state == TCP_STATE_SYN_SENT ||
	     state == TCP_STATE_SYN_RECV )
		return errno = ENOTCONN, -1;
	size_t sofar = 0;
	while ( sofar < count )
	{
		// Wait until there is data, EOF (FIN), or a receive shutdown.
		while ( !(incoming_used || has_fin || shutdown_receive) )
		{
			if ( sockerr )
				return sofar ? sofar : (errno = sockerr, -1);
			if ( state == TCP_STATE_CLOSED )
				return sofar;
			// Without MSG_WAITALL, a partial result is returned right away.
			if ( sofar && !(flags & MSG_WAITALL) )
				return sofar;
			if ( ctx->dflags & O_NONBLOCK )
				return sofar ? sofar : (errno = EWOULDBLOCK, -1);
			if ( !kthread_cond_wait_signal(&receive_cond, &tcp_lock) )
				return sofar ? sofar : (errno = EINTR, -1);
			if ( sockerr )
				return sofar ? sofar : (errno = sockerr, -1);
		}
		// End of stream: no buffered data remains and no more will arrive.
		if ( incoming_used == 0 && (has_fin || shutdown_receive) )
			return sofar;
		uint8_t* data = buf + sofar;
		size_t left = count - sofar;
		// Copy out of the circular incoming buffer, possibly in two parts.
		assert(incoming_used <= sizeof(incoming));
		size_t amount = incoming_used < left ? incoming_used : left;
		assert(incoming_offset < sizeof(incoming));
		size_t until_end = sizeof(incoming) - incoming_offset;
		size_t first = until_end < amount ? until_end : amount;
		size_t second = amount - first;
		if ( !ctx->copy_to_dest(data, incoming + incoming_offset, first) )
			return sofar ? sofar : -1;
		if ( second && !ctx->copy_to_dest(data + first, incoming, second) )
			return sofar ? sofar : -1;
		sofar += amount;
		// MSG_PEEK does not consume the received data.
		if ( flags & MSG_PEEK )
			return sofar;
		incoming_offset += amount;
		if ( sizeof(incoming) <= incoming_offset )
			incoming_offset -= sizeof(incoming);
		assert(incoming_offset < sizeof(incoming));
		incoming_used -= amount;
		// Grow the receive window now that buffer space was freed, clamped
		// to what the 16-bit window field can advertise.
		recv_wnd = sizeof(incoming) - incoming_used;
		if ( UINT16_MAX < recv_wnd )
			recv_wnd = UINT16_MAX;
		if ( TCP_MAXWIN < recv_wnd )
			recv_wnd = TCP_MAXWIN;
	}
	return sofar;
}
// Send data over the connection and immediately run the transmit loop so
// newly buffered data goes out.
ssize_t TCPSocket::send(ioctx_t* ctx,
                        const uint8_t* buf,
                        size_t count,
                        int flags)
{
	ScopedLock lock(&tcp_lock);
	ssize_t sent = send_unlocked(ctx, buf, count, flags);
	TransmitLoop();
	return sent;
}
// Send from multiple buffers per sendmsg(2), scattering from the caller's
// iovec array, then run the transmit loop.
ssize_t TCPSocket::sendmsg(ioctx_t* ctx,
                           const struct msghdr* msg_ptr,
                           int flags)
{
	struct msghdr msg;
	if ( !ctx->copy_from_src(&msg, msg_ptr, sizeof(msg)) )
		return -1;
	if ( msg.msg_iovlen < 0 || IOV_MAX < msg.msg_iovlen )
		return errno = EINVAL, -1;
	// TODO: os-test if msg_name/msg_namelen/msg_control/msg_controllen are set.
	// Copy the iovec array from userspace.
	size_t iov_size = msg.msg_iovlen * sizeof(struct iovec);
	struct iovec* iov = new struct iovec[msg.msg_iovlen];
	if ( !iov )
		return -1;
	if ( !ctx->copy_from_src(iov, msg.msg_iov, iov_size) )
		return delete[] iov, -1;
	msg.msg_iov = iov;
	kthread_mutex_lock(&tcp_lock);
	ssize_t result = 0;
	for ( int i = 0; i < msg.msg_iovlen && result < SSIZE_MAX; i++ )
	{
		// Cap each send so the total never exceeds SSIZE_MAX.
		size_t maximum = SSIZE_MAX - (size_t) result;
		const uint8_t* buf = (const uint8_t*) iov[i].iov_base;
		size_t count = iov[i].iov_len < maximum ? iov[i].iov_len : maximum;
		ssize_t amount = send_unlocked(ctx, buf, count, flags);
		if ( amount < 0 )
		{
			// Fail only if nothing was sent yet.
			if ( result == 0 )
				result = -1;
			break;
		}
		result += amount;
		// A short write means the outgoing buffer is full right now.
		if ( (size_t) amount != count )
			break;
	}
	TransmitLoop();
	kthread_mutex_unlock(&tcp_lock);
	delete[] iov;
	return result;
}
// Queue up to count bytes from the user buffer into the circular outgoing
// buffer, blocking as needed unless O_NONBLOCK. Returns the number of bytes
// queued, or -1 with errno set; raises SIGPIPE (unless MSG_NOSIGNAL) if the
// connection can no longer be written to.
//
// Fixes: (1) the non-blocking path returned the positive errno value
// (`return errno = EWOULDBLOCK;`) instead of -1, so callers saw a bogus byte
// count; (2) the wait loop condition was inverted (`buffer full OR not
// sendable`), so it could never exit in a non-sendable state — making the
// SIGPIPE/EPIPE path below unreachable and spinning a blocked sender forever
// after a concurrent shutdown. It now waits only while the buffer is full
// AND the state is still sendable.
ssize_t TCPSocket::send_unlocked(ioctx_t* ctx,
                                 const uint8_t* buf,
                                 size_t count,
                                 int flags) // tcp_lock taken
{
	// TODO: MSG_MORE (and implement TCP_CORK), MSG_OOB, MSG_DONTROUTE.
	if ( flags & ~(MSG_NOSIGNAL) )
		return errno = EINVAL, -1;
	if ( sockerr )
		return errno = sockerr, -1;
	if ( state == TCP_STATE_CLOSED ||
	     state == TCP_STATE_LISTEN ||
	     state == TCP_STATE_SYN_SENT ||
	     state == TCP_STATE_SYN_RECV )
		return errno = ENOTCONN, -1;
	size_t sofar = 0;
	while ( sofar < count )
	{
		// Wait for buffer space while the connection remains writable.
		while ( outgoing_used == sizeof(outgoing) &&
		        (state == TCP_STATE_ESTAB || state == TCP_STATE_CLOSE_WAIT) )
		{
			if ( sofar )
				return sofar;
			if ( sockerr )
				return errno = sockerr, -1;
			if ( ctx->dflags & O_NONBLOCK )
				return errno = EWOULDBLOCK, -1;
			if ( !kthread_cond_wait_signal(&transmit_cond, &tcp_lock) )
				return errno = EINTR, -1;
		}
		// The connection was shut down for writing (possibly while waiting).
		if ( state != TCP_STATE_ESTAB && state != TCP_STATE_CLOSE_WAIT )
		{
			if ( !(flags & MSG_NOSIGNAL) )
				CurrentThread()->DeliverSignal(SIGPIPE);
			return errno = EPIPE, -1;
		}
		const uint8_t* data = buf + sofar;
		size_t left = count - sofar;
		// Copy into the circular outgoing buffer, possibly in two parts.
		assert(outgoing_offset < sizeof(outgoing));
		assert(outgoing_used <= sizeof(outgoing));
		size_t available = sizeof(outgoing) - outgoing_used;
		size_t amount = available < left ? available : left;
		size_t newat = outgoing_offset + outgoing_used;
		if ( sizeof(outgoing) <= newat )
			newat -= sizeof(outgoing);
		assert(newat < sizeof(outgoing));
		size_t until_end = sizeof(outgoing) - newat;
		size_t first = until_end < amount ? until_end : amount;
		size_t second = amount - first;
		if ( !ctx->copy_from_src(outgoing + newat, data, first) )
			return sofar ? sofar : -1;
		if ( second && !ctx->copy_from_src(outgoing, data + first, second) )
			return sofar ? sofar : -1;
		outgoing_used += amount;
		assert(outgoing_used <= sizeof(outgoing));
		sofar += amount;
		// TODO: If there's a sent packet that hasn't been acknowledged, and
		//       there isn't a full packet yet, then just buffer and don't
		//       transmit yet.
		// TODO: TCP_NODELAY, TCP_NOPUSH, MSG_MORE.
		// TODO: Set PUSH appropriately.
	}
	return sofar;
}
// read(2) on a TCP socket behaves as recv(2) with no flags.
ssize_t TCPSocket::read(ioctx_t* ctx, uint8_t* buf, size_t count)
{
	ssize_t result = recv(ctx, buf, count, 0);
	return result;
}
// write(2) on a TCP socket behaves as send(2) with no flags.
ssize_t TCPSocket::write(ioctx_t* ctx, const uint8_t* buf, size_t count)
{
	ssize_t result = send(ctx, buf, count, 0);
	return result;
}
short TCPSocket::PollEventStatus()
{
// TODO: os-test the poll bits.
// TODO: OOB poll bits.
short status = 0;
if ( connecting_ready )
status |= POLLIN | POLLRDNORM;
if ( incoming_used || has_fin || shutdown_receive )
status |= POLLIN | POLLRDNORM;
if ( (state == TCP_STATE_ESTAB || state == TCP_STATE_CLOSE_WAIT) &&
outgoing_used < sizeof(outgoing) )
status |= POLLOUT | POLLWRNORM;
if ( state == TCP_STATE_CLOSE_WAIT ||
state == TCP_STATE_LAST_ACK ||
state == TCP_STATE_TIME_WAIT ||
state == TCP_STATE_CLOSED )
status |= POLLHUP;
if ( sockerr )
status |= POLLERR;
return status;
}
// Register for poll(2) events, reporting immediately if any requested event
// is already pending.
int TCPSocket::poll(ioctx_t* /*ctx*/, PollNode* node)
{
	ScopedLock lock(&tcp_lock);
	short pending = PollEventStatus() & node->events;
	if ( !pending )
	{
		// Nothing pending yet; signal the node when something happens.
		poll_channel.Register(node);
		return errno = EAGAIN, -1;
	}
	node->master->revents |= pending;
	return 0;
}
// Read a socket option at the SOL_SOCKET or IPPROTO_TCP level.
int TCPSocket::getsockopt(ioctx_t* ctx, int level, int option_name,
                          void* option_value, size_t* option_size_ptr)
{
	ScopedLock lock(&tcp_lock);
	// SO_BINDTODEVICE is a string option, handled specially.
	if ( level == SOL_SOCKET && option_name == SO_BINDTODEVICE )
	{
		ScopedLock lock(&netifs_lock);
		// An unset or stale interface index yields the empty string.
		const char* ifname = "";
		if ( ifindex < netifs_count && netifs[ifindex] )
			ifname = netifs[ifindex]->ifinfo.name;
		size_t option_size;
		if ( !CopyFromUser(&option_size, option_size_ptr, sizeof(option_size)) )
			return -1;
		size_t len = strlen(ifname);
		size_t size = len + 1;
		if ( option_size < size )
			return errno = ERANGE, -1;
		if ( !CopyToUser(option_value, ifname, size) ||
		     !CopyToUser(option_size_ptr, &size, sizeof(size)) )
			return -1;
		return 0;
	}
	// The remaining options are all integer valued.
	uintmax_t result = 0;
	if ( level == IPPROTO_TCP )
	{
		switch ( option_name )
		{
		// TODO: TCP_NODELAY
		// TODO: TCP_MAXSEG
		// TODO: TCP_NOPUSH
		// TODO: TCP_CORK
		default: return errno = ENOPROTOOPT, -1;
		}
	}
	else if ( level == SOL_SOCKET )
	{
		switch ( option_name )
		{
		case SO_BINDTOINDEX: result = ifindex; break;
		case SO_DEBUG: result = 0; break;
		case SO_DOMAIN: result = af; break;
		case SO_ERROR: result = sockerr; break;
		case SO_PROTOCOL: result = IPPROTO_TCP; break;
		case SO_RCVBUF: result = sizeof(incoming); break;
		case SO_REUSEADDR: result = reuseaddr; break;
		case SO_SNDBUF: result = sizeof(outgoing); break;
		case SO_TYPE: result = SOCK_STREAM; break;
		// TODO: SO_ACCEPTCONN
		// TODO: SO_LINGER
		// TODO: SO_OOBINLINE
		// TODO: SO_RCVLOWAT
		// TODO: SO_RCVTIMEO
		// TODO: SO_SNDLOWAT
		// TODO: SO_SNDTIMEO
		// TODO: SO_DONTROUTE
		// TODO: SO_BROADCAST
		default: return errno = ENOPROTOOPT, -1;
		}
	}
	else
		return errno = EINVAL, -1;
	if ( !sockopt_return_uintmax(result, ctx, option_value, option_size_ptr) )
		return -1;
	return 0;
}
// TODO: os-test socket options on shut down sockets. POSIX says EINVAL.
// TODO: os-test the errno for an invalid protocol.
// TODO: os-test the errno for an invalid option at a protocol level.
// Set a socket option at the SOL_SOCKET or IPPROTO_TCP level.
//
// Fix: when option_size == sizeof(ifname), the NUL termination wrote one
// byte past the end of the ifname stack buffer; in that case the strnlen
// check already guarantees a NUL exists within the buffer, so the write is
// now skipped.
int TCPSocket::setsockopt(ioctx_t* ctx, int level, int option_name,
                          const void* option_value, size_t option_size)
{
	ScopedLock lock(&tcp_lock);
	// SO_BINDTODEVICE is a string option, handled specially.
	if ( level == SOL_SOCKET && option_name == SO_BINDTODEVICE )
	{
		char ifname[IF_NAMESIZE];
		if ( sizeof(ifname) < option_size )
			option_size = sizeof(ifname);
		if ( !CopyFromUser(ifname, option_value, option_size) )
			return -1;
		// Reject a name that fills the whole buffer with no terminator;
		// no interface has a name that long.
		if ( strnlen(ifname, option_size) == sizeof(ifname) )
			return errno = ENODEV, -1;
		// Terminate the string. If option_size == sizeof(ifname), the
		// strnlen check above proved a NUL already exists within the buffer
		// and ifname[option_size] would be out of bounds.
		if ( option_size < sizeof(ifname) )
			ifname[option_size] = '\0';
		ScopedLock lock(&netifs_lock);
		// Search the network interfaces for one with the given name.
		for ( size_t i = 1; i < netifs_count; i++ )
		{
			if ( netifs[i] && !strcmp(ifname, netifs[i]->ifinfo.name) )
			{
				ifindex = i;
				return 0;
			}
		}
		return errno = ENODEV, -1;
	}
	// The remaining options are all integer valued.
	uintmax_t value;
	if ( !sockopt_fetch_uintmax(&value, ctx, option_value, option_size) )
		return -1;
	if ( level == IPPROTO_TCP )
	{
		switch ( option_name )
		{
		case TCP_NODELAY: break; // TODO: Transmit if turned on?
		case TCP_MAXSEG: break; // TODO: Implement this.
		case TCP_NOPUSH: break; // TODO: Implement this.
		// TODO: TCP_CORK
		default:
			return errno = ENOPROTOOPT, -1;
		}
	}
	else if ( level == SOL_SOCKET )
	{
		switch ( option_name )
		{
		case SO_BINDTOINDEX:
			if ( UINT_MAX < value )
				return errno = EINVAL, -1;
			ifindex = value;
			break;
		case SO_DEBUG:
			// Only a zero (off) debug level is supported.
			if ( value != 0 )
				return errno = EPERM, -1;
			break;
		case SO_KEEPALIVE: break; // TODO: Implement this.
		case SO_REUSEADDR: reuseaddr = value; break;
		case SO_LINGER: break; // TODO: Implement this.
		case SO_RCVBUF: break; // TODO: Implement this.
		case SO_SNDBUF: break; // TODO: Implement this.
		// TODO: SO_BROADCAST
		// TODO: SO_DONTROUTE
		// TODO: SO_LINGER
		// TODO: SO_RCVLOWAT
		// TODO: SO_RCVTIMEO
		// TODO: SO_SNDLOWAT
		// TODO: SO_SNDTIMEO
		default: return errno = ENOPROTOOPT, -1;
		}
	}
	else
		return errno = EINVAL, -1;
	return 0;
}
// shutdown(2): take the TCP lock and defer to the unlocked implementation.
int TCPSocket::shutdown(ioctx_t* /*ctx*/, int how)
{
	ScopedLock lock(&tcp_lock);
	int result = shutdown_unlocked(how);
	return result;
}
// Shut down the transmit and/or receive direction per STD 7, RFC 793,
// page 60. SHUT_WR queues a FIN after the outgoing data; SHUT_RD discards
// buffered and future received data.
int TCPSocket::shutdown_unlocked(int how) // tcp_lock taken
{
	// STD 7, RFC 793, page 60.
	if ( state != TCP_STATE_SYN_SENT &&
	     state != TCP_STATE_SYN_RECV &&
	     state != TCP_STATE_ESTAB &&
	     state != TCP_STATE_CLOSE_WAIT )
		return errno = ENOTCONN, -1;
	if ( how & SHUT_WR )
	{
		// STD 7, RFC 793, page 60.
		if ( state == TCP_STATE_SYN_SENT )
			Close();
		else // TCP_STATE_SYN_RECV || TCP_STATE_ESTAB || TCP_STATE_CLOSE_WAIT
		{
			// Queue a FIN for transmission after the buffered data.
			outgoing_fin = TCP_SPECIAL_PENDING;
			// TODO: Should this state transition be delayed until the FIN
			//       enters the window or is sent?
			if ( state == TCP_STATE_CLOSE_WAIT )
				state = TCP_STATE_LAST_ACK /* RFC 1122, 4.2.2.20 (a), page 93 */;
			else
				state = TCP_STATE_FIN_WAIT_1;
			kthread_cond_broadcast(&transmit_cond);
			TransmitLoop();
		}
	}
	if ( how & SHUT_RD )
	{
		// Wake up receivers so they observe end of stream.
		shutdown_receive = true;
		kthread_cond_broadcast(&receive_cond);
	}
	return 0;
}
// Copy the remote address to userspace, truncated to the caller's buffer
// size per the socket's address family.
int TCPSocket::getpeername(ioctx_t* ctx, uint8_t* addr, size_t* addrsize_ptr)
{
	ScopedLock lock(&tcp_lock);
	// Listening sockets have no single peer.
	if ( !remoted || state == TCP_STATE_LISTEN )
		return errno = ENOTCONN, -1;
	size_t addrsize;
	if ( !ctx->copy_from_src(&addrsize, addrsize_ptr, sizeof(addrsize)) )
		return -1;
	if ( af == AF_INET )
	{
		if ( sizeof(remote.in) < addrsize )
			addrsize = sizeof(remote.in);
	}
	else if ( af == AF_INET6 )
	{
		if ( sizeof(remote.in6) < addrsize )
			addrsize = sizeof(remote.in6);
	}
	else
		return errno = EAFNOSUPPORT, -1;
	if ( !ctx->copy_to_dest(addr, &remote, addrsize) )
		return -1;
	if ( !ctx->copy_to_dest(addrsize_ptr, &addrsize, sizeof(addrsize)) )
		return -1;
	return 0;
}
// Copy the local address to userspace, truncated to the caller's buffer
// size per the socket's address family.
int TCPSocket::getsockname(ioctx_t* ctx, uint8_t* addr, size_t* addrsize_ptr)
{
	ScopedLock lock(&tcp_lock);
	size_t addrsize;
	if ( !ctx->copy_from_src(&addrsize, addrsize_ptr, sizeof(addrsize)) )
		return -1;
	if ( af == AF_INET )
	{
		if ( sizeof(local.in) < addrsize )
			addrsize = sizeof(local.in);
	}
	else if ( af == AF_INET6 )
	{
		if ( sizeof(local.in6) < addrsize )
			addrsize = sizeof(local.in6);
	}
	else
		return errno = EAFNOSUPPORT, -1;
	if ( !ctx->copy_to_dest(addr, &local, addrsize) )
		return -1;
	if ( !ctx->copy_to_dest(addrsize_ptr, &addrsize, sizeof(addrsize)) )
		return -1;
	return 0;
}
// TODO: os-test fstat on a socket.
// Inode wrapper giving a TCPSocket a file descriptor interface. Marks the
// socket as referenced so it is not timed out or destroyed while the inode
// is alive (see TCPSocketNode::~TCPSocketNode).
TCPSocketNode::TCPSocketNode(TCPSocket* socket)
{
	this->socket = socket;
	socket->is_referenced = true;
	Process* process = CurrentProcess();
	inode_type = INODE_TYPE_STREAM;
	// The inode's own address serves as its unique identity.
	dev = (dev_t) this;
	ino = (ino_t) this;
	type = S_IFSOCK;
	// The socket is owned by the creating process's user and group.
	kthread_mutex_lock(&process->idlock);
	stat_uid = process->uid;
	stat_gid = process->gid;
	kthread_mutex_unlock(&process->idlock);
	stat_mode = 0600 | this->type;
}
// Drop the inode's reference to the socket, which may destroy the socket or
// schedule its timed-out destruction.
TCPSocketNode::~TCPSocketNode()
{
	socket->Unreference();
}
// Delegates accept4(2) to the underlying TCPSocket.
Ref<Inode> TCPSocketNode::accept4(ioctx_t* ctx, uint8_t* addr, size_t* addrsize,
                                  int flags)
{
	return socket->accept4(ctx, addr, addrsize, flags);
}
// Forward bind to the underlying TCP socket.
int TCPSocketNode::bind(ioctx_t* ctx, const uint8_t* addr, size_t addrsize)
{
	return socket->bind(ctx, addr, addrsize);
}
// Forward connect to the underlying TCP socket.
int TCPSocketNode::connect(ioctx_t* ctx, const uint8_t* addr, size_t addrsize)
{
	return socket->connect(ctx, addr, addrsize);
}
// Forward listen to the underlying TCP socket.
int TCPSocketNode::listen(ioctx_t* ctx, int backlog)
{
	return socket->listen(ctx, backlog);
}
// Forward recv to the underlying TCP socket.
ssize_t TCPSocketNode::recv(ioctx_t* ctx, uint8_t* buf, size_t count, int flags)
{
	return socket->recv(ctx, buf, count, flags);
}
// Forward recvmsg to the underlying TCP socket.
ssize_t TCPSocketNode::recvmsg(ioctx_t* ctx, struct msghdr* msg, int flags)
{
	return socket->recvmsg(ctx, msg, flags);
}
// Forward send to the underlying TCP socket.
ssize_t TCPSocketNode::send(ioctx_t* ctx, const uint8_t* buf, size_t count,
                            int flags)
{
	return socket->send(ctx, buf, count, flags);
}
// Forward sendmsg to the underlying TCP socket.
ssize_t TCPSocketNode::sendmsg(ioctx_t* ctx, const struct msghdr* msg,
                               int flags)
{
	return socket->sendmsg(ctx, msg, flags);
}
// Forward read to the underlying TCP socket.
ssize_t TCPSocketNode::read(ioctx_t* ctx, uint8_t* buf, size_t count)
{
	return socket->read(ctx, buf, count);
}
// Forward write to the underlying TCP socket.
ssize_t TCPSocketNode::write(ioctx_t* ctx, const uint8_t* buf, size_t count)
{
	return socket->write(ctx, buf, count);
}
// Forward poll to the underlying TCP socket.
int TCPSocketNode::poll(ioctx_t* ctx, PollNode* node)
{
	return socket->poll(ctx, node);
}
// Forward getsockopt to the underlying TCP socket.
int TCPSocketNode::getsockopt(ioctx_t* ctx, int level, int option_name,
                              void* option_value, size_t* option_size_ptr)
{
	return socket->getsockopt(ctx, level, option_name, option_value,
	                          option_size_ptr);
}
// Forward setsockopt to the underlying TCP socket.
int TCPSocketNode::setsockopt(ioctx_t* ctx, int level, int option_name,
                              const void* option_value, size_t option_size)
{
	return socket->setsockopt(ctx, level, option_name, option_value,
	                          option_size);
}
// Forward shutdown to the underlying TCP socket.
int TCPSocketNode::shutdown(ioctx_t* ctx, int how)
{
	return socket->shutdown(ctx, how);
}
// Forward getpeername to the underlying TCP socket.
int TCPSocketNode::getpeername(ioctx_t* ctx, uint8_t* addr, size_t* addrsize)
{
	return socket->getpeername(ctx, addr, addrsize);
}
// Forward getsockname to the underlying TCP socket.
int TCPSocketNode::getsockname(ioctx_t* ctx, uint8_t* addr, size_t* addrsize)
{
	return socket->getsockname(ctx, addr, addrsize);
}
// Handle a TCP segment received over IPv4, delivering it to the best matching
// socket bound to the destination port. src/dst are the addresses from the IP
// header; dst_broadcast is whether the destination was a broadcast address.
void HandleIP(Ref<Packet> pkt,
              const struct in_addr* src,
              const struct in_addr* dst,
              bool dst_broadcast)
{
	// TCP is point to point: segments cannot come from the any address nor
	// be addressed to a broadcast address.
	if ( src->s_addr == htobe32(INADDR_ANY) )
		return;
	if ( dst_broadcast )
		return;
	const unsigned char* in = pkt->from + pkt->offset;
	size_t inlen = pkt->length - pkt->offset;
	struct tcphdr hdr;
	if ( inlen < sizeof(hdr) )
		return;
	// The TCP pseudo header length field is 16-bit.
	if ( UINT16_MAX < inlen )
		return;
	memcpy(&hdr, in, sizeof(hdr));
	hdr.th_sport = be16toh(hdr.th_sport);
	hdr.th_dport = be16toh(hdr.th_dport);
	// Verify the checksum over the pseudo header and the raw segment. The
	// checksum field itself is included as transmitted (no byte swap
	// needed), so a correct segment sums to all zeroes or all ones.
	uint16_t sum = 0;
	sum = IP::ipsum_buf(sum, src, sizeof(struct in_addr));
	sum = IP::ipsum_buf(sum, dst, sizeof(struct in_addr));
	sum = IP::ipsum_word(sum, IPPROTO_TCP);
	sum = IP::ipsum_word(sum, inlen);
	sum = IP::ipsum_buf(sum, in, inlen);
	if ( sum != 0 && sum != 0xFFFF )
		return;
	// The data offset must cover at least the fixed header and must not
	// point past the end of the segment.
	if ( TCP_OFFSET_DECODE(hdr.th_offset) < sizeof(hdr) / 4 ||
	     inlen < (size_t) TCP_OFFSET_DECODE(hdr.th_offset) * 4 )
		return;
	// Port 0 is not valid.
	if ( hdr.th_sport == 0 || hdr.th_dport == 0 )
		return;
	// TODO: TCP options. Respect TCPOPT_MAXSEG.
	TCPSocket* socket = NULL;
	TCPSocket* socket_listener = NULL;
	TCPSocket* any_socket_listener = NULL;
	ScopedLock lock(&tcp_lock);
	for ( TCPSocket* iter = bindings_v4[hdr.th_dport];
	      !socket && iter;
	      iter = iter->next_socket )
	{
		// TODO: If a TCP socket is bound, and then connected to, what happens?
		// What if the TCP socket then connects to the other side?
		if ( !iter->remoted )
			continue;
		// The datagram was sent to the socket's local address.
		if ( !memcmp(&iter->local.in.sin_addr, dst, sizeof(*dst)) )
		{
			// The first priority is to receive on a socket with the correct
			// local address and the correct remote address.
			if ( !memcmp(&iter->remote.in.sin_addr, src, sizeof(*src)) &&
			     be16toh(iter->remote.in.sin_port) == hdr.th_sport )
				socket = iter;
			// The second priority is to receive on a socket with the correct
			// local address and listening for connections from any address.
			else if ( iter->remote.in.sin_addr.s_addr == htobe32(INADDR_ANY) )
				socket_listener = iter;
		}
		// The socket is bound to the any address.
		if ( iter->local.in.sin_addr.s_addr == htobe32(INADDR_ANY) )
		{
			// The third priority is to receive on a socket bound to the any
			// address and listening for connections from any address.
			if ( iter->remote.in.sin_addr.s_addr == htobe32(INADDR_ANY) )
				any_socket_listener = iter;
		}
	}
	if ( !socket )
		socket = socket_listener;
	if ( !socket )
		socket = any_socket_listener;
	// No socket wanted to receive the packet.
	if ( !socket )
	{
		// TODO: Send RST.
		return;
	}
	// If the socket is bound to a network interface, require the packet to
	// have been received on that network interface.
	if ( socket->ifindex && socket->ifindex != pkt->netif->ifinfo.linkid )
	{
		// TODO: Send RST.
		return;
	}
	// Reconstruct the segment's source and destination socket addresses
	// for the socket's receive path.
	union tcp_sockaddr pkt_src;
	pkt_src.in.sin_family = AF_INET;
	pkt_src.in.sin_addr = *src;
	pkt_src.in.sin_port = htobe16(hdr.th_sport);
	union tcp_sockaddr pkt_dst;
	pkt_dst.in.sin_family = AF_INET;
	pkt_dst.in.sin_addr = *dst;
	pkt_dst.in.sin_port = htobe16(hdr.th_dport);
	// Receive the packet on the socket.
	socket->ReceivePacket(pkt, &pkt_src, &pkt_dst);
	// Delete the socket if the received segment left it unreferenced and
	// fully closed.
	if ( socket->can_destroy() )
		delete socket;
}
// Create a new TCP socket inode for the given address family.
Ref<Inode> Socket(int af)
{
	// Refuse address families this TCP implementation does not handle.
	if ( !IsSupportedAddressFamily(af) )
		return errno = EAFNOSUPPORT, Ref<Inode>(NULL);
	TCPSocket* socket = new TCPSocket(af);
	if ( !socket )
		return Ref<Inode>();
	Ref<TCPSocketNode> node(new TCPSocketNode(socket));
	if ( !node )
	{
		// The node takes ownership only on success; free the socket here.
		delete socket;
		return Ref<Inode>();
	}
	return node;
}
} // namespace TCP
} // namespace Sortix