sortix-mirror/kernel/net/ip.cpp

454 lines
15 KiB
C++
Raw Normal View History

Add networking stack. This change adds all the kernel parts of a network stack. The network stack is partial but implements many of the important parts. Add if(4) network interface abstraction. Network interfaces are registered in a global list that can be iterated and each assigned a unique integer identifier. Add reference counted packets with a cache that recycles recent packets. Add support for lo(4) loopback and ether(4) ethernet network interfaces. The /dev/lo0 loopback device is created automatically on boot. Add arp(4) address resolution protocol driver for translation of inet(4) network layer addresses into ether(4) link layer addresses. arp(4) entries are cached and evicted from the cache when needed or when the entry has not been used for a while. The cache is limited to 256 entries for now. Add ip(4) internet protocol version 4 support. IP fragmentation and options are not implemented yet. Add tcp(4) transmission control protocol sockets for a reliable transport layer protocol that provides a reliable byte stream connection between two hosts. The implementation is incomplete and does not yet implement out of band data, options, and high performance extensions. Add udp(4) user datagram protocol sockets for a connectionless transport layer that provides best-effort delivery of datagrams. Add ping(4) sockets for a best-effort delivery of echo datagrams. Change type of sa_family_t from unsigned short to uint16_t. Add --disable-network-drivers to the kernel(7) options and expose it with a bootloader menu. tix-iso-bootconfig can set this option by default. Import CRC32 code from libz for the Ethernet checksum. This is a compatible ABI change that adds features to socket(2) (AF_INET, IPPROTO_TCP, IPPROTO_UDP, IPPROTO_PING), the ioctls for if(4), socket options, and the lo0 loopback interface. This commit is based on work by Meisaka Yukara contributed as the commit bbf7f1e8a5238a2bd1fe8eb1d2cc5c9c2421e2c4. 
Almost no lines of this work remain in this final commit as it has been rewritten or refactored away over the years, see the individual file headers for which files contain remnants of this work. Co-authored-by: Meisaka Yukara <Meisaka.Yukara@gmail.com>
2022-12-04 23:35:21 +00:00
/*
* Copyright (c) 2016, 2017, 2018 Jonas 'Sortie' Termansen.
*
* Permission to use, copy, modify, and distribute this software for any
* purpose with or without fee is hereby granted, provided that the above
* copyright notice and this permission notice appear in all copies.
*
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
*
* net/ip.cpp
* Internet Protocol Version 4.
*/
#include <sys/socket.h>
#include <assert.h>
#include <endian.h>
#include <errno.h>
#include <netinet/if_ether.h>
#include <netinet/in.h>
#include <stdint.h>
#include <timespec.h>
#include <sortix/kernel/kernel.h>
#include <sortix/kernel/if.h>
#include <sortix/kernel/packet.h>
#include <sortix/kernel/refcount.h>
#include "arp.h"
#include "ether.h"
#include "ping.h"
#include "tcp.h"
#include "udp.h"
namespace Sortix {
namespace IP {
// Fixed part of an IPv4 header (20 bytes of fields, no options), laid out in
// the order the fields appear in a packet. Handle copies this straight out of
// the packet and Send copies it straight in, so the 16-bit fields hold network
// byte order values while the header is in packet form.
struct ipv4
{
// Protocol version in the high nibble and header length (counted in 32-bit
// words) in the low nibble; see IPV4_VERSION and IPV4_IHL.
uint8_t version_ihl;
// Send always sets this to zero and Handle does not inspect it.
uint8_t dscp_ecn;
// Total datagram length in bytes, header included.
uint16_t length;
// Send always sets this to zero for now and Handle does not inspect it.
uint16_t identification;
// Fragment offset in the low 13 bits and flags in the top three bits; see the
// IPV4_FRAGMENT macros.
uint16_t fragment;
// Time to live.
uint8_t ttl;
// Payload protocol number (compared against IPPROTO_ICMP, IPPROTO_TCP and
// IPPROTO_UDP by Handle).
uint8_t protocol;
// Internet checksum of the header, computed with this field set to zero.
uint16_t checksum;
// Source and destination addresses as raw bytes in network byte order.
uint8_t source[4];
uint8_t destination[4];
};
// Accessors and constructors for the packed fields of struct ipv4.
//
// version_ihl: the header length (in 32-bit words) is the low nibble and the
// protocol version is the high nibble.
#define IPV4_IHL(x) ((x) >> 0 & 0xF)
#define IPV4_VERSION(x) ((x) >> 4 & 0xF)
#define IPV4_IHL_MAKE(x) (((x) & 0xF) << 0)
#define IPV4_VERSION_MAKE(x) (((x) & 0xF) << 4)
// fragment: these apply to the value after conversion to host byte order (as
// Handle does with be16toh). The low 13 bits are the fragment offset.
#define IPV4_FRAGMENT(x) (((x) >> 0) & 0x1FFF)
#define IPV4_FRAGMENT_MAKE(x) (((x) & 0x1FFF) << 0)
// Flag bits above the fragment offset: bit 13 marks that more fragments
// follow, bit 14 is the don't-fragment flag, and bit 15 is the remaining top
// bit (presumably named after the RFC 3514 "evil bit" -- confirm).
#define IPV4_FRAGMENT_MORE (1 << (13 + 0))
#define IPV4_FRAGMENT_DONT (1 << (13 + 1))
#define IPV4_FRAGMENT_EVIL (1 << (13 + 2))
// Add a 16-bit word to a running Internet checksum using one's complement
// addition: any carry out of the top bit is wrapped around and added back into
// the low bit.
uint16_t ipsum_word(uint16_t sum, uint16_t word)
{
	uint32_t total = (uint32_t) sum + (uint32_t) word;
	// The carry is either 0 or 1 since both operands fit in 16 bits.
	uint32_t carry = total >> 16;
	return (uint16_t) ((total + carry) & 0xFFFF);
}
// Fold a buffer into a running Internet checksum. The buffer is read as a
// sequence of big-endian 16-bit words. A trailing odd byte is treated as the
// high byte of a final word whose low byte is zero, which is only correct if
// this buffer is the last one being summed.
uint16_t ipsum_buf(uint16_t sum, const void* bufptr, size_t size)
{
	const uint8_t* bytes = (const uint8_t*) bufptr;
	size_t pairs = size / 2;
	for ( size_t n = 0; n < pairs; n++ )
	{
		uint16_t word = (uint16_t) (bytes[2 * n] << 8 | bytes[2 * n + 1]);
		sum = ipsum_word(sum, word);
	}
	if ( size % 2 )
	{
		uint16_t tail = (uint16_t) (bytes[size - 1] << 8);
		sum = ipsum_word(sum, tail);
	}
	return sum;
}
// Finalize a running Internet checksum by taking the one's complement (every
// bit inverted) of the accumulated 16-bit sum.
uint16_t ipsum_finish(uint16_t sum)
{
	return (uint16_t) (sum ^ 0xFFFF);
}
// Compute the complete Internet checksum of a single buffer: sum it starting
// from zero and then finalize the result.
uint16_t ipsum(const void* bufptr, size_t size)
{
	return ipsum_finish(ipsum_buf(0, bufptr, size));
}
// Select the network interface that a packet from src to dst should be sent
// on.
//
// src is the requested source address, or the any address if the caller has no
// preference. dst is the destination address. Both are in network byte order.
// ifindex is the index of the interface to use, or 0 if the caller has no
// preference.
//
// Returns the chosen interface, or NULL with errno set to ENETUNREACH,
// EADDRNOTAVAIL or ENETDOWN if no interface is suitable. netifs_lock is held
// while the interface list is examined, and each interface's cfg_lock is held
// while its configuration is read.
// NOTE(review): the returned pointer is used by callers after netifs_lock has
// been released -- confirm interfaces are never unregistered.
static NetworkInterface* LocateInterface(const struct in_addr* src,
const struct in_addr* dst,
unsigned int ifindex)
{
ScopedLock ifs_lock(&netifs_lock);
in_addr_t any_ip = htobe32(INADDR_ANY);
in_addr_t broadcast_ip = htobe32(INADDR_BROADCAST);
// Refuse to route to the any address.
if ( !memcmp(&any_ip, dst, sizeof(in_addr_t)) )
return errno = ENETUNREACH, (NetworkInterface*) NULL;
// If src is set, but ifindex is not set, search for a fitting interface.
if ( !ifindex && memcmp(&any_ip, src, sizeof(in_addr_t)) != 0 )
{
// Index 0 means "no interface", so the search starts at index 1.
for ( unsigned int i = 1; i < netifs_count; i++ )
{
NetworkInterface* netif = netifs[i];
if ( !netif )
continue;
ScopedLock cfg_lock(&netif->cfg_lock);
if ( memcmp(&netif->cfg.inet.address, src, sizeof(in_addr_t)) != 0 )
continue;
// Remember the match; the checks below then run as if the caller had
// asked for this interface.
ifindex = i;
break;
}
// No interface had the correct address.
if ( !ifindex )
return errno = EADDRNOTAVAIL, (NetworkInterface*) NULL;
}
// If ifindex is set, route to that interface.
if ( ifindex )
{
// Can't route to non-existent interface.
if ( netifs_count <= ifindex )
return errno = EADDRNOTAVAIL, (NetworkInterface*) NULL;
NetworkInterface* netif = netifs[ifindex];
if ( !netif )
return errno = EADDRNOTAVAIL, (NetworkInterface*) NULL;
ScopedLock cfg_lock(&netif->cfg_lock);
// Can't route to down interfaces.
if ( !(netif->ifstatus.flags & IF_STATUS_FLAGS_UP) )
return errno = ENETDOWN, (NetworkInterface*) NULL;
// If src is set, it must be the interface's address.
if ( memcmp(src, &any_ip, sizeof(in_addr_t)) != 0 &&
memcmp(src, &netif->cfg.inet.address, sizeof(in_addr_t)) != 0 )
return errno = EADDRNOTAVAIL, (NetworkInterface*) NULL;
// The subnet and loopback comparisons below are done in host byte order.
in_addr_t dstaddr = be32toh(dst->s_addr);
in_addr_t ifaddr = be32toh(netif->cfg.inet.address.s_addr);
in_addr_t subnet = be32toh(netif->cfg.inet.subnet.s_addr);
in_addr_t loopaddr = INADDR_LOOPBACK;
in_addr_t loopmask = INADDR_LOOPMASK;
if ( netif->ifinfo.type == IF_TYPE_LOOPBACK )
{
// The destination must be on the interface's subnet.
if ( (dstaddr & subnet) != (ifaddr & subnet) )
return errno = ENETUNREACH, (NetworkInterface*) NULL;
return netif;
}
else
{
// The destination must not be on the loopback network for
// a non-loopback interface.
if ( (dstaddr & loopmask) == (loopaddr & loopmask) )
return errno = ENETUNREACH, (NetworkInterface*) NULL;
// If the interface does not have a default route, the destination
// must be broadcast or be on the interface's subnet.
// NOTE(review): dstaddr is in host byte order while broadcast_ip is in
// network byte order; presumably this works because INADDR_BROADCAST is all
// ones and thus reads the same in either order -- confirm.
if ( !memcmp(&netif->cfg.inet.router, &any_ip, sizeof(in_addr_t)) &&
memcmp(&dstaddr, &broadcast_ip, sizeof(in_addr_t)) != 0 &&
(dstaddr & subnet) != (ifaddr & subnet) )
return errno = ENETUNREACH, (NetworkInterface*) NULL;
return netif;
}
}
// If the destination is broadcast, send to the first fitting interface.
else if ( !memcmp(&broadcast_ip, dst, sizeof(in_addr_t)) )
{
for ( unsigned int i = 1; i < netifs_count; i++ )
{
NetworkInterface* netif = netifs[i];
if ( !netif )
continue;
ScopedLock cfg_lock(&netif->cfg_lock);
// Can't route broadcast to loopback interfaces or down interfaces.
if ( netif->ifinfo.type == IF_TYPE_LOOPBACK ||
!(netif->ifstatus.flags & IF_STATUS_FLAGS_UP) )
continue;
return netif;
}
// No interface was suitable for broadcast.
return errno = EADDRNOTAVAIL, (NetworkInterface*) NULL;
}
// Otherwise, pick the best interface for the destination address.
else
{
// The first interface that is up and has a default route, used only if no
// interface has the destination on its own subnet.
NetworkInterface* default_netif = NULL;
for ( unsigned int i = 1; i < netifs_count; i++ )
{
NetworkInterface* netif = netifs[i];
if ( !netif )
continue;
ScopedLock cfg_lock(&netif->cfg_lock);
in_addr_t dstaddr = be32toh(dst->s_addr);
in_addr_t ifaddr = be32toh(netif->cfg.inet.address.s_addr);
in_addr_t subnet = be32toh(netif->cfg.inet.subnet.s_addr);
// Route to the interface if the destination is on its subnet.
if ( (dstaddr & subnet) == (ifaddr & subnet) )
{
// Can't route to down interfaces.
// The search stops here rather than trying the remaining interfaces.
if ( !(netif->ifstatus.flags & IF_STATUS_FLAGS_UP) )
return errno = ENETDOWN, (NetworkInterface*) NULL;
return netif;
}
// If the interface is up, no default route has been found yet, and
// the interface has a default route, default to that route if no
// better interface is found.
else if ( (netif->ifstatus.flags & IF_STATUS_FLAGS_UP) &&
!default_netif &&
memcmp(&any_ip, &netif->cfg.inet.router,
sizeof(in_addr_t)) != 0 )
default_netif = netif;
}
// If a fitting default route was found, use it.
if ( default_netif )
return default_netif;
// No interface was up that could accept the destination address, hence
// the network is down.
return errno = ENETDOWN, (NetworkInterface*) NULL;
}
}
// Decide whether an incoming IPv4 packet should be processed by this host.
//
// pkt is the received packet, whose netif is the interface it arrived on. src
// and dst are the addresses from the IPv4 header in network byte order.
// dst_broadcast tells whether the link layer delivered the packet as a
// broadcast. *out_broadcast is set to whether the IPv4 destination is a
// broadcast address; it is only written once the source address checks have
// passed, so it is meaningful only when true is returned.
//
// Returns true if the packet should be handled and false if it should be
// dropped.
static bool ShouldHandlePacket(Ref<Packet> pkt,
                               const struct in_addr* src,
                               const struct in_addr* dst,
                               bool dst_broadcast,
                               bool* out_broadcast)
{
	NetworkInterface* iface = pkt->netif;
	ScopedLock lock(&iface->cfg_lock);

	const in_addr_t limited_broadcast = htobe32(INADDR_BROADCAST);
	const in_addr_t unspecified = htobe32(INADDR_ANY);
	// The interface's address with all the host bits set.
	const in_addr_t subnet_broadcast =
		iface->cfg.inet.address.s_addr | ~iface->cfg.inet.subnet.s_addr;

	// A broadcast address is never a valid source, whether it is the limited
	// broadcast address or the subnet's own (RFC 1122 3.2.1.3).
	if ( memcmp(src, &limited_broadcast, sizeof(in_addr_t)) == 0 )
		return false;
	if ( memcmp(&subnet_broadcast, src, sizeof(in_addr_t)) == 0 )
		return false;

	// 127.0.0.0/8 may only appear on loopback interfaces.
	if ( iface->ifinfo.type != IF_TYPE_LOOPBACK )
	{
		const in_addr_t loopnet = INADDR_LOOPBACK & INADDR_LOOPMASK;
		bool src_on_loopnet =
			(be32toh(src->s_addr) & INADDR_LOOPMASK) == loopnet;
		bool dst_on_loopnet =
			(be32toh(dst->s_addr) & INADDR_LOOPMASK) == loopnet;
		if ( src_on_loopnet || dst_on_loopnet )
			return false;
	}

	*out_broadcast = false;

	// The limited broadcast address is accepted even on an unconfigured
	// interface.
	if ( memcmp(dst, &limited_broadcast, sizeof(in_addr_t)) == 0 )
	{
		*out_broadcast = true;
		return true;
	}

	// Everything else requires the interface to have an address.
	if ( memcmp(&iface->cfg.inet.address, &unspecified,
	            sizeof(in_addr_t)) == 0 )
		return false;

	// Unicast to our own address, unless it arrived as a link layer broadcast.
	if ( !dst_broadcast &&
	     memcmp(&iface->cfg.inet.address, dst, sizeof(in_addr_t)) == 0 )
		return true;

	// Broadcast to our subnet.
	if ( memcmp(&subnet_broadcast, dst, sizeof(in_addr_t)) == 0 )
	{
		*out_broadcast = true;
		return true;
	}

	return false;
}
// Process an incoming IPv4 datagram found at pkt->offset in pkt.
//
// The header is validated (version, header length, checksum, total length),
// the addresses are checked with ShouldHandlePacket, and the payload is then
// handed to the ping, TCP or UDP layer according to the protocol field.
// Malformed packets, packets not meant for this host, fragments, and packets
// for other protocols are silently dropped.
//
// The link layer source and destination addresses are unused. dst_broadcast
// tells whether the link layer delivered the packet as a broadcast.
void Handle(Ref<Packet> pkt,
            const struct ether_addr* /*src*/,
            const struct ether_addr* /*dst*/,
            bool dst_broadcast)
{
	struct ipv4 hdr;
	size_t pkt_remain = pkt->length - pkt->offset;
	// The packet has to be large enough to contain a header.
	if ( pkt_remain < sizeof(hdr) )
		return;
	const unsigned char* raw = pkt->from + pkt->offset;
	memcpy(&hdr, raw, sizeof(hdr));
	// Verify the packet is Internet Protocol Version 4.
	if ( IPV4_VERSION(hdr.version_ihl) != 4 )
		return;
	// Verify the relation:
	// sizeof(hdr) <= ihl <= hdr.length <= pkt_remain
	size_t ihl = 4 * IPV4_IHL(hdr.version_ihl);
	// Verify the header length isn't smaller than the minimum header.
	if ( ihl < sizeof(hdr) )
		return;
	// Verify the whole header, including any options, is inside the packet
	// before it is checksummed.
	if ( pkt_remain < ihl )
		return;
	// Verify the header's checksum is correct. The checksum covers the entire
	// header including options, so it is computed over ihl bytes of the packet
	// rather than just the fixed part, otherwise every valid packet carrying
	// options would be rejected.
	if ( ipsum(raw, ihl) != 0 )
		return;
	hdr.length = be16toh(hdr.length);
	hdr.identification = be16toh(hdr.identification);
	hdr.fragment = be16toh(hdr.fragment);
	hdr.checksum = be16toh(hdr.checksum);
	// Verify total length isn't smaller than the header length.
	if ( hdr.length < ihl )
		return;
	// Verify the packet length isn't smaller than the datagram.
	if ( pkt_remain < hdr.length )
		return;
	// Drop the packet if we shouldn't handle it.
	bool in_dst_broadcast;
	const struct in_addr* in_src = (const struct in_addr*) &hdr.source;
	const struct in_addr* in_dst = (const struct in_addr*) &hdr.destination;
	if ( !ShouldHandlePacket(pkt, in_src, in_dst, dst_broadcast,
	                         &in_dst_broadcast) )
		return;
	// TODO: IP options. They are currently skipped without being interpreted.
	// TODO: Reassemble fragmented packets.
	if ( IPV4_FRAGMENT(hdr.fragment) )
		return;
	if ( hdr.fragment & IPV4_FRAGMENT_MORE )
		return;
	// Trim the packet to the length according to the header, in case the packet
	// was smaller than the link layer protocol's minimum transmission unit and
	// the packet was padded by zeroes.
	size_t truncated_length = pkt->offset + hdr.length;
	if ( pkt->length < truncated_length )
		return;
	pkt->length = truncated_length;
	// Advance past the header and its options to the payload.
	pkt->offset += ihl;
	if ( hdr.protocol == IPPROTO_ICMP )
		Ping::HandleIP(pkt, in_src, in_dst, in_dst_broadcast);
	else if ( hdr.protocol == IPPROTO_TCP )
		TCP::HandleIP(pkt, in_src, in_dst, in_dst_broadcast);
	else if ( hdr.protocol == IPPROTO_UDP )
		UDP::HandleIP(pkt, in_src, in_dst, in_dst_broadcast);
}
// Send the payload in pktin as an IPv4 datagram from src to dst.
//
// pktin holds the payload from its start through pktin->length. src and dst
// are in network byte order. protocol is the value of the header's protocol
// field. ifindex is the interface to send on, or 0 to let LocateInterface pick
// one. broadcast tells whether the caller is permitted to send to a broadcast
// address.
//
// Returns true if the packet was passed on to the link layer and false
// otherwise. This function itself sets errno to EMSGSIZE if the payload does
// not fit in a packet, EAFNOSUPPORT if the interface is neither loopback nor
// ethernet, ENETUNREACH if there is no route to the destination, and EACCES if
// the destination is broadcast but broadcast is false. Failures reported by
// LocateInterface are passed on as well.
bool Send(Ref<Packet> pktin,
          const struct in_addr* src,
          const struct in_addr* dst,
          uint8_t protocol,
          unsigned int ifindex,
          bool broadcast)
{
	Ref<Packet> pkt = GetPacket();
	if ( !pkt )
		return false;
	size_t mtu = pkt->pmap.size;
	// This function returns bool, so failure must be reported as false. A -1
	// here would convert to true and report an unsent packet as sent.
	if ( mtu < sizeof(struct ipv4) ||
	     mtu - sizeof(struct ipv4) < pktin->length )
		return errno = EMSGSIZE, false;
	pkt->length = sizeof(struct ipv4) + pktin->length;
	unsigned char* in = pktin->from;
	unsigned char* out = pkt->from;
	struct ipv4 hdr;
	hdr.version_ihl = IPV4_VERSION_MAKE(4) | IPV4_IHL_MAKE(5);
	hdr.dscp_ecn = 0;
	hdr.length = htobe16(pkt->length);
	hdr.identification = htobe16(0); // TODO: Assign identification to packets.
	hdr.fragment = htobe16(0);
	hdr.ttl = 0x40; // TODO: This should be configurable.
	hdr.protocol = protocol;
	// The checksum is computed with the checksum field itself set to zero.
	hdr.checksum = 0;
	memcpy(hdr.source, src, sizeof(struct in_addr));
	memcpy(hdr.destination, dst, sizeof(struct in_addr));
	hdr.checksum = htobe16(ipsum(&hdr, sizeof(hdr)));
	memcpy(out, &hdr, sizeof(hdr));
	memcpy(out + sizeof(struct ipv4), in, pktin->length);
	NetworkInterface* netif = LocateInterface(src, dst, ifindex);
	if ( !netif )
		return false;
	// Loopback has no link layer addresses, so all zero addresses are used.
	if ( netif->ifinfo.type == IF_TYPE_LOOPBACK )
	{
		struct ether_addr localaddr;
		memset(&localaddr, 0, sizeof(localaddr));
		return Ether::Send(pkt, &localaddr, &localaddr, ETHERTYPE_IP, netif);
	}
	if ( netif->ifinfo.type != IF_TYPE_ETHERNET )
		return errno = EAFNOSUPPORT, false;
	// Take a snapshot of the interface configuration so the lock isn't held
	// while the packet is being sent.
	kthread_mutex_lock(&netif->cfg_lock);
	in_addr_t dst_ip = dst->s_addr;
	in_addr_t address_ip = netif->cfg.inet.address.s_addr;
	in_addr_t router_ip = netif->cfg.inet.router.s_addr;
	in_addr_t subnet_ip = netif->cfg.inet.subnet.s_addr;
	in_addr_t broadcast_ip =
		netif->cfg.inet.address.s_addr | ~netif->cfg.inet.subnet.s_addr;
	struct ether_addr ether_src = netif->cfg.ether.address;
	kthread_mutex_unlock(&netif->cfg_lock);
	// If the destination is broadcast, send an ethernet broadcast. No route
	// needs to be resolved in this case.
	bool dst_is_broadcast =
		dst_ip == htobe32(INADDR_BROADCAST) || dst_ip == broadcast_ip;
	if ( dst_is_broadcast )
	{
		if ( !broadcast )
			return errno = EACCES, false;
		return Ether::Send(pkt, &ether_src, &etheraddr_broadcast, ETHERTYPE_IP,
		                   netif);
	}
	struct in_addr route;
	// Route directly to the destination if the destination is on the subnet.
	if ( (dst_ip & subnet_ip) == (address_ip & subnet_ip) &&
	     dst_ip != address_ip )
		memcpy(&route, dst, sizeof(route));
	// Route to the default route if any.
	else if ( router_ip != htobe32(INADDR_ANY) )
		memcpy(&route, &router_ip, sizeof(route));
	// Otherwise the network is unreachable.
	else
		return errno = ENETUNREACH, false;
	return ARP::RouteIPEthernet(netif, pkt, &route);
}
bool GetSourceIP(const struct in_addr* src,
const struct in_addr* dst,
struct in_addr* sendfrom,
unsigned int ifindex,
size_t* mtu)
{
NetworkInterface* netif = LocateInterface(src, dst, ifindex);
if ( !netif )
return false;
ScopedLock cfg_lock(&netif->cfg_lock);
if ( sendfrom )
memcpy(sendfrom, &netif->cfg.inet.address, sizeof(struct in_addr));
if ( mtu )
*mtu = Ether::GetMTU(netif) - sizeof(struct ipv4);
return true;
}
// Create an AF_INET socket of the given type and protocol.
//
// SOCK_DGRAM yields a UDP socket for protocol 0 or IPPROTO_UDP and a ping
// socket for IPPROTO_PING. SOCK_STREAM yields a TCP socket for protocol 0 or
// IPPROTO_TCP.
//
// Returns the new socket, or a null reference with errno set to
// EPROTONOSUPPORT if the protocol is not available for the type, or to
// EPROTOTYPE if the type itself is not supported.
Ref<Inode> Socket(int type, int protocol)
{
	if ( type == SOCK_DGRAM )
	{
		if ( protocol == 0 || protocol == IPPROTO_UDP )
			return UDP::Socket(AF_INET);
		if ( protocol == IPPROTO_PING )
			return Ping::Socket(AF_INET);
		return errno = EPROTONOSUPPORT, Ref<Inode>(NULL);
	}

	if ( type == SOCK_STREAM )
	{
		if ( protocol == 0 || protocol == IPPROTO_TCP )
			return TCP::Socket(AF_INET);
		return errno = EPROTONOSUPPORT, Ref<Inode>(NULL);
	}

	return errno = EPROTOTYPE, Ref<Inode>(NULL);
}
} // namespace IP
} // namespace Sortix