diff --git a/Makefile b/Makefile index d91ee97c..2ce51208 100644 --- a/Makefile +++ b/Makefile @@ -222,7 +222,7 @@ sysroot-system: sysroot-fsh sysroot-base-headers echo 'ID=sortix' && \ echo 'VERSION_ID="$(VERSION)"' && \ echo 'PRETTY_NAME="Sortix $(VERSION)"' && \ - echo 'SORTIX_ABI=1.2' && \ + echo 'SORTIX_ABI=1.3' && \ true) > "$(SYSROOT)/etc/sortix-release" echo /etc/sortix-release >> "$(SYSROOT)/tix/manifest/system" ln -sf sortix-release "$(SYSROOT)/etc/os-release" diff --git a/README b/README index 115f8062..68fdb338 100644 --- a/README +++ b/README @@ -11,8 +11,8 @@ tinkering. It has been in development since 2011 by a single developer and contributors. Though the system is stable and capable right now, it is still early in development, and a number of crucial features haven't been made yet. Releases -are made yearly and future releases will add features such as networking, SMP, -and USB that were skipped in favor of becoming self-hosting now. +are made yearly and future releases will add features such as SMP and USB that +were skipped in favor of becoming self-hosting now. Documentation ------------- diff --git a/build-aux/iso-grub-cfg.sh b/build-aux/iso-grub-cfg.sh index 851b4ea3..ec08d41e 100755 --- a/build-aux/iso-grub-cfg.sh +++ b/build-aux/iso-grub-cfg.sh @@ -185,6 +185,7 @@ else no_random_seed=--no-random-seed fi set enable_src=true +set enable_network_drivers= export version export machine @@ -194,6 +195,7 @@ export timeout export default export no_random_seed export enable_src +export enable_network_drivers EOF if [ -n "$ports" ]; then @@ -280,7 +282,7 @@ esac cat << EOF hook_kernel_pre echo -n "Loading /$kernel ($(human_size $kernel)) ... 
" - multiboot /$kernel \$no_random_seed "\$@" + multiboot /$kernel \$no_random_seed \$enable_network_drivers "\$@" echo done hook_kernel_post if [ \$no_random_seed != --no-random-seed ]; then @@ -418,6 +420,18 @@ else } fi +if [ "\$enable_network_drivers" = --disable-network-drivers ]; then + menuentry "Enable networking drivers" { + enable_network_drivers= + configfile /boot/grub/advanced.cfg + } +else + menuentry "Disable networking drivers" { + enable_network_drivers=--disable-network-drivers + configfile /boot/grub/advanced.cfg + } +fi + menuentry "Select binary packages..." { configfile /boot/grub/tix.cfg } diff --git a/dnsconfig/dnsconfig.8 b/dnsconfig/dnsconfig.8 index aab27305..935f6219 100644 --- a/dnsconfig/dnsconfig.8 +++ b/dnsconfig/dnsconfig.8 @@ -75,7 +75,8 @@ Delete a resolver: .Ed .Sh SEE ALSO .Xr getdnsconfig 2 , -.Xr setdnsconfig 2 +.Xr setdnsconfig 2 , +.Xr inet 4 .Sh HISTORY .Nm originally appeared in Sortix 1.1. diff --git a/kernel/Makefile b/kernel/Makefile index eb8d1a64..04e57594 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -82,6 +82,7 @@ alarm.o \ clock.o \ com.o \ copy.o \ +crc32.o \ descriptor.o \ disk/ahci/ahci.o \ disk/ahci/hba.o \ @@ -124,8 +125,17 @@ logterminal.o \ memorymanagement.o \ mouse/ps2.o \ mtable.o \ +net/arp.o \ +net/ether.o \ net/fs.o \ +net/if.o \ +net/ip.o \ +net/lo/lo.o \ +net/packet.o \ +net/ping.o \ net/socket.o \ +net/tcp.o \ +net/udp.o \ op-new.o \ panic.o \ partition.o \ diff --git a/kernel/crc32.cpp b/kernel/crc32.cpp new file mode 100644 index 00000000..99a0af8b --- /dev/null +++ b/kernel/crc32.cpp @@ -0,0 +1,499 @@ +/* + * Copyright (c) 1995-2006, 2010, 2011, 2012 Mark Adler. + * Copyright (c) 2015 Josiah Worcester. + * Copyright (c) 2017 Jonas 'Sortie' Termansen. + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. 
+ * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. + * + * Jean-loup Gailly Mark Adler + * jloup@gzip.org madler@alumni.caltech.edu + * + * This file is based on zlib work by Mark Adler, forked into Sortix libz by + * Jonas 'Sortie' Termansen, improved by Josiah Worcester, then adapted for the + * Sortix kernel by Jonas 'Sortie' Termansen. + * + * crc32.cpp + * CRC32 checksum. 
+ */ + +#include <stddef.h> +#include <stdint.h> + +namespace Sortix { + +static const uint32_t crc_table[8][256] = +{ + { + 0x00000000, 0x77073096, 0xee0e612c, 0x990951ba, 0x076dc419, + 0x706af48f, 0xe963a535, 0x9e6495a3, 0x0edb8832, 0x79dcb8a4, + 0xe0d5e91e, 0x97d2d988, 0x09b64c2b, 0x7eb17cbd, 0xe7b82d07, + 0x90bf1d91, 0x1db71064, 0x6ab020f2, 0xf3b97148, 0x84be41de, + 0x1adad47d, 0x6ddde4eb, 0xf4d4b551, 0x83d385c7, 0x136c9856, + 0x646ba8c0, 0xfd62f97a, 0x8a65c9ec, 0x14015c4f, 0x63066cd9, + 0xfa0f3d63, 0x8d080df5, 0x3b6e20c8, 0x4c69105e, 0xd56041e4, + 0xa2677172, 0x3c03e4d1, 0x4b04d447, 0xd20d85fd, 0xa50ab56b, + 0x35b5a8fa, 0x42b2986c, 0xdbbbc9d6, 0xacbcf940, 0x32d86ce3, + 0x45df5c75, 0xdcd60dcf, 0xabd13d59, 0x26d930ac, 0x51de003a, + 0xc8d75180, 0xbfd06116, 0x21b4f4b5, 0x56b3c423, 0xcfba9599, + 0xb8bda50f, 0x2802b89e, 0x5f058808, 0xc60cd9b2, 0xb10be924, + 0x2f6f7c87, 0x58684c11, 0xc1611dab, 0xb6662d3d, 0x76dc4190, + 0x01db7106, 0x98d220bc, 0xefd5102a, 0x71b18589, 0x06b6b51f, + 0x9fbfe4a5, 0xe8b8d433, 0x7807c9a2, 0x0f00f934, 0x9609a88e, + 0xe10e9818, 0x7f6a0dbb, 0x086d3d2d, 0x91646c97, 0xe6635c01, + 0x6b6b51f4, 0x1c6c6162, 0x856530d8, 0xf262004e, 0x6c0695ed, + 0x1b01a57b, 0x8208f4c1, 0xf50fc457, 0x65b0d9c6, 0x12b7e950, + 0x8bbeb8ea, 0xfcb9887c, 0x62dd1ddf, 0x15da2d49, 0x8cd37cf3, + 0xfbd44c65, 0x4db26158, 0x3ab551ce, 0xa3bc0074, 0xd4bb30e2, + 0x4adfa541, 0x3dd895d7, 0xa4d1c46d, 0xd3d6f4fb, 0x4369e96a, + 0x346ed9fc, 0xad678846, 0xda60b8d0, 0x44042d73, 0x33031de5, + 0xaa0a4c5f, 0xdd0d7cc9, 0x5005713c, 0x270241aa, 0xbe0b1010, + 0xc90c2086, 0x5768b525, 0x206f85b3, 0xb966d409, 0xce61e49f, + 0x5edef90e, 0x29d9c998, 0xb0d09822, 0xc7d7a8b4, 0x59b33d17, + 0x2eb40d81, 0xb7bd5c3b, 0xc0ba6cad, 0xedb88320, 0x9abfb3b6, + 0x03b6e20c, 0x74b1d29a, 0xead54739, 0x9dd277af, 0x04db2615, + 0x73dc1683, 0xe3630b12, 0x94643b84, 0x0d6d6a3e, 0x7a6a5aa8, + 0xe40ecf0b, 0x9309ff9d, 0x0a00ae27, 0x7d079eb1, 0xf00f9344, + 0x8708a3d2, 0x1e01f268, 0x6906c2fe, 0xf762575d, 0x806567cb, + 0x196c3671, 0x6e6b06e7, 0xfed41b76, 
0x89d32be0, 0x10da7a5a, + 0x67dd4acc, 0xf9b9df6f, 0x8ebeeff9, 0x17b7be43, 0x60b08ed5, + 0xd6d6a3e8, 0xa1d1937e, 0x38d8c2c4, 0x4fdff252, 0xd1bb67f1, + 0xa6bc5767, 0x3fb506dd, 0x48b2364b, 0xd80d2bda, 0xaf0a1b4c, + 0x36034af6, 0x41047a60, 0xdf60efc3, 0xa867df55, 0x316e8eef, + 0x4669be79, 0xcb61b38c, 0xbc66831a, 0x256fd2a0, 0x5268e236, + 0xcc0c7795, 0xbb0b4703, 0x220216b9, 0x5505262f, 0xc5ba3bbe, + 0xb2bd0b28, 0x2bb45a92, 0x5cb36a04, 0xc2d7ffa7, 0xb5d0cf31, + 0x2cd99e8b, 0x5bdeae1d, 0x9b64c2b0, 0xec63f226, 0x756aa39c, + 0x026d930a, 0x9c0906a9, 0xeb0e363f, 0x72076785, 0x05005713, + 0x95bf4a82, 0xe2b87a14, 0x7bb12bae, 0x0cb61b38, 0x92d28e9b, + 0xe5d5be0d, 0x7cdcefb7, 0x0bdbdf21, 0x86d3d2d4, 0xf1d4e242, + 0x68ddb3f8, 0x1fda836e, 0x81be16cd, 0xf6b9265b, 0x6fb077e1, + 0x18b74777, 0x88085ae6, 0xff0f6a70, 0x66063bca, 0x11010b5c, + 0x8f659eff, 0xf862ae69, 0x616bffd3, 0x166ccf45, 0xa00ae278, + 0xd70dd2ee, 0x4e048354, 0x3903b3c2, 0xa7672661, 0xd06016f7, + 0x4969474d, 0x3e6e77db, 0xaed16a4a, 0xd9d65adc, 0x40df0b66, + 0x37d83bf0, 0xa9bcae53, 0xdebb9ec5, 0x47b2cf7f, 0x30b5ffe9, + 0xbdbdf21c, 0xcabac28a, 0x53b39330, 0x24b4a3a6, 0xbad03605, + 0xcdd70693, 0x54de5729, 0x23d967bf, 0xb3667a2e, 0xc4614ab8, + 0x5d681b02, 0x2a6f2b94, 0xb40bbe37, 0xc30c8ea1, 0x5a05df1b, + 0x2d02ef8d + }, + { + 0x00000000, 0x191b3141, 0x32366282, 0x2b2d53c3, 0x646cc504, + 0x7d77f445, 0x565aa786, 0x4f4196c7, 0xc8d98a08, 0xd1c2bb49, + 0xfaefe88a, 0xe3f4d9cb, 0xacb54f0c, 0xb5ae7e4d, 0x9e832d8e, + 0x87981ccf, 0x4ac21251, 0x53d92310, 0x78f470d3, 0x61ef4192, + 0x2eaed755, 0x37b5e614, 0x1c98b5d7, 0x05838496, 0x821b9859, + 0x9b00a918, 0xb02dfadb, 0xa936cb9a, 0xe6775d5d, 0xff6c6c1c, + 0xd4413fdf, 0xcd5a0e9e, 0x958424a2, 0x8c9f15e3, 0xa7b24620, + 0xbea97761, 0xf1e8e1a6, 0xe8f3d0e7, 0xc3de8324, 0xdac5b265, + 0x5d5daeaa, 0x44469feb, 0x6f6bcc28, 0x7670fd69, 0x39316bae, + 0x202a5aef, 0x0b07092c, 0x121c386d, 0xdf4636f3, 0xc65d07b2, + 0xed705471, 0xf46b6530, 0xbb2af3f7, 0xa231c2b6, 0x891c9175, + 0x9007a034, 0x179fbcfb, 
0x0e848dba, 0x25a9de79, 0x3cb2ef38, + 0x73f379ff, 0x6ae848be, 0x41c51b7d, 0x58de2a3c, 0xf0794f05, + 0xe9627e44, 0xc24f2d87, 0xdb541cc6, 0x94158a01, 0x8d0ebb40, + 0xa623e883, 0xbf38d9c2, 0x38a0c50d, 0x21bbf44c, 0x0a96a78f, + 0x138d96ce, 0x5ccc0009, 0x45d73148, 0x6efa628b, 0x77e153ca, + 0xbabb5d54, 0xa3a06c15, 0x888d3fd6, 0x91960e97, 0xded79850, + 0xc7cca911, 0xece1fad2, 0xf5facb93, 0x7262d75c, 0x6b79e61d, + 0x4054b5de, 0x594f849f, 0x160e1258, 0x0f152319, 0x243870da, + 0x3d23419b, 0x65fd6ba7, 0x7ce65ae6, 0x57cb0925, 0x4ed03864, + 0x0191aea3, 0x188a9fe2, 0x33a7cc21, 0x2abcfd60, 0xad24e1af, + 0xb43fd0ee, 0x9f12832d, 0x8609b26c, 0xc94824ab, 0xd05315ea, + 0xfb7e4629, 0xe2657768, 0x2f3f79f6, 0x362448b7, 0x1d091b74, + 0x04122a35, 0x4b53bcf2, 0x52488db3, 0x7965de70, 0x607eef31, + 0xe7e6f3fe, 0xfefdc2bf, 0xd5d0917c, 0xcccba03d, 0x838a36fa, + 0x9a9107bb, 0xb1bc5478, 0xa8a76539, 0x3b83984b, 0x2298a90a, + 0x09b5fac9, 0x10aecb88, 0x5fef5d4f, 0x46f46c0e, 0x6dd93fcd, + 0x74c20e8c, 0xf35a1243, 0xea412302, 0xc16c70c1, 0xd8774180, + 0x9736d747, 0x8e2de606, 0xa500b5c5, 0xbc1b8484, 0x71418a1a, + 0x685abb5b, 0x4377e898, 0x5a6cd9d9, 0x152d4f1e, 0x0c367e5f, + 0x271b2d9c, 0x3e001cdd, 0xb9980012, 0xa0833153, 0x8bae6290, + 0x92b553d1, 0xddf4c516, 0xc4eff457, 0xefc2a794, 0xf6d996d5, + 0xae07bce9, 0xb71c8da8, 0x9c31de6b, 0x852aef2a, 0xca6b79ed, + 0xd37048ac, 0xf85d1b6f, 0xe1462a2e, 0x66de36e1, 0x7fc507a0, + 0x54e85463, 0x4df36522, 0x02b2f3e5, 0x1ba9c2a4, 0x30849167, + 0x299fa026, 0xe4c5aeb8, 0xfdde9ff9, 0xd6f3cc3a, 0xcfe8fd7b, + 0x80a96bbc, 0x99b25afd, 0xb29f093e, 0xab84387f, 0x2c1c24b0, + 0x350715f1, 0x1e2a4632, 0x07317773, 0x4870e1b4, 0x516bd0f5, + 0x7a468336, 0x635db277, 0xcbfad74e, 0xd2e1e60f, 0xf9ccb5cc, + 0xe0d7848d, 0xaf96124a, 0xb68d230b, 0x9da070c8, 0x84bb4189, + 0x03235d46, 0x1a386c07, 0x31153fc4, 0x280e0e85, 0x674f9842, + 0x7e54a903, 0x5579fac0, 0x4c62cb81, 0x8138c51f, 0x9823f45e, + 0xb30ea79d, 0xaa1596dc, 0xe554001b, 0xfc4f315a, 0xd7626299, + 0xce7953d8, 0x49e14f17, 0x50fa7e56, 
0x7bd72d95, 0x62cc1cd4, + 0x2d8d8a13, 0x3496bb52, 0x1fbbe891, 0x06a0d9d0, 0x5e7ef3ec, + 0x4765c2ad, 0x6c48916e, 0x7553a02f, 0x3a1236e8, 0x230907a9, + 0x0824546a, 0x113f652b, 0x96a779e4, 0x8fbc48a5, 0xa4911b66, + 0xbd8a2a27, 0xf2cbbce0, 0xebd08da1, 0xc0fdde62, 0xd9e6ef23, + 0x14bce1bd, 0x0da7d0fc, 0x268a833f, 0x3f91b27e, 0x70d024b9, + 0x69cb15f8, 0x42e6463b, 0x5bfd777a, 0xdc656bb5, 0xc57e5af4, + 0xee530937, 0xf7483876, 0xb809aeb1, 0xa1129ff0, 0x8a3fcc33, + 0x9324fd72 + }, + { + 0x00000000, 0x01c26a37, 0x0384d46e, 0x0246be59, 0x0709a8dc, + 0x06cbc2eb, 0x048d7cb2, 0x054f1685, 0x0e1351b8, 0x0fd13b8f, + 0x0d9785d6, 0x0c55efe1, 0x091af964, 0x08d89353, 0x0a9e2d0a, + 0x0b5c473d, 0x1c26a370, 0x1de4c947, 0x1fa2771e, 0x1e601d29, + 0x1b2f0bac, 0x1aed619b, 0x18abdfc2, 0x1969b5f5, 0x1235f2c8, + 0x13f798ff, 0x11b126a6, 0x10734c91, 0x153c5a14, 0x14fe3023, + 0x16b88e7a, 0x177ae44d, 0x384d46e0, 0x398f2cd7, 0x3bc9928e, + 0x3a0bf8b9, 0x3f44ee3c, 0x3e86840b, 0x3cc03a52, 0x3d025065, + 0x365e1758, 0x379c7d6f, 0x35dac336, 0x3418a901, 0x3157bf84, + 0x3095d5b3, 0x32d36bea, 0x331101dd, 0x246be590, 0x25a98fa7, + 0x27ef31fe, 0x262d5bc9, 0x23624d4c, 0x22a0277b, 0x20e69922, + 0x2124f315, 0x2a78b428, 0x2bbade1f, 0x29fc6046, 0x283e0a71, + 0x2d711cf4, 0x2cb376c3, 0x2ef5c89a, 0x2f37a2ad, 0x709a8dc0, + 0x7158e7f7, 0x731e59ae, 0x72dc3399, 0x7793251c, 0x76514f2b, + 0x7417f172, 0x75d59b45, 0x7e89dc78, 0x7f4bb64f, 0x7d0d0816, + 0x7ccf6221, 0x798074a4, 0x78421e93, 0x7a04a0ca, 0x7bc6cafd, + 0x6cbc2eb0, 0x6d7e4487, 0x6f38fade, 0x6efa90e9, 0x6bb5866c, + 0x6a77ec5b, 0x68315202, 0x69f33835, 0x62af7f08, 0x636d153f, + 0x612bab66, 0x60e9c151, 0x65a6d7d4, 0x6464bde3, 0x662203ba, + 0x67e0698d, 0x48d7cb20, 0x4915a117, 0x4b531f4e, 0x4a917579, + 0x4fde63fc, 0x4e1c09cb, 0x4c5ab792, 0x4d98dda5, 0x46c49a98, + 0x4706f0af, 0x45404ef6, 0x448224c1, 0x41cd3244, 0x400f5873, + 0x4249e62a, 0x438b8c1d, 0x54f16850, 0x55330267, 0x5775bc3e, + 0x56b7d609, 0x53f8c08c, 0x523aaabb, 0x507c14e2, 0x51be7ed5, + 0x5ae239e8, 0x5b2053df, 
0x5966ed86, 0x58a487b1, 0x5deb9134, + 0x5c29fb03, 0x5e6f455a, 0x5fad2f6d, 0xe1351b80, 0xe0f771b7, + 0xe2b1cfee, 0xe373a5d9, 0xe63cb35c, 0xe7fed96b, 0xe5b86732, + 0xe47a0d05, 0xef264a38, 0xeee4200f, 0xeca29e56, 0xed60f461, + 0xe82fe2e4, 0xe9ed88d3, 0xebab368a, 0xea695cbd, 0xfd13b8f0, + 0xfcd1d2c7, 0xfe976c9e, 0xff5506a9, 0xfa1a102c, 0xfbd87a1b, + 0xf99ec442, 0xf85cae75, 0xf300e948, 0xf2c2837f, 0xf0843d26, + 0xf1465711, 0xf4094194, 0xf5cb2ba3, 0xf78d95fa, 0xf64fffcd, + 0xd9785d60, 0xd8ba3757, 0xdafc890e, 0xdb3ee339, 0xde71f5bc, + 0xdfb39f8b, 0xddf521d2, 0xdc374be5, 0xd76b0cd8, 0xd6a966ef, + 0xd4efd8b6, 0xd52db281, 0xd062a404, 0xd1a0ce33, 0xd3e6706a, + 0xd2241a5d, 0xc55efe10, 0xc49c9427, 0xc6da2a7e, 0xc7184049, + 0xc25756cc, 0xc3953cfb, 0xc1d382a2, 0xc011e895, 0xcb4dafa8, + 0xca8fc59f, 0xc8c97bc6, 0xc90b11f1, 0xcc440774, 0xcd866d43, + 0xcfc0d31a, 0xce02b92d, 0x91af9640, 0x906dfc77, 0x922b422e, + 0x93e92819, 0x96a63e9c, 0x976454ab, 0x9522eaf2, 0x94e080c5, + 0x9fbcc7f8, 0x9e7eadcf, 0x9c381396, 0x9dfa79a1, 0x98b56f24, + 0x99770513, 0x9b31bb4a, 0x9af3d17d, 0x8d893530, 0x8c4b5f07, + 0x8e0de15e, 0x8fcf8b69, 0x8a809dec, 0x8b42f7db, 0x89044982, + 0x88c623b5, 0x839a6488, 0x82580ebf, 0x801eb0e6, 0x81dcdad1, + 0x8493cc54, 0x8551a663, 0x8717183a, 0x86d5720d, 0xa9e2d0a0, + 0xa820ba97, 0xaa6604ce, 0xaba46ef9, 0xaeeb787c, 0xaf29124b, + 0xad6fac12, 0xacadc625, 0xa7f18118, 0xa633eb2f, 0xa4755576, + 0xa5b73f41, 0xa0f829c4, 0xa13a43f3, 0xa37cfdaa, 0xa2be979d, + 0xb5c473d0, 0xb40619e7, 0xb640a7be, 0xb782cd89, 0xb2cddb0c, + 0xb30fb13b, 0xb1490f62, 0xb08b6555, 0xbbd72268, 0xba15485f, + 0xb853f606, 0xb9919c31, 0xbcde8ab4, 0xbd1ce083, 0xbf5a5eda, + 0xbe9834ed + }, + { + 0x00000000, 0xb8bc6765, 0xaa09c88b, 0x12b5afee, 0x8f629757, + 0x37def032, 0x256b5fdc, 0x9dd738b9, 0xc5b428ef, 0x7d084f8a, + 0x6fbde064, 0xd7018701, 0x4ad6bfb8, 0xf26ad8dd, 0xe0df7733, + 0x58631056, 0x5019579f, 0xe8a530fa, 0xfa109f14, 0x42acf871, + 0xdf7bc0c8, 0x67c7a7ad, 0x75720843, 0xcdce6f26, 0x95ad7f70, + 0x2d111815, 
0x3fa4b7fb, 0x8718d09e, 0x1acfe827, 0xa2738f42, + 0xb0c620ac, 0x087a47c9, 0xa032af3e, 0x188ec85b, 0x0a3b67b5, + 0xb28700d0, 0x2f503869, 0x97ec5f0c, 0x8559f0e2, 0x3de59787, + 0x658687d1, 0xdd3ae0b4, 0xcf8f4f5a, 0x7733283f, 0xeae41086, + 0x525877e3, 0x40edd80d, 0xf851bf68, 0xf02bf8a1, 0x48979fc4, + 0x5a22302a, 0xe29e574f, 0x7f496ff6, 0xc7f50893, 0xd540a77d, + 0x6dfcc018, 0x359fd04e, 0x8d23b72b, 0x9f9618c5, 0x272a7fa0, + 0xbafd4719, 0x0241207c, 0x10f48f92, 0xa848e8f7, 0x9b14583d, + 0x23a83f58, 0x311d90b6, 0x89a1f7d3, 0x1476cf6a, 0xaccaa80f, + 0xbe7f07e1, 0x06c36084, 0x5ea070d2, 0xe61c17b7, 0xf4a9b859, + 0x4c15df3c, 0xd1c2e785, 0x697e80e0, 0x7bcb2f0e, 0xc377486b, + 0xcb0d0fa2, 0x73b168c7, 0x6104c729, 0xd9b8a04c, 0x446f98f5, + 0xfcd3ff90, 0xee66507e, 0x56da371b, 0x0eb9274d, 0xb6054028, + 0xa4b0efc6, 0x1c0c88a3, 0x81dbb01a, 0x3967d77f, 0x2bd27891, + 0x936e1ff4, 0x3b26f703, 0x839a9066, 0x912f3f88, 0x299358ed, + 0xb4446054, 0x0cf80731, 0x1e4da8df, 0xa6f1cfba, 0xfe92dfec, + 0x462eb889, 0x549b1767, 0xec277002, 0x71f048bb, 0xc94c2fde, + 0xdbf98030, 0x6345e755, 0x6b3fa09c, 0xd383c7f9, 0xc1366817, + 0x798a0f72, 0xe45d37cb, 0x5ce150ae, 0x4e54ff40, 0xf6e89825, + 0xae8b8873, 0x1637ef16, 0x048240f8, 0xbc3e279d, 0x21e91f24, + 0x99557841, 0x8be0d7af, 0x335cb0ca, 0xed59b63b, 0x55e5d15e, + 0x47507eb0, 0xffec19d5, 0x623b216c, 0xda874609, 0xc832e9e7, + 0x708e8e82, 0x28ed9ed4, 0x9051f9b1, 0x82e4565f, 0x3a58313a, + 0xa78f0983, 0x1f336ee6, 0x0d86c108, 0xb53aa66d, 0xbd40e1a4, + 0x05fc86c1, 0x1749292f, 0xaff54e4a, 0x322276f3, 0x8a9e1196, + 0x982bbe78, 0x2097d91d, 0x78f4c94b, 0xc048ae2e, 0xd2fd01c0, + 0x6a4166a5, 0xf7965e1c, 0x4f2a3979, 0x5d9f9697, 0xe523f1f2, + 0x4d6b1905, 0xf5d77e60, 0xe762d18e, 0x5fdeb6eb, 0xc2098e52, + 0x7ab5e937, 0x680046d9, 0xd0bc21bc, 0x88df31ea, 0x3063568f, + 0x22d6f961, 0x9a6a9e04, 0x07bda6bd, 0xbf01c1d8, 0xadb46e36, + 0x15080953, 0x1d724e9a, 0xa5ce29ff, 0xb77b8611, 0x0fc7e174, + 0x9210d9cd, 0x2aacbea8, 0x38191146, 0x80a57623, 0xd8c66675, + 0x607a0110, 0x72cfaefe, 
0xca73c99b, 0x57a4f122, 0xef189647, + 0xfdad39a9, 0x45115ecc, 0x764dee06, 0xcef18963, 0xdc44268d, + 0x64f841e8, 0xf92f7951, 0x41931e34, 0x5326b1da, 0xeb9ad6bf, + 0xb3f9c6e9, 0x0b45a18c, 0x19f00e62, 0xa14c6907, 0x3c9b51be, + 0x842736db, 0x96929935, 0x2e2efe50, 0x2654b999, 0x9ee8defc, + 0x8c5d7112, 0x34e11677, 0xa9362ece, 0x118a49ab, 0x033fe645, + 0xbb838120, 0xe3e09176, 0x5b5cf613, 0x49e959fd, 0xf1553e98, + 0x6c820621, 0xd43e6144, 0xc68bceaa, 0x7e37a9cf, 0xd67f4138, + 0x6ec3265d, 0x7c7689b3, 0xc4caeed6, 0x591dd66f, 0xe1a1b10a, + 0xf3141ee4, 0x4ba87981, 0x13cb69d7, 0xab770eb2, 0xb9c2a15c, + 0x017ec639, 0x9ca9fe80, 0x241599e5, 0x36a0360b, 0x8e1c516e, + 0x866616a7, 0x3eda71c2, 0x2c6fde2c, 0x94d3b949, 0x090481f0, + 0xb1b8e695, 0xa30d497b, 0x1bb12e1e, 0x43d23e48, 0xfb6e592d, + 0xe9dbf6c3, 0x516791a6, 0xccb0a91f, 0x740cce7a, 0x66b96194, + 0xde0506f1 + }, + { + 0x00000000, 0x3d6029b0, 0x7ac05360, 0x47a07ad0, 0xf580a6c0, + 0xc8e08f70, 0x8f40f5a0, 0xb220dc10, 0x30704bc1, 0x0d106271, + 0x4ab018a1, 0x77d03111, 0xc5f0ed01, 0xf890c4b1, 0xbf30be61, + 0x825097d1, 0x60e09782, 0x5d80be32, 0x1a20c4e2, 0x2740ed52, + 0x95603142, 0xa80018f2, 0xefa06222, 0xd2c04b92, 0x5090dc43, + 0x6df0f5f3, 0x2a508f23, 0x1730a693, 0xa5107a83, 0x98705333, + 0xdfd029e3, 0xe2b00053, 0xc1c12f04, 0xfca106b4, 0xbb017c64, + 0x866155d4, 0x344189c4, 0x0921a074, 0x4e81daa4, 0x73e1f314, + 0xf1b164c5, 0xccd14d75, 0x8b7137a5, 0xb6111e15, 0x0431c205, + 0x3951ebb5, 0x7ef19165, 0x4391b8d5, 0xa121b886, 0x9c419136, + 0xdbe1ebe6, 0xe681c256, 0x54a11e46, 0x69c137f6, 0x2e614d26, + 0x13016496, 0x9151f347, 0xac31daf7, 0xeb91a027, 0xd6f18997, + 0x64d15587, 0x59b17c37, 0x1e1106e7, 0x23712f57, 0x58f35849, + 0x659371f9, 0x22330b29, 0x1f532299, 0xad73fe89, 0x9013d739, + 0xd7b3ade9, 0xead38459, 0x68831388, 0x55e33a38, 0x124340e8, + 0x2f236958, 0x9d03b548, 0xa0639cf8, 0xe7c3e628, 0xdaa3cf98, + 0x3813cfcb, 0x0573e67b, 0x42d39cab, 0x7fb3b51b, 0xcd93690b, + 0xf0f340bb, 0xb7533a6b, 0x8a3313db, 0x0863840a, 0x3503adba, + 0x72a3d76a, 
0x4fc3feda, 0xfde322ca, 0xc0830b7a, 0x872371aa, + 0xba43581a, 0x9932774d, 0xa4525efd, 0xe3f2242d, 0xde920d9d, + 0x6cb2d18d, 0x51d2f83d, 0x167282ed, 0x2b12ab5d, 0xa9423c8c, + 0x9422153c, 0xd3826fec, 0xeee2465c, 0x5cc29a4c, 0x61a2b3fc, + 0x2602c92c, 0x1b62e09c, 0xf9d2e0cf, 0xc4b2c97f, 0x8312b3af, + 0xbe729a1f, 0x0c52460f, 0x31326fbf, 0x7692156f, 0x4bf23cdf, + 0xc9a2ab0e, 0xf4c282be, 0xb362f86e, 0x8e02d1de, 0x3c220dce, + 0x0142247e, 0x46e25eae, 0x7b82771e, 0xb1e6b092, 0x8c869922, + 0xcb26e3f2, 0xf646ca42, 0x44661652, 0x79063fe2, 0x3ea64532, + 0x03c66c82, 0x8196fb53, 0xbcf6d2e3, 0xfb56a833, 0xc6368183, + 0x74165d93, 0x49767423, 0x0ed60ef3, 0x33b62743, 0xd1062710, + 0xec660ea0, 0xabc67470, 0x96a65dc0, 0x248681d0, 0x19e6a860, + 0x5e46d2b0, 0x6326fb00, 0xe1766cd1, 0xdc164561, 0x9bb63fb1, + 0xa6d61601, 0x14f6ca11, 0x2996e3a1, 0x6e369971, 0x5356b0c1, + 0x70279f96, 0x4d47b626, 0x0ae7ccf6, 0x3787e546, 0x85a73956, + 0xb8c710e6, 0xff676a36, 0xc2074386, 0x4057d457, 0x7d37fde7, + 0x3a978737, 0x07f7ae87, 0xb5d77297, 0x88b75b27, 0xcf1721f7, + 0xf2770847, 0x10c70814, 0x2da721a4, 0x6a075b74, 0x576772c4, + 0xe547aed4, 0xd8278764, 0x9f87fdb4, 0xa2e7d404, 0x20b743d5, + 0x1dd76a65, 0x5a7710b5, 0x67173905, 0xd537e515, 0xe857cca5, + 0xaff7b675, 0x92979fc5, 0xe915e8db, 0xd475c16b, 0x93d5bbbb, + 0xaeb5920b, 0x1c954e1b, 0x21f567ab, 0x66551d7b, 0x5b3534cb, + 0xd965a31a, 0xe4058aaa, 0xa3a5f07a, 0x9ec5d9ca, 0x2ce505da, + 0x11852c6a, 0x562556ba, 0x6b457f0a, 0x89f57f59, 0xb49556e9, + 0xf3352c39, 0xce550589, 0x7c75d999, 0x4115f029, 0x06b58af9, + 0x3bd5a349, 0xb9853498, 0x84e51d28, 0xc34567f8, 0xfe254e48, + 0x4c059258, 0x7165bbe8, 0x36c5c138, 0x0ba5e888, 0x28d4c7df, + 0x15b4ee6f, 0x521494bf, 0x6f74bd0f, 0xdd54611f, 0xe03448af, + 0xa794327f, 0x9af41bcf, 0x18a48c1e, 0x25c4a5ae, 0x6264df7e, + 0x5f04f6ce, 0xed242ade, 0xd044036e, 0x97e479be, 0xaa84500e, + 0x4834505d, 0x755479ed, 0x32f4033d, 0x0f942a8d, 0xbdb4f69d, + 0x80d4df2d, 0xc774a5fd, 0xfa148c4d, 0x78441b9c, 0x4524322c, + 0x028448fc, 0x3fe4614c, 
0x8dc4bd5c, 0xb0a494ec, 0xf704ee3c, + 0xca64c78c + }, + { + 0x00000000, 0xcb5cd3a5, 0x4dc8a10b, 0x869472ae, 0x9b914216, + 0x50cd91b3, 0xd659e31d, 0x1d0530b8, 0xec53826d, 0x270f51c8, + 0xa19b2366, 0x6ac7f0c3, 0x77c2c07b, 0xbc9e13de, 0x3a0a6170, + 0xf156b2d5, 0x03d6029b, 0xc88ad13e, 0x4e1ea390, 0x85427035, + 0x9847408d, 0x531b9328, 0xd58fe186, 0x1ed33223, 0xef8580f6, + 0x24d95353, 0xa24d21fd, 0x6911f258, 0x7414c2e0, 0xbf481145, + 0x39dc63eb, 0xf280b04e, 0x07ac0536, 0xccf0d693, 0x4a64a43d, + 0x81387798, 0x9c3d4720, 0x57619485, 0xd1f5e62b, 0x1aa9358e, + 0xebff875b, 0x20a354fe, 0xa6372650, 0x6d6bf5f5, 0x706ec54d, + 0xbb3216e8, 0x3da66446, 0xf6fab7e3, 0x047a07ad, 0xcf26d408, + 0x49b2a6a6, 0x82ee7503, 0x9feb45bb, 0x54b7961e, 0xd223e4b0, + 0x197f3715, 0xe82985c0, 0x23755665, 0xa5e124cb, 0x6ebdf76e, + 0x73b8c7d6, 0xb8e41473, 0x3e7066dd, 0xf52cb578, 0x0f580a6c, + 0xc404d9c9, 0x4290ab67, 0x89cc78c2, 0x94c9487a, 0x5f959bdf, + 0xd901e971, 0x125d3ad4, 0xe30b8801, 0x28575ba4, 0xaec3290a, + 0x659ffaaf, 0x789aca17, 0xb3c619b2, 0x35526b1c, 0xfe0eb8b9, + 0x0c8e08f7, 0xc7d2db52, 0x4146a9fc, 0x8a1a7a59, 0x971f4ae1, + 0x5c439944, 0xdad7ebea, 0x118b384f, 0xe0dd8a9a, 0x2b81593f, + 0xad152b91, 0x6649f834, 0x7b4cc88c, 0xb0101b29, 0x36846987, + 0xfdd8ba22, 0x08f40f5a, 0xc3a8dcff, 0x453cae51, 0x8e607df4, + 0x93654d4c, 0x58399ee9, 0xdeadec47, 0x15f13fe2, 0xe4a78d37, + 0x2ffb5e92, 0xa96f2c3c, 0x6233ff99, 0x7f36cf21, 0xb46a1c84, + 0x32fe6e2a, 0xf9a2bd8f, 0x0b220dc1, 0xc07ede64, 0x46eaacca, + 0x8db67f6f, 0x90b34fd7, 0x5bef9c72, 0xdd7beedc, 0x16273d79, + 0xe7718fac, 0x2c2d5c09, 0xaab92ea7, 0x61e5fd02, 0x7ce0cdba, + 0xb7bc1e1f, 0x31286cb1, 0xfa74bf14, 0x1eb014d8, 0xd5ecc77d, + 0x5378b5d3, 0x98246676, 0x852156ce, 0x4e7d856b, 0xc8e9f7c5, + 0x03b52460, 0xf2e396b5, 0x39bf4510, 0xbf2b37be, 0x7477e41b, + 0x6972d4a3, 0xa22e0706, 0x24ba75a8, 0xefe6a60d, 0x1d661643, + 0xd63ac5e6, 0x50aeb748, 0x9bf264ed, 0x86f75455, 0x4dab87f0, + 0xcb3ff55e, 0x006326fb, 0xf135942e, 0x3a69478b, 0xbcfd3525, + 0x77a1e680, 
0x6aa4d638, 0xa1f8059d, 0x276c7733, 0xec30a496, + 0x191c11ee, 0xd240c24b, 0x54d4b0e5, 0x9f886340, 0x828d53f8, + 0x49d1805d, 0xcf45f2f3, 0x04192156, 0xf54f9383, 0x3e134026, + 0xb8873288, 0x73dbe12d, 0x6eded195, 0xa5820230, 0x2316709e, + 0xe84aa33b, 0x1aca1375, 0xd196c0d0, 0x5702b27e, 0x9c5e61db, + 0x815b5163, 0x4a0782c6, 0xcc93f068, 0x07cf23cd, 0xf6999118, + 0x3dc542bd, 0xbb513013, 0x700de3b6, 0x6d08d30e, 0xa65400ab, + 0x20c07205, 0xeb9ca1a0, 0x11e81eb4, 0xdab4cd11, 0x5c20bfbf, + 0x977c6c1a, 0x8a795ca2, 0x41258f07, 0xc7b1fda9, 0x0ced2e0c, + 0xfdbb9cd9, 0x36e74f7c, 0xb0733dd2, 0x7b2fee77, 0x662adecf, + 0xad760d6a, 0x2be27fc4, 0xe0beac61, 0x123e1c2f, 0xd962cf8a, + 0x5ff6bd24, 0x94aa6e81, 0x89af5e39, 0x42f38d9c, 0xc467ff32, + 0x0f3b2c97, 0xfe6d9e42, 0x35314de7, 0xb3a53f49, 0x78f9ecec, + 0x65fcdc54, 0xaea00ff1, 0x28347d5f, 0xe368aefa, 0x16441b82, + 0xdd18c827, 0x5b8cba89, 0x90d0692c, 0x8dd55994, 0x46898a31, + 0xc01df89f, 0x0b412b3a, 0xfa1799ef, 0x314b4a4a, 0xb7df38e4, + 0x7c83eb41, 0x6186dbf9, 0xaada085c, 0x2c4e7af2, 0xe712a957, + 0x15921919, 0xdececabc, 0x585ab812, 0x93066bb7, 0x8e035b0f, + 0x455f88aa, 0xc3cbfa04, 0x089729a1, 0xf9c19b74, 0x329d48d1, + 0xb4093a7f, 0x7f55e9da, 0x6250d962, 0xa90c0ac7, 0x2f987869, + 0xe4c4abcc + }, + { + 0x00000000, 0xa6770bb4, 0x979f1129, 0x31e81a9d, 0xf44f2413, + 0x52382fa7, 0x63d0353a, 0xc5a73e8e, 0x33ef4e67, 0x959845d3, + 0xa4705f4e, 0x020754fa, 0xc7a06a74, 0x61d761c0, 0x503f7b5d, + 0xf64870e9, 0x67de9cce, 0xc1a9977a, 0xf0418de7, 0x56368653, + 0x9391b8dd, 0x35e6b369, 0x040ea9f4, 0xa279a240, 0x5431d2a9, + 0xf246d91d, 0xc3aec380, 0x65d9c834, 0xa07ef6ba, 0x0609fd0e, + 0x37e1e793, 0x9196ec27, 0xcfbd399c, 0x69ca3228, 0x582228b5, + 0xfe552301, 0x3bf21d8f, 0x9d85163b, 0xac6d0ca6, 0x0a1a0712, + 0xfc5277fb, 0x5a257c4f, 0x6bcd66d2, 0xcdba6d66, 0x081d53e8, + 0xae6a585c, 0x9f8242c1, 0x39f54975, 0xa863a552, 0x0e14aee6, + 0x3ffcb47b, 0x998bbfcf, 0x5c2c8141, 0xfa5b8af5, 0xcbb39068, + 0x6dc49bdc, 0x9b8ceb35, 0x3dfbe081, 0x0c13fa1c, 0xaa64f1a8, + 
0x6fc3cf26, 0xc9b4c492, 0xf85cde0f, 0x5e2bd5bb, 0x440b7579, + 0xe27c7ecd, 0xd3946450, 0x75e36fe4, 0xb044516a, 0x16335ade, + 0x27db4043, 0x81ac4bf7, 0x77e43b1e, 0xd19330aa, 0xe07b2a37, + 0x460c2183, 0x83ab1f0d, 0x25dc14b9, 0x14340e24, 0xb2430590, + 0x23d5e9b7, 0x85a2e203, 0xb44af89e, 0x123df32a, 0xd79acda4, + 0x71edc610, 0x4005dc8d, 0xe672d739, 0x103aa7d0, 0xb64dac64, + 0x87a5b6f9, 0x21d2bd4d, 0xe47583c3, 0x42028877, 0x73ea92ea, + 0xd59d995e, 0x8bb64ce5, 0x2dc14751, 0x1c295dcc, 0xba5e5678, + 0x7ff968f6, 0xd98e6342, 0xe86679df, 0x4e11726b, 0xb8590282, + 0x1e2e0936, 0x2fc613ab, 0x89b1181f, 0x4c162691, 0xea612d25, + 0xdb8937b8, 0x7dfe3c0c, 0xec68d02b, 0x4a1fdb9f, 0x7bf7c102, + 0xdd80cab6, 0x1827f438, 0xbe50ff8c, 0x8fb8e511, 0x29cfeea5, + 0xdf879e4c, 0x79f095f8, 0x48188f65, 0xee6f84d1, 0x2bc8ba5f, + 0x8dbfb1eb, 0xbc57ab76, 0x1a20a0c2, 0x8816eaf2, 0x2e61e146, + 0x1f89fbdb, 0xb9fef06f, 0x7c59cee1, 0xda2ec555, 0xebc6dfc8, + 0x4db1d47c, 0xbbf9a495, 0x1d8eaf21, 0x2c66b5bc, 0x8a11be08, + 0x4fb68086, 0xe9c18b32, 0xd82991af, 0x7e5e9a1b, 0xefc8763c, + 0x49bf7d88, 0x78576715, 0xde206ca1, 0x1b87522f, 0xbdf0599b, + 0x8c184306, 0x2a6f48b2, 0xdc27385b, 0x7a5033ef, 0x4bb82972, + 0xedcf22c6, 0x28681c48, 0x8e1f17fc, 0xbff70d61, 0x198006d5, + 0x47abd36e, 0xe1dcd8da, 0xd034c247, 0x7643c9f3, 0xb3e4f77d, + 0x1593fcc9, 0x247be654, 0x820cede0, 0x74449d09, 0xd23396bd, + 0xe3db8c20, 0x45ac8794, 0x800bb91a, 0x267cb2ae, 0x1794a833, + 0xb1e3a387, 0x20754fa0, 0x86024414, 0xb7ea5e89, 0x119d553d, + 0xd43a6bb3, 0x724d6007, 0x43a57a9a, 0xe5d2712e, 0x139a01c7, + 0xb5ed0a73, 0x840510ee, 0x22721b5a, 0xe7d525d4, 0x41a22e60, + 0x704a34fd, 0xd63d3f49, 0xcc1d9f8b, 0x6a6a943f, 0x5b828ea2, + 0xfdf58516, 0x3852bb98, 0x9e25b02c, 0xafcdaab1, 0x09baa105, + 0xfff2d1ec, 0x5985da58, 0x686dc0c5, 0xce1acb71, 0x0bbdf5ff, + 0xadcafe4b, 0x9c22e4d6, 0x3a55ef62, 0xabc30345, 0x0db408f1, + 0x3c5c126c, 0x9a2b19d8, 0x5f8c2756, 0xf9fb2ce2, 0xc813367f, + 0x6e643dcb, 0x982c4d22, 0x3e5b4696, 0x0fb35c0b, 0xa9c457bf, + 0x6c636931, 
0xca146285, 0xfbfc7818, 0x5d8b73ac, 0x03a0a617, + 0xa5d7ada3, 0x943fb73e, 0x3248bc8a, 0xf7ef8204, 0x519889b0, + 0x6070932d, 0xc6079899, 0x304fe870, 0x9638e3c4, 0xa7d0f959, + 0x01a7f2ed, 0xc400cc63, 0x6277c7d7, 0x539fdd4a, 0xf5e8d6fe, + 0x647e3ad9, 0xc209316d, 0xf3e12bf0, 0x55962044, 0x90311eca, + 0x3646157e, 0x07ae0fe3, 0xa1d90457, 0x579174be, 0xf1e67f0a, + 0xc00e6597, 0x66796e23, 0xa3de50ad, 0x05a95b19, 0x34414184, + 0x92364a30 + }, + { + 0x00000000, 0xccaa009e, 0x4225077d, 0x8e8f07e3, 0x844a0efa, + 0x48e00e64, 0xc66f0987, 0x0ac50919, 0xd3e51bb5, 0x1f4f1b2b, + 0x91c01cc8, 0x5d6a1c56, 0x57af154f, 0x9b0515d1, 0x158a1232, + 0xd92012ac, 0x7cbb312b, 0xb01131b5, 0x3e9e3656, 0xf23436c8, + 0xf8f13fd1, 0x345b3f4f, 0xbad438ac, 0x767e3832, 0xaf5e2a9e, + 0x63f42a00, 0xed7b2de3, 0x21d12d7d, 0x2b142464, 0xe7be24fa, + 0x69312319, 0xa59b2387, 0xf9766256, 0x35dc62c8, 0xbb53652b, + 0x77f965b5, 0x7d3c6cac, 0xb1966c32, 0x3f196bd1, 0xf3b36b4f, + 0x2a9379e3, 0xe639797d, 0x68b67e9e, 0xa41c7e00, 0xaed97719, + 0x62737787, 0xecfc7064, 0x205670fa, 0x85cd537d, 0x496753e3, + 0xc7e85400, 0x0b42549e, 0x01875d87, 0xcd2d5d19, 0x43a25afa, + 0x8f085a64, 0x562848c8, 0x9a824856, 0x140d4fb5, 0xd8a74f2b, + 0xd2624632, 0x1ec846ac, 0x9047414f, 0x5ced41d1, 0x299dc2ed, + 0xe537c273, 0x6bb8c590, 0xa712c50e, 0xadd7cc17, 0x617dcc89, + 0xeff2cb6a, 0x2358cbf4, 0xfa78d958, 0x36d2d9c6, 0xb85dde25, + 0x74f7debb, 0x7e32d7a2, 0xb298d73c, 0x3c17d0df, 0xf0bdd041, + 0x5526f3c6, 0x998cf358, 0x1703f4bb, 0xdba9f425, 0xd16cfd3c, + 0x1dc6fda2, 0x9349fa41, 0x5fe3fadf, 0x86c3e873, 0x4a69e8ed, + 0xc4e6ef0e, 0x084cef90, 0x0289e689, 0xce23e617, 0x40ace1f4, + 0x8c06e16a, 0xd0eba0bb, 0x1c41a025, 0x92cea7c6, 0x5e64a758, + 0x54a1ae41, 0x980baedf, 0x1684a93c, 0xda2ea9a2, 0x030ebb0e, + 0xcfa4bb90, 0x412bbc73, 0x8d81bced, 0x8744b5f4, 0x4beeb56a, + 0xc561b289, 0x09cbb217, 0xac509190, 0x60fa910e, 0xee7596ed, + 0x22df9673, 0x281a9f6a, 0xe4b09ff4, 0x6a3f9817, 0xa6959889, + 0x7fb58a25, 0xb31f8abb, 0x3d908d58, 0xf13a8dc6, 0xfbff84df, + 
0x37558441, 0xb9da83a2, 0x7570833c, 0x533b85da, 0x9f918544, + 0x111e82a7, 0xddb48239, 0xd7718b20, 0x1bdb8bbe, 0x95548c5d, + 0x59fe8cc3, 0x80de9e6f, 0x4c749ef1, 0xc2fb9912, 0x0e51998c, + 0x04949095, 0xc83e900b, 0x46b197e8, 0x8a1b9776, 0x2f80b4f1, + 0xe32ab46f, 0x6da5b38c, 0xa10fb312, 0xabcaba0b, 0x6760ba95, + 0xe9efbd76, 0x2545bde8, 0xfc65af44, 0x30cfafda, 0xbe40a839, + 0x72eaa8a7, 0x782fa1be, 0xb485a120, 0x3a0aa6c3, 0xf6a0a65d, + 0xaa4de78c, 0x66e7e712, 0xe868e0f1, 0x24c2e06f, 0x2e07e976, + 0xe2ade9e8, 0x6c22ee0b, 0xa088ee95, 0x79a8fc39, 0xb502fca7, + 0x3b8dfb44, 0xf727fbda, 0xfde2f2c3, 0x3148f25d, 0xbfc7f5be, + 0x736df520, 0xd6f6d6a7, 0x1a5cd639, 0x94d3d1da, 0x5879d144, + 0x52bcd85d, 0x9e16d8c3, 0x1099df20, 0xdc33dfbe, 0x0513cd12, + 0xc9b9cd8c, 0x4736ca6f, 0x8b9ccaf1, 0x8159c3e8, 0x4df3c376, + 0xc37cc495, 0x0fd6c40b, 0x7aa64737, 0xb60c47a9, 0x3883404a, + 0xf42940d4, 0xfeec49cd, 0x32464953, 0xbcc94eb0, 0x70634e2e, + 0xa9435c82, 0x65e95c1c, 0xeb665bff, 0x27cc5b61, 0x2d095278, + 0xe1a352e6, 0x6f2c5505, 0xa386559b, 0x061d761c, 0xcab77682, + 0x44387161, 0x889271ff, 0x825778e6, 0x4efd7878, 0xc0727f9b, + 0x0cd87f05, 0xd5f86da9, 0x19526d37, 0x97dd6ad4, 0x5b776a4a, + 0x51b26353, 0x9d1863cd, 0x1397642e, 0xdf3d64b0, 0x83d02561, + 0x4f7a25ff, 0xc1f5221c, 0x0d5f2282, 0x079a2b9b, 0xcb302b05, + 0x45bf2ce6, 0x89152c78, 0x50353ed4, 0x9c9f3e4a, 0x121039a9, + 0xdeba3937, 0xd47f302e, 0x18d530b0, 0x965a3753, 0x5af037cd, + 0xff6b144a, 0x33c114d4, 0xbd4e1337, 0x71e413a9, 0x7b211ab0, + 0xb78b1a2e, 0x39041dcd, 0xf5ae1d53, 0x2c8e0fff, 0xe0240f61, + 0x6eab0882, 0xa201081c, 0xa8c40105, 0x646e019b, 0xeae10678, + 0x264b06e6 + } +}; + +// Implement crc32 using Intel's "slicing by 8" algorithm. Significantly faster +// than most other common approaches on common CPUs at the time of this writing. 
+uint32_t crc32(uint32_t crc, const unsigned char* buf, size_t len) /* Continues the running checksum crc over the len bytes at buf and returns the updated checksum. */ +{ + if ( !buf ) /* A null buf yields 0, whatever crc and len are. */ + return 0; + crc = crc ^ 0xffffffff; /* Work on the inverted value; the final xor on return undoes this. */ + for ( ; 9 < len && (uintptr_t) buf & 7; len--, buf++ ) /* One byte at a time until buf is 8-byte aligned, but only while more than 9 bytes remain. */ + crc = crc_table[0][(crc & 0xff) ^ *buf] ^ (crc >> 8); + for ( ; 8 <= len; len -= 8, buf += 8 ) /* Eight bytes per iteration: the first four are mixed with the four bytes of crc, the last four index their tables directly. */ + { + crc = crc_table[7][(buf[0] ^ (crc )) & 0xff] + ^ crc_table[6][(buf[1] ^ (crc >> 8 )) & 0xff] + ^ crc_table[5][(buf[2] ^ (crc >> 16)) & 0xff] + ^ crc_table[4][(buf[3] ^ (crc >> 24)) & 0xff] + ^ crc_table[3][buf[4]] + ^ crc_table[2][buf[5]] + ^ crc_table[1][buf[6]] + ^ crc_table[0][buf[7]]; + } + for ( ; 0 < len; len--, buf++ ) /* Remaining tail of fewer than 8 bytes, one byte at a time. */ + crc = crc_table[0][(crc & 0xff) ^ *buf] ^ (crc >> 8); + return crc ^ 0xffffffff; +} + +} // namespace Sortix diff --git a/kernel/descriptor.cpp b/kernel/descriptor.cpp index 681afbf6..888a83ec 100644 --- a/kernel/descriptor.cpp +++ b/kernel/descriptor.cpp @@ -997,7 +997,11 @@ int Descriptor::bind(ioctx_t* ctx, const uint8_t* addr, size_t addrlen) int Descriptor::connect(ioctx_t* ctx, const uint8_t* addr, size_t addrlen) { - return vnode->connect(ctx, addr, addrlen); + int old_ctx_dflags = ctx->dflags; /* Saved so the caller's flags can be put back after the call. */ + ctx->dflags = ContextFlags(old_ctx_dflags, dflags); /* NOTE(review): ContextFlags is defined outside this hunk; presumably it merges this descriptor's dflags into the context's - confirm. */ + int result = vnode->connect(ctx, addr, addrlen); + ctx->dflags = old_ctx_dflags; /* Restore the caller's flags before returning. */ + return result; } int Descriptor::listen(ioctx_t* ctx, int backlog) diff --git a/kernel/include/sortix/ioctl.h b/kernel/include/sortix/ioctl.h index 26992a52..01454017 100644 --- a/kernel/include/sortix/ioctl.h +++ b/kernel/include/sortix/ioctl.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016 Jonas 'Sortie' Termansen. + * Copyright (c) 2016, 2017 Jonas 'Sortie' Termansen. 
* * Permission to use, copy, modify, and distribute this software for any * purpose with or without fee is hereby granted, provided that the above @@ -40,4 +40,22 @@ #define TIOCGPTN __IOCTL(7, __IOCTL_TYPE_PTR) #define TIOCGDISPLAYS __IOCTL(8, __IOCTL_TYPE_PTR) +#define IOC_TYPE(x) ((x) >> 0 & 0xFF) +#define IOC_TYPE_BLOCK_DEVICE 1 +#define IOC_TYPE_NETWORK_INTERFACE 2 +#define IOC_SUBTYPE(x) ((x) >> 8 & 0xFF) +#define IOC_SUBTYPE_BLOCK_DEVICE_HARDDISK 1 +#define IOC_SUBTYPE_BLOCK_DEVICE_PARTITION 2 +#define IOC_MAKE_TYPE(type, subtype) ((type) << 0 | (subtype) << 8) +#define IOCGETTYPE __IOCTL(9, __IOCTL_TYPE_VOID) + +#define NIOC_GETINFO __IOCTL(10, __IOCTL_TYPE_PTR) +#define NIOC_GETSTATUS __IOCTL(11, __IOCTL_TYPE_PTR) +#define NIOC_GETCONFIG __IOCTL(12, __IOCTL_TYPE_PTR) +#define NIOC_SETCONFIG __IOCTL(13, __IOCTL_TYPE_PTR) +#define NIOC_GETCONFIG_ETHER __IOCTL(14, __IOCTL_TYPE_PTR) +#define NIOC_SETCONFIG_ETHER __IOCTL(15, __IOCTL_TYPE_PTR) +#define NIOC_GETCONFIG_INET __IOCTL(16, __IOCTL_TYPE_PTR) +#define NIOC_SETCONFIG_INET __IOCTL(17, __IOCTL_TYPE_PTR) + #endif diff --git a/kernel/include/sortix/kernel/crc32.h b/kernel/include/sortix/kernel/crc32.h new file mode 100644 index 00000000..d6dd84ef --- /dev/null +++ b/kernel/include/sortix/kernel/crc32.h @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2017 Jonas 'Sortie' Termansen. + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + * + * sortix/kernel/crc32.h + * CRC32 checksum. + */ + +#ifndef _INCLUDE_SORTIX_KERNEL_CRC32_H +#define _INCLUDE_SORTIX_KERNEL_CRC32_H + +#include +#include + +namespace Sortix { + +uint32_t crc32(uint32_t crc, const unsigned char* buf, size_t len); + +} // namespace Sortix + +#endif diff --git a/kernel/include/sortix/kernel/if.h b/kernel/include/sortix/kernel/if.h new file mode 100644 index 00000000..121102a9 --- /dev/null +++ b/kernel/include/sortix/kernel/if.h @@ -0,0 +1,70 @@ +/* + * Copyright (c) 2015 Meisaka Yukara. + * Copyright (c) 2016, 2017, 2022 Jonas 'Sortie' Termansen. + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + * + * sortix/kernel/if.h + * Network Interface. 
+ */ + +#ifndef _INCLUDE_SORTIX_KERNEL_IF_H +#define _INCLUDE_SORTIX_KERNEL_IF_H + +#include + +#include +#include +#include +#include + +namespace Sortix { + +namespace ARP { +struct arp_table; +} // namespace ARP + +class NetworkInterface +{ +public: + NetworkInterface(); + virtual ~NetworkInterface(); + +public: + virtual bool Send(Ref pkt) = 0; + +public: + int poll(ioctx_t* ctx, PollNode* node); + short PollEventStatus(); + +public: + kthread_mutex_t cfg_lock; + kthread_cond_t cfg_cond; + struct if_info ifinfo; + struct if_status ifstatus; + struct if_config cfg; + struct ARP::arp_table* arp_table; + PollChannel poll_channel; + +}; + +bool RegisterNetworkInterface(NetworkInterface* netif, + Ref dev); + +extern kthread_mutex_t netifs_lock; +extern NetworkInterface** netifs; +extern size_t netifs_count; + +} // namespace Sortix + +#endif diff --git a/kernel/include/sortix/kernel/memorymanagement.h b/kernel/include/sortix/kernel/memorymanagement.h index 5e3d0806..ef2791de 100644 --- a/kernel/include/sortix/kernel/memorymanagement.h +++ b/kernel/include/sortix/kernel/memorymanagement.h @@ -41,6 +41,7 @@ enum page_usage PAGE_USAGE_USER_SPACE, PAGE_USAGE_EXECUTE, PAGE_USAGE_DRIVER, + PAGE_USAGE_NETWORK_PACKET, PAGE_USAGE_NUM_KINDS, PAGE_USAGE_WASNT_ALLOCATED, }; diff --git a/kernel/include/sortix/kernel/packet.h b/kernel/include/sortix/kernel/packet.h new file mode 100644 index 00000000..03b538e0 --- /dev/null +++ b/kernel/include/sortix/kernel/packet.h @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2016 Meisaka Yukara. + * Copyright (c) 2016, 2017 Jonas 'Sortie' Termansen. + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + * + * sortix/kernel/packet.h + * Reference counted network packets. + */ + +#ifndef _INCLUDE_SORTIX_KERNEL_PACKET_H +#define _INCLUDE_SORTIX_KERNEL_PACKET_H + +#include +#include + +#include +#include +#include + +namespace Sortix { + +class NetworkInterface; + +class Packet : public Refcountable +{ +public: + Packet(paddrmapped_t pmap); + virtual ~Packet(); + +public: + paddrmapped_t pmap; + unsigned char* from; + size_t length; + size_t offset; + NetworkInterface* netif; + Ref next; + +}; + +Ref GetPacket(); + +} // namespace Sortix + +#endif diff --git a/kernel/include/sortix/socket.h b/kernel/include/sortix/socket.h index d8dd1f2c..f8e424e0 100644 --- a/kernel/include/sortix/socket.h +++ b/kernel/include/sortix/socket.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013 Jonas 'Sortie' Termansen. + * Copyright (c) 2013, 2016, 2017 Jonas 'Sortie' Termansen. * * Permission to use, copy, modify, and distribute this software for any * purpose with or without fee is hereby granted, provided that the above @@ -22,23 +22,14 @@ #include -#ifdef __cplusplus -extern "C" { -#endif - -/* TODO: Nicely wrap this in an enum, as in glibc's header? */ #define SOCK_TYPE_MASK ((1<<20)-1) -#define SOCK_RAW 0 /* Will Sortix support this? 
*/ #define SOCK_DGRAM 1 #define SOCK_SEQPACKET 2 #define SOCK_STREAM 3 +#define SOCK_RAW 4 #define SOCK_NONBLOCK (1<<20) #define SOCK_CLOEXEC (1<<21) #define SOCK_CLOFORK (1<<22) -#ifdef __cplusplus -} /* extern "C" */ -#endif - #endif diff --git a/kernel/kernel.cpp b/kernel/kernel.cpp index 2f770f50..41802767 100644 --- a/kernel/kernel.cpp +++ b/kernel/kernel.cpp @@ -84,6 +84,10 @@ #include "mouse/ps2.h" #include "multiboot.h" #include "net/fs.h" +#include "net/lo/lo.h" +#include "net/ping.h" +#include "net/tcp.h" +#include "net/udp.h" #include "poll.h" #include "pty.h" #include "uart.h" @@ -111,6 +115,7 @@ static void SystemIdleThread(void* user); static int argc; static char** argv; static multiboot_info_t* bootinfo; +static bool enable_network_drivers = true; static char* cmdline_tokenize(char** saved) { @@ -291,6 +296,10 @@ extern "C" void KernelInit(unsigned long magic, multiboot_info_t* bootinfo_p) HaltKernel(); } } + else if ( !strcmp(arg, "--disable-network-drivers") ) + enable_network_drivers = false; + else if ( !strcmp(arg, "--enable-network-drivers") ) + enable_network_drivers = true; else if ( !strcmp(arg, "--no-random-seed") ) no_random_seed = true; else @@ -610,6 +619,23 @@ static void BootThread(void* /*user*/) // Initialize the filesystem network. NetFS::Init(); + // Initialize the ping protocol. + Ping::Init(); + + // Initialize the TCP. + TCP::Init(); + + // Initialize the UDP. + UDP::Init(); + + // Initialize the loopback driver. + Loopback::Init("/dev", slashdev); + + // Initialize the network drivers. + if ( enable_network_drivers ) + { + } + // // Stage 6. Executing Hosted Environment ("User-Space") // diff --git a/kernel/net/arp.cpp b/kernel/net/arp.cpp new file mode 100644 index 00000000..8bb53a51 --- /dev/null +++ b/kernel/net/arp.cpp @@ -0,0 +1,740 @@ +/* + * Copyright (c) 2016, 2017 Jonas 'Sortie' Termansen. 
+ * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + * + * net/arp.cpp + * Address resolution protocol. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "arp.h" +#include "ether.h" + +// Every network interface has its own ARP table of cached entries. The table +// is a hash map of IP address to an ARP entry. The hash function is the +// bytewise xor of each byte in the IP address. The table can contain up to 256 +// entries, which all start out in a linked list of unused entries. +// +// The used entries of an table are in a linked list sorted in order of last +// use. The unused entries are in a linked list in no particular order. The +// entries currently being resolved are in the deadline linked list sorted in +// order of the request deadline. The entries currently resolved and valid are +// in a linked list sorted in order of their expiration. +// +// To evict an entry from the cache, remove the entry from the appropriate +// linked lists, discard the entry's transmission queue, clear it, and add it to +// the table's list of unused entries. +// +// To allocate an entry for an IP address, the hash table is searched for an +// existing entry to return. 
If an existing entry is found, it is moved to the +// front of the hash table in case of a collision. Otherwise, the first unused +// entry is used. If the table was full, the least recently used entry is +// evicted and then used. The new entry is assigned the IP address and added to +// the hash table. +// +// When a packet is sent to an IP address, an ARP table is made for the network +// interface if it doesn't already have one. If the IP address is outside the +// network interface's IP subnet, or if the network interface has no IP address +// configured, the request fails. An ARP entry for the destination IP address is +// searched for, or if none exists, then a new one is allocated. The entry is +// marked as USED and is moved to the front of the table's list of entries in +// order of last use. If the entry is marked as RESOLVED, the packet is just +// sent to the entry's Ethernet address. Otherwise if the entry has not been +// marked as RESOLVING, an initial request for the IP address is broadcast on +// the local network, the entry is added to the end of the deadline linked list, +// and the deadline timer is set to fire when the request times out. The packet +// is added to the entry's transmission queue unless it is already full. +// +// If the deadline timer fires, the entry is removed from the deadline linked +// list. If too many attempts failed, the entry is evicted. Otherwise, the IP +// address resolution is attempted again and the entry's request attempt counter +// is incremented. +// +// When an ARP message is received, the message is discarded if the source or +// destination IP is outside the network interface's subnet, or if the network +// interface did not have an IP address configured. The entry for the source +// IP address is located in the network interface's table, or if none exists and +// the table is not currently full, an entry is allocated. The entry is removed +// from the deadline linked list if it is RESOLVING. 
The entry is removed from +// the expiration linked list if it is EXPIRING. The entry is marked as RESOLVED +// and the source Ethernet address is assigned to the entry. The entry is marked +// as EXPIRING and is added to the end of the expiring linked list and the +// expiration is set to fire when the entry expires. Every packet in the entry's +// transmission queue is sent to the source Ethernet address. +// +// If the message is a request, and the destination IP address is that of the +// network interface, an ARP reply message is sent back with the +// Ethernet address of the network interface. +// +// When the expiration timer fires, the entry is removed from the expiration +// linked list. If the entry was not marked as USED, it is evicted. Otherwise +// the entry is marked as RESOLVING, the request attempt counter is +// reset, and the address resolution is attempted again. Until the renewal +// succeeds or times out, the entry remains marked RESOLVED and is used to route +// traffic from its IP address to its Ethernet address. + +#define ETHERTYPE_ETHER 1 + +#define ARP_REQUEST 1 +#define ARP_REPLY 2 + +// The entry contains a valid Ethernet address that has been resolved. +#define ARP_STATUS_RESOLVED (1 << 0) + +// The entry is currently being resolved, the deadline timeout has been set and +// the deadline timer will fire when the resolution times out. This status is +// mutually exclusive with the EXPIRING status. +#define ARP_STATUS_RESOLVING (1 << 1) + +// The entry has been resolved and is currently waiting until it expires, the +// expiration timeout has been set and the expiration timer will fire when the +// entry expires. This status is mutually exclusive with the RESOLVING status. +#define ARP_STATUS_EXPIRING (1 << 2) + +// The entry has been used to route a packet and should be renewed on expiry. +#define ARP_STATUS_USED (1 << 3) + +// The number of entries in an ARP table, this value is documented in arp(4).
+#define ARP_TABLE_LENGTH 256 + +// The number of entries in the ARP table hashmap, this value is documented +// in arp(4). +#define ARP_HASHTABLE_LENGTH 256 + +// Attempt to resolve an address this many times before giving up, this value is +// documented in arp(4). +#define ARP_MAX_ATTEMPTS 3 + +// The maximum number of packets in an ARP entry's transmission queue, this +// value is documented in arp(4). +#define ARP_MAX_PENDING 16 + +namespace Sortix { +namespace ARP { + +// The duration to wait before giving up on an attempt to resolve an address, +// this value is documented in arp(4). +static const struct timespec REQUEST_TIMEOUT = { .tv_sec = 1, .tv_nsec = 0 }; + +// The duration before the entry expires and renewal begins, this value is +// documented in arp(4). +static const struct timespec ENTRY_TIMEOUT = { .tv_sec = 60, .tv_nsec = 0 }; + +struct arp +{ + uint16_t hrd; /* Hardware address space */ + uint16_t pro; /* Protocol address space */ + uint8_t hln; /* Byte length of each hardware address */ + uint8_t pln; /* Byte length of each protocol address */ + uint16_t op; /* opcode */ + uint8_t sha[6]; /* Hardware address of sender */ + uint8_t spa[4]; /* Protocol address of sender */ + uint8_t tha[6]; /* Hardware address of target */ + uint8_t tpa[4]; /* Protocol address of target */ +}; + +struct arp_entry +{ + struct arp_table* table; + struct arp_entry* prev_by_table; + struct arp_entry* next_by_table; + struct arp_entry* prev_by_hash; + struct arp_entry* next_by_hash; + struct arp_entry* prev_by_timer; + struct arp_entry* next_by_timer; + struct timespec timeout; + struct in_addr addr; + struct ether_addr ether; + uint16_t status; + uint16_t attempts; + uint16_t pending; + Ref pending_first; + Ref pending_last; +}; + +struct arp_table +{ + NetworkInterface* netif; + struct arp_entry* first_unused; + struct arp_entry* first_used; + struct arp_entry* last_unused; + struct arp_entry* last_used; + struct arp_entry* hashtable[ARP_HASHTABLE_LENGTH]; + struct 
// Hash an IPv4 address (in network byte order) to an index into an ARP table's
// hash table by xoring the four bytes of the address together.
//
// This hash function is perfect if the subnet is at least /24, with no more
// than 8 remaining bits for the address of the machine on the subnet: the
// network bytes then contribute the same constant to every address, so
// distinct host bytes map to distinct hashes.
static inline uint8_t HashAddress(const struct in_addr* addr)
{
	uint32_t value = be32toh(addr->s_addr);
	// Shift each byte down into the low eight bits before masking. Shifting up
	// instead would mask away every byte but the lowest one.
	return (value >> 0 & 0xFF) ^ (value >> 8 & 0xFF) ^
	       (value >> 16 & 0xFF) ^ (value >> 24 & 0xFF);
}
+ entry->prev_by_table->next_by_table : + table->first_used) = entry->next_by_table; + entry->prev_by_table = NULL; + entry->next_by_table = NULL; + + // Remove from the hash table. + if ( entry->next_by_hash ) + entry->next_by_hash->prev_by_hash = entry->prev_by_hash; + (entry->prev_by_hash ? + entry->prev_by_hash->next_by_hash : + table->hashtable[hash]) = entry->next_by_hash; + entry->prev_by_hash = NULL; + entry->next_by_hash = NULL; + + // Remove from deadline linked list. + if ( entry->status & ARP_STATUS_RESOLVING ) + { + (entry->next_by_timer ? + entry->next_by_timer->prev_by_timer : + last_by_deadline) = entry->prev_by_timer; + (entry->prev_by_timer ? + entry->prev_by_timer->next_by_timer : + first_by_deadline) = entry->next_by_timer; + entry->prev_by_timer = NULL; + entry->next_by_timer = NULL; + } + + // Remove from expiration linked list. + else if ( entry->status & ARP_STATUS_EXPIRING ) + { + (entry->next_by_timer ? + entry->next_by_timer->prev_by_timer : + last_by_expiration) = entry->prev_by_timer; + (entry->prev_by_timer ? + entry->prev_by_timer->next_by_timer : + first_by_expiration) = entry->next_by_timer; + entry->prev_by_timer = NULL; + entry->next_by_timer = NULL; + } + + // Drain the transmission queue while avoiding a stack overflow in packet + // recursive destructor. + while ( entry->pending_first ) + { + Ref next = entry->pending_first->next; + entry->pending_first->next.Reset(); + entry->pending_first = next; + entry->pending--; + if ( !entry->pending_first ) + entry->pending_last.Reset(); + } + + // Clear the entry. + memset(entry, 0, sizeof(*entry)); + entry->table = table; + + // Insert the entry into the table's unused linked list. + (table->first_unused ? 
+ table->first_unused->prev_by_table : + table->last_unused) = entry; + entry->prev_by_table = NULL; + entry->next_by_table = table->first_unused; + table->first_unused = entry; +} + +// arp_lock locked +static struct arp_entry* AllocateEntry(struct arp_table* table, + const struct in_addr* addr, + bool evict) +{ + // Search for an existing entry. + unsigned char hash = HashAddress(addr); + for ( struct arp_entry* entry = table->hashtable[hash]; + entry; + entry = entry->next_by_hash ) + { + if ( be32toh(addr->s_addr) == be32toh(entry->addr.s_addr) ) + { + // Move to the front of the hash table if not already. + if ( entry->prev_by_hash ) + { + if ( entry->next_by_hash ) + entry->next_by_hash->prev_by_hash = entry->prev_by_hash; + entry->prev_by_hash->next_by_hash = entry->next_by_hash; + entry->prev_by_hash = NULL; + entry->next_by_hash = table->hashtable[hash]; + table->hashtable[hash]->prev_by_hash = entry; + table->hashtable[hash] = entry; + } + return entry; + } + } + + // Allocate a new entry, potentially evicting the least recently used entry. + struct arp_entry* entry = table->first_unused; + if ( !table->first_unused ) + { + if ( !evict ) + return NULL; + EvictEntry(table, table->last_used); + assert(table->first_unused); + entry = table->first_unused; + } + + // Remove from the table's unused list. + table->first_unused = entry->next_by_table; + (table->first_unused ? + table->first_unused->prev_by_table : + table->last_unused) = NULL; + + // Initialize the entry. + entry->addr.s_addr = addr->s_addr; + + // Insert into the table's used list. + (table->last_used ? + table->last_used->next_by_table : + table->first_used) = entry; + entry->prev_by_table = table->last_used; + entry->next_by_table = NULL; + table->last_used = entry; + + // Add to the front of the hash table. 
+ if ( table->hashtable[hash] ) + table->hashtable[hash]->prev_by_hash = entry; + entry->prev_by_hash = NULL; + entry->next_by_hash = table->hashtable[hash]; + table->hashtable[hash] = entry; + + return entry; +} + +// arp_lock locked +static bool Resolve(NetworkInterface* netif, struct arp_entry* entry) +{ + if ( !deadline_timer ) + { + if ( !(deadline_timer = new Timer()) ) + return false; + deadline_timer->Attach(Time::GetClock(CLOCK_MONOTONIC)); + } + struct ether_addr src_ether_addr; + struct in_addr src_in_addr; + kthread_mutex_lock(&netif->cfg_lock); + struct if_config* cfg = &netif->cfg; + memcpy(&src_ether_addr, &cfg->ether.address, sizeof(struct ether_addr)); + memcpy(&src_in_addr, &cfg->inet.address, sizeof(struct in_addr)); + kthread_mutex_unlock(&netif->cfg_lock); + struct arp arp; + arp.hrd = htobe16(ETHERTYPE_ETHER); + arp.pro = htobe16(ETHERTYPE_IP); + arp.hln = sizeof(struct ether_addr); + arp.pln = sizeof(struct in_addr); + arp.op = htobe16(ARP_REQUEST); + memcpy(arp.sha, &src_ether_addr, sizeof(struct ether_addr)); + memcpy(arp.spa, &src_in_addr, sizeof(struct in_addr)); + memcpy(arp.tha, ðeraddr_broadcast, sizeof(struct ether_addr)); + memcpy(arp.tpa, &entry->addr, sizeof(struct in_addr)); + Ref pkt = GetPacket(); + if ( !pkt ) + return false; + if ( pkt->pmap.size < sizeof(arp) ) + return errno = EMSGSIZE, false; + pkt->length = sizeof(arp); + memcpy(pkt->from, &arp, sizeof(arp)); + if ( !Ether::Send(pkt, &src_ether_addr, ðeraddr_broadcast, ETHERTYPE_ARP, + netif) ) + return false; + entry->status |= ARP_STATUS_RESOLVING; + entry->attempts++; + struct timespec now = Time::Get(CLOCK_MONOTONIC); + entry->timeout = timespec_add(now, REQUEST_TIMEOUT); + // Add entry to end of deadline linked list. + (last_by_deadline ? 
+ last_by_deadline->next_by_timer : + first_by_deadline) = entry; + entry->prev_by_timer = last_by_deadline; + entry->next_by_timer = NULL; + last_by_deadline = entry; + if ( !deadline_timer_armed ) + { + struct itimerspec its; + its.it_value = REQUEST_TIMEOUT; + its.it_interval = timespec_nul(); + deadline_timer->Set(&its, NULL, 0, OnDeadline, NULL); + deadline_timer_armed = true; + } + return true; +} + +static void OnDeadline(Clock* clock, Timer* timer, void* /*context*/) +{ + ScopedLock lock(&arp_lock); + struct timespec now; + clock->Get(&now, NULL); + struct arp_entry* entry; + while ( (entry = first_by_deadline) ) + { + if ( timespec_lt(now, entry->timeout) ) + { + struct itimerspec its; + its.it_value = timespec_sub(entry->timeout, now); + its.it_interval = timespec_nul(); + timer->Set(&its, NULL, 0, OnDeadline, NULL); + return; + } + struct arp_table* table = entry->table; + // Remove from the deadline linked list. + entry->status &= ~ARP_STATUS_RESOLVING; + first_by_deadline = entry->next_by_timer; + (first_by_deadline ? + first_by_deadline->prev_by_timer : + last_by_deadline) = NULL; + entry->prev_by_timer = NULL; + entry->next_by_timer = NULL; + if ( entry->attempts < ARP_MAX_ATTEMPTS ) + Resolve(table->netif, entry); + else + EvictEntry(table, entry); + } + deadline_timer_armed = false; +} + +static void OnExpiration(Clock* clock, Timer* timer, void* /*context*/) +{ + ScopedLock lock(&arp_lock); + struct timespec now; + clock->Get(&now, NULL); + struct arp_entry* entry; + while ( (entry = first_by_expiration) ) + { + if ( timespec_lt(now, entry->timeout) ) + { + struct itimerspec its; + its.it_value = timespec_sub(entry->timeout, now); + its.it_interval = timespec_nul(); + timer->Set(&its, NULL, 0, OnExpiration, NULL); + return; + } + struct arp_table* table = entry->table; + // Remove the entry from the expiration linked list. + entry->status &= ~ARP_STATUS_EXPIRING; + first_by_expiration = entry->next_by_timer; + (first_by_expiration ? 
+ first_by_expiration->prev_by_timer : + last_by_expiration) = NULL; + entry->prev_by_timer = NULL; + entry->next_by_timer = NULL; + if ( entry->status & ARP_STATUS_USED ) + { + entry->status &= ~ARP_STATUS_USED; + entry->attempts = 0; + Resolve(table->netif, entry); + } + else + EvictEntry(table, entry); + } + expiration_timer_armed = false; +} + +bool RouteIPEthernet(NetworkInterface* netif, + Ref pkt, + const struct in_addr* dst) +{ + struct ether_addr local_ether; + struct in_addr local_in; + struct in_addr local_subnet; + kthread_mutex_lock(&netif->cfg_lock); + memcpy(&local_ether, &netif->cfg.ether.address, sizeof(struct ether_addr)); + memcpy(&local_in, &netif->cfg.inet.address, sizeof(struct in_addr)); + memcpy(&local_subnet, &netif->cfg.inet.subnet, sizeof(struct in_addr)); + kthread_mutex_unlock(&netif->cfg_lock); + if ( be32toh(local_in.s_addr) == INADDR_ANY ) + return errno = ENETUNREACH, false; + if ( (local_in.s_addr & local_subnet.s_addr) != + (dst->s_addr &local_subnet.s_addr) ) + return errno = ENETUNREACH, false; + ScopedLock lock(&arp_lock); + struct arp_table* table = GetTable(netif); + if ( !table ) + return false; + struct arp_entry* entry = AllocateEntry(table, dst, true); + assert(entry); + // Mark as USED and move the entry to the front of table's used linked list. + entry->status |= ARP_STATUS_USED; + if ( entry->prev_by_table ) + { + (entry->next_by_table ? 
+ entry->next_by_table->prev_by_table : + table->last_used) = entry->prev_by_table; + entry->prev_by_table->next_by_table = entry->next_by_table; + entry->prev_by_table = NULL; + entry->next_by_table = table->first_used; + table->first_used->prev_by_table = entry; + table->first_used = entry; + } + if ( entry->status & ARP_STATUS_RESOLVED ) + { + struct ether_addr dst_ether = entry->ether; + lock.Reset(); + return Ether::Send(pkt, &local_ether, &dst_ether, ETHERTYPE_IP, netif); + } + assert(!pkt->next); + if ( !(entry->status & ARP_STATUS_RESOLVING) && !Resolve(netif, entry) ) + return false; + // Drop the packet if the transmission queue is full. + if ( ARP_MAX_PENDING <= entry->pending ) + return true; + (entry->pending_last ? + entry->pending_last->next : + entry->pending_first) = pkt; + entry->pending_last = pkt; + entry->pending++; + return true; +} + +void Handle(Ref pkt, + const struct ether_addr* src_ether_of_packet, + const struct ether_addr* /*dst_ether*/, + bool /*dst_ether*/) +{ + const unsigned char* in = pkt->from + pkt->offset; + size_t inlen = pkt->length - pkt->offset; + NetworkInterface* netif = pkt->netif; + struct arp hdr; + if ( inlen < sizeof(hdr) ) + return; + memcpy(&hdr, in, sizeof(hdr)); + hdr.hrd = be16toh(hdr.hrd); + hdr.pro = be16toh(hdr.pro); + hdr.op = be16toh(hdr.op); + + // Drop unsupported or invalid packets. 
+ if ( !(hdr.hrd == ETHERTYPE_ETHER && hdr.hln == 6) ) + return; + if ( !(hdr.pro == ETHERTYPE_IP && hdr.pln == 4) ) + return; + if ( !(hdr.op == ARP_REQUEST || hdr.op == ARP_REPLY) ) + return; + + struct in_addr src; + struct in_addr dst; + memcpy(&src, hdr.spa, sizeof(src)); + memcpy(&dst, hdr.tpa, sizeof(dst)); + struct ether_addr src_ether; + memcpy(&src_ether, hdr.sha, sizeof(src_ether)); + struct ether_addr local_eth; + struct in_addr local_in; + struct in_addr local_subnet; + kthread_mutex_lock(&netif->cfg_lock); + memcpy(&local_eth, &netif->cfg.ether.address, sizeof(struct ether_addr)); + memcpy(&local_in, &netif->cfg.inet.address, sizeof(struct in_addr)); + memcpy(&local_subnet, &netif->cfg.inet.subnet, sizeof(struct in_addr)); + kthread_mutex_unlock(&netif->cfg_lock); + + // Drop packets if the network interface does not have an IP address + // configured, or if the source or destination IP address are outside of the + // network interface's IP subnet. + if ( be32toh(local_in.s_addr) == INADDR_ANY ) + return; + if ( (local_in.s_addr & local_subnet.s_addr) != + (src.s_addr & local_subnet.s_addr)) + return; + if ( (local_in.s_addr & local_subnet.s_addr) != + (dst.s_addr & local_subnet.s_addr) ) + return; + + ScopedLock lock(&arp_lock); + + if ( !expiration_timer ) + { + if ( !(expiration_timer = new Timer()) ) + return; + expiration_timer->Attach(Time::GetClock(CLOCK_MONOTONIC)); + } + + struct arp_table* table = GetTable(netif); + if ( !table ) + return; + struct arp_entry* entry = AllocateEntry(table, &src, false); + + if ( entry ) + { + // Remove from pending request linked list. + if ( entry->status & ARP_STATUS_RESOLVING ) + { + entry->status &= ~ARP_STATUS_RESOLVING; + (entry->next_by_timer ? + entry->next_by_timer->prev_by_timer : + last_by_deadline) = entry->prev_by_timer; + (entry->prev_by_timer ? 
+ entry->prev_by_timer->next_by_timer : + first_by_deadline) = entry->next_by_timer; + entry->prev_by_timer = NULL; + entry->next_by_timer = NULL; + } + + // Remove from expiration linked list. + else if ( entry->status & ARP_STATUS_EXPIRING ) + { + entry->status &= ~ARP_STATUS_EXPIRING; + (entry->next_by_timer ? + entry->next_by_timer->prev_by_timer : + last_by_expiration) = entry->prev_by_timer; + (entry->prev_by_timer ? + entry->prev_by_timer->next_by_timer : + first_by_expiration) = entry->next_by_timer; + entry->prev_by_timer = NULL; + entry->next_by_timer = NULL; + } + + // Mark entry as RESOLVED. + entry->status |= ARP_STATUS_RESOLVED; + memcpy(&entry->ether, &src_ether, sizeof(src_ether)); + + // Mark entry as EXPIRING and add to end of the expiration linked list. + entry->status |= ARP_STATUS_EXPIRING; + (last_by_expiration ? + last_by_expiration->next_by_timer : + first_by_expiration) = entry; + entry->prev_by_timer = last_by_expiration; + entry->next_by_timer = NULL; + last_by_expiration = entry; + struct timespec now = Time::Get(CLOCK_MONOTONIC); + entry->timeout = timespec_add(now, ENTRY_TIMEOUT); + if ( !expiration_timer_armed ) + { + struct itimerspec its; + its.it_value = ENTRY_TIMEOUT; + its.it_interval = timespec_nul(); + expiration_timer->Set(&its, NULL, 0, OnExpiration, NULL); + expiration_timer_armed = true; + } + + // Transmit the transission queue. + while ( entry->pending_first ) + { + Ref pending = entry->pending_first; + entry->pending_first = pending->next; + pending->next.Reset(); + Ether::Send(pending, &local_eth, &src_ether, ETHERTYPE_IP, netif); + if ( !entry->pending_first ) + entry->pending_last.Reset(); + } + } + + // Send an ARP reply if our local address was requested. 
+ if ( hdr.op == ARP_REQUEST && + !memcmp(&local_in, &dst, sizeof(struct in_addr)) ) + { + Ref packet = GetPacket(); + if ( !packet ) + return; + struct arp arp; + arp.hrd = htobe16(ETHERTYPE_ETHER); + arp.pro = htobe16(ETHERTYPE_IP); + arp.hln = sizeof(struct ether_addr); + arp.pln = sizeof(struct in_addr); + arp.op = htobe16(ARP_REPLY); + memcpy(arp.sha, &local_eth, sizeof(struct ether_addr)); + memcpy(arp.spa, &local_in, sizeof(struct in_addr)); + memcpy(arp.tha, &src_ether, sizeof(struct ether_addr)); + memcpy(arp.tpa, &src, sizeof(struct in_addr)); + if ( packet->pmap.size < sizeof(arp) ) + return; + packet->length = sizeof(arp); + unsigned char* out = packet->from; + memcpy(out, &arp, sizeof(arp)); + Ether::Send(packet, &local_eth, src_ether_of_packet, ETHERTYPE_ARP, + netif); + } +} + +// arp_lock locked, netif->cfg_lock locked. +void OnConfiguration(NetworkInterface* netif, + const struct if_config* old_cfg, + const struct if_config* new_cfg) +{ + // Purge the ARP cache if the ether or inet configuration changed. + if ( !memcmp(&old_cfg->ether, &new_cfg->ether, sizeof(new_cfg->ether)) && + !memcmp(&old_cfg->inet, &new_cfg->inet, sizeof(new_cfg->inet)) ) + return; + struct arp_table* table = GetTable(netif); + if ( !table ) + return; + while ( table->first_used ) + EvictEntry(table, table->first_used); +} + +} // namespace ARP +} // namespace Sortix diff --git a/kernel/net/arp.h b/kernel/net/arp.h new file mode 100644 index 00000000..88602f67 --- /dev/null +++ b/kernel/net/arp.h @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2016, 2017 Jonas 'Sortie' Termansen. + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + * + * net/arp.h + * Address resolution protocol. + */ + +#ifndef NET_ARP_H +#define NET_ARP_H + +#include +#include + +#include + +struct if_config; + +namespace Sortix { +class NetworkInterface; +} // namespace Sortix + +namespace Sortix { +namespace ARP { + +extern kthread_mutex_t arp_lock; + +bool RouteIPEthernet(NetworkInterface* netif, + Ref pkt, + const struct in_addr* dst); +void Handle(Ref pkt, + const struct ether_addr* src, + const struct ether_addr* dst, + bool dst_broadcast); +void OnConfiguration(NetworkInterface* netif, + const struct if_config* old_cfg, + const struct if_config* new_cfg); + +} // namespace ARP +} // namespace Sortix + +#endif diff --git a/kernel/net/ether.cpp b/kernel/net/ether.cpp new file mode 100644 index 00000000..34937f12 --- /dev/null +++ b/kernel/net/ether.cpp @@ -0,0 +1,139 @@ +/* + * Copyright (c) 2016, 2017 Jonas 'Sortie' Termansen. + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
+ *
+ * net/ether.cpp
+ * Ethernet.
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+
+#include "arp.h"
+#include "ether.h"
+#include "ip.h"
+
+namespace Sortix {
+namespace Ether {
+// Maximum transmission unit of netif; currently ETHERMTU for every interface.
+size_t GetMTU(NetworkInterface* netif)
+{
+	(void) netif;
+	return ETHERMTU;
+}
+// Check an incoming frame (CRC unless offloaded, addresses) and dispatch it.
+void Handle(Ref pkt, bool checksum_offloaded)
+{
+	assert(pkt->netif);
+	assert(pkt->offset <= pkt->length);
+	const unsigned char* in = pkt->from + pkt->offset;
+	size_t inlen = pkt->length - pkt->offset;
+	struct ether_header hdr;
+	if ( !checksum_offloaded ) // Verify the trailing CRC in software.
+	{
+		struct ether_footer ftr;
+		if ( inlen < sizeof(hdr) + sizeof(ftr) /* ETHER_LEN */ )
+			return;
+		size_t ftr_offset = inlen - sizeof(ftr);
+		memcpy(&ftr, in + ftr_offset, sizeof(ftr));
+		pkt->length -= sizeof(ftr); // Hide the CRC from the upper layers.
+		inlen -= sizeof(ftr);
+		ftr.ether_crc = le32toh(ftr.ether_crc);
+		if ( ftr.ether_crc != crc32(0, in, inlen) )
+			return;
+	}
+	else if ( inlen < sizeof(hdr) /* ETHER_HDR_LEN */ )
+		return;
+	memcpy(&hdr, in, sizeof(hdr));
+	hdr.ether_type = be16toh(hdr.ether_type);
+	pkt->offset += sizeof(hdr);
+	const struct ether_addr* src = (const struct ether_addr*) &hdr.ether_shost;
+	const struct ether_addr* dst = (const struct ether_addr*) &hdr.ether_dhost;
+	// Drop invalid frames with broadcast source.
+	if ( !memcmp(src, &etheraddr_broadcast, sizeof(struct ether_addr)) )
+		return;
+	// Accept only frames with destination being broadcast or our address.
+	bool dst_broadcast =
+		!memcmp(dst, &etheraddr_broadcast, sizeof(struct ether_addr));
+	if ( !dst_broadcast )
+	{
+		ScopedLock lock(&pkt->netif->cfg_lock); // Named: held to end of block.
+		const struct ether_addr* local = &pkt->netif->cfg.ether.address;
+		if ( pkt->netif->ifinfo.type != IF_TYPE_LOOPBACK &&
+		     memcmp(dst, local, sizeof(struct ether_addr)) != 0 )
+			return;
+	}
+	switch ( hdr.ether_type )
+	{
+	case ETHERTYPE_IP: // IPv4
+		IP::Handle(pkt, src, dst, dst_broadcast);
+		break;
+	case ETHERTYPE_ARP: // Address Resolution Protocol
+		ARP::Handle(pkt, src, dst, dst_broadcast);
+		break;
+	case ETHERTYPE_IPV6: // IPv6 (no handler here, frame is dropped)
+		break;
+	default:
+		break;
+	}
+}
+// Wrap pktin in an Ethernet frame (padded, CRC unless offloaded) and send it.
+bool Send(Ref pktin,
+          const struct ether_addr* src,
+          const struct ether_addr* dst,
+          uint16_t ether_type,
+          NetworkInterface* netif)
+{
+	if ( ETHERMTU < pktin->length )
+		return errno = EMSGSIZE, false;
+	Ref pkt = GetPacket();
+	if ( !pkt )
+		return false;
+	const unsigned char* in = pktin->from;
+	size_t inlen = pktin->length;
+	size_t padding = inlen < ETHERMIN ? ETHERMIN - inlen : 0;
+	unsigned char* out = pkt->from;
+	struct ether_header hdr;
+	struct ether_footer ftr;
+	size_t outlen = sizeof(hdr) /* ETHER_HDR_LEN */ + inlen + padding;
+	if ( !(netif->ifinfo.features & IF_FEATURE_ETHERNET_CRC_OFFLOAD) )
+		outlen += sizeof(ftr) /* ETHER_CRC_LEN */;
+	if ( pkt->pmap.size < outlen )
+		return errno = EMSGSIZE, false;
+	pkt->length = outlen;
+	memcpy(&hdr.ether_dhost, dst, sizeof(struct ether_addr));
+	memcpy(&hdr.ether_shost, src, sizeof(struct ether_addr));
+	hdr.ether_type = htobe16(ether_type);
+	memcpy(out, &hdr, sizeof(hdr));
+	memcpy(out + sizeof(hdr), in, inlen);
+	memset(out + sizeof(hdr) + inlen, 0, padding);
+	if ( !(netif->ifinfo.features & IF_FEATURE_ETHERNET_CRC_OFFLOAD) )
+	{ // The CRC covers header, payload and padding, but not the CRC itself.
+		ftr.ether_crc = htole32(crc32(0, out, outlen - sizeof(ftr)));
+		memcpy(out + sizeof(hdr) + inlen + padding, &ftr, sizeof(ftr));
+	}
+	return netif->Send(pkt);
+}
+
+} // namespace Ether
+} // namespace Sortix
diff --git a/kernel/net/ether.h b/kernel/net/ether.h
new file mode 100644
index 00000000..cee45491
--- /dev/null
+++ b/kernel/net/ether.h
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2016, 2017 Jonas 'Sortie' Termansen.
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ *
+ * net/ether.h
+ * Ethernet.
+ */ + +#ifndef SORTIX_NET_ETHER_H +#define SORTIX_NET_ETHER_H + +#include +#include + +#include + +namespace Sortix { +class NetworkInterface; +} // namespace Sortix + +namespace Sortix { +namespace Ether { + +size_t GetMTU(NetworkInterface* netif); +void Handle(Ref pkt, bool checksum_offloaded); +bool Send(Ref pkt, + const struct ether_addr* src, + const struct ether_addr* dst, + uint16_t ether_type, + NetworkInterface* netif); + +} // namespace Ether +} // namespace Sortix + +#endif diff --git a/kernel/net/if.cpp b/kernel/net/if.cpp new file mode 100644 index 00000000..c0150d42 --- /dev/null +++ b/kernel/net/if.cpp @@ -0,0 +1,237 @@ +/* + * Copyright (c) 2015 Meisaka Yukara. + * Copyright (c) 2016, 2017, 2022 Jonas 'Sortie' Termansen. + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + * + * net/if.cpp + * Network Interface. 
+ */ + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "arp.h" + +namespace Sortix { + +kthread_mutex_t netifs_lock = KTHREAD_MUTEX_INITIALIZER; +NetworkInterface** netifs = NULL; +size_t netifs_count = 0; +static size_t netifs_allocated = 0; + +class NetworkInterfaceNode : public AbstractInode +{ +public: + NetworkInterfaceNode(dev_t dev, uid_t owner, gid_t group, mode_t mode, + NetworkInterface* netif); + virtual ~NetworkInterfaceNode(); + +public: + virtual int ioctl(ioctx_t* ctx, int cmd, uintptr_t ptr); + virtual int poll(ioctx_t* ctx, PollNode* node); + +private: + NetworkInterface* netif; + +}; + +bool RegisterNetworkInterface(NetworkInterface* netif, + Ref dev) +{ + ScopedLock lock(&netifs_lock); + if ( netifs_count == netifs_allocated ) + { + size_t new_length_half = netifs_allocated; + if ( new_length_half == 0 ) + new_length_half = 8; + NetworkInterface** new_netifs = + (NetworkInterface**) reallocarray(netifs, new_length_half, + 2 * sizeof(NetworkInterface*)); + if ( !new_netifs ) + return false; + netifs = new_netifs; + netifs_allocated = new_length_half * 2; + } + Ref node(new NetworkInterfaceNode(dev->dev, 0, 0, 0666, netif)); + if ( !node ) + return false; + ioctx_t ctx; SetupKernelIOCtx(&ctx); + if ( LinkInodeInDir(&ctx, dev, netif->ifinfo.name, node) != 0 ) + return false; + // Interfaces are counted from 1 inclusive up to UINT_MAX exclusive. 
+ if ( netifs_count == 0 ) + netifs[netifs_count++] = NULL; + if ( UINT_MAX <= netifs_count + 1 ) + return errno = EOVERFLOW, false; + unsigned int linkid = netifs_count++; + netifs[linkid] = netif; + netif->ifinfo.linkid = linkid; + return true; +} + +NetworkInterface::NetworkInterface() +{ + cfg_lock = KTHREAD_MUTEX_INITIALIZER; + cfg_cond = KTHREAD_COND_INITIALIZER; + memset(&ifinfo, 0, sizeof(ifinfo)); + memset(&ifstatus, 0, sizeof(ifstatus)); + memset(&cfg, 0, sizeof(cfg)); + arp_table = NULL; + // poll_channel is initialized by its constructor. +} + +NetworkInterface::~NetworkInterface() +{ +} + +short NetworkInterface::PollEventStatus() +{ + short status = 0; + if ( ifstatus.flags & IF_STATUS_FLAGS_UP ) + status |= POLLIN | POLLRDNORM | POLLOUT | POLLWRNORM; + return status; +} + +int NetworkInterface::poll(ioctx_t* /*ctx*/, PollNode* node) +{ + ScopedLock lock(&cfg_lock); + short ret_status = PollEventStatus() & node->events; + if ( ret_status ) + { + node->master->revents |= ret_status; + return 0; + } + poll_channel.Register(node); + return errno = EAGAIN, -1; +} + +NetworkInterfaceNode::NetworkInterfaceNode(dev_t dev, uid_t owner, gid_t group, + mode_t mode, NetworkInterface* netif) +{ + inode_type = INODE_TYPE_UNKNOWN; + if ( !dev ) + dev = (dev_t) this; + ino = (ino_t) this; + this->type = S_IFCHR; + this->dev = dev; + this->stat_uid = owner; + this->stat_gid = group; + this->stat_mode = (mode & S_SETABLE) | this->type; + this->netif = netif; +} + +NetworkInterfaceNode::~NetworkInterfaceNode() +{ +} + +int NetworkInterfaceNode::ioctl(ioctx_t* ctx, int cmd, uintptr_t arg) +{ + void* ptr = (void*) arg; + + if ( cmd == NIOC_SETCONFIG || + cmd == NIOC_SETCONFIG_ETHER || + cmd == NIOC_SETCONFIG_INET ) + { + ScopedLock outer(&ARP::arp_lock); + ScopedLock inner(&netif->cfg_lock); + struct if_config new_cfg; + memcpy(&new_cfg, &netif->cfg, sizeof(new_cfg)); + switch ( cmd ) + { + case NIOC_SETCONFIG: + if ( !ctx->copy_from_src(&new_cfg, ptr, sizeof(new_cfg)) ) 
+ return -1; + break; + case NIOC_SETCONFIG_ETHER: + if ( !ctx->copy_from_src(&new_cfg.ether, ptr, + sizeof(new_cfg.ether)) ) + return -1; + break; + case NIOC_SETCONFIG_INET: + if ( !ctx->copy_from_src(&new_cfg.inet, ptr, sizeof(new_cfg.inet)) ) + return -1; + break; + } + // Let the ARP cache know the configuration changed, so it can purge any + // entries that are no longer valid. + ARP::OnConfiguration(netif, &netif->cfg, &new_cfg); + memcpy(&netif->cfg, &new_cfg, sizeof(new_cfg)); + kthread_cond_broadcast(&netif->cfg_cond); + return 0; + } + + ScopedLock lock(&netif->cfg_lock); + switch ( cmd ) + { + case IOCGETTYPE: + return IOC_MAKE_TYPE(IOC_TYPE_NETWORK_INTERFACE, 0); + case NIOC_GETINFO: + if ( !ctx->copy_to_dest(ptr, &netif->ifinfo, sizeof(netif->ifinfo)) ) + return -1; + return 0; + case NIOC_GETSTATUS: + if ( !ctx->copy_to_dest(ptr, &netif->ifstatus, + sizeof(netif->ifstatus)) ) + return -1; + return 0; + case NIOC_GETCONFIG: + if ( !ctx->copy_to_dest(ptr, &netif->cfg, sizeof(netif->cfg)) ) + return -1; + return 0; + case NIOC_GETCONFIG_ETHER: + if ( !ctx->copy_to_dest(ptr, &netif->cfg.ether, + sizeof(netif->cfg.ether)) ) + return -1; + return 0; + case NIOC_GETCONFIG_INET: + if ( !ctx->copy_to_dest(ptr, &netif->cfg.inet, + sizeof(netif->cfg.inet)) ) + return -1; + return 0; + default: + return errno = ENOTTY, -1; + } +} + +int NetworkInterfaceNode::poll(ioctx_t* ctx, PollNode* node) +{ + return netif->poll(ctx, node); +} + +} // namespace Sortix diff --git a/kernel/net/ip.cpp b/kernel/net/ip.cpp new file mode 100644 index 00000000..9296fd99 --- /dev/null +++ b/kernel/net/ip.cpp @@ -0,0 +1,453 @@ +/* + * Copyright (c) 2016, 2017, 2018 Jonas 'Sortie' Termansen. + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + * + * net/ip.cpp + * Internet Protocol Version 4. + */ + +#include + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include "arp.h" +#include "ether.h" +#include "ping.h" +#include "tcp.h" +#include "udp.h" + +namespace Sortix { +namespace IP { + +struct ipv4 +{ + uint8_t version_ihl; + uint8_t dscp_ecn; + uint16_t length; + uint16_t identification; + uint16_t fragment; + uint8_t ttl; + uint8_t protocol; + uint16_t checksum; + uint8_t source[4]; + uint8_t destination[4]; +}; + +#define IPV4_IHL(x) ((x) >> 0 & 0xF) +#define IPV4_VERSION(x) ((x) >> 4 & 0xF) +#define IPV4_IHL_MAKE(x) (((x) & 0xF) << 0) +#define IPV4_VERSION_MAKE(x) (((x) & 0xF) << 4) +#define IPV4_FRAGMENT(x) (((x) >> 0) & 0x1FFF) +#define IPV4_FRAGMENT_MAKE(x) (((x) & 0x1FFF) << 0) +#define IPV4_FRAGMENT_MORE (1 << (13 + 0)) +#define IPV4_FRAGMENT_DONT (1 << (13 + 1)) +#define IPV4_FRAGMENT_EVIL (1 << (13 + 2)) + +uint16_t ipsum_word(uint16_t sum, uint16_t word) +{ + uint32_t result = sum + word; + if ( result & 0x10000 ) + return (result + 1) & 0xFFFF; + return result; +} + +uint16_t ipsum_buf(uint16_t sum, const void* bufptr, size_t size) +{ + const uint8_t* buf = (const uint8_t*) bufptr; + for ( size_t i = 0; i < (size & ~1UL); i += 2 ) + sum = ipsum_word(sum, buf[i] << 8 | buf[i + 1]); + // Odd sizes only work correctly if this is the final byte being summed. 
+ if ( size & 1 ) + sum = ipsum_word(sum, buf[size - 1] << 8); + return sum; +} + +uint16_t ipsum_finish(uint16_t sum) +{ + return ~sum; +} + +uint16_t ipsum(const void* bufptr, size_t size) +{ + uint16_t sum = ipsum_buf(0, bufptr, size); + return ipsum_finish(sum); +} + +static NetworkInterface* LocateInterface(const struct in_addr* src, + const struct in_addr* dst, + unsigned int ifindex) +{ + ScopedLock ifs_lock(&netifs_lock); + in_addr_t any_ip = htobe32(INADDR_ANY); + in_addr_t broadcast_ip = htobe32(INADDR_BROADCAST); + + // Refuse to route to the any address. + if ( !memcmp(&any_ip, dst, sizeof(in_addr_t)) ) + return errno = ENETUNREACH, (NetworkInterface*) NULL; + // If src is set, but ifindex is not set, search for a fitting interface. + if ( !ifindex && memcmp(&any_ip, src, sizeof(in_addr_t)) != 0 ) + { + for ( unsigned int i = 1; i < netifs_count; i++ ) + { + NetworkInterface* netif = netifs[i]; + if ( !netif ) + continue; + ScopedLock cfg_lock(&netif->cfg_lock); + if ( memcmp(&netif->cfg.inet.address, src, sizeof(in_addr_t)) != 0 ) + continue; + ifindex = i; + break; + } + // No interface had the correct address. + if ( !ifindex ) + return errno = EADDRNOTAVAIL, (NetworkInterface*) NULL; + } + // If ifindex is set, route to that interface. + if ( ifindex ) + { + // Can't route to non-existent interface. + if ( netifs_count <= ifindex ) + return errno = EADDRNOTAVAIL, (NetworkInterface*) NULL; + NetworkInterface* netif = netifs[ifindex]; + if ( !netif ) + return errno = EADDRNOTAVAIL, (NetworkInterface*) NULL; + ScopedLock cfg_lock(&netif->cfg_lock); + // Can't route to down interfaces. + if ( !(netif->ifstatus.flags & IF_STATUS_FLAGS_UP) ) + return errno = ENETDOWN, (NetworkInterface*) NULL; + // If src is set, it must be the interface's address. 
+ if ( memcmp(src, &any_ip, sizeof(in_addr_t)) != 0 && + memcmp(src, &netif->cfg.inet.address, sizeof(in_addr_t)) != 0 ) + return errno = EADDRNOTAVAIL, (NetworkInterface*) NULL; + in_addr_t dstaddr = be32toh(dst->s_addr); + in_addr_t ifaddr = be32toh(netif->cfg.inet.address.s_addr); + in_addr_t subnet = be32toh(netif->cfg.inet.subnet.s_addr); + in_addr_t loopaddr = INADDR_LOOPBACK; + in_addr_t loopmask = INADDR_LOOPMASK; + if ( netif->ifinfo.type == IF_TYPE_LOOPBACK ) + { + // The destination must be on the interface's subnet. + if ( (dstaddr & subnet) != (ifaddr & subnet) ) + return errno = ENETUNREACH, (NetworkInterface*) NULL; + return netif; + } + else + { + // The destination must not be on the loopback network for + // a non-loopback interface. + if ( (dstaddr & loopmask) == (loopaddr & loopmask) ) + return errno = ENETUNREACH, (NetworkInterface*) NULL; + // If the interface does not have a default route, the destination + // must be broadcast or be on the interface's subnet. + if ( !memcmp(&netif->cfg.inet.router, &any_ip, sizeof(in_addr_t)) && + memcmp(&dstaddr, &broadcast_ip, sizeof(in_addr_t)) != 0 && + (dstaddr & subnet) != (ifaddr & subnet) ) + return errno = ENETUNREACH, (NetworkInterface*) NULL; + return netif; + } + } + // If the destination is broadcast, send to the first fitting interface. + else if ( !memcmp(&broadcast_ip, dst, sizeof(in_addr_t)) ) + { + for ( unsigned int i = 1; i < netifs_count; i++ ) + { + NetworkInterface* netif = netifs[i]; + if ( !netif ) + continue; + ScopedLock cfg_lock(&netif->cfg_lock); + // Can't route broadcast to loopback interfaces or down interfaces. + if ( netif->ifinfo.type == IF_TYPE_LOOPBACK || + !(netif->ifstatus.flags & IF_STATUS_FLAGS_UP) ) + continue; + return netif; + } + // No interface was suitable for broadcast. + return errno = EADDRNOTAVAIL, (NetworkInterface*) NULL; + } + // Otherwise, pick the best interface for the destination address. 
+ else + { + NetworkInterface* default_netif = NULL; + for ( unsigned int i = 1; i < netifs_count; i++ ) + { + NetworkInterface* netif = netifs[i]; + if ( !netif ) + continue; + ScopedLock cfg_lock(&netif->cfg_lock); + in_addr_t dstaddr = be32toh(dst->s_addr); + in_addr_t ifaddr = be32toh(netif->cfg.inet.address.s_addr); + in_addr_t subnet = be32toh(netif->cfg.inet.subnet.s_addr); + // Route to the interface if the destination is on its subnet. + if ( (dstaddr & subnet) == (ifaddr & subnet) ) + { + // Can't route to down interfaces. + if ( !(netif->ifstatus.flags & IF_STATUS_FLAGS_UP) ) + return errno = ENETDOWN, (NetworkInterface*) NULL; + return netif; + } + // If the interface is up, no default route has been found yet, and + // the interface has a default route, default to that route if no + // better interface is found. + else if ( (netif->ifstatus.flags & IF_STATUS_FLAGS_UP) && + !default_netif && + memcmp(&any_ip, &netif->cfg.inet.router, + sizeof(in_addr_t)) != 0 ) + default_netif = netif; + } + // If a fitting default route was found, use it. + if ( default_netif ) + return default_netif; + // No interface was up that could accept the destination address, hence + // the network is down. + return errno = ENETDOWN, (NetworkInterface*) NULL; + } +} + +static bool ShouldHandlePacket(Ref pkt, + const struct in_addr* src, + const struct in_addr* dst, + bool dst_broadcast, + bool* out_broadcast) +{ + NetworkInterface* netif = pkt->netif; + ScopedLock cfg_lock(&netif->cfg_lock); + // The source address must not be broadcast (RFC 1122 3.2.1.3). + in_addr_t broadcast_ip = htobe32(INADDR_BROADCAST); + if ( !memcmp(src, &broadcast_ip, sizeof(in_addr_t)) ) + return false; + // The source address must not be the subnet's broadcast (RFC 1122 3.2.1.3). 
+ in_addr_t if_broadcast_ip = + netif->cfg.inet.address.s_addr | ~netif->cfg.inet.subnet.s_addr; + if ( !memcmp(&if_broadcast_ip, src, sizeof(in_addr_t)) ) + return false; + if ( netif->ifinfo.type != IF_TYPE_LOOPBACK ) + { + // 127.0.0.0/8 is only for loopback. + if ( (be32toh(src->s_addr) & INADDR_LOOPMASK) == + (INADDR_LOOPBACK & INADDR_LOOPMASK) || + (be32toh(dst->s_addr) & INADDR_LOOPMASK) == + (INADDR_LOOPBACK & INADDR_LOOPMASK) ) + return false; + } + // Receive packets sent to the broadcast address. + *out_broadcast = false; + if ( !memcmp(dst, &broadcast_ip, sizeof(broadcast_ip)) ) + return *out_broadcast = true, true; + in_addr_t any_ip = htobe32(INADDR_ANY); + // Only receive non-broadcast packets if the interface is configured. + if ( memcmp(&netif->cfg.inet.address, &any_ip, sizeof(in_addr_t)) != 0 ) + { + // Receive packets sent to our address. + if ( !dst_broadcast && + !memcmp(&netif->cfg.inet.address, dst, sizeof(in_addr_t)) ) + return true; + // Receive packets sent to the subnet's broadcast address. + if ( !memcmp(&if_broadcast_ip, dst, sizeof(in_addr_t)) ) + return *out_broadcast = true, true; + } + return false; +} + +void Handle(Ref pkt, + const struct ether_addr* /*src*/, + const struct ether_addr* /*dst*/, + bool dst_broadcast) +{ + struct ipv4 hdr; + size_t pkt_remain = pkt->length - pkt->offset; + // The packet has to be large enough to contain a header. + if ( pkt_remain < sizeof(hdr) ) + return; + memcpy(&hdr, pkt->from + pkt->offset, sizeof(hdr)); + // Verify the header's checksum is correct. + if ( ipsum(&hdr, sizeof(hdr)) != 0 ) + return; + hdr.length = be16toh(hdr.length); + hdr.identification = be16toh(hdr.identification); + hdr.fragment = be16toh(hdr.fragment); + hdr.checksum = be16toh(hdr.checksum); + // Verify the packet is Internet Protocol Version 4. 
+ if ( IPV4_VERSION(hdr.version_ihl) != 4 ) + return; + // Verify the relation: + // sizeof(hdr) <= ihl <= hdr.length <= pkt_remain + size_t ihl = 4 * IPV4_IHL(hdr.version_ihl); + // Verify the header length isn't smaller than the minimum header. + if ( ihl < sizeof(hdr) ) + return; + // Verify total length isn't smaller than the header length. + if ( hdr.length < ihl ) + return; + // Verify the packet length isn't smaller than the datagram. + if ( pkt_remain < hdr.length ) + return; + // Drop the packet if we shouldn't handle it. + bool in_dst_broadcast; + const struct in_addr* in_src = (const struct in_addr*) &hdr.source; + const struct in_addr* in_dst = (const struct in_addr*) &hdr.destination; + if ( !ShouldHandlePacket(pkt, in_src, in_dst, dst_broadcast, + &in_dst_broadcast) ) + return; + // TODO: IP options. + // TODO: Reassemble fragmented packets. + if ( IPV4_FRAGMENT(hdr.fragment) ) + return; + if ( hdr.fragment & IPV4_FRAGMENT_MORE ) + return; + // Trim the packet to the length according to the header, in case the packet + // was smaller than the link layer protocol's minimum transmission unit and + // the packet was padded by zeroes. 
+	size_t truncated_length = pkt->offset + hdr.length;
+	if ( pkt->length < truncated_length )
+		return;
+	pkt->length = truncated_length;
+	pkt->offset += ihl;
+	if ( hdr.protocol == IPPROTO_ICMP )
+		Ping::HandleIP(pkt, in_src, in_dst, in_dst_broadcast);
+	else if ( hdr.protocol == IPPROTO_TCP )
+		TCP::HandleIP(pkt, in_src, in_dst, in_dst_broadcast);
+	else if ( hdr.protocol == IPPROTO_UDP )
+		UDP::HandleIP(pkt, in_src, in_dst, in_dst_broadcast);
+}
+// Prepend an IPv4 header to pktin and transmit it on the routed interface.
+bool Send(Ref pktin,
+          const struct in_addr* src,
+          const struct in_addr* dst,
+          uint8_t protocol,
+          unsigned int ifindex,
+          bool broadcast)
+{
+	Ref pkt = GetPacket();
+	if ( !pkt )
+		return false;
+	size_t mtu = pkt->pmap.size;
+	if ( mtu < sizeof(struct ipv4) ||
+	     mtu - sizeof(struct ipv4) < pktin->length )
+		return errno = EMSGSIZE, false; // false, not -1: -1 converts to true.
+	pkt->length = sizeof(struct ipv4) + pktin->length;
+	unsigned char* in = pktin->from;
+	unsigned char* out = pkt->from;
+	struct ipv4 hdr;
+	hdr.version_ihl = IPV4_VERSION_MAKE(4) | IPV4_IHL_MAKE(5);
+	hdr.dscp_ecn = 0;
+	hdr.length = htobe16(pkt->length);
+	hdr.identification = htobe16(0); // TODO: Assign identification to packets.
+	hdr.fragment = htobe16(0);
+	hdr.ttl = 0x40; // TODO: This should be configurable.
+	hdr.protocol = protocol;
+	hdr.checksum = 0;
+	memcpy(hdr.source, src, sizeof(struct in_addr));
+	memcpy(hdr.destination, dst, sizeof(struct in_addr));
+	hdr.checksum = htobe16(ipsum(&hdr, sizeof(hdr)));
+	memcpy(out, &hdr, sizeof(hdr));
+	memcpy(out + sizeof(struct ipv4), in, pktin->length);
+
+	NetworkInterface* netif = LocateInterface(src, dst, ifindex);
+	if ( !netif )
+		return false;
+
+	if ( netif->ifinfo.type == IF_TYPE_LOOPBACK )
+	{
+		struct ether_addr localaddr;
+		memset(&localaddr, 0, sizeof(localaddr));
+		return Ether::Send(pkt, &localaddr, &localaddr, ETHERTYPE_IP, netif);
+	}
+
+	if ( netif->ifinfo.type != IF_TYPE_ETHERNET )
+		return errno = EAFNOSUPPORT, false;
+
+	kthread_mutex_lock(&netif->cfg_lock);
+	in_addr_t dst_ip = dst->s_addr;
+	in_addr_t address_ip = netif->cfg.inet.address.s_addr;
+	in_addr_t router_ip = netif->cfg.inet.router.s_addr;
+	in_addr_t subnet_ip = netif->cfg.inet.subnet.s_addr;
+	in_addr_t broadcast_ip =
+		netif->cfg.inet.address.s_addr | ~netif->cfg.inet.subnet.s_addr;
+	struct ether_addr ether_src = netif->cfg.ether.address;
+	kthread_mutex_unlock(&netif->cfg_lock);
+
+	struct in_addr route;
+	// Route directly to the destination if the destination is broadcast.
+	if ( dst_ip == htobe32(INADDR_BROADCAST) || dst_ip == broadcast_ip )
+		memcpy(&route, &dst_ip, sizeof(route));
+	// Route directly to the destination if the destination is on the subnet.
+	else if ( (dst_ip & subnet_ip) == (address_ip & subnet_ip) &&
+	          dst_ip != address_ip )
+		memcpy(&route, dst, sizeof(route));
+	// Route to the default route if any.
+	else if ( router_ip != htobe32(INADDR_ANY) )
+		memcpy(&route, &router_ip, sizeof(route));
+	// Otherwise the network is unreachable.
+	else
+		return errno = ENETUNREACH, false;
+
+	// If the destination is broadcast, send an ethernet broadcast.
+	if ( dst_ip == htobe32(INADDR_BROADCAST) || dst_ip == broadcast_ip )
+	{
+		if ( !broadcast )
+			return errno = EACCES, false;
+		return Ether::Send(pkt, &ether_src, &etheraddr_broadcast, ETHERTYPE_IP,
+		                   netif);
+	}
+	return ARP::RouteIPEthernet(netif, pkt, &route);
+}
+// Find the interface for src/dst; report its address and IPv4 payload MTU.
+bool GetSourceIP(const struct in_addr* src,
+                 const struct in_addr* dst,
+                 struct in_addr* sendfrom,
+                 unsigned int ifindex,
+                 size_t* mtu)
+{
+	NetworkInterface* netif = LocateInterface(src, dst, ifindex);
+	if ( !netif )
+		return false;
+	ScopedLock cfg_lock(&netif->cfg_lock);
+	if ( sendfrom )
+		memcpy(sendfrom, &netif->cfg.inet.address, sizeof(struct in_addr));
+	if ( mtu )
+		*mtu = Ether::GetMTU(netif) - sizeof(struct ipv4);
+	return true;
+}
+// Create an AF_INET socket for the type and protocol, or set errno and fail.
+Ref Socket(int type, int protocol)
+{
+	switch ( type )
+	{
+	case SOCK_DGRAM:
+		if ( protocol == 0 || protocol == IPPROTO_UDP )
+			return UDP::Socket(AF_INET);
+		if ( protocol == IPPROTO_PING )
+			return Ping::Socket(AF_INET);
+		return errno = EPROTONOSUPPORT, Ref(NULL);
+	case SOCK_STREAM:
+		if ( protocol == 0 || protocol == IPPROTO_TCP )
+			return TCP::Socket(AF_INET);
+		return errno = EPROTONOSUPPORT, Ref(NULL);
+	default: return errno = EPROTOTYPE, Ref(NULL);
+	}
+}
+
+} // namespace IP
+} // namespace Sortix
diff --git a/kernel/net/ip.h b/kernel/net/ip.h
new file mode 100644
index 00000000..31e88f67
--- /dev/null
+++ b/kernel/net/ip.h
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) 2016, 2017 Jonas 'Sortie' Termansen.
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS.
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + * + * net/ip.h + * Internet Protocol Version 4. + */ + +#ifndef SORTIX_NET_IP_H +#define SORTIX_NET_IP_H + +#include +#include + +#include + +namespace Sortix { +namespace IP { + +uint16_t ipsum(const void* bufptr, size_t size); +uint16_t ipsum_word(uint16_t sum, uint16_t word); +uint16_t ipsum_buf(uint16_t sum, const void* bufptr, size_t size); +uint16_t ipsum_finish(uint16_t sum); +void Handle(Ref pkt, + const struct ether_addr* src, + const struct ether_addr* dst, + bool dst_broadcast); +bool Send(Ref pkt, + const struct in_addr* src, + const struct in_addr* dst, + uint8_t protocol, + unsigned int ifindex, + bool broadcast); +bool GetSourceIP(const struct in_addr* src, + const struct in_addr* dst, + struct in_addr* sendfrom, + unsigned int ifindex, + size_t* mtu = NULL); +Ref Socket(int type, int protocol); + +} // namespace IP +} // namespace Sortix + +#endif diff --git a/kernel/net/lo/lo.cpp b/kernel/net/lo/lo.cpp new file mode 100644 index 00000000..3e958d2e --- /dev/null +++ b/kernel/net/lo/lo.cpp @@ -0,0 +1,154 @@ +/* + * Copyright (c) 2017 Jonas 'Sortie' Termansen. + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + * + * net/lo/lo.cpp + * Loopback device. + */ + +#include + +#include +#include + +#include +#include +#include +#include +#include + +#include "../ether.h" + +#include "lo.h" + +// The loopback device currently communicates through the Ethernet layer and +// pretends to do offload Ethernet checksumming as an optimization. + +// The shared worker thread is used for processing. Whenever a packet needs to +// be sent, if the worker thread isn't scheduled, it is scheduled. The worker +// thread transmits all the packets that were in the queue when it begins, but +// not any more than that. If any work remains at the end, it schedules itself +// again to run later (to avoid starving other tasks using the shared worker +// thread). The packet queue is a singly linked list of packets. 
+ +namespace Sortix { +namespace Loopback { + +class Loopback : public NetworkInterface +{ + friend void Loopback__Recv(void* ctx); + +public: + Loopback(); + ~Loopback(); + +public: + bool Send(Ref pkt); + +private: + void Recv(); + +private: + kthread_mutex_t socket_lock; + Ref first_packet; + Ref last_packet; + bool worker_scheduled; + +}; + +Loopback::Loopback() +{ + ifinfo.type = IF_TYPE_LOOPBACK; + ifinfo.features = IF_FEATURE_ETHERNET_CRC_OFFLOAD; + ifinfo.addrlen = 0; + ifstatus.flags = IF_STATUS_FLAGS_UP; + cfg.inet.address.s_addr = htobe32(INADDR_LOOPBACK); + cfg.inet.router.s_addr = htobe32(INADDR_ANY); + cfg.inet.subnet.s_addr = htobe32(INADDR_LOOPMASK); + socket_lock = KTHREAD_MUTEX_INITIALIZER; + // first_packet initialized by constructor + // last_packet initialized by constructor + worker_scheduled = false; +} + +Loopback::~Loopback() +{ + // Avoid stack overflow in first_packet recursive destructor. + while ( first_packet ) + { + Ref next = first_packet->next; + first_packet->next.Reset(); + first_packet = next; + } + last_packet.Reset(); +} + +void Loopback__Recv(void* ctx) +{ + ((Loopback*) ctx)->Recv(); +} + +void Loopback::Recv() +{ + kthread_mutex_lock(&socket_lock); + Ref next_packet = first_packet; + first_packet.Reset(); + last_packet.Reset(); + kthread_mutex_unlock(&socket_lock); + while ( next_packet ) + { + Ref packet = next_packet; + next_packet = next_packet->next; + packet->next.Reset(); + packet->netif = this; + Ether::Handle(packet, true); + } + kthread_mutex_lock(&socket_lock); + bool should_schedule = first_packet; + if ( !should_schedule ) + worker_scheduled = false; + kthread_mutex_unlock(&socket_lock); + if ( should_schedule ) + Worker::Schedule(Loopback__Recv, this); +} + +bool Loopback::Send(Ref pkt) +{ + kthread_mutex_lock(&socket_lock); + if ( last_packet ) + last_packet->next = pkt; + else + first_packet = pkt; + last_packet = pkt; + bool should_schedule = !worker_scheduled; + worker_scheduled = true; + 
kthread_mutex_unlock(&socket_lock); + if ( should_schedule ) + Worker::Schedule(Loopback__Recv, this); + return true; +} + +void Init(const char* /*devpath*/, Ref dev) +{ + Loopback* lo = new Loopback(); + if ( !lo ) + PanicF("Failed to allocate loopback device"); + size_t index = 0; + snprintf(lo->ifinfo.name, sizeof(lo->ifinfo.name), "lo%zu", index); + if ( !RegisterNetworkInterface(lo, dev) ) + PanicF("Failed to register %s as network interface", lo->ifinfo.name); +} + +} // namespace Loopback +} // namespace Sortix diff --git a/kernel/net/lo/lo.h b/kernel/net/lo/lo.h new file mode 100644 index 00000000..cc6b925b --- /dev/null +++ b/kernel/net/lo/lo.h @@ -0,0 +1,33 @@ +/* + * Copyright (c) 2017 Jonas 'Sortie' Termansen. + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + * + * net/lo/lo.h + * Loopback device. + */ + +#ifndef SORTIX_NET_LO_LO_H +#define SORTIX_NET_LO_LO_H + +#include + +namespace Sortix { +namespace Loopback { + +void Init(const char* devpath, Ref dev); + +} // namespace Loopback +} // namespace Sortix + +#endif diff --git a/kernel/net/packet.cpp b/kernel/net/packet.cpp new file mode 100644 index 00000000..a1697def --- /dev/null +++ b/kernel/net/packet.cpp @@ -0,0 +1,104 @@ +/* + * Copyright (c) 2015 Meisaka Yukara. 
+ * Copyright (c) 2016, 2017 Jonas 'Sortie' Termansen. + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + * + * net/packet.cpp + * Reference counted network packets. + */ + +#include +#include + +#include +#include +#include +#include +#include + +namespace Sortix { + +// Limit the memory usage of network packets to this fraction of total memory. +// This constant is documented in if(4) under NOTES. +static const size_t MAX_PACKET_FRACTION = 16; + +// Keep this amount of virtually mapped buffers around at all times. +// This constant is documented in if(4) under NOTES. +static const size_t PACKET_CACHE_TARGET_SIZE = 384; + +// A cache of physical memory allocations for quick allocation to new packets. +static kthread_mutex_t packet_cache_lock = KTHREAD_MUTEX_INITIALIZER; +static paddrmapped_t* packet_cache = NULL; +static size_t packet_cache_used = 0; +static size_t packet_cache_allocated = 0; +static size_t packet_count = 0; + +Packet::Packet(paddrmapped_t _pmap) : pmap(_pmap) +{ + from = (unsigned char*) pmap.from; + length = 0; + offset = 0; + netif = NULL; + packet_count++; +} + +Packet::~Packet() +{ + // Refuse to do recursive destructor calls that could stack overflow. 
+ assert(!next); + ScopedLock lock(&packet_cache_lock); + if ( packet_cache_used < packet_cache_allocated ) + packet_cache[packet_cache_used++] = pmap; + else + FreeAllocatedAndMappedPage(&pmap); + packet_count--; +} + +Ref GetPacket() +{ + ScopedLock lock(&packet_cache_lock); + if ( packet_cache == NULL ) + { + size_t new_allocated = PACKET_CACHE_TARGET_SIZE; + packet_cache = new paddrmapped_t[new_allocated]; + if ( !packet_cache ) + return errno = ENOBUFS, Ref(NULL); + packet_cache_allocated = new_allocated; + } + paddrmapped_t pmap; + // Fast reuse of an existing physical allocation if available. + if ( 0 < packet_cache_used ) + pmap = packet_cache[--packet_cache_used]; + // Otherwise make a new physical allocation for the packet. + else + { + size_t total_memory; + Memory::Statistics(NULL, &total_memory); + size_t total_pages = total_memory / Page::Size(); + size_t max_packets = total_pages / MAX_PACKET_FRACTION; + if ( max_packets <= packet_count ) + return errno = ENOBUFS, Ref(NULL); + if ( !AllocateAndMapPage(&pmap, PAGE_USAGE_NETWORK_PACKET) ) + return errno = ENOBUFS, Ref(NULL); + } + Ref pkt = Ref(new Packet(pmap)); + if ( !pkt ) + { + FreeAllocatedAndMappedPage(&pmap); + return errno = ENOBUFS, Ref(NULL); + } + return pkt; +} + +} // namespace Sortix diff --git a/kernel/net/ping.cpp b/kernel/net/ping.cpp new file mode 100644 index 00000000..7b2cddfd --- /dev/null +++ b/kernel/net/ping.cpp @@ -0,0 +1,1271 @@ +/* + * Copyright (c) 2016, 2017, 2018, 2022 Jonas 'Sortie' Termansen. + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + * + * net/ping.cpp + * Internet Control Message Protocol Echo. + */ + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifndef IOV_MAX +#include +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ip.h" +#include "ping.h" + +namespace Sortix { +namespace Ping { + +class PingSocket; + +struct icmp +{ + uint8_t type; + uint8_t code; + uint16_t checksum; + uint8_t roh[4]; +}; + +struct icmp_echo +{ + uint8_t type; + uint8_t code; + uint16_t checksum; + uint16_t identifier; + uint16_t sequence; +}; + +struct ping +{ + uint8_t type; + uint8_t code; + uint16_t checksum; + uint16_t identifier; +}; + +union ping_sockaddr +{ + sa_family_t family; + struct sockaddr_in in; + struct sockaddr_in6 in6; +}; + +#define ICMP_TYPE_ECHO_REPLY 0 +#define ICMP_TYPE_ECHO_REQUEST 8 + +// These values are documented in ping(4). 
+static const size_t DEFAULT_PACKET_LIMIT = 64; +static const size_t MAXIMAL_PACKET_LIMIT = 4096; + +static kthread_mutex_t bind_lock = KTHREAD_MUTEX_INITIALIZER; +static PingSocket** bindings_v4; +static PingSocket** bindings_v6; + +void Init() +{ + if ( !(bindings_v4 = new PingSocket*[65536]) || + !(bindings_v6 = new PingSocket*[65536]) ) + Panic("Failed to allocate Ping Socket bindings"); + for ( size_t i = 0; i < 65536; i++ ) + { + bindings_v4[i] = NULL; + bindings_v6[i] = NULL; + } +} + +static bool IsSupportedAddressFamily(int af) +{ + return af == AF_INET /* TODO: || af == AF_INET6 */; +} + +static size_t AddressFamilySize(int af) +{ + switch ( af ) + { + case AF_INET: return sizeof(struct sockaddr_in); + case AF_INET6: return sizeof(struct sockaddr_in6); + } + return 0; +} + +class PingSocket : public AbstractInode +{ + friend void HandleIP(Ref pkt, + const struct in_addr* src, + const struct in_addr* dst, + bool dst_broadcast); + +public: + PingSocket(int af); + virtual ~PingSocket(); + virtual Ref accept4(ioctx_t* ctx, uint8_t* addr, size_t* addrsize, + int flags); + virtual int bind(ioctx_t* ctx, const uint8_t* addr, size_t addrsize); + virtual int connect(ioctx_t* ctx, const uint8_t* addr, size_t addrsize); + virtual int listen(ioctx_t* ctx, int backlog); + virtual ssize_t readv(ioctx_t* ctx, const struct iovec* iov, int iovcnt); + virtual ssize_t recv(ioctx_t* ctx, uint8_t* buf, size_t count, int flags); + virtual ssize_t recvmsg(ioctx_t* ctx, struct msghdr* msg, int flags); + virtual ssize_t recvmsg_internal(ioctx_t* ctx, struct msghdr* msg, + int flags); + virtual ssize_t send(ioctx_t* ctx, const uint8_t* buf, size_t count, + int flags); + virtual ssize_t sendmsg(ioctx_t* ctx, const struct msghdr* msg, int flags); + virtual ssize_t sendmsg_internal(ioctx_t* ctx, const struct msghdr* msg, + int flags); + virtual ssize_t writev(ioctx_t* ctx, const struct iovec* iov, int iovcnt); + virtual int poll(ioctx_t* ctx, PollNode* node); + virtual int 
getsockopt(ioctx_t* ctx, int level, int option_name, + void* option_value, size_t* option_size_ptr); + virtual int setsockopt(ioctx_t* ctx, int level, int option_name, + const void* option_value, size_t option_size); + virtual int shutdown(ioctx_t* ctx, int how); + virtual int getpeername(ioctx_t* ctx, uint8_t* addr, size_t* addrsize); + virtual int getsockname(ioctx_t* ctx, uint8_t* addr, size_t* addrsize); + +public: + void ReceivePacket(Ref pkt); + +private: + short PollEventStatus(); + bool ImportAddress(ioctx_t* ctx, union ping_sockaddr* dest, + const void* addr, size_t addrsize); + bool CanBind(union ping_sockaddr new_local); + bool BindDefault(const union ping_sockaddr* new_local); + +private: + kthread_mutex_t socket_lock; + kthread_cond_t receive_cond; + PollChannel poll_channel; + union ping_sockaddr local; + union ping_sockaddr remote; + Ref first_packet; + Ref last_packet; + PingSocket* prev_socket; + PingSocket* next_socket; + size_t receive_current; + size_t receive_limit; + size_t send_limit; + unsigned int ifindex; + int af; + int sockerr; + int how_shutdown; + bool bound; + bool broadcast; + bool connected; + bool reuseaddr; + +}; + +PingSocket::PingSocket(int af) +{ + Process* process = CurrentProcess(); + inode_type = INODE_TYPE_STREAM; + dev = (dev_t) this; + ino = (ino_t) this; + type = S_IFSOCK; + kthread_mutex_lock(&process->idlock); + stat_uid = process->uid; + stat_gid = process->gid; + kthread_mutex_unlock(&process->idlock); + stat_mode = 0600 | this->type; + supports_iovec = true; + socket_lock = KTHREAD_MUTEX_INITIALIZER; + receive_cond = KTHREAD_COND_INITIALIZER; + // poll_channel initialized by constructor + memset(&local, 0, sizeof(local)); + memset(&remote, 0, sizeof(remote)); + if ( af == AF_INET ) + { + local.in.sin_family = AF_INET; + local.in.sin_addr.s_addr = htobe32(INADDR_ANY); + local.in.sin_port = htobe16(0); + remote.in.sin_family = AF_INET; + remote.in.sin_addr.s_addr = htobe32(INADDR_ANY); + remote.in.sin_port = 
htobe16(0); + } + else if ( af == AF_INET6 ) + { + local.in6.sin6_family = AF_INET6; + local.in6.sin6_addr = in6addr_any; + local.in6.sin6_port = htobe16(0); + remote.in6.sin6_family = AF_INET6; + remote.in6.sin6_addr = in6addr_any; + remote.in6.sin6_port = htobe16(0); + } + // first_packet initialized by constructor + // last_packet initialized by constructor + prev_socket = NULL; + next_socket = NULL; + receive_current = 0; + receive_limit = DEFAULT_PACKET_LIMIT * Page::Size(); + send_limit = DEFAULT_PACKET_LIMIT * Page::Size(); + ifindex = 0; + this->af = af; + sockerr = 0; + how_shutdown = 0; + bound = false; + broadcast = false; + connected = false; + reuseaddr = false; +} + +PingSocket::~PingSocket() +{ + if ( bound ) + { + ScopedLock lock(&bind_lock); + if ( af == AF_INET ) + { + uint16_t port = be16toh(local.in.sin_port); + if ( prev_socket ) + prev_socket->next_socket = next_socket; + else + bindings_v4[port] = next_socket; + if ( next_socket ) + next_socket->prev_socket = prev_socket; + } + else if ( af == AF_INET6 ) + { + uint16_t port = be16toh(local.in6.sin6_port); + if ( prev_socket ) + prev_socket->next_socket = next_socket; + else + bindings_v6[port] = next_socket; + if ( next_socket ) + next_socket->prev_socket = prev_socket; + } + bound = false; + } + // Avoid stack overflow in first_packet recursive destructor. 
+ while ( first_packet ) + { + Ref next = first_packet->next; + first_packet->next.Reset(); + first_packet = next; + } + last_packet.Reset(); +} + +Ref PingSocket::accept4(ioctx_t* /*ctx*/, uint8_t* /*addr*/, + size_t* /*addrsize*/, int /*flags*/) +{ + return errno = EOPNOTSUPP, Ref(NULL); +} + +bool PingSocket::ImportAddress(ioctx_t* ctx, + union ping_sockaddr* dest, + const void* addr, + size_t addrsize) +{ + if ( addrsize != AddressFamilySize(af) ) + { + sa_family_t family; + if ( sizeof(family) <= addrsize && + ctx->copy_from_src(&family, addr, sizeof(family)) && + family == AF_UNSPEC ) + { + union ping_sockaddr unspec; + memset(&unspec, 0, sizeof(unspec)); + unspec.family = AF_UNSPEC; + memcpy(dest, &unspec, sizeof(unspec)); + return true; + } + return errno = EINVAL, false; + } + union ping_sockaddr copy; + memset(©, 0, sizeof(copy)); + if ( !ctx->copy_from_src(©, addr, addrsize) ) + return false; + if ( copy.family != af && copy.family != AF_UNSPEC ) + return errno = EAFNOSUPPORT, false; + memcpy(dest, ©, sizeof(copy)); + return true; +} + +// bind_lock locked, socket_lock locked (in that order) +bool PingSocket::CanBind(union ping_sockaddr new_local) +{ + if ( af == AF_INET ) + { + // Bind to either the any address, the broadcast address, the address of + // a network interface, or the broadcast address of a network interface. + if ( new_local.in.sin_addr.s_addr != htobe32(INADDR_ANY) && + new_local.in.sin_addr.s_addr != htobe32(INADDR_BROADCAST) ) + { + // TODO: What happens to sockets if the network interface changes + // its address? 
+ ScopedLock ifs_lock(&netifs_lock); + bool found = false; + for ( unsigned int i = 1; i < netifs_count; i++ ) + { + NetworkInterface* netif = netifs[i]; + if ( !netif ) + continue; + ScopedLock cfg_lock(&netif->cfg_lock); + struct in_addr if_broadcast_ip; + if_broadcast_ip.s_addr = netif->cfg.inet.address.s_addr | + ~netif->cfg.inet.subnet.s_addr; + if ( memcmp(&netif->cfg.inet.address, &new_local.in.sin_addr, + sizeof(struct in_addr)) == 0 || + memcmp(&if_broadcast_ip, &new_local.in.sin_addr, + sizeof(struct in_addr)) == 0 ) + { + found = true; + break; + } + } + // No interface had the correct address. + if ( !found ) + return errno = EADDRNOTAVAIL, false; + } + uint16_t port = be16toh(new_local.in.sin_port); + if ( port == 0 ) + return errno = EINVAL, false; + for ( PingSocket* socket = bindings_v4[port]; + socket; + socket = socket->next_socket ) + { + // Taking the lock of the other socket is safe against deadlocks, + // despite having the lock of this socket, because bind_lock was + // locked prior to this socket's lock, and bind_lock must always + // be taken before the same thread locks two sockets. + ScopedLock lock(&socket->socket_lock); + if ( new_local.in.sin_addr.s_addr == htobe32(INADDR_ANY) && + !(reuseaddr && socket->reuseaddr) ) + return errno = EADDRINUSE, false; + if ( socket->local.in.sin_addr.s_addr == htobe32(INADDR_ANY) && + !(reuseaddr && socket->reuseaddr) ) + return errno = EADDRINUSE, false; + if ( new_local.in.sin_addr.s_addr == + socket->local.in.sin_addr.s_addr ) + return errno = EADDRINUSE, false; + } + } + else if ( af == AF_INET6 ) + { + // TODO: IPv6 support for seeing if any interface has the address. 
+ if ( true ) + return errno = EAFNOSUPPORT, false; + uint16_t port = be16toh(new_local.in6.sin6_port); + if ( port == 0 ) + return errno = EINVAL, false; + for ( PingSocket* socket = bindings_v6[port]; + socket; + socket = socket->next_socket ) + { + if ( !memcmp(&new_local.in6.sin6_addr, &in6addr_any, + sizeof(in6addr_any)) && + !(reuseaddr && socket->reuseaddr) ) + if ( !memcmp(&socket->local.in6.sin6_addr, &in6addr_any, + sizeof(in6addr_any)) && + !(reuseaddr && socket->reuseaddr) ) + if ( !memcmp(&new_local.in6.sin6_addr, &socket->local.in6.sin6_addr, + sizeof(new_local.in6.sin6_addr)) ) + return errno = EADDRINUSE, false; + } + } + else + return errno = EAFNOSUPPORT, false; + return true; +} + +int PingSocket::bind(ioctx_t* ctx, const uint8_t* addr, size_t addrsize) +{ + ScopedLock lock2(&bind_lock); + ScopedLock lock(&socket_lock); + if ( bound ) + return errno = EINVAL, -1; + union ping_sockaddr new_local; + if ( !ImportAddress(ctx, &new_local, addr, addrsize) ) + return -1; + if ( new_local.family == AF_UNSPEC ) + return errno = EAFNOSUPPORT, -1; + uint16_t port; + if ( af == AF_INET ) + port = be16toh(new_local.in.sin_port); + else if ( af == AF_INET6 ) + port = be16toh(new_local.in6.sin6_port); + else + return errno = EAFNOSUPPORT, -1; + if ( port == 0 ) + return BindDefault(&new_local) ? 
0 : -1; + if ( !CanBind(new_local) ) + return -1; + if ( af == AF_INET ) + { + uint16_t port = be16toh(new_local.in.sin_port); + if ( bindings_v4[port] ) + bindings_v4[port]->prev_socket = this; + next_socket = bindings_v4[port]; + prev_socket = NULL; + bindings_v4[port] = this; + } + else if ( af == AF_INET6 ) + { + uint16_t port = be16toh(new_local.in6.sin6_port); + if ( bindings_v6[port] ) + bindings_v6[port]->prev_socket = this; + next_socket = bindings_v6[port]; + prev_socket = NULL; + bindings_v6[port] = this; + } + else + return errno = EAFNOSUPPORT, -1; + memcpy(&local, &new_local, sizeof(new_local)); + bound = true; + return 0; +} + +// bind_lock locked, socket_lock locked (in that order) +bool PingSocket::BindDefault(const union ping_sockaddr* new_local_ptr) +{ + // TODO: This allocator becomes increasingly biased as more ports are + // allocated. + // TODO: Try not to allocate recently used ports. + union ping_sockaddr new_local; + if ( new_local_ptr ) + memcpy(&new_local, new_local_ptr, sizeof(union ping_sockaddr)); + else + { + memset(&new_local, 0, sizeof(new_local)); + if ( af == AF_INET ) + { + new_local.in.sin_family = AF_INET; + new_local.in.sin_addr.s_addr = htobe32(INADDR_ANY); + } + else if ( af == AF_INET6 ) + { + new_local.in6.sin6_family = AF_INET6; + new_local.in6.sin6_addr = in6addr_any; + } + else + return errno = EAFNOSUPPORT, false; + } + uint16_t start = 32768; // Documented in ping(4). + uint16_t end = 61000; // Documented in ping(4). 
+ uint16_t count = end - start; + uint16_t offset = arc4random_uniform(count); + for ( uint16_t i = 0; i < count; i++ ) + { + uint16_t j = offset + i; + if ( count <= j ) + j -= count; + uint16_t port = start + j; + if ( af == AF_INET ) + new_local.in.sin_port = htobe16(port); + else if ( af == AF_INET6 ) + new_local.in6.sin6_port = htobe16(port); + else + return errno = EAFNOSUPPORT, false; + if ( !CanBind(new_local) ) + { + if ( errno == EADDRINUSE ) + continue; + return false; + } + if ( af == AF_INET ) + { + if ( bindings_v4[port] ) + bindings_v4[port]->prev_socket = this; + next_socket = bindings_v4[port]; + prev_socket = NULL; + bindings_v4[port] = this; + } + else if ( af == AF_INET6 ) + { + if ( bindings_v6[port] ) + bindings_v6[port]->prev_socket = this; + next_socket = bindings_v6[port]; + prev_socket = NULL; + bindings_v6[port] = this; + } + else + return errno = EAFNOSUPPORT, false; + memcpy(&local, &new_local, sizeof(new_local)); + bound = true; + return true; + } + return errno = EAGAIN, false; +} + +int PingSocket::connect(ioctx_t* ctx, const uint8_t* addr, size_t addrsize) +{ + ScopedLock lock2(&bind_lock); + ScopedLock lock(&socket_lock); + union ping_sockaddr new_remote; + if ( !ImportAddress(ctx, &new_remote, addr, addrsize) ) + return -1; + if ( new_remote.family == AF_UNSPEC ) + { + // Disconnect the socket when connecting to the AF_UNSPEC family. + connected = false; + return 0; + } + else if ( af == AF_INET ) + { + } + else + return errno = EAFNOSUPPORT, -1; + // If the socket is not bound, find a route to the remote address and bind + // to the appropriate source address. 
+ if ( !bound ) + { + union ping_sockaddr new_local; + memset(&new_local, 0, sizeof(new_local)); + if ( af == AF_INET ) + { + struct in_addr any; + any.s_addr = htobe32(INADDR_ANY); + new_local.in.sin_family = AF_INET; + if ( !IP::GetSourceIP(&any, &new_remote.in.sin_addr, + &new_local.in.sin_addr, ifindex, NULL) ) + return -1; + new_local.in.sin_port = htobe16(0); + } + else + return errno = EAFNOSUPPORT, -1; + if ( !BindDefault(&new_local) ) + return -1; + } + // Test if there is a route from the local address to the remote address. + if ( af == AF_INET ) + { + if ( !IP::GetSourceIP(&local.in.sin_addr, &new_remote.in.sin_addr, NULL, + ifindex, NULL) ) + return -1; + } + else + return errno = EAFNOSUPPORT, -1; + // Set the remote address and become connected. + connected = true; + memcpy(&remote, &new_remote, sizeof(new_remote)); + // Discard datagrams not from the new remote, thus enforcing that all + // datagrams provided by recvmsg always comes from the address connected to. + size_t name_size = AddressFamilySize(af); + Ref* packet_ptr = &first_packet; + while ( *packet_ptr ) + { + void* name = first_packet->from + first_packet->offset; + if ( memcmp(name, &remote, name_size) != 0 ) + { + Ref next = (*packet_ptr)->next; + (*packet_ptr)->next.Reset(); + packet_ptr->Reset(); + *packet_ptr = next; + continue; + } + packet_ptr = &(*packet_ptr)->next; + } + if ( !first_packet ) + last_packet.Reset(); + return 0; +} + +int PingSocket::listen(ioctx_t* /*ctx*/, int /*backlog*/) +{ + return errno = EOPNOTSUPP, -1; +} + +ssize_t PingSocket::readv(ioctx_t* ctx, const struct iovec* iov, int iovcnt) +{ + struct msghdr msg; + memset(&msg, 0, sizeof(msg)); + msg.msg_iov = (struct iovec*) iov; + msg.msg_iovlen = iovcnt; + return recvmsg_internal(ctx, &msg, 0); +} + +ssize_t PingSocket::recv(ioctx_t* ctx, uint8_t* buf, size_t count, int flags) +{ + struct iovec iov; + memset(&iov, 0, sizeof(iov)); + iov.iov_base = (void*) buf; + iov.iov_len = count; + struct msghdr msg; + 
memset(&msg, 0, sizeof(msg)); + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + return recvmsg_internal(ctx, &msg, flags); +} + +ssize_t PingSocket::recvmsg(ioctx_t* ctx, struct msghdr* msg_ptr, int flags) +{ + struct msghdr msg; + if ( !ctx->copy_from_src(&msg, msg_ptr, sizeof(msg)) ) + return -1; + if ( msg.msg_iovlen < 0 || IOV_MAX < msg.msg_iovlen ) + return errno = EINVAL, -1; + size_t iov_size = msg.msg_iovlen * sizeof(struct iovec); + struct iovec* iov = new struct iovec[msg.msg_iovlen]; + if ( !iov ) + return -1; + struct iovec* user_iov = msg.msg_iov; + if ( !ctx->copy_from_src(iov, user_iov, iov_size) ) + return delete[] iov, -1; + msg.msg_iov = iov; + ssize_t result = recvmsg_internal(ctx, &msg, flags); + msg.msg_iov = user_iov; + delete[] iov; + if ( !ctx->copy_to_dest(msg_ptr, &msg, sizeof(msg)) ) + return -1; + return result; +} + +ssize_t PingSocket::recvmsg_internal(ioctx_t* ctx, + struct msghdr* msg, + int flags) +{ + if ( flags & ~(MSG_PEEK) ) + return errno = EINVAL, -1; + ScopedLock lock(&socket_lock); + if ( sockerr ) + { + errno = sockerr; + sockerr = 0; + return -1; + } + if ( how_shutdown & SHUT_RD ) + return 0; + while ( !first_packet ) + { + if ( ctx->dflags & O_NONBLOCK ) + return errno = EWOULDBLOCK, -1; + if ( !kthread_cond_wait_signal(&receive_cond, &socket_lock) ) + return errno = EINTR, -1; + } + void* name = first_packet->from + first_packet->offset; + size_t name_size = AddressFamilySize(af); + assert(name_size <= first_packet->length - first_packet->offset); + if ( msg->msg_name ) + { + if ( name_size < msg->msg_namelen ) + msg->msg_namelen = name_size; + if ( !ctx->copy_to_dest(msg->msg_name, name, msg->msg_namelen) ) + return -1; + } + else + msg->msg_namelen = 0; + first_packet->offset += name_size; + const unsigned char* in = first_packet->from + first_packet->offset; + size_t in_length = first_packet->length - first_packet->offset; + msg->msg_controllen = 0; + msg->msg_flags = 0; + if ( SSIZE_MAX < TruncateIOVec(msg->msg_iov, 
msg->msg_iovlen, SSIZE_MAX) ) + return errno = EINVAL, -1; + size_t sofar = 0; + for ( int i = 0; i < msg->msg_iovlen && sofar < in_length; i++) + { + size_t in_left = in_length - sofar; + const struct iovec* iov = &msg->msg_iov[i]; + size_t count = in_left < iov->iov_len ? in_left : iov->iov_len; + if ( !ctx->copy_to_dest(iov->iov_base, in + sofar, count) ) + return -1; + sofar += count; + } + if ( sofar < in_length ) + msg->msg_flags |= MSG_TRUNC; + if ( !(flags & MSG_PEEK) ) + { + receive_current -= first_packet->pmap.size; + Ref next = first_packet->next; + first_packet->next.Reset(); + first_packet = next; + if ( !first_packet ) + last_packet.Reset(); + } + return sofar; +} + +ssize_t PingSocket::send(ioctx_t* ctx, + const uint8_t* buf, + size_t count, + int flags) +{ + struct iovec iov; + memset(&iov, 0, sizeof(iov)); + iov.iov_base = (void*) buf; + iov.iov_len = count; + struct msghdr msg; + memset(&msg, 0, sizeof(msg)); + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + return sendmsg_internal(ctx, &msg, flags); +} + +ssize_t PingSocket::sendmsg(ioctx_t* ctx, + const struct msghdr* msg_ptr, + int flags) +{ + struct msghdr msg; + if ( !ctx->copy_from_src(&msg, msg_ptr, sizeof(msg)) ) + return -1; + if ( msg.msg_iovlen < 0 || IOV_MAX < msg.msg_iovlen ) + return errno = EINVAL, -1; + size_t iov_size = msg.msg_iovlen * sizeof(struct iovec); + struct iovec* iov = new struct iovec[msg.msg_iovlen]; + if ( !iov ) + return -1; + if ( !ctx->copy_from_src(iov, msg.msg_iov, iov_size) ) + return delete[] iov, -1; + msg.msg_iov = iov; + ssize_t result = sendmsg_internal(ctx, &msg, flags); + delete[] iov; + return result; +} + +ssize_t PingSocket::sendmsg_internal(ioctx_t* ctx, + const struct msghdr* msg, + int flags) +{ + if ( flags & ~(MSG_NOSIGNAL) ) // TODO: MSG_DONTROUTE + return errno = EINVAL, -1; + ScopedLock lock(&socket_lock); + if ( how_shutdown & SHUT_WR ) + { + if ( !(flags & MSG_NOSIGNAL) ) + CurrentThread()->DeliverSignal(SIGPIPE); + return errno = EPIPE, -1; + 
} + if ( sockerr ) + { + errno = sockerr; + sockerr = 0; + return -1; + } + union ping_sockaddr sendto; + if ( msg->msg_name ) + { + if ( connected ) + return errno = EISCONN, -1; + if ( af == AF_INET ) + { + if ( msg->msg_namelen != sizeof(sendto.in) ) + return errno = EINVAL, -1; + sendto.family = af; + if ( !ctx->copy_from_src(&sendto.in, msg->msg_name, + sizeof(sendto.in)) ) + return -1; + } + // TODO: IPv6 support. + else + return errno = EAFNOSUPPORT, -1; + } + else if ( connected ) + sendto = remote; + else + return errno = EDESTADDRREQ, -1; + if ( !bound ) + { + kthread_mutex_unlock(&socket_lock); // Don't deadlock. + kthread_mutex_lock(&bind_lock); + kthread_mutex_lock(&socket_lock); + bool was_bound = BindDefault(NULL); + kthread_mutex_unlock(&bind_lock); + if ( !was_bound ) + return -1; + } + // Find a route to the destination and verify the port is non-zero. + union ping_sockaddr sendfrom; + if ( af == AF_INET ) + { + if ( !IP::GetSourceIP(&local.in.sin_addr, &sendto.in.sin_addr, + &sendfrom.in.sin_addr, ifindex) ) + return -1; + } + // TODO: IPv6 support. 
+ else + return errno = EAFNOSUPPORT, -1; + Ref pkt = GetPacket(); + if ( !pkt ) + return -1; + size_t mtu = pkt->pmap.size; + if ( mtu < sizeof(struct ping) ) + return errno = EMSGSIZE, -1; + pkt->length = sizeof(struct ping); + unsigned char* out = pkt->from; + struct ping hdr; + hdr.type = ICMP_TYPE_ECHO_REQUEST; + hdr.code = 0; + hdr.checksum = 0; + if ( af == AF_INET ) + hdr.identifier = local.in.sin_port; + else if ( af == AF_INET6 ) + hdr.identifier = local.in6.sin6_port; + else + return errno = EAFNOSUPPORT, -1; + if ( SSIZE_MAX < TruncateIOVec(msg->msg_iov, msg->msg_iovlen, SSIZE_MAX) ) + return errno = EINVAL, -1; + size_t count = 0; + for ( int i = 0; i < msg->msg_iovlen; i++ ) + { + const struct iovec* iov = &msg->msg_iov[i]; + if ( mtu - pkt->length < iov->iov_len ) + return errno = EMSGSIZE, -1; + if ( !ctx->copy_from_src(out + pkt->length, iov->iov_base, + iov->iov_len) ) + return -1; + pkt->length += iov->iov_len; + count += iov->iov_len; + } + if ( count < 4 ) // Require sequence number. + return errno = EINVAL, -1; + memcpy(out, &hdr, sizeof(hdr)); + hdr.checksum = htobe16(IP::ipsum(out, pkt->length)); + memcpy(out, &hdr, sizeof(hdr)); + (void) flags; + if ( af == AF_INET ) + { + if ( !IP::Send(pkt, &sendfrom.in.sin_addr, &sendto.in.sin_addr, + IPPROTO_ICMP, ifindex, broadcast) ) + return -1; + } + // TODO: IPv6 support. 
+ else + return errno = EAFNOSUPPORT, -1; + return count; +} + +ssize_t PingSocket::writev(ioctx_t* ctx, const struct iovec* iov, int iovcnt) +{ + struct msghdr msg; + memset(&msg, 0, sizeof(msg)); + msg.msg_iov = (struct iovec*) iov; + msg.msg_iovlen = iovcnt; + return sendmsg_internal(ctx, &msg, 0); +} + +short PingSocket::PollEventStatus() +{ + short status = 0; + if ( first_packet || (how_shutdown & SHUT_RD) ) + status |= POLLIN | POLLRDNORM; + if ( !(how_shutdown & SHUT_WR) ) + status |= POLLOUT | POLLWRNORM; + else + status |= POLLHUP; + if ( sockerr ) + status |= POLLERR; + return status; +} + +int PingSocket::poll(ioctx_t* /*ctx*/, PollNode* node) +{ + ScopedLock lock(&socket_lock); + short ret_status = PollEventStatus() & node->events; + if ( ret_status ) + { + node->master->revents |= ret_status; + return 0; + } + poll_channel.Register(node); + return errno = EAGAIN, -1; +} + +int PingSocket::getsockopt(ioctx_t* ctx, int level, int option_name, + void* option_value, size_t* option_size_ptr) +{ + ScopedLock lock(&socket_lock); + + if ( level == SOL_SOCKET && option_name == SO_BINDTODEVICE ) + { + ScopedLock lock(&netifs_lock); + const char* ifname = ""; + if ( ifindex < netifs_count && netifs[ifindex] ) + ifname = netifs[ifindex]->ifinfo.name; + size_t option_size; + if ( !CopyFromUser(&option_size, option_size_ptr, sizeof(option_size)) ) + return -1; + size_t len = strlen(ifname); + size_t size = len + 1; + if ( option_size < size ) + return errno = ERANGE, -1; + if ( !CopyToUser(option_value, ifname, size) || + !CopyToUser(option_size_ptr, &size, sizeof(size)) ) + return -1; + return 0; + } + + uintmax_t result = 0; + + if ( level == IPPROTO_PING ) + { + switch ( option_name ) + { + default: return errno = ENOPROTOOPT, -1; + } + } + else if ( level == SOL_SOCKET ) + { + switch ( option_name ) + { + case SO_BINDTOINDEX: result = ifindex; break; + case SO_BROADCAST: result = broadcast; break; + case SO_DEBUG: result = 0; break; + case SO_DOMAIN: result = 
af; break; + case SO_DONTROUTE: result = 0; break; + case SO_ERROR: result = sockerr; sockerr = 0; break; + case SO_PROTOCOL: result = IPPROTO_PING; break; + case SO_RCVBUF: result = receive_limit; break; + case SO_REUSEADDR: result = reuseaddr; break; + case SO_SNDBUF: result = send_limit; break; + case SO_TYPE: result = SOCK_DGRAM; break; + default: return errno = ENOPROTOOPT, -1; + } + } + else + return errno = EINVAL, -1; + + if ( !sockopt_return_uintmax(result, ctx, option_value, option_size_ptr) ) + return -1; + + return 0; +} + +int PingSocket::setsockopt(ioctx_t* ctx, int level, int option_name, + const void* option_value, size_t option_size) +{ + ScopedLock lock(&socket_lock); + + if ( level == SOL_SOCKET && option_name == SO_BINDTODEVICE ) + { + char ifname[IF_NAMESIZE]; + if ( sizeof(ifname) < option_size ) + option_size = sizeof(ifname); + if ( !CopyFromUser(ifname, option_value, option_size) ) + return -1; + if ( strnlen(ifname, option_size) == sizeof(ifname) ) + return errno = ENODEV, -1; + ifname[option_size] = '\0'; + ScopedLock lock(&netifs_lock); + for ( size_t i = 1; i < netifs_count; i++ ) + { + if ( netifs[i] && !strcmp(ifname, netifs[i]->ifinfo.name) ) + { + ifindex = i; + return 0; + } + } + return errno = ENODEV, -1; + } + + uintmax_t value; + if ( !sockopt_fetch_uintmax(&value, ctx, option_value, option_size) ) + return -1; + + if ( level == IPPROTO_PING ) + { + switch ( option_name ) + { + default: return errno = ENOPROTOOPT, -1; + } + } + else if ( level == SOL_SOCKET ) + { + switch ( option_name ) + { + case SO_BINDTOINDEX: + if ( UINT_MAX < value ) + return errno = EINVAL, -1; + ifindex = value; + break; + case SO_BROADCAST: broadcast = value; break; + case SO_DEBUG: + if ( value != 0 ) + return errno = EPERM, -1; + break; + case SO_DONTROUTE: + if ( value != 0 ) + return errno = EPERM, -1; + break; + case SO_RCVBUF: + { + size_t hard_limit = MAXIMAL_PACKET_LIMIT * Page::Size(); + if ( hard_limit < value ) + value = hard_limit; + 
receive_limit = value; + // Shrink the receive queue until it fits. + while ( first_packet && receive_limit < receive_current ) + { + Ref packet = first_packet; + first_packet->next.Reset(); + first_packet = first_packet->next; + receive_current -= packet->pmap.size; + } + if ( !first_packet ) + last_packet.Reset(); + break; + } + case SO_REUSEADDR: reuseaddr = value; break; + case SO_SNDBUF: + { + size_t hard_limit = MAXIMAL_PACKET_LIMIT * Page::Size(); + if ( hard_limit < value ) + value = hard_limit; + // TODO: This value is unused. + send_limit = value; + break; + } + default: return errno = ENOPROTOOPT, -1; + } + } + else + return errno = EINVAL, -1; + + return 0; +} + +int PingSocket::shutdown(ioctx_t* ctx, int how) +{ + (void) ctx; + ScopedLock lock(&socket_lock); + if ( how & ~(SHUT_RD | SHUT_WR) ) + return errno = EINVAL, -1; + how_shutdown |= how; + // Drop the receive queue if shut down for read. + if ( how & SHUT_RD ) + { + // Avoid stack overflow in first_packet recursive destructor. 
+ while ( first_packet ) + { + Ref next = first_packet->next; + first_packet->next.Reset(); + first_packet = next; + } + last_packet.Reset(); + } + kthread_cond_broadcast(&receive_cond); + poll_channel.Signal(PollEventStatus()); + return 0; +} + +int PingSocket::getpeername(ioctx_t* ctx, uint8_t* addr, size_t* addrsize_ptr) +{ + ScopedLock lock(&socket_lock); + if ( !connected ) + return errno = ENOTCONN, -1; + size_t addrsize; + if ( !ctx->copy_from_src(&addrsize, addrsize_ptr, sizeof(addrsize)) ) + return -1; + if ( af == AF_INET ) + { + if ( sizeof(remote.in) < addrsize ) + addrsize = sizeof(remote.in); + } + else if ( af == AF_INET6 ) + { + if ( sizeof(remote.in6) < addrsize ) + addrsize = sizeof(remote.in6); + } + else + return errno = EAFNOSUPPORT, -1; + if ( !ctx->copy_to_dest(addr, &remote, addrsize) ) + return -1; + if ( !ctx->copy_to_dest(addrsize_ptr, &addrsize, sizeof(addrsize)) ) + return -1; + return 0; +} + +int PingSocket::getsockname(ioctx_t* ctx, uint8_t* addr, size_t* addrsize_ptr) +{ + ScopedLock lock(&socket_lock); + size_t addrsize; + if ( !ctx->copy_from_src(&addrsize, addrsize_ptr, sizeof(addrsize)) ) + return -1; + if ( af == AF_INET ) + { + if ( sizeof(local.in) < addrsize ) + addrsize = sizeof(local.in); + } + else if ( af == AF_INET6 ) + { + if ( sizeof(local.in6) < addrsize ) + addrsize = sizeof(local.in6); + } + else + return errno = EAFNOSUPPORT, -1; + if ( !ctx->copy_to_dest(addr, &local, addrsize) ) + return -1; + if ( !ctx->copy_to_dest(addrsize_ptr, &addrsize, sizeof(addrsize)) ) + return -1; + return 0; +} + +// socket_lock locked +void PingSocket::ReceivePacket(Ref pkt) +{ + if ( how_shutdown & SHUT_RD ) + return; + // Drop the packet if the receive queue is full. + if ( receive_limit < receive_current ) + return; + size_t available = receive_limit - receive_current; + if ( available < pkt->pmap.size ) + return; + // Add the packet to the receive queue. 
+ receive_current += pkt->pmap.size; + if ( last_packet ) + { + last_packet->next = pkt; + last_packet = pkt; + } + else + { + first_packet = pkt; + last_packet = pkt; + } + kthread_cond_broadcast(&receive_cond); + poll_channel.Signal(PollEventStatus()); +} + +void HandleIP(Ref pkt, + const struct in_addr* src, + const struct in_addr* dst, + bool dst_broadcast) +{ + if ( src->s_addr == htobe32(INADDR_ANY) ) + return; + const unsigned char* in = pkt->from + pkt->offset; + size_t inlen = pkt->length - pkt->offset; + if ( IP::ipsum(in, inlen) != 0 ) + return; + struct icmp_echo hdr; + if ( inlen < sizeof(hdr) ) + return; + memcpy(&hdr, in, sizeof(hdr)); + if ( hdr.type == ICMP_TYPE_ECHO_REQUEST ) + { + // Ignore broadcast echo requests as permitted by RFC 1122 3.2.2.6. + if ( dst_broadcast ) + return; + if ( hdr.code != 0 ) + return; + size_t payload_length = inlen - sizeof(hdr); + struct in_addr sendfrom; + if ( !IP::GetSourceIP(dst, src, &sendfrom, 0) ) + return; + Ref out_pkg = GetPacket(); + if ( !out_pkg ) + return; + unsigned char* out = out_pkg->from; + struct icmp_echo reply; + reply.type = ICMP_TYPE_ECHO_REPLY; + reply.code = 0; + reply.checksum = 0; + reply.identifier = hdr.identifier; + reply.sequence = hdr.sequence; + out_pkg->length = sizeof(reply) + payload_length; + if ( out_pkg->pmap.size < out_pkg->length ) + return; + memcpy(out, &reply, sizeof(reply)); + memcpy(out + sizeof(reply), in + sizeof(hdr), payload_length); + reply.checksum = htobe16(IP::ipsum(out, out_pkg->length)); + memcpy(out, &reply, sizeof(reply)); + if ( !IP::Send(out_pkg, &sendfrom, src, IPPROTO_ICMP, 0, false) ) + return; + return; + } + if ( hdr.type != ICMP_TYPE_ECHO_REPLY ) + return; + pkt->offset += sizeof(hdr) - sizeof(hdr.sequence); + hdr.identifier = be16toh(hdr.identifier); + // Port 0 is not valid. 
+ if ( hdr.identifier == 0 ) + return; + ScopedLock lock1(&bind_lock); + // Find the socket that would receive the datagram sent to that address + // and port, or if no such socket, perhaps a socket bound to the any address + // and that port. + PingSocket* socket = NULL; + PingSocket* any_socket = NULL; + for ( PingSocket* iter = bindings_v4[hdr.identifier]; + !socket && iter; + iter = iter->next_socket ) + { + // Receive the datagram only if sent to the socket's address. + if ( !memcmp(&iter->local.in.sin_addr, dst, sizeof(*dst)) ) + socket = iter; + // Receive the datagram only if the socket's address was the any address + // (and no other socket is bound to the datagram's destination address + // and port). + if ( iter->local.in.sin_addr.s_addr == htobe32(INADDR_ANY) ) + any_socket = iter; + } + // If no socket was bound to the datagram's destination address and port, + // try to deliver it to a socket bound to the any address and that port. + if ( !socket ) + socket = any_socket; + // Drop the datagram is no socket would receive it. + if ( !socket ) + return; + // If connected, require the source address is the remote address and the + // source port is the remote port, otherwise drop the datagram. + if ( socket->connected && + (memcmp(&socket->remote.in.sin_addr, src, sizeof(*src)) != 0 ) ) + return; + ScopedLock lock2(&socket->socket_lock); + // If the socket is bound to a network interface, require the datagram to + // have been received on that network interface. + if ( socket->ifindex && socket->ifindex != pkt->netif->ifinfo.linkid ) + return; + // Prepend the source address to the packet. 
+ struct sockaddr_in from_addr; + memset(&from_addr, 0, sizeof(from_addr)); + from_addr.sin_family = AF_INET; + from_addr.sin_port = htobe16(hdr.identifier); + from_addr.sin_addr = *src; + if ( pkt->offset < sizeof(from_addr) ) + return; + pkt->offset -= sizeof(from_addr); + memcpy(pkt->from + pkt->offset, &from_addr, sizeof(from_addr)); + // Receive the datagram on the socket. + socket->ReceivePacket(pkt); +} + +Ref Socket(int af) +{ + if ( !IsSupportedAddressFamily(af) ) + return errno = EAFNOSUPPORT, Ref(NULL); + return Ref(new PingSocket(af)); +} + +} // namespace Ping +} // namespace Sortix diff --git a/kernel/net/ping.h b/kernel/net/ping.h new file mode 100644 index 00000000..1d829dd2 --- /dev/null +++ b/kernel/net/ping.h @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2016, 2017 Jonas 'Sortie' Termansen. + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + * + * net/ping.h + * Internet Control Message Protocol Echo. 
+ */ + +#ifndef SORTIX_NET_PING_H +#define SORTIX_NET_PING_H + +#include + +#include +#include + +namespace Sortix { +namespace Ping { + +void Init(); +void HandleIP(Ref pkt, + const struct in_addr* src, + const struct in_addr* dst, + bool dst_broadcast); +Ref Socket(int af); + +} // namespace Ping +} // namespace Sortix + +#endif diff --git a/kernel/net/socket.cpp b/kernel/net/socket.cpp index 6e971f74..924a87ae 100644 --- a/kernel/net/socket.cpp +++ b/kernel/net/socket.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016 Jonas 'Sortie' Termansen. + * Copyright (c) 2016, 2017 Jonas 'Sortie' Termansen. * * Permission to use, copy, modify, and distribute this software for any * purpose with or without fee is hereby granted, provided that the above @@ -30,6 +30,7 @@ #include #include "fs.h" +#include "ip.h" namespace Sortix { @@ -37,6 +38,7 @@ static Ref CreateSocket(int domain, int type, int protocol) { switch ( domain ) { + case AF_INET: return IP::Socket(type, protocol); case AF_UNIX: return NetFS::Socket(type, protocol); default: return errno = EAFNOSUPPORT, Ref(NULL); } diff --git a/kernel/net/tcp.cpp b/kernel/net/tcp.cpp new file mode 100644 index 00000000..bbfdda42 --- /dev/null +++ b/kernel/net/tcp.cpp @@ -0,0 +1,2570 @@ +/* + * Copyright (c) 2016, 2017, 2018, 2022 Jonas 'Sortie' Termansen. + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + * + * net/tcp.cpp + * Transmission Control Protocol. + */ + +// TODO: Implement sending back RST and such. +// TODO: PUSH. +// TODO: URG. +// TODO: Nagle's algorithm, MSG_MORE, TCP_CORK, TCP_NODELAY, etc. +// TODO: TCP options. +// TODO: Maximum Segment Size (respect TCP_MSS). +// TODO: Efficient receieve queue when out of order. +// TODO: Efficient backlog / half-open. Avoid denial of service attacks. +// TODO: Measure average round trip time for efficient retransmission? +// TODO: High speed extensions. +// TODO: Anti-congestion extensions. +// TODO: Selective acknowledgements. +// TODO: Implement all RFC 1122 TCP requirements. +// TODO: Probing Zero Windows per RFC 1122 4.2.2.17. +// TODO: os-test all the things. + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifndef IOV_MAX +#include +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ip.h" +#include "tcp.h" + +#define BUFFER_SIZE 65536 // Documented in tcp(4). + +#define NUM_RETRANSMISSIONS 6 // Documented in tcp(4) + +namespace Sortix { +namespace TCP { + +class TCPSocket; + +union tcp_sockaddr +{ + sa_family_t family; + struct sockaddr_in in; + struct sockaddr_in6 in6; +}; + +// The TCP states per STD 7 (RFC 793). 
+enum tcp_state
+{
+  TCP_STATE_CLOSED,
+  TCP_STATE_LISTEN,
+  TCP_STATE_SYN_SENT,
+  TCP_STATE_SYN_RECV,
+  TCP_STATE_ESTAB,
+  TCP_STATE_FIN_WAIT_1,
+  TCP_STATE_FIN_WAIT_2,
+  TCP_STATE_CLOSE_WAIT,
+  TCP_STATE_CLOSING,
+  TCP_STATE_LAST_ACK,
+  TCP_STATE_TIME_WAIT,
+};
+
+enum tcp_special
+{
+  TCP_SPECIAL_NOT,
+  TCP_SPECIAL_PENDING,
+  TCP_SPECIAL_WINDOW,
+  TCP_SPECIAL_ACKED,
+};
+
+// Global lock protecting all TCP sockets as they need to access each other.
+static kthread_mutex_t tcp_lock = KTHREAD_MUTEX_INITIALIZER;
+
+// Per-port heads of the bound socket lists, one table per address family.
+static TCPSocket** bindings_v4;
+static TCPSocket** bindings_v6;
+
+// Allocate both binding tables with one slot per 16-bit port and mark every
+// slot as empty. Panics if either allocation fails.
+void Init()
+{
+  const size_t port_count = 65536;
+  bindings_v4 = new TCPSocket*[port_count];
+  bindings_v6 = bindings_v4 ? new TCPSocket*[port_count] : NULL;
+  if ( !bindings_v4 || !bindings_v6 )
+    Panic("Failed to allocate TCP Socket bindings");
+  for ( size_t port = 0; port < port_count; port++ )
+  {
+    bindings_v4[port] = NULL;
+    bindings_v6[port] = NULL;
+  }
+}
+
+// Sequence number comparisons: the difference a - b is reinterpreted as a
+// signed 32-bit value and its sign decides the ordering.
+
+// Whether a <= b.
+static inline bool mod32_le(tcp_seq a, tcp_seq b)
+{
+  int32_t distance = (int32_t) (a - b);
+  return distance <= 0;
+}
+
+// Whether a < b.
+static inline bool mod32_lt(tcp_seq a, tcp_seq b)
+{
+  int32_t distance = (int32_t) (a - b);
+  return distance < 0;
+}
+
+// Whether a >= b.
+static inline bool mod32_ge(tcp_seq a, tcp_seq b)
+{
+  return !mod32_lt(a, b);
+}
+
+// Whether a > b.
+static inline bool mod32_gt(tcp_seq a, tcp_seq b)
+{
+  return !mod32_le(a, b);
+}
+
+// Whether sockets can be created in the address family.
+static bool IsSupportedAddressFamily(int af)
+{
+  return af == AF_INET /* TODO: || af == AF_INET6 */;
+}
+
+// The size of a socket address in the address family, or 0 if the family is
+// not known.
+static size_t AddressFamilySize(int af)
+{
+  if ( af == AF_INET )
+    return sizeof(struct sockaddr_in);
+  if ( af == AF_INET6 )
+    return sizeof(struct sockaddr_in6);
+  return 0;
+}
+
+// The TCP socket implementation. It is separate from the class TCPSocketNode
+// as that class is reference counted, but this class manages its own lifetime
+// so the socket is properly shut down after all references are closed.
+//
+// Bound sockets are in a double linked list starting from the appropriate
+// bindings array indexed by the port, and then the sockets on that port are
+// doubly linked using prev_socket and next_socket.
+// +// Half-open sockets are in a doubly linked list starting from connecting_half +// in the listening socket, and then doubly linked with connecting_prev and +// connecting_next (with connecting_parent going back to the listening socket). +// +// Ready sockets that have not yet been accepted are in a doubly linked list +// starting from connecting_ready in the listening socket, and then doubly +// linked with connecting_prev and connecting_next (with connecting_parent going +// back to the listening socket). +// +// A socket wants to be deleted when it's in the CLOSED state and is not +// referenced by its TCPSocketNode anymore. Deletion is possible when the timer +// is not pending. +class TCPSocket +{ + friend void HandleIP(Ref pkt, + const struct in_addr* src, + const struct in_addr* dst, + bool dst_broadcast); + +public: + TCPSocket(int af); + ~TCPSocket(); + Ref accept4(ioctx_t* ctx, uint8_t* addr, size_t* addrsize, + int flags); + int bind(ioctx_t* ctx, const uint8_t* addr, size_t addrsize); + int connect(ioctx_t* ctx, const uint8_t* addr, size_t addrsize); + int listen(ioctx_t* ctx, int backlog); + ssize_t recv(ioctx_t* ctx, uint8_t* buf, size_t count, int flags); + ssize_t send(ioctx_t* ctx, const uint8_t* buf, size_t count, int flags); + ssize_t sendmsg(ioctx_t* ctx, const struct msghdr* msg_ptr, int flags); + ssize_t read(ioctx_t* ctx, uint8_t* buf, size_t count); + ssize_t recvmsg(ioctx_t* ctx, struct msghdr* msg, int flags); + ssize_t write(ioctx_t* ctx, const uint8_t* buf, size_t count); + int poll(ioctx_t* ctx, PollNode* node); + int getsockopt(ioctx_t* ctx, int level, int option_name, void* option_value, + size_t* option_size_ptr); + int setsockopt(ioctx_t* ctx, int level, int option_name, + const void* option_value, size_t option_size); + int shutdown(ioctx_t* ctx, int how); + int getpeername(ioctx_t* ctx, uint8_t* addr, size_t* addrsize); + int getsockname(ioctx_t* ctx, uint8_t* addr, size_t* addrsize); + +public: + void Unreference(); + void 
ProcessPacket(Ref pkt, union tcp_sockaddr* pkt_src, + union tcp_sockaddr* pkt_dst); + void ReceivePacket(Ref pkt, union tcp_sockaddr* pkt_src, + union tcp_sockaddr* pkt_dst); + void OnTimer(); + + inline bool want_destruction() + { + return state == TCP_STATE_CLOSED && !is_referenced; + } + + inline bool can_destroy() + { + return want_destruction() && !timer_armed; + } + +private: + short PollEventStatus(); + bool ImportAddress(ioctx_t* ctx, union tcp_sockaddr* dest, const void* addr, + size_t addrsize); + bool CanBind(union tcp_sockaddr new_local); + bool BindDefault(const union tcp_sockaddr* new_local_ptr); + void UpdateWindow(uint16_t new_window); + void TransmitLoop(); + bool Transmit(); + void ScheduleTransmit(); + void SetDeadline(); + void SetTimer(); + void Close(); + void Destroy(); + void Disconnect(); + void Fail(int error); + ssize_t recv_unlocked(ioctx_t* ctx, uint8_t* buf, size_t count, int flags); + ssize_t send_unlocked(ioctx_t* ctx, const uint8_t* buf, size_t count, + int flags); + int shutdown_unlocked(int how); + +public: + // The previous socket bound on the same port in the address family. + TCPSocket* prev_socket; + + // The next socket bound on the same port in the address family. + TCPSocket* next_socket; + + // The first half-connected socket in our listening queue. + TCPSocket* connecting_half; + + // The first ready socket in our listening queue. + TCPSocket* connecting_ready; + + // The previous half-connected or ready socket in our listening queue. + TCPSocket* connecting_prev; + + // The next half-connected or ready socket in our listening queue. + TCPSocket* connecting_next; + + // The listening socket this socket is in the listening queue for. + TCPSocket* connecting_parent; + + // Condition variable that is signaled when new data can be received. + kthread_cond_t receive_cond; + + // Condition variable that is signaled when new data can be transmitted. 
+ kthread_cond_t transmit_cond; + + // The local socket name, or the any address port 0 if not set. + union tcp_sockaddr local; + + // The remote socket name, or the any address port 0 if not set. + union tcp_sockaddr remote; + + // The network interface the socket is bound to, or 0 if none. + unsigned int ifindex; + + // Whether the socket has been bound to a port. + bool bound; + + // Whether the socket is receiving datagrams. + bool remoted; + + // Whether SO_REUSEADDR is set. + bool reuseaddr; + + // Whether the socket is referenced from anywhere and must not deallocate. + bool is_referenced; + +private: + // The timer used for retransmissions and timing out the connection. + Timer timer; + + // The poll channel to publish poll bit changes on. + PollChannel poll_channel; + + // The queue of incoming packets whose sequence numbers are too high to + // process right now, sorted by increasing sequence number. + Ref receive_queue; // TODO: Not a good way to keep track of this. + + // The deadline for the remote to acknowledge before retransmitting. + struct timespec deadline; + + // The offset at which data begins in the incoming ring buffer. + size_t incoming_offset; + + // The amount of bytes in the incoming ring buffer. + size_t incoming_used; + + // The offset at which data begins in the outgoing ring buffer. + size_t outgoing_offset; + + // The amount of bytes in the outgoing ring buffer. + size_t outgoing_used; + + // Send unacknowledged (STD 7, RFC 793). + tcp_seq send_una; + + // Send next (STD 7, RFC 793). + tcp_seq send_nxt; + + // Send window (STD 7, RFC 793). + tcp_seq send_wnd; + + // Send urgent pointer (STD 7, RFC 793). + tcp_seq send_up; + + // Segment sequence number used for last window update (STD 7, RFC 793). + tcp_seq send_wl1; + + // Segment acknowledgment number used for last window update (STD 7, RFC + // 793). + tcp_seq send_wl2; + + // Next sequence to send (STD 7, RFC 793). 
+ tcp_seq send_pos; + + // Initial send sequence number (STD 7, RFC 793). + tcp_seq iss; + + // Receive next (STD 7, RFC 793). + tcp_seq recv_nxt; + + // Receive window (STD 7, RFC 793). + tcp_seq recv_wnd; + + // Receive urgent pointer (STD 7, RFC 793). + tcp_seq recv_up; + + // Last sequence acked (STD 7, RFC 793). + tcp_seq recv_acked; + + // Last window size advertised (STD 7, RFC 793). + tcp_seq recv_wndlast; + + // Initial receive sequence number (STD 7, RFC 793). + tcp_seq irs; + + // The address family to which this socket belongs. + int af; + + // Set to an errno value if a socket error has occured, or 0 otherwise. + int sockerr; + + // The number of sockets in the listening queue. + int backlog_used; + + // The maximum number of sockets in the listening queue. + int backlog_max; + + // The number of retransmissions that have occured since the last + // acknowledgement from the remote socket. + unsigned int retransmissions; + + // The current TCP state. + enum tcp_state state; + + // The state of the outgoing SYN. + enum tcp_special outgoing_syn; + + // The state of the outgoing FIN. + enum tcp_special outgoing_fin; + + // Whether SYN has been received from the remote socket. + bool has_syn; + + // Whether FIN has been received from the remote socket. + bool has_fin; + + // Whether a transmission has been scheduled. + bool transmit_scheduled; + + // Whether the timer is pending. + bool timer_armed; + + // Whether the socket has been shut down for receive. + bool shutdown_receive; + + // The incoming ring buffer. + unsigned char incoming[BUFFER_SIZE]; + + // The outgoing ring buffer. + unsigned char outgoing[BUFFER_SIZE]; +}; + +// The TCP socket Inode with a reference counted lifetime. The backend class +// TCPSocket is separate as it may stay alive for a little while after all +// references to it has been lost. 
+class TCPSocketNode : public AbstractInode +{ +public: + TCPSocketNode(TCPSocket* socket); + virtual ~TCPSocketNode(); + virtual Ref accept4(ioctx_t* ctx, uint8_t* addr, size_t* addrsize, + int flags); + virtual int bind(ioctx_t* ctx, const uint8_t* addr, size_t addrsize); + virtual int connect(ioctx_t* ctx, const uint8_t* addr, size_t addrsize); + virtual int listen(ioctx_t* ctx, int backlog); + virtual ssize_t recv(ioctx_t* ctx, uint8_t* buf, size_t count, int flags); + virtual ssize_t recvmsg(ioctx_t* ctx, struct msghdr* msg, int flags); + virtual ssize_t send(ioctx_t* ctx, const uint8_t* buf, size_t count, + int flags); + virtual ssize_t sendmsg(ioctx_t* ctx, const struct msghdr* msg_ptr, + int flags); + virtual ssize_t read(ioctx_t* ctx, uint8_t* buf, size_t count); + virtual ssize_t write(ioctx_t* ctx, const uint8_t* buf, size_t count); + virtual int poll(ioctx_t* ctx, PollNode* node); + virtual int getsockopt(ioctx_t* ctx, int level, int option_name, + void* option_value, size_t* option_size_ptr); + virtual int setsockopt(ioctx_t* ctx, int level, int option_name, + const void* option_value, size_t option_size); + virtual int shutdown(ioctx_t* ctx, int how); + virtual int getpeername(ioctx_t* ctx, uint8_t* addr, size_t* addrsize); + virtual int getsockname(ioctx_t* ctx, uint8_t* addr, size_t* addrsize); + +private: + TCPSocket* socket; + +}; + +void TCPSocket__OnTimer(Clock* /*clock*/, Timer* /*timer*/, void* user) +{ + ((TCPSocket*) user)->OnTimer(); +} + +TCPSocket::TCPSocket(int af) +{ + prev_socket = NULL; + next_socket = NULL; + connecting_half = NULL; + connecting_ready = NULL; + connecting_prev = NULL; + connecting_next = NULL; + connecting_parent = NULL; + receive_cond = KTHREAD_COND_INITIALIZER; + transmit_cond = KTHREAD_COND_INITIALIZER; + memset(&local, 0, sizeof(local)); + memset(&remote, 0, sizeof(remote)); + ifindex = 0; + bound = false; + remoted = false; + reuseaddr = false; + // timer is initialized by its constructor. 
+ timer.Attach(Time::GetClock(CLOCK_MONOTONIC)); + // poll_channel is initialized by its constructor. + // receive_queue is initialized by its constructor. + deadline = timespec_make(-1, 0); + incoming_offset = 0; + incoming_used = 0; + outgoing_offset = 0; + outgoing_used = 0; + send_una = 0; + send_nxt = 0; + send_wnd = 0; + send_up = 0; + send_wl1 = 0; + send_wl2 = 0; + send_pos = 0; + iss = 0; + recv_nxt = 0; + recv_wnd = 0; + recv_up = 0; + recv_acked = 0; + recv_wndlast = 0; + irs = 0; + this->af = af; + sockerr = 0; + backlog_used = 0; + backlog_max = 0; + retransmissions = 0; + state = TCP_STATE_CLOSED; + outgoing_syn = TCP_SPECIAL_NOT; + outgoing_fin = TCP_SPECIAL_NOT; + has_syn = false; + has_fin = false; + transmit_scheduled = false; + is_referenced = false; + timer_armed = false; + shutdown_receive = false; + memset(incoming, 0, sizeof(incoming)); + memset(outgoing, 0, sizeof(outgoing)); +} + +TCPSocket::~TCPSocket() +{ + assert(state == TCP_STATE_CLOSED); + assert(!bound); + assert(!prev_socket); + assert(!next_socket); + assert(!connecting_half); + assert(!connecting_half); + assert(!connecting_ready); + assert(!connecting_prev); + assert(!connecting_next); + assert(!connecting_parent); + assert(!is_referenced); + // Avoid stack overflow in receive_queue recursive destructor. 
+ while ( receive_queue ) + { + Ref packet = receive_queue; + receive_queue = packet->next; + packet->next.Reset(); + } +} + +void TCPSocket::Unreference() +{ + kthread_mutex_lock(&tcp_lock); + is_referenced = false; + Disconnect(); + bool do_delete = can_destroy(); + kthread_mutex_unlock(&tcp_lock); + if ( do_delete ) + delete this; +} + +void TCPSocket::Close() // tcp_lock taken +{ + if ( timer_armed && timer.TryCancel() ) + timer_armed = false; + Destroy(); + state = TCP_STATE_CLOSED; + kthread_cond_broadcast(&transmit_cond); + kthread_cond_broadcast(&receive_cond); + deadline = timespec_make(-1, 0); + SetTimer(); +} + +void TCPSocket::Disconnect() // tcp_lock taken +{ + if ( state == TCP_STATE_LISTEN ) + Close(); + else if ( state != TCP_STATE_CLOSED ) + { + // TODO: Send back RST if the peer sends when we're not receiving. + if ( state == TCP_STATE_SYN_SENT || + state == TCP_STATE_SYN_RECV || + state == TCP_STATE_ESTAB || + state == TCP_STATE_CLOSE_WAIT ) + { + shutdown_unlocked(SHUT_WR); + // CLOSED, LAST_ACK, FIN_WAIT_1. + } + // FIN_WAIT_1 will enter FIN_WAIT_2 or time out. + // FIN_WAIT_2 will time out when unreferenced. + // CLOSING will resend FIN or time out. + // LAST_ACK will resend FIN or time out. + // TIME_WAIT will time out and close. 
+ if ( state == TCP_STATE_FIN_WAIT_2 && !is_referenced ) + { + deadline = timespec_make(-1, 0); + SetDeadline(); + SetTimer(); + } + } +} + +void TCPSocket::Fail(int error) +{ + sockerr = error; + Close(); +} + +void TCPSocket::Destroy() // tcp_lock taken +{ + if ( bound ) + { + if ( af == AF_INET ) + { + uint16_t port = be16toh(local.in.sin_port); + if ( prev_socket ) + prev_socket->next_socket = next_socket; + else + bindings_v4[port] = next_socket; + if ( next_socket ) + next_socket->prev_socket = prev_socket; + } + else if ( af == AF_INET6 ) + { + uint16_t port = be16toh(local.in6.sin6_port); + if ( prev_socket ) + prev_socket->next_socket = next_socket; + else + bindings_v6[port] = next_socket; + if ( next_socket ) + next_socket->prev_socket = prev_socket; + } + prev_socket = NULL; + next_socket = NULL; + bound = false; + } + while ( connecting_half || connecting_ready ) + { + TCPSocket* socket; + if ( connecting_half ) + { + socket = connecting_half; + connecting_half = socket->connecting_next; + if ( connecting_half ) + connecting_half->connecting_prev = NULL; + } + else + { + socket = connecting_ready; + connecting_ready = socket->connecting_next; + if ( connecting_ready ) + connecting_ready->connecting_prev = NULL; + } + socket->connecting_prev = NULL; + socket->connecting_next = NULL; + socket->connecting_parent = NULL; + backlog_used--; + socket->Disconnect(); + } + if ( connecting_parent ) + { + if ( connecting_prev ) + connecting_prev->connecting_next = connecting_next; + else if ( state == TCP_STATE_SYN_RECV ) + connecting_parent->connecting_half = connecting_next; + else + connecting_parent->connecting_ready = connecting_next; + if ( connecting_next ) + connecting_next->connecting_prev = connecting_prev; + connecting_prev = NULL; + connecting_next = NULL; + connecting_parent->backlog_used--; + connecting_parent = NULL; + } +} + +Ref TCPSocket::accept4(ioctx_t* ctx, uint8_t* addr, size_t* addrsize_ptr, + int flags) +{ + if ( flags & ~(0) ) + return 
errno = EINVAL, Ref(NULL); + if ( addr && !addrsize_ptr ) + return errno = EINVAL, Ref(NULL); + ScopedLock lock(&tcp_lock); + if ( state != TCP_STATE_LISTEN ) + return errno = EINVAL, Ref(NULL); + while ( !connecting_ready ) + { + if ( ctx->dflags & O_NONBLOCK ) + return errno = EWOULDBLOCK, Ref(NULL); + if ( !kthread_cond_wait_signal(&receive_cond, &tcp_lock) ) + return errno = EINTR, Ref(NULL); + } + TCPSocket* socket = connecting_ready; + if ( addr ) + { + size_t addrsize; + if ( !ctx->copy_from_src(&addrsize, addrsize_ptr, sizeof(addrsize)) ) + return Ref(NULL); + size_t af_addrsize = AddressFamilySize(af); + if ( af_addrsize < addrsize ) + addrsize = af_addrsize; + if ( !ctx->copy_to_dest(addr, &socket->remote, addrsize) ) + return Ref(NULL); + if ( !ctx->copy_to_dest(addrsize_ptr, &addrsize, sizeof(addrsize)) ) + return Ref(NULL); + } + Ref result(new TCPSocketNode(socket)); + if ( !result ) + return Ref(NULL); + connecting_ready = socket->connecting_next; + if ( connecting_ready ) + connecting_ready->connecting_prev = NULL; + socket->connecting_prev = NULL; + socket->connecting_next = NULL; + socket->connecting_parent = NULL; + backlog_used--; + return result; +} + +bool TCPSocket::ImportAddress(ioctx_t* ctx, + union tcp_sockaddr* dest, + const void* addr, + size_t addrsize) +{ + // TODO: os-test whether AF_UNSPEC can disconnect. + if ( addrsize != AddressFamilySize(af) ) + return errno = EINVAL, -1; + union tcp_sockaddr copy; + memset(©, 0, sizeof(copy)); + if ( !ctx->copy_from_src(©, addr, addrsize) ) + return false; + if ( copy.family != af ) + return errno = EAFNOSUPPORT, false; + memcpy(dest, ©, sizeof(copy)); + return true; +} + +bool TCPSocket::CanBind(union tcp_sockaddr new_local) // tcp_lock taken +{ + if ( af == AF_INET ) + { + // TODO: os-test binding to broadcast addresses. + // Bind to either the any address or the address of a network interface. 
+ if ( new_local.in.sin_addr.s_addr != htobe32(INADDR_ANY) ) + { + // TODO: What happens to sockets if the network interface changes + // its address? + ScopedLock ifs_lock(&netifs_lock); + bool found = false; + for ( unsigned int i = 1; i < netifs_count; i++ ) + { + NetworkInterface* netif = netifs[i]; + if ( !netif ) + continue; + ScopedLock cfg_lock(&netif->cfg_lock); + if ( memcmp(&netif->cfg.inet.address, &new_local.in.sin_addr, + sizeof(struct in_addr)) == 0 ) + { + found = true; + break; + } + } + // No interface had the correct address. + if ( !found ) + return errno = EADDRNOTAVAIL, false; + } + uint16_t port = be16toh(new_local.in.sin_port); + for ( TCPSocket* socket = bindings_v4[port]; + socket; + socket = socket->next_socket ) + { + // TODO: os-test how SO_REUSEADDR works for TCP. + if ( new_local.in.sin_addr.s_addr == htobe32(INADDR_ANY) && + !(reuseaddr && socket->reuseaddr) ) + return errno = EADDRINUSE, false; + if ( socket->local.in.sin_addr.s_addr == htobe32(INADDR_ANY) && + !(reuseaddr && socket->reuseaddr) ) + return errno = EADDRINUSE, false; + if ( new_local.in.sin_addr.s_addr == + socket->local.in.sin_addr.s_addr ) + return errno = EADDRINUSE, false; + } + } + else if ( af == AF_INET6 ) + { + // TODO: IPv6 support for seeing if any interface has the address. 
+ if ( true ) + return errno = EAFNOSUPPORT, false; + uint16_t port = be16toh(new_local.in6.sin6_port); + if ( bindings_v6[port] ) + return errno = EADDRINUSE, false; + for ( TCPSocket* socket = bindings_v6[port]; + socket; + socket = socket->next_socket ) + { + if ( !memcmp(&new_local.in6.sin6_addr, &in6addr_any, + sizeof(in6addr_any)) && + !(reuseaddr && socket->reuseaddr) ) + if ( !memcmp(&socket->local.in6.sin6_addr, &in6addr_any, + sizeof(in6addr_any)) && + !(reuseaddr && socket->reuseaddr) ) + if ( !memcmp(&new_local.in6.sin6_addr, &socket->local.in6.sin6_addr, + sizeof(new_local.in6.sin6_addr)) ) + return errno = EADDRINUSE, false; + } + } + else + return errno = EAFNOSUPPORT, false; + return true; +} + +int TCPSocket::bind(ioctx_t* ctx, const uint8_t* addr, size_t addrsize) +{ + ScopedLock lock(&tcp_lock); + if ( bound ) + return errno = EINVAL, -1; + union tcp_sockaddr new_local; + if ( !ImportAddress(ctx, &new_local, addr, addrsize) ) + return -1; + uint16_t port; + if ( af == AF_INET ) + port = be16toh(new_local.in.sin_port); + else if ( af == AF_INET6 ) + port = be16toh(new_local.in6.sin6_port); + else + return errno = EAFNOSUPPORT, -1; + // TODO: Binding to the any address needs to pick the appropriate source + // interface and bind to its address. (Or really? udp doesn't? + // os-test?) + // TODO: os-test a server listening on any, and then getsockname a + // connection received on that port. + if ( port == 0 ) + return BindDefault(&new_local) ? 
0 : -1; + if ( !CanBind(new_local) ) + return -1; + if ( af == AF_INET ) + { + uint16_t port = be16toh(new_local.in.sin_port); + if ( bindings_v4[port] ) + bindings_v4[port]->prev_socket = this; + next_socket = bindings_v4[port]; + prev_socket = NULL; + bindings_v4[port] = this; + } + else if ( af == AF_INET6 ) + { + uint16_t port = be16toh(new_local.in6.sin6_port); + if ( bindings_v6[port] ) + return errno = EADDRINUSE, -1; + next_socket = bindings_v6[port]; + prev_socket = NULL; + bindings_v6[port] = this; + } + else + return errno = EAFNOSUPPORT, -1; + memcpy(&local, &new_local, sizeof(new_local)); + bound = true; + return 0; +} + +// tcp_lock locked +bool TCPSocket::BindDefault(const union tcp_sockaddr* new_local_ptr) +{ + // TODO: This allocator becomes increasingly biased as more ports are + // allocated. + // TODO: Try not to allocate recently used ports. + union tcp_sockaddr new_local; + if ( new_local_ptr ) + memcpy(&new_local, new_local_ptr, sizeof(union tcp_sockaddr)); + else + { + memset(&new_local, 0, sizeof(new_local)); + if ( af == AF_INET ) + { + new_local.in.sin_family = AF_INET; + new_local.in.sin_addr.s_addr = htobe32(INADDR_ANY); + } + else if ( af == AF_INET6 ) + { + new_local.in6.sin6_family = AF_INET6; + new_local.in6.sin6_addr = in6addr_any; + } + else + return errno = EAFNOSUPPORT, false; + } + uint16_t start = 32768; // Documented in tcp(4). + uint16_t end = 61000; // Documented in tcp(4). 
+ uint16_t count = end - start; + uint16_t offset = arc4random_uniform(count); + for ( uint16_t i = 0; i < count; i++ ) + { + uint16_t j = offset + i; + if ( count <= j ) + j -= count; + uint16_t port = start + j; + if ( af == AF_INET ) + new_local.in.sin_port = htobe16(port); + else if ( af == AF_INET6 ) + new_local.in6.sin6_port = htobe16(port); + else + return errno = EAFNOSUPPORT, false; + if ( !CanBind(new_local) ) + { + if ( errno == EADDRINUSE ) + continue; + return false; + } + if ( af == AF_INET ) + { + if ( bindings_v4[port] ) + bindings_v4[port]->prev_socket = this; + next_socket = bindings_v4[port]; + prev_socket = NULL; + bindings_v4[port] = this; + } + else if ( af == AF_INET6 ) + { + if ( bindings_v6[port] ) + bindings_v6[port]->prev_socket = this; + next_socket = bindings_v6[port]; + prev_socket = NULL; + bindings_v6[port] = this; + } + else + return errno = EAFNOSUPPORT, false; + memcpy(&local, &new_local, sizeof(new_local)); + bound = true; + return true; + } + return errno = EAGAIN, false; +} + +void TCPSocket::TransmitLoop() // tcp_lock taken +{ + if ( state == TCP_STATE_CLOSED ) + return; + if ( NUM_RETRANSMISSIONS <= retransmissions ) + { + Fail(ETIMEDOUT); + return; + } + if ( !Transmit() && NUM_RETRANSMISSIONS - 1 <= retransmissions ) + { + Fail(errno); + return; + } +} + +bool TCPSocket::Transmit() // tcp_lock taken +{ + if ( state == TCP_STATE_CLOSED ) + return (errno = sockerr ? sockerr : ENOTCONN), false; + + // Move new outgoing data into the transmission window if there is room. 
+ tcp_seq window_available = (tcp_seq) (send_una + send_wnd - send_nxt); + if ( window_available && outgoing_syn == TCP_SPECIAL_PENDING ) + { + send_nxt++; + outgoing_syn = TCP_SPECIAL_WINDOW; + window_available--; + } + if ( window_available ) + { + tcp_seq window_data = (tcp_seq)(send_nxt - send_una); + if ( outgoing_syn == TCP_SPECIAL_WINDOW ) + window_data--; + if ( outgoing_fin == TCP_SPECIAL_WINDOW ) + window_data--; + assert(window_data <= outgoing_used); + size_t outgoing_new = outgoing_used - window_data; + tcp_seq amount = window_available; + if ( outgoing_new < amount ) + amount = outgoing_new; + send_nxt += amount; + window_available -= amount; + } + if ( window_available && outgoing_fin == TCP_SPECIAL_PENDING ) + { + send_nxt++; + outgoing_fin = TCP_SPECIAL_WINDOW; + window_available--; + } + + // Transmit packets. + bool any = false; + while ( mod32_lt(send_pos, send_nxt) || + (has_syn && mod32_lt(recv_acked, recv_nxt)) || + recv_wnd != recv_wndlast ) + { + any = true; + size_t mtu; + union tcp_sockaddr sendfrom; + if ( af == AF_INET ) + { + if ( !IP::GetSourceIP(&local.in.sin_addr, &remote.in.sin_addr, + &sendfrom.in.sin_addr, ifindex, &mtu) ) + return false; + } + // TODO: IPv6 support. 
+ else + return errno = EAFNOSUPPORT, false; + if ( mtu < sizeof(struct tcphdr) ) + return errno = EINVAL, false; + mtu -= sizeof(struct tcphdr); + Ref pkt = GetPacket(); + if ( !pkt ) + return false; + pkt->length = sizeof(struct tcphdr); + unsigned char* out = pkt->from; + struct tcphdr hdr; + if ( af == AF_INET ) + { + hdr.th_sport = local.in.sin_port; + hdr.th_dport = remote.in.sin_port; + } + else if ( af == AF_INET6 ) + { + hdr.th_sport = local.in6.sin6_port; + hdr.th_dport = remote.in6.sin6_port; + } + else + return errno = EAFNOSUPPORT, false; + hdr.th_seq = htobe32(send_pos); + hdr.th_offset = TCP_OFFSET_ENCODE(sizeof(struct tcphdr) / 4); + hdr.th_flags = 0; + tcp_seq send_nxtpos = send_pos; + assert(mod32_le(send_nxtpos, send_nxt)); + if ( outgoing_syn == TCP_SPECIAL_WINDOW && send_nxtpos == send_una ) + { + hdr.th_flags |= TH_SYN; + send_nxtpos++; + } + assert(mod32_le(send_nxtpos, send_nxt)); + if ( has_syn ) + { + // TODO: RFC 1122 4.2.2.6: + // "TCP SHOULD send an MSS (Maximum Segment Size) option in + // every SYN segment when its receive MSS differs from the + // default 536, and MAY send it always." + // "If an MSS option is not received at connection setup, TCP + // MUST assume a default send MSS of 536 (576-40)." + hdr.th_flags |= TH_ACK; + hdr.th_ack = htobe32(recv_nxt); + } + else + hdr.th_ack = htobe32(0); + hdr.th_win = htobe16(recv_wnd); + hdr.th_urp = htobe16(0); + hdr.th_sum = htobe16(0); + tcp_seq window_data = (tcp_seq)(send_nxt - send_pos); + if ( send_pos == send_una && outgoing_syn == TCP_SPECIAL_WINDOW ) + window_data--; + if ( mod32_lt(send_pos, send_nxt) && + outgoing_fin == TCP_SPECIAL_WINDOW ) + window_data--; + if ( window_data ) + { + size_t amount = mtu < window_data ? 
mtu : window_data; + assert(outgoing_offset <= sizeof(outgoing)); + tcp_seq window_length = (tcp_seq) (send_nxtpos - send_una); + if ( outgoing_syn == TCP_SPECIAL_WINDOW ) + window_length--; + assert(window_length <= sizeof(outgoing)); + size_t outgoing_end = outgoing_offset + window_length; + if ( sizeof(outgoing) <= outgoing_end ) + outgoing_end -= sizeof(outgoing); + assert(outgoing_end < sizeof(outgoing)); + size_t until_end = sizeof(outgoing) - outgoing_end; + size_t first = until_end < amount ? until_end : amount; + assert(first <= sizeof(outgoing)); + assert(first <= sizeof(outgoing) - outgoing_end); + size_t second = amount - first; + assert(second <= sizeof(outgoing)); + memcpy(out + sizeof(hdr), outgoing + outgoing_end, first); + if ( second ) + memcpy(out + sizeof(hdr) + first, outgoing, second); + pkt->length += amount; + send_nxtpos += amount; + } + assert(mod32_le(send_nxtpos, send_nxt)); + if ( outgoing_fin == TCP_SPECIAL_WINDOW && + send_nxtpos + 1 == send_nxt ) + { + hdr.th_flags |= TH_FIN; + send_nxtpos++; + } + assert(mod32_le(send_nxtpos, send_nxt)); + memcpy(out, &hdr, sizeof(hdr)); + uint16_t checksum = 0; + if ( af == AF_INET ) + { + checksum = IP::ipsum_buf(checksum, &sendfrom.in.sin_addr, + sizeof(struct in_addr)); + checksum = IP::ipsum_buf(checksum, &remote.in.sin_addr, + sizeof(struct in_addr)); + } + else if ( af == AF_INET6 ) + { + checksum = IP::ipsum_buf(checksum, &sendfrom.in6.sin6_addr, + sizeof(struct in6_addr)); + checksum = IP::ipsum_buf(checksum, &remote.in6.sin6_addr, + sizeof(struct in6_addr)); + } + else + return errno = EAFNOSUPPORT, false; + checksum = IP::ipsum_word(checksum, IPPROTO_TCP); + checksum = IP::ipsum_word(checksum, pkt->length); + checksum = IP::ipsum_buf(checksum, out, pkt->length); + hdr.th_sum = htobe16(IP::ipsum_finish(checksum)); + memcpy(out, &hdr, sizeof(hdr)); + if ( af == AF_INET ) + { + if ( !IP::Send(pkt, &sendfrom.in.sin_addr, &remote.in.sin_addr, + IPPROTO_TCP, ifindex, false) ) + return false; + 
} + // TODO: IPv6 support. + else + return errno = EAFNOSUPPORT, false; + if ( has_syn ) + recv_acked = recv_nxt; + recv_wndlast = recv_wnd; + assert(mod32_le(send_nxtpos, send_nxt)); + send_pos = send_nxtpos; + } + if ( any ) + { + SetDeadline(); + SetTimer(); + } + return true; +} + +void TCPSocket::OnTimer() +{ + ScopedLock lock(&tcp_lock); + timer_armed = false; + if ( 0 <= deadline.tv_sec && + timespec_le(deadline, Time::Get(CLOCK_MONOTONIC)) ) + { + deadline = timespec_make(-1, 0); + if ( state == TCP_STATE_TIME_WAIT || + (state == TCP_STATE_FIN_WAIT_2 && !is_referenced) ) + Close(); + else if ( mod32_lt(send_una, send_pos) ) + { + retransmissions++; + send_pos = send_una; + } + } + transmit_scheduled = false; + TransmitLoop(); + if ( 0 <= deadline.tv_sec ) + SetTimer(); + if ( can_destroy() ) + delete this; +} + +void TCPSocket::ScheduleTransmit() // tcp_lock locked +{ + if ( state == TCP_STATE_CLOSED || state == TCP_STATE_LISTEN ) + return; + if ( transmit_scheduled && timer_armed ) + return; + transmit_scheduled = true; + SetTimer(); + +} + +void TCPSocket::SetDeadline() // tcp_lock locked +{ + if ( 0 <= deadline.tv_sec ) + return; + if ( state == TCP_STATE_TIME_WAIT || + (state == TCP_STATE_FIN_WAIT_2 && !is_referenced) ) + { + struct timespec now = Time::Get(CLOCK_MONOTONIC); + struct timespec msl2 = timespec_make(60, 0); // Documented in tcp(4). 
+ deadline = timespec_add(now, msl2); + } + else if ( mod32_le(send_una, send_pos) ) + { + struct timespec now = Time::Get(CLOCK_MONOTONIC); + struct timespec delay = timespec_make(1 + 1 * retransmissions, 0); + deadline = timespec_add(now, delay); + } +} + +void TCPSocket::SetTimer() // tcp_lock locked +{ + if ( timer_armed ) + { + if ( !timer.TryCancel() ) + return; + timer_armed = false; + } + if ( state == TCP_STATE_CLOSED ) + return; + bool destruction_is_wanted = want_destruction(); + if ( transmit_scheduled || destruction_is_wanted || 0 <= deadline.tv_sec ) + { + int flags = TIMER_FUNC_MAY_DEALLOCATE_TIMER; + struct itimerspec timeout; + memset(&timeout, 0, sizeof(timeout)); + // Slightly delay transmission to batch together a better reply. + if ( transmit_scheduled ) + timeout.it_value = timespec_make(0, 1); + else if ( 0 <= deadline.tv_sec ) + { + timeout.it_value = deadline; + flags |= TIMER_ABSOLUTE; + } + timer.Set(&timeout, NULL, flags, TCPSocket__OnTimer, this); + timer_armed = true; + } +} + +void TCPSocket::ProcessPacket(Ref pkt, + union tcp_sockaddr* pkt_src, + union tcp_sockaddr* pkt_dst) // tcp_lock locked +{ + const unsigned char* in = pkt->from + pkt->offset; + size_t inlen = pkt->length - pkt->offset; + struct tcphdr hdr; + memcpy(&hdr, in, sizeof(hdr)); + hdr.th_sport = be16toh(hdr.th_sport); + hdr.th_dport = be16toh(hdr.th_dport); + hdr.th_seq = be32toh(hdr.th_seq); + hdr.th_ack = be32toh(hdr.th_ack); + hdr.th_win = be16toh(hdr.th_win); + hdr.th_urp = be16toh(hdr.th_urp); + size_t offset = TCP_OFFSET_DECODE(hdr.th_offset) * 4; + in += offset; + inlen -= offset; + if ( state == TCP_STATE_CLOSED ) // STD 7, RFC 793, page 65. + { + if ( hdr.th_flags & TH_RST ) + return; + // TODO: ACK the RST. + return; + } + else if ( state == TCP_STATE_LISTEN ) // STD 7, RFC 793, page 65. + { + if ( hdr.th_flags & TH_RST ) + return; + if ( hdr.th_flags & TH_ACK ) + { + // TODO: Send . 
+ return; + } + if ( !(hdr.th_flags & TH_SYN) ) + return; + if ( !hdr.th_win ) + return; + if ( backlog_max <= backlog_used ) + return; + // TODO: Use SYN cache to mitigate SYN flood attack. + TCPSocket* socket = new TCPSocket(af); + if ( !socket ) + return; + assert(pkt_src); + assert(pkt_dst); + socket->remote = *pkt_src; + socket->local = *pkt_dst; + socket->remoted = true; + socket->bound = true; + if ( af == AF_INET ) + { + uint16_t port = be16toh(socket->local.in.sin_port); + socket->prev_socket = NULL; + socket->next_socket = bindings_v4[port]; + if ( socket->next_socket ) + socket->next_socket->prev_socket = socket; + bindings_v4[port] = socket; + } + else if ( af == AF_INET6 ) + { + uint16_t port = be16toh(socket->local.in6.sin6_port); + socket->prev_socket = NULL; + socket->next_socket = bindings_v6[port]; + if ( socket->next_socket ) + socket->next_socket->prev_socket = socket; + bindings_v6[port] = socket; + } + socket->iss = arc4random(); + socket->send_una = socket->iss; + socket->send_nxt = socket->iss; + socket->send_wnd = 1; + socket->send_pos = socket->iss; + socket->outgoing_syn = TCP_SPECIAL_PENDING; + socket->recv_wnd = TCP_MAXWIN; + socket->recv_acked = hdr.th_seq; + socket->recv_nxt = hdr.th_seq + 1; + socket->irs = hdr.th_seq; + socket->has_syn = true; + socket->state = TCP_STATE_SYN_RECV; + socket->UpdateWindow(hdr.th_win); + socket->connecting_parent = this; + socket->connecting_prev = NULL; + socket->connecting_next = connecting_half; + if ( socket->connecting_next ) + socket->connecting_next->connecting_prev = socket; + connecting_half = socket; + backlog_used++; + socket->TransmitLoop(); + return; + } + else if ( state == TCP_STATE_SYN_SENT ) // STD 7, RFC 793, page 66. + { + if ( hdr.th_flags & TH_ACK ) + { + if ( mod32_le(hdr.th_ack, iss) || mod32_gt(hdr.th_ack, send_nxt) ) + { + if ( hdr.th_flags & TH_RST ) + return; + // TODO: Send RST. 
+ return; + } + if ( !(mod32_le(send_una, hdr.th_ack) && + mod32_le(hdr.th_ack, send_nxt)) ) + return; + } + if ( hdr.th_flags & TH_RST ) + { + Fail(ECONNREFUSED); + return; + } + if ( !(hdr.th_flags & TH_SYN) ) + return; + recv_acked = hdr.th_seq; + recv_nxt = hdr.th_seq + 1; + irs = hdr.th_seq; + has_syn = true; + // RFC 1122 4.2.2.20 (c), page 94. + UpdateWindow(hdr.th_win); + send_wl1 = hdr.th_seq; + send_wl2 = hdr.th_ack; + // TODO: Drop packet if the packet contains data/FIN beyond the SYN? + if ( hdr.th_flags & TH_ACK ) + { + send_una = hdr.th_ack; + retransmissions = 0; + deadline = timespec_make(-1, 0); + SetDeadline(); + SetTimer(); + if ( mod32_lt(iss, send_una) ) + { + outgoing_syn = TCP_SPECIAL_ACKED; + state = TCP_STATE_ESTAB; + kthread_cond_broadcast(&receive_cond); // Wake up connect. + return; + } + } + state = TCP_STATE_SYN_RECV; + return; + } + // STD 7, RFC 793, page 69. + bool acceptable; + if ( inlen == 0 && recv_wnd == 0 ) + acceptable = hdr.th_seq == recv_nxt; + else if ( inlen == 0 && 0 < recv_wnd ) + acceptable = mod32_le(recv_nxt, hdr.th_seq) && + mod32_lt(hdr.th_seq, recv_nxt + recv_wnd); + else if ( 0 < inlen && 0 < recv_wnd ) + { + tcp_seq seg_end = (tcp_seq) (hdr.th_seq + inlen - 1); + acceptable = (mod32_le(recv_nxt, hdr.th_seq) && + mod32_lt(hdr.th_seq, recv_nxt + recv_wnd)) || + (mod32_le(recv_nxt, seg_end) && + mod32_lt(seg_end, recv_nxt + recv_wnd)); + } + else + { + acceptable = false; + // TODO: STD 7, RFC 793, page 69 "If the RCV.WND is zero, no segments + // will be acceptable, but special allowance should be made to + // accept valid ACKs, URGs and RSTs". + } + if ( !acceptable ) + { + if ( hdr.th_flags & TH_RST ) + return; + // Send . + recv_acked = recv_nxt - 1; + return; + } + // STD 7, RFC 793, page 70. Process segments in the right order and trim the + // segment to the receive window. 
+    // Remember the segment's sequence number as it arrived: hdr.th_seq is
+    // advanced by the trimming below, whereas the receive queue is ordered by
+    // the th_seq stored in each queued packet's header. This must be a full
+    // 32-bit tcp_seq; a 16-bit variable would drop the upper half before the
+    // mod32_le comparison and misorder the queue.
+    tcp_seq real_seq = hdr.th_seq;
+    // A SYN located before recv_nxt is skipped.
+    if ( mod32_lt(hdr.th_seq, recv_nxt) && (hdr.th_flags & TH_SYN) )
+    {
+        hdr.th_flags &= ~TH_SYN;
+        hdr.th_seq++;
+    }
+    // Skip leading payload bytes located before recv_nxt.
+    if ( mod32_lt(hdr.th_seq, recv_nxt) )
+    {
+        tcp_seq skip = recv_nxt - hdr.th_seq;
+        if ( inlen < skip )
+            skip = inlen;
+        hdr.th_seq += skip;
+        in += skip;
+        inlen -= skip;
+    }
+    // A FIN located before recv_nxt is skipped as well.
+    if ( mod32_lt(hdr.th_seq, recv_nxt) && (hdr.th_flags & TH_FIN) )
+    {
+        hdr.th_flags &= ~TH_FIN;
+        hdr.th_seq++;
+    }
+    if ( mod32_lt(hdr.th_seq, recv_nxt) ) // Already processed.
+        return;
+    if ( mod32_gt(hdr.th_seq, recv_nxt) ) // Can't process yet.
+    {
+        // Insert the segment in the receive queue.
+        Ref prev;
+        Ref iter = receive_queue;
+        // TODO: For n packets in the worst order, this scales O(n^2).
+        // TODO: This wastes a packet per byte in the worst case.
+        while ( iter )
+        {
+            // Read the queued packet's sequence number straight from its
+            // header, which is still in network byte order.
+            const unsigned char* iter_in = iter->from + iter->offset;
+            const unsigned char* iter_in_seq =
+                iter_in + offsetof(struct tcphdr, th_seq);
+            tcp_seq iter_seq;
+            memcpy(&iter_seq, iter_in_seq, sizeof(iter_seq));
+            iter_seq = be32toh(iter_seq);
+            // Stop at the first queued packet that is not before this one.
+            if ( mod32_le(real_seq, iter_seq) )
+                break;
+            // TODO: Handle duplicate and overlapping segments.
+            prev = iter;
+            iter = iter->next;
+        }
+        if ( prev )
+        {
+            pkt->next = prev->next;
+            prev->next = pkt;
+        }
+        else
+        {
+            pkt->next = receive_queue;
+            receive_queue = pkt;
+        }
+        return;
+    }
+    // Ignore payload beyond the receive window.
+    if ( recv_wnd < inlen )
+        inlen = recv_wnd;
+    // STD 7, RFC 793, page 70.
+    if ( hdr.th_flags & TH_RST )
+    {
+        if ( state == TCP_STATE_CLOSING ||
+             state == TCP_STATE_LAST_ACK ||
+             state == TCP_STATE_TIME_WAIT )
+            Close();
+        else
+            Fail(ECONNRESET);
+        return;
+    }
+    // STD 7, RFC 793, page 71.
+    if ( hdr.th_flags & TH_SYN )
+    {
+        // TODO: Send RST.
+        Fail(ECONNRESET);
+        return;
+    }
+    // STD 7, RFC 793, page 72.
+    if ( !(hdr.th_flags & TH_ACK) )
+        return;
+    // STD 7, RFC 793, page 72.
+    if ( state == TCP_STATE_SYN_RECV )
+    {
+        // RFC 1122 4.2.2.20 (f), page 94.
+ UpdateWindow(hdr.th_win); + send_wl1 = hdr.th_seq; + send_wl2 = hdr.th_ack; + if ( mod32_le(send_una, hdr.th_ack) && mod32_le(hdr.th_ack, send_nxt) ) + { + state = TCP_STATE_ESTAB; + kthread_cond_broadcast(&receive_cond); // Wake up connect. + if ( connecting_parent ) + { + if ( connecting_prev ) + connecting_prev->connecting_next = connecting_next; + else + connecting_parent->connecting_half = connecting_next; + if ( connecting_next ) + connecting_next->connecting_prev = connecting_prev; + // TODO: This inserts the connection to the front of the + // accept queue, rather than the end, which is unfair to + // connections that have been waiting longer. + connecting_prev = NULL; + connecting_next = connecting_parent->connecting_ready; + if ( connecting_next ) + connecting_next->connecting_prev = this; + connecting_parent->connecting_ready = this; + kthread_cond_broadcast(&connecting_parent->receive_cond); + uint16_t status = connecting_parent->PollEventStatus(); + connecting_parent->poll_channel.Signal(status); + } + } + else + { + // TODO: Send . + TransmitLoop(); + return; + } + } + // STD 7, RFC 793, page 72. + // TODO: RFC 1122 4.2.2.20 (g), page 94 says SEG.ACK =< SND.UNA however this + // causes incoming connections to fail. + if ( mod32_lt(hdr.th_ack, send_una) ) + return; // Drop duplicate ack already seen. + else if ( mod32_lt(send_nxt, hdr.th_ack) ) + { + // TODO: Send ACK. + return; + } + // STD 7, RFC 793, page 72. Remove acknowledged data from the window. + tcp_seq old_send_una = send_una; + tcp_seq acked = (tcp_seq) (hdr.th_ack - send_una); + if ( outgoing_syn == TCP_SPECIAL_WINDOW && 0 < acked ) + { + outgoing_syn = TCP_SPECIAL_ACKED; + acked--; + send_una++; + } + tcp_seq window_data = (tcp_seq) (send_nxt - send_una); + if ( outgoing_fin == TCP_SPECIAL_WINDOW ) + window_data--; + if ( window_data && acked ) + { + size_t amount = window_data < acked ? 
window_data : acked; + assert(outgoing_offset < sizeof(outgoing)); + outgoing_offset += amount; + if ( sizeof(outgoing) <= outgoing_offset ) + outgoing_offset -= sizeof(outgoing); + assert(outgoing_offset < sizeof(outgoing)); + assert(amount <= outgoing_used); + outgoing_used -= amount; + kthread_cond_broadcast(&transmit_cond); + poll_channel.Signal(PollEventStatus()); + acked -= amount; + send_una += amount; + } + bool fin_was_acked = false; + if ( outgoing_fin == TCP_SPECIAL_WINDOW && 0 < acked ) + { + outgoing_fin = TCP_SPECIAL_ACKED; + acked--; + send_una++; + fin_was_acked = true; + } + if ( send_una != old_send_una ) + { + // TODO: Possibly recalculate the average time to contact remote. + retransmissions = 0; + SetTimer(); + } + // STD 7, RFC 793, page 72. + if ( mod32_lt(send_wl1, hdr.th_seq) || + (send_wl1 == hdr.th_seq && mod32_le(send_wl2, hdr.th_ack)) ) + { + UpdateWindow(hdr.th_win); + send_wl1 = hdr.th_seq; + send_wl2 = hdr.th_ack; + } + // STD 7, RFC 793, page 73. + if ( state == TCP_STATE_FIN_WAIT_1 ) + { + if ( fin_was_acked ) + { + state = TCP_STATE_FIN_WAIT_2; + // Time out the connection if the socket is no longer referenced. + if ( !is_referenced ) + { + deadline = timespec_make(-1, 0); + SetDeadline(); + SetTimer(); + } + } + } + else if ( state == TCP_STATE_CLOSING ) + { + if ( fin_was_acked ) + { + state = TCP_STATE_TIME_WAIT; + deadline = timespec_make(-1, 0); + SetDeadline(); + SetTimer(); + } + return; + } + else if ( state == TCP_STATE_LAST_ACK ) + { + if ( fin_was_acked ) + Close(); + return; + } + // TODO: Urgent data per STD 7, RFC 793, page 73. + // STD 7, RFC 793, page 74. + if ( state == TCP_STATE_ESTAB || + state == TCP_STATE_FIN_WAIT_1 || + state == TCP_STATE_FIN_WAIT_2 ) + { + assert(incoming_offset < sizeof(incoming)); + assert(incoming_used <= sizeof(incoming)); + size_t available = sizeof(incoming) - incoming_used; + size_t amount = available < inlen ? 
available : inlen; + assert(amount <= sizeof(incoming)); + assert(amount <= available); + size_t newat = incoming_offset + incoming_used; + if ( sizeof(incoming) <= newat ) + newat -= sizeof(incoming); + assert(newat < sizeof(incoming)); + size_t until_end = sizeof(incoming) - newat; + assert(until_end <= sizeof(incoming)); + size_t first = until_end < amount ? until_end : amount; + assert(first <= amount); + assert(first <= sizeof(incoming)); + size_t second = amount - first; + assert(second <= amount); + assert(second <= sizeof(incoming)); + assert(first + second == amount); + assert(first + second <= sizeof(incoming)); + assert(first + second <= available); + if ( !shutdown_receive ) + { + memcpy(incoming + newat, in, first); + if ( second ) + memcpy(incoming, in + first, second); + incoming_used += amount; + } + available = sizeof(incoming) - incoming_used; + if ( available < recv_wnd ) + recv_wnd = available; + recv_nxt = hdr.th_seq + amount; + if ( amount == inlen && (hdr.th_flags & TH_FIN) ) + { + recv_nxt++; + has_fin = true; + } + if ( incoming_used || has_fin ) + { + kthread_cond_broadcast(&receive_cond); + poll_channel.Signal(PollEventStatus()); + } + } + // STD 7, RFC 793, page 75. + if ( hdr.th_flags & TH_FIN ) + { + if ( state == TCP_STATE_ESTAB ) + { + state = TCP_STATE_CLOSE_WAIT; + kthread_cond_broadcast(&receive_cond); + poll_channel.Signal(PollEventStatus()); + } + else if ( state == TCP_STATE_FIN_WAIT_1 ) + { + // Our sent FIN hasn't been ACK'd or we'd be in FIN_WAIT_2. + state = TCP_STATE_CLOSING; + } + else if ( state == TCP_STATE_FIN_WAIT_2 ) + { + state = TCP_STATE_TIME_WAIT; + deadline = timespec_make(-1, 0); + SetDeadline(); + SetTimer(); + } + else if ( state == TCP_STATE_TIME_WAIT ) + { + // The timer is not reset like as by the standard to avoid a hostile + // remote from staying forever in TIME-WAIT. 
+        }
+    }
+}
+
+// Handle an incoming packet and then drain whatever part of the out-of-order
+// receive queue has become processable.
+//
+// pktnew, if non-null, is handed to ProcessPacket first. Afterwards queued
+// packets are taken from the front of receive_queue for as long as their
+// header sequence number is not after recv_nxt; a dequeued packet is handed
+// to ProcessPacket only when its sequence number equals recv_nxt, otherwise
+// it is simply removed from the queue. The pkt_src and pkt_dst of this call
+// are passed along for the queued packets too. Finally a transmission is
+// scheduled rather than performed immediately.
+void TCPSocket::ReceivePacket(Ref pktnew,
+                              union tcp_sockaddr* pkt_src,
+                              union tcp_sockaddr* pkt_dst) // tcp_lock locked
+{
+    if ( pktnew )
+        ProcessPacket(pktnew, pkt_src, pkt_dst);
+    while ( receive_queue )
+    {
+        Ref pkt = receive_queue;
+        // The sequence number is read from the packet's own TCP header, which
+        // is in network byte order.
+        const unsigned char* in = pkt->from + pkt->offset;
+        const unsigned char* in_seq = in + offsetof(struct tcphdr, th_seq);
+        tcp_seq seq;
+        memcpy(&seq, in_seq, sizeof(seq));
+        seq = be32toh(seq);
+        // The front packet is still ahead of recv_nxt: stop draining.
+        if ( mod32_gt(seq, recv_nxt) )
+            break;
+        receive_queue = pkt->next;
+        pkt->next.Reset();
+        if ( seq == recv_nxt )
+            ProcessPacket(pkt, pkt_src, pkt_dst);
+    }
+    // Delay transmit to answer more efficiently based on upcoming packets.
+    ScheduleTransmit();
+}
+
+// Record new_window as the send window (send_wnd).
+void TCPSocket::UpdateWindow(uint16_t new_window)
+{
+    tcp_seq pending = (tcp_seq) (send_nxt - send_una);
+    // NOTE(review): pending was computed as send_nxt - send_una, so
+    // send_una + pending is the current send_nxt and this assignment appears
+    // to leave send_nxt unchanged. Confirm whether send_una + new_window was
+    // intended for a window that shrank below the data already in it.
+    if ( new_window < pending )
+        send_nxt = (tcp_seq) (send_una + pending);
+    send_wnd = new_window;
+}
+
+int TCPSocket::connect(ioctx_t* ctx, const uint8_t* addr, size_t addrsize)
+{
+    ScopedLock lock(&tcp_lock);
+    // TODO: os-test listen + connect, what errno?
+    if ( state == TCP_STATE_SYN_SENT || state == TCP_STATE_SYN_RECV )
+        return errno = EALREADY, -1;
+    if ( state != TCP_STATE_CLOSED )
+        return errno = EISCONN, -1; // TODO: Another errno if listening?
+    union tcp_sockaddr new_remote;
+    if ( !ImportAddress(ctx, &new_remote, addr, addrsize) )
+        return -1;
+    if ( af == AF_INET )
+    {
+        // Verify the port is non-zero.
+        if ( be16toh(new_remote.in.sin_port) == 0 )
+            return errno = EADDRNOTAVAIL, -1;
+    }
+    else
+        return errno = EAFNOSUPPORT, -1;
+    // TODO: os-test AF_UNSPEC
+    // If the socket is not bound, find a route to the remote address and bind
+    // to the appropriate source address.
+ if ( !bound ) + { + union tcp_sockaddr new_local; + memset(&new_local, 0, sizeof(new_local)); + if ( af == AF_INET ) + { + struct in_addr any; + any.s_addr = htobe32(INADDR_ANY); + new_local.in.sin_family = AF_INET; + if ( !IP::GetSourceIP(&any, &new_remote.in.sin_addr, + &new_local.in.sin_addr, ifindex, NULL) ) + return -1; + new_local.in.sin_port = htobe16(0); + } + else + return errno = EAFNOSUPPORT, -1; + if ( !BindDefault(&new_local) ) + return -1; + } + // Test if there is a route from the local address to the remote address. + // TODO: Does TCP also do this? Note that connecting to the any address + // should be forbidden, right? + if ( af == AF_INET ) + { + if ( !IP::GetSourceIP(&local.in.sin_addr, &new_remote.in.sin_addr, + NULL, ifindex, NULL) ) + return -1; + } + else + return errno = EAFNOSUPPORT, -1; + memcpy(&remote, &new_remote, sizeof(new_remote)); + remoted = true; + iss = arc4random(); + recv_wnd = TCP_MAXWIN; + send_una = iss; + send_nxt = iss; + send_wnd = 1; + send_pos = iss; + outgoing_syn = TCP_SPECIAL_PENDING; + state = TCP_STATE_SYN_SENT; + TransmitLoop(); + while ( !sockerr && + (state == TCP_STATE_SYN_SENT || state == TCP_STATE_SYN_RECV) ) + { + // TODO: os-test non-blocking connect. + if ( ctx->dflags & O_NONBLOCK ) + return errno = EINPROGRESS, -1; + if ( !kthread_cond_wait_signal(&receive_cond, &tcp_lock) ) + return errno = EINTR, -1; + } + if ( sockerr ) + { + // TODO: This is not recoverable. Is that correct? + // TODO: os-test whether reconnect is possible after failed connect? + return errno = sockerr, -1; + } + return 0; +} + +int TCPSocket::listen(ioctx_t* /*ctx*/, int backlog) +{ + if ( backlog < 0 ) + return errno = EINVAL, -1; + // TODO: os-test if zero backlog allows connections. 
+ if ( backlog == 0 ) + backlog = 1; + else if ( backlog < 0 || SOMAXCONN < backlog ) + backlog = SOMAXCONN; + ScopedLock lock(&tcp_lock); + if ( !bound ) + return errno = EDESTADDRREQ, -1; + // TODO: os-test a regular connection, close, and then try to listen. + if ( state != TCP_STATE_CLOSED ) + return errno = EINVAL, -1; + backlog_max = backlog; + memset(&remote, 0, sizeof(remote)); + if ( af == AF_INET ) + { + remote.in.sin_family = AF_INET; + remote.in.sin_addr.s_addr = htobe32(INADDR_ANY); + } + else if ( af == AF_INET6 ) + { + remote.in6.sin6_family = AF_INET6; + remote.in6.sin6_addr = in6addr_any; + } + else + return errno = EAFNOSUPPORT, -1; + remoted = true; + state = TCP_STATE_LISTEN; + return 0; +} + +ssize_t TCPSocket::recv(ioctx_t* ctx, uint8_t* buf, size_t count, int flags) +{ + ScopedLock lock(&tcp_lock); + ssize_t result = recv_unlocked(ctx, buf, count, flags); + // Respond immediately if receive window has become empty. + !incoming_used ? TransmitLoop() : ScheduleTransmit(); + return result; +} + +ssize_t TCPSocket::recvmsg(ioctx_t* ctx, struct msghdr* msg_ptr, int flags) +{ + struct msghdr msg; + if ( !ctx->copy_from_src(&msg, msg_ptr, sizeof(msg)) ) + return -1; + if ( msg.msg_iovlen < 0 || IOV_MAX < msg.msg_iovlen ) + return errno = EINVAL, -1; + size_t iov_size = msg.msg_iovlen * sizeof(struct iovec); + struct iovec* iov = new struct iovec[msg.msg_iovlen]; + if ( !iov ) + return -1; + struct iovec* user_iov = msg.msg_iov; + if ( !ctx->copy_from_src(iov, user_iov, iov_size) ) + return delete[] iov, -1; + msg.msg_iov = iov; + kthread_mutex_lock(&tcp_lock); + ssize_t result = 0; + for ( int i = 0; i < msg.msg_iovlen && result < SSIZE_MAX; i++ ) + { + size_t maximum = SSIZE_MAX - (size_t) result; + uint8_t* buf = (uint8_t*) iov[i].iov_base; + size_t count = iov[i].iov_len < maximum ? 
iov[i].iov_len : maximum; + if ( !count ) + continue; + ssize_t amount = recv_unlocked(ctx, buf, count, flags); + if ( amount < 0 ) + { + if ( result == 0 ) + result = -1; + break; + } + result += amount; + if ( (size_t) amount != count ) + break; + } + // Respond immediately if receive window has become empty. + !incoming_used ? TransmitLoop() : ScheduleTransmit(); + kthread_mutex_unlock(&tcp_lock); + msg.msg_iov = user_iov; + // TODO: os-test POSIX's requirement to ignore the namemsg_name + // msg_namelen, plus msg_controllen's behavior is unspecified. + msg.msg_namelen = 0; + msg.msg_controllen = 0; + delete[] iov; + if ( !ctx->copy_to_dest(msg_ptr, &msg, sizeof(msg)) ) + return -1; + return result; +} + +ssize_t TCPSocket::recv_unlocked(ioctx_t* ctx, + uint8_t* buf, + size_t count, + int flags) // tcp_lock taken +{ + if ( flags & ~(MSG_PEEK | MSG_WAITALL) ) // TODO: MSG_OOB. + return errno = EINVAL, -1; + if ( sockerr ) + return errno = sockerr, -1; + // TODO: os-test non-blocking connect + immediate recv. + // TODO: CLOSED after it has been closed? + if ( state == TCP_STATE_CLOSED || + state == TCP_STATE_LISTEN || + state == TCP_STATE_SYN_SENT || + state == TCP_STATE_SYN_RECV ) + return errno = ENOTCONN, -1; + size_t sofar = 0; + while ( sofar < count ) + { + while ( !(incoming_used || has_fin || shutdown_receive) ) + { + if ( sockerr ) + return sofar ? sofar : (errno = sockerr, -1); + if ( state == TCP_STATE_CLOSED ) + return sofar; + if ( sofar && !(flags & MSG_WAITALL) ) + return sofar; + if ( ctx->dflags & O_NONBLOCK ) + return sofar ? sofar : (errno = EWOULDBLOCK, -1); + if ( !kthread_cond_wait_signal(&receive_cond, &tcp_lock) ) + return sofar ? sofar : (errno = EINTR, -1); + if ( sockerr ) + return sofar ? 
sofar : (errno = sockerr, -1); + } + if ( incoming_used == 0 && (has_fin || shutdown_receive) ) + return sofar; + uint8_t* data = buf + sofar; + size_t left = count - sofar; + assert(incoming_used <= sizeof(incoming)); + size_t amount = incoming_used < left ? incoming_used : left; + assert(incoming_offset < sizeof(incoming)); + size_t until_end = sizeof(incoming) - incoming_offset; + size_t first = until_end < amount ? until_end : amount; + size_t second = amount - first; + if ( !ctx->copy_to_dest(data, incoming + incoming_offset, first) ) + return sofar ? sofar : -1; + if ( second && !ctx->copy_to_dest(data + first, incoming, second) ) + return sofar ? sofar : -1; + sofar += amount; + if ( flags & MSG_PEEK ) + return sofar; + incoming_offset += amount; + if ( sizeof(incoming) <= incoming_offset ) + incoming_offset -= sizeof(incoming); + assert(incoming_offset < sizeof(incoming)); + incoming_used -= amount; + recv_wnd = sizeof(incoming) - incoming_used; + if ( UINT16_MAX < recv_wnd ) + recv_wnd = UINT16_MAX; + if ( TCP_MAXWIN < recv_wnd ) + recv_wnd = TCP_MAXWIN; + } + return sofar; +} + +ssize_t TCPSocket::send(ioctx_t* ctx, + const uint8_t* buf, + size_t count, + int flags) +{ + ScopedLock lock(&tcp_lock); + ssize_t result = send_unlocked(ctx, buf, count, flags); + TransmitLoop(); + return result; +} + +ssize_t TCPSocket::sendmsg(ioctx_t* ctx, + const struct msghdr* msg_ptr, + int flags) +{ + struct msghdr msg; + if ( !ctx->copy_from_src(&msg, msg_ptr, sizeof(msg)) ) + return -1; + if ( msg.msg_iovlen < 0 || IOV_MAX < msg.msg_iovlen ) + return errno = EINVAL, -1; + // TODO: os-test if msg_name/msg_namelen/msg_control/msg_controllen are set. 
+ size_t iov_size = msg.msg_iovlen * sizeof(struct iovec); + struct iovec* iov = new struct iovec[msg.msg_iovlen]; + if ( !iov ) + return -1; + if ( !ctx->copy_from_src(iov, msg.msg_iov, iov_size) ) + return delete[] iov, -1; + msg.msg_iov = iov; + kthread_mutex_lock(&tcp_lock); + ssize_t result = 0; + for ( int i = 0; i < msg.msg_iovlen && result < SSIZE_MAX; i++ ) + { + size_t maximum = SSIZE_MAX - (size_t) result; + const uint8_t* buf = (const uint8_t*) iov[i].iov_base; + size_t count = iov[i].iov_len < maximum ? iov[i].iov_len : maximum; + ssize_t amount = send_unlocked(ctx, buf, count, flags); + if ( amount < 0 ) + { + if ( result == 0 ) + result = -1; + break; + } + result += amount; + if ( (size_t) amount != count ) + break; + } + TransmitLoop(); + kthread_mutex_unlock(&tcp_lock); + delete[] iov; + return result; +} + +ssize_t TCPSocket::send_unlocked(ioctx_t* ctx, + const uint8_t* buf, + size_t count, + int flags) // tcp_lock taken +{ + // TODO: MSG_MORE (and implement TCP_CORK), MSG_OOB, MSG_DONTROUTE. 
+ if ( flags & ~(MSG_NOSIGNAL) ) + return errno = EINVAL, -1; + if ( sockerr ) + return errno = sockerr, -1; + if ( state == TCP_STATE_CLOSED || + state == TCP_STATE_LISTEN || + state == TCP_STATE_SYN_SENT || + state == TCP_STATE_SYN_RECV ) + return errno = ENOTCONN, -1; + size_t sofar = 0; + while ( sofar < count ) + { + while ( outgoing_used == sizeof(outgoing) || + (state != TCP_STATE_ESTAB && state != TCP_STATE_CLOSE_WAIT) ) + { + if ( sofar ) + return sofar; + if ( sockerr ) + return errno = sockerr, -1; + if ( ctx->dflags & O_NONBLOCK ) + return errno = EWOULDBLOCK; + if ( !kthread_cond_wait_signal(&transmit_cond, &tcp_lock) ) + return errno = EINTR, -1; + } + if ( state != TCP_STATE_ESTAB && state != TCP_STATE_CLOSE_WAIT ) + { + if ( !(flags & MSG_NOSIGNAL) ) + CurrentThread()->DeliverSignal(SIGPIPE); + return errno = EPIPE, -1; + } + const uint8_t* data = buf + sofar; + size_t left = count - sofar; + assert(outgoing_offset < sizeof(outgoing)); + assert(outgoing_used <= sizeof(outgoing)); + size_t available = sizeof(outgoing) - outgoing_used; + size_t amount = available < left ? available : left; + size_t newat = outgoing_offset + outgoing_used; + if ( sizeof(outgoing) <= newat ) + newat -= sizeof(outgoing); + assert(newat < sizeof(outgoing)); + size_t until_end = sizeof(outgoing) - newat; + size_t first = until_end < amount ? until_end : amount; + size_t second = amount - first; + if ( !ctx->copy_from_src(outgoing + newat, data, first) ) + return sofar ? sofar : -1; + if ( second && !ctx->copy_from_src(outgoing, data + first, second) ) + return sofar ? sofar : -1; + outgoing_used += amount; + assert(outgoing_used <= sizeof(outgoing)); + sofar += amount; + // TODO: If there's a sent packet that hasn't been acknowledged, and + // there isn't a full packet yet, then just buffer and don't + // transmit yet. + // TODO: TCP_NODELAY, TCP_NOPUSH, MSG_MORE. + // TODO: Set PUSH appropriately. 
+ } + return sofar; +} + +ssize_t TCPSocket::read(ioctx_t* ctx, uint8_t* buf, size_t count) +{ + return recv(ctx, buf, count, 0); +} + +ssize_t TCPSocket::write(ioctx_t* ctx, const uint8_t* buf, size_t count) +{ + return send(ctx, buf, count, 0); +} + +short TCPSocket::PollEventStatus() +{ + // TODO: os-test the poll bits. + // TODO: OOB poll bits. + short status = 0; + if ( connecting_ready ) + status |= POLLIN | POLLRDNORM; + if ( incoming_used || has_fin || shutdown_receive ) + status |= POLLIN | POLLRDNORM; + if ( (state == TCP_STATE_ESTAB || state == TCP_STATE_CLOSE_WAIT) && + outgoing_used < sizeof(outgoing) ) + status |= POLLOUT | POLLWRNORM; + if ( state == TCP_STATE_CLOSE_WAIT || + state == TCP_STATE_LAST_ACK || + state == TCP_STATE_TIME_WAIT || + state == TCP_STATE_CLOSED ) + status |= POLLHUP; + if ( sockerr ) + status |= POLLERR; + return status; +} + +int TCPSocket::poll(ioctx_t* /*ctx*/, PollNode* node) +{ + ScopedLock lock(&tcp_lock); + short ret_status = PollEventStatus() & node->events; + if ( ret_status ) + { + node->master->revents |= ret_status; + return 0; + } + poll_channel.Register(node); + return errno = EAGAIN, -1; +} + +int TCPSocket::getsockopt(ioctx_t* ctx, int level, int option_name, + void* option_value, size_t* option_size_ptr) +{ + ScopedLock lock(&tcp_lock); + + if ( level == SOL_SOCKET && option_name == SO_BINDTODEVICE ) + { + ScopedLock lock(&netifs_lock); + const char* ifname = ""; + if ( ifindex < netifs_count && netifs[ifindex] ) + ifname = netifs[ifindex]->ifinfo.name; + size_t option_size; + if ( !CopyFromUser(&option_size, option_size_ptr, sizeof(option_size)) ) + return -1; + size_t len = strlen(ifname); + size_t size = len + 1; + if ( option_size < size ) + return errno = ERANGE, -1; + if ( !CopyToUser(option_value, ifname, size) || + !CopyToUser(option_size_ptr, &size, sizeof(size)) ) + return -1; + return 0; + } + + uintmax_t result = 0; + + if ( level == IPPROTO_TCP ) + { + switch ( option_name ) + { + // TODO: 
TCP_NODELAY + // TODO: TCP_MAXSEG + // TODO: TCP_NOPUSH + // TODO: TCP_CORK + default: return errno = ENOPROTOOPT, -1; + } + } + else if ( level == SOL_SOCKET ) + { + switch ( option_name ) + { + case SO_BINDTOINDEX: result = ifindex; break; + case SO_DEBUG: result = 0; break; + case SO_DOMAIN: result = af; break; + case SO_ERROR: result = sockerr; break; + case SO_PROTOCOL: result = IPPROTO_TCP; break; + case SO_RCVBUF: result = sizeof(incoming); break; + case SO_REUSEADDR: result = reuseaddr; break; + case SO_SNDBUF: result = sizeof(outgoing); break; + case SO_TYPE: result = SOCK_STREAM; break; + // TODO: SO_ACCEPTCONN + // TODO: SO_LINGER + // TODO: SO_OOBINLINE + // TODO: SO_RCVLOWAT + // TODO: SO_RCVTIMEO + // TODO: SO_SNDLOWAT + // TODO: SO_SNDTIMEO + // TODO: SO_DONTROUTE + // TODO: SO_BROADCAST + default: return errno = ENOPROTOOPT, -1; + } + } + else + return errno = EINVAL, -1; + + if ( !sockopt_return_uintmax(result, ctx, option_value, option_size_ptr) ) + return -1; + + return 0; +} + +// TODO: os-test socket options on shut down sockets. POSIX says EINVAL. +// TODO: os-test the errno for an invalid protocol. +// TODO: os-test the errno for an invalid option at a protocol level. 
+
+// Sets a socket option while holding tcp_lock. SO_BINDTODEVICE takes an
+// interface name; every other option value is fetched as an unsigned integer
+// with sockopt_fetch_uintmax. Returns 0 on success or -1 on failure.
+int TCPSocket::setsockopt(ioctx_t* ctx, int level, int option_name,
+                          const void* option_value, size_t option_size)
+{
+	ScopedLock lock(&tcp_lock);
+
+	if ( level == SOL_SOCKET && option_name == SO_BINDTODEVICE )
+	{
+		char ifname[IF_NAMESIZE];
+		// Never copy more than the buffer holds.
+		if ( sizeof(ifname) < option_size )
+			option_size = sizeof(ifname);
+		if ( !CopyFromUser(ifname, option_value, option_size) )
+			return -1;
+		// A name filling the whole buffer without a terminator is too long.
+		if ( strnlen(ifname, option_size) == sizeof(ifname) )
+			return errno = ENODEV, -1;
+		// Terminate a short name. When the whole buffer was copied, the check
+		// above guarantees it already contains a terminator, and writing at
+		// ifname[sizeof(ifname)] would be out of bounds.
+		if ( option_size < sizeof(ifname) )
+			ifname[option_size] = '\0';
+		ScopedLock lock(&netifs_lock);
+		// The search starts at index 1; index 0 is never matched.
+		for ( size_t i = 1; i < netifs_count; i++ )
+		{
+			if ( netifs[i] && !strcmp(ifname, netifs[i]->ifinfo.name) )
+			{
+				ifindex = i;
+				return 0;
+			}
+		}
+		return errno = ENODEV, -1;
+	}
+
+	uintmax_t value;
+	if ( !sockopt_fetch_uintmax(&value, ctx, option_value, option_size) )
+		return -1;
+
+	if ( level == IPPROTO_TCP )
+	{
+		switch ( option_name )
+		{
+		case TCP_NODELAY: break; // TODO: Transmit if turned on?
+		case TCP_MAXSEG: break; // TODO: Implement this.
+		case TCP_NOPUSH: break; // TODO: Implement this.
+		// TODO: TCP_CORK
+		default:
+			return errno = ENOPROTOOPT, -1;
+		}
+	}
+	else if ( level == SOL_SOCKET )
+	{
+		switch ( option_name )
+		{
+		case SO_BINDTOINDEX:
+			if ( UINT_MAX < value )
+				return errno = EINVAL, -1;
+			ifindex = value;
+			break;
+		case SO_DEBUG:
+			// Only turning debugging off is permitted.
+			if ( value != 0 )
+				return errno = EPERM, -1;
+			break;
+		case SO_KEEPALIVE: break; // TODO: Implement this.
+		case SO_REUSEADDR: reuseaddr = value; break;
+		case SO_LINGER: break; // TODO: Implement this.
+		case SO_RCVBUF: break; // TODO: Implement this.
+		case SO_SNDBUF: break; // TODO: Implement this.
+		// TODO: SO_BROADCAST
+		// TODO: SO_DONTROUTE
+		// TODO: SO_LINGER
+		// TODO: SO_RCVLOWAT
+		// TODO: SO_RCVTIMEO
+		// TODO: SO_SNDLOWAT
+		// TODO: SO_SNDTIMEO
+		default: return errno = ENOPROTOOPT, -1;
+		}
+	}
+	else
+		return errno = EINVAL, -1;
+
+	return 0;
+}
+
+// Locks tcp_lock and shuts the socket down as requested by how.
+int TCPSocket::shutdown(ioctx_t* /*ctx*/, int how)
+{
+	ScopedLock lock(&tcp_lock);
+	return shutdown_unlocked(how);
+}
+
+// Shuts down the sending direction (how & SHUT_WR) and/or the receiving
+// direction (how & SHUT_RD). Fails with ENOTCONN unless the socket is in the
+// SYN_SENT, SYN_RECV, ESTAB, or CLOSE_WAIT state. Returns 0 on success.
+int TCPSocket::shutdown_unlocked(int how) // tcp_lock taken
+{
+	// STD 7, RFC 793, page 60.
+	if ( state != TCP_STATE_SYN_SENT &&
+	     state != TCP_STATE_SYN_RECV &&
+	     state != TCP_STATE_ESTAB &&
+	     state != TCP_STATE_CLOSE_WAIT )
+		return errno = ENOTCONN, -1;
+	if ( how & SHUT_WR )
+	{
+		// STD 7, RFC 793, page 60.
+		if ( state == TCP_STATE_SYN_SENT )
+			Close();
+		else // TCP_STATE_SYN_RECV || TCP_STATE_ESTAB || TCP_STATE_CLOSE_WAIT
+		{
+			// Mark a FIN as pending so it is sent after the buffered data.
+			outgoing_fin = TCP_SPECIAL_PENDING;
+			// TODO: Should this state transition be delayed until the FIN
+			//       enters the window or is sent?
+			if ( state == TCP_STATE_CLOSE_WAIT )
+				state = TCP_STATE_LAST_ACK /* RFC 1122, 4.2.2.20 (a), page 93 */;
+			else
+				state = TCP_STATE_FIN_WAIT_1;
+			// Wake up senders waiting on transmit_cond so they notice the
+			// state change.
+			kthread_cond_broadcast(&transmit_cond);
+			TransmitLoop();
+		}
+	}
+	if ( how & SHUT_RD )
+	{
+		shutdown_receive = true;
+		// Wake up receivers waiting on receive_cond.
+		kthread_cond_broadcast(&receive_cond);
+	}
+	return 0;
+}
+
+// Copies the remote address to addr, truncated to the size the caller passed
+// in *addrsize_ptr, and stores the number of bytes copied in *addrsize_ptr.
+int TCPSocket::getpeername(ioctx_t* ctx, uint8_t* addr, size_t* addrsize_ptr)
+{
+	ScopedLock lock(&tcp_lock);
+	if ( !remoted || state == TCP_STATE_LISTEN )
+		return errno = ENOTCONN, -1;
+	size_t addrsize;
+	if ( !ctx->copy_from_src(&addrsize, addrsize_ptr, sizeof(addrsize)) )
+		return -1;
+	if ( af == AF_INET )
+	{
+		if ( sizeof(remote.in) < addrsize )
+			addrsize = sizeof(remote.in);
+	}
+	else if ( af == AF_INET6 )
+	{
+		if ( sizeof(remote.in6) < addrsize )
+			addrsize = sizeof(remote.in6);
+	}
+	else
+		return errno = EAFNOSUPPORT, -1;
+	if ( !ctx->copy_to_dest(addr, &remote, addrsize) )
+		return -1;
+	if ( !ctx->copy_to_dest(addrsize_ptr, &addrsize, sizeof(addrsize)) )
+		return -1;
+
return 0; +} + +int TCPSocket::getsockname(ioctx_t* ctx, uint8_t* addr, size_t* addrsize_ptr) +{ + ScopedLock lock(&tcp_lock); + size_t addrsize; + if ( !ctx->copy_from_src(&addrsize, addrsize_ptr, sizeof(addrsize)) ) + return -1; + if ( af == AF_INET ) + { + if ( sizeof(local.in) < addrsize ) + addrsize = sizeof(local.in); + } + else if ( af == AF_INET6 ) + { + if ( sizeof(local.in6) < addrsize ) + addrsize = sizeof(local.in6); + } + else + return errno = EAFNOSUPPORT, -1; + if ( !ctx->copy_to_dest(addr, &local, addrsize) ) + return -1; + if ( !ctx->copy_to_dest(addrsize_ptr, &addrsize, sizeof(addrsize)) ) + return -1; + return 0; +} + +// TODO: os-test fstat on a socket. +TCPSocketNode::TCPSocketNode(TCPSocket* socket) +{ + this->socket = socket; + socket->is_referenced = true; + Process* process = CurrentProcess(); + inode_type = INODE_TYPE_STREAM; + dev = (dev_t) this; + ino = (ino_t) this; + type = S_IFSOCK; + kthread_mutex_lock(&process->idlock); + stat_uid = process->uid; + stat_gid = process->gid; + kthread_mutex_unlock(&process->idlock); + stat_mode = 0600 | this->type; +} + +TCPSocketNode::~TCPSocketNode() +{ + socket->Unreference(); +} + +Ref TCPSocketNode::accept4(ioctx_t* ctx, uint8_t* addr, size_t* addrsize, + int flags) +{ + return socket->accept4(ctx, addr, addrsize, flags); +} + +int TCPSocketNode::bind(ioctx_t* ctx, const uint8_t* addr, size_t addrsize) +{ + return socket->bind(ctx, addr, addrsize); +} + +int TCPSocketNode::connect(ioctx_t* ctx, const uint8_t* addr, size_t addrsize) +{ + return socket->connect(ctx, addr, addrsize); +} + +int TCPSocketNode::listen(ioctx_t* ctx, int backlog) +{ + return socket->listen(ctx, backlog); +} + +ssize_t TCPSocketNode::recv(ioctx_t* ctx, uint8_t* buf, size_t count, int flags) +{ + return socket->recv(ctx, buf, count, flags); +} + +ssize_t TCPSocketNode::recvmsg(ioctx_t* ctx, struct msghdr* msg, int flags) +{ + return socket->recvmsg(ctx, msg, flags); +} + +ssize_t TCPSocketNode::send(ioctx_t* ctx, const 
uint8_t* buf, size_t count, + int flags) +{ + return socket->send(ctx, buf, count, flags); +} + +ssize_t TCPSocketNode::sendmsg(ioctx_t* ctx, const struct msghdr* msg, + int flags) +{ + return socket->sendmsg(ctx, msg, flags); +} + +ssize_t TCPSocketNode::read(ioctx_t* ctx, uint8_t* buf, size_t count) +{ + return socket->read(ctx, buf, count); +} + +ssize_t TCPSocketNode::write(ioctx_t* ctx, const uint8_t* buf, size_t count) +{ + return socket->write(ctx, buf, count); +} + +int TCPSocketNode::poll(ioctx_t* ctx, PollNode* node) +{ + return socket->poll(ctx, node); +} + +int TCPSocketNode::getsockopt(ioctx_t* ctx, int level, int option_name, + void* option_value, size_t* option_size_ptr) +{ + return socket->getsockopt(ctx, level, option_name, option_value, + option_size_ptr); +} + +int TCPSocketNode::setsockopt(ioctx_t* ctx, int level, int option_name, + const void* option_value, size_t option_size) +{ + return socket->setsockopt(ctx, level, option_name, option_value, + option_size); +} + +int TCPSocketNode::shutdown(ioctx_t* ctx, int how) +{ + return socket->shutdown(ctx, how); +} + +int TCPSocketNode::getpeername(ioctx_t* ctx, uint8_t* addr, size_t* addrsize) +{ + return socket->getpeername(ctx, addr, addrsize); +} + +int TCPSocketNode::getsockname(ioctx_t* ctx, uint8_t* addr, size_t* addrsize) +{ + return socket->getsockname(ctx, addr, addrsize); +} + +void HandleIP(Ref pkt, + const struct in_addr* src, + const struct in_addr* dst, + bool dst_broadcast) +{ + if ( src->s_addr == htobe32(INADDR_ANY) ) + return; + if ( dst_broadcast ) + return; + const unsigned char* in = pkt->from + pkt->offset; + size_t inlen = pkt->length - pkt->offset; + struct tcphdr hdr; + if ( inlen < sizeof(hdr) ) + return; + if ( UINT16_MAX < inlen ) + return; + memcpy(&hdr, in, sizeof(hdr)); + hdr.th_sport = be16toh(hdr.th_sport); + hdr.th_dport = be16toh(hdr.th_dport); + hdr.th_sum = be16toh(hdr.th_sum); + uint16_t sum = 0; + sum = IP::ipsum_buf(sum, src, sizeof(struct in_addr)); + sum = 
IP::ipsum_buf(sum, dst, sizeof(struct in_addr)); + sum = IP::ipsum_word(sum, IPPROTO_TCP); + sum = IP::ipsum_word(sum, inlen); + sum = IP::ipsum_buf(sum, in, inlen); + if ( sum != 0 && sum != 0xFFFF ) + return; + if ( TCP_OFFSET_DECODE(hdr.th_offset) < sizeof(hdr) / 4 || + inlen < (size_t) TCP_OFFSET_DECODE(hdr.th_offset) * 4 ) + return; + // Port 0 is not valid. + if ( hdr.th_sport == 0 || hdr.th_dport == 0 ) + return; + // TODO: TCP options. Respect TCPOPT_MAXSEG. + TCPSocket* socket = NULL; + TCPSocket* socket_listener = NULL; + TCPSocket* any_socket_listener = NULL; + ScopedLock lock(&tcp_lock); + for ( TCPSocket* iter = bindings_v4[hdr.th_dport]; + !socket && iter; + iter = iter->next_socket ) + { + // TODO: If a TCP socket is bound, and then connected to, what happens? + // What if the TCP socket then connects to the other side? + if ( !iter->remoted ) + continue; + // The datagram was sent to the socket's local address. + if ( !memcmp(&iter->local.in.sin_addr, dst, sizeof(*dst)) ) + { + // The first priority is to receive on a socket with the correct + // local address and the correct remote address. + if ( !memcmp(&iter->remote.in.sin_addr, src, sizeof(*src)) && + be16toh(iter->remote.in.sin_port) == hdr.th_sport ) + socket = iter; + // The second priority is to receive on a socket with the correct + // local address and listening for connections from any address. + else if ( iter->remote.in.sin_addr.s_addr == htobe32(INADDR_ANY) ) + socket_listener = iter; + } + // The socket is bound to the any address. + if ( iter->local.in.sin_addr.s_addr == htobe32(INADDR_ANY) ) + { + // The third priority is to receive on a socket bound to the any + // address and listening for connections from any address. + if ( iter->remote.in.sin_addr.s_addr == htobe32(INADDR_ANY) ) + any_socket_listener = iter; + } + } + if ( !socket ) + socket = socket_listener; + if ( !socket ) + socket = any_socket_listener; + // No socket wanted to receive the packet. 
+ if ( !socket ) + { + // TODO: Send RST. + return; + } + // If the socket is bound to a network interface, require the packet to + // have been received on that network interface. + if ( socket->ifindex && socket->ifindex != pkt->netif->ifinfo.linkid ) + { + // TODO: Send RST. + return; + } + union tcp_sockaddr pkt_src; + pkt_src.in.sin_family = AF_INET; + pkt_src.in.sin_addr = *src; + pkt_src.in.sin_port = htobe16(hdr.th_sport); + union tcp_sockaddr pkt_dst; + pkt_dst.in.sin_family = AF_INET; + pkt_dst.in.sin_addr = *dst; + pkt_dst.in.sin_port = htobe16(hdr.th_dport); + // Receive the packet on the socket. + socket->ReceivePacket(pkt, &pkt_src, &pkt_dst); + // Delete the socket if needed or schedule a transmit if needed. + if ( socket->can_destroy() ) + delete socket; +} + +Ref Socket(int af) +{ + if ( !IsSupportedAddressFamily(af) ) + return errno = EAFNOSUPPORT, Ref(NULL); + TCPSocket* socket = new TCPSocket(af); + if ( !socket ) + return Ref(); + Ref result(new TCPSocketNode(socket)); + if ( !result ) + return delete socket, Ref(); + return result; +} + +} // namespace TCP +} // namespace Sortix diff --git a/kernel/net/tcp.h b/kernel/net/tcp.h new file mode 100644 index 00000000..2c7d8e81 --- /dev/null +++ b/kernel/net/tcp.h @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2016, 2017 Jonas 'Sortie' Termansen. + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + * + * net/tcp.h + * Transmission Control Protocol. + */ + +#ifndef SORTIX_NET_TCP_H +#define SORTIX_NET_TCP_H + +#include + +#include +#include + +namespace Sortix { +namespace TCP { + +void Init(); +void HandleIP(Ref pkt, + const struct in_addr* src, + const struct in_addr* dst, + bool dst_broadcast); +Ref Socket(int af); + +} // namespace TCP +} // namespace Sortix + +#endif diff --git a/kernel/net/udp.cpp b/kernel/net/udp.cpp new file mode 100644 index 00000000..02a8d68d --- /dev/null +++ b/kernel/net/udp.cpp @@ -0,0 +1,1263 @@ +/* + * Copyright (c) 2016, 2017, 2018, 2022 Jonas 'Sortie' Termansen. + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + * + * net/udp.cpp + * User Datagram Protocol. 
+ */ + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifndef IOV_MAX +#include +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ip.h" +#include "udp.h" + +namespace Sortix { +namespace UDP { + +class UDPSocket; + +union udp_sockaddr +{ + sa_family_t family; + struct sockaddr_in in; + struct sockaddr_in6 in6; +}; + +// These values are documented in udp(4). +static const size_t DEFAULT_PACKET_LIMIT = 64; +static const size_t MAXIMAL_PACKET_LIMIT = 4096; + +static kthread_mutex_t bind_lock = KTHREAD_MUTEX_INITIALIZER; +static UDPSocket** bindings_v4; +static UDPSocket** bindings_v6; + +void Init() +{ + if ( !(bindings_v4 = new UDPSocket*[65536]) || + !(bindings_v6 = new UDPSocket*[65536]) ) + Panic("Failed to allocate UDP Socket bindings"); + for ( size_t i = 0; i < 65536; i++ ) + { + bindings_v4[i] = NULL; + bindings_v6[i] = NULL; + } +} + +static bool IsSupportedAddressFamily(int af) +{ + return af == AF_INET /* TODO: || af == AF_INET6 */; +} + +static size_t AddressFamilySize(int af) +{ + switch ( af ) + { + case AF_INET: return sizeof(struct sockaddr_in); + case AF_INET6: return sizeof(struct sockaddr_in6); + } + return 0; +} + +class UDPSocket : public AbstractInode +{ + friend void HandleIP(Ref pkt, + const struct in_addr* src, + const struct in_addr* dst, + bool dst_broadcast); + +public: + UDPSocket(int af); + virtual ~UDPSocket(); + virtual Ref accept4(ioctx_t* ctx, uint8_t* addr, size_t* addrsize, + int flags); + virtual int bind(ioctx_t* ctx, const uint8_t* addr, size_t addrsize); + virtual int connect(ioctx_t* ctx, const uint8_t* addr, size_t addrsize); + virtual int listen(ioctx_t* ctx, int backlog); + virtual ssize_t readv(ioctx_t* ctx, const struct iovec* iov, int iovcnt); + virtual ssize_t recv(ioctx_t* ctx, uint8_t* buf, size_t count, int flags); + virtual ssize_t recvmsg(ioctx_t* 
ctx, struct msghdr* msg, int flags); + virtual ssize_t recvmsg_internal(ioctx_t* ctx, struct msghdr* msg, + int flags); + virtual ssize_t send(ioctx_t* ctx, const uint8_t* buf, size_t count, + int flags); + virtual ssize_t sendmsg(ioctx_t* ctx, const struct msghdr* msg, int flags); + virtual ssize_t sendmsg_internal(ioctx_t* ctx, const struct msghdr* msg, + int flags); + virtual ssize_t writev(ioctx_t* ctx, const struct iovec* iov, int iovcnt); + virtual int poll(ioctx_t* ctx, PollNode* node); + virtual int getsockopt(ioctx_t* ctx, int level, int option_name, + void* option_value, size_t* option_size_ptr); + virtual int setsockopt(ioctx_t* ctx, int level, int option_name, + const void* option_value, size_t option_size); + virtual int shutdown(ioctx_t* ctx, int how); + virtual int getpeername(ioctx_t* ctx, uint8_t* addr, size_t* addrsize); + virtual int getsockname(ioctx_t* ctx, uint8_t* addr, size_t* addrsize); + +public: + void ReceivePacket(Ref pkt); + +private: + short PollEventStatus(); + bool ImportAddress(ioctx_t* ctx, union udp_sockaddr* dest, + const void* addr, size_t addrsize); + bool CanBind(union udp_sockaddr new_local); + bool BindDefault(const union udp_sockaddr* new_local); + +private: + kthread_mutex_t socket_lock; + kthread_cond_t receive_cond; + PollChannel poll_channel; + union udp_sockaddr local; + union udp_sockaddr remote; + Ref first_packet; + Ref last_packet; + UDPSocket* prev_socket; + UDPSocket* next_socket; + size_t receive_current; + size_t receive_limit; + size_t send_limit; + unsigned int ifindex; + int af; + int sockerr; + int how_shutdown; + bool bound; + bool broadcast; + bool connected; + bool reuseaddr; + +}; + +// TODO: os-test fstat on a socket. 
+UDPSocket::UDPSocket(int af) +{ + Process* process = CurrentProcess(); + inode_type = INODE_TYPE_STREAM; + dev = (dev_t) this; + ino = (ino_t) this; + type = S_IFSOCK; + kthread_mutex_lock(&process->idlock); + stat_uid = process->uid; + stat_gid = process->gid; + kthread_mutex_unlock(&process->idlock); + stat_mode = 0600 | this->type; + supports_iovec = true; + socket_lock = KTHREAD_MUTEX_INITIALIZER; + receive_cond = KTHREAD_COND_INITIALIZER; + // poll_channel initialized by constructor + memset(&local, 0, sizeof(local)); + memset(&remote, 0, sizeof(remote)); + if ( af == AF_INET ) + { + local.in.sin_family = AF_INET; + local.in.sin_addr.s_addr = htobe32(INADDR_ANY); + local.in.sin_port = htobe16(0); + remote.in.sin_family = AF_INET; + remote.in.sin_addr.s_addr = htobe32(INADDR_ANY); + remote.in.sin_port = htobe16(0); + } + else if ( af == AF_INET6 ) + { + local.in6.sin6_family = AF_INET6; + local.in6.sin6_addr = in6addr_any; + local.in6.sin6_port = htobe16(0); + remote.in6.sin6_family = AF_INET6; + remote.in6.sin6_addr = in6addr_any; + remote.in6.sin6_port = htobe16(0); + } + // first_packet initialized by constructor + // last_packet initialized by constructor + prev_socket = NULL; + next_socket = NULL; + receive_current = 0; + receive_limit = DEFAULT_PACKET_LIMIT * Page::Size(); + send_limit = DEFAULT_PACKET_LIMIT * Page::Size(); + ifindex = 0; + this->af = af; + sockerr = 0; + how_shutdown = 0; + bound = false; + broadcast = false; + connected = false; + reuseaddr = false; +} + +UDPSocket::~UDPSocket() +{ + if ( bound ) + { + ScopedLock lock(&bind_lock); + if ( af == AF_INET ) + { + uint16_t port = be16toh(local.in.sin_port); + if ( prev_socket ) + prev_socket->next_socket = next_socket; + else + bindings_v4[port] = next_socket; + if ( next_socket ) + next_socket->prev_socket = prev_socket; + } + else if ( af == AF_INET6 ) + { + uint16_t port = be16toh(local.in6.sin6_port); + if ( prev_socket ) + prev_socket->next_socket = next_socket; + else + 
bindings_v6[port] = next_socket;
+			if ( next_socket )
+				next_socket->prev_socket = prev_socket;
+		}
+		bound = false;
+	}
+	// Avoid stack overflow in first_packet recursive destructor.
+	while ( first_packet )
+	{
+		Ref next = first_packet->next;
+		first_packet->next.Reset();
+		first_packet = next;
+	}
+	last_packet.Reset();
+}
+
+// Accepting connections is not supported on this socket type; always fails
+// with EOPNOTSUPP.
+Ref UDPSocket::accept4(ioctx_t* /*ctx*/, uint8_t* /*addr*/,
+                       size_t* /*addrsize*/, int /*flags*/)
+{
+	return errno = EOPNOTSUPP, Ref(NULL);
+}
+
+// Copies a socket address from the caller (through ctx) into dest.
+//
+// An address whose size differs from the size for this socket's address
+// family is accepted only if it is large enough to hold a family field and
+// that field is AF_UNSPEC; dest then becomes a zeroed AF_UNSPEC address.
+// Otherwise the call fails with EINVAL. A correctly sized address must have
+// either this socket's family or AF_UNSPEC, else the call fails with
+// EAFNOSUPPORT. Returns true on success and false on failure.
+bool UDPSocket::ImportAddress(ioctx_t* ctx,
+                              union udp_sockaddr* dest,
+                              const void* addr,
+                              size_t addrsize)
+{
+	if ( addrsize != AddressFamilySize(af) )
+	{
+		sa_family_t family;
+		if ( sizeof(family) <= addrsize &&
+		     ctx->copy_from_src(&family, addr, sizeof(family)) &&
+		     family == AF_UNSPEC )
+		{
+			union udp_sockaddr unspec;
+			memset(&unspec, 0, sizeof(unspec));
+			unspec.family = AF_UNSPEC;
+			memcpy(dest, &unspec, sizeof(unspec));
+			return true;
+		}
+		return errno = EINVAL, false;
+	}
+	// Import into a zeroed local copy so dest is only written on success and
+	// the bytes beyond addrsize are zero.
+	union udp_sockaddr copy;
+	memset(&copy, 0, sizeof(copy));
+	if ( !ctx->copy_from_src(&copy, addr, addrsize) )
+		return false;
+	if ( copy.family != af && copy.family != AF_UNSPEC )
+		return errno = EAFNOSUPPORT, false;
+	memcpy(dest, &copy, sizeof(copy));
+	return true;
+}
+
+// bind_lock locked, socket_lock locked (in that order)
+bool UDPSocket::CanBind(union udp_sockaddr new_local)
+{
+	if ( af == AF_INET )
+	{
+		// Bind to either the any address, the broadcast address, the address of
+		// a network interface, or the broadcast address of a network interface.
+		if ( new_local.in.sin_addr.s_addr != htobe32(INADDR_ANY) &&
+		     new_local.in.sin_addr.s_addr != htobe32(INADDR_BROADCAST) )
+		{
+			// TODO: What happens to sockets if the network interface changes
+			//       its address?
+ ScopedLock ifs_lock(&netifs_lock); + bool found = false; + for ( unsigned int i = 1; i < netifs_count; i++ ) + { + NetworkInterface* netif = netifs[i]; + if ( !netif ) + continue; + ScopedLock cfg_lock(&netif->cfg_lock); + struct in_addr if_broadcast_ip; + if_broadcast_ip.s_addr = netif->cfg.inet.address.s_addr | + ~netif->cfg.inet.subnet.s_addr; + if ( memcmp(&netif->cfg.inet.address, &new_local.in.sin_addr, + sizeof(struct in_addr)) == 0 || + memcmp(&if_broadcast_ip, &new_local.in.sin_addr, + sizeof(struct in_addr)) == 0 ) + { + found = true; + break; + } + } + // No interface had the correct address. + if ( !found ) + return errno = EADDRNOTAVAIL, false; + } + uint16_t port = be16toh(new_local.in.sin_port); + if ( port == 0 ) + return errno = EINVAL, false; + for ( UDPSocket* socket = bindings_v4[port]; + socket; + socket = socket->next_socket ) + { + // Taking the lock of the other socket is safe against deadlocks, + // despite having the lock of this socket, because bind_lock was + // locked prior to this socket's lock, and bind_lock must always + // be taken before the same thread locks two sockets. + ScopedLock lock(&socket->socket_lock); + if ( new_local.in.sin_addr.s_addr == htobe32(INADDR_ANY) && + !(reuseaddr && socket->reuseaddr) ) + return errno = EADDRINUSE, false; + if ( socket->local.in.sin_addr.s_addr == htobe32(INADDR_ANY) && + !(reuseaddr && socket->reuseaddr) ) + return errno = EADDRINUSE, false; + if ( new_local.in.sin_addr.s_addr == + socket->local.in.sin_addr.s_addr ) + return errno = EADDRINUSE, false; + } + } + else if ( af == AF_INET6 ) + { + // TODO: IPv6 support for seeing if any interface has the address. 
+ if ( true ) + return errno = EAFNOSUPPORT, false; + uint16_t port = be16toh(new_local.in6.sin6_port); + if ( port == 0 ) + return errno = EINVAL, false; + for ( UDPSocket* socket = bindings_v6[port]; + socket; + socket = socket->next_socket ) + { + if ( !memcmp(&new_local.in6.sin6_addr, &in6addr_any, + sizeof(in6addr_any)) && + !(reuseaddr && socket->reuseaddr) ) + if ( !memcmp(&socket->local.in6.sin6_addr, &in6addr_any, + sizeof(in6addr_any)) && + !(reuseaddr && socket->reuseaddr) ) + if ( !memcmp(&new_local.in6.sin6_addr, &socket->local.in6.sin6_addr, + sizeof(new_local.in6.sin6_addr)) ) + return errno = EADDRINUSE, false; + } + } + else + return errno = EAFNOSUPPORT, false; + return true; +} + +int UDPSocket::bind(ioctx_t* ctx, const uint8_t* addr, size_t addrsize) +{ + ScopedLock lock2(&bind_lock); + ScopedLock lock(&socket_lock); + if ( bound ) + return errno = EINVAL, -1; + union udp_sockaddr new_local; + if ( !ImportAddress(ctx, &new_local, addr, addrsize) ) + return -1; + if ( new_local.family == AF_UNSPEC ) + return errno = EAFNOSUPPORT, -1; + uint16_t port; + if ( af == AF_INET ) + port = be16toh(new_local.in.sin_port); + else if ( af == AF_INET6 ) + port = be16toh(new_local.in6.sin6_port); + else + return errno = EAFNOSUPPORT, -1; + if ( port == 0 ) + return BindDefault(&new_local) ? 
0 : -1; + if ( !CanBind(new_local) ) + return -1; + if ( af == AF_INET ) + { + uint16_t port = be16toh(new_local.in.sin_port); + if ( bindings_v4[port] ) + bindings_v4[port]->prev_socket = this; + next_socket = bindings_v4[port]; + prev_socket = NULL; + bindings_v4[port] = this; + } + else if ( af == AF_INET6 ) + { + uint16_t port = be16toh(new_local.in6.sin6_port); + if ( bindings_v6[port] ) + bindings_v6[port]->prev_socket = this; + next_socket = bindings_v6[port]; + prev_socket = NULL; + bindings_v6[port] = this; + } + else + return errno = EAFNOSUPPORT, -1; + memcpy(&local, &new_local, sizeof(new_local)); + bound = true; + return 0; +} + +// bind_lock locked, socket_lock locked (in that order) +bool UDPSocket::BindDefault(const union udp_sockaddr* new_local_ptr) +{ + // TODO: This allocator becomes increasingly biased as more ports are + // allocated. + // TODO: Try not to allocate recently used ports. + union udp_sockaddr new_local; + if ( new_local_ptr ) + memcpy(&new_local, new_local_ptr, sizeof(union udp_sockaddr)); + else + { + memset(&new_local, 0, sizeof(new_local)); + if ( af == AF_INET ) + { + new_local.in.sin_family = AF_INET; + new_local.in.sin_addr.s_addr = htobe32(INADDR_ANY); + } + else if ( af == AF_INET6 ) + { + new_local.in6.sin6_family = AF_INET6; + new_local.in6.sin6_addr = in6addr_any; + } + else + return errno = EAFNOSUPPORT, false; + } + uint16_t start = 32768; // Documented in udp(4). + uint16_t end = 61000; // Documented in udp(4). 
+ uint16_t count = end - start; + uint16_t offset = arc4random_uniform(count); + for ( uint16_t i = 0; i < count; i++ ) + { + uint16_t j = offset + i; + if ( count <= j ) + j -= count; + uint16_t port = start + j; + if ( af == AF_INET ) + new_local.in.sin_port = htobe16(port); + else if ( af == AF_INET6 ) + new_local.in6.sin6_port = htobe16(port); + else + return errno = EAFNOSUPPORT, false; + if ( !CanBind(new_local) ) + { + if ( errno == EADDRINUSE ) + continue; + return false; + } + if ( af == AF_INET ) + { + if ( bindings_v4[port] ) + bindings_v4[port]->prev_socket = this; + next_socket = bindings_v4[port]; + prev_socket = NULL; + bindings_v4[port] = this; + } + else if ( af == AF_INET6 ) + { + if ( bindings_v6[port] ) + bindings_v6[port]->prev_socket = this; + next_socket = bindings_v6[port]; + prev_socket = NULL; + bindings_v6[port] = this; + } + else + return errno = EAFNOSUPPORT, false; + memcpy(&local, &new_local, sizeof(new_local)); + bound = true; + return true; + } + return errno = EAGAIN, false; +} + +int UDPSocket::connect(ioctx_t* ctx, const uint8_t* addr, size_t addrsize) +{ + ScopedLock lock2(&bind_lock); + ScopedLock lock(&socket_lock); + union udp_sockaddr new_remote; + if ( !ImportAddress(ctx, &new_remote, addr, addrsize) ) + return -1; + if ( new_remote.family == AF_UNSPEC ) + { + // Disconnect the socket when connecting to the AF_UNSPEC family. + connected = false; + return 0; + } + else if ( af == AF_INET ) + { + // Verify the port is non-zero. + if ( be16toh(new_remote.in.sin_port) == 0 ) + return errno = EADDRNOTAVAIL, -1; + } + else + return errno = EAFNOSUPPORT, -1; + // If the socket is not bound, find a route to the remote address and bind + // to the appropriate source address. 
+ if ( !bound ) + { + union udp_sockaddr new_local; + memset(&new_local, 0, sizeof(new_local)); + if ( af == AF_INET ) + { + struct in_addr any; + any.s_addr = htobe32(INADDR_ANY); + new_local.in.sin_family = AF_INET; + if ( !IP::GetSourceIP(&any, &new_remote.in.sin_addr, + &new_local.in.sin_addr, ifindex, NULL) ) + return -1; + new_local.in.sin_port = htobe16(0); + } + else + return errno = EAFNOSUPPORT, -1; + if ( !BindDefault(&new_local) ) + return -1; + } + // Test if there is a route from the local address to the remote address. + if ( af == AF_INET ) + { + if ( !IP::GetSourceIP(&local.in.sin_addr, &new_remote.in.sin_addr, NULL, + ifindex, NULL) ) + { + // TODO: Rebind to another interface if reconnecting? Note that this + // violates the design that sockets can only be bound once. + // DragonFly, FreeBSD, Haiku, macOS, NetBSD, OpenBSD, and + // OpenIndiana does this, but Hurd and Linux does not. See + // os-test's connect-loopback-reconnect-wan-getsockname. If + // so, give preference to the same port if available. + return -1; + } + } + else + return errno = EAFNOSUPPORT, -1; + // Set the remote address and become connected. + connected = true; + memcpy(&remote, &new_remote, sizeof(new_remote)); + // Discard datagrams not from the new remote, thus enforcing that all + // datagrams provided by recvmsg always comes from the address connected to. 
+ size_t name_size = AddressFamilySize(af); + Ref* packet_ptr = &first_packet; + while ( *packet_ptr ) + { + void* name = first_packet->from + first_packet->offset; + if ( memcmp(name, &remote, name_size) != 0 ) + { + Ref next = (*packet_ptr)->next; + (*packet_ptr)->next.Reset(); + packet_ptr->Reset(); + *packet_ptr = next; + continue; + } + packet_ptr = &(*packet_ptr)->next; + } + if ( !first_packet ) + last_packet.Reset(); + return 0; +} + +int UDPSocket::listen(ioctx_t* /*ctx*/, int /*backlog*/) +{ + return errno = EOPNOTSUPP, -1; +} + +ssize_t UDPSocket::readv(ioctx_t* ctx, const struct iovec* iov, int iovcnt) +{ + struct msghdr msg; + memset(&msg, 0, sizeof(msg)); + msg.msg_iov = (struct iovec*) iov; + msg.msg_iovlen = iovcnt; + return recvmsg_internal(ctx, &msg, 0); +} + +ssize_t UDPSocket::recv(ioctx_t* ctx, uint8_t* buf, size_t count, int flags) +{ + struct iovec iov; + memset(&iov, 0, sizeof(iov)); + iov.iov_base = (void*) buf; + iov.iov_len = count; + struct msghdr msg; + memset(&msg, 0, sizeof(msg)); + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + return recvmsg_internal(ctx, &msg, flags); +} + +ssize_t UDPSocket::recvmsg(ioctx_t* ctx, struct msghdr* msg_ptr, int flags) +{ + struct msghdr msg; + if ( !ctx->copy_from_src(&msg, msg_ptr, sizeof(msg)) ) + return -1; + if ( msg.msg_iovlen < 0 || IOV_MAX < msg.msg_iovlen ) + return errno = EINVAL, -1; + size_t iov_size = msg.msg_iovlen * sizeof(struct iovec); + struct iovec* iov = new struct iovec[msg.msg_iovlen]; + if ( !iov ) + return -1; + struct iovec* user_iov = msg.msg_iov; + if ( !ctx->copy_from_src(iov, user_iov, iov_size) ) + return delete[] iov, -1; + msg.msg_iov = iov; + ssize_t result = recvmsg_internal(ctx, &msg, flags); + msg.msg_iov = user_iov; + delete[] iov; + if ( !ctx->copy_to_dest(msg_ptr, &msg, sizeof(msg)) ) + return -1; + return result; +} + +ssize_t UDPSocket::recvmsg_internal(ioctx_t* ctx, struct msghdr* msg, int flags) +{ + if ( flags & ~(MSG_PEEK) ) + return errno = EINVAL, -1; + 
ScopedLock lock(&socket_lock); + if ( sockerr ) + { + errno = sockerr; + sockerr = 0; + return -1; + } + if ( how_shutdown & SHUT_RD ) + return 0; + while ( !first_packet ) + { + if ( ctx->dflags & O_NONBLOCK ) + return errno = EWOULDBLOCK, -1; + if ( !kthread_cond_wait_signal(&receive_cond, &socket_lock) ) + return errno = EINTR, -1; + } + void* name = first_packet->from + first_packet->offset; + size_t name_size = AddressFamilySize(af); + assert(name_size <= first_packet->length - first_packet->offset); + if ( msg->msg_name ) + { + if ( name_size < msg->msg_namelen ) + msg->msg_namelen = name_size; + if ( !ctx->copy_to_dest(msg->msg_name, name, msg->msg_namelen) ) + return -1; + } + else + msg->msg_namelen = 0; + first_packet->offset += name_size; + const unsigned char* in = first_packet->from + first_packet->offset; + size_t in_length = first_packet->length - first_packet->offset; + msg->msg_controllen = 0; + msg->msg_flags = 0; + if ( SSIZE_MAX < TruncateIOVec(msg->msg_iov, msg->msg_iovlen, SSIZE_MAX) ) + return errno = EINVAL, -1; + size_t sofar = 0; + for ( int i = 0; i < msg->msg_iovlen && sofar < in_length; i++) + { + size_t in_left = in_length - sofar; + const struct iovec* iov = &msg->msg_iov[i]; + size_t count = in_left < iov->iov_len ? 
in_left : iov->iov_len; + if ( !ctx->copy_to_dest(iov->iov_base, in + sofar, count) ) + return -1; + sofar += count; + } + if ( sofar < in_length ) + msg->msg_flags |= MSG_TRUNC; + if ( !(flags & MSG_PEEK) ) + { + receive_current -= first_packet->pmap.size; + Ref next = first_packet->next; + first_packet->next.Reset(); + first_packet = next; + if ( !first_packet ) + last_packet.Reset(); + } + return sofar; +} + +ssize_t UDPSocket::send(ioctx_t* ctx, + const uint8_t* buf, + size_t count, + int flags) +{ + struct iovec iov; + memset(&iov, 0, sizeof(iov)); + iov.iov_base = (void*) buf; + iov.iov_len = count; + struct msghdr msg; + memset(&msg, 0, sizeof(msg)); + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + return sendmsg_internal(ctx, &msg, flags); +} + +ssize_t UDPSocket::sendmsg(ioctx_t* ctx, + const struct msghdr* msg_ptr, + int flags) +{ + struct msghdr msg; + if ( !ctx->copy_from_src(&msg, msg_ptr, sizeof(msg)) ) + return -1; + if ( msg.msg_iovlen < 0 || IOV_MAX < msg.msg_iovlen ) + return errno = EINVAL, -1; + size_t iov_size = msg.msg_iovlen * sizeof(struct iovec); + struct iovec* iov = new struct iovec[msg.msg_iovlen]; + if ( !iov ) + return -1; + if ( !ctx->copy_from_src(iov, msg.msg_iov, iov_size) ) + return delete[] iov, -1; + msg.msg_iov = iov; + ssize_t result = sendmsg_internal(ctx, &msg, flags); + delete[] iov; + return result; +} + +ssize_t UDPSocket::sendmsg_internal(ioctx_t* ctx, + const struct msghdr* msg, + int flags) +{ + if ( flags & ~(MSG_NOSIGNAL) ) // TODO: MSG_DONTROUTE + return errno = EINVAL, -1; + ScopedLock lock(&socket_lock); + if ( how_shutdown & SHUT_WR ) + { + if ( !(flags & MSG_NOSIGNAL) ) + CurrentThread()->DeliverSignal(SIGPIPE); + return errno = EPIPE, -1; + } + if ( sockerr ) + { + errno = sockerr; + sockerr = 0; + return -1; + } + union udp_sockaddr sendto; + if ( msg->msg_name ) + { + if ( connected ) + return errno = EISCONN, -1; + if ( af == AF_INET ) + { + if ( msg->msg_namelen != sizeof(sendto.in) ) + return errno = EINVAL, 
-1; + sendto.family = af; + if ( !ctx->copy_from_src(&sendto.in, msg->msg_name, + sizeof(sendto.in)) ) + return -1; + } + // TODO: IPv6 support. + else + return errno = EAFNOSUPPORT, -1; + } + else if ( connected ) + sendto = remote; + else + return errno = EDESTADDRREQ, -1; + if ( !bound ) + { + kthread_mutex_unlock(&socket_lock); // Don't deadlock. + kthread_mutex_lock(&bind_lock); + kthread_mutex_lock(&socket_lock); + bool was_bound = BindDefault(NULL); + kthread_mutex_unlock(&bind_lock); + if ( !was_bound ) + return -1; + } + // Find a route to the destination and verify the port is non-zero. + union udp_sockaddr sendfrom; + if ( af == AF_INET ) + { + if ( be16toh(sendto.in.sin_port) == 0 ) + return errno = EADDRNOTAVAIL, -1; + if ( !IP::GetSourceIP(&local.in.sin_addr, &sendto.in.sin_addr, + &sendfrom.in.sin_addr, ifindex) ) + return -1; + } + // TODO: IPv6 support. + else + return errno = EAFNOSUPPORT, -1; + Ref pkt = GetPacket(); + if ( !pkt ) + return -1; + size_t mtu = pkt->pmap.size; + if ( mtu < sizeof(struct udphdr) ) + return errno = EMSGSIZE, -1; + pkt->length = sizeof(struct udphdr); + unsigned char* out = pkt->from; + struct udphdr hdr; + if ( af == AF_INET ) + { + hdr.uh_sport = local.in.sin_port; + hdr.uh_dport = sendto.in.sin_port; + } + else if ( af == AF_INET6 ) + { + hdr.uh_sport = local.in6.sin6_port; + hdr.uh_dport = sendto.in6.sin6_port; + } + else + return errno = EAFNOSUPPORT, -1; + if ( SSIZE_MAX < TruncateIOVec(msg->msg_iov, msg->msg_iovlen, SSIZE_MAX) ) + return errno = EINVAL, -1; + size_t count = 0; + for ( int i = 0; i < msg->msg_iovlen; i++ ) + { + const struct iovec* iov = &msg->msg_iov[i]; + if ( mtu - pkt->length < iov->iov_len ) + return errno = EMSGSIZE, -1; + if ( !ctx->copy_from_src(out + pkt->length, iov->iov_base, + iov->iov_len) ) + return -1; + pkt->length += iov->iov_len; + count += iov->iov_len; + } + hdr.uh_ulen = htobe16(pkt->length); + memcpy(out, &hdr, sizeof(hdr)); + uint16_t checksum = 0; + if ( af == AF_INET ) + 
{ + checksum = IP::ipsum_buf(checksum, &sendfrom.in.sin_addr, + sizeof(struct in_addr)); + checksum = IP::ipsum_buf(checksum, &sendto.in.sin_addr, + sizeof(struct in_addr)); + } + else if ( af == AF_INET6 ) + { + checksum = IP::ipsum_buf(checksum, &sendfrom.in6.sin6_addr, + sizeof(struct in6_addr)); + checksum = IP::ipsum_buf(checksum, &sendto.in6.sin6_addr, + sizeof(struct in6_addr)); + } + else + return errno = EAFNOSUPPORT, -1; + checksum = IP::ipsum_word(checksum, IPPROTO_UDP); + checksum = IP::ipsum_word(checksum, pkt->length); + checksum = IP::ipsum_buf(checksum, out, pkt->length); + checksum = IP::ipsum_finish(checksum); + if ( checksum == 0x0000 ) + checksum = 0xFFFF; + hdr.uh_sum = htobe16(checksum); + memcpy(out, &hdr, sizeof(hdr)); + (void) flags; + if ( af == AF_INET ) + { + if ( !IP::Send(pkt, &sendfrom.in.sin_addr, &sendto.in.sin_addr, + IPPROTO_UDP, ifindex, broadcast) ) + return -1; + } + // TODO: IPv6 support. + else + return errno = EAFNOSUPPORT, -1; + return count; +} + +ssize_t UDPSocket::writev(ioctx_t* ctx, const struct iovec* iov, int iovcnt) +{ + struct msghdr msg; + memset(&msg, 0, sizeof(msg)); + msg.msg_iov = (struct iovec*) iov; + msg.msg_iovlen = iovcnt; + return sendmsg_internal(ctx, &msg, 0); +} + +short UDPSocket::PollEventStatus() +{ + short status = 0; + if ( first_packet || (how_shutdown & SHUT_RD) ) + status |= POLLIN | POLLRDNORM; + if ( !(how_shutdown & SHUT_WR) ) + status |= POLLOUT | POLLWRNORM; + else + status |= POLLHUP; + if ( sockerr ) + status |= POLLERR; + return status; +} + +int UDPSocket::poll(ioctx_t* /*ctx*/, PollNode* node) +{ + ScopedLock lock(&socket_lock); + short ret_status = PollEventStatus() & node->events; + if ( ret_status ) + { + node->master->revents |= ret_status; + return 0; + } + poll_channel.Register(node); + return errno = EAGAIN, -1; +} + +int UDPSocket::getsockopt(ioctx_t* ctx, int level, int option_name, + void* option_value, size_t* option_size_ptr) +{ + ScopedLock lock(&socket_lock); + + if ( 
level == SOL_SOCKET && option_name == SO_BINDTODEVICE ) + { + ScopedLock lock(&netifs_lock); + const char* ifname = ""; + if ( ifindex < netifs_count && netifs[ifindex] ) + ifname = netifs[ifindex]->ifinfo.name; + size_t option_size; + if ( !CopyFromUser(&option_size, option_size_ptr, sizeof(option_size)) ) + return -1; + size_t len = strlen(ifname); + size_t size = len + 1; + if ( option_size < size ) + return errno = ERANGE, -1; + if ( !CopyToUser(option_value, ifname, size) || + !CopyToUser(option_size_ptr, &size, sizeof(size)) ) + return -1; + return 0; + } + + uintmax_t result = 0; + + if ( level == IPPROTO_UDP ) + { + switch ( option_name ) + { + default: return errno = ENOPROTOOPT, -1; + } + } + else if ( level == SOL_SOCKET ) + { + switch ( option_name ) + { + case SO_BINDTOINDEX: result = ifindex; break; + case SO_BROADCAST: result = broadcast; break; + case SO_DEBUG: result = 0; break; + case SO_DOMAIN: result = af; break; + case SO_DONTROUTE: result = 0; break; + case SO_ERROR: result = sockerr; sockerr = 0; break; + case SO_PROTOCOL: result = IPPROTO_UDP; break; + case SO_RCVBUF: result = receive_limit; break; + case SO_REUSEADDR: result = reuseaddr; break; + case SO_SNDBUF: result = send_limit; break; + case SO_TYPE: result = SOCK_DGRAM; break; + default: return errno = ENOPROTOOPT, -1; + } + } + else + return errno = EINVAL, -1; + + if ( !sockopt_return_uintmax(result, ctx, option_value, option_size_ptr) ) + return -1; + + return 0; +} + +int UDPSocket::setsockopt(ioctx_t* ctx, int level, int option_name, + const void* option_value, size_t option_size) +{ + ScopedLock lock(&socket_lock); + + if ( level == SOL_SOCKET && option_name == SO_BINDTODEVICE ) + { + char ifname[IF_NAMESIZE]; + if ( sizeof(ifname) < option_size ) + option_size = sizeof(ifname); + if ( !CopyFromUser(ifname, option_value, option_size) ) + return -1; + if ( strnlen(ifname, option_size) == sizeof(ifname) ) + return errno = ENODEV, -1; + ifname[option_size] = '\0'; + ScopedLock 
lock(&netifs_lock); + for ( size_t i = 1; i < netifs_count; i++ ) + { + if ( netifs[i] && !strcmp(ifname, netifs[i]->ifinfo.name) ) + { + ifindex = i; + return 0; + } + } + return errno = ENODEV, -1; + } + + uintmax_t value; + if ( !sockopt_fetch_uintmax(&value, ctx, option_value, option_size) ) + return -1; + + if ( level == IPPROTO_UDP ) + { + switch ( option_name ) + { + default: return errno = ENOPROTOOPT, -1; + } + } + else if ( level == SOL_SOCKET ) + { + switch ( option_name ) + { + case SO_BINDTOINDEX: + if ( UINT_MAX < value ) + return errno = EINVAL, -1; + ifindex = value; + break; + case SO_BROADCAST: broadcast = value; break; + case SO_DEBUG: + if ( value != 0 ) + return errno = EPERM, -1; + break; + case SO_DONTROUTE: + if ( value != 0 ) + return errno = EPERM, -1; + break; + case SO_RCVBUF: + { + size_t hard_limit = MAXIMAL_PACKET_LIMIT * Page::Size(); + if ( hard_limit < value ) + value = hard_limit; + receive_limit = value; + // Shrink the receive queue until it fits. + while ( first_packet && receive_limit < receive_current ) + { + Ref packet = first_packet; + first_packet->next.Reset(); + first_packet = first_packet->next; + receive_current -= packet->pmap.size; + } + if ( !first_packet ) + last_packet.Reset(); + break; + } + case SO_REUSEADDR: reuseaddr = value; break; + case SO_SNDBUF: + { + size_t hard_limit = MAXIMAL_PACKET_LIMIT * Page::Size(); + if ( hard_limit < value ) + value = hard_limit; + // TODO: This value is unused. + send_limit = value; + break; + } + default: return errno = ENOPROTOOPT, -1; + } + } + else + return errno = EINVAL, -1; + + return 0; +} + +int UDPSocket::shutdown(ioctx_t* ctx, int how) +{ + (void) ctx; + ScopedLock lock(&socket_lock); + if ( how & ~(SHUT_RD | SHUT_WR) ) + return errno = EINVAL, -1; + how_shutdown |= how; + // Drop the receive queue if shut down for read. + if ( how & SHUT_RD ) + { + // Avoid stack overflow in first_packet recursive destructor. 
+        while ( first_packet )
+        {
+            Ref next = first_packet->next;
+            first_packet->next.Reset();
+            first_packet = next;
+        }
+        last_packet.Reset();
+        // The queue is empty now, so nothing counts against the receive
+        // limit any longer.
+        receive_current = 0;
+    }
+    // Wake blocked receivers and pollers so they observe the new state.
+    kthread_cond_broadcast(&receive_cond);
+    poll_channel.Signal(PollEventStatus());
+    return 0;
+}
+
+// Copy the connected remote address to the caller, truncated to the size the
+// caller provided, and report the number of bytes copied.
+int UDPSocket::getpeername(ioctx_t* ctx, uint8_t* addr, size_t* addrsize_ptr)
+{
+    ScopedLock lock(&socket_lock);
+    if ( !connected )
+        return errno = ENOTCONN, -1;
+    size_t addrsize;
+    if ( !ctx->copy_from_src(&addrsize, addrsize_ptr, sizeof(addrsize)) )
+        return -1;
+    if ( af == AF_INET )
+    {
+        if ( sizeof(remote.in) < addrsize )
+            addrsize = sizeof(remote.in);
+    }
+    else if ( af == AF_INET6 )
+    {
+        if ( sizeof(remote.in6) < addrsize )
+            addrsize = sizeof(remote.in6);
+    }
+    else
+        return errno = EAFNOSUPPORT, -1;
+    if ( !ctx->copy_to_dest(addr, &remote, addrsize) )
+        return -1;
+    if ( !ctx->copy_to_dest(addrsize_ptr, &addrsize, sizeof(addrsize)) )
+        return -1;
+    return 0;
+}
+
+// Copy the local address to the caller, truncated to the size the caller
+// provided, and report the number of bytes copied.
+int UDPSocket::getsockname(ioctx_t* ctx, uint8_t* addr, size_t* addrsize_ptr)
+{
+    ScopedLock lock(&socket_lock);
+    size_t addrsize;
+    if ( !ctx->copy_from_src(&addrsize, addrsize_ptr, sizeof(addrsize)) )
+        return -1;
+    if ( af == AF_INET )
+    {
+        if ( sizeof(local.in) < addrsize )
+            addrsize = sizeof(local.in);
+    }
+    else if ( af == AF_INET6 )
+    {
+        if ( sizeof(local.in6) < addrsize )
+            addrsize = sizeof(local.in6);
+    }
+    else
+        return errno = EAFNOSUPPORT, -1;
+    if ( !ctx->copy_to_dest(addr, &local, addrsize) )
+        return -1;
+    if ( !ctx->copy_to_dest(addrsize_ptr, &addrsize, sizeof(addrsize)) )
+        return -1;
+    return 0;
+}
+
+// socket_lock locked
+// Append a datagram to the receive queue, or silently drop it if the socket is
+// shut down for reading or the packet does not fit in the receive limit.
+void UDPSocket::ReceivePacket(Ref pkt)
+{
+    if ( how_shutdown & SHUT_RD )
+        return;
+    // Drop the packet if the receive queue is full.
+    if ( receive_limit < receive_current )
+        return;
+    size_t available = receive_limit - receive_current;
+    if ( available < pkt->pmap.size )
+        return;
+    // Add the packet to the receive queue.
+ receive_current += pkt->pmap.size; + if ( last_packet ) + { + last_packet->next = pkt; + last_packet = pkt; + } + else + { + first_packet = pkt; + last_packet = pkt; + } + kthread_cond_broadcast(&receive_cond); + poll_channel.Signal(PollEventStatus()); +} + +void HandleIP(Ref pkt, + const struct in_addr* src, + const struct in_addr* dst, + bool dst_broadcast) +{ + (void) dst_broadcast; + const unsigned char* in = pkt->from + pkt->offset; + size_t inlen = pkt->length - pkt->offset; + struct udphdr hdr; + if ( inlen < sizeof(hdr) ) + return; + memcpy(&hdr, in, sizeof(hdr)); + hdr.uh_sport = be16toh(hdr.uh_sport); + hdr.uh_dport = be16toh(hdr.uh_dport); + hdr.uh_ulen = be16toh(hdr.uh_ulen); + hdr.uh_sum = be16toh(hdr.uh_sum); + if ( hdr.uh_sum ) + { + uint16_t sum = 0; + sum = IP::ipsum_buf(sum, src, sizeof(struct in_addr)); + sum = IP::ipsum_buf(sum, dst, sizeof(struct in_addr)); + sum = IP::ipsum_word(sum, IPPROTO_UDP); + sum = IP::ipsum_word(sum, hdr.uh_ulen); + sum = IP::ipsum_buf(sum, in, inlen); + if ( sum != 0 && sum != 0xFFFF ) + return; + } + if ( hdr.uh_ulen < sizeof(hdr) ) + return; + if ( inlen < hdr.uh_ulen ) + return; + pkt->length = pkt->offset + hdr.uh_ulen; + pkt->offset += sizeof(hdr); + // Port 0 is not valid. + if ( hdr.uh_sport == 0 || hdr.uh_dport == 0 ) + return; + ScopedLock lock1(&bind_lock); + // Find the socket that would receive the datagram sent to that address + // and port, or if no such socket, perhaps a socket bound to the any address + // and that port. + UDPSocket* socket = NULL; + UDPSocket* any_socket = NULL; + for ( UDPSocket* iter = bindings_v4[hdr.uh_dport]; + !socket && iter; + iter = iter->next_socket ) + { + // Receive the datagram only if sent to the socket's address. + if ( !memcmp(&iter->local.in.sin_addr, dst, sizeof(*dst)) ) + socket = iter; + // Receive the datagram only if the socket's address was the any address + // (and no other socket is bound to the datagram's destination address + // and port). 
+ if ( iter->local.in.sin_addr.s_addr == htobe32(INADDR_ANY) ) + any_socket = iter; + } + // If no socket was bound to the datagram's destination address and port, + // try to deliver it to a socket bound to the any address and that port. + if ( !socket ) + socket = any_socket; + // Drop the datagram is no socket would receive it. + if ( !socket ) + return; + // If connected, require the source address is the remote address and the + // source port is the remote port, otherwise drop the datagram. + if ( socket->connected && + (memcmp(&socket->remote.in.sin_addr, src, sizeof(*src)) != 0 || + be16toh(socket->remote.in.sin_port) != hdr.uh_sport) ) + return; + ScopedLock lock2(&socket->socket_lock); + // If the socket is bound to a network interface, require the datagram to + // have been received on that network interface. + if ( socket->ifindex && socket->ifindex != pkt->netif->ifinfo.linkid ) + return; + // Prepend the source address to the packet. + struct sockaddr_in from_addr; + memset(&from_addr, 0, sizeof(from_addr)); + from_addr.sin_family = AF_INET; + from_addr.sin_port = htobe16(hdr.uh_sport); + from_addr.sin_addr = *src; + if ( pkt->offset < sizeof(from_addr) ) + return; + pkt->offset -= sizeof(from_addr); + memcpy(pkt->from + pkt->offset, &from_addr, sizeof(from_addr)); + // Receive the datagram on the socket. + socket->ReceivePacket(pkt); +} + +Ref Socket(int af) +{ + if ( !IsSupportedAddressFamily(af) ) + return errno = EAFNOSUPPORT, Ref(NULL); + return Ref(new UDPSocket(af)); +} + +} // namespace UDP +} // namespace Sortix diff --git a/kernel/net/udp.h b/kernel/net/udp.h new file mode 100644 index 00000000..0cb6f8aa --- /dev/null +++ b/kernel/net/udp.h @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2016, 2017 Jonas 'Sortie' Termansen. + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + * + * net/udp.h + * User Datagram Protocol. + */ + +#ifndef SORTIX_NET_UDP_H +#define SORTIX_NET_UDP_H + +#include + +#include +#include + +namespace Sortix { +namespace UDP { + +void Init(); +void HandleIP(Ref pkt, + const struct in_addr* src, + const struct in_addr* dst, + bool dst_broadcast); +Ref Socket(int af); + +} // namespace UDP +} // namespace Sortix + +#endif diff --git a/libc/Makefile b/libc/Makefile index f6aa3634..427b372e 100644 --- a/libc/Makefile +++ b/libc/Makefile @@ -57,6 +57,7 @@ malloc/heap_init.o \ malloc/__heap_lock.o \ malloc/__heap_unlock.o \ malloc/__heap_verify.o \ +netinet/if_ether/etheraddr_broadcast.o \ netinet/in/in6addr_any.o \ netinet/in/in6addr_loopback.o \ regex/regcomp.o \ diff --git a/libc/include/net/if.h b/libc/include/net/if.h new file mode 100644 index 00000000..c7f102e4 --- /dev/null +++ b/libc/include/net/if.h @@ -0,0 +1,77 @@ +/* + * Copyright (c) 2016, 2017 Jonas 'Sortie' Termansen. + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + * + * net/if.h + * Network interface. + */ + +#ifndef _INCLUDE_NET_IF_H +#define _INCLUDE_NET_IF_H + +#include + +#define IF_NAMESIZE 32 + +#if __USE_SORTIX +#include +#include +#include +#include + +#define IF_HWADDR_MAXSIZE 6 + +#define IF_TYPE_LOOPBACK 1 +#define IF_TYPE_ETHERNET 2 + +#define IF_FEATURE_ETHERNET_CRC_OFFLOAD (1 << 0) + +struct if_info +{ + unsigned int linkid; + int type; + int features; + size_t addrlen; + char name[IF_NAMESIZE]; + unsigned char addr[IF_HWADDR_MAXSIZE]; +}; + +#define IF_STATUS_FLAGS_UP (1 << 0) + +struct if_status +{ + int flags; + size_t mtu; +}; + +struct if_config_ether +{ + struct ether_addr address; +}; + +struct if_config_inet +{ + struct in_addr address; + struct in_addr router; + struct in_addr subnet; +}; + +struct if_config +{ + struct if_config_ether ether; + struct if_config_inet inet; +}; +#endif + +#endif diff --git a/libc/include/netinet/if_ether.h b/libc/include/netinet/if_ether.h new file mode 100644 index 00000000..9e969693 --- /dev/null +++ b/libc/include/netinet/if_ether.h @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2016, 2017 Jonas 'Sortie' Termansen. + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + * + * netinet/if_ether.h + * Ethernet interfaces. + */ + +#ifndef _INCLUDE_NETINET_IF_ETHER_H +#define _INCLUDE_NETINET_IF_ETHER_H + +#include + +#include + +#define ETHER_ADDR_LEN 6 +#define ETHER_TYPE_LEN 2 +#define ETHER_HDR_LEN (ETHER_ADDR_LEN + ETHER_ADDR_LEN + ETHER_TYPE_LEN) /* 14 */ +#define ETHER_CRC_LEN 4 +#define ETHER_LEN (ETHER_HDR_LEN + ETHER_CRC_LEN) /* 18 */ +#define ETHER_MIN_LEN 64 +#define ETHER_MAX_LEN 1518 + +#define ETHERMTU (ETHER_MAX_LEN - ETHER_LEN) /* 1500 */ +#define ETHERMIN (ETHER_MIN_LEN - ETHER_LEN) /* 46 */ + +#define ETHERADDR_BROADCAST_INIT { { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff } } + +#define ETHERTYPE_IP 0x0800 +#define ETHERTYPE_ARP 0x0806 +#define ETHERTYPE_IPV6 0x86dd + +struct ether_addr +{ + uint8_t ether_addr_octet[ETHER_ADDR_LEN]; +}; + +struct ether_header +{ + uint8_t ether_dhost[ETHER_ADDR_LEN]; + uint8_t ether_shost[ETHER_ADDR_LEN]; + uint16_t ether_type; +}; + +struct ether_footer +{ + uint32_t ether_crc; +}; + +#ifdef __cplusplus +extern "C" { +#endif + +extern const struct ether_addr etheraddr_broadcast; /* ff:ff:ff:ff:ff:ff */ + +#ifdef __cplusplus +} /* extern "C" */ +#endif + +#endif diff --git a/libc/include/netinet/in.h b/libc/include/netinet/in.h index 9361fb9d..7be44f74 100644 --- a/libc/include/netinet/in.h +++ b/libc/include/netinet/in.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013, 2014, 2015 Jonas 'Sortie' Termansen. + * Copyright (c) 2013, 2014, 2015, 2016, 2017 Jonas 'Sortie' Termansen. 
* * Permission to use, copy, modify, and distribute this software for any * purpose with or without fee is hereby granted, provided that the above @@ -23,14 +23,11 @@ #include #include + #include <__/endian.h> #include -#ifdef __cplusplus -extern "C" { -#endif - #ifndef __in_port_t_defined #define __in_port_t_defined typedef uint16_t in_port_t; @@ -43,7 +40,7 @@ typedef uint32_t in_addr_t; #ifndef __sa_family_t_defined #define __sa_family_t_defined -typedef unsigned short int sa_family_t; +typedef uint16_t sa_family_t; #endif #ifndef __socklen_t_defined @@ -77,8 +74,6 @@ struct sockaddr_in6 uint32_t sin6_scope_id; }; -extern const struct in6_addr in6addr_any; /* :: */ -extern const struct in6_addr in6addr_loopback; /* ::1 */ #define IN6ADDR_ANY_INIT { { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } } #define IN6ADDR_LOOPBACK_INIT { { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1 } } @@ -90,16 +85,20 @@ struct ipv6_mreq /* #define SOL_SOCKET 0 - in */ #define IPPROTO_ICMP 1 -#define IPPROTO_IP 2 -#define IPPROTO_IPV6 3 -#define IPPROTO_RAW 4 -#define IPPROTO_TCP 5 -#define IPPROTO_UDP 6 +#define IPPROTO_TCP 6 +#define IPPROTO_UDP 17 +#define IPPROTO_RAW 255 +#define IPPROTO_IP 256 +#define IPPROTO_IPV6 257 +#define IPPROTO_PING 258 -#define INADDR_ANY ((in_addr_t) 0x00000000) -#define INADDR_BROADCAST ((in_addr_t) 0xffffffff) -#define INADDR_NONE ((in_addr_t) 0xffffffff) -#define INADDR_LOOPBACK ((in_addr_t) 0x7f000001) +#define INADDR_ANY ((in_addr_t) 0x00000000) /* 0.0.0.0 */ +#define INADDR_BROADCAST ((in_addr_t) 0xffffffff) /* 255.255.255.255 */ +#define INADDR_LOOPBACK ((in_addr_t) 0x7f000001) /* 127.0.0.1 */ +#define INADDR_NONE ((in_addr_t) 0xffffffff) /* 255.255.255.255 */ +#if __USE_SORTIX +#define INADDR_LOOPMASK ((in_addr_t) 0xff000000) /* 255.0.0.0 */ +#endif #define INADDR_UNSPEC_GROUP ((in_addr_t) 0xe0000000) #define INADDR_ALLHOSTS_GROUP ((in_addr_t) 0xe0000001) @@ -178,6 +177,15 @@ struct ipv6_mreq #define IN_LOOPBACKNET 127 +#define IP_EVIL_INTENT 1 + +#ifdef __cplusplus 
+extern "C" { +#endif + +extern const struct in6_addr in6addr_any; /* :: */ +extern const struct in6_addr in6addr_loopback; /* ::1 */ + #ifdef __cplusplus } /* extern "C" */ #endif diff --git a/libc/include/netinet/ping.h b/libc/include/netinet/ping.h new file mode 100644 index 00000000..79ff72f9 --- /dev/null +++ b/libc/include/netinet/ping.h @@ -0,0 +1,27 @@ +/* + * Copyright (c) 2018 Jonas 'Sortie' Termansen. + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + * + * netinet/ping.h + * Internet Control Message Protocol Echo. + */ + +#ifndef _INCLUDE_NETINET_PING_H +#define _INCLUDE_NETINET_PING_H + +#include + +#include + +#endif diff --git a/libc/include/netinet/tcp.h b/libc/include/netinet/tcp.h index 59a94178..39fab7fb 100644 --- a/libc/include/netinet/tcp.h +++ b/libc/include/netinet/tcp.h @@ -14,7 +14,7 @@ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. * * netinet/tcp.h - * Definitions for the Internet Transmission Control Protocol. + * Transmission Control Protocol. */ #ifndef _INCLUDE_NETINET_TCP_H @@ -22,6 +22,69 @@ #include +#if __USE_SORTIX +#include +#endif + +#if __USE_SORTIX +typedef uint32_t tcp_seq; /* TCP sequence number. */ + +/* Control Bits in struct tcphdr th_flags. */ +#define TH_FIN (1 << 0) /* No more data from sender. 
*/ +#define TH_SYN (1 << 1) /* Synchronize sequence numbers. */ +#define TH_RST (1 << 2) /* Reset the connection. */ +#define TH_PUSH (1 << 3) /* Push Function. */ +#define TH_ACK (1 << 4) /* Acknowledgment field significant. */ +#define TH_URG (1 << 5) /* Urgent Pointer field significant. */ + +struct tcphdr +{ + in_port_t th_sport; /* Source Port. */ + in_port_t th_dport; /* Destination Port. */ + tcp_seq th_seq; /* Sequence Number. */ + tcp_seq th_ack; /* Acknowledgment Number. */ + __extension__ union + { + __extension__ struct + { + #if __BYTE_ORDER == __LITTLE_ENDIAN + uint8_t th_x2:4; /* Reserved. */ + uint8_t th_off:4; /* Data offset. */ + #elif __BYTE_ORDER == __BIG_ENDIAN + uint8_t th_off:4; /* Data offset. */ + uint8_t th_x2:4; /* Reserved. */ + #else + #warning "You need to add support for your endian" + #endif + }; + uint8_t th_offset; + }; + uint8_t th_flags; /* Control Bits. */ + uint16_t th_win; /* Window. */ + uint16_t th_sum; /* Checksum. */ + uint16_t th_urp; /* Urgent Pointer. */ +}; + +#define TCP_OFFSET_ENCODE(x) (((x) & 0xF) << 4) /* Encode th_offset. */ +#define TCP_OFFSET_DECODE(x) (((x) >> 4) & 0xF) /* Decode th_offset. */ + +#define TCP_MSS 536 /* Default Maximum Segment Size. */ + +#define TCPOPT_EOL 0 /* End of Option List. */ +#define TCPOPT_NOP 1 /* No-Operation. */ + +#define TCPOPT_MAXSEG 2 /* Maximum Segment Size. */ +#define TCPOLEN_MAXSEG 4 /* Length of Maximum Segment Size. */ + +/* Maximum header size: 16 * 4 bytes */ +#define TCP_MAXHLEN 64 + +/* Maximum total length of options. */ +#define TCP_MAXOLEN (TCP_MAXHLEN - sizeof(struct tcphdr)) + +#define TCP_MAXWIN 65535 /* Maximum window size. */ +#endif + /* Options at the IPPROTO_TCP socket level. */ #define TCP_NODELAY 1 #if __USE_SORTIX diff --git a/libc/include/netinet/udp.h b/libc/include/netinet/udp.h new file mode 100644 index 00000000..62e69702 --- /dev/null +++ b/libc/include/netinet/udp.h @@ -0,0 +1,35 @@ +/* + * Copyright (c) 2018 Jonas 'Sortie' Termansen. 
+ * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + * + * netinet/udp.h + * User Datagram Protocol. + */ + +#ifndef _INCLUDE_NETINET_UDP_H +#define _INCLUDE_NETINET_UDP_H + +#include + +#include + +struct udphdr +{ + in_port_t uh_sport; /* Source Port */ + in_port_t uh_dport; /* Destination Port */ + uint16_t uh_ulen; /* Length */ + uint16_t uh_sum; /* Checksum */ +}; + +#endif diff --git a/libc/include/sys/socket.h b/libc/include/sys/socket.h index f75f8cb4..59394b68 100644 --- a/libc/include/sys/socket.h +++ b/libc/include/sys/socket.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013, 2014 Jonas 'Sortie' Termansen. + * Copyright (c) 2013, 2014, 2016, 2017 Jonas 'Sortie' Termansen. 
* * Permission to use, copy, modify, and distribute this software for any * purpose with or without fee is hereby granted, provided that the above @@ -24,11 +24,9 @@ #include -#include +#include <__/stdint.h> -#ifdef __cplusplus -extern "C" { -#endif +#include #ifndef __socklen_t_defined #define __socklen_t_defined @@ -48,19 +46,11 @@ typedef __ssize_t ssize_t; #ifndef __sa_family_t_defined #define __sa_family_t_defined -typedef unsigned short int sa_family_t; -#endif - -#ifdef __cplusplus -} /* extern "C" */ +typedef __uint16_t sa_family_t; #endif #include -#ifdef __cplusplus -extern "C" { -#endif - struct sockaddr { sa_family_t sa_family; @@ -139,8 +129,14 @@ struct linger #define SO_SNDLOWAT 14 #define SO_SNDTIMEO 15 #define SO_TYPE 16 +#if __USE_SORTIX +#define SO_BINDTOINDEX 17 +#define SO_BINDTODEVICE 18 +#define SO_DOMAIN 19 +#define SO_PROTOCOL 20 +#endif -#define SOMAXCONN 5 +#define SOMAXCONN 128 #define MSG_CTRUNC (1<<0) #define MSG_DONTROUTE (1<<1) @@ -174,6 +170,10 @@ struct linger #define SHUT_WR (1 << 1) #define SHUT_RDWR (SHUT_RD | SHUT_WR) +#ifdef __cplusplus +extern "C" { +#endif + int accept4(int, struct sockaddr* __restrict, socklen_t* __restrict, int); int accept(int, struct sockaddr* __restrict, socklen_t* __restrict); int bind(int, const struct sockaddr*, socklen_t); diff --git a/libc/include/sys/un.h b/libc/include/sys/un.h index 62ca9ae6..07ff6ba7 100644 --- a/libc/include/sys/un.h +++ b/libc/include/sys/un.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013 Jonas 'Sortie' Termansen. + * Copyright (c) 2013, 2017 Jonas 'Sortie' Termansen. 
* * Permission to use, copy, modify, and distribute this software for any * purpose with or without fee is hereby granted, provided that the above @@ -24,13 +24,15 @@ #include +#include <__/stdint.h> + #ifdef __cplusplus extern "C" { #endif #ifndef __sa_family_t_defined #define __sa_family_t_defined -typedef unsigned short int sa_family_t; +typedef __uint16_t sa_family_t; #endif struct sockaddr_un diff --git a/libc/netinet/if_ether/etheraddr_broadcast.c b/libc/netinet/if_ether/etheraddr_broadcast.c new file mode 100644 index 00000000..c092fe78 --- /dev/null +++ b/libc/netinet/if_ether/etheraddr_broadcast.c @@ -0,0 +1,22 @@ +/* + * Copyright (c) 2017 Jonas 'Sortie' Termansen. + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + * + * netinet/if_ether/etheraddr_broadcast.c + * Ethernet broadcast address. + */ + +#include + +const struct ether_addr etheraddr_broadcast = ETHERADDR_BROADCAST_INIT; diff --git a/libc/sys/dnsconfig/getdnsconfig.2 b/libc/sys/dnsconfig/getdnsconfig.2 index c787a100..2b3196de 100644 --- a/libc/sys/dnsconfig/getdnsconfig.2 +++ b/libc/sys/dnsconfig/getdnsconfig.2 @@ -118,6 +118,8 @@ does not match the size of the address corresponding to .Fa family . 
.El .Sh SEE ALSO +.Xr inet 4 , +.Xr inet6 4 , .Xr dnsconfig 8 .Sh HISTORY The diff --git a/share/man/man4/arp.4 b/share/man/man4/arp.4 new file mode 100644 index 00000000..dda7fa20 --- /dev/null +++ b/share/man/man4/arp.4 @@ -0,0 +1,103 @@ +.Dd June 5, 2017 +.Dt ARP 4 +.Os +.Sh NAME +.Nm arp +.Nd address resolution protocol +.Sh SYNOPSIS +.In netinet/if_ether.h +.In netinet/in.h +.Sh DESCRIPTION +The Address Resolution Protocol (ARP) provides resolution of network layer +addresses to link layer addresses on the local network. +ARP requests ask for the link layer address of a network layer address and ARP +replies contain the link layer address of the requested network layer +address. +Requests are broadcast on the local network, while replies are unicast back to +the sending machine. +.Pp +The +.Xr kernel 7 +uses the ARP to resolve Internet Protocol version 4 addresses +.Xr ( inet 4 ) +to Ethernet addresses +.Xr ( ether 4 ) +in order to transmit Internet Protocol version 4 +.Xr ( ip 4 ) +datagrams on Ethernet network interfaces +.Xr ( if 4 ) . +.Pp +The +.Xr kernel 7 +maintains a cache of ARP replies for every network interface, which is actively +populated whenever there is a need to transmit to a network layer address, and +passively populated with the source addresses of ARP requests from other hosts. +Network layer datagrams are queued whenever a network layer address needs to +be resolved. +Queued datagrams are transmitted when the destination link layer address has +been resolved, or are discarded if the resolution times out. +.Sh IMPLEMENTATION NOTES +The transmission queue is limited to 16 packets for each network layer address. +.Pp +ARP requests are attempted three times, each attempt timing out after a second. +If each request fails, the cache entry is evicted. +.Pp +When a network address is resolved, its cache entry remains valid for 60 +seconds. +Upon expiry, if the cache entry was unused, it is evicted.
+Otherwise, its network address is renewed by resolving it again with three +attempts. +In the meantime, the cache entry continues to be used for routing. +If the renewal fails, the cache entry is evicted. +.Pp +The ARP cache can contain up to 256 entries. +If the cache is full, +the least recently used cache entry is evicted when a network address is resolved +that is not currently in the cache, and the source addresses from received ARP +messages are not passively added to the cache. +.Pp +The ARP cache uses a hash table with 256 entries, using a linked list in case +of hash collisions. +The hash is the bytewise xor (exclusive or) of every byte in the network +address. +This hash is perfect if the subnet contains no more than 256 addresses, and +degrades in quality for larger subnets, at worst needing to linearly scan the +whole ARP cache. +.Pp +The ARP cache is purged when the network interface's +.Xr ether 4 +or +.Xr inet 4 +configuration changes. +Packets in the ARP transmission queue are dropped. +.Sh SEE ALSO +.Xr ether 4 , +.Xr if 4 , +.Xr inet 4 , +.Xr kernel 7 +.Sh STANDARDS +.Rs +.%A D. Plummer +.%D November 1982 +.%R STD 37 +.%R RFC 826 +.%T \&An Ethernet Address Resolution Protocol +.Re +.Sh BUGS +The ARP timeout is not configurable. +.Sh SECURITY CONSIDERATIONS +The source network layer and source link layer addresses of received ARP requests +and replies are trusted. +If the router does not validate that ARP messages on the network are consistent with +the DHCP leases, an attacker will be able to engage in an ARP spoofing attack +that would allow denial of service, man in the middle, and session hijacking +attacks. +.Pp +The cache is limited to 256 entries per interface and if the local subnet is +larger than 256 addresses, then if an attacker is capable of making the local +system concurrently transmit to 256 addresses on the local network not currently +in the ARP cache, then doing so would purge the whole transmission queue.
+If the attacker can do this faster than the machines on the local network can +answer ARP requests, transmission service may be denied entirely, or at the +least be significantly degraded as the higher layers wait a little while before +they retransmit. diff --git a/share/man/man4/ether.4 b/share/man/man4/ether.4 new file mode 100644 index 00000000..54401f33 --- /dev/null +++ b/share/man/man4/ether.4 @@ -0,0 +1,173 @@ +.Dd June 6, 2017 +.Dt ETHER 4 +.Os +.Sh NAME +.Nm ether +.Nd ethernet +.Sh SYNOPSIS +.In netinet/if_ether.h +.Bd -literal +struct ether_addr { + uint8_t ether_addr_octet[ETHER_ADDR_LEN /* 6 */]; +}; + +struct ether_header { + uint8_t ether_dhost[ETHER_ADDR_LEN /* 6 */]; + uint8_t ether_shost[ETHER_ADDR_LEN /* 6 */]; + uint16_t ether_type; +}; + +struct ether_footer { + uint32_t ether_crc; +}; +.Ed +.Sh DESCRIPTION +The Ethernet is a physical and link layer protocol for datagram exchange on a +local area network. +An Ethernet packet contains an Ethernet frame that contains a datagram of a +higher level protocol. +Ethernet 2.0 framing is implemented. +IEEE Std 802.3 Ethernet framing is intentionally not implemented. +.Pp +Ethernet hosts are addressed with a globally unique six byte address assigned to +the network interface controller. +An Ethernet address can be stored in the +.Vt struct ether_addr +type. +Ethernet addresses are notated in hexadecimal form interspaced with colons, +e.g. 00:00:5e:00:53:ff. +The Ethernet addresses in the range from 00:00:5e:00:53:00 (inclusive) to +00:00:5e:00:53:ff (inclusive) are reserved for documentation. +The address with all bits set +.Pq ff:ff:ff:ff:ff:ff +is the broadcast address and can be conveniently accessed using the +.Xr etheraddr_broadcast 3 +which is initialized to +.Dv ETHERADDR_BROADCAST_INIT . +.Pp +Ethernet packets are transmitted starting with an eight-byte start-of-frame +preamble, followed by the Ethernet frame itself, ending with a twelve-byte +interpacket gap. 
+Ethernet 2.0 frames consist of a header, a datagram payload, and a frame check +sequence. +Frames are at least 64 bytes and are at most 1518 bytes. +With 18 bytes of Ethernet frame overhead, the minimum transmission unit of the +payload datagram is 46 bytes and the maximum transmission unit is 1500 bytes. +The size of the frame is implicitly determined by the interpacket gap. +The header has the format of +.Vt struct ether_header , +starting with the destination host address +.Pq Va ether_dhost , +followed by the source host address +.Pq Va ether_shost , +ending with a big-endian 16-bit EtherType value denoting the protocol associated +with the payload datagram +.Pq Va ether_type . +The frame check sequence has the format of +.Vt struct ether_footer , +being a 32-bit cyclic redundancy checksum of the frame except the checksum +itself +.Pq Va ether_crc . +.Pp +Frames are discarded on receipt if the destination address is neither the local +address nor the broadcast address, if the source address is the broadcast +address, or if the checksum is invalid. +.Pp +Network layer addresses can be resolved to Ethernet addresses using the +Address Resolution Protocol +.Xr arp 4 . +.Pp +The +.Va ether_type +field denotes the payload datagram protocol and the following values are +supported: +.Bl -tag -width "12345678" +.It Dv ETHERTYPE_IP Pq Li 0x0800 +The Internet Protocol version 4 +.Xr ip 4 . +.It Dv ETHERTYPE_ARP Pq Li 0x0806 +The Address Resolution Protocol +.Xr arp 4 . +.El +.Pp +The following constants are defined in +.In netinet/if_ether.h : +.Bl -tag -width "12345678" +.It Dv ETHER_ADDR_LEN Li 6 +The size of an Ethernet address in bytes. +.It Dv ETHER_TYPE_LEN Li 2 +The size of the type field in the Ethernet header in bytes. +.It Dv ETHER_HDR_LEN Li 14 +The size of the Ethernet header in bytes. +.It Dv ETHER_CRC_LEN Li 4 +The size of the Ethernet frame check sequence in bytes. 
+.It Dv ETHER_LEN Li 18 +The total size of the Ethernet header and the frame check sequence in bytes. +.It Dv ETHER_MIN_LEN Li 64 +The minimum size of Ethernet frames. +.It Dv ETHER_MAX_LEN Li 1518 +The maximum size of Ethernet frames. +.It Dv ETHERMTU Li 1500 +The maximum transmission unit for Ethernet payloads. +.It Dv ETHERMIN Li 46 +The minimum transmission unit for Ethernet payloads. +.It Dv ETHERADDR_BROADCAST_INIT Li {{0xff, 0xff, 0xff, 0xff, 0xff, 0xff}} +An initializer list for +.Vt struct ether_addr +that initializes it to the broadcast address ff:ff:ff:ff:ff:ff. +.El +.Sh ERRORS +Socket operations can fail due to these error conditions, in addition to the +error conditions of the invoked function. +.Bl -tag -width [EADDRNOTAVAIL] +.It Bq Er EMSGSIZE +The datagram was too large to be sent because it exceeded the maximum +transmission unit (MTU) (1500 bytes) of the Ethernet protocol. +.It Bq Er ENOBUFS +There was not enough memory available for network packets. +.El +.Sh SEE ALSO +.Xr etheraddr_broadcast 3 , +.Xr arp 4 , +.Xr if 4 , +.Xr inet 4 , +.Xr ip 4 , +.Xr kernel 7 +.Sh STANDARDS +.Rs +.%A Digital Equipment Corporation +.%A Intel Corporation +.%A Xerox Corporation +.%D November 1982 +.%R Version 2.0 +.%T The Ethernet - A Local Area Network +.Re +.Pp +.Rs +.%A C. Hornig +.%D April 1984 +.%R STD 41 +.%R RFC 894 +.%T A Standard for the Transmission of IP Datagrams over Ethernet Networks +.%Q Symbolics Cambridge Research Center +.Re +.Pp +.Rs +.%A Internet Engineering Task Force +.%A R. Braden (ed.) +.%D October 1989 +.%R STD 3 +.%R RFC 1122 +.%T Requirements for Internet Hosts -- Communication Layers +.%Q USC/Information Sciences Institute +.Re +.Rs +.%A Internet Engineering Task Force +.%A D. Eastlake 3rd +.%A J. Abley +.%D October 2013 +.%R RFC 7042 +.%T IANA Considerations and IETF Protocol and Documentation Usage for IEEE 802 Parameters +.%Q Huawei +.%Q Dyn, Inc. 
+.Re diff --git a/share/man/man4/icmp.4 b/share/man/man4/icmp.4 new file mode 100644 index 00000000..c4b324d6 --- /dev/null +++ b/share/man/man4/icmp.4 @@ -0,0 +1,35 @@ +.Dd June 4, 2017 +.Dt ICMP 4 +.Os +.Sh NAME +.Nm icmp +.Nd internet control message protocol +.Sh SYNOPSIS +.In sys/socket.h +.In netinet/in.h +.Sh DESCRIPTION +The Internet Control Message Protocol (ICMP) is a control and error reporting +layer for the Internet Protocol version 4 +.Xr ip 4 . +ICMP is commonly used to diagnose network problems, such as the destination +being unreachable, network congestion, packets exceeding their time to live, and +so on. +.Sh SEE ALSO +.Xr inet 4 , +.Xr ip 4 , +.Xr ping 4 , +.Xr kernel 7 +.Sh STANDARDS +.Rs +.%A J. Postel (ed.) +.%D September 1981 +.%R STD 5 +.%R RFC 792 +.%T Internet Control Message Protocol - DARPA Internet Program Protocol Specification +.%Q USC/Information Sciences Institute +.Re +.Sh BUGS +ICMP is not implemented yet, except for automatically replying to the Echo Request +message (ping) and the existence of +.Xr ping 4 +sockets for sending Echo Request messages and receiving Echo Reply messages. diff --git a/share/man/man4/if.4 b/share/man/man4/if.4 new file mode 100644 index 00000000..a99f4c3a --- /dev/null +++ b/share/man/man4/if.4 @@ -0,0 +1,377 @@ +.Dd June 11, 2017 +.Dt IF 4 +.Os +.Sh NAME +.Nm if +.Nd network interface +.Sh SYNOPSIS +.In sys/ioctl.h +.In net/if.h +.Sh DESCRIPTION +Network interfaces are devices that provide transmission and receipt of network +packets. +The name of a network interface is the name of the device driver followed by the +driver instance number, and each network interface has a unique index number +distinct from the driver instance number. +The name length is restricted to +.Dv IF_NAMESIZE +bytes including the trailing nul byte.
+Network interfaces are exposed in the filesystem as +.Pa /dev/ Ns Ar name Ns Ar X +devices, where +.Ar name +is the driver name, and the +.Ar X +number denotes which device using that driver. +Each driver is documented with a manual page with the driver's name in section +4 of the manual. +.Pp +The state of the network interface can be awaited with +.Xr poll 2 +where the +.Dv POLLIN +and +.Dv POLLOUT +events are signaled when the network is up +.Dv ( IF_STATUS_FLAGS_UP ) . +.Sh LINK LAYER +Network interfaces abstracts a hardware device or a software device as a link +layer protocol: +.Bl -tag -width "12345678" +.It Ethernet Controller Pq Dv IF_TYPE_ETHERNET +Packets are received and transmitted with the Ethernet +.Xr ether 4 +link layer protocol. +The +.Va type +field of +.Vt struct if_info +is set to +.Dv IF_TYPE_ETHERNET +and the +.Va addr +field contains the 6-byte Ethernet address assigned to the Ethernet controller. +.It Loopback Device Pq Dv IF_TYPE_LOOPBACK +The software loopback device +.Xr lo 4 +on the local host receives any packets transmitted on it. +The +.Va type +field of +.Vt struct if_info +is set to +.Dv IF_TYPE_LOOPBACK +and the +.Va addr +field is unused. +.El +.Sh NETWORK LAYER +Network layer protocols are layered on top of the link layer: +.Bl -tag -width "12345678" +.It Internet Protocol version 4 Pq Dv AF_INET +The Internet Protocol version 4 +.Xr ip 4 +provides the network layer of the Internet Protocol version 4 protocol family +.Xr inet 4 , +containing transport protocols such as the Transmission Control Protocol +.Xr tcp 4 , +and the User Datagram Protocol +.Xr udp 4 . +When combined with the Ethernet link layer, the Address Resolution Protocol +.Xr arp 4 +is used to resolve network layer addresses into link layer addresses. 
+.El +.Sh CONFIGURATION +The static information about a network interface is stored in +.Vt struct if_info : +.Bd -literal +struct if_info { + unsigned int linkid; + int type; + int features; + size_t addrlen; + char name[IF_NAMESIZE]; + unsigned char addr[IF_HWADDR_MAXSIZE]; +}; +.Ed +.Pp +.Va linkid +is the network interface's index number. +.Va type +is the link layer protocol. +.Va features +is a bit mask of the features provided by the network interface: +.Bl -tag -width "12345678" +.It IF_FEATURE_ETHERNET_CRC_OFFLOAD +The Ethernet CRC32 checksum is computed in hardware. +.El +.Pp +.Va addrlen +is the size of the interface's assigned hardware address stored in the +.Va addr +field. +.Va name +is the nul-terminated string name of the network interface. +.Pp +The status information about a network interface is stored in +.Vt struct if_status : +.Bd -literal +struct if_status { + int flags; + size_t mtu; +}; +.Ed +.Pp +.Va flags +is a bit mask of network interface status conditions: +.Bl -tag -width "12345678" +.It IF_STATUS_FLAGS_UP +The network interface link is up and packets can be received and transmitted. +.El +.Pp +.Va mtu +is the maximum transmission unit of the network layer datagram that can be +sent or transmitted on the link layer. +.Pp +The configuration of the network interface is stored in +.Vt if_config : +.Bd -literal +struct if_config_ether { + struct ether_addr address; +}; + +struct if_config_inet { + struct in_addr address; + struct in_addr router; + struct in_addr subnet; +}; + +struct if_config { + struct if_config_ether ether; + struct if_config_inet inet; +}; +.Ed +.Pp +.Va ether +is the configuration of the +.Xr ether 4 +link layer protocol where +.Va address +is the Ethernet address that received packets must have as the destination +address and the address used as the source address in transmitted packets. 
+.Va address +defaults on network interface creation to the value of the +.Va addr +field of the network interface's +.Vt struct if_info . +.Pp +.Va inet +is the configuration of the +.Xr ip 4 +network layer protocol where +.Va address +is the local address, +.Va router +is the default route, and +.Va subnet +is the subnet mask. +The protocol is disabled if +.Va address +is set to the any address +.Pq 0.0.0.0 . +.Pp +Configuration changes to the local addresses or routing information will cause +the remote side of existing sockets to become unreachable where paths are no +longer configured. +Currently outgoing packets are unaffected by configuration changes when they +have left the appropriate network layers. +Outgoing packets may be queued for a short period in queues such as +the data link layer address resolution queue or in the transmission queue. +.Sh IOCTLS +Network interfaces provide the following +.Xr ioctl 2 +requests defined in +.In sys/ioctl.h : +.Bl -tag -width "12345678" +.It Dv IOCGETTYPE Fa "void" +Return the device +.Fa type +that as a parameter to the +.Dv IOC_TYPE(int type) +macro returns +.Dv IOC_TYPE_NETWORK_INTERFACE +if the device is a network interface. +.It Dv NIOC_GETCONFIG Fa "struct if_config *" +Retrieve the network interface configuration for all protocols atomically. +.It Dv NIOC_GETCONFIG_ETHER Fa "struct if_config_ether *" +Retrieve the Ethernet configuration. +.It Dv NIOC_GETCONFIG_INET Fa "struct if_config_inet *" +Retrieve the Internet Protocol version 4 configuration. +.It Dv NIOC_GETINFO Fa "struct if_info *" +Retrieve the network interface static information. +.It Dv NIOC_GETSTATUS Fa "struct if_status *" +Retrieve the network interface status. +.It Dv NIOC_SETCONFIG Fa "const struct if_config *" +Set the network interface configuration for all protocols atomically. +.It Dv NIOC_SETCONFIG_ETHER Fa "const struct if_config_ether *" +Set the Ethernet configuration.
+.It Dv NIOC_SETCONFIG_INET Fa "const struct if_config_inet *" +Set the Internet Protocol version 4 configuration. +.El +.Sh SOCKET OPTIONS +Sockets are made with +.Xr socket 2 +by requesting the desired network layer protocol and the desired transport layer +protocol. +These +.Xr setsockopt 2 / +.Xr getsockopt 2 +options of level +.Dv SOL_SOCKET +control aspects related to the network interface and are defined in +.In sys/socket.h : +.Bl -tag -width "12345678" +.It Dv SO_BINDTODEVICE Fa "char[]" +Sets the network interface the socket is bound to by looking up the string value +(which need not be nul-terminated) as a network interface name, and then +binding the socket to that network interface index number; or failing with +.Er ENODEV +if no such device exists. +Gets the name of the network interface the socket is bound to, by looking up the +network interface index number the socket is bound to, and copying out the name +of that network interface; or copying out the empty string if no such device +exists. +If bound to a network interface, a socket will only receive from and transmit on +that network interface. +(Initially the empty string) +.It Dv SO_BINDTOINDEX Fa "unsigned int" +Sets the network interface the socket is bound to by the network interface index +number, not verifying such a network interface exists, returning with the error +.Er EINVAL +if the requested index number exceeds +.Dv UINT_MAX . +Gets the index number of the network interface the socket is bound to. +Index 0 means no network interface. +If bound to a network interface, a socket will only receive from and transmit on +that network interface. +(Initially 0) +.It Dv SO_BROADCAST Fa "int" +Sending to a broadcast address is allowed when set to 1, sending to a broadcast +address will fail with +.Er EACCES +when set to 0. +This option is boolean, setting it to non-zero is the same as setting it to 1. +This option only pertains to datagram sockets.
+(Initially 0) +.It Dv SO_DEBUG Fa "int" +Whether the socket is in debug mode. +This option is not implemented. +This option is boolean, setting it to non-zero is the same as setting it to 1. +Attempting to set it to non-zero will fail with +.Er EPERM . +(Initially 0) +.It Dv SO_DOMAIN Fa "sa_family_t" +The socket domain (the address family). +This option can only be read. +The initial value is set when making the socket. +.It Dv SO_DONTROUTE Fa "int" +Whether to bypass the routing table and only send on the local network. +This option is not implemented. +This option is boolean, setting it to non-zero is the same as setting it to 1. +Attempting to set it to non-zero will fail with +.Er EPERM . +(Initially 0) +.It Dv SO_ERROR Fa "int" +The asynchronous pending error +(an +.Xr errno 3 +value). +Cleared to 0 when read unless the error is permanent. +This option can only be read. +(Initially 0) +.It Dv SO_PROTOCOL Fa "int" +The socket protocol. +This option can only be read. +The initial value is set when making the socket. +.It Dv SO_RCVBUF Fa "int" +How many bytes the receive queue can use. +Setting this option to a value beyond the socket's hard limit will instead set +this option to the hard limit. +The initial value depends on the socket protocol. +.It Dv SO_REUSEADDR Fa "int" +Don't fail to +.Xr bind 2 +the second socket with +.Er EADDRINUSE +when one socket is bound to the any address and a port and the other socket is +bound to another address and that port, whenever this option is set to 1. +This option is boolean, setting it to non-zero is the same as setting it to 1. +(Initially 0) +.It Dv SO_SNDBUF Fa "int" +How many bytes the send queue can use. +Setting this option to a value beyond the socket's hard limit will instead set +this option to the hard limit. +The initial value depends on the socket protocol. +.It Dv SO_TYPE Fa "int" +The socket type. +This option can only be read. +The initial value is set when making the socket. 
+.El +.Sh IMPLEMENTATION NOTES +Network packets waiting to be transmitted or received have 384 dedicated pages +of backing memory (allocated on first use). +If more packets are needed, available system memory is used up to a limit of +1/16 of the total system memory. +If no memory is available for another network packet or the limit is hit, +received packets may be dropped and transmitted packets may be dropped or +temporarily fail with +.Er ENOBUFS . +.Sh SEE ALSO +.Xr getsockopt 2 , +.Xr ioctl 2 , +.Xr setsockopt 2 , +.Xr arp 4 , +.Xr ether 4 , +.Xr inet 4 , +.Xr ip 4 , +.Xr lo 4 , +.Xr kernel 7 +.Sh STANDARDS +.St -p1003.1-2008 +only specifies a minimal +.In net/if.h +with +.Dv IF_NAMESIZE , +.Vt struct if_nameindex +and the +.Xr if_nameindex 3 +family of functions. +.Pp +.St -p1003.1-2008 +specifies the socket options +.Dv SO_ACCEPTCONN , +.Dv SO_BROADCAST , +.Dv SO_DEBUG , +.Dv SO_DONTROUTE , +.Dv SO_ERROR , +.Dv SO_KEEPALIVE , +.Dv SO_LINGER , +.Dv SO_OOBINLINE , +.Dv SO_RCVBUF , +.Dv SO_RCVLOWAT , +.Dv SO_RCVTIMEO , +.Dv SO_REUSEADDR , +.Dv SO_SNDBUF , +.Dv SO_SNDLOWAT , +.Dv SO_SNDTIMEO , +and +.Dv SO_TYPE +in +.In sys/socket.h +.Sh HISTORY +Network interfaces as described here originally appeared in Sortix 1.1, except +when noted otherwise. +.Pp +The +.Dv SO_BINDTODEVICE +socket option is also found on Linux. 
diff --git a/share/man/man4/inet.4 b/share/man/man4/inet.4 new file mode 100644 index 00000000..395433e6 --- /dev/null +++ b/share/man/man4/inet.4 @@ -0,0 +1,214 @@ +.Dd June 1, 2017 +.Dt INET 4 +.Os +.Sh NAME +.Nm inet +.Nd internet protocol family +.Sh SYNOPSIS +.In sys/socket.h +.In netinet/in.h +.Pp +.Fd #define AF_INET 1 +.Bd -literal +typedef uint16_t sa_family_t; +typedef uint16_t in_port_t; +typedef uint32_t in_addr_t; + +struct in_addr { + in_addr_t s_addr; +}; + +struct sockaddr_in { + sa_family_t sin_family; + in_port_t sin_port; + struct in_addr sin_addr; +}; +.Ed +.Sh DESCRIPTION +The Internet Protocol version 4 protocol family is a set of protocols using the +Internet Protocol version 4 +.Xr ip 4 +as the network layer. +.Dv SOCK_STREAM +sockets are provided by the Transmission Control Protocol +.Xr tcp 4 . +.Dv SOCK_DGRAM +sockets are provided by the User Datagram Protocol +.Xr udp 4 . +.Pp +Hosts are addressed with a four byte Internet Protocol (IP) address stored in a +.Vt struct in_addr +in network byte order. +IP addresses are notated by +the decimal byte values interspaced with periods, e.g. 192.0.2.255. +Subnetworks are ranges of IP addresses given by a starting IP address along with +how many leading bits (most significant bit first) of the IP address are common +to the network (the prefix); the first address of the subnetwork is the +prefix with the remaining bits set to zero, and the last address is the prefix +with the +remaining bits set to one. +A subnetwork can be denoted by the starting IP address and the leading bits, +e.g. 198.51.100.0/24 spans from 198.51.100.0 to 198.51.100.255. +The subnet mask of a subnetwork is an IP address with the bits set that belong +to the network prefix, e.g. the subnet mask of 203.0.113.0/24 is 255.255.255.0. +The subnetworks 192.0.2.0/24 (TEST-NET-1), 198.51.100.0/24 (TEST-NET-2), and +203.0.113.0/24 (TEST-NET-3) are reserved for documentation.
+The last address in a network is normally the broadcast address for the network. +.Pp +Services are addressed +with a 16-bit port number in an +.Vt in_port_t +in network byte order. +.Pp +Sockets are addressed with the combination of a host address and port number +stored in a +.Vt struct sockaddr_in +where +.Va sin_family +is set to +.Dv AF_INET , +.Va sin_port +is set to the 16-bit port number in network byte order, and +.Va sin_addr +is set to the host address in network byte order. +.Pp +.Va sin_port +can be set to +.Li 0 +(converted to network byte order) +to request that +.Xr bind 2 +allocate a port. +Port +.Li 0 +is not valid as a destination port. +.Pp +.Va sin_addr.s_addr +can be set to +.Dv INADDR_ANY +.Pq 0.0.0.0 +(converted to network byte order) to mean an unspecified address. +When a socket is bound to the address +.Dv INADDR_ANY , +messages are accepted from any address. +In +.Xr connect 2 +and +.Xr sendto 2 , +the destination address +.Dv INADDR_ANY +means the current host. +.Pp +.Va sin_addr.s_addr +can be set to +.Dv INADDR_BROADCAST +.Pq 255.255.255.255 +(converted to network byte order), +the broadcast address of the local network. +.Pp +.Va sin_addr.s_addr +can be set to +.Dv INADDR_LOOPBACK +.Pq 127.0.0.1 +(converted to network byte order), the address of the loopback interface +.Xr lo 4 . +.Dv INADDR_LOOPMASK +.Pq 255.0.0.0 +contains the subnet mask of the loopback interface. +.Pp +Sockets of this protocol family can be created by passing +.Dv AF_INET +as the +.Fa domain +parameter of +.Xr socket 2 . +.Pp +The network byte order is big-endian. +.Pp +IP addresses in 32-bit integer format in host byte order can be converted +to network byte order using +.Xr htobe32 3 +or +.Xr htonl 3 +and back using +.Xr be32toh 3 +or +.Xr ntohl 3 . +.Pp +Port numbers in 16-bit integer format in host byte order can be converted +to network byte order using +.Xr htobe16 3 +or +.Xr htons 3 +and back using +.Xr be16toh 3 +or +.Xr ntohs 3 .
+.Pp +.Xr inet_pton 3 +can be used to convert an IP address from textual representation to binary +representation. +.Xr inet_ntop 3 +can be used to convert an IP address from binary representation to textual +representation. +.Sh EXAMPLES +This example manually constructs and deconstructs a +.Vt struct sockaddr_in . +.Bd -literal +struct sockaddr_in saddr; + +memset(&saddr, 0, sizeof(saddr)); +saddr.sin_family = AF_INET; +saddr.sin_addr.s_addr = htobe32(0xC0000201); /* 192.0.2.1 */ +saddr.sin_port = htobe16(1234); + +sa_family_t family = saddr.sin_family; +in_addr_t addr = be32toh(saddr.sin_addr.s_addr); +in_port_t port = be16toh(saddr.sin_port); +.Ed +.Sh COMPATIBILITY +On some operating systems, +.Vt struct sockaddr_in +may contain padding and additional members and the structure should be +initialized to zero prior to initializing its members. +.Sh SEE ALSO +.Xr socket 2 , +.Xr arp 4 , +.Xr icmp 4 , +.Xr ip 4 , +.Xr ping 4 , +.Xr tcp 4 , +.Xr udp 4 , +.Xr kernel 7 +.Sh STANDARDS +.Rs +.%A J. Postel (ed.) +.%D September 1981 +.%R STD 5 +.%R RFC 791 +.%T Internet Protocol - DARPA Internet Program Protocol Specification +.%Q USC/Information Sciences Institute +.Re +.Pp +.Rs +.%A Internet Engineering Task Force +.%A J. Arkko +.%A M. Cotton +.%A L. Vegoda +.%D January 2010 +.%R RFC 5737 +.%T IPv4 Address Blocks Reserved for Documentation +.%Q Ericsson +.%Q ICANN +.Re +.Pp +The protocol family programming interface conforms to +.St -p1003.1-2008 . +.Sh BUGS +The network stack implementation is incomplete and has known bugs. +See the protocol manual pages for more information. +.Pp +The 4-byte address space allows only a maximum of 4294967296 addresses and is +being exhausted. +The Internet Protocol version 6 replaces version 4 and provides a 16-byte +address space instead.
diff --git a/share/man/man4/ip.4 b/share/man/man4/ip.4 new file mode 100644 index 00000000..01d01f86 --- /dev/null +++ b/share/man/man4/ip.4 @@ -0,0 +1,135 @@ +.Dd June 3, 2017 +.Dt IP 4 +.Os +.Sh NAME +.Nm ip +.Nd internet protocol +.Sh SYNOPSIS +.In sys/socket.h +.In netinet/in.h +.Ft int +.Fn socket AF_INET type protocol +.Sh DESCRIPTION +The Internet Protocol version 4 is the original network layer protocol of the +Internet and provides best-effort delivery of datagrams between hosts. +It provides for addressing of hosts, routing over packet-switched networks, +fragmentation and reassembly of datagrams across networks with small maximum +transmission unit sizes; but it does not provide guarantee of delivery, +avoidance of delivering multiple times, ordering, flow control, nor data +integrity. +Its protocol family +.Xr inet 4 +can be layered on top of the Internet Protocol to provide the enhanced service +of the transport layer. +For instance, the Transmission Control Protocol +.Xr tcp 4 +can be used to provide multiplexed reliable communication across the Internet, +while the User Datagram Protocol +.Xr udp 4 +can be used to provide low-overhead multiplexed unreliable communication across +the Internet. +.Pp +Datagrams contain a header followed by a datagram of the above protocol layer. +The header contains the Internet Protocol version (4), the header size, the +desired type of service, the datagram size, information for the reassembly of +fragmented datagrams, the remaining time this datagram has left to live, the +protocol number of the above protocol layer, a checksum of the header, the +address of the source host and the address of the destination host, and an +optional set of options. +.Pp +An incoming datagram on a network interface will be received and passed to the +higher level protocol if the following conditions hold: +.Pp +.Bl -bullet -compact +.It +The checksum is valid. 
+.It +The protocol is Internet Protocol version 4 and the packet is well-formed. +.It +The source address is neither the broadcast address +.Pq 255.255.255.255 +nor the subnet's broadcast address. +.It +If the network interface is not the loopback network interface +.Xr lo 4 , +neither the source nor the destination belong to the loopback subnet +.Pq 127.0.0.0/8 . +.It +The destination address is either the local address (and the link layer +destination address was not a broadcast address) of the network interface, the +broadcast address of the network interface, or the broadcast address +.Pq 255.255.255.255 . +.El +.Sh ERRORS +Socket operations can fail due to these error conditions, in addition to the +error conditions of link layer and the error conditions of the invoked function. +.Bl -tag -width [EADDRNOTAVAIL] +.It Bq Er EACCES +A datagram was sent to a broadcast address, but +.Dv SO_BROADCAST +is turned off. +.It Bq Er EADDRNOTAVAIL +The socket cannot be bound to the requested address because no network interface +had that address or broadcast address. +.It Bq Er ECONNREFUSED +The destination host of a datagram was not listening on the port. +.It Bq Er EHOSTDOWN +The destination host of a datagram is not up. +.It Bq Er EHOSTUNREACH +The destination host of a datagram was unreachable. +.It Bq Er EMSGSIZE +The datagram was too large to be sent because it exceeded the maximum +transmission unit (MTU) on the path between the local and remote address. +.It Bq Er ENETDOWN +The network interface used to deliver a datagram isn't up. +.It Bq Er ENETUNREACH +The destination network of a datagram was unreachable. +.It Bq Er ENOBUFS +There was not enough memory available for network packets. +.El +.Sh SEE ALSO +.Xr arp 4 , +.Xr icmp 4 , +.Xr inet 4 , +.Xr ping 4 , +.Xr tcp 4 , +.Xr udp 4 , +.Xr kernel 7 +.Sh STANDARDS +.Rs +.%A J. Postel (ed.)
+.%D September 1981 +.%R STD 5 +.%R RFC 791 +.%T Internet Protocol - DARPA Internet Program Protocol Specification +.%Q USC/Information Sciences Institute +.Re +.Pp +.Rs +.%A Internet Engineering Task Force +.%A R. Braden (ed.) +.%D October 1989 +.%R STD 3 +.%R RFC 1122 +.%T Requirements for Internet Hosts -- Communication Layers +.%Q USC/Information Sciences Institute +.Re +.Sh BUGS +The implementation is incomplete and has known bugs. +.Pp +Fragmented datagrams are not yet supported and are discarded on receipt. +The fragment identification field is always set to 0, preventing the proper +reassembly of multiple datagrams that became fragmented around the same time. +.Pp +Options are not yet supported and are ignored. +.Pp +The 4-byte address space allows only a maximum of 4294967296 addresses and is +being exhausted. +The Internet Protocol version 6 replaces version 4 and provides a 16-byte +address space instead. +.Pp +There is no routing table that can be configured. +Routing happens by searching for the first appropriate network interface that +can transmit the datagram. +If multiple network interfaces have a default route, the packet is sent using +the default route of the network interface with the lowest index number. diff --git a/share/man/man4/lo.4 b/share/man/man4/lo.4 new file mode 100644 index 00000000..02f5790a --- /dev/null +++ b/share/man/man4/lo.4 @@ -0,0 +1,36 @@ +.Dd May 27, 2017 +.Dt LO 4 +.Os +.Sh NAME +.Nm lo +.Nd loopback network interface +.Sh SYNOPSIS +.Nm /dev/lo Ns Ar X +.Sh DESCRIPTION +.Nm +is a loopback network interface that receives what is sent on it, used for +communication within the local host. +.Nm +is implemented in software and the packets sent on it are not transmitted on the +network. +.Pp +The +.Xr kernel 7 +creates the +.Sy lo0 +network interface on boot and configures it with the +.Xr inet 4 +address +.Dv 127.0.0.1 +in the subnet +.Dv 127.0.0.0/8 . +Packets with source or destination outside this subnet are dropped. 
+.Sh SEE ALSO +.Xr kernel 7 +.Sh CAVEATS +The default +.Xr inet 4 +configuration +of the +.Sy lo0 +network interface must not be changed or local loopback communication may fail. diff --git a/share/man/man4/ping.4 b/share/man/man4/ping.4 new file mode 100644 index 00000000..a12e1270 --- /dev/null +++ b/share/man/man4/ping.4 @@ -0,0 +1,537 @@ +.Dd June 4, 2017 +.Dt PING 4 +.Os +.Sh NAME +.Nm ping +.Nd ping protocol +.Sh SYNOPSIS +.In sys/socket.h +.In netinet/in.h +.In netinet/ping.h +.Ft int +.Fn socket AF_INET SOCK_DGRAM IPPROTO_PING +.Sh DESCRIPTION +The Ping Protocol uses the Echo Request and Echo Reply messages of the Internet +Control Message Protocol (ICMP) to provide a connectionless best-effort echo of +datagrams. +A cooperating host will send back an Echo Reply message containing the same data +as any Echo Request messages it receives. +It is designed for packet-switched networks and provides multiplexing with a +16-bit port number (using the identifier field of the Echo Request and Echo +Reply messages), and basic data integrity checks (16-bit ones' complement sum), +and broadcasting. +It does not provide a guarantee of delivery, avoidance of delivering multiple +times, ordering, out of band data, nor flow control. +.Pp +Ping sockets allow only sending Echo Request messages and receiving Echo Reply +messages. +The kernel will automatically send Echo Reply messages in response to any +received Echo Request Messages. +.Pp +The structure of ping datagrams is a 4 bytes sequence number (in big endian) +followed by 0 or more bytes of an optional payload. +Ping datagrams are sent inside an Echo Request message (with that sequence +number) and received inside an Echo Reply message (also containing a sequence +number). +.Pp +Ping sockets are made with +.Xr socket 2 +by passing an appropriate +.Fa domain +.Dv ( AF_INET ) , +.Dv SOCK_DGRAM +as the +.Fa type , +and +.Dv IPPROTO_PING +as the +.Fa protocol .
+Initially a socket is not bound, it won't receive datagrams, and it does not +have a remote address and port set. +.Pp +A Ping socket has the following state: +.Pp +.Bl -bullet -compact +.It +The address family it belongs to. +.It +The network interface it is bound to (if any) +.Dv ( SO_BINDTODEVICE +and +.Dv SO_BINDTOINDEX ) +(initially none). +.It +The local address and port (when bound) (initially none). +.It +The remote address and port (when connected) (initially none). +.It +A receive queue (initially empty). +.It +Whether the socket has been +.Xr shutdown 2 +for read and/or write (initially neither). +.It +A single pending asynchronous error (if any) +.Dv ( SO_ERROR ) +(initially none). +.It +Whether broadcast datagrams can be sent +.Dv ( SO_BROADCAST ) +(initially no). +.It +Whether binding to the any address and a port doesn't conflict with binding to +another address on the same port +.Dv ( SO_REUSEADDR ) +(initially no). +.It +Limits on the size of the receive and send queues +.Dv ( SO_RCVBUF +and +.Dv SO_SNDBUF ) . +.El +.Pp +Datagrams are sent as a packet with a header and the datagram itself. +The header contains the port and the checksum. +The header is 8 bytes. +.Pp +Port numbers are 16-bit and range from 1 to 65535. +Port 0 is not valid. +Binding to port 0 will assign an available port on the requested address. +Sending or connecting to port 0 will fail with +.Er EADDRNOTAVAIL . +Received Echo Reply packets whose port number is port 0 will be silently +dropped. +Ping ports are distinct from ports in other transport layer protocols. +.Pp +Packets contain a 16-bit ones' complement checksum. +A received packet will be silently discarded if its checksum does not match its +contents. +.Pp +Sockets can be bound to a local address and port with +.Xr bind 2 +(if not already bound), +or a local address and port will be automatically assigned on the first send +or connect operation. +The local address and port can be read with +.Xr getsockname 2 .
+If the socket hasn't been bound, the local address and port is reported as the +any address on port 0. +There are no ports that require superuser privileges. +.Pp +Sockets can be bound to the any address, the broadcast address, the address of +a network interface, or the broadcast address of a network interface. +Binding to port 0 will automatically assign an available port on the requested +local address or fail with +.Er EAGAIN +if no port is available. +No two sockets can bind to the same local address and port. +No two sockets can be bound such that one is bound to the any address and a +port, and the other socket is bound to another address and the same port; unless +both sockets had the +.Dv SO_REUSEADDR +socket option set when the second socket was bound, and the current user is the +same that bound the first socket or the current user has superuser privileges. +.Pp +A socket bound to a local address and port will receive an incoming datagram if +the following conditions hold: +.Pp +.Bl -bullet -compact +.It +The datagram belongs to the socket's address family and the protocol is Ping. +.It +The datagram's checksum matches the datagram. +.It +The datagram is an Echo Reply message. +.It +The datagram's port number is not port 0. +.It +The datagram is sent to the address or broadcast address of the network +interface it is received on, or the datagram was sent to the broadcast address; +.It +The socket is either bound to the receiving network interface, or the socket is +not bound to a network interface; +.It +The datagram is sent to the socket's local port; +.It +The datagram is sent to the socket's local address, or the socket's local +address is the any address (and no other socket is bound to the datagram's +address and that port); +.It +The socket is connected and the datagram was sent from the remote address and +the remote port, or the socket is not connected; and +.It +The socket is not shut down for reading.
+.El +.Pp +If so, the datagram is added to the socket's receive queue, otherwise it is +discarded. +The receive queue contains incoming packets waiting to be received. +Incoming packets are dropped if the receive queue is full. +Shrinking the receive queue limit drops packets as needed to stay below the +limit. +.Pp +The remote address and port can be set multiple times with +.Xr connect 2 , +after which the socket is said to be connected, but Ping is connectionless and +no handshake is sent. +The remote port must not be port 0 or the connection will fail with +.Er EADDRNOTAVAIL . +If the socket is not bound, +.Xr connect 2 +will determine which network interface will be used to send to the remote +address, and then bind to the address of that network interface together with an +available port. +.Xr connect 2 +will fail if there is no route from the local address to the requested remote +address. +A connected socket only receives datagrams from the remote address and port. +.Xr connect 2 +will drop datagrams in the receive queue that don't originate from the +requested remote address. +The +.Xr send 2 , +.Xr write 2 , +and +.Xr writev 2 +functions can be used on a connected socket and they send to the remote address +and port by default. +If the socket is connected, the destination given to +.Xr sendto 2 +and +.Xr sendmsg 2 +must be +.Dv NULL . +The remote address and port can be read with +.Xr getpeername 2 . +.Pp +The socket can be disconnected by connecting to a socket address with the family +value set to +.Dv AF_UNSPEC , +which resets the remote address and port (if set), and otherwise has no effect. +.Pp +Datagrams can be sent with +.Xr sendmsg 2 +and +.Xr sendto 2 . +Sending on a socket not bound to a local address and port will bind to the +any address and an available port, or fail with +.Er EAGAIN +if no port is available. +Datagrams can be received with +.Xr recvmsg 2 , +.Xr recvfrom 2 , +.Xr recv 2 , +.Xr read 2 , +and +.Xr readv 2 .
+If an asynchronous error is pending, the next send and receive operation will +fail with that error and clear the asynchronous error, so the next operation can +succeed. +Asynchronous errors can arise from network problems. +There is no send queue at the Ping level and datagrams are directly forwarded to +the network layer. +It is an error to use any of the flags +.Dv MSG_CMSG_CLOEXEC , +.Dv MSG_CMSG_CLOFORK , +.Dv MSG_EOR , +.Dv MSG_OOB , +and +.Dv MSG_WAITALL . +.Pp +The condition of the socket can be tested with +.Xr poll 2 +where +.Dv POLLIN +signifies a packet has been received (or the socket is shut down for reading), +.Dv POLLOUT +signifies a packet can be sent now (and the socket is not shut down for +writing), +.Dv POLLHUP +signifies the socket is shut down for writing, and +.Dv POLLERR +signifies an asynchronous error is pending. +.Pp +The socket can be shut down for receiving and/or sending with +.Xr shutdown 2 . +The receive queue is emptied when shut down for receive (asynchronous errors are +preserved) and receive operations will succeed with an end of file +condition, but any pending asynchronous errors will take precedence and be +delivered instead. +Sending when shut down for writing will raise +.Dv SIGPIPE +and fail with +.Er EPIPE +(regardless of a pending asynchronous error). +.Pp +Socket options can be set with +.Xr setsockopt 2 +and read with +.Xr getsockopt 2 +and exist on the +.Dv IPPROTO_PING +level as well as applicable underlying protocol levels. +.Pp +Broadcast Echo Requests can be sent by setting the +.Dv SO_BROADCAST +socket option with +.Xr setsockopt 2 +and sending to a broadcast address of the network layer. +RFC 1122 3.2.2.6 allows hosts to ignore broadcast Echo Requests. +.Sh SOCKET OPTIONS +Ping sockets support these +.Xr setsockopt 2 / +.Xr getsockopt 2 +options at level +.Dv SOL_SOCKET : +.Bl -tag -width "12345678" +.It Dv SO_BINDTODEVICE Fa "char[]" +Bind to a network interface by its name.
+(Described in +.Xr if 4 ) +.It Dv SO_BINDTOINDEX Fa "unsigned int" +Bind to a network interface by its index number. +(Described in +.Xr if 4 ) +.It Dv SO_BROADCAST Fa "int" +Whether sending to a broadcast address is allowed. +(Described in +.Xr if 4 ) +.It Dv SO_DEBUG Fa "int" +Whether the socket is in debug mode. +This option is not implemented and is initially 0. +Attempting to set it to non-zero will fail with +.Er EPERM . +(Described in +.Xr if 4 ) +.It Dv SO_DOMAIN Fa "sa_family_t" +The socket +.Fa domain +(the address family). +This option can only be read. +(Described in +.Xr if 4 ) +.It Dv SO_DONTROUTE Fa "int" +Whether to bypass the routing table and only send on the local network. +This option is not implemented and is initially 0. +Attempting to set it to non-zero will fail with +.Er EPERM . +(Described in +.Xr if 4 ) +.It Dv SO_ERROR Fa "int" +The asynchronous pending error +(an +.Xr errno 3 +value). +Cleared to 0 when read. +This option can only be read. +(Described in +.Xr if 4 ) +.It Dv SO_PROTOCOL Fa "int" +The socket protocol +.Dv ( IPPROTO_PING ) . +This option can only be read. +(Described in +.Xr if 4 ) +.It Dv SO_RCVBUF Fa "int" +How many bytes the receive queue can use (default is 64 pages, max 4096 pages). +(Described in +.Xr if 4 ) +.It Dv SO_REUSEADDR Fa "int" +Whether binding to the any address on a port doesn't conflict with binding to +another address and the same port, if both sockets have this option set and the +user binding the second socket is the same that bound the first socket or the +user binding the second socket has superuser privileges. +(Described in +.Xr if 4 ) +.It Dv SO_SNDBUF Fa "int" +How many bytes the send queue can use (default is 64 pages, max 4096 pages). +(Described in +.Xr if 4 ) +.It Dv SO_TYPE Fa "int" +The socket type +.Dv ( SOCK_DGRAM ) . +This option can only be read. +(Described in +.Xr if 4 ) +.El +.Sh IMPLEMENTATION NOTES +Received broadcast echo requests are ignored as permitted by RFC 1122 3.2.2.6. 
+.Pp +Each packet currently uses a page of memory, which counts towards the receive +queue limit. +.Pp +If no specific port is requested, one is randomly selected in the dynamic port +range 32768 (inclusive) through 61000 (exclusive). +.Sh EXAMPLES +This example sends an Echo Request and blocks indefinitely until it receives an +Echo Reply. +.Va remote +is the remote socket address and +.Va remote_len +is the size of +.Va remote . +The +.Va remote +and +.Va remote_len +values should all be chosen according to the address family and network layer. +.Bd -literal +sa_family_t af = /* ... */; +const struct sockaddr *remote = /* ... */; +socklen_t remote_len = /* ... */; + +int fd = socket(af, SOCK_DGRAM, IPPROTO_PING); +if (fd < 0) + err(1, "socket"); +if (connect(fd, remote, remote_len) < 0) + err(1, "connect"); +unsigned char request[56]; +arc4random_buf(request, sizeof(request)); +if (send(fd, request, sizeof(request), 0) < 0) + err(1, "send"); +unsigned char reply[56 + 1 /* detect too large reply */]; +ssize_t amount = recv(fd, reply, sizeof(reply), 0); +if (amount < 0 ) + err(1, "recv"); +if (amount == sizeof(request) && !memcmp(request, reply, sizeof(request))) + printf("correct echo reply\\n"); +else + printf("incorrect echo reply\\n"); +.Ed +.Sh ERRORS +Socket operations can fail due to these error conditions, in addition to the +error conditions of the network and link layer, and the error conditions of the +invoked function. +.Bl -tag -width [EADDRNOTAVAIL] +.It Bq Er EACCES +A datagram was sent to a broadcast address, but +.Dv SO_BROADCAST +is turned off. +.It Bq Er EADDRINUSE +The socket cannot be bound to the requested address and port because another +socket was already bound to 1) the same address and port 2) the any address +and the same port (and +.Dv SO_REUSEADDR +was not set on both sockets), or 3) some address and the same port but the +requested address was the any address (and +.Dv SO_REUSEADDR +was not set on both sockets).
+.It Bq Er EADDRNOTAVAIL +The socket cannot be bound to the requested address because no network interface +had that address or broadcast address. +.It Bq Er EAGAIN +A port could not be assigned because each port in the dynamic port range had +already been bound to a socket in a conflicting manner. +.It Bq Er ECONNREFUSED +The destination host of a datagram was not listening on the port. +This error can happen asynchronously. +.It Bq Er EHOSTDOWN +The destination host of a datagram is not up. +This error can happen asynchronously. +.It Bq Er EHOSTUNREACH +The destination host of a datagram was unreachable. +This error can happen asynchronously. +.It Bq Er EISCONN +A destination address and port was specified when sending a datagram, but the +socket has already been connected to a remote address and port. +.It Bq Er EMSGSIZE +The datagram was too large to be sent because it exceeded the maximum +transmission unit (MTU) on the path between the local and remote address. +This error can happen asynchronously. +.It Bq Er ENETDOWN +The network interface used to deliver a datagram isn't up. +This error can happen asynchronously. +.It Bq Er ENETUNREACH +The destination network of a datagram was unreachable. +This error can happen asynchronously. +.It Bq Er ENETUNREACH +The remote address could not be connected because there was no route from the +local address to the remote address. +.It Bq Er ENOBUFS +There was not enough memory available for network packets. +.It Bq Er EPERM +One of the unimplemented +.Dv SO_DEBUG +and +.Dv SO_DONTROUTE +socket options was attempted to be set to a non-zero value. +.El +.Sh SEE ALSO +.Xr bind 2 , +.Xr connect 2 , +.Xr getpeername 2 , +.Xr getsockname 2 , +.Xr getsockopt 2 , +.Xr poll 2 , +.Xr recvfrom 2 , +.Xr recvmsg 2 , +.Xr sendmsg 2 , +.Xr sendto 2 , +.Xr setsockopt 2 , +.Xr shutdown 2 , +.Xr socket 2 , +.Xr icmp 4 , +.Xr if 4 , +.Xr inet 4 , +.Xr ip 4 , +.Xr kernel 7 +.Sh STANDARDS +.Rs +.%A J. Postel (ed.) 
+.%D September 1981 +.%R STD 5 +.%R RFC 792 +.%T Internet Control Message Protocol - DARPA Internet Program Protocol Specification +.%Q USC/Information Sciences Institute +.Re +.Pp +.Rs +.%A Internet Engineering Task Force +.%A R. Braden (ed.) +.%D October 1989 +.%R STD 3 +.%R RFC 1122 +.%T Requirements for Internet Hosts -- Communication Layers +.%Q USC/Information Sciences Institute +.Re +.Sh HISTORY +Ping sockets originally appeared in Sortix 1.1. +.Sh BUGS +The handling of +.Dv SO_REUSEADDR No in +.Xr bind 2 +does not yet enforce the two sockets to be bound by the same user or the second +socket to be bound by a user with superuser privileges. +The requirement that both sockets have +.Dv SO_REUSEADDR +set might be relaxed to only the second socket having it set when this +permission check is implemented. +.Pp +The integration with the network layer is inadequate and the asynchronous errors +.Er ECONNREFUSED , +.Er EHOSTDOWN , +.Er EHOSTUNREACH , +and +.Er ENETUNREACH +are never delivered asynchronously from the network. +.Pp +Ping sockets do not yet provide access to IP header values such as the Time +To Live and do not yet report ICMP error messages. +.Pp +The +.Xr send 2 +flag +.Dv MSG_DONTROUTE +and the +.Dv SO_DONTROUTE +socket option are not implemented yet. +.Pp +The +.Dv SO_SNDBUF +socket option is currently not used and the send queue is not limited at the +socket level. +.Pp +The automatic assignment of ports is random, but is statistically biased. +A random port is picked, and if it is taken, the search sequentially iterates +ports in ascending order until an available port is found or the search +terminates.
diff --git a/share/man/man4/tcp.4 b/share/man/man4/tcp.4 new file mode 100644 index 00000000..f1c0743f --- /dev/null +++ b/share/man/man4/tcp.4 @@ -0,0 +1,382 @@ +.Dd June 3, 2017 +.Dt TCP 4 +.Os +.Sh NAME +.Nm tcp +.Nd transmission control protocol +.Sh SYNOPSIS +.In sys/socket.h +.In netinet/in.h +.In netinet/tcp.h +.Ft int +.Fn socket AF_INET SOCK_STREAM IPPROTO_TCP +.Sh DESCRIPTION +The Transmission Control Protocol (TCP) is a connection-oriented transport layer +for the Internet Protocol +.Xr ip 4 +that provides a reliable byte stream connection between two hosts. +It is designed for packet-switched networks and provides sequenced data, +retransmissions on packet loss, handling of duplicated packets, flow control, +basic data integrity checks, multiplexing with a 16-bit port number, support for +out-of-band urgent data, and detection of lost connection. +TCP provides the +.Dv SOCK_STREAM +abstraction for the +.Xr inet 4 +protocol family. +.Pp +TCP sockets are made with +.Xr socket 2 +by passing an appropriate +.Fa domain +.Dv ( AF_INET ) , +.Dv SOCK_STREAM +as the +.Fa type , +and 0 or +.Dv IPPROTO_TCP +as the +.Fa protocol . +Newly created TCP sockets are not bound to a local address nor connected to a +remote socket. +.Pp +Port numbers are 16-bit and range from 1 to 65535. +Port 0 is not valid. +Binding to port 0 will assign an available port on the requested address. +Connecting to port 0 will fail with +.Er EADDRNOTAVAIL . +Received packets whose source or destination address is port 0 will be silently +dropped. +TCP ports are distinct from ports in other transport layer protocols. +.Pp +Packets contain a 16-bit ones' complement checksum. +Received packets will be silently discarded if their checksum does not match +the contents. +.Pp +Sockets can be bound to a local address and port with +.Xr bind 2 +(if not already bound), +or a local address and port will be automatically assigned when connected.
+The local address and port can be read with +.Xr getsockname 2 . +If the socket hasn't been bound, the local address and port is reported as the +any address on port 0. +Binding to a well-known port (port 1 through port 1023) requires superuser +privileges. +.Pp +Sockets can be bound to the any address, the broadcast address, the address of +a network interface, or the broadcast address of a network interface. +Binding to port 0 will automatically assign an available port on the requested +local address or fail with +.Er EAGAIN +if no port is available. +No two sockets can bind to the same local address and port. +No two sockets can be bound such that one is bound to the any address and a +port, and the other socket is bound to another address and the same port; unless +both sockets had the +.Dv SO_REUSEADDR +socket option set when the second socket was bound, and the current user is the +same that bound the first socket or the current user has superuser privileges. +.Pp +A connection to a remote TCP socket can be established with +.Xr connect 2 . +Connections can be established when both sides call +.Xr connect 2 +on each other. +If the socket is not bound, +.Xr connect 2 +will determine which network interface will be used to send to the remote +address, and then bind to the address of that network interface together with an +available port. +.Xr connect 2 +will fail if there is no route from the local address to the requested remote +address. +.Pp +Incoming connections can be received by binding to a local address with +.Xr bind 2 +and listening for connections with +.Xr listen 2 , +after which incoming connections can be retrieved with +.Xr accept 2 . +.Pp +Bytes can be received from the remote TCP socket with +.Xr recv 2 , +.Xr recvmsg 2 , +.Xr recvfrom 2 , +.Xr read 2 , +or +.Xr readv 2 . +Bytes can be transmitted to the remote TCP socket with +.Xr send 2 , +.Xr sendmsg 2 , +.Xr sendto 2 , +.Xr write 2 , +or +.Xr writev 2 .
+Transmitting when the connection has broken will result in the process being +sent the +.Dv SIGPIPE +signal and fail with +.Er EPIPE . +.Pp +The receiving socket will acknowledge any received data. +If no acknowledgement is received in a timely manner, the transmitting socket +will transmit the data again. +If an acknowledgement still isn't received after a while, the connection is +considered broken and no further receipt or transmission is possible. +.Pp +The condition of the socket can be tested with +.Xr poll 2 +where +.Dv POLLIN +signifies new data has been received or the remote socket has shut down for +writing or an incoming connection can be retrieved with +.Xr accept 2 , +.Dv POLLOUT +signifies new data can be sent now (and the socket is not shut down for +writing), +.Dv POLLHUP +signifies the socket is shut down for writing, and +.Dv POLLERR +signifies an asynchronous error is pending. +.Pp +The connection can be shut down with +.Xr shutdown 2 +in either the reading direction (discarding further received data) or the +writing direction (which sends the finish control flag). +The connection is closed when both sockets have sent and acknowledged the finish +control flag. +Upon the +.Xr close 2 +of the last file descriptor for a connected socket, the socket is shut down in +both directions. +.Pp +Socket options can be set with +.Xr setsockopt 2 +and read with +.Xr getsockopt 2 +and exist on the +.Dv IPPROTO_TCP +level as well as applicable underlying protocol levels. +.Sh SOCKET OPTIONS +TCP sockets support these +.Xr setsockopt 2 / +.Xr getsockopt 2 +options at level +.Dv SOL_SOCKET : +.Bl -tag -width "12345678" +.It Dv SO_BINDTODEVICE Fa "char[]" +Bind to a network interface by its name. +(Described in +.Xr if 4 ) +.It Dv SO_BINDTOINDEX Fa "unsigned int" +Bind to a network interface by its index number. +(Described in +.Xr if 4 ) +.It Dv SO_DEBUG Fa "int" +Whether the socket is in debug mode. +This option is not implemented and is initially 0.
+Attempting to set it to non-zero will fail with +.Er EPERM . +(Described in +.Xr if 4 ) +.It Dv SO_DOMAIN Fa "sa_family_t" +The socket +.Fa domain +(the address family). +This option can only be read. +(Described in +.Xr if 4 ) +.It Dv SO_ERROR Fa "int" +The asynchronous pending error +(an +.Xr errno 3 +value). +Errors are permanent. +This option can only be read. +(Described in +.Xr if 4 ) +.It Dv SO_PROTOCOL Fa "int" +The socket protocol +.Dv ( IPPROTO_TCP ) . +This option can only be read. +(Described in +.Xr if 4 ) +.It Dv SO_RCVBUF Fa "int" +How many bytes the receive queue can use (default is 64 KiB). +(Described in +.Xr if 4 ) +.It Dv SO_REUSEADDR Fa "int" +Whether binding to the any address on a port doesn't conflict with binding to +another address and the same port, if both sockets have this option set and the +user binding the second socket is the same that bound the first socket or the +user binding the second socket has superuser privileges. +(Described in +.Xr if 4 ) +.It Dv SO_SNDBUF Fa "int" +How many bytes the send queue can use (default is 64 KiB). +(Described in +.Xr if 4 ) +.It Dv SO_TYPE Fa "int" +The socket type +.Dv ( SOCK_STREAM ) . +This option can only be read. +(Described in +.Xr if 4 ) +.El +.Pp +TCP sockets currently implement no +.Xr setsockopt 2 / +.Xr getsockopt 2 +options at level +.Dv IPPROTO_TCP . +.Sh IMPLEMENTATION NOTES +Connections time out when a segment has not been acknowledged by the remote +socket after 6 attempts to deliver the segment. +Each retransmission happens after 1 second plus 1 second per failed +transmissions so far. +Successful delivery of any segment resets the retransmission count to 0. +.Pp +The receive and transmission buffers are both 64 KiB by default. +.Pp +If no specific port is requested, one is randomly selected in the dynamic port +range 32768 (inclusive) through 61000 (exclusive). 
+.Pp +The Maximum Segment Lifetime (MSL) is set to 30 seconds and the quiet time of +two MSLs before reusing sockets is 60 seconds. +.Sh ERRORS +Socket operations can fail due to these error conditions, in addition to the +error conditions of the network and link layer, and the error conditions of the +invoked function. +.Bl -tag -width [EADDRNOTAVAIL] +.It Bq Er EADDRINUSE +The socket cannot be bound to the requested address and port because another +socket was already bound to 1) the same address and port 2) the any address +and the same port (and +.Dv SO_REUSEADDR +was not set on both sockets), or 3) some address and the same port but the +requested address was the any address (and +.Dv SO_REUSEADDR +was not set on both sockets). +.It Bq Er EADDRNOTAVAIL +The socket cannot be bound to the requested address because no network interface +had that address or broadcast address. +.It Bq Er EADDRNOTAVAIL +The socket was connected to port 0. +.It Bq Er EAGAIN +A port could not be assigned because each port in the dynamic port range had +already been bound to a socket in a conflicting manner. +.It Bq Er ECONNREFUSED +The destination host refused the connection. +.It Bq Er ECONNRESET +The connection was reset by the remote socket. +.It Bq Er EHOSTDOWN +The destination host is not up. +This error can happen asynchronously. +.It Bq Er EHOSTUNREACH +The destination host was unreachable. +This error can happen asynchronously. +.It Bq Er ENETDOWN +The network interface isn't up. +This error can happen asynchronously. +.It Bq Er ENETUNREACH +The destination network was unreachable. +This error can happen asynchronously. +.It Bq Er ENETUNREACH +The remote address could not be connected because there was no route from the +local address to the remote address. +.It Bq Er ENOBUFS +There was not enough memory available for network packets. +.It Bq Er EPERM +The unimplemented +.Dv SO_DEBUG +socket option was attempted to be set to a non-zero value.
+.It Bq Er EPIPE +The transmission failed because the connection is broken. +The +.Dv SIGPIPE +signal is sent as well unless disabled. +.It Bq Er ETIMEDOUT +The connection timed out delivering a segment. +This error can happen asynchronously. +.El +.Sh SEE ALSO +.Xr accept 2 , +.Xr bind 2 , +.Xr connect 2 , +.Xr getpeername 2 , +.Xr getsockname 2 , +.Xr getsockopt 2 , +.Xr poll 2 , +.Xr recv 2 , +.Xr recvfrom 2 , +.Xr recvmsg 2 , +.Xr send 2 , +.Xr sendmsg 2 , +.Xr sendto 2 , +.Xr setsockopt 2 , +.Xr shutdown 2 , +.Xr socket 2 , +.Xr if 4 , +.Xr inet 4 , +.Xr ip 4 , +.Xr kernel 7 +.Sh STANDARDS +.Rs +.%A J. Postel (ed.) +.%D September 1981 +.%R STD 7 +.%R RFC 793 +.%T Transmission Control Protocol +.%Q USC/Information Sciences Institute +.Re +.Pp +.Rs +.%A Internet Engineering Task Force +.%A R. Braden (ed.) +.%D October 1989 +.%R STD 3 +.%R RFC 1122 +.%T Requirements for Internet Hosts -- Communication Layers +.%Q USC/Information Sciences Institute +.Re +.Pp +.St -p1003.1-2008 specifies the TCP socket programming interface. +.Sh BUGS +The implementation is incomplete and has known bugs. +.Pp +Out-of-band data is not yet supported and is ignored on receipt. +.Pp +The round trip time is not estimated which prevents efficient retransmission +when data is lost. +Retransmissions happen after a second, which means unnecessary retransmissions +happen if the round trip time is more than a second. +.Pp +Options are not supported and are ignored on receipt. +.Pp +No extensions are implemented yet that improve efficiency for long fast networks +with large bandwidth * delay products. +.Pp +There is not yet any support for sending keep-alive packets. +.Pp +There is not yet any support for respecting +.Xr icmp 4 +conditions such as destination unreachable or source quench. +.Pp +Half-open connections use memory, but until the handshake is complete, it is not +confirmed whether the remote is actually able to transmit from the source +address.
+An attacker may be able to transmit many packets from forged addresses, +reaching the limit on pending TCP sockets in the listen queue and thus deny +service to further legitimate connections. +A SYN queue or SYN cookies would mitigate this problem, but neither is yet +implemented. +.Pp +.Xr bind 2 +does not yet enforce that binding to a well-known port (port 1 through port +1023) requires superuser privileges. +.Pp +The automatic assignment of ports is random, but is statistically biased. +A random port is picked, and if it is taken, the search sequentially iterates +ports in ascending order until an available port is found or the search +terminates. diff --git a/share/man/man4/udp.4 b/share/man/man4/udp.4 new file mode 100644 index 00000000..5d43653f --- /dev/null +++ b/share/man/man4/udp.4 @@ -0,0 +1,699 @@ +.Dd June 4, 2017 +.Dt UDP 4 +.Os +.Sh NAME +.Nm udp +.Nd user datagram protocol +.Sh SYNOPSIS +.In sys/socket.h +.In netinet/in.h +.In netinet/udp.h +.Ft int +.Fn socket AF_INET SOCK_DGRAM IPPROTO_UDP +.Sh DESCRIPTION +The User Datagram Protocol (UDP) is a connectionless transport layer for the +Internet Protocol +.Xr ip 4 +that provides best-effort delivery of datagrams. +It is designed for packet-switched networks and provides multiplexing with a +16-bit port number, basic data integrity checks (16-bit ones' complement sum), +and broadcasting. +It does not provide a guarantee of delivery, avoidance of delivering multiple +times, ordering, out of band data, nor flow control. +UDP provides the +.Dv SOCK_DGRAM +abstraction for the +.Xr inet 4 +protocol family. +.Pp +UDP sockets are made with +.Xr socket 2 +by passing an appropriate +.Fa domain +.Dv ( AF_INET ) , +.Dv SOCK_DGRAM +as the +.Fa type , +and 0 or +.Dv IPPROTO_UDP +as the +.Fa protocol . +Initially a socket is not bound, it won't receive datagrams, and it does not +have a remote address and port set. 
+.Pp +A UDP socket has the following state: +.Pp +.Bl -bullet -compact +.It +The address family it belongs to. +.It +The network interface it is bound to (if any) +.Dv ( SO_BINDTODEVICE +and +.Dv SO_BINDTOINDEX ) +(initially none). +.It +The local address and port (when bound) (initially none). +.It +The remote address and port (when connected) (initially none). +.It +A receive queue (initially empty). +.It +Whether the socket has been +.Xr shutdown 2 +for read and/or write (initially neither). +.It +A single pending asynchronous error (if any) +.Dv ( SO_ERROR ) +(initially none). +.It +Whether broadcast datagrams can be sent +.Dv ( SO_BROADCAST ) +(initially no). +.It +Whether binding to the any address and a port doesn't conflict with binding to +another address on the same port +.Dv ( SO_REUSEADDR ) +(initially no). +.It +Limits on the size of the receive and send queues +.Dv ( SO_RCVBUF +and +.Dv SO_SNDBUF ) . +.El +.Pp +Datagrams are sent as a packet with a header and the datagram itself. +The header contains the source port, the destination port, the checksum, and the +packet's length. +The length is a 16-bit value, allowing the packet to be up to 65535 bytes. +The header is 8 bytes, allowing the maximum datagram size of 65527 bytes. +However, the actual maximum datagram size may be smaller, as the network layer +and link layer, as well as the path to the destination host, will add their own +headers and maximum transmission unit (MTU) restrictions. +.Pp +Port numbers are 16-bit and range from 1 to 65535. +Port 0 is not valid. +Binding to port 0 will assign an available port on the requested address. +Sending or connecting to port 0 will fail with +.Er EADDRNOTAVAIL . +Received packets whose source or destination address is port 0 will be silently +dropped. +UDP ports are distinct from ports in other transport layer protocols. +.Pp +Packets contain a 16-bit ones' complement checksum by default. 
+Unless the packet has no checksum, a received packet will be silently discarded +if its checksum does not match its contents. +.Pp +Sockets can be bound to a local address and port with +.Xr bind 2 +(if not already bound), +or a local address and port will be automatically assigned on the first send +or connect operation. +The local address and port can be read with +.Xr getsockname 2 . +If the socket hasn't been bound, the local address and port is reported as the +any address on port 0. +Binding to a well-known port (port 1 through port 1023) requires superuser +privileges. +.Pp +Sockets can be bound to the any address, the broadcast address, the address of +a network interface, or the broadcast address of a network interface. +Binding to port 0 will automatically assign an available port on the requested +local address or fail with +.Er EAGAIN +if no port is available. +No two sockets can bind to the same local address and port. +No two sockets can be bound such that one is bound to the any address and a +port, and the other socket is bound to another address and the same port; unless +both sockets had the +.Dv SO_REUSEADDR +socket option set when the second socket was bound, and the current user is the +same that bound the first socket or the current user has superuser privileges. +.Pp +A socket bound to a local address and port will receive an incoming datagram if +the following conditions hold: +.Pp +.Bl -bullet -compact +.It +The datagram belongs to the socket's address family and the protocol is UDP. +.It +The datagram's checksum matches the datagram or it has no checksum. +.It +The datagram is not sent from port 0 and is not sent to port 0.
+.It +The datagram is sent to the address or broadcast address of the network +interface it is received on, or the datagram was sent to the broadcast address; +.It +The socket is either bound to the receiving network interface, or the socket is +not bound to a network interface; +.It +The datagram is sent to the socket's local port; +.It +The datagram is sent to the socket's local address, or the socket's local +address is the any address (and no other socket is bound to the datagram's +address and that port); +.It +The socket is connected and the datagram was sent from the remote address and +the remote port, or the socket is not connected; and +.It +The socket is not shut down for reading. +.El +.Pp +If so, the datagram is added to the socket's receive queue, otherwise it is +discarded. +The receive queue contains incoming packets waiting to be received. +Incoming packets are dropped if the receive queue is full. +Shrinking the receive queue limit drops packets as needed to stay below the +limit. +.Pp +The remote address and port can be set multiple times with +.Xr connect 2 , +after which the socket is said to be connected, but UDP is connectionless and +no handshake is sent. +The remote port must not be port 0 or the connection will fail with +.Er EADDRNOTAVAIL . +If the socket is not bound, +.Xr connect 2 +will determine which network interface will be used to send to the remote +address, and then bind to the address of that network interface together with an +available port. +.Xr connect 2 +will fail if there is no route from the local address to the requested remote +address. +A connected socket only receives datagrams from the remote address and port. +.Xr connect 2 +will drop datagrams in the receive queue that don't originate from the +requested remote address. +The +.Xr send 2 , +.Xr write 2 , +and +.Xr writev 2 +functions can be used on a connected socket and they send to the remote address +and port by default.
+If the socket is connected, the destination given to +.Xr sendto 2 +and +.Xr sendmsg 2 +must be +.Dv NULL . +The remote address and port can be read with +.Xr getpeername 2 . +.Pp +The socket can be disconnected by connecting to a socket address with the family +value set to +.Dv AF_UNSPEC , +which resets the remote address and port (if set), and otherwise has no effect. +The socket can be disconnected even if not connected, but it has no effect. +.Pp +Datagrams can be sent with +.Xr sendmsg 2 +and +.Xr sendto 2 . +Sending on an unbound socket will bind to the any address and an available port, +or fail with +.Er EAGAIN +if no port is available. +Datagrams can be received with +.Xr recvmsg 2 , +.Xr recvfrom 2 , +.Xr recv 2 , +.Xr read 2 , +and +.Xr readv 2 . +If an asynchronous error is pending, the next send or receive operation will +fail with that error and clear the asynchronous error, so the next operation can +succeed. +Asynchronous errors can arise from network problems. +There is no send queue at the UDP level and datagrams are directly forwarded to +the network layer. +It is an error to use any of the flags +.Dv MSG_CMSG_CLOEXEC , +.Dv MSG_CMSG_CLOFORK , +.Dv MSG_EOR , +.Dv MSG_OOB , +and +.Dv MSG_WAITALL . +.Pp +The condition of the socket can be tested with +.Xr poll 2 +where +.Dv POLLIN +signifies a packet has been received (or the socket is shut down for reading), +.Dv POLLOUT +signifies a packet can be sent now (and the socket is not shut down for +writing), +.Dv POLLHUP +signifies the socket is shut down for writing, and +.Dv POLLERR +signifies an asynchronous error is pending. +.Pp +The socket can be shut down for receiving and/or sending with +.Xr shutdown 2 . +The receive queue is emptied when shut down for receive (asynchronous errors are +preserved) and receive operations will succeed with an end of file +condition, but any pending asynchronous errors will take precedence and be +delivered instead.
+Sending when shut down for writing will raise +.Dv SIGPIPE +and fail with +.Er EPIPE +(regardless of a pending asynchronous error). +.Pp +Socket options can be set with +.Xr setsockopt 2 +and read with +.Xr getsockopt 2 +and exist on the +.Dv IPPROTO_UDP +level as well as applicable underlying protocol levels. +.Pp +Broadcast datagrams can be sent by setting the +.Dv SO_BROADCAST +socket option with +.Xr setsockopt 2 +and sending to a broadcast address of the network layer. +.Sh SOCKET OPTIONS +UDP sockets support these +.Xr setsockopt 2 / +.Xr getsockopt 2 +options at level +.Dv SOL_SOCKET : +.Bl -tag -width "12345678" +.It Dv SO_BINDTODEVICE Fa "char[]" +Bind to a network interface by its name. +(Described in +.Xr if 4 ) +.It Dv SO_BINDTOINDEX Fa "unsigned int" +Bind to a network interface by its index number. +(Described in +.Xr if 4 ) +.It Dv SO_BROADCAST Fa "int" +Whether sending to a broadcast address is allowed. +(Described in +.Xr if 4 ) +.It Dv SO_DEBUG Fa "int" +Whether the socket is in debug mode. +This option is not implemented and is initially 0. +Attempting to set it to non-zero will fail with +.Er EPERM . +(Described in +.Xr if 4 ) +.It Dv SO_DOMAIN Fa "sa_family_t" +The socket +.Fa domain +(the address family). +This option can only be read. +(Described in +.Xr if 4 ) +.It Dv SO_DONTROUTE Fa "int" +Whether to bypass the routing table and only send on the local network. +This option is not implemented and is initially 0. +Attempting to set it to non-zero will fail with +.Er EPERM . +(Described in +.Xr if 4 ) +.It Dv SO_ERROR Fa "int" +The asynchronous pending error +(an +.Xr errno 3 +value). +Cleared to 0 when read. +This option can only be read. +(Described in +.Xr if 4 ) +.It Dv SO_PROTOCOL Fa "int" +The socket protocol +.Dv ( IPPROTO_UDP ) . +This option can only be read. +(Described in +.Xr if 4 ) +.It Dv SO_RCVBUF Fa "int" +How many bytes the receive queue can use (default is 64 pages, max 4096 pages). 
+(Described in +.Xr if 4 ) +.It Dv SO_REUSEADDR Fa "int" +Whether binding to the any address on a port doesn't conflict with binding to +another address and the same port, if both sockets have this option set and the +user binding the second socket is the same that bound the first socket or the +user binding the second socket has superuser privileges. +(Described in +.Xr if 4 ) +.It Dv SO_SNDBUF Fa "int" +How many bytes the send queue can use (default is 64 pages, max 4096 pages). +(Described in +.Xr if 4 ) +.It Dv SO_TYPE Fa "int" +The socket type +.Dv ( SOCK_DGRAM ) . +This option can only be read. +(Described in +.Xr if 4 ) +.El +.Pp +UDP sockets currently implement no +.Xr setsockopt 2 / +.Xr getsockopt 2 +options at level +.Dv IPPROTO_UDP . +.Sh IMPLEMENTATION NOTES +There is no way to disable the checksum on sent packets, however received +packets without a checksum will not be checksummed. +.Pp +Each packet currently use a page of memory, which counts towards the receive +queue limit. +.Pp +If no specific port is requested, one is randomly selected in the dynamic port +range 32768 (inclusive) through 61000 (exclusive). +.Sh EXAMPLES +This example creates and binds a UDP socket to a local address and port and +sends a broadcast datagram to a remote address and port and receives a response +and remembers who sent the response. +.Va local +is the local socket address that is bound to and +.Va local_len +is the size of the local socket address and likewise with +.Va remote +and +.Va remote_len . +.Va responder +is an uninitialized socket address of the appropriate size +.Va responder_len +for the protocol family +.Va af +where the source address of the response is stored. +The response is stored in the +.Va incoming +array of size +.Va amount . +The +.Va af , local , local_len , remote , remote_len , responder , +and +.Va responder_len +values should all be chosen according to the address family and network layer. +.Bd -literal +sa_family_t af = /* ... 
*/; +const struct sockaddr *local = /* ... */; +socklen_t local_len = /* ... */; +const struct sockaddr *remote = /* ... */; +socklen_t remote_len = /* ... */; +struct sockaddr *responder = /* ... */; +socklen_t responder_len = /* ... */; + +int fd = socket(af, SOCK_DGRAM, IPPROTO_UDP); +if (fd < 0) + err(1, "socket"); +if (bind(fd, local, local_len) < 0) + err(1, "bind"); +int value = 1; +if (setsockopt(fd, SOL_SOCKET, SO_BROADCAST, &value, sizeof(value)) < 0) + err(1, "setsockopt"); +char outgoing[] = "Hello"; +if (sendto(fd, outgoing, strlen(outgoing), 0, remote, remote_len) < 0) + err(1, "sendto"); +char incoming[1024]; +ssize_t amount = recvfrom(fd, incoming, sizeof(incoming), 0, + responder, &responder_len); +if (amount < 0) + err(1, "recvfrom"); +.Ed +.Sh COMPATIBILITY +Sortix is the only known system where +.Xr connect 2 +will remove datagrams from the wrong source from the receive queue. +All other systems will deliver datagrams already present in the receive queue, +even if from the wrong source, despite the POSIX requirement that +.Xr connect 2 +"limits the remote sender for subsequent recv() functions". +Software for affected systems must either first empty the receive queue after +.Xr connect 2 , +or use +.Xr recvmsg 2 +and validate the source address rather than rely on the kernel validation. +.Pp +.Xr sendto 2 +or +.Xr sendmsg 2 +on a connected socket must have the destination be +.Dv NULL +(the default destination) +on Sortix, FreeBSD, Haiku, macOS, NetBSD, OpenBSD, and SunOS; but the +destination can be +.Dv NULL +or any address on DragonFly, GNU/Hurd, Linux, and Minix. +.Pp +Socket disconnect is implemented on Sortix, DragonFly, Haiku, GNU/Hurd, Linux, +Minix, and SunOS; but socket disconnect is not implemented on FreeBSD, macOS, +NetBSD and OpenBSD. +Storing the +.Dv AF_UNSPEC +value in the address family's socket address structure or struct sockaddr is +portable to the systems implementing socket disconnect.
+A socket can be disconnected even if not connected on Sortix, DragonFly, Haiku, +GNU/Hurd, Linux, and Minix; but SunOS requires the socket to be connected +before it can be disconnected. +.Pp +The broadcast address can be bound on Sortix, GNU/Hurd, Linux, OpenBSD, and +SunOS; but can't be bound on DragonFly, FreeBSD, macOS, Minix and NetBSD. +.Pp +.Dv SO_BROADCAST +doesn't need to be enabled to +.Xr connect 2 +to the broadcast address on Sortix, DragonFly, FreeBSD, Haiku, macOS, Minix, +NetBSD, OpenBSD, and SunOS; but is required on GNU/Hurd and Linux. +.Pp +Reconnecting a socket to an address that is not reachable from the local address +will fail on Sortix, GNU/Hurd, and Linux; but the socket will be bound to +another address that can reach the remote address (even though it is not +possible to bind a socket twice) (on the same port if possible) on DragonFly, +FreeBSD, Haiku, macOS, NetBSD, OpenBSD, and SunOS. +.Pp +.Xr connect 2 +will not deliver asynchronous errors on Sortix, DragonFly, FreeBSD, Haiku, +GNU/Hurd, Linux, and Minix; however it will deliver asynchronous errors on +macOS, NetBSD, OpenBSD, and SunOS. +.Pp +Shutting a socket down for reading will cause receives to return 0 on Sortix, +DragonFly, FreeBSD, macOS, Minix, NetBSD, OpenBSD, and SunOS; but receives will +fail with +.Er EWOULDBLOCK +on Linux. +.Pp +Shutting a socket down for writing will cause sends to raise SIGPIPE and fail +with EPIPE on Sortix, DragonFly, FreeBSD, GNU/Hurd, macOS, NetBSD, OpenBSD, and +SunOS; but will not raise SIGPIPE and only fail with EPIPE on Linux and Minix. +.Pp +Sortix, GNU/Hurd, Linux, and Minix will signal POLLIN if a datagram has been +received or if shut down for read. +DragonFly, FreeBSD, macOS, NetBSD, OpenBSD, and SunOS will signal POLLIN if a +datagram has been received, if shut down for read, or if an error is pending. +.Pp +Sortix and DragonFly will signal POLLOUT if a datagram can be sent, unless the +socket has been shut down for write.
+FreeBSD will signal POLLOUT if a datagram can be sent, unless the socket has +been shut down for both read and write. +GNU/Hurd will signal POLLOUT if a datagram can be sent, unless the socket has +been shut down for write or if an error is pending. +Linux, Minix, OpenBSD, and SunOS will signal POLLOUT if a datagram can be sent, +regardless of whether the socket has been shut down. +macOS will signal POLLOUT if a datagram can be sent, unless the socket has been +shut down for either read or write. +.Pp +Sortix and DragonFly will signal POLLHUP if shut down for write. +FreeBSD and Linux will signal POLLHUP if shut down for both read and write. +GNU/Hurd, macOS, Minix, NetBSD, OpenBSD, and SunOS will not signal POLLHUP. +macOS will signal POLLHUP if shut down for either read or write. +.Pp +Sortix, Haiku, GNU/Hurd, and Linux will signal POLLERR if an error is pending. +DragonFly, FreeBSD, macOS, Minix, NetBSD, OpenBSD, and SunOS will not signal +POLLERR. +.Pp +Shutting a socket down for read doesn't work on GNU/Hurd and Linux, where the +socket continues to receive datagrams. +.Pp +Linux delivers asynchronous errors on send, even if shut down for write. +.Pp +Sockets can be shut down even if not connected on Sortix, DragonFly, Minix, +NetBSD, and OpenBSD; but sockets must be connected before they can be shut down +on FreeBSD, GNU/Hurd, Linux, macOS, and SunOS. +.Pp +Connecting to the any address will fail with +.Er ENETUNREACH +on Sortix. +On DragonFly, FreeBSD, Haiku, GNU/Hurd, Linux, macOS, OpenBSD, and SunOS it will +succeed and +.Xr getpeername 2 +will report the loopback address (OpenBSD will report the any address). +.Pp +Connecting to port 0 will fail on Sortix, FreeBSD, macOS, Minix, NetBSD, OpenBSD, +and SunOS; but will succeed on DragonFly, Haiku, GNU/Hurd and Linux. +.Pp +Sortix's handling of +.Dv SO_REUSEADDR +requires the two sockets to be bound by the same user or the second socket to be +bound by a user with superuser privileges.
+It's unclear what other systems also perform this check and when the user +identity is captured. +.Pp +Setting +.Dv SO_REUSEADDR +on both sockets is required on Sortix, Haiku, GNU/Hurd, and Linux; but +DragonFly, FreeBSD, Minix, macOS, NetBSD, OpenBSD, and SunOS only require it to +be set on the second socket. +.Pp +Two sockets can't be bound to the same address and port on Sortix, DragonFly, +FreeBSD, Haiku, macOS, NetBSD, and OpenBSD; but GNU/Hurd, Linux, Minix, and +SunOS allows it when +.Dv SO_REUSEADDR +is set. +.Sh ERRORS +Socket operations can fail due to these error conditions, in addition to the +error conditions of the network and link layer, and the error conditions of the +invoked function. +.Bl -tag -width [EADDRNOTAVAIL] +.It Bq Er EACCES +A datagram was sent to a broadcast address, but +.Dv SO_BROADCAST +is turned off. +.It Bq Er EADDRINUSE +The socket cannot be bound to the requested address and port because another +socket was already bound to 1) the same address and port 2) the any address +and the same port (and +.Dv SO_REUSEADDR +was not set on both sockets), or 3) some address and the same port but the +requested address was the any address (and +.Dv SO_REUSEADDR +was not set on both sockets). +.It Bq Er EADDRNOTAVAIL +The socket cannot be bound to the requested address because no network interface +had that address or broadcast address. +.It Bq Er EADDRNOTAVAIL +The socket was connected to port 0, or a datagram was sent to port 0. +.It Bq Er EAGAIN +A port could not be assigned because each port in the dynamic port range had +already been bound to a socket in a conflicting manner. +.It Bq Er ECONNREFUSED +The destination host of a datagram was not listening on the port. +This error can happen asynchronously. +.It Bq Er EHOSTDOWN +The destination host of a datagram is not up. +This error can happen asynchronously. +.It Bq Er EHOSTUNREACH +The destination host of a datagram was unreachable. +This error can happen asynchronously. 
+.It Bq Er EISCONN +A destination address and port was specified when sending a datagram, but the +socket has already been connected to a remote address and port. +.It Bq Er EMSGSIZE +The datagram was too large to be sent because it exceeded the maximum +transmission unit (MTU) on the path between the local and remote address, or it +exceeded the UDP datagram size limit of 65527 bytes. +This error can happen asynchronously. +.It Bq Er ENETDOWN +The network interface used to deliver a datagram isn't up. +This error can happen asynchronously. +.It Bq Er ENETUNREACH +The destination network of a datagram was unreachable. +This error can happen asynchronously. +.It Bq Er ENETUNREACH +The remote address could not be connected because there was no route from the +local address to the remote address. +.It Bq Er ENOBUFS +There was not enough memory available for network packets. +.It Bq Er EPERM +One of the unimplemented +.Dv SO_DEBUG +and +.Dv SO_DONTROUTE +socket options was attempted to be set to a non-zero value. +.El +.Sh SEE ALSO +.Xr bind 2 , +.Xr connect 2 , +.Xr getpeername 2 , +.Xr getsockname 2 , +.Xr getsockopt 2 , +.Xr poll 2 , +.Xr recvfrom 2 , +.Xr recvmsg 2 , +.Xr sendmsg 2 , +.Xr sendto 2 , +.Xr setsockopt 2 , +.Xr shutdown 2 , +.Xr socket 2 , +.Xr if 4 , +.Xr inet 4 , +.Xr ip 4 , +.Xr kernel 7 +.Sh STANDARDS +.Rs +.%A J. Postel +.%D August 1980 +.%R STD 6 +.%R RFC 768 +.%T User Datagram Protocol +.%Q USC/Information Sciences Institute +.Re +.Pp +.Rs +.%A Internet Engineering Task Force +.%A R. Braden (ed.) +.%D October 1989 +.%R STD 3 +.%R RFC 1122 +.%T Requirements for Internet Hosts -- Communication Layers +.%Q USC/Information Sciences Institute +.Re +.Pp +.St -p1003.1-2008 specifies the UDP socket programming interface and defines the +socket options +.Dv SO_BROADCAST , SO_DEBUG , SO_DONTROUTE , SO_ERROR , SO_RCVBUF , SO_REUSEADDR , +.Dv SO_SNDBUF , +and +.Dv SO_TYPE .
+.Sh BUGS +.Xr bind 2 +does not yet enforce that binding to a well-known port (port 1 through port +1023) requires superuser privileges. +.Pp +The handling of +.Dv SO_REUSEADDR No in +.Xr bind 2 +does not yet enforce the two sockets to be bound by the same user or the second +socket to be bound by a user with superuser privileges. +The requirement that both sockets have +.Dv SO_REUSEADDR +set might be relaxed to only the second socket having it set when this +permission check is implemented. +.Pp +The integration with the network layer is inadequate and the asynchronous errors +.Er ECONNREFUSED , +.Er EHOSTDOWN , +.Er EHOSTUNREACH , +and +.Er ENETUNREACH +are never delivered asynchronously from the network. +.Pp +The +.Xr send 2 +flag +.Dv MSG_DONTROUTE +and the +.Dv SO_DONTROUTE +socket option are not implemented yet. +.Pp +The +.Dv SO_SNDBUF +socket option is currently not used and the send queue is not limited at the +socket level. +.Pp +The automatic assignment of ports is random, but is statistically biased. +A random port is picked, and if it is taken, the search sequentially iterates +ports in ascending order until an available port is found or the search +terminates. +.Pp +FreeBSD's and OpenBSD's UDP documentation states in the BUGS section that +receiving a datagram on a socket shut down for read should reply with an ICMP +Port Unreachable message, however they don't implement this behavior. +No other system appears to implement this behavior, and it is unclear whether +it should be implemented. diff --git a/share/man/man7/development.7 b/share/man/man7/development.7 index c4bcd131..23e4aa35 100644 --- a/share/man/man7/development.7 +++ b/share/man/man7/development.7 @@ -398,19 +398,21 @@ The makefile target can be used to verify your work needs some of the development conventions.
.Pp -You can then easily prepare your a set of patches for upstream submission: +Prepare a set of patches suitable for upstream submission and submit a merge +request to the upstream project. +.Pp +If your installation does not have network connectivity, you will need to +submit the changes from another system. +If you are dual booting and have another operating system with network +connectivity, you can boot into the other operating system and mount the +appropriate filesystem from there. +If you have a serial line, you can produce a set of .patch files containing +your changes with .Bd -literal git format-patch master..local .Ed .Pp -This will create a series of .patch files containing your changes. -Review them and rewrite git history as needed until they are of submittable -quality. -You can then submit them for review at the official website. -.Pp -To transfer files out of the operating system, you can either mount the local -root filesystem from another operating system with networking, or you transmit -the patches over the serial connection as described in +and then transfer them over the serial connection as described in .Xr serial-transfer 7 . .Ss Releases CD-ROM release of the operating system can be built with the diff --git a/share/man/man7/following-development.7 b/share/man/man7/following-development.7 index 11053f6b..b51e19cb 100644 --- a/share/man/man7/following-development.7 +++ b/share/man/man7/following-development.7 @@ -69,6 +69,18 @@ releasing Sortix x.y, foo." to allow the maintainer to easily .Xr grep 1 for it after a release. .Sh CHANGES +.Ss Add networking stack +The network stack has been implemented in the kernel and exposed through +additions to the system call interface. +.Pp +This is a compatible ABI change that adds features to +.Xr socket 2 +.Sy ( AF_INET , IPPROTO_TCP , IPPROTO_UDP , IPPROTO_PING ) , +the ioctls for +.Xr if 4 , +socket options, and the +.Xr lo 4 +loopback interface. 
.Ss Add daemon support to init(8) .Xr init 8 has gained diff --git a/share/man/man7/installation.7 b/share/man/man7/installation.7 index bacc7d66..d897eceb 100644 --- a/share/man/man7/installation.7 +++ b/share/man/man7/installation.7 @@ -82,7 +82,7 @@ per the instructions in The release modification procedure lets you customize aspects such as the default bootloader menu option and timeout, the default hostname, the default keyboard layout, the default graphics resolution, adding files of your choice to -the live environment, and so on. +the live environment, control which drivers are loaded by default, and so on. .Pp Warning: The live environment does not come with any random entropy and entropy gathering is not yet implemented. @@ -130,6 +130,11 @@ You need enough memory to store the whole system and the runtime usage. If the system memory is really insufficient, then the bootloader may have strange behavior, take a really long time to load, or not complete the boot at all. +.Ss Bootloader Advanced Options +The bootloader advanced options menu lets you customize the live environment by +making one-time adjustments to the boot process. +These decisions will not carry over to the final installed system, which you +instead will need to configure to have the same effects. .Pp You can configure which ports gets loaded using the bootloader menu. The base system is rather lean and can be made quite small. @@ -367,6 +372,17 @@ If you invoked yourself, then you will be returned to your live environment shell. Otherwise the computer will power off when the chroot environment terminates. .Pp +This is a last chance to make modifications before the new system boots for the +first time. +If you want to make final modifications to the system (examples are below), you +can answer +.Sy '!' +to escape to a shell in the live environment inside the subdirectory where the +new system is mounted. +You can then run +.Sy "chroot -d ." +to enter a shell within the new installation. 
+.Pp Upon boot of the new system it will be configured in multi-user mode and you will be presented with a login screen. Authenticate as one of the local users and you will be given a shell. @@ -387,6 +403,55 @@ The manual page is a basic overview of the system for new users. .Pp Congratulations on your new Sortix system. +.Ss Disabling Networking by Default +To disable networking drivers by default, edit the bootloader configuration to +pass the +.Fl \-disable-network-drivers +option by default on the +.Xr kernel 7 +command line. +.Pp +If you are at the final stage of installation, you can answer +.Sy '!' +to get a shell in the live environment and then run +.Sy "chroot -d ." +to enter a shell inside the new installation. +.Pp +For instance, if GRUB is used as the bootloader, networking can be disabled by +default by editing +.Pa /etc/grub.d/10_sortix +of the new installation. +.Xr editor 1 +or any editor can be used to edit the file. +Change the line from +.Bd -literal + multiboot $BOOT_REL/sortix.bin +.Ed +.Pp +to instead be +.Bd -literal + multiboot $BOOT_REL/sortix.bin --disable-network-drivers +.Ed +.Pp +If the included GRUB bootloader is used, after making the above edit, run +.Xr update-grub 8 +within the new installation to regenerate the bootloader configuration. +Note that +.Pa /etc/grub.d/10_sortix +is part of the GRUB package and local changes will be undone when the GRUB +package is updated or reinstalled, in which case you must make this change again +and run +.Xr update-grub 8 +again. +.Pp +If the included GRUB bootloader is not used, but instead the +.Pa /etc/grub.d/10_sortix.cache +fragment is spliced into another GRUB installation, make the above change and +then run the +.Pa /etc/grub.d/10_sortix +command and use the freshly regenerated +.Pa /etc/grub.d/10_sortix.cache +fragment instead.
.Sh SEE ALSO .Xr chkblayout 1 , .Xr chvideomode 1 , diff --git a/share/man/man7/kernel.7 b/share/man/man7/kernel.7 index 198c4895..3a2356fe 100644 --- a/share/man/man7/kernel.7 +++ b/share/man/man7/kernel.7 @@ -6,6 +6,8 @@ .Nd operating system kernel .Sh SYNOPSIS .Pa /boot/sortix.bin +.Op Fl \-disable-network-drivers +.Op Fl \-enable-network-drivers .Op Fl \-no-random-seed .Op Fl \- .Op Ar init ... @@ -46,6 +48,12 @@ otherwise. .Pp The options are as follows: .Bl -tag -width "12345678" +.It Fl \-disable-network-drivers +Don't initialize any network drivers. +This option ensures the booted system is not networked. +.It Fl \-enable-network-drivers +Do initialize network drivers. +This is the default behavior. .It Fl \-no-random-seed Don't warn if no random seed file was loaded by the bootloader (usually from .Pa /boot/random.seed ) . diff --git a/share/man/man7/release-iso-bootconfig.7 b/share/man/man7/release-iso-bootconfig.7 index 2ae67e43..d592d2ea 100644 --- a/share/man/man7/release-iso-bootconfig.7 +++ b/share/man/man7/release-iso-bootconfig.7 @@ -128,6 +128,8 @@ The .Sy hook_advanced_menu_pre hook is run. .Pp +A menu entry is emitted to control whether network drivers are enabled. +.Pp A menu entry is emitted that goes to the binary packages menu (which runs .Li configfile /boot/grub/tix.cfg ) .Pp @@ -260,6 +262,13 @@ If the selected menu option itself is a submenu, it can be appended with a .Sy '>' and another selection to pick a default menu option in that submenu, and so on. (Default: 0) +.It Sy enable_network_drivers +An additional +.Xr kernel 7 +command line parameter that controls whether network drivers are enabled. +Either set to the empty string (network drivers are enabled) or +.Sy --disable-network-drivers . +(Default: The empty string). .It Sy enable_src Whether to load the source code initrd containing .Pa /src . 
@@ -330,6 +339,7 @@ with a .Xr kernel 7 command line consisting of .Sy $no_random_seed +.Sy $enable_network_drivers followed by the arguments to this function (which should contain .Li "-- /sbin/init --target=desired-target" ) followed by any additional options to diff --git a/share/man/man7/release-iso-modification.7 b/share/man/man7/release-iso-modification.7 index fd5ec2bc..13b389a4 100644 --- a/share/man/man7/release-iso-modification.7 +++ b/share/man/man7/release-iso-modification.7 @@ -16,7 +16,7 @@ configuration as described in section 5 of the manual. The release modification procedure lets you customize aspects such as the default bootloader menu option and timeout, the default hostname, the default keyboard layout, the default graphics resolution, adding files of your choice to -the live environment, and so on. +the live environment, controlling which drivers are loaded by default, and so on. .Ss Prerequisites .Bl -bullet -compact .It @@ -390,6 +390,13 @@ hook_ports_menu_sets EOF tix-iso-add sortix.iso bootconfig .Ed +.Ss Disable Networking Drivers By Default +To customize a release so it doesn't load network drivers by default, useful for +security reasons or to work around driver issues: +.Bd -literal +tix-iso-bootconfig --disable-network-drivers bootconfig +tix-iso-add sortix.iso bootconfig +.Ed .Sh SEE ALSO .Xr xorriso 1 , .Xr development 7 , diff --git a/share/man/man7/upgrade.7 b/share/man/man7/upgrade.7 index 4db0dfe6..ed28b54e 100644 --- a/share/man/man7/upgrade.7 +++ b/share/man/man7/upgrade.7 @@ -29,7 +29,7 @@ per the instructions in The release modification procedure lets you customize aspects such as the default bootloader menu option and timeout, the default hostname, the default keyboard layout, the default graphics resolution, adding files of your choice to -the live environment, and so on. +the live environment, controlling which drivers are loaded by default, and so on.
.Pp Warning: The live environment does not come with any random entropy and entropy gathering is not yet implemented. diff --git a/share/man/man7/user-guide.7 b/share/man/man7/user-guide.7 index 6b66818d..79b58250 100644 --- a/share/man/man7/user-guide.7 +++ b/share/man/man7/user-guide.7 @@ -120,9 +120,14 @@ You can make a compatible filesystem with: .Pp .Dl $ mkfs.ext2 -O none,large_file,filetype .Ss Networking -Sortix does not have networking at this time. -Unix sockets have a basic implementation incapable of advanced features. -The standard library and kernel provides stubs for many network interfaces. +Internet Protocol version 4 +.Pq Xr ip 4 +networking is available if you have a supported network interface +.Pq Xr if 4 . +.Pp +The Internet Protocol version 6 +.Xr ( ip6 4 ) +is not yet supported. .Ss Serial Transfer It is possible to transfer files over serial devices as described in .Xr serial-transfer 7 . diff --git a/sysinstall/sysinstall.c b/sysinstall/sysinstall.c index bf2087c0..9a081927 100644 --- a/sysinstall/sysinstall.c +++ b/sysinstall/sysinstall.c @@ -1126,6 +1126,8 @@ int main(void) } text("\n"); + // TODO: Ask if networking should be disabled / enabled. 
+ text("It's time to boot into the newly installed system.\n\n"); if ( strcasecmp(accept_grub, "no") == 0 ) diff --git a/tix/tix-iso-bootconfig b/tix/tix-iso-bootconfig index fb66f47d..53e2eb25 100755 --- a/tix/tix-iso-bootconfig +++ b/tix/tix-iso-bootconfig @@ -22,6 +22,7 @@ append_title="modified by $(id -un)@$(hostname)" default= directory= enable_append_title=true +enable_network_drivers= enable_src= init_target= liveconfig= @@ -51,8 +52,10 @@ for argument do --default=*) default=$parameter ;; --default) previous_option=default ;; --disable-append-title) enable_append_title=false ;; + --disable-network-drivers) enable_network_drivers=false ;; --disable-src) enable_src=false ;; --enable-append-title) enable_append_title=true ;; + --enable-network-drivers) enable_network_drivers=true ;; --enable-src) enable_src=true ;; --init-target=*) init_target=$parameter ;; --init-target) previous_option=init_target ;; @@ -135,6 +138,7 @@ mkdir -p -- "$directory/boot/grub" if [ -n "$timeout" ]; then printf 'timeout="%s"\n' "$timeout" fi + print_enable_default "$enable_network_drivers" network_drivers network-drivers print_enable_default_bool "$enable_src" src src if $enable_append_title; then printf "base_menu_title=\"\$base_menu_title - \"'%s'\n" \ diff --git a/tix/tix-iso-bootconfig.8 b/tix/tix-iso-bootconfig.8 index a0b1fe10..40dc4a0f 100644 --- a/tix/tix-iso-bootconfig.8 +++ b/tix/tix-iso-bootconfig.8 @@ -9,8 +9,10 @@ .Op Fl \-append-title Ns = Ns Ar text .Op Fl \-default Ns = Ns Ar default-boot-menu-option .Op Fl \-disable-append-title +.Op Fl \-disable-network-drivers .Op Fl \-disable-src .Op Fl \-enable-append-title +.Op Fl \-enable-network-drivers .Op Fl \-enable-src .Op Fl \-init-target Ns = Ns Ar target .Op Fl \-liveconfig Ns = Ns Ar liveconfig-directory @@ -89,6 +91,14 @@ GRUB variable. Don't append anything to the bootloader menu title by appending to the .Sy base_menu_title GRUB variable. 
+.It Fl \-disable-network-drivers +Disable network drivers by setting the +.Sy enable_network_drivers +GRUB variable to the +.Fl \-disable-network-drivers +option which will be passed on the +.Xr kernel 7 +command line. .It Fl \-disable-src Disable loading the source code in .Pa /src @@ -104,6 +114,14 @@ to the bootloader menu title by appending to the GRUB variable. This option is on by default and can be disabled with .Fl \-disable-append-title . +.It Fl \-enable-network-drivers +Enable network drivers by setting the +.Sy enable_network_drivers +GRUB variable to the +.Fl \-enable-network-drivers +option which will be passed on the +.Xr kernel 7 +command line. .It Fl \-enable-src Enable loading the source code in .Pa /src @@ -246,6 +264,13 @@ of your choice: tix-iso-bootconfig --append-title="Initech Company Edition" bootconfig tix-iso-add sortix.iso bootconfig .Ed +.Ss Disable Networking Drivers By Default +To customize a release so it doesn't load network drivers by default, useful for +security reasons or to work around driver issues: +.Bd -literal +tix-iso-bootconfig --disable-network-drivers bootconfig +tix-iso-add sortix.iso bootconfig +.Ed .Sh SEE ALSO .Xr xorriso 1 , .Xr kernel 7 ,