Ensure correct utf-8

2019-07-10 21:20:24 +03:00 · 2019-07-10 21:20:24 +03:00 · 55fc6932e2
parent 6cd84902fc
commit 55fc6932e2
1 changed files with 128 additions and 6 deletions
--- a/ethermess.c
+++ b/ethermess.c
@ -450,6 +450,122 @@ bool check_padding(const unsigned char *data, size_t index, size_t data_length)
 	return true;
 }

+// Check that a byte string is well-formed UTF-8 made only of printable text.
+//
+// data            - bytes to validate (not required to be NUL-terminated)
+// data_length     - number of bytes in data; an empty string is accepted
+// newline_allowed - if true, U+000A LINE FEED is accepted; every other
+//                   control character is rejected either way
+//
+// Returns true only if every byte belongs to a complete, shortest-form UTF-8
+// sequence and no decoded code point is any of the following:
+//  - above U+10FFFF
+//  - a UTF-16 surrogate (U+D800..U+DFFF)
+//  - a non-character (U+FDD0..U+FDEF, or U+xxFFFE / U+xxFFFF in any plane)
+//  - a control character (C0, DEL, C1), except newline when newline_allowed
+//  - U+2028 LINE SEPARATOR or U+2029 PARAGRAPH SEPARATOR
+bool check_utf8(const unsigned char *data, size_t data_length, bool newline_allowed) {
+	size_t remaining = 0;   // continuation bytes still expected
+	size_t length = 0;      // bytes consumed so far for the current code point
+	uint32_t codepoint = 0; // code point being assembled
+
+	for (size_t i = 0; i < data_length; i++) {
+		unsigned char byte = data[i];
+
+		if (byte >= 0x80 && byte <= 0xbf) {
+			// 10xxxxxx - continuation byte
+			if (remaining == 0) {
+				// Can only appear in the middle of a multibyte sequence
+				return false;
+			}
+			remaining--;
+			length++;
+			codepoint = (codepoint << 6) | (byte & 0x3f);
+		} else {
+			// Every other byte starts a new code point
+			if (remaining != 0) {
+				// Can't appear in the middle of a multibyte sequence
+				return false;
+			}
+			length = 1;
+
+			if (byte <= 0x7f) {
+				// 0xxxxxxx - single byte
+				codepoint = byte;
+			} else if (byte <= 0xdf) {
+				// 110xxxxx - first byte of double byte sequence
+				remaining = 1;
+				codepoint = byte & 0x1f;
+			} else if (byte <= 0xef) {
+				// 1110xxxx - first byte of triple byte sequence
+				remaining = 2;
+				codepoint = byte & 0x0f;
+			} else if (byte <= 0xf7) {
+				// 11110xxx - first byte of quadruple byte sequence
+				remaining = 3;
+				codepoint = byte & 0x07;
+			} else {
+				// 11111xxx - never valid in utf-8. Must be rejected here:
+				// falling through would re-validate the previous code point
+				// and silently accept this byte.
+				return false;
+			}
+		}
+
+		if (remaining != 0) {
+			// Code point not fully constructed yet
+			continue;
+		}
+
+		// Reject overlong encodings (length is at most 4 here)
+		if (codepoint <= 0x007f && length > 1) {
+			return false;
+		} else if (codepoint <= 0x07ff && length > 2) {
+			return false;
+		} else if (codepoint <= 0xffff && length > 3) {
+			return false;
+		}
+
+		// Reject code points over U+10FFFF
+		if (codepoint > 0x10ffff) {
+			return false;
+		}
+
+		// Reject surrogates
+		if (0xd800 <= codepoint && codepoint <= 0xdfff) {
+			return false;
+		}
+
+		// Reject non-characters. The parentheses are required: == binds
+		// tighter than &, so without them the test is always false.
+		if ((codepoint & 0xffff) == 0xfffe || (codepoint & 0xffff) == 0xffff) {
+			// Plane end non-characters
+			return false;
+		} else if (0xfdd0 <= codepoint && codepoint <= 0xfdef) {
+			// BMP non-character block
+			return false;
+		}
+
+		// Reject control characters
+		if (codepoint <= 0x1f) {
+			// C0 control character, only newline can be let through
+			if (!newline_allowed || codepoint != 0x0a) {
+				return false;
+			}
+		} else if (codepoint == 0x7f) {
+			// U+007F DELETE
+			return false;
+		} else if (0x80 <= codepoint && codepoint <= 0x9f) {
+			// C1 control character
+			return false;
+		} else if (codepoint == 0x2028 || codepoint == 0x2029) {
+			// U+2028 LINE SEPARATOR / U+2029 PARAGRAPH SEPARATOR
+			return false;
+		}
+	}
+
+	// Can't end at the middle of a multibyte sequence
+	return remaining == 0;
+}
+
 void handle_status(const unsigned char source_mac[6], const unsigned char *data, size_t data_length) {
 	if (data_length < 2) {
 		// Too short
@ -478,7 +594,10 @@ void handle_status(const unsigned char source_mac[6], const unsigned char *data,
 		return;
 	}

-	// TODO: check that nick is valid utf-8 with no control chars
+	if (!check_utf8(nick, nick_length, false)) {
+		// Malformed utf-8, or has control chars
+		return;
+	}

 	char mac[18];
 	format_mac(source_mac, mac);
@ -579,7 +698,10 @@ void handle_message(const unsigned char source_mac[6], const unsigned char *data
 		return;
 	}

-	// TODO: Check that the message is valid utf-8 with newline as the only control char
+	if (!check_utf8(message, message_length, true)) {
+		// Malformed utf-8, or has control chars other than newline
+		return;
+	}

 	// See whether we've already received this message and update the next msgid if so
 	ssize_t cache_index = msgid_cache_lookup(source_mac);
@ -948,11 +1070,11 @@ int main(int argc, char **argv) {

 	// Set our nick
 	if (memcmp(own_mac, veth0a_mac, 6) == 0) {
-		memcpy(own_nick, "foo", 3);
-		own_nick_length = 3;
+		memcpy(own_nick, "𐀀ab", 6);
+		own_nick_length = 6;
 	} else {
-		memcpy(own_nick, "bar", 3);
-		own_nick_length = 3;
+		memcpy(own_nick, "ࠀ<EFBFBD>", 6);
+		own_nick_length = 6;
 	}

 	// Initialize the message id cache