Ensure correct utf-8

This commit is contained in:
Juhani Krekelä 2019-07-10 21:20:24 +03:00
parent 6cd84902fc
commit 55fc6932e2
1 changed files with 128 additions and 6 deletions

View File

@ -450,6 +450,122 @@ bool check_padding(const unsigned char *data, size_t index, size_t data_length)
return true;
}
/*
 * Check that a byte buffer is acceptable, well-formed UTF-8 text.
 *
 * data            - bytes to validate (not required to be NUL-terminated)
 * data_length     - number of bytes in data
 * newline_allowed - when true, U+000A LINE FEED is the one C0 control
 *                   character that is accepted; when false, every C0
 *                   control character is rejected
 *
 * Returns true only if the whole buffer decodes as UTF-8 and no decoded
 * code point is any of the following:
 *  - an overlong encoding
 *  - above U+10FFFF
 *  - a surrogate (U+D800..U+DFFF)
 *  - a non-character (U+xxFFFE, U+xxFFFF, U+FDD0..U+FDEF)
 *  - a C0 control (U+0000..U+001F), except newline when allowed
 *  - a C1 control (U+0080..U+009F)
 *  - U+2028 LINE SEPARATOR or U+2029 PARAGRAPH SEPARATOR
 *
 * An empty buffer is valid. Note that U+007F DELETE is not rejected.
 */
bool check_utf8(const unsigned char *data, size_t data_length, bool newline_allowed) {
	size_t remaining = 0;   // Continuation bytes still expected
	size_t length = 0;      // Bytes consumed so far by the current sequence
	uint32_t codepoint = 0; // Code point being assembled

	for (size_t i = 0; i < data_length; i++) {
		unsigned char byte = data[i];

		if (0x80 <= byte && byte <= 0xbf) {
			// 10xxxxxx - continuation byte
			if (remaining == 0) {
				// Can only appear in the middle of a multibyte sequence
				return false;
			}
			remaining--;
			length++;
			codepoint = (codepoint << 6) | (byte & 0x3f);
		} else {
			// Anything else starts a new sequence
			if (remaining != 0) {
				// Can't appear in the middle of a multibyte sequence
				return false;
			}
			if (byte <= 0x7f) {
				// 0xxxxxxx - single byte
				remaining = 0;
				codepoint = byte;
			} else if (byte <= 0xdf) {
				// 110xxxxx - first byte of double byte sequence
				remaining = 1;
				codepoint = byte & 0x1f;
			} else if (byte <= 0xef) {
				// 1110xxxx - first byte of triple byte sequence
				remaining = 2;
				codepoint = byte & 0x0f;
			} else if (byte <= 0xf7) {
				// 11110xxx - first byte of quadruple byte sequence
				remaining = 3;
				codepoint = byte & 0x07;
			} else {
				// 0xf8..0xff never appear in UTF-8. Without this branch the
				// byte would be silently skipped and stale state re-checked.
				return false;
			}
			length = 1;
		}

		if (remaining != 0) {
			// Code point not complete yet
			continue;
		}

		// Full codepoint constructed

		// Reject overlong encodings
		if ((codepoint <= 0x007f && length > 1) ||
		    (codepoint <= 0x07ff && length > 2) ||
		    (codepoint <= 0xffff && length > 3)) {
			return false;
		}

		// Reject code points over U+10FFFF
		if (codepoint > 0x10ffff) {
			return false;
		}

		// Reject surrogates
		if (0xd800 <= codepoint && codepoint <= 0xdfff) {
			return false;
		}

		// Reject non-characters. The mask must be parenthesised: == binds
		// tighter than &, so `codepoint & 0xffff == 0xffff` would test the
		// low bit of codepoint instead.
		if ((codepoint & 0xffff) == 0xfffe || (codepoint & 0xffff) == 0xffff) {
			// Plane end non-characters
			return false;
		}
		if (0xfdd0 <= codepoint && codepoint <= 0xfdef) {
			// BMP non-character block
			return false;
		}

		// Reject control characters
		if (codepoint <= 0x1f) {
			// C0 control character; only newline may pass, and only when
			// the caller allows it
			if (!(newline_allowed && codepoint == 0x0a)) {
				return false;
			}
		} else if (0x80 <= codepoint && codepoint <= 0x9f) {
			// C1 control character
			return false;
		} else if (codepoint == 0x2028 || codepoint == 0x2029) {
			// U+2028 LINE SEPARATOR / U+2029 PARAGRAPH SEPARATOR
			return false;
		}
	}

	// Can't end at the middle of a multibyte sequence
	return remaining == 0;
}
void handle_status(const unsigned char source_mac[6], const unsigned char *data, size_t data_length) {
if (data_length < 2) {
// Too short
@ -478,7 +594,10 @@ void handle_status(const unsigned char source_mac[6], const unsigned char *data,
return;
}
// TODO: check that nick is valid utf-8 with no control chars
if (!check_utf8(nick, nick_length, false)) {
// Malformed utf-8, or has control chars
return;
}
char mac[18];
format_mac(source_mac, mac);
@ -579,7 +698,10 @@ void handle_message(const unsigned char source_mac[6], const unsigned char *data
return;
}
// TODO: Check that the message is valid utf-8 with newline as the only control char
if (!check_utf8(message, message_length, true)) {
// Malformed utf-8, or has control chars other than newline
return;
}
// See whether we've already received this message and update the next msgid if so
ssize_t cache_index = msgid_cache_lookup(source_mac);
@ -948,11 +1070,11 @@ int main(int argc, char **argv) {
// Set our nick
if (memcmp(own_mac, veth0a_mac, 6) == 0) {
memcpy(own_nick, "foo", 3);
own_nick_length = 3;
memcpy(own_nick, "𐀀ab", 6);
own_nick_length = 6;
} else {
memcpy(own_nick, "bar", 3);
own_nick_length = 3;
memcpy(own_nick, "<EFBFBD>", 6);
own_nick_length = 6;
}
// Initialize the message id cache