Ensure correct utf-8
This commit is contained in:
parent
6cd84902fc
commit
55fc6932e2
134
ethermess.c
134
ethermess.c
|
@ -450,6 +450,122 @@ bool check_padding(const unsigned char *data, size_t index, size_t data_length)
|
|||
return true;
|
||||
}
|
||||
|
||||
/*
 * Check that a byte buffer is well-formed UTF-8 containing only text that is
 * acceptable for display.
 *
 * data            - bytes to validate (not NUL-terminated; may contain any value)
 * data_length     - number of bytes in data; an empty buffer is considered valid
 * newline_allowed - if true, U+000A LINE FEED is accepted; every other C0
 *                   control character is still rejected
 *
 * Returns true only if every byte belongs to a complete, shortest-form UTF-8
 * sequence and no decoded code point is any of:
 *   - above U+10FFFF
 *   - a UTF-16 surrogate (U+D800..U+DFFF)
 *   - a non-character (U+FDD0..U+FDEF, or U+xxFFFE / U+xxFFFF in any plane)
 *   - a C0 control (U+0000..U+001F), except newline when newline_allowed
 *   - a C1 control (U+0080..U+009F)
 *   - U+2028 LINE SEPARATOR or U+2029 PARAGRAPH SEPARATOR
 *
 * NOTE: U+007F DELETE is not rejected by this function.
 */
bool check_utf8(const unsigned char *data, size_t data_length, bool newline_allowed) {
	size_t remaining = 0;   // Continuation bytes still expected for the current sequence
	size_t length = 0;      // Bytes consumed so far for the current sequence
	uint32_t codepoint = 0; // Code point being assembled

	for (size_t i = 0; i < data_length; i++) {
		unsigned char byte = data[i];

		if (byte >= 0x80 && byte <= 0xbf) {
			// 10xxxxxx - continuation byte
			if (remaining == 0) {
				// Can only appear in the middle of a multibyte sequence
				return false;
			}
			remaining--;
			length++;
			codepoint = (codepoint << 6) | (uint32_t)(byte & 0x3f);
		} else {
			// Every other byte starts a new sequence
			if (remaining != 0) {
				// Can't appear in the middle of a multibyte sequence
				return false;
			}
			length = 1;

			if (byte <= 0x7f) {
				// 0xxxxxxx - single byte
				remaining = 0;
				codepoint = byte;
			} else if (byte <= 0xdf) {
				// 110xxxxx - first byte of double byte sequence
				remaining = 1;
				codepoint = byte & 0x1f;
			} else if (byte <= 0xef) {
				// 1110xxxx - first byte of triple byte sequence
				remaining = 2;
				codepoint = byte & 0x0f;
			} else if (byte <= 0xf7) {
				// 11110xxx - first byte of quadruple byte sequence
				remaining = 3;
				codepoint = byte & 0x07;
			} else {
				// 0xf8..0xff can never appear in UTF-8
				return false;
			}
		}

		if (remaining != 0) {
			// Code point not complete yet
			continue;
		}

		// Full codepoint constructed

		// Reject overlong encodings (a sequence longer than the shortest
		// form able to represent the code point)
		if (codepoint <= 0x007f && length > 1) {
			return false;
		} else if (codepoint <= 0x07ff && length > 2) {
			return false;
		} else if (codepoint <= 0xffff && length > 3) {
			return false;
		}

		// Reject code points over U+10FFFF
		if (codepoint > 0x10ffff) {
			return false;
		}

		// Reject surrogates
		if (0xd800 <= codepoint && codepoint <= 0xdfff) {
			return false;
		}

		// Reject non-characters
		// The parentheses are required: == binds tighter than &
		if ((codepoint & 0xffff) == 0xfffe || (codepoint & 0xffff) == 0xffff) {
			// Plane end non-characters
			return false;
		} else if (0xfdd0 <= codepoint && codepoint <= 0xfdef) {
			// BMP non-character block
			return false;
		}

		// Reject control characters
		if (codepoint <= 0x1f) {
			// C0 control character; newline is the only one that may be allowed
			if (!newline_allowed || codepoint != 0x0a) {
				return false;
			}
		} else if (0x80 <= codepoint && codepoint <= 0x9f) {
			// C1 control character
			return false;
		} else if (codepoint == 0x2028) {
			// U+2028 LINE SEPARATOR
			return false;
		} else if (codepoint == 0x2029) {
			// U+2029 PARAGRAPH SEPARATOR
			return false;
		}
	}

	// Can't end in the middle of a multibyte sequence
	return remaining == 0;
}
|
||||
|
||||
void handle_status(const unsigned char source_mac[6], const unsigned char *data, size_t data_length) {
|
||||
if (data_length < 2) {
|
||||
// Too short
|
||||
|
@ -478,7 +594,10 @@ void handle_status(const unsigned char source_mac[6], const unsigned char *data,
|
|||
return;
|
||||
}
|
||||
|
||||
// TODO: check that nick is valid utf-8 with no control chars
|
||||
if (!check_utf8(nick, nick_length, false)) {
|
||||
// Malformed utf-8, or has control chars
|
||||
return;
|
||||
}
|
||||
|
||||
char mac[18];
|
||||
format_mac(source_mac, mac);
|
||||
|
@ -579,7 +698,10 @@ void handle_message(const unsigned char source_mac[6], const unsigned char *data
|
|||
return;
|
||||
}
|
||||
|
||||
// TODO: Check that the message is valid utf-8 with newline as the only control char
|
||||
if (!check_utf8(message, message_length, true)) {
|
||||
// Malformed utf-8, or has control chars other than newline
|
||||
return;
|
||||
}
|
||||
|
||||
// See whether we've already received this message and update the next msgid if so
|
||||
ssize_t cache_index = msgid_cache_lookup(source_mac);
|
||||
|
@ -948,11 +1070,11 @@ int main(int argc, char **argv) {
|
|||
|
||||
// Set our nick
|
||||
if (memcmp(own_mac, veth0a_mac, 6) == 0) {
|
||||
memcpy(own_nick, "foo", 3);
|
||||
own_nick_length = 3;
|
||||
memcpy(own_nick, "𐀀ab", 6);
|
||||
own_nick_length = 6;
|
||||
} else {
|
||||
memcpy(own_nick, "bar", 3);
|
||||
own_nick_length = 3;
|
||||
memcpy(own_nick, "ࠀ<EFBFBD>", 6);
|
||||
own_nick_length = 6;
|
||||
}
|
||||
|
||||
// Initialize the message id cache
|
||||
|
|
Loading…
Reference in New Issue