happybot/happybot/unicode/update.py

244 lines
8.5 KiB
Python

#!/usr/bin/env python3
from urllib.request import urlretrieve
from dataclasses import dataclass
from typing import Union, Set
import json
# General_Category values, copied from:
# https://www.unicode.org/reports/tr44/#GC_Values_Table
# Each row is "<abbreviation> <Long_Name> <description>". The long names use
# underscores instead of spaces, so the first two columns never contain
# whitespace themselves.
_category_table = '''
Lu Uppercase_Letter an uppercase letter
Ll Lowercase_Letter a lowercase letter
Lt Titlecase_Letter a digraphic character, with first part uppercase
LC Cased_Letter Lu | Ll | Lt
Lm Modifier_Letter a modifier letter
Lo Other_Letter other letters, including syllables and ideographs
L Letter Lu | Ll | Lt | Lm | Lo
Mn Nonspacing_Mark a nonspacing combining mark (zero advance width)
Mc Spacing_Mark a spacing combining mark (positive advance width)
Me Enclosing_Mark an enclosing combining mark
M Mark Mn | Mc | Me
Nd Decimal_Number a decimal digit
Nl Letter_Number a letterlike numeric character
No Other_Number a numeric character of other type
N Number Nd | Nl | No
Pc Connector_Punctuation a connecting punctuation mark, like a tie
Pd Dash_Punctuation a dash or hyphen punctuation mark
Ps Open_Punctuation an opening punctuation mark (of a pair)
Pe Close_Punctuation a closing punctuation mark (of a pair)
Pi Initial_Punctuation an initial quotation mark
Pf Final_Punctuation a final quotation mark
Po Other_Punctuation a punctuation mark of other type
P Punctuation Pc | Pd | Ps | Pe | Pi | Pf | Po
Sm Math_Symbol a symbol of mathematical use
Sc Currency_Symbol a currency sign
Sk Modifier_Symbol a non-letterlike modifier symbol
So Other_Symbol a symbol of other type
S Symbol Sm | Sc | Sk | So
Zs Space_Separator a space character (of various non-zero widths)
Zl Line_Separator U+2028 LINE SEPARATOR only
Zp Paragraph_Separator U+2029 PARAGRAPH SEPARATOR only
Z Separator Zs | Zl | Zp
Cc Control a C0 or C1 control code
Cf Format a format control character
Cs Surrogate a surrogate code point
Co Private_Use a private-use character
Cn Unassigned a reserved unassigned code point or a noncharacter
C Other Cc | Cf | Cs | Co | Cn
'''

# Maps each two-letter abbreviation to its long name with underscores turned
# into spaces, e.g. 'Lu' -> 'Uppercase Letter'. Rows whose abbreviation is a
# single letter (the grouping values such as 'L' or 'M') are dropped; the
# two-letter grouping value 'LC' is kept.
#
# Columns are split on any run of whitespace rather than on a literal tab:
# the table is pasted from a web page, and tabs are easily converted to
# spaces by editors or viewers, which would break a tab-only split.
categories = {
    abbreviation: long_name.replace('_', ' ')
    for abbreviation, long_name, _description in (
        row.split(None, 2) for row in _category_table.strip().splitlines()
    )
    if len(abbreviation) == 2
}
# Display names for the ASCII control characters, keyed by code point.
# Each value is the abbreviation, the C-style escape sequence where one is
# listed, and a description in parentheses.
custom_names = {
    # C0 controls, U+0000..U+001F
    0x00: "NUL '\\0' (null character)",
    0x01: "SOH (start of heading)",
    0x02: "STX (start of text)",
    0x03: "ETX (end of text)",
    0x04: "EOT (end of transmission)",
    0x05: "ENQ (enquiry)",
    0x06: "ACK (acknowledge)",
    0x07: "BEL '\\a' (bell)",
    0x08: "BS '\\b' (backspace)",
    0x09: "HT '\\t' (horizontal tab)",
    0x0A: "LF '\\n' (new line)",
    0x0B: "VT '\\v' (vertical tab)",
    0x0C: "FF '\\f' (form feed)",
    0x0D: "CR '\\r' (carriage return)",
    0x0E: "SO (shift out)",
    0x0F: "SI (shift in)",
    0x10: "DLE (data link escape)",
    0x11: "DC1 (device control 1)",
    0x12: "DC2 (device control 2)",
    0x13: "DC3 (device control 3)",
    0x14: "DC4 (device control 4)",
    0x15: "NAK (negative acknowledge)",
    0x16: "SYN (synchronous idle)",
    0x17: "ETB (end of transmission block)",
    0x18: "CAN (cancel)",
    0x19: "EM (end of medium)",
    0x1A: "SUB (substitute)",
    0x1B: "ESC (escape)",
    0x1C: "FS (file separator)",
    0x1D: "GS (group separator)",
    0x1E: "RS (record separator)",
    0x1F: "US (unit separator)",
    # U+007F
    0x7F: "DEL (delete)",
}
@dataclass
class Character:
    """A single code point, or a whole First..Last range, in the database."""

    # Code point of the character (first code point for a range entry).
    ordinal: int
    # Last code point of a range entry; None for a single character.
    ordinal_end: Union[None, int]
    # Two-letter General_Category abbreviation, e.g. 'Lu'.
    category: str
    # Every name collected for this entry (old names, aliases, ...).
    names: Set[str]
    # The name that is listed first in the generated output.
    name: str
    # Combining class, parsed as an integer from UnicodeData.txt.
    combining: int
    # False for a single character, otherwise the name of the range.
    in_range: Union[bool, str]

    def __repr__(self):
        # Short form such as <U+0041>; relies on the module-level uni().
        return '<U+' + uni(self.ordinal) + '>'
def uni(ordinal):
    """Return *ordinal* as lowercase hex digits, zero-padded to at least four.

    The '0x' prefix produced by hex() is dropped, so 0x41 becomes '0041' and
    0x1F600 becomes '1f600'.
    """
    digits = hex(ordinal)[2:]
    return digits.rjust(4, '0')
def _load_unicode_data(path):
    """Parse UnicodeData.txt at *path* into a dict of code point -> Character.

    UnicodeData.txt is described at: http://www.unicode.org/L2/L1999/UnicodeData.html
    And with more recent additions at: https://www.unicode.org/reports/tr44/#UCD_Files

    Entries named '<Something, First>' / '<Something, Last>' are collapsed
    into a single Character stored under the first code point, with
    ordinal_end set to the last code point.

    Raises:
        ValueError: if a range lacks its First or Last entry, or if the two
            ends disagree on category or combining class.
    """
    database = {}
    ranges = {}
    with open(path, 'r', encoding='utf-8') as fh:
        for line in fh:
            line = line.rstrip()
            if not line:
                # Tolerate blank lines instead of failing on the unpack below.
                continue
            (code, name, category, combining, _bidirectional, _decomposition,
             _decimal, _digit, _numeric, _mirrored, old_name, _iso_comment,
             _upper, _lower, _title) = line.split(';')
            ordinal = int(code, 16)
            combining = int(combining)
            names = {old_name} if old_name else set()
            firstlast = None
            if name == '<control>':
                # Control characters have no real name; use the old name.
                name = old_name
            elif name.startswith('<') and name.endswith('>'):
                # '<Range Name, First>' or '<Range Name, Last>'.
                name, firstlast = name[1:-1].split(', ', 1)
            else:
                names.add(name)
            if ordinal in custom_names:
                name = custom_names[ordinal]
            char = Character(
                ordinal=ordinal,
                ordinal_end=None,
                category=category,
                name=name,
                names=names,
                in_range=False,
                combining=combining,
            )
            if firstlast and name not in ranges:
                ranges[name] = [None, None]
            if firstlast == 'First':
                ranges[name][0] = char
            elif firstlast == 'Last':
                ranges[name][1] = char
            else:
                database[char.ordinal] = char
    # TODO: What's something nicer than this?
    for range_name, (start, end) in ranges.items():
        # Explicit checks rather than assert, so they survive `python -O`.
        if start is None or end is None:
            raise ValueError(
                f'Range {range_name!r} in {path} is missing its First or Last entry')
        if start.category != end.category or start.combining != end.combining:
            raise ValueError(
                f'Range {range_name!r} in {path} has mismatched First and Last entries')
        name = f'({range_name} [U+{uni(start.ordinal)}..U+{uni(end.ordinal)}])'
        database[start.ordinal] = Character(
            ordinal=start.ordinal,
            ordinal_end=end.ordinal,
            category=start.category,
            name=name,
            names={name},
            in_range=range_name,
            combining=start.combining,
        )
    return database


def _merge_names_list(path, database):
    """Add the names and '=' aliases from NamesList.txt at *path* to *database*.

    NamesList.txt is described at: https://www.unicode.org/Public/UCD/latest/ucd/NamesList.html
    But I sort-of guessed and hoped for the best.

    Code points that are not in *database* yet are added with category 'Cn'.
    *database* is modified in place.
    """
    # NamesList.txt is UTF-8; 'utf-8-sig' also tolerates an optional BOM.
    with open(path, 'r', encoding='utf-8-sig') as fh:
        char = None
        for line in fh:
            line = line.rstrip()
            # Skip blank lines, ';' comment lines and '@' header lines.
            if not line or line[0] in ';@':
                continue
            if line[0] == '\t':
                # Annotation of the most recent character; only '= alias'
                # lines are used, everything else is ignored.
                if line[1] == '=':
                    char.names.add(line[3:])
                continue
            code, name = line.split('\t', 1)
            ordinal = int(code, 16)
            if ordinal in database:
                char = database[ordinal]
                char.names.add(name)
            else:
                char = Character(
                    ordinal=ordinal,
                    ordinal_end=None,
                    category='Cn',
                    name=name,
                    names={name},
                    in_range=False,
                    combining=0,
                )
                database[ordinal] = char


def _write_database(path, database):
    """Write *database* to *path*, one JSON array per line, sorted by code point.

    Each row is [ordinal, ordinal_end, category, is_mark, names], where
    is_mark is true when the category starts with 'M' and names lists the
    primary name first, followed by the remaining names in sorted order.
    """
    with open(path, 'w', encoding='utf-8') as fh:
        for ordinal in sorted(database):
            char = database[ordinal]
            is_mark = char.category[0] == 'M'
            names = [char.name] + sorted(char.names - {char.name})
            row = [char.ordinal, char.ordinal_end, char.category, is_mark, names]
            fh.write(json.dumps(row) + '\n')


def main(update_uni, update_names):
    """Build UnicodeDataFull.json from UnicodeData.txt and NamesList.txt.

    Both input files are read from the current directory, and the output is
    written there too.

    Args:
        update_uni: if true, download a fresh UnicodeData.txt first.
        update_names: if true, download a fresh NamesList.txt first.

    Raises:
        ValueError: if UnicodeData.txt contains an incomplete or
            inconsistent First/Last range.
    """
    unicode_data = 'UnicodeData.txt'
    names_list = 'NamesList.txt'
    # Download over HTTPS so the data cannot be altered in transit.
    if update_uni:
        unicode_data, _ = urlretrieve(
            'https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt', unicode_data)
    if update_names:
        names_list, _ = urlretrieve(
            'https://www.unicode.org/Public/UCD/latest/ucd/NamesList.txt', names_list)
    database = _load_unicode_data(unicode_data)
    _merge_names_list(names_list, database)
    for char in database.values():
        # Drop the '<control>' placeholder and names ending in ' (1.0)'.
        char.names -= {n for n in char.names if n == '<control>' or n.endswith(' (1.0)')}
    # TODO: Add Nami.txt short names in parens.
    _write_database('UnicodeDataFull.json', database)
if __name__ == '__main__':
    # Command-line entry point: each flag requests a fresh download of one
    # input file before the database is rebuilt.
    import sys
    update_uni = False
    update_names = False
    for arg in sys.argv[1:]:
        if arg in ('-u', '--unicode-data'):
            update_uni = True
        elif arg in ('-l', '--names-list'):
            update_names = True
        else:
            # -h/--help and any unrecognised argument both print the usage.
            print('Usage:', sys.argv[0], '[-u] [-l] [-h]')
            print(' ', '-u, --unicode-data: Download UnicodeData.txt.')
            print(' ', '-l, --names-list: Download NamesList.txt.')
            print(' ', '-h, --help: Show this help.')
            # Asking for help is not an error; anything else is. sys.exit is
            # used because the bare exit() builtin is only provided by the
            # site module.
            sys.exit(0 if arg in ('-h', '--help') else 1)
    main(update_uni=update_uni, update_names=update_names)