happybot/happybot/unicode/update.py

#!/usr/bin/env python3

from urllib.request import urlretrieve
from dataclasses import dataclass
from typing import Union, Set
import json

# General_Category values, copied from:
# https://www.unicode.org/reports/tr44/#GC_Values_Table
# Each row is "abbreviation<TAB>long name<TAB>description".  The tabs are
# spelled as \t escapes so the columns survive editors that expand tabs.
categories = '''
Lu\tUppercase_Letter\tan uppercase letter
Ll\tLowercase_Letter\ta lowercase letter
Lt\tTitlecase_Letter\ta digraphic character, with first part uppercase
LC\tCased_Letter\tLu | Ll | Lt
Lm\tModifier_Letter\ta modifier letter
Lo\tOther_Letter\tother letters, including syllables and ideographs
L\tLetter\tLu | Ll | Lt | Lm | Lo
Mn\tNonspacing_Mark\ta nonspacing combining mark (zero advance width)
Mc\tSpacing_Mark\ta spacing combining mark (positive advance width)
Me\tEnclosing_Mark\tan enclosing combining mark
M\tMark\tMn | Mc | Me
Nd\tDecimal_Number\ta decimal digit
Nl\tLetter_Number\ta letterlike numeric character
No\tOther_Number\ta numeric character of other type
N\tNumber\tNd | Nl | No
Pc\tConnector_Punctuation\ta connecting punctuation mark, like a tie
Pd\tDash_Punctuation\ta dash or hyphen punctuation mark
Ps\tOpen_Punctuation\tan opening punctuation mark (of a pair)
Pe\tClose_Punctuation\ta closing punctuation mark (of a pair)
Pi\tInitial_Punctuation\tan initial quotation mark
Pf\tFinal_Punctuation\ta final quotation mark
Po\tOther_Punctuation\ta punctuation mark of other type
P\tPunctuation\tPc | Pd | Ps | Pe | Pi | Pf | Po
Sm\tMath_Symbol\ta symbol of mathematical use
Sc\tCurrency_Symbol\ta currency sign
Sk\tModifier_Symbol\ta non-letterlike modifier symbol
So\tOther_Symbol\ta symbol of other type
S\tSymbol\tSm | Sc | Sk | So
Zs\tSpace_Separator\ta space character (of various non-zero widths)
Zl\tLine_Separator\tU+2028 LINE SEPARATOR only
Zp\tParagraph_Separator\tU+2029 PARAGRAPH SEPARATOR only
Z\tSeparator\tZs | Zl | Zp
Cc\tControl\ta C0 or C1 control code
Cf\tFormat\ta format control character
Cs\tSurrogate\ta surrogate code point
Co\tPrivate_Use\ta private-use character
Cn\tUnassigned\ta reserved unassigned code point or a noncharacter
C\tOther\tCc | Cf | Cs | Co | Cn
'''

# Map every two-letter abbreviation to its long name, with underscores
# turned into spaces.  Rows with a one-letter abbreviation (L, M, N, ...)
# are dropped by the length filter; the two-letter group alias 'LC' is
# kept because it passes that same filter.
categories = {
    abbreviation: long_name.replace('_', ' ')
    for abbreviation, long_name, _description in (
        row.split('\t', 2) for row in categories.strip().split('\n')
    )
    if len(abbreviation) == 2
}

# Hand-written display names, keyed by code point.  When a code point is
# listed here, this text is used as the character's primary name instead
# of the one derived from UnicodeData.txt.
custom_names = {
    # 0x00-0x1F: abbreviation, C-style escape where one exists, description.
    0x00: "NUL '\\0' (null character)",
    0x01: "SOH (start of heading)",
    0x02: "STX (start of text)",
    0x03: "ETX (end of text)",
    0x04: "EOT (end of transmission)",
    0x05: "ENQ (enquiry)",
    0x06: "ACK (acknowledge)",
    0x07: "BEL '\\a' (bell)",
    0x08: "BS '\\b' (backspace)",
    0x09: "HT '\\t' (horizontal tab)",
    0x0A: "LF '\\n' (new line)",
    0x0B: "VT '\\v' (vertical tab)",
    0x0C: "FF '\\f' (form feed)",
    0x0D: "CR '\\r' (carriage return)",
    0x0E: "SO (shift out)",
    0x0F: "SI (shift in)",
    0x10: "DLE (data link escape)",
    0x11: "DC1 (device control 1)",
    0x12: "DC2 (device control 2)",
    0x13: "DC3 (device control 3)",
    0x14: "DC4 (device control 4)",
    0x15: "NAK (negative acknowledge)",
    0x16: "SYN (synchronous idle)",
    0x17: "ETB (end of transmission block)",
    0x18: "CAN (cancel)",
    0x19: "EM (end of medium)",
    0x1A: "SUB (substitute)",
    0x1B: "ESC (escape)",
    0x1C: "FS (file separator)",
    0x1D: "GS (group separator)",
    0x1E: "RS (record separator)",
    0x1F: "US (unit separator)",
    # 0x7F sits apart from the contiguous 0x00-0x1F run above.
    0x7F: "DEL (delete)",
}

@dataclass
class Character:
    """One entry of the character database built by this script.

    An entry stands for a single code point, or for a whole run of code
    points when ``ordinal_end`` is set.
    """

    # First (or only) code point covered by the entry.
    ordinal: int
    # Last code point of a range entry; None for a single code point.
    ordinal_end: Union[None, int]
    # General_Category abbreviation as read from the data (e.g. 'Lu').
    category: str
    # Every name and alias collected for the entry.
    names: Set[str]
    # The name listed first in the generated output.
    name: str
    # Fourth UnicodeData.txt field, parsed as an int.
    combining: int
    # False for ordinary entries, otherwise the name of the range.
    in_range: Union[bool, str]

    def __repr__(self):
        # Short debugging form such as <U+0041>, rather than the field dump
        # a dataclass would generate.
        return '<U+{}>'.format(uni(self.ordinal))

def uni(ordinal):
    """Return *ordinal* as lowercase hex, zero-padded to at least 4 digits.

    This is the digit part of a ``U+XXXX`` label, without the ``U+`` prefix.
    """
    digits = hex(ordinal)[2:]  # drop the leading '0x'
    return digits.rjust(4, '0')

def _parse_unicode_data(path):
    """Parse UnicodeData.txt into single-character entries and range markers.

    UnicodeData.txt is described at:
    http://www.unicode.org/L2/L1999/UnicodeData.html
    and with more recent additions at:
    https://www.unicode.org/reports/tr44/#UCD_Files

    Returns a ``(database, ranges)`` pair.  ``database`` maps an ordinal to
    its Character.  ``ranges`` maps a range name to a two-item list
    ``[first, last]`` filled from the ``<Name, First>`` / ``<Name, Last>``
    rows; those rows are not put into ``database``.

    Raises ValueError when a non-blank row does not have exactly 15 fields.
    """
    database = {}
    ranges = {}

    # Explicit encoding so the result does not depend on the locale.
    with open(path, 'r', encoding='utf-8') as fh:
        for line in fh:
            line = line.rstrip()
            if not line:
                # A blank line would otherwise fail the field-count check.
                continue

            # Fields: code, name, category, combining, bidirectional,
            # decomposition, decimal, digit, numeric, mirrored, old_name,
            # iso_comment, upper, lower, title.
            row = line.split(';')
            if len(row) != 15:
                raise ValueError(
                    f'Expected 15 fields in UnicodeData row, got {len(row)}: {line!r}')
            code, name, category, combining = row[:4]
            old_name = row[10]

            ordinal = int(code, 16)
            combining = int(combining)
            names = set()

            if old_name:
                names.add(old_name)

            range_marker = None
            if name == '<control>':
                # Control characters carry their only useful name in old_name.
                name = old_name
            elif name.startswith('<') and name.endswith('>'):
                # '<Range Name, First>' / '<Range Name, Last>'
                name, range_marker = name[1:-1].split(', ', 1)
            else:
                names.add(name)

            name = custom_names.get(ordinal, name)

            char = Character(
                ordinal=ordinal,
                ordinal_end=None,
                category=category,
                name=name,
                names=names,
                in_range=False,
                combining=combining,
            )

            if range_marker and name not in ranges:
                ranges[name] = [None, None]

            if range_marker == 'First':
                ranges[name][0] = char
            elif range_marker == 'Last':
                ranges[name][1] = char
            else:
                database[ordinal] = char

    return database, ranges


def _collapse_ranges(database, ranges):
    """Add one Character per First/Last pair to *database*, keyed by its start.

    Raises ValueError when a range is missing its First or Last row.
    """
    # TODO: What's something nicer than this?
    for range_name, (start, end) in ranges.items():
        if start is None or end is None:
            raise ValueError(f'Incomplete range {range_name!r} in UnicodeData.txt')
        # Both ends must agree, since the merged entry copies the start's values.
        assert start.category == end.category
        assert start.combining == end.combining

        name = f'({range_name} [U+{uni(start.ordinal)}..U+{uni(end.ordinal)}])'
        database[start.ordinal] = Character(
            ordinal=start.ordinal,
            ordinal_end=end.ordinal,
            category=start.category,
            name=name,
            names={name},
            in_range=range_name,
            combining=start.combining,
        )


def _merge_names_list(path, database):
    """Merge names and '=' aliases from NamesList.txt into *database*.

    NamesList.txt is described at:
    https://www.unicode.org/Public/UCD/latest/ucd/NamesList.html
    But I sort-of guessed and hoped for the best.

    Code points that are not in *database* yet are added with category 'Cn'.
    """
    current = None  # the Character that following tab-indented lines belong to

    # NamesList.txt is UTF-8; do not rely on the locale's default encoding.
    with open(path, 'r', encoding='utf-8') as fh:
        for line in fh:
            line = line.rstrip()
            if not line or line[0] in ';@':
                # Blank lines and lines starting with ';' or '@' hold no names.
                continue

            if line[0] == '\t':
                # Only '\t= ALIAS' lines are used; ignore any that show up
                # before the first character entry.
                if line[1] == '=' and current is not None:
                    current.names.add(line[3:])
                continue

            code, name = line.split('\t', 1)
            ordinal = int(code, 16)
            current = database.get(ordinal)
            if current is None:
                current = Character(
                    ordinal=ordinal,
                    ordinal_end=None,
                    category='Cn',
                    name=name,
                    names={name},
                    in_range=False,
                    combining=0,
                )
                database[ordinal] = current
            else:
                current.names.add(name)


def _write_database(database, path):
    """Write *database* to *path*, one JSON array per line, sorted by ordinal.

    Each row is ``[ordinal, ordinal_end, category, is_mark, names]`` with the
    primary name first and the remaining names sorted.
    """
    with open(path, 'w', encoding='utf-8') as fh:
        for ordinal in sorted(database):
            char = database[ordinal]
            # The emitted flag is derived from the category (any 'M*'), not
            # from the numeric ``combining`` field.
            is_mark = char.category[0] == 'M'
            names = [char.name] + sorted(char.names - {char.name})
            row = [char.ordinal, char.ordinal_end, char.category, is_mark, names]
            fh.write(json.dumps(row) + '\n')


def main(update_uni, update_names):
    """Build UnicodeDataFull.json from UnicodeData.txt and NamesList.txt.

    Both input files are read from the current directory.

    Args:
        update_uni: download a fresh UnicodeData.txt before parsing.
        update_names: download a fresh NamesList.txt before parsing.
    """
    unicode_data_path = 'UnicodeData.txt'
    names_list_path = 'NamesList.txt'

    if update_uni:
        unicode_data_path, _ = urlretrieve(
            'http://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt',
            unicode_data_path)
    if update_names:
        names_list_path, _ = urlretrieve(
            'http://www.unicode.org/Public/UCD/latest/ucd/NamesList.txt',
            names_list_path)

    database, ranges = _parse_unicode_data(unicode_data_path)
    _collapse_ranges(database, ranges)
    _merge_names_list(names_list_path, database)

    # Drop placeholder names and the ' (1.0)'-suffixed aliases.
    for char in database.values():
        char.names -= {'<control>'}
        char.names -= {n for n in char.names if n.endswith(' (1.0)')}

    # TODO: Add Nami.txt short names in parens.

    _write_database(database, 'UnicodeDataFull.json')

if __name__ == '__main__':
    import sys

    # Minimal hand-rolled flag parsing; every argument must be a known flag.
    update_uni = False
    update_names = False
    for arg in sys.argv[1:]:
        if arg in ('-u', '--unicode-data'):
            update_uni = True
        elif arg in ('-l', '--names-list'):
            update_names = True
        else:
            # Asking for help is not an error; any other argument is.
            asked_for_help = arg in ('-h', '--help')
            print('Usage:', sys.argv[0], '[-u] [-l] [-h]')
            print(' ', '-u, --unicode-data: Download UnicodeData.txt.')
            print(' ', '-l, --names-list: Download NamesList.txt.')
            print(' ', '-h, --help: Show this help.')
            # sys.exit, because the bare exit() helper is only installed by
            # the site module.
            sys.exit(0 if asked_for_help else 1)

    main(update_uni=update_uni, update_names=update_names)