#!/usr/bin/env python3
"""Build ``UnicodeDataFull.json`` from ``UnicodeData.txt`` and ``NamesList.txt``.

Each output line is a JSON array:
``[ordinal, ordinal_end, category, is_mark, [primary_name, *other_names]]``.
"""
import json
import sys
from dataclasses import dataclass
from typing import Dict, Iterable, List, Optional, Set, TextIO, Union
from urllib.request import urlretrieve

# Copied from: https://www.unicode.org/reports/tr44/#GC_Values_Table
# Columns: abbreviation, long name, description. Columns may be separated by
# any run of whitespace (long names never contain spaces).
_CATEGORY_TABLE = '''
Lu  Uppercase_Letter       an uppercase letter
Ll  Lowercase_Letter       a lowercase letter
Lt  Titlecase_Letter       a digraphic character, with first part uppercase
LC  Cased_Letter           Lu | Ll | Lt
Lm  Modifier_Letter        a modifier letter
Lo  Other_Letter           other letters, including syllables and ideographs
L   Letter                 Lu | Ll | Lt | Lm | Lo
Mn  Nonspacing_Mark        a nonspacing combining mark (zero advance width)
Mc  Spacing_Mark           a spacing combining mark (positive advance width)
Me  Enclosing_Mark         an enclosing combining mark
M   Mark                   Mn | Mc | Me
Nd  Decimal_Number         a decimal digit
Nl  Letter_Number          a letterlike numeric character
No  Other_Number           a numeric character of other type
N   Number                 Nd | Nl | No
Pc  Connector_Punctuation  a connecting punctuation mark, like a tie
Pd  Dash_Punctuation       a dash or hyphen punctuation mark
Ps  Open_Punctuation       an opening punctuation mark (of a pair)
Pe  Close_Punctuation      a closing punctuation mark (of a pair)
Pi  Initial_Punctuation    an initial quotation mark
Pf  Final_Punctuation      a final quotation mark
Po  Other_Punctuation      a punctuation mark of other type
P   Punctuation            Pc | Pd | Ps | Pe | Pi | Pf | Po
Sm  Math_Symbol            a symbol of mathematical use
Sc  Currency_Symbol        a currency sign
Sk  Modifier_Symbol        a non-letterlike modifier symbol
So  Other_Symbol           a symbol of other type
S   Symbol                 Sm | Sc | Sk | So
Zs  Space_Separator        a space character (of various non-zero widths)
Zl  Line_Separator         U+2028 LINE SEPARATOR only
Zp  Paragraph_Separator    U+2029 PARAGRAPH SEPARATOR only
Z   Separator              Zs | Zl | Zp
Cc  Control                a C0 or C1 control code
Cf  Format                 a format control character
Cs  Surrogate              a surrogate code point
Co  Private_Use            a private-use character
Cn  Unassigned             a reserved unassigned code point or a noncharacter
C   Other                  Cc | Cf | Cs | Co | Cn
'''

# Two-letter abbreviation -> human readable long name ('Lu' -> 'Uppercase
# Letter').  One-letter group rows (L, M, N, ...) are deliberately left out.
categories = {
    abbreviation: long_name.replace('_', ' ')
    for abbreviation, long_name, _description in (
        row.split(None, 2) for row in _CATEGORY_TABLE.strip().splitlines()
    )
    if len(abbreviation) == 2
}

# Friendlier primary names for the ASCII control characters.
custom_names = {
    0x00: "NUL '\\0' (null character)",
    0x01: "SOH (start of heading)",
    0x02: "STX (start of text)",
    0x03: "ETX (end of text)",
    0x04: "EOT (end of transmission)",
    0x05: "ENQ (enquiry)",
    0x06: "ACK (acknowledge)",
    0x07: "BEL '\\a' (bell)",
    0x08: "BS '\\b' (backspace)",
    0x09: "HT '\\t' (horizontal tab)",
    0x0A: "LF '\\n' (new line)",
    0x0B: "VT '\\v' (vertical tab)",
    0x0C: "FF '\\f' (form feed)",
    0x0D: "CR '\\r' (carriage return)",
    0x0E: "SO (shift out)",
    0x0F: "SI (shift in)",
    0x10: "DLE (data link escape)",
    0x11: "DC1 (device control 1)",
    0x12: "DC2 (device control 2)",
    0x13: "DC3 (device control 3)",
    0x14: "DC4 (device control 4)",
    0x15: "NAK (negative acknowledge)",
    0x16: "SYN (synchronous idle)",
    0x17: "ETB (end of transmission block)",
    0x18: "CAN (cancel)",
    0x19: "EM (end of medium)",
    0x1A: "SUB (substitute)",
    0x1B: "ESC (escape)",
    0x1C: "FS (file separator)",
    0x1D: "GS (group separator)",
    0x1E: "RS (record separator)",
    0x1F: "US (unit separator)",
    0x7F: "DEL (delete)",
}

# Downloaded over HTTPS so the data files cannot be tampered with in transit.
_UNICODE_DATA_URL = 'https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt'
_NAMES_LIST_URL = 'https://www.unicode.org/Public/UCD/latest/ucd/NamesList.txt'

# Number of ';'-separated fields in every UnicodeData.txt row.
_UNICODE_DATA_FIELDS = 15

# Labels that are stand-ins rather than real character names.
_PLACEHOLDER_NAMES = frozenset({'', '<control>'})


@dataclass
class Character:
    """One code point, or one First..Last range of code points."""

    # First (or only) code point.
    ordinal: int
    # Last code point of a range; None for a single code point.
    ordinal_end: Union[None, int]
    # General category abbreviation as read from UnicodeData.txt.
    category: str
    # Every known name/alias (may or may not include the primary name).
    names: Set[str]
    # Primary display name.
    name: str
    # Canonical combining class column of UnicodeData.txt.
    combining: int
    # False for single code points, otherwise the range's name.
    in_range: Union[bool, str]

    def __repr__(self):
        span = f'U+{uni(self.ordinal)}'
        if self.ordinal_end is not None:
            span += f'..U+{uni(self.ordinal_end)}'
        return f'<Character {span} {self.category} {self.name!r}>'


def uni(ordinal):
    """Return *ordinal* as lowercase hex, zero-padded to at least 4 digits."""
    return format(ordinal, '04x')


def _parse_unicode_data(lines: Iterable[str]) -> Dict[int, Character]:
    """Parse UnicodeData.txt rows into a mapping of ordinal -> Character.

    ``<Name, First>`` / ``<Name, Last>`` row pairs are collapsed into a single
    range Character stored under the first ordinal of the range.

    Raises:
        ValueError: on a row with the wrong number of fields, or on a range
            that is incomplete or whose two ends disagree.
    """
    # UnicodeData.txt is described at: http://www.unicode.org/L2/L1999/UnicodeData.html
    # And with more recent additions at: https://www.unicode.org/reports/tr44/#UCD_Files
    database: Dict[int, Character] = {}
    ranges: Dict[str, List[Optional[Character]]] = {}

    for line in lines:
        line = line.rstrip()
        if not line:
            continue
        row = line.split(';')
        if len(row) != _UNICODE_DATA_FIELDS:
            raise ValueError(
                f'expected {_UNICODE_DATA_FIELDS} fields, got {len(row)}: {line!r}'
            )
        code, name, category, combining = row[:4]
        old_name = row[10]

        ordinal = int(code, 16)
        names = set()
        if old_name:
            names.add(old_name)

        range_marker = None
        if name == '<control>':
            # Control characters carry no real name; fall back to the
            # Unicode 1.0 name column.
            name = old_name
        elif name.startswith('<') and name.endswith('>'):
            # '<Some Range, First>' / '<Some Range, Last>'
            name, _, marker = name[1:-1].partition(', ')
            range_marker = marker or None
        else:
            names.add(name)

        name = custom_names.get(ordinal, name)

        char = Character(
            ordinal=ordinal,
            ordinal_end=None,
            category=category,
            name=name,
            names=names,
            in_range=False,
            combining=int(combining),
        )

        if range_marker == 'First':
            ranges.setdefault(name, [None, None])[0] = char
        elif range_marker == 'Last':
            ranges.setdefault(name, [None, None])[1] = char
        else:
            database[ordinal] = char

    # TODO: What's something nicer than this?
    for range_name, (start, end) in ranges.items():
        if start is None or end is None:
            raise ValueError(f'range {range_name!r} is missing its First or Last row')
        if (start.category, start.combining) != (end.category, end.combining):
            raise ValueError(f'range {range_name!r} has inconsistent First/Last rows')
        label = f'({range_name} [U+{uni(start.ordinal)}..U+{uni(end.ordinal)}])'
        database[start.ordinal] = Character(
            ordinal=start.ordinal,
            ordinal_end=end.ordinal,
            category=start.category,
            name=label,
            names={label},
            in_range=range_name,
            combining=start.combining,
        )

    return database


def _merge_names_list(lines: Iterable[str], database: Dict[int, Character]) -> None:
    """Add names and '=' aliases from NamesList.txt rows to *database* in place.

    Code points not already in *database* are added with category 'Cn'.
    """
    # NamesList.txt is described at: https://www.unicode.org/Public/UCD/latest/ucd/NamesList.html
    # But I sort-of guessed and hoped for the best.
    current = None
    for line in lines:
        line = line.rstrip()
        # Skip blank lines, ';' comments and '@' headers.
        if not line or line[0] in ';@':
            continue
        if line[0] == '\t':
            # Only '\t= alias' lines are used; other annotations are ignored.
            if line[1:2] == '=' and current is not None:
                current.names.add(line[3:])
            continue

        code, _, name = line.partition('\t')
        ordinal = int(code, 16)
        current = database.get(ordinal)
        if current is None:
            current = Character(
                ordinal=ordinal,
                ordinal_end=None,
                category='Cn',
                name=name,
                names={name},
                in_range=False,
                combining=0,
            )
            database[ordinal] = current
        else:
            current.names.add(name)


def _strip_unwanted_names(database: Dict[int, Character]) -> None:
    """Drop placeholder labels and ' (1.0)'-suffixed aliases from every alias set."""
    for char in database.values():
        unwanted = {
            alias for alias in char.names
            if alias in _PLACEHOLDER_NAMES or alias.endswith(' (1.0)')
        }
        char.names -= unwanted


def _write_database(fh: TextIO, database: Dict[int, Character]) -> None:
    """Write one JSON array per character to *fh*, ordered by ordinal."""
    # TODO: Add Nami.txt short names in parens.
    for ordinal in sorted(database):
        char = database[ordinal]
        is_mark = char.category.startswith('M')
        names = [char.name] + sorted(char.names - {char.name})
        row = [char.ordinal, char.ordinal_end, char.category, is_mark, names]
        fh.write(json.dumps(row) + '\n')


def main(update_uni, update_names):
    """Generate ``UnicodeDataFull.json`` in the current directory.

    Args:
        update_uni: download a fresh ``UnicodeData.txt`` first when true.
        update_names: download a fresh ``NamesList.txt`` first when true.

    Raises:
        ValueError: if either input file is malformed.
        OSError: if an input file is missing or the output cannot be written.
    """
    unicode_data = 'UnicodeData.txt'
    names_list = 'NamesList.txt'

    if update_uni:
        unicode_data, _ = urlretrieve(_UNICODE_DATA_URL, unicode_data)
    if update_names:
        names_list, _ = urlretrieve(_NAMES_LIST_URL, names_list)

    # Decode explicitly: the platform default encoding is not reliable for
    # these files.  'utf-8-sig' also tolerates a leading BOM.
    with open(unicode_data, 'r', encoding='utf-8-sig') as fh:
        database = _parse_unicode_data(fh)

    with open(names_list, 'r', encoding='utf-8-sig') as fh:
        _merge_names_list(fh, database)

    _strip_unwanted_names(database)

    with open('UnicodeDataFull.json', 'w', encoding='utf-8') as fh:
        _write_database(fh, database)


def _print_usage(program, stream):
    """Print the command-line help text to *stream*."""
    print('Usage:', program, '[-u] [-l] [-h]', file=stream)
    print(' ', '-u, --unicode-data: Download UnicodeData.txt.', file=stream)
    print(' ', '-l, --names-list: Download NamesList.txt.', file=stream)
    print(' ', '-h, --help: Show this help.', file=stream)


def _run_cli(argv):
    """Parse *argv*, run :func:`main`, and return the process exit status."""
    update_uni = False
    update_names = False
    for arg in argv[1:]:
        if arg in ('-u', '--unicode-data'):
            update_uni = True
        elif arg in ('-l', '--names-list'):
            update_names = True
        elif arg in ('-h', '--help'):
            _print_usage(argv[0], sys.stdout)
            return 0
        else:
            _print_usage(argv[0], sys.stderr)
            return 1
    main(update_uni=update_uni, update_names=update_names)
    return 0


if __name__ == '__main__':
    sys.exit(_run_cli(sys.argv))