diff --git a/happybot/unicode/unicode.py b/happybot/unicode/unicode.py
new file mode 100644
index 0000000..d4a230b
--- /dev/null
+++ b/happybot/unicode/unicode.py
@@ -0,0 +1,336 @@
+#!/usr/bin/env python3
+
+from subprocess import Popen, PIPE
+from os import chdir, environ
+from pathlib import Path
+import re
+
+# Make sure we're in the correct directory, for module imports and such too.
+basedir = Path(environ.get('basedir', '.'))
+chdir(basedir)
+
+def cmd(args):
+    proc = Popen(args, stdout=PIPE)
+    while True:
+        line = proc.stdout.readline()
+        if line:
+            try:
+                yield str(line[:-1], 'utf-8', 'ignore')
+            except:
+                pass
+        else:
+            break
+
+def irc(chan):
+    global trigger
+    server = environ.get('serv', 'irc.libera.chat')
+    fdir = '/home/zgrep/offtopiabday/' + server + '/' + chan
+    fin = fdir + '/in'
+    fout = fdir + '/out'
+
+    for line in cmd(['tail', '-n', '0', '-f', fout]):
+        date, time, nick, line = line.split(' ', 3)
+        if nick[0] != '<' or nick[-1] != '>':
+            continue
+        nick = nick[1:-1]
+        m = re.match(r'(?i)^(?:@?(?:happy|hate)bot[:,] (?:unicode|char)|!char) ((?:-[8qvd]+ )+)?(.+)$', line)
+        if m:
+            flags, query = m.groups()
+            if not flags:
+                flags = ''
+            result = doit(flags, query).split('\n')
+            result = [ f'\u200b{nick}: ' + line for line in result ]
+            result = result[:4] # capping at 4 lines max
+            result = '\n'.join(result)
+            with open(fin, 'w') as fh:
+                fh.write(result + '\n')
+
+from collections import defaultdict
+from math import ceil, log
+import json
+
+# Copied from: https://www.unicode.org/reports/tr44/#GC_Values_Table
+categories = '''
+Lu	Uppercase_Letter	an uppercase letter
+Ll	Lowercase_Letter	a lowercase letter
+Lt	Titlecase_Letter	a digraphic character, with first part uppercase
+LC	Cased_Letter	Lu | Ll | Lt
+Lm	Modifier_Letter	a modifier letter
+Lo	Other_Letter	other letters, including syllables and ideographs
+L	Letter	Lu | Ll | Lt | Lm | Lo
+Mn	Nonspacing_Mark	a nonspacing combining mark (zero advance width)
+Mc	Spacing_Mark	a spacing combining mark (positive advance width)
+Me	Enclosing_Mark	an enclosing combining mark
+M	Mark	Mn | Mc | Me
+Nd	Decimal_Number	a decimal digit
+Nl	Letter_Number	a letterlike numeric character
+No	Other_Number	a numeric character of other type
+N	Number	Nd | Nl | No
+Pc	Connector_Punctuation	a connecting punctuation mark, like a tie
+Pd	Dash_Punctuation	a dash or hyphen punctuation mark
+Ps	Open_Punctuation	an opening punctuation mark (of a pair)
+Pe	Close_Punctuation	a closing punctuation mark (of a pair)
+Pi	Initial_Punctuation	an initial quotation mark
+Pf	Final_Punctuation	a final quotation mark
+Po	Other_Punctuation	a punctuation mark of other type
+P	Punctuation	Pc | Pd | Ps | Pe | Pi | Pf | Po
+Sm	Math_Symbol	a symbol of mathematical use
+Sc	Currency_Symbol	a currency sign
+Sk	Modifier_Symbol	a non-letterlike modifier symbol
+So	Other_Symbol	a symbol of other type
+S	Symbol	Sm | Sc | Sk | So
+Zs	Space_Separator	a space character (of various non-zero widths)
+Zl	Line_Separator	U+2028 LINE SEPARATOR only
+Zp	Paragraph_Separator	U+2029 PARAGRAPH SEPARATOR only
+Z	Separator	Zs | Zl | Zp
+Cc	Control	a C0 or C1 control code
+Cf	Format	a format control character
+Cs	Surrogate	a surrogate code point
+Co	Private_Use	a private-use character
+Cn	Unassigned	a reserved unassigned code point or a noncharacter
+C	Other	Cc | Cf | Cs | Co | Cn
+'''.strip().split('\n')
+categories = [ row.split('\t', 2) for row in categories ]
+categories = { left: right.replace('_', ' ') for left, right, ignore in categories if len(left) == 2 }
+
+def utf8uni(ordinal):
+    s = hex(int.from_bytes(chr(ordinal).encode('utf-8'), 'big'))[2:]
+    if len(s) % 2 == 1:
+        s = '0' + s
+    return '0x' + s
+
+def uniuni(ordinal):
+    return 'U+' + hex(ordinal)[2:].zfill(4).upper()
+
+UnicodeDataFull = 'UnicodeDataFull.json'
+
+tokens = re.compile(r'\s*\b(?:U\+([0-9A-Fa-f]+)|[Uu]([0-9A-F]{4,6})|0x([0-9a-f]+)|0b([01]+))\b\s*')
+invalid = [None, "Cn", False, [""]]
+unknown = [None, "Cn", False, [""]]
+
+def doit(flags, query):
+    quiet = 'q' in flags
+    verbose = 'v' in flags
+    decode = 'd' in flags
+    utf8 = '8' in flags
+
+    if utf8:
+        unif = utf8uni
+    else:
+        unif = uniuni
+
+    cache = dict()
+
+    if len(query) <= 2:
+        decode = True
+
+    if decode:
+        search = list(map(ord, query))
+
+    else:
+        index, merge = 0, False
+        search = []
+        for match in tokens.finditer(query):
+            missed = query[index:match.start()]
+            if missed:
+                if merge:
+                    search[-1] += missed
+                else:
+                    search.append(missed)
+            index = match.end()
+            merge = False
+            uni1, uni2, hexa, bina = match.groups()
+            uni = uni1 or uni2
+            if uni:
+                search.append(int(uni, 16))
+            elif hexa:
+                try:
+                    byt = int(hexa, 16).to_bytes(ceil(len(hexa)/2), 'big').decode('utf-8', 'strict')
+                    search.extend(map(ord, byt))
+                except:
+                    if search and isinstance(search[-1], str):
+                        search[-1] += '0x' + hexa
+                    else:
+                        search.append('0x' + hexa)
+                    merge = True
+            elif bina:
+                try:
+                    byt = int(bina, 2).to_bytes(ceil(len(bina)/8), 'big').decode('utf-8', 'strict')
+                    search.extend(map(ord, byt))
+                except:
+                    if search and isinstance(search[-1], str):
+                        search[-1] += '0b' + bina
+                    else:
+                        search.append('0b' + bina)
+                    merge = True
+        missed = query[index:]
+        if missed:
+            if merge:
+                search[-1] += missed
+            else:
+                search.append(missed)
+
+    results = [[] for _ in range(len(search))]
+    numbers = defaultdict(list)
+    strings = defaultdict(list)
+    for i, elem in enumerate(search):
+        if isinstance(elem, int):
+            numbers[elem].append(i)
+        elif isinstance(elem, str):
+            strings[elem.lower()].append(i)
+    numbers = list(sorted(numbers.items(), reverse=True))
+
+    # The actual searching.
+    filled = set()
+    with open(UnicodeDataFull, 'r') as fh:
+        for line in fh:
+            row = json.loads(line)
+            if numbers:
+                if row[0] == numbers[-1][0]:
+                    cache[row[0]] = row
+                    for index in numbers[-1][1]:
+                        filled.add(index)
+                        results[index].append(row[0])
+                    numbers.pop()
+                elif row[1]:
+                    while numbers and row[0] <= numbers[-1][0] <= row[1]:
+                        cache[numbers[-1][0]] = row
+                        for index in numbers[-1][1]:
+                            filled.add(index)
+                            results[index].append(numbers[-1][0])
+                        numbers.pop()
+            elif not strings:
+                break
+            for string, indices in strings.items():
+                if any(string in name.lower() for name in row[4]):
+                    num = row[0]
+                    if row[1]:
+                        num = -num
+                    cache[num] = row
+                    for index in indices:
+                        filled.add(index)
+                        results[index].append(num)
+
+    missing = set(range(len(search))) - filled
+    numbers = defaultdict(list)
+    indices = set()
+    for i in missing:
+        elem = search[i]
+        if isinstance(elem, int):
+            cache[elem] = [elem] + invalid
+            results[i].append(elem)
+        elif isinstance(elem, str):
+            results[i] = [None] * len(elem)
+            for j, c in enumerate(elem):
+                numbers[ord(c)].append((i, j))
+                indices.add((i, j))
+    numbers = list(sorted(numbers.items(), reverse=True))
+
+    if indices:
+        # Decoding what we have left, just some numbers.
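+        # 'numbers' pairs each remaining code point with the (result index, character
+        # index) slots that still need it, sorted descending so the smallest code
+        # point sits at the end and can be popped off as the file is scanned in
+        # ascending order.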
+        with open(UnicodeDataFull, 'r') as fh:
+            for line in fh:
+                row = json.loads(line)
+                if numbers:
+                    if row[0] == numbers[-1][0]:
+                        cache[row[0]] = row
+                        for i, j in numbers[-1][1]:
+                            indices.remove((i, j))
+                            results[i][j] = row[0]
+                        numbers.pop()
+                    elif row[1]:
+                        while numbers and row[0] <= numbers[-1][0] <= row[1]:
+                            cache[numbers[-1][0]] = row
+                            for i, j in numbers[-1][1]:
+                                indices.remove((i, j))
+                                results[i][j] = numbers[-1][0]
+                            numbers.pop()
+                else:
+                    break
+
+        for i, j in indices:
+            num = ord(search[i][j])
+            cache[num] = [num] + unknown
+            results[i][j] = num
+
+        if len(search) == 1:
+            # This means we've fallen back on decoding our single input as a string.
+            # Setting this lets us display output differently, hopefully more usefully.
+            decode = True
+
+    results = [r for inner in results for r in inner]
+
+    if quiet and not verbose:
+        if decode:
+            fmt = '{code}'
+        else:
+            fmt = '{char}'
+        join = ' '
+    elif verbose and not quiet:
+        fmt = '{code} [{long_category}] {names}: {char}'
+        join = '\n'
+    else:
+        fmt = '{code} [{category}] {name}: {char}'
+        join = '\n'
+
+    def get_output(results):
+        output = []
+
+        for num in results:
+            is_range = False
+            if num < 0:
+                is_range = True
+            range_start, range_end, category, compose, names = cache[num]
+            if is_range:
+                if compose:
+                    char = '\u25cc' + chr(range_start) + '..\u25cc' + chr(range_end)
+                else:
+                    char = chr(range_start) + '..' + chr(range_end)
+                code = unif(range_start) + '..' + unif(range_end)
+            else:
+                char = chr(num)
+                if compose:
+                    char = '\u25cc' + char
+                code = unif(num)
+            output.append(fmt.format(
+                code=code,
+                name=names[0],
+                names=', '.join(names),
+                char=char,
+                category=category,
+                long_category=categories[category],
+            ))
+
+        return join.join(output)
+
+    if quiet:
+        output = get_output(results)
+        output8 = output.encode('utf-8')
+        if len(output8) > 470:
+            # Binary-search for the longest prefix of results whose output (plus
+            # the trailing ' ...') still fits under the IRC-friendly 450-byte limit.
+            cut = len(results) // 2
+            clen = cut
+            tried_okay = set()
+            for i in range(ceil(log(len(results), 2)) + 1):
+                output8 = (get_output(results[:cut]) + ' ...').encode('utf-8')
+                clen //= 2
+                if len(output8) < 450:
+                    tried_okay.add(cut)
+                    cut += clen
+                else:
+                    cut -= clen
+            output = get_output(results[:max(tried_okay)]) + ' ...'
+        return output
+    else:
+        return get_output(results)
+
+if __name__ == '__main__':
+    from sys import argv
+
+    if len(argv) == 2:
+        irc(argv[1])
+    elif len(argv) == 3:
+        print(doit(argv[1], argv[2]))
+    else:
+        print('Usage:', argv[0], '#channel')
+        print(' or:', argv[0], '[8qvd]*', 'query')
+        exit(1)
diff --git a/happybot/unicode/update.py b/happybot/unicode/update.py
new file mode 100644
index 0000000..9e3319f
--- /dev/null
+++ b/happybot/unicode/update.py
@@ -0,0 +1,243 @@
+#!/usr/bin/env python3
+
+from urllib.request import urlretrieve
+from dataclasses import dataclass
+from typing import Union, Set
+import json
+
+# Copied from: https://www.unicode.org/reports/tr44/#GC_Values_Table
+categories = '''
+Lu	Uppercase_Letter	an uppercase letter
+Ll	Lowercase_Letter	a lowercase letter
+Lt	Titlecase_Letter	a digraphic character, with first part uppercase
+LC	Cased_Letter	Lu | Ll | Lt
+Lm	Modifier_Letter	a modifier letter
+Lo	Other_Letter	other letters, including syllables and ideographs
+L	Letter	Lu | Ll | Lt | Lm | Lo
+Mn	Nonspacing_Mark	a nonspacing combining mark (zero advance width)
+Mc	Spacing_Mark	a spacing combining mark (positive advance width)
+Me	Enclosing_Mark	an enclosing combining mark
+M	Mark	Mn | Mc | Me
+Nd	Decimal_Number	a decimal digit
+Nl	Letter_Number	a letterlike numeric character
+No	Other_Number	a numeric character of other type
+N	Number	Nd | Nl | No
+Pc	Connector_Punctuation	a connecting punctuation mark, like a tie
+Pd	Dash_Punctuation	a dash or hyphen punctuation mark
+Ps	Open_Punctuation	an opening punctuation mark (of a pair)
+Pe	Close_Punctuation	a closing punctuation mark (of a pair)
+Pi	Initial_Punctuation	an initial quotation mark
+Pf	Final_Punctuation	a final quotation mark
+Po	Other_Punctuation	a punctuation mark of other type
+P	Punctuation	Pc | Pd | Ps | Pe | Pi | Pf | Po
+Sm	Math_Symbol	a symbol of mathematical use
+Sc	Currency_Symbol	a currency sign
+Sk	Modifier_Symbol	a non-letterlike modifier symbol
+So	Other_Symbol	a symbol of other type
+S	Symbol	Sm | Sc | Sk | So
+Zs	Space_Separator	a space character (of various non-zero widths)
+Zl	Line_Separator	U+2028 LINE SEPARATOR only
+Zp	Paragraph_Separator	U+2029 PARAGRAPH SEPARATOR only
+Z	Separator	Zs | Zl | Zp
+Cc	Control	a C0 or C1 control code
+Cf	Format	a format control character
+Cs	Surrogate	a surrogate code point
+Co	Private_Use	a private-use character
+Cn	Unassigned	a reserved unassigned code point or a noncharacter
+C	Other	Cc | Cf | Cs | Co | Cn
+'''.strip().split('\n')
+categories = [ row.split('\t', 2) for row in categories ]
+categories = { left: right.replace('_', ' ') for left, right, ignore in categories if len(left) == 2 }
+
+custom_names = {
+    0x00: "NUL '\\0' (null character)",
+    0x01: "SOH (start of heading)",
+    0x02: "STX (start of text)",
+    0x03: "ETX (end of text)",
+    0x04: "EOT (end of transmission)",
+    0x05: "ENQ (enquiry)",
+    0x06: "ACK (acknowledge)",
+    0x07: "BEL '\\a' (bell)",
+    0x08: "BS '\\b' (backspace)",
+    0x09: "HT '\\t' (horizontal tab)",
+    0x0A: "LF '\\n' (new line)",
+    0x0B: "VT '\\v' (vertical tab)",
+    0x0C: "FF '\\f' (form feed)",
+    0x0D: "CR '\\r' (carriage return)",
+    0x0E: "SO (shift out)",
+    0x0F: "SI (shift in)",
+    0x10: "DLE (data link escape)",
+    0x11: "DC1 (device control 1)",
+    0x12: "DC2 (device control 2)",
+    0x13: "DC3 (device control 3)",
+    0x14: "DC4 (device control 4)",
+    0x15: "NAK (negative acknowledge)",
+    0x16: "SYN (synchronous idle)",
+    0x17: "ETB (end of transmission block)",
+    0x18: "CAN (cancel)",
+    0x19: "EM (end of medium)",
+    0x1A: "SUB (substitute)",
+    0x1B: "ESC (escape)",
+    0x1C: "FS (file separator)",
+    0x1D: "GS (group separator)",
+    0x1E: "RS (record separator)",
+    0x1F: "US (unit separator)",
+    0x7F: "DEL (delete)",
+    }
+
+@dataclass
+class Character:
+    ordinal: int
+    ordinal_end: Union[None, int]
+    category: str
+    names: Set[str]
+    name: str
+    combining: int
+    in_range: Union[bool, str]
+
+    def __repr__(self):
+        return f'<Character U+{uni(self.ordinal)} {self.name}>'
+
+def uni(ordinal):
+    return hex(ordinal)[2:].zfill(4)
+
+def main(update_uni, update_names):
+    UnicodeData = 'UnicodeData.txt'
+    NamesList = 'NamesList.txt'
+
+    if update_uni:
+        UnicodeData, _ = urlretrieve('http://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt', UnicodeData)
+    if update_names:
+        NamesList, _ = urlretrieve('http://www.unicode.org/Public/UCD/latest/ucd/NamesList.txt', NamesList)
+
+    database = dict()
+    ranges = dict()
+
+    # UnicodeData.txt is described at: http://www.unicode.org/L2/L1999/UnicodeData.html
+    # And with more recent additions at: https://www.unicode.org/reports/tr44/#UCD_Files
+    with open(UnicodeData, 'r') as fh:
+        for line in fh:
+            row = line.rstrip().split(';')
+            code, name, category, combining,\
+            bidirectional, decomposition, decimal,\
+            digit, numeric, mirrored, old_name,\
+            iso_comment, upper, lower, title = row
+
+            ordinal = int(code, 16)
+            combining = int(combining)
+            names = set()
+
+            if old_name:
+                names.add(old_name)
+
+            firstlast = None
+            if name == '':
+                name = old_name
+            elif name[0] == '<' and name[-1] == '>':
+                # Range markers look like '<CJK Ideograph, First>'; plain '<control>'
+                # entries have no ', First'/', Last' part, so partition() leaves
+                # firstlast empty for them.
+                name = name[1:-1]
+                name, _, firstlast = name.partition(', ')
+            else:
+                names.add(name)
+
+            if ordinal in custom_names:
+                name = custom_names[ordinal]
+
+            char = Character(
+                ordinal=ordinal,
+                ordinal_end=None,
+                category=category,
+                name=name,
+                names=names,
+                in_range=False,
+                combining=combining,
+            )
+
+            if firstlast and name not in ranges:
+                ranges[name] = [None, None]
+
+            if firstlast == 'First':
+                ranges[name][0] = char
+            elif firstlast == 'Last':
+                ranges[name][1] = char
+            else:
+                database[char.ordinal] = char
+
+    # TODO: What's something nicer than this?
+    for range_name, (start, end) in ranges.items():
+        name = f'({range_name} [U+{uni(start.ordinal)}..U+{uni(end.ordinal)}])'
+        start.name = name
+        end.name = name
+        assert start.category == end.category
+        assert start.combining == end.combining
+        char = Character(
+            ordinal=start.ordinal,
+            ordinal_end=end.ordinal,
+            category=start.category,
+            name=name,
+            names={name},
+            in_range=range_name,
+            combining=start.combining,
+        )
+        database[char.ordinal] = char
+
+    # NamesList.txt is described at: https://www.unicode.org/Public/UCD/latest/ucd/NamesList.html
+    # But I sort-of guessed and hoped for the best.
+    with open(NamesList, 'r') as fh:
+        char = None
+        for line in fh:
+            line = line.rstrip()
+            if not line or line[0] in ';@':
+                continue
+            if line[0] == '\t':
+                if line[1] == '=':
+                    char.names.add(line[3:])
+            else:
+                char, name = line.split('\t', 1)
+                ordinal = int(char, 16)
+                if ordinal in database:
+                    char = database[ordinal]
+                    char.names.add(name)
+                else:
+                    char = Character(
+                        ordinal=ordinal,
+                        ordinal_end=None,
+                        category='Cn',
+                        name=name,
+                        names={name},
+                        in_range=False,
+                        combining=0,
+                    )
+                    database[ordinal] = char
+
+    for char in database.values():
+        char.names -= {''}
+        char.names -= { n for n in char.names if n.endswith(' (1.0)') }
+
+    # TODO: Add Nami.txt short names in parens.
+    with open('UnicodeDataFull.json', 'w') as fh:
+        for o in sorted(database.keys()):
+            char = database[o]
+            combining = char.category[0] == 'M'
+            names = [char.name] + list(sorted(char.names - {char.name}))
+            row = [char.ordinal, char.ordinal_end, char.category, combining, names]
+            fh.write(json.dumps(row) + '\n')
+
+if __name__ == '__main__':
+    import sys
+
+    update_uni = False
+    update_names = False
+    for arg in sys.argv[1:]:
+        if arg in ('-u', '--unicode-data'):
+            update_uni = True
+        elif arg in ('-l', '--names-list'):
+            update_names = True
+        else:
+            print('Usage:', sys.argv[0], '[-u] [-l]')
+            print(' ', '-u, --unicode-data: Download UnicodeData.txt.')
+            print(' ', '-l, --names-list: Download NamesList.txt.')
+            print(' ', '-h, --help: Show this help.')
+            exit(1)
+
+    main(update_uni=update_uni, update_names=update_names)