We now support the output that you actually wanted!

Who would've thought that shell scripts were awkward for complex
situations...
This commit is contained in:
zgrep 2021-07-11 06:36:30 -04:00
parent d58a32d07d
commit a0c6637486
2 changed files with 579 additions and 0 deletions

336
happybot/unicode/unicode.py Normal file
View File

@ -0,0 +1,336 @@
#!/usr/bin/env python3
from subprocess import Popen, PIPE
from os import chdir, environ
from pathlib import Path
import re
# Make sure we're in the correct directory, for module imports and such too.
# The 'basedir' environment variable overrides the default of the current
# directory; relative paths below (e.g. UnicodeDataFull.json) resolve here.
basedir = Path(environ.get('basedir', '.'))
chdir(basedir)
def cmd(args):
    """Run *args* as a subprocess and yield its stdout lines as strings.

    Each line is decoded as UTF-8 with undecodable bytes dropped, and a
    trailing newline (when present) is stripped.  The pipe is closed and the
    child stopped/reaped when the generator is exhausted or closed, so
    abandoning the generator no longer leaks a running process.
    """
    proc = Popen(args, stdout=PIPE)
    try:
        for raw in proc.stdout:
            # Strip only a real newline; the old code sliced the last byte
            # unconditionally, corrupting a final unterminated line.  The
            # old bare `except: pass` around the yield could also swallow
            # GeneratorExit, which is an error — decoding with 'ignore'
            # cannot raise, so no handler is needed at all.
            if raw.endswith(b'\n'):
                raw = raw[:-1]
            yield raw.decode('utf-8', 'ignore')
    finally:
        proc.stdout.close()
        proc.terminate()
        proc.wait()
def irc(chan):
    """Follow an ii-style IRC log for *chan* and answer unicode queries.

    Tails ``<base>/<server>/<chan>/out`` forever; for each chat line that
    matches the trigger pattern it runs doit() and writes at most four
    reply lines to the channel's ``in`` file.  Never returns.
    """
    server = environ.get('serv', 'irc.libera.chat')
    fdir = '/home/zgrep/offtopiabday/' + server + '/' + chan
    fin = fdir + '/in'
    fout = fdir + '/out'
    for line in cmd(['tail', '-n', '0', '-f', fout]):
        # Log lines look like "<date> <time> <nick> <message>".  Server
        # notices and joins/parts can have fewer fields; skip them instead
        # of letting ValueError kill the tail loop.
        try:
            date, time, nick, line = line.split(' ', 3)
        except ValueError:
            continue
        # Only chat messages wrap the nick in <>.  startswith/endswith also
        # safely rejects an empty nick field, where nick[0] used to raise.
        if not (nick.startswith('<') and nick.endswith('>')):
            continue
        nick = nick[1:-1]
        m = re.match(r'(?i)^(?:@?(?:happy|hate)bot[:,] (?:unicode|char)|!char) ((?:-[8qvd]+ )+)?(.+)$', line)
        if m:
            flags, query = m.groups()
            if not flags:
                flags = ''
            result = doit(flags, query).split('\n')
            # \u200b is a zero-width space — presumably it prevents clients
            # from highlighting the echoed nick; confirm before removing.
            result = [f'\u200b{nick}: ' + reply for reply in result]
            result = result[:4]  # capping at 4 lines max
            result = '\n'.join(result)
            with open(fin, 'w') as fh:
                fh.write(result + '\n')
from collections import defaultdict
from math import ceil, log
import json
# General-category code -> human-readable name.
# Transcribed from: https://www.unicode.org/reports/tr44/#GC_Values_Table
# The original parsed a tab-separated here-doc copy of that table with
# str.split('\t', 2), which silently breaks the moment an editor, paste, or
# diff tool turns the tabs into spaces — so the two-letter categories are
# now spelled out directly.  Single-letter groupings (L, M, N, P, S, Z, C)
# were filtered out by the old `len(left) == 2` check and are omitted here.
categories = {
    'Lu': 'Uppercase Letter',
    'Ll': 'Lowercase Letter',
    'Lt': 'Titlecase Letter',
    'LC': 'Cased Letter',
    'Lm': 'Modifier Letter',
    'Lo': 'Other Letter',
    'Mn': 'Nonspacing Mark',
    'Mc': 'Spacing Mark',
    'Me': 'Enclosing Mark',
    'Nd': 'Decimal Number',
    'Nl': 'Letter Number',
    'No': 'Other Number',
    'Pc': 'Connector Punctuation',
    'Pd': 'Dash Punctuation',
    'Ps': 'Open Punctuation',
    'Pe': 'Close Punctuation',
    'Pi': 'Initial Punctuation',
    'Pf': 'Final Punctuation',
    'Po': 'Other Punctuation',
    'Sm': 'Math Symbol',
    'Sc': 'Currency Symbol',
    'Sk': 'Modifier Symbol',
    'So': 'Other Symbol',
    'Zs': 'Space Separator',
    'Zl': 'Line Separator',
    'Zp': 'Paragraph Separator',
    'Cc': 'Control',
    'Cf': 'Format',
    'Cs': 'Surrogate',
    'Co': 'Private Use',
    'Cn': 'Unassigned',
}
def utf8uni(ordinal):
    """Format codepoint *ordinal* as the hex of its UTF-8 bytes, e.g. '0xe282ac'."""
    # bytes.hex() already yields an even number of lowercase digits, so no
    # manual zero-padding is needed.
    return '0x' + chr(ordinal).encode('utf-8').hex()
def uniuni(ordinal):
    """Format codepoint *ordinal* as U+XXXX (uppercase, at least four hex digits)."""
    return f'U+{ordinal:04X}'
# Line-delimited JSON database produced by update.py; each line is a row
# [start, end_or_null, category, combining_bool, [names...]], sorted by
# codepoint.
UnicodeDataFull = 'UnicodeDataFull.json'
# Numeric tokens recognized inside a query: U+1F600 / u1F600-style
# codepoints, 0x... hex strings of UTF-8 bytes, and 0b... binary strings.
tokens = re.compile(r'\s*\b(?:U\+([0-9A-Fa-f]+)|[Uu]([0-9A-F]{4,6})|0x([0-9a-f]+)|0b([01]+))\b\s*')
# Placeholder rows (minus the leading ordinal) used by doit(): 'invalid'
# for numbers not present in the database, 'unknown' for characters of
# search strings that matched nothing.
invalid = [None, "Cn", False, ["<invalid>"]]
unknown = [None, "Cn", False, ["<unknown>"]]
def doit(flags, query):
    """Answer a unicode/char query and return the formatted reply text.

    flags: any subset of the letters 'q' (quiet), 'v' (verbose),
           'd' (treat the query as literal characters to describe) and
           '8' (show UTF-8 byte hex instead of U+ codepoints).
    query: literal characters, or a mix of name fragments and numeric
           tokens (U+XXXX, uXXXX, 0x.. UTF-8 bytes, 0b.. binary bytes).

    Codepoints and names are looked up in the line-delimited JSON database
    named by UnicodeDataFull (one row per line, sorted by codepoint).
    """
    quiet = 'q' in flags
    verbose = 'v' in flags
    decode = 'd' in flags
    utf8 = '8' in flags
    if utf8:
        unif = utf8uni
    else:
        unif = uniuni
    cache = dict()  # codepoint (negated for range rows) -> database row
    # Very short queries are far more likely literal characters than useful
    # name searches, so force decode mode for them.
    if len(query) <= 2:
        decode = True
    if decode:
        search = list(map(ord, query))
    else:
        # Tokenize the query into codepoint numbers and name-search strings.
        # 'merge' glues the text around an undecodable 0x../0b.. token back
        # into a single search string.
        index, merge = 0, False
        search = []
        for match in tokens.finditer(query):
            missed = query[index:match.start()]
            if missed:
                if merge:
                    search[-1] += missed
                else:
                    search.append(missed)
            index = match.end()
            merge = False
            uni1, uni2, hexa, bina = match.groups()
            uni = uni1 or uni2
            if uni:
                search.append(int(uni, 16))
            elif hexa:
                try:
                    # 0x.. tokens are UTF-8 bytes, not codepoints.  'strict'
                    # replaces the bogus handler name 'error' the old code
                    # passed; both end up in the except branch on bad input.
                    byt = int(hexa, 16).to_bytes(ceil(len(hexa)/2), 'big').decode('utf-8', 'strict')
                    search.extend(map(ord, byt))
                except UnicodeDecodeError:
                    # Not valid UTF-8 — treat the token as literal text.
                    # Checking 'search' first fixes an IndexError when the
                    # query *started* with such a token.
                    if search and isinstance(search[-1], str):
                        search[-1] += '0x' + hexa
                    else:
                        search.append('0x' + hexa)
                    merge = True
            elif bina:
                try:
                    byt = int(bina, 2).to_bytes(ceil(len(bina)/8), 'big').decode('utf-8', 'strict')
                    search.extend(map(ord, byt))
                except UnicodeDecodeError:
                    if search and isinstance(search[-1], str):
                        search[-1] += '0b' + bina
                    else:
                        search.append('0b' + bina)
                    merge = True
        missed = query[index:]
        if missed:
            if merge:
                search[-1] += missed
            else:
                search.append(missed)
    results = [[] for _ in range(len(search))]
    numbers = defaultdict(list)  # codepoint -> positions in 'search'
    strings = defaultdict(list)  # lowercased name fragment -> positions
    for i, elem in enumerate(search):
        if isinstance(elem, int):
            numbers[elem].append(i)
        elif isinstance(elem, str):
            strings[elem.lower()].append(i)
    # Sorted descending so numbers[-1] is the smallest outstanding one; the
    # database file is sorted ascending, so one pass resolves them all.
    numbers = list(sorted(numbers.items(), reverse=True))
    # The actual searching.
    filled = set()
    with open(UnicodeDataFull, 'r') as fh:
        for line in fh:
            row = json.loads(line)
            if numbers:
                if row[0] == numbers[-1][0]:
                    cache[row[0]] = row
                    for index in numbers[-1][1]:
                        filled.add(index)
                        results[index].append(row[0])
                    numbers.pop()
                elif row[1]:
                    # row describes the inclusive range [row[0], row[1]].
                    while numbers and row[0] <= numbers[-1][0] <= row[1]:
                        cache[numbers[-1][0]] = row
                        for index in numbers[-1][1]:
                            filled.add(index)
                            results[index].append(numbers[-1][0])
                        numbers.pop()
            elif not strings:
                break  # nothing left to look for
            for string, indices in strings.items():
                if any(string in name.lower() for name in row[4]):
                    num = row[0]
                    if row[1]:
                        # Range rows are cached under a negated key so
                        # get_output() can tell them apart from codepoints.
                        num = -num
                    cache[num] = row
                    for index in indices:
                        filled.add(index)
                        results[index].append(num)
    # Anything unresolved: numbers get the <invalid> placeholder; unmatched
    # name strings fall back to being decoded character by character.
    missing = set(range(len(search))) - filled
    numbers = defaultdict(list)
    indices = set()
    for i in missing:
        elem = search[i]
        if isinstance(elem, int):
            cache[elem] = [elem] + invalid
            results[i].append(elem)
        elif isinstance(elem, str):
            results[i] = [None] * len(elem)
            for j, c in enumerate(elem):
                numbers[ord(c)].append((i, j))
                indices.add((i, j))
    numbers = list(sorted(numbers.items(), reverse=True))
    if indices:
        # Decoding what we have left, just some numbers.
        with open(UnicodeDataFull, 'r') as fh:
            for line in fh:
                row = json.loads(line)
                if numbers:
                    if row[0] == numbers[-1][0]:
                        cache[row[0]] = row
                        for i, j in numbers[-1][1]:
                            indices.remove((i, j))
                            results[i][j] = row[0]
                        numbers.pop()
                    elif row[1]:
                        while numbers and row[0] <= numbers[-1][0] <= row[1]:
                            cache[numbers[-1][0]] = row
                            for i, j in numbers[-1][1]:
                                indices.remove((i, j))
                                results[i][j] = numbers[-1][0]
                            numbers.pop()
                else:
                    break
        for i, j in indices:
            num = ord(search[i][j])
            cache[num] = [num] + unknown
            results[i][j] = num
    if len(search) == 1:
        # This means we've fallen back on decoding our single input as a string.
        # Setting this lets us display output differently, hopefully more usefully.
        decode = True
    results = [r for inner in results for r in inner]
    if quiet and not verbose:
        if decode:
            fmt = '{code}'
        else:
            fmt = '{char}'
        join = ' '
    elif verbose and not quiet:
        fmt = '{code} [{long_category}] {names}: {char}'
        join = '\n'
    else:
        fmt = '{code} [{category}] {name}: {char}'
        join = '\n'
    def get_output(results):
        # Render each cached row through 'fmt'; negative keys mean ranges.
        output = []
        for num in results:
            is_range = False
            if num < 0:
                is_range = True
            range_start, range_end, category, compose, names = cache[num]
            if is_range:
                if compose:
                    # \u25cc is the dotted circle combining marks attach to.
                    char = '\u25cc' + chr(range_start) + '..\u25cc' + chr(range_end)
                else:
                    char = chr(range_start) + '..' + chr(range_end)
                code = unif(range_start) + '..' + unif(range_end)
            else:
                # The U+ token regex accepts up to six hex digits, so an
                # <invalid> number can exceed U+10FFFF, where chr() raises;
                # show the replacement character instead of crashing.
                char = chr(num) if num <= 0x10FFFF else '\ufffd'
                if compose:
                    char = '\u25cc' + char
                code = unif(num)
            output.append(fmt.format(
                code=code,
                name=names[0],
                names=', '.join(names),
                char=char,
                category=category,
                long_category=categories[category],
            ))
        return join.join(output)
    if quiet:
        output = get_output(results)
        output8 = output.encode('utf-8')
        if len(output8) > 470:
            # Binary-search the longest prefix that encodes to < 450 bytes.
            cut = len(results) // 2
            clen = cut
            # Seeding with 0 keeps max() below from raising ValueError if no
            # prefix is ever short enough (reply then degrades to ' ...').
            tried_okay = {0}
            for _ in range(ceil(log(len(results), 2)) + 1):
                output8 = (get_output(results[:cut]) + ' ...').encode('utf-8')
                clen //= 2
                if len(output8) < 450:
                    tried_okay.add(cut)
                    cut += clen
                else:
                    cut -= clen
            output = get_output(results[:max(tried_okay)]) + ' ...'
        return output
    else:
        return get_output(results)
if __name__ == '__main__':
    # CLI: one argument -> run as an IRC bot on that channel;
    # two arguments -> answer a single query and exit.
    from sys import argv
    argc = len(argv)
    if argc == 2:
        irc(argv[1])
    elif argc == 3:
        print(doit(argv[1], argv[2]))
    else:
        print('Usage:', argv[0], '#channel')
        print(' or:', argv[0], '[qvd]*', 'query')
        exit(1)

243
happybot/unicode/update.py Normal file
View File

@ -0,0 +1,243 @@
#!/usr/bin/env python3
from urllib.request import urlretrieve
from dataclasses import dataclass
from typing import Union, Set
import json
# General-category code -> human-readable name.
# Transcribed from: https://www.unicode.org/reports/tr44/#GC_Values_Table
# The original parsed a tab-separated here-doc copy of that table with
# str.split('\t', 2), which silently breaks the moment an editor, paste, or
# diff tool turns the tabs into spaces — so the two-letter categories are
# now spelled out directly.  Single-letter groupings (L, M, N, P, S, Z, C)
# were filtered out by the old `len(left) == 2` check and are omitted here.
categories = {
    'Lu': 'Uppercase Letter',
    'Ll': 'Lowercase Letter',
    'Lt': 'Titlecase Letter',
    'LC': 'Cased Letter',
    'Lm': 'Modifier Letter',
    'Lo': 'Other Letter',
    'Mn': 'Nonspacing Mark',
    'Mc': 'Spacing Mark',
    'Me': 'Enclosing Mark',
    'Nd': 'Decimal Number',
    'Nl': 'Letter Number',
    'No': 'Other Number',
    'Pc': 'Connector Punctuation',
    'Pd': 'Dash Punctuation',
    'Ps': 'Open Punctuation',
    'Pe': 'Close Punctuation',
    'Pi': 'Initial Punctuation',
    'Pf': 'Final Punctuation',
    'Po': 'Other Punctuation',
    'Sm': 'Math Symbol',
    'Sc': 'Currency Symbol',
    'Sk': 'Modifier Symbol',
    'So': 'Other Symbol',
    'Zs': 'Space Separator',
    'Zl': 'Line Separator',
    'Zp': 'Paragraph Separator',
    'Cc': 'Control',
    'Cf': 'Format',
    'Cs': 'Surrogate',
    'Co': 'Private Use',
    'Cn': 'Unassigned',
}
# Friendlier display names for the C0 control characters and DEL, keyed by
# codepoint; main() uses these to override the name parsed from
# UnicodeData.txt.
custom_names = {
    0x00: "NUL '\\0' (null character)",
    0x01: "SOH (start of heading)",
    0x02: "STX (start of text)",
    0x03: "ETX (end of text)",
    0x04: "EOT (end of transmission)",
    0x05: "ENQ (enquiry)",
    0x06: "ACK (acknowledge)",
    0x07: "BEL '\\a' (bell)",
    0x08: "BS '\\b' (backspace)",
    0x09: "HT '\\t' (horizontal tab)",
    0x0A: "LF '\\n' (new line)",
    0x0B: "VT '\\v' (vertical tab)",
    0x0C: "FF '\\f' (form feed)",
    0x0D: "CR '\\r' (carriage return)",
    0x0E: "SO (shift out)",
    0x0F: "SI (shift in)",
    0x10: "DLE (data link escape)",
    0x11: "DC1 (device control 1)",
    0x12: "DC2 (device control 2)",
    0x13: "DC3 (device control 3)",
    0x14: "DC4 (device control 4)",
    0x15: "NAK (negative acknowledge)",
    0x16: "SYN (synchronous idle)",
    0x17: "ETB (end of transmission block)",
    0x18: "CAN (cancel)",
    0x19: "EM (end of medium)",
    0x1A: "SUB (substitute)",
    0x1B: "ESC (escape)",
    0x1C: "FS (file separator)",
    0x1D: "GS (group separator)",
    0x1E: "RS (record separator)",
    0x1F: "US (unit separator)",
    0x7F: "DEL (delete)",
}
@dataclass
class Character:
    """One entry of the character database built by main().

    A plain entry describes a single codepoint; when 'ordinal_end' is set,
    the entry stands for the whole inclusive run ordinal..ordinal_end (the
    blocks UnicodeData.txt only lists as First/Last marker pairs).
    """
    ordinal: int                   # codepoint, or start of a range
    ordinal_end: Union[None, int]  # inclusive range end; None for a single codepoint
    category: str                  # two-letter general category, e.g. 'Lu'
    names: Set[str]                # all known names/aliases
    name: str                      # preferred display name
    combining: int                 # numeric combining-class field from UnicodeData.txt
    in_range: Union[bool, str]     # range name when built from First/Last markers, else False
    def __repr__(self):
        return '<U+%s>' % uni(self.ordinal)
def uni(ordinal):
    """Lowercase hex for *ordinal*, zero-padded to at least four digits."""
    return format(ordinal, '04x')
def main(update_uni, update_names):
    """Build UnicodeDataFull.json from UnicodeData.txt and NamesList.txt.

    update_uni / update_names: when true, first (re)download the matching
    source file from unicode.org; otherwise the local copies are used.

    The output file has one JSON array per line, sorted by codepoint:
    [ordinal, ordinal_end_or_null, category, combining_bool, [names...]].
    """
    UnicodeData = 'UnicodeData.txt'
    NamesList = 'NamesList.txt'
    if update_uni:
        UnicodeData, _ = urlretrieve('http://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt', UnicodeData)
    if update_names:
        NamesList, _ = urlretrieve('http://www.unicode.org/Public/UCD/latest/ucd/NamesList.txt', NamesList)
    database = dict()
    ranges = dict()  # range name -> [First Character, Last Character]
    # UnicodeData.txt is described at: http://www.unicode.org/L2/L1999/UnicodeData.html
    # And with more recent additions at: https://www.unicode.org/reports/tr44/#UCD_Files
    with open(UnicodeData, 'r') as fh:
        for line in fh:
            row = line.rstrip().split(';')
            (code, name, category, combining,
             bidirectional, decomposition, decimal,
             digit, numeric, mirrored, old_name,
             iso_comment, upper, lower, title) = row
            ordinal = int(code, 16)
            combining = int(combining)
            names = set()
            if old_name:
                names.add(old_name)
            firstlast = None
            if name == '<control>':
                # Controls carry no usable name here; fall back to the old
                # (Unicode 1.0) name, then custom_names below.
                name = old_name
            elif name[0] == '<' and name[-1] == '>':
                # "<Some Block, First>" / "<Some Block, Last>" range markers.
                name = name[1:-1]
                name, firstlast = name.split(', ', 1)
            else:
                names.add(name)
            if ordinal in custom_names:
                name = custom_names[ordinal]
            char = Character(
                ordinal=ordinal,
                ordinal_end=None,
                category=category,
                name=name,
                names=names,
                in_range=False,
                combining=combining,
            )
            if firstlast and name not in ranges:
                ranges[name] = [None, None]
            if firstlast == 'First':
                ranges[name][0] = char
            elif firstlast == 'Last':
                ranges[name][1] = char
            else:
                database[char.ordinal] = char
    # Collapse each First/Last marker pair into a single range entry.
    # TODO: What's something nicer than this?
    for range_name, (start, end) in ranges.items():
        name = f'({range_name} [U+{uni(start.ordinal)}..U+{uni(end.ordinal)}])'
        start.name = name
        end.name = name
        assert start.category == end.category
        assert start.combining == end.combining
        char = Character(
            ordinal=start.ordinal,
            ordinal_end=end.ordinal,
            category=start.category,
            name=name,
            names={name},
            in_range=range_name,
            combining=start.combining,
        )
        database[char.ordinal] = char
    # NamesList.txt is described at: https://www.unicode.org/Public/UCD/latest/ucd/NamesList.html
    # But I sort-of guessed and hoped for the best.
    with open(NamesList, 'r') as fh:
        char = None
        for line in fh:
            line = line.rstrip()
            if not line:
                # Blank lines used to hit line[0] below and raise IndexError.
                continue
            if line[0] in ';@':
                continue  # comments and block/subheader lines
            if line[0] == '\t':
                # "\t= alias" adds an alias to the current character; other
                # tab-indented lines (cross references etc.) are ignored.
                if line[1] == '=':
                    char.names.add(line[3:])
            else:
                char, name = line.split('\t', 1)
                ordinal = int(char, 16)
                if ordinal in database:
                    char = database[ordinal]
                    char.names.add(name)
                else:
                    # NamesList knows a codepoint UnicodeData didn't list.
                    char = Character(
                        ordinal=ordinal,
                        ordinal_end=None,
                        category='Cn',
                        name=name,
                        names={name},
                        in_range=False,
                        combining=0,
                    )
                    database[ordinal] = char
    # Drop noise entries from the alias sets.
    for char in database.values():
        char.names -= {'<control>'}
        char.names -= { n for n in char.names if n.endswith(' (1.0)') }
    # TODO: Add Nami.txt short names in parens.
    with open('UnicodeDataFull.json', 'w') as fh:
        for o in sorted(database.keys()):
            char = database[o]
            # Marks (M*) get flagged so the reader can prepend a dotted
            # circle when displaying them — see unicode.py's get_output().
            combining = char.category[0] == 'M'
            names = [char.name] + list(sorted(char.names - {char.name}))
            row = [char.ordinal, char.ordinal_end, char.category, combining, names]
            fh.write(json.dumps(row) + '\n')
if __name__ == '__main__':
    import sys
    update_uni = False
    update_names = False
    def usage(status):
        # Single home for the help text.  The old version printed a bogus
        # '[-n]' synopsis and advertised -h/--help without accepting it
        # (help requests exited 1 like errors).
        print('Usage:', sys.argv[0], '[-u] [-l]')
        print(' ', '-u, --unicode-data: Download UnicodeData.txt.')
        print(' ', '-l, --names-list: Download NamesList.txt.')
        print(' ', '-h, --help: Show this help.')
        exit(status)
    for arg in sys.argv[1:]:
        if arg in ('-u', '--unicode-data'):
            update_uni = True
        elif arg in ('-l', '--names-list'):
            update_names = True
        elif arg in ('-h', '--help'):
            usage(0)  # asking for help is not an error
        else:
            usage(1)
    main(update_uni=update_uni, update_names=update_names)