337 lines
11 KiB
Python
337 lines
11 KiB
Python
#!/usr/bin/env python3
|
|
|
|
from subprocess import Popen, PIPE
|
|
from os import chdir, environ
|
|
from pathlib import Path
|
|
import re
|
|
|
|
# Make sure we're in the correct directory, for module imports and such too.
# The `basedir` environment variable overrides the default (the current
# working directory); relative paths below resolve against it.
basedir = Path(environ.get('basedir', '.'))
chdir(basedir)
|
|
|
|
def cmd(args):
    """Run *args* as a subprocess and yield its stdout lines as they arrive.

    Each yielded line is decoded as UTF-8 (undecodable bytes dropped) with
    the trailing newline removed.  The generator ends at EOF of the child's
    stdout, at which point the child is reaped.
    """
    proc = Popen(args, stdout=PIPE)
    # readline() returns b'' only at EOF, so it is a clean sentinel.
    for line in iter(proc.stdout.readline, b''):
        # rstrip only the newline: the original line[:-1] silently chopped
        # the last character of a final line that lacked a trailing '\n'.
        # errors='ignore' cannot raise, so no try/except is needed here.
        yield str(line.rstrip(b'\n'), 'utf-8', 'ignore')
    # Reap the child on normal EOF so it does not linger as a zombie.
    proc.stdout.close()
    proc.wait()
|
|
|
|
def irc(chan):
    """Follow an ii-style IRC log for *chan* and answer character queries.

    Tails the channel's ``out`` log file forever, watches for trigger lines
    (``!char ...`` or ``happybot: unicode ...`` style), and writes up to
    four reply lines into the channel's ``in`` file.  Never returns.
    """
    server = environ.get('serv', 'irc.libera.chat')
    fdir = '/home/zgrep/offtopiabday/' + server + '/' + chan
    fin = fdir + '/in'
    fout = fdir + '/out'

    # tail -n 0 -f: only react to lines appended after startup.
    for line in cmd(['tail', '-n', '0', '-f', fout]):
        # Log format: "<date> <time> <nick> <message>".  Skip malformed or
        # short lines instead of crashing the bot (the old code raised
        # ValueError on short lines and IndexError on an empty nick field).
        try:
            date, time, nick, line = line.split(' ', 3)
        except ValueError:
            continue
        if not (nick.startswith('<') and nick.endswith('>')):
            continue  # server notices / joins / parts, not chat messages
        nick = nick[1:-1]
        m = re.match(r'(?i)^(?:@?(?:happy|hate)bot[:,] (?:unicode|char)|!char) ((?:-[8qvd]+ )+)?(.+)$', line)
        if not m:
            continue
        flags, query = m.groups()
        result = doit(flags or '', query).split('\n')
        # Cap at 4 lines max, and prefix each reply with a zero-width space
        # so the echoed nick cannot re-trigger other bots.
        result = [f'\u200b{nick}: ' + part for part in result[:4]]
        with open(fin, 'w') as fh:
            fh.write('\n'.join(result) + '\n')
|
|
|
|
from collections import defaultdict
|
|
from math import ceil, log
|
|
import json
|
|
|
|
# Copied from: https://www.unicode.org/reports/tr44/#GC_Values_Table
# Tab-separated rows: "<abbrev>\t<Long_Name>\t<description>".  Grouped
# (one- and two-letter union) rows are kept in the raw table but filtered
# out below.
categories = '''
Lu	Uppercase_Letter	an uppercase letter
Ll	Lowercase_Letter	a lowercase letter
Lt	Titlecase_Letter	a digraphic character, with first part uppercase
LC	Cased_Letter	Lu | Ll | Lt
Lm	Modifier_Letter	a modifier letter
Lo	Other_Letter	other letters, including syllables and ideographs
L	Letter	Lu | Ll | Lt | Lm | Lo
Mn	Nonspacing_Mark	a nonspacing combining mark (zero advance width)
Mc	Spacing_Mark	a spacing combining mark (positive advance width)
Me	Enclosing_Mark	an enclosing combining mark
M	Mark	Mn | Mc | Me
Nd	Decimal_Number	a decimal digit
Nl	Letter_Number	a letterlike numeric character
No	Other_Number	a numeric character of other type
N	Number	Nd | Nl | No
Pc	Connector_Punctuation	a connecting punctuation mark, like a tie
Pd	Dash_Punctuation	a dash or hyphen punctuation mark
Ps	Open_Punctuation	an opening punctuation mark (of a pair)
Pe	Close_Punctuation	a closing punctuation mark (of a pair)
Pi	Initial_Punctuation	an initial quotation mark
Pf	Final_Punctuation	a final quotation mark
Po	Other_Punctuation	a punctuation mark of other type
P	Punctuation	Pc | Pd | Ps | Pe | Pi | Pf | Po
Sm	Math_Symbol	a symbol of mathematical use
Sc	Currency_Symbol	a currency sign
Sk	Modifier_Symbol	a non-letterlike modifier symbol
So	Other_Symbol	a symbol of other type
S	Symbol	Sm | Sc | Sk | So
Zs	Space_Separator	a space character (of various non-zero widths)
Zl	Line_Separator	U+2028 LINE SEPARATOR only
Zp	Paragraph_Separator	U+2029 PARAGRAPH SEPARATOR only
Z	Separator	Zs | Zl | Zp
Cc	Control	a C0 or C1 control code
Cf	Format	a format control character
Cs	Surrogate	a surrogate code point
Co	Private_Use	a private-use character
Cn	Unassigned	a reserved unassigned code point or a noncharacter
C	Other	Cc | Cf | Cs | Co | Cn
'''.strip().split('\n')
# Split each row into (abbrev, long name, description).
categories = [ row.split('\t', 2) for row in categories ]
# Keep only the concrete two-letter categories, mapping e.g. 'Lu' ->
# 'Uppercase Letter'; the union rows ('L', 'LC', ...) are dropped.
categories = { left: right.replace('_', ' ') for left, right, ignore in categories if len(left) == 2 }
|
|
|
|
def utf8uni(ordinal):
    """Return the UTF-8 encoding of a code point as a '0x…' hex string."""
    # bytes.hex() always emits two digits per byte, so no manual padding
    # of an odd-length hex string is needed.
    return '0x' + chr(ordinal).encode('utf-8').hex()
|
|
|
|
def uniuni(ordinal):
    """Return a code point in 'U+XXXX' notation (at least four hex digits)."""
    return f'U+{ordinal:04X}'
|
|
|
|
# Line-delimited JSON database; each line is a row of the form
# [start, end_or_falsy, category, combining_flag, [names...]],
# sorted by code point, where a truthy second field marks a range row.
UnicodeDataFull = 'UnicodeDataFull.json'

# Recognizes U+XXXX / uXXXX / 0x… / 0b… tokens inside a query; exactly one
# capture group is non-None per match, identifying which form matched.
tokens = re.compile(r'\s*\b(?:U\+([0-9A-Fa-f]+)|[Uu]([0-9A-F]{4,6})|0x([0-9a-f]+)|0b([01]+))\b\s*')

# Placeholder rows (database-row shape minus the leading code point) used
# for code points that could not be resolved against the database.
invalid = [None, "Cn", False, ["<invalid>"]]

unknown = [None, "Cn", False, ["<unknown>"]]
|
|
|
|
def doit(flags, query):
    """Resolve *query* into Unicode character information, one line per hit.

    flags may contain any of:
        q  quiet   — output only chars (or only codes when decoding)
        v  verbose — long category names and all name aliases
        d  decode  — treat the query as literal characters
        8  show UTF-8 byte sequences instead of U+XXXX notation
    Returns a single string; lines are '\n'-joined, and the quiet form is
    binary-searched down to fit in an IRC message (~450 bytes).
    """
    quiet = 'q' in flags
    verbose = 'v' in flags
    decode = 'd' in flags
    utf8 = '8' in flags

    # Code-point formatter: UTF-8 bytes ('0x…') or 'U+XXXX'.
    if utf8:
        unif = utf8uni
    else:
        unif = uniuni

    # cache: code point (negated for range matches) -> database row.
    cache = dict()

    if len(query) <= 2:
        # A 1-2 character query is almost certainly a literal character
        # (possibly needing two UTF-16 units), not a name/number search.
        decode = True

    if decode:
        search = list(map(ord, query))

    else:
        # Tokenize the query into code-point numbers (U+/0x/0b tokens) and
        # free-text name fragments.  `merge` glues text back onto a failed
        # numeric token so e.g. "0xzz" stays one search string.
        index, merge = 0, False
        search = []
        for match in tokens.finditer(query):
            missed = query[index:match.start()]
            if missed:
                if merge:
                    search[-1] += missed
                else:
                    search.append(missed)
            index = match.end()
            merge = False
            uni1, uni2, hexa, bina = match.groups()
            uni = uni1 or uni2
            if uni:
                search.append(int(uni, 16))
            elif hexa:
                try:
                    # Interpret the hex digits as a big-endian UTF-8 byte
                    # sequence.  NOTE: this used errors='error', which is
                    # not a real codec error handler — it only "worked" by
                    # raising LookupError into a bare except.  'strict' is
                    # the intended behavior.
                    byt = int(hexa, 16).to_bytes(ceil(len(hexa)/2), 'big').decode('utf-8', 'strict')
                    search.extend(map(ord, byt))
                except UnicodeDecodeError:
                    # Not valid UTF-8: treat the token as literal search
                    # text.  The `search and` guard fixes an IndexError
                    # when such a token starts the query (search empty).
                    if search and isinstance(search[-1], str):
                        search[-1] += '0x' + hexa
                    else:
                        search.append('0x' + hexa)
                    merge = True
            elif bina:
                try:
                    byt = int(bina, 2).to_bytes(ceil(len(bina)/8), 'big').decode('utf-8', 'strict')
                    search.extend(map(ord, byt))
                except UnicodeDecodeError:
                    if search and isinstance(search[-1], str):
                        search[-1] += '0b' + bina
                    else:
                        search.append('0b' + bina)
                    merge = True
        missed = query[index:]
        if missed:
            if merge:
                search[-1] += missed
            else:
                search.append(missed)

    # results[i] collects the code points matched by search item i.
    results = [[] for _ in range(len(search))]
    numbers = defaultdict(list)   # code point -> indices into `search`
    strings = defaultdict(list)   # lowercased fragment -> indices
    for i, elem in enumerate(search):
        if isinstance(elem, int):
            numbers[elem].append(i)
        elif isinstance(elem, str):
            strings[elem.lower()].append(i)
    # Descending, so the next code point needed is always numbers[-1].
    numbers = list(sorted(numbers.items(), reverse=True))

    # The actual searching: a single sequential pass over the database.
    filled = set()
    with open(UnicodeDataFull, 'r') as fh:
        for line in fh:
            row = json.loads(line)
            if numbers:
                if row[0] == numbers[-1][0]:
                    cache[row[0]] = row
                    for index in numbers[-1][1]:
                        filled.add(index)
                        results[index].append(row[0])
                    numbers.pop()
                elif row[1]:
                    # Range row: it may cover several pending numbers.
                    while numbers and row[0] <= numbers[-1][0] <= row[1]:
                        cache[numbers[-1][0]] = row
                        for index in numbers[-1][1]:
                            filled.add(index)
                            results[index].append(numbers[-1][0])
                        numbers.pop()
            elif not strings:
                break  # nothing left to look for; stop reading the file
            for string, indices in strings.items():
                if any(string in name.lower() for name in row[4]):
                    num = row[0]
                    if row[1]:
                        # Negative cache key marks a whole-range match.
                        num = -num
                    cache[num] = row
                    for index in indices:
                        filled.add(index)
                        results[index].append(num)

    # Anything unmatched: invalid numeric inputs get a placeholder row;
    # strings that matched no name fall back to per-character decoding.
    missing = set(range(len(search))) - filled
    numbers = defaultdict(list)
    indices = set()
    for i in missing:
        elem = search[i]
        if isinstance(elem, int):
            cache[elem] = [elem] + invalid
            results[i].append(elem)
        elif isinstance(elem, str):
            results[i] = [None] * len(elem)
            for j, c in enumerate(elem):
                numbers[ord(c)].append((i, j))
                indices.add((i, j))
    numbers = list(sorted(numbers.items(), reverse=True))

    if indices:
        # Decoding what we have left, just some numbers: second pass.
        with open(UnicodeDataFull, 'r') as fh:
            for line in fh:
                row = json.loads(line)
                if numbers:
                    if row[0] == numbers[-1][0]:
                        cache[row[0]] = row
                        for i, j in numbers[-1][1]:
                            indices.remove((i, j))
                            results[i][j] = row[0]
                        numbers.pop()
                    elif row[1]:
                        while numbers and row[0] <= numbers[-1][0] <= row[1]:
                            cache[numbers[-1][0]] = row
                            for i, j in numbers[-1][1]:
                                indices.remove((i, j))
                                results[i][j] = numbers[-1][0]
                            numbers.pop()
                else:
                    break

    # Whatever is still unresolved gets an explicit <unknown> placeholder.
    for i, j in indices:
        num = ord(search[i][j])
        cache[num] = [num] + unknown
        results[i][j] = num

    if len(search) == 1:
        # This means we've fallen back on decoding our single input as a string.
        # Setting this lets us display output differently, hopefully more usefully.
        decode = True

    results = [r for inner in results for r in inner]

    # Output format: quiet = bare chars/codes joined by spaces; verbose =
    # long category and every name alias; default = one name per line.
    if quiet and not verbose:
        if decode:
            fmt = '{code}'
        else:
            fmt = '{char}'
        join = ' '
    elif verbose and not quiet:
        fmt = '{code} [{long_category}] {names}: {char}'
        join = '\n'
    else:
        fmt = '{code} [{category}] {name}: {char}'
        join = '\n'

    def get_output(results):
        # Render each cached row through `fmt`.
        output = []
        for num in results:
            is_range = num < 0  # negative keys are whole-range matches
            range_start, range_end, category, compose, names = cache[num]
            if is_range:
                if compose:
                    # Show combining characters on U+25CC dotted circles.
                    char = '\u25cc' + chr(range_start) + '..\u25cc' + chr(range_end)
                else:
                    char = chr(range_start) + '..' + chr(range_end)
                code = unif(range_start) + '..' + unif(range_end)
            else:
                char = chr(num)
                if compose:
                    char = '\u25cc' + char
                code = unif(num)
            output.append(fmt.format(
                code=code,
                name=names[0],
                names=', '.join(names),
                char=char,
                category=category,
                long_category=categories[category],
            ))
        return join.join(output)

    if quiet:
        output = get_output(results)
        output8 = output.encode('utf-8')
        if len(output8) > 470:
            # Binary-search for the largest prefix of results whose quiet
            # rendering (plus ' ...') stays under 450 bytes.
            cut = len(results) // 2
            clen = cut
            tried_okay = set()
            for i in range(ceil(log(len(results), 2)) + 1):
                output8 = (get_output(results[:cut]) + ' ...').encode('utf-8')
                clen //= 2
                if len(output8) < 450:
                    tried_okay.add(cut)
                    cut += clen
                else:
                    cut -= clen
            # default=1 guards the pathological case where no tried prefix
            # fit (a single oversized entry); the old code raised ValueError.
            output = get_output(results[:max(tried_okay, default=1)]) + ' ...'
        return output
    else:
        return get_output(results)
|
|
|
|
if __name__ == '__main__':
    import sys

    args = sys.argv
    argc = len(args)
    if argc == 2:
        # One argument: run as an IRC bot on the given channel.
        irc(args[1])
    elif argc == 3:
        # Two arguments: flags + query, answered once on stdout.
        print(doit(args[1], args[2]))
    else:
        print('Usage:', args[0], '#channel')
        print(' or:', args[0], '[qvd]*', 'query')
        exit(1)
|