happybot/happybot/unicode/unicode.py

#!/usr/bin/env python3

from subprocess import Popen, PIPE
from os import chdir, environ
from pathlib import Path
import re

# Make sure we're in the correct directory, for module imports and such too.
# The directory is taken from the `basedir` environment variable and defaults
# to the current directory.  Note that this runs as soon as the module is
# loaded, so relative paths used later in this file are resolved against it.
basedir = Path(environ.get('basedir', '.'))
chdir(basedir)

def cmd(args):
    """Run *args* as a subprocess and yield its stdout one line at a time.

    Each line is yielded as text without its trailing newline; bytes that are
    not valid UTF-8 are dropped.  Iteration ends when the child closes its
    stdout.  When the generator is exhausted or closed early, the pipe is
    closed and a child that is still running is terminated and reaped.
    """
    proc = Popen(args, stdout=PIPE)
    try:
        for raw in proc.stdout:
            # Only strip a real newline: the final line may lack one, and
            # blindly dropping the last byte would truncate it.
            if raw.endswith(b'\n'):
                raw = raw[:-1]
            # No try/except around the yield: a bare except here would
            # swallow GeneratorExit and break generator.close().
            yield raw.decode('utf-8', 'ignore')
    finally:
        proc.stdout.close()
        if proc.poll() is None:
            proc.terminate()
        proc.wait()

def irc(chan):
    """Answer Unicode lookup requests seen in the log of channel *chan*.

    Follows the channel's ``out`` file (under the directory for the server
    named by the ``serv`` environment variable, default ``irc.libera.chat``)
    and, for every message that matches the trigger pattern, writes at most
    four reply lines to the channel's ``in`` file.  Runs until the underlying
    ``tail -f`` ends.
    """
    server = environ.get('serv', 'irc.libera.chat')
    fdir = '/home/zgrep/offtopiabday/' + server + '/' + chan
    fin = fdir + '/in'
    fout = fdir + '/out'

    # Group 1: optional run of "-flags " options; group 2: the query itself.
    pattern = re.compile(r'(?i)^(?:@?(?:happy|hate)bot[:,] (?:unicode|char)|!char) ((?:-[8qvd]+ )+)?(.+)$')

    for entry in cmd(['tail', '-n', '0', '-f', fout]):
        # Expected shape: "<date> <time> <nick> <message>".  Anything shorter
        # is skipped instead of crashing the bot on the tuple unpacking.
        fields = entry.split(' ', 3)
        if len(fields) != 4:
            continue
        _date, _time, nick, message = fields
        # Only lines whose third field looks like "<nick>" are handled.
        if not (nick.startswith('<') and nick.endswith('>')):
            continue
        nick = nick[1:-1]

        m = pattern.match(message)
        if not m:
            continue
        flags, query = m.groups()
        reply = doit(flags or '', query).split('\n')
        # Capping at 4 lines max, each addressed to the requester.
        reply = [f'\u200b{nick}: ' + text for text in reply[:4]]
        with open(fin, 'w') as fh:
            fh.write('\n'.join(reply) + '\n')

from collections import defaultdict
from math import ceil, log
from random import choice
import json

# General_Category values, copied from:
# https://www.unicode.org/reports/tr44/#GC_Values_Table
# One row per value: abbreviation <TAB> long name <TAB> description.
categories = '''
Lu	Uppercase_Letter	an uppercase letter
Ll	Lowercase_Letter	a lowercase letter
Lt	Titlecase_Letter	a digraphic character, with first part uppercase
LC	Cased_Letter	Lu | Ll | Lt
Lm	Modifier_Letter	a modifier letter
Lo	Other_Letter	other letters, including syllables and ideographs
L	Letter	Lu | Ll | Lt | Lm | Lo
Mn	Nonspacing_Mark	a nonspacing combining mark (zero advance width)
Mc	Spacing_Mark	a spacing combining mark (positive advance width)
Me	Enclosing_Mark	an enclosing combining mark
M	Mark	Mn | Mc | Me
Nd	Decimal_Number	a decimal digit
Nl	Letter_Number	a letterlike numeric character
No	Other_Number	a numeric character of other type
N	Number	Nd | Nl | No
Pc	Connector_Punctuation	a connecting punctuation mark, like a tie
Pd	Dash_Punctuation	a dash or hyphen punctuation mark
Ps	Open_Punctuation	an opening punctuation mark (of a pair)
Pe	Close_Punctuation	a closing punctuation mark (of a pair)
Pi	Initial_Punctuation	an initial quotation mark
Pf	Final_Punctuation	a final quotation mark
Po	Other_Punctuation	a punctuation mark of other type
P	Punctuation	Pc | Pd | Ps | Pe | Pi | Pf | Po
Sm	Math_Symbol	a symbol of mathematical use
Sc	Currency_Symbol	a currency sign
Sk	Modifier_Symbol	a non-letterlike modifier symbol
So	Other_Symbol	a symbol of other type
S	Symbol	Sm | Sc | Sk | So
Zs	Space_Separator	a space character (of various non-zero widths)
Zl	Line_Separator	U+2028 LINE SEPARATOR only
Zp	Paragraph_Separator	U+2029 PARAGRAPH SEPARATOR only
Z	Separator	Zs | Zl | Zp
Cc	Control	a C0 or C1 control code
Cf	Format	a format control character
Cs	Surrogate	a surrogate code point
Co	Private_Use	a private-use character
Cn	Unassigned	a reserved unassigned code point or a noncharacter
C	Other	Cc | Cf | Cs | Co | Cn
'''
# Map each two-letter abbreviation to (long name with spaces, description);
# rows with a one-letter abbreviation are left out.
categories = {
    abbr: (long_name.replace('_', ' '), description)
    for abbr, long_name, description in (
        row.split('\t', 2) for row in categories.strip().split('\n')
    )
    if len(abbr) == 2
}

def utf8uni(ordinal):
    """Format code point *ordinal* as the hex of its UTF-8 encoding.

    The result is ``0x`` followed by two lowercase hex digits per encoded
    byte, e.g. ``0x41`` for U+0041 and ``0xe282ac`` for U+20AC.
    """
    encoded = chr(ordinal).encode('utf-8')
    return '0x' + encoded.hex()

def uniuni(ordinal):
    """Format code point *ordinal* in ``U+XXXX`` notation.

    The number is written in uppercase hex, zero-padded to at least four
    digits (``U+0041``, ``U+1F600``).
    """
    return f'U+{ordinal:04X}'

# Path of the Unicode data file, relative to the working directory.  doit()
# reads it line by line and parses every line as one JSON row.
UnicodeDataFull = 'UnicodeDataFull.json'

# Finds explicit code point / byte literals in a query, including any
# whitespace around them.  Exactly one group is set per match:
#   group 1: U+ followed by hex digits (either case)
#   group 2: U or u followed by 4-6 uppercase hex digits
#   group 3: 0x followed by lowercase hex digits
#   group 4: 0b followed by binary digits
tokens = re.compile(r'\s*\b(?:U\+([0-9A-Fa-f]+)|[Uu]([0-9A-F]{4,6})|0x([0-9a-f]+)|0b([01]+))\b\s*')
# Placeholder row tails (range end, category, compose flag, names).  doit()
# prepends the code point to build a stand-in row: `invalid` for a requested
# number that no data row matched, `unknown` for a character of free text
# that no data row matched.
invalid = [None, "Cn", False, ["<invalid>"]]
unknown = [None, "Cn", False, ["<unknown>"]]

def _utf8_literal_code_points(digits, base):
    """Decode a ``0x``/``0b`` literal as UTF-8 bytes.

    Returns the list of decoded code points, or None when the bytes are not
    valid UTF-8.
    """
    bits = len(digits) * (4 if base == 16 else 1)
    raw = int(digits, base).to_bytes((bits + 7) // 8, 'big')
    try:
        return [ord(char) for char in raw.decode('utf-8')]
    except UnicodeDecodeError:
        return None


def _parse_query(query):
    """Split *query* into code points (ints) and free-text terms (strs)."""
    search = []
    index = 0
    # True right after an undecodable literal was stored as text, so that the
    # text following it is glued onto the same search term.
    merge = False

    def add_text(text):
        if not text:
            return
        if merge:
            search[-1] += text
        else:
            search.append(text)

    for match in tokens.finditer(query):
        add_text(query[index:match.start()])
        index = match.end()
        merge = False
        uni1, uni2, hexa, bina = match.groups()
        uni = uni1 or uni2
        if uni:
            search.append(int(uni, 16))
            continue
        prefix, digits, base = ('0x', hexa, 16) if hexa else ('0b', bina, 2)
        code_points = _utf8_literal_code_points(digits, base)
        if code_points is not None:
            search.extend(code_points)
            continue
        # Not valid UTF-8: treat the literal as ordinary search text.  The
        # list may still be empty here when the literal opens the query.
        literal = prefix + digits
        if search and isinstance(search[-1], str):
            search[-1] += literal
        else:
            search.append(literal)
        merge = True
    add_text(query[index:])
    return search


def _take_matches(numbers, row):
    """Pop and yield the pending ``(code point, positions)`` pairs in *row*.

    *numbers* is sorted in descending order, so the smallest pending code
    point is last.  A row covers the single code point ``row[0]``, or the
    inclusive range ``row[0]..row[1]`` when ``row[1]`` is set.
    """
    first = row[0]
    last = row[1] or first
    while numbers and first <= numbers[-1][0] <= last:
        yield numbers.pop()


def doit(flags, query):
    """Look up *query* in the Unicode data file and format the matches.

    *flags* is a string of option letters; other characters are ignored:

    * ``q`` / ``v`` -- quieter / more verbose output.  They offset each
      other; using both in equal numbers selects a compact one-line-per-match
      format, and two or more extra ``q`` produce a joke reply instead.
    * ``d`` -- decode: describe every character of *query* (implied when the
      query has at most two characters).
    * ``8`` -- show codes as UTF-8 bytes (``0x...``) instead of ``U+XXXX``.

    Without ``d`` the query is split into ``U+XXXX`` / ``uXXXX`` code points,
    ``0x`` / ``0b`` UTF-8 byte literals and free text.  Free text is matched
    case-insensitively against character names; text that matches nothing is
    described character by character instead.

    Returns the formatted result as one string.  In the quiet modes a result
    longer than 470 UTF-8 bytes is cut down to a prefix that, together with
    the `` ...`` suffix, stays under 450 bytes.
    """
    verbosity = flags.count('v') - flags.count('q')
    if verbosity < -1:
        return choice(['Ssssshh.', '[silence]', 'Complete quie-- oh, darn, I spoke.', '[the sound of nothing]', '[complete silence]'])

    # Equal, non-zero numbers of q and v switch on both modes at once.
    mixed = 'q' in flags and 'v' in flags
    quiet = verbosity < 0 or (verbosity == 0 and mixed)
    verbose = verbosity > 0 or (verbosity == 0 and mixed)

    decode = 'd' in flags or len(query) <= 2
    unif = utf8uni if '8' in flags else uniuni

    if decode:
        search = [ord(char) for char in query]
    else:
        search = _parse_query(query)

    # cache maps a result key to its data row.  Keys are code points, or the
    # negated range start for a range row that was matched by name.
    cache = {}
    results = [[] for _ in search]
    whymatched = defaultdict(set)

    numbers = defaultdict(list)
    strings = defaultdict(set)
    for i, elem in enumerate(search):
        if isinstance(elem, int):
            numbers[elem].append(i)
        elif isinstance(elem, str):
            strings[elem.lower()].add(i)
    numbers = sorted(numbers.items(), reverse=True)

    # The actual searching.
    filled = set()
    with open(UnicodeDataFull, 'r', encoding='utf-8') as fh:
        for line in fh:
            row = json.loads(line)
            if numbers:
                # Handles single rows and ranges alike, so a code point equal
                # to a range start no longer hides the rest of that range.
                for code, positions in _take_matches(numbers, row):
                    cache[code] = row
                    for index in positions:
                        filled.add(index)
                        results[index].append(code)
            elif not strings:
                break
            if not strings:
                continue
            lowered = [name.lower() for name in row[4]]
            for string, positions in strings.items():
                for i, name in enumerate(lowered):
                    if string not in name:
                        continue
                    num = -row[0] if row[1] else row[0]
                    cache[num] = row
                    # Prioritize shown name matches first for string matches.
                    # Also save _why_ it matched.
                    if i > 0:
                        whymatched[num].add(row[4][i])
                    bucket = 1 if i > 0 else 0
                    for index in positions:
                        filled.add(index)
                        if not results[index]:
                            results[index] = [[], []]
                        results[index][bucket].append(num)
                    break

    # Merge priority and non-priority matches.
    for positions in strings.values():
        for index in positions & filled:
            primary, secondary = results[index]
            results[index] = primary + secondary

    # Whatever found nothing: numbers become <invalid>, text is decoded
    # character by character in a second pass over the data file.
    numbers = defaultdict(list)
    pending = set()
    for i in set(range(len(search))) - filled:
        elem = search[i]
        if isinstance(elem, int):
            cache[elem] = [elem] + invalid
            results[i].append(elem)
        elif isinstance(elem, str):
            results[i] = [None] * len(elem)
            for j, char in enumerate(elem):
                numbers[ord(char)].append((i, j))
                pending.add((i, j))
    numbers = sorted(numbers.items(), reverse=True)

    if pending:
        # Decoding what we have left, just some numbers.
        with open(UnicodeDataFull, 'r', encoding='utf-8') as fh:
            for line in fh:
                if not numbers:
                    break
                row = json.loads(line)
                for code, positions in _take_matches(numbers, row):
                    cache[code] = row
                    for i, j in positions:
                        pending.remove((i, j))
                        results[i][j] = code

        for i, j in pending:
            num = ord(search[i][j])
            cache[num] = [num] + unknown
            results[i][j] = num

        if len(search) == 1:
            # This means we've fallen back on decoding our single input as a string.
            # Setting this lets us display output differently, hopefully more usefully.
            decode = True

    results = [r for inner in results for r in inner]

    if quiet and not verbose:
        fmt = '{code}' if decode else '{char}'
        join = ' '
    elif verbose and not quiet:
        if verbosity > 1:
            fmt = '{code} [{long_category}, {extra_category}] {names}: {char}'
        else:
            fmt = '{code} [{long_category}] {names}: {char}'
        join = '\n'
    elif quiet and verbose:
        fmt = '{code} [{category}] {name}: {char}'
        join = '\n'
    else:
        fmt = '{code} [{category}] {namee}: {char}'
        join = '\n'

    def show_char(code_point):
        # Lone surrogates and values beyond U+10FFFF cannot be written out as
        # text; show U+FFFD REPLACEMENT CHARACTER instead of crashing.
        if 0xD800 <= code_point <= 0xDFFF or code_point > 0x10FFFF:
            return '\ufffd'
        return chr(code_point)

    def show_code(code_point):
        try:
            return unif(code_point)
        except (ValueError, OverflowError):
            # No UTF-8 form exists (surrogate or out of range).
            return uniuni(code_point)

    def get_output(results):
        output = []
        for num in results:
            range_start, range_end, category, compose, names = cache[num]
            # U+25CC DOTTED CIRCLE gives a composing character a base.
            mark = '\u25cc' if compose else ''
            if num < 0:
                # Negative keys mark whole ranges matched by name.
                char = mark + show_char(range_start) + '..' + mark + show_char(range_end)
                code = show_code(range_start) + '..' + show_code(range_end)
            else:
                char = mark + show_char(num)
                code = show_code(num)

            extra = ''
            if num in whymatched:
                extra = ' (' + ', '.join(whymatched[num]) + ')'

            output.append(fmt.format(
                code=code,
                name=names[0],
                namee=names[0] + extra,
                names=', '.join(names),
                char=char,
                category=category,
                long_category=categories[category][0],
                extra_category=categories[category][1],
            ))
        return join.join(output)

    output = get_output(results)
    if not quiet or len(output.encode('utf-8')) <= 470:
        return output

    def fits(count):
        shortened = get_output(results[:count]) + ' ...'
        return len(shortened.encode('utf-8')) < 450

    # Binary search for the longest prefix that fits.  The empty prefix
    # always fits and the full list is already known not to.
    low, high = 0, len(results) - 1
    while low < high:
        middle = (low + high + 1) // 2
        if fits(middle):
            low = middle
        else:
            high = middle - 1
    return get_output(results[:low]) + ' ...'

if __name__ == '__main__':
    # One argument: serve that IRC channel.  Two arguments: run a single
    # lookup with the given flags and query, and print the result.
    import sys

    argv = sys.argv
    if len(argv) == 2:
        irc(argv[1])
    elif len(argv) == 3:
        print(doit(argv[1], argv[2]))
    else:
        print('Usage:', argv[0], '#channel')
        print('   or:', argv[0], '[8qvd]*', 'query')
        # sys.exit rather than the site-provided exit(), which is meant for
        # interactive use and is absent when the site module is not loaded.
        sys.exit(1)