happybot/happybot/unicode/unicode.py

387 lines
13 KiB
Python

#!/usr/bin/env python3
from subprocess import Popen, PIPE
from os import chdir, environ
from pathlib import Path
import re
# Run from the configured base directory (the `basedir` environment variable,
# falling back to the current directory) so that relative paths and module
# imports resolve from there.
basedir = Path(environ['basedir']) if 'basedir' in environ else Path('.')
chdir(basedir)
def cmd(args):
    """Run *args* as a subprocess and yield its stdout one line at a time.

    Each line is yielded as text, decoded as UTF-8 with undecodable bytes
    dropped, and without its trailing newline.  A final line that is not
    newline-terminated is yielded intact.

    The child is reaped once its output ends.  If the consumer stops
    iterating early (or an error propagates), the child is terminated first so
    a long-running command such as ``tail -f`` is not left behind.
    """
    proc = Popen(args, stdout=PIPE)
    try:
        for raw in proc.stdout:
            # Strip only an actual newline; slicing off the last byte
            # unconditionally would eat a real character at end of output.
            if raw.endswith(b'\n'):
                raw = raw[:-1]
            yield raw.decode('utf-8', 'ignore')
    except BaseException:
        # Includes GeneratorExit: nobody is reading any more, stop the child.
        proc.terminate()
        raise
    finally:
        proc.stdout.close()
        proc.wait()
def irc(chan):
    """Answer character lookup requests on one IRC channel.

    Follows the channel's ``out`` file with ``tail -f`` and, for every message
    that matches the trigger pattern, writes the reply (at most four lines,
    each prefixed with the requesting nick) to the channel's ``in`` file.
    The server name comes from the ``serv`` environment variable.

    Runs for as long as the ``tail`` process keeps producing output.
    """
    server = environ.get('serv', 'irc.libera.chat')
    fdir = '/home/zgrep/offtopiabday/' + server + '/' + chan
    fin = fdir + '/in'
    fout = fdir + '/out'
    # Compiled once rather than on every incoming line.
    request = re.compile(r'(?i)^(?:@?(?:happy|hate)bot[:,] (?:unicode|char)|!char) ((?:-[8qvd]+ )+)?(.+)$')
    for line in cmd(['tail', '-n', '0', '-f', fout]):
        # Expected shape: "<date> <time> <nick-in-angle-brackets> <message>".
        fields = line.split(' ', 3)
        if len(fields) != 4:
            # Blank or truncated line; skip it instead of crashing.
            continue
        nick, message = fields[2], fields[3]
        # Only lines whose third field is wrapped in <...> are handled
        # (this also rejects an empty field).
        if not (nick.startswith('<') and nick.endswith('>')):
            continue
        nick = nick[1:-1]
        m = request.match(message)
        if not m:
            continue
        flags, query = m.groups()
        # capping at 4 lines max
        reply_lines = doit(flags or '', query).split('\n')[:4]
        # The zero-width space is prepended to every reply line.
        reply = '\n'.join(f'\u200b{nick}: ' + text for text in reply_lines)
        # Input is decoded as UTF-8, so write the reply as UTF-8 too.
        with open(fin, 'w', encoding='utf-8') as fh:
            fh.write(reply + '\n')
from collections import defaultdict
from math import ceil, log
from random import choice
import json
# Copied from: https://www.unicode.org/reports/tr44/#GC_Values_Table
# One row per General_Category value: abbreviation, long name, description,
# separated by whitespace (tabs or spaces).
categories = '''
Lu Uppercase_Letter an uppercase letter
Ll Lowercase_Letter a lowercase letter
Lt Titlecase_Letter a digraphic character, with first part uppercase
LC Cased_Letter Lu | Ll | Lt
Lm Modifier_Letter a modifier letter
Lo Other_Letter other letters, including syllables and ideographs
L Letter Lu | Ll | Lt | Lm | Lo
Mn Nonspacing_Mark a nonspacing combining mark (zero advance width)
Mc Spacing_Mark a spacing combining mark (positive advance width)
Me Enclosing_Mark an enclosing combining mark
M Mark Mn | Mc | Me
Nd Decimal_Number a decimal digit
Nl Letter_Number a letterlike numeric character
No Other_Number a numeric character of other type
N Number Nd | Nl | No
Pc Connector_Punctuation a connecting punctuation mark, like a tie
Pd Dash_Punctuation a dash or hyphen punctuation mark
Ps Open_Punctuation an opening punctuation mark (of a pair)
Pe Close_Punctuation a closing punctuation mark (of a pair)
Pi Initial_Punctuation an initial quotation mark
Pf Final_Punctuation a final quotation mark
Po Other_Punctuation a punctuation mark of other type
P Punctuation Pc | Pd | Ps | Pe | Pi | Pf | Po
Sm Math_Symbol a symbol of mathematical use
Sc Currency_Symbol a currency sign
Sk Modifier_Symbol a non-letterlike modifier symbol
So Other_Symbol a symbol of other type
S Symbol Sm | Sc | Sk | So
Zs Space_Separator a space character (of various non-zero widths)
Zl Line_Separator U+2028 LINE SEPARATOR only
Zp Paragraph_Separator U+2029 PARAGRAPH SEPARATOR only
Z Separator Zs | Zl | Zp
Cc Control a C0 or C1 control code
Cf Format a format control character
Cs Surrogate a surrogate code point
Co Private_Use a private-use character
Cn Unassigned a reserved unassigned code point or a noncharacter
C Other Cc | Cf | Cs | Co | Cn
'''.strip().split('\n')
# Split on any run of whitespace rather than on a literal tab: the first two
# columns never contain whitespace, so this parses the table identically
# whether its columns are separated by tabs or by spaces.
categories = [row.split(None, 2) for row in categories]
# Map each two-letter abbreviation to (long name with spaces, description);
# rows with a one-letter abbreviation are left out.
categories = {
    abbr: (long_name.replace('_', ' '), description)
    for abbr, long_name, description in categories
    if len(abbr) == 2
}
def utf8uni(ordinal):
    """Return the UTF-8 encoding of code point *ordinal* as a hex literal.

    The result is '0x' followed by two lowercase hex digits per encoded byte,
    e.g. 0x20AC -> '0xe282ac'.
    """
    encoded = chr(ordinal).encode('utf-8')
    # bytes.hex() always produces two digits per byte, so the digit count is
    # even without any manual padding.
    return '0x' + encoded.hex()
def uniuni(ordinal):
    """Return *ordinal* in 'U+XXXX' notation: uppercase hex, at least 4 digits."""
    hex_digits = hex(ordinal)[2:].upper()
    return 'U+' + hex_digits.rjust(4, '0')
# Data file read by doit(); the relative name is resolved against the current
# working directory.  doit() parses every line of it as its own JSON document.
UnicodeDataFull = 'UnicodeDataFull.json'
# Code point literals recognised inside a query, with surrounding whitespace
# consumed.  One capture group per notation:
#   1. 'U+' followed by hex digits of either case
#   2. 'U' or 'u' followed by 4 to 6 uppercase hex digits
#   3. '0x' followed by lowercase hex digits
#   4. '0b' followed by binary digits
tokens = re.compile(r'\s*\b(?:U\+([0-9A-Fa-f]+)|[Uu]([0-9A-F]{4,6})|0x([0-9a-f]+)|0b([01]+))\b\s*')
# Tails for placeholder rows.  doit() prepends the code point to build a
# five-element row with the same shape it unpacks from the data file
# (start, end, category, combining flag, names).
# `invalid` is used for a requested number that no data row covered;
# `unknown` for a character of the query text that no data row covered.
invalid = [None, "Cn", False, ["<invalid>"]]
unknown = [None, "Cn", False, ["<unknown>"]]
def doit(flags, query):
    """Look up the characters described by *query* and format the answer.

    Parameters:
        flags: string of option letters; any other characters are ignored.
            'v' raises and 'q' lowers verbosity (both may be repeated),
            'd' decodes the query character by character instead of parsing
            it, '8' shows UTF-8 bytes instead of 'U+XXXX' code points.
        query: text to look up.  Unless decoding, code point literals
            (see `tokens`) are looked up by number and the remaining text is
            searched for, case-insensitively, in the character names.  Text
            that matches no name is decoded character by character instead.
            Queries of at most two characters are always decoded.

    Returns:
        The formatted result as one string: one entry per line, or entries
        separated by spaces in purely quiet mode.  Quiet output longer than
        470 UTF-8 bytes is cut down and suffixed with ' ...'.  When 'q'
        outnumbers 'v' by two or more, a random "silence" message is returned
        instead.

    Reads the `UnicodeDataFull` file from the current working directory.
    Each line is parsed as a JSON array and used as
    (start, end, category, combining flag, names), where a truthy `end` marks
    a range row.  NOTE(review): the number lookups consume the requested
    numbers in ascending order, so they presumably rely on the rows being
    sorted by `start` -- confirm against the data file.
    """
    both = 'q' in flags and 'v' in flags
    verbosity = flags.count('v') - flags.count('q')
    if verbosity > 0:
        quiet, verbose = False, True
    elif verbosity < 0:
        quiet, verbose = True, False
    else:
        # Equal counts: "quiet and verbose" if both were given, else neither.
        quiet = verbose = both
    if verbosity < -1:
        return choice(['Ssssshh.', '[silence]', 'Complete quie-- oh, darn, I spoke.', '[the sound of nothing]', '[complete silence]'])

    decode = 'd' in flags or len(query) <= 2
    unif = utf8uni if '8' in flags else uniuni
    cache = {}

    if decode:
        search = list(map(ord, query))
    else:
        # Split the query into ints (code point literals) and strs (name
        # fragments to search for).
        index, merge = 0, False
        search = []
        for match in tokens.finditer(query):
            missed = query[index:match.start()]
            if missed:
                if merge:
                    search[-1] += missed
                else:
                    search.append(missed)
            index = match.end()
            merge = False
            uni1, uni2, hexa, bina = match.groups()
            uni = uni1 or uni2
            if uni:
                search.append(int(uni, 16))
            elif hexa or bina:
                # '0x..' / '0b..' literals are UTF-8 byte sequences.
                if hexa:
                    prefix, digits, base, digits_per_byte = '0x', hexa, 16, 2
                else:
                    prefix, digits, base, digits_per_byte = '0b', bina, 2, 8
                try:
                    raw = int(digits, base).to_bytes(ceil(len(digits) / digits_per_byte), 'big')
                    search.extend(map(ord, raw.decode('utf-8', 'strict')))
                except UnicodeDecodeError:
                    # Not valid UTF-8: treat the literal as ordinary search
                    # text, glued onto neighbouring text.  `search` may still
                    # be empty here, so it must be checked before peeking at
                    # its last element.
                    literal = prefix + digits
                    if search and isinstance(search[-1], str):
                        search[-1] += literal
                    else:
                        search.append(literal)
                    merge = True
        missed = query[index:]
        if missed:
            if merge:
                search[-1] += missed
            else:
                search.append(missed)

    results = [[] for _ in search]
    whymatched = defaultdict(set)
    numbers = defaultdict(list)
    strings = defaultdict(set)
    for i, elem in enumerate(search):
        if isinstance(elem, int):
            numbers[elem].append(i)
        elif isinstance(elem, str):
            strings[elem.lower()].add(i)
    # Descending, so the smallest pending number is always at the end.
    numbers = sorted(numbers.items(), reverse=True)

    # The actual searching.
    filled = set()
    with open(UnicodeDataFull, 'r', encoding='utf-8') as fh:
        for line in fh:
            row = json.loads(line)
            if numbers:
                # A single-code-point row is treated as the range
                # start..start, so one loop serves both kinds of row and a
                # number equal to a range's start no longer hides the other
                # requested numbers inside that range.
                last = row[1] or row[0]
                while numbers and row[0] <= numbers[-1][0] <= last:
                    number, positions = numbers.pop()
                    cache[number] = row
                    for position in positions:
                        filled.add(position)
                        results[position].append(number)
            elif not strings:
                break
            for string, indices in strings.items():
                for i, name in enumerate(row[4]):
                    if string in name.lower():
                        num = row[0]
                        if row[1]:
                            # Range rows are recorded under a negative key.
                            num = -num
                        cache[num] = row
                        # Prioritize shown name matches first for string matches.
                        # Also save _why_ it matched.
                        if i > 0:
                            whymatched[num].add(name)
                        for position in indices:
                            filled.add(position)
                            if not results[position]:
                                results[position] = [[], []]
                            results[position][i > 0].append(num)
                        break

    # Merge priority and non-priority matches.
    for indices in strings.values():
        for position in indices & filled:
            results[position] = results[position][0] + results[position][1]

    missing = set(range(len(search))) - filled
    numbers = defaultdict(list)
    indices = set()
    for i in missing:
        elem = search[i]
        if isinstance(elem, int):
            cache[elem] = [elem] + invalid
            results[i].append(elem)
        elif isinstance(elem, str):
            results[i] = [None] * len(elem)
            for j, c in enumerate(elem):
                numbers[ord(c)].append((i, j))
                indices.add((i, j))
    numbers = sorted(numbers.items(), reverse=True)

    if indices:
        # Decoding what we have left, just some numbers.
        with open(UnicodeDataFull, 'r', encoding='utf-8') as fh:
            for line in fh:
                row = json.loads(line)
                if not numbers:
                    break
                last = row[1] or row[0]
                while numbers and row[0] <= numbers[-1][0] <= last:
                    number, positions = numbers.pop()
                    cache[number] = row
                    for i, j in positions:
                        indices.remove((i, j))
                        results[i][j] = number
        for i, j in indices:
            num = ord(search[i][j])
            cache[num] = [num] + unknown
            results[i][j] = num
        if len(search) == 1:
            # This means we've fallen back on decoding our single input as a string.
            # Setting this lets us display output differently, hopefully more usefully.
            decode = True

    results = [r for inner in results for r in inner]

    if quiet and not verbose:
        if decode:
            fmt = '{code}'
        else:
            fmt = '{char}'
        join = ' '
    elif verbose and not quiet:
        if verbosity > 1:
            fmt = '{code} [{long_category}, {extra_category}] {names}: {char}'
        else:
            fmt = '{code} [{long_category}] {names}: {char}'
        join = '\n'
    elif quiet and verbose:
        fmt = '{code} [{category}] {name}: {char}'
        join = '\n'
    else:
        fmt = '{code} [{category}] {namee}: {char}'
        join = '\n'

    def show(codepoint):
        # Numbers beyond U+10FFFF make chr() raise, and lone surrogates cannot
        # be written out as UTF-8; show U+FFFD for both.
        if 0 <= codepoint <= 0x10FFFF and not 0xD800 <= codepoint <= 0xDFFF:
            return chr(codepoint)
        return '\ufffd'

    def label(codepoint):
        # The UTF-8 notation does not exist for out-of-range numbers and
        # surrogates; fall back to the U+ notation for those.
        try:
            return unif(codepoint)
        except (ValueError, OverflowError):
            return uniuni(codepoint)

    def get_output(results):
        output = []
        for num in results:
            # Negative keys mark rows that matched as a whole range.
            is_range = num < 0
            range_start, range_end, category, compose, names = cache[num]
            # U+25CC DOTTED CIRCLE is put in front of characters whose row
            # has the combining flag set.
            carrier = '\u25cc' if compose else ''
            if is_range:
                char = carrier + show(range_start) + '..' + carrier + show(range_end)
                code = label(range_start) + '..' + label(range_end)
            else:
                char = carrier + show(num)
                code = label(num)
            extra = ''
            if num in whymatched:
                # Sorted so the output does not depend on set ordering.
                extra = ' (' + ', '.join(sorted(whymatched[num])) + ')'
            output.append(fmt.format(
                code=code,
                name=names[0],
                namee=names[0] + extra,
                names=', '.join(names),
                char=char,
                category=category,
                long_category=categories[category][0],
                extra_category=categories[category][1],
            ))
        return join.join(output)

    output = get_output(results)
    if not quiet:
        return output
    if len(output.encode('utf-8')) > 470:
        # Bisect for the longest prefix of the results that fits.
        cut = len(results) // 2
        step = cut
        tried_okay = set()
        for _ in range(ceil(log(len(results), 2)) + 1):
            candidate = (get_output(results[:cut]) + ' ...').encode('utf-8')
            step //= 2
            if len(candidate) < 450:
                tried_okay.add(cut)
                cut += step
            else:
                cut -= step
        # default=0: if no prefix ever fit, return just the ellipsis.
        output = get_output(results[:max(tried_okay, default=0)]) + ' ...'
    return output
# Command-line entry point:
#   one argument   -> serve that IRC channel (runs until the log follower ends)
#   two arguments  -> run a single lookup with the given flags and print it
#   anything else  -> print usage and exit with status 1
if __name__ == '__main__':
    # sys.exit rather than the bare `exit` helper, which is only installed by
    # the `site` module and is meant for interactive use.
    import sys
    argv = sys.argv
    if len(argv) == 2:
        irc(argv[1])
    elif len(argv) == 3:
        print(doit(argv[1], argv[2]))
    else:
        print('Usage:', argv[0], '#channel')
        # '8' is listed too: doit() accepts it alongside q, v and d.
        print(' or:', argv[0], '[8qvd]*', 'query')
        sys.exit(1)