#!/usr/bin/env python3
"""Describe Unicode characters, from the command line or from an IRC channel log.

Run with one argument (a channel name) to follow that channel's ``out`` file
and answer requests by writing to its ``in`` file; run with two arguments
(flags, query) to print a single answer.
"""

from subprocess import Popen, PIPE
from os import chdir, environ
from pathlib import Path
from collections import defaultdict
from math import ceil, log
import json
import re

# Make sure we're in the correct directory, for module imports and such too.
basedir = Path(environ.get('basedir', '.'))
chdir(basedir)

# Request syntax recognised in channel messages: an optional run of "-flags "
# groups followed by the query text. Compiled once instead of once per line.
_TRIGGER = re.compile(
    r'(?i)^(?:@?(?:happy|hate)bot[:,] (?:unicode|char)|!char) ((?:-[8qvd]+ )+)?(.+)$'
)

# Largest value chr() accepts.
_MAX_CODE_POINT = 0x10FFFF


def cmd(args):
    """Run *args* as a subprocess and yield its stdout line by line.

    Each line is yielded as text with its final byte (the newline) removed;
    bytes that are not valid UTF-8 are dropped. When the generator finishes or
    is closed, the pipe is closed and the child is terminated (if it is still
    running) and reaped, so no process or file descriptor is leaked.
    """
    proc = Popen(args, stdout=PIPE)
    try:
        for raw in iter(proc.stdout.readline, b''):
            # 'ignore' never raises, so no exception handling is needed here.
            yield raw[:-1].decode('utf-8', 'ignore')
    finally:
        proc.stdout.close()
        if proc.poll() is None:
            proc.terminate()
        proc.wait()


def irc(chan):
    """Follow the ``out`` file of channel *chan* and answer lookup requests.

    Lines are expected to look like ``<date> <time> <nick> <message>`` where
    the nick is wrapped in angle brackets; anything else is skipped. Replies
    (at most four lines, each prefixed with the requesting nick) are written
    to the channel's ``in`` file. The server name comes from the ``serv``
    environment variable.
    """
    server = environ.get('serv', 'irc.libera.chat')
    fdir = '/home/zgrep/offtopiabday/' + server + '/' + chan
    fin = fdir + '/in'
    fout = fdir + '/out'
    for line in cmd(['tail', '-n', '0', '-f', fout]):
        parts = line.split(' ', 3)
        if len(parts) != 4:
            # Too few fields to carry a nick and a message.
            continue
        _date, _time, nick, text = parts
        if not (nick.startswith('<') and nick.endswith('>')):
            continue
        nick = nick[1:-1]
        m = _TRIGGER.match(text)
        if not m:
            continue
        flags, query = m.groups()
        reply = doit(flags or '', query).split('\n')
        reply = [f'\u200b{nick}: ' + entry for entry in reply]
        reply = reply[:4]  # capping at 4 lines max
        # errors='replace' keeps a lone surrogate in the answer from raising.
        with open(fin, 'w', encoding='utf-8', errors='replace') as fh:
            fh.write('\n'.join(reply) + '\n')


# Copied from: https://www.unicode.org/reports/tr44/#GC_Values_Table
# Kept as explicit (abbreviation, long name, description) rows so that the
# table does not depend on tab characters surviving inside a string literal.
_CATEGORY_TABLE = (
    ('Lu', 'Uppercase_Letter', 'an uppercase letter'),
    ('Ll', 'Lowercase_Letter', 'a lowercase letter'),
    ('Lt', 'Titlecase_Letter', 'a digraphic character, with first part uppercase'),
    ('LC', 'Cased_Letter', 'Lu | Ll | Lt'),
    ('Lm', 'Modifier_Letter', 'a modifier letter'),
    ('Lo', 'Other_Letter', 'other letters, including syllables and ideographs'),
    ('L', 'Letter', 'Lu | Ll | Lt | Lm | Lo'),
    ('Mn', 'Nonspacing_Mark', 'a nonspacing combining mark (zero advance width)'),
    ('Mc', 'Spacing_Mark', 'a spacing combining mark (positive advance width)'),
    ('Me', 'Enclosing_Mark', 'an enclosing combining mark'),
    ('M', 'Mark', 'Mn | Mc | Me'),
    ('Nd', 'Decimal_Number', 'a decimal digit'),
    ('Nl', 'Letter_Number', 'a letterlike numeric character'),
    ('No', 'Other_Number', 'a numeric character of other type'),
    ('N', 'Number', 'Nd | Nl | No'),
    ('Pc', 'Connector_Punctuation', 'a connecting punctuation mark, like a tie'),
    ('Pd', 'Dash_Punctuation', 'a dash or hyphen punctuation mark'),
    ('Ps', 'Open_Punctuation', 'an opening punctuation mark (of a pair)'),
    ('Pe', 'Close_Punctuation', 'a closing punctuation mark (of a pair)'),
    ('Pi', 'Initial_Punctuation', 'an initial quotation mark'),
    ('Pf', 'Final_Punctuation', 'a final quotation mark'),
    ('Po', 'Other_Punctuation', 'a punctuation mark of other type'),
    ('P', 'Punctuation', 'Pc | Pd | Ps | Pe | Pi | Pf | Po'),
    ('Sm', 'Math_Symbol', 'a symbol of mathematical use'),
    ('Sc', 'Currency_Symbol', 'a currency sign'),
    ('Sk', 'Modifier_Symbol', 'a non-letterlike modifier symbol'),
    ('So', 'Other_Symbol', 'a symbol of other type'),
    ('S', 'Symbol', 'Sm | Sc | Sk | So'),
    ('Zs', 'Space_Separator', 'a space character (of various non-zero widths)'),
    ('Zl', 'Line_Separator', 'U+2028 LINE SEPARATOR only'),
    ('Zp', 'Paragraph_Separator', 'U+2029 PARAGRAPH SEPARATOR only'),
    ('Z', 'Separator', 'Zs | Zl | Zp'),
    ('Cc', 'Control', 'a C0 or C1 control code'),
    ('Cf', 'Format', 'a format control character'),
    ('Cs', 'Surrogate', 'a surrogate code point'),
    ('Co', 'Private_Use', 'a private-use character'),
    ('Cn', 'Unassigned', 'a reserved unassigned code point or a noncharacter'),
    ('C', 'Other', 'Cc | Cf | Cs | Co | Cn'),
)

# Two-letter category abbreviation -> human readable name ("Lu" -> "Uppercase Letter").
# The one-letter groupings (L, M, N, ...) are deliberately left out.
categories = {
    abbreviation: long_name.replace('_', ' ')
    for abbreviation, long_name, _description in _CATEGORY_TABLE
    if len(abbreviation) == 2
}


def utf8uni(ordinal):
    """Return the UTF-8 encoding of code point *ordinal* as ``0x`` + hex bytes."""
    s = hex(int.from_bytes(chr(ordinal).encode('utf-8'), 'big'))[2:]
    if len(s) % 2 == 1:
        # Pad to whole bytes, e.g. 0x0a rather than 0xa.
        s = '0' + s
    return '0x' + s


def uniuni(ordinal):
    """Return code point *ordinal* in ``U+XXXX`` notation (at least four digits)."""
    return 'U+' + hex(ordinal)[2:].zfill(4).upper()


# Default data file: one JSON array per line, read by doit() as
# [start, end-or-null, category, compose, [names...]].
UnicodeDataFull = 'UnicodeDataFull.json'

# Numeric tokens in a query: U+hex, u/U followed by 4-6 uppercase hex digits,
# 0x followed by UTF-8 bytes in lowercase hex, or 0b followed by UTF-8 bytes in binary.
tokens = re.compile(r'\s*\b(?:U\+([0-9A-Fa-f]+)|[Uu]([0-9A-F]{4,6})|0x([0-9a-f]+)|0b([01]+))\b\s*')

# Row tails (everything after the code point) used when nothing was found.
invalid = [None, "Cn", False, [""]]
unknown = [None, "Cn", False, [""]]


def _append_text(search, text):
    """Add *text* to *search*, merging it into a trailing string item if any."""
    if search and isinstance(search[-1], str):
        search[-1] += text
    else:
        search.append(text)


def _decode_utf8_literal(digits, base, bits_per_digit):
    """Decode *digits* (written in *base*) as big-endian UTF-8 bytes.

    Raises UnicodeDecodeError when the bytes are not valid UTF-8.
    """
    size = ceil(len(digits) * bits_per_digit / 8)
    return int(digits, base).to_bytes(size, 'big').decode('utf-8', 'strict')


def _parse_query(query):
    """Split *query* into code points (ints) and free-text search terms (strs).

    Numeric tokens that cannot be turned into characters are kept as literal
    text and merged with neighbouring text, so adjacent text always forms a
    single search term.
    """
    search = []
    index = 0
    for match in tokens.finditer(query):
        missed = query[index:match.start()]
        if missed:
            _append_text(search, missed)
        index = match.end()
        literal = match.group(0).strip()
        uni1, uni2, hexa, bina = match.groups()
        uni = uni1 or uni2
        if uni:
            code_point = int(uni, 16)
            if code_point <= _MAX_CODE_POINT:
                search.append(code_point)
            else:
                # chr() would reject it later on; treat it as plain text.
                _append_text(search, literal)
            continue
        try:
            if hexa:
                text = _decode_utf8_literal(hexa, 16, 4)
            else:
                text = _decode_utf8_literal(bina, 2, 1)
        except UnicodeDecodeError:
            _append_text(search, literal)
        else:
            search.extend(map(ord, text))
    missed = query[index:]
    if missed:
        _append_text(search, missed)
    return search


def _matching_numbers(row, pending):
    """Pop and yield the entries at the end of *pending* that *row* describes.

    *pending* is a list of ``(code point, positions)`` sorted in descending
    order, so the smallest outstanding code point is last. A row describes
    its own start code point and, when it has an end, everything up to it.
    """
    start = row[0]
    end = row[1] or start
    while pending and start <= pending[-1][0] <= end:
        yield pending.pop()


def doit(flags, query, data_file=None):
    """Look up *query* in the Unicode data file and return a description.

    Parameters:
        flags: string of option letters. ``q`` gives terse output, ``v`` uses
            long category names and all names, ``d`` decodes the query
            character by character, ``8`` prints UTF-8 bytes instead of
            ``U+XXXX``.
        query: characters, numeric tokens (see ``tokens``) and/or text to
            search for in character names. Queries of at most two characters
            are always decoded character by character.
        data_file: path of the JSON-lines data file; defaults to
            ``UnicodeDataFull``.

    Returns:
        The formatted answer: one entry per line, or space separated in
        quiet mode.

    Raises:
        OSError: if the data file cannot be opened.
    """
    quiet = 'q' in flags
    verbose = 'v' in flags
    decode = 'd' in flags or len(query) <= 2
    unif = utf8uni if '8' in flags else uniuni
    if data_file is None:
        data_file = UnicodeDataFull

    # Result key -> data row. Ranges found by name are keyed by -start.
    cache = {}

    search = [ord(c) for c in query] if decode else _parse_query(query)

    results = [[] for _ in search]
    numbers = defaultdict(list)
    strings = defaultdict(list)
    for i, elem in enumerate(search):
        if isinstance(elem, int):
            numbers[elem].append(i)
        elif isinstance(elem, str):
            strings[elem.lower()].append(i)
    # Descending, so that the smallest outstanding number is popped first.
    pending = sorted(numbers.items(), reverse=True)

    # The actual searching.
    # NOTE(review): only the smallest outstanding number is compared with each
    # row, so this presumably relies on the file being sorted by code point;
    # a number without any row would then hide all larger ones - confirm.
    filled = set()
    with open(data_file, 'r', encoding='utf-8') as fh:
        for line in fh:
            row = json.loads(line)
            if pending:
                for number, positions in _matching_numbers(row, pending):
                    cache[number] = row
                    for position in positions:
                        filled.add(position)
                        results[position].append(number)
            elif not strings:
                break
            for string, positions in strings.items():
                if any(string in name.lower() for name in row[4]):
                    num = row[0]
                    if row[1]:
                        # Negative keys mark "the whole range" in the output.
                        num = -num
                    cache[num] = row
                    for position in positions:
                        filled.add(position)
                        results[position].append(num)

    # Whatever found nothing: numbers are reported as invalid, text is
    # decoded character by character instead.
    numbers = defaultdict(list)
    indices = set()
    for i in set(range(len(search))) - filled:
        elem = search[i]
        if isinstance(elem, int):
            cache[elem] = [elem] + invalid
            results[i].append(elem)
        elif isinstance(elem, str):
            results[i] = [None] * len(elem)
            for j, c in enumerate(elem):
                numbers[ord(c)].append((i, j))
                indices.add((i, j))
    pending = sorted(numbers.items(), reverse=True)

    if indices:
        # Decoding what we have left, just some numbers.
        with open(data_file, 'r', encoding='utf-8') as fh:
            for line in fh:
                if not pending:
                    break
                row = json.loads(line)
                for number, positions in _matching_numbers(row, pending):
                    cache[number] = row
                    for i, j in positions:
                        indices.remove((i, j))
                        results[i][j] = number
        for i, j in indices:
            num = ord(search[i][j])
            cache[num] = [num] + unknown
            results[i][j] = num
        if len(search) == 1:
            # This means we've fallen back on decoding our single input as a string.
            # Setting this lets us display output differently, hopefully more usefully.
            decode = True

    results = [r for inner in results for r in inner]

    if quiet and not verbose:
        fmt = '{code}' if decode else '{char}'
        join = ' '
    elif verbose and not quiet:
        fmt = '{code} [{long_category}] {names}: {char}'
        join = '\n'
    else:
        fmt = '{code} [{category}] {name}: {char}'
        join = '\n'

    def render(keys):
        """Format the cached rows for *keys* using the selected template."""
        output = []
        for num in keys:
            range_start, range_end, category, compose, names = cache[num]
            # U+25CC DOTTED CIRCLE is put in front of characters flagged as composing.
            prefix = '\u25cc' if compose else ''
            if num < 0:
                char = prefix + chr(range_start) + '..' + prefix + chr(range_end)
                code = unif(range_start) + '..' + unif(range_end)
            else:
                char = prefix + chr(num)
                code = unif(num)
            output.append(fmt.format(
                code=code,
                name=names[0],
                names=', '.join(names),
                char=char,
                category=category,
                long_category=categories[category],
            ))
        return join.join(output)

    if not quiet:
        return render(results)

    output = render(results)
    # 'surrogatepass' lets the size check cope with lone surrogates.
    if len(output.encode('utf-8', 'surrogatepass')) > 470:
        # Bisect for a prefix of the results that stays under 450 bytes.
        cut = len(results) // 2
        step = cut
        fits = set()
        for _ in range(ceil(log(len(results), 2)) + 1):
            candidate = (render(results[:cut]) + ' ...').encode('utf-8', 'surrogatepass')
            step //= 2
            if len(candidate) < 450:
                fits.add(cut)
                cut += step
            else:
                cut -= step
        output = render(results[:max(fits, default=0)]) + ' ...'
    return output


if __name__ == '__main__':
    import sys

    argv = sys.argv
    if len(argv) == 2:
        irc(argv[1])
    elif len(argv) == 3:
        print(doit(argv[1], argv[2]))
    else:
        print('Usage:', argv[0], '#channel')
        print('   or:', argv[0], '[8qvd]*', 'query')
        sys.exit(1)