happybot/happybot/compose/update.py

from urllib.request import urlopen
from collections import defaultdict
from copy import deepcopy
from os import chdir
import string
import re
import unicodedata

def keysym_names(lines=None):
    """Map X11 keysym names to the character and description they stand for.

    Only ``#define XK_<name> 0x<hex> /* U+<codepoint> <description> */``
    lines are used (the comment may also be written ``/*(U+... ...)*/``);
    every other line is skipped.

    Args:
        lines: Optional iterable of ``bytes`` or ``str`` lines to parse
            instead of downloading ``keysymdef.h``. When ``None`` (the
            default) the header is fetched from cgit.freedesktop.org.

    Returns:
        A dict mapping keysym name (without the ``XK_`` prefix) to a
        ``(character, description)`` tuple.
    """
    keysym = re.compile(
        r'^#define XK_([a-zA-Z_0-9]+)\s+0x[0-9a-f]+\s*'
        r'/\*([ (])U\+([0-9A-F]+) (.+?)\s*\*/\s*$'
    )

    def parse(raw_lines):
        result = dict()
        for raw in raw_lines:
            if isinstance(raw, bytes):
                raw = raw.decode('utf-8', 'ignore')
            m = keysym.match(raw.strip())
            if not m:
                continue
            name, opener, codepoint, comment = m.groups()
            # In the "/*(U+XXXX NAME)*/" form the closing parenthesis belongs
            # to the comment syntax, not to the description.
            if opener == '(' and comment.endswith(')'):
                comment = comment[:-1].rstrip()
            result[name] = (chr(int(codepoint, 16)), comment)
        return result

    if lines is not None:
        return parse(lines)
    with urlopen('https://cgit.freedesktop.org/xorg/proto/x11proto/plain/keysymdef.h') as web:
        return parse(web)

def _strip_block_comments(line, in_comment):
    """Remove C-style ``/* ... */`` comment text from one line.

    Args:
        line: The text of the current line.
        in_comment: True when a comment opened on an earlier line is still
            open at the start of this line.

    Returns:
        ``(text, in_comment)``: the text outside comments, and whether a
        comment is still open at the end of the line.
    """
    kept = []
    while True:
        if in_comment:
            end = line.find('*/')
            if end == -1:
                # The whole remainder is comment text.
                break
            line = line[end + 2:]
            in_comment = False
        else:
            start = line.find('/*')
            if start == -1:
                kept.append(line)
                break
            kept.append(line[:start])
            line = line[start + 2:]
            in_comment = True
    return ''.join(kept), in_comment


def compose_keys(lines=None):
    """Parse ``<Multi_key>`` rules from the en_US.UTF-8 ``Compose.pre`` file.

    Only lines of the form
    ``<Multi_key> <key>... : "result" [name] # comment`` are used; text
    inside ``/* ... */`` comments is ignored.

    Args:
        lines: Optional iterable of ``bytes`` or ``str`` lines to parse
            instead of downloading ``Compose.pre``. When ``None`` (the
            default) the file is fetched from cgit.freedesktop.org.

    Returns:
        A 4-tuple ``(char_to_sequence, char_to_name, name_to_char,
        char_to_comment)``:
          * char_to_sequence: result string -> list of key-name lists
          * char_to_name: result string -> set of keysym names given for it
          * name_to_char: keysym name -> result string
          * char_to_comment: result string -> trailing ``#`` comment

    Raises:
        AssertionError: if a keysym name is given for two different results,
            or a result appears with two different comments. Diagnostics
            are printed before raising.
        UnicodeDecodeError: if a ``bytes`` line is not valid UTF-8.
    """
    compose_line = re.compile(r'^<Multi_key>((?:\s*<[a-zA-Z_0-9]+>)+)\s*:\s*"((?:[^"]|\\.)+)"\s*([a-zA-Z_0-9]*)\s*#\s*(.*)')
    key_name = re.compile(r'<([a-zA-Z_0-9]+)>')

    char_to_sequence = defaultdict(list)
    char_to_name = defaultdict(set)
    char_to_comment = dict()
    name_to_char = dict()

    def parse(raw_lines):
        in_comment = False
        for raw in raw_lines:
            if isinstance(raw, bytes):
                raw = raw.decode('utf-8', 'strict')
            line, in_comment = _strip_block_comments(raw.strip(), in_comment)

            m = compose_line.match(line)
            if not m:
                continue
            sequence, char, name, comment = m.groups()

            # Keys may be separated by any amount of whitespace (or none),
            # so pull the names out directly instead of splitting on ' '.
            sequence = key_name.findall(sequence)
            # Undo backslash escapes inside the quoted result.
            char = re.sub(r'\\(.)', r'\1', char)

            char_to_sequence[char].append(sequence)

            if name:
                char_to_name[char].add(name)
                known_char = name_to_char.setdefault(name, char)
                if known_char != char:
                    print('Line:', line)
                    print('Name:', name)
                    print('Had char:', known_char)
                    print('Given char:', char)
                    raise AssertionError(
                        'keysym name %r maps to both %r and %r'
                        % (name, known_char, char))

            known_comment = char_to_comment.setdefault(char, comment)
            if known_comment != comment:
                print('Line:', line)
                print('Char:', char)
                print('Had comment:', known_comment)
                print('Given comment:', comment)
                raise AssertionError(
                    'result %r has comments %r and %r'
                    % (char, known_comment, comment))

    if lines is None:
        with urlopen('https://cgit.freedesktop.org/xorg/lib/libX11/plain/nls/en_US.UTF-8/Compose.pre') as web:
            parse(web)
    else:
        parse(lines)

    return char_to_sequence, char_to_name, name_to_char, char_to_comment

def merged():
    """Combine the Compose rules with the keysym table for display.

    Starts from the result of ``compose_keys()`` and folds in every entry
    of ``keysym_names()``, then prepares ``name_to_char`` for rendering key
    sequences: keys producing a space are shown as U+2423, ``Tab`` as
    U+21E5, ``U<hex>`` keys as the code point they name, and combining
    characters are prefixed with U+25CC. Sequences containing a key with no
    known display character (for example dead keys) are dropped, and
    results left with no sequence are removed.

    Returns:
        The 4-tuple ``(char_to_sequence, char_to_name, name_to_char,
        char_to_comment)`` in the same shape ``compose_keys()`` returns.

    Raises:
        AssertionError: if a keysym name maps to different characters in
            the Compose file and the keysym table (diagnostics are printed
            first).
        ValueError: if one of the display symbols (U+2384, U+2423, U+21E5)
            is itself a known keysym character used in the data, which
            would make the rendered output ambiguous.
    """
    char_to_sequence, char_to_name, name_to_char, char_to_comment = compose_keys()

    for name, (char, comment) in keysym_names().items():
        known_char = name_to_char.setdefault(name, char)
        if known_char != char:
            print('Name:', name)
            print('Compose char:', known_char)
            print('Keysym char:', char)
            raise AssertionError(
                'keysym name %r maps to both %r and %r'
                % (name, known_char, char))

        char_to_name[char].add(name)

        existing = char_to_comment.get(char)
        if existing is None:
            char_to_comment[char] = comment
        elif existing != comment and comment not in existing.split(';'):
            # Several keysyms can share a character; record each distinct
            # description only once.
            char_to_comment[char] = existing + ';' + comment

    # Compose symbol:
    if '\u2384' in char_to_name:
        raise ValueError('Please use different symbol for compose key.')
    # Space symbol:
    # (char_to_name is a defaultdict, so these lookups yield an empty set
    # when the symbol is not a known keysym character.)
    space_names = char_to_name['\u2423']
    for name in char_to_name[' ']:
        name_to_char[name] = '\u2423'
    # Tab symbol:
    tab_sym_names = char_to_name['\u21e5']
    name_to_char['Tab'] = '\u21e5'

    unicode_key = re.compile(r'^U[0-9a-fA-F]+$')

    # Filter out sequences that have keys we don't know how to display nicely.
    # Includes stuff like deadkeys.
    for char in list(char_to_sequence.keys()):
        new_sequences = []
        for sequence in char_to_sequence[char]:
            if any(key in space_names for key in sequence):
                raise ValueError('Please switch to using \\u2420 (\u2420) for space.')
            if any(key in tab_sym_names for key in sequence):
                raise ValueError('Please use different symbol for tab key.')
            for key in sequence:
                if key not in name_to_char and unicode_key.match(key):
                    codepoint = int(key[1:], 16)
                    # chr() rejects values beyond the Unicode range; leave
                    # such keys unknown so the sequence is filtered out.
                    if codepoint <= 0x10FFFF:
                        name_to_char[key] = chr(codepoint)
            if all(key in name_to_char for key in sequence):
                new_sequences.append(sequence)
        if new_sequences:
            char_to_sequence[char] = new_sequences
        else:
            del char_to_sequence[char]

    # Add spaces so that combining keys become more obvious as combining keys...
    # ...but this is still not obvious, but oh well.
    for name in list(name_to_char.keys()):
        shown = name_to_char[name]
        # unicodedata.combining() only accepts a single character.
        if len(shown) == 1 and unicodedata.combining(shown):
            name_to_char[name] = '\u25cc' + shown

    return char_to_sequence, char_to_name, name_to_char, char_to_comment

def uni(char, zf=4):
    """Return the code point of ``char`` as uppercase hexadecimal digits.

    The digits are left-padded with zeros to at least ``zf`` characters and
    carry no ``0x`` or ``U+`` prefix.
    """
    digits = format(ord(char), 'X')
    return digits.zfill(zf)

def main(output_dir='/home/zgrep/offtopiabday/happybot/compose/'):
    """Regenerate ``comments.txt``, ``chars.txt`` and ``sequences.txt``.

    Each result string from ``merged()`` contributes one line to every
    file, in the same order:
      * comments.txt: its keysym names (sorted), its comment and, for
        single-character results, ``U+XXXX`` and ``UXXXX``, joined by ``;``
      * chars.txt: the result string itself
      * sequences.txt: the result, `` ← ``, then each key sequence rendered
        as U+2384 followed by the display characters of its keys

    Args:
        output_dir: Directory the files are written to. The process changes
            its working directory to it.
    """
    char_to_sequence, char_to_name, name_to_char, char_to_comment = merged()

    comment_lines = []
    char_lines = []
    sequence_lines = []

    for char, sequences in char_to_sequence.items():
        # Sets have no stable order; sort so the output is reproducible.
        entry = ';'.join(sorted(char_to_name[char])) + ';' + char_to_comment[char]
        if len(char) == 1:
            entry += ';U+' + uni(char) + ';U' + uni(char)
        comment_lines.append(entry + '\n')

        char_lines.append(char + '\n')

        rendered = ' '.join(
            '\u2384' + ''.join(name_to_char[name] for name in sequence)
            for sequence in sequences
        )
        sequence_lines.append(char + ' ← ' + rendered + '\n')

    chdir(output_dir)
    # The content is mostly non-ASCII, so do not depend on the locale's
    # default encoding.
    with open('comments.txt', 'w', encoding='utf-8') as fh:
        fh.write(''.join(comment_lines))
    with open('chars.txt', 'w', encoding='utf-8') as fh:
        fh.write(''.join(char_lines))
    with open('sequences.txt', 'w', encoding='utf-8') as fh:
        fh.write(''.join(sequence_lines))

    print('Success!')

# TODO: Deadkey combinations decoding?
# TODO: Allow deadkeys in compose-key combinations?

# Regenerate the output files only when run as a script, not on import.
if __name__ == '__main__':
    main()