import html.entities
import re

# Length of the longest HTML5 entity name; bounds the longest-match lookup.
_MAX_ENTITY_LENGTH = max(map(len, html.entities.html5))

# '&#123;' or '&#x7B;' (the terminating semicolon is required).
_NUMERIC_CHARREF = re.compile(r'&#(?:[xX]([0-9a-fA-F]+)|([0-9]+));')

# ^=+heading=+\W*$
_HEADING = re.compile(r'(=+.*=+)\W*')


def base26(num):
    """Encode a positive integer as a bijective base-26 string ('a'..'z').

    1 -> 'a', 26 -> 'z', 27 -> 'aa', ...  Zero (or any non-positive number)
    yields the empty string.
    """
    digits = []
    while num > 0:
        # Digits run 1..26 rather than 0..25, hence ceil(num / 26) - 1.
        next_num = (num + 25) // 26 - 1
        digits.append(num - 26 * next_num)
        num = next_num
    return ''.join(chr(ord('a') + digit - 1) for digit in reversed(digits))


def debase26(digits):
    """Decode a string produced by base26() back into its integer."""
    num = 0
    for digit in digits:
        num = 26 * num + ord(digit) - ord('a') + 1
    return num


def _decode_charref(text, index):
    """Decode the character reference that starts at text[index] == '&'.

    Returns a (replacement, consumed_length) tuple, or None when the text at
    index is not a recognisable numeric reference or HTML5 named entity.
    """
    match = _NUMERIC_CHARREF.match(text, index)
    if match is not None:
        hex_digits, dec_digits = match.groups()
        try:
            if hex_digits is not None:
                code = int(hex_digits, 16)
            else:
                code = int(dec_digits)
            return chr(code), match.end() - index
        except (ValueError, OverflowError):
            # Not a valid code point: leave the text untouched.
            return None

    # html5 contains names both with and without the trailing ';' (e.g. 'amp'
    # and 'amp;'), so try the longest candidate first to avoid leaving a
    # stray ';' or matching a mere prefix ('not' inside 'notin;').
    name_start = index + 1
    longest = min(_MAX_ENTITY_LENGTH, len(text) - name_start)
    for length in range(longest, 0, -1):
        name = text[name_start:name_start + length]
        if name in html.entities.html5:
            return html.entities.html5[name], 1 + length
    return None


def render(text, inline=False, references=None, links=None):
    """Render wikitext-style markup into Markdown-like plain text.

    Handles headings (=..=), list items (*), wikilinks ([[..]]), bold/italic
    quote markers, templates ({{..}}, dropped), references and HTML character
    references.

    Args:
        text: The markup to render.
        inline: When true, block-level constructs (headings, list items) are
            not recognised; used for recursive rendering of fragments.
        references: Reference bodies collected so far; not mutated.
        links: Wikilink targets collected so far; not mutated.

    Returns:
        A (rendered, references, links) tuple, where the two lists are copies
        of the inputs extended with whatever this call collected.
    """
    index = 0
    start_of_line = True

    def eof():
        return index >= len(text)

    def char():
        # Consume and return a single character.
        nonlocal index, start_of_line
        c = text[index]
        start_of_line = c == '\n'
        index += 1
        return c

    def starts(prefix):
        return text.startswith(prefix, index)

    def consume(prefix):
        # Skip over a prefix the caller has already checked for.
        nonlocal index, start_of_line
        assert text.startswith(prefix, index)
        index += len(prefix)
        start_of_line = prefix.endswith('\n')

    def read_until(delimiter):
        # Consume up to (not including) the delimiter, or to end of text.
        nonlocal index, start_of_line
        start = index
        end = text.find(delimiter, index)
        if end == -1:
            end = len(text)
        if end > start:
            start_of_line = text[end - 1] == '\n'
        index = end
        return text[start:end]

    rendered = ''
    # Work on copies so the caller's lists are never modified.
    references = [] if references is None else list(references)
    links = [] if links is None else list(links)

    while not eof():
        if start_of_line and starts('=') and not inline:
            checkpoint = (index, start_of_line)
            line = read_until('\n')
            if not eof():
                consume('\n')

            heading_level = 0
            match = _HEADING.fullmatch(line)
            if match is not None:
                heading = match.group(1)
                # Figure out which equal signs mark the heading level and
                # which are part of the heading itself.  The last bound keeps
                # at least one character of content, so a line made only of
                # '=' characters cannot consume itself entirely.
                equals_start = len(heading) - len(heading.lstrip('='))
                equals_end = len(heading) - len(heading.rstrip('='))
                heading_level = min(equals_start, equals_end,
                                    (len(heading) - 1) // 2)

            if heading_level == 0:
                # Was not a heading after all
                index, start_of_line = checkpoint
                rendered += char()
                continue

            heading, references, links = render(
                heading[heading_level:-heading_level],
                inline=True, links=links, references=references)

            # Separate the heading from preceding text with a blank line and
            # add '#'s before it to mark it as a heading.
            while rendered[-2:] != '\n\n':
                rendered += '\n'
            rendered += f"{'#' * (heading_level - 1)} {heading}\n"

        elif start_of_line and starts('*') and not inline:
            # List item
            list_level = 0
            while starts('*'):
                consume('*')
                list_level += 1

            list_item, references, links = render(
                read_until('\n'),
                inline=True, links=links, references=references)
            if not eof():
                consume('\n')

            if rendered[-1:] != '\n':
                rendered += '\n'
            rendered += f"{' ' * (list_level - 1)}* {list_item}\n"

        elif starts('[['):
            # Wikilink: [[target]] or [[target|display text]]
            consume('[[')
            link = ''
            while not eof() and not starts(']]') and not starts('|'):
                link += char()

            displaytext = link
            nesting = 1
            # Stop at end of text so an unterminated link cannot run past it.
            while nesting > 0 and not eof():
                if starts('[['):
                    consume('[[')
                    nesting += 1
                elif starts(']]'):
                    consume(']]')
                    nesting -= 1
                elif nesting == 1 and starts('|'):
                    # Only the text after the last top-level '|' is shown.
                    consume('|')
                    displaytext = ''
                else:
                    displaytext += char()

            links.append(link)
            link_id = base26(len(links))

            displaytext, references, links = render(
                displaytext,
                inline=True, links=links, references=references)
            rendered += f"{displaytext}[{link_id}]"

        elif starts("'''"):
            # '''bold'''
            consume("'''")
            rendered += '*'

        elif starts("''"):
            # ''italic''
            consume("''")
            rendered += '*'

        elif starts('{{'):
            # {{Template|arg1{{!}}|arg2}}
            # TODO: Handle templates
            consume('{{')
            nesting = 1
            while not eof() and nesting > 0:
                if starts('{{'):
                    nesting += 1
                    consume('{{')
                elif starts('}}'):
                    nesting -= 1
                    consume('}}')
                else:
                    char()

        elif starts('<ref'):
            # <ref ...>content</ref> or self-closing <ref ... />
            # NOTE(review): the '<ref' and '</ref>' literals were
            # reconstructed from the surrounding logic - confirm against the
            # upstream source.
            tag = read_until('>')
            if not eof():
                consume('>')
            if tag[-1:] != '/':
                references.append(read_until('</ref>'))
                if not eof():
                    consume('</ref>')
            # TODO: Use correct reference numbers for reused references
            rendered += f'[{len(references)}]'

        elif starts('\n\n'):
            # New paragraph
            consume('\n\n')
            if rendered != '':
                rendered += '\n\n'

        elif starts('\n'):
            # Can be removed
            consume('\n')

        elif starts('&'):
            decoded = _decode_charref(text, index)
            if decoded is None:
                # Was not an entity
                rendered += char()
            else:
                replacement, length = decoded
                rendered += replacement
                index += length
                start_of_line = False

        else:
            rendered += char()

    return rendered, references, links