221 lines
4.9 KiB
Python
221 lines
4.9 KiB
Python
import html.entities
|
|
import re
|
|
|
|
def base26(num):
    """Encode a positive integer as a bijective base-26 string of 'a'..'z'.

    The numbering has no zero digit: 1 -> 'a', 26 -> 'z', 27 -> 'aa',
    702 -> 'zz', 703 -> 'aaa'. Zero and negative numbers yield the
    empty string.
    """
    letters = []
    while num > 0:
        # Shifting by one before dividing maps the digit range 1..26 onto
        # the remainders 0..25, which is what makes the encoding bijective.
        num, remainder = divmod(num - 1, 26)
        letters.append(chr(ord('a') + remainder))
    # Digits were produced least-significant first.
    return ''.join(reversed(letters))
|
|
|
|
def debase26(digits):
    """Decode a bijective base-26 string of 'a'..'z' back into an integer.

    'a' is worth 1 and 'z' is worth 26, so 'aa' -> 27 and 'zz' -> 702.
    The empty string decodes to 0. Characters are not validated: each one
    simply contributes ``ord(char) - ord('a') + 1``.
    """
    value = 0
    for letter in digits:
        value = value * 26 + (ord(letter) - ord('a') + 1)
    return value
|
|
|
|
# Length of the longest key in html.entities.html5; bounds the lookahead
# used when matching a named entity.
_MAX_ENTITY_LENGTH = max(map(len, html.entities.html5))

# Numeric character reference such as "&#8212;" or "&#x2014;". The trailing
# ";" is optional, as it is for html.unescape.
_NUMERIC_CHARREF = re.compile(r'&#(?:[xX][0-9a-fA-F]+|[0-9]+);?')

# Heading line: a run of "=", anything, a run of "=", then only non-word
# characters up to the end of the line.
_HEADING_LINE = re.compile(r'(=+.*=+)\W*')


def render(text, inline=False, references=None, links=None):
    """Render wiki markup into a plain-text form with link/reference markers.

    Handled constructs:

    * ``==Heading==`` lines (block mode only): a heading written with N
      equals signs is emitted as N - 1 ``#`` characters, a space and the
      heading text, preceded by a blank line.
    * ``*`` list items (block mode only), indented one space per extra ``*``.
    * ``[[target|text]]`` wikilinks: the target is appended to ``links`` and
      the display text is followed by ``[id]``, where ``id`` is the base-26
      encoding of the link's 1-based position in ``links``.
    * ``'''bold'''`` and ``''italic''`` markers, both emitted as ``*``.
    * ``{{templates}}``, which are dropped (nesting is respected).
    * ``<ref ...>...</ref>`` and ``<ref ... />``: the body of a non-empty
      reference is appended to ``references``; ``[n]`` is emitted where
      ``n`` is the current number of references.
    * A blank line starts a new paragraph; a single newline is removed.
    * Numeric and named HTML character references are decoded.

    Args:
        text: The markup to render.
        inline: When true, headings and list items are not recognised.
        references: Reference bodies collected so far; copied, not mutated.
        links: Link targets collected so far; copied, not mutated.

    Returns:
        A ``(rendered, references, links)`` tuple with the rendered text and
        the extended reference and link lists.
    """
    index = 0
    start_of_line = True

    def eof():
        return index >= len(text)

    def char():
        # Consume and return one character.
        nonlocal index, start_of_line
        c = text[index]
        start_of_line = c == '\n'
        index += 1
        return c

    def starts(prefix):
        return text.startswith(prefix, index)

    def consume(prefix):
        # Skip over a prefix that the caller has already checked for.
        nonlocal index, start_of_line
        assert starts(prefix)
        index += len(prefix)
        start_of_line = prefix.endswith('\n')

    def read_until(delimiter):
        # Consume up to (not including) the delimiter, or to end of text.
        nonlocal index, start_of_line
        start = index
        end = text.find(delimiter, index)
        if end == -1:
            end = len(text)
        if end > start:
            start_of_line = text[end - 1] == '\n'
        index = end
        return text[start:end]

    def save():
        return (index, start_of_line)

    def load(state):
        nonlocal index, start_of_line
        index, start_of_line = state

    rendered = ''
    # Work on copies so the caller's lists are never modified.
    references = [] if references is None else list(references)
    links = [] if links is None else list(links)

    while not eof():
        if start_of_line and not inline and starts('='):
            # ^=+heading=+\W*$
            start = save()

            line = read_until('\n')
            if not eof():
                consume('\n')

            match = _HEADING_LINE.fullmatch(line)
            heading = match.group(1) if match is not None else ''
            if not heading.strip('='):
                # Either not a heading at all, or a line made only of '='
                # that has no heading text; emit it as plain text.
                load(start)
                rendered += char()
                continue

            # Equals signs beyond the shorter run belong to the heading text.
            leading = len(heading) - len(heading.lstrip('='))
            trailing = len(heading) - len(heading.rstrip('='))
            heading_level = min(leading, trailing)

            heading, references, links = render(
                heading[heading_level:-heading_level],
                inline=True, links=links, references=references)

            # Make sure the heading is preceded by a blank line.
            while rendered[-2:] != '\n\n':
                rendered += '\n'
            rendered += f"{'#' * (heading_level - 1)} {heading}\n"

        elif start_of_line and not inline and starts('*'):
            # List item
            list_level = 0
            while starts('*'):
                consume('*')
                list_level += 1

            list_item, references, links = render(
                read_until('\n'),
                inline=True, links=links, references=references)
            if not eof():
                consume('\n')

            if rendered[-1:] != '\n':
                rendered += '\n'
            rendered += f"{' ' * (list_level - 1)}* {list_item}\n"

        elif starts('[['):
            # Wikilink
            consume('[[')
            link = ''
            while not eof() and not starts(']]') and not starts('|'):
                link += char()

            # Without a '|' the target doubles as the display text; every
            # top-level '|' restarts the display text, so the last segment
            # wins.
            displaytext = link
            nesting = 1
            # Stop at end of text so an unterminated link cannot read past
            # the input.
            while not eof() and nesting > 0:
                if starts('[['):
                    consume('[[')
                    nesting += 1
                elif starts(']]'):
                    consume(']]')
                    nesting -= 1
                elif nesting == 1 and starts('|'):
                    consume('|')
                    displaytext = ''
                else:
                    displaytext += char()

            links.append(link)
            link_id = base26(len(links))
            displaytext, references, links = render(
                displaytext, inline=True, links=links, references=references)
            rendered += f"{displaytext}[{link_id}]"

        elif starts("'''"):
            # '''bold'''
            consume("'''")
            rendered += '*'

        elif starts("''"):
            # ''italic''
            consume("''")
            rendered += '*'

        elif starts('{{'):
            # {{Template|arg1{{!}}|arg2}}
            # TODO: Handle templates
            consume('{{')
            nesting = 1
            while not eof() and nesting > 0:
                if starts('{{'):
                    nesting += 1
                    consume('{{')
                elif starts('}}'):
                    nesting -= 1
                    consume('}}')
                else:
                    char()

        elif starts('<ref'):
            # <ref name="foo">...</ref>
            # <ref name="foo"/>
            tag = read_until('>')
            if not eof():
                consume('>')
            if tag[-1:] != '/':
                references.append(read_until('</ref>'))
                if not eof():
                    consume('</ref>')
            # TODO: Use correct reference numbers for reused references
            rendered += f'[{len(references)}]'

        elif starts('\n\n'):
            # New paragraph
            consume('\n\n')
            if rendered != '':
                rendered += '\n\n'

        elif starts('\n'):
            # Can be removed
            consume('\n')

        elif starts('&'):
            charref = _NUMERIC_CHARREF.match(text, index)
            if charref is not None:
                # Numeric charref; html.unescape also copes with code points
                # that chr() would reject.
                consume(charref.group())
                rendered += html.unescape(charref.group())
            else:
                # Named entity: try the longest candidate first, because
                # html5 holds both 'amp' and 'amp;', and 'not' as well as
                # 'notin;'.
                name_start = index + 1
                longest = min(_MAX_ENTITY_LENGTH, len(text) - name_start)
                for length in range(longest, 0, -1):
                    name = text[name_start:name_start + length]
                    if name in html.entities.html5:
                        consume('&' + name)
                        rendered += html.entities.html5[name]
                        break
                else:
                    # Was not an entity
                    rendered += char()

        else:
            rendered += char()

    return rendered, references, links
|