wikisms/markup.py

221 lines
4.9 KiB
Python

import html.entities
import re
def base26(num):
    """Encode a positive integer as bijective base-26 lowercase letters.

    1 -> 'a', 26 -> 'z', 27 -> 'aa', and so on. Zero and negative
    numbers produce an empty string.
    """
    letters = []
    remaining = num
    while remaining > 0:
        # Shifting by one maps the digit range 1..26 onto offsets 0..25,
        # which is what makes the encoding bijective (there is no zero digit).
        remaining, offset = divmod(remaining - 1, 26)
        letters.append(chr(ord('a') + offset))
    # Digits were produced least-significant first.
    letters.reverse()
    return ''.join(letters)
def debase26(digits):
    """Decode a string of lowercase letters in bijective base-26 to an integer.

    Each letter counts as 'a' = 1 ... 'z' = 26; the first letter is the most
    significant. An empty input decodes to 0.
    """
    total = 0
    for letter in digits:
        value = ord(letter) - ord('a') + 1
        total = total * 26 + value
    return total
# Heading line: a run of '=', the content, another run of '=', then optional
# trailing non-word characters.
_HEADING_RE = re.compile(r'(=+.*=+)\W*')
# Body of a numeric character reference, matched directly after the '&'.
_CHARREF_RE = re.compile(r'#(?:[xX]([0-9a-fA-F]+)|([0-9]+));')
# Length of the longest named entity; bounds the longest-match lookup.
_MAX_ENTITY_LEN = max(map(len, html.entities.html5))


def render(text, inline=False, references=None, links=None):
    """Render wikitext-style markup to plain text.

    Handles headings, '*' list items, [[wikilinks]], bold/italic quotes,
    {{templates}} (dropped), <ref> tags, paragraph breaks and HTML character
    references. Single newlines are removed.

    Args:
        text: The markup to render.
        inline: When true, headings and list items are not recognised
            (used for the recursive rendering of heading, list item and
            link text).
        references: Reference bodies collected so far, or None. The list is
            copied, never mutated in place.
        links: Wikilink targets collected so far, or None. The list is
            copied, never mutated in place.

    Returns:
        A tuple ``(rendered, references, links)``: the rendered text, the
        reference bodies (``<ref>`` contents, in order of appearance) and the
        wikilink targets. Each link is marked in the text as ``[id]`` where
        ``id`` is ``base26`` of its 1-based position in ``links``; each
        reference is marked as ``[n]``.
    """
    index = 0
    start_of_line = True

    def eof():
        return index >= len(text)

    def char():
        # Consume and return one character, tracking line starts.
        nonlocal index, start_of_line
        c = text[index]
        start_of_line = c == '\n'
        index += 1
        return c

    def starts(prefix):
        return text.startswith(prefix, index)

    def consume(prefix):
        # Skip over a prefix the caller has already checked for.
        nonlocal index, start_of_line
        assert text[index:index + len(prefix)] == prefix
        index += len(prefix)
        start_of_line = prefix[-1] == '\n'

    def read_until(delimiter):
        # Consume up to (not including) the delimiter, or to end of input.
        start = index
        while not eof() and not starts(delimiter):
            char()
        return text[start:index]

    def save():
        return (index, start_of_line)

    def load(state):
        nonlocal index, start_of_line
        index, start_of_line = state

    rendered = ''
    # Copy so the caller's lists are never modified.
    references = [] if references is None else references[:]
    links = [] if links is None else links[:]

    while not eof():
        if start_of_line and starts('=') and not inline:
            # ^=+heading=+\W*$
            start = save()
            line = read_until('\n')
            if not eof():
                consume('\n')
            match = _HEADING_RE.fullmatch(line)
            heading_level = 0
            if match is not None:
                heading = match.group(1)
                # Figure out which equal signs mark the heading level and
                # which are part of the heading itself.
                leading = len(heading) - len(heading.lstrip('='))
                trailing = len(heading) - len(heading.rstrip('='))
                # The last bound only matters for a line made solely of '=':
                # it keeps at least one character as heading content.
                heading_level = min(leading, trailing, (len(heading) - 1) // 2)
            if heading_level == 0:
                # Was not a heading after all
                load(start)
                rendered += char()
                continue
            # Extract the heading
            title, references, links = render(
                heading[heading_level:-heading_level],
                inline=True, links=links, references=references)
            # Make sure the heading is preceded by a blank line
            while rendered[-2:] != '\n\n':
                rendered += '\n'
            # Add '#'s before the heading to mark it's a heading
            rendered += f"{'#' * (heading_level - 1)} {title}\n"
        elif start_of_line and starts('*') and not inline:
            # List item
            list_level = 0
            while starts('*'):
                consume('*')
                list_level += 1
            list_item, references, links = render(
                read_until('\n'),
                inline=True, links=links, references=references)
            if not eof():
                consume('\n')
            if rendered[-1:] != '\n':
                rendered += '\n'
            rendered += f"{' ' * (list_level - 1)}* {list_item}\n"
        elif starts('[['):
            # Wikilink
            consume('[[')
            link = ''
            while not eof() and not starts(']]') and not starts('|'):
                link += char()
            displaytext = link
            nesting = 1
            # Stop at end of input so an unterminated link cannot read past
            # the text.
            while nesting > 0 and not eof():
                if starts('[['):
                    consume('[[')
                    nesting += 1
                elif starts(']]'):
                    consume(']]')
                    nesting -= 1
                elif nesting == 1 and starts('|'):
                    # Only the text after the last top-level '|' is displayed
                    consume('|')
                    displaytext = ''
                else:
                    displaytext += char()
            links.append(link)
            link_id = base26(len(links))
            displaytext, references, links = render(
                displaytext, inline=True, links=links, references=references)
            rendered += f"{displaytext}[{link_id}]"
        elif starts("'''"):
            # '''bold'''
            consume("'''")
            rendered += '*'
        elif starts("''"):
            # ''italic''
            consume("''")
            rendered += '*'
        elif starts('{{'):
            # {{Template|arg1{{!}}|arg2}}
            # TODO: Handle templates
            consume('{{')
            nesting = 1
            while not eof() and nesting > 0:
                if starts('{{'):
                    nesting += 1
                    consume('{{')
                elif starts('}}'):
                    nesting -= 1
                    consume('}}')
                else:
                    char()
        elif starts('<ref'):
            # <ref name="foo">…</ref>
            # <ref name="foo"/>
            # NOTE(review): the '<ref' prefix also matches other tags that
            # begin with it (e.g. <references/>), which then emit a marker.
            tag = read_until('>')
            if not eof():
                consume('>')
            if tag[-1:] != '/':
                references.append(read_until('</ref>'))
                if not eof():
                    consume('</ref>')
            # TODO: Use correct reference numbers for reused references
            rendered += f'[{len(references)}]'
        elif starts('\n\n'):
            # New paragraph
            consume('\n\n')
            if rendered != '':
                rendered += '\n\n'
        elif starts('\n'):
            # Can be removed
            consume('\n')
        elif starts('&'):
            start = save()
            consume('&')
            decoded = None
            charref = _CHARREF_RE.match(text, index)
            if charref is not None:
                # Numeric charref: &#65; or &#x41;
                hex_digits, dec_digits = charref.groups()
                try:
                    if hex_digits:
                        decoded = chr(int(hex_digits, 16))
                    else:
                        decoded = chr(int(dec_digits))
                except (ValueError, OverflowError):
                    # Code point outside the Unicode range
                    decoded = None
                else:
                    consume(charref.group(0))
            else:
                # Named entity. Try the longest candidate first so that
                # 'amp;' wins over the legacy semicolon-less 'amp'.
                longest = min(_MAX_ENTITY_LEN, len(text) - index)
                for length in range(longest, 0, -1):
                    name = text[index:index + length]
                    if name in html.entities.html5:
                        decoded = html.entities.html5[name]
                        consume(name)
                        break
            if decoded is None:
                # Was not an entity
                load(start)
                rendered += char()
            else:
                rendered += decoded
        else:
            rendered += char()
    return rendered, references, links