# wikisms/markup.py

import html.entities
import re

def base26(num):
	"""Encode a positive integer as lowercase letters in bijective base 26.

	1 -> 'a', 26 -> 'z', 27 -> 'aa', and so on. Values below 1 produce
	the empty string.
	"""
	letters = []
	remaining = num
	while remaining > 0:
		# Digits run 1..26 rather than 0..25, so shift down by one before
		# splitting off the least significant letter
		remaining, offset = divmod(remaining - 1, 26)
		letters.append(chr(ord('a') + offset))
	# Letters were collected least significant first
	return ''.join(reversed(letters))

def debase26(digits):
	"""Decode a string of lowercase letters in bijective base 26.

	Inverse of base26: 'a' -> 1, 'z' -> 26, 'aa' -> 27. The empty string
	decodes to 0.
	"""
	# 'a' stands for 1, so letter values are measured from one below 'a'
	zero_point = ord('a') - 1
	value = 0
	for letter in digits:
		value = value * 26 + (ord(letter) - zero_point)
	return value

def render(text, inline = False, references = None, links = None):
	"""Render wikitext markup into a compact plain-text form.

	Recognised constructs: ``== headings ==``, ``*`` list items,
	``[[wikilinks]]``, bold/italic quote runs, ``{{templates}}`` (dropped),
	``<ref>`` tags, blank-line paragraph breaks and HTML character
	references. Anything else is copied through unchanged, except that
	single newlines are removed.

	Parameters:
		text: the wikitext to render.
		inline: when true, headings and list items are not recognised. The
			recursive calls that render heading, list-item and link text
			use this.
		references: references collected so far. The list is copied, never
			mutated.
		links: link targets collected so far. The list is copied, never
			mutated.

	Returns a ``(rendered, references, links)`` tuple; the two lists are
	the inputs extended with whatever was found in ``text``. Links are
	marked in the output as ``[a]``, ``[b]``, ... (see base26) and
	references as ``[1]``, ``[2]``, ...
	"""
	index = 0
	start_of_line = True

	def eof():
		return index >= len(text)

	def char():
		# Consume and return one character, tracking whether we are now at
		# the start of a line
		nonlocal index, start_of_line
		c = text[index]
		start_of_line = c == '\n'
		index += 1
		return c

	def starts(prefix):
		return text.startswith(prefix, index)

	def consume(prefix):
		# Skip over a prefix the caller has already checked for
		nonlocal index, start_of_line
		assert starts(prefix)
		index += len(prefix)
		start_of_line = prefix[-1] == '\n'

	def read_until(delimiter):
		# Read up to, but not including, the delimiter (or to end of text)
		start = index
		while not eof() and not starts(delimiter): char()
		return text[start:index]

	def save():
		return (index, start_of_line)

	def load(state):
		nonlocal index, start_of_line
		index, start_of_line = state

	rendered = ''
	# Work on copies so the caller's lists are left untouched
	references = [] if references is None else references[:]
	links = [] if links is None else links[:]

	while not eof():
		if start_of_line and starts('=') and not inline:
			# ^\=+heading\=+\W*$
			start = save()

			line = read_until('\n')
			if not eof(): consume('\n')

			match = re.fullmatch(r'(\=+.*\=+)\W*', line)
			heading_level = 0
			if match is not None:
				heading, = match.groups()

				# Figure out which equal signs mark heading level and which are part of heading itself.
				# The last bound only matters for a line made entirely of
				# equal signs: it keeps at least one of them as the heading
				# text, and leaves level 0 for a bare '=='.
				equals_start = len(heading) - len(heading.lstrip('='))
				equals_end = len(heading) - len(heading.rstrip('='))
				heading_level = min(equals_start, equals_end, (len(heading) - 1) // 2)

			if heading_level == 0:
				# Was not a heading after all
				load(start)
				rendered += char()
				continue

			# Extract the heading
			heading, references, links = render(heading[heading_level:-heading_level], inline = True, links = links, references = references)

			# Add '#'s before the heading to mark it's a heading
			while rendered[-2:] != '\n\n':
				rendered += '\n'
			rendered += f"{'#' * (heading_level - 1)} {heading}\n"

		elif start_of_line and starts('*') and not inline:
			# List item
			list_level = 0
			while starts('*'):
				consume('*')
				list_level += 1

			list_item, references, links = render(read_until('\n'), inline = True, links = links, references = references)
			if not eof(): consume('\n')

			if rendered[-1:] != '\n':
				rendered += '\n'
			rendered += f"{' ' * (list_level - 1)}* {list_item}\n"

		elif starts('[['):
			# Wikilink
			consume('[[')
			link = ''
			while not eof() and not starts(']]') and not starts('|'):
				link += char()
			displaytext = link
			nesting = 1
			# Stop at end of text too, so an unterminated link cannot read
			# past the end
			while not eof() and nesting > 0:
				if starts('[['):
					consume('[[')
					nesting += 1
				elif starts(']]'):
					consume(']]')
					nesting -= 1
				elif nesting == 1 and starts('|'):
					# Only the text after the last top-level '|' is displayed
					consume('|')
					displaytext = ''
				else:
					displaytext += char()

			# Register the link before rendering its text so that links
			# nested inside the text get later ids
			links.append(link)
			link_id = base26(len(links))
			displaytext, references, links = render(displaytext, inline = True, links = links, references = references)
			rendered += f"{displaytext}[{link_id}]"

		elif starts("'''"):
			# '''bold'''
			consume("'''")
			rendered += '*'

		elif starts("''"):
			# ''italic''
			consume("''")
			rendered += '*'

		elif starts('{{'):
			# {{Template|arg1{{!}}|arg2}}
			# TODO: Handle templates
			consume('{{')
			nesting = 1
			while not eof() and nesting > 0:
				if starts('{{'):
					nesting += 1
					consume('{{')
				elif starts('}}'):
					nesting -= 1
					consume('}}')
				else:
					char()

		elif starts('<ref'):
			# <ref name="foo">…</ref>
			# <ref name="foo"/>
			# NOTE(review): this prefix test also matches tags such as
			# <references/>; confirm whether those should be skipped instead
			tag = read_until('>')
			if not eof(): consume('>')
			if tag[-1:] != '/':
				references.append(read_until('</ref>'))
				if not eof(): consume('</ref>')
			# TODO: Use correct reference numbers for reused references
			rendered += f'[{len(references)}]'

		elif starts('\n\n'):
			# New paragraph
			consume('\n\n')
			if rendered != '':
				rendered += '\n\n'

		elif starts('\n'):
			# Can be removed
			consume('\n')

		elif starts('&'):
			start = save()
			consume('&')
			if starts('#'):
				# Numeric charref
				consume('#')
				num = read_until(';')
				if not eof(): consume(';')
				try:
					if num[:1] in ('x', 'X'):
						decoded = chr(int(num[1:], 16))
					else:
						decoded = chr(int(num))
				except (ValueError, OverflowError):
					# Was not a valid charref
					load(start)
					rendered += char()
				else:
					rendered += decoded
			else:
				# Named entity: take the longest name found at this
				# position, so that 'amp;' wins over its prefix 'amp'
				longest = max(map(len, html.entities.html5))
				for length in range(min(longest, len(text) - index), 0, -1):
					entity = text[index:index + length]
					if entity in html.entities.html5:
						rendered += html.entities.html5[entity]
						consume(entity)
						break
				else:
					# Was not an entity
					load(start)
					rendered += char()

		else:
			rendered += char()

	return rendered, references, links