From dfe8770e2b26f03bbbb35682608e17c89327ffa9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Juhani=20Krekel=C3=A4?= <juhani@krekelä.fi>
Date: Thu, 9 Jul 2020 20:03:41 +0300
Subject: [PATCH] First commit

---
 .gitignore       |   3 +
 CC0              | 116 +++++++++++++++++++++++++
 cli_client.py    |  24 ++++++
 markup.py        | 220 +++++++++++++++++++++++++++++++++++++++++++++++
 wikipedia_api.py |  34 ++++++++
 5 files changed, 397 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 CC0
 create mode 100644 cli_client.py
 create mode 100644 markup.py
 create mode 100644 wikipedia_api.py

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..15c993e
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,3 @@
+__pycache__
+*.pyc
+*.swp
diff --git a/CC0 b/CC0
new file mode 100644
index 0000000..670154e
--- /dev/null
+++ b/CC0
@@ -0,0 +1,116 @@
+CC0 1.0 Universal
+
+Statement of Purpose
+
+The laws of most jurisdictions throughout the world automatically confer
+exclusive Copyright and Related Rights (defined below) upon the creator and
+subsequent owner(s) (each and all, an "owner") of an original work of
+authorship and/or a database (each, a "Work").
+
+Certain owners wish to permanently relinquish those rights to a Work for the
+purpose of contributing to a commons of creative, cultural and scientific
+works ("Commons") that the public can reliably and without fear of later
+claims of infringement build upon, modify, incorporate in other works, reuse
+and redistribute as freely as possible in any form whatsoever and for any
+purposes, including without limitation commercial purposes. These owners may
+contribute to the Commons to promote the ideal of a free culture and the
+further production of creative, cultural and scientific works, or to gain
+reputation or greater distribution for their Work in part through the use and
+efforts of others.
+
+For these and/or other purposes and motivations, and without any expectation
+of additional consideration or compensation, the person associating CC0 with a
+Work (the "Affirmer"), to the extent that he or she is an owner of Copyright
+and Related Rights in the Work, voluntarily elects to apply CC0 to the Work
+and publicly distribute the Work under its terms, with knowledge of his or her
+Copyright and Related Rights in the Work and the meaning and intended legal
+effect of CC0 on those rights.
+
+1. Copyright and Related Rights. A Work made available under CC0 may be
+protected by copyright and related or neighboring rights ("Copyright and
+Related Rights"). Copyright and Related Rights include, but are not limited
+to, the following:
+
+  i. the right to reproduce, adapt, distribute, perform, display, communicate,
+  and translate a Work;
+
+  ii. moral rights retained by the original author(s) and/or performer(s);
+
+  iii. publicity and privacy rights pertaining to a person's image or likeness
+  depicted in a Work;
+
+  iv. rights protecting against unfair competition in regards to a Work,
+  subject to the limitations in paragraph 4(a), below;
+
+  v. rights protecting the extraction, dissemination, use and reuse of data in
+  a Work;
+
+  vi. database rights (such as those arising under Directive 96/9/EC of the
+  European Parliament and of the Council of 11 March 1996 on the legal
+  protection of databases, and under any national implementation thereof,
+  including any amended or successor version of such directive); and
+
+  vii. other similar, equivalent or corresponding rights throughout the world
+  based on applicable law or treaty, and any national implementations thereof.
+
+2. Waiver. To the greatest extent permitted by, but not in contravention of,
+applicable law, Affirmer hereby overtly, fully, permanently, irrevocably and
+unconditionally waives, abandons, and surrenders all of Affirmer's Copyright
+and Related Rights and associated claims and causes of action, whether now
+known or unknown (including existing as well as future claims and causes of
+action), in the Work (i) in all territories worldwide, (ii) for the maximum
+duration provided by applicable law or treaty (including future time
+extensions), (iii) in any current or future medium and for any number of
+copies, and (iv) for any purpose whatsoever, including without limitation
+commercial, advertising or promotional purposes (the "Waiver"). Affirmer makes
+the Waiver for the benefit of each member of the public at large and to the
+detriment of Affirmer's heirs and successors, fully intending that such Waiver
+shall not be subject to revocation, rescission, cancellation, termination, or
+any other legal or equitable action to disrupt the quiet enjoyment of the Work
+by the public as contemplated by Affirmer's express Statement of Purpose.
+
+3. Public License Fallback. Should any part of the Waiver for any reason be
+judged legally invalid or ineffective under applicable law, then the Waiver
+shall be preserved to the maximum extent permitted taking into account
+Affirmer's express Statement of Purpose. In addition, to the extent the Waiver
+is so judged Affirmer hereby grants to each affected person a royalty-free,
+non transferable, non sublicensable, non exclusive, irrevocable and
+unconditional license to exercise Affirmer's Copyright and Related Rights in
+the Work (i) in all territories worldwide, (ii) for the maximum duration
+provided by applicable law or treaty (including future time extensions), (iii)
+in any current or future medium and for any number of copies, and (iv) for any
+purpose whatsoever, including without limitation commercial, advertising or
+promotional purposes (the "License"). The License shall be deemed effective as
+of the date CC0 was applied by Affirmer to the Work. Should any part of the
+License for any reason be judged legally invalid or ineffective under
+applicable law, such partial invalidity or ineffectiveness shall not
+invalidate the remainder of the License, and in such case Affirmer hereby
+affirms that he or she will not (i) exercise any of his or her remaining
+Copyright and Related Rights in the Work or (ii) assert any associated claims
+and causes of action with respect to the Work, in either case contrary to
+Affirmer's express Statement of Purpose.
+
+4. Limitations and Disclaimers.
+
+  a. No trademark or patent rights held by Affirmer are waived, abandoned,
+  surrendered, licensed or otherwise affected by this document.
+
+  b. Affirmer offers the Work as-is and makes no representations or warranties
+  of any kind concerning the Work, express, implied, statutory or otherwise,
+  including without limitation warranties of title, merchantability, fitness
+  for a particular purpose, non infringement, or the absence of latent or
+  other defects, accuracy, or the present or absence of errors, whether or not
+  discoverable, all to the greatest extent permissible under applicable law.
+
+  c. Affirmer disclaims responsibility for clearing rights of other persons
+  that may apply to the Work or any use thereof, including without limitation
+  any person's Copyright and Related Rights in the Work. Further, Affirmer
+  disclaims responsibility for obtaining any necessary consents, permissions
+  or other rights required for any use of the Work.
+
+  d. Affirmer understands and acknowledges that Creative Commons is not a
+  party to this document and has no duty or obligation with respect to this
+  CC0 or use of the Work.
+
+For more information, please see
+<http://creativecommons.org/publicdomain/zero/1.0/>
diff --git a/cli_client.py b/cli_client.py
new file mode 100644
index 0000000..9c27ebd
--- /dev/null
+++ b/cli_client.py
@@ -0,0 +1,45 @@
+import markup
+import wikipedia_api
+
+def main():
+	"""Run the interactive browsing prompt until EOF or Ctrl-C.
+
+	Input starting with '!' names a page to fetch directly. Any other
+	input is the base-26 label (a, b, ..., z, aa, ...) of a link on the
+	previously rendered page.
+	"""
+	links = []
+	while True:
+		try:
+			go = input('go> ')
+		except (EOFError, KeyboardInterrupt):
+			# Finish the prompt line so the shell prompt starts cleanly
+			print()
+			break
+
+		if go[:1] == '!':
+			page = go[1:]
+		else:
+			# Only labels made of a-z are meaningful to debase26
+			if not go or any(not 'a' <= c <= 'z' for c in go):
+				print('Enter a link label or !Page title')
+				continue
+			# Links are 1-indexed
+			link_index = markup.debase26(go) - 1
+			if link_index >= len(links):
+				print(f'No such link: {go}')
+				continue
+			page = links[link_index]
+
+		try:
+			wikitext = wikipedia_api.wikitext(page)
+		except OSError as err:
+			# urllib's URLError and HTTPError are OSError subclasses
+			print(f'Could not fetch {page}: {err}')
+			continue
+		rendered, _references, links = markup.render(wikitext)
+
+		print(rendered)
+
+if __name__ == '__main__':
+	main()
diff --git a/markup.py b/markup.py
new file mode 100644
index 0000000..65db7cc
--- /dev/null
+++ b/markup.py
@@ -0,0 +1,236 @@
+import html.entities
+import re
+
+# Length of the longest name in html.entities.html5, for longest-match lookup
+_max_entity_length = max(len(name) for name in html.entities.html5)
+
+def base26(num):
+	"""Return num in bijective base 26: 1 -> a, 26 -> z, 27 -> aa.
+
+	Returns '' when num is 0 or negative.
+	"""
+	digits = []
+	while num > 0:
+		# Digits run from 1 to 26, so shift by one before dividing
+		num, digit = divmod(num - 1, 26)
+		digits.append(chr(ord('a') + digit))
+	return ''.join(reversed(digits))
+
+def debase26(digits):
+	"""Inverse of base26: turn a string of the letters a-z into a number.
+
+	Characters outside a-z are not rejected and give a meaningless result.
+	"""
+	num = 0
+	for digit in digits:
+		num = 26 * num + ord(digit) - ord('a') + 1
+	return num
+
+def render(text, inline = False, references = None, links = None):
+	"""Render wikitext into plain text.
+
+	text is the wikitext to render. inline disables the constructs that
+	are only recognised at the start of a line (headings and list
+	items). references and links are lists to continue from; they are
+	copied, not modified.
+
+	Returns (rendered, references, links). references holds the contents
+	of each <ref> element met and links the target of each wikilink; the
+	link at position n (counting from 1) is labelled [base26(n)] in
+	rendered.
+	"""
+	index = 0
+	start_of_line = True
+
+	def eof():
+		return index >= len(text)
+
+	def char():
+		nonlocal index, start_of_line
+		c = text[index]
+		start_of_line = c == '\n'
+		index += 1
+		return c
+
+	def starts(prefix):
+		return text.startswith(prefix, index)
+
+	def consume(prefix):
+		nonlocal index, start_of_line
+		assert starts(prefix)
+		index += len(prefix)
+		start_of_line = prefix[-1] == '\n'
+
+	def read_until(delimiter):
+		start = index
+		while not eof() and not starts(delimiter): char()
+		return text[start:index]
+
+	def save():
+		return (index, start_of_line)
+
+	def load(state):
+		nonlocal index, start_of_line
+		index, start_of_line = state
+
+	rendered = ''
+	references = [] if references is None else references[:]
+	links = [] if links is None else links[:]
+
+	while not eof():
+		if start_of_line and starts('=') and not inline:
+			# ^\=+heading\=+\W*$
+			start = save()
+
+			line = read_until('\n')
+			if not eof(): consume('\n')
+
+			match = re.fullmatch(r'(=+.*=+)\W*', line)
+			heading_level = 0
+			if match is not None:
+				heading, = match.groups()
+
+				# Figure out which equal signs mark heading level and which are part of heading itself
+				equals_start = len(heading) - len(heading.lstrip('='))
+				equals_end = len(heading) - len(heading.rstrip('='))
+				# A line of only equal signs must keep at least one as the heading text
+				heading_level = min(equals_start, equals_end, (len(heading) - 1) // 2)
+
+			if heading_level == 0:
+				# Was not a heading after all
+				load(start)
+				rendered += char()
+				continue
+
+			# Extract the heading
+			heading, references, links = render(heading[heading_level:-heading_level], inline = True, links = links, references = references)
+
+			# Add '#'s before the heading to mark it's a heading
+			while rendered[-2:] != '\n\n':
+				rendered += '\n'
+			rendered += f"{'#' * (heading_level - 1)} {heading}\n"
+
+		elif start_of_line and starts('*') and not inline:
+			# List item
+			list_level = 0
+			while starts('*'):
+				consume('*')
+				list_level += 1
+
+			list_item, references, links = render(read_until('\n'), inline = True, links = links, references = references)
+			if not eof(): consume('\n')
+
+			if rendered[-1:] != '\n':
+				rendered += '\n'
+			rendered += f"{' ' * (list_level - 1)}* {list_item}\n"
+
+		elif starts('[['):
+			# Wikilink
+			consume('[[')
+			link = ''
+			while not eof() and not starts(']]') and not starts('|'):
+				link += char()
+			displaytext = link
+			nesting = 1
+			# Also stop at the end of the text, in case the link is never closed
+			while not eof() and nesting > 0:
+				if starts('[['):
+					consume('[[')
+					nesting += 1
+				elif starts(']]'):
+					consume(']]')
+					nesting -= 1
+				elif nesting == 1 and starts('|'):
+					consume('|')
+					displaytext = ''
+				else:
+					displaytext += char()
+
+			links.append(link)
+			link_id = base26(len(links))
+			displaytext, references, links = render(displaytext, inline = True, links = links, references = references)
+			rendered += f"{displaytext}[{link_id}]"
+
+		elif starts("'''"):
+			# '''bold'''
+			consume("'''")
+			rendered += '*'
+
+		elif starts("''"):
+			# ''italic''
+			consume("''")
+			rendered += '*'
+
+		elif starts('{{'):
+			# {{Template|arg1{{!}}|arg2}}
+			# TODO: Handle templates
+			consume('{{')
+			nesting = 1
+			while not eof() and nesting > 0:
+				if starts('{{'):
+					nesting += 1
+					consume('{{')
+				elif starts('}}'):
+					nesting -= 1
+					consume('}}')
+				else:
+					char()
+
+		elif starts('<ref'):
+			# <ref name="foo">…</ref>
+			# <ref name="foo"/>
+			tag = read_until('>')
+			if not eof(): consume('>')
+			if tag[-1:] != '/':
+				references.append(read_until('</ref>'))
+				if not eof(): consume('</ref>')
+			# TODO: Use correct reference numbers for reused references
+			rendered += f'[{len(references)}]'
+
+		elif starts('\n\n'):
+			# New paragraph
+			consume('\n\n')
+			if rendered != '':
+				rendered += '\n\n'
+
+		elif starts('\n'):
+			# Can be removed
+			consume('\n')
+
+		elif starts('&'):
+			start = save()
+			consume('&')
+			replacement = None
+			if starts('#'):
+				# Numeric charref
+				consume('#')
+				num = read_until(';')
+				if not eof(): consume(';')
+				try:
+					if num[:1] in ('x', 'X'):
+						replacement = chr(int(num[1:], 16))
+					else:
+						replacement = chr(int(num))
+				except (ValueError, OverflowError):
+					# Not a number, or not a valid code point
+					pass
+			else:
+				# Named entity; try the longest name first so 'amp;' wins over 'amp'
+				for length in range(_max_entity_length, 0, -1):
+					entity = text[index:index + length]
+					if entity in html.entities.html5:
+						replacement = html.entities.html5[entity]
+						consume(entity)
+						break
+
+			if replacement is None:
+				# Was not an entity
+				load(start)
+				rendered += char()
+			else:
+				rendered += replacement
+
+		else:
+			rendered += char()
+
+	return rendered, references, links
diff --git a/wikipedia_api.py b/wikipedia_api.py
new file mode 100644
index 0000000..e62421f
--- /dev/null
+++ b/wikipedia_api.py
@@ -0,0 +1,36 @@
+import urllib.parse
+import urllib.request
+
+default_endpoint = 'https://en.wikipedia.org/w/index.php'
+default_timeout = 10
+
+def wikitext(title, endpoint = default_endpoint, timeout = default_timeout):
+	"""Fetch the raw wikitext of a page and return it as a str.
+
+	title is the page title, endpoint the URL of the wiki's index.php
+	(existing query parameters are kept) and timeout the number of
+	seconds passed to urlopen. Errors raised by urlopen propagate.
+	"""
+	protocol, host, path, query, fragment = urllib.parse.urlsplit(endpoint)
+	query = urllib.parse.urlencode(urllib.parse.parse_qsl(query) + [
+		('action', 'raw'),
+		('title', title)
+	])
+	url = urllib.parse.urlunsplit((protocol, host, path, query, fragment))
+	with urllib.request.urlopen(url, timeout=timeout) as r:
+		contents = r.read()
+		charset = r.headers.get_content_charset()
+
+	# Try the declared charset first, then utf-8 and windows-1252
+	candidates = ['utf-8', 'windows-1252']
+	if charset is not None:
+		candidates.insert(0, charset)
+	for encoding in candidates:
+		try:
+			return contents.decode(encoding)
+		except (LookupError, UnicodeDecodeError):
+			# Unknown encoding or undecodable bytes: try the next one
+			pass
+
+	# iso-8859-1 maps every byte, so this cannot fail
+	return contents.decode('iso-8859-1')