From dfe8770e2b26f03bbbb35682608e17c89327ffa9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Juhani=20Krekel=C3=A4?= Date: Thu, 9 Jul 2020 20:03:41 +0300 Subject: [PATCH] First commit --- .gitignore | 3 + CC0 | 116 +++++++++++++++++++++++++ cli_client.py | 24 ++++++ markup.py | 220 +++++++++++++++++++++++++++++++++++++++++++++++ wikipedia_api.py | 34 ++++++++ 5 files changed, 397 insertions(+) create mode 100644 .gitignore create mode 100644 CC0 create mode 100644 cli_client.py create mode 100644 markup.py create mode 100644 wikipedia_api.py diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..15c993e --- /dev/null +++ b/.gitignore @@ -0,0 +1,3 @@ +__pycache__ +*.pyc +*.swp diff --git a/CC0 b/CC0 new file mode 100644 index 0000000..670154e --- /dev/null +++ b/CC0 @@ -0,0 +1,116 @@ +CC0 1.0 Universal + +Statement of Purpose + +The laws of most jurisdictions throughout the world automatically confer +exclusive Copyright and Related Rights (defined below) upon the creator and +subsequent owner(s) (each and all, an "owner") of an original work of +authorship and/or a database (each, a "Work"). + +Certain owners wish to permanently relinquish those rights to a Work for the +purpose of contributing to a commons of creative, cultural and scientific +works ("Commons") that the public can reliably and without fear of later +claims of infringement build upon, modify, incorporate in other works, reuse +and redistribute as freely as possible in any form whatsoever and for any +purposes, including without limitation commercial purposes. These owners may +contribute to the Commons to promote the ideal of a free culture and the +further production of creative, cultural and scientific works, or to gain +reputation or greater distribution for their Work in part through the use and +efforts of others. 
+ +For these and/or other purposes and motivations, and without any expectation +of additional consideration or compensation, the person associating CC0 with a +Work (the "Affirmer"), to the extent that he or she is an owner of Copyright +and Related Rights in the Work, voluntarily elects to apply CC0 to the Work +and publicly distribute the Work under its terms, with knowledge of his or her +Copyright and Related Rights in the Work and the meaning and intended legal +effect of CC0 on those rights. + +1. Copyright and Related Rights. A Work made available under CC0 may be +protected by copyright and related or neighboring rights ("Copyright and +Related Rights"). Copyright and Related Rights include, but are not limited +to, the following: + + i. the right to reproduce, adapt, distribute, perform, display, communicate, + and translate a Work; + + ii. moral rights retained by the original author(s) and/or performer(s); + + iii. publicity and privacy rights pertaining to a person's image or likeness + depicted in a Work; + + iv. rights protecting against unfair competition in regards to a Work, + subject to the limitations in paragraph 4(a), below; + + v. rights protecting the extraction, dissemination, use and reuse of data in + a Work; + + vi. database rights (such as those arising under Directive 96/9/EC of the + European Parliament and of the Council of 11 March 1996 on the legal + protection of databases, and under any national implementation thereof, + including any amended or successor version of such directive); and + + vii. other similar, equivalent or corresponding rights throughout the world + based on applicable law or treaty, and any national implementations thereof. + +2. Waiver. 
To the greatest extent permitted by, but not in contravention of, +applicable law, Affirmer hereby overtly, fully, permanently, irrevocably and +unconditionally waives, abandons, and surrenders all of Affirmer's Copyright +and Related Rights and associated claims and causes of action, whether now +known or unknown (including existing as well as future claims and causes of +action), in the Work (i) in all territories worldwide, (ii) for the maximum +duration provided by applicable law or treaty (including future time +extensions), (iii) in any current or future medium and for any number of +copies, and (iv) for any purpose whatsoever, including without limitation +commercial, advertising or promotional purposes (the "Waiver"). Affirmer makes +the Waiver for the benefit of each member of the public at large and to the +detriment of Affirmer's heirs and successors, fully intending that such Waiver +shall not be subject to revocation, rescission, cancellation, termination, or +any other legal or equitable action to disrupt the quiet enjoyment of the Work +by the public as contemplated by Affirmer's express Statement of Purpose. + +3. Public License Fallback. Should any part of the Waiver for any reason be +judged legally invalid or ineffective under applicable law, then the Waiver +shall be preserved to the maximum extent permitted taking into account +Affirmer's express Statement of Purpose. 
In addition, to the extent the Waiver +is so judged Affirmer hereby grants to each affected person a royalty-free, +non transferable, non sublicensable, non exclusive, irrevocable and +unconditional license to exercise Affirmer's Copyright and Related Rights in +the Work (i) in all territories worldwide, (ii) for the maximum duration +provided by applicable law or treaty (including future time extensions), (iii) +in any current or future medium and for any number of copies, and (iv) for any +purpose whatsoever, including without limitation commercial, advertising or +promotional purposes (the "License"). The License shall be deemed effective as +of the date CC0 was applied by Affirmer to the Work. Should any part of the +License for any reason be judged legally invalid or ineffective under +applicable law, such partial invalidity or ineffectiveness shall not +invalidate the remainder of the License, and in such case Affirmer hereby +affirms that he or she will not (i) exercise any of his or her remaining +Copyright and Related Rights in the Work or (ii) assert any associated claims +and causes of action with respect to the Work, in either case contrary to +Affirmer's express Statement of Purpose. + +4. Limitations and Disclaimers. + + a. No trademark or patent rights held by Affirmer are waived, abandoned, + surrendered, licensed or otherwise affected by this document. + + b. Affirmer offers the Work as-is and makes no representations or warranties + of any kind concerning the Work, express, implied, statutory or otherwise, + including without limitation warranties of title, merchantability, fitness + for a particular purpose, non infringement, or the absence of latent or + other defects, accuracy, or the present or absence of errors, whether or not + discoverable, all to the greatest extent permissible under applicable law. + + c. 
# ===== CC0 1.0 Universal (end of the licence text, section 4 c-d) =====
#
# Affirmer disclaims responsibility for clearing rights of other persons
# that may apply to the Work or any use thereof, including without limitation
# any person's Copyright and Related Rights in the Work. Further, Affirmer
# disclaims responsibility for obtaining any necessary consents, permissions
# or other rights required for any use of the Work.
#
#  d. Affirmer understands and acknowledges that Creative Commons is not a
#     party to this document and has no duty or obligation with respect to this
#     CC0 or use of the Work.
#
# For more information, please see
# <http://creativecommons.org/publicdomain/zero/1.0/>


# ===== cli_client.py =====

def main():
    """Run the interactive ``go>`` prompt until EOF or Ctrl-C.

    Input starting with ``!`` names a page directly (``!Python``); any other
    input is read as the base-26 id (``a``, ``b``, ..., ``aa``) of a link on
    the most recently rendered page.  Unknown ids and fetch failures are
    reported and the prompt continues.
    """
    # Sibling modules of this client (markup.py / wikipedia_api.py).
    import markup
    import wikipedia_api

    links = []
    while True:
        try:
            go = input('go> ')
        except (EOFError, KeyboardInterrupt):
            # Leave the shell prompt on a fresh line.
            print()
            break

        if go[:1] == '!':
            page = go[1:]
        else:
            try:
                number = markup.debase26(go)
            except ValueError:
                number = 0
            # Links are 1-indexed; 0 also covers empty or malformed input.
            if not 1 <= number <= len(links):
                print(f'No such link: {go!r}')
                continue
            page = links[number - 1]

        try:
            source = wikipedia_api.wikitext(page)
        except OSError as error:
            # urllib's URLError/HTTPError and socket timeouts are all OSError.
            print(f'Could not fetch {page!r}: {error}')
            continue

        rendered, references, links = markup.render(source)
        print(rendered)

if __name__ == '__main__':
    main()


# ===== markup.py =====

import html.entities
import re

# A heading line: '=' run, anything, '=' run, then optional non-word trailer.
_HEADING_RE = re.compile(r'(=+.*=+)\W*')
# A well-formed numeric character reference, decimal or hexadecimal.
_CHARREF_RE = re.compile(r'&#(?:[0-9]+|[xX][0-9A-Fa-f]+);')
# Upper bound for the longest-match search over named entities.
_MAX_ENTITY_LENGTH = max(len(name) for name in html.entities.html5)

def base26(num):
    """Return *num* in bijective base 26: 1 -> 'a', 26 -> 'z', 27 -> 'aa'.

    Numbers below 1 give the empty string.
    """
    letters = []
    while num > 0:
        # Shift to 0-based so that 26 maps to 'z' rather than carrying.
        num, digit = divmod(num - 1, 26)
        letters.append(chr(ord('a') + digit))
    return ''.join(reversed(letters))

def debase26(digits):
    """Inverse of base26(): 'a' -> 1, 'z' -> 26, 'aa' -> 27, '' -> 0.

    Raises ValueError if *digits* contains anything but 'a'..'z'.
    """
    num = 0
    for digit in digits:
        if not 'a' <= digit <= 'z':
            raise ValueError(f'invalid base-26 digit: {digit!r}')
        num = 26 * num + ord(digit) - ord('a') + 1
    return num

def _match_entity(text, start):
    """Decode the character reference beginning at the '&' at text[start].

    Returns (replacement, length), where length counts the characters of
    *text* that the reference occupies (including the '&'), or None when no
    reference starts there.
    """
    numeric = _CHARREF_RE.match(text, start)
    if numeric is not None:
        # html.unescape also copes with out-of-range and surrogate values.
        return html.unescape(numeric.group()), numeric.end() - start

    # html5 contains both 'amp' and 'amp;', and 'not' as well as 'notin;',
    # so the longest name must win.
    name_start = start + 1
    longest = min(_MAX_ENTITY_LENGTH, len(text) - name_start)
    for length in range(longest, 0, -1):
        name = text[name_start:name_start + length]
        if name in html.entities.html5:
            return html.entities.html5[name], length + 1
    return None

def render(text, inline = False, references = None, links = None):
    """Render wikitext to plain text.

    text:       the wikitext to render.
    inline:     when true, headings and list items are not recognised
                (used for the recursive rendering of their contents).
    references: references collected so far; copied, never mutated.
    links:      wikilink targets collected so far; copied, never mutated.

    Returns (rendered, references, links).  Every wikilink is followed by
    '[id]' where id is base26() of its 1-based position in links; every
    reference tag is replaced by '[n]' where n is the number of references
    collected so far.  Templates are dropped.
    """
    index = 0
    start_of_line = True

    def eof():
        return index >= len(text)

    def char():
        # Consume and return one character.
        nonlocal index, start_of_line
        c = text[index]
        start_of_line = c == '\n'
        index += 1
        return c

    def starts(prefix):
        return text.startswith(prefix, index)

    def consume(prefix):
        nonlocal index, start_of_line
        assert starts(prefix)
        index += len(prefix)
        start_of_line = prefix.endswith('\n')

    def read_until(delimiter):
        # Consume up to (not including) delimiter, or to the end of text.
        nonlocal index, start_of_line
        start = index
        end = text.find(delimiter, index)
        if end == -1:
            end = len(text)
        if end > index:
            start_of_line = text[end - 1] == '\n'
            index = end
        return text[start:end]

    def save():
        return (index, start_of_line)

    def load(state):
        nonlocal index, start_of_line
        index, start_of_line = state

    rendered = ''
    references = [] if references is None else list(references)
    links = [] if links is None else list(links)

    while not eof():
        if start_of_line and starts('=') and not inline:
            # ^=+heading=+\W*$
            start = save()

            line = read_until('\n')
            if not eof(): consume('\n')

            heading_level = 0
            match = _HEADING_RE.fullmatch(line)
            if match is not None:
                heading = match.group(1)
                # Figure out which equal signs mark heading level and which
                # are part of heading itself.  At least one character must
                # remain as the heading text, hence the third bound.
                leading = len(heading) - len(heading.lstrip('='))
                trailing = len(heading) - len(heading.rstrip('='))
                heading_level = min(leading, trailing, (len(heading) - 1) // 2)

            if heading_level == 0:
                # Was not a heading after all
                load(start)
                rendered += char()
                continue

            # Extract the heading
            heading, references, links = render(heading[heading_level:-heading_level], inline = True, links = links, references = references)

            # Add '#'s before the heading to mark it's a heading
            while rendered[-2:] != '\n\n':
                rendered += '\n'
            rendered += f"{'#' * (heading_level - 1)} {heading}\n"

        elif start_of_line and starts('*') and not inline:
            # List item
            list_level = 0
            while starts('*'):
                consume('*')
                list_level += 1

            list_item, references, links = render(read_until('\n'), inline = True, links = links, references = references)
            if not eof(): consume('\n')

            if rendered[-1:] != '\n':
                rendered += '\n'
            rendered += f"{' ' * (list_level - 1)}* {list_item}\n"

        elif starts('[['):
            # Wikilink
            start = save()
            consume('[[')
            link = read_until_link_end = ''
            while not eof() and not starts(']]') and not starts('|'):
                link += char()
            displaytext = link
            nesting = 1
            while nesting > 0 and not eof():
                if starts('[['):
                    consume('[[')
                    nesting += 1
                elif starts(']]'):
                    consume(']]')
                    nesting -= 1
                elif nesting == 1 and starts('|'):
                    # Only the text after the last top-level '|' is shown.
                    consume('|')
                    displaytext = ''
                else:
                    displaytext += char()

            if nesting > 0:
                # Never closed: not a link, emit the bracket literally.
                load(start)
                rendered += char()
                continue

            links.append(link)
            link_id = base26(len(links))
            displaytext, references, links = render(displaytext, inline = True, links = links, references = references)
            rendered += f"{displaytext}[{link_id}]"

        elif starts("'''"):
            # '''bold'''
            consume("'''")
            rendered += '*'

        elif starts("''"):
            # ''italic''
            consume("''")
            rendered += '*'

        elif starts('{{'):
            # {{Template|arg1{{!}}|arg2}}
            # TODO: Handle templates
            consume('{{')
            nesting = 1
            while not eof() and nesting > 0:
                if starts('{{'):
                    nesting += 1
                    consume('{{')
                elif starts('}}'):
                    nesting -= 1
                    consume('}}')
                else:
                    char()

        elif starts('<ref'):
            # <ref name="foo">...</ref>
            # <ref name="foo" />
            # NOTE(review): the '<ref' / '</ref>' literals were reconstructed
            # from a tag-stripped copy of this file; confirm against the
            # repository.  As written the prefix also matches
            # '<references />'.
            tag = read_until('>')
            if not eof(): consume('>')
            if tag[-1:] != '/':
                references.append(read_until('</ref>'))
                if not eof(): consume('</ref>')
            # TODO: Use correct reference numbers for reused references
            rendered += f'[{len(references)}]'

        elif starts('\n\n'):
            # New paragraph
            consume('\n\n')
            if rendered != '':
                rendered += '\n\n'

        elif starts('\n'):
            # Can be removed
            consume('\n')

        elif starts('&'):
            entity = _match_entity(text, index)
            if entity is None:
                # Was not an entity
                rendered += char()
            else:
                replacement, length = entity
                rendered += replacement
                consume(text[index:index + length])

        else:
            rendered += char()

    return rendered, references, links


# ===== wikipedia_api.py =====

import urllib.parse
import urllib.request

default_endpoint = 'https://en.wikipedia.org/w/index.php'
default_timeout = 10

def _decode(contents, charset):
    """Decode *contents*, preferring the declared *charset* (may be None).

    Falls back to utf-8, then windows-1252, then iso-8859-1 (which accepts
    any byte sequence) when the declared charset is unknown or wrong.
    """
    candidates = ['utf-8', 'windows-1252']
    if charset is not None:
        candidates.insert(0, charset)
    for encoding in candidates:
        try:
            return contents.decode(encoding)
        except (LookupError, UnicodeDecodeError):
            # Unknown encoding, or the bytes are not valid in it
            continue
    return contents.decode('iso-8859-1')

def wikitext(title, endpoint = default_endpoint, timeout = default_timeout):
    """Fetch and return the raw wikitext of the page *title* as a str.

    'action=raw' and 'title=<title>' are appended to whatever query string
    *endpoint* already carries.  *timeout* is passed to urlopen().  Errors
    raised by urlopen() propagate to the caller.
    """
    protocol, host, path, query, fragment = urllib.parse.urlsplit(endpoint)
    query = urllib.parse.urlencode(urllib.parse.parse_qsl(query) + [
        ('action', 'raw'),
        ('title', title)
    ])
    url = urllib.parse.urlunsplit((protocol, host, path, query, fragment))
    with urllib.request.urlopen(url, timeout=timeout) as r:
        contents = r.read()
        charset = r.headers.get_content_charset()

    return _decode(contents, charset)