pheeder/pheeder.py

#!/usr/bin/env python3
import http.server
import socket
import sys
import urllib.parse
import xml.sax.saxutils
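
# Maps each feed name to the gopher URL its RSS is generated from;
# populated by load_feedfile() at startup.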
feeds = {}

def download_gophermap(feedurl):
    split = urllib.parse.urlsplit(feedurl)
    if split.scheme != 'gopher':
        raise ValueError(f'Must be a gopher URL, {feedurl} is not')
    host = split.hostname
    port = split.port if split.port is not None else 70
    itemtype = '1'  # Gophermap by default
    query = None
    if split.query != '':
        query = urllib.parse.unquote(split.query)
        if '\n' in query or '\r' in query:
            raise ValueError('Newlines not allowed in the query')
        itemtype = '8'  # Search has its own item type
    if split.path.lstrip('/') == '':
        # Special handling if the URL doesn't include a path part
        path = ''
    elif split.path.lstrip('/')[0] != itemtype:
        # If the URL specifies something other than a gophermap (or a
        # search, when we have a search query), give up
        raise ValueError(f'Must be a gophermap or search, {feedurl} is not')
    else:
        path = urllib.parse.unquote(split.path.lstrip('/')[1:])
        if '\n' in path or '\r' in path or '\t' in path:
            raise ValueError('Newlines or tabs not allowed in the path')
    with socket.create_connection((host, port)) as sock:
        if query is None:
            sock.sendall(path.encode('utf-8') + b'\r\n')
        else:
            sock.sendall(path.encode('utf-8') + b'\t' + query.encode('utf-8') + b'\r\n')
        lines = []
        buf = bytearray()
        stream_end = False
        while not stream_end:
            data = sock.recv(1024)
            if data == b'':
                break
            buf.extend(data)
            while True:
                newline_index = buf.find(b'\n')
                if newline_index == -1:
                    break
                line = buf[:newline_index]
                buf = buf[newline_index + 1:]
                if line[-1:] == b'\r':
                    line = line[:-1]
                if line == b'.':
                    # End of gophermap reached. Stop parsing here
                    stream_end = True
                    break
                elif line[:1] == b'.':
                    # Dot unquoting
                    line = line[1:]
                lines.append(line)
    return lines
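
# Illustrative call, with a hypothetical host: for
# download_gophermap('gopher://example.org/1/phlog') the selector '/phlog'
# is sent to example.org:70, and the gophermap comes back as a list of byte
# strings, one per line, dot-unquoted and minus the terminating '.'.
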
def construct_url(itemtype, path, host, port):
    if path.lstrip(b'/').startswith(b'URL:'):
        # hURL: copy the target URL directly
        url = path.lstrip(b'/')[4:].decode('utf-8')
        return url
    host = host.decode('utf-8').encode('idna').decode()
    path = urllib.parse.quote(itemtype + path)
    if port == 70:
        # No need to add the default port explicitly
        netloc = host
    else:
        # An IPv6 address needs to be put in brackets
        if ':' in host:
            netloc = f'[{host}]:{port}'
        else:
            netloc = f'{host}:{port}'
    return urllib.parse.urlunsplit(('gopher', netloc, path, '', ''))
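
# Illustrative values: construct_url(b'0', b'/file.txt', b'example.org', 7070)
# yields 'gopher://example.org:7070/0/file.txt', while port 70 would drop
# the ':7070' from the netloc.
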
def get_links(gophermap):
    links = []
    for line in gophermap:
        # A gophermap entry is itemtype+name, selector, host and port,
        # separated by tabs; skip anything without all four fields
        line = line.split(b'\t')
        if len(line) < 4:
            continue
        itemtype_name, path, host, port, *_ = line
        itemtype = itemtype_name[:1]
        name = itemtype_name[1:].decode('utf-8', errors='replace')
        if itemtype == b'3':
            # Item type 3 is the server reporting an error
            raise Exception(f'From gopher: {name}')
        if itemtype == b'i':
            continue  # Don't care about info text
        port = int(port)
        url = construct_url(itemtype, path, host, port)
        links.append((name, url))
    return links
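
# For example, a gophermap line such as (tab-separated, hypothetical values)
#     1My phlog<TAB>/phlog<TAB>example.org<TAB>70
# becomes the link ('My phlog', 'gopher://example.org/1/phlog').
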
def construct_rss(links, feed):
    rss = [f"""<?xml version="1.0" encoding="UTF-8" ?>
<rss version="2.0">
\t<channel>
\t\t<title>{xml.sax.saxutils.escape(feed)} (pheeder)</title>
\t\t<link>{xml.sax.saxutils.escape(feeds[feed])}</link>
\t\t<description></description>
"""]
    for text, url in links:
        rss.append(f"""\t\t<item>
\t\t\t<title>{xml.sax.saxutils.escape(text)}</title>
\t\t\t<link>{xml.sax.saxutils.escape(url)}</link>
\t\t</item>""")
    rss.append('\t</channel>\n</rss>\n')
    return '\n'.join(rss)
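
# The result is a minimal RSS 2.0 document: one <item> per gophermap link,
# with the link's display text as <title> and its gopher URL as <link>.
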
class Pheeder(http.server.BaseHTTPRequestHandler):
    def send_404(self):
        content = f'{self.path} not found'.encode('utf-8')
        self.send_response(404)
        self.send_header('Content-Type', 'text/plain; charset=utf-8')
        self.send_header('Content-Length', len(content))
        self.end_headers()
        self.wfile.write(content)

    def send_500(self):
        content = f'Internal server error while processing {self.path}'.encode('utf-8')
        self.send_response(500)
        self.send_header('Content-Type', 'text/plain; charset=utf-8')
        self.send_header('Content-Length', len(content))
        self.end_headers()
        self.wfile.write(content)

    def send_rss(self, rss):
        rss = rss.encode('utf-8')
        self.send_response(200)
        self.send_header('Content-Type', 'application/rss+xml; charset=utf-8')
        self.send_header('Content-Length', len(rss))
        self.end_headers()
        self.wfile.write(rss)

    def do_GET(self):
        # The last path component names the feed to serve
        feed = self.path.strip('/').split('/')[-1]
        if feed in feeds:
            try:
                gophermap = download_gophermap(feeds[feed])
                links = get_links(gophermap)
                rss = construct_rss(links, feed)
            except Exception:
                # Don't let a broken upstream gopher server kill the request
                self.send_500()
            else:
                self.send_rss(rss)
        else:
            self.send_404()
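
# A GET for any path whose final component is a configured feed name, e.g.
# a hypothetical http://localhost:8080/phlog, downloads that feed's
# gophermap and answers with the RSS rendering of it.
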
def load_feedfile(feedfile):
    global feeds
    with open(feedfile, 'r') as f:
        for line in f:
            line = line.strip()
            # Strip comments, then skip lines with nothing left on them
            comment_start = line.find('#')
            if comment_start != -1:
                line = line[:comment_start].rstrip()
            if line == '':
                continue
            feed, url = line.split(' ', 1)
            feeds[feed] = url
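
# A feedfile holds one feed per line: a name, a space, and a gopher URL,
# e.g. (hypothetical) "phlog gopher://example.org/1/phlog".
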
if __name__ == '__main__':
    if len(sys.argv) < 3:
        print(f'Usage: {sys.argv[0]} port feedfile [feedfile …]', file=sys.stderr)
        sys.exit(1)
    for feedfile in sys.argv[2:]:
        load_feedfile(feedfile)
    http.server.HTTPServer(('', int(sys.argv[1])), Pheeder).serve_forever()
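
# Example invocation (hypothetical): ./pheeder.py 8080 feeds.txt
# serves every feed listed in feeds.txt on http://localhost:8080/<feedname>.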