From 5bd2ae5410f973aa553d270a74115f188c1372e4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Juhani=20Krekel=C3=A4?= <juhani.haverinen@gmail.com>
Date: Fri, 12 Oct 2018 07:26:48 +0300
Subject: [PATCH] Start on the actual titlebot code

---
 botcmd.py | 97 ++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 96 insertions(+), 1 deletion(-)
diff --git a/botcmd.py b/botcmd.py
index 217ab41..829977f 100644
--- a/botcmd.py
+++ b/botcmd.py
@@ -1,3 +1,6 @@
+import html
+import urllib.request
+
 # initialize(*, config)
 # Called to initialize the IRC bot
 # Runs before even logger is brought up, and blocks further bringup until it's done
@@ -20,6 +23,80 @@ def on_connect(*, irc):
 def on_quit(*, irc):
 	...
 
+def find_urls(message):
+	"""Return the http:// and https:// URLs found in the str message, in order of appearance."""
+	urls = []
+	index = 0
+	while index < len(message):
+		# Scan for "http" as the common subset of http:// and https:// (searching from an offset avoids copying the tail)
+		maybe_next_url = message.find('http', index)
+		# End if there are no more URLs to find
+		if maybe_next_url == -1: break
+
+		if message.startswith(('http://', 'https://'), maybe_next_url):
+			# Looks like we found a URL, scan for its end
+			index = maybe_next_url
+			parens = 0
+			while index < len(message):
+				# Since browsers don't urlencode parens nowadays, try to avoid breaking those URLs while allowing (https://example.com) to work as well
+				if message[index] == '(':
+					parens += 1
+				elif message[index] == ')':
+					if parens > 0:
+						parens -= 1
+					else:
+						break
+				# Some people punctuate their URLs: '.' or ',' ends the URL when followed by a space or by the end of the message
+				elif message[index] in ('.', ',') and message[index+1:index+2] in ('', ' '):
+					break
+				elif message[index] in (' ', '>'):
+					break
+
+				index += 1
+
+			urls.append(message[maybe_next_url:index])
+		else:
+			index = maybe_next_url + 1
+
+	return urls
+
+def extract_title(page_data):
+	"""Return the HTML-unescaped text of the first <title> element in the bytes page_data (decoded as UTF-8), or None if there is no <title> tag."""
+	# Find the <title> tag (matched case-insensitively, without attributes)
+	title_start = None
+	index = 0
+	while True:
+		maybe_tag = page_data.find(b'<', index)
+		if maybe_tag == -1: break
+
+		if page_data[maybe_tag:maybe_tag+7].lower() == b'<title>':
+			title_start = maybe_tag + 7
+			break
+		else:
+			index = maybe_tag + 1
+
+	if title_start is None:
+		return None
+
+	# Find the </title> tag, looking only after <title> so that a stray earlier </title> can't yield an empty title
+	title_end = None
+	index = title_start
+	while True:
+		maybe_tag = page_data.find(b'<', index)
+		if maybe_tag == -1: break
+
+		if page_data[maybe_tag:maybe_tag+8].lower() == b'</title>':
+			title_end = maybe_tag
+			break
+		else:
+			index = maybe_tag + 1
+
+	# No closing tag (e.g. the data was cut off): treat everything after <title> as the title
+	if title_end is None:
+		title_end = len(page_data)
+
+	return html.unescape(page_data[title_start:title_end].decode('utf-8', errors = 'replace'))
+
 # handle_message(*, prefix, message, nick, channel, irc)
 # Called for PRIVMSGs.
 # prefix is the prefix at the start of the message, without the leading ':'
@@ -29,7 +106,25 @@ def on_quit(*, irc):
 # irc is the IRC API object
 # All strings are bytestrings
 def handle_message(*, prefix, message, nick, channel, irc):
-	...
+	import http.client  # function-local import: only needed to name HTTPException below
+	# message is a bytestring and need not be valid UTF-8; replace undecodable bytes instead of raising
+	urls = find_urls(message.decode('utf-8', errors = 'replace'))
+
+	# Don't titlebot >3 urls
+	possible_titles_left = 3
+	for url in urls:
+		if possible_titles_left == 0: break
+		try:
+			with urllib.request.urlopen(url, timeout = 1) as response:
+				if response.info().get_content_type() == 'text/html':
+					# First 4KB of a page should be enough for any <title>
+					first_kb = response.read(4 * 1024)
+					title = extract_title(first_kb)
+					print(title)  # debug
+					possible_titles_left -= 1
+		# URLs come from chat text: malformed ones raise ValueError or http.client.InvalidURL rather than IOError
+		except (IOError, ValueError, http.client.HTTPException):
+			continue
 
 # handle_nonmessage(*, prefix, command, arguments, irc)
 # Called for all other commands than PINGs and PRIVMSGs.