diff --git a/botcmd.py b/botcmd.py
index 217ab41..829977f 100644
--- a/botcmd.py
+++ b/botcmd.py
@@ -1,3 +1,6 @@
+import html
+import urllib.request
+
# initialize(*, config)
# Called to initialize the IRC bot
# Runs before even logger is brought up, and blocks further bringup until it's done
@@ -20,6 +23,80 @@ def on_connect(*, irc):
def on_quit(*, irc):
    # Stub hook: takes the keyword-only `irc` argument and does nothing, since the body is a bare Ellipsis.
    # NOTE(review): presumably invoked when the bot quits; confirm with the caller.
    ...
def find_urls(message):
    """Return every http:// or https:// URL found in message, in order.

    A URL starts at 'http://' or 'https://' and runs until the first of:
      * a space or '>',
      * a ')' that does not close a '(' opened inside the URL, so that
        both 'https://example.com/Foo_(bar)' and '(https://example.com)'
        come out right,
      * a '.' or ',' that is followed by a space or is the last character
        of the message (sentence punctuation rather than part of the URL),
      * the end of the message.

    message is a str. The result is a list of str, empty when nothing matches.
    """
    urls = []
    length = len(message)
    index = 0
    while index < length:
        # Scan for "http" as the common subset of http:// and https://
        start = message.find('http', index)
        # End if there are no longer urls to find
        if start == -1:
            break

        if not message.startswith(('http://', 'https://'), start):
            # Just a word containing "http"; resume the scan right after it
            index = start + 1
            continue

        # Looks like we found a URL, scan for its end
        index = start
        parens = 0
        while index < length:
            char = message[index]
            # Since browsers don't urlencode parens nowadays, try to avoid
            # breaking those URLs while allowing (https://example.com) to
            # work as well
            if char == '(':
                parens += 1
            elif char == ')':
                if parens == 0:
                    break
                parens -= 1
            # Some people punctuate their URLs; a trailing '.' or ',' at the
            # very end of the message is punctuation just as much as one
            # that is followed by a space
            elif char in ('.', ',') and (index + 1 == length or message[index + 1] == ' '):
                break
            elif char in (' ', '>'):
                break

            index += 1

        urls.append(message[start:index])

    return urls
+
def extract_title(page_data):
    """Return the text of the first <title> element in page_data.

    page_data is the (possibly truncated) start of an HTML document as
    bytes. Tag names are matched case-insensitively. Only a bare '<title>'
    opening tag is recognised; one carrying attributes is not.

    Returns the title as a str, decoded as UTF-8 with undecodable bytes
    replaced and HTML entities unescaped. Returns None when there is no
    opening <title> tag. When the closing </title> tag is missing,
    everything after the opening tag is returned.
    """
    # bytes.lower() only changes ASCII letters, so offsets found in the
    # lowered copy are valid offsets into page_data itself
    lowered = page_data.lower()

    # Find the <title> tag
    opening = lowered.find(b'<title>')
    if opening == -1:
        return None
    title_start = opening + len(b'<title>')

    # Find the </title> tag, looking only after the opening tag so that a
    # stray earlier </title> cannot produce an empty or negative-length slice
    title_end = lowered.find(b'</title>', title_start)
    if title_end == -1:
        # Probably cut off mid-title by the caller's read limit; use what we have
        title_end = len(page_data)

    return html.unescape(page_data[title_start:title_end].decode('utf-8', errors='replace'))
+
# handle_message(*, prefix, message, nick, channel, irc)
# Called for PRIVMSGs.
# prefix is the prefix at the start of the message, without the leading ':'
@@ -29,7 +106,25 @@ def on_quit(*, irc):
# irc is the IRC API object
# All strings are bytestrings
def handle_message(*, prefix, message, nick, channel, irc):
    """Fetch and print the page title for URLs mentioned in a PRIVMSG.

    message is a bytestring. It is decoded as UTF-8, scanned with
    find_urls(), and each URL is fetched in turn. For responses whose
    content type is text/html, the first 4KB are passed to extract_title()
    and the result (possibly None) is printed. At most three titles are
    processed per message. URLs that cannot be fetched are skipped.

    prefix, nick, channel and irc are accepted but not used here.
    """
    # Function-scope import so this block does not depend on the file-level
    # import list
    import http.client

    # Undecodable bytes are replaced rather than raising, so a message that
    # is not valid UTF-8 cannot crash the handler
    urls = find_urls(message.decode('utf-8', errors='replace'))

    # Don't titlebot >3 urls
    possible_titles_left = 3
    for url in urls:
        if possible_titles_left == 0:
            break

        try:
            with urllib.request.urlopen(url, timeout=1) as response:
                if response.info().get_content_type() != 'text/html':
                    continue
                # First 4KB of a page should be enough for any <title>
                first_kb = response.read(4 * 1024)
        # OSError covers URLError/HTTPError and socket timeouts; ValueError
        # covers UnicodeEncodeError from non-ASCII URLs; HTTPException covers
        # InvalidURL and malformed or truncated responses
        except (OSError, ValueError, http.client.HTTPException):
            continue

        title = extract_title(first_kb)
        print(title)  # debug
        # NOTE(review): only text/html responses are counted against the
        # limit; confirm that is the intended accounting
        possible_titles_left -= 1
+
# handle_nonmessage(*, prefix, command, arguments, irc)
# Called for all other commands than PINGs and PRIVMSGs.