2018-10-12 04:26:48 +00:00
|
|
|
import html
|
2018-10-12 04:48:57 +00:00
|
|
|
import urllib.error
|
|
|
|
import urllib.parse
|
2018-10-12 04:26:48 +00:00
|
|
|
import urllib.request
|
|
|
|
|
2018-01-03 16:08:24 +00:00
|
|
|
# initialize(*, config)
|
2017-09-06 17:47:32 +00:00
|
|
|
# Called to initialize the IRC bot
|
|
|
|
# Runs before even logger is brought up, and blocks further bringup until it's done
|
2018-01-03 16:08:24 +00:00
|
|
|
# config is a configparser.ConfigParser object containing the contents of bot.conf
|
|
|
|
def initialize(*, config):
    """Hook called to initialize the IRC bot.

    Runs before even the logger is brought up, and blocks further bringup
    until it returns. This module needs no setup, so the hook is a no-op.

    config is a configparser.ConfigParser object holding the contents of
    bot.conf.
    """
|
|
|
|
|
2018-01-02 16:31:23 +00:00
|
|
|
# on_connect(*, irc)
|
|
|
|
# Called after IRC bot has connected and sent the USER/NICK commands but not yet attempted anything else
|
2018-06-14 07:50:30 +00:00
|
|
|
# Called for every reconnect
|
2018-01-02 16:31:23 +00:00
|
|
|
# Blocks the bot until it's done, including PING/PONG handling
|
2018-01-03 16:08:24 +00:00
|
|
|
# irc is the IRC API object
|
2018-01-02 16:31:23 +00:00
|
|
|
def on_connect(*, irc):
    """Hook called after the bot has connected and sent USER/NICK.

    Called for every reconnect, before the bot attempts anything else, and
    blocks the bot (including PING/PONG handling) until it returns. This
    module has nothing to do on connect, so the hook is a no-op.

    irc is the IRC API object.
    """
|
|
|
|
|
2018-01-20 14:54:13 +00:00
|
|
|
# on_quit(*, irc)
|
|
|
|
# Called just before IRC bot sends QUIT
|
|
|
|
# Blocks the bot until it's done, including PING/PONG handling
|
|
|
|
# irc is the IRC API object
|
|
|
|
def on_quit(*, irc):
    """Hook called just before the bot sends QUIT.

    Blocks the bot (including PING/PONG handling) until it returns. This
    module has nothing to clean up, so the hook is a no-op.

    irc is the IRC API object.
    """
|
|
|
|
|
2018-10-12 04:26:48 +00:00
|
|
|
def find_urls(message):
    """Return the http:// and https:// URLs found in message, in order.

    message is a str. A URL starts at "http://" or "https://" and runs until
    the first of:
      * a space or '>',
      * a ')' that does not close a '(' opened inside the URL, so that both
        "http://example.com/a_(b)" and "(http://example.com)" work,
      * a '.' or ',' that is followed by a space or ends the message
        (sentence punctuation rather than part of the URL).

    Returns a list of str; empty if no URL is present.
    """
    urls = []
    length = len(message)
    index = 0
    while index < length:
        # Scan for "http" as the common prefix of http:// and https://.
        # Passing the start offset avoids copying the tail of the message.
        start = message.find('http', index)
        if start == -1:
            # No more candidates
            break

        if not message.startswith(('http://', 'https://'), start):
            # Just the letters "http" inside something else; keep scanning
            index = start + 1
            continue

        # Looks like we found a URL, scan for its end
        index = start
        parens = 0
        while index < length:
            char = message[index]
            # Since browsers don't urlencode parens nowadays, try to avoid
            # breaking those URLs while allowing (https://example.com) to
            # work as well
            if char == '(':
                parens += 1
            elif char == ')':
                if parens == 0:
                    break
                parens -= 1
            # Some people punctuate their URLs, including at the very end of
            # the message where no space follows the punctuation
            elif char in '.,' and (index + 1 == length or message[index + 1] == ' '):
                break
            elif char in ' >':
                break
            index += 1

        urls.append(message[start:index])

    return urls
|
|
|
|
|
|
|
|
def extract_title(page_data):
    """Extract the text of the first <title> element from raw HTML bytes.

    page_data is a bytes object holding (a prefix of) an HTML document.

    Returns the title as a str with HTML entities unescaped and every run of
    whitespace (including CR/LF and tabs) collapsed to a single space, or
    None if there is no complete opening <title ...> tag. If the closing
    </title> is missing (for example because the page was truncated), the
    title runs to the end of page_data.
    """
    # bytes.lower() only touches ASCII letters and preserves length, so
    # offsets found in the lowered copy are valid in page_data as well.
    lowered = page_data.lower()

    # Find the <title> tag
    tag_start = lowered.find(b'<title')
    if tag_start == -1:
        return None

    # Apparently <title> can have key="value" things, so look for the end of
    # the opening tag rather than assuming a bare "<title>". If there is no
    # '>' at all after it, the tag is incomplete; give up instead of
    # rescanning the same position forever.
    tag_close = page_data.find(b'>', tag_start)
    if tag_close == -1:
        return None
    title_start = tag_close + 1

    # Find the </title> tag, searching only after the opening tag so an
    # earlier stray "</title>" cannot produce an empty title
    title_end = lowered.find(b'</title>', title_start)
    if title_end == -1:
        title_end = len(page_data)

    raw_title = page_data[title_start:title_end].decode('utf-8', errors='replace')
    # split() with no arguments drops leading/trailing whitespace and treats
    # any run of whitespace as one separator
    return ' '.join(html.unescape(raw_title).split())
|
|
|
|
|
|
|
|
def sanitize(title):
    """Replace control characters in title with U+FFFD.

    Every character whose code point is below 32 is swapped for the Unicode
    replacement character; everything else is kept as is. None is passed
    through unchanged so the result of a failed title lookup can be fed in
    directly.
    """
    if title is None:
        return None

    # Translation table: ordinals 0..31 -> replacement character
    control_to_replacement = dict.fromkeys(range(32), '\ufffd')
    return title.translate(control_to_replacement)
|
2018-10-12 04:26:48 +00:00
|
|
|
|
2017-09-06 17:47:32 +00:00
|
|
|
# handle_message(*, prefix, message, nick, channel, irc)
|
|
|
|
# Called for PRIVMSGs.
|
|
|
|
# prefix is the prefix at the start of the message, without the leading ':'
|
|
|
|
# message is the contents of the message
|
|
|
|
# nick is who sent the message
|
|
|
|
# channel is where you should send the response (note: in queries nick == channel)
|
|
|
|
# irc is the IRC API object
|
2018-06-14 07:50:30 +00:00
|
|
|
# All strings are bytestrings
|
2017-09-06 17:47:32 +00:00
|
|
|
def handle_message(*, prefix, message, nick, channel, irc):
    """Respond to a PRIVMSG with the titles of the URLs it contains.

    prefix is the prefix at the start of the message, without the leading
    ':'; message is the contents of the message; nick is who sent it;
    channel is where the response should go (in queries nick == channel);
    irc is the IRC API object. prefix, message, nick and channel are
    bytestrings.

    For at most three URLs, fetches the page and sends "<domain>: <title>"
    (or "<domain>: <no title found>") through irc.bot_response. HTTP error
    responses are reported as "<domain>: <code> <reason>". Non-HTML
    responses and URLs that cannot be parsed or fetched are skipped
    silently and do not count towards the limit.
    """
    # Function-scope import: only the error handling below needs it
    import http.client

    # Ignore messages with a zwsp, ^O, or ^B^B in the beginning
    if message.startswith((b'\xe2\x80\x8b', b'\x0f', b'\x02\x02')):
        return

    # IRC does not guarantee UTF-8; replace undecodable bytes instead of
    # letting UnicodeDecodeError escape the handler
    urls = find_urls(message.decode('utf-8', errors='replace'))

    # Don't titlebot >3 urls
    possible_titles_left = 3
    for url in urls:
        if possible_titles_left == 0:
            break

        try:
            domain = sanitize(urllib.parse.urlparse(url).netloc)
        except ValueError:
            # urlparse rejects some malformed hosts, e.g. an unclosed '['
            continue

        try:
            try:
                with urllib.request.urlopen(url, timeout=5) as response:
                    if response.info().get_content_type() != 'text/html':
                        continue

                    # First 4KiB of a page should be enough for any <title>
                    # Turns out it's not, so download 64KiB
                    page_source_fragment = response.read(64 * 1024)
                    title = sanitize(extract_title(page_source_fragment))

                    if title is not None:
                        reply = '%s: %s' % (domain, title)
                    else:
                        reply = '%s: <no title found>' % domain
                    irc.bot_response(channel, reply)

                    possible_titles_left -= 1

            except urllib.error.HTTPError as e:
                # Tell ppl if server responded with an error code
                reply = '%s: %i %s' % (domain, e.getcode(), e.msg)
                irc.bot_response(channel, reply)
                possible_titles_left -= 1

        except (OSError, ValueError, http.client.HTTPException):
            # OSError covers URLError, timeouts and socket errors.
            # ValueError covers URLs urllib refuses to request (non-ASCII or
            # control characters). HTTPException covers broken responses
            # such as IncompleteRead or BadStatusLine. Titles are
            # best-effort, so skip the URL.
            continue
|
|
|
|
|
2018-10-28 06:51:08 +00:00
|
|
|
|
2017-09-06 17:47:32 +00:00
|
|
|
# handle_nonmessage(*, prefix, command, arguments, irc)
|
|
|
|
# Called for all commands other than PINGs and PRIVMSGs.
|
|
|
|
# prefix is the prefix at the start of the message, without the leading ':'
|
|
|
|
# command is the command or number code
|
|
|
|
# arguments is rest of the arguments of the command, represented as a list. ':'-arguments are handled automatically
|
2018-01-03 16:08:24 +00:00
|
|
|
# irc is the IRC API object
|
2018-06-14 07:50:30 +00:00
|
|
|
# All strings are bytestrings
|
2017-09-06 17:47:32 +00:00
|
|
|
def handle_nonmessage(*, prefix, command, arguments, irc):
    """Hook called for all commands other than PINGs and PRIVMSGs.

    prefix is the prefix at the start of the message, without the leading
    ':'; command is the command or number code; arguments is the rest of the
    command's arguments as a list (':'-arguments are handled automatically);
    irc is the IRC API object. All strings are bytestrings.

    This module does not react to such commands, so the hook is a no-op.
    """
|