import html
import urllib.error
import urllib.parse
import urllib.request

# Tags delimiting the page title; compared against a lower-cased copy of the page
_TITLE_OPEN = b'<title>'
_TITLE_CLOSE = b'</title>'

# Line breaks and tabs inside a title are folded into plain spaces
_TITLE_WHITESPACE = str.maketrans('\n\t\r', '   ')


# initialize(*, config)
# Called to initialize the IRC bot
# Runs before even the logger is brought up, and blocks further bringup until it's done
# config is a configparser.ConfigParser object containing the contents of bot.conf
def initialize(*, config):
    ...


# on_connect(*, irc)
# Called after the IRC bot has connected and sent the USER/NICK commands but not yet attempted anything else
# Called for every reconnect
# Blocks the bot until it's done, including PING/PONG handling
# irc is the IRC API object
def on_connect(*, irc):
    ...


# on_quit(*, irc)
# Called just before the IRC bot sends QUIT
# Blocks the bot until it's done, including PING/PONG handling
# irc is the IRC API object
def on_quit(*, irc):
    ...


def _find_url_end(message, start):
    """Return the index just past the URL that begins at message[start]."""
    length = len(message)
    parens = 0
    index = start
    while index < length:
        char = message[index]
        # Since browsers don't urlencode parens nowadays, keep balanced pairs
        # inside the URL but let an unmatched ')' terminate it
        if char == '(':
            parens += 1
        elif char == ')':
            if parens == 0:
                break
            parens -= 1
        elif char in (' ', '>'):
            break
        # Some people punctuate their URLs: a '.' or ',' directly followed by a
        # space, or sitting at the very end of the message, is not part of it
        elif char in ('.', ',') and (index + 1 == length or message[index + 1] == ' '):
            break
        index += 1
    return index


def find_urls(message):
    """Return every http:// or https:// URL found in message, in order.

    message is a str. A URL runs until a space, a '>', an unbalanced ')',
    or a '.' / ',' that is followed by a space or ends the message.
    Returns a list of str, empty when there are no URLs.
    """
    urls = []
    index = 0
    while index < len(message):
        # Scan for "http" as the common subset of http:// and https://
        start = message.find('http', index)
        if start == -1:
            # No more URLs to find
            break
        if not message.startswith(('http://', 'https://'), start):
            # Just the letters "http", e.g. in a word; keep looking after them
            index = start + 1
            continue
        index = _find_url_end(message, start)
        urls.append(message[start:index])
    return urls


def extract_title(page_data):
    """Return the text of the first <title> element in page_data, or None.

    page_data is a bytes object holding (the beginning of) an HTML page.
    Tag names are matched case-insensitively. When the closing tag is missing,
    for instance because the page was truncated, everything after <title> is
    used. The text is decoded as UTF-8 (undecodable bytes are replaced), HTML
    entities are unescaped, newlines/tabs/carriage returns become spaces,
    surrounding whitespace is stripped and runs of spaces are collapsed.
    """
    # bytes.lower() only touches ASCII letters, so offsets stay valid
    lowered = page_data.lower()

    # Find the <title> tag
    open_at = lowered.find(_TITLE_OPEN)
    if open_at == -1:
        return None
    title_start = open_at + len(_TITLE_OPEN)

    # Find the </title> tag, looking only after the opening tag
    title_end = lowered.find(_TITLE_CLOSE, title_start)
    if title_end == -1:
        title_end = len(page_data)

    raw = page_data[title_start:title_end].decode('utf-8', errors='replace')
    text = html.unescape(raw).translate(_TITLE_WHITESPACE).strip()
    # Collapse runs of spaces into a single space
    return ' '.join(part for part in text.split(' ') if part)


def sanitize(title):
    """Return title with every control character below U+0020 replaced by U+FFFD.

    title must be a str.
    """
    return ''.join('\ufffd' if ord(c) < 32 else c for c in title)


# handle_message(*, prefix, message, nick, channel, irc)
# Called for PRIVMSGs.
# prefix is the prefix at the start of the message, without the leading ':'
# message is the contents of the message
# nick is who sent the message
# channel is where you should send the response (note: in queries nick == channel)
# irc is the IRC API object
# All strings are bytestrings
#
# Looks for URLs in the message and, for those that serve text/html, replies
# with "<domain>: <page title>".
def handle_message(*, prefix, message, nick, channel, irc):
    # Local import: only needed for the exception type caught below
    import http.client

    # IRC gives no encoding guarantee, so replace undecodable bytes instead of raising
    text = message.decode('utf-8', errors='replace')
    urls = find_urls(text)

    # Don't titlebot >3 urls
    # NOTE(review): only URLs that produced a title count against the limit;
    # failed or non-HTML fetches do not. Confirm this is the intended accounting.
    possible_titles_left = 3

    for url in urls:
        if possible_titles_left == 0:
            break

        try:
            # NOTE(review): this fetches arbitrary user-supplied URLs from the
            # bot's host; consider whether internal addresses need to be blocked.
            with urllib.request.urlopen(url, timeout=1) as response:
                if response.headers.get_content_type() != 'text/html':
                    continue

                # First 4KB of a page should be enough for any <title>
                first_kb = response.read(4 * 1024)
                title = extract_title(first_kb)
                if title is None:
                    # Page has no <title>; nothing to report
                    continue

                domain = sanitize(urllib.parse.urlparse(url).netloc)
                reply = '%s: %s' % (domain, sanitize(title))
                # NOTE(review): reply is a str although the hook receives
                # bytestrings; presumably bot_response accepts str, confirm.
                irc.bot_response(channel, reply)
                possible_titles_left -= 1

        # Malformed URLs raise ValueError (including UnicodeError) and broken
        # HTTP replies raise HTTPException; neither is an IOError/URLError
        except (IOError, urllib.error.URLError, http.client.HTTPException, ValueError):
            continue


# handle_nonmessage(*, prefix, command, arguments, irc)
# Called for all other commands than PINGs and PRIVMSGs.
# prefix is the prefix at the start of the message, without the leading ':'
# command is the command or number code
# arguments is the rest of the arguments of the command, represented as a list. ':'-arguments are handled automatically
# irc is the IRC API object
# All strings are bytestrings
def handle_nonmessage(*, prefix, command, arguments, irc):
    ...