tag title_start = None index = 0 while True: maybe_tag = page_data[index:].find(b'<') if maybe_tag == -1: break maybe_tag += index if page_data[maybe_tag:maybe_tag+6].lower() == b'<title': # Apparently <title> can have key="value" things # Find the end of the tag tag_closing = page_data[maybe_tag:].find(b'>') if tag_closing != -1: # It is relative to maybe_tag and we want # the string after it title_start = maybe_tag + tag_closing + 1 break else: index = maybe_tag + 1 if title

import html
import urllib.error
import urllib.parse
import urllib.request


# initialize(*, config)
# Called to initialize the IRC bot
# Runs before even logger is brought up, and blocks further bringup until it's done
# config is a configparser.ConfigParser object containing contents of bot.conf
def initialize(*, config):
    ...


# on_connect(*, irc)
# Called after IRC bot has connected and sent the USER/NICK commands but not yet attempted anything else
# Called for every reconnect
# Blocks the bot until it's done, including PING/PONG handling
# irc is the IRC API object
def on_connect(*, irc):
    ...


# on_quit(*, irc)
# Called just before IRC bot sends QUIT
# Blocks the bot until it's done, including PING/PONG handling
# irc is the IRC API object
def on_quit(*, irc):
    ...


def find_urls(message):
    """Return every http:// or https:// URL found in *message*, in order.

    message is a str. A URL ends at the first space or '>', at an
    unbalanced ')', or at a '.' / ',' that is followed by a space or by
    the end of the message.
    """
    urls = []
    index = 0
    while index < len(message):
        # Scan for "http" as the common subset of http:// and https://
        start = message.find('http', index)
        # End if there are no longer urls to find
        if start == -1:
            break

        if not message.startswith(('http://', 'https://'), start):
            index = start + 1
            continue

        # Looks like we found a URL, scan for its end
        index = start
        parens = 0
        while index < len(message):
            char = message[index]
            # Since browsers don't urlencode parens nowadays, try to
            # avoid breaking those URLs while allowing
            # (https://example.com) to work as well
            if char == '(':
                parens += 1
            elif char == ')':
                if parens == 0:
                    break
                parens -= 1
            # Some people punctuate their URLs, either mid-sentence or
            # at the very end of the message
            elif char in '.,' and message[index + 1:index + 2] in ('', ' '):
                break
            elif char in (' ', '>'):
                break
            index += 1

        urls.append(message[start:index])

    return urls


def extract_title(page_data):
    """Return the text of the first <title> element in *page_data*.

    page_data is a bytes object holding (a prefix of) an HTML document.
    Returns None when there is no complete opening <title ...> tag. When
    the closing </title> tag is missing (e.g. the page was truncated),
    everything after the opening tag is used.

    The result is decoded as UTF-8 (invalid bytes are replaced), has its
    HTML entities unescaped, and has newlines, carriage returns and tabs
    turned into spaces; leading/trailing whitespace is stripped and runs
    of spaces are collapsed to one.
    """
    # bytes.lower() only touches ASCII letters, so offsets found in the
    # lowered copy are valid in the original buffer as well.
    lowered = page_data.lower()

    # Find the <title> tag
    tag_start = lowered.find(b'<title')
    if tag_start == -1:
        return None

    # Apparently <title> can have key="value" things
    # Find the end of the tag
    tag_closing = page_data.find(b'>', tag_start)
    if tag_closing == -1:
        return None
    title_start = tag_closing + 1

    # Find the </title> tag
    title_end = lowered.find(b'</title>', title_start)
    if title_end == -1:
        title_end = len(page_data)

    raw_title = page_data[title_start:title_end].decode('utf-8', errors='replace')
    title = html.unescape(raw_title)
    for line_whitespace in ('\n', '\r', '\t'):
        title = title.replace(line_whitespace, ' ')

    # Splitting on a single space and dropping the empty pieces collapses
    # every run of spaces into one.
    return ' '.join(piece for piece in title.strip().split(' ') if piece)


def sanitize(title):
    """Replace ASCII control characters (code points below 32) with U+FFFD.

    Returns None unchanged so it can wrap a possibly-missing title.
    """
    if title is None:
        return None

    return ''.join('\ufffd' if ord(c) < 32 else c for c in title)


# handle_message(*, prefix, message, nick, channel, irc)
# Called for PRIVMSGs.
# prefix is the prefix at the start of the message, without the leading ':'
# message is the contents of the message
# nick is who sent the message
# channel is where you should send the response (note: in queries nick == channel)
# irc is the IRC API object
# All strings are bytestrings
def handle_message(*, prefix, message, nick, channel, irc):
    """Announce the page title of up to three URLs found in *message*.

    For each URL the page is fetched; if it is served as text/html the
    reply is "<domain>: <title>" (or "<domain>: <no title found>"). If
    the server answers with an HTTP error status, the reply is
    "<domain>: <code> <reason>". Replies are sent with
    irc.bot_response(channel, text), where text is a str. URLs that
    cannot be parsed or fetched are skipped silently.
    """
    # Imported locally so this handler does not depend on the file's
    # top-level import block listing http.client.
    import http.client

    # Ignore messages with a zwsp, ^O, or ^B^B in the beginning
    if message.startswith((b'\xe2\x80\x8b', b'\x0f', b'\x02\x02')):
        return

    # IRC does not guarantee valid UTF-8, so never let decoding raise
    urls = find_urls(message.decode('utf-8', errors='replace'))

    headers = {
        'User-Agent': 'Cockatric4 (like Lynx)',
        'Accept': '*/*',
        'Accept-Language': 'en,*;q=0.1',
        'Accept-Charset': 'utf-8',
    }

    # Don't titlebot >3 urls
    possible_titles_left = 3
    for url in urls:
        if possible_titles_left == 0:
            break

        try:
            # urlparse raises ValueError on e.g. a malformed IPv6 host
            domain = sanitize(urllib.parse.urlparse(url).netloc)
            request = urllib.request.Request(url, headers=headers)
            try:
                with urllib.request.urlopen(request, timeout=5) as response:
                    if response.info().get_content_type() == 'text/html':
                        # First 4KiB of a page should be enough for any
                        # Turns out it's not, so download 64KiB
                        # As of 2023-09-10, youtube requires up to 320KiB (!),
                        # so download first 512KiB
                        page_source_fragment = response.read(512 * 1024)
                        title = sanitize(extract_title(page_source_fragment))
                        if title is not None:
                            reply = '%s: %s' % (domain, title)
                        else:
                            reply = '%s: <no title found>' % domain
                        irc.bot_response(channel, reply)
                        possible_titles_left -= 1
            except urllib.error.HTTPError as e:
                # Tell ppl if server responded with an error code
                # The reason phrase comes from the remote server, so
                # strip control characters before relaying it
                reply = '%s: %i %s' % (domain, e.getcode(), sanitize(str(e.msg)))
                irc.bot_response(channel, reply)
                possible_titles_left -= 1
        except (OSError, ValueError, http.client.HTTPException):
            # OSError covers IOError and urllib.error.URLError.
            # ValueError covers unparsable URLs and non-ASCII URLs
            # (UnicodeEncodeError). HTTPException covers InvalidURL and
            # truncated or garbled responses.
            continue


# handle_nonmessage(*, prefix, command, arguments, irc)
# Called for all other commands than PINGs and PRIVMSGs.
# prefix is the prefix at the start of the message, without the leading ':'
# command is the command or number code
# arguments is rest of the arguments of the command, represented as a list. ':'-arguments are handled automatically
# irc is the IRC API object
# All strings are bytestrings
def handle_nonmessage(*, prefix, command, arguments, irc):
    ...