# cockatric4/botcmd.py

import html
import http.client
import urllib.error
import urllib.parse
import urllib.request

# initialize(*, config)
# Hook called once to initialize the IRC bot
# It runs before even the logger has been brought up, and the rest of the bringup waits until it returns
# config is a configparser.ConfigParser object holding the contents of bot.conf
def initialize(*, config):
	pass

# on_connect(*, irc)
# Hook called once the IRC bot has connected and sent the USER/NICK commands, before it attempts anything else
# It is called again on every reconnect
# The bot is blocked until this returns, and that includes its PING/PONG handling
# irc is the IRC API object
def on_connect(*, irc):
	pass

# on_quit(*, irc)
# Hook called right before the IRC bot sends QUIT
# The bot is blocked until this returns, and that includes its PING/PONG handling
# irc is the IRC API object
def on_quit(*, irc):
	pass

# find_urls(message)
# Returns a list of the http:// and https:// URLs found in message, in order of appearance
# A URL ends at a space, at '>', at an unbalanced ')', or at a '.' or ',' that is
# followed by a space or by the end of the message
# message is a str
def find_urls(message):
	urls = []
	length = len(message)
	index = 0
	while index < length:
		# Scan for "http" as the common subset of http:// and https://
		# (searching from an offset avoids copying the tail of the message each time)
		start = message.find('http', index)
		# End if there are no longer urls to find
		if start == -1: break

		if not message.startswith(('http://', 'https://'), start):
			# Just the letters "http", resume right after this occurrence
			index = start + 1
			continue

		# Looks like we found a URL, scan for its end
		index = start
		parens = 0
		while index < length:
			char = message[index]
			# Since browsers don't urlencode parens nowadays, try to avoid breaking those URLs while allowing (https://example.com) to work as well
			if char == '(':
				parens += 1
			elif char == ')':
				if parens == 0:
					break
				parens -= 1
			# Some people punctuate their URLs; a '.' or ',' that closes the
			# whole message is sentence punctuation just like one before a space
			elif char in ('.', ',') and (index + 1 == length or message[index + 1] == ' '):
				break
			elif char in (' ', '>'):
				break

			index += 1

		urls.append(message[start:index])

	return urls

# extract_title(page_data)
# Returns the text of the first <title> element in page_data, or None if there is none
# Character references are unescaped and every run of whitespace (including CR/LF
# and tabs) is collapsed into a single space, with leading/trailing whitespace removed
# If the closing </title> is missing (e.g. the page was cut short), everything after
# the opening tag is used
# page_data is a bytestring holding the (possibly truncated) page source
def extract_title(page_data):
	# Tag names are case-insensitive. bytes.lower() only changes ASCII letters,
	# so offsets into the lowered copy are valid offsets into page_data too
	lowered = page_data.lower()

	# Find the <title> tag
	tag_start = lowered.find(b'<title')
	if tag_start == -1:
		return None

	# Apparently <title> can have key="value" things, so look for the end of the tag
	tag_closing = page_data.find(b'>', tag_start)
	if tag_closing == -1:
		# The opening tag itself is cut off, so there is no title text to return
		return None
	title_start = tag_closing + 1

	# Find the </title> tag; only one located after the opening tag can close it
	title_end = lowered.find(b'</title>', title_start)
	if title_end == -1:
		title_end = len(page_data)

	text = html.unescape(page_data[title_start:title_end].decode('utf-8', errors = 'replace'))
	# split() without arguments drops leading/trailing whitespace and treats any
	# whitespace run as one separator
	return ' '.join(text.split())

# sanitize(title)
# Returns title with every character whose code point is below 32 replaced by U+FFFD
# title is a str, or None, in which case None is returned
def sanitize(title):
	if title is None:
		return None

	# Translation table mapping each code point 0..31 to the replacement character
	control_chars = dict.fromkeys(range(32), '\ufffd')
	return title.translate(control_chars)

# _describe_url(url)
# Fetches url and builds the one-line response the bot gives for it
# Returns '<domain>: <title>' for a text/html response ('<domain>: <no title found>'
# if no title could be extracted), '<domain>: <code> <reason>' if the server
# responded with an HTTP error code, and None for any other content type
# Errors from parsing the URL or from the network are not handled here
# url is a str
def _describe_url(url):
	# urlparse raises ValueError for malformed authorities such as 'http://['
	domain = sanitize(urllib.parse.urlparse(url).netloc)

	headers = {
		'User-Agent': 'Cockatric4 (like Lynx)',
		'Accept': '*/*',
		'Accept-Language': 'en,*;q=0.1',
		'Accept-Charset': 'utf-8',
	}
	request = urllib.request.Request(url, headers = headers)

	try:
		with urllib.request.urlopen(request, timeout = 5) as response:
			if response.info().get_content_type() != 'text/html':
				return None

			# First 4KiB of a page should be enough for any <title>
			# Turns out it's not, so download 64KiB
			# As of 2023-09-10, youtube requires up to 320KiB (!),
			# so download first 512KiB
			page_source_fragment = response.read(512 * 1024)
	except urllib.error.HTTPError as e:
		# Tell ppl if server responded with an error code
		# The reason phrase comes from the remote server, so sanitize it like the title
		return '%s: %i %s' % (domain, e.getcode(), sanitize(e.msg))

	title = sanitize(extract_title(page_source_fragment))
	if title is None:
		return '%s: <no title found>' % domain
	return '%s: %s' % (domain, title)

# handle_message(*, prefix, message, nick, channel, irc)
# Called for PRIVMSGs.
# Responds with the page title (or the HTTP error) of the URLs in the message, giving at most 3 responses per message
# prefix is the prefix at the start of the message, without the leading ':'
# message is the contents of the message
# nick is who sent the message
# channel is where you should send the response (note: in queries nick == channel)
# irc is the IRC API object
# All strings are bytestrings
def handle_message(*, prefix, message, nick, channel, irc):
	# Ignore messages with a zwsp, ^O, or ^B^B in the beginning
	if message.startswith((b'\xe2\x80\x8b', b'\x0f', b'\x02\x02')):
		return

	# Nothing guarantees that an IRC message is valid UTF-8, so replace
	# undecodable bytes instead of letting the decode raise
	urls = find_urls(message.decode('utf-8', errors = 'replace'))

	# Don't titlebot >3 urls
	possible_titles_left = 3
	for url in urls:
		if possible_titles_left == 0: break

		try:
			response = _describe_url(url)
		except (OSError, ValueError, http.client.HTTPException):
			# OSError covers URLError, timeouts and socket errors
			# ValueError covers URLs that urllib refuses to parse or encode
			# HTTPException covers InvalidURL and broken HTTP responses
			continue

		# Not an HTML page, stay silent and don't count it against the limit
		if response is None: continue

		try:
			irc.bot_response(channel, response)
		except OSError:
			# Sending is best-effort, a failed send doesn't use up a slot
			continue

		possible_titles_left -= 1

# handle_nonmessage(*, prefix, command, arguments, irc)
# Hook called for every command other than PINGs and PRIVMSGs.
# prefix is the prefix at the start of the message, without the leading ':'
# command is the command or number code
# arguments is the rest of the command's arguments, represented as a list. ':'-arguments are handled automatically
# irc is the IRC API object
# All strings are bytestrings
def handle_nonmessage(*, prefix, command, arguments, irc):
	pass