cockatric4/botcmd.py

import html
import http.client
import urllib.error
import urllib.parse
import urllib.request

# initialize(*, config)
# Hook that is called to initialize the IRC bot
# It runs before even the logger is brought up, and further bringup is blocked until it returns
# config is a configparser.ConfigParser object containing the contents of bot.conf
def initialize(*, config):
	pass

# on_connect(*, irc)
# Hook that is called once the IRC bot has connected and sent the USER/NICK commands, before it attempts anything else
# It is called again for every reconnect
# The bot, PING/PONG handling included, is blocked until this returns
# irc is the IRC API object
def on_connect(*, irc):
	pass

# on_quit(*, irc)
# Hook that is called just before the IRC bot sends QUIT
# The bot, PING/PONG handling included, is blocked until this returns
# irc is the IRC API object
def on_quit(*, irc):
	pass

def find_urls(message):
	"""Return the http:// and https:// URLs found in message, in order of appearance.

	message is a str. A URL starts at its scheme and runs up to, but not
	including, the first of: a space, a '>', a ')' that has no matching '('
	inside the URL, or a '.' / ',' that is followed by a space or that ends
	the message.
	"""
	urls = []
	length = len(message)
	index = 0
	while index < length:
		# Scan for "http" as the common subset of http:// and https://
		start = message.find('http', index)
		# End if there are no longer urls to find
		if start == -1: break

		if not message.startswith(('http://', 'https://'), start):
			# Just the letters "http" in running text, keep looking after them
			index = start + 1
			continue

		# Looks like we found a URL, scan for its end
		index = start
		parens = 0
		while index < length:
			char = message[index]
			# Since browsers don't urlencode parens nowadays, try to avoid breaking those URLs while allowing (https://example.com) to work as well
			if char == '(':
				parens += 1
			elif char == ')':
				if parens == 0: break
				parens -= 1
			elif char in ('.', ','):
				# Some people punctuate their URLs, either mid-sentence or at the very end of the message
				if index + 1 == length or message[index + 1] == ' ':
					break
			elif char in (' ', '>'):
				break

			index += 1

		urls.append(message[start:index])

	return urls

def extract_title(page_data):
	"""Return the text of the first <title> element in page_data, or None.

	page_data is a bytes object holding an HTML page, or just the beginning
	of one. The title is decoded as UTF-8 (undecodable bytes are replaced),
	HTML entities are unescaped, newlines / tabs / carriage returns / form
	feeds become spaces, and runs of spaces are collapsed into one.

	None is returned when there is no complete <title ...> opening tag. If the
	closing </title> is missing, the title runs to the end of the data.
	"""
	# Tag names are case-insensitive. bytes.lower() only changes ASCII letters,
	# so offsets into the lowered copy are valid for page_data too
	lowered = page_data.lower()

	# Find the <title> tag
	tag_start = lowered.find(b'<title')
	if tag_start == -1:
		return None

	# Apparently <title> can have key="value" things
	# Find the end of the tag
	tag_closing = lowered.find(b'>', tag_start)
	if tag_closing == -1:
		# The data ends in the middle of the opening tag, so there is no title text to return
		return None
	title_start = tag_closing + 1

	# Find the </title> tag, looking only after the opening tag so that an
	# earlier stray </title> can't produce an empty title
	title_end = lowered.find(b'</title>', title_start)
	if title_end == -1:
		title_end = len(page_data)

	title = html.unescape(page_data[title_start:title_end].decode('utf-8', errors = 'replace'))
	# \r and \f are whitespace in HTML as well; left alone they would survive as control characters
	for whitespace in ('\n', '\t', '\r', '\f'):
		title = title.replace(whitespace, ' ')
	title = title.strip()

	# Collapse runs of spaces into a single space
	return ' '.join(part for part in title.split(' ') if part)

def sanitize(title):
	"""Return title with every character whose code point is below 32 replaced by U+FFFD.

	None is passed through unchanged.
	"""
	if title is None:
		return None

	cleaned = []
	for char in title:
		cleaned.append(char if ord(char) >= 32 else '\ufffd')
	return ''.join(cleaned)

# handle_message(*, prefix, message, nick, channel, irc)
# Called for PRIVMSGs.
# prefix is the prefix at the start of the message, without the leading ':'
# message is the contents of the message
# nick is who sent the message
# channel is where you should send the response (note: in queries nick == channel)
# irc is the IRC API object
# All strings are bytestrings
#
# Looks for http(s) URLs in the message and responds with "<domain>: <title>" for up to three of them.
# URLs that can't be fetched, or that aren't text/html, are skipped without a response.
def handle_message(*, prefix, message, nick, channel, irc):
	# Ignore messages with a zwsp, ^O, or ^B^B in the beginning
	if message.startswith((b'\xe2\x80\x8b', b'\x0f', b'\x02\x02')):
		return

	# IRC doesn't guarantee any particular encoding, so don't blow up on bytes that aren't valid UTF-8
	urls = find_urls(message.decode('utf-8', errors = 'replace'))

	# Don't titlebot >3 urls
	possible_titles_left = 3
	for url in urls:
		if possible_titles_left == 0: break

		try:
			# urlparse raises ValueError on things like "http://[", so it needs to be inside the try as well
			domain = sanitize(urllib.parse.urlparse(url).netloc)

			try:
				with urllib.request.urlopen(url, timeout = 5) as response:
					if response.info().get_content_type() == 'text/html':
						# First 4KiB of a page should be enough for any <title>
						# Turns out it's not, so download 64KiB
						page_source_fragment = response.read(64 * 1024)
						title = sanitize(extract_title(page_source_fragment))

						if title is not None:
							reply = '%s: %s' % (domain, title)
						else:
							reply = '%s: <no title found>' % domain
						irc.bot_response(channel, reply)

						possible_titles_left -= 1

			except urllib.error.HTTPError as e:
				# Tell ppl if server responded with an error code
				# The reason phrase comes from the server, so strip control characters from it too
				reply = '%s: %i %s' % (domain, e.getcode(), sanitize(str(e.msg)))
				irc.bot_response(channel, reply)
				possible_titles_left -= 1

		except (OSError, ValueError, http.client.HTTPException):
			# OSError covers urllib.error.URLError, timeouts and socket errors
			# ValueError covers malformed URLs and URLs that can't be encoded for the request
			# http.client.HTTPException covers servers that speak broken HTTP
			continue


# handle_nonmessage(*, prefix, command, arguments, irc)
# Hook that is called for every command other than PINGs and PRIVMSGs.
# prefix is the prefix at the start of the message, without the leading ':'
# command is the command or number code
# arguments is the rest of the command's arguments, represented as a list. ':'-arguments are handled automatically
# irc is the IRC API object
# All strings are bytestrings
def handle_nonmessage(*, prefix, command, arguments, irc):
	pass
Start on the actual titlebot code 2018-10-12 04:26:48 +00:00			`import html`
Bot titles 2018-10-12 04:48:57 +00:00			`import urllib.error`
			`import urllib.parse`
Start on the actual titlebot code 2018-10-12 04:26:48 +00:00			`import urllib.request`

Read config from a configuration file 2018-01-03 16:08:24 +00:00			`# initialize(*, config)`
Add a stub botcmd.py 2017-09-06 17:47:32 +00:00			`# Called to initialize the IRC bot`
			`# Runs before even logger is brought up, and blocks further bringup until it's done`
Read config from a configuration file 2018-01-03 16:08:24 +00:00			`# config is a configpatser.ConfigParser object containig contents of bot.conf`
			`def initialize(*, config):`
Add a stub botcmd.py 2017-09-06 17:47:32 +00:00			`...`

Add botcmd.on_connect hook, to allow better control over bot bringup 2018-01-02 16:31:23 +00:00			`# on_connect(*, irc)`
			`# Called after IRC bot has connected and sent the USER/NICk commands but not yet attempted anything else`
Only pass bytestrings (and not bytearrays) to user code 2018-06-14 07:50:30 +00:00			`# Called for every reconnect`
Add botcmd.on_connect hook, to allow better control over bot bringup 2018-01-02 16:31:23 +00:00			`# Blocks the bot until it's done, including PING/PONG handling`
Read config from a configuration file 2018-01-03 16:08:24 +00:00			`# irc is the IRC API object`
Add botcmd.on_connect hook, to allow better control over bot bringup 2018-01-02 16:31:23 +00:00			`def on_connect(*, irc):`
			`...`

Add botcmd.on_quit hook for bot cleanup 2018-01-20 14:54:13 +00:00			`# on_quit(*, irc)`
			`# Called just before IRC bot sends QUIT`
			`# Blocks the bot until it's done, including PING/PONG handling`
			`# irc is the IRC API object`
			`def on_quit(*, irc):`
			`...`

Start on the actual titlebot code 2018-10-12 04:26:48 +00:00			`def find_urls(message):`
			`urls = []`
			`index = 0`
			`while index < len(message):`
			`# Scan for "http" as the common subset of http:// and https://`
			`maybe_next_url = message[index:].find('http')`
			`# End if there are no longer urls to find`
			`if maybe_next_url == -1: break`
			`maybe_next_url += index`

			`if message[maybe_next_url:maybe_next_url+7] == 'http://' or message[maybe_next_url:maybe_next_url+8] == 'https://':`
			`# Looks like we found a URL, scan for its end`
			`index = maybe_next_url`
			`parens = 0`
			`while index < len(message):`
			`# Since browsers don't urlencode parens nowadays, try to avoid breaking those URLs while allowing (https://example.com) to work as well`
			`if message[index] == '(':`
			`parens += 1`
			`elif message[index] == ')':`
			`if parens > 0:`
			`parens -= 1`
			`else:`
			`break`
			`# Some people punctuate their URLs`
			`elif message[index:index+2] in ('. ', ', '):`
			`break`
			`elif message[index] in (' ', '>'):`
			`break`

			`index += 1`

			`urls.append(message[maybe_next_url:index])`
			`else:`
			`index = maybe_next_url + 1`

			`return urls`

			`def extract_title(page_data):`
			`# Find the <title> tag`
			`title_start = None`
			`index = 0`
			`while True:`
			`maybe_tag = page_data[index:].find(b'<')`
			`if maybe_tag == -1: break`
			`maybe_tag += index`

Handle <title>s with key="value"s 2019-01-26 02:21:12 +00:00			`if page_data[maybe_tag:maybe_tag+6].lower() == b'<title':`
			`# Apparently <title> can have key="value" things`
			`# Find the end of the tag`
			`tag_closing = page_data[maybe_tag:].find(b'>')`
			`if tag_closing != -1:`
			`# It is relative to maybe_tag and we want`
			`# the string after it`
			`title_start = maybe_tag + tag_closing + 1`
			`break`
Start on the actual titlebot code 2018-10-12 04:26:48 +00:00			`else:`
			`index = maybe_tag + 1`

			`if title_start is None:`
			`return None`

			`# Find the </title> tag`
			`title_end = None`
			`index = 0`
			`while True:`
			`maybe_tag = page_data[index:].find(b'<')`
			`if maybe_tag == -1: break`
			`maybe_tag += index`

			`if page_data[maybe_tag:maybe_tag+8].lower() == b'</title>':`
			`title_end = maybe_tag`
			`break`
			`else:`
			`index = maybe_tag + 1`

			`if title_end is None:`
			`title_end = len(page_data)`

Bot titles 2018-10-12 04:48:57 +00:00			`title = html.unescape(page_data[title_start:title_end].decode('utf-8', errors = 'replace')).replace('\n', ' ').replace('\t', ' ').strip()`
			`while ' ' in title:`
			`title = title.replace(' ', ' ')`

			`return title`

			`def sanitize(title):`
Handle <title>s with key="value"s 2019-01-26 02:21:12 +00:00			`if title is None: return None`
Bot titles 2018-10-12 04:48:57 +00:00			`return ''.join('\ufffd' if ord(c) < 32 else c for c in title)`
Start on the actual titlebot code 2018-10-12 04:26:48 +00:00
Add a stub botcmd.py 2017-09-06 17:47:32 +00:00			`# handle_message(*, prefix, message, nick, channel, irc)`
			`# Called for PRIVMSGs.`
			`# prefix is the prefix at the start of the message, without the leading ':'`
			`# message is the contents of the message`
			`# nick is who sent the message`
			`# channel is where you should send the response (note: in queries nick == channel)`
			`# irc is the IRC API object`
Only pass bytestrings (and not bytearrays) to user code 2018-06-14 07:50:30 +00:00			`# All strings are bytestrings`
Add a stub botcmd.py 2017-09-06 17:47:32 +00:00			`def handle_message(*, prefix, message, nick, channel, irc):`
Ignore messages with zwsp, ^O, or ^B^B in the beginning 2018-10-22 07:51:07 +00:00			`# Ignore messages with a zwsp, ^O, or ^B^B in the beginning`
			`for ignored in (b'\xe2\x80\x8b', b'\x0f', b'\x02\x02'):`
			`if message[:len(ignored)] == ignored:`
			`return`

Start on the actual titlebot code 2018-10-12 04:26:48 +00:00			`urls = find_urls(message.decode('utf-8'))`

			`# Don't titlebot >3 urls`
			`possible_titles_left = 3`
			`for url in urls:`
			`if possible_titles_left == 0: break`

Tell users if we ran into weird HTTP codes 2018-10-28 06:51:08 +00:00			`domain = sanitize(urllib.parse.urlparse(url).netloc)`
Bot titles 2018-10-12 04:48:57 +00:00
Tell users if we ran into weird HTTP codes 2018-10-28 06:51:08 +00:00			`try:`
			`try:`
			`with urllib.request.urlopen(url, timeout = 5) as response:`
			`if response.info().get_content_type() == 'text/html':`
			`# First 4KiB of a page should be enough for any <title>`
			`# Turns out it's not, so download 64KiB`
			`page_source_fragment = response.read(64 * 1024)`
			`title = sanitize(extract_title(page_source_fragment))`

			`if title is not None:`
			`message = '%s: %s' % (domain, title)`
			`else:`
			`message = '%s: <no title found>' % domain`
			`irc.bot_response(channel, message)`

			`possible_titles_left -= 1`

			`except urllib.error.HTTPError as e:`
			`# Tell ppl if server responded with an error code`
			`message = '%s: %i %s' % (domain, e.getcode(), e.msg)`
			`irc.bot_response(channel, message)`
			`possible_titles_left -= 1`
Start on the actual titlebot code 2018-10-12 04:26:48 +00:00
Bot titles 2018-10-12 04:48:57 +00:00			`except (IOError, urllib.error.URLError):`
Start on the actual titlebot code 2018-10-12 04:26:48 +00:00			`continue`

Tell users if we ran into weird HTTP codes 2018-10-28 06:51:08 +00:00
Add a stub botcmd.py 2017-09-06 17:47:32 +00:00			`# handle_nonmessage(*, prefix, command, arguments, irc)`
			`# Called for all other commands than PINGs and PRIVMSGs.`
			`# prefix is the prefix at the start of the message, without the leading ':'`
			`# command is the command or number code`
			`# arguments is rest of the arguments of the command, represented as a list. ':'-arguments are handled automatically`
Read config from a configuration file 2018-01-03 16:08:24 +00:00			`# irc is the IRC API object`
Only pass bytestrings (and not bytearrays) to user code 2018-06-14 07:50:30 +00:00			`# All strings are bytestrings`
Add a stub botcmd.py 2017-09-06 17:47:32 +00:00			`def handle_nonmessage(*, prefix, command, arguments, irc):`
			`...`