# Reconstructed from the "Start on the actual titlebot code" patch to botcmd.py.
# NOTE(review): the patch only shows part of the file; hooks outside its hunks
# (e.g. initialize, on_connect, handle_nonmessage) are not reproduced here.

import html
import http.client
import urllib.request

# Only http:// and https:// links are picked up from messages.
_URL_SCHEMES = ('http://', 'https://')

# Don't titlebot more than this many HTML pages per message.
_MAX_TITLES_PER_MESSAGE = 3

# Seconds to wait on each fetch before giving up on the URL.
_FETCH_TIMEOUT = 1

# The first 4KB of a page should be enough to contain any <title>.
_TITLE_READ_BYTES = 4 * 1024


def on_quit(*, irc):
    ...


def _find_url_end(message, start):
    """Return the index just past the URL that begins at message[start]."""
    depth = 0
    length = len(message)
    for pos in range(start, length):
        char = message[pos]
        # Browsers don't urlencode parens nowadays, so balanced parens stay
        # part of the URL, while the ')' in "(https://example.com)" ends it.
        if char == '(':
            depth += 1
        elif char == ')':
            if depth == 0:
                return pos
            depth -= 1
        elif char.isspace() or char == '>':
            return pos
        # Some people punctuate their URLs: "." or "," only ends the URL when
        # followed by whitespace or the end of the message, so the dots
        # inside "example.com" are kept.
        elif char in '.,' and (pos + 1 == length or message[pos + 1].isspace()):
            return pos
    return length


def find_urls(message):
    """Return every http:// or https:// URL found in message, in order.

    message is a str. A URL ends at whitespace, at '>', at an unbalanced ')'
    or at sentence punctuation ('.' or ',' followed by whitespace or the end
    of the message).
    """
    urls = []
    index = 0
    while True:
        # Scan for "http" as the common subset of http:// and https://
        start = message.find('http', index)
        if start == -1:
            break

        if message.startswith(_URL_SCHEMES, start):
            index = _find_url_end(message, start)
            urls.append(message[start:index])
        else:
            index = start + 1

    return urls


def extract_title(page_data):
    """Return the text of the first <title> element in page_data, or None.

    page_data is a bytes object holding the (possibly truncated) start of an
    HTML page. Tag names are matched case-insensitively and the opening tag
    may carry attributes. If the closing tag is missing, the title runs to the
    end of the data. The result is decoded as UTF-8 (undecodable bytes are
    replaced), HTML entities are unescaped and runs of whitespace are
    collapsed to single spaces, so the title is always a single line.
    """
    # bytes.lower() only touches ASCII letters and keeps the length, so
    # offsets found in the lowered copy are valid in page_data too.
    lowered = page_data.lower()
    open_tag = b'<title'

    # Find the opening <title> tag
    title_start = None
    search_from = 0
    while title_start is None:
        tag = lowered.find(open_tag, search_from)
        if tag == -1:
            return None

        after_name = tag + len(open_tag)
        next_byte = lowered[after_name:after_name + 1]
        if next_byte == b'>' or next_byte.isspace():
            tag_close = lowered.find(b'>', after_name)
            if tag_close == -1:
                # The opening tag itself was cut off
                return None
            title_start = tag_close + 1
        else:
            # Some other tag that merely starts with "<title"
            search_from = tag + 1

    # Find the closing </title> tag, looking only after the opening one
    title_end = lowered.find(b'</title', title_start)
    if title_end == -1:
        title_end = len(page_data)

    raw_title = page_data[title_start:title_end].decode('utf-8', errors='replace')
    return ' '.join(html.unescape(raw_title).split())


# handle_message(*, prefix, message, nick, channel, irc)
# Called for PRIVMSGs.
# prefix is the prefix at the start of the message, without the leading ':'
# irc is the IRC API object
# All strings are bytestrings
def handle_message(*, prefix, message, nick, channel, irc):
    """Fetch the URLs mentioned in message and print the titles of HTML pages.

    At most _MAX_TITLES_PER_MESSAGE HTML pages are processed per message.
    URLs that cannot be fetched, or that are not text/html, are skipped.
    """
    # IRC does not guarantee UTF-8, so never let a stray byte abort the hook.
    urls = find_urls(message.decode('utf-8', errors='replace'))

    # NOTE(review): this fetches arbitrary URLs supplied by IRC users; confirm
    # that reaching internal hosts from the bot's network is acceptable.
    possible_titles_left = _MAX_TITLES_PER_MESSAGE
    for url in urls:
        if possible_titles_left == 0:
            break

        try:
            with urllib.request.urlopen(url, timeout=_FETCH_TIMEOUT) as response:
                if response.info().get_content_type() != 'text/html':
                    continue
                first_kb = response.read(_TITLE_READ_BYTES)
        except (OSError, ValueError, http.client.HTTPException):
            # OSError: network errors, timeouts and HTTP error statuses.
            # ValueError: malformed or non-ASCII URLs (UnicodeEncodeError).
            # HTTPException: e.g. InvalidURL or a response cut short.
            continue

        possible_titles_left -= 1
        title = extract_title(first_kb)
        if title is not None:
            # Debug output; the title is not sent to IRC here.
            print(title)