NOTE(review): line breaks and indentation were reconstructed from a
whitespace-collapsed copy of this patch; the indentation depth of context lines
is assumed - confirm against botcmd.py or apply with --ignore-whitespace.

diff --git a/botcmd.py b/botcmd.py
index 829977f..6219334 100644
--- a/botcmd.py
+++ b/botcmd.py
@@ -1,4 +1,6 @@
 import html
+import urllib.error
+import urllib.parse
 import urllib.request
 
 # initialize(*, config)
@@ -95,7 +97,14 @@ def extract_title(page_data):
     if title_end is None:
         title_end = len(page_data)
 
-    return html.unescape(page_data[title_start:title_end].decode('utf-8', errors = 'replace'))
+    title = html.unescape(page_data[title_start:title_end].decode('utf-8', errors = 'replace')).replace('\n', ' ').replace('\t', ' ').strip()
+    while '  ' in title:
+        title = title.replace('  ', ' ')
+
+    return title
+
+def sanitize(title):
+    return ''.join('\ufffd' if ord(c) < 32 else c for c in title)
 
 # handle_message(*, prefix, message, nick, channel, irc)
 # Called for PRIVMSGs.
@@ -118,14 +127,18 @@ def handle_message(*, prefix, message, nick, channel, irc):
                 if response.info().get_content_type() == 'text/html':
                     # First 4KB of a page should be enough for any
                     first_kb = response.read(4 * 1024)
-                    title = extract_title(first_kb)
-                    print(title)#debg
+                    title = sanitize(extract_title(first_kb))
+
+                    domain = sanitize(urllib.parse.urlparse(url).netloc)
+
+                    message = '%s: %s' % (domain, title)
+                    irc.bot_response(channel, message)
+
                     possible_titles_left -= 1
 
-        except IOError:
+        except (IOError, urllib.error.URLError):
             continue
 
-
 # handle_nonmessage(*, prefix, command, arguments, irc)
 # Called for all other commands than PINGs and PRIVMSGs.
 # prefix is the prefix at the start of the message, without the leading ':'