happybot/urls.py

#!/usr/bin/env python3

def sizeof_fmt(num, suffix='B'):
    """Render a byte count as a short human-readable string.

    The value is divided by 1024 until its magnitude drops below 1024, and
    the matching binary prefix ('', 'Ki', 'Mi', ... 'Zi') is attached.
    Anything that is still too large after 'Zi' is reported in 'Yi'.

    num    -- the quantity to format (int or float, may be negative).
    suffix -- unit text appended after the prefix (default 'B').

    Returns a string with one decimal place, e.g. '1.5KiB'.
    """
    prefixes = ('', 'Ki', 'Mi', 'Gi', 'Ti', 'Pi', 'Ei', 'Zi')
    for prefix in prefixes:
        if not abs(num) < 1024.0:
            # Still too big for this prefix: scale down and try the next one.
            num /= 1024.0
            continue
        return "%3.1f%s%s" % (num, prefix, suffix)
    # Ran out of prefixes; whatever is left is expressed in yobi-units.
    return "%.1f%s%s" % (num, 'Yi', suffix)

from re import compile as regex

# Pattern that finds http(s)/ftp URLs embedded in free text.  A URL is a
# scheme, optional user[:password]@, then either a dotted IPv4 address or a
# dotted host name ending in a TLD of two or more letters, an optional port
# and an optional path/query/fragment running to the next whitespace.
# The negative look-aheads reject IPv4 addresses in 10.x, 127.x, 169.254.x,
# 192.168.x and 172.16-31.x.
# The leading (?i) makes the whole pattern case-insensitive: the character
# classes only list lower-case a-z, so without it a URL written with capital
# letters in the scheme or host name would not be recognised at all.
urls = regex(r'(?i)(?:(?:https?|ftp)://)(?:\S+(?::\S*)?@)?(?:(?!(?:10|127)(?:\.\d{1,3}){3})(?!(?:169\.254|192\.168)(?:\.\d{1,3}){2})(?!172\.(?:1[6-9]|2\d|3[0-1])(?:\.\d{1,3}){2})(?:[1-9]\d?|1\d\d|2[01]\d|22[0-3])(?:\.(?:1?\d{1,2}|2[0-4]\d|25[0-5])){2}(?:\.(?:[1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))|(?:(?:[a-z\u00a1-\uffff0-9]-*)*[a-z\u00a1-\uffff0-9]+)(?:\.(?:[a-z\u00a1-\uffff0-9]-*)*[a-z\u00a1-\uffff0-9]+)*(?:\.(?:[a-z\u00a1-\uffff]{2,}))\.?)(?::\d{2,5})?(?:[/?#]\S*)?')

from subprocess import Popen, PIPE

def cmd(*args):
    """Run a command and lazily yield its standard output line by line.

    args -- the program followed by its arguments; handed to Popen as a
            list, so no shell is involved.

    Yields each output line as a str, decoded as UTF-8 with undecodable
    bytes dropped and the trailing newline removed.

    When the generator finishes or is closed early, the pipe is closed and
    the child is terminated (if it is still running) and reaped, so no
    orphaned process or open descriptor is left behind.
    """
    proc = Popen(args, stdout=PIPE)
    try:
        # readline() returns b'' only at end-of-file, which ends the loop.
        for raw in iter(proc.stdout.readline, b''):
            # Strip the newline only when it is really there; the last line
            # of the output may lack one and must not lose a character.
            if raw.endswith(b'\n'):
                raw = raw[:-1]
            yield raw.decode('utf-8', 'ignore')
    finally:
        proc.stdout.close()
        # A follower such as `tail -f` never exits by itself; stop it first
        # so the wait() below cannot block forever.
        if proc.poll() is None:
            proc.terminate()
        proc.wait()

from sys import argv

# The script takes exactly one argument: the channel name, which is also the
# name of that channel's directory below the server directory.
if len(argv) != 2:
    print('Usage:', argv[0], '#channel')
    # Raise SystemExit directly: the `exit` helper is injected by the `site`
    # module for interactive use and is not guaranteed to exist in programs.
    raise SystemExit(1)

# Directory of the channel; the code below reads its 'out' file and writes
# replies to its 'in' file.
chan = '/home/zgrep/offtopiabday/irc.freenode.net/' + argv[1]

from urllib.request import Request, urlopen

def quote(url):
    """Percent-encode every non-ASCII character of *url*.

    Characters with a code point of 127 or below are copied unchanged.
    Any other character is replaced by its UTF-8 bytes, each written as
    '%' followed by the byte in lower-case hexadecimal.

    Returns the resulting string.
    """
    def _escape(ch):
        # ASCII passes through untouched.
        if ord(ch) <= 127:
            return ch
        return ''.join('%' + format(octet, 'x') for octet in ch.encode('utf-8'))

    return ''.join(map(_escape, url))

# Substring that identifies an IRCCloud pastebin link, and the same prefix in
# its raw form.  A link containing the first but not the second is answered
# with its raw equivalent instead of a size.
irccloud_none = 'irccloud.com/pastebin/'
irccloud_with = irccloud_none + 'raw/'


def _head(url):
    """Send a HEAD request and return (content_type, content_length).

    content_type is lower-cased and '' when the header is absent;
    content_length is the raw header value, or None when absent.
    Whatever urlopen raises is propagated to the caller.
    """
    with urlopen(Request(url, method='HEAD')) as r:
        return (r.getheader('content-type') or '').lower(), r.getheader('content-length')


def _gifv_result(url):
    """Describe an imgur-style gifv link; *url* has '.gifv' already removed.

    Probes the '.mp4' and '.gif' variants and returns '<smaller url> <size>'.
    Returns '???' when a probe fails or has an unexpected content type, and
    None (report nothing) when a length header is missing or not an integer.
    """
    print('| Got gifv:', url)
    try:
        contenttype, length1 = _head(url + '.mp4')
    except Exception:  # any network/HTTP/URL error: report unknown, keep running
        print('| Could not get mp4.')
        return '???'
    if 'video' not in contenttype:
        print('| Video is not a video?')
        return '???'
    try:
        contenttype, length2 = _head(url + '.gif')
    except Exception:
        print('| Could not get gif.')
        return '???'
    if 'image' not in contenttype:
        print('| Image is not an image?')
        return '???'
    try:
        length1 = int(length1)
        length2 = int(length2)
    except (TypeError, ValueError):  # header missing (None) or malformed
        print('| Lengths are not ints.')
        return None
    # Point at whichever variant is smaller (the mp4 wins a tie).
    if length1 <= length2:
        return url + '.mp4 ' + sizeof_fmt(length1)
    return url + '.gif ' + sizeof_fmt(length2)


def _media_result(url):
    """Return the formatted size of an image/video URL.

    Returns None when the URL cannot be fetched or is neither an image nor
    a video, and '???' when it is media but its size cannot be determined.
    """
    print('| Got URL:', url)
    try:
        contenttype, length = _head(url)
    except Exception:  # any network/HTTP/URL error: skip this URL
        print('| | Failure.')
        return None
    print('| | HEAD request completed.')
    if 'image' in contenttype:
        may_download = True   # images may be fetched in full if need be
    elif 'video' in contenttype:
        may_download = False  # videos are only sized from the HEAD response
    else:
        return None
    b = -1  # stays negative unless a size is actually obtained
    if length:
        try:
            b = int(length)
        except ValueError:
            b = -1
    elif may_download:
        try:
            # A real GET is needed here: a HEAD response carries no body,
            # so reading it would always yield zero bytes.
            with urlopen(Request(url)) as r:
                b = len(r.read())
            print('| | Normal request required and complete.')
        except Exception:
            print('| | Normal request required and failed.')
    if b < 0:
        print('| | Failure.')
        return '???'
    print('| | Success.')
    return sizeof_fmt(b)


# Follow the channel log and answer every message containing URLs with one
# bracketed entry per URL that produced a result.
for line in cmd('tail', '-n', '0', '-f', chan + '/out'):
    # Expected layout: date, time, nick, message, separated by single spaces.
    fields = line.split(' ', 3)
    if len(fields) != 4:
        continue  # not a complete chat line; ignore instead of crashing
    # Drop the first and last character of the nick field (presumably the
    # '<' and '>' around the nick -- confirm against the log format).
    nick = fields[2][1:-1]
    text = fields[3]
    if nick in ('happybot', 'hatebot'):
        continue  # never react to the bots' own output
    result = []
    print('Doing line:', text)
    for url in urls.findall(text):
        url = quote(url)
        if irccloud_none in url and irccloud_with not in url:
            summary = url.replace(irccloud_none, irccloud_with, 1)
        elif url[-5:] == '.gifv':  # hack for imgur gifv's
            summary = _gifv_result(url[:-5])
        else:
            summary = _media_result(url)
        if summary is not None:
            result.append(summary)
    if result:
        with open(chan + '/in', 'w') as fh:
            fh.write('[' + '] ['.join(result) + ']\n')