Rewrite HTML to replace https:// links with http:// ones

This commit is contained in:
Juhani Krekelä 2021-10-17 21:18:56 +03:00
parent 636cb9b92d
commit 31fd864970
1 changed file with 151 additions and 36 deletions

View File

@ -117,6 +117,7 @@
# <http://creativecommons.org/publicdomain/zero/1.0/>
import base64
import enum
import select
import socket
import ssl
@ -124,6 +125,103 @@ import sys
import time
import threading
class contexts(enum.Enum):
    """Scanner states used by HtmlProcessor while walking an HTML byte stream."""
    # text:                outside of any tag
    # tagname:             right after '<', collecting the tag name
    # attributename:       collecting an attribute name
    # after_attributename: whitespace seen after an attribute name
    # after_equals:        '=' seen, waiting for the value to start
    # attributevalue:      inside an unquoted attribute value
    # attributevalue_sq:   inside a single-quoted attribute value
    # attributevalue_dq:   inside a double-quoted attribute value
    (text, tagname, attributename, after_attributename, after_equals,
     attributevalue, attributevalue_sq, attributevalue_dq) = range(8)


class HtmlProcessor:
    """Streaming rewriter that turns https:// links in HTML into http:// ones.

    Feed the document in arbitrary chunks through process() and call
    finalize() once the stream has ended. Parser state is kept between
    calls, so a tag or attribute may be split across chunks. Everything is
    passed through unchanged except the values of <a href> and <img src>
    that start with "https://".
    """

    # ASCII whitespace as defined by HTML: space, tab, LF, FF, CR. The input
    # is raw bytes, so Unicode-aware tests such as chr(b).isspace() must not
    # be used: they would treat UTF-8 continuation bytes (0x85, 0xA0) and
    # control bytes (0x0B, 0x1C-0x1F) as separators.
    _HTML_WHITESPACE = b' \t\n\f\r'

    # (tag, attribute) pairs whose values get rewritten
    # TODO: handle more attributes
    _REWRITE_TARGETS = frozenset({(b'a', b'href'), (b'img', b'src')})

    def __init__(self):
        self.context = contexts.text
        self.tag = None        # bytearray with the current tag name, or None
        self.attribute = None  # bytearray with the current attribute name, or None
        self.value = None      # bytearray with the buffered attribute value, or None

    def process_attribute(self):
        """Return the bytes to emit for the buffered attribute value.

        The value is returned unchanged unless it belongs to one of the
        rewrite targets and starts (after optional leading whitespace,
        case-insensitively) with "https://". In that case the scheme is
        replaced with " http://"; the extra space makes up for the dropped
        "s", and surrounding whitespace is preserved, so the result always
        has the same length as the original value.
        """
        value = self.value
        target = (bytes(self.tag).lower(), bytes(self.attribute).lower())
        if target not in self._REWRITE_TARGETS:
            return value

        scheme = b'https://'
        stripped = value.lstrip()
        if stripped[:len(scheme)].lower() != scheme:
            return value

        leading = value[:len(value) - len(stripped)]
        # Space is to keep the response size constant
        return leading + b' http://' + stripped[len(scheme):]

    def process(self, data):
        """Consume one chunk of HTML and return the bytes ready to be sent.

        Attribute values are held back until they are complete (they may
        need rewriting), so the returned chunk can be shorter than `data`;
        the withheld bytes come out of a later process() or finalize() call.

        data -- bytes-like object with the next part of the document
        Returns a bytearray.
        """
        processed = bytearray()
        quoted_values = (contexts.attributevalue_sq, contexts.attributevalue_dq)

        for char in data:
            context = self.context
            is_space = char in self._HTML_WHITESPACE

            # State transition for this byte
            if context == contexts.text and char == ord('<'):
                self.context = contexts.tagname
                self.tag = bytearray()
                self.attribute = None
                self.value = None
            elif context not in quoted_values and char == ord('>'):
                # End of tag; flush a pending unquoted value first
                if context == contexts.attributevalue:
                    processed.extend(self.process_attribute())
                self.context = contexts.text
                self.tag = None
                self.attribute = None
                self.value = None
            elif context in (contexts.tagname, contexts.attributevalue) and is_space:
                # Whitespace ends a tag name or an unquoted value
                if context == contexts.attributevalue:
                    processed.extend(self.process_attribute())
                self.context = contexts.attributename
                self.attribute = bytearray()
                self.value = None
            elif context == contexts.attributename and is_space:
                self.context = contexts.after_attributename
            elif context == contexts.after_attributename and is_space:
                pass
            elif context in (contexts.attributename, contexts.after_attributename) and char == ord('='):
                self.context = contexts.after_equals
            elif context == contexts.after_equals and is_space:
                pass
            elif context == contexts.after_equals and char == ord("'"):
                self.context = contexts.attributevalue_sq
                self.value = bytearray()
            elif context == contexts.after_equals and char == ord('"'):
                self.context = contexts.attributevalue_dq
                self.value = bytearray()
            elif (context == contexts.attributevalue_sq and char == ord("'")
                  or context == contexts.attributevalue_dq and char == ord('"')):
                # Closing quote: flush the value and start a fresh attribute
                # name, so that a following attribute written without
                # separating whitespace is not glued onto the previous name.
                processed.extend(self.process_attribute())
                self.context = contexts.attributename
                self.attribute = bytearray()
                self.value = None
            elif context == contexts.tagname:
                self.tag.append(char)
            elif context == contexts.attributename:
                self.attribute.append(char)
            elif context == contexts.after_attributename:
                # Previous attribute had no value; this byte starts a new name
                self.context = contexts.attributename
                self.attribute = bytearray([char])
                self.value = None
            elif context == contexts.after_equals:
                self.context = contexts.attributevalue
                self.value = bytearray([char])
            elif context in (contexts.attributevalue,) + quoted_values:
                self.value.append(char)

            # Emit the byte unless it was buffered as part of a value. A quote
            # seen while in the matching quoted state is the opening quote
            # (the closing one has already left that state) and is emitted.
            current = self.context
            buffered = (
                current == contexts.attributevalue
                or current == contexts.attributevalue_sq and char != ord("'")
                or current == contexts.attributevalue_dq and char != ord('"')
            )
            if not buffered:
                processed.append(char)

        return processed

    def finalize(self):
        """Return bytes still held back when the stream ends.

        If the document was cut off inside an attribute value, the buffered
        value is returned (rewritten if applicable); otherwise b''.
        """
        open_value_contexts = (
            contexts.attributevalue,
            contexts.attributevalue_sq,
            contexts.attributevalue_dq,
        )
        if self.context in open_value_contexts:
            return self.process_attribute()
        return b''
def connect(host, port):
try:
for res in socket.getaddrinfo(host, port, socket.AF_UNSPEC, socket.SOCK_STREAM):
@ -228,9 +326,10 @@ def proxy(sock, host):
del password
# Remove headers that don't need forwarding or are overwritten
headers = dict((key, value) for key, value in headers.items() if not key.startswith(b'proxy-') and not key in (b'connection', b'keep-alive'))
headers = dict((key, value) for key, value in headers.items() if not key.startswith(b'proxy-') and not key in (b'connection', b'accept-encoding', b'keep-alive'))
headers[b'connection'] = b'close'
headers[b'accept-enoding'] = b'identity'
# Split url into its constituents
fields = url.split(b'://', 1)
@ -327,22 +426,17 @@ def proxy(sock, host):
remote_sock.settimeout(None)
response, _, response_data = response.partition(b'\r\n\r\n')
# Figure out if this is a redirect to HTTPS
# If it is, rewrite to HTTP
# Process response headers
# Figure out if this is a redirect to HTTPS and if so, rewrite to HTTP
# Figure out whether response is html
tls_redirect = False
fields = response.split(b'\r\n')[0].split(b' ')
rewritten_response = None
if len(fields) > 1 and fields[1] in (b'301', b'302', b'303', b'307', b'308'):
rewritten_response = bytearray()
rewritten_response.extend(response.split(b'\r\n')[0]) # Include response line as-is
rewritten_response.extend(b'\r\n')
for line in response.split(b'\r\n')[1:]:
fields = line.split(b':', 1)
if len(fields) != 2 or fields[0].lower() != b'location':
rewritten_response.extend(line)
rewritten_response.extend(b'\r\n')
continue
is_html = True
rewritten_response = bytearray()
rewritten_response.extend(response.split(b'\r\n')[0]) # Include response line as-is
rewritten_response.extend(b'\r\n')
for line in response.split(b'\r\n')[1:]:
fields = line.split(b':', 1)
if len(fields) == 2 and fields[0].lower() == b'location':
destination_url = fields[1].strip()
if destination_url.startswith(b'https://'):
destination_url = b'http://' + destination_url[len(b'https://'):]
@ -355,6 +449,16 @@ def proxy(sock, host):
# This redirect is of the current URL but TLS
tls_redirect = True
elif len(fields) == 2 and fields[0].lower() == b'content-type':
mimetype = fields[1].split(b';')[0].strip().lower()
is_html = mimetype == b'text/html'
rewritten_response.extend(line)
rewritten_response.extend(b'\r\n')
else:
rewritten_response.extend(line)
rewritten_response.extend(b'\r\n')
if tls_redirect and not tls:
# Do upgrade to TLS transparently to client
print('TLS', file=sys.stderr, end=' ')
@ -370,58 +474,69 @@ def proxy(sock, host):
continue
# Forward response to client
if rewritten_response is not None:
sock.sendall(rewritten_response)
else:
sock.sendall(response)
sock.sendall(b'\r\n\r\n')
sock.sendall(response_data)
sock.sendall(rewritten_response)
sock.sendall(b'\r\n')
break
del request_data
# TODO: Un-https links
if is_html:
htmlprocessor = HtmlProcessor()
sock.sendall(htmlprocessor.process(response_data))
else:
sock.sendall(response_data)
print('', file=sys.stderr)
sock.settimeout(60)
remote_sock.settimeout(60)
last_transfer = time.monotonic()
while True:
ending_connection = False
while not ending_connection:
events = poll.poll(60_000)
if len(events) == 0 and time.monotonic() - last_transfer > 60:
remote_sock.close()
return
break
for fd, _ in events:
if fd == sock.fileno():
try:
data = sock.recv(1024)
except (ConnectionResetError):
return
except ConnectionResetError:
ending_connection = True
break
if data != b'':
try:
remote_sock.sendall(data)
except (ConnectionResetError, BrokenPipeError):
return
except socket.timeout:
except (ConnectionResetError, BrokenPipeError, socket.timeout):
pass
else:
try:
data = remote_sock.recv(1024)
except (ConnectionResetError, socket.timeout):
return
ending_connection = True
break
if data == b'':
remote_sock.close()
return
ending_connection = True
break
if is_html:
data = htmlprocessor.process(data)
try:
sock.sendall(data)
except (ConnectionResetError, BrokenPipeError, socket.timeout):
remote_sock.close()
return
ending_connection = True
break
last_transfer = time.monotonic()
remote_sock.close()
if is_html:
try:
sock.sendall(htmlprocessor.finalize())
except (ConnectionResetError, BrokenPipeError, socket.timeout):
pass
class ProxyThread(threading.Thread):
def __init__(self, sock, host):
self.sock = sock