Add parse_regex.py

2019-06-02 21:23:27 +03:00 · 2019-06-02 21:23:27 +03:00 · 7ade1b5977
parent d7bab30809
commit 7ade1b5977
1 changed files with 234 additions and 0 deletions
--- a/parse_regex.py
+++ b/parse_regex.py
@ -0,0 +1,234 @@
+import enum
+import sys
+
+from collections import namedtuple
+
+from regex import Literal, Concatenation, Alternation, Star, lit, concat, bar, star
+
+class token_types(enum.Enum):
+	char, left_paren, right_paren, bar, star, start, end = range(7)
+
+Token = namedtuple('Token', ['type', 'position', 'extra'])
+
+class ParseError(Exception):
+	def __init__(self, text, position):
+		self.text = text
+		self.position = position
+
+	def __repr__(self):
+		if __name__ == '__main__':
+			name = 'ParseError'
+		else:
+			name = __name__ + '.ParseError'
+		return '%s(%s, %i)' % (name, self.text, self.position)
+
+	def __str__(self):
+		return '%i: %s' % (self.position, self.text)
+
+def tokenize(text):
+	tokens = [Token(token_types.start, 0, None)]
+
+	index = 0
+	while index < len(text):
+		char = text[index]
+		token_start = index
+		index += 1
+
+		if char == '\\':
+			# Handle \<char> as one token
+			# Since we don't have any \<letter> extensions, just
+			# implement \ as passing 
+			if index >= len(text):
+				raise ParseError('\\ at end of text, should be followed by a character', token_start)
+
+			tokens.append(Token(token_types.char, token_start, text[index]))
+			index += 1
+
+		elif char == '(':
+			tokens.append(Token(token_types.left_paren, token_start, None))
+
+		elif char == ')':
+			tokens.append(Token(token_types.right_paren, token_start, None))
+
+		elif char == '|':
+			tokens.append(Token(token_types.bar, token_start, None))
+
+		elif char == '*':
+			tokens.append(Token(token_types.star, token_start, None))
+
+		else:
+			tokens.append(Token(token_types.char, token_start, char))
+
+	tokens.append(Token(token_types.end, index, None))
+
+	return tokens
+
+def parse(text):
+	def test_predicate(predicate, element):
+		if type(predicate) != list:
+			predicate = [predicate]
+
+		# If any of the funcs in the list pass, consider it as success
+		for f in predicate:
+			if f(element):
+				return True
+
+		return False
+
+	def match(*predicates):
+		nonlocal working_stack
+
+		# Predicates can only match stuff on the stack if it's there
+		if len(working_stack) < len(predicates):
+			return False
+
+		top_of_stack = working_stack[-len(predicates):]
+
+		for predicate, element in zip(predicates, top_of_stack):
+			if not test_predicate(predicate, element):
+				return False
+
+		return True
+
+	def pop(n = None):
+		nonlocal working_stack
+
+		if n == None:
+			return working_stack.pop()
+
+		else:
+			r = working_stack[-n:]
+			working_stack = working_stack[:-n]
+			return r
+
+	def push(*args):
+		nonlocal working_stack
+		for arg in args:
+			working_stack.append(arg)
+
+	# Predicates
+	def charp(x):
+		return type(x) == Token and x.type == token_types.char
+
+	def lparenp(x):
+		return type(x) == Token and x.type == token_types.left_paren
+
+	def rparenp(x):
+		return type(x) == Token and x.type == token_types.right_paren
+
+	def barp(x):
+		return type(x) == Token and x.type == token_types.bar
+
+	def starp(x):
+		return type(x) == Token and x.type == token_types.star
+
+	def startp(x):
+		return type(x) == Token and x.type == token_types.start
+
+	def endp(x):
+		return type(x) == Token and x.type == token_types.end
+
+	def regexp(x):
+		return type(x) in [Literal, Concatenation, Alternation, Star]
+
+	tokens = tokenize(text)
+	working_stack = []
+
+	while True:
+		# Try all possible reduces
+		if match(charp):
+			char = pop()
+			text = char.extra
+			push(lit(text))
+
+		elif match(regexp, regexp, [lparenp, rparenp, barp, endp]):
+			regex1, regex2, tos = pop(3)
+			push(concat(regex1, regex2), tos)
+
+		elif match(lparenp, regexp, rparenp):
+			_, regex, _ = pop(3)
+			push(regex)
+
+		elif match(lparenp, rparenp):
+			_, _ = pop(2)
+			push(lit(''))
+
+		elif match(regexp, barp, regexp, [lparenp, rparenp, barp, endp]):
+			regex1, _, regex2, tos = pop(4)
+			push(bar(regex1, regex2), tos)
+
+		elif match([lparenp, startp], barp):
+			a, b = pop(2)
+			push(a, lit(''), b)
+
+		elif match(barp, [lparenp, rparenp, barp, endp]):
+			a, b = pop(2)
+			push(a, lit(''), b)
+
+		elif match(regexp, starp):
+			regex, _ = pop(2)
+			push(star(regex))
+
+		elif match(startp, endp):
+			a, b = pop(2)
+			push(a, lit(''), b)
+
+		elif len(tokens) > 0:
+			# Can't reduce but can shift, so let's shift
+			push(tokens.pop(0))
+
+		elif match(startp, regexp, endp):
+			# Can't reduce nor shift, and stack is in the form
+			# signalling that the parsing is complete, so let's
+			# return the completed regex
+			_, regex, _ = pop(3)
+			return regex
+
+		else:
+			# Can't reduce nor shift
+			# As we've already handled the case where we finished
+			# parsing, this means something's gone wrong
+			break
+
+	# Figure out what's gone wrong
+	parens_stack = []
+	for i in working_stack:
+		if startp(i) or endp(i) or regexp(i):
+			continue
+
+		elif lparenp(i):
+			parens_stack.append(i)
+
+		elif rparenp(i):
+			if len(parens_stack) == 0:
+				raise ParseError('Unpaired right parenthesis', i.position)
+			else:
+				parens_stack.pop()
+
+		elif starp(i):
+			raise ParseError('Missing the thing to repeat before a *', i.position)
+
+		elif type(i) == Token:
+			raise ParseError('(Should be impossible!) Unprocessed token of type %s' % i.type.name)
+
+		else:
+			raise ParseError('(Should be impossible!) Unprocessed junk in parsing stack: %s' % (i,))
+
+	if len(parens_stack) > 0:
+		raise ParseError('Unpaired left parenthesis', parens_stack[0].position)
+
+	# Wasn't anything we could figure out
+	raise ParseError('(Should be impossible!) Error figuring out the type of parse error')
+
+def main():
+	try:
+		regex = parse(input('regex> '))
+
+		print(repr(regex))
+		print(regex)
+
+	except ParseError as err:
+		print('%s: Error: %s' % (sys.argv[0], str(err)), file=sys.stderr)
+
+if __name__ == '__main__':
+	main()