Compare commits

...

3 Commits

Author SHA1 Message Date
Juhani Krekelä 3a843fb07f Add regex input to regex_to_nfa.py and regex_to_regex.py 2019-06-02 21:36:28 +03:00
Juhani Krekelä 1409400cc8 Fix numerous small bugs in parse_regex.py 2019-06-02 21:36:12 +03:00
Juhani Krekelä 7ade1b5977 Add parse_regex.py 2019-06-02 21:23:27 +03:00
3 changed files with 266 additions and 11 deletions

parse_regex.py (new file, 242 lines added)

@@ -0,0 +1,242 @@
import enum
import sys
from collections import namedtuple

from regex import Literal, Concatenation, Alternation, Star, lit, concat, bar, star

class token_types(enum.Enum):
    char, left_paren, right_paren, bar, star, start, end = range(7)

Token = namedtuple('Token', ['type', 'position', 'extra'])

class ParseError(Exception):
    def __init__(self, text, position):
        self.text = text
        self.position = position

    def __repr__(self):
        if __name__ == '__main__':
            name = 'ParseError'
        else:
            name = __name__ + '.ParseError'

        if self.position is not None:
            return '%s(%s, %i)' % (name, self.text, self.position)
        else:
            return '%s(%s, None)' % (name, self.text)

    def __str__(self):
        if self.position is not None:
            return '%i: %s' % (self.position, self.text)
        else:
            return self.text

def tokenize(text):
    tokens = [Token(token_types.start, 0, None)]
    index = 0
    while index < len(text):
        char = text[index]
        token_start = index
        index += 1

        if char == '\\':
            # Handle \<char> as one token
            # Since we don't have any \<letter> extensions, just
            # implement \ as passing
            if index >= len(text):
                raise ParseError('\\ at end of text, should be followed by a character', token_start)
            tokens.append(Token(token_types.char, token_start, text[index]))
            index += 1
        elif char == '(':
            tokens.append(Token(token_types.left_paren, token_start, None))
        elif char == ')':
            tokens.append(Token(token_types.right_paren, token_start, None))
        elif char == '|':
            tokens.append(Token(token_types.bar, token_start, None))
        elif char == '*':
            tokens.append(Token(token_types.star, token_start, None))
        else:
            tokens.append(Token(token_types.char, token_start, char))

    tokens.append(Token(token_types.end, index, None))

    return tokens

def parse(text):
    def test_predicate(predicate, element):
        if type(predicate) != list:
            predicate = [predicate]

        # If any of the funcs in the list pass, consider it as success
        for f in predicate:
            if f(element):
                return True
        return False

    def match(*predicates):
        nonlocal working_stack

        # Predicates can only match stuff on the stack if it's there
        if len(working_stack) < len(predicates):
            return False

        top_of_stack = working_stack[-len(predicates):]

        for predicate, element in zip(predicates, top_of_stack):
            if not test_predicate(predicate, element):
                return False

        return True

    def pop(n = None):
        nonlocal working_stack
        if n == None:
            return working_stack.pop()
        else:
            r = working_stack[-n:]
            working_stack = working_stack[:-n]
            return r

    def push(*args):
        nonlocal working_stack
        for arg in args:
            working_stack.append(arg)

    # Predicates
    def charp(x):
        return type(x) == Token and x.type == token_types.char
    def lparenp(x):
        return type(x) == Token and x.type == token_types.left_paren
    def rparenp(x):
        return type(x) == Token and x.type == token_types.right_paren
    def barp(x):
        return type(x) == Token and x.type == token_types.bar
    def starp(x):
        return type(x) == Token and x.type == token_types.star
    def startp(x):
        return type(x) == Token and x.type == token_types.start
    def endp(x):
        return type(x) == Token and x.type == token_types.end
    def regexp(x):
        return type(x) in [Literal, Concatenation, Alternation, Star]

    tokens = tokenize(text)
    working_stack = []

    while True:
        # Try all possible reduces
        if match(charp):
            char = pop()
            text = char.extra
            push(lit(text))
        elif match(regexp, regexp, [lparenp, rparenp, barp, endp]):
            regex1, regex2, tos = pop(3)
            push(concat(regex1, regex2), tos)
        elif match(lparenp, regexp, rparenp):
            _, regex, _ = pop(3)
            push(regex)
        elif match(lparenp, rparenp):
            _, _ = pop(2)
            push(lit(''))
        elif match(regexp, barp, regexp, [lparenp, rparenp, barp, endp]):
            regex1, _, regex2, tos = pop(4)
            push(bar(regex1, regex2), tos)
        elif match([lparenp, startp], barp):
            a, b = pop(2)
            push(a, lit(''), b)
        elif match(barp, [lparenp, rparenp, barp, endp]):
            a, b = pop(2)
            push(a, lit(''), b)
        elif match(regexp, starp):
            regex, _ = pop(2)
            push(star(regex))
        elif match(startp, endp):
            a, b = pop(2)
            push(a, lit(''), b)
        elif len(tokens) > 0:
            # Can't reduce but can shift, so let's shift
            push(tokens.pop(0))
        elif match(startp, regexp, endp):
            # Can't reduce nor shift, and stack is in the form
            # signalling that the parsing is complete, so let's
            # return the completed regex
            _, regex, _ = pop(3)
            return regex
        else:
            # Can't reduce nor shift
            # As we've already handled the case where we finished
            # parsing, this means something's gone wrong
            break

    # Figure out what's gone wrong
    parens_stack = []
    for i in working_stack:
        if startp(i) or endp(i) or regexp(i) or barp(i):
            continue
        elif lparenp(i):
            parens_stack.append(i)
        elif rparenp(i):
            if len(parens_stack) == 0:
                raise ParseError('Unpaired right parenthesis', i.position)
            else:
                parens_stack.pop()
        elif starp(i):
            raise ParseError('Missing the thing to repeat before a *', i.position)
        elif type(i) == Token:
            raise ParseError('(Should be impossible!) Unprocessed token of type %s' % i.type.name, i.position)
        else:
            raise ParseError('(Should be impossible!) Unprocessed junk in parsing stack: %s' % (i,), i.position)

    if len(parens_stack) > 0:
        raise ParseError('Unpaired left parenthesis', parens_stack[0].position)

    # Wasn't anything we could figure out
    print(working_stack)  # debug
    raise ParseError('(Should be impossible!) Error figuring out the type of parse error', None)

def main():
    try:
        regex = parse(input('regex> '))
        print(repr(regex))
        print(regex)
    except ParseError as err:
        print('%s: Error: %s' % (sys.argv[0], str(err)), file=sys.stderr)

if __name__ == '__main__':
    main()
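For context, a minimal usage sketch of the new module (not part of the commit): it assumes parse_regex.py and the repository's regex module are importable from the same directory, and that the regex objects built by lit/concat/bar/star print readably, as main() above relies on.

    # Sketch only: exercises parse() on a fixed pattern instead of stdin.
    from parse_regex import parse, ParseError

    try:
        regex = parse('x(a|b)*y')
    except ParseError as err:
        print('Parse failed:', err)
    else:
        print(repr(regex))  # structured regex object (Concatenation/Alternation/Star/Literal tree)
        print(regex)        # printable form, as main() above prints it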

regex_to_nfa.py

@@ -1,5 +1,8 @@
-from regex import Literal, Concatenation, Alternation, Star, lit, concat, bar, star
+import sys
+
+from regex import Literal, Concatenation, Alternation, Star, lit
 from nfa import NFA, prettyprint
+from parse_regex import parse, ParseError
 
 def to_nfa(regex):
     def new_state():
@@ -94,12 +97,16 @@ def to_nfa(regex):
     return NFA(start_state, [end_state], transitions)
 
 def main():
-    regex = concat(lit('x'), star(bar(lit('a'), lit('b'))), lit('y'))
-    print(regex)
-
-    nfa = to_nfa(regex)
-
-    prettyprint(nfa)
+    try:
+        regex = parse(input('regex> '))
+    except ParseError as err:
+        print('%s: Error: %s' % (sys.argv[0], str(err)), file=sys.stderr)
+    else:
+        nfa = to_nfa(regex)
+
+        prettyprint(nfa)
 
 if __name__ == '__main__':
     main()

regex_to_regex.py

@@ -1,15 +1,21 @@
-from regex import lit, concat, bar, star
+import sys
+
 from regex_to_nfa import to_nfa
 from nfa_to_regex import to_regex
+from parse_regex import parse, ParseError
 
 def main():
-    regex = concat(bar(lit('foo'), lit('bar')), bar(lit('baz'), lit('qux')))
-    print(regex)
-
-    nfa = to_nfa(regex)
-    regex_prime = to_regex(nfa)
-
-    print(regex_prime)
+    try:
+        regex = parse(input('regex> '))
+    except ParseError as err:
+        print('%s: Error: %s' % (sys.argv[0], str(err)), file=sys.stderr)
+    else:
+        nfa = to_nfa(regex)
+        regex_prime = to_regex(nfa)
+
+        print(regex_prime)
 
 if __name__ == '__main__':
     main()