regexen_nfae/regex.py

124 lines
3.0 KiB
Python

class Literal:
def __init__(self, text):
self.text = text
self.single_char = len(text) == 1
def __repr__(self):
return 'Literal(%s)' % repr(self.text)
def __str__(self):
# ERE-style quotation rules
# A-Za-z0-9 and space are safe, as are non-ASCII
# Everything else is safe to quote with backslash
return ''.join(
char if ord('A') <= ord(char) <= ord('Z') else
char if ord('a') <= ord(char) <= ord('z') else
char if ord('0') <= ord(char) <= ord('9') else
char if char == ' ' else
char if ord(char) >= 128 else # Non-ASCII
'\\' + char # Quote
for char in self.text
)
class Concatenation:
def __init__(self, *elements):
self.elements = elements
def __repr__(self):
return 'Concatenation(%s)' % ', '.join(map(repr, self.elements))
def __str__(self):
return ''.join(
# Only alternation binds looser than concatenation,
# so we can pass everything else pass through with
# no parenthesizing
str(element) if type(element) != Alternation else
'(' + str(element) + ')'
for element in self.elements
)
class Alternation:
def __init__(self, *elements):
self.elements = elements
def __repr__(self):
return 'Alternation(%s)' % ', '.join(map(repr, self.elements))
def __str__(self):
# Nothing binds looser than alternation, so just pass
# everything through as-is
return '|'.join(map(str, self.elements))
class Star:
def __init__(self, element):
self.element = element
def __repr__(self):
return 'Star(%s)' % repr(self.element)
def __str__(self):
# * applies to the previous character or a parenthesized
# group. Therefore, we parentesize unless we havea Literal
# and it is one-char long
if type(self.element) == Literal and self.element.single_char:
return str(self.element) + '*'
else:
return '(%s)*' % str(self.element)
def lit(text):
return Literal(text)
def concat(*elements):
# TODO: drop literals that are just ''
flattened = []
for element in elements:
if type(element) == Concatenation:
flattened.extend(element.elements)
else:
flattened.append(element)
combined = []
for element in flattened:
if type(element) == Literal and element.text == '':
# Drop empty literals
continue
elif len(combined) > 0 and type(combined[-1]) == Literal and type(element) == Literal:
# Combine two literals next to each other
# into one literal
previous = combined.pop()
combined.append(Literal(previous.text + element.text))
else:
combined.append(element)
if len(combined) == 0:
# Empty regex, represent with empty literal
return lit('')
elif len(combined) == 1:
element, = combined
return element
else:
return Concatenation(*combined)
def bar(*elements):
# TODO: rewrite (foo|foo) → foo
flattened = []
for element in elements:
if type(element) == Alternation:
flattened.extend(element.elements)
else:
flattened.append(element)
if len(flattened) == 1:
element, = flattened
return element
else:
return Alternation(*flattened)
def star(element):
return Star(element)