class Literal: def __init__(self, text): self.text = text self.single_char = len(text) == 1 def __repr__(self): return 'Literal(%s)' % repr(self.text) def __str__(self): # ERE-style quotation rules # A-Za-z0-9 and space are safe, as are non-ASCII # Everything else is safe to quote with backslash return ''.join( char if ord('A') <= ord(char) <= ord('Z') else char if ord('a') <= ord(char) <= ord('z') else char if ord('0') <= ord(char) <= ord('9') else char if char == ' ' else char if ord(char) >= 128 else # Non-ASCII '\\' + char # Quote for char in self.text ) class Concatenation: def __init__(self, *elements): self.elements = elements def __repr__(self): return 'Concatenation(%s)' % ', '.join(map(repr, self.elements)) def __str__(self): return ''.join( # Only alternation binds looser than concatenation, # so we can pass everything else pass through with # no parenthesizing str(element) if type(element) != Alternation else '(' + str(element) + ')' for element in self.elements ) class Alternation: def __init__(self, *elements): self.elements = elements def __repr__(self): return 'Alternation(%s)' % ', '.join(map(repr, self.elements)) def __str__(self): # Nothing binds looser than alternation, so just pass # everything through as-is return '|'.join(map(str, self.elements)) class Star: def __init__(self, element): self.element = element def __repr__(self): return 'Star(%s)' % repr(self.element) def __str__(self): # * applies to the previous character or a parenthesized # group. Therefore, we parentesize unless we havea Literal # and it is one-char long if type(self.element) == Literal and self.element.single_char: return str(self.element) + '*' else: return '(%s)*' % str(self.element) def lit(text): return Literal(text) def concat(*elements): # TODO: drop literals that are just '' flattened = [] for element in elements: if type(element) == Concatenation: flattened.extend(element.elements) else: flattened.append(element) combined = [] for element in flattened: if type(element) == Literal and element.text == '': # Drop empty literals continue elif len(combined) > 0 and type(combined[-1]) == Literal and type(element) == Literal: # Combine two literals next to each other # into one literal previous = combined.pop() combined.append(Literal(previous.text + element.text)) else: combined.append(element) if len(combined) == 0: # Empty regex, represent with empty literal return lit('') elif len(combined) == 1: element, = combined return element else: return Concatenation(*combined) def bar(*elements): # TODO: rewrite (foo|foo) → foo flattened = [] for element in elements: if type(element) == Alternation: flattened.extend(element.elements) else: flattened.append(element) if len(flattened) == 1: element, = flattened return element else: return Alternation(*flattened) def star(element): return Star(element)