# Tests for the `re` module: search/match/fullmatch, sub/subn, split,
# findall/finditer, match-object accessors, and escape.
# Each test is defined with @test and invoked immediately after its definition.
import re
import string


@test
def test_search_star_plus():
    """Check span() of search/match results for patterns using '*' and '+'."""
    assert re.search('x*', 'axx').span(0) == (0, 0)
    assert re.search('x*', 'axx').span() == (0, 0)
    assert re.search('x+', 'axx').span(0) == (1, 3)
    assert re.search('x+', 'axx').span() == (1, 3)
    assert re.search('x', 'aaa') is None
    assert re.match('a*', 'xxx').span(0) == (0, 0)
    assert re.match('a*', 'xxx').span() == (0, 0)
    assert re.match('x*', 'xxxa').span(0) == (0, 3)
    assert re.match('x*', 'xxxa').span() == (0, 3)
    assert re.match('a+', 'xxx') is None
test_search_star_plus()


@test
def test_branching():
    """Test Branching

    Test expressions using the OR ('|') operator."""
    assert re.match('(ab|ba)', 'ab').span() == (0, 2)
    assert re.match('(ab|ba)', 'ba').span() == (0, 2)
    assert re.match('(abc|bac|ca|cb)', 'abc').span() == (0, 3)
    assert re.match('(abc|bac|ca|cb)', 'bac').span() == (0, 3)
    assert re.match('(abc|bac|ca|cb)', 'ca').span() == (0, 2)
    assert re.match('(abc|bac|ca|cb)', 'cb').span() == (0, 2)
    assert re.match('((a)|(b)|(c))', 'a').span() == (0, 1)
    assert re.match('((a)|(b)|(c))', 'b').span() == (0, 1)
    assert re.match('((a)|(b)|(c))', 'c').span() == (0, 1)
test_branching()


@test
def test_basic_re_sub():
    """Check re.sub with string and callable replacements, counts, and template escapes."""
    def bump_num(matchobj):
        # Replacement callback: increment the matched decimal number by one.
        int_value = int(matchobj.group(0))
        return str(int_value + 1)

    assert re.sub('y', 'a', 'xyz') == 'xaz'
    assert re.sub("(?i)b+", "x", "bbbb BBBB") == 'x x'
    assert re.sub(r'\d+', bump_num, '08.2 -2 23x99y') == '9.3 -3 24x100y'
    assert re.sub(r'\d+', bump_num, '08.2 -2 23x99y', 3) == '9.3 -3 23x99y'
    assert re.sub(r'\d+', bump_num, '08.2 -2 23x99y', count=3) == '9.3 -3 23x99y'

    # A callable's return value is used verbatim; a template string is escape-processed.
    assert re.sub('.', lambda m: r"\n", 'x') == '\\n'
    assert re.sub('.', r"\n", 'x') == '\n'

    s = r"\1\1"
    assert re.sub('(.)', s, 'x') == 'xx'
    assert re.sub('(.)', s.replace('\\', r'\\'), 'x') == s
    assert re.sub('(.)', lambda m: s, 'x') == s

    # Named and numbered group references in the replacement template.
    assert re.sub('(?P<a>x)', r'\g<a>\g<a>', 'xx') == 'xxxx'
    assert re.sub('(?P<a>x)', r'\g<a>\g<1>', 'xx') == 'xxxx'
    assert re.sub('(?P<unk>x)', r'\g<unk>\g<unk>', 'xx') == 'xxxx'
    assert re.sub('(?P<unk>x)', r'\g<1>\g<1>', 'xx') == 'xxxx'
    assert re.sub('()x', r'\g<0>\g<0>', 'xx') == 'xxxx'

    assert re.sub('a', r'\t\n\v\r\f\a\b', 'a') == '\t\n\v\r\f\a\b'
    assert re.sub('a', '\t\n\v\r\f\a\b', 'a') == '\t\n\v\r\f\a\b'
    assert re.sub('a', '\t\n\v\r\f\a\b', 'a') == (chr(9)+chr(10)+chr(11)+chr(13)+chr(12)+chr(7)+chr(8))

    # A backslash followed by any of these letters must be rejected in a template.
    for c in 'cdehijklmopqsuwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ':
        try:
            re.sub('a', '\\' + c, 'a')
            assert False  # reaching this line means no re.error was raised
        except re.error:
            pass

    assert re.sub(r'^\s*', 'X', 'test') == 'Xtest'
test_basic_re_sub()


@test
def test_bug_449964():
    """Check that a trailing backspace escape in a template is honoured after group references."""
    assert re.sub(r'(?P<unk>x)', r'\g<1>\g<1>\b', 'xx') == 'xx\bxx\b'
test_bug_449964()


@test
def test_bug_449000():
    """Check sub() on escaped characters, with raw and non-raw pattern/replacement."""
    # Test for sub() on escaped characters
    assert re.sub(r'\r\n', r'\n', 'abc\r\ndef\r\n') == 'abc\ndef\n'
    assert re.sub('\r\n', r'\n', 'abc\r\ndef\r\n') == 'abc\ndef\n'
    assert re.sub(r'\r\n', '\n', 'abc\r\ndef\r\n') == 'abc\ndef\n'
    assert re.sub('\r\n', '\n', 'abc\r\ndef\r\n') == 'abc\ndef\n'
test_bug_449000()


@test
def test_sub_template_numeric_escape():
    """Check numeric (octal / group-number) escapes in replacement templates."""
    # bug 776311 and friends
    assert re.sub('x', r'\0', 'x') == '\0'
    assert re.sub('x', r'\000', 'x') == '\000'
    assert re.sub('x', r'\001', 'x') == '\001'
    assert re.sub('x', r'\008', 'x') == '\0' + '8'
    assert re.sub('x', r'\009', 'x') == '\0' + '9'
    assert re.sub('x', r'\111', 'x') == '\111'
    assert re.sub('x', r'\117', 'x') == '\117'
    assert re.sub('x', r'\377', 'x') == '\377'

    # These two were written as `assert value, message`, which never compares
    # anything; they are real equality checks now.
    assert re.sub('x', r'\1111', 'x') == '\1111'
    assert re.sub('x', r'\1111', 'x') == '\111' + '1'

    assert re.sub('x', r'\00', 'x') == '\x00'
    assert re.sub('x', r'\07', 'x') == '\x07'
    assert re.sub('x', r'\08', 'x') == '\0' + '8'
    assert re.sub('x', r'\09', 'x') == '\0' + '9'
    assert re.sub('x', r'\0a', 'x') == '\0' + 'a'

    # Disabled error-path checks, kept as an inert string literal.
    '''
    self.checkTemplateError('x', r'\400', 'x',
                            r'octal escape value \400 outside of '
                            r'range 0-0o377', 0)
    self.checkTemplateError('x', r'\777', 'x',
                            r'octal escape value \777 outside of '
                            r'range 0-0o377', 0)

    self.checkTemplateError('x', r'\1', 'x', 'invalid group reference 1', 1)
    self.checkTemplateError('x', r'\8', 'x', 'invalid group reference 8', 1)
    self.checkTemplateError('x', r'\9', 'x', 'invalid group reference 9', 1)
    self.checkTemplateError('x', r'\11', 'x', 'invalid group reference 11', 1)
    self.checkTemplateError('x', r'\18', 'x', 'invalid group reference 18', 1)
    self.checkTemplateError('x', r'\1a', 'x', 'invalid group reference 1', 1)
    self.checkTemplateError('x', r'\90', 'x', 'invalid group reference 90', 1)
    self.checkTemplateError('x', r'\99', 'x', 'invalid group reference 99', 1)
    self.checkTemplateError('x', r'\118', 'x', 'invalid group reference 11', 1)
    self.checkTemplateError('x', r'\11a', 'x', 'invalid group reference 11', 1)
    self.checkTemplateError('x', r'\181', 'x', 'invalid group reference 18', 1)
    self.checkTemplateError('x', r'\800', 'x', 'invalid group reference 80', 1)
    self.checkTemplateError('x', r'\8', '', 'invalid group reference 8', 1)
    '''

    # With 11 groups in the pattern, a two-digit escape refers to group 11.
    assert re.sub('(((((((((((x)))))))))))', r'\11', 'x') == 'x'
    assert re.sub('((((((((((y))))))))))(.)', r'\118', 'xyz') == 'xz8'
    assert re.sub('((((((((((y))))))))))(.)', r'\11a', 'xyz') == 'xza'
test_sub_template_numeric_escape()


@test
def test_qualified_re_sub():
    """Check the count argument of re.sub, positional and by keyword."""
    assert re.sub('a', 'b', 'aaaaa') == 'bbbbb'
    assert re.sub('a', 'b', 'aaaaa', 1) == 'baaaa'
    assert re.sub('a', 'b', 'aaaaa', count=1) == 'baaaa'
test_qualified_re_sub()


@test
def test_bug_114660():
    """Check that a whitespace run between two non-space characters collapses to one space."""
    # The input has two spaces so that the substitution visibly changes the text.
    assert re.sub(r'(\S)\s+(\S)', r'\1 \2', 'hello  there') == 'hello there'
test_bug_114660()


@test
def test_symbolic_refs():
    """Check references to groups that did not participate, and to group numbers above 100."""
    assert re.sub('(?P<a>x)|(?P<b>y)', r'\g<b>', 'xx') == ''
    assert re.sub('(?P<a>x)|(?P<b>y)', r'\2', 'xx') == ''
    # Support > 100 groups.
    pat = '|'.join(f'x(?P<a{i}>{hex(i)[2:]})y' for i in range(1, 200 + 1))
    assert re.sub(pat, r'\g<200>', 'xc8yzxc8y') == 'c8zc8'
test_symbolic_refs()


@test
def test_re_subn():
    """Check that re.subn returns the new string together with the substitution count."""
    assert re.subn("(?i)b+", "x", "bbbb BBBB") == ('x x', 2)
    assert re.subn("b+", "x", "bbbb BBBB") == ('x BBBB', 1)
    assert re.subn("b+", "x", "xyz") == ('xyz', 0)
    assert re.subn("b*", "x", "xyz") == ('xxxyxzx', 4)
    assert re.subn("b*", "x", "xyz", 2) == ('xxxyz', 2)
    assert re.subn("b*", "x", "xyz", count=2) == ('xxxyz', 2)
test_re_subn()


# TODO: Current version does not allow None == None,
#       so use this as a workaround.
def cmp_opt(x, y):
    """Return True if x and y are both None, or both non-None with equal unwrapped values."""
    if x is None:
        return y is None
    elif y is None:
        return x is None
    else:
        return unwrap(x) == unwrap(y)


def cmp_list(a, b):
    """Return True if a and b have the same length and all elements are pairwise equal per cmp_opt."""
    if len(a) != len(b):
        return False
    for i in range(len(a)):
        if not cmp_opt(a[i], b[i]):
            return False
    return True


def cmp_tuple(a, b):
    """Return True if tuples a and b have the same static length and pairwise-equal elements.

    Compares the heads with cmp_opt and recurses on the tails.
    """
    if staticlen(a) != staticlen(b):
        return False
    elif staticlen(a) == 0:
        return True
    else:
        if not cmp_opt(a[0], b[0]):
            return False
        return cmp_tuple(a[1:], b[1:])


@test
def test_re_split():
    """Check re.split with plain, capturing, non-capturing and possibly-empty separators."""
    # NOTE(review): several expected lists are written as [None, ...][1:]; the
    # leading None is sliced off again, presumably only to give the literal an
    # Optional[str] element type - confirm.
    assert cmp_list(re.split(":", ":a:b::c"), ['', 'a', 'b', '', 'c'])
    assert cmp_list(re.split(":+", ":a:b::c"), ['', 'a', 'b', 'c'])
    assert cmp_list(re.split("(:+)", ":a:b::c"), ['', ':', 'a', ':', 'b', '::', 'c'])
    assert cmp_list(re.split("(?::+)", ":a:b::c"), ['', 'a', 'b', 'c'])
    assert cmp_list(re.split("(:)+", ":a:b::c"), ['', ':', 'a', ':', 'b', ':', 'c'])
    assert cmp_list(re.split("([b:]+)", ":a:b::c"), ['', ':', 'a', ':b::', 'c'])
    assert cmp_list(re.split("(b)|(:+)", ":a:b::c"),
                    [None, '', None, ':', 'a', None, ':', '', 'b', None, '', None, '::', 'c'][1:])
    assert cmp_list(re.split("(?:b)|(?::+)", ":a:b::c"), ['', 'a', '', '', 'c'])

    # Separators that can match the empty string.
    for sep, expected in [
        (':*', [None, '', '', 'a', '', 'b', '', 'c', ''][1:]),
        ('(?::*)', [None, '', '', 'a', '', 'b', '', 'c', ''][1:]),
        ('(:*)', [None, '', ':', '', '', 'a', ':', '', '', 'b', '::', '', '', 'c', '', ''][1:]),
        ('(:)*', [None, '', ':', '', None, 'a', ':', '', None, 'b', ':', '', None, 'c', None, ''][1:]),
    ]:
        assert cmp_list(re.split(sep, ':a:b::c'), expected)

    for sep, expected in [
        ('', ['', ':', 'a', ':', 'b', ':', ':', 'c', '']),
        # (r'\b', [':', 'a', ':', 'b', '::', 'c', '']),  # TODO: this fails; re2 difference maybe?
    ]:
        assert cmp_list(re.split(sep, ':a:b::c'), expected)
test_re_split()


@test
def test_qualified_re_split():
    """Check the maxsplit argument of re.split, positional and by keyword."""
    assert cmp_list(re.split(":", ":a:b::c", 2), ['', 'a', 'b::c'])
    assert cmp_list(re.split(":", ":a:b::c", maxsplit=2), ['', 'a', 'b::c'])
    assert cmp_list(re.split(':', 'a:b:c:d', maxsplit=2), ['a', 'b', 'c:d'])
    assert cmp_list(re.split("(:)", ":a:b::c", maxsplit=2), ['', ':', 'a', ':', 'b::c'])
    assert cmp_list(re.split("(:+)", ":a:b::c", maxsplit=2), ['', ':', 'a', ':', 'b::c'])
    assert cmp_list(re.split("(:*)", ":a:b::c", maxsplit=2), ['', ':', '', '', 'a:b::c'])
test_qualified_re_split()


@test
def test_re_findall():
    """Check re.findall on ASCII and non-ASCII input, with and without capturing groups."""
    assert re.findall(":+", "abc") == []
    assert re.findall(":+", "a:b::c:::d") == [":", "::", ":::"]
    assert re.findall("(:+)", "a:b::c:::d") == [":", "::", ":::"]
    # (!) Note: this is different in Codon, as we always return the full match even
    #     if there are capturing groups.
    assert re.findall("(:)(:*)", "a:b::c:::d") == [":", "::", ":::"]

    x = "\xe0"
    xx = x * 2
    xxx = x * 3
    # Named `text` rather than `string` so the imported `string` module is not shadowed.
    text = f"a{x}b{xx}c{xxx}d"
    assert re.findall(f"{x}+", text) == [x, xx, xxx]
    assert re.findall(f"({x}+)", text) == [x, xx, xxx]
    # (!) Note: same as above.
    assert re.findall(f"({x})({x}*)", text) == [x, xx, xxx]
test_re_findall()


@test
def test_re_match():
    """Check groups() and group() on match objects, including unmatched and named groups."""
    assert cmp_list(re.match('a', 'a').groups(), List[Optional[str]]())
    assert cmp_list(re.match('(a)', 'a').groups(), [None, 'a',][1:])
    assert re.match('(a)', 'a').group(0).__val__() == 'a'
    assert re.match('(a)', 'a').group(1).__val__() == 'a'
    assert re.match('(a)', 'a').group(1, 1) == (Optional('a'), Optional('a'))

    assert cmp_list(re.match('\xe0', '\xe0').groups(), List[Optional[str]]())
    assert cmp_list(re.match('(\xe0)', '\xe0').groups(), ['\xe0'])
    assert unwrap(re.match('(\xe0)', '\xe0').group(0)) == '\xe0'
    assert unwrap(re.match('(\xe0)', '\xe0').group(1)) == '\xe0'
    assert re.match('(\xe0)', '\xe0').group(1, 1) == (Optional('\xe0'), Optional('\xe0'))

    pat = re.compile('((a)|(b))(c)?')
    assert cmp_list(pat.match('a').groups(), [None, 'a', 'a', None, None][1:])
    assert cmp_list(pat.match('b').groups(), [None, 'b', None, 'b', None][1:])
    assert cmp_list(pat.match('ac').groups(), [None, 'a', 'a', None, 'c'][1:])
    assert cmp_list(pat.match('bc').groups(), [None, 'b', None, 'b', 'c'][1:])
    # groups(default) substitutes the default for groups that did not participate.
    assert cmp_list(pat.match('bc').groups(""), [None, 'b', "", 'b', 'c'][1:])

    pat = re.compile('(?:(?P<a1>a)|(?P<b2>b))(?P<c3>c)?')
    assert cmp_tuple(pat.match('a').group(1, 2, 3), (Optional('a'), Optional[str](), Optional[str]()))
    assert cmp_tuple(pat.match('b').group('a1', 'b2', 'c3'), (Optional[str](), Optional('b'), Optional[str]()))
    assert cmp_tuple(pat.match('ac').group(1, 'b2', 3), (Optional('a'), Optional[str](), Optional('c')))
test_re_match()


def raises(exception: type, function, *args, **kwargs):
    """Return True if function(*args, **kwargs) raises `exception`, else False.

    Returns False both when nothing is raised and when a different exception
    is raised; the latter is swallowed on purpose so the caller's assert fails
    instead of the test aborting.
    """
    try:
        function(*args, **kwargs)
    except exception:
        return True
    except:
        pass
    return False


@test
def test_group():
    """Check Match.group with int, __index__-providing and invalid arguments."""
    class Index:
        # Minimal object usable as a group index through __index__.
        value: int

        def __init__(self, value):
            self.value = value

        def __index__(self):
            return self.value

    # A single group
    m = re.match('(a)(b)', 'ab')
    assert m.group() == 'ab'
    assert unwrap(m.group(0)) == 'ab'
    assert unwrap(m.group(1)) == 'a'
    assert unwrap(m.group(Index(1))) == 'a'
    assert raises(IndexError, m.group, -1)
    assert raises(IndexError, m.group, 3)
    assert raises(IndexError, m.group, 1<<1000)
    assert raises(IndexError, m.group, Index(1<<1000))
    assert raises(IndexError, m.group, 'x')
    # Multiple groups
    assert cmp_tuple(m.group(2, 1), ('b', 'a'))
    assert cmp_tuple(m.group(Index(2), Index(1)), ('b', 'a'))
test_group()


@test
def test_match_getitem():
    """Check Match.__getitem__ with group names, group numbers and invalid keys."""
    pat = re.compile('(?:(?P<a1>a)|(?P<b2>b))(?P<c3>c)?')

    m = pat.match('a')
    assert unwrap(m['a1']) == 'a'
    assert m['b2'] is None
    assert m['c3'] is None
    assert unwrap(m[0]) == 'a'
    assert unwrap(m[1]) == 'a'
    assert m[2] is None
    assert m[3] is None
    assert raises(IndexError, lambda i: m[i], 'X')
    assert raises(IndexError, lambda i: m[i], -1)
    assert raises(IndexError, lambda i: m[i], 4)

    m = pat.match('ac')
    assert unwrap(m['a1']) == 'a'
    assert m['b2'] is None
    assert unwrap(m['c3']) == 'c'
    assert unwrap(m[0]) == 'ac'
    assert unwrap(m[1]) == 'a'
    assert m[2] is None
    assert unwrap(m[3]) == 'c'
test_match_getitem()


@test
def test_re_fullmatch():
    """Check re.fullmatch and Pattern.fullmatch, including pos/endpos bounds."""
    assert re.fullmatch(r"a", "a").span() == (0, 1)
    assert re.fullmatch(r"a|ab", "ab").span() == (0, 2)
    assert re.fullmatch("\xe0|\xe0\xdf", "\xe0\xdf").span() == (0, 2)

    assert re.fullmatch(r".*?$", "abc").span() == (0, 3)
    assert re.fullmatch(r".*?", "abc").span() == (0, 3)
    assert re.fullmatch(r"a.*?b", "ab").span() == (0, 2)
    assert re.fullmatch(r"a.*?b", "abb").span() == (0, 3)
    assert re.fullmatch(r"a.*?b", "axxb").span() == (0, 4)
    assert re.fullmatch(r"a+", "ab") is None
    assert re.fullmatch(r"abc$", "abc\n") is None
    assert re.fullmatch(r"(?m)abc$", "abc\n") is None

    assert re.compile(r"bc").fullmatch("abcd", pos=1, endpos=3).span() == (1, 3)
    # TODO: following fails; $ does not respect endpos in re2?
    #assert re.compile(r".*?$").fullmatch("abcd", pos=1, endpos=3).span() == (1, 3)
    assert re.compile(r".*?").fullmatch("abcd", pos=1, endpos=3).span() == (1, 3)
test_re_fullmatch()


@test
def test_finditer():
    """Check re.finditer and Pattern.finditer with positional and keyword pos/endpos."""
    # The iterator is named `matches` rather than `iter` to avoid shadowing the builtin.
    matches = re.finditer(r":+", "a:b::c:::d")
    assert cmp_list([item.group(0) for item in matches], [":", "::", ":::"])

    pat = re.compile(r":+")
    matches = pat.finditer("a:b::c:::d", 1, 10)
    assert cmp_list([item.group(0) for item in matches], [":", "::", ":::"])

    pat = re.compile(r":+")
    matches = pat.finditer("a:b::c:::d", pos=1, endpos=10)
    assert cmp_list([item.group(0) for item in matches], [":", "::", ":::"])

    pat = re.compile(r":+")
    matches = pat.finditer("a:b::c:::d", endpos=10, pos=1)
    assert cmp_list([item.group(0) for item in matches], [":", "::", ":::"])

    # A narrower window truncates the last run of colons.
    pat = re.compile(r":+")
    matches = pat.finditer("a:b::c:::d", pos=3, endpos=8)
    assert cmp_list([item.group(0) for item in matches], ["::", "::"])
test_finditer()


@test
def test_constants():
    """Check that the short flag names alias the long ones."""
    assert re.I == re.IGNORECASE
    assert re.L == re.LOCALE
    assert re.M == re.MULTILINE
    assert re.S == re.DOTALL
    assert re.X == re.VERBOSE
test_constants()


@test
def test_anyall():
    """Check that '.' matches newlines when re.DOTALL is given."""
    assert unwrap(re.match("a.b", "a\nb", re.DOTALL).group(0)) == "a\nb"
    assert unwrap(re.match("a.*b", "a\n\nb", re.DOTALL).group(0)) == "a\n\nb"
test_anyall()


@test
def test_groupdict():
    """Check that groupdict() maps each group name to its matched text."""
    d = re.match('(?P<first>first) (?P<second>second)', 'first second').groupdict()
    assert len(d) == 2 and unwrap(d['first']) == 'first' and unwrap(d['second']) == 'second'
test_groupdict()


@test
def test_expand():
    """Check Match.expand with numbered and named references, including unmatched groups."""
    assert re.match("(?P<first>first) (?P<second>second)", "first second").expand(r"\2 \1 \g<second> \g<first>") == "second first second first"
    # Group 2 did not participate, so both references expand to the empty string.
    assert re.match("(?P<first>first)|(?P<second>second)", "first").expand(r"\2 \g<second>") == " "
test_expand()


@test
def test_getattr():
    """Check the attributes exposed by compiled patterns and match objects."""
    assert re.compile("(?i)(a)(b)").pattern == "(?i)(a)(b)"
    # TODO: Codon does not support flags like this
    # self.assertEqual(re.compile("(?i)(a)(b)").flags, re.I | re.U)
    assert re.compile("(?i)(a)(b)").groups == 2
    assert re.compile("(?i)(a)(b)").groupindex == {}
    assert re.compile("(?i)(?P<first>a)(?P<other>b)").groupindex == {'first': 1, 'other': 2}

    assert re.match("(a)", "a").pos == 0
    assert re.match("(a)", "a").endpos == 1
    assert re.match("(a)", "a").string == "a"
    assert re.match("(a)", "a").re
test_getattr()


def check_match(pattern, text, match: Optional[str] = None, span: Optional[re.Span] = None, matcher = re.fullmatch):
    """Return True if matcher(pattern, text) matches with the expected text and span.

    When both `match` and `span` are omitted, the pattern is expected to match
    the whole of `text`. Supplying only one of them raises ValueError.
    """
    if match is None and span is None:
        # the pattern matches the whole text
        match = text
        span = (0, len(text))
    elif match is None or span is None:
        raise ValueError('If match is not None, span should be specified '
                         '(and vice versa).')
    m = matcher(pattern, text)
    return bool(m) and m.group() == match and m.span() == span


# Characters that re.escape is expected to leave untouched.
LITERAL_CHARS = string.ascii_letters + string.digits


@test
def test_re_escape():
    """Check that re.escape yields patterns matching the original text literally."""
    p = ''.join(chr(i) for i in range(256))
    for c in p:
        assert check_match(re.escape(c), c)
        assert check_match('[' + re.escape(c) + ']', c)
    assert check_match(re.escape(p), p)
    # These metacharacters must be backslash-escaped.
    for c in '-.]{}':
        assert re.escape(c)[:1] == '\\'
    literal_chars = LITERAL_CHARS
    assert re.escape(literal_chars) == literal_chars
test_re_escape()