codon/stdlib/re.codon

453 lines
13 KiB
Python
Raw Normal View History

# Copyright (C) 2022-2023 Exaloop Inc. <https://exaloop.io>
# Adapted in part from Google's Python re2 wrapper
# https://github.com/google/re2/blob/abseil/python/re2.py
# Regex flag bits. Every single-letter name is an alias of the long name
# defined just above it.
ASCII = 1 << 0
A = ASCII
DEBUG = 1 << 1
IGNORECASE = 1 << 2
I = IGNORECASE
LOCALE = 1 << 3
L = LOCALE
MULTILINE = 1 << 4
M = MULTILINE
DOTALL = 1 << 5
S = DOTALL
VERBOSE = 1 << 6
X = VERBOSE

# Anchoring modes passed to the matcher (see Pattern.search/match/fullmatch).
_ANCHOR_NONE = 0
_ANCHOR_START = 1
_ANCHOR_BOTH = 2
# (start, end) offsets of one group within the subject string.
# The pair (-1, -1) is the falsy "no match" sentinel.
@tuple
class Span:
    start: int
    end: int

    def __bool__(self):
        # Truthy unless both fields carry the -1 sentinel.
        return self.start != -1 or self.end != -1
# Extern C binding (declaration only; the body is a placeholder).
# As used by Pattern in this file, the returned pointer is indexed by group
# number, with index 0 holding the span of the whole match; a falsy Span at
# index 0 means nothing matched.
@C
@pure
def seq_re_match(re: cobj, anchor: int, string: str, pos: int, endpos: int) -> Ptr[Span]:
    pass
# Extern C binding (declaration only; the body is a placeholder).
# NOTE(review): not called anywhere in the visible part of this file;
# presumably the single-Span counterpart of seq_re_match -- confirm against
# the runtime.
@C
@pure
def seq_re_match_one(re: cobj, anchor: int, string: str, pos: int, endpos: int) -> Span:
    pass
# Extern C binding (declaration only). Its result is exposed as
# Pattern.groups and used as the highest valid group index.
@C
@pure
def seq_re_pattern_groups(re: cobj) -> int:
    pass
# Extern C binding (declaration only). Maps a group name to a group number;
# Match range-checks the result before using it as an index.
# NOTE(review): the value returned for an unknown name is not visible here.
@C
@pure
def seq_re_group_name_to_index(re: cobj, name: str) -> int:
    pass
# Extern C binding (declaration only). Maps a group number to its name.
# Pattern.groupindex skips empty results, i.e. treats them as "unnamed".
@C
@pure
def seq_re_group_index_to_name(re: cobj, index: int) -> str:
    pass
# Extern C binding (declaration only). compile() treats a non-empty result
# as the error message for a pattern that failed to compile.
@C
@pure
def seq_re_pattern_error(re: cobj) -> str:
    pass
# Extern C binding (declaration only). Backs the module-level escape().
@C
@pure
def seq_re_escape(pattern: str) -> str:
    pass
# Extern C binding (declaration only). Backs the module-level purge().
# Not marked @pure, unlike the other bindings in this file.
# NOTE(review): presumably clears a compiled-pattern cache -- confirm
# against the runtime.
@C
def seq_re_purge() -> None:
    pass
# Extern C binding (declaration only). Returns the opaque handle that
# compile() stores in Pattern._re; failure is reported separately through
# seq_re_pattern_error on that handle.
@C
@pure
def seq_re_compile(pattern: str, flags: int) -> cobj:
    pass
Dynamic Polymorphism (#58) * Use Static[] for static inheritance * Support .seq extension * Fix #36 * Polymorphic typechecking; vtables [wip] * v-table dispatch [wip] * vtable routing [wip; bug] * vtable routing [MVP] * Fix texts * Add union type support * Update FAQs * Clarify * Add BSL license * Add makeUnion * Add IR UnionType * Update union representation in LLVM * Update README * Update README.md * Update README * Update README.md * Add benchmarks * Add more benchmarks and README * Add primes benchmark * Update benchmarks * Fix cpp * Clean up list * Update faq.md * Add binary trees benchmark * Add fannkuch benchmark * Fix paths * Add PyPy * Abort on fail * More benchmarks * Add cpp word_count * Update set_partition cpp * Add nbody cpp * Add TAQ cpp; fix word_count timing * Update CODEOWNERS * Update README * Update README.md * Update CODEOWNERS * Fix bench script * Update binary_trees.cpp * Update taq.cpp * Fix primes benchmark * Add mandelbrot benchmark * Fix OpenMP init * Add Module::unsafeGetUnionType * UnionType [wip] [skip ci] * Integrate IR unions and Union * UnionType refactor [skip ci] * Update README.md * Update docs * UnionType [wip] [skip ci] * UnionType and automatic unions * Add Slack * Update faq.md * Refactor types * New error reporting [wip] * New error reporting [wip] * peglib updates [wip] [skip_ci] * Fix parsing issues * Fix parsing issues * Fix error reporting issues * Make sure random module matches Python * Update releases.md * Fix tests * Fix #59 * Fix #57 * Fix #50 * Fix #49 * Fix #26; Fix #51; Fix #47; Fix #49 * Fix collection extension methods * Fix #62 * Handle *args/**kwargs with Callable[]; Fix #43 * Fix #43 * Fix Ptr.__sub__; Fix polymorphism issues * Add typeinfo * clang-format * Upgrade fmtlib to v9; Use CPM for fmtlib; format spec support; __format__ support * Use CPM for semver and toml++ * Remove extension check * Revamp str methods * Update str.zfill * Fix thunk crashes [wip] [skip_ci] * Fix str.__reversed__ * Fix 
count_with_max * Fix vtable memory allocation issues * Add poly AST tests * Use PDQsort when stability does not matter * Fix dotted imports; Fix issues * Fix kwargs passing to Python * Fix #61 * Fix #37 * Add isinstance support for unions; Union methods return Union type if different * clang-format * Nicely format error tracebacks * Fix build issues; clang-format * Fix OpenMP init * Fix OpenMP init * Update README.md * Fix tests * Update license [skip ci] * Update license [ci skip] * Add copyright header to all source files * Fix super(); Fix error recovery in ClassStmt * Clean up whitespace [ci skip] * Use Python 3.9 on CI * Print info in random test * Fix single unions * Update random_test.codon * Fix polymorhic thunk instantiation * Fix random test * Add operator.attrgetter and operator.methodcaller * Add code documentation * Update documentation * Update README.md * Fix tests * Fix random init Co-authored-by: A. R. Shajii <ars@ars.me>
2022-12-05 08:45:21 +08:00
# Exception raised by this module (invalid pattern, bad template escape).
# `pattern` carries the offending pattern when the raiser supplies one.
class error(Static[Exception]):
    pattern: str

    def __init__(self, message: str = "", pattern: str = ""):
        # NOTE(review): the first argument looks like the exception type
        # name expected by the base class -- confirm against Exception.
        super().__init__("re.error", message)
        self.pattern = pattern

    @property
    def msg(self):
        # Alias for the `message` attribute of the base exception.
        text = self.message
        return text
# Compiled regular expression as built by compile(). Only the fields are
# declared here; the methods are attached later in the file via @extend.
@tuple
class Pattern:
    pattern: str  # source text the pattern was compiled from
    flags: int  # flag bits passed to compile()
    _re: cobj  # opaque handle returned by seq_re_compile
def compile(pattern: str, flags: int = 0):
    """Compile `pattern` into a Pattern.

    Raises `error` (carrying `pattern`) when the runtime reports a
    non-empty error message for the compiled handle.
    """
    handle = seq_re_compile(pattern, flags)
    failure = seq_re_pattern_error(handle)
    if not failure:
        return Pattern(pattern, flags, handle)
    raise error(failure, pattern)
def search(pattern: str, string: str, flags: int = 0):
    """Compile `pattern` with `flags` and call Pattern.search on `string`."""
    compiled = compile(pattern, flags)
    return compiled.search(string)
def match(pattern: str, string: str, flags: int = 0):
    """Compile `pattern` with `flags` and call Pattern.match on `string`."""
    compiled = compile(pattern, flags)
    return compiled.match(string)
def fullmatch(pattern: str, string: str, flags: int = 0):
    """Compile `pattern` with `flags` and call Pattern.fullmatch on `string`."""
    compiled = compile(pattern, flags)
    return compiled.fullmatch(string)
def finditer(pattern: str, string: str, flags: int = 0):
    """Compile `pattern` with `flags` and call Pattern.finditer on `string`."""
    compiled = compile(pattern, flags)
    return compiled.finditer(string)
def findall(pattern: str, string: str, flags: int = 0):
    """Compile `pattern` with `flags` and call Pattern.findall on `string`."""
    compiled = compile(pattern, flags)
    return compiled.findall(string)
2022-07-09 22:07:15 +08:00
def split(pattern: str, string: str, maxsplit: int = 0, flags: int = 0):
    """Compile `pattern` with `flags` and call Pattern.split(string, maxsplit)."""
    compiled = compile(pattern, flags)
    return compiled.split(string, maxsplit)
2022-07-09 22:07:15 +08:00
def sub(pattern: str, repl, string: str, count: int = 0, flags: int = 0):
    """Compile `pattern` with `flags` and call Pattern.sub(repl, string, count)."""
    compiled = compile(pattern, flags)
    return compiled.sub(repl, string, count)
2022-07-09 22:07:15 +08:00
def subn(pattern: str, repl, string: str, count: int = 0, flags: int = 0):
    """Compile `pattern` with `flags` and call Pattern.subn(repl, string, count)."""
    compiled = compile(pattern, flags)
    return compiled.subn(repl, string, count)
def escape(pattern: str):
    """Return `pattern` as escaped by the runtime's seq_re_escape."""
    escaped = seq_re_escape(pattern)
    return escaped
def purge():
    """Forward to the runtime's seq_re_purge(); returns nothing."""
    seq_re_purge()
# A successful match: the per-group spans produced by the matcher together
# with the pattern and subject string they refer to.
@tuple
class Match:
    _spans: Ptr[Span]  # indexed by group number; index 0 is the whole match
    pos: int  # search bounds recorded by the Pattern method that built this
    endpos: int
    re: Pattern
    string: str

    def _get_group_int(self, g: int, n: int):
        """Return the span of group number `g`; `n` is the highest valid index.

        Raises IndexError("no such group") when `g` is outside 0..n.
        """
        if not (0 <= g <= n):
            raise IndexError("no such group")
        return self._spans[g]

    def _get_group_str(self, g: str, n: int):
        """Return the span of the group named `g` (range-checked like an int)."""
        return self._get_group_int(seq_re_group_name_to_index(self.re._re, g), n)

    def _get_group(self, g, n: int):
        """Resolve `g` (int, str, or any object with __index__) to its span."""
        if isinstance(g, int):
            return self._get_group_int(g, n)
        elif isinstance(g, str):
            return self._get_group_str(g, n)
        else:
            return self._get_group(g.__index__(), n)

    def _span_match(self, span: Span):
        """Return the text covered by `span`, or None for the no-match sentinel."""
        if not span:
            return None
        return self.string._slice(span.start, span.end)

    def _get_match(self, g, n: int):
        """Return the text of group `g`, or None when that group did not match."""
        span = self._get_group(g, n)
        return self._span_match(span)

    def _group_multi(self, n: int, *args):
        """Build a tuple with the text of every group in `args`.

        Recurses on the statically known argument count, one group per step.
        """
        if staticlen(args) == 1:
            return (self._get_match(args[0], n),)
        else:
            return (self._get_match(args[0], n), *self._group_multi(n, *args[1:]))

    def group(self, *args):
        """Return the text of one or more groups.

        No argument: the whole match as a plain str. One argument: that
        group's text or None. Several arguments: a tuple with one entry each.
        """
        if staticlen(args) == 0:
            # Group 0 is always present in a Match, so unwrap the optional.
            return self._get_match(0, 1).__val__()
        elif staticlen(args) == 1:
            return self._get_match(args[0], self.re.groups)
        else:
            return self._group_multi(self.re.groups, *args)

    def __getitem__(self, g):
        """Same as group(g) for a single group."""
        return self._get_match(g, self.re.groups)

    def start(self, group = 0):
        """Start offset of `group` (-1 when the group did not match)."""
        return self._get_group(group, self.re.groups).start

    def end(self, group = 0):
        """End offset of `group` (-1 when the group did not match)."""
        return self._get_group(group, self.re.groups).end

    def span(self, group = 0):
        """(start, end) of `group` as a plain tuple."""
        start, end = self._get_group(group, self.re.groups)
        return start, end

    def _split(template: str):
        """Split a replacement template into alternating literal/reference pieces.

        Even indices hold literal text (escapes still unprocessed); odd
        indices hold a group reference such as `\\1` or `\\g<name>`.
        """
        backslash = '\\'
        pieces = ['']
        index = template.find(backslash)
        OCTAL = compile(r'\\[0-7][0-7][0-7]')
        GROUP = compile(r'\\[1-9][0-9]?|\\g<\w+>')
        while index != -1:
            piece, template = template[:index], template[index:]
            pieces[-1] += piece
            octal_match = OCTAL.match(template)
            group_match = GROUP.match(template)
            if (not octal_match) and group_match:
                # A group reference: emit it as its own (odd-index) piece and
                # open a fresh literal piece after it.
                index = group_match.end()
                piece, template = template[:index], template[index:]
                pieces.extend((piece, ''))
            else:
                # Not a reference: keep the backslash and the character after
                # it as literal text; _unescape interprets it later.
                index = 2
                piece, template = template[:index], template[index:]
                pieces[-1] += piece
            index = template.find(backslash)
        pieces[-1] += template
        return pieces

    def _unescape(s: str):
        """Process backslash escapes in a literal template piece.

        Handles the single-character escapes listed below and octal escapes
        of one to three digits. Raises `error` for a backslash followed by
        any other letter; other backslash sequences are kept as written.
        """
        r = []
        n = len(s)
        i = 0
        while i < n:
            if s[i] == '\\' and i + 1 < n:
                c = s[i + 1]
                if c == 'a':
                    r.append('\a')
                    i += 1
                elif c == 'b':
                    r.append('\b')
                    i += 1
                elif c == 'f':
                    r.append('\f')
                    i += 1
                elif c == 'n':
                    r.append('\n')
                    i += 1
                elif c == 'r':
                    r.append('\r')
                    i += 1
                elif c == 't':
                    r.append('\t')
                    i += 1
                elif c == 'v':
                    r.append('\v')
                    i += 1
                elif c == '"':
                    r.append('\"')
                    i += 1
                elif c == '\'':
                    r.append('\'')
                    i += 1
                elif c == '\\':
                    r.append('\\')
                    i += 1
                elif '0' <= c <= '7':
                    # An octal escape has at most three digits (s[i+1..i+3]);
                    # a fourth digit belongs to the following literal text.
                    k = i + 2
                    while k < n and k - i <= 3 and '0' <= s[k] <= '7':
                        k += 1
                    code = int(s[i+1:k], 8)
                    p = Ptr[byte](1)
                    p[0] = byte(code)
                    r.append(str(p, 1))
                    # Leave i on the last digit; the shared increment below
                    # moves past it.
                    i = k - 1
                elif c.isalpha():
                    raise error(f"bad escape \\{c} at position {i}")
                else:
                    # Unknown non-letter escape: keep the backslash; the next
                    # iteration appends the character itself.
                    r.append(s[i])
            else:
                r.append(s[i])
            i += 1
        return str.cat(r)

    def expand(self, template: str):
        """Return `template` with escapes processed and group references
        replaced by the matched text (empty string for unmatched groups)."""
        def get_or_empty(s: Optional[str]):
            return s if s is not None else ''

        # _split returns a fresh list, so it can be updated in place.
        pieces = Match._split(template)
        INT = compile(r'[+-]?\d+')
        for index, piece in enumerate(pieces):
            if not (index % 2):
                pieces[index] = Match._unescape(piece)
            else:
                if len(piece) <= 3:
                    # Numeric reference: a backslash plus one or two digits.
                    pieces[index] = get_or_empty(self[int(piece[1:])])
                else:
                    # \g<...> form: strip the leading `\g<` and trailing `>`.
                    group = piece[3:-1]
                    if INT.fullmatch(group):
                        pieces[index] = get_or_empty(self[int(group)])
                    else:
                        pieces[index] = get_or_empty(self[group])
        return str.cat(pieces)

    @property
    def lastindex(self):
        """Number of the group with the greatest end offset, or None if no
        group matched. On ties the lowest-numbered group is kept."""
        max_end = -1
        max_group = None
        for group in range(1, self.re.groups + 1):
            end = self._spans[group].end
            if max_end < end:
                max_end = end
                max_group = group
        return max_group

    @property
    def lastgroup(self):
        """Name of the group reported by `lastindex`, or None when no group
        matched or that group has no name."""
        max_group = self.lastindex
        if max_group is None:
            return None
        name = seq_re_group_index_to_name(self.re._re, max_group)
        # Pattern.groupindex treats an empty name as "unnamed"; report such a
        # group as None here instead of leaking the empty string.
        if not name:
            return None
        return name

    def groups(self, default: Optional[str] = None):
        """Return a list with the text of groups 1..N, using `default` for
        groups that did not match."""
        def get_or_default(item, default):
            return item if item is not None else default

        n = self.re.groups
        return [get_or_default(self._span_match(self._spans[i]), default)
                for i in range(1, n + 1)]

    def groupdict(self, default: Optional[str] = None):
        """Return a dict from group name to matched text, using `default`
        for named groups that did not match."""
        d = {}
        for group, index in self.re.groupindex.items():
            item = self[index]
            d[group] = item if item is not None else default
        return d

    def __copy__(self):
        return self

    def __deepcopy__(self):
        return self

    def __bool__(self):
        # A Match object only exists for a successful match.
        return True
# Methods of the compiled Pattern (the fields are declared earlier in the file).
@extend
class Pattern:
    @property
    def groups(self):
        """Group count as reported by seq_re_pattern_groups."""
        return seq_re_pattern_groups(self._re)

    @property
    def groupindex(self):
        """Dict from group name to group number; groups whose name comes
        back empty are skipped."""
        d = {}
        for i in range(1, self.groups + 1):
            name = seq_re_group_index_to_name(self._re, i)
            if name:
                d[name] = i
        return d

    def _match_one(self, anchor: int, string: str, pos: Optional[int], endpos: Optional[int]):
        """Run a single match with the given anchoring mode.

        `pos`/`endpos` default to the whole string and are clamped to
        [0, len(string)]. Returns a Match, or None when nothing matches or
        the clamped range is empty-reversed (pos > endpos).
        """
        posx = 0 if pos is None else max(0, min(pos.__val__(), len(string)))
        endposx = len(string) if endpos is None else max(0, min(endpos.__val__(), len(string)))
        if posx > endposx:
            return None
        spans = seq_re_match(self._re, anchor, string, posx, endposx)
        if not spans[0]:
            return None
        return Match(spans, posx, endposx, self, string)

    def _match(self, anchor: int, string: str, pos: Optional[int], endpos: Optional[int]):
        """Generate a Match for each successive match in string[pos:endpos].

        `pos`/`endpos` are clamped as in _match_one. Every yielded Match
        records the clamped `pos`/`endpos` of this call, not the internal
        scan position.
        """
        posx = 0 if pos is None else max(0, min(pos.__val__(), len(string)))
        endposx = len(string) if endpos is None else max(0, min(endpos.__val__(), len(string)))
        if posx > endposx:
            return
        cursor = posx
        while True:
            spans = seq_re_match(self._re, anchor, string, cursor, endposx)
            if not spans[0]:
                break
            yield Match(spans, posx, endposx, self, string)
            whole = spans[0]
            if whole.start == whole.end:
                # Empty match: restarting at its position would find the same
                # empty match again (also when it lies ahead of the cursor,
                # e.g. `$`), so step one byte past it to make progress.
                cursor = whole.end + 1
            else:
                cursor = whole.end
            if cursor > endposx:
                break

    def search(self, string: str, pos: Optional[int] = None, endpos: Optional[int] = None):
        """First match anywhere in the range, or None."""
        return self._match_one(_ANCHOR_NONE, string, pos, endpos)

    def match(self, string: str, pos: Optional[int] = None, endpos: Optional[int] = None):
        """Match anchored at the start of the range, or None."""
        return self._match_one(_ANCHOR_START, string, pos, endpos)

    def fullmatch(self, string: str, pos: Optional[int] = None, endpos: Optional[int] = None):
        """Match anchored at both ends of the range, or None."""
        return self._match_one(_ANCHOR_BOTH, string, pos, endpos)

    def finditer(self, string: str, pos: Optional[int] = None, endpos: Optional[int] = None):
        """Generator over all unanchored matches in the range."""
        return self._match(_ANCHOR_NONE, string, pos, endpos)

    def findall(self, string: str, pos: Optional[int] = None, endpos: Optional[int] = None):
        """List of the whole-match text of every match in the range."""
        return [m.group() for m in self.finditer(string, pos, endpos)]

    def _split(self, cb, string: str, maxsplit: int = 0, T: type = str):
        """Shared engine for split() and subn().

        Walks the matches in `string`; for each one appends the text since
        the previous match followed by whatever `cb(match)` returns. Stops
        after `maxsplit` matches when maxsplit > 0. Returns
        (pieces, number_of_matches_used); a negative `maxsplit` returns the
        whole string untouched with a count of 0.
        """
        if maxsplit < 0:
            return [T(string)], 0
        pieces: List[T] = []
        end = 0
        numsplit = 0
        for match in self.finditer(string):
            if (maxsplit > 0 and numsplit >= maxsplit):
                break
            pieces.append(string[end:match.start()])
            pieces.extend(cb(match))
            end = match.end()
            numsplit += 1
        pieces.append(string[end:])
        return pieces, numsplit

    def split(self, string: str, maxsplit: int = 0):
        """Split `string` at each match; the text of groups 1..N of every
        match (None when unmatched) is inserted between the pieces."""
        cb = lambda match: [match[group] for group in range(1, self.groups + 1)]
        pieces, _ = self._split(cb, string, maxsplit, Optional[str])
        return pieces

    def _repl(match, repl):
        """Produce the replacement text for `match`: expand a str template,
        otherwise call `repl(match)`."""
        if isinstance(repl, str):
            return match.expand(repl)
        else:
            return repl(match)

    def subn(self, repl, string: str, count: int = 0):
        """Replace matches in `string` (at most `count` when count > 0).

        Returns (new_string, number_of_replacements).
        """
        cb = lambda match: [Pattern._repl(match, repl)]
        pieces, numsplit = self._split(cb, string, count, str)
        joined_pieces = str.cat(pieces)
        return joined_pieces, numsplit

    def sub(self, repl, string: str, count: int = 0):
        """Like subn() but returns only the new string."""
        joined_pieces, _ = self.subn(repl, string, count)
        return joined_pieces

    def __bool__(self):
        return True