codon/stdlib/algorithms/strings.codon

204 lines
5.8 KiB
Python

def filter_overlaps(v: Generator[int], n: int):
    """
    Yield the positions from v, skipping any position that lies fewer
    than n places after the most recently yielded one.

    The first position is always yielded. Positions are expected to be
    non-negative, since a negative value is used as the "nothing kept
    yet" marker.
    """
    last_kept = -1
    for candidate in v:
        # too close to the previously kept position: drop it
        if last_kept >= 0 and candidate - last_kept < n:
            continue
        last_kept = candidate
        yield candidate
def rfilter_overlaps(v: Generator[int], n: int):
    """
    Yield the positions from v, skipping any position that lies fewer
    than n places before the most recently yielded one.

    This is the counterpart of filter_overlaps for positions produced
    from high to low. The first position is always yielded.
    """
    last_kept = -1
    for candidate in v:
        # too close to the previously kept position: drop it
        if last_kept >= 0 and last_kept - candidate < n:
            continue
        last_kept = candidate
        yield candidate
def string_search_slow(text: str, pattern: str):
    """
    Yield, from left to right, every index of text at which pattern
    starts. Overlapping occurrences are all reported.

    An empty pattern matches at every index from 0 to len(text)
    inclusive.
    """
    width = len(pattern)
    # when width is 0 the range covers 0..len(text) and every slot matches
    for start in range(len(text) - width + 1):
        if width == 0 or text[start:start + width] == pattern:
            yield start
def rstring_search_slow(text: str, pattern: str):
    """
    Yield, from right to left, every index of text at which pattern
    starts. Overlapping occurrences are all reported.

    An empty pattern matches at every index from len(text) down to 0.
    """
    width = len(pattern)
    # walk the candidate start indices from the last possible one to 0
    for start in range(len(text) - width, -1, -1):
        if width == 0 or text[start:start + width] == pattern:
            yield start
def string_search_rabin_karp(text: str, pattern: str, prime: int = 645419):
    """
    Rabin-Karp algorithm
    Yield, from left to right, every index of text at which pattern
    starts. Overlapping occurrences are all reported.

    text:    string to search in
    pattern: string to search for; an empty pattern matches at every
             index from 0 to len(text) inclusive
    prime:   modulus used for the rolling hash

    Nothing is yielded when pattern is longer than text.
    """
    BASE = 256
    text_len = len(text)
    pat_len = len(pattern)

    # same convention as the other searches in this file
    if pat_len == 0:
        for i in range(text_len + 1):
            yield i
        return
    # no window of text is wide enough; also keeps the priming loop
    # below from indexing past the end of text
    if pat_len > text_len:
        return

    # weight of the leading character of a window: BASE^(pat_len-1) % prime
    lead_weight = 1
    for _ in range(pat_len - 1):
        lead_weight = (lead_weight * BASE) % prime

    # hash of the pattern and of the first window of text
    p, t = 0, 0
    for i in range(pat_len):
        p = (BASE * p + ord(pattern[i])) % prime
        t = (BASE * t + ord(text[i])) % prime

    # slide the window over text one character at a time
    last_start = text_len - pat_len
    for i in range(last_start + 1):
        # equal hashes can be a collision, so confirm with a real comparison
        if p == t and text[i:i + pat_len] == pattern:
            yield i
        if i < last_start:
            # drop the leading character, then append the next one;
            # adding prime keeps the intermediate value non-negative
            t = (BASE * (t - (ord(text[i]) * lead_weight) % prime + prime)) % prime
            t = (t + ord(text[i + pat_len])) % prime
def computeLPArray(pattern: str, len_pat: int) -> List[int]:
    """
    Build the KMP failure table for pattern.

    Entry i of the result is the length of the longest proper prefix of
    pattern[0..i] that is also a suffix of pattern[0..i]. The table
    always starts with a 0 entry, even when len_pat is 0.
    """
    table = [0]
    matched = 0  # length of the prefix currently being extended
    pos = 1
    while pos < len_pat:
        if pattern[pos] == pattern[matched]:
            matched += 1
            table.append(matched)
            pos += 1
        elif matched != 0:
            # fall back to the next shorter candidate prefix; pos is
            # deliberately not advanced so the character is retried
            matched = table[matched - 1]
        else:
            table.append(0)
            pos += 1
    return table
def string_search_KMP(text: str, pattern: str):
    """
    Knuth-Morris-Pratt algorithm
    Yield, from left to right, every index of text at which pattern
    starts. Overlapping occurrences are all reported.

    An empty pattern matches at every index from 0 to len(text)
    inclusive.
    """
    text_len = len(text)
    pat_len = len(pattern)
    if pat_len == 0:
        for start in range(text_len + 1):
            yield start
        return

    fallback = computeLPArray(pattern, pat_len)
    t, p = 0, 0  # cursors into text and pattern
    while t < text_len:
        if text[t] == pattern[p]:
            t += 1
            p += 1
        if p == pat_len:
            # full match ending just before t
            yield t - p
            p = fallback[p - 1]
            continue
        if t >= text_len or text[t] == pattern[p]:
            continue
        # mismatch after p matched characters: reuse the part of the
        # match that is known to line up again, or move on in text
        if p == 0:
            t += 1
        else:
            p = fallback[p - 1]
def replace_interleave(self, new: str, maxcount: int) -> str:
    """
    Returns a copy of self with 'new' inserted before each character and
    once more after the last character, performing at most maxcount
    insertions from left to right.

    A negative maxcount means no limit. A maxcount of 0 returns the
    characters of self unchanged.
    """
    size = len(self)
    # len(self) + 1 insertion points exist; anything above len(self)
    # (or a negative count) therefore reaches the trailing one as well
    unlimited = maxcount < 0 or maxcount > size
    lead = size if unlimited else maxcount

    res = List[str]()
    for i in range(lead):
        res.append(new)
        res.append(self[i])
    if unlimited:
        res.append(new)
    else:
        # remainder of self that received no insertion
        res.append(self[lead:])
    return str.cat(res)
def replace_delete_substring(self, old: str, new: str, maxcount: int) -> str:
    """
    Returns a copy of self in which non-overlapping occurrences of the
    'old' string, taken from left to right, are replaced with the 'new'
    string.

    At most maxcount occurrences are replaced. A negative maxcount means
    no limit, and a maxcount of 0 returns self unchanged. self is also
    returned unchanged when 'old' does not occur.
    """
    if maxcount == 0:
        return self

    old_len = len(old)
    # the search reports overlapping hits; keep only the ones that can
    # actually be replaced side by side
    li = list(filter_overlaps(string_search_KMP(self, old), old_len))
    # no matches
    if len(li) == 0:
        return self
    if maxcount < 0 or maxcount > len(li):
        maxcount = len(li)

    res = List[str]()
    cursor = 0  # first index of self not yet copied to the result
    for k in range(maxcount):
        res.append(self[cursor:li[k]])
        res.append(new)
        cursor = li[k] + old_len
    res.append(self[cursor:])
    return str.cat(res)
# # should get [2, 1] and [1, 2] (overlapping matches are reported)
# print rstring_search_slow('abbbc', 'bb')
# print string_search_slow('abbba', 'bb')
# # should get [0, 9, 11, 14]
# print string_search_slow('1214313141212 12', '12')
# print string_search_rabin_karp('1214313141212 12', '12', 1001)
# print string_search_KMP('1214313141212 12', '12')
# #
# # # should get [0, 9, 12] (overlapping matches are reported)
# print string_search_slow('AABAACAADAABAABA', 'AABA')
# print string_search_rabin_karp('AABAACAADAABAABA', 'AABA', 1001)
# print string_search_KMP('AABAACAADAABAABA', 'AABA')
# #
# # # should get [10]
# print string_search_slow('ABABDABACDABABCABAB', 'ABABCABAB')
# print string_search_rabin_karp('ABABDABACDABABCABAB', 'ABABCABAB', 101)
# print string_search_KMP('ABABDABACDABABCABAB', 'ABABCABAB')