Skip to content

Instantly share code, notes, and snippets.

@YouJiacheng
Last active May 18, 2025 10:51
Show Gist options
  • Select an option

  • Save YouJiacheng/42d54de78d37821ff606aef686e9e50a to your computer and use it in GitHub Desktop.

Select an option

Save YouJiacheng/42d54de78d37821ff606aef686e9e50a to your computer and use it in GitHub Desktop.
import time
import unicodedata
import regex as re
regex_pattern = r"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+"
# --- Helper Functions for Character Properties ---
def is_letter(char: str):
"""Checks if a character is a Unicode letter (\\p{L})."""
# L* categories: Lu, Ll, Lt, Lm, Lo
# Note: there are mismatches with \p{N} for Cn category chars
return char.isalpha()
# return unicodedata.category(char).startswith("L")
def is_number(char: str):
"""Checks if a character is a Unicode number (\\p{N})."""
# N* categories: Nd, Nl, No
# return char.isnumeric() # WRONG, e.g. δΈ€, 十, 㐅
# Note: there are mismatches with \p{N} for Cn category chars
return unicodedata.category(char).startswith("N")
def is_whitespace(char: str):
"""Checks if a character is whitespace (\\s)."""
# Python's isspace() matches Unicode whitespace characters, which aligns well with \s
# exceptions: '\x1e', '\x1d', '\x1f', '\x1c'
return char.isspace()
def is_LN(char: str):
return is_letter(char) or is_number(char)
def is_sLN(char: str):
return is_whitespace(char) or is_LN(char)
# --- Core Backtracking Matcher Class ---
class BacktrackingMatcher:
def __init__(self, text: str):
self.text = text
self.length = len(text)
# --- Functions for Specific Parts of the Target Regex ---
# These implement the logic for each alternative | in the main regex.
# These functions try to match a component starting at `index`.
# They return the index *after* the match, or None if no match.
# `(?i:'s|'t|'re|'ve|'m|'ll|'d)`
def _match_alt_A(self, index: int):
max_count = self.length - index
if max_count < 2:
return None
if self.text[index] != "'":
return None
if self.text[index + 1].lower() in "stmd":
return index + 2
if max_count < 3:
return None
substr = self.text[index + 1 : index + 3]
if substr.lower() in {"re", "ve", "ll"}:
return index + 3
# `[^\r\n\p{L}\p{N}]?\p{L}+`
def _match_alt_B(self, index: int):
if index >= self.length:
return None
current_index = index
# Match optional [^\r\n\p{L}\p{N}]?
if not (self.text[current_index] in "\r\n" or is_LN(self.text[current_index])):
current_index += 1
# Match \p{L}+ (one or more letters)
plus_begin = current_index
while current_index < self.length and is_letter(self.text[current_index]):
current_index += 1
# Must match at least one
if not current_index > plus_begin:
return None
return current_index
# `\p{N}`
def _match_alt_C(self, index: int):
if index >= self.length:
return None
if is_number(self.text[index]):
return index + 1
return None
# ` ?[^\s\p{L}\p{N}]+[\r\n]*`
def _match_alt_D(self, index: int):
current_index = index
# Match optional space ' ?'
if current_index < self.length and self.text[current_index] == " ":
current_index += 1
# Match [^\s\p{L}\p{N}]+ (one or more non-space/letter/number)
plus_begin = current_index
while current_index < self.length and not is_sLN(self.text[current_index]):
current_index += 1
# Must match at least one
if not current_index > plus_begin:
return None
# Match [\r\n]*
while current_index < self.length and self.text[current_index] in "\r\n":
current_index += 1
return current_index
# `\s*[\r\n]+`
def _match_alt_E(self, index: int):
current_index = index
last_CR_or_LF_index = None
while current_index < self.length and is_whitespace(self.text[current_index]):
if self.text[current_index] in "\r\n":
last_CR_or_LF_index = current_index
current_index += 1
if last_CR_or_LF_index is None:
return None
return last_CR_or_LF_index + 1
# `\s+(?!\S)|\s+`
def _match_alt_F(self, index: int):
current_index = index
# Match \s+ greedily
plus_begin = index
while current_index < self.length and is_whitespace(self.text[current_index]):
current_index += 1
# Must match at least one
count = current_index - plus_begin
if count == 0:
return None # Failed \s+
# Check lookahead condition (?!\S) implicitly
# Case A: Match reached end of string
if current_index >= self.length:
# Lookahead (?!\S) is satisfied because nothing follows.
# Return the full greedy match.
return current_index
# Case B: Match did NOT reach end of string
# Since the loop stopped, self.text[current_index] MUST be \S.
# The full greedy match fails the lookahead.
# but we can backtrack and match count - 1 spaces.
# This is only possible if count >= 2 (so count - 1 >= 1).
if count >= 2:
# self.text[current_index - 1] is not \S, (?!\S) is satisfied.
return current_index - 1
# count == 1, cannot backtrack to 0 spaces for \s+. \s+(?!\S) fails.
# but \s+ matches
return current_index
def try_match_at_index(self, index: int):
end_index = self._match_alt_A(index)
if end_index is not None:
return end_index
end_index = self._match_alt_B(index)
if end_index is not None:
return end_index
end_index = self._match_alt_C(index)
if end_index is not None:
return end_index
end_index = self._match_alt_D(index)
if end_index is not None:
return end_index
end_index = self._match_alt_E(index)
if end_index is not None:
return end_index
end_index = self._match_alt_F(index)
if end_index is not None:
return end_index
return None
# --- findall Implementation ---
def findall_custom(text):
"""
Performs a findall operation using the hardcoded backtracking matcher
for the specific regex.
"""
results = []
current_index = 0
matcher = BacktrackingMatcher(text)
text_length = matcher.length
while current_index < text_length:
# Try to find a match starting at the current_index
end_index = matcher.try_match_at_index(current_index)
assert end_index is not None
# Match found from current_index to end_index
# match_str = text[current_index:end_index]
# results.append(match_str)
# Start next search after the current match
current_index = end_index
return results
# --- Example Usage ---
test_string1 = "It's a test, aren't we? Let's see 123 results. !@#$%\nAlso---\n\n Spaces at end? "
test_string2 = "WORD word123 'S 'M 'LL 'D 'RE 'VE 'T NoMatch Here!"
test_string3 = " \n\t\r\n " # Whitespace variations
test_string4 = "Emojis like 😊😊😊 and numbers 1.23"
test_string5 = "word's and I'll" # Test case sensitivity and contraction
for i, test_string in enumerate(
[test_string1, test_string2, test_string3, test_string4, test_string5]
):
print(f"Input {i + 1}: {test_string!r}")
print(f"Matches: {findall_custom(test_string)}")
print(f"{findall_custom(test_string) == re.findall(regex_pattern, test_string)=}")
print()
s = open("warandpeace.txt").read()
start = time.perf_counter_ns()
findall_custom(s)
end = time.perf_counter_ns()
print(f"{len(s.encode()) / (end - start) * 1000} MB/s")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment