Last active
May 18, 2025 10:51
-
-
Save YouJiacheng/42d54de78d37821ff606aef686e9e50a to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import time | |
| import unicodedata | |
| import regex as re | |
| regex_pattern = r"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+" | |
| # --- Helper Functions for Character Properties --- | |
| def is_letter(char: str): | |
| """Checks if a character is a Unicode letter (\\p{L}).""" | |
| # L* categories: Lu, Ll, Lt, Lm, Lo | |
| # Note: there are mismatches with \p{N} for Cn category chars | |
| return char.isalpha() | |
| # return unicodedata.category(char).startswith("L") | |
| def is_number(char: str): | |
| """Checks if a character is a Unicode number (\\p{N}).""" | |
| # N* categories: Nd, Nl, No | |
| # return char.isnumeric() # WRONG, e.g. δΈ, ε, γ | |
| # Note: there are mismatches with \p{N} for Cn category chars | |
| return unicodedata.category(char).startswith("N") | |
| def is_whitespace(char: str): | |
| """Checks if a character is whitespace (\\s).""" | |
| # Python's isspace() matches Unicode whitespace characters, which aligns well with \s | |
| # exceptions: '\x1e', '\x1d', '\x1f', '\x1c' | |
| return char.isspace() | |
| def is_LN(char: str): | |
| return is_letter(char) or is_number(char) | |
| def is_sLN(char: str): | |
| return is_whitespace(char) or is_LN(char) | |
| # --- Core Backtracking Matcher Class --- | |
| class BacktrackingMatcher: | |
| def __init__(self, text: str): | |
| self.text = text | |
| self.length = len(text) | |
| # --- Functions for Specific Parts of the Target Regex --- | |
| # These implement the logic for each alternative | in the main regex. | |
| # These functions try to match a component starting at `index`. | |
| # They return the index *after* the match, or None if no match. | |
| # `(?i:'s|'t|'re|'ve|'m|'ll|'d)` | |
| def _match_alt_A(self, index: int): | |
| max_count = self.length - index | |
| if max_count < 2: | |
| return None | |
| if self.text[index] != "'": | |
| return None | |
| if self.text[index + 1].lower() in "stmd": | |
| return index + 2 | |
| if max_count < 3: | |
| return None | |
| substr = self.text[index + 1 : index + 3] | |
| if substr.lower() in {"re", "ve", "ll"}: | |
| return index + 3 | |
| # `[^\r\n\p{L}\p{N}]?\p{L}+` | |
| def _match_alt_B(self, index: int): | |
| if index >= self.length: | |
| return None | |
| current_index = index | |
| # Match optional [^\r\n\p{L}\p{N}]? | |
| if not (self.text[current_index] in "\r\n" or is_LN(self.text[current_index])): | |
| current_index += 1 | |
| # Match \p{L}+ (one or more letters) | |
| plus_begin = current_index | |
| while current_index < self.length and is_letter(self.text[current_index]): | |
| current_index += 1 | |
| # Must match at least one | |
| if not current_index > plus_begin: | |
| return None | |
| return current_index | |
| # `\p{N}` | |
| def _match_alt_C(self, index: int): | |
| if index >= self.length: | |
| return None | |
| if is_number(self.text[index]): | |
| return index + 1 | |
| return None | |
| # ` ?[^\s\p{L}\p{N}]+[\r\n]*` | |
| def _match_alt_D(self, index: int): | |
| current_index = index | |
| # Match optional space ' ?' | |
| if current_index < self.length and self.text[current_index] == " ": | |
| current_index += 1 | |
| # Match [^\s\p{L}\p{N}]+ (one or more non-space/letter/number) | |
| plus_begin = current_index | |
| while current_index < self.length and not is_sLN(self.text[current_index]): | |
| current_index += 1 | |
| # Must match at least one | |
| if not current_index > plus_begin: | |
| return None | |
| # Match [\r\n]* | |
| while current_index < self.length and self.text[current_index] in "\r\n": | |
| current_index += 1 | |
| return current_index | |
| # `\s*[\r\n]+` | |
| def _match_alt_E(self, index: int): | |
| current_index = index | |
| last_CR_or_LF_index = None | |
| while current_index < self.length and is_whitespace(self.text[current_index]): | |
| if self.text[current_index] in "\r\n": | |
| last_CR_or_LF_index = current_index | |
| current_index += 1 | |
| if last_CR_or_LF_index is None: | |
| return None | |
| return last_CR_or_LF_index + 1 | |
| # `\s+(?!\S)|\s+` | |
| def _match_alt_F(self, index: int): | |
| current_index = index | |
| # Match \s+ greedily | |
| plus_begin = index | |
| while current_index < self.length and is_whitespace(self.text[current_index]): | |
| current_index += 1 | |
| # Must match at least one | |
| count = current_index - plus_begin | |
| if count == 0: | |
| return None # Failed \s+ | |
| # Check lookahead condition (?!\S) implicitly | |
| # Case A: Match reached end of string | |
| if current_index >= self.length: | |
| # Lookahead (?!\S) is satisfied because nothing follows. | |
| # Return the full greedy match. | |
| return current_index | |
| # Case B: Match did NOT reach end of string | |
| # Since the loop stopped, self.text[current_index] MUST be \S. | |
| # The full greedy match fails the lookahead. | |
| # but we can backtrack and match count - 1 spaces. | |
| # This is only possible if count >= 2 (so count - 1 >= 1). | |
| if count >= 2: | |
| # self.text[current_index - 1] is not \S, (?!\S) is satisfied. | |
| return current_index - 1 | |
| # count == 1, cannot backtrack to 0 spaces for \s+. \s+(?!\S) fails. | |
| # but \s+ matches | |
| return current_index | |
| def try_match_at_index(self, index: int): | |
| end_index = self._match_alt_A(index) | |
| if end_index is not None: | |
| return end_index | |
| end_index = self._match_alt_B(index) | |
| if end_index is not None: | |
| return end_index | |
| end_index = self._match_alt_C(index) | |
| if end_index is not None: | |
| return end_index | |
| end_index = self._match_alt_D(index) | |
| if end_index is not None: | |
| return end_index | |
| end_index = self._match_alt_E(index) | |
| if end_index is not None: | |
| return end_index | |
| end_index = self._match_alt_F(index) | |
| if end_index is not None: | |
| return end_index | |
| return None | |
| # --- findall Implementation --- | |
| def findall_custom(text): | |
| """ | |
| Performs a findall operation using the hardcoded backtracking matcher | |
| for the specific regex. | |
| """ | |
| results = [] | |
| current_index = 0 | |
| matcher = BacktrackingMatcher(text) | |
| text_length = matcher.length | |
| while current_index < text_length: | |
| # Try to find a match starting at the current_index | |
| end_index = matcher.try_match_at_index(current_index) | |
| assert end_index is not None | |
| # Match found from current_index to end_index | |
| # match_str = text[current_index:end_index] | |
| # results.append(match_str) | |
| # Start next search after the current match | |
| current_index = end_index | |
| return results | |
| # --- Example Usage --- | |
| test_string1 = "It's a test, aren't we? Let's see 123 results. !@#$%\nAlso---\n\n Spaces at end? " | |
| test_string2 = "WORD word123 'S 'M 'LL 'D 'RE 'VE 'T NoMatch Here!" | |
| test_string3 = " \n\t\r\n " # Whitespace variations | |
| test_string4 = "Emojis like πππ and numbers 1.23" | |
| test_string5 = "word's and I'll" # Test case sensitivity and contraction | |
| for i, test_string in enumerate( | |
| [test_string1, test_string2, test_string3, test_string4, test_string5] | |
| ): | |
| print(f"Input {i + 1}: {test_string!r}") | |
| print(f"Matches: {findall_custom(test_string)}") | |
| print(f"{findall_custom(test_string) == re.findall(regex_pattern, test_string)=}") | |
| print() | |
| s = open("warandpeace.txt").read() | |
| start = time.perf_counter_ns() | |
| findall_custom(s) | |
| end = time.perf_counter_ns() | |
| print(f"{len(s.encode()) / (end - start) * 1000} MB/s") |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment