YouJiacheng · May 18, 2025 10:51
diff --git a/qwen_pretok.py b/qwen_pretok.py
 import time
 import unicodedata

 import regex as re

 regex_pattern = r"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+"

 # --- Helper Functions for Character Properties ---


 def is_letter(char: str):
    """Checks if a character is a Unicode letter (\\p{L})."""
    # L* categories: Lu, Ll, Lt, Lm, Lo
    # Note: there are mismatches with \p{N} for Cn category chars
    return char.isalpha()
    # return unicodedata.category(char).startswith("L")


 def is_number(char: str):
    """Checks if a character is a Unicode number (\\p{N})."""
    # N* categories: Nd, Nl, No
    # return char.isnumeric() # WRONG, e.g. 一, 十, 㐅
    # Note: there are mismatches with \p{N} for Cn category chars
    return unicodedata.category(char).startswith("N")


 def is_whitespace(char: str):
    """Checks if a character is whitespace (\\s)."""
    # Python's isspace() matches Unicode whitespace characters, which aligns well with \s
    # exceptions: '\x1e', '\x1d', '\x1f', '\x1c'
    return char.isspace()


 def is_LN(char: str):
    return is_letter(char) or is_number(char)


 def is_sLN(char: str):
    return is_whitespace(char) or is_LN(char)


 # --- Core Backtracking Matcher Class ---


 class BacktrackingMatcher:
    def __init__(self, text: str):
        self.text = text
        self.length = len(text)

    # --- Functions for Specific Parts of the Target Regex ---
    # These implement the logic for each alternative | in the main regex.
    # These functions try to match a component starting at `index`.
    # They return the index *after* the match, or None if no match.

    # `(?i:'s|'t|'re|'ve|'m|'ll|'d)`
    def _match_alt_A(self, index: int):
        max_count = self.length - index
        if max_count < 2:
            return None

        if self.text[index] != "'":
            return None
        if self.text[index + 1].lower() in "stmd":
            return index + 2
        if max_count < 3:
            return None
        substr = self.text[index + 1 : index + 3]
        if substr.lower() in {"re", "ve", "ll"}:
            return index + 3

    # `[^\r\n\p{L}\p{N}]?\p{L}+`
    def _match_alt_B(self, index: int):
        if index >= self.length:
            return None
        current_index = index

        # Match optional [^\r\n\p{L}\p{N}]?
        if not (self.text[current_index] in "\r\n" or is_LN(self.text[current_index])):
            current_index += 1

        # Match \p{L}+ (one or more letters)
        plus_begin = current_index
        while current_index < self.length and is_letter(self.text[current_index]):
            current_index += 1

        # Must match at least one
        if not current_index > plus_begin:
            return None

        return current_index

    # `\p{N}`
    def _match_alt_C(self, index: int):
        if index >= self.length:
            return None
        if is_number(self.text[index]):
            return index + 1
        return None

    # ` ?[^\s\p{L}\p{N}]+[\r\n]*`
    def _match_alt_D(self, index: int):
        current_index = index
        # Match optional space ' ?'
        if current_index < self.length and self.text[current_index] == " ":
            current_index += 1

        # Match [^\s\p{L}\p{N}]+ (one or more non-space/letter/number)
        plus_begin = current_index
        while current_index < self.length and not is_sLN(self.text[current_index]):
            current_index += 1

        # Must match at least one
        if not current_index > plus_begin:
            return None

        # Match [\r\n]*
        while current_index < self.length and self.text[current_index] in "\r\n":
            current_index += 1

        return current_index

    # `\s*[\r\n]+`
    def _match_alt_E(self, index: int):
        current_index = index
        last_CR_or_LF_index = None

        while current_index < self.length and is_whitespace(self.text[current_index]):
            if self.text[current_index] in "\r\n":
                last_CR_or_LF_index = current_index
            current_index += 1

        if last_CR_or_LF_index is None:
            return None
        return last_CR_or_LF_index + 1

    # `\s+(?!\S)|\s+`
    def _match_alt_F(self, index: int):
        current_index = index

        # Match \s+ greedily
        plus_begin = index
        while current_index < self.length and is_whitespace(self.text[current_index]):
            current_index += 1

        # Must match at least one
        count = current_index - plus_begin
        if count == 0:
            return None  # Failed \s+

        # Check lookahead condition (?!\S) implicitly

        # Case A: Match reached end of string
        if current_index >= self.length:
            # Lookahead (?!\S) is satisfied because nothing follows.
            # Return the full greedy match.
            return current_index

        # Case B: Match did NOT reach end of string
        # Since the loop stopped, self.text[current_index] MUST be \S.
        # The full greedy match fails the lookahead.
        # but we can backtrack and match count - 1 spaces.
        # This is only possible if count >= 2 (so count - 1 >= 1).
        if count >= 2:
            # self.text[current_index - 1] is not \S, (?!\S) is satisfied.
            return current_index - 1

        # count == 1, cannot backtrack to 0 spaces for \s+. \s+(?!\S) fails.
        # but \s+ matches
        return current_index

    def try_match_at_index(self, index: int):
        end_index = self._match_alt_A(index)
        if end_index is not None:
            return end_index

        end_index = self._match_alt_B(index)
        if end_index is not None:
            return end_index

        end_index = self._match_alt_C(index)
        if end_index is not None:
            return end_index

        end_index = self._match_alt_D(index)
        if end_index is not None:
            return end_index

        end_index = self._match_alt_E(index)
        if end_index is not None:
            return end_index

        end_index = self._match_alt_F(index)
        if end_index is not None:
            return end_index

        return None


 # --- findall Implementation ---


 def findall_custom(text):
    """
    Performs a findall operation using the hardcoded backtracking matcher
    for the specific regex.
    """
    results = []
    current_index = 0
    matcher = BacktrackingMatcher(text)
    text_length = matcher.length

    while current_index < text_length:
        # Try to find a match starting at the current_index
        end_index = matcher.try_match_at_index(current_index)

        assert end_index is not None
        # Match found from current_index to end_index
        # match_str = text[current_index:end_index]
        # results.append(match_str)
        # Start next search after the current match
        current_index = end_index

    return results


 # --- Example Usage ---
 test_string1 = "It's a test, aren't we? Let's see 123 results. !@#$%\nAlso---\n\n   Spaces at end?   "
 test_string2 = "WORD word123 'S 'M 'LL 'D 'RE 'VE 'T NoMatch Here!"
 test_string3 = "  \n\t\r\n "  # Whitespace variations
 test_string4 = "Emojis like 😊😊😊 and numbers 1.23"
 test_string5 = "word's and I'll"  # Test case sensitivity and contraction

 for i, test_string in enumerate(
    [test_string1, test_string2, test_string3, test_string4, test_string5]
 ):
    print(f"Input {i + 1}: {test_string!r}")
    print(f"Matches: {findall_custom(test_string)}")
    print(f"{findall_custom(test_string) == re.findall(regex_pattern, test_string)=}")
    print()

 s = open("warandpeace.txt").read()

 start = time.perf_counter_ns()
 findall_custom(s)
 end = time.perf_counter_ns()
 print(f"{len(s.encode()) / (end - start) * 1000} MB/s")
	import time
	import unicodedata

	import regex as re

	regex_pattern = r"(?i:'s\|'t\|'re\|'ve\|'m\|'ll\|'d)\|[^\r\n\p{L}\p{N}]?\p{L}+\|\p{N}\| ?[^\s\p{L}\p{N}]+[\r\n]\|\s[\r\n]+\|\s+(?!\S)\|\s+"

	# --- Helper Functions for Character Properties ---


	def is_letter(char: str):
	"""Checks if a character is a Unicode letter (\\p{L})."""
	# L* categories: Lu, Ll, Lt, Lm, Lo
	# Note: there are mismatches with \p{N} for Cn category chars
	return char.isalpha()
	# return unicodedata.category(char).startswith("L")


	def is_number(char: str):
	"""Checks if a character is a Unicode number (\\p{N})."""
	# N* categories: Nd, Nl, No
	# return char.isnumeric() # WRONG, e.g. 一, 十, 㐅
	# Note: there are mismatches with \p{N} for Cn category chars
	return unicodedata.category(char).startswith("N")


	def is_whitespace(char: str):
	"""Checks if a character is whitespace (\\s)."""
	# Python's isspace() matches Unicode whitespace characters, which aligns well with \s
	# exceptions: '\x1e', '\x1d', '\x1f', '\x1c'
	return char.isspace()


	def is_LN(char: str):
	return is_letter(char) or is_number(char)


	def is_sLN(char: str):
	return is_whitespace(char) or is_LN(char)


	# --- Core Backtracking Matcher Class ---


	class BacktrackingMatcher:
	def __init__(self, text: str):
	self.text = text
	self.length = len(text)

	# --- Functions for Specific Parts of the Target Regex ---
	# These implement the logic for each alternative \| in the main regex.
	# These functions try to match a component starting at `index`.
	# They return the index after the match, or None if no match.

	# `(?i:'s\|'t\|'re\|'ve\|'m\|'ll\|'d)`
	def _match_alt_A(self, index: int):
	max_count = self.length - index
	if max_count < 2:
	return None

	if self.text[index] != "'":
	return None
	if self.text[index + 1].lower() in "stmd":
	return index + 2
	if max_count < 3:
	return None
	substr = self.text[index + 1 : index + 3]
	if substr.lower() in {"re", "ve", "ll"}:
	return index + 3

	# `[^\r\n\p{L}\p{N}]?\p{L}+`
	def _match_alt_B(self, index: int):
	if index >= self.length:
	return None
	current_index = index

	# Match optional [^\r\n\p{L}\p{N}]?
	if not (self.text[current_index] in "\r\n" or is_LN(self.text[current_index])):
	current_index += 1

	# Match \p{L}+ (one or more letters)
	plus_begin = current_index
	while current_index < self.length and is_letter(self.text[current_index]):
	current_index += 1

	# Must match at least one
	if not current_index > plus_begin:
	return None

	return current_index

	# `\p{N}`
	def _match_alt_C(self, index: int):
	if index >= self.length:
	return None
	if is_number(self.text[index]):
	return index + 1
	return None

	# ` ?[^\s\p{L}\p{N}]+[\r\n]*`
	def _match_alt_D(self, index: int):
	current_index = index
	# Match optional space ' ?'
	if current_index < self.length and self.text[current_index] == " ":
	current_index += 1

	# Match [^\s\p{L}\p{N}]+ (one or more non-space/letter/number)
	plus_begin = current_index
	while current_index < self.length and not is_sLN(self.text[current_index]):
	current_index += 1

	# Must match at least one
	if not current_index > plus_begin:
	return None

	# Match [\r\n]*
	while current_index < self.length and self.text[current_index] in "\r\n":
	current_index += 1

	return current_index

	# `\s*[\r\n]+`
	def _match_alt_E(self, index: int):
	current_index = index
	last_CR_or_LF_index = None

	while current_index < self.length and is_whitespace(self.text[current_index]):
	if self.text[current_index] in "\r\n":
	last_CR_or_LF_index = current_index
	current_index += 1

	if last_CR_or_LF_index is None:
	return None
	return last_CR_or_LF_index + 1

	# `\s+(?!\S)\|\s+`
	def _match_alt_F(self, index: int):
	current_index = index

	# Match \s+ greedily
	plus_begin = index
	while current_index < self.length and is_whitespace(self.text[current_index]):
	current_index += 1

	# Must match at least one
	count = current_index - plus_begin
	if count == 0:
	return None # Failed \s+

	# Check lookahead condition (?!\S) implicitly

	# Case A: Match reached end of string
	if current_index >= self.length:
	# Lookahead (?!\S) is satisfied because nothing follows.
	# Return the full greedy match.
	return current_index

	# Case B: Match did NOT reach end of string
	# Since the loop stopped, self.text[current_index] MUST be \S.
	# The full greedy match fails the lookahead.
	# but we can backtrack and match count - 1 spaces.
	# This is only possible if count >= 2 (so count - 1 >= 1).
	if count >= 2:
	# self.text[current_index - 1] is not \S, (?!\S) is satisfied.
	return current_index - 1

	# count == 1, cannot backtrack to 0 spaces for \s+. \s+(?!\S) fails.
	# but \s+ matches
	return current_index

	def try_match_at_index(self, index: int):
	end_index = self._match_alt_A(index)
	if end_index is not None:
	return end_index

	end_index = self._match_alt_B(index)
	if end_index is not None:
	return end_index

	end_index = self._match_alt_C(index)
	if end_index is not None:
	return end_index

	end_index = self._match_alt_D(index)
	if end_index is not None:
	return end_index

	end_index = self._match_alt_E(index)
	if end_index is not None:
	return end_index

	end_index = self._match_alt_F(index)
	if end_index is not None:
	return end_index

	return None


	# --- findall Implementation ---


	def findall_custom(text):
	"""
	Performs a findall operation using the hardcoded backtracking matcher
	for the specific regex.
	"""
	results = []
	current_index = 0
	matcher = BacktrackingMatcher(text)
	text_length = matcher.length

	while current_index < text_length:
	# Try to find a match starting at the current_index
	end_index = matcher.try_match_at_index(current_index)

	assert end_index is not None
	# Match found from current_index to end_index
	# match_str = text[current_index:end_index]
	# results.append(match_str)
	# Start next search after the current match
	current_index = end_index

	return results


	# --- Example Usage ---
	test_string1 = "It's a test, aren't we? Let's see 123 results. !@#$%\nAlso---\n\n Spaces at end? "
	test_string2 = "WORD word123 'S 'M 'LL 'D 'RE 'VE 'T NoMatch Here!"
	test_string3 = " \n\t\r\n " # Whitespace variations
	test_string4 = "Emojis like 😊😊😊 and numbers 1.23"
	test_string5 = "word's and I'll" # Test case sensitivity and contraction

	for i, test_string in enumerate(
	[test_string1, test_string2, test_string3, test_string4, test_string5]
	):
	print(f"Input {i + 1}: {test_string!r}")
	print(f"Matches: {findall_custom(test_string)}")
	print(f"{findall_custom(test_string) == re.findall(regex_pattern, test_string)=}")
	print()

	s = open("warandpeace.txt").read()

	start = time.perf_counter_ns()
	findall_custom(s)
	end = time.perf_counter_ns()
	print(f"{len(s.encode()) / (end - start) * 1000} MB/s")
No results found