@maxjustus
Created December 6, 2025 01:28
Bench script for new multiSearchAny impl in ClickHouse
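The script times two shapes of the same predicate against one generated Parquet file. As a minimal sketch (the file name and the tiny pattern lists here are illustrative; the real queries use thousands of generated nomatch_<i> patterns):

-- Single call over the whole pattern set; with the new implementation this is the
-- shape expected to route to Aho-Corasick (daachorse) once the pattern count
-- exceeds the 255-needle limit.
SELECT * FROM file('test_data.parquet', Parquet)
WHERE multiSearchAny(data, splitByString('$str$', 'nomatch_0$str$nomatch_1$str$nomatch_2'))

-- Chunked baseline: one call per block of at most 255 patterns, ORed together,
-- so every call stays on the existing Volnitsky path.
SELECT * FROM file('test_data.parquet', Parquet)
WHERE multiSearchAny(data, splitByString('$str$', 'nomatch_0$str$nomatch_1'))
   OR multiSearchAny(data, splitByString('$str$', 'nomatch_2$str$nomatch_3'))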
#!/usr/bin/env python3
"""
Micro-benchmark: Aho-Corasick (daachorse) vs chunked Volnitsky for large pattern counts.
Generates test data once to a file, then benchmarks queries against that file.
Usage:
python multi_search_bench.py [--rows N]
"""
import subprocess
import sys
import re
import statistics
import tempfile
import os
import argparse
from pathlib import Path
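# The existing Volnitsky-based multiSearchAny accepts at most 255 needles per call,
# so the chunked baseline below splits larger pattern sets across ORed calls.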
VOLNITSKY_MAX_PATTERNS = 255
# Use splitByString instead of array literals to avoid AST size explosion.
# Array literals like ['p1', 'p2', ...] create one AST node per element.
# splitByString('$str$', 'p1$str$p2$str$...') keeps the AST small and constructs
# the array at runtime. The $str$ delimiter allows patterns to contain commas.
PATTERN_DELIMITER = "$str$"
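# For illustration, with two (hypothetical) patterns "foo" and "bar" the generated
# predicate is
#   multiSearchAny(data, splitByString('$str$', 'foo$str$bar'))
# rather than
#   multiSearchAny(data, ['foo', 'bar'])
# so the SQL stays a handful of AST nodes no matter how many patterns are passed.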

def get_clickhouse_binary():
    # Prefer a local development build if present, otherwise fall back to `clickhouse` on PATH.
    macbuild_path = Path("/Users/audio/dev/ClickHouse/macbuild/programs/clickhouse")
    if macbuild_path.exists():
        return str(macbuild_path)
    return "clickhouse"

def create_test_data_file(binary: str, num_rows: int) -> str:
    # Reserve a unique path, then let clickhouse-local write the Parquet file itself.
    fd, filepath = tempfile.mkstemp(suffix=".parquet", prefix="test_data_")
    os.close(fd)
    os.remove(filepath)
    query = f"""
        SELECT * FROM generateRandom('data String')
        LIMIT {num_rows}
        INTO OUTFILE '{filepath}'
        FORMAT Parquet
    """
    result = subprocess.run(
        [binary, "local", "--query", query],
        capture_output=True, text=True, timeout=120
    )
    if result.returncode != 0 or not os.path.exists(filepath):
        print(f"Error generating test data: {result.stderr}", file=sys.stderr)
        return None
    file_size = os.path.getsize(filepath)
    print(f"Generated: {filepath} ({file_size:,} bytes, {num_rows:,} rows)")
    return filepath

def build_pattern_string(pattern_count: int) -> str:
    return PATTERN_DELIMITER.join([f"nomatch_{i}" for i in range(pattern_count)])

def build_chunked_query(data_file: str, pattern_count: int, case_insensitive: bool) -> str:
    # OR together multiple calls of at most VOLNITSKY_MAX_PATTERNS patterns each,
    # so every call stays on the existing Volnitsky code path.
    func = "multiSearchAnyCaseInsensitive" if case_insensitive else "multiSearchAny"
    chunks = []
    for start in range(0, pattern_count, VOLNITSKY_MAX_PATTERNS):
        end = min(start + VOLNITSKY_MAX_PATTERNS, pattern_count)
        pattern_str = PATTERN_DELIMITER.join([f"nomatch_{i}" for i in range(start, end)])
        chunks.append(f"{func}(data, splitByString('{PATTERN_DELIMITER}', '{pattern_str}'))")
    return f"SELECT * FROM file('{data_file}', Parquet) WHERE {' OR '.join(chunks)}"

def run_query(binary: str, query: str, timeout: int = 600):
    # Pass the query via a temp file: the pattern list can exceed command-line limits.
    fd, query_file = tempfile.mkstemp(suffix=".sql", prefix="query_")
    os.write(fd, f"{query} FORMAT Null".encode('utf-8'))
    os.close(fd)
    try:
        result = subprocess.run(
            [binary, "local", "--time", "--max_query_size=10000000000", "--queries-file", query_file],
            capture_output=True, text=True, timeout=timeout
        )
        if result.returncode == 0:
            # `--time` prints the elapsed seconds on stderr; convert to milliseconds.
            match = re.search(r'^(\d+\.\d+)', result.stderr.strip())
            if match:
                return float(match.group(1)) * 1000, None
        return None, result.stderr.strip()[:200] if result.stderr else f"Exit code {result.returncode}"
    except subprocess.TimeoutExpired:
        return None, "TIMEOUT"
    except Exception as e:
        return None, str(e)[:200]
    finally:
        os.remove(query_file)

def benchmark(binary: str, data_file: str, pattern_count: int, iterations: int, case_insensitive: bool):
    func = "multiSearchAnyCaseInsensitive" if case_insensitive else "multiSearchAny"
    pattern_str = build_pattern_string(pattern_count)
    query_single = f"SELECT * FROM file('{data_file}', Parquet) WHERE {func}(data, splitByString('{PATTERN_DELIMITER}', '{pattern_str}'))"
    query_chunked = build_chunked_query(data_file, pattern_count, case_insensitive)

    # Warm-up run for each query shape before timing.
    run_query(binary, query_single)
    run_query(binary, query_chunked)

    aho_times, chunked_times, errors = [], [], {}
    for _ in range(iterations):
        t, err = run_query(binary, query_single)
        if t is not None:
            aho_times.append(t)
        elif err and "aho" not in errors:
            errors["aho"] = err
    for _ in range(iterations):
        t, err = run_query(binary, query_chunked)
        if t is not None:
            chunked_times.append(t)
        elif err and "chunked" not in errors:
            errors["chunked"] = err

    return (
        statistics.median(aho_times) if aho_times else None,
        statistics.median(chunked_times) if chunked_times else None,
        errors,
    )

def main():
    parser = argparse.ArgumentParser(description="Aho-Corasick vs Volnitsky micro-benchmark")
    parser.add_argument("--rows", "-r", type=int, default=100000, help="number of random rows to generate")
    args = parser.parse_args()
    binary = get_clickhouse_binary()

    print("Aho-Corasick (daachorse) vs Chunked Volnitsky")
    print("=" * 75)
    data_file = create_test_data_file(binary, num_rows=args.rows)
    if not data_file:
        sys.exit(1)

    pattern_counts = [1000, 10000, 50000, 100000, 200000]
    for label, case_insensitive in [("Case-sensitive", False), ("Case-insensitive", True)]:
        print(f"\n{label}:")
        print(f"{'Patterns':<10} {'Chunks':<8} {'Aho (ms)':<12} {'Chunked (ms)':<14} {'Speedup':<10}")
        print("-" * 60)
        for pc in pattern_counts:
            chunks = (pc + VOLNITSKY_MAX_PATTERNS - 1) // VOLNITSKY_MAX_PATTERNS
            aho, chunked, errors = benchmark(binary, data_file, pc, iterations=2, case_insensitive=case_insensitive)
            aho_str = f"{aho:.1f}" if aho is not None else "ERR"
            chunked_str = f"{chunked:.1f}" if chunked is not None else "ERR"
            speedup = f"{chunked/aho:.1f}x" if aho and chunked else ""
            print(f"{pc:<10} {chunks:<8} {aho_str:<12} {chunked_str:<14} {speedup:<10}")
            for key, err in errors.items():
                print(f" {key}: {err}")

    os.remove(data_file)
    print(f"\nCleaned up: {data_file}")


if __name__ == "__main__":
    main()